From 99c7b5bbf5a519dd61fec475c4cbe81f0175dff9 Mon Sep 17 00:00:00 2001
From: Benson Manner
Date: Mon, 18 Jan 2021 10:38:13 -0700
Subject: [PATCH 01/44] Add query and generator class

---
 experimenter/modify_generator.py | 52 +++++++++++++++++++
 experimenter/query.py            | 85 ++++++++++++++++++++++++++++++++
 2 files changed, 137 insertions(+)
 create mode 100644 experimenter/modify_generator.py
 create mode 100644 experimenter/query.py

diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py
new file mode 100644
index 0000000..fc4c49a
--- /dev/null
+++ b/experimenter/modify_generator.py
@@ -0,0 +1,52 @@
+from query import query_on_seeds, query_on_primitive
+
+
+class ModifyGenerator:
+    """ Generator to be used for creating modified pipelines based on existing
+        pipelines in the database
+    """
+    def __init__(self, *args):
+        self.args = args
+        #initialize commonly used variables
+        self.modifier_type = args.modifier_type
+        self.num_pipelines_to_run = args.num_pipelines_to_run
+        self.num_complete = 0
+        #run the query on initializing to define the query results
+        self.query_results = self._query(self.modifier_type, self.args)
+
+    def __next__(self):
+        #iterate through query results
+        for query_result in self.query_results:
+            pipeline, dataset, pipeline_run = query_result
+            #iterate through modifier results
+            for new_pipeline, dataset in self._modify(self.args):
+                self.num_complete += 1
+                #check to run until the generator stops iterating (if no input for num_pipelines_to_run)
+                if (self.num_pipelines_to_run):
+                    if (self.num_complete >= self.num_pipelines_to_run):
+                        raise StopIteration
+                return (new_pipeline, new_dataset)
+        raise StopIteration
+
+    def __iter__(self):
+        return self
+
+    def _query(self, *args):
+        if (self.modifier_type=='random-seed'):
+            return query_on_seeds(args.pipeline_id, args.seed_limit, args.submitter):
+        if (self.modifier_type=='swap-primitive'):
+            return query_on_primitive(args.primitive_id, args.limit_indeces)
+        else:
+            raise ValueError("This type of modification is not yet an option")
+
+    def _modify(self,*args):
+        if self.modifier_type=='random-seed':
+            self._modify_random_seed(args.random_seed, args.seed_limit)
+        if self.modifier_type=='swap-primitive':
+            self._modify_swap_primitive(args.pipeline, args.primitive_loc, args.new_primitive)
+        else:
+            raise ValueError("This type of modification is not yet an option")
+
+    def _modify_random_seed(self, pipeline, dataset, args):
+        ##======== Create the random seed modifier
+        #yield random seeds and the pipeline/dataset to run on
diff --git a/experimenter/query.py b/experimenter/query.py
new file mode 100644
index 0000000..1e70e8e
--- /dev/null
+++ b/experimenter/query.py
@@ -0,0 +1,85 @@
+from elasticsearch import Elasticsearch
+from elasticsearch_dsl import Search, Q
+from tqdm import tqdm
+from experimenter.utils import get_problem_parent_dir, build_problem_reference
+
+HOST = 'https://metalearning.datadrivendiscovery.org/es'
+CONNECTION = Elasticsearch(hosts=[HOST], timeout=300)
+
+def query_on_primitive(primitive_id: str, limit_indexes=False):
+    '''Queries the metalearning database for pipelines that use the specified primitive.
+    Uses the Elasticsearch endpoint documented on D3M's website (see
+    https://metalearning.datadrivendiscovery.org for more info). Finds all
+    pipelines containing the primitive given by the primitive_id argument,
+    determines the index(es) of that primitive in each matching pipeline, and
+    gets the datasets that were used in pipeline runs.
+
+    Arguments
+    ---------
+    primitive_id : str
+        A primitive's unique id.
+    limit_indexes : 'first', 'last', or False (default)
+        Limits which index of the primitive is returned for each pipeline match.
+        Use 'first' to get the index of the first match in a pipeline's steps,
+        'last' to get the index of the last match, or False (default) to get a
+        list of the indexes of all matches.
+
+    Yields
+    -------
+    Tuples, each containing (in this order):
+        1. a matching pipeline (as a dict)
+        2. a reference to the problem used in the pipeline's runs
+        3. the index(es) of the desired primitive in the given pipeline's steps
+        4. the set of random seeds used in pipeline runs.
+    '''
+
+    if limit_indexes not in { 'first', 'last', False }:
+        raise ValueError(f'Invalid value "{limit_indexes}" for arg limit_indexes')
+
+    match_query = Q('match', steps__primitive__id=primitive_id)
+    nested_query = Q('nested', path='steps', query=match_query)
+    pipeline_search = Search(using=CONNECTION, index='pipelines').query(nested_query)
+
+    for pipeline in pipeline_search.scan():
+        problem_ids, random_seeds = scan_pipeline_runs(pipeline.id)
+
+        locs = [i for i, step in enumerate(pipeline.steps) if primitive_id == step.primitive.id]
+        if limit_indexes == 'last':
+            locs = locs[-1]
+        elif limit_indexes == 'first':
+            locs = locs[0]
+
+        for problem_id in problem_ids:
+            yield pipeline.to_dict(), build_problem_reference(problem_id), locs, random_seeds
+
+def query_on_seeds(pipeline_id: str=None, limit: int=None, submitter: str='byu'):
+    pipeline_search = Search(using=CONNECTION, index='pipelines')
+    if pipeline_id:
+        pipeline_search = pipeline_search.query('match', id=pipeline_id)
+    if submitter:
+        pipeline_search = pipeline_search.query('match', _submitter=submitter)
+
+    for pipeline in pipeline_search.scan():
+        results = scan_pipeline_runs(pipeline.id, submitter)
+        for (problem_id, dataset_id), random_seeds in results.items():
+            if limit and len(random_seeds) > limit:
+                continue
+            yield pipeline.to_dict(), build_problem_reference(problem_id), random_seeds
+
+def scan_pipeline_runs(pipeline_id, submitter=None):
+    pipeline_run_search = Search(using=CONNECTION, index='pipeline_runs') \
+        .query('match', pipeline__id=pipeline_id) \
+        .query('match', run__phase='PRODUCE') \
+        .query('match', status__state='SUCCESS')
+    if submitter:
+        pipeline_run_search = pipeline_run_search.query('match', _submitter=submitter)
+
+    results = dict()
+    for pipeline_run in pipeline_run_search.scan():
+        for dataset in pipeline_run.datasets:
+            dataset_prob_tuple = (pipeline_run.problem.id, dataset.id)
+            results[dataset_prob_tuple] = results.get(dataset_prob_tuple, set())
+            results[dataset_prob_tuple].add(pipeline_run.random_seed)
+    return results

From deee5fafa7c7ac16c79f887a41a486e7c8709f48 Mon Sep 17 00:00:00 2001
From: Benson Manner
Date: Mon, 18 Jan 2021 14:13:00 -0700
Subject: [PATCH 02/44] Untested seed swap functionality

---
 experimenter/cli.py              | 49 ++++++++++++++++++++++++++++++--
 experimenter/modify_generator.py | 42 ++++++++++++++++++---------
 experimenter/query.py            |  2 +-
 3 files changed, 76 insertions(+), 17 deletions(-)

diff --git a/experimenter/cli.py b/experimenter/cli.py
index fd6296c..ce681f7 100644
--- a/experimenter/cli.py
+++ b/experimenter/cli.py
@@ -125,11 +125,56 @@ def search_handler(arguments: argparse.Namespace, parser: argparse.ArgumentParse

 def configure_modify_parser(parser: argparse.ArgumentParser) -> None:
-    pass
+    #create the subparsers for the different types of modifications
+
+    #seed swapper functionality
+    subparser = parser.add_subparsers(dest='modify_type')
+    subparsers.required = True
+    swap_seed_subparser = subparsers.add_parser(
+        'random-seed',
+        description='Uses database data to search for functional pipelines and runs them on different random seeds',
+    )
+    #subparser arguments
+    swap_seed_subparser.add_argument(
+        '--pipeline_id',
+        description='The pipeline id to search for in the query, if none, searches all pipelines',
+        default=None,
+        type=str)
+    swap_seed_subparser.add_argument(
+        '--submitter',
+        help='The pipeline submitter to add to the query',
+        default=None,
+        type=str)
+    swap_seed_subparser.add_argument(
+        '--seed_limit',
+        help='The number of random seeds each pipeline will have been run with by the end of the test',
+        default=2,
+        type=int)
+
+    #Primitive swapper functionality
+    primitive_swap_subparser = subparsers.add_parser(
+        'primitive-swap',
+        description='Searches database for pipeline runs containing a primitive a swaps out primitive for a different given primitive')
+    #subparser arguments
+    primitive_swap_subparser.add_argument(
+        '--primitive_id',
+        help='The id of the primitive to swap out',
+        default=None,
+        type=str)
+    primitive_swap_subparser.add_argument(
+        '--limit_indeces',
+        help='Details for primitive swapping',
+        default=None)


 def modify_handler(arguments: argparse.Namespace, parser: argparse.ArgumentParser) -> None:
-    raise exceptions.NotImplementedError()
+    modify_type = arguments.modify_type
+    modify_type_parser = parser._subparsers._group_actions[0].choices[modify_type]
+    modify_arguments = modify_type_parser.parse_args(argv[1:])
+    modify_generator = ModifyGenerator(modify_type, modify_arguments, arguments.max-jobs)
+    #now run the enqueuer part
+    enqueuer = queue.JobEnqueuer(arguments)
+    enqueuer.enqueue(modify_generator)


 def configure_update_parser(parser: argparse.ArgumentParser) -> None:

diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py
index fc4c49a..06cc786 100644
--- a/experimenter/modify_generator.py
+++ b/experimenter/modify_generator.py
@@ -5,31 +5,33 @@ class ModifyGenerator:
     """ Generator to be used for creating modified pipelines based on existing
         pipelines in the database
     """
-    def __init__(self, *args):
+    def __init__(self, modify_type: str='random-seed', max_jobs: int=None, *args):
         self.args = args
         #initialize commonly used variables
-        self.modifier_type = args.modifier_type
-        self.num_pipelines_to_run = args.num_pipelines_to_run
+        self.modifier_type = modify_type
+        self.max_jobs = max_jobs
         self.num_complete = 0
         #run the query on initializing to define the query results
         self.query_results = self._query(self.modifier_type, self.args)

+
     def __next__(self):
         #iterate through query results
         for query_result in self.query_results:
-            pipeline, dataset, pipeline_run = query_result
             #iterate through modifier results
-            for new_pipeline, dataset in self._modify(self.args):
+            for job in self._modify(query_result, self.args):
                 self.num_complete += 1
                 #check to run until the generator stops iterating (if no input for num_pipelines_to_run)
-                if (self.num_pipelines_to_run):
-                    if (self.num_complete >= self.num_pipelines_to_run):
+                if (self.max_jobs):
+                    if (self.num_complete >= self.max_jobs):
                         raise StopIteration
-                return (new_pipeline, new_dataset)
+                return job
         raise StopIteration

+
     def __iter__(self):
         return self

+
     def _query(self, *args):
         if (self.modifier_type=='random-seed'):
             return query_on_seeds(args.pipeline_id, args.seed_limit, args.submitter):
         if (self.modifier_type=='swap-primitive'):
             return query_on_primitive(args.primitive_id, args.limit_indeces)
         else:
             raise ValueError("This type of modification is not yet an option")
+

-    def _modify(self,*args):
+    def _modify(self, query_args: dict, *args):
         if self.modifier_type=='random-seed':
-            self._modify_random_seed(args.random_seed, args.seed_limit)
+            return self._modify_random_seed(args.seed_limit, query_args)
         if self.modifier_type=='swap-primitive':
-            self._modify_swap_primitive(args.pipeline, args.primitive_loc, args.new_primitive)
+            return self._modify_swap_primitive(args.pipeline, args.primitive_loc, args.new_primitive)
         else:
             raise ValueError("This type of modification is not yet an option")

-    def _modify_random_seed(self, pipeline, dataset, args):
-        ##======== Create the random seed modifier
-        #yield random seeds and the pipeline/dataset to run on
+
+    def _modify_random_seed(self, seed_limit, query_args):
+        used_seeds = query_args.tested_seeds
+        num_run = len(used_seeds)
+        #run until the right number of seeds have been run
+        while (num_run < seed_limit):
+            new_seed = randint(1,100000)
+            if (new_seed in used_seeds):
+                continue
+            num_run += 1
+            used_seeds.append(new_seed)
+            #yield the necessary job requirements
+            yield query_args.pipeline, query_args.problem_ref, new_seed
+
diff --git a/experimenter/query.py b/experimenter/query.py
index 1e70e8e..93ebe1d 100644
--- a/experimenter/query.py
+++ b/experimenter/query.py
@@ -66,7 +66,7 @@ def query_on_seeds(pipeline_id: str=None, limit: int=None, submitter: str='byu')
         for (problem_id, dataset_id), random_seeds in results.items():
             if limit and len(random_seeds) > limit:
                 continue
-            yield pipeline.to_dict(), build_problem_reference(problem_id), random_seeds
+            yield {'pipeline': pipeline.to_dict(), 'problem_ref': build_problem_reference(problem_id), 'tested_seeds': random_seeds}

 def scan_pipeline_runs(pipeline_id, submitter=None):
     pipeline_run_search = Search(using=CONNECTION, index='pipeline_runs') \

From cdf770a90cb6b6c5fef51bfcf612b5ec5b33b642 Mon Sep 17 00:00:00 2001
From: Benson Manner
Date: Fri, 22 Jan 2021 09:56:25 -0700
Subject: [PATCH 03/44] Minor syntax fixes

---
 experimenter/cli.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/experimenter/cli.py b/experimenter/cli.py
index ce681f7..b6b6489 100644
--- a/experimenter/cli.py
+++ b/experimenter/cli.py
@@ -128,7 +128,7 @@ def configure_modify_parser(parser: argparse.ArgumentParser) -> None:
     #create the subparsers for the different types of modifications

     #seed swapper functionality
-    subparser = parser.add_subparsers(dest='modify_type')
+    subparsers = parser.add_subparsers(dest='modify_type')
     subparsers.required = True
     swap_seed_subparser = subparsers.add_parser(
         'random-seed',
@@ -137,7 +137,7 @@ def configure_modify_parser(parser: argparse.ArgumentParser) -> None:
     #subparser arguments
     swap_seed_subparser.add_argument(
         '--pipeline_id',
-        description='The pipeline id to search for in the query, if none, searches all pipelines',
+        help='The pipeline id to search for in the query, if none, searches all pipelines',
         default=None,
         type=str)

From b323c1f08ded784f3b80fd52fb44d9616a324d45 Mon Sep 17 00:00:00 2001
From: Benson Manner
Date: Tue, 26 Jan 2021 14:07:15 -0700
Subject: [PATCH 04/44] Unit test, and job maker

---
 experimenter/cli.py              | 12 ++++++++----
 experimenter/modify_generator.py | 29 +++++++++++++++++++++++++----
 experimenter/query.py            |  2 +-
 tests/test_modifier.py           | 32 ++++++++++++++++++++++++++++++++
 4 files changed, 66 insertions(+), 9 deletions(-)
 create mode 100644 tests/test_modifier.py

diff --git a/experimenter/cli.py b/experimenter/cli.py
index 9e80365..a6dc2dc 100644
--- a/experimenter/cli.py
+++ b/experimenter/cli.py
@@ -158,7 +158,7 @@ def configure_modify_parser(parser: argparse.ArgumentParser) -> None:
     #Primitive swapper functionality
     primitive_swap_subparser = subparsers.add_parser(
         'primitive-swap',
-        description='Searches database for pipeline runs containing a primitive a swaps out primitive for a different given primitive')
+        description='Searches database for pipeline runs containing a primitive and swaps out primitive for a different given primitive')
     #subparser arguments
     primitive_swap_subparser.add_argument(
         '--primitive_id',
@@ -169,16 +169,20 @@ def configure_modify_parser(parser: argparse.ArgumentParser) -> None:
         '--limit_indeces',
         help='Details for primitive swapping',
         default=None)
+    primitive_swap_subparser.add_argument(
+        '--swap_primitive_id',
+        help='The id of the primitive to swap in',
+        default=None
+        type=str)


 def modify_handler(arguments: argparse.Namespace, parser: argparse.ArgumentParser) -> None:
     modify_type = arguments.modify_type
     modify_type_parser = parser._subparsers._group_actions[0].choices[modify_type]
     modify_arguments = modify_type_parser.parse_args(argv[1:])
-    modify_generator = ModifyGenerator(modify_type, modify_arguments, arguments.max-jobs)
+    modify_generator = ModifyGenerator(modify_type, arguments.max-jobs, modify_arguments)
     #now run the enqueuer part
-    enqueuer = queue.JobEnqueuer(arguments)
-    enqueuer.enqueue(modify_generator)
+    queue.enqueue_jobs(modify_generator, arguments.queue_host, arguments.queue_port)


 def configure_update_parser(parser: argparse.ArgumentParser) -> None:

diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py
index 06cc786..3f9fd36 100644
--- a/experimenter/modify_generator.py
+++ b/experimenter/modify_generator.py
@@ -1,4 +1,6 @@
 from query import query_on_seeds, query_on_primitive
+from . import queue
+import d3m.metadata.pipeline


 class ModifyGenerator:
@@ -19,7 +21,8 @@ def __next__(self):
         #iterate through query results
         for query_result in self.query_results:
             #iterate through modifier results
-            for job in self._modify(query_result, self.args):
+            for job_args in self._modify(query_result, self.args):
+                job = queue.make_job(execute_pipeline_on_problem, job_args)
                 self.num_complete += 1
                 #check to run until the generator stops iterating (if no input for num_pipelines_to_run)
                 if (self.max_jobs):
@@ -31,7 +34,7 @@ def __next__(self):

     def __iter__(self):
         return self
-
+

     def _query(self, *args):
         if (self.modifier_type=='random-seed'):
@@ -46,10 +49,23 @@ def _modify(self, query_args: dict, *args):
         if self.modifier_type=='random-seed':
             return self._modify_random_seed(args.seed_limit, query_args)
         if self.modifier_type=='swap-primitive':
-            return self._modify_swap_primitive(args.pipeline, args.primitive_loc, args.new_primitive)
+            return self._modify_swap_primitive(args.swap_primitive_id, query_args)
         else:
             raise ValueError("This type of modification is not yet an option")

+
+    def _check_for_duplicates(self, pipeline_to_check, problem_ref_to_check):
+        """Pseudo function/method for duplicate checking - this is not complete
+        """
+        #create the pipeline to check for duplicates from the path
+        pipeline_object = d3m.metadata.pipeline.Pipeline.from_json(pipeline_to_check)
+        #query through the database for equal pipelines
+        similar_pipeline_runs_in_database = query.generate_similar_pipeline_runs()
+        for pipeline in similar_pipeline_runs_in_database:
+            if(pipeline_object.equals(pipeline)):
+                return True
+        return False
+
@@ -62,5 +78,10 @@ def _modify_random_seed(self, seed_limit, query_args):
             num_run += 1
             used_seeds.append(new_seed)
             #yield the necessary job requirements
-            yield query_args.pipeline, query_args.problem_ref, new_seed
+            yield query_args.pipeline, query_args.problem_ref, new_seed
+
+
+    def _modify_swap_primitive(self, swap_pipeline, query_args):
+        raise ValueError("No functionality for swapping primitives yet")
+
diff --git a/experimenter/query.py b/experimenter/query.py
index 93ebe1d..ec95f58 100644
--- a/experimenter/query.py
+++ b/experimenter/query.py
@@ -52,7 +52,7 @@ def query_on_primitive(primitive_id: str, limit_indexes=False):
             locs = locs[0]

         for problem_id in problem_ids:
-            yield pipeline.to_dict(), build_problem_reference(problem_id), locs, random_seeds
+            yield {'pipeline': pipeline.to_dict(), 'problem_ref': build_problem_reference(problem_id), 'location': locs, 'tested_seeds': random_seeds}

 def query_on_seeds(pipeline_id: str=None, limit: int=None, submitter: str='byu'):
     pipeline_search = Search(using=CONNECTION, index='pipelines')
diff --git a/tests/test_modifier.py b/tests/test_modifier.py
new file mode 100644
index 0000000..3b1fb9a
--- /dev/null
+++ b/tests/test_modifier.py
@@ -0,0 +1,32 @@
+import unittest
+from experimenter import modify_generator, queue, exceptions, utils
+from experimenter.query import query_on_seeds
+
+class ModifierTestCase(unittest.TestCase):
+
+    def test_seed_modifier(self):
+        #initialize the modifier with random-seed and a given max jobs
+        args = {'seed_limit':25, 'submitter':None, 'pipeline_id':None}
+        num_test = 21
+        modifier = modify_generator.ModifyGenerator('random-seed', num_test, {'seed_limit':25})
+        #start the counter to make sure there is the right number of jobs
+        counter = 0
+        seed_old = 12.1
+        #begin the test that the number of jobs is correct
+        for job in modifier:
+            counter += 1
+            _,_,seed_new = job
+            self.assertNotEqual(seed_old, seed_new)
+            seed_old = seed_new
+        self.assertEqual(counter,num_test)
+
+    def test_query_seeds(self):
+        args = {'seed_limit':25, 'submitter':'byu', 'pipeline_id':None}
+        query_results = query_on_seeds(args['pipeline_id'], args['seed_limit'], args['submitter'])
+        #test 10 query results
+        for i in range(10):
+            _,_,seed_list = next(query_results)
+            self.assertTrue(len(seed_list) <= args['seed_limit'])
+
+if __name__ == '__main__':
+    unittest.main()

From fdd281320c8bdb347ab0c4ceb709b194bfb50626 Mon Sep 17 00:00:00 2001
From: Joseph Clark
Date: Tue, 26 Jan 2021 15:38:18 -0700
Subject: [PATCH 05/44] functions to execute D3M runtime evaluate via cli

---
 experimenter/execute_pipeline_new.py | 149 +++++++++++++++++++++++++++
 1 file changed, 149 insertions(+)
 create mode 100644 experimenter/execute_pipeline_new.py

diff --git a/experimenter/execute_pipeline_new.py b/experimenter/execute_pipeline_new.py
new file mode 100644
index 0000000..b6eafbf
--- /dev/null
+++ b/experimenter/execute_pipeline_new.py
@@ -0,0 +1,149 @@
+import itertools as it
+import os
+from typing import Any, List, Tuple
+
+
+from d3m.metadata.pipeline import Pipeline
+from d3m import cli
+
+from data_preparation_pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_ID, SCORING_PIPELINE_ID
+
+def execute_pipeline_on_problem(
+    pipe: Pipeline,
+    problem: ProblemReference,
+    random_seed: int):
+    """ TODO: function one-liner
+
+    TODO doc
+    """
+    pipeline_path = pipeline.id
+    problem_path = problem.path
+    input_path = problem.dataset_doc_path
+    output_run_path = '-'
+    data_random_seed = random_seed
+
+    execute_pipeline_via_d3m_cli(pipeline_path, problem_path, input_path,
+        output_run_path, data_random_seed)
+
+def execute_pipeline_via_d3m_cli(pipeline_path: str,
+    problem_path: str,
+    input_path: str,
+    output_run_path: str,
+    data_random_seed: int,
+    data_params: List[Tuple[str,Any]] = None,
+    data_pipeline: str = K_FOLD_TABULAR_SPLIT_PIPELINE_ID,
+    scoring_pipeline: str = SCORING_PIPELINE_ID,
+    input_run_path: str = None,
+    metric: str = None,
+    scoring_params: List[Tuple[str,Any]] = None,
+    scores_path: str = None,
+    scoring_random_seed: int = None,
+    data_split_file_path: str = None):
+    """ TODO: function one-liner
+
+    TODO: function summary
+
+    # data_pipeline_path - 10 fold cross validation default
+
+    Required Arguments:
+    ---------------------------------
+    pipeline_path -- TODO: arg doc
+    problem_path -- TODO: arg doc
+    input_path -- TODO: arg doc
+    output_run_path -- TODO: arg doc
+    data_random_seed -- TODO: arg doc
+
+    Optional Arguments:
+    ---------------------------------
+    data_params -- TODO: arg doc
+    data_pipeline -- TODO: arg doc
+    scoring_pipeline -- TODO: arg doc
+    input_run_path -- TODO: arg doc
+    metric -- TODO: arg doc
+    scoring_params -- TODO: arg doc
+    scores_path -- TODO: arg doc
+    scoring_random_seed -- TODO: arg doc
+    data_split_file_path -- TODO: arg doc
+
+    Raises:
+    -------
+    ValueError: TODO: doc
+
+    Return:
+    -------
+    TODO: return doc
+    """
+    args = ['d3m', 'runtime', 'evaluate']
+
+    if (not os.path.isfile(pipeline_path)):
+        raise ValueError('\'pipeline_path\' param is not a file')
+
+    if (not os.path.isfile(problem_path)): # TODO: check for URI
+        raise ValueError('\'problem_path\' param is not a file')
+
+    if (not os.path.isfile(input_path)): # TODO: check for URI
+        raise ValueError('\'input_path\' param is not a file')
+
+    if (not isinstance(output_run_path, str) and output_run_path != '-'):
+        # TODO: how to check for nonexistent file? parse?
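+        # One possible answer to the TODO above (a sketch, standard library
+        # only): treat the value as valid when it is '-' (stdout) or when its
+        # parent directory exists and is writable, e.g.
+        #   out_dir = os.path.dirname(os.path.abspath(output_run_path)) or '.'
+        #   is_valid = output_run_path == '-' or os.access(out_dir, os.W_OK)
+        # The names out_dir and is_valid are illustrative only.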
+ raise ValueError('\'output_run_path\' param is not a valid value') + + if (not isinstance(data_random_seed, int)): + raise TypeError('\'{}\' param is not of type \'{}\''.format('data_random_seed','int')) + + if (input_run_path): + # TODO: input_run_path validation + pass + + args.extend(('--pipeline ', pipeline_path)) + args.extend(('--problem', problem_path)) + args.extend(('--input', input_path)) + args.extend(('--output-run', output_run_path)) + args.extend(('--data-random-seed', data_random_seed)) + + for data_param in data_params: + args.extend(('--data-param', data_param[0], data_param[1])) + + if (data_params): + if (not isinstance(data_params, List)): + raise TypeError('\'{}\' param is not of type \'{}\''.format('data_params','List')) + for data_param in data_params: + args.extend(('--data-param', data_param[0], data_param[1])) + + if (data_pipeline): + # TODO: how to check if data_pipeline is pipeline id? (guid?) + args.extend(('--data-pipeline', data_pipeline)) + + if (scoring_pipeline): + # TODO: how to check if scoring_pipeline is pipeline id? + args.extend(('--scoring-pipeline', scoring_pipeline)) + + if (metric): + # TODO: set of valid metric args? + args.extend(('--metric', metric)) + + if (scoring_params): + if (not isinstance(scoring_params, List)): + raise TypeError('\'{}\' param is not of type \'{}\''.format('scoring_params','List')) + for scoring_param in scoring_params: + args.extend(('--scoring-param', scoring_param[0], scoring_param[1])) + + if (scores_path): + # TODO: how to check for nonexistent file? parse? + args.extend(('--scores', scores_path)) + + if (scoring_random_seed): + if (not isinstance(scoring_random_seed, int)): + raise TypeError('\'{}\' param is not of type \'{}\''.format('scoring_random_seed','int')) + args.extend(('--scoring-random-seed', scoring_random_seed)) + + if (data_split_file_path): + if (not os.path.isfile(data_split_file_path)): + raise ValueError('\'data_split_file_path\' param is not a file') + args.extend(('--data-split-file', data_split_file_path)) + + cli.main(args) + +if __name__ == '__main__': + path = 'README.md' + execute_pipeline_via_d3m_cli(path,path,path,path,1,[(1,2),(3,4)]) From 275cbc6040f22fb678533f59996eb35f9b63362f Mon Sep 17 00:00:00 2001 From: Joseph Clark Date: Thu, 28 Jan 2021 14:41:26 -0700 Subject: [PATCH 06/44] added data_prep_pipelines --- .../3c11d171-e2ad-4d26-a034-04f3b062306c.yml | 82 +++++++++++ .../79ce71bd-db96-494b-a455-14f2e2ac5040.yml | 84 +++++++++++ .../9c18472e-fff7-4129-93f6-1ab996e82adb.yml | 84 +++++++++++ .../data_preparation_pipelines/__init__.py | 31 ++++ .../c8ed65df-aa68-4ee0-bbb5-c5f76a40bcf8.yml | 82 +++++++++++ .../f596cd77-25f8-4d4c-a350-bb30ab1e58f6.yml | 31 ++++ experimenter/execute_pipeline_new.py | 137 ++++++++++++------ experimenter/run_pipeline.py | 2 +- 8 files changed, 489 insertions(+), 44 deletions(-) create mode 100644 experimenter/data_preparation_pipelines/3c11d171-e2ad-4d26-a034-04f3b062306c.yml create mode 100644 experimenter/data_preparation_pipelines/79ce71bd-db96-494b-a455-14f2e2ac5040.yml create mode 100644 experimenter/data_preparation_pipelines/9c18472e-fff7-4129-93f6-1ab996e82adb.yml create mode 100644 experimenter/data_preparation_pipelines/__init__.py create mode 100644 experimenter/data_preparation_pipelines/c8ed65df-aa68-4ee0-bbb5-c5f76a40bcf8.yml create mode 100644 experimenter/data_preparation_pipelines/f596cd77-25f8-4d4c-a350-bb30ab1e58f6.yml diff --git a/experimenter/data_preparation_pipelines/3c11d171-e2ad-4d26-a034-04f3b062306c.yml 
b/experimenter/data_preparation_pipelines/3c11d171-e2ad-4d26-a034-04f3b062306c.yml new file mode 100644 index 0000000..695f53c --- /dev/null +++ b/experimenter/data_preparation_pipelines/3c11d171-e2ad-4d26-a034-04f3b062306c.yml @@ -0,0 +1,82 @@ +id: 3c11d171-e2ad-4d26-a034-04f3b062306c +schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json +source: + name: Mitar +created: "2018-07-28T01:24:39.642266Z" +name: Train-test split of tabular datasets +description: | + Train-test split of tabular datasets. +inputs: + - name: folds + - name: full dataset +outputs: + - name: train datasets + data: steps.0.produce + - name: test datasets + data: steps.2.produce + - name: score datasets + data: steps.1.produce +steps: + # Step 0. + - type: PRIMITIVE + primitive: + id: 3fcc6dc4-6681-4c86-948e-066d14e7d803 + version: 0.1.0 + python_path: d3m.primitives.evaluation.train_score_dataset_split.Common + name: Train-score tabular dataset splits + arguments: + inputs: + type: CONTAINER + data: inputs.0 + dataset: + type: CONTAINER + data: inputs.1 + outputs: + - id: produce + - id: produce_score_data + # Step 1. We redact privileged attributes for both score and test splits. + - type: PRIMITIVE + primitive: + id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 + version: 0.2.0 + python_path: d3m.primitives.evaluation.redact_columns.Common + name: Redact columns for evaluation + arguments: + inputs: + type: CONTAINER + data: steps.0.produce_score_data + outputs: + - id: produce + hyperparams: + semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/PrivilegedData + add_semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData + - https://metadata.datadrivendiscovery.org/types/MissingData + # Step 2. We further redact targets in test split. + - type: PRIMITIVE + primitive: + id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 + version: 0.2.0 + python_path: d3m.primitives.evaluation.redact_columns.Common + name: Redact columns for evaluation + arguments: + inputs: + type: CONTAINER + data: steps.1.produce + outputs: + - id: produce + hyperparams: + semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/TrueTarget + add_semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/RedactedTarget + - https://metadata.datadrivendiscovery.org/types/MissingData diff --git a/experimenter/data_preparation_pipelines/79ce71bd-db96-494b-a455-14f2e2ac5040.yml b/experimenter/data_preparation_pipelines/79ce71bd-db96-494b-a455-14f2e2ac5040.yml new file mode 100644 index 0000000..6a91f91 --- /dev/null +++ b/experimenter/data_preparation_pipelines/79ce71bd-db96-494b-a455-14f2e2ac5040.yml @@ -0,0 +1,84 @@ +id: 79ce71bd-db96-494b-a455-14f2e2ac5040 +schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json +source: + name: Mitar +created: "2018-10-26T00:48:08.341897Z" +name: No split of tabular datasets +description: | + A pipeline which splits a tabular dataset in a way that for all splits it + produces the same (full) dataset. It still redacts the test split. + Useful for unsupervised learning tasks. +inputs: + - name: folds + - name: full dataset +outputs: + - name: train datasets + data: steps.0.produce + - name: test datasets + data: steps.2.produce + - name: score datasets + data: steps.1.produce +steps: + # Step 0. 
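+  # (As the description above notes, the no-split primitive returns the full
+  # dataset for every fold, so the train, test, and score splits below all
+  # contain the same data.)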
+ - type: PRIMITIVE + primitive: + id: 48c683ad-da9e-48cf-b3a0-7394dba5e5d2 + version: 0.1.0 + python_path: d3m.primitives.evaluation.no_split_dataset_split.Common + name: No-split tabular dataset splits + arguments: + inputs: + type: CONTAINER + data: inputs.0 + dataset: + type: CONTAINER + data: inputs.1 + outputs: + - id: produce + - id: produce_score_data + # Step 1. We redact privileged attributes for both score and test splits. + - type: PRIMITIVE + primitive: + id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 + version: 0.2.0 + python_path: d3m.primitives.evaluation.redact_columns.Common + name: Redact columns for evaluation + arguments: + inputs: + type: CONTAINER + data: steps.0.produce_score_data + outputs: + - id: produce + hyperparams: + semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/PrivilegedData + add_semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData + - https://metadata.datadrivendiscovery.org/types/MissingData + # Step 2. We further redact targets in test split. + - type: PRIMITIVE + primitive: + id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 + version: 0.2.0 + python_path: d3m.primitives.evaluation.redact_columns.Common + name: Redact columns for evaluation + arguments: + inputs: + type: CONTAINER + data: steps.1.produce + outputs: + - id: produce + hyperparams: + semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/TrueTarget + add_semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/RedactedTarget + - https://metadata.datadrivendiscovery.org/types/MissingData diff --git a/experimenter/data_preparation_pipelines/9c18472e-fff7-4129-93f6-1ab996e82adb.yml b/experimenter/data_preparation_pipelines/9c18472e-fff7-4129-93f6-1ab996e82adb.yml new file mode 100644 index 0000000..80e2a2c --- /dev/null +++ b/experimenter/data_preparation_pipelines/9c18472e-fff7-4129-93f6-1ab996e82adb.yml @@ -0,0 +1,84 @@ +id: 9c18472e-fff7-4129-93f6-1ab996e82adb +schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json +source: + name: Mitar +created: "2018-10-27T01:30:10.245934Z" +name: Fixed split of tabular datasets +description: | + A pipeline which splits a tabular dataset in a way that uses for the test + (score) split a fixed list of primary index values or row indices of the main + resource to be used. +inputs: + - name: folds + - name: full dataset +outputs: + - name: train datasets + data: steps.0.produce + - name: test datasets + data: steps.2.produce + - name: score datasets + data: steps.1.produce +steps: + # Step 0. + - type: PRIMITIVE + primitive: + id: 1654f000-2178-4520-be4c-a95bc26b8d3a + version: 0.1.0 + python_path: d3m.primitives.evaluation.fixed_split_dataset_split.Commmon + name: Fixed split tabular dataset splits + arguments: + inputs: + type: CONTAINER + data: inputs.0 + dataset: + type: CONTAINER + data: inputs.1 + outputs: + - id: produce + - id: produce_score_data + # Step 1. We redact privileged attributes for both score and test splits. 
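+  # (Per the hyperparams below, redaction blanks the values of columns carrying
+  # the listed semantic types and re-tags them as redacted/missing, so the
+  # evaluation splits cannot leak privileged data or true targets.)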
+ - type: PRIMITIVE + primitive: + id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 + version: 0.2.0 + python_path: d3m.primitives.evaluation.redact_columns.Common + name: Redact columns for evaluation + arguments: + inputs: + type: CONTAINER + data: steps.0.produce_score_data + outputs: + - id: produce + hyperparams: + semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/PrivilegedData + add_semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData + - https://metadata.datadrivendiscovery.org/types/MissingData + # Step 2. We further redact targets in test split. + - type: PRIMITIVE + primitive: + id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 + version: 0.2.0 + python_path: d3m.primitives.evaluation.redact_columns.Common + name: Redact columns for evaluation + arguments: + inputs: + type: CONTAINER + data: steps.1.produce + outputs: + - id: produce + hyperparams: + semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/TrueTarget + add_semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/RedactedTarget + - https://metadata.datadrivendiscovery.org/types/MissingData diff --git a/experimenter/data_preparation_pipelines/__init__.py b/experimenter/data_preparation_pipelines/__init__.py new file mode 100644 index 0000000..4b52dec --- /dev/null +++ b/experimenter/data_preparation_pipelines/__init__.py @@ -0,0 +1,31 @@ +import os.path + +SCORING_PIPELINE_ID = 'f596cd77-25f8-4d4c-a350-bb30ab1e58f6' +SCORING_PIPELINE_PATH = os.path.join( + os.path.dirname(__file__), SCORING_PIPELINE_ID + '.yml' +) +assert os.path.exists(SCORING_PIPELINE_PATH) + +NO_SPLIT_TABULAR_SPLIT_PIPELINE_ID = '79ce71bd-db96-494b-a455-14f2e2ac5040' +NO_SPLIT_TABULAR_SPLIT_PIPELINE_PATH = os.path.join( + os.path.dirname(__file__), NO_SPLIT_TABULAR_SPLIT_PIPELINE_ID + '.yml' +) +assert os.path.exists(NO_SPLIT_TABULAR_SPLIT_PIPELINE_PATH) + +FIXED_SPLIT_TABULAR_SPLIT_PIPELINE_ID = '9c18472e-fff7-4129-93f6-1ab996e82adb' +FIXED_SPLIT_TABULAR_SPLIT_PIPELINE_PATH = os.path.join( + os.path.dirname(__file__), FIXED_SPLIT_TABULAR_SPLIT_PIPELINE_ID + '.yml' +) +assert os.path.exists(FIXED_SPLIT_TABULAR_SPLIT_PIPELINE_PATH) + +TRAIN_TEST_TABULAR_SPLIT_PIPELINE_ID = '3c11d171-e2ad-4d26-a034-04f3b062306c' +TRAIN_TEST_TABULAR_SPLIT_PIPELINE_PATH = os.path.join( + os.path.dirname(__file__), TRAIN_TEST_TABULAR_SPLIT_PIPELINE_ID + '.yml' +) +assert os.path.exists(TRAIN_TEST_TABULAR_SPLIT_PIPELINE_PATH) + +K_FOLD_TABULAR_SPLIT_PIPELINE_ID = 'c8ed65df-aa68-4ee0-bbb5-c5f76a40bcf8' +K_FOLD_TABULAR_SPLIT_PIPELINE_PATH = os.path.join( + os.path.dirname(__file__), K_FOLD_TABULAR_SPLIT_PIPELINE_ID + '.yml' +) +assert os.path.exists(K_FOLD_TABULAR_SPLIT_PIPELINE_PATH) diff --git a/experimenter/data_preparation_pipelines/c8ed65df-aa68-4ee0-bbb5-c5f76a40bcf8.yml b/experimenter/data_preparation_pipelines/c8ed65df-aa68-4ee0-bbb5-c5f76a40bcf8.yml new file mode 100644 index 0000000..91f14f2 --- /dev/null +++ b/experimenter/data_preparation_pipelines/c8ed65df-aa68-4ee0-bbb5-c5f76a40bcf8.yml @@ -0,0 +1,82 @@ +id: c8ed65df-aa68-4ee0-bbb5-c5f76a40bcf8 +schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json +source: + name: Mitar +created: "2018-07-27T19:39:00.676949Z" +name: K-fold split of tabular datasets +description: | + K-fold split of tabular datasets for cross-validation. 
+inputs: + - name: folds + - name: full dataset +outputs: + - name: train datasets + data: steps.0.produce + - name: test datasets + data: steps.2.produce + - name: score datasets + data: steps.1.produce +steps: + # Step 0. + - type: PRIMITIVE + primitive: + id: bfedaf3a-6dd0-4a83-ad83-3a50fe882bf8 + version: 0.1.0 + python_path: d3m.primitives.evaluation.kfold_dataset_split.Common + name: K-fold cross-validation tabular dataset splits + arguments: + inputs: + type: CONTAINER + data: inputs.0 + dataset: + type: CONTAINER + data: inputs.1 + outputs: + - id: produce + - id: produce_score_data + # Step 1. We redact privileged attributes for both score and test splits. + - type: PRIMITIVE + primitive: + id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 + version: 0.2.0 + python_path: d3m.primitives.evaluation.redact_columns.Common + name: Redact columns for evaluation + arguments: + inputs: + type: CONTAINER + data: steps.0.produce_score_data + outputs: + - id: produce + hyperparams: + semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/PrivilegedData + add_semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData + - https://metadata.datadrivendiscovery.org/types/MissingData + # Step 2. We further redact targets in test split. + - type: PRIMITIVE + primitive: + id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 + version: 0.2.0 + python_path: d3m.primitives.evaluation.redact_columns.Common + name: Redact columns for evaluation + arguments: + inputs: + type: CONTAINER + data: steps.1.produce + outputs: + - id: produce + hyperparams: + semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/TrueTarget + add_semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/RedactedTarget + - https://metadata.datadrivendiscovery.org/types/MissingData diff --git a/experimenter/data_preparation_pipelines/f596cd77-25f8-4d4c-a350-bb30ab1e58f6.yml b/experimenter/data_preparation_pipelines/f596cd77-25f8-4d4c-a350-bb30ab1e58f6.yml new file mode 100644 index 0000000..e95ecd5 --- /dev/null +++ b/experimenter/data_preparation_pipelines/f596cd77-25f8-4d4c-a350-bb30ab1e58f6.yml @@ -0,0 +1,31 @@ +id: f596cd77-25f8-4d4c-a350-bb30ab1e58f6 +schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json +source: + name: Mitar +created: "2020-04-18T11:42:44.138742Z" +name: Scoring pipeline +description: |- + A general scoring pipeline. +inputs: + - name: predictions + - name: score dataset +outputs: + - name: scores + data: steps.0.produce +steps: + # Step 0. 
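+  # (The compute_scores primitive below compares the predictions input against
+  # the score dataset and produces the scores output, using the metrics it is
+  # given.)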
+  - type: PRIMITIVE
+    primitive:
+      id: 799802fb-2e11-4ab7-9c5e-dda09eb52a70
+      version: 0.5.0
+      python_path: d3m.primitives.evaluation.compute_scores.Core
+      name: Compute scores given the metrics to use
+    arguments:
+      inputs:
+        type: CONTAINER
+        data: inputs.0
+      score_dataset:
+        type: CONTAINER
+        data: inputs.1
+    outputs:
+      - id: produce
diff --git a/experimenter/execute_pipeline_new.py b/experimenter/execute_pipeline_new.py
index b6eafbf..e6d1c7d 100644
--- a/experimenter/execute_pipeline_new.py
+++ b/experimenter/execute_pipeline_new.py
@@ -1,7 +1,8 @@
 import itertools as it
 import os
-from typing import Any, List, Tuple

+from typing import Any, List, Tuple
+from uuid import UUID

 from d3m.metadata.pipeline import Pipeline
 from d3m import cli
@@ -25,20 +26,20 @@ def execute_pipeline_on_problem(
     execute_pipeline_via_d3m_cli(pipeline_path, problem_path, input_path,
         output_run_path, data_random_seed)

-def execute_pipeline_via_d3m_cli(pipeline_path: str,
-    problem_path: str,
-    input_path: str,
-    output_run_path: str,
+def execute_pipeline_via_d3m_cli(pipeline: str,
+    problem: str,
+    input: str,
+    output_run: str,
     data_random_seed: int,
     data_params: List[Tuple[str,Any]] = None,
     data_pipeline: str = K_FOLD_TABULAR_SPLIT_PIPELINE_ID,
     scoring_pipeline: str = SCORING_PIPELINE_ID,
-    input_run_path: str = None,
+    input_run: str = None,
     metric: str = None,
     scoring_params: List[Tuple[str,Any]] = None,
-    scores_path: str = None,
+    scores: str = None,
     scoring_random_seed: int = None,
-    data_split_file_path: str = None):
+    data_split_file: str = None):
     """ TODO: function one-liner

     TODO: function summary

     # data_pipeline_path - 10 fold cross validation default

     Required Arguments:
     ---------------------------------
-    pipeline_path -- TODO: arg doc
-    problem_path -- TODO: arg doc
-    input_path -- TODO: arg doc
-    output_run_path -- TODO: arg doc
+    pipeline -- TODO: arg doc
+    problem -- TODO: arg doc
+    input -- TODO: arg doc
+    output_run -- TODO: arg doc
     data_random_seed -- TODO: arg doc

     Optional Arguments:
     ---------------------------------
     data_params -- TODO: arg doc
     data_pipeline -- TODO: arg doc
     scoring_pipeline -- TODO: arg doc
-    input_run_path -- TODO: arg doc
+    input_run -- TODO: arg doc
     metric -- TODO: arg doc
     scoring_params -- TODO: arg doc
-    scores_path -- TODO: arg doc
+    scores -- TODO: arg doc
     scoring_random_seed -- TODO: arg doc
-    data_split_file_path -- TODO: arg doc
+    data_split_file -- TODO: arg doc

     Raises:
     -------
+    TypeError: TODO: doc
     ValueError: TODO: doc

     Return:
     -------
     TODO: return doc
     """
     args = ['d3m', 'runtime', 'evaluate']

-    if (not os.path.isfile(pipeline_path)):
-        raise ValueError('\'pipeline_path\' param is not a file')
+    if (not isinstance(pipeline, str)):
+        raise TypeError('\'{}\' param not of type \'{}\''.format('pipeline', 'str'))

-    if (not os.path.isfile(problem_path)): # TODO: check for URI
-        raise ValueError('\'problem_path\' param is not a file')
+    if (not isinstance(problem, str)):
+        raise TypeError('\'{}\' param not of type \'{}\''.format('problem', 'str'))

-    if (not os.path.isfile(input_path)): # TODO: check for URI
-        raise ValueError('\'input_path\' param is not a file')
+    if (not isinstance(input, str)):
+        raise TypeError('\'{}\' param not of type \'{}\''.format('input', 'str'))

-    if (not isinstance(output_run_path, str) and output_run_path != '-'):
-        # TODO: how to check for nonexistent file? parse?
-        raise ValueError('\'output_run_path\' param is not a valid value')
+    if (not isinstance(output_run, str)):
+        raise TypeError('\'{}\' param not of type \'{}\''.format('output_run', 'str'))

     if (not isinstance(data_random_seed, int)):
-        raise TypeError('\'{}\' param is not of type \'{}\''.format('data_random_seed','int'))
+        raise TypeError('\'{}\' param not of type \'{}\''.format('data_random_seed','int'))

-    if (input_run_path):
-        # TODO: input_run_path validation
-        pass
+    if (not os.path.isfile(pipeline) and not is_valid_uuid(pipeline)):
+        raise ValueError('\'{}\' param not a file path'.format('pipeline'))

-    args.extend(('--pipeline ', pipeline_path))
-    args.extend(('--problem', problem_path))
-    args.extend(('--input', input_path))
-    args.extend(('--output-run', output_run_path))
-    args.extend(('--data-random-seed', data_random_seed))
+    if (not os.path.isfile(problem)): # TODO: check for URI
+        raise ValueError('\'{}\' param not a file path'.format('problem'))

-    for data_param in data_params:
-        args.extend(('--data-param', data_param[0], data_param[1]))
+    if (not os.path.isfile(input)): # TODO: check for URI
+        raise ValueError('\'{}\' param not a file path'.format('input'))
+
+    if (output_run != '-' and not os.path.isdir(os.path.dirname(os.path.abspath(output_run)))):
+        # output_run must be '-' (stdout) or a file path in an existing directory
+        raise ValueError('\'{}\' param invalid: {{file_path, \'-\'}}'.format('output_run'))
+
+    args.extend(('--pipeline', pipeline))
+    args.extend(('--problem', problem))
+    args.extend(('--input', input))
+    args.extend(('--output-run', output_run))
+    args.extend(('--data-random-seed', data_random_seed))
+
+    if (input_run):
+        if (not isinstance(input_run, str)):
+            raise TypeError('\'{}\' param not of type \'{}\''.format('input_run','str'))
+        if (not os.path.isfile(input_run) and input_run != '-'):
+            raise ValueError('\'{}\' param invalid: {{file_path, \'-\'}}'.format('input_run'))
+        # TODO: input_run validation
+        pass

     if (data_params):
         if (not isinstance(data_params, List)):
-            raise TypeError('\'{}\' param is not of type \'{}\''.format('data_params','List'))
+            raise TypeError('\'{}\' param not of type \'{}\''.format('data_params','List'))
         for data_param in data_params:
             args.extend(('--data-param', data_param[0], data_param[1]))

     if (data_pipeline):
-        # TODO: how to check if data_pipeline is pipeline id? (guid?)
+        if (not isinstance(data_pipeline, str)):
+            raise TypeError('\'{}\' param not of type \'{}\''.format('data_pipeline','str'))
+        if (not os.path.isfile(data_pipeline) and not is_valid_uuid(data_pipeline)):
+            raise ValueError('\'{}\' param not a file path'.format('data_pipeline'))
         args.extend(('--data-pipeline', data_pipeline))

     if (scoring_pipeline):
-        # TODO: how to check if scoring_pipeline is pipeline id?
+        if (not isinstance(scoring_pipeline, str)):
+            raise TypeError('\'{}\' param not of type \'{}\''.format('scoring_pipeline','str'))
+        if (not os.path.isfile(scoring_pipeline) not is_valid_uuid(scoring_pipeline)):
+            raise ValueError('\'{}\' param not a file path'.format('scoring_pipeline'))
         args.extend(('--scoring-pipeline', scoring_pipeline))

     if (metric):
+        if (not isinstance(metric, str)):
+            raise TypeError('\'{}\' param not of type \'{}\''.format('metric','str'))
         # TODO: set of valid metric args?
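+        # One hedged way to resolve the TODO above: validate against d3m's
+        # PerformanceMetric enum (assuming the installed d3m version exposes
+        # it), e.g.
+        #   from d3m.metadata.problem import PerformanceMetric
+        #   valid_metrics = {m.name for m in PerformanceMetric}  # 'ACCURACY', 'F1_MACRO', ...
+        #   if metric not in valid_metrics: raise ValueError(...)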
         args.extend(('--metric', metric))

     if (scoring_params):
         if (not isinstance(scoring_params, List)):
-            raise TypeError('\'{}\' param is not of type \'{}\''.format('scoring_params','List'))
+            raise TypeError('\'{}\' param not of type \'{}\''.format('scoring_params','List'))
         for scoring_param in scoring_params:
             args.extend(('--scoring-param', scoring_param[0], scoring_param[1]))

-    if (scores_path):
+    if (scores):
         # TODO: how to check for nonexistent file? parse?
-        args.extend(('--scores', scores_path))
+        args.extend(('--scores', scores))

     if (scoring_random_seed):
         if (not isinstance(scoring_random_seed, int)):
-            raise TypeError('\'{}\' param is not of type \'{}\''.format('scoring_random_seed','int'))
+            raise TypeError('\'{}\' param not of type \'{}\''.format('scoring_random_seed','int'))
         args.extend(('--scoring-random-seed', scoring_random_seed))

-    if (data_split_file_path):
-        if (not os.path.isfile(data_split_file_path)):
-            raise ValueError('\'data_split_file_path\' param is not a file')
-        args.extend(('--data-split-file', data_split_file_path))
+    if (data_split_file):
+        if (not isinstance(data_split_file, str)):
+            raise TypeError('\'{}\' param not of type \'{}\''.format('data_split_file','str'))
+        if (not os.path.isfile(data_split_file)):
+            raise ValueError('\'{}\' param invalid value: {{file_path, \'-\'}}'.format('data_split_file'))
+        args.extend(('--data-split-file', data_split_file))

     cli.main(args)

+def is_valid_uuid(uuid_to_test: str, version=4):
+    """
+    Check if uuid_to_test is a valid UUID.
+
+    Parameters
+    ----------
+    uuid_to_test : str
+    version : {1, 2, 3, 4}
+
+    Returns
+    -------
+    `True` if uuid_to_test is a valid UUID, otherwise `False`.
+
+    Examples
+    --------
+    >>> is_valid_uuid('c9bf9e57-1685-4c89-bafb-ff5af830be8a')
+    True
+    >>> is_valid_uuid('c9bf9e58')
+    False
+    """
+
+    try:
+        uuid_obj = UUID(uuid_to_test, version=version)
+    except Exception:
+        return False
+    return str(uuid_obj) == uuid_to_test
+
 if __name__ == '__main__':
     path = 'README.md'
     execute_pipeline_via_d3m_cli(path,path,path,path,1,[(1,2),(3,4)])
diff --git a/experimenter/run_pipeline.py b/experimenter/run_pipeline.py
index 6d722b0..c6752f0 100644
--- a/experimenter/run_pipeline.py
+++ b/experimenter/run_pipeline.py
@@ -57,7 +57,7 @@ def run(self, pipeline: Pipeline, metric_names: list = None) -> list:
         similar to that of `_evaluate` in the Runtime code. The aforementioned
         function does not allow for returning the data, so it did not fit in
         the workflow.
-        
+
         :param pipeline: the pipeline object to be run OR the path to the
         pipeline file to be used
         :param metric_names: if provided, the pipeline will be scored against
         this custom

From 9b029881946a03f65c5da9b9d4f249b9d78986c5 Mon Sep 17 00:00:00 2001
From: Joseph Clark
Date: Thu, 28 Jan 2021 15:19:19 -0700
Subject: [PATCH 07/44] renamed execute --> evaluate.
 created blank file for new implementation of a problem
---
 .../{execute_pipeline_new.py => evaluate_pipeline_new.py} | 8 ++++----
 experimenter/problem_new.py                               | 0
 2 files changed, 4 insertions(+), 4 deletions(-)
 rename experimenter/{execute_pipeline_new.py => evaluate_pipeline_new.py} (96%)
 create mode 100644 experimenter/problem_new.py

diff --git a/experimenter/execute_pipeline_new.py b/experimenter/evaluate_pipeline_new.py
similarity index 96%
rename from experimenter/execute_pipeline_new.py
rename to experimenter/evaluate_pipeline_new.py
index e6d1c7d..2fc4eb3 100644
--- a/experimenter/execute_pipeline_new.py
+++ b/experimenter/evaluate_pipeline_new.py
@@ -9,7 +9,7 @@
 from data_preparation_pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_ID, SCORING_PIPELINE_ID

-def execute_pipeline_on_problem(
+def evaluate_pipeline_on_problem(
     pipe: Pipeline,
     problem: ProblemReference,
     random_seed: int):
@@ -23,10 +23,10 @@
     output_run_path = '-'
     data_random_seed = random_seed

-    execute_pipeline_via_d3m_cli(pipeline_path, problem_path, input_path,
+    evaluate_pipeline_via_d3m_cli(pipeline_path, problem_path, input_path,
         output_run_path, data_random_seed)

-def execute_pipeline_via_d3m_cli(pipeline: str,
+def evaluate_pipeline_via_d3m_cli(pipeline: str,
     problem: str,
@@ -197,4 +197,4 @@

 if __name__ == '__main__':
     path = 'README.md'
-    execute_pipeline_via_d3m_cli(path,path,path,path,1,[(1,2),(3,4)])
+    evaluate_pipeline_via_d3m_cli(path,path,path,path,1,[(1,2),(3,4)])
diff --git a/experimenter/problem_new.py b/experimenter/problem_new.py
new file mode 100644
index 0000000..e69de29

From a0c9991feb6660718c6759448e3f5aff15629ef9 Mon Sep 17 00:00:00 2001
From: Benson Manner
Date: Fri, 29 Jan 2021 10:31:51 -0700
Subject: [PATCH 08/44] Bug fixes and queueing

---
 experimenter/cli.py              |  7 +++----
 experimenter/modify_generator.py | 16 ++++++++--------
 experimenter/query.py            |  3 ++-
 experimenter/utils.py            | 22 +++++++++++++++++++++-
 4 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/experimenter/cli.py b/experimenter/cli.py
index a6dc2dc..cb83e9d 100644
--- a/experimenter/cli.py
+++ b/experimenter/cli.py
@@ -2,6 +2,7 @@
 import typing

 from experimenter import exceptions, queue
+from experimenter.modify_generator import ModifyGenerator


 def main(argv: typing.Sequence) -> None:
@@ -172,15 +173,13 @@ def configure_modify_parser(parser: argparse.ArgumentParser) -> None:
     primitive_swap_subparser.add_argument(
         '--swap_primitive_id',
         help='The id of the primitive to swap in',
-        default=None
+        default=None,
         type=str)


 def modify_handler(arguments: argparse.Namespace, parser: argparse.ArgumentParser) -> None:
     modify_type = arguments.modify_type
-    modify_type_parser = parser._subparsers._group_actions[0].choices[modify_type]
-    modify_arguments = modify_type_parser.parse_args(argv[1:])
-    modify_generator = ModifyGenerator(modify_type, arguments.max-jobs, modify_arguments)
+    modify_generator = ModifyGenerator(modify_type, arguments.max_jobs, arguments)
     #now run the enqueuer part
     queue.enqueue_jobs(modify_generator, arguments.queue_host, arguments.queue_port)

diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py
index 3f9fd36..f344581 100644
--- a/experimenter/modify_generator.py
+++ b/experimenter/modify_generator.py
@@ -1,5 +1,5 @@
-from query import query_on_seeds, query_on_primitive
-from . import queue
+from experimenter.query import query_on_seeds, query_on_primitive
+from experimenter import queue
+from random import randint
 import d3m.metadata.pipeline


@@ -7,14 +7,14 @@ class ModifyGenerator:
     """ Generator to be used for creating modified pipelines based on existing
         pipelines in the database
     """
-    def __init__(self, modify_type: str='random-seed', max_jobs: int=None, *args):
+    def __init__(self, modify_type: str='random-seed', max_jobs: int=None, args=None):
         self.args = args
         #initialize commonly used variables
         self.modifier_type = modify_type
         self.max_jobs = max_jobs
         self.num_complete = 0
         #run the query on initializing to define the query results
-        self.query_results = self._query(self.modifier_type, self.args)
+        self.query_results = self._query(self.args)


     def __next__(self):
@@ -36,20 +36,20 @@ def __iter__(self):
         return self


-    def _query(self, *args):
+    def _query(self, args):
         if (self.modifier_type=='random-seed'):
-            return query_on_seeds(args.pipeline_id, args.seed_limit, args.submitter):
+            return query_on_seeds(args.pipeline_id, args.seed_limit, args.submitter)
         if (self.modifier_type=='swap-primitive'):
             return query_on_primitive(args.primitive_id, args.limit_indeces)
         else:
             raise ValueError("This type of modification is not yet an option")


-    def _modify(self, query_args: dict, *args):
+    def _modify(self, query_args: dict, args):
         if self.modifier_type=='random-seed':
             return self._modify_random_seed(args.seed_limit, query_args)
         if self.modifier_type=='swap-primitive':
-            return self._modify_swap_primitive(args.swap_primitive_id ,query_args)
+            return self._modify_swap_primitive(args.swap_primitive_id, query_args)
         else:
             raise ValueError("This type of modification is not yet an option")

diff --git a/experimenter/query.py b/experimenter/query.py
index ec95f58..14d368a 100644
--- a/experimenter/query.py
+++ b/experimenter/query.py
@@ -1,7 +1,7 @@
 from elasticsearch import Elasticsearch
 from elasticsearch_dsl import Search, Q
 from tqdm import tqdm
-from experimenter.utils import get_problem_parent_dir, build_problem_reference
+from experimenter.utils import build_problem_reference
@@ -52,6 +52,7 @@ def query_on_primitive(primitive_id: str, limit_indexes=False):
         locs = locs[0]

     for problem_id in problem_ids:
+
         yield {'pipeline': pipeline.to_dict(), 'problem_ref': build_problem_reference(problem_id), 'location': locs, 'tested_seeds': random_seeds}

diff --git a/experimenter/utils.py b/experimenter/utils.py
index ed429af..b8a1bea 100644
--- a/experimenter/utils.py
+++ b/experimenter/utils.py
@@ -10,7 +10,8 @@
 import docker

 from d3m.metadata import problem as problem_module
-
+from experimenter.problem import ProblemReference
+from d3m.utils import get_datasets_and_problems

 DEFAULT_DATASET_DIR = "/datasets/training_datasets/LL0"

@@ -29,6 +30,25 @@ def get_dataset_doc_path(
     )


+def get_problem_parent_dir(problem_id: str):
+    """
+    Getting the problem parent directory based on the given problem id and
+    DEFAULT_DATASET_DIR
+    """
+    dir_name = problem_id
+    if any([x in problem_id for x in {'_problem', '_solution', '_dataset'}]):
+        dir_name = '_'.join(problem_id.split('_')[:-1])
+    path_chunks = get_problem_path(problem_id).split('/')
+    return '/'.join(path_chunks[:path_chunks.index(dir_name)+1])
+
+
+def build_problem_reference(problem_id: str):
+    parent_dir = get_problem_parent_dir(problem_id)
+    dir_id = parent_dir.split('/')[-1]
+    enclosing_dir = '/'.join(parent_dir.split('/')[:-1])
+    return ProblemReference(dir_id, '', enclosing_dir)
+
+
 def get_dataset_doc(dataset_name: str, dataset_dir: str = DEFAULT_DATASET_DIR) -> dict:
     """
     Gets a dataset doc from a path and loads it

From a4cc817faa8b082d5813ee11118c0602daafa4c1 Mon Sep 17 00:00:00 2001
From: Benson Manner
Date: Fri, 12 Feb 2021 07:06:46 -0700
Subject: [PATCH 09/44] Syntax fixes and return paths from query

---
 experimenter/evaluate_pipeline_new.py | 21 ++------------------
 experimenter/modify_generator.py      |  6 +++---
 experimenter/query.py                 | 14 +++++++-------
 3 files changed, 12 insertions(+), 29 deletions(-)

diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/evaluate_pipeline_new.py
index 2fc4eb3..39312da 100644
--- a/experimenter/evaluate_pipeline_new.py
+++ b/experimenter/evaluate_pipeline_new.py
@@ -7,24 +7,7 @@
 from d3m.metadata.pipeline import Pipeline
 from d3m import cli

-from data_preparation_pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_ID, SCORING_PIPELINE_ID
-
-def evaluate_pipeline_on_problem(
-    pipe: Pipeline,
-    problem: ProblemReference,
-    random_seed: int):
-    """ TODO: function one-liner
-
-    TODO doc
-    """
-    pipeline_path = pipeline.id
-    problem_path = problem.path
-    input_path = problem.dataset_doc_path
-    output_run_path = '-'
-    data_random_seed = random_seed
-
-    evaluate_pipeline_via_d3m_cli(pipeline_path, problem_path, input_path,
-        output_run_path, data_random_seed)
+from experimenter.data_preparation_pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_ID, SCORING_PIPELINE_ID

 def evaluate_pipeline_via_d3m_cli(pipeline: str,
     problem: str,
@@ -134,7 +117,7 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str,
     if (scoring_pipeline):
         if (not isinstance(scoring_pipeline, str)):
             raise TypeError('\'{}\' param not of type \'{}\''.format('scoring_pipeline','str'))
-        if (not os.path.isfile(scoring_pipeline) not is_valid_uuid(scoring_pipeline)):
+        if (not os.path.isfile(scoring_pipeline) and not is_valid_uuid(scoring_pipeline)):
            raise ValueError('\'{}\' param not a file path'.format('scoring_pipeline'))
         args.extend(('--scoring-pipeline', scoring_pipeline))

diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py
index f344581..4a6c1ec 100644
--- a/experimenter/modify_generator.py
+++ b/experimenter/modify_generator.py
@@ -1,7 +1,7 @@
 from experimenter.query import query_on_seeds, query_on_primitive
 from experimenter import queue
 import d3m.metadata.pipeline
-
+from experimenter.evaluate_pipeline_new import evaluate_pipeline_via_d3m_cli as evaluate_pipeline

 class ModifyGenerator:
     """ Generator to be used for creating modified pipelines based on existing
@@ -22,7 +22,7 @@ def __next__(self):
             #iterate through modifier results
             for job_args in self._modify(query_result, self.args):
-                job = queue.make_job(execute_pipeline_on_problem, job_args)
+                job = queue.make_job(evaluate_pipeline, job_args)
@@ -78,7 +78,7 @@ def _modify_random_seed(self, seed_limit, query_args):
             used_seeds.append(new_seed)
             #yield the necessary job requirements
-            yield query_args.pipeline, query_args.problem_ref, new_seed
+            yield query_args.pipeline, query_args.problem_path, query_args.dataset_doc_path, '-', new_seed

diff --git a/experimenter/query.py b/experimenter/query.py
14d368a..41f48cc 100644 --- a/experimenter/query.py +++ b/experimenter/query.py @@ -1,7 +1,7 @@ from elasticsearch import Elasticsearch from elasticsearch_dsl import Search, Q from tqdm import tqdm -from experimenter.utils import build_problem_reference +from experimenter.utils import get_problem_path, get_dataset_doc_path HOST = 'https://metalearning.datadrivendiscovery.org/es' CONNECTION = Elasticsearch(hosts=[HOST], timeout=300) @@ -43,7 +43,7 @@ def query_on_primitive(primitive_id: str, limit_indexes=False): pipeline_search = Search(using=CONNECTION, index='pipelines').query(nested_query) for pipeline in pipeline_search.scan(): - problem_ids, random_seeds = scan_pipeline_runs(pipeline.id) + results = scan_pipeline_runs(pipeline.id) locs = [i for i, step in enumerate(pipeline.steps) if primitive_id == step.primitive.id] if limit_indexes == 'last': @@ -51,9 +51,9 @@ def query_on_primitive(primitive_id: str, limit_indexes=False): elif limit_indexes == 'first': locs = locs[0] - for problem_id in problem_ids: + for (problem_id, dataset_name), random_seeds in results.items(): - yield {'pipeline': pipeline.to_dict(), 'problem_ref': build_problem_reference(problem_id), 'location': locs, 'tested_seeds': random_seeds} + yield {'pipeline': pipeline.id, 'problem_path': get_problem_path(problem_id), 'location': locs, 'dataset_doc_path': get_dataset_doc_path(dataset_name), 'tested_seeds': random_seeds} def query_on_seeds(pipeline_id: str=None, limit: int=None, submitter: str='byu'): pipeline_search = Search(using=CONNECTION, index='pipelines') @@ -64,10 +64,10 @@ def query_on_seeds(pipeline_id: str=None, limit: int=None, submitter: str='byu') for pipeline in pipeline_search.scan(): results = scan_pipeline_runs(pipeline.id, submitter) - for (problem_id, dataset_id), random_seeds in results.items(): + for (problem_id, dataset_name), random_seeds in results.items(): if limit and len(random_seeds) > limit: continue - yield {'pipeline': pipeline.to_dict(), 'problem_ref': build_problem_reference(problem_id), 'tested_seeds': random_seeds} + yield {'pipeline': pipeline.id, 'problem_path': get_problem_path(problem_id), 'dataset_doc_path': get_dataset_doc_path(dataset_name), 'tested_seeds': random_seeds} def scan_pipeline_runs(pipeline_id, submitter=None): pipeline_run_search = Search(using=CONNECTION, index='pipeline_runs') \ @@ -80,7 +80,7 @@ def scan_pipeline_runs(pipeline_id, submitter=None): results = dict() for pipeline_run in pipeline_run_search.scan(): for dataset in pipeline_run.datasets: - dataset_prob_tuple = (pipeline_run.problem.id, dataset.id) + dataset_prob_tuple = (pipeline_run.problem.id, dataset.name) results[dataset_prob_tuple] = results.get(dataset_prob_tuple, set()) results[dataset_prob_tuple].add(pipeline_run.random_seed) return results From 991ad7fa7d612754a90645dcabe5a3e368a3e9b7 Mon Sep 17 00:00:00 2001 From: Benson Manner Date: Fri, 12 Feb 2021 08:32:42 -0700 Subject: [PATCH 10/44] setup.py updates --- experimenter/query.py | 1 - setup.py | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/experimenter/query.py b/experimenter/query.py index 41f48cc..1cfd3a2 100644 --- a/experimenter/query.py +++ b/experimenter/query.py @@ -1,6 +1,5 @@ from elasticsearch import Elasticsearch from elasticsearch_dsl import Search, Q -from tqdm import tqdm from experimenter.utils import get_problem_path, get_dataset_doc_path HOST = 'https://metalearning.datadrivendiscovery.org/es' diff --git a/setup.py b/setup.py index b22b2b4..7f1457f 100644 --- a/setup.py +++ b/setup.py @@ -14,5 +14,7 
@@ 'docker>=4.4.0<4.5.0', 'redis>=3.5.0<3.6.0', 'rq>=1.7.0<1.8.0', + 'elasticsearch_dsl>=7.0.0<8.0.0', + 'elastcisearch>=7.0.0<8.0.0', ], ) From 83cceb461597c7b37906cd7b33843d25d799276d Mon Sep 17 00:00:00 2001 From: Joseph Clark Date: Thu, 18 Feb 2021 17:10:19 -0700 Subject: [PATCH 11/44] added function to save pipeline_run docs to DB. --- ...ipeline_new.py => execute_pipeline_new.py} | 83 +++++++++++++++---- 1 file changed, 66 insertions(+), 17 deletions(-) rename experimenter/{evaluate_pipeline_new.py => execute_pipeline_new.py} (77%) diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/execute_pipeline_new.py similarity index 77% rename from experimenter/evaluate_pipeline_new.py rename to experimenter/execute_pipeline_new.py index 2fc4eb3..9afbb79 100644 --- a/experimenter/evaluate_pipeline_new.py +++ b/experimenter/execute_pipeline_new.py @@ -1,4 +1,5 @@ import itertools as it +import json import os from typing import Any, List, Tuple @@ -7,24 +8,76 @@ from d3m.metadata.pipeline import Pipeline from d3m import cli +from experimenter.databases.d3m_mtl import D3MMtLDB from data_preparation_pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_ID, SCORING_PIPELINE_ID -def evaluate_pipeline_on_problem( - pipe: Pipeline, - problem: ProblemReference, - random_seed: int): - """ TODO: function one-liner +def save_pipeline_run_to_d3m_db(pipeline_run_path: str): + """ TODO: one-liner + + TODO: description + + Required Arguments: + --------------------------------- + pipeline_run_path -- path to pipeline_run doc to save + + Optional Arguments: + --------------------------------- + + Raises: + --------------------------------- + TODO + + Returns: + --------------------------------- + TODO + + """ + d3m_db = D3MMtLDB() + + pipeline_run_save_response = D3MMtLDB().save_pipeline_run(pipeline_run_path) - TODO doc +def evaluate_pipeline_on_problem(pipeline_path: str, + problem_path: str, + input_path: str, + data_random_seed: int): + """ TODO: one-liner + + TODO: description + + Required Arguments: + --------------------------------- + pipeline_path -- path to pipeline doc + problem_path -- path to problem doc + input_path -- path to input full data + data_random_seed -- random seed to be used for data preparation + + Optional Arguments: + --------------------------------- + + Raises: + --------------------------------- + TODO + + Returns: + --------------------------------- + TODO """ - pipeline_path = pipeline.id - problem_path = problem.path - input_path = problem.dataset_doc_path - output_run_path = '-' - data_random_seed = random_seed + output_run_path = [] + + with open(pipeline_path, 'r') as pipeline: + output_run_path.append(pipeline['properties']['digest']) + with open(problem_path, 'r') as problem: + output_run_path.append(problem['properties']['digest']) + with open(input_path, 'r') as input_f: + output_run_path.append(input_f['properties']['digest']) - evaluate_pipeline_via_d3m_cli(pipeline_path, problem_path, input_path, - output_run_path, data_random_seed) + output_run_path = '_'.join(output_run_path) + + execute_pipeline_via_d3m_cli(pipeline=pipeline_path, problem=problem_path, + input=input_path, output_run=output_run_path, + data_random_seed=data_random_seed) + + save_pipeline_run_to_d3m_db(output_run_path) def evaluate_pipeline_via_d3m_cli(pipeline: str, problem: str, @@ -194,7 +247,3 @@ def is_valid_uuid(uuid_to_test: str, version=4): except Exception: return False return str(uuid_obj) == uuid_to_test - -if __name__ == '__main__': - path = 'README.md' - 
evaluate_pipeline_via_d3m_cli(path,path,path,path,1,[(1,2),(3,4)]) From 5c4bb388cbaa127ba0f3f5fdff3ae2b6b3a5626a Mon Sep 17 00:00:00 2001 From: Joseph Clark Date: Fri, 19 Feb 2021 12:54:11 -0700 Subject: [PATCH 12/44] updated documentation --- experimenter/execute_pipeline_new.py | 201 ++++++++++++++++----------- 1 file changed, 117 insertions(+), 84 deletions(-) diff --git a/experimenter/execute_pipeline_new.py b/experimenter/execute_pipeline_new.py index 9afbb79..e9b02ec 100644 --- a/experimenter/execute_pipeline_new.py +++ b/experimenter/execute_pipeline_new.py @@ -12,55 +12,54 @@ from data_preparation_pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_ID, SCORING_PIPELINE_ID def save_pipeline_run_to_d3m_db(pipeline_run_path: str): - """ TODO: one-liner + """ + Saves a pipeline run document to the d3m database. - TODO: description - - Required Arguments: - --------------------------------- - pipeline_run_path -- path to pipeline_run doc to save - - Optional Arguments: - --------------------------------- + Parameters + ---------- + pipeline_run_path : path_like str + path to pipeline_run document - Raises: - --------------------------------- - TODO - Returns: - --------------------------------- + ---------- TODO + Raises: + ---------- + TODO """ d3m_db = D3MMtLDB() - - pipeline_run_save_response = D3MMtLDB().save_pipeline_run(pipeline_run_path) + return D3MMtLDB().save_pipeline_run(pipeline_run_path) def evaluate_pipeline_on_problem(pipeline_path: str, problem_path: str, input_path: str, data_random_seed: int): - """ TODO: one-liner - - TODO: description + """ + Evaluate pipeline on problem. + A less verbose form of running d3m's runtime cli 'evaluate' command. + See 'evaluate_pipeline_via_d3m_cli' for more options for running + the 'evaluate' command. - Required Arguments: - --------------------------------- - pipeline_path -- path to pipeline doc - problem_path -- path to problem doc - input_path -- path to input full data - data_random_seed -- random seed to be used for data preparation + Parameters + ---------- + pipeline_path : path_like str + path to pipeline doc + problem_path : path_like str + path to problem doc + input_path : path_like str + path to input full data + data_random_seed : int + random seed to be used for data preparation - Optional Arguments: - --------------------------------- + Returns: + ---------- + None Raises: --------------------------------- - TODO - - Returns: - --------------------------------- - TODO + OSError + when a file cannot be opened """ output_run_path = [] @@ -71,7 +70,7 @@ def evaluate_pipeline_on_problem(pipeline_path: str, with open(input_path, 'r') as input_f: output_run_path.append(input_f['properties']['digest']) - output_run_path = '_'.join(output_run_path) + output_run_path = '_'.join(output_run_path) + '.json' execute_pipeline_via_d3m_cli(pipeline=pipeline_path, problem=problem_path, input=input_path, output_run=output_run_path, @@ -93,40 +92,77 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, scores: str = None, scoring_random_seed: int = None, data_split_file: str = None): - """ TODO: function one-liner - - TODO: function summary - - # data_pipeline_path - 10 fold cross validation default + """ + Evaluate pipeline on problem using d3m's runtime cli. + Wrapper function to execute d3m's runtime cli 'evaluate' command. + Arguments mirror the same arguments using the cli. 
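For orientation, the wrapper assembles the same argument vector it later hands to cli.main, i.e. the d3m runtime 'evaluate' command. A minimal equivalent invocation might look like the sketch below; the document paths are hypothetical placeholders, and the flags shown are exactly the ones the function body builds up:

    from d3m import cli

    # Hypothetical paths; the flags mirror the ones assembled by the wrapper.
    # CLI argument values must be strings, including the random seed.
    cli.main([
        'd3m', 'runtime', 'evaluate',
        '--pipeline', 'pipelines/example_pipeline.json',
        '--problem', 'datasets/example/problemDoc.json',
        '--input', 'datasets/example/datasetDoc.json',
        '--output-run', 'pipeline_run.yml',
        '--data-random-seed', '42',
    ])
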
- Required Arguments: - --------------------------------- - pipeline -- TODO: arg doc - problem -- TODO: arg doc - input -- TODO: arg doc - output_run -- TODO: arg doc - data_random_seed -- TODO: arg doc - - Optional Arguments: - --------------------------------- - data_params -- TODO: arg doc - data_pipeline -- TODO: arg doc - scoring_pipeline -- TODO: arg doc - input_run -- TODO: arg doc - metric -- TODO: arg doc - scoring_params -- TODO: arg doc - scores -- TODO: arg doc - scoring_random_seed -- TODO: arg doc - data_split_file -- TODO: arg doc - - Raises: - ------- - TypeError: TODO: doc - ValueError: TODO: doc + Parameters + ---------- + pipeline : path_like or uuid4 str + path to pipeline doc or pipeline ID + problem : path_like str + path to problem doc + input : path_like str + path to input full data + output_run : path_like str or '-' + path where pipeline_run doc + will be saved. + use '-' for stdin + data_random_seed : int + random seed to use for + data preparation + data_params : list of tuples, optional + hyper-parameter names and values + for data preparation. + None by default + data_pipeline : path_like str or uuid4 str, optional + path to data preparation pipeline file + or pipeline ID. + K_FOLD_TABULAR_SPLIT_PIPELINE_ID by default + scoring_pipeline : path_like str or uuid4 str, optional + path to scoring pipeline file + or pipeline ID. + SCORING_PIPELINE_ID by default + input_run : path_like str or '-', optional + path to pipeline_run file + with configuration. + use '-' for stdin. + None by default + metric : str, optional + metric to use. + Metric from problem by default + scoring_params : list of tuples, optional + hyper-parameter names and values + for scoring pipeline. + None by default + scores : path_like str, optional + path to save scores. + None by default + scoring_random_seed : int, optional + random seed to use for scoring. + None by default + data_split_file : path_like str, optional + reads the split file and populates + "primary_index_values" hyper-parameter + for data preparation pipeline with values + from the "d3mIndex" column corresponding + to the test data. + use '-' for stdin. + None by default Return: ------- - TODO: return doc + None + + Raises: + ------- + TypeError + when parameter value has + incorrect type + ValueError + when parameter value is + invalid """ args = ['d3m', 'runtime', 'evaluate'] @@ -146,7 +182,7 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, raise TypeError('\'{}\' param not of type \'{}\''.format('data_random_seed','int')) if (not os.path.isfile(pipeline) and not is_valid_uuid(pipeline)): - raise ValueError('\'{}\' param not a file path'.format('pipeline')) + raise ValueError('\'{}\' param not a file path or pipeline ID'.format('pipeline')) if (not os.path.isfile(problem)): # TODO: check for URI raise ValueError('\'{}\' param not a file path'.format('problem')) @@ -154,9 +190,6 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, if (not os.path.isfile(input)): # TODO: check for URI raise ValueError('\'{}\' param not a file path'.format('input')) - if (output_run != '-'): # TODO: output_run value check. how to check for nonexistent file? parse? 
- raise ValueError('\'{}\' param invalid: {\'-\'}'.format('output_run')) - args.extend(('--pipeline ', pipeline)) args.extend(('--problem', problem)) args.extend(('--input', input)) @@ -181,14 +214,14 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, if (not isinstance(data_pipeline, str)): raise TypeError('\'{}\' param not of type \'{}\''.format('data_pipeline','str')) if (not os.path.isfile(data_pipeline) and not is_valid_uuid(data_pipeline)): - raise ValueError('\'{}\' param not a file path'.format('data_pipeline')) + raise ValueError('\'{}\' param not a file path or pipeline ID'.format('data_pipeline')) args.extend(('--data-pipeline', data_pipeline)) if (scoring_pipeline): if (not isinstance(scoring_pipeline, str)): raise TypeError('\'{}\' param not of type \'{}\''.format('scoring_pipeline','str')) if (not os.path.isfile(scoring_pipeline) not is_valid_uuid(scoring_pipeline)): - raise ValueError('\'{}\' param not a file path'.format('scoring_pipeline')) + raise ValueError('\'{}\' param not a file path or pipeline ID'.format('scoring_pipeline')) args.extend(('--scoring-pipeline', scoring_pipeline)) if (metric): @@ -204,7 +237,6 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, args.extend(('--scoring-param', scoring_param[0], scoring_param[1])) if (scores): - # TODO: how to check for nonexistent file? parse? args.extend(('--scores', scores_path)) if (scoring_random_seed): @@ -215,7 +247,7 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, if (data_split_file): if (not isinstance(data_split_file, str)): raise TypeError('\'{}\' param not of type \'{}\''.format('data_split_file','str')) - if (not os.path.isfile(data_split_file)): + if (data_split_file != '-' and not os.path.isfile(data_split_file)): raise ValueError('\'{}\' param invalid value: {file_path, \'-\'}'.format('data_split_file')) args.extend(('--data-split-file', data_split_file)) @@ -225,25 +257,26 @@ def is_valid_uuid(uuid_to_test: str, version=4): """ Check if uuid_to_test is a valid UUID. - Parameters - ---------- + Parmaters + ------- uuid_to_test : str + str to test if valid uuid version : {1, 2, 3, 4} - + version of uuid for which to test + Returns ------- - `True` if uuid_to_test is a valid UUID, otherwise `False`. - - Examples - -------- - >>> is_valid_uuid('c9bf9e57-1685-4c89-bafb-ff5af830be8a') - True - >>> is_valid_uuid('c9bf9e58') - False + bool + `True` if uuid_to_test is a valid UUID, + otherwise `False` + + Raises: + ------- + TypeError + when str is not valid uuid """ - try: uuid_obj = UUID(uuid_to_test, version=version) - except Exception: + except TypeError: return False return str(uuid_obj) == uuid_to_test From 2635fddc906580bd573d25631e31f6d4cd997ca0 Mon Sep 17 00:00:00 2001 From: Benson Manner Date: Fri, 19 Feb 2021 14:06:37 -0700 Subject: [PATCH 13/44] setup.py add dependencies --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 7f1457f..7d11df2 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ 'docker>=4.4.0<4.5.0', 'redis>=3.5.0<3.6.0', 'rq>=1.7.0<1.8.0', - 'elasticsearch_dsl>=7.0.0<8.0.0', - 'elastcisearch>=7.0.0<8.0.0', + 'elasticsearch>=7.0.0<8.0.0', + 'elasticsearch_dsl>=7.0.0<8.0.0' ], ) From ba5089af57fd1c8b56e25ba88f55981b69a7a03f Mon Sep 17 00:00:00 2001 From: Joseph Clark Date: Fri, 19 Feb 2021 14:16:17 -0700 Subject: [PATCH 14/44] fixed condition typo. 
renamed file --- .../{execute_pipeline_new.py => evaluate_pipeline_new.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename experimenter/{execute_pipeline_new.py => evaluate_pipeline_new.py} (99%) diff --git a/experimenter/execute_pipeline_new.py b/experimenter/evaluate_pipeline_new.py similarity index 99% rename from experimenter/execute_pipeline_new.py rename to experimenter/evaluate_pipeline_new.py index e9b02ec..93909f1 100644 --- a/experimenter/execute_pipeline_new.py +++ b/experimenter/evaluate_pipeline_new.py @@ -220,7 +220,7 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, if (scoring_pipeline): if (not isinstance(scoring_pipeline, str)): raise TypeError('\'{}\' param not of type \'{}\''.format('scoring_pipeline','str')) - if (not os.path.isfile(scoring_pipeline) not is_valid_uuid(scoring_pipeline)): + if (not os.path.isfile(scoring_pipeline) and not is_valid_uuid(scoring_pipeline)): raise ValueError('\'{}\' param not a file path or pipeline ID'.format('scoring_pipeline')) args.extend(('--scoring-pipeline', scoring_pipeline)) From 62d639ea0aa24d43effc6a0a919ec9bc59f29a36 Mon Sep 17 00:00:00 2001 From: Benson Manner Date: Fri, 19 Feb 2021 14:30:42 -0700 Subject: [PATCH 15/44] Update job maker in the seed swap functionality --- experimenter/modify_generator.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py index 4a6c1ec..5942802 100644 --- a/experimenter/modify_generator.py +++ b/experimenter/modify_generator.py @@ -1,7 +1,7 @@ from experimenter.query import query_on_seeds, query_on_primitive from experimenter import queue import d3m.metadata.pipeline -from experimenter.evaluate_pipeline_new import evaluate_pipeline_via_d3m_cli as evaluate_pipeline +from experimenter.evaluate_pipeline_new import evalute_pipeline_on_problem as evaluate_pipeline class ModifyGenerator: """ Generator to be used for creating modified pipelines based on existing @@ -21,8 +21,12 @@ def __next__(self): #iterate through query results for query_result in self.query_results: #iterate through modifier results - for job_args in self._modify(query_result, self.args): - job = queue.make_job(evaluate_pipeline, jobs_args) + for pipeline_path, problem_path, dataset_doc_path, seed in self._modify(query_result, self.args): + job = queue.make_job(evaluate_pipeline, + pipeline_path=pipeline_path, + problem_path=problem_path, + input_path=dataset_doc_path, + data_random_seed=seed) self.num_complete += 1 #check to run until the generator stops iterating (if no input for num_pipelines_to_run) if (self.max_jobs): @@ -78,7 +82,7 @@ def _modify_random_seed(self, seed_limit, query_args): num_run += 1 used_seeds.append(new_seed) #yield the necessary job requirements - yield query_args.pipeline, query_args.problem_path, query_args.dataset_doc_path, '-', new_seed + yield query_args.pipeline, query_args.problem_path, query_args.dataset_doc_path, new_seed def _modify_swap_primitive(self, swap_pipeline, query_args): From 37d94bc9c3348d5a9c1500d8c4ee8227666c32ea Mon Sep 17 00:00:00 2001 From: Joseph Clark Date: Mon, 22 Feb 2021 12:40:45 -0700 Subject: [PATCH 16/44] implemented review suggestions --- .../3c11d171-e2ad-4d26-a034-04f3b062306c.yml | 82 --------- .../79ce71bd-db96-494b-a455-14f2e2ac5040.yml | 84 --------- .../9c18472e-fff7-4129-93f6-1ab996e82adb.yml | 84 --------- .../data_preparation_pipelines/__init__.py | 31 ---- .../c8ed65df-aa68-4ee0-bbb5-c5f76a40bcf8.yml | 82 --------- 
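Stepping back to the job maker just updated in modify_generator.py: the seed-swap path fans each query result out into one evaluation job per previously untested random seed. Stripped of the queue plumbing, the fan-out inside _modify_random_seed amounts to roughly the sketch below (fresh_seeds is an illustrative name, and the randint bounds are an assumption; the real range sits outside the visible hunks):

    from random import randint

    def fresh_seeds(tested_seeds, seed_limit):
        # Draw random seeds until `seed_limit` distinct seeds, counting the
        # already-tested ones, have been covered; duplicates are skipped.
        used = set(tested_seeds)
        while len(used) < seed_limit:
            new_seed = randint(1, 2**32 - 1)  # illustrative bounds
            if new_seed in used:
                continue
            used.add(new_seed)
            yield new_seed

Each yielded seed is then paired with the query result's pipeline, problem path, and dataset doc path and wrapped via queue.make_job, as the hunk above shows.
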
.../f596cd77-25f8-4d4c-a350-bb30ab1e58f6.yml | 31 ---- experimenter/evaluate_pipeline_new.py | 165 +----------------- setup.py | 1 + 8 files changed, 9 insertions(+), 551 deletions(-) delete mode 100644 experimenter/data_preparation_pipelines/3c11d171-e2ad-4d26-a034-04f3b062306c.yml delete mode 100644 experimenter/data_preparation_pipelines/79ce71bd-db96-494b-a455-14f2e2ac5040.yml delete mode 100644 experimenter/data_preparation_pipelines/9c18472e-fff7-4129-93f6-1ab996e82adb.yml delete mode 100644 experimenter/data_preparation_pipelines/__init__.py delete mode 100644 experimenter/data_preparation_pipelines/c8ed65df-aa68-4ee0-bbb5-c5f76a40bcf8.yml delete mode 100644 experimenter/data_preparation_pipelines/f596cd77-25f8-4d4c-a350-bb30ab1e58f6.yml diff --git a/experimenter/data_preparation_pipelines/3c11d171-e2ad-4d26-a034-04f3b062306c.yml b/experimenter/data_preparation_pipelines/3c11d171-e2ad-4d26-a034-04f3b062306c.yml deleted file mode 100644 index 695f53c..0000000 --- a/experimenter/data_preparation_pipelines/3c11d171-e2ad-4d26-a034-04f3b062306c.yml +++ /dev/null @@ -1,82 +0,0 @@ -id: 3c11d171-e2ad-4d26-a034-04f3b062306c -schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json -source: - name: Mitar -created: "2018-07-28T01:24:39.642266Z" -name: Train-test split of tabular datasets -description: | - Train-test split of tabular datasets. -inputs: - - name: folds - - name: full dataset -outputs: - - name: train datasets - data: steps.0.produce - - name: test datasets - data: steps.2.produce - - name: score datasets - data: steps.1.produce -steps: - # Step 0. - - type: PRIMITIVE - primitive: - id: 3fcc6dc4-6681-4c86-948e-066d14e7d803 - version: 0.1.0 - python_path: d3m.primitives.evaluation.train_score_dataset_split.Common - name: Train-score tabular dataset splits - arguments: - inputs: - type: CONTAINER - data: inputs.0 - dataset: - type: CONTAINER - data: inputs.1 - outputs: - - id: produce - - id: produce_score_data - # Step 1. We redact privileged attributes for both score and test splits. - - type: PRIMITIVE - primitive: - id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 - version: 0.2.0 - python_path: d3m.primitives.evaluation.redact_columns.Common - name: Redact columns for evaluation - arguments: - inputs: - type: CONTAINER - data: steps.0.produce_score_data - outputs: - - id: produce - hyperparams: - semantic_types: - type: VALUE - data: - - https://metadata.datadrivendiscovery.org/types/PrivilegedData - add_semantic_types: - type: VALUE - data: - - https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData - - https://metadata.datadrivendiscovery.org/types/MissingData - # Step 2. We further redact targets in test split. 
- - type: PRIMITIVE - primitive: - id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 - version: 0.2.0 - python_path: d3m.primitives.evaluation.redact_columns.Common - name: Redact columns for evaluation - arguments: - inputs: - type: CONTAINER - data: steps.1.produce - outputs: - - id: produce - hyperparams: - semantic_types: - type: VALUE - data: - - https://metadata.datadrivendiscovery.org/types/TrueTarget - add_semantic_types: - type: VALUE - data: - - https://metadata.datadrivendiscovery.org/types/RedactedTarget - - https://metadata.datadrivendiscovery.org/types/MissingData diff --git a/experimenter/data_preparation_pipelines/79ce71bd-db96-494b-a455-14f2e2ac5040.yml b/experimenter/data_preparation_pipelines/79ce71bd-db96-494b-a455-14f2e2ac5040.yml deleted file mode 100644 index 6a91f91..0000000 --- a/experimenter/data_preparation_pipelines/79ce71bd-db96-494b-a455-14f2e2ac5040.yml +++ /dev/null @@ -1,84 +0,0 @@ -id: 79ce71bd-db96-494b-a455-14f2e2ac5040 -schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json -source: - name: Mitar -created: "2018-10-26T00:48:08.341897Z" -name: No split of tabular datasets -description: | - A pipeline which splits a tabular dataset in a way that for all splits it - produces the same (full) dataset. It still redacts the test split. - Useful for unsupervised learning tasks. -inputs: - - name: folds - - name: full dataset -outputs: - - name: train datasets - data: steps.0.produce - - name: test datasets - data: steps.2.produce - - name: score datasets - data: steps.1.produce -steps: - # Step 0. - - type: PRIMITIVE - primitive: - id: 48c683ad-da9e-48cf-b3a0-7394dba5e5d2 - version: 0.1.0 - python_path: d3m.primitives.evaluation.no_split_dataset_split.Common - name: No-split tabular dataset splits - arguments: - inputs: - type: CONTAINER - data: inputs.0 - dataset: - type: CONTAINER - data: inputs.1 - outputs: - - id: produce - - id: produce_score_data - # Step 1. We redact privileged attributes for both score and test splits. - - type: PRIMITIVE - primitive: - id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 - version: 0.2.0 - python_path: d3m.primitives.evaluation.redact_columns.Common - name: Redact columns for evaluation - arguments: - inputs: - type: CONTAINER - data: steps.0.produce_score_data - outputs: - - id: produce - hyperparams: - semantic_types: - type: VALUE - data: - - https://metadata.datadrivendiscovery.org/types/PrivilegedData - add_semantic_types: - type: VALUE - data: - - https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData - - https://metadata.datadrivendiscovery.org/types/MissingData - # Step 2. We further redact targets in test split. 
- - type: PRIMITIVE - primitive: - id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 - version: 0.2.0 - python_path: d3m.primitives.evaluation.redact_columns.Common - name: Redact columns for evaluation - arguments: - inputs: - type: CONTAINER - data: steps.1.produce - outputs: - - id: produce - hyperparams: - semantic_types: - type: VALUE - data: - - https://metadata.datadrivendiscovery.org/types/TrueTarget - add_semantic_types: - type: VALUE - data: - - https://metadata.datadrivendiscovery.org/types/RedactedTarget - - https://metadata.datadrivendiscovery.org/types/MissingData diff --git a/experimenter/data_preparation_pipelines/9c18472e-fff7-4129-93f6-1ab996e82adb.yml b/experimenter/data_preparation_pipelines/9c18472e-fff7-4129-93f6-1ab996e82adb.yml deleted file mode 100644 index 80e2a2c..0000000 --- a/experimenter/data_preparation_pipelines/9c18472e-fff7-4129-93f6-1ab996e82adb.yml +++ /dev/null @@ -1,84 +0,0 @@ -id: 9c18472e-fff7-4129-93f6-1ab996e82adb -schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json -source: - name: Mitar -created: "2018-10-27T01:30:10.245934Z" -name: Fixed split of tabular datasets -description: | - A pipeline which splits a tabular dataset in a way that uses for the test - (score) split a fixed list of primary index values or row indices of the main - resource to be used. -inputs: - - name: folds - - name: full dataset -outputs: - - name: train datasets - data: steps.0.produce - - name: test datasets - data: steps.2.produce - - name: score datasets - data: steps.1.produce -steps: - # Step 0. - - type: PRIMITIVE - primitive: - id: 1654f000-2178-4520-be4c-a95bc26b8d3a - version: 0.1.0 - python_path: d3m.primitives.evaluation.fixed_split_dataset_split.Commmon - name: Fixed split tabular dataset splits - arguments: - inputs: - type: CONTAINER - data: inputs.0 - dataset: - type: CONTAINER - data: inputs.1 - outputs: - - id: produce - - id: produce_score_data - # Step 1. We redact privileged attributes for both score and test splits. - - type: PRIMITIVE - primitive: - id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 - version: 0.2.0 - python_path: d3m.primitives.evaluation.redact_columns.Common - name: Redact columns for evaluation - arguments: - inputs: - type: CONTAINER - data: steps.0.produce_score_data - outputs: - - id: produce - hyperparams: - semantic_types: - type: VALUE - data: - - https://metadata.datadrivendiscovery.org/types/PrivilegedData - add_semantic_types: - type: VALUE - data: - - https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData - - https://metadata.datadrivendiscovery.org/types/MissingData - # Step 2. We further redact targets in test split. 
- - type: PRIMITIVE - primitive: - id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 - version: 0.2.0 - python_path: d3m.primitives.evaluation.redact_columns.Common - name: Redact columns for evaluation - arguments: - inputs: - type: CONTAINER - data: steps.1.produce - outputs: - - id: produce - hyperparams: - semantic_types: - type: VALUE - data: - - https://metadata.datadrivendiscovery.org/types/TrueTarget - add_semantic_types: - type: VALUE - data: - - https://metadata.datadrivendiscovery.org/types/RedactedTarget - - https://metadata.datadrivendiscovery.org/types/MissingData diff --git a/experimenter/data_preparation_pipelines/__init__.py b/experimenter/data_preparation_pipelines/__init__.py deleted file mode 100644 index 4b52dec..0000000 --- a/experimenter/data_preparation_pipelines/__init__.py +++ /dev/null @@ -1,31 +0,0 @@ -import os.path - -SCORING_PIPELINE_ID = 'f596cd77-25f8-4d4c-a350-bb30ab1e58f6' -SCORING_PIPELINE_PATH = os.path.join( - os.path.dirname(__file__), SCORING_PIPELINE_ID + '.yml' -) -assert os.path.exists(SCORING_PIPELINE_PATH) - -NO_SPLIT_TABULAR_SPLIT_PIPELINE_ID = '79ce71bd-db96-494b-a455-14f2e2ac5040' -NO_SPLIT_TABULAR_SPLIT_PIPELINE_PATH = os.path.join( - os.path.dirname(__file__), NO_SPLIT_TABULAR_SPLIT_PIPELINE_ID + '.yml' -) -assert os.path.exists(NO_SPLIT_TABULAR_SPLIT_PIPELINE_PATH) - -FIXED_SPLIT_TABULAR_SPLIT_PIPELINE_ID = '9c18472e-fff7-4129-93f6-1ab996e82adb' -FIXED_SPLIT_TABULAR_SPLIT_PIPELINE_PATH = os.path.join( - os.path.dirname(__file__), FIXED_SPLIT_TABULAR_SPLIT_PIPELINE_ID + '.yml' -) -assert os.path.exists(FIXED_SPLIT_TABULAR_SPLIT_PIPELINE_PATH) - -TRAIN_TEST_TABULAR_SPLIT_PIPELINE_ID = '3c11d171-e2ad-4d26-a034-04f3b062306c' -TRAIN_TEST_TABULAR_SPLIT_PIPELINE_PATH = os.path.join( - os.path.dirname(__file__), TRAIN_TEST_TABULAR_SPLIT_PIPELINE_ID + '.yml' -) -assert os.path.exists(TRAIN_TEST_TABULAR_SPLIT_PIPELINE_PATH) - -K_FOLD_TABULAR_SPLIT_PIPELINE_ID = 'c8ed65df-aa68-4ee0-bbb5-c5f76a40bcf8' -K_FOLD_TABULAR_SPLIT_PIPELINE_PATH = os.path.join( - os.path.dirname(__file__), K_FOLD_TABULAR_SPLIT_PIPELINE_ID + '.yml' -) -assert os.path.exists(K_FOLD_TABULAR_SPLIT_PIPELINE_PATH) diff --git a/experimenter/data_preparation_pipelines/c8ed65df-aa68-4ee0-bbb5-c5f76a40bcf8.yml b/experimenter/data_preparation_pipelines/c8ed65df-aa68-4ee0-bbb5-c5f76a40bcf8.yml deleted file mode 100644 index 91f14f2..0000000 --- a/experimenter/data_preparation_pipelines/c8ed65df-aa68-4ee0-bbb5-c5f76a40bcf8.yml +++ /dev/null @@ -1,82 +0,0 @@ -id: c8ed65df-aa68-4ee0-bbb5-c5f76a40bcf8 -schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json -source: - name: Mitar -created: "2018-07-27T19:39:00.676949Z" -name: K-fold split of tabular datasets -description: | - K-fold split of tabular datasets for cross-validation. -inputs: - - name: folds - - name: full dataset -outputs: - - name: train datasets - data: steps.0.produce - - name: test datasets - data: steps.2.produce - - name: score datasets - data: steps.1.produce -steps: - # Step 0. - - type: PRIMITIVE - primitive: - id: bfedaf3a-6dd0-4a83-ad83-3a50fe882bf8 - version: 0.1.0 - python_path: d3m.primitives.evaluation.kfold_dataset_split.Common - name: K-fold cross-validation tabular dataset splits - arguments: - inputs: - type: CONTAINER - data: inputs.0 - dataset: - type: CONTAINER - data: inputs.1 - outputs: - - id: produce - - id: produce_score_data - # Step 1. We redact privileged attributes for both score and test splits. 
- - type: PRIMITIVE - primitive: - id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 - version: 0.2.0 - python_path: d3m.primitives.evaluation.redact_columns.Common - name: Redact columns for evaluation - arguments: - inputs: - type: CONTAINER - data: steps.0.produce_score_data - outputs: - - id: produce - hyperparams: - semantic_types: - type: VALUE - data: - - https://metadata.datadrivendiscovery.org/types/PrivilegedData - add_semantic_types: - type: VALUE - data: - - https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData - - https://metadata.datadrivendiscovery.org/types/MissingData - # Step 2. We further redact targets in test split. - - type: PRIMITIVE - primitive: - id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 - version: 0.2.0 - python_path: d3m.primitives.evaluation.redact_columns.Common - name: Redact columns for evaluation - arguments: - inputs: - type: CONTAINER - data: steps.1.produce - outputs: - - id: produce - hyperparams: - semantic_types: - type: VALUE - data: - - https://metadata.datadrivendiscovery.org/types/TrueTarget - add_semantic_types: - type: VALUE - data: - - https://metadata.datadrivendiscovery.org/types/RedactedTarget - - https://metadata.datadrivendiscovery.org/types/MissingData diff --git a/experimenter/data_preparation_pipelines/f596cd77-25f8-4d4c-a350-bb30ab1e58f6.yml b/experimenter/data_preparation_pipelines/f596cd77-25f8-4d4c-a350-bb30ab1e58f6.yml deleted file mode 100644 index e95ecd5..0000000 --- a/experimenter/data_preparation_pipelines/f596cd77-25f8-4d4c-a350-bb30ab1e58f6.yml +++ /dev/null @@ -1,31 +0,0 @@ -id: f596cd77-25f8-4d4c-a350-bb30ab1e58f6 -schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json -source: - name: Mitar -created: "2020-04-18T11:42:44.138742Z" -name: Scoring pipeline -description: |- - A general scoring pipeline. -inputs: - - name: predictions - - name: score dataset -outputs: - - name: scores - data: steps.0.produce -steps: - # Step 0. 
- - type: PRIMITIVE - primitive: - id: 799802fb-2e11-4ab7-9c5e-dda09eb52a70 - version: 0.5.0 - python_path: d3m.primitives.evaluation.compute_scores.Core - name: Compute scores given the metrics to use - arguments: - inputs: - type: CONTAINER - data: inputs.0 - score_dataset: - type: CONTAINER - data: inputs.1 - outputs: - - id: produce diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/evaluate_pipeline_new.py index 93909f1..5e4d8e1 100644 --- a/experimenter/evaluate_pipeline_new.py +++ b/experimenter/evaluate_pipeline_new.py @@ -1,15 +1,15 @@ -import itertools as it +mport itertools as it import json import os from typing import Any, List, Tuple from uuid import UUID -from d3m.metadata.pipeline import Pipeline from d3m import cli +from d3m.d3m.contrib.pipelines import (K_FOLD_TABULAR_SPLIT_PIPELINE_ID, + SCORING_PIPELINE_ID) from experimenter.databases.d3m_mtl import D3MMtLDB -from data_preparation_pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_ID, SCORING_PIPELINE_ID def save_pipeline_run_to_d3m_db(pipeline_run_path: str): """ @@ -76,22 +76,11 @@ def evaluate_pipeline_on_problem(pipeline_path: str, input=input_path, output_run=output_run_path, data_random_seed=data_random_seed) - save_pipeline_run_to_d3m_db(output_run_path) - def evaluate_pipeline_via_d3m_cli(pipeline: str, problem: str, input: str, output_run: str, - data_random_seed: int, - data_params: List[Tuple[str,Any]] = None, - data_pipeline: str = K_FOLD_TABULAR_SPLIT_PIPELINE_ID, - scoring_pipeline: str = SCORING_PIPELINE_ID, - input_run: str = None, - metric: str = None, - scoring_params: List[Tuple[str,Any]] = None, - scores: str = None, - scoring_random_seed: int = None, - data_split_file: str = None): + data_random_seed: int): """ Evaluate pipeline on problem using d3m's runtime cli. Wrapper function to execute d3m's runtime cli 'evaluate' command. @@ -112,44 +101,6 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, data_random_seed : int random seed to use for data preparation - data_params : list of tuples, optional - hyper-parameter names and values - for data preparation. - None by default - data_pipeline : path_like str or uuid4 str, optional - path to data preparation pipeline file - or pipeline ID. - K_FOLD_TABULAR_SPLIT_PIPELINE_ID by default - scoring_pipeline : path_like str or uuid4 str, optional - path to scoring pipeline file - or pipeline ID. - SCORING_PIPELINE_ID by default - input_run : path_like str or '-', optional - path to pipeline_run file - with configuration. - use '-' for stdin. - None by default - metric : str, optional - metric to use. - Metric from problem by default - scoring_params : list of tuples, optional - hyper-parameter names and values - for scoring pipeline. - None by default - scores : path_like str, optional - path to save scores. - None by default - scoring_random_seed : int, optional - random seed to use for scoring. - None by default - data_split_file : path_like str, optional - reads the split file and populates - "primary_index_values" hyper-parameter - for data preparation pipeline with values - from the "d3mIndex" column corresponding - to the test data. - use '-' for stdin. 
- None by default Return: ------- @@ -157,37 +108,19 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, Raises: ------- - TypeError - when parameter value has - incorrect type ValueError when parameter value is invalid """ args = ['d3m', 'runtime', 'evaluate'] - if (not isinstance(pipeline, str)): - raise TypeError('\'{}\' param not of type \'{}\''.format('pipeline', 'str')) - - if (not isinstance(problem_path, str)): - raise TypeError('\'{}\' param not of type \'{}\''.format('problem', 'str')) - - if (not isinstance(input, str)): - raise TypeError('\'{}\' param not of type \'{}\''.format('input', 'str')) - - if (not isinstance(output_run, str)): - raise TypeError('\'{}\' param not of type \'{}\''.format('output_run', 'str')) - - if (not isinstance(data_random_seed, int)): - raise TypeError('\'{}\' param not of type \'{}\''.format('data_random_seed','int')) - - if (not os.path.isfile(pipeline) and not is_valid_uuid(pipeline)): + if (not os.path.isfile(pipeline)): raise ValueError('\'{}\' param not a file path or pipeline ID'.format('pipeline')) - if (not os.path.isfile(problem)): # TODO: check for URI + if (not os.path.isfile(problem)): raise ValueError('\'{}\' param not a file path'.format('problem')) - if (not os.path.isfile(input)): # TODO: check for URI + if (not os.path.isfile(input)): raise ValueError('\'{}\' param not a file path'.format('input')) args.extend(('--pipeline ', pipeline)) @@ -196,87 +129,5 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, args.extend(('--output-run', output_run_path)) args.extend(('--data-random-seed', data_random_seed)) - if (input_run): - if (not isinstance(input_run, str)): - raise TypeError('\'{}\' param not of type \'{}\''.format('input_run','str')) - if (not os.path.isfile(input_run) and input_run != '-'): - raise ValueError('\'{}\' param invalid: {file_path, \'-\'}'.format('input_run')) - # TODO: input_run validation - pass - - if (data_params): - if (not isinstance(data_params, List)): - raise TypeError('\'{}\' param not of type \'{}\''.format('data_params','List')) - for data_param in data_params: - args.extend(('--data-param', data_param[0], data_param[1])) - - if (data_pipeline): - if (not isinstance(data_pipeline, str)): - raise TypeError('\'{}\' param not of type \'{}\''.format('data_pipeline','str')) - if (not os.path.isfile(data_pipeline) and not is_valid_uuid(data_pipeline)): - raise ValueError('\'{}\' param not a file path or pipeline ID'.format('data_pipeline')) - args.extend(('--data-pipeline', data_pipeline)) - - if (scoring_pipeline): - if (not isinstance(scoring_pipeline, str)): - raise TypeError('\'{}\' param not of type \'{}\''.format('scoring_pipeline','str')) - if (not os.path.isfile(scoring_pipeline) and not is_valid_uuid(scoring_pipeline)): - raise ValueError('\'{}\' param not a file path or pipeline ID'.format('scoring_pipeline')) - args.extend(('--scoring-pipeline', scoring_pipeline)) - - if (metric): - if (not isinstance(metric, str)): - raise TypeError('\'{}\' param not of type \'{}\''.format('metric','str')) - # TODO: set of valid metric args? 
- args.extend(('--metric', metric)) - - if (scoring_params): - if (not isinstance(scoring_params, List)): - raise TypeError('\'{}\' param not of type \'{}\''.format('scoring_params','List')) - for scoring_param in scoring_params: - args.extend(('--scoring-param', scoring_param[0], scoring_param[1])) - - if (scores): - args.extend(('--scores', scores_path)) - - if (scoring_random_seed): - if (not isinstance(scoring_random_seed, int)): - raise TypeError('\'{}\' param not of type \'{}\''.format('scoring_random_seed','int')) - args.extend(('--scoring-random-seed', scoring_random_seed)) - - if (data_split_file): - if (not isinstance(data_split_file, str)): - raise TypeError('\'{}\' param not of type \'{}\''.format('data_split_file','str')) - if (data_split_file != '-' and not os.path.isfile(data_split_file)): - raise ValueError('\'{}\' param invalid value: {file_path, \'-\'}'.format('data_split_file')) - args.extend(('--data-split-file', data_split_file)) - cli.main(args) - -def is_valid_uuid(uuid_to_test: str, version=4): - """ - Check if uuid_to_test is a valid UUID. - - Parmaters - ------- - uuid_to_test : str - str to test if valid uuid - version : {1, 2, 3, 4} - version of uuid for which to test - - Returns - ------- - bool - `True` if uuid_to_test is a valid UUID, - otherwise `False` - - Raises: - ------- - TypeError - when str is not valid uuid - """ - try: - uuid_obj = UUID(uuid_to_test, version=version) - except TypeError: - return False - return str(uuid_obj) == uuid_to_test + save_pipeline_run_to_d3m_db(output_run_path) diff --git a/setup.py b/setup.py index b22b2b4..884e224 100644 --- a/setup.py +++ b/setup.py @@ -12,6 +12,7 @@ install_requires=[ 'd3m', # TODO: add version bounds 'docker>=4.4.0<4.5.0', + 'mypy==0.812', 'redis>=3.5.0<3.6.0', 'rq>=1.7.0<1.8.0', ], From d308e927df80decfc8c56ce4fd1b8df2241d9d8f Mon Sep 17 00:00:00 2001 From: Eric Manner Date: Tue, 23 Feb 2021 02:16:58 +0000 Subject: [PATCH 17/44] setup.py elasticsearch and working queue/enqueue --- experimenter/cli.py | 2 +- experimenter/config.py | 14 +++++++------- experimenter/evaluate_pipeline_new.py | 4 ++-- experimenter/modify_generator.py | 14 ++++++++------ experimenter/query.py | 8 ++++---- setup.py | 6 +++--- 6 files changed, 25 insertions(+), 23 deletions(-) diff --git a/experimenter/cli.py b/experimenter/cli.py index b7e02a4..5bbd356 100644 --- a/experimenter/cli.py +++ b/experimenter/cli.py @@ -189,7 +189,7 @@ def modify_handler(arguments: argparse.Namespace, parser: argparse.ArgumentParse modify_type = arguments.modify_type modify_generator = ModifyGenerator(modify_type, arguments.max_jobs, arguments) #now run the enqueuer part - queue.enqueue_jobs(modify_generator, arguments.queue_host, arguments.queue_port) + queue.enqueue_jobs(modify_generator, arguments.queue_host, arguments.queue_port,arguments.job_timeout) def configure_update_parser(parser: argparse.ArgumentParser) -> None: diff --git a/experimenter/config.py b/experimenter/config.py index 9150182..ccb884f 100644 --- a/experimenter/config.py +++ b/experimenter/config.py @@ -17,10 +17,10 @@ D3M_DB_SUBMITTER = os.getenv("D3M_DB_SUBMITTER") D3M_DB_TOKEN = os.getenv("D3M_DB_TOKEN") -try: - MONGO_HOST = os.environ["MONGO_HOST"] - MONGO_PORT = int(os.environ["MONGO_PORT"]) - REDIS_HOST = os.environ["REDIS_HOST"] - REDIS_PORT = int(os.environ["REDIS_PORT"]) -except Exception: - logger.exception("environment variables not set") +#try: +# MONGO_HOST = os.environ["MONGO_HOST"] +# MONGO_PORT = int(os.environ["MONGO_PORT"]) +# REDIS_HOST = 
os.environ["REDIS_HOST"] +# REDIS_PORT = int(os.environ["REDIS_PORT"]) +#except Exception: +# logger.exception("environment variables not set") diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/evaluate_pipeline_new.py index 93909f1..904c3e6 100644 --- a/experimenter/evaluate_pipeline_new.py +++ b/experimenter/evaluate_pipeline_new.py @@ -9,7 +9,7 @@ from d3m import cli from experimenter.databases.d3m_mtl import D3MMtLDB -from data_preparation_pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_ID, SCORING_PIPELINE_ID +from experimenter.data_preparation_pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_ID, SCORING_PIPELINE_ID def save_pipeline_run_to_d3m_db(pipeline_run_path: str): """ @@ -72,7 +72,7 @@ def evaluate_pipeline_on_problem(pipeline_path: str, output_run_path = '_'.join(output_run_path) + '.json' - execute_pipeline_via_d3m_cli(pipeline=pipeline_path, problem=problem_path, + evaluate_pipeline_via_d3m_cli(pipeline=pipeline_path, problem=problem_path, input=input_path, output_run=output_run_path, data_random_seed=data_random_seed) diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py index 5942802..ec5106c 100644 --- a/experimenter/modify_generator.py +++ b/experimenter/modify_generator.py @@ -1,7 +1,8 @@ from experimenter.query import query_on_seeds, query_on_primitive from experimenter import queue -import d3m.metadata.pipeline -from experimenter.evaluate_pipeline_new import evalute_pipeline_on_problem as evaluate_pipeline +import d3m.metadata.pipeline +from random import randint +from experimenter.evaluate_pipeline_new import evaluate_pipeline_on_problem as evaluate_pipeline class ModifyGenerator: """ Generator to be used for creating modified pipelines based on existing @@ -22,6 +23,7 @@ def __next__(self): for query_result in self.query_results: #iterate through modifier results for pipeline_path, problem_path, dataset_doc_path, seed in self._modify(query_result, self.args): + job = queue.make_job(evaluate_pipeline, pipeline_path=pipeline_path, problem_path=problem_path, @@ -30,7 +32,7 @@ def __next__(self): self.num_complete += 1 #check to run until the generator stops iterating (if no input for num_pipelines_to_run) if (self.max_jobs): - if (self.num_complete >= self.max_jobs): + if (self.num_complete > self.max_jobs): raise StopIteration return job raise StopIteration @@ -72,7 +74,7 @@ def _check_for_duplicates(self, pipeline_to_check, problem_ref_to_check): def _modify_random_seed(self, seed_limit, query_args): - used_seeds = query_args.tested_seeds + used_seeds = query_args['tested_seeds'] num_run = len(used_seeds) #run until the right number of seeds have been run while (num_run < seed_limit): @@ -80,9 +82,9 @@ def _modify_random_seed(self, seed_limit, query_args): if (new_seed in used_seeds): continue num_run += 1 - used_seeds.append(new_seed) + used_seeds.add(new_seed) #yield the necessary job requirements - yield query_args.pipeline, query_args.problem_path, query_args.dataset_doc_path, new_seed + yield query_args['pipeline'], query_args['problem_path'], query_args['dataset_doc_path'], new_seed def _modify_swap_primitive(self, swap_pipeline, query_args): diff --git a/experimenter/query.py b/experimenter/query.py index 1cfd3a2..4660943 100644 --- a/experimenter/query.py +++ b/experimenter/query.py @@ -52,7 +52,7 @@ def query_on_primitive(primitive_id: str, limit_indexes=False): for (problem_id, dataset_name), random_seeds in results.items(): - yield {'pipeline': pipeline.id, 'problem_path': get_problem_path(problem_id), 'location': 
locs, 'dataset_doc_path': get_dataset_doc_path(dataset_name), 'tested_seeds': random_seeds} + yield {'pipeline': pipeline.id, 'problem_path': get_problem_path(problem_id), 'location': locs, 'dataset_doc_path': get_dataset_doc_path(dataset_id), 'tested_seeds': random_seeds} def query_on_seeds(pipeline_id: str=None, limit: int=None, submitter: str='byu'): pipeline_search = Search(using=CONNECTION, index='pipelines') @@ -63,10 +63,10 @@ def query_on_seeds(pipeline_id: str=None, limit: int=None, submitter: str='byu') for pipeline in pipeline_search.scan(): results = scan_pipeline_runs(pipeline.id, submitter) - for (problem_id, dataset_name), random_seeds in results.items(): + for (problem_id, dataset_id), random_seeds in results.items(): if limit and len(random_seeds) > limit: continue - yield {'pipeline': pipeline.id, 'problem_path': get_problem_path(problem_id), 'dataset_doc_path': get_dataset_doc_path(dataset_name), 'tested_seeds': random_seeds} + yield {'pipeline': pipeline.id, 'problem_path': get_problem_path(problem_id), 'dataset_doc_path': get_dataset_doc_path(dataset_id), 'tested_seeds': random_seeds} def scan_pipeline_runs(pipeline_id, submitter=None): pipeline_run_search = Search(using=CONNECTION, index='pipeline_runs') \ @@ -79,7 +79,7 @@ def scan_pipeline_runs(pipeline_id, submitter=None): results = dict() for pipeline_run in pipeline_run_search.scan(): for dataset in pipeline_run.datasets: - dataset_prob_tuple = (pipeline_run.problem.id, dataset.name) + dataset_prob_tuple = (pipeline_run.problem.id, dataset.id) results[dataset_prob_tuple] = results.get(dataset_prob_tuple, set()) results[dataset_prob_tuple].add(pipeline_run.random_seed) return results diff --git a/setup.py b/setup.py index 7d11df2..b4267c4 100644 --- a/setup.py +++ b/setup.py @@ -13,8 +13,8 @@ 'd3m', # TODO: add version bounds 'docker>=4.4.0<4.5.0', 'redis>=3.5.0<3.6.0', - 'rq>=1.7.0<1.8.0', - 'elasticsearch>=7.0.0<8.0.0', - 'elasticsearch_dsl>=7.0.0<8.0.0' + 'rq>=1.7.0<1.8.0' + 'elasticsearch==7.11.0', + 'elasticsearch_dsl==7.3.0' ], ) From 6edfb62bda0888ba6cfe5d754fac584a4e997aa6 Mon Sep 17 00:00:00 2001 From: Benson Manner Date: Tue, 23 Feb 2021 15:25:15 -0700 Subject: [PATCH 18/44] D3M configuration variables --- config-example.ini | 5 +++++ experimenter/config.py | 8 +++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/config-example.ini b/config-example.ini index ba1011d..5ab6c77 100644 --- a/config-example.ini +++ b/config-example.ini @@ -10,3 +10,8 @@ DATA_DIR = redis DOCKER_IMAGE_NAME = redis:latest DOCKER_PORT = 6379 DOCKER_DATA_DIR = /data + +[D3MINFO] +D3M_DB_SUBMITTER = {SUBMITTER_NAME} +D3M_DB_TOKEN = {UNIQUE_TOKEN} +SAVE_TO_D3M = True diff --git a/experimenter/config.py b/experimenter/config.py index 1ae115e..ea3a265 100644 --- a/experimenter/config.py +++ b/experimenter/config.py @@ -32,7 +32,13 @@ def __init__(self, config_path: str = None) -> None: def get(self, section, key): return self._config.get(section, key) - +class D3MConfig(metaclass=Singleton): + def __init__(self): + config = Config() + self.d3m_submitter = config.get('D3MINFO','D3M_DB_SUBMITTER') + self.d3m_token = config.get('D3MINFO', 'D3M_DB_TOKEN') + self.save_to_d3m = config.get('D3MINFO', 'SAVE_TO_D3M')=="True" + class RedisConfig(metaclass=Singleton): def __init__(self): config = Config() From 4f8cfa13d1dcc9b4ac15a8ba2f5c469c46b8d6db Mon Sep 17 00:00:00 2001 From: Benson Manner Date: Tue, 2 Mar 2021 09:30:19 -0700 Subject: [PATCH 19/44] fix environment variables and pipeline run to dict for saving --- 
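It is worth tying the new [D3MINFO] section to its consumers: D3MConfig is a singleton wrapper over the parsed ini file, so the database client and the evaluation code share one view of the submitter credentials instead of each reading environment variables. A small sketch of the lookup, assuming a config file shaped like config-example.ini is in place:

    from experimenter.config import D3MConfig

    d3m_config = D3MConfig()                 # singleton; the ini is parsed once
    submitter = d3m_config.d3m_submitter     # value of D3M_DB_SUBMITTER
    saving_enabled = d3m_config.save_to_d3m  # True only for the literal "True"

Note that save_to_d3m is a string comparison against "True", so values such as "true" or "1" silently disable saving.
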
experimenter/databases/d3m_mtl.py | 6 +++--- experimenter/evaluate_pipeline_new.py | 8 +++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/experimenter/databases/d3m_mtl.py b/experimenter/databases/d3m_mtl.py index f8dde8d..6badcb8 100644 --- a/experimenter/databases/d3m_mtl.py +++ b/experimenter/databases/d3m_mtl.py @@ -23,7 +23,7 @@ def __init__(self) -> None: self._post_url = D3M_MTL_DB_POST_URL # This env var allows code calling this class to be run during # unit tests without actually saving to the production DB. - self.should_save = config.SAVE_TO_D3M + self.should_save = config.D3MConfig().save_to_d3m # A reference to a low-level elasticsearch client. This can be # used to query the D3M DB, or this classe's `search` method # can be used, and is preferred, since its API is more straightforward. @@ -31,9 +31,9 @@ def __init__(self) -> None: # certain things though. self.es = Elasticsearch(hosts=[D3M_MTL_DB_GET_URL], timeout=30) # Our submitter name. - self._submitter = config.D3M_DB_SUBMITTER + self._submitter = config.D3MConfig().d3m_submitter # The secret verifying us as the submitter we say we are. - self._x_token = config.D3M_DB_TOKEN + self._x_token = config.D3MConfig().d3m_token if self._is_identifying_as_submitter(): logger.info( f"Documents will be saved under submitter name: '{self._submitter}'" diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/evaluate_pipeline_new.py index f3850fb..e4cfc7f 100644 --- a/experimenter/evaluate_pipeline_new.py +++ b/experimenter/evaluate_pipeline_new.py @@ -5,7 +5,7 @@ from typing import Any, List, Tuple from uuid import UUID -from d3m import cli +from d3m import cli as d3m_cli from d3m.d3m.contrib.pipelines import (K_FOLD_TABULAR_SPLIT_PIPELINE_ID, SCORING_PIPELINE_ID) @@ -29,7 +29,9 @@ def save_pipeline_run_to_d3m_db(pipeline_run_path: str): TODO """ d3m_db = D3MMtLDB() - return D3MMtLDB().save_pipeline_run(pipeline_run_path) + with open(pipeline_run_path) as pipeline_data: + pipeline_run = json.load(pipeline_data) + return D3MMtLDB().save_pipeline_run(pipeline_run) def evaluate_pipeline_on_problem(pipeline_path: str, problem_path: str, @@ -129,5 +131,5 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, args.extend(('--output-run', output_run_path)) args.extend(('--data-random-seed', data_random_seed)) - cli.main(args) + d3m_cli.main(args) save_pipeline_run_to_d3m_db(output_run_path) From 92047d0ddca3c10eabd02597f6e513b94d80af21 Mon Sep 17 00:00:00 2001 From: Eric Manner Date: Tue, 2 Mar 2021 22:27:32 +0000 Subject: [PATCH 20/44] Working queue with queue refactor --- experimenter/cli.py | 2 +- experimenter/config.py | 1 - experimenter/evaluate_pipeline_new.py | 4 ++-- setup.py | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/experimenter/cli.py b/experimenter/cli.py index e27b106..bfe88e5 100644 --- a/experimenter/cli.py +++ b/experimenter/cli.py @@ -210,7 +210,7 @@ def modify_handler(arguments: argparse.Namespace, parser: argparse.ArgumentParse modify_type = arguments.modify_type modify_generator = ModifyGenerator(modify_type, arguments.max_jobs, arguments) #now run the enqueuer part - queue.enqueue_jobs(modify_generator, arguments.queue_host, arguments.queue_port,arguments.job_timeout) + queue.enqueue_jobs(jobs=modify_generator, job_timeout=arguments.job_timeout) def configure_update_parser(parser: argparse.ArgumentParser) -> None: diff --git a/experimenter/config.py b/experimenter/config.py index ea4975d..3507891 100644 --- a/experimenter/config.py +++ b/experimenter/config.py @@ 
-52,4 +52,3 @@ def __init__(self): self.docker_data_dir = config.get('REDIS', 'DOCKER_DATA_DIR') self.dashboard_port = config.get('REDIS', 'DASHBOARD_PORT') self.dashboard_docker_image_name = config.get('REDIS', 'DASHBOARD_DOCKER_IMAGE_NAME') ->>>>>>> f8f7e7ac914d104149bf62f9353a4c8a65a4f726 diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/evaluate_pipeline_new.py index e4cfc7f..a236ce4 100644 --- a/experimenter/evaluate_pipeline_new.py +++ b/experimenter/evaluate_pipeline_new.py @@ -1,4 +1,4 @@ -mport itertools as it +import itertools as it import json import os @@ -6,7 +6,7 @@ from uuid import UUID from d3m import cli as d3m_cli -from d3m.d3m.contrib.pipelines import (K_FOLD_TABULAR_SPLIT_PIPELINE_ID, +from d3m.contrib.pipelines import (K_FOLD_TABULAR_SPLIT_PIPELINE_ID, SCORING_PIPELINE_ID) from experimenter.databases.d3m_mtl import D3MMtLDB diff --git a/setup.py b/setup.py index e5d3f24..03f5870 100644 --- a/setup.py +++ b/setup.py @@ -10,12 +10,12 @@ packages=find_packages(include=['experimenter']), python_requires='>=3.6,<4.0', install_requires=[ - 'd3m', # TODO: add version bounds 'docker>=4.4.0<4.5.0', 'mypy==0.812', 'redis>=3.5.0<3.6.0', 'rq>=1.7.0<1.8.0', 'rq-dashboard>=0.6.0<0.7.0', + 'd3m @ git+https://gitlab.com/datadrivendiscovery/d3m@devel#egg=d3m' 'elasticsearch==7.11.0', 'elasticsearch_dsl==7.3.0' ], From 78b9605bb98adb92d92d7d55f9ec90cf54f5fbe9 Mon Sep 17 00:00:00 2001 From: Benson Manner Date: Tue, 2 Mar 2021 17:58:34 -0700 Subject: [PATCH 21/44] Added job count and worker info tracking for queue status command --- experimenter/queue.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/experimenter/queue.py b/experimenter/queue.py index 2887624..8be73e0 100644 --- a/experimenter/queue.py +++ b/experimenter/queue.py @@ -19,6 +19,7 @@ _STOP_SUCCESS_MESSAGE = 'queue successfully stopped' _STATUS_RUNNING_MESSAGE = 'queue is running on port {port}' _STATUS_STOPPED_MESSAGE = 'queue is stopped' +_QUEUE_LENGTH_MESSAGE = 'number of jobs on queue {name}: {num_jobs}' _EMPTIED_MESSAGE = 'queue emptied' @@ -56,14 +57,28 @@ def stop() -> None: docker_utils.stop_container(config.RedisConfig().docker_image_name) print(_STOP_SUCCESS_MESSAGE) - -def status() -> None: +def get_worker_message(workers, queue_name: str = _DEFAULT_QUEUE) -> str: + num_workers = len(workers) + message = 'number of workers on queue {}: {}'.format(queue_name, num_workers) + for it, worker in enumerate(workers): + success = worker.successful_job_count + fail = worker.failed_job_count + message = message+'\n worker: {}'.format(it) + message = message+'\n\t number of successful jobs: {}'.format(success) + message = message+'\n\t number of failed jobs: {}'.format(fail) + return message + +def status(queue_name: str = _DEFAULT_QUEUE) -> None: # TODO: report container port instead of config port if is_running(): + connection = redis.StrictRedis(host=config.RedisConfig().host, port=config.RedisConfig().port) + queue = rq.Queue(queue_name, connection=connection) + workers = rq.Worker.all(queue=queue) print(_STATUS_RUNNING_MESSAGE.format(port=config.RedisConfig().port)) + print(_QUEUE_LENGTH_MESSAGE.format(name=queue_name, num_jobs=len(queue))) + print(get_worker_message(workers,queue_name)) else: print(_STATUS_STOPPED_MESSAGE) - # TODO: report number of jobs in each queue def empty(queue_name: str = _DEFAULT_QUEUE) -> None: From c8c7cc4413af81ec99bddfac679a40df8b26dc30 Mon Sep 17 00:00:00 2001 From: Eric Manner Date: Wed, 3 Mar 2021 02:49:02 +0000 Subject: [PATCH 
22/44] rq-worker Popen commands updated --- experimenter/evaluate_pipeline_new.py | 5 ++++- experimenter/queue.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/evaluate_pipeline_new.py index a236ce4..f1590ab 100644 --- a/experimenter/evaluate_pipeline_new.py +++ b/experimenter/evaluate_pipeline_new.py @@ -4,6 +4,7 @@ from typing import Any, List, Tuple from uuid import UUID +from experimenter import config from d3m import cli as d3m_cli from d3m.contrib.pipelines import (K_FOLD_TABULAR_SPLIT_PIPELINE_ID, @@ -132,4 +133,6 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, args.extend(('--data-random-seed', data_random_seed)) d3m_cli.main(args) - save_pipeline_run_to_d3m_db(output_run_path) + if (config.D3MConfig().save_to_d3m is True): + print("Saving pipeline run to d3m database") + save_pipeline_run_to_d3m_db(output_run_path) diff --git a/experimenter/queue.py b/experimenter/queue.py index 8be73e0..c41e14d 100644 --- a/experimenter/queue.py +++ b/experimenter/queue.py @@ -126,7 +126,7 @@ def enqueue_jobs( def start_worker(max_jobs: int = None, *, queue_name: str = _DEFAULT_QUEUE) -> None: args = [ - 'rq', 'worker', queue_name, '--burst', '--url', + './env/bin/rq','worker', queue_name, '--burst', '--url', 'redis://{}:{}'.format(config.RedisConfig().host, config.RedisConfig().port), ] From 1a56219164870c56b88ed5b197f1079bbdaf6e32 Mon Sep 17 00:00:00 2001 From: Eric Manner Date: Fri, 5 Mar 2021 19:17:15 +0000 Subject: [PATCH 23/44] Updated paths for saving pipelines and pipeline runs, queue and evaluate working --- experimenter/config.py | 5 +++- experimenter/evaluate_pipeline_new.py | 38 ++++++++++++++++----------- experimenter/modify_generator.py | 6 +++-- experimenter/query.py | 2 +- experimenter/utils.py | 20 +++++++++++--- 5 files changed, 47 insertions(+), 24 deletions(-) diff --git a/experimenter/config.py b/experimenter/config.py index 3507891..ecf6c37 100644 --- a/experimenter/config.py +++ b/experimenter/config.py @@ -32,13 +32,16 @@ def __init__(self, config_path: str = None) -> None: def get(self, section, key): return self._config.get(section, key) + class D3MConfig(metaclass=Singleton): def __init__(self): config = Config() self.d3m_submitter = config.get('D3MINFO','D3M_DB_SUBMITTER') self.d3m_token = config.get('D3MINFO', 'D3M_DB_TOKEN') self.save_to_d3m = config.get('D3MINFO', 'SAVE_TO_D3M')=="True" - + self.datasets_directory = config.get('D3MINFO','DATASET_DIRECTORY') + + class RedisConfig(metaclass=Singleton): def __init__(self): config = Config() diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/evaluate_pipeline_new.py index f1590ab..899cf30 100644 --- a/experimenter/evaluate_pipeline_new.py +++ b/experimenter/evaluate_pipeline_new.py @@ -1,6 +1,7 @@ import itertools as it import json import os +import parser from typing import Any, List, Tuple from uuid import UUID @@ -66,15 +67,20 @@ def evaluate_pipeline_on_problem(pipeline_path: str, """ output_run_path = [] - with open(pipeline_path, 'r') as pipeline: - output_run_path.append(pipeline['properties']['digest']) - with open(problem_path, 'r') as problem: - output_run_path.append(problem['properties']['digest']) - with open(input_path, 'r') as input_f: - output_run_path.append(input_f['properties']['digest']) - - output_run_path = '_'.join(output_run_path) + '.json' - + with open(pipeline_path, 'r') as data: + pipeline = json.load(data) + output_run_path.append(pipeline['id']) + with open(problem_path, 'r') as data: + problem = 
json.load(data) + output_run_path.append(problem['about']['problemID']) + with open(input_path, 'r') as data: + input_f = json.load(data) + output_run_path.append(input_f['about']['digest']) + #get the output run path + output_run_path = os.path.abspath(os.path.join(config.Config().get('MAIN','CACHE_DIR'), 'Pipeline_Run', '_'.join(output_run_path) + '.json')) + #create the directory + os.makedirs(os.path.dirname(output_run_path),exist_ok=True) + #evaluate pipeline evaluate_pipeline_via_d3m_cli(pipeline=pipeline_path, problem=problem_path, input=input_path, output_run=output_run_path, data_random_seed=data_random_seed) @@ -118,7 +124,7 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, args = ['d3m', 'runtime', 'evaluate'] if (not os.path.isfile(pipeline)): - raise ValueError('\'{}\' param not a file path or pipeline ID'.format('pipeline')) + raise ValueError('\'{}\' param not a file path'.format('pipeline')) if (not os.path.isfile(problem)): raise ValueError('\'{}\' param not a file path'.format('problem')) @@ -126,13 +132,13 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, if (not os.path.isfile(input)): raise ValueError('\'{}\' param not a file path'.format('input')) - args.extend(('--pipeline ', pipeline)) + args.extend(('--pipeline', pipeline)) args.extend(('--problem', problem)) args.extend(('--input', input)) - args.extend(('--output-run', output_run_path)) - args.extend(('--data-random-seed', data_random_seed)) - + args.extend(('--output-run', output_run)) + args.extend(('--data-random-seed', str(data_random_seed))) + args.extend(('--data-pipeline', K_FOLD_TABULAR_SPLIT_PIPELINE_ID)) + args.extend(('--scoring-pipeline', SCORING_PIPELINE_ID)) d3m_cli.main(args) if (config.D3MConfig().save_to_d3m is True): - print("Saving pipeline run to d3m database") - save_pipeline_run_to_d3m_db(output_run_path) + save_pipeline_run_to_d3m_db(output_run) diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py index ec5106c..afd86b8 100644 --- a/experimenter/modify_generator.py +++ b/experimenter/modify_generator.py @@ -1,5 +1,6 @@ from experimenter.query import query_on_seeds, query_on_primitive from experimenter import queue +from experimenter.utils import download_from_database import d3m.metadata.pipeline from random import randint from experimenter.evaluate_pipeline_new import evaluate_pipeline_on_problem as evaluate_pipeline @@ -22,8 +23,9 @@ def __next__(self): #iterate through query results for query_result in self.query_results: #iterate through modifier results - for pipeline_path, problem_path, dataset_doc_path, seed in self._modify(query_result, self.args): - + for pipeline, problem_path, dataset_doc_path, seed in self._modify(query_result, self.args): + #save the pipeline to path and return pipeline path + pipeline_path = download_from_database(pipeline, type_to_download='Pipeline') job = queue.make_job(evaluate_pipeline, pipeline_path=pipeline_path, problem_path=problem_path, diff --git a/experimenter/query.py b/experimenter/query.py index 4660943..63779bc 100644 --- a/experimenter/query.py +++ b/experimenter/query.py @@ -66,7 +66,7 @@ def query_on_seeds(pipeline_id: str=None, limit: int=None, submitter: str='byu') for (problem_id, dataset_id), random_seeds in results.items(): if limit and len(random_seeds) > limit: continue - yield {'pipeline': pipeline.id, 'problem_path': get_problem_path(problem_id), 'dataset_doc_path': get_dataset_doc_path(dataset_id), 'tested_seeds': random_seeds} + yield {'pipeline': pipeline.to_dict(), 'problem_path': 
get_problem_path(problem_id[:-8]), 'dataset_doc_path': get_dataset_doc_path(dataset_id[:-13]), 'tested_seeds': random_seeds} def scan_pipeline_runs(pipeline_id, submitter=None): pipeline_run_search = Search(using=CONNECTION, index='pipeline_runs') \ diff --git a/experimenter/utils.py b/experimenter/utils.py index 61bfbb7..4df07a7 100644 --- a/experimenter/utils.py +++ b/experimenter/utils.py @@ -9,10 +9,22 @@ from d3m.metadata import problem as problem_module from d3m.utils import get_datasets_and_problems -from experimenter import exceptions - -DEFAULT_DATASET_DIR = "/datasets/training_datasets/LL0" - +from experimenter import exceptions, config + +DEFAULT_DATASET_DIR = config.D3MConfig().datasets_directory + +def download_from_database(data, type_to_download: str = 'Pipeline'): + if (type_to_download == 'Pipeline'): + i_d = data['id'] + save_path = os.path.abspath(os.path.join(config.Config().get('MAIN','CACHE_DIR'), 'Pipeline', i_d+str('.json'))) + #create the new directory + os.makedirs(os.path.dirname(save_path),exist_ok=True) + #save the file to the directory + with open(save_path, 'w') as to_save: + json.dump(data, to_save, indent=4) + else: + raise ValueError("type: {}, not available for download".format(type_to_download)) + return save_path def get_dataset_doc_path( dataset_name: str, dataset_dir: str = DEFAULT_DATASET_DIR From c9b0b7786f624efb45f16f94a715a88791e5604b Mon Sep 17 00:00:00 2001 From: Eric Manner Date: Mon, 15 Mar 2021 22:15:44 +0000 Subject: [PATCH 24/44] experimenter logging updates --- docker-compose.yml | 2 +- experimenter/cli.py | 8 +++++++ experimenter/evaluate_pipeline_new.py | 8 ++++++- experimenter/modify_generator.py | 5 ++++ experimenter/queue.py | 33 +++++++++++++++++++++------ experimenter/utils.py | 2 +- 6 files changed, 48 insertions(+), 10 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 199c06b..c9b73e4 100755 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -43,7 +43,7 @@ services: - type: bind source: '${EXPERIMENTER_DIR}' target: /d3m-experimenter - read_only: true + read_only: false working_dir: /d3m-experimenter networks: - default diff --git a/experimenter/cli.py b/experimenter/cli.py index 5f9f41a..6734b91 100644 --- a/experimenter/cli.py +++ b/experimenter/cli.py @@ -23,6 +23,12 @@ def configure_parser(parser: argparse.ArgumentParser) -> None: ) configure_queue_parser(queue_parser) + generator_parser = subparsers.add_parser( + 'generator', + description='generates new pipelines and queues them to run on available datasets', + ) + configure_generator_parser(generator_parser) + def handler(arguments: argparse.Namespace, parser: argparse.ArgumentParser) -> None: experimenter_command = arguments.experimenter_command @@ -30,6 +36,8 @@ def handler(arguments: argparse.Namespace, parser: argparse.ArgumentParser) -> N if experimenter_command == 'queue': queue_handler(arguments, subparser) + elif experimenter_command == 'generator': + generator_handler(arguments, subparser) else: raise exceptions.InvalidStateError('Unknown experimenter command: {}'.format(experimenter_command)) diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/evaluate_pipeline_new.py index 1e7416f..64d68a4 100644 --- a/experimenter/evaluate_pipeline_new.py +++ b/experimenter/evaluate_pipeline_new.py @@ -2,6 +2,7 @@ import json import os import parser +import logging from typing import Any, List, Tuple from uuid import UUID @@ -12,6 +13,7 @@ SCORING_PIPELINE_ID) from experimenter.databases.d3m_mtl import D3MMtLDB 
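A quick aside on the module-level logging being added below: Python's logging.basicConfig configures the root logger only once per process, so when several modules in these patches call it at import time, every call after the first is a silent no-op. A minimal stdlib-only sketch of that behavior (not code from the patches):

    import logging

    # First call wins: it attaches a FileHandler to the root logger.
    logging.basicConfig(filename='logger.log', level=logging.INFO)
    # Ignored: the root logger is already configured (Python 3.8+ accepts
    # force=True to override an earlier configuration).
    logging.basicConfig(level=logging.DEBUG)

    logging.getLogger(__name__).info('this record lands in logger.log')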
+logging.basicConfig(filename='logger.log', level=logging.INFO) def save_pipeline_run_to_d3m_db(pipeline_run_path: str): """ @@ -66,7 +68,7 @@ def evaluate_pipeline_on_problem(pipeline_path: str, when a file cannot be opened """ output_run_path = [] - + logging.info('getting files') with open(pipeline_path, 'r') as data: pipeline = json.load(data) output_run_path.append(pipeline['id']) @@ -82,6 +84,7 @@ def evaluate_pipeline_on_problem(pipeline_path: str, #create the directory os.makedirs(os.path.dirname(output_run_path),exist_ok=True) #evaluate pipeline + logging.info('begin evaluation') evaluate_pipeline_via_d3m_cli(pipeline=pipeline_path, problem=problem_path, input=input_path, output_run=output_run_path, data_random_seed=data_random_seed) @@ -133,6 +136,7 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, if (not os.path.isfile(input)): raise ValueError('\'{}\' param not a file path'.format('input')) + logging.info('extending arguments') args.extend(('--pipeline', pipeline)) args.extend(('--problem', problem)) args.extend(('--input', input)) @@ -140,6 +144,8 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, args.extend(('--data-random-seed', str(data_random_seed))) args.extend(('--data-pipeline', K_FOLD_TABULAR_SPLIT_PIPELINE_ID)) args.extend(('--scoring-pipeline', SCORING_PIPELINE_ID)) + logging.info('evaluating') d3m_cli.main(args) if (config.save_to_d3m is True): + save_pipeline_run_to_d3m_db(output_run) diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py index afd86b8..ad654a2 100644 --- a/experimenter/modify_generator.py +++ b/experimenter/modify_generator.py @@ -4,6 +4,8 @@ import d3m.metadata.pipeline from random import randint from experimenter.evaluate_pipeline_new import evaluate_pipeline_on_problem as evaluate_pipeline +import logging +logging.basicConfig(filename='logger.log', level=logging.INFO) class ModifyGenerator: """ Generator to be used for creating modified pipelines based on existing @@ -24,8 +26,10 @@ def __next__(self): for query_result in self.query_results: #iterate through modifier results for pipeline, problem_path, dataset_doc_path, seed in self._modify(query_result, self.args): + logging.info('downloading pipeline path') #save the pipeline to path and return pipeline path pipeline_path = download_from_database(pipeline, type_to_download='Pipeline') + logging.info('creating job') job = queue.make_job(evaluate_pipeline, pipeline_path=pipeline_path, problem_path=problem_path, @@ -45,6 +49,7 @@ def __iter__(self): def _query(self, args): + logging.info('logging') if (self.modifier_type=='random-seed'): return query_on_seeds(args.pipeline_id, args.seed_limit, args.submitter) if (self.modifier_type=='swap-primitive'): diff --git a/experimenter/queue.py b/experimenter/queue.py index b6b4337..ede755e 100644 --- a/experimenter/queue.py +++ b/experimenter/queue.py @@ -7,6 +7,7 @@ import redis import rq +import logging from experimenter import config, exceptions, utils @@ -28,21 +29,38 @@ def get_worker_message(workers: list, queue_name: str = _DEFAULT_QUEUE): message = 'number of workers on queue {}: {}'.format(queue_name, num_workers) for it, worker in enumerate(workers): success = worker.successful_job_count - fail = worker.failed_job_count - message = message+'\n worker: {}'.format(it) - message = message+'\n\t number of successful jobs: {}'.format(success) - message = message+'\n\t number of failed jobs: {}'.format(fail) + fail = worker.failed_job_count + if (fail > 0): + failed_job = get_failed_job(queue_name=queue_name) + with 
open ('failed_job.txt', 'w') as failed_file: + failed_file.write(failed_job) + message = message+'\n\t\t\t worker: {}'.format(it) + message = message+'\n\t\t\t\t number of successful jobs: {}'.format(success) + message = message+'\n\t\t\t\t number of failed jobs: {}'.format(fail) return message +def get_failed_job(queue_name='default', job_num=0): + conn = get_connection() + #pass name and connection + reg = rq.registry.FailedJobRegistry(name=queue_name, connection=conn) + print(len(reg)) + job_ids = reg.get_job_ids() + if (len(job_ids)<=0): + return "None" + job = job_ids[0] + job = Job.fetch(job, connection=conn) + return job.exc_info + + def get_queue_message(queues: list): - queues_message = 'getting queues, jobs and workers' + queues_message = 'getting queues, jobs, and workers' for queue in queues: queues_message = queues_message + '\n\t number of jobs on queue {}: {}'.format(queue, len(queue)) workers = rq.Worker.all(queue=queue) - queues_message = queues_message + '\n\t\t get_worker_message(queue=workers, queue_name=queue) + queues_message = queues_message + '\n\t\t' + str(get_worker_message(workers=workers, queue_name=queue)) - return queue_message + return queues_message def status() -> None: @@ -91,4 +109,5 @@ def enqueue_jobs( queue = rq.Queue(queue_name, connection=connection) for job in jobs: + print("Queueing Job - ") queue.enqueue(**job, job_timeout=job_timeout) diff --git a/experimenter/utils.py b/experimenter/utils.py index 40fa93e..16b0368 100644 --- a/experimenter/utils.py +++ b/experimenter/utils.py @@ -13,7 +13,7 @@ from experimenter import exceptions, config -DEFAULT_DATASET_DIR = "/datasets/training_datasets/LL0" +DEFAULT_DATASET_DIR = "/datasets" datasets, problems = None, None From 075844c819dd4285c1f6847517d43dcbccaafa00 Mon Sep 17 00:00:00 2001 From: Benson Manner Date: Tue, 16 Mar 2021 16:14:00 -0600 Subject: [PATCH 25/44] Experimenter and queue updates --- docker-compose.yml | 19 +++++++- experimenter/__init__.py | 0 experimenter/cli.py | 15 ++++++- experimenter/config.py | 1 - experimenter/evaluate_pipeline_new.py | 29 ++++++------- experimenter/modify_generator.py | 62 +++++++++++++++++---------- experimenter/queue.py | 62 +++++++++++++++++---------- experimenter/utils.py | 2 +- 8 files changed, 126 insertions(+), 64 deletions(-) mode change 100755 => 100644 experimenter/__init__.py diff --git a/docker-compose.yml b/docker-compose.yml index c9b73e4..643f499 100755 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -7,11 +7,26 @@ services: volumes: - type: bind source: '${DATA_DIR}/${REDIS_DATA_DIR}' - target: '/data' + target: /data networks: - default rq_worker: + env_file: + - ./.env + volumes: + - type: bind + source: '${DATASETS_DIR}' + target: /datasets + read_only: true + - type: bind + source: '${DATA_DIR}' + target: /data + - type: bind + source: '${EXPERIMENTER_DIR}' + target: /d3m-experimenter + read_only: true + working_dir: /d3m-experimenter image: 'd3m-experimenter:latest' command: 'rq worker --url redis://${REDIS_HOST} ${RQ_QUEUES}' networks: @@ -43,7 +58,7 @@ services: - type: bind source: '${EXPERIMENTER_DIR}' target: /d3m-experimenter - read_only: false + read_only: true working_dir: /d3m-experimenter networks: - default diff --git a/experimenter/__init__.py b/experimenter/__init__.py old mode 100755 new mode 100644 diff --git a/experimenter/cli.py b/experimenter/cli.py index 6734b91..c27947f 100644 --- a/experimenter/cli.py +++ b/experimenter/cli.py @@ -50,6 +50,12 @@ def configure_queue_parser(parser: argparse.ArgumentParser) -> None: 
empty_parser = subparsers.add_parser('empty', help='remove all jobs from a queue') empty_parser.add_argument('-q', '--queue-name', help='the name of the queue to empty') + empty_parser.add_argument('-f', '--failed', default='false', help='remove the failed queue') + + #save a failed traceback parser + save_failed_parser = subparsers.add_parser('save-failed', help='save failed job error output') + save_failed_parser.add_argument('-q', '--queue-name', help='the name of the queue to empty') + save_failed_parser.add_argument('-j', '--job-num', type=int, default=0, help='the failed job number') def queue_handler(arguments: argparse.Namespace, parser: argparse.ArgumentParser) -> None: @@ -58,7 +64,9 @@ def queue_handler(arguments: argparse.Namespace, parser: argparse.ArgumentParser if queue_command == 'status': queue.status() elif queue_command == 'empty': - queue.empty(arguments.queue_name) + queue.empty(arguments.queue_name, arguments.failed) + elif queue_command == 'save-failed': + queue.save_failed_job(arguments.queue_name, arguments.job_num) else: raise exceptions.InvalidStateError('Unknown queue command: {}'.format(queue_command)) @@ -137,6 +145,11 @@ def configure_modify_parser(parser: argparse.ArgumentParser) -> None: help='The amount of random seeds that each ran pipeline will have at the end of the test', default=2, type=int) + swap_seed_subparser.add_argument( + '--test', + help='run the test instead of random pipeline generation', + default='false', + type=str) #Primitive swapper functionality primitive_swap_subparser = subparsers.add_parser( diff --git a/experimenter/config.py b/experimenter/config.py index a405f1d..8291f93 100644 --- a/experimenter/config.py +++ b/experimenter/config.py @@ -12,7 +12,6 @@ # TODO: these should not have to be set unless needed - datasets_dir: str = os.environ.get('DATASETS_DIR', None) if datasets_dir is None: raise exceptions.ConfigError(_ERROR_MESSAGE.format('DATASETS_DIR')) diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/evaluate_pipeline_new.py index 64d68a4..ff30a36 100644 --- a/experimenter/evaluate_pipeline_new.py +++ b/experimenter/evaluate_pipeline_new.py @@ -2,18 +2,16 @@ import json import os import parser -import logging from typing import Any, List, Tuple from uuid import UUID -from experimenter import config +from experimenter import config, utils from d3m import cli as d3m_cli -from d3m.contrib.pipelines import (K_FOLD_TABULAR_SPLIT_PIPELINE_ID, - SCORING_PIPELINE_ID) - +from d3m.contrib.pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_PATH as TSPP +from d3m.contrib.pipelines import SCORING_PIPELINE_PATH as SPP from experimenter.databases.d3m_mtl import D3MMtLDB -logging.basicConfig(filename='logger.log', level=logging.INFO) + def save_pipeline_run_to_d3m_db(pipeline_run_path: str): """ @@ -68,7 +66,6 @@ def evaluate_pipeline_on_problem(pipeline_path: str, when a file cannot be opened """ output_run_path = [] - logging.info('getting files') with open(pipeline_path, 'r') as data: pipeline = json.load(data) output_run_path.append(pipeline['id']) @@ -79,12 +76,11 @@ def evaluate_pipeline_on_problem(pipeline_path: str, input_f = json.load(data) output_run_path.append(input_f['about']['digest']) #get the output run path - output_run_path = os.path.abspath(os.path.join(config.data_dir, 'Pipeline_Run', + output_run_path = os.path.abspath(os.path.join(os.getenv('DATA_DIR'), 'Pipeline_Run', '_'.join(output_run_path)+'.json')) #create the directory os.makedirs(os.path.dirname(output_run_path),exist_ok=True) #evaluate pipeline - 
logging.info('begin evaluation') evaluate_pipeline_via_d3m_cli(pipeline=pipeline_path, problem=problem_path, input=input_path, output_run=output_run_path, data_random_seed=data_random_seed) @@ -135,17 +131,20 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, if (not os.path.isfile(input)): raise ValueError('\'{}\' param not a file path'.format('input')) - - logging.info('extending arguments') + + if (not os.path.isfile(TSPP)): + raise ValueError('\'{}\' pipeline not a file path'.format('data split')) + + if (not os.path.isfile(SPP)): + raise ValueError('\'{}\' pipeline not a file path'.format('scoring')) + args.extend(('--pipeline', pipeline)) args.extend(('--problem', problem)) args.extend(('--input', input)) args.extend(('--output-run', output_run)) args.extend(('--data-random-seed', str(data_random_seed))) - args.extend(('--data-pipeline', K_FOLD_TABULAR_SPLIT_PIPELINE_ID)) - args.extend(('--scoring-pipeline', SCORING_PIPELINE_ID)) - logging.info('evaluating') + args.extend(('--data-pipeline', TSPP)) + args.extend(('--scoring-pipeline', SPP)) d3m_cli.main(args) if (config.save_to_d3m is True): - save_pipeline_run_to_d3m_db(output_run) diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py index ad654a2..6f4ed55 100644 --- a/experimenter/modify_generator.py +++ b/experimenter/modify_generator.py @@ -1,11 +1,10 @@ from experimenter.query import query_on_seeds, query_on_primitive -from experimenter import queue +from experimenter import queue, utils from experimenter.utils import download_from_database import d3m.metadata.pipeline from random import randint +import json from experimenter.evaluate_pipeline_new import evaluate_pipeline_on_problem as evaluate_pipeline -import logging -logging.basicConfig(filename='logger.log', level=logging.INFO) class ModifyGenerator: """ Generator to be used for creating modified pipelines based on existing @@ -18,38 +17,48 @@ def __init__(self, modify_type: str='random-seed', max_jobs: int=None, args=None self.max_jobs = max_jobs self.num_complete = 0 #run the query on initializing to define the query results - self.query_results = self._query(self.args) + if (args.test == 'true'): + self.query_results = self._run_seed_test(self.args) + else: + self.query_results = self._query(self.args) + self.generator = self._get_generator() + + def __iter__(self): + return self def __next__(self): #iterate through query results + job = self.next() + if (self.max_jobs): + if (self.num_complete > self.max_jobs): + raise StopIteration + return job + + + def next(self): + #iterate through query results + return next(self.generator) + + + def _get_generator(self): for query_result in self.query_results: #iterate through modifier results - for pipeline, problem_path, dataset_doc_path, seed in self._modify(query_result, self.args): - logging.info('downloading pipeline path') + for pipeline, problem_path, dataset_doc_path, seed in self._modify(query_result,self.args): #save the pipeline to path and return pipeline path pipeline_path = download_from_database(pipeline, type_to_download='Pipeline') - logging.info('creating job') + evaluate_pipeline(pipeline_path=pipeline_path, problem_path=problem_path, + input_path=dataset_doc_path, data_random_seed=seed) job = queue.make_job(evaluate_pipeline, pipeline_path=pipeline_path, problem_path=problem_path, input_path=dataset_doc_path, data_random_seed=seed) self.num_complete += 1 - #check to run until the generator stops iterating (if no input for num_pipelines_to_run) - if (self.max_jobs): - if 
(self.num_complete > self.max_jobs): - raise StopIteration - return job - raise StopIteration - - - def __iter__(self): - return self + yield job + - def _query(self, args): - logging.info('logging') if (self.modifier_type=='random-seed'): return query_on_seeds(args.pipeline_id, args.seed_limit, args.submitter) if (self.modifier_type=='swap-primitive'): @@ -62,7 +71,7 @@ def _modify(self, query_args: dict, args): if self.modifier_type=='random-seed': return self._modify_random_seed(args.seed_limit, query_args) if self.modifier_type=='swap-primitive': - return self._modifiy_swap_primitive(args.swap_primitive_id, query_args) + return self._modify_swap_primitive(args.swap_primitive_id, query_args) else: raise ValueError("This type of modification is not yet an option") @@ -75,7 +84,7 @@ def _check_for_duplicates(self, pipeline_to_check, problem_ref_to_check): #query through the database for equal pipelines similar_pipeline_runs_in_database = query.generate_similar_pipeline_runs() for pipeline in similar_pipeline_runs_in_database: - if(pipeline_object.equals(pipeline)): + if (pipeline_object.equals(pipeline)): return True return False @@ -89,10 +98,19 @@ def _modify_random_seed(self, seed_limit, query_args): if (new_seed in used_seeds): continue num_run += 1 - used_seeds.add(new_seed) + used_seeds.append(new_seed) #yield the necessary job requirements yield query_args['pipeline'], query_args['problem_path'], query_args['dataset_doc_path'], new_seed + + def _run_seed_test(self,args): + with open('experimenter/pipelines/bagging_classification.json', 'r') as pipeline_file: + pipeline = json.load(pipeline_file) + dataset_path = utils.get_dataset_doc_path('185_baseball_MIN_METADATA_dataset') + problem_path = utils.get_problem_path('185_baseball_MIN_METADATA_problem') + used_seeds = [2,15] + yield {'pipeline': pipeline, 'problem_path': problem_path, 'dataset_doc_path': dataset_path, + 'tested_seeds': used_seeds } def _modify_swap_primitive(self, swap_pipeline, query_args): raise ValueError("No functionality for swapping primitives yet") diff --git a/experimenter/queue.py b/experimenter/queue.py index ede755e..81680da 100644 --- a/experimenter/queue.py +++ b/experimenter/queue.py @@ -7,7 +7,6 @@ import redis import rq -import logging from experimenter import config, exceptions, utils @@ -24,41 +23,45 @@ def get_queue(queue_name: str = _DEFAULT_QUEUE) -> rq.Queue: return rq.Queue(queue_name, connection=get_connection()) -def get_worker_message(workers: list, queue_name: str = _DEFAULT_QUEUE): +def get_worker_message(workers: list, queue): num_workers = len(workers) - message = 'number of workers on queue {}: {}'.format(queue_name, num_workers) + message = 'number of workers on queue {}: {}'.format(queue.name, num_workers) for it, worker in enumerate(workers): success = worker.successful_job_count fail = worker.failed_job_count - if (fail > 0): - failed_job = get_failed_job(queue_name=queue_name) - with open ('failed_job.txt', 'w') as failed_file: - failed_file.write(failed_job) message = message+'\n\t\t\t worker: {}'.format(it) message = message+'\n\t\t\t\t number of successful jobs: {}'.format(success) message = message+'\n\t\t\t\t number of failed jobs: {}'.format(fail) return message -def get_failed_job(queue_name='default', job_num=0): - conn = get_connection() +def get_failed_job(queue_name:str = _DEFAULT_QUEUE, job_num:int = 0): #pass name and connection - reg = rq.registry.FailedJobRegistry(name=queue_name, connection=conn) - print(len(reg)) + reg = rq.registry.FailedJobRegistry(name = 
queue_name, connection = get_connection()) job_ids = reg.get_job_ids() if (len(job_ids)<=0): - return "None" + return "None", reg job = job_ids[0] - job = Job.fetch(job, connection=conn) - return job.exc_info + job = rq.job.Job.fetch(job, connection=get_connection()) + return job.exc_info, reg + + +def save_failed_job(queue_name:str = _DEFAULT_QUEUE, job_num:int = 0): + if (queue_name is None): + queue_name = _DEFAULT_QUEUE + with open (os.path.join('/data',"failed_job_{}.txt".format(job_num)), 'w') as job_file: + job_file.write(get_failed_job(queue_name=queue_name, job_num=job_num)[0]) def get_queue_message(queues: list): queues_message = 'getting queues, jobs, and workers' for queue in queues: - queues_message = queues_message + '\n\t number of jobs on queue {}: {}'.format(queue, len(queue)) + queues_message = queues_message + '\n\t number of jobs on queue {}: {}'.format(queue.name, len(queue)) + _, reg = get_failed_job(queue.name) + num_fails = len(reg) + queues_message = queues_message + '\n\t number of failed jobs on queue {}: {}'.format(queue.name, num_fails) workers = rq.Worker.all(queue=queue) - queues_message = queues_message + '\n\t\t' + str(get_worker_message(workers=workers, queue_name=queue)) + queues_message = queues_message + '\n\t\t' + str(get_worker_message(workers=workers, queue=queue)) return queues_message @@ -71,12 +74,16 @@ def status() -> None: print(queues_message) -def empty(queue_name: str = None) -> None: +def empty(queue_name: str = None, empty_failed_queue: str = 'false') -> None: if queue_name is None: queue_name = _DEFAULT_QUEUE - queue = get_queue(queue_name) - queue.empty() - print(_EMPTIED_MESSAGE.format(queue_name)) + #empty the failed queue or just the normal one + if (empty_failed_queue == 'true'): + empty_failed(queue_name=queue_name) + else: + queue = get_queue(queue_name) + queue.empty() + print(_EMPTIED_MESSAGE.format(queue_name)) def _check_redis_connection() -> typing.Optional[Exception]: @@ -86,7 +93,19 @@ def _check_redis_connection() -> typing.Optional[Exception]: except redis.exceptions.RedisError as e: error = e return error - + + +def empty_failed(queue_name: str = None) -> None: + if queue_name is None: + queue_name = _DEFAULT_QUEUE + _, failed_queue = get_failed_job(queue_name=queue_name) + #loop through the jobs and remove them + conn = get_connection() + job_ids = failed_queue.get_job_ids() + for job_id in job_ids: + result = failed_queue.remove(job_id, delete_job=True) + print(_EMPTIED_MESSAGE.format(queue_name+str(' failed'))) + def make_job(f: typing.Callable, *args: typing.Any, **kwargs: typing.Any) -> typing.Dict[str, typing.Any]: return {'f':f, 'args': args, 'kwargs': kwargs} @@ -109,5 +128,4 @@ def enqueue_jobs( queue = rq.Queue(queue_name, connection=connection) for job in jobs: - print("Queueing Job - ") queue.enqueue(**job, job_timeout=job_timeout) diff --git a/experimenter/utils.py b/experimenter/utils.py index 16b0368..f88d0ce 100644 --- a/experimenter/utils.py +++ b/experimenter/utils.py @@ -20,7 +20,7 @@ def download_from_database(data, type_to_download: str = 'Pipeline'): if (type_to_download == 'Pipeline'): i_d = data['id'] - save_path = os.path.abspath(os.path.join(config.data_dir, 'Pipeline', i_d+str('.json'))) + save_path = os.path.join('/data', 'Pipeline', i_d+str('.json')) #create the new directory os.makedirs(os.path.dirname(save_path),exist_ok=True) #save the file to the directory From 9b3e6499b53a496cbb2fb05ed4a9e7c3b2dc4272 Mon Sep 17 00:00:00 2001 From: Benson Manner Date: Fri, 19 Mar 2021 10:09:15 -0600 
Subject: [PATCH 26/44] Working queue and pipeline run local --- docker-compose.yml | 5 +--- experimenter/evaluate_pipeline_new.py | 35 +++++++++++---------------- experimenter/modify_generator.py | 12 +++++---- experimenter/queue.py | 2 +- experimenter/utils.py | 26 ++++++++++++++++++-- 5 files changed, 47 insertions(+), 33 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index ae5baa0..61bbae1 100755 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -12,6 +12,7 @@ services: - default rq_worker: + image: 'd3m-experimenter:latest' env_file: - ./.env volumes: @@ -26,10 +27,6 @@ services: source: '${EXPERIMENTER_DIR}' target: /d3m-experimenter read_only: true - working_dir: /d3m-experimenter - image: 'd3m-experimenter:latest' - env_file: - - ./.env command: 'rq worker --url redis://${REDIS_HOST} ${RQ_QUEUES}' networks: - default diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/evaluate_pipeline_new.py index ff30a36..65e68e5 100644 --- a/experimenter/evaluate_pipeline_new.py +++ b/experimenter/evaluate_pipeline_new.py @@ -8,8 +8,7 @@ from experimenter import config, utils from d3m import cli as d3m_cli -from d3m.contrib.pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_PATH as TSPP -from d3m.contrib.pipelines import SCORING_PIPELINE_PATH as SPP +from d3m.contrib.pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_PATH as data_split_file from experimenter.databases.d3m_mtl import D3MMtLDB @@ -38,7 +37,7 @@ def save_pipeline_run_to_d3m_db(pipeline_run_path: str): def evaluate_pipeline_on_problem(pipeline_path: str, problem_path: str, input_path: str, - data_random_seed: int): + random_seed: int): """ Evaluate pipeline on problem. A less verbose form of running d3m's runtime cli 'evaluate' command. @@ -53,8 +52,8 @@ def evaluate_pipeline_on_problem(pipeline_path: str, path to problem doc input_path : path_like str path to input full data - data_random_seed : int - random seed to be used for data preparation + random_seed : int + random seed to be used for pipeline run Returns: ---------- @@ -72,24 +71,22 @@ def evaluate_pipeline_on_problem(pipeline_path: str, with open(problem_path, 'r') as data: problem = json.load(data) output_run_path.append(problem['about']['problemID']) - with open(input_path, 'r') as data: - input_f = json.load(data) - output_run_path.append(input_f['about']['digest']) + output_run_path.append(str(random_seed)) #get the output run path - output_run_path = os.path.abspath(os.path.join(os.getenv('DATA_DIR'), 'Pipeline_Run', + output_run_path = os.path.abspath(os.path.join('/data', 'Pipeline_Run', '_'.join(output_run_path)+'.json')) #create the directory os.makedirs(os.path.dirname(output_run_path),exist_ok=True) #evaluate pipeline evaluate_pipeline_via_d3m_cli(pipeline=pipeline_path, problem=problem_path, input=input_path, output_run=output_run_path, - data_random_seed=data_random_seed) + random_seed=random_seed) def evaluate_pipeline_via_d3m_cli(pipeline: str, problem: str, input: str, output_run: str, - data_random_seed: int): + random_seed: int): """ Evaluate pipeline on problem using d3m's runtime cli. Wrapper function to execute d3m's runtime cli 'evaluate' command. @@ -107,9 +104,9 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, path where pipeline_run doc will be saved. 
use '-' for stdin - data_random_seed : int - random seed to use for - data preparation + random_seed : int + random seed to used for + pipeline run Return: ------- @@ -121,7 +118,7 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, when parameter value is invalid """ - args = ['d3m', 'runtime', 'evaluate'] + args = ['d3m', 'runtime','--random-seed', str(random_seed), 'evaluate'] if (not os.path.isfile(pipeline)): raise ValueError('\'{}\' param not a file path'.format('pipeline')) @@ -132,19 +129,15 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, if (not os.path.isfile(input)): raise ValueError('\'{}\' param not a file path'.format('input')) - if (not os.path.isfile(TSPP)): + if (not os.path.isfile(data_split_file)): raise ValueError('\'{}\' pipeline not a file path'.format('data split')) - if (not os.path.isfile(SPP)): - raise ValueError('\'{}\' pipeline not a file path'.format('scoring')) args.extend(('--pipeline', pipeline)) args.extend(('--problem', problem)) args.extend(('--input', input)) args.extend(('--output-run', output_run)) - args.extend(('--data-random-seed', str(data_random_seed))) - args.extend(('--data-pipeline', TSPP)) - args.extend(('--scoring-pipeline', SPP)) + args.extend(('--data-pipeline', data_split_file)) d3m_cli.main(args) if (config.save_to_d3m is True): save_pipeline_run_to_d3m_db(output_run) diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py index 6f4ed55..ea1a650 100644 --- a/experimenter/modify_generator.py +++ b/experimenter/modify_generator.py @@ -47,13 +47,15 @@ def _get_generator(self): for pipeline, problem_path, dataset_doc_path, seed in self._modify(query_result,self.args): #save the pipeline to path and return pipeline path pipeline_path = download_from_database(pipeline, type_to_download='Pipeline') - evaluate_pipeline(pipeline_path=pipeline_path, problem_path=problem_path, - input_path=dataset_doc_path, data_random_seed=seed) + #catch error returning none for file paths + if (problem_path is None or dataset_doc_path is None): + continue + #create the job if file paths are returned from query job = queue.make_job(evaluate_pipeline, pipeline_path=pipeline_path, problem_path=problem_path, input_path=dataset_doc_path, - data_random_seed=seed) + random_seed=seed) self.num_complete += 1 yield job @@ -98,7 +100,7 @@ def _modify_random_seed(self, seed_limit, query_args): if (new_seed in used_seeds): continue num_run += 1 - used_seeds.append(new_seed) + used_seeds.add(new_seed) #yield the necessary job requirements yield query_args['pipeline'], query_args['problem_path'], query_args['dataset_doc_path'], new_seed @@ -108,7 +110,7 @@ def _run_seed_test(self,args): pipeline = json.load(pipeline_file) dataset_path = utils.get_dataset_doc_path('185_baseball_MIN_METADATA_dataset') problem_path = utils.get_problem_path('185_baseball_MIN_METADATA_problem') - used_seeds = [2,15] + used_seeds = {2,15} yield {'pipeline': pipeline, 'problem_path': problem_path, 'dataset_doc_path': dataset_path, 'tested_seeds': used_seeds } diff --git a/experimenter/queue.py b/experimenter/queue.py index 0813b8a..a62bfa2 100644 --- a/experimenter/queue.py +++ b/experimenter/queue.py @@ -83,7 +83,7 @@ def enqueue(job, queue_name: str = _DEFAULT_QUEUE, job_timeout: int = None) -> r return q.enqueue(**job, job_timeout=job_timeout) -def empty(queue_name: str = None) -> None: +def empty(queue_name: str = None, empty_failed_queue: str = 'false') -> None: if queue_name is None: queue_name = _DEFAULT_QUEUE #empty the failed queue or just the normal one diff --git 
a/experimenter/utils.py b/experimenter/utils.py index f88d0ce..f22b081 100644 --- a/experimenter/utils.py +++ b/experimenter/utils.py @@ -16,6 +16,18 @@ DEFAULT_DATASET_DIR = "/datasets" datasets, problems = None, None +def save_to_not_exist_file(filename:str = 'dataset_dne.txt', save_id:str = None): + #create the directory + os.makedirs(os.path.join('/data','DoesNotExist'),exist_ok=True) + #get the tag to write or append + if (os.path.exists(os.path.join('/data','DoesNotExist',filename))): + tag = 'a' # append to file + else: + tag = 'w' # write and create + #append the non existing value to the file + with open(os.path.join('/data','DoesNotExist',filename),tag) as to_save: + to_save.write(save_id+'\n') + def download_from_database(data, type_to_download: str = 'Pipeline'): if (type_to_download == 'Pipeline'): @@ -43,7 +55,12 @@ def get_dataset_doc_path(dataset_id: str, datasets_dir: str=None) -> str: if datasets_dir is None: datasets_dir = os.getenv('DATASETS', DEFAULT_DATASET_DIR) datasets, problems = get_datasets_and_problems(datasets_dir) - return datasets[dataset_id] + try: + return datasets[dataset_id] + except: + #save to dataset id does not exist file + save_to_not_exist_file('dataset_dne.txt', dataset_id) + return None def get_dataset_doc(dataset_id: str, datasets_dir: str=None) -> dict: @@ -71,7 +88,12 @@ def get_problem_path(problem_id: str, datasets_dir: str=None) -> str: if datasets_dir is None: datasets_dir = os.getenv('DATASETS', DEFAULT_DATASET_DIR) datasets, problems = get_datasets_and_problems(datasets_dir) - return problems[problem_id] + try: + return problems[problem_id] + except: + #save to problem id does not exist file + save_to_not_exist_file('problem_dne.txt', problem_id) + return None def get_problem(problem_path: str, *, parse: bool = True) -> dict: From a53478c85c21299585ebc04644dab77158a20a66 Mon Sep 17 00:00:00 2001 From: Eric Manner Date: Fri, 19 Mar 2021 16:45:02 +0000 Subject: [PATCH 27/44] Minor typos, need to update queue and query --- config-example.ini | 17 ----------------- experimenter/queue.py | 1 + 2 files changed, 1 insertion(+), 17 deletions(-) delete mode 100644 config-example.ini diff --git a/config-example.ini b/config-example.ini deleted file mode 100644 index 5ab6c77..0000000 --- a/config-example.ini +++ /dev/null @@ -1,17 +0,0 @@ -[MAIN] -CACHE_DIR = /d3m-experimenter - -[REDIS] -HOST = localhost -PORT = 6379 -DASHBOARD_PORT = 9181 -# the following should not be changed -DATA_DIR = redis -DOCKER_IMAGE_NAME = redis:latest -DOCKER_PORT = 6379 -DOCKER_DATA_DIR = /data - -[D3MINFO] -D3M_DB_SUBMITTER = {SUBMITTER_NAME} -D3M_DB_TOKEN = {UNIQUE_TOKEN} -SAVE_TO_D3M = True diff --git a/experimenter/queue.py b/experimenter/queue.py index a62bfa2..be615cd 100644 --- a/experimenter/queue.py +++ b/experimenter/queue.py @@ -2,6 +2,7 @@ import redis import rq +import os from experimenter import config, exceptions From d0fe148a6c0d32b20de60739e9478a3c10fcf542 Mon Sep 17 00:00:00 2001 From: Benson Manner Date: Fri, 19 Mar 2021 12:45:12 -0600 Subject: [PATCH 28/44] Updates beginning datapreparation functionality --- experimenter/cli.py | 7 +++--- experimenter/evaluate_pipeline_new.py | 22 +++++++++++----- experimenter/modify_generator.py | 36 ++++++++++++++++++--------- experimenter/query.py | 10 ++++++-- experimenter/queue.py | 4 +-- experimenter/utils.py | 13 ++++++---- 6 files changed, 61 insertions(+), 31 deletions(-) diff --git a/experimenter/cli.py b/experimenter/cli.py index c27947f..fb1d996 100644 --- a/experimenter/cli.py +++ b/experimenter/cli.py 
@@ -50,7 +50,7 @@ def configure_queue_parser(parser: argparse.ArgumentParser) -> None: empty_parser = subparsers.add_parser('empty', help='remove all jobs from a queue') empty_parser.add_argument('-q', '--queue-name', help='the name of the queue to empty') - empty_parser.add_argument('-f', '--failed', default='false', help='remove the failed queue') + empty_parser.add_argument('-f', '--failed', help='remove the failed queue', action='store_true') #save a failed traceback parser save_failed_parser = subparsers.add_parser('save-failed', help='save failed job error output') @@ -141,15 +141,14 @@ def configure_modify_parser(parser: argparse.ArgumentParser) -> None: default=None, type=str) swap_seed_subparser.add_argument( - '--seed_limit', + '--seed-limit', help='The amount of random seeds that each ran pipeline will have at the end of the test', default=2, type=int) swap_seed_subparser.add_argument( '--test', help='run the test instead of random pipeline generation', - default='false', - type=str) + action='store_true') #Primitive swapper functionality primitive_swap_subparser = subparsers.add_parser( diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/evaluate_pipeline_new.py index 65e68e5..c5608de 100644 --- a/experimenter/evaluate_pipeline_new.py +++ b/experimenter/evaluate_pipeline_new.py @@ -37,7 +37,9 @@ def save_pipeline_run_to_d3m_db(pipeline_run_path: str): def evaluate_pipeline_on_problem(pipeline_path: str, problem_path: str, input_path: str, - random_seed: int): + random_seed: int, + data_pipeline_path: str=data_split_file, + data_random_seed: int=0): """ Evaluate pipeline on problem. A less verbose form of running d3m's runtime cli 'evaluate' command. @@ -80,13 +82,16 @@ def evaluate_pipeline_on_problem(pipeline_path: str, #evaluate pipeline evaluate_pipeline_via_d3m_cli(pipeline=pipeline_path, problem=problem_path, input=input_path, output_run=output_run_path, - random_seed=random_seed) + random_seed=random_seed, data_pipeline_path = data_pipeline_path, + data_random_seed=data_random_seed) def evaluate_pipeline_via_d3m_cli(pipeline: str, problem: str, input: str, output_run: str, - random_seed: int): + random_seed: int, + data_pipeline_path: str=data_split_file, + data_random_seed: int=0): """ Evaluate pipeline on problem using d3m's runtime cli. Wrapper function to execute d3m's runtime cli 'evaluate' command. 
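Since the wrapper below mirrors d3m's runtime CLI, a hedged usage sketch may help; the paths here are illustrative placeholders rather than files from the repository, and the keyword names simply follow the signature above (data_pipeline_path and data_random_seed fall back to the k-fold tabular split defaults):

    from experimenter.evaluate_pipeline_new import evaluate_pipeline_via_d3m_cli

    # Assembles and runs roughly: d3m runtime --random-seed 42 evaluate
    #   --pipeline ... --problem ... --input ... --output-run ...
    #   --data-pipeline ... --data-random-seed ...
    evaluate_pipeline_via_d3m_cli(
        pipeline='pipeline.json',        # placeholder path
        problem='problemDoc.json',       # placeholder path
        input='datasetDoc.json',         # placeholder path
        output_run='pipeline_run.yaml',  # placeholder path
        random_seed=42,
    )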
@@ -107,6 +112,10 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, random_seed : int random seed to used for pipeline run + data_pipeline_path: str + path to data prepation pipeline + data_random_seed: int + random_seed to be used in data preparation Return: ------- @@ -117,7 +126,7 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, ValueError when parameter value is invalid - """ + """ args = ['d3m', 'runtime','--random-seed', str(random_seed), 'evaluate'] if (not os.path.isfile(pipeline)): @@ -129,7 +138,7 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, if (not os.path.isfile(input)): raise ValueError('\'{}\' param not a file path'.format('input')) - if (not os.path.isfile(data_split_file)): + if (not os.path.isfile(data_pipeline_path)): raise ValueError('\'{}\' pipeline not a file path'.format('data split')) @@ -137,7 +146,8 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, args.extend(('--problem', problem)) args.extend(('--input', input)) args.extend(('--output-run', output_run)) - args.extend(('--data-pipeline', data_split_file)) + args.extend(('--data-pipeline', data_pipeline_path)) + args.extend(('--data-random-seed', data_random_seed)) d3m_cli.main(args) if (config.save_to_d3m is True): save_pipeline_run_to_d3m_db(output_run) diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py index ea1a650..3f8f6d8 100644 --- a/experimenter/modify_generator.py +++ b/experimenter/modify_generator.py @@ -3,6 +3,7 @@ from experimenter.utils import download_from_database import d3m.metadata.pipeline from random import randint +from d3m.contrib.pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_PATH as data_split_file import json from experimenter.evaluate_pipeline_new import evaluate_pipeline_on_problem as evaluate_pipeline @@ -17,7 +18,7 @@ def __init__(self, modify_type: str='random-seed', max_jobs: int=None, args=None self.max_jobs = max_jobs self.num_complete = 0 #run the query on initializing to define the query results - if (args.test == 'true'): + if (args.test is True): self.query_results = self._run_seed_test(self.args) else: self.query_results = self._query(self.args) @@ -29,22 +30,20 @@ def __iter__(self): def __next__(self): #iterate through query results - job = self.next() + job = next(self.generator) if (self.max_jobs): if (self.num_complete > self.max_jobs): raise StopIteration return job - - - def next(self): - #iterate through query results - return next(self.generator) - + def _get_generator(self): + """ + Main generator to be used of ModifyGenerator class + """ for query_result in self.query_results: #iterate through modifier results - for pipeline, problem_path, dataset_doc_path, seed in self._modify(query_result,self.args): + for pipeline, problem_path, dataset_doc_path, random_seed in self._modify(query_result,self.args): #save the pipeline to path and return pipeline path pipeline_path = download_from_database(pipeline, type_to_download='Pipeline') #catch error returning none for file paths @@ -55,12 +54,14 @@ def _get_generator(self): pipeline_path=pipeline_path, problem_path=problem_path, input_path=dataset_doc_path, - random_seed=seed) + random_seed=random_seed) self.num_complete += 1 yield job def _query(self, args): + """method for querying database according to pipeline modification type + """ if (self.modifier_type=='random-seed'): return query_on_seeds(args.pipeline_id, args.seed_limit, args.submitter) if (self.modifier_type=='swap-primitive'): @@ -70,6 +71,8 @@ def _query(self, args): def _modify(self, query_args: dict, args): + 
"""Handler for different types of pipeline modification tasks + """ if self.modifier_type=='random-seed': return self._modify_random_seed(args.seed_limit, query_args) if self.modifier_type=='swap-primitive': @@ -79,7 +82,8 @@ def _modify(self, query_args: dict, args): def _check_for_duplicates(self, pipeline_to_check, problem_ref_to_check): - """Pseudo function/method for duplicate checking - this is not complete + """Pseudo function/method for duplicate checking + - This function is not complete and will be used for future generation type jobs """ #create the pipeline to check for duplicates from the path pipeline_object = d3m.metadata.pipeline.Pipeline.from_json(pipeline_to_check) @@ -92,6 +96,9 @@ def _check_for_duplicates(self, pipeline_to_check, problem_ref_to_check): def _modify_random_seed(self, seed_limit, query_args): + """Generates new seeds for a given pipeline, problem, and dataset + It is dependent on the seed limit for how many it will generate + """ used_seeds = query_args['tested_seeds'] num_run = len(used_seeds) #run until the right number of seeds have been run @@ -106,13 +113,18 @@ def _modify_random_seed(self, seed_limit, query_args): def _run_seed_test(self,args): + """ Test designed for development and functionality purposes. + It uses and dataset and pipeline that is saved in d3m-experimenter + """ with open('experimenter/pipelines/bagging_classification.json', 'r') as pipeline_file: pipeline = json.load(pipeline_file) dataset_path = utils.get_dataset_doc_path('185_baseball_MIN_METADATA_dataset') problem_path = utils.get_problem_path('185_baseball_MIN_METADATA_problem') + data_random_seed = 0 used_seeds = {2,15} yield {'pipeline': pipeline, 'problem_path': problem_path, 'dataset_doc_path': dataset_path, - 'tested_seeds': used_seeds } + 'tested_seeds': used_seeds} + def _modify_swap_primitive(self, swap_pipeline, query_args): raise ValueError("No functionality for swapping primitives yet") diff --git a/experimenter/query.py b/experimenter/query.py index 2a17228..dfc5ff5 100644 --- a/experimenter/query.py +++ b/experimenter/query.py @@ -66,7 +66,14 @@ def query_on_seeds(pipeline_id: str=None, limit: int=None, submitter: str='byu') for (problem_id, dataset_id), random_seeds in results.items(): if limit and len(random_seeds) > limit: continue - yield {'pipeline': pipeline.to_dict(), 'problem_path': get_problem_path(problem_id), 'dataset_doc_path': get_dataset_doc_path(dataset_id), 'tested_seeds': random_seeds} + yield {'pipeline': pipeline.to_dict(), 'problem_path': get_problem_path(problem_id), + 'dataset_doc_path':get_dataset_doc_path(dataset_id), 'tested_seeds': random_seeds} + +def get_data_preparation_pipeline(data_pred_id: str=None): + data_prep_search = Search(using=CONNECTION, index='pipelines') + data_prep_search = data_prep_search.query('match', id=data_prep_id) + data_prep_pipeline = next(data_prep_search.scan()) + return data_prep_pipeline def scan_pipeline_runs(pipeline_id, submitter=None): pipeline_run_search = Search(using=CONNECTION, index='pipeline_runs') \ @@ -75,7 +82,6 @@ def scan_pipeline_runs(pipeline_id, submitter=None): .query('match', status__state='SUCCESS') if submitter: pipeline_run_search = pipeline_run_search.query('match', _submitter=submitter) - results = dict() for pipeline_run in pipeline_run_search.scan(): for dataset in pipeline_run.datasets: diff --git a/experimenter/queue.py b/experimenter/queue.py index a62bfa2..689f1db 100644 --- a/experimenter/queue.py +++ b/experimenter/queue.py @@ -83,11 +83,11 @@ def enqueue(job, queue_name: 
str = _DEFAULT_QUEUE, job_timeout: int = None) -> r return q.enqueue(**job, job_timeout=job_timeout) -def empty(queue_name: str = None, empty_failed_queue: str = 'false') -> None: +def empty(queue_name: str = None, empty_failed_queue: bool = False) -> None: if queue_name is None: queue_name = _DEFAULT_QUEUE #empty the failed queue or just the normal one - if (empty_failed_queue == 'true'): + if (empty_failed_queue is True): empty_failed(queue_name=queue_name) else: queue = get_queue(queue_name) diff --git a/experimenter/utils.py b/experimenter/utils.py index f22b081..a995e18 100644 --- a/experimenter/utils.py +++ b/experimenter/utils.py @@ -33,13 +33,16 @@ def download_from_database(data, type_to_download: str = 'Pipeline'): if (type_to_download == 'Pipeline'): i_d = data['id'] save_path = os.path.join('/data', 'Pipeline', i_d+str('.json')) - #create the new directory - os.makedirs(os.path.dirname(save_path),exist_ok=True) - #save the file to the directory - with open(save_path, 'w') as to_save: - json.dump(data, to_save, indent=4) + elif (type_to_download == 'Preparation'): + save_path = os.path.join('/data', 'DataPreparation', data['id']+str('.json')) else: raise ValueError("type: {}, not available for download".format(type_to_download)) + #create the new directory + os.makedirs(os.path.dirname(save_path),exist_ok=True) + #save the file to the directory + with open(save_path, 'w') as to_save: + json.dump(data, to_save, indent=4) + #return the location return save_path From 8c26b4ad4d75fb00b42099cbfef00e385530d3de Mon Sep 17 00:00:00 2001 From: Benson Manner Date: Mon, 22 Mar 2021 13:43:52 -0600 Subject: [PATCH 29/44] Working with test and when the data preparation is explicitly defined in the pipeline run file --- experimenter/evaluate_pipeline_new.py | 12 +-- experimenter/modify_generator.py | 32 +++++-- experimenter/query.py | 126 +++++++++++--------------- experimenter/utils.py | 2 +- 4 files changed, 82 insertions(+), 90 deletions(-) diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/evaluate_pipeline_new.py index c5608de..929a3c4 100644 --- a/experimenter/evaluate_pipeline_new.py +++ b/experimenter/evaluate_pipeline_new.py @@ -76,7 +76,7 @@ def evaluate_pipeline_on_problem(pipeline_path: str, output_run_path = os.path.abspath(os.path.join('/data', 'Pipeline_Run', - '_'.join(output_run_path)+'.json')) + '_'.join(output_run_path)+'.yaml')) #create the directory os.makedirs(os.path.dirname(output_run_path),exist_ok=True) #evaluate pipeline @@ -96,6 +96,8 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, Evaluate pipeline on problem using d3m's runtime cli. Wrapper function to execute d3m's runtime cli 'evaluate' command. Arguments mirror the same arguments using the cli. + Only handles cases with a data preparation pipeline in the + pipeline run. 
Parameters ---------- @@ -116,6 +118,7 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, path to data prepation pipeline data_random_seed: int random_seed to be used in data preparation + input_run: path to pipeline run file Return: ------- @@ -127,8 +130,6 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, when parameter value is invalid """ - args = ['d3m', 'runtime','--random-seed', str(random_seed), 'evaluate'] - if (not os.path.isfile(pipeline)): raise ValueError('\'{}\' param not a file path'.format('pipeline')) @@ -138,10 +139,7 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, if (not os.path.isfile(input)): raise ValueError('\'{}\' param not a file path'.format('input')) - if (not os.path.isfile(data_pipeline_path)): - raise ValueError('\'{}\' pipeline not a file path'.format('data split')) - - + args = ['d3m', 'runtime','--random-seed', str(random_seed), 'evaluate'] args.extend(('--pipeline', pipeline)) args.extend(('--problem', problem)) args.extend(('--input', input)) diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py index 3f8f6d8..2c3ea67 100644 --- a/experimenter/modify_generator.py +++ b/experimenter/modify_generator.py @@ -1,10 +1,11 @@ -from experimenter.query import query_on_seeds, query_on_primitive +from experimenter.query import query_on_seeds from experimenter import queue, utils from experimenter.utils import download_from_database import d3m.metadata.pipeline from random import randint from d3m.contrib.pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_PATH as data_split_file import json +import yaml from experimenter.evaluate_pipeline_new import evaluate_pipeline_on_problem as evaluate_pipeline class ModifyGenerator: @@ -40,21 +41,28 @@ def __next__(self): def _get_generator(self): """ Main generator to be used of ModifyGenerator class + Can only handle cases where there is a data preparation + pipeline in the pipeline run """ for query_result in self.query_results: #iterate through modifier results - for pipeline, problem_path, dataset_doc_path, random_seed in self._modify(query_result,self.args): + for pipeline, problem_path, dataset_doc, random_seed, prep in self._modify(query_result,self.args): #save the pipeline to path and return pipeline path + data_prep_pipeline, data_random_seed = prep pipeline_path = download_from_database(pipeline, type_to_download='Pipeline') - #catch error returning none for file paths - if (problem_path is None or dataset_doc_path is None): + if (data_prep_pipeline is not None): + data_prep_pipeline = download_from_database(data_prep_pipeline, type_to_download='Preparation') + #catch error returning none for file paths or preparation pipeline + #TODO get data preparation pipeline even when it is not explicitly defined + if (problem_path is None or dataset_doc is None or data_prep_pipeline is None): continue - #create the job if file paths are returned from query job = queue.make_job(evaluate_pipeline, pipeline_path=pipeline_path, problem_path=problem_path, - input_path=dataset_doc_path, - random_seed=random_seed) + input_path=dataset_doc, + random_seed=random_seed, + data_pipeline_path=data_prep_pipeline, + data_random_seed=data_random_seed) self.num_complete += 1 yield job @@ -109,7 +117,7 @@ def _modify_random_seed(self, seed_limit, query_args): num_run += 1 used_seeds.add(new_seed) #yield the necessary job requirements - yield query_args['pipeline'], query_args['problem_path'], query_args['dataset_doc_path'], new_seed + yield query_args['pipeline'], query_args['problem_path'], 
query_args['dataset_doc_path'], new_seed, (query_args['data_prep_pipeline'], query_args['data_prep_seed']) def _run_seed_test(self,args): @@ -120,10 +128,14 @@ def _run_seed_test(self,args): pipeline = json.load(pipeline_file) dataset_path = utils.get_dataset_doc_path('185_baseball_MIN_METADATA_dataset') problem_path = utils.get_problem_path('185_baseball_MIN_METADATA_problem') - data_random_seed = 0 + data_prep_seed = 0 + with open(data_split_file, 'r') as pipeline_file: + data_prep_pipeline = yaml.full_load(pipeline_file) + data_prep_pipeline = data_prep_pipeline used_seeds = {2,15} yield {'pipeline': pipeline, 'problem_path': problem_path, 'dataset_doc_path': dataset_path, - 'tested_seeds': used_seeds} + 'tested_seeds': used_seeds, 'data_prep_pipeline': + data_prep_pipeline, 'data_prep_seed': data_prep_seed} def _modify_swap_primitive(self, swap_pipeline, query_args): diff --git a/experimenter/query.py b/experimenter/query.py index dfc5ff5..75e750e 100644 --- a/experimenter/query.py +++ b/experimenter/query.py @@ -5,87 +5,69 @@ HOST = 'https://metalearning.datadrivendiscovery.org/es' CONNECTION = Elasticsearch(hosts=[HOST], timeout=300) -def query_on_primitive(primitive_id: str, limit_indexes=False): - '''Queries the metalearning database for pipelines using the specified primitive. - Queries the metalearning database using the Elasticsearch endpoint documented - on D3M's website (see https://metalearning.datadrivendiscovery.org for more - info). Finds all pipelines containing a certain primitive as specified by the - keyword argument. Also determines the index(es) of that primitive in each - matching pipeline and gets the datasets that were used in pipeline runs. - - Arguments - --------- - primitive_id : str - A primitive's unique id. - limit_indexes : 'first', 'last', or False (default) - Limits which index of the primitive is returned for each pipeline match. - Use 'first' to get the index of the first matching primitive specified by - the keyword arg. Use 'last' to get the index of the last match. Use False - (default) to get a list of all indexes for each pipeline specifying where - the primitive is. - - Yields - ------- - A list of tuples where each tuple contains (in this order): - 1. a matching pipeline - 2. the index(es) of the desired primitives in the given pipeline's steps - 3. a dictionary containing the datasets used in pipeline runs where the key - is the dataset digest and the value is the dataset id (human-readable string). - 4. the random seeds used in pipeline runs. 
- ''' - - if limit_indexes not in { 'first', 'last', False }: - raise ValueError(f'Invalid value "{limit_indexes}" for arg limit_indexes') - - match_query = Q('match', steps__primitive__id=primitive_id) - nested_query = Q('nested', path='steps', query=match_query) - pipeline_search = Search(using=CONNECTION, index='pipelines').query(nested_query) - - for pipeline in pipeline_search.scan(): - results = scan_pipeline_runs(pipeline.id) - - locs = [i for i, step in enumerate(pipeline.steps) if primitive_id == step.primitive.id] - if limit_indexes == 'last': - locs = locs[-1] - elif limit_indexes == 'first': - locs = locs[0] - - for (problem_id, dataset_name), random_seeds in results.items(): - yield {'pipeline': pipeline.id, 'problem_path': get_problem_path(problem_id), 'location': locs, 'dataset_doc_path': get_dataset_doc_path(dataset_id), 'tested_seeds': random_seeds} +def get_search_query(arguments: dict = None, connection = CONNECTION, index='pipelines'): + index_search = Search(using=CONNECTION, index=index) + if arguments['id'] is not None: + index_search = index_search.query('match', id=arguments['id']) + if arguments['submitter'] is not None: + index_search = index_search.query('match', _submitter=arguments['submitter']) + return index_search + def query_on_seeds(pipeline_id: str=None, limit: int=None, submitter: str='byu'): - pipeline_search = Search(using=CONNECTION, index='pipelines') - if pipeline_id: - pipeline_search = pipeline_search.query('match', id=pipeline_id) - if submitter: - pipeline_search = pipeline_search.query('match', _submitter=submitter) - - for pipeline in pipeline_search.scan(): - results = scan_pipeline_runs(pipeline.id, submitter) - for (problem_id, dataset_id), random_seeds in results.items(): - if limit and len(random_seeds) > limit: - continue - yield {'pipeline': pipeline.to_dict(), 'problem_path': get_problem_path(problem_id), - 'dataset_doc_path':get_dataset_doc_path(dataset_id), 'tested_seeds': random_seeds} + arguments = {'id': pipeline_id, 'submitter': submitter} + pipeline_search = get_search_query(arguments=arguments, index='pipelines') + for pipeline in pipeline_search.scan(): + results = scan_pipeline_runs(pipeline.id, submitter) + for (problem_id, dataset_id, data_prep), random_seeds in results.items(): + if limit and len(random_seeds) > limit: + continue + #data_prep_pipeline, data_prep_seed = data_prep + input_run = data_prep[0] + yield {'pipeline': pipeline.to_dict(), 'problem_path': get_problem_path(problem_id), + 'dataset_doc_path':get_dataset_doc_path(dataset_id), 'tested_seeds': random_seeds, + 'data_prep_pipeline': data_prep_pipeline, 'data_prep_seed': data_prep_seed} + def get_data_preparation_pipeline(data_pred_id: str=None): - data_prep_search = Search(using=CONNECTION, index='pipelines') - data_prep_search = data_prep_search.query('match', id=data_prep_id) + arguments = {'submitter': None, 'id': data_prep_id} + data_prep_search = get_search_query(arguments=arguments) data_prep_pipeline = next(data_prep_search.scan()) return data_prep_pipeline + +def check_for_data_prep(pipeline_run=None): + """Only handles cases with an explicit data preparation pipeline + in the pipeline run + """ + data_prep = None + data_prep_pipeline = None + data_prep_seed = None + try: + data_prep = pipeline_run.run.data_preparation + except: + data_prep = None + data_prep_seed = None + if (data_prep is not None): + data_prep_seed = data_prep.random_seed + data_prep_pipeline = get_data_preparation_pipeline(data_prep.pipeline.id) + return data_prep_pipeline, 
data_prep_seed + + def scan_pipeline_runs(pipeline_id, submitter=None): - pipeline_run_search = Search(using=CONNECTION, index='pipeline_runs') \ + pipeline_run_search = Search(using=CONNECTION, index='pipeline_runs') \ .query('match', pipeline__id=pipeline_id) \ .query('match', run__phase='PRODUCE') \ .query('match', status__state='SUCCESS') - if submitter: - pipeline_run_search = pipeline_run_search.query('match', _submitter=submitter) - results = dict() - for pipeline_run in pipeline_run_search.scan(): - for dataset in pipeline_run.datasets: - dataset_prob_tuple = (pipeline_run.problem.id, dataset.id) - results[dataset_prob_tuple] = results.get(dataset_prob_tuple, set()) - results[dataset_prob_tuple].add(pipeline_run.random_seed) - return results + if submitter: + pipeline_run_search = pipeline_run_search.query('match', _submitter=submitter) + results = dict() + for pipeline_run in pipeline_run_search.scan(): + data_prep_pipeline, data_prep_seed = check_for_data_prep(pipeline_run=pipeline_run) + for dataset in pipeline_run.datasets: + dataset_prob_tuple = (pipeline_run.problem.id, dataset.id, (data_prep_pipeline, data_prep_seed)) + results[dataset_prob_tuple] = results.get(dataset_prob_tuple, set()) + results[dataset_prob_tuple].add(pipeline_run.random_seed) + return results + diff --git a/experimenter/utils.py b/experimenter/utils.py index a995e18..830ad9c 100644 --- a/experimenter/utils.py +++ b/experimenter/utils.py @@ -30,8 +30,8 @@ def save_to_not_exist_file(filename:str = 'dataset_dne.txt', save_id:str = None) def download_from_database(data, type_to_download: str = 'Pipeline'): + i_d = data['id'] if (type_to_download == 'Pipeline'): - i_d = data['id'] save_path = os.path.join('/data', 'Pipeline', i_d+str('.json')) elif (type_to_download == 'Preparation'): save_path = os.path.join('/data', 'DataPreparation', i_d+str('.json')) From 614be9136bb71c867bf88baebc2eced83777d05f Mon Sep 17 00:00:00 2001 From: Eric Manner Date: Mon, 22 Mar 2021 22:36:50 +0000 Subject: [PATCH 30/44] Bug fixes for remote work --- experimenter/evaluate_pipeline_new.py | 12 +++++++----- experimenter/modify_generator.py | 8 +++++++- experimenter/query.py | 21 ++++++++++++--------- experimenter/queue.py | 15 ++++++++++----- 4 files changed, 36 insertions(+), 20 deletions(-) diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/evaluate_pipeline_new.py index 929a3c4..279c5ef 100644 --- a/experimenter/evaluate_pipeline_new.py +++ b/experimenter/evaluate_pipeline_new.py @@ -1,5 +1,6 @@ import itertools as it import json +import yaml import os import parser @@ -8,7 +9,8 @@ from experimenter import config, utils from d3m import cli as d3m_cli -from d3m.contrib.pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_PATH as data_split_file +from d3m.contrib.pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_PATH as k_fold_split_path +from d3m.contrib.pipelines import FIXED_SPLIT_TABULAR_SPLIT_PIPELINE_PATH as fixed_split_path from experimenter.databases.d3m_mtl import D3MMtLDB @@ -31,14 +33,14 @@ def save_pipeline_run_to_d3m_db(pipeline_run_path: str): """ d3m_db = D3MMtLDB() with open(pipeline_run_path) as pipeline_data: - pipeline_run = json.load(pipeline_data) + pipeline_run = yaml.full_load(pipeline_data) return D3MMtLDB().save_pipeline_run(pipeline_run) def evaluate_pipeline_on_problem(pipeline_path: str, problem_path: str, input_path: str, random_seed: int, - data_pipeline_path: str=data_split_file, + data_pipeline_path: str=k_fold_split_path, data_random_seed: int=0): """ Evaluate pipeline on problem. 
@@ -90,7 +92,7 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, input: str, output_run: str, random_seed: int, - data_pipeline_path: str=data_split_file, + data_pipeline_path: str=k_fold_split_path, data_random_seed: int=0): """ Evaluate pipeline on problem using d3m's runtime cli. @@ -144,7 +146,7 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, args.extend(('--problem', problem)) args.extend(('--input', input)) args.extend(('--output-run', output_run)) - args.extend(('--data-pipeline', data_pipeline_path)) + args.extend(('--data-pipeline', fixed_split_path)) args.extend(('--data-random-seed', data_random_seed)) d3m_cli.main(args) if (config.save_to_d3m is True): diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py index 2c3ea67..70b4d58 100644 --- a/experimenter/modify_generator.py +++ b/experimenter/modify_generator.py @@ -54,8 +54,14 @@ def _get_generator(self): data_prep_pipeline = download_from_database(data_prep_pipeline, type_to_download='Preparation') #catch error returning none for file paths or preparation pipeline #TODO get data preparation pipeline even when it is not explicitly defined - if (problem_path is None or dataset_doc is None or data_prep_pipeline is None): + if (problem_path is None or dataset_doc is None or data_prep_pipeline is None): continue + evaluate_pipeline(pipeline_path=pipeline_path, + problem_path=problem_path, + input_path=dataset_doc, + random_seed=random_seed, + data_pipeline_path=data_prep_pipeline, + data_random_seed=data_random_seed) job = queue.make_job(evaluate_pipeline, pipeline_path=pipeline_path, problem_path=problem_path, diff --git a/experimenter/query.py b/experimenter/query.py index 75e750e..06aea7a 100644 --- a/experimenter/query.py +++ b/experimenter/query.py @@ -23,17 +23,20 @@ def query_on_seeds(pipeline_id: str=None, limit: int=None, submitter: str='byu') for (problem_id, dataset_id, data_prep), random_seeds in results.items(): if limit and len(random_seeds) > limit: continue - #data_prep_pipeline, data_prep_seed = data_prep - input_run = data_prep[0] + data_prep_id, data_prep_seed = data_prep + data_prep_pipeline = get_data_preparation_pipeline(data_prep_id) yield {'pipeline': pipeline.to_dict(), 'problem_path': get_problem_path(problem_id), 'dataset_doc_path':get_dataset_doc_path(dataset_id), 'tested_seeds': random_seeds, 'data_prep_pipeline': data_prep_pipeline, 'data_prep_seed': data_prep_seed} -def get_data_preparation_pipeline(data_pred_id: str=None): +def get_data_preparation_pipeline(data_prep_id: str=None): + if (data_prep_id is None): + return None arguments = {'submitter': None, 'id': data_prep_id} data_prep_search = get_search_query(arguments=arguments) data_prep_pipeline = next(data_prep_search.scan()) + data_prep_pipeline = data_prep_pipeline.to_dict() return data_prep_pipeline @@ -42,7 +45,7 @@ def check_for_data_prep(pipeline_run=None): in the pipeline run """ data_prep = None - data_prep_pipeline = None + data_prep_id = None data_prep_seed = None try: data_prep = pipeline_run.run.data_preparation @@ -51,8 +54,8 @@ def check_for_data_prep(pipeline_run=None): data_prep_seed = None if (data_prep is not None): data_prep_seed = data_prep.random_seed - data_prep_pipeline = get_data_preparation_pipeline(data_prep.pipeline.id) - return data_prep_pipeline, data_prep_seed + data_prep_id = data_prep.pipeline.id + return data_prep_id, data_prep_seed def scan_pipeline_runs(pipeline_id, submitter=None): @@ -62,11 +65,11 @@ def scan_pipeline_runs(pipeline_id, submitter=None): .query('match', 
status__state='SUCCESS')
     if submitter:
         pipeline_run_search = pipeline_run_search.query('match', _submitter=submitter)
-    results = dict()
+    results = dict()
     for pipeline_run in pipeline_run_search.scan():
-        data_prep_pipeline, data_prep_seed = check_for_data_prep(pipeline_run=pipeline_run)
+        data_prep_id, data_prep_seed = check_for_data_prep(pipeline_run=pipeline_run)
         for dataset in pipeline_run.datasets:
-            dataset_prob_tuple = (pipeline_run.problem.id, dataset.id, (data_prep_pipeline, data_prep_seed))
+            dataset_prob_tuple = (pipeline_run.problem.id, dataset.id, (data_prep_id, data_prep_seed))
             results[dataset_prob_tuple] = results.get(dataset_prob_tuple, set())
             results[dataset_prob_tuple].add(pipeline_run.random_seed)
     return results
diff --git a/experimenter/queue.py b/experimenter/queue.py
index 87fd7a9..6dadbbf 100644
--- a/experimenter/queue.py
+++ b/experimenter/queue.py
@@ -9,7 +9,7 @@
 _DEFAULT_QUEUE = 'default'
 _EMPTIED_MESSAGE = 'queue {} emptied'
-
+_SAVE_FAILED_MESSAGE = 'Failed job output saved to {}'
 
 def get_connection():
     config.validate_redis_host()
@@ -46,16 +46,21 @@ def get_failed_job(queue_name:str = _DEFAULT_QUEUE, job_num:int = 0):
     job_ids = reg.get_job_ids()
     if (len(job_ids)<=0):
         return "None", reg
-    job = job_ids[0]
-    job = rq.job.Job.fetch(job, connection=get_connection())
-    return job.exc_info, reg
+    job_id = job_ids[job_num]
+    return job_id, reg
 
 
 def save_failed_job(queue_name:str = _DEFAULT_QUEUE, job_num:int = 0):
     if (queue_name is None):
         queue_name = _DEFAULT_QUEUE
+    job_id, failed_queue = get_failed_job(queue_name=queue_name, job_num=job_num)
+    job = rq.job.Job.fetch(job_id, connection=get_connection())
     with open (os.path.join('/data',"failed_job_{}.txt".format(job_num)), 'w') as job_file:
-        job_file.write(get_failed_job(queue_name=queue_name, job_num=job_num)[0])
+        job_file.write(job.exc_info)
+    #remove the job
+    failed_queue.remove(job_id, delete_job=True)
+    print(_SAVE_FAILED_MESSAGE.format(os.path.join('/data',
+        "failed_job_{}.txt".format(job_num))))
 
 
 def get_queue_message(queues: list):

From ce3d768bf00b1169ce9bef3908f37d7af5e75568 Mon Sep 17 00:00:00 2001
From: Benson Manner
Date: Mon, 22 Mar 2021 16:47:31 -0600
Subject: [PATCH 31/44] Added data preparation checks for d3m module

---
 experimenter/evaluate_pipeline_new.py |  2 +-
 experimenter/modify_generator.py      |  4 +++-
 experimenter/query.py                 | 14 +++++++++-----
 experimenter/utils.py                 | 27 +++++++++++++++++++++++++++
 4 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/evaluate_pipeline_new.py
index 279c5ef..656c49f 100644
--- a/experimenter/evaluate_pipeline_new.py
+++ b/experimenter/evaluate_pipeline_new.py
@@ -146,7 +146,7 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str,
     args.extend(('--problem', problem))
     args.extend(('--input', input))
     args.extend(('--output-run', output_run))
-    args.extend(('--data-pipeline', fixed_split_path))
+    args.extend(('--data-pipeline', data_pipeline_path))
     args.extend(('--data-random-seed', data_random_seed))
     d3m_cli.main(args)
     if (config.save_to_d3m is True):
diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py
index 70b4d58..fa65736 100644
--- a/experimenter/modify_generator.py
+++ b/experimenter/modify_generator.py
@@ -51,7 +51,9 @@ def _get_generator(self):
                 data_prep_pipeline, data_random_seed = prep
                 pipeline_path = download_from_database(pipeline, type_to_download='Pipeline')
                 if (data_prep_pipeline is not None):
-                    data_prep_pipeline = download_from_database(data_prep_pipeline, type_to_download='Preparation')
+                    if
(~os.path.exist(data_prep_pipeline)): + data_prep_pipeline = download_from_database(data_prep_pipeline, type_to_download='Preparation') + print(data_prep_pipeline) #catch error returning none for file paths or preparation pipeline #TODO get data preparation pipeline even when it is not explicitly defined if (problem_path is None or dataset_doc is None or data_prep_pipeline is None): diff --git a/experimenter/query.py b/experimenter/query.py index 06aea7a..e5bc555 100644 --- a/experimenter/query.py +++ b/experimenter/query.py @@ -1,6 +1,7 @@ from elasticsearch import Elasticsearch from elasticsearch_dsl import Search, Q -from experimenter.utils import get_problem_path, get_dataset_doc_path +from experimenter.utils import get_problem_path, get_dataset_doc_path, get_data_prep_from_d3m + HOST = 'https://metalearning.datadrivendiscovery.org/es' CONNECTION = Elasticsearch(hosts=[HOST], timeout=300) @@ -33,10 +34,13 @@ def query_on_seeds(pipeline_id: str=None, limit: int=None, submitter: str='byu') def get_data_preparation_pipeline(data_prep_id: str=None): if (data_prep_id is None): return None - arguments = {'submitter': None, 'id': data_prep_id} - data_prep_search = get_search_query(arguments=arguments) - data_prep_pipeline = next(data_prep_search.scan()) - data_prep_pipeline = data_prep_pipeline.to_dict() + data_prep_pipeline = get_data_prep_from_d3m(data_prep_id) + #get from database if not in d3m module + if (data_prep_pipeline is None): + arguments = {'submitter': None, 'id': data_prep_id} + data_prep_search = get_search_query(arguments=arguments) + data_prep_pipeline = next(data_prep_search.scan()) + data_prep_pipeline = data_prep_pipeline.to_dict() return data_prep_pipeline diff --git a/experimenter/utils.py b/experimenter/utils.py index 830ad9c..e2d1a0d 100644 --- a/experimenter/utils.py +++ b/experimenter/utils.py @@ -9,6 +9,7 @@ from d3m.metadata import problem as problem_module from d3m.utils import get_datasets_and_problems +from d3m.contrib import pipelines from experimenter import exceptions, config @@ -16,6 +17,32 @@ DEFAULT_DATASET_DIR = "/datasets" datasets, problems = None, None +def get_dict_data_prep_pipelines(): + data_prep_dict = dict() + data_prep_id_list = list() + #save the relevant paths and ids for data preparation + data_prep_id_list.append(pipelines.NO_SPLIT_TABULAR_SPLIT_PIPELINE_ID) + data_prep_dict[pipelines.NO_SPLIT_TABULAR_SPLIT_PIPELINE_ID] = pipelines.NO_SPLIT_TABULAR_SPLIT_PIPELINE_PATH + data_prep_id_list.append(pipelines.FIXED_SPLIT_TABULAR_SPLIT_PIPELINE_ID) + data_prep_dict[pipelines.FIXED_SPLIT_TABULAR_SPLIT_PIPELINE_ID] = pipelines.FIXED_SPLIT_TABULAR_SPLIT_PIPELINE_PATH + data_prep_id_list.append(pipelines.TRAIN_TEST_TABULAR_SPLIT_PIPELINE_ID) + data_prep_dict[pipelines.TRAIN_TEST_TABULAR_SPLIT_PIPELINE_ID] = pipelines.TRAIN_TEST_TABULAR_SPLIT_PIPELINE_PATH + data_prep_id_list.append(pipelines.K_FOLD_TABULAR_SPLIT_PIPELINE_ID) + data_prep_dict[pipelines.K_FOLD_TABULAR_SPLIT_PIPELINE_ID] = pipelines.K_FOLD_TABULAR_SPLIT_PIPELINE_PATH + return data_prep_dict, data_prep_id_list + + +def get_data_prep_from_d3m(pipeline_id: str = None): + """Checks if data preparation pipeline is in d3m module, + if not, return None + + """ + data_prep_dict, data_prep_id_list = get_dict_data_prep_pipelines() + if (pipeline_id in data_prep_id_list): + return data_prep_dict[pipeline_id] + else: + return None + def save_to_not_exist_file(filename:str = 'dataset_dne.txt', save_id:str = None): #create the directory os.makedirs(os.path.join('/data','DoesNotExist'),exist_ok=True) 
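
A quick illustration of the lookup added above (hypothetical usage, assuming the d3m core package is installed and the patched experimenter.utils is importable): the split pipelines bundled with d3m resolve to local file paths, while any other ID returns None so that get_data_preparation_pipeline in query.py can fall back to the Elasticsearch lookup.

    from d3m.contrib import pipelines
    from experimenter.utils import get_data_prep_from_d3m

    # Bundled split-pipeline IDs resolve to files shipped with the d3m
    # package, avoiding a metalearning-database round trip.
    path = get_data_prep_from_d3m(pipelines.K_FOLD_TABULAR_SPLIT_PIPELINE_ID)
    assert path == pipelines.K_FOLD_TABULAR_SPLIT_PIPELINE_PATH

    # 'unknown-id' is a placeholder; IDs not bundled with d3m return None,
    # signalling the caller to query Elasticsearch instead.
    assert get_data_prep_from_d3m('unknown-id') is None
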
From 9ce7db7f5028bd9a5ca128cca421089317ca3690 Mon Sep 17 00:00:00 2001 From: Eric Manner Date: Mon, 22 Mar 2021 23:47:37 +0000 Subject: [PATCH 32/44] Minor changes to query, still failed pipelines that probably should not be failing --- experimenter/evaluate_pipeline_new.py | 2 +- experimenter/modify_generator.py | 14 ++++---------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/evaluate_pipeline_new.py index 656c49f..6aa53d1 100644 --- a/experimenter/evaluate_pipeline_new.py +++ b/experimenter/evaluate_pipeline_new.py @@ -84,7 +84,7 @@ def evaluate_pipeline_on_problem(pipeline_path: str, #evaluate pipeline evaluate_pipeline_via_d3m_cli(pipeline=pipeline_path, problem=problem_path, input=input_path, output_run=output_run_path, - random_seed=random_seed, data_pipeline_path = data_pipeline_path, + random_seed=random_seed, data_pipeline_path=data_pipeline_path, data_random_seed=data_random_seed) def evaluate_pipeline_via_d3m_cli(pipeline: str, diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py index fa65736..233d4ec 100644 --- a/experimenter/modify_generator.py +++ b/experimenter/modify_generator.py @@ -5,6 +5,7 @@ from random import randint from d3m.contrib.pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_PATH as data_split_file import json +import os import yaml from experimenter.evaluate_pipeline_new import evaluate_pipeline_on_problem as evaluate_pipeline @@ -50,20 +51,13 @@ def _get_generator(self): #save the pipeline to path and return pipeline path data_prep_pipeline, data_random_seed = prep pipeline_path = download_from_database(pipeline, type_to_download='Pipeline') - if (data_prep_pipeline is not None): - if (~os.path.exist(data_prep_pipeline)): - data_prep_pipeline = download_from_database(data_prep_pipeline, type_to_download='Preparation') - print(data_prep_pipeline) #catch error returning none for file paths or preparation pipeline #TODO get data preparation pipeline even when it is not explicitly defined if (problem_path is None or dataset_doc is None or data_prep_pipeline is None): continue - evaluate_pipeline(pipeline_path=pipeline_path, - problem_path=problem_path, - input_path=dataset_doc, - random_seed=random_seed, - data_pipeline_path=data_prep_pipeline, - data_random_seed=data_random_seed) + #check if query returned a path or an id + if (os.path.exists(data_prep_pipeline) is False): + data_prep_pipeline = download_from_database(data_prep_pipeline, type_to_download='Preparation') job = queue.make_job(evaluate_pipeline, pipeline_path=pipeline_path, problem_path=problem_path, From 6165dbd20a4d9dc20546b24c6d1b91046243210a Mon Sep 17 00:00:00 2001 From: Benson Manner Date: Tue, 23 Mar 2021 13:18:40 -0600 Subject: [PATCH 33/44] More robust data preparation and scoring pipelines from pipeline run --- experimenter/evaluate_pipeline_new.py | 90 ++++++++++++++++++++------- experimenter/modify_generator.py | 72 ++++++++++++++------- experimenter/query.py | 50 ++++++++++----- experimenter/utils.py | 43 ++++++++----- 4 files changed, 178 insertions(+), 77 deletions(-) diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/evaluate_pipeline_new.py index 6aa53d1..46ee5cb 100644 --- a/experimenter/evaluate_pipeline_new.py +++ b/experimenter/evaluate_pipeline_new.py @@ -36,12 +36,16 @@ def save_pipeline_run_to_d3m_db(pipeline_run_path: str): pipeline_run = yaml.full_load(pipeline_data) return D3MMtLDB().save_pipeline_run(pipeline_run) -def evaluate_pipeline_on_problem(pipeline_path: 
str, - problem_path: str, - input_path: str, - random_seed: int, +def evaluate_pipeline_on_problem(pipeline: str=None, + problem: str=None, + input: str=None, + random_seed: int=0, data_pipeline_path: str=k_fold_split_path, - data_random_seed: int=0): + data_random_seed: int=0, + data_params=None, + scoring_pipeline: str=None, + scoring_params=None, + scoring_random_seed: int=0): """ Evaluate pipeline on problem. A less verbose form of running d3m's runtime cli 'evaluate' command. @@ -58,6 +62,18 @@ def evaluate_pipeline_on_problem(pipeline_path: str, path to input full data random_seed : int random seed to be used for pipeline run + data_pipeline_path: str + path to data prepation pipeline + data_random_seed: int + random_seed to be used in data preparation + data_params: + parameters for data preparation + scoring_params: + parameters for scoring pipeline + scoring_random_seed: int + random seed for scoring + scoring_pipeline: str + path to scoring pipeline Returns: ---------- @@ -69,12 +85,12 @@ def evaluate_pipeline_on_problem(pipeline_path: str, when a file cannot be opened """ output_run_path = [] - with open(pipeline_path, 'r') as data: - pipeline = json.load(data) - output_run_path.append(pipeline['id']) - with open(problem_path, 'r') as data: - problem = json.load(data) - output_run_path.append(problem['about']['problemID']) + with open(pipeline, 'r') as data: + pipe = json.load(data) + output_run_path.append(pipe['id']) + with open(problem, 'r') as data: + prob = json.load(data) + output_run_path.append(prob['about']['problemID']) output_run_path.append(str(random_seed)) #get the output run path output_run_path = os.path.abspath(os.path.join('/data', 'Pipeline_Run', @@ -82,18 +98,24 @@ def evaluate_pipeline_on_problem(pipeline_path: str, #create the directory os.makedirs(os.path.dirname(output_run_path),exist_ok=True) #evaluate pipeline - evaluate_pipeline_via_d3m_cli(pipeline=pipeline_path, problem=problem_path, - input=input_path, output_run=output_run_path, + evaluate_pipeline_via_d3m_cli(pipeline=pipeline, problem=problem, + input=input, output_run=output_run_path, random_seed=random_seed, data_pipeline_path=data_pipeline_path, - data_random_seed=data_random_seed) - -def evaluate_pipeline_via_d3m_cli(pipeline: str, - problem: str, - input: str, - output_run: str, - random_seed: int, + data_random_seed=data_random_seed, data_params=data_params, + scoring_pipeline=scoring_pipeline, scoring_params=scoring_params, + scoring_random_seed=scoring_random_seed) + +def evaluate_pipeline_via_d3m_cli(pipeline: str=None, + problem: str=None, + input: str=None, + output_run: str=None, + random_seed: int=0, data_pipeline_path: str=k_fold_split_path, - data_random_seed: int=0): + data_random_seed: int=0, + data_params=None, + scoring_pipeline: str=None, + scoring_params=None, + scoring_random_seed: int=0): """ Evaluate pipeline on problem using d3m's runtime cli. Wrapper function to execute d3m's runtime cli 'evaluate' command. 
@@ -120,8 +142,14 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, path to data prepation pipeline data_random_seed: int random_seed to be used in data preparation - input_run: path to pipeline run file - + data_params: + parameters for data preparation + scoring_params: + parameters for scoring pipeline + scoring_random_seed: int + random seed for scoring + scoring_pipeline: str + path to scoring pipeline Return: ------- None @@ -140,7 +168,14 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, if (not os.path.isfile(input)): raise ValueError('\'{}\' param not a file path'.format('input')) - + + if (not os.path.isfile(data_pipeline_path)): + raise ValueError('\'{}\' param not a file path'.format('input')) + + if (not os.path.isfile(scoring_pipeline)): + raise ValueError('\'{}\' param not a file path'.format('input')) + + #TODO - call fit-score when the data pipeline is not defined in the pipeline run args = ['d3m', 'runtime','--random-seed', str(random_seed), 'evaluate'] args.extend(('--pipeline', pipeline)) args.extend(('--problem', problem)) @@ -148,6 +183,13 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, args.extend(('--output-run', output_run)) args.extend(('--data-pipeline', data_pipeline_path)) args.extend(('--data-random-seed', data_random_seed)) + if (data_params is not None): + args.extend(('--data-param', data_params)) + args.extend(('--scoring-pipeline', scoring_pipeline)) + args.extend(('--scoring-random-seed', scoring_random_seed)) + if (scoring_params is not None): + args.extend(('--scoring-param', scoring_params)) d3m_cli.main(args) if (config.save_to_d3m is True): save_pipeline_run_to_d3m_db(output_run) + diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py index 233d4ec..6a5237c 100644 --- a/experimenter/modify_generator.py +++ b/experimenter/modify_generator.py @@ -1,14 +1,17 @@ -from experimenter.query import query_on_seeds -from experimenter import queue, utils -from experimenter.utils import download_from_database -import d3m.metadata.pipeline from random import randint -from d3m.contrib.pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_PATH as data_split_file import json import os import yaml + +from d3m.contrib.pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_PATH as data_split_file +from d3m.contrib.pipelines import SCORING_PIPELINE_PATH as scoring_file + +from experimenter.query import query_on_seeds +from experimenter import queue, utils +from experimenter.utils import download_from_database from experimenter.evaluate_pipeline_new import evaluate_pipeline_on_problem as evaluate_pipeline + class ModifyGenerator: """ Generator to be used for creating modified pipelines based on existing pipelines in the database @@ -25,7 +28,8 @@ def __init__(self, modify_type: str='random-seed', max_jobs: int=None, args=None else: self.query_results = self._query(self.args) self.generator = self._get_generator() - + + def __iter__(self): return self @@ -47,24 +51,41 @@ def _get_generator(self): """ for query_result in self.query_results: #iterate through modifier results - for pipeline, problem_path, dataset_doc, random_seed, prep in self._modify(query_result,self.args): + for pipeline, problem_path, dataset_doc, seed, data, score in self._modify(query_result,self.args): #save the pipeline to path and return pipeline path - data_prep_pipeline, data_random_seed = prep + data_prep_pipeline, data_random_seed, data_params = data + scoring_pipeline, scoring_random_seed, scoring_params = score pipeline_path = download_from_database(pipeline, 
type_to_download='Pipeline') + #TODO - catch when there is no data preparation pipeline and pass it further to evaluate #catch error returning none for file paths or preparation pipeline - #TODO get data preparation pipeline even when it is not explicitly defined if (problem_path is None or dataset_doc is None or data_prep_pipeline is None): continue #check if query returned a path or an id if (os.path.exists(data_prep_pipeline) is False): - data_prep_pipeline = download_from_database(data_prep_pipeline, type_to_download='Preparation') + data_prep_pipeline = download_from_database(data_prep_pipeline, type_to_download='Data Preparation') + if (os.path.exists(scoring_pipeline) is False): + scoring_pipeline = download_from_database(scoring_pipeline, type_to_download='Scoring') + evaluate_pipeline(pipeline=pipeline_path, + problem=problem_path, + input=dataset_doc, + random_seed=seed, + data_pipeline_path=data_prep_pipeline, + data_random_seed=data_random_seed, + data_params=data_params, + scoring_pipeline=scoring_pipeline, + scoring_random_seed=scoring_random_seed, + scoring_params=scoring_params) job = queue.make_job(evaluate_pipeline, - pipeline_path=pipeline_path, - problem_path=problem_path, - input_path=dataset_doc, - random_seed=random_seed, + pipeline=pipeline_path, + problem=problem_path, + input=dataset_doc, + random_seed=seed, data_pipeline_path=data_prep_pipeline, - data_random_seed=data_random_seed) + data_random_seed=data_random_seed, + data_params=data_params, + scoring_pipeline=scoring_pipeline, + scoring_random_seed=scoring_random_seed, + scoring_params=scoring_params) self.num_complete += 1 yield job @@ -119,7 +140,9 @@ def _modify_random_seed(self, seed_limit, query_args): num_run += 1 used_seeds.add(new_seed) #yield the necessary job requirements - yield query_args['pipeline'], query_args['problem_path'], query_args['dataset_doc_path'], new_seed, (query_args['data_prep_pipeline'], query_args['data_prep_seed']) + yield (query_args['pipeline'], query_args['problem_path'], query_args['dataset_doc_path'], new_seed, + (query_args['data_prep_pipeline'], query_args['data_prep_seed'], query_args['data_params']), + (query_args['scoring_pipeline'], query_args['scoring_seed'], query_args['scoring_params'])) def _run_seed_test(self,args): @@ -131,13 +154,20 @@ def _run_seed_test(self,args): dataset_path = utils.get_dataset_doc_path('185_baseball_MIN_METADATA_dataset') problem_path = utils.get_problem_path('185_baseball_MIN_METADATA_problem') data_prep_seed = 0 - with open(data_split_file, 'r') as pipeline_file: - data_prep_pipeline = yaml.full_load(pipeline_file) - data_prep_pipeline = data_prep_pipeline + #with open(data_split_file, 'r') as pipeline_file: + # data_prep_pipeline = yaml.full_load(pipeline_file) + #with open(scoring_file, 'r') as pipeline_file: + # scoring_pipeline = yaml.full_load(pipeline_file) + data_prep_seed = 0 + data_prep_pipeline = data_split_file + scoring_pipeline = scoring_file + scoring_seed = 0 used_seeds = {2,15} yield {'pipeline': pipeline, 'problem_path': problem_path, 'dataset_doc_path': dataset_path, - 'tested_seeds': used_seeds, 'data_prep_pipeline': - data_prep_pipeline, 'data_prep_seed': data_prep_seed} + 'tested_seeds': used_seeds, 'data_prep_pipeline': data_prep_pipeline, + 'data_prep_seed': data_prep_seed, 'data_params': None, + 'scoring_pipeline': scoring_pipeline, 'scoring_seed': scoring_seed, + 'scoring_params': None} def _modify_swap_primitive(self, swap_pipeline, query_args): diff --git a/experimenter/query.py b/experimenter/query.py index 
e5bc555..d9d1f39 100644
--- a/experimenter/query.py
+++ b/experimenter/query.py
@@ -1,6 +1,7 @@
 from elasticsearch import Elasticsearch
 from elasticsearch_dsl import Search, Q
-from experimenter.utils import get_problem_path, get_dataset_doc_path, get_data_prep_from_d3m
+from experimenter.utils import get_problem_path, get_dataset_doc_path, get_pipelines_from_d3m
+from d3m.runtime import _get_data_and_scoring_params_from_pipeline_run as _data_score_params
 
 HOST = 'https://metalearning.datadrivendiscovery.org/es'
 CONNECTION = Elasticsearch(hosts=[HOST], timeout=300)
@@ -21,27 +22,31 @@ def query_on_seeds(pipeline_id: str=None, limit: int=None, submitter: str='byu')
     pipeline_search = get_search_query(arguments=arguments, index='pipelines')
     for pipeline in pipeline_search.scan():
         results = scan_pipeline_runs(pipeline.id, submitter)
-        for (problem_id, dataset_id, data_prep), random_seeds in results.items():
+        for (problem_id, dataset_id, data_prep, scoring), random_seeds in results.items():
             if limit and len(random_seeds) > limit:
                 continue
-            data_prep_id, data_prep_seed = data_prep
-            data_prep_pipeline = get_data_preparation_pipeline(data_prep_id)
+            data_prep_id, data_prep_seed, data_params = data_prep
+            scoring_id, scoring_seed, scoring_params = scoring
+            data_prep_pipeline = get_pipeline(data_prep_id, types='Data')
+            scoring_pipeline = get_pipeline(scoring_id, types='Scoring')
             yield {'pipeline': pipeline.to_dict(), 'problem_path': get_problem_path(problem_id),
                    'dataset_doc_path':get_dataset_doc_path(dataset_id), 'tested_seeds': random_seeds,
-                   'data_prep_pipeline': data_prep_pipeline, 'data_prep_seed': data_prep_seed}
+                   'data_prep_pipeline': data_prep_pipeline, 'data_prep_seed': data_prep_seed,
+                   'scoring_pipeline': scoring_pipeline, 'scoring_seed': scoring_seed,
+                   'scoring_params': scoring_params, 'data_params': data_params}
 
 
-def get_data_preparation_pipeline(data_prep_id: str=None):
-    if (data_prep_id is None):
+def get_pipeline(pipeline_id: str=None, types: str='Data'):
+    if (pipeline_id is None):
         return None
-    data_prep_pipeline = get_data_prep_from_d3m(data_prep_id)
+    pipeline = get_pipelines_from_d3m(pipeline_id, types=types)
     #get from database if not in d3m module
-    if (data_prep_pipeline is None):
-        arguments = {'submitter': None, 'id': data_prep_id}
-        data_prep_search = get_search_query(arguments=arguments)
-        data_prep_pipeline = next(data_prep_search.scan())
-        data_prep_pipeline = data_prep_pipeline.to_dict()
-    return data_prep_pipeline
+    if (pipeline is None):
+        arguments = {'submitter': None, 'id': pipeline_id}
+        search = get_search_query(arguments=arguments)
+        pipeline = next(search.scan())
+        pipeline = pipeline.to_dict()
+    return pipeline
 
 
 def check_for_data_prep(pipeline_run=None):
@@ -59,8 +64,18 @@ def check_for_data_prep(pipeline_run=None):
         data_prep_seed = data_prep.random_seed
         data_prep_id = data_prep.pipeline.id
-    return data_prep_id, data_prep_seed
-
+        data_params = _data_score_params(data_prep.get('steps', []))
+
+    return data_prep_id, data_prep_seed, data_params
+
+
+def get_scoring_pipeline(pipeline_run=None):
+    scoring = pipeline_run.run.scoring
+    scoring_seed = scoring.random_seed
+    scoring_params = _data_score_params(scoring.get('steps', []))
+
+    return scoring.pipeline.id, scoring_seed, scoring_params
+
 
 def scan_pipeline_runs(pipeline_id, submitter=None):
     pipeline_run_search = Search(using=CONNECTION, index='pipeline_runs') \
@@ -71,9 +86,10 @@ def scan_pipeline_runs(pipeline_id, submitter=None):
         pipeline_run_search = pipeline_run_search.query('match', _submitter=submitter)
     results = dict()
     for pipeline_run in pipeline_run_search.scan():
- data_prep_id, data_prep_seed = check_for_data_prep(pipeline_run=pipeline_run) + data_prep = check_for_data_prep(pipeline_run=pipeline_run) + scoring = get_scoring_pipeline(pipeline_run) for dataset in pipeline_run.datasets: - dataset_prob_tuple = (pipeline_run.problem.id, dataset.id, (data_prep_id, data_prep_seed)) + dataset_prob_tuple = (pipeline_run.problem.id, dataset.id, data_prep, scoring) results[dataset_prob_tuple] = results.get(dataset_prob_tuple, set()) results[dataset_prob_tuple].add(pipeline_run.random_seed) return results diff --git a/experimenter/utils.py b/experimenter/utils.py index e2d1a0d..11f9eb5 100644 --- a/experimenter/utils.py +++ b/experimenter/utils.py @@ -17,7 +17,11 @@ DEFAULT_DATASET_DIR = "/datasets" datasets, problems = None, None -def get_dict_data_prep_pipelines(): + +def get_data_prep_pipelines(): + """ + Get data preparation pipelines that are already in the d3m module + """ data_prep_dict = dict() data_prep_id_list = list() #save the relevant paths and ids for data preparation @@ -30,18 +34,32 @@ def get_dict_data_prep_pipelines(): data_prep_id_list.append(pipelines.K_FOLD_TABULAR_SPLIT_PIPELINE_ID) data_prep_dict[pipelines.K_FOLD_TABULAR_SPLIT_PIPELINE_ID] = pipelines.K_FOLD_TABULAR_SPLIT_PIPELINE_PATH return data_prep_dict, data_prep_id_list + - -def get_data_prep_from_d3m(pipeline_id: str = None): +def get_scoring_pipelines(): + """ + Get the scoring pipelines that are already in the d3m module + """ + scoring_dict = dict() + scoring_id_list = list() + #save relevant paths and ids for scoring pipelines + scoring_id_list.append(pipelines.SCORING_PIPELINE_ID) + scoring_dict[pipelines.SCORING_PIPELINE_ID] = pipelines.SCORING_PIPELINE_PATH + return scoring_dict, scoring_id_list + + +def get_pipelines_from_d3m(pipeline_id: str = None, types='Data'): """Checks if data preparation pipeline is in d3m module, if not, return None - """ - data_prep_dict, data_prep_id_list = get_dict_data_prep_pipelines() - if (pipeline_id in data_prep_id_list): - return data_prep_dict[pipeline_id] - else: - return None + if (types=='Data'): + dict_ids, id_list = get_data_prep_pipelines() + elif (types=='Scoring'): + dict_ids, id_list = get_scoring_pipelines() + if (pipeline_id in id_list): + return dict_ids[pipeline_id] + return None + def save_to_not_exist_file(filename:str = 'dataset_dne.txt', save_id:str = None): #create the directory @@ -58,12 +76,7 @@ def save_to_not_exist_file(filename:str = 'dataset_dne.txt', save_id:str = None) def download_from_database(data, type_to_download: str = 'Pipeline'): i_d = data['id'] - if (type_to_download == 'Pipeline'): - save_path = os.path.join('/data', 'Pipeline', i_d+str('.json')) - elif (type_to_download == 'Preparation'): - save_path = os.path.join('/data', 'DataPreparation', i_d+str('.json')) - else: - raise ValueError("type: {}, not available for download".format(type_to_download)) + save_path = os.path.join('/data', type_to_download, i_d+str('.json')) #create the new directory os.makedirs(os.path.dirname(save_path),exist_ok=True) #save the file to the directory From 5208a461669d6a524719616aa16143026b2f6653 Mon Sep 17 00:00:00 2001 From: Benson Manner Date: Tue, 23 Mar 2021 13:19:59 -0600 Subject: [PATCH 34/44] Unnecessary commenting in _run_seed_test --- experimenter/modify_generator.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py index 6a5237c..02aef61 100644 --- a/experimenter/modify_generator.py +++ b/experimenter/modify_generator.py @@ -154,10 +154,6 
@@ def _run_seed_test(self,args): dataset_path = utils.get_dataset_doc_path('185_baseball_MIN_METADATA_dataset') problem_path = utils.get_problem_path('185_baseball_MIN_METADATA_problem') data_prep_seed = 0 - #with open(data_split_file, 'r') as pipeline_file: - # data_prep_pipeline = yaml.full_load(pipeline_file) - #with open(scoring_file, 'r') as pipeline_file: - # scoring_pipeline = yaml.full_load(pipeline_file) data_prep_seed = 0 data_prep_pipeline = data_split_file scoring_pipeline = scoring_file From c59ad21510e7360d96a69f86f297bd5f493bffd2 Mon Sep 17 00:00:00 2001 From: Eric Manner Date: Tue, 23 Mar 2021 21:46:42 +0000 Subject: [PATCH 35/44] query changes for data params and scoring params --- experimenter/query.py | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/experimenter/query.py b/experimenter/query.py index d9d1f39..455c834 100644 --- a/experimenter/query.py +++ b/experimenter/query.py @@ -22,11 +22,14 @@ def query_on_seeds(pipeline_id: str=None, limit: int=None, submitter: str='byu') pipeline_search = get_search_query(arguments=arguments, index='pipelines') for pipeline in pipeline_search.scan(): results = scan_pipeline_runs(pipeline.id, submitter) - for (problem_id, dataset_id, data_prep, scoring), random_seeds in results.items(): + for (problem_id, dataset_id, data_prep, scoring), params_dict in results.items(): if limit and len(random_seeds) > limit: continue - data_prep_id, data_prep_seed, data_params = data_prep - scoring_id, scoring_seed, scoring_params = scoring + data_prep_id, data_prep_seed = data_prep + scoring_id, scoring_seed = scoring + random_seeds = params_dict['random_seeds'] + data_params = params_dict['data_params'] + scoring_params = params_dict['scoring_params'] data_prep_pipeline = get_pipeline(data_prep_id, types='Data') scoring_pipeline = get_pipeline(scoring_id, types='Scoring') yield {'pipeline': pipeline.to_dict(), 'problem_path': get_problem_path(problem_id), @@ -64,19 +67,26 @@ def check_for_data_prep(pipeline_run=None): if (data_prep is not None): data_prep_seed = data_prep.random_seed data_prep_id = data_prep.pipeline.id + data_prep = data_prep.to_dict() data_params = _data_score_params(data_prep.get('steps', [])) - return data_prep_id, data_prep_seed, data_params + return (data_prep_id, data_prep_seed), data_params def get_scoring_pipeline(pipeline_run=None): scoring = pipeline_run.run.scoring scoring_seed = scoring.random_seed - scoring_params = _data_score_params(scoring.get('steps', [])) - - return scoring.pipeline.id, scoring_seed, scoring_params + scoring_id = scoring.pipeline.id + scoring = scoring.to_dict() + scoring_params = _data_score_params(scoring.get('steps', [])) + return (scoring_id, scoring_seed), scoring_params +def get_unique_results(results: dict = None): + #function for getting unique results from the result dictionary + pass + + def scan_pipeline_runs(pipeline_id, submitter=None): pipeline_run_search = Search(using=CONNECTION, index='pipeline_runs') \ .query('match', pipeline__id=pipeline_id) \ @@ -86,11 +96,12 @@ def scan_pipeline_runs(pipeline_id, submitter=None): pipeline_run_search = pipeline_run_search.query('match', _submitter=submitter) results = dict() for pipeline_run in pipeline_run_search.scan(): - data_prep = check_for_data_prep(pipeline_run=pipeline_run) - scoring = get_scoring_pipeline(pipeline_run) + data_prep, data_params = check_for_data_prep(pipeline_run=pipeline_run) + scoring, scoring_params = get_scoring_pipeline(pipeline_run) for dataset in 
pipeline_run.datasets:
             dataset_prob_tuple = (pipeline_run.problem.id, dataset.id, data_prep, scoring)
-            results[dataset_prob_tuple] = results.get(dataset_prob_tuple, set())
-            results[dataset_prob_tuple].add(pipeline_run.random_seed)
+            results[dataset_prob_tuple] = results.get(dataset_prob_tuple, list())
+            result_add_dict = {'random_seed': pipeline_run.random_seed, 'data_params': data_params, 'scoring_params': scoring_params}
+            results[dataset_prob_tuple].append(result_add_dict)
     return results
 

From 500ea108985de42149468ce84f700886e1010606 Mon Sep 17 00:00:00 2001
From: Benson Manner
Date: Tue, 23 Mar 2021 20:25:17 -0600
Subject: [PATCH 36/44] Add data params and scoring params to pipeline run cli;
 works locally

---
 experimenter/evaluate_pipeline_new.py | 14 ++++--
 experimenter/query.py                 | 72 +++++++++++++++++++++------
 2 files changed, 65 insertions(+), 21 deletions(-)

diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/evaluate_pipeline_new.py
index 46ee5cb..66737df 100644
--- a/experimenter/evaluate_pipeline_new.py
+++ b/experimenter/evaluate_pipeline_new.py
@@ -182,13 +182,17 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str=None,
     args.extend(('--input', input))
     args.extend(('--output-run', output_run))
     args.extend(('--data-pipeline', data_pipeline_path))
-    args.extend(('--data-random-seed', data_random_seed))
-    if (data_params is not None):
-        args.extend(('--data-param', data_params))
+    args.extend(('--data-random-seed', str(data_random_seed)))
     args.extend(('--scoring-pipeline', scoring_pipeline))
-    args.extend(('--scoring-random-seed', scoring_random_seed))
+    args.extend(('--scoring-random-seed', str(scoring_random_seed)))
+    #add the data parameters to the cli arguments
+    if (data_params is not None):
+        for name, value in data_params.items():
+            args.extend(('--data-param', name, value))
+    #add the scoring parameters to the cli arguments
     if (scoring_params is not None):
-        args.extend(('--scoring-param', scoring_params))
+        for name, value in scoring_params.items():
+            args.extend(('--scoring-param', name, value))
     d3m_cli.main(args)
     if (config.save_to_d3m is True):
         save_pipeline_run_to_d3m_db(output_run)
diff --git a/experimenter/query.py b/experimenter/query.py
index 455c834..96618ad 100644
--- a/experimenter/query.py
+++ b/experimenter/query.py
@@ -22,21 +22,26 @@ def query_on_seeds(pipeline_id: str=None, limit: int=None, submitter: str='byu')
     pipeline_search = get_search_query(arguments=arguments, index='pipelines')
     for pipeline in pipeline_search.scan():
         results = scan_pipeline_runs(pipeline.id, submitter)
-        for (problem_id, dataset_id, data_prep, scoring), params_dict in results.items():
-            if limit and len(random_seeds) > limit:
-                continue
+        for dataset_prob_tuple, results_dict in results.items():
+            unique_items = get_unique_results(results_dict)
+            #unpack values from tuple
+            problem_id, dataset_id, data_prep, scoring = dataset_prob_tuple
+            scoring_id, scoring_random_seed = scoring
             data_prep_id, data_prep_seed = data_prep
-            scoring_id, scoring_seed = scoring
-            random_seeds = params_dict['random_seeds']
-            data_params = params_dict['data_params']
-            scoring_params = params_dict['scoring_params']
+            #get preparation and scoring pipelines
             data_prep_pipeline = get_pipeline(data_prep_id, types='Data')
             scoring_pipeline = get_pipeline(scoring_id, types='Scoring')
+            for params in unique_items:
+                data_params = params['data_params']
+                scoring_params = params['scoring_params']
+                random_seeds = params['random_seeds']
+                if limit and len(random_seeds) > limit:
+                    continue
-            yield {'pipeline': pipeline.to_dict(), 'problem_path': get_problem_path(problem_id),
-                   'dataset_doc_path':get_dataset_doc_path(dataset_id), 'tested_seeds': random_seeds,
-                   'data_prep_pipeline': data_prep_pipeline, 'data_prep_seed': data_prep_seed,
-                   'scoring_pipeline': scoring_pipeline, 'scoring_seed': scoring_seed,
-                   'scoring_params': scoring_params, 'data_params': data_params}
+                yield {'pipeline': pipeline.to_dict(), 'problem_path': get_problem_path(problem_id),
+                       'dataset_doc_path':get_dataset_doc_path(dataset_id), 'tested_seeds': random_seeds,
+                       'data_prep_pipeline': data_prep_pipeline, 'data_prep_seed': data_prep_seed,
+                       'scoring_pipeline': scoring_pipeline, 'scoring_seed': scoring_random_seed,
+                       'scoring_params': scoring_params, 'data_params': data_params}
 
 
 def get_pipeline(pipeline_id: str=None, types: str='Data'):
@@ -59,6 +64,7 @@ def check_for_data_prep(pipeline_run=None):
     data_prep = None
     data_prep_id = None
     data_prep_seed = None
+    data_params = None
     try:
         data_prep = pipeline_run.run.data_preparation
     except:
@@ -82,9 +88,41 @@ def get_scoring_pipeline(pipeline_run=None):
     return (scoring_id, scoring_seed), scoring_params
 
+
+def get_list_duplicates(params_list, match_item):
+    start_loc = -1
+    locs = []
+    while True:
+        try:
+            loc = params_list.index(match_item,start_loc+1)
+        except ValueError:
+            break
+        else:
+            locs.append(loc)
+            start_loc = loc
+    return locs
+
+
 def get_unique_results(results: dict = None):
     #function for getting unique results from the result dictionary
-    pass
+    random_seeds_list = results['random_seeds']
+    params_list = results['params']
+    final_list = list()
+    location_dict = dict()
+    #loop through the values
+    for it, param in enumerate(params_list):
+        #get matching pairs of each value
+        location_dict[it] = get_list_duplicates(params_list, param)
+    skip = set()
+    for loc, values in location_dict.items():
+        if loc in skip:
+            continue
+        random_seeds = set()
+        for value in values:
+            random_seeds.add(random_seeds_list[value])
+            skip.add(value)
+        data_params, scoring_params = params_list[loc]
+        final_list.append({'data_params': data_params, 'scoring_params': scoring_params, 'random_seeds': random_seeds})
+    return final_list
 
 
 def scan_pipeline_runs(pipeline_id, submitter=None):
@@ -100,8 +138,10 @@ def scan_pipeline_runs(pipeline_id, submitter=None):
         scoring, scoring_params = get_scoring_pipeline(pipeline_run)
         for dataset in pipeline_run.datasets:
             dataset_prob_tuple = (pipeline_run.problem.id, dataset.id, data_prep, scoring)
-            results[dataset_prob_tuple] = results.get(dataset_prob_tuple, list())
-            result_add_dict = {'random_seed': pipeline_run.random_seed, 'data_params': data_params, 'scoring_params': scoring_params}
-            results[dataset_prob_tuple].append(result_add_dict)
+            results[dataset_prob_tuple] = results.get(dataset_prob_tuple, dict())
+            results[dataset_prob_tuple]['random_seeds'] = results[dataset_prob_tuple].get('random_seeds', list())
+            results[dataset_prob_tuple]['params'] = results[dataset_prob_tuple].get('params', list())
+            results[dataset_prob_tuple]['random_seeds'].append(pipeline_run.random_seed)
+            results[dataset_prob_tuple]['params'].append((data_params, scoring_params))
     return results
 

From 5c90be4e2cbb93ea619187a48b78242965658c05 Mon Sep 17 00:00:00 2001
From: Eric Manner
Date: Wed, 24 Mar 2021 03:21:55 +0000
Subject: [PATCH 37/44] Using data and scoring params; working remotely

---
 experimenter/modify_generator.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py
index 02aef61..895f2d0 100644
--- a/experimenter/modify_generator.py
+++ b/experimenter/modify_generator.py
@@ -65,16 +65,6 @@ def _get_generator(self):
                     data_prep_pipeline = download_from_database(data_prep_pipeline, type_to_download='Data Preparation')
                 if (os.path.exists(scoring_pipeline) is False):
                     scoring_pipeline = download_from_database(scoring_pipeline, type_to_download='Scoring')
-                evaluate_pipeline(pipeline=pipeline_path,
-                                  problem=problem_path,
-                                  input=dataset_doc,
-                                  random_seed=seed,
-                                  data_pipeline_path=data_prep_pipeline,
-                                  data_random_seed=data_random_seed,
-                                  data_params=data_params,
-                                  scoring_pipeline=scoring_pipeline,
-                                  scoring_random_seed=scoring_random_seed,
-                                  scoring_params=scoring_params)
                 job = queue.make_job(evaluate_pipeline,
                                      pipeline=pipeline_path,
                                      problem=problem_path,

From c3b4b60b4bfb05f4ff9af958d3ce97497ab5f066 Mon Sep 17 00:00:00 2001
From: Benson Manner
Date: Wed, 24 Mar 2021 08:07:11 -0600
Subject: [PATCH 38/44] Working on merge suggestions

---
 experimenter/config.py                |   5 +
 experimenter/databases/d3m_mtl.py     |   6 +-
 experimenter/evaluate_pipeline_new.py | 199 -----------------------
 experimenter/execute_pipeline.py      | 218 --------------------------
 experimenter/modify_generator.py      |   4 +-
 experimenter/problem_new.py           |   0
 experimenter/utils.py                 |  17 ++
 7 files changed, 27 insertions(+), 422 deletions(-)
 delete mode 100644 experimenter/evaluate_pipeline_new.py
 delete mode 100644 experimenter/execute_pipeline.py
 delete mode 100644 experimenter/problem_new.py

diff --git a/experimenter/config.py b/experimenter/config.py
index 41c5787..3233730 100644
--- a/experimenter/config.py
+++ b/experimenter/config.py
@@ -46,3 +46,8 @@ def validate_save():
     if save_to_d3m is None:
         raise exceptions.ConfigError(_ERROR_MESSAGE.format('SAVE_TO_D3M'))
+
+output_run_path: str = os.path.abspath(os.path.join('/data', 'pipeline_runs'))
+if (not os.path.exists(output_run_path)):
+    #create the directory
+    os.makedirs(output_run_path, exist_ok=True)
diff --git a/experimenter/databases/d3m_mtl.py b/experimenter/databases/d3m_mtl.py
index 6badcb8..da6c77e 100644
--- a/experimenter/databases/d3m_mtl.py
+++ b/experimenter/databases/d3m_mtl.py
@@ -23,7 +23,7 @@ def __init__(self) -> None:
         self._post_url = D3M_MTL_DB_POST_URL
         # This env var allows code calling this class to be run during
         # unit tests without actually saving to the production DB.
-        self.should_save = config.D3MConfig().save_to_d3m
+        self.should_save = config.save_to_d3m
         # A reference to a low-level elasticsearch client. This can be
         # used to query the D3M DB, or this class's `search` method
         # can be used, and is preferred, since its API is more straightforward.
         # The low-level client is still necessary for
         # certain things though.
         self.es = Elasticsearch(hosts=[D3M_MTL_DB_GET_URL], timeout=30)
         # Our submitter name.
-        self._submitter = config.D3MConfig().d3m_submitter
+        self._submitter = config.d3m_db_submitter
         # The secret verifying us as the submitter we say we are.
- self._x_token = config.D3MConfig().d3m_token + self._x_token = config.d3m_db_token if self._is_identifying_as_submitter(): logger.info( f"Documents will be saved under submitter name: '{self._submitter}'" diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/evaluate_pipeline_new.py deleted file mode 100644 index 66737df..0000000 --- a/experimenter/evaluate_pipeline_new.py +++ /dev/null @@ -1,199 +0,0 @@ -import itertools as it -import json -import yaml -import os -import parser - -from typing import Any, List, Tuple -from uuid import UUID -from experimenter import config, utils - -from d3m import cli as d3m_cli -from d3m.contrib.pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_PATH as k_fold_split_path -from d3m.contrib.pipelines import FIXED_SPLIT_TABULAR_SPLIT_PIPELINE_PATH as fixed_split_path -from experimenter.databases.d3m_mtl import D3MMtLDB - - -def save_pipeline_run_to_d3m_db(pipeline_run_path: str): - """ - Saves a pipeline run document to the d3m database. - - Parameters - ---------- - pipeline_run_path : path_like str - path to pipeline_run document - - Returns: - ---------- - TODO - - Raises: - ---------- - TODO - """ - d3m_db = D3MMtLDB() - with open(pipeline_run_path) as pipeline_data: - pipeline_run = yaml.full_load(pipeline_data) - return D3MMtLDB().save_pipeline_run(pipeline_run) - -def evaluate_pipeline_on_problem(pipeline: str=None, - problem: str=None, - input: str=None, - random_seed: int=0, - data_pipeline_path: str=k_fold_split_path, - data_random_seed: int=0, - data_params=None, - scoring_pipeline: str=None, - scoring_params=None, - scoring_random_seed: int=0): - """ - Evaluate pipeline on problem. - A less verbose form of running d3m's runtime cli 'evaluate' command. - See 'evaluate_pipeline_via_d3m_cli' for more options for running - the 'evaluate' command. 
- - Parameters - ---------- - pipeline_path : path_like str - path to pipeline doc - problem_path : path_like str - path to problem doc - input_path : path_like str - path to input full data - random_seed : int - random seed to be used for pipeline run - data_pipeline_path: str - path to data prepation pipeline - data_random_seed: int - random_seed to be used in data preparation - data_params: - parameters for data preparation - scoring_params: - parameters for scoring pipeline - scoring_random_seed: int - random seed for scoring - scoring_pipeline: str - path to scoring pipeline - - Returns: - ---------- - None - - Raises: - --------------------------------- - OSError - when a file cannot be opened - """ - output_run_path = [] - with open(pipeline, 'r') as data: - pipe = json.load(data) - output_run_path.append(pipe['id']) - with open(problem, 'r') as data: - prob = json.load(data) - output_run_path.append(prob['about']['problemID']) - output_run_path.append(str(random_seed)) - #get the output run path - output_run_path = os.path.abspath(os.path.join('/data', 'Pipeline_Run', - '_'.join(output_run_path)+'.yaml')) - #create the directory - os.makedirs(os.path.dirname(output_run_path),exist_ok=True) - #evaluate pipeline - evaluate_pipeline_via_d3m_cli(pipeline=pipeline, problem=problem, - input=input, output_run=output_run_path, - random_seed=random_seed, data_pipeline_path=data_pipeline_path, - data_random_seed=data_random_seed, data_params=data_params, - scoring_pipeline=scoring_pipeline, scoring_params=scoring_params, - scoring_random_seed=scoring_random_seed) - -def evaluate_pipeline_via_d3m_cli(pipeline: str=None, - problem: str=None, - input: str=None, - output_run: str=None, - random_seed: int=0, - data_pipeline_path: str=k_fold_split_path, - data_random_seed: int=0, - data_params=None, - scoring_pipeline: str=None, - scoring_params=None, - scoring_random_seed: int=0): - """ - Evaluate pipeline on problem using d3m's runtime cli. - Wrapper function to execute d3m's runtime cli 'evaluate' command. - Arguments mirror the same arguments using the cli. - Only handles cases with a data preparation pipeline in the - pipeline run. - - Parameters - ---------- - pipeline : path_like or uuid4 str - path to pipeline doc or pipeline ID - problem : path_like str - path to problem doc - input : path_like str - path to input full data - output_run : path_like str or '-' - path where pipeline_run doc - will be saved. 
- use '-' for stdin - random_seed : int - random seed to used for - pipeline run - data_pipeline_path: str - path to data prepation pipeline - data_random_seed: int - random_seed to be used in data preparation - data_params: - parameters for data preparation - scoring_params: - parameters for scoring pipeline - scoring_random_seed: int - random seed for scoring - scoring_pipeline: str - path to scoring pipeline - Return: - ------- - None - - Raises: - ------- - ValueError - when parameter value is - invalid - """ - if (not os.path.isfile(pipeline)): - raise ValueError('\'{}\' param not a file path'.format('pipeline')) - - if (not os.path.isfile(problem)): - raise ValueError('\'{}\' param not a file path'.format('problem')) - - if (not os.path.isfile(input)): - raise ValueError('\'{}\' param not a file path'.format('input')) - - if (not os.path.isfile(data_pipeline_path)): - raise ValueError('\'{}\' param not a file path'.format('input')) - - if (not os.path.isfile(scoring_pipeline)): - raise ValueError('\'{}\' param not a file path'.format('input')) - - #TODO - call fit-score when the data pipeline is not defined in the pipeline run - args = ['d3m', 'runtime','--random-seed', str(random_seed), 'evaluate'] - args.extend(('--pipeline', pipeline)) - args.extend(('--problem', problem)) - args.extend(('--input', input)) - args.extend(('--output-run', output_run)) - args.extend(('--data-pipeline', data_pipeline_path)) - args.extend(('--data-random-seed', str(data_random_seed))) - args.extend(('--scoring-pipeline', scoring_pipeline)) - args.extend(('--scoring-random-seed', str(scoring_random_seed))) - #add the data parameters to the cli arguments - if (data_params is not None): - for name, value in data_params.items(): - args.extend(('--data-param', name, value)) - #add the scoring parameters to the cli arguments - if (scoring_params is not None): - for name, value in scoring_params.items(): - args.extend(('--scoring-param', name, value)) - d3m_cli.main(args) - if (config.save_to_d3m is True): - save_pipeline_run_to_d3m_db(output_run) - diff --git a/experimenter/execute_pipeline.py b/experimenter/execute_pipeline.py deleted file mode 100644 index c1bb34b..0000000 --- a/experimenter/execute_pipeline.py +++ /dev/null @@ -1,218 +0,0 @@ -""" -FILE INFORMATION: -This file needs to be a stand alone file so that it can be imported and used by the -experimenter_driver.py. This is because RQ only accepts a function that is imported -and not defined in __main__. These functions are what is needed to execute a pipeline -on a problem and can be used by an individual machine, or used in a RQ job queue. -""" -import logging -from typing import List - -from d3m.metadata.pipeline import Pipeline - -from experimenter.run_fit_pipeline import RunFitPipeline -from experimenter.run_pipeline import RunPipeline -from experimenter.databases.aml_mtl import PipelineDB -from experimenter.databases.d3m_mtl import D3MMtLDB -from experimenter.problem import ProblemReference -from experimenter.config import SAVE_TO_D3M -from experimenter.constants import METRICS_BY_PROBLEM_TYPE - - -logger = logging.getLogger(__name__) - - -def execute_pipeline_on_problem( - pipe: Pipeline, - problem: ProblemReference, - volumes_dir: str, - all_metrics: bool = True, -): - """ - The main function to execute a pipeline. Called in `experimenter_driver.py`. - This function will check if the pipeline and dataset has been executed before, - run the pipeline, and record the results. 
- - :param pipe: the pipeline object that will be executed - :param problem: a reference to the problem to run the pipeline on. - :param volumes_dir: a string containing the path to the volumes directory - :param all_metrics: if `True`, the pipeline will be scored against all metrics - registered for `problem`'s problem type. If `False`, it will only be scored - against the metrics listed in `problem`'s description. - """ - # Validate args - if all_metrics and problem.problem_type not in METRICS_BY_PROBLEM_TYPE: - raise ValueError( - f"cannot compute all metrics for problem {problem.name}, " - "it does not have a supported problem type." - ) - - # If the experimenter is configured to save documents to the D3M database, - # we only want to execute and save this pipeline run if it doesn't already - # exist in the D3M database. - if SAVE_TO_D3M and D3MMtLDB().has_pipeline_been_run_on_problem(pipe, problem): - logger.info("Pipeline has already been run on this dataset, SKIPPING.") - return - - metric_names = ( - METRICS_BY_PROBLEM_TYPE[problem.problem_type] if all_metrics else None - ) - - # Attempt to run the pipeline - logger.info("\n Running pipeline on problem {}".format(problem.name)) - run_pipeline = RunPipeline(volumes_dir, problem) - try: - scores, (fit_result, produce_result) = run_pipeline.run( - pipeline=pipe, metric_names=metric_names - ) - except Exception as e: - logger.exception("pipeline was not successfully run") - print_pipeline(pipe.to_json_structure()) - raise e - - score = scores[0] - # put in the fit pipeline run - handle_successful_pipeline_run( - fit_result.pipeline_run.to_json_structure(), pipe, score - ) - # put in the produce pipeline run - handle_successful_pipeline_run( - produce_result.pipeline_run.to_json_structure(), pipe, score - ) - - -def execute_metafeatures_pipeline_on_problem( - pipe: Pipeline, problem: ProblemReference, volumes_dir: str -): - """ - The main function to execute a `metafeatures` pipeline. Differs from - `execute_pipeline_on_problem` by only handling metafeatures, and by - computing them on every subset of the problem e.g. TRAIN, TEST, SCORE, etc. - Called in `experimenter_driver.py`. This function will run the pipeline, - and record the results. - - :param pipe: the pipeline object that will be executed - :param problem: a reference to the problem to run the pipeline on. - :param volumes_dir: a string containing the path to the volumes directory - """ - mongo_db = PipelineDB() - - for subset in problem.valid_subsets: - if problem.has_subset(subset): - problem.subset = subset - logger.info( - f"computing metafeatures for problem {problem.name} ({problem.subset} subset)..." - ) - # Compute and store the metafeatures for this subset of the problem. - run_pipeline = RunFitPipeline(volumes_dir, problem) - try: - results = run_pipeline.run(pipeline=pipe) - except Exception as e: - logger.exception("pipeline was not successfully run") - print_pipeline(pipe._to_json_structure()) - raise e - - logger.info(results) - fit_result = results - mongo_db.add_to_metafeatures(fit_result._to_json_structure()) - - -def handle_successful_pipeline_run( - pipeline_run: dict, pipeline: Pipeline, score: float -): - """ - Called after a successful pipeline run. It will output the results to the console - and write it to the database. 
- - :param pipeline_run: the pipeline run object that will be recorded - :param pipeline: the pipeline that was run - :param score: the results from the execution of the pipeline - """ - if score["value"][0] == 0: - # F-SCORE was calculated wrong - quit and don't keep this run - return - - print_pipeline(pipeline.to_json_structure(), score) - d3m_db = D3MMtLDB() - - if not d3m_db.does_pipeline_exist_in_db(pipeline): - pipeline_save_response = d3m_db.save_pipeline(pipeline, save_primitives=True) - if pipeline_save_response.ok: - logger.info( - f"pipeline {pipeline.get_digest()} " - f"saved successfully, response: {pipeline_save_response.json()}" - ) - - pipeline_run_save_response = d3m_db.save_pipeline_run(pipeline_run) - if pipeline_run_save_response.ok: - logger.info( - f"pipeline run {pipeline_run['id']} " - f"saved successfully, response: {pipeline_run_save_response.json()}" - ) - - -def print_pipeline_and_problem(pipeline: dict, problem: str): - """ - A simple function to print the pipeline and problem, for debugging - - :param pipeline: the pipeline that was executed - :param problem: the dataset/problem that was used - """ - logger.info("Pipeline:") - logger.info(get_list_vertically(primitive_list_from_pipeline_object(pipeline))) - logger.info("on problem {} \n\n".format(problem)) - - -def get_primitive_combo_string(pipeline): - prim_string = "" - for p in pipeline["steps"]: - prim_string += p["primitive"]["id"] - return prim_string - - -def print_pipeline(pipeline: dict, score: float = None) -> List[str]: - """ - A helper function for printing a succesful run - - :param pipeline: the pipeline that we will print - :param score: the results of the metric used in training - :return primitive_list: a list of all the primitives used in the pipeline - """ - primitive_list = primitive_list_from_pipeline_json(pipeline) - logger.info("pipeline:\n") - logger.info(get_list_vertically(primitive_list)) - if score is not None: - logger.info("with a {} of {}".format(score["metric"][0], score["value"][0])) - return primitive_list - - -def primitive_list_from_pipeline_object(pipeline: Pipeline): - """ - A helper function to return all the primitives used in a pipeline - - :param pipeline: a pipeline object - """ - primitives = [] - for p in pipeline.steps: - primitives.append(p.to_json_structure()["primitive"]["python_path"]) - return primitives - - -def primitive_list_from_pipeline_json(pipeline_json: dict): - """ - A helper function to return all the primitives used in a pipeline - - :param pipeline_json a pipeline object in JSON form - """ - primitives = [] - for step in pipeline_json["steps"]: - primitives.append(step["primitive"]["python_path"]) - return primitives - - -def get_list_vertically(list_to_use: list, indent: bool = True): - """ - A helper function to join a list vertically. Used for debugging printing. 
- """ - final_list = ["\t" + item for item in list_to_use] if indent else list_to_use - return "\n" + "\n".join(final_list) diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py index 02aef61..391ec5b 100644 --- a/experimenter/modify_generator.py +++ b/experimenter/modify_generator.py @@ -3,13 +3,13 @@ import os import yaml -from d3m.contrib.pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_PATH as data_split_file +from d3m.contrib.pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_PATH from d3m.contrib.pipelines import SCORING_PIPELINE_PATH as scoring_file from experimenter.query import query_on_seeds from experimenter import queue, utils from experimenter.utils import download_from_database -from experimenter.evaluate_pipeline_new import evaluate_pipeline_on_problem as evaluate_pipeline +from experimenter.runtime import evaluate class ModifyGenerator: diff --git a/experimenter/problem_new.py b/experimenter/problem_new.py deleted file mode 100644 index e69de29..0000000 diff --git a/experimenter/utils.py b/experimenter/utils.py index 11f9eb5..dc10180 100644 --- a/experimenter/utils.py +++ b/experimenter/utils.py @@ -47,6 +47,23 @@ def get_scoring_pipelines(): scoring_dict[pipelines.SCORING_PIPELINE_ID] = pipelines.SCORING_PIPELINE_PATH return scoring_dict, scoring_id_list + +def get_pipeline_run_output_path(pipeline_path: str, dataset_path: str): + """ + get the output path of the pipeline run + """ + output_run_path = [] + #get the digests from the dataset and problem paths + with open(pipeline_path, 'r') as data: + pipeline = json.load(data) + output_run_path.append(pipeline['digest']) + with open(dataset_path, 'r') as data: + dataset = json.load(data) + output_run_path.append(dataset['digest']) + output_run_path.append(str(random_seed)) + output_run_path = os.path.abspath(os.path.join(config.output_run_path, '_'.join(output_run_path)+'.yaml')) + return output_run_path + def get_pipelines_from_d3m(pipeline_id: str = None, types='Data'): """Checks if data preparation pipeline is in d3m module, From 55e8ce706ef3b7d02da81dfd4eeea52d3827b6af Mon Sep 17 00:00:00 2001 From: Benson Manner Date: Wed, 24 Mar 2021 10:33:43 -0600 Subject: [PATCH 39/44] First round of suggested changes (mostly on runtime.py) --- experimenter/config.py | 4 ++-- experimenter/databases/d3m_mtl.py | 13 +++++++++++-- experimenter/modify_generator.py | 27 ++++++++++++++++++++------- experimenter/utils.py | 6 +++--- setup.py | 2 -- 5 files changed, 36 insertions(+), 16 deletions(-) diff --git a/experimenter/config.py b/experimenter/config.py index 3233730..d21f745 100644 --- a/experimenter/config.py +++ b/experimenter/config.py @@ -48,6 +48,6 @@ def validate_save(): output_run_path: str = os.path.abspath(os.path.join('/data', 'pipeline_runs')) -if (not os.path.exists(output_run_path)) +if (not os.path.exists(output_run_path)): #create the directory - os.makedirs(os.path.dirname(output_run_path), exist_ok=True) + os.makedirs(output_run_path, exist_ok=True) diff --git a/experimenter/databases/d3m_mtl.py b/experimenter/databases/d3m_mtl.py index da6c77e..b63a333 100644 --- a/experimenter/databases/d3m_mtl.py +++ b/experimenter/databases/d3m_mtl.py @@ -1,5 +1,6 @@ import logging import json +import yaml import requests from d3m.primitive_interfaces.base import PrimitiveBase @@ -105,7 +106,15 @@ def does_pipeline_exist_in_db(self, pipeline: Pipeline) -> bool: .count() ) return num_pipeline_matches > 0 - + + def save_pipeline_runs_from_path(self, pipeline_run_path: str) -> requests.Response: + responses = 
list() + with open(pipeline_run_path, 'r') as pipeline_data: + pipeline_runs = yaml.safe_load_all(pipeline_data) + for pipeline_run in pipeline_runs: + responses.append(self.save_pipeline_run(pipeline_run).content) + return responses + def save_pipeline_run(self, pipeline_run: dict) -> requests.Response: return self._save(pipeline_run, "pipeline-run") @@ -156,7 +165,7 @@ def _create_no_save_response(self) -> requests.Response: response.status_code = 200 response._content = ( b'{ "result" : "No request was made to the D3M DB API to save a record, ' - b'since the SAVE_TO_D3M environment variable is not set." }' + b'since the SAVE_TO_D3M environment variable is not set to true." }' ) return response diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py index 57cd391..ad1acab 100644 --- a/experimenter/modify_generator.py +++ b/experimenter/modify_generator.py @@ -55,17 +55,29 @@ def _get_generator(self): #save the pipeline to path and return pipeline path data_prep_pipeline, data_random_seed, data_params = data scoring_pipeline, scoring_random_seed, scoring_params = score - pipeline_path = download_from_database(pipeline, type_to_download='Pipeline') + pipeline_path = download_from_database(pipeline, type_to_download='pipelines') #TODO - catch when there is no data preparation pipeline and pass it further to evaluate #catch error returning none for file paths or preparation pipeline if (problem_path is None or dataset_doc is None or data_prep_pipeline is None): continue #check if query returned a path or an id if (os.path.exists(data_prep_pipeline) is False): - data_prep_pipeline = download_from_database(data_prep_pipeline, type_to_download='Data Preparation') + data_prep_pipeline = download_from_database(data_prep_pipeline, type_to_download='data-preparation-pipelines') if (os.path.exists(scoring_pipeline) is False): - scoring_pipeline = download_from_database(scoring_pipeline, type_to_download='Scoring') - job = queue.make_job(evaluate_pipeline, + scoring_pipeline = download_from_database(scoring_pipeline, type_to_download='scoring-pipelines') + evaluate(pipeline=pipeline_path, + problem=problem_path, + input=dataset_doc, + random_seed=seed, + data_pipeline=data_prep_pipeline, + data_random_seed=data_random_seed, + data_params=data_params, + scoring_pipeline=scoring_pipeline, + scoring_random_seed=scoring_random_seed, + scoring_params=scoring_params, + runtime_arg='evaluate') + + job = queue.make_job(evaluate, pipeline=pipeline_path, problem=problem_path, input=dataset_doc, @@ -75,7 +87,8 @@ def _get_generator(self): data_params=data_params, scoring_pipeline=scoring_pipeline, scoring_random_seed=scoring_random_seed, - scoring_params=scoring_params) + scoring_params=scoring_params, + runtime_arg='evaluate') self.num_complete += 1 yield job @@ -137,7 +150,7 @@ def _modify_random_seed(self, seed_limit, query_args): def _run_seed_test(self,args): """ Test designed for development and functionality purposes. 
- It uses and dataset and pipeline that is saved in d3m-experimenter + It uses a dataset and pipeline that is saved in the d3m-experimenter """ with open('experimenter/pipelines/bagging_classification.json', 'r') as pipeline_file: pipeline = json.load(pipeline_file) @@ -145,7 +158,7 @@ def _run_seed_test(self,args): problem_path = utils.get_problem_path('185_baseball_MIN_METADATA_problem') data_prep_seed = 0 data_prep_seed = 0 - data_prep_pipeline = data_split_file + data_prep_pipeline = K_FOLD_TABULAR_SPLIT_PIPELINE_PATH scoring_pipeline = scoring_file scoring_seed = 0 used_seeds = {2,15} diff --git a/experimenter/utils.py b/experimenter/utils.py index dc10180..27cf135 100644 --- a/experimenter/utils.py +++ b/experimenter/utils.py @@ -48,7 +48,7 @@ def get_scoring_pipelines(): return scoring_dict, scoring_id_list -def get_pipeline_run_output_path(pipeline_path: str, dataset_path: str): +def get_pipeline_run_output_path(pipeline_path: str, dataset_path: str, random_seed: int): """ get the output path of the pipeline run """ @@ -59,7 +59,7 @@ def get_pipeline_run_output_path(pipeline_path: str, dataset_path: str): output_run_path.append(pipeline['digest']) with open(dataset_path, 'r') as data: dataset = json.load(data) - output_run_path.append(dataset['digest']) + output_run_path.append(dataset['about']['digest']) output_run_path.append(str(random_seed)) output_run_path = os.path.abspath(os.path.join(config.output_run_path, '_'.join(output_run_path)+'.yaml')) return output_run_path @@ -91,7 +91,7 @@ def save_to_not_exist_file(filename:str = 'dataset_dne.txt', save_id:str = None) to_save.write(save_id+'\n') -def download_from_database(data, type_to_download: str = 'Pipeline'): +def download_from_database(data, type_to_download: str = 'pipeline'): i_d = data['id'] save_path = os.path.join('/data', type_to_download, i_d+str('.json')) #create the new directory diff --git a/setup.py b/setup.py index 156b02d..fe96b85 100644 --- a/setup.py +++ b/setup.py @@ -10,8 +10,6 @@ packages=find_packages(), python_requires='>=3.6,<4.0', install_requires=[ - 'docker>=4.4.0<4.5.0', - 'mypy==0.812', 'd3m', # TODO: add version bounds 'redis>=3.5.0<3.6.0', 'rq>=1.7.0<1.8.0', From 5777cd8b25b0dbadd4fc4f19ca024b93b163f384 Mon Sep 17 00:00:00 2001 From: Benson Manner Date: Wed, 24 Mar 2021 20:43:55 -0600 Subject: [PATCH 40/44] Cleaned query.py and added tests to test_modifier --- experimenter/config.py | 32 ++++++++ experimenter/modify_generator.py | 64 +++++++-------- experimenter/query.py | 134 +++++++++++++++---------------- experimenter/utils.py | 2 - tests/test_modifier.py | 47 ++++++++--- 5 files changed, 166 insertions(+), 113 deletions(-) diff --git a/experimenter/config.py b/experimenter/config.py index d21f745..04d6332 100644 --- a/experimenter/config.py +++ b/experimenter/config.py @@ -10,6 +10,7 @@ _ERROR_MESSAGE = 'environment variable not set: {}' +#parse the .env file datasets_dir: str = os.environ.get('DATASETS_DIR', None) def validate_datasets_dir(): if datasets_dir is None: @@ -47,7 +48,38 @@ def validate_save(): raise exceptions.ConfigError(_ERROR_MESSAGE.format('SAVE_TO_D3M')) +query_host: str = os.environ.get('QUERY_HOST', 'https://metalearning.datadrivendiscovery.org/es') +def validate_query_host(): + if query_host is None: + raise exceptions.ConfigError(_ERROR_MESSAGE.format('QUERY_HOST')) + + +query_timeout: int = int(os.environ.get('QUERY_TIMEOUT', '500')) +def validate_query_timeout(): + if query_timeout is None: + raise exceptions.ConfigError(_ERROR_MESSAGE.format('QUERY_TIMEOUT')) + + 
+#get the save paths for the experimenter from the point of view of the docker container output_run_path: str = os.path.abspath(os.path.join('/data', 'pipeline_runs')) if (not os.path.exists(output_run_path)): #create the directory os.makedirs(output_run_path, exist_ok=True) + + +pipelines_path: str = os.path.abspath(os.path.join('/data', 'pipelines')) +if (not os.path.exists(pipelines_path)): + #create the directory + os.makedirs(pipelines_path, exist_ok=True) + + +data_prep_pipelines_path: str = os.path.abspath(os.path.join('/data', 'data-preparation-pipelines')) +if (not os.path.exists(data_prep_pipelines_path)): + #create the directory + os.makedirs(data_prep_pipelines_path, exist_ok=True) + + +scoring_pipelines_path: str = os.path.abspath(os.path.join('/data', 'scoring-pipelines')) +if (not os.path.exists(scoring_pipelines_path)): + #create the directory + os.makedirs(scoring_pipelines_path, exist_ok=True) diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py index ad1acab..1255357 100644 --- a/experimenter/modify_generator.py +++ b/experimenter/modify_generator.py @@ -6,8 +6,7 @@ from d3m.contrib.pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_PATH from d3m.contrib.pipelines import SCORING_PIPELINE_PATH as scoring_file -from experimenter.query import query_on_seeds -from experimenter import queue, utils +from experimenter import queue, utils, query from experimenter.utils import download_from_database from experimenter.runtime import evaluate @@ -23,11 +22,7 @@ def __init__(self, modify_type: str='random-seed', max_jobs: int=None, args=None self.max_jobs = max_jobs self.num_complete = 0 #run the query on initializing to define the query results - if (args.test is True): - self.query_results = self._run_seed_test(self.args) - else: - self.query_results = self._query(self.args) - self.generator = self._get_generator() + self.query_results = None def __iter__(self): @@ -36,7 +31,7 @@ def __iter__(self): def __next__(self): #iterate through query results - job = next(self.generator) + job = next(self._get_generator()) if (self.max_jobs): if (self.num_complete > self.max_jobs): raise StopIteration @@ -49,6 +44,8 @@ def _get_generator(self): Can only handle cases where there is a data preparation pipeline in the pipeline run """ + if (self.query_results is None): + self.query_results = self._query(self.args) for query_result in self.query_results: #iterate through modifier results for pipeline, problem_path, dataset_doc, seed, data, score in self._modify(query_result,self.args): @@ -74,8 +71,7 @@ def _get_generator(self): data_params=data_params, scoring_pipeline=scoring_pipeline, scoring_random_seed=scoring_random_seed, - scoring_params=scoring_params, - runtime_arg='evaluate') + scoring_params=scoring_params) job = queue.make_job(evaluate, pipeline=pipeline_path, @@ -87,8 +83,7 @@ def _get_generator(self): data_params=data_params, scoring_pipeline=scoring_pipeline, scoring_random_seed=scoring_random_seed, - scoring_params=scoring_params, - runtime_arg='evaluate') + scoring_params=scoring_params) self.num_complete += 1 yield job @@ -146,30 +141,33 @@ def _modify_random_seed(self, seed_limit, query_args): yield (query_args['pipeline'], query_args['problem_path'], query_args['dataset_doc_path'], new_seed, (query_args['data_prep_pipeline'], query_args['data_prep_seed'], query_args['data_params']), (query_args['scoring_pipeline'], query_args['scoring_seed'], query_args['scoring_params'])) - - - def _run_seed_test(self,args): - """ Test designed for development and 
functionality purposes.
-            It uses a dataset and pipeline that is saved in the d3m-experimenter
-        """
-        with open('experimenter/pipelines/bagging_classification.json', 'r') as pipeline_file:
-            pipeline = json.load(pipeline_file)
-        dataset_path = utils.get_dataset_doc_path('185_baseball_MIN_METADATA_dataset')
-        problem_path = utils.get_problem_path('185_baseball_MIN_METADATA_problem')
-        data_prep_seed = 0
-        data_prep_seed = 0
-        data_prep_pipeline = K_FOLD_TABULAR_SPLIT_PIPELINE_PATH
-        scoring_pipeline = scoring_file
-        scoring_seed = 0
-        used_seeds = {2,15}
-        yield {'pipeline': pipeline, 'problem_path': problem_path, 'dataset_doc_path': dataset_path,
-               'tested_seeds': used_seeds, 'data_prep_pipeline': data_prep_pipeline,
-               'data_prep_seed': data_prep_seed, 'data_params': None,
-               'scoring_pipeline': scoring_pipeline, 'scoring_seed': scoring_seed,
-               'scoring_params': None}
 
     def _modify_swap_primitive(self, swap_pipeline, query_args):
         raise ValueError("No functionality for swapping primitives yet")
 
+
+def query_on_seeds(pipeline_id: str=None, limit: int=None, submitter: str='byu'):
+    """
+    Helper function for generating jobs to be used in the random-seed swapping
+    generator
+    """
+    arguments = {'id': pipeline_id, '_submitter': submitter}
+    pipeline_search = query.match_query(index='pipelines', arguments=arguments)
+    for pipeline in pipeline_search.scan():
+        pipeline_run_query = query.scan_pipeline_runs(pipeline.id, submitter)
+        pipeline = pipeline.to_dict()
+        for run_tuple, pipeline_run_params in pipeline_run_query.items():
+            #get the unique params from the params list
+            unique_run_params = query.combine_unique_params(pipeline_run_params)
+            #unpack values from tuple
+            query_arg_dict = query.unpack_run_tuple_args(run_tuple)
+            for params in unique_run_params:
+                query_args = query_arg_dict.copy()
+                query_args['data_params'] = params['data_params']
+                query_args['scoring_params'] = params['scoring_params']
+                query_args['tested_seeds'] = params['random_seeds']
+                query_args['pipeline'] = pipeline
+                if limit and len(query_args['tested_seeds']) > limit:
+                    continue
+                yield query_args
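
For orientation, a minimal sketch of how the module-level generator above might be consumed on its own (illustrative only; the submitter value and limit are example inputs, and the dict keys come from unpack_run_tuple_args and query_on_seeds):

    from experimenter.modify_generator import query_on_seeds

    #each yielded item is a dict of job arguments for one pipeline/problem/dataset combination
    for query_args in query_on_seeds(pipeline_id=None, limit=20, submitter='byu'):
        print(query_args['problem_path'], len(query_args['tested_seeds']))
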
diff --git a/experimenter/query.py b/experimenter/query.py
index 96618ad..b037bbb 100644
--- a/experimenter/query.py
+++ b/experimenter/query.py
@@ -2,84 +2,75 @@
 from elasticsearch_dsl import Search, Q
 from experimenter.utils import get_problem_path, get_dataset_doc_path, get_pipelines_from_d3m
 from d3m.runtime import _get_data_and_scoring_params_from_pipeline_run as _data_score_params
+from experimenter import config
 
-
-HOST = 'https://metalearning.datadrivendiscovery.org/es'
-CONNECTION = Elasticsearch(hosts=[HOST], timeout=300)
+CONNECTION = Elasticsearch(hosts=[config.query_host], timeout=config.query_timeout)
 
-def get_search_query(arguments: dict = None, connection = CONNECTION, index='pipelines'):
-    index_search = Search(using=CONNECTION, index=index)
-    if arguments['id'] is not None:
-        index_search = index_search.query('match', id=arguments['id'])
-    if arguments['submitter'] is not None:
-        index_search = index_search.query('match', _submitter=arguments['submitter'])
+def match_query(index: str, arguments: dict = None, connection = CONNECTION):
+    #remove None arguments from the dictionary
+    filtered_args = {k: v for k, v in arguments.items() if v is not None}
+    #initialize the search
+    index_search = Search(using=connection, index=index)
+    for field, argument in filtered_args.items():
+        arg_dict = dict()
+        arg_dict[field] = argument
+        index_search = index_search.query('match', **arg_dict)
     return index_search
 
-def query_on_seeds(pipeline_id: str=None, limit: int=None, submitter: str='byu'):
-    arguments = {'id': pipeline_id, 'submitter': submitter}
-    pipeline_search = get_search_query(arguments=arguments, index='pipelines')
-    for pipeline in pipeline_search.scan():
-        results = scan_pipeline_runs(pipeline.id, submitter)
-        for dataset_prob_tuple, results_dict in results.items():
-            unique_items = get_unique_results(results_dict)
-            #unpack values from tuple
-            problem_id, dataset_id, data_prep, scoring = dataset_prob_tuple
-            scoring_id, scoring_random_seed = scoring
-            data_prep_id, data_prep_seed = data_prep
-            #get preparation and scoring pipelines
-            data_prep_pipeline = get_pipeline(data_prep_id, types='Data')
-            scoring_pipeline = get_pipeline(scoring_id, types='Scoring')
-            for params in unique_items:
-                data_params = params['data_params']
-                scoring_params = params['scoring_params']
-                random_seeds = params['random_seeds']
-                if limit and len(random_seeds) > limit:
-                    continue
-                yield {'pipeline': pipeline.to_dict(), 'problem_path': get_problem_path(problem_id),
-                       'dataset_doc_path':get_dataset_doc_path(dataset_id), 'tested_seeds': random_seeds,
-                       'data_prep_pipeline': data_prep_pipeline, 'data_prep_seed': data_prep_seed,
-                       'scoring_pipeline': scoring_pipeline, 'scoring_seed': scoring_random_seed,
-                       'scoring_params': scoring_params, 'data_params': data_params}
 
+def unpack_run_tuple_args(run_tuple: tuple):
+    #unpack values from tuple
+    problem_id, dataset_id, data_prep_data, scoring_data = run_tuple
+    scoring_id, scoring_random_seed = scoring_data
+    data_prep_id, data_prep_seed = data_prep_data
+    #get preparation and scoring pipelines
+    data_prep_pipeline = get_pipeline(data_prep_id, types='Data')
+    scoring_pipeline = get_pipeline(scoring_id, types='Scoring')
+    return {'problem_path': get_problem_path(problem_id),
+            'dataset_doc_path': get_dataset_doc_path(dataset_id),
+            'data_prep_pipeline': data_prep_pipeline, 'data_prep_seed': data_prep_seed,
+            'scoring_pipeline': scoring_pipeline, 'scoring_seed': scoring_random_seed}
 
 def get_pipeline(pipeline_id: str=None, types: str='Data'):
+    """
+    gets a pipeline from the database, if it is not already
+    in the d3m module
+    """
     if (pipeline_id is None):
         return None
     pipeline = get_pipelines_from_d3m(pipeline_id, types=types)
     #get from database if not in d3m module
     if (pipeline is None):
-        arguments = {'submitter': None, 'id': data_prep_id}
-        search = get_search_query(arguments=arguments)
+        arguments = {'id': pipeline_id}
+        search = match_query(index='pipelines', arguments=arguments)
         pipeline = next(search.scan())
         pipeline = pipeline.to_dict()
     return pipeline
 
 def check_for_data_prep(pipeline_run=None):
-    """Only handles cases with an explicit data preparation pipeline
-    in the pipeline run
+    """Handles cases with an explicit data preparation pipeline
+    in the pipeline run; returns None values when the pipeline run
+    has no preparation pipeline
     """
-    data_prep = None
-    data_prep_id = None
-    data_prep_seed = None
-    data_params = None
     try:
         data_prep = pipeline_run.run.data_preparation
-    except:
-        data_prep = None
-        data_prep_seed = None
-    if (data_prep is not None):
         data_prep_seed = data_prep.random_seed
         data_prep_id = data_prep.pipeline.id
         data_prep = data_prep.to_dict()
         data_params = _data_score_params(data_prep.get('steps', []))
+    except:
+        data_prep, data_prep_seed, data_prep_id, data_params = None
     return (data_prep_id, data_prep_seed), data_params
 
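
For reference, the match_query helper above composes one Elasticsearch DSL 'match' clause per non-None argument, so a call such as the following (illustrative values only):

    match_query(index='pipelines', arguments={'id': None, '_submitter': 'byu'})

drops the None-valued key and is equivalent to:

    Search(using=CONNECTION, index='pipelines').query('match', _submitter='byu')
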
 
-def get_scoring_pipeline(pipeline_run=None):
+def get_scoring_pipeline(pipeline_run):
+    """
+    returns the scoring pipeline from the pipeline run
+    """
     scoring = pipeline_run.run.scoring
     scoring_seed = scoring.random_seed
     scoring_id = scoring.pipeline.id
@@ -89,6 +80,10 @@
 
 
 def get_list_duplicates(params_list, match_item):
+    """
+    takes in a list of params and an item to match,
+    returns a list of matching indices in the list
+    """
     start_loc = -1
     locs = []
     while True:
@@ -102,46 +97,49 @@
     return locs
 
 
-def get_unique_results(results: dict = None):
-    #function for getting unique results from the result dictionary
-    random_seeds_list = results['random_seeds']
-    params_list = results['params']
+def combine_unique_params(param_dict_list: dict = None):
+    """
+    reduces the param_dict_list into a list of unique parameters with
+    combined random seeds
+    """
+    random_seeds_list = param_dict_list['random_seeds']
+    params_list = param_dict_list['params']
     final_list = list()
-    location_dict = dict()
-    #loop through the values
+    location_dict = dict() #initialize dictionary for storing matching indices
+    #loop through the parameter values
     for it, param in enumerate(params_list):
         #get matching pairs of each value
         location_dict[it] = get_list_duplicates(params_list, param)
-    skip = set()
+    skip = set() #initialize set of locations to skip
     for loc, values in location_dict.items():
+        #only need to match once to match in other locations (add to skip)
         if loc in skip:
             continue
         random_seeds = set()
         for value in values:
+            #add matched params random seeds to same set
            random_seeds.add(random_seeds_list[value])
            skip.add(value)
        data_params, scoring_params = params_list[loc]
+        #combine matching params with aggregated set of random seeds
        final_list.append({'data_params': data_params, 'scoring_params': scoring_params, 'random_seeds': random_seeds})
    return final_list
 
 
 def scan_pipeline_runs(pipeline_id, submitter=None):
-    pipeline_run_search = Search(using=CONNECTION, index='pipeline_runs') \
-        .query('match', pipeline__id=pipeline_id) \
-        .query('match', run__phase='PRODUCE') \
-        .query('match', status__state='SUCCESS')
-    if submitter:
-        pipeline_run_search = pipeline_run_search.query('match', _submitter=submitter)
-    results = dict()
+    query_arguments = {'pipeline__id': pipeline_id, 'run__phase': 'PRODUCE',
+                       'status__state': 'SUCCESS', '_submitter': submitter}
+    pipeline_run_search = match_query(index='pipeline_runs', arguments=query_arguments)
+    query_results = dict()
     for pipeline_run in pipeline_run_search.scan():
         data_prep, data_params = check_for_data_prep(pipeline_run=pipeline_run)
         scoring, scoring_params = get_scoring_pipeline(pipeline_run)
         for dataset in pipeline_run.datasets:
-            dataset_prob_tuple = (pipeline_run.problem.id, dataset.id, data_prep, scoring)
-            results[dataset_prob_tuple] = results.get(dataset_prob_tuple, dict())
-            results[dataset_prob_tuple]['random_seeds'] = results[dataset_prob_tuple].get('random_seed', list())
-            results[dataset_prob_tuple]['params'] = results[dataset_prob_tuple].get('params', list())
-            results[dataset_prob_tuple]['random_seeds'].append(pipeline_run.random_seed)
-            results[dataset_prob_tuple]['params'].append((data_params, scoring_params))
-    return results
+            run_tuple = (pipeline_run.problem.id, dataset.id, data_prep, scoring)
+            query_results[run_tuple] = query_results.get(run_tuple, dict())
+            query_results[run_tuple]['random_seeds'] = query_results[run_tuple].get('random_seeds', list())
+            query_results[run_tuple]['params'] = query_results[run_tuple].get('params', list())
+            query_results[run_tuple]['random_seeds'].append(pipeline_run.random_seed)
+            query_results[run_tuple]['params'].append((data_params, scoring_params))
+    return query_results
diff --git a/experimenter/utils.py b/experimenter/utils.py
index 27cf135..bec55ac 100644
--- a/experimenter/utils.py
+++ b/experimenter/utils.py
@@ -94,8 +94,6 @@ def save_to_not_exist_file(filename:str = 'dataset_dne.txt', save_id:str = None)
 def download_from_database(data, type_to_download: str = 'pipeline'):
     i_d = data['id']
     save_path = os.path.join('/data', type_to_download, i_d+str('.json'))
-    #create the new directory
-    os.makedirs(os.path.dirname(save_path),exist_ok=True)
     #save the file to the directory
     with open(save_path, 'w') as to_save:
         json.dump(data, to_save, indent=4)
diff --git a/tests/test_modifier.py b/tests/test_modifier.py
index 3b1fb9a..8c11571 100644
--- a/tests/test_modifier.py
+++ b/tests/test_modifier.py
@@ -1,14 +1,17 @@
 import unittest
 from experimenter import modify_generator, queue, exceptions, utils
-from query import query_on_seeds
+from d3m.contrib.pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_PATH
+from d3m.contrib.pipelines import SCORING_PIPELINE_PATH as scoring_file
 
-class ModifierTestCase(unittest.TestCase):
+class GeneratorModifierTestCase(unittest.TestCase):
 
-    def test_seed_modifier(self):
+
+    def test_random_seed_modifier_job_count(self):
         #initialize the modifier with random-seed and a given max jobs
-        args = {'seed_limit':25, 'submitter':None, 'pipeline_id':None}
-        num_test = 21
+        args = {'seed_limit':35, 'submitter':None, 'pipeline_id':None}
+        num_test = 5
         modifier = modify_generator.ModifyGenerator('random-seed', num_test, {'seed_limit':25})
+        modifier.query_args = get_seed_test_args
         #start the counter to make sure there are the right amount of jobs
         counter = 0
         seed_old = 12.1
@@ -19,14 +22,38 @@
             self.assertNotEqual(seed_old, seed_new)
             seed_old = seed_new
         self.assertEqual(counter,num_test)
+
 
-    def test_query_seeds(self):
+    def test_query_random_seeds_set_size(self):
         args = {'seed_limit':25, 'submitter':'byu', 'pipeline_id':None}
-        query_results = query_on_seeds(args.pipeline_id, args.seed_limit, args.submitter)
+        query_results = modify_generator.query_on_seeds(args['pipeline_id'], args['seed_limit'], args['submitter'])
         #test 10 query results
         for i in range(10):
-            _,_,seed_list = next(query_results)
-            self.assertTrue(len(seed_list) < seed_limit)
-
+            query = next(query_results)
+            self.assertTrue(len(query['tested_seeds']) < seed_limit)
+
+
+    def get_seed_test_args(self,args):
+        """ returns args for testing modify generator random-seed
+            functionality purposes. It uses a dataset and pipeline
+            that is saved in the d3m-experimenter
+        """
+        with open('experimenter/pipelines/bagging_classification.json', 'r') as pipeline_file:
+            pipeline = json.load(pipeline_file)
+        dataset_path = utils.get_dataset_doc_path('185_baseball_MIN_METADATA_dataset')
+        problem_path = utils.get_problem_path('185_baseball_MIN_METADATA_problem')
+        data_prep_seed = 0
+        data_prep_seed = 0
+        data_prep_pipeline = K_FOLD_TABULAR_SPLIT_PIPELINE_PATH
+        scoring_pipeline = scoring_file
+        scoring_seed = 0
+        used_seeds = {2,15}
+        yield {'pipeline': pipeline, 'problem_path': problem_path, 'dataset_doc_path': dataset_path,
+               'tested_seeds': used_seeds, 'data_prep_pipeline': data_prep_pipeline,
+               'data_prep_seed': data_prep_seed, 'data_params': None,
+               'scoring_pipeline': scoring_pipeline, 'scoring_seed': scoring_seed,
+               'scoring_params': None}
+
+
 if __name__ == '__main__':
     unittest.main()
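
A worked sketch of the aggregation the two query helpers above perform (illustrative, made-up values; 'number_of_folds' is only an example parameter name): scan_pipeline_runs groups runs by (problem, dataset, prep, scoring) and collects parallel lists of seeds and params, which combine_unique_params then merges:

    runs = {'random_seeds': [11, 42, 7],
            'params': [(None, None), ({'number_of_folds': '5'}, None), (None, None)]}
    #combine_unique_params(runs) merges duplicate param pairs and unions their seeds:
    #[{'data_params': None, 'scoring_params': None, 'random_seeds': {11, 7}},
    # {'data_params': {'number_of_folds': '5'}, 'scoring_params': None, 'random_seeds': {42}}]
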
From a235dfbd3c96a61634aae1065402bcd98079a169 Mon Sep 17 00:00:00 2001
From: Benson Manner
Date: Fri, 26 Mar 2021 09:37:31 -0600
Subject: [PATCH 41/44] Test Updates and query cleaning

---
 experimenter/query.py  |  7 ++++---
 tests/test_modifier.py | 14 ++++++++++++--
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/experimenter/query.py b/experimenter/query.py
index b037bbb..c0e477d 100644
--- a/experimenter/query.py
+++ b/experimenter/query.py
@@ -2,7 +2,7 @@
 from elasticsearch_dsl import Search, Q
 from experimenter.utils import get_problem_path, get_dataset_doc_path, get_pipelines_from_d3m
 from d3m.runtime import _get_data_and_scoring_params_from_pipeline_run as _data_score_params
-from experimenter import config
+from experimenter import config, exceptions
 
 CONNECTION = Elasticsearch(hosts=[config.query_host], timeout=config.query_timeout)
 
@@ -61,8 +61,9 @@
         data_prep_id = data_prep.pipeline.id
         data_prep = data_prep.to_dict()
         data_params = _data_score_params(data_prep.get('steps', []))
-    except:
-        data_prep, data_prep_seed, data_prep_id, data_params = None
+    except KeyError:
+        #no data preparation pipeline in pipeline run, return None values
+        data_prep = data_prep_seed = data_prep_id = data_params = None
     return (data_prep_id, data_prep_seed), data_params
 
diff --git a/tests/test_modifier.py b/tests/test_modifier.py
index 8c11571..87761b4 100644
--- a/tests/test_modifier.py
+++ b/tests/test_modifier.py
@@ -1,5 +1,6 @@
 import unittest
 from experimenter import modify_generator, queue, exceptions, utils
+from experimenter.databases.d3m_mtl import D3MMtLDB
 from d3m.contrib.pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_PATH
 from d3m.contrib.pipelines import SCORING_PIPELINE_PATH as scoring_file
 
@@ -11,7 +12,7 @@
         args = {'seed_limit':35, 'submitter':None, 'pipeline_id':None}
         num_test = 5
         modifier = modify_generator.ModifyGenerator('random-seed', num_test, {'seed_limit':25})
-        modifier.query_args = get_seed_test_args
+        modifier.query_args = self.get_seed_test_args
         #start the counter to make sure there are the right amount of jobs
         counter = 0
         seed_old = 12.1
@@ -31,8 +32,17 @@
         for i in range(10):
             query = next(query_results)
             self.assertTrue(len(query['tested_seeds']) < seed_limit)
+
 
-
+    def test_d3m_interface_init(self):
+        init_fail = False
+        try:
+            d3m_db = D3MMtLDB()
+        except:
+            init_fail = True
+        self.assertFalse(init, "D3M Interface Failed")
+
+
     def get_seed_test_args(self,args):
         """ returns args for testing modify generator random-seed
             functionality purposes.
It uses a dataset and pipeline From 635b1c00a25be6d150cf36efa9f5f95dce2ceaa8 Mon Sep 17 00:00:00 2001 From: Benson Manner Date: Fri, 26 Mar 2021 10:01:45 -0600 Subject: [PATCH 42/44] docker compose update and finish suggested changes --- docker-compose.yml | 4 ---- experimenter/modify_generator.py | 13 +------------ 2 files changed, 1 insertion(+), 16 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 61bbae1..0ff1bda 100755 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -23,10 +23,6 @@ services: - type: bind source: '${DATA_DIR}' target: /data - - type: bind - source: '${EXPERIMENTER_DIR}' - target: /d3m-experimenter - read_only: true command: 'rq worker --url redis://${REDIS_HOST} ${RQ_QUEUES}' networks: - default diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py index 1255357..3e912bc 100644 --- a/experimenter/modify_generator.py +++ b/experimenter/modify_generator.py @@ -62,23 +62,12 @@ def _get_generator(self): data_prep_pipeline = download_from_database(data_prep_pipeline, type_to_download='data-preparation-pipelines') if (os.path.exists(scoring_pipeline) is False): scoring_pipeline = download_from_database(scoring_pipeline, type_to_download='scoring-pipelines') - evaluate(pipeline=pipeline_path, - problem=problem_path, - input=dataset_doc, - random_seed=seed, - data_pipeline=data_prep_pipeline, - data_random_seed=data_random_seed, - data_params=data_params, - scoring_pipeline=scoring_pipeline, - scoring_random_seed=scoring_random_seed, - scoring_params=scoring_params) - job = queue.make_job(evaluate, pipeline=pipeline_path, problem=problem_path, input=dataset_doc, random_seed=seed, - data_pipeline_path=data_prep_pipeline, + data_pipeline=data_prep_pipeline, data_random_seed=data_random_seed, data_params=data_params, scoring_pipeline=scoring_pipeline, From 1e5a9469d96a805bfed451fcb9de394a7df9e990 Mon Sep 17 00:00:00 2001 From: Benson Manner Date: Fri, 26 Mar 2021 16:25:06 -0600 Subject: [PATCH 43/44] Fix tests and generator part of the modify generator --- experimenter/cli.py | 6 ++++- experimenter/modify_generator.py | 37 ++++++++++++++++++----------- tests/test_modifier.py | 40 ++++++++++++++++++-------------- 3 files changed, 50 insertions(+), 33 deletions(-) diff --git a/experimenter/cli.py b/experimenter/cli.py index fb1d996..e87ff94 100644 --- a/experimenter/cli.py +++ b/experimenter/cli.py @@ -173,7 +173,11 @@ def configure_modify_parser(parser: argparse.ArgumentParser) -> None: def modify_handler(arguments: argparse.Namespace, parser: argparse.ArgumentParser) -> None: modify_type = arguments.modify_type - modify_generator = ModifyGenerator(modify_type, arguments.max_jobs, arguments) + modify_generator = ModifyGenerator(modify_type = modify_type, + max_jobs = arguments.max_jobs, + seed_limit = arguments.seed_limit, + submitter = arguments.submitter, + pipeline_id = arguments.pipeline_id) #now run the enqueuer part queue.enqueue_jobs(jobs=modify_generator, job_timeout=arguments.job_timeout) diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py index 3e912bc..211ac2f 100644 --- a/experimenter/modify_generator.py +++ b/experimenter/modify_generator.py @@ -15,14 +15,18 @@ class ModifyGenerator: """ Generator to be used for creating modified pipelines based on existing pipelines in the database """ - def __init__(self, modify_type: str='random-seed', max_jobs: int=None, args=None): - self.args = args + def __init__(self, modify_type: str='random-seed', + max_jobs: int=None, seed_limit = None, + 
submitter = None, pipeline_id = None): #intialize commonly used variables self.modifier_type = modify_type self.max_jobs = max_jobs + self.seed_limit = seed_limit + self.submitter = submitter + self.pipeline_id = pipeline_id self.num_complete = 0 #run the query on initializing to define the query results - self.query_results = None + self._set_query_results() def __iter__(self): @@ -31,24 +35,29 @@ def __iter__(self): def __next__(self): #iterate through query results - job = next(self._get_generator()) + job = next(self.generator) if (self.max_jobs): if (self.num_complete > self.max_jobs): raise StopIteration return job + def _set_query_results(self, query_results=None): + self.query_results = query_results + if query_results is None: + self.query_results = self._query() + self.generator = self._get_generator() + + def _get_generator(self): """ Main generator to be used of ModifyGenerator class Can only handle cases where there is a data preparation pipeline in the pipeline run """ - if (self.query_results is None): - self.query_results = self._query(self.args) for query_result in self.query_results: #iterate through modifier results - for pipeline, problem_path, dataset_doc, seed, data, score in self._modify(query_result,self.args): + for pipeline, problem_path, dataset_doc, seed, data, score in self._modify(query_result): #save the pipeline to path and return pipeline path data_prep_pipeline, data_random_seed, data_params = data scoring_pipeline, scoring_random_seed, scoring_params = score @@ -77,24 +86,24 @@ def _get_generator(self): yield job - def _query(self, args): + def _query(self): """method for querying database according to pipeline modification type """ if (self.modifier_type=='random-seed'): - return query_on_seeds(args.pipeline_id, args.seed_limit, args.submitter) + return query_on_seeds(self.pipeline_id, self.seed_limit, self.submitter) if (self.modifier_type=='swap-primitive'): - return query_on_primitive(args.primitive_id, args.limit_indeces) + return query_on_primitive(self.primitive_id, self.limit_indeces) else: raise ValueError("This type of modification is not yet an option") - def _modify(self, query_args: dict, args): + def _modify(self, query_args): """Handler for different types of pipeline modification tasks """ if self.modifier_type=='random-seed': - return self._modify_random_seed(args.seed_limit, query_args) + return self._modify_random_seed(self.seed_limit, query_args) if self.modifier_type=='swap-primitive': - return self._modify_swap_primitive(args.swap_primitive_id, query_args) + return self._modify_swap_primitive(self.swap_primitive_id, query_args) else: raise ValueError("This type of modification is not yet an option") @@ -120,7 +129,7 @@ def _modify_random_seed(self, seed_limit, query_args): used_seeds = query_args['tested_seeds'] num_run = len(used_seeds) #run until the right number of seeds have been run - while (num_run < seed_limit): + while (num_run < self.seed_limit): new_seed = randint(1,100000) if (new_seed in used_seeds): continue diff --git a/tests/test_modifier.py b/tests/test_modifier.py index 87761b4..4206c0e 100644 --- a/tests/test_modifier.py +++ b/tests/test_modifier.py @@ -1,38 +1,42 @@ import unittest +import json from experimenter import modify_generator, queue, exceptions, utils from experimenter.databases.d3m_mtl import D3MMtLDB from d3m.contrib.pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_PATH from d3m.contrib.pipelines import SCORING_PIPELINE_PATH as scoring_file + class GeneratorModifierTestCase(unittest.TestCase): def 
test_random_seed_modifier_job_count(self): #initialize the modifier with random-seed and a given max jobs - args = {'seed_limit':35, 'submitter':None, 'pipeline_id':None} num_test = 5 - modifier = modify_generator.ModifyGenerator('random-seed', num_test, {'seed_limit':25}) - modifier.query_args = self.get_seed_test_args - #start the counter to make sure there are the right amount of jobs - counter = 0 - seed_old = 12.1 - #begin the test if number of jobs is correct - for job in modifier: - counter += 1 - _,_,seed_new = job - self.assertNotEqual(seed_old, seed_new) - seed_old = seed_new - self.assertEqual(counter,num_test) + seed_limit = 25 + modifier = modify_generator.ModifyGenerator(modify_type='random-seed', + seed_limit=seed_limit, + max_jobs=num_test) + modifier._set_query_results(self.get_seed_test_args()) + #begin the test if number of generated seed jobs is correct + self.assertEqual(len(list(modifier._modify_random_seed(seed_limit, next(modifier.query_results)))), seed_limit-2) + #reinitialize to test if total job count is right + modifier = modify_generator.ModifyGenerator(modify_type = 'random-seed', + max_jobs = num_test, + seed_limit = seed_limit) + modifier.query_results = self.get_seed_test_args() + self.assertEqual(modifier.max_jobs, num_test) + self.assertEqual(len(list(modifier)), modifier.max_jobs) def test_query_random_seeds_set_size(self): args = {'seed_limit':25, 'submitter':'byu', 'pipeline_id':None} + seed_limit = 25 query_results = modify_generator.query_on_seeds(args['pipeline_id'], args['seed_limit'], args['submitter']) #test 10 query results for i in range(10): query = next(query_results) - self.assertTrue(len(query['tested_seeds']) < seed_limit) - + self.assertTrue(len(query['tested_seeds']) < seed_limit) + def test_d3m_interface_init(self): init_fail = False @@ -40,10 +44,10 @@ def test_d3m_interface_init(self): d3m_db = D3MMtLDB() except: init_fail = True - self.assertFalse(init, "D3M Interface Failed") + self.assertFalse(init_fail, "D3M Interface Failed") - def get_seed_test_args(self,args): + def get_seed_test_args(self): """ returns args for testing modify generator random-seed functionality purposes. It uses a dataset and pipeline that is saved in the d3m-experimenter @@ -62,7 +66,7 @@ def get_seed_test_args(self,args): 'tested_seeds': used_seeds, 'data_prep_pipeline': data_prep_pipeline, 'data_prep_seed': data_prep_seed, 'data_params': None, 'scoring_pipeline': scoring_pipeline, 'scoring_seed': scoring_seed, - 'scoring_params': None} + 'scoring_params': None} if __name__ == '__main__': From 3714ac190f1157c337322444ae779bade9fd059f Mon Sep 17 00:00:00 2001 From: Benson Manner Date: Mon, 29 Mar 2021 15:25:34 -0600 Subject: [PATCH 44/44] Add runtime and old execute pipeline files --- experimenter/execute_pipeline_old.py | 218 +++++++++++++++++++++++++++ experimenter/runtime.py | 96 ++++++++++++ 2 files changed, 314 insertions(+) create mode 100644 experimenter/execute_pipeline_old.py create mode 100644 experimenter/runtime.py diff --git a/experimenter/execute_pipeline_old.py b/experimenter/execute_pipeline_old.py new file mode 100644 index 0000000..c1bb34b --- /dev/null +++ b/experimenter/execute_pipeline_old.py @@ -0,0 +1,218 @@ +""" +FILE INFORMATION: +This file needs to be a stand alone file so that it can be imported and used by the +experimenter_driver.py. This is because RQ only accepts a function that is imported +and not defined in __main__. 
These functions are what is needed to execute a pipeline +on a problem and can be used by an individual machine, or used in a RQ job queue. +""" +import logging +from typing import List + +from d3m.metadata.pipeline import Pipeline + +from experimenter.run_fit_pipeline import RunFitPipeline +from experimenter.run_pipeline import RunPipeline +from experimenter.databases.aml_mtl import PipelineDB +from experimenter.databases.d3m_mtl import D3MMtLDB +from experimenter.problem import ProblemReference +from experimenter.config import SAVE_TO_D3M +from experimenter.constants import METRICS_BY_PROBLEM_TYPE + + +logger = logging.getLogger(__name__) + + +def execute_pipeline_on_problem( + pipe: Pipeline, + problem: ProblemReference, + volumes_dir: str, + all_metrics: bool = True, +): + """ + The main function to execute a pipeline. Called in `experimenter_driver.py`. + This function will check if the pipeline and dataset has been executed before, + run the pipeline, and record the results. + + :param pipe: the pipeline object that will be executed + :param problem: a reference to the problem to run the pipeline on. + :param volumes_dir: a string containing the path to the volumes directory + :param all_metrics: if `True`, the pipeline will be scored against all metrics + registered for `problem`'s problem type. If `False`, it will only be scored + against the metrics listed in `problem`'s description. + """ + # Validate args + if all_metrics and problem.problem_type not in METRICS_BY_PROBLEM_TYPE: + raise ValueError( + f"cannot compute all metrics for problem {problem.name}, " + "it does not have a supported problem type." + ) + + # If the experimenter is configured to save documents to the D3M database, + # we only want to execute and save this pipeline run if it doesn't already + # exist in the D3M database. + if SAVE_TO_D3M and D3MMtLDB().has_pipeline_been_run_on_problem(pipe, problem): + logger.info("Pipeline has already been run on this dataset, SKIPPING.") + return + + metric_names = ( + METRICS_BY_PROBLEM_TYPE[problem.problem_type] if all_metrics else None + ) + + # Attempt to run the pipeline + logger.info("\n Running pipeline on problem {}".format(problem.name)) + run_pipeline = RunPipeline(volumes_dir, problem) + try: + scores, (fit_result, produce_result) = run_pipeline.run( + pipeline=pipe, metric_names=metric_names + ) + except Exception as e: + logger.exception("pipeline was not successfully run") + print_pipeline(pipe.to_json_structure()) + raise e + + score = scores[0] + # put in the fit pipeline run + handle_successful_pipeline_run( + fit_result.pipeline_run.to_json_structure(), pipe, score + ) + # put in the produce pipeline run + handle_successful_pipeline_run( + produce_result.pipeline_run.to_json_structure(), pipe, score + ) + + +def execute_metafeatures_pipeline_on_problem( + pipe: Pipeline, problem: ProblemReference, volumes_dir: str +): + """ + The main function to execute a `metafeatures` pipeline. Differs from + `execute_pipeline_on_problem` by only handling metafeatures, and by + computing them on every subset of the problem e.g. TRAIN, TEST, SCORE, etc. + Called in `experimenter_driver.py`. This function will run the pipeline, + and record the results. + + :param pipe: the pipeline object that will be executed + :param problem: a reference to the problem to run the pipeline on. 
+    :param volumes_dir: a string containing the path to the volumes directory
+    """
+    mongo_db = PipelineDB()
+
+    for subset in problem.valid_subsets:
+        if problem.has_subset(subset):
+            problem.subset = subset
+            logger.info(
+                f"computing metafeatures for problem {problem.name} ({problem.subset} subset)..."
+            )
+            # Compute and store the metafeatures for this subset of the problem.
+            run_pipeline = RunFitPipeline(volumes_dir, problem)
+            try:
+                results = run_pipeline.run(pipeline=pipe)
+            except Exception as e:
+                logger.exception("pipeline was not successfully run")
+                print_pipeline(pipe._to_json_structure())
+                raise e
+
+            logger.info(results)
+            fit_result = results
+            mongo_db.add_to_metafeatures(fit_result._to_json_structure())
+
+
+def handle_successful_pipeline_run(
+    pipeline_run: dict, pipeline: Pipeline, score: float
+):
+    """
+    Called after a successful pipeline run. It will output the results to the console
+    and write them to the database.
+
+    :param pipeline_run: the pipeline run object that will be recorded
+    :param pipeline: the pipeline that was run
+    :param score: the results from the execution of the pipeline
+    """
+    if score["value"][0] == 0:
+        # F-SCORE was calculated wrong - quit and don't keep this run
+        return
+
+    print_pipeline(pipeline.to_json_structure(), score)
+    d3m_db = D3MMtLDB()
+
+    if not d3m_db.does_pipeline_exist_in_db(pipeline):
+        pipeline_save_response = d3m_db.save_pipeline(pipeline, save_primitives=True)
+        if pipeline_save_response.ok:
+            logger.info(
+                f"pipeline {pipeline.get_digest()} "
+                f"saved successfully, response: {pipeline_save_response.json()}"
+            )
+
+    pipeline_run_save_response = d3m_db.save_pipeline_run(pipeline_run)
+    if pipeline_run_save_response.ok:
+        logger.info(
+            f"pipeline run {pipeline_run['id']} "
+            f"saved successfully, response: {pipeline_run_save_response.json()}"
+        )
+
+
+def print_pipeline_and_problem(pipeline: dict, problem: str):
+    """
+    A simple function to print the pipeline and problem, for debugging
+
+    :param pipeline: the pipeline that was executed
+    :param problem: the dataset/problem that was used
+    """
+    logger.info("Pipeline:")
+    logger.info(get_list_vertically(primitive_list_from_pipeline_object(pipeline)))
+    logger.info("on problem {} \n\n".format(problem))
+
+
+def get_primitive_combo_string(pipeline):
+    prim_string = ""
+    for p in pipeline["steps"]:
+        prim_string += p["primitive"]["id"]
+    return prim_string
+
+
+def print_pipeline(pipeline: dict, score: float = None) -> List[str]:
+    """
+    A helper function for printing a successful run
+
+    :param pipeline: the pipeline that we will print
+    :param score: the results of the metric used in training
+    :return primitive_list: a list of all the primitives used in the pipeline
+    """
+    primitive_list = primitive_list_from_pipeline_json(pipeline)
+    logger.info("pipeline:\n")
+    logger.info(get_list_vertically(primitive_list))
+    if score is not None:
+        logger.info("with a {} of {}".format(score["metric"][0], score["value"][0]))
+    return primitive_list
+
+
+def primitive_list_from_pipeline_object(pipeline: Pipeline):
+    """
+    A helper function to return all the primitives used in a pipeline
+
+    :param pipeline: a pipeline object
+    """
+    primitives = []
+    for p in pipeline.steps:
+        primitives.append(p.to_json_structure()["primitive"]["python_path"])
+    return primitives
+
+
+def primitive_list_from_pipeline_json(pipeline_json: dict):
+    """
+    A helper function to return all the primitives used in a pipeline
+
+    :param pipeline_json: a pipeline object in JSON form
+    """
+    primitives = []
+def primitive_list_from_pipeline_json(pipeline_json: dict) -> List[str]:
+    """
+    A helper function to return all the primitives used in a pipeline
+
+    :param pipeline_json: a pipeline in JSON (dict) form
+    :return: a list of primitive python paths
+    """
+    primitives = []
+    for step in pipeline_json["steps"]:
+        primitives.append(step["primitive"]["python_path"])
+    return primitives
+
+
+def get_list_vertically(list_to_use: list, indent: bool = True) -> str:
+    """
+    A helper function to join a list vertically, one item per line.
+    Used when printing debug output.
+    """
+    final_list = ["\t" + item for item in list_to_use] if indent else list_to_use
+    return "\n" + "\n".join(final_list)
diff --git a/experimenter/runtime.py b/experimenter/runtime.py
new file mode 100644
index 0000000..648bf29
--- /dev/null
+++ b/experimenter/runtime.py
@@ -0,0 +1,96 @@
+import json
+import yaml
+import os
+
+from typing import Any, List, Tuple
+
+from d3m import cli as d3m_cli
+from d3m.contrib.pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_PATH as k_fold_split_path
+
+from experimenter import config, utils, exceptions
+from experimenter.databases.d3m_mtl import D3MMtLDB
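+
+
+# Example (illustrative only): a typical call to evaluate(), with hypothetical
+# local paths. Note that every path argument, including scoring_pipeline, must
+# point to an existing file or the checks below raise InvalidArgumentValueError:
+#
+#     evaluate(
+#         pipeline='path/to/pipeline.json',
+#         problem='path/to/problemDoc.json',
+#         input='path/to/datasetDoc.json',
+#         scoring_pipeline='path/to/scoring_pipeline.yml',
+#     )
+
+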
+def evaluate(pipeline: str = None,
+             problem: str = None,
+             input: str = None,
+             random_seed: int = 0,
+             data_pipeline: str = k_fold_split_path,
+             data_random_seed: int = 0,
+             data_params: dict = None,
+             scoring_pipeline: str = None,
+             scoring_params: dict = None,
+             scoring_random_seed: int = 0):
+    """
+    Evaluate a pipeline on a problem using d3m's runtime CLI.
+
+    Wrapper function that executes the 'evaluate' command of d3m's runtime
+    CLI; arguments mirror the arguments of the CLI. Only handles cases with
+    a data preparation pipeline in the pipeline run.
+
+    Parameters
+    ----------
+    pipeline : path_like str
+        path to a pipeline doc or a pipeline ID
+    problem : path_like str
+        path to a problem doc
+    input : path_like str
+        path to the full input data
+    random_seed : int
+        random seed to use for the pipeline run
+    data_pipeline : path_like str
+        path to the data preparation pipeline
+    data_random_seed : int
+        random seed to use in data preparation
+    data_params : dict
+        parameters for the data preparation pipeline
+    scoring_pipeline : path_like str
+        path to the scoring pipeline
+    scoring_params : dict
+        parameters for the scoring pipeline
+    scoring_random_seed : int
+        random seed to use in scoring
+
+    Returns
+    -------
+    None
+
+    Raises
+    ------
+    InvalidArgumentValueError
+        when a parameter value is invalid
+    """
+    if not os.path.isfile(pipeline):
+        raise exceptions.InvalidArgumentValueError("'pipeline' param not a file path")
+
+    if not os.path.isfile(problem):
+        raise exceptions.InvalidArgumentValueError("'problem' param not a file path")
+
+    if not os.path.isfile(input):
+        raise exceptions.InvalidArgumentValueError("'input' param not a file path")
+
+    if not os.path.isfile(data_pipeline):
+        raise exceptions.InvalidArgumentValueError("'data_pipeline' param not a file path")
+
+    if not os.path.isfile(scoring_pipeline):
+        raise exceptions.InvalidArgumentValueError("'scoring_pipeline' param not a file path")
+
+    output_run = utils.get_pipeline_run_output_path(pipeline, input, random_seed)
+    # build the runtime arguments for the d3m cli
+    args = ['d3m', 'runtime', '--random-seed', str(random_seed), 'evaluate',
+            '--pipeline', pipeline, '--problem', problem, '--input', input,
+            '--output-run', output_run, '--data-pipeline', data_pipeline,
+            '--data-random-seed', str(data_random_seed),
+            '--scoring-pipeline', scoring_pipeline,
+            '--scoring-random-seed', str(scoring_random_seed)]
+    # add the data preparation parameters to the cli arguments
+    if data_params is not None:
+        for name, value in data_params.items():
+            args.extend(('--data-param', name, value))
+    # add the scoring parameters to the cli arguments
+    if scoring_params is not None:
+        for name, value in scoring_params.items():
+            args.extend(('--scoring-param', name, value))
+    d3m_cli.main(args)
+    # save the pipeline runs to D3M if the SAVE_TO_D3M environment variable is set
+    responses = D3MMtLDB().save_pipeline_runs_from_path(output_run)
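+
+
+# Example (illustrative only): if a caller needs to confirm that each run was
+# persisted, and assuming save_pipeline_runs_from_path returns requests-style
+# response objects (as the .ok checks elsewhere in this patch suggest), the
+# results could be inspected like:
+#
+#     for response in responses:
+#         if not response.ok:
+#             print('failed to save pipeline run:', response.text)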