From 99c7b5bbf5a519dd61fec475c4cbe81f0175dff9 Mon Sep 17 00:00:00 2001
From: Benson Manner
Date: Mon, 18 Jan 2021 10:38:13 -0700
Subject: [PATCH 01/44] Add query and generator class

---
 experimenter/modify_generator.py | 52 +++++++++++++++++++
 experimenter/query.py            | 85 ++++++++++++++++++++++++++++++++
 2 files changed, 137 insertions(+)
 create mode 100644 experimenter/modify_generator.py
 create mode 100644 experimenter/query.py

diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py
new file mode 100644
index 0000000..fc4c49a
--- /dev/null
+++ b/experimenter/modify_generator.py
@@ -0,0 +1,52 @@
+from query import query_on_seeds, query_on_primitive
+
+
+class ModifyGenerator:
+    """ Generator to be used for creating modified pipelines based on existing
+        pipelines in the database
+    """
+    def __init__(self, *args):
+        self.args = args
+        #initialize commonly used variables
+        self.modifier_type = args.modifier_type
+        self.num_pipelines_to_run = args.num_pipelines_to_run
+        self.num_complete = 0
+        #run the query on initializing to define the query results
+        self.query_results = self._query(self.modifier_type, self.args)
+
+    def __next__(self):
+        #iterate through query results
+        for query_result in self.query_results:
+            pipeline, dataset, pipeline_run = query_result
+            #iterate through modifier results
+            for new_pipeline, dataset in self._modify(self.args):
+                self.num_complete += 1
+                #check to run until the generator stops iterating (if no input for num_pipelines_to_run)
+                if (self.num_pipelines_to_run):
+                    if (self.num_complete >= self.num_pipelines_to_run):
+                        raise StopIteration
+                return (new_pipeline, new_dataset)
+        raise StopIteration
+
+    def __iter__(self):
+        return self
+
+    def _query(self, *args):
+        if (self.modifier_type=='random-seed'):
+            return query_on_seeds(args.pipeline_id, args.seed_limit, args.submitter):
+        if (self.modifier_type=='swap-primitive'):
+            return query_on_primitive(args.primitive_id, args.limit_indeces)
+        else:
+            raise ValueError("This type of modification is not yet an option")
+
+    def _modify(self,*args):
+        if self.modifier_type=='random-seed':
+            self._modify_random_seed(args.random_seed, args.seed_limit)
+        if self.modifier_type=='swap-primitive':
+            self._modify_swap_primitive(args.pipeline, args.primitive_loc, args.new_primitive)
+        else:
+            raise ValueError("This type of modification is not yet an option")
+
+    def _modify_random_seed(self, pipeline, dataset, args):
+        ##======== Create the random seed modifier
+        #yield random seeds and the pipeline/dataset to run on
diff --git a/experimenter/query.py b/experimenter/query.py
new file mode 100644
index 0000000..1e70e8e
--- /dev/null
+++ b/experimenter/query.py
@@ -0,0 +1,85 @@
+from elasticsearch import Elasticsearch
+from elasticsearch_dsl import Search, Q
+from tqdm import tqdm
+from experimenter.utils import get_problem_parent_dir, build_problem_reference
+
+HOST = 'https://metalearning.datadrivendiscovery.org/es'
+CONNECTION = Elasticsearch(hosts=[HOST], timeout=300)
+
+def query_on_primitive(primitive_id: str, limit_indexes=False):
+    '''Queries the metalearning database for pipelines that use the specified primitive.
+    Uses the Elasticsearch endpoint documented on D3M's website (see
+    https://metalearning.datadrivendiscovery.org for more info). Finds all
+    pipelines containing the primitive given by the primitive_id argument,
+    determines the index(es) of that primitive in each matching pipeline, and
+    gets the datasets that were used in pipeline runs.
+
+    Arguments
+    ---------
+    primitive_id : str
+        A primitive's unique id.
+    limit_indexes : 'first', 'last', or False (default)
+        Limits which index of the primitive is returned for each pipeline match.
+        Use 'first' to get the index of the first match in a pipeline's steps,
+        'last' to get the index of the last match, or False (default) to get a
+        list of the indexes of all matches.
+
+    Yields
+    -------
+    Tuples, each containing (in this order):
+        1. a matching pipeline (as a dict)
+        2. a reference to the problem used in the pipeline's runs
+        3. the index(es) of the desired primitive in the given pipeline's steps
+        4. the set of random seeds used in pipeline runs.
+    '''
+
+    if limit_indexes not in { 'first', 'last', False }:
+        raise ValueError(f'Invalid value "{limit_indexes}" for arg limit_indexes')
+
+    match_query = Q('match', steps__primitive__id=primitive_id)
+    nested_query = Q('nested', path='steps', query=match_query)
+    pipeline_search = Search(using=CONNECTION, index='pipelines').query(nested_query)
+
+    for pipeline in pipeline_search.scan():
+        problem_ids, random_seeds = scan_pipeline_runs(pipeline.id)
+
+        locs = [i for i, step in enumerate(pipeline.steps) if primitive_id == step.primitive.id]
+        if limit_indexes == 'last':
+            locs = locs[-1]
+        elif limit_indexes == 'first':
+            locs = locs[0]
+
+        for problem_id in problem_ids:
+            yield pipeline.to_dict(), build_problem_reference(problem_id), locs, random_seeds
+
+def query_on_seeds(pipeline_id: str=None, limit: int=None, submitter: str='byu'):
+    pipeline_search = Search(using=CONNECTION, index='pipelines')
+    if pipeline_id:
+        pipeline_search = pipeline_search.query('match', id=pipeline_id)
+    if submitter:
+        pipeline_search = pipeline_search.query('match', _submitter=submitter)
+
+    for pipeline in pipeline_search.scan():
+        results = scan_pipeline_runs(pipeline.id, submitter)
+        for (problem_id, dataset_id), random_seeds in results.items():
+            if limit and len(random_seeds) > limit:
+                continue
+            yield pipeline.to_dict(), build_problem_reference(problem_id), random_seeds
+
+def scan_pipeline_runs(pipeline_id, submitter=None):
+    pipeline_run_search = Search(using=CONNECTION, index='pipeline_runs') \
+        .query('match', pipeline__id=pipeline_id) \
+        .query('match', run__phase='PRODUCE') \
+        .query('match', status__state='SUCCESS')
+    if submitter:
+        pipeline_run_search = pipeline_run_search.query('match', _submitter=submitter)
+
+    results = dict()
+    for pipeline_run in pipeline_run_search.scan():
+        for dataset in pipeline_run.datasets:
+            dataset_prob_tuple = (pipeline_run.problem.id, dataset.id)
+            results[dataset_prob_tuple] = results.get(dataset_prob_tuple, set())
+            results[dataset_prob_tuple].add(pipeline_run.random_seed)
+    return results

From deee5fafa7c7ac16c79f887a41a486e7c8709f48 Mon Sep 17 00:00:00 2001
From: Benson Manner
Date: Mon, 18 Jan 2021 14:13:00 -0700
Subject: [PATCH 02/44] Untested seed swap functionality

---
 experimenter/cli.py              | 49 ++++++++++++++++++++++++++++++--
 experimenter/modify_generator.py | 42 ++++++++++++++++++---------
 experimenter/query.py            |  2 +-
 3 files changed, 76 insertions(+), 17 deletions(-)

diff --git a/experimenter/cli.py b/experimenter/cli.py
index fd6296c..ce681f7 100644
--- a/experimenter/cli.py
+++ b/experimenter/cli.py
@@ -125,11 +125,56 @@ def search_handler(arguments: argparse.Namespace, parser: argparse.ArgumentParse

 def configure_modify_parser(parser: argparse.ArgumentParser) -> None:
-    pass
+    #create the subparsers for the different types of modifications
+
+    #seed swapper functionality
+    subparser = parser.add_subparsers(dest='modify_type')
+    subparsers.required = True
+    swap_seed_subparser = subparsers.add_parser(
+        'random-seed',
+        description='Uses database data to search for functional pipelines and runs them on different random seeds',
+    )
+    #subparser arguments
+    swap_seed_subparser.add_argument(
+        '--pipeline_id',
+        description='The pipeline id to search for in the query, if none, searches all pipelines',
+        default=None,
+        type=str)
+    swap_seed_subparser.add_argument(
+        '--submitter',
+        help='The pipeline submitter to add to the query',
+        default=None,
+        type=str)
+    swap_seed_subparser.add_argument(
+        '--seed_limit',
+        help='The number of random seeds each pipeline will have been run with by the end of the test',
+        default=2,
+        type=int)
+
+    #Primitive swapper functionality
+    primitive_swap_subparser = subparsers.add_parser(
+        'primitive-swap',
+        description='Searches database for pipeline runs containing a primitive a swaps out primitive for a different given primitive')
+    #subparser arguments
+    primitive_swap_subparser.add_argument(
+        '--primitive_id',
+        help='The id of the primitive to swap out',
+        default=None,
+        type=str)
+    primitive_swap_subparser.add_argument(
+        '--limit_indeces',
+        help='Details for primitive swapping',
+        default=None)


 def modify_handler(arguments: argparse.Namespace, parser: argparse.ArgumentParser) -> None:
-    raise exceptions.NotImplementedError()
+    modify_type = arguments.modify_type
+    modify_type_parser = parser._subparsers._group_actions[0].choices[modify_type]
+    modify_arguments = modify_type_parser.parse_args(argv[1:])
+    modify_generator = ModifyGenerator(modify_type, modify_arguments, arguments.max-jobs)
+    #now run the enqueuer part
+    enqueuer = queue.JobEnqueuer(arguments)
+    enqueuer.enqueue(modify_generator)


 def configure_update_parser(parser: argparse.ArgumentParser) -> None:

diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py
index fc4c49a..06cc786 100644
--- a/experimenter/modify_generator.py
+++ b/experimenter/modify_generator.py
@@ -5,31 +5,33 @@ class ModifyGenerator:
     """ Generator to be used for creating modified pipelines based on existing
         pipelines in the database
     """
-    def __init__(self, *args):
+    def __init__(self, modify_type: str='random-seed', max_jobs: int=None, *args):
         self.args = args
         #initialize commonly used variables
-        self.modifier_type = args.modifier_type
-        self.num_pipelines_to_run = args.num_pipelines_to_run
+        self.modifier_type = modify_type
+        self.max_jobs = max_jobs
         self.num_complete = 0
         #run the query on initializing to define the query results
         self.query_results = self._query(self.modifier_type, self.args)

+
     def __next__(self):
         #iterate through query results
         for query_result in self.query_results:
-            pipeline, dataset, pipeline_run = query_result
             #iterate through modifier results
-            for new_pipeline, dataset in self._modify(self.args):
+            for job in self._modify(query_result, self.args):
                 self.num_complete += 1
                 #check to run until the generator stops iterating (if no input for num_pipelines_to_run)
-                if (self.num_pipelines_to_run):
-                    if (self.num_complete >= self.num_pipelines_to_run):
+                if (self.max_jobs):
+                    if (self.num_complete >= self.max_jobs):
                         raise StopIteration
-                return (new_pipeline, new_dataset)
+                return job
         raise StopIteration

+
     def __iter__(self):
         return self

+
     def _query(self, *args):
         if (self.modifier_type=='random-seed'):
             return query_on_seeds(args.pipeline_id, args.seed_limit, args.submitter):
         if (self.modifier_type=='swap-primitive'):
             return query_on_primitive(args.primitive_id, args.limit_indeces)
         else:
             raise ValueError("This type of modification is not yet an option")
+

-    def _modify(self,*args):
+    def _modify(self, query_args: dict, *args):
         if self.modifier_type=='random-seed':
-            self._modify_random_seed(args.random_seed, args.seed_limit)
+            return self._modify_random_seed(args.seed_limit, query_args)
         if self.modifier_type=='swap-primitive':
-            self._modify_swap_primitive(args.pipeline, args.primitive_loc, args.new_primitive)
+            return self._modify_swap_primitive(args.pipeline, args.primitive_loc, args.new_primitive)
         else:
             raise ValueError("This type of modification is not yet an option")

-    def _modify_random_seed(self, pipeline, dataset, args):
-        ##======== Create the random seed modifier
-        #yield random seeds and the pipeline/dataset to run on
+
+    def _modify_random_seed(self, seed_limit, query_args):
+        used_seeds = query_args.tested_seeds
+        num_run = len(used_seeds)
+        #run until the right number of seeds have been run
+        while (num_run < seed_limit):
+            new_seed = randint(1,100000)
+            if (new_seed in used_seeds):
+                continue
+            num_run += 1
+            used_seeds.append(new_seed)
+            #yield the necessary job requirements
+            yield query_args.pipeline, query_args.problem_ref, new_seed
+
diff --git a/experimenter/query.py b/experimenter/query.py
index 1e70e8e..93ebe1d 100644
--- a/experimenter/query.py
+++ b/experimenter/query.py
@@ -66,7 +66,7 @@ def query_on_seeds(pipeline_id: str=None, limit: int=None, submitter: str='byu')
         for (problem_id, dataset_id), random_seeds in results.items():
             if limit and len(random_seeds) > limit:
                 continue
-            yield pipeline.to_dict(), build_problem_reference(problem_id), random_seeds
+            yield {'pipeline': pipeline.to_dict(), 'problem_ref': build_problem_reference(problem_id), 'tested_seeds': random_seeds}

 def scan_pipeline_runs(pipeline_id, submitter=None):
     pipeline_run_search = Search(using=CONNECTION, index='pipeline_runs') \

From cdf770a90cb6b6c5fef51bfcf612b5ec5b33b642 Mon Sep 17 00:00:00 2001
From: Benson Manner
Date: Fri, 22 Jan 2021 09:56:25 -0700
Subject: [PATCH 03/44] Minor syntax fixes

---
 experimenter/cli.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/experimenter/cli.py b/experimenter/cli.py
index ce681f7..b6b6489 100644
--- a/experimenter/cli.py
+++ b/experimenter/cli.py
@@ -128,7 +128,7 @@ def configure_modify_parser(parser: argparse.ArgumentParser) -> None:
     #create the subparsers for the different types of modifications

     #seed swapper functionality
-    subparser = parser.add_subparsers(dest='modify_type')
+    subparsers = parser.add_subparsers(dest='modify_type')
     subparsers.required = True
     swap_seed_subparser = subparsers.add_parser(
         'random-seed',
@@ -137,7 +137,7 @@ def configure_modify_parser(parser: argparse.ArgumentParser) -> None:
     #subparser arguments
     swap_seed_subparser.add_argument(
         '--pipeline_id',
-        description='The pipeline id to search for in the query, if none, searches all pipelines',
+        help='The pipeline id to search for in the query, if none, searches all pipelines',
         default=None,
         type=str)

From b323c1f08ded784f3b80fd52fb44d9616a324d45 Mon Sep 17 00:00:00 2001
From: Benson Manner
Date: Tue, 26 Jan 2021 14:07:15 -0700
Subject: [PATCH 04/44] Unit test, and job maker

---
 experimenter/cli.py              | 12 ++++++++----
 experimenter/modify_generator.py | 29 +++++++++++++++++++++++++----
 experimenter/query.py            |  2 +-
 tests/test_modifier.py           | 32 ++++++++++++++++++++++++++++++++
 4 files changed, 66 insertions(+), 9 deletions(-)
 create mode 100644 tests/test_modifier.py

diff --git a/experimenter/cli.py b/experimenter/cli.py
index 9e80365..a6dc2dc 100644
--- a/experimenter/cli.py
+++ b/experimenter/cli.py
@@ -158,7 +158,7 @@ def configure_modify_parser(parser: argparse.ArgumentParser) -> None:
     #Primitive swapper functionality
     primitive_swap_subparser = subparsers.add_parser(
         'primitive-swap',
-        description='Searches database for pipeline runs containing a primitive a swaps out primitive for a different given primitive')
+        description='Searches database for pipeline runs containing a primitive and swaps out primitive for a different given primitive')
     #subparser arguments
     primitive_swap_subparser.add_argument(
         '--primitive_id',
@@ -169,16 +169,20 @@ def configure_modify_parser(parser: argparse.ArgumentParser) -> None:
         '--limit_indeces',
         help='Details for primitive swapping',
         default=None)
+    primitive_swap_subparser.add_argument(
+        '--swap_primitive_id',
+        help='The id of the primitive to swap in',
+        default=None
+        type=str)


 def modify_handler(arguments: argparse.Namespace, parser: argparse.ArgumentParser) -> None:
     modify_type = arguments.modify_type
     modify_type_parser = parser._subparsers._group_actions[0].choices[modify_type]
     modify_arguments = modify_type_parser.parse_args(argv[1:])
-    modify_generator = ModifyGenerator(modify_type, modify_arguments, arguments.max-jobs)
+    modify_generator = ModifyGenerator(modify_type, arguments.max-jobs, modify_arguments)
     #now run the enqueuer part
-    enqueuer = queue.JobEnqueuer(arguments)
-    enqueuer.enqueue(modify_generator)
+    queue.enqueue_jobs(modify_generator, arguments.queue_host, arguments.queue_port)


 def configure_update_parser(parser: argparse.ArgumentParser) -> None:

diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py
index 06cc786..3f9fd36 100644
--- a/experimenter/modify_generator.py
+++ b/experimenter/modify_generator.py
@@ -1,4 +1,6 @@
 from query import query_on_seeds, query_on_primitive
+from . import queue
+import d3m.metadata.pipeline


 class ModifyGenerator:
@@ -19,7 +21,8 @@ def __next__(self):
         #iterate through query results
         for query_result in self.query_results:
             #iterate through modifier results
-            for job in self._modify(query_result, self.args):
+            for job_args in self._modify(query_result, self.args):
+                job = queue.make_job(execute_pipeline_on_problem, job_args)
                 self.num_complete += 1
                 #check to run until the generator stops iterating (if no input for num_pipelines_to_run)
                 if (self.max_jobs):
@@ -31,7 +34,7 @@ def __next__(self):

     def __iter__(self):
         return self
-
+

     def _query(self, *args):
         if (self.modifier_type=='random-seed'):
@@ -46,10 +49,23 @@ def _modify(self, query_args: dict, *args):
         if self.modifier_type=='random-seed':
             return self._modify_random_seed(args.seed_limit, query_args)
         if self.modifier_type=='swap-primitive':
-            return self._modify_swap_primitive(args.pipeline, args.primitive_loc, args.new_primitive)
+            return self._modify_swap_primitive(args.swap_primitive_id, query_args)
         else:
             raise ValueError("This type of modification is not yet an option")

+
+    def _check_for_duplicates(self, pipeline_to_check, problem_ref_to_check):
+        """Pseudo function/method for duplicate checking - this is not complete
+        """
+        #create the pipeline to check for duplicates from the path
+        pipeline_object = d3m.metadata.pipeline.Pipeline.from_json(pipeline_to_check)
+        #query through the database for equal pipelines
+        similar_pipeline_runs_in_database = query.generate_similar_pipeline_runs()
+        for pipeline in similar_pipeline_runs_in_database:
+            if(pipeline_object.equals(pipeline)):
+                return True
+        return False
+
@@ -62,5 +78,10 @@ def _modify_random_seed(self, seed_limit, query_args):
             num_run += 1
             used_seeds.append(new_seed)
             #yield the necessary job requirements
-            yield query_args.pipeline, query_args.problem_ref, new_seed
+            yield query_args.pipeline, query_args.problem_ref, new_seed
+
+
+    def _modify_swap_primitive(self, swap_pipeline, query_args):
+        raise ValueError("No functionality for swapping primitives yet")
+
diff --git a/experimenter/query.py b/experimenter/query.py
index 93ebe1d..ec95f58 100644
--- a/experimenter/query.py
+++ b/experimenter/query.py
@@ -52,7 +52,7 @@ def query_on_primitive(primitive_id: str, limit_indexes=False):
             locs = locs[0]

         for problem_id in problem_ids:
-            yield pipeline.to_dict(), build_problem_reference(problem_id), locs, random_seeds
+            yield {'pipeline': pipeline.to_dict(), 'problem_ref': build_problem_reference(problem_id), 'location': locs, 'tested_seeds': random_seeds}

 def query_on_seeds(pipeline_id: str=None, limit: int=None, submitter: str='byu'):
     pipeline_search = Search(using=CONNECTION, index='pipelines')
diff --git a/tests/test_modifier.py b/tests/test_modifier.py
new file mode 100644
index 0000000..3b1fb9a
--- /dev/null
+++ b/tests/test_modifier.py
@@ -0,0 +1,32 @@
+import unittest
+from experimenter import modify_generator, queue, exceptions, utils
+from experimenter.query import query_on_seeds
+
+class ModifierTestCase(unittest.TestCase):
+
+    def test_seed_modifier(self):
+        #initialize the modifier with random-seed and a given max jobs
+        args = {'seed_limit':25, 'submitter':None, 'pipeline_id':None}
+        num_test = 21
+        modifier = modify_generator.ModifyGenerator('random-seed', num_test, {'seed_limit':25})
+        #start the counter to make sure there is the right number of jobs
+        counter = 0
+        seed_old = 12.1
+        #begin the test that the number of jobs is correct
+        for job in modifier:
+            counter += 1
+            _,_,seed_new = job
+            self.assertNotEqual(seed_old, seed_new)
+            seed_old = seed_new
+        self.assertEqual(counter,num_test)
+
+    def test_query_seeds(self):
+        args = {'seed_limit':25, 'submitter':'byu', 'pipeline_id':None}
+        query_results = query_on_seeds(args['pipeline_id'], args['seed_limit'], args['submitter'])
+        #test 10 query results
+        for i in range(10):
+            _,_,seed_list = next(query_results)
+            self.assertTrue(len(seed_list) <= args['seed_limit'])
+
+if __name__ == '__main__':
+    unittest.main()

From fdd281320c8bdb347ab0c4ceb709b194bfb50626 Mon Sep 17 00:00:00 2001
From: Joseph Clark
Date: Tue, 26 Jan 2021 15:38:18 -0700
Subject: [PATCH 05/44] functions to execute D3M runtime evaluate via cli

---
 experimenter/execute_pipeline_new.py | 149 +++++++++++++++++++++++++++
 1 file changed, 149 insertions(+)
 create mode 100644 experimenter/execute_pipeline_new.py

diff --git a/experimenter/execute_pipeline_new.py b/experimenter/execute_pipeline_new.py
new file mode 100644
index 0000000..b6eafbf
--- /dev/null
+++ b/experimenter/execute_pipeline_new.py
@@ -0,0 +1,149 @@
+import itertools as it
+import os
+from typing import Any, List, Tuple
+
+
+from d3m.metadata.pipeline import Pipeline
+from d3m import cli
+
+from data_preparation_pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_ID, SCORING_PIPELINE_ID
+
+def execute_pipeline_on_problem(
+    pipe: Pipeline,
+    problem: ProblemReference,
+    random_seed: int):
+    """ TODO: function one-liner
+
+    TODO doc
+    """
+    pipeline_path = pipeline.id
+    problem_path = problem.path
+    input_path = problem.dataset_doc_path
+    output_run_path = '-'
+    data_random_seed = random_seed
+
+    execute_pipeline_via_d3m_cli(pipeline_path, problem_path, input_path,
+        output_run_path, data_random_seed)
+
+def execute_pipeline_via_d3m_cli(pipeline_path: str,
+    problem_path: str,
+    input_path: str,
+    output_run_path: str,
+    data_random_seed: int,
+    data_params: List[Tuple[str,Any]] = None,
+    data_pipeline: str = K_FOLD_TABULAR_SPLIT_PIPELINE_ID,
+    scoring_pipeline: str = SCORING_PIPELINE_ID,
+    input_run_path: str = None,
+    metric: str = None,
+    scoring_params: List[Tuple[str,Any]] = None,
+    scores_path: str = None,
+    scoring_random_seed: int = None,
+    data_split_file_path: str = None):
+    """ TODO: function one-liner
+
+    TODO: function summary
+
+    # data_pipeline_path - 10 fold cross validation default
+
+    Required Arguments:
+    ---------------------------------
+    pipeline_path -- TODO: arg doc
+    problem_path -- TODO: arg doc
+    input_path -- TODO: arg doc
+    output_run_path -- TODO: arg doc
+    data_random_seed -- TODO: arg doc
+
+    Optional Arguments:
+    ---------------------------------
+    data_params -- TODO: arg doc
+    data_pipeline -- TODO: arg doc
+    scoring_pipeline -- TODO: arg doc
+    input_run_path -- TODO: arg doc
+    metric -- TODO: arg doc
+    scoring_params -- TODO: arg doc
+    scores_path -- TODO: arg doc
+    scoring_random_seed -- TODO: arg doc
+    data_split_file_path -- TODO: arg doc
+
+    Raises:
+    -------
+    ValueError: TODO: doc
+
+    Return:
+    -------
+    TODO: return doc
+    """
+    args = ['d3m', 'runtime', 'evaluate']
+
+    if (not os.path.isfile(pipeline_path)):
+        raise ValueError('\'pipeline_path\' param is not a file')
+
+    if (not os.path.isfile(problem_path)): # TODO: check for URI
+        raise ValueError('\'problem_path\' param is not a file')
+
+    if (not os.path.isfile(input_path)): # TODO: check for URI
+        raise ValueError('\'input_path\' param is not a file')
+
+    if (not isinstance(output_run_path, str) and output_run_path != '-'):
+        # TODO: how to check for nonexistent file? parse?
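+        # One possible answer to the TODO above (a sketch, standard library
+        # only): treat the value as valid when it is '-' (stdout) or when its
+        # parent directory exists and is writable, e.g.
+        #   out_dir = os.path.dirname(os.path.abspath(output_run_path)) or '.'
+        #   is_valid = output_run_path == '-' or os.access(out_dir, os.W_OK)
+        # The names out_dir and is_valid are illustrative only.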
+ raise ValueError('\'output_run_path\' param is not a valid value') + + if (not isinstance(data_random_seed, int)): + raise TypeError('\'{}\' param is not of type \'{}\''.format('data_random_seed','int')) + + if (input_run_path): + # TODO: input_run_path validation + pass + + args.extend(('--pipeline ', pipeline_path)) + args.extend(('--problem', problem_path)) + args.extend(('--input', input_path)) + args.extend(('--output-run', output_run_path)) + args.extend(('--data-random-seed', data_random_seed)) + + for data_param in data_params: + args.extend(('--data-param', data_param[0], data_param[1])) + + if (data_params): + if (not isinstance(data_params, List)): + raise TypeError('\'{}\' param is not of type \'{}\''.format('data_params','List')) + for data_param in data_params: + args.extend(('--data-param', data_param[0], data_param[1])) + + if (data_pipeline): + # TODO: how to check if data_pipeline is pipeline id? (guid?) + args.extend(('--data-pipeline', data_pipeline)) + + if (scoring_pipeline): + # TODO: how to check if scoring_pipeline is pipeline id? + args.extend(('--scoring-pipeline', scoring_pipeline)) + + if (metric): + # TODO: set of valid metric args? + args.extend(('--metric', metric)) + + if (scoring_params): + if (not isinstance(scoring_params, List)): + raise TypeError('\'{}\' param is not of type \'{}\''.format('scoring_params','List')) + for scoring_param in scoring_params: + args.extend(('--scoring-param', scoring_param[0], scoring_param[1])) + + if (scores_path): + # TODO: how to check for nonexistent file? parse? + args.extend(('--scores', scores_path)) + + if (scoring_random_seed): + if (not isinstance(scoring_random_seed, int)): + raise TypeError('\'{}\' param is not of type \'{}\''.format('scoring_random_seed','int')) + args.extend(('--scoring-random-seed', scoring_random_seed)) + + if (data_split_file_path): + if (not os.path.isfile(data_split_file_path)): + raise ValueError('\'data_split_file_path\' param is not a file') + args.extend(('--data-split-file', data_split_file_path)) + + cli.main(args) + +if __name__ == '__main__': + path = 'README.md' + execute_pipeline_via_d3m_cli(path,path,path,path,1,[(1,2),(3,4)]) From 275cbc6040f22fb678533f59996eb35f9b63362f Mon Sep 17 00:00:00 2001 From: Joseph Clark Date: Thu, 28 Jan 2021 14:41:26 -0700 Subject: [PATCH 06/44] added data_prep_pipelines --- .../3c11d171-e2ad-4d26-a034-04f3b062306c.yml | 82 +++++++++++ .../79ce71bd-db96-494b-a455-14f2e2ac5040.yml | 84 +++++++++++ .../9c18472e-fff7-4129-93f6-1ab996e82adb.yml | 84 +++++++++++ .../data_preparation_pipelines/__init__.py | 31 ++++ .../c8ed65df-aa68-4ee0-bbb5-c5f76a40bcf8.yml | 82 +++++++++++ .../f596cd77-25f8-4d4c-a350-bb30ab1e58f6.yml | 31 ++++ experimenter/execute_pipeline_new.py | 137 ++++++++++++------ experimenter/run_pipeline.py | 2 +- 8 files changed, 489 insertions(+), 44 deletions(-) create mode 100644 experimenter/data_preparation_pipelines/3c11d171-e2ad-4d26-a034-04f3b062306c.yml create mode 100644 experimenter/data_preparation_pipelines/79ce71bd-db96-494b-a455-14f2e2ac5040.yml create mode 100644 experimenter/data_preparation_pipelines/9c18472e-fff7-4129-93f6-1ab996e82adb.yml create mode 100644 experimenter/data_preparation_pipelines/__init__.py create mode 100644 experimenter/data_preparation_pipelines/c8ed65df-aa68-4ee0-bbb5-c5f76a40bcf8.yml create mode 100644 experimenter/data_preparation_pipelines/f596cd77-25f8-4d4c-a350-bb30ab1e58f6.yml diff --git a/experimenter/data_preparation_pipelines/3c11d171-e2ad-4d26-a034-04f3b062306c.yml 
b/experimenter/data_preparation_pipelines/3c11d171-e2ad-4d26-a034-04f3b062306c.yml new file mode 100644 index 0000000..695f53c --- /dev/null +++ b/experimenter/data_preparation_pipelines/3c11d171-e2ad-4d26-a034-04f3b062306c.yml @@ -0,0 +1,82 @@ +id: 3c11d171-e2ad-4d26-a034-04f3b062306c +schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json +source: + name: Mitar +created: "2018-07-28T01:24:39.642266Z" +name: Train-test split of tabular datasets +description: | + Train-test split of tabular datasets. +inputs: + - name: folds + - name: full dataset +outputs: + - name: train datasets + data: steps.0.produce + - name: test datasets + data: steps.2.produce + - name: score datasets + data: steps.1.produce +steps: + # Step 0. + - type: PRIMITIVE + primitive: + id: 3fcc6dc4-6681-4c86-948e-066d14e7d803 + version: 0.1.0 + python_path: d3m.primitives.evaluation.train_score_dataset_split.Common + name: Train-score tabular dataset splits + arguments: + inputs: + type: CONTAINER + data: inputs.0 + dataset: + type: CONTAINER + data: inputs.1 + outputs: + - id: produce + - id: produce_score_data + # Step 1. We redact privileged attributes for both score and test splits. + - type: PRIMITIVE + primitive: + id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 + version: 0.2.0 + python_path: d3m.primitives.evaluation.redact_columns.Common + name: Redact columns for evaluation + arguments: + inputs: + type: CONTAINER + data: steps.0.produce_score_data + outputs: + - id: produce + hyperparams: + semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/PrivilegedData + add_semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData + - https://metadata.datadrivendiscovery.org/types/MissingData + # Step 2. We further redact targets in test split. + - type: PRIMITIVE + primitive: + id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 + version: 0.2.0 + python_path: d3m.primitives.evaluation.redact_columns.Common + name: Redact columns for evaluation + arguments: + inputs: + type: CONTAINER + data: steps.1.produce + outputs: + - id: produce + hyperparams: + semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/TrueTarget + add_semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/RedactedTarget + - https://metadata.datadrivendiscovery.org/types/MissingData diff --git a/experimenter/data_preparation_pipelines/79ce71bd-db96-494b-a455-14f2e2ac5040.yml b/experimenter/data_preparation_pipelines/79ce71bd-db96-494b-a455-14f2e2ac5040.yml new file mode 100644 index 0000000..6a91f91 --- /dev/null +++ b/experimenter/data_preparation_pipelines/79ce71bd-db96-494b-a455-14f2e2ac5040.yml @@ -0,0 +1,84 @@ +id: 79ce71bd-db96-494b-a455-14f2e2ac5040 +schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json +source: + name: Mitar +created: "2018-10-26T00:48:08.341897Z" +name: No split of tabular datasets +description: | + A pipeline which splits a tabular dataset in a way that for all splits it + produces the same (full) dataset. It still redacts the test split. + Useful for unsupervised learning tasks. +inputs: + - name: folds + - name: full dataset +outputs: + - name: train datasets + data: steps.0.produce + - name: test datasets + data: steps.2.produce + - name: score datasets + data: steps.1.produce +steps: + # Step 0. 
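+  # (As the description above notes, the no-split primitive returns the full
+  # dataset for every fold, so the train, test, and score splits below all
+  # contain the same data.)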
+ - type: PRIMITIVE + primitive: + id: 48c683ad-da9e-48cf-b3a0-7394dba5e5d2 + version: 0.1.0 + python_path: d3m.primitives.evaluation.no_split_dataset_split.Common + name: No-split tabular dataset splits + arguments: + inputs: + type: CONTAINER + data: inputs.0 + dataset: + type: CONTAINER + data: inputs.1 + outputs: + - id: produce + - id: produce_score_data + # Step 1. We redact privileged attributes for both score and test splits. + - type: PRIMITIVE + primitive: + id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 + version: 0.2.0 + python_path: d3m.primitives.evaluation.redact_columns.Common + name: Redact columns for evaluation + arguments: + inputs: + type: CONTAINER + data: steps.0.produce_score_data + outputs: + - id: produce + hyperparams: + semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/PrivilegedData + add_semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData + - https://metadata.datadrivendiscovery.org/types/MissingData + # Step 2. We further redact targets in test split. + - type: PRIMITIVE + primitive: + id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 + version: 0.2.0 + python_path: d3m.primitives.evaluation.redact_columns.Common + name: Redact columns for evaluation + arguments: + inputs: + type: CONTAINER + data: steps.1.produce + outputs: + - id: produce + hyperparams: + semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/TrueTarget + add_semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/RedactedTarget + - https://metadata.datadrivendiscovery.org/types/MissingData diff --git a/experimenter/data_preparation_pipelines/9c18472e-fff7-4129-93f6-1ab996e82adb.yml b/experimenter/data_preparation_pipelines/9c18472e-fff7-4129-93f6-1ab996e82adb.yml new file mode 100644 index 0000000..80e2a2c --- /dev/null +++ b/experimenter/data_preparation_pipelines/9c18472e-fff7-4129-93f6-1ab996e82adb.yml @@ -0,0 +1,84 @@ +id: 9c18472e-fff7-4129-93f6-1ab996e82adb +schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json +source: + name: Mitar +created: "2018-10-27T01:30:10.245934Z" +name: Fixed split of tabular datasets +description: | + A pipeline which splits a tabular dataset in a way that uses for the test + (score) split a fixed list of primary index values or row indices of the main + resource to be used. +inputs: + - name: folds + - name: full dataset +outputs: + - name: train datasets + data: steps.0.produce + - name: test datasets + data: steps.2.produce + - name: score datasets + data: steps.1.produce +steps: + # Step 0. + - type: PRIMITIVE + primitive: + id: 1654f000-2178-4520-be4c-a95bc26b8d3a + version: 0.1.0 + python_path: d3m.primitives.evaluation.fixed_split_dataset_split.Commmon + name: Fixed split tabular dataset splits + arguments: + inputs: + type: CONTAINER + data: inputs.0 + dataset: + type: CONTAINER + data: inputs.1 + outputs: + - id: produce + - id: produce_score_data + # Step 1. We redact privileged attributes for both score and test splits. 
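+  # (Per the hyperparams below, redaction blanks the values of columns carrying
+  # the listed semantic types and re-tags them as redacted/missing, so the
+  # evaluation splits cannot leak privileged data or true targets.)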
+ - type: PRIMITIVE + primitive: + id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 + version: 0.2.0 + python_path: d3m.primitives.evaluation.redact_columns.Common + name: Redact columns for evaluation + arguments: + inputs: + type: CONTAINER + data: steps.0.produce_score_data + outputs: + - id: produce + hyperparams: + semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/PrivilegedData + add_semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData + - https://metadata.datadrivendiscovery.org/types/MissingData + # Step 2. We further redact targets in test split. + - type: PRIMITIVE + primitive: + id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 + version: 0.2.0 + python_path: d3m.primitives.evaluation.redact_columns.Common + name: Redact columns for evaluation + arguments: + inputs: + type: CONTAINER + data: steps.1.produce + outputs: + - id: produce + hyperparams: + semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/TrueTarget + add_semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/RedactedTarget + - https://metadata.datadrivendiscovery.org/types/MissingData diff --git a/experimenter/data_preparation_pipelines/__init__.py b/experimenter/data_preparation_pipelines/__init__.py new file mode 100644 index 0000000..4b52dec --- /dev/null +++ b/experimenter/data_preparation_pipelines/__init__.py @@ -0,0 +1,31 @@ +import os.path + +SCORING_PIPELINE_ID = 'f596cd77-25f8-4d4c-a350-bb30ab1e58f6' +SCORING_PIPELINE_PATH = os.path.join( + os.path.dirname(__file__), SCORING_PIPELINE_ID + '.yml' +) +assert os.path.exists(SCORING_PIPELINE_PATH) + +NO_SPLIT_TABULAR_SPLIT_PIPELINE_ID = '79ce71bd-db96-494b-a455-14f2e2ac5040' +NO_SPLIT_TABULAR_SPLIT_PIPELINE_PATH = os.path.join( + os.path.dirname(__file__), NO_SPLIT_TABULAR_SPLIT_PIPELINE_ID + '.yml' +) +assert os.path.exists(NO_SPLIT_TABULAR_SPLIT_PIPELINE_PATH) + +FIXED_SPLIT_TABULAR_SPLIT_PIPELINE_ID = '9c18472e-fff7-4129-93f6-1ab996e82adb' +FIXED_SPLIT_TABULAR_SPLIT_PIPELINE_PATH = os.path.join( + os.path.dirname(__file__), FIXED_SPLIT_TABULAR_SPLIT_PIPELINE_ID + '.yml' +) +assert os.path.exists(FIXED_SPLIT_TABULAR_SPLIT_PIPELINE_PATH) + +TRAIN_TEST_TABULAR_SPLIT_PIPELINE_ID = '3c11d171-e2ad-4d26-a034-04f3b062306c' +TRAIN_TEST_TABULAR_SPLIT_PIPELINE_PATH = os.path.join( + os.path.dirname(__file__), TRAIN_TEST_TABULAR_SPLIT_PIPELINE_ID + '.yml' +) +assert os.path.exists(TRAIN_TEST_TABULAR_SPLIT_PIPELINE_PATH) + +K_FOLD_TABULAR_SPLIT_PIPELINE_ID = 'c8ed65df-aa68-4ee0-bbb5-c5f76a40bcf8' +K_FOLD_TABULAR_SPLIT_PIPELINE_PATH = os.path.join( + os.path.dirname(__file__), K_FOLD_TABULAR_SPLIT_PIPELINE_ID + '.yml' +) +assert os.path.exists(K_FOLD_TABULAR_SPLIT_PIPELINE_PATH) diff --git a/experimenter/data_preparation_pipelines/c8ed65df-aa68-4ee0-bbb5-c5f76a40bcf8.yml b/experimenter/data_preparation_pipelines/c8ed65df-aa68-4ee0-bbb5-c5f76a40bcf8.yml new file mode 100644 index 0000000..91f14f2 --- /dev/null +++ b/experimenter/data_preparation_pipelines/c8ed65df-aa68-4ee0-bbb5-c5f76a40bcf8.yml @@ -0,0 +1,82 @@ +id: c8ed65df-aa68-4ee0-bbb5-c5f76a40bcf8 +schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json +source: + name: Mitar +created: "2018-07-27T19:39:00.676949Z" +name: K-fold split of tabular datasets +description: | + K-fold split of tabular datasets for cross-validation. 
+inputs: + - name: folds + - name: full dataset +outputs: + - name: train datasets + data: steps.0.produce + - name: test datasets + data: steps.2.produce + - name: score datasets + data: steps.1.produce +steps: + # Step 0. + - type: PRIMITIVE + primitive: + id: bfedaf3a-6dd0-4a83-ad83-3a50fe882bf8 + version: 0.1.0 + python_path: d3m.primitives.evaluation.kfold_dataset_split.Common + name: K-fold cross-validation tabular dataset splits + arguments: + inputs: + type: CONTAINER + data: inputs.0 + dataset: + type: CONTAINER + data: inputs.1 + outputs: + - id: produce + - id: produce_score_data + # Step 1. We redact privileged attributes for both score and test splits. + - type: PRIMITIVE + primitive: + id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 + version: 0.2.0 + python_path: d3m.primitives.evaluation.redact_columns.Common + name: Redact columns for evaluation + arguments: + inputs: + type: CONTAINER + data: steps.0.produce_score_data + outputs: + - id: produce + hyperparams: + semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/PrivilegedData + add_semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData + - https://metadata.datadrivendiscovery.org/types/MissingData + # Step 2. We further redact targets in test split. + - type: PRIMITIVE + primitive: + id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 + version: 0.2.0 + python_path: d3m.primitives.evaluation.redact_columns.Common + name: Redact columns for evaluation + arguments: + inputs: + type: CONTAINER + data: steps.1.produce + outputs: + - id: produce + hyperparams: + semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/TrueTarget + add_semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/RedactedTarget + - https://metadata.datadrivendiscovery.org/types/MissingData diff --git a/experimenter/data_preparation_pipelines/f596cd77-25f8-4d4c-a350-bb30ab1e58f6.yml b/experimenter/data_preparation_pipelines/f596cd77-25f8-4d4c-a350-bb30ab1e58f6.yml new file mode 100644 index 0000000..e95ecd5 --- /dev/null +++ b/experimenter/data_preparation_pipelines/f596cd77-25f8-4d4c-a350-bb30ab1e58f6.yml @@ -0,0 +1,31 @@ +id: f596cd77-25f8-4d4c-a350-bb30ab1e58f6 +schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json +source: + name: Mitar +created: "2020-04-18T11:42:44.138742Z" +name: Scoring pipeline +description: |- + A general scoring pipeline. +inputs: + - name: predictions + - name: score dataset +outputs: + - name: scores + data: steps.0.produce +steps: + # Step 0. 
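+  # (The compute_scores primitive below compares the predictions input against
+  # the score dataset and produces the scores output, using the metrics it is
+  # given.)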
+  - type: PRIMITIVE
+    primitive:
+      id: 799802fb-2e11-4ab7-9c5e-dda09eb52a70
+      version: 0.5.0
+      python_path: d3m.primitives.evaluation.compute_scores.Core
+      name: Compute scores given the metrics to use
+    arguments:
+      inputs:
+        type: CONTAINER
+        data: inputs.0
+      score_dataset:
+        type: CONTAINER
+        data: inputs.1
+    outputs:
+      - id: produce
diff --git a/experimenter/execute_pipeline_new.py b/experimenter/execute_pipeline_new.py
index b6eafbf..e6d1c7d 100644
--- a/experimenter/execute_pipeline_new.py
+++ b/experimenter/execute_pipeline_new.py
@@ -1,7 +1,8 @@
 import itertools as it
 import os
-from typing import Any, List, Tuple

+from typing import Any, List, Tuple
+from uuid import UUID

 from d3m.metadata.pipeline import Pipeline
 from d3m import cli
@@ -25,20 +26,20 @@ def execute_pipeline_on_problem(
     execute_pipeline_via_d3m_cli(pipeline_path, problem_path, input_path,
         output_run_path, data_random_seed)

-def execute_pipeline_via_d3m_cli(pipeline_path: str,
-    problem_path: str,
-    input_path: str,
-    output_run_path: str,
+def execute_pipeline_via_d3m_cli(pipeline: str,
+    problem: str,
+    input: str,
+    output_run: str,
     data_random_seed: int,
     data_params: List[Tuple[str,Any]] = None,
     data_pipeline: str = K_FOLD_TABULAR_SPLIT_PIPELINE_ID,
     scoring_pipeline: str = SCORING_PIPELINE_ID,
-    input_run_path: str = None,
+    input_run: str = None,
     metric: str = None,
     scoring_params: List[Tuple[str,Any]] = None,
-    scores_path: str = None,
+    scores: str = None,
     scoring_random_seed: int = None,
-    data_split_file_path: str = None):
+    data_split_file: str = None):
     """ TODO: function one-liner

     TODO: function summary

     # data_pipeline_path - 10 fold cross validation default

     Required Arguments:
     ---------------------------------
-    pipeline_path -- TODO: arg doc
-    problem_path -- TODO: arg doc
-    input_path -- TODO: arg doc
-    output_run_path -- TODO: arg doc
+    pipeline -- TODO: arg doc
+    problem -- TODO: arg doc
+    input -- TODO: arg doc
+    output_run -- TODO: arg doc
     data_random_seed -- TODO: arg doc

     Optional Arguments:
     ---------------------------------
     data_params -- TODO: arg doc
     data_pipeline -- TODO: arg doc
     scoring_pipeline -- TODO: arg doc
-    input_run_path -- TODO: arg doc
+    input_run -- TODO: arg doc
     metric -- TODO: arg doc
     scoring_params -- TODO: arg doc
-    scores_path -- TODO: arg doc
+    scores -- TODO: arg doc
     scoring_random_seed -- TODO: arg doc
-    data_split_file_path -- TODO: arg doc
+    data_split_file -- TODO: arg doc

     Raises:
     -------
+    TypeError: TODO: doc
     ValueError: TODO: doc

     Return:
     -------
     TODO: return doc
     """
     args = ['d3m', 'runtime', 'evaluate']

-    if (not os.path.isfile(pipeline_path)):
-        raise ValueError('\'pipeline_path\' param is not a file')
+    if (not isinstance(pipeline, str)):
+        raise TypeError('\'{}\' param not of type \'{}\''.format('pipeline', 'str'))

-    if (not os.path.isfile(problem_path)): # TODO: check for URI
-        raise ValueError('\'problem_path\' param is not a file')
+    if (not isinstance(problem, str)):
+        raise TypeError('\'{}\' param not of type \'{}\''.format('problem', 'str'))

-    if (not os.path.isfile(input_path)): # TODO: check for URI
-        raise ValueError('\'input_path\' param is not a file')
+    if (not isinstance(input, str)):
+        raise TypeError('\'{}\' param not of type \'{}\''.format('input', 'str'))

-    if (not isinstance(output_run_path, str) and output_run_path != '-'):
-        # TODO: how to check for nonexistent file? parse?
-        raise ValueError('\'output_run_path\' param is not a valid value')
+    if (not isinstance(output_run, str)):
+        raise TypeError('\'{}\' param not of type \'{}\''.format('output_run', 'str'))

     if (not isinstance(data_random_seed, int)):
-        raise TypeError('\'{}\' param is not of type \'{}\''.format('data_random_seed','int'))
+        raise TypeError('\'{}\' param not of type \'{}\''.format('data_random_seed','int'))

-    if (input_run_path):
-        # TODO: input_run_path validation
-        pass
+    if (not os.path.isfile(pipeline) and not is_valid_uuid(pipeline)):
+        raise ValueError('\'{}\' param not a file path'.format('pipeline'))

-    args.extend(('--pipeline ', pipeline_path))
-    args.extend(('--problem', problem_path))
-    args.extend(('--input', input_path))
-    args.extend(('--output-run', output_run_path))
-    args.extend(('--data-random-seed', data_random_seed))
+    if (not os.path.isfile(problem)): # TODO: check for URI
+        raise ValueError('\'{}\' param not a file path'.format('problem'))

-    for data_param in data_params:
-        args.extend(('--data-param', data_param[0], data_param[1]))
+    if (not os.path.isfile(input)): # TODO: check for URI
+        raise ValueError('\'{}\' param not a file path'.format('input'))
+
+    if (output_run != '-' and not os.path.isdir(os.path.dirname(os.path.abspath(output_run)))):
+        # output_run must be '-' (stdout) or a file path in an existing directory
+        raise ValueError('\'{}\' param invalid: {{file_path, \'-\'}}'.format('output_run'))
+
+    args.extend(('--pipeline', pipeline))
+    args.extend(('--problem', problem))
+    args.extend(('--input', input))
+    args.extend(('--output-run', output_run))
+    args.extend(('--data-random-seed', data_random_seed))
+
+    if (input_run):
+        if (not isinstance(input_run, str)):
+            raise TypeError('\'{}\' param not of type \'{}\''.format('input_run','str'))
+        if (not os.path.isfile(input_run) and input_run != '-'):
+            raise ValueError('\'{}\' param invalid: {{file_path, \'-\'}}'.format('input_run'))
+        # TODO: input_run validation
+        pass

     if (data_params):
         if (not isinstance(data_params, List)):
-            raise TypeError('\'{}\' param is not of type \'{}\''.format('data_params','List'))
+            raise TypeError('\'{}\' param not of type \'{}\''.format('data_params','List'))
         for data_param in data_params:
             args.extend(('--data-param', data_param[0], data_param[1]))

     if (data_pipeline):
-        # TODO: how to check if data_pipeline is pipeline id? (guid?)
+        if (not isinstance(data_pipeline, str)):
+            raise TypeError('\'{}\' param not of type \'{}\''.format('data_pipeline','str'))
+        if (not os.path.isfile(data_pipeline) and not is_valid_uuid(data_pipeline)):
+            raise ValueError('\'{}\' param not a file path'.format('data_pipeline'))
         args.extend(('--data-pipeline', data_pipeline))

     if (scoring_pipeline):
-        # TODO: how to check if scoring_pipeline is pipeline id?
+        if (not isinstance(scoring_pipeline, str)):
+            raise TypeError('\'{}\' param not of type \'{}\''.format('scoring_pipeline','str'))
+        if (not os.path.isfile(scoring_pipeline) not is_valid_uuid(scoring_pipeline)):
+            raise ValueError('\'{}\' param not a file path'.format('scoring_pipeline'))
         args.extend(('--scoring-pipeline', scoring_pipeline))

     if (metric):
+        if (not isinstance(metric, str)):
+            raise TypeError('\'{}\' param not of type \'{}\''.format('metric','str'))
         # TODO: set of valid metric args?
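+        # One hedged way to resolve the TODO above: validate against d3m's
+        # PerformanceMetric enum (assuming the installed d3m version exposes
+        # it), e.g.
+        #   from d3m.metadata.problem import PerformanceMetric
+        #   valid_metrics = {m.name for m in PerformanceMetric}  # 'ACCURACY', 'F1_MACRO', ...
+        #   if metric not in valid_metrics: raise ValueError(...)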
         args.extend(('--metric', metric))

     if (scoring_params):
         if (not isinstance(scoring_params, List)):
-            raise TypeError('\'{}\' param is not of type \'{}\''.format('scoring_params','List'))
+            raise TypeError('\'{}\' param not of type \'{}\''.format('scoring_params','List'))
         for scoring_param in scoring_params:
             args.extend(('--scoring-param', scoring_param[0], scoring_param[1]))

-    if (scores_path):
+    if (scores):
         # TODO: how to check for nonexistent file? parse?
-        args.extend(('--scores', scores_path))
+        args.extend(('--scores', scores))

     if (scoring_random_seed):
         if (not isinstance(scoring_random_seed, int)):
-            raise TypeError('\'{}\' param is not of type \'{}\''.format('scoring_random_seed','int'))
+            raise TypeError('\'{}\' param not of type \'{}\''.format('scoring_random_seed','int'))
         args.extend(('--scoring-random-seed', scoring_random_seed))

-    if (data_split_file_path):
-        if (not os.path.isfile(data_split_file_path)):
-            raise ValueError('\'data_split_file_path\' param is not a file')
-        args.extend(('--data-split-file', data_split_file_path))
+    if (data_split_file):
+        if (not isinstance(data_split_file, str)):
+            raise TypeError('\'{}\' param not of type \'{}\''.format('data_split_file','str'))
+        if (not os.path.isfile(data_split_file)):
+            raise ValueError('\'{}\' param invalid value: {{file_path, \'-\'}}'.format('data_split_file'))
+        args.extend(('--data-split-file', data_split_file))

     cli.main(args)

+def is_valid_uuid(uuid_to_test: str, version=4):
+    """
+    Check if uuid_to_test is a valid UUID.
+
+    Parameters
+    ----------
+    uuid_to_test : str
+    version : {1, 2, 3, 4}
+
+    Returns
+    -------
+    `True` if uuid_to_test is a valid UUID, otherwise `False`.
+
+    Examples
+    --------
+    >>> is_valid_uuid('c9bf9e57-1685-4c89-bafb-ff5af830be8a')
+    True
+    >>> is_valid_uuid('c9bf9e58')
+    False
+    """
+
+    try:
+        uuid_obj = UUID(uuid_to_test, version=version)
+    except Exception:
+        return False
+    return str(uuid_obj) == uuid_to_test
+
 if __name__ == '__main__':
     path = 'README.md'
     execute_pipeline_via_d3m_cli(path,path,path,path,1,[(1,2),(3,4)])
diff --git a/experimenter/run_pipeline.py b/experimenter/run_pipeline.py
index 6d722b0..c6752f0 100644
--- a/experimenter/run_pipeline.py
+++ b/experimenter/run_pipeline.py
@@ -57,7 +57,7 @@ def run(self, pipeline: Pipeline, metric_names: list = None) -> list:
         similar to that of `_evaluate` in the Runtime code. The aforementioned
         function does not allow for returning the data, so it did not fit in
         the workflow.
-        
+
         :param pipeline: the pipeline object to be run OR the path to the
         pipeline file to be used
         :param metric_names: if provided, the pipeline will be scored against
         this custom

From 9b029881946a03f65c5da9b9d4f249b9d78986c5 Mon Sep 17 00:00:00 2001
From: Joseph Clark
Date: Thu, 28 Jan 2021 15:19:19 -0700
Subject: [PATCH 07/44] renamed execute --> evaluate.
 created blank file for new implementation of a problem
---
 .../{execute_pipeline_new.py => evaluate_pipeline_new.py} | 8 ++++----
 experimenter/problem_new.py                               | 0
 2 files changed, 4 insertions(+), 4 deletions(-)
 rename experimenter/{execute_pipeline_new.py => evaluate_pipeline_new.py} (96%)
 create mode 100644 experimenter/problem_new.py

diff --git a/experimenter/execute_pipeline_new.py b/experimenter/evaluate_pipeline_new.py
similarity index 96%
rename from experimenter/execute_pipeline_new.py
rename to experimenter/evaluate_pipeline_new.py
index e6d1c7d..2fc4eb3 100644
--- a/experimenter/execute_pipeline_new.py
+++ b/experimenter/evaluate_pipeline_new.py
@@ -9,7 +9,7 @@
 from data_preparation_pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_ID, SCORING_PIPELINE_ID

-def execute_pipeline_on_problem(
+def evaluate_pipeline_on_problem(
     pipe: Pipeline,
     problem: ProblemReference,
     random_seed: int):
@@ -23,10 +23,10 @@
     output_run_path = '-'
     data_random_seed = random_seed

-    execute_pipeline_via_d3m_cli(pipeline_path, problem_path, input_path,
+    evaluate_pipeline_via_d3m_cli(pipeline_path, problem_path, input_path,
         output_run_path, data_random_seed)

-def execute_pipeline_via_d3m_cli(pipeline: str,
+def evaluate_pipeline_via_d3m_cli(pipeline: str,
     problem: str,
@@ -197,4 +197,4 @@

 if __name__ == '__main__':
     path = 'README.md'
-    execute_pipeline_via_d3m_cli(path,path,path,path,1,[(1,2),(3,4)])
+    evaluate_pipeline_via_d3m_cli(path,path,path,path,1,[(1,2),(3,4)])
diff --git a/experimenter/problem_new.py b/experimenter/problem_new.py
new file mode 100644
index 0000000..e69de29

From a0c9991feb6660718c6759448e3f5aff15629ef9 Mon Sep 17 00:00:00 2001
From: Benson Manner
Date: Fri, 29 Jan 2021 10:31:51 -0700
Subject: [PATCH 08/44] Bug fixes and queueing

---
 experimenter/cli.py              |  7 +++----
 experimenter/modify_generator.py | 16 ++++++++--------
 experimenter/query.py            |  3 ++-
 experimenter/utils.py            | 22 +++++++++++++++++++++-
 4 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/experimenter/cli.py b/experimenter/cli.py
index a6dc2dc..cb83e9d 100644
--- a/experimenter/cli.py
+++ b/experimenter/cli.py
@@ -2,6 +2,7 @@
 import typing

 from experimenter import exceptions, queue
+from experimenter.modify_generator import ModifyGenerator


 def main(argv: typing.Sequence) -> None:
@@ -172,15 +173,13 @@ def configure_modify_parser(parser: argparse.ArgumentParser) -> None:
     primitive_swap_subparser.add_argument(
         '--swap_primitive_id',
         help='The id of the primitive to swap in',
-        default=None
+        default=None,
         type=str)


 def modify_handler(arguments: argparse.Namespace, parser: argparse.ArgumentParser) -> None:
     modify_type = arguments.modify_type
-    modify_type_parser = parser._subparsers._group_actions[0].choices[modify_type]
-    modify_arguments = modify_type_parser.parse_args(argv[1:])
-    modify_generator = ModifyGenerator(modify_type, arguments.max-jobs, modify_arguments)
+    modify_generator = ModifyGenerator(modify_type, arguments.max_jobs, arguments)
     #now run the enqueuer part
     queue.enqueue_jobs(modify_generator, arguments.queue_host, arguments.queue_port)

diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py
index 3f9fd36..f344581 100644
--- a/experimenter/modify_generator.py
+++ b/experimenter/modify_generator.py
@@ -1,5 +1,5 @@
-from query import query_on_seeds, query_on_primitive
-from . import queue
+from experimenter.query import query_on_seeds, query_on_primitive
+from experimenter import queue
+from random import randint
 import d3m.metadata.pipeline


@@ -7,14 +7,14 @@ class ModifyGenerator:
     """ Generator to be used for creating modified pipelines based on existing
         pipelines in the database
     """
-    def __init__(self, modify_type: str='random-seed', max_jobs: int=None, *args):
+    def __init__(self, modify_type: str='random-seed', max_jobs: int=None, args=None):
         self.args = args
         #initialize commonly used variables
         self.modifier_type = modify_type
         self.max_jobs = max_jobs
         self.num_complete = 0
         #run the query on initializing to define the query results
-        self.query_results = self._query(self.modifier_type, self.args)
+        self.query_results = self._query(self.args)


     def __next__(self):
@@ -36,20 +36,20 @@ def __iter__(self):
         return self


-    def _query(self, *args):
+    def _query(self, args):
         if (self.modifier_type=='random-seed'):
-            return query_on_seeds(args.pipeline_id, args.seed_limit, args.submitter):
+            return query_on_seeds(args.pipeline_id, args.seed_limit, args.submitter)
         if (self.modifier_type=='swap-primitive'):
             return query_on_primitive(args.primitive_id, args.limit_indeces)
         else:
             raise ValueError("This type of modification is not yet an option")


-    def _modify(self, query_args: dict, *args):
+    def _modify(self, query_args: dict, args):
         if self.modifier_type=='random-seed':
             return self._modify_random_seed(args.seed_limit, query_args)
         if self.modifier_type=='swap-primitive':
-            return self._modify_swap_primitive(args.swap_primitive_id ,query_args)
+            return self._modify_swap_primitive(args.swap_primitive_id, query_args)
         else:
             raise ValueError("This type of modification is not yet an option")

diff --git a/experimenter/query.py b/experimenter/query.py
index ec95f58..14d368a 100644
--- a/experimenter/query.py
+++ b/experimenter/query.py
@@ -1,7 +1,7 @@
 from elasticsearch import Elasticsearch
 from elasticsearch_dsl import Search, Q
 from tqdm import tqdm
-from experimenter.utils import get_problem_parent_dir, build_problem_reference
+from experimenter.utils import build_problem_reference
@@ -52,6 +52,7 @@ def query_on_primitive(primitive_id: str, limit_indexes=False):
         locs = locs[0]

     for problem_id in problem_ids:
+
         yield {'pipeline': pipeline.to_dict(), 'problem_ref': build_problem_reference(problem_id), 'location': locs, 'tested_seeds': random_seeds}

diff --git a/experimenter/utils.py b/experimenter/utils.py
index ed429af..b8a1bea 100644
--- a/experimenter/utils.py
+++ b/experimenter/utils.py
@@ -10,7 +10,8 @@
 import docker

 from d3m.metadata import problem as problem_module
-
+from experimenter.problem import ProblemReference
+from d3m.utils import get_datasets_and_problems

 DEFAULT_DATASET_DIR = "/datasets/training_datasets/LL0"

@@ -29,6 +30,25 @@ def get_dataset_doc_path(
     )


+def get_problem_parent_dir(problem_id: str):
+    """
+    Getting the problem parent directory based on the given problem id and
+    DEFAULT_DATASET_DIR
+    """
+    dir_name = problem_id
+    if any([x in problem_id for x in {'_problem', '_solution', '_dataset'}]):
+        dir_name = '_'.join(problem_id.split('_')[:-1])
+    path_chunks = get_problem_path(problem_id).split('/')
+    return '/'.join(path_chunks[:path_chunks.index(dir_name)+1])
+
+
+def build_problem_reference(problem_id: str):
+    parent_dir = get_problem_parent_dir(problem_id)
+    dir_id = parent_dir.split('/')[-1]
+    enclosing_dir = '/'.join(parent_dir.split('/')[:-1])
+    return ProblemReference(dir_id, '', enclosing_dir)
+
+
 def get_dataset_doc(dataset_name: str, dataset_dir: str = DEFAULT_DATASET_DIR) -> dict:
     """
     Gets a dataset doc from a path and loads it

From a4cc817faa8b082d5813ee11118c0602daafa4c1 Mon Sep 17 00:00:00 2001
From: Benson Manner
Date: Fri, 12 Feb 2021 07:06:46 -0700
Subject: [PATCH 09/44] Syntax fixes and return paths from query

---
 experimenter/evaluate_pipeline_new.py | 21 ++------------------
 experimenter/modify_generator.py      |  6 +++---
 experimenter/query.py                 | 14 +++++++-------
 3 files changed, 12 insertions(+), 29 deletions(-)

diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/evaluate_pipeline_new.py
index 2fc4eb3..39312da 100644
--- a/experimenter/evaluate_pipeline_new.py
+++ b/experimenter/evaluate_pipeline_new.py
@@ -7,24 +7,7 @@
 from d3m.metadata.pipeline import Pipeline
 from d3m import cli

-from data_preparation_pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_ID, SCORING_PIPELINE_ID
-
-def evaluate_pipeline_on_problem(
-    pipe: Pipeline,
-    problem: ProblemReference,
-    random_seed: int):
-    """ TODO: function one-liner
-
-    TODO doc
-    """
-    pipeline_path = pipeline.id
-    problem_path = problem.path
-    input_path = problem.dataset_doc_path
-    output_run_path = '-'
-    data_random_seed = random_seed
-
-    evaluate_pipeline_via_d3m_cli(pipeline_path, problem_path, input_path,
-        output_run_path, data_random_seed)
+from experimenter.data_preparation_pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_ID, SCORING_PIPELINE_ID

 def evaluate_pipeline_via_d3m_cli(pipeline: str,
     problem: str,
@@ -134,7 +117,7 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str,
     if (scoring_pipeline):
         if (not isinstance(scoring_pipeline, str)):
             raise TypeError('\'{}\' param not of type \'{}\''.format('scoring_pipeline','str'))
-        if (not os.path.isfile(scoring_pipeline) not is_valid_uuid(scoring_pipeline)):
+        if (not os.path.isfile(scoring_pipeline) and not is_valid_uuid(scoring_pipeline)):
            raise ValueError('\'{}\' param not a file path'.format('scoring_pipeline'))
         args.extend(('--scoring-pipeline', scoring_pipeline))

diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py
index f344581..4a6c1ec 100644
--- a/experimenter/modify_generator.py
+++ b/experimenter/modify_generator.py
@@ -1,7 +1,7 @@
 from experimenter.query import query_on_seeds, query_on_primitive
 from experimenter import queue
 import d3m.metadata.pipeline
-
+from experimenter.evaluate_pipeline_new import evaluate_pipeline_via_d3m_cli as evaluate_pipeline

 class ModifyGenerator:
     """ Generator to be used for creating modified pipelines based on existing
@@ -22,7 +22,7 @@ def __next__(self):
             #iterate through modifier results
             for job_args in self._modify(query_result, self.args):
-                job = queue.make_job(execute_pipeline_on_problem, job_args)
+                job = queue.make_job(evaluate_pipeline, job_args)
@@ -78,7 +78,7 @@ def _modify_random_seed(self, seed_limit, query_args):
             used_seeds.append(new_seed)
             #yield the necessary job requirements
-            yield query_args.pipeline, query_args.problem_ref, new_seed
+            yield query_args.pipeline, query_args.problem_path, query_args.dataset_doc_path, '-', new_seed

diff --git a/experimenter/query.py b/experimenter/query.py
14d368a..41f48cc 100644 --- a/experimenter/query.py +++ b/experimenter/query.py @@ -1,7 +1,7 @@ from elasticsearch import Elasticsearch from elasticsearch_dsl import Search, Q from tqdm import tqdm -from experimenter.utils import build_problem_reference +from experimenter.utils import get_problem_path, get_dataset_doc_path HOST = 'https://metalearning.datadrivendiscovery.org/es' CONNECTION = Elasticsearch(hosts=[HOST], timeout=300) @@ -43,7 +43,7 @@ def query_on_primitive(primitive_id: str, limit_indexes=False): pipeline_search = Search(using=CONNECTION, index='pipelines').query(nested_query) for pipeline in pipeline_search.scan(): - problem_ids, random_seeds = scan_pipeline_runs(pipeline.id) + results = scan_pipeline_runs(pipeline.id) locs = [i for i, step in enumerate(pipeline.steps) if primitive_id == step.primitive.id] if limit_indexes == 'last': @@ -51,9 +51,9 @@ def query_on_primitive(primitive_id: str, limit_indexes=False): elif limit_indexes == 'first': locs = locs[0] - for problem_id in problem_ids: + for (problem_id, dataset_name), random_seeds in results.items(): - yield {'pipeline': pipeline.to_dict(), 'problem_ref': build_problem_reference(problem_id), 'location': locs, 'tested_seeds': random_seeds} + yield {'pipeline': pipeline.id, 'problem_path': get_problem_path(problem_id), 'location': locs, 'dataset_doc_path': get_dataset_doc_path(dataset_name), 'tested_seeds': random_seeds} def query_on_seeds(pipeline_id: str=None, limit: int=None, submitter: str='byu'): pipeline_search = Search(using=CONNECTION, index='pipelines') @@ -64,10 +64,10 @@ def query_on_seeds(pipeline_id: str=None, limit: int=None, submitter: str='byu') for pipeline in pipeline_search.scan(): results = scan_pipeline_runs(pipeline.id, submitter) - for (problem_id, dataset_id), random_seeds in results.items(): + for (problem_id, dataset_name), random_seeds in results.items(): if limit and len(random_seeds) > limit: continue - yield {'pipeline': pipeline.to_dict(), 'problem_ref': build_problem_reference(problem_id), 'tested_seeds': random_seeds} + yield {'pipeline': pipeline.id, 'problem_path': get_problem_path(problem_id), 'dataset_doc_path': get_dataset_doc_path(dataset_name), 'tested_seeds': random_seeds} def scan_pipeline_runs(pipeline_id, submitter=None): pipeline_run_search = Search(using=CONNECTION, index='pipeline_runs') \ @@ -80,7 +80,7 @@ def scan_pipeline_runs(pipeline_id, submitter=None): results = dict() for pipeline_run in pipeline_run_search.scan(): for dataset in pipeline_run.datasets: - dataset_prob_tuple = (pipeline_run.problem.id, dataset.id) + dataset_prob_tuple = (pipeline_run.problem.id, dataset.name) results[dataset_prob_tuple] = results.get(dataset_prob_tuple, set()) results[dataset_prob_tuple].add(pipeline_run.random_seed) return results From 991ad7fa7d612754a90645dcabe5a3e368a3e9b7 Mon Sep 17 00:00:00 2001 From: Benson Manner Date: Fri, 12 Feb 2021 08:32:42 -0700 Subject: [PATCH 10/44] setup.py updates --- experimenter/query.py | 1 - setup.py | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/experimenter/query.py b/experimenter/query.py index 41f48cc..1cfd3a2 100644 --- a/experimenter/query.py +++ b/experimenter/query.py @@ -1,6 +1,5 @@ from elasticsearch import Elasticsearch from elasticsearch_dsl import Search, Q -from tqdm import tqdm from experimenter.utils import get_problem_path, get_dataset_doc_path HOST = 'https://metalearning.datadrivendiscovery.org/es' diff --git a/setup.py b/setup.py index b22b2b4..7f1457f 100644 --- a/setup.py +++ b/setup.py @@ -14,5 +14,7 
@@ 'docker>=4.4.0<4.5.0', 'redis>=3.5.0<3.6.0', 'rq>=1.7.0<1.8.0', + 'elasticsearch_dsl>=7.0.0<8.0.0', + 'elastcisearch>=7.0.0<8.0.0', ], ) From 83cceb461597c7b37906cd7b33843d25d799276d Mon Sep 17 00:00:00 2001 From: Joseph Clark Date: Thu, 18 Feb 2021 17:10:19 -0700 Subject: [PATCH 11/44] added function to save pipeline_run docs to DB. --- ...ipeline_new.py => execute_pipeline_new.py} | 83 +++++++++++++++---- 1 file changed, 66 insertions(+), 17 deletions(-) rename experimenter/{evaluate_pipeline_new.py => execute_pipeline_new.py} (77%) diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/execute_pipeline_new.py similarity index 77% rename from experimenter/evaluate_pipeline_new.py rename to experimenter/execute_pipeline_new.py index 2fc4eb3..9afbb79 100644 --- a/experimenter/evaluate_pipeline_new.py +++ b/experimenter/execute_pipeline_new.py @@ -1,4 +1,5 @@ import itertools as it +import json import os from typing import Any, List, Tuple @@ -7,24 +8,76 @@ from d3m.metadata.pipeline import Pipeline from d3m import cli +from experimenter.databases.d3m_mtl import D3MMtLDB from data_preparation_pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_ID, SCORING_PIPELINE_ID -def evaluate_pipeline_on_problem( - pipe: Pipeline, - problem: ProblemReference, - random_seed: int): - """ TODO: function one-liner +def save_pipeline_run_to_d3m_db(pipeline_run_path: str): + """ TODO: one-liner + + TODO: description + + Required Arguments: + --------------------------------- + pipeline_run_path -- path to pipeline_run doc to save + + Optional Arguments: + --------------------------------- + + Raises: + --------------------------------- + TODO + + Returns: + --------------------------------- + TODO + + """ + d3m_db = D3MMtLDB() + + pipeline_run_save_response = D3MMtLDB().save_pipeline_run(pipeline_run_path) - TODO doc +def evaluate_pipeline_on_problem(pipeline_path: str, + problem_path: str, + input_path: str, + data_random_seed: int): + """ TODO: one-liner + + TODO: description + + Required Arguments: + --------------------------------- + pipeline_path -- path to pipeline doc + problem_path -- path to problem doc + input_path -- path to input full data + data_random_seed -- random seed to be used for data preparation + + Optional Arguments: + --------------------------------- + + Raises: + --------------------------------- + TODO + + Returns: + --------------------------------- + TODO """ - pipeline_path = pipeline.id - problem_path = problem.path - input_path = problem.dataset_doc_path - output_run_path = '-' - data_random_seed = random_seed + output_run_path = [] + + with open(pipeline_path, 'r') as pipeline: + output_run_path.append(pipeline['properties']['digest']) + with open(problem_path, 'r') as problem: + output_run_path.append(problem['properties']['digest']) + with open(input_path, 'r') as input_f: + output_run_path.append(input_f['properties']['digest']) - evaluate_pipeline_via_d3m_cli(pipeline_path, problem_path, input_path, - output_run_path, data_random_seed) + output_run_path = '_'.join(output_run_path) + + execute_pipeline_via_d3m_cli(pipeline=pipeline_path, problem=problem_path, + input=input_path, output_run=output_run_path, + data_random_seed=data_random_seed) + + save_pipeline_run_to_d3m_db(output_run_path) def evaluate_pipeline_via_d3m_cli(pipeline: str, problem: str, @@ -194,7 +247,3 @@ def is_valid_uuid(uuid_to_test: str, version=4): except Exception: return False return str(uuid_obj) == uuid_to_test - -if __name__ == '__main__': - path = 'README.md' - 
evaluate_pipeline_via_d3m_cli(path,path,path,path,1,[(1,2),(3,4)]) From 5c4bb388cbaa127ba0f3f5fdff3ae2b6b3a5626a Mon Sep 17 00:00:00 2001 From: Joseph Clark Date: Fri, 19 Feb 2021 12:54:11 -0700 Subject: [PATCH 12/44] updated documentation --- experimenter/execute_pipeline_new.py | 201 ++++++++++++++++----------- 1 file changed, 117 insertions(+), 84 deletions(-) diff --git a/experimenter/execute_pipeline_new.py b/experimenter/execute_pipeline_new.py index 9afbb79..e9b02ec 100644 --- a/experimenter/execute_pipeline_new.py +++ b/experimenter/execute_pipeline_new.py @@ -12,55 +12,54 @@ from data_preparation_pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_ID, SCORING_PIPELINE_ID def save_pipeline_run_to_d3m_db(pipeline_run_path: str): - """ TODO: one-liner + """ + Saves a pipeline run document to the d3m database. - TODO: description - - Required Arguments: - --------------------------------- - pipeline_run_path -- path to pipeline_run doc to save - - Optional Arguments: - --------------------------------- + Parameters + ---------- + pipeline_run_path : path_like str + path to pipeline_run document - Raises: - --------------------------------- - TODO - Returns: - --------------------------------- + ---------- TODO + Raises: + ---------- + TODO """ d3m_db = D3MMtLDB() - - pipeline_run_save_response = D3MMtLDB().save_pipeline_run(pipeline_run_path) + return D3MMtLDB().save_pipeline_run(pipeline_run_path) def evaluate_pipeline_on_problem(pipeline_path: str, problem_path: str, input_path: str, data_random_seed: int): - """ TODO: one-liner - - TODO: description + """ + Evaluate pipeline on problem. + A less verbose form of running d3m's runtime cli 'evaluate' command. + See 'evaluate_pipeline_via_d3m_cli' for more options for running + the 'evaluate' command. - Required Arguments: - --------------------------------- - pipeline_path -- path to pipeline doc - problem_path -- path to problem doc - input_path -- path to input full data - data_random_seed -- random seed to be used for data preparation + Parameters + ---------- + pipeline_path : path_like str + path to pipeline doc + problem_path : path_like str + path to problem doc + input_path : path_like str + path to input full data + data_random_seed : int + random seed to be used for data preparation - Optional Arguments: - --------------------------------- + Returns: + ---------- + None Raises: --------------------------------- - TODO - - Returns: - --------------------------------- - TODO + OSError + when a file cannot be opened """ output_run_path = [] @@ -71,7 +70,7 @@ def evaluate_pipeline_on_problem(pipeline_path: str, with open(input_path, 'r') as input_f: output_run_path.append(input_f['properties']['digest']) - output_run_path = '_'.join(output_run_path) + output_run_path = '_'.join(output_run_path) + '.json' execute_pipeline_via_d3m_cli(pipeline=pipeline_path, problem=problem_path, input=input_path, output_run=output_run_path, @@ -93,40 +92,77 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, scores: str = None, scoring_random_seed: int = None, data_split_file: str = None): - """ TODO: function one-liner - - TODO: function summary - - # data_pipeline_path - 10 fold cross validation default + """ + Evaluate pipeline on problem using d3m's runtime cli. + Wrapper function to execute d3m's runtime cli 'evaluate' command. + Arguments mirror the same arguments using the cli. 
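For orientation, the wrapper assembles the same argument vector it later hands to cli.main, i.e. the d3m runtime 'evaluate' command. A minimal equivalent invocation might look like the sketch below; the document paths are hypothetical placeholders, and the flags shown are exactly the ones the function body builds up:

    from d3m import cli

    # Hypothetical paths; the flags mirror the ones assembled by the wrapper.
    # CLI argument values must be strings, including the random seed.
    cli.main([
        'd3m', 'runtime', 'evaluate',
        '--pipeline', 'pipelines/example_pipeline.json',
        '--problem', 'datasets/example/problemDoc.json',
        '--input', 'datasets/example/datasetDoc.json',
        '--output-run', 'pipeline_run.yml',
        '--data-random-seed', '42',
    ])
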
- Required Arguments: - --------------------------------- - pipeline -- TODO: arg doc - problem -- TODO: arg doc - input -- TODO: arg doc - output_run -- TODO: arg doc - data_random_seed -- TODO: arg doc - - Optional Arguments: - --------------------------------- - data_params -- TODO: arg doc - data_pipeline -- TODO: arg doc - scoring_pipeline -- TODO: arg doc - input_run -- TODO: arg doc - metric -- TODO: arg doc - scoring_params -- TODO: arg doc - scores -- TODO: arg doc - scoring_random_seed -- TODO: arg doc - data_split_file -- TODO: arg doc - - Raises: - ------- - TypeError: TODO: doc - ValueError: TODO: doc + Parameters + ---------- + pipeline : path_like or uuid4 str + path to pipeline doc or pipeline ID + problem : path_like str + path to problem doc + input : path_like str + path to input full data + output_run : path_like str or '-' + path where pipeline_run doc + will be saved. + use '-' for stdin + data_random_seed : int + random seed to use for + data preparation + data_params : list of tuples, optional + hyper-parameter names and values + for data preparation. + None by default + data_pipeline : path_like str or uuid4 str, optional + path to data preparation pipeline file + or pipeline ID. + K_FOLD_TABULAR_SPLIT_PIPELINE_ID by default + scoring_pipeline : path_like str or uuid4 str, optional + path to scoring pipeline file + or pipeline ID. + SCORING_PIPELINE_ID by default + input_run : path_like str or '-', optional + path to pipeline_run file + with configuration. + use '-' for stdin. + None by default + metric : str, optional + metric to use. + Metric from problem by default + scoring_params : list of tuples, optional + hyper-parameter names and values + for scoring pipeline. + None by default + scores : path_like str, optional + path to save scores. + None by default + scoring_random_seed : int, optional + random seed to use for scoring. + None by default + data_split_file : path_like str, optional + reads the split file and populates + "primary_index_values" hyper-parameter + for data preparation pipeline with values + from the "d3mIndex" column corresponding + to the test data. + use '-' for stdin. + None by default Return: ------- - TODO: return doc + None + + Raises: + ------- + TypeError + when parameter value has + incorrect type + ValueError + when parameter value is + invalid """ args = ['d3m', 'runtime', 'evaluate'] @@ -146,7 +182,7 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, raise TypeError('\'{}\' param not of type \'{}\''.format('data_random_seed','int')) if (not os.path.isfile(pipeline) and not is_valid_uuid(pipeline)): - raise ValueError('\'{}\' param not a file path'.format('pipeline')) + raise ValueError('\'{}\' param not a file path or pipeline ID'.format('pipeline')) if (not os.path.isfile(problem)): # TODO: check for URI raise ValueError('\'{}\' param not a file path'.format('problem')) @@ -154,9 +190,6 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, if (not os.path.isfile(input)): # TODO: check for URI raise ValueError('\'{}\' param not a file path'.format('input')) - if (output_run != '-'): # TODO: output_run value check. how to check for nonexistent file? parse? 
- raise ValueError('\'{}\' param invalid: {\'-\'}'.format('output_run')) - args.extend(('--pipeline ', pipeline)) args.extend(('--problem', problem)) args.extend(('--input', input)) @@ -181,14 +214,14 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, if (not isinstance(data_pipeline, str)): raise TypeError('\'{}\' param not of type \'{}\''.format('data_pipeline','str')) if (not os.path.isfile(data_pipeline) and not is_valid_uuid(data_pipeline)): - raise ValueError('\'{}\' param not a file path'.format('data_pipeline')) + raise ValueError('\'{}\' param not a file path or pipeline ID'.format('data_pipeline')) args.extend(('--data-pipeline', data_pipeline)) if (scoring_pipeline): if (not isinstance(scoring_pipeline, str)): raise TypeError('\'{}\' param not of type \'{}\''.format('scoring_pipeline','str')) if (not os.path.isfile(scoring_pipeline) not is_valid_uuid(scoring_pipeline)): - raise ValueError('\'{}\' param not a file path'.format('scoring_pipeline')) + raise ValueError('\'{}\' param not a file path or pipeline ID'.format('scoring_pipeline')) args.extend(('--scoring-pipeline', scoring_pipeline)) if (metric): @@ -204,7 +237,6 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, args.extend(('--scoring-param', scoring_param[0], scoring_param[1])) if (scores): - # TODO: how to check for nonexistent file? parse? args.extend(('--scores', scores_path)) if (scoring_random_seed): @@ -215,7 +247,7 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, if (data_split_file): if (not isinstance(data_split_file, str)): raise TypeError('\'{}\' param not of type \'{}\''.format('data_split_file','str')) - if (not os.path.isfile(data_split_file)): + if (data_split_file != '-' and not os.path.isfile(data_split_file)): raise ValueError('\'{}\' param invalid value: {file_path, \'-\'}'.format('data_split_file')) args.extend(('--data-split-file', data_split_file)) @@ -225,25 +257,26 @@ def is_valid_uuid(uuid_to_test: str, version=4): """ Check if uuid_to_test is a valid UUID. - Parameters - ---------- + Parmaters + ------- uuid_to_test : str + str to test if valid uuid version : {1, 2, 3, 4} - + version of uuid for which to test + Returns ------- - `True` if uuid_to_test is a valid UUID, otherwise `False`. - - Examples - -------- - >>> is_valid_uuid('c9bf9e57-1685-4c89-bafb-ff5af830be8a') - True - >>> is_valid_uuid('c9bf9e58') - False + bool + `True` if uuid_to_test is a valid UUID, + otherwise `False` + + Raises: + ------- + TypeError + when str is not valid uuid """ - try: uuid_obj = UUID(uuid_to_test, version=version) - except Exception: + except TypeError: return False return str(uuid_obj) == uuid_to_test From 2635fddc906580bd573d25631e31f6d4cd997ca0 Mon Sep 17 00:00:00 2001 From: Benson Manner Date: Fri, 19 Feb 2021 14:06:37 -0700 Subject: [PATCH 13/44] setup.py add dependencies --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 7f1457f..7d11df2 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ 'docker>=4.4.0<4.5.0', 'redis>=3.5.0<3.6.0', 'rq>=1.7.0<1.8.0', - 'elasticsearch_dsl>=7.0.0<8.0.0', - 'elastcisearch>=7.0.0<8.0.0', + 'elasticsearch>=7.0.0<8.0.0', + 'elasticsearch_dsl>=7.0.0<8.0.0' ], ) From ba5089af57fd1c8b56e25ba88f55981b69a7a03f Mon Sep 17 00:00:00 2001 From: Joseph Clark Date: Fri, 19 Feb 2021 14:16:17 -0700 Subject: [PATCH 14/44] fixed condition typo. 
renamed file --- .../{execute_pipeline_new.py => evaluate_pipeline_new.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename experimenter/{execute_pipeline_new.py => evaluate_pipeline_new.py} (99%) diff --git a/experimenter/execute_pipeline_new.py b/experimenter/evaluate_pipeline_new.py similarity index 99% rename from experimenter/execute_pipeline_new.py rename to experimenter/evaluate_pipeline_new.py index e9b02ec..93909f1 100644 --- a/experimenter/execute_pipeline_new.py +++ b/experimenter/evaluate_pipeline_new.py @@ -220,7 +220,7 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, if (scoring_pipeline): if (not isinstance(scoring_pipeline, str)): raise TypeError('\'{}\' param not of type \'{}\''.format('scoring_pipeline','str')) - if (not os.path.isfile(scoring_pipeline) not is_valid_uuid(scoring_pipeline)): + if (not os.path.isfile(scoring_pipeline) and not is_valid_uuid(scoring_pipeline)): raise ValueError('\'{}\' param not a file path or pipeline ID'.format('scoring_pipeline')) args.extend(('--scoring-pipeline', scoring_pipeline)) From 62d639ea0aa24d43effc6a0a919ec9bc59f29a36 Mon Sep 17 00:00:00 2001 From: Benson Manner Date: Fri, 19 Feb 2021 14:30:42 -0700 Subject: [PATCH 15/44] Update job maker in the seed swap functionality --- experimenter/modify_generator.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py index 4a6c1ec..5942802 100644 --- a/experimenter/modify_generator.py +++ b/experimenter/modify_generator.py @@ -1,7 +1,7 @@ from experimenter.query import query_on_seeds, query_on_primitive from experimenter import queue import d3m.metadata.pipeline -from experimenter.evaluate_pipeline_new import evaluate_pipeline_via_d3m_cli as evaluate_pipeline +from experimenter.evaluate_pipeline_new import evalute_pipeline_on_problem as evaluate_pipeline class ModifyGenerator: """ Generator to be used for creating modified pipelines based on existing @@ -21,8 +21,12 @@ def __next__(self): #iterate through query results for query_result in self.query_results: #iterate through modifier results - for job_args in self._modify(query_result, self.args): - job = queue.make_job(evaluate_pipeline, jobs_args) + for pipeline_path, problem_path, dataset_doc_path, seed in self._modify(query_result, self.args): + job = queue.make_job(evaluate_pipeline, + pipeline_path=pipeline_path, + problem_path=problem_path, + input_path=dataset_doc_path, + data_random_seed=seed) self.num_complete += 1 #check to run until the generator stops iterating (if no input for num_pipelines_to_run) if (self.max_jobs): @@ -78,7 +82,7 @@ def _modify_random_seed(self, seed_limit, query_args): num_run += 1 used_seeds.append(new_seed) #yield the necessary job requirements - yield query_args.pipeline, query_args.problem_path, query_args.dataset_doc_path, '-', new_seed + yield query_args.pipeline, query_args.problem_path, query_args.dataset_doc_path, new_seed def _modify_swap_primitive(self, swap_pipeline, query_args): From 37d94bc9c3348d5a9c1500d8c4ee8227666c32ea Mon Sep 17 00:00:00 2001 From: Joseph Clark Date: Mon, 22 Feb 2021 12:40:45 -0700 Subject: [PATCH 16/44] implemented review suggestions --- .../3c11d171-e2ad-4d26-a034-04f3b062306c.yml | 82 --------- .../79ce71bd-db96-494b-a455-14f2e2ac5040.yml | 84 --------- .../9c18472e-fff7-4129-93f6-1ab996e82adb.yml | 84 --------- .../data_preparation_pipelines/__init__.py | 31 ---- .../c8ed65df-aa68-4ee0-bbb5-c5f76a40bcf8.yml | 82 --------- 
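Stepping back to the job maker just updated in modify_generator.py: the seed-swap path fans each query result out into one evaluation job per previously untested random seed. Stripped of the queue plumbing, the fan-out inside _modify_random_seed amounts to roughly the sketch below (fresh_seeds is an illustrative name, and the randint bounds are an assumption; the real range sits outside the visible hunks):

    from random import randint

    def fresh_seeds(tested_seeds, seed_limit):
        # Draw random seeds until `seed_limit` distinct seeds, counting the
        # already-tested ones, have been covered; duplicates are skipped.
        used = set(tested_seeds)
        while len(used) < seed_limit:
            new_seed = randint(1, 2**32 - 1)  # illustrative bounds
            if new_seed in used:
                continue
            used.add(new_seed)
            yield new_seed

Each yielded seed is then paired with the query result's pipeline, problem path, and dataset doc path and wrapped via queue.make_job, as the hunk above shows.
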
.../f596cd77-25f8-4d4c-a350-bb30ab1e58f6.yml | 31 ---- experimenter/evaluate_pipeline_new.py | 165 +----------------- setup.py | 1 + 8 files changed, 9 insertions(+), 551 deletions(-) delete mode 100644 experimenter/data_preparation_pipelines/3c11d171-e2ad-4d26-a034-04f3b062306c.yml delete mode 100644 experimenter/data_preparation_pipelines/79ce71bd-db96-494b-a455-14f2e2ac5040.yml delete mode 100644 experimenter/data_preparation_pipelines/9c18472e-fff7-4129-93f6-1ab996e82adb.yml delete mode 100644 experimenter/data_preparation_pipelines/__init__.py delete mode 100644 experimenter/data_preparation_pipelines/c8ed65df-aa68-4ee0-bbb5-c5f76a40bcf8.yml delete mode 100644 experimenter/data_preparation_pipelines/f596cd77-25f8-4d4c-a350-bb30ab1e58f6.yml diff --git a/experimenter/data_preparation_pipelines/3c11d171-e2ad-4d26-a034-04f3b062306c.yml b/experimenter/data_preparation_pipelines/3c11d171-e2ad-4d26-a034-04f3b062306c.yml deleted file mode 100644 index 695f53c..0000000 --- a/experimenter/data_preparation_pipelines/3c11d171-e2ad-4d26-a034-04f3b062306c.yml +++ /dev/null @@ -1,82 +0,0 @@ -id: 3c11d171-e2ad-4d26-a034-04f3b062306c -schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json -source: - name: Mitar -created: "2018-07-28T01:24:39.642266Z" -name: Train-test split of tabular datasets -description: | - Train-test split of tabular datasets. -inputs: - - name: folds - - name: full dataset -outputs: - - name: train datasets - data: steps.0.produce - - name: test datasets - data: steps.2.produce - - name: score datasets - data: steps.1.produce -steps: - # Step 0. - - type: PRIMITIVE - primitive: - id: 3fcc6dc4-6681-4c86-948e-066d14e7d803 - version: 0.1.0 - python_path: d3m.primitives.evaluation.train_score_dataset_split.Common - name: Train-score tabular dataset splits - arguments: - inputs: - type: CONTAINER - data: inputs.0 - dataset: - type: CONTAINER - data: inputs.1 - outputs: - - id: produce - - id: produce_score_data - # Step 1. We redact privileged attributes for both score and test splits. - - type: PRIMITIVE - primitive: - id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 - version: 0.2.0 - python_path: d3m.primitives.evaluation.redact_columns.Common - name: Redact columns for evaluation - arguments: - inputs: - type: CONTAINER - data: steps.0.produce_score_data - outputs: - - id: produce - hyperparams: - semantic_types: - type: VALUE - data: - - https://metadata.datadrivendiscovery.org/types/PrivilegedData - add_semantic_types: - type: VALUE - data: - - https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData - - https://metadata.datadrivendiscovery.org/types/MissingData - # Step 2. We further redact targets in test split. 
- - type: PRIMITIVE - primitive: - id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 - version: 0.2.0 - python_path: d3m.primitives.evaluation.redact_columns.Common - name: Redact columns for evaluation - arguments: - inputs: - type: CONTAINER - data: steps.1.produce - outputs: - - id: produce - hyperparams: - semantic_types: - type: VALUE - data: - - https://metadata.datadrivendiscovery.org/types/TrueTarget - add_semantic_types: - type: VALUE - data: - - https://metadata.datadrivendiscovery.org/types/RedactedTarget - - https://metadata.datadrivendiscovery.org/types/MissingData diff --git a/experimenter/data_preparation_pipelines/79ce71bd-db96-494b-a455-14f2e2ac5040.yml b/experimenter/data_preparation_pipelines/79ce71bd-db96-494b-a455-14f2e2ac5040.yml deleted file mode 100644 index 6a91f91..0000000 --- a/experimenter/data_preparation_pipelines/79ce71bd-db96-494b-a455-14f2e2ac5040.yml +++ /dev/null @@ -1,84 +0,0 @@ -id: 79ce71bd-db96-494b-a455-14f2e2ac5040 -schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json -source: - name: Mitar -created: "2018-10-26T00:48:08.341897Z" -name: No split of tabular datasets -description: | - A pipeline which splits a tabular dataset in a way that for all splits it - produces the same (full) dataset. It still redacts the test split. - Useful for unsupervised learning tasks. -inputs: - - name: folds - - name: full dataset -outputs: - - name: train datasets - data: steps.0.produce - - name: test datasets - data: steps.2.produce - - name: score datasets - data: steps.1.produce -steps: - # Step 0. - - type: PRIMITIVE - primitive: - id: 48c683ad-da9e-48cf-b3a0-7394dba5e5d2 - version: 0.1.0 - python_path: d3m.primitives.evaluation.no_split_dataset_split.Common - name: No-split tabular dataset splits - arguments: - inputs: - type: CONTAINER - data: inputs.0 - dataset: - type: CONTAINER - data: inputs.1 - outputs: - - id: produce - - id: produce_score_data - # Step 1. We redact privileged attributes for both score and test splits. - - type: PRIMITIVE - primitive: - id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 - version: 0.2.0 - python_path: d3m.primitives.evaluation.redact_columns.Common - name: Redact columns for evaluation - arguments: - inputs: - type: CONTAINER - data: steps.0.produce_score_data - outputs: - - id: produce - hyperparams: - semantic_types: - type: VALUE - data: - - https://metadata.datadrivendiscovery.org/types/PrivilegedData - add_semantic_types: - type: VALUE - data: - - https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData - - https://metadata.datadrivendiscovery.org/types/MissingData - # Step 2. We further redact targets in test split. 
- - type: PRIMITIVE - primitive: - id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 - version: 0.2.0 - python_path: d3m.primitives.evaluation.redact_columns.Common - name: Redact columns for evaluation - arguments: - inputs: - type: CONTAINER - data: steps.1.produce - outputs: - - id: produce - hyperparams: - semantic_types: - type: VALUE - data: - - https://metadata.datadrivendiscovery.org/types/TrueTarget - add_semantic_types: - type: VALUE - data: - - https://metadata.datadrivendiscovery.org/types/RedactedTarget - - https://metadata.datadrivendiscovery.org/types/MissingData diff --git a/experimenter/data_preparation_pipelines/9c18472e-fff7-4129-93f6-1ab996e82adb.yml b/experimenter/data_preparation_pipelines/9c18472e-fff7-4129-93f6-1ab996e82adb.yml deleted file mode 100644 index 80e2a2c..0000000 --- a/experimenter/data_preparation_pipelines/9c18472e-fff7-4129-93f6-1ab996e82adb.yml +++ /dev/null @@ -1,84 +0,0 @@ -id: 9c18472e-fff7-4129-93f6-1ab996e82adb -schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json -source: - name: Mitar -created: "2018-10-27T01:30:10.245934Z" -name: Fixed split of tabular datasets -description: | - A pipeline which splits a tabular dataset in a way that uses for the test - (score) split a fixed list of primary index values or row indices of the main - resource to be used. -inputs: - - name: folds - - name: full dataset -outputs: - - name: train datasets - data: steps.0.produce - - name: test datasets - data: steps.2.produce - - name: score datasets - data: steps.1.produce -steps: - # Step 0. - - type: PRIMITIVE - primitive: - id: 1654f000-2178-4520-be4c-a95bc26b8d3a - version: 0.1.0 - python_path: d3m.primitives.evaluation.fixed_split_dataset_split.Commmon - name: Fixed split tabular dataset splits - arguments: - inputs: - type: CONTAINER - data: inputs.0 - dataset: - type: CONTAINER - data: inputs.1 - outputs: - - id: produce - - id: produce_score_data - # Step 1. We redact privileged attributes for both score and test splits. - - type: PRIMITIVE - primitive: - id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 - version: 0.2.0 - python_path: d3m.primitives.evaluation.redact_columns.Common - name: Redact columns for evaluation - arguments: - inputs: - type: CONTAINER - data: steps.0.produce_score_data - outputs: - - id: produce - hyperparams: - semantic_types: - type: VALUE - data: - - https://metadata.datadrivendiscovery.org/types/PrivilegedData - add_semantic_types: - type: VALUE - data: - - https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData - - https://metadata.datadrivendiscovery.org/types/MissingData - # Step 2. We further redact targets in test split. 
- - type: PRIMITIVE - primitive: - id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 - version: 0.2.0 - python_path: d3m.primitives.evaluation.redact_columns.Common - name: Redact columns for evaluation - arguments: - inputs: - type: CONTAINER - data: steps.1.produce - outputs: - - id: produce - hyperparams: - semantic_types: - type: VALUE - data: - - https://metadata.datadrivendiscovery.org/types/TrueTarget - add_semantic_types: - type: VALUE - data: - - https://metadata.datadrivendiscovery.org/types/RedactedTarget - - https://metadata.datadrivendiscovery.org/types/MissingData diff --git a/experimenter/data_preparation_pipelines/__init__.py b/experimenter/data_preparation_pipelines/__init__.py deleted file mode 100644 index 4b52dec..0000000 --- a/experimenter/data_preparation_pipelines/__init__.py +++ /dev/null @@ -1,31 +0,0 @@ -import os.path - -SCORING_PIPELINE_ID = 'f596cd77-25f8-4d4c-a350-bb30ab1e58f6' -SCORING_PIPELINE_PATH = os.path.join( - os.path.dirname(__file__), SCORING_PIPELINE_ID + '.yml' -) -assert os.path.exists(SCORING_PIPELINE_PATH) - -NO_SPLIT_TABULAR_SPLIT_PIPELINE_ID = '79ce71bd-db96-494b-a455-14f2e2ac5040' -NO_SPLIT_TABULAR_SPLIT_PIPELINE_PATH = os.path.join( - os.path.dirname(__file__), NO_SPLIT_TABULAR_SPLIT_PIPELINE_ID + '.yml' -) -assert os.path.exists(NO_SPLIT_TABULAR_SPLIT_PIPELINE_PATH) - -FIXED_SPLIT_TABULAR_SPLIT_PIPELINE_ID = '9c18472e-fff7-4129-93f6-1ab996e82adb' -FIXED_SPLIT_TABULAR_SPLIT_PIPELINE_PATH = os.path.join( - os.path.dirname(__file__), FIXED_SPLIT_TABULAR_SPLIT_PIPELINE_ID + '.yml' -) -assert os.path.exists(FIXED_SPLIT_TABULAR_SPLIT_PIPELINE_PATH) - -TRAIN_TEST_TABULAR_SPLIT_PIPELINE_ID = '3c11d171-e2ad-4d26-a034-04f3b062306c' -TRAIN_TEST_TABULAR_SPLIT_PIPELINE_PATH = os.path.join( - os.path.dirname(__file__), TRAIN_TEST_TABULAR_SPLIT_PIPELINE_ID + '.yml' -) -assert os.path.exists(TRAIN_TEST_TABULAR_SPLIT_PIPELINE_PATH) - -K_FOLD_TABULAR_SPLIT_PIPELINE_ID = 'c8ed65df-aa68-4ee0-bbb5-c5f76a40bcf8' -K_FOLD_TABULAR_SPLIT_PIPELINE_PATH = os.path.join( - os.path.dirname(__file__), K_FOLD_TABULAR_SPLIT_PIPELINE_ID + '.yml' -) -assert os.path.exists(K_FOLD_TABULAR_SPLIT_PIPELINE_PATH) diff --git a/experimenter/data_preparation_pipelines/c8ed65df-aa68-4ee0-bbb5-c5f76a40bcf8.yml b/experimenter/data_preparation_pipelines/c8ed65df-aa68-4ee0-bbb5-c5f76a40bcf8.yml deleted file mode 100644 index 91f14f2..0000000 --- a/experimenter/data_preparation_pipelines/c8ed65df-aa68-4ee0-bbb5-c5f76a40bcf8.yml +++ /dev/null @@ -1,82 +0,0 @@ -id: c8ed65df-aa68-4ee0-bbb5-c5f76a40bcf8 -schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json -source: - name: Mitar -created: "2018-07-27T19:39:00.676949Z" -name: K-fold split of tabular datasets -description: | - K-fold split of tabular datasets for cross-validation. -inputs: - - name: folds - - name: full dataset -outputs: - - name: train datasets - data: steps.0.produce - - name: test datasets - data: steps.2.produce - - name: score datasets - data: steps.1.produce -steps: - # Step 0. - - type: PRIMITIVE - primitive: - id: bfedaf3a-6dd0-4a83-ad83-3a50fe882bf8 - version: 0.1.0 - python_path: d3m.primitives.evaluation.kfold_dataset_split.Common - name: K-fold cross-validation tabular dataset splits - arguments: - inputs: - type: CONTAINER - data: inputs.0 - dataset: - type: CONTAINER - data: inputs.1 - outputs: - - id: produce - - id: produce_score_data - # Step 1. We redact privileged attributes for both score and test splits. 
- - type: PRIMITIVE - primitive: - id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 - version: 0.2.0 - python_path: d3m.primitives.evaluation.redact_columns.Common - name: Redact columns for evaluation - arguments: - inputs: - type: CONTAINER - data: steps.0.produce_score_data - outputs: - - id: produce - hyperparams: - semantic_types: - type: VALUE - data: - - https://metadata.datadrivendiscovery.org/types/PrivilegedData - add_semantic_types: - type: VALUE - data: - - https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData - - https://metadata.datadrivendiscovery.org/types/MissingData - # Step 2. We further redact targets in test split. - - type: PRIMITIVE - primitive: - id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 - version: 0.2.0 - python_path: d3m.primitives.evaluation.redact_columns.Common - name: Redact columns for evaluation - arguments: - inputs: - type: CONTAINER - data: steps.1.produce - outputs: - - id: produce - hyperparams: - semantic_types: - type: VALUE - data: - - https://metadata.datadrivendiscovery.org/types/TrueTarget - add_semantic_types: - type: VALUE - data: - - https://metadata.datadrivendiscovery.org/types/RedactedTarget - - https://metadata.datadrivendiscovery.org/types/MissingData diff --git a/experimenter/data_preparation_pipelines/f596cd77-25f8-4d4c-a350-bb30ab1e58f6.yml b/experimenter/data_preparation_pipelines/f596cd77-25f8-4d4c-a350-bb30ab1e58f6.yml deleted file mode 100644 index e95ecd5..0000000 --- a/experimenter/data_preparation_pipelines/f596cd77-25f8-4d4c-a350-bb30ab1e58f6.yml +++ /dev/null @@ -1,31 +0,0 @@ -id: f596cd77-25f8-4d4c-a350-bb30ab1e58f6 -schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json -source: - name: Mitar -created: "2020-04-18T11:42:44.138742Z" -name: Scoring pipeline -description: |- - A general scoring pipeline. -inputs: - - name: predictions - - name: score dataset -outputs: - - name: scores - data: steps.0.produce -steps: - # Step 0. 
- - type: PRIMITIVE - primitive: - id: 799802fb-2e11-4ab7-9c5e-dda09eb52a70 - version: 0.5.0 - python_path: d3m.primitives.evaluation.compute_scores.Core - name: Compute scores given the metrics to use - arguments: - inputs: - type: CONTAINER - data: inputs.0 - score_dataset: - type: CONTAINER - data: inputs.1 - outputs: - - id: produce diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/evaluate_pipeline_new.py index 93909f1..5e4d8e1 100644 --- a/experimenter/evaluate_pipeline_new.py +++ b/experimenter/evaluate_pipeline_new.py @@ -1,15 +1,15 @@ -import itertools as it +mport itertools as it import json import os from typing import Any, List, Tuple from uuid import UUID -from d3m.metadata.pipeline import Pipeline from d3m import cli +from d3m.d3m.contrib.pipelines import (K_FOLD_TABULAR_SPLIT_PIPELINE_ID, + SCORING_PIPELINE_ID) from experimenter.databases.d3m_mtl import D3MMtLDB -from data_preparation_pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_ID, SCORING_PIPELINE_ID def save_pipeline_run_to_d3m_db(pipeline_run_path: str): """ @@ -76,22 +76,11 @@ def evaluate_pipeline_on_problem(pipeline_path: str, input=input_path, output_run=output_run_path, data_random_seed=data_random_seed) - save_pipeline_run_to_d3m_db(output_run_path) - def evaluate_pipeline_via_d3m_cli(pipeline: str, problem: str, input: str, output_run: str, - data_random_seed: int, - data_params: List[Tuple[str,Any]] = None, - data_pipeline: str = K_FOLD_TABULAR_SPLIT_PIPELINE_ID, - scoring_pipeline: str = SCORING_PIPELINE_ID, - input_run: str = None, - metric: str = None, - scoring_params: List[Tuple[str,Any]] = None, - scores: str = None, - scoring_random_seed: int = None, - data_split_file: str = None): + data_random_seed: int): """ Evaluate pipeline on problem using d3m's runtime cli. Wrapper function to execute d3m's runtime cli 'evaluate' command. @@ -112,44 +101,6 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, data_random_seed : int random seed to use for data preparation - data_params : list of tuples, optional - hyper-parameter names and values - for data preparation. - None by default - data_pipeline : path_like str or uuid4 str, optional - path to data preparation pipeline file - or pipeline ID. - K_FOLD_TABULAR_SPLIT_PIPELINE_ID by default - scoring_pipeline : path_like str or uuid4 str, optional - path to scoring pipeline file - or pipeline ID. - SCORING_PIPELINE_ID by default - input_run : path_like str or '-', optional - path to pipeline_run file - with configuration. - use '-' for stdin. - None by default - metric : str, optional - metric to use. - Metric from problem by default - scoring_params : list of tuples, optional - hyper-parameter names and values - for scoring pipeline. - None by default - scores : path_like str, optional - path to save scores. - None by default - scoring_random_seed : int, optional - random seed to use for scoring. - None by default - data_split_file : path_like str, optional - reads the split file and populates - "primary_index_values" hyper-parameter - for data preparation pipeline with values - from the "d3mIndex" column corresponding - to the test data. - use '-' for stdin. 
- None by default Return: ------- @@ -157,37 +108,19 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, Raises: ------- - TypeError - when parameter value has - incorrect type ValueError when parameter value is invalid """ args = ['d3m', 'runtime', 'evaluate'] - if (not isinstance(pipeline, str)): - raise TypeError('\'{}\' param not of type \'{}\''.format('pipeline', 'str')) - - if (not isinstance(problem_path, str)): - raise TypeError('\'{}\' param not of type \'{}\''.format('problem', 'str')) - - if (not isinstance(input, str)): - raise TypeError('\'{}\' param not of type \'{}\''.format('input', 'str')) - - if (not isinstance(output_run, str)): - raise TypeError('\'{}\' param not of type \'{}\''.format('output_run', 'str')) - - if (not isinstance(data_random_seed, int)): - raise TypeError('\'{}\' param not of type \'{}\''.format('data_random_seed','int')) - - if (not os.path.isfile(pipeline) and not is_valid_uuid(pipeline)): + if (not os.path.isfile(pipeline)): raise ValueError('\'{}\' param not a file path or pipeline ID'.format('pipeline')) - if (not os.path.isfile(problem)): # TODO: check for URI + if (not os.path.isfile(problem)): raise ValueError('\'{}\' param not a file path'.format('problem')) - if (not os.path.isfile(input)): # TODO: check for URI + if (not os.path.isfile(input)): raise ValueError('\'{}\' param not a file path'.format('input')) args.extend(('--pipeline ', pipeline)) @@ -196,87 +129,5 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, args.extend(('--output-run', output_run_path)) args.extend(('--data-random-seed', data_random_seed)) - if (input_run): - if (not isinstance(input_run, str)): - raise TypeError('\'{}\' param not of type \'{}\''.format('input_run','str')) - if (not os.path.isfile(input_run) and input_run != '-'): - raise ValueError('\'{}\' param invalid: {file_path, \'-\'}'.format('input_run')) - # TODO: input_run validation - pass - - if (data_params): - if (not isinstance(data_params, List)): - raise TypeError('\'{}\' param not of type \'{}\''.format('data_params','List')) - for data_param in data_params: - args.extend(('--data-param', data_param[0], data_param[1])) - - if (data_pipeline): - if (not isinstance(data_pipeline, str)): - raise TypeError('\'{}\' param not of type \'{}\''.format('data_pipeline','str')) - if (not os.path.isfile(data_pipeline) and not is_valid_uuid(data_pipeline)): - raise ValueError('\'{}\' param not a file path or pipeline ID'.format('data_pipeline')) - args.extend(('--data-pipeline', data_pipeline)) - - if (scoring_pipeline): - if (not isinstance(scoring_pipeline, str)): - raise TypeError('\'{}\' param not of type \'{}\''.format('scoring_pipeline','str')) - if (not os.path.isfile(scoring_pipeline) and not is_valid_uuid(scoring_pipeline)): - raise ValueError('\'{}\' param not a file path or pipeline ID'.format('scoring_pipeline')) - args.extend(('--scoring-pipeline', scoring_pipeline)) - - if (metric): - if (not isinstance(metric, str)): - raise TypeError('\'{}\' param not of type \'{}\''.format('metric','str')) - # TODO: set of valid metric args? 
- args.extend(('--metric', metric)) - - if (scoring_params): - if (not isinstance(scoring_params, List)): - raise TypeError('\'{}\' param not of type \'{}\''.format('scoring_params','List')) - for scoring_param in scoring_params: - args.extend(('--scoring-param', scoring_param[0], scoring_param[1])) - - if (scores): - args.extend(('--scores', scores_path)) - - if (scoring_random_seed): - if (not isinstance(scoring_random_seed, int)): - raise TypeError('\'{}\' param not of type \'{}\''.format('scoring_random_seed','int')) - args.extend(('--scoring-random-seed', scoring_random_seed)) - - if (data_split_file): - if (not isinstance(data_split_file, str)): - raise TypeError('\'{}\' param not of type \'{}\''.format('data_split_file','str')) - if (data_split_file != '-' and not os.path.isfile(data_split_file)): - raise ValueError('\'{}\' param invalid value: {file_path, \'-\'}'.format('data_split_file')) - args.extend(('--data-split-file', data_split_file)) - cli.main(args) - -def is_valid_uuid(uuid_to_test: str, version=4): - """ - Check if uuid_to_test is a valid UUID. - - Parmaters - ------- - uuid_to_test : str - str to test if valid uuid - version : {1, 2, 3, 4} - version of uuid for which to test - - Returns - ------- - bool - `True` if uuid_to_test is a valid UUID, - otherwise `False` - - Raises: - ------- - TypeError - when str is not valid uuid - """ - try: - uuid_obj = UUID(uuid_to_test, version=version) - except TypeError: - return False - return str(uuid_obj) == uuid_to_test + save_pipeline_run_to_d3m_db(output_run_path) diff --git a/setup.py b/setup.py index b22b2b4..884e224 100644 --- a/setup.py +++ b/setup.py @@ -12,6 +12,7 @@ install_requires=[ 'd3m', # TODO: add version bounds 'docker>=4.4.0<4.5.0', + 'mypy==0.812', 'redis>=3.5.0<3.6.0', 'rq>=1.7.0<1.8.0', ], From d308e927df80decfc8c56ce4fd1b8df2241d9d8f Mon Sep 17 00:00:00 2001 From: Eric Manner Date: Tue, 23 Feb 2021 02:16:58 +0000 Subject: [PATCH 17/44] setup.py elasticsearch and working queue/enqueue --- experimenter/cli.py | 2 +- experimenter/config.py | 14 +++++++------- experimenter/evaluate_pipeline_new.py | 4 ++-- experimenter/modify_generator.py | 14 ++++++++------ experimenter/query.py | 8 ++++---- setup.py | 6 +++--- 6 files changed, 25 insertions(+), 23 deletions(-) diff --git a/experimenter/cli.py b/experimenter/cli.py index b7e02a4..5bbd356 100644 --- a/experimenter/cli.py +++ b/experimenter/cli.py @@ -189,7 +189,7 @@ def modify_handler(arguments: argparse.Namespace, parser: argparse.ArgumentParse modify_type = arguments.modify_type modify_generator = ModifyGenerator(modify_type, arguments.max_jobs, arguments) #now run the enqueuer part - queue.enqueue_jobs(modify_generator, arguments.queue_host, arguments.queue_port) + queue.enqueue_jobs(modify_generator, arguments.queue_host, arguments.queue_port,arguments.job_timeout) def configure_update_parser(parser: argparse.ArgumentParser) -> None: diff --git a/experimenter/config.py b/experimenter/config.py index 9150182..ccb884f 100644 --- a/experimenter/config.py +++ b/experimenter/config.py @@ -17,10 +17,10 @@ D3M_DB_SUBMITTER = os.getenv("D3M_DB_SUBMITTER") D3M_DB_TOKEN = os.getenv("D3M_DB_TOKEN") -try: - MONGO_HOST = os.environ["MONGO_HOST"] - MONGO_PORT = int(os.environ["MONGO_PORT"]) - REDIS_HOST = os.environ["REDIS_HOST"] - REDIS_PORT = int(os.environ["REDIS_PORT"]) -except Exception: - logger.exception("environment variables not set") +#try: +# MONGO_HOST = os.environ["MONGO_HOST"] +# MONGO_PORT = int(os.environ["MONGO_PORT"]) +# REDIS_HOST = 
os.environ["REDIS_HOST"] +# REDIS_PORT = int(os.environ["REDIS_PORT"]) +#except Exception: +# logger.exception("environment variables not set") diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/evaluate_pipeline_new.py index 93909f1..904c3e6 100644 --- a/experimenter/evaluate_pipeline_new.py +++ b/experimenter/evaluate_pipeline_new.py @@ -9,7 +9,7 @@ from d3m import cli from experimenter.databases.d3m_mtl import D3MMtLDB -from data_preparation_pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_ID, SCORING_PIPELINE_ID +from experimenter.data_preparation_pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_ID, SCORING_PIPELINE_ID def save_pipeline_run_to_d3m_db(pipeline_run_path: str): """ @@ -72,7 +72,7 @@ def evaluate_pipeline_on_problem(pipeline_path: str, output_run_path = '_'.join(output_run_path) + '.json' - execute_pipeline_via_d3m_cli(pipeline=pipeline_path, problem=problem_path, + evaluate_pipeline_via_d3m_cli(pipeline=pipeline_path, problem=problem_path, input=input_path, output_run=output_run_path, data_random_seed=data_random_seed) diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py index 5942802..ec5106c 100644 --- a/experimenter/modify_generator.py +++ b/experimenter/modify_generator.py @@ -1,7 +1,8 @@ from experimenter.query import query_on_seeds, query_on_primitive from experimenter import queue -import d3m.metadata.pipeline -from experimenter.evaluate_pipeline_new import evalute_pipeline_on_problem as evaluate_pipeline +import d3m.metadata.pipeline +from random import randint +from experimenter.evaluate_pipeline_new import evaluate_pipeline_on_problem as evaluate_pipeline class ModifyGenerator: """ Generator to be used for creating modified pipelines based on existing @@ -22,6 +23,7 @@ def __next__(self): for query_result in self.query_results: #iterate through modifier results for pipeline_path, problem_path, dataset_doc_path, seed in self._modify(query_result, self.args): + job = queue.make_job(evaluate_pipeline, pipeline_path=pipeline_path, problem_path=problem_path, @@ -30,7 +32,7 @@ def __next__(self): self.num_complete += 1 #check to run until the generator stops iterating (if no input for num_pipelines_to_run) if (self.max_jobs): - if (self.num_complete >= self.max_jobs): + if (self.num_complete > self.max_jobs): raise StopIteration return job raise StopIteration @@ -72,7 +74,7 @@ def _check_for_duplicates(self, pipeline_to_check, problem_ref_to_check): def _modify_random_seed(self, seed_limit, query_args): - used_seeds = query_args.tested_seeds + used_seeds = query_args['tested_seeds'] num_run = len(used_seeds) #run until the right number of seeds have been run while (num_run < seed_limit): @@ -80,9 +82,9 @@ def _modify_random_seed(self, seed_limit, query_args): if (new_seed in used_seeds): continue num_run += 1 - used_seeds.append(new_seed) + used_seeds.add(new_seed) #yield the necessary job requirements - yield query_args.pipeline, query_args.problem_path, query_args.dataset_doc_path, new_seed + yield query_args['pipeline'], query_args['problem_path'], query_args['dataset_doc_path'], new_seed def _modify_swap_primitive(self, swap_pipeline, query_args): diff --git a/experimenter/query.py b/experimenter/query.py index 1cfd3a2..4660943 100644 --- a/experimenter/query.py +++ b/experimenter/query.py @@ -52,7 +52,7 @@ def query_on_primitive(primitive_id: str, limit_indexes=False): for (problem_id, dataset_name), random_seeds in results.items(): - yield {'pipeline': pipeline.id, 'problem_path': get_problem_path(problem_id), 'location': 
locs, 'dataset_doc_path': get_dataset_doc_path(dataset_name), 'tested_seeds': random_seeds} + yield {'pipeline': pipeline.id, 'problem_path': get_problem_path(problem_id), 'location': locs, 'dataset_doc_path': get_dataset_doc_path(dataset_id), 'tested_seeds': random_seeds} def query_on_seeds(pipeline_id: str=None, limit: int=None, submitter: str='byu'): pipeline_search = Search(using=CONNECTION, index='pipelines') @@ -63,10 +63,10 @@ def query_on_seeds(pipeline_id: str=None, limit: int=None, submitter: str='byu') for pipeline in pipeline_search.scan(): results = scan_pipeline_runs(pipeline.id, submitter) - for (problem_id, dataset_name), random_seeds in results.items(): + for (problem_id, dataset_id), random_seeds in results.items(): if limit and len(random_seeds) > limit: continue - yield {'pipeline': pipeline.id, 'problem_path': get_problem_path(problem_id), 'dataset_doc_path': get_dataset_doc_path(dataset_name), 'tested_seeds': random_seeds} + yield {'pipeline': pipeline.id, 'problem_path': get_problem_path(problem_id), 'dataset_doc_path': get_dataset_doc_path(dataset_id), 'tested_seeds': random_seeds} def scan_pipeline_runs(pipeline_id, submitter=None): pipeline_run_search = Search(using=CONNECTION, index='pipeline_runs') \ @@ -79,7 +79,7 @@ def scan_pipeline_runs(pipeline_id, submitter=None): results = dict() for pipeline_run in pipeline_run_search.scan(): for dataset in pipeline_run.datasets: - dataset_prob_tuple = (pipeline_run.problem.id, dataset.name) + dataset_prob_tuple = (pipeline_run.problem.id, dataset.id) results[dataset_prob_tuple] = results.get(dataset_prob_tuple, set()) results[dataset_prob_tuple].add(pipeline_run.random_seed) return results diff --git a/setup.py b/setup.py index 7d11df2..b4267c4 100644 --- a/setup.py +++ b/setup.py @@ -13,8 +13,8 @@ 'd3m', # TODO: add version bounds 'docker>=4.4.0<4.5.0', 'redis>=3.5.0<3.6.0', - 'rq>=1.7.0<1.8.0', - 'elasticsearch>=7.0.0<8.0.0', - 'elasticsearch_dsl>=7.0.0<8.0.0' + 'rq>=1.7.0<1.8.0' + 'elasticsearch==7.11.0', + 'elasticsearch_dsl==7.3.0' ], ) From 6edfb62bda0888ba6cfe5d754fac584a4e997aa6 Mon Sep 17 00:00:00 2001 From: Benson Manner Date: Tue, 23 Feb 2021 15:25:15 -0700 Subject: [PATCH 18/44] D3M configuration variables --- config-example.ini | 5 +++++ experimenter/config.py | 8 +++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/config-example.ini b/config-example.ini index ba1011d..5ab6c77 100644 --- a/config-example.ini +++ b/config-example.ini @@ -10,3 +10,8 @@ DATA_DIR = redis DOCKER_IMAGE_NAME = redis:latest DOCKER_PORT = 6379 DOCKER_DATA_DIR = /data + +[D3MINFO] +D3M_DB_SUBMITTER = {SUBMITTER_NAME} +D3M_DB_TOKEN = {UNIQUE_TOKEN} +SAVE_TO_D3M = True diff --git a/experimenter/config.py b/experimenter/config.py index 1ae115e..ea3a265 100644 --- a/experimenter/config.py +++ b/experimenter/config.py @@ -32,7 +32,13 @@ def __init__(self, config_path: str = None) -> None: def get(self, section, key): return self._config.get(section, key) - +class D3MConfig(metaclass=Singleton): + def __init__(self): + config = Config() + self.d3m_submitter = config.get('D3MINFO','D3M_DB_SUBMITTER') + self.d3m_token = config.get('D3MINFO', 'D3M_DB_TOKEN') + self.save_to_d3m = config.get('D3MINFO', 'SAVE_TO_D3M')=="True" + class RedisConfig(metaclass=Singleton): def __init__(self): config = Config() From 4f8cfa13d1dcc9b4ac15a8ba2f5c469c46b8d6db Mon Sep 17 00:00:00 2001 From: Benson Manner Date: Tue, 2 Mar 2021 09:30:19 -0700 Subject: [PATCH 19/44] fix environment variables and pipeline run to dict for saving --- 
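It is worth tying the new [D3MINFO] section to its consumers: D3MConfig is a singleton wrapper over the parsed ini file, so the database client and the evaluation code share one view of the submitter credentials instead of each reading environment variables. A small sketch of the lookup, assuming a config file shaped like config-example.ini is in place:

    from experimenter.config import D3MConfig

    d3m_config = D3MConfig()                 # singleton; the ini is parsed once
    submitter = d3m_config.d3m_submitter     # value of D3M_DB_SUBMITTER
    saving_enabled = d3m_config.save_to_d3m  # True only for the literal "True"

Note that save_to_d3m is a string comparison against "True", so values such as "true" or "1" silently disable saving.
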
experimenter/databases/d3m_mtl.py | 6 +++--- experimenter/evaluate_pipeline_new.py | 8 +++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/experimenter/databases/d3m_mtl.py b/experimenter/databases/d3m_mtl.py index f8dde8d..6badcb8 100644 --- a/experimenter/databases/d3m_mtl.py +++ b/experimenter/databases/d3m_mtl.py @@ -23,7 +23,7 @@ def __init__(self) -> None: self._post_url = D3M_MTL_DB_POST_URL # This env var allows code calling this class to be run during # unit tests without actually saving to the production DB. - self.should_save = config.SAVE_TO_D3M + self.should_save = config.D3MConfig().save_to_d3m # A reference to a low-level elasticsearch client. This can be # used to query the D3M DB, or this classe's `search` method # can be used, and is preferred, since its API is more straightforward. @@ -31,9 +31,9 @@ def __init__(self) -> None: # certain things though. self.es = Elasticsearch(hosts=[D3M_MTL_DB_GET_URL], timeout=30) # Our submitter name. - self._submitter = config.D3M_DB_SUBMITTER + self._submitter = config.D3MConfig().d3m_submitter # The secret verifying us as the submitter we say we are. - self._x_token = config.D3M_DB_TOKEN + self._x_token = config.D3MConfig().d3m_token if self._is_identifying_as_submitter(): logger.info( f"Documents will be saved under submitter name: '{self._submitter}'" diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/evaluate_pipeline_new.py index f3850fb..e4cfc7f 100644 --- a/experimenter/evaluate_pipeline_new.py +++ b/experimenter/evaluate_pipeline_new.py @@ -5,7 +5,7 @@ from typing import Any, List, Tuple from uuid import UUID -from d3m import cli +from d3m import cli as d3m_cli from d3m.d3m.contrib.pipelines import (K_FOLD_TABULAR_SPLIT_PIPELINE_ID, SCORING_PIPELINE_ID) @@ -29,7 +29,9 @@ def save_pipeline_run_to_d3m_db(pipeline_run_path: str): TODO """ d3m_db = D3MMtLDB() - return D3MMtLDB().save_pipeline_run(pipeline_run_path) + with open(pipeline_run_path) as pipeline_data: + pipeline_run = json.load(pipeline_data) + return D3MMtLDB().save_pipeline_run(pipeline_run) def evaluate_pipeline_on_problem(pipeline_path: str, problem_path: str, @@ -129,5 +131,5 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, args.extend(('--output-run', output_run_path)) args.extend(('--data-random-seed', data_random_seed)) - cli.main(args) + d3m_cli.main(args) save_pipeline_run_to_d3m_db(output_run_path) From 92047d0ddca3c10eabd02597f6e513b94d80af21 Mon Sep 17 00:00:00 2001 From: Eric Manner Date: Tue, 2 Mar 2021 22:27:32 +0000 Subject: [PATCH 20/44] Working queue with queue refactor --- experimenter/cli.py | 2 +- experimenter/config.py | 1 - experimenter/evaluate_pipeline_new.py | 4 ++-- setup.py | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/experimenter/cli.py b/experimenter/cli.py index e27b106..bfe88e5 100644 --- a/experimenter/cli.py +++ b/experimenter/cli.py @@ -210,7 +210,7 @@ def modify_handler(arguments: argparse.Namespace, parser: argparse.ArgumentParse modify_type = arguments.modify_type modify_generator = ModifyGenerator(modify_type, arguments.max_jobs, arguments) #now run the enqueuer part - queue.enqueue_jobs(modify_generator, arguments.queue_host, arguments.queue_port,arguments.job_timeout) + queue.enqueue_jobs(jobs=modify_generator, job_timeout=arguments.job_timeout) def configure_update_parser(parser: argparse.ArgumentParser) -> None: diff --git a/experimenter/config.py b/experimenter/config.py index ea4975d..3507891 100644 --- a/experimenter/config.py +++ b/experimenter/config.py @@ 
-52,4 +52,3 @@ def __init__(self): self.docker_data_dir = config.get('REDIS', 'DOCKER_DATA_DIR') self.dashboard_port = config.get('REDIS', 'DASHBOARD_PORT') self.dashboard_docker_image_name = config.get('REDIS', 'DASHBOARD_DOCKER_IMAGE_NAME') ->>>>>>> f8f7e7ac914d104149bf62f9353a4c8a65a4f726 diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/evaluate_pipeline_new.py index e4cfc7f..a236ce4 100644 --- a/experimenter/evaluate_pipeline_new.py +++ b/experimenter/evaluate_pipeline_new.py @@ -1,4 +1,4 @@ -mport itertools as it +import itertools as it import json import os @@ -6,7 +6,7 @@ from uuid import UUID from d3m import cli as d3m_cli -from d3m.d3m.contrib.pipelines import (K_FOLD_TABULAR_SPLIT_PIPELINE_ID, +from d3m.contrib.pipelines import (K_FOLD_TABULAR_SPLIT_PIPELINE_ID, SCORING_PIPELINE_ID) from experimenter.databases.d3m_mtl import D3MMtLDB diff --git a/setup.py b/setup.py index e5d3f24..03f5870 100644 --- a/setup.py +++ b/setup.py @@ -10,12 +10,12 @@ packages=find_packages(include=['experimenter']), python_requires='>=3.6,<4.0', install_requires=[ - 'd3m', # TODO: add version bounds 'docker>=4.4.0<4.5.0', 'mypy==0.812', 'redis>=3.5.0<3.6.0', 'rq>=1.7.0<1.8.0', 'rq-dashboard>=0.6.0<0.7.0', + 'd3m @ git+https://gitlab.com/datadrivendiscovery/d3m@devel#egg=d3m' 'elasticsearch==7.11.0', 'elasticsearch_dsl==7.3.0' ], From 78b9605bb98adb92d92d7d55f9ec90cf54f5fbe9 Mon Sep 17 00:00:00 2001 From: Benson Manner Date: Tue, 2 Mar 2021 17:58:34 -0700 Subject: [PATCH 21/44] Added job count and worker info tracking for queue status command --- experimenter/queue.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/experimenter/queue.py b/experimenter/queue.py index 2887624..8be73e0 100644 --- a/experimenter/queue.py +++ b/experimenter/queue.py @@ -19,6 +19,7 @@ _STOP_SUCCESS_MESSAGE = 'queue successfully stopped' _STATUS_RUNNING_MESSAGE = 'queue is running on port {port}' _STATUS_STOPPED_MESSAGE = 'queue is stopped' +_QUEUE_LENGTH_MESSAGE = 'number of jobs on queue {name}: {num_jobs}' _EMPTIED_MESSAGE = 'queue emptied' @@ -56,14 +57,28 @@ def stop() -> None: docker_utils.stop_container(config.RedisConfig().docker_image_name) print(_STOP_SUCCESS_MESSAGE) - -def status() -> None: +def get_worker_message(workers, queue_name: str = _DEFAULT_QUEUE) -> str: + num_workers = len(workers) + message = 'number of workers on queue {}: {}'.format(queue_name, num_workers) + for it, worker in enumerate(workers): + success = worker.successful_job_count + fail = worker.failed_job_count + message = message+'\n worker: {}'.format(it) + message = message+'\n\t number of successful jobs: {}'.format(success) + message = message+'\n\t number of failed jobs: {}'.format(fail) + return message + +def status(queue_name: str = _DEFAULT_QUEUE) -> None: # TODO: report container port instead of config port if is_running(): + connection = redis.StrictRedis(host=config.RedisConfig().host, port=config.RedisConfig().port) + queue = rq.Queue(queue_name, connection=connection) + workers = rq.Worker.all(queue=queue) print(_STATUS_RUNNING_MESSAGE.format(port=config.RedisConfig().port)) + print(_QUEUE_LENGTH_MESSAGE.format(name=queue_name, num_jobs=len(queue))) + print(get_worker_message(workers,queue_name)) else: print(_STATUS_STOPPED_MESSAGE) - # TODO: report number of jobs in each queue def empty(queue_name: str = _DEFAULT_QUEUE) -> None: From c8c7cc4413af81ec99bddfac679a40df8b26dc30 Mon Sep 17 00:00:00 2001 From: Eric Manner Date: Wed, 3 Mar 2021 02:49:02 +0000 Subject: [PATCH 
22/44] rq-worker Popen commands updated --- experimenter/evaluate_pipeline_new.py | 5 ++++- experimenter/queue.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/evaluate_pipeline_new.py index a236ce4..f1590ab 100644 --- a/experimenter/evaluate_pipeline_new.py +++ b/experimenter/evaluate_pipeline_new.py @@ -4,6 +4,7 @@ from typing import Any, List, Tuple from uuid import UUID +from experimenter import config from d3m import cli as d3m_cli from d3m.contrib.pipelines import (K_FOLD_TABULAR_SPLIT_PIPELINE_ID, @@ -132,4 +133,6 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, args.extend(('--data-random-seed', data_random_seed)) d3m_cli.main(args) - save_pipeline_run_to_d3m_db(output_run_path) + if (config.D3MConfig().save_to_d3m is True): + print("Saving pipeline run to d3m database") + save_pipeline_run_to_d3m_db(output_run_path) diff --git a/experimenter/queue.py b/experimenter/queue.py index 8be73e0..c41e14d 100644 --- a/experimenter/queue.py +++ b/experimenter/queue.py @@ -126,7 +126,7 @@ def enqueue_jobs( def start_worker(max_jobs: int = None, *, queue_name: str = _DEFAULT_QUEUE) -> None: args = [ - 'rq', 'worker', queue_name, '--burst', '--url', + './env/bin/rq','worker', queue_name, '--burst', '--url', 'redis://{}:{}'.format(config.RedisConfig().host, config.RedisConfig().port), ] From 1a56219164870c56b88ed5b197f1079bbdaf6e32 Mon Sep 17 00:00:00 2001 From: Eric Manner Date: Fri, 5 Mar 2021 19:17:15 +0000 Subject: [PATCH 23/44] Updated paths for saving pipelines and pipeline runs, queue and evaluate working --- experimenter/config.py | 5 +++- experimenter/evaluate_pipeline_new.py | 38 ++++++++++++++++----------- experimenter/modify_generator.py | 6 +++-- experimenter/query.py | 2 +- experimenter/utils.py | 20 +++++++++++--- 5 files changed, 47 insertions(+), 24 deletions(-) diff --git a/experimenter/config.py b/experimenter/config.py index 3507891..ecf6c37 100644 --- a/experimenter/config.py +++ b/experimenter/config.py @@ -32,13 +32,16 @@ def __init__(self, config_path: str = None) -> None: def get(self, section, key): return self._config.get(section, key) + class D3MConfig(metaclass=Singleton): def __init__(self): config = Config() self.d3m_submitter = config.get('D3MINFO','D3M_DB_SUBMITTER') self.d3m_token = config.get('D3MINFO', 'D3M_DB_TOKEN') self.save_to_d3m = config.get('D3MINFO', 'SAVE_TO_D3M')=="True" - + self.datasets_directory = config.get('D3MINFO','DATASET_DIRECTORY') + + class RedisConfig(metaclass=Singleton): def __init__(self): config = Config() diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/evaluate_pipeline_new.py index f1590ab..899cf30 100644 --- a/experimenter/evaluate_pipeline_new.py +++ b/experimenter/evaluate_pipeline_new.py @@ -1,6 +1,7 @@ import itertools as it import json import os +import parser from typing import Any, List, Tuple from uuid import UUID @@ -66,15 +67,20 @@ def evaluate_pipeline_on_problem(pipeline_path: str, """ output_run_path = [] - with open(pipeline_path, 'r') as pipeline: - output_run_path.append(pipeline['properties']['digest']) - with open(problem_path, 'r') as problem: - output_run_path.append(problem['properties']['digest']) - with open(input_path, 'r') as input_f: - output_run_path.append(input_f['properties']['digest']) - - output_run_path = '_'.join(output_run_path) + '.json' - + with open(pipeline_path, 'r') as data: + pipeline = json.load(data) + output_run_path.append(pipeline['id']) + with open(problem_path, 'r') as data: + problem = 
json.load(data) + output_run_path.append(problem['about']['problemID']) + with open(input_path, 'r') as data: + input_f = json.load(data) + output_run_path.append(input_f['about']['digest']) + #get the output run path + output_run_path = os.path.abspath(os.path.join(config.Config().get('MAIN','CACHE_DIR'), 'Pipeline_Run', '_'.join(output_run_path) + '.json')) + #create the directory + os.makedirs(os.path.dirname(output_run_path),exist_ok=True) + #evaluate pipeline evaluate_pipeline_via_d3m_cli(pipeline=pipeline_path, problem=problem_path, input=input_path, output_run=output_run_path, data_random_seed=data_random_seed) @@ -118,7 +124,7 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, args = ['d3m', 'runtime', 'evaluate'] if (not os.path.isfile(pipeline)): - raise ValueError('\'{}\' param not a file path or pipeline ID'.format('pipeline')) + raise ValueError('\'{}\' param not a file path'.format('pipeline')) if (not os.path.isfile(problem)): raise ValueError('\'{}\' param not a file path'.format('problem')) @@ -126,13 +132,13 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, if (not os.path.isfile(input)): raise ValueError('\'{}\' param not a file path'.format('input')) - args.extend(('--pipeline ', pipeline)) + args.extend(('--pipeline', pipeline)) args.extend(('--problem', problem)) args.extend(('--input', input)) - args.extend(('--output-run', output_run_path)) - args.extend(('--data-random-seed', data_random_seed)) - + args.extend(('--output-run', output_run)) + args.extend(('--data-random-seed', str(data_random_seed))) + args.extend(('--data-pipeline', K_FOLD_TABULAR_SPLIT_PIPELINE_ID)) + args.extend(('--scoring-pipeline', SCORING_PIPELINE_ID)) d3m_cli.main(args) if (config.D3MConfig().save_to_d3m is True): - print("Saving pipeline run to d3m database") - save_pipeline_run_to_d3m_db(output_run_path) + save_pipeline_run_to_d3m_db(output_run) diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py index ec5106c..afd86b8 100644 --- a/experimenter/modify_generator.py +++ b/experimenter/modify_generator.py @@ -1,5 +1,6 @@ from experimenter.query import query_on_seeds, query_on_primitive from experimenter import queue +from experimenter.utils import download_from_database import d3m.metadata.pipeline from random import randint from experimenter.evaluate_pipeline_new import evaluate_pipeline_on_problem as evaluate_pipeline @@ -22,8 +23,9 @@ def __next__(self): #iterate through query results for query_result in self.query_results: #iterate through modifier results - for pipeline_path, problem_path, dataset_doc_path, seed in self._modify(query_result, self.args): - + for pipeline, problem_path, dataset_doc_path, seed in self._modify(query_result, self.args): + #save the pipeline to path and return pipeline path + pipeline_path = download_from_database(pipeline, type_to_download='Pipeline') job = queue.make_job(evaluate_pipeline, pipeline_path=pipeline_path, problem_path=problem_path, diff --git a/experimenter/query.py b/experimenter/query.py index 4660943..63779bc 100644 --- a/experimenter/query.py +++ b/experimenter/query.py @@ -66,7 +66,7 @@ def query_on_seeds(pipeline_id: str=None, limit: int=None, submitter: str='byu') for (problem_id, dataset_id), random_seeds in results.items(): if limit and len(random_seeds) > limit: continue - yield {'pipeline': pipeline.id, 'problem_path': get_problem_path(problem_id), 'dataset_doc_path': get_dataset_doc_path(dataset_id), 'tested_seeds': random_seeds} + yield {'pipeline': pipeline.to_dict(), 'problem_path': 
get_problem_path(problem_id[:-8]), 'dataset_doc_path': get_dataset_doc_path(dataset_id[:-13]), 'tested_seeds': random_seeds} def scan_pipeline_runs(pipeline_id, submitter=None): pipeline_run_search = Search(using=CONNECTION, index='pipeline_runs') \ diff --git a/experimenter/utils.py b/experimenter/utils.py index 61bfbb7..4df07a7 100644 --- a/experimenter/utils.py +++ b/experimenter/utils.py @@ -9,10 +9,22 @@ from d3m.metadata import problem as problem_module from d3m.utils import get_datasets_and_problems -from experimenter import exceptions - -DEFAULT_DATASET_DIR = "/datasets/training_datasets/LL0" - +from experimenter import exceptions, config + +DEFAULT_DATASET_DIR = config.D3MConfig().datasets_directory + +def download_from_database(data, type_to_download: str = 'Pipeline'): + if (type_to_download == 'Pipeline'): + i_d = data['id'] + save_path = os.path.abspath(os.path.join(config.Config().get('MAIN','CACHE_DIR'), 'Pipeline', i_d+str('.json'))) + #create the new directory + os.makedirs(os.path.dirname(save_path),exist_ok=True) + #save the file to the directory + with open(save_path, 'w') as to_save: + json.dump(data, to_save, indent=4) + else: + raise ValueError("type: {}, not available for download".format(type_to_download)) + return save_path def get_dataset_doc_path( dataset_name: str, dataset_dir: str = DEFAULT_DATASET_DIR From c9b0b7786f624efb45f16f94a715a88791e5604b Mon Sep 17 00:00:00 2001 From: Eric Manner Date: Mon, 15 Mar 2021 22:15:44 +0000 Subject: [PATCH 24/44] experimenter logging updates --- docker-compose.yml | 2 +- experimenter/cli.py | 8 +++++++ experimenter/evaluate_pipeline_new.py | 8 ++++++- experimenter/modify_generator.py | 5 ++++ experimenter/queue.py | 33 +++++++++++++++++++++------ experimenter/utils.py | 2 +- 6 files changed, 48 insertions(+), 10 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 199c06b..c9b73e4 100755 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -43,7 +43,7 @@ services: - type: bind source: '${EXPERIMENTER_DIR}' target: /d3m-experimenter - read_only: true + read_only: false working_dir: /d3m-experimenter networks: - default diff --git a/experimenter/cli.py b/experimenter/cli.py index 5f9f41a..6734b91 100644 --- a/experimenter/cli.py +++ b/experimenter/cli.py @@ -23,6 +23,12 @@ def configure_parser(parser: argparse.ArgumentParser) -> None: ) configure_queue_parser(queue_parser) + generator_parser = subparsers.add_parser( + 'generator', + description='generates new pipelines and queues them to run on available datasets', + ) + configure_generator_parser(generator_parser) + def handler(arguments: argparse.Namespace, parser: argparse.ArgumentParser) -> None: experimenter_command = arguments.experimenter_command @@ -30,6 +36,8 @@ def handler(arguments: argparse.Namespace, parser: argparse.ArgumentParser) -> N if experimenter_command == 'queue': queue_handler(arguments, subparser) + elif experimenter_command == 'generator': + generator_handler(arguments, subparser) else: raise exceptions.InvalidStateError('Unknown experimenter command: {}'.format(experimenter_command)) diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/evaluate_pipeline_new.py index 1e7416f..64d68a4 100644 --- a/experimenter/evaluate_pipeline_new.py +++ b/experimenter/evaluate_pipeline_new.py @@ -2,6 +2,7 @@ import json import os import parser +import logging from typing import Any, List, Tuple from uuid import UUID @@ -12,6 +13,7 @@ SCORING_PIPELINE_ID) from experimenter.databases.d3m_mtl import D3MMtLDB 
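A quick aside on the module-level logging being added below: Python's logging.basicConfig configures the root logger only once per process, so when several modules in these patches call it at import time, every call after the first is a silent no-op. A minimal stdlib-only sketch of that behavior (not code from the patches):

    import logging

    # First call wins: it attaches a FileHandler to the root logger.
    logging.basicConfig(filename='logger.log', level=logging.INFO)
    # Ignored: the root logger is already configured (Python 3.8+ accepts
    # force=True to override an earlier configuration).
    logging.basicConfig(level=logging.DEBUG)

    logging.getLogger(__name__).info('this record lands in logger.log')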
+logging.basicConfig(filename='logger.log', level=logging.INFO) def save_pipeline_run_to_d3m_db(pipeline_run_path: str): """ @@ -66,7 +68,7 @@ def evaluate_pipeline_on_problem(pipeline_path: str, when a file cannot be opened """ output_run_path = [] - + logging.info('getting files') with open(pipeline_path, 'r') as data: pipeline = json.load(data) output_run_path.append(pipeline['id']) @@ -82,6 +84,7 @@ def evaluate_pipeline_on_problem(pipeline_path: str, #create the directory os.makedirs(os.path.dirname(output_run_path),exist_ok=True) #evaluate pipeline + logging.info('begin evaluation') evaluate_pipeline_via_d3m_cli(pipeline=pipeline_path, problem=problem_path, input=input_path, output_run=output_run_path, data_random_seed=data_random_seed) @@ -133,6 +136,7 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, if (not os.path.isfile(input)): raise ValueError('\'{}\' param not a file path'.format('input')) + logging.info('extending arguments') args.extend(('--pipeline', pipeline)) args.extend(('--problem', problem)) args.extend(('--input', input)) @@ -140,6 +144,8 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, args.extend(('--data-random-seed', str(data_random_seed))) args.extend(('--data-pipeline', K_FOLD_TABULAR_SPLIT_PIPELINE_ID)) args.extend(('--scoring-pipeline', SCORING_PIPELINE_ID)) + logging.info('evaluating') d3m_cli.main(args) if (config.save_to_d3m is True): + save_pipeline_run_to_d3m_db(output_run) diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py index afd86b8..ad654a2 100644 --- a/experimenter/modify_generator.py +++ b/experimenter/modify_generator.py @@ -4,6 +4,8 @@ import d3m.metadata.pipeline from random import randint from experimenter.evaluate_pipeline_new import evaluate_pipeline_on_problem as evaluate_pipeline +import logging +logging.basicConfig(filename='logger.log', level=logging.INFO) class ModifyGenerator: """ Generator to be used for creating modified pipelines based on existing @@ -24,8 +26,10 @@ def __next__(self): for query_result in self.query_results: #iterate through modifier results for pipeline, problem_path, dataset_doc_path, seed in self._modify(query_result, self.args): + logging.info('downloading pipeline path') #save the pipeline to path and return pipeline path pipeline_path = download_from_database(pipeline, type_to_download='Pipeline') + logging.info('creating job') job = queue.make_job(evaluate_pipeline, pipeline_path=pipeline_path, problem_path=problem_path, @@ -45,6 +49,7 @@ def __iter__(self): def _query(self, args): + logging.info('logging') if (self.modifier_type=='random-seed'): return query_on_seeds(args.pipeline_id, args.seed_limit, args.submitter) if (self.modifier_type=='swap-primitive'): diff --git a/experimenter/queue.py b/experimenter/queue.py index b6b4337..ede755e 100644 --- a/experimenter/queue.py +++ b/experimenter/queue.py @@ -7,6 +7,7 @@ import redis import rq +import logging from experimenter import config, exceptions, utils @@ -28,21 +29,38 @@ def get_worker_message(workers: list, queue_name: str = _DEFAULT_QUEUE): message = 'number of workers on queue {}: {}'.format(queue_name, num_workers) for it, worker in enumerate(workers): success = worker.successful_job_count - fail = worker.failed_job_count - message = message+'\n worker: {}'.format(it) - message = message+'\n\t number of successful jobs: {}'.format(success) - message = message+'\n\t number of failed jobs: {}'.format(fail) + fail = worker.failed_job_count + if (fail > 0): + failed_job = get_failed_job(queue_name=queue_name) + with 
open ('failed_job.txt', 'w') as failed_file: + failed_file.write(failed_job) + message = message+'\n\t\t\t worker: {}'.format(it) + message = message+'\n\t\t\t\t number of successful jobs: {}'.format(success) + message = message+'\n\t\t\t\t number of failed jobs: {}'.format(fail) return message +def get_failed_job(queue_name='default', job_num=0): + conn = get_connection() + #pass name and connection + reg = rq.registry.FailedJobRegistry(name=queue_name, connection=conn) + print(len(reg)) + job_ids = reg.get_job_ids() + if (len(job_ids)<=0): + return "None" + job = job_ids[0] + job = Job.fetch(job, connection=conn) + return job.exc_info + + def get_queue_message(queues: list): - queues_message = 'getting queues, jobs and workers' + queues_message = 'getting queues, jobs, and workers' for queue in queues: queues_message = queues_message + '\n\t number of jobs on queue {}: {}'.format(queue, len(queue)) workers = rq.Worker.all(queue=queue) - queues_message = queues_message + '\n\t\t get_worker_message(queue=workers, queue_name=queue) + queues_message = queues_message + '\n\t\t' + str(get_worker_message(workers=workers, queue_name=queue)) - return queue_message + return queues_message def status() -> None: @@ -91,4 +109,5 @@ def enqueue_jobs( queue = rq.Queue(queue_name, connection=connection) for job in jobs: + print("Queueing Job - ") queue.enqueue(**job, job_timeout=job_timeout) diff --git a/experimenter/utils.py b/experimenter/utils.py index 40fa93e..16b0368 100644 --- a/experimenter/utils.py +++ b/experimenter/utils.py @@ -13,7 +13,7 @@ from experimenter import exceptions, config -DEFAULT_DATASET_DIR = "/datasets/training_datasets/LL0" +DEFAULT_DATASET_DIR = "/datasets" datasets, problems = None, None From 075844c819dd4285c1f6847517d43dcbccaafa00 Mon Sep 17 00:00:00 2001 From: Benson Manner Date: Tue, 16 Mar 2021 16:14:00 -0600 Subject: [PATCH 25/44] Experimenter and queue updates --- docker-compose.yml | 19 +++++++- experimenter/__init__.py | 0 experimenter/cli.py | 15 ++++++- experimenter/config.py | 1 - experimenter/evaluate_pipeline_new.py | 29 ++++++------- experimenter/modify_generator.py | 62 +++++++++++++++++---------- experimenter/queue.py | 62 +++++++++++++++++---------- experimenter/utils.py | 2 +- 8 files changed, 126 insertions(+), 64 deletions(-) mode change 100755 => 100644 experimenter/__init__.py diff --git a/docker-compose.yml b/docker-compose.yml index c9b73e4..643f499 100755 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -7,11 +7,26 @@ services: volumes: - type: bind source: '${DATA_DIR}/${REDIS_DATA_DIR}' - target: '/data' + target: /data networks: - default rq_worker: + env_file: + - ./.env + volumes: + - type: bind + source: '${DATASETS_DIR}' + target: /datasets + read_only: true + - type: bind + source: '${DATA_DIR}' + target: /data + - type: bind + source: '${EXPERIMENTER_DIR}' + target: /d3m-experimenter + read_only: true + working_dir: /d3m-experimenter image: 'd3m-experimenter:latest' command: 'rq worker --url redis://${REDIS_HOST} ${RQ_QUEUES}' networks: @@ -43,7 +58,7 @@ services: - type: bind source: '${EXPERIMENTER_DIR}' target: /d3m-experimenter - read_only: false + read_only: true working_dir: /d3m-experimenter networks: - default diff --git a/experimenter/__init__.py b/experimenter/__init__.py old mode 100755 new mode 100644 diff --git a/experimenter/cli.py b/experimenter/cli.py index 6734b91..c27947f 100644 --- a/experimenter/cli.py +++ b/experimenter/cli.py @@ -50,6 +50,12 @@ def configure_queue_parser(parser: argparse.ArgumentParser) -> None: 
empty_parser = subparsers.add_parser('empty', help='remove all jobs from a queue') empty_parser.add_argument('-q', '--queue-name', help='the name of the queue to empty') + empty_parser.add_argument('-f', '--failed', default='false', help='remove the failed queue') + + #save a failed traceback parser + save_failed_parser = subparsers.add_parser('save-failed', help='save failed job error output') + save_failed_parser.add_argument('-q', '--queue-name', help='the name of the queue to empty') + save_failed_parser.add_argument('-j', '--job-num', type=int, default=0, help='the failed job number') def queue_handler(arguments: argparse.Namespace, parser: argparse.ArgumentParser) -> None: @@ -58,7 +64,9 @@ def queue_handler(arguments: argparse.Namespace, parser: argparse.ArgumentParser if queue_command == 'status': queue.status() elif queue_command == 'empty': - queue.empty(arguments.queue_name) + queue.empty(arguments.queue_name, arguments.failed) + elif queue_command == 'save-failed': + queue.save_failed_job(arguments.queue_name, arguments.job_num) else: raise exceptions.InvalidStateError('Unknown queue command: {}'.format(queue_command)) @@ -137,6 +145,11 @@ def configure_modify_parser(parser: argparse.ArgumentParser) -> None: help='The amount of random seeds that each ran pipeline will have at the end of the test', default=2, type=int) + swap_seed_subparser.add_argument( + '--test', + help='run the test instead of random pipeline generation', + default='false', + type=str) #Primitive swapper functionality primitive_swap_subparser = subparsers.add_parser( diff --git a/experimenter/config.py b/experimenter/config.py index a405f1d..8291f93 100644 --- a/experimenter/config.py +++ b/experimenter/config.py @@ -12,7 +12,6 @@ # TODO: these should not have to be set unless needed - datasets_dir: str = os.environ.get('DATASETS_DIR', None) if datasets_dir is None: raise exceptions.ConfigError(_ERROR_MESSAGE.format('DATASETS_DIR')) diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/evaluate_pipeline_new.py index 64d68a4..ff30a36 100644 --- a/experimenter/evaluate_pipeline_new.py +++ b/experimenter/evaluate_pipeline_new.py @@ -2,18 +2,16 @@ import json import os import parser -import logging from typing import Any, List, Tuple from uuid import UUID -from experimenter import config +from experimenter import config, utils from d3m import cli as d3m_cli -from d3m.contrib.pipelines import (K_FOLD_TABULAR_SPLIT_PIPELINE_ID, - SCORING_PIPELINE_ID) - +from d3m.contrib.pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_PATH as TSPP +from d3m.contrib.pipelines import SCORING_PIPELINE_PATH as SPP from experimenter.databases.d3m_mtl import D3MMtLDB -logging.basicConfig(filename='logger.log', level=logging.INFO) + def save_pipeline_run_to_d3m_db(pipeline_run_path: str): """ @@ -68,7 +66,6 @@ def evaluate_pipeline_on_problem(pipeline_path: str, when a file cannot be opened """ output_run_path = [] - logging.info('getting files') with open(pipeline_path, 'r') as data: pipeline = json.load(data) output_run_path.append(pipeline['id']) @@ -79,12 +76,11 @@ def evaluate_pipeline_on_problem(pipeline_path: str, input_f = json.load(data) output_run_path.append(input_f['about']['digest']) #get the output run path - output_run_path = os.path.abspath(os.path.join(config.data_dir, 'Pipeline_Run', + output_run_path = os.path.abspath(os.path.join(os.getenv('DATA_DIR'), 'Pipeline_Run', '_'.join(output_run_path)+'.json')) #create the directory os.makedirs(os.path.dirname(output_run_path),exist_ok=True) #evaluate pipeline - 
logging.info('begin evaluation') evaluate_pipeline_via_d3m_cli(pipeline=pipeline_path, problem=problem_path, input=input_path, output_run=output_run_path, data_random_seed=data_random_seed) @@ -135,17 +131,20 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, if (not os.path.isfile(input)): raise ValueError('\'{}\' param not a file path'.format('input')) - - logging.info('extending arguments') + + if (not os.path.isfile(TSPP)): + raise ValueError('\'{}\' pipeline not a file path'.format('data split')) + + if (not os.path.isfile(SPP)): + raise ValueError('\'{}\' pipeline not a file path'.format('scoring')) + args.extend(('--pipeline', pipeline)) args.extend(('--problem', problem)) args.extend(('--input', input)) args.extend(('--output-run', output_run)) args.extend(('--data-random-seed', str(data_random_seed))) - args.extend(('--data-pipeline', K_FOLD_TABULAR_SPLIT_PIPELINE_ID)) - args.extend(('--scoring-pipeline', SCORING_PIPELINE_ID)) - logging.info('evaluating') + args.extend(('--data-pipeline', TSPP)) + args.extend(('--scoring-pipeline', SPP)) d3m_cli.main(args) if (config.save_to_d3m is True): - save_pipeline_run_to_d3m_db(output_run) diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py index ad654a2..6f4ed55 100644 --- a/experimenter/modify_generator.py +++ b/experimenter/modify_generator.py @@ -1,11 +1,10 @@ from experimenter.query import query_on_seeds, query_on_primitive -from experimenter import queue +from experimenter import queue, utils from experimenter.utils import download_from_database import d3m.metadata.pipeline from random import randint +import json from experimenter.evaluate_pipeline_new import evaluate_pipeline_on_problem as evaluate_pipeline -import logging -logging.basicConfig(filename='logger.log', level=logging.INFO) class ModifyGenerator: """ Generator to be used for creating modified pipelines based on existing @@ -18,38 +17,48 @@ def __init__(self, modify_type: str='random-seed', max_jobs: int=None, args=None self.max_jobs = max_jobs self.num_complete = 0 #run the query on initializing to define the query results - self.query_results = self._query(self.args) + if (args.test == 'true'): + self.query_results = self._run_seed_test(self.args) + else: + self.query_results = self._query(self.args) + self.generator = self._get_generator() + + def __iter__(self): + return self def __next__(self): #iterate through query results + job = self.next() + if (self.max_jobs): + if (self.num_complete > self.max_jobs): + raise StopIteration + return job + + + def next(self): + #iterate through query results + return next(self.generator) + + + def _get_generator(self): for query_result in self.query_results: #iterate through modifier results - for pipeline, problem_path, dataset_doc_path, seed in self._modify(query_result, self.args): - logging.info('downloading pipeline path') + for pipeline, problem_path, dataset_doc_path, seed in self._modify(query_result,self.args): #save the pipeline to path and return pipeline path pipeline_path = download_from_database(pipeline, type_to_download='Pipeline') - logging.info('creating job') + evaluate_pipeline(pipeline_path=pipeline_path, problem_path=problem_path, + input_path=dataset_doc_path, data_random_seed=seed) job = queue.make_job(evaluate_pipeline, pipeline_path=pipeline_path, problem_path=problem_path, input_path=dataset_doc_path, data_random_seed=seed) self.num_complete += 1 - #check to run until the generator stops iterating (if no input for num_pipelines_to_run) - if (self.max_jobs): - if 
(self.num_complete > self.max_jobs): - raise StopIteration - return job - raise StopIteration - - - def __iter__(self): - return self + yield job + - def _query(self, args): - logging.info('logging') if (self.modifier_type=='random-seed'): return query_on_seeds(args.pipeline_id, args.seed_limit, args.submitter) if (self.modifier_type=='swap-primitive'): @@ -62,7 +71,7 @@ def _modify(self, query_args: dict, args): if self.modifier_type=='random-seed': return self._modify_random_seed(args.seed_limit, query_args) if self.modifier_type=='swap-primitive': - return self._modifiy_swap_primitive(args.swap_primitive_id, query_args) + return self._modify_swap_primitive(args.swap_primitive_id, query_args) else: raise ValueError("This type of modification is not yet an option") @@ -75,7 +84,7 @@ def _check_for_duplicates(self, pipeline_to_check, problem_ref_to_check): #query through the database for equal pipelines similar_pipeline_runs_in_database = query.generate_similar_pipeline_runs() for pipeline in similar_pipeline_runs_in_database: - if(pipeline_object.equals(pipeline)): + if (pipeline_object.equals(pipeline)): return True return False @@ -89,10 +98,19 @@ def _modify_random_seed(self, seed_limit, query_args): if (new_seed in used_seeds): continue num_run += 1 - used_seeds.add(new_seed) + used_seeds.append(new_seed) #yield the necessary job requirements yield query_args['pipeline'], query_args['problem_path'], query_args['dataset_doc_path'], new_seed + + def _run_seed_test(self,args): + with open('experimenter/pipelines/bagging_classification.json', 'r') as pipeline_file: + pipeline = json.load(pipeline_file) + dataset_path = utils.get_dataset_doc_path('185_baseball_MIN_METADATA_dataset') + problem_path = utils.get_problem_path('185_baseball_MIN_METADATA_problem') + used_seeds = [2,15] + yield {'pipeline': pipeline, 'problem_path': problem_path, 'dataset_doc_path': dataset_path, + 'tested_seeds': used_seeds } def _modify_swap_primitive(self, swap_pipeline, query_args): raise ValueError("No functionality for swapping primitives yet") diff --git a/experimenter/queue.py b/experimenter/queue.py index ede755e..81680da 100644 --- a/experimenter/queue.py +++ b/experimenter/queue.py @@ -7,7 +7,6 @@ import redis import rq -import logging from experimenter import config, exceptions, utils @@ -24,41 +23,45 @@ def get_queue(queue_name: str = _DEFAULT_QUEUE) -> rq.Queue: return rq.Queue(queue_name, connection=get_connection()) -def get_worker_message(workers: list, queue_name: str = _DEFAULT_QUEUE): +def get_worker_message(workers: list, queue): num_workers = len(workers) - message = 'number of workers on queue {}: {}'.format(queue_name, num_workers) + message = 'number of workers on queue {}: {}'.format(queue.name, num_workers) for it, worker in enumerate(workers): success = worker.successful_job_count fail = worker.failed_job_count - if (fail > 0): - failed_job = get_failed_job(queue_name=queue_name) - with open ('failed_job.txt', 'w') as failed_file: - failed_file.write(failed_job) message = message+'\n\t\t\t worker: {}'.format(it) message = message+'\n\t\t\t\t number of successful jobs: {}'.format(success) message = message+'\n\t\t\t\t number of failed jobs: {}'.format(fail) return message -def get_failed_job(queue_name='default', job_num=0): - conn = get_connection() +def get_failed_job(queue_name:str = _DEFAULT_QUEUE, job_num:int = 0): #pass name and connection - reg = rq.registry.FailedJobRegistry(name=queue_name, connection=conn) - print(len(reg)) + reg = rq.registry.FailedJobRegistry(name = 
queue_name, connection = get_connection()) job_ids = reg.get_job_ids() if (len(job_ids)<=0): - return "None" + return "None", reg job = job_ids[0] - job = Job.fetch(job, connection=conn) - return job.exc_info + job = rq.job.Job.fetch(job, connection=get_connection()) + return job.exc_info, reg + + +def save_failed_job(queue_name:str = _DEFAULT_QUEUE, job_num:int = 0): + if (queue_name is None): + queue_name = _DEFAULT_QUEUE + with open (os.path.join('/data',"failed_job_{}.txt".format(job_num)), 'w') as job_file: + job_file.write(get_failed_job(queue_name=queue_name, job_num=job_num)[0]) def get_queue_message(queues: list): queues_message = 'getting queues, jobs, and workers' for queue in queues: - queues_message = queues_message + '\n\t number of jobs on queue {}: {}'.format(queue, len(queue)) + queues_message = queues_message + '\n\t number of jobs on queue {}: {}'.format(queue.name, len(queue)) + _, reg = get_failed_job(queue.name) + num_fails = len(reg) + queues_message = queues_message + '\n\t number of failed jobs on queue {}: {}'.format(queue.name, num_fails) workers = rq.Worker.all(queue=queue) - queues_message = queues_message + '\n\t\t' + str(get_worker_message(workers=workers, queue_name=queue)) + queues_message = queues_message + '\n\t\t' + str(get_worker_message(workers=workers, queue=queue)) return queues_message @@ -71,12 +74,16 @@ def status() -> None: print(queues_message) -def empty(queue_name: str = None) -> None: +def empty(queue_name: str = None, empty_failed_queue: str = 'false') -> None: if queue_name is None: queue_name = _DEFAULT_QUEUE - queue = get_queue(queue_name) - queue.empty() - print(_EMPTIED_MESSAGE.format(queue_name)) + #empty the failed queue or just the normal one + if (empty_failed_queue == 'true'): + empty_failed(queue_name=queue_name) + else: + queue = get_queue(queue_name) + queue.empty() + print(_EMPTIED_MESSAGE.format(queue_name)) def _check_redis_connection() -> typing.Optional[Exception]: @@ -86,7 +93,19 @@ def _check_redis_connection() -> typing.Optional[Exception]: except redis.exceptions.RedisError as e: error = e return error - + + +def empty_failed(queue_name: str = None) -> None: + if queue_name is None: + queue_name = _DEFAULT_QUEUE + _, failed_queue = get_failed_job(queue_name=queue_name) + #loop through the jobs and remove them + conn = get_connection() + job_ids = failed_queue.get_job_ids() + for job_id in job_ids: + result = failed_queue.remove(job_id, delete_job=True) + print(_EMPTIED_MESSAGE.format(queue_name+str(' failed'))) + def make_job(f: typing.Callable, *args: typing.Any, **kwargs: typing.Any) -> typing.Dict[str, typing.Any]: return {'f':f, 'args': args, 'kwargs': kwargs} @@ -109,5 +128,4 @@ def enqueue_jobs( queue = rq.Queue(queue_name, connection=connection) for job in jobs: - print("Queueing Job - ") queue.enqueue(**job, job_timeout=job_timeout) diff --git a/experimenter/utils.py b/experimenter/utils.py index 16b0368..f88d0ce 100644 --- a/experimenter/utils.py +++ b/experimenter/utils.py @@ -20,7 +20,7 @@ def download_from_database(data, type_to_download: str = 'Pipeline'): if (type_to_download == 'Pipeline'): i_d = data['id'] - save_path = os.path.abspath(os.path.join(config.data_dir, 'Pipeline', i_d+str('.json'))) + save_path = os.path.join('/data', 'Pipeline', i_d+str('.json')) #create the new directory os.makedirs(os.path.dirname(save_path),exist_ok=True) #save the file to the directory From 9b3e6499b53a496cbb2fb05ed4a9e7c3b2dc4272 Mon Sep 17 00:00:00 2001 From: Benson Manner Date: Fri, 19 Mar 2021 10:09:15 -0600 
Subject: [PATCH 26/44] Working queue and pipeline run local --- docker-compose.yml | 5 +--- experimenter/evaluate_pipeline_new.py | 35 +++++++++++---------------- experimenter/modify_generator.py | 12 +++++---- experimenter/queue.py | 2 +- experimenter/utils.py | 26 ++++++++++++++++++-- 5 files changed, 47 insertions(+), 33 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index ae5baa0..61bbae1 100755 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -12,6 +12,7 @@ services: - default rq_worker: + image: 'd3m-experimenter:latest' env_file: - ./.env volumes: @@ -26,10 +27,6 @@ services: source: '${EXPERIMENTER_DIR}' target: /d3m-experimenter read_only: true - working_dir: /d3m-experimenter - image: 'd3m-experimenter:latest' - env_file: - - ./.env command: 'rq worker --url redis://${REDIS_HOST} ${RQ_QUEUES}' networks: - default diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/evaluate_pipeline_new.py index ff30a36..65e68e5 100644 --- a/experimenter/evaluate_pipeline_new.py +++ b/experimenter/evaluate_pipeline_new.py @@ -8,8 +8,7 @@ from experimenter import config, utils from d3m import cli as d3m_cli -from d3m.contrib.pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_PATH as TSPP -from d3m.contrib.pipelines import SCORING_PIPELINE_PATH as SPP +from d3m.contrib.pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_PATH as data_split_file from experimenter.databases.d3m_mtl import D3MMtLDB @@ -38,7 +37,7 @@ def save_pipeline_run_to_d3m_db(pipeline_run_path: str): def evaluate_pipeline_on_problem(pipeline_path: str, problem_path: str, input_path: str, - data_random_seed: int): + random_seed: int): """ Evaluate pipeline on problem. A less verbose form of running d3m's runtime cli 'evaluate' command. @@ -53,8 +52,8 @@ def evaluate_pipeline_on_problem(pipeline_path: str, path to problem doc input_path : path_like str path to input full data - data_random_seed : int - random seed to be used for data preparation + random_seed : int + random seed to be used for pipeline run Returns: ---------- @@ -72,24 +71,22 @@ def evaluate_pipeline_on_problem(pipeline_path: str, with open(problem_path, 'r') as data: problem = json.load(data) output_run_path.append(problem['about']['problemID']) - with open(input_path, 'r') as data: - input_f = json.load(data) - output_run_path.append(input_f['about']['digest']) + output_run_path.append(str(random_seed)) #get the output run path - output_run_path = os.path.abspath(os.path.join(os.getenv('DATA_DIR'), 'Pipeline_Run', + output_run_path = os.path.abspath(os.path.join('/data', 'Pipeline_Run', '_'.join(output_run_path)+'.json')) #create the directory os.makedirs(os.path.dirname(output_run_path),exist_ok=True) #evaluate pipeline evaluate_pipeline_via_d3m_cli(pipeline=pipeline_path, problem=problem_path, input=input_path, output_run=output_run_path, - data_random_seed=data_random_seed) + random_seed=random_seed) def evaluate_pipeline_via_d3m_cli(pipeline: str, problem: str, input: str, output_run: str, - data_random_seed: int): + random_seed: int): """ Evaluate pipeline on problem using d3m's runtime cli. Wrapper function to execute d3m's runtime cli 'evaluate' command. @@ -107,9 +104,9 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, path where pipeline_run doc will be saved. 
use '-' for stdin - data_random_seed : int - random seed to use for - data preparation + random_seed : int + random seed to used for + pipeline run Return: ------- @@ -121,7 +118,7 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, when parameter value is invalid """ - args = ['d3m', 'runtime', 'evaluate'] + args = ['d3m', 'runtime','--random-seed', str(random_seed), 'evaluate'] if (not os.path.isfile(pipeline)): raise ValueError('\'{}\' param not a file path'.format('pipeline')) @@ -132,19 +129,15 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, if (not os.path.isfile(input)): raise ValueError('\'{}\' param not a file path'.format('input')) - if (not os.path.isfile(TSPP)): + if (not os.path.isfile(data_split_file)): raise ValueError('\'{}\' pipeline not a file path'.format('data split')) - if (not os.path.isfile(SPP)): - raise ValueError('\'{}\' pipeline not a file path'.format('scoring')) args.extend(('--pipeline', pipeline)) args.extend(('--problem', problem)) args.extend(('--input', input)) args.extend(('--output-run', output_run)) - args.extend(('--data-random-seed', str(data_random_seed))) - args.extend(('--data-pipeline', TSPP)) - args.extend(('--scoring-pipeline', SPP)) + args.extend(('--data-pipeline', data_split_file)) d3m_cli.main(args) if (config.save_to_d3m is True): save_pipeline_run_to_d3m_db(output_run) diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py index 6f4ed55..ea1a650 100644 --- a/experimenter/modify_generator.py +++ b/experimenter/modify_generator.py @@ -47,13 +47,15 @@ def _get_generator(self): for pipeline, problem_path, dataset_doc_path, seed in self._modify(query_result,self.args): #save the pipeline to path and return pipeline path pipeline_path = download_from_database(pipeline, type_to_download='Pipeline') - evaluate_pipeline(pipeline_path=pipeline_path, problem_path=problem_path, - input_path=dataset_doc_path, data_random_seed=seed) + #catch error returning none for file paths + if (problem_path is None or dataset_doc_path is None): + continue + #create the job if file paths are returned from query job = queue.make_job(evaluate_pipeline, pipeline_path=pipeline_path, problem_path=problem_path, input_path=dataset_doc_path, - data_random_seed=seed) + random_seed=seed) self.num_complete += 1 yield job @@ -98,7 +100,7 @@ def _modify_random_seed(self, seed_limit, query_args): if (new_seed in used_seeds): continue num_run += 1 - used_seeds.append(new_seed) + used_seeds.add(new_seed) #yield the necessary job requirements yield query_args['pipeline'], query_args['problem_path'], query_args['dataset_doc_path'], new_seed @@ -108,7 +110,7 @@ def _run_seed_test(self,args): pipeline = json.load(pipeline_file) dataset_path = utils.get_dataset_doc_path('185_baseball_MIN_METADATA_dataset') problem_path = utils.get_problem_path('185_baseball_MIN_METADATA_problem') - used_seeds = [2,15] + used_seeds = {2,15} yield {'pipeline': pipeline, 'problem_path': problem_path, 'dataset_doc_path': dataset_path, 'tested_seeds': used_seeds } diff --git a/experimenter/queue.py b/experimenter/queue.py index 0813b8a..a62bfa2 100644 --- a/experimenter/queue.py +++ b/experimenter/queue.py @@ -83,7 +83,7 @@ def enqueue(job, queue_name: str = _DEFAULT_QUEUE, job_timeout: int = None) -> r return q.enqueue(**job, job_timeout=job_timeout) -def empty(queue_name: str = None) -> None: +def empty(queue_name: str = None, empty_failed_queue: str = 'false') -> None: if queue_name is None: queue_name = _DEFAULT_QUEUE #empty the failed queue or just the normal one diff --git 
a/experimenter/utils.py b/experimenter/utils.py index f88d0ce..f22b081 100644 --- a/experimenter/utils.py +++ b/experimenter/utils.py @@ -16,6 +16,18 @@ DEFAULT_DATASET_DIR = "/datasets" datasets, problems = None, None +def save_to_not_exist_file(filename:str = 'dataset_dne.txt', save_id:str = None): + #create the directory + os.makedirs(os.path.join('/data','DoesNotExist'),exist_ok=True) + #get the tag to write or append + if (os.path.exists(os.path.join('/data','DoesNotExist',filename))): + tag = 'a' # append to file + else: + tag = 'w' # write and create + #append the non existing value to the file + with open(os.path.join('/data','DoesNotExist',filename),tag) as to_save: + to_save.write(save_id+'\n') + def download_from_database(data, type_to_download: str = 'Pipeline'): if (type_to_download == 'Pipeline'): @@ -43,7 +55,12 @@ def get_dataset_doc_path(dataset_id: str, datasets_dir: str=None) -> str: if datasets_dir is None: datasets_dir = os.getenv('DATASETS', DEFAULT_DATASET_DIR) datasets, problems = get_datasets_and_problems(datasets_dir) - return datasets[dataset_id] + try: + return datasets[dataset_id] + except: + #save to dataset id does not exist file + save_to_not_exist_file('dataset_dne.txt', dataset_id) + return None def get_dataset_doc(dataset_id: str, datasets_dir: str=None) -> dict: @@ -71,7 +88,12 @@ def get_problem_path(problem_id: str, datasets_dir: str=None) -> str: if datasets_dir is None: datasets_dir = os.getenv('DATASETS', DEFAULT_DATASET_DIR) datasets, problems = get_datasets_and_problems(datasets_dir) - return problems[problem_id] + try: + return problems[problem_id] + except: + #save to problem id does not exist file + save_to_not_exist_file('problem_dne.txt', problem_id) + return None def get_problem(problem_path: str, *, parse: bool = True) -> dict: From a53478c85c21299585ebc04644dab77158a20a66 Mon Sep 17 00:00:00 2001 From: Eric Manner Date: Fri, 19 Mar 2021 16:45:02 +0000 Subject: [PATCH 27/44] Minor typos, need to update queue and query --- config-example.ini | 17 ----------------- experimenter/queue.py | 1 + 2 files changed, 1 insertion(+), 17 deletions(-) delete mode 100644 config-example.ini diff --git a/config-example.ini b/config-example.ini deleted file mode 100644 index 5ab6c77..0000000 --- a/config-example.ini +++ /dev/null @@ -1,17 +0,0 @@ -[MAIN] -CACHE_DIR = /d3m-experimenter - -[REDIS] -HOST = localhost -PORT = 6379 -DASHBOARD_PORT = 9181 -# the following should not be changed -DATA_DIR = redis -DOCKER_IMAGE_NAME = redis:latest -DOCKER_PORT = 6379 -DOCKER_DATA_DIR = /data - -[D3MINFO] -D3M_DB_SUBMITTER = {SUBMITTER_NAME} -D3M_DB_TOKEN = {UNIQUE_TOKEN} -SAVE_TO_D3M = True diff --git a/experimenter/queue.py b/experimenter/queue.py index a62bfa2..be615cd 100644 --- a/experimenter/queue.py +++ b/experimenter/queue.py @@ -2,6 +2,7 @@ import redis import rq +import os from experimenter import config, exceptions From d0fe148a6c0d32b20de60739e9478a3c10fcf542 Mon Sep 17 00:00:00 2001 From: Benson Manner Date: Fri, 19 Mar 2021 12:45:12 -0600 Subject: [PATCH 28/44] Updates beginning datapreparation functionality --- experimenter/cli.py | 7 +++--- experimenter/evaluate_pipeline_new.py | 22 +++++++++++----- experimenter/modify_generator.py | 36 ++++++++++++++++++--------- experimenter/query.py | 10 ++++++-- experimenter/queue.py | 4 +-- experimenter/utils.py | 13 ++++++---- 6 files changed, 61 insertions(+), 31 deletions(-) diff --git a/experimenter/cli.py b/experimenter/cli.py index c27947f..fb1d996 100644 --- a/experimenter/cli.py +++ b/experimenter/cli.py 
@@ -50,7 +50,7 @@ def configure_queue_parser(parser: argparse.ArgumentParser) -> None: empty_parser = subparsers.add_parser('empty', help='remove all jobs from a queue') empty_parser.add_argument('-q', '--queue-name', help='the name of the queue to empty') - empty_parser.add_argument('-f', '--failed', default='false', help='remove the failed queue') + empty_parser.add_argument('-f', '--failed', help='remove the failed queue', action='store_true') #save a failed traceback parser save_failed_parser = subparsers.add_parser('save-failed', help='save failed job error output') @@ -141,15 +141,14 @@ def configure_modify_parser(parser: argparse.ArgumentParser) -> None: default=None, type=str) swap_seed_subparser.add_argument( - '--seed_limit', + '--seed-limit', help='The amount of random seeds that each ran pipeline will have at the end of the test', default=2, type=int) swap_seed_subparser.add_argument( '--test', help='run the test instead of random pipeline generation', - default='false', - type=str) + action='store_true') #Primitive swapper functionality primitive_swap_subparser = subparsers.add_parser( diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/evaluate_pipeline_new.py index 65e68e5..c5608de 100644 --- a/experimenter/evaluate_pipeline_new.py +++ b/experimenter/evaluate_pipeline_new.py @@ -37,7 +37,9 @@ def save_pipeline_run_to_d3m_db(pipeline_run_path: str): def evaluate_pipeline_on_problem(pipeline_path: str, problem_path: str, input_path: str, - random_seed: int): + random_seed: int, + data_pipeline_path: str=data_split_file, + data_random_seed: int=0): """ Evaluate pipeline on problem. A less verbose form of running d3m's runtime cli 'evaluate' command. @@ -80,13 +82,16 @@ def evaluate_pipeline_on_problem(pipeline_path: str, #evaluate pipeline evaluate_pipeline_via_d3m_cli(pipeline=pipeline_path, problem=problem_path, input=input_path, output_run=output_run_path, - random_seed=random_seed) + random_seed=random_seed, data_pipeline_path = data_pipeline_path, + data_random_seed=data_random_seed) def evaluate_pipeline_via_d3m_cli(pipeline: str, problem: str, input: str, output_run: str, - random_seed: int): + random_seed: int, + data_pipeline_path: str=data_split_file, + data_random_seed: int=0): """ Evaluate pipeline on problem using d3m's runtime cli. Wrapper function to execute d3m's runtime cli 'evaluate' command. 
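Since the wrapper below mirrors d3m's runtime CLI, a hedged usage sketch may help; the paths here are illustrative placeholders rather than files from the repository, and the keyword names simply follow the signature above (data_pipeline_path and data_random_seed fall back to the k-fold tabular split defaults):

    from experimenter.evaluate_pipeline_new import evaluate_pipeline_via_d3m_cli

    # Assembles and runs roughly: d3m runtime --random-seed 42 evaluate
    #   --pipeline ... --problem ... --input ... --output-run ...
    #   --data-pipeline ... --data-random-seed ...
    evaluate_pipeline_via_d3m_cli(
        pipeline='pipeline.json',        # placeholder path
        problem='problemDoc.json',       # placeholder path
        input='datasetDoc.json',         # placeholder path
        output_run='pipeline_run.yaml',  # placeholder path
        random_seed=42,
    )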
@@ -107,6 +112,10 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, random_seed : int random seed to used for pipeline run + data_pipeline_path: str + path to data prepation pipeline + data_random_seed: int + random_seed to be used in data preparation Return: ------- @@ -117,7 +126,7 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, ValueError when parameter value is invalid - """ + """ args = ['d3m', 'runtime','--random-seed', str(random_seed), 'evaluate'] if (not os.path.isfile(pipeline)): @@ -129,7 +138,7 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, if (not os.path.isfile(input)): raise ValueError('\'{}\' param not a file path'.format('input')) - if (not os.path.isfile(data_split_file)): + if (not os.path.isfile(data_pipeline_path)): raise ValueError('\'{}\' pipeline not a file path'.format('data split')) @@ -137,7 +146,8 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, args.extend(('--problem', problem)) args.extend(('--input', input)) args.extend(('--output-run', output_run)) - args.extend(('--data-pipeline', data_split_file)) + args.extend(('--data-pipeline', data_pipeline_path)) + args.extend(('--data-random-seed', data_random_seed)) d3m_cli.main(args) if (config.save_to_d3m is True): save_pipeline_run_to_d3m_db(output_run) diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py index ea1a650..3f8f6d8 100644 --- a/experimenter/modify_generator.py +++ b/experimenter/modify_generator.py @@ -3,6 +3,7 @@ from experimenter.utils import download_from_database import d3m.metadata.pipeline from random import randint +from d3m.contrib.pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_PATH as data_split_file import json from experimenter.evaluate_pipeline_new import evaluate_pipeline_on_problem as evaluate_pipeline @@ -17,7 +18,7 @@ def __init__(self, modify_type: str='random-seed', max_jobs: int=None, args=None self.max_jobs = max_jobs self.num_complete = 0 #run the query on initializing to define the query results - if (args.test == 'true'): + if (args.test is True): self.query_results = self._run_seed_test(self.args) else: self.query_results = self._query(self.args) @@ -29,22 +30,20 @@ def __iter__(self): def __next__(self): #iterate through query results - job = self.next() + job = next(self.generator) if (self.max_jobs): if (self.num_complete > self.max_jobs): raise StopIteration return job - - - def next(self): - #iterate through query results - return next(self.generator) - + def _get_generator(self): + """ + Main generator to be used of ModifyGenerator class + """ for query_result in self.query_results: #iterate through modifier results - for pipeline, problem_path, dataset_doc_path, seed in self._modify(query_result,self.args): + for pipeline, problem_path, dataset_doc_path, random_seed in self._modify(query_result,self.args): #save the pipeline to path and return pipeline path pipeline_path = download_from_database(pipeline, type_to_download='Pipeline') #catch error returning none for file paths @@ -55,12 +54,14 @@ def _get_generator(self): pipeline_path=pipeline_path, problem_path=problem_path, input_path=dataset_doc_path, - random_seed=seed) + random_seed=random_seed) self.num_complete += 1 yield job def _query(self, args): + """method for querying database according to pipeline modification type + """ if (self.modifier_type=='random-seed'): return query_on_seeds(args.pipeline_id, args.seed_limit, args.submitter) if (self.modifier_type=='swap-primitive'): @@ -70,6 +71,8 @@ def _query(self, args): def _modify(self, query_args: dict, args): + 
"""Handler for different types of pipeline modification tasks + """ if self.modifier_type=='random-seed': return self._modify_random_seed(args.seed_limit, query_args) if self.modifier_type=='swap-primitive': @@ -79,7 +82,8 @@ def _modify(self, query_args: dict, args): def _check_for_duplicates(self, pipeline_to_check, problem_ref_to_check): - """Pseudo function/method for duplicate checking - this is not complete + """Pseudo function/method for duplicate checking + - This function is not complete and will be used for future generation type jobs """ #create the pipeline to check for duplicates from the path pipeline_object = d3m.metadata.pipeline.Pipeline.from_json(pipeline_to_check) @@ -92,6 +96,9 @@ def _check_for_duplicates(self, pipeline_to_check, problem_ref_to_check): def _modify_random_seed(self, seed_limit, query_args): + """Generates new seeds for a given pipeline, problem, and dataset + It is dependent on the seed limit for how many it will generate + """ used_seeds = query_args['tested_seeds'] num_run = len(used_seeds) #run until the right number of seeds have been run @@ -106,13 +113,18 @@ def _modify_random_seed(self, seed_limit, query_args): def _run_seed_test(self,args): + """ Test designed for development and functionality purposes. + It uses and dataset and pipeline that is saved in d3m-experimenter + """ with open('experimenter/pipelines/bagging_classification.json', 'r') as pipeline_file: pipeline = json.load(pipeline_file) dataset_path = utils.get_dataset_doc_path('185_baseball_MIN_METADATA_dataset') problem_path = utils.get_problem_path('185_baseball_MIN_METADATA_problem') + data_random_seed = 0 used_seeds = {2,15} yield {'pipeline': pipeline, 'problem_path': problem_path, 'dataset_doc_path': dataset_path, - 'tested_seeds': used_seeds } + 'tested_seeds': used_seeds} + def _modify_swap_primitive(self, swap_pipeline, query_args): raise ValueError("No functionality for swapping primitives yet") diff --git a/experimenter/query.py b/experimenter/query.py index 2a17228..dfc5ff5 100644 --- a/experimenter/query.py +++ b/experimenter/query.py @@ -66,7 +66,14 @@ def query_on_seeds(pipeline_id: str=None, limit: int=None, submitter: str='byu') for (problem_id, dataset_id), random_seeds in results.items(): if limit and len(random_seeds) > limit: continue - yield {'pipeline': pipeline.to_dict(), 'problem_path': get_problem_path(problem_id), 'dataset_doc_path': get_dataset_doc_path(dataset_id), 'tested_seeds': random_seeds} + yield {'pipeline': pipeline.to_dict(), 'problem_path': get_problem_path(problem_id), + 'dataset_doc_path':get_dataset_doc_path(dataset_id), 'tested_seeds': random_seeds} + +def get_data_preparation_pipeline(data_pred_id: str=None): + data_prep_search = Search(using=CONNECTION, index='pipelines') + data_prep_search = data_prep_search.query('match', id=data_prep_id) + data_prep_pipeline = next(data_prep_search.scan()) + return data_prep_pipeline def scan_pipeline_runs(pipeline_id, submitter=None): pipeline_run_search = Search(using=CONNECTION, index='pipeline_runs') \ @@ -75,7 +82,6 @@ def scan_pipeline_runs(pipeline_id, submitter=None): .query('match', status__state='SUCCESS') if submitter: pipeline_run_search = pipeline_run_search.query('match', _submitter=submitter) - results = dict() for pipeline_run in pipeline_run_search.scan(): for dataset in pipeline_run.datasets: diff --git a/experimenter/queue.py b/experimenter/queue.py index a62bfa2..689f1db 100644 --- a/experimenter/queue.py +++ b/experimenter/queue.py @@ -83,11 +83,11 @@ def enqueue(job, queue_name: 
str = _DEFAULT_QUEUE, job_timeout: int = None) -> r return q.enqueue(**job, job_timeout=job_timeout) -def empty(queue_name: str = None, empty_failed_queue: str = 'false') -> None: +def empty(queue_name: str = None, empty_failed_queue: bool = False) -> None: if queue_name is None: queue_name = _DEFAULT_QUEUE #empty the failed queue or just the normal one - if (empty_failed_queue == 'true'): + if (empty_failed_queue is True): empty_failed(queue_name=queue_name) else: queue = get_queue(queue_name) diff --git a/experimenter/utils.py b/experimenter/utils.py index f22b081..a995e18 100644 --- a/experimenter/utils.py +++ b/experimenter/utils.py @@ -33,13 +33,16 @@ def download_from_database(data, type_to_download: str = 'Pipeline'): if (type_to_download == 'Pipeline'): i_d = data['id'] save_path = os.path.join('/data', 'Pipeline', i_d+str('.json')) - #create the new directory - os.makedirs(os.path.dirname(save_path),exist_ok=True) - #save the file to the directory - with open(save_path, 'w') as to_save: - json.dump(data, to_save, indent=4) + elif (type_to_download == 'Preparation'): + save_path = os.path.join('/data', 'DataPreparation', data['id']+str('.json')) else: raise ValueError("type: {}, not available for download".format(type_to_download)) + #create the new directory + os.makedirs(os.path.dirname(save_path),exist_ok=True) + #save the file to the directory + with open(save_path, 'w') as to_save: + json.dump(data, to_save, indent=4) + #return the location return save_path From 8c26b4ad4d75fb00b42099cbfef00e385530d3de Mon Sep 17 00:00:00 2001 From: Benson Manner Date: Mon, 22 Mar 2021 13:43:52 -0600 Subject: [PATCH 29/44] Working with test and when the data preparation is explicitly defined in the pipeline run file --- experimenter/evaluate_pipeline_new.py | 12 +-- experimenter/modify_generator.py | 32 +++++-- experimenter/query.py | 126 +++++++++++--------------- experimenter/utils.py | 2 +- 4 files changed, 82 insertions(+), 90 deletions(-) diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/evaluate_pipeline_new.py index c5608de..929a3c4 100644 --- a/experimenter/evaluate_pipeline_new.py +++ b/experimenter/evaluate_pipeline_new.py @@ -76,7 +76,7 @@ def evaluate_pipeline_on_problem(pipeline_path: str, output_run_path = os.path.abspath(os.path.join('/data', 'Pipeline_Run', - '_'.join(output_run_path)+'.json')) + '_'.join(output_run_path)+'.yaml')) #create the directory os.makedirs(os.path.dirname(output_run_path),exist_ok=True) #evaluate pipeline @@ -96,6 +96,8 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, Evaluate pipeline on problem using d3m's runtime cli. Wrapper function to execute d3m's runtime cli 'evaluate' command. Arguments mirror the same arguments using the cli. + Only handles cases with a data preparation pipeline in the + pipeline run. 
Parameters ---------- @@ -116,6 +118,7 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, path to data prepation pipeline data_random_seed: int random_seed to be used in data preparation + input_run: path to pipeline run file Return: ------- @@ -127,8 +130,6 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, when parameter value is invalid """ - args = ['d3m', 'runtime','--random-seed', str(random_seed), 'evaluate'] - if (not os.path.isfile(pipeline)): raise ValueError('\'{}\' param not a file path'.format('pipeline')) @@ -138,10 +139,7 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, if (not os.path.isfile(input)): raise ValueError('\'{}\' param not a file path'.format('input')) - if (not os.path.isfile(data_pipeline_path)): - raise ValueError('\'{}\' pipeline not a file path'.format('data split')) - - + args = ['d3m', 'runtime','--random-seed', str(random_seed), 'evaluate'] args.extend(('--pipeline', pipeline)) args.extend(('--problem', problem)) args.extend(('--input', input)) diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py index 3f8f6d8..2c3ea67 100644 --- a/experimenter/modify_generator.py +++ b/experimenter/modify_generator.py @@ -1,10 +1,11 @@ -from experimenter.query import query_on_seeds, query_on_primitive +from experimenter.query import query_on_seeds from experimenter import queue, utils from experimenter.utils import download_from_database import d3m.metadata.pipeline from random import randint from d3m.contrib.pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_PATH as data_split_file import json +import yaml from experimenter.evaluate_pipeline_new import evaluate_pipeline_on_problem as evaluate_pipeline class ModifyGenerator: @@ -40,21 +41,28 @@ def __next__(self): def _get_generator(self): """ Main generator to be used of ModifyGenerator class + Can only handle cases where there is a data preparation + pipeline in the pipeline run """ for query_result in self.query_results: #iterate through modifier results - for pipeline, problem_path, dataset_doc_path, random_seed in self._modify(query_result,self.args): + for pipeline, problem_path, dataset_doc, random_seed, prep in self._modify(query_result,self.args): #save the pipeline to path and return pipeline path + data_prep_pipeline, data_random_seed = prep pipeline_path = download_from_database(pipeline, type_to_download='Pipeline') - #catch error returning none for file paths - if (problem_path is None or dataset_doc_path is None): + if (data_prep_pipeline is not None): + data_prep_pipeline = download_from_database(data_prep_pipeline, type_to_download='Preparation') + #catch error returning none for file paths or preparation pipeline + #TODO get data preparation pipeline even when it is not explicitly defined + if (problem_path is None or dataset_doc is None or data_prep_pipeline is None): continue - #create the job if file paths are returned from query job = queue.make_job(evaluate_pipeline, pipeline_path=pipeline_path, problem_path=problem_path, - input_path=dataset_doc_path, - random_seed=random_seed) + input_path=dataset_doc, + random_seed=random_seed, + data_pipeline_path=data_prep_pipeline, + data_random_seed=data_random_seed) self.num_complete += 1 yield job @@ -109,7 +117,7 @@ def _modify_random_seed(self, seed_limit, query_args): num_run += 1 used_seeds.add(new_seed) #yield the necessary job requirements - yield query_args['pipeline'], query_args['problem_path'], query_args['dataset_doc_path'], new_seed + yield query_args['pipeline'], query_args['problem_path'], 
query_args['dataset_doc_path'], new_seed, (query_args['data_prep_pipeline'], query_args['data_prep_seed']) def _run_seed_test(self,args): @@ -120,10 +128,14 @@ def _run_seed_test(self,args): pipeline = json.load(pipeline_file) dataset_path = utils.get_dataset_doc_path('185_baseball_MIN_METADATA_dataset') problem_path = utils.get_problem_path('185_baseball_MIN_METADATA_problem') - data_random_seed = 0 + data_prep_seed = 0 + with open(data_split_file, 'r') as pipeline_file: + data_prep_pipeline = yaml.full_load(pipeline_file) + data_prep_pipeline = data_prep_pipeline used_seeds = {2,15} yield {'pipeline': pipeline, 'problem_path': problem_path, 'dataset_doc_path': dataset_path, - 'tested_seeds': used_seeds} + 'tested_seeds': used_seeds, 'data_prep_pipeline': + data_prep_pipeline, 'data_prep_seed': data_prep_seed} def _modify_swap_primitive(self, swap_pipeline, query_args): diff --git a/experimenter/query.py b/experimenter/query.py index dfc5ff5..75e750e 100644 --- a/experimenter/query.py +++ b/experimenter/query.py @@ -5,87 +5,69 @@ HOST = 'https://metalearning.datadrivendiscovery.org/es' CONNECTION = Elasticsearch(hosts=[HOST], timeout=300) -def query_on_primitive(primitive_id: str, limit_indexes=False): - '''Queries the metalearning database for pipelines using the specified primitive. - Queries the metalearning database using the Elasticsearch endpoint documented - on D3M's website (see https://metalearning.datadrivendiscovery.org for more - info). Finds all pipelines containing a certain primitive as specified by the - keyword argument. Also determines the index(es) of that primitive in each - matching pipeline and gets the datasets that were used in pipeline runs. - - Arguments - --------- - primitive_id : str - A primitive's unique id. - limit_indexes : 'first', 'last', or False (default) - Limits which index of the primitive is returned for each pipeline match. - Use 'first' to get the index of the first matching primitive specified by - the keyword arg. Use 'last' to get the index of the last match. Use False - (default) to get a list of all indexes for each pipeline specifying where - the primitive is. - - Yields - ------- - A list of tuples where each tuple contains (in this order): - 1. a matching pipeline - 2. the index(es) of the desired primitives in the given pipeline's steps - 3. a dictionary containing the datasets used in pipeline runs where the key - is the dataset digest and the value is the dataset id (human-readable string). - 4. the random seeds used in pipeline runs. 
- ''' - - if limit_indexes not in { 'first', 'last', False }: - raise ValueError(f'Invalid value "{limit_indexes}" for arg limit_indexes') - - match_query = Q('match', steps__primitive__id=primitive_id) - nested_query = Q('nested', path='steps', query=match_query) - pipeline_search = Search(using=CONNECTION, index='pipelines').query(nested_query) - - for pipeline in pipeline_search.scan(): - results = scan_pipeline_runs(pipeline.id) - - locs = [i for i, step in enumerate(pipeline.steps) if primitive_id == step.primitive.id] - if limit_indexes == 'last': - locs = locs[-1] - elif limit_indexes == 'first': - locs = locs[0] - - for (problem_id, dataset_name), random_seeds in results.items(): - yield {'pipeline': pipeline.id, 'problem_path': get_problem_path(problem_id), 'location': locs, 'dataset_doc_path': get_dataset_doc_path(dataset_id), 'tested_seeds': random_seeds} +def get_search_query(arguments: dict = None, connection = CONNECTION, index='pipelines'): + index_search = Search(using=CONNECTION, index=index) + if arguments['id'] is not None: + index_search = index_search.query('match', id=arguments['id']) + if arguments['submitter'] is not None: + index_search = index_search.query('match', _submitter=arguments['submitter']) + return index_search + def query_on_seeds(pipeline_id: str=None, limit: int=None, submitter: str='byu'): - pipeline_search = Search(using=CONNECTION, index='pipelines') - if pipeline_id: - pipeline_search = pipeline_search.query('match', id=pipeline_id) - if submitter: - pipeline_search = pipeline_search.query('match', _submitter=submitter) - - for pipeline in pipeline_search.scan(): - results = scan_pipeline_runs(pipeline.id, submitter) - for (problem_id, dataset_id), random_seeds in results.items(): - if limit and len(random_seeds) > limit: - continue - yield {'pipeline': pipeline.to_dict(), 'problem_path': get_problem_path(problem_id), - 'dataset_doc_path':get_dataset_doc_path(dataset_id), 'tested_seeds': random_seeds} + arguments = {'id': pipeline_id, 'submitter': submitter} + pipeline_search = get_search_query(arguments=arguments, index='pipelines') + for pipeline in pipeline_search.scan(): + results = scan_pipeline_runs(pipeline.id, submitter) + for (problem_id, dataset_id, data_prep), random_seeds in results.items(): + if limit and len(random_seeds) > limit: + continue + #data_prep_pipeline, data_prep_seed = data_prep + input_run = data_prep[0] + yield {'pipeline': pipeline.to_dict(), 'problem_path': get_problem_path(problem_id), + 'dataset_doc_path':get_dataset_doc_path(dataset_id), 'tested_seeds': random_seeds, + 'data_prep_pipeline': data_prep_pipeline, 'data_prep_seed': data_prep_seed} + def get_data_preparation_pipeline(data_pred_id: str=None): - data_prep_search = Search(using=CONNECTION, index='pipelines') - data_prep_search = data_prep_search.query('match', id=data_prep_id) + arguments = {'submitter': None, 'id': data_prep_id} + data_prep_search = get_search_query(arguments=arguments) data_prep_pipeline = next(data_prep_search.scan()) return data_prep_pipeline + +def check_for_data_prep(pipeline_run=None): + """Only handles cases with an explicit data preparation pipeline + in the pipeline run + """ + data_prep = None + data_prep_pipeline = None + data_prep_seed = None + try: + data_prep = pipeline_run.run.data_preparation + except: + data_prep = None + data_prep_seed = None + if (data_prep is not None): + data_prep_seed = data_prep.random_seed + data_prep_pipeline = get_data_preparation_pipeline(data_prep.pipeline.id) + return data_prep_pipeline, 
data_prep_seed + + def scan_pipeline_runs(pipeline_id, submitter=None): - pipeline_run_search = Search(using=CONNECTION, index='pipeline_runs') \ + pipeline_run_search = Search(using=CONNECTION, index='pipeline_runs') \ .query('match', pipeline__id=pipeline_id) \ .query('match', run__phase='PRODUCE') \ .query('match', status__state='SUCCESS') - if submitter: - pipeline_run_search = pipeline_run_search.query('match', _submitter=submitter) - results = dict() - for pipeline_run in pipeline_run_search.scan(): - for dataset in pipeline_run.datasets: - dataset_prob_tuple = (pipeline_run.problem.id, dataset.id) - results[dataset_prob_tuple] = results.get(dataset_prob_tuple, set()) - results[dataset_prob_tuple].add(pipeline_run.random_seed) - return results + if submitter: + pipeline_run_search = pipeline_run_search.query('match', _submitter=submitter) + results = dict() + for pipeline_run in pipeline_run_search.scan(): + data_prep_pipeline, data_prep_seed = check_for_data_prep(pipeline_run=pipeline_run) + for dataset in pipeline_run.datasets: + dataset_prob_tuple = (pipeline_run.problem.id, dataset.id, (data_prep_pipeline, data_prep_seed)) + results[dataset_prob_tuple] = results.get(dataset_prob_tuple, set()) + results[dataset_prob_tuple].add(pipeline_run.random_seed) + return results + diff --git a/experimenter/utils.py b/experimenter/utils.py index a995e18..830ad9c 100644 --- a/experimenter/utils.py +++ b/experimenter/utils.py @@ -30,8 +30,8 @@ def save_to_not_exist_file(filename:str = 'dataset_dne.txt', save_id:str = None) def download_from_database(data, type_to_download: str = 'Pipeline'): + i_d = data['id'] if (type_to_download == 'Pipeline'): - i_d = data['id'] save_path = os.path.join('/data', 'Pipeline', i_d+str('.json')) elif (type_to_download == 'Preparation'): save_path = os.path.join('/data', 'DataPreparation', i_d+str('.json')) From 614be9136bb71c867bf88baebc2eced83777d05f Mon Sep 17 00:00:00 2001 From: Eric Manner Date: Mon, 22 Mar 2021 22:36:50 +0000 Subject: [PATCH 30/44] Bug fixes for remote work --- experimenter/evaluate_pipeline_new.py | 12 +++++++----- experimenter/modify_generator.py | 8 +++++++- experimenter/query.py | 21 ++++++++++++--------- experimenter/queue.py | 15 ++++++++++----- 4 files changed, 36 insertions(+), 20 deletions(-) diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/evaluate_pipeline_new.py index 929a3c4..279c5ef 100644 --- a/experimenter/evaluate_pipeline_new.py +++ b/experimenter/evaluate_pipeline_new.py @@ -1,5 +1,6 @@ import itertools as it import json +import yaml import os import parser @@ -8,7 +9,8 @@ from experimenter import config, utils from d3m import cli as d3m_cli -from d3m.contrib.pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_PATH as data_split_file +from d3m.contrib.pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_PATH as k_fold_split_path +from d3m.contrib.pipelines import FIXED_SPLIT_TABULAR_SPLIT_PIPELINE_PATH as fixed_split_path from experimenter.databases.d3m_mtl import D3MMtLDB @@ -31,14 +33,14 @@ def save_pipeline_run_to_d3m_db(pipeline_run_path: str): """ d3m_db = D3MMtLDB() with open(pipeline_run_path) as pipeline_data: - pipeline_run = json.load(pipeline_data) + pipeline_run = yaml.full_load(pipeline_data) return D3MMtLDB().save_pipeline_run(pipeline_run) def evaluate_pipeline_on_problem(pipeline_path: str, problem_path: str, input_path: str, random_seed: int, - data_pipeline_path: str=data_split_file, + data_pipeline_path: str=k_fold_split_path, data_random_seed: int=0): """ Evaluate pipeline on problem. 
@@ -90,7 +92,7 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, input: str, output_run: str, random_seed: int, - data_pipeline_path: str=data_split_file, + data_pipeline_path: str=k_fold_split_path, data_random_seed: int=0): """ Evaluate pipeline on problem using d3m's runtime cli. @@ -144,7 +146,7 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, args.extend(('--problem', problem)) args.extend(('--input', input)) args.extend(('--output-run', output_run)) - args.extend(('--data-pipeline', data_pipeline_path)) + args.extend(('--data-pipeline', fixed_split_path)) args.extend(('--data-random-seed', data_random_seed)) d3m_cli.main(args) if (config.save_to_d3m is True): diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py index 2c3ea67..70b4d58 100644 --- a/experimenter/modify_generator.py +++ b/experimenter/modify_generator.py @@ -54,8 +54,14 @@ def _get_generator(self): data_prep_pipeline = download_from_database(data_prep_pipeline, type_to_download='Preparation') #catch error returning none for file paths or preparation pipeline #TODO get data preparation pipeline even when it is not explicitly defined - if (problem_path is None or dataset_doc is None or data_prep_pipeline is None): + if (problem_path is None or dataset_doc is None or data_prep_pipeline is None): continue + evaluate_pipeline(pipeline_path=pipeline_path, + problem_path=problem_path, + input_path=dataset_doc, + random_seed=random_seed, + data_pipeline_path=data_prep_pipeline, + data_random_seed=data_random_seed) job = queue.make_job(evaluate_pipeline, pipeline_path=pipeline_path, problem_path=problem_path, diff --git a/experimenter/query.py b/experimenter/query.py index 75e750e..06aea7a 100644 --- a/experimenter/query.py +++ b/experimenter/query.py @@ -23,17 +23,20 @@ def query_on_seeds(pipeline_id: str=None, limit: int=None, submitter: str='byu') for (problem_id, dataset_id, data_prep), random_seeds in results.items(): if limit and len(random_seeds) > limit: continue - #data_prep_pipeline, data_prep_seed = data_prep - input_run = data_prep[0] + data_prep_id, data_prep_seed = data_prep + data_prep_pipeline = get_data_preparation_pipeline(data_prep_id) yield {'pipeline': pipeline.to_dict(), 'problem_path': get_problem_path(problem_id), 'dataset_doc_path':get_dataset_doc_path(dataset_id), 'tested_seeds': random_seeds, 'data_prep_pipeline': data_prep_pipeline, 'data_prep_seed': data_prep_seed} -def get_data_preparation_pipeline(data_pred_id: str=None): +def get_data_preparation_pipeline(data_prep_id: str=None): + if (data_prep_id is None): + return None arguments = {'submitter': None, 'id': data_prep_id} data_prep_search = get_search_query(arguments=arguments) data_prep_pipeline = next(data_prep_search.scan()) + data_prep_pipeline = data_prep_pipeline.to_dict() return data_prep_pipeline @@ -42,7 +45,7 @@ def check_for_data_prep(pipeline_run=None): in the pipeline run """ data_prep = None - data_prep_pipeline = None + data_prep_id = None data_prep_seed = None try: data_prep = pipeline_run.run.data_preparation @@ -51,8 +54,8 @@ def check_for_data_prep(pipeline_run=None): data_prep_seed = None if (data_prep is not None): data_prep_seed = data_prep.random_seed - data_prep_pipeline = get_data_preparation_pipeline(data_prep.pipeline.id) - return data_prep_pipeline, data_prep_seed + data_prep_id = data_prep.pipeline.id + return data_prep_id, data_prep_seed def scan_pipeline_runs(pipeline_id, submitter=None): @@ -62,11 +65,11 @@ def scan_pipeline_runs(pipeline_id, submitter=None): .query('match', 
status__state='SUCCESS')
     if submitter:
         pipeline_run_search = pipeline_run_search.query('match', _submitter=submitter)
-    results = dict()
+    results = dict()
     for pipeline_run in pipeline_run_search.scan():
-        data_prep_pipeline, data_prep_seed = check_for_data_prep(pipeline_run=pipeline_run)
+        data_prep_id, data_prep_seed = check_for_data_prep(pipeline_run=pipeline_run)
         for dataset in pipeline_run.datasets:
-            dataset_prob_tuple = (pipeline_run.problem.id, dataset.id, (data_prep_pipeline, data_prep_seed))
+            dataset_prob_tuple = (pipeline_run.problem.id, dataset.id, (data_prep_id, data_prep_seed))
             results[dataset_prob_tuple] = results.get(dataset_prob_tuple, set())
             results[dataset_prob_tuple].add(pipeline_run.random_seed)
     return results
diff --git a/experimenter/queue.py b/experimenter/queue.py
index 87fd7a9..6dadbbf 100644
--- a/experimenter/queue.py
+++ b/experimenter/queue.py
@@ -9,7 +9,7 @@
 _DEFAULT_QUEUE = 'default'
 _EMPTIED_MESSAGE = 'queue {} emptied'
-
+_SAVE_FAILED_MESSAGE = 'Failed job output saved to {}'
 
 def get_connection():
     config.validate_redis_host()
@@ -46,16 +46,21 @@ def get_failed_job(queue_name:str = _DEFAULT_QUEUE, job_num:int = 0):
     job_ids = reg.get_job_ids()
     if (len(job_ids)<=0):
         return "None", reg
-    job = job_ids[0]
-    job = rq.job.Job.fetch(job, connection=get_connection())
-    return job.exc_info, reg
+    job_id = job_ids[job_num]
+    return job_id, reg
 
 
 def save_failed_job(queue_name:str = _DEFAULT_QUEUE, job_num:int = 0):
     if (queue_name is None):
         queue_name = _DEFAULT_QUEUE
+    job_id, failed_queue = get_failed_job(queue_name=queue_name, job_num=job_num)
+    job = rq.job.Job.fetch(job_id, connection=get_connection())
     with open (os.path.join('/data',"failed_job_{}.txt".format(job_num)), 'w') as job_file:
-        job_file.write(get_failed_job(queue_name=queue_name, job_num=job_num)[0])
+        job_file.write(job.exc_info)
+    #remove the job
+    failed_queue.remove(job_id, delete_job=True)
+    print(_SAVE_FAILED_MESSAGE.format(os.path.join('/data',
+        "failed_job_{}.txt".format(job_num))))
 
 
 def get_queue_message(queues: list):

From ce3d768bf00b1169ce9bef3908f37d7af5e75568 Mon Sep 17 00:00:00 2001
From: Benson Manner
Date: Mon, 22 Mar 2021 16:47:31 -0600
Subject: [PATCH 31/44] Added data preparation checks for d3m module

---
 experimenter/evaluate_pipeline_new.py |  2 +-
 experimenter/modify_generator.py      |  4 +++-
 experimenter/query.py                 | 14 +++++++++-----
 experimenter/utils.py                 | 27 +++++++++++++++++++++++++++
 4 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/evaluate_pipeline_new.py
index 279c5ef..656c49f 100644
--- a/experimenter/evaluate_pipeline_new.py
+++ b/experimenter/evaluate_pipeline_new.py
@@ -146,7 +146,7 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str,
     args.extend(('--problem', problem))
     args.extend(('--input', input))
     args.extend(('--output-run', output_run))
-    args.extend(('--data-pipeline', fixed_split_path))
+    args.extend(('--data-pipeline', data_pipeline_path))
     args.extend(('--data-random-seed', data_random_seed))
     d3m_cli.main(args)
     if (config.save_to_d3m is True):
diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py
index 70b4d58..fa65736 100644
--- a/experimenter/modify_generator.py
+++ b/experimenter/modify_generator.py
@@ -51,7 +51,9 @@ def _get_generator(self):
                 data_prep_pipeline, data_random_seed = prep
                 pipeline_path = download_from_database(pipeline, type_to_download='Pipeline')
                 if (data_prep_pipeline is not None):
-                    data_prep_pipeline = download_from_database(data_prep_pipeline, type_to_download='Preparation')
+                    if
(~os.path.exist(data_prep_pipeline)): + data_prep_pipeline = download_from_database(data_prep_pipeline, type_to_download='Preparation') + print(data_prep_pipeline) #catch error returning none for file paths or preparation pipeline #TODO get data preparation pipeline even when it is not explicitly defined if (problem_path is None or dataset_doc is None or data_prep_pipeline is None): diff --git a/experimenter/query.py b/experimenter/query.py index 06aea7a..e5bc555 100644 --- a/experimenter/query.py +++ b/experimenter/query.py @@ -1,6 +1,7 @@ from elasticsearch import Elasticsearch from elasticsearch_dsl import Search, Q -from experimenter.utils import get_problem_path, get_dataset_doc_path +from experimenter.utils import get_problem_path, get_dataset_doc_path, get_data_prep_from_d3m + HOST = 'https://metalearning.datadrivendiscovery.org/es' CONNECTION = Elasticsearch(hosts=[HOST], timeout=300) @@ -33,10 +34,13 @@ def query_on_seeds(pipeline_id: str=None, limit: int=None, submitter: str='byu') def get_data_preparation_pipeline(data_prep_id: str=None): if (data_prep_id is None): return None - arguments = {'submitter': None, 'id': data_prep_id} - data_prep_search = get_search_query(arguments=arguments) - data_prep_pipeline = next(data_prep_search.scan()) - data_prep_pipeline = data_prep_pipeline.to_dict() + data_prep_pipeline = get_data_prep_from_d3m(data_prep_id) + #get from database if not in d3m module + if (data_prep_pipeline is None): + arguments = {'submitter': None, 'id': data_prep_id} + data_prep_search = get_search_query(arguments=arguments) + data_prep_pipeline = next(data_prep_search.scan()) + data_prep_pipeline = data_prep_pipeline.to_dict() return data_prep_pipeline diff --git a/experimenter/utils.py b/experimenter/utils.py index 830ad9c..e2d1a0d 100644 --- a/experimenter/utils.py +++ b/experimenter/utils.py @@ -9,6 +9,7 @@ from d3m.metadata import problem as problem_module from d3m.utils import get_datasets_and_problems +from d3m.contrib import pipelines from experimenter import exceptions, config @@ -16,6 +17,32 @@ DEFAULT_DATASET_DIR = "/datasets" datasets, problems = None, None +def get_dict_data_prep_pipelines(): + data_prep_dict = dict() + data_prep_id_list = list() + #save the relevant paths and ids for data preparation + data_prep_id_list.append(pipelines.NO_SPLIT_TABULAR_SPLIT_PIPELINE_ID) + data_prep_dict[pipelines.NO_SPLIT_TABULAR_SPLIT_PIPELINE_ID] = pipelines.NO_SPLIT_TABULAR_SPLIT_PIPELINE_PATH + data_prep_id_list.append(pipelines.FIXED_SPLIT_TABULAR_SPLIT_PIPELINE_ID) + data_prep_dict[pipelines.FIXED_SPLIT_TABULAR_SPLIT_PIPELINE_ID] = pipelines.FIXED_SPLIT_TABULAR_SPLIT_PIPELINE_PATH + data_prep_id_list.append(pipelines.TRAIN_TEST_TABULAR_SPLIT_PIPELINE_ID) + data_prep_dict[pipelines.TRAIN_TEST_TABULAR_SPLIT_PIPELINE_ID] = pipelines.TRAIN_TEST_TABULAR_SPLIT_PIPELINE_PATH + data_prep_id_list.append(pipelines.K_FOLD_TABULAR_SPLIT_PIPELINE_ID) + data_prep_dict[pipelines.K_FOLD_TABULAR_SPLIT_PIPELINE_ID] = pipelines.K_FOLD_TABULAR_SPLIT_PIPELINE_PATH + return data_prep_dict, data_prep_id_list + + +def get_data_prep_from_d3m(pipeline_id: str = None): + """Checks if data preparation pipeline is in d3m module, + if not, return None + + """ + data_prep_dict, data_prep_id_list = get_dict_data_prep_pipelines() + if (pipeline_id in data_prep_id_list): + return data_prep_dict[pipeline_id] + else: + return None + def save_to_not_exist_file(filename:str = 'dataset_dne.txt', save_id:str = None): #create the directory os.makedirs(os.path.join('/data','DoesNotExist'),exist_ok=True) 
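
A quick illustration of the lookup added above (hypothetical usage, assuming the d3m core package is installed and the patched experimenter.utils is importable): the split pipelines bundled with d3m resolve to local file paths, while any other ID returns None so that get_data_preparation_pipeline in query.py can fall back to the Elasticsearch lookup.

    from d3m.contrib import pipelines
    from experimenter.utils import get_data_prep_from_d3m

    # Bundled split-pipeline IDs resolve to files shipped with the d3m
    # package, avoiding a metalearning-database round trip.
    path = get_data_prep_from_d3m(pipelines.K_FOLD_TABULAR_SPLIT_PIPELINE_ID)
    assert path == pipelines.K_FOLD_TABULAR_SPLIT_PIPELINE_PATH

    # 'unknown-id' is a placeholder; IDs not bundled with d3m return None,
    # signalling the caller to query Elasticsearch instead.
    assert get_data_prep_from_d3m('unknown-id') is None
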
From 9ce7db7f5028bd9a5ca128cca421089317ca3690 Mon Sep 17 00:00:00 2001 From: Eric Manner Date: Mon, 22 Mar 2021 23:47:37 +0000 Subject: [PATCH 32/44] Minor changes to query, still failed pipelines that probably should not be failing --- experimenter/evaluate_pipeline_new.py | 2 +- experimenter/modify_generator.py | 14 ++++---------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/evaluate_pipeline_new.py index 656c49f..6aa53d1 100644 --- a/experimenter/evaluate_pipeline_new.py +++ b/experimenter/evaluate_pipeline_new.py @@ -84,7 +84,7 @@ def evaluate_pipeline_on_problem(pipeline_path: str, #evaluate pipeline evaluate_pipeline_via_d3m_cli(pipeline=pipeline_path, problem=problem_path, input=input_path, output_run=output_run_path, - random_seed=random_seed, data_pipeline_path = data_pipeline_path, + random_seed=random_seed, data_pipeline_path=data_pipeline_path, data_random_seed=data_random_seed) def evaluate_pipeline_via_d3m_cli(pipeline: str, diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py index fa65736..233d4ec 100644 --- a/experimenter/modify_generator.py +++ b/experimenter/modify_generator.py @@ -5,6 +5,7 @@ from random import randint from d3m.contrib.pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_PATH as data_split_file import json +import os import yaml from experimenter.evaluate_pipeline_new import evaluate_pipeline_on_problem as evaluate_pipeline @@ -50,20 +51,13 @@ def _get_generator(self): #save the pipeline to path and return pipeline path data_prep_pipeline, data_random_seed = prep pipeline_path = download_from_database(pipeline, type_to_download='Pipeline') - if (data_prep_pipeline is not None): - if (~os.path.exist(data_prep_pipeline)): - data_prep_pipeline = download_from_database(data_prep_pipeline, type_to_download='Preparation') - print(data_prep_pipeline) #catch error returning none for file paths or preparation pipeline #TODO get data preparation pipeline even when it is not explicitly defined if (problem_path is None or dataset_doc is None or data_prep_pipeline is None): continue - evaluate_pipeline(pipeline_path=pipeline_path, - problem_path=problem_path, - input_path=dataset_doc, - random_seed=random_seed, - data_pipeline_path=data_prep_pipeline, - data_random_seed=data_random_seed) + #check if query returned a path or an id + if (os.path.exists(data_prep_pipeline) is False): + data_prep_pipeline = download_from_database(data_prep_pipeline, type_to_download='Preparation') job = queue.make_job(evaluate_pipeline, pipeline_path=pipeline_path, problem_path=problem_path, From 6165dbd20a4d9dc20546b24c6d1b91046243210a Mon Sep 17 00:00:00 2001 From: Benson Manner Date: Tue, 23 Mar 2021 13:18:40 -0600 Subject: [PATCH 33/44] More robust data preparation and scoring pipelines from pipeline run --- experimenter/evaluate_pipeline_new.py | 90 ++++++++++++++++++++------- experimenter/modify_generator.py | 72 ++++++++++++++------- experimenter/query.py | 50 ++++++++++----- experimenter/utils.py | 43 ++++++++----- 4 files changed, 178 insertions(+), 77 deletions(-) diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/evaluate_pipeline_new.py index 6aa53d1..46ee5cb 100644 --- a/experimenter/evaluate_pipeline_new.py +++ b/experimenter/evaluate_pipeline_new.py @@ -36,12 +36,16 @@ def save_pipeline_run_to_d3m_db(pipeline_run_path: str): pipeline_run = yaml.full_load(pipeline_data) return D3MMtLDB().save_pipeline_run(pipeline_run) -def evaluate_pipeline_on_problem(pipeline_path: 
str, - problem_path: str, - input_path: str, - random_seed: int, +def evaluate_pipeline_on_problem(pipeline: str=None, + problem: str=None, + input: str=None, + random_seed: int=0, data_pipeline_path: str=k_fold_split_path, - data_random_seed: int=0): + data_random_seed: int=0, + data_params=None, + scoring_pipeline: str=None, + scoring_params=None, + scoring_random_seed: int=0): """ Evaluate pipeline on problem. A less verbose form of running d3m's runtime cli 'evaluate' command. @@ -58,6 +62,18 @@ def evaluate_pipeline_on_problem(pipeline_path: str, path to input full data random_seed : int random seed to be used for pipeline run + data_pipeline_path: str + path to data prepation pipeline + data_random_seed: int + random_seed to be used in data preparation + data_params: + parameters for data preparation + scoring_params: + parameters for scoring pipeline + scoring_random_seed: int + random seed for scoring + scoring_pipeline: str + path to scoring pipeline Returns: ---------- @@ -69,12 +85,12 @@ def evaluate_pipeline_on_problem(pipeline_path: str, when a file cannot be opened """ output_run_path = [] - with open(pipeline_path, 'r') as data: - pipeline = json.load(data) - output_run_path.append(pipeline['id']) - with open(problem_path, 'r') as data: - problem = json.load(data) - output_run_path.append(problem['about']['problemID']) + with open(pipeline, 'r') as data: + pipe = json.load(data) + output_run_path.append(pipe['id']) + with open(problem, 'r') as data: + prob = json.load(data) + output_run_path.append(prob['about']['problemID']) output_run_path.append(str(random_seed)) #get the output run path output_run_path = os.path.abspath(os.path.join('/data', 'Pipeline_Run', @@ -82,18 +98,24 @@ def evaluate_pipeline_on_problem(pipeline_path: str, #create the directory os.makedirs(os.path.dirname(output_run_path),exist_ok=True) #evaluate pipeline - evaluate_pipeline_via_d3m_cli(pipeline=pipeline_path, problem=problem_path, - input=input_path, output_run=output_run_path, + evaluate_pipeline_via_d3m_cli(pipeline=pipeline, problem=problem, + input=input, output_run=output_run_path, random_seed=random_seed, data_pipeline_path=data_pipeline_path, - data_random_seed=data_random_seed) - -def evaluate_pipeline_via_d3m_cli(pipeline: str, - problem: str, - input: str, - output_run: str, - random_seed: int, + data_random_seed=data_random_seed, data_params=data_params, + scoring_pipeline=scoring_pipeline, scoring_params=scoring_params, + scoring_random_seed=scoring_random_seed) + +def evaluate_pipeline_via_d3m_cli(pipeline: str=None, + problem: str=None, + input: str=None, + output_run: str=None, + random_seed: int=0, data_pipeline_path: str=k_fold_split_path, - data_random_seed: int=0): + data_random_seed: int=0, + data_params=None, + scoring_pipeline: str=None, + scoring_params=None, + scoring_random_seed: int=0): """ Evaluate pipeline on problem using d3m's runtime cli. Wrapper function to execute d3m's runtime cli 'evaluate' command. 
@@ -120,8 +142,14 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, path to data prepation pipeline data_random_seed: int random_seed to be used in data preparation - input_run: path to pipeline run file - + data_params: + parameters for data preparation + scoring_params: + parameters for scoring pipeline + scoring_random_seed: int + random seed for scoring + scoring_pipeline: str + path to scoring pipeline Return: ------- None @@ -140,7 +168,14 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, if (not os.path.isfile(input)): raise ValueError('\'{}\' param not a file path'.format('input')) - + + if (not os.path.isfile(data_pipeline_path)): + raise ValueError('\'{}\' param not a file path'.format('input')) + + if (not os.path.isfile(scoring_pipeline)): + raise ValueError('\'{}\' param not a file path'.format('input')) + + #TODO - call fit-score when the data pipeline is not defined in the pipeline run args = ['d3m', 'runtime','--random-seed', str(random_seed), 'evaluate'] args.extend(('--pipeline', pipeline)) args.extend(('--problem', problem)) @@ -148,6 +183,13 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str, args.extend(('--output-run', output_run)) args.extend(('--data-pipeline', data_pipeline_path)) args.extend(('--data-random-seed', data_random_seed)) + if (data_params is not None): + args.extend(('--data-param', data_params)) + args.extend(('--scoring-pipeline', scoring_pipeline)) + args.extend(('--scoring-random-seed', scoring_random_seed)) + if (scoring_params is not None): + args.extend(('--scoring-param', scoring_params)) d3m_cli.main(args) if (config.save_to_d3m is True): save_pipeline_run_to_d3m_db(output_run) + diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py index 233d4ec..6a5237c 100644 --- a/experimenter/modify_generator.py +++ b/experimenter/modify_generator.py @@ -1,14 +1,17 @@ -from experimenter.query import query_on_seeds -from experimenter import queue, utils -from experimenter.utils import download_from_database -import d3m.metadata.pipeline from random import randint -from d3m.contrib.pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_PATH as data_split_file import json import os import yaml + +from d3m.contrib.pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_PATH as data_split_file +from d3m.contrib.pipelines import SCORING_PIPELINE_PATH as scoring_file + +from experimenter.query import query_on_seeds +from experimenter import queue, utils +from experimenter.utils import download_from_database from experimenter.evaluate_pipeline_new import evaluate_pipeline_on_problem as evaluate_pipeline + class ModifyGenerator: """ Generator to be used for creating modified pipelines based on existing pipelines in the database @@ -25,7 +28,8 @@ def __init__(self, modify_type: str='random-seed', max_jobs: int=None, args=None else: self.query_results = self._query(self.args) self.generator = self._get_generator() - + + def __iter__(self): return self @@ -47,24 +51,41 @@ def _get_generator(self): """ for query_result in self.query_results: #iterate through modifier results - for pipeline, problem_path, dataset_doc, random_seed, prep in self._modify(query_result,self.args): + for pipeline, problem_path, dataset_doc, seed, data, score in self._modify(query_result,self.args): #save the pipeline to path and return pipeline path - data_prep_pipeline, data_random_seed = prep + data_prep_pipeline, data_random_seed, data_params = data + scoring_pipeline, scoring_random_seed, scoring_params = score pipeline_path = download_from_database(pipeline, 
type_to_download='Pipeline') + #TODO - catch when there is no data preparation pipeline and pass it further to evaluate #catch error returning none for file paths or preparation pipeline - #TODO get data preparation pipeline even when it is not explicitly defined if (problem_path is None or dataset_doc is None or data_prep_pipeline is None): continue #check if query returned a path or an id if (os.path.exists(data_prep_pipeline) is False): - data_prep_pipeline = download_from_database(data_prep_pipeline, type_to_download='Preparation') + data_prep_pipeline = download_from_database(data_prep_pipeline, type_to_download='Data Preparation') + if (os.path.exists(scoring_pipeline) is False): + scoring_pipeline = download_from_database(scoring_pipeline, type_to_download='Scoring') + evaluate_pipeline(pipeline=pipeline_path, + problem=problem_path, + input=dataset_doc, + random_seed=seed, + data_pipeline_path=data_prep_pipeline, + data_random_seed=data_random_seed, + data_params=data_params, + scoring_pipeline=scoring_pipeline, + scoring_random_seed=scoring_random_seed, + scoring_params=scoring_params) job = queue.make_job(evaluate_pipeline, - pipeline_path=pipeline_path, - problem_path=problem_path, - input_path=dataset_doc, - random_seed=random_seed, + pipeline=pipeline_path, + problem=problem_path, + input=dataset_doc, + random_seed=seed, data_pipeline_path=data_prep_pipeline, - data_random_seed=data_random_seed) + data_random_seed=data_random_seed, + data_params=data_params, + scoring_pipeline=scoring_pipeline, + scoring_random_seed=scoring_random_seed, + scoring_params=scoring_params) self.num_complete += 1 yield job @@ -119,7 +140,9 @@ def _modify_random_seed(self, seed_limit, query_args): num_run += 1 used_seeds.add(new_seed) #yield the necessary job requirements - yield query_args['pipeline'], query_args['problem_path'], query_args['dataset_doc_path'], new_seed, (query_args['data_prep_pipeline'], query_args['data_prep_seed']) + yield (query_args['pipeline'], query_args['problem_path'], query_args['dataset_doc_path'], new_seed, + (query_args['data_prep_pipeline'], query_args['data_prep_seed'], query_args['data_params']), + (query_args['scoring_pipeline'], query_args['scoring_seed'], query_args['scoring_params'])) def _run_seed_test(self,args): @@ -131,13 +154,20 @@ def _run_seed_test(self,args): dataset_path = utils.get_dataset_doc_path('185_baseball_MIN_METADATA_dataset') problem_path = utils.get_problem_path('185_baseball_MIN_METADATA_problem') data_prep_seed = 0 - with open(data_split_file, 'r') as pipeline_file: - data_prep_pipeline = yaml.full_load(pipeline_file) - data_prep_pipeline = data_prep_pipeline + #with open(data_split_file, 'r') as pipeline_file: + # data_prep_pipeline = yaml.full_load(pipeline_file) + #with open(scoring_file, 'r') as pipeline_file: + # scoring_pipeline = yaml.full_load(pipeline_file) + data_prep_seed = 0 + data_prep_pipeline = data_split_file + scoring_pipeline = scoring_file + scoring_seed = 0 used_seeds = {2,15} yield {'pipeline': pipeline, 'problem_path': problem_path, 'dataset_doc_path': dataset_path, - 'tested_seeds': used_seeds, 'data_prep_pipeline': - data_prep_pipeline, 'data_prep_seed': data_prep_seed} + 'tested_seeds': used_seeds, 'data_prep_pipeline': data_prep_pipeline, + 'data_prep_seed': data_prep_seed, 'data_params': None, + 'scoring_pipeline': scoring_pipeline, 'scoring_seed': scoring_seed, + 'scoring_params': None} def _modify_swap_primitive(self, swap_pipeline, query_args): diff --git a/experimenter/query.py b/experimenter/query.py index 
e5bc555..d9d1f39 100644
--- a/experimenter/query.py
+++ b/experimenter/query.py
@@ -1,6 +1,7 @@
 from elasticsearch import Elasticsearch
 from elasticsearch_dsl import Search, Q
-from experimenter.utils import get_problem_path, get_dataset_doc_path, get_data_prep_from_d3m
+from experimenter.utils import get_problem_path, get_dataset_doc_path, get_pipelines_from_d3m
+from d3m.runtime import _get_data_and_scoring_params_from_pipeline_run as _data_score_params
 
 HOST = 'https://metalearning.datadrivendiscovery.org/es'
 CONNECTION = Elasticsearch(hosts=[HOST], timeout=300)
@@ -21,27 +22,31 @@ def query_on_seeds(pipeline_id: str=None, limit: int=None, submitter: str='byu')
     pipeline_search = get_search_query(arguments=arguments, index='pipelines')
     for pipeline in pipeline_search.scan():
         results = scan_pipeline_runs(pipeline.id, submitter)
-        for (problem_id, dataset_id, data_prep), random_seeds in results.items():
+        for (problem_id, dataset_id, data_prep, scoring), random_seeds in results.items():
             if limit and len(random_seeds) > limit:
                 continue
-            data_prep_id, data_prep_seed = data_prep
-            data_prep_pipeline = get_data_preparation_pipeline(data_prep_id)
+            data_prep_id, data_prep_seed, data_params = data_prep
+            scoring_id, scoring_seed, scoring_params = scoring
+            data_prep_pipeline = get_pipeline(data_prep_id, types='Data')
+            scoring_pipeline = get_pipeline(scoring_id, types='Scoring')
             yield {'pipeline': pipeline.to_dict(), 'problem_path': get_problem_path(problem_id),
                    'dataset_doc_path':get_dataset_doc_path(dataset_id), 'tested_seeds': random_seeds,
-                   'data_prep_pipeline': data_prep_pipeline, 'data_prep_seed': data_prep_seed}
+                   'data_prep_pipeline': data_prep_pipeline, 'data_prep_seed': data_prep_seed,
+                   'scoring_pipeline': scoring_pipeline, 'scoring_seed': scoring_seed,
+                   'scoring_params': scoring_params, 'data_params': data_params}
 
 
-def get_data_preparation_pipeline(data_prep_id: str=None):
-    if (data_prep_id is None):
+def get_pipeline(pipeline_id: str=None, types: str='Data'):
+    if (pipeline_id is None):
         return None
-    data_prep_pipeline = get_data_prep_from_d3m(data_prep_id)
+    pipeline = get_pipelines_from_d3m(pipeline_id, types=types)
     #get from database if not in d3m module
-    if (data_prep_pipeline is None):
-        arguments = {'submitter': None, 'id': data_prep_id}
-        data_prep_search = get_search_query(arguments=arguments)
-        data_prep_pipeline = next(data_prep_search.scan())
-        data_prep_pipeline = data_prep_pipeline.to_dict()
-    return data_prep_pipeline
+    if (pipeline is None):
+        arguments = {'submitter': None, 'id': pipeline_id}
+        search = get_search_query(arguments=arguments)
+        pipeline = next(search.scan())
+        pipeline = pipeline.to_dict()
+    return pipeline
 
 
 def check_for_data_prep(pipeline_run=None):
@@ -59,8 +64,18 @@ def check_for_data_prep(pipeline_run=None):
         data_prep_seed = data_prep.random_seed
         data_prep_id = data_prep.pipeline.id
-    return data_prep_id, data_prep_seed
-
+        data_params = _data_score_params(data_prep.get('steps', []))
+
+    return data_prep_id, data_prep_seed, data_params
+
+
+def get_scoring_pipeline(pipeline_run=None):
+    scoring = pipeline_run.run.scoring
+    scoring_seed = scoring.random_seed
+    scoring_params = _data_score_params(scoring.get('steps', []))
+
+    return scoring.pipeline.id, scoring_seed, scoring_params
+
 
 def scan_pipeline_runs(pipeline_id, submitter=None):
     pipeline_run_search = Search(using=CONNECTION, index='pipeline_runs') \
@@ -71,9 +86,10 @@ def scan_pipeline_runs(pipeline_id, submitter=None):
         pipeline_run_search = pipeline_run_search.query('match', _submitter=submitter)
     results = dict()
     for pipeline_run in pipeline_run_search.scan():
- data_prep_id, data_prep_seed = check_for_data_prep(pipeline_run=pipeline_run) + data_prep = check_for_data_prep(pipeline_run=pipeline_run) + scoring = get_scoring_pipeline(pipeline_run) for dataset in pipeline_run.datasets: - dataset_prob_tuple = (pipeline_run.problem.id, dataset.id, (data_prep_id, data_prep_seed)) + dataset_prob_tuple = (pipeline_run.problem.id, dataset.id, data_prep, scoring) results[dataset_prob_tuple] = results.get(dataset_prob_tuple, set()) results[dataset_prob_tuple].add(pipeline_run.random_seed) return results diff --git a/experimenter/utils.py b/experimenter/utils.py index e2d1a0d..11f9eb5 100644 --- a/experimenter/utils.py +++ b/experimenter/utils.py @@ -17,7 +17,11 @@ DEFAULT_DATASET_DIR = "/datasets" datasets, problems = None, None -def get_dict_data_prep_pipelines(): + +def get_data_prep_pipelines(): + """ + Get data preparation pipelines that are already in the d3m module + """ data_prep_dict = dict() data_prep_id_list = list() #save the relevant paths and ids for data preparation @@ -30,18 +34,32 @@ def get_dict_data_prep_pipelines(): data_prep_id_list.append(pipelines.K_FOLD_TABULAR_SPLIT_PIPELINE_ID) data_prep_dict[pipelines.K_FOLD_TABULAR_SPLIT_PIPELINE_ID] = pipelines.K_FOLD_TABULAR_SPLIT_PIPELINE_PATH return data_prep_dict, data_prep_id_list + - -def get_data_prep_from_d3m(pipeline_id: str = None): +def get_scoring_pipelines(): + """ + Get the scoring pipelines that are already in the d3m module + """ + scoring_dict = dict() + scoring_id_list = list() + #save relevant paths and ids for scoring pipelines + scoring_id_list.append(pipelines.SCORING_PIPELINE_ID) + scoring_dict[pipelines.SCORING_PIPELINE_ID] = pipelines.SCORING_PIPELINE_PATH + return scoring_dict, scoring_id_list + + +def get_pipelines_from_d3m(pipeline_id: str = None, types='Data'): """Checks if data preparation pipeline is in d3m module, if not, return None - """ - data_prep_dict, data_prep_id_list = get_dict_data_prep_pipelines() - if (pipeline_id in data_prep_id_list): - return data_prep_dict[pipeline_id] - else: - return None + if (types=='Data'): + dict_ids, id_list = get_data_prep_pipelines() + elif (types=='Scoring'): + dict_ids, id_list = get_scoring_pipelines() + if (pipeline_id in id_list): + return dict_ids[pipeline_id] + return None + def save_to_not_exist_file(filename:str = 'dataset_dne.txt', save_id:str = None): #create the directory @@ -58,12 +76,7 @@ def save_to_not_exist_file(filename:str = 'dataset_dne.txt', save_id:str = None) def download_from_database(data, type_to_download: str = 'Pipeline'): i_d = data['id'] - if (type_to_download == 'Pipeline'): - save_path = os.path.join('/data', 'Pipeline', i_d+str('.json')) - elif (type_to_download == 'Preparation'): - save_path = os.path.join('/data', 'DataPreparation', i_d+str('.json')) - else: - raise ValueError("type: {}, not available for download".format(type_to_download)) + save_path = os.path.join('/data', type_to_download, i_d+str('.json')) #create the new directory os.makedirs(os.path.dirname(save_path),exist_ok=True) #save the file to the directory From 5208a461669d6a524719616aa16143026b2f6653 Mon Sep 17 00:00:00 2001 From: Benson Manner Date: Tue, 23 Mar 2021 13:19:59 -0600 Subject: [PATCH 34/44] Unnecessary commenting in _run_seed_test --- experimenter/modify_generator.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py index 6a5237c..02aef61 100644 --- a/experimenter/modify_generator.py +++ b/experimenter/modify_generator.py @@ -154,10 +154,6 
@@ def _run_seed_test(self,args): dataset_path = utils.get_dataset_doc_path('185_baseball_MIN_METADATA_dataset') problem_path = utils.get_problem_path('185_baseball_MIN_METADATA_problem') data_prep_seed = 0 - #with open(data_split_file, 'r') as pipeline_file: - # data_prep_pipeline = yaml.full_load(pipeline_file) - #with open(scoring_file, 'r') as pipeline_file: - # scoring_pipeline = yaml.full_load(pipeline_file) data_prep_seed = 0 data_prep_pipeline = data_split_file scoring_pipeline = scoring_file From c59ad21510e7360d96a69f86f297bd5f493bffd2 Mon Sep 17 00:00:00 2001 From: Eric Manner Date: Tue, 23 Mar 2021 21:46:42 +0000 Subject: [PATCH 35/44] query changes for data params and scoring params --- experimenter/query.py | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/experimenter/query.py b/experimenter/query.py index d9d1f39..455c834 100644 --- a/experimenter/query.py +++ b/experimenter/query.py @@ -22,11 +22,14 @@ def query_on_seeds(pipeline_id: str=None, limit: int=None, submitter: str='byu') pipeline_search = get_search_query(arguments=arguments, index='pipelines') for pipeline in pipeline_search.scan(): results = scan_pipeline_runs(pipeline.id, submitter) - for (problem_id, dataset_id, data_prep, scoring), random_seeds in results.items(): + for (problem_id, dataset_id, data_prep, scoring), params_dict in results.items(): if limit and len(random_seeds) > limit: continue - data_prep_id, data_prep_seed, data_params = data_prep - scoring_id, scoring_seed, scoring_params = scoring + data_prep_id, data_prep_seed = data_prep + scoring_id, scoring_seed = scoring + random_seeds = params_dict['random_seeds'] + data_params = params_dict['data_params'] + scoring_params = params_dict['scoring_params'] data_prep_pipeline = get_pipeline(data_prep_id, types='Data') scoring_pipeline = get_pipeline(scoring_id, types='Scoring') yield {'pipeline': pipeline.to_dict(), 'problem_path': get_problem_path(problem_id), @@ -64,19 +67,26 @@ def check_for_data_prep(pipeline_run=None): if (data_prep is not None): data_prep_seed = data_prep.random_seed data_prep_id = data_prep.pipeline.id + data_prep = data_prep.to_dict() data_params = _data_score_params(data_prep.get('steps', [])) - return data_prep_id, data_prep_seed, data_params + return (data_prep_id, data_prep_seed), data_params def get_scoring_pipeline(pipeline_run=None): scoring = pipeline_run.run.scoring scoring_seed = scoring.random_seed - scoring_params = _data_score_params(scoring.get('steps', [])) - - return scoring.pipeline.id, scoring_seed, scoring_params + scoring_id = scoring.pipeline.id + scoring = scoring.to_dict() + scoring_params = _data_score_params(scoring.get('steps', [])) + return (scoring_id, scoring_seed), scoring_params +def get_unique_results(results: dict = None): + #function for getting unique results from the result dictionary + pass + + def scan_pipeline_runs(pipeline_id, submitter=None): pipeline_run_search = Search(using=CONNECTION, index='pipeline_runs') \ .query('match', pipeline__id=pipeline_id) \ @@ -86,11 +96,12 @@ def scan_pipeline_runs(pipeline_id, submitter=None): pipeline_run_search = pipeline_run_search.query('match', _submitter=submitter) results = dict() for pipeline_run in pipeline_run_search.scan(): - data_prep = check_for_data_prep(pipeline_run=pipeline_run) - scoring = get_scoring_pipeline(pipeline_run) + data_prep, data_params = check_for_data_prep(pipeline_run=pipeline_run) + scoring, scoring_params = get_scoring_pipeline(pipeline_run) for dataset in 
pipeline_run.datasets:
             dataset_prob_tuple = (pipeline_run.problem.id, dataset.id, data_prep, scoring)
-            results[dataset_prob_tuple] = results.get(dataset_prob_tuple, set())
-            results[dataset_prob_tuple].add(pipeline_run.random_seed)
+            results[dataset_prob_tuple] = results.get(dataset_prob_tuple, list())
+            result_add_dict = {'random_seed': pipeline_run.random_seed, 'data_params': data_params, 'scoring_params': scoring_params}
+            results[dataset_prob_tuple].append(result_add_dict)
     return results
 

From 500ea108985de42149468ce84f700886e1010606 Mon Sep 17 00:00:00 2001
From: Benson Manner
Date: Tue, 23 Mar 2021 20:25:17 -0600
Subject: [PATCH 36/44] Add data params and scoring params to pipeline run cli;
 works locally

---
 experimenter/evaluate_pipeline_new.py | 14 ++++--
 experimenter/query.py                 | 72 +++++++++++++++++++++------
 2 files changed, 65 insertions(+), 21 deletions(-)

diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/evaluate_pipeline_new.py
index 46ee5cb..66737df 100644
--- a/experimenter/evaluate_pipeline_new.py
+++ b/experimenter/evaluate_pipeline_new.py
@@ -182,13 +182,17 @@ def evaluate_pipeline_via_d3m_cli(pipeline: str=None,
     args.extend(('--input', input))
     args.extend(('--output-run', output_run))
     args.extend(('--data-pipeline', data_pipeline_path))
-    args.extend(('--data-random-seed', data_random_seed))
-    if (data_params is not None):
-        args.extend(('--data-param', data_params))
+    args.extend(('--data-random-seed', str(data_random_seed)))
     args.extend(('--scoring-pipeline', scoring_pipeline))
-    args.extend(('--scoring-random-seed', scoring_random_seed))
+    args.extend(('--scoring-random-seed', str(scoring_random_seed)))
+    #add the data parameters to the cli arguments
+    if (data_params is not None):
+        for name, value in data_params.items():
+            args.extend(('--data-param', name, value))
+    #add the scoring parameters to the cli arguments
     if (scoring_params is not None):
-        args.extend(('--scoring-param', scoring_params))
+        for name, value in scoring_params.items():
+            args.extend(('--scoring-param', name, value))
     d3m_cli.main(args)
     if (config.save_to_d3m is True):
         save_pipeline_run_to_d3m_db(output_run)
diff --git a/experimenter/query.py b/experimenter/query.py
index 455c834..96618ad 100644
--- a/experimenter/query.py
+++ b/experimenter/query.py
@@ -22,21 +22,26 @@ def query_on_seeds(pipeline_id: str=None, limit: int=None, submitter: str='byu')
     pipeline_search = get_search_query(arguments=arguments, index='pipelines')
     for pipeline in pipeline_search.scan():
         results = scan_pipeline_runs(pipeline.id, submitter)
-        for (problem_id, dataset_id, data_prep, scoring), params_dict in results.items():
-            if limit and len(random_seeds) > limit:
-                continue
+        for dataset_prob_tuple, results_dict in results.items():
+            unique_items = get_unique_results(results_dict)
+            #unpack values from tuple
+            problem_id, dataset_id, data_prep, scoring = dataset_prob_tuple
+            scoring_id, scoring_random_seed = scoring
             data_prep_id, data_prep_seed = data_prep
-            scoring_id, scoring_seed = scoring
-            random_seeds = params_dict['random_seeds']
-            data_params = params_dict['data_params']
-            scoring_params = params_dict['scoring_params']
+            #get preparation and scoring pipelines
             data_prep_pipeline = get_pipeline(data_prep_id, types='Data')
             scoring_pipeline = get_pipeline(scoring_id, types='Scoring')
+            for params in unique_items:
+                data_params = params['data_params']
+                scoring_params = params['scoring_params']
+                random_seeds = params['random_seeds']
+                if limit and len(random_seeds) > limit:
+                    continue
-            yield {'pipeline': pipeline.to_dict(), 'problem_path': get_problem_path(problem_id),
-                   'dataset_doc_path':get_dataset_doc_path(dataset_id), 'tested_seeds': random_seeds,
-                   'data_prep_pipeline': data_prep_pipeline, 'data_prep_seed': data_prep_seed,
-                   'scoring_pipeline': scoring_pipeline, 'scoring_seed': scoring_seed,
-                   'scoring_params': scoring_params, 'data_params': data_params}
+                yield {'pipeline': pipeline.to_dict(), 'problem_path': get_problem_path(problem_id),
+                       'dataset_doc_path':get_dataset_doc_path(dataset_id), 'tested_seeds': random_seeds,
+                       'data_prep_pipeline': data_prep_pipeline, 'data_prep_seed': data_prep_seed,
+                       'scoring_pipeline': scoring_pipeline, 'scoring_seed': scoring_random_seed,
+                       'scoring_params': scoring_params, 'data_params': data_params}
 
 
 def get_pipeline(pipeline_id: str=None, types: str='Data'):
@@ -59,6 +64,7 @@ def check_for_data_prep(pipeline_run=None):
     data_prep = None
     data_prep_id = None
     data_prep_seed = None
+    data_params = None
     try:
         data_prep = pipeline_run.run.data_preparation
     except:
@@ -82,9 +88,41 @@ def get_scoring_pipeline(pipeline_run=None):
     return (scoring_id, scoring_seed), scoring_params
 
+
+def get_list_duplicates(params_list, match_item):
+    start_loc = -1
+    locs = []
+    while True:
+        try:
+            loc = params_list.index(match_item,start_loc+1)
+        except ValueError:
+            break
+        else:
+            locs.append(loc)
+            start_loc = loc
+    return locs
+
+
 def get_unique_results(results: dict = None):
     #function for getting unique results from the result dictionary
-    pass
+    random_seeds_list = results['random_seeds']
+    params_list = results['params']
+    final_list = list()
+    location_dict = dict()
+    #loop through the values
+    for it, param in enumerate(params_list):
+        #get matching pairs of each value
+        location_dict[it] = get_list_duplicates(params_list, param)
+    skip = set()
+    for loc, values in location_dict.items():
+        if loc in skip:
+            continue
+        random_seeds = set()
+        for value in values:
+            random_seeds.add(random_seeds_list[value])
+            skip.add(value)
+        data_params, scoring_params = params_list[loc]
+        final_list.append({'data_params': data_params, 'scoring_params': scoring_params, 'random_seeds': random_seeds})
+    return final_list
 
 
 def scan_pipeline_runs(pipeline_id, submitter=None):
@@ -100,8 +138,10 @@ def scan_pipeline_runs(pipeline_id, submitter=None):
         scoring, scoring_params = get_scoring_pipeline(pipeline_run)
         for dataset in pipeline_run.datasets:
             dataset_prob_tuple = (pipeline_run.problem.id, dataset.id, data_prep, scoring)
-            results[dataset_prob_tuple] = results.get(dataset_prob_tuple, list())
-            result_add_dict = {'random_seed': pipeline_run.random_seed, 'data_params': data_params, 'scoring_params': scoring_params}
-            results[dataset_prob_tuple].append(result_add_dict)
+            results[dataset_prob_tuple] = results.get(dataset_prob_tuple, dict())
+            results[dataset_prob_tuple]['random_seeds'] = results[dataset_prob_tuple].get('random_seeds', list())
+            results[dataset_prob_tuple]['params'] = results[dataset_prob_tuple].get('params', list())
+            results[dataset_prob_tuple]['random_seeds'].append(pipeline_run.random_seed)
+            results[dataset_prob_tuple]['params'].append((data_params, scoring_params))
     return results
 

From 5c90be4e2cbb93ea619187a48b78242965658c05 Mon Sep 17 00:00:00 2001
From: Eric Manner
Date: Wed, 24 Mar 2021 03:21:55 +0000
Subject: [PATCH 37/44] Using data and scoring params; working remotely

---
 experimenter/modify_generator.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py
index 02aef61..895f2d0 100644
--- a/experimenter/modify_generator.py
+++ b/experimenter/modify_generator.py
@@ -65,16 +65,6 @@ def _get_generator(self):
                     data_prep_pipeline = download_from_database(data_prep_pipeline, type_to_download='Data Preparation')
                 if (os.path.exists(scoring_pipeline) is False):
                     scoring_pipeline = download_from_database(scoring_pipeline, type_to_download='Scoring')
-                evaluate_pipeline(pipeline=pipeline_path,
-                                  problem=problem_path,
-                                  input=dataset_doc,
-                                  random_seed=seed,
-                                  data_pipeline_path=data_prep_pipeline,
-                                  data_random_seed=data_random_seed,
-                                  data_params=data_params,
-                                  scoring_pipeline=scoring_pipeline,
-                                  scoring_random_seed=scoring_random_seed,
-                                  scoring_params=scoring_params)
                 job = queue.make_job(evaluate_pipeline,
                                      pipeline=pipeline_path,
                                      problem=problem_path,

From c3b4b60b4bfb05f4ff9af958d3ce97497ab5f066 Mon Sep 17 00:00:00 2001
From: Benson Manner
Date: Wed, 24 Mar 2021 08:07:11 -0600
Subject: [PATCH 38/44] Working on merge suggestions

---
 experimenter/config.py                |   5 +
 experimenter/databases/d3m_mtl.py     |   6 +-
 experimenter/evaluate_pipeline_new.py | 199 -----------------------
 experimenter/execute_pipeline.py      | 218 --------------------------
 experimenter/modify_generator.py      |   4 +-
 experimenter/problem_new.py           |   0
 experimenter/utils.py                 |  17 ++
 7 files changed, 27 insertions(+), 422 deletions(-)
 delete mode 100644 experimenter/evaluate_pipeline_new.py
 delete mode 100644 experimenter/execute_pipeline.py
 delete mode 100644 experimenter/problem_new.py

diff --git a/experimenter/config.py b/experimenter/config.py
index 41c5787..3233730 100644
--- a/experimenter/config.py
+++ b/experimenter/config.py
@@ -46,3 +46,8 @@ def validate_save():
     if save_to_d3m is None:
         raise exceptions.ConfigError(_ERROR_MESSAGE.format('SAVE_TO_D3M'))
+
+output_run_path: str = os.path.abspath(os.path.join('/data', 'pipeline_runs'))
+if (not os.path.exists(output_run_path)):
+    #create the directory
+    os.makedirs(output_run_path, exist_ok=True)
diff --git a/experimenter/databases/d3m_mtl.py b/experimenter/databases/d3m_mtl.py
index 6badcb8..da6c77e 100644
--- a/experimenter/databases/d3m_mtl.py
+++ b/experimenter/databases/d3m_mtl.py
@@ -23,7 +23,7 @@ def __init__(self) -> None:
         self._post_url = D3M_MTL_DB_POST_URL
         # This env var allows code calling this class to be run during
         # unit tests without actually saving to the production DB.
-        self.should_save = config.D3MConfig().save_to_d3m
+        self.should_save = config.save_to_d3m
         # A reference to a low-level elasticsearch client. This can be
         # used to query the D3M DB, or this class's `search` method
         # can be used, and is preferred, since its API is more straightforward.
         # The low-level client is still necessary for
         # certain things though.
         self.es = Elasticsearch(hosts=[D3M_MTL_DB_GET_URL], timeout=30)
         # Our submitter name.
-        self._submitter = config.D3MConfig().d3m_submitter
+        self._submitter = config.d3m_db_submitter
         # The secret verifying us as the submitter we say we are.
- self._x_token = config.D3MConfig().d3m_token + self._x_token = config.d3m_db_token if self._is_identifying_as_submitter(): logger.info( f"Documents will be saved under submitter name: '{self._submitter}'" diff --git a/experimenter/evaluate_pipeline_new.py b/experimenter/evaluate_pipeline_new.py deleted file mode 100644 index 66737df..0000000 --- a/experimenter/evaluate_pipeline_new.py +++ /dev/null @@ -1,199 +0,0 @@ -import itertools as it -import json -import yaml -import os -import parser - -from typing import Any, List, Tuple -from uuid import UUID -from experimenter import config, utils - -from d3m import cli as d3m_cli -from d3m.contrib.pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_PATH as k_fold_split_path -from d3m.contrib.pipelines import FIXED_SPLIT_TABULAR_SPLIT_PIPELINE_PATH as fixed_split_path -from experimenter.databases.d3m_mtl import D3MMtLDB - - -def save_pipeline_run_to_d3m_db(pipeline_run_path: str): - """ - Saves a pipeline run document to the d3m database. - - Parameters - ---------- - pipeline_run_path : path_like str - path to pipeline_run document - - Returns: - ---------- - TODO - - Raises: - ---------- - TODO - """ - d3m_db = D3MMtLDB() - with open(pipeline_run_path) as pipeline_data: - pipeline_run = yaml.full_load(pipeline_data) - return D3MMtLDB().save_pipeline_run(pipeline_run) - -def evaluate_pipeline_on_problem(pipeline: str=None, - problem: str=None, - input: str=None, - random_seed: int=0, - data_pipeline_path: str=k_fold_split_path, - data_random_seed: int=0, - data_params=None, - scoring_pipeline: str=None, - scoring_params=None, - scoring_random_seed: int=0): - """ - Evaluate pipeline on problem. - A less verbose form of running d3m's runtime cli 'evaluate' command. - See 'evaluate_pipeline_via_d3m_cli' for more options for running - the 'evaluate' command. 
- - Parameters - ---------- - pipeline_path : path_like str - path to pipeline doc - problem_path : path_like str - path to problem doc - input_path : path_like str - path to input full data - random_seed : int - random seed to be used for pipeline run - data_pipeline_path: str - path to data prepation pipeline - data_random_seed: int - random_seed to be used in data preparation - data_params: - parameters for data preparation - scoring_params: - parameters for scoring pipeline - scoring_random_seed: int - random seed for scoring - scoring_pipeline: str - path to scoring pipeline - - Returns: - ---------- - None - - Raises: - --------------------------------- - OSError - when a file cannot be opened - """ - output_run_path = [] - with open(pipeline, 'r') as data: - pipe = json.load(data) - output_run_path.append(pipe['id']) - with open(problem, 'r') as data: - prob = json.load(data) - output_run_path.append(prob['about']['problemID']) - output_run_path.append(str(random_seed)) - #get the output run path - output_run_path = os.path.abspath(os.path.join('/data', 'Pipeline_Run', - '_'.join(output_run_path)+'.yaml')) - #create the directory - os.makedirs(os.path.dirname(output_run_path),exist_ok=True) - #evaluate pipeline - evaluate_pipeline_via_d3m_cli(pipeline=pipeline, problem=problem, - input=input, output_run=output_run_path, - random_seed=random_seed, data_pipeline_path=data_pipeline_path, - data_random_seed=data_random_seed, data_params=data_params, - scoring_pipeline=scoring_pipeline, scoring_params=scoring_params, - scoring_random_seed=scoring_random_seed) - -def evaluate_pipeline_via_d3m_cli(pipeline: str=None, - problem: str=None, - input: str=None, - output_run: str=None, - random_seed: int=0, - data_pipeline_path: str=k_fold_split_path, - data_random_seed: int=0, - data_params=None, - scoring_pipeline: str=None, - scoring_params=None, - scoring_random_seed: int=0): - """ - Evaluate pipeline on problem using d3m's runtime cli. - Wrapper function to execute d3m's runtime cli 'evaluate' command. - Arguments mirror the same arguments using the cli. - Only handles cases with a data preparation pipeline in the - pipeline run. - - Parameters - ---------- - pipeline : path_like or uuid4 str - path to pipeline doc or pipeline ID - problem : path_like str - path to problem doc - input : path_like str - path to input full data - output_run : path_like str or '-' - path where pipeline_run doc - will be saved. 
- use '-' for stdin - random_seed : int - random seed to used for - pipeline run - data_pipeline_path: str - path to data prepation pipeline - data_random_seed: int - random_seed to be used in data preparation - data_params: - parameters for data preparation - scoring_params: - parameters for scoring pipeline - scoring_random_seed: int - random seed for scoring - scoring_pipeline: str - path to scoring pipeline - Return: - ------- - None - - Raises: - ------- - ValueError - when parameter value is - invalid - """ - if (not os.path.isfile(pipeline)): - raise ValueError('\'{}\' param not a file path'.format('pipeline')) - - if (not os.path.isfile(problem)): - raise ValueError('\'{}\' param not a file path'.format('problem')) - - if (not os.path.isfile(input)): - raise ValueError('\'{}\' param not a file path'.format('input')) - - if (not os.path.isfile(data_pipeline_path)): - raise ValueError('\'{}\' param not a file path'.format('input')) - - if (not os.path.isfile(scoring_pipeline)): - raise ValueError('\'{}\' param not a file path'.format('input')) - - #TODO - call fit-score when the data pipeline is not defined in the pipeline run - args = ['d3m', 'runtime','--random-seed', str(random_seed), 'evaluate'] - args.extend(('--pipeline', pipeline)) - args.extend(('--problem', problem)) - args.extend(('--input', input)) - args.extend(('--output-run', output_run)) - args.extend(('--data-pipeline', data_pipeline_path)) - args.extend(('--data-random-seed', str(data_random_seed))) - args.extend(('--scoring-pipeline', scoring_pipeline)) - args.extend(('--scoring-random-seed', str(scoring_random_seed))) - #add the data parameters to the cli arguments - if (data_params is not None): - for name, value in data_params.items(): - args.extend(('--data-param', name, value)) - #add the scoring parameters to the cli arguments - if (scoring_params is not None): - for name, value in scoring_params.items(): - args.extend(('--scoring-param', name, value)) - d3m_cli.main(args) - if (config.save_to_d3m is True): - save_pipeline_run_to_d3m_db(output_run) - diff --git a/experimenter/execute_pipeline.py b/experimenter/execute_pipeline.py deleted file mode 100644 index c1bb34b..0000000 --- a/experimenter/execute_pipeline.py +++ /dev/null @@ -1,218 +0,0 @@ -""" -FILE INFORMATION: -This file needs to be a stand alone file so that it can be imported and used by the -experimenter_driver.py. This is because RQ only accepts a function that is imported -and not defined in __main__. These functions are what is needed to execute a pipeline -on a problem and can be used by an individual machine, or used in a RQ job queue. -""" -import logging -from typing import List - -from d3m.metadata.pipeline import Pipeline - -from experimenter.run_fit_pipeline import RunFitPipeline -from experimenter.run_pipeline import RunPipeline -from experimenter.databases.aml_mtl import PipelineDB -from experimenter.databases.d3m_mtl import D3MMtLDB -from experimenter.problem import ProblemReference -from experimenter.config import SAVE_TO_D3M -from experimenter.constants import METRICS_BY_PROBLEM_TYPE - - -logger = logging.getLogger(__name__) - - -def execute_pipeline_on_problem( - pipe: Pipeline, - problem: ProblemReference, - volumes_dir: str, - all_metrics: bool = True, -): - """ - The main function to execute a pipeline. Called in `experimenter_driver.py`. - This function will check if the pipeline and dataset has been executed before, - run the pipeline, and record the results. 
- - :param pipe: the pipeline object that will be executed - :param problem: a reference to the problem to run the pipeline on. - :param volumes_dir: a string containing the path to the volumes directory - :param all_metrics: if `True`, the pipeline will be scored against all metrics - registered for `problem`'s problem type. If `False`, it will only be scored - against the metrics listed in `problem`'s description. - """ - # Validate args - if all_metrics and problem.problem_type not in METRICS_BY_PROBLEM_TYPE: - raise ValueError( - f"cannot compute all metrics for problem {problem.name}, " - "it does not have a supported problem type." - ) - - # If the experimenter is configured to save documents to the D3M database, - # we only want to execute and save this pipeline run if it doesn't already - # exist in the D3M database. - if SAVE_TO_D3M and D3MMtLDB().has_pipeline_been_run_on_problem(pipe, problem): - logger.info("Pipeline has already been run on this dataset, SKIPPING.") - return - - metric_names = ( - METRICS_BY_PROBLEM_TYPE[problem.problem_type] if all_metrics else None - ) - - # Attempt to run the pipeline - logger.info("\n Running pipeline on problem {}".format(problem.name)) - run_pipeline = RunPipeline(volumes_dir, problem) - try: - scores, (fit_result, produce_result) = run_pipeline.run( - pipeline=pipe, metric_names=metric_names - ) - except Exception as e: - logger.exception("pipeline was not successfully run") - print_pipeline(pipe.to_json_structure()) - raise e - - score = scores[0] - # put in the fit pipeline run - handle_successful_pipeline_run( - fit_result.pipeline_run.to_json_structure(), pipe, score - ) - # put in the produce pipeline run - handle_successful_pipeline_run( - produce_result.pipeline_run.to_json_structure(), pipe, score - ) - - -def execute_metafeatures_pipeline_on_problem( - pipe: Pipeline, problem: ProblemReference, volumes_dir: str -): - """ - The main function to execute a `metafeatures` pipeline. Differs from - `execute_pipeline_on_problem` by only handling metafeatures, and by - computing them on every subset of the problem e.g. TRAIN, TEST, SCORE, etc. - Called in `experimenter_driver.py`. This function will run the pipeline, - and record the results. - - :param pipe: the pipeline object that will be executed - :param problem: a reference to the problem to run the pipeline on. - :param volumes_dir: a string containing the path to the volumes directory - """ - mongo_db = PipelineDB() - - for subset in problem.valid_subsets: - if problem.has_subset(subset): - problem.subset = subset - logger.info( - f"computing metafeatures for problem {problem.name} ({problem.subset} subset)..." - ) - # Compute and store the metafeatures for this subset of the problem. - run_pipeline = RunFitPipeline(volumes_dir, problem) - try: - results = run_pipeline.run(pipeline=pipe) - except Exception as e: - logger.exception("pipeline was not successfully run") - print_pipeline(pipe._to_json_structure()) - raise e - - logger.info(results) - fit_result = results - mongo_db.add_to_metafeatures(fit_result._to_json_structure()) - - -def handle_successful_pipeline_run( - pipeline_run: dict, pipeline: Pipeline, score: float -): - """ - Called after a successful pipeline run. It will output the results to the console - and write it to the database. 
- - :param pipeline_run: the pipeline run object that will be recorded - :param pipeline: the pipeline that was run - :param score: the results from the execution of the pipeline - """ - if score["value"][0] == 0: - # F-SCORE was calculated wrong - quit and don't keep this run - return - - print_pipeline(pipeline.to_json_structure(), score) - d3m_db = D3MMtLDB() - - if not d3m_db.does_pipeline_exist_in_db(pipeline): - pipeline_save_response = d3m_db.save_pipeline(pipeline, save_primitives=True) - if pipeline_save_response.ok: - logger.info( - f"pipeline {pipeline.get_digest()} " - f"saved successfully, response: {pipeline_save_response.json()}" - ) - - pipeline_run_save_response = d3m_db.save_pipeline_run(pipeline_run) - if pipeline_run_save_response.ok: - logger.info( - f"pipeline run {pipeline_run['id']} " - f"saved successfully, response: {pipeline_run_save_response.json()}" - ) - - -def print_pipeline_and_problem(pipeline: dict, problem: str): - """ - A simple function to print the pipeline and problem, for debugging - - :param pipeline: the pipeline that was executed - :param problem: the dataset/problem that was used - """ - logger.info("Pipeline:") - logger.info(get_list_vertically(primitive_list_from_pipeline_object(pipeline))) - logger.info("on problem {} \n\n".format(problem)) - - -def get_primitive_combo_string(pipeline): - prim_string = "" - for p in pipeline["steps"]: - prim_string += p["primitive"]["id"] - return prim_string - - -def print_pipeline(pipeline: dict, score: float = None) -> List[str]: - """ - A helper function for printing a succesful run - - :param pipeline: the pipeline that we will print - :param score: the results of the metric used in training - :return primitive_list: a list of all the primitives used in the pipeline - """ - primitive_list = primitive_list_from_pipeline_json(pipeline) - logger.info("pipeline:\n") - logger.info(get_list_vertically(primitive_list)) - if score is not None: - logger.info("with a {} of {}".format(score["metric"][0], score["value"][0])) - return primitive_list - - -def primitive_list_from_pipeline_object(pipeline: Pipeline): - """ - A helper function to return all the primitives used in a pipeline - - :param pipeline: a pipeline object - """ - primitives = [] - for p in pipeline.steps: - primitives.append(p.to_json_structure()["primitive"]["python_path"]) - return primitives - - -def primitive_list_from_pipeline_json(pipeline_json: dict): - """ - A helper function to return all the primitives used in a pipeline - - :param pipeline_json a pipeline object in JSON form - """ - primitives = [] - for step in pipeline_json["steps"]: - primitives.append(step["primitive"]["python_path"]) - return primitives - - -def get_list_vertically(list_to_use: list, indent: bool = True): - """ - A helper function to join a list vertically. Used for debugging printing. 
- """ - final_list = ["\t" + item for item in list_to_use] if indent else list_to_use - return "\n" + "\n".join(final_list) diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py index 02aef61..391ec5b 100644 --- a/experimenter/modify_generator.py +++ b/experimenter/modify_generator.py @@ -3,13 +3,13 @@ import os import yaml -from d3m.contrib.pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_PATH as data_split_file +from d3m.contrib.pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_PATH from d3m.contrib.pipelines import SCORING_PIPELINE_PATH as scoring_file from experimenter.query import query_on_seeds from experimenter import queue, utils from experimenter.utils import download_from_database -from experimenter.evaluate_pipeline_new import evaluate_pipeline_on_problem as evaluate_pipeline +from experimenter.runtime import evaluate class ModifyGenerator: diff --git a/experimenter/problem_new.py b/experimenter/problem_new.py deleted file mode 100644 index e69de29..0000000 diff --git a/experimenter/utils.py b/experimenter/utils.py index 11f9eb5..dc10180 100644 --- a/experimenter/utils.py +++ b/experimenter/utils.py @@ -47,6 +47,23 @@ def get_scoring_pipelines(): scoring_dict[pipelines.SCORING_PIPELINE_ID] = pipelines.SCORING_PIPELINE_PATH return scoring_dict, scoring_id_list + +def get_pipeline_run_output_path(pipeline_path: str, dataset_path: str): + """ + get the output path of the pipeline run + """ + output_run_path = [] + #get the digests from the dataset and problem paths + with open(pipeline_path, 'r') as data: + pipeline = json.load(data) + output_run_path.append(pipeline['digest']) + with open(dataset_path, 'r') as data: + dataset = json.load(data) + output_run_path.append(dataset['digest']) + output_run_path.append(str(random_seed)) + output_run_path = os.path.abspath(os.path.join(config.output_run_path, '_'.join(output_run_path)+'.yaml')) + return output_run_path + def get_pipelines_from_d3m(pipeline_id: str = None, types='Data'): """Checks if data preparation pipeline is in d3m module, From 55e8ce706ef3b7d02da81dfd4eeea52d3827b6af Mon Sep 17 00:00:00 2001 From: Benson Manner Date: Wed, 24 Mar 2021 10:33:43 -0600 Subject: [PATCH 39/44] First round of suggested changes (mostly on runtime.py) --- experimenter/config.py | 4 ++-- experimenter/databases/d3m_mtl.py | 13 +++++++++++-- experimenter/modify_generator.py | 27 ++++++++++++++++++++------- experimenter/utils.py | 6 +++--- setup.py | 2 -- 5 files changed, 36 insertions(+), 16 deletions(-) diff --git a/experimenter/config.py b/experimenter/config.py index 3233730..d21f745 100644 --- a/experimenter/config.py +++ b/experimenter/config.py @@ -48,6 +48,6 @@ def validate_save(): output_run_path: str = os.path.abspath(os.path.join('/data', 'pipeline_runs')) -if (not os.path.exists(output_run_path)) +if (not os.path.exists(output_run_path)): #create the directory - os.makedirs(os.path.dirname(output_run_path), exist_ok=True) + os.makedirs(output_run_path, exist_ok=True) diff --git a/experimenter/databases/d3m_mtl.py b/experimenter/databases/d3m_mtl.py index da6c77e..b63a333 100644 --- a/experimenter/databases/d3m_mtl.py +++ b/experimenter/databases/d3m_mtl.py @@ -1,5 +1,6 @@ import logging import json +import yaml import requests from d3m.primitive_interfaces.base import PrimitiveBase @@ -105,7 +106,15 @@ def does_pipeline_exist_in_db(self, pipeline: Pipeline) -> bool: .count() ) return num_pipeline_matches > 0 - + + def save_pipeline_runs_from_path(self, pipeline_run_path: str) -> requests.Response: + responses = 
list() + with open(pipeline_run_path, 'r') as pipeline_data: + pipeline_runs = yaml.safe_load_all(pipeline_data) + for pipeline_run in pipeline_runs: + responses.append(self.save_pipeline_run(pipeline_run).content) + return responses + def save_pipeline_run(self, pipeline_run: dict) -> requests.Response: return self._save(pipeline_run, "pipeline-run") @@ -156,7 +165,7 @@ def _create_no_save_response(self) -> requests.Response: response.status_code = 200 response._content = ( b'{ "result" : "No request was made to the D3M DB API to save a record, ' - b'since the SAVE_TO_D3M environment variable is not set." }' + b'since the SAVE_TO_D3M environment variable is not set to true." }' ) return response diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py index 57cd391..ad1acab 100644 --- a/experimenter/modify_generator.py +++ b/experimenter/modify_generator.py @@ -55,17 +55,29 @@ def _get_generator(self): #save the pipeline to path and return pipeline path data_prep_pipeline, data_random_seed, data_params = data scoring_pipeline, scoring_random_seed, scoring_params = score - pipeline_path = download_from_database(pipeline, type_to_download='Pipeline') + pipeline_path = download_from_database(pipeline, type_to_download='pipelines') #TODO - catch when there is no data preparation pipeline and pass it further to evaluate #catch error returning none for file paths or preparation pipeline if (problem_path is None or dataset_doc is None or data_prep_pipeline is None): continue #check if query returned a path or an id if (os.path.exists(data_prep_pipeline) is False): - data_prep_pipeline = download_from_database(data_prep_pipeline, type_to_download='Data Preparation') + data_prep_pipeline = download_from_database(data_prep_pipeline, type_to_download='data-preparation-pipelines') if (os.path.exists(scoring_pipeline) is False): - scoring_pipeline = download_from_database(scoring_pipeline, type_to_download='Scoring') - job = queue.make_job(evaluate_pipeline, + scoring_pipeline = download_from_database(scoring_pipeline, type_to_download='scoring-pipelines') + evaluate(pipeline=pipeline_path, + problem=problem_path, + input=dataset_doc, + random_seed=seed, + data_pipeline=data_prep_pipeline, + data_random_seed=data_random_seed, + data_params=data_params, + scoring_pipeline=scoring_pipeline, + scoring_random_seed=scoring_random_seed, + scoring_params=scoring_params, + runtime_arg='evaluate') + + job = queue.make_job(evaluate, pipeline=pipeline_path, problem=problem_path, input=dataset_doc, @@ -75,7 +87,8 @@ def _get_generator(self): data_params=data_params, scoring_pipeline=scoring_pipeline, scoring_random_seed=scoring_random_seed, - scoring_params=scoring_params) + scoring_params=scoring_params, + runtime_arg='evaluate') self.num_complete += 1 yield job @@ -137,7 +150,7 @@ def _modify_random_seed(self, seed_limit, query_args): def _run_seed_test(self,args): """ Test designed for development and functionality purposes. 
- It uses and dataset and pipeline that is saved in d3m-experimenter + It uses a dataset and pipeline that is saved in the d3m-experimenter """ with open('experimenter/pipelines/bagging_classification.json', 'r') as pipeline_file: pipeline = json.load(pipeline_file) @@ -145,7 +158,7 @@ def _run_seed_test(self,args): problem_path = utils.get_problem_path('185_baseball_MIN_METADATA_problem') data_prep_seed = 0 data_prep_seed = 0 - data_prep_pipeline = data_split_file + data_prep_pipeline = K_FOLD_TABULAR_SPLIT_PIPELINE_PATH scoring_pipeline = scoring_file scoring_seed = 0 used_seeds = {2,15} diff --git a/experimenter/utils.py b/experimenter/utils.py index dc10180..27cf135 100644 --- a/experimenter/utils.py +++ b/experimenter/utils.py @@ -48,7 +48,7 @@ def get_scoring_pipelines(): return scoring_dict, scoring_id_list -def get_pipeline_run_output_path(pipeline_path: str, dataset_path: str): +def get_pipeline_run_output_path(pipeline_path: str, dataset_path: str, random_seed: int): """ get the output path of the pipeline run """ @@ -59,7 +59,7 @@ def get_pipeline_run_output_path(pipeline_path: str, dataset_path: str): output_run_path.append(pipeline['digest']) with open(dataset_path, 'r') as data: dataset = json.load(data) - output_run_path.append(dataset['digest']) + output_run_path.append(dataset['about']['digest']) output_run_path.append(str(random_seed)) output_run_path = os.path.abspath(os.path.join(config.output_run_path, '_'.join(output_run_path)+'.yaml')) return output_run_path @@ -91,7 +91,7 @@ def save_to_not_exist_file(filename:str = 'dataset_dne.txt', save_id:str = None) to_save.write(save_id+'\n') -def download_from_database(data, type_to_download: str = 'Pipeline'): +def download_from_database(data, type_to_download: str = 'pipeline'): i_d = data['id'] save_path = os.path.join('/data', type_to_download, i_d+str('.json')) #create the new directory diff --git a/setup.py b/setup.py index 156b02d..fe96b85 100644 --- a/setup.py +++ b/setup.py @@ -10,8 +10,6 @@ packages=find_packages(), python_requires='>=3.6,<4.0', install_requires=[ - 'docker>=4.4.0<4.5.0', - 'mypy==0.812', 'd3m', # TODO: add version bounds 'redis>=3.5.0<3.6.0', 'rq>=1.7.0<1.8.0', From 5777cd8b25b0dbadd4fc4f19ca024b93b163f384 Mon Sep 17 00:00:00 2001 From: Benson Manner Date: Wed, 24 Mar 2021 20:43:55 -0600 Subject: [PATCH 40/44] Cleaned query.py and added tests to test_modifier --- experimenter/config.py | 32 ++++++++ experimenter/modify_generator.py | 64 +++++++-------- experimenter/query.py | 134 +++++++++++++++---------------- experimenter/utils.py | 2 - tests/test_modifier.py | 47 ++++++++--- 5 files changed, 166 insertions(+), 113 deletions(-) diff --git a/experimenter/config.py b/experimenter/config.py index d21f745..04d6332 100644 --- a/experimenter/config.py +++ b/experimenter/config.py @@ -10,6 +10,7 @@ _ERROR_MESSAGE = 'environment variable not set: {}' +#parse the .env file datasets_dir: str = os.environ.get('DATASETS_DIR', None) def validate_datasets_dir(): if datasets_dir is None: @@ -47,7 +48,38 @@ def validate_save(): raise exceptions.ConfigError(_ERROR_MESSAGE.format('SAVE_TO_D3M')) +query_host: str = os.environ.get('QUERY_HOST', 'https://metalearning.datadrivendiscovery.org/es') +def validate_query_host(): + if query_host is None: + raise exceptions.ConfigError(_ERROR_MESSAGE.format('QUERY_HOST')) + + +query_timeout: int = int(os.environ.get('QUERY_TIMEOUT', '500')) +def validate_query_timeout(): + if query_timeout is None: + raise exceptions.ConfigError(_ERROR_MESSAGE.format('QUERY_TIMEOUT')) + + 
+#get the save paths for the experimenter from the point of view of the docker container output_run_path: str = os.path.abspath(os.path.join('/data', 'pipeline_runs')) if (not os.path.exists(output_run_path)): #create the directory os.makedirs(output_run_path, exist_ok=True) + + +pipelines_path: str = os.path.abspath(os.path.join('/data', 'pipelines')) +if (not os.path.exists(pipelines_path)): + #create the directory + os.makedirs(pipelines_path, exist_ok=True) + + +data_prep_pipelines_path: str = os.path.abspath(os.path.join('/data', 'data-preparation-pipelines')) +if (not os.path.exists(data_prep_pipelines_path)): + #create the directory + os.makedirs(data_prep_pipelines_path, exist_ok=True) + + +scoring_pipelines_path: str = os.path.abspath(os.path.join('/data', 'scoring-pipelines')) +if (not os.path.exists(scoring_pipelines_path)): + #create the directory + os.makedirs(scoring_pipelines_path, exist_ok=True) diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py index ad1acab..1255357 100644 --- a/experimenter/modify_generator.py +++ b/experimenter/modify_generator.py @@ -6,8 +6,7 @@ from d3m.contrib.pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_PATH from d3m.contrib.pipelines import SCORING_PIPELINE_PATH as scoring_file -from experimenter.query import query_on_seeds -from experimenter import queue, utils +from experimenter import queue, utils, query from experimenter.utils import download_from_database from experimenter.runtime import evaluate @@ -23,11 +22,7 @@ def __init__(self, modify_type: str='random-seed', max_jobs: int=None, args=None self.max_jobs = max_jobs self.num_complete = 0 #run the query on initializing to define the query results - if (args.test is True): - self.query_results = self._run_seed_test(self.args) - else: - self.query_results = self._query(self.args) - self.generator = self._get_generator() + self.query_results = None def __iter__(self): @@ -36,7 +31,7 @@ def __iter__(self): def __next__(self): #iterate through query results - job = next(self.generator) + job = next(self._get_generator()) if (self.max_jobs): if (self.num_complete > self.max_jobs): raise StopIteration @@ -49,6 +44,8 @@ def _get_generator(self): Can only handle cases where there is a data preparation pipeline in the pipeline run """ + if (self.query_results is None): + self.query_results = self._query(self.args) for query_result in self.query_results: #iterate through modifier results for pipeline, problem_path, dataset_doc, seed, data, score in self._modify(query_result,self.args): @@ -74,8 +71,7 @@ def _get_generator(self): data_params=data_params, scoring_pipeline=scoring_pipeline, scoring_random_seed=scoring_random_seed, - scoring_params=scoring_params, - runtime_arg='evaluate') + scoring_params=scoring_params) job = queue.make_job(evaluate, pipeline=pipeline_path, @@ -87,8 +83,7 @@ def _get_generator(self): data_params=data_params, scoring_pipeline=scoring_pipeline, scoring_random_seed=scoring_random_seed, - scoring_params=scoring_params, - runtime_arg='evaluate') + scoring_params=scoring_params) self.num_complete += 1 yield job @@ -146,30 +141,33 @@ def _modify_random_seed(self, seed_limit, query_args): yield (query_args['pipeline'], query_args['problem_path'], query_args['dataset_doc_path'], new_seed, (query_args['data_prep_pipeline'], query_args['data_prep_seed'], query_args['data_params']), (query_args['scoring_pipeline'], query_args['scoring_seed'], query_args['scoring_params'])) - - - def _run_seed_test(self,args): - """ Test designed for development and 
functionality purposes.
-            It uses a dataset and pipeline that is saved in the d3m-experimenter
-        """
-        with open('experimenter/pipelines/bagging_classification.json', 'r') as pipeline_file:
-            pipeline = json.load(pipeline_file)
-        dataset_path = utils.get_dataset_doc_path('185_baseball_MIN_METADATA_dataset')
-        problem_path = utils.get_problem_path('185_baseball_MIN_METADATA_problem')
-        data_prep_seed = 0
-        data_prep_seed = 0
-        data_prep_pipeline = K_FOLD_TABULAR_SPLIT_PIPELINE_PATH
-        scoring_pipeline = scoring_file
-        scoring_seed = 0
-        used_seeds = {2,15}
-        yield {'pipeline': pipeline, 'problem_path': problem_path, 'dataset_doc_path': dataset_path,
-               'tested_seeds': used_seeds, 'data_prep_pipeline': data_prep_pipeline,
-               'data_prep_seed': data_prep_seed, 'data_params': None,
-               'scoring_pipeline': scoring_pipeline, 'scoring_seed': scoring_seed,
-               'scoring_params': None}
 
     def _modify_swap_primitive(self, swap_pipeline, query_args):
         raise ValueError("No functionality for swapping primitives yet")
 
+
+def query_on_seeds(pipeline_id: str=None, limit: int=None, submitter: str='byu'):
+    """
+    Helper function for generating jobs to be used in the random-seed swapping
+    generator
+    """
+    arguments = {'id': pipeline_id, '_submitter': submitter}
+    pipeline_search = query.match_query(index='pipelines', arguments=arguments)
+    for pipeline in pipeline_search.scan():
+        pipeline_run_query = query.scan_pipeline_runs(pipeline.id, submitter)
+        pipeline = pipeline.to_dict()
+        for run_tuple, pipeline_run_params in pipeline_run_query.items():
+            #get the unique params from the params list
+            unique_run_params = query.combine_unique_params(pipeline_run_params)
+            #unpack values from tuple
+            query_arg_dict = query.unpack_run_tuple_args(run_tuple)
+            for params in unique_run_params:
+                query_args = query_arg_dict.copy()
+                query_args['data_params'] = params['data_params']
+                query_args['scoring_params'] = params['scoring_params']
+                query_args['tested_seeds'] = params['random_seeds']
+                query_args['pipeline'] = pipeline
+                if limit and len(query_args['tested_seeds']) > limit:
+                    continue
+                yield query_args
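
For orientation, a minimal sketch of how the module-level generator above might be consumed on its own (illustrative only; the submitter value and limit are example inputs, and the dict keys come from unpack_run_tuple_args and query_on_seeds):

    from experimenter.modify_generator import query_on_seeds

    #each yielded item is a dict of job arguments for one pipeline/problem/dataset combination
    for query_args in query_on_seeds(pipeline_id=None, limit=20, submitter='byu'):
        print(query_args['problem_path'], len(query_args['tested_seeds']))
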
diff --git a/experimenter/query.py b/experimenter/query.py
index 96618ad..b037bbb 100644
--- a/experimenter/query.py
+++ b/experimenter/query.py
@@ -2,84 +2,75 @@
 from elasticsearch_dsl import Search, Q
 from experimenter.utils import get_problem_path, get_dataset_doc_path, get_pipelines_from_d3m
 from d3m.runtime import _get_data_and_scoring_params_from_pipeline_run as _data_score_params
+from experimenter import config
 
-
-HOST = 'https://metalearning.datadrivendiscovery.org/es'
-CONNECTION = Elasticsearch(hosts=[HOST], timeout=300)
+CONNECTION = Elasticsearch(hosts=[config.query_host], timeout=config.query_timeout)
 
-def get_search_query(arguments: dict = None, connection = CONNECTION, index='pipelines'):
-    index_search = Search(using=CONNECTION, index=index)
-    if arguments['id'] is not None:
-        index_search = index_search.query('match', id=arguments['id'])
-    if arguments['submitter'] is not None:
-        index_search = index_search.query('match', _submitter=arguments['submitter'])
+def match_query(index: str, arguments: dict = None, connection = CONNECTION):
+    #remove None arguments from the dictionary
+    filtered_args = {k: v for k, v in arguments.items() if v is not None}
+    #initialize the search
+    index_search = Search(using=connection, index=index)
+    for field, argument in filtered_args.items():
+        arg_dict = dict()
+        arg_dict[field] = argument
+        index_search = index_search.query('match', **arg_dict)
     return index_search
 
-def query_on_seeds(pipeline_id: str=None, limit: int=None, submitter: str='byu'):
-    arguments = {'id': pipeline_id, 'submitter': submitter}
-    pipeline_search = get_search_query(arguments=arguments, index='pipelines')
-    for pipeline in pipeline_search.scan():
-        results = scan_pipeline_runs(pipeline.id, submitter)
-        for dataset_prob_tuple, results_dict in results.items():
-            unique_items = get_unique_results(results_dict)
-            #unpack values from tuple
-            problem_id, dataset_id, data_prep, scoring = dataset_prob_tuple
-            scoring_id, scoring_random_seed = scoring
-            data_prep_id, data_prep_seed = data_prep
-            #get preparation and scoring pipelines
-            data_prep_pipeline = get_pipeline(data_prep_id, types='Data')
-            scoring_pipeline = get_pipeline(scoring_id, types='Scoring')
-            for params in unique_items:
-                data_params = params['data_params']
-                scoring_params = params['scoring_params']
-                random_seeds = params['random_seeds']
-                if limit and len(random_seeds) > limit:
-                    continue
-                yield {'pipeline': pipeline.to_dict(), 'problem_path': get_problem_path(problem_id),
-                       'dataset_doc_path':get_dataset_doc_path(dataset_id), 'tested_seeds': random_seeds,
-                       'data_prep_pipeline': data_prep_pipeline, 'data_prep_seed': data_prep_seed,
-                       'scoring_pipeline': scoring_pipeline, 'scoring_seed': scoring_random_seed,
-                       'scoring_params': scoring_params, 'data_params': data_params}
 
+def unpack_run_tuple_args(run_tuple: tuple):
+    #unpack values from tuple
+    problem_id, dataset_id, data_prep_data, scoring_data = run_tuple
+    scoring_id, scoring_random_seed = scoring_data
+    data_prep_id, data_prep_seed = data_prep_data
+    #get preparation and scoring pipelines
+    data_prep_pipeline = get_pipeline(data_prep_id, types='Data')
+    scoring_pipeline = get_pipeline(scoring_id, types='Scoring')
+    return {'problem_path': get_problem_path(problem_id),
+            'dataset_doc_path': get_dataset_doc_path(dataset_id),
+            'data_prep_pipeline': data_prep_pipeline, 'data_prep_seed': data_prep_seed,
+            'scoring_pipeline': scoring_pipeline, 'scoring_seed': scoring_random_seed}
 
 def get_pipeline(pipeline_id: str=None, types: str='Data'):
+    """
+    gets a pipeline from the database, if it is not already
+    in the d3m module
+    """
     if (pipeline_id is None):
         return None
     pipeline = get_pipelines_from_d3m(pipeline_id, types=types)
     #get from database if not in d3m module
     if (pipeline is None):
-        arguments = {'submitter': None, 'id': data_prep_id}
-        search = get_search_query(arguments=arguments)
+        arguments = {'id': pipeline_id}
+        search = match_query(index='pipelines', arguments=arguments)
         pipeline = next(search.scan())
         pipeline = pipeline.to_dict()
     return pipeline
 
 def check_for_data_prep(pipeline_run=None):
-    """Only handles cases with an explicit data preparation pipeline
-    in the pipeline run
+    """Handles cases with an explicit data preparation pipeline
+    in the pipeline run; returns None values when the pipeline run
+    has no preparation pipeline
     """
-    data_prep = None
-    data_prep_id = None
-    data_prep_seed = None
-    data_params = None
     try:
         data_prep = pipeline_run.run.data_preparation
-    except:
-        data_prep = None
-        data_prep_seed = None
-    if (data_prep is not None):
         data_prep_seed = data_prep.random_seed
         data_prep_id = data_prep.pipeline.id
         data_prep = data_prep.to_dict()
         data_params = _data_score_params(data_prep.get('steps', []))
+    except:
+        data_prep, data_prep_seed, data_prep_id, data_params = None
     return (data_prep_id, data_prep_seed), data_params
 
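
For reference, the match_query helper above composes one Elasticsearch DSL 'match' clause per non-None argument, so a call such as the following (illustrative values only):

    match_query(index='pipelines', arguments={'id': None, '_submitter': 'byu'})

drops the None-valued key and is equivalent to:

    Search(using=CONNECTION, index='pipelines').query('match', _submitter='byu')
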
 
-def get_scoring_pipeline(pipeline_run=None):
+def get_scoring_pipeline(pipeline_run):
+    """
+    returns the scoring pipeline from the pipeline run
+    """
     scoring = pipeline_run.run.scoring
     scoring_seed = scoring.random_seed
     scoring_id = scoring.pipeline.id
@@ -89,6 +80,10 @@
 
 
 def get_list_duplicates(params_list, match_item):
+    """
+    takes in a list of params and an item to match,
+    returns a list of matching indices in the list
+    """
     start_loc = -1
     locs = []
     while True:
@@ -102,46 +97,49 @@
     return locs
 
 
-def get_unique_results(results: dict = None):
-    #function for getting unique results from the result dictionary
-    random_seeds_list = results['random_seeds']
-    params_list = results['params']
+def combine_unique_params(param_dict_list: dict = None):
+    """
+    reduces the param_dict_list into a list of unique parameters with
+    combined random seeds
+    """
+    random_seeds_list = param_dict_list['random_seeds']
+    params_list = param_dict_list['params']
     final_list = list()
-    location_dict = dict()
-    #loop through the values
+    location_dict = dict() #initialize dictionary for storing matching indices
+    #loop through the parameter values
     for it, param in enumerate(params_list):
         #get matching pairs of each value
         location_dict[it] = get_list_duplicates(params_list, param)
-    skip = set()
+    skip = set() #initialize set of locations to skip
     for loc, values in location_dict.items():
+        #only need to match once to match in other locations (add to skip)
         if loc in skip:
             continue
         random_seeds = set()
         for value in values:
+            #add matched params random seeds to same set
            random_seeds.add(random_seeds_list[value])
            skip.add(value)
        data_params, scoring_params = params_list[loc]
+        #combine matching params with aggregated set of random seeds
        final_list.append({'data_params': data_params, 'scoring_params': scoring_params, 'random_seeds': random_seeds})
    return final_list
 
 
 def scan_pipeline_runs(pipeline_id, submitter=None):
-    pipeline_run_search = Search(using=CONNECTION, index='pipeline_runs') \
-        .query('match', pipeline__id=pipeline_id) \
-        .query('match', run__phase='PRODUCE') \
-        .query('match', status__state='SUCCESS')
-    if submitter:
-        pipeline_run_search = pipeline_run_search.query('match', _submitter=submitter)
-    results = dict()
+    query_arguments = {'pipeline__id': pipeline_id, 'run__phase': 'PRODUCE',
+                       'status__state': 'SUCCESS', '_submitter': submitter}
+    pipeline_run_search = match_query(index='pipeline_runs', arguments=query_arguments)
+    query_results = dict()
     for pipeline_run in pipeline_run_search.scan():
         data_prep, data_params = check_for_data_prep(pipeline_run=pipeline_run)
         scoring, scoring_params = get_scoring_pipeline(pipeline_run)
         for dataset in pipeline_run.datasets:
-            dataset_prob_tuple = (pipeline_run.problem.id, dataset.id, data_prep, scoring)
-            results[dataset_prob_tuple] = results.get(dataset_prob_tuple, dict())
-            results[dataset_prob_tuple]['random_seeds'] = results[dataset_prob_tuple].get('random_seed', list())
-            results[dataset_prob_tuple]['params'] = results[dataset_prob_tuple].get('params', list())
-            results[dataset_prob_tuple]['random_seeds'].append(pipeline_run.random_seed)
-            results[dataset_prob_tuple]['params'].append((data_params, scoring_params))
-    return results
+            run_tuple = (pipeline_run.problem.id, dataset.id, data_prep, scoring)
+            query_results[run_tuple] = query_results.get(run_tuple, dict())
+            query_results[run_tuple]['random_seeds'] = query_results[run_tuple].get('random_seeds', list())
+            query_results[run_tuple]['params'] = query_results[run_tuple].get('params', list())
+            query_results[run_tuple]['random_seeds'].append(pipeline_run.random_seed)
+            query_results[run_tuple]['params'].append((data_params, scoring_params))
+    return query_results
diff --git a/experimenter/utils.py b/experimenter/utils.py
index 27cf135..bec55ac 100644
--- a/experimenter/utils.py
+++ b/experimenter/utils.py
@@ -94,8 +94,6 @@ def save_to_not_exist_file(filename:str = 'dataset_dne.txt', save_id:str = None)
 def download_from_database(data, type_to_download: str = 'pipeline'):
     i_d = data['id']
     save_path = os.path.join('/data', type_to_download, i_d+str('.json'))
-    #create the new directory
-    os.makedirs(os.path.dirname(save_path),exist_ok=True)
     #save the file to the directory
     with open(save_path, 'w') as to_save:
         json.dump(data, to_save, indent=4)
diff --git a/tests/test_modifier.py b/tests/test_modifier.py
index 3b1fb9a..8c11571 100644
--- a/tests/test_modifier.py
+++ b/tests/test_modifier.py
@@ -1,14 +1,17 @@
 import unittest
 from experimenter import modify_generator, queue, exceptions, utils
-from query import query_on_seeds
+from d3m.contrib.pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_PATH
+from d3m.contrib.pipelines import SCORING_PIPELINE_PATH as scoring_file
 
-class ModifierTestCase(unittest.TestCase):
+class GeneratorModifierTestCase(unittest.TestCase):
 
-    def test_seed_modifier(self):
+
+    def test_random_seed_modifier_job_count(self):
         #initialize the modifier with random-seed and a given max jobs
-        args = {'seed_limit':25, 'submitter':None, 'pipeline_id':None}
-        num_test = 21
+        args = {'seed_limit':35, 'submitter':None, 'pipeline_id':None}
+        num_test = 5
         modifier = modify_generator.ModifyGenerator('random-seed', num_test, {'seed_limit':25})
+        modifier.query_args = get_seed_test_args
         #start the counter to make sure there are the right amount of jobs
         counter = 0
         seed_old = 12.1
@@ -19,14 +22,38 @@
             self.assertNotEqual(seed_old, seed_new)
             seed_old = seed_new
         self.assertEqual(counter,num_test)
+
 
-    def test_query_seeds(self):
+    def test_query_random_seeds_set_size(self):
         args = {'seed_limit':25, 'submitter':'byu', 'pipeline_id':None}
-        query_results = query_on_seeds(args.pipeline_id, args.seed_limit, args.submitter)
+        query_results = modify_generator.query_on_seeds(args['pipeline_id'], args['seed_limit'], args['submitter'])
         #test 10 query results
         for i in range(10):
-            _,_,seed_list = next(query_results)
-            self.assertTrue(len(seed_list) < seed_limit)
-
+            query = next(query_results)
+            self.assertTrue(len(query['tested_seeds']) < seed_limit)
+
+
+    def get_seed_test_args(self,args):
+        """ returns args for testing modify generator random-seed
+            functionality purposes. It uses a dataset and pipeline
+            that is saved in the d3m-experimenter
+        """
+        with open('experimenter/pipelines/bagging_classification.json', 'r') as pipeline_file:
+            pipeline = json.load(pipeline_file)
+        dataset_path = utils.get_dataset_doc_path('185_baseball_MIN_METADATA_dataset')
+        problem_path = utils.get_problem_path('185_baseball_MIN_METADATA_problem')
+        data_prep_seed = 0
+        data_prep_seed = 0
+        data_prep_pipeline = K_FOLD_TABULAR_SPLIT_PIPELINE_PATH
+        scoring_pipeline = scoring_file
+        scoring_seed = 0
+        used_seeds = {2,15}
+        yield {'pipeline': pipeline, 'problem_path': problem_path, 'dataset_doc_path': dataset_path,
+               'tested_seeds': used_seeds, 'data_prep_pipeline': data_prep_pipeline,
+               'data_prep_seed': data_prep_seed, 'data_params': None,
+               'scoring_pipeline': scoring_pipeline, 'scoring_seed': scoring_seed,
+               'scoring_params': None}
+
+
 if __name__ == '__main__':
     unittest.main()
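
A worked sketch of the aggregation the two query helpers above perform (illustrative, made-up values; 'number_of_folds' is only an example parameter name): scan_pipeline_runs groups runs by (problem, dataset, prep, scoring) and collects parallel lists of seeds and params, which combine_unique_params then merges:

    runs = {'random_seeds': [11, 42, 7],
            'params': [(None, None), ({'number_of_folds': '5'}, None), (None, None)]}
    #combine_unique_params(runs) merges duplicate param pairs and unions their seeds:
    #[{'data_params': None, 'scoring_params': None, 'random_seeds': {11, 7}},
    # {'data_params': {'number_of_folds': '5'}, 'scoring_params': None, 'random_seeds': {42}}]
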
From a235dfbd3c96a61634aae1065402bcd98079a169 Mon Sep 17 00:00:00 2001
From: Benson Manner
Date: Fri, 26 Mar 2021 09:37:31 -0600
Subject: [PATCH 41/44] Test Updates and query cleaning

---
 experimenter/query.py  |  7 ++++---
 tests/test_modifier.py | 14 ++++++++++++--
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/experimenter/query.py b/experimenter/query.py
index b037bbb..c0e477d 100644
--- a/experimenter/query.py
+++ b/experimenter/query.py
@@ -2,7 +2,7 @@
 from elasticsearch_dsl import Search, Q
 from experimenter.utils import get_problem_path, get_dataset_doc_path, get_pipelines_from_d3m
 from d3m.runtime import _get_data_and_scoring_params_from_pipeline_run as _data_score_params
-from experimenter import config
+from experimenter import config, exceptions
 
 CONNECTION = Elasticsearch(hosts=[config.query_host], timeout=config.query_timeout)
 
@@ -61,8 +61,9 @@
         data_prep_id = data_prep.pipeline.id
         data_prep = data_prep.to_dict()
         data_params = _data_score_params(data_prep.get('steps', []))
-    except:
-        data_prep, data_prep_seed, data_prep_id, data_params = None
+    except KeyError:
+        #no data preparation pipeline in pipeline run, return None values
+        data_prep = data_prep_seed = data_prep_id = data_params = None
     return (data_prep_id, data_prep_seed), data_params
 
diff --git a/tests/test_modifier.py b/tests/test_modifier.py
index 8c11571..87761b4 100644
--- a/tests/test_modifier.py
+++ b/tests/test_modifier.py
@@ -1,5 +1,6 @@
 import unittest
 from experimenter import modify_generator, queue, exceptions, utils
+from experimenter.databases.d3m_mtl import D3MMtLDB
 from d3m.contrib.pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_PATH
 from d3m.contrib.pipelines import SCORING_PIPELINE_PATH as scoring_file
 
@@ -11,7 +12,7 @@
         args = {'seed_limit':35, 'submitter':None, 'pipeline_id':None}
         num_test = 5
         modifier = modify_generator.ModifyGenerator('random-seed', num_test, {'seed_limit':25})
-        modifier.query_args = get_seed_test_args
+        modifier.query_args = self.get_seed_test_args
         #start the counter to make sure there are the right amount of jobs
         counter = 0
         seed_old = 12.1
@@ -31,8 +32,17 @@
         for i in range(10):
             query = next(query_results)
             self.assertTrue(len(query['tested_seeds']) < seed_limit)
+
 
-
+    def test_d3m_interface_init(self):
+        init_fail = False
+        try:
+            d3m_db = D3MMtLDB()
+        except:
+            init_fail = True
+        self.assertFalse(init, "D3M Interface Failed")
+
+
     def get_seed_test_args(self,args):
         """ returns args for testing modify generator random-seed
             functionality purposes.
It uses a dataset and pipeline From 635b1c00a25be6d150cf36efa9f5f95dce2ceaa8 Mon Sep 17 00:00:00 2001 From: Benson Manner Date: Fri, 26 Mar 2021 10:01:45 -0600 Subject: [PATCH 42/44] docker compose update and finish suggested changes --- docker-compose.yml | 4 ---- experimenter/modify_generator.py | 13 +------------ 2 files changed, 1 insertion(+), 16 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 61bbae1..0ff1bda 100755 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -23,10 +23,6 @@ services: - type: bind source: '${DATA_DIR}' target: /data - - type: bind - source: '${EXPERIMENTER_DIR}' - target: /d3m-experimenter - read_only: true command: 'rq worker --url redis://${REDIS_HOST} ${RQ_QUEUES}' networks: - default diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py index 1255357..3e912bc 100644 --- a/experimenter/modify_generator.py +++ b/experimenter/modify_generator.py @@ -62,23 +62,12 @@ def _get_generator(self): data_prep_pipeline = download_from_database(data_prep_pipeline, type_to_download='data-preparation-pipelines') if (os.path.exists(scoring_pipeline) is False): scoring_pipeline = download_from_database(scoring_pipeline, type_to_download='scoring-pipelines') - evaluate(pipeline=pipeline_path, - problem=problem_path, - input=dataset_doc, - random_seed=seed, - data_pipeline=data_prep_pipeline, - data_random_seed=data_random_seed, - data_params=data_params, - scoring_pipeline=scoring_pipeline, - scoring_random_seed=scoring_random_seed, - scoring_params=scoring_params) - job = queue.make_job(evaluate, pipeline=pipeline_path, problem=problem_path, input=dataset_doc, random_seed=seed, - data_pipeline_path=data_prep_pipeline, + data_pipeline=data_prep_pipeline, data_random_seed=data_random_seed, data_params=data_params, scoring_pipeline=scoring_pipeline, From 1e5a9469d96a805bfed451fcb9de394a7df9e990 Mon Sep 17 00:00:00 2001 From: Benson Manner Date: Fri, 26 Mar 2021 16:25:06 -0600 Subject: [PATCH 43/44] Fix tests and generator part of the modify generator --- experimenter/cli.py | 6 ++++- experimenter/modify_generator.py | 37 ++++++++++++++++++----------- tests/test_modifier.py | 40 ++++++++++++++++++-------------- 3 files changed, 50 insertions(+), 33 deletions(-) diff --git a/experimenter/cli.py b/experimenter/cli.py index fb1d996..e87ff94 100644 --- a/experimenter/cli.py +++ b/experimenter/cli.py @@ -173,7 +173,11 @@ def configure_modify_parser(parser: argparse.ArgumentParser) -> None: def modify_handler(arguments: argparse.Namespace, parser: argparse.ArgumentParser) -> None: modify_type = arguments.modify_type - modify_generator = ModifyGenerator(modify_type, arguments.max_jobs, arguments) + modify_generator = ModifyGenerator(modify_type = modify_type, + max_jobs = arguments.max_jobs, + seed_limit = arguments.seed_limit, + submitter = arguments.submitter, + pipeline_id = arguments.pipeline_id) #now run the enqueuer part queue.enqueue_jobs(jobs=modify_generator, job_timeout=arguments.job_timeout) diff --git a/experimenter/modify_generator.py b/experimenter/modify_generator.py index 3e912bc..211ac2f 100644 --- a/experimenter/modify_generator.py +++ b/experimenter/modify_generator.py @@ -15,14 +15,18 @@ class ModifyGenerator: """ Generator to be used for creating modified pipelines based on existing pipelines in the database """ - def __init__(self, modify_type: str='random-seed', max_jobs: int=None, args=None): - self.args = args + def __init__(self, modify_type: str='random-seed', + max_jobs: int=None, seed_limit = None, + 
submitter = None, pipeline_id = None): #intialize commonly used variables self.modifier_type = modify_type self.max_jobs = max_jobs + self.seed_limit = seed_limit + self.submitter = submitter + self.pipeline_id = pipeline_id self.num_complete = 0 #run the query on initializing to define the query results - self.query_results = None + self._set_query_results() def __iter__(self): @@ -31,24 +35,29 @@ def __iter__(self): def __next__(self): #iterate through query results - job = next(self._get_generator()) + job = next(self.generator) if (self.max_jobs): if (self.num_complete > self.max_jobs): raise StopIteration return job + def _set_query_results(self, query_results=None): + self.query_results = query_results + if query_results is None: + self.query_results = self._query() + self.generator = self._get_generator() + + def _get_generator(self): """ Main generator to be used of ModifyGenerator class Can only handle cases where there is a data preparation pipeline in the pipeline run """ - if (self.query_results is None): - self.query_results = self._query(self.args) for query_result in self.query_results: #iterate through modifier results - for pipeline, problem_path, dataset_doc, seed, data, score in self._modify(query_result,self.args): + for pipeline, problem_path, dataset_doc, seed, data, score in self._modify(query_result): #save the pipeline to path and return pipeline path data_prep_pipeline, data_random_seed, data_params = data scoring_pipeline, scoring_random_seed, scoring_params = score @@ -77,24 +86,24 @@ def _get_generator(self): yield job - def _query(self, args): + def _query(self): """method for querying database according to pipeline modification type """ if (self.modifier_type=='random-seed'): - return query_on_seeds(args.pipeline_id, args.seed_limit, args.submitter) + return query_on_seeds(self.pipeline_id, self.seed_limit, self.submitter) if (self.modifier_type=='swap-primitive'): - return query_on_primitive(args.primitive_id, args.limit_indeces) + return query_on_primitive(self.primitive_id, self.limit_indeces) else: raise ValueError("This type of modification is not yet an option") - def _modify(self, query_args: dict, args): + def _modify(self, query_args): """Handler for different types of pipeline modification tasks """ if self.modifier_type=='random-seed': - return self._modify_random_seed(args.seed_limit, query_args) + return self._modify_random_seed(self.seed_limit, query_args) if self.modifier_type=='swap-primitive': - return self._modify_swap_primitive(args.swap_primitive_id, query_args) + return self._modify_swap_primitive(self.swap_primitive_id, query_args) else: raise ValueError("This type of modification is not yet an option") @@ -120,7 +129,7 @@ def _modify_random_seed(self, seed_limit, query_args): used_seeds = query_args['tested_seeds'] num_run = len(used_seeds) #run until the right number of seeds have been run - while (num_run < seed_limit): + while (num_run < self.seed_limit): new_seed = randint(1,100000) if (new_seed in used_seeds): continue diff --git a/tests/test_modifier.py b/tests/test_modifier.py index 87761b4..4206c0e 100644 --- a/tests/test_modifier.py +++ b/tests/test_modifier.py @@ -1,38 +1,42 @@ import unittest +import json from experimenter import modify_generator, queue, exceptions, utils from experimenter.databases.d3m_mtl import D3MMtLDB from d3m.contrib.pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_PATH from d3m.contrib.pipelines import SCORING_PIPELINE_PATH as scoring_file + class GeneratorModifierTestCase(unittest.TestCase): def 
test_random_seed_modifier_job_count(self): #initialize the modifier with random-seed and a given max jobs - args = {'seed_limit':35, 'submitter':None, 'pipeline_id':None} num_test = 5 - modifier = modify_generator.ModifyGenerator('random-seed', num_test, {'seed_limit':25}) - modifier.query_args = self.get_seed_test_args - #start the counter to make sure there are the right amount of jobs - counter = 0 - seed_old = 12.1 - #begin the test if number of jobs is correct - for job in modifier: - counter += 1 - _,_,seed_new = job - self.assertNotEqual(seed_old, seed_new) - seed_old = seed_new - self.assertEqual(counter,num_test) + seed_limit = 25 + modifier = modify_generator.ModifyGenerator(modify_type='random-seed', + seed_limit=seed_limit, + max_jobs=num_test) + modifier._set_query_results(self.get_seed_test_args()) + #begin the test if number of generated seed jobs is correct + self.assertEqual(len(list(modifier._modify_random_seed(seed_limit, next(modifier.query_results)))), seed_limit-2) + #reinitialize to test if total job count is right + modifier = modify_generator.ModifyGenerator(modify_type = 'random-seed', + max_jobs = num_test, + seed_limit = seed_limit) + modifier.query_results = self.get_seed_test_args() + self.assertEqual(modifier.max_jobs, num_test) + self.assertEqual(len(list(modifier)), modifier.max_jobs) def test_query_random_seeds_set_size(self): args = {'seed_limit':25, 'submitter':'byu', 'pipeline_id':None} + seed_limit = 25 query_results = modify_generator.query_on_seeds(args['pipeline_id'], args['seed_limit'], args['submitter']) #test 10 query results for i in range(10): query = next(query_results) - self.assertTrue(len(query['tested_seeds']) < seed_limit) - + self.assertTrue(len(query['tested_seeds']) < seed_limit) + def test_d3m_interface_init(self): init_fail = False @@ -40,10 +44,10 @@ def test_d3m_interface_init(self): d3m_db = D3MMtLDB() except: init_fail = True - self.assertFalse(init, "D3M Interface Failed") + self.assertFalse(init_fail, "D3M Interface Failed") - def get_seed_test_args(self,args): + def get_seed_test_args(self): """ returns args for testing modify generator random-seed functionality purposes. It uses a dataset and pipeline that is saved in the d3m-experimenter @@ -62,7 +66,7 @@ def get_seed_test_args(self,args): 'tested_seeds': used_seeds, 'data_prep_pipeline': data_prep_pipeline, 'data_prep_seed': data_prep_seed, 'data_params': None, 'scoring_pipeline': scoring_pipeline, 'scoring_seed': scoring_seed, - 'scoring_params': None} + 'scoring_params': None} if __name__ == '__main__': From 3714ac190f1157c337322444ae779bade9fd059f Mon Sep 17 00:00:00 2001 From: Benson Manner Date: Mon, 29 Mar 2021 15:25:34 -0600 Subject: [PATCH 44/44] Add runtime and old execute pipeline files --- experimenter/execute_pipeline_old.py | 218 +++++++++++++++++++++++++++ experimenter/runtime.py | 96 ++++++++++++ 2 files changed, 314 insertions(+) create mode 100644 experimenter/execute_pipeline_old.py create mode 100644 experimenter/runtime.py diff --git a/experimenter/execute_pipeline_old.py b/experimenter/execute_pipeline_old.py new file mode 100644 index 0000000..c1bb34b --- /dev/null +++ b/experimenter/execute_pipeline_old.py @@ -0,0 +1,218 @@ +""" +FILE INFORMATION: +This file needs to be a stand alone file so that it can be imported and used by the +experimenter_driver.py. This is because RQ only accepts a function that is imported +and not defined in __main__. 
These functions are what is needed to execute a pipeline +on a problem and can be used by an individual machine, or used in a RQ job queue. +""" +import logging +from typing import List + +from d3m.metadata.pipeline import Pipeline + +from experimenter.run_fit_pipeline import RunFitPipeline +from experimenter.run_pipeline import RunPipeline +from experimenter.databases.aml_mtl import PipelineDB +from experimenter.databases.d3m_mtl import D3MMtLDB +from experimenter.problem import ProblemReference +from experimenter.config import SAVE_TO_D3M +from experimenter.constants import METRICS_BY_PROBLEM_TYPE + + +logger = logging.getLogger(__name__) + + +def execute_pipeline_on_problem( + pipe: Pipeline, + problem: ProblemReference, + volumes_dir: str, + all_metrics: bool = True, +): + """ + The main function to execute a pipeline. Called in `experimenter_driver.py`. + This function will check if the pipeline and dataset has been executed before, + run the pipeline, and record the results. + + :param pipe: the pipeline object that will be executed + :param problem: a reference to the problem to run the pipeline on. + :param volumes_dir: a string containing the path to the volumes directory + :param all_metrics: if `True`, the pipeline will be scored against all metrics + registered for `problem`'s problem type. If `False`, it will only be scored + against the metrics listed in `problem`'s description. + """ + # Validate args + if all_metrics and problem.problem_type not in METRICS_BY_PROBLEM_TYPE: + raise ValueError( + f"cannot compute all metrics for problem {problem.name}, " + "it does not have a supported problem type." + ) + + # If the experimenter is configured to save documents to the D3M database, + # we only want to execute and save this pipeline run if it doesn't already + # exist in the D3M database. + if SAVE_TO_D3M and D3MMtLDB().has_pipeline_been_run_on_problem(pipe, problem): + logger.info("Pipeline has already been run on this dataset, SKIPPING.") + return + + metric_names = ( + METRICS_BY_PROBLEM_TYPE[problem.problem_type] if all_metrics else None + ) + + # Attempt to run the pipeline + logger.info("\n Running pipeline on problem {}".format(problem.name)) + run_pipeline = RunPipeline(volumes_dir, problem) + try: + scores, (fit_result, produce_result) = run_pipeline.run( + pipeline=pipe, metric_names=metric_names + ) + except Exception as e: + logger.exception("pipeline was not successfully run") + print_pipeline(pipe.to_json_structure()) + raise e + + score = scores[0] + # put in the fit pipeline run + handle_successful_pipeline_run( + fit_result.pipeline_run.to_json_structure(), pipe, score + ) + # put in the produce pipeline run + handle_successful_pipeline_run( + produce_result.pipeline_run.to_json_structure(), pipe, score + ) + + +def execute_metafeatures_pipeline_on_problem( + pipe: Pipeline, problem: ProblemReference, volumes_dir: str +): + """ + The main function to execute a `metafeatures` pipeline. Differs from + `execute_pipeline_on_problem` by only handling metafeatures, and by + computing them on every subset of the problem e.g. TRAIN, TEST, SCORE, etc. + Called in `experimenter_driver.py`. This function will run the pipeline, + and record the results. + + :param pipe: the pipeline object that will be executed + :param problem: a reference to the problem to run the pipeline on. 
+    :param volumes_dir: a string containing the path to the volumes directory
+    """
+    mongo_db = PipelineDB()
+
+    for subset in problem.valid_subsets:
+        if problem.has_subset(subset):
+            problem.subset = subset
+            logger.info(
+                f"computing metafeatures for problem {problem.name} ({problem.subset} subset)..."
+            )
+            # Compute and store the metafeatures for this subset of the problem.
+            run_pipeline = RunFitPipeline(volumes_dir, problem)
+            try:
+                results = run_pipeline.run(pipeline=pipe)
+            except Exception as e:
+                logger.exception("pipeline was not successfully run")
+                print_pipeline(pipe._to_json_structure())
+                raise e
+
+            logger.info(results)
+            fit_result = results
+            mongo_db.add_to_metafeatures(fit_result._to_json_structure())
+
+
+def handle_successful_pipeline_run(
+    pipeline_run: dict, pipeline: Pipeline, score: float
+):
+    """
+    Called after a successful pipeline run. It will output the results to the console
+    and write them to the database.
+
+    :param pipeline_run: the pipeline run object that will be recorded
+    :param pipeline: the pipeline that was run
+    :param score: the results from the execution of the pipeline
+    """
+    if score["value"][0] == 0:
+        # F-SCORE was calculated wrong - quit and don't keep this run
+        return
+
+    print_pipeline(pipeline.to_json_structure(), score)
+    d3m_db = D3MMtLDB()
+
+    if not d3m_db.does_pipeline_exist_in_db(pipeline):
+        pipeline_save_response = d3m_db.save_pipeline(pipeline, save_primitives=True)
+        if pipeline_save_response.ok:
+            logger.info(
+                f"pipeline {pipeline.get_digest()} "
+                f"saved successfully, response: {pipeline_save_response.json()}"
+            )
+
+    pipeline_run_save_response = d3m_db.save_pipeline_run(pipeline_run)
+    if pipeline_run_save_response.ok:
+        logger.info(
+            f"pipeline run {pipeline_run['id']} "
+            f"saved successfully, response: {pipeline_run_save_response.json()}"
+        )
+
+
+def print_pipeline_and_problem(pipeline: dict, problem: str):
+    """
+    A simple function to print the pipeline and problem, for debugging
+
+    :param pipeline: the pipeline that was executed
+    :param problem: the dataset/problem that was used
+    """
+    logger.info("Pipeline:")
+    logger.info(get_list_vertically(primitive_list_from_pipeline_object(pipeline)))
+    logger.info("on problem {} \n\n".format(problem))
+
+
+def get_primitive_combo_string(pipeline):
+    prim_string = ""
+    for p in pipeline["steps"]:
+        prim_string += p["primitive"]["id"]
+    return prim_string
+
+
+def print_pipeline(pipeline: dict, score: float = None) -> List[str]:
+    """
+    A helper function for printing a successful run
+
+    :param pipeline: the pipeline that we will print
+    :param score: the results of the metric used in training
+    :return primitive_list: a list of all the primitives used in the pipeline
+    """
+    primitive_list = primitive_list_from_pipeline_json(pipeline)
+    logger.info("pipeline:\n")
+    logger.info(get_list_vertically(primitive_list))
+    if score is not None:
+        logger.info("with a {} of {}".format(score["metric"][0], score["value"][0]))
+    return primitive_list
+
+
+def primitive_list_from_pipeline_object(pipeline: Pipeline):
+    """
+    A helper function to return all the primitives used in a pipeline
+
+    :param pipeline: a pipeline object
+    """
+    primitives = []
+    for p in pipeline.steps:
+        primitives.append(p.to_json_structure()["primitive"]["python_path"])
+    return primitives
+
+
+def primitive_list_from_pipeline_json(pipeline_json: dict):
+    """
+    A helper function to return all the primitives used in a pipeline
+
+    :param pipeline_json: a pipeline object in JSON form
+    """
+    primitives = []
+def primitive_list_from_pipeline_json(pipeline_json: dict) -> List[str]:
+    """
+    A helper function to return all the primitives used in a pipeline
+
+    :param pipeline_json: a pipeline in JSON (dict) form
+    :return: a list of primitive python paths
+    """
+    primitives = []
+    for step in pipeline_json["steps"]:
+        primitives.append(step["primitive"]["python_path"])
+    return primitives
+
+
+def get_list_vertically(list_to_use: list, indent: bool = True) -> str:
+    """
+    A helper function to join a list vertically, one item per line.
+    Used when printing debug output.
+    """
+    final_list = ["\t" + item for item in list_to_use] if indent else list_to_use
+    return "\n" + "\n".join(final_list)
diff --git a/experimenter/runtime.py b/experimenter/runtime.py
new file mode 100644
index 0000000..648bf29
--- /dev/null
+++ b/experimenter/runtime.py
@@ -0,0 +1,96 @@
+import json
+import yaml
+import os
+
+from typing import Any, List, Tuple
+
+from d3m import cli as d3m_cli
+from d3m.contrib.pipelines import K_FOLD_TABULAR_SPLIT_PIPELINE_PATH as k_fold_split_path
+
+from experimenter import config, utils, exceptions
+from experimenter.databases.d3m_mtl import D3MMtLDB
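+
+
+# Example (illustrative only): a typical call to evaluate(), with hypothetical
+# local paths. Note that every path argument, including scoring_pipeline, must
+# point to an existing file or the checks below raise InvalidArgumentValueError:
+#
+#     evaluate(
+#         pipeline='path/to/pipeline.json',
+#         problem='path/to/problemDoc.json',
+#         input='path/to/datasetDoc.json',
+#         scoring_pipeline='path/to/scoring_pipeline.yml',
+#     )
+
+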
+def evaluate(pipeline: str = None,
+             problem: str = None,
+             input: str = None,
+             random_seed: int = 0,
+             data_pipeline: str = k_fold_split_path,
+             data_random_seed: int = 0,
+             data_params: dict = None,
+             scoring_pipeline: str = None,
+             scoring_params: dict = None,
+             scoring_random_seed: int = 0):
+    """
+    Evaluate a pipeline on a problem using d3m's runtime CLI.
+
+    Wrapper function that executes the 'evaluate' command of d3m's runtime
+    CLI; arguments mirror the arguments of the CLI. Only handles cases with
+    a data preparation pipeline in the pipeline run.
+
+    Parameters
+    ----------
+    pipeline : path_like str
+        path to a pipeline doc or a pipeline ID
+    problem : path_like str
+        path to a problem doc
+    input : path_like str
+        path to the full input data
+    random_seed : int
+        random seed to use for the pipeline run
+    data_pipeline : path_like str
+        path to the data preparation pipeline
+    data_random_seed : int
+        random seed to use in data preparation
+    data_params : dict
+        parameters for the data preparation pipeline
+    scoring_pipeline : path_like str
+        path to the scoring pipeline
+    scoring_params : dict
+        parameters for the scoring pipeline
+    scoring_random_seed : int
+        random seed to use in scoring
+
+    Returns
+    -------
+    None
+
+    Raises
+    ------
+    InvalidArgumentValueError
+        when a parameter value is invalid
+    """
+    if not os.path.isfile(pipeline):
+        raise exceptions.InvalidArgumentValueError("'pipeline' param not a file path")
+
+    if not os.path.isfile(problem):
+        raise exceptions.InvalidArgumentValueError("'problem' param not a file path")
+
+    if not os.path.isfile(input):
+        raise exceptions.InvalidArgumentValueError("'input' param not a file path")
+
+    if not os.path.isfile(data_pipeline):
+        raise exceptions.InvalidArgumentValueError("'data_pipeline' param not a file path")
+
+    if not os.path.isfile(scoring_pipeline):
+        raise exceptions.InvalidArgumentValueError("'scoring_pipeline' param not a file path")
+
+    output_run = utils.get_pipeline_run_output_path(pipeline, input, random_seed)
+    # build the runtime arguments for the d3m cli
+    args = ['d3m', 'runtime', '--random-seed', str(random_seed), 'evaluate',
+            '--pipeline', pipeline, '--problem', problem, '--input', input,
+            '--output-run', output_run, '--data-pipeline', data_pipeline,
+            '--data-random-seed', str(data_random_seed),
+            '--scoring-pipeline', scoring_pipeline,
+            '--scoring-random-seed', str(scoring_random_seed)]
+    # add the data preparation parameters to the cli arguments
+    if data_params is not None:
+        for name, value in data_params.items():
+            args.extend(('--data-param', name, value))
+    # add the scoring parameters to the cli arguments
+    if scoring_params is not None:
+        for name, value in scoring_params.items():
+            args.extend(('--scoring-param', name, value))
+    d3m_cli.main(args)
+    # save the pipeline runs to D3M if the SAVE_TO_D3M environment variable is set
+    responses = D3MMtLDB().save_pipeline_runs_from_path(output_run)
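+
+
+# Example (illustrative only): if a caller needs to confirm that each run was
+# persisted, and assuming save_pipeline_runs_from_path returns requests-style
+# response objects (as the .ok checks elsewhere in this patch suggest), the
+# results could be inspected like:
+#
+#     for response in responses:
+#         if not response.ok:
+#             print('failed to save pipeline run:', response.text)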