From e04882b06e2d728935c5f99862b5f9ed52a14941 Mon Sep 17 00:00:00 2001
From: Python3pkg
Date: Wed, 17 May 2017 23:08:10 -0700
Subject: [PATCH] Convert to python3

---
 bigjob/__init__.py | 6 +-
 bigjob/__init__.py.bak | 95 +
 bigjob/bigjob_agent.py | 64 +-
 bigjob/bigjob_agent.py.bak | 997 +++++++
 bigjob/bigjob_agent_condor.py | 34 +-
 bigjob/bigjob_agent_condor.py.bak | 564 ++++
 bigjob/bigjob_manager.py | 20 +-
 bigjob/bigjob_manager.py.bak | 1190 +++++++++
 bigjob/job_plugin/ec2ssh.py | 8 +-
 bigjob/job_plugin/ec2ssh.py.bak | 245 ++
 bigjob/job_plugin/gcessh.py | 12 +-
 bigjob/job_plugin/gcessh.py.bak | 237 ++
 bigjob/job_plugin/slurmssh.py | 4 +-
 bigjob/job_plugin/slurmssh.py.bak | 200 ++
 bigjob_dynamic/many_job.py | 27 +-
 bigjob_dynamic/many_job.py.bak | 436 +++
 bigjob_dynamic/many_job_affinity.py | 2 +-
 bigjob_dynamic/many_job_affinity.py.bak | 53 +
 bootstrap/bigjob-bootstrap.py | 46 +-
 bootstrap/bigjob-bootstrap.py.bak | 2350 +++++++++++++++++
 cli/pilot_cli.py | 46 +-
 cli/pilot_cli.py.bak | 275 ++
 coordination/bigjob_coordination_advert.py | 2 +-
 .../bigjob_coordination_advert.py.bak | 278 ++
 coordination/bigjob_coordination_redis.py | 2 +-
 coordination/bigjob_coordination_redis.py.bak | 232 ++
 coordination/bigjob_coordination_zmq.py | 10 +-
 coordination/bigjob_coordination_zmq.py.bak | 460 ++++
 docs/source/conf.py | 16 +-
 docs/source/conf.py.bak | 253 ++
 docs/source/tutorial/rst-tables.py | 8 +-
 docs/source/tutorial/rst-tables.py.bak | 129 +
 examples/datatransfer/simple_datatransfer.py | 20 +-
 .../datatransfer/simple_datatransfer.py.bak | 118 +
 examples/example_styleguide.py | 6 +-
 examples/example_styleguide.py.bak | 62 +
 .../pilot-api/example-pilot-api-decentral.py | 4 +-
 .../example-pilot-api-decentral.py.bak | 57 +
 examples/pilot-api/example-pilot-api.py | 16 +-
 examples/pilot-api/example-pilot-api.py.bak | 65 +
 .../example-pilot-compute-data-cloud.py | 2 +-
 .../example-pilot-compute-data-cloud.py.bak | 138 +
 .../pilot-api/example-pilot-compute-direct.py | 2 +-
 .../example-pilot-compute-direct.py.bak | 46 +
 .../pilot-api/example-pilot-data-reconnect.py | 2 +-
 .../example-pilot-data-reconnect.py.bak | 45 +
 examples/pilot-api/example-pilot-data.py | 8 +-
 examples/pilot-api/example-pilot-data.py.bak | 85 +
 .../local_chained_ensembles.py | 10 +-
 .../local_chained_ensembles.py.bak | 93 +
 .../local_coupled_ensembles.py | 16 +-
 .../local_coupled_ensembles.py.bak | 113 +
 .../barebones-local/local_mandelbrot.py | 4 +-
 .../barebones-local/local_mandelbrot.py.bak | 79 +
 .../barebones-local/local_simple_ensembles.py | 8 +-
 .../local_simple_ensembles.py.bak | 78 +
 examples/tutorial/chained_ensembles.py | 10 +-
 examples/tutorial/chained_ensembles.py.bak | 130 +
 examples/tutorial/coupled_ensembles.py | 16 +-
 examples/tutorial/coupled_ensembles.py.bak | 153 ++
 examples/tutorial/simple_ensemble_dataxfer.py | 10 +-
 .../tutorial/simple_ensemble_dataxfer.py.bak | 119 +
 examples/tutorial/simple_ensembles.py | 8 +-
 examples/tutorial/simple_ensembles.py.bak | 106 +
 .../xsede2013/01_bigjob-simple-ensemble.py | 8 +-
 .../01_bigjob-simple-ensemble.py.bak | 83 +
 .../02_bigjob-simple-ensemble-datatransfer.py | 10 +-
 ...bigjob-simple-ensemble-datatransfer.py.bak | 91 +
 .../xsede2013/03_bigjob_chained_ensemble.py | 14 +-
 .../03_bigjob_chained_ensemble.py.bak | 107 +
 .../xsede2013/04_bigjob_coupled_ensembles.py | 16 +-
 .../04_bigjob_coupled_ensembles.py.bak | 128 +
 examples/xsede2013/05_bigjob_mandelbrot.py | 14 +-
 .../xsede2013/05_bigjob_mandelbrot.py.bak | 130 +
 .../CDS-01_bigjob-simple-ensemble.py | 8 +-
 .../CDS-01_bigjob-simple-ensemble.py.bak | 81 +
 ...-02_bigjob-simple-ensemble-datatransfer.py | 10 +-
 ...bigjob-simple-ensemble-datatransfer.py.bak | 94 +
 examples/xsede2013/mandelbrot.py | 2 +-
 examples/xsede2013/mandelbrot.py.bak | 95 +
 ez_setup.py | 2 +-
 ez_setup.py.bak | 361 +++
 pilot/api/api.py | 4 +-
 pilot/api/api.py.bak | 29 +
 pilot/coordination/advert_adaptor.py | 6 +-
 pilot/coordination/advert_adaptor.py.bak | 287 ++
 pilot/coordination/nocoord_adaptor.py | 6 +-
 pilot/coordination/nocoord_adaptor.py.bak | 273 ++
 pilot/coordination/redis_adaptor.py | 4 +-
 pilot/coordination/redis_adaptor.py.bak | 301 +++
 pilot/filemanagement/globusonline_adaptor.py | 28 +-
 .../globusonline_adaptor.py.bak | 342 +++
 pilot/filemanagement/gs_adaptor.py | 12 +-
 pilot/filemanagement/gs_adaptor.py.bak | 220 ++
 pilot/filemanagement/irods_adaptor.py | 16 +-
 pilot/filemanagement/irods_adaptor.py.bak | 244 ++
 pilot/filemanagement/s3_adaptor.py | 14 +-
 pilot/filemanagement/s3_adaptor.py.bak | 343 +++
 pilot/filemanagement/ssh_adaptor.py | 28 +-
 pilot/filemanagement/ssh_adaptor.py.bak | 366 +++
 pilot/filemanagement/webhdfs_adaptor.py | 6 +-
 pilot/filemanagement/webhdfs_adaptor.py.bak | 118 +
 pilot/impl/pilot_manager.py | 24 +-
 pilot/impl/pilot_manager.py.bak | 389 +++
 pilot/impl/pilot_manager_decentral.py | 24 +-
 pilot/impl/pilot_manager_decentral.py.bak | 377 +++
 pilot/impl/pilotcompute_manager.py | 36 +-
 pilot/impl/pilotcompute_manager.py.bak | 495 ++++
 pilot/impl/pilotdata_manager.py | 24 +-
 pilot/impl/pilotdata_manager.py.bak | 918 +++++++
 .../data_compute_affinity_scheduler.py | 10 +-
 .../data_compute_affinity_scheduler.py.bak | 92 +
 setup.py | 10 +-
 setup.py.bak | 96 +
 tests/bigjob-api/example_fg_single.py | 6 +-
 tests/bigjob-api/example_fg_single.py.bak | 115 +
 tests/bigjob-api/example_gce_single.py | 6 +-
 tests/bigjob-api/example_gce_single.py.bak | 96 +
 tests/bigjob-api/example_hector_single.py | 6 +-
 tests/bigjob-api/example_hector_single.py.bak | 112 +
 tests/bigjob-api/example_india.py | 4 +-
 tests/bigjob-api/example_india.py.bak | 105 +
 tests/bigjob-api/example_kraken_single.py | 6 +-
 tests/bigjob-api/example_kraken_single.py.bak | 120 +
 tests/bigjob-api/example_local_multiple.py | 6 +-
 .../bigjob-api/example_local_multiple.py.bak | 111 +
 .../example_local_multiple_reconnect.py | 16 +-
 .../example_local_multiple_reconnect.py.bak | 93 +
 tests/bigjob-api/example_local_single.py | 6 +-
 tests/bigjob-api/example_local_single.py.bak | 113 +
 .../example_local_single_filestaging.py | 8 +-
 .../example_local_single_filestaging.py.bak | 125 +
 tests/bigjob-api/example_ls_single.py | 6 +-
 tests/bigjob-api/example_ls_single.py.bak | 110 +
 tests/bigjob-api/example_manyjob_affinity.py | 18 +-
 .../example_manyjob_affinity.py.bak | 114 +
 tests/bigjob-api/example_manyjob_local.py | 22 +-
 tests/bigjob-api/example_manyjob_local.py.bak | 150 ++
 ...example_single_filestaging_globusonline.py | 8 +-
 ...ple_single_filestaging_globusonline.py.bak | 133 +
 tests/condor/example_condor_single.py | 6 +-
 tests/condor/example_condor_single.py.bak | 120 +
 tests/condor/example_condorg_single.py | 6 +-
 tests/condor/example_condorg_single.py.bak | 117 +
 tests/performance/throughput.py | 2 +-
 tests/performance/throughput.py.bak | 56 +
 tests/test_connection_pooling.py | 6 +-
 tests/test_connection_pooling.py.bak | 59 +
 tests/test_pty_exhaustion.py | 8 +-
 tests/test_pty_exhaustion.py.bak | 130 +
 util/archive.py | 4 +-
 util/archive.py.bak | 97 +
 util/bigjob_usage.py | 8 +-
 util/bigjob_usage.py.bak | 165 ++
 154 files changed, 18254
insertions(+), 471 deletions(-) create mode 100644 bigjob/__init__.py.bak create mode 100644 bigjob/bigjob_agent.py.bak create mode 100644 bigjob/bigjob_agent_condor.py.bak create mode 100644 bigjob/bigjob_manager.py.bak create mode 100644 bigjob/job_plugin/ec2ssh.py.bak create mode 100644 bigjob/job_plugin/gcessh.py.bak create mode 100644 bigjob/job_plugin/slurmssh.py.bak create mode 100644 bigjob_dynamic/many_job.py.bak create mode 100644 bigjob_dynamic/many_job_affinity.py.bak create mode 100644 bootstrap/bigjob-bootstrap.py.bak create mode 100644 cli/pilot_cli.py.bak create mode 100644 coordination/bigjob_coordination_advert.py.bak create mode 100644 coordination/bigjob_coordination_redis.py.bak create mode 100644 coordination/bigjob_coordination_zmq.py.bak create mode 100644 docs/source/conf.py.bak create mode 100644 docs/source/tutorial/rst-tables.py.bak create mode 100644 examples/datatransfer/simple_datatransfer.py.bak create mode 100644 examples/example_styleguide.py.bak create mode 100644 examples/pilot-api/example-pilot-api-decentral.py.bak create mode 100644 examples/pilot-api/example-pilot-api.py.bak create mode 100644 examples/pilot-api/example-pilot-compute-data-cloud.py.bak create mode 100644 examples/pilot-api/example-pilot-compute-direct.py.bak create mode 100644 examples/pilot-api/example-pilot-data-reconnect.py.bak create mode 100644 examples/pilot-api/example-pilot-data.py.bak create mode 100644 examples/tutorial/barebones-local/local_chained_ensembles.py.bak create mode 100644 examples/tutorial/barebones-local/local_coupled_ensembles.py.bak create mode 100644 examples/tutorial/barebones-local/local_mandelbrot.py.bak create mode 100644 examples/tutorial/barebones-local/local_simple_ensembles.py.bak create mode 100644 examples/tutorial/chained_ensembles.py.bak create mode 100644 examples/tutorial/coupled_ensembles.py.bak create mode 100644 examples/tutorial/simple_ensemble_dataxfer.py.bak create mode 100644 examples/tutorial/simple_ensembles.py.bak create mode 100644 examples/xsede2013/01_bigjob-simple-ensemble.py.bak create mode 100644 examples/xsede2013/02_bigjob-simple-ensemble-datatransfer.py.bak create mode 100644 examples/xsede2013/03_bigjob_chained_ensemble.py.bak create mode 100644 examples/xsede2013/04_bigjob_coupled_ensembles.py.bak create mode 100644 examples/xsede2013/05_bigjob_mandelbrot.py.bak create mode 100644 examples/xsede2013/CDS-01_bigjob-simple-ensemble.py.bak create mode 100644 examples/xsede2013/CDS-02_bigjob-simple-ensemble-datatransfer.py.bak create mode 100644 examples/xsede2013/mandelbrot.py.bak create mode 100644 ez_setup.py.bak create mode 100644 pilot/api/api.py.bak create mode 100644 pilot/coordination/advert_adaptor.py.bak create mode 100644 pilot/coordination/nocoord_adaptor.py.bak create mode 100644 pilot/coordination/redis_adaptor.py.bak create mode 100644 pilot/filemanagement/globusonline_adaptor.py.bak create mode 100644 pilot/filemanagement/gs_adaptor.py.bak create mode 100644 pilot/filemanagement/irods_adaptor.py.bak create mode 100644 pilot/filemanagement/s3_adaptor.py.bak create mode 100644 pilot/filemanagement/ssh_adaptor.py.bak create mode 100644 pilot/filemanagement/webhdfs_adaptor.py.bak create mode 100644 pilot/impl/pilot_manager.py.bak create mode 100644 pilot/impl/pilot_manager_decentral.py.bak create mode 100644 pilot/impl/pilotcompute_manager.py.bak create mode 100644 pilot/impl/pilotdata_manager.py.bak create mode 100644 pilot/scheduler/data_compute_affinity_scheduler.py.bak create mode 100644 setup.py.bak create mode 
100644 tests/bigjob-api/example_fg_single.py.bak create mode 100644 tests/bigjob-api/example_gce_single.py.bak create mode 100644 tests/bigjob-api/example_hector_single.py.bak create mode 100644 tests/bigjob-api/example_india.py.bak create mode 100644 tests/bigjob-api/example_kraken_single.py.bak create mode 100644 tests/bigjob-api/example_local_multiple.py.bak create mode 100644 tests/bigjob-api/example_local_multiple_reconnect.py.bak create mode 100644 tests/bigjob-api/example_local_single.py.bak create mode 100644 tests/bigjob-api/example_local_single_filestaging.py.bak create mode 100644 tests/bigjob-api/example_ls_single.py.bak create mode 100644 tests/bigjob-api/example_manyjob_affinity.py.bak create mode 100644 tests/bigjob-api/example_manyjob_local.py.bak create mode 100644 tests/bigjob-api/example_single_filestaging_globusonline.py.bak create mode 100644 tests/condor/example_condor_single.py.bak create mode 100644 tests/condor/example_condorg_single.py.bak create mode 100644 tests/performance/throughput.py.bak create mode 100644 tests/test_connection_pooling.py.bak create mode 100644 tests/test_pty_exhaustion.py.bak create mode 100644 util/archive.py.bak create mode 100644 util/bigjob_usage.py.bak diff --git a/bigjob/__init__.py b/bigjob/__init__.py index 2ccffd7c..a3a03c18 100644 --- a/bigjob/__init__.py +++ b/bigjob/__init__.py @@ -9,7 +9,7 @@ #READ config try: - import ConfigParser + import configparser _CONFIG_FILE="bigjob.conf" _conf_file = os.path.dirname(os.path.abspath( __file__ )) + "/../" + _CONFIG_FILE if not os.path.exists(_conf_file): @@ -18,7 +18,7 @@ #print "using conf file: " + str(_conf_file) - _config = ConfigParser.ConfigParser() + _config = configparser.ConfigParser() _config.read(_conf_file) default_dict = _config.defaults() @@ -41,7 +41,7 @@ else: # 4 = DEBUG + INFO + WARNING + ERROR if BIGJOB_VERBOSE >= 4: - print "set to DEBUG" + print("set to DEBUG") logging_level = logging.DEBUG # 3 = INFO + WARNING + ERROR elif BIGJOB_VERBOSE == 3: diff --git a/bigjob/__init__.py.bak b/bigjob/__init__.py.bak new file mode 100644 index 00000000..2ccffd7c --- /dev/null +++ b/bigjob/__init__.py.bak @@ -0,0 +1,95 @@ +import os +import sys +import logging +import traceback +version = "latest" + +#from pkg_resources import Requirement, resource_filename + +#READ config + +try: + import ConfigParser + _CONFIG_FILE="bigjob.conf" + _conf_file = os.path.dirname(os.path.abspath( __file__ )) + "/../" + _CONFIG_FILE + if not os.path.exists(_conf_file): + _conf_file = os.path.join(sys.prefix, _CONFIG_FILE) + + + #print "using conf file: " + str(_conf_file) + + _config = ConfigParser.ConfigParser() + _config.read(_conf_file) + default_dict = _config.defaults() + + #################################################### + # logging + logging_level = logging.FATAL + BIGJOB_VERBOSE=None + try: + BIGJOB_VERBOSE = int(os.getenv('BIGJOB_VERBOSE')) + #print("BIGJOB_VERBOSE: %d"%BIGJOB_VERBOSE) + except Exception: + pass + + if BIGJOB_VERBOSE==None: # use logging level defined in config file + #print "Read log level from bigjob.conf" + level = default_dict["logging.level"] + #print("Logging level: %s"%level) + if level.startswith("logging."): + logging_level = eval(level) + else: + # 4 = DEBUG + INFO + WARNING + ERROR + if BIGJOB_VERBOSE >= 4: + print "set to DEBUG" + logging_level = logging.DEBUG + # 3 = INFO + WARNING + ERROR + elif BIGJOB_VERBOSE == 3: + logging_level = logging.INFO + # 2 = WARNING + ERROR + elif BIGJOB_VERBOSE == 2: + logging_level = logging.WARNING + # 1 = ERROR ONLY + elif 
BIGJOB_VERBOSE == 1: + logging_level = logging.ERROR + + + #print("Set logging level: %s"%(logging_level)) + logging.basicConfig(datefmt='%m/%d/%Y %I:%M:%S %p', + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') + logger = logging.getLogger(name='bigjob') + + #logger.basicConfig(datefmt='%m/%d/%Y %I:%M:%S %p', + # format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') + logger.setLevel(logging_level) + + paramiko_logger = logging.getLogger(name="paramiko.transport") + paramiko_logger.setLevel(logging.ERROR) + #logging.basicConfig(level=logging_level) + +except: + print("bigjob.conf could not be read") + exc_type, exc_value, exc_traceback = sys.exc_info() + traceback.print_exc(limit=1, file=sys.stdout) + +import socket +try: + fn = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", 'VERSION') + if not os.path.exists(fn): + fn = os.path.join(sys.prefix, 'VERSION') + #print "Open Version file: " + str(fn) + version = open(fn).read().strip() + logger.info("Loading BigJob version: " + version + " on " + socket.gethostname()) +except IOError: + pass + + + +# define external-facing API +from bigjob.bigjob_manager import bigjob +from bigjob.bigjob_manager import subjob +try: + from bigjob.bigjob_manager import description +except: + pass + diff --git a/bigjob/bigjob_agent.py b/bigjob/bigjob_agent.py index 1512f6d5..6c90f74a 100644 --- a/bigjob/bigjob_agent.py +++ b/bigjob/bigjob_agent.py @@ -12,7 +12,7 @@ import time import pdb import traceback -import ConfigParser +import configparser import types import logging import shutil @@ -83,26 +83,26 @@ def __init__(self, args): if not os.path.exists(conf_file): conf_file = os.path.join(sys.prefix, CONFIG_FILE) logging.debug ("read configfile: " + conf_file) - config = ConfigParser.ConfigParser() + config = configparser.ConfigParser() config.read(conf_file) default_dict = config.defaults() self.CPR=False - if default_dict.has_key("cpr"): + if "cpr" in default_dict: self.CPR = default_dict["cpr"] self.SHELL="/bin/bash" - if default_dict.has_key("shell"): + if "shell" in default_dict: self.SHELL=default_dict["shell"] self.MPIRUN="mpirun" # On TACC resources the default MPICH is # linked under mpirun_rsh - if default_dict.has_key("mpirun"): + if "mpirun" in default_dict: self.MPIRUN=default_dict["mpirun"] - if default_dict.has_key("number_executor_threads"): + if "number_executor_threads" in default_dict: THREAD_POOL_SIZE=int(default_dict["number_executor_threads"]) self.OUTPUT_TAR=False - if default_dict.has_key("create_output_tar"): + if "create_output_tar" in default_dict: self.OUTPUT_TAR=eval(default_dict["create_output_tar"]) logger.debug("Create output tar: %r", self.OUTPUT_TAR) @@ -181,7 +181,7 @@ def __init__(self, args): ############################################################################ # Detect launch method self.LAUNCH_METHOD="ssh" - if default_dict.has_key("launch_method"): + if "launch_method" in default_dict: self.LAUNCH_METHOD=default_dict["launch_method"] self.LAUNCH_METHOD=self.__get_launch_method(self.LAUNCH_METHOD) @@ -194,7 +194,7 @@ def __init__(self, args): ############################################################################## # start background thread for polling new jobs and monitoring current jobs # check whether user requested a certain threadpool size - if self.pilot_description!=None and self.pilot_description.has_key("number_executor_threads"): + if self.pilot_description!=None and "number_executor_threads" in self.pilot_description: 
THREAD_POOL_SIZE=int(self.pilot_description["number_executor_threads"]) logger.debug("Creating executor thread pool of size: %d"%(THREAD_POOL_SIZE)) self.resource_lock=threading.RLock() @@ -306,7 +306,7 @@ def init_pbs(self): # get number of requested slots from pilot description number_of_requested_processes = self.pilot_description["number_of_processes"] - if os.environ.has_key("PBS_NNODES"): + if "PBS_NNODES" in os.environ: # use PBS assigned node count if available number_nodes = os.environ.get("PBS_NNODES") else: @@ -333,7 +333,7 @@ def init_pbs(self): node_dict[i] = num_cpus self.freenodes=[] - for i in node_dict.keys(): + for i in list(node_dict.keys()): logger.debug("host: " + i + " nodes: " + str(node_dict[i])) for j in range(0, node_dict[i]): logger.debug("add host: " + i.strip()) @@ -362,22 +362,22 @@ def execute_job(self, job_url, job_dict): logger.debug("Start job id %s specification %s: "%(job_id, str(job_dict))) numberofprocesses = "1" try: - if (job_dict.has_key("NumberOfProcesses") == True): + if (("NumberOfProcesses" in job_dict) == True): numberofprocesses = job_dict["NumberOfProcesses"] except: pass # ignore in particular if Bliss is used spmdvariation="single" try: - if (job_dict.has_key("SPMDVariation") == True): + if (("SPMDVariation" in job_dict) == True): spmdvariation = job_dict["SPMDVariation"] except: pass # ignore in particular if Bliss is used arguments = "" - if (job_dict.has_key("Arguments") == True): + if (("Arguments" in job_dict) == True): arguments_raw = job_dict['Arguments']; - if type(arguments_raw) == types.ListType: + if type(arguments_raw) == list: arguments_list = arguments_raw else: arguments_list = eval(job_dict["Arguments"]) @@ -387,9 +387,9 @@ def execute_job(self, job_url, job_dict): environment = os.environ envi = "" self.number_subjobs=1 - if (job_dict.has_key("Environment") == True): + if (("Environment" in job_dict) == True): env_raw = job_dict['Environment'] - if type(env_raw) == types.ListType: + if type(env_raw) == list: env_list = env_raw else: env_list = eval(job_dict["Environment"]) @@ -411,7 +411,7 @@ def execute_job(self, job_url, job_dict): executable = self.__expand_directory(executable) workingdirectory = os.path.join(os.getcwd(), job_id) - if (job_dict.has_key("WorkingDirectory") == True): + if (("WorkingDirectory" in job_dict) == True): workingdirectory = job_dict["WorkingDirectory"] workingdirectory = self.__expand_directory(workingdirectory) try: @@ -421,13 +421,13 @@ def execute_job(self, job_url, job_dict): logging.debug("Sub-Job: %s, Working_directory: %s"%(job_id, workingdirectory)) output="stdout" - if (job_dict.has_key("Output") == True): + if (("Output" in job_dict) == True): output = job_dict["Output"] if not os.path.isabs(output): output=os.path.join(workingdirectory, output) error=os.path.join(workingdirectory,"stderr") - if (job_dict.has_key("Error") == True): + if (("Error" in job_dict) == True): error = job_dict["Error"] if not os.path.isabs(error): error=os.path.join(workingdirectory, error) @@ -457,14 +457,14 @@ def execute_job(self, job_url, job_dict): ####################################################################################################### # File Stage-In of dependent data units - if job_dict.has_key("InputData"): + if "InputData" in job_dict: self.coordination.set_job_state(job_url, str(bigjob.state.Staging)) self.__stage_in_data_units(eval(job_dict["InputData"]), workingdirectory) # File Stage-In - Move pilot-level files to working directory of sub-job if self.pilot_description!=None: try: - 
if self.pilot_description.has_key("description"): + if "description" in self.pilot_description: file_list = eval(self.pilot_description["description"]) if file_list != None and len(file_list)>0: logger.debug("Copy %d files to SJ work dir"%len(file_list)>0) @@ -472,7 +472,7 @@ def execute_job(self, job_url, job_dict): logger.debug("Process file: %s"%i) if i.find(">")>0: base_filename = os.path.basename(i[:i.index(">")].strip()) - if environment.has_key("_CONDOR_SCRATCH_DIR"): + if "_CONDOR_SCRATCH_DIR" in environment: source_filename = os.path.join(environment["_CONDOR_SCRATCH_DIR"], base_filename) else: source_filename = os.path.join(self.work_dir, base_filename) @@ -598,7 +598,7 @@ def setup_charmpp_nodefile(self, allocated_nodes): nodefile_string="" for i in allocated_nodes: - if i.has_key("private_hostname"): + if "private_hostname" in i: nodefile_string=nodefile_string + "host "+ i["private_hostname"] + " ++cpus " + str(i["cpu_count"]) + " ++shell ssh\n" else: nodefile_string=nodefile_string + "host "+ i["hostname"] + " ++cpus " + str(i["cpu_count"]) + " ++shell ssh\n" @@ -728,7 +728,7 @@ def monitor_jobs(self): #pdb.set_trace() logger.debug("Monitor jobs - # current jobs: %d"%len(self.jobs)) for i in self.jobs: - if self.processes.has_key(i): # only if job has already been starteds + if i in self.processes: # only if job has already been starteds p = self.processes[i] p_state = p.poll() logger.debug(self.print_job(i) + " state: " + str(p_state) + " return code: " + str(p.returncode)) @@ -738,9 +738,9 @@ def monitor_jobs(self): # Handle stage-out self.update_output_file() # for Condor case job_dict = self.coordination.get_job(i) # for Pilot Data case - if job_dict.has_key("OutputData"): + if "OutputData" in job_dict: workingdirectory = os.path.join(os.getcwd(), job_dict["job-id"]) - if (job_dict.has_key("WorkingDirectory") == True): + if (("WorkingDirectory" in job_dict) == True): workingdirectory = job_dict["WorkingDirectory"] workingdirectory = self.__expand_directory(workingdirectory) self.__stage_out_data_units(eval(job_dict["OutputData"]), workingdirectory) @@ -816,7 +816,7 @@ def is_stopped(self, base_url): except: pass logger.debug("Pilot State: " + str(state)) - if state==None or state.has_key("stopped")==False or state["stopped"]==True: + if state==None or ("stopped" in state)==False or state["stopped"]==True: return True else: return False @@ -863,7 +863,7 @@ def __stage_out_data_units(self, output_data=[], workingdirectory=None): try: for data_unit_dict in output_data: logger.debug("Process: " + str(data_unit_dict)) - for du_url in data_unit_dict.keys(): # go through all dicts (each representing 1 PD) + for du_url in list(data_unit_dict.keys()): # go through all dicts (each representing 1 PD) #pd_url = self.__get_pd_url(du_url) #pilot_data = PilotData(pd_url=pd_url) #du = pilot_data.get_du(du_url) @@ -977,9 +977,9 @@ def __get_launch_method(self, requested_method): def __print_traceback(self): exc_type, exc_value, exc_traceback = sys.exc_info() - print "*** print_tb:" + print("*** print_tb:") traceback.print_tb(exc_traceback, limit=1, file=sys.stderr) - print "*** print_exception:" + print("*** print_exception:") traceback.print_exception(exc_type, exc_value, exc_traceback, limit=2, file=sys.stderr) @@ -990,7 +990,7 @@ def __print_traceback(self): args = sys.argv num_args = len(args) if (num_args<3): - print "Usage: \n " + args[0] + " " + print("Usage: \n " + args[0] + " ") sys.exit(1) bigjob_agent = bigjob_agent(args) diff --git a/bigjob/bigjob_agent.py.bak 
b/bigjob/bigjob_agent.py.bak new file mode 100644 index 00000000..1512f6d5 --- /dev/null +++ b/bigjob/bigjob_agent.py.bak @@ -0,0 +1,997 @@ +#!/usr/bin/env python + +"""bigjob_agent: bigjob agent that is executed on the resource +""" + + +import sys +import os +import bigjob.state +import socket +import threading +import time +import pdb +import traceback +import ConfigParser +import types +import logging +import shutil +import fnmatch +import hostlist +from string import Template + +logging.basicConfig(level=logging.DEBUG) + +# Optional Imports +try: + import ast +except: + logging.debug("Python version <2.6. AST coult not be imported. ") + +try: + import saga +except: + logging.debug("SAGA not imported. ") + +logging.debug(str(sys.path)) +from threadpool import * + +# BigJob/Pilot framework classes +from bigjob import logger + +try: + from pilot.impl.pilotdata_manager import PilotData, DataUnit, PilotDataService +except: + logger.warning("Pilot Data classes could not be loaded. File movement will not work!") + +logger.debug("Python Version: " + str(sys.version_info)) +if sys.version_info < (2, 5): + sys.stderr.write("Python 2.4 - Warning: Not all functionalities working\n") +if sys.version_info < (2, 4): + sys.stderr.write("Warning: Using unsupported Python version\n") +if sys.version_info < (2, 3): + sys.stderr.write("Warning: Python versions <2.3 not supported\n") + sys.exit(-1) + +import subprocess + +""" Config parameters (will move to config file in future) """ +CONFIG_FILE="bigjob_agent.conf" +THREAD_POOL_SIZE=4 +APPLICATION_NAME="bigjob" + +class bigjob_agent: + + """BigJob Agent: + - reads new job information from communication and coordination subsystem (Redis) + - starts new jobs + - monitors running jobs """ + + """Constructor""" + def __init__(self, args): + + self.coordination_url = args[1] + # objects to store running jobs and processes + self.jobs = [] + self.processes = {} + self.freenodes = [] + self.busynodes = [] + self.restarted = {} + + # read config file + conf_file = os.path.dirname(os.path.abspath( __file__ )) + "/../" + CONFIG_FILE + if not os.path.exists(conf_file): + conf_file = os.path.join(sys.prefix, CONFIG_FILE) + logging.debug ("read configfile: " + conf_file) + config = ConfigParser.ConfigParser() + config.read(conf_file) + default_dict = config.defaults() + self.CPR=False + if default_dict.has_key("cpr"): + self.CPR = default_dict["cpr"] + self.SHELL="/bin/bash" + if default_dict.has_key("shell"): + self.SHELL=default_dict["shell"] + self.MPIRUN="mpirun" + # On TACC resources the default MPICH is + # linked under mpirun_rsh + if default_dict.has_key("mpirun"): + self.MPIRUN=default_dict["mpirun"] + + if default_dict.has_key("number_executor_threads"): + THREAD_POOL_SIZE=int(default_dict["number_executor_threads"]) + + self.OUTPUT_TAR=False + if default_dict.has_key("create_output_tar"): + self.OUTPUT_TAR=eval(default_dict["create_output_tar"]) + logger.debug("Create output tar: %r", self.OUTPUT_TAR) + + self.failed_polls = 0 + + ############################################################################## + # initialization of coordination and communication subsystem + # Redis initialization + self.base_url = args[2] + self.cds_queue_url = None + if len(args)==4: + self.cds_queue_url = args[3] + logger.debug("External queue: " + str(self.cds_queue_url)) + self.id = self.__get_bj_id(self.base_url) + logger.debug("BigJob Agent arguments: " + str(args)) + logger.debug("Initialize C&C subsystem to pilot-url: " + self.base_url) + logger.debug("BigJob ID: 
%s"%self.id) + + # create bj directory + self.work_dir = os.getcwd() + if self.work_dir.find(self.id)==-1: # working directory already contains BJ id + self.bj_dir = os.path.join(os.getcwd(), self.id) + logger.debug("Agent working directory: %s"%self.bj_dir) + try: + os.makedirs(self.bj_dir) + except: + logger.debug("Directory already exists.") + else: + self.bj_dir = os.getcwd() + + os.chdir(self.bj_dir) + + if(self.coordination_url.startswith("advert://") or self.coordination_url.startswith("sqlasyncadvert://")): + try: + from coordination.bigjob_coordination_advert import bigjob_coordination + logging.debug("Utilizing ADVERT Backend: " + self.coordination_url) + except: + logger.error("Advert Backend could not be loaded") + exc_type, exc_value, exc_traceback = sys.exc_info() + traceback.print_exc(file=sys.stderr) + traceback.print_tb(exc_traceback, file=sys.stderr) + elif (self.coordination_url.startswith("redis://")): + try: + from coordination.bigjob_coordination_redis import bigjob_coordination + logger.debug("Utilizing Redis Backend: " + self.coordination_url + ".") + except: + logger.error("Error loading pyredis. Check configuration in bigjob_coordination_redis.py.") + elif (self.coordination_url.startswith("tcp://")): + try: + from coordination.bigjob_coordination_zmq import bigjob_coordination + logger.debug("Utilizing ZMQ Backend") + except: + logger.error("ZMQ Backend not found. Please install ZeroMQ (http://www.zeromq.org/intro:get-the-software) and " + +"PYZMQ (http://zeromq.github.com/pyzmq/)") + + ### + # Initiate coordination sub-system of both BJ agent and Pilot Data + self.coordination = bigjob_coordination(server_connect_url=self.coordination_url) + try: + # initialize coordination subsystem of pilot data + self.pilot_data_service = PilotDataService(coordination_url=self.coordination_url) + except: + logger.warn("Pilot-Data could not be initialized.") + + # update state of pilot job to running + logger.debug("set state to : " + str(bigjob.state.Running)) + self.coordination.set_pilot_state(self.base_url, str(bigjob.state.Running), False) + self.pilot_description = self.coordination.get_pilot_description(self.base_url) + try: + self.pilot_description = ast.literal_eval(self.pilot_description) + except: + logger.warn("Unable to parse pilot description") + self.pilot_description = None + + + ############################################################################ + # Detect launch method + self.LAUNCH_METHOD="ssh" + if default_dict.has_key("launch_method"): + self.LAUNCH_METHOD=default_dict["launch_method"] + + self.LAUNCH_METHOD=self.__get_launch_method(self.LAUNCH_METHOD) + + logging.debug("Launch Method: " + self.LAUNCH_METHOD + " mpi: " + self.MPIRUN + " shell: " + self.SHELL) + + # init rms (SGE/PBS) + self.init_rms() + + ############################################################################## + # start background thread for polling new jobs and monitoring current jobs + # check whether user requested a certain threadpool size + if self.pilot_description!=None and self.pilot_description.has_key("number_executor_threads"): + THREAD_POOL_SIZE=int(self.pilot_description["number_executor_threads"]) + logger.debug("Creating executor thread pool of size: %d"%(THREAD_POOL_SIZE)) + self.resource_lock=threading.RLock() + self.threadpool = ThreadPool(THREAD_POOL_SIZE) + + self.launcher_thread=threading.Thread(target=self.dequeue_new_jobs) + self.launcher_thread.start() + + self.monitoring_thread=threading.Thread(target=self.start_background_thread) + 
self.monitoring_thread.start() + + + def __get_bj_id(self, url): + logger.debug("parsing ID out of URL: %s"%url) + start = url.index("bj-") + end = url.index(":", start) + bj_id = url[start:end] + return bj_id + + + def init_rms(self): + if(os.environ.get("GLIDEIN_CPUS")!=None): + return self.init_condor_glidein() + elif(os.environ.get("PBS_NODEFILE")!=None): + return self.init_pbs() + elif(os.environ.get("PE_HOSTFILE")!=None): + return self.init_sge() + elif(os.environ.get("SLURM_NODELIST")!=None): + return self.init_slurm() + else: + return self.init_local() + return None + + + def init_slurm(self): + logger.debug("Init nodefile from SLURM_NODELIST") + hosts = os.environ.get("SLURM_NODELIST") + if hosts == None: + self.init_local() + return + + hosts=hostlist.expand_hostlist(hosts) + number_cpus_per_node = 1 + if os.environ.get("SLURM_CPUS_ON_NODE")!=None: + number_cpus_per_node=int(os.environ.get("SLURM_CPUS_ON_NODE")) + for h in hosts: + for i in range(0, number_cpus_per_node): + self.freenodes.append((h + "\n")) + return self.freenodes + + + + def init_condor_glidein(self): + logger.debug("Init nodefile from Condor GlideIn environment") + num_cpus = 1 + try: + if(os.environ.get("GLIDEIN_CPUS")!=None): + num_cpus = int(os.environ.get("GLIDEIN_CPUS")) + except: + pass + for i in range(0, num_cpus): + self.freenodes.append("localhost\n") + return self.freenodes + + def init_local(self): + """ initialize free nodes list with dummy (for fork jobs)""" + logger.debug("Init nodefile from /proc/cpuinfo") + try: + num_cpus=1 + if self.pilot_description != None: + num_cpus = int(self.pilot_description["number_of_processes"]) + else: + num_cpus = int(self.get_num_cpus()) + for i in range(0, num_cpus): + self.freenodes.append("localhost\n") + except IOError: + self.freenodes=["localhost\n"] + return self.freenodes + + def init_sge(self): + """ initialize free nodes list from SGE environment """ + logger.debug("Init nodeslist from SGE NODEFILE") + sge_node_file = os.environ.get("PE_HOSTFILE") + if sge_node_file == None: + return + f = open(sge_node_file) + sgenodes = f.readlines() + f.close() + for i in sgenodes: + + columns = i.split() + try: + for j in range(0, int(columns[1])): + logger.debug("add host: " + columns[0].strip()) + self.freenodes.append(columns[0]+"\n") + except: + pass + return self.freenodes + + def init_pbs(self): + """ initialize free nodes list from PBS environment """ + logger.debug("Init nodeslist from PBS NODEFILE") + if self.LAUNCH_METHOD == "aprun": + # Workaround for Kraken and Hector + # PBS_NODEFILE does only contain front node + # thus we create a dummy node file with the respective + # number of slots + # aprun does not rely on the nodefile for job launching + + # get number of requested slots from pilot description + number_of_requested_processes = self.pilot_description["number_of_processes"] + if os.environ.has_key("PBS_NNODES"): + # use PBS assigned node count if available + number_nodes = os.environ.get("PBS_NNODES") + else: + number_nodes = number_of_requested_processes + self.freenodes=[] + for i in range(0, int(number_nodes)): + slot = "slot-%d\n"%i + logger.debug("add slot: " + slot.strip()) + self.freenodes.append(slot) + else: + pbs_node_file = os.environ.get("PBS_NODEFILE") + if pbs_node_file == None: + return + f = open(pbs_node_file) + self.freenodes = f.readlines() + f.close() + + # check whether pbs node file contains the correct number of nodes + num_cpus = self.get_num_cpus() + node_dict={} + for i in set(self.freenodes): + node_dict[i] = 
self.freenodes.count(i) + if node_dict[i] < num_cpus: + node_dict[i] = num_cpus + + self.freenodes=[] + for i in node_dict.keys(): + logger.debug("host: " + i + " nodes: " + str(node_dict[i])) + for j in range(0, node_dict[i]): + logger.debug("add host: " + i.strip()) + self.freenodes.append(i) + + def get_num_cpus(self): + cpuinfo = open("/proc/cpuinfo", "r") + cpus = cpuinfo.readlines() + cpuinfo.close() + num = 0 + for i in cpus: + if i.startswith("processor"): + num = num+1 + return num + + + def execute_job(self, job_url, job_dict): + """ obtain job attributes from c&c and execute process """ + state=str(job_dict["state"]) + + if(state==str(bigjob.state.Unknown) or + state==str(bigjob.state.New)): + try: + #job_dict["state"]=str(saga.job.New) + job_id = job_dict["job-id"] + logger.debug("Start job id %s specification %s: "%(job_id, str(job_dict))) + numberofprocesses = "1" + try: + if (job_dict.has_key("NumberOfProcesses") == True): + numberofprocesses = job_dict["NumberOfProcesses"] + except: + pass # ignore in particular if Bliss is used + + spmdvariation="single" + try: + if (job_dict.has_key("SPMDVariation") == True): + spmdvariation = job_dict["SPMDVariation"] + except: + pass # ignore in particular if Bliss is used + + arguments = "" + if (job_dict.has_key("Arguments") == True): + arguments_raw = job_dict['Arguments']; + if type(arguments_raw) == types.ListType: + arguments_list = arguments_raw + else: + arguments_list = eval(job_dict["Arguments"]) + for i in arguments_list: + arguments = arguments + " " + str(i) + + environment = os.environ + envi = "" + self.number_subjobs=1 + if (job_dict.has_key("Environment") == True): + env_raw = job_dict['Environment'] + if type(env_raw) == types.ListType: + env_list = env_raw + else: + env_list = eval(job_dict["Environment"]) + + logger.debug("Environment: " + str(env_list)) + for i in env_list: + logger.debug("Eval " + i) + # Hack for conduction experiments on Kraken + # Kraken specific support for running n sub-jobs at a time + if i.startswith("NUMBER_SUBJOBS"): + self.number_subjobs=int(i.split("=")[1].strip()) + logger.debug("NUMBER_SUBJOBS: " + str(self.number_subjobs)) + else: + envi_1 = "export " + i +"; " + envi = envi + envi_1 + logger.debug(envi) + + executable = job_dict["Executable"] + executable = self.__expand_directory(executable) + + workingdirectory = os.path.join(os.getcwd(), job_id) + if (job_dict.has_key("WorkingDirectory") == True): + workingdirectory = job_dict["WorkingDirectory"] + workingdirectory = self.__expand_directory(workingdirectory) + try: + os.makedirs(workingdirectory) + except: + logger.debug("Directory %s already exists."%workingdirectory) + logging.debug("Sub-Job: %s, Working_directory: %s"%(job_id, workingdirectory)) + + output="stdout" + if (job_dict.has_key("Output") == True): + output = job_dict["Output"] + if not os.path.isabs(output): + output=os.path.join(workingdirectory, output) + + error=os.path.join(workingdirectory,"stderr") + if (job_dict.has_key("Error") == True): + error = job_dict["Error"] + if not os.path.isabs(error): + error=os.path.join(workingdirectory, error) + + + # append job to job list + self.jobs.append(job_url) + + ####################################################################################################### + # special setup for MPI NAMD jobs + machinefile = self.allocate_nodes(job_dict) + host = "localhost" + try: + machine_file_handler = open(machinefile, "r") + node= machine_file_handler.readlines() + machine_file_handler.close() + host = node[0].strip() + 
except: + pass + + + if(machinefile==None): + logger.debug("Not enough resources to run: " + job_url) + self.coordination.set_job_state(job_url, str(bigjob.state.New)) + self.coordination.queue_job(self.base_url, job_url) + return # job cannot be run at the moment + + ####################################################################################################### + # File Stage-In of dependent data units + if job_dict.has_key("InputData"): + self.coordination.set_job_state(job_url, str(bigjob.state.Staging)) + self.__stage_in_data_units(eval(job_dict["InputData"]), workingdirectory) + + # File Stage-In - Move pilot-level files to working directory of sub-job + if self.pilot_description!=None: + try: + if self.pilot_description.has_key("description"): + file_list = eval(self.pilot_description["description"]) + if file_list != None and len(file_list)>0: + logger.debug("Copy %d files to SJ work dir"%len(file_list)>0) + for i in file_list: + logger.debug("Process file: %s"%i) + if i.find(">")>0: + base_filename = os.path.basename(i[:i.index(">")].strip()) + if environment.has_key("_CONDOR_SCRATCH_DIR"): + source_filename = os.path.join(environment["_CONDOR_SCRATCH_DIR"], base_filename) + else: + source_filename = os.path.join(self.work_dir, base_filename) + target_filename = os.path.join(workingdirectory, base_filename) + try: + logger.debug("Copy: %s to %s"%(source_filename, target_filename)) + shutil.copyfile(source_filename, target_filename) + except: + logger.error("Error copy: %s to %s"%(source_filename, target_filename)) + except: + logger.debug("Moving of stage-in files failed.") + + # create stdout/stderr file descriptors + output_file = os.path.abspath(output) + error_file = os.path.abspath(error) + logger.debug("stdout: " + output_file + " stderr: " + error_file) + stdout = open(output_file, "w") + stderr = open(error_file, "w") + # build execution command + if self.LAUNCH_METHOD=="aprun": + if (spmdvariation.lower()=="mpi"): + command = envi + "aprun -n " + str(numberofprocesses) + " " + executable + " " + arguments + else: + #env_strip = envi.strip() + #env_command = env_strip[:(len(env_strip)-1)] + command = envi + "aprun -n " + str(self.number_subjobs) + " -d " + numberofprocesses + " " + executable + " " + arguments + + # MPMD Mode => all subjobs on Kraken fail because aprun returns 1 as returncode + #command = "aprun" + #for i in range(0, self.number_subjobs): + # command = command + " -d " + numberofprocesses + " " + executable + " " + arguments + # # + " 1 > "+ str(i)+ "-out.txt " + " 2 > "+ str(i)+ "-err.txt" + # if i != self.number_subjobs-1: + # command = command + " : " + elif self.LAUNCH_METHOD=="ibrun" and spmdvariation.lower()=="mpi": + # Non MPI launch is handled via standard SSH + command = envi + "mpirun_rsh -np " +str(numberofprocesses) + " -hostfile " + machinefile + " `build_env.pl` " + executable + " " + arguments + elif (spmdvariation.lower()!="mpi"): + command = envi + executable + " " + arguments + # In particular for Condor - if executable is staged x flag is not set + #command ="chmod +x " + executable +";export PATH=$PATH:" + workingdirectory + ";" +command + else: + # Environment variables need to be handled later! 
+ command = envi + executable + " " + arguments + + # add working directory and ssh command + if self.LAUNCH_METHOD == "aprun" or (self.LAUNCH_METHOD== "ibrun" and spmdvariation.lower()=="mpi"): + command ="cd " + workingdirectory + "; " + command + elif self.LAUNCH_METHOD == "local": + command ="cd " + workingdirectory + "; " + command + else: # ssh launch is default + if (spmdvariation.lower( )=="mpi"): + command = "cd " + workingdirectory + "; " + envi + self.MPIRUN + " -np " + numberofprocesses + " -machinefile " + machinefile + " " + executable + " " + arguments + elif host == "localhost": + command ="cd " + workingdirectory + "; " + command + else: + command ="ssh " + host + " \'cd " + workingdirectory + "; " + command +"\'" + + + # start application process + shell = self.SHELL + logger.debug("execute: " + command + " in " + workingdirectory + " from: " + str(socket.gethostname()) + " (Shell: " + shell +")") + # bash works fine for launching on QB but fails for Abe :-( + p = subprocess.Popen(args=command, executable=shell, stderr=stderr, + stdout=stdout, cwd=workingdirectory, + env=environment, shell=True) + logger.debug("started " + command) + self.processes[job_url] = p + self.coordination.set_job_state(job_url, str(bigjob.state.Running)) + except: + traceback.print_exc(file=sys.stderr) + + + + def allocate_nodes(self, job_dict): + """ allocate nodes + allocated nodes will be written to machinefile advert-launcher-machines- + """ + self.resource_lock.acquire() + number_nodes = int(job_dict["NumberOfProcesses"]) + nodes = [] + machine_file_name = None + if (len(self.freenodes)>=number_nodes): + unique_nodes=set(self.freenodes) + for i in unique_nodes: + number = self.freenodes.count(i) + logger.debug("allocate: " + i + " number nodes: " + str(number) + + " current busy nodes: " + str(self.busynodes) + + " free nodes: " + str(self.freenodes)) + for j in range(0, number): + if(number_nodes > 0): + nodes.append(i) + self.freenodes.remove(i) + self.busynodes.append(i) + number_nodes = number_nodes - 1 + else: + break + + machine_file_name = self.get_machine_file_name(job_dict) + machine_file = open(machine_file_name, "w") + #machine_file.writelines(self.freenodes[:number_nodes]) + machine_file.writelines(nodes) + machine_file.close() + logger.debug("wrote machinefile: " + machine_file_name + " Nodes: " + str(nodes)) + # update node structures + #self.busynodes.extend(self.freenodes[:number_nodes]) + #del(self.freenodes[:number_nodes]) + + self.resource_lock.release() + return machine_file_name + + + + def setup_charmpp_nodefile(self, allocated_nodes): + """ Setup charm++ nodefile to use for executing NAMD + HACK!! 
Method violates layering principle + File $HOME/machinefile in charm++ nodefileformat is written to first node in list + """ + # Nodelist format: + # + # host tp-x001 ++cpus 2 ++shell ssh + # host tp-x002 ++cpus 2 ++shell ssh + + nodefile_string="" + for i in allocated_nodes: + if i.has_key("private_hostname"): + nodefile_string=nodefile_string + "host "+ i["private_hostname"] + " ++cpus " + str(i["cpu_count"]) + " ++shell ssh\n" + else: + nodefile_string=nodefile_string + "host "+ i["hostname"] + " ++cpus " + str(i["cpu_count"]) + " ++shell ssh\n" + + # copy nodefile to rank 0 node + jd = saga.job.description() + jd.executable = "echo" + jd.number_of_processes = "1" + jd.spmd_variation = "single" + # ssh root@tp-x001.ci.uchicago.edu "cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys" + jd.arguments = ["\""+nodefile_string+"\"", ">", "machinefile"] + jd.output = "stdout.txt" + jd.error = "stderr.txt" + + job_service_url = saga.url("ssh://root@"+allocated_nodes[0]["hostname"]) + job_service = saga.job.service(self.session, job_service_url) + job = job_service.create_job(jd) + job.run() + job.wait() + + + def print_machine_file(self, filename): + fh = open(filename, "r") + lines = fh.readlines() + fh.close + logger.debug("Machinefile: " + filename + " Hosts: " + str(lines)) + + + def free_nodes(self, job_url): + job_dict = self.coordination.get_job(job_url) + self.resource_lock.acquire() + number_nodes = int(job_dict["NumberOfProcesses"]) + machine_file_name = self.get_machine_file_name(job_dict) + logger.debug("Machine file: " + machine_file_name) + allocated_nodes = ["localhost\n"] + try: + machine_file = open(machine_file_name, "r") + allocated_nodes = machine_file.readlines() + machine_file.close() + except: + traceback.print_exc(file=sys.stderr) + + logger.debug("Free nodes: " + str(allocated_nodes)) + + for i in allocated_nodes: + logger.debug("free node: " + str(i) + " current busy nodes: " + str(self.busynodes) + + " free nodes: " + str(self.freenodes)) + self.busynodes.remove(i) + self.freenodes.append(i) + logger.debug("Delete " + machine_file_name) + if os.path.exists(machine_file_name): + os.remove(machine_file_name) + self.resource_lock.release() + + + def get_machine_file_name(self, job_dict): + """create machinefile based on jobid""" + job_id = job_dict["job-id"] + homedir = os.path.expanduser('~') + return homedir + "/advert-launcher-machines-"+ job_id + + def dequeue_new_jobs(self): + """Subscribe to new jobs from Redis. 
""" + job_counter = 0 + while self.is_stopped(self.base_url)==False: + if len(self.freenodes)==0: + time.sleep(1) + continue + if self.cds_queue_url!=None: + logger.debug("Dequeue sub-job from PilotCompute queue: " + self.base_url + + " AND ComputeDataService queue: " + self.cds_queue_url) + job_url=self.coordination.dequeue_job(self.base_url, self.cds_queue_url) + else: + logger.debug("Dequeue sub-job from PilotCompute queue: " + self.base_url) + job_url=self.coordination.dequeue_job(self.base_url) + logger.debug("Dequed:%s"%str(job_url)) +# if job_url==None: +# if self.cds_queue_url!=None: +# logger.debug("Dequeue sub-job from ComputeDataServicequeue: " + self.cds_queue_url) +# job_url=self.coordination.dequeue_job(self.cds_queue_url) +# logger.debug("Dequed:%s"%str(job_url)) +# if job_url==None: +# time.sleep(3) +# continue + if job_url=="STOP": + break + + job_counter = job_counter + 1 + if (job_counter % (THREAD_POOL_SIZE))==0: # ensure that threadpool is not too overloaded + self.threadpool.wait() + + request = WorkRequest(self.start_new_job_in_thread, [job_url]) + self.threadpool.putRequest(request) + if self.coordination.get_queue_length(self.cds_queue_url)==0 and self.coordination.get_queue_length(self.base_url)==0: + time.sleep(1) + + # wait for termination of Worker Threads + # self.threadpool.wait() + logger.debug("Terminating Agent - Dequeue Sub-Jobs Thread") + + + + def start_new_job_in_thread(self, job_url): + """evaluates job dir, sanity checks, executes job """ + #pdb.set_trace() + if job_url != None: + failed = False; + try: + logger.debug("Get job description") + job_dict = self.coordination.get_job(job_url) + except: + logger.error("Failed to get job description") + failed=True + + if job_dict==None or job_dict=="" or type(job_dict)!=dict or failed==True: + self.coordination.queue_job(self.pilot_url, job_url) + + logger.debug("start job: " + job_url + " data: " + str(job_dict)) + if(job_dict["state"]==str(bigjob.state.Unknown)): + job_dict["state"]=str(bigjob.state.New) + self.coordination.set_job_state(job_url, str(bigjob.state.New)) + self.execute_job(job_url, job_dict) + #print "Execute: " + str(job_dict) + + def monitor_jobs(self): + """Monitor running processes. """ + #pdb.set_trace() + logger.debug("Monitor jobs - # current jobs: %d"%len(self.jobs)) + for i in self.jobs: + if self.processes.has_key(i): # only if job has already been starteds + p = self.processes[i] + p_state = p.poll() + logger.debug(self.print_job(i) + " state: " + str(p_state) + " return code: " + str(p.returncode)) + if (p_state != None and (p_state==0 or p_state==255)): + logger.debug("** Job successful: " + self.print_job(i) + " - set state to Done **") + ########################################################### + # Handle stage-out + self.update_output_file() # for Condor case + job_dict = self.coordination.get_job(i) # for Pilot Data case + if job_dict.has_key("OutputData"): + workingdirectory = os.path.join(os.getcwd(), job_dict["job-id"]) + if (job_dict.has_key("WorkingDirectory") == True): + workingdirectory = job_dict["WorkingDirectory"] + workingdirectory = self.__expand_directory(workingdirectory) + self.__stage_out_data_units(eval(job_dict["OutputData"]), workingdirectory) + + ########################################################### + # Status update + self.coordination.set_job_state(i, str(bigjob.state.Done)) + self.free_nodes(i) + del self.processes[i] + elif p_state!=0 and p_state!=255 and p_state != None: + logger.debug(self.print_job(i) + " failed. 
") + # do not free nodes => very likely the job will fail on these nodes + # self.free_nodes(i) + #if self.restarted.has_key(i)==False: + # logger.debug("Try to restart job " + self.print_job(i)) + # self.restarted[i]=True + # self.execute_job(i) + #else: + logger.debug("Job failed " + self.print_job(i)) + self.coordination.set_job_state(i, str(bigjob.state.Failed)) + self.free_nodes(i) + del self.processes[i] + + + def update_output_file(self): + if self.OUTPUT_TAR==True: + output_file_name = "output-" + self.id + ".tar.gz" + logger.debug("Update output file: " + output_file_name) + output = subprocess.Popen('tar --exclude=*.brg --exclude=*.bmf --exclude=*tmp* --exclude=*.bif --exclude=*.fa --exclude=*.fastq --exclude=bfast --exclude=output*.tar.gz -czf ' + output_file_name + ' *', + cwd="..", shell=True) + output.wait() + logger.debug("Files: " + str(os.listdir("."))) + else: + logger.debug("Create NO output.tar. Enable output.tar file creation in bigjob_agent.conf") + + + def print_job(self, job_url): + job_dict = self.coordination.get_job(job_url) + return ("Job: " + job_url + " Executable: " + job_dict["Executable"]) + + + def start_background_thread(self): + self.stop=False + logger.debug("##################################### New POLL/MONITOR cycle ##################################") + while True and self.stop==False: + logger.debug("Free nodes: " + str(len(self.freenodes)) + + " Busy Nodes: " + str(len(self.busynodes)) + + " Number of running sub-jobs: " + str(len(self.jobs))) + if self.is_stopped(self.base_url)==True: + logger.debug("Pilot terminated.") + break + else: + logger.debug("Pilot job entry: " + str(self.base_url) + " exists. Pilot job not in state stopped.") + try: + #self.poll_jobs() + self.monitor_jobs() + time.sleep(5) + self.failed_polls=0 + except: + traceback.print_exc(file=sys.stdout) + self.failed_polls=self.failed_polls+1 + if self.failed_polls>3: # after 3 failed attempts exit + break + + logger.debug("Terminating Agent - Background Thread") + + + + def is_stopped(self, base_url): + state = None + try: + state = self.coordination.get_pilot_state(base_url) + except: + pass + logger.debug("Pilot State: " + str(state)) + if state==None or state.has_key("stopped")==False or state["stopped"]==True: + return True + else: + return False + + + def stop_background_thread(self): + self.stop=True + + + ############################################################################# + # Private methods + + def __stage_in_data_units(self, input_data=[], target_directory="."): + """ stage in data units specified in input_data field """ + try: + logger.debug("Stage in input files to: %s"%target_directory) + for i in input_data: + du = DataUnit(du_url=i) + logger.debug("Restored DU... call get state()") + logger.debug("DU State: " + du.get_state()) + du.wait() + logger.debug("Reconnected to DU. Exporting it now...") + du.export(target_directory) + except: + logger.error("Stage-in of files failed.") + self.__print_traceback() + + + def __stage_out_data_units(self, output_data=[], workingdirectory=None): + """ stage out data to a specified data unit pilot data """ + logger.debug("Stage out output files") + + """ Parsing output data field of job description: + { + ... 
+ "output_data": [ + { + output_data_unit.get_url(): + ["stdout.txt", "stderr.txt"] + } + ] + } + """ + try: + for data_unit_dict in output_data: + logger.debug("Process: " + str(data_unit_dict)) + for du_url in data_unit_dict.keys(): # go through all dicts (each representing 1 PD) + #pd_url = self.__get_pd_url(du_url) + #pilot_data = PilotData(pd_url=pd_url) + #du = pilot_data.get_du(du_url) + du = DataUnit(du_url=du_url) + file_list = data_unit_dict[du_url] + logger.debug("Add files: " + str(file_list)) + all_files=[] + for output_file in file_list: + expanded_files = [output_file] + if output_file.find("*")>=0 or output_file.find("?")>=0: + expanded_files = self.__expand_file_pattern(output_file, workingdirectory) + logger.debug("Expanded files: " + str(expanded_files)) + + for f in expanded_files: + all_files.append(os.path.join(workingdirectory, f)) + + du.add_files(all_files) + for f in all_files: + os.remove(f) + except: + logger.error("Stage out of files failed.") + self.__print_traceback() + + + def __expand_file_pattern(self, filename_pattern, workingdirectory): + """ expand files with wildcard * to a list """ + files = os.listdir(workingdirectory) + logger.debug("All files in directory: " + str(files)) + matches = [] + for i in files: + if fnmatch.fnmatch(i, filename_pattern): + matches.append(i) + return matches + + + def __expand_directory(self, directory): + """ expands directory name $HOME or ~ to the working directory + on the respective machine + """ + try: + if directory.startswith("$HOME"): + template = Template(directory) + directory = template.safe_substitute(HOME="~") + + expanded_directory=os.path.expanduser(directory) + logger.debug("Expanded directory: %s to %s"%(directory, expanded_directory)) + return expanded_directory + except: + pass + + return directory + + + def __get_pd_url(self, du_url): + url = du_url[:du_url.index(":du-")] + return url + + def __get_du_id(self, du_url): + du_id = du_url[du_url.index("du-"):] + return du_id + + def __ibrun_available(self): + " TACC resources use ibrun for MPI startup " + logger.debug("IBRUN and SRUN Test") + ibrun_available = False + srun_available = False + try: + ibrun_available = (subprocess.call("ibrun -h", shell=True)==0) + srun_available = (subprocess.call("srun -V", shell=True)==0) + if ibrun_available and srun_available: + return True + except: + pass + return False + + + def __get_launch_method(self, requested_method): + """ returns desired execution method: ssh, aprun """ + + aprun_available = False + try: + aprun_available = (subprocess.call("aprun -n 1 /bin/date", shell=True, stdout=None, stderr=None)==0) + except: + self.__print_traceback() + + ibrun_available = self.__ibrun_available() + + ssh_available = False + try: + ssh_available = (subprocess.call("ssh -o PasswordAuthentication=no -o NumberOfPasswordPrompts=0 localhost /bin/date", shell=True, stdout=None, stderr=None)==0) + except: + pass + + launch_method = "local" + if requested_method=="aprun" and aprun_available == True: + launch_method="aprun" + elif ibrun_available == True: + launch_method="ibrun" + elif requested_method=="ssh" and ssh_available == True: + launch_method="ssh" + # aprun fallback + elif ssh_available==False and aprun_available==True: + launch_method="aprun" + + + logger.debug("aprun: " + str(aprun_available) + " ibrun: " + str(ibrun_available) + + " ssh: " + str(ssh_available) + + " Launch method: " + str(launch_method)) + return launch_method + + + def __print_traceback(self): + exc_type, exc_value, exc_traceback = sys.exc_info() + 
print "*** print_tb:" + traceback.print_tb(exc_traceback, limit=1, file=sys.stderr) + print "*** print_exception:" + traceback.print_exception(exc_type, exc_value, exc_traceback, + limit=2, file=sys.stderr) + +######################################################### +# main # +######################################################### +if __name__ == "__main__" : + args = sys.argv + num_args = len(args) + if (num_args<3): + print "Usage: \n " + args[0] + " " + sys.exit(1) + + bigjob_agent = bigjob_agent(args) + diff --git a/bigjob/bigjob_agent_condor.py b/bigjob/bigjob_agent_condor.py index d2eb5700..5aeab815 100644 --- a/bigjob/bigjob_agent_condor.py +++ b/bigjob/bigjob_agent_condor.py @@ -9,7 +9,7 @@ import time import pdb import traceback -import ConfigParser +import configparser import types import logging logging.basicConfig(level=logging.DEBUG) @@ -62,7 +62,7 @@ def __init__(self, args): # conf_file = os.path.dirname(args[0]) + "/" + CONFIG_FILE # conf_file = os.path.dirname(os.path.abspath( __file__ )) + "/" + CONFIG_FILE conf_file = os.path.dirname(os.path.abspath( __file__ )) + "/../" + CONFIG_FILE - config = ConfigParser.ConfigParser() + config = configparser.ConfigParser() logging.debug ("read configfile: " + conf_file) config.read(conf_file) default_dict = config.defaults() @@ -178,7 +178,7 @@ def init_pbs(self): node_dict[i] = num_cpus self.freenodes=[] - for i in node_dict.keys(): + for i in list(node_dict.keys()): logging.debug("host: " + i + " nodes: " + str(node_dict[i])) for j in range(0, node_dict[i]): logging.debug("add host: " + i.strip()) @@ -213,17 +213,17 @@ def execute_job(self, job_url, job_dict): #job_dict["state"]=str(saga.job.New) logging.debug("Start job: " + str(job_dict)) numberofprocesses = "1" - if (job_dict.has_key("NumberOfProcesses") == True): + if (("NumberOfProcesses" in job_dict) == True): numberofprocesses = job_dict["NumberOfProcesses"] spmdvariation="single" - if (job_dict.has_key("SPMDVariation") == True): + if (("SPMDVariation" in job_dict) == True): spmdvariation = job_dict["SPMDVariation"] arguments = "" - if (job_dict.has_key("Arguments") == True): + if (("Arguments" in job_dict) == True): arguments_raw = job_dict['Arguments']; - if type(arguments_raw) == types.ListType: + if type(arguments_raw) == list: arguments_list = arguments_raw else: arguments_list = eval(job_dict["Arguments"]) @@ -231,26 +231,26 @@ def execute_job(self, job_url, job_dict): arguments = arguments + " " + i workingdirectory = os.getcwd() - if (job_dict.has_key("WorkingDirectory") == True): + if (("WorkingDirectory" in job_dict) == True): workingdirectory = job_dict["WorkingDirectory"] environment = os.environ - if (job_dict.has_key("Environment") == True): + if (("Environment" in job_dict) == True): for i in job_dict["Environment"]: env = i.split("=") environment[env[0]]=env[1] + ":" + environment[env[0]] environment["PATH"]= workingdirectory + ":"+environment["PATH"] - print "environment[PATH]", environment["PATH"] + print("environment[PATH]", environment["PATH"]) executable = job_dict["Executable"] output="stdout" - if (job_dict.has_key("Output") == True): + if (("Output" in job_dict) == True): output = job_dict["Output"] error="stderr" - if (job_dict.has_key("Error") == True): + if (("Error" in job_dict) == True): error = job_dict["Error"] # append job to job list @@ -300,7 +300,7 @@ def execute_job(self, job_url, job_dict): #for condor debugging dirlist = os.listdir(workingdirectory) - print dirlist + print(dirlist) os.system("ls;pwd") self.processes[job_url] = p @@ 
-360,7 +360,7 @@ def setup_charmpp_nodefile(self, allocated_nodes): nodefile_string="" for i in allocated_nodes: - if i.has_key("private_hostname"): + if "private_hostname" in i: nodefile_string=nodefile_string + "host "+ i["private_hostname"] + " ++cpus " + str(i["cpu_count"]) + " ++shell ssh\n" else: nodefile_string=nodefile_string + "host "+ i["hostname"] + " ++cpus " + str(i["cpu_count"]) + " ++shell ssh\n" @@ -480,7 +480,7 @@ def monitor_jobs(self): #pdb.set_trace() logging.debug("Monitor jobs - # current jobs: %d"%len(self.jobs)) for i in self.jobs: - if self.processes.has_key(i): # only if job has already been starteds + if i in self.processes: # only if job has already been starteds p = self.processes[i] p_state = p.poll() logging.debug(self.print_job(i) + " state: " + str(p_state) + " return code: " + str(p.returncode)) @@ -540,7 +540,7 @@ def is_stopped(self, base_url): except: pass logging.debug("Pilot State: " + str(state)) - if state==None or state.has_key("stopped")==False or state["stopped"]==True: + if state==None or ("stopped" in state)==False or state["stopped"]==True: return True else: return False @@ -557,7 +557,7 @@ def stop_background_thread(self): args = sys.argv num_args = len(args) if (num_args!=3): - print "Usage: \n " + args[0] + " " + print("Usage: \n " + args[0] + " ") sys.exit(1) bigjob_agent = bigjob_agent(args) diff --git a/bigjob/bigjob_agent_condor.py.bak b/bigjob/bigjob_agent_condor.py.bak new file mode 100644 index 00000000..d2eb5700 --- /dev/null +++ b/bigjob/bigjob_agent_condor.py.bak @@ -0,0 +1,564 @@ +#!/usr/bin/env python +"""bigjob_agent: bigjob agent adapted for Condor resources +""" +import sys +import os +import bigjob.state +import socket +import threading +import time +import pdb +import traceback +import ConfigParser +import types +import logging +logging.basicConfig(level=logging.DEBUG) + +try: + import saga +except: + logging.warning("SAGA could not be found. 
Not all functionalities working") + +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../../ext/threadpool-1.2.7/src/") +logging.debug(str(sys.path)) +from threadpool import * + +if sys.version_info < (2, 5): + sys.path.append(os.path.dirname( __file__ ) + "/../../ext/uuid-1.30/") + sys.stderr.write("Warning: Using unsupported Python version\n") +if sys.version_info < (2, 4): + sys.path.append(os.path.dirname( __file__ ) + "/../../ext/subprocess-2.6.4/") + sys.stderr.write("Warning: Using unsupported Python version\n") +if sys.version_info < (2, 3): + sys.stderr.write("Warning: Python versions <2.3 not supported\n") + sys.exit(-1) + +import subprocess + +""" Config parameters (will move to config file in future) """ +CONFIG_FILE="bigjob_agent.conf" +THREAD_POOL_SIZE=4 +APPLICATION_NAME="bigjob" + +class bigjob_agent: + + """BigJob Agent: + - reads new job information from communication and coordination subsystem (Redis) + - starts new jobs + - monitors running jobs """ + + """Constructor""" + def __init__(self, args): + + self.coordination_url = args[1] + # objects to store running jobs and processes + self.jobs = [] + self.processes = {} + self.freenodes = [] + self.busynodes = [] + self.restarted = {} + + # read config file + # conf_file = os.path.dirname(args[0]) + "/" + CONFIG_FILE + # conf_file = os.path.dirname(os.path.abspath( __file__ )) + "/" + CONFIG_FILE + conf_file = os.path.dirname(os.path.abspath( __file__ )) + "/../" + CONFIG_FILE + config = ConfigParser.ConfigParser() + logging.debug ("read configfile: " + conf_file) + config.read(conf_file) + default_dict = config.defaults() + self.CPR = default_dict["cpr"] + self.SHELL=default_dict["shell"] + self.MPIRUN=default_dict["mpirun"] + logging.debug("cpr: " + self.CPR + " mpi: " + self.MPIRUN + " shell: " + self.SHELL) + + # init rms (SGE/PBS) + self.init_rms() + + self.failed_polls = 0 + + ############################################################################## + # initialization of coordination and communication subsystem + # Redis initialization + self.base_url = args[2] + logging.debug("BigJob Agent arguments: " + str(args)) + logging.debug("Initialize C&C subsystem to pilot-url: " + self.base_url) + + + if(self.coordination_url.startswith("advert://")): + try: + from coordination.bigjob_coordination_advert import bigjob_coordination + logging.debug("Utilizing ADVERT Backend: " + self.coordination_url) + except: + logging.error("Advert Backend could not be loaded") + elif (self.coordination_url.startswith("redis://")): + try: + from coordination.bigjob_coordination_redis import bigjob_coordination + logging.debug("Utilizing Redis Backend: " + self.coordination_url + ".") + except: + logger.error("Error loading pyredis. Please verify Redis is configured properly.") + elif (self.coordination_url.startswith("tcp://")): + try: + from coordination.bigjob_coordination_zmq import bigjob_coordination + logging.debug("Utilizing ZMQ Backend") + except: + logging.error("ZMQ Backend not found. 
Please install ZeroMQ (http://www.zeromq.org/intro:get-the-software) and " + +"PYZMQ (http://zeromq.github.com/pyzmq/)") + + self.coordination = bigjob_coordination(server_connect_url=self.coordination_url) + + # update state of pilot job to running + self.coordination.set_pilot_state(self.base_url, str(bigjob.state.Running), False) + + + ############################################################################## + # start background thread for polling new jobs and monitoring current jobs + self.resource_lock=threading.RLock() + self.threadpool = ThreadPool(THREAD_POOL_SIZE) + + self.launcher_thread=threading.Thread(target=self.dequeue_new_jobs) + self.launcher_thread.start() + + self.monitoring_thread=threading.Thread(target=self.start_background_thread) + self.monitoring_thread.start() + + + + def init_rms(self): + if(os.environ.get("PBS_NODEFILE")!=None): + return self.init_pbs() + elif(os.environ.get("PE_HOSTFILE")!=None): + return self.init_sge() + else: + return self.init_local() + return None + + def init_local(self): + """ initialize free nodes list with dummy (for fork jobs)""" + try: + num_cpus = self.get_num_cpus() + for i in range(0, num_cpus): + self.freenodes.append("localhost\n") + except IOError: + self.freenodes=["localhost\n"] + + def init_sge(self): + """ initialize free nodes list from SGE environment """ + sge_node_file = os.environ.get("PE_HOSTFILE") + if sge_node_file == None: + return + f = open(sge_node_file) + sgenodes = f.readlines() + f.close() + for i in sgenodes: + + columns = i.split() + try: + for j in range(0, int(columns[1])): + logging.debug("add host: " + columns[0].strip()) + self.freenodes.append(columns[0]+"\n") + except: + pass + return self.freenodes + + def init_pbs(self): + """ initialize free nodes list from PBS environment """ + pbs_node_file = os.environ.get("PBS_NODEFILE") + if pbs_node_file == None: + return + f = open(pbs_node_file) + self.freenodes = f.readlines() + f.close() + + # check whether pbs node file contains the correct number of nodes + num_cpus = self.get_num_cpus() + node_dict={} + for i in set(self.freenodes): + node_dict[i] = self.freenodes.count(i) + if node_dict[i] < num_cpus: + node_dict[i] = num_cpus + + self.freenodes=[] + for i in node_dict.keys(): + logging.debug("host: " + i + " nodes: " + str(node_dict[i])) + for j in range(0, node_dict[i]): + logging.debug("add host: " + i.strip()) + self.freenodes.append(i) + + def get_num_cpus(self): + cpuinfo = open("/proc/cpuinfo", "r") + cpus = cpuinfo.readlines() + cpuinfo.close() + num = 0 + for i in cpus: + if i.startswith("processor"): + num = num+1 + return num + + + def execute_job(self, job_url, job_dict): + """ obtain job attributes from c&c and execute process """ + state=str(job_dict["state"]) + + #try: + # state = self.coordination.get_job_state(job_url) + #except: + # logging.error("Could not access job state... 
skip execution attempt and requeuing job") + # result = self.coordination.queue_job(self.base_url, job_url) + # if result == False: + # self.coordination.set_job_state(job_url, str(saga.job.Failed)) + + if(state==str(bigjob.state.Unknown) or + state==str(bigjob.state.New)): + try: + #job_dict["state"]=str(saga.job.New) + logging.debug("Start job: " + str(job_dict)) + numberofprocesses = "1" + if (job_dict.has_key("NumberOfProcesses") == True): + numberofprocesses = job_dict["NumberOfProcesses"] + + spmdvariation="single" + if (job_dict.has_key("SPMDVariation") == True): + spmdvariation = job_dict["SPMDVariation"] + + arguments = "" + if (job_dict.has_key("Arguments") == True): + arguments_raw = job_dict['Arguments']; + if type(arguments_raw) == types.ListType: + arguments_list = arguments_raw + else: + arguments_list = eval(job_dict["Arguments"]) + for i in arguments_list: + arguments = arguments + " " + i + + workingdirectory = os.getcwd() + if (job_dict.has_key("WorkingDirectory") == True): + workingdirectory = job_dict["WorkingDirectory"] + + environment = os.environ + if (job_dict.has_key("Environment") == True): + for i in job_dict["Environment"]: + env = i.split("=") + environment[env[0]]=env[1] + ":" + environment[env[0]] + + environment["PATH"]= workingdirectory + ":"+environment["PATH"] + print "environment[PATH]", environment["PATH"] + executable = job_dict["Executable"] + + + output="stdout" + if (job_dict.has_key("Output") == True): + output = job_dict["Output"] + + error="stderr" + if (job_dict.has_key("Error") == True): + error = job_dict["Error"] + + # append job to job list + self.jobs.append(job_url) + + # create stdout/stderr file descriptors + output_file = os.path.join(workingdirectory, output) + error_file = os.path.join(workingdirectory, error) + logging.debug("stdout: " + output_file + " stderr: " + error_file + " env: " + str(environment)) + stdout = open(output_file, "w") + stderr = open(error_file, "w") + #if not "/" in executable: + # command = workingdirectory +"/" +executable + " " + arguments + command = executable + " " + arguments + #pdb.set_trace() + # special setup for MPI NAMD jobs + machinefile = self.allocate_nodes(job_dict) + host = "localhost" + try: + machine_file_handler = open(machinefile, "r") + node= machine_file_handler.readlines() + machine_file_handler.close() + host = node[0].strip() + except: + pass + + + if(machinefile==None): + logging.debug("Not enough resources to run: " + job_url) + self.coordination.queue_job(self.base_url, job_url) + return # job cannot be run at the moment + + # start application process + if (spmdvariation.lower( )=="mpi"): + command = "cd " + workingdirectory + "; " + self.MPIRUN + " -np " + numberofprocesses + " -machinefile " + machinefile + " " + command + #if (host != socket.gethostname()): + # command ="ssh " + host + " \"cd " + workingdirectory + "; " + command +"\"" + else: + command ="chmod +x " + executable +";export PATH=$PATH:" + workingdirectory + ";" +command + shell = self.SHELL + logging.debug("execute: " + command + " in " + workingdirectory + " from: " + str(socket.gethostname()) + " (Shell: " + shell +")") + # bash works fine for launching on QB but fails for Abe :-( + p = subprocess.Popen(args=command, executable=shell, stderr=stderr, + stdout=stdout, cwd=workingdirectory, + env=environment, shell=True) + logging.debug("started " + command) + + #for condor debugging + dirlist = os.listdir(workingdirectory) + print dirlist + os.system("ls;pwd") + + self.processes[job_url] = p + 
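The conversion hunks for this file, shown earlier in the patch, rewrite every job_dict.has_key("X") test in execute_job() to use the in operator, since dict.has_key() was removed in Python 3. dict.get() with a default expresses the same optional-attribute handling in one step; a minimal illustrative sketch (not part of the patch), reusing the keys and defaults from the method above:

import os

def read_job_defaults(job_dict):
    # Python 3: dict.has_key() is gone; membership tests use "key in dict",
    # and dict.get() supplies the same fallback values in one call.
    numberofprocesses = job_dict.get("NumberOfProcesses", "1")
    spmdvariation = job_dict.get("SPMDVariation", "single")
    workingdirectory = job_dict.get("WorkingDirectory", os.getcwd())
    output = job_dict.get("Output", "stdout")
    error = job_dict.get("Error", "stderr")
    return numberofprocesses, spmdvariation, workingdirectory, output, error
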
self.coordination.set_job_state(job_url, str(bigjob.state.Running)) + except: + traceback.print_exc(file=sys.stderr) + + + def allocate_nodes(self, job_dict): + """ allocate nodes + allocated nodes will be written to machinefile advert-launcher-machines- + """ + self.resource_lock.acquire() + number_nodes = int(job_dict["NumberOfProcesses"]) + nodes = [] + machine_file_name = None + if (len(self.freenodes)>=number_nodes): + unique_nodes=set(self.freenodes) + for i in unique_nodes: + number = self.freenodes.count(i) + logging.debug("allocate: " + i + " number nodes: " + str(number) + + " current busy nodes: " + str(self.busynodes) + + " free nodes: " + str(self.freenodes)) + for j in range(0, number): + if(number_nodes > 0): + nodes.append(i) + self.freenodes.remove(i) + self.busynodes.append(i) + number_nodes = number_nodes - 1 + else: + break + + machine_file_name = self.get_machine_file_name(job_dict) + machine_file = open(machine_file_name, "w") + #machine_file.writelines(self.freenodes[:number_nodes]) + machine_file.writelines(nodes) + machine_file.close() + logging.debug("wrote machinefile: " + machine_file_name + " Nodes: " + str(nodes)) + # update node structures + #self.busynodes.extend(self.freenodes[:number_nodes]) + #del(self.freenodes[:number_nodes]) + + self.resource_lock.release() + return machine_file_name + + + + def setup_charmpp_nodefile(self, allocated_nodes): + """ Setup charm++ nodefile to use for executing NAMD + HACK!! Method violates layering principle + File $HOME/machinefile in charm++ nodefileformat is written to first node in list + """ + # Nodelist format: + # + # host tp-x001 ++cpus 2 ++shell ssh + # host tp-x002 ++cpus 2 ++shell ssh + + nodefile_string="" + for i in allocated_nodes: + if i.has_key("private_hostname"): + nodefile_string=nodefile_string + "host "+ i["private_hostname"] + " ++cpus " + str(i["cpu_count"]) + " ++shell ssh\n" + else: + nodefile_string=nodefile_string + "host "+ i["hostname"] + " ++cpus " + str(i["cpu_count"]) + " ++shell ssh\n" + + # copy nodefile to rank 0 node + jd = saga.job.description() + jd.executable = "echo" + jd.number_of_processes = "1" + jd.spmd_variation = "single" + # ssh root@tp-x001.ci.uchicago.edu "cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys" + jd.arguments = ["\""+nodefile_string+"\"", ">", "machinefile"] + jd.output = "stdout.txt" + jd.error = "stderr.txt" + + job_service_url = saga.url("ssh://root@"+allocated_nodes[0]["hostname"]) + job_service = saga.job.service(self.session, job_service_url) + job = job_service.create_job(jd) + job.run() + job.wait() + + def print_machine_file(self, filename): + fh = open(filename, "r") + lines = fh.readlines() + fh.close + logging.debug("Machinefile: " + filename + " Hosts: " + str(lines)) + + def free_nodes(self, job_url): + job_dict = self.coordination.get_job(job_url) + self.resource_lock.acquire() + number_nodes = int(job_dict["NumberOfProcesses"]) + machine_file_name = self.get_machine_file_name(job_dict) + logging.debug("Machine file: " + machine_file_name) + allocated_nodes = ["localhost\n"] + try: + machine_file = open(machine_file_name, "r") + allocated_nodes = machine_file.readlines() + machine_file.close() + except: + traceback.print_exc(file=sys.stderr) + + logging.debug("Free nodes: " + str(allocated_nodes)) + + for i in allocated_nodes: + logging.debug("free node: " + str(i) + " current busy nodes: " + str(self.busynodes) + + " free nodes: " + str(self.freenodes)) + self.busynodes.remove(i) + self.freenodes.append(i) + logging.debug("Delete " + 
machine_file_name) + if os.path.exists(machine_file_name): + os.remove(machine_file_name) + self.resource_lock.release() + + + def get_machine_file_name(self, job_dict): + """create machinefile based on jobid""" + job_id = job_dict["job-id"] + homedir = os.path.expanduser('~') + return homedir + "/advert-launcher-machines-"+ job_id + + def dequeue_new_jobs(self): + """Subscribe to new jobs from Redis. """ + job_counter = 0 + while self.is_stopped(self.base_url)==False: + if len(self.freenodes)==0: + time.sleep(3) + continue + logging.debug("Dequeue sub-job from: " + self.base_url) + job_url=self.coordination.dequeue_job(self.base_url) + if job_url==None: + time.sleep(3) + continue + if job_url=="STOP": + break + + job_counter = job_counter + 1 + if (job_counter % (THREAD_POOL_SIZE))==0: # ensure that threadpool is not too overloaded + self.threadpool.wait() + + request = WorkRequest(self.start_new_job_in_thread, [job_url]) + self.threadpool.putRequest(request) + + # wait for termination of Worker Threads + self.threadpool.wait() + logging.debug("Terminating Agent - Dequeue Sub-Jobs Thread") + + #def poll_jobs(self): + # self.threadpool.wait() + # new_jobs=self.redis.keys(self.base_url+":*") + # logging.debug("All jobs:" + str(new_jobs)) + # for i in new_jobs: + # request = WorkRequest(self.start_new_job_in_thread, [str(i)]) + # logging.debug("WorkRequest: " + str(request)) + # self.threadpool.putRequest(request) + + def start_new_job_in_thread(self, job_url): + """evaluates job dir, sanity checks, executes job """ + #pdb.set_trace() + if job_url != None: + failed = False; + try: + job_dict = self.coordination.get_job(job_url) + except: + failed=True + + if job_dict==None or failed==True: + self.coordination.queue_job(self.pilot_url, job_url) + + logging.debug("start job: " + job_url + " data: " + str(job_dict)) + if(job_dict["state"]==str(bigjob.state.Unknown)): + job_dict["state"]=str(bigjob.state.New) + self.coordination.set_job_state(job_url, str(bigjob.state.New)) + self.execute_job(job_url, job_dict) + #print "Execute: " + str(job_dict) + + def monitor_jobs(self): + """Monitor running processes. """ + #pdb.set_trace() + logging.debug("Monitor jobs - # current jobs: %d"%len(self.jobs)) + for i in self.jobs: + if self.processes.has_key(i): # only if job has already been starteds + p = self.processes[i] + p_state = p.poll() + logging.debug(self.print_job(i) + " state: " + str(p_state) + " return code: " + str(p.returncode)) + if (p_state != None and (p_state==0 or p_state==255)): + logging.debug("Job successful: " + self.print_job(i)) + self.coordination.set_job_state(i, str(bigjob.state.Done)) + #i.set_attribute("state", str(saga.job.Done)) + self.free_nodes(i) + del self.processes[i] + elif p_state!=0 and p_state!=255 and p_state != None: + logging.debug(self.print_job(i) + " failed. 
") + # do not free nodes => very likely the job will fail on these nodes + # self.free_nodes(i) + #if self.restarted.has_key(i)==False: + # logging.debug("Try to restart job " + self.print_job(i)) + # self.restarted[i]=True + # self.execute_job(i) + #else: + logging.debug("Job failed " + self.print_job(i)) + self.coordination.set_job_state(i, str(bigjob.state.Failed)) + self.free_nodes(i) + del self.processes[i] + + def print_job(self, job_url): + job_dict = self.coordination.get_job(job_url) + return ("Job: " + job_url + + " Executable: " + job_dict["Executable"]) + + + def start_background_thread(self): + self.stop=False + logging.debug("##################################### New POLL/MONITOR cycle ##################################") + logging.debug("Free nodes: " + str(len(self.freenodes)) + " Busy Nodes: " + str(len(self.busynodes))) + while True and self.stop==False: + if self.is_stopped(self.base_url)==True: + logging.debug("Pilot job entry deleted - terminate agent") + break + else: + logging.debug("Pilot job entry: " + str(self.base_url) + " exists. Pilot job not in state stopped.") + try: + #self.poll_jobs() + self.monitor_jobs() + time.sleep(5) + self.failed_polls=0 + except: + traceback.print_exc(file=sys.stdout) + self.failed_polls=self.failed_polls+1 + if self.failed_polls>3: # after 3 failed attempts exit + break + logging.debug("Terminating Agent - Background Thread") + + + def is_stopped(self, base_url): + state = None + try: + state = self.coordination.get_pilot_state(base_url) + except: + pass + logging.debug("Pilot State: " + str(state)) + if state==None or state.has_key("stopped")==False or state["stopped"]==True: + return True + else: + return False + + + def stop_background_thread(self): + self.stop=True + + +######################################################### +# main # +######################################################### +if __name__ == "__main__" : + args = sys.argv + num_args = len(args) + if (num_args!=3): + print "Usage: \n " + args[0] + " " + sys.exit(1) + + bigjob_agent = bigjob_agent(args) + diff --git a/bigjob/bigjob_manager.py b/bigjob/bigjob_manager.py index 54ec1552..fc523ff7 100644 --- a/bigjob/bigjob_manager.py +++ b/bigjob/bigjob_manager.py @@ -14,7 +14,7 @@ import traceback import logging import textwrap -import urlparse +import urllib.parse import types import subprocess import pdb @@ -35,12 +35,12 @@ # Optional Job Plugins try: - from job_plugin.gcessh import Service as GCEService + from .job_plugin.gcessh import Service as GCEService except: pass try: - from job_plugin.ec2ssh import Service as EC2Service + from .job_plugin.ec2ssh import Service as EC2Service except: pass @@ -329,7 +329,7 @@ def start_pilot_job(self, jd.executable = "/usr/bin/env" jd.arguments = ["python", os.path.basename(condor_bootstrap_filename)] - if pilot_compute_description.has_key("candidate_hosts"): + if "candidate_hosts" in pilot_compute_description: jd.candidate_hosts = pilot_compute_description["candidate_hosts"] bj_file_transfers = [] file_transfer_spec = condor_bootstrap_filename + " > " + os.path.basename(condor_bootstrap_filename) @@ -348,7 +348,7 @@ def start_pilot_job(self, else: jd.total_cpu_count=int(number_nodes) jd.spmd_variation = "single" - if pilot_compute_description!=None and pilot_compute_description.has_key("spmd_variation"): + if pilot_compute_description!=None and "spmd_variation" in pilot_compute_description: jd.spmd_variation=pilot_compute_description["spmd_variation"] jd.arguments = ["python", "-c", bootstrap_script] jd.executable = 
"/usr/bin/env" @@ -427,7 +427,7 @@ def get_free_nodes(self): number_used_nodes=0 for i in jobs: job_detail = self.coordination.get_job(i) - if job_detail != None and job_detail.has_key("state") == True\ + if job_detail != None and ("state" in job_detail) == True\ and job_detail["state"]==str(Running): job_np = "1" if (job_detail["NumberOfProcesses"] == True): @@ -499,7 +499,7 @@ def wait(self): state = str(self.coordination.get_job_state(sj_id)) #logger.debug("SJ: %s : State: %s"%(sj_id, str(state))) #state = job_detail["state"] - if result_map.has_key(state)==False: + if (state in result_map)==False: result_map[state]=1 else: result_map[state] = result_map[state]+1 @@ -559,7 +559,7 @@ def _add_subjob(self, queue_url, jd, job_url, job_id): logger.debug("job dict: " + str(job_dict)) - if job_dict.has_key("FileTransfer"): + if "FileTransfer" in job_dict: files = job_dict["FileTransfer"] sj_work_dir = self.__get_subjob_working_dir(job_id) self.__stage_files(files, sj_work_dir) @@ -765,7 +765,7 @@ def __parse_url(self, url): """ Fallback URL parser based on Python urlparse library """ logger.error("URL %s could not be parsed"%(url)) traceback.print_exc(file=sys.stderr) - result = urlparse.urlparse(url) + result = urllib.parse.urlparse(url) logger.debug("Result: " + str(result)) host = result.hostname #host = None @@ -1178,7 +1178,7 @@ def __init__(self): # -------------------------------------------------------------------------- # def _get_input_data (self) : - print "get caled. returning: %s" % self.input_data + print("get caled. returning: %s" % self.input_data) return self.input_data # -------------------------------------------------------------------------- diff --git a/bigjob/bigjob_manager.py.bak b/bigjob/bigjob_manager.py.bak new file mode 100644 index 00000000..54ec1552 --- /dev/null +++ b/bigjob/bigjob_manager.py.bak @@ -0,0 +1,1190 @@ +#!/usr/bin/env python + +"""Module bigjob_manager. + +This Module is used to launch jobs via a central distributed coordination service (e.g. an Redis or Advert instance). + +Background: This approach avoids queueing delays since only the BigJob-Agent must be started via saga.job. 
+All shortrunning task will be started using the protocol implemented by subjob() and bigjob_agent.py +""" + +import sys +import time +import os +import traceback +import logging +import textwrap +import urlparse +import types +import subprocess +import pdb + +# the one and only saga +import saga +from saga.job import Description +from saga import Url as SAGAUrl +from saga.job import Description as SAGAJobDescription +from saga.job import Service as SAGAJobService +from saga import Session as SAGASession +from saga import Context as SAGAContext + +from radical.utils.object_cache import ObjectCache + +from bigjob.state import Running, New, Failed, Done, Unknown +from bigjob import logger + +# Optional Job Plugins +try: + from job_plugin.gcessh import Service as GCEService +except: + pass + +try: + from job_plugin.ec2ssh import Service as EC2Service +except: + pass + + +# import other BigJob packages +# import API +import api.base +sys.path.append(os.path.dirname(__file__)) + +# Some python version detection +if sys.version_info < (2, 5): + sys.path.append(os.path.dirname( __file__ ) + "/ext/uuid-1.30/") + sys.stderr.write("Warning: Using unsupported Python version\n") + +if sys.version_info < (2, 4): + sys.stderr.write("Error: Python versions <2.4 not supported\n") + sys.exit(-1) + +import uuid + +def get_uuid(): + wd_uuid="" + wd_uuid += str(uuid.uuid1()) + return wd_uuid + + +""" Config parameters (will move to config file in future) """ +_CLEANUP=True + +#for legacy purposes and support for old BJ API +_pilot_url_dict={} # stores a mapping of pilot_url to bigjob + +class BigJobError(Exception): + def __init__(self, value): + self.value = value + + def __str__(self): + return repr(self.value) + + +class bigjob(api.base.bigjob): + + ''' BigJob: Class for managing pilot jobs: + + Example: + + + bj = bigjob("redis://localhost") + + bj.start_pilot_job("fork://localhost") + + .. + + bj.cancel() + ''' + + __APPLICATION_NAME="bigjob" + + def __init__(self, + coordination_url="advert://localhost/?dbtype=sqlite3", + pilot_url=None): + """ Initializes BigJob's coordination system + advert://localhost (SAGA/Advert SQLITE) + advert://advert.cct.lsu.edu:8080 (SAGA/Advert POSTGRESQL) + redis://localhost:6379 (Redis at localhost) + tcp://localhost (ZMQ) + + The following formats for pilot_url are supported: + + + 1.) Including root path at distributed coordination service: + redis://localhost/bigjob:bj-1c3816f0-ad5f-11e1-b326-109addae22a3:localhost + + This path is returned when call bigjob.get_url() + + 2.) BigJob unique ID: + bigjob:bj-1c3816f0-ad5f-11e1-b326-109addae22a3:localhost + + + """ + + self.coordination_url = coordination_url + if self.coordination_url==None: + logger.error("Coordination URL not set. 
Exiting BigJob.") + #self.launch_method="" + self.__filemanager=None + self._ocache = ObjectCache () + + # restore existing BJ or initialize new BJ + if pilot_url!=None: + logger.debug("Reconnect to BJ: %s"%pilot_url) + if pilot_url.startswith("bigjob:"): + self.pilot_url=pilot_url + else: + self.coordination_url, self.pilot_url = self.__parse_pilot_url(pilot_url) + + self.uuid = self.__get_bj_id(pilot_url) + self.app_url = self.__APPLICATION_NAME +":" + str(self.uuid) + self.job = None + self.working_directory = None + # Coordination subsystem must be initialized before get_state_detail + self.coordination = self.__init_coordination(self.coordination_url) + self.state=self.get_state_detail() + _pilot_url_dict[self.pilot_url]=self + else: + self.coordination = self.__init_coordination(self.coordination_url) + self.uuid = "bj-" + str(get_uuid()) + logger.debug("init BigJob w/: " + coordination_url) + self.app_url =self. __APPLICATION_NAME +":" + str(self.uuid) + self.state=Unknown + self.pilot_url="" + self.job = None + self.working_directory = None + logger.debug("initialized BigJob: " + self.app_url) + + + def start_pilot_job(self, + lrms_url, + number_nodes=1, + queue=None, + project=None, + working_directory=None, + userproxy=None, + walltime=None, + processes_per_node=1, + filetransfers=None, + spmd_variation=None, + external_queue="", + pilot_compute_description=None): + """ Start a batch job (using SAGA Job API) at resource manager. Currently, the following resource manager are supported: + fork://localhost/ (Default Job Adaptor + gram://qb1.loni.org/jobmanager-pbs (Globus Adaptor) + pbspro://localhost (PBS Pro Adaptor) + + """ + if self.job != None: + raise BigJobError("One BigJob already active. Please stop BigJob first.") + return + + ############################################################################## + # initialization of coordination and communication subsystem + # Communication & Coordination initialization + lrms_saga_url = SAGAUrl(lrms_url) + self.url = lrms_saga_url + self.pilot_url = self.app_url + ":" + lrms_saga_url.host + self.number_nodes=int(number_nodes)*int(processes_per_node) + + # Store references to BJ in global dict + _pilot_url_dict[self.pilot_url]=self + _pilot_url_dict[external_queue]=self + + logger.debug("create pilot job entry on backend server: " + self.pilot_url) + self.coordination.set_pilot_state(self.pilot_url, str(Unknown), False) + if pilot_compute_description==None: + pilot_compute_description={"service_url": lrms_url, + "number_of_processes": number_nodes, + "processes_per_node": processes_per_node, + "working_directory": working_directory} + self.coordination.set_pilot_description(self.pilot_url, pilot_compute_description) + logger.debug("set pilot state to: " + str(Unknown)) + + # Create Job Service (Default: SAGA Job Service, alternative Job Services supported) + self.js =None + if lrms_saga_url.scheme=="gce+ssh": + self.js = GCEService(lrms_saga_url, pilot_compute_description) + elif lrms_saga_url.scheme=="ec2+ssh" or lrms_saga_url.scheme=="euca+ssh" \ + or lrms_saga_url.scheme=="nova+ssh": + self.js = EC2Service(lrms_saga_url, pilot_compute_description) + #elif lrms_saga_url.scheme=="slurm+ssh": + # self.js = SlurmService(lrms_saga_url, pilot_compute_description) + else: + self.js = self._ocache.get_obj (lrms_saga_url, lambda : SAGAJobService (lrms_saga_url)) + ############################################################################## + # create job description + jd = SAGAJobDescription() + + # Attempt to create working 
directory (e.g. in local scenario) + if working_directory != None and working_directory != "": + if not os.path.isdir(working_directory) \ + and (lrms_saga_url.scheme.startswith("fork") or lrms_saga_url.scheme.startswith("condor")) \ + and working_directory.startswith("go:")==False: + os.mkdir(working_directory) + self.working_directory = working_directory + else: + # if no working dir is set assume use home directory + # will fail if home directory is not the same on remote machine + # but this is just a guess to avoid failing + self.working_directory = "~" + #self.working_directory = "" + + if queue != None: + jd.queue = queue + if spmd_variation != None: + jd.spmd_variation = spmd_variation + if project !=None: + jd.project=project + if walltime!=None: + logger.debug("setting walltime to: " + str(walltime)) + jd.wall_time_limit=int(walltime) + + + + ############################################################################## + # File Management and Stage-In + # Determine whether target machine use gsissh or ssh to logon. + # logger.debug("Detect launch method for: " + lrms_saga_url.host) + # self.launch_method = self.__get_launch_method(lrms_saga_url.host,lrms_saga_url.username) + self.bigjob_working_directory_url="" + if lrms_saga_url.scheme.startswith("gce") or lrms_saga_url.scheme.startswith("ec2")\ + or lrms_saga_url.scheme.startswith("euca") or lrms_saga_url.scheme.startswith("nova"): + logger.debug("File Staging for Cloud Instances currently not supported.") + elif lrms_saga_url.scheme.startswith("condor") == True: + logger.debug("Using Condor file staging") + else: + # build target url for working directory + # this will also create the remote directory for the BJ + # Fallback if working directory is not a valid URL + if not (self.working_directory.startswith("go:") or self.working_directory.startswith("ssh://")): + if lrms_saga_url.username!=None and lrms_saga_url.username!="": + self.bigjob_working_directory_url = "ssh://" + lrms_saga_url.username + "@" + lrms_saga_url.host + "/" + self.__get_bigjob_working_dir() + else: + self.bigjob_working_directory_url = "ssh://" + lrms_saga_url.host + "/" + self.__get_bigjob_working_dir() + elif self.working_directory.startswith("go:"): + self.bigjob_working_directory_url=os.path.join(self.working_directory, self.uuid) + else: + # working directory is a valid file staging URL + self.bigjob_working_directory_url=self.working_directory + + # initialize file manager that takes care of file movement and directory creation + if self.__filemanager==None: + self.__initialize_pilot_data(self.bigjob_working_directory_url) # determines the url + + if self.__filemanager != None and not self.working_directory.startswith("/"): + self.working_directory = self.__filemanager.get_path(self.bigjob_working_directory_url) + + # determine working directory of bigjob + # if a remote sandbox can be created via ssh => create a own dir for each bj job id + # otherwise use specified working directory + logger.debug("BigJob working directory: %s"%self.bigjob_working_directory_url) + if self.__filemanager!=None and self.__filemanager.create_remote_directory(self.bigjob_working_directory_url)==True: + self.working_directory = self.__get_bigjob_working_dir() + self.__stage_files(filetransfers, self.bigjob_working_directory_url) + else: + logger.warn("No file staging adaptor found.") + + logger.debug("BJ Working Directory: %s", self.working_directory) + + if lrms_saga_url.scheme.startswith("condor")==False: + jd.working_directory = self.working_directory + else: + 
jd.working_directory="" + + + + ############################################################################## + # Create and process BJ bootstrap script + bootstrap_script = self.__generate_bootstrap_script( + self.coordination.get_address(), + self.pilot_url, # Queue 1 used by this BJ object + external_queue # Queue 2 used by Pilot Compute Service + # or another external scheduler + ) + logger.debug("Adaptor specific modifications: " + str(lrms_saga_url.scheme)) + bootstrap_script = self.__escape_pbs(bootstrap_script) + #bootstrap_script = self.__escape_ssh(bootstrap_script) + logger.debug(bootstrap_script) + + + # Define Agent Executable in Job description + # in Condor case bootstrap script is staged + # (Python app cannot be passed inline in Condor job description) + if lrms_saga_url.scheme.startswith("condor")==True: + + bootstrap_script = self.__generate_bootstrap_script_from_binary( + self.coordination.get_address(), + self.pilot_url, # Queue 1 used by this BJ object + external_queue # Queue 2 used by Pilot Compute Service + # or another external scheduler + ) + + condor_bootstrap_filename = os.path.join("/tmp", "bootstrap-"+str(self.uuid)) + condor_bootstrap_file = open(condor_bootstrap_filename, "w") + condor_bootstrap_file.write(bootstrap_script) + condor_bootstrap_file.close() + logger.debug("Using Condor - bootstrap file: " + condor_bootstrap_filename) + + jd.executable = "/usr/bin/env" + jd.arguments = ["python", os.path.basename(condor_bootstrap_filename)] + if pilot_compute_description.has_key("candidate_hosts"): + jd.candidate_hosts = pilot_compute_description["candidate_hosts"] + bj_file_transfers = [] + file_transfer_spec = condor_bootstrap_filename + " > " + os.path.basename(condor_bootstrap_filename) + bj_file_transfers.append(file_transfer_spec) + output_file_name = "output-" + str(self.uuid) + ".tar.gz" + #output_file_transfer_spec = os.path.join(self.working_directory, output_file_name) +" < " + output_file_name + output_file_transfer_spec = output_file_name +" < " + output_file_name + #output_file_transfer_spec = os.path.join(self.working_directory, "output.tar.gz") +" < output.tar.gz" + #logger.debug("Output transfer: " + output_file_transfer_spec) + #bj_file_transfers.append(output_file_transfer_spec) + if filetransfers != None: + for t in filetransfers: + bj_file_transfers.append(t) + logger.debug("Condor file transfers: " + str(bj_file_transfers)) + jd.file_transfer = bj_file_transfers + else: + jd.total_cpu_count=int(number_nodes) + jd.spmd_variation = "single" + if pilot_compute_description!=None and pilot_compute_description.has_key("spmd_variation"): + jd.spmd_variation=pilot_compute_description["spmd_variation"] + jd.arguments = ["python", "-c", bootstrap_script] + jd.executable = "/usr/bin/env" + + logger.debug("Working directory: " + jd.working_directory + " Job Description: " + str(jd)) + + jd.output = os.path.join(self.working_directory, "stdout-" + self.uuid + "-agent.txt") + jd.error = os.path.join(self.working_directory, "stderr-" + self.uuid + "-agent.txt") + + ############################################################################## + # Create and submit pilot job to job service + logger.debug("Creating pilot job with description: %s" % str(jd)) + self.job = self.js.create_job(jd) + logger.debug("Trying to submit pilot job to: " + str(lrms_saga_url)) + self.job.run() + + if self.job.get_state() == saga.job.FAILED: + logger.debug("SUBMISSION FAILED. Exiting... ") + sys.exit(-1) + else: + logger.debug("Submission succeeded. 
Job ID: %s " % self.job.id) + + return self.pilot_url + + + def list_subjobs(self): + sj_list = self.coordination.get_jobs_of_pilot(self.pilot_url) + logger.debug(str(sj_list)) + subjobs = [] + for i in sj_list: + url = i + #if url.find("/")>0: + # url = url[url.find("bigjob"):] + # url = url.replace("/", ":") + #sj = subjob(coordination_url=self.coordination_url, subjob_url=url) + sj = subjob(subjob_url=url) + subjobs.append(sj.get_url()) + return subjobs + + + def get_state(self): + """ duck typing for get_state of saga.job.job + state of saga job that is used to spawn the pilot agent + """ + return self.get_state_detail() + + + def get_state_detail(self): + """ internal state of BigJob agent """ + try: + return self.coordination.get_pilot_state(self.pilot_url)["state"] + except: + return None + + + def get_url(self): + """ Get unique URL of big-job. This URL can be used to reconnect to BJ later, e.g.: + + redis://localhost/bigjob:bj-1c3816f0-ad5f-11e1-b326-109addae22a3:localhost + + """ + url = os.path.join(self.coordination.address, + self.pilot_url) + if self.coordination.dbtype!="" and self.coordination.dbtype!=None: + url = os.path.join(url, "?" + self.coordination.dbtype) + return url + + + def get_free_nodes(self): + """ Returns number of free nodes of subjob + + Attention: Only evaluates jobs directly submitted to BigJob. + This method cannot be used if Pilot is managed by a ComputeDataService. + """ + jobs = self.coordination.get_jobs_of_pilot(self.pilot_url) + number_used_nodes=0 + for i in jobs: + job_detail = self.coordination.get_job(i) + if job_detail != None and job_detail.has_key("state") == True\ + and job_detail["state"]==str(Running): + job_np = "1" + if (job_detail["NumberOfProcesses"] == True): + job_np = job_detail["NumberOfProcesses"] + number_used_nodes=number_used_nodes + int(job_np) + return (self.number_nodes - number_used_nodes) + + + def get_details(self): + """ Return details with respect to PilotCompute (BigJob) Instance """ + details_dict={} + try: + details_dict = self.coordination.get_pilot_state(self.pilot_url) + except: + pass + details_dict["bigjob_id"] = self.pilot_url + return details_dict + + ############################################################################################################### + def cancel(self): + """ duck typing for cancel of saga.cpr.job and saga.job.job """ + logger.debug("Cancel Pilot Job") + try: + self.job.cancel() + except: + pass + #traceback.print_stack() + + logger.debug("Cancel Job Service") + try: + if not self._ocache.rem_obj (self.js) : + logger.debug("Cancel Job Service") + del (self.js) + else : + logger.debug("Cancel Job Service done") + + self.js = None + except: + pass + #traceback.print_stack() + + try: + self._stop_pilot_job() + logger.debug("delete pilot job: " + str(self.pilot_url)) + if _CLEANUP: + self.coordination.delete_pilot(self.pilot_url) + #os.remove(os.path.join("/tmp", "bootstrap-"+str(self.uuid))) + except: + pass + #traceback.print_stack() + logger.debug("Cancel Pilot Job finished") + + + def wait(self): + """ Waits for completion of all sub-jobs """ + while 1: + if self.get_state()=="Done" or self.get_state()=="Failed": + logger.debug("BigJob terminated. 
Exit Wait") + break + + jobs = self.coordination.get_jobs_of_pilot(self.pilot_url) + finish_counter=0 + result_map = {} + for i in jobs: + # parse job id out of sj url + surl = SAGAUrl(i) + sj_id = surl.path + if sj_id.startswith("/"): sj_id = sj_id[1:] + state = str(self.coordination.get_job_state(sj_id)) + #logger.debug("SJ: %s : State: %s"%(sj_id, str(state))) + #state = job_detail["state"] + if result_map.has_key(state)==False: + result_map[state]=1 + else: + result_map[state] = result_map[state]+1 + if self.__has_finished(state)==True: + finish_counter = finish_counter + 1 + logger.debug("Total Jobs: %s States: %s"%(len(jobs), str(result_map))) + if finish_counter == len(jobs): + break + time.sleep(2) + + + ########################################################################### + # internal and protected methods + def _stop_pilot_job(self): + """ mark in database entry of pilot-job as stopped """ + try: + logger.debug("stop pilot job: " + self.pilot_url) + self.coordination.set_pilot_state(self.pilot_url, str(Done), True) + self.job=None + except: + pass + + def _delete_subjob(self, job_url): + self.coordination.delete_job(job_url) + + def _get_subjob_state(self, job_url): + logger.debug("Get subjob state: " + str(job_url)) + return self.coordination.get_job_state(job_url) + + def _get_subjob_details(self, job_url): + return self.coordination.get_job(job_url) + + def _add_subjob(self, queue_url, jd, job_url, job_id): + logger.debug("add subjob to queue of PJ: " + str(queue_url)) + for i in range(0,3): + try: + logger.debug("create dictionary for job description. Job-URL: " + job_url) + # put job description attributes to Coordination Service + job_dict = {} + # to accomendate current bug in saga (Number of processes is not returned from list attributes) + job_dict["NumberOfProcesses"] = "1" + attributes = jd.list_attributes() + logger.debug("SJ Attributes: " + str(jd)) + for i in attributes: + if jd.attribute_is_vector(i): + vector_attr = [] + for j in jd.get_vector_attribute(i): + vector_attr.append(j) + job_dict[i]=vector_attr + else: + #logger.debug("Add attribute: " + str(i) + " Value: " + jd.get_attribute(i)) + job_dict[i] = jd.get_attribute(i) + + # Other pilot state information + job_dict["state"] = str(Unknown) + job_dict["job-id"] = str(job_id) + logger.debug("job dict: " + str(job_dict)) + + + if job_dict.has_key("FileTransfer"): + files = job_dict["FileTransfer"] + sj_work_dir = self.__get_subjob_working_dir(job_id) + self.__stage_files(files, sj_work_dir) + + #logger.debug("update job description at communication & coordination sub-system") + self.coordination.set_job(job_url, job_dict) + self.coordination.queue_job(queue_url, job_url) + break + except: + self.__print_traceback() + time.sleep(2) + + + def _get_subjob_url(self, subjob_url): + """ Get unique URL for a sub-job. This URL can be used to reconnect to SJ later, e.g.: + + redis://localhost/bigjob:bj-9a9ba4d8-b162-11e1-9c42-109addae22a3:localhost:jobs:sj-6f44da6e-b178-11e1-bc99-109addae22a3 + """ + url = subjob_url + if subjob_url.find("bigjob")==0: + url = os.path.join(self.coordination.address, + subjob_url) + if self.coordination.dbtype!="" and self.coordination.dbtype!=None: + url = os.path.join(url, "?" 
+ self.coordination.dbtype) + return url + + + def __generate_bootstrap_script(self, coordination_host, coordination_namespace, external_coordination_namespace="", bigjob_home=None): + script = textwrap.dedent("""import sys +import os +import urllib +import sys +import time +start_time = time.time() +home = os.environ.get("HOME") +#print "Home: " + home +if home==None: home = os.getcwd() +BIGJOB_AGENT_DIR= os.path.join(home, ".bigjob") +if not os.path.exists(BIGJOB_AGENT_DIR): os.mkdir (BIGJOB_AGENT_DIR) +BIGJOB_PYTHON_DIR=BIGJOB_AGENT_DIR+"/python/" +if not os.path.exists(BIGJOB_PYTHON_DIR): os.mkdir(BIGJOB_PYTHON_DIR) +BOOTSTRAP_URL="https://raw.github.com/saga-project/BigJob/master/bootstrap/bigjob-bootstrap.py" +BOOTSTRAP_FILE=BIGJOB_AGENT_DIR+"/bigjob-bootstrap.py" +#ensure that BJ in .bigjob is upfront in sys.path +sys.path.insert(0, os.getcwd() + "/../") +p = list() +for i in sys.path: + if i.find(\".bigjob/python\")>1: + p.insert(0, i) +for i in p: sys.path.insert(0, i) +print "Python path: " + str(sys.path) +print "Python version: " + str(sys.version_info) +try: import saga +except: print "SAGA not found."; +try: import bigjob.bigjob_agent +except: + print "BigJob not installed. Attempt to install it."; + try: + opener = urllib.FancyURLopener({}); + opener.retrieve(BOOTSTRAP_URL, BOOTSTRAP_FILE); + except Exception, ex: + print "Unable to download bootstrap script: " + str(ex) + ". Please install BigJob manually." + print "Execute: " + "python " + BOOTSTRAP_FILE + " " + BIGJOB_PYTHON_DIR + os.system("/usr/bin/env") + try: + os.system("python " + BOOTSTRAP_FILE + " " + BIGJOB_PYTHON_DIR); + activate_this = os.path.join(BIGJOB_PYTHON_DIR, "bin/activate_this.py"); + execfile(activate_this, dict(__file__=activate_this)) + except: + print "BJ installation failed. Trying system-level python (/usr/bin/python)"; + os.system("/usr/bin/python " + BOOTSTRAP_FILE + " " + BIGJOB_PYTHON_DIR); + activate_this = os.path.join(BIGJOB_PYTHON_DIR, "bin/activate_this.py"); + execfile(activate_this, dict(__file__=activate_this)) +#try to import BJ once again +try: + import bigjob.bigjob_agent +except Exception, ex: + print "Unable install BigJob: " + str(ex) + ". Please install BigJob manually." 
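The bootstrap script embedded in the string above is Python 2 only: it relies on print statements, "except Exception, ex:", urllib.FancyURLopener and execfile(), none of which exist in Python 3. As a hedged illustration of the corresponding Python 3 calls (this block is not part of the patch; it reuses the path and URL names defined in the script above, and the urllib.request usage is consistent with the urlparse to urllib.parse change this patch makes in bigjob_manager.py):

import os
import urllib.request

home = os.environ.get("HOME") or os.getcwd()
BIGJOB_AGENT_DIR = os.path.join(home, ".bigjob")
BIGJOB_PYTHON_DIR = os.path.join(BIGJOB_AGENT_DIR, "python")
BOOTSTRAP_URL = "https://raw.github.com/saga-project/BigJob/master/bootstrap/bigjob-bootstrap.py"
BOOTSTRAP_FILE = os.path.join(BIGJOB_AGENT_DIR, "bigjob-bootstrap.py")

os.makedirs(BIGJOB_PYTHON_DIR, exist_ok=True)
try:
    # urllib.request.urlretrieve() replaces urllib.FancyURLopener().retrieve()
    urllib.request.urlretrieve(BOOTSTRAP_URL, BOOTSTRAP_FILE)
except Exception as ex:  # "except Exception, ex:" is a syntax error in Python 3
    print("Unable to download bootstrap script: " + str(ex))  # print is a function

# execfile() was removed in Python 3; the usual replacement is exec() + open():
activate_this = os.path.join(BIGJOB_PYTHON_DIR, "bin", "activate_this.py")
if os.path.exists(activate_this):
    exec(open(activate_this).read(), dict(__file__=activate_this))
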
+# execute bj agent +args = list() +args.append("bigjob_agent.py") +args.append(\"%s\") +args.append(\"%s\") +args.append(\"%s\") +print "Bootstrap time: " + str(time.time()-start_time) +print "Starting BigJob Agents with following args: " + str(args) +bigjob_agent = bigjob.bigjob_agent.bigjob_agent(args) +""" % (coordination_host, coordination_namespace, external_coordination_namespace)) + return script + + def __generate_bootstrap_script_from_binary(self, coordination_host, coordination_namespace, + external_coordination_namespace="", + bigjob_home=None): + script = textwrap.dedent("""import sys +import os +import urllib +import sys +import time +start_time = time.time() +home = os.environ.get("HOME") +#print "Home: " + home +if home==None: home = os.getcwd() +BIGJOB_AGENT_DIR= os.path.join(home, ".bigjob") +if not os.path.exists(BIGJOB_AGENT_DIR): os.mkdir (BIGJOB_AGENT_DIR) +BIGJOB_PYTHON_DIR=BIGJOB_AGENT_DIR+"/python/" +if not os.path.exists(BIGJOB_PYTHON_DIR): os.mkdir(BIGJOB_PYTHON_DIR) +BOOTSTRAP_URL="http://s3.amazonaws.com/bigjob/bigjob-Linux-x86_64.tar.gz" +BOOTSTRAP_FILE="bigjob-Linux-x86_64.tar.gz" +#ensure that BJ in .bigjob is upfront in sys.path +sys.path.insert(0, os.getcwd() + "/../") +#sys.path.insert(0, /User/luckow/.bigjob/python/lib") +#sys.path.insert(0, os.getcwd() + "/../../") +p = list() +for i in sys.path: + if i.find(\".bigjob/python\")>1: + p.insert(0, i) +for i in p: sys.path.insert(0, i) +print "Python path: " + str(sys.path) +print "Python version: " + str(sys.version_info) +try: import saga +except: print "SAGA and SAGA Python Bindings not found."; +try: import bigjob.bigjob_agent +except: + print "BigJob not installed. Attempt to install it."; + #opener = urllib.FancyURLopener({}); + #ret = opener.retrieve(BOOTSTRAP_URL, BOOTSTRAP_FILE); + print "Download via wget" + os.system("wget " + BOOTSTRAP_URL) + os.system("rm -rf .bigjob") + print "Execute: " + "tar -xzf " + BOOTSTRAP_FILE + os.system("ls -lta") + try: + os.system("tar -xzf " + BOOTSTRAP_FILE); + os.system("ls -lta") + os.system(".bigjob/python/bin/python -c 'import bigjob; import bigjob.bigjob_agent; print bigjob.version; bigjob.bigjob_agent.bigjob_agent([\\"bigjob_agent.py\\", \\"%s\\", \\"%s\\", \\"%s\\"])'") + + except: + print "BJ installation failed!"; +""" % (coordination_host, coordination_namespace, external_coordination_namespace)) + return script + + + def __escape_rsl(self, bootstrap_script): + logger.debug("Escape RSL") + bootstrap_script = bootstrap_script.replace("\"", "\"\"") + return bootstrap_script + + + def __escape_pbs(self, bootstrap_script): + logger.debug("Escape bootstrap script") + bootstrap_script = "\'" + bootstrap_script+ "\'" + return bootstrap_script + + + def __escape_ssh(self, bootstrap_script): + logger.debug("Escape SSH") + bootstrap_script = bootstrap_script.replace("\"", "\\\"") + bootstrap_script = bootstrap_script.replace("\'", "\\\"") + bootstrap_script = "\"" + bootstrap_script+ "\"" + return bootstrap_script + + def __escape_saga(self, bootstrap_script): + logger.debug("Escape SAGA") + #bootstrap_script = bootstrap_script.replace("\'", "\"") + #bootstrap_script = "\'" + bootstrap_script+ "\'" + bootstrap_script = bootstrap_script.replace('"','\\"') + bootstrap_script = '"' + bootstrap_script+ '"' + return bootstrap_script + + + def __parse_pilot_url(self, pilot_url): + #pdb.set_trace() + pilot_saga_url = SAGAUrl(pilot_url) + dbtype = pilot_saga_url.query + coordination = pilot_url[:pilot_url.index("bigjob")] + if dbtype!=None: + coordination = 
os.path.join(coordination, "?"+dbtype) + pilot_url = pilot_saga_url.path[1:] + + logger.debug("Parsed URL - Coordination: %s Pilot: %s"%(coordination, pilot_url)) + return coordination, pilot_url + + + def __has_finished(self, state): + state = state.lower() + if state=="done" or state=="failed" or state=="canceled": + return True + else: + return False + + def __parse_url(self, url): + try: + surl = SAGAUrl(url) + host = surl.host + port = surl.port + username = surl.username + password = surl.password + query = surl.query + if query!=None and query.endswith("/"): + query = query[:-1] + scheme = "%s://"%surl.scheme + except: + """ Fallback URL parser based on Python urlparse library """ + logger.error("URL %s could not be parsed"%(url)) + traceback.print_exc(file=sys.stderr) + result = urlparse.urlparse(url) + logger.debug("Result: " + str(result)) + host = result.hostname + #host = None + port = result.port + username = result.username + password = result.password + scheme = "%s://"%result.scheme + if host==None: + logger.debug("Python 2.6 fallback") + if url.find("/", len(scheme)) > 0: + host = url[len(scheme):url.find("/", len(scheme))] + else: + host = url[len(scheme):] + if host.find(":")>1: + logger.debug(host) + comp = host.split(":") + host = comp[0] + port = int(comp[1]) + + if url.find("?")>0: + query = url[url.find("?")+1:] + else: + query = None + + + logger.debug("%s %s %s"%(scheme, host, port)) + return scheme, username, password, host, port, query + + def __get_bj_id(self, pilot_url): + start = pilot_url.index("bj-") + end =pilot_url.index(":", start) + return pilot_url[start:end] + + + def __init_coordination(self, coordination_url): + + bigjob_coordination = None + if(coordination_url.startswith("advert://") or coordination_url.startswith("sqlasyncadvert://")): + try: + from coordination.bigjob_coordination_advert import bigjob_coordination + logger.debug("Utilizing ADVERT Backend") + except: + logger.error("Advert Backend could not be loaded") + elif (coordination_url.startswith("redis://")): + try: + from coordination.bigjob_coordination_redis import bigjob_coordination + logger.debug("Utilizing Redis Backend") + except: + logger.error("Error loading pyredis.") + self.__print_traceback() + elif (coordination_url.startswith("tcp://")): + try: + from coordination.bigjob_coordination_zmq import bigjob_coordination + logger.debug("Utilizing ZMQ Backend") + except: + logger.error("ZMQ Backend not found. 
Please install ZeroMQ (http://www.zeromq.org/intro:get-the-software) and " + +"PYZMQ (http://zeromq.github.com/pyzmq/)") + else: + logger.error("No suitable coordination backend found.") + + # check whether coordination subsystem could be initialized + if bigjob_coordination==None: + raise BigJobError("Could not initialize coordination subsystem (Redis)") + + + logger.debug("Parsing URL: " + coordination_url) + scheme, username, password, host, port, dbtype = self.__parse_url(coordination_url) + + if port == -1: + port = None + coordination = bigjob_coordination(server=host, server_port=port, username=username, + password=password, dbtype=dbtype, url_prefix=scheme) + return coordination + + + def __get_bigjob_working_dir(self): + self.working_directory = os.path.abspath(os.path.expanduser(self.working_directory)) + if self.working_directory.find(self.uuid)!=-1: # working directory already contains BJ id + return self.working_directory + else: + return os.path.join(self.working_directory, self.uuid) + + + def __get_subjob_working_dir(self, sj_id): + base_url = self.bigjob_working_directory_url + url = os.path.join(base_url, sj_id) + return url + + + ########################################################################### + # File Management + + def __initialize_pilot_data(self, service_url): + # initialize file adaptor + # Pilot Data API for File Management + if service_url.startswith("ssh:"): + logger.debug("Use SSH backend for PilotData") + try: + from pilot.filemanagement.ssh_adaptor import SSHFileAdaptor + self.__filemanager = SSHFileAdaptor(service_url) + except: + logger.debug("SSH package not found.") + self.__print_traceback() + elif service_url.startswith("http:"): + logger.debug("Use WebHDFS backend") + try: + from pilot.filemanagement.webhdfs_adaptor import WebHDFSFileAdaptor + self.__filemanager = WebHDFSFileAdaptor(service_url) + except: + logger.debug("WebHDFS package not found.") + elif service_url.startswith("go:"): + logger.debug("Use Globus Online backend") + try: + from pilot.filemanagement.globusonline_adaptor import GlobusOnlineFileAdaptor + self.__filemanager = GlobusOnlineFileAdaptor(service_url) + except: + logger.debug("Globus Online package not found.") + self.__print_traceback() + + + + + def __stage_files(self, filetransfers, target_url): + logger.debug("Stage: %s to %s"%(filetransfers, target_url)) + if filetransfers==None: + return + if self.__filemanager: + self.__filemanager.create_remote_directory(target_url) + for i in filetransfers: + source_file=i + if i.find(">")>0: + source_file = i[:i.find(">")].strip() + if source_file.startswith("ssh://")==False and source_file.startswith("go://")==False: + logger.error("Staging of file: %s not supported. 
Please use URL in form ssh://"%source_file) + continue + target_url_full = os.path.join(target_url, os.path.basename(source_file)) + logger.debug("Stage: %s to %s"%(source_file, target_url_full)) + #self.__third_party_transfer(source_file, target_url_full) + if self.__filemanager: + self.__filemanager.transfer(source_file, target_url_full) + + + + def __get_launch_method(self, hostname, user=None): + """ returns desired execution method: ssh, aprun """ + if user == None: user = self.__discover_ssh_user(hostname) + host = "" + if user!=None and user!="": + logger.debug("discovered user: " + user) + host = user + "@" + hostname + else: + host = hostname + gsissh_available = False + try: + cmd = "gsissh " + host + " /bin/date" + logger.debug("Execute: " + cmd) + gsissh_available = (subprocess.call(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)==0) + except: + pass + + ssh_available = False + try: + cmd = "ssh " + host + " /bin/date" + logger.debug("Execute: " + cmd) + ssh_available = (subprocess.call(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)==0) + except: + pass + + launch_method = "ssh" + if ssh_available == False and gsissh_available == True: + launch_method="gsissh" + else: + launch_method="ssh" + logger.info("SSH: %r GSISSH: %r Use: %s"%(ssh_available, gsissh_available, launch_method)) + return launch_method + + + def __discover_ssh_user(self, hostname): + # discover username + user = None + ssh_config = os.path.join(os.path.expanduser("~"), ".ssh/config") + ssh_config_file = open(ssh_config, "r") + lines = ssh_config_file.readlines() + for i in range(0, len(lines)): + line = lines[i] + if line.find(hostname)>0: + for k in range(i + 1, len(lines)): + sub_line = lines[k] + if sub_line.startswith(" ")==True and sub_line.startswith("\t")==True: + break # configuration for next host + elif sub_line.find("User")!=-1: + stripped_sub_line = sub_line.strip() + user = stripped_sub_line.split()[1] + break + ssh_config_file.close() + return user + + def __print_traceback(self): + exc_type, exc_value, exc_traceback = sys.exc_info() + logger.debug("*** print_exception:", + exc_info=(exc_type, exc_value, exc_traceback)) + #traceback.print_exception(exc_type, exc_value, exc_traceback, + # limit=2, file=sys.stdout) + + def __repr__(self): + return self.pilot_url + + # def __del__(self): + # """ BJ is not cancelled when object terminates + # Application can reconnect to BJ via pilot url later on""" + # pass + # #self.cancel() + + + + +class subjob(api.base.subjob): + + def __init__(self, coordination_url=None, subjob_url=None): + """Constructor""" + + self.coordination_url = coordination_url + if subjob_url!=None: + self.job_url = subjob_url[subjob_url.index("bigjob"):] + if self.coordination_url==None: + self.coordination_url, self.job_url=self.__parse_subjob_url(subjob_url) + self.uuid = self.__get_sj_id(subjob_url) + self.pilot_url = self.__get_pilot_url(subjob_url) + if self.pilot_url.startswith("bigjob"): + self.pilot_url=os.path.join(self.coordination_url, self.pilot_url) + + self.bj = bigjob(pilot_url=self.pilot_url) + logger.debug("Reconnect SJ: %s Pilot %s"%(self.job_url, self.pilot_url)) + else: + self.uuid = "sj-" + str(get_uuid()) + self.job_url = None + self.pilot_url = None + self.bj = None + + + def get_url(self): + if self.job_url==None: + self.job_url=self.__get_subjob_url(self.pilot_url) + return self.bj._get_subjob_url(self.job_url) + + + def submit_job(self, pilot_url, jd): + """ submit subjob to referenced bigjob """ + if self.job_url==None: + 
self.job_url=self.__get_subjob_url(pilot_url) + + if self.pilot_url==None: + self.pilot_url = pilot_url + self.bj=_pilot_url_dict[pilot_url] + self.bj._add_subjob(pilot_url, jd, self.job_url, self.uuid) + + + def get_state(self, pilot_url=None): + """ duck typing for saga.job """ + if self.pilot_url==None: + self.pilot_url = pilot_url + self.bj=_pilot_url_dict[pilot_url] + return self.bj._get_subjob_state(self.job_url) + + + def cancel(self, pilot_url=None): + logger.debug("delete job: " + self.job_url) + if self.pilot_url==None: + self.pilot_url = pilot_url + self.bj=_pilot_url_dict[pilot_url] + if str(self.bj.get_state())=="Running": + self.bj._delete_subjob(self.job_url) + + + def get_exe(self, pilot_url=None): + if self.pilot_url==None: + self.pilot_url = pilot_url + self.bj=_pilot_url_dict[pilot_url] + sj = self.bj._get_subjob_details(self.job_url) + return sj["Executable"] + + + def get_details(self, pilot_url=None): + if self.pilot_url==None: + self.pilot_url = pilot_url + self.bj=_pilot_url_dict[pilot_url] + sj = self.bj._get_subjob_details(self.job_url) + return sj + + + def get_arguments(self, pilot_url=None): + if self.pilot_url==None: + self.pilot_url = pilot_url + self.bj=_pilot_url_dict[pilot_url] + sj = self.bj.get_subjob_details(self.job_url) + #logger.debug("Subjob details: " + str(sj)) + arguments="" + for i in sj["Arguments"]: + arguments = arguments + " " + i + return arguments + + + def __repr__(self): + if(self.job_url==None): + return "None" + else: + return self.job_url + + ########################################################################### + # Internal and protected methods + + def __get_sj_id(self, job_url): + start = job_url.index("sj-") + return job_url[start:] + + + def __get_pilot_url(self, job_url): + end =job_url.index(":jobs") + + # Make sure that DB type is appended + surl = SAGAUrl(job_url) + query = surl.query + pilot_url=job_url[:end] + if query!=None and query !="": + pilot_url = pilot_url + "?"+query + return pilot_url + + + def __get_subjob_url(self, pilot_url): + if pilot_url.find("bigjob")>1: + pilot_url = pilot_url[pilot_url.find("bigjob"):] + if pilot_url.endswith("/"): + pilot_url = pilot_url[:-1] + self.job_url = pilot_url + ":jobs:" + str(self.uuid) + return self.job_url + + def __parse_subjob_url(self, subjob_url): + #pdb.set_trace() + subjob_saga_url = SAGAUrl(subjob_url) + dbtype = subjob_saga_url.query + coordination = subjob_url[:subjob_url.index("bigjob")] + if dbtype!=None: + coordination = os.path.join(coordination, "?"+dbtype) + sj_url = subjob_saga_url.path[1:] + +# sj_url = subjob_url[subjob_url.find("bigjob"):] +# if sj_url.find("/") > 0 or dbtype!=None or dbtype!="": +# comp = sj_url.split("/") +# sj_url = comp[0] +# if comp[1].find("dbtype")>0: +# dbtype=comp[1][comp[1].find("dbtype"):] + + logger.debug("Parsed URL - Coordination: %s Pilot: %s"%(coordination, sj_url)) + return coordination, sj_url + + +############################################################################### +## Properties for description class +# + +def input_data(): + doc = "List of input data units." + def fget(self): + return self._input_data + def fset(self, val): + self._input_data = val + def fdel(self, val): + self._input_data = None + return locals() + +def output_data(): + doc = "List of output data units." 
+ def fget(self): + return self._output_data + def fset(self, val): + self._output_data = val + def fdel(self, val): + self._output_data = None + return locals() + + +class description(SAGAJobDescription): + """ Sub-job description """ + ##input_data = property(**input_data()) + ##output_data = property(**output_data()) + ##environment = {} + + # -------------------------------------------------------------------------- + # + def __init__(self): + saga.job.Description.__init__(self) + #self.attributes_extensible_ (True) + + # Extend description class by Pilot-Data relevant attributes + self._output_data = None + self._input_data = None + + import saga.attributes as sa + + self._attributes_extensible (True) + self._attributes_camelcasing (True) + + + self._attributes_register ("InputData", None, sa.ANY, sa.VECTOR, sa.WRITEABLE) + self._attributes_register ("OutputData", None, sa.ANY, sa.VECTOR, sa.WRITEABLE) + + self._attributes_set_getter ("InputData", self._get_input_data ) + self._attributes_set_getter ("OutputData", self._get_output_data) + + # -------------------------------------------------------------------------- + # + def _get_input_data (self) : + print "get caled. returning: %s" % self.input_data + return self.input_data + + # -------------------------------------------------------------------------- + # + def _get_output_data (self) : + return self.output_data + + + diff --git a/bigjob/job_plugin/ec2ssh.py b/bigjob/job_plugin/ec2ssh.py index f0684f5c..413824b1 100644 --- a/bigjob/job_plugin/ec2ssh.py +++ b/bigjob/job_plugin/ec2ssh.py @@ -119,7 +119,7 @@ def __init__(self, job_description, resource_url, pilot_compute_description): path=path) else: aws_region = None - if self.pilot_compute_description.has_key("region"): + if "region" in self.pilot_compute_description: region = self.pilot_compute_description["region"] logger.debug("Connect to region: %s"%(str(region))) aws_region = boto.ec2.get_region(region, @@ -230,9 +230,9 @@ def cancel(self): # private methods def __print_traceback(self): exc_type, exc_value, exc_traceback = sys.exc_info() - print "*** print_tb:" + print("*** print_tb:") traceback.print_tb(exc_traceback, limit=1, file=sys.stdout) - print "*** print_exception:" + print("*** print_exception:") traceback.print_exception(exc_type, exc_value, exc_traceback, limit=2, file=sys.stdout) @@ -242,4 +242,4 @@ def __print_traceback(self): ec2_service = Service("ec2+ssh://aws.amazon.com") j = ec2_service.create_job("blas") j.run() - print j.get_state() + print(j.get_state()) diff --git a/bigjob/job_plugin/ec2ssh.py.bak b/bigjob/job_plugin/ec2ssh.py.bak new file mode 100644 index 00000000..f0684f5c --- /dev/null +++ b/bigjob/job_plugin/ec2ssh.py.bak @@ -0,0 +1,245 @@ +#!/usr/bin/env python + +from bigjob import logger +import os +import uuid +import time +import traceback +import sys +from boto.ec2.connection import EC2Connection +from boto.ec2.regioninfo import RegionInfo +import boto.ec2 + +import saga + +############################################################################### +# EC2 General +PLACEMENT_GROUP=None +SECURITY_GROUP="default" + +# VM/Image specific configurations +# Recommendation Ubuntu > 1104 +# apt-get install gcc python-all-dev git subversion vim +# EC2_AMI_ID="ami-c7943cae" # ami-82fa58eb official Amazon Ubuntu 12.04 LTS (requires dev tools installation) +# EC2_AMI_ID="ami-d7f742be" +# EC2_USERNAME="ubuntu" +# EC2_KEYNAME="lsu-keypair" +# EC2_KEYNAME="MyKey" + +# Authentication +# Please use ~/.boto file to configure your security credentials (if 
possible) +# see http://boto.readthedocs.org/en/latest/boto_config_tut.html +# +# [Credentials] +# aws_access_key_id = +# aws_secret_access_key = +# +# Alternatively you can use these two variables +AWS_ACCESS_KEY_ID=None +AWS_SECRET_ACCESS_KEY=None + + + + +class State: + UNKNOWN="unknown" + PENDING="pending" + RUNNING="running" + + +class Service(object): + """ Plugin for Amazon EC2 and EUCA + + Manages endpoint in the form of: + + ec2+ssh:// + euca+ssh:// + """ + + def __init__(self, resource_url, pilot_compute_description=None): + """Constructor""" + self.resource_url = resource_url + self.pilot_compute_description = pilot_compute_description + + + def create_job(self, job_description): + j = Job(job_description, self.resource_url, self.pilot_compute_description) + return j + + + def __del__(self): + pass + + + + +class Job(object): + """ Plugin for Amazon EC2 + + Starts VM and executes BJ agent on this VM + + + Eucalyptus on FutureGrid uses a self-signed certificate, which 1) needs to be added to boto configuration + or 2) certificate validation needs to be disabled. + """ + + def __init__(self, job_description, resource_url, pilot_compute_description): + + self.job_description = job_description + logger.debug("URL: " + str(resource_url) + " Type: " + str(type(resource_url))) + self.resource_url = saga.Url(str(resource_url)) + self.pilot_compute_description = pilot_compute_description + + self.id="bigjob-" + str(uuid.uuid1()) + self.network_ip=None + + self.ec2_conn=None + + if self.resource_url.scheme == "euca+ssh" or self.resource_url.scheme == "nova+ssh": + host = self.resource_url.host + path = "/services/Eucalyptus" + if self.resource_url.path!=None: + path = self.resource_url.path + port = 8773 + if self.resource_url.port != None: + port = self.resource_url.port + region = None + logger.debug("Host: %s, Path: %s, Port: %d"%(host, path, port)) + if self.resource_url.scheme == "euca+ssh": + region = RegionInfo(name="eucalyptus", endpoint=host) + elif self.resource_url.scheme == "nova+ssh": + region = RegionInfo(name="openstack", endpoint=host) + logger.debug("Access Key: %s Secret: %s"%(self.pilot_compute_description["access_key_id"], + self.pilot_compute_description["secret_access_key"])) + self.ec2_conn = EC2Connection(aws_access_key_id=self.pilot_compute_description["access_key_id"], + aws_secret_access_key=self.pilot_compute_description["secret_access_key"], + region=region, + is_secure=False, + port=port, + path=path) + else: + aws_region = None + if self.pilot_compute_description.has_key("region"): + region = self.pilot_compute_description["region"] + logger.debug("Connect to region: %s"%(str(region))) + aws_region = boto.ec2.get_region(region, + aws_access_key_id=self.pilot_compute_description["access_key_id"], + aws_secret_access_key=self.pilot_compute_description["secret_access_key"] + ) + + + self.ec2_conn = EC2Connection(aws_access_key_id=self.pilot_compute_description["access_key_id"], + aws_secret_access_key=self.pilot_compute_description["secret_access_key"], + region = aws_region) + + self.instance = None + + + def run(self): + """ Start VM and start BJ agent via SSH on VM """ + + """ Map fields of Pilot description to EC2 API + { "vm_id":"ami-d7f742be", + "vm_ssh_username":"ubuntu", + "vm_ssh_keyname":"MyKey", + "vm_ssh_keyfile":"", + "vm_type":"t1.micro", + "access_key_id":"xxx", + "secret_access_key":"xxx" + } + """ + + reservation = self.ec2_conn.run_instances(self.pilot_compute_description["vm_id"], + key_name=self.pilot_compute_description["vm_ssh_keyname"], + 
instance_type=self.pilot_compute_description["vm_type"], + security_groups=[SECURITY_GROUP]) + + self.instance = reservation.instances[0] + self.instance_id = self.instance.id + logger.debug("Started EC2/Eucalyptus/Nova instance: %s"%self.instance_id) + time.sleep(5) + self.wait_for_running() + + if self.resource_url.scheme != "euca+ssh" and self.resource_url.scheme != "nova+ssh": + self.ec2_conn.create_tags([self.instance_id], {"Name": self.id}) + + + self.network_ip = self.instance.ip_address + url = "ssh://" + str(self.network_ip) + logger.debug("Connect to: %s"%(url)) + + + # Submit job + ctx = saga.Context("SSH") + #ctx.type = saga.Context.SSH + ctx.user_id = self.pilot_compute_description["vm_ssh_username"] + ctx.user_key = self.pilot_compute_description["vm_ssh_keyfile"] + + session = saga.Session() + session.add_context(ctx) + + TRIAL_MAX=30 + trials=0 + while trials < TRIAL_MAX: + try: + js = saga.job.Service(url, session=session) + logger.debug("Job Description Type: " + str(type(self.job_description))) + job = js.create_job(self.job_description) + logger.debug("Attempt: %d, submit pilot job to: %s "%(trials,str(url))) + job.run() + if job.get_state()==saga.job.FAILED: + logger.warning("Submission failed.") + trials = trials + 1 + time.sleep(30) + continue + else: + break + except: + exc_type, exc_value, exc_traceback = sys.exc_info() + logger.warning("Submission failed: " + str(exc_value)) + #self.__print_traceback() + trials = trials + 1 + time.sleep(30) + if trials == TRIAL_MAX: + raise Exception("Submission of agent failed.") + + logger.debug("Job State : %s" % (job.get_state())) + + + + def wait_for_running(self): + while self.get_state()!=State.RUNNING: + time.sleep(5) + + + def get_state(self): + result = State.UNKNOWN + try: + self.instance.update() + result=self.instance.state + except: + logger.warning("Instance not reachable/active yet...") + return result + + + def cancel(self): + self.instance.terminate() + + + ########################################################################### + # private methods + def __print_traceback(self): + exc_type, exc_value, exc_traceback = sys.exc_info() + print "*** print_tb:" + traceback.print_tb(exc_traceback, limit=1, file=sys.stdout) + print "*** print_exception:" + traceback.print_exception(exc_type, exc_value, exc_traceback, + limit=2, file=sys.stdout) + + + +if __name__ == "__main__": + ec2_service = Service("ec2+ssh://aws.amazon.com") + j = ec2_service.create_job("blas") + j.run() + print j.get_state() diff --git a/bigjob/job_plugin/gcessh.py b/bigjob/job_plugin/gcessh.py index 950e1f4e..b3d8c5ec 100644 --- a/bigjob/job_plugin/gcessh.py +++ b/bigjob/job_plugin/gcessh.py @@ -77,15 +77,15 @@ def __init__(self, job_description, saga_url, pilot_compute_description): self.saga_url = saga_url self.pilot_compute_description = pilot_compute_description self.image_url = GCE_IMAGE_URL - if self.pilot_compute_description.has_key("vm_id"): + if "vm_id" in self.pilot_compute_description: self.image_url = self.pilot_compute_description["vm_id"] self.machine_type = "https://www.googleapis.com/compute/v1beta12/projects/bigjob-pilot/machine-types/n1-standard-1" - if self.pilot_compute_description.has_key("vm_type"): + if "vm_type" in self.pilot_compute_description: self.machine_type = self.pilot_compute_description["vm_type"] self.location = "https://www.googleapis.com/compute/v1beta12/projects/bigjob-pilot/zones/us-east1-a" - if self.pilot_compute_description.has_key("vm_location"): + if "vm_location" in self.pilot_compute_description: 
self.location = self.pilot_compute_description["vm_location"] @@ -180,7 +180,7 @@ def run(self): js = saga.job.Service(url, session=session) job = js.create_job(self.job_description) - print "Submit pilot job to: " + str(url) + print("Submit pilot job to: " + str(url)) TRIAL_MAX=15 trials=0 @@ -199,7 +199,7 @@ def run(self): - print "Job State : %s" % (job.get_state()) + print("Job State : %s" % (job.get_state())) def wait_for_running(self): @@ -234,4 +234,4 @@ def __get_instance_resource(self): j = gce_service.create_job(job_description) gce.run() - print gce.get_state() + print(gce.get_state()) diff --git a/bigjob/job_plugin/gcessh.py.bak b/bigjob/job_plugin/gcessh.py.bak new file mode 100644 index 00000000..950e1f4e --- /dev/null +++ b/bigjob/job_plugin/gcessh.py.bak @@ -0,0 +1,237 @@ +#!/usr/bin/env python + +from bigjob import logger +from apiclient.discovery import build +from oauth2client.file import Storage +from oauth2client.client import OAuth2WebServerFlow +from oauth2client.tools import run +import httplib2 +import os +import uuid +import time + +import saga + +""" +AN OAUTH2 Client Id must be created at the Google API console at: + +https://code.google.com/apis/console/ + +=> API Access + +More information with respect to OAUTH: https://developers.google.com/compute/docs/api/how-tos/authorization +""" +OAUTH2_CLIENT_ID='1004462711324-55akehip32m59u6omdfrt9s8u8ehb0hm.apps.googleusercontent.com' +OAUTH2_CLIENT_SECRET='EIMML1W7anu0XijVghws0DY-' + +GCE_PROJECT_ID='bigjob-pilot' + +""" +Google Compute Engine currently provides a default image with Ubuntu 12.04 + +To use BJ, a custom image containing gcc and build essentials needs to be +created. + +$ apt-get update +$ apt-get install gcc python-all-dev + +""" +GCE_IMAGE_URL="https://www.googleapis.com/compute/v1beta12/projects/bigjob-pilot/images/bigjob-image" + +class gce_states: + PROVISIONING="PROVISIONING" + STAGING="STAGING" + RUNNING="RUNNING" + + +class Service(object): + """ Plugin for Google Compute Engine + + Manages endpoint in the form of: + + gce+ssh://api.google.com + """ + + def __init__(self, resource_url, pilot_compute_description): + """Constructor""" + self.resource_url = resource_url + self.pilot_compute_description = pilot_compute_description + + def create_job(self, job_description): + j = Job(job_description, self.resource_url, self.pilot_compute_description) + return j + + + def __del__(self): + pass + + + + +class Job(object): + + + def __init__(self, job_description, saga_url, pilot_compute_description): + + self.job_description = job_description + self.saga_url = saga_url + self.pilot_compute_description = pilot_compute_description + self.image_url = GCE_IMAGE_URL + if self.pilot_compute_description.has_key("vm_id"): + self.image_url = self.pilot_compute_description["vm_id"] + + self.machine_type = "https://www.googleapis.com/compute/v1beta12/projects/bigjob-pilot/machine-types/n1-standard-1" + if self.pilot_compute_description.has_key("vm_type"): + self.machine_type = self.pilot_compute_description["vm_type"] + + self.location = "https://www.googleapis.com/compute/v1beta12/projects/bigjob-pilot/zones/us-east1-a" + if self.pilot_compute_description.has_key("vm_location"): + self.location = self.pilot_compute_description["vm_location"] + + + self.id="bigjob-" + str(uuid.uuid1()) + self.network_ip=None + + # Do OAUTH authentication + storage = Storage('gce.dat') + self.credentials = storage.get() + if self.credentials is None or self.credentials.invalid == True: + flow = OAuth2WebServerFlow( + 
client_id=OAUTH2_CLIENT_ID, + client_secret=OAUTH2_CLIENT_SECRET, + scope='https://www.googleapis.com/auth/compute', + user_agent='bigjob-client/1.0') + + self.credentials = run(flow, storage) + + + + def run(self): + request_dict = { + "kind": "compute#instance", + "disks": [ + { + "kind": "compute#instanceDisk", + "type": "PERSISTENT", + "mode": "READ", + "deviceName": "reference-genome", + "source": "https://www.googleapis.com/compute/v1beta12/projects/bigjob-pilot/disks/reference-genome" + } + ], + "networkInterfaces": [ + { + "kind": "compute#instanceNetworkInterface", + "accessConfigs": [ + { + "name": "External NAT", + "type": "ONE_TO_ONE_NAT" + } + ], + "network": "https://www.googleapis.com/compute/v1beta12/projects/bigjob-pilot/networks/default" + } + ], + "serviceAccounts": [ + { + "kind": "compute#serviceAccount", + "email": "default", + "scopes": [ + "https://www.googleapis.com/auth/userinfo.email", + "https://www.googleapis.com/auth/compute", + "https://www.googleapis.com/auth/devstorage.full_control" + ] + } + ], + #"zone": "https://www.googleapis.com/compute/v1beta12/projects/bigjob-pilot/zones/us-east1-a", + "zone": self.location, + #"machineType": "https://www.googleapis.com/compute/v1beta12/projects/bigjob-pilot/machine-types/n1-standard-1", + "machineType": self.machine_type, + "name": self.id, + "image": self.image_url + } + + http = httplib2.Http() + http = self.credentials.authorize(http) + gce = build("compute", "v1beta12", http=http) + #result = gce.instances().get(instance="bigjob-pilot", project="bigjob-pilot").execute() + gce.instances().insert(project=GCE_PROJECT_ID, body=request_dict).execute() + + time.sleep(15) # wait for startup + #wait for compute instance to become active + self.wait_for_running() + + # spawn BJ agent via SSH + compute_instance_details = self.__get_instance_resource() + logger.debug("Compute Instance Details: " + str(compute_instance_details)) + self.network_ip = compute_instance_details["networkInterfaces"][0]["accessConfigs"][0]['natIP'] + url = "ssh://" + str(self.network_ip) + logger.debug("Connect to: %s"%(url)) + + + # Submit job + ctx = saga.Context("SSH") + #ctx.type = saga.Context.SSH + ctx.user_id = self.pilot_compute_description["vm_ssh_username"] + ctx.user_key = self.pilot_compute_description["vm_ssh_keyfile"] + #js.session.contexts = [ctx] + + session = saga.Session() + session.add_context(ctx) + + js = saga.job.Service(url, session=session) + + job = js.create_job(self.job_description) + print "Submit pilot job to: " + str(url) + + TRIAL_MAX=15 + trials=0 + while trials < TRIAL_MAX: + try: + logger.debug("Attempt: %d, submit pilot job to: %s "%(trials,str(url))) + job.run() + break + except: + trials = trials + 1 + time.sleep(10) + if trials == TRIAL_MAX: + raise Exception("Submission of agent failed.") + + logger.debug("Job State : %s" % (job.get_state())) + + + + print "Job State : %s" % (job.get_state()) + + + def wait_for_running(self): + while self.get_state()!=gce_states.RUNNING: + time.sleep(5) + + + def get_state(self): + result=self.__get_instance_resource() + return result["status"] + + + def cancel(self): + http = httplib2.Http() + http = self.credentials.authorize(http) + gce = build("compute", "v1beta12", http=http) + gce.instances().delete(project=GCE_PROJECT_ID, instance=self.id).execute() + + + def __get_instance_resource(self): + http = httplib2.Http() + http = self.credentials.authorize(http) + gce = build("compute", "v1beta12", http=http) + result = gce.instances().get(project=GCE_PROJECT_ID, 
instance=self.id).execute() + return result + + + + +if __name__ == "__main__": + gce_service = Service("gce+ssh://api.google.com") + j = gce_service.create_job(job_description) + + gce.run() + print gce.get_state() diff --git a/bigjob/job_plugin/slurmssh.py b/bigjob/job_plugin/slurmssh.py index 7bb7ef57..8e96f8f9 100644 --- a/bigjob/job_plugin/slurmssh.py +++ b/bigjob/job_plugin/slurmssh.py @@ -47,7 +47,7 @@ def __init__(self, job_description, resource_url, pilot_compute_description=None self.working_directory = pilot_compute_description["working_directory"] ### convert walltime in minutes to SLURM representation of time ### walltime_slurm="1:00:00" - if pilot_compute_description.has_key("walltime"): + if "walltime" in pilot_compute_description: hrs=int(pilot_compute_description["walltime"])/60 minu=int(pilot_compute_description["walltime"])%60 walltime_slurm=""+str(hrs)+":"+str(minu)+":00" @@ -197,4 +197,4 @@ def cancel(self): slurm_service = Service("slurm+ssh://stampede.tacc.utexas.edu") j = slurm_service.create_job("") j.run() - print j.get_state() + print(j.get_state()) diff --git a/bigjob/job_plugin/slurmssh.py.bak b/bigjob/job_plugin/slurmssh.py.bak new file mode 100644 index 00000000..7bb7ef57 --- /dev/null +++ b/bigjob/job_plugin/slurmssh.py.bak @@ -0,0 +1,200 @@ +#!/usr/bin/env python + +import textwrap +import re +import os +import pdb + +from bigjob import logger +import bigjob + +import saga + +class Service(object): + """ Plugin for SlURM """ + + def __init__(self, resource_url, pilot_compute_description=None): + """Constructor""" + self.resource_url = resource_url + self.pilot_compute_description = pilot_compute_description + + + def create_job(self, job_description): + j = Job(job_description, self.resource_url, self.pilot_compute_description) + return j + + + def __del__(self): + pass + +class Job(object): + """Constructor""" + def __init__(self, job_description, resource_url, pilot_compute_description=None): + self.job_description=job_description + self.bootstrap_script = self.job_description.arguments[2] + self.job_id = "" + self.resource_url = resource_url + self.resource_url.scheme="ssh" + logger.debug("BigJob/SLURM: Parsing job description") + if pilot_compute_description == None: + pilot_compute_description={} + pilot_compute_description['queue'] = job_description.queue + pilot_compute_description['project'] = job_description.project + pilot_compute_description['working_directory'] = job_description.working_directory + pilot_compute_description['walltime'] = job_description.wall_time_limit + pilot_compute_description['number_of_processes'] = job_description.total_cpu_count + + self.working_directory = pilot_compute_description["working_directory"] + ### convert walltime in minutes to SLURM representation of time ### + walltime_slurm="1:00:00" + if pilot_compute_description.has_key("walltime"): + hrs=int(pilot_compute_description["walltime"])/60 + minu=int(pilot_compute_description["walltime"])%60 + walltime_slurm=""+str(hrs)+":"+str(minu)+":00" + + logger.debug("BigJob/SLURM: generate bootstrap script") + self.bootstrap_script = textwrap.dedent("""import sys +import os +import urllib +import sys +import time +import textwrap + +sbatch_file_name="bigjob_slurm_ssh" + +sbatch_file = open(sbatch_file_name, "w") +sbatch_file.write("#!/bin/bash") +sbatch_file.write("\\n") +sbatch_file.write("#SBATCH -n %s") +sbatch_file.write("\\n") +sbatch_file.write("#SBATCH -J bigjob_slurm") +sbatch_file.write("\\n") +sbatch_file.write("#SBATCH -t %s") +sbatch_file.write("\\n") 
+sbatch_file.write("#SBATCH -A %s") +sbatch_file.write("\\n") +sbatch_file.write("#SBATCH -o %s/stdout-bigjob_agent.txt") +sbatch_file.write("\\n") +sbatch_file.write("#SBATCH -e %s/stderr-bigjob_agent.txt") +sbatch_file.write("\\n") +sbatch_file.write("#SBATCH -p %s") +sbatch_file.write("\\n") + +sbatch_file.write("cd %s") +sbatch_file.write("\\n") +sbatch_file.write("python -c XX" + textwrap.dedent(\"\"%s\"\") + "XX") +sbatch_file.close() +os.system( "sbatch " + sbatch_file_name) +""") % (str(pilot_compute_description["number_of_processes"]),str(walltime_slurm), str(pilot_compute_description["project"]), pilot_compute_description["working_directory"], pilot_compute_description["working_directory"], pilot_compute_description["queue"], pilot_compute_description["working_directory"], self.bootstrap_script) + ### escaping characters + self.bootstrap_script = self.bootstrap_script.replace("\"","\\\"") + self.bootstrap_script = self.bootstrap_script.replace("\\\\","\\\\\\\\\\") + self.bootstrap_script = self.bootstrap_script.replace("XX","\\\\\\\"") + self.bootstrap_script = "\"" + self.bootstrap_script+ "\"" + logger.debug(self.bootstrap_script) + + + def run(self): + jd = saga.job.Description() + jd.arguments = ["-c", self.bootstrap_script] + jd.executable = "python" + jd.working_directory = self.working_directory + jd.output = "saga_job_submission.out" + jd.error = "saga_job_submission.err" + # Submit job + js = None + js = saga.job.Service(self.resource_url) + slurmsshjob = js.create_job(jd) + logger.debug("Submit pilot job to: " + str(self.resource_url)) + slurmsshjob.run() + slurmsshjob.wait() + logger.debug("BigJob/SLURM: SSH run job finished") + + saga_surl = saga.Url(self.resource_url) + sftp_url = "sftp://" + if saga_surl.username!=None and saga_surl.username!="": + sftp_url = sftp_url + str(saga_surl.username) + "@" + sftp_url = sftp_url + saga_surl.host + "/" + outfile = sftp_url + self.working_directory+'/saga_job_submission.out' + logger.debug("BigJob/SLURM: get outfile: " + outfile) + out = saga.filesystem.File(outfile) + out.copy("sftp://localhost/"+os.getcwd() + "/tmpout") + errfile = sftp_url + self.working_directory+'/saga_job_submission.err' + err = saga.filesystem.File(errfile) + err.copy("sftp://localhost/"+os.getcwd() + "/tmperr") + + tempfile=open(os.getcwd() + "/tmpout") + outstr = tempfile.read().rstrip() + tempfile.close() + os.remove(os.getcwd() + "/tmpout") + + tempfile=open(os.getcwd() + "/tmperr") + errstr = tempfile.read().rstrip() + tempfile.close() + os.remove(os.getcwd() + "/tmperr") + + logger.debug("Output - \n" + str(outstr)) + if ((outstr).split("\n")[-1]).split()[0] == "Submitted": + self.job_id=((outstr).split("\n")[-1]).split()[3] + logger.debug("SLURM JobID: " + str(self.job_id)) + if self.job_id==None or self.job_id=="": + raise Exception("BigJob submission via slurm+ssh:// failed: %s %s" % (outstr,errstr)) + + + def get_state(self): + jd = saga.job.Description() + jd.executable = "squeue" + jd.arguments=["-j",self.job_id] + jd.output="jobstate.out" + jd.working_directory = self.working_directory + # connect to the local job service + js = saga.job.service(self.resource_url); + # submit the job + job = js.create_job(jd) + job.run() + job.wait() + # print the job's output + + outfile = 'sftp://'+saga.Url(self.resource_url).host+self.working_directory+'/jobstate.out' + out = saga.filesystem.File(outfile) + out.move("sftp://localhost/"+os.getcwd() + "/tmpstate") + + tempfile=open(os.getcwd() + "/tmpstate") + output = tempfile.read().rstrip() + 
tempfile.close() + os.remove(os.getcwd() + "/tmpstate") + + state=output.split("\n")[-1].split()[4] + + if state.upper() == "R": + state = "Running" + elif state.upper() == "CD" or state.upper() == "CF" or state.upper() == "CG": + state = "Done" + elif state.upper() == "PD": + state = "Queue" + else: + state = "Unknown" + return state + + + def cancel(self): + logger.debug("Cancel SLURM job") + jd = saga.job.Description() + jd.executable = "scancel" + jd.arguments = [self.job_id] + # connect to the local job service + js = saga.job.Service(self.resource_url); + # submit the job + job = js.create_job(jd) + job.run() + # wait for the job to complete + job.wait() + + + + +if __name__ == "__main__": + slurm_service = Service("slurm+ssh://stampede.tacc.utexas.edu") + j = slurm_service.create_job("") + j.run() + print j.get_state() diff --git a/bigjob_dynamic/many_job.py b/bigjob_dynamic/many_job.py index 5ae0018d..8c9d0d30 100644 --- a/bigjob_dynamic/many_job.py +++ b/bigjob_dynamic/many_job.py @@ -6,6 +6,7 @@ import pdb import sys import os +from functools import reduce sys.path.append(os.path.dirname( __file__ )) import getopt import time @@ -13,7 +14,7 @@ import socket import traceback -import Queue +import queue import threading import logging import time @@ -52,7 +53,7 @@ def __init__(self, bigjob_list, coordination_url): self.subjob_bigjob_dict = {} # queue contains unscheduled subjobs - self.subjob_queue = Queue.Queue() + self.subjob_queue = queue.Queue() # submit bigjobs to resources self.__init_bigjobs() @@ -139,7 +140,7 @@ def __cleanup_resources(self): if i["to_be_terminated"]==True: bj = i["bigjob"] total_cores = int(i["number_of_processes"]) - if i["free_cores"]==total_cores and not i.has_key("bj_stopped"): + if i["free_cores"]==total_cores and "bj_stopped" not in i: logging.debug("***Stop BigJob: " + str(bj.pilot_url)) # release resources of pilot job bj.stop_pilot_job() @@ -240,7 +241,7 @@ def __check_subjobs_states(self): def __free_resources(self, subjob): """free resources taken by subjob""" - if(self.subjob_bigjob_dict.has_key(subjob)): + if(subjob in self.subjob_bigjob_dict): logging.debug("job: " + str(subjob) + " done - free resources") bigjob = self.subjob_bigjob_dict[subjob] lock = bigjob["lock"] @@ -250,7 +251,7 @@ def __free_resources(self, subjob): bigjob["free_cores"]=free_cores del(self.subjob_bigjob_dict[subjob]) lock.release() - print "Freed resource - new state: Big Job: " + bigjob["bigjob"].pilot_url + " Cores: " + "%s"%free_cores + "/" + str(int(bigjob["number_of_processes"])) + print("Freed resource - new state: Big Job: " + bigjob["bigjob"].pilot_url + " Cores: " + "%s"%free_cores + "/" + str(int(bigjob["number_of_processes"]))) def __reschedule_subjobs_thread(self): """ periodically checks subjob_queue for unscheduled subjobs @@ -284,7 +285,7 @@ def __get_free_cores(self, bigjob): def __get_total_free_cores(self): """ get's the total number of free cores from all active bigjobs """ - free_cores = map(self.__get_free_cores, self.bigjob_list) + free_cores = list(map(self.__get_free_cores, self.bigjob_list)) #print "Free cores: " + str(free_cores) if len(free_cores)>0: total_free_cores = reduce(lambda x, y: x + y, free_cores) @@ -315,7 +316,7 @@ def print_stats(self, times, description): variance += (i - mean)**2 variance /= (n-1) variance = math.sqrt(variance) - print description + " Average: " + str(mean) + " Stdev: " + str(variance) + print(description + " Average: " + str(mean) + " Stdev: " + str(variance)) except: pass @@ -403,7 +404,7 @@ def 
__repr__(self): """ Test Job Submission via ManyJob abstraction """ if __name__ == "__main__": try: - print "Test ManyJob" + print("Test ManyJob") # create job description jd = SAGAJobDescription() jd.executable = "/bin/date" @@ -418,15 +419,15 @@ def __repr__(self): # {"resource_url" : "gram://qb1.loni.org/jobmanager-pbs", "number_nodes" : "64", "allocation" : "", "queue" : "workq", "bigjob_agent": "$(HOME)/src/REMDgManager/bigjob/advert_launcher.sh"}) resource_list = [] resource_list.append({"resource_url" : "gram://qb1.loni.org/jobmanager-pbs", "number_nodes" : "16", "allocation" : "", "queue" : "workq", "bigjob_agent": os.getcwd() + "/bigjob_agent_launcher.sh"}) - print "Create manyjob service " + print("Create manyjob service ") mjs = many_job_service(resource_list, None) - print "Create sub-job using manyjob " + str(mjs) + print("Create sub-job using manyjob " + str(mjs)) subjob = mjs.create_job(jd) - print "Run sub-job" + print("Run sub-job") subjob.run() - print "Wait for termination" + print("Wait for termination") subjob.wait() - print "Sub-job state: " + str(subjob.get_state()) + print("Sub-job state: " + str(subjob.get_state())) mjs.cancel() except: try: diff --git a/bigjob_dynamic/many_job.py.bak b/bigjob_dynamic/many_job.py.bak new file mode 100644 index 00000000..5ae0018d --- /dev/null +++ b/bigjob_dynamic/many_job.py.bak @@ -0,0 +1,436 @@ +#!/usr/bin/env python + +"""Dynamic BigJob (ManyJob): Manages multiple BigJob (on different resources). +Sub-jobs are distributed across the set of BJs managed by the dynamic BJ. +""" +import pdb +import sys +import os +sys.path.append(os.path.dirname( __file__ )) +import getopt +import time +import uuid +import socket +import traceback + +import Queue +import threading +import logging +import time +import math +import operator +import copy + + +import bigjob.bigjob_manager + +# Log everything, and send it to stderr. 
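The many_job.py hunk above applies the same Python 3 idioms used throughout this conversion: the Queue module is renamed to queue, reduce must now be imported from functools, map() gets wrapped in list() because it returns an iterator, dict.has_key() becomes the in operator, and print becomes a function. A minimal sketch of these idioms, using placeholder data rather than anything defined in this module:

    from functools import reduce   # reduce is no longer a builtin in Python 3
    import queue                   # Python 3 name of the Queue module

    pending = queue.Queue()
    resources = [{"free_cores": 4}, {"free_cores": 8}]
    pending.put(resources[0])

    free = list(map(lambda r: r["free_cores"], resources))  # map() is lazy in Python 3
    total = reduce(lambda x, y: x + y, free, 0)

    if "free_cores" in resources[0]:            # replaces dict.has_key()
        print("total free cores: %d" % total)   # print is a function in Python 3
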
+#logging.basicConfig(level=logging.DEBUG) + +COORDINATION_URL="advert://advert.cct.lsu.edu:8080" + +class many_job_service(object): + + def __init__(self, bigjob_list, coordination_url): + """ accepts resource list as key/value pair: + ( {"resource_url" : "gram://qb1.loni.org/jobmanager-pbs", "number_nodes" : "32", "allocation" : "loni_stopgap2", "queue" : "workq", "bigjob_agent": "$(HOME)/src/REMDgManager/bigjob/advert_launcher.sh", "walltime":1000}, + {"resource_url" : "gram://qb1.loni.org/jobmanager-pbs", "number_nodes" : "32", "allocation" : "loni_stopgap2", "queue" : "workq", "bigjob_agent": "$(HOME)/src/REMDgManager/bigjob/advert_launcher.sh", "walltime":1000}) + """ + self.uuid = uuid.uuid1() + + if coordination_url==None: + self.advert_host=COORDINATION_URL + else: + self.advert_host=coordination_url + + # list of resource dicts (1 dict per resource) + # will also store state of bigjob + self.bigjob_list=copy.deepcopy(bigjob_list) + + # state variable storing state of sub-jobs + self.active_subjob_list = [] + self.subjob_bigjob_dict = {} + + # queue contains unscheduled subjobs + self.subjob_queue = Queue.Queue() + + # submit bigjobs to resources + self.__init_bigjobs() + + # thread which tries to resubmit jobs + self.stop=threading.Event() + self.rescheduler_thread=threading.Thread(target=self.__reschedule_subjobs_thread) + self.rescheduler_thread.start() + + # last queue Size + self.last_queue_size = 0 + self.submisssion_times=[] + + def __init_bigjobs(self): + """ start on specified resources a bigjob """ + self.bigjob_list = self.__schedule_bigjobs() + for i in self.bigjob_list: + self.__start_bigjob(i) + + + def __start_bigjob(self, bj_dict): + """ private method - starts a bigjob on the defined resource """ + gram_url = bj_dict["resource_url"] + logging.debug("start bigjob at: " + gram_url) + bj = bigjob.bigjob_manager.bigjob(self.advert_host) + + if("processes_per_node" not in bj_dict or bj_dict["processes_per_node"] == 'None'): + ppn="1" + bj_dict["processes_per_node"] = ppn + else: + ppn=bj_dict["processes_per_node"] + + walltime = 3600 + if ("walltime" in bj_dict): + walltime=bj_dict["walltime"] + + working_directory = (os.getcwd()+"/agent") + if ("working_directory" in bj_dict): + working_directory=bj_dict["working_directory"] + + bj_filetransfer = None + if ("file_transfer" in bj_dict): + bj_filetransfer = bj_dict["file_transfer"] + + bj.start_pilot_job(gram_url, + None, + bj_dict["number_of_processes"], + bj_dict["queue"], + bj_dict["allocation"], + working_directory, + None, + walltime, + ppn, + filetransfers=bj_filetransfer) + bj_dict["bigjob"]=bj # store bigjob for later reference in dict + bj_dict["free_cores"]=int(bj_dict["number_of_processes"]) + bj_dict["to_be_terminated"]=False + # lock for modifying the number of free nodes + bj_dict["lock"] = threading.Lock() + + def add_resource(self, resource_dictionary): + """ adds bigjob described in resource_dictionary to resources """ + bj_dict = copy.deepcopy(resource_dictionary) + + self.__start_bigjob(bj_dict) + self.bigjob_list.append(bj_dict) + return bj_dict["bigjob"] + + + def remove_resource(self, bigjob): + """ remove bigjob from resource list of manyjob """ + # mark bigjob for termination (after all sub-jobs in bj are + # finished + bigjob["to_be_terminated"]=True + + + def __cleanup_resources(self): + """ called periodically from scheduling thread + terminates big-jobs which are marked and don't have + any running sub-jobs + """ + # iterate over copy of list, but remove from orig list + for i in 
self.bigjob_list[:]: + if i["to_be_terminated"]==True: + bj = i["bigjob"] + total_cores = int(i["number_of_processes"]) + if i["free_cores"]==total_cores and not i.has_key("bj_stopped"): + logging.debug("***Stop BigJob: " + str(bj.pilot_url)) + # release resources of pilot job + bj.stop_pilot_job() + i["bj_stopped"]=True + #self.bigjob_list.remove(i) + + + def get_resources(self): + """ returns list with bigjob dictionaries + for each managed bigjob 1 dictionary exists + """ + return self.bigjob_list + + + def list_bigjobs(self): + """ returns a list of bigjob objects """ + return [i["bigjob"] for i in self.bigjob_list] + + def __schedule_bigjobs(self): + """ prioritizes bigjob_list (bigjob with shortest expected delay will have index 0) """ + # no scheduling for now (start bigjob in the user specified order) + return self.bigjob_list + + def create_job (self, job_description): + subjob = sub_job(self, job_description, self.advert_host) + return subjob + + def __run_subjob(self, subjob): + # select appropriate bigjob + st = time.time() + bigjob_info = self.__schedule_subjob(subjob) + job = subjob.job + if bigjob_info == None: + return job + + # create subjob on bigjob + bj = bigjob_info["bigjob"] + + job.submit_job(bj.pilot_url, subjob.job_description) + self.submisssion_times.append(time.time()-st) + + # store reference of subjob for further bookkeeping + self.active_subjob_list.append(subjob) + self.subjob_bigjob_dict[subjob] = bigjob_info + logging.debug("Subjob submission time: " + str(time.time()-st) + " sec.") + return job + + def queue_subjob(self, subjob): + subjob.job = bigjob.bigjob_manager.subjob(self.advert_host) + self.subjob_queue.put(subjob) + return subjob.job + + def __schedule_subjob (self, subjob): + """ find resource (bigjob) for subjob + returns bigjob object """ + for i in self.bigjob_list: + bigjob = i["bigjob"] + lock = i["lock"] + lock.acquire() + free_cores = i["free_cores"] + bigjob_url = bigjob.pilot_url + state = bigjob.get_state_detail() + logging.debug("Big Job: " + bigjob_url + " Cores: " + "%s"%free_cores + "/" + + str(int(i["number_of_processes"])) + + " State: " + str(state) + " Terminated: " + str(i["to_be_terminated"]) + + " #Required Cores: " + subjob.job_description.number_of_processes + ) + if (state.lower() == "running" and free_cores >= int(subjob.job_description.number_of_processes) + and i["to_be_terminated"]==False): + logging.debug("FOUND match - dispatch to BigJob: " + bigjob_url) + free_cores = i["free_cores"] + free_cores = free_cores - int(subjob.job_description.number_of_processes) + i["free_cores"]=free_cores + lock.release() + return i + + lock.release() + + # no resource found + self.subjob_queue.put(subjob) + logging.debug("found no active resource for sub-job => (re-) queue it") + return None + + def __check_subjobs_states(self): + """iterate through all sub-jobs and check state""" + for i in self.active_subjob_list: + try: + #logging.debug("get job state") + state = i.job.get_state() + #logging.debug("check job state") + if self.__has_finished(state) == True: + #logging.debug("free resources") + self.__free_resources(i) + except: + exc_type, exc_value, exc_traceback = sys.exc_info() + traceback.print_exception(exc_type, exc_value, exc_traceback, + limit=2, file=sys.stderr) + + def __free_resources(self, subjob): + """free resources taken by subjob""" + if(self.subjob_bigjob_dict.has_key(subjob)): + logging.debug("job: " + str(subjob) + " done - free resources") + bigjob = self.subjob_bigjob_dict[subjob] + lock = bigjob["lock"] + 
lock.acquire() + free_cores = bigjob["free_cores"] + free_cores = free_cores + int(subjob.job_description.number_of_processes) + bigjob["free_cores"]=free_cores + del(self.subjob_bigjob_dict[subjob]) + lock.release() + print "Freed resource - new state: Big Job: " + bigjob["bigjob"].pilot_url + " Cores: " + "%s"%free_cores + "/" + str(int(bigjob["number_of_processes"])) + + def __reschedule_subjobs_thread(self): + """ periodically checks subjob_queue for unscheduled subjobs + if a unscheduled job exists it is scheduled + """ + + while True and self.stop.isSet()==False: + logging.debug("Reschedule Thread") + # check sub-job state + self.__check_subjobs_states() + # remove unneeded big-jobs + self.__cleanup_resources() + subjob = self.subjob_queue.get() + # check whether this is a real subjob object + if isinstance(subjob, sub_job): + self.__run_subjob(subjob) + if self.last_queue_size == self.subjob_queue.qsize() or self.__get_total_free_cores()==0: + time.sleep(2) # sleep 30 s + + logging.debug("Re-Scheduler terminated") + + + def __get_free_cores(self, bigjob): + """ return number of free cores if bigjob is active """ + #pdb.set_trace() + if (bigjob["bigjob"].get_state_detail().lower()=="running" + and bigjob["to_be_terminated"]==False): + return bigjob["free_cores"] + + return 0 + + def __get_total_free_cores(self): + """ get's the total number of free cores from all active bigjobs """ + free_cores = map(self.__get_free_cores, self.bigjob_list) + #print "Free cores: " + str(free_cores) + if len(free_cores)>0: + total_free_cores = reduce(lambda x, y: x + y, free_cores) + logging.debug("free_cores: " + str(free_cores) + " total_free_cores: " + str(total_free_cores)) + return total_free_cores + return 0 + + def cancel(self): + logging.debug("Cancel re-scheduler thread") + self.stop.set() + # put object in queue to unlock the get() operation + self.subjob_queue.put("dummy") + self.rescheduler_thread.join() + logging.debug("Cancel many-job: kill all bigjobs") + for i in self.bigjob_list: + bigjob = i["bigjob"] + bigjob.cancel() + self.print_stats(self.submisssion_times, "Submission Times") + + def print_stats(self, times, description): + try: + n = len(times) + sum = reduce(operator.add, times) + mean = sum/n + variance=0 + if n > 1: + for i in times: + variance += (i - mean)**2 + variance /= (n-1) + variance = math.sqrt(variance) + print description + " Average: " + str(mean) + " Stdev: " + str(variance) + except: + pass + + def __has_finished(self, state): + state = state.lower() + if state=="done" or state=="failed" or state=="canceled": + return True + else: + return False + + def __repr__(self): + return str(self.uuid) + + def __del__(self): + self.cancel() + + + + +class sub_job(object): + """ Class for controlling individual sub-jobs """ + + def __init__(self, manyjob, job_description, advert_host): + # store bigjob for reference + self.manyjob=manyjob + + # init sub-job via advert + self.job_description = job_description + self.job = None + + def run(self): + # select appropriate bigjob + self.job = self.manyjob.queue_subjob(self) + + def get_state(self): + try: + state = self.job.get_state() + return state + except: + #traceback.print_stack() + pass + return "Unknown" + + def get_arguments(self): + try: + arguments = self.job.get_arguments() + return arguments + except: + traceback.print_exc(file=sys.stdout) + pass + return "" + + def get_exe(self): + try: + exe = self.job.get_exe() + return exe + except: + traceback.print_exc(file=sys.stdout) + pass + return "Unknown" + + + def 
cancel(self): + return self.job.cancel() + + def wait(self): + while 1: + try: + state = self.get_state() + logging.debug("wait: state: " + state) + if self.__has_finished(state) == True: + break + time.sleep(2) + except (KeyboardInterrupt, SystemExit): + raise + except: + pass + + def __del__(self): + pass + + def __repr__(self): + return str(self.job) + +""" Test Job Submission via ManyJob abstraction """ +if __name__ == "__main__": + try: + print "Test ManyJob" + # create job description + jd = SAGAJobDescription() + jd.executable = "/bin/date" + jd.number_of_processes = "1" + jd.spmd_variation = "single" + jd.arguments = [""] + jd.working_directory = "/home/luckow" + jd.output = "output.txt" + jd.error = "error.txt" + # submit via mj abstraction + #resource_list = ( {"resource_url" : "gram://qb1.loni.org/jobmanager-pbs", "number_nodes" : "128", "allocation" : "", "queue" : "workq", "bigjob_agent": "$(HOME)/src/REMDgManager/bigjob/advert_launcher.sh"}, + # {"resource_url" : "gram://qb1.loni.org/jobmanager-pbs", "number_nodes" : "64", "allocation" : "", "queue" : "workq", "bigjob_agent": "$(HOME)/src/REMDgManager/bigjob/advert_launcher.sh"}) + resource_list = [] + resource_list.append({"resource_url" : "gram://qb1.loni.org/jobmanager-pbs", "number_nodes" : "16", "allocation" : "", "queue" : "workq", "bigjob_agent": os.getcwd() + "/bigjob_agent_launcher.sh"}) + print "Create manyjob service " + mjs = many_job_service(resource_list, None) + print "Create sub-job using manyjob " + str(mjs) + subjob = mjs.create_job(jd) + print "Run sub-job" + subjob.run() + print "Wait for termination" + subjob.wait() + print "Sub-job state: " + str(subjob.get_state()) + mjs.cancel() + except: + try: + if mjs != None: + mjs.cancel() + except: + pass diff --git a/bigjob_dynamic/many_job_affinity.py b/bigjob_dynamic/many_job_affinity.py index c6506cef..8f6c9d21 100644 --- a/bigjob_dynamic/many_job_affinity.py +++ b/bigjob_dynamic/many_job_affinity.py @@ -5,7 +5,7 @@ import os import sys sys.path.append(os.path.dirname( __file__ )) -from many_job import * +from .many_job import * class many_job_affinity_service(many_job_service): diff --git a/bigjob_dynamic/many_job_affinity.py.bak b/bigjob_dynamic/many_job_affinity.py.bak new file mode 100644 index 00000000..c6506cef --- /dev/null +++ b/bigjob_dynamic/many_job_affinity.py.bak @@ -0,0 +1,53 @@ +#!/usr/bin/env python +"""Dynamic BigJob (ManyJob) with affinity aware scheduler. 
+""" + +import os +import sys +sys.path.append(os.path.dirname( __file__ )) +from many_job import * + +class many_job_affinity_service(many_job_service): + + def __init__(self, bigjob_list, advert_host): + """ accepts resource list as key/value pair: + ( {"resource_url" : "gram://qb1.loni.org/jobmanager-pbs", "number_cores" : "32", "allocation" : "", "queue" : "workq", + "re_agent": os.getcwd() + "/bigjob_agent_launcher.sh" "walltime":1000, "affinity": "affinity1"}, + {"resource_url" : "gram://qb1.loni.org/jobmanager-pbs", "number_cores" : "32", "allocation" : "=0: + sj_affinity = env.split("=")[1] + logging.debug("Subjob Env: " + str(sj_affinity)) + logging.debug("Big Job: " + bigjob_url + " Cores: " + "%s"%free_cores + "/" + i["number_cores"] + " Affinity: " + + affinity + " SJ affinity: " + sj_affinity + " State: " + state) + if state.lower() == "running" and free_cores >= int(subjob.job_description.number_of_processes) and affinity == sj_affinity: + free_cores = i["free_cores"] + free_cores = free_cores - int(subjob.job_description.number_of_processes) + i["free_cores"]=free_cores + lock.release() + return i + + lock.release() + + # no resource found + self.subjob_queue.put(subjob) + logging.debug("found no active resource for sub-job => (re-) queue it") + return None + + \ No newline at end of file diff --git a/bootstrap/bigjob-bootstrap.py b/bootstrap/bigjob-bootstrap.py index 6ad054b7..ee3bde9d 100644 --- a/bootstrap/bigjob-bootstrap.py +++ b/bootstrap/bigjob-bootstrap.py @@ -26,7 +26,7 @@ import tarfile if sys.version_info < (2, 6): - print('ERROR: %s' % sys.exc_info()[1]) + print(('ERROR: %s' % sys.exc_info()[1])) print('ERROR: this script requires Python 2.6 or greater.') sys.exit(101) @@ -35,12 +35,12 @@ except NameError: from sets import Set as set try: - basestring + str except NameError: - basestring = str + str = str try: - import ConfigParser + import configparser except ImportError: import configparser as ConfigParser @@ -77,7 +77,7 @@ def get_installed_pythons(): try: import winreg except ImportError: - import _winreg as winreg + import winreg as winreg def get_installed_pythons(): python_core = winreg.CreateKey(winreg.HKEY_LOCAL_MACHINE, @@ -552,7 +552,7 @@ class ConfigOptionParser(optparse.OptionParser): configuration files and environmental variables """ def __init__(self, *args, **kwargs): - self.config = ConfigParser.RawConfigParser() + self.config = configparser.RawConfigParser() self.files = self.get_config_files() self.config.read(self.files) optparse.OptionParser.__init__(self, *args, **kwargs) @@ -576,7 +576,7 @@ def update_defaults(self, defaults): # 2. 
environmental variables config.update(dict(self.get_environ_vars())) # Then set the options with those values - for key, val in config.items(): + for key, val in list(config.items()): key = key.replace('_', '-') if not key.startswith('--'): key = '--%s' % key # only prefer long opts @@ -598,7 +598,7 @@ def update_defaults(self, defaults): val = option.convert_value(key, val) except optparse.OptionValueError: e = sys.exc_info()[1] - print("An error occured during configuration: %s" % e) + print(("An error occured during configuration: %s" % e)) sys.exit(3) defaults[option.dest] = val return defaults @@ -615,7 +615,7 @@ def get_environ_vars(self, prefix='VIRTUALENV_'): """ Returns a generator with all environmental vars with prefix VIRTUALENV """ - for key, val in os.environ.items(): + for key, val in list(os.environ.items()): if key.startswith(prefix): yield (key.replace(prefix, '').lower(), val) @@ -631,7 +631,7 @@ def get_default_values(self): defaults = self.update_defaults(self.defaults.copy()) # ours for option in self._get_all_options(): default = defaults.get(option.dest) - if isinstance(default, basestring): + if isinstance(default, str): opt_str = option.get_opt_string() defaults[option.dest] = option.check_value(opt_str, default) return optparse.Values(defaults) @@ -787,8 +787,8 @@ def main(): parser.print_help() sys.exit(2) if len(args) > 1: - print('There must be only one argument: DEST_DIR (you gave %s)' % ( - ' '.join(args))) + print(('There must be only one argument: DEST_DIR (you gave %s)' % ( + ' '.join(args)))) parser.print_help() sys.exit(2) @@ -985,12 +985,12 @@ def path_locations(home_dir): size = max(len(home_dir)+1, 256) buf = ctypes.create_unicode_buffer(size) try: - u = unicode + u = str except NameError: u = str ret = GetShortPathName(u(home_dir), buf, size) if not ret: - print('Error: the path "%s" has a space in it' % home_dir) + print(('Error: the path "%s" has a space in it' % home_dir)) print('We could not determine the short pathname for it.') print('Exiting.') sys.exit(3) @@ -1471,7 +1471,7 @@ def install_activate(home_dir, bin_dir, prompt=None): if hasattr(home_dir, 'decode'): home_dir = home_dir.decode(sys.getfilesystemencoding()) vname = os.path.basename(home_dir) - for name, content in files.items(): + for name, content in list(files.items()): content = content.replace('__VIRTUAL_PROMPT__', prompt or '') content = content.replace('__VIRTUAL_WINPROMPT__', prompt or '(%s)' % vname) content = content.replace('__VIRTUAL_ENV__', home_dir) @@ -1517,8 +1517,8 @@ def fix_lib64(lib_dir, symlink=True): instead of lib/pythonX.Y. If this is such a platform we'll just create a symlink so lib64 points to lib """ - if [p for p in distutils.sysconfig.get_config_vars().values() - if isinstance(p, basestring) and 'lib64' in p]: + if [p for p in list(distutils.sysconfig.get_config_vars().values()) + if isinstance(p, str) and 'lib64' in p]: # PyPy's library path scheme is not affected by this. # Return early or we will die on the following assert. 
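The bigjob-bootstrap.py hunks above map the Python 2 names basestring and unicode onto str, so the surrounding try/except fallbacks become no-ops under Python 3. A minimal sketch of an explicit alias that keeps the same isinstance checks working on both interpreters; the name string_types is a placeholder and is not defined by the patch:

    try:
        string_types = basestring   # Python 2: covers both str and unicode
    except NameError:
        string_types = str          # Python 3: only str remains

    def is_text(value):
        return isinstance(value, string_types)
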
if is_pypy: @@ -1813,14 +1813,14 @@ def after_install(options, home_dir): import os, subprocess def after_install(options, home_dir): etc = join(home_dir, 'etc') - print join(home_dir, 'bin', 'easy_install') + print(join(home_dir, 'bin', 'easy_install')) if not os.path.exists(etc): os.makedirs(etc) - print "load distribute" + print("load distribute") subprocess.call(["curl", "-O", "https://bitbucket.org/pypa/setuptools/raw/bootstrap/ez_setup.py"]) - print "install setuptools" + print("install setuptools") subprocess.call([join(home_dir, 'bin', 'python'), 'ez_setup.py']) - print "install bigjob" + print("install bigjob") subprocess.call([join(home_dir, 'bin', 'easy_install'), 'bigjob']) @@ -2325,9 +2325,9 @@ def install_with_virtualenv(): usage="%prog [OPTIONS] DEST_DIR", formatter=UpdatingDefaultsHelpFormatter()) options, args = parser.parse_args() - print "Install with existing Virtualenv. Dest dir: " + str(args[0]) + print("Install with existing Virtualenv. Dest dir: " + str(args[0])) subprocess.call(["virtualenv", args[0]]) - print "install bigjob" + print("install bigjob") subprocess.call([os.path.join(args[0], 'bin', 'easy_install'), 'bigjob']) diff --git a/bootstrap/bigjob-bootstrap.py.bak b/bootstrap/bigjob-bootstrap.py.bak new file mode 100644 index 00000000..6ad054b7 --- /dev/null +++ b/bootstrap/bigjob-bootstrap.py.bak @@ -0,0 +1,2350 @@ +#!/usr/bin/env python +## WARNING: This file is generated +#!/usr/bin/env python +"""Create a "virtual" Python installation +""" + +__version__ = "1.10.1" +virtualenv_version = __version__ # legacy + +import base64 +import sys +import os +import codecs +import optparse +import re +import shutil +import logging +import tempfile +import zlib +import errno +import glob +import distutils.sysconfig +from distutils.util import strtobool +import struct +import subprocess +import tarfile + +if sys.version_info < (2, 6): + print('ERROR: %s' % sys.exc_info()[1]) + print('ERROR: this script requires Python 2.6 or greater.') + sys.exit(101) + +try: + set +except NameError: + from sets import Set as set +try: + basestring +except NameError: + basestring = str + +try: + import ConfigParser +except ImportError: + import configparser as ConfigParser + +join = os.path.join +py_version = 'python%s.%s' % (sys.version_info[0], sys.version_info[1]) + +is_jython = sys.platform.startswith('java') +is_pypy = hasattr(sys, 'pypy_version_info') +is_win = (sys.platform == 'win32') +is_cygwin = (sys.platform == 'cygwin') +is_darwin = (sys.platform == 'darwin') +abiflags = getattr(sys, 'abiflags', '') + +user_dir = os.path.expanduser('~') +if is_win: + default_storage_dir = os.path.join(user_dir, 'virtualenv') +else: + default_storage_dir = os.path.join(user_dir, '.virtualenv') +default_config_file = os.path.join(default_storage_dir, 'virtualenv.ini') + +if is_pypy: + expected_exe = 'pypy' +elif is_jython: + expected_exe = 'jython' +else: + expected_exe = 'python' + +# Return a mapping of version -> Python executable +# Only provided for Windows, where the information in the registry is used +if not is_win: + def get_installed_pythons(): + return {} +else: + try: + import winreg + except ImportError: + import _winreg as winreg + + def get_installed_pythons(): + python_core = winreg.CreateKey(winreg.HKEY_LOCAL_MACHINE, + "Software\\Python\\PythonCore") + i = 0 + versions = [] + while True: + try: + versions.append(winreg.EnumKey(python_core, i)) + i = i + 1 + except WindowsError: + break + exes = dict() + for ver in versions: + path = winreg.QueryValue(python_core, 
"%s\\InstallPath" % ver) + exes[ver] = join(path, "python.exe") + + winreg.CloseKey(python_core) + + # Add the major versions + # Sort the keys, then repeatedly update the major version entry + # Last executable (i.e., highest version) wins with this approach + for ver in sorted(exes): + exes[ver[0]] = exes[ver] + + return exes + +REQUIRED_MODULES = ['os', 'posix', 'posixpath', 'nt', 'ntpath', 'genericpath', + 'fnmatch', 'locale', 'encodings', 'codecs', + 'stat', 'UserDict', 'readline', 'copy_reg', 'types', + 're', 'sre', 'sre_parse', 'sre_constants', 'sre_compile', + 'zlib'] + +REQUIRED_FILES = ['lib-dynload', 'config'] + +majver, minver = sys.version_info[:2] +if majver == 2: + if minver >= 6: + REQUIRED_MODULES.extend(['warnings', 'linecache', '_abcoll', 'abc']) + if minver >= 7: + REQUIRED_MODULES.extend(['_weakrefset']) + if minver <= 3: + REQUIRED_MODULES.extend(['sets', '__future__']) +elif majver == 3: + # Some extra modules are needed for Python 3, but different ones + # for different versions. + REQUIRED_MODULES.extend(['_abcoll', 'warnings', 'linecache', 'abc', 'io', + '_weakrefset', 'copyreg', 'tempfile', 'random', + '__future__', 'collections', 'keyword', 'tarfile', + 'shutil', 'struct', 'copy', 'tokenize', 'token', + 'functools', 'heapq', 'bisect', 'weakref', + 'reprlib']) + if minver >= 2: + REQUIRED_FILES[-1] = 'config-%s' % majver + if minver == 3: + import sysconfig + platdir = sysconfig.get_config_var('PLATDIR') + REQUIRED_FILES.append(platdir) + # The whole list of 3.3 modules is reproduced below - the current + # uncommented ones are required for 3.3 as of now, but more may be + # added as 3.3 development continues. + REQUIRED_MODULES.extend([ + #"aifc", + #"antigravity", + #"argparse", + #"ast", + #"asynchat", + #"asyncore", + "base64", + #"bdb", + #"binhex", + #"bisect", + #"calendar", + #"cgi", + #"cgitb", + #"chunk", + #"cmd", + #"codeop", + #"code", + #"colorsys", + #"_compat_pickle", + #"compileall", + #"concurrent", + #"configparser", + #"contextlib", + #"cProfile", + #"crypt", + #"csv", + #"ctypes", + #"curses", + #"datetime", + #"dbm", + #"decimal", + #"difflib", + #"dis", + #"doctest", + #"dummy_threading", + "_dummy_thread", + #"email", + #"filecmp", + #"fileinput", + #"formatter", + #"fractions", + #"ftplib", + #"functools", + #"getopt", + #"getpass", + #"gettext", + #"glob", + #"gzip", + "hashlib", + #"heapq", + "hmac", + #"html", + #"http", + #"idlelib", + #"imaplib", + #"imghdr", + "imp", + "importlib", + #"inspect", + #"json", + #"lib2to3", + #"logging", + #"macpath", + #"macurl2path", + #"mailbox", + #"mailcap", + #"_markupbase", + #"mimetypes", + #"modulefinder", + #"multiprocessing", + #"netrc", + #"nntplib", + #"nturl2path", + #"numbers", + #"opcode", + #"optparse", + #"os2emxpath", + #"pdb", + #"pickle", + #"pickletools", + #"pipes", + #"pkgutil", + #"platform", + #"plat-linux2", + #"plistlib", + #"poplib", + #"pprint", + #"profile", + #"pstats", + #"pty", + #"pyclbr", + #"py_compile", + #"pydoc_data", + #"pydoc", + #"_pyio", + #"queue", + #"quopri", + #"reprlib", + "rlcompleter", + #"runpy", + #"sched", + #"shelve", + #"shlex", + #"smtpd", + #"smtplib", + #"sndhdr", + #"socket", + #"socketserver", + #"sqlite3", + #"ssl", + #"stringprep", + #"string", + #"_strptime", + #"subprocess", + #"sunau", + #"symbol", + #"symtable", + #"sysconfig", + #"tabnanny", + #"telnetlib", + #"test", + #"textwrap", + #"this", + #"_threading_local", + #"threading", + #"timeit", + #"tkinter", + #"tokenize", + #"token", + #"traceback", + #"trace", + #"tty", + 
#"turtledemo", + #"turtle", + #"unittest", + #"urllib", + #"uuid", + #"uu", + #"wave", + #"weakref", + #"webbrowser", + #"wsgiref", + #"xdrlib", + #"xml", + #"xmlrpc", + #"zipfile", + ]) + +if is_pypy: + # these are needed to correctly display the exceptions that may happen + # during the bootstrap + REQUIRED_MODULES.extend(['traceback', 'linecache']) + +class Logger(object): + + """ + Logging object for use in command-line script. Allows ranges of + levels, to avoid some redundancy of displayed information. + """ + + DEBUG = logging.DEBUG + INFO = logging.INFO + NOTIFY = (logging.INFO+logging.WARN)/2 + WARN = WARNING = logging.WARN + ERROR = logging.ERROR + FATAL = logging.FATAL + + LEVELS = [DEBUG, INFO, NOTIFY, WARN, ERROR, FATAL] + + def __init__(self, consumers): + self.consumers = consumers + self.indent = 0 + self.in_progress = None + self.in_progress_hanging = False + + def debug(self, msg, *args, **kw): + self.log(self.DEBUG, msg, *args, **kw) + def info(self, msg, *args, **kw): + self.log(self.INFO, msg, *args, **kw) + def notify(self, msg, *args, **kw): + self.log(self.NOTIFY, msg, *args, **kw) + def warn(self, msg, *args, **kw): + self.log(self.WARN, msg, *args, **kw) + def error(self, msg, *args, **kw): + self.log(self.ERROR, msg, *args, **kw) + def fatal(self, msg, *args, **kw): + self.log(self.FATAL, msg, *args, **kw) + def log(self, level, msg, *args, **kw): + if args: + if kw: + raise TypeError( + "You may give positional or keyword arguments, not both") + args = args or kw + rendered = None + for consumer_level, consumer in self.consumers: + if self.level_matches(level, consumer_level): + if (self.in_progress_hanging + and consumer in (sys.stdout, sys.stderr)): + self.in_progress_hanging = False + sys.stdout.write('\n') + sys.stdout.flush() + if rendered is None: + if args: + rendered = msg % args + else: + rendered = msg + rendered = ' '*self.indent + rendered + if hasattr(consumer, 'write'): + consumer.write(rendered+'\n') + else: + consumer(rendered) + + def start_progress(self, msg): + assert not self.in_progress, ( + "Tried to start_progress(%r) while in_progress %r" + % (msg, self.in_progress)) + if self.level_matches(self.NOTIFY, self._stdout_level()): + sys.stdout.write(msg) + sys.stdout.flush() + self.in_progress_hanging = True + else: + self.in_progress_hanging = False + self.in_progress = msg + + def end_progress(self, msg='done.'): + assert self.in_progress, ( + "Tried to end_progress without start_progress") + if self.stdout_level_matches(self.NOTIFY): + if not self.in_progress_hanging: + # Some message has been printed out since start_progress + sys.stdout.write('...' 
+ self.in_progress + msg + '\n') + sys.stdout.flush() + else: + sys.stdout.write(msg + '\n') + sys.stdout.flush() + self.in_progress = None + self.in_progress_hanging = False + + def show_progress(self): + """If we are in a progress scope, and no log messages have been + shown, write out another '.'""" + if self.in_progress_hanging: + sys.stdout.write('.') + sys.stdout.flush() + + def stdout_level_matches(self, level): + """Returns true if a message at this level will go to stdout""" + return self.level_matches(level, self._stdout_level()) + + def _stdout_level(self): + """Returns the level that stdout runs at""" + for level, consumer in self.consumers: + if consumer is sys.stdout: + return level + return self.FATAL + + def level_matches(self, level, consumer_level): + """ + >>> l = Logger([]) + >>> l.level_matches(3, 4) + False + >>> l.level_matches(3, 2) + True + >>> l.level_matches(slice(None, 3), 3) + False + >>> l.level_matches(slice(None, 3), 2) + True + >>> l.level_matches(slice(1, 3), 1) + True + >>> l.level_matches(slice(2, 3), 1) + False + """ + if isinstance(level, slice): + start, stop = level.start, level.stop + if start is not None and start > consumer_level: + return False + if stop is not None and stop <= consumer_level: + return False + return True + else: + return level >= consumer_level + + #@classmethod + def level_for_integer(cls, level): + levels = cls.LEVELS + if level < 0: + return levels[0] + if level >= len(levels): + return levels[-1] + return levels[level] + + level_for_integer = classmethod(level_for_integer) + +# create a silent logger just to prevent this from being undefined +# will be overridden with requested verbosity main() is called. +logger = Logger([(Logger.LEVELS[-1], sys.stdout)]) + +def mkdir(path): + if not os.path.exists(path): + logger.info('Creating %s', path) + os.makedirs(path) + else: + logger.info('Directory %s already exists', path) + +def copyfileordir(src, dest, symlink=True): + if os.path.isdir(src): + shutil.copytree(src, dest, symlink) + else: + shutil.copy2(src, dest) + +def copyfile(src, dest, symlink=True): + if not os.path.exists(src): + # Some bad symlink in the src + logger.warn('Cannot find file %s (bad symlink)', src) + return + if os.path.exists(dest): + logger.debug('File %s already exists', dest) + return + if not os.path.exists(os.path.dirname(dest)): + logger.info('Creating parent directories for %s', os.path.dirname(dest)) + os.makedirs(os.path.dirname(dest)) + if not os.path.islink(src): + srcpath = os.path.abspath(src) + else: + srcpath = os.readlink(src) + if symlink and hasattr(os, 'symlink') and not is_win: + logger.info('Symlinking %s', dest) + try: + os.symlink(srcpath, dest) + except (OSError, NotImplementedError): + logger.info('Symlinking failed, copying to %s', dest) + copyfileordir(src, dest, symlink) + else: + logger.info('Copying to %s', dest) + copyfileordir(src, dest, symlink) + +def writefile(dest, content, overwrite=True): + if not os.path.exists(dest): + logger.info('Writing %s', dest) + f = open(dest, 'wb') + f.write(content.encode('utf-8')) + f.close() + return + else: + f = open(dest, 'rb') + c = f.read() + f.close() + if c != content.encode("utf-8"): + if not overwrite: + logger.notify('File %s exists with different content; not overwriting', dest) + return + logger.notify('Overwriting %s with new content', dest) + f = open(dest, 'wb') + f.write(content.encode('utf-8')) + f.close() + else: + logger.info('Content %s already in place', dest) + +def rmtree(dir): + if os.path.exists(dir): + 
logger.notify('Deleting tree %s', dir) + shutil.rmtree(dir) + else: + logger.info('Do not need to delete %s; already gone', dir) + +def make_exe(fn): + if hasattr(os, 'chmod'): + oldmode = os.stat(fn).st_mode & 0xFFF # 0o7777 + newmode = (oldmode | 0x16D) & 0xFFF # 0o555, 0o7777 + os.chmod(fn, newmode) + logger.info('Changed mode of %s to %s', fn, oct(newmode)) + +def _find_file(filename, dirs): + for dir in reversed(dirs): + files = glob.glob(os.path.join(dir, filename)) + if files and os.path.isfile(files[0]): + return True, files[0] + return False, filename + +def file_search_dirs(): + here = os.path.dirname(os.path.abspath(__file__)) + dirs = ['.', here, + join(here, 'virtualenv_support')] + if os.path.splitext(os.path.dirname(__file__))[0] != 'virtualenv': + # Probably some boot script; just in case virtualenv is installed... + try: + import virtualenv + except ImportError: + pass + else: + dirs.append(os.path.join(os.path.dirname(virtualenv.__file__), 'virtualenv_support')) + return [d for d in dirs if os.path.isdir(d)] + + +class UpdatingDefaultsHelpFormatter(optparse.IndentedHelpFormatter): + """ + Custom help formatter for use in ConfigOptionParser that updates + the defaults before expanding them, allowing them to show up correctly + in the help listing + """ + def expand_default(self, option): + if self.parser is not None: + self.parser.update_defaults(self.parser.defaults) + return optparse.IndentedHelpFormatter.expand_default(self, option) + + +class ConfigOptionParser(optparse.OptionParser): + """ + Custom option parser which updates its defaults by checking the + configuration files and environmental variables + """ + def __init__(self, *args, **kwargs): + self.config = ConfigParser.RawConfigParser() + self.files = self.get_config_files() + self.config.read(self.files) + optparse.OptionParser.__init__(self, *args, **kwargs) + + def get_config_files(self): + config_file = os.environ.get('VIRTUALENV_CONFIG_FILE', False) + if config_file and os.path.exists(config_file): + return [config_file] + return [default_config_file] + + def update_defaults(self, defaults): + """ + Updates the given defaults with values from the config files and + the environ. Does a little special handling for certain types of + options (lists). + """ + # Then go and look for the other sources of configuration: + config = {} + # 1. config files + config.update(dict(self.get_config_section('virtualenv'))) + # 2. 
environmental variables + config.update(dict(self.get_environ_vars())) + # Then set the options with those values + for key, val in config.items(): + key = key.replace('_', '-') + if not key.startswith('--'): + key = '--%s' % key # only prefer long opts + option = self.get_option(key) + if option is not None: + # ignore empty values + if not val: + continue + # handle multiline configs + if option.action == 'append': + val = val.split() + else: + option.nargs = 1 + if option.action == 'store_false': + val = not strtobool(val) + elif option.action in ('store_true', 'count'): + val = strtobool(val) + try: + val = option.convert_value(key, val) + except optparse.OptionValueError: + e = sys.exc_info()[1] + print("An error occured during configuration: %s" % e) + sys.exit(3) + defaults[option.dest] = val + return defaults + + def get_config_section(self, name): + """ + Get a section of a configuration + """ + if self.config.has_section(name): + return self.config.items(name) + return [] + + def get_environ_vars(self, prefix='VIRTUALENV_'): + """ + Returns a generator with all environmental vars with prefix VIRTUALENV + """ + for key, val in os.environ.items(): + if key.startswith(prefix): + yield (key.replace(prefix, '').lower(), val) + + def get_default_values(self): + """ + Overridding to make updating the defaults after instantiation of + the option parser possible, update_defaults() does the dirty work. + """ + if not self.process_default_values: + # Old, pre-Optik 1.5 behaviour. + return optparse.Values(self.defaults) + + defaults = self.update_defaults(self.defaults.copy()) # ours + for option in self._get_all_options(): + default = defaults.get(option.dest) + if isinstance(default, basestring): + opt_str = option.get_opt_string() + defaults[option.dest] = option.check_value(opt_str, default) + return optparse.Values(defaults) + + +def main(): + parser = ConfigOptionParser( + version=virtualenv_version, + usage="%prog [OPTIONS] DEST_DIR", + formatter=UpdatingDefaultsHelpFormatter()) + + parser.add_option( + '-v', '--verbose', + action='count', + dest='verbose', + default=0, + help="Increase verbosity") + + parser.add_option( + '-q', '--quiet', + action='count', + dest='quiet', + default=0, + help='Decrease verbosity') + + parser.add_option( + '-p', '--python', + dest='python', + metavar='PYTHON_EXE', + help='The Python interpreter to use, e.g., --python=python2.5 will use the python2.5 ' + 'interpreter to create the new environment. 
The default is the interpreter that ' + 'virtualenv was installed with (%s)' % sys.executable) + + parser.add_option( + '--clear', + dest='clear', + action='store_true', + help="Clear out the non-root install and start from scratch") + + parser.set_defaults(system_site_packages=False) + parser.add_option( + '--no-site-packages', + dest='system_site_packages', + action='store_false', + help="Don't give access to the global site-packages dir to the " + "virtual environment (default)") + + parser.add_option( + '--system-site-packages', + dest='system_site_packages', + action='store_true', + help="Give access to the global site-packages dir to the " + "virtual environment") + + parser.add_option( + '--always-copy', + dest='symlink', + action='store_false', + default=True, + help="Always copy files rather than symlinking") + + parser.add_option( + '--unzip-setuptools', + dest='unzip_setuptools', + action='store_true', + help="Unzip Setuptools when installing it") + + parser.add_option( + '--relocatable', + dest='relocatable', + action='store_true', + help='Make an EXISTING virtualenv environment relocatable. ' + 'This fixes up scripts and makes all .pth files relative') + + parser.add_option( + '--no-setuptools', + dest='no_setuptools', + action='store_true', + help='Do not install setuptools (or pip) ' + 'in the new virtualenv.') + + parser.add_option( + '--no-pip', + dest='no_pip', + action='store_true', + help='Do not install pip in the new virtualenv.') + + default_search_dirs = file_search_dirs() + parser.add_option( + '--extra-search-dir', + dest="search_dirs", + action="append", + default=default_search_dirs, + help="Directory to look for setuptools/pip distributions in. " + "You can add any number of additional --extra-search-dir paths.") + + parser.add_option( + '--never-download', + dest="never_download", + action="store_true", + default=True, + help="Never download anything from the network. This is now always " + "the case. The option is only retained for backward compatibility, " + "and does nothing. Virtualenv will fail if local distributions " + "of setuptools/pip are not present.") + + parser.add_option( + '--prompt', + dest='prompt', + help='Provides an alternative prompt prefix for this environment') + + parser.add_option( + '--setuptools', + dest='setuptools', + action='store_true', + help="Backward compatibility. Does nothing.") + + parser.add_option( + '--distribute', + dest='distribute', + action='store_true', + help="Backward compatibility. 
Does nothing.") + + if 'extend_parser' in globals(): + extend_parser(parser) + + options, args = parser.parse_args() + + global logger + + if 'adjust_options' in globals(): + adjust_options(options, args) + + verbosity = options.verbose - options.quiet + logger = Logger([(Logger.level_for_integer(2 - verbosity), sys.stdout)]) + + if options.python and not os.environ.get('VIRTUALENV_INTERPRETER_RUNNING'): + env = os.environ.copy() + interpreter = resolve_interpreter(options.python) + if interpreter == sys.executable: + logger.warn('Already using interpreter %s' % interpreter) + else: + logger.notify('Running virtualenv with interpreter %s' % interpreter) + env['VIRTUALENV_INTERPRETER_RUNNING'] = 'true' + file = __file__ + if file.endswith('.pyc'): + file = file[:-1] + popen = subprocess.Popen([interpreter, file] + sys.argv[1:], env=env) + raise SystemExit(popen.wait()) + + if not args: + print('You must provide a DEST_DIR') + parser.print_help() + sys.exit(2) + if len(args) > 1: + print('There must be only one argument: DEST_DIR (you gave %s)' % ( + ' '.join(args))) + parser.print_help() + sys.exit(2) + + home_dir = args[0] + + if os.environ.get('WORKING_ENV'): + logger.fatal('ERROR: you cannot run virtualenv while in a workingenv') + logger.fatal('Please deactivate your workingenv, then re-run this script') + sys.exit(3) + + if 'PYTHONHOME' in os.environ: + logger.warn('PYTHONHOME is set. You *must* activate the virtualenv before using it') + del os.environ['PYTHONHOME'] + + if options.relocatable: + make_environment_relocatable(home_dir) + return + + if not options.never_download: + logger.warn('The --never-download option is for backward compatibility only.') + logger.warn('Setting it to false is no longer supported, and will be ignored.') + + create_environment(home_dir, + site_packages=options.system_site_packages, + clear=options.clear, + unzip_setuptools=options.unzip_setuptools, + prompt=options.prompt, + search_dirs=options.search_dirs, + never_download=True, + no_setuptools=options.no_setuptools, + no_pip=options.no_pip, + symlink=options.symlink) + if 'after_install' in globals(): + after_install(options, home_dir) + +def call_subprocess(cmd, show_stdout=True, + filter_stdout=None, cwd=None, + raise_on_returncode=True, extra_env=None, + remove_from_env=None): + cmd_parts = [] + for part in cmd: + if len(part) > 45: + part = part[:20]+"..."+part[-20:] + if ' ' in part or '\n' in part or '"' in part or "'" in part: + part = '"%s"' % part.replace('"', '\\"') + if hasattr(part, 'decode'): + try: + part = part.decode(sys.getdefaultencoding()) + except UnicodeDecodeError: + part = part.decode(sys.getfilesystemencoding()) + cmd_parts.append(part) + cmd_desc = ' '.join(cmd_parts) + if show_stdout: + stdout = None + else: + stdout = subprocess.PIPE + logger.debug("Running command %s" % cmd_desc) + if extra_env or remove_from_env: + env = os.environ.copy() + if extra_env: + env.update(extra_env) + if remove_from_env: + for varname in remove_from_env: + env.pop(varname, None) + else: + env = None + try: + proc = subprocess.Popen( + cmd, stderr=subprocess.STDOUT, stdin=None, stdout=stdout, + cwd=cwd, env=env) + except Exception: + e = sys.exc_info()[1] + logger.fatal( + "Error %s while executing command %s" % (e, cmd_desc)) + raise + all_output = [] + if stdout is not None: + stdout = proc.stdout + encoding = sys.getdefaultencoding() + fs_encoding = sys.getfilesystemencoding() + while 1: + line = stdout.readline() + try: + line = line.decode(encoding) + except UnicodeDecodeError: + line = 
line.decode(fs_encoding) + if not line: + break + line = line.rstrip() + all_output.append(line) + if filter_stdout: + level = filter_stdout(line) + if isinstance(level, tuple): + level, line = level + logger.log(level, line) + if not logger.stdout_level_matches(level): + logger.show_progress() + else: + logger.info(line) + else: + proc.communicate() + proc.wait() + if proc.returncode: + if raise_on_returncode: + if all_output: + logger.notify('Complete output from command %s:' % cmd_desc) + logger.notify('\n'.join(all_output) + '\n----------------------------------------') + raise OSError( + "Command %s failed with error code %s" + % (cmd_desc, proc.returncode)) + else: + logger.warn( + "Command %s had error code %s" + % (cmd_desc, proc.returncode)) + +def filter_install_output(line): + if line.strip().startswith('running'): + return Logger.INFO + return Logger.DEBUG + +def install_sdist(project_name, sdist, py_executable, search_dirs=None): + + if search_dirs is None: + search_dirs = file_search_dirs() + found, sdist_path = _find_file(sdist, search_dirs) + if not found: + logger.fatal("Cannot find sdist %s" % (sdist,)) + return + + tmpdir = tempfile.mkdtemp() + try: + tar = tarfile.open(sdist_path) + tar.extractall(tmpdir) + tar.close() + srcdir = os.path.join(tmpdir, os.listdir(tmpdir)[0]) + cmd = [py_executable, 'setup.py', 'install', + '--single-version-externally-managed', + '--record', 'record'] + logger.start_progress('Installing %s...' % project_name) + logger.indent += 2 + try: + call_subprocess(cmd, show_stdout=False, cwd=srcdir, + filter_stdout=filter_install_output) + finally: + logger.indent -= 2 + logger.end_progress() + finally: + shutil.rmtree(tmpdir) + +def create_environment(home_dir, site_packages=False, clear=False, + unzip_setuptools=False, + prompt=None, search_dirs=None, never_download=False, + no_setuptools=False, no_pip=False, symlink=True): + """ + Creates a new environment in ``home_dir``. + + If ``site_packages`` is true, then the global ``site-packages/`` + directory will be on the path. + + If ``clear`` is true (default False) then the environment will + first be cleared. 
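    For illustration (the destination path and prompt prefix here are
    hypothetical, not taken from the patch), a typical call looks like::

        create_environment('/tmp/bigjob-env',
                           site_packages=False,
                           prompt='(bigjob)',
                           symlink=True)

    It creates the directory layout, installs a private interpreter plus
    setuptools/pip (unless disabled), and writes the activation scripts.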
+ """ + home_dir, lib_dir, inc_dir, bin_dir = path_locations(home_dir) + + py_executable = os.path.abspath(install_python( + home_dir, lib_dir, inc_dir, bin_dir, + site_packages=site_packages, clear=clear, symlink=symlink)) + + install_distutils(home_dir) + + if not no_setuptools: + install_sdist('Setuptools', 'setuptools-*.tar.gz', py_executable, search_dirs) + if not no_pip: + install_sdist('Pip', 'pip-*.tar.gz', py_executable, search_dirs) + + install_activate(home_dir, bin_dir, prompt) + +def is_executable_file(fpath): + return os.path.isfile(fpath) and os.access(fpath, os.X_OK) + +def path_locations(home_dir): + """Return the path locations for the environment (where libraries are, + where scripts go, etc)""" + # XXX: We'd use distutils.sysconfig.get_python_inc/lib but its + # prefix arg is broken: http://bugs.python.org/issue3386 + if is_win: + # Windows has lots of problems with executables with spaces in + # the name; this function will remove them (using the ~1 + # format): + mkdir(home_dir) + if ' ' in home_dir: + import ctypes + GetShortPathName = ctypes.windll.kernel32.GetShortPathNameW + size = max(len(home_dir)+1, 256) + buf = ctypes.create_unicode_buffer(size) + try: + u = unicode + except NameError: + u = str + ret = GetShortPathName(u(home_dir), buf, size) + if not ret: + print('Error: the path "%s" has a space in it' % home_dir) + print('We could not determine the short pathname for it.') + print('Exiting.') + sys.exit(3) + home_dir = str(buf.value) + lib_dir = join(home_dir, 'Lib') + inc_dir = join(home_dir, 'Include') + bin_dir = join(home_dir, 'Scripts') + if is_jython: + lib_dir = join(home_dir, 'Lib') + inc_dir = join(home_dir, 'Include') + bin_dir = join(home_dir, 'bin') + elif is_pypy: + lib_dir = home_dir + inc_dir = join(home_dir, 'include') + bin_dir = join(home_dir, 'bin') + elif not is_win: + lib_dir = join(home_dir, 'lib', py_version) + multiarch_exec = '/usr/bin/multiarch-platform' + if is_executable_file(multiarch_exec): + # In Mageia (2) and Mandriva distros the include dir must be like: + # virtualenv/include/multiarch-x86_64-linux/python2.7 + # instead of being virtualenv/include/python2.7 + p = subprocess.Popen(multiarch_exec, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = p.communicate() + # stdout.strip is needed to remove newline character + inc_dir = join(home_dir, 'include', stdout.strip(), py_version + abiflags) + else: + inc_dir = join(home_dir, 'include', py_version + abiflags) + bin_dir = join(home_dir, 'bin') + return home_dir, lib_dir, inc_dir, bin_dir + + +def change_prefix(filename, dst_prefix): + prefixes = [sys.prefix] + + if is_darwin: + prefixes.extend(( + os.path.join("/Library/Python", sys.version[:3], "site-packages"), + os.path.join(sys.prefix, "Extras", "lib", "python"), + os.path.join("~", "Library", "Python", sys.version[:3], "site-packages"), + # Python 2.6 no-frameworks + os.path.join("~", ".local", "lib","python", sys.version[:3], "site-packages"), + # System Python 2.7 on OSX Mountain Lion + os.path.join("~", "Library", "Python", sys.version[:3], "lib", "python", "site-packages"))) + + if hasattr(sys, 'real_prefix'): + prefixes.append(sys.real_prefix) + if hasattr(sys, 'base_prefix'): + prefixes.append(sys.base_prefix) + prefixes = list(map(os.path.expanduser, prefixes)) + prefixes = list(map(os.path.abspath, prefixes)) + # Check longer prefixes first so we don't split in the middle of a filename + prefixes = sorted(prefixes, key=len, reverse=True) + filename = os.path.abspath(filename) + for src_prefix in 
prefixes: + if filename.startswith(src_prefix): + _, relpath = filename.split(src_prefix, 1) + if src_prefix != os.sep: # sys.prefix == "/" + assert relpath[0] == os.sep + relpath = relpath[1:] + return join(dst_prefix, relpath) + assert False, "Filename %s does not start with any of these prefixes: %s" % \ + (filename, prefixes) + +def copy_required_modules(dst_prefix, symlink): + import imp + # If we are running under -p, we need to remove the current + # directory from sys.path temporarily here, so that we + # definitely get the modules from the site directory of + # the interpreter we are running under, not the one + # virtualenv.py is installed under (which might lead to py2/py3 + # incompatibility issues) + _prev_sys_path = sys.path + if os.environ.get('VIRTUALENV_INTERPRETER_RUNNING'): + sys.path = sys.path[1:] + try: + for modname in REQUIRED_MODULES: + if modname in sys.builtin_module_names: + logger.info("Ignoring built-in bootstrap module: %s" % modname) + continue + try: + f, filename, _ = imp.find_module(modname) + except ImportError: + logger.info("Cannot import bootstrap module: %s" % modname) + else: + if f is not None: + f.close() + # special-case custom readline.so on OS X, but not for pypy: + if modname == 'readline' and sys.platform == 'darwin' and not ( + is_pypy or filename.endswith(join('lib-dynload', 'readline.so'))): + dst_filename = join(dst_prefix, 'lib', 'python%s' % sys.version[:3], 'readline.so') + elif modname == 'readline' and sys.platform == 'win32': + # special-case for Windows, where readline is not a + # standard module, though it may have been installed in + # site-packages by a third-party package + pass + else: + dst_filename = change_prefix(filename, dst_prefix) + copyfile(filename, dst_filename, symlink) + if filename.endswith('.pyc'): + pyfile = filename[:-1] + if os.path.exists(pyfile): + copyfile(pyfile, dst_filename[:-1], symlink) + finally: + sys.path = _prev_sys_path + + +def subst_path(prefix_path, prefix, home_dir): + prefix_path = os.path.normpath(prefix_path) + prefix = os.path.normpath(prefix) + home_dir = os.path.normpath(home_dir) + if not prefix_path.startswith(prefix): + logger.warn('Path not in prefix %r %r', prefix_path, prefix) + return + return prefix_path.replace(prefix, home_dir, 1) + + +def install_python(home_dir, lib_dir, inc_dir, bin_dir, site_packages, clear, symlink=True): + """Install just the base environment, no distutils patches etc""" + if sys.executable.startswith(bin_dir): + print('Please use the *system* python to run this script') + return + + if clear: + rmtree(lib_dir) + ## FIXME: why not delete it? + ## Maybe it should delete everything with #!/path/to/venv/python in it + logger.notify('Not deleting %s', bin_dir) + + if hasattr(sys, 'real_prefix'): + logger.notify('Using real prefix %r' % sys.real_prefix) + prefix = sys.real_prefix + elif hasattr(sys, 'base_prefix'): + logger.notify('Using base prefix %r' % sys.base_prefix) + prefix = sys.base_prefix + else: + prefix = sys.prefix + mkdir(lib_dir) + fix_lib64(lib_dir, symlink) + stdlib_dirs = [os.path.dirname(os.__file__)] + if is_win: + stdlib_dirs.append(join(os.path.dirname(stdlib_dirs[0]), 'DLLs')) + elif is_darwin: + stdlib_dirs.append(join(stdlib_dirs[0], 'site-packages')) + if hasattr(os, 'symlink'): + logger.info('Symlinking Python bootstrap modules') + else: + logger.info('Copying Python bootstrap modules') + logger.indent += 2 + try: + # copy required files... 
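    # For each system stdlib directory found above, entries named in
    # REQUIRED_FILES (e.g. 'lib-dynload' and the version-specific 'config'
    # directory) are symlinked or copied into the new environment's lib dir;
    # copy_required_modules() then brings over the modules listed in
    # REQUIRED_MODULES at the top of the file.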
+ for stdlib_dir in stdlib_dirs: + if not os.path.isdir(stdlib_dir): + continue + for fn in os.listdir(stdlib_dir): + bn = os.path.splitext(fn)[0] + if fn != 'site-packages' and bn in REQUIRED_FILES: + copyfile(join(stdlib_dir, fn), join(lib_dir, fn), symlink) + # ...and modules + copy_required_modules(home_dir, symlink) + finally: + logger.indent -= 2 + mkdir(join(lib_dir, 'site-packages')) + import site + site_filename = site.__file__ + if site_filename.endswith('.pyc'): + site_filename = site_filename[:-1] + elif site_filename.endswith('$py.class'): + site_filename = site_filename.replace('$py.class', '.py') + site_filename_dst = change_prefix(site_filename, home_dir) + site_dir = os.path.dirname(site_filename_dst) + writefile(site_filename_dst, SITE_PY) + writefile(join(site_dir, 'orig-prefix.txt'), prefix) + site_packages_filename = join(site_dir, 'no-global-site-packages.txt') + if not site_packages: + writefile(site_packages_filename, '') + + if is_pypy or is_win: + stdinc_dir = join(prefix, 'include') + else: + stdinc_dir = join(prefix, 'include', py_version + abiflags) + if os.path.exists(stdinc_dir): + copyfile(stdinc_dir, inc_dir, symlink) + else: + logger.debug('No include dir %s' % stdinc_dir) + + platinc_dir = distutils.sysconfig.get_python_inc(plat_specific=1) + if platinc_dir != stdinc_dir: + platinc_dest = distutils.sysconfig.get_python_inc( + plat_specific=1, prefix=home_dir) + if platinc_dir == platinc_dest: + # Do platinc_dest manually due to a CPython bug; + # not http://bugs.python.org/issue3386 but a close cousin + platinc_dest = subst_path(platinc_dir, prefix, home_dir) + if platinc_dest: + # PyPy's stdinc_dir and prefix are relative to the original binary + # (traversing virtualenvs), whereas the platinc_dir is relative to + # the inner virtualenv and ignores the prefix argument. + # This seems more evolved than designed. + copyfile(platinc_dir, platinc_dest, symlink) + + # pypy never uses exec_prefix, just ignore it + if sys.exec_prefix != prefix and not is_pypy: + if is_win: + exec_dir = join(sys.exec_prefix, 'lib') + elif is_jython: + exec_dir = join(sys.exec_prefix, 'Lib') + else: + exec_dir = join(sys.exec_prefix, 'lib', py_version) + for fn in os.listdir(exec_dir): + copyfile(join(exec_dir, fn), join(lib_dir, fn), symlink) + + if is_jython: + # Jython has either jython-dev.jar and javalib/ dir, or just + # jython.jar + for name in 'jython-dev.jar', 'javalib', 'jython.jar': + src = join(prefix, name) + if os.path.exists(src): + copyfile(src, join(home_dir, name), symlink) + # XXX: registry should always exist after Jython 2.5rc1 + src = join(prefix, 'registry') + if os.path.exists(src): + copyfile(src, join(home_dir, 'registry'), symlink=False) + copyfile(join(prefix, 'cachedir'), join(home_dir, 'cachedir'), + symlink=False) + + mkdir(bin_dir) + py_executable = join(bin_dir, os.path.basename(sys.executable)) + if 'Python.framework' in prefix: + # OS X framework builds cause validation to break + # https://github.com/pypa/virtualenv/issues/322 + if os.environ.get('__PYVENV_LAUNCHER__'): + os.unsetenv('__PYVENV_LAUNCHER__') + if re.search(r'/Python(?:-32|-64)*$', py_executable): + # The name of the python executable is not quite what + # we want, rename it. 
+ py_executable = os.path.join( + os.path.dirname(py_executable), 'python') + + logger.notify('New %s executable in %s', expected_exe, py_executable) + pcbuild_dir = os.path.dirname(sys.executable) + pyd_pth = os.path.join(lib_dir, 'site-packages', 'virtualenv_builddir_pyd.pth') + if is_win and os.path.exists(os.path.join(pcbuild_dir, 'build.bat')): + logger.notify('Detected python running from build directory %s', pcbuild_dir) + logger.notify('Writing .pth file linking to build directory for *.pyd files') + writefile(pyd_pth, pcbuild_dir) + else: + pcbuild_dir = None + if os.path.exists(pyd_pth): + logger.info('Deleting %s (not Windows env or not build directory python)' % pyd_pth) + os.unlink(pyd_pth) + + if sys.executable != py_executable: + ## FIXME: could I just hard link? + executable = sys.executable + shutil.copyfile(executable, py_executable) + make_exe(py_executable) + if is_win or is_cygwin: + pythonw = os.path.join(os.path.dirname(sys.executable), 'pythonw.exe') + if os.path.exists(pythonw): + logger.info('Also created pythonw.exe') + shutil.copyfile(pythonw, os.path.join(os.path.dirname(py_executable), 'pythonw.exe')) + python_d = os.path.join(os.path.dirname(sys.executable), 'python_d.exe') + python_d_dest = os.path.join(os.path.dirname(py_executable), 'python_d.exe') + if os.path.exists(python_d): + logger.info('Also created python_d.exe') + shutil.copyfile(python_d, python_d_dest) + elif os.path.exists(python_d_dest): + logger.info('Removed python_d.exe as it is no longer at the source') + os.unlink(python_d_dest) + # we need to copy the DLL to enforce that windows will load the correct one. + # may not exist if we are cygwin. + py_executable_dll = 'python%s%s.dll' % ( + sys.version_info[0], sys.version_info[1]) + py_executable_dll_d = 'python%s%s_d.dll' % ( + sys.version_info[0], sys.version_info[1]) + pythondll = os.path.join(os.path.dirname(sys.executable), py_executable_dll) + pythondll_d = os.path.join(os.path.dirname(sys.executable), py_executable_dll_d) + pythondll_d_dest = os.path.join(os.path.dirname(py_executable), py_executable_dll_d) + if os.path.exists(pythondll): + logger.info('Also created %s' % py_executable_dll) + shutil.copyfile(pythondll, os.path.join(os.path.dirname(py_executable), py_executable_dll)) + if os.path.exists(pythondll_d): + logger.info('Also created %s' % py_executable_dll_d) + shutil.copyfile(pythondll_d, pythondll_d_dest) + elif os.path.exists(pythondll_d_dest): + logger.info('Removed %s as the source does not exist' % pythondll_d_dest) + os.unlink(pythondll_d_dest) + if is_pypy: + # make a symlink python --> pypy-c + python_executable = os.path.join(os.path.dirname(py_executable), 'python') + if sys.platform in ('win32', 'cygwin'): + python_executable += '.exe' + logger.info('Also created executable %s' % python_executable) + copyfile(py_executable, python_executable, symlink) + + if is_win: + for name in 'libexpat.dll', 'libpypy.dll', 'libpypy-c.dll', 'libeay32.dll', 'ssleay32.dll', 'sqlite.dll': + src = join(prefix, name) + if os.path.exists(src): + copyfile(src, join(bin_dir, name), symlink) + + if os.path.splitext(os.path.basename(py_executable))[0] != expected_exe: + secondary_exe = os.path.join(os.path.dirname(py_executable), + expected_exe) + py_executable_ext = os.path.splitext(py_executable)[1] + if py_executable_ext.lower() == '.exe': + # python2.4 gives an extension of '.4' :P + secondary_exe += py_executable_ext + if os.path.exists(secondary_exe): + logger.warn('Not overwriting existing %s script %s (you must use %s)' + % 
(expected_exe, secondary_exe, py_executable)) + else: + logger.notify('Also creating executable in %s' % secondary_exe) + shutil.copyfile(sys.executable, secondary_exe) + make_exe(secondary_exe) + + if '.framework' in prefix: + if 'Python.framework' in prefix: + logger.debug('MacOSX Python framework detected') + # Make sure we use the the embedded interpreter inside + # the framework, even if sys.executable points to + # the stub executable in ${sys.prefix}/bin + # See http://groups.google.com/group/python-virtualenv/ + # browse_thread/thread/17cab2f85da75951 + original_python = os.path.join( + prefix, 'Resources/Python.app/Contents/MacOS/Python') + if 'EPD' in prefix: + logger.debug('EPD framework detected') + original_python = os.path.join(prefix, 'bin/python') + shutil.copy(original_python, py_executable) + + # Copy the framework's dylib into the virtual + # environment + virtual_lib = os.path.join(home_dir, '.Python') + + if os.path.exists(virtual_lib): + os.unlink(virtual_lib) + copyfile( + os.path.join(prefix, 'Python'), + virtual_lib, + symlink) + + # And then change the install_name of the copied python executable + try: + mach_o_change(py_executable, + os.path.join(prefix, 'Python'), + '@executable_path/../.Python') + except: + e = sys.exc_info()[1] + logger.warn("Could not call mach_o_change: %s. " + "Trying to call install_name_tool instead." % e) + try: + call_subprocess( + ["install_name_tool", "-change", + os.path.join(prefix, 'Python'), + '@executable_path/../.Python', + py_executable]) + except: + logger.fatal("Could not call install_name_tool -- you must " + "have Apple's development tools installed") + raise + + if not is_win: + # Ensure that 'python', 'pythonX' and 'pythonX.Y' all exist + py_exe_version_major = 'python%s' % sys.version_info[0] + py_exe_version_major_minor = 'python%s.%s' % ( + sys.version_info[0], sys.version_info[1]) + py_exe_no_version = 'python' + required_symlinks = [ py_exe_no_version, py_exe_version_major, + py_exe_version_major_minor ] + + py_executable_base = os.path.basename(py_executable) + + if py_executable_base in required_symlinks: + # Don't try to symlink to yourself. + required_symlinks.remove(py_executable_base) + + for pth in required_symlinks: + full_pth = join(bin_dir, pth) + if os.path.exists(full_pth): + os.unlink(full_pth) + if symlink: + os.symlink(py_executable_base, full_pth) + else: + shutil.copyfile(py_executable_base, full_pth) + + if is_win and ' ' in py_executable: + # There's a bug with subprocess on Windows when using a first + # argument that has a space in it. 
Instead we have to quote + # the value: + py_executable = '"%s"' % py_executable + # NOTE: keep this check as one line, cmd.exe doesn't cope with line breaks + cmd = [py_executable, '-c', 'import sys;out=sys.stdout;' + 'getattr(out, "buffer", out).write(sys.prefix.encode("utf-8"))'] + logger.info('Testing executable with %s %s "%s"' % tuple(cmd)) + try: + proc = subprocess.Popen(cmd, + stdout=subprocess.PIPE) + proc_stdout, proc_stderr = proc.communicate() + except OSError: + e = sys.exc_info()[1] + if e.errno == errno.EACCES: + logger.fatal('ERROR: The executable %s could not be run: %s' % (py_executable, e)) + sys.exit(100) + else: + raise e + + proc_stdout = proc_stdout.strip().decode("utf-8") + proc_stdout = os.path.normcase(os.path.abspath(proc_stdout)) + norm_home_dir = os.path.normcase(os.path.abspath(home_dir)) + if hasattr(norm_home_dir, 'decode'): + norm_home_dir = norm_home_dir.decode(sys.getfilesystemencoding()) + if proc_stdout != norm_home_dir: + logger.fatal( + 'ERROR: The executable %s is not functioning' % py_executable) + logger.fatal( + 'ERROR: It thinks sys.prefix is %r (should be %r)' + % (proc_stdout, norm_home_dir)) + logger.fatal( + 'ERROR: virtualenv is not compatible with this system or executable') + if is_win: + logger.fatal( + 'Note: some Windows users have reported this error when they ' + 'installed Python for "Only this user" or have multiple ' + 'versions of Python installed. Copying the appropriate ' + 'PythonXX.dll to the virtualenv Scripts/ directory may fix ' + 'this problem.') + sys.exit(100) + else: + logger.info('Got sys.prefix result: %r' % proc_stdout) + + pydistutils = os.path.expanduser('~/.pydistutils.cfg') + if os.path.exists(pydistutils): + logger.notify('Please make sure you remove any previous custom paths from ' + 'your %s file.' % pydistutils) + ## FIXME: really this should be calculated earlier + + fix_local_scheme(home_dir, symlink) + + if site_packages: + if os.path.exists(site_packages_filename): + logger.info('Deleting %s' % site_packages_filename) + os.unlink(site_packages_filename) + + return py_executable + + +def install_activate(home_dir, bin_dir, prompt=None): + home_dir = os.path.abspath(home_dir) + if is_win or is_jython and os._name == 'nt': + files = { + 'activate.bat': ACTIVATE_BAT, + 'deactivate.bat': DEACTIVATE_BAT, + 'activate.ps1': ACTIVATE_PS, + } + + # MSYS needs paths of the form /c/path/to/file + drive, tail = os.path.splitdrive(home_dir.replace(os.sep, '/')) + home_dir_msys = (drive and "/%s%s" or "%s%s") % (drive[:1], tail) + + # Run-time conditional enables (basic) Cygwin compatibility + home_dir_sh = ("""$(if [ "$OSTYPE" "==" "cygwin" ]; then cygpath -u '%s'; else echo '%s'; fi;)""" % + (home_dir, home_dir_msys)) + files['activate'] = ACTIVATE_SH.replace('__VIRTUAL_ENV__', home_dir_sh) + + else: + files = {'activate': ACTIVATE_SH} + + # suppling activate.fish in addition to, not instead of, the + # bash script support. + files['activate.fish'] = ACTIVATE_FISH + + # same for csh/tcsh support... 
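    # (activate.csh is added next; every template collected in `files` is
    #  later written into the environment's bin/Scripts directory with
    #  placeholders such as __VIRTUAL_ENV__, __VIRTUAL_PROMPT__,
    #  __VIRTUAL_NAME__ and __BIN_NAME__ replaced by the concrete values
    #  for this environment.)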
+ files['activate.csh'] = ACTIVATE_CSH + + files['activate_this.py'] = ACTIVATE_THIS + if hasattr(home_dir, 'decode'): + home_dir = home_dir.decode(sys.getfilesystemencoding()) + vname = os.path.basename(home_dir) + for name, content in files.items(): + content = content.replace('__VIRTUAL_PROMPT__', prompt or '') + content = content.replace('__VIRTUAL_WINPROMPT__', prompt or '(%s)' % vname) + content = content.replace('__VIRTUAL_ENV__', home_dir) + content = content.replace('__VIRTUAL_NAME__', vname) + content = content.replace('__BIN_NAME__', os.path.basename(bin_dir)) + writefile(os.path.join(bin_dir, name), content) + +def install_distutils(home_dir): + distutils_path = change_prefix(distutils.__path__[0], home_dir) + mkdir(distutils_path) + ## FIXME: maybe this prefix setting should only be put in place if + ## there's a local distutils.cfg with a prefix setting? + home_dir = os.path.abspath(home_dir) + ## FIXME: this is breaking things, removing for now: + #distutils_cfg = DISTUTILS_CFG + "\n[install]\nprefix=%s\n" % home_dir + writefile(os.path.join(distutils_path, '__init__.py'), DISTUTILS_INIT) + writefile(os.path.join(distutils_path, 'distutils.cfg'), DISTUTILS_CFG, overwrite=False) + +def fix_local_scheme(home_dir, symlink=True): + """ + Platforms that use the "posix_local" install scheme (like Ubuntu with + Python 2.7) need to be given an additional "local" location, sigh. + """ + try: + import sysconfig + except ImportError: + pass + else: + if sysconfig._get_default_scheme() == 'posix_local': + local_path = os.path.join(home_dir, 'local') + if not os.path.exists(local_path): + os.mkdir(local_path) + for subdir_name in os.listdir(home_dir): + if subdir_name == 'local': + continue + cp_or_ln = (os.symlink if symlink else copyfile) + cp_or_ln(os.path.abspath(os.path.join(home_dir, subdir_name)), \ + os.path.join(local_path, subdir_name)) + +def fix_lib64(lib_dir, symlink=True): + """ + Some platforms (particularly Gentoo on x64) put things in lib64/pythonX.Y + instead of lib/pythonX.Y. If this is such a platform we'll just create a + symlink so lib64 points to lib + """ + if [p for p in distutils.sysconfig.get_config_vars().values() + if isinstance(p, basestring) and 'lib64' in p]: + # PyPy's library path scheme is not affected by this. + # Return early or we will die on the following assert. 
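    # (The list comprehension above detects lib64-based layouts by looking
    #  for 'lib64' in distutils' configuration values; on such platforms a
    #  lib64 -> lib symlink is created further down, except on PyPy.)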
+ if is_pypy: + logger.debug('PyPy detected, skipping lib64 symlinking') + return + + logger.debug('This system uses lib64; symlinking lib64 to lib') + + assert os.path.basename(lib_dir) == 'python%s' % sys.version[:3], ( + "Unexpected python lib dir: %r" % lib_dir) + lib_parent = os.path.dirname(lib_dir) + top_level = os.path.dirname(lib_parent) + lib_dir = os.path.join(top_level, 'lib') + lib64_link = os.path.join(top_level, 'lib64') + assert os.path.basename(lib_parent) == 'lib', ( + "Unexpected parent dir: %r" % lib_parent) + if os.path.lexists(lib64_link): + return + cp_or_ln = (os.symlink if symlink else copyfile) + cp_or_ln('lib', lib64_link) + +def resolve_interpreter(exe): + """ + If the executable given isn't an absolute path, search $PATH for the interpreter + """ + # If the "executable" is a version number, get the installed executable for + # that version + python_versions = get_installed_pythons() + if exe in python_versions: + exe = python_versions[exe] + + if os.path.abspath(exe) != exe: + paths = os.environ.get('PATH', '').split(os.pathsep) + for path in paths: + if os.path.exists(os.path.join(path, exe)): + exe = os.path.join(path, exe) + break + if not os.path.exists(exe): + logger.fatal('The executable %s (from --python=%s) does not exist' % (exe, exe)) + raise SystemExit(3) + if not is_executable(exe): + logger.fatal('The executable %s (from --python=%s) is not executable' % (exe, exe)) + raise SystemExit(3) + return exe + +def is_executable(exe): + """Checks a file is executable""" + return os.access(exe, os.X_OK) + +############################################################ +## Relocating the environment: + +def make_environment_relocatable(home_dir): + """ + Makes the already-existing environment use relative paths, and takes out + the #!-based environment selection in scripts. + """ + home_dir, lib_dir, inc_dir, bin_dir = path_locations(home_dir) + activate_this = os.path.join(bin_dir, 'activate_this.py') + if not os.path.exists(activate_this): + logger.fatal( + 'The environment doesn\'t have a file %s -- please re-run virtualenv ' + 'on this environment to update it' % activate_this) + fixup_scripts(home_dir, bin_dir) + fixup_pth_and_egg_link(home_dir) + ## FIXME: need to fix up distutils.cfg + +OK_ABS_SCRIPTS = ['python', 'python%s' % sys.version[:3], + 'activate', 'activate.bat', 'activate_this.py'] + +def fixup_scripts(home_dir, bin_dir): + if is_win: + new_shebang_args = ( + '%s /c' % os.path.normcase(os.environ.get('COMSPEC', 'cmd.exe')), + '', '.exe') + else: + new_shebang_args = ('/usr/bin/env', sys.version[:3], '') + + # This is what we expect at the top of scripts: + shebang = '#!%s' % os.path.normcase(os.path.join( + os.path.abspath(bin_dir), 'python%s' % new_shebang_args[2])) + # This is what we'll put: + new_shebang = '#!%s python%s%s' % new_shebang_args + + for filename in os.listdir(bin_dir): + filename = os.path.join(bin_dir, filename) + if not os.path.isfile(filename): + # ignore subdirs, e.g. .svn ones. + continue + f = open(filename, 'rb') + try: + try: + lines = f.read().decode('utf-8').splitlines() + except UnicodeDecodeError: + # This is probably a binary program instead + # of a script, so just ignore it. 
+ continue + finally: + f.close() + if not lines: + logger.warn('Script %s is an empty file' % filename) + continue + + old_shebang = lines[0].strip() + old_shebang = old_shebang[0:2] + os.path.normcase(old_shebang[2:]) + + if not old_shebang.startswith(shebang): + if os.path.basename(filename) in OK_ABS_SCRIPTS: + logger.debug('Cannot make script %s relative' % filename) + elif lines[0].strip() == new_shebang: + logger.info('Script %s has already been made relative' % filename) + else: + logger.warn('Script %s cannot be made relative (it\'s not a normal script that starts with %s)' + % (filename, shebang)) + continue + logger.notify('Making script %s relative' % filename) + script = relative_script([new_shebang] + lines[1:]) + f = open(filename, 'wb') + f.write('\n'.join(script).encode('utf-8')) + f.close() + +def relative_script(lines): + "Return a script that'll work in a relocatable environment." + activate = "import os; activate_this=os.path.join(os.path.dirname(os.path.realpath(__file__)), 'activate_this.py'); exec(compile(open(activate_this).read(), activate_this, 'exec'), dict(__file__=activate_this)); del os, activate_this" + # Find the last future statement in the script. If we insert the activation + # line before a future statement, Python will raise a SyntaxError. + activate_at = None + for idx, line in reversed(list(enumerate(lines))): + if line.split()[:3] == ['from', '__future__', 'import']: + activate_at = idx + 1 + break + if activate_at is None: + # Activate after the shebang. + activate_at = 1 + return lines[:activate_at] + ['', activate, ''] + lines[activate_at:] + +def fixup_pth_and_egg_link(home_dir, sys_path=None): + """Makes .pth and .egg-link files use relative paths""" + home_dir = os.path.normcase(os.path.abspath(home_dir)) + if sys_path is None: + sys_path = sys.path + for path in sys_path: + if not path: + path = '.' 
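        # Only paths that resolve to directories inside this environment are
        # considered; writable .pth and .egg-link files found in them are
        # rewritten below to use relative paths.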
+ if not os.path.isdir(path): + continue + path = os.path.normcase(os.path.abspath(path)) + if not path.startswith(home_dir): + logger.debug('Skipping system (non-environment) directory %s' % path) + continue + for filename in os.listdir(path): + filename = os.path.join(path, filename) + if filename.endswith('.pth'): + if not os.access(filename, os.W_OK): + logger.warn('Cannot write .pth file %s, skipping' % filename) + else: + fixup_pth_file(filename) + if filename.endswith('.egg-link'): + if not os.access(filename, os.W_OK): + logger.warn('Cannot write .egg-link file %s, skipping' % filename) + else: + fixup_egg_link(filename) + +def fixup_pth_file(filename): + lines = [] + prev_lines = [] + f = open(filename) + prev_lines = f.readlines() + f.close() + for line in prev_lines: + line = line.strip() + if (not line or line.startswith('#') or line.startswith('import ') + or os.path.abspath(line) != line): + lines.append(line) + else: + new_value = make_relative_path(filename, line) + if line != new_value: + logger.debug('Rewriting path %s as %s (in %s)' % (line, new_value, filename)) + lines.append(new_value) + if lines == prev_lines: + logger.info('No changes to .pth file %s' % filename) + return + logger.notify('Making paths in .pth file %s relative' % filename) + f = open(filename, 'w') + f.write('\n'.join(lines) + '\n') + f.close() + +def fixup_egg_link(filename): + f = open(filename) + link = f.readline().strip() + f.close() + if os.path.abspath(link) != link: + logger.debug('Link in %s already relative' % filename) + return + new_link = make_relative_path(filename, link) + logger.notify('Rewriting link %s in %s as %s' % (link, filename, new_link)) + f = open(filename, 'w') + f.write(new_link) + f.close() + +def make_relative_path(source, dest, dest_is_directory=True): + """ + Make a filename relative, where the filename is dest, and it is + being referred to from the filename source. + + >>> make_relative_path('/usr/share/something/a-file.pth', + ... '/usr/share/another-place/src/Directory') + '../another-place/src/Directory' + >>> make_relative_path('/usr/share/something/a-file.pth', + ... '/home/user/src/Directory') + '../../../home/user/src/Directory' + >>> make_relative_path('/usr/share/a-file.pth', '/usr/share/') + './' + """ + source = os.path.dirname(source) + if not dest_is_directory: + dest_filename = os.path.basename(dest) + dest = os.path.dirname(dest) + dest = os.path.normpath(os.path.abspath(dest)) + source = os.path.normpath(os.path.abspath(source)) + dest_parts = dest.strip(os.path.sep).split(os.path.sep) + source_parts = source.strip(os.path.sep).split(os.path.sep) + while dest_parts and source_parts and dest_parts[0] == source_parts[0]: + dest_parts.pop(0) + source_parts.pop(0) + full_parts = ['..']*len(source_parts) + dest_parts + if not dest_is_directory: + full_parts.append(dest_filename) + if not full_parts: + # Special case for the current directory (otherwise it'd be '') + return './' + return os.path.sep.join(full_parts) + + + +############################################################ +## Bootstrap script creation: + +def create_bootstrap_script(extra_text, python_version=''): + """ + Creates a bootstrap script, which is like this script but with + extend_parser, adjust_options, and after_install hooks. + + This returns a string that (written to disk of course) can be used + as a bootstrap script with your own customizations. The script + will be the standard virtualenv.py script, with your extra text + added (your extra text should be Python code). 
+ + If you include these functions, they will be called: + + ``extend_parser(optparse_parser)``: + You can add or remove options from the parser here. + + ``adjust_options(options, args)``: + You can change options here, or change the args (if you accept + different kinds of arguments, be sure you modify ``args`` so it is + only ``[DEST_DIR]``). + + ``after_install(options, home_dir)``: + + After everything is installed, this function is called. This + is probably the function you are most likely to use. An + example would be:: + + def after_install(options, home_dir): + subprocess.call([join(home_dir, 'bin', 'easy_install'), + 'MyPackage']) + subprocess.call([join(home_dir, 'bin', 'my-package-script'), + 'setup', home_dir]) + + This example immediately installs a package, and runs a setup + script from that package. + + If you provide something like ``python_version='2.5'`` then the + script will start with ``#!/usr/bin/env python2.5`` instead of + ``#!/usr/bin/env python``. You can use this when the script must + be run with a particular Python version. + """ + filename = __file__ + if filename.endswith('.pyc'): + filename = filename[:-1] + f = codecs.open(filename, 'r', encoding='utf-8') + content = f.read() + f.close() + py_exe = 'python%s' % python_version + content = (('#!/usr/bin/env %s\n' % py_exe) + + '## WARNING: This file is generated\n' + + content) + return content.replace('##EXT' 'END##', extra_text) + + +import os, subprocess +def after_install(options, home_dir): + etc = join(home_dir, 'etc') + print join(home_dir, 'bin', 'easy_install') + if not os.path.exists(etc): + os.makedirs(etc) + print "load distribute" + subprocess.call(["curl", "-O", "https://bitbucket.org/pypa/setuptools/raw/bootstrap/ez_setup.py"]) + print "install setuptools" + subprocess.call([join(home_dir, 'bin', 'python'), 'ez_setup.py']) + print "install bigjob" + subprocess.call([join(home_dir, 'bin', 'easy_install'), 'bigjob']) + + +def convert(s): + b = base64.b64decode(s.encode('ascii')) + return zlib.decompress(b).decode('utf-8') + +##file site.py +SITE_PY = convert(""" +eJzFPf1z2zaWv/OvwMqToZTIdOJ0e3tOnRsncVrvuYm3SWdz63q0lARZrCmSJUjL2pu7v/3eBwAC +JCXbm+6cphNLJPDw8PC+8PAeOhgMTopCZnOxyud1KoWScTlbiiKulkos8lJUy6Sc7xdxWW3g6ewm +vpZKVLlQGxVhqygInn7lJ3gqPi8TZVCAb3Fd5au4SmZxmm5EsiryspJzMa/LJLsWSZZUSZwm/4AW +eRaJp1+PQXCWCZh5mshS3MpSAVwl8oW42FTLPBPDusA5v4j+GL8cjYWalUlRQYNS4wwUWcZVkEk5 +BzShZa2AlEkl91UhZ8kimdmG67xO56JI45kUf/87T42ahmGg8pVcL2UpRQbIAEwJsArEA74mpZjl +cxkJ8UbOYhyAnzfEChjaGNdMIRmzXKR5dg1zyuRMKhWXGzGc1hUBIpTFPAecEsCgStI0WOfljRrB +ktJ6rOGRiJk9/Mkwe8A8cfwu5wCOH7Pg5yy5GzNs4B4EVy2ZbUq5SO5EjGDhp7yTs4l+NkwWYp4s +FkCDrBphk4ARUCJNpgcFLcd3eoVeHxBWlitjGEMiytyYX1KPKDirRJwqYNu6QBopwvydnCZxBtTI +bmE4gAgkDfrGmSeqsuPQ7EQOAEpcxwqkZKXEcBUnGTDrj/GM0P5rks3ztRoRBWC1lPi1VpU7/2EP +AaC1Q4BxgItlVrPO0uRGppsRIPAZsC+lqtMKBWKelHJW5WUiFQEA1DZC3gHSYxGXUpOQOdPI7Zjo +TzRJMlxYFDAUeHyJJFkk13VJEiYWCXAucMX7jz+Jd6dvzk4+aB4zwFhmr1eAM0ChhXZwggHEQa3K +gzQHgY6Cc/wj4vkchewaxwe8mgYH9650MIS5F1G7j7PgQHa9uHoYmGMFyoTGCqjff0OXsVoCff7n +nvUOgpNtVKGJ87f1MgeZzOKVFMuY+Qs5I/hOw3kdFdXyFXCDQjgVkErh4iCCCcIDkrg0G+aZFAWw +WJpkchQAhabU1l9FYIUPebZPa93iBIBQBhm8dJ6NaMRMwkS7sF6hvjCNNzQz3SSw67zKS1IcwP/Z +jHRRGmc3hKMihuJvU3mdZBkihLwQhHshDaxuEuDEeSTOqRXpBdNIhKy9uCWKRA28hEwHPCnv4lWR +yjGLL+rW3WqEBpOVMGudMsdBy4rUK61aM9Ve3juMvrS4jtCslqUE4PXUE7pFno/FFHQ2YVPEKxav +ap0T5wQ98kSdkCeoJfTF70DRE6XqlbQvkVdAsxBDBYs8TfM1kOwoCITYw0bGKPvMCW/hHfwLcPHf +VFazZRA4I1nAGhQivw0UAgGTIDPN1RoJj9s0K7eVTJKxpsjLuSxpqIcR+4ARf2BjnGvwIa+0UePp 
+4irnq6RClTTVJjNhi5eFFevHVzxvmAZYbkU0M00bOq1wemmxjKfSuCRTuUBJ0Iv0yi47jBn0jEm2 +uBIrtjLwDsgiE7Yg/YoFlc6ikuQEAAwWvjhLijqlRgoZTMQw0Kog+KsYTXqunSVgbzbLASokNt8z +sD+A2z9AjNbLBOgzAwigYVBLwfJNk6pEB6HRR4Fv9E1/Hh849WyhbRMPuYiTVFv5OAvO6OFpWZL4 +zmSBvcaaGApmmFXo2l1nQEcU88FgEATGHdoo8zVXQVVujoAVhBlnMpnWCRq+yQRNvf6hAh5FOAN7 +3Ww7Cw80hOn0AajkdFmU+Qpf27l9AmUCY2GPYE9ckJaR7CB7nPgKyeeq9MI0RdvtsLNAPRRc/HT6 +/uzL6SdxLC4blTZu67MrGPM0i4GtySIAU7WGbXQZtETFl6DuE+/BvBNTgD2j3iS+Mq5q4F1A/XNZ +02uYxsx7GZx+OHlzfjr5+dPpT5NPZ59PAUGwMzLYoymjeazBYVQRCAdw5VxF2r4GnR704M3JJ/sg +mCRq8u03wG7wZHgtK2DicggzHotwFd8pYNBwTE1HiGOnAVjwcDQSr8Xh06cvDwlasSk2AAzMrtMU +H060RZ8k2SIPR9T4V3bpj1lJaf/t8uibK3F8LMJf49s4DMCHapoyS/xI4vR5U0joWsGfYa5GQTCX +CxC9G4kCOnxKfvGIO8CSQMtc2+lf8yQz75kr3SFIfwypB+AwmczSWClsPJmEQATq0POBDhE71yh1 +Q+hYbNyuI40KfkoJC5thlzH+04NiPKV+iAaj6HYxjUBcV7NYSW5F04d+kwnqrMlkqAcEYSaJAYeL +1VAoTBPUWWUCfi1xHuqwqcpT/InwUQuQAOLWCrUkLpLeOkW3cVpLNXQmBUQcDltkREWbKOJHcFGG +YImbpRuN2tQ0PAPNgHxpDlq0bFEOP3vg74C6Mps43Ojx3otphpj+mXcahAO4nCGqe6VaUFg7iovT +C/Hy+eE+ujOw55xb6njN0UInWS3twwWslpEHRph7GXlx6bJAPYtPj3bDXEV2ZbqssNBLXMpVfivn +gC0ysLPK4id6AztzmMcshlUEvU7+AKtQ4zfGuA/l2YO0oO8A1FsRFLP+Zun3OBggMwWKiDfWRGq9 +62dTWJT5bYLOxnSjX4KtBGWJFtM4NoGzcB6ToUkEDQFecIaUWssQ1GFZs8NKeCNItBfzRrFGBO4c +NfUVfb3J8nU24Z3wMSrd4ciyLgqWZl5s0CzBnngPVgiQzGFj1xCNoYDLL1C29gF5mD5MFyhLewsA +BIZe0XbNgWW2ejRF3jXisAhj9EqQ8JYS/YVbMwRttQwxHEj0NrIPjJZASDA5q+CsatBMhrJmmsHA +Dkl8rjuPeAvqA2hRMQKzOdTQuJGh3+URKGdx7iolpx9a5C9fvjDbqCXFVxCxKU4aXYgFGcuo2IBh +TUAnGI+MozXEBmtwbgFMrTRriv1PIi/YG4P1vNCyDX4A7O6qqjg6OFiv15GOLuTl9YFaHPzxT99+ ++6fnrBPnc+IfmI4jLTrUFh3QO/Roo++MBXptVq7Fj0nmcyPBGkryysgVRfy+r5N5Lo72R1Z/Ihc3 +Zhr/Na4MKJCJGZSpDLQdNBg9UftPopdqIJ6QdbZthyP2S7RJtVbMt7rQo8rBEwC/ZZbXaKobTlDi +GVg32KHP5bS+Du3gno00P2CqKKdDywP7L64QA58zDF8ZUzxBLUFsgRbfIf1PzDYxeUdaQyB50UR1 +ds+bfi1miDt/uLxbX9MRGjPDRCF3oET4TR4sgLZxV3Lwo11btHuOa2s+niEwlj4wzKsdyyEKDuGC +azF2pc7havR4QZrWrJpBwbiqERQ0OIlTprYGRzYyRJDo3ZjNPi+sbgF0akUOTXzArAK0cMfpWLs2 +KzieEPLAsXhBTyS4yEedd895aes0pYBOi0c9qjBgb6HRTufAl0MDYCwG5c8Dbmm2KR9bi8Jr0AMs +5xgQMtiiw0z4xvUBB3uDHnbqWP1tvZnGfSBwkYYci3oQdEL5mEcoFUhTMfR7bmNxS9zuYDstDjGV +WSYSabVFuNrKo1eodhqmRZKh7nUWKZqlOXjFVisSIzXvfWeB9kH4uM+YaQnUZGjI4TQ6Jm/PE8BQ +t8Pw2XWNgQY3DoMYrRJF1g3JtIR/wK2g+AYFo4CWBM2CeaiU+RP7HWTOzld/2cIeltDIEG7TbW5I +x2JoOOb9nkAy6mgMSEEGJOwKI7mOrA5S4DBngTzhhtdyq3QTjEiBnDkWhNQM4E4vvQ0OPonwBIQk +FCHfVUoW4pkYwPK1RfVhuvt35VIThBg6DchV0NGLYzey4UQ1jltRDp+h/fgGnZUUOXDwFFweN9Dv +srlhWht0AWfdV9wWKdDIFIcZjFxUrwxh3GDyH46dFg2xzCCGobyBvCMdM9IosMutQcOCGzDemrfH +0o/diAX2HYa5OpSrO9j/hWWiZrkKKWbSjl24H80VXdpYbM+T6QD+eAswGF15kGSq4xcYZfknBgk9 +6GEfdG+yGBaZx+U6yUJSYJp+x/7SdPCwpPSM3MEn2k4dwEQx4nnwvgQBoaPPAxAn1ASwK5eh0m5/ +F+zOKQ4sXO4+8Nzmy6OXV13ijrdFeOynf6lO76oyVrhaKS8aCwWuVteAo9KFycXZRh9e6sNt3CaU +uYJdpPj46YtAQnBcdx1vHjf1huERm3vn5H0M6qDX7iVXa3bELoAIakVklIPw8Rz5cGQfO7kdE3sE +kEcxzI5FMZA0n/wzcHYtFIyxP99kGEdrqwz8wOtvv5n0REZdJL/9ZnDPKC1i9In9sOUJ2pE5qWDX +bEsZp+RqOH0oqJg1rGPbFCPW57T90zx21eNzarRs7Lu/BX4MFAypS/ARno8bsnWnih/fndoKT9up +HcA6u1Xz2aNFgL19Pv0VdshKB9Vu4ySlcwWY/P4+Klezued4Rb/28CDtVDAOCfr2X+ryOXBDyNGE +UXc62hk7MQHnnl2w+RSx6qKyp3MImiMwLy/APf7sQtUWzDDucz5eOOxRTd6M+5yJr1Gr+PldNJAF +5tFg0Ef2rez4/zHL5/+aST5wKubk+ne0ho8E9HvNhI0HQ9PGw4fVv+yu3TXAHmCetridO9zC7tB8 +Vrkwzh2rJCWeou56KtaUrkCxVTwpAihz9vt64OAy6kPvt3VZ8tE1qcBClvt4HDsWmKllPL9eE7Mn +Dj7ICjGxzWYUq3byevI+NRLq6LOdSdjsG/rlbJmbmJXMbpMS+oLCHYY/fPzxNOw3IRjHhU4PtyIP +9xsQ7iOYNtTECR/Thyn0mC7/vFS1ty4+QU1GgIkIa7L12gc/EGziCP1rcE9EyDuw5WN23KHPlnJ2 +M5GUOoBsil2doPhbfI2Y2IwCP/9LxQtKYoOZzNIaacWON2YfLupsRucjlQT/SqcKY+oQJQRw+G+R 
+xtdiSJ3nGHrS3EjRqdu41N5nUeaYnCrqZH5wncyF/K2OU9zWy8UCcMHDK/0q4uEpAiXecU4DJy0q +OavLpNoACWKV67M/Sn9wGk43PNGhhyQf8zABMSHiSHzCaeN7JtzckMsEB/wTD5wk7ruxg5OsENFz +eJ/lExx1Qjm+Y0aqey5Pj4P2CDkAGABQmP9gpCN3/htJr9wDRlpzl6ioJT1SupGGnJwxhDIcYaSD +f9NPnxFd3tqC5fV2LK93Y3ndxvK6F8trH8vr3Vi6IoELa4NWRhL6AlftY43efBs35sTDnMazJbfD +3E/M8QSIojAbbCNTnALtRbb4fI+AkNp2DpzpYZM/k3BSaZlzCFyDRO7HQyy9mTfJ605nysbRnXkq +xp3dlkPk9z2IIkoVm1J3lrd5XMWRJxfXaT4FsbXojhsAY9FOJ+JYaXY7mXJ0t2WpBhf/9fmHjx+w +OYIamPQG6oaLiIYFpzJ8GpfXqitNzeavAHakln4iDnXTAPceGFnjUfb4n3eU4YGMI9aUoZCLAjwA +yuqyzdzcpzBsPddJUvo5MzkfNh2LQVYNmkltIdLJxcW7k88nAwr5Df534AqMoa0vHS4+poVt0PXf +3OaW4tgHhFrHthrj587Jo3XDEffbWAO248O3Hhw+xGD3hgn8Wf5LKQVLAoSKdPD3MYR68B7oq7YJ +HfoYRuwk/7kna+ys2HeO7DkuiiP6fccO7QH8w07cY0yAANqFGpqdQbOZail9a153UNQB+kBf76u3 +YO2tV3sn41PUTqLHAXQoa5ttd/+8cxo2ekpWb06/P/twfvbm4uTzD44LiK7cx08Hh+L0xy+C8kPQ +gLFPFGNqRIWZSGBY3EInMc/hvxojP/O64iAx9Hp3fq5PalZY6oK5z2hzInjOaUwWGgfNOAptH+r8 +I8Qo1Rskp6aI0nWo5gj3SyuuZ1G5zo+mUqUpOqu13nrpWjFTU0bn2hFIHzR2ScEgOMUMXlEWe2V2 +hSWfAOo6qx6ktI22iSEpBQU76QLO+Zc5XfECpdQZnjSdtaK/DF1cw6tIFWkCO7lXoZUl3Q3TYxrG +0Q/tATfj1acBne4wsm7Is96KBVqtVyHPTfcfNYz2Ww0YNgz2DuadSUoPoQxsTG4TITbik5xQ3sFX +u/R6DRQsGB70VbiIhukSmH0Mm2uxTGADATy5BOuL+wSA0FoJ/0DgyIkOyByzM8K3q/n+X0JNEL/1 +L7/0NK/KdP9vooBdkOBUorCHmG7jd7DxiWQkTj++H4WMHKXmir/UWB4ADgkFQB1pp/wlPkGfDJVM +Fzq/xNcH+EL7CfS61b2URam797vGIUrAEzUkr+GJMvQLMd3Lwh7jVEYt0Fj5YDHDCkI3DcF89sSn +pUxTne9+9u78FHxHLMZACeJzt1MYjuMleISuk++4wrEFCg/Y4XWJbFyiC0tJFvPIa9YbtEaRo95e +XoZdJwoMd3t1osBlnCgX7SFOm2GZcoIIWRnWwiwrs3arDVLYbUMUR5lhlphclJTA6vME8DI9jXlL +BHslLPUwEXg+RU6yymQspskM9CioXFCoYxASJC7WMxLn5RnHwPNSmTIoeFhsyuR6WeHpBnSOqAQD +m/948uX87AOVJRy+bLzuHuYc005gzEkkx5giiNEO+OKm/SFXTSZ9PKtfIQzUPvCn/YqzU455gE4/ +Dizin/YrrkM7dnaCPANQUHXRFg/cADjd+uSmkQXG1e6D8eOmADaY+WAoFollLzrRw51flxNty5Yp +obiPefmIA5xFYVPSdGc3Ja390XNcFHjONR/2N4K3fbJlPlPoetN5sy35zf10pBBLYgGjbmt/DJMd +1mmqp+Mw2zZuoW2ttrG/ZE6s1Gk3y1CUgYhDt/PIZbJ+JaybMwd6adQdYOI7ja6RxF5VPvglG2gP +w8PEEruzTzEdqYyFjABGMqSu/anBh0KLAAqEsn+HjuSOR08PvTk61uD+OWrdBbbxB1CEOheXajzy +EjgRvvzGjiO/IrRQjx6J0PFUMpnlNk8MP+slepUv/Dn2ygAFMVHsyji7lkOGNTYwn/nE3hKCJW3r +kfoyueozLOIMnNO7LRzelYv+gxODWosROu1u5KatjnzyYIPeUpCdBPPBl/EadH9RV0NeyS3n0L21 +dNuh3g8Rsw+hqT59H4YYjvkt3LI+DeBeamhY6OH9tuUUltfGOLLWPraqmkL7QnuwsxK2ZpWiYxmn +ONH4otYLaAzucWPyB/apThSyv3vqxJyYkAXKg7sgvbkNdINWOGHA5UpcOZpQOnxTTaPfzeWtTMFo +gJEdYrXDr7baYRTZcEpvHthXY3exudj040ZvGsyOTDkGemaqgPWLMlkdIDq9EZ9dmDXI4FL/orck +cXZDXvLbv56NxdsPP8G/b+RHMKVY/DgWfwM0xNu8hP0lV+/StQpYyVHxxjGvFVZIEjQ6quAbKNBt +u/DojMciusTEry2xmlJgVm254mtPAEWeIFW0N36CKZyA36ayq+WNGk+xb1EG+iXSYHuxCxaIHOiW +0bJapWgvnChJs5qXg/Ozt6cfPp1G1R1yuPk5cKIofkIWTkefEZd4HjYW9smsxidXjuP8g0yLHr9Z +bzpN4QxuOkUI+5LCbjT5So3Ybi7iEiMHotjM81mELYHluVavWoMjPXL2l/caes/KIqzhSJ+iNd48 +PgZqiF/aimgADamPnhP1JITiKRaN8eNo0G+Kx4JC2/Dn6c167kbGdfUPTbCNaTProd/d6sIl01nD +s5xEeB3bZTAFoWkSq9V05hYKfsyEvhEFtBydc8hFXKeVkBlILm3y6WoK0PRubR9LCLMKmzMqeKMw +TbqON8pJQoqVGOCoA6quxwMZihjCHvzH+IbtARYdipproQE6IUr7p9zpqurZkiWYt0REvZ7Eg3WS +vXTzeTSFeVDeIc8aRxbmiW4jY3QtKz1/fjAcXb5oMh0oKj3zKntnBVg9l032QHUWT58+HYj/uN/7 +YVSiNM9vwC0D2L1eyzm93mK59eTsanU9e/MmAn6cLeUlPLii6Ll9XmcUmtzRlRZE2r8GRohrE1pm +NO1bdpmDdiUfNHMLPrDSluPnLKF7jzC0JFHZ6uujMOxkpIlYEhRDGKtZkoQcpoD12OQ1FuVhmFHz +i7wDjk8QzBjf4gkZb7WX6GFSAq3lHovOsRgQ4AHllvFoVNVMZWmA5+Rio9GcnGVJ1dSTPHcPT/Vd +AJW9zkjzlYjXKBlmHi1iOPWdHqs2Hna+k0W9HUs+u3QDjq1Z8uv7cAfWBknLFwuDKTw0izTLZTkz +5hRXLJkllQPGtEM43JlucSLrEwU9KA1AvZNVmFuJtm//YNfFxfQjnSPvm5F0+lBlb8bi4FCctRIM +o6gZn8JQlpCWb82XEYzygcLa2hPwxhJ/0EFVLCbwLvBw6xrrTF/MwfkbzW0dAIcug7IK0rKjpyOc 
+G8gsfGbaLddp4Ie26ITbbVJWdZxO9P0PE3TYJvZgXeNp6+F2VnpabwWc/Bw84H2dug+Og8myQXpi +6q0pzTgWCx2iiNwSM78aq8jRyztkXwl8CqTMfGIKo00Q6dKyq6041TmbjopHUM9MFdMWz9yUz3Qq +T1zMx5TnZOoetnjRfgop3WEhXovhy7E4bG2BZsUGr3QCZJ/MQ98Vo24wFScqYObYviBDvD4Wwxdj +8ccd0KMtAxwduiO0N7QtCFuBvLx6NBnTZEpkC/ty2e/vq5MZQdMzjqOrNvm7ZPqOqPTvLSpxqaDO +WH7Rzlhujb11A9v5+EiGK1Aci0TO958oJKFGutHN2xmc8MNK+j2brKWLyJvSGqqgm8JmZN3oQUcj +GrfZDmKq07X64kJe1DVsOO3lAyZfppWzaK+bw3xGjV6LqABg0nemht/wkhd4r0nh+mdbz1p1NYAF +2xNK0CWffHLWNGwE9V5H8FEa4B5GESGeqjaKwpWsR4hISBfiEBM9a51mOxz/uzMP1xpsOxPtYPnt +N7vwdAWzt7qjZ0F3l1x4ImvrLJrlNp/+CJ3HKH1dv0pgHCiN6ICzau6sJDfzCNOY+TKa3KYzr/BW +SDqiRpOYStdt4q00X/+FfgzFDiirDNYCPKl6gSfKt3TJ5Ymi7De8q+abwxdjUyLMgPQEXkYvn+m7 +IKmbuQXB97HHeu8GL3W/w+jfHGBJ5fe2rzq7GZrWcetSKH+wkMJoo2hi6dAYpvsLQpo1iwVentgQ +k31rexPIe/B2puDnmFtQc3DYYEMa9aHraoxGerepti0CfL/J2CY5D+raKFJEepewbVOeuxTno0VB +9+q3IBhCQM5fxvwGXcG6OLIhNmNT8Ah06KZ14qe66S1AY3uCxra6CXdNn/vvmrtuEdiZm6yGztz9 +QlOXBrrvdivaRwMOb2hCPKhWotH4/cbEtQNjnUzTH6rXHyS/2wlnusWs3AfGpO5g4J/YU2NvzP4q +nrnfMTNsn29mduuKe52N1rQ7NqPN8Q/xFDgLBp/bqwYotWmuOZD3S3TV3oSTZSfy+lpNYrzmcUKb +bErs6uyezLbtPd3SJ2O1MbstvL0IQBhu0im4bpY9MAboSr5umvOinGtqBA1N2cNOOrJK5mwS9NYO +wEUcMaX+JiLP+cSDVGKgW9VlUcJueKAvJeaEnb4c5waoCeCtYnVjUDc9xvqOWlKslCVmapE5TtvK +9gEisBHvmIbJxL4DXnne3LeQjC0zyKxeyTKumruG/NSABDZdzQhUfY6L64TnGqlscYmLWGJ5w0EK +A2T2+zPYWHqb6h0XLIystns4O1EPHfJ9zN0NjjEyXJzc2XsG3fut5nTHtesd2mYN19m7lWAZzKV5 +pCN1rIzf6ou8+LJZjuSjf+nwD8i7W4Lpp6NbdcberUXDeeYqhO7NTXh1ABnnvgsZOxzQvXqxtQG2 +4/v6wjJKx8Pc0thSUfvkvQqnGW3URJAwc/SeCJJfHfDICJIH/4ERJH19JhgajY/WA731AveEmlg9 +uHdRNowAfSZAJDzJbt1kaEzl0M2+L3KV3A3szdKsK52SPmMekCO7l5QRCL5zUrmpyt6dcLsiSL50 +0ePvzz++OTknWkwuTt7+58n3lJ2FxyUtW/XgEFuW7zO19708cDfcpjNq+gZvsO25KpaLmTSEzvtO +MkIPhP7Ctb4FbSsy9/W2Dp0CoG4nQHz3tFtQt6nsXsgdv0wXm7h5NK2E7UA/5exa88tJUTCPzEkd +i0NzEmfeN4cnWkY7seVtC+fkuXbVifZX9XWgW+LeI5ttTSuAZybIX/bIxJTO2MA8Oyjt/98HpYhj +2aG5SgekcCadKx3pNkcGVfn/Y5ESlF2Mezt2FMf2km5qx8dDyt4+j2e/MxkZgnh1f4Pu/Fxhn8t0 +CxWCgBWevrCQETH6Tx+o2vSDJ0pc7lOF8T4qmyv7C9dMO7d/TTDJoLIXfynOVOJjVmi8qFM3ccD2 +6XQgp49Oo/KFU9ICmu8A6NyIpwL2Rn+JFeJ0I0LYOGqXDLNkiY761j4HebSbDvaGVs/F/rb6U7f+ +UogX2xvOWyWeusch91D39FC1qfJzLDCma24rLBWvCTIfZwq66ctzPvAMXW/74evt5Ysje7iA/I6v +HUVCaWUDx7BfOmmZO2+XdLoTs5RjytvDvZoTEtYtrhyo7BNs29t0alO27H9MngNDGnjv+0Nmpod3 +B/+gjallvSOYkhg+USOallPNo3G3T0bd6TZqqwuEK5MeAKSjAgEWgunoRidTdMPp3sPnejc4rele +XveEKXSkgrLGfI7gHsb3a/Brd6eK4gd1ZxRNf27Q5kC95CDc7Dtwq5EXCtluEtpTb/hgiwvAxdn9 +/V88oH83n9F2P9zlV9tWL3sLAtmXxRRYzAxqkcg8jsDIgN4ckrbGugkj6HgfTUNHl6GauSFfoONH +abV46zZtMMiZnWgPwBqF4P8ACHXrHw== +""") + +##file activate.sh +ACTIVATE_SH = convert(""" +eJytVVFvokAQfudXTLEPtTlLeo9tvMSmJpq02hSvl7u2wRUG2QR2DSxSe7n/frOACEVNLlceRHa+ +nfl25pvZDswCnoDPQ4QoTRQsENIEPci4CsBMZBq7CAsuLOYqvmYKTTj3YxnBgiXBudGBjUzBZUJI +BXEqgCvweIyuCjeG4eF2F5x14bcB9KQiQQWrjSddI1/oQIx6SYYeoFjzWIoIhYI1izlbhJjkKO7D +M/QEmKfO9O7WeRo/zr4P7pyHwWxkwitcgwpQ5Ej96OX+PmiFwLeVjFUOrNYKaq1Nud3nR2n8nI2m +k9H0friPTGVsUdptaxGrTEfpNVFEskxpXtUkkCkl1UNF9cgLBkx48J4EXyALuBtAwNYIjF5kcmUU +abMKmMq1ULoiRbgsDEkTSsKSGFCJ6Z8vY/2xYiSacmtyAfCDdCNTVZoVF8vSTQOoEwSnOrngBkws +MYGMBMg8/bMBLSYKS7pYEXP0PqT+ZmBT0Xuy+Pplj5yn4aM9nk72JD8/Wi+Gr98sD9eWSMOwkapD +BbUv91XSvmyVkICt2tmXR4tWmrcUCsjWOpw87YidEC8i0gdTSOFhouJUNxR+4NYBG0MftoCTD9F7 +2rTtxG3oPwY1b2HncYwhrlmj6Wq924xtGDWqfdNxap+OYxplEurnMVo9RWks+rH8qKEtx7kZT5zJ +4H7oOFclrN6uFe+d+nW2aIUsSgs/42EIPuOhXq+jEo3S6tX6w2ilNkDnIpHCWdEQhFgwj9pkk7FN +l/y5eQvRSIQ5+TrL05lewxWpt/Lbhes5cJF3mLET1MGhcKCF+40tNWnUulxrpojwDo2sObdje3Bz +N3QeHqf3D7OjEXMVV8LN3ZlvuzoWHqiUcNKHtwNd0IbvPGKYYM31nPKCgkUILw3KL+Y8l7aO1ArS 
+Ad37nIU0fCj5NE5gQCuC5sOSu+UdI2NeXg/lFkQIlFpdWVaWZRfvqGiirC9o6liJ9FXGYrSY9mI1 +D/Ncozgn13vJvsznr7DnkJWXsyMH7e42ljdJ+aqNDF1bFnKWFLdj31xtaJYK6EXFgqmV/ymD/ROG ++n8O9H8f5vsGOWXsL1+1k3g= +""") + +##file activate.fish +ACTIVATE_FISH = convert(""" +eJyVVWFv2jAQ/c6vuBoqQVWC9nVSNVGVCaS2VC2rNLWVZZILWAs2s52wVvvxsyEJDrjbmgpK7PP5 +3bt3d22YLbmGlGcIq1wbmCPkGhPYcLMEEsGciwGLDS+YwSjlekngLFVyBe73GXSXxqw/DwbuTS8x +yyKpFr1WG15lDjETQhpQuQBuIOEKY5O9tlppLqxHKSDByjVAPwEy+mXtCq5MzjIUBTCRgEKTKwFG +gpBqxTLYXgN2myspVigMaYF92tZSowGZJf4mFExxNs9Qb614CgZtmH0BpEOn11f0cXI/+za8pnfD +2ZjA1sg9zlV/8QvcMhxbNu0QwgYokn/d+n02nt6Opzcjcnx1vXcIoN74O4ymWQXmHURfJw9jenc/ +vbmb0enj6P5+cuVhqlKm3S0u2XRtRbA2QQAhV7VhBF0rsgUX9Ur1rBUXJgVSy8O751k8mzY5OrKH +RW3eaQhYGTr8hrXO59ALhxQ83mCsDLAid3T72CCSdJhaFE+fXgicXAARUiR2WeVO37gH3oYHzFKo +9k7CaPZ1UeNwH1tWuXA4uFKYYcEa8vaKqXl7q1UpygMPhFLvlVKyNzsSM3S2km7UBOl4xweUXk5u +6e3wZmQ9leY1XE/Ili670tr9g/5POBBpGIJXCCF79L1siarl/dbESa8mD8PL61GpzqpzuMS7tqeB +1YkALrRBloBMbR9yLcVx7frQAgUqR7NZIuzkEu110gbNit1enNs82Rx5utq7Z3prU78HFRgulqNC +OTwbqJa9vkJFclQgZSjbKeBgSsUtCtt9D8OwAbIVJuewQdfvQRaoFE9wd1TmCuRG7OgJ1bVXGHc7 +z5WDL/WW36v2oi37CyVBak61+yPBA9C1qqGxzKQqZ0oPuocU9hpud0PIp8sDHkXR1HKkNlzjuUWA +a0enFUyzOWZA4yXGP+ZMI3Tdt2OuqU/SO4q64526cPE0A7ZyW2PMbWZiZ5HamIZ2RcCKLXhcDl2b +vXL+eccQoRzem80mekPDEiyiWK4GWqZmwxQOmPM0eIfgp1P9cqrBsewR2p/DPMtt+pfcYM+Ls2uh +hALufTAdmGl8B1H3VPd2af8fQAc4PgqjlIBL9cGQqNpXaAwe3LrtVn8AkZTUxg== +""") + +##file activate.csh +ACTIVATE_CSH = convert(""" +eJx9VG1P2zAQ/u5fcYQKNgTNPtN1WxlIQ4KCUEGaxuQ6yYVYSuzKdhqVX7+zk3bpy5YPUXL3PPfc +ne98DLNCWshliVDV1kGCUFvMoJGugMjq2qQIiVSxSJ1cCofD1BYRnOVGV0CfZ0N2DD91DalQSjsw +tQLpIJMGU1euvPe7QeJlkKzgWixlhnAt4aoUVsLnLBiy5NtbJWQ5THX1ZciYKKWwkOFaE04dUm6D +r/zh7pq/3D7Nnid3/HEy+wFHY/gEJydg0aFaQrBFgz1c5DG1IhTs+UZgsBC2GMFBlaeH+8dZXwcW +VPvCjXdlAvCfQsE7al0+07XjZvrSCUevR5dnkVeKlFYZmUztG4BdzL2u9KyLVabTU0bdfg7a0hgs +cSmUg6UwUiQl2iHrcbcVGNvPCiLOe7+cRwG13z9qRGgx2z6DHjfm/Op2yqeT+xvOLzs0PTKHDz2V +tkckFHoQfQRXoGJAj9el0FyJCmEMhzgMS4sB7KPOE2ExoLcSieYwDvR+cP8cg11gKkVJc2wRcm1g +QhYFlXiTaTfO2ki0fQoiFM4tLuO4aZrhOzqR4dIPcWx17hphMBY+Srwh7RTyN83XOWkcSPh1Pg/k +TXX/jbJTbMtUmcxZ+/bbqOsy82suFQg/BhdSOTRhMNBHlUarCpU7JzBhmkKmRejKOQzayQe6MWoa +n1wqWmuh6LZAaHxcdeqIlVLhIBJdO9/kbl0It2oEXQj+eGjJOuvOIR/YGRqvFhttUB2XTvLXYN2H +37CBdbW2W7j2r2+VsCn0doVWcFG1/4y1VwBjfwAyoZhD +""") + +##file activate.bat +ACTIVATE_BAT = convert(""" +eJx9UdEKgjAUfW6wfxjiIH+hEDKUFHSKLCMI7kNOEkIf9P9pTJ3OLJ/03HPPPed4Es9XS9qqwqgT +PbGKKOdXL4aAFS7A4gvAwgijuiKlqOpGlATS2NeMLE+TjJM9RkQ+SmqAXLrBo1LLIeLdiWlD6jZt +r7VNubWkndkXaxg5GO3UaOOKS6drO3luDDiO5my3iA0YAKGzPRV1ack8cOdhysI0CYzIPzjSiH5X +0QcvC8Lfaj0emsVKYF2rhL5L3fCkVjV76kShi59NHwDniAHzkgDgqBcwOgTMx+gDQQqXCw== +""") + +##file deactivate.bat +DEACTIVATE_BAT = convert(""" +eJxzSE3OyFfIT0vj4ipOLVEI8wwKCXX0iXf1C7Pl4spMU0hJTcvMS01RiPf3cYmHyQYE+fsGhCho +cCkAAUibEkTEVhWLMlUlLk6QGixStlyaeCyJDPHw9/Pw93VFsQguim4ZXAJoIUw5DhX47XUM8UCx +EchHtwsohN1bILUgw61c/Vy4AJYPYm4= +""") + +##file activate.ps1 +ACTIVATE_PS = convert(""" +eJylWdmS40Z2fVeE/oHT6rCloNUEAXDThB6wAyQAEjsB29GBjdgXYiWgmC/zgz/Jv+AEWNVd3S2N +xuOKYEUxM+/Jmzfvcm7W//zXf/+wUMOoXtyi1F9kbd0sHH/hFc2iLtrK9b3FrSqyxaVQwr8uhqJd +uHaeg9mqzRdR8/13Pyy8qPLdJh0+LMhi0QCoXxYfFh9WtttEnd34H8p6/f1300KauwrULws39e18 +0ZaLNm9rgN/ZVf3h++/e124Vlc0vKsspHy+Yyi5+XbzPhijvCtduoiL/kA1ukWV27n0o7Sb8LIFj +CvWR5GQgUJdp1Pw8TS9+rPy6SDv/+e3d+0+4qw8f3v20+PliV37efEYBAB9FTKC+RHn/Cfxn3rdv +00Fube5O+iyCtHDs9BfPfz3q4sfFv9d91Ljhfy7ei0VO+nVTtdOkv/jpt0l2AX6iG1jXgKnnDuD4 +ke2k/i8fzzz5UedkVcP4pwF+Wvz2FJl+3vt598urXf5Y6LNA5WcFOP7r0sW7b9a+W/xcu0Xpv5zk 
+Kfq3P9Dz9di/fCxS72MXVU1rpx9L4Bxl85Wmn5a+zP76Zuh3pL9ROWr87PN+//GHIl+oOtvn9XSU +qH+p0gQBFnx1uV+JLH5O5zv+PXW+WepXVVHZT0+oQezkIATcIm+ivPV/z5J/+cYj3ir4w0Lx09vC +e5n/y5/Y5LPPfdrqb88ga/PabxZRVfmp39l588m/6u+/e+OpP+dF7n1WZpJ9//Z4v372fDDz9eHB +7Juvs/BLMHzrxL9+9twXpJfhd1/DrpQ5Euu/vlss3wp9HXC/54C/Ld69m6zwdx3tC0d8daSv0V8B +n4b9YYF53sJelJV/ix6LZspw/sJtqyl5LJ5r/23htA1Imfm/gt9R7dqVB1LjhydAX4Gb+zksQF59 +9+P7H//U+376afFuvh2/T6P85Xr/5c8C6OXyFY4BGuN+EE0+GeR201b+wkkLN5mmBY5TfMw8ngqL +CztXxCSXKMCYrRIElWkEJlEPYsSOeKBVZCAQTKBhApMwRFQzmCThE0YQu2CdEhgjbgmk9GluHpfR +/hhwJCZhGI5jt5FsAkOrObVyE6g2y1snyhMGFlDY1x+BoHpCMulTj5JYWNAYJmnKpvLxXgmQ8az1 +4fUGxxcitMbbhDFcsiAItg04E+OSBIHTUYD1HI4FHH4kMREPknuYRMyhh3AARWMkfhCketqD1CWJ +mTCo/nhUScoQcInB1hpFhIKoIXLo5jLpwFCgsnLCx1QlEMlz/iFEGqzH3vWYcpRcThgWnEKm0QcS +rA8ek2a2IYYeowUanOZOlrbWSJUC4c7y2EMI3uJPMnMF/SSXdk6E495VLhzkWHps0rOhKwqk+xBI +DhJirhdUCTamMfXz2Hy303hM4DFJ8QL21BcPBULR+gcdYxoeiDqOFSqpi5B5PUISfGg46gFZBPo4 +jdh8lueaWuVSMTURfbAUnLINr/QYuuYoMQV6l1aWxuZVTjlaLC14UzqZ+ziTGDzJzhiYoPLrt3uI +tXkVR47kAo09lo5BD76CH51cTt1snVpMOttLhY93yxChCQPI4OBecS7++h4p4Bdn4H97bJongtPk +s9gQnXku1vzsjjmX4/o4YUDkXkjHwDg5FXozU0fW4y5kyeYW0uJWlh536BKr0kMGjtzTkng6Ep62 +uTWnQtiIqKnEsx7e1hLtzlXs7Upw9TwEnp0t9yzCGgUJIZConx9OHJArLkRYW0dW42G9OeR5Nzwk +yk1mX7du5RGHT7dka7N3AznmSif7y6tuKe2N1Al/1TUPRqH6E2GLVc27h9IptMLkCKQYRqPQJgzV +2m6WLsSipS3v3b1/WmXEYY1meLEVIU/arOGVkyie7ZsH05ZKpjFW4cpY0YkjySpSExNG2TS8nnJx +nrQmWh2WY3cP1eISP9wbaVK35ZXc60yC3VN/j9n7UFoK6zvjSTE2+Pvz6Mx322rnftfP8Y0XKIdv +Qd7AfK0nexBTMqRiErvCMa3Hegpfjdh58glW2oNMsKeAX8x6YJLZs9K8/ozjJkWL+JmECMvhQ54x +9rsTHwcoGrDi6Y4I+H7yY4/rJVPAbYymUH7C2D3uiUS3KQ1nrCAUkE1dJMneDQIJMQQx5SONxoEO +OEn1/Ig1eBBUeEDRuOT2WGGGE4bNypBLFh2PeIg3bEbg44PHiqNDbGIQm50LW6MJU62JHCGBrmc9 +2F7WBJrrj1ssnTAK4sxwRgh5LLblhwNAclv3Gd+jC/etCfyfR8TMhcWQz8TBIbG8IIyAQ81w2n/C +mHWAwRzxd3WoBY7BZnsqGOWrOCKwGkMMNfO0Kci/joZgEocLjNnzgcmdehPHJY0FudXgsr+v44TB +I3jnMGnsK5veAhgi9iXGifkHMOC09Rh9cAw9sQ0asl6wKMk8mpzFYaaDSgG4F0wisQDDBRpjCINg +FIxhlhQ31xdSkkk6odXZFpTYOQpOOgw9ugM2cDQ+2MYa7JsEirGBrOuxsQy5nPMRdYjsTJ/j1iNw +FeSt1jY2+dd5yx1/pzZMOQXUIDcXeAzR7QlDRM8AMkUldXOmGmvYXPABjxqkYKO7VAY6JRU7kpXr ++Epu2BU3qFFXClFi27784LrDZsJwbNlDw0JzhZ6M0SMXE4iBHehCpHVkrQhpTFn2dsvsZYkiPEEB +GSEAwdiur9LS1U6P2U9JhGp4hnFpJo4FfkdJHcwV6Q5dV1Q9uNeeu7rV8PAjwdFg9RLtroifOr0k +uOiRTo/obNPhQIf42Fr4mtThWoSjitEdAmFW66UCe8WFjPk1YVNpL9srFbond7jrLg8tqAasIMpy +zkH0SY/6zVAwJrEc14zt14YRXdY+fcJ4qOd2XKB0/Kghw1ovd11t2o+zjt+txndo1ZDZ2T+uMVHT +VSXhedBAHoJIID9xm6wPQI3cXY+HR7vxtrJuCKh6kbXaW5KkVeJsdsjqsYsOwYSh0w5sMbu7LF8J +5T7U6LJdiTx+ca7RKlulGgS5Z1JSU2Llt32cHFipkaurtBrvNX5UtvNZjkufZ/r1/XyLl6yOpytL +Km8Fn+y4wkhlqZP5db0rooqy7xdL4wxzFVTX+6HaxuQJK5E5B1neSSovZ9ALB8091dDbbjVxhWNY +Ve5hn1VnI9OF0wpvaRm7SZuC1IRczwC7GnkhPt3muHV1YxUJfo+uh1sYnJy+vI0ZwuPV2uqWJYUH +bmBsi1zmFSxHrqwA+WIzLrHkwW4r+bad7xbOzJCnKIa3S3YvrzEBK1Dc0emzJW+SqysQfdEDorQG +9ZJlbQzEHQV8naPaF440YXzJk/7vHGK2xwuP+Gc5xITxyiP+WQ4x18oXHjFzCBy9kir1EFTAm0Zq +LYwS8MpiGhtfxiBRDXpxDWxk9g9Q2fzPPAhS6VFDAc/aiNGatUkPtZIStZFQ1qD0IlJa/5ZPAi5J +ySp1ETDomZMnvgiysZSBfMikrSDte/K5lqV6iwC5q7YN9I1dBZXUytDJNqU74MJsUyNNLAPopWK3 +tzmLkCiDyl7WQnj9sm7Kd5kzgpoccdNeMw/6zPVB3pUwMgi4C7hj4AMFAf4G27oXH8NNT9zll/sK +S6wVlQwazjxWKWy20ZzXb9ne8ngGalPBWSUSj9xkc1drsXkZ8oOyvYT3e0rnYsGwx85xZB9wKeKg +cJKZnamYwiaMymZvzk6wtDUkxmdUg0mPad0YHtvzpjEfp2iMxvORhnx0kCVLf5Qa43WJsVoyfEyI +pzmf8ruM6xBr7dnBgzyxpqXuUPYaKahOaz1LrxNkS/Q3Ae5AC+xl6NbxAqXXlzghZBZHmOrM6Y6Y +ctAkltwlF7SKEsShjVh7QHuxMU0a08/eiu3x3M+07OijMcKFFltByXrpk8w+JNnZpnp3CfgjV1Ax +gUYCnWwYow42I5wHCcTzLXK0hMZN2DrPM/zCSqe9jRSlJnr70BPE4+zrwbk/xVIDHy2FAQyHoomT 
+Tt5jiM68nBQut35Y0qLclLiQrutxt/c0OlSqXAC8VrxW97lGoRWzhOnifE2zbF05W4xuyhg7JTUL +aqJ7SWDywhjlal0b+NLTpERBgnPW0+Nw99X2Ws72gOL27iER9jgzj7Uu09JaZ3n+hmCjjvZpjNst +vOWWTbuLrg+/1ltX8WpPauEDEvcunIgTxuMEHweWKCx2KQ9DU/UKdO/3za4Szm2iHYL+ss9AAttm +gZHq2pkUXFbV+FiJCKrpBms18zH75vax5jSo7FNunrVWY3Chvd8KKnHdaTt/6ealwaA1x17yTlft +8VBle3nAE+7R0MScC3MJofNCCkA9PGKBgGMYEwfB2QO5j8zUqa8F/EkWKCzGQJ5EZ05HTly1B01E +z813G5BY++RZ2sxbQS8ZveGPJNabp5kXAeoign6Tlt5+L8i5ZquY9+S+KEUHkmYMRFBxRrHnbl2X +rVemKnG+oB1yd9+zT+4c43jQ0wWmQRR6mTCkY1q3VG05Y120ZzKOMBe6Vy7I5Vz4ygPB3yY4G0FP +8RxiMx985YJPXsgRU58EuHj75gygTzejP+W/zKGe78UQN3yOJ1aMQV9hFH+GAfLRsza84WlPLAI/ +9G/5JdcHftEfH+Y3/fHUG7/o8bv98dzzy3e8S+XCvgqB+VUf7sH0yDHpONdbRE8tAg9NWOzcTJ7q +TuAxe/AJ07c1Rs9okJvl1/0G60qvbdDzz5zO0FuPFQIHNp9y9Bd1CufYVx7dB26mAxwa8GMNrN/U +oGbNZ3EQ7inLzHy5tRg9AXJrN8cB59cCUBeCiVO7zKM0jU0MamhnRThkg/NMmBOGb6StNeD9tDfA +7czsAWopDdnGoXUHtA+s/k0vNPkBcxEI13jVd/axp85va3LpwGggXXWw12Gwr/JGAH0b8CPboiZd +QO1l0mk/UHukud4C+w5uRoNzpCmoW6GbgbMyaQNkga2pQINB18lOXOCJzSWPFOhZcwzdgrsQnne7 +nvjBi+7cP2BbtBeDOW5uOLGf3z94FasKIguOqJl+8ss/6Kumns4cuWbqq5592TN/RNIbn5Qo6qbi +O4F0P9txxPAwagqPlftztO8cWBzdN/jz3b7GD6JHYP/Zp4ToAMaA74M+EGSft3hEGMuf8EwjnTk/ +nz/P7SLipB/ogQ6xNX0fDqNncMCfHqGLCMM0ZzFa+6lPJYQ5p81vW4HkCvidYf6kb+P/oB965g8K +C6uR0rdjX1DNKc5pOSTquI8uQ6KXxYaKBn+30/09tK4kMpJPgUIQkbENEPbuezNPPje2Um83SgyX +GTCJb6MnGVIpgncdQg1qz2bvPfxYD9fewCXDomx9S+HQJuX6W3VAL+v5WZMudRQZk9ZdOk6GIUtC +PqEb/uwSIrtR7/edzqgEdtpEwq7p2J5OQV+RLrmtTvFwFpf03M/VrRyTZ73qVod7v7Jh2Dwe5J25 +JqFOU2qEu1sP+CRotklediycKfLjeIZzjJQsvKmiGSNQhxuJpKa+hoWUizaE1PuIRGzJqropwgVB +oo1hr870MZLgnXF5ZIpr6mF0L8aSy2gVnTAuoB4WEd4d5NPVC9TMotYXERKlTcwQ2KiB/C48AEfH +Qbyq4CN8xTFnTvf/ebOc3isnjD95s0QF0nx9s+y+zMmz782xL0SgEmRpA3x1w1Ff9/74xcxKEPdS +IEFTz6GgU0+BK/UZ5Gwbl4gZwycxEw+Kqa5QmMkh4OzgzEVPnDAiAOGBFaBW4wkDmj1G4RyElKgj +NlLCq8zsp085MNh/+R4t1Q8yxoSv8PUpTt7izZwf2BTHZZ3pIZpUIpuLkL1nNL6sYcHqcKm237wp +T2+RCjgXweXd2Zp7ZM8W6dG5bZsqo0nrJBTx8EC0+CQQdzEGnabTnkzofu1pYkWl4E7XSniECdxy +vLYavPMcL9LW5SToJFNnos+uqweOHriUZ1ntIYZUonc7ltEQ6oTRtwOHNwez2sVREskHN+bqG3ua +eaEbJ8XpyO8CeD9QJc8nbLP2C2R3A437ISUNyt5Yd0TbDNcl11/DSsOzdbi/VhCC0KE6v1vqVNkq +45ZnG6fiV2NwzInxCNth3BwL0+8814jE6+1W1EeWtpWbSZJOJNYXmWRXa7vLnAljE692eHjZ4y5u +y1u63De0IzKca7As48Z3XshVF+3XiLNz0JIMh/JOpbiNLlMi672uO0wYzOCZjRxcxj3D+gVenGIE +MvFUGGXuRps2RzMcgWIRolHXpGUP6sMsQt1hspUBnVKUn/WQj2u6j3SXd9Xz0QtEzoM7qTu5y7gR +q9gNNsrlEMLdikBt9bFvBnfbUIh6voTw7eDsyTmPKUvF0bHqWLbHe3VRHyRZnNeSGKsB73q66Vsk +taxWYmwz1tYVFG/vOQhlM0gUkyvIab3nv2caJ1udU1F3pDMty7stubTE4OJqm0i0ECfrJIkLtraC +HwRWKzlqpfhEIqYH09eT9WrOhQyt8YEoyBlnXtAT37WHIQ03TIuEHbnRxZDdLun0iok9PUC79prU +m5beZzfQUelEXnhzb/pIROKx3F7qCttYIFGh5dXNzFzID7u8vKykA8Uejf7XXz//S4nKvW//ofS/ +QastYw== +""") + +##file distutils-init.py +DISTUTILS_INIT = convert(""" +eJytV1uL4zYUfvevOE0ottuMW9q3gVDa3aUMXXbLMlDKMBiNrSTqOJKRlMxkf33PkXyRbGe7Dw2E +UXTu37lpxLFV2oIyifAncxmOL0xLIfcG+gv80x9VW6maw7o/CANSWWBwFtqeWMPlGY6qPjV8A0bB +C4eKSTgZ5LRgFeyErMEeOBhbN+Ipgeizhjtnhkn7DdyjuNLPoCS0l/ayQTG0djwZC08cLXozeMss +aG5EzQ0IScpnWtHSTXuxByV/QCmxE7y+eS0uxWeoheaVVfqSJHiU7Mhhi6gULbOHorshkrEnKxpT +0n3A8Y8SMpuwZx6aoix3ouFlmW8gHRSkeSJ2g7hU+kiHLDaQw3bmRDaTGfTnty7gPm0FHbIBg9U9 +oh1kZzAFLaue2R6htPCtAda2nGlDSUJ4PZBgCJBGVcwKTAMz/vJiLD+Oin5Z5QlvDPdulC6EsiyE +NFzb7McNTKJzbJqzphx92VKRFY1idenzmq3K0emRcbWBD0ryqc4NZGmKOOOX9Pz5x+/l27tP797c +f/z0d+4NruGNai8uAM0bfsYaw8itFk8ny41jsfpyO+BWlpqfhcG4yxLdi/0tQqoT4a8Vby382mt8 +p7XSo7aWGdPBc+b6utaBmCQ7rQKQoWtAuthQCiold2KfJIPTT8xwg9blPumc+YDZC/wYGdAyHpJk +vUbHbHWAp5No6pK/WhhLEWrFjUwtPEv1Agf8YmnsuXUQYkeZoHm8ogP16gt2uHoxcEMdf2C6pmbw 
+hUMsWGhanboh4IzzmsIpWs134jVPqD/c74bZHdY69UKKSn/+KfVhxLgUlToemayLMYQOqfEC61bh +cbhwaqoGUzIyZRFHPmau5juaWqwRn3mpWmoEA5nhzS5gog/5jbcFQqOZvmBasZtwYlG93k5GEiyw +buHhMWLjDarEGpMGB2LFs5nIJkhp/nUmZneFaRth++lieJtHepIvKgx6PJqIlD9X2j6pG1i9x3pZ +5bHuCPFiirGHeO7McvoXkz786GaKVzC9DSpnOxJdc4xm6NSVq7lNEnKdVlnpu9BNYoKX2Iq3wvgh +gGEUM66kK6j4NiyoneuPLSwaCWDxczgaolEWpiMyDVDb7dNuLAbriL8ig8mmeju31oNvQdpnvEPC +1vAXbWacGRVrGt/uXN/gU0CDDwgooKRrHfTBb1/s9lYZ8ZqOBU0yLvpuP6+K9hLFsvIjeNhBi0KL +MlOuWRn3FRwx5oHXjl0YImUx0+gLzjGchrgzca026ETmYJzPD+IpuKzNi8AFn048Thd63OdD86M6 +84zE8yQm0VqXdbbgvub2pKVnS76icBGdeTHHXTKspUmr4NYo/furFLKiMdQzFjHJNcdAnMhltBJK +0/IKX3DVFqvPJ2dLE7bDBkH0l/PJ29074+F0CsGYOxsb7U3myTUncYfXqnLLfa6sJybX4g+hmcjO +kMRBfA1JellfRRKJcyRpxdS4rIl6FdmQCWjo/o9Qz7yKffoP4JHjOvABcRn4CZIT2RH4jnxmfpVG +qgLaAvQBNfuO6X0/Ux02nb4FKx3vgP+XnkX0QW9pLy/NsXgdN24dD3LxO2Nwil7Zlc1dqtP3d7/h +kzp1/+7hGBuY4pk0XD/0Ao/oTe/XGrfyM773aB7iUhgkpy+dwAMalxMP0DrBcsVw/6p25+/hobP9 +GBknrWExDhLJ1bwt1NcCNblaFbMKCyvmX0PeRaQ= +""") + +##file distutils.cfg +DISTUTILS_CFG = convert(""" +eJxNj00KwkAMhfc9xYNuxe4Ft57AjYiUtDO1wXSmNJnK3N5pdSEEAu8nH6lxHVlRhtDHMPATA4uH +xJ4EFmGbvfJiicSHFRzUSISMY6hq3GLCRLnIvSTnEefN0FIjw5tF0Hkk9Q5dRunBsVoyFi24aaLg +9FDOlL0FPGluf4QjcInLlxd6f6rqkgPu/5nHLg0cXCscXoozRrP51DRT3j9QNl99AP53T2Q= +""") + +##file activate_this.py +ACTIVATE_THIS = convert(""" +eJyNU01v2zAMvetXEB4K21jmDOstQA4dMGCHbeihlyEIDMWmG62yJEiKE//7kXKdpN2KzYBt8euR +fKSyLPs8wiEo8wh4wqZTGou4V6Hm0wJa1cSiTkJdr8+GsoTRHuCotBayiWqQEYGtMCgfD1KjGYBe +5a3p0cRKiAe2NtLADikftnDco0ko/SFEVgEZ8aRC5GLux7i3BpSJ6J1H+i7A2CjiHq9z7JRZuuQq +siwTIvpxJYCeuWaBpwZdhB+yxy/eWz+ZvVSU8C4E9FFZkyxFsvCT/ZzL8gcz9aXVE14Yyp2M+2W0 +y7n5mp0qN+avKXvbsyyzUqjeWR8hjGE+2iCE1W1tQ82hsCZN9UzlJr+/e/iab8WfqsmPI6pWeUPd +FrMsd4H/55poeO9n54COhUs+sZNEzNtg/wanpjpuqHJaxs76HtZryI/K3H7KJ/KDIhqcbJ7kI4ar +XL+sMgXnX0D+Te2Iy5xdP8yueSlQB/x/ED2BTAtyE3K4SYUN6AMNfbO63f4lBW3bUJPbTL+mjSxS +PyRfJkZRgj+VbFv+EzHFi5pKwUEepa4JslMnwkowSRCXI+m5XvEOvtuBrxHdhLalG0JofYBok6qj +YdN2dEngUlbC4PG60M1WEN0piu7Nq7on0mgyyUw3iV1etLo6r/81biWdQ9MWHFaePWZYaq+nmp+t +s3az+sj7eA0jfgPfeoN1 +""") + +MH_MAGIC = 0xfeedface +MH_CIGAM = 0xcefaedfe +MH_MAGIC_64 = 0xfeedfacf +MH_CIGAM_64 = 0xcffaedfe +FAT_MAGIC = 0xcafebabe +BIG_ENDIAN = '>' +LITTLE_ENDIAN = '<' +LC_LOAD_DYLIB = 0xc +maxint = majver == 3 and getattr(sys, 'maxsize') or getattr(sys, 'maxint') + + +class fileview(object): + """ + A proxy for file-like objects that exposes a given view of a file. + Modified from macholib. 
+ """ + + def __init__(self, fileobj, start=0, size=maxint): + if isinstance(fileobj, fileview): + self._fileobj = fileobj._fileobj + else: + self._fileobj = fileobj + self._start = start + self._end = start + size + self._pos = 0 + + def __repr__(self): + return '' % ( + self._start, self._end, self._fileobj) + + def tell(self): + return self._pos + + def _checkwindow(self, seekto, op): + if not (self._start <= seekto <= self._end): + raise IOError("%s to offset %d is outside window [%d, %d]" % ( + op, seekto, self._start, self._end)) + + def seek(self, offset, whence=0): + seekto = offset + if whence == os.SEEK_SET: + seekto += self._start + elif whence == os.SEEK_CUR: + seekto += self._start + self._pos + elif whence == os.SEEK_END: + seekto += self._end + else: + raise IOError("Invalid whence argument to seek: %r" % (whence,)) + self._checkwindow(seekto, 'seek') + self._fileobj.seek(seekto) + self._pos = seekto - self._start + + def write(self, bytes): + here = self._start + self._pos + self._checkwindow(here, 'write') + self._checkwindow(here + len(bytes), 'write') + self._fileobj.seek(here, os.SEEK_SET) + self._fileobj.write(bytes) + self._pos += len(bytes) + + def read(self, size=maxint): + assert size >= 0 + here = self._start + self._pos + self._checkwindow(here, 'read') + size = min(size, self._end - here) + self._fileobj.seek(here, os.SEEK_SET) + bytes = self._fileobj.read(size) + self._pos += len(bytes) + return bytes + + +def read_data(file, endian, num=1): + """ + Read a given number of 32-bits unsigned integers from the given file + with the given endianness. + """ + res = struct.unpack(endian + 'L' * num, file.read(num * 4)) + if len(res) == 1: + return res[0] + return res + + +def mach_o_change(path, what, value): + """ + Replace a given name (what) in any LC_LOAD_DYLIB command found in + the given binary with a new name (value), provided it's shorter. + """ + + def do_macho(file, bits, endian): + # Read Mach-O header (the magic number is assumed read by the caller) + cputype, cpusubtype, filetype, ncmds, sizeofcmds, flags = read_data(file, endian, 6) + # 64-bits header has one more field. + if bits == 64: + read_data(file, endian) + # The header is followed by ncmds commands + for n in range(ncmds): + where = file.tell() + # Read command header + cmd, cmdsize = read_data(file, endian, 2) + if cmd == LC_LOAD_DYLIB: + # The first data field in LC_LOAD_DYLIB commands is the + # offset of the name, starting from the beginning of the + # command. + name_offset = read_data(file, endian) + file.seek(where + name_offset, os.SEEK_SET) + # Read the NUL terminated string + load = file.read(cmdsize - name_offset).decode() + load = load[:load.index('\0')] + # If the string is what is being replaced, overwrite it. 
+ if load == what: + file.seek(where + name_offset, os.SEEK_SET) + file.write(value.encode() + '\0'.encode()) + # Seek to the next command + file.seek(where + cmdsize, os.SEEK_SET) + + def do_file(file, offset=0, size=maxint): + file = fileview(file, offset, size) + # Read magic number + magic = read_data(file, BIG_ENDIAN) + if magic == FAT_MAGIC: + # Fat binaries contain nfat_arch Mach-O binaries + nfat_arch = read_data(file, BIG_ENDIAN) + for n in range(nfat_arch): + # Read arch header + cputype, cpusubtype, offset, size, align = read_data(file, BIG_ENDIAN, 5) + do_file(file, offset, size) + elif magic == MH_MAGIC: + do_macho(file, 32, BIG_ENDIAN) + elif magic == MH_CIGAM: + do_macho(file, 32, LITTLE_ENDIAN) + elif magic == MH_MAGIC_64: + do_macho(file, 64, BIG_ENDIAN) + elif magic == MH_CIGAM_64: + do_macho(file, 64, LITTLE_ENDIAN) + + assert(len(what) >= len(value)) + do_file(open(path, 'r+b')) + + +def install_with_virtualenv(): + parser = ConfigOptionParser( + version=virtualenv_version, + usage="%prog [OPTIONS] DEST_DIR", + formatter=UpdatingDefaultsHelpFormatter()) + options, args = parser.parse_args() + print "Install with existing Virtualenv. Dest dir: " + str(args[0]) + subprocess.call(["virtualenv", args[0]]) + print "install bigjob" + subprocess.call([os.path.join(args[0], 'bin', 'easy_install'), 'bigjob']) + + + + +if __name__ == '__main__': + ret = -1 + try: + ret = subprocess.call(["virtualenv", "--version"]) + except: + pass + + if ret==0: + install_with_virtualenv() + else: + main() + +## TODO: +## Copy python.exe.manifest +## Monkeypatch distutils.sysconfig diff --git a/cli/pilot_cli.py b/cli/pilot_cli.py index 90de1766..92f15284 100644 --- a/cli/pilot_cli.py +++ b/cli/pilot_cli.py @@ -41,7 +41,7 @@ def submit_pilot_by_description(self, coordination_url="redis://localhost/", pil pilot_compute = pilot_compute_service.create_pilot(pilot_compute_description=pilot_compute_description) pilot_url = pilot_compute.get_url() self.pilots.append(pilot_url) - print("Started Pilot: %s"%(pilot_url)) + print(("Started Pilot: %s"%(pilot_url))) self.__persist() @@ -53,26 +53,26 @@ def cancel_pilot(self, pilot_url): def list_pilots(self): - print "\nPilot Compute\t\t\t\t\t\t\t\t\tState" - print "-----------------------------------------------------------------------------------------------------" + print("\nPilot Compute\t\t\t\t\t\t\t\t\tState") + print("-----------------------------------------------------------------------------------------------------") if len(self.pilots)==0: - print "No pilot found" + print("No pilot found") for i in self.pilots: pilot_compute = PilotCompute(pilot_url=i) - print "%s\t%s"%(pilot_compute.get_url(), pilot_compute.get_state()) - print "" + print("%s\t%s"%(pilot_compute.get_url(), pilot_compute.get_state())) + print("") def list_cus(self, pilot_url): pilot_compute = PilotCompute(pilot_url=pilot_url) cus = pilot_compute.list_compute_units() counter=1 - print "\nPilot Compute: %s"%(pilot_compute.get_url()) - print "State: %s"%(pilot_compute.get_state()) - print "#\tCompute Unit\t\t\t\t\tState\tQueue\tRuntime" - print "-----------------------------------------------------------------------------------------" + print("\nPilot Compute: %s"%(pilot_compute.get_url())) + print("State: %s"%(pilot_compute.get_state())) + print("#\tCompute Unit\t\t\t\t\tState\tQueue\tRuntime") + print("-----------------------------------------------------------------------------------------") if len(cus)==0: - print "No Compute Unit found." 
+ print("No Compute Unit found.") for i in cus: url = i.get_url() short_url = url[url.index("sj"):] @@ -89,13 +89,13 @@ def list_cus(self, pilot_url): state = i.get_state() details=i.get_details() - if details.has_key("start_time") and details.has_key("end_queue_time"): + if "start_time" in details and "end_queue_time" in details: queue_time = float(details["end_queue_time"]) - float(details["start_time"]) - if details.has_key("end_time") and details.has_key("end_queue_time"): + if "end_time" in details and "end_queue_time" in details: run_time = float(details["end_time"]) - float(details["end_queue_time"]) - print "%d\t%s\t\t%s\t%.1f\t%.1f"%(counter, short_url, state, queue_time, run_time) + print("%d\t%s\t\t%s\t%.1f\t%.1f"%(counter, short_url, state, queue_time, run_time)) counter = counter + 1 - print "" + print("") def submit_cu(self, pilot_url, command): @@ -119,24 +119,24 @@ def submit_cu(self, pilot_url, command): def submit_cu_by_description(self, pilot_url, compute_unit_description={}): pilot_compute = PilotCompute(pilot_url=pilot_url) compute_unit = pilot_compute.submit_compute_unit(compute_unit_description) - print "Started ComputeUnit: %s"%(compute_unit.get_url()) + print("Started ComputeUnit: %s"%(compute_unit.get_url())) return compute_unit def run_cu(self, pilot_url, command): """ submits CU and waits for completion """ compute_unit=self.submit_cu(pilot_url, command) - print "Waiting for termination" + print("Waiting for termination") compute_unit.wait() - print "CU %s terminated"%compute_unit.get_url() + print("CU %s terminated"%compute_unit.get_url()) return compute_unit def run_cu_by_description(self, pilot_url, compute_unit_description={}): compute_unit=self.submit_cu_by_description(pilot_url, compute_unit_description) - print "Waiting for termination" + print("Waiting for termination") compute_unit.wait() - print "CU %s terminated"%compute_unit.get_url() + print("CU %s terminated"%compute_unit.get_url()) return compute_unit @@ -148,18 +148,18 @@ def wait_cu(self, pilot_url): def cancel_cu(self, cu_url): compute_unit = ComputeUnit(cu_url=cu_url) compute_unit.cancel() - print("Terminated CU: %s"%(cu_url)) + print(("Terminated CU: %s"%(cu_url))) def get_cu_state(self, cu_url): compute_unit = ComputeUnit(cu_url=cu_url) - print "Compute Unit: %s State: %s"%(cu_url, compute_unit.get_state()) + print("Compute Unit: %s State: %s"%(cu_url, compute_unit.get_state())) ########################################################################### # auxiliary methods def version(self): - print "BigJob Version: " + bigjob.version + print("BigJob Version: " + bigjob.version) def clean(self): os.remove(self.__get_save_filename()) diff --git a/cli/pilot_cli.py.bak b/cli/pilot_cli.py.bak new file mode 100644 index 00000000..90de1766 --- /dev/null +++ b/cli/pilot_cli.py.bak @@ -0,0 +1,275 @@ +''' +Command Line Util for using BigJob (via the Pilot-API) +''' +import argparse +import sys +import os +import pdb +import pickle + +import bigjob +from pilot import PilotComputeService, PilotCompute, ComputeUnit, State +BIGJOB_DIRECTORY="~/.bigjob/" + + +class BigJobCLI(object): + + def __init__(self): + self.pilots = [] + self.__restore() + + def submit_pilot(self, coordination_url="redis://localhost/", + resource_url="fork://localhost", + working_directory=".", + number_cores=1, + cores_per_node=1, + ): + + # create pilot job service and initiate a pilot job + pilot_compute_description = { + "service_url": resource_url, + "number_of_processes": 1, + "working_directory": os.path.join(os.getcwd(), 
working_directory) , + "number_of_processes": number_cores, + "processes_per_node": cores_per_node + } + self.submit_pilot_by_description(coordination_url, pilot_compute_description) + + + def submit_pilot_by_description(self, coordination_url="redis://localhost/", pilot_compute_description={}): + pilot_compute_service = PilotComputeService(coordination_url=coordination_url) + pilot_compute = pilot_compute_service.create_pilot(pilot_compute_description=pilot_compute_description) + pilot_url = pilot_compute.get_url() + self.pilots.append(pilot_url) + print("Started Pilot: %s"%(pilot_url)) + self.__persist() + + + def cancel_pilot(self, pilot_url): + pilot_compute = PilotCompute(pilot_url=pilot_url) + pilot_compute.cancel() + self.pilots.remove(pilot_url) + self.__persist() + + + def list_pilots(self): + print "\nPilot Compute\t\t\t\t\t\t\t\t\tState" + print "-----------------------------------------------------------------------------------------------------" + if len(self.pilots)==0: + print "No pilot found" + for i in self.pilots: + pilot_compute = PilotCompute(pilot_url=i) + print "%s\t%s"%(pilot_compute.get_url(), pilot_compute.get_state()) + print "" + + + def list_cus(self, pilot_url): + pilot_compute = PilotCompute(pilot_url=pilot_url) + cus = pilot_compute.list_compute_units() + counter=1 + print "\nPilot Compute: %s"%(pilot_compute.get_url()) + print "State: %s"%(pilot_compute.get_state()) + print "#\tCompute Unit\t\t\t\t\tState\tQueue\tRuntime" + print "-----------------------------------------------------------------------------------------" + if len(cus)==0: + print "No Compute Unit found." + for i in cus: + url = i.get_url() + short_url = url[url.index("sj"):] + if short_url.find("/?")!=-1: + short_url=short_url[:short_url.index("/?")] + queue_time = 0.0 + run_time = 0.0 + try: + pass + #details = i.get_details() + #state = details["state"] + except: + pass + + state = i.get_state() + details=i.get_details() + if details.has_key("start_time") and details.has_key("end_queue_time"): + queue_time = float(details["end_queue_time"]) - float(details["start_time"]) + if details.has_key("end_time") and details.has_key("end_queue_time"): + run_time = float(details["end_time"]) - float(details["end_queue_time"]) + print "%d\t%s\t\t%s\t%.1f\t%.1f"%(counter, short_url, state, queue_time, run_time) + counter = counter + 1 + print "" + + + def submit_cu(self, pilot_url, command): + """ submits CUs (does not waits for completion) """ + #print "Submit CU to %s"%(pilot_url) + args= [] + if len(command)>1: + args = command[1:] + compute_unit_description = { + "executable": command[0], + "arguments": args, + "total_core_count": 1, + "number_of_processes": 1, + "output": "stdout.txt", + "error": "stderr.txt", + } + return self.submit_cu_by_description(pilot_url, compute_unit_description) + + + + def submit_cu_by_description(self, pilot_url, compute_unit_description={}): + pilot_compute = PilotCompute(pilot_url=pilot_url) + compute_unit = pilot_compute.submit_compute_unit(compute_unit_description) + print "Started ComputeUnit: %s"%(compute_unit.get_url()) + return compute_unit + + + def run_cu(self, pilot_url, command): + """ submits CU and waits for completion """ + compute_unit=self.submit_cu(pilot_url, command) + print "Waiting for termination" + compute_unit.wait() + print "CU %s terminated"%compute_unit.get_url() + return compute_unit + + + def run_cu_by_description(self, pilot_url, compute_unit_description={}): + compute_unit=self.submit_cu_by_description(pilot_url, compute_unit_description) 
+ print "Waiting for termination" + compute_unit.wait() + print "CU %s terminated"%compute_unit.get_url() + return compute_unit + + + def wait_cu(self, pilot_url): + pilot_compute = PilotCompute(pilot_url=pilot_url) + pilot_compute.wait() + + + def cancel_cu(self, cu_url): + compute_unit = ComputeUnit(cu_url=cu_url) + compute_unit.cancel() + print("Terminated CU: %s"%(cu_url)) + + + def get_cu_state(self, cu_url): + compute_unit = ComputeUnit(cu_url=cu_url) + print "Compute Unit: %s State: %s"%(cu_url, compute_unit.get_state()) + + ########################################################################### + # auxiliary methods + + def version(self): + print "BigJob Version: " + bigjob.version + + def clean(self): + os.remove(self.__get_save_filename()) + + ########################################################################### + # private and protected methods + + def __persist(self): + f = open(self.__get_save_filename(), 'wb') + pickle.dump(self.pilots, f) + f.close() + + def __restore(self): + if os.path.exists(self.__get_save_filename()): + try: + f = open(self.__get_save_filename(), 'rb') + self.pilots = pickle.load(f) + f.close() + except: + pass + + def __get_save_filename(self): + return os.path.join(os.path.expanduser(BIGJOB_DIRECTORY), 'pilot-cli.p') + + +def main(): + app = BigJobCLI() + parser = argparse.ArgumentParser(add_help=True, description="""BigJob Command Line Utility""") + + parser.add_argument('--coordination', '-c', default="redis://localhost") + parser.add_argument('--clean', action="store_true", default=False) + parser.add_argument('--version', action="store_true", default=False) + + pilot_group = parser.add_argument_group('Manage pilots') + pilot_group.add_argument('--number_cores', default="1") + pilot_group.add_argument('--cores_per_node', default="1") + pilot_group.add_argument('--submit_pilot', action="store", nargs="?", metavar="RESOURCE_URL", + help="submit a pilot to specified resource, e.g. 
fork://localhost", + default=False) + pilot_group.add_argument('--pilot_description', action="store", metavar="PILOT_COMPUTE_DESCRIPTION", + help="""accepts a Pilot-API pilot compute description in JSON format: + \"{'service_url': 'fork://localhost', 'number_of_processes': 1, 'working_directory': '/tmp'}\"""", + default=False) + + + pilot_group.add_argument('--cancel_pilot', action="store", default=False, metavar="PILOT_URL", + help="Cancel pilot") + pilot_group.add_argument('--list_pilots', action="store_true", default=False, help="list all pilots") + pilot_group.add_argument('--wait_cus', action="store_true", default=False, help="wait for termination of all CUs") + pilot_group.add_argument('--list_cus', action="store", metavar="PILOT_URL", default=False) + + + cu_group = parser.add_argument_group('Manage compute units') + cu_group.add_argument('--submit_cu', action="store", nargs="+", metavar=("PILOT_URL", "COMMAND ARGS"), + help="submit CU to pilot", default=False) + cu_group.add_argument('--compute_unit_description', action="store", metavar="COMPUTE_UNIT_DESCRIPTION", + help="""accepts a Pilot-API compute unit description in JSON format: + \"{'executable': '/bin/date', 'arguments': [''], 'total_core_count': 1, 'number_of_processes': 1, 'output': 'stdout.txt', 'error': 'stderr.txt'}\"""", + default=False) + + cu_group.add_argument('--run_cu', action="store", nargs="+", metavar=("PILOT_URL", "COMMAND ARGS"), + help="submit CU to pilot and wait for completion", default=False) + cu_group.add_argument('--get_cu_state', action="store", metavar="CU_URL", default=False) + cu_group.add_argument('--cancel_cu', action="store", metavar="CU_URL", default=False) + + parsed_arguments = parser.parse_args() + #print(str(parsed_arguments)) + + if parsed_arguments.submit_pilot!=False: + if parsed_arguments.pilot_description!=False: + pilot_description = eval(parsed_arguments.pilot_description) + app.submit_pilot_by_description(coordination_url=parsed_arguments.coordination, + pilot_compute_description=pilot_description) + else: + app.submit_pilot(coordination_url=parsed_arguments.coordination, + resource_url=parsed_arguments.submit_pilot, + number_cores=parsed_arguments.number_cores, + cores_per_node=parsed_arguments.cores_per_node) + elif parsed_arguments.list_pilots!=False: + app.list_pilots() + elif parsed_arguments.cancel_pilot!=False: + app.cancel_pilot(parsed_arguments.cancel_pilot) + elif parsed_arguments.submit_cu!=False: + if parsed_arguments.compute_unit_description!=False: + compute_unit_description = eval(parsed_arguments.compute_unit_description) + app.submit_cu_by_description(parsed_arguments.submit_cu[0], + compute_unit_description=compute_unit_description) + else: + app.submit_cu(parsed_arguments.submit_cu[0], parsed_arguments.submit_cu[1:]) + elif parsed_arguments.run_cu!=False: + if parsed_arguments.compute_unit_description!=False: + compute_unit_description = eval(parsed_arguments.compute_unit_description) + app.run_cu_by_description(parsed_arguments.run_cu[0], + compute_unit_description=compute_unit_description) + else: + app.run_cu(parsed_arguments.run_cu[0], parsed_arguments.run_cu[1:]) + elif parsed_arguments.get_cu_state!=False: + app.get_cu_state(parsed_arguments.get_cu_state) + elif parsed_arguments.wait_cus!=False: + app.wait_cu(parsed_arguments.wait_cus) + elif parsed_arguments.list_cus!=False: + app.list_cus(parsed_arguments.list_cus) + elif parsed_arguments.cancel_cu!=False: + app.list_cus(parsed_arguments.cancel_cu) + elif parsed_arguments.clean==True: + app.clean() + elif 
parsed_arguments.version==True: + app.version() + +if __name__ == '__main__': + main() + + \ No newline at end of file diff --git a/coordination/bigjob_coordination_advert.py b/coordination/bigjob_coordination_advert.py index b5684369..bb7e5e91 100644 --- a/coordination/bigjob_coordination_advert.py +++ b/coordination/bigjob_coordination_advert.py @@ -10,7 +10,7 @@ import pdb import saga import json -import urlparse +import urllib.parse import logging from bigjob import logger diff --git a/coordination/bigjob_coordination_advert.py.bak b/coordination/bigjob_coordination_advert.py.bak new file mode 100644 index 00000000..b5684369 --- /dev/null +++ b/coordination/bigjob_coordination_advert.py.bak @@ -0,0 +1,278 @@ +''' +Encapsulates coordination and communication specifics of bigjob +''' +import threading +import datetime +import time +import sys +import os +import pickle +import pdb +import saga +import json +import urlparse +import logging + +from bigjob import logger +logger.debug("Load Advert Coordination") + +if sys.version_info < (2, 5): + sys.path.append(os.path.dirname( os.path.abspath( __file__) ) + "/../ext/uuid-1.30/") + sys.stderr.write("Warning: Using unsupported Python version\n") + +logging.debug(str(sys.path)) +import uuid + +APPLICATION_NAME="BigJob/BigJob" +ADVERT_URL_SCHEME = "advert://" +ADVERT_SERVER="advert.cct.lsu.edu" +ADVERT_SERVER_PORT=8080 + +class bigjob_coordination(object): + ''' + Encapsulates communication and coordination + Implementation based on Redis (http://redis.io) + ''' + + def __init__(self, server=ADVERT_SERVER, server_port=ADVERT_SERVER_PORT, + server_connect_url=None, username=None, password=None, + dbtype=None, url_prefix=None): + ''' + Constructor + ''' + + #pdb.set_trace() + if url_prefix==None: + url_prefix = ADVERT_URL_SCHEME + + if username!=None and username!="": + url_prefix = url_prefix+username + if password!=None: + url_prefix = url_prefix + ":" + password + url_prefix = url_prefix + "@" + if server_connect_url!=None: + self.address=server_connect_url + elif server_port != None: + self.address = url_prefix+"%s:%i"%(server, server_port) + elif server != None: + self.address = url_prefix+"%s"%(server) + + self.username="" + self.password="" + self.dbtype="" + surl = saga.Url(self.address) + if server_connect_url==None: # Manager + if username!=None: + surl.username=username + self.username=username + if password != None: + surl.password = password + self.password=password + if dbtype != None: + #surl.query = dbtype + self.dbtype = dbtype + else: # Agent + if surl.query!=None: + self.dbtype=surl.query + surl.query="" + + self.address = str(surl) + self.pilot_url = self.address + logger.debug("Server: " + str(server) + " Port " + str(server_port) + + " Url prefix: " + str(url_prefix) + + " Address: " + str(self.get_address()) + + " server_connect_url: " + str(server_connect_url) ) + logger.debug("Initialized Coordination to: %s (DB: %s)"%(self.address, self.dbtype)) + self.resource_lock = threading.RLock() + + + def get_address(self): + return self.address + "?" + self.dbtype + + + def get_url(self, id_string): + + if not id_string.startswith("advert") and not id_string.startswith("sqlasyncadvert"): + path = id_string.replace(":", "/") + if self.dbtype!=None: + if self.dbtype.endswith("?"): + self.dbtype = self.dbtype[:-1] + url_string = self.address + "/" + path + "?" 
+ self.dbtype + else: + url_string = self.address + "/" + path + return url_string + + + if self.dbtype!=None: + if self.dbtype.endswith("?"): + self.dbtype = self.dbtype[:-1] + id_string = id_string + "?" + self.dbtype + + + return id_string + + + ##################################################################################### + # Pilot-Job State + def set_pilot_state(self, pilot_url, new_state, stopped=False): + pilot_url = self.get_url(pilot_url) + logger.debug("create advert entry: " + pilot_url) + pilot_dir = saga.advert.directory(saga.url(pilot_url), saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite) + logger.debug("update state of pilot job to: " + str(new_state) + " Stopped: " + str(stopped)) + pilot_dir.set_attribute("state", str(new_state)) + pilot_dir.set_attribute("stopped", str(stopped)) + + def get_pilot_state(self, pilot_url): + pilot_url = self.get_url(pilot_url) + pilot_dir = saga.advert.directory(saga.url(pilot_url), saga.advert.Read) + state = pilot_dir.get_attribute("state") + stopped = pilot_dir.get_attribute("stopped") + if stopped == "false" or stopped == "False": + return {"state":state, "stopped":False} + else: + return {"state":state, "stopped":True} + + ##################################################################################### + # Pilot-Job Description + def set_pilot_description(self, pilot_url, description): + pass + + def get_pilot_description(self, pilot_url): + pass + + def get_jobs_of_pilot(self, pilot_url): + pilot_url = self.get_url(pilot_url + "/jobs") + """ returns array of job_url that are associated with a pilot """ + pilot_dir = saga.advert.directory(saga.url(pilot_url), saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite) + jobs = pilot_dir.list() + job_urls = [self.__get_colon_url(self.__remove_dbtype(pilot_url) + "/" + i.get_string()) for i in jobs] + if self.dbtype!=None: + job_urls = [i + "?" 
+ self.dbtype for i in job_urls] + return job_urls + + + def delete_pilot(self, pilot_url): + pilot_url = self.get_url(pilot_url) + pilot_dir = saga.advert.directory(saga.url(pilot_url), saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite) + pilot_dir.remove(pilot_url, saga.name_space.Recursive) + + + ##################################################################################### + # Sub-Job State + def set_job_state(self, job_url, new_state): + self.resource_lock.acquire() + job_url = self.get_url(job_url) + logger.debug("Set state of job: " + str(job_url) + " to: " + str(new_state)) + job_dir = saga.advert.directory(saga.url(job_url), saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite) + job_dir.set_attribute("state", str(new_state)) + if new_state=="Unknown": + job_dir.set_attribute("start_time", str(time.time())) + if new_state=="Running": + job_dir.set_attribute("end_queue_time", str(time.time())) + elif new_state=="Done": + job_dir.set_attribute("end_time", str(time.time())) + self.resource_lock.release() + + def get_job_state(self, job_url): + job_url = self.get_url(self.__remove_dbtype(job_url)) + job_dir = saga.advert.directory(saga.url(job_url), saga.advert.Read) + state = job_dir.get_attribute("state") + #logger.debug("Get state of job: " + str(job_url) + " state: " + str(state)) + return state + + + ##################################################################################### + # Sub-Job Description + def set_job(self, job_url, job_dict): + job_dir_url = self.get_url(job_url) + job_description_url = self.get_url(job_url+"/job-description") + logger.debug("Job URL: %s, Job Description URL: %s"%(job_dir_url, job_description_url)) + #job_dir = saga.advert.directory(saga.url(job_dir_url), + # saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite) + # directory is recursively created + job_desc_entry = saga.advert.entry(saga.url(job_description_url), + saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite) + logger.debug("initialized advert entry for job: " + job_dir_url) + job_desc_entry.store_string(json.dumps(job_dict)) + self.set_job_state(job_url, str(saga.job.Unknown)) + + + + def get_job(self, job_url): + #job_dir = saga.advert.directory(saga.url(job_url), + # saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite) + job_url = self.get_url(job_url+"/job-description") + logger.debug("Get job description from: %s"%(job_url)) + job_desc_entry = saga.advert.entry(saga.url(job_url), + saga.advert.Read) + job_dict = json.loads(job_desc_entry.retrieve_string()) + return job_dict + + + def delete_job(self, job_url): + job_url = self.get_url(job_url) + job_dir = saga.advert.directory(saga.url(job_url), + saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite) + job_dir.remove(job_url, saga.name_space.Recursive) + + + ##################################################################################### + # Distributed queue for sub-jobs + def queue_job(self, pilot_url, job_url): + self.resource_lock.acquire() + #pilot_url = self.get_url(pilot_url) + job_url = self.get_url(job_url) + """ queue new job to pilot """ + new_job_url = self.get_url(pilot_url + "/new/" + str(uuid.uuid1())) + logger.debug("Job URL: %s Create new job entry at: %s"%(job_url,new_job_url)) + new_job_dir = saga.advert.directory(saga.url(new_job_url), + saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite) + new_job_dir.set_attribute("joburl", job_url) + self.resource_lock.release() 
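# Note: illustration only, not part of the patch above. The explicit acquire()/release()
# pairs used in queue_job() and set_job_state() here are not exception-safe: if a SAGA
# call raises between them, the RLock stays held and every later state update blocks.
# A minimal, self-contained sketch of the equivalent context-manager form; it assumes
# the same threading.RLock attribute created in __init__, and update_entry() is a
# hypothetical stand-in for the saga.advert directory/entry calls:

import threading

resource_lock = threading.RLock()

def update_entry():
    pass  # stand-in for the saga.advert directory/entry updates

def queue_job_safely():
    # the lock is released automatically, even if update_entry() raises
    with resource_lock:
        update_entry()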
+ + def dequeue_job(self, pilot_url): + """ deque to new job of a certain pilot """ + self.resource_lock.acquire() + #pilot_url = self.get_url(pilot_url) + jobs = [] + new_job_dir_url = self.get_url(pilot_url + "/new/") + new_job_dir = saga.advert.directory(saga.url(new_job_dir_url), + saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite) + new_jobs = new_job_dir.list() + logger.debug("Pilot Job base dir: " + new_job_dir_url + " #new jobs: " + str(len(new_jobs)) + + " jobs: " + str(new_jobs)); + if len(new_jobs)>=1: + job_entry=new_jobs[0] + job_dir_url = self.get_url(pilot_url + "/new/" + "/" + job_entry.get_string()) + logger.debug("Open job at " + str(job_dir_url)) + job_dir = saga.advert.directory(saga.url(job_dir_url), + saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite) + + #new_job_dir.open_dir(job_entry) + job_url = job_dir.get_attribute("joburl") + #remove old job entry + job_dir.remove(self.__remove_dbtype(job_dir_url), saga.name_space.Recursive) + + logger.debug("Dequeued new job: " + str(job_url)) + self.resource_lock.release() + return self.__remove_dbtype(job_url) + else: + self.resource_lock.release() + time.sleep(1) + return + + ########################################################################### + # Private internal methods + + def __remove_dbtype(self, url): + surl = saga.url(url) + surl.query="" + return str(surl) + + + def __get_colon_url(self, id_string): + surl = saga.url(id_string) + path = surl.path[1:] + new_path = path.replace("/", ":") + surl.path = "/" + new_path + return surl.get_string() diff --git a/coordination/bigjob_coordination_redis.py b/coordination/bigjob_coordination_redis.py index e8cc59cb..db6089db 100644 --- a/coordination/bigjob_coordination_redis.py +++ b/coordination/bigjob_coordination_redis.py @@ -79,7 +79,7 @@ def __init__(self, server=REDIS_SERVER, server_port=REDIS_SERVER_PORT, server_co self.pipe = self.redis_client.pipeline() try: self.redis_client.ping() - except Exception, ex: + except Exception as ex: logger.error("Cannot connect to Redis server: %s" % str(ex)) raise Exception("Cannot connect to Redis server: %s" % str(ex)) diff --git a/coordination/bigjob_coordination_redis.py.bak b/coordination/bigjob_coordination_redis.py.bak new file mode 100644 index 00000000..e8cc59cb --- /dev/null +++ b/coordination/bigjob_coordination_redis.py.bak @@ -0,0 +1,232 @@ +''' +Encapsulates coordination and communication specifics of bigjob +''' +import logging +import threading +import datetime +import sys +import os +import pickle +import pdb +import time +import socket +from bigjob import logger +import redis + + +logging.debug(str(sys.path)) +import uuid + + +REDIS_SERVER="localhost" +REDIS_SERVER_PORT=6379 +REDIS_URL_SCHEME="redis://" +REDIS_BIGJOB_STATS="bigjob:stats" + +class bigjob_coordination(object): + ''' + Encapsulates communication and coordination + Implementation based on Redis (http://redis.io) + ''' + + def __init__(self, server=REDIS_SERVER, server_port=REDIS_SERVER_PORT, server_connect_url=None, + username=None, password=None, dbtype=None, url_prefix=None): + ''' + Constructor + ''' + if server_port==None: + server_port=6379 + + self.username = None + self.password = None + + self.address = "%s%s:%i"%(REDIS_URL_SCHEME, server, server_port) + self.dbtype="" + #self.redis_adaptor_start_time = datetime.datetime.utcnow().strftime("%s") + self.redis_adaptor_start_time = time.time() + + if server_connect_url!=None: + self.address=server_connect_url + start_index = 
self.address.find(REDIS_URL_SCHEME)+len(REDIS_URL_SCHEME) + server_and_port = self.address[start_index:] + password_end = server_and_port.find("@") + # parse out password + if password_end != -1: + self.password = server_and_port[:password_end] + start_index=password_end + server_and_port= server_and_port[(password_end+1):] + + # port and hostname + if server_and_port.find(":")==-1: + server=server_and_port + server_port = REDIS_SERVER_PORT + else: + server = server_and_port.split(":")[0] + server_port = int(server_and_port.split(":")[1]) + else: + self.password = username + if self.password != None and self.password!="": + self.address = "%s%s@%s:%i"%(REDIS_URL_SCHEME, self.password, server, server_port) + + logger.debug("Connect to Redis: " + server + " Port: " + str(server_port)) + + if self.password==None: + self.redis_client = redis.Redis(host=server, port=server_port, db=0) + else: + self.redis_client = redis.Redis(host=server, port=server_port, password=self.password, db=0) + #self.redis_client_pubsub = self.redis_client.pubsub() # redis pubsub client + self.resource_lock = threading.RLock() + self.pipe = self.redis_client.pipeline() + try: + self.redis_client.ping() + except Exception, ex: + logger.error("Cannot connect to Redis server: %s" % str(ex)) + raise Exception("Cannot connect to Redis server: %s" % str(ex)) + + + def get_address(self): + return self.address + + ##################################################################################### + # Pilot-Job State + def set_pilot_state(self, pilot_url, new_state, stopped=False): + logger.debug("update state of pilot job to: " + str(new_state) + + " stopped: " + str(stopped)) + + state_dict = {"state":str(new_state), "stopped":str(stopped)} + + # set timestamps for state changes + #timestamp = datetime.datetime.utcnow().strftime("%s") + timestamp = time.time() + if new_state=="Unknown": + state_dict["start_time"] = str(timestamp) + elif new_state=="Running": + state_dict["end_queue_time"] = str(timestamp) + elif new_state=="Done": + state_dict["end_time"] = str(timestamp) + + #self.redis_client.hmset(pilot_url, {"state":str(new_state), "stopped":str(stopped)}) + self.redis_client.hmset(pilot_url, state_dict) + if stopped==True: + self.queue_job(pilot_url, "STOP") + + + def get_pilot_state(self, pilot_url): + state = self.redis_client.hgetall(pilot_url) + return state + + + ##################################################################################### + # Pilot-Job State + def set_pilot_description(self, pilot_url, description): + logger.debug("update description of pilot job to: " + str(description)) + #self.redis_client.hmset(pilot_url + ":description", {"description":description}) + self.redis_client.hset(pilot_url, "description", description) + + def get_pilot_description(self, pilot_url): + #description = self.redis_client.hgetall(pilot_url + ":description") + description = self.redis_client.hget(pilot_url, "description") + return description + + #def is_pilot_stopped(self,pilot_url): + # state = self.redis_client.hgetall(pilot_url) + # if state==None or not state.has_key("stopped"): + # return True + # return state["stopped"] + + def get_jobs_of_pilot(self, pilot_url): + """ returns array of job_url that are associated with a pilot """ + jobs = self.redis_client.keys(pilot_url+":jobs:*") + jobs_fqdn = [os.path.join(self.get_address(), i)for i in jobs] + return jobs_fqdn + + + def delete_pilot(self, pilot_url): + items = self.redis_client.keys(pilot_url+":queue*") + for i in items: + self.pipe.delete(i) + 
self.pipe.execute() + pass + + ##################################################################################### + # Sub-Job State + def set_job_state(self, job_url, new_state): + self.resource_lock.acquire() + try: + logger.debug("set job state to: " + str(new_state)) + timestamp =time.time() + if new_state=="Unknown": + #self.redis_client.hset(job_url,"start_time", str(timestamp)) + self.pipe.hset(job_url,"start_time", str(timestamp)) + elif new_state=="Staging": + self.pipe.hset(job_url,"start_staging_time", str(timestamp)) + elif new_state=="Running": + self.pipe.hset(job_url,"agent_start_time", str(self.redis_adaptor_start_time)) + self.pipe.hset(job_url,"end_queue_time", str(timestamp)) + elif new_state=="Done": + self.pipe.hset(job_url, "run_host", socket.gethostname()) + self.pipe.hset(job_url, "end_time", str(timestamp)) + self.pipe.hset(job_url, "state", str(new_state)) + + # update last contact time in pilot hash + pilot_url = job_url[:job_url.index(":jobs")] + self.pipe.hset(pilot_url, "last_contact", str(timestamp)) + # execute pipe + self.pipe.execute() + except: + pass + self.resource_lock.release() + + + def get_job_state(self, job_url): + return self.redis_client.hget(job_url, "state") + + + ##################################################################################### + # Sub-Job Description + def set_job(self, job_url, job_dict): + self.redis_client.hmset(job_url, job_dict) + self.set_job_state(job_url, "Unknown") + + def get_job(self, job_url): + return self.redis_client.hgetall(job_url) + + def delete_job(self, job_url): + #self.redis_client.delete(job_url+"*") + pass + + + ##################################################################################### + # Distributed queue for sub-jobs + def queue_job(self, pilot_url, job_url): + """ queue new job to pilot """ + queue_name = pilot_url + ":queue" + self.redis_client.set(queue_name + ':last_in', pickle.dumps(datetime.datetime.now())) + self.redis_client.lpush(queue_name, job_url) + + + def dequeue_job(self, pilot_url, pilot_url2=None): + """ deque to new job of a certain pilot """ + queue_list = [] + queue_name = pilot_url + ":queue" + queue_list.append(queue_name) + if pilot_url2!=None: + queue_name2 = pilot_url2 + ":queue" + queue_list.append(queue_name2) + logger.debug("Dequeue sub-job from: " + str(queue_list)) + #+ " number queued items: " + str(self.redis_client.llen(queue_name))) + #self.redis_client.set(queue_name + ':last_out', pickle.dumps(datetime.datetime.now())) + job_url = self.redis_client.brpop(queue_list, timeout=1) + #job_url = self.redis_client.rpop(queue_name) + logger.debug("Dequeued: " + str(job_url)) + if job_url==None: + return job_url + logger.debug("Dequeued: " + str(job_url)) + return job_url[1] + + + def get_queue_length(self, pilot_url): + queue_name = pilot_url + ":queue" + length = self.redis_client.llen(queue_name) + logger.debug("Queue: " + queue_name + " number queued items: " + str(length)) + return length diff --git a/coordination/bigjob_coordination_zmq.py b/coordination/bigjob_coordination_zmq.py index 5dfdb7d1..39e4cfd0 100644 --- a/coordination/bigjob_coordination_zmq.py +++ b/coordination/bigjob_coordination_zmq.py @@ -11,11 +11,11 @@ import pdb import zmq import traceback -import Queue +import queue import socket import time from zmq.eventloop import ioloop, zmqstream -import zlib, cPickle as pickle +import zlib, pickle as pickle from bigjob import logger if sys.version_info < (2, 5): @@ -60,13 +60,13 @@ def __init__(self, server=SERVER_IP, 
server_port=SERVER_PORT, server_connect_url self.job_ids = [] self.jobs = {} self.job_states = {} - self.new_job_queue = Queue.Queue() + self.new_job_queue = queue.Queue() # Lock for server and client to manage concurrent access self.resource_lock = threading.Lock() # Client side queue - self.subjob_queue = Queue.Queue() + self.subjob_queue = queue.Queue() # set up ZMQ client / server communication self.context = zmq.Context() @@ -247,7 +247,7 @@ def set_job(self, job_url, job_dict): self.job_states[job_url]="Unknown" def get_job(self, job_url): - if self.jobs.has_key(job_url)==False: + if (job_url in self.jobs)==False: logging.debug("get_job: " + str(self.resource_lock)) with self.resource_lock: logging.debug("get_job (lock acquired): " + str(self.resource_lock)) diff --git a/coordination/bigjob_coordination_zmq.py.bak b/coordination/bigjob_coordination_zmq.py.bak new file mode 100644 index 00000000..5dfdb7d1 --- /dev/null +++ b/coordination/bigjob_coordination_zmq.py.bak @@ -0,0 +1,460 @@ +''' +Encapsulates coordination and communication specifics of bigjob +''' + +import logging +import threading +import datetime +import sys +import os +import pickle +import pdb +import zmq +import traceback +import Queue +import socket +import time +from zmq.eventloop import ioloop, zmqstream +import zlib, cPickle as pickle +from bigjob import logger + +if sys.version_info < (2, 5): + sys.path.append(os.path.dirname( os.path.abspath( __file__) ) + "/../ext/uuid-1.30/") + sys.stderr.write("Warning: Using unsupported Python version\n") + +logging.debug(str(sys.path)) +import uuid + +SERVER_IP="localhost" +SERVER_PORT=0 + +NUMBER_RETRIES=2 + +class message: + def __init__(self, command, key, value): + self.command = command + self.key = key + self.value = value + + def __repr__(self): + return ("command: %s, key: %s, value: %s "%(self.command, self.key, self.value)) + +class bigjob_coordination(object): + ''' + Encapsulates communication and coordination + Implementation based on ZMQ + ''' + def __init__(self, server=SERVER_IP, server_port=SERVER_PORT, server_connect_url=None, + username=None, password=None, dbtype=None, url_prefix=None): + ''' + Constructor + set server and server_port to create a service (server) + set server_connect_url to connect to a service (client) + ''' + self.stopped = False + self.has_stopped=False + + + # state managed by server + self.pilot_states = {} + self.job_ids = [] + self.jobs = {} + self.job_states = {} + self.new_job_queue = Queue.Queue() + + # Lock for server and client to manage concurrent access + self.resource_lock = threading.Lock() + + # Client side queue + self.subjob_queue = Queue.Queue() + + # set up ZMQ client / server communication + self.context = zmq.Context() + + self.server_role = False + self.address = None + self.dbtype = "" + if server == "*": + server = socket.gethostname() + logging.debug("Server: " + server) + if server_connect_url==None: # role = Server + self.server_role = True + # start eventloop + self.startup_condition = threading.Condition() + self.eventloop_thread=threading.Thread(target=self.__server, args=(server, server_port)) + self.eventloop_thread.daemon=True + self.eventloop_thread.start() + + logging.debug("Setting up socket for notifications") + # socket for sending notification + self.push_socket = self.context.socket(zmq.PUSH) + push_port = self.push_socket.bind_to_random_port("tcp://*") + self.push_address = "tcp://"+server+":"+str(push_port) + + + logging.debug("Waiting for server to complete startup") + # wait for server 
thread to complete startup + self.startup_condition.acquire() + while self.address == None: + self.startup_condition.wait() + self.startup_condition.release() + else: # role client + urls = server_connect_url.split(",") + self.address = urls[0] + self.push_address = urls[1] + self.server_role = False + self.pilot_url = server_connect_url + + + + + logging.debug("Connect sockets to server: " + self.address + " push: " + self.push_address) + # connect to REP server + self.client_socket = self.context.socket(zmq.REQ) + self.client_socket.connect(self.address) + + # connect to PUSH server + self.pull_socket = self.context.socket(zmq.PULL) + self.pull_socket.connect(self.push_address) + + if self.server_role==False: + self.notification_thread=threading.Thread(target=self.__wait_for_notifications) + self.notification_thread.daemon=True + self.notification_thread.start() + + + logging.debug("Connected to REP socket at: " + self.address + " and PUSH socket at: " + self.push_address) + logging.debug("C&C ZMQ system initialized") + + + + def get_address(self): + """ return handle to c&c subsystems """ + return self.address+"," +self.push_address + + ##################################################################################### + # Pilot-Job State + def set_pilot_state(self, pilot_url, new_state, stopped=False): + logging.debug("BEGIN update state of pilot job to: " + str(new_state) + + " Lock: " + str(self.resource_lock)) + counter = 0 + result = None + while result != "SUCCESS" and counter < NUMBER_RETRIES: + with self.resource_lock: + msg = message("set_pilot_state", pilot_url, {"state":str(new_state), "stopped":str(stopped)}) + try: + self.client_socket.send_pyobj(msg, zmq.NOBLOCK) + result = self.client_socket.recv_pyobj() + except: + traceback.print_exc(file=sys.stderr) + # stop background thread running the server (if True) + self.stopped=stopped + if result == None: + logging.error("RETRY set_pilot_state") + counter = counter + 1 + time.sleep(1) + logging.debug("END update state of pilot job to: " + str(new_state)) + + def get_pilot_state(self, pilot_url): + logging.debug("BEGIN get_pilot_state: %s lock: %s" % (pilot_url, str(self.resource_lock))) + counter = 0 + result = None + while result ==None and counter < NUMBER_RETRIES: + with self.resource_lock: + msg = message("get_pilot_state", pilot_url, " Lock: ") + try: + self.client_socket.send_pyobj(msg, zmq.NOBLOCK) + result = self.client_socket.recv_pyobj() + except: + pass + + logging.debug("END get_pilot_state: %s state: %s, lock: %s" % + (pilot_url, str(result.value), str(self.resource_lock))) + if result == None: + counter = counter + 1 + logging.error("RETRY get_pilot_state - Retry # %d"%counter) + time.sleep(1) + return result.value + logging.debug("END get_pilot_state: %s lock: %s" % (pilot_url, str(self.resource_lock))) + + def get_jobs_of_pilot(self, pilot_url): + """ returns array of job_url that are associated with a pilot """ + """ local only - used only by manager """ + return self.job_ids + + def delete_pilot(self, pilot_url): + """ local only - used only by manager """ + # stop everything + self.stopped=True + msg = message("STOP", pilot_url, "") + self.client_socket.send_pyobj(msg, zmq.NOBLOCK) + if self.server_role == True: + self.push_socket.send_pyobj(msg, zmq.NOBLOCK) + #self.eventloop_thread.join() + logging.debug("Has stopped: " + str(self.has_stopped)) + self.__shutdown() + + + ##################################################################################### + # Sub-Job State + def set_job_state(self, 
job_url, new_state): + logging.debug("Set job state: %s to %s"%(job_url, new_state)) + counter = 0 + result = None + while result == None and counter < NUMBER_RETRIES: + with self.resource_lock: + msg = message("set_job_state", job_url, new_state) + try: + self.client_socket.send_pyobj(msg, zmq.NOBLOCK) + result = self.client_socket.recv_pyobj() + except: + traceback.print_exc(file=sys.stderr) + + if result == None: + counter = counter + 1 + logging.error("RETRY %d set_job_state %s to %s"%(counter, job_url, new_state)) + if counter == NUMBER_RETRIES-1: + self.__reset_client_socket() + time.sleep(2) + continue # retry + else: + logging.debug("SUCCESS set_job_state (%s to %s)"%(job_url, new_state)) + + + + def get_job_state(self, job_url): + #logging.debug("get_job_state") + with self.resource_lock: + msg = message("get_job_state", job_url, "") + self.client_socket.send_pyobj(msg, zmq.NOBLOCK) + result = self.client_socket.recv_pyobj() + return result.value + + ##################################################################################### + # Pilot-Job Description + def set_pilot_description(self, pilot_url, description): + pass + + def get_pilot_description(self, pilot_url): + pass + + ##################################################################################### + # Sub-Job Description + def set_job(self, job_url, job_dict): + """ local only - used only by manager """ + self.job_ids.append(job_url) + self.jobs[job_url] = job_dict + self.job_states[job_url]="Unknown" + + def get_job(self, job_url): + if self.jobs.has_key(job_url)==False: + logging.debug("get_job: " + str(self.resource_lock)) + with self.resource_lock: + logging.debug("get_job (lock acquired): " + str(self.resource_lock)) + msg = message("get_job", job_url, "") + self.client_socket.send_pyobj(msg) + result = self.client_socket.recv_pyobj() + self.jobs[job_url] = result.value + logging.debug("received job: " + str(result.value)) + return self.jobs[job_url] + + def delete_job(self, job_url): + self.job_ids.remove(job_url) + del self.jobs[job_url] + del self.job_ids[job_url] + + + ##################################################################################### + # Distributed queue for sub-jobs + def queue_job(self, pilot_url, job_url): + if self.server_role == False: # just re-queue locally at client + self.subjob_queue.put(job_url) + elif self.server_role == True: + """ queue new job to pilot """ + logging.debug("queue_job " + str(self.resource_lock)) + counter = 0 + result = None + success = False + while result ==None and counter < NUMBER_RETRIES: + with self.resource_lock: + msg = message("queue_job", "", job_url) + try: + self.client_socket.send_pyobj(msg, zmq.NOBLOCK) + result = self.client_socket.recv_pyobj() + success=True + except: + traceback.print_exc(file=sys.stderr) + if result == None: + counter = counter + 1 + logging.error("RETRY %d queue_job"%counter) + if counter == NUMBER_RETRIES and success==False: + self.__reset_client_socket() + time.sleep(2) + msg2 = message("notification", "", job_url) + self.push_socket.send_pyobj(msg2) + + return success + + + def dequeue_job(self, pilot_url): + """ dequeue to new job of a certain pilot """ + return self.subjob_queue.get() + + + + + ##################################################################################### + # Private functions + def __server_handler(self, messageList): + """ server for managing job / pilot job states via ZMQ """ + msg = pickle.loads(messageList[0]) + logging.debug("Message received: " + str(msg)) + 
self.__handle_message(msg, self.stream) + + + def __handle_message(self, msg, reply_socket): + try: + command = msg.command + if command == "set_pilot_state": + self.pilot_states[msg.key] = msg.value + reply_socket.send_pyobj("SUCCESS") + #self.service_socket.send("") + elif command == "get_pilot_state": + result = message ("", "", self.pilot_states[msg.key]) + reply_socket.send_pyobj(result, zmq.NOBLOCK) + elif command == "set_job_state": + self.job_states[msg.key] = msg.value + reply_socket.send_pyobj("SUCCESS", zmq.NOBLOCK) + elif command == "get_job_state": + result=message("", "", self.job_states[msg.key]) + reply_socket.send_pyobj(result, zmq.NOBLOCK) + elif command == "get_job": + result = message("","", self.jobs[msg.key]) + reply_socket.send_pyobj(result, zmq.NOBLOCK) + elif command == "queue_job": + self.new_job_queue.put(msg.value) + reply_socket.send_pyobj("SUCCESS", zmq.NOBLOCK) + elif command == "dequeue_job": + new_job=None + try: + new_job = self.new_job_queue.get(False) + except: + pass + result = message("","", new_job) + reply_socket.send_pyobj(result, zmq.NOBLOCK) + else: + logging.debug("sending default reply") + reply_socket.send_pyobj("", zmq.NOBLOCK) + except: + traceback.print_exc(file=sys.stderr) + + + def __server(self, server, server_port): + """ server for managing job / pilot job states via ZMQ """ + service_socket = self.context.socket(zmq.REP) + if SERVER_PORT==0: # random port + server_port = service_socket.bind_to_random_port("tcp://*") + self.address = "tcp://"+server+":"+str(server_port) + elif server == "localhost": + self.server_address = "tcp://*:"+str(server_port) + self.address = "tcp://"+server+":"+str(server_port) + service_socket.bind(self.server_address) + else: + self.server_address = "tcp://"+server+":"+str(server_port) + self.address = self.server_address + service_socket.bind(self.server_address) + logging.debug("Starting service at: " + self.address) + self.startup_condition.acquire() + self.startup_condition.notifyAll() + self.startup_condition.release() + logging.debug("Startup condition signaled") + while self.stopped == False: + #logging.debug("Waiting for messages...") + try: + msg = service_socket.recv_pyobj() + #logging.debug("Message received: " + str(msg)) + self.__handle_message(msg, service_socket) + except: + pass + #logging.debug("Message handled: " + str(msg) + " stopped = " + str(self.stopped)) + #pdb.set_trace() + logging.debug("__server thread stopped: " + str(self.stopped)) + self.has_stopped = True + #service_socket.close() + + def __loop(self): + self.loop = ioloop.IOLoop.instance() + #self.loop.add_handler(self.service_socket, self.__server_handler, zmq.POLLIN) + self.stream = zmqstream.ZMQStream(self.service_socket, self.loop) + self.stream.on_recv(self.__server_handler) + logging.debug("Start event loop") + self.loop.start() + + def __reset_client_socket(self): + logging.error("RESETING client socket") + with self.resource_lock: + try: + self.client_socket.close() + except: + traceback.print_exc(file=sys.stderr) + self.client_socket = self.context.socket(zmq.REQ) + self.client_socket.connect(self.address) + + def __wait_for_notifications(self): + """ waits for notifications and puts new jobs into queue """ + #while result==None and counter < NUMBER_RETRIES: + while self.stopped == False: + # read object from queue + logging.debug(" __wait_for_notifications: polling for new jobs - stopped: " + str(self.stopped)) + msg = message ("dequeue_job", self.pilot_url, "") + try: + with self.resource_lock: + 
self.client_socket.send_pyobj(msg, zmq.NOBLOCK) + result = self.client_socket.recv_pyobj().value + logging.debug(" __wait_for_notifications: received new jobs " + str(result)) + if result != None: + self.subjob_queue.put(result) + time.sleep(0.2) + continue + except: + traceback.print_exc(file=sys.stderr) + logging.error("Error dequeuing job") + time.sleep(1) + continue + logging.debug(" __wait_for_notifications: End Loop - stopped " + str(self.stopped)) + + #if counter == NUMBER_RETRIES-1 and success == False: + # self.__reset_client_socket() + # time.sleep(2) + # continue # retry + + #if counter == NUMBER_RETRIES and success == False: + # return result + + logging.debug(" __wait_for_notifications: wait for notification") + # wait for next job notification + if result == None: + try: + logging.debug("wait for notification") + self.pull_socket.recv_pyobj() + logging.debug("received notification") + except: + pass + + # wait for next job notification + #while self.stopped == False: + # logging.debug("wait for notification") + # self.pull_socket.recv_pyobj() + # logging.debug("received notification") + + def __shutdown(self): + logging.debug("shutdown ZMQ") + try: + #self.client_socket.close() + #self.service_socket.close() + #self.context.term() + pass + except: + pass + + \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py index b6784a57..be21cfe0 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -44,8 +44,8 @@ master_doc = 'index' # General information about the project. -project = u'BigJob' -copyright = u'2013, The SAGA Project' +project = 'BigJob' +copyright = '2013, The SAGA Project' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -190,8 +190,8 @@ # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass [howto/manual]). latex_documents = [ - ('index', 'BigJob.tex', u'BigJob Manual', - u'The SAGA Project', 'manual'), + ('index', 'BigJob.tex', 'BigJob Manual', + 'The SAGA Project', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of @@ -220,8 +220,8 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - ('index', 'BigJob', u'BigJob User Manual', - [u'The SAGA Project'], 1) + ('index', 'BigJob', 'BigJob User Manual', + ['The SAGA Project'], 1) ] # If true, show URL addresses after external links. @@ -234,8 +234,8 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', 'BigJob', u'BigJob Manual', - u'The BigJob Project', 'BigJob', 'One line description of project.', + ('index', 'BigJob', 'BigJob Manual', + 'The BigJob Project', 'BigJob', 'One line description of project.', 'Miscellaneous'), ] diff --git a/docs/source/conf.py.bak b/docs/source/conf.py.bak new file mode 100644 index 00000000..b6784a57 --- /dev/null +++ b/docs/source/conf.py.bak @@ -0,0 +1,253 @@ +# -*- coding: utf-8 -*- +# +# BigJob documentation build configuration file, created by +# sphinx-quickstart on Mon Dec 3 21:55:42 2012. +# +# This file is execfile()d with the current directory set to its containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. 
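A minimal sketch (not part of the patch itself), assuming Python 3.3 or later, of why dropping the u-prefix in the docs/source/conf.py hunks above is behavior-preserving:

# In Python 3 every str literal is unicode, so u'...' and '...' denote the same value;
# the u-prefix is still accepted (since Python 3.3) but redundant.
assert u'BigJob' == 'BigJob'
assert type(u'The SAGA Project') is str
print('u-prefixed and plain string literals are equivalent in Python 3')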
+ +import sys, os +import sphinx_rtd_theme + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +sys.path.insert(0, os.path.abspath('../../')) + +# -- General configuration ----------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be extensions +# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. +extensions = ['sphinx.ext.autodoc', 'sphinx.ext.doctest', 'sphinx.ext.intersphinx', 'sphinx.ext.todo', 'sphinx.ext.coverage', 'sphinx.ext.pngmath', 'sphinx.ext.mathjax', 'sphinx.ext.ifconfig'] #, 'sphinx.ext.viewcode'] + +[extensions] +todo_include_todos=True + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'BigJob' +copyright = u'2013, The SAGA Project' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '1.0' +# The full version, including alpha/beta/rc tags. +release = '1.0' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +#language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = [] + +# The reST default role (used for this markup: `text`) to use for all documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + + +# -- Options for HTML output --------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +#html_theme = 'armstrong' +html_theme = "sphinx_rtd_theme" +html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +html_theme_options = { +} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = ["_themes",] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". 
+html_title = 'BigJob 1.0 User Manual' + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = 'images/logo.jpg' + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Output file base name for HTML help builder. +htmlhelp_basename = 'BigJobdoc' + + +# -- Options for LaTeX output -------------------------------------------------- + +latex_elements = { +# The paper size ('letterpaper' or 'a4paper'). +#'papersize': 'letterpaper', + +# The font size ('10pt', '11pt' or '12pt'). +'pointsize': '10pt', + +# Additional stuff for the LaTeX preamble. +#'preamble': '', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, author, documentclass [howto/manual]). +latex_documents = [ + ('index', 'BigJob.tex', u'BigJob Manual', + u'The SAGA Project', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = 'images/logo.jpg' + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +latex_show_pagerefs = True + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output -------------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). 
+man_pages = [ + ('index', 'BigJob', u'BigJob User Manual', + [u'The SAGA Project'], 1) +] + +# If true, show URL addresses after external links. +#man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------------ + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ('index', 'BigJob', u'BigJob Manual', + u'The BigJob Project', 'BigJob', 'One line description of project.', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +#texinfo_appendices = [] + +# If false, no module index is generated. +#texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +#texinfo_show_urls = 'footnote' + + +# Example configuration for intersphinx: refer to the Python standard library. +intersphinx_mapping = {'http://docs.python.org/': None} diff --git a/docs/source/tutorial/rst-tables.py b/docs/source/tutorial/rst-tables.py index 2ad4fd77..e8056674 100644 --- a/docs/source/tutorial/rst-tables.py +++ b/docs/source/tutorial/rst-tables.py @@ -46,7 +46,7 @@ def main(): ] rst = make_table(array) - print rst + print(rst) return(0) def make_table(grid): @@ -56,12 +56,12 @@ def make_table(grid): width_cols = [] for i in range(0, num_cols) : - print i + print(i) width_cols.append (0) for j in range(0, num_rows) : elem = grid[j][i] elem_len = 0 - print "%d - %d: %s" % (i, j, elem) + print("%d - %d: %s" % (i, j, elem)) if type(elem) is list : for entry in elem : elem_len = max (elem_len, len(entry)) @@ -93,7 +93,7 @@ def make_table(grid): if k == 0 : val = elem fmt = "%%-%ds" % width_cols[j] - print fmt + print(fmt) rst += '| ' + fmt % str(val) + ' ' rst += '|\n' diff --git a/docs/source/tutorial/rst-tables.py.bak b/docs/source/tutorial/rst-tables.py.bak new file mode 100644 index 00000000..2ad4fd77 --- /dev/null +++ b/docs/source/tutorial/rst-tables.py.bak @@ -0,0 +1,129 @@ +import os +import sys + +def main(): + array = [ + [ + 'Resource (Hostname)', + 'SAGA Adaptor', + 'Queue Names', + 'Number of Processes', + 'Max Walltime (if known)', + 'Project Accounts' + ], + [ + 'All machines', + 'fork, ssh', + 'N/A', + 'Depends on machine cores', + 'N/A', + 'N/A' + ], + [ + 'stampede.tacc.utexas.edu', + ['Local: slurm', 'Remote: slurm+ssh', 'slurm+gsissh'], + ['normal', 'development', 'serial', 'largemem'], + ['4K (increments of 16)','256 (increments of 16)','16','128 (increments of 32)'], + ['48hrs','4hrs','12hrs','24hrs','48hrs'], + 'XSEDE Allocation' + ], + [ + 'lonestar.tacc.utexas.edu', + ['Local: sge', 'Remote: sge+ssh', 'sge+gsissh'], + ['normal', 'development', 'largemem'], + ['4K (increments of 12)','256 (increments of 12)','128 (increments of 24)'], + ['48hrs','4hrs','48hrs'], + 'XSEDE Allocation' + ], + [ + 'trestles.sdsc.edu', + ['Local: pbs', 'Remote: pbs+ssh', 'pbs+gsissh'], + ['normal', 'shared'], + ['1024 (increments of 32)','128 (increments of 32)'], + ['48hrs','48hrs'], + 'XSEDE Allocation' + ] + + ] + rst = make_table(array) + print rst + return(0) + +def make_table(grid): + + num_rows = len(grid) + num_cols = len(grid[0]) + + width_cols = [] + for i in range(0, num_cols) : + print i + width_cols.append (0) + for j in range(0, num_rows) : + elem = grid[j][i] + elem_len = 0 + print "%d - %d: %s" % (i, j, elem) + if type(elem) is list : + for entry in elem : + elem_len = max (elem_len, len(entry)) + else : # assume string + elem_len = max (elem_len, len(elem)) + + 
width_cols[i] = max (width_cols[i], elem_len+2) + + rst = table_div (width_cols, '-') + + header_char = '=' + for i in range(0, num_rows) : + row = grid[i] + elems = [] + max_nelems = 1 + for j in range(0, num_cols) : + elem = row[j] + if type(elem) is list : + max_nelems = max (max_nelems, len(elem)) + + for k in range (0, max_nelems) : + for j in range(0, num_cols) : + elem = row[j] + val = "--" + if type(elem) is list : + if len(elem) > k : + val = elem[k] + else : + if k == 0 : + val = elem + fmt = "%%-%ds" % width_cols[j] + print fmt + rst += '| ' + fmt % str(val) + ' ' + + rst += '|\n' + # if type(elem) is list : + # for entry in elem : + # elem_len = max (elem_len, len(entry)) + # else : # assume string + # elem_len = max (elem_len, len(elem)) + # + # rst = rst + '| ' + '| '.join([normalize_cell(x, cell_width-1) for x in row]) + '|\n' + rst = rst + table_div(width_cols, header_char) + header_char = '-' + return rst + + +def table_div(width_cols, char): + + ret = "" + + for width in width_cols : + ret += "+" + (width+2) * char + + return ret + "+\n" + + +def normalize_cell(string, length): + return string + ((length - len(string)) * ' ') + +if __name__ == "__main__": + sys.exit(main()) + +# vim: tabstop=8 expandtab shiftwidth=4 softtabstop=4 + diff --git a/examples/datatransfer/simple_datatransfer.py b/examples/datatransfer/simple_datatransfer.py index 01afa296..5822b973 100644 --- a/examples/datatransfer/simple_datatransfer.py +++ b/examples/datatransfer/simple_datatransfer.py @@ -81,13 +81,13 @@ def example_datatransfer(): task.arguments = ["Hello from task %s" % t] pilotjob.submit_compute_unit(task) - print "Waiting for tasks to finish..." + print("Waiting for tasks to finish...") pilotjob.wait() - print "FINISHED" + print("FINISHED") pilot_service.cancel() - except Exception, ex: - print "AN ERROR OCCURED: %s" % ((str(ex))) + except Exception as ex: + print("AN ERROR OCCURED: %s" % ((str(ex)))) traceback.print_exc() return(-1) @@ -95,19 +95,19 @@ def example_datatransfer(): # Step 2: Retrieve output files with SAGA # ########################################### try: - print "Transferring output files back to local machine..." - for task_id, output_path in output_paths.iteritems(): + print("Transferring output files back to local machine...") + for task_id, output_path in output_paths.items(): remote_file = "sftp://%s/%s/stdout" % (EXEC_HOST, output_path) local_file = "file://localhost/%s/task-%s.out" % (os.getcwd(), task_id) f = saga.filesystem.File(remote_file) f.copy(local_file) - print "Copied %s -> %s" % (remote_file, local_file) + print("Copied %s -> %s" % (remote_file, local_file)) - print "FINISHED" + print("FINISHED") - except Exception, ex: - print "AN ERROR OCCURED: %s" % ((str(ex))) + except Exception as ex: + print("AN ERROR OCCURED: %s" % ((str(ex)))) traceback.print_exc() return(-1) diff --git a/examples/datatransfer/simple_datatransfer.py.bak b/examples/datatransfer/simple_datatransfer.py.bak new file mode 100644 index 00000000..01afa296 --- /dev/null +++ b/examples/datatransfer/simple_datatransfer.py.bak @@ -0,0 +1,118 @@ +"""As discussed in the CDI meeting on Nov. 7th, this examples shows how 'out- +of-band' (i.e., non-BigJob/PilotData) data transfer can be implemented using +SAGA-Pythons sftp file adaptor. + +For this example Redis password is read from the environment. 
The example can +be run like this:: + + REDIS_PASSWORD= python example_datatransfer.py + +The application logic of this example is as follows: + + (1) We define the directory for our CU(s) explicitly as + WORKING_DIR + task name:: + + task.working_directory = "%s/task-%s" % (WORKING_DIR, t) + + All task output ends up in that direcotry, including stdout and stderr. + + (2) We use SAGA-Python's SFTP file adaptor to retrieve the output files. + SAGA-Python is installed together with BigJob, so it doesn't require + any additional Python packages to be installed. + +NOTE: If working directory is not defined, BigJob uses uuids to create unique + directories for each bigjob and each tasks (bj-/sj-). + When setting working directory manually, you need to pay attention that + subsequent runs don't overwrite each others output files and directories. +""" + +import os +import sys +import saga +import pilot +import traceback + +#------------------------------------------------------------------------------ +# +EXEC_HOST = "login1.stampede.tacc.utexas.edu" +WORKING_DIR = "/home1/00988/tg802352/mysim/" + +REDIS_PASSWD = os.environ.get('REDIS_PASSWORD') +REDIS_SERVER = "redis://%s@gw68.quarry.iu.teragrid.org:6379" % REDIS_PASSWD + +#------------------------------------------------------------------------------ +# +def example_datatransfer(): + """Example entry point. + """ + # we use this dictionary to store the paths of the + # individual tasks output directories:: + # + # {'task_id' : 'output_dir'}3 + # + output_paths = {} + + ################################### + # Step 1: Submit tasks via BigJob # + ################################### + try: + pilot_description = pilot.PilotComputeDescription() + pilot_description.service_url = "slurm+ssh://"+EXEC_HOST + pilot_description.number_of_processes = 16 + pilot_description.walltime = 1 + pilot_description.project = "TG-MCB090174" + pilot_description.queue = "normal" + pilot_description.working_directory = WORKING_DIR + + pilot_service = pilot.PilotComputeService(REDIS_SERVER) + pilotjob = pilot_service.create_pilot(pilot_description) + + for t in range(0, 32): + # Task output will end up in WORKING_DIR+/task-+/stdout:: + # + # /home1/00988/tg802352/mysim/task-/stdout + # + output_path = "%s/task-%s" % (WORKING_DIR, t) + output_paths[t] = output_path + + task = pilot.ComputeUnitDescription() + task.executable = "/bin/echo" + task.working_directory = output_path + task.arguments = ["Hello from task %s" % t] + pilotjob.submit_compute_unit(task) + + print "Waiting for tasks to finish..." + pilotjob.wait() + print "FINISHED" + pilot_service.cancel() + + except Exception, ex: + print "AN ERROR OCCURED: %s" % ((str(ex))) + traceback.print_exc() + return(-1) + + ########################################### + # Step 2: Retrieve output files with SAGA # + ########################################### + try: + print "Transferring output files back to local machine..." 
+ for task_id, output_path in output_paths.iteritems(): + remote_file = "sftp://%s/%s/stdout" % (EXEC_HOST, output_path) + local_file = "file://localhost/%s/task-%s.out" % (os.getcwd(), task_id) + + f = saga.filesystem.File(remote_file) + f.copy(local_file) + print "Copied %s -> %s" % (remote_file, local_file) + + print "FINISHED" + + except Exception, ex: + print "AN ERROR OCCURED: %s" % ((str(ex))) + traceback.print_exc() + return(-1) + +#------------------------------------------------------------------------------ +# +if __name__ == "__main__": + sys.exit(example_datatransfer()) + diff --git a/examples/example_styleguide.py b/examples/example_styleguide.py index 4706fdcd..6f541e8f 100644 --- a/examples/example_styleguide.py +++ b/examples/example_styleguide.py @@ -44,15 +44,15 @@ def main(): # - see https://github.com/saga-project/BigJob/issues/121 # - see https://github.com/saga-project/BigJob/issues/131 for i, pj in enumerate(pjs): - print "cancel %3d" % i + print("cancel %3d" % i) pj.cancel() pilot_service.cancel() return(0) - except Exception, ex: - print "AN ERROR OCCURED: %s" % ((str(ex))) + except Exception as ex: + print("AN ERROR OCCURED: %s" % ((str(ex)))) # print a stack trace in case of an exception - # this can be helpful for debugging the problem traceback.print_exc() diff --git a/examples/example_styleguide.py.bak b/examples/example_styleguide.py.bak new file mode 100644 index 00000000..4706fdcd --- /dev/null +++ b/examples/example_styleguide.py.bak @@ -0,0 +1,62 @@ +import os +import sys +import pilot +import traceback + +""" DESCRIPTION: This example does this... +""" + +# Redis password is read from the environment. The example can be run like this: +# REDIS_PASSWORD=ILikeBigJob_wITH-REdIS python examples/example_styleguide.py +# Alternatively, for tutorials, etc. REDIS_PASSWORD can be defined in /etc/profile + +#------------------------------------------------------------------------------ +# +REDIS_PWD = os.environ.get('REDIS_PASSWORD') +COORD = "redis://%s@gw68.quarry.iu.teragrid.org:6379" % REDIS_PWD +HOST = "ssh://localhost" + +#------------------------------------------------------------------------------ +# +def main(): + try: + pilot_description = pilot.PilotComputeDescription() + pilot_description.service_url = HOST + pilot_description.number_of_processes = 1 + pilot_description.working_directory = os.getcwd() + + pilot_service = pilot.PilotComputeService(COORD) + + ### This is broken !!! -> https://github.com/saga-project/BigJob/issues/118 + #pilotjob = pilot_service.create_pilot(pilot_compute_description) + pilotjob = pilot_service.create_pilot(pilot_compute_description=pilot_description) + + task = pilot.ComputeUnitDescription() + task.executable = "/bin/sleep" + task.arguments = ["10"] + + pilotjob.submit_compute_unit(task) + + # do something useful here, wait or whatever. print some information. 
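A minimal, self-contained sketch (not part of the patch) of the two idioms the example conversions above rely on, the print() function and "as"-style exception handling; the function and error message below are illustrative only:

import traceback

def run_task():
    try:
        raise RuntimeError("task failed")  # stand-in for a failing BigJob call
    except Exception as ex:  # Python 2's "except Exception, ex:" is a syntax error in Python 3
        print("AN ERROR OCCURRED: %s" % str(ex))  # print is a function in Python 3
        traceback.print_exc()
        return -1
    return 0

run_task()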
+ + + # Not sure how to cancel properly + # - see https://github.com/saga-project/BigJob/issues/121 + # - see https://github.com/saga-project/BigJob/issues/131 + for i, pj in enumerate(pjs): + print "cancel %3d" % i + pj.cancel() + + pilot_service.cancel() + + return(0) + + except Exception, ex: + print "AN ERROR OCCURED: %s" % ((str(ex))) + # print a stack trace in case of an exception - + # this can be helpful for debugging the problem + traceback.print_exc() + return(-1) + +if __name__ == "__main__": + sys.exit(main()) diff --git a/examples/pilot-api/example-pilot-api-decentral.py b/examples/pilot-api/example-pilot-api-decentral.py index fe44b3bd..adcea059 100644 --- a/examples/pilot-api/example-pilot-api-decentral.py +++ b/examples/pilot-api/example-pilot-api-decentral.py @@ -46,8 +46,8 @@ while compute_unit != State.Done: print("Final state check...") state_cu = compute_unit.get_state() - print "PCS State %s" % pilot_compute_service - print "CU: %s State: %s"%(compute_unit, state_cu) + print("PCS State %s" % pilot_compute_service) + print("CU: %s State: %s"%(compute_unit, state_cu)) if state_cu==State.Done: break time.sleep(2) diff --git a/examples/pilot-api/example-pilot-api-decentral.py.bak b/examples/pilot-api/example-pilot-api-decentral.py.bak new file mode 100644 index 00000000..fe44b3bd --- /dev/null +++ b/examples/pilot-api/example-pilot-api-decentral.py.bak @@ -0,0 +1,57 @@ +import sys +import os +import time + +sys.path.append(os.path.join(os.path.dirname(__file__), "../..")) +sys.path.insert(0, os.getcwd() + "/../") +from pilot import PilotComputeService, ComputeDataServiceDecentral, State + +COORDINATION_URL = "redis://localhost:6379" + +if __name__ == "__main__": + + pilot_compute_service = PilotComputeService(coordination_url=COORDINATION_URL) + + # create pilot job service and initiate a pilot job + pilot_compute_description = { + "service_url": 'fork://localhost', + "number_of_processes": 1, + "working_directory": os.path.join(os.getcwd(),"work"), + 'affinity_datacenter_label': "eu-de-south", + 'affinity_machine_label': "mymachine" + } + + pilotjob = pilot_compute_service.create_pilot(pilot_compute_description=pilot_compute_description) + pilotjob2 = pilot_compute_service.create_pilot(pilot_compute_description=pilot_compute_description) + + compute_data_service = ComputeDataServiceDecentral() + compute_data_service.add_pilot_compute_service(pilot_compute_service) + + # start work unit + compute_unit_description = { + "executable": "/bin/date", + "arguments": [""], + "number_of_processes": 1, + "output": "stdout.txt", + "error": "stderr.txt", + } + + for i in range(0,4): + compute_unit = compute_data_service.submit_compute_unit(compute_unit_description) + + + print("Finished setup. 
Waiting for scheduling of CU") + compute_data_service.wait() + + while compute_unit != State.Done: + print("Final state check...") + state_cu = compute_unit.get_state() + print "PCS State %s" % pilot_compute_service + print "CU: %s State: %s"%(compute_unit, state_cu) + if state_cu==State.Done: + break + time.sleep(2) + + print("Terminate Pilot Compute and Compute Data Service") + compute_data_service.cancel() + pilot_compute_service.cancel() diff --git a/examples/pilot-api/example-pilot-api.py b/examples/pilot-api/example-pilot-api.py index 94c42e14..323707a9 100644 --- a/examples/pilot-api/example-pilot-api.py +++ b/examples/pilot-api/example-pilot-api.py @@ -45,20 +45,20 @@ while compute_unit != State.Done: print("Final state check...") state_cu = compute_unit.get_state() - print "Pilot Compute Service State %s" % pilot_compute_service - print "CU: %s State: %s"%(compute_unit, state_cu) + print("Pilot Compute Service State %s" % pilot_compute_service) + print("CU: %s State: %s"%(compute_unit, state_cu)) if state_cu==State.Done: break time.sleep(2) - print "CU Details:" - print str(compute_unit.get_details()) + print("CU Details:") + print(str(compute_unit.get_details())) - print "PJ 1 Details:" - print str(pilotjob.get_details()) - print "PJ 2 Details:" - print str(pilotjob2.get_details()) + print("PJ 1 Details:") + print(str(pilotjob.get_details())) + print("PJ 2 Details:") + print(str(pilotjob2.get_details())) print("Terminate Pilot Compute and Compute Data Service") compute_data_service.cancel() diff --git a/examples/pilot-api/example-pilot-api.py.bak b/examples/pilot-api/example-pilot-api.py.bak new file mode 100644 index 00000000..94c42e14 --- /dev/null +++ b/examples/pilot-api/example-pilot-api.py.bak @@ -0,0 +1,65 @@ +import sys +import os +import time + +sys.path.append(os.path.join(os.path.dirname(__file__), "../..")) +sys.path.insert(0, os.getcwd() + "/../") +from pilot import PilotComputeService, ComputeDataService, State + + +COORDINATION_URL = "redis://localhost:6379" + +if __name__ == "__main__": + + pilot_compute_service = PilotComputeService(coordination_url=COORDINATION_URL) + + # create pilot job service and initiate a pilot job + pilot_compute_description = { + #"service_url": 'fork://localhost', + "service_url": 'ssh://localhost', + "number_of_processes": 1, + #"working_directory": "/tmp", + } + + pilotjob = pilot_compute_service.create_pilot(pilot_compute_description=pilot_compute_description) + pilotjob2 = pilot_compute_service.create_pilot(pilot_compute_description=pilot_compute_description) + + compute_data_service = ComputeDataService() + compute_data_service.add_pilot_compute_service(pilot_compute_service) + + # start work unit + compute_unit_description = { + "executable": "/bin/date", + "arguments": [""], + "number_of_processes": 1, + "output": "stdout.txt", + "error": "stderr.txt", + } + + compute_unit = compute_data_service.submit_compute_unit(compute_unit_description) + + + print("Finished setup. 
Waiting for scheduling of CU") + compute_data_service.wait() + + while compute_unit != State.Done: + print("Final state check...") + state_cu = compute_unit.get_state() + print "Pilot Compute Service State %s" % pilot_compute_service + print "CU: %s State: %s"%(compute_unit, state_cu) + if state_cu==State.Done: + break + time.sleep(2) + + + print "CU Details:" + print str(compute_unit.get_details()) + + print "PJ 1 Details:" + print str(pilotjob.get_details()) + print "PJ 2 Details:" + print str(pilotjob2.get_details()) + + print("Terminate Pilot Compute and Compute Data Service") + compute_data_service.cancel() + pilot_compute_service.cancel() diff --git a/examples/pilot-api/example-pilot-compute-data-cloud.py b/examples/pilot-api/example-pilot-compute-data-cloud.py index f02c68e4..7d807b8d 100644 --- a/examples/pilot-api/example-pilot-compute-data-cloud.py +++ b/examples/pilot-api/example-pilot-compute-data-cloud.py @@ -13,7 +13,7 @@ if __name__ == "__main__": - print COORDINATION_URL + print(COORDINATION_URL) # create pilot data service (factory for data pilots (physical, distributed storage)) # and pilot data pilot_data_service = PilotDataService(coordination_url=COORDINATION_URL) diff --git a/examples/pilot-api/example-pilot-compute-data-cloud.py.bak b/examples/pilot-api/example-pilot-compute-data-cloud.py.bak new file mode 100644 index 00000000..f02c68e4 --- /dev/null +++ b/examples/pilot-api/example-pilot-compute-data-cloud.py.bak @@ -0,0 +1,138 @@ +import sys +import os +import time +import logging +import uuid +#logging.basicConfig(level=logging.DEBUG) + +#sys.path.append(os.path.join(os.path.dirname(__file__), "../..")) +from pilot import PilotComputeService, PilotDataService, ComputeDataService, State +from bigjob import logger + +COORDINATION_URL = "redis://localhost:6379" + +if __name__ == "__main__": + + print COORDINATION_URL + # create pilot data service (factory for data pilots (physical, distributed storage)) + # and pilot data + pilot_data_service = PilotDataService(coordination_url=COORDINATION_URL) + + ################################################################################################### + # Pick one of the Pilot Data Descriptions below + + pilot_data_description_aws={ + "service_url": "s3://pilot-data-" + str(uuid.uuid1()), + "size": 100, + #"region" : "", # or "" for DEFAULT/EAST + "access_key_id":"AKIAJPGNDJRYIG5LIEUA", + "secret_access_key":"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" + } + + pilot_data_description_india = { + "service_url": "walrus://149.165.146.135/pilot-data-" + str(uuid.uuid1()), + #"service_url": "ssh://localhost/tmp/pilot-data-" + str(uuid.uuid1()), + "affinity_datacenter_label": "us-east", + "affinity_machine_label": "", + "access_key_id":"", + "secret_access_key":"" + } + + + pd = pilot_data_service.create_pilot(pilot_data_description=pilot_data_description_aws) + + + # Create Data Unit Description + #base_dir = "../data1" + #url_list = os.listdir(base_dir) + # make absolute paths + #absolute_url_list = [os.path.join(base_dir, i) for i in url_list] + data_unit_description = { + "file_urls": [os.path.join(os.getcwd(), "test.txt")], + "affinity_datacenter_label": "us-east-1", + "affinity_machine_label": "" + } + + # submit pilot data to a pilot store + input_data_unit = pd.submit_data_unit(data_unit_description) + input_data_unit.wait() + + logger.info("Data Unit URL: " + input_data_unit.get_url()) + pilot_compute_service = PilotComputeService(coordination_url=COORDINATION_URL) + + pilot_compute_description_amazon_west = { + 
"service_url": 'ec2+ssh://aws.amazon.com', + "number_of_processes": 1, + 'affinity_datacenter_label': "us-google", + 'affinity_machine_label': "", + # cloud specific attributes + #"vm_id":"ami-d7f742be", + "vm_id": "ami-5c3b1b19", + "vm_ssh_username":"ubuntu", + "vm_ssh_keyname":"MyKey", + "vm_ssh_keyfile":"/Users/luckow/.ssh/id_rsa", + "vm_type":"t1.micro", + "region" : "us-west-1", + "access_key_id":"AKIAJPGNDJRYIG5LIEUA", + "secret_access_key":"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" + + } + + + pilot_compute_description_euca_india = { + "service_url": 'euca+ssh://149.165.146.135:8773/services/Eucalyptus', + #"service_url": 'fork://localhost', + "number_of_processes": 1, + 'affinity_datacenter_label': "us-east", + 'affinity_machine_label': "", + #'working_directory': os.getcwd(), + # cloud specific attributes + "vm_id":"emi-36913A82", + "vm_ssh_username":"root", + "vm_ssh_keyname":"luckow", + "vm_ssh_keyfile":"/Users/luckow/.ssh/eucakey-india", + "vm_type":"c1.xlarge", + "access_key_id":"", + "secret_access_key":"" + } + + pilotjob = pilot_compute_service.create_pilot(pilot_compute_description=pilot_compute_description_amazon_west) + + compute_data_service = ComputeDataService() + compute_data_service.add_pilot_compute_service(pilot_compute_service) + compute_data_service.add_pilot_data_service(pilot_data_service) + + # create empty data unit for output data + output_data_unit_description = { + "file_urls": [] + } + output_data_unit = pd.submit_data_unit(output_data_unit_description) + output_data_unit.wait() + + # create compute unit + compute_unit_description = { + "executable": "/bin/cat", + "arguments": ["test.txt"], + "number_of_processes": 1, + "output": "stdout.txt", + "error": "stderr.txt", + "input_data": [input_data_unit.get_url()], + # Put files stdout.txt and stderr.txt into output data unit + "output_data": [ + { + output_data_unit.get_url(): + ["std*"] + } + ] + } + + compute_unit = compute_data_service.submit_compute_unit(compute_unit_description) + logger.info("Finished setup of ComputeDataService. Waiting for scheduling of PD") + compute_data_service.wait() + + logger.debug("Output Data Unit: " + str(output_data_unit.list())) + + logger.info("Terminate Pilot Compute/Data Service") + compute_data_service.cancel() + pilot_data_service.cancel() + pilot_compute_service.cancel() diff --git a/examples/pilot-api/example-pilot-compute-direct.py b/examples/pilot-api/example-pilot-compute-direct.py index 3cbc7c87..9915411a 100644 --- a/examples/pilot-api/example-pilot-compute-direct.py +++ b/examples/pilot-api/example-pilot-compute-direct.py @@ -40,7 +40,7 @@ logging.debug("Finished submission. 
Waiting for completion of CU") compute_unit.wait() - print str(compute_unit.get_details()) + print(str(compute_unit.get_details())) logging.debug("Terminate Pilot Compute Service") pilot_compute_service.cancel() diff --git a/examples/pilot-api/example-pilot-compute-direct.py.bak b/examples/pilot-api/example-pilot-compute-direct.py.bak new file mode 100644 index 00000000..3cbc7c87 --- /dev/null +++ b/examples/pilot-api/example-pilot-compute-direct.py.bak @@ -0,0 +1,46 @@ +import sys +import os +import time +import logging +logging.basicConfig(level=logging.DEBUG) + +sys.path.append(os.path.join(os.path.dirname(__file__), "../..")) +from pilot import PilotComputeService, ComputeDataService, State + +COORDINATION_URL = "redis://localhost:6379" + +if __name__ == "__main__": + + pilot_compute_service = PilotComputeService(coordination_url=COORDINATION_URL) + + # create pilot job service and initiate a pilot job + pilot_compute_description = { + "service_url": 'fork://localhost', + "number_of_processes": 1, + #"working_directory": os.path.join(os.getcwd(), "agent"), + #"file_transfer": ["ssh://" + os.path.dirname(os.path.abspath(__file__)) + # + "/../test.txt > BIGJOB_WORK_DIR"] + } + + pilotjob = pilot_compute_service.create_pilot(pilot_compute_description=pilot_compute_description) + + # start compute unit + compute_unit_description = { + "executable": "/bin/echo", + "arguments": ["$TBD_DIR"], + "number_of_processes": 1, + "environment": ["TBD_DIR=/tmp"], + "output": "stdout.txt", + "error": "stderr.txt", + "file_transfer": ["ssh://" + os.path.dirname(os.path.abspath(__file__)) + + "/../test.txt > BIGJOB_WORK_DIR"] + } + + compute_unit = pilotjob.submit_compute_unit(compute_unit_description) + logging.debug("Finished submission. Waiting for completion of CU") + compute_unit.wait() + + print str(compute_unit.get_details()) + + logging.debug("Terminate Pilot Compute Service") + pilot_compute_service.cancel() diff --git a/examples/pilot-api/example-pilot-data-reconnect.py b/examples/pilot-api/example-pilot-data-reconnect.py index fe129d18..be33ce5a 100644 --- a/examples/pilot-api/example-pilot-data-reconnect.py +++ b/examples/pilot-api/example-pilot-data-reconnect.py @@ -16,7 +16,7 @@ if len(sys.argv)==2: reconnect_url=sys.argv[1] else: - print "Usage: " + sys.executable + " " + __file__ + " " + print("Usage: " + sys.executable + " " + __file__ + " ") sys.exit(-1) # create pilot data service (factory for pilot stores (physical, distributed storage)) diff --git a/examples/pilot-api/example-pilot-data-reconnect.py.bak b/examples/pilot-api/example-pilot-data-reconnect.py.bak new file mode 100644 index 00000000..fe129d18 --- /dev/null +++ b/examples/pilot-api/example-pilot-data-reconnect.py.bak @@ -0,0 +1,45 @@ +import sys +import os +import time +import logging +import json + +logging.basicConfig(level=logging.DEBUG) + +sys.path.append(os.path.join(os.path.dirname(__file__), "../..")) +from pilot import PilotDataService, ComputeDataService, DataUnit, State + +COORDINATION_URL = "redis://localhost:6379" + +if __name__ == "__main__": + + if len(sys.argv)==2: + reconnect_url=sys.argv[1] + else: + print "Usage: " + sys.executable + " " + __file__ + " " + sys.exit(-1) + + # create pilot data service (factory for pilot stores (physical, distributed storage)) + pilot_data_service = PilotDataService(coordination_url=COORDINATION_URL) + pd_new = pilot_data_service.create_pilot({ + 'service_url': "ssh://localhost/tmp/pilotdata-reconnect/", + 'size':100, + 'affinity_datacenter_label': "eu-de-south", + 
'affinity_machine_label': "mymachine-1" + }) + + + logging.debug("Pilot Data URL: %s"%pilot_data_service.url) + + + ########################################################################### + # PD should only be scheduled to machine 1 + logging.debug("Connect to PD URL: %s"%reconnect_url) + pd = DataUnit(du_url=reconnect_url) + + + # Move PD to another pilot store + pd.add_pilot_data(pd_new) + + #time.sleep(120) + #pilot_data_service.cancel() \ No newline at end of file diff --git a/examples/pilot-api/example-pilot-data.py b/examples/pilot-api/example-pilot-data.py index 2ff20e59..c4e099e2 100644 --- a/examples/pilot-api/example-pilot-data.py +++ b/examples/pilot-api/example-pilot-data.py @@ -74,10 +74,10 @@ du1.export("/tmp/pilot-data-export/") #du2.export("ssh://hotel.futuregrid.org/N/u/luckow/pilot-store-export/") - print "***************************************************************" - print "To reconnect to Data Unit 1 use the following URL: %s"%du1.url - print "Run:\n\n " + sys.executable + " example-pilot-data-reconnect.py %s"%du1.url - print "\n\n******************* SLEEPING *********************************" + print("***************************************************************") + print("To reconnect to Data Unit 1 use the following URL: %s"%du1.url) + print("Run:\n\n " + sys.executable + " example-pilot-data-reconnect.py %s"%du1.url) + print("\n\n******************* SLEEPING *********************************") #time.sleep(1200) logging.debug("Terminate Pilot Data/Compute Data Service") diff --git a/examples/pilot-api/example-pilot-data.py.bak b/examples/pilot-api/example-pilot-data.py.bak new file mode 100644 index 00000000..2ff20e59 --- /dev/null +++ b/examples/pilot-api/example-pilot-data.py.bak @@ -0,0 +1,85 @@ +import sys +import os +import time +import logging +logging.basicConfig(level=logging.DEBUG) + +sys.path.append(os.path.join(os.path.dirname(__file__), "../..")) +from pilot import PilotComputeService, PilotDataService, ComputeDataService, State + +COORDINATION_URL = "redis://localhost:6379" + +if __name__ == "__main__": + + # What files? Create Pilot Data Description using absolute URLs + base_dir = "/Users/luckow/workspace-saga/applications/pilot-store/test/data1" + url_list = os.listdir(base_dir) + # make absolute paths + absolute_url_list = [os.path.join(base_dir, i) for i in url_list] + data_unit_description1 = { + "file_urls":absolute_url_list, + 'affinity_datacenter_label': "eu-de-south", + 'affinity_machine_label': "mymachine-1" + } + logging.debug("Pilot Data Description 1: \n%s"%str(data_unit_description1)) + + + # What files? 
Create Pilot Data Description using remote SSH URLs + # make remotete paths + remote_url_list = ["ssh://localhost"+os.path.join(base_dir, i) for i in url_list] + data_unit_description2 = { + "file_urls":remote_url_list, + 'affinity_datacenter_label': "eu-de-south", + 'affinity_machine_label': "mymachine-2" + } + + logging.debug("Pilot Data Description 2: \n%s"%str(data_unit_description2)) + + + + # create pilot data service (factory for pilot stores (physical, distributed storage)) + pilot_data_service = PilotDataService(coordination_url=COORDINATION_URL) + ps1 = pilot_data_service.create_pilot({ + 'service_url': "ssh://localhost/tmp/pilotdata-1/", + 'size':100, + 'affinity_datacenter_label': "eu-de-south", + 'affinity_machine_label': "mymachine-1" + }) + + ps2 = pilot_data_service.create_pilot({ + 'service_url': "ssh://localhost/tmp/pilotdata-2/", + 'size':100, + 'affinity_datacenter_label': "eu-de-south", + 'affinity_machine_label': "mymachine-2" + }) + + + # create pilot data service + compute_data_service = ComputeDataService() + + # add resources to pilot data service + compute_data_service.add_pilot_data_service(pilot_data_service) + + ########################################################################### + # DU1 should only be scheduled to machine 1 + # DU2 should only be scheduled to machine 2 + du1 = compute_data_service.submit_data_unit(data_unit_description1) + du2 = compute_data_service.submit_data_unit(data_unit_description2) + + logging.debug("Finished setup of Pilot Data and Compute Data Service. Waiting for scheduling of Data Units") + + compute_data_service.wait() + logging.debug("DU scheduled: " + du1.url) + logging.debug("Export files of PD") + du1.export("/tmp/pilot-data-export/") + #du2.export("ssh://hotel.futuregrid.org/N/u/luckow/pilot-store-export/") + + print "***************************************************************" + print "To reconnect to Data Unit 1 use the following URL: %s"%du1.url + print "Run:\n\n " + sys.executable + " example-pilot-data-reconnect.py %s"%du1.url + print "\n\n******************* SLEEPING *********************************" + #time.sleep(1200) + + logging.debug("Terminate Pilot Data/Compute Data Service") + compute_data_service.cancel() + pilot_data_service.cancel() diff --git a/examples/tutorial/barebones-local/local_chained_ensembles.py b/examples/tutorial/barebones-local/local_chained_ensembles.py index 8f234b5e..667daabc 100644 --- a/examples/tutorial/barebones-local/local_chained_ensembles.py +++ b/examples/tutorial/barebones-local/local_chained_ensembles.py @@ -51,7 +51,7 @@ def main(): # Submit task to PilotJob task = pilotjob.submit_compute_unit(task_desc) - print "* Submitted 'A' task '%s' with id '%s'" % (i, task.get_id()) + print("* Submitted 'A' task '%s' with id '%s'" % (i, task.get_id())) task_set_A.append(task) # Chaining tasks i.e submit a compute unit, when compute unit from A is successfully executed. @@ -61,7 +61,7 @@ def main(): while len(task_set_A) > 0: for a_task in task_set_A: if a_task.get_state() == "Done": - print "One 'A' task %s finished. Launching a 'B' task." % (a_task.get_id()) + print("One 'A' task %s finished. Launching a 'B' task." 
% (a_task.get_id())) task_desc = pilot.ComputeUnitDescription() task_desc.executable = '/bin/echo' task_desc.arguments = ['I am an $TASK_SET task with id $TASK_NO', ] @@ -72,12 +72,12 @@ def main(): # Submit task to Pilot Job task = pilotjob.submit_compute_unit(task_desc) - print "* Submitted 'B' task '%s' with id '%s'" % (i, task.get_id()) + print("* Submitted 'B' task '%s' with id '%s'" % (i, task.get_id())) task_set_B.append(task) task_set_A.remove(a_task) - except Exception, ex: - print "AN ERROR OCCURRED: %s" % ((str(ex))) + except Exception as ex: + print("AN ERROR OCCURRED: %s" % ((str(ex)))) # print a stack trace in case of an exception - # this can be helpful for debugging the problem traceback.print_exc() diff --git a/examples/tutorial/barebones-local/local_chained_ensembles.py.bak b/examples/tutorial/barebones-local/local_chained_ensembles.py.bak new file mode 100644 index 00000000..8f234b5e --- /dev/null +++ b/examples/tutorial/barebones-local/local_chained_ensembles.py.bak @@ -0,0 +1,93 @@ +import os +import time +import sys +import pilot +import traceback + +""" DESCRIPTION: This example shows how to run BigJob locally to execute chained tasks. +""" + +#------------------------------------------------------------------------------ +# Redis password and 'user' name +REDIS_PWD = ""# Fill in the password to your server +USER_NAME = ""# Fill in your username on the resource you're running on + +# The coordination server +COORD = "redis://%s@localhost:6379" % REDIS_PWD +# The host to run BigJob on +HOSTNAME = "localhost" +# The working directory on your machine +WORKDIR = "/home/%s/example1" % USER_NAME +# The number of jobs you want to run +NUMBER_JOBS = 4 + + +#------------------------------------------------------------------------------ +# + +def main(): + try: + # this describes the parameters and requirements for our pilot job + pilot_description = pilot.PilotComputeDescription() + pilot_description.service_url = "fork://%s" % HOSTNAME + pilot_description.number_of_processes = 4 + pilot_description.working_directory = WORKDIR + pilot_description.walltime = 10 + + # create a new pilot job + pilot_compute_service = pilot.PilotComputeService(COORD) + pilotjob = pilot_compute_service.create_pilot(pilot_description) + + # submit 'A' tasks to pilot job + task_set_A = list() + for i in range(NUMBER_JOBS): + task_desc = pilot.ComputeUnitDescription() + task_desc.executable = '/bin/echo' + task_desc.arguments = ['I am an $TASK_SET task with id $TASK_NO', ] + task_desc.environment = {'TASK_SET': 'A', 'TASK_NO': i} + task_desc.number_of_processes = 1 + task_desc.output = 'A-stdout.txt' + task_desc.error = 'A-stderr.txt' + + # Submit task to PilotJob + task = pilotjob.submit_compute_unit(task_desc) + print "* Submitted 'A' task '%s' with id '%s'" % (i, task.get_id()) + task_set_A.append(task) + + # Chaining tasks i.e submit a compute unit, when compute unit from A is successfully executed. + # A 'B' task reads the content of the output file of an 'A' task and writes it into its own + # output file. + task_set_B = list() + while len(task_set_A) > 0: + for a_task in task_set_A: + if a_task.get_state() == "Done": + print "One 'A' task %s finished. Launching a 'B' task." 
% (a_task.get_id()) + task_desc = pilot.ComputeUnitDescription() + task_desc.executable = '/bin/echo' + task_desc.arguments = ['I am an $TASK_SET task with id $TASK_NO', ] + task_desc.environment = {'TASK_SET': 'B', 'TASK_NO': a_task} + task_desc.number_of_processes = 1 + task_desc.output = 'B-stdout.txt' + task_desc.error = 'B-stderr.txt' + + # Submit task to Pilot Job + task = pilotjob.submit_compute_unit(task_desc) + print "* Submitted 'B' task '%s' with id '%s'" % (i, task.get_id()) + task_set_B.append(task) + task_set_A.remove(a_task) + + except Exception, ex: + print "AN ERROR OCCURRED: %s" % ((str(ex))) + # print a stack trace in case of an exception - + # this can be helpful for debugging the problem + traceback.print_exc() + sys.exit(-1) + + finally: + # alway try to shut down pilots, otherwise jobs might end up + # lingering in the queue + print ("Terminating BigJob...") + pilotjob.cancel() + pilot_compute_service.cancel() + + diff --git a/examples/tutorial/barebones-local/local_coupled_ensembles.py b/examples/tutorial/barebones-local/local_coupled_ensembles.py index 3d1a5339..541fbc7c 100644 --- a/examples/tutorial/barebones-local/local_coupled_ensembles.py +++ b/examples/tutorial/barebones-local/local_coupled_ensembles.py @@ -53,7 +53,7 @@ task_desc.output = 'A-stdout.txt' task_desc.error = 'A-stderr.txt' task = pilotjob.submit_compute_unit(task_desc) - print "* Submitted 'A' task '%s' with id '%s'" % (i, task.get_id()) + print("* Submitted 'A' task '%s' with id '%s'" % (i, task.get_id())) task_set_A.append(task) # submit 'B' tasks to pilot job @@ -67,13 +67,13 @@ task_desc.output = 'B-stdout.txt' task_desc.error = 'B-stderr.txt' task = pilotjob.submit_compute_unit(task_desc) - print "* Submitted 'B' task '%s' with id '%s'" % (i, task.get_id()) + print("* Submitted 'B' task '%s' with id '%s'" % (i, task.get_id())) task_set_B.append(task) # --------------------------------------------------------------------- - print "Waiting for 'A' and 'B' tasks to complete..." + print("Waiting for 'A' and 'B' tasks to complete...") pilotjob.wait() - print "Executing 'C' tasks now���" + print("Executing 'C' tasks now���") # --------------------------------------------------------------------- # submit 'C' tasks to pilot job. each 'C' task takes the output of @@ -88,16 +88,16 @@ task_desc.output = 'C-stdout.txt' task_desc.error = 'C-stderr.txt' task = pilotjob.submit_compute_unit(task_desc) - print "* Submitted 'C' task '%s' with id '%s'" % (i, task.get_id()) + print("* Submitted 'C' task '%s' with id '%s'" % (i, task.get_id())) task_set_C.append(task) # --------------------------------------------------------------------- - print "Waiting for 'C' tasks to complete..." + print("Waiting for 'C' tasks to complete...") pilotjob.wait() # --------------------------------------------------------------------- - except Exception, ex: - print "AN ERROR OCCURED: %s" % ((str(ex))) + except Exception as ex: + print("AN ERROR OCCURED: %s" % ((str(ex)))) # print a stack trace in case of an exception - # this can be helpful for debugging the problem traceback.print_exc() diff --git a/examples/tutorial/barebones-local/local_coupled_ensembles.py.bak b/examples/tutorial/barebones-local/local_coupled_ensembles.py.bak new file mode 100644 index 00000000..3d1a5339 --- /dev/null +++ b/examples/tutorial/barebones-local/local_coupled_ensembles.py.bak @@ -0,0 +1,113 @@ +import os +import sys +import saga +import pilot +import traceback + +""" This tutorial example shows another form of task set synchronization. 
+ It exemplifies a simple workflow which submit a set of tasks (set A) and + (set B) and wait until they are completed until it submits another set of + tasks (set C). Both A- and B-tasks are 'producers'. C-tasks 'consumers' and + concatenate the output of an A- and a B-tasks. +""" + +#------------------------------------------------------------------------------ +# Redis password and 'user' name +REDIS_PWD = ""# Fill in the password to your server +USER_NAME = ""# Fill in your username on the resource you're running on + +# The coordination server +COORD = "redis://%s@localhost:6379" % REDIS_PWD +# The host to run BigJob on +HOSTNAME = "localhost" +# The working directory on your machine +WORKDIR = "/home/%s/example1" % USER_NAME +# The number of jobs you want to run +NUMBER_JOBS = 4 + + +#------------------------------------------------------------------------------ +# +if __name__ == "__main__": + + try: + # this describes the parameters and requirements for our pilot job + pilot_description = pilot.PilotComputeDescription() + pilot_description.service_url = "fork://%s" % HOSTNAME + pilot_description.number_of_processes = NUMBER_JOBS + pilot_description.working_directory = WORKDIR + pilot_description.walltime = 10 + + # create a new pilot job + pilot_compute_service = pilot.PilotComputeService(COORD) + pilotjob = pilot_compute_service.create_pilot(pilot_description) + + # submit 'A' tasks to pilot job + task_set_A = list() + for i in range(NUMBER_JOBS): + task_desc = pilot.ComputeUnitDescription() + task_desc.executable = '/bin/echo' + task_desc.arguments = ['I am an $TASK_SET task with id $TASK_NO', ] + task_desc.environment = {'TASK_SET': 'A', 'TASK_NO': i} + task_desc.number_of_processes = 1 + task_desc.output = 'A-stdout.txt' + task_desc.error = 'A-stderr.txt' + task = pilotjob.submit_compute_unit(task_desc) + print "* Submitted 'A' task '%s' with id '%s'" % (i, task.get_id()) + task_set_A.append(task) + + # submit 'B' tasks to pilot job + task_set_B = list() + for i in range(NUMBER_JOBS): + task_desc = pilot.ComputeUnitDescription() + task_desc.executable = '/bin/echo' + task_desc.arguments = ['I am a $TASK_SET task with id $TASK_NO'] + task_desc.environment = {'TASK_SET': 'B', 'TASK_NO': i} + task_desc.number_of_processes = 1 + task_desc.output = 'B-stdout.txt' + task_desc.error = 'B-stderr.txt' + task = pilotjob.submit_compute_unit(task_desc) + print "* Submitted 'B' task '%s' with id '%s'" % (i, task.get_id()) + task_set_B.append(task) + + # --------------------------------------------------------------------- + print "Waiting for 'A' and 'B' tasks to complete..." + pilotjob.wait() + print "Executing 'C' tasks now���" + # --------------------------------------------------------------------- + + # submit 'C' tasks to pilot job. each 'C' task takes the output of + # an 'A' and a 'B' task and puts them together. + task_set_C = list() + for i in range(NUMBER_JOBS): + task_desc = pilot.ComputeUnitDescription() + task_desc.executable = '/bin/echo' + task_desc.arguments = ['I am a $TASK_SET task with id $TASK_NO'] + task_desc.environment = {'TASK_SET': 'C', 'TASK_NO': i} + task_desc.number_of_processes = 1 + task_desc.output = 'C-stdout.txt' + task_desc.error = 'C-stderr.txt' + task = pilotjob.submit_compute_unit(task_desc) + print "* Submitted 'C' task '%s' with id '%s'" % (i, task.get_id()) + task_set_C.append(task) + + # --------------------------------------------------------------------- + print "Waiting for 'C' tasks to complete..." 
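# The hunks above apply the same two mechanical Python 2 -> Python 3 changes
# throughout this patch: the print statement becomes the print() function and
# "except Exception, ex" becomes "except Exception as ex". A minimal
# stand-alone sketch of that pattern (run_step() is illustrative only, not a
# BigJob call):
def run_step():
    raise RuntimeError("simulated failure")

try:
    # Python 2: print "submitting..."      /  except Exception, ex:
    # Python 3: print("submitting...")     /  except Exception as ex:
    print("submitting...")
    run_step()
except Exception as ex:
    print("AN ERROR OCCURRED: %s" % (str(ex)))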
+ pilotjob.wait() + # --------------------------------------------------------------------- + + except Exception, ex: + print "AN ERROR OCCURED: %s" % ((str(ex))) + # print a stack trace in case of an exception - + # this can be helpful for debugging the problem + traceback.print_exc() + sys.exit(-1) + + finally: + # alway try to shut down pilots, otherwise jobs might end up + # lingering in the queue + print ("Terminating BigJob...") + pilotjob.cancel() + pilot_compute_service.cancel() + + diff --git a/examples/tutorial/barebones-local/local_mandelbrot.py b/examples/tutorial/barebones-local/local_mandelbrot.py index 37ecb98e..23f74a90 100644 --- a/examples/tutorial/barebones-local/local_mandelbrot.py +++ b/examples/tutorial/barebones-local/local_mandelbrot.py @@ -62,12 +62,12 @@ # Preparing the final image for image in workdir.list('*.gif'): - print ' * Copying %s/%s back to %s' % (workdir.get_url(), image, os.getcwd()) + print(' * Copying %s/%s back to %s' % (workdir.get_url(), image, os.getcwd())) workdir.copy(image, 'sftp://localhost/%s/' % os.getcwd()) # stitch together the final image fullimage = Image.new('RGB',(imgx, imgy),(255,255,255)) - print ' * Stitching together the whole fractal: mandelbrot_full.png' + print(' * Stitching together the whole fractal: mandelbrot_full.png') for x in range(0, tilesx): for y in range(0, tilesy): partimage = Image.open('tile_x%s_y%s.gif' % (x, y)) diff --git a/examples/tutorial/barebones-local/local_mandelbrot.py.bak b/examples/tutorial/barebones-local/local_mandelbrot.py.bak new file mode 100644 index 00000000..37ecb98e --- /dev/null +++ b/examples/tutorial/barebones-local/local_mandelbrot.py.bak @@ -0,0 +1,79 @@ +import os, time, sys +from PIL import Image +import bliss.saga as saga +from pilot import PilotComputeService, ComputeDataService, State + +# the dimension (in pixel) of the whole fractal +imgx = 8192 +imgy = 8192 + +# the number of tiles in X and Y direction +tilesx = 2 +tilesy = 2 + + +### This is the number of jobs you want to run +NUMBER_JOBS=4 +COORDINATION_URL = "redis://localhost:6379" + +if __name__ == "__main__": + + pilot_compute_service = PilotComputeService(COORDINATION_URL) + + # copy image tiles back to our 'local' directory + dirname = 'sftp://localhost/%s/PJ-mbrot/' % '/tmp' + workdir = saga.filesystem.Directory(dirname, saga.filesystem.Create) + + pilot_compute_description={ "service_url": "fork://localhost", + "number_of_processes": 12, + "working_directory": workdir.get_url().path, + "walltime":10 + } + + pilot_compute_service.create_pilot(pilot_compute_description) + + compute_data_service = ComputeDataService() + compute_data_service.add_pilot_compute_service(pilot_compute_service) + + print ("Finished Pilot-Job setup. Submitting compute units") + + # submit compute units + for x in range(0, tilesx): + for y in range(0, tilesy): + # describe a single Mandelbrot job. 
we're using the + # directory created above as the job's working directory + outputfile = 'tile_x%s_y%s.gif' % (x,y) + + compute_unit_description = { + "executable": "python", + "arguments": [os.getenv("HOME")+'/mandelbrot.py', str(imgx), str(imgy), + str(imgx/tilesx*x), str(imgx/tilesx*(x+1)), + str(imgy/tilesy*y), str(imgy/tilesy*(y+1)), + outputfile], + "number_of_processes": 1, + "working_directory":workdir.get_url().path, + "output": "stdout_x%s_y%s.txt" % (x,y), + "error": "stderr_x%s_y%s.txt" % (x,y), + } + compute_data_service.submit_compute_unit(compute_unit_description) + + print ("Waiting for compute units to complete") + compute_data_service.wait() + + # Preparing the final image + for image in workdir.list('*.gif'): + print ' * Copying %s/%s back to %s' % (workdir.get_url(), image, os.getcwd()) + workdir.copy(image, 'sftp://localhost/%s/' % os.getcwd()) + + # stitch together the final image + fullimage = Image.new('RGB',(imgx, imgy),(255,255,255)) + print ' * Stitching together the whole fractal: mandelbrot_full.png' + for x in range(0, tilesx): + for y in range(0, tilesy): + partimage = Image.open('tile_x%s_y%s.gif' % (x, y)) + fullimage.paste(partimage, (imgx/tilesx*x, imgy/tilesy*y, imgx/tilesx*(x+1), imgy/tilesy*(y+1)) ) + fullimage.save("mandelbrot_full.gif", "GIF") + + print ("Terminate Pilot Jobs") + compute_data_service.cancel() + pilot_compute_service.cancel() diff --git a/examples/tutorial/barebones-local/local_simple_ensembles.py b/examples/tutorial/barebones-local/local_simple_ensembles.py index 8ec61f30..8febf781 100644 --- a/examples/tutorial/barebones-local/local_simple_ensembles.py +++ b/examples/tutorial/barebones-local/local_simple_ensembles.py @@ -49,16 +49,16 @@ def main(): task_desc.error = 'simple-ensemble-stderr.txt' task = pilotjob.submit_compute_unit(task_desc) - print "* Submitted task '%s' with id '%s' to %s" % (i, task.get_id(), HOSTNAME) + print("* Submitted task '%s' with id '%s' to %s" % (i, task.get_id(), HOSTNAME)) tasks.append(task) - print "Waiting for tasks to finish..." + print("Waiting for tasks to finish...") pilotjob.wait() return(0) - except Exception, ex: - print "AN ERROR OCCURED: %s" % ((str(ex))) + except Exception as ex: + print("AN ERROR OCCURED: %s" % ((str(ex)))) # print a stack trace in case of an exception - # this can be helpful for debugging the problem traceback.print_exc() diff --git a/examples/tutorial/barebones-local/local_simple_ensembles.py.bak b/examples/tutorial/barebones-local/local_simple_ensembles.py.bak new file mode 100644 index 00000000..8ec61f30 --- /dev/null +++ b/examples/tutorial/barebones-local/local_simple_ensembles.py.bak @@ -0,0 +1,78 @@ +import os +import sys +import pilot +import traceback + +""" DESCRIPTION: This example shows how to run BigJob locally to execute tasks. 
+""" + +#------------------------------------------------------------------------------ +# Redis password and 'user' name +REDIS_PWD = ""# Fill in the password to your server +USER_NAME = ""# Fill in your username on the resource you're running on + +# The coordination server +COORD = "redis://localhost:6379" +# The host to run BigJob on +HOSTNAME = "localhost" +# The working directory on your machine +WORKDIR = "/tmp" +# The number of jobs you want to run +NUMBER_JOBS = 4 + + +#------------------------------------------------------------------------------ +# + +def main(): + try: + # this describes the parameters and requirements for our pilot job + pilot_description = pilot.PilotComputeDescription() + pilot_description.service_url = "fork://%s" % HOSTNAME + pilot_description.number_of_processes = 4 + pilot_description.working_directory = WORKDIR + pilot_description.walltime = 10 + + # create a new pilot job + pilot_compute_service = pilot.PilotComputeService(COORD) + pilotjob = pilot_compute_service.create_pilot(pilot_description) + + # submit tasks to pilot job + tasks = list() + for i in range(NUMBER_JOBS): + task_desc = pilot.ComputeUnitDescription() + task_desc.executable = '/bin/echo' + task_desc.arguments = ['I am task number $TASK_NO', ] + task_desc.environment = {'TASK_NO': i} + task_desc.number_of_processes = 1 + task_desc.output = 'simple-ensemble-stdout.txt' + task_desc.error = 'simple-ensemble-stderr.txt' + + task = pilotjob.submit_compute_unit(task_desc) + print "* Submitted task '%s' with id '%s' to %s" % (i, task.get_id(), HOSTNAME) + tasks.append(task) + + print "Waiting for tasks to finish..." + pilotjob.wait() + + return(0) + + except Exception, ex: + print "AN ERROR OCCURED: %s" % ((str(ex))) + # print a stack trace in case of an exception - + # this can be helpful for debugging the problem + traceback.print_exc() + return(-1) + + finally: + # alway try to shut down pilots, otherwise jobs might end up + # lingering in the queue + print ("Terminating BigJob...") + pilotjob.cancel() + pilot_compute_service.cancel() + + +if __name__ == "__main__": + sys.exit(main()) + + diff --git a/examples/tutorial/chained_ensembles.py b/examples/tutorial/chained_ensembles.py index f45cc6b6..a3e30cc9 100644 --- a/examples/tutorial/chained_ensembles.py +++ b/examples/tutorial/chained_ensembles.py @@ -80,7 +80,7 @@ def main(): # Submit task to PilotJob task = pilotjob.submit_compute_unit(task_desc) - print "* Submitted 'A' task '%s' with id '%s'" % (i, task.get_id()) + print("* Submitted 'A' task '%s' with id '%s'" % (i, task.get_id())) task_set_A.append(task) # Chaining tasks i.e submit a compute unit, when compute unit from A is successfully executed. @@ -90,7 +90,7 @@ def main(): while len(task_set_A) > 0: for a_task in task_set_A: if a_task.get_state() == "Done": - print "One 'A' task %s finished. Launching a 'B' task." % (a_task.get_id()) + print("One 'A' task %s finished. Launching a 'B' task." 
% (a_task.get_id())) # -------- BEGIN USER DEFINED TASK 2 DESCRIPTION --------- # task_desc = pilot.ComputeUnitDescription() @@ -105,14 +105,14 @@ def main(): # Submit task to Pilot Job task = pilotjob.submit_compute_unit(task_desc) - print "* Submitted 'B' task '%s' with id '%s'" % (i, task.get_id()) + print("* Submitted 'B' task '%s' with id '%s'" % (i, task.get_id())) task_set_B.append(task) task_set_A.remove(a_task) return(0) - except Exception, ex: - print "AN ERROR OCCURRED: %s" % ((str(ex))) + except Exception as ex: + print("AN ERROR OCCURRED: %s" % ((str(ex)))) # print a stack trace in case of an exception - # this can be helpful for debugging the problem traceback.print_exc() diff --git a/examples/tutorial/chained_ensembles.py.bak b/examples/tutorial/chained_ensembles.py.bak new file mode 100644 index 00000000..f45cc6b6 --- /dev/null +++ b/examples/tutorial/chained_ensembles.py.bak @@ -0,0 +1,130 @@ +import os +import sys +import pilot +import traceback + +""" DESCRIPTION: Tutorial 2: Chaining Tasks +Note: User must edit PILOT SETUP and TASK DESCRIPTION 1-2 sections +This example will not run if these values are not set. +""" + +# ---------------- BEGIN REQUIRED PILOT SETUP ----------------- + +# Distributed Coordination Service - Redis server and password +REDIS_PWD = ""# Fill in the password to your redis server +REDIS_URL = "redis://%s@localhost:6379" % REDIS_PWD + +# Resource Information +HOSTNAME = ""# Remote Resource URL +USER_NAME = ""# Username on the remote resource +SAGA_ADAPTOR = ""# Name of the SAGA adaptor, e.g. fork, sge, pbs, slurm, etc. +# NOTE: See complete list of BigJob supported SAGA adaptors at: +# http://saga-project.github.io/BigJob/sphinxdoc/tutorial/table.html + +# Fill in queue and allocation for the given resource +# Note: Set fields to "None" if not applicable +QUEUE = ""# Add queue you want to use +PROJECT = ""# Add project / allocation / account to charge + +WALLTIME = ""# Maximum Runtime (minutes) for the Pilot Job + +WORKDIR = "" # Path of Resource Working Directory +# This is the directory where BigJob will store its output and error files + +SPMD_VARIATION = ""# Specify the WAYNESS of SGE clusters ONLY, valid input '12way' for example. + +PROCESSES_PER_NODE = ""# Valid on PBS clusters ONLY - this is the number of processors per node. One processor core is treated as one processor on PBS; e.g. a node with 8 cores has a maximum ppn=8 + +PILOT_SIZE = ""# Number of cores required for the Pilot-Job + +# Job Information +NUMBER_JOBS = ""# The TOTAL number of tasks to run + +# Continue to USER DEFINED TASK DESCRIPTION to add +# the required information about the individual tasks. 
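# The docstring of this tutorial warns that it will not run until the empty
# placeholder settings above are filled in. A small illustrative guard that
# could sit here, before the pilot description is built; the variable names
# are the ones defined above, the guard itself is not part of the patch:
_required = {
    "REDIS_PWD": REDIS_PWD,
    "HOSTNAME": HOSTNAME,
    "USER_NAME": USER_NAME,
    "SAGA_ADAPTOR": SAGA_ADAPTOR,
    "WORKDIR": WORKDIR,
}
_missing = [name for name, value in _required.items() if value == ""]
if _missing:
    sys.exit("Please set the following values first: %s" % ", ".join(_missing))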
+ +# ---------------- END REQUIRED PILOT SETUP ----------------- +# + +def main(): + try: + # this describes the parameters and requirements for our pilot job + pilot_description = pilot.PilotComputeDescription() + pilot_description.service_url = "%s://%s@%s" % (SAGA_ADAPTOR,USER_NAME,HOSTNAME) + pilot_description.queue = QUEUE + pilot_description.project = PROJECT + pilot_description.number_of_processes = PILOT_SIZE + pilot_description.working_directory = WORKDIR + pilot_description.walltime = WALLTIME + pilot_description.processes_per_node = PROCESSES_PER_NODE + pilot_description.spmd_variation = SPMD_VARIATION + + # create a new pilot job + pilot_compute_service = pilot.PilotComputeService(REDIS_URL) + pilotjob = pilot_compute_service.create_pilot(pilot_description) + + # submit 'A' tasks to pilot job + task_set_A = list() + for i in range(NUMBER_JOBS): + + # -------- BEGIN USER DEFINED TASK 1 DESCRIPTION --------- # + task_desc = pilot.ComputeUnitDescription() + task_desc.executable = '/bin/echo' + task_desc.arguments = ['I am an $TASK_SET task with id $TASK_NO', ] + task_desc.environment = {'TASK_SET': 'A', 'TASK_NO': i} + task_desc.spmd_variation = 'single' + task_desc.number_of_processes = 1 + task_desc.output = 'A-stdout.txt' + task_desc.error = 'A-stderr.txt' + # -------- END USER DEFINED TASK 1 DESCRIPTION --------- # + + # Submit task to PilotJob + task = pilotjob.submit_compute_unit(task_desc) + print "* Submitted 'A' task '%s' with id '%s'" % (i, task.get_id()) + task_set_A.append(task) + + # Chaining tasks i.e submit a compute unit, when compute unit from A is successfully executed. + # A 'B' task reads the content of the output file of an 'A' task and writes it into its own + # output file. + task_set_B = list() + while len(task_set_A) > 0: + for a_task in task_set_A: + if a_task.get_state() == "Done": + print "One 'A' task %s finished. Launching a 'B' task." 
% (a_task.get_id()) + + # -------- BEGIN USER DEFINED TASK 2 DESCRIPTION --------- # + task_desc = pilot.ComputeUnitDescription() + task_desc.executable = '/bin/echo' + task_desc.arguments = ['I am a $TASK_SET task with id $TASK_NO', ] + task_desc.environment = {'TASK_SET': 'B', 'TASK_NO': a_task} + task_desc.spmd_variation = 'single' + task_desc.number_of_processes = 1 + task_desc.output = 'B-stdout.txt' + task_desc.error = 'B-stderr.txt' + # -------- END USER DEFINED TASK 2 DESCRIPTION --------- # + + # Submit task to Pilot Job + task = pilotjob.submit_compute_unit(task_desc) + print "* Submitted 'B' task '%s' with id '%s'" % (i, task.get_id()) + task_set_B.append(task) + task_set_A.remove(a_task) + + return(0) + + except Exception, ex: + print "AN ERROR OCCURRED: %s" % ((str(ex))) + # print a stack trace in case of an exception - + # this can be helpful for debugging the problem + traceback.print_exc() + return(-1) + + finally: + # alway try to shut down pilots, otherwise jobs might end up + # lingering in the queue + print ("Terminating BigJob...") + pilotjob.cancel() + pilot_compute_service.cancel() + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/examples/tutorial/coupled_ensembles.py b/examples/tutorial/coupled_ensembles.py index 1a64706c..33b5bcd6 100644 --- a/examples/tutorial/coupled_ensembles.py +++ b/examples/tutorial/coupled_ensembles.py @@ -79,7 +79,7 @@ def main(): # -------- END USER DEFINED TASK 1 DESCRIPTION --------- # task = pilotjob.submit_compute_unit(task_desc) - print "* Submitted 'A' task '%s' with id '%s'" % (i, task.get_id()) + print("* Submitted 'A' task '%s' with id '%s'" % (i, task.get_id())) task_set_A.append(task) @@ -99,13 +99,13 @@ def main(): # -------- END USER DEFINED TASK 2 DESCRIPTION --------- # task = pilotjob.submit_compute_unit(task_desc) - print "* Submitted 'B' task '%s' with id '%s'" % (i, task.get_id()) + print("* Submitted 'B' task '%s' with id '%s'" % (i, task.get_id())) task_set_B.append(task) # --------------------------------------------------------------------- - print "Waiting for 'A' and 'B' tasks to complete..." + print("Waiting for 'A' and 'B' tasks to complete...") pilotjob.wait() - print "Executing 'C' tasks now..." + print("Executing 'C' tasks now...") # --------------------------------------------------------------------- # submit 'C' tasks to pilot job. each 'C' task takes the output of @@ -124,18 +124,18 @@ def main(): # -------- END USER DEFINED TASK 3 DESCRIPTION --------- # task = pilotjob.submit_compute_unit(task_desc) - print "* Submitted 'C' task '%s' with id '%s'" % (i, task.get_id()) + print("* Submitted 'C' task '%s' with id '%s'" % (i, task.get_id())) task_set_C.append(task) # --------------------------------------------------------------------- - print "Waiting for 'C' tasks to complete..." 
+ print("Waiting for 'C' tasks to complete...") pilotjob.wait() # --------------------------------------------------------------------- return(0) - except Exception, ex: - print "AN ERROR OCCURRED: %s" % ((str(ex))) + except Exception as ex: + print("AN ERROR OCCURRED: %s" % ((str(ex)))) # print a stack trace in case of an exception - # this can be helpful for debugging the problem traceback.print_exc() diff --git a/examples/tutorial/coupled_ensembles.py.bak b/examples/tutorial/coupled_ensembles.py.bak new file mode 100644 index 00000000..1a64706c --- /dev/null +++ b/examples/tutorial/coupled_ensembles.py.bak @@ -0,0 +1,153 @@ +import os +import sys +import pilot +import traceback + +""" DESCRIPTION: Tutorial 3: Coupled Ensembles +Note: User must edit PILOT SETUP and TASK DESCRIPTION 1-3 sections +This example will not run if these values are not set. +""" + +# ---------------- BEGIN REQUIRED PILOT SETUP ----------------- + +# Distributed Coordination Service - Redis server and password +REDIS_PWD = ""# Fill in the password to your redis server +REDIS_URL = "redis://%s@localhost:6379" % REDIS_PWD + +# Resource Information +HOSTNAME = ""# Remote Resource URL +USER_NAME = ""# Username on the remote resource +SAGA_ADAPTOR = ""# Name of the SAGA adaptor, e.g. fork, sge, pbs, slurm, etc. +# NOTE: See complete list of BigJob supported SAGA adaptors at: +# http://saga-project.github.io/BigJob/sphinxdoc/tutorial/table.html + +# Fill in queue and allocation for the given resource +# Note: Set fields to "None" if not applicable +QUEUE = ""# Add queue you want to use +PROJECT = ""# Add project / allocation / account to charge + +WALLTIME = ""# Maximum Runtime (minutes) for the Pilot Job + +WORKDIR = "" # Path of Resource Working Directory +# This is the directory where BigJob will store its output and error files + +SPMD_VARIATION = ""# Specify the WAYNESS of SGE clusters ONLY, valid input '12way' for example + +PROCESSES_PER_NODE = ""# Valid on PBS clusters ONLY - this is the number of processors per node. One processor core is treated as one processor on PBS; e.g. a node with 8 cores has a maximum ppn=8 + +PILOT_SIZE = ""# Number of cores required for the Pilot-Job + +# Job Information +NUMBER_JOBS = ""# The TOTAL number of tasks to run + +# Continue to USER DEFINED TASK DESCRIPTION to add +# the required information about the individual tasks. 
+ +# ---------------- END REQUIRED PILOT SETUP ----------------- +# + +def main(): + try: + # this describes the parameters and requirements for our pilot job + pilot_description = pilot.PilotComputeDescription() + pilot_description.service_url = "%s://%s@%s" % (SAGA_ADAPTOR,USER_NAME,HOSTNAME) + pilot_description.queue = QUEUE + pilot_description.project = PROJECT + pilot_description.number_of_processes = PILOT_SIZE + pilot_description.working_directory = WORKDIR + pilot_description.walltime = WALLTIME + pilot_description.processes_per_node = PROCESSES_PER_NODE + pilot_description.spmd_variation = SPMD_VARIATION + + # create a new pilot job + pilot_compute_service = pilot.PilotComputeService(REDIS_URL) + pilotjob = pilot_compute_service.create_pilot(pilot_description) + + # submit 'A' tasks to pilot job + task_set_A = list() + for i in range(NUMBER_JOBS): + + # -------- BEGIN USER DEFINED TASK 1 DESCRIPTION --------- # + task_desc = pilot.ComputeUnitDescription() + task_desc.executable = '/bin/echo' + task_desc.arguments = ['I am an $TASK_SET task with id $TASK_NO', ] + task_desc.environment = {'TASK_SET': 'A', 'TASK_NO': i} + task_desc.spmd_variation = 'single' + task_desc.number_of_processes = 1 + task_desc.output = 'A-stdout.txt' + task_desc.error = 'A-stderr.txt' + # -------- END USER DEFINED TASK 1 DESCRIPTION --------- # + + task = pilotjob.submit_compute_unit(task_desc) + print "* Submitted 'A' task '%s' with id '%s'" % (i, task.get_id()) + task_set_A.append(task) + + + # submit 'B' tasks to pilot job + task_set_B = list() + for i in range(NUMBER_JOBS): + + # -------- BEGIN USER DEFINED TASK 2 DESCRIPTION --------- # + task_desc = pilot.ComputeUnitDescription() + task_desc.executable = '/bin/echo' + task_desc.arguments = ['I am a $TASK_SET task with id $TASK_NO'] + task_desc.environment = {'TASK_SET': 'B', 'TASK_NO': i} + task_desc.spmd_variation = 'single' + task_desc.number_of_processes = 1 + task_desc.output = 'B-stdout.txt' + task_desc.error = 'B-stderr.txt' + # -------- END USER DEFINED TASK 2 DESCRIPTION --------- # + + task = pilotjob.submit_compute_unit(task_desc) + print "* Submitted 'B' task '%s' with id '%s'" % (i, task.get_id()) + task_set_B.append(task) + + # --------------------------------------------------------------------- + print "Waiting for 'A' and 'B' tasks to complete..." + pilotjob.wait() + print "Executing 'C' tasks now..." + # --------------------------------------------------------------------- + + # submit 'C' tasks to pilot job. each 'C' task takes the output of + # an 'A' and a 'B' task and puts them together. + task_set_C = list() + for i in range(NUMBER_JOBS): + # -------- BEGIN USER DEFINED TASK 3 DESCRIPTION --------- # + task_desc = pilot.ComputeUnitDescription() + task_desc.executable = '/bin/echo' + task_desc.arguments = ['I am a $TASK_SET task with id $TASK_NO'] + task_desc.environment = {'TASK_SET': 'C', 'TASK_NO': i} + task_desc.spmd_variation = 'single' + task_desc.number_of_processes = 1 + task_desc.output = 'C-stdout.txt' + task_desc.error = 'C-stderr.txt' + # -------- END USER DEFINED TASK 3 DESCRIPTION --------- # + + task = pilotjob.submit_compute_unit(task_desc) + print "* Submitted 'C' task '%s' with id '%s'" % (i, task.get_id()) + task_set_C.append(task) + + # --------------------------------------------------------------------- + print "Waiting for 'C' tasks to complete..." 
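# The pilotjob.wait() call that follows blocks until every compute unit
# submitted to the pilot has finished. A rough sketch of a finer-grained
# alternative, polling the per-task state API that the chained example in
# this patch already relies on (get_state() returning "Done"); failure
# states would need extra handling and the helper below is illustrative only:
import time

def wait_for(tasks, poll_interval=5):
    pending = list(tasks)
    while pending:
        time.sleep(poll_interval)
        pending = [t for t in pending if t.get_state() != "Done"]

# e.g. wait_for(task_set_A + task_set_B) instead of pilotjob.wait()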
+ pilotjob.wait() + # --------------------------------------------------------------------- + + return(0) + + except Exception, ex: + print "AN ERROR OCCURRED: %s" % ((str(ex))) + # print a stack trace in case of an exception - + # this can be helpful for debugging the problem + traceback.print_exc() + return(-1) + + finally: + # alway try to shut down pilots, otherwise jobs might end up + # lingering in the queue + print ("Terminating BigJob...") + pilotjob.cancel() + pilot_compute_service.cancel() + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/examples/tutorial/simple_ensemble_dataxfer.py b/examples/tutorial/simple_ensemble_dataxfer.py index fa8792da..c7cbe903 100644 --- a/examples/tutorial/simple_ensemble_dataxfer.py +++ b/examples/tutorial/simple_ensemble_dataxfer.py @@ -82,10 +82,10 @@ def main(): # -------- END USER DEFINED TASK DESCRIPTION --------- # task = pilotjob.submit_compute_unit(task_desc) - print "* Submitted task '%s' with id '%s' to %s" % (i, task.get_id(), HOSTNAME) + print("* Submitted task '%s' with id '%s' to %s" % (i, task.get_id(), HOSTNAME)) tasks.append(task) - print "Waiting for tasks to finish..." + print("Waiting for tasks to finish...") pilotjob.wait() # ------------ BEGIN FILE TRANSFER LOGIC ------------- # @@ -95,13 +95,13 @@ def main(): for task in tasks: local_filename = "ex-2-stdout-%s.txt" % (task.get_id()) d.copy("%s/stdout.txt" % (task.get_local_working_directory()), "file://localhost/%s/%s" % (os.getcwd(), local_filename)) - print "* Output for '%s' copied to: './%s'" % (task.get_id(), local_filename) + print("* Output for '%s' copied to: './%s'" % (task.get_id(), local_filename)) # ------------ END FILE TRANSFER LOGIC ------------- # return(0) - except Exception, ex: - print "AN ERROR OCCURRED: %s" % ((str(ex))) + except Exception as ex: + print("AN ERROR OCCURRED: %s" % ((str(ex)))) # print a stack trace in case of an exception - # this can be helpful for debugging the problem traceback.print_exc() diff --git a/examples/tutorial/simple_ensemble_dataxfer.py.bak b/examples/tutorial/simple_ensemble_dataxfer.py.bak new file mode 100644 index 00000000..fa8792da --- /dev/null +++ b/examples/tutorial/simple_ensemble_dataxfer.py.bak @@ -0,0 +1,119 @@ +import os +import sys +import saga +import pilot +import traceback + +""" DESCRIPTION: Tutorial 4: Adding File Transfer +Note: User must edit REQUIRED PILOT SETUP and TASK DESCRIPTION sections +This example will not run if these values are not set. +This example will execute on the HOSTNAME (remote resource) and transfer the output +back to the localhost. +""" + +# ---------------- BEGIN REQUIRED PILOT SETUP ----------------- + +# Distributed Coordination Service - Redis server and password +REDIS_PWD = ""# Fill in the password to your redis server +REDIS_URL = "redis://%s@localhost:6379" % REDIS_PWD + +# Resource Information +HOSTNAME = ""# Remote Resource URL +USER_NAME = ""# Username on the remote resource +SAGA_ADAPTOR = ""# Name of the SAGA adaptor, e.g. fork, sge, pbs, slurm, etc. 
+# NOTE: See complete list of BigJob supported SAGA adaptors at: +# http://saga-project.github.io/BigJob/sphinxdoc/tutorial/table.html + +# Fill in queue and allocation for the given resource +# Note: Set fields to "None" if not applicable +QUEUE = ""# Add queue you want to use +PROJECT = ""# Add project / allocation / account to charge + +WALLTIME = ""# Maximum Runtime (minutes) for the Pilot Job + +WORKDIR = "" # Path of Resource Working Directory +# This is the directory where BigJob will store its output and error files + +SPMD_VARIATION = ""# Specify the WAYNESS of SGE clusters ONLY, valid input '12way' for example + +PROCESSES_PER_NODE = ""# Valid on PBS clusters ONLY - this is the number of processors per node. One processor core is treated as one processor on PBS; e.g. a node with 8 cores has a maximum ppn=8 + +PILOT_SIZE = ""# Number of cores required for the Pilot-Job + +# Job Information +NUMBER_JOBS = ""# The TOTAL number of tasks to run + +# Continue to USER DEFINED TASK DESCRIPTION to add +# the required information about the individual tasks. + +# ---------------- END REQUIRED PILOT SETUP ----------------- +# + +def main(): + try: + # this describes the parameters and requirements for our pilot job + pilot_description = pilot.PilotComputeDescription() + pilot_description.service_url = "%s://%s@%s" % (SAGA_ADAPTOR,USER_NAME,HOSTNAME) + pilot_description.queue = QUEUE + pilot_description.project = PROJECT + pilot_description.number_of_processes = PILOT_SIZE + pilot_description.working_directory = WORKDIR + pilot_description.walltime = WALLTIME + pilot_description.processes_per_node = PROCESSES_PER_NODE + pilot_description.spmd_variation = SPMD_VARIATION + + # create a new pilot job + pilot_compute_service = pilot.PilotComputeService(REDIS_URL) + pilotjob = pilot_compute_service.create_pilot(pilot_description) + + + # submit tasks to pilot job + tasks = list() + for i in range(NUMBER_JOBS): + # -------- BEGIN USER DEFINED TASK DESCRIPTION --------- # + task_desc = pilot.ComputeUnitDescription() + task_desc.executable = '/bin/echo' + task_desc.arguments = ['I am task number $TASK_NO', ] + task_desc.environment = {'TASK_NO': i} + task_desc.number_of_processes = 1 + task_desc.spmd_variation = single # Valid values are single or mpi + task_desc.output = 'stdout.txt' + task_desc.error = 'stderr.txt' + # -------- END USER DEFINED TASK DESCRIPTION --------- # + + task = pilotjob.submit_compute_unit(task_desc) + print "* Submitted task '%s' with id '%s' to %s" % (i, task.get_id(), HOSTNAME) + tasks.append(task) + + print "Waiting for tasks to finish..." + pilotjob.wait() + + # ------------ BEGIN FILE TRANSFER LOGIC ------------- # + # all compute units have finished. now we can use saga-python + # to transfer back the output files... 
+ d = saga.filesystem.Directory("sftp://%s/" % (HOSTNAME)) + for task in tasks: + local_filename = "ex-2-stdout-%s.txt" % (task.get_id()) + d.copy("%s/stdout.txt" % (task.get_local_working_directory()), "file://localhost/%s/%s" % (os.getcwd(), local_filename)) + print "* Output for '%s' copied to: './%s'" % (task.get_id(), local_filename) + # ------------ END FILE TRANSFER LOGIC ------------- # + + return(0) + + except Exception, ex: + print "AN ERROR OCCURRED: %s" % ((str(ex))) + # print a stack trace in case of an exception - + # this can be helpful for debugging the problem + traceback.print_exc() + return(-1) + + finally: + # alway try to shut down pilots, otherwise jobs might end up + # lingering in the queue + print ("Terminating BigJob...") + pilotjob.cancel() + pilot_compute_service.cancel() + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/examples/tutorial/simple_ensembles.py b/examples/tutorial/simple_ensembles.py index c1678b0c..b0a50327 100644 --- a/examples/tutorial/simple_ensembles.py +++ b/examples/tutorial/simple_ensembles.py @@ -79,16 +79,16 @@ def main(): # -------- END USER DEFINED TASK DESCRIPTION --------- # task = pilotjob.submit_compute_unit(task_desc) - print "* Submitted task '%s' with id '%s' to %s" % (i, task.get_id(), HOSTNAME) + print("* Submitted task '%s' with id '%s' to %s" % (i, task.get_id(), HOSTNAME)) tasks.append(task) - print "Waiting for tasks to finish..." + print("Waiting for tasks to finish...") pilotjob.wait() return(0) - except Exception, ex: - print "AN ERROR OCCURRED: %s" % ((str(ex))) + except Exception as ex: + print("AN ERROR OCCURRED: %s" % ((str(ex)))) # print a stack trace in case of an exception - # this can be helpful for debugging the problem traceback.print_exc() diff --git a/examples/tutorial/simple_ensembles.py.bak b/examples/tutorial/simple_ensembles.py.bak new file mode 100644 index 00000000..c1678b0c --- /dev/null +++ b/examples/tutorial/simple_ensembles.py.bak @@ -0,0 +1,106 @@ +import os +import sys +import pilot +import traceback + +""" DESCRIPTION: Tutorial 1: A Simple Workload +Note: User must edit USER VARIABLES section +This example will not run if these values are not set. +""" + +# ---------------- BEGIN REQUIRED PILOT SETUP ----------------- + +# Distributed Coordination Service - Redis server and password +REDIS_PWD = ""# Fill in the password to your redis server +REDIS_URL = "redis://%s@localhost:6379" % REDIS_PWD + +# Resource Information +HOSTNAME = "" # Remote Resource URL +USER_NAME = '' # Username on the remote resource +SAGA_ADAPTOR = '' # Name of the SAGA adaptor, e.g. fork, sge, pbs, slurm, etc. +# NOTE: See complete list of BigJob supported SAGA adaptors at: +# http://saga-project.github.io/BigJob/sphinxdoc/tutorial/table.html + +# Fill in queue and allocation for the given resource +# Note: Set fields to "None" if not applicable +QUEUE = '' # Add queue you want to use +PROJECT = '' # Add project / allocation / account to charge + +WALLTIME = ""# Maximum Runtime (minutes) for the Pilot Job + +WORKDIR = "" # Path of Resource Working Directory +# This is the directory where BigJob will store its output and error files + +SPMD_VARIATION = '' # Specify the WAYNESS of SGE clusters ONLY, valid input '12way' for example + +PROCESSES_PER_NODE = '' # Valid on PBS clusters ONLY - this is the number of processors per node. One processor core is treated as one processor on PBS; e.g. 
a node with 8 cores has a maximum ppn=8 + +PILOT_SIZE = ""# Number of cores required for the Pilot-Job + +# Job Information +NUMBER_JOBS = ""# The TOTAL number of tasks to run + +# Continue to USER DEFINED TASK DESCRIPTION to add +# the required information about the individual tasks. + +# ---------------- END REQUIRED PILOT SETUP ----------------- +# + +def main(): + try: + # this describes the parameters and requirements for our pilot job + pilot_description = pilot.PilotComputeDescription() + pilot_description.service_url = "%s://%s@%s" % (SAGA_ADAPTOR,USER_NAME,HOSTNAME) + pilot_description.queue = QUEUE + pilot_description.project = PROJECT + pilot_description.number_of_processes = PILOT_SIZE + pilot_description.working_directory = WORKDIR + pilot_description.walltime = WALLTIME + pilot_description.processes_per_node = PROCESSES_PER_NODE + pilot_description.spmd_variation = SPMD_VARIATION + + # create a new pilot job + pilot_compute_service = pilot.PilotComputeService(REDIS_URL) + pilotjob = pilot_compute_service.create_pilot(pilot_description) + + + # submit tasks to pilot job + tasks = list() + for i in range(NUMBER_JOBS): + # -------- BEGIN USER DEFINED TASK DESCRIPTION --------- # + task_desc = pilot.ComputeUnitDescription() + task_desc.executable = '/bin/echo' + task_desc.arguments = ['I am task number $TASK_NO'] + task_desc.environment = {'TASK_NO': i} + task_desc.number_of_processes = 1 + task_desc.spmd_variation = "single" # Valid values are single or mpi + task_desc.output = 'simple-ensemble-stdout.txt' + task_desc.error = 'simple-ensemble-stderr.txt' + # -------- END USER DEFINED TASK DESCRIPTION --------- # + + task = pilotjob.submit_compute_unit(task_desc) + print "* Submitted task '%s' with id '%s' to %s" % (i, task.get_id(), HOSTNAME) + tasks.append(task) + + print "Waiting for tasks to finish..." + pilotjob.wait() + + return(0) + + except Exception, ex: + print "AN ERROR OCCURRED: %s" % ((str(ex))) + # print a stack trace in case of an exception - + # this can be helpful for debugging the problem + traceback.print_exc() + return(-1) + + finally: + # alway try to shut down pilots, otherwise jobs might end up + # lingering in the queue + print ("Terminating BigJob...") + pilotjob.cancel() + pilot_compute_service.cancel() + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/examples/xsede2013/01_bigjob-simple-ensemble.py b/examples/xsede2013/01_bigjob-simple-ensemble.py index 7e6122fd..cc6d783c 100644 --- a/examples/xsede2013/01_bigjob-simple-ensemble.py +++ b/examples/xsede2013/01_bigjob-simple-ensemble.py @@ -56,16 +56,16 @@ def main(): task_desc.error = 'stderr.txt' task = pilotjob.submit_compute_unit(task_desc) - print "* Submitted task '%s' with id '%s' to %s" % (i, task.get_id(), HOSTNAME) + print("* Submitted task '%s' with id '%s' to %s" % (i, task.get_id(), HOSTNAME)) tasks.append(task) - print "Waiting for tasks to finish..." 
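# The patch keeps the original %-style formatting and only wraps print in
# parentheses. Under Python 3 the same message could also be written with
# str.format() or an f-string; a small sketch with illustrative values:
i = 3
task_id = "cu-42"          # made-up id, for illustration only
hostname = "localhost"     # stands in for the HOSTNAME setting above
print("* Submitted task '%s' with id '%s' to %s" % (i, task_id, hostname))
print("* Submitted task '{}' with id '{}' to {}".format(i, task_id, hostname))
print(f"* Submitted task '{i}' with id '{task_id}' to {hostname}")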
+ print("Waiting for tasks to finish...") pilotjob.wait() return(0) - except Exception, ex: - print "AN ERROR OCCURED: %s" % ((str(ex))) + except Exception as ex: + print("AN ERROR OCCURED: %s" % ((str(ex)))) # print a stack trace in case of an exception - # this can be helpful for debugging the problem traceback.print_exc() diff --git a/examples/xsede2013/01_bigjob-simple-ensemble.py.bak b/examples/xsede2013/01_bigjob-simple-ensemble.py.bak new file mode 100644 index 00000000..7e6122fd --- /dev/null +++ b/examples/xsede2013/01_bigjob-simple-ensemble.py.bak @@ -0,0 +1,83 @@ +import os +import sys +import pilot +import traceback + +""" This example runs NUMBER_JOBS (32) concurrent '/bin/echo' tasks + on TACC's stampede cluster. A 32-core pilot job is initialized + and 32 single-core tasks are submitted to it. This example also + show basic error handling via 'try/except' and coordinated + shutdown (removing pilot from stampede's queue) once all tasks + have finihsed running via 'finally'. +""" + +#------------------------------------------------------------------------------ +# Redis password and 'user' name a aquired from the environment +REDIS_PWD = os.environ.get('XSEDE_TUTORIAL_REDIS_PASSWORD') +USER_NAME = os.environ.get('XSEDE_TUTORIAL_USER_NAME') + +# The coordination server +COORD = "redis://%s@gw68.quarry.iu.teragrid.org:6379" % REDIS_PWD +# The host (+username) to run BigJob on +HOSTNAME = "sagatut@stampede.tacc.utexas.edu" +# The queue on the remote system +QUEUE = "normal" +# The working directory on the remote cluster / machine +WORKDIR = "/home1/02554/sagatut/XSEDETutorial/%s/example1" % USER_NAME +# The number of jobs you want to run +NUMBER_JOBS = 32 + + +#------------------------------------------------------------------------------ +# +def main(): + try: + # this describes the parameters and requirements for our pilot job + pilot_description = pilot.PilotComputeDescription() + pilot_description.service_url = "slurm+ssh://%s" % HOSTNAME + pilot_description.queue = QUEUE + pilot_description.number_of_processes = 32 + pilot_description.working_directory = WORKDIR + pilot_description.walltime = 10 + + # create a new pilot job + pilot_compute_service = pilot.PilotComputeService(COORD) + pilotjob = pilot_compute_service.create_pilot(pilot_description) + + # submit tasks to pilot job + tasks = list() + for i in range(NUMBER_JOBS): + task_desc = pilot.ComputeUnitDescription() + task_desc.executable = '/bin/echo' + task_desc.arguments = ['I am task number $TASK_NO', ] + task_desc.environment = {'TASK_NO': i} + task_desc.number_of_processes = 1 + task_desc.output = 'stdout.txt' + task_desc.error = 'stderr.txt' + + task = pilotjob.submit_compute_unit(task_desc) + print "* Submitted task '%s' with id '%s' to %s" % (i, task.get_id(), HOSTNAME) + tasks.append(task) + + print "Waiting for tasks to finish..." 
+ pilotjob.wait() + + return(0) + + except Exception, ex: + print "AN ERROR OCCURED: %s" % ((str(ex))) + # print a stack trace in case of an exception - + # this can be helpful for debugging the problem + traceback.print_exc() + return(-1) + + finally: + # alway try to shut down pilots, otherwise jobs might end up + # lingering in the queue + print ("Terminating BigJob...") + pilotjob.cancel() + pilot_compute_service.cancel() + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/examples/xsede2013/02_bigjob-simple-ensemble-datatransfer.py b/examples/xsede2013/02_bigjob-simple-ensemble-datatransfer.py index 1e18053f..27e2a5ef 100644 --- a/examples/xsede2013/02_bigjob-simple-ensemble-datatransfer.py +++ b/examples/xsede2013/02_bigjob-simple-ensemble-datatransfer.py @@ -56,10 +56,10 @@ def main(): task_desc.error = 'stderr.txt' task = pilotjob.submit_compute_unit(task_desc) - print "* Submitted task '%s' with id '%s' to %s" % (i, task.get_id(), HOSTNAME) + print("* Submitted task '%s' with id '%s' to %s" % (i, task.get_id(), HOSTNAME)) tasks.append(task) - print "Waiting for tasks to finish..." + print("Waiting for tasks to finish...") pilotjob.wait() # all compute units have finished. now we can use saga-python @@ -68,12 +68,12 @@ def main(): for task in tasks: local_filename = "ex-2-stdout-%s.txt" % (task.get_id()) d.copy("%s/stdout.txt" % (task.get_local_working_directory()), "file://localhost/%s/%s" % (os.getcwd(), local_filename)) - print "* Output for '%s' copied to: './%s'" % (task.get_id(), local_filename) + print("* Output for '%s' copied to: './%s'" % (task.get_id(), local_filename)) return(0) - except Exception, ex: - print "AN ERROR OCCURED: %s" % ((str(ex))) + except Exception as ex: + print("AN ERROR OCCURED: %s" % ((str(ex)))) # print a stack trace in case of an exception - # this can be helpful for debugging the problem traceback.print_exc() diff --git a/examples/xsede2013/02_bigjob-simple-ensemble-datatransfer.py.bak b/examples/xsede2013/02_bigjob-simple-ensemble-datatransfer.py.bak new file mode 100644 index 00000000..1e18053f --- /dev/null +++ b/examples/xsede2013/02_bigjob-simple-ensemble-datatransfer.py.bak @@ -0,0 +1,91 @@ +import os +import sys +import saga # <=== !! +import pilot +import traceback + +""" This tutorial example extends and improves the first example + (01_bigjob-simple-ensemble.py) by adding file transfer: once + the 32 tasks have finished executing, we use SAGA-Python to + transfer the individual output files back to the local machine. 
+""" + + +#------------------------------------------------------------------------------ +# Redis password and 'user' name a aquired from the environment +REDIS_PWD = os.environ.get('XSEDE_TUTORIAL_REDIS_PASSWORD') +USER_NAME = os.environ.get('XSEDE_TUTORIAL_USER_NAME') + +# The coordination server +COORD = "redis://%s@gw68.quarry.iu.teragrid.org:6379" % REDIS_PWD +# The host to run BigJob on +HOSTNAME = "sagatut@stampede.tacc.utexas.edu" +# The queue on the remote system +QUEUE = "normal" +# The working directory on the remote cluster / machine +WORKDIR = "/home1/02554/sagatut/XSEDETutorial/%s/example2" % USER_NAME +# The number of jobs you want to run +NUMBER_JOBS = 32 + + +#------------------------------------------------------------------------------ +# +def main(): + try: + # this describes the parameters and requirements for our pilot job + pilot_description = pilot.PilotComputeDescription() + pilot_description.service_url = "slurm+ssh://%s" % HOSTNAME + pilot_description.queue = QUEUE + pilot_description.number_of_processes = 32 + pilot_description.working_directory = WORKDIR + pilot_description.walltime = 10 + + # create a new pilot job + pilot_compute_service = pilot.PilotComputeService(COORD) + pilotjob = pilot_compute_service.create_pilot(pilot_description) + + # submit tasks to pilot job + tasks = list() + for i in range(NUMBER_JOBS): + task_desc = pilot.ComputeUnitDescription() + task_desc.executable = '/bin/echo' + task_desc.arguments = ['I am task number $TASK_NO', ] + task_desc.environment = {'TASK_NO': i} + task_desc.number_of_processes = 1 + task_desc.output = 'stdout.txt' + task_desc.error = 'stderr.txt' + + task = pilotjob.submit_compute_unit(task_desc) + print "* Submitted task '%s' with id '%s' to %s" % (i, task.get_id(), HOSTNAME) + tasks.append(task) + + print "Waiting for tasks to finish..." + pilotjob.wait() + + # all compute units have finished. now we can use saga-python + # to transfer back the output files... + d = saga.filesystem.Directory("sftp://%s/" % (HOSTNAME)) + for task in tasks: + local_filename = "ex-2-stdout-%s.txt" % (task.get_id()) + d.copy("%s/stdout.txt" % (task.get_local_working_directory()), "file://localhost/%s/%s" % (os.getcwd(), local_filename)) + print "* Output for '%s' copied to: './%s'" % (task.get_id(), local_filename) + + return(0) + + except Exception, ex: + print "AN ERROR OCCURED: %s" % ((str(ex))) + # print a stack trace in case of an exception - + # this can be helpful for debugging the problem + traceback.print_exc() + return(-1) + + finally: + # alway try to shut down pilots, otherwise jobs might end up + # lingering in the queue + print ("Terminating BigJob...") + pilotjob.cancel() + pilot_compute_service.cancel() + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/examples/xsede2013/03_bigjob_chained_ensemble.py b/examples/xsede2013/03_bigjob_chained_ensemble.py index 992844be..2eeaa8fc 100644 --- a/examples/xsede2013/03_bigjob_chained_ensemble.py +++ b/examples/xsede2013/03_bigjob_chained_ensemble.py @@ -55,7 +55,7 @@ task_desc.output = 'A-stdout.txt' task_desc.error = 'A-stderr.txt' task = pilotjob.submit_compute_unit(task_desc) - print "* Submitted 'A' task '%s' with id '%s'" % (i, task.get_id()) + print("* Submitted 'A' task '%s' with id '%s'" % (i, task.get_id())) task_set_A.append(task) # Chaining tasks i.e submit a compute unit, when compute unit from A is successfully executed. 
@@ -65,7 +65,7 @@ while len(task_set_A) > 0: for a_task in task_set_A: if a_task.get_state() == "Done": - print "One 'A' task %s finished. Launching a 'B' task." % (a_task.get_id()) + print("One 'A' task %s finished. Launching a 'B' task." % (a_task.get_id())) task_desc = pilot.ComputeUnitDescription() task_desc.executable = '/bin/cat' task_desc.arguments = ["%s/A-stdout.txt" % a_task.get_local_working_directory()] @@ -73,12 +73,12 @@ task_desc.output = 'B-stdout.txt' task_desc.error = 'B-stderr.txt' task = pilotjob.submit_compute_unit(task_desc) - print "* Submitted 'B' task '%s' with id '%s'" % (i, task.get_id()) + print("* Submitted 'B' task '%s' with id '%s'" % (i, task.get_id())) task_set_B.append(task) task_set_A.remove(a_task) # --------------------------------------------------------------------- - print "Waiting for 'B' tasks to finish..." + print("Waiting for 'B' tasks to finish...") pilotjob.wait() # --------------------------------------------------------------------- @@ -88,12 +88,12 @@ for task in task_set_B: local_filename = "ex3-stdout-%s.txt" % (task.get_id()) d.copy("%s/B-stdout.txt" % (task.get_local_working_directory()), "file://localhost/%s/%s" % (os.getcwd(), local_filename)) - print "* Output for '%s' copied to: './%s'" % (task.get_id(), local_filename) + print("* Output for '%s' copied to: './%s'" % (task.get_id(), local_filename)) sys.exit(0) - except Exception, ex: - print "AN ERROR OCCURED: %s" % ((str(ex))) + except Exception as ex: + print("AN ERROR OCCURED: %s" % ((str(ex)))) # print a stack trace in case of an exception - # this can be helpful for debugging the problem traceback.print_exc() diff --git a/examples/xsede2013/03_bigjob_chained_ensemble.py.bak b/examples/xsede2013/03_bigjob_chained_ensemble.py.bak new file mode 100644 index 00000000..992844be --- /dev/null +++ b/examples/xsede2013/03_bigjob_chained_ensemble.py.bak @@ -0,0 +1,107 @@ +import os +import sys +import saga +import pilot +import traceback + +""" This tutorial example introduces task synchronization. It submits a + set of 32 '/bin/echo' tasks (task set A). For every successfully completed + task, we submits another '/bin/cat' task from task set B to the same Pilot-Job. + Task from set A can be seen as producers and tasks from task set B as + consumers, since B-tasks read 'consume' the output file an A-tasks. 
+""" + +#------------------------------------------------------------------------------ +# Redis password and 'user' name a aquired from the environment +REDIS_PWD = os.environ.get('XSEDE_TUTORIAL_REDIS_PASSWORD') +USER_NAME = os.environ.get('XSEDE_TUTORIAL_USER_NAME') + +# The coordination server +COORD = "redis://%s@gw68.quarry.iu.teragrid.org:6379" % REDIS_PWD +# The host to run BigJob on +HOSTNAME = "sagatut@stampede.tacc.utexas.edu" +# The queue on the remote system +QUEUE = "normal" +# The working directory on the remote cluster / machine +WORKDIR = "/home1/02554/sagatut/XSEDETutorial/%s/example3" % USER_NAME +# The number of jobs you want to run +NUMBER_JOBS = 32 + + +#------------------------------------------------------------------------------ +# +if __name__ == "__main__": + try: + # this describes the parameters and requirements for our pilot job + pilot_description = pilot.PilotComputeDescription() + pilot_description.service_url = "slurm+ssh://%s" % HOSTNAME + pilot_description.queue = QUEUE + pilot_description.number_of_processes = NUMBER_JOBS + pilot_description.working_directory = WORKDIR + pilot_description.walltime = 10 + + # create a new pilot job + pilot_compute_service = pilot.PilotComputeService(COORD) + pilotjob = pilot_compute_service.create_pilot(pilot_description) + + # submit 'A' tasks to pilot job + task_set_A = list() + for i in range(NUMBER_JOBS): + task_desc = pilot.ComputeUnitDescription() + task_desc.executable = '/bin/echo' + task_desc.arguments = ['I am an $TASK_SET task with id $TASK_NO', ] + task_desc.environment = {'TASK_SET': 'A', 'TASK_NO': i} + task_desc.number_of_processes = 1 + task_desc.output = 'A-stdout.txt' + task_desc.error = 'A-stderr.txt' + task = pilotjob.submit_compute_unit(task_desc) + print "* Submitted 'A' task '%s' with id '%s'" % (i, task.get_id()) + task_set_A.append(task) + + # Chaining tasks i.e submit a compute unit, when compute unit from A is successfully executed. + # A 'B' task reads the content of the output file of an 'A' task and writes it into its own + # output file. + task_set_B = list() + while len(task_set_A) > 0: + for a_task in task_set_A: + if a_task.get_state() == "Done": + print "One 'A' task %s finished. Launching a 'B' task." % (a_task.get_id()) + task_desc = pilot.ComputeUnitDescription() + task_desc.executable = '/bin/cat' + task_desc.arguments = ["%s/A-stdout.txt" % a_task.get_local_working_directory()] + task_desc.number_of_processes = 1 + task_desc.output = 'B-stdout.txt' + task_desc.error = 'B-stderr.txt' + task = pilotjob.submit_compute_unit(task_desc) + print "* Submitted 'B' task '%s' with id '%s'" % (i, task.get_id()) + task_set_B.append(task) + task_set_A.remove(a_task) + + # --------------------------------------------------------------------- + print "Waiting for 'B' tasks to finish..." + pilotjob.wait() + # --------------------------------------------------------------------- + + # all 'B' tasks have finished. now we can use saga-python + # to transfer back the output files... 
+ d = saga.filesystem.Directory("sftp://%s/" % (HOSTNAME)) + for task in task_set_B: + local_filename = "ex3-stdout-%s.txt" % (task.get_id()) + d.copy("%s/B-stdout.txt" % (task.get_local_working_directory()), "file://localhost/%s/%s" % (os.getcwd(), local_filename)) + print "* Output for '%s' copied to: './%s'" % (task.get_id(), local_filename) + + sys.exit(0) + + except Exception, ex: + print "AN ERROR OCCURED: %s" % ((str(ex))) + # print a stack trace in case of an exception - + # this can be helpful for debugging the problem + traceback.print_exc() + sys.exit(-1) + + finally: + # alway try to shut down pilots, otherwise jobs might end up + # lingering in the queue + print ("Terminating BigJob...") + pilotjob.cancel() + pilot_compute_service.cancel() diff --git a/examples/xsede2013/04_bigjob_coupled_ensembles.py b/examples/xsede2013/04_bigjob_coupled_ensembles.py index ded470e5..e78b2a13 100644 --- a/examples/xsede2013/04_bigjob_coupled_ensembles.py +++ b/examples/xsede2013/04_bigjob_coupled_ensembles.py @@ -56,7 +56,7 @@ task_desc.output = 'A-stdout.txt' task_desc.error = 'A-stderr.txt' task = pilotjob.submit_compute_unit(task_desc) - print "* Submitted 'A' task '%s' with id '%s'" % (i, task.get_id()) + print("* Submitted 'A' task '%s' with id '%s'" % (i, task.get_id())) task_set_A.append(task) # submit 'B' tasks to pilot job @@ -70,11 +70,11 @@ task_desc.output = 'B-stdout.txt' task_desc.error = 'B-stderr.txt' task = pilotjob.submit_compute_unit(task_desc) - print "* Submitted 'B' task '%s' with id '%s'" % (i, task.get_id()) + print("* Submitted 'B' task '%s' with id '%s'" % (i, task.get_id())) task_set_B.append(task) # --------------------------------------------------------------------- - print "Waiting for 'A' and 'B' tasks to complete..." + print("Waiting for 'A' and 'B' tasks to complete...") pilotjob.wait() # --------------------------------------------------------------------- @@ -94,11 +94,11 @@ task_desc.output = 'C-stdout.txt' task_desc.error = 'C-stderr.txt' task = pilotjob.submit_compute_unit(task_desc) - print "* Submitted 'C' task '%s' with id '%s'" % (i, task.get_id()) + print("* Submitted 'C' task '%s' with id '%s'" % (i, task.get_id())) task_set_C.append(task) # --------------------------------------------------------------------- - print "Waiting for 'C' tasks to complete..." + print("Waiting for 'C' tasks to complete...") pilotjob.wait() # --------------------------------------------------------------------- @@ -108,13 +108,13 @@ for task in task_set_C: local_filename = "ex4-stdout-%s.txt" % (task.get_id()) d.copy("%s/C-stdout.txt" % (task.get_local_working_directory()), "file://localhost/%s/%s" % (os.getcwd(), local_filename)) - print "* Output for '%s' copied to: './%s'" % (task.get_id(), local_filename) + print("* Output for '%s' copied to: './%s'" % (task.get_id(), local_filename)) sys.exit(0) - except Exception, ex: - print "AN ERROR OCCURED: %s" % ((str(ex))) + except Exception as ex: + print("AN ERROR OCCURED: %s" % ((str(ex)))) # print a stack trace in case of an exception - # this can be helpful for debugging the problem traceback.print_exc() diff --git a/examples/xsede2013/04_bigjob_coupled_ensembles.py.bak b/examples/xsede2013/04_bigjob_coupled_ensembles.py.bak new file mode 100644 index 00000000..ded470e5 --- /dev/null +++ b/examples/xsede2013/04_bigjob_coupled_ensembles.py.bak @@ -0,0 +1,128 @@ +import os +import sys +import saga +import pilot +import traceback + +""" This tutorial example shows another form of task set synchronization. 
+ It exemplifies a simple workflow which submit a set of tasks (set A) and + (set B) and wait until they are completed until it submits another set of + tasks (set C). Both A- and B-tasks are 'producers'. C-tasks 'consumers' and + concatenate the output of an A- and a B-tasks. +""" + +#------------------------------------------------------------------------------ +# Redis password and 'user' name a aquired from the environment +REDIS_PWD = os.environ.get('XSEDE_TUTORIAL_REDIS_PASSWORD') +USER_NAME = os.environ.get('XSEDE_TUTORIAL_USER_NAME') + +# The coordination server +COORD = "redis://%s@gw68.quarry.iu.teragrid.org:6379" % REDIS_PWD +# The host to run BigJob on +HOSTNAME = "sagatut@stampede.tacc.utexas.edu" +# The queue on the remote system +QUEUE = "normal" +# The working directory on the remote cluster / machine +WORKDIR = "/home1/02554/sagatut/XSEDETutorial/%s/example4" % USER_NAME +# The number of jobs you want to run +NUMBER_JOBS = 32 + + +#------------------------------------------------------------------------------ +# +if __name__ == "__main__": + + try: + # this describes the parameters and requirements for our pilot job + pilot_description = pilot.PilotComputeDescription() + pilot_description.service_url = "slurm+ssh://%s" % HOSTNAME + pilot_description.queue = QUEUE + pilot_description.number_of_processes = NUMBER_JOBS + pilot_description.working_directory = WORKDIR + pilot_description.walltime = 10 + + # create a new pilot job + pilot_compute_service = pilot.PilotComputeService(COORD) + pilotjob = pilot_compute_service.create_pilot(pilot_description) + + # submit 'A' tasks to pilot job + task_set_A = list() + for i in range(NUMBER_JOBS): + task_desc = pilot.ComputeUnitDescription() + task_desc.executable = '/bin/echo' + task_desc.arguments = ['I am an $TASK_SET task with id $TASK_NO', ] + task_desc.environment = {'TASK_SET': 'A', 'TASK_NO': i} + task_desc.number_of_processes = 1 + task_desc.output = 'A-stdout.txt' + task_desc.error = 'A-stderr.txt' + task = pilotjob.submit_compute_unit(task_desc) + print "* Submitted 'A' task '%s' with id '%s'" % (i, task.get_id()) + task_set_A.append(task) + + # submit 'B' tasks to pilot job + task_set_B = list() + for i in range(NUMBER_JOBS): + task_desc = pilot.ComputeUnitDescription() + task_desc.executable = '/bin/echo' + task_desc.arguments = ['I am a $TASK_SET task with id $TASK_NO'] + task_desc.environment = {'TASK_SET': 'B', 'TASK_NO': i} + task_desc.number_of_processes = 1 + task_desc.output = 'B-stdout.txt' + task_desc.error = 'B-stderr.txt' + task = pilotjob.submit_compute_unit(task_desc) + print "* Submitted 'B' task '%s' with id '%s'" % (i, task.get_id()) + task_set_B.append(task) + + # --------------------------------------------------------------------- + print "Waiting for 'A' and 'B' tasks to complete..." + pilotjob.wait() + # --------------------------------------------------------------------- + + # submit 'C' tasks to pilot job. each 'C' task takes the output of + # an 'A' and a 'B' task and puts them together. 
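As the comment above says, the 'C' stage is a simple fan-in: once both input sets have completed, the i-th C task concatenates the stdout of the i-th A task with the stdout of the i-th B task. A minimal sketch of that pairing step, using a hypothetical helper name and plain directory strings instead of ComputeUnitDescription objects:

    def build_fanin_commands(a_dirs, b_dirs):
        """Pair the i-th A and B working directories and build one /bin/cat command per pair."""
        return [["/bin/cat",
                 "%s/A-stdout.txt" % a_dir,
                 "%s/B-stdout.txt" % b_dir]
                for a_dir, b_dir in zip(a_dirs, b_dirs)]

    # build_fanin_commands(["/tmp/a0"], ["/tmp/b0"])
    # -> [['/bin/cat', '/tmp/a0/A-stdout.txt', '/tmp/b0/B-stdout.txt']]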
+ task_set_C = list() + for i in range(NUMBER_JOBS): + a_task_output = "%s/A-stdout.txt" \ + % task_set_A[i].get_local_working_directory() + b_task_output = "%s/B-stdout.txt" \ + % task_set_B[i].get_local_working_directory() + + task_desc = pilot.ComputeUnitDescription() + task_desc.executable = '/bin/cat' + task_desc.arguments = [a_task_output, b_task_output] + task_desc.number_of_processes = 1 + task_desc.output = 'C-stdout.txt' + task_desc.error = 'C-stderr.txt' + task = pilotjob.submit_compute_unit(task_desc) + print "* Submitted 'C' task '%s' with id '%s'" % (i, task.get_id()) + task_set_C.append(task) + + # --------------------------------------------------------------------- + print "Waiting for 'C' tasks to complete..." + pilotjob.wait() + # --------------------------------------------------------------------- + + # all 'C' tasks have finished. now we can use saga-python + # to transfer back the output files... + d = saga.filesystem.Directory("sftp://%s/" % (HOSTNAME)) + for task in task_set_C: + local_filename = "ex4-stdout-%s.txt" % (task.get_id()) + d.copy("%s/C-stdout.txt" % (task.get_local_working_directory()), "file://localhost/%s/%s" % (os.getcwd(), local_filename)) + print "* Output for '%s' copied to: './%s'" % (task.get_id(), local_filename) + + + sys.exit(0) + + except Exception, ex: + print "AN ERROR OCCURED: %s" % ((str(ex))) + # print a stack trace in case of an exception - + # this can be helpful for debugging the problem + traceback.print_exc() + sys.exit(-1) + + finally: + # alway try to shut down pilots, otherwise jobs might end up + # lingering in the queue + print ("Terminating BigJob...") + pilotjob.cancel() + pilot_compute_service.cancel() diff --git a/examples/xsede2013/05_bigjob_mandelbrot.py b/examples/xsede2013/05_bigjob_mandelbrot.py index 0cdde1d2..39b8837a 100644 --- a/examples/xsede2013/05_bigjob_mandelbrot.py +++ b/examples/xsede2013/05_bigjob_mandelbrot.py @@ -84,23 +84,23 @@ def main(): task_desc.number_of_processes = 1 task = pilotjob.submit_compute_unit(task_desc) - print "* Submitted task '%s' to %s" % (task.get_id(), HOSTNAME) + print("* Submitted task '%s' to %s" % (task.get_id(), HOSTNAME)) tasks.append(task) # --------------------------------------------------------------------- - print "Waiting for tasks to finish..." 
+ print("Waiting for tasks to finish...") pilotjob.wait() # --------------------------------------------------------------------- # copy image tiles back to our 'local' directory for image in workdir.list('*.gif'): - print ' * Copying %s/%s back to %s' % (workdir.get_url(), - image, os.getcwd()) + print(' * Copying %s/%s back to %s' % (workdir.get_url(), + image, os.getcwd())) workdir.copy(image, 'file://localhost/%s/' % os.getcwd()) # stitch together the final image fullimage = Image.new('RGB', (IMGX, IMGY), (255, 255, 255)) - print ' * Stitching together the whole fractal: mandelbrot_full.gif' + print(' * Stitching together the whole fractal: mandelbrot_full.gif') for x in range(0, TILESX): for y in range(0, TILESY): partimage = Image.open('tile_x%s_y%s.gif' % (x, y)) @@ -111,8 +111,8 @@ def main(): return(0) - except Exception, ex: - print "AN ERROR OCCURED: %s" % ((str(ex))) + except Exception as ex: + print("AN ERROR OCCURED: %s" % ((str(ex)))) # print a stack trace in case of an exception - # this can be helpful for debugging the problem traceback.print_exc() diff --git a/examples/xsede2013/05_bigjob_mandelbrot.py.bak b/examples/xsede2013/05_bigjob_mandelbrot.py.bak new file mode 100644 index 00000000..0cdde1d2 --- /dev/null +++ b/examples/xsede2013/05_bigjob_mandelbrot.py.bak @@ -0,0 +1,130 @@ +import os +import sys +import saga +import pilot +import traceback + +from PIL import Image + +""" +This tutorial example splits up the calculation of a traditional Mandelbrot set using tiles. +This code aims to show the concurrent execution of tasks +(as opposed to the analagous saga-python example, which submits tasks serially). +It also demonstrates file-movement (calculations are done in a temporary directory, but +the final image is concatenated back in the home directory), as well as working +with external python libraries and alternate executables. 
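One conversion detail the mandelbrot hunks above do not touch: the tile boundaries are computed with expressions such as IMGX/TILESX*x, which are integer divisions under Python 2 but produce floats under Python 3, and both the wrapper-script arguments and the Image.paste box generally expect integers. A small illustration of the tiling arithmetic with floor division, which behaves identically on both interpreters (a sketch, not part of the patch):

    IMGX, IMGY = 8192, 8192     # dimensions of the full fractal in pixels
    TILESX, TILESY = 4, 4       # number of tiles along each axis

    def tile_box(x, y):
        """Return the (left, upper, right, lower) pixel box of tile (x, y)."""
        return (IMGX // TILESX * x,        IMGY // TILESY * y,
                IMGX // TILESX * (x + 1),  IMGY // TILESY * (y + 1))

    # tile_box(0, 0) -> (0, 0, 2048, 2048); tile_box(3, 3) -> (6144, 6144, 8192, 8192)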
+""" + +#------------------------------------------------------------------------------ +# Redis password and 'user' name a aquired from the environment +REDIS_PWD = os.environ.get('XSEDE_TUTORIAL_REDIS_PASSWORD') +USER_NAME = os.environ.get('XSEDE_TUTORIAL_USER_NAME') + +# The coordination server +COORD = "redis://%s@gw68.quarry.iu.teragrid.org:6379" % REDIS_PWD +# The host (+username) to run BigJob on +#HOSTNAME = "sagatut@stampede.tacc.utexas.edu" +HOSTNAME = "sagatut@stampede.tacc.utexas.edu" +# The queue on the remote system +QUEUE = "normal" +# The working directory on the remote cluster / machine +WORKDIR = "/home1/02554/sagatut/XSEDETutorial/%s/example5" % USER_NAME + +# The number of jobs you want to run +NUMBER_JOBS = 16 + +# the dimension of the whole fractal (in pixel) +IMGX = 8192 +IMGY = 8192 + +# the number of tiles in X and Y direction +TILESX = 4 +TILESY = 4 + + +#------------------------------------------------------------------------------ +# +def main(): + try: + # copy the executable and warpper script to the remote host + workdir = saga.filesystem.Directory("sftp://%s/%s" % (HOSTNAME, WORKDIR), + saga.filesystem.CREATE_PARENTS) + mbwrapper = saga.filesystem.File("file://localhost/%s/mandelbrot.sh" % os.getcwd()) + mbwrapper.copy(workdir.get_url()) + mbexe = saga.filesystem.File("file://localhost/%s/mandelbrot.py" % os.getcwd()) + mbexe.copy(workdir.get_url()) + + # this describes the parameters and requirements for our pilot job + pilot_description = pilot.PilotComputeDescription() + pilot_description.service_url = "slurm+ssh://%s" % HOSTNAME + pilot_description.queue = QUEUE + pilot_description.number_of_processes = 32 + pilot_description.working_directory = WORKDIR + pilot_description.walltime = 10 + + # create a new pilot job + pilot_compute_service = pilot.PilotComputeService(COORD) + pilotjob = pilot_compute_service.create_pilot(pilot_description) + + # submit tasks to pilot job + tasks = list() + + for x in range(0, TILESX): + for y in range(0, TILESY): + # describe a single Mandelbrot job. we're using the + # directory created above as the job's working directory + task_desc = pilot.ComputeUnitDescription() + task_desc.executable = '/bin/sh' + task_desc.arguments = ["/%s/mandelbrot.sh" % WORKDIR, IMGX, IMGY, + (IMGX/TILESX*x), (IMGX/TILESX*(x+1)), + (IMGY/TILESY*y), (IMGY/TILESY*(y+1)), + '%s/tile_x%s_y%s.gif' % (WORKDIR, x, y)] + + task_desc.wall_time_limit = 10 + task_desc.number_of_processes = 1 + + task = pilotjob.submit_compute_unit(task_desc) + print "* Submitted task '%s' to %s" % (task.get_id(), HOSTNAME) + tasks.append(task) + + # --------------------------------------------------------------------- + print "Waiting for tasks to finish..." 
+ pilotjob.wait() + # --------------------------------------------------------------------- + + # copy image tiles back to our 'local' directory + for image in workdir.list('*.gif'): + print ' * Copying %s/%s back to %s' % (workdir.get_url(), + image, os.getcwd()) + workdir.copy(image, 'file://localhost/%s/' % os.getcwd()) + + # stitch together the final image + fullimage = Image.new('RGB', (IMGX, IMGY), (255, 255, 255)) + print ' * Stitching together the whole fractal: mandelbrot_full.gif' + for x in range(0, TILESX): + for y in range(0, TILESY): + partimage = Image.open('tile_x%s_y%s.gif' % (x, y)) + fullimage.paste(partimage, + (IMGX/TILESX*x, IMGY/TILESY*y, + IMGX/TILESX*(x+1), IMGY/TILESY*(y+1))) + fullimage.save("mandelbrot_full.gif", "GIF") + + return(0) + + except Exception, ex: + print "AN ERROR OCCURED: %s" % ((str(ex))) + # print a stack trace in case of an exception - + # this can be helpful for debugging the problem + traceback.print_exc() + return(-1) + + finally: + # alway try to shut down pilots, otherwise jobs might end up + # lingering in the queue + print ("Terminating BigJob...") + pilotjob.cancel() + pilot_compute_service.cancel() + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/examples/xsede2013/CDS-01_bigjob-simple-ensemble.py b/examples/xsede2013/CDS-01_bigjob-simple-ensemble.py index 3c9c71cd..14fab1c6 100644 --- a/examples/xsede2013/CDS-01_bigjob-simple-ensemble.py +++ b/examples/xsede2013/CDS-01_bigjob-simple-ensemble.py @@ -54,16 +54,16 @@ def main(): task_desc.error = 'simple-ensemble-stderr.txt' task = compute_data_service.submit_compute_unit(task_desc) - print "* Submitted task '%s' with id '%s' to %s" % (i, task.get_id(), HOSTNAME) + print("* Submitted task '%s' with id '%s' to %s" % (i, task.get_id(), HOSTNAME)) tasks.append(task) - print "Waiting for tasks to finish..." + print("Waiting for tasks to finish...") compute_data_service.wait() return(0) - except Exception, ex: - print "AN ERROR OCCURED: %s" % ((str(ex))) + except Exception as ex: + print("AN ERROR OCCURED: %s" % ((str(ex)))) # print a stack trace in case of an exception - # this can be helpful for debugging the problem traceback.print_exc() diff --git a/examples/xsede2013/CDS-01_bigjob-simple-ensemble.py.bak b/examples/xsede2013/CDS-01_bigjob-simple-ensemble.py.bak new file mode 100644 index 00000000..3c9c71cd --- /dev/null +++ b/examples/xsede2013/CDS-01_bigjob-simple-ensemble.py.bak @@ -0,0 +1,81 @@ +import os +import sys +import pilot +import traceback + +""" DESCRIPTION: This example does this... +""" + +# Redis password is read from the environment. The example can be run like this: +# REDIS_PASSWORD=ILikeBigJob_wITH-REdIS python examples/example_styleguide.py +# Alternatively, for tutorials, etc. 
REDIS_PASSWORD can be defined in /etc/profile + +#------------------------------------------------------------------------------ +# +REDIS_PWD = os.environ.get('REDIS_PASSWORD') +# The coordination server +COORD = "redis://%s@gw68.quarry.iu.teragrid.org:6379" % REDIS_PWD +# The host to run BigJob on +HOSTNAME = "localhost" +# The working directory on the remote cluster / machine +WORKDIR = os.getenv("HOME")+"/XSEDETutorial" +# The number of jobs you want to run +NUMBER_JOBS = 4 + + +#------------------------------------------------------------------------------ +# +def main(): + try: + # this describes the parameters and requirements for our pilot job + pilot_description = pilot.PilotComputeDescription() + pilot_description.service_url = "ssh://%s" % HOSTNAME + pilot_description.number_of_processes = 1 + pilot_description.working_directory = WORKDIR + pilot_description.walltime = 10 + + # create a new pilot job + pilot_compute_service = pilot.PilotComputeService(COORD) + pilot_compute_service.create_pilot(pilot_description) + + # Compute Data Service + compute_data_service = pilot.ComputeDataService() + compute_data_service.add_pilot_compute_service(pilot_compute_service) + + # submit tasks to pilot job + tasks = list() + for i in range(NUMBER_JOBS): + task_desc = pilot.ComputeUnitDescription() + task_desc.executable = '/bin/echo' + task_desc.arguments = ['Hello, I am task number $TASK_NO', ] + task_desc.environment = ['TASK_NO=%s' % i] + task_desc.number_of_processes = 1 + task_desc.output = 'simple-ensemble-stdout.txt' + task_desc.error = 'simple-ensemble-stderr.txt' + + task = compute_data_service.submit_compute_unit(task_desc) + print "* Submitted task '%s' with id '%s' to %s" % (i, task.get_id(), HOSTNAME) + tasks.append(task) + + print "Waiting for tasks to finish..." + compute_data_service.wait() + + return(0) + + except Exception, ex: + print "AN ERROR OCCURED: %s" % ((str(ex))) + # print a stack trace in case of an exception - + # this can be helpful for debugging the problem + traceback.print_exc() + return(-1) + + finally: + # alway try to shut down pilots, otherwise jobs might end up + # lingering in the queue + print ("Terminating BigJob...") + compute_data_service.cancel() + pilot_compute_service.cancel() + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/examples/xsede2013/CDS-02_bigjob-simple-ensemble-datatransfer.py b/examples/xsede2013/CDS-02_bigjob-simple-ensemble-datatransfer.py index 70029710..543649bf 100644 --- a/examples/xsede2013/CDS-02_bigjob-simple-ensemble-datatransfer.py +++ b/examples/xsede2013/CDS-02_bigjob-simple-ensemble-datatransfer.py @@ -58,16 +58,16 @@ def main(): task_desc.error = 'simple-ensemble-stderr.txt' task = compute_data_service.submit_compute_unit(task_desc) - print "* Submitted task '%s' with id '%s' to %s" % (i, task.get_id(), HOSTNAME) + print("* Submitted task '%s' with id '%s' to %s" % (i, task.get_id(), HOSTNAME)) tasks.append(task) - print "Waiting for tasks to finish..." + print("Waiting for tasks to finish...") compute_data_service.wait() # all compute units have finished. now we can use saga-python # to transfer back the output files... 
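The commented-out retrieval block below this comment mirrors what the earlier examples in this patch do explicitly: open an sftp directory on the execution host with SAGA-Python and copy each task's stdout file back into the local working directory. A condensed sketch of that loop, reusing only calls that appear elsewhere in these examples (saga.filesystem.Directory and Directory.copy); HOSTNAME and a task list exposing get_id() and get_local_working_directory() are assumed to be defined as above:

    import os
    import saga

    def fetch_outputs(tasks, hostname, remote_name="simple-ensemble-stdout.txt"):
        """Copy each task's stdout file from the execution host into the local cwd."""
        d = saga.filesystem.Directory("sftp://%s/" % hostname)
        for task in tasks:
            local_name = "stdout-%s.txt" % task.get_id()
            d.copy("%s/%s" % (task.get_local_working_directory(), remote_name),
                   "file://localhost/%s/%s" % (os.getcwd(), local_name))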
for task in tasks: - print task.get_local_working_directory() + print(task.get_local_working_directory()) # d = saga.filesystem.Directory("sftp://%s/%s" % (HOSTNAME, task.get_local_working_directory())) # local_filename = "stdout-%s.txt" % (task.get_id()) # d.copy("simple-ensemble-stdout.txt", "file://localhost/%s/%s" % (os.getcwd(), local_filename)) @@ -75,8 +75,8 @@ def main(): return(0) - except Exception, ex: - print "AN ERROR OCCURED: %s" % ((str(ex))) + except Exception as ex: + print("AN ERROR OCCURED: %s" % ((str(ex)))) # print a stack trace in case of an exception - # this can be helpful for debugging the problem traceback.print_exc() diff --git a/examples/xsede2013/CDS-02_bigjob-simple-ensemble-datatransfer.py.bak b/examples/xsede2013/CDS-02_bigjob-simple-ensemble-datatransfer.py.bak new file mode 100644 index 00000000..70029710 --- /dev/null +++ b/examples/xsede2013/CDS-02_bigjob-simple-ensemble-datatransfer.py.bak @@ -0,0 +1,94 @@ +import os +import sys +import saga # <=== !! +import pilot +import traceback + +""" This tutorial example extends and improves the first example + (01_bigjob-simple-ensemble.py) by adding file transfer: once the tasks have + finished executing, we use SAGA-Python to transfer the individual output + files back to the local machine. +""" + +# Redis password is read from the environment. The example can be run like this: +# REDIS_PASSWORD=ILikeBigJob_wITH-REdIS python examples/example_styleguide.py +# Alternatively, for tutorials, etc. REDIS_PASSWORD can be defined in /etc/profile + +#------------------------------------------------------------------------------ +# +REDIS_PWD = os.environ.get('REDIS_PASSWORD') +# The coordination server +COORD = "redis://%s@gw68.quarry.iu.teragrid.org:6379" % REDIS_PWD +# The host to run BigJob on +HOSTNAME = "localhost" +# The working directory on the remote cluster / machine +WORKDIR = os.getenv("HOME")+"/XSEDETutorial" +# The number of jobs you want to run +NUMBER_JOBS = 4 + + +#------------------------------------------------------------------------------ +# +def main(): + try: + # this describes the parameters and requirements for our pilot job + pilot_description = pilot.PilotComputeDescription() + pilot_description.service_url = "ssh://%s" % HOSTNAME + pilot_description.number_of_processes = 1 + pilot_description.working_directory = WORKDIR + pilot_description.walltime = 10 + + # create a new pilot job + pilot_compute_service = pilot.PilotComputeService(COORD) + pilot_compute_service.create_pilot(pilot_description) + + # Compute Data Service + compute_data_service = pilot.ComputeDataService() + compute_data_service.add_pilot_compute_service(pilot_compute_service) + + # submit tasks to pilot job + tasks = list() + for i in range(NUMBER_JOBS): + task_desc = pilot.ComputeUnitDescription() + task_desc.executable = '/bin/echo' + task_desc.arguments = ['Hello, I am an $TASK_SET task with number $TASK_NO', ] + task_desc.environment = {'TASK_SET': 'A', 'TASK_NO': i} + task_desc.number_of_processes = 1 + task_desc.output = 'simple-ensemble-stdout.txt' + task_desc.error = 'simple-ensemble-stderr.txt' + + task = compute_data_service.submit_compute_unit(task_desc) + print "* Submitted task '%s' with id '%s' to %s" % (i, task.get_id(), HOSTNAME) + tasks.append(task) + + print "Waiting for tasks to finish..." + compute_data_service.wait() + + # all compute units have finished. now we can use saga-python + # to transfer back the output files... 
+ for task in tasks: + print task.get_local_working_directory() + # d = saga.filesystem.Directory("sftp://%s/%s" % (HOSTNAME, task.get_local_working_directory())) + # local_filename = "stdout-%s.txt" % (task.get_id()) + # d.copy("simple-ensemble-stdout.txt", "file://localhost/%s/%s" % (os.getcwd(), local_filename)) + # print "* Output for '%s' can be found locally in: './%s'" % (task.get_id(), local_filename) + + return(0) + + except Exception, ex: + print "AN ERROR OCCURED: %s" % ((str(ex))) + # print a stack trace in case of an exception - + # this can be helpful for debugging the problem + traceback.print_exc() + return(-1) + + finally: + # alway try to shut down pilots, otherwise jobs might end up + # lingering in the queue + print ("Terminating BigJob...") + compute_data_service.cancel() + pilot_compute_service.cancel() + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/examples/xsede2013/mandelbrot.py b/examples/xsede2013/mandelbrot.py index 2cc0d515..25986656 100644 --- a/examples/xsede2013/mandelbrot.py +++ b/examples/xsede2013/mandelbrot.py @@ -77,7 +77,7 @@ def makemandel(mandelx, mandely, xbeg, xend, ybeg, yend, filename=None): args = sys.argv[1:] if len(args) < 6: - print "Usage: python %s imgX imgY xBeg xEnd yBeg yEnd filename" % __file__ + print("Usage: python %s imgX imgY xBeg xEnd yBeg yEnd filename" % __file__) sys.exit(-1) imgX = int(sys.argv[1]) diff --git a/examples/xsede2013/mandelbrot.py.bak b/examples/xsede2013/mandelbrot.py.bak new file mode 100644 index 00000000..2cc0d515 --- /dev/null +++ b/examples/xsede2013/mandelbrot.py.bak @@ -0,0 +1,95 @@ +#!/usr/bin/env python + +# vim: tabstop=8 expandtab shiftwidth=4 softtabstop=4 + +""" A Simple Mandelbrot Fractal Generator. + + We use this example to explore the distributed capabilites of + the SAGA Job and Filesystem APIs in Bliss. The mandelbrot module + calculates a full or partial (tile) mandelbrot set fractal and + writes it to a PNG image file. + + It requires the Python Image Library (PIL) which can be easily + installed with 'easy_install PIL'. + + The mandelbrot module can be called either as a function:: + + from mandelbrot import makemandel + makemandel( imgX, imgY, xBeg, xEnd, yBeg, yEnd, filename) + + or alternatively on the command line:: + + python mandelbrot.py imgX imgY xBeg xEnd yBeg yEnd filename + + The parameters are as follows: + + imgX, imgY: the dimensions of the mandelbrot image, e.g. 
1024, 1024 + xBeg, xEnd: the x-axis portion of the (sub-)image to calculate + yBeg, yEnd: the y-axis portion of the (sub-)image to calculate + filename: the output filename (defaults to mandel_x_%s_%s_y%s_%s.png) +""" + +__author__ = "Ole Christian Weidner" +__copyright__ = "Copyright 2012, Ole Christian Weidner" +__license__ = "MIT" + +import sys, Image + +################################################################################ +## +def makemandel(mandelx, mandely, xbeg, xend, ybeg, yend, filename=None): + + # drawing area (xa < xb and ya < yb) + xa = -2.0 + xb = 1.0 + ya = -1.5 + yb = 1.5 + + # maximum iterations + maxIt = 128 + + # the output image + image = Image.new("RGB", (xend-xbeg, yend-ybeg)) + + for y in range(ybeg, yend): + cy = y * (yb - ya) / (mandely - 1) + ya + for x in range(xbeg, xend): + cx = x * (xb - xa) / (mandelx - 1) + xa + c = complex(cx, cy) + z = 0 + for i in range(maxIt): + if abs(z) > 2.0: break + z = z * z + c + r = i % 4 * 16 + g = i % 6 * 16 + b = i % 16 * 16 + image.putpixel((x-xbeg, y-ybeg), b * 65536 + g * 256 + r) + + if filename is not None: + image.save(filename, "GIF") + else: + image.save("mandel_x_%s_%s_y%s_%s.gif" % (xbeg, xend, ybeg, yend), "GIF") + return image + +################################################################################ +## +if __name__ == "__main__": + + args = sys.argv[1:] + if len(args) < 6: + print "Usage: python %s imgX imgY xBeg xEnd yBeg yEnd filename" % __file__ + sys.exit(-1) + + imgX = int(sys.argv[1]) + imgY = int(sys.argv[2]) + xBeg = int(sys.argv[3]) + xEnd = int(sys.argv[4]) + yBeg = int(sys.argv[5]) + yEnd = int(sys.argv[6]) + + filename = None + if len(args) == 7: + filename = str(sys.argv[7]) + + makemandel(imgX, imgY, xBeg, xEnd, yBeg, yEnd, filename) + sys.exit(0) diff --git a/ez_setup.py b/ez_setup.py index ba03f0a7..4d4f335e 100644 --- a/ez_setup.py +++ b/ez_setup.py @@ -228,7 +228,7 @@ def download_file_insecure(url, target): try: from urllib.request import urlopen except ImportError: - from urllib2 import urlopen + from urllib.request import urlopen src = dst = None try: src = urlopen(url) diff --git a/ez_setup.py.bak b/ez_setup.py.bak new file mode 100644 index 00000000..ba03f0a7 --- /dev/null +++ b/ez_setup.py.bak @@ -0,0 +1,361 @@ +#!/usr/bin/env python +"""Bootstrap setuptools installation + +To use setuptools in your package's setup.py, include this +file in the same directory and add this to the top of your setup.py:: + + from ez_setup import use_setuptools + use_setuptools() + +To require a specific version of setuptools, set a download +mirror, or use an alternate download directory, simply supply +the appropriate options to ``use_setuptools()``. + +This file can also be run as a script to install or upgrade setuptools. 
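A side note on the ez_setup.py hunk above: after the change, both the try branch and the ImportError fallback import urllib.request.urlopen, so the except clause no longer does anything. If the file is meant to be Python 3 only, the guarded import can collapse to a single line; if Python 2 compatibility is still wanted, the usual spelling keeps urllib2 as the fallback. Both variants, for illustration only:

    # Python 3 only: no fallback needed
    from urllib.request import urlopen

    # Dual Python 2/3 support: try the Python 3 name first, fall back to urllib2
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib2 import urlopen          # Python 2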
+""" +import os +import shutil +import sys +import tempfile +import tarfile +import optparse +import subprocess +import platform +import textwrap + +from distutils import log + +try: + from site import USER_SITE +except ImportError: + USER_SITE = None + +DEFAULT_VERSION = "2.1" +DEFAULT_URL = "https://pypi.python.org/packages/source/s/setuptools/" + +def _python_cmd(*args): + args = (sys.executable,) + args + return subprocess.call(args) == 0 + +def _install(tarball, install_args=()): + # extracting the tarball + tmpdir = tempfile.mkdtemp() + log.warn('Extracting in %s', tmpdir) + old_wd = os.getcwd() + try: + os.chdir(tmpdir) + tar = tarfile.open(tarball) + _extractall(tar) + tar.close() + + # going in the directory + subdir = os.path.join(tmpdir, os.listdir(tmpdir)[0]) + os.chdir(subdir) + log.warn('Now working in %s', subdir) + + # installing + log.warn('Installing Setuptools') + if not _python_cmd('setup.py', 'install', *install_args): + log.warn('Something went wrong during the installation.') + log.warn('See the error message above.') + # exitcode will be 2 + return 2 + finally: + os.chdir(old_wd) + shutil.rmtree(tmpdir) + + +def _build_egg(egg, tarball, to_dir): + # extracting the tarball + tmpdir = tempfile.mkdtemp() + log.warn('Extracting in %s', tmpdir) + old_wd = os.getcwd() + try: + os.chdir(tmpdir) + tar = tarfile.open(tarball) + _extractall(tar) + tar.close() + + # going in the directory + subdir = os.path.join(tmpdir, os.listdir(tmpdir)[0]) + os.chdir(subdir) + log.warn('Now working in %s', subdir) + + # building an egg + log.warn('Building a Setuptools egg in %s', to_dir) + _python_cmd('setup.py', '-q', 'bdist_egg', '--dist-dir', to_dir) + + finally: + os.chdir(old_wd) + shutil.rmtree(tmpdir) + # returning the result + log.warn(egg) + if not os.path.exists(egg): + raise IOError('Could not build the egg.') + + +def _do_download(version, download_base, to_dir, download_delay): + egg = os.path.join(to_dir, 'setuptools-%s-py%d.%d.egg' + % (version, sys.version_info[0], sys.version_info[1])) + if not os.path.exists(egg): + tarball = download_setuptools(version, download_base, + to_dir, download_delay) + _build_egg(egg, tarball, to_dir) + sys.path.insert(0, egg) + + # Remove previously-imported pkg_resources if present (see + # https://bitbucket.org/pypa/setuptools/pull-request/7/ for details). + if 'pkg_resources' in sys.modules: + del sys.modules['pkg_resources'] + + import setuptools + setuptools.bootstrap_install_from = egg + + +def use_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL, + to_dir=os.curdir, download_delay=15): + to_dir = os.path.abspath(to_dir) + rep_modules = 'pkg_resources', 'setuptools' + imported = set(sys.modules).intersection(rep_modules) + try: + import pkg_resources + except ImportError: + return _do_download(version, download_base, to_dir, download_delay) + try: + pkg_resources.require("setuptools>=" + version) + return + except pkg_resources.DistributionNotFound: + return _do_download(version, download_base, to_dir, download_delay) + except pkg_resources.VersionConflict as VC_err: + if imported: + msg = textwrap.dedent(""" + The required version of setuptools (>={version}) is not available, + and can't be installed while this script is running. Please + install a more recent version first, using + 'easy_install -U setuptools'. 
+ + (Currently using {VC_err.args[0]!r}) + """).format(VC_err=VC_err, version=version) + sys.stderr.write(msg) + sys.exit(2) + + # otherwise, reload ok + del pkg_resources, sys.modules['pkg_resources'] + return _do_download(version, download_base, to_dir, download_delay) + +def _clean_check(cmd, target): + """ + Run the command to download target. If the command fails, clean up before + re-raising the error. + """ + try: + subprocess.check_call(cmd) + except subprocess.CalledProcessError: + if os.access(target, os.F_OK): + os.unlink(target) + raise + +def download_file_powershell(url, target): + """ + Download the file at url to target using Powershell (which will validate + trust). Raise an exception if the command cannot complete. + """ + target = os.path.abspath(target) + cmd = [ + 'powershell', + '-Command', + "(new-object System.Net.WebClient).DownloadFile(%(url)r, %(target)r)" % vars(), + ] + _clean_check(cmd, target) + +def has_powershell(): + if platform.system() != 'Windows': + return False + cmd = ['powershell', '-Command', 'echo test'] + devnull = open(os.path.devnull, 'wb') + try: + try: + subprocess.check_call(cmd, stdout=devnull, stderr=devnull) + except: + return False + finally: + devnull.close() + return True + +download_file_powershell.viable = has_powershell + +def download_file_curl(url, target): + cmd = ['curl', url, '--silent', '--output', target] + _clean_check(cmd, target) + +def has_curl(): + cmd = ['curl', '--version'] + devnull = open(os.path.devnull, 'wb') + try: + try: + subprocess.check_call(cmd, stdout=devnull, stderr=devnull) + except: + return False + finally: + devnull.close() + return True + +download_file_curl.viable = has_curl + +def download_file_wget(url, target): + cmd = ['wget', url, '--quiet', '--output-document', target] + _clean_check(cmd, target) + +def has_wget(): + cmd = ['wget', '--version'] + devnull = open(os.path.devnull, 'wb') + try: + try: + subprocess.check_call(cmd, stdout=devnull, stderr=devnull) + except: + return False + finally: + devnull.close() + return True + +download_file_wget.viable = has_wget + +def download_file_insecure(url, target): + """ + Use Python to download the file, even though it cannot authenticate the + connection. + """ + try: + from urllib.request import urlopen + except ImportError: + from urllib2 import urlopen + src = dst = None + try: + src = urlopen(url) + # Read/write all in one block, so we don't create a corrupt file + # if the download is interrupted. + data = src.read() + dst = open(target, "wb") + dst.write(data) + finally: + if src: + src.close() + if dst: + dst.close() + +download_file_insecure.viable = lambda: True + +def get_best_downloader(): + downloaders = [ + download_file_powershell, + download_file_curl, + download_file_wget, + download_file_insecure, + ] + + for dl in downloaders: + if dl.viable(): + return dl + +def download_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL, + to_dir=os.curdir, delay=15, + downloader_factory=get_best_downloader): + """Download setuptools from a specified location and return its filename + + `version` should be a valid setuptools version number that is available + as an egg for download under the `download_base` URL (which should end + with a '/'). `to_dir` is the directory where the egg will be downloaded. + `delay` is the number of seconds to pause before an actual download + attempt. + + ``downloader_factory`` should be a function taking no arguments and + returning a function for downloading a URL to a target. 
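The downloader machinery reproduced above uses a simple strategy-selection pattern: each download function carries a viable attribute (a zero-argument callable) and get_best_downloader returns the first strategy whose check passes. A stripped-down illustration of the same idea with hypothetical strategies; only the selection logic is the point here, the download bodies are intentionally left empty:

    import shutil

    def download_with_curl(url, target):
        """Hypothetical strategy: shell out to curl (body omitted in this sketch)."""

    download_with_curl.viable = lambda: shutil.which("curl") is not None

    def download_insecure(url, target):
        """Hypothetical last-resort strategy that is always available."""

    download_insecure.viable = lambda: True

    def pick_downloader(strategies):
        """Return the first strategy whose viability check succeeds."""
        for strategy in strategies:
            if strategy.viable():
                return strategy

    downloader = pick_downloader([download_with_curl, download_insecure])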
+ """ + # making sure we use the absolute path + to_dir = os.path.abspath(to_dir) + tgz_name = "setuptools-%s.tar.gz" % version + url = download_base + tgz_name + saveto = os.path.join(to_dir, tgz_name) + if not os.path.exists(saveto): # Avoid repeated downloads + log.warn("Downloading %s", url) + downloader = downloader_factory() + downloader(url, saveto) + return os.path.realpath(saveto) + + +def _extractall(self, path=".", members=None): + """Extract all members from the archive to the current working + directory and set owner, modification time and permissions on + directories afterwards. `path' specifies a different directory + to extract to. `members' is optional and must be a subset of the + list returned by getmembers(). + """ + import copy + import operator + from tarfile import ExtractError + directories = [] + + if members is None: + members = self + + for tarinfo in members: + if tarinfo.isdir(): + # Extract directories with a safe mode. + directories.append(tarinfo) + tarinfo = copy.copy(tarinfo) + tarinfo.mode = 448 # decimal for oct 0700 + self.extract(tarinfo, path) + + # Reverse sort directories. + directories.sort(key=operator.attrgetter('name'), reverse=True) + + # Set correct owner, mtime and filemode on directories. + for tarinfo in directories: + dirpath = os.path.join(path, tarinfo.name) + try: + self.chown(tarinfo, dirpath) + self.utime(tarinfo, dirpath) + self.chmod(tarinfo, dirpath) + except ExtractError as e: + if self.errorlevel > 1: + raise + else: + self._dbg(1, "tarfile: %s" % e) + + +def _build_install_args(options): + """ + Build the arguments to 'python setup.py install' on the setuptools package + """ + return ['--user'] if options.user_install else [] + +def _parse_args(): + """ + Parse the command line for options + """ + parser = optparse.OptionParser() + parser.add_option( + '--user', dest='user_install', action='store_true', default=False, + help='install in user site package (requires Python 2.6 or later)') + parser.add_option( + '--download-base', dest='download_base', metavar="URL", + default=DEFAULT_URL, + help='alternative URL from where to download the setuptools package') + parser.add_option( + '--insecure', dest='downloader_factory', action='store_const', + const=lambda: download_file_insecure, default=get_best_downloader, + help='Use internal, non-validating downloader' + ) + options, args = parser.parse_args() + # positional arguments are ignored + return options + +def main(version=DEFAULT_VERSION): + """Install or upgrade setuptools and EasyInstall""" + options = _parse_args() + tarball = download_setuptools(download_base=options.download_base, + downloader_factory=options.downloader_factory) + return _install(tarball, _build_install_args(options)) + +if __name__ == '__main__': + sys.exit(main()) diff --git a/pilot/api/api.py b/pilot/api/api.py index 28457a07..82e5b7c4 100644 --- a/pilot/api/api.py +++ b/pilot/api/api.py @@ -1,5 +1,5 @@ -import compute.api -import data.api +from . import compute.api +from . 
import data.api class PilotError(Exception): def __init__(self, value): diff --git a/pilot/api/api.py.bak b/pilot/api/api.py.bak new file mode 100644 index 00000000..28457a07 --- /dev/null +++ b/pilot/api/api.py.bak @@ -0,0 +1,29 @@ +import compute.api +import data.api + +class PilotError(Exception): + def __init__(self, value): + self.value = value + + def __str__(self): + return repr(self.value) + + +class ComputeDataService(compute.api.ComputeUnitService, data.api.PilotDataService): + """ B{ComputeDataService (CDS).} + + The ComputeDataService is the application's interface to submit + ComputeUnits and PilotData/DataUnit to the Pilot-Manager + in the P* Model. + """ + + + def __init__(self, wds_id=None): + """ Create a Compute Data Service object. + + Keyword arguments: + wds_id -- Reconnect to an existing WDS (optional). + """ + raise NotImplementedError("Abstract super class, please use ComputeDataService implementation class in pilot namespace") + + diff --git a/pilot/coordination/advert_adaptor.py b/pilot/coordination/advert_adaptor.py index ee808c1d..b18d397a 100644 --- a/pilot/coordination/advert_adaptor.py +++ b/pilot/coordination/advert_adaptor.py @@ -85,7 +85,7 @@ def add_pd(cls, pds_url, pd): @classmethod def update_pd(cls, pd): if len(pd.data_units) > 0: - du_urls = [i.url for i in pd.data_units.values()] + du_urls = [i.url for i in list(pd.data_units.values())] cls.__store_entry(cls.__remove_dbtype(pd.url)+"/data-units", du_urls) cls.__store_entry(cls.__remove_dbtype(pd.url)+"/pilot-data", pd.to_dict()) @@ -147,10 +147,10 @@ def update_cds(cls, cds_url, cds): cls.__store_entry(cls.__remove_dbtype(cds_url)+"/cds/", pjs_urls) # currently managed PDs and WUs - pd_urls = [i.url for i in cds.data_units.values()] + pd_urls = [i.url for i in list(cds.data_units.values())] cls.__store_entry(cls.__remove_dbtype(cds_url)+"/du/", pd_urls) - wu_urls = [i.url for i in cds.compute_units.values()] + wu_urls = [i.url for i in list(cds.compute_units.values())] cls.__store_entry(cls.__remove_dbtype(cds_url)+"/cu/", wu_urls) diff --git a/pilot/coordination/advert_adaptor.py.bak b/pilot/coordination/advert_adaptor.py.bak new file mode 100644 index 00000000..ee808c1d --- /dev/null +++ b/pilot/coordination/advert_adaptor.py.bak @@ -0,0 +1,287 @@ +import logging +import saga +import json +import pdb + +from pilot import * +from bigjob import logger + +class AdvertCoordinationAdaptor: + """ + BigData persists its data in a central data space, e.g. the Advert service + to facilitate distributed coordination: + + advert://advert.cct.lsu.edu/pilot/3d0d5960-296d-11e1-8896-00264a13ca4c/data/ => namespace for pilot data + + advert://advert.cct.lsu.edu/pilot/3d0d5960-296d-11e1-8896-00264a13ca4c/data/pds => pilot data service + advert://advert.cct.lsu.edu/pilot/3d0d5960-296d-11e1-8896-00264a13ca4c/data/pds/pilot-data-description => pilot data description + ... + + + advert://advert.cct.lsu.edu/pilot/3d0d5960-296d-11e1-8896-00264a13ca4c/data/pds/ => pilot store service + advert://advert.cct.lsu.edu/pilot/3d0d5960-296d-11e1-8896-00264a13ca4c/data/pds/pilot-data-description => pilot data description + + This class is stateless - the application's base_url needs to be passed into every method. 
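One hunk in this section deserves a closer look: pilot/api/api.py is changed from 'import compute.api' / 'import data.api' to 'from . import compute.api' / 'from . import data.api'. Dotted names are not allowed after 'import' in the 'from ... import' form, so those two lines are a SyntaxError on any Python version. One valid Python 3 spelling binds the submodules under explicit names; this is only a possible fix, it assumes compute/ and data/ are subpackages next to api.py (which the original implicit-relative imports suggest) and that the class definition is updated to use the new names:

    from .compute import api as compute_api
    from .data import api as data_api

    class ComputeDataService(compute_api.ComputeUnitService, data_api.PilotDataService):
        ...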
+ """ + BASE_URL="advert://localhost/" + BASE_URL_QUERY_STRING="?dbtype=sqlite3" + + PILOT_PATH="pilot" + PILOT_DATA_PATH=PILOT_PATH + PILOT_DATA_SERVICE_PATH=PILOT_DATA_PATH+"/pds" + DATA_UNIT_SERVICE_PATH=PILOT_DATA_PATH+"/dus" + COMPUTE_DATA_SERVICE_PATH = PILOT_DATA_PATH + "/cds" + + + ########################################################################### + # Construct a base url for an application + + @classmethod + def get_base_url(cls, application_id): + surl = saga.url(cls.BASE_URL) + base_url = surl.scheme + "://" + surl.host + "/" + application_id + "/" + logging.debug(base_url) + return base_url + + ########################################################################### + # Pilot Store Service related methods + + @classmethod + def add_pds(cls, application_url, pds): + pds_url_no_dbtype = cls.get_pds_url(application_url, pds.id) + pds_url = cls.__get_url(pds_url_no_dbtype) + logger.debug("Create PDS directory at %s"%pds_url) + saga.advert.directory(pds_url, saga.advert.Create | + saga.advert.CreateParents | + saga.advert.ReadWrite) + return pds_url_no_dbtype + + + @classmethod + def delete_pds(cls, pds_url): + pds_url = cls.__get_url(pds_url) + pds_dir = saga.advert.directory(saga.url(pds_url), + saga.advert.Create | + saga.advert.CreateParents | + saga.advert.ReadWrite) + pds_dir.remove(pds_url, saga.name_space.Recursive) + + ########################################################################### + # Pilot Data related methods + + @classmethod + def add_pd(cls, pds_url, pd): + pds_url = cls.__remove_dbtype(pds_url) + pd_url =pds_url+"/" + pd.id + pd_description_url = cls.__get_url(pd_url + "/description") + logger.debug("PDS URL: %s, PD Description URL: %s"%(pds_url, pd_description_url)) + # directory is recursively created + pd_desc_entry = saga.advert.entry(saga.url(pd_description_url), + saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite) + logger.debug("initialized advert entry for pds: " + pd_description_url) + pd_desc_entry.store_string(json.dumps(pd.data_unit_description)) + return pd_url + + @classmethod + def update_pd(cls, pd): + if len(pd.data_units) > 0: + du_urls = [i.url for i in pd.data_units.values()] + cls.__store_entry(cls.__remove_dbtype(pd.url)+"/data-units", du_urls) + cls.__store_entry(cls.__remove_dbtype(pd.url)+"/pilot-data", pd.to_dict()) + + + @classmethod + def get_pd(cls, pds_url): + logger.debug("GET PD: " + pds_url) + pd_dict={} + #pd_dict["pilot_data" ]= cls.__retrieve_entry(cls.__remove_dbtype(pds_url)+"/pilot-data") + pd_dict["pilot_data"] = cls.__retrieve_entry(cls.__remove_dbtype(pds_url)+"/pilot-data") + return pd_dict + + + @classmethod + def list_pd(cls, pds_url): + """ return a list of urls to pd managed by the PDS """ + pds_url = cls.__get_url(pds_url) + logger.debug("List PD at %s"%pds_url) + pds_dir = saga.advert.directory(pds_url, saga.advert.Create | + saga.advert.CreateParents | + saga.advert.ReadWrite) + + pd_list = pds_dir.list() + pd_full_urls = [] + for i in pd_list: + pd_full_urls.append(pds_url + "/" + i) + return pd_full_urls + + @classmethod + def delete_pd(cls, pds_url): + pds_url = cls.__get_url(pds_url) + pd_dir = saga.advert.directory(saga.url(pds_url), + saga.advert.Create | + saga.advert.CreateParents | + saga.advert.ReadWrite) + pd_dir.remove(pds_url, saga.name_space.Recursive) + + + ########################################################################### + # Compute Data Service related methods + @classmethod + def add_cds(cls, application_url, cds): + cds_url_no_dbtype = 
cls.get_cds_url(application_url, cds.id) + cds_url = cls.__get_url(cds_url_no_dbtype) + logger.debug("Create CDS directory at %s"%cds_url) + saga.advert.directory(cds_url, saga.advert.Create | + saga.advert.CreateParents | + saga.advert.ReadWrite) + return cds_url_no_dbtype + + @classmethod + def update_cds(cls, cds_url, cds): + + # Storage and Compute Resources + pds_urls = [cls.__remove_dbtype(i.url) for i in cds.pilot_data_services] + cls.__store_entry(cls.__remove_dbtype(cds_url)+"/pds/", pds_urls) + + pjs_urls = [i.url for i in cds.pilot_job_services] + cls.__store_entry(cls.__remove_dbtype(cds_url)+"/cds/", pjs_urls) + + # currently managed PDs and WUs + pd_urls = [i.url for i in cds.data_units.values()] + cls.__store_entry(cls.__remove_dbtype(cds_url)+"/du/", pd_urls) + + wu_urls = [i.url for i in cds.compute_units.values()] + cls.__store_entry(cls.__remove_dbtype(cds_url)+"/cu/", wu_urls) + + + @classmethod + def delete_cds(cls, cds_url): + cds_url = cls.__get_url(cls.__remove_dbtype(cds_url)) + cds_dir = saga.advert.directory(saga.url(cds_url), + saga.advert.Create | + saga.advert.CreateParents | + saga.advert.ReadWrite) + # cds_dir.remove(cds_url, saga.name_space.Recursive) + + + + + ########################################################################### + # Data Unit related methods + @classmethod + def add_du(cls, dus_url, du): + du_url = cls.__remove_dbtype(dus_url) + "/" + du.id + du_url = cls.__get_url(du_url) + # directory is recursively created + #saga.advert.directory(saga.url(du_url), + # saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite) + #logger.debug("initialized advert entry for dus: " + du_url) + return du_url + + + @classmethod + def get_du(cls, du_url): + logger.debug("**** GET PD: " + du_url) + du_dict={} + du_dict["data_unit_description" ]= cls.__retrieve_entry(cls.__remove_dbtype(du_url)+"/description") + du_dict["state"] = cls.__retrieve_entry(cls.__remove_dbtype(du_url)+"/state") + du_dict["data_units"] = cls.__retrieve_entry(cls.__remove_dbtype(du_url)+"/data-units") + du_dict["pilot_data"] = cls.__retrieve_entry(cls.__remove_dbtype(du_url)+"/pilot-data") + logger.debug("Open pilot data at: " + du_url + " State: " + str(du_dict)) + return du_dict + + + @classmethod + def update_du(cls, du): + logger.debug("**** Update pilot data at: " + du.url) + cls.__store_entry(cls.__remove_dbtype(du.url)+"/description", du.data_unit_description) + cls.__store_entry(cls.__remove_dbtype(du.url)+"/state", du.state) + + du_urls = [i.url for i in du.pilot_data] + cls.__store_entry(cls.__remove_dbtype(du.url)+"/pilot-data", du_urls) + + du_dict_list = [i.to_dict() for i in du.data_unit_items] + cls.__store_entry(cls.__remove_dbtype(du.url)+"/data-units", du_dict_list) + + + @classmethod + def list_du(cls, dus_url): + """ return a list of urls to du managed by the PDS """ + dus_url = cls.__get_url(dus_url) + logger.debug("List PDS at %s"%dus_url) + dus_dir = saga.advert.directory(dus_url, saga.advert.Create | + saga.advert.CreateParents | + saga.advert.ReadWrite) + + du_list = dus_dir.list() + du_full_urls = [] + for i in du_list: + du_full_urls.append(dus_url + "/" + i) + return du_full_urls + + + @classmethod + def delete_du(cls, du_url): + du_url = cls.__get_url(du_url) + du_dir = saga.advert.directory(saga.url(du_url), + saga.advert.Create | + saga.advert.CreateParents | + saga.advert.ReadWrite) + du_dir.remove(du_url, saga.name_space.Recursive) + + + + ########################################################################### + # URL Tweaking + + 
@classmethod + def get_pds_url(cls, application_url, pds_id): + pds_url = application_url+AdvertCoordinationAdaptor.PILOT_DATA_SERVICE_PATH+"/"+pds_id + logger.debug("PDS URL: %s"%(pds_url)) + return pds_url + + @classmethod + def get_cds_url(cls, application_url, cds_id): + cds_url = application_url+AdvertCoordinationAdaptor.COMPUTE_DATA_SERVICE_PATH+"/"+cds_id + logger.debug("CDS URL: %s"%(cds_url)) + return cds_url + + ########################################################################### + # internal methods + + @classmethod + def __get_url(cls, url): + """ appends advert querystring for dbtype to url """ + url = url + AdvertCoordinationAdaptor.BASE_URL_QUERY_STRING + return url + + @classmethod + def __remove_dbtype(cls, url): + surl = saga.url(url) + surl.query = "" + return surl.get_string() + + @classmethod + def __store_entry(cls, entry_url, content): + entry_url = cls.__get_url(entry_url) + + # directory is recursively created + entry = saga.advert.entry(saga.url(entry_url), + saga.advert.Create | + saga.advert.CreateParents | saga.advert.ReadWrite) + entry.store_string(json.dumps(content)) + #logger.debug("Store Advert entry at: " + entry_url + # + " Content: " + str(json.dumps(content))) + + @classmethod + def __retrieve_entry(cls, entry_url): + entry_url = cls.__get_url(entry_url) + #logger.debug("Retrieve Advert entry at: " + entry_url) + # directory is recursively created + entry = saga.advert.entry(saga.url(entry_url), + saga.advert.Create | + saga.advert.CreateParents | saga.advert.ReadWrite) + content = json.loads(entry.retrieve_string()) + #logger.debug("Retrieve Advert entry at: " + entry_url + # + " Content: " + str(json.dumps(content))) + return content diff --git a/pilot/coordination/nocoord_adaptor.py b/pilot/coordination/nocoord_adaptor.py index b554cf5e..9241c134 100644 --- a/pilot/coordination/nocoord_adaptor.py +++ b/pilot/coordination/nocoord_adaptor.py @@ -72,7 +72,7 @@ def add_pd(cls, pds_url, pd): @classmethod def update_pd(cls, pd): if len(pd.data_units) > 0: - du_urls = [i.url for i in pd.data_units.values()] + du_urls = [i.url for i in list(pd.data_units.values())] #cls.__store_entry(cls.__remove_dbtype(pd.url)+"/data-units", du_urls) #cls.__store_entry(cls.__remove_dbtype(pd.url)+"/pilot-data", pd.to_dict()) @@ -134,10 +134,10 @@ def update_cds(cls, cds_url, cds): #cls.__store_entry(cls.__remove_dbtype(cds_url)+"/cds/", pjs_urls) # currently managed PDs and WUs - pd_urls = [i.url for i in cds.data_units.values()] + pd_urls = [i.url for i in list(cds.data_units.values())] #cls.__store_entry(cls.__remove_dbtype(cds_url)+"/du/", pd_urls) - wu_urls = [i.url for i in cds.compute_units.values()] + wu_urls = [i.url for i in list(cds.compute_units.values())] #cls.__store_entry(cls.__remove_dbtype(cds_url)+"/cu/", wu_urls) diff --git a/pilot/coordination/nocoord_adaptor.py.bak b/pilot/coordination/nocoord_adaptor.py.bak new file mode 100644 index 00000000..b554cf5e --- /dev/null +++ b/pilot/coordination/nocoord_adaptor.py.bak @@ -0,0 +1,273 @@ +import logging +import json +import pdb + +from pilot import * +from bigjob import logger +from saga import Url as SAGAUrl + +class NoCoordinationAdaptor: + """ + Dummy Adaptor - No distributed coordination done + """ + BASE_URL="nocoord://localhost/" + BASE_URL_QUERY_STRING="?dbtype=sqlite3" + + PILOT_PATH="pilot" + PILOT_DATA_PATH=PILOT_PATH + PILOT_DATA_SERVICE_PATH=PILOT_DATA_PATH+"/pds" + DATA_UNIT_SERVICE_PATH=PILOT_DATA_PATH+"/dus" + COMPUTE_DATA_SERVICE_PATH = PILOT_DATA_PATH + "/cds" + + + 
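The coordination-adaptor hunks in this section all apply the same mechanical fix: wrapping dict.values() in list(...). Under Python 3, values() returns a lazy view rather than a list. For a single pass inside a comprehension, as in these adaptors, the view is already iterable and the wrapper is simply the conservative 2to3-style idiom; it becomes necessary when the result is indexed, iterated more than once, or the dictionary is mutated during iteration. A short illustration:

    data_units = {"du-1": "redis://host/du-1", "du-2": "redis://host/du-2"}

    # Fine on Python 3: values() returns an iterable view, so a single pass needs no list()
    urls = [u for u in data_units.values()]

    # list(...) materializes the view, which matters once you index or reuse the result
    snapshot = list(data_units.values())
    first = snapshot[0]            # data_units.values()[0] would raise TypeError on Python 3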
########################################################################### + # Construct a base url for an application + + @classmethod + def get_base_url(cls, application_id): + surl = SAGAUrl(cls.BASE_URL) + base_url = surl.scheme + "://" + surl.host + "/" + application_id + "/" + logger.debug(base_url) + return base_url + + ########################################################################### + # Pilot Store Service related methods + + @classmethod + def add_pds(cls, application_url, pds): + pds_url_no_dbtype = cls.get_pds_url(application_url, pds.id) + pds_url = cls.__get_url(pds_url_no_dbtype) + logger.debug("Create PDS directory at %s"%pds_url) + #saga.advert.directory(pds_url, saga.advert.Create | + # saga.advert.CreateParents | + # saga.advert.ReadWrite) + return pds_url_no_dbtype + + + @classmethod + def delete_pds(cls, pds_url): + pds_url = cls.__get_url(pds_url) + #pds_dir = saga.advert.directory(saga.url(pds_url), + # saga.advert.Create | + # saga.advert.CreateParents | + # saga.advert.ReadWrite) + #pds_dir.remove(pds_url, saga.name_space.Recursive) + + ########################################################################### + # Pilot Data related methods + + @classmethod + def add_pd(cls, pds_url, pd): + pds_url = cls.__remove_dbtype(pds_url) + pd_url =pds_url+"/" + pd.id + pd_description_url = cls.__get_url(pd_url + "/description") + logger.debug("PDS URL: %s, PD Description URL: %s"%(pds_url, pd_description_url)) + # directory is recursively created + #pd_desc_entry = saga.advert.entry(saga.url(pd_description_url), + # saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite) + #logger.debug("initialized advert entry for pds: " + pd_description_url) + #pd_desc_entry.store_string(json.dumps(pd.data_unit_description)) + return pd_url + + @classmethod + def update_pd(cls, pd): + if len(pd.data_units) > 0: + du_urls = [i.url for i in pd.data_units.values()] + #cls.__store_entry(cls.__remove_dbtype(pd.url)+"/data-units", du_urls) + #cls.__store_entry(cls.__remove_dbtype(pd.url)+"/pilot-data", pd.to_dict()) + + + @classmethod + def get_pd(cls, pds_url): + logger.debug("GET PD: " + pds_url) + pd_dict={} + #pd_dict["pilot_data" ]= cls.__retrieve_entry(cls.__remove_dbtype(pds_url)+"/pilot-data") + #pd_dict["pilot_data"] = cls.__retrieve_entry(cls.__remove_dbtype(pds_url)+"/pilot-data") + #return pd_dict + + + @classmethod + def list_pd(cls, pds_url): + """ return a list of urls to pd managed by the PDS """ + pds_url = cls.__get_url(pds_url) + logger.debug("List PD at %s"%pds_url) + #pds_dir = saga.advert.directory(pds_url, saga.advert.Create | + # saga.advert.CreateParents | + # saga.advert.ReadWrite) + + #pd_list = pds_dir.list() + #pd_full_urls = [] + #for i in pd_list: + # pd_full_urls.append(pds_url + "/" + i) + #return pd_full_urls + + @classmethod + def delete_pd(cls, pds_url): + pds_url = cls.__get_url(pds_url) + #pd_dir = saga.advert.directory(saga.url(pds_url), + # saga.advert.Create | + # saga.advert.CreateParents | + # saga.advert.ReadWrite) + #pd_dir.remove(pds_url, saga.name_space.Recursive) + + + ########################################################################### + # Compute Data Service related methods + @classmethod + def add_cds(cls, application_url, cds): + cds_url_no_dbtype = cls.get_cds_url(application_url, cds.id) + cds_url = cls.__get_url(cds_url_no_dbtype) + logger.debug("Create CDS directory at %s"%cds_url) + #saga.advert.directory(cds_url, saga.advert.Create | + # saga.advert.CreateParents | + # saga.advert.ReadWrite) + return 
cds_url_no_dbtype + + @classmethod + def update_cds(cls, cds_url, cds): + + # Storage and Compute Resources + pds_urls = [cls.__remove_dbtype(i.url) for i in cds.pilot_data_services] + #cls.__store_entry(cls.__remove_dbtype(cds_url)+"/pds/", pds_urls) + + pjs_urls = [i.url for i in cds.pilot_job_services] + #cls.__store_entry(cls.__remove_dbtype(cds_url)+"/cds/", pjs_urls) + + # currently managed PDs and WUs + pd_urls = [i.url for i in cds.data_units.values()] + #cls.__store_entry(cls.__remove_dbtype(cds_url)+"/du/", pd_urls) + + wu_urls = [i.url for i in cds.compute_units.values()] + #cls.__store_entry(cls.__remove_dbtype(cds_url)+"/cu/", wu_urls) + + + @classmethod + def delete_cds(cls, cds_url): + cds_url = cls.__get_url(cls.__remove_dbtype(cds_url)) + #cds_dir = saga.advert.directory(saga.url(cds_url), + # saga.advert.Create | + # saga.advert.CreateParents | + # saga.advert.ReadWrite) + # cds_dir.remove(cds_url, saga.name_space.Recursive) + + + + + ########################################################################### + # Data Unit related methods + @classmethod + def add_du(cls, dus_url, du): + du_url = cls.__remove_dbtype(dus_url) + "/" + du.id + du_url = cls.__get_url(du_url) + # directory is recursively created + #saga.advert.directory(saga.url(du_url), + # saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite) + #logger.debug("initialized advert entry for dus: " + du_url) + return du_url + + + @classmethod + def get_du(cls, du_url): + logger.debug("**** GET PD: " + du_url) + du_dict={} + #du_dict["data_unit_description" ]= cls.__retrieve_entry(cls.__remove_dbtype(du_url)+"/description") + #du_dict["state"] = cls.__retrieve_entry(cls.__remove_dbtype(du_url)+"/state") + #du_dict["data_units"] = cls.__retrieve_entry(cls.__remove_dbtype(du_url)+"/data-units") + #du_dict["pilot_data"] = cls.__retrieve_entry(cls.__remove_dbtype(du_url)+"/pilot-data") + #logger.debug("Open pilot data at: " + du_url + " State: " + str(du_dict)) + return du_dict + + + @classmethod + def update_du(cls, du): + logger.debug("**** Update pilot data at: " + du.url) + #cls.__store_entry(cls.__remove_dbtype(du.url)+"/description", du.data_unit_description) + #cls.__store_entry(cls.__remove_dbtype(du.url)+"/state", du.state) + + du_urls = [i.url for i in du.pilot_data] + #cls.__store_entry(cls.__remove_dbtype(du.url)+"/pilot-data", du_urls) + + du_dict_list = [i.to_dict() for i in du.data_unit_items] + #cls.__store_entry(cls.__remove_dbtype(du.url)+"/data-units", du_dict_list) + + + @classmethod + def list_du(cls, dus_url): + """ return a list of urls to du managed by the PDS """ + dus_url = cls.__get_url(dus_url) + logger.debug("List PDS at %s"%dus_url) + #dus_dir = saga.advert.directory(dus_url, saga.advert.Create | + # saga.advert.CreateParents | + # saga.advert.ReadWrite) + + #du_list = dus_dir.list() + #du_full_urls = [] + #for i in du_list: + # du_full_urls.append(dus_url + "/" + i) + return du_full_urls + + + @classmethod + def delete_du(cls, du_url): + du_url = cls.__get_url(du_url) + #du_dir = saga.advert.directory(saga.url(du_url), + # saga.advert.Create | + # saga.advert.CreateParents | + # saga.advert.ReadWrite) + #du_dir.remove(du_url, saga.name_space.Recursive) + + + + ########################################################################### + # URL Tweaking + + @classmethod + def get_pds_url(cls, application_url, pds_id): + pds_url = application_url+NoCoordinationAdaptor.PILOT_DATA_SERVICE_PATH+"/"+pds_id + logger.debug("PDS URL: %s"%(pds_url)) + return pds_url + + 
@classmethod + def get_cds_url(cls, application_url, cds_id): + cds_url = application_url+NoCoordinationAdaptor.COMPUTE_DATA_SERVICE_PATH+"/"+cds_id + logger.debug("CDS URL: %s"%(cds_url)) + return cds_url + + ########################################################################### + # internal methods + + @classmethod + def __get_url(cls, url): + """ appends advert querystring for dbtype to url """ + url = url + NoCoordinationAdaptor.BASE_URL_QUERY_STRING + return url + + @classmethod + def __remove_dbtype(cls, url): + surl = SAGAUrl(url) + return str(surl) + + @classmethod + def __store_entry(cls, entry_url, content): + entry_url = cls.__get_url(entry_url) + + # directory is recursively created + #entry = saga.advert.entry(saga.url(entry_url), + # saga.advert.Create | + # saga.advert.CreateParents | saga.advert.ReadWrite) + #entry.store_string(json.dumps(content)) + #logger.debug("Store Advert entry at: " + entry_url + # + " Content: " + str(json.dumps(content))) + + @classmethod + def __retrieve_entry(cls, entry_url): + entry_url = cls.__get_url(entry_url) + #logger.debug("Retrieve Advert entry at: " + entry_url) + # directory is recursively created + #entry = saga.advert.entry(saga.url(entry_url), + # saga.advert.Create | + # saga.advert.CreateParents | saga.advert.ReadWrite) + #content = json.loads(entry.retrieve_string()) + #logger.debug("Retrieve Advert entry at: " + entry_url + # + " Content: " + str(json.dumps(content))) + return content diff --git a/pilot/coordination/redis_adaptor.py b/pilot/coordination/redis_adaptor.py index 2284a5d1..92453de0 100644 --- a/pilot/coordination/redis_adaptor.py +++ b/pilot/coordination/redis_adaptor.py @@ -129,10 +129,10 @@ def update_cds(cls, cds_url, cds): #cls.__store_entry(cls.__remove_dbtype(cds_url)+"/cds/", pjs_urls) # currently managed PDs and WUs - pd_urls = [i.url for i in cds.data_units.values()] + pd_urls = [i.url for i in list(cds.data_units.values())] #cls.__store_entry(cls.__remove_dbtype(cds_url)+"/du/", pd_urls) - wu_urls = [i.url for i in cds.compute_units.values()] + wu_urls = [i.url for i in list(cds.compute_units.values())] #cls.__store_entry(cls.__remove_dbtype(cds_url)+"/cu/", wu_urls) diff --git a/pilot/coordination/redis_adaptor.py.bak b/pilot/coordination/redis_adaptor.py.bak new file mode 100644 index 00000000..2284a5d1 --- /dev/null +++ b/pilot/coordination/redis_adaptor.py.bak @@ -0,0 +1,301 @@ +from pilot import * +from bigjob import logger +from redis.client import Lock + +try: + import json +except ImportError: + import simplejson as json + +from saga import Url as SAGAUrl + +class RedisCoordinationAdaptor: + """ + Dummy Adaptor - No distributed coordination done + """ + BASE_URL="redis://localhost/" + SEPARATOR=":" + + PILOT_PATH="pilot" + PILOT_DATA_PATH=PILOT_PATH + PILOT_DATA_SERVICE_PATH=PILOT_DATA_PATH + SEPARATOR + "pds" + DATA_UNIT_SERVICE_PATH=PILOT_DATA_PATH + SEPARATOR +"dus" + COMPUTE_DATA_SERVICE_PATH = PILOT_DATA_PATH + SEPARATOR + "cds" + + ########################################################################### + # Construct a base url for an application + + @classmethod + def configure_base_url(cls, base_url): + cls.BASE_URL=base_url + + @classmethod + def get_base_url(cls, application_id): + if cls.BASE_URL==None: + logger.error("Coordination URL not set. Exiting Pilot-Data.") + raise Exception("Coordination URL not set. 
Exiting Pilot-Data.") + surl = SAGAUrl(cls.BASE_URL) + base_url = surl.scheme + "://" + surl.host + "/" + application_id + logger.debug(base_url) + return base_url + + ########################################################################### + # Pilot Store Service related methods + + @classmethod + def add_pds(cls, application_url, pds): + pds_url_no_dbtype = cls.get_pds_url(application_url, pds.id) + pds_url = cls.__get_url(pds_url_no_dbtype) + logger.debug("Create PDS directory at %s"%pds_url) + return pds_url_no_dbtype + + + @classmethod + def delete_pds(cls, pds_url): + pds_url = cls.__get_url(pds_url) + + #pds_dir.remove(pds_url, saga.name_space.Recursive) + + ########################################################################### + # Pilot Data related methods + + @classmethod + def add_pd(cls, pds_url, pd): + pd_url =pds_url+ RedisCoordinationAdaptor.SEPARATOR + pd.id + return pd_url + + + @classmethod + def update_pd(cls, pd): + du_urls=None + if len(pd.data_unit_urls) > 0: + du_urls = pd.data_unit_urls + + pd_dict={ + "data_unit_urls": du_urls, + "pilot_data": pd.to_dict(), + "pilot_data_description": pd.pilot_data_description, + "security_context": pd.security_context + } + + cls.__store_entry(pd.url+RedisCoordinationAdaptor.SEPARATOR + "info", pd_dict) + + + @classmethod + def get_pd(cls, pd_url): + logger.debug("GET PD: " + pd_url) + pd_dict=cls.__retrieve_entry(pd_url + RedisCoordinationAdaptor.SEPARATOR + "info") + return pd_dict + + + @classmethod + def list_pd(cls, pds_url): + """ return a list of urls to pd managed by the PDS """ + pds_url = cls.__get_url(pds_url) + logger.debug("List PD at %s"%pds_url) + + + @classmethod + def delete_pd(cls, pds_url): + pds_url = cls.__get_url(pds_url) + #pd_dir = saga.advert.directory(saga.url(pds_url), + # saga.advert.Create | + # saga.advert.CreateParents | + # saga.advert.ReadWrite) + #pd_dir.remove(pds_url, saga.name_space.Recursive) + + + ########################################################################### + # Compute Data Service related methods + @classmethod + def add_cds(cls, application_url, cds): + cds_url_no_dbtype = cls.get_cds_url(application_url, cds.id) + cds_url = cls.__get_url(cds_url_no_dbtype) + logger.debug("Create CDS directory at %s"%cds_url) + #saga.advert.directory(cds_url, saga.advert.Create | + # saga.advert.CreateParents | + # saga.advert.ReadWrite) + return cds_url_no_dbtype + + + @classmethod + def update_cds(cls, cds_url, cds): + + # Storage and Compute Resources + pds_urls = [i.url for i in cds.pilot_data_services] + #cls.__store_entry(cls.__remove_dbtype(cds_url)+"/pds/", pds_urls) + + pjs_urls = [i.url for i in cds.pilot_job_services] + #cls.__store_entry(cls.__remove_dbtype(cds_url)+"/cds/", pjs_urls) + + # currently managed PDs and WUs + pd_urls = [i.url for i in cds.data_units.values()] + #cls.__store_entry(cls.__remove_dbtype(cds_url)+"/du/", pd_urls) + + wu_urls = [i.url for i in cds.compute_units.values()] + #cls.__store_entry(cls.__remove_dbtype(cds_url)+"/cu/", wu_urls) + + + @classmethod + def delete_cds(cls, cds_url): + cds_url = cls.__get_url(cds_url) + #cds_dir = saga.advert.directory(saga.url(cds_url), + # saga.advert.Create | + # saga.advert.CreateParents | + # saga.advert.ReadWrite) + # cds_dir.remove(cds_url, saga.name_space.Recursive) + + + + + ########################################################################### + # Data Unit related methods + @classmethod + def add_du(cls, root_url, du): + du_url = root_url + RedisCoordinationAdaptor.SEPARATOR + du.id + du_url = 
cls.__get_url(du_url) + return du_url + + + @classmethod + def get_du(cls, du_url): + logger.debug("**** GET DU: " + str(du_url)) + du_dict=cls.__retrieve_entry(du_url+ RedisCoordinationAdaptor.SEPARATOR + "info") + logger.debug("Retrieved DU: " + du_url + " Content: " + str(du_dict)) + return du_dict + + + + @classmethod + def update_du_state(cls, du, state): + logger.debug("**** Update data unit STATE at: " + du.url + " to: " + str(state)) + cls.__store_entry_item(du.url + RedisCoordinationAdaptor.SEPARATOR + "info", "state", state) + + + @classmethod + def update_du(cls, du): + logger.debug("**** Update data unit FULL at: " + du.url) + du_dict_list = [i.to_dict() for i in du.data_unit_items] + pd_urls=[] + if du.pilot_data!=None: pd_urls = [i.url for i in du.pilot_data] + du_dict = { + "data_unit_description":du.data_unit_description, + "state": du.state, + "pilot_data": pd_urls, + "data_unit_items": du_dict_list + } + logger.debug("Update DU: " + str(du.url) + " to " + str(du_dict["state"])) + cls.__store_entry(du.url + RedisCoordinationAdaptor.SEPARATOR + "info", du_dict) + + + @classmethod + def list_du(cls, pd_url): + """ return a list of urls to du managed by the PDS """ + pd_url = cls.__get_url(pd_url) + logger.debug("List Data-Units of Pilot-Data at %s"%pd_url) + dus = cls.__list_keys(pd_url+":du-*") + return dus + + + @classmethod + def delete_du(cls, du_url): + du_url = cls.__get_url(du_url) + #du_dir = saga.advert.directory(saga.url(du_url), + # saga.advert.Create | + # saga.advert.CreateParents | + # saga.advert.ReadWrite) + #du_dir.remove(du_url, saga.name_space.Recursive) + + + + ########################################################################### + # URL Tweaking + @classmethod + def get_pds_url(cls, application_url, pds_id): + pds_url = application_url + RedisCoordinationAdaptor.SEPARATOR +pds_id + logger.debug("PDS URL: %s"%(pds_url)) + return pds_url + + + @classmethod + def get_cds_url(cls, application_url, cds_id): + cds_url = application_url + RedisCoordinationAdaptor.SEPARATOR +cds_id + logger.debug("CDS URL: %s"%(cds_url)) + return cds_url + + + ########################################################################### + # internal Redis-related methods + @classmethod + def __get_redis_api_client(cls): + import redis + ''' Initialize Redis API Client ''' + saga_url = SAGAUrl(RedisCoordinationAdaptor.BASE_URL) + username = saga_url.username + server = saga_url.host + server_port = saga_url.port + if username==None or username=="": + redis_client = redis.Redis(host=server, port=server_port, db=0) + else: + redis_client = redis.Redis(host=server, port=server_port, password=username, db=0) + + try: + redis_client.ping() + except: + logger.error("Please start Redis server!") + raise Exception("Please start Redis server!") + return redis_client + + + @classmethod + def __get_url(cls, url): + return url + + + @classmethod + def __list_keys(cls, search_url): + redis_client = cls.__get_redis_api_client() + keys = redis_client.keys(search_url) + keys_normalized = [i[:i.index(":info")] for i in keys] + return keys_normalized + + @classmethod + def __store_entry_item(cls, entry_url, item_key, item_value): + entry_url = cls.__get_url(entry_url) + redis_client = cls.__get_redis_api_client() + lock_name=entry_url+":lock" + logger.debug("Acquire Redis lock for update: " + lock_name) + lock = Lock(redis_client, lock_name, timeout=None, sleep=0.1) + acquired=lock.acquire(blocking=True) + logger.debug("Lock acquired: " + str(acquired)) + redis_client.hset(entry_url, item_key, 
item_value) + lock.release() + logger.debug("Stored Redis entry at: " + entry_url + + " Key: " + str(json.dumps(item_key)) + + " Value: " + str(json.dumps(item_value)) + ) + + + @classmethod + def __store_entry(cls, entry_url, content): + entry_url = cls.__get_url(entry_url) + redis_client = cls.__get_redis_api_client() + lock_name=entry_url+":lock" + logger.debug("Acquire Redis lock for update: " + lock_name) + lock = Lock(redis_client, lock_name, timeout=None, sleep=0.1) + acquired=lock.acquire(blocking=True) + logger.debug("Lock acquired: " + str(acquired)) + redis_client.hmset(entry_url, content) + lock.release() + logger.debug("Stored Redis entry at: " + entry_url + + " Content: " + str(json.dumps(content))) + + @classmethod + def __retrieve_entry(cls, entry_url): + entry_url = cls.__get_url(entry_url) + redis_client = cls.__get_redis_api_client() + content = redis_client.hgetall(entry_url) + + logger.debug("Retrieve Redis entry at: " + entry_url + + " Content: " + str(json.dumps(content))) + return content diff --git a/pilot/filemanagement/globusonline_adaptor.py b/pilot/filemanagement/globusonline_adaptor.py index f57dc5d6..889c80cc 100644 --- a/pilot/filemanagement/globusonline_adaptor.py +++ b/pilot/filemanagement/globusonline_adaptor.py @@ -1,7 +1,7 @@ ''' Globus Online based File Transfer ''' -import urlparse +import urllib.parse import pdb import glob import errno @@ -30,7 +30,7 @@ class GlobusOnlineFileAdaptor(object): def __init__(self, service_url): self.service_url = service_url - result = urlparse.urlparse(service_url) + result = urllib.parse.urlparse(service_url) self.host = result.netloc self.query = result.path self.ep = self.__get_ep(self.query) @@ -94,7 +94,7 @@ def create_du(self, du_id): def put_du(self, du): logging.debug("Copy DU using Globus Online") du_items = du.list() - for i in du_items.keys(): + for i in list(du_items.keys()): local_filename=du_items[i]["local"] remote_path = os.path.join(self.path, str(du.id), os.path.basename(local_filename)) logging.debug("Put file: %s to %s"%(local_filename, remote_path)) @@ -103,7 +103,7 @@ def put_du(self, du): if self.__is_remote_directory(local_filename): logging.warning("Path %s is a directory. 
Ignored."%local_filename) continue - result = urlparse.urlparse(local_filename) + result = urllib.parse.urlparse(local_filename) source_host = result.netloc source_path = result.path logger.debug(str((source_host, source_path, self.host, remote_path))) @@ -176,7 +176,7 @@ def transfer(self, source_url, target_url): def create_remote_directory(self, target_url): if not self.__is_remote_directory(target_url): - result = urlparse.urlparse(target_url) + result = urllib.parse.urlparse(target_url) target_query = result.path target_ep = self.__get_ep(target_query) target_path = self.__get_path(target_query) @@ -190,7 +190,7 @@ def create_remote_directory(self, target_url): def get_path(self, target_url): - result = urlparse.urlparse(target_url) + result = urllib.parse.urlparse(target_url) target_query = result.path target_path = self.__get_path(target_query) return target_path @@ -217,7 +217,7 @@ def __remove_directory(self, path): def __is_remote_directory(self, url): try: - result = urlparse.urlparse(url) + result = urllib.parse.urlparse(url) target_query = result.path target_ep = self.__get_ep(target_query) target_path = self.__get_path(target_query) @@ -234,12 +234,12 @@ def __third_party_transfer_host(self, source_url, target_url): Transfers from source URL to machine to target_url """ transfer_start = time.time() - result = urlparse.urlparse(source_url) + result = urllib.parse.urlparse(source_url) source_query = result.path source_ep = self.__get_ep(source_query) source_path = self.__get_path(source_query) - result = urlparse.urlparse(target_url) + result = urllib.parse.urlparse(target_url) target_query = result.path target_ep = self.__get_ep(target_query) target_path = self.__get_path(target_query) @@ -307,10 +307,10 @@ def __wait_for_task(self, task_id, timeout=None): timeout -= 1 if status != "ACTIVE": - print "Task %s complete!" % task_id + print("Task %s complete!" 
% task_id) return True else: - print "Task still not complete after %d seconds" % timeout + print("Task still not complete after %d seconds" % timeout) return False @@ -319,7 +319,7 @@ def __exists(self, path): """ try: self.__sftp.stat(path) - except IOError, e: + except IOError as e: if e.errno == errno.ENOENT: return False raise @@ -329,9 +329,9 @@ def __exists(self, path): def __print_traceback(self): exc_type, exc_value, exc_traceback = sys.exc_info() - print "*** print_tb:" + print("*** print_tb:") traceback.print_tb(exc_traceback, limit=1, file=sys.stdout) - print "*** print_exception:" + print("*** print_exception:") traceback.print_exception(exc_type, exc_value, exc_traceback, limit=2, file=sys.stdout) diff --git a/pilot/filemanagement/globusonline_adaptor.py.bak b/pilot/filemanagement/globusonline_adaptor.py.bak new file mode 100644 index 00000000..f57dc5d6 --- /dev/null +++ b/pilot/filemanagement/globusonline_adaptor.py.bak @@ -0,0 +1,342 @@ +''' +Globus Online based File Transfer +''' +import urlparse +import pdb +import glob +import errno +import sys +import os +import stat +import logging +import traceback +import time + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../..")) +#sys.path.append(os.path.join(os.path.dirname(__file__), "..")) +from pilot.api import State +from bigjob import logger + +from globusonline.transfer import api_client + +#from globusonline.transfer.api_client.get_go_cookie import get_go_auth +from globusonline.transfer.api_client.goauth import get_access_token + +class GlobusOnlineFileAdaptor(object): + """ BigData Coordination File Management for Pilot Data """ + + URL_PREFIX="go://" + + + def __init__(self, service_url): + self.service_url = service_url + result = urlparse.urlparse(service_url) + self.host = result.netloc + self.query = result.path + self.ep = self.__get_ep(self.query) + self.path = self.__get_path(self.query) + self.user = result.username + self.password = result.password + + #result = get_go_auth(ca_certs=None, username=self.user, password=self.password) + result = get_access_token(ca_certs=None, username=self.user, password=self.password) + + #saml_cookie = result.cookie + saml_cookie = result.token + + self.api = api_client.TransferAPIClient(username=self.user, + goauth=saml_cookie + ) + status_code, status_message, data = self.api.task_list() + + # initialize ssh client + self.__state=State.New + + + def get_security_context(self): + """ Returns security context that needs to be available on the distributed + node in order to access this Pilot Data """ + return None + + + def initialize_pilotdata(self): + # check whether directory exists + try: + self.api.endpoint_mkdir(self.ep, self.path) + except: + pass + self.__state=State.Running + + + def get_pilotdata_size(self): + # check size + return None + + + def delete_pilotdata(self): + self.api.endpoint_delete(self.ep, self.path) + self.__state=State.Done + + + def get_state(self): + if self.__client.get_transport().is_active()==True: + return self.__state + else: + self.__state=State.Failed + return self.__state + + def create_du(self, du_id): + du_dir = os.path.join(self.path, str(du_id)) + logger.debug("mkdir: " + du_dir) + self.api.endpoint_mkdir(self.ep, du_dir) + + + def put_du(self, du): + logging.debug("Copy DU using Globus Online") + du_items = du.list() + for i in du_items.keys(): + local_filename=du_items[i]["local"] + remote_path = os.path.join(self.path, str(du.id), os.path.basename(local_filename)) + logging.debug("Put file: %s to %s"%(local_filename, 
remote_path)) + if local_filename.startswith("ssh://"): + # check if remote path is directory + if self.__is_remote_directory(local_filename): + logging.warning("Path %s is a directory. Ignored."%local_filename) + continue + result = urlparse.urlparse(local_filename) + source_host = result.netloc + source_path = result.path + logger.debug(str((source_host, source_path, self.host, remote_path))) + if source_host == "" or source_host==None: + cmd = "scp "+ source_path + " " + self.host + ":" + remote_path + else: + cmd = "scp "+ source_host+":"+source_path + " " + self.host + ":" + remote_path + logger.debug("Command: %s"%cmd) + os.system(cmd) + elif(local_filename.startswith("go://")): + self.__third_party_transfer_host(local_filename, self.service_url + "/" + str(du.id)) + + + + def copy_du_to_url(self, du, local_url, remote_url): + base_dir = self.__get_path_for_du(du) + logger.debug("copy_du_to_url, source: %s remote: %s"%(base_dir, remote_url)) + if remote_url.startswith("/") and os.path.exists(base_dir): + target_path = remote_url + source_path = base_dir + logger.debug("Target and source host are localhost. Processing: %s" %(source_path)) + expanded_path = glob.glob(source_path + "/*") + logger.debug("Expanded path: " + str(expanded_path)) + for path in expanded_path: + if os.path.isdir(path): + logger.debug("Source path %s is directory"%path) + files = os.listdir(path) + for i in files: + try: + os.symlink(os.path.join(files, i), target_path) + except: + self.__print_traceback() + else: + try: + os.symlink(path, os.path.join(target_path, os.path.basename(path))) + except: + self.__print_traceback() + else: + self.create_remote_directory(remote_url) + for filename in self.__sftp.listdir(base_dir): + file_url = local_url + "/" + filename + file_remote_url = remote_url + "/" + filename + logger.debug("Copy " + file_url + " to " + file_remote_url) + self.__third_party_transfer_host(file_url, file_remote_url) + + + + def copy_du(self, du, pd_new): + remote_url = pd_new.service_url + "/" + str(du.id) + local_url = self.service_url + "/" + str(du.id) + self.copy_du_to_url(du, local_url, remote_url) + + + def get_du(self, du, target_url): + remote_url = target_url + local_url = self.service_url + "/" + str(du.id) + self.copy_du_to_url(du, local_url, remote_url) + + + def remove_du(self, du): + self.__remove_directory(os.path.join(self.path, du.id)) + + + ########################################################################### + # Pure File Management APIs + + def transfer(self, source_url, target_url): + self.__third_party_transfer_host(source_url, target_url) + + + def create_remote_directory(self, target_url): + if not self.__is_remote_directory(target_url): + result = urlparse.urlparse(target_url) + target_query = result.path + target_ep = self.__get_ep(target_query) + target_path = self.__get_path(target_query) + result = self.api.endpoint_mkdir(target_ep, target_path) + logger.debug("GO EP: %s Directory: %s Creation Result: %s"%(target_ep, target_path, str(result))) + #task_id = result[2]["task_id"] + #logger.debug("Transfer Request Result: %s Task ID: %s"%(str(result), task_id)) + #self.__wait_for_task(task_id) + return True + return True + + + def get_path(self, target_url): + result = urlparse.urlparse(target_url) + target_query = result.path + target_path = self.__get_path(target_query) + return target_path + + ########################################################################### + # Private support methods + def __get_path_for_du(self, du): + return os.path.join(self.path, 
str(du.id)) + + + def __remove_directory(self, path): + """Remove remote directory that may contain files. + """ + if self.__exists(path): + for filename in self.__sftp.listdir(path): + filepath = os.path.join(path, filename) + logging.debug("Delete %s"%filepath) + if stat.S_ISDIR(self.__sftp.stat(filepath).st_mode): + [self.__remove_directory(filepath)] + else: + self.__sftp.remove(filepath) + self.__sftp.rmdir(path) + + + def __is_remote_directory(self, url): + try: + result = urlparse.urlparse(url) + target_query = result.path + target_ep = self.__get_ep(target_query) + target_path = self.__get_path(target_query) + result = self.api.endpoint_ls(target_ep, target_path) + logger.debug("GO EP: %s Directory: %s Creation Result: %s"%(target_ep, target_path, str(result))) + return True + except: + pass + return False + + + def __third_party_transfer_host(self, source_url, target_url): + """ + Transfers from source URL to machine to target_url + """ + transfer_start = time.time() + result = urlparse.urlparse(source_url) + source_query = result.path + source_ep = self.__get_ep(source_query) + source_path = self.__get_path(source_query) + + result = urlparse.urlparse(target_url) + target_query = result.path + target_ep = self.__get_ep(target_query) + target_path = self.__get_path(target_query) + + + target_path = os.path.join(target_path, os.path.basename(source_path)) + logger.debug("transfer from %s:%s to %s:%s"%(source_ep, source_path, target_ep, target_path)) + + if os.path.exists(os.path.dirname(source_path)) and os.path.exists(target_path): + logger.debug("Target and source host are localhost. Processing: %s" %(source_path)) + expanded_path = glob.glob(source_path) + logger.debug("Expanded path: " + str(expanded_path)) + for path in expanded_path: + if os.path.isdir(path): + logger.debug("Source path %s is directory"%path) + files = os.listdir(path) + for i in files: + try: + os.symlink(os.path.join(files, i), target_path) + except: + self.__print_traceback() + else: + try: + os.symlink(path, os.path.join(target_path, os.path.basename(path))) + except: + self.__print_traceback() + + transfer_id = self.api.submission_id()[2]["value"] + logger.debug("Transfer ID: %s"%transfer_id) + transfer = api_client.Transfer(transfer_id, source_ep, target_ep, + deadline=None, sync_level=None, label=None) + transfer.add_item(source_path=source_path, destination_path=target_path, recursive=False ) + result = self.api.transfer(transfer) + task_id = result[2]["task_id"] + logger.debug("Transfer Request Result: %s Task ID: %s"%(str(result), task_id)) + self.__wait_for_task(task_id) + logger.debug("Task ID: %s Time: %d sec"%(transfer_id, (time.time()-transfer_start))) + + + def __get_ep(self, query_string): + if query_string.startswith("?"): + query_string = query_string[1:] + comp = query_string.split("&") + for i in comp: + part = i.split("=") + if part[0]=="ep": + return part[1] + + def __get_path(self, query_string): + if query_string.startswith("?"): + query_string = query_string[1:] + comp = query_string.split("&") + for i in comp: + part = i.split("=") + if part[0]=="path": + return part[1] + + def __wait_for_task(self, task_id, timeout=None): + status = "ACTIVE" + while (timeout==None or timeout > 0) and status == "ACTIVE": + code, reason, data = self.api.task(task_id, fields="status") + status = data["status"] + time.sleep(1) + if timeout!=None: + timeout -= 1 + + if status != "ACTIVE": + print "Task %s complete!" 
% task_id + return True + else: + print "Task still not complete after %d seconds" % timeout + return False + + + def __exists(self, path): + """Return True if the remote path exists + """ + try: + self.__sftp.stat(path) + except IOError, e: + if e.errno == errno.ENOENT: + return False + raise + else: + return True + + + def __print_traceback(self): + exc_type, exc_value, exc_traceback = sys.exc_info() + print "*** print_tb:" + traceback.print_tb(exc_traceback, limit=1, file=sys.stdout) + print "*** print_exception:" + traceback.print_exception(exc_type, exc_value, exc_traceback, + limit=2, file=sys.stdout) + + +if __name__ == "__main__": + go = GlobusOnlineFileAdaptor("http://drelu:bigjob@cli.globusonline.org?ep=drelu#egi&path=/ho") + go.transfer("go://cli.globusonline.org?ep=drelu#MacBook&path=/~/cert.tar.gz", "go://cli.globusonline.org?ep=xsede#kraken&path=/~/") + diff --git a/pilot/filemanagement/gs_adaptor.py b/pilot/filemanagement/gs_adaptor.py index 20dd2649..a4690847 100644 --- a/pilot/filemanagement/gs_adaptor.py +++ b/pilot/filemanagement/gs_adaptor.py @@ -1,7 +1,7 @@ ''' Google Storage based File Transfer Implementation ''' -import urlparse +import urllib.parse import errno import sys @@ -22,7 +22,7 @@ from oauth2client.client import Credentials from oauth2client.tools import run import httplib2 -import urllib +import urllib.request, urllib.parse, urllib.error """ AN OAUTH2 Client Id must be created at the Google API console at: @@ -118,7 +118,7 @@ def create_du(self, du_id): def put_du(self, du): logger.debug("Copy DU to Google Storage") du_items = du.list() - for i in du_items.keys(): + for i in list(du_items.keys()): try: local_filename=du_items[i]["local"] remote_path = os.path.join(str(du.id), os.path.basename(local_filename)) @@ -183,7 +183,7 @@ def create_remote_directory(self, target_url): def get_path(self, target_url): - result = urlparse.urlparse(target_url) + result = urllib.parse.urlparse(target_url) target_path = result.path return target_path @@ -205,9 +205,9 @@ def __get_bucket_name(self, service_url): def __print_traceback(self): exc_type, exc_value, exc_traceback = sys.exc_info() - print "*** print_tb:" + print("*** print_tb:") traceback.print_tb(exc_traceback, limit=1, file=sys.stdout) - print "*** print_exception:" + print("*** print_exception:") traceback.print_exception(exc_type, exc_value, exc_traceback, limit=2, file=sys.stdout) diff --git a/pilot/filemanagement/gs_adaptor.py.bak b/pilot/filemanagement/gs_adaptor.py.bak new file mode 100644 index 00000000..20dd2649 --- /dev/null +++ b/pilot/filemanagement/gs_adaptor.py.bak @@ -0,0 +1,220 @@ +''' +Google Storage based File Transfer Implementation +''' +import urlparse + +import errno +import sys +import os +import stat +import logging +import traceback +import time + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../..")) +from pilot.api import State +from bigjob import logger + +from apiclient.discovery import build +from apiclient.http import MediaFileUpload +from oauth2client.file import Storage +from oauth2client.client import OAuth2WebServerFlow +from oauth2client.client import Credentials +from oauth2client.tools import run +import httplib2 +import urllib + +""" +AN OAUTH2 Client Id must be created at the Google API console at: + +https://code.google.com/apis/console/ + +=> API Access + +More information with respect to OAUTH: https://developers.google.com/compute/docs/api/how-tos/authorization +""" 
+OAUTH2_CLIENT_ID='1004462711324-55akehip32m59u6omdfrt9s8u8ehb0hm.apps.googleusercontent.com' +OAUTH2_CLIENT_SECRET='EIMML1W7anu0XijVghws0DY-' + +GS_PROJECT_ID="1004462711324" + + +class GSFileAdaptor(object): + """ BigData File Management for Pilot Data """ + + + def __init__(self, service_url, security_context=None): + # Initializations of instance variables + self.service_url = service_url + self.bucket_name = self.__get_bucket_name(service_url) + self.__state=State.New + + # Do OAUTH authentication + if security_context!=None: + logger.debug("Attempt to restore credentials from security context: " + str(security_context)) + self.credentials = Credentials.new_from_json(security_context) + else: + storage = Storage('gce.dat') + self.credentials = storage.get() + if self.credentials is None or self.credentials.invalid == True: + logger.debug("No valid credential found. Run new OAuth authentication round...") + flow = OAuth2WebServerFlow( + client_id=OAUTH2_CLIENT_ID, + client_secret=OAUTH2_CLIENT_SECRET, + scope=['https://www.googleapis.com/auth/devstorage.full_control', + 'https://www.googleapis.com/auth/compute'], + user_agent='bigjob-client/1.0') + + self.credentials = run(flow, storage) + + + def get_security_context(self): + """ Returns security context that needs to be available on the distributed + node in order to access this Pilot Data """ + return self.credentials.to_json() + + + def initialize_pilotdata(self): + # check whether directory exists + self.__state=State.Running + request_dict = { + "id":self.bucket_name, + "projectId":GS_PROJECT_ID + } + logger.debug(str(request_dict)) + try: + gs = self.__get_api_client()[0] + gs.buckets().insert(body=request_dict).execute() + except: + logger.debug("Error creating bucket: " + self.bucket_name) + pass # Do nothing if bucket already exists + + + def get_pilotdata_size(self): + # unlimited size + return None + + + def delete_pilotdata(self): + self.__state=State.Done + + + def get_state(self): + return self.__state + + + def create_du(self, du_id): + gs = self.__get_api_client()[0] + o = gs.objects().insert(bucket=self.bucket_name, name=str(du_id)+"/du_info", + body={'media': { + "contentType":"text/ascii", + "data": du_id + } + } + ).execute() + logger.debug("Created GS: " + str(o)) + + + def put_du(self, du): + logger.debug("Copy DU to Google Storage") + du_items = du.list() + for i in du_items.keys(): + try: + local_filename=du_items[i]["local"] + remote_path = os.path.join(str(du.id), os.path.basename(local_filename)) + self._put_file(local_filename, remote_path) + except: + logger.debug("Could not copy file: " + (str(i))) + + + def copy_du(self, du, pd_new): + pass + + + def get_du(self, du, target_url): + du_id=du.id + logger.debug("Get DU: " + str(du_id)) + gs = self.__get_api_client()[0] + result = gs.objects().list(bucket=self.bucket_name, prefix=du_id).execute() + #delimiter="/", + #prefix=[du_id]).execute() + logger.debug("Result: " + str(result)) + for i in result["items"]: + full_filename = i["name"] + self._get_file(full_filename, os.path.join(target_url, os.path.basename(full_filename))) + + + def remove_du(self, du): + self.__remove_directory(os.path.join(self.bucket_name, du.id)) + + + ########################################################################### + # Pure File Management APIs + def _put_file(self, source, target): + logger.debug("Put file: %s to %s"%(source, target)) + gs = self.__get_api_client()[0] + #media = MediaFileUpload(source, + # resumable=False) + o = 
gs.objects().insert(bucket=self.bucket_name, + name=target, + media_body=source).execute() + logger.debug("Put file result: %s"%str(o)) + + + def _get_file(self, source, target): + logger.debug("GET file: %s to %s"%(source, target)) + gs, http = self.__get_api_client() + f = gs.objects().get(bucket=self.bucket_name, + object=source).execute() + logger.debug("Get file result: %s"%str(f)) + downloadUrl = f["media"]['link'] + if downloadUrl: + response, content = http.request(downloadUrl) + logger.debug("Download file response: %d"%(response.status)) + with open(target, 'wb') as f: + f.write(content) + + + def transfer(self, source_url, target_url): + pass + + def create_remote_directory(self, target_url): + return True + + + def get_path(self, target_url): + result = urlparse.urlparse(target_url) + target_path = result.path + return target_path + + + ########################################################################### + # Auxiliary functions + def __get_api_client(self): + http = httplib2.Http() + http = self.credentials.authorize(http) + gs = build("storage", "v1beta1", http=http) + return gs, http + + + def __get_bucket_name(self, service_url): + bucket_name = service_url.replace("gs://", "") + bucket_name = bucket_name.replace("/", "") + return bucket_name + + + def __print_traceback(self): + exc_type, exc_value, exc_traceback = sys.exc_info() + print "*** print_tb:" + traceback.print_tb(exc_traceback, limit=1, file=sys.stdout) + print "*** print_exception:" + traceback.print_exception(exc_type, exc_value, exc_traceback, + limit=2, file=sys.stdout) + + +if __name__ == "__main__": + gs = GSFileAdaptor("gs://pilot-data-bucket-1234") + gs.initialize_pilotdata() + gs._put_file("test-random.exe", "test.exe") + gs._get_file("test.txt", "test2.txt") + gs.get_du(None, ".") \ No newline at end of file diff --git a/pilot/filemanagement/irods_adaptor.py b/pilot/filemanagement/irods_adaptor.py index 970fa0b7..1c19ccfe 100644 --- a/pilot/filemanagement/irods_adaptor.py +++ b/pilot/filemanagement/irods_adaptor.py @@ -1,7 +1,7 @@ ''' iRods based File Transfer Implementation ''' -import urlparse +import urllib.parse import datetime import errno import sys @@ -51,7 +51,7 @@ def __is_local(self): env_var = match.group(1) logger.debug("Found: " + env_var + " in URL.") logger.debug("Env list: " + str(os.environ)) - if os.environ.has_key(env_var): + if env_var in os.environ: self.localpath = re.sub(r'\$\{.*\}', os.environ[env_var], self.localpath) #self.localpath = os.environ[env_var] logger.debug("Expanding URL Path to: " + self.localpath) @@ -96,7 +96,7 @@ def put_du(self, du): start = time.time() logger.debug("Copy DU to iRod") du_items = du.list() - for i in du_items.keys(): + for i in list(du_items.keys()): try: local_filename=du_items[i]["local"] remote_path = os.path.join(str(du.id), os.path.basename(local_filename)) @@ -126,13 +126,13 @@ def get_du(self, du, target_url): for i in files: try: os.symlink(os.path.join(files, i), target_path) - os.chmod(os.path.join(target_path, os.path.basename(path)), 0777) + os.chmod(os.path.join(target_path, os.path.basename(path)), 0o777) except: self.__print_traceback() else: try: os.symlink(path, os.path.join(target_path, os.path.basename(path))) - os.chmod(os.path.join(target_path, os.path.basename(path)), 0777) + os.chmod(os.path.join(target_path, os.path.basename(path)), 0o777) except: self.__print_traceback() @@ -149,7 +149,7 @@ def get_du(self, du, target_url): for i in os.listdir(full_path): try: logger.debug("chmod " + str(i)) - 
os.chmod(os.path.join(full_path, i), 0777) + os.chmod(os.path.join(full_path, i), 0o777) logger.debug("move " + str(i)) shutil.move(os.path.join(full_path, i), target_url) except: @@ -223,9 +223,9 @@ def __run_command(self, command): def __print_traceback(self): exc_type, exc_value, exc_traceback = sys.exc_info() - print "*** print_tb:" + print("*** print_tb:") traceback.print_tb(exc_traceback, limit=1, file=sys.stderr) - print "*** print_exception:" + print("*** print_exception:") traceback.print_exception(exc_type, exc_value, exc_traceback, limit=2, file=sys.stderr) diff --git a/pilot/filemanagement/irods_adaptor.py.bak b/pilot/filemanagement/irods_adaptor.py.bak new file mode 100644 index 00000000..970fa0b7 --- /dev/null +++ b/pilot/filemanagement/irods_adaptor.py.bak @@ -0,0 +1,244 @@ +''' +iRods based File Transfer Implementation +''' +import urlparse +import datetime +import errno +import sys +import os +import stat +import logging +import traceback +import time +import re +import shutil +import pdb +import glob +import pexpect + +# This is for local debugging! +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../..")) + +import saga +from pilot.api import State +from bigjob import logger + + + +class iRodsFileAdaptor(object): + """ BigData File Management for Pilot Data + Supports pilot data on top of iRods. + + Assumption: working iRods installation + + irods://localhost/${OSG_DATA}/?vo=&resource-group= + + """ + def __init__(self, resource_url, security_context=None, pilot_data_description=None): + self.resource_url = saga.Url(resource_url) + query_string = self.resource_url.query + self.localpath = self.resource_url.path + self.vo = re.search("(?<=vo=)(.*)([&\b]{1})", query_string).group(1) + self.resource_group = re.search("(?<=resource-group=)(.*)[&\b$]?", query_string).group(1) + logger.debug("VO: %s, Resource Group: %s"%(self.vo, self.resource_group)) + self.is_local = self.__is_local() + + + def __is_local(self): + # test whether path contains environment variable + match = re.search("\$\{(.*)\}", self.localpath) + if match: + env_var = match.group(1) + logger.debug("Found: " + env_var + " in URL.") + logger.debug("Env list: " + str(os.environ)) + if os.environ.has_key(env_var): + self.localpath = re.sub(r'\$\{.*\}', os.environ[env_var], self.localpath) + #self.localpath = os.environ[env_var] + logger.debug("Expanding URL Path to: " + self.localpath) + return True + logger.debug("No expansion in: " + self.localpath) + return False + + + def get_security_context(self): + """ Returns security context that needs to be available on the distributed + node in order to access this Pilot Data """ + return None + + + def initialize_pilotdata(self): + pass + + + def get_pilotdata_size(self): + # unlimited size + return None + + + def delete_pilotdata(self): + self.__state=State.Done + + + def get_state(self): + return self.__state + + + def create_du(self, du_id): + logger.debug("create iRods collection: " + du_id) + if self.is_local: + command = "mkdir %s"%(os.path.join(self.localpath, du_id)) + else: + command = "imkdir %s"%(du_id) + self.__run_command(command) + + + def put_du(self, du): + start = time.time() + logger.debug("Copy DU to iRod") + du_items = du.list() + for i in du_items.keys(): + try: + local_filename=du_items[i]["local"] + remote_path = os.path.join(str(du.id), os.path.basename(local_filename)) + logger.debug("copy %s to %s"%(local_filename, remote_path)) + self._put_file(local_filename, remote_path) + except: + logger.debug("Could not copy: " + str(i)) + 
logger.debug("Finished Put DU in: " + str(time.time()-start) + " sec.") + + + def get_du(self, du, target_url): + #du_id = "du-7370d7b5-ed0b-11e1-95df-705681b3df0f" + start = time.time() + du_id = du.id + logger.debug("Get DU: " + str(du_id)) + if self.is_local: + command = "cp -r %s %s"%(os.path.join(self.localpath, du_id), target_url) + source_path = os.path.join(self.localpath, du_id, "*") + target_path = target_url + logger.debug("Target and source host are localhost. Processing: %s" %(source_path)) + expanded_path = glob.glob(source_path) + logger.debug("Expanded path: " + str(expanded_path)) + for path in expanded_path: + if os.path.isdir(path): + logger.debug("Source path %s is directory"%path) + files = os.listdir(path) + for i in files: + try: + os.symlink(os.path.join(files, i), target_path) + os.chmod(os.path.join(target_path, os.path.basename(path)), 0777) + except: + self.__print_traceback() + else: + try: + os.symlink(path, os.path.join(target_path, os.path.basename(path))) + os.chmod(os.path.join(target_path, os.path.basename(path)), 0777) + except: + self.__print_traceback() + + else: + command = "iget -f -r %s %s"%(du_id, target_url) + logger.debug(command) + self.__run_command(command) + + full_path = os.path.join(target_url, du_id) + #logger.debug("Path: " + str(full_path) + " Exists: " + str(os.path.exists(full_path))) + #while os.path.exists(full_path)==False: + # time.sleep(1) + + for i in os.listdir(full_path): + try: + logger.debug("chmod " + str(i)) + os.chmod(os.path.join(full_path, i), 0777) + logger.debug("move " + str(i)) + shutil.move(os.path.join(full_path, i), target_url) + except: + self.__print_traceback() + + shutil.rmtree(full_path, ignore_errors=True) + #time.sleep(2) + #if target_url==".": + # target_url = os.getcwd() + #command = "mv %s/* %s"%(os.path.join(target_url, du_id), target_url) + #self.__run_command(command) + logger.debug("Finished Get DU " + du.id + " in: " + str(time.time()-start) + " sec.") + + + def copy_du(self, du, pd_new): + remote_url = pd_new.resource_url + "/" + str(du.id) + local_url = self.resource_url + "/" + str(du.id) + self.copy_du_to_url(du, local_url, remote_url) + + + def remove_du(self, du): + if self.is_local: + command = "rm -rf %s"%(os.path.join(self.localpath, du.id)) + else: + command = "irm %s"%du.id + + self.__run_command(command) + + + ########################################################################### + # Pure File Management APIs + def _put_file(self, source, target): + logger.debug("Put file: %s to %s"%(source, target)) + start = time.time() + if self.is_local: + command = "cp -r %s %s"%(source, target) + else: + command = "iput -f -R %s %s %s"%(self.resource_group, source, target) + self.__run_command(command) + put_time = time.time() - start + number_replica = 0 + if self.is_local==False: + #pdb.set_trace() + home_directory= self.__run_command("ipwd")[0].strip() + full_filename = os.path.join(home_directory, target) + command = "irepl-osg -f %s -G %s"%(full_filename, self.resource_group) + output = self.__run_command(command) + for i in output: + if i.find("copied") > 0 or i.find("replica")>0: + number_replica = number_replica + 1 + rep_time = time.time() - start - put_time + logger.info("Upload;Replication;Total;File Size;Backend;Number Replica;Timestamp: %f;%f;%f;%d;%s;%d;%s"%(put_time, rep_time, time.time()-start, os.path.getsize(source), self.resource_group, number_replica, datetime.datetime.today().isoformat())) + + + def transfer(self, source_url, target_url): + pass + + + def 
create_remote_directory(self, target_url): + return True + + + ########################################################################### + def __run_command(self, command): + logger.debug(command) + child = pexpect.spawn(command, timeout=None) + output = child.readlines() + logger.debug("Run %s Output: %s"%(command, str(output))) + child.close() + return output + + def __print_traceback(self): + exc_type, exc_value, exc_traceback = sys.exc_info() + print "*** print_tb:" + traceback.print_tb(exc_traceback, limit=1, file=sys.stderr) + print "*** print_exception:" + traceback.print_exception(exc_type, exc_value, exc_traceback, + limit=2, file=sys.stderr) + + +def test_irods(): + irods = iRodsFileAdaptor("irods://gw68/${OSG_DATA}/osg/home/luckow/?vo=osg&resource-group=osgGridFtpGroup") + irods.initialize_pilotdata() + irods.create_du("du-7370d7b5-ed0b-11e1-95df-705681b3df0f") + irods._put_file("test.txt", "du-7370d7b5-ed0b-11e1-95df-705681b3df0f/test.txt") + irods.get_du("du-7370d7b5-ed0b-11e1-95df-705681b3df0f", "export") + + + +if __name__ == "__main__": + test_irods() + diff --git a/pilot/filemanagement/s3_adaptor.py b/pilot/filemanagement/s3_adaptor.py index 19e50f47..bc028847 100644 --- a/pilot/filemanagement/s3_adaptor.py +++ b/pilot/filemanagement/s3_adaptor.py @@ -1,7 +1,7 @@ ''' Amazon S3 based File Transfer Implementation ''' -import urlparse +import urllib.parse import errno import sys @@ -69,8 +69,8 @@ def __init__(self, resource_url, security_context=None, pilot_data_description=N # try to recover key from pilot_data_description if self.pilot_data_description!=None and\ - self.pilot_data_description.has_key("access_key_id") and \ - self.pilot_data_description.has_key("secret_access_key"): + "access_key_id" in self.pilot_data_description and \ + "secret_access_key" in self.pilot_data_description: aws_access_key_id=self.pilot_data_description["access_key_id"] aws_secret_access_key=self.pilot_data_description["secret_access_key"] @@ -107,7 +107,7 @@ def __init__(self, resource_url, security_context=None, pilot_data_description=N self.s3_region = None # Region specifier according to Amazon API: # http://docs.aws.amazon.com/AmazonS3/latest/API/RESTBucketGETlocation.html - if self.pilot_data_description.has_key("region"): + if "region" in self.pilot_data_description: self.s3_region = self.pilot_data_description["region"] self.s3_conn = S3Connection( @@ -163,7 +163,7 @@ def create_du(self, du_id): def put_du(self, du): logger.debug("Copy DU to S3/Walrus") du_items = du.list() - for i in du_items.keys(): + for i in list(du_items.keys()): try: local_filename=du_items[i]["local"] remote_path = os.path.join(str(du.id), os.path.basename(local_filename)) @@ -276,9 +276,9 @@ def __get_key_name(self, resource_url): def __print_traceback(self): exc_type, exc_value, exc_traceback = sys.exc_info() - print "*** print_tb:" + print("*** print_tb:") traceback.print_tb(exc_traceback, limit=1, file=sys.stdout) - print "*** print_exception:" + print("*** print_exception:") traceback.print_exception(exc_type, exc_value, exc_traceback, limit=2, file=sys.stdout) diff --git a/pilot/filemanagement/s3_adaptor.py.bak b/pilot/filemanagement/s3_adaptor.py.bak new file mode 100644 index 00000000..19e50f47 --- /dev/null +++ b/pilot/filemanagement/s3_adaptor.py.bak @@ -0,0 +1,343 @@ +''' +Amazon S3 based File Transfer Implementation +''' +import urlparse + +import errno +import sys +import os +import stat +import logging +import traceback +import time + +# This is for local debugging! 
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../..")) + +import saga +from pilot.api import State +from bigjob import logger + +################## + +from boto.s3.connection import S3Connection, OrdinaryCallingFormat +from boto.s3.key import Key +from boto.s3.connection import Location + +# Authentication +# Please use ~/.boto file to configure your security credentials (if possible) +# see http://boto.readthedocs.org/en/latest/boto_config_tut.html +# +# [Credentials] +# aws_access_key_id = +# aws_secret_access_key = +# +# Alternatively you can use these two variables +AWS_ACCESS_KEY_ID=None +AWS_SECRET_ACCESS_KEY=None + +class S3FileAdaptor(object): + """ BigData File Management for Pilot Data + Supports pilot data on top of S3 and Eucalyptus Walrus + + s3:// + walrus:// + + Amazon S3 supported regions: + + Default is an empty string "" => us-east-1 + + 'ap-northeast-1' + 'ap-southeast-1' + 'ap-southeast-2' + 'EU' + 'sa-east-1' + 'us-west-1' + 'us-west-2' + + """ + + + + def __init__(self, resource_url, security_context=None, pilot_data_description=None): + self.resource_url = saga.Url(resource_url) + self.bucket_name = self.__get_bucket_name(resource_url) + self.__state=State.New + self.pilot_data_description = pilot_data_description + aws_access_key_id=None + aws_secret_access_key=None + + # try to recover key from pilot_data_description + if self.pilot_data_description!=None and\ + self.pilot_data_description.has_key("access_key_id") and \ + self.pilot_data_description.has_key("secret_access_key"): + aws_access_key_id=self.pilot_data_description["access_key_id"] + aws_secret_access_key=self.pilot_data_description["secret_access_key"] + + # try to recover key from security context + if security_context!=None: + logger.debug("Attempt to restore credentials from security context: " + str(security_context)) + security_context = eval(security_context) + aws_access_key_id=security_context["aws_access_key_id"] + aws_secret_access_key=security_context["aws_secret_access_key"] + + self.s3_conn=None + if self.resource_url.scheme == "walrus" or self.resource_url.scheme == "swift": + calling_format=OrdinaryCallingFormat() + logger.debug("Access Key: %s Secret: %s Host: %s"%(aws_access_key_id, + aws_secret_access_key, + self.resource_url.host) + ) + port = 8773 + if self.resource_url.port!=None: + port = self.resource_url.port + + path = "/" + if self.resource_url.scheme == "walrus": + path = "/services/Walrus" + + self.s3_conn = S3Connection(aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + is_secure=False, + host=self.resource_url.host, + port=port, + calling_format=calling_format, + path=path) + else: # s3:// urls + self.s3_region = None + # Region specifier according to Amazon API: + # http://docs.aws.amazon.com/AmazonS3/latest/API/RESTBucketGETlocation.html + if self.pilot_data_description.has_key("region"): + self.s3_region = self.pilot_data_description["region"] + + self.s3_conn = S3Connection( + aws_access_key_id, + aws_secret_access_key, + ) + + + + def get_security_context(self): + """ Returns security context that needs to be available on the distributed + node in order to access this Pilot Data """ + return {"aws_access_key_id": self.s3_conn.aws_access_key_id, + "aws_secret_access_key": self.s3_conn.aws_secret_access_key} + + + + def initialize_pilotdata(self): + # Create bucket + try: + if self.s3_region==None: + logger.debug("Use default S3 region.") + self.s3_region = "" # Default for US East + self.bucket = 
self.s3_conn.create_bucket(self.bucket_name, location=self.s3_region) + except: + # bucket already exists + #self.__print_traceback() + self.bucket = self.s3_conn.get_bucket(self.bucket_name) + + self.__state=State.Running + + + def get_pilotdata_size(self): + # unlimited size + return None + + + def delete_pilotdata(self): + self.__state=State.Done + + + def get_state(self): + return self.__state + + + def create_du(self, du_id): + logger.debug("create object: " + du_id) + k = Key(self.bucket) + k.key = str(du_id)+"/du_info" + k.set_contents_from_string(du_id) + + + def put_du(self, du): + logger.debug("Copy DU to S3/Walrus") + du_items = du.list() + for i in du_items.keys(): + try: + local_filename=du_items[i]["local"] + remote_path = os.path.join(str(du.id), os.path.basename(local_filename)) + logger.debug("copy %s to %s"%(local_filename, remote_path)) + self._put_file(local_filename, remote_path) + except: + logger.debug("Could not copy: " + str(i)) + + + def get_du(self, du, target_url): + #du_id = "du-7370d7b5-ed0b-11e1-95df-705681b3df0f" + du_id = du.id + logger.debug("Get DU: " + str(du_id)) + result = self.bucket.list(prefix=du_id) + logger.debug("Result Bucket List: " + str(result)) + for key in result: + logger.debug(str(key)) + full_filename = key.name + if full_filename != None: + logger.debug("Process file: " + full_filename) + if not full_filename.endswith("/"): + self._get_file(full_filename, os.path.join(target_url, os.path.basename(full_filename))) + + + def copy_du(self, du, pd_new): + remote_url = pd_new.resource_url + "/" + str(du.id) + local_url = self.resource_url + "/" + str(du.id) + self.copy_du_to_url(du, local_url, remote_url) + + + + + def remove_du(self, du): + self.__remove_directory(os.path.join(self.path, du.id)) + + + ########################################################################### + # Pure File Management APIs + def _put_file(self, source, target): + logger.debug("Put file: %s to %s"%(source, target)) + if self.__starts_with_valid_prefix(source): + logger.debug("Copy file from S3/Walrus") + source_bucket_name = self.__get_bucket_name(source) + source_key_name = self.__get_key_name(source) + self.bucket.copy_key(target, source_bucket_name, source_key_name) + #k = Key(source_bucket_name) + #k.copy(self.bucket_name, target) + else: + logger.debug("Copy file from Local") + k = Key(self.bucket) + k.key=target + k.set_contents_from_filename(source) + logger.debug("Put file result: %s"%source) + + + def _get_file(self, source, target): + logger.debug("GET file: %s to %s"%(source, target)) + k = self.bucket.get_key(source) + k.key=source + k.get_contents_to_filename(target) + + + def transfer(self, source_url, target_url): + pass + + def create_remote_directory(self, target_url): + return True + + + ########################################################################### + def __starts_with_valid_prefix(self, url): + valid_prefix=["s3", "walrus"] + result = False + for i in valid_prefix: + result = url.startswith(i) + if result == True: + break + return result + + def __get_bucket_name(self, resource_url): + surl = saga.Url(resource_url) + if surl.scheme.startswith("s3"): + bucket_name = resource_url.replace("s3://", "") + try: + bucket_name = bucket_name[:bucket_name.index("/")] + except: + pass + #bucket_name = bucket_name.replace("/", "") + else: + bucket_name = surl.path[1:] + return bucket_name + + + def __get_key_name(self, resource_url): + surl = saga.Url(resource_url) + # get path out of URL + if surl.scheme.startswith("s3"): + bucket_name = 
resource_url.replace("s3://", "") + else: + bucket_name = surl.path[1:] + + # get key path out of URL + try: + key_name = bucket_name[bucket_name.index("/")+1:] + except: + pass + + return key_name + + + def __print_traceback(self): + exc_type, exc_value, exc_traceback = sys.exc_info() + print "*** print_tb:" + traceback.print_tb(exc_traceback, limit=1, file=sys.stdout) + print "*** print_exception:" + traceback.print_exception(exc_type, exc_value, exc_traceback, + limit=2, file=sys.stdout) + + +def test_walrus(): + s3 = S3FileAdaptor("walrus://149.165.146.135/pilot-data-c4eb26eb-ed0c-11e1-ac98-705681b3df0f", + pilot_data_description={ "access_key_id":"8MCXRAMXMHDYKWNKXZ8WF", + "secret_access_key":"YrcUqSw2Arxshrh3ZtenkxerWwCWdMTKvZYoLPAo" }) + s3.initialize_pilotdata() + s3._put_file("test.txt", "du-7370d7b5-ed0b-11e1-95df-705681b3df0f/test.txt") + s3._get_file("du-7370d7b5-ed0b-11e1-95df-705681b3df0f/test.txt", "test2.txt") + s3.get_du("du-7370d7b5-ed0b-11e1-95df-705681b3df0f", ".") + +def test_swift(): + s3 = S3FileAdaptor("swift://149.165.146.50:3333/pilot-data-c4eb26eb-ed0c-11e1-ac98-705681b3df0f", + pilot_data_description={ "access_key_id":"f9716a49c92a4a4cbedb6aba5e78d682", + "secret_access_key":"bcdff54b7fe94d63b4412c762e823a84" }) + s3.initialize_pilotdata() + s3._put_file("test.txt", "du-7370d7b5-ed0b-11e1-95df-705681b3df0f/test.txt") + s3._get_file("du-7370d7b5-ed0b-11e1-95df-705681b3df0f/test.txt", "test2.txt") + s3.get_du("du-7370d7b5-ed0b-11e1-95df-705681b3df0f", ".") + +def test_s3import(): + s3 = S3FileAdaptor("s3://pilot-data-andre-test-create-from-s3-url", + pilot_data_description={ "access_key_id":"AKIAJPGNDJRYIG5LIEUA", + "secret_access_key":"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" }) + s3.initialize_pilotdata() + s3._put_file("s3://pilot-data-05d88e40-f65b-11e1-a327-00215ec9e3ac/du-3624837e-f66f-11e1-a327-00215ec9e3ac/WRT54GS_UG_WEB_20070529.pdf", "bla/test.pdf") + +def test_s3import_via_pilotapi(): + COORDINATION_URL="redis://localhost:6379" + from pilot import PilotComputeService, PilotDataService, ComputeDataService, State + pilot_data_service = PilotDataService(coordination_url=COORDINATION_URL) + + ################################################################################################### + # Pick one of the Pilot Data Descriptions below + + pilot_data_description_aws={ + "service_url": "s3://pilot-data-andre-workflow", + "size": 100, + "affinity_datacenter_label": "us-east-1", + "affinity_machine_label": "" , + "access_key_id": "AKIAJPGNDJRYIG5LIEUA", + "secret_access_key":"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", + } + + pd = pilot_data_service.create_pilot(pilot_data_description=pilot_data_description_aws) + + data_unit_description = { + "file_urls": ['s3://pilot-data-cec5d816-fa8f-11e1-ab5e-e61f1322a75c/du-67b4c762-fa90-11e1-ab5e-e61f1322a75c/ip-10-84-173-21512MB_2.input-chunk-02'], + "affinity_datacenter_label": "us-east-1", + "affinity_machine_label": "" + } + + # submit pilot data to a pilot store + input_data_unit = pd.submit_data_unit(data_unit_description) + input_data_unit.wait() + + +if __name__ == "__main__": + test_s3import_via_pilotapi() + diff --git a/pilot/filemanagement/ssh_adaptor.py b/pilot/filemanagement/ssh_adaptor.py index 57f2e863..c0952f1d 100644 --- a/pilot/filemanagement/ssh_adaptor.py +++ b/pilot/filemanagement/ssh_adaptor.py @@ -1,7 +1,7 @@ ''' SSH-based coordination scheme between manager and agent ''' -import urlparse +import urllib.parse import pdb import glob import errno @@ -26,7 +26,7 @@ class SSHFileAdaptor(object): 
def __init__(self, service_url, security_context=None, pilot_data_description=None): self.service_url = service_url - result = urlparse.urlparse(service_url) + result = urllib.parse.urlparse(service_url) self.host = result.hostname self.path = result.path self.user = result.username @@ -39,7 +39,7 @@ def __init__(self, service_url, security_context=None, pilot_data_description=No # try to recover key from pilot_data_description if self.pilot_data_description!=None and\ - self.pilot_data_description.has_key("userkey"): + "userkey" in self.pilot_data_description: self.userkey=self.pilot_data_description["userkey"] logger.debug("Security Context: " + str(self.security_context)) @@ -59,7 +59,7 @@ def __init__(self, service_url, security_context=None, pilot_data_description=No logger.debug("write: " + str(i)) f.write(i) f.close() - os.chmod(self.userkey, 0600) + os.chmod(self.userkey, 0o600) except: self.__print_traceback() @@ -72,7 +72,7 @@ def __init__(self, service_url, security_context=None, pilot_data_description=No def get_security_context(self): """ Returns security context that needs to be available on the distributed node in order to access this Pilot Data """ - if (self.security_context==None or self.security_context=="None") and self.pilot_data_description.has_key("userkey"): + if (self.security_context==None or self.security_context=="None") and "userkey" in self.pilot_data_description: f = open(self.pilot_data_description["userkey"]) key = f.readlines() f.close @@ -122,7 +122,7 @@ def put_du(self, du): def put_du_scp(self, du): logger.debug("Copy DU using SCP") du_items = du.list() - for i in du_items.keys(): + for i in list(du_items.keys()): local_filename = du_items[i]["local"] remote_path = os.path.join(self.path, str(du.id), os.path.basename(local_filename)) logger.debug("Put file: %s to %s"%(i, remote_path)) @@ -140,7 +140,7 @@ def put_du_scp(self, du): continue except: pass - result = urlparse.urlparse(local_filename) + result = urllib.parse.urlparse(local_filename) source_host = result.netloc source_path = result.path source_user = result.username @@ -178,7 +178,7 @@ def transfer(self, source_url, target_url): def create_remote_directory(self, target_url): - result = urlparse.urlparse(target_url) + result = urllib.parse.urlparse(target_url) target_host = result.hostname target_path = result.path target_user = result.username @@ -192,7 +192,7 @@ def create_remote_directory(self, target_url): def get_path(self, target_url): - result = urlparse.urlparse(target_url) + result = urllib.parse.urlparse(target_url) return result.path @@ -220,7 +220,7 @@ def __remove_directory(self, path): def __is_remote_directory(self, url): - result = urlparse.urlparse(url) + result = urllib.parse.urlparse(url) host = result.hostname path = result.path user = result.username @@ -235,14 +235,14 @@ def __is_remote_directory(self, url): def __third_party_transfer_scp(self, source_url, target_url): - result = urlparse.urlparse(source_url) + result = urllib.parse.urlparse(source_url) source_host = result.hostname source_path = result.path source_user = result.username if source_host==None or source_host=="": source_host="localhost" - result = urlparse.urlparse(target_url) + result = urllib.parse.urlparse(target_url) target_host = result.hostname target_path = result.path target_user = result.username @@ -358,9 +358,9 @@ def __run_check(self): def __print_traceback(self): exc_type, exc_value, exc_traceback = sys.exc_info() - print "*** print_tb:" + print("*** print_tb:") traceback.print_tb(exc_traceback, 
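# --- Illustrative aside (not part of the patch): a minimal, standalone sketch of the
# --- Python 3 idioms the ssh_adaptor.py hunks above switch to: urlparse -> urllib.parse,
# --- dict.has_key(key) -> key in dict, and the octal literal 0600 -> 0o600.
# --- The description dict and key path below are hypothetical examples.
import os
import urllib.parse

def parse_service_url(service_url):
    # urllib.parse.urlparse exposes the same fields urlparse.urlparse did in Python 2.
    result = urllib.parse.urlparse(service_url)
    return result.username, result.hostname, result.path

user, host, path = parse_service_url("ssh://alice@remote.example.org/tmp/pilot-data")

pilot_data_description = {"userkey": os.path.expanduser("~/.ssh/id_rsa")}
if "userkey" in pilot_data_description and os.path.exists(pilot_data_description["userkey"]):
    # Python 3 requires the 0o prefix for octal file modes.
    os.chmod(pilot_data_description["userkey"], 0o600)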
limit=1, file=sys.stderr) - print "*** print_exception:" + print("*** print_exception:") traceback.print_exception(exc_type, exc_value, exc_traceback, limit=2, file=sys.stderr) diff --git a/pilot/filemanagement/ssh_adaptor.py.bak b/pilot/filemanagement/ssh_adaptor.py.bak new file mode 100644 index 00000000..57f2e863 --- /dev/null +++ b/pilot/filemanagement/ssh_adaptor.py.bak @@ -0,0 +1,366 @@ +''' +SSH-based coordination scheme between manager and agent +''' +import urlparse +import pdb +import glob +import errno +import sys +import os +import stat +import logging +import traceback +import pexpect +from pexpect import TIMEOUT +from pilot.api.api import PilotError + +sys.path.append(os.path.join(os.path.dirname(__file__), "../..")) +from pilot.api import State +from bigjob import logger + +SSH_OPTS="-o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o NumberOfPasswordPrompts=0" + + +class SSHFileAdaptor(object): + """ BigData Coordination File Management for Pilot Store """ + + def __init__(self, service_url, security_context=None, pilot_data_description=None): + self.service_url = service_url + result = urlparse.urlparse(service_url) + self.host = result.hostname + self.path = result.path + self.user = result.username + + self.pilot_data_description=pilot_data_description + + # handle security context + self.userkey=None + self.security_context=security_context + + # try to recover key from pilot_data_description + if self.pilot_data_description!=None and\ + self.pilot_data_description.has_key("userkey"): + self.userkey=self.pilot_data_description["userkey"] + + logger.debug("Security Context: " + str(self.security_context)) + + # try to recover key from security context + if security_context!=None and security_context!="None": + logger.debug("Attempt to restore SSH credentials from security context: " + str(security_context)) + security_context = eval(security_context) + key=security_context["userkey"] + self.userkey=os.path.join(os.getcwd(), ".ssh/id_rsa") + if os.path.exists(os.path.join(os.getcwd(),".ssh"))==False: + os.makedirs(os.path.join(os.getcwd(), ".ssh")) + logger.debug("Write key: " + str(type(key)) + " to: " + self.userkey) + try: + f = open(self.userkey, "w") + for i in key: + logger.debug("write: " + str(i)) + f.write(i) + f.close() + os.chmod(self.userkey, 0600) + except: + self.__print_traceback() + + if self.__run_check()==False: + sys.stderr.write("Unable to connect/initialize Pilot-Data. Exit BigJob.") + raise PilotError("Unable to connect/initialize Pilot-Data. 
Exit BigJob.") + + + + def get_security_context(self): + """ Returns security context that needs to be available on the distributed + node in order to access this Pilot Data """ + if (self.security_context==None or self.security_context=="None") and self.pilot_data_description.has_key("userkey"): + f = open(self.pilot_data_description["userkey"]) + key = f.readlines() + f.close + self.security_context = {"userkey":key} + logger.debug("Return security context: " + str(self.security_context)) + return self.security_context + + + def initialize_pilotdata(self): + # check whether directory exists + try: + command = "mkdir -p %s"%self.path + self.__run_ssh_command(self.userkey, self.user, self.host, command) + except IOError: + self.__print_traceback() + # directory does not exist + self.__state=State.Running + + + def get_pilotdata_size(self): + return None + + + def delete_pilotdata(self): + self.__remove_directory(self.path) + self.__state=State.Done + + + def get_state(self): + if self.__client.get_transport().is_active()==True: + return self.__state + else: + self.__state=State.Failed + return self.__state + + def create_du(self, du_id): + du_dir = os.path.join(self.path, str(du_id)) + logger.debug("mkdir: " + du_dir) + command = "mkdir %s"%du_dir + self.__run_ssh_command(self.userkey, self.user, self.host, command) + + + def put_du(self, du): + self.put_du_scp(du) + + + def put_du_scp(self, du): + logger.debug("Copy DU using SCP") + du_items = du.list() + for i in du_items.keys(): + local_filename = du_items[i]["local"] + remote_path = os.path.join(self.path, str(du.id), os.path.basename(local_filename)) + logger.debug("Put file: %s to %s"%(i, remote_path)) + if local_filename.startswith("ssh://"): + # check if remote path is directory + if self.__is_remote_directory(local_filename): + logger.warning("Path %s is a directory. Ignored."%local_filename) + continue + + #self.__third_party_transfer(i.local_url, remote_path) + else: + try: + if stat.S_ISDIR(os.stat(local_filename).st_mode): + logger.warning("Path %s is a directory. 
Ignored."%local_filename) + continue + except: + pass + result = urlparse.urlparse(local_filename) + source_host = result.netloc + source_path = result.path + source_user = result.username + logger.debug(str((source_host, source_path, self.host, remote_path))) + self.__run_scp_command(self.userkey, source_user, source_host, source_path, self.user, self.host, remote_path) + + + def copy_du(self, du, pd_new): + remote_url = pd_new.service_url + "/" + str(du.id) + local_url = self.service_url + "/" + str(du.id) + self.copy_du_to_url(du, local_url, remote_url) + + + def get_du(self, du, target_url): + remote_url = target_url + local_url = self.service_url + "/" + str(du.id) + logger.debug("get_du(): copy %s to %s:"%(local_url, remote_url)) + self.copy_du_to_url(du, local_url, remote_url) + + + def remove_du(self, du): + self.__remove_directory(os.path.join(self.path, du.id)) + + + def put_progress(self, transfered_bytes, total_bytes): + logger.debug("Bytes transfered %d/%d"%(transfered_bytes, total_bytes)) + + + + #################################################################################### + # pure file management methods + # used by BJ file staging + def transfer(self, source_url, target_url): + self.__third_party_transfer_scp(source_url, target_url) + + + def create_remote_directory(self, target_url): + result = urlparse.urlparse(target_url) + target_host = result.hostname + target_path = result.path + target_user = result.username + logger.debug("Create directory: %s"%target_path) + command = "mkdir %s"%target_path + rc = self.__run_ssh_command(self.userkey, target_user, target_host, command) + if rc==0: + return True + else: + return False + + + def get_path(self, target_url): + result = urlparse.urlparse(target_url) + return result.path + + + def copy_du_to_url(self, du, local_url, remote_url): + self.create_remote_directory(remote_url) + self.__third_party_transfer_scp(local_url + "/*", remote_url) + + + ########################################################################### + # Private support methods + def __get_path_for_du(self, du): + return os.path.join(self.path, str(du.id)) + + + def __remove_directory(self, path): + """Remove remote directory that may contain files. + """ + if self.__exists(path): + command = "rm -rf %s"%path + rc = self.__run_ssh_command(self.userkey, self.user, self.host, command) + if rc==0: + return True + else: + return False + + + def __is_remote_directory(self, url): + result = urlparse.urlparse(url) + host = result.hostname + path = result.path + user = result.username + command = "test -d %s"%path + rc = self.__run_ssh_command(self.userkey, user, host, command) + if rc==0: + logger.debug("Directory found: %s"%path) + return True + else: + logger.debug("Directory not found: %s"%path) + return False + + + def __third_party_transfer_scp(self, source_url, target_url): + result = urlparse.urlparse(source_url) + source_host = result.hostname + source_path = result.path + source_user = result.username + if source_host==None or source_host=="": + source_host="localhost" + + result = urlparse.urlparse(target_url) + target_host = result.hostname + target_path = result.path + target_user = result.username + if target_host==None or target_host=="": + target_host="localhost" + + #check whether this is a local transfer + if os.path.exists(os.path.dirname(source_path)): + logger.debug("Target and source host are localhost. 
Processing: %s" %(source_path)) + expanded_path = glob.glob(source_path) + logger.debug("Expanded path: " + str(expanded_path)) + for path in expanded_path: + if os.path.isdir(path): + logger.debug("Source path %s is directory"%path) + files = os.listdir(path) + for i in files: + try: + os.symlink(os.path.join(files, i), target_path) + except: + self.__print_traceback() + else: + try: + os.symlink(path, os.path.join(target_path, os.path.basename(path))) + except: + self.__print_traceback() + else: + self.__run_scp_command(self.userkey, source_user, source_host, source_path, target_user, target_host, target_path) + + + + def __exists(self, path): + """Return True if the remote path exists + """ + command = "test -e %s"%path + rc = self.__run_ssh_command(self.userkey, self.user, self.host, command) + if rc==0: + return True + else: + return False + + + + def __run_ssh_command(self, userkey, user, host, command): + output = self.__run_ssh_command_rc(userkey, user, host, command)[0] + return output + + + def __run_ssh_command_rc(self, userkey, user, host, command): + prefix="" + if host != None: + prefix = "ssh " + SSH_OPTS + " " + if userkey != None: + prefix = prefix + " -i " + userkey + " " + if user!=None: + prefix = prefix + " " + user+ "@" + prefix = prefix + host + + command = prefix + " " + command + logger.debug(command.strip()) + child = pexpect.spawn(command.strip(), timeout=None) + output = child.readlines() + logger.debug("Run %s Output: %s"%(command, str(output))) + child.close() + return (output, child.exitstatus) + + + def __run_scp_command(self, userkey, source_user, source_host, source_path, target_user, target_host, target_path): + logger.debug("Create scp command: source_user: %s, source_host: %s"%(source_user, source_host)) + command = "scp " + SSH_OPTS + " " + if userkey != None: + command = command + "-i " + userkey + " " + + if source_host != None and source_host!="" and source_host!="localhost": + if source_user!=None: + command = command + " " + source_user + "@" + if source_host!="localhost": + command = command + source_host + ":" + + # path is a must parameter + command = command + source_path + " " + + logger.debug("Create scp command: target_user: %s, target_host: %s"%(str(target_user), str(target_host))) + if target_host != None and target_host!="" and target_host!="localhost": + if target_user!=None: + command = command + " " + target_user + "@" + if target_host!="localhost": + command = command + target_host + ":" + + command = command + target_path + logger.debug(command) + child = pexpect.spawn(command.strip(), timeout=None) + password_error=False + try: + child.timeout=300 + child.expect("password:",timeout=300, searchwindowsize=5024) + password_error=True + except Exception as ex: + logger.debug("No password prompt error found" + str(ex)) + + if password_error: + raise PilotError("SSH key-less login not correctly setup.") + output = child.readlines() + logger.debug("Run %s Output: %s"%(command, str(output))) + child.close() + return child.exitstatus + + def __run_check(self): + rc = self.__run_ssh_command_rc(self.userkey, self.user, self.host, "/bin/date")[1] + if rc == 0: + return True + else: + return False + + def __print_traceback(self): + exc_type, exc_value, exc_traceback = sys.exc_info() + print "*** print_tb:" + traceback.print_tb(exc_traceback, limit=1, file=sys.stderr) + print "*** print_exception:" + traceback.print_exception(exc_type, exc_value, exc_traceback, + limit=2, file=sys.stderr) + diff --git a/pilot/filemanagement/webhdfs_adaptor.py 
b/pilot/filemanagement/webhdfs_adaptor.py index 110ea610..35e27ede 100644 --- a/pilot/filemanagement/webhdfs_adaptor.py +++ b/pilot/filemanagement/webhdfs_adaptor.py @@ -1,6 +1,6 @@ import sys, os import stat -import urlparse +import urllib.parse sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)) + "/../../../webhdfs-py/") @@ -22,7 +22,7 @@ def __init__(self, service_url): self.service_url = service_url try: - result = urlparse.urlparse(service_url) + result = urllib.parse.urlparse(service_url) self.host = result.netloc self.path = result.path except: @@ -80,7 +80,7 @@ def copy_pd_to_url(self, pd, local_url, remote_url): logger.error("Only local URLs supported") return - result = urlparse.urlparse(remote_url) + result = urllib.parse.urlparse(remote_url) path = result.path # create directory try: diff --git a/pilot/filemanagement/webhdfs_adaptor.py.bak b/pilot/filemanagement/webhdfs_adaptor.py.bak new file mode 100644 index 00000000..110ea610 --- /dev/null +++ b/pilot/filemanagement/webhdfs_adaptor.py.bak @@ -0,0 +1,118 @@ +import sys, os +import stat +import urlparse + +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)) + "/../../../webhdfs-py/") + +from pilot.api import State +from bigjob import logger +logger.debug(str(sys.path)) + +from webhdfs.webhdfs import WebHDFS + + +class WebHDFSFileAdaptor(object): + + HDFS_USER_NAME="luckow" + HDFS_SERVICE_HOST="192.168.2.108" + HDFS_SERVICE_PORT=50070 + + def __init__(self, service_url): + + self.service_url = service_url + + try: + result = urlparse.urlparse(service_url) + self.host = result.netloc + self.path = result.path + except: + logger.error("Error parsing URL.") + + self.__state=State.New + self.__webhdfs= WebHDFS(self.HDFS_SERVICE_HOST, + self.HDFS_SERVICE_PORT, + self.HDFS_USER_NAME) + + def get_security_context(self): + """ Returns security context that needs to be available on the distributed + node in order to access this Pilot Data """ + return None + + + def initialize_pilotstore(self): + self.__webhdfs.mkdir(self.path) + + + def get_pilotstore_size(self): + return 0 + + + def delete_pilotstore(self): + self.__webhdfs.rmdir(self.path) + + def get_state(self): + return self.__state + + + def create_pd(self, pd_id): + pd_dir = self.__get_pd_path(pd_id) + logger.debug("mkdir: " + pd_dir) + self.__webhdfs.mkdir(pd_dir) + + + def put_pd(self, pd): + for i in pd.list_data_units(): + remote_path = os.path.join(self.__get_pd_path(pd.id), os.path.basename(i.local_url)) + logger.debug("Put file: %s to %s"%(i.local_url, remote_path)) + + if i.local_url.startswith("file://") or i.local_url.startswith("/"): + if stat.S_ISDIR(os.stat(i.local_url).st_mode): + logger.warning("Path %s is a directory. 
Ignored."%i.local_url) + continue + self.__webhdfs.copyFromLocal(i.local_url, remote_path) + else: + logger.error("File URLs: %s not supported"%i.local_url) + + + def copy_pd_to_url(self, pd, local_url, remote_url): + + if not remote_url.startswith("file://") and not remote_url.startswith("/"): + logger.error("Only local URLs supported") + return + + result = urlparse.urlparse(remote_url) + path = result.path + # create directory + try: + os.makedirs(path) + except: + logger.debug("Directory: %s already exists."%path) + + base_dir = self.__get_pd_path(pd.id) + for filename in self.__webhdfs.listdir(base_dir): + file_url = local_url + "/" + filename + file_remote_url = remote_url + "/" + filename + logger.debug("GET " + file_url + " to " + file_remote_url) + self.__webhdfs.copyToLocal(file_url, file_remote_url) + + + + def copy_pd(self, pd, ps_new): + pass + + + def get_pd(self, pd, target_url): + remote_url = target_url + local_url = self.__get_pd_path(pd.id) + self.copy_pd_to_url(pd, local_url, remote_url) + + + def remove_pd(self, pd): + self.__webhdfs.rmdir(self.__get_pd_path(pd.id)) + + + ########################################################################### + # Internal methods + def __get_pd_path(self, pd_id): + return os.path.join(self.path, str(pd_id)) + \ No newline at end of file diff --git a/pilot/impl/pilot_manager.py b/pilot/impl/pilot_manager.py index 808c6706..fe0c1e9e 100644 --- a/pilot/impl/pilot_manager.py +++ b/pilot/impl/pilot_manager.py @@ -11,10 +11,10 @@ import threading import logging import pdb -import Queue +import queue import uuid import traceback -import urlparse +import urllib.parse from bigjob import logger @@ -70,8 +70,8 @@ def __init__(self, cds_url=None): # Background Thread for scheduling self.scheduler = Scheduler() - self.cu_queue = Queue.Queue() - self.du_queue = Queue.Queue() + self.cu_queue = queue.Queue() + self.du_queue = queue.Queue() self.stop=threading.Event() self.scheduler_thread=threading.Thread(target=self._scheduler_thread) self.scheduler_thread.daemon=True @@ -150,11 +150,11 @@ def list_pilot_data(self): def list_data_units(self): """ List all DUs of CDS """ - return self.data_units.items() + return list(self.data_units.items()) def get_data_unit(self, du_id): - if self.data_units.has_key(du_id): + if du_id in self.data_units: return self.data_units[du_id] return None @@ -190,11 +190,11 @@ def wait(self): self.du_queue.join() logger.debug("DU queue empty") - for i in self.data_units.values(): + for i in list(self.data_units.values()): i.wait() logger.debug("DUs done") - for i in self.compute_units.values(): + for i in list(self.compute_units.values()): i.wait() logger.debug("CUs done") @@ -264,7 +264,7 @@ def _scheduler_thread(self): else: self.du_queue.task_done() self.du_queue.put(du) - except Queue.Empty: + except queue.Empty: pass try: @@ -281,7 +281,7 @@ def _scheduler_thread(self): logger.debug("No resource found.") self.cu_queue.task_done() self.cu_queue.put(cu) - except Queue.Empty: + except queue.Empty: pass except: exc_type, exc_value, exc_traceback = sys.exc_info() @@ -300,9 +300,9 @@ def _scheduler_thread(self): def __wait_for_du(self, compute_unit): """ wait for Data Units that are required for Compute Unit """ cu_description = compute_unit.compute_unit_description - if cu_description.has_key("input_data") and len(cu_description["input_data"])>0: + if "input_data" in cu_description and len(cu_description["input_data"])>0: for input_du_url in cu_description["input_data"]: - for du in self.data_units.values(): + for du in 
list(self.data_units.values()): if input_du_url == du.get_url(): logger.debug("Wait for DU: %s"%du.get_url()) du.wait() diff --git a/pilot/impl/pilot_manager.py.bak b/pilot/impl/pilot_manager.py.bak new file mode 100644 index 00000000..808c6706 --- /dev/null +++ b/pilot/impl/pilot_manager.py.bak @@ -0,0 +1,389 @@ +""" +B{ComputeDataService Module}: A central implementation of the L{ComputeDataService} + +A Meta-Scheduling service for pilots (both PilotCompute and PilotData) + +""" + +import sys +import os +import time +import threading +import logging +import pdb +import Queue +import uuid +import traceback +import urlparse + +from bigjob import logger + +import pilot +from pilot.api import ComputeDataService, State +from pilot.impl.pilotdata_manager import DataUnit +from pilot.impl.pilotcompute_manager import ComputeUnit + + +#from pilot.coordination.advert import AdvertCoordinationAdaptor as CoordinationAdaptor +#from pilot.coordination.nocoord_adaptor import NoCoordinationAdaptor as CoordinationAdaptor +from pilot.coordination.redis_adaptor import RedisCoordinationAdaptor as CoordinationAdaptor + +""" Loaded Module determines scheduler: + + bigdata.scheduler.data_compute_scheduler - selects random locations for PD and CUs + bigdata.scheduler.data_compute_affinity_scheduler - considers affinity descriptions + +""" +from pilot.scheduler.data_compute_affinity_scheduler import Scheduler + +class ComputeDataService(ComputeDataService): + """ B{ComputeDataService (CDS).} + + The ComputeDataService is the application's interface to submit + ComputeUnits and PilotData/DataUnit to the Pilot-Manager + in the P* Model. + """ + CDS_ID_PREFIX="cds-" + + + def __init__(self, cds_url=None): + """ Create a ComputeDataService object. + + Keyword arguments: + cds_url -- Reconnect to an existing CDS (optional). + """ + # Pilot Data + self.data_units={} + self.pilot_data_services=[] + + # Pilot Job + self.compute_units={} + self.pilot_job_services=[] + + if cds_url == None: + self.id=self.CDS_ID_PREFIX + str(uuid.uuid1()) + application_url = CoordinationAdaptor.get_base_url(pilot.application_id) + self.url = CoordinationAdaptor.add_cds(application_url, self) + else: + self.id = self.__get_cds_id(cds_url) + self.url = cds_url + + # Background Thread for scheduling + self.scheduler = Scheduler() + self.cu_queue = Queue.Queue() + self.du_queue = Queue.Queue() + self.stop=threading.Event() + self.scheduler_thread=threading.Thread(target=self._scheduler_thread) + self.scheduler_thread.daemon=True + self.scheduler_thread.start() + + + def __get_cds_id(self, cds_url): + start = cds_url.index(self.CDS_ID_PREFIX) + end =cds_url.index("/", start) + return cds_url[start:end] + + + ########################################################################### + # Pilot Compute + def add_pilot_compute_service(self, pcs): + """ Add a PilotComputeService to this CDS. + + @param pcs: The PilotComputeService to which this ComputeDataService will connect. + + """ + self.pilot_job_services.append(pcs) + CoordinationAdaptor.update_cds(self.url, self) + + + def remove_pilot_compute_service(self, pcs): + """ Remove a PilotJobService from this CDS. + + Note that it won't cancel the PilotComputeService, it will just no + longer be connected to this CDS. + + Keyword arguments: + @param pcs: The PilotComputeService to remove from this ComputeDataService. 
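# --- Illustrative aside (not part of the patch): the Queue -> queue rename used in the
# --- pilot_manager.py hunks above. The worker loop is a simplified, hypothetical sketch,
# --- not the CDS scheduler itself.
import queue   # Python 3 name of the Python 2 "Queue" module

work = queue.Queue()
work.put("compute-unit-1")

while True:
    try:
        item = work.get(True, 1)     # same get(block, timeout) signature as before
        print("scheduling %s" % item)
        work.task_done()
    except queue.Empty:              # Queue.Empty becomes queue.Empty
        break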
+ """ + self.pilot_job_services.remove(pcs) + CoordinationAdaptor.update_cds(self.url, self) + + + def submit_compute_unit(self, compute_unit_description): + """ Submit a CU to this Compute Data Service. + + @param compute_unit_description: The ComputeUnitDescription from the application + @return: ComputeUnit object + """ + cu = ComputeUnit(compute_unit_description, self) + self.compute_units[cu.id]=cu + self.cu_queue.put(cu) + CoordinationAdaptor.update_cds(self.url, self) + return cu + + def list_pilot_compute(self): + """ List all pilot compute of CDS """ + return self.pilot_job_service + + ########################################################################### + # Pilot Data + def add_pilot_data_service(self, pds): + """ Add a PilotDataService + + @param pds: The PilotDataService to add. + """ + self.pilot_data_services.append(pds) + CoordinationAdaptor.update_cds(self.url, self) + + def remove_pilot_data_service(self, pds): + """ Remove a PilotDataService + @param pds: The PilotDataService to remove + """ + self.pilot_data_services.remove(pds) + CoordinationAdaptor.update_cds(self.url, self) + + + def list_pilot_data(self): + """ List all pilot data of CDS """ + return self.pilot_data_services + + + def list_data_units(self): + """ List all DUs of CDS """ + return self.data_units.items() + + + def get_data_unit(self, du_id): + if self.data_units.has_key(du_id): + return self.data_units[du_id] + return None + + + def submit_data_unit(self, data_unit_description): + """ creates a data unit object and binds it to a physical resource (a pilotdata) """ + du = DataUnit(pilot_data=None, + data_unit_description=data_unit_description) + self.data_units[du.id]=du + self.du_queue.put(du) + # queue currently not persisted + CoordinationAdaptor.update_cds(self.url, self) + return du + + + def cancel(self): + """ Cancel the CDS. + All associated PD and PC objects are canceled. + """ + # terminate background thread + self.stop.set() + CoordinationAdaptor.delete_cds(self.url) + + def wait(self): + """ Waits for CUs and DUs. Return after all DU's have been placed (i.e. in state Running) + and all CU's have been completed (i.e. in state Done) or if a fault has occurred or + the user has cancelled a CU or DU. + """ + try: + logger.debug("### START WAIT ###") + self.cu_queue.join() + logger.debug("CU queue empty") + self.du_queue.join() + logger.debug("DU queue empty") + + for i in self.data_units.values(): + i.wait() + logger.debug("DUs done") + + for i in self.compute_units.values(): + i.wait() + logger.debug("CUs done") + + logger.debug("### END WAIT ###") + except: + logger.debug("Ctrl-c detected. 
Terminating ComputeDataService...") + self.cancel() + raise KeyboardInterrupt + + + def get_state(self): + "@return: State of the ComputeDataService" + return self.state + + + def get_id(self): + "@return: id of ComputeDataService" + return str(self.id) + + + def __del__(self): + """ Make sure that background thread terminates""" + self.cancel() + + ########################################################################### + # Internal Scheduling + def __update_scheduler_resources(self): + logger.debug("__update_scheduler_resources") + pd = [s for i in self.pilot_data_services for s in i.list_pilots()] + self.scheduler.set_pilot_data(pd) + pj = [p for i in self.pilot_job_services for p in i.list_pilots()] + logger.debug("Pilot-Jobs: " + str(pj)) + self.scheduler.set_pilot_jobs(pj) + + def _schedule_du(self, du): + """ Schedule DU to a suitable pilot data + + Currently one level of scheduling is used: + 1.) Add all resources managed by the contained PDS + 2.) Select one resource + """ + logger.debug("Schedule PD") + self.__update_scheduler_resources() + selected_pilot_data = self.scheduler.schedule_pilot_data(du.data_unit_description) + return selected_pilot_data + + def _schedule_cu(self, cu): + logger.debug("Schedule CU") + self.__update_scheduler_resources() + selected_pilot_job = self.scheduler.schedule_pilot_job(cu.compute_unit_description) + return selected_pilot_job + + def _scheduler_thread(self): + while True and self.stop.isSet()==False: + try: + #logger.debug("Scheduler Thread: " + str(self.__class__) + " Pilot Data") + du = self.du_queue.get(True, 1) + # check whether this is a real du object + if isinstance(du, DataUnit): + pd=self._schedule_du(du) + if(pd!=None): + logger.debug("Initiate Transfer to PD.") + du.add_pilot_data(pd) + logger.debug("Transfer to PD finished.") + du._update_state(State.Running) + self.du_queue.task_done() + else: + self.du_queue.task_done() + self.du_queue.put(du) + except Queue.Empty: + pass + + try: + #logger.debug("Scheduler Thread: " + str(self.__class__) + " Pilot Job") + cu = self.cu_queue.get(True, 1) + if isinstance(cu, ComputeUnit): + self.__wait_for_du(cu) + pj=self._schedule_cu(cu) + if pj !=None: + cu = self.__expand_working_directory(cu, pj) + pj._submit_cu(cu) + self.cu_queue.task_done() + else: + logger.debug("No resource found.") + self.cu_queue.task_done() + self.cu_queue.put(cu) + except Queue.Empty: + pass + except: + exc_type, exc_value, exc_traceback = sys.exc_info() + logger.error("*** print_tb:") + traceback.print_tb(exc_traceback, limit=1, file=sys.stderr) + logger.error("*** print_exception:") + traceback.print_exception(exc_type, exc_value, exc_traceback, + limit=2, file=sys.stderr) + + if self.cu_queue.empty() and self.du_queue.empty(): + time.sleep(5) + + logger.debug("Re-Scheduler terminated") + + + def __wait_for_du(self, compute_unit): + """ wait for Data Units that are required for Compute Unit """ + cu_description = compute_unit.compute_unit_description + if cu_description.has_key("input_data") and len(cu_description["input_data"])>0: + for input_du_url in cu_description["input_data"]: + for du in self.data_units.values(): + if input_du_url == du.get_url(): + logger.debug("Wait for DU: %s"%du.get_url()) + du.wait() + + def __expand_working_directory(self, compute_unit, pilot_job): + """ + DEPRECATED capability! 
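# --- Illustrative aside (not part of the patch): in Python 3, dict.items(), dict.keys()
# --- and dict.values() return views, which is why the converted modules wrap them in
# --- list() where a real list is expected or the dict may change during iteration.
# --- data_units below is a hypothetical stand-in for the CDS bookkeeping dict.
data_units = {"du-1": "unit one", "du-2": "unit two"}

snapshot = list(data_units.items())   # safe to return or iterate even if the dict mutates
if "du-1" in data_units:              # replaces data_units.has_key("du-1")
    for du in list(data_units.values()):
        print(du)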
+ Expand pilotdata:// url specified in the compute_unit_description + to a local url on the machine of the PJ + + pilotdata://localhost/434bfc5c-23fd-11e1-a43f-00264a13ca4c + + to + + /tmp/pilotstore//434bfc5c-23fd-11e1-a43f-00264a13ca4c on machine running pilot_job + """ + #======================================================================= + # if compute_unit.compute_unit_description.has_key("working_directory"): + # working_directory=compute_unit.compute_unit_description["working_directory"] + # if working_directory.find(DataUnit.DU_ID_PREFIX)!=-1: + # pilot_data_url = working_directory + # pj_description = pilot_job.pilot_compute_description + # pj_dc_affinity = pj_description["affinity_datacenter_label"] + # pj_machine_affinity = pj_description["affinity_machine_label"] + # pd = [s for i in self.pilot_data_services for s in i.list_pilots()] + # + # # find all pilot stores with the same affinity + # candidate_pd = [] + # for i in pd: + # pd_description = i.pilot_data_description + # pd_dc_affinity = pd_description["affinity_datacenter_label"] + # pd_machine_affinity = pd_description["affinity_machine_label"] + # if pd_dc_affinity == pj_dc_affinity and pd_machine_affinity == pj_machine_affinity: + # candidate_pd.append(i) + # + # # check whether required pilot_data is part of pilot_data + # target_pd = None + # target_du = None + # for pd in candidate_pd: + # for du in pd.list_data_units(): + # logger.debug("DU URL:%s"%(du.url)) + # if du.url == pilot_data_url: + # logger.debug("Found PD %s at %s"%(du.url, pd.service_url)) + # target_pd = pd + # target_du = du + # break + # if target_du == None: + # self.__stage_du_to_pj(pilot_data_url, pilot_job) + # + # if target_pd!=None: + # pd_url = target_pd.url_for_du(target_du) + # components = urlparse.urlparse(pd_url) + # compute_unit.compute_unit_description["working_directory"] = components.path + # compute_unit._update_compute_unit_description(compute_unit.compute_unit_description) + # logger.debug("__expand_working_directory %s: Set working directory to %s"%(pilot_data_url, compute_unit.compute_unit_description["working_directory"])) + # return compute_unit + # + #======================================================================= + return compute_unit + + + def __stage_du_to_pj(self, pilotdata, pilotjob): + """ + stage required files to machine of pilot job + """ + pass + + def __find_pd_at_pj_resource(self, pilotjob): + pass + + +############################################################################### +# Unimplemented entities + +class ComputeUnitService(object): + def __init__(self): + raise NotImplementedError("Please use ComputeDataService.") + + +class DataUnitService(object): + def __init__(self): + raise NotImplementedError("Please use ComputeDataService.") + + diff --git a/pilot/impl/pilot_manager_decentral.py b/pilot/impl/pilot_manager_decentral.py index 11bf4e94..e5635e6c 100644 --- a/pilot/impl/pilot_manager_decentral.py +++ b/pilot/impl/pilot_manager_decentral.py @@ -9,10 +9,10 @@ import threading import logging import pdb -import Queue +import queue import uuid import traceback -import urlparse +import urllib.parse import bigjob from bigjob import logger, bigjob, subjob, description @@ -70,7 +70,7 @@ def __init__(self, cds_url=None): # Background Thread for scheduling self.scheduler = Scheduler() - self.du_queue = Queue.Queue() + self.du_queue = queue.Queue() self.stop=threading.Event() self.scheduler_thread=threading.Thread(target=self._scheduler_thread) @@ -170,9 +170,9 @@ def __submit_cu(self, compute_unit): 
def __wait_for_du(self, compute_unit): """ wait for Data Units that are required for Compute Unit """ cu_description = compute_unit.compute_unit_description - if cu_description.has_key("input_data") and len(cu_description["input_data"])>0: + if "input_data" in cu_description and len(cu_description["input_data"])>0: for input_du_url in cu_description["input_data"]: - for du in self.data_units.values(): + for du in list(self.data_units.values()): if input_du_url == du.get_url(): logger.debug("Wait for DU: %s"%du.get_url()) du.wait() @@ -203,11 +203,11 @@ def list_pilot_data(self): def list_data_units(self): """ List all DUs of CDS """ - return self.data_units.items() + return list(self.data_units.items()) def get_data_unit(self, du_id): - if self.data_units.has_key(du_id): + if du_id in self.data_units: return self.data_units[du_id] return None @@ -241,8 +241,8 @@ def wait(self): the user has cancelled a CU or DU. """ try: - dus = self.data_units.values() - cus = self.compute_units.values() + dus = list(self.data_units.values()) + cus = list(self.compute_units.values()) pilots = [] for i in self.pilot_job_services: pilots.extend(i.list_pilots()) @@ -298,9 +298,9 @@ def wait(self): logger.debug("### END WAIT ###") except: exc_type, exc_value, exc_traceback = sys.exc_info() - print "*** print_tb:" + print("*** print_tb:") traceback.print_tb(exc_traceback, limit=1, file=sys.stderr) - print "*** print_exception:" + print("*** print_exception:") traceback.print_exception(exc_type, exc_value, exc_traceback, limit=2, file=sys.stderr) logger.debug("Ctrl-c detected. Terminating ComputeDataService...") @@ -363,7 +363,7 @@ def _scheduler_thread(self): else: self.du_queue.task_done() self.du_queue.put(du) - except Queue.Empty: + except queue.Empty: pass if self.du_queue.empty(): diff --git a/pilot/impl/pilot_manager_decentral.py.bak b/pilot/impl/pilot_manager_decentral.py.bak new file mode 100644 index 00000000..11bf4e94 --- /dev/null +++ b/pilot/impl/pilot_manager_decentral.py.bak @@ -0,0 +1,377 @@ +""" B{ComputeDataServiceDecentral Module}: A decentral implementation of a ComputeDataService (see L{ComputeDataServiceDecentral}). + +A Meta-Scheduling service for pilots (both PilotCompute and PilotData) +""" + +import sys +import os +import time +import threading +import logging +import pdb +import Queue +import uuid +import traceback +import urlparse + +import bigjob +from bigjob import logger, bigjob, subjob, description + +import pilot +from pilot.api import ComputeDataService, State +from pilot.api.api import PilotError +from pilot.impl.pilotdata_manager import PilotData, DataUnit +from pilot.impl.pilotcompute_manager import PilotCompute, PilotComputeService +from pilot.impl.pilot_manager import ComputeUnit + + +#from pilot.coordination.advert import AdvertCoordinationAdaptor as CoordinationAdaptor +from pilot.coordination.redis_adaptor import RedisCoordinationAdaptor as CoordinationAdaptor + +""" Loaded Module determines scheduler: + + bigdata.scheduler.data_compute_scheduler - selects random locations for PD and CUs + bigdata.scheduler.data_compute_affinity_scheduler - considers affinity descriptions + +""" +from pilot.scheduler.data_compute_affinity_scheduler import Scheduler + +class ComputeDataServiceDecentral(ComputeDataService): + """ B{ComputeDataServiceDecentral.} + + The ComputeDataService is the application's interface to submit + ComputeUnits and PilotData/DataUnit to the Pilot-Manager + in the P* Model. + + The decentral ComputeDateService will only work with Redis! 
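# --- Illustrative aside (not part of the patch): the print-statement -> print() conversion
# --- used for the traceback dumps in the hunks above. The failing int() call is a made-up
# --- example that only exists to trigger an exception.
import sys
import traceback

try:
    int("not a number")
except ValueError:
    exc_type, exc_value, exc_traceback = sys.exc_info()
    print("*** print_tb:")                                   # print is a function in Python 3
    traceback.print_tb(exc_traceback, limit=1, file=sys.stderr)
    print("*** print_exception:")
    traceback.print_exception(exc_type, exc_value, exc_traceback,
                              limit=2, file=sys.stderr)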
+ """ + CDS_ID_PREFIX="cds-" + + def __init__(self, cds_url=None): + """ Create a ComputeDataService (Decentral) object. + + @param cds_url: Reconnect to an existing CDS (optional). + """ + # Pilot Data + self.data_units={} + self.pilot_data_services=[] + + # Pilot Compute + self.compute_units={} + self.pilot_job_services=[] + + if cds_url == None: + self.id=self.CDS_ID_PREFIX + str(uuid.uuid1()) + application_url = CoordinationAdaptor.get_base_url(pilot.application_id) + self.url = CoordinationAdaptor.add_cds(application_url, self) + else: + self.id = self.__get_cds_id(cds_url) + self.url = cds_url + + # Background Thread for scheduling + self.scheduler = Scheduler() + self.du_queue = Queue.Queue() + + self.stop=threading.Event() + self.scheduler_thread=threading.Thread(target=self._scheduler_thread) + self.scheduler_thread.daemon=True + self.scheduler_thread.start() + logger.debug("Created ComputeDataServiceDecentral") + + + def __get_cds_id(self, cds_url): + start = cds_url.index(self.CDS_ID_PREFIX) + end =cds_url.index("/", start) + return cds_url[start:end] + + + ########################################################################### + # Pilot Compute + def add_pilot_compute_service(self, pcs): + """ Add a PilotComputeService to this CDS. + + @param pcs: The PilotComputeService to which this ComputeDataService will connect. + + """ + self.pilot_job_services.append(pcs) + CoordinationAdaptor.update_cds(self.url, self) + if len(self.pilot_job_services)>1: + logger.error("Decentral ComputeDataService only supports 1 PilotComputeService") + raise PilotError("Decentral ComputeDataService only supports 1 PilotComputeService") + + + def remove_pilot_compute_service(self, pcs): + """ Remove a PilotJobService from this CDS. + + Note that it won't cancel the PilotJobService, it will just no + longer be connected to this WUS. + + Keyword arguments: + pilotjob_services -- The PilotJob Service(s) to remove from this + Work Unit Service. + + Return: + Result + """ + self.pilot_job_services.remove(pcs) + CoordinationAdaptor.update_cds(self.url, self) + if len(self.pilot_job_services)>1: + logger.error("Decentral ComputeDataService only supports 1 PilotComputeService") + raise PilotError("Decentral ComputeDataService only supports 1 PilotComputeService") + + + def submit_compute_unit(self, compute_unit_description): + """ Submit a CU to this Compute Data Service. + @param compute_unit_description: The L{ComputeUnitDescription} from the application + @return: L{ComputeUnit} object + """ + cu = ComputeUnit(compute_unit_description, self) + self.compute_units[cu.id]=cu + self.__submit_cu(cu) + return cu + + + def list_pilot_compute(self): + """ List all pilot compute of CDS """ + return self.pilot_job_service + + + def get_details(self): + """ returns a list with dicts that contains the details of the Pilot Compute, + - job state + - description + - ... + """ + pilot_details=[] + for pcs in self.pilot_job_services: + for pc in pcs.list_pilots(): + pilot_details.append(pc.get_details()) + return pilot_details + + ########################################################################### + # Compute Data Service private methods + def __submit_cu(self, compute_unit): + """ Submits compute unit to Bigjob """ + + if len(self.pilot_job_services)!=1: + raise PilotError("No PilotComputeService found. 
Please start a PCS before submitting ComputeUnits.") + + self.__wait_for_du(compute_unit) + + sj = subjob() + self.pcs_coordination_namespace=self.pilot_job_services[0].coordination_queue + logger.debug("Submit CU to big-job via external queue: %s"%self.pcs_coordination_namespace) + sj.submit_job(self.pcs_coordination_namespace, compute_unit.subjob_description) + + compute_unit._update_subjob(sj) + return compute_unit + + + def __wait_for_du(self, compute_unit): + """ wait for Data Units that are required for Compute Unit """ + cu_description = compute_unit.compute_unit_description + if cu_description.has_key("input_data") and len(cu_description["input_data"])>0: + for input_du_url in cu_description["input_data"]: + for du in self.data_units.values(): + if input_du_url == du.get_url(): + logger.debug("Wait for DU: %s"%du.get_url()) + du.wait() + + + ########################################################################### + # Pilot Data + def add_pilot_data_service(self, pds): + """ Add a PilotDataService + + @param pds: The PilotDataService to add. + """ + self.pilot_data_services.append(pds) + CoordinationAdaptor.update_cds(self.url, self) + + def remove_pilot_data_service(self, pds): + """ Remove a PilotDataService + @param pds: The PilotDataService to remove + """ + self.pilot_data_services.remove(pds) + CoordinationAdaptor.update_cds(self.url, self) + + + def list_pilot_data(self): + """ List all pilot data of CDS """ + return self.pilot_data_services + + + def list_data_units(self): + """ List all DUs of CDS """ + return self.data_units.items() + + + def get_data_unit(self, du_id): + if self.data_units.has_key(du_id): + return self.data_units[du_id] + return None + + + def submit_data_unit(self, data_unit_description): + """ creates a data unit object and binds it to a physical resource (a pilotdata) """ + du = DataUnit(pilot_data=None, + data_unit_description=data_unit_description) + self.data_units[du.id]=du + self.du_queue.put(du) + # queue currently not persisted + CoordinationAdaptor.update_cds(self.url, self) + return du + + + ########################################################################### + # General + + def cancel(self): + """ Cancel the CDS. + All associated PD and PC objects are canceled. + """ + # terminate background thread + self.stop.set() + CoordinationAdaptor.delete_cds(self.url) + + + def wait(self): + """ Waits for CUs and DUs. Return after all DU's have been placed (i.e. in state Running) + and all CU's have been completed (i.e. in state Done) or if a fault has occurred or + the user has cancelled a CU or DU. + """ + try: + dus = self.data_units.values() + cus = self.compute_units.values() + pilots = [] + for i in self.pilot_job_services: + pilots.extend(i.list_pilots()) + number_dus=len(dus) + number_cus=len(cus) + number_pilots=len(pilots) + completed_dus=0 + completed_cus=0 + completed_pilots=0 + logger.debug("### ComputeDataService wait for completion of %d CUs/ %d DUs ###"%(len(cus), len(dus))) + + while not (completed_dus==number_dus and completed_cus==number_cus): + completed_dus=0 + completed_cus=0 + completed_pilots=0 + + for p in pilots: + state = p.get_state() + if state==State.Done or state==State.Failed: + completed_pilots = completed_pilots + 1 + + if completed_pilots==number_pilots: + logger.debug("All pilots done/failed. No more active pilots. 
Exit.") + break + + for cu in cus: + state = cu.get_state() + if state==State.Done or state==State.Failed: + completed_cus=completed_cus + 1 + + for du in dus: + state = du.get_state() + if state==State.Running or state==State.Failed: + completed_dus=completed_dus + 1 + + logger.debug("Compute Data Service Completion Status: %d/%d CUs %d/%d DUs %d/%d Pilots"% + (completed_cus, number_cus, completed_dus, + number_dus, completed_pilots, number_pilots)) + + logger.debug("exit? " + str((completed_dus==number_dus and completed_cus==number_cus))) + if completed_dus + gsissh:// + go:// + gs://google.com + s3://aws.amazon.com + + In the future more SAGA/Bliss URL schemes/adaptors are supported. + """ + self.id = None + self.url = pd_url + self.pilot_data_description = None + self.pilot_data_service = pilot_data_service + self.service_url=None + self.size = None + self.data_unit_urls = [] + self.security_context = None + + if pd_url==None and pilot_data_service!=None: # new pd + self.id = self.PD_ID_PREFIX+str(uuid.uuid1()) + self.pilot_data_description = pilot_data_description + self.url = CoordinationAdaptor.add_pd(CoordinationAdaptor.get_base_url(application_id)+":"+pilot_data_service.id, self) + elif pd_url != None: + logger.warn("Reconnect to PilotData: %s"%pd_url) + dictionary = CoordinationAdaptor.get_pd(pd_url) + if dictionary.has_key("security_context"): + self.security_context=dictionary["security_context"] + pd_dict = eval(dictionary["pilot_data"]) + for i in pd_dict: + self.__setattr__(i, pd_dict[i]) + # A Pilot Data does not hold a direct reference to a Data Unit (only URL refs are stored) + self.data_unit_urls = eval(dictionary["data_unit_urls"]) + + self.__initialize_pilot_data() + CoordinationAdaptor.update_pd(self) + + + def cancel(self): + """ Cancel PilotData """ + #self.__filemanager.delete_pilotdata() + pass + + + def get_url(self): + """ Get URL of PilotData. 
Used for reconnecting to PilotData """ + return self.url + + + def url_for_du(self, du): + """ Get full URL to DataUnit within PilotData """ + return self.service_url + "/" + str(du.id) + + + def submit_data_unit(self, data_unit_description=None, data_unit=None): + """ creates a data unit object and initially imports data specified in data_unit_description """ + if data_unit!=None: + du = data_unit + else: + du = DataUnit(pilot_data=self, + data_unit_description=data_unit_description) + self.data_unit_urls.append(du.get_url()) + du.add_pilot_data(self) + return du + + + def list_data_units(self): + """ List all data units of Pilot Data """ + return self.data_unit_urls + + + def get_state(self): + """ Return current state of Pilot Data """ + return self.__filemanager.get_state() + + + def get_du(self, du_url): + """ Returns Data Unit if part of Pilot Data """ + if self.data_unit_urls.count(du_url)>0: + du = DataUnit(du_url=du_url) + return du + return None + + + def wait(self): + """ Wait until PD enters a final state (Done, Canceled or Failed).""" + while 1: + finish_counter=0 + result_map = {} + for du_url in self.data_units_urls: + du = DataUnit(du_url=du_url) + du.wait() + state = du.get_state() + #state = job_detail["state"] + if result_map.has_key(state)==False: + result_map[state]=1 + else: + result_map[state] = result_map[state]+1 + if self.__has_finished(state)==True: + finish_counter = finish_counter + 1 + logger.debug("PD ID: " + str(self.id) + " Total DUs: %s States: %s"%(len(self.data_units_urls), str(result_map))) + if finish_counter == len(self.data_units_urls): + break + time.sleep(2) + + + def export_du(self, du, target_url): + """ Export Data Unit to a local directory """ + if target_url.startswith("/") and os.path.exists(target_url)==False: + os.mkdir(target_url) + logger.debug("Export Data-Unit to %s"%target_url) + self.__filemanager.get_du(du, target_url) + + + def put_du(self, du): + """Copy Data Unit to Pilot Data""" + logger.debug("Put DU: %s to Pilot-Data: %s"%(du.id,self.service_url)) + self.__filemanager.create_du(du.id) + self.__filemanager.put_du(du) + self.data_unit_urls.append(du.get_url()) + CoordinationAdaptor.update_pd(self) + + + def remove_du(self, du): + """ Remove Data Unit from Pilot Data """ + if self.data_unit_urls.count(du.get_url())>0: + self.__filemanager.remove_du(du) + self.data_unit_urls.remove(du.get_url()) + CoordinationAdaptor.update_pd(self) + + + def copy_du(self, du, pd_new): + """ Copy DataUnit to another Pilot Data """ + pd_new.create_du(du) + self.__filemanager.copy_du(du, pd_new) + + # update meta data at pd_new + #pd_new.data_units[du.id] = du + pd_new.data_unit_urls.append(du.get_url()) + CoordinationAdaptor.update_pd(pd_new) + + + # END API methods + ########################################################################### + # Auxillary Methods + + def create_du(self, du): + """ Create a new Data Unit within Pilot """ + self.__filemanager.create_du(du.id) + + + def __initialize_pilot_data(self): + + if self.pilot_data_description!=None: + self.service_url=self.pilot_data_description["service_url"] + if self.pilot_data_description.has_key("size"): + self.size = self.pilot_data_description["size"] + + # initialize file adaptor + if self.service_url.startswith("ssh:"): + logger.debug("Use SSH backend") + self.__filemanager = SSHFileAdaptor(self.service_url, + self.security_context, + self.pilot_data_description) + elif self.service_url.startswith("http:"): + logger.debug("Use WebHDFS backend") + self.__filemanager = 
WebHDFSFileAdaptor(self.service_url) + elif self.service_url.startswith("go:"): + logger.debug("Use Globus Online backend") + self.__filemanager = GlobusOnlineFileAdaptor(self.service_url) + elif self.service_url.startswith("gs:"): + logger.debug("Use Google Cloud Storage backend") + self.__filemanager = GSFileAdaptor(self.service_url, self.security_context) + elif self.service_url.startswith("irods:"): + logger.debug("Use iRods Storage backend") + self.__filemanager = iRodsFileAdaptor(self.service_url, self.security_context) + elif self.service_url.startswith("s3:") \ + or self.service_url.startswith("walrus:") \ + or self.service_url.startswith("swift:"): + logger.debug("Use Amazon S3/Eucalyptus Walrus/SWIFT Storage backend") + self.__filemanager = S3FileAdaptor(self.service_url, + self.security_context, + self.pilot_data_description) + else: + raise PilotError("No File Plugin found.") + + self.__filemanager.initialize_pilotdata() + self.__filemanager.get_pilotdata_size() + + # Update security context + self.security_context = self.__filemanager.get_security_context() + + + def __get_pd_id(self, pd_url): + start = pd_url.index(self.PD_ID_PREFIX) + end =pd_url.index("/", start) + return pd_url[start:end] + + + + def to_dict(self): + """ Internal method that returns a dict with all data contained in this Pilot Data""" + pd_dict = {} + pd_dict["id"]=self.id + pd_dict["url"]=self.url + pd_dict["pilot_data_description"]=self.pilot_data_description + logger.debug("PilotData Dictionary: " + str(pd_dict)) + return pd_dict + + + def __repr__(self): + """Returns Pilot Data URL""" + return self.service_url + + + def __has_finished(self, state): + state = state.lower() + if state=="running" or state=="failed" or state=="canceled": + return True + else: + return False + + @classmethod + def create_pilot_data_from_dict(cls, pd_dict): + """Restore Pilot Data from dictionary""" + pd = PilotData() + for i in pd_dict.keys(): + pd.__setattr__(i, pd_dict[i]) + pd.__initialize_pilot_data() + logger.debug("created pd " + str(pd)) + return pd + + +############################################################################### +COORDINATION_URL = "redis://localhost" + +class PilotDataService(PilotDataService): + """ B{PilotDataService (PDS).} + + Factory for creating Pilot Data. + + """ + + PDS_ID_PREFIX="pds-" + + def __init__(self, coordination_url=COORDINATION_URL, pds_url=None): + """ Create a PilotDataService + + Keyword arguments: + pds_id -- restore from pds_id + """ + self.pilot_data={} + CoordinationAdaptor.configure_base_url(coordination_url) + if pds_url == None: + self.id = self.PDS_ID_PREFIX + str(uuid.uuid1()) + application_url = CoordinationAdaptor.get_base_url(application_id) + self.url = CoordinationAdaptor.add_pds(application_url, self) + else: + self.id = self.__get_pds_id(pds_url) + + + + def create_pilot(self, pilot_data_description): + """ Create a PilotData + + Keyword arguments: + pilot_data_description -- PilotData Description:: + + { + 'service_url': "ssh:///base-url/", + 'size': "1000" + } + + Return value: + A PilotData object + """ + pd = PilotData(pilot_data_service=self, + pilot_data_description=pilot_data_description) + self.pilot_data[pd.id]=pd + + # store pilot data in central data space + CoordinationAdaptor.add_pd(self.url, pd) + return pd + + + def get_pilot(self, pd_id): + """ Reconnect to an existing pilot. 
""" + if self.pilot_data.has_key(pd_id): + return self.pilot_data[pd_id] + return None + + + def list_pilots(self): + """ List all PDs of PDS """ + return self.pilot_data.values() + + + def cancel(self): + """ Cancel the PilotDataService. Release all Pilot Data created by this service. + + Keyword arguments: + None + + Return value: + Result of operation + """ + for i in self.pilot_data.values(): + i.cancel() + + + def wait(self): + """ Wait until all managed PD (of this Pilot Data Service) enter a final state""" + + for i in self.pilot_data.values(): + i.wait() + + + def get_url(self): + """ Returns URL of Pilot Data Service """ + return self.url + + ########################################################################### + # Non-API methods + def to_dict(self): + """ Return a Python dictionary containing the representation of the PDS + (internal method not part of Pilot API) + """ + pds_dict = self.__dict__ + pds_dict["id"]=self.id + return pds_dict + + + def __del__(self): + """Releases all Pilot Data created by this Pilot Data Service.""" + self.cancel() + + + def __get_pds_id(self, pds_url): + start = pds_url.index(self.PDS_ID_PREFIX) + end =pds_url.index("/", start) + return pds_url[start:end] + + + def __restore_pd(self, pds_url): + pd_list=CoordinationAdaptor.list_pd(pds_url) + for i in pd_list: + pass + + +class DataUnit(DataUnit): + """ B{DataUnit (DU).} + + This is the object that is returned by the ComputeDataService when a + new DataUnit is created based on a DataUnitDescription. + + The DataUnit object can be used by the application to keep track + of a DataUnit. + + A DataUnit has state, can be queried and can be cancelled. + + + + State model: + - New: PD object created + - Pending: PD object is currently updated + - Running: At least 1 replica of PD is persistent in a pilot data + """ + + ## TODO + # DU are stored as top-level objects in Redis: + # redis://localhost//du- + # + # In the future a DU can be possibly bound to multiple PD + # Thus, it should be a top level entity + # The lower levels of the hierarchy will only store references to the DU then + + + DU_ID_PREFIX="du-" + + def __init__(self, pilot_data=None, data_unit_description=None, du_url=None): + """ + 1.) create a new Pilot Data: pilot_data_service and data_unit_description required + 2.) reconnect to an existing Pilot Data: du_url required + + """ + if du_url==None: + self.id = self.DU_ID_PREFIX + str(uuid.uuid1()) + self.data_unit_description = data_unit_description + self.pilot_data=[] + self.state = State.New + self.data_unit_items=[] + if self.data_unit_description.has_key("file_urls"): + self.data_unit_items = DataUnitItem.create_data_unit_list(self, self.data_unit_description["file_urls"]) + + self.url = None + + # register a data unit as top-level entry in Redis + application_url = CoordinationAdaptor.get_base_url(application_id) + self.url = CoordinationAdaptor.add_du(application_url, self) + CoordinationAdaptor.update_du(self) + + # Deprecated + # old method only allowed the creation of a du if a pd existed + #if pilot_data!=None: + # # Allow data units that are not connected to a resource! + # self.url = CoordinationAdaptor.add_du(pilot_data.url, self) + # CoordinationAdaptor.update_du(self) + else: + self.id = DataUnit._get_du_id(du_url) + self.url = du_url + logger.debug("Restore du: %s"%self.id) + self.__restore_state() + + self.transfer_threads=[] + + + def cancel(self): + """ Cancel the Data Unit. 
""" + self.state = State.Done + if len(self.pilot_data) > 0: + CoordinationAdaptor.update_du(self) + + + def add_files(self, file_url_list=[]): + """Add files referenced in list to Data Unit""" + self._update_state(State.Pending) + item_list = DataUnitItem.create_data_unit_from_urls(None, file_url_list) + for i in item_list: + self.data_unit_items.append(i) + CoordinationAdaptor.update_du(self) + if len(self.pilot_data) > 0: + for i in self.pilot_data: + logger.debug("Update Pilot Data %s"%(i.get_url())) + i.put_du(self) + self._update_state(State.Running) + CoordinationAdaptor.update_du(self) + + + def remove_files(self, file_urls): + """Remove files from Data Unit (NOT implemented yet""" + # TODO + #self.data_unit_items.remove(input_data_unit) + if len(self.pilot_data) > 0: + CoordinationAdaptor.update_du(self) + + + def list_pilot_data(self): + pd_urls = [] + for i in self.pilot_data: + pd_urls.append(i.get_url()) + return pd_urls + + + def list(self): + """ List all items contained in DU + { + "filename" : { + "pilot_data" : [url1, url2], + "local" : url + } + } + """ + self.__refresh() + base_urls = [i.url_for_du(self) for i in self.get_pilot_data()] + result_dict = {} + for i in self.data_unit_items: + logger.debug("Process file: %s"%(i.filename)) + result_dict[i.filename]={ + "pilot_data": [os.path.join(j, i.filename) for j in base_urls], + "local": i.local_url + } + return result_dict + + + + def get_state(self): + """ Return current state of DataUnit """ + # update remote state + du_dict = CoordinationAdaptor.get_du(self.url) + self.state = du_dict["state"] + return self.state + + + def wait(self): + """ Wait until in running state + (or failed state) + """ + logger.debug("DU: %s wait()"%(str(self.id))) + # Wait for all transfers to finish + for i in self.transfer_threads: + i.join() + + # Wait for state to change + while True: + self.state = self.get_state() + if self.state==State.Running or self.state==State.Failed: + break + logger.debug("Waiting DU %s State: %s"%(self.get_url(), self.state)) + time.sleep(2) + + + def add_pilot_data(self, pilot_data): + """ add this DU (self) to a certain pilot data + data will be moved into this data + """ + transfer_thread=threading.Thread(target=self.__add_pilot_data, args=[pilot_data]) + transfer_thread.start() + self.transfer_threads.append(transfer_thread) + + + def get_pilot_data(self): + """ get a list of pilot data that have a copy of this PD """ + self.__restore_state() + return self.pilot_data + + + def export(self, target_url): + """ simple implementation of export: + copies file from first pilot data to local machine + """ + if self.get_state()!=State.Running: + self.wait() + + if len(self.pilot_data) > 0: + # Search for PD that is close to local machine + local_hostname=socket.getfqdn() + max_score=0 + best_pd=None + for pd in self.pilot_data: + pd_host = SAGAUrl(pd.service_url).host + pd_score = difflib.SequenceMatcher(a=pd_host, b=local_hostname).ratio() + logger.debug("Export locality compute score: Localhost: %s PD at: %s Score: %s"%(local_hostname, pd_host, pd_score)) + if pd_score > max_score: + best_pd=pd + max_score=pd_score + + #pd_domain = tldextract.extract(pd.service_url).domain + #local_domain = tldextract.extract(socket.getfqdn()).domain + + if best_pd!=None: + logger.debug("Export from: %s"%(best_pd.service_url)) + best_pd.export_du(self, target_url) + return + + # No PD found. 
Utilize default PD + logger.debug("Export from random PD") + self.pilot_data[0].export_du(self, target_url) + else: + logger.error("No Pilot Data for PD found") + + + def get_url(self): + """ Return URL that can be used to reconnect to Data Unit """ + return self.url + + + + ########################################################################### + # BigData Internal Methods + def to_dict(self): + """ Internal method that returns a dict with all data contained in this DataUnit""" + du_dict = self.__dict__ + du_dict["id"]=self.id + return du_dict + + + def _update_state(self, state): + """ Internal method for updating state""" + self.state=state + logger.debug("Update DU: "+ str(self.url) + " state: " + state) + CoordinationAdaptor.update_du_state(self, state) + logger.debug("Updated DU: "+ str(self.url) + " New state: " + self.get_state()) + + + def __add_pilot_data(self, pilot_data): + logger.debug("DU add_pilot_data: add DU to pilot data in Thread") + self._update_state(State.Pending) + if len(self.pilot_data) > 0: # copy files from other pilot data + self.pilot_data[0].copy_du(self, pilot_data) + else: # copy files from original location + pilot_data.put_du(self) + logger.debug("DU add_pilot_data: Copy/Put DU to pilot data successfull") + self.pilot_data.append(pilot_data) + self._update_state(State.Running) + logger.debug("DU add_pilot_data: Updated State") + #self.url = CoordinationAdaptor.add_du(pilot_data.url, self) + CoordinationAdaptor.update_du(self) + + + @classmethod + def _get_du_id(cls, du_url): + try: + start = du_url.index(cls.DU_ID_PREFIX) + end = du_url.find("/", start) + if end==-1: + end = du_url.find("?", start) + if end==-1: + end = len(du_url) + return du_url[start:end] + except: + logger.error("No valid PD URL") + return None + + + def __refresh(self): + """ Update list of data units items + from coordination service """ + try: + if self.url != None: + du_dict = CoordinationAdaptor.get_du(self.url) + data_unit_dict_list = eval(du_dict["data_unit_items"]) + self.data_unit_items = [DataUnitItem.create_data_unit_from_dict(i) for i in data_unit_dict_list] + except: + logger.warn("Refresh of DU %s failed"%(self.get_url())) + + + def __restore_state(self): + du_dict = CoordinationAdaptor.get_du(self.url) + # Restore Data Unit + self.data_unit_description = eval(du_dict["data_unit_description"]) + self.state = du_dict["state"] + + # Restore DataUnitItems + data_unit_dict_list = eval(du_dict["data_unit_items"]) + self.data_unit_items = [DataUnitItem.create_data_unit_from_dict(i) for i in data_unit_dict_list] + + # restore Pilot Data + pd_list = eval(du_dict["pilot_data"]) + self.pilot_data = [] + for i in pd_list: + logger.debug("PD: "+str(i)) + pd = PilotData(pd_url=str(i)) + self.pilot_data.append(pd) + + + def __repr__(self): + return "PD: " + str(self.url) + + " \nData Units: " + str(self.data_unit_items) + + " \nPilot Data: " + str(self.pilot_data) + + + +class DataUnitItem(object): + """ DataUnitItem """ + DUI_ID_PREFIX="dui-" + + def __init__(self, pd=None, local_url=None): + if local_url!=None: + self.id = self.DUI_ID_PREFIX + str(uuid.uuid1()) + self.local_url = local_url + self.filename = os.path.basename(local_url) + #if pd != None: + # self.url = pd.url + "/" + self.filename + + + @classmethod + def __exists_file(cls, url): + """ return True if file at url exists. 
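# Editorial aside on __refresh()/__restore_state() above (hedged, not part of the patch):
# both rebuild Python structures from Redis with eval(). If the stored payloads are plain
# literals, the stdlib ast.literal_eval is a safer drop-in; a minimal sketch, reusing the
# du_dict fetched from CoordinationAdaptor.get_du():
import ast

data_unit_dict_list = ast.literal_eval(du_dict["data_unit_items"])
pd_list = ast.literal_eval(du_dict["pilot_data"])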
Otherwise False """ + file_url = SAGAUrl(url) + if file_url.host == "": + if os.path.exists(str(file_url)): + return True + else: + return False + elif file_url.host=="localhost": + if os.path.exists(file_url.path): + return True + else: + return False + else: + return True + + + def __repr__(self): + return str(self.__dict__) + + + ########################################################################### + # Auxiliary Methods + @classmethod + def create_data_unit_list(cls, pd=None, urls=None): + """ Creates a list of DUs from URL list + """ + du_list = [] + for i in urls: + if cls.__exists_file(i): + du = DataUnitItem(pd, i) + du_list.append(du) + + return du_list + + @classmethod + def create_data_unit_from_urls(cls, pd=None, urls=None): + """ Creates a list of DUs from URL list + """ + du_item_list = [] + for i in urls: + if cls.__exists_file(i): + du = DataUnitItem(pd, i) + du_item_list.append(du) + + return du_item_list + + + @classmethod + def create_data_unit_from_dict(cls, du_dict): + du = DataUnitItem() + logger.debug("Restore DU: " + str(du_dict)) + for i in du_dict.keys(): + logger.debug("Set attribute: %s", i) + du.__setattr__(i, du_dict[i]) + return du + + + def to_dict(self): + du_dict = self.__dict__ + du_dict["id"]=self.id + return du_dict + +################################################################################################### +# Tests +# Auxilliary testing methods +def __get_pd_url(du_url): + url = du_url[:du_url.index(":du-")] + return url + +def __get_du_id(du_url): + du_id = du_url[du_url.index("du-"):] + return du_id + +# Tests +def test_pd_reconnect(): + du_url = "redis://localhost/bigdata:pds-f31a670c-e3f6-11e1-afaf-705681b3df0f:pd-f31c47b8-e3f6-11e1-af44-705681b3df0f:du-f4debce8-e3f6-11e1-8399-705681b3df0f" + pd_url = __get_pd_url(du_url) + pd = PilotData(pd_url=pd_url) + print str(pd.list_data_units()) + du = pd.get_du(du_url) + + #du = DataUnit(du_url="redis://localhost/bigdata:pds-32d63b2e-df05-11e1-a329-705681b3df0f:pd-37674138-df05-11e1-80d0-705681b3df0f:du-3b8d428c-df05-11e1-af2a-705681b3df0f") + logger.debug(str(du.list())) + + +def test_du_reconnect(): + du_url = "redis://localhost/bigdata:du-1d1b7078-229f-11e2-834e-705681b3df0f" + du = DataUnit(du_url=du_url) + logger.debug(str(du.list())) + du.export("/tmp/export-test") + + +def test_data_unit_add_file(): + pilot_data_service = PilotDataService(coordination_url="redis://localhost/") + pilot_data_description = { + "service_url": "ssh://localhost/tmp/pilot-" + str(uuid.uuid1()), + "size": 100 + } + pd = pilot_data_service.create_pilot(pilot_data_description=pilot_data_description) + + # create data unit for output data + output_data_unit_description = { + "file_urls": [], + "file_url_patterns": ["test.txt"] + } + output_data_unit = pd.submit_data_unit(output_data_unit_description) + output_data_unit.wait() + logger.debug("Output DU: " + output_data_unit.get_url()) + pd_reconnect_url = pd.get_url() + du_url = output_data_unit.get_url() + pd_reconnect = PilotData(pd_url=pd_reconnect_url) + du_reconnect = pd_reconnect.get_du(du_url) + du_reconnect.add_files(["test.txt"]) + + + +class Lock(object): + def __init__(self, key, redis, expires=60, timeout=10): + """ + Distributed locking using Redis SETNX and GETSET. + + Usage:: + + with Lock('my_lock'): + print "Critical section" + + :param expires We consider any existing lock older than + ``expires`` seconds to be invalid in order to + detect crashed clients. This value must be higher + than it takes the critical section to execute. 
+ :param timeout If another client has already obtained the lock, + sleep for a maximum of ``timeout`` seconds before + giving up. A value of 0 means we never wait. + """ + + self.key = key + self.timeout = timeout + self.expires = expires + self.redis = redis + + def __enter__(self): + timeout = self.timeout + while timeout >= 0: + expires = time.time() + self.expires + 1 + + if self.redis.setnx(self.key, expires): + # We gained the lock; enter critical section + return + + current_value = self.redis.get(self.key) + + # We found an expired lock and nobody raced us to replacing it + if current_value and float(current_value) < time.time() and \ + self.redis.getset(self.key, expires) == current_value: + return + + timeout -= 1 + time.sleep(1) + + raise LockTimeout("Timeout whilst waiting for lock") + + def __exit__(self, exc_type, exc_value, traceback): + self.redis.delete(self.key) + +class LockTimeout(BaseException): + pass + + +if __name__ == "__main__": + #test_data_unit_add_file() + test_du_reconnect() + diff --git a/pilot/scheduler/data_compute_affinity_scheduler.py b/pilot/scheduler/data_compute_affinity_scheduler.py index 34c7d4c6..0ab6a05d 100644 --- a/pilot/scheduler/data_compute_affinity_scheduler.py +++ b/pilot/scheduler/data_compute_affinity_scheduler.py @@ -25,10 +25,10 @@ def set_pilot_jobs(self, pilot_jobs): def schedule_pilot_data(self, data_unit_description=None): logger.debug("Schedule to PD - # Avail pilots: %d"%len(self.pilot_data)) candidate_pilot_data = [] - if data_unit_description.has_key("affinity_datacenter_label") and data_unit_description.has_key("affinity_machine_label"): + if "affinity_datacenter_label" in data_unit_description and "affinity_machine_label" in data_unit_description: for i in self.pilot_data: pilot_data_description = i.pilot_data_description - if pilot_data_description.has_key("affinity_machine_label") and pilot_data_description.has_key("affinity_datacenter_label"): + if "affinity_machine_label" in pilot_data_description and "affinity_datacenter_label" in pilot_data_description: if data_unit_description["affinity_datacenter_label"] == pilot_data_description["affinity_datacenter_label"]\ and data_unit_description["affinity_machine_label"] == pilot_data_description["affinity_machine_label"]: candidate_pilot_data.append(i) @@ -53,16 +53,16 @@ def schedule_pilot_job(self, compute_unit_description=None): logger.debug("Schedule to PJ - # Avail PJs: %d"%len(self.pilot_jobs)) candidate_pilot_jobs = [] required_number_of_processes=1 - if compute_unit_description.has_key("number_of_processes"): + if "number_of_processes" in compute_unit_description: required_number_of_processes = int(compute_unit_description["number_of_processes"]) - if compute_unit_description.has_key("affinity_datacenter_label") and compute_unit_description.has_key("affinity_machine_label"): + if "affinity_datacenter_label" in compute_unit_description and "affinity_machine_label" in compute_unit_description: for i in self.pilot_jobs: free_nodes = i.get_free_nodes() logger.debug("BJ: %r State: %s Free nodes: %d"%(i, i.get_state(), free_nodes)) if i.get_state()=="Running" and free_nodes >= required_number_of_processes: # check whether pilot is active pilot_job_description = i.pilot_compute_description - if pilot_job_description.has_key("affinity_datacenter_label") and pilot_job_description.has_key("affinity_machine_label"): + if "affinity_datacenter_label" in pilot_job_description and "affinity_machine_label" in pilot_job_description: if pilot_job_description["affinity_datacenter_label"] 
== compute_unit_description["affinity_datacenter_label"] and pilot_job_description["affinity_machine_label"] == compute_unit_description["affinity_machine_label"]: candidate_pilot_jobs.append(i) diff --git a/pilot/scheduler/data_compute_affinity_scheduler.py.bak b/pilot/scheduler/data_compute_affinity_scheduler.py.bak new file mode 100644 index 00000000..34c7d4c6 --- /dev/null +++ b/pilot/scheduler/data_compute_affinity_scheduler.py.bak @@ -0,0 +1,92 @@ +""" Affinity-aware scheduler that evaluates affinity labels and input/output data flow + + +""" +import random +import logging +from bigjob import logger + +class Scheduler: + + def __init__(self): + self.pilot_data=[] + self.pilot_jobs=[] + + def set_pilot_data(self, pilot_data): + """ set resources which are used for scheduling """ + self.pilot_data=pilot_data + + + def set_pilot_jobs(self, pilot_jobs): + """ set resources which are used for scheduling """ + self.pilot_jobs=pilot_jobs + + + def schedule_pilot_data(self, data_unit_description=None): + logger.debug("Schedule to PD - # Avail pilots: %d"%len(self.pilot_data)) + candidate_pilot_data = [] + if data_unit_description.has_key("affinity_datacenter_label") and data_unit_description.has_key("affinity_machine_label"): + for i in self.pilot_data: + pilot_data_description = i.pilot_data_description + if pilot_data_description.has_key("affinity_machine_label") and pilot_data_description.has_key("affinity_datacenter_label"): + if data_unit_description["affinity_datacenter_label"] == pilot_data_description["affinity_datacenter_label"]\ + and data_unit_description["affinity_machine_label"] == pilot_data_description["affinity_machine_label"]: + candidate_pilot_data.append(i) + + if len(candidate_pilot_data) == 0: + # No PD with requested affinity found + # move data unit into a "possibly" remote pilot data + logger.debug("A No pilot data w/ affinity found... Looking for alternative pilot.") + candidate_pilot_data = self.pilot_data + + if len(candidate_pilot_data)>0: + return random.choice(candidate_pilot_data) + + return None + + + def schedule_pilot_job(self, compute_unit_description=None): + """ Enforces affinity description: if no PJ is available with the right + affinity, CU can't be scheduled. + + """ + logger.debug("Schedule to PJ - # Avail PJs: %d"%len(self.pilot_jobs)) + candidate_pilot_jobs = [] + required_number_of_processes=1 + if compute_unit_description.has_key("number_of_processes"): + required_number_of_processes = int(compute_unit_description["number_of_processes"]) + + if compute_unit_description.has_key("affinity_datacenter_label") and compute_unit_description.has_key("affinity_machine_label"): + for i in self.pilot_jobs: + free_nodes = i.get_free_nodes() + logger.debug("BJ: %r State: %s Free nodes: %d"%(i, i.get_state(), free_nodes)) + if i.get_state()=="Running" and free_nodes >= required_number_of_processes: # check whether pilot is active + pilot_job_description = i.pilot_compute_description + if pilot_job_description.has_key("affinity_datacenter_label") and pilot_job_description.has_key("affinity_machine_label"): + if pilot_job_description["affinity_datacenter_label"] == compute_unit_description["affinity_datacenter_label"] and pilot_job_description["affinity_machine_label"] == compute_unit_description["affinity_machine_label"]: + candidate_pilot_jobs.append(i) + + + if len(candidate_pilot_jobs) == 0: + # No PJ with requested affinity found + # move compute unit into a "possibly" remote pilot job + logger.debug("B No pilot compute w/ affinity found... 
Looking for alternative pilot.") + for i in self.pilot_jobs: + logger.debug("BJ: %r State: %s"%(i, i.get_state())) + free_nodes = i.get_free_nodes() + if i.get_state()=="Running" and free_nodes >= required_number_of_processes: + candidate_pilot_jobs.append(i) + #candidate_pilot_jobs=self.pilot_jobs + + + logger.debug("Candidate PJs: %r"%(candidate_pilot_jobs)) + if len(candidate_pilot_jobs)>0: + return random.choice(candidate_pilot_jobs) + + return None + + def __check_pilot_data_dependency(self, work_unit_description): + pilot_data_dependencies = work_unit_description["input_pilot_data"] + for i in pilot_data_dependencies: + pd = PilotData.pilot + ps = i.get_pilot_data() diff --git a/setup.py b/setup.py index 14758c37..30df47ef 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ try: import saga except: - print "Installing BigJob and SAGA/Python." + print("Installing BigJob and SAGA/Python.") if sys.version_info < (2, 6): sys.stderr.write("BigJob requires Python 2.6 and above. Installation unsuccessful!") @@ -24,18 +24,18 @@ def update_version(): if not os.path.isdir(".git"): - print "This does not appear to be a Git repository." + print("This does not appear to be a Git repository.") return try: p = subprocess.Popen(["git", "describe", "--tags", "--always"], stdout=subprocess.PIPE) except EnvironmentError: - print "Warning: Unable to run git, not modifying VERSION" + print("Warning: Unable to run git, not modifying VERSION") return stdout = p.communicate()[0] if p.returncode != 0: - print "Warning: Unable to run git, not modifying VERSION" + print("Warning: Unable to run git, not modifying VERSION") return ver = stdout.strip() @@ -43,7 +43,7 @@ def update_version(): f = open(fn, "w") f.write(ver) f.close() - print "BigJob VERSION: '%s'" % ver + print("BigJob VERSION: '%s'" % ver) def get_version(): diff --git a/setup.py.bak b/setup.py.bak new file mode 100644 index 00000000..14758c37 --- /dev/null +++ b/setup.py.bak @@ -0,0 +1,96 @@ +#!/usr/bin/env python + +import os +import sys + +#import ez_setup +#ez_setup.use_setuptools() + +from setuptools import setup, find_packages + +import subprocess + +try: + import saga +except: + print "Installing BigJob and SAGA/Python." + +if sys.version_info < (2, 6): + sys.stderr.write("BigJob requires Python 2.6 and above. Installation unsuccessful!") + sys.exit(1) + +VERSION_FILE="VERSION" + + +def update_version(): + if not os.path.isdir(".git"): + print "This does not appear to be a Git repository." 
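# Editorial aside on the setup.py hunk above (hedged, not part of this patch): the conversion
# only touches the print statements, but under Python 3 Popen.communicate() returns bytes
# unless text mode is requested, so writing "ver" to the VERSION file would need a decode
# step. A minimal sketch of the text-mode variant (requires git on PATH):
import subprocess

p = subprocess.Popen(["git", "describe", "--tags", "--always"],
                     stdout=subprocess.PIPE, universal_newlines=True)
stdout, _ = p.communicate()
ver = stdout.strip()   # str, safe to write to the VERSION file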
+ return + try: + p = subprocess.Popen(["git", "describe", + "--tags", "--always"], + stdout=subprocess.PIPE) + except EnvironmentError: + print "Warning: Unable to run git, not modifying VERSION" + return + stdout = p.communicate()[0] + if p.returncode != 0: + print "Warning: Unable to run git, not modifying VERSION" + return + + ver = stdout.strip() + fn = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'VERSION') + f = open(fn, "w") + f.write(ver) + f.close() + print "BigJob VERSION: '%s'" % ver + + +def get_version(): + try: + fn = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'VERSION') + f = open(fn) + version = f.read().strip() + f.close() + except EnvironmentError: + return "-1" + return version + + +update_version() + +setup(name='BigJob', + version=get_version(), + description='P* Pilot-Job Implementation based on SAGA-Python', + author='Andre Luckow, et al.', + author_email='aluckow@cct.lsu.edu', + url='https://github.com/saga-project/BigJob', + classifiers = ['Development Status :: 5 - Production/Stable', + 'Programming Language :: Python', + 'Environment :: Console', + 'Topic :: Utilities', + ], + platforms = ('Unix', 'Linux', 'Mac OS'), + packages=['bigjob', 'bigjob_dynamic', 'coordination', 'pilot', 'bigjob.job_plugin', 'pilot.api','pilot.api.compute', 'pilot.api.data', 'pilot.coordination', + 'pilot.filemanagement', 'pilot.impl', 'pilot.scheduler', 'examples', 'api', 'bootstrap', 'cli'], + include_package_data=True, + # data files for easy_install + data_files = [('', ['bigjob.conf', 'bigjob.conf']), + ('', ['bigjob_agent.conf', 'bigjob_agent.conf']), + ('', ['README.md', 'README.md']), + ('', ['VERSION', 'VERSION'])], + + # data files for pip + package_data = {'': ["ez_setup.py", '*.conf']}, + + install_requires=['setuptools>2.0', 'uuid', 'threadpool', 'virtualenv', 'redis', + 'radical.utils', 'saga-python', 'google-api-python-client', 'python-hostlist', + 'globusonline-transfer-api-client', 'boto>=2.2,<2.3', 'simplejson<2.1', 'pexpect', 'tldextract'], + entry_points = { + 'console_scripts': [ + 'test-bigjob = examples.example_local_single:main', + 'test-bigjob-dynamic = examples.example_manyjob_local:main', + 'pilot-cli = cli.pilot_cli:main' + ] + } +) diff --git a/tests/bigjob-api/example_fg_single.py b/tests/bigjob-api/example_fg_single.py index 014ead16..891f4a09 100644 --- a/tests/bigjob-api/example_fg_single.py +++ b/tests/bigjob-api/example_fg_single.py @@ -65,7 +65,7 @@ def main(): ########################################################################################## - print "Start Pilot Job/BigJob at: " + lrms_url + print("Start Pilot Job/BigJob at: " + lrms_url) bj = bigjob(COORDINATION_URL) bj.start_pilot_job( lrms_url, number_nodes, @@ -76,7 +76,7 @@ def main(): walltime, processes_per_node) - print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()) + print("Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state())) ########################################################################################## # Submit SubJob through BigJob @@ -97,7 +97,7 @@ def main(): # busy wait for completion while 1: state = str(sj.get_state()) - print "state: " + state + print("state: " + state) if(state=="Failed" or state=="Done"): break time.sleep(2) diff --git a/tests/bigjob-api/example_fg_single.py.bak b/tests/bigjob-api/example_fg_single.py.bak new file mode 100644 index 00000000..014ead16 --- /dev/null +++ b/tests/bigjob-api/example_fg_single.py.bak @@ -0,0 +1,115 @@ +""" Example application demonstrating 
job submission via bigjob + + DON'T EDIT THIS FILE (UNLESS THERE IS A BUG) + + THIS FILE SHOULD NOT BE COMMITTED TO SVN WITH USER-SPECIFIC PATHS! +""" +import os +import time +import pdb +import sys + +# configuration +""" This variable defines the coordination system that is used by BigJob + e.g. + advert://localhost (SAGA/Advert SQLITE) + advert://advert.cct.lsu.edu:8080 (SAGA/Advert POSTGRESQL) + advert://advert.cct.lsu.edu:5432 (SAGA/Advert POSTGRESQL) + redis://localhost:6379 (Redis at localhost) + tcp://localhost (ZMQ) + tcp://* (ZMQ - listening to all interfaces) +""" + +#COORDINATION_URL = "advert://localhost/?dbtype=sqlite3" +#COORDINATION_URL = "advert://SAGA:SAGA_client@advert.cct.lsu.edu:8080/?dbtype=postgresql" +#COORDINATION_URL = "tcp://*" +#COORDINATION_URL = "redis://cyder.cct.lsu.edu:2525" +COORDINATION_URL = "redis://*@gw68.quarry.iu.teragrid.org:6379" +#COORDINATION_URL = "redis://@gw68.quarry.iu.teragrid.org:6379" +#COORDINATION_URL="sqlasyncadvert://gw68.quarry.iu.teragrid.org/" + +# for running BJ from local dir +sys.path.insert(0, os.getcwd() + "/../") + +from bigjob import bigjob, subjob, description + + +def main(): + # Start BigJob + + ########################################################################################## + # Edit parameters for BigJob + queue=None # if None default queue is used + project=None # if None default allocation is used + walltime=10 + processes_per_node=8 + number_nodes = 16 + workingdirectory="/N/u/luckow" # working directory for agent + workingdirectory= os.getcwd() + "/agent/" # working directory for agent + userproxy = None # userproxy (not supported yet due to context issue w/ SAGA) + + + """ + URL of the SAGA Job Service that is used to dispatch the pilot job. + The following URLs are accepted: + + lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI) + lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines) + lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system. + lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine. + lrms_url = "xt5torque://localhost" # torque resource url. 
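# Editorial aside on example_fg_single (hedged, not part of the patch): the example submits
# 128 subjobs in a loop but its busy-wait polls only the last "sj"; the hector and kraken
# examples keep a list instead. A minimal sketch of waiting on every subjob, reusing bj, jd,
# subjob and the "Done"/"Failed" states from the example:
import time

sjs = []
for i in range(0, 128):
    sj = subjob()
    sj.submit_job(bj.pilot_url, jd)
    sjs.append(sj)
while not all(str(sj.get_state()) in ("Done", "Failed") for sj in sjs):
    time.sleep(2)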
+ + Please ensure that the respective SAGA adaptor is installed and working + """ + lrms_url = "pbs://localhost" # resource url to run the jobs on localhost + + ########################################################################################## + + print "Start Pilot Job/BigJob at: " + lrms_url + bj = bigjob(COORDINATION_URL) + bj.start_pilot_job( lrms_url, + number_nodes, + queue, + project, + workingdirectory, + userproxy, + walltime, + processes_per_node) + + print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()) + + ########################################################################################## + # Submit SubJob through BigJob + jd = description() + jd.executable = "/bin/hostname" + jd.number_of_processes = "1" + jd.spmd_variation = "single" + jd.arguments = [""] + #jd.working_directory = "/tmp" + jd.output = "stdout.txt" + jd.error = "stderr.txt" + + for i in range(0,128): + sj = subjob() + sj.submit_job(bj.pilot_url, jd) + + ######################################### + # busy wait for completion + while 1: + state = str(sj.get_state()) + print "state: " + state + if(state=="Failed" or state=="Done"): + break + time.sleep(2) + + ########################################################################################## + # Cleanup - stop BigJob + bj.wait() + bj.cancel() + #time.sleep(30) + + +""" Test Job Submission via Advert """ +if __name__ == "__main__": + main() + diff --git a/tests/bigjob-api/example_gce_single.py b/tests/bigjob-api/example_gce_single.py index ca523114..b54217a4 100644 --- a/tests/bigjob-api/example_gce_single.py +++ b/tests/bigjob-api/example_gce_single.py @@ -45,7 +45,7 @@ def main(): lrms_url = "gce+ssh://locahost" ########################################################################################## - print "Start Pilot Job/BigJob at: " + lrms_url + print("Start Pilot Job/BigJob at: " + lrms_url) bj = bigjob(COORDINATION_URL) bj.start_pilot_job( lrms_url, number_of_processes, @@ -56,7 +56,7 @@ def main(): walltime, processes_per_node) - print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()) + print("Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state())) ########################################################################################## # Submit SubJob through BigJob @@ -79,7 +79,7 @@ def main(): # busy wait for completion while 1: state = str(sj.get_state()) - print "state: " + state + print("state: " + state) if(state=="Failed" or state=="Done"): break time.sleep(2) diff --git a/tests/bigjob-api/example_gce_single.py.bak b/tests/bigjob-api/example_gce_single.py.bak new file mode 100644 index 00000000..ca523114 --- /dev/null +++ b/tests/bigjob-api/example_gce_single.py.bak @@ -0,0 +1,96 @@ +""" Example application demonstrating job submission via bigjob + + DON'T EDIT THIS FILE (UNLESS THERE IS A BUG) + + THIS FILE SHOULD NOT BE COMMITTED TO SVN WITH USER-SPECIFIC PATHS! +""" +import os +import time +import pdb +import sys + +# configuration +""" This variable defines the coordination system that is used by BigJob + e.g. 
+ advert://localhost (SAGA/Advert SQLITE) + advert://advert.cct.lsu.edu:8080 (SAGA/Advert POSTGRESQL) + redis://localhost:6379 (Redis at localhost) + tcp://localhost (ZMQ) + tcp://* (ZMQ - listening to all interfaces) +""" + +COORDINATION_URL = "redis://localhost:6379" + +# for running BJ from local dir +sys.path.insert(0, os.getcwd() + "/../") + +from bigjob import bigjob, subjob, description + + +def main(): + # Start BigJob + + ########################################################################################## + # Edit parameters for BigJob + queue=None # if None default queue is used + project=None # if None default allocation is used + walltime=10 + processes_per_node=1 + number_of_processes = 1 + workingdirectory="." # working directory for agent + userproxy = None # userproxy (not supported yet due to context issue w/ SAGA) + + + #lrms_url = "ec2+ssh://localhost" # resource url to run on GCE + lrms_url = "gce+ssh://locahost" + ########################################################################################## + + print "Start Pilot Job/BigJob at: " + lrms_url + bj = bigjob(COORDINATION_URL) + bj.start_pilot_job( lrms_url, + number_of_processes, + queue, + project, + workingdirectory, + userproxy, + walltime, + processes_per_node) + + print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()) + + ########################################################################################## + # Submit SubJob through BigJob + jd = description() + jd.executable = "/bin/echo" + #jd.executable = "$HOME/hello.sh" + jd.number_of_processes = "1" + jd.arguments = ["$HELLOWORLD"] + jd.environment = ['HELLOWORLD=hello_world'] + jd.input_data = ["hi", "ho"] + + # specify an optinal working directory if sub-job should be executed outside of bigjob sandbox + #jd.working_directory = "/tmp" + jd.output = "stdout.txt" + jd.error = "stderr.txt" + sj = subjob() + sj.submit_job(bj.pilot_url, jd) + + ######################################### + # busy wait for completion + while 1: + state = str(sj.get_state()) + print "state: " + state + if(state=="Failed" or state=="Done"): + break + time.sleep(2) + + ########################################################################################## + # Cleanup - stop BigJob + bj.cancel() + #time.sleep(30) + + +""" Test Job Submission via Advert """ +if __name__ == "__main__": + main() + diff --git a/tests/bigjob-api/example_hector_single.py b/tests/bigjob-api/example_hector_single.py index 332c06e6..b2bb1e01 100644 --- a/tests/bigjob-api/example_hector_single.py +++ b/tests/bigjob-api/example_hector_single.py @@ -56,7 +56,7 @@ def main(): ########################################################################################## - print "Start Pilot Job/BigJob at: " + lrms_url + print("Start Pilot Job/BigJob at: " + lrms_url) bj = bigjob(COORDINATION_URL) bj.start_pilot_job( lrms_url, number_of_processes, @@ -67,7 +67,7 @@ def main(): walltime, processes_per_node) - print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()) + print("Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state())) ########################################################################################## # Submit SubJob through BigJob @@ -94,7 +94,7 @@ def main(): while 1: for idx, sj in enumerate(sjs): state = str(sj.get_state()) - print "sj: %d state: %s"%(idx,state) + print("sj: %d state: %s"%(idx,state)) if(state=="Failed" or state=="Done"): break diff --git a/tests/bigjob-api/example_hector_single.py.bak 
b/tests/bigjob-api/example_hector_single.py.bak new file mode 100644 index 00000000..332c06e6 --- /dev/null +++ b/tests/bigjob-api/example_hector_single.py.bak @@ -0,0 +1,112 @@ +""" Example application demonstrating job submission via bigjob + + DON'T EDIT THIS FILE (UNLESS THERE IS A BUG) + + THIS FILE SHOULD NOT BE COMMITTED TO SVN WITH USER-SPECIFIC PATHS! +""" +import os +import time +import pdb +import sys + +# configuration +""" This variable defines the coordination system that is used by BigJob + e.g. + redis://localhost:6379 (Redis at localhost) +""" + +#COORDINATION_URL = "redis://localhost:6379" +COORDINATION_URL = "redis://hector-xe6-1:6379" + +# for running BJ from local dir +sys.path.insert(0, os.getcwd() + "/../") + +from bigjob import bigjob, subjob, description + + +def main(): + # Start BigJob + + ########################################################################################## + # Edit parameters for BigJob + queue=None # if None default queue is used + project="d45" # if None default allocation is used + walltime=10 + processes_per_node=8 + number_of_processes=64 + # workingdirectory="/lustre/scratch/aluckow/agent" # working directory for agent + workingdirectory="/home/d45/d45/s1026257/al/" # working directory for agent + userproxy = None # userproxy (not supported yet due to context issue w/ SAGA) + + + """ + URL of the SAGA Job Service that is used to dispatch the pilot job. + The following URLs are accepted: + + lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI) + lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines) + lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system. + lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine. + lrms_url = "xt5torque://localhost" # torque resource url. 
+ + Please ensure that the respective SAGA adaptor is installed and working + """ + #lrms_url = "xt5torque://localhost" # resource url to run the jobs on localhost + lrms_url = "pbs://localhost" # resource url to run the jobs on localhost + + ########################################################################################## + + print "Start Pilot Job/BigJob at: " + lrms_url + bj = bigjob(COORDINATION_URL) + bj.start_pilot_job( lrms_url, + number_of_processes, + queue, + project, + workingdirectory, + userproxy, + walltime, + processes_per_node) + + print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()) + + ########################################################################################## + # Submit SubJob through BigJob + jd = description() + jd.executable = "/bin/hostname" + jd.number_of_processes = "1" + jd.spmd_variation = "single" + jd.arguments = [""] + #jd.working_directory = "/tmp" + jd.output = "stdout.txt" + jd.error = "stderr.txt" + + sjs = [] + for i in range(0,24): + sj = subjob() + sj.submit_job(bj.pilot_url, jd) + sjs.append(sj) + + + + + ######################################### + # busy wait for completion + while 1: + for idx, sj in enumerate(sjs): + state = str(sj.get_state()) + print "sj: %d state: %s"%(idx,state) + + if(state=="Failed" or state=="Done"): + break + time.sleep(2) + + ########################################################################################## + # Cleanup - stop BigJob + bj.cancel() + #time.sleep(30) + + +""" Test Job Submission via Advert """ +if __name__ == "__main__": + main() + diff --git a/tests/bigjob-api/example_india.py b/tests/bigjob-api/example_india.py index ee94c640..98e6329b 100644 --- a/tests/bigjob-api/example_india.py +++ b/tests/bigjob-api/example_india.py @@ -64,7 +64,7 @@ def main(): ########################################################################################## - print "Start Pilot Job/BigJob at: " + lrms_url + print("Start Pilot Job/BigJob at: " + lrms_url) bj = bigjob(COORDINATION_URL) bj.start_pilot_job( lrms_url, number_nodes, @@ -75,7 +75,7 @@ def main(): walltime, processes_per_node) - print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()) + print("Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state())) ########################################################################################## # Submit SubJob through BigJob diff --git a/tests/bigjob-api/example_india.py.bak b/tests/bigjob-api/example_india.py.bak new file mode 100644 index 00000000..ee94c640 --- /dev/null +++ b/tests/bigjob-api/example_india.py.bak @@ -0,0 +1,105 @@ +""" Example application demonstrating job submission via bigjob + + DON'T EDIT THIS FILE (UNLESS THERE IS A BUG) + + THIS FILE SHOULD NOT BE COMMITTED TO SVN WITH USER-SPECIFIC PATHS! +""" +import os +import time +import pdb +import sys + +# configuration +""" This variable defines the coordination system that is used by BigJob + e.g. 
+ advert://localhost (SAGA/Advert SQLITE) + advert://advert.cct.lsu.edu:8080 (SAGA/Advert POSTGRESQL) + advert://advert.cct.lsu.edu:5432 (SAGA/Advert POSTGRESQL) + redis://localhost:6379 (Redis at localhost) + tcp://localhost (ZMQ) + tcp://* (ZMQ - listening to all interfaces) +""" + +#COORDINATION_URL = "advert://localhost/?dbtype=sqlite3" +#COORDINATION_URL = "advert://SAGA:SAGA_client@advert.cct.lsu.edu:8080/?dbtype=postgresql" +#COORDINATION_URL = "tcp://*" +#COORDINATION_URL = "redis://cyder.cct.lsu.edu:2525" +COORDINATION_URL = "redis://ILikeBigJob_wITH-REdIS@gw68.quarry.iu.teragrid.org:6379" +#COORDINATION_URL = "redis://@gw68.quarry.iu.teragrid.org:6379" +#COORDINATION_URL="sqlasyncadvert://gw68.quarry.iu.teragrid.org/" + +# for running BJ from local dir +sys.path.insert(0, os.getcwd() + "/../") + +from bigjob import bigjob, subjob, description + + +def main(): + # Start BigJob + + ########################################################################################## + # Edit parameters for BigJob + queue=None # if None default queue is used + project=None # if None default allocation is used + walltime=10 + processes_per_node=8 + number_nodes = 24 + workingdirectory= os.getcwd() # working directory for agent + userproxy = None # userproxy (not supported yet due to context issue w/ SAGA) + + + """ + URL of the SAGA Job Service that is used to dispatch the pilot job. + The following URLs are accepted: + + lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI) + lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines) + lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system. + lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine. + lrms_url = "xt5torque://localhost" # torque resource url. 
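# Editorial aside on example_india (hedged, not part of the patch): unlike the busy-wait
# examples, this one submits its subjobs and then relies on bj.wait(), which (as the
# "Wait for completion..." message in the reconnect example suggests) blocks until the
# pilot's subjobs have finished, before tearing the pilot down. A minimal sketch, reusing
# bj, jd and subjob from the example:
for i in range(0, 12):
    sj = subjob()
    sj.submit_job(bj.pilot_url, jd)
bj.wait()     # block until submitted subjobs complete
bj.cancel()   # then release the pilot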
+ + Please ensure that the respective SAGA adaptor is installed and working + """ + lrms_url = "pbs://localhost" # resource url to run the jobs on localhost + + ########################################################################################## + + print "Start Pilot Job/BigJob at: " + lrms_url + bj = bigjob(COORDINATION_URL) + bj.start_pilot_job( lrms_url, + number_nodes, + queue, + project, + workingdirectory, + userproxy, + walltime, + processes_per_node) + + print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()) + + ########################################################################################## + # Submit SubJob through BigJob + jd = description() + jd.executable = "/bin/hostname" + jd.number_of_processes = "2" + jd.spmd_variation = "single" + jd.arguments = [""] + #jd.working_directory = "/tmp" + jd.output = "stdout.txt" + jd.error = "stderr.txt" + + for i in range(0,12): + sj = subjob() + sj.submit_job(bj.pilot_url, jd) + + ########################################################################################## + # Cleanup - stop BigJob + bj.wait() + bj.cancel() + #time.sleep(30) + + +""" Test Job Submission via Advert """ +if __name__ == "__main__": + main() + diff --git a/tests/bigjob-api/example_kraken_single.py b/tests/bigjob-api/example_kraken_single.py index a7ec9165..29b77c37 100644 --- a/tests/bigjob-api/example_kraken_single.py +++ b/tests/bigjob-api/example_kraken_single.py @@ -64,7 +64,7 @@ def main(): ########################################################################################## - print "Start Pilot Job/BigJob at: " + lrms_url + print("Start Pilot Job/BigJob at: " + lrms_url) bj = bigjob(COORDINATION_URL) bj.start_pilot_job( lrms_url, number_of_processes, @@ -75,7 +75,7 @@ def main(): walltime, processes_per_node) - print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()) + print("Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state())) ########################################################################################## # Submit SubJob through BigJob @@ -102,7 +102,7 @@ def main(): while 1: for idx, sj in enumerate(sjs): state = str(sj.get_state()) - print "sj: %d state: %s"%(idx,state) + print("sj: %d state: %s"%(idx,state)) if(state=="Failed" or state=="Done"): break diff --git a/tests/bigjob-api/example_kraken_single.py.bak b/tests/bigjob-api/example_kraken_single.py.bak new file mode 100644 index 00000000..a7ec9165 --- /dev/null +++ b/tests/bigjob-api/example_kraken_single.py.bak @@ -0,0 +1,120 @@ +""" Example application demonstrating job submission via bigjob + + DON'T EDIT THIS FILE (UNLESS THERE IS A BUG) + + THIS FILE SHOULD NOT BE COMMITTED TO SVN WITH USER-SPECIFIC PATHS! +""" +import os +import time +import pdb +import sys + +# configuration +""" This variable defines the coordination system that is used by BigJob + e.g. 
+ advert://localhost (SAGA/Advert SQLITE) + advert://advert.cct.lsu.edu:8080 (SAGA/Advert POSTGRESQL) + advert://advert.cct.lsu.edu:5432 (SAGA/Advert POSTGRESQL) + redis://localhost:6379 (Redis at localhost) + tcp://localhost (ZMQ) + tcp://* (ZMQ - listening to all interfaces) +""" + +#COORDINATION_URL = "advert://localhost/?dbtype=sqlite3" +#COORDINATION_URL = "advert://SAGA:SAGA_client@advert.cct.lsu.edu:8080/?dbtype=postgresql" +#COORDINATION_URL = "tcp://*" +#COORDINATION_URL = "redis://localhost:6379" +#COORDINATION_URL = "redis://@gw68.quarry.iu.teragrid.org:6379" +COORDINATION_URL = "redis://ILikeBigJob_wITH-REdIS@gw68.quarry.iu.teragrid.org:6379" +#COORDINATION_URL="sqlasyncadvert://gw68.quarry.iu.teragrid.org/" + +# for running BJ from local dir +sys.path.insert(0, os.getcwd() + "/../") + +from bigjob import bigjob, subjob, description + + +def main(): + # Start BigJob + + ########################################################################################## + # Edit parameters for BigJob + queue=None # if None default queue is used + project=None # if None default allocation is used + walltime=600 + processes_per_node=12 + number_of_processes=24 + workingdirectory="/lustre/scratch/aluckow/agent" # working directory for agent + userproxy = None # userproxy (not supported yet due to context issue w/ SAGA) + + + """ + URL of the SAGA Job Service that is used to dispatch the pilot job. + The following URLs are accepted: + + lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI) + lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines) + lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system. + lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine. + lrms_url = "xt5torque://localhost" # torque resource url. 
+ + Please ensure that the respective SAGA adaptor is installed and working + """ + lrms_url = "xt5torque://localhost" # resource url to run the jobs on localhost + + ########################################################################################## + + print "Start Pilot Job/BigJob at: " + lrms_url + bj = bigjob(COORDINATION_URL) + bj.start_pilot_job( lrms_url, + number_of_processes, + queue, + project, + workingdirectory, + userproxy, + walltime, + processes_per_node) + + print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()) + + ########################################################################################## + # Submit SubJob through BigJob + jd = description() + jd.executable = "/bin/hostname" + jd.number_of_processes = "1" + jd.spmd_variation = "single" + jd.arguments = [""] + #jd.working_directory = "/tmp" + jd.output = "stdout.txt" + jd.error = "stderr.txt" + + sjs = [] + for i in range(0,24): + sj = subjob() + sj.submit_job(bj.pilot_url, jd) + sjs.append(sj) + + + + + ######################################### + # busy wait for completion + while 1: + for idx, sj in enumerate(sjs): + state = str(sj.get_state()) + print "sj: %d state: %s"%(idx,state) + + if(state=="Failed" or state=="Done"): + break + time.sleep(2) + + ########################################################################################## + # Cleanup - stop BigJob + bj.cancel() + #time.sleep(30) + + +""" Test Job Submission via Advert """ +if __name__ == "__main__": + main() + diff --git a/tests/bigjob-api/example_local_multiple.py b/tests/bigjob-api/example_local_multiple.py index 73df12de..b936d68b 100644 --- a/tests/bigjob-api/example_local_multiple.py +++ b/tests/bigjob-api/example_local_multiple.py @@ -74,7 +74,7 @@ def has_finished(state): ########################################################################################## - print "Start Pilot Job/BigJob at: " + lrms_url + print("Start Pilot Job/BigJob at: " + lrms_url) bj = bigjob(COORDINATION_URL) bj.start_pilot_job( lrms_url, number_of_processes, @@ -85,7 +85,7 @@ def has_finished(state): walltime, processes_per_node) - print "Pilot Job/BigJob URL: " + bj.get_url() + " State: " + str(bj.get_state()) + print("Pilot Job/BigJob URL: " + bj.get_url() + " State: " + str(bj.get_state())) ########################################################################################## # Submit SubJob through BigJob @@ -107,5 +107,5 @@ def has_finished(state): job_start_times[sj]=time.time() job_states[sj] = sj.get_state() - print "Terminating application. You can reconnect to BJ via the following URL: %s"%bj.get_url() + print("Terminating application. You can reconnect to BJ via the following URL: %s"%bj.get_url()) diff --git a/tests/bigjob-api/example_local_multiple.py.bak b/tests/bigjob-api/example_local_multiple.py.bak new file mode 100644 index 00000000..73df12de --- /dev/null +++ b/tests/bigjob-api/example_local_multiple.py.bak @@ -0,0 +1,111 @@ +""" Example application demonstrating job submission via bigjob + + DON'T EDIT THIS FILE (UNLESS THERE IS A BUG) + + THIS FILE SHOULD NOT BE COMMITTED TO SVN WITH USER-SPECIFIC PATHS! +""" + +import os +import time +import pdb +# BigJob implementation can be swapped here by importing another implementation, +# e.g. condor, cloud, azure +import sys +sys.path.insert(0, os.getcwd() + "/../") + +# configuration +""" This variable defines the coordination system that is used by BigJob + e.g. 
+ advert://localhost (SAGA/Advert SQLITE) + advert://advert.cct.lsu.edu:8080 (SAGA/Advert POSTGRESQL) + advert://advert.cct.lsu.edu:5432 (SAGA/Advert POSTGRESQL) + redis://localhost:6379 (Redis at localhost) + tcp://localhost (ZMQ) + tcp://* (ZMQ - listening to all interfaces) +""" + +### EDIT COORDINATION_URL to point to advert server. +COORDINATION_URL = "advert://localhost/?dbtype=sqlite3" +#COORDINATION_URL = "redis://localhost" + +from bigjob import bigjob, subjob, description + + +### EDIT based on the number of jobs you want to submit +NUMBER_JOBS=8 + +def has_finished(state): + state = state.lower() + if state=="done" or state=="failed" or state=="canceled": + return True + else: + return False + + +""" Test Job Submission via Advert """ +if __name__ == "__main__": + + starttime=time.time() + + ########################################################################################## + # Edit parameters for BigJob + queue=None # if None default queue is used + project=None # if None default allocation is used + walltime=100 + processes_per_node=4 + number_of_processes =2 + workingdirectory= os.path.join(os.getcwd(), "agent") # working directory for agent + userproxy = None # userproxy (not supported yet due to context issue w/ SAGA) + + + """ + URL of the SAGA Job Service that is used to dispatch the pilot job. + The following URLs are accepted: + + lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI) + lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines) + lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system. + lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine. + lrms_url = "xt5torque://localhost" # torque resource url. + + Please ensure that the respective SAGA adaptor is installed and working + """ + lrms_url = "fork://localhost" + + ########################################################################################## + + print "Start Pilot Job/BigJob at: " + lrms_url + bj = bigjob(COORDINATION_URL) + bj.start_pilot_job( lrms_url, + number_of_processes, + queue, + project, + workingdirectory, + userproxy, + walltime, + processes_per_node) + + print "Pilot Job/BigJob URL: " + bj.get_url() + " State: " + str(bj.get_state()) + + ########################################################################################## + # Submit SubJob through BigJob + jobs = [] + job_start_times = {} + job_states = {} + for i in range(0, NUMBER_JOBS): + jd = description() + jd.executable = "/bin/date" + jd.number_of_processes = "1" + jd.spmd_variation = "single" + jd.arguments = [""] + jd.output = "sj-stdout-"+str(i)+".txt" + jd.error = "sj-stderr-"+str(i)+".txt" + + sj = subjob() + sj.submit_job(bj.pilot_url, jd) + jobs.append(sj) + job_start_times[sj]=time.time() + job_states[sj] = sj.get_state() + + print "Terminating application. 
You can reconnect to BJ via the following URL: %s"%bj.get_url() + diff --git a/tests/bigjob-api/example_local_multiple_reconnect.py b/tests/bigjob-api/example_local_multiple_reconnect.py index 28a58b61..5e8f5360 100644 --- a/tests/bigjob-api/example_local_multiple_reconnect.py +++ b/tests/bigjob-api/example_local_multiple_reconnect.py @@ -51,14 +51,14 @@ def has_finished(state): if len(sys.argv)==2: reconnect_url=sys.argv[1] else: - print "Usage: " + sys.executable + " " + __file__ + " " + print("Usage: " + sys.executable + " " + __file__ + " ") sys.exit(-1) - print "Reconnect to Pilot Job/BigJob at: " + reconnect_url + print("Reconnect to Pilot Job/BigJob at: " + reconnect_url) bj = bigjob(pilot_url=reconnect_url) - print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state() - + " Number of SJs: " + str(len(bj.list_subjobs())) ) + print("Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state() + + " Number of SJs: " + str(len(bj.list_subjobs())) )) ########################################################################################## # Submit some more subjobs @@ -81,13 +81,13 @@ def has_finished(state): job_start_times[sj]=time.time() job_states[sj] = sj.get_state() - print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state() - + " Number of SJs: " + str(len(bj.list_subjobs())) ) - print "Wait for completion..." + print("Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state() + + " Number of SJs: " + str(len(bj.list_subjobs())) )) + print("Wait for completion...") bj.wait() runtime = time.time()-starttime - print "Runtime: " + str(runtime) + " s; Runtime per Job: " + str(runtime/NUMBER_JOBS) + print("Runtime: " + str(runtime) + " s; Runtime per Job: " + str(runtime/NUMBER_JOBS)) bj.cancel() diff --git a/tests/bigjob-api/example_local_multiple_reconnect.py.bak b/tests/bigjob-api/example_local_multiple_reconnect.py.bak new file mode 100644 index 00000000..28a58b61 --- /dev/null +++ b/tests/bigjob-api/example_local_multiple_reconnect.py.bak @@ -0,0 +1,93 @@ +""" Example application demonstrating job submission via bigjob + + DON'T EDIT THIS FILE (UNLESS THERE IS A BUG) + + THIS FILE SHOULD NOT BE COMMITTED TO SVN WITH USER-SPECIFIC PATHS! +""" + +import os +import time +import pdb +# BigJob implementation can be swapped here by importing another implementation, +# e.g. condor, cloud, azure +import sys +sys.path.insert(0, os.getcwd() + "/../") + +# configuration +""" This variable defines the coordination system that is used by BigJob + e.g. + advert://localhost (SAGA/Advert SQLITE) + advert://advert.cct.lsu.edu:8080 (SAGA/Advert POSTGRESQL) + advert://advert.cct.lsu.edu:5432 (SAGA/Advert POSTGRESQL) + redis://localhost:6379 (Redis at localhost) + tcp://localhost (ZMQ) + tcp://* (ZMQ - listening to all interfaces) +""" + +### EDIT COORDINATION_URL to point to advert server. 
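# Editorial aside on the two local_multiple examples (hedged, not part of the patch):
# example_local_multiple prints bj.get_url() and exits, and the reconnect script re-attaches
# to the same pilot via the pilot_url keyword. A minimal sketch of the reconnect side,
# reusing the API exactly as shown in the diff above:
import sys
from bigjob import bigjob

reconnect_url = sys.argv[1]              # pilot URL printed by the first run
bj = bigjob(pilot_url=reconnect_url)     # re-attach instead of starting a new pilot
print(bj.get_state(), len(bj.list_subjobs()))
bj.wait()
bj.cancel()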
+COORDINATION_URL = "advert://localhost/?dbtype=sqlite3" +#COORDINATION_URL = "redis://localhost" +#COORDINATION_URL = "advert://advert.cct.lsu.edu:8080/" + +from bigjob import bigjob, subjob, description + + +### EDIT based on the number of jobs you want to submit +NUMBER_JOBS=1 + +def has_finished(state): + state = state.lower() + if state=="done" or state=="failed" or state=="canceled": + return True + else: + return False + + +""" Test Job Submission via Advert """ +if __name__ == "__main__": + + starttime=time.time() + + if len(sys.argv)==2: + reconnect_url=sys.argv[1] + else: + print "Usage: " + sys.executable + " " + __file__ + " " + sys.exit(-1) + + print "Reconnect to Pilot Job/BigJob at: " + reconnect_url + bj = bigjob(pilot_url=reconnect_url) + + print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state() + + " Number of SJs: " + str(len(bj.list_subjobs())) ) + + ########################################################################################## + # Submit some more subjobs + if bj.get_state()!="Done": + jobs = [] + job_start_times = {} + job_states = {} + for i in range(0, NUMBER_JOBS): + jd = description() + jd.executable = "/bin/date" + jd.number_of_processes = "1" + jd.spmd_variation = "single" + jd.arguments = [""] + jd.output = "sj-stdout-"+str(i)+".txt" + jd.error = "sj-stderr-"+str(i)+".txt" + + sj = subjob() + sj.submit_job(bj.pilot_url, jd) + jobs.append(sj) + job_start_times[sj]=time.time() + job_states[sj] = sj.get_state() + + print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state() + + " Number of SJs: " + str(len(bj.list_subjobs())) ) + print "Wait for completion..." + bj.wait() + + runtime = time.time()-starttime + print "Runtime: " + str(runtime) + " s; Runtime per Job: " + str(runtime/NUMBER_JOBS) + + bj.cancel() + diff --git a/tests/bigjob-api/example_local_single.py b/tests/bigjob-api/example_local_single.py index 9cb97138..0e6a5db2 100644 --- a/tests/bigjob-api/example_local_single.py +++ b/tests/bigjob-api/example_local_single.py @@ -62,7 +62,7 @@ def main(): ########################################################################################## - print "Start Pilot Job/BigJob at: " + lrms_url + print("Start Pilot Job/BigJob at: " + lrms_url) bj = bigjob(COORDINATION_URL) bj.start_pilot_job( lrms_url, number_of_processes, @@ -73,7 +73,7 @@ def main(): walltime, processes_per_node) - print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()) + print("Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state())) ########################################################################################## # Submit SubJob through BigJob @@ -96,7 +96,7 @@ def main(): # busy wait for completion while 1: state = str(sj.get_state()) - print "state: " + state + print("state: " + state) if(state=="Failed" or state=="Done"): break time.sleep(2) diff --git a/tests/bigjob-api/example_local_single.py.bak b/tests/bigjob-api/example_local_single.py.bak new file mode 100644 index 00000000..9cb97138 --- /dev/null +++ b/tests/bigjob-api/example_local_single.py.bak @@ -0,0 +1,113 @@ +""" Example application demonstrating job submission via bigjob + + DON'T EDIT THIS FILE (UNLESS THERE IS A BUG) + + THIS FILE SHOULD NOT BE COMMITTED TO SVN WITH USER-SPECIFIC PATHS! +""" +import os +import time +import pdb +import sys + +# configuration +""" This variable defines the coordination system that is used by BigJob + e.g. 
+ advert://localhost (SAGA/Advert SQLITE) + advert://advert.cct.lsu.edu:8080 (SAGA/Advert POSTGRESQL) + redis://localhost:6379 (Redis at localhost) + tcp://localhost (ZMQ) + tcp://* (ZMQ - listening to all interfaces) +""" + +#COORDINATION_URL = "advert://localhost/?dbtype=sqlite3" +#COORDINATION_URL = "tcp://*" +COORDINATION_URL = "redis://localhost:6379" +#COORDINATION_URL = "redis://Oily9tourSorenavyvault@redis01.tacc.utexas.edu" +# for running BJ from local dir +sys.path.insert(0, os.getcwd() + "/../") + +from bigjob import bigjob, subjob, description + + +def main(): + # Start BigJob + + ########################################################################################## + # Edit parameters for BigJob + queue="" # if None default queue is used + project=None # if None default allocation is used + walltime=10 + processes_per_node=4 + number_of_processes = 8 + #workingdirectory=os.path.join(os.getcwd(), "agent") # working directory for agent + workingdirectory="agent" + userproxy = None # userproxy (not supported yet due to context issue w/ SAGA) + + + """ + URL of the SAGA Job Service that is used to dispatch the pilot job. + The following URLs are accepted: + + lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI) + lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines) + lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system. + lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine. + lrms_url = "xt5torque://localhost" # torque resource url. + + Please ensure that the respective SAGA adaptor is installed and working + """ + lrms_url = "fork://localhost" # resource url to run the jobs on localhost + #lrms_url = "sge://localhost" # resource url to run the jobs on localhost + #lrms_url = "ssh://localhost" # resource url to run the jobs on localhost + + ########################################################################################## + + print "Start Pilot Job/BigJob at: " + lrms_url + bj = bigjob(COORDINATION_URL) + bj.start_pilot_job( lrms_url, + number_of_processes, + queue, + project, + workingdirectory, + userproxy, + walltime, + processes_per_node) + + print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()) + + ########################################################################################## + # Submit SubJob through BigJob + jd = description() + jd.executable = "/bin/echo" + #jd.executable = "$HOME/hello.sh" + jd.number_of_processes = "1" + jd.arguments = ["$HELLOWORLD"] + jd.environment = ['HELLOWORLD=hello_world'] + #jd.spmd_variation = "mpi" + + # specify an optinal working directory if sub-job should be executed outside of bigjob sandbox + #jd.working_directory = "/tmp" + jd.output = "stdout.txt" + jd.error = "stderr.txt" + sj = subjob() + sj.submit_job(bj.pilot_url, jd) + + ######################################### + # busy wait for completion + while 1: + state = str(sj.get_state()) + print "state: " + state + if(state=="Failed" or state=="Done"): + break + time.sleep(2) + + ########################################################################################## + # Cleanup - stop BigJob + bj.cancel() + #time.sleep(30) + + +""" Test Job Submission via Advert """ +if __name__ == "__main__": + main() + diff --git a/tests/bigjob-api/example_local_single_filestaging.py 
b/tests/bigjob-api/example_local_single_filestaging.py index d7e9bd79..64d8ec9e 100644 --- a/tests/bigjob-api/example_local_single_filestaging.py +++ b/tests/bigjob-api/example_local_single_filestaging.py @@ -36,7 +36,7 @@ #sys.path.append(os.path.join(os.path.dirname(__file__), "../..")) #sys.path.insert(0, os.getcwd() + "/../") -print(str(sys.path)) +print((str(sys.path))) from bigjob import bigjob, subjob, description def main(): @@ -69,7 +69,7 @@ def main(): ########################################################################################## - print "Start Pilot Job/BigJob at: " + lrms_url + print("Start Pilot Job/BigJob at: " + lrms_url) bj = bigjob(COORDINATION_URL) #bj_filetransfers = ["ssh://" + os.path.dirname(os.path.abspath(__file__)) @@ -86,7 +86,7 @@ def main(): processes_per_node, bj_filetransfers) - print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()) + print("Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state())) ########################################################################################## # Submit SubJob through BigJob @@ -108,7 +108,7 @@ def main(): # busy wait for completion while 1: state = str(sj.get_state()) - print "state: " + state + print("state: " + state) if(state=="Failed" or state=="Done"): break time.sleep(2) diff --git a/tests/bigjob-api/example_local_single_filestaging.py.bak b/tests/bigjob-api/example_local_single_filestaging.py.bak new file mode 100644 index 00000000..d7e9bd79 --- /dev/null +++ b/tests/bigjob-api/example_local_single_filestaging.py.bak @@ -0,0 +1,125 @@ +""" Example application demonstrating job submission via bigjob + + DON'T EDIT THIS FILE (UNLESS THERE IS A BUG) + + THIS FILE SHOULD NOT BE COMMITTED TO SVN WITH USER-SPECIFIC PATHS! +""" + +import os +import time +import pdb +import sys + +#import bigjob +#bigjob.SAGA_BLISS=False + +# configuration +""" This variable defines the coordination system that is used by BigJob + e.g. + advert://localhost (SAGA/Advert SQLITE) + advert://advert.cct.lsu.edu:8080 (SAGA/Advert POSTGRESQL) + advert://advert.cct.lsu.edu:5432 (SAGA/Advert POSTGRESQL) + redis://localhost:6379 (Redis at localhost) + tcp://localhost (ZMQ) + tcp://* (ZMQ - listening to all interfaces) +""" + +#COORDINATION_URL = "advert://localhost/?dbtype=sqlite3" +#COORDINATION_URL = "advert://SAGA:SAGA_client@advert.cct.lsu.edu:8080/?dbtype=postgresql" +#COORDINATION_URL = "tcp://*" +COORDINATION_URL = "redis://localhost:6379" +#COORDINATION_URL = "redis://@gw68.quarry.iu.teragrid.org:6379" +#COORDINATION_URL="sqlasyncadvert://gw68.quarry.iu.teragrid.org/" + +# for running BJ from local dir +sys.path.insert(0, os.path.join(os.getcwd(), "..")) +#sys.path.append(os.path.join(os.path.dirname(__file__), "../..")) +#sys.path.insert(0, os.getcwd() + "/../") + +print(str(sys.path)) +from bigjob import bigjob, subjob, description + +def main(): + # Start BigJob + + ########################################################################################## + # Edit parameters for BigJob + queue=None # if None default queue is used + project=None # if None default allocation is used + walltime=10 + processes_per_node=4 + number_of_processes = 8 + workingdirectory=os.path.join(os.getcwd(), "agent") # working directory for agent + userproxy = None # userproxy (not supported yet due to context issue w/ SAGA) + + + """ + URL of the SAGA Job Service that is used to dispatch the pilot job. 
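The file-staging examples in this patch build their transfer directives as plain "source > target-directory" strings, with BIGJOB_WORK_DIR for pilot-level staging and SUBJOB_WORK_DIR for per-sub-job staging. A minimal sketch of that convention, assuming the ssh:// scheme and the placeholder names used in this example; the stage_to helper itself is only illustrative:

    import os

    def stage_to(local_file, target="SUBJOB_WORK_DIR"):
        # Build an "ssh://<absolute source path> > <target placeholder>" directive,
        # the string form consumed by bj_filetransfers and jd.file_transfer below.
        return "ssh://" + os.path.abspath(local_file) + " > " + target

    # stage_to("test.txt")                     -> sub-job level staging
    # stage_to("test.txt", "BIGJOB_WORK_DIR")  -> pilot level staging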
+ The following URLs are accepted: + + lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI) + lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines) + lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system. + lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine. + lrms_url = "xt5torque://localhost" # torque resource url. + + Please ensure that the respective SAGA adaptor is installed and working + """ + lrms_url = "fork://localhost" # resource url to run the jobs on localhost + + ########################################################################################## + + print "Start Pilot Job/BigJob at: " + lrms_url + bj = bigjob(COORDINATION_URL) + + #bj_filetransfers = ["ssh://" + os.path.dirname(os.path.abspath(__file__)) + # + "/test.txt > BIGJOB_WORK_DIR"] + bj_filetransfers=None + + bj.start_pilot_job( lrms_url, + number_of_processes, + queue, + project, + workingdirectory, + userproxy, + walltime, + processes_per_node, + bj_filetransfers) + + print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()) + + ########################################################################################## + # Submit SubJob through BigJob + jd = description() + jd.executable = "/bin/cat" + jd.number_of_processes = "1" + jd.spmd_variation = "single" + jd.arguments = ["test.txt"] + #jd.working_directory = os.getcwd() + jd.output = "stdout.txt" + jd.error = "stderr.txt" + jd.file_transfer = ["ssh://" + os.path.dirname(os.path.abspath(__file__)) + + "/test.txt > SUBJOB_WORK_DIR"] + + sj = subjob() + sj.submit_job(bj.pilot_url, jd) + + ######################################### + # busy wait for completion + while 1: + state = str(sj.get_state()) + print "state: " + state + if(state=="Failed" or state=="Done"): + break + time.sleep(2) + + ########################################################################################## + # Cleanup - stop BigJob + bj.cancel() + #time.sleep(30) + + +""" Test Job Submission via Advert """ +if __name__ == "__main__": + main() + diff --git a/tests/bigjob-api/example_ls_single.py b/tests/bigjob-api/example_ls_single.py index e4c52b59..8b291d39 100644 --- a/tests/bigjob-api/example_ls_single.py +++ b/tests/bigjob-api/example_ls_single.py @@ -63,7 +63,7 @@ def main(): ########################################################################################## - print "Start Pilot Job/BigJob at: " + lrms_url + print("Start Pilot Job/BigJob at: " + lrms_url) bj = bigjob(COORDINATION_URL) bj.start_pilot_job( lrms_url, number_of_processes, @@ -74,7 +74,7 @@ def main(): walltime, processes_per_node) - print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()) + print("Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state())) ########################################################################################## # Submit SubJob through BigJob @@ -93,7 +93,7 @@ def main(): # busy wait for completion while 1: state = str(sj.get_state()) - print "state: " + state + print("state: " + state) if(state=="Failed" or state=="Done"): break time.sleep(2) diff --git a/tests/bigjob-api/example_ls_single.py.bak b/tests/bigjob-api/example_ls_single.py.bak new file mode 100644 index 00000000..e4c52b59 --- /dev/null +++ b/tests/bigjob-api/example_ls_single.py.bak @@ -0,0 +1,110 @@ 
+""" Example application demonstrating job submission via bigjob + + DON'T EDIT THIS FILE (UNLESS THERE IS A BUG) + + THIS FILE SHOULD NOT BE COMMITTED TO SVN WITH USER-SPECIFIC PATHS! +""" +import os +import time +import pdb +import sys + +# configuration +""" This variable defines the coordination system that is used by BigJob + e.g. + advert://localhost (SAGA/Advert SQLITE) + advert://advert.cct.lsu.edu:8080 (SAGA/Advert POSTGRESQL) + advert://advert.cct.lsu.edu:5432 (SAGA/Advert POSTGRESQL) + redis://localhost:6379 (Redis at localhost) + tcp://localhost (ZMQ) + tcp://* (ZMQ - listening to all interfaces) +""" + +COORDINATION_URL = "advert://localhost/?dbtype=sqlite3" +#COORDINATION_URL = "advert://SAGA:SAGA_client@advert.cct.lsu.edu:8080/?dbtype=postgresql" +#COORDINATION_URL = "tcp://*" +#COORDINATION_URL = "redis://localhost:6379" +#COORDINATION_URL = "redis://gw68.quarry.iu.teragrid.org:2525" +#COORDINATION_URL="sqlasyncadvert://gw68.quarry.iu.teragrid.org/" + +# for running BJ from local dir +sys.path.insert(0, os.getcwd() + "/../") + +from bigjob import bigjob, subjob, description + + +def main(): + # Start BigJob + + ########################################################################################## + # Edit parameters for BigJob + queue=None # if None default queue is used + project=None # if None default allocation is used + walltime=10 + processes_per_node=12 + number_of_processes = 12 + workingdirectory=os.path.join(os.getcwd(), "agent") # working directory for agent + userproxy = None # userproxy (not supported yet due to context issue w/ SAGA) + + + """ + URL of the SAGA Job Service that is used to dispatch the pilot job. + The following URLs are accepted: + + lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI) + lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines) + lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system. + lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine. + lrms_url = "xt5torque://localhost" # torque resource url. 
+ + Please ensure that the respective SAGA adaptor is installed and working + """ + lrms_url = "gram://gridftp1.ls4.tacc.utexas.edu:2119/jobmanager-sge" # resource url to run the jobs on localhost + + ########################################################################################## + + print "Start Pilot Job/BigJob at: " + lrms_url + bj = bigjob(COORDINATION_URL) + bj.start_pilot_job( lrms_url, + number_of_processes, + queue, + project, + workingdirectory, + userproxy, + walltime, + processes_per_node) + + print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()) + + ########################################################################################## + # Submit SubJob through BigJob + jd = description() + jd.executable = "/bin/date" + jd.number_of_processes = "1" + jd.spmd_variation = "single" + jd.arguments = [""] + #jd.working_directory = "/tmp" + jd.output = "stdout.txt" + jd.error = "stderr.txt" + sj = subjob() + sj.submit_job(bj.pilot_url, jd) + + ######################################### + # busy wait for completion + while 1: + state = str(sj.get_state()) + print "state: " + state + if(state=="Failed" or state=="Done"): + break + time.sleep(2) + + ########################################################################################## + # Cleanup - stop BigJob + bj.cancel() + #time.sleep(30) + + +""" Test Job Submission via Advert """ +if __name__ == "__main__": + main() + diff --git a/tests/bigjob-api/example_manyjob_affinity.py b/tests/bigjob-api/example_manyjob_affinity.py index b6f2258d..fbf890d3 100644 --- a/tests/bigjob-api/example_manyjob_affinity.py +++ b/tests/bigjob-api/example_manyjob_affinity.py @@ -41,7 +41,7 @@ def has_finished(state): """ Test Job Submission via ManyJob abstraction """ if __name__ == "__main__": try: - print "ManyJob load test with " + str(NUMBER_JOBS) + " jobs." + print("ManyJob load test with " + str(NUMBER_JOBS) + " jobs.") starttime=time.time() # submit via mj abstraction @@ -55,7 +55,7 @@ def has_finished(state): # "queue" : "workq", "bigjob_agent": (BIGJOB_HOME+"/bigjob_agent_launcher.sh"), # "working_directory": (os.getcwd() + "/agent"), "walltime":10, "affinity" : "affinity1"}) - print "Create manyjob service " + print("Create manyjob service ") mjs = many_job_affinity_service(resource_list, COORDINATION_URL) jobs = [] @@ -74,37 +74,37 @@ def has_finished(state): jd.environment = ["affinity=affinity1","VAR=USER"] subjob = mjs.create_job(jd) subjob.run() - print "Submited sub-job " + "%d"%i + "." + print("Submited sub-job " + "%d"%i + ".") jobs.append(subjob) job_start_times[subjob]=time.time() job_states[subjob] = subjob.get_state() - print "************************ All Jobs submitted ************************" + print("************************ All Jobs submitted ************************") while 1: finish_counter=0 result_map = {} for i in range(0, NUMBER_JOBS): old_state = job_states[jobs[i]] state = jobs[i].get_state() - if result_map.has_key(state) == False: + if (state in result_map) == False: result_map[state]=0 result_map[state] = result_map[state]+1 #print "counter: " + str(i) + " job: " + str(jobs[i]) + " state: " + state if old_state != state: - print "Job " + str(jobs[i]) + " changed from: " + old_state + " to " + state + print("Job " + str(jobs[i]) + " changed from: " + old_state + " to " + state) if old_state != state and has_finished(state)==True: - print "Job: " + str(jobs[i]) + " Runtime: " + str(time.time()-job_start_times[jobs[i]]) + " s." 
+ print("Job: " + str(jobs[i]) + " Runtime: " + str(time.time()-job_start_times[jobs[i]]) + " s.") if has_finished(state)==True: finish_counter = finish_counter + 1 job_states[jobs[i]]=state - print "Current states: " + str(result_map) + print("Current states: " + str(result_map)) time.sleep(5) if finish_counter == NUMBER_JOBS: break mjs.cancel() runtime = time.time()-starttime - print "Runtime: " + str(runtime) + " s; Runtime per Job: " + str(runtime/NUMBER_JOBS) + print("Runtime: " + str(runtime) + " s; Runtime per Job: " + str(runtime/NUMBER_JOBS)) except: traceback.print_exc(file=sys.stdout) try: diff --git a/tests/bigjob-api/example_manyjob_affinity.py.bak b/tests/bigjob-api/example_manyjob_affinity.py.bak new file mode 100644 index 00000000..b6f2258d --- /dev/null +++ b/tests/bigjob-api/example_manyjob_affinity.py.bak @@ -0,0 +1,114 @@ +#!/usr/bin/env python + +"""many_job example with affinity. + +This Module is used to launch a set of bigjobs. + +""" +import getopt +import time +import pdb +import os +import traceback +import logging +import sys + +sys.path.insert(0, os.getcwd() + "/../") + +# Big Job Imports +from bigjob import bigjob, subjob, description +from bigjob_dynamic.many_job_affinity import * + +""" This variable defines the coordination system that is used by BigJob + e.g. + advert://localhost (SAGA/Advert SQLITE) + advert://advert.cct.lsu.edu:8080 (SAGA/Advert POSTGRESQL) + redis://localhost:6379 (Redis at localhost) + tcp://localhost (ZMQ) +""" +COORDINATION_URL = "advert://localhost/?dbtype=sqlite3" + +NUMBER_JOBS=8 + + +def has_finished(state): + state = state.lower() + if state=="done" or state=="failed" or state=="canceled": + return True + else: + return False + +""" Test Job Submission via ManyJob abstraction """ +if __name__ == "__main__": + try: + print "ManyJob load test with " + str(NUMBER_JOBS) + " jobs." + starttime=time.time() + + # submit via mj abstraction + resource_list = [] + resource_list.append( {"resource_url" : "fork://localhost/", "number_of_processes" : "2", "allocation" : "myAllocation", + "queue" : "workq", + "working_directory": (os.getcwd() + "/agent"), + "walltime": 10, "affinity" : "affinity1"}) + + #resource_list.append( {"resource_url" : "gram://oliver1.loni.org/jobmanager-pbs", "number_nodes" : "4", "allocation" : "", + # "queue" : "workq", "bigjob_agent": (BIGJOB_HOME+"/bigjob_agent_launcher.sh"), + # "working_directory": (os.getcwd() + "/agent"), "walltime":10, "affinity" : "affinity1"}) + + print "Create manyjob service " + mjs = many_job_affinity_service(resource_list, COORDINATION_URL) + + jobs = [] + job_start_times = {} + job_states = {} + cwd = os.getcwd() + for i in range(0, NUMBER_JOBS): + # create job description + jd = description() + jd.executable = "/bin/echo" + jd.number_of_processes = "1" + jd.spmd_variation = "single" + jd.arguments = ["Hello","$VAR"] + jd.output = "stdout-" + str(i) + ".txt" + jd.error = "stderr-" + str(i) + ".txt" + jd.environment = ["affinity=affinity1","VAR=USER"] + subjob = mjs.create_job(jd) + subjob.run() + print "Submited sub-job " + "%d"%i + "." 
+ jobs.append(subjob) + job_start_times[subjob]=time.time() + job_states[subjob] = subjob.get_state() + print "************************ All Jobs submitted ************************" + while 1: + finish_counter=0 + result_map = {} + for i in range(0, NUMBER_JOBS): + old_state = job_states[jobs[i]] + state = jobs[i].get_state() + if result_map.has_key(state) == False: + result_map[state]=0 + result_map[state] = result_map[state]+1 + #print "counter: " + str(i) + " job: " + str(jobs[i]) + " state: " + state + if old_state != state: + print "Job " + str(jobs[i]) + " changed from: " + old_state + " to " + state + if old_state != state and has_finished(state)==True: + print "Job: " + str(jobs[i]) + " Runtime: " + str(time.time()-job_start_times[jobs[i]]) + " s." + if has_finished(state)==True: + finish_counter = finish_counter + 1 + job_states[jobs[i]]=state + + print "Current states: " + str(result_map) + time.sleep(5) + if finish_counter == NUMBER_JOBS: + break + + mjs.cancel() + runtime = time.time()-starttime + print "Runtime: " + str(runtime) + " s; Runtime per Job: " + str(runtime/NUMBER_JOBS) + except: + traceback.print_exc(file=sys.stdout) + try: + mjs.cancel() + except: + pass + diff --git a/tests/bigjob-api/example_manyjob_local.py b/tests/bigjob-api/example_manyjob_local.py index 9e33d744..38b88e74 100644 --- a/tests/bigjob-api/example_manyjob_local.py +++ b/tests/bigjob-api/example_manyjob_local.py @@ -51,7 +51,7 @@ def has_finished(state): def main(): try: - print "ManyJob load test with " + str(NUMBER_JOBS) + " jobs." + print("ManyJob load test with " + str(NUMBER_JOBS) + " jobs.") starttime=time.time() """ submit via mj abstraction @@ -73,7 +73,7 @@ def main(): remove_additional_resources=False - print "Create Dynamic BigJob Service " + print("Create Dynamic BigJob Service ") mjs = many_job_service(resource_list, COORDINATION_URL) jobs = [] @@ -92,25 +92,25 @@ def main(): jd.error = "stderr-" + str(i) + ".txt" subjob = mjs.create_job(jd) subjob.run() - print "Submited sub-job " + "%d"%i + "." + print("Submited sub-job " + "%d"%i + ".") jobs.append(subjob) job_start_times[subjob]=time.time() job_states[subjob] = subjob.get_state() - print "************************ All Jobs submitted ************************" + print("************************ All Jobs submitted ************************") while 1: finish_counter=0 result_map = {} for i in range(0, NUMBER_JOBS): old_state = job_states[jobs[i]] state = jobs[i].get_state() - if result_map.has_key(state) == False: + if (state in result_map) == False: result_map[state]=0 result_map[state] = result_map[state]+1 #print "counter: " + str(i) + " job: " + str(jobs[i]) + " state: " + state if old_state != state: - print "Job " + str(jobs[i]) + " changed from: " + old_state + " to " + state + print("Job " + str(jobs[i]) + " changed from: " + old_state + " to " + state) if old_state != state and has_finished(state)==True: - print "Job: " + str(jobs[i]) + " Runtime: " + str(time.time()-job_start_times[jobs[i]]) + " s." 
+ print("Job: " + str(jobs[i]) + " Runtime: " + str(time.time()-job_start_times[jobs[i]]) + " s.") if has_finished(state)==True: finish_counter = finish_counter + 1 job_states[jobs[i]]=state @@ -118,7 +118,7 @@ def main(): # Dynamic BigJob add resources at runtime # if more than 30 s - add additional resource if time.time()-starttime > 10 and add_additional_resources==True: - print "***add additional resources***" + print("***add additional resources***") mjs.add_resource(resource_dictionary) add_additional_resources=False @@ -126,18 +126,18 @@ def main(): if (time.time()-starttime > 15 and remove_additional_resources==True): bj_list = mjs.get_resources() if len(bj_list)>0: - print "***remove resources: " + str(bj_list[0]) + print("***remove resources: " + str(bj_list[0])) mjs.remove_resource(bj_list[0]) remove_additional_resources=False - print "Current states: " + str(result_map) + print("Current states: " + str(result_map)) time.sleep(5) if finish_counter == NUMBER_JOBS: break mjs.cancel() runtime = time.time()-starttime - print "Runtime: " + str(runtime) + " s; Runtime per Job: " + str(runtime/NUMBER_JOBS) + print("Runtime: " + str(runtime) + " s; Runtime per Job: " + str(runtime/NUMBER_JOBS)) except: traceback.print_exc(file=sys.stdout) try: diff --git a/tests/bigjob-api/example_manyjob_local.py.bak b/tests/bigjob-api/example_manyjob_local.py.bak new file mode 100644 index 00000000..9e33d744 --- /dev/null +++ b/tests/bigjob-api/example_manyjob_local.py.bak @@ -0,0 +1,150 @@ +#!/usr/bin/env python + +""" +Dynamic BigJob (ManyJob) Example + +This Module is used to launch a set of bigjobs. + +DON'T EDIT THIS FILE (UNLESS THERE IS A BUG) + +THIS FILE SHOULD NOT BE COMMITTED TO SVN WITH USE-SPECIFIC PATHS! + +""" + +import sys +import getopt +import time +import pdb +import os +import traceback +import logging + + +# BigJob implementation can be swapped here by importing another implementation, +# e.g. condor, cloud, azure +import sys + +sys.path.insert(0, os.getcwd() + "/../") + +from bigjob import bigjob, subjob, description +from bigjob_dynamic.many_job import * + + +""" This variable defines the coordination system that is used by BigJob + e.g. + advert://localhost (SAGA/Advert SQLITE) + advert://advert.cct.lsu.edu:8080 (SAGA/Advert POSTGRESQL) + redis://localhost:6379 (Redis at localhost) + tcp://localhost (ZMQ) +""" +COORDINATION_URL = "advert://localhost/?dbtype=sqlite3" +#COORDINATION_URL = "advert://advert.cct.lsu.edu:8080" + +NUMBER_JOBS=8 + +def has_finished(state): + state = state.lower() + if state=="done" or state=="failed" or state=="canceled": + return True + else: + return False + +def main(): + try: + print "ManyJob load test with " + str(NUMBER_JOBS) + " jobs." 
+ starttime=time.time() + + """ submit via mj abstraction + + resource_list.append( {"resource_url" : "gram://eric1.loni.org/jobmanager-pbs", "processes_per_node":"4", + "number_of_processes" : "4", "allocation" : None, "queue" : "workq", + "working_directory": (os.getcwd() + "/agent"), "walltime":10 }) + + """ + resource_list = [] + resource_dictionary = {"resource_url" : "fork://localhost/", "number_of_processes" : "32", + "processes_per_node":"1", "allocation" : None, "queue" : None, + "working_directory": (os.getcwd() + "/agent"), "walltime":3600 } + resource_list.append(resource_dictionary) + + + #Flags for controlling dynamic BigJob + add_additional_resources=True + remove_additional_resources=False + + + print "Create Dynamic BigJob Service " + mjs = many_job_service(resource_list, COORDINATION_URL) + + jobs = [] + job_start_times = {} + job_states = {} + cwd = os.getcwd() + for i in range(0, NUMBER_JOBS): + # create job description + jd = description() + jd.executable = "/bin/date" + jd.number_of_processes = "1" + jd.spmd_variation = "single" + jd.arguments = [""] + jd.working_directory = os.getcwd(); + jd.output = "stdout-" + str(i) + ".txt" + jd.error = "stderr-" + str(i) + ".txt" + subjob = mjs.create_job(jd) + subjob.run() + print "Submited sub-job " + "%d"%i + "." + jobs.append(subjob) + job_start_times[subjob]=time.time() + job_states[subjob] = subjob.get_state() + print "************************ All Jobs submitted ************************" + while 1: + finish_counter=0 + result_map = {} + for i in range(0, NUMBER_JOBS): + old_state = job_states[jobs[i]] + state = jobs[i].get_state() + if result_map.has_key(state) == False: + result_map[state]=0 + result_map[state] = result_map[state]+1 + #print "counter: " + str(i) + " job: " + str(jobs[i]) + " state: " + state + if old_state != state: + print "Job " + str(jobs[i]) + " changed from: " + old_state + " to " + state + if old_state != state and has_finished(state)==True: + print "Job: " + str(jobs[i]) + " Runtime: " + str(time.time()-job_start_times[jobs[i]]) + " s." 
+ if has_finished(state)==True: + finish_counter = finish_counter + 1 + job_states[jobs[i]]=state + + # Dynamic BigJob add resources at runtime + # if more than 30 s - add additional resource + if time.time()-starttime > 10 and add_additional_resources==True: + print "***add additional resources***" + mjs.add_resource(resource_dictionary) + add_additional_resources=False + + # remove resources from dynamic bigjob + if (time.time()-starttime > 15 and remove_additional_resources==True): + bj_list = mjs.get_resources() + if len(bj_list)>0: + print "***remove resources: " + str(bj_list[0]) + mjs.remove_resource(bj_list[0]) + remove_additional_resources=False + + print "Current states: " + str(result_map) + time.sleep(5) + if finish_counter == NUMBER_JOBS: + break + + mjs.cancel() + runtime = time.time()-starttime + print "Runtime: " + str(runtime) + " s; Runtime per Job: " + str(runtime/NUMBER_JOBS) + except: + traceback.print_exc(file=sys.stdout) + try: + mjs.cancel() + except: + pass + +""" Test Job Submission via ManyJob abstraction """ +if __name__ == "__main__": + main() diff --git a/tests/bigjob-api/example_single_filestaging_globusonline.py b/tests/bigjob-api/example_single_filestaging_globusonline.py index ce137dac..7d7ee811 100644 --- a/tests/bigjob-api/example_single_filestaging_globusonline.py +++ b/tests/bigjob-api/example_single_filestaging_globusonline.py @@ -36,7 +36,7 @@ #sys.path.append(os.path.join(os.path.dirname(__file__), "../..")) #sys.path.insert(0, os.getcwd() + "/../") -print(str(sys.path)) +print((str(sys.path))) from bigjob import bigjob, subjob, description ############################################################################################## @@ -77,7 +77,7 @@ def main(): workingdirectory="go://"+GLOBUS_ONLINE_USER+":"+GLOBUS_ONLINE_PASSWORD+"@globusonline.org?ep=xsede#lonestar4&path=~/bigjob/" ########################################################################################## - print "Start Pilot Job/BigJob at: " + lrms_url + print("Start Pilot Job/BigJob at: " + lrms_url) bj = bigjob(COORDINATION_URL) bj_filetransfers = ["go://"+GLOBUS_ONLINE_USER+":"+GLOBUS_ONLINE_PASSWORD+"@globusonline.org?ep=drelu#MacBook&path=" + os.path.dirname(os.path.abspath(__file__)) @@ -95,7 +95,7 @@ def main(): processes_per_node, bj_filetransfers) - print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()) + print("Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state())) ########################################################################################## # Submit SubJob through BigJob @@ -116,7 +116,7 @@ def main(): # busy wait for completion while 1: state = str(sj.get_state()) - print "state: " + state + print("state: " + state) if(state=="Failed" or state=="Done"): break time.sleep(2) diff --git a/tests/bigjob-api/example_single_filestaging_globusonline.py.bak b/tests/bigjob-api/example_single_filestaging_globusonline.py.bak new file mode 100644 index 00000000..ce137dac --- /dev/null +++ b/tests/bigjob-api/example_single_filestaging_globusonline.py.bak @@ -0,0 +1,133 @@ +""" Example application demonstrating job submission via bigjob + + DON'T EDIT THIS FILE (UNLESS THERE IS A BUG) + + THIS FILE SHOULD NOT BE COMMITTED TO SVN WITH USER-SPECIFIC PATHS! +""" + +import os +import time +import pdb +import sys + +#import bigjob +#bigjob.SAGA_BLISS=False + +# configuration +""" This variable defines the coordination system that is used by BigJob + e.g. 
+ advert://localhost (SAGA/Advert SQLITE) + advert://advert.cct.lsu.edu:8080 (SAGA/Advert POSTGRESQL) + advert://advert.cct.lsu.edu:5432 (SAGA/Advert POSTGRESQL) + redis://localhost:6379 (Redis at localhost) + tcp://localhost (ZMQ) + tcp://* (ZMQ - listening to all interfaces) +""" + +COORDINATION_URL = "advert://localhost/?dbtype=sqlite3" +#COORDINATION_URL = "advert://SAGA:SAGA_client@advert.cct.lsu.edu:8080/?dbtype=postgresql" +#COORDINATION_URL = "tcp://*" +#COORDINATION_URL = "redis://localhost:6379" +#COORDINATION_URL = "redis://@gw68.quarry.iu.teragrid.org:6379" +#COORDINATION_URL="sqlasyncadvert://gw68.quarry.iu.teragrid.org/" + +# for running BJ from local dir +sys.path.insert(0, os.path.join(os.getcwd(), "..")) +#sys.path.append(os.path.join(os.path.dirname(__file__), "../..")) +#sys.path.insert(0, os.getcwd() + "/../") + +print(str(sys.path)) +from bigjob import bigjob, subjob, description + +############################################################################################## +# PLEASE SET YOUR GLOBUS ONLINE PASSWORD HERE!!! + +GLOBUS_ONLINE_USER="xxxx" +GLOBUS_ONLINE_PASSWORD="xxxx" + +def main(): + # Start BigJob + + ########################################################################################## + # Edit parameters for BigJob + queue=None # if None default queue is used + project=None # if None default allocation is used + walltime=10 + processes_per_node=4 + number_of_processes = 8 + userproxy = None # userproxy (not supported yet due to context issue w/ SAGA) + + """ + URL of the SAGA Job Service that is used to dispatch the pilot job. + The following URLs are accepted: + + lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI) + lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines) + lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system. + lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine. + lrms_url = "xt5torque://localhost" # torque resource url. 
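Nearly every test in this section repeats the same busy-wait loop over sj.get_state(). A small helper capturing that pattern, assuming only the subjob API and the 2-second poll interval shown in these examples (illustrative only):

    import time

    def wait_for_subjob(sj, poll_interval=2):
        # Poll until the sub-job reaches a terminal state, as the while-loops
        # in these examples do by hand; returns the final state string.
        while True:
            state = str(sj.get_state())
            print("state: " + state)
            if state == "Failed" or state == "Done":
                return state
            time.sleep(poll_interval)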
+ + Please ensure that the respective SAGA adaptor is installed and working + """ + lrms_url = "sge-ssh://lonestar.tacc.teragrid.org" + + """ + To use Globus Online the working directory must be specified using the following conventions + """ + workingdirectory="go://"+GLOBUS_ONLINE_USER+":"+GLOBUS_ONLINE_PASSWORD+"@globusonline.org?ep=xsede#lonestar4&path=~/bigjob/" + ########################################################################################## + + print "Start Pilot Job/BigJob at: " + lrms_url + bj = bigjob(COORDINATION_URL) + + bj_filetransfers = ["go://"+GLOBUS_ONLINE_USER+":"+GLOBUS_ONLINE_PASSWORD+"@globusonline.org?ep=drelu#MacBook&path=" + os.path.dirname(os.path.abspath(__file__)) + + "/test.txt > BIGJOB_WORK_DIR"] + + + bj.start_pilot_job( lrms_url, + None, + number_of_processes, + queue, + project, + workingdirectory, + userproxy, + walltime, + processes_per_node, + bj_filetransfers) + + print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()) + + ########################################################################################## + # Submit SubJob through BigJob + jd = description() + jd.executable = "/bin/cat" + jd.number_of_processes = "1" + jd.spmd_variation = "single" + jd.arguments = ["test.txt"] + jd.output = "stdout.txt" + jd.error = "stderr.txt" + jd.file_transfer = ["go://"+GLOBUS_ONLINE_USER+":"+GLOBUS_ONLINE_PASSWORD+"@globusonline.org?ep=drelu#MacBook&path=" + os.path.dirname(os.path.abspath(__file__)) + + "/test.txt > SUBJOB_WORK_DIR"] + + sj = subjob() + sj.submit_job(bj.pilot_url, jd) + + ######################################### + # busy wait for completion + while 1: + state = str(sj.get_state()) + print "state: " + state + if(state=="Failed" or state=="Done"): + break + time.sleep(2) + + ########################################################################################## + # Cleanup - stop BigJob + bj.cancel() + #time.sleep(30) + + +""" Test Job Submission via Advert """ +if __name__ == "__main__": + main() + diff --git a/tests/condor/example_condor_single.py b/tests/condor/example_condor_single.py index 52c324de..d87332ba 100644 --- a/tests/condor/example_condor_single.py +++ b/tests/condor/example_condor_single.py @@ -68,7 +68,7 @@ def main(): input_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "test.txt") bj_filetransfers = [input_file +" > test.txt"] - print "Start Pilot Job/BigJob at: " + lrms_url + print("Start Pilot Job/BigJob at: " + lrms_url) bj = bigjob(COORDINATION_URL) bj.start_pilot_job( lrms_url, None, @@ -81,7 +81,7 @@ def main(): processes_per_node, bj_filetransfers) - print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()) + print("Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state())) ########################################################################################## # Submit SubJob through BigJob @@ -103,7 +103,7 @@ def main(): while 1: state = str(sj.get_state()) bj_state = bj.get_state() - print "bj state: " + str(bj_state) + " state: " + state + print("bj state: " + str(bj_state) + " state: " + state) if(state=="Failed" or state=="Done"): break time.sleep(2) diff --git a/tests/condor/example_condor_single.py.bak b/tests/condor/example_condor_single.py.bak new file mode 100644 index 00000000..52c324de --- /dev/null +++ b/tests/condor/example_condor_single.py.bak @@ -0,0 +1,120 @@ +""" Example application demonstrating job submission via bigjob + + DON'T EDIT THIS FILE (UNLESS THERE IS A BUG) + + THIS FILE SHOULD NOT 
BE COMMITTED TO SVN WITH USER-SPECIFIC PATHS! +""" +import os +import time +import pdb +import sys + +# configuration +""" This variable defines the coordination system that is used by BigJob + e.g. + advert://localhost (SAGA/Advert SQLITE) + advert://advert.cct.lsu.edu:8080 (SAGA/Advert POSTGRESQL) + advert://advert.cct.lsu.edu:5432 (SAGA/Advert POSTGRESQL) + redis://localhost:6379 (Redis at localhost) + tcp://localhost (ZMQ) + tcp://* (ZMQ - listening to all interfaces) +""" + +#COORDINATION_URL = "advert://localhost/?dbtype=sqlite3" +#COORDINATION_URL = "advert://SAGA:SAGA_client@advert.cct.lsu.edu:8080/?dbtype=postgresql" +#COORDINATION_URL = "tcp://*" +#COORDINATION_URL = "redis://localhost:6379" + +COORDINATION_URL = "redis://gw68.quarry.iu.teragrid.org:2525" +#COORDINATION_URL="sqlasyncadvert://gw68.quarry.iu.teragrid.org/" + +# for running BJ from local dir +sys.path.insert(0, os.getcwd() + "/../../") + +from bigjob import bigjob, subjob, description + + +def main(): + # Start BigJob + + ########################################################################################## + # Edit parameters for BigJob + queue=None # if None default queue is used + project=None # if None default allocation is used + walltime=10 + processes_per_node=4 + number_of_processes = 8 + workingdirectory= os.path.join(os.getcwd(), "agent") + userproxy = None # userproxy (not supported yet due to context issue w/ SAGA) + + + """ + URL of the SAGA Job Service that is used to dispatch the pilot job. + The following URLs are accepted: + + lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI) + lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines) + lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system. + lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine. + lrms_url = "xt5torque://localhost" # torque resource url. 
+ + Please ensure that the respective SAGA adaptor is installed and working + """ + lrms_url = "condor://localhost" + + ########################################################################################## + + + input_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "test.txt") + bj_filetransfers = [input_file +" > test.txt"] + + print "Start Pilot Job/BigJob at: " + lrms_url + bj = bigjob(COORDINATION_URL) + bj.start_pilot_job( lrms_url, + None, + number_of_processes, + queue, + project, + workingdirectory, + userproxy, + walltime, + processes_per_node, + bj_filetransfers) + + print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()) + + ########################################################################################## + # Submit SubJob through BigJob + jd = description() + + jd.executable = "/bin/cat" + jd.number_of_processes = "1" + jd.spmd_variation = "single" + jd.arguments = ["test.txt"] + #jd.working_directory = "" + jd.output = "sj-stdout.txt" + jd.error = "sj-stderr.txt" + + sj = subjob() + sj.submit_job(bj.pilot_url, jd) + + ######################################### + # busy wait for completion + while 1: + state = str(sj.get_state()) + bj_state = bj.get_state() + print "bj state: " + str(bj_state) + " state: " + state + if(state=="Failed" or state=="Done"): + break + time.sleep(2) + + ########################################################################################## + # Cleanup - stop BigJob + bj.cancel() + #time.sleep(30) + + +""" Test Job Submission via Advert """ +if __name__ == "__main__": + main() + diff --git a/tests/condor/example_condorg_single.py b/tests/condor/example_condorg_single.py index 297326e6..60f888e1 100644 --- a/tests/condor/example_condorg_single.py +++ b/tests/condor/example_condorg_single.py @@ -66,7 +66,7 @@ def main(): #lrms_url = "ssh://smaddi2@cyder.cct.lsu.edu" ########################################################################################## - print "Start Pilot Job/BigJob at: " + lrms_url + print("Start Pilot Job/BigJob at: " + lrms_url) bj = bigjob(COORDINATION_URL) bj.start_pilot_job( lrms_url, None, @@ -78,7 +78,7 @@ def main(): walltime, processes_per_node) - print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()) + print("Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state())) ########################################################################################## # Submit SubJob through BigJob @@ -100,7 +100,7 @@ def main(): # busy wait for completion while 1: state = str(sj.get_state()) - print "state: " + state + print("state: " + state) if(state=="Failed" or state=="Done"): break time.sleep(2) diff --git a/tests/condor/example_condorg_single.py.bak b/tests/condor/example_condorg_single.py.bak new file mode 100644 index 00000000..297326e6 --- /dev/null +++ b/tests/condor/example_condorg_single.py.bak @@ -0,0 +1,117 @@ +""" Example application demonstrating job submission via bigjob + + DON'T EDIT THIS FILE (UNLESS THERE IS A BUG) + + THIS FILE SHOULD NOT BE COMMITTED TO SVN WITH USER-SPECIFIC PATHS! +""" +import os +import time +import pdb +import sys + +# configuration +""" This variable defines the coordination system that is used by BigJob + e.g. 
+ advert://localhost (SAGA/Advert SQLITE) + advert://advert.cct.lsu.edu:8080 (SAGA/Advert POSTGRESQL) + advert://advert.cct.lsu.edu:5432 (SAGA/Advert POSTGRESQL) + redis://localhost:6379 (Redis at localhost) + tcp://localhost (ZMQ) + tcp://* (ZMQ - listening to all interfaces) +""" + +#COORDINATION_URL = "advert://localhost/?dbtype=sqlite3" +#COORDINATION_URL = "advert://SAGA:SAGA_client@advert.cct.lsu.edu:8080/?dbtype=postgresql" +#COORDINATION_URL = "tcp://*" +#COORDINATION_URL = "redis://localhost:6379" + +COORDINATION_URL = "redis://gw68.quarry.iu.teragrid.org:2525" +#COORDINATION_URL="sqlasyncadvert://gw68.quarry.iu.teragrid.org/" + +# for running BJ from local dir +sys.path.insert(0, os.getcwd() + "/../../") + +from bigjob import bigjob, subjob, description + + +def main(): + # Start BigJob + + ########################################################################################## + # Edit parameters for BigJob + queue=None # if None default queue is used + project=None # if None default allocation is used + walltime=10 + processes_per_node=4 + number_of_processes = 8 + workingdirectory=os.path.join(os.getcwd(), "agent") # working directory for agent + userproxy = None # userproxy (not supported yet due to context issue w/ SAGA) + + + """ + URL of the SAGA Job Service that is used to dispatch the pilot job. + The following URLs are accepted: + + lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI) + lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines) + lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system. + lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine. + lrms_url = "xt5torque://localhost" # torque resource url. 
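The terminal-state check is spelled two ways in these tests: the busy-wait loops compare against "Failed"/"Done", while the has_finished() helpers lower-case the state and also accept "canceled". A one-line predicate that would cover both spellings (illustrative, not taken from the patch):

    def is_terminal(state):
        # Treats done/failed/canceled as terminal regardless of capitalisation,
        # matching the union of the two conventions used in these examples.
        return str(state).lower() in ("done", "failed", "canceled")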
+ + Please ensure that the respective SAGA adaptor is installed and working + """ + #lrms_url = "fork://localhost" # resource url to run the jobs on localhost + lrms_url = "condorg://brgw1.renci.org:2119/jobmanager-pbs" + + #lrms_url = "ssh://smaddi2@cyder.cct.lsu.edu" + ########################################################################################## + + print "Start Pilot Job/BigJob at: " + lrms_url + bj = bigjob(COORDINATION_URL) + bj.start_pilot_job( lrms_url, + None, + number_of_processes, + queue, + project, + workingdirectory, + userproxy, + walltime, + processes_per_node) + + print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()) + + ########################################################################################## + # Submit SubJob through BigJob + jd = description() + + jd.executable = "/bin/date" + jd.number_of_processes = "1" + jd.spmd_variation = "single" + #jd.arguments = ["match -f bgr1.fa -A 0 -r reads_1.fastq -n 4 -T /tmp/ > bfast.matches.file.bgr.1.bmf"] + jd.arguments = [""] + #jd.working_directory = "" + jd.output = "bfast-stdout.txt" + jd.error = "bfast-stderr.txt" + + sj = subjob() + sj.submit_job(bj.pilot_url, jd) + + ######################################### + # busy wait for completion + while 1: + state = str(sj.get_state()) + print "state: " + state + if(state=="Failed" or state=="Done"): + break + time.sleep(2) + + ########################################################################################## + # Cleanup - stop BigJob + bj.cancel() + #time.sleep(30) + + +""" Test Job Submission via Advert """ +if __name__ == "__main__": + main() + diff --git a/tests/performance/throughput.py b/tests/performance/throughput.py index 1ff6d2fc..ea7e57c6 100644 --- a/tests/performance/throughput.py +++ b/tests/performance/throughput.py @@ -50,7 +50,7 @@ runtime=time.time()-start print("Number Slots, Number CUs, Runtime, Throughput") - print("%d,%d,%f,%f"%(NUMBER_SLOTS,NUMBER_CUS,runtime, runtime/NUMBER_CUS)) + print(("%d,%d,%f,%f"%(NUMBER_SLOTS,NUMBER_CUS,runtime, runtime/NUMBER_CUS))) print("Terminate Pilot Compute and Compute Data Service") pilot_compute_service.cancel() diff --git a/tests/performance/throughput.py.bak b/tests/performance/throughput.py.bak new file mode 100644 index 00000000..1ff6d2fc --- /dev/null +++ b/tests/performance/throughput.py.bak @@ -0,0 +1,56 @@ +import sys +import os +import time + +from pilot import PilotComputeService, ComputeDataService, State + + +COORDINATION_URL = "redis://localhost" +NUMBER_CUS=128 +NUMBER_SLOTS=16 +if __name__ == "__main__": + + pilot_compute_service = PilotComputeService(coordination_url=COORDINATION_URL) + + # create pilot job service and initiate a pilot job + pilot_compute_description = { + "service_url": 'fork://localhost', + "number_of_processes": NUMBER_SLOTS, + "working_directory": os.getcwd(), + "number_executor_threads": 16 + } + + pilotjob = pilot_compute_service.create_pilot(pilot_compute_description=pilot_compute_description) + + while pilotjob.get_state()!="Running": + time.sleep(2) + + start = time.time() + # start work unit + compute_unit_description = { + "executable": "/bin/date", + "arguments": [""], + "number_of_processes": 1, + "output": "stdout.txt", + "error": "stderr.txt", + } + + cds = ComputeDataService() + cds.add_pilot_compute_service(pilot_compute_service) + + unitservice = pilotjob + + for i in range(0,NUMBER_CUS): + compute_unit = unitservice.submit_compute_unit(compute_unit_description) + + + print("Finished setup. 
Waiting for scheduling of CU") + unitservice.wait() + + + runtime=time.time()-start + print("Number Slots, Number CUs, Runtime, Throughput") + print("%d,%d,%f,%f"%(NUMBER_SLOTS,NUMBER_CUS,runtime, runtime/NUMBER_CUS)) + + print("Terminate Pilot Compute and Compute Data Service") + pilot_compute_service.cancel() diff --git a/tests/test_connection_pooling.py b/tests/test_connection_pooling.py index c57f4c3a..5f90ba2a 100644 --- a/tests/test_connection_pooling.py +++ b/tests/test_connection_pooling.py @@ -23,7 +23,7 @@ for i in range(0, N): - print "start %3d" % i + print("start %3d" % i) pilot_description = pilot.PilotComputeDescription() pilot_description.service_url = HOST @@ -50,10 +50,10 @@ # see https://github.com/saga-project/BigJob/issues/131 for i, pj in enumerate(pjs): - print "cancel %3d" % i + print("cancel %3d" % i) pj.cancel() pilot_service.cancel() -print "time: %.1fs rate: %.1f/s" % (stop-start, N/(stop-start)) +print("time: %.1fs rate: %.1f/s" % (stop-start, N/(stop-start))) diff --git a/tests/test_connection_pooling.py.bak b/tests/test_connection_pooling.py.bak new file mode 100644 index 00000000..c57f4c3a --- /dev/null +++ b/tests/test_connection_pooling.py.bak @@ -0,0 +1,59 @@ +import os +import time +import pilot + +try: + import pudb + pudb.set_interrupt_handler() +except: + pass + +######################################################################### +## +redis_password = os.environ.get('REDIS_PASSWORD') +COORD = "redis://%s@gw68.quarry.iu.teragrid.org:6379" % redis_password +HOST = "ssh://localhost" +## +######################################################################### + +N = 20 +pjs = [] +start = time.time() +total = 0.0 + +for i in range(0, N): + + print "start %3d" % i + + pilot_description = pilot.PilotComputeDescription() + pilot_description.service_url = HOST + pilot_description.number_of_processes = 1 + pilot_description.working_directory = os.getcwd() + + pilot_service = pilot.PilotComputeService(COORD) + + ### This is broken !!! 
-> https://github.com/saga-project/BigJob/issues/118 + #pilotjob = pilot_service.create_pilot(pilot_compute_description) + pilotjob = pilot_service.create_pilot(pilot_compute_description=pilot_description) + + pjs.append(pilotjob) + + task = pilot.ComputeUnitDescription() + task.executable = "/bin/sleep" + task.arguments = ["10"] + + pilotjob.submit_compute_unit(task) + +stop = time.time() + +# see https://github.com/saga-project/BigJob/issues/121 +# see https://github.com/saga-project/BigJob/issues/131 + +for i, pj in enumerate(pjs): + print "cancel %3d" % i + pj.cancel() + +pilot_service.cancel() + + +print "time: %.1fs rate: %.1f/s" % (stop-start, N/(stop-start)) diff --git a/tests/test_pty_exhaustion.py b/tests/test_pty_exhaustion.py index 8f493fde..ff59f1dd 100644 --- a/tests/test_pty_exhaustion.py +++ b/tests/test_pty_exhaustion.py @@ -75,9 +75,9 @@ def main(): # print "%4d: %s" % (i, j.state) for i in range(99999): - print i + print(i) - print "Start Pilot Job/BigJob at: " + lrms_url + print("Start Pilot Job/BigJob at: " + lrms_url) bj = bigjob(COORDINATION_URL) bj.start_pilot_job( lrms_url, number_of_processes, @@ -88,7 +88,7 @@ def main(): walltime, processes_per_node) - print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()) + print("Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state())) ########################################################################################## # Submit SubJob through BigJob @@ -112,7 +112,7 @@ def main(): # busy wait for completion while 1: state = str(sj.get_state()) - print "state: " + state + print("state: " + state) if(state=="Failed" or state=="Done"): break time.sleep(2) diff --git a/tests/test_pty_exhaustion.py.bak b/tests/test_pty_exhaustion.py.bak new file mode 100644 index 00000000..8f493fde --- /dev/null +++ b/tests/test_pty_exhaustion.py.bak @@ -0,0 +1,130 @@ +""" Example application demonstrating job submission via bigjob + + DON'T EDIT THIS FILE (UNLESS THERE IS A BUG) + + THIS FILE SHOULD NOT BE COMMITTED TO SVN WITH USER-SPECIFIC PATHS! +""" +import os +import time +import pdb +import sys +import saga + +try : + import pudb + pudb.set_interrupt_handler() +except : + pass + +# configuration +""" This variable defines the coordination system that is used by BigJob + e.g. + advert://localhost (SAGA/Advert SQLITE) + advert://advert.cct.lsu.edu:8080 (SAGA/Advert POSTGRESQL) + redis://localhost:6379 (Redis at localhost) + tcp://localhost (ZMQ) + tcp://* (ZMQ - listening to all interfaces) +""" + +#COORDINATION_URL = "advert://localhost/?dbtype=sqlite3" +#COORDINATION_URL = "tcp://*" +COORDINATION_URL = "redis://10.0.1.18:6379" +#COORDINATION_URL = "redis://Oily9tourSorenavyvault@redis01.tacc.utexas.edu" +# for running BJ from local dir +sys.path.insert(0, os.getcwd() + "/../") + +from bigjob import bigjob, subjob, description + + +def main(): + # Start BigJob + + ########################################################################################## + # Edit parameters for BigJob + queue="normal" # if None default queue is used + project=None # if None default allocation is used + walltime=10 + processes_per_node=4 + number_of_processes = 8 + #workingdirectory=os.path.join(os.getcwd(), "agent") # working directory for agent + workingdirectory="agent" + userproxy = None # userproxy (not supported yet due to context issue w/ SAGA) + + + """ + URL of the SAGA Job Service that is used to dispatch the pilot job. 
+ The following URLs are accepted: + + lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI) + lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines) + lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system. + lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine. + lrms_url = "xt5torque://localhost" # torque resource url. + + Please ensure that the respective SAGA adaptor is installed and working + """ + lrms_url = "ssh://boskop" + #lrms_url = "sge://localhost" + #lrms_url = "fork://localhost" + + ########################################################################################## + + # for i in range(99999): + # js = saga.job.Service (lrms_url) + # j = js.run_job ("/bin/sleep 1000") + # print "%4d: %s" % (i, j.state) + + for i in range(99999): + print i + + print "Start Pilot Job/BigJob at: " + lrms_url + bj = bigjob(COORDINATION_URL) + bj.start_pilot_job( lrms_url, + number_of_processes, + queue, + project, + workingdirectory, + userproxy, + walltime, + processes_per_node) + + print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()) + + ########################################################################################## + # Submit SubJob through BigJob + jd = description() + jd.executable = "/bin/echo" + #jd.executable = "$HOME/hello.sh" + jd.number_of_processes = "1" + jd.arguments = ["$HELLOWORLD"] + jd.environment = ['HELLOWORLD=hello_world'] + #jd.spmd_variation = "mpi" + + # specify an optinal working directory if sub-job should be executed outside of bigjob sandbox + #jd.working_directory = "/tmp" + jd.output = "stdout.txt" + jd.error = "stderr.txt" + + sj = subjob() + sj.submit_job(bj.pilot_url, jd) + + ######################################### + # busy wait for completion + while 1: + state = str(sj.get_state()) + print "state: " + state + if(state=="Failed" or state=="Done"): + break + time.sleep(2) + + ########################################################################################## + # Cleanup - stop BigJob + bj.cancel() + #time.sleep(30) + + +""" Test Job Submission via Advert """ +if __name__ == "__main__": + main() + + diff --git a/util/archive.py b/util/archive.py index 97878681..779fbbd7 100644 --- a/util/archive.py +++ b/util/archive.py @@ -80,11 +80,11 @@ def get_cus(self): if __name__ == '__main__': if len(sys.argv)>1: - print "Get data from " + sys.argv[1] + print("Get data from " + sys.argv[1]) rd = RedisDownloader(sys.argv[1]) pilots = rd.get_pilots() cus = rd.get_cus() - print "Loaded Redis data: %d pilots, %d cus"%(len(pilots), len(cus)) + print("Loaded Redis data: %d pilots, %d cus"%(len(pilots), len(cus))) else: for i in REDIS_URLS: rd = RedisDownloader(i) diff --git a/util/archive.py.bak b/util/archive.py.bak new file mode 100644 index 00000000..97878681 --- /dev/null +++ b/util/archive.py.bak @@ -0,0 +1,97 @@ +# Archive Redis content and generate some primitive analytics +# +# On OS X +# brew install zeromq freetype +# pip install pyzmq tornado pandas ipython matplotlib + +import redis +import os, sys +import pandas as pd + + +from bigjob import logger + +# Archive the following redis urls +REDIS_URLS=["redis://ILikeBigJob_wITH-REdIS@gw68.quarry.iu.teragrid.org:6379", "redis://localhost"] + + +REDIS_SERVER="localhost" +REDIS_SERVER_PORT=6379 +REDIS_URL_SCHEME="redis://" + 
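The RedisDownloader that follows parses its redis:// URL by hand to pull out password, host, and port. Under Python 3 the standard library covers most of that work; a sketch of equivalent parsing, assuming the BigJob-style URLs listed in this file (password placed before '@' without a colon):

    from urllib.parse import urlparse

    def parse_redis_url(url, default_port=6379):
        # "redis://password@host:port" -> (host, port, password); every part optional.
        parts = urlparse(url)
        host = parts.hostname or "localhost"
        port = parts.port or default_port
        # BigJob URLs put the password before '@' with no colon, so urlparse
        # reports it as the username component.
        password = parts.password or parts.username or None
        return host, port, password

    # parse_redis_url("redis://ILikeBigJob_wITH-REdIS@gw68.quarry.iu.teragrid.org:6379")
    # -> ("gw68.quarry.iu.teragrid.org", 6379, "ILikeBigJob_wITH-REdIS")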
+class RedisDownloader(object): + + def __init__(self, redis_url): + ''' + Constructor + ''' + server_port=6379 + self.redis_url=redis_url + self.password=None + start_index = self.redis_url.find(REDIS_URL_SCHEME)+len(REDIS_URL_SCHEME) + server_and_port = self.redis_url[start_index:] + password_end = server_and_port.find("@") + # parse out password + if password_end != -1: + self.password = server_and_port[:password_end] + start_index=password_end + server_and_port= server_and_port[(password_end+1):] + + # port and hostname + if server_and_port.find(":")==-1: + server=server_and_port + server_port = REDIS_SERVER_PORT + else: + server = server_and_port.split(":")[0] + server_port = int(server_and_port.split(":")[1]) + + logger.debug("Connect to Redis: " + server + " Port: " + str(server_port)) + + if self.password==None: + self.redis_client = redis.Redis(host=server, port=server_port, db=0) + else: + self.redis_client = redis.Redis(host=server, port=server_port, password=self.password, db=0) + + self.pipe = self.redis_client.pipeline() + try: + self.redis_client.ping() + except: + logger.error("Please start Redis server!") + raise Exception("Please start Redis server!") + + + def get_pilots(self): + pilots = self.redis_client.keys("bigjob:bj-*") + for i in pilots: + if ":jobs:" not in i and i.count(":")==2: + #print i + self.pipe.hgetall(i) + response = self.pipe.execute() + return response; + + + def get_cus(self): + cus = self.redis_client.keys("*:jobs:*") + for i in cus: + self.pipe.hgetall(i) + response = self.pipe.execute() + return response; + + +if __name__ == '__main__': + if len(sys.argv)>1: + print "Get data from " + sys.argv[1] + rd = RedisDownloader(sys.argv[1]) + pilots = rd.get_pilots() + cus = rd.get_cus() + print "Loaded Redis data: %d pilots, %d cus"%(len(pilots), len(cus)) + else: + for i in REDIS_URLS: + rd = RedisDownloader(i) + pilots = rd.get_pilots() + cus = rd.get_cus() + + + + + \ No newline at end of file diff --git a/util/bigjob_usage.py b/util/bigjob_usage.py index 76bc8b24..5426c9d8 100644 --- a/util/bigjob_usage.py +++ b/util/bigjob_usage.py @@ -11,7 +11,7 @@ import pandas as pd import matplotlib.pyplot as plt import os, sys -import archive +from . 
import archive import datetime import ast @@ -26,7 +26,7 @@ max_cus_date = cus_df.index.max() max_pilots_date = pilot_df.index.max() - print "Restored data frames until %s"%max_cus_date + print("Restored data frames until %s"%max_cus_date) # @@ -117,11 +117,11 @@ # -print "Number of Pilots: %d Number CUs: %d Executed since: %s"%(len(pilots), len(cus), str(cus_df.index.min())) +print("Number of Pilots: %d Number CUs: %d Executed since: %s"%(len(pilots), len(cus), str(cus_df.index.min()))) # -pilots = [i for i in pilots if i.has_key("start_time")] +pilots = [i for i in pilots if "start_time" in i] max_pilot_date = None try: max_pilot_date = max_pilot_date.index.max() diff --git a/util/bigjob_usage.py.bak b/util/bigjob_usage.py.bak new file mode 100644 index 00000000..76bc8b24 --- /dev/null +++ b/util/bigjob_usage.py.bak @@ -0,0 +1,165 @@ +# -*- coding: utf-8 -*- +# 3.0 + +# + +# # Generating BigJob Usage Statistics out of Redis entries +# Read `cus` and `pilots` from Redis + +# + +import pandas as pd +import matplotlib.pyplot as plt +import os, sys +import archive +import datetime +import ast + +# + +# Attempt to restore old data frame +cus_df = None +pilot_df = None +if os.path.exists("cus.df") and os.path.exists("pilot.df"): + cus_df = pd.load("cus.df") #pd.read_csv("cus.csv", index_col=0, parse_dates=False, date_parser=) + pilot_df = pd.load("pilot.df") #pd.read_csv("pilot.csv", index_col=0, parse_dates=False, date_parser=), dat + + max_cus_date = cus_df.index.max() + max_pilots_date = pilot_df.index.max() + print "Restored data frames until %s"%max_cus_date + +# + +# Download new data +# Redis Service to connect to: +# redis://ILikeBigJob_wITH-REdIS@gw68.quarry.iu.teragrid.org:6379 +# redis://localhost +rd = archive.RedisDownloader("redis://ILikeBigJob_wITH-REdIS@gw68.quarry.iu.teragrid.org:6379") +#rd = archive.RedisDownloader("redis://localhost:6379") +pilots = rd.get_pilots() +cus = rd.get_cus() + +# + +# ## Compute Units Executed per Day + +# + +# make sure only new entries are loaded into data frame +max_cus_date = None +try: + max_cus_date = cus_df.index.max() +except: + pass +timestamp_index = [] +cus_new = [] +for i in cus: + if max_cus_date == None or datetime.datetime.utcfromtimestamp(float(i["start_time"]))>max_cus_date: + # print "add " + str(datetime.datetime.utcfromtimestamp(float(i["start_time"]))) + timestamp_index.append(datetime.datetime.utcfromtimestamp(float(i["start_time"]))) + cus_new.append(i) + +#print cus_new +if len(cus_new) > 0: + cus_df_new = pd.DataFrame(cus_new, index=timestamp_index, columns=['Executable', 'NumberOfProcesses', "SPMDVariation", "start_time", "end_queue_time", "start_staging_time", "end_time"]) + try: + cus_df = pd.concat([cus_df, cus_df_new]) + except: + cus_df = cus_df_new + +# + +cus_df_h = cus_df["Executable"].resample("D", how="count") +cus_df_h.plot(color='k', alpha=0.7) +plt.ylabel("Number of CUs Executed") +plt.xlabel("Day") +plt.savefig("number_cus_per_day.pdf", format="pdf", bbox_inches='tight', pad_inches=0.1) + +# + +# ## Compute Unit Types +# +# How many sequential versus parallel (MPI) CUs are executed? 
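util/bigjob_usage.py still calls pd.load(), DataFrame.save(), and resample(..., how="count"), which have since been removed from pandas. If the script is ever run against a current pandas release, the equivalents look roughly like this (a sketch assuming the same pickled frames and column names the script already uses):

    import pandas as pd

    # pd.load()/DataFrame.save() are gone; read_pickle()/to_pickle() replace them,
    # and resample() no longer accepts how= -- the aggregation is chained instead.
    cus_df = pd.read_pickle("cus.df")                          # was pd.load("cus.df")
    cus_per_day = cus_df["Executable"].resample("D").count()   # was resample("D", how="count")
    cus_df.to_pickle("cus.df")                                 # was cus_df.save("cus.df")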
+ +# + +spmd = cus_df["SPMDVariation"].astype("object") +spmd[spmd.isnull()]="single" +spmd.value_counts().plot(kind="bar", color='k', alpha=0.7) +plt.ylabel("Number of CUs") +plt.ylabel("CU SPMD Variation") +plt.savefig("cu_type.pdf", format="pdf", bbox_inches='tight', pad_inches=0.1) + +# + +cus_df["Executable"].value_counts().plot(kind="bar", color='k', alpha=0.7) +plt.ylabel("Number CUs") +plt.xlabel("CU Executable") +plt.savefig("cu_executable.pdf", format="pdf", bbox_inches='tight', pad_inches=0.1) + +# + +# ## CU Runtime Distribution + +# + +runtimes = cus_df.apply(lambda row: float(row["end_time"]) - float(row["end_queue_time"]), axis=1) +runtimes.hist(bins=50) +plt.ylabel("Number of Events") +plt.xlabel("CU Runtime (in sec)") +plt.savefig("cu_runtime.pdf", format="pdf", bbox_inches='tight', pad_inches=0.1) +runtimes.describe() + +# + +# ## Pilots Executed per Day +# Extract pilot desciptions out of Redis entries + +# + +print "Number of Pilots: %d Number CUs: %d Executed since: %s"%(len(pilots), len(cus), str(cus_df.index.min())) + +# + +pilots = [i for i in pilots if i.has_key("start_time")] +max_pilot_date = None +try: + max_pilot_date = max_pilot_date.index.max() +except: + pass +timestamp_index = [] +pilot_new = [] +for i in pilots: + if max_pilot_date == None or datetime.datetime.utcfromtimestamp(float(i["start_time"]))>max_pilot_date: + timestamp_index.append(datetime.datetime.utcfromtimestamp(float(i["start_time"]))) + pilot_new.append(ast.literal_eval(i["description"])) + +#print cus_new +if len(pilot_new) > 0: + pilot_df_new = pd.DataFrame(pilot_new, index=timestamp_index, columns=['service_url', "number_of_processes"]) + try: + pilot_df = pd.concat([pilot_df, pilot_df_new]) + except: + pilot_df = pilot_df_new + +# + +pilot_df_h = pilot_df['service_url'].resample("D", how="count") +pilot_df_h.plot(kind="line", color='k', alpha=0.7) +plt.ylabel("Number of Pilots") +plt.xlabel("Day") +plt.savefig("number_pilots.pdf", format="pdf", bbox_inches='tight', pad_inches=0.1) + +# + +# ## Store Dataframes for later usage + +# + +cus_df.save("cus.df") +pilot_df.save("pilot.df") + +date_string = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") +cus_df.to_csv("cus-"+date_string+".csv", index_label="Date") +pilot_df.to_csv("pilot-"+date_string+".csv", index_label="Date") +
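Taken together, the conversions in this part of the patch follow a handful of mechanical patterns; a closing sketch of the recurring ones, with the guarded import shown purely as an illustration of how util/bigjob_usage.py could stay runnable both as part of the util package and as a standalone script:

    # Python 2 form in the old files             Python 3 form applied by the patch
    # print "state: " + state              ->    print("state: " + state)
    # result_map.has_key(state)            ->    state in result_map
    # import archive                       ->    from . import archive

    # Illustrative fallback (not part of the patch): keep the relative import when
    # util is imported as a package, fall back when the script is executed directly.
    try:
        from . import archive
    except ImportError:
        import archive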