"""
Defines a set of classes and a command-line interface for submitting batch jobs.

Supported systems include serial execution locally, a multiprocessing pool, and
batch schedulers such as LSF, Slurm, and Auger/Swif.
"""
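# Usage sketch (illustrative only; the valid system names come from the
# 'system_dict' defined in the __main__ block below and are assumed here):
#
#   python batch.py slurm -l logs -r 1:100 <script name> <jobstore.json>
#
# The first positional argument selects the batch system; the remaining options
# are defined in Batch.__init__() and its subclasses.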
import argparse
import logging
import multiprocessing
import os
import signal
import socket
import subprocess
import sys

from pathlib import Path
import xml.etree.ElementTree as ET
from xml.dom import minidom
from xml.sax.saxutils import unescape
from distutils.spawn import find_executable
from abc import ABC, abstractmethod

import psutil

from hpsmc.job import Job, JobStore, JobScriptDatabase
logger = logging.getLogger("hpsmc.batch")

RUN_SCRIPT = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'job.py')
class Batch(ABC):
    """!
    Generic batch processing interface.
    """

    def __init__(self):
        parser = argparse.ArgumentParser(
            self.__class__.__name__,
            epilog='Available scripts: %s' % ', '.join(JobScriptDatabase().get_script_names()))
        parser.add_argument("-c", "--config-file", nargs='?', help="Config file", action='append')
        parser.add_argument("-l", "--log-dir", nargs='?', help="Log file output dir", required=False,
                            default=str(Path(os.getcwd(), 'logs')))
        parser.add_argument("-d", "--run-dir", nargs='?',
                            help="Base run dir for the jobs (must be an absolute path)", default=None)
        parser.add_argument("-D", "--debug", action='store_true', help="Enable debug settings", required=False)
        parser.add_argument("-o", "--check-output", action='store_true', required=False,
                            help="Do not submit jobs where output files already exist")
        parser.add_argument("-s", "--job-steps", type=int, default=None, required=False)
        parser.add_argument("-r", "--job-range", nargs='?',
                            help="Submit job numbers within a range (e.g. '1:100')", required=False)
        parser.add_argument("script", nargs='?', help="Name of job script")
        parser.add_argument("jobstore", nargs='?', help="Job store in JSON format")
        parser.add_argument("jobids", nargs="*", type=int, help="List of individual job IDs to submit (optional)")
        self.parser = parser
    def parse_args(self, args):
        """! Parse command line arguments and perform setup."""
        cl = self.parser.parse_args(args)
        if cl.script is None:
            raise Exception('The script is a required argument.')
        # (script-name lookup against the script database elided)
        raise Exception('The script name is not valid: %s' % self.script_name)
        if not os.path.isfile(self.script):
            raise Exception('The job script does not exist: %s' % self.script)
        if cl.jobstore is None:
            raise Exception('The job store file is a required argument.')
        if not os.path.isfile(cl.jobstore):
            raise Exception('The job store does not exist: %s' % cl.jobstore)
        self.log_dir = os.path.abspath(cl.log_dir)
        logger.info('log dir: {}'.format(self.log_dir))
        if not os.path.exists(self.log_dir):
            os.makedirs(self.log_dir)
            logger.info('Created log dir: {}'.format(self.log_dir))
        logger.info('run dir: {}'.format(self.run_dir))
        if not os.path.isabs(self.run_dir):
            raise Exception("The run dir for batch processing must be an abs path.")
        # e.g. '-r 1:100' selects job IDs 1 through 100
        toks = cl.job_range.split(':')
        if len(toks) != 2:
            raise ValueError('Bad format for job range: ' + cl.job_range)
        # (the conditions guarding the following range checks are elided)
        raise ValueError("The end job number must be >= the start job number when using a range.")
        raise ValueError("The job range numbers must be > 0.")
    @abstractmethod
    def submit_job(self, job_id):
        """!
        Submit a single batch job and return the batch ID.

        This is abstract as each batch system will do this differently.

        Some batch systems don't implement this, but sub-classes should override it and make it a
        no-op so that they can be instantiated.
        """

    def submit(self):
        """!
        This is the generic batch submission function which gets a list of jobs to run based on command line
        arguments and submits them individually. It calls the abstract submit_job() method and prints the batch
        system ID that was returned, if any.
        """
        job_ids = self._get_filtered_job_ids()
        logger.info('Submitting jobs: %s' % str(job_ids))
        for job_id in job_ids:
            if not self.jobstore.has_job_id(job_id):
                raise Exception('Job ID was not found in job store: %s' % job_id)
            job_data = self.jobstore.get_job(job_id)
            batch_id = self.submit_job(job_id)
            logger.info(f"Submitted job {job_id} with batch ID {str(batch_id)}")
    def default_rundir(self, job_id=None):
        if job_id is None:
            raise Exception('Missing valid job ID')
        return str(Path(os.getcwd(), 'scratch', str(job_id)))
    def build_cmd(self, job_id):
        """!
        This is the basic implementation of building a command to run the job from a batch system.
        """
        cmd = [sys.executable, RUN_SCRIPT, 'run']
        logfile = self._logfile(job_id)
        cmd.extend(['-o', f"{logfile}.out",
                    '-e', f"{logfile}.err"])
        job_dir = str(Path(self.run_dir, str(job_id)))
        logger.debug(f'job dir: {job_dir}')
        cmd.extend(['-d', job_dir])
        cmd.extend(['-c', cfg])  # 'cfg' iterates over any config files passed with -c (loop elided)
        cmd.extend(['--job-steps', str(self.job_steps)])
        cmd.extend(['-i', str(job_id)])
        cmd.append(os.path.abspath(self.jobstore.path))
        logger.debug("Job command: %s" % " ".join(cmd))
        return cmd
    def _logfile(self, job_id):
        """!
        Get the base name of a log file for the job.
        """
        return os.path.abspath(os.path.join(self.log_dir, 'job.%s' % str(job_id)))
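    # Illustration only (paths and values assumed, not taken from a real run): for job
    # ID 42, build_cmd() above assembles something like
    #   <python> .../job.py run -o <log_dir>/job.42.out -e <log_dir>/job.42.err \
    #       -d <run_dir>/42 -c <config> --job-steps <N> -i 42 <abs path to job store JSON>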
    @staticmethod
    def _outputs_exist(job):
        """!
        Check if all output files exist for the given job. This is not the job ID but the full JSON job data.

        Return False when the first missing output is found.
        """
        for src, dest in job["output_files"].items():
            if not os.path.isfile(os.path.join(job["output_dir"], dest)):
                logger.debug('Job output does not exist: %s -> %s' % (src, dest))
                return False
        return True
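    # Hypothetical job-store entry illustrating the fields used above (the key names
    # appear in this file; the values and full schema are assumptions):
    #   {"job_id": 1,
    #    "output_dir": "/path/to/output",
    #    "output_files": {"events.slcio": "job_0001_events.slcio"},
    #    "input_files": {"/mss/hallb/hps/.../input.stdhep": "input.stdhep"}}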
    def _get_filtered_job_ids(self):
        """!
        Get a list of job IDs to submit based on parsed command line options and whether output files are being checked.
        """
        submit_ids = self.jobstore.get_job_ids()
        logger.debug('Initial pre-filtered job IDs: {}'.format(str(submit_ids)))
        submit_ids = [job_id for job_id in submit_ids]  # (job-range filter condition elided)
        logger.debug('job IDs after range check: {}'.format(str(submit_ids)))
        logger.info('job IDs after output file check: {}'.format(str(submit_ids)))
        return submit_ids

    def _job_ids_missing_output(self, job_ids):
        """! Get a list of IDs for jobs that are missing output files."""
class BatchSystem(Batch):
    """!
    Represents a batch processing system that requires submission like Slurm or Auger.

    This subclasses Batch because it adds a number of different parameters which do not apply to all the
    batch system types (namely Pool and Local).
    """

    def __init__(self):
        super().__init__()
        self.parser.add_argument("-q", "--queue", nargs='?',
                                 help="Job queue or partition")
        self.parser.add_argument("-W", "--job-length", type=int, help="Max job length in hours",
                                 required=False, default=4)
        self.parser.add_argument("-m", "--memory", type=int, help="Max job memory allocation in MB", default=2000)
        self.parser.add_argument("-f", "--diskspace", type=int, help="Disk space needed for job in GB", default=20)
        self.parser.add_argument("-e", "--email", nargs='?', help="Email address for job notifications",
                                 required=False)
        self.parser.add_argument("-O", "--os", nargs='?', help="Operating system of batch nodes (Auger and LSF)")
        self.site = BatchSystem._site()

    def parse_args(self, args):
        """! Parse command line arguments and perform setup."""

    @staticmethod
    def _site():
        fqdn = socket.getfqdn()
        if 'slac.stanford.edu' in fqdn:
            ...  # SLAC site value (elided)
        elif 'jlab.org' in fqdn:
            ...  # JLab site value (elided)
class LSF(BatchSystem):
    """! Submit LSF batch jobs."""

    def parse_args(self, args):
        super().parse_args(args)
        os.environ['LSB_JOB_REPORT_MAIL'] = 'Y' if self.email else 'N'

    def build_cmd(self, job_id):
        log_file = os.path.abspath(os.path.join(self.log_dir, 'job.%s.log' % str(job_id)))
        # (start of the bsub command and its resource options elided)
        if self.os is not None:
            ...  # (OS requirement added to the bsub command; elided)
        cmd.extend(['-u', self.email])
        cmd.extend(super().build_cmd(job_id))
        return cmd

    def submit_job(self, job_id):
        cmd = self.build_cmd(job_id)
        logger.info('Submitting job %s to LSF with command: %s' % (job_id, ' '.join(cmd)))
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = proc.communicate()
        if err is not None and len(err):
            ...  # (stderr reporting elided)
        tokens = out.decode().split(" ")
        if tokens[0] != 'Job':
            raise Exception('Unexpected output from bsub command: %s' % out)
        batch_id = int(tokens[1].replace('<', '').replace('>', ''))
        return batch_id
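    # Note on the parsing above (based on typical LSF behavior, not on this source):
    # a successful bsub prints a line like "Job <12345> is submitted to queue <myqueue>.",
    # so the second token with the angle brackets stripped is the numeric batch ID.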
class Slurm(BatchSystem):
    """! Submit Slurm batch jobs."""

    def __init__(self):
        super().__init__()
        self.parser.add_argument("-S", "--sh-dir", nargs='?',
                                 help="Directory to hold generated shell scripts for Slurm",
                                 default=str(Path(os.getcwd(), 'sh')))
        self.parser.add_argument("-E", "--env", nargs='?', help="Full path to env setup script",
                                 required=False, default=None)
        self.parser.add_argument("-A", "--account", nargs='?', help="Account name for s3df slurm jobs.",
                                 required=False, default=None)

    def parse_args(self, args):
        super().parse_args(args)
        logger.info('Slurm sh dir: {}'.format(self.sh_dir))
        if not os.path.exists(self.sh_dir):
            os.makedirs(self.sh_dir)
            logger.info('Created Slurm sh dir: {}'.format(self.sh_dir))

    def default_rundir(self, job_id=None):
        """!
        Override the basic implementation for getting the default run directory.
        """
        # The run dir depends on the detected site (selection logic elided):
        run_dir = '$LSCRATCH'
        run_dir = '/scratch/slurm/$SLURM_JOBID'
        run_dir = os.getcwd() + "/scratch/$SLURM_JOBID"
        return run_dir

    # Assembly of the sbatch command (the enclosing method is not shown in this fragment):
        raise Exception('No queue name was provided.')
        sbatch_cmd = ['sbatch',
                      # (resource and time options elided)
                      '--output=%s.out' % log_file,
                      '--error=%s.err' % log_file]
        sbatch_cmd.extend([f'--partition={self.queue}'])
        sbatch_cmd.extend([f'--account={self.account}'])
        sbatch_cmd.extend([f'--mail-user={self.email}'])  # (additional mail options elided)

    def _sh_filename(self, job_id):
        return self.sh_dir + '/job.%i.sh' % job_id
    def build_cmd(self, job_id):
        """!
        Wrap submission of Slurm jobs using a generated script.
        """
        # (generation of the job script and assembly of 'cmd' elided)
        cmd.append(sh_filename)
        return cmd

    def _write_job_script(self, sh_filename, job_cmd):
        """!
        Write the shell script for Slurm job submission using the 'sbatch' command.
        """
        script_lines = ['#!/bin/bash']
        if self.env:
            script_lines.append(f'source {self.env}')
        script_lines.extend(['echo Start time: `date`',
                             'echo ---- Start Environment ----',
                             'echo ---- End Environment ----',
                             'time ' + ' '.join(job_cmd),
                             'echo End time: `date`'])
        logger.debug("Slurm submission script:\n" + str(script_lines))
        with open(sh_filename, 'w') as sh_file:
            for script_line in script_lines:
                sh_file.write(script_line + '\n')
        logger.debug('Wrote Slurm submission script to: {}'.format(str(Path(self.sh_dir, sh_filename))))
    def submit_job(self, job_id):
        cmd = self.build_cmd(job_id)
        logger.info('Submitting job %s to Slurm with command: %s' % (job_id, ' '.join(cmd)))
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = proc.communicate()
        if err is not None and len(err):
            ...  # (stderr reporting elided)
        tokens = out.decode().split(" ")
        if tokens[0] != 'Submitted':
            raise Exception('Unexpected output from sbatch command: %s' % out)
        batch_id = int(tokens[3].replace('<', '').replace('>', ''))
        return batch_id
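    # Note on the parsing above: sbatch normally prints "Submitted batch job 123456",
    # so the fourth token is the numeric batch ID; stripping angle brackets is
    # harmless for this format.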
class Auger(BatchSystem):
    """!
    Submit Auger batch jobs.

    Auger itself is actually deprecated and unavailable, but its submission XML format is
    supported by the Swif class (see below).
    """

        # (setup-script lookup elided)
        raise Exception("Failed to find 'hps-mc-env.csh' in environment.")

    def submit_job(self, job_id):
        """!
        Make this a no-op. Auger is a bit of a special case in terms of how batch submission works with a
        generated XML file including all job IDs, so we do not implement single job submission.
        """
    def submit(self):
        """!
        Batch submission method for Auger.

        This differs from some of the other systems in that it doesn't loop over individual
        job IDs. Instead a single XML file is submitted for all the jobs at once.
        """
        xml_filename = self._create_job_xml()
        auger_ids = self._jsub(xml_filename)
        logger.info("Submitted Auger jobs: %s" % str(auger_ids))

    def _create_job_xml(self):
        job_ids = self._get_filtered_job_ids()
        logger.info('Submitting jobs: %s' % str(job_ids))
        for job_id in job_ids:
            if not self.jobstore.has_job_id(job_id):
                raise Exception('Job ID was not found in job store: %s' % job_id)
            job_params = self.jobstore.get_job(job_id)
            if self.check_output and Batch._outputs_exist(job_params):
                logger.warning("Skipping Auger submission for job "
                               "because outputs already exist: %d" % job_id)
                continue
    def _jsub(self, xml_filename):
        cmd = ['jsub', '-xml', xml_filename]
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        out, err = proc.communicate()
        return self._get_auger_ids(out)

    def _get_auger_ids(self, out):
        auger_ids = []
        for line in out.splitlines():
            if line.strip().startswith(b'<jsub>'):
                j = ET.fromstring(line)
                for req in j:  # iterate directly; Element.getchildren() was removed in Python 3.9
                    for child in req:
                        if child.tag == 'jobIndex':
                            auger_id = int(child.text)
                            auger_ids.append(auger_id)
                        elif child.tag == 'error':
                            raise Exception(child.text)
        return auger_ids

    def _write_req(self, req, filename='temp.xml'):
        pretty = unescape(minidom.parseString(ET.tostring(req)).toprettyxml(indent=" "))
        with open(filename, 'w') as f:
            f.write(pretty)
    def _create_req(self, req_name):
        req = ET.Element("Request")
        name_elem = ET.SubElement(req, "Name")
        name_elem.set("name", req_name)
        prj = ET.SubElement(req, "Project")
        prj.set("name", "hps")
        trk = ET.SubElement(req, "Track")
        trk.set("name", "debug")
        if self.queue is not None:
            trk.set("name", queue)
        email = ET.SubElement(req, "Email")
        email.set("request", "true")
        email.set("job", "true")
        mem = ET.SubElement(req, "Memory")
        mem.set("unit", "MB")
        disk = ET.SubElement(req, "DiskSpace")
        disk.set("unit", "GB")
        limit = ET.SubElement(req, "TimeLimit")
        limit.set("unit", "hours")
        os_elem = ET.SubElement(req, "OS")
        if self.os is not None:
            os_elem.set("name", auger_os)
        return req
    def build_cmd(self, job_id):
        cmd = [sys.executable, RUN_SCRIPT, 'run']
        cmd.extend(['-c', cfg])  # 'cfg' iterates over any config files (loop elided)
        cmd.extend(['-i', str(job_id)])
        cmd.append(os.path.abspath(self.jobstore.path))
        logger.debug("Job command: %s" % " ".join(cmd))
        return cmd

    def _create_job(self, params):
        """! Needed for resolving ptag output sources."""
        j = Job()
        j._load_params(params)
        return j
    def _add_job(self, req, job_params):
        job = ET.SubElement(req, "Job")
        job_id = job_params['job_id']
        if 'year' in job_params.keys():
            year = job_params['year']
        if 'input_files' in list(job_params.keys()):
            inputfiles = job_params['input_files']
            for src, dest in inputfiles.items():
                if not src.startswith('http'):
                    input_elem = ET.SubElement(job, "Input")
                    input_elem.set("dest", dest)
                    if src.startswith("/mss"):
                        src_file = "mss:%s" % src
                    else:
                        src_file = src
                    input_elem.set("src", src_file)
                else:
                    logger.warning("http input file will not be included in XML job descr: {}".format(src))
        outputfiles = job_params["output_files"]
        outputdir = job_params["output_dir"]
        # Reconstructed from context: 'j' is the Job used to resolve ptag output sources.
        j = self._create_job(job_params)
        for src, dest in outputfiles.items():
            output_elem = ET.SubElement(job, "Output")
            res_src = j.resolve_output_src(src)
            output_elem.set("src", res_src)
            dest_file = os.path.abspath(os.path.join(outputdir, dest))
            if dest_file.startswith("/mss"):
                dest_file = "mss:%s" % dest_file
            logger.debug('Auger dest file: {} -> {}'.format(src, dest))
            output_elem.set("dest", dest_file)
        job_name = ET.SubElement(job, "Name")
        job_name.set("name", '%ihps%i' % (year, job_id))
        job_err = ET.SubElement(job, "Stderr")
        stdout_file = os.path.abspath(os.path.join(self.log_dir, "job.%d.out" % job_id))
        stderr_file = os.path.abspath(os.path.join(self.log_dir, "job.%d.err" % job_id))
        job_err.set("dest", stderr_file)
        job_out = ET.SubElement(job, "Stdout")
        job_out.set("dest", stdout_file)
        cmd = ET.SubElement(job, "Command")
        # Reconstructed from context: 'job_cmd' is the command assembled by build_cmd().
        job_cmd = self.build_cmd(job_id)
        cmd_lines = []
        cmd_lines.append("<![CDATA[")
        cmd_lines.append('pwd;\n')
        cmd_lines.append('env | sort;\n')
        cmd_lines.append('ls -lart;\n')
        cmd_lines.append("source %s;\n" % os.path.realpath(self.setup_script))
        cmd_lines.append("source %s/bin/jlab-env.csh;\n" % os.getenv('HPSMC_DIR'))
        cmd_lines.extend(job_cmd)
        cmd_lines.append(';\n')
        cmd_lines.append('ls -lart; \n')
        cmd_lines.append("]]>")
        cmd.text = ' '.join(cmd_lines)
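    # Sketch of the XML produced by _create_req()/_add_job() above (element and
    # attribute names come from the code; the values shown are illustrative assumptions):
    #   <Request>
    #     <Name name="..."/> <Project name="hps"/> <Track name="..."/> <Email .../>
    #     <Memory unit="MB" .../> <DiskSpace unit="GB" .../> <TimeLimit unit="hours" .../> <OS .../>
    #     <Job>
    #       <Input src="mss:/mss/..." dest="input.stdhep"/>
    #       <Output src="events.slcio" dest="mss:/mss/.../job_0001_events.slcio"/>
    #       <Name name="2019hps1"/> <Stderr dest=".../job.1.err"/> <Stdout dest=".../job.1.out"/>
    #       <Command><![CDATA[ ... ]]></Command>
    #     </Job>
    #   </Request>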
class Swif(Auger):
    """!
    Submit using the 'swif2' command at JLAB using an Auger file.

    This is just a thin wrapper of the parent class to call the swif2 commands with the generated Auger XML file.

    Existing workflows generated by this class should be fully canceled and removed before resubmitting using this
    class.
    """

    def __init__(self):
        super().__init__()
        self.parser.add_argument("-w", "--workflow", nargs='?', help="Name of swif2 workflow", required=False)

    def parse_args(self, args):
        super().parse_args(args)
        logger.debug(f'swif workflow name set to: {self.workflow}')

    def submit(self):
        logger.info("Submitting swif workflow: {}".format(self.workflow))
        xml_filename = self._create_job_xml()
        cmd = ['swif2', 'add-jsub', self.workflow, '-script', xml_filename]
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out = proc.communicate()[0]
        print("".join([s for s in out.decode().strip().splitlines(True) if s.strip()]))
        run_cmd = ['swif2', 'run', self.workflow]
        proc = subprocess.Popen(run_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out = proc.communicate()[0]
        print("".join([s for s in out.decode().strip().splitlines(True) if s.strip()]))
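    # Sketch of the resulting shell invocations (workflow and file names assumed):
    #   swif2 add-jsub <workflow> -script <auger XML file>
    #   swif2 run <workflow>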
class Local(Batch):
    """! Run local batch jobs sequentially."""

    def submit_job(self, job_id):
        """! Run a single job locally."""
        cmd = self.build_cmd(job_id)
        logger.info(f"Executing local job: {job_id}")
        proc = subprocess.Popen(cmd, shell=False)
        proc.wait()
        if proc.returncode:
            logger.error(f"Local execution of {job_id} returned error code: {proc.returncode}")
mp_queue = multiprocessing.Queue()


def run_job_pool(cmd):
    """! Run the command in a new process whose PID is added to a global MP queue."""
    returncode = None
    try:
        proc = subprocess.Popen(cmd, preexec_fn=os.setsid)
        mp_queue.put(proc.pid)
        proc.communicate()
        returncode = proc.returncode
    except subprocess.CalledProcessError as e:
        ...  # (error handling elided)
    return returncode


def is_running(proc):
    """!
    Check if a system process looks like it is still running.
    """
    return proc.status() in [psutil.STATUS_RUNNING,
                             psutil.STATUS_SLEEPING,
                             psutil.STATUS_DISK_SLEEP]  # (any additional states elided)
# (The class statement for the following context manager, which owns __exit__, is not shown in this fragment.)
    """!
    Kill processes in the multiprocessing queue if the jobs are canceled.
    """

    def __exit__(self, type, val, tb):
        """! Kill processes on exit."""
        # (iteration that pulls each 'pid' from mp_queue elided)
        try:
            parent = psutil.Process(pid)
            for child in parent.children(recursive=True):
                if is_running(child):
                    print('Killing running process: %d' % child.pid)
                    child.kill()
        except Exception as e:
            ...  # (error handling elided)
class Pool(Batch):
    """!
    Run a set of jobs in a local multiprocessing pool using Python's multiprocessing module.

    The number of processes to spawn can be provided using the '-p' argument.
    """

    def __init__(self):
        super().__init__()
        self.parser.add_argument("-p", "--pool-size", type=int,
                                 help="Job pool size (only applicable when running pool)", required=False,
                                 default=multiprocessing.cpu_count())

    def submit_job(self, job_id):
        """!
        Make this a no-op as we do not implement single job submission for the processing pool.
        """

    def submit(self):
        """! Submit jobs to a local processing pool.

        This method will not return until all jobs are finished or execution is interrupted.
        """
        # (collection of job IDs and construction of the per-job commands in 'cmds' elided)
        raise Exception('No job IDs found to submit')
        # Ignore SIGINT while the worker processes are forked so Ctrl-C is handled
        # by the parent, then restore the original handler.
        original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
        pool = multiprocessing.Pool(self.pool_size)
        signal.signal(signal.SIGINT, original_sigint_handler)
        try:
            logger.info("Running %d jobs in pool ..." % len(cmds))
            res = pool.map_async(run_job_pool, cmds)
            logger.info("Pool results: " + str(res.get(Pool.max_wait)))
            logger.info("Normal termination")
        except KeyboardInterrupt:
            logger.fatal("Caught KeyboardInterrupt, terminating workers")
            pool.terminate()
        except Exception as e:
            logger.fatal("Caught Exception '%s', terminating workers" % (str(e)))
            pool.terminate()
        except BaseException:
            e = sys.exc_info()[0]
            logger.fatal("Caught non-Python Exception '%s'" % (e))
            pool.terminate()
if __name__ == '__main__':
    # ('system_dict', mapping system names to the batch classes above, is defined in an elided section)
    if len(sys.argv) > 1:
        system = sys.argv[1].lower()
        if system not in list(system_dict.keys()):
            raise Exception(f"The batch system {system} is not valid.")
        batch = system_dict[system]()
        args = sys.argv[2:]
        batch.parse_args(args)
        batch.submit()
    else:
        print("Usage: batch.py [system] [args]")
        print(" Available systems: {}".format(', '.join(list(system_dict.keys()))))