# Source code for pyworkflow.utils.mpi

# **************************************************************************
# *
# * Authors:     J.M. De la Rosa Trevin (jmdelarosa@cnb.csic.es)
# *
# * Unidad de  Bioinformatica of Centro Nacional de Biotecnologia , CSIC
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either version 3 of the License, or
# * (at your option) any later version.
# *
# * This program is distributed in the hope that it will be useful,
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with this program; if not, write to the Free Software
# * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
# * 02111-1307  USA
# *
# *  All comments concerning this program package may be sent to the
# *  e-mail address 'scipion@cnb.csic.es'
# *
# **************************************************************************
"""
MPI utilities. runJobMPI and runJobMPISlave send and receive the commands
to execute, in the given directory and with the given environment.
"""


import os
from time import time, sleep
from pickle import dumps, loads

from pyworkflow import Config

from .process import buildRunCommand, runCommand

from pyworkflow.utils.utils import getLocalHostName, redStr

TIMEOUT = 60  # seconds trying to send/receive data through a socket

TAG_RUN_JOB = 1000


def send(command, comm, dest, tag):
    """ Deliver a command to a worker and wait until it reports back.

    The call blocks, but both directions rely on the non-blocking
    primitives (isend()/irecv()) which are polled once per second with
    sleep(), so the cpu is not kept busy while waiting.

    :param command: string to deliver; it is pickled before being sent.
    :param comm: communicator exposing isend() and irecv()
        (presumably an mpi4py communicator -- confirm with callers).
    :param dest: worker identifier, used as destination of the send and
        as source of the reply.
    :param tag: message tag used for both the send and the reply.
    :raises Exception: if the send has not completed after TIMEOUT
        seconds, or if the worker replies with anything different from 0
        (the reply is then used as the exception message).
    """
    # Never echo the whole environment, only announce that it is sent
    if command.startswith('env='):
        announcement = "Sending environment to %d" % dest
    else:
        announcement = "Sending command to %d: %s" % (dest, command)
    print(announcement)

    # Outgoing message: poll the isend() request, giving up after TIMEOUT
    outgoing = comm.isend(dumps(command), dest=dest, tag=tag)
    started = time()
    while True:
        if outgoing.test()[0]:
            break
        sleep(1)
        if time() - started > TIMEOUT:
            raise Exception("Timeout in process %d, cannot send command "
                            "to worker %d." % (os.getpid(), dest))

    # Incoming reply: poll the irecv() request (no time limit is applied)
    incoming = comm.irecv(source=dest, tag=tag)
    finished, result = incoming.test()
    while not finished:
        sleep(1)
        finished, result = incoming.test()

    if result == 0:
        return

    # Any other reply is a string describing the error in the worker
    print(redStr("Worker process %d has failed. Please check the terminal "
                 "for details." % dest))
    raise Exception(str(result))
def runJobMPI(programname, params, mpiComm, mpiDest,
              numberOfMpi=1, hostConfig=None, env=None, cwd=None, gpuList=None):
    """ Build the command line and hand it over to an MPI node.

    Up to three messages are sent, in this order and all of them with the
    tag TAG_RUN_JOB+mpiDest: the working directory ("cwd=...") when cwd
    is given, the environment ("env=...") when env is given, and finally
    the command produced by buildRunCommand().

    :param programname, params, numberOfMpi, hostConfig, gpuList:
        forwarded untouched to buildRunCommand().
    :param mpiComm: communicator handed to send().
    :param mpiDest: destination node; also added to TAG_RUN_JOB to get
        the message tag.
    :param env: environment for the command; besides being forwarded to
        buildRunCommand(), its string form is sent to the node.
    :param cwd: directory sent to the node before the command.
    """
    fullCommand = buildRunCommand(programname, params, numberOfMpi,
                                  hostConfig, env, gpuList=gpuList)
    jobTag = TAG_RUN_JOB + mpiDest

    def deliver(message):
        # Every message of this job shares communicator, node and tag
        send(message, mpiComm, mpiDest, jobTag)

    if cwd is not None:
        deliver("cwd=%s" % cwd)

    if env is not None:
        deliver("env=%s" % env)

    deliver(fullCommand)
def runJobMPISlave(mpiComm):
    """ Worker loop: receive messages from rank 0 and process them until
    'None' is received.

    Every received message is one of:
      - 'None' (compared as received, not unpickled): stop and return.
      - pickled "cwd=<dir>": remember <dir> for the next command.
      - pickled "env=<expr>": evaluate <expr> and remember the result as
        the environment for the next command.
      - any other pickled string: execute it with runCommand() using the
        remembered directory/environment, which are cleared afterwards.

    After every message other than 'None' a reply is sent to rank 0:
    0 on success or str(exception) if processing the message failed.

    :param mpiComm: communicator exposing Get_rank(), irecv() and isend()
        (presumably an mpi4py communicator -- confirm with callers).
    """
    rank = mpiComm.Get_rank()
    hostname = getLocalHostName()
    print(" Running MPIWorker: ", rank)

    tag = TAG_RUN_JOB + rank

    # Listen for commands until we get 'None'
    cwd = None  # We run without changing directory by default
    env = None  # And we don't change the environment either!

    while True:
        # Start each message from a clean "success" state. Without this
        # reset, one failure would make every later reply (even for
        # commands that succeed) carry the stale error string.
        exitResult = 0

        # Receive command in a non-blocking way
        req_recv = mpiComm.irecv(source=0, tag=tag)
        while True:
            done, command = req_recv.test()
            if done:
                break
            sleep(1)

        print(" Worker %s(rank %d) received command." % (hostname, rank))
        # The stop message is the plain string 'None'; anything else is
        # expected to be a pickled string and must be unpickled first.
        if command == 'None':
            print(" Stopping...")
            return
        else:
            command = loads(command)

        # Run the command and get the result (exit code or exception)
        try:
            if command.startswith("cwd="):
                cwd = command.split("=", 1)[-1]
                print(" Changing to dir %s ..." % cwd)
            elif command.startswith("env="):
                env = command.split("=", 1)[-1]
                # SECURITY NOTE(review): eval() executes whatever
                # expression the message carries. This is only acceptable
                # while messages come from a trusted master process;
                # consider ast.literal_eval() if the payload is always a
                # plain dict literal.
                env = eval(env)
                print(" Setting the environment...")
                if Config.debugOn():
                    print(env)
            else:
                runCommand(command, cwd=cwd, env=env)
                cwd = None  # unset directory
                env = None  # unset environment
        except Exception as e:
            print(" Error in process %d (rank %d)" % (os.getpid(), rank))
            import traceback
            traceback.print_exc()
            exitResult = str(e)

        # Communicate to master, either error or success
        req_send = mpiComm.isend(exitResult, dest=0, tag=tag)
        t0 = time()
        while not req_send.test()[0]:
            sleep(1)
            if time() - t0 > TIMEOUT:
                # Best effort: report locally and keep listening
                msg = " Error in process %d, cannot send error message to master."
                print(msg % os.getpid())
                break