# **************************************************************************
# *
# * Authors: J.M. De la Rosa Trevin (jmdelarosa@cnb.csic.es)
# *
# * Unidad de Bioinformatica of Centro Nacional de Biotecnologia , CSIC
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either version 3 of the License, or
# * (at your option) any later version.
# *
# * This program is distributed in the hope that it will be useful,
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with this program; if not, write to the Free Software
# * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
# * 02111-1307 USA
# *
# * All comments concerning this program package may be sent to the
# * e-mail address 'scipion@cnb.csic.es'
# *
# **************************************************************************
"""
MPI utilities. runJobMPI and runJobMPISlave send and receive the commands
to execute, in the given directory and with the given environment.
"""
import os
from time import time, sleep
from pickle import dumps, loads
from pyworkflow import Config
from .process import buildRunCommand, runCommand
from pyworkflow.utils.utils import getLocalHostName, redStr
# Upper bound, in seconds, on how long the functions below keep polling a
# pending non-blocking send (isend) before giving up.
TIMEOUT = 60 # seconds to wait for a pending send to complete
# Base value for the MPI message tags: messages exchanged with a worker are
# tagged with TAG_RUN_JOB + the rank of that worker.
TAG_RUN_JOB = 1000
def send(command, comm, dest, tag):
    """ Deliver a command to a worker and wait for the worker's answer.

    The call itself blocks until the worker has answered, but it is built on
    the non-blocking isend()/irecv() primitives and sleeps between polls, so
    no cpu is burnt while waiting.

    :param command: string to deliver; it is pickled before being sent.
    :param comm: communicator object providing isend() and irecv().
    :param dest: rank of the worker that receives the command.
    :param tag: message tag used for both the command and its answer.
    :raises Exception: if the command cannot be sent within TIMEOUT seconds,
        or if the worker answers with anything different from 0 (the answer
        is then used as the error message).
    """
    # The environment can be very long, so its content is not echoed.
    if not command.startswith('env='):
        print("Sending command to %d: %s" % (dest, command))
    else:
        print("Sending environment to %d" % dest)

    # Post the (pickled) command and poll until it has left, or give up.
    payload = dumps(command)
    pending_send = comm.isend(payload, dest=dest, tag=tag)
    started = time()
    while True:
        delivered = pending_send.test()[0]
        if delivered:
            break
        sleep(1)
        elapsed = time() - started
        if elapsed > TIMEOUT:
            raise Exception("Timeout in process %d, cannot send command "
                            "to worker %d." % (os.getpid(), dest))

    # Wait for the answer, polling once per second. There is no timeout
    # here: the loop only ends when the worker answers.
    pending_recv = comm.irecv(source=dest, tag=tag)
    finished, answer = pending_recv.test()
    while not finished:
        sleep(1)
        finished, answer = pending_recv.test()

    # 0 means success; anything else is the worker's error description.
    if answer != 0:
        print(redStr("Worker process %d has failed. Please check the terminal "
                     "for details." % dest))
        raise Exception(str(answer))
def runJobMPI(programname, params, mpiComm, mpiDest,
              numberOfMpi=1, hostConfig=None,
              env=None, cwd=None, gpuList=None):
    """ Build the command line for a job and hand it to an MPI worker.

    Up to three messages are sent, in this order and all with the tag
    TAG_RUN_JOB + mpiDest: the working directory (only if cwd is given),
    the environment (only if env is given) and finally the command itself.
    Any exception raised by send() propagates to the caller.
    """
    command = buildRunCommand(programname, params, numberOfMpi, hostConfig,
                              env, gpuList=gpuList)

    workerTag = TAG_RUN_JOB + mpiDest

    def deliver(message):
        # Every message of this job goes to the same worker with the same tag
        send(message, mpiComm, mpiDest, workerTag)

    if cwd is not None:
        deliver("cwd=%s" % cwd)

    if env is not None:
        deliver("env=%s" % env)

    deliver(command)
def runJobMPISlave(mpiComm):
    """ Worker loop: receive messages from rank 0 and execute them
    until the string 'None' is received.

    Three kinds of messages are understood (all but the stop signal are
    unpickled with loads() first):

    - "cwd=<dir>": remember the directory for the next command.
    - "env=<dict repr>": remember the environment for the next command.
    - anything else: run it with runCommand() using the remembered
      directory and environment, which are then forgotten.

    After every message a result is sent back to rank 0: 0 on success or
    the text of the raised exception on failure.

    :param mpiComm: communicator providing Get_rank(), irecv() and isend().
    """
    rank = mpiComm.Get_rank()
    hostname = getLocalHostName()
    print(" Running MPIWorker: ", rank)

    # Listen for commands until we get 'None'
    cwd = None  # We run without changing directory by default
    env = None  # And we don't change the environment either!

    while True:
        # Start every message with a clean result. Keeping the value of a
        # previous iteration would report all messages that follow a failed
        # one as failed too, even when they succeed.
        exitResult = 0

        # Receive command in a non-blocking way
        req_recv = mpiComm.irecv(source=0, tag=TAG_RUN_JOB+rank)
        while True:
            done, command = req_recv.test()
            if done:
                break
            sleep(1)

        print(" Worker %s(rank %d) received command." % (hostname, rank))
        # The stop signal is compared as the plain string 'None'; any other
        # message is expected to be pickled (send() uses dumps()).
        if command == 'None':
            print(" Stopping...")
            return
        else:
            command = loads(command)

        # Run the command and get the result (exit code or exception)
        try:
            if command.startswith("cwd="):
                cwd = command.split("=", 1)[-1]
                print(" Changing to dir %s ..." % cwd)
            elif command.startswith("env="):
                env = command.split("=", 1)[-1]
                # NOTE(security): eval() executes whatever text was
                # received. This is only acceptable while the sender is
                # trusted; consider a safer serialization for the
                # environment.
                env = eval(env)
                print(" Setting the environment...")
                if Config.debugOn():
                    print(env)
            else:
                runCommand(command, cwd=cwd, env=env)
                cwd = None  # unset directory
                env = None  # unset environment
        except Exception as e:
            print(" Error in process %d (rank %d)" % (os.getpid(), rank))
            import traceback
            traceback.print_exc()
            exitResult = str(e)
            # The sender stops its cwd/env/command sequence as soon as one
            # message fails (send() raises), so forget any directory or
            # environment already received: they must not leak into the
            # next job.
            cwd = None
            env = None

        # Communicate to master, either error or success
        req_send = mpiComm.isend(exitResult, dest=0, tag=TAG_RUN_JOB+rank)
        t0 = time()
        while not req_send.test()[0]:
            sleep(1)
            if time() - t0 > TIMEOUT:
                # Best effort: report locally and keep serving messages
                msg = " Error in process %d, cannot send error message to master."
                print(msg % os.getpid())
                break