#!/usr/bin/env bash
##**************************************************************
##
## Copyright (C) 1990-2017, Condor Team, Computer Sciences Department,
## University of Wisconsin-Madison, WI.
##
## Licensed under the Apache License, Version 2.0 (the "License"); you
## may not use this file except in compliance with the License. You may
## obtain a copy of the License at
##
##    http://www.apache.org/licenses/LICENSE-2.0
##
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.
##
##**************************************************************

# This is a script to run OpenMPI jobs under the HTCondor parallel universe.
# It assumes that a full OpenMPI install is available on all execute nodes.

## sample submit script
#universe = parallel
#executable = openmpiscript
#arguments = actual_mpi_job arg1 arg2 arg3
#getenv = true
#
#should_transfer_files = yes
#transfer_input_files = actual_mpi_job
#when_to_transfer_output = on_exit_or_evict
#
#output = out.$(NODE)
#error = err.$(NODE)
#log = log
#
#notification = never
#machine_count = 8
#queue
##

## configuration notes
# (An illustrative condor_config snippet for pool admins appears below.)

# $MPDIR points to the location of the OpenMPI install.
# You may set it manually (not recommended)
#MPDIR=/usr/lib64/openmpi
# The pool admin may set it via OPENMPI_INSTALL_PATH in the condor_config (recommended)
MPDIR=$(condor_config_val OPENMPI_INSTALL_PATH)

# $EXINT is a comma-delimited list of excluded network interfaces.
# If your MPI jobs are hanging, OpenMPI may be trying to use too many
# network interfaces to communicate between nodes.
# You may set it manually (not recommended)
#EXINT="docker0,virbr0"
# The pool admin may set it via OPENMPI_EXCLUDE_NETWORK_INTERFACES in the condor_config (recommended)
EXINT=$(condor_config_val OPENMPI_EXCLUDE_NETWORK_INTERFACES)

# We recommend that your pool admin use MOUNT_UNDER_SCRATCH = /tmp
# so that OpenMPI caches all data under the user's scratch directory.
# Not having /tmp mounted under scratch can also lead to unlink errors,
# which may hang MPI jobs.
_USE_SCRATCH=$(condor_config_val MOUNT_UNDER_SCRATCH)
if [ -z "$_USE_SCRATCH" ]; then
    echo "WARNING: MOUNT_UNDER_SCRATCH not set in condor_config"
elif test "${_USE_SCRATCH#*/tmp}" == "$_USE_SCRATCH"; then
    echo "WARNING: /tmp not included in MOUNT_UNDER_SCRATCH"
fi

# If MPDIR is not set, then use a default value
if [ -z "$MPDIR" ]; then
    echo "WARNING: Using default value for \$MPDIR in openmpiscript"
    MPDIR=/usr/lib64/openmpi
fi
PATH=$MPDIR/bin:.:$PATH
export PATH

# If EXINT is not set, then use some default values
if [ -z "$EXINT" ]; then
    echo "WARNING: Using default values for \$EXINT in openmpiscript"
    EXINT="docker0,virbr0"
fi
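
# For reference, a pool admin might add entries like the following to the
# condor_config. The values shown simply mirror this script's defaults and
# the /tmp recommendation above; substitute the paths and interface names
# appropriate for your pool:
#
#   OPENMPI_INSTALL_PATH = /usr/lib64/openmpi
#   OPENMPI_EXCLUDE_NETWORK_INTERFACES = docker0,virbr0
#   MOUNT_UNDER_SCRATCH = /tmp
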
# The condor_ssh and sshd.sh helper scripts reside in $(LIBEXEC)
CONDOR_SSH=$(condor_config_val libexec)
CONDOR_SSH=$CONDOR_SSH/condor_ssh

SSHD_SH=$(condor_config_val libexec)
SSHD_SH=$SSHD_SH/sshd.sh

##

# Set up SSHD on the node
. $SSHD_SH $_CONDOR_PROCNO $_CONDOR_NPROCS

# Set up mpirun cleanup function
_MPIRUN_PID=0
mpirun_cleanup() {
    echo "Caught SIGTERM, cleaning up..."
    if [ "$_MPIRUN_PID" -ne "0" ]; then
        # Send SIGTERM to mpirun
        echo "Sending SIGTERM to mpirun (${_MPIRUN_PID})..."
        kill -s SIGTERM $_MPIRUN_PID

        # Give mpirun 60 seconds (12 x 5s) to exit nicely before proceeding
        echo "Waiting for mpirun to exit..."
        for i in {1..12}; do
            kill -0 $_MPIRUN_PID 2> /dev/null # returns 0 if the PID is still running
            if [ "$?" -ne "0" ]; then
                break
            fi
            sleep 5
        done
    fi

    # Clean up sshd
    echo "Cleaning up sshd files..."
    sshd_cleanup
    rm -f machines

    echo "Exiting early."
    exit 1
}

# If not the head node, just sleep forever to let the SSHDs run
if [ "$_CONDOR_PROCNO" -ne 0 ]
then
    wait
    sshd_cleanup
    exit 0

# If the head node, then set the trap to clean up mpirun (also does sshd_cleanup)
else
    trap mpirun_cleanup SIGTERM
fi

EXECUTABLE=$1
shift

# The binary is copied, but the executable flag may be cleared.
chmod +x "$EXECUTABLE"

# Set the location of the contact file
CONDOR_CONTACT_FILE=$_CONDOR_SCRATCH_DIR/contact
export CONDOR_CONTACT_FILE

# The first field in the contact file contains the node ranks.
# mpirun will use a list of these node ranks,
# and condor_ssh will translate them into a hostname:port.
sort -n -k 1 < "$CONDOR_CONTACT_FILE" | awk '{print $1}' > machines

# Check which MCA ssh agent parameter to use, because each of them
# has been deprecated at one OpenMPI version or another.
_MCA_FAIL=true
for mca_ssh_agent in orte_rsh_agent plm_rsh_agent
do
    if ompi_info -a | grep "$mca_ssh_agent" 1>/dev/null 2>&1
    then
        if ompi_info -a | grep "$mca_ssh_agent" | grep deprecated 1>/dev/null 2>&1; then continue; fi
        _MCA_FAIL=false

        # set MCA values for running on HTCondor
        export OMPI_MCA_plm_rsh_no_tree_spawn="true"   # disable ssh tree spawn
        export OMPI_MCA_btl_tcp_if_exclude="lo,$EXINT" # exclude network interfaces

        # optionally set MCA values for increasing mpirun verbosity
        #export OMPI_MCA_plm_base_verbose=30
        #export OMPI_MCA_btl_base_verbose=30

        # run mpirun in the background and wait for it to exit
        mpirun -v --prefix "$MPDIR" --mca $mca_ssh_agent "$CONDOR_SSH" -n $_CONDOR_NPROCS -hostfile machines "$EXECUTABLE" "$@" &
        _MPIRUN_PID=$!
        wait $_MPIRUN_PID
        _MPIRUN_EXIT=$?
        break
    fi
done

if $_MCA_FAIL
then
    echo "Could not find a suitable MCA ssh agent"
    exit 255
fi

sshd_cleanup
rm -f machines

# exit with mpirun's exit status
exit $_MPIRUN_EXIT
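
# Troubleshooting note: if the script exits with "Could not find a suitable
# MCA ssh agent", you can check by hand which rsh agent parameters the local
# OpenMPI install actually reports with something like:
#
#   ompi_info -a | grep rsh_agent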