#!/bin/sh
##**************************************************************
##
## Copyright (C) 1990-2010, Condor Team, Computer Sciences Department,
## University of Wisconsin-Madison, WI.
##
## Licensed under the Apache License, Version 2.0 (the "License"); you
## may not use this file except in compliance with the License.  You may
## obtain a copy of the License at
##
##    http://www.apache.org/licenses/LICENSE-2.0
##
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.
##
##**************************************************************

# YOU MUST CHANGE THIS TO THE PREFIX DIR OF OPENMPI
# The POSIX `[` test is used (not `[[`) because this script runs under
# /bin/sh, which is not guaranteed to be bash.
if [ -d /usr/lib64/openmpi ]; then
    MPDIR=/usr/lib64/openmpi
else
    MPDIR=/usr/lib/openmpi
fi

# Put the Open MPI binaries first on PATH, followed by the current
# directory, ahead of whatever PATH was inherited.
PATH=$MPDIR/bin:.:$PATH
export PATH

# This is a script to run openmpi jobs under the Condor parallel universe.
# openmpi assumes that a full openmpi is pre-installed on all execute
# machines.

# A sample submit file might look like...
#
#universe = parallel
#executable = openmpiscript
#getenv=true
#arguments = actual_mpi_job arg1 arg2 arg3
#
#should_transfer_files = yes
#when_to_transfer_output = on_exit_or_evict
#
#output = o.$(NODE)
#error  = e.$(NODE)
#log    = l
#
#notification = never
#machine_count = 8
#queue

# These self-assignments do not change the values; both variables are
# expected to be provided already by the environment the job runs in.
_CONDOR_PROCNO=$_CONDOR_PROCNO
_CONDOR_NPROCS=$_CONDOR_NPROCS

# Ask Condor once for its libexec directory; both helpers are looked up
# there.
libexec_dir=$(condor_config_val libexec)
CONDOR_SSH=$libexec_dir/condor_ssh
SSHD_SH=$libexec_dir/sshd.sh

# Source (not execute) sshd.sh so that whatever it defines, such as the
# sshd_cleanup command used later in this script, is available here.
. "$SSHD_SH" "$_CONDOR_PROCNO" "$_CONDOR_NPROCS"

# Openmpi assumes that all of the subprocs run the same binary
# image within the same work directory.  To fake this condition
# for independent condor job slots, create a symlink in /local/tmp
# with the directory name given by the master node.
# Build a per-job path under /local/tmp from the third "/"-separated
# field of the remote spool directory, and point it at this slot's
# scratch directory.
jobid=$(printf '%s\n' "$_CONDOR_REMOTE_SPOOL_DIR" | awk -F/ '{print $3}')
tmpdir=/local/tmp/mpi_job_$jobid
rm -f "$tmpdir"
ln -s "$_CONDOR_SCRATCH_DIR" "$tmpdir"

# If not the head node, just wait for this shell's background jobs
# (presumably the sshd started by sshd.sh -- confirm there) to finish,
# then clean up and leave.
if [ "$_CONDOR_PROCNO" -ne 0 ]; then
    wait
    sshd_cleanup
    rm -f "$tmpdir"
    exit 0
fi

# Head node only from here on: the first argument is the MPI program,
# the remaining arguments are passed through to it.
EXECUTABLE=$1
shift

# The binary is copied but the executable flag is cleared,
# so the script has to take care of this.
chmod +x "$EXECUTABLE"

CONDOR_CONTACT_FILE=$_CONDOR_SCRATCH_DIR/contact
export CONDOR_CONTACT_FILE

# The second field in the contact file is the machine name
# that condor_ssh knows how to use.  Entries are sorted numerically on
# the first field and written to the hostfile as "<field2>:<field1>".
sort -n -k 1 < "$CONDOR_CONTACT_FILE" | awk '{print $2":"$1}' > machines

## run the actual mpijob
# "$@" is quoted so that arguments containing whitespace reach the MPI
# program unchanged.
mpirun -v --prefix "$MPDIR" -wdir "$tmpdir" \
    --mca plm_rsh_agent "$CONDOR_SSH" \
    -n "$_CONDOR_NPROCS" -hostfile machines \
    "$EXECUTABLE" "$@"
# Remember mpirun's status now; the cleanup commands below would
# otherwise overwrite $? before the final exit.
mpirun_status=$?

########## For mpi versions 1.1 & 1.2 use the line below
#mpirun -v --mca plm_rsh_agent "$CONDOR_SSH" -n "$_CONDOR_NPROCS" -hostfile machines "$EXECUTABLE" "$@"

sshd_cleanup
rm -f "$tmpdir"
rm -f machines

exit "$mpirun_status"