#!/bin/sh

##**************************************************************
##
## Copyright (C) 1990-2007, Condor Team, Computer Sciences Department,
## University of Wisconsin-Madison, WI.
##
## Licensed under the Apache License, Version 2.0 (the "License"); you
## may not use this file except in compliance with the License.  You may
## obtain a copy of the License at
##
##    http://www.apache.org/licenses/LICENSE-2.0
##
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.
##
##**************************************************************

# Wrapper that runs a LAM/MPI job: the first argument is the MPI
# executable, the remaining arguments are handed to it unchanged.

# These self-assignments leave an already-set value untouched; they only
# make sure the names exist as shell variables (possibly empty).
_CONDOR_PROCNO=$_CONDOR_PROCNO
_CONDOR_NPROCS=$_CONDOR_NPROCS
_CONDOR_REMOTE_SPOOL_DIR=$_CONDOR_REMOTE_SPOOL_DIR

# Look up the libexec directory once; both helper paths are derived from
# it.  SSHD_SH temporarily holds the bare directory until the helper
# paths have been built, so that no extra variable name is introduced
# into the namespace shared with the sourced sshd.sh.
SSHD_SH=$(condor_config_val libexec)
if [ $? -ne 0 ] || [ -z "$SSHD_SH" ]; then
  echo "lamscript error: cannot determine the condor libexec directory" >&2
  exit 1
fi
CONDOR_SSH=$SSHD_SH/condor_ssh
SSHD_SH=$SSHD_SH/sshd.sh

# sshd.sh is sourced (not executed) so that the functions it defines,
# such as sshd_cleanup, are available below.
. "$SSHD_SH" "$_CONDOR_PROCNO" "$_CONDOR_NPROCS"

# If not the head node, just wait for the background children of this
# shell (presumably the sshd started by sshd.sh -- confirm there), to
# let the sshds run, then clean up.
if [ "$_CONDOR_PROCNO" -ne 0 ]; then
  wait
  sshd_cleanup
  exit 0
fi

# Head node only from here on: the first argument names the MPI program.
if [ $# -lt 1 ]; then
  echo "lamscript error: no executable given" >&2
  sshd_cleanup
  exit 1
fi

EXECUTABLE=$1
shift

# The binary is copied but the executable flag is cleared,
# so the script has to take care of this.
chmod +x "$EXECUTABLE"

# Set this to the bin directory of your lam installation
# This also must be in your .cshrc file, so the remote side
# can find it!
# The default below is site-specific; a LAMDIR value already present in
# the environment takes precedence over it.
LAMDIR=${LAMDIR:-/u/g/t/gthain/lam-7.0.6/bin}
PATH=$LAMDIR:$PATH
export PATH

# To allow multiple lam jobs running on a single machine,
# we have to give each a somewhat unique value (the shell's pid).
LAM_MPI_SESSION_SUFFIX=$$
export LAM_MPI_SESSION_SUFFIX

# This is the way to accomplish the above when running
# LAM < 7.0.2
LAM_MPI_SOCKET_SUFFIX=$$
export LAM_MPI_SOCKET_SUFFIX

# Use condor_ssh as LAM's remote shell.  Assignment and export are kept
# separate so the value is never word-split by the export builtin.
LAMRSH=$CONDOR_SSH
export LAMRSH

# When a job is killed by the user, this script will get SIGTERM.
# The script has to catch it and do the cleaning for the lam
# environment.  The exit status is that of lamhalt.
finalize() {
  sshd_cleanup
  rm -f machines
  lamhalt
  exit
}
trap finalize TERM

CONDOR_CONTACT_FILE=$_CONDOR_SCRATCH_DIR/contact
export CONDOR_CONTACT_FILE

# Order the contact file numerically by its first field.  The second
# field in the contact file is the machine name that condor_ssh knows
# how to use; one name per line is written to ./machines.
sort -n -k 1 < "$CONDOR_CONTACT_FILE" | awk '{print $2}' > machines

# Start the lam environment.
# For older versions of lam you may need to remove the -ssi boot rsh line
if ! lamboot -ssi boot rsh machines; then
  echo "lamscript error booting lam"
  # Do not leave the sshd or the machines file behind on a failed boot.
  sshd_cleanup
  rm -f machines
  exit 1
fi

## Run the actual mpijob in the background so that the TERM trap can
## fire while we wait.  "$@" keeps every job argument intact, including
## arguments that contain whitespace or glob characters.
mpirun C "$EXECUTABLE" "$@" &
CHILD=$!

# Keep waiting for as long as wait reports a status above 128
# (presumably because the wait itself was interrupted by a signal --
# TODO confirm); TMP starts above 128 so the loop body runs at least once.
TMP=130
while [ $TMP -gt 128 ]; do
  wait $CHILD
  TMP=$?
done

# clean up files
sshd_cleanup
rm -f machines

# clean up lam
lamhalt

# Report the status collected by the last wait.
exit $TMP