#!/bin/sh

##**************************************************************
##
## Copyright (C) 1990-2007, Condor Team, Computer Sciences Department,
## University of Wisconsin-Madison, WI.
##
## Licensed under the Apache License, Version 2.0 (the "License"); you
## may not use this file except in compliance with the License.  You may
## obtain a copy of the License at
##
##    http://www.apache.org/licenses/LICENSE-2.0
##
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.
##
##**************************************************************

# Launcher for an MPICH2 (mpd-based) job running as a multi-node Condor job.
#
# Every node runs this script.  Node 0 (the "head node") starts the first
# mpd, publishes its host/port into the job ad with condor_chirp, waits for
# the other nodes to join the ring, and then runs the job with mpiexec.
# Every other node polls the job ad for that host/port and joins the ring.
#
# Environment read by this script:
#   _CONDOR_PROCNO       index of this node (0 is the head node)
#   _CONDOR_NPROCS       total number of nodes in the job
#   CONDOR_CONTACT_FILE  contact file to remove on start-up (optional)
#   EXECUTABLE           prepended to the mpiexec command line (may be empty)
# Arguments: passed through unchanged to mpiexec after $EXECUTABLE.
# Exit status (head node): the exit status of mpiexec, or 1 if the mpd ring
# could not be assembled.

# Set this to the bin directory of MPICH installation
#MPDIR=/usr/local/mpich2/bin
#PATH=$MPDIR:.:$PATH
#export PATH

# Without these two values the node cannot tell which role it plays; fail
# right away instead of falling into the worker branch and polling forever.
if [ -z "${_CONDOR_PROCNO:-}" ] || [ -z "${_CONDOR_NPROCS:-}" ]; then
  echo "_CONDOR_PROCNO and _CONDOR_NPROCS must be set" >&2
  exit 1
fi

# Remove the contact file, so if we are held and released
# it can be recreated anew
if [ -n "${CONDOR_CONTACT_FILE:-}" ]; then
  rm -f -- "$CONDOR_CONTACT_FILE"
fi

# condor_chirp is looked up via Condor's libexec directory.
PATH=$(condor_config_val libexec)/:$PATH

# mpd needs a conf file, and it must not be accessible to other users.
# -p: the directory may already exist if the job was held and released.
mkdir -p tmp
MPD_CONF_FILE=$(pwd)/tmp/mpd_conf_file
export MPD_CONF_FILE

ulimit -c 0

# If you have a shared filesystem, maybe you
# want to put the mpd.conf file in your home
# directory

# Create the file under a restrictive umask so the password is never
# readable by others; the subshell keeps the umask change from leaking
# into files created later by the job itself.
(
  umask 077
  echo "password=somepassword" > "$MPD_CONF_FILE"
)
chmod 0700 "$MPD_CONF_FILE"

# If on the head node, start mpd, get the host and port it is listening
# on, and condor_chirp them back into the job ad so the other nodes can
# see them.
if [ "$_CONDOR_PROCNO" -eq 0 ]; then
  mpd > "mpd.out.$_CONDOR_PROCNO" 2>&1 &
  sleep 1

  # The first line of "mpdtrace -l" is parsed as <host>_<port>; query it
  # once so host and port always come from the same snapshot.
  mpd_id=$(mpdtrace -l | sed 1q | tr '_' ' ')
  host=$(echo "$mpd_id" | awk '{print $1}')
  port=$(echo "$mpd_id" | awk '{print $2}')

  condor_chirp set_job_attr MPICH_PORT "$port"
  # The host is stored with literal double quotes around it.
  condor_chirp set_job_attr MPICH_HOST "\"$host\""

  # Wait until every node has joined the ring (one mpdtrace line per node).
  num_hosts=1
  retries=0
  while [ "$num_hosts" -ne "$_CONDOR_NPROCS" ]; do
    # Some wc implementations pad the count with blanks; strip them.
    num_hosts=$(mpdtrace | wc -l | tr -d ' ')
    sleep 2
    retries=$((retries + 1))
    if [ "$retries" -gt 100 ]; then
      echo "Too many retries, could not start all $_CONDOR_NPROCS nodes, only started $num_hosts, giving up. Here are the hosts I could start "
      mpdtrace
      exit 1
    fi
  done

  ## run the actual mpijob
  # $EXECUTABLE is intentionally unquoted: it may be empty (in which case
  # the command comes entirely from the arguments) and quoting it would
  # hand mpiexec an empty argument.  "$@" is quoted so that each job
  # argument reaches mpiexec exactly as it was given.
  # shellcheck disable=SC2086
  mpiexec -n "$_CONDOR_NPROCS" $EXECUTABLE "$@"
  e=$?

  mpdallexit
  rm -f "mpd.out.$_CONDOR_PROCNO"
  sleep 20
  echo "$e"
  # Propagate the MPI job's status instead of always exiting 0.
  exit "$e"
else
  # Worker node: poll the job ad until the head node has published a
  # usable host and port.  Both lookups must succeed and both values must
  # be defined and non-empty before we try to join the ring.
  host=
  port=
  while :; do
    if port=$(condor_chirp get_job_attr MPICH_PORT) &&
      host=$(condor_chirp get_job_attr MPICH_HOST); then
      # Drop the double quotes the head node stored around the host.
      host=$(echo "$host" | tr -d '"')
      if [ -n "$port" ] && [ "$port" != "UNDEFINED" ] &&
        [ -n "$host" ] && [ "$host" != "UNDEFINED" ]; then
        break
      fi
    fi
    sleep 2
  done

  # Runs in the foreground; the output file is removed once mpd returns.
  mpd --noconsole --host="$host" --port="$port" > "mpd.out.$_CONDOR_PROCNO" 2>&1
  rm -f "mpd.out.$_CONDOR_PROCNO"
fi