#!/bin/sh
##**************************************************************
##
## Copyright (C) 1990-2017, Condor Team, Computer Sciences Department,
## University of Wisconsin-Madison, WI.
##
## Licensed under the Apache License, Version 2.0 (the "License"); you
## may not use this file except in compliance with the License.  You may
## obtain a copy of the License at
##
##    http://www.apache.org/licenses/LICENSE-2.0
##
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.
##
##**************************************************************

# Wrapper that prepares the environment for an MPICH2 (mpd-based) job.
# The command line arguments of this script are the MPI program and its
# arguments.
#
# Environment read here:
#   _CONDOR_PROCNO       - index of this node (0 is treated as head node)
#   _CONDOR_NPROCS       - total number of nodes in the job
#   CONDOR_CONTACT_FILE  - contact file removed at startup

# Set this to the bin directory of MPICH installation
MPDIR=/usr/local/mpich2/bin
PATH=$MPDIR:.:$PATH
export PATH

_CONDOR_PROCNO=$_CONDOR_PROCNO
_CONDOR_NPROCS=$_CONDOR_NPROCS

# Remove the contact file, so if we are held and released
# it can be recreated anew.  Skipped when the variable is empty so that
# rm is never handed an empty path.
if [ -n "$CONDOR_CONTACT_FILE" ]; then
  rm -f -- "$CONDOR_CONTACT_FILE"
fi

# Make the tools in Condor's libexec directory (as reported by
# condor_config_val) reachable through PATH.
PATH=$(condor_config_val libexec)/:$PATH

# mpd needs a conf file, and it must be
# permissions 0700
# -p: do not fail if the directory is left over from an earlier run.
mkdir -p tmp
MPD_CONF_FILE=$(pwd)/tmp/mpd_conf_file
export MPD_CONF_FILE

# Set the core file size limit to zero.
ulimit -c 0

# If you have a shared file system, maybe you
# want to put the mpd.conf file in your home
# directory
# The file is written inside a subshell with a restrictive umask so the
# password is never readable by group/other, not even between creation
# and the chmod below.  The umask change does not leak out of the subshell.
(
  umask 077
  echo "password=somepassword" > "$MPD_CONF_FILE"
)
chmod 0700 "$MPD_CONF_FILE"

# If on the head node, start mpd, get the port and host,
# and condor_chirp it back into the ClassAd
# so the non-head nodes can find the head node.
# Role dispatch on the node index in _CONDOR_PROCNO:
#   node 0      - head node: starts mpd, publishes its host/port in the job
#                 ClassAd, waits for all nodes, then runs the MPI program
#                 given as this script's arguments.
#   other nodes - poll the job ClassAd for the head node's host/port and
#                 attach a local mpd to it.
if [ "$_CONDOR_PROCNO" -eq 0 ]; then
  mpd > "mpd.out.$_CONDOR_PROCNO" 2>&1 &
  sleep 1

  # Take the first line of `mpdtrace -l`, split it on '_' and use the
  # first field as host and the second as port.
  head_entry=$(mpdtrace -l | sed 1q | tr '_' ' ')
  host=$(printf '%s\n' "$head_entry" | awk '{print $1}')
  port=$(printf '%s\n' "$head_entry" | awk '{print $2}')

  # The port is published before the host: the other nodes poll on
  # MPICH_HOST only, so MPICH_PORT must already be set once they see it.
  condor_chirp set_job_attr MPICH_PORT "$port"
  # The host is stored with literal double quotes around it; the reader
  # side strips them again.
  condor_chirp set_job_attr MPICH_HOST "\"$host\""

  # Wait until mpdtrace lists as many hosts as there are nodes.
  num_hosts=1
  retries=0
  while [ "$num_hosts" -ne "$_CONDOR_NPROCS" ]; do
    # Arithmetic expansion strips any padding wc may put around the count.
    num_hosts=$(( $(mpdtrace | wc -l) ))
    sleep 2
    retries=$((retries + 1))
    if [ "$retries" -gt 100 ]; then
      echo "Too many retries, could not start all $_CONDOR_NPROCS nodes, only started $num_hosts, giving up. Here are the hosts I could start "
      mpdtrace
      exit 1
    fi
  done

  ## run the actual mpi job, which was the command line argument
  ## to the invocation of this shell script
  # "$@" keeps every argument intact, including ones containing spaces.
  mpiexec -n "$_CONDOR_NPROCS" "$@"
  e=$?

  mpdallexit
  sleep 20
  echo "$e"
  # Propagate the MPI program's status instead of always exiting 0.
  exit "$e"
else
  # If NOT the head node, acquire the host and port of
  # the head node
  retries=0
  host=UNDEFINED
  # Keep polling while the attribute still reads UNDEFINED; an empty reply
  # (e.g. condor_chirp failed) is treated the same way.
  while [ -z "$host" ] || [ "$host" = "UNDEFINED" ]; do
    host=$(condor_chirp get_job_attr MPICH_HOST)
    sleep 2
    retries=$((retries + 1))
    if [ "$retries" -gt 100 ]; then
      echo "Too many retries, could not get mpd host from condor_chirp, giving up."
      exit 1
    fi
  done

  port=$(condor_chirp get_job_attr MPICH_PORT)
  # Drop the double quotes the head node wrapped around the host name.
  host=$(printf '%s\n' "$host" | tr -d '"')

  mpd --host="$host" --port="$port" --noconsole > "mpd.out.$_CONDOR_PROCNO" 2>&1
fi