diff --git a/.gitattributes b/.gitattributes
index 166eefc15b7b03d5d24a7356214f729fa306ee0e..a38765307f7f163f7b8fa8920de71c071d8f29d4 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -5568,6 +5568,7 @@ SubSystems/Online_Cobalt/validation/intercluster/connectivity/cobalt2locus.test
 SubSystems/Online_Cobalt/validation/intercluster/ethernet/iperf-cobalt2locus.bw-req -text
 SubSystems/Online_Cobalt/validation/intercluster/ethernet/iperf-cobalt2locus.test -text
 SubSystems/Online_Cobalt/validation/intercluster/funcs.sh eol=lf
+SubSystems/Online_Cobalt/validation/intercluster/infiniband/cobalt2-to-cep4.test -text
 SubSystems/Online_Cobalt/validation/system/gpu/basic-gpu.test eol=lf
 SubSystems/Online_Cobalt/validation/system/gpu/persistence-mode.test eol=lf
 SubSystems/Online_Cobalt/validation/system/hardware/sata-ahci.test eol=lf
diff --git a/SubSystems/Online_Cobalt/validation/intercluster/infiniband/cobalt2-to-cep4.test b/SubSystems/Online_Cobalt/validation/intercluster/infiniband/cobalt2-to-cep4.test
new file mode 100755
index 0000000000000000000000000000000000000000..a39c0a05fe4d78c3f3a47d92f679cc5c7641a922
--- /dev/null
+++ b/SubSystems/Online_Cobalt/validation/intercluster/infiniband/cobalt2-to-cep4.test
@@ -0,0 +1,87 @@
+#!/bin/bash
+#
+# Validate MPI bandwidth over infiniband from every cobalt2 IPoIB interface to
+# every CEP4 cpu node, using the osu_bw benchmark found next to mpirun.
+# Exits non-zero if any pair cannot be measured or stays below REQUIRED_BW.
+
+source "$(dirname "$0")/../../validation_utils.sh"
+check_running_on_cobalt2
+
+# Print the first IPv4 address 'host' reports for hostname $1.
+# Returns non-zero (and prints nothing) when no address record is found,
+# so lookup errors are never mistaken for an address.
+lookup_ipv4() {
+    host "$1" | awk '$2 == "has" && $3 == "address" { print $4; found = 1; exit } END { exit !found }'
+}
+
+#find latest osu-micro-benchmarks dir
+MPIRUN=$(command -v mpirun)
+if [ -z "$MPIRUN" ] ; then
+    echo "ERROR: mpirun not found in PATH"
+    exit 1
+fi
+MPITESTSDIR=$(readlink -f -n "$(dirname "$MPIRUN")/../tests")
+# version sort, so that e.g. 5.10 ranks after 5.9
+OSUMBDIR=$(find "$MPITESTSDIR" -type d -name 'osu-micro-benchmarks*' | sort -V | tail -n 1)
+if [ -z "$OSUMBDIR" ] ; then
+    echo "ERROR: no osu-micro-benchmarks dir found in $MPITESTSDIR"
+    exit 1
+fi
+echo "Latest mpi osu-micro-benchmarks dir: $OSUMBDIR"
+
+REQUIRED_BW=90 #in Gbps
+
+EXIT_CODE=0
+
+# check mpi bandwidth over infiniband from each cobalt2 infiniband interface to each cep4 cpu node
+for i in {201..213} ; do
+    for j in {1..2} ; do
+        COBALT_SOURCE_IF_NAME=$(printf "cbt%03d-IPoIB%02d.cobalt.lofar" "$i" "$j")
+        if ! COBALT_SOURCE_IF_IP=$(lookup_ipv4 "$COBALT_SOURCE_IF_NAME") ; then
+            echo "Could not resolve $COBALT_SOURCE_IF_NAME"
+            EXIT_CODE=1
+            continue
+        fi
+
+        for k in {1..50} ; do
+            CEP4_DEST_IF_NAME=$(printf "cpu%02d.cep4.infiniband.lofar" "$k")
+            if ! CEP4_DEST_IF_IP=$(lookup_ipv4 "$CEP4_DEST_IF_NAME") ; then
+                echo "Could not resolve $CEP4_DEST_IF_NAME"
+                EXIT_CODE=1
+                continue
+            fi
+
+            echo "mpirun -x UCX_SHM_DEVICES= -H $COBALT_SOURCE_IF_IP,$CEP4_DEST_IF_IP $OSUMBDIR/osu_bw"
+
+            # measure throughput for 4MB blocks, results are in MBps
+            if ! RESULT=$(mpirun -x UCX_SHM_DEVICES="" -H "$COBALT_SOURCE_IF_IP,$CEP4_DEST_IF_IP" "$OSUMBDIR/osu_bw") ; then
+                echo "Could not run infiniband bandwith test between $COBALT_SOURCE_IF_NAME ($COBALT_SOURCE_IF_IP) and $CEP4_DEST_IF_NAME ($CEP4_DEST_IF_IP): $RESULT"
+                EXIT_CODE=1
+                continue
+            fi
+
+            # take the second column of the row whose first column is exactly 4194304 (the 4MB block size)
+            THROUGHPUT=$(echo "$RESULT" | awk '$1 == "4194304" { print $2; exit }')
+            if ! [[ "$THROUGHPUT" =~ ^[0-9]+([.][0-9]+)?$ ]] ; then
+                # without this check a missing result would make the numeric test below error out and silently pass
+                echo "Could not find 4MB block result of infiniband bandwith test between $COBALT_SOURCE_IF_NAME ($COBALT_SOURCE_IF_IP) and $CEP4_DEST_IF_NAME ($CEP4_DEST_IF_IP): $RESULT"
+                EXIT_CODE=1
+                continue
+            fi
+
+            #convert to Gbps (bc truncates to whole Gbps with its default scale)
+            THROUGHPUT=$(echo "$THROUGHPUT*8/1000" | bc)
+
+            echo "infiniband bandwith for 4MB block between $COBALT_SOURCE_IF_NAME ($COBALT_SOURCE_IF_IP) and $CEP4_DEST_IF_NAME ($CEP4_DEST_IF_IP) is $THROUGHPUT Gbps"
+
+            # check if throughput reaches the required bandwidth
+            if [ "$THROUGHPUT" -lt "$REQUIRED_BW" ] ; then EXIT_CODE=1 ; fi
+        done
+    done
+done
+
+if [ $EXIT_CODE -ne 0 ] ; then
+    echo "ERROR: not all pairs of cobalt and cep4 nodes reach required $REQUIRED_BW Gbps over infiniband"
+fi
+
+exit $EXIT_CODE