Select Git revision
run_integration_test.sh

L2SS-530: Document device server step-by-step debug
Corné Lukken authored
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
run_integration_test.sh 10.96 KiB
#!/bin/bash -e
#
# Copyright (C) 2023 ASTRON (Netherlands Institute for Radio Astronomy)
# SPDX-License-Identifier: Apache-2.0
#
# DNS server the test containers on the "station" network should use.
export DNS=192.168.76.1
# Ask the kernel which source address it would use to reach an external host
# (195.169.155.206); field 7 of the `ip route get` output is that address.
# NOTE(review): assumes the standard "… dev X src IP …" output layout of
# iproute2 — verify on hosts with unusual routing.
LOCAL_IP=$(ip route get 195.169.155.206 | head -1 | cut -d' ' -f7)
export LOCAL_IP
# Guard consumed by the cleanup trap so its body only runs once.
export do_cleanup=true
# Print the command-line help text describing every supported flag.
function usage {
  local prog
  prog="./$(basename "$0")"
  cat <<EOF
${prog}
no arguments, builds and configures all docker containers and starts each
stage of the integration test one after the other. Between each stage the
dsconfig is updated accordingly.

${prog} -h --help
displays this help message

${prog} --no-build
disables building of docker images

${prog} --skip-tests
Only setup environment, implies --preserve

${prog} --preserve
Prevents tearing down the dev environment afterwards

${prog} --save-logs
Export logs for each container into the /log directory

${prog} --interactive
Allow for interactively debugging integration tests

${prog} --module=<tango|services|all>
Only start given subset of the infrastructure, defaults to all
EOF
}
# Long/short option specs for getopt(1); "module::" declares an *optional*
# argument, which therefore must be supplied as --module=<value>.
optstring_long="help,no-build,skip-tests,preserve,save-logs,interactive,module::"
optstring="h"
# Normalise the command line. Quote the spec expansions (SC2086) and handle a
# parse failure explicitly: the shebang's -e is lost when the script is run
# as `bash run_integration_test.sh`, so don't rely on it here.
options=$(getopt -l "${optstring_long}" -o "${optstring}" -- "$@") || { usage >&2; exit 1; }
eval set -- "$options"
module="all"
# Consume the getopt-normalised argument list. getopt terminates the list
# with "--", which triggers the break below; looping while arguments remain
# (instead of `while true`) additionally prevents an infinite loop should
# the terminator ever be absent ("shift" past the end would otherwise leave
# $1 empty forever).
while (( $# > 0 )); do
  case "$1" in
    -h|--help)
      usage
      exit 0
      ;;
    --no-build)
      echo "Disable docker build step"
      export no_build=1
      export NO_BUILD=${no_build}
      ;;
    --skip-tests)
      echo "Only setup and configure environment don't run any tests"
      echo "Implies --preserve"
      export no_tests=1
      export preserve=1
      ;;
    --preserve)
      echo "Preserve test environment"
      export preserve=1
      ;;
    --save-logs)
      echo "Save logs after execution"
      export save_logs=1
      ;;
    --interactive)
      echo "Run integration tests interactively"
      export interactive=1
      ;;
    --module)
      # getopt emits an empty argument when the optional value is omitted
      # (bare "--module" instead of "--module=<value>"); fall back to the
      # default instead of silently setting module="" which would later
      # resolve to the nonexistent "infra/dev/.hcl".
      shift
      module="${1:-all}"
      ;;
    --)
      shift
      break
      ;;
  esac
  shift
done
# Bringing up only the services module implies a preserved, test-free run.
if [[ "${module}" == "services" ]]; then
  echo "module=services, enabling skip-tests and preservation"
  export no_tests=1
  export preserve=1
fi
# Derive the repository root from this script's own location when the caller
# did not provide LOFAR20_DIR (we assume we are not invoked via PATH).
if [[ -z "${LOFAR20_DIR}" ]]; then
  # parent directory of the script, resolved to an absolute path
  LOFAR20_DIR=$(readlink -f "$(dirname "$0")/..")
fi
# Fail fast when the shyaml YAML CLI is missing. `command -v` is the
# portable/builtin replacement for `which`, and the diagnostic belongs on
# stderr.
if ! command -v shyaml > /dev/null; then
  echo "Shyaml not found!, install using: 'python -m pip install shyaml'" >&2
  exit 1
fi
# Default the image tag to "latest" unless the caller supplied one.
[[ -n "${TAG}" ]] || export TAG="latest"
echo "Using TAG: ${TAG}"
# Build the images unless --no-build was given. `-z ${no_build+x}` is true
# exactly when no_build is *unset* (the +x expansion yields "x" only for a
# set variable).
if [ -z "${no_build+x}" ]; then
  rm -rf "${LOFAR20_DIR}/tangostationcontrol/dist"
  cd "${LOFAR20_DIR}/tangostationcontrol" || exit 1
  tox -e build
  cd "$LOFAR20_DIR/docker" || exit 1
  make ec-sim lofar-device-base dsconfig grafana integration-test
  cd "$LOFAR20_DIR" || exit 1
else
  # Not building: fetch the tagged grafana image, falling back to :latest.
  docker pull -q "git.astron.nl:5000/lofar2.0/tango/grafana:${TAG}" || docker pull -q "git.astron.nl:5000/lofar2.0/tango/grafana:latest"
  # If the tagged image still is not present locally, retag :latest so the
  # nomad copy_image task can find it. Image references are quoted (SC2086).
  if [ -z "$(docker images -q "git.astron.nl:5000/lofar2.0/tango/grafana:${TAG}" 2> /dev/null)" ]; then
    echo "Tagging Grafana latest as ${TAG} for nomad copy_image task"
    docker tag git.astron.nl:5000/lofar2.0/tango/grafana:latest "git.astron.nl:5000/lofar2.0/tango/grafana:${TAG}"
  fi
fi
# Remove any stale "station" network left over from a previous run; absence
# is not an error.
docker network rm station || true
# Prepare a uniquely named docker volume for nomad (test_ + 32 random hex).
random_hex=$(hexdump -n 16 -v -e '/1 "%02X"' /dev/urandom)
tmp_volume="test_${random_hex}"
# Tear down the test environment, optionally saving container/nomad logs.
# $1 (teardown): non-empty reason string; forces a jumppad teardown even when
#                --preserve was requested (used by the signal/error traps).
# Reads globals: do_cleanup, save_logs, preserve, LOFAR20_DIR, tmp_volume,
#                JUMPPAD_HOME.
function cleanup {
teardown=${1}
# do_cleanup is a one-shot guard: several traps (e.g. ERR then EXIT) may call
# cleanup; only the first invocation does the work.
if [ -n "${do_cleanup}" ]; then
echo "Performing cleanup..."
unset do_cleanup
else
echo "Cleanup done earlier. Skipping now"
return
fi
sleep 1
cd "$LOFAR20_DIR"
if [ -n "${save_logs}" ]; then
mkdir -p log
# One log file per container, stdout and stderr combined (>&).
for container in $(docker ps -a --format "{{.Names}}")
do
echo "Saving log for container $container"
docker logs "${container}" >& "log/${container}.log"
done
bash "${LOFAR20_DIR}"/sbin/dsconfig.sh --dump >& log/dump_ConfigDb.log
mkdir -p log/allocations
# obtain (and save!) the list of tasks, but only those that have actually started.
docker exec server.station.nomad.nomad-cluster.local.jmpd.in nomad alloc status -json > log/allocations/alloc-status.json
# For every started task emit "ID NAME TASK CREATETIME" (CreateTime is ns
# since epoch, hence /1e9 | todate), then fetch its stderr/stdout logs.
<log/allocations/alloc-status.json jq --raw-output '.[] | (.ID + " " + .Name + " " + (.TaskStates | objects | map_values(select(.StartedAt)) | keys[]) + " " + (.CreateTime/1e9 | todate))' | while read -r line
do
read -r -a parts <<< "${line}"
ALLOC_ID="${parts[0]}"
ALLOC_NAME="${parts[1]}"
TASK_NAME="${parts[2]}"
CREATE_TIME="${parts[3]}"
echo "Saving logs for job ${TASK_NAME} allocation ${ALLOC_ID}"
docker exec server.station.nomad.nomad-cluster.local.jmpd.in nomad alloc logs -stderr "${ALLOC_ID}" "${TASK_NAME}" > "log/allocations/${ALLOC_NAME}-${TASK_NAME}-${CREATE_TIME}-${ALLOC_ID}.stderr.log" || true
docker exec server.station.nomad.nomad-cluster.local.jmpd.in nomad alloc logs -stdout "${ALLOC_ID}" "${TASK_NAME}" > "log/allocations/${ALLOC_NAME}-${TASK_NAME}-${CREATE_TIME}-${ALLOC_ID}.stdout.log" || true
done
# clean up empty files (most containers log to either stderr or stdout)
find log -size 0 -exec rm {} ';'
fi
# Tear down unless --preserve was requested; a non-empty teardown reason
# (signal/error) overrides preservation.
if [[ -z "${preserve}" || -n "${teardown}" ]]; then
echo "Tearing down environment because: ${teardown}"
HOME="$JUMPPAD_HOME" jumppad down infra/dev
docker volume rm "$tmp_volume" || true
fi
}
# Run cleanup on interrupt, termination, error, and normal exit; the argument
# records why it fired (a non-empty reason forces teardown despite
# --preserve). Note INT and SIGINT (and TERM/SIGTERM) name the same signal:
# registering both merely overwrote the first handler, so the previously
# duplicated registrations were dead code and have been removed. The EXIT
# trap deliberately passes no reason so --preserve is honoured on a clean
# exit.
trap "cleanup SIGINT" INT
trap "cleanup ERR" ERR
trap "cleanup SIGTERM" TERM
trap cleanup EXIT
# Configure the config database, restart containers and run a specific
# integration module or even specific tests
# integration_test module restarted_containers config_files specific_test
#
# $1: TEST_MODULE passed into the ci-build-runner container (tox selects the
#     matching integration tests from it).
# $2: space-separated device-server task groups to await after a restart;
#     its mere *presence* triggers the device-servers job restart.
# $3: space-separated dsconfig JSON files applied before anything restarts.
function integration_test {
test_module=${1}
# Split the space-separated list arguments into proper arrays.
IFS=" " read -r -a restarts <<< "${2}"
IFS=" " read -r -a configs <<< "${3}"
for config in "${configs[@]}"; do
echo "Updating config ${config} ..."
bash "${LOFAR20_DIR}"/sbin/dsconfig.sh --update "${config}"
done
# "${2+x}" expands to "x" only when a second argument was supplied, so a
# plain `integration_test default` skips the restart/await step entirely.
if [ -n "${2+x}" ]; then
# shellcheck disable=SC2145
echo "restart device-servers restart ..."
docker exec -i server.station.nomad.nomad-cluster.local.jmpd.in nomad job restart -on-error=fail device-servers
# shellcheck disable=SC2145
echo "await ${restarts[@]} ..."
await "${restarts[@]}"
fi
# Interactive mode attaches a tty and exposes DEBUG_HOST; otherwise the tox
# integration suite runs unattended.
# NOTE(review): bash does not export arrays to child environments; the
# arrays below are only expanded in this shell, so `export` acts as a plain
# assignment here.
if [ -n "${interactive+x}" ]; then
echo "Preparing interactive session ..."
echo "Using local ip: ${LOCAL_IP}"
export docker_args=(
run --rm -e "TANGO_HOST=$TANGO_HOST" -e "DEBUG_HOST=${LOCAL_IP}" --network="station" --dns="$DNS" -it
-v "$LOFAR20_DIR":/opt/lofar/tango:rw
-w="/opt/lofar/tango/tangostationcontrol"
)
export docker_command="bash"
else
export docker_args=(
run --rm -e "TANGO_HOST=$TANGO_HOST" --network="station" --dns="$DNS" -i
-v "$LOFAR20_DIR":/opt/lofar/tango:rw
-w="/opt/lofar/tango/tangostationcontrol"
)
export docker_command="tox -e integration"
fi
echo "run integration ${test_module} starting at $(date +'%F %T')..."
# ${docker_command} is intentionally unquoted so "tox -e integration" word
# splits into separate arguments.
# shellcheck disable=SC2086
docker "${docker_args[@]}" -e "TEST_MODULE=${test_module}" "git.astron.nl:5000/lofar2.0/tango/ci-build-runner:$TAG" \
${docker_command}
}
# Block until every named nomad task group of the device-servers job reports
# state "running".
# Arguments: one or more task-group names.
# Exits the whole script with status 1 when a service is not healthy within
# the per-service timeout.
function await {
  # per-service timeout in seconds
  timeout_sec=300
  for service in "$@"; do
    start_time="$(date -u +%s)"
    echo -n "Wait for service ${service} to become healthy .."
    # BUGFIX: poll the task group of the *current* iteration. Previously the
    # jq filter used ${awaits}=$1, so every iteration re-checked the first
    # argument and later services were never actually awaited.
    while [ "$(docker exec -i server.station.nomad.nomad-cluster.local.jmpd.in nomad job allocs -json device-servers | jq -r ".[] | select(.TaskGroup == \"${service}\") | .TaskStates[].State")" != "running" ] ; do
      echo -n '.'
      sleep 2
      current_time="$(date -u +%s)"
      elapsed_seconds=$(("$current_time" - "$start_time"))
      if [ "${elapsed_seconds}" -gt "${timeout_sec}" ]; then
        printf ". \u1b[31m[timeout]\u1b[m\n"
        exit 1
      fi
    done
    sleep 2
    printf ". \u1b[32m[ok]\u1b[m\n"
  done
  # TODO(JDM): Wait for device servers to start, until we
  # implement a robust health check. Currently, using a
  # script for this in nomad seems to clash with restarting
  # a task.
  sleep 20
}
cd "$LOFAR20_DIR" || exit 1
# Sourced (not executed) so it can export variables into this shell.
source "${LOFAR20_DIR}"/sbin/prepare_dev_env.sh --volume="$tmp_volume"
if [ -z "$JUMPPAD_HOME" ]; then
JUMPPAD_HOME="$HOME"
fi
if [ -z "$DOCKER_HOST" ]; then
DOCKER_HOST="unix:///var/run/docker.sock"
fi
dsconfig_image="git.astron.nl:5000/lofar2.0/tango/dsconfig:$TAG"
# Update the dsconfig
# Best effort: pull the tagged image, fall back to :latest, and retag if the
# tagged image is still absent locally.
docker pull -q "$dsconfig_image" || docker pull -q "git.astron.nl:5000/lofar2.0/tango/dsconfig:latest" || true
docker image inspect "$dsconfig_image" > /dev/null || docker tag "git.astron.nl:5000/lofar2.0/tango/dsconfig:latest" "$dsconfig_image"
jumppad_options=( # these don't seem to propagate
--var="host_volume=$tmp_volume"
--var="lofar20_dir=$LOFAR20_DIR"
--var="image_tag=$TAG"
--var="debug_host=$LOCAL_IP"
)
echo "Start module: $module"
jumppad_options+=("infra/dev/$module.hcl")
DOCKER_HOST="$DOCKER_HOST" HOME="$JUMPPAD_HOME" jumppad up "${jumppad_options[@]}"
if [ "${module}" == "services" ]; then
echo "Only starting services, skipping tests"
exit 0
fi
# shellcheck disable=SC2046
#eval $(HOME="$JUMPPAD_HOME" jumppad env "infra/dev/$module.hcl")
# Poll from inside the station network until the tango database answers.
# NOTE(review): the single quotes around '.' below actually close and reopen
# the outer single-quoted string; the fragments concatenate so busybox still
# receives `echo -n .` — works, but confirm before restyling this command.
docker run -t --rm --network="station" --dns="$DNS" busybox \
sh -c 'echo -n "Waiting for tango service to become available .."
until nc -z -w 1 tango.service.consul 10000; do
sleep 2
echo -n '.'
done
echo ". [ok]"
'
export TANGO_HOST="tango.service.consul:10000"
echo "Using tango host $TANGO_HOST"
# Devices list is used to explicitly word split when supplied to commands, must
# disable shellcheck SC2086 for each case.
DEVICES=(device-stationmanager device-aps device-apsct device-ccd device-ec device-apspu device-sdpfirmware device-sdp device-recvh device-recvl device-bst device-sst device-unb2 device-xst device-beamlet device-digitalbeam device-tilebeam device-psoc device-pcon device-afh device-afl device-temperaturemanager device-observationcontrol device-configuration device-calibration device-metadata)
# Wait for devices to restart
await "${DEVICES[@]}"
if [ -n "${no_tests}" ]; then
exit 0
fi
# Start the integration test
# Stage 1: default suite against the freshly configured environment.
integration_test default
# Stages 2-3: performance suites; each applies its cluster dsconfig and
# restarts/awaits the involved device servers first.
integration_test tilebeam_performance "device-sdpfirmware device-sdp device-recvh device-recvl device-tilebeam device-afh device-afl" "${LOFAR20_DIR}/CDB/integrations/tilebeam_cluster_ConfigDb.json"
integration_test digitalbeam_performance "device-sdpfirmware device-sdp device-recvh device-recvl device-digitalbeam device-beamlet device-afh device-afl" "${LOFAR20_DIR}/CDB/integrations/digitalbeam_cluster_ConfigDb.json"
integration_test configuration "device-configuration"