Skip to content
Snippets Groups Projects
Select Git revision
  • e0e5eb7c92e70291c6bea92e92651ae2f8cac8a1
  • master default protected
  • L2SS-2407-swap-iers-caltable-monitoring-port
  • L2SS-2357-fix-ruff
  • sync-up-with-meta-pypcc
  • stabilise-landing-page
  • all-stations-lofar2
  • v0.39.7-backports
  • Move-sdptr-to-v1.5.0
  • fix-build-ubuntu
  • tokens-in-env-files
  • fix-build
  • L2SS-2214-deploy-cdb
  • fix-missing-init
  • add-power-hardware-apply
  • L2SS-2129-Add-Subrack-Routine
  • Also-listen-internal-to-rpc
  • fix-build-dind
  • L2SS-2153--Improve-Error-Handling
  • L2SS-2153-Add-Grpc-Gateway-support
  • L2SS-1970-apsct-lol
  • v0.55.5 protected
  • v0.55.4 protected
  • 0.55.2.dev0
  • 0.55.1.dev0
  • 0.55.0.dev0
  • v0.54.0 protected
  • 0.53.2.dev0
  • 0.53.1.dev0
  • v0.52.3-r2 protected
  • remove-snmp-client
  • v0.52.3 protected
  • v0.52.3dev0 protected
  • 0.53.1dev0
  • v0.52.2-rc3 protected
  • v0.52.2-rc2 protected
  • v0.52.2-rc1 protected
  • v0.52.1.1 protected
  • v0.52.1 protected
  • v0.52.1-rc1 protected
  • v0.51.9-6 protected
41 results

run_integration_test.sh

Blame
  • Corné Lukken's avatar
    L2SS-530: Document device server step-by-step debug
    Corné Lukken authored
    e0e5eb7c
    History
    Code owners
    Assign users and groups as approvers for specific file changes. Learn more.
    run_integration_test.sh 10.96 KiB
    #!/bin/bash -e
    #
    # Copyright (C) 2023 ASTRON (Netherlands Institute for Radio Astronomy)
    # SPDX-License-Identifier: Apache-2.0
    #
    
    export DNS=192.168.76.1

    # Print the address that follows the "src" keyword on the first line of
    # `ip route get` output read from stdin. Prints nothing when the first line
    # has no "src" keyword.
    function route_source_ip {
      awk 'NR == 1 { for (i = 1; i < NF; i++) if ($i == "src") { print $(i + 1); exit } }'
    }

    # Local address used to reach an external host; it is passed on as the
    # debug host for the test containers. The column of the source address
    # differs between routes (on-link routes have no "via <gateway>" pair), so
    # it is looked up by keyword rather than by a fixed field number.
    LOCAL_IP=$(ip route get 195.169.155.206 | route_source_ip)
    export LOCAL_IP
    # Guard read (and unset) by the cleanup function so it only runs once
    export do_cleanup=true
    
    # Print the help text: one entry per supported way of invoking the script,
    # each prefixed with "./<name of this script>".
    function usage {
      local script entry separator=""
      script="$(basename "$0")" || script=""

      # Every entry is the text printed right after "./<script name>"; the first
      # one documents the invocation without arguments.
      local -a entries=(
        "
          no arguments, builds and configures all docker containers and starts each
          stage of the integration test one after the other. Between each stage the
          dsconfig is updated accordingly."
        " -h --help
          displays this help message"
        " --no-build
          disables building of docker images"
        " --skip-tests
          Only setup environment, implies --preserve"
        " --preserve
          Prevents tearing down the dev environment afterwards"
        " --save-logs
          Export logs for each container into the /log directory"
        " --interactive
          Allow for interactively debugging integration tests"
        " --module=<tango|services|all>
          Only start given subset of the infrastructure, defaults to all"
      )

      for entry in "${entries[@]}"; do
        # blank line between entries, none before the first
        if [[ -n "${separator}" ]]; then
          echo ""
        fi
        separator=1
        echo "./${script}${entry}"
      done
    }
    
    
    
    # list of arguments expected in the input; the double colon marks the value
    # of --module as optional (it has to be attached: --module=<value>)
    optstring_long="help,no-build,skip-tests,preserve,save-logs,interactive,module::"
    optstring="h"

    # Parse the command line options and export the matching settings.
    #
    # Arguments: the command line of the script ("$@")
    # Globals:   optstring_long, optstring (read); module, no_build, NO_BUILD,
    #            no_tests, preserve, save_logs, interactive (written)
    # Exits with status 0 after --help and with status 1 on options that getopt
    # rejects.
    function parse_arguments {
      local options

      module="all"

      # Fail explicitly instead of relying on errexit: the -e flag of the
      # shebang is lost when the script is started as `bash <script>`.
      if ! options=$(getopt -l "${optstring_long}" -o "${optstring}" -- "$@"); then
        echo "Invalid arguments, use --help for usage" >&2
        exit 1
      fi

      eval set -- "$options"

      while true; do
        case "${1}" in
          -h|--help)
            usage
            exit 0
            ;;
          --no-build)
            echo "Disable docker build step"
            export no_build=1
            export NO_BUILD=${no_build}
            ;;
          --skip-tests)
            echo "Only setup and configure environment don't run any tests"
            echo "Implies --preserve"
            export no_tests=1
            export preserve=1
            ;;
          --preserve)
            echo "Preserve test environment"
            export preserve=1
            ;;
          --save-logs)
            echo "Save logs after execution"
            export save_logs=1
            ;;
          --interactive)
            echo "Run integration tests interactively"
            export interactive=1
            ;;
          --module)
            shift
            # getopt emits an empty string when --module is given without a
            # value; keep the default instead of selecting "infra/dev/.hcl"
            module="${1:-all}"
            ;;
          --)
            shift
            break
            ;;
          *)
            # Not reachable with well-formed getopt output; prevents looping
            # forever should the option list and the cases above diverge.
            echo "Unexpected argument: ${1}" >&2
            exit 1
            ;;
        esac
        shift
      done
    }

    parse_arguments "$@"
    
    # Selecting the services module sets the same flags as --skip-tests:
    # no tests are run and the environment is preserved.
    case "${module}" in
      services)
        echo "module=services, enabling skip-tests and preservation"
        export no_tests=1
        export preserve=1
        ;;
    esac
    
    # Unless the caller provided LOFAR20_DIR, derive it from the location of
    # this script: the parent of the directory the script lives in.
    if [[ -z "$LOFAR20_DIR" ]]; then
        # $0 is taken to be a path to this script (i.e. it was not found through
        # PATH), so its directory name can be used directly.
        LOFAR20_DIR_RELATIVE="$(dirname "$0")/.."

        # Resolve to an absolute path
        LOFAR20_DIR="$(readlink -f "${LOFAR20_DIR_RELATIVE}")"
    fi
    
    # Abort early when the shyaml executable cannot be found. `command -v` is a
    # shell builtin, unlike `which` whose presence and output differ per system.
    if ! command -v shyaml > /dev/null 2>&1; then
      echo "Shyaml not found!, install using: 'python -m pip install shyaml'" >&2
      exit 1
    fi
    
    # Fall back to the "latest" image tag when the caller did not choose one
    [[ -n "$TAG" ]] || export TAG="latest"

    echo "Using TAG: ${TAG}"
    
    # Build the images locally unless --no-build was given. `-z "${y+x}"` is
    # true only when y is unset; a variable that is set but empty counts as set.
    if [ -z "${no_build+x}" ]; then
        # ${...:?} aborts instead of removing "/tangostationcontrol/dist" should
        # LOFAR20_DIR ever be empty
        rm -rf "${LOFAR20_DIR:?}/tangostationcontrol/dist"
        cd "${LOFAR20_DIR}/tangostationcontrol" || exit 1
        tox -e build
        cd "${LOFAR20_DIR}/docker" || exit 1
        make ec-sim lofar-device-base dsconfig grafana integration-test
        cd "${LOFAR20_DIR}" || exit 1
    else
      grafana_image="git.astron.nl:5000/lofar2.0/tango/grafana"

      # Prefer the image for the requested tag, fall back to latest
      docker pull -q "${grafana_image}:${TAG}" || docker pull -q "${grafana_image}:latest"

      # No local image for the requested tag: make latest available under it
      if [ -z "$(docker images -q "${grafana_image}:${TAG}" 2> /dev/null)" ]; then
        echo "Tagging Grafana latest as ${TAG} for nomad copy_image task"
        docker tag "${grafana_image}:latest" "${grafana_image}:${TAG}"
      fi
    fi
    
    # Remove any existing docker network named "station"; failure is ignored
    docker network rm station || true

    # prepare a docker volume for nomad: "test_" followed by 16 random bytes
    # written as upper-case hexadecimal
    volume_suffix="$(hexdump -n 16 -v -e '/1 "%02X"' /dev/urandom)"
    tmp_volume="test_${volume_suffix}"
    
    # Export logs (with --save-logs) and tear down the dev environment.
    #
    # Arguments: $1 - optional reason for the teardown (the traps pass the
    #                 signal name); when non-empty the environment is torn
    #                 down even if --preserve was requested.
    # Globals:   do_cleanup (read and unset), save_logs, preserve, LOFAR20_DIR,
    #            JUMPPAD_HOME, tmp_volume (read)
    #
    # Runs at most once: once do_cleanup is unset, later invocations only print
    # a notice and return.
    function cleanup {
      local teardown=${1:-}
      local nomad_server="server.station.nomad.nomad-cluster.local.jmpd.in"
      local container alloc_id alloc_name task_name create_time log_prefix

      if [ -n "${do_cleanup}" ]; then
        echo "Performing cleanup..."
        unset do_cleanup
      else
        echo "Cleanup done earlier. Skipping now"
        return
      fi

      sleep 1

      cd "$LOFAR20_DIR" || return 1
      if [ -n "${save_logs}" ]; then
        # Collecting logs is best effort. Under errexit a failing command here
        # would end the handler before the teardown below is reached, and the
        # EXIT trap would skip it as well because do_cleanup is already unset.
        mkdir -p log/allocations
        for container in $(docker ps -a --format "{{.Names}}"); do
          echo "Saving log for container $container"
          docker logs "${container}" >& "log/${container}.log" || true
        done
        bash "${LOFAR20_DIR}"/sbin/dsconfig.sh --dump >& log/dump_ConfigDb.log || true

        # obtain (and save!) the list of tasks, but only those that have actually started.
        docker exec "${nomad_server}" nomad alloc status -json > log/allocations/alloc-status.json || true

        # One line per started task: "<alloc id> <alloc name> <task name> <creation time>";
        # the trailing "_" swallows anything after the fourth field.
        jq --raw-output '.[] | (.ID + " " + .Name + " " + (.TaskStates | objects | map_values(select(.StartedAt)) | keys[]) + " " + (.CreateTime/1e9 | todate))' < log/allocations/alloc-status.json \
          | while read -r alloc_id alloc_name task_name create_time _; do
              log_prefix="log/allocations/${alloc_name}-${task_name}-${create_time}-${alloc_id}"

              echo "Saving logs for job ${task_name} allocation ${alloc_id}"
              docker exec "${nomad_server}" nomad alloc logs -stderr "${alloc_id}" "${task_name}" > "${log_prefix}.stderr.log" || true
              docker exec "${nomad_server}" nomad alloc logs -stdout "${alloc_id}" "${task_name}" > "${log_prefix}.stdout.log" || true
            done

        # clean up empty files (most containers log to either stderr or stdout)
        find log -type f -size 0 -exec rm -f -- {} + || true
      fi
      if [[ -z "${preserve}" || -n "${teardown}" ]]; then
        echo "Tearing down environment because: ${teardown}"
        # JUMPPAD_HOME only receives its default further down in the script;
        # fall back to HOME when cleanup fires before that point.
        HOME="${JUMPPAD_HOME:-$HOME}" jumppad down infra/dev
        docker volume rm "$tmp_volume" || true
      fi
    }
    
    # Run cleanup on every way out of the script. A non-empty argument makes
    # cleanup tear the environment down even with --preserve.
    #
    # INT and SIGINT (and TERM and SIGTERM) name the same signal, so each is
    # registered once. The signal handlers exit explicitly with the
    # conventional 128 + signal number: otherwise the script would resume after
    # the handler returns, with the environment already torn down.
    trap 'cleanup SIGINT; exit 130' INT
    trap 'cleanup SIGTERM; exit 143' TERM
    trap 'cleanup ERR' ERR
    trap cleanup EXIT
    
    # Apply configuration files, optionally restart the device servers, and run
    # one integration test module inside the ci-build-runner container.
    #
    # Usage: integration_test <module> [restarted_containers] [config_files]
    #   module                passed to the container as TEST_MODULE
    #   restarted_containers  space separated names handed to await after the
    #                         device-servers job was restarted; the restart is
    #                         skipped when this argument is absent
    #   config_files          space separated files applied with dsconfig.sh
    #                         before anything else
    #
    # With --interactive the container starts bash instead of tox.
    function integration_test {
      test_module="${1}"
      IFS=" " read -r -a restarts <<< "${2}"
      IFS=" " read -r -a configs <<< "${3}"

      for config in "${configs[@]}"; do
        echo "Updating config ${config} ..."
        bash "${LOFAR20_DIR}/sbin/dsconfig.sh" --update "${config}"
      done

      # ${2+x} is non-empty as soon as a second argument was passed, even an
      # empty one
      if [[ -n "${2+x}" ]]; then
        echo "restart device-servers restart ..."
        docker exec -i server.station.nomad.nomad-cluster.local.jmpd.in \
          nomad job restart -on-error=fail device-servers
        echo "await ${restarts[*]} ..."
        await "${restarts[@]}"
      fi

      # The arguments common to both modes go first and last, the mode
      # specific ones in between.
      export docker_args=(run --rm -e "TANGO_HOST=$TANGO_HOST")
      if [[ -n "${interactive+x}" ]]; then
        echo "Preparing interactive session ..."
        echo "Using local ip: ${LOCAL_IP}"
        docker_args+=(-e "DEBUG_HOST=${LOCAL_IP}" --network="station" --dns="$DNS" -it)
        export docker_command="bash"
      else
        docker_args+=(--network="station" --dns="$DNS" -i)
        export docker_command="tox -e integration"
      fi
      docker_args+=(
        -v "$LOFAR20_DIR":/opt/lofar/tango:rw
        -w="/opt/lofar/tango/tangostationcontrol"
      )

      echo "run integration ${test_module} starting at $(date +'%F %T')..."
      # docker_command is left unquoted on purpose: it has to split into words
      # shellcheck disable=SC2086
      docker "${docker_args[@]}" -e "TEST_MODULE=${test_module}" \
        "git.astron.nl:5000/lofar2.0/tango/ci-build-runner:$TAG" ${docker_command}
    }
    
    # Block until every given task group of the nomad job "device-servers"
    # reports the task state "running".
    #
    # Arguments: names of the task groups to wait for, one per argument
    # Exits the script with status 1 when a group is not running within
    # 300 seconds.
    function await {
      local nomad_server="server.station.nomad.nomad-cluster.local.jmpd.in"
      local timeout_sec=300
      local group state start_time current_time elapsed_seconds

      for group in "$@"; do
        start_time="$(date -u +%s)"
        echo -n "Wait for service ${group} to become healthy .."
        while true; do
          # Query the group that is being waited for. The name is handed to jq
          # as a variable instead of being pasted into the filter. A failing
          # query counts as "not running yet".
          state="$(docker exec -i "${nomad_server}" nomad job allocs -json device-servers \
            | jq -r --arg group "${group}" '.[] | select(.TaskGroup == $group) | .TaskStates[].State')" || true
          if [[ "${state}" == "running" ]]; then
            break
          fi

          echo -n '.'
          sleep 2
          current_time="$(date -u +%s)"
          elapsed_seconds=$(( current_time - start_time ))
          if (( elapsed_seconds > timeout_sec )); then
            printf '. \033[31m[timeout]\033[m\n'
            exit 1
          fi
        done
        sleep 2
        printf '. \033[32m[ok]\033[m\n'
      done

      # TODO(JDM): Wait for device servers to start, until we
      # implement a robust health check. Currently, using a
      # script for this in nomad seems to clash with restarting
      # a task.
      sleep 20
    }
    
    # Continue from LOFAR20_DIR; relative paths such as infra/dev are used below
    if ! cd "$LOFAR20_DIR"; then
      exit 1
    fi

    # Run the environment preparation in this shell, handing it the name of the
    # temporary volume
    # shellcheck source=/dev/null
    . "${LOFAR20_DIR}/sbin/prepare_dev_env.sh" --volume="$tmp_volume"
    
    # Home directory handed to jumppad; defaults to the caller's HOME
    : "${JUMPPAD_HOME:=$HOME}"

    # Docker endpoint handed to jumppad; defaults to the local unix socket
    : "${DOCKER_HOST:=unix:///var/run/docker.sock}"
    
    dsconfig_repo="git.astron.nl:5000/lofar2.0/tango/dsconfig"
    dsconfig_image="${dsconfig_repo}:$TAG"

    # Update the dsconfig: prefer the image for the requested tag, otherwise
    # try latest; a failing pull is not fatal
    if ! docker pull -q "$dsconfig_image"; then
      docker pull -q "${dsconfig_repo}:latest" || true
    fi

    # Without a local image for the requested tag, offer latest under that tag
    if ! docker image inspect "$dsconfig_image" > /dev/null; then
      docker tag "${dsconfig_repo}:latest" "$dsconfig_image"
    fi
    
    echo "Start module: $module"

    # Variables for the blueprint, followed by the blueprint of the selected module
    jumppad_options=(  # these don't seem to propagate
      --var="host_volume=$tmp_volume"
      --var="lofar20_dir=$LOFAR20_DIR"
      --var="image_tag=$TAG"
      --var="debug_host=$LOCAL_IP"
      "infra/dev/$module.hcl"
    )

    DOCKER_HOST="$DOCKER_HOST" HOME="$JUMPPAD_HOME" jumppad up "${jumppad_options[@]}"

    # The services module stops here, before anything tango related is awaited
    case "${module}" in
      services)
        echo "Only starting services, skipping tests"
        exit 0
        ;;
    esac
    
    # shellcheck disable=SC2046
    #eval $(HOME="$JUMPPAD_HOME" jumppad env "infra/dev/$module.hcl")

    # Poll from a container inside the station network until the tango port
    # accepts connections. The inner script only uses double quotes: a single
    # quote would end the single-quoted string handed to sh -c.
    docker run -t --rm --network="station" --dns="$DNS" busybox \
        sh -c  'echo -n "Waiting for tango service to become available .."
                until nc -z -w 1 tango.service.consul 10000; do
                  sleep 2
                  echo -n "."
                done
                echo ". [ok]"
                '

    export TANGO_HOST="tango.service.consul:10000"

    echo "Using tango host $TANGO_HOST"
    
    # Names handed to await, which matches them against the task groups of the
    # device-servers job. Always expand the array quoted: "${DEVICES[@]}".
    DEVICES=(
      device-stationmanager
      device-aps
      device-apsct
      device-ccd
      device-ec
      device-apspu
      device-sdpfirmware
      device-sdp
      device-recvh
      device-recvl
      device-bst
      device-sst
      device-unb2
      device-xst
      device-beamlet
      device-digitalbeam
      device-tilebeam
      device-psoc
      device-pcon
      device-afh
      device-afl
      device-temperaturemanager
      device-observationcontrol
      device-configuration
      device-calibration
      device-metadata
    )

    # Wait for devices to restart
    await "${DEVICES[@]}"

    # With --skip-tests the work is done once the devices are up
    if [[ -n "${no_tests}" ]]; then
      exit 0
    fi
    
    # Start the integration test

    # Names restarted for both beam performance modules, and the directory
    # holding their configuration files
    antenna_chain="device-sdpfirmware device-sdp device-recvh device-recvl"
    integration_cdb="${LOFAR20_DIR}/CDB/integrations"

    integration_test default

    integration_test tilebeam_performance \
      "${antenna_chain} device-tilebeam device-afh device-afl" \
      "${integration_cdb}/tilebeam_cluster_ConfigDb.json"

    integration_test digitalbeam_performance \
      "${antenna_chain} device-digitalbeam device-beamlet device-afh device-afl" \
      "${integration_cdb}/digitalbeam_cluster_ConfigDb.json"

    integration_test configuration "device-configuration"