#!/usr/bin/env bash
# Copyright 2021 The gRPC Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# WARNING: this script has been reviewed by the security team, so
# any changes need to be made with great care.
# Contact @jtattermusch or @amidlash if in doubt.

# This script is responsible for remotely running tests on an ARM instance.
# At the start, it provisions a new AWS ARM64 instance and then uses
# it to execute a test script (and cleans up afterwards).
# It should return a status code useful to the kokoro infrastructure.
#
# Environment variables read by this script:
#   KOKORO_KEYSTORE_DIR     (required) directory holding the AWS credentials file
#   KOKORO_JOB_NAME         used to tag the AWS instance
#   KOKORO_BUILD_NUMBER     used to tag the AWS instance
#   AWS_INSTANCE_TYPE       (optional) instance type, defaults to t4g.xlarge
#   REMOTE_WORKLOAD_SCRIPT  (optional) test script to run remotely, relative
#                           to the gRPC repository root

# TODO(jtattermusch): make the script safe to run under "set -ex"
set -e

if [ -z "$KOKORO_KEYSTORE_DIR" ]; then
  echo "KOKORO_KEYSTORE_DIR is unset. This must be run from kokoro"
  exit 1
fi

AWS_CREDENTIALS=${KOKORO_KEYSTORE_DIR}/73836_grpc_aws_ec2_credentials

# Setup aws cli
curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
unzip -q awscliv2.zip
sudo ./aws/install
aws --version

# authenticate with aws cli
# (-p / -f keep these steps from aborting the script under "set -e"
# when ~/.aws or the credentials link already exist)
mkdir -p ~/.aws/
echo "[default]" >> ~/.aws/config
ln -sf "$AWS_CREDENTIALS" ~/.aws/credentials

# setup instance
sudo apt update && sudo apt install -y jq

# ubuntu 18.04 lts(arm64)
# https://aws.amazon.com/amazon-linux-ami/
AWS_MACHINE_IMAGE=ami-026141f3d5c6d2d0c
# use 4-core instance by default
AWS_INSTANCE_TYPE=${AWS_INSTANCE_TYPE:-t4g.xlarge}
AWS_SECURITY_GROUP=sg-021240e886feba750
# Max allowed lifespan of the AWS instance. After this period of time, the instance will
# self-terminate (delete itself). This is very important to ensure that there will
# be no orphaned AWS instances if the initiating kokoro job fails / gets cancelled etc.
AWS_INSTANCE_MAX_LIFESPAN_MINS=120
# increase the size of the root volume so that builds don't run out of disk space
AWS_STORAGE_SIZE_GB=75
AWS_DEVICE_MAPPING="DeviceName='/dev/sda1',Ebs={VolumeSize=${AWS_STORAGE_SIZE_GB}}"
AWS_INSTANCE_TAGS="ResourceType='instance',Tags=[{Key='kokoro_job_name',Value='${KOKORO_JOB_NAME}'},{Key='kokoro_build_number',Value='${KOKORO_BUILD_NUMBER}'},{Key='kokoro_aws_integration',Value='true'}]"

# Generate a throwaway client key (used to log in to the instance) and a
# throwaway server host key (installed on the instance through cloud-init so
# that the host can be pinned in known_hosts before the first connection).
ssh-keygen -N '' -t rsa -b 4096 -f ~/.ssh/temp_client_key
ssh-keygen -N '' -t ecdsa -b 256 -f ~/.ssh/temp_server_key
# The private key is embedded as a YAML block scalar below, so every line of
# it must be indented deeper than the "ecdsa_private" key that introduces it.
SERVER_PRIVATE_KEY=$(sed 's/^/    /' ~/.ssh/temp_server_key)
SERVER_PUBLIC_KEY=$(awk '{print $1 " " $2 " root@localhost"}' ~/.ssh/temp_server_key.pub)
SERVER_HOST_KEY_ENTRY=$(awk '{print $1 " " $2}' ~/.ssh/temp_server_key.pub)
CLIENT_PUBLIC_KEY=$(cat ~/.ssh/temp_client_key.pub)

# Write the cloud-init user data: authorize the client key, install the
# server host key and schedule the self-termination of the instance.
{
  echo '#cloud-config'
  echo 'ssh_authorized_keys:'
  echo "  - $CLIENT_PUBLIC_KEY"
  echo 'ssh_keys:'
  echo '  ecdsa_private: |'
  echo "$SERVER_PRIVATE_KEY"
  echo "  ecdsa_public: $SERVER_PUBLIC_KEY"
  echo ''
  echo 'runcmd:'
  echo "  - sleep ${AWS_INSTANCE_MAX_LIFESPAN_MINS}m"
  echo '  - shutdown'
} > userdata

# The aws call is kept out of a pipeline on purpose: as a plain assignment,
# a failure of the aws cli aborts the script under "set -e" instead of being
# hidden behind the exit status of jq.
RUN_INSTANCES_OUTPUT=$(aws ec2 run-instances --image-id "$AWS_MACHINE_IMAGE" --instance-initiated-shutdown-behavior=terminate \
    --instance-type "$AWS_INSTANCE_TYPE" \
    --security-group-ids "$AWS_SECURITY_GROUP" \
    --user-data file://userdata \
    --block-device-mapping "$AWS_DEVICE_MAPPING" \
    --tag-specifications "$AWS_INSTANCE_TAGS" \
    --region us-east-2)
# "-r" prints the raw string (no surrounding quotes); "// empty" turns a
# missing field into an empty string rather than the literal "null".
ID=$(jq -r '.Instances[0].InstanceId // empty' <<<"$RUN_INSTANCES_OUTPUT")
if [ -z "$ID" ]; then
  echo "Failed to obtain the instance id from the 'aws ec2 run-instances' output."
  exit 1
fi
echo "instance-id=$ID"
echo "Waiting 1m for instance ip..."
sleep 1m
DESCRIBE_INSTANCES_OUTPUT=$(aws ec2 describe-instances \
    --instance-id="$ID" \
    --region us-east-2)
IP=$(jq -r '.Reservations[0].Instances[0].NetworkInterfaces[0].Association.PublicIp // empty' <<<"$DESCRIBE_INSTANCES_OUTPUT")
if [ -z "$IP" ]; then
  # The instance still shuts itself down after AWS_INSTANCE_MAX_LIFESPAN_MINS.
  echo "Failed to obtain the public ip of instance $ID."
  exit 1
fi
# Pin the freshly generated server host key for the instance's address.
SERVER_HOST_KEY_ENTRY="$IP $SERVER_HOST_KEY_ENTRY"
echo "$SERVER_HOST_KEY_ENTRY" >> ~/.ssh/known_hosts
echo "Waiting 2m for instance to initialize..."
sleep 2m

echo "Copying workspace to remote instance..."
# use rsync over ssh since it's much faster than scp
time rsync -e "ssh -i ~/.ssh/temp_client_key" -a github/grpc "ubuntu@${IP}:~/workspace"
echo "Beginning CI workload..."

# filename of the test script to execute remotely, relative to gRPC repository root
# use a default value if the env variable is not set
REMOTE_WORKLOAD_SCRIPT=${REMOTE_WORKLOAD_SCRIPT:-tools/internal_ci/linux/aws/grpc_aws_experiment_remote.sh}

# run remote workload script in the background, with redirected stdout and stderr
# to avoid problems with ssh session not closing after the remote script finishes
# but stdout and stderr are still open because the remote has spawned subprocesses
# that keep stdout and stderr open.
# * PID of the process that executes the remote script will be stored in aws_build.pid
# * stderr and stdout will be streamed to aws_build.log
# * once done, the exitcode of the remote script will be in aws_build.exitcode
REMOTE_WORKLOAD_COMMAND="nohup bash -c '(bash grpc/${REMOTE_WORKLOAD_SCRIPT}; echo \$? >/tmp/aws_build.exitcode) >>/tmp/aws_build.log 2>&1' >/dev/null 2>&1 & echo \$! >/tmp/aws_build.pid"

# the tail command simply streams the contents of aws_build.log as they become available
# and stops when the remote workload exits (determined based on the PID)
SSH_COMMAND='uname -a; rm -f /tmp/aws_build.log /tmp/aws_build.exitcode /tmp/aws_build.pid; touch /tmp/aws_build.log; cd ~/workspace; '"${REMOTE_WORKLOAD_COMMAND};"' tail -f /tmp/aws_build.log --pid $(cat /tmp/aws_build.pid); exit $(cat /tmp/aws_build.exitcode)'

# Capture the exit code instead of letting "set -e" abort the script, so that
# the artifacts are still copied and the instance is still shut down when the
# remote workload fails.
REMOTE_SCRIPT_EXITCODE=0
time ssh -i ~/.ssh/temp_client_key "ubuntu@${IP}" "${SSH_COMMAND}" || REMOTE_SCRIPT_EXITCODE=$?

echo "Copying artifacts from the remote instance..."
ARTIFACT_RSYNC_PATTERN="**/*sponge_log.*"
# NOTE: the include "*/" rule and --prune-empty-dirs are important for not
# excluding parent directories that contain artifacts before they
# get a chance to be examined (see man rsync)
COPY_ARTIFACTS_EXITCODE=0
time rsync -av -e "ssh -i ~/.ssh/temp_client_key" --include="${ARTIFACT_RSYNC_PATTERN}" --include="*/" --exclude="*" --prune-empty-dirs "ubuntu@${IP}:~/workspace/grpc" github || COPY_ARTIFACTS_EXITCODE=$?

# Regardless of the remote script's result (success or failure), initiate shutdown of AWS instance a minute from now.
# The small delay is useful to make sure the ssh session doesn't hang up on us if shutdown happens too quickly.
echo "Shutting down instance $ID."
ssh -i ~/.ssh/temp_client_key "ubuntu@${IP}" "sudo shutdown +1" || echo "WARNING: Failed to initiate AWS instance shutdown."

# Pick the exit status reported to kokoro:
# * normally the remote script's exit code is propagated as is
# * if the remote script passed but copying the artifacts failed, the
#   rsync exit code is reported instead, so that the failure is not hidden.
if [[ "$REMOTE_SCRIPT_EXITCODE" != "0" || "$COPY_ARTIFACTS_EXITCODE" == "0" ]]; then
  # Match exitcode
  echo "Exiting with exitcode $REMOTE_SCRIPT_EXITCODE based on remote script output."
  exit $REMOTE_SCRIPT_EXITCODE
fi

echo "Exiting with exitcode $COPY_ARTIFACTS_EXITCODE since remote script has passed, but copying artifacts has failed."
exit $COPY_ARTIFACTS_EXITCODE