# Python wrapper script for collecting Canary metrics, setting-up/tearing-down alarms, reporting metrics to Cloudwatch,
# checking the alarms to ensure everything is correct at the end of the run, and pushing the log to S3 if successful.

# Part of standard packages in Python 3.4+
import argparse
import datetime
import sys
import time
# Dependencies in project folder
from CanaryWrapper_Classes import *
from CanaryWrapper_MetricFunctions import *


# Helper functions
# ================================================================================

def _parse_bool_argument(raw_value):
    """Convert a command line string into a boolean.

    argparse's ``type=bool`` simply calls ``bool()`` on the raw string, so every
    non-empty value (including "False" and "0") would be parsed as True. This
    converter interprets the text instead.

    Returns:
        True or False for a recognised value, or None for an empty string so the
        caller can substitute the argument's default.

    Raises:
        argparse.ArgumentTypeError: if the value is not a recognised boolean.
    """
    normalized_value = raw_value.strip().lower()
    if normalized_value == "":
        return None
    if normalized_value in ("true", "t", "yes", "y", "on", "1"):
        return True
    if normalized_value in ("false", "f", "no", "n", "off", "0"):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value (true/false) but got '" + raw_value + "'")


def _stop_due_to_credentials_error():
    """Print the credentials error message and exit the wrapper with exit code 0."""
    print("INFO - Stopping application due to error caused by credentials")
    print("Please fix your credentials and then restart this application again", flush=True)
    sys.exit(0)


def _cut_wrapper_ticket(ticket_description, ticket_reason):
    """Cut a severity 4 ticket for this canary run via cut_ticket_using_cloudwatch.

    Only the description and reason differ between the failure cases; everything
    else comes from the parsed command line arguments or is fixed for this wrapper.
    """
    cut_ticket_using_cloudwatch(
        git_repo_name=command_parser_arguments.git_repo_name,
        git_hash=command_parser_arguments.git_hash,
        git_hash_as_namespace=command_parser_arguments.git_hash_as_namespace,
        git_fixed_namespace_text="mqtt5_canary",
        # NOTE(review): region is fixed here and does not follow --cloudwatch_region - confirm this is intended
        cloudwatch_region="us-east-1",
        ticket_description=ticket_description,
        ticket_reason=ticket_reason,
        ticket_allow_duplicates=True,
        ticket_category=command_parser_arguments.ticket_category,
        ticket_item=command_parser_arguments.ticket_item,
        ticket_group=command_parser_arguments.ticket_group,
        ticket_type=command_parser_arguments.ticket_type,
        ticket_severity=4)


# Code for command line argument parsing
# ================================================================================
command_parser = argparse.ArgumentParser("CanaryWrapper")
command_parser.add_argument("--canary_executable", type=str, required=True,
    help="The path to the canary executable (or program - like 'python3')")
command_parser.add_argument("--canary_arguments", type=str, default="",
    help="The arguments to pass/launch the canary executable with")
command_parser.add_argument("--git_hash", type=str, required=True,
    help="The Git commit hash that we are running the canary with")
command_parser.add_argument("--git_repo_name", type=str, required=True,
    help="The name of the Git repository")
command_parser.add_argument("--git_hash_as_namespace", type=_parse_bool_argument, default=False,
    help="(OPTIONAL, default=False) If true, the git hash will be used as the name of the Cloudwatch namespace")
command_parser.add_argument("--output_log_filepath", type=str, default="output.log",
    help="(OPTIONAL, default=output.log) The file to output log info to. Set to 'None' to disable")
command_parser.add_argument("--output_to_console", type=_parse_bool_argument, default=True,
    help="(OPTIONAL, default=True) If true, info will be output to the console")
command_parser.add_argument("--cloudwatch_region", type=str, default="us-east-1",
    help="(OPTIONAL, default=us-east-1) The AWS region for Cloudwatch")
command_parser.add_argument("--s3_bucket_name", type=str, default="canary-wrapper-folder",
    help="(OPTIONAL, default=canary-wrapper-folder) The name of the S3 bucket where success logs will be stored")
command_parser.add_argument("--snapshot_wait_time", type=int, default=600,
    help="(OPTIONAL, default=600) The number of seconds between gathering and sending snapshot reports")
command_parser.add_argument("--ticket_category", type=str, default="AWS",
    help="(OPTIONAL, default=AWS) The category to register the ticket under")
command_parser.add_argument("--ticket_type", type=str, default="SDKs and Tools",
    help="(OPTIONAL, default='SDKs and Tools') The type to register the ticket under")
command_parser.add_argument("--ticket_item", type=str, default="IoT SDK for CPP",
    help="(OPTIONAL, default='IoT SDK for CPP') The item to register the ticket under")
command_parser.add_argument("--ticket_group", type=str, default="AWS IoT Device SDK",
    help="(OPTIONAL, default='AWS IoT Device SDK') The group to register the ticket under")
command_parser.add_argument("--dependencies", type=str, default="",
    help="(OPTIONAL, default='') Any dependencies and their commit hashes. "
         "Current expected format is '(name or path);(hash);(next name or path);(hash);(etc...)'.")
command_parser.add_argument("--lambda_name", type=str, default="iot-send-email-lambda",
    help="(OPTIONAL, default='iot-send-email-lambda') The name of the Lambda used to send emails")
command_parser.add_argument("--codebuild_log_path", type=str, default="",
    help="The CODEBUILD_LOG_PATH environment variable. Leave blank to ignore")
command_parser_arguments = command_parser.parse_args()

if (command_parser_arguments.output_log_filepath == "None"):
    command_parser_arguments.output_log_filepath = None
# A non-positive wait time is replaced with 60 seconds
if (command_parser_arguments.snapshot_wait_time <= 0):
    command_parser_arguments.snapshot_wait_time = 60

# Deal with possibly empty values in semi-critical commands/arguments
# These three cannot be defaulted: the wrapper cannot run without them
for required_argument_name in ("canary_executable", "git_hash", "git_repo_name"):
    if (getattr(command_parser_arguments, required_argument_name) == ""):
        print("ERROR - required " + required_argument_name + " is empty!", flush=True)
        sys.exit(1)

# Boolean arguments passed as an empty string are parsed to None: fall back to their defaults
if (command_parser_arguments.git_hash_as_namespace is None):
    command_parser_arguments.git_hash_as_namespace = False
if (command_parser_arguments.output_to_console is None):
    command_parser_arguments.output_to_console = True

# String arguments passed as an empty string fall back to their defaults
for argument_name, fallback_value in (
        ("output_log_filepath", None),
        ("cloudwatch_region", "us-east-1"),
        ("s3_bucket_name", "canary-wrapper-folder"),
        ("ticket_category", "AWS"),
        ("ticket_type", "SDKs and Tools"),
        ("ticket_item", "IoT SDK for CPP"),
        ("ticket_group", "AWS IoT Device SDK")):
    if (getattr(command_parser_arguments, argument_name) == ""):
        setattr(command_parser_arguments, argument_name, fallback_value)

# ================================================================================

datetime_now = datetime.datetime.now()
datetime_string = datetime_now.strftime("%d-%m-%Y/%H-%M-%S")
print("Datetime string is: " + datetime_string, flush=True)

# Make the snapshot class
# NOTE(review): output_log_filepath and cloudwatch_region are fixed values here, so the
# --output_log_filepath and --cloudwatch_region arguments do not reach DataSnapshot - confirm this is intended
data_snapshot = DataSnapshot(
    git_hash=command_parser_arguments.git_hash,
    git_repo_name=command_parser_arguments.git_repo_name,
    datetime_string=datetime_string,
    git_hash_as_namespace=command_parser_arguments.git_hash_as_namespace,
    git_fixed_namespace_text="mqtt5_canary",
    output_log_filepath="output.txt",
    output_to_console=command_parser_arguments.output_to_console,
    cloudwatch_region="us-east-1",
    cloudwatch_make_dashboard=False,
    cloudwatch_teardown_alarms_on_complete=True,
    cloudwatch_teardown_dashboard_on_complete=True,
    s3_bucket_name=command_parser_arguments.s3_bucket_name,
    s3_bucket_upload_on_complete=True,
    lambda_name=command_parser_arguments.lambda_name,
    metric_frequency=command_parser_arguments.snapshot_wait_time)

# Make sure nothing failed
if (data_snapshot.abort_due_to_internal_error == True):
    _stop_due_to_credentials_error()

# Register metrics
data_snapshot.register_metric(
    new_metric_name="total_cpu_usage",
    new_metric_function=get_metric_total_cpu_usage,
    new_metric_unit="Percent",
    new_metric_alarm_threshold=70,
    new_metric_reports_to_skip=1,
    new_metric_alarm_severity=5,
    is_percent=True)
data_snapshot.register_metric(
    new_metric_name="total_memory_usage_value",
    new_metric_function=get_metric_total_memory_usage_value,
    new_metric_unit="Bytes")
data_snapshot.register_metric(
    new_metric_name="total_memory_usage_percent",
    new_metric_function=get_metric_total_memory_usage_percent,
    new_metric_unit="Percent",
    new_metric_alarm_threshold=70,
    new_metric_reports_to_skip=0,
    new_metric_alarm_severity=5,
    is_percent=True)

# Print diagnosis information
data_snapshot.output_diagnosis_information(command_parser_arguments.dependencies)

# Make the snapshot (metrics) monitor
snapshot_monitor = SnapshotMonitor(
    wrapper_data_snapshot=data_snapshot,
    wrapper_metrics_wait_time=command_parser_arguments.snapshot_wait_time)

# Make sure nothing failed
if (snapshot_monitor.had_internal_error == True):
    _stop_due_to_credentials_error()

# Make the application monitor
application_monitor = ApplicationMonitor(
    wrapper_application_path=command_parser_arguments.canary_executable,
    wrapper_application_arguments=command_parser_arguments.canary_arguments,
    wrapper_application_restart_on_finish=False,
    data_snapshot=data_snapshot  # pass the data_snapshot for printing to the log
)

# Make sure nothing failed
if (application_monitor.error_has_occurred == True):
    _stop_due_to_credentials_error()

# For tracking if we stopped due to a metric alarm
# (not read anywhere in this file; kept in case other code refers to it)
stopped_due_to_metric_alarm = False

# Number of seconds between iterations of the execution loop
execution_sleep_time = 30


def execution_loop():
    """Drive both monitors every execution_sleep_time seconds until one of them reports a stop condition.

    The loop ends when the snapshot monitor has cut a ticket, when the snapshot
    monitor had an internal error, or when the application monitor reports that
    an error has occurred.
    """
    while True:
        snapshot_monitor.monitor_loop_function(
            time_passed=execution_sleep_time, psutil_process=application_monitor.application_process_psutil)
        application_monitor.monitor_loop_function(
            time_passed=execution_sleep_time)

        # Did a metric go into alarm?
        if (snapshot_monitor.has_cut_ticket == True):
            # Set that we had an 'internal error' so we go down the right code path
            snapshot_monitor.had_internal_error = True
            break

        # If an error has occurred or otherwise this thread needs to stop, then break the loop
        if (application_monitor.error_has_occurred == True or snapshot_monitor.had_internal_error == True):
            break

        time.sleep(execution_sleep_time)


def application_thread():
    """Run the canary from start to finish and exit the wrapper.

    Sends the "Started" email, starts both monitors, runs the execution loop,
    works out why the run stopped (cutting a ticket where needed), cleans up the
    monitors, sends the finished email and exits with the application's error code.
    """
    start_email_body = "MQTT5 Short Running Canary Wrapper has started for "
    start_email_body += "\"" + command_parser_arguments.git_repo_name + "\" commit \"" + command_parser_arguments.git_hash + "\""
    start_email_body += "\nThe wrapper will run for the length the MQTT5 Canary application is set to run for, which is determined by "
    start_email_body += "the arguments set. The arguments used for this run are listed below:"
    start_email_body += "\n Arguments: " + command_parser_arguments.canary_arguments
    snapshot_monitor.send_email(email_body=start_email_body, email_subject_text_append="Started")

    # Start the application going
    snapshot_monitor.start_monitoring()
    application_monitor.start_monitoring()
    # Allow the snapshot monitor to cut tickets
    snapshot_monitor.can_cut_ticket = True

    # Start the execution loop
    execution_loop()

    # Make sure everything is stopped
    snapshot_monitor.stop_monitoring()
    application_monitor.stop_monitoring()

    # Track whether this counts as an error (and therefore we should cleanup accordingly) or not
    wrapper_error_occurred = False
    # Finished Email
    send_finished_email = True
    finished_email_body = "MQTT5 Short Running Canary Wrapper has stopped."
    finished_email_body += "\n\n"

    # In every failure branch below the error flag and the email text are set BEFORE the
    # ticket is cut, so that an exception while cutting the ticket (caught and printed
    # below) cannot leave a failed run classified as a success.
    try:
        # Find out why we stopped
        if (snapshot_monitor.had_internal_error == True):
            wrapper_error_occurred = True
            if (snapshot_monitor.has_cut_ticket == True):
                # We do not need to cut a ticket here - it's cut by the snapshot monitor!
                print("ERROR - Snapshot monitor stopped due to metric in alarm!", flush=True)
                finished_email_body += "Failure due to required metrics being in alarm! A new ticket should have been cut!"
                finished_email_body += "\nMetrics in Alarm: " + str(snapshot_monitor.cloudwatch_current_alarms_triggered)
            else:
                print("ERROR - Snapshot monitor stopped due to internal error!", flush=True)
                # str() so a missing/non-string reason cannot prevent the ticket from being cut
                internal_error_reason = str(snapshot_monitor.internal_error_reason)
                finished_email_body += "Failure due to Snapshot monitor stopping due to an internal error."
                finished_email_body += " Reason given for error: " + internal_error_reason
                _cut_wrapper_ticket(
                    ticket_description="Snapshot monitor stopped due to internal error! Reason info: " + internal_error_reason,
                    ticket_reason="Snapshot monitor stopped due to internal error")

        elif (application_monitor.error_has_occurred == True):
            if (application_monitor.error_due_to_credentials == True):
                print("INFO - Stopping application due to error caused by credentials")
                print("Please fix your credentials and then restart this application again", flush=True)
                wrapper_error_occurred = True
                send_finished_email = False
            elif (application_monitor.error_code != 0):
                # The error is that something in the canary failed
                wrapper_error_occurred = True
                finished_email_body += "Failure due to MQTT5 application exiting with a non-zero exit code! This means something in the Canary application itself failed"
                _cut_wrapper_ticket(
                    ticket_description="The Short Running Canary exited with a non-zero exit code! This likely means something in the canary failed.",
                    ticket_reason="The Short Running Canary exited with a non-zero exit code")
            else:
                print("INFO - Stopping application. No error has occurred, application has stopped normally", flush=True)
                wrapper_error_occurred = False
                finished_email_body += "Short Running Canary finished successfully and run without errors!"
                application_monitor.print_stdout()
        else:
            print("ERROR - Short Running Canary stopped due to unknown reason!", flush=True)
            wrapper_error_occurred = True
            finished_email_body += "Failure due to unknown reason! This shouldn't happen and means something has gone wrong!"
            _cut_wrapper_ticket(
                ticket_description="The Short Running Canary stopped for an unknown reason!",
                ticket_reason="The Short Running Canary stopped for unknown reason")
    except Exception as e:
        # Best effort: failing to cut a ticket must not stop the cleanup and the finished email
        print("ERROR: Could not (possibly) cut ticket due to exception!")
        print("Exception: " + str(e), flush=True)

    # Clean everything up and stop
    snapshot_monitor.cleanup_monitor(error_occurred=wrapper_error_occurred)
    application_monitor.cleanup_monitor(error_occurred=wrapper_error_occurred)
    print("Short Running Canary finished!", flush=True)

    finished_email_body += "\n\nYou can find the log file for this run at the following S3 location: "
    finished_email_body += "https://s3.console.aws.amazon.com/s3/object/"
    finished_email_body += command_parser_arguments.s3_bucket_name
    finished_email_body += "?region=" + command_parser_arguments.cloudwatch_region
    finished_email_body += "&prefix=" + command_parser_arguments.git_repo_name + "/" + datetime_string + "/"
    if wrapper_error_occurred:
        finished_email_body += "Failed_Logs/"
    finished_email_body += command_parser_arguments.git_hash + ".log"
    if (command_parser_arguments.codebuild_log_path != ""):
        print("\n Codebuild log path: " + command_parser_arguments.codebuild_log_path + "\n")

    # Send the finish email
    if send_finished_email:
        if wrapper_error_occurred:
            email_subject_suffix = "Had an error"
        else:
            email_subject_suffix = "Finished"
        snapshot_monitor.send_email(email_body=finished_email_body, email_subject_text_append=email_subject_suffix)

    # NOTE(review): the exit code is the application's error code only; a run stopped by the
    # snapshot monitor does not by itself make it non-zero - confirm this is intended
    sys.exit(application_monitor.error_code)


# Start the application!
application_thread()