# Python wrapper script for collecting Canary metrics, setting-up/tearing-down alarms, reporting metrics to Cloudwatch,
# checking the alarms to ensure everything is correct at the end of the run, and pushing the log to S3 if successful.

# Needs to be installed prior to running
# Part of standard packages in Python 3.4+
import argparse
import time
import datetime
# Dependencies in project folder
from CanaryWrapper_Classes import *
from CanaryWrapper_MetricFunctions import *


def _parse_bool_argument(value):
    """Convert a command line string into a boolean.

    ``type=bool`` must not be used with argparse: ``bool("False")`` is ``True``
    because every non-empty string is truthy, so a flag could never be turned
    off from the command line.

    Args:
        value: The raw string argparse received for the option.

    Returns:
        True or False for a recognised spelling, or None for an empty string so
        that the empty-value fallback below can apply the option's default.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean.
    """
    normalized = value.strip().lower()
    if normalized == "":
        return None
    if normalized in ("true", "t", "yes", "y", "1", "on"):
        return True
    if normalized in ("false", "f", "no", "n", "0", "off"):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value (true/false), got '" + value + "'")


# Code for command line argument parsing
# ================================================================================
command_parser = argparse.ArgumentParser("CanaryWrapper")
command_parser.add_argument("--canary_executable", type=str, required=True,
    help="The path to the canary executable (or program - like 'python3')")
command_parser.add_argument("--canary_arguments", type=str, default="",
    help="The arguments to pass/launch the canary executable with")
command_parser.add_argument("--git_hash", type=str, required=True,
    help="The Git commit hash that we are running the canary with")
command_parser.add_argument("--git_repo_name", type=str, required=True,
    help="The name of the Git repository")
command_parser.add_argument("--git_hash_as_namespace", type=_parse_bool_argument, default=False,
    help="(OPTIONAL, default=False) If true, the git hash will be used as the name of the Cloudwatch namespace")
command_parser.add_argument("--output_log_filepath", type=str, default="output.log",
    help="(OPTIONAL, default=output.log) The file to output log info to. Set to 'None' to disable")
command_parser.add_argument("--output_to_console", type=_parse_bool_argument, default=True,
    help="(OPTIONAL, default=True) If true, info will be output to the console")
command_parser.add_argument("--cloudwatch_region", type=str, default="us-east-1",
    help="(OPTIONAL, default=us-east-1) The AWS region for Cloudwatch")
command_parser.add_argument("--s3_bucket_name", type=str, default="canary-wrapper-folder",
    help="(OPTIONAL, default=canary-wrapper-folder) The name of the S3 bucket where success logs will be stored")
command_parser.add_argument("--snapshot_wait_time", type=int, default=600,
    help="(OPTIONAL, default=600) The number of seconds between gathering and sending snapshot reports")
command_parser.add_argument("--ticket_category", type=str, default="AWS",
    help="(OPTIONAL, default=AWS) The category to register the ticket under")
command_parser.add_argument("--ticket_type", type=str, default="SDKs and Tools",
    help="(OPTIONAL, default='SDKs and Tools') The type to register the ticket under")
command_parser.add_argument("--ticket_item", type=str, default="IoT SDK for CPP",
    help="(OPTIONAL, default='IoT SDK for CPP') The item to register the ticket under")
command_parser.add_argument("--ticket_group", type=str, default="AWS IoT Device SDK",
    help="(OPTIONAL, default='AWS IoT Device SDK') The group to register the ticket under")
command_parser.add_argument("--dependencies", type=str, default="",
    help="(OPTIONAL, default='') Any dependencies and their commit hashes. "
         "Current expected format is '(name or path);(hash);(next name or path);(hash);(etc...)'.")
command_parser.add_argument("--lambda_name", type=str, default="iot-send-email-lambda",
    help="(OPTIONAL, default='iot-send-email-lambda') The name of the Lambda used to send emails")
command_parser.add_argument("--codebuild_log_path", type=str, default="",
    help="The CODEBUILD_LOG_PATH environment variable. Leave blank to ignore")

command_parser_arguments = command_parser.parse_args()

# The literal text 'None' (and, further below, an empty string) disables the log file path
if (command_parser_arguments.output_log_filepath == "None"):
    command_parser_arguments.output_log_filepath = None
# A non-positive wait time is replaced with 60 seconds
if (command_parser_arguments.snapshot_wait_time <= 0):
    command_parser_arguments.snapshot_wait_time = 60

# Deal with possibly empty values in semi-critical commands/arguments.
# The wrapper cannot run without these three, so an empty value is fatal.
for required_argument_name in ("canary_executable", "git_hash", "git_repo_name"):
    if (getattr(command_parser_arguments, required_argument_name) == ""):
        print ("ERROR - required " + required_argument_name + " is empty!", flush=True)
        exit (1)

# Boolean options that were passed as an empty string (parsed to None) get their defaults back
if (command_parser_arguments.git_hash_as_namespace is not True and command_parser_arguments.git_hash_as_namespace is not False):
    command_parser_arguments.git_hash_as_namespace = False
if (command_parser_arguments.output_to_console is not True and command_parser_arguments.output_to_console is not False):
    command_parser_arguments.output_to_console = True

# Optional string arguments that were passed as an empty string fall back to these values
empty_argument_fallbacks = (
    ("output_log_filepath", None),
    ("cloudwatch_region", "us-east-1"),
    ("s3_bucket_name", "canary-wrapper-folder"),
    ("ticket_category", "AWS"),
    ("ticket_type", "SDKs and Tools"),
    ("ticket_item", "IoT SDK for CPP"),
    ("ticket_group", "AWS IoT Device SDK"),
)
for fallback_argument_name, fallback_value in empty_argument_fallbacks:
    if (getattr(command_parser_arguments, fallback_argument_name) == ""):
        setattr(command_parser_arguments, fallback_argument_name, fallback_value)
# ================================================================================
# Timestamp identifying this run; it is handed to the snapshot and also printed for reference
datetime_now = datetime.datetime.now()
datetime_string = datetime_now.strftime("%d-%m-%Y/%H-%M-%S")
print("Datetime string is: " + datetime_string, flush=True)

# Make the snapshot class
# NOTE(review): output_log_filepath is fixed to "output.txt" here even though --output_log_filepath is
# parsed (default "output.log"). Left unchanged because the two defaults differ - confirm which is intended.
data_snapshot = DataSnapshot(
    git_hash=command_parser_arguments.git_hash,
    git_repo_name=command_parser_arguments.git_repo_name,
    datetime_string=datetime_string,
    git_hash_as_namespace=command_parser_arguments.git_hash_as_namespace,
    git_fixed_namespace_text="mqtt5_canary",
    output_log_filepath="output.txt",
    output_to_console=command_parser_arguments.output_to_console,
    # Honor --cloudwatch_region instead of a hard-coded region (the argument's default is still us-east-1)
    cloudwatch_region=command_parser_arguments.cloudwatch_region,
    cloudwatch_make_dashboard=False,
    cloudwatch_teardown_alarms_on_complete=True,
    cloudwatch_teardown_dashboard_on_complete=True,
    s3_bucket_name=command_parser_arguments.s3_bucket_name,
    s3_bucket_upload_on_complete=True,
    lambda_name=command_parser_arguments.lambda_name,
    metric_frequency=command_parser_arguments.snapshot_wait_time)

# Make sure nothing failed
if (data_snapshot.abort_due_to_internal_error == True):
    print ("INFO - Stopping application due to error caused by credentials")
    print ("Please fix your credentials and then restart this application again", flush=True)
    exit(0)

# Register metrics
data_snapshot.register_metric(
    new_metric_name="total_cpu_usage",
    new_metric_function=get_metric_total_cpu_usage,
    new_metric_unit="Percent",
    new_metric_alarm_threshold=70,
    new_metric_reports_to_skip=1,
    new_metric_alarm_severity=5,
    is_percent=True)
# Registered without an alarm threshold or severity
data_snapshot.register_metric(
    new_metric_name="total_memory_usage_value",
    new_metric_function=get_metric_total_memory_usage_value,
    new_metric_unit="Bytes")
data_snapshot.register_metric(
    new_metric_name="total_memory_usage_percent",
    new_metric_function=get_metric_total_memory_usage_percent,
    new_metric_unit="Percent",
    new_metric_alarm_threshold=70,
    new_metric_reports_to_skip=0,
    new_metric_alarm_severity=5,
    is_percent=True)
# Print diagnosis information
data_snapshot.output_diagnosis_information(command_parser_arguments.dependencies)

# Make the snapshot (metrics) monitor
snapshot_monitor = SnapshotMonitor(
    wrapper_data_snapshot=data_snapshot,
    wrapper_metrics_wait_time=command_parser_arguments.snapshot_wait_time)

# Make sure nothing failed
if (snapshot_monitor.had_internal_error == True):
    print ("INFO - Stopping application due to error caused by credentials")
    print ("Please fix your credentials and then restart this application again", flush=True)
    exit(0)

# Make the application monitor
application_monitor = ApplicationMonitor(
    wrapper_application_path=command_parser_arguments.canary_executable,
    wrapper_application_arguments=command_parser_arguments.canary_arguments,
    wrapper_application_restart_on_finish=False,
    data_snapshot=data_snapshot # pass the data_snapshot for printing to the log
)

# Make sure nothing failed
if (application_monitor.error_has_occurred == True):
    print ("INFO - Stopping application due to error caused by credentials")
    print ("Please fix your credentials and then restart this application again", flush=True)
    exit(0)

# For tracking if we stopped due to a metric alarm
stopped_due_to_metric_alarm = False

# Seconds slept between monitor polls; also reported to the monitors as the time passed
execution_sleep_time = 30


def execution_loop():
    """Poll both monitors every execution_sleep_time seconds until one of them reports a stop condition."""
    while True:
        snapshot_monitor.monitor_loop_function(
            time_passed=execution_sleep_time,
            psutil_process=application_monitor.application_process_psutil)
        application_monitor.monitor_loop_function(
            time_passed=execution_sleep_time)

        # Did a metric go into alarm?
        if (snapshot_monitor.has_cut_ticket == True):
            # Set that we had an 'internal error' so we go down the right code path
            snapshot_monitor.had_internal_error = True
            break

        # If an error has occurred or otherwise this thread needs to stop, then break the loop
        if (application_monitor.error_has_occurred == True or snapshot_monitor.had_internal_error == True):
            break

        time.sleep(execution_sleep_time)


def _cut_wrapper_ticket(ticket_description, ticket_reason):
    """Cut a severity 4 ticket for this run through cut_ticket_using_cloudwatch.

    The repository, hash, region and ticket routing fields all come from the
    parsed command line arguments; only the description and reason vary.

    Args:
        ticket_description: Text passed as the ticket description.
        ticket_reason: Text passed as the ticket reason.
    """
    cut_ticket_using_cloudwatch(
        git_repo_name=command_parser_arguments.git_repo_name,
        git_hash=command_parser_arguments.git_hash,
        git_hash_as_namespace=command_parser_arguments.git_hash_as_namespace,
        git_fixed_namespace_text="mqtt5_canary",
        # Honor --cloudwatch_region instead of a hard-coded region (the argument's default is still us-east-1)
        cloudwatch_region=command_parser_arguments.cloudwatch_region,
        ticket_description=ticket_description,
        ticket_reason=ticket_reason,
        ticket_allow_duplicates=True,
        ticket_category=command_parser_arguments.ticket_category,
        ticket_item=command_parser_arguments.ticket_item,
        ticket_group=command_parser_arguments.ticket_group,
        ticket_type=command_parser_arguments.ticket_type,
        ticket_severity=4)


def application_thread():
    """Run the canary from start to finish.

    Sends the start email, runs both monitors until execution_loop returns,
    works out why the run stopped (cutting a ticket where needed), cleans up
    both monitors, sends the finish email unless it was suppressed, and then
    exits the process with the application monitor's error code.
    """
    start_email_body = "MQTT5 Short Running Canary Wrapper has started for "
    start_email_body += "\"" + command_parser_arguments.git_repo_name + "\" commit \"" + command_parser_arguments.git_hash + "\""
    start_email_body += "\nThe wrapper will run for the length the MQTT5 Canary application is set to run for, which is determined by "
    start_email_body += "the arguments set. The arguments used for this run are listed below:"
    start_email_body += "\n  Arguments: " + command_parser_arguments.canary_arguments
    snapshot_monitor.send_email(email_body=start_email_body, email_subject_text_append="Started")

    # Start the application going
    snapshot_monitor.start_monitoring()
    application_monitor.start_monitoring()
    # Allow the snapshot monitor to cut tickets
    snapshot_monitor.can_cut_ticket = True

    # Start the execution loop
    execution_loop()

    # Make sure everything is stopped
    snapshot_monitor.stop_monitoring()
    application_monitor.stop_monitoring()

    # Track whether this counts as an error (and therefore we should cleanup accordingly) or not
    wrapper_error_occurred = False
    # Finished Email
    send_finished_email = True
    finished_email_body = "MQTT5 Short Running Canary Wrapper has stopped."
    finished_email_body += "\n\n"

    # In every failure branch below the error flag and the email text are set BEFORE the ticket is cut.
    # If cutting the ticket raises, the run must still be cleaned up and reported as a failure.
    try:
        # Find out why we stopped
        if (snapshot_monitor.had_internal_error):
            wrapper_error_occurred = True
            if (snapshot_monitor.has_cut_ticket):
                # We do not need to cut a ticket here - it's cut by the snapshot monitor!
                print ("ERROR - Snapshot monitor stopped due to metric in alarm!", flush=True)
                finished_email_body += "Failure due to required metrics being in alarm! A new ticket should have been cut!"
                finished_email_body += "\nMetrics in Alarm: " + str(snapshot_monitor.cloudwatch_current_alarms_triggered)
            else:
                print ("ERROR - Snapshot monitor stopped due to internal error!", flush=True)
                # str() so a non-string reason cannot raise while the messages are being built
                internal_error_reason = str(snapshot_monitor.internal_error_reason)
                finished_email_body += "Failure due to Snapshot monitor stopping due to an internal error."
                finished_email_body += " Reason given for error: " + internal_error_reason
                _cut_wrapper_ticket(
                    ticket_description="Snapshot monitor stopped due to internal error! Reason info: " + internal_error_reason,
                    ticket_reason="Snapshot monitor stopped due to internal error")

        elif (application_monitor.error_has_occurred):
            if (application_monitor.error_due_to_credentials):
                print ("INFO - Stopping application due to error caused by credentials")
                print ("Please fix your credentials and then restart this application again", flush=True)
                wrapper_error_occurred = True
                send_finished_email = False
            # Is the error something in the canary failed?
            elif (application_monitor.error_code != 0):
                wrapper_error_occurred = True
                finished_email_body += "Failure due to MQTT5 application exiting with a non-zero exit code! This means something in the Canary application itself failed"
                _cut_wrapper_ticket(
                    ticket_description="The Short Running Canary exited with a non-zero exit code! This likely means something in the canary failed.",
                    ticket_reason="The Short Running Canary exited with a non-zero exit code")
            else:
                print ("INFO - Stopping application. No error has occurred, application has stopped normally", flush=True)
                application_monitor.print_stdout()
                finished_email_body += "Short Running Canary finished successfully and run without errors!"
                wrapper_error_occurred = False

        else:
            print ("ERROR - Short Running Canary stopped due to unknown reason!", flush=True)
            wrapper_error_occurred = True
            finished_email_body += "Failure due to unknown reason! This shouldn't happen and means something has gone wrong!"
            _cut_wrapper_ticket(
                ticket_description="The Short Running Canary stopped for an unknown reason!",
                ticket_reason="The Short Running Canary stopped for unknown reason")

    except Exception as e:
        # Best effort: a failure here must not prevent the cleanup and the finish email below
        print ("ERROR: Could not (possibly) cut ticket due to exception!")
        print ("Exception: " + str(e), flush=True)

    # Clean everything up and stop
    snapshot_monitor.cleanup_monitor(error_occurred=wrapper_error_occurred)
    application_monitor.cleanup_monitor(error_occurred=wrapper_error_occurred)
    print ("Short Running Canary finished!", flush=True)

    # Build the S3 console link to the log; failed runs get a Failed_Logs/ segment in the prefix
    finished_email_body += "\n\nYou can find the log file for this run at the following S3 location: "
    finished_email_body += "https://s3.console.aws.amazon.com/s3/object/"
    finished_email_body += command_parser_arguments.s3_bucket_name
    finished_email_body += "?region=" + command_parser_arguments.cloudwatch_region
    finished_email_body += "&prefix=" + command_parser_arguments.git_repo_name + "/" + datetime_string + "/"
    if (wrapper_error_occurred):
        finished_email_body += "Failed_Logs/"
    finished_email_body += command_parser_arguments.git_hash + ".log"
    if (command_parser_arguments.codebuild_log_path != ""):
        print ("\n Codebuild log path: " + command_parser_arguments.codebuild_log_path + "\n")

    # Send the finish email
    if (send_finished_email):
        if (wrapper_error_occurred):
            snapshot_monitor.send_email(email_body=finished_email_body, email_subject_text_append="Had an error")
        else:
            snapshot_monitor.send_email(email_body=finished_email_body, email_subject_text_append="Finished")

    exit (application_monitor.error_code)


# Start the application!
application_thread()