# Python wrapper script for collecting Canary metrics, setting up alarms, reporting metrics to Cloudwatch,
# checking the alarms to ensure everything is correct at the end of the run, and checking for new
# builds in S3, downloading them, and launching them if they exist (24/7 operation)
#
# Will only stop running if the Canary application itself has an issue - in which case the Canary application will
# need to be fixed and then the wrapper script restarted

# Needs to be installed prior to running
# Part of standard packages in Python 3.4+
import argparse
import time
# Dependencies in project folder
from CanaryWrapper_Classes import *
from CanaryWrapper_MetricFunctions import *

# TODO - Using subprocess may not work on Windows for starting/stopping the application thread.
# Canary will likely be running on Linux, so it's probably okay, but need to confirm/check at some point....

# ================================================================================
# Code for command line argument parsing

command_parser = argparse.ArgumentParser("CanaryWrapper_24_7")
command_parser.add_argument("--canary_executable", type=str, required=True,
    help="The path to the canary executable")
command_parser.add_argument("--canary_arguments", type=str, default="",
    help="The arguments to pass/launch the canary executable with")
command_parser.add_argument("--s3_bucket_name", type=str, default="canary-wrapper-folder",
    help="(OPTIONAL, default=canary-wrapper-folder) The name of the S3 bucket where success logs will be stored")
# BUG FIX: this argument is required=True, so the old "(OPTIONAL, default=canary-wrapper-folder)"
# prefix in its help text was a copy-paste error and has been removed.
command_parser.add_argument("--s3_bucket_application", type=str, required=True,
    help="The S3 URL to monitor for changes MINUS the bucket name")
# BUG FIX: the original help text contained unescaped inner double quotes
# (help="(OPTIONAL, default="") ..."), which Python's implicit string-literal
# concatenation silently rendered as "(OPTIONAL, default=) ...". Using single
# quotes for the outer string preserves the intended '""' in the help output.
command_parser.add_argument("--s3_bucket_application_in_zip", type=str, required=False, default="",
    help='(OPTIONAL, default="") The file path in the zip folder where the application is stored. Will be ignored if set to empty string')
# BUG FIX: the help text previously claimed the default was 'CanarySendEmailLambda',
# which did not match the actual default of 'iot-send-email-lambda'.
command_parser.add_argument("--lambda_name", type=str, default="iot-send-email-lambda",
    help="(OPTIONAL, default='iot-send-email-lambda') The name of the Lambda used to send emails")
command_parser_arguments = command_parser.parse_args()

# ================================================================================
# Global variables that both threads use to communicate.
# NOTE - These should likely be replaced with futures or similar for better thread safety.
# However, these variables are only either read or written to from a single thread, no
# thread should read and write to these variables.

# The local file path (and extension) of the Canary application that the wrapper will manage
# (This will also be the filename and directory used when a new file is detected in S3)
# [THIS IS READ ONLY]
canary_local_application_path = command_parser_arguments.canary_executable
if (canary_local_application_path == ""):
    print ("ERROR - required canary_executable is empty!")
    exit (1) # cannot run without a canary executable
# This is the arguments passed to the local file path when starting
# [THIS IS READ ONLY]
canary_local_application_arguments = command_parser_arguments.canary_arguments
# The "Git Hash" to use for metrics and dimensions
# [THIS IS READ ONLY]
canary_local_git_hash_stub = "Canary"
# The "Git Repo" name to use for metrics and dimensions. Is hard-coded since this is a 24/7 canary that should only run for MQTT
# [THIS IS READ ONLY]
canary_local_git_repo_stub = "MQTT5_24_7"
# The Fixed Namespace name for the Canary
# [THIS IS READ ONLY]
canary_local_git_fixed_namespace = "MQTT5_24_7_Canary"
# The S3 bucket name to monitor for the application
# [THIS IS READ ONLY]
canary_s3_bucket_name = command_parser_arguments.s3_bucket_name
if (canary_s3_bucket_name == ""):
    canary_s3_bucket_name = "canary-wrapper-folder"
# The file in the S3 bucket to monitor (The application filepath and file. Example: "canary/canary_application.exe")
# [THIS IS READ ONLY]
canary_s3_bucket_application_path = command_parser_arguments.s3_bucket_application
if (canary_s3_bucket_application_path == ""):
    print ("ERROR - required s3_bucket_application is empty!")
    exit (1) # cannot run without a s3_bucket_application to monitor
# The location of the file in the S3 zip, if the S3 file being monitored is a zip
# (THIS IS READ ONLY)
canary_s3_bucket_application_path_zip = command_parser_arguments.s3_bucket_application_in_zip
if (canary_s3_bucket_application_path_zip == ""):
    canary_s3_bucket_application_path_zip = None
# The name of the email lambda. If an empty string is set, it defaults to 'iot-send-email-lambda'
if (command_parser_arguments.lambda_name == ""):
    command_parser_arguments.lambda_name = "iot-send-email-lambda"
# The region the canary is running in
# (THIS IS READ ONLY)
canary_region_stub = "us-east-1"
# How long (in seconds) to wait before gathering metrics and pushing them to Cloudwatch
canary_metrics_wait_time = 600 # 10 minutes
# How long (in seconds) to run the Application thread loop. Should be shorter or equal to the Canary Metrics time
canary_application_loop_wait_time = 300 # 5 minutes

# For testing - set both to 30 seconds
# canary_metrics_wait_time = 30
# canary_application_loop_wait_time = 30

# ================================================================================

# Make the snapshot class
data_snapshot = DataSnapshot(
    git_hash=canary_local_git_hash_stub,
    git_repo_name=canary_local_git_repo_stub,
    git_hash_as_namespace=False,
    datetime_string=None,
    git_fixed_namespace_text=canary_local_git_fixed_namespace,
    output_log_filepath="output.txt",
    output_to_console=True,
    cloudwatch_region=canary_region_stub,
    cloudwatch_make_dashboard=True,
    cloudwatch_teardown_alarms_on_complete=True,
    cloudwatch_teardown_dashboard_on_complete=False,
    s3_bucket_name=canary_s3_bucket_name,
    s3_bucket_upload_on_complete=True,
    lambda_name=command_parser_arguments.lambda_name,
    metric_frequency=canary_metrics_wait_time)

# Make sure nothing failed
if (data_snapshot.abort_due_to_internal_error == True):
    print ("INFO - Stopping application due to error caused by credentials")
    print ("Please fix your credentials and then restart this application again")
    exit(0)

# Register metrics
data_snapshot.register_metric(
    new_metric_name="total_cpu_usage",
    new_metric_function=get_metric_total_cpu_usage,
    new_metric_unit="Percent",
    new_metric_alarm_threshold=70,
    new_metric_reports_to_skip=1,
    new_metric_alarm_severity=5,
    is_percent=True)
data_snapshot.register_metric(
    new_metric_name="total_memory_usage_value",
    new_metric_function=get_metric_total_memory_usage_value,
    new_metric_unit="Bytes")
data_snapshot.register_metric(
    new_metric_name="total_memory_usage_percent",
    new_metric_function=get_metric_total_memory_usage_percent,
    new_metric_unit="Percent",
    new_metric_alarm_threshold=70,
    new_metric_reports_to_skip=0,
    new_metric_alarm_severity=5,
    is_percent=True)

data_snapshot.register_dashboard_widget("Process CPU Usage - Percentage", ["total_cpu_usage"], 60)
data_snapshot.register_dashboard_widget("Process Memory Usage - Percentage", ["total_memory_usage_percent"], 60)

# Print diagnosis information
data_snapshot.output_diagnosis_information("24/7 Canary cannot show dependencies!")

# Make the S3 class
s3_monitor = S3Monitor(
    s3_bucket_name=canary_s3_bucket_name,
    s3_file_name=canary_s3_bucket_application_path,
    s3_file_name_in_zip=canary_s3_bucket_application_path_zip,
    canary_local_application_path=canary_local_application_path,
    data_snapshot=data_snapshot)
if (s3_monitor.had_internal_error == True):
    print ("INFO - Stopping application due to error caused by credentials")
    print ("Please fix your credentials and then restart this application again")
    exit(0)

# Make the snapshot (metrics) monitor
snapshot_monitor = SnapshotMonitor(
    wrapper_data_snapshot=data_snapshot,
    wrapper_metrics_wait_time=canary_metrics_wait_time)
# Make sure nothing failed
if (snapshot_monitor.had_internal_error == True):
    print ("INFO - Stopping application due to error caused by credentials")
    print ("Please fix your credentials and then restart this application again")
    exit(0)

# Make the application monitor
application_monitor = ApplicationMonitor(
    wrapper_application_path=canary_local_application_path,
    wrapper_application_arguments=canary_local_application_arguments,
    wrapper_application_restart_on_finish=True,
    data_snapshot=data_snapshot)
# Make sure nothing failed
if (application_monitor.error_has_occurred == True):
    print ("INFO - Stopping application due to error caused by credentials")
    print ("Please fix your credentials and then restart this application again")
    exit(0)

# For tracking if we stopped due to a metric alarm
stopped_due_to_metric_alarm = False


def execution_loop():
    """Run the wrapper's main loop until one of the monitors reports an error.

    Each pass checks S3 for a new build (restarting the canary application
    when one is found), then lets the snapshot and application monitors tick.
    The loop only exits when the S3 monitor, the application monitor, or the
    snapshot monitor flags an error.
    """
    while True:
        s3_monitor.monitor_loop_function(time_passed=canary_application_loop_wait_time)

        # Is there an error?
        if (s3_monitor.had_internal_error == True):
            print ("[Debug] S3 monitor had an internal error!")
            break

        # Is there a new file?
        if (s3_monitor.s3_file_needs_replacing == True):
            # Stop the application, swap in the new build, then start it again
            print ("[Debug] Stopping application monitor...")
            application_monitor.stop_monitoring()
            print ("[Debug] Getting S3 file...")
            s3_monitor.replace_current_file_for_new_file()
            # Start the application
            print ("[Debug] Starting application monitor...")
            application_monitor.start_monitoring()
            # Allow the snapshot monitor to cut a ticket
            snapshot_monitor.can_cut_ticket = True

        snapshot_monitor.monitor_loop_function(
            time_passed=canary_application_loop_wait_time,
            psutil_process=application_monitor.application_process_psutil)
        application_monitor.monitor_loop_function(
            time_passed=canary_application_loop_wait_time)

        # Did a metric go into alarm?
        if (snapshot_monitor.has_cut_ticket == True):
            # Do not allow it to cut anymore tickets until it gets a new build
            snapshot_monitor.can_cut_ticket = False

        # If an error has occurred or otherwise this thread needs to stop, then break the loop
        if (application_monitor.error_has_occurred == True or snapshot_monitor.had_internal_error == True):
            if (application_monitor.error_has_occurred == True):
                print ("[Debug] Application monitor error occurred!")
            else:
                # BUG FIX: corrected "ocurred" typo in the debug message.
                print ("[Debug] Snapshot monitor internal error occurred!")
            break

        time.sleep(canary_application_loop_wait_time)


def application_thread():
    """Start all monitors, run the execution loop, then diagnose why it stopped.

    On exit the function determines which monitor failed, cuts a CloudWatch
    ticket where appropriate, sends a finish email with a link to the uploaded
    log, cleans up the monitors, and terminates the process. Because this is a
    24/7 canary, reaching the end of this function is always treated as an
    abnormal stop (the process exits with -1).
    """
    # Start the application going
    snapshot_monitor.start_monitoring()
    application_monitor.start_monitoring()
    # Allow the snapshot monitor to cut tickets
    snapshot_monitor.can_cut_ticket = True

    start_email_body = "MQTT5 24/7 Canary Wrapper has started. This will run and continue to test new MQTT5 application builds as"
    start_email_body += " they pass CodeBuild and are uploaded to S3."
    snapshot_monitor.send_email(email_body=start_email_body, email_subject_text_append="Started")

    # Start the execution loop
    execution_loop()

    # Make sure everything is stopped
    snapshot_monitor.stop_monitoring()
    application_monitor.stop_monitoring()

    # Track whether this counts as an error (and therefore we should cleanup accordingly) or not
    wrapper_error_occurred = False
    send_finished_email = True
    finished_email_body = "MQTT5 24/7 Canary Wrapper has stopped."
    finished_email_body += "\n\n"

    try:
        # Find out why we stopped
        # S3 Monitor
        if (s3_monitor.had_internal_error == True):
            # NOTE(review): when the S3 error is credentials-related we fall through
            # without cutting a ticket or flagging an error - confirm this is intended.
            if (s3_monitor.error_due_to_credentials == False):
                print ("ERROR - S3 monitor stopped due to internal error!")
                cut_ticket_using_cloudwatch(
                    git_repo_name=canary_local_git_repo_stub,
                    git_hash=canary_local_git_hash_stub,
                    git_hash_as_namespace=False,
                    git_fixed_namespace_text=canary_local_git_fixed_namespace,
                    cloudwatch_region=canary_region_stub,
                    # BUG FIX: the description previously said "Snapshot monitor" here,
                    # a copy-paste from the snapshot-monitor branch below; it now
                    # matches the ticket_reason and names the S3 monitor.
                    ticket_description="S3 monitor stopped due to internal error! Reason info: " + s3_monitor.internal_error_reason,
                    ticket_reason="S3 monitor stopped due to internal error",
                    ticket_allow_duplicates=True,
                    ticket_category="AWS",
                    ticket_type="SDKs and Tools",
                    ticket_item="IoT SDK for CPP",
                    ticket_group="AWS IoT Device SDK",
                    ticket_severity=4)
                finished_email_body += "Failure due to S3 monitor stopping due to an internal error."
                finished_email_body += " Reason given for error: " + s3_monitor.internal_error_reason
                wrapper_error_occurred = True
        # Snapshot Monitor
        elif (snapshot_monitor.had_internal_error == True):
            if (snapshot_monitor.has_cut_ticket == True):
                # We do not need to cut a ticket here - it's cut by the snapshot monitor!
                print ("ERROR - Snapshot monitor stopped due to metric in alarm!")
                finished_email_body += "Failure due to required metrics being in alarm! A new ticket should have been cut!"
                finished_email_body += "\nMetrics in Alarm: " + str(snapshot_monitor.cloudwatch_current_alarms_triggered)
                finished_email_body += "\nNOTE - this shouldn't occur in the 24/7 Canary! If it does, then the wrapper needs adjusting."
                wrapper_error_occurred = True
            else:
                print ("ERROR - Snapshot monitor stopped due to internal error!")
                cut_ticket_using_cloudwatch(
                    git_repo_name=canary_local_git_repo_stub,
                    git_hash=canary_local_git_hash_stub,
                    git_hash_as_namespace=False,
                    git_fixed_namespace_text=canary_local_git_fixed_namespace,
                    cloudwatch_region=canary_region_stub,
                    ticket_description="Snapshot monitor stopped due to internal error! Reason info: " + snapshot_monitor.internal_error_reason,
                    ticket_reason="Snapshot monitor stopped due to internal error",
                    ticket_allow_duplicates=True,
                    ticket_category="AWS",
                    ticket_type="SDKs and Tools",
                    ticket_item="IoT SDK for CPP",
                    ticket_group="AWS IoT Device SDK",
                    ticket_severity=4)
                wrapper_error_occurred = True
                finished_email_body += "Failure due to Snapshot monitor stopping due to an internal error."
                finished_email_body += " Reason given for error: " + snapshot_monitor.internal_error_reason
        # Application Monitor
        elif (application_monitor.error_has_occurred == True):
            if (application_monitor.error_due_to_credentials == True):
                print ("INFO - Stopping application due to error caused by credentials")
                print ("Please fix your credentials and then restart this application again")
                wrapper_error_occurred = True
                send_finished_email = False
            else:
                # Is the error something in the canary failed?
                if (application_monitor.error_code != 0):
                    cut_ticket_using_cloudwatch(
                        git_repo_name=canary_local_git_repo_stub,
                        git_hash=canary_local_git_hash_stub,
                        git_hash_as_namespace=False,
                        git_fixed_namespace_text=canary_local_git_fixed_namespace,
                        cloudwatch_region=canary_region_stub,
                        ticket_description="The 24/7 Canary exited with a non-zero exit code! This likely means something in the canary failed.",
                        ticket_reason="The 24/7 Canary exited with a non-zero exit code",
                        ticket_allow_duplicates=True,
                        ticket_category="AWS",
                        ticket_type="SDKs and Tools",
                        ticket_item="IoT SDK for CPP",
                        ticket_group="AWS IoT Device SDK",
                        ticket_severity=3)
                    wrapper_error_occurred = True
                    finished_email_body += "Failure due to MQTT5 application exiting with a non-zero exit code!"
                    finished_email_body += " This means something in the Canary application itself failed"
                else:
                    # Exit code was zero but the monitor should have restarted it - wrapper bug
                    cut_ticket_using_cloudwatch(
                        git_repo_name=canary_local_git_repo_stub,
                        git_hash=canary_local_git_hash_stub,
                        git_hash_as_namespace=False,
                        git_fixed_namespace_text=canary_local_git_fixed_namespace,
                        cloudwatch_region=canary_region_stub,
                        ticket_description="The 24/7 Canary exited with a zero exit code but did not restart!",
                        ticket_reason="The 24/7 Canary exited with a zero exit code but did not restart",
                        ticket_allow_duplicates=True,
                        ticket_category="AWS",
                        ticket_type="SDKs and Tools",
                        ticket_item="IoT SDK for CPP",
                        ticket_group="AWS IoT Device SDK",
                        ticket_severity=3)
                    wrapper_error_occurred = True
                    finished_email_body += "Failure due to MQTT5 application stopping and not automatically restarting!"
                    finished_email_body += " This shouldn't occur and means something is wrong with the Canary wrapper!"
        # Other
        else:
            print ("ERROR - 24/7 Canary stopped due to unknown reason!")
            cut_ticket_using_cloudwatch(
                git_repo_name=canary_local_git_repo_stub,
                git_hash=canary_local_git_hash_stub,
                git_hash_as_namespace=False,
                git_fixed_namespace_text=canary_local_git_fixed_namespace,
                cloudwatch_region=canary_region_stub,
                ticket_description="The 24/7 Canary stopped for an unknown reason!",
                ticket_reason="The 24/7 Canary stopped for unknown reason",
                ticket_allow_duplicates=True,
                ticket_category="AWS",
                ticket_type="SDKs and Tools",
                ticket_item="IoT SDK for CPP",
                ticket_group="AWS IoT Device SDK",
                ticket_severity=3)
            wrapper_error_occurred = True
            finished_email_body += "Failure due to unknown reason! This shouldn't happen and means something has gone wrong!"
    except Exception as e:
        # Best effort only - ticket cutting itself can fail (e.g. bad credentials)
        print ("ERROR: Could not (possibly) cut ticket due to exception!")
        print ("Exception: " + str(e), flush=True)

    # Clean everything up and stop
    snapshot_monitor.cleanup_monitor(error_occurred=wrapper_error_occurred)
    application_monitor.cleanup_monitor(error_occurred=wrapper_error_occurred)
    print ("24/7 Canary finished!")

    # Build the S3 console link to the uploaded log file for the finish email
    finished_email_body += "\n\nYou can find the log file for this run at the following S3 location: "
    finished_email_body += "https://s3.console.aws.amazon.com/s3/object/"
    finished_email_body += command_parser_arguments.s3_bucket_name
    finished_email_body += "?region=" + canary_region_stub
    finished_email_body += "&prefix=" + canary_local_git_repo_stub + "/"
    if (wrapper_error_occurred == True):
        finished_email_body += "Failed_Logs/"
    finished_email_body += canary_local_git_hash_stub + ".log"
    # Send the finish email
    if (send_finished_email == True):
        if (wrapper_error_occurred == True):
            snapshot_monitor.send_email(email_body=finished_email_body, email_subject_text_append="Had an error")
        else:
            snapshot_monitor.send_email(email_body=finished_email_body, email_subject_text_append="Finished")

    # A 24/7 canary should never stop on its own, so any exit is reported as failure
    exit (-1)


# Start the application!
application_thread()