xref: /aosp_15_r20/external/aws-crt-java/codebuild/CanaryWrapper.py (revision 3c7ae9de214676c52d19f01067dc1a404272dc11)
# Python wrapper script that collects Canary metrics, sets up and tears down alarms, reports metrics to CloudWatch,
# checks the alarms at the end of the run to ensure everything is correct, and pushes the log to S3 if successful.
3
4# Needs to be installed prior to running
5# Part of standard packages in Python 3.4+
6import argparse
7import time
8import datetime
9# Dependencies in project folder
10from CanaryWrapper_Classes import *
11from CanaryWrapper_MetricFunctions import *
12
# Code for command line argument parsing
# ================================================================================
def _parse_bool_argument(value):
    """Convert a command line string into a boolean for argparse.

    argparse must not be given ``type=bool``: it calls ``bool()`` on the raw
    string, and every non-empty string (including "False") is truthy, so the
    option could never be switched off from the command line.

    Args:
        value: The raw string supplied on the command line.

    Returns:
        True or False for a recognised spelling (case-insensitive), or None
        for an empty string so that the non-boolean fallback checks performed
        after parsing can substitute the option's default.

    Raises:
        argparse.ArgumentTypeError: If the string is not a recognised boolean.
    """
    normalized = value.strip().lower()
    if normalized in ("true", "t", "yes", "y", "on", "1"):
        return True
    if normalized in ("false", "f", "no", "n", "off", "0"):
        return False
    if normalized == "":
        return None
    raise argparse.ArgumentTypeError(
        "expected a boolean value (true/false), got '" + value + "'")


command_parser = argparse.ArgumentParser("CanaryWrapper")
command_parser.add_argument("--canary_executable", type=str, required=True,
    help="The path to the canary executable (or program - like 'python3')")
command_parser.add_argument("--canary_arguments", type=str, default="",
    help="The arguments to pass/launch the canary executable with")
command_parser.add_argument("--git_hash", type=str, required=True,
    help="The Git commit hash that we are running the canary with")
command_parser.add_argument("--git_repo_name", type=str, required=True,
    help="The name of the Git repository")
# Boolean options use the explicit converter above instead of type=bool
command_parser.add_argument("--git_hash_as_namespace", type=_parse_bool_argument, default=False,
    help="(OPTIONAL, default=False) If true, the git hash will be used as the name of the Cloudwatch namespace")
command_parser.add_argument("--output_log_filepath", type=str, default="output.log",
    help="(OPTIONAL, default=output.log) The file to output log info to. Set to 'None' to disable")
command_parser.add_argument("--output_to_console", type=_parse_bool_argument, default=True,
    help="(OPTIONAL, default=True) If true, info will be output to the console")
command_parser.add_argument("--cloudwatch_region", type=str, default="us-east-1",
    help="(OPTIONAL, default=us-east-1) The AWS region for Cloudwatch")
command_parser.add_argument("--s3_bucket_name", type=str, default="canary-wrapper-folder",
    help="(OPTIONAL, default=canary-wrapper-folder) The name of the S3 bucket where success logs will be stored")
command_parser.add_argument("--snapshot_wait_time", type=int, default=600,
    help="(OPTIONAL, default=600) The number of seconds between gathering and sending snapshot reports")
command_parser.add_argument("--ticket_category", type=str, default="AWS",
    help="(OPTIONAL, default=AWS) The category to register the ticket under")
command_parser.add_argument("--ticket_type", type=str, default="SDKs and Tools",
    help="(OPTIONAL, default='SDKs and Tools') The type to register the ticket under")
command_parser.add_argument("--ticket_item", type=str, default="IoT SDK for CPP",
    help="(OPTIONAL, default='IoT SDK for CPP') The item to register the ticket under")
command_parser.add_argument("--ticket_group", type=str, default="AWS IoT Device SDK",
    help="(OPTIONAL, default='AWS IoT Device SDK') The group to register the ticket under")
# Adjacent string literals are used instead of a backslash continuation so the
# source indentation does not end up inside the help text.
command_parser.add_argument("--dependencies", type=str, default="",
    help="(OPTIONAL, default='') Any dependencies and their commit hashes. "
         "Current expected format is '(name or path);(hash);(next name or path);(hash);(etc...)'.")
# The help text states the actual default value of this option
command_parser.add_argument("--lambda_name", type=str, default="iot-send-email-lambda",
    help="(OPTIONAL, default='iot-send-email-lambda') The name of the Lambda used to send emails")
command_parser.add_argument("--codebuild_log_path", type=str, default="",
    help="The CODEBUILD_LOG_PATH environment variable. Leave blank to ignore")
command_parser_arguments = command_parser.parse_args()
52
# Normalise the parsed arguments before anything uses them.

# A non-positive snapshot interval is replaced with 60 seconds
if command_parser_arguments.snapshot_wait_time <= 0:
    command_parser_arguments.snapshot_wait_time = 60

# Both the literal text 'None' and an empty string disable the log file path
if command_parser_arguments.output_log_filepath in ("None", ""):
    command_parser_arguments.output_log_filepath = None

# Deal with possibly empty values in semi-critical commands/arguments:
# the wrapper cannot run when any of these is empty.
for _required_option in ("canary_executable", "git_hash", "git_repo_name"):
    if getattr(command_parser_arguments, _required_option) == "":
        print("ERROR - required " + _required_option + " is empty!", flush=True)
        exit(1)

# Boolean options fall back to their defaults when they do not hold a boolean
if not isinstance(command_parser_arguments.git_hash_as_namespace, bool):
    command_parser_arguments.git_hash_as_namespace = False
if command_parser_arguments.output_to_console not in (True, False):
    command_parser_arguments.output_to_console = True

# Optional text options fall back to their defaults when given as an empty string
for _option_name, _fallback_value in (
        ("cloudwatch_region", "us-east-1"),
        ("s3_bucket_name", "canary-wrapper-folder"),
        ("ticket_category", "AWS"),
        ("ticket_type", "SDKs and Tools"),
        ("ticket_item", "IoT SDK for CPP"),
        ("ticket_group", "AWS IoT Device SDK")):
    if getattr(command_parser_arguments, _option_name) == "":
        setattr(command_parser_arguments, _option_name, _fallback_value)
86
87
88
89# ================================================================================
90
# Timestamp identifying this run; it is handed to the snapshot and reused in the S3 link later
datetime_now = datetime.datetime.now()
datetime_string = datetime_now.strftime("%d-%m-%Y/%H-%M-%S")
print("Datetime string is: " + datetime_string, flush=True)


def _stop_if_setup_failed(setup_failed):
    """Print the credentials notice and exit with status 0 when a setup step reported a failure."""
    if setup_failed == True:
        print("INFO - Stopping application due to error caused by credentials")
        print("Please fix your credentials and then restart this application again", flush=True)
        exit(0)


# Make the snapshot class
# NOTE(review): the log file path and the Cloudwatch region are fixed values here;
# --output_log_filepath and --cloudwatch_region are not forwarded - confirm this is intended.
_data_snapshot_settings = {
    "git_hash": command_parser_arguments.git_hash,
    "git_repo_name": command_parser_arguments.git_repo_name,
    "datetime_string": datetime_string,
    "git_hash_as_namespace": command_parser_arguments.git_hash_as_namespace,
    "git_fixed_namespace_text": "mqtt5_canary",
    "output_log_filepath": "output.txt",
    "output_to_console": command_parser_arguments.output_to_console,
    "cloudwatch_region": "us-east-1",
    "cloudwatch_make_dashboard": False,
    "cloudwatch_teardown_alarms_on_complete": True,
    "cloudwatch_teardown_dashboard_on_complete": True,
    "s3_bucket_name": command_parser_arguments.s3_bucket_name,
    "s3_bucket_upload_on_complete": True,
    "lambda_name": command_parser_arguments.lambda_name,
    "metric_frequency": command_parser_arguments.snapshot_wait_time,
}
data_snapshot = DataSnapshot(**_data_snapshot_settings)

# Make sure nothing failed
_stop_if_setup_failed(data_snapshot.abort_due_to_internal_error)

# Register metrics: both percentage metrics carry an alarm threshold of 70,
# the raw memory value is registered without alarm settings.
for _metric_settings in (
        dict(
            new_metric_name="total_cpu_usage",
            new_metric_function=get_metric_total_cpu_usage,
            new_metric_unit="Percent",
            new_metric_alarm_threshold=70,
            new_metric_reports_to_skip=1,
            new_metric_alarm_severity=5,
            is_percent=True),
        dict(
            new_metric_name="total_memory_usage_value",
            new_metric_function=get_metric_total_memory_usage_value,
            new_metric_unit="Bytes"),
        dict(
            new_metric_name="total_memory_usage_percent",
            new_metric_function=get_metric_total_memory_usage_percent,
            new_metric_unit="Percent",
            new_metric_alarm_threshold=70,
            new_metric_reports_to_skip=0,
            new_metric_alarm_severity=5,
            is_percent=True)):
    data_snapshot.register_metric(**_metric_settings)

# Print diagnosis information
data_snapshot.output_diagnosis_information(command_parser_arguments.dependencies)

# Make the snapshot (metrics) monitor
snapshot_monitor = SnapshotMonitor(
    wrapper_data_snapshot=data_snapshot,
    wrapper_metrics_wait_time=command_parser_arguments.snapshot_wait_time)
_stop_if_setup_failed(snapshot_monitor.had_internal_error)

# Make the application monitor; the data_snapshot is passed along for printing to the log
application_monitor = ApplicationMonitor(
    wrapper_application_path=command_parser_arguments.canary_executable,
    wrapper_application_arguments=command_parser_arguments.canary_arguments,
    wrapper_application_restart_on_finish=False,
    data_snapshot=data_snapshot)
_stop_if_setup_failed(application_monitor.error_has_occurred)

# For tracking if we stopped due to a metric alarm
stopped_due_to_metric_alarm = False

# Seconds between two passes of the execution loop
execution_sleep_time = 30
def execution_loop():
    """Poll both monitors until one of them reports that the run has to stop.

    Every pass calls the snapshot monitor and then the application monitor,
    telling each that ``execution_sleep_time`` seconds have passed. The loop
    ends when the snapshot monitor has cut a ticket (which is additionally
    recorded as an internal error on the snapshot monitor), when the
    application monitor reports an error, or when the snapshot monitor reports
    an internal error. Otherwise it sleeps for ``execution_sleep_time`` seconds
    and polls again.
    """
    while True:
        snapshot_monitor.monitor_loop_function(
            time_passed=execution_sleep_time,
            psutil_process=application_monitor.application_process_psutil)
        application_monitor.monitor_loop_function(time_passed=execution_sleep_time)

        # A cut ticket means a metric went into alarm; flag it as an
        # 'internal error' so the caller goes down the right code path.
        if snapshot_monitor.has_cut_ticket == True:
            snapshot_monitor.had_internal_error = True
            return

        # Either monitor reporting a problem ends the loop
        stop_requested = (application_monitor.error_has_occurred == True
                          or snapshot_monitor.had_internal_error == True)
        if stop_requested:
            return

        time.sleep(execution_sleep_time)
191
192
def _cut_wrapper_ticket(ticket_description, ticket_reason):
    """Cut a severity 4 ticket for this run using the ticket settings from the command line."""
    cut_ticket_using_cloudwatch(
        git_repo_name=command_parser_arguments.git_repo_name,
        git_hash=command_parser_arguments.git_hash,
        git_hash_as_namespace=command_parser_arguments.git_hash_as_namespace,
        git_fixed_namespace_text="mqtt5_canary",
        cloudwatch_region="us-east-1",
        ticket_description=ticket_description,
        ticket_reason=ticket_reason,
        ticket_allow_duplicates=True,
        ticket_category=command_parser_arguments.ticket_category,
        ticket_item=command_parser_arguments.ticket_item,
        ticket_group=command_parser_arguments.ticket_group,
        ticket_type=command_parser_arguments.ticket_type,
        ticket_severity=4)


def application_thread():
    """Run the canary under both monitors, then report the outcome and exit.

    Sends the 'Started' email, starts both monitors, runs the execution loop
    until it ends, stops the monitors, works out why the run stopped (cutting a
    ticket where required), cleans up both monitors, optionally sends the
    finish email and finally exits the process with the application monitor's
    error code. This function does not return.

    In every failure branch the run is marked as failed, and the failure text
    is added to the email, BEFORE the ticket is cut. Ticket cutting can raise,
    and that exception is only logged; a failed run must still be cleaned up
    and reported as a failure in that case.
    """
    start_email_body = "MQTT5 Short Running Canary Wrapper has started for "
    start_email_body += "\"" + command_parser_arguments.git_repo_name + "\" commit \"" + command_parser_arguments.git_hash + "\""
    start_email_body += "\nThe wrapper will run for the length the MQTT5 Canary application is set to run for, which is determined by "
    start_email_body += "the arguments set. The arguments used for this run are listed below:"
    start_email_body += "\n  Arguments: " + command_parser_arguments.canary_arguments
    snapshot_monitor.send_email(email_body=start_email_body, email_subject_text_append="Started")

    # Start the application going
    snapshot_monitor.start_monitoring()
    application_monitor.start_monitoring()
    # Allow the snapshot monitor to cut tickets
    snapshot_monitor.can_cut_ticket = True

    # Start the execution loop
    execution_loop()

    # Make sure everything is stopped
    snapshot_monitor.stop_monitoring()
    application_monitor.stop_monitoring()

    # Track whether this counts as an error (and therefore we should cleanup accordingly) or not
    wrapper_error_occurred = False
    # Finished Email
    send_finished_email = True
    finished_email_body = "MQTT5 Short Running Canary Wrapper has stopped."
    finished_email_body += "\n\n"

    try:
        # Find out why we stopped
        if (snapshot_monitor.had_internal_error == True):
            wrapper_error_occurred = True
            if (snapshot_monitor.has_cut_ticket == True):
                # We do not need to cut a ticket here - it's cut by the snapshot monitor!
                print ("ERROR - Snapshot monitor stopped due to metric in alarm!", flush=True)
                finished_email_body += "Failure due to required metrics being in alarm! A new ticket should have been cut!"
                finished_email_body += "\nMetrics in Alarm: " + str(snapshot_monitor.cloudwatch_current_alarms_triggered)
            else:
                print ("ERROR - Snapshot monitor stopped due to internal error!", flush=True)
                finished_email_body += "Failure due to Snapshot monitor stopping due to an internal error."
                finished_email_body += " Reason given for error: " + snapshot_monitor.internal_error_reason
                _cut_wrapper_ticket(
                    ticket_description="Snapshot monitor stopped due to internal error! Reason info: " + snapshot_monitor.internal_error_reason,
                    ticket_reason="Snapshot monitor stopped due to internal error")

        elif (application_monitor.error_has_occurred == True):
            if (application_monitor.error_due_to_credentials == True):
                print ("INFO - Stopping application due to error caused by credentials")
                print ("Please fix your credentials and then restart this application again", flush=True)
                wrapper_error_occurred = True
                send_finished_email = False
            elif (application_monitor.error_code != 0):
                # A non-zero exit code means something in the canary itself failed
                wrapper_error_occurred = True
                finished_email_body += "Failure due to MQTT5 application exiting with a non-zero exit code! This means something in the Canary application itself failed"
                _cut_wrapper_ticket(
                    ticket_description="The Short Running Canary exited with a non-zero exit code! This likely means something in the canary failed.",
                    ticket_reason="The Short Running Canary exited with a non-zero exit code")
            else:
                print ("INFO - Stopping application. No error has occurred, application has stopped normally", flush=True)
                application_monitor.print_stdout()
                finished_email_body += "Short Running Canary finished successfully and run without errors!"
                wrapper_error_occurred = False
        else:
            print ("ERROR - Short Running Canary stopped due to unknown reason!", flush=True)
            wrapper_error_occurred = True
            finished_email_body += "Failure due to unknown reason! This shouldn't happen and means something has gone wrong!"
            _cut_wrapper_ticket(
                ticket_description="The Short Running Canary stopped for an unknown reason!",
                ticket_reason="The Short Running Canary stopped for unknown reason")
    except Exception as e:
        # Best effort: a failure while cutting the ticket must not stop cleanup and reporting
        print ("ERROR: Could not (possibly) cut ticket due to exception!")
        print ("Exception: " + str(e), flush=True)

    # Clean everything up and stop
    snapshot_monitor.cleanup_monitor(error_occurred=wrapper_error_occurred)
    application_monitor.cleanup_monitor(error_occurred=wrapper_error_occurred)
    print ("Short Running Canary finished!", flush=True)

    # Link to the log for this run; failed runs are linked under the Failed_Logs/ prefix
    finished_email_body += "\n\nYou can find the log file for this run at the following S3 location: "
    finished_email_body += "https://s3.console.aws.amazon.com/s3/object/"
    finished_email_body += command_parser_arguments.s3_bucket_name
    finished_email_body += "?region=" + command_parser_arguments.cloudwatch_region
    finished_email_body += "&prefix=" + command_parser_arguments.git_repo_name + "/" + datetime_string + "/"
    if (wrapper_error_occurred == True):
        finished_email_body += "Failed_Logs/"
    finished_email_body += command_parser_arguments.git_hash + ".log"
    if (command_parser_arguments.codebuild_log_path != ""):
        print ("\n Codebuild log path: " + command_parser_arguments.codebuild_log_path + "\n")

    # Send the finish email
    if (send_finished_email == True):
        if (wrapper_error_occurred == True):
            snapshot_monitor.send_email(email_body=finished_email_body, email_subject_text_append="Had an error")
        else:
            snapshot_monitor.send_email(email_body=finished_email_body, email_subject_text_append="Finished")

    # NOTE(review): the exit status is the application monitor's error code only; a failure
    # detected by the wrapper itself does not change it - confirm this is intended.
    exit (application_monitor.error_code)
327
328
# Start the application! This call sends the start email, runs the canary under both
# monitors and ends the process itself by calling exit() with the application's error code.
application_thread()
331