xref: /aosp_15_r20/external/autotest/utils/frozen_chromite/third_party/infra_libs/ts_mon/config.py (revision 9c5db1993ded3edbeafc8092d69fe5de2ee02df7)
1# Copyright 2015 The Chromium Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5import json
6import logging
7import os
8import socket
9import sys
10import re
11
12import requests
13
14from infra_libs.ts_mon.common import interface
15from infra_libs.ts_mon.common import monitors
16from infra_libs.ts_mon.common import standard_metrics
17from infra_libs.ts_mon.common import targets
18
19
def load_machine_config(filename):
  """Read the machine-wide ts_mon configuration from a JSON file.

  Args:
    filename (str): path to the JSON configuration file.

  Returns:
    The decoded JSON document, or an empty dict when the file does not exist.

  Raises:
    Exception: whatever opening or decoding the file raised; the failure is
        logged and then re-raised unchanged.
  """
  if os.path.exists(filename):
    try:
      with open(filename) as config_file:
        parsed = json.load(config_file)
    except Exception:
      # Log which file was at fault, but let the caller see the real error.
      logging.error('Configuration file couldn\'t be read: %s', filename)
      raise
    return parsed

  # A missing file is not an error: the machine simply has no shared config.
  logging.info('Configuration file does not exist, ignoring: %s', filename)
  return {}
31
32
def _default_region(fqdn):
  """Pick a default region name for this machine.

  The GCE metadata server is queried first; when it answers successfully the
  zone name (last slash-separated component of the response) is used.
  Otherwise the second dot-separated label of the FQDN is used.

  Args:
    fqdn (str): fully-qualified domain name of this machine.

  Returns:
    The region string, or '' when the FQDN has fewer than two labels.
  """
  zone_url = 'http://metadata.google.internal/computeMetadata/v1/instance/zone'

  # Check if we're running in a GCE instance.
  try:
    response = requests.get(
        zone_url, headers={'Metadata-Flavor': 'Google'}, timeout=1.0)
  except requests.exceptions.RequestException:
    # No reachable metadata server; fall back to the FQDN below.
    on_gce = False
  else:
    on_gce = response.status_code == requests.codes.ok

  if on_gce:
    # The zone is the last slash-separated component.
    return response.text.split('/')[-1]

  labels = fqdn.split('.')
  if len(labels) < 2:
    return ''
  return labels[1]  # [chrome|golo]
51
52
53def _default_network(host):
54  try:
55    # Regular expression that matches the vast majority of our host names.
56    # Matches everything of the form 'masterN', 'masterNa', and 'foo-xN'.
57    return re.match(r'^([\w-]*?-[acm]|master)(\d+)a?$', host).group(2)  # N
58  except AttributeError:
59    return ''
60
61
def add_argparse_options(parser):
  """Register the monitoring related flags on a process' argument parser.

  Every flag is added to a dedicated 'Timeseries Monitoring Options' argument
  group. The defaults of the hostname, region and network flags are computed
  from this machine's fully-qualified domain name via _default_region() and
  _default_network().

  Args:
    parser (argparse.ArgumentParser): the parser for the main process.
  """
  default_config_file = (
      'C:\\chrome-infra\\ts-mon.json' if sys.platform == 'win32'
      else '/etc/chrome-infra/ts-mon.json')  # pragma: no cover

  group = parser.add_argument_group('Timeseries Monitoring Options')

  # Where to send metrics and how to authenticate.
  group.add_argument(
      '--ts-mon-config-file',
      default=default_config_file,
      help='path to a JSON config file that contains suitable values for '
           '"endpoint" and "credentials" for this machine. This config file is '
           'intended to be shared by all processes on the machine, as the '
           'values depend on the machine\'s position in the network, IP '
           'whitelisting and deployment of credentials. (default: %(default)s)')
  group.add_argument(
      '--ts-mon-endpoint',
      help='url (file:// or https://) to post monitoring metrics to. If set, '
           'overrides the value in --ts-mon-config-file')
  group.add_argument(
      '--ts-mon-credentials',
      help='path to a pkcs8 json credential file. If set, overrides the value '
           'in --ts-mon-config-file')
  group.add_argument(
      '--ts-mon-ca-certs',
      help='path to file containing root CA certificates for SSL server '
           'certificate validation. If not set, a CA cert file bundled with '
           'httplib2 is used.')

  # Flushing behavior.
  group.add_argument(
      '--ts-mon-flush',
      choices=('manual', 'auto'),
      default='auto',
      help='metric push behavior: manual (only send when flush() is called), '
           'or auto (send automatically every --ts-mon-flush-interval-secs '
           'seconds). (default: %(default)s)')
  group.add_argument(
      '--ts-mon-flush-interval-secs',
      type=int,
      default=60,
      help='automatically push metrics on this interval if '
           '--ts-mon-flush=auto.')
  group.add_argument(
      '--ts-mon-autogen-hostname',
      action="store_true",
      help='Indicate that the hostname is autogenerated. '
           'This option must be set on autoscaled GCE VMs, Kubernetes pods, '
           'or any other hosts with dynamically generated names.')

  group.add_argument(
      '--ts-mon-target-type',
      choices=('device', 'task'),
      default='device',
      help='the type of target that is being monitored ("device" or "task").'
           ' (default: %(default)s)')

  # Machine-derived defaults shared by the device and task flags below.
  fqdn = socket.getfqdn().lower()  # foo-[a|m]N.[chrome|golo].chromium.org
  host = fqdn.partition('.')[0]  # foo-[a|m]N
  region = _default_region(fqdn)
  network = _default_network(host)

  # Flags describing a "device" target.
  group.add_argument(
      '--ts-mon-device-hostname',
      default=host,
      help='name of this device, (default: %(default)s)')
  group.add_argument(
      '--ts-mon-device-region',
      default=region,
      help='name of the region this devices lives in. (default: %(default)s)')
  group.add_argument(
      '--ts-mon-device-role',
      default='default',
      help='Role of the device. (default: %(default)s)')
  group.add_argument(
      '--ts-mon-device-network',
      default=network,
      help='name of the network this device is connected to. '
           '(default: %(default)s)')

  # Flags describing a "task" target.
  group.add_argument(
      '--ts-mon-task-service-name',
      help='name of the service being monitored')
  group.add_argument(
      '--ts-mon-task-job-name',
      help='name of this job instance of the task')
  group.add_argument(
      '--ts-mon-task-region',
      default=region,
      help='name of the region in which this task is running '
           '(default: %(default)s)')
  group.add_argument(
      '--ts-mon-task-hostname',
      default=host,
      help='name of the host on which this task is running '
           '(default: %(default)s)')
  group.add_argument(
      '--ts-mon-task-number',
      type=int,
      default=0,
      help='number (e.g. for replication) of this instance of this task '
           '(default: %(default)s)')

  group.add_argument(
      '--ts-mon-metric-name-prefix',
      default='/chrome/infra/',
      help='metric name prefix for all metrics (default: %(default)s)')

  # Still accepted so existing command lines keep parsing.
  group.add_argument(
      '--ts-mon-use-new-proto',
      default=True,
      action='store_true',
      help='deprecated and ignored')
174
175
def process_argparse_options(args):
  """Process command line arguments to initialize the global monitor.

  Also initializes the default target, the metric name prefix and the flush
  mode on interface.state, and finally initializes the standard metrics.

  Starts a background thread to automatically flush monitoring metrics if not
  disabled by command line arguments.

  When the target type is "task" and either --ts-mon-task-service-name or
  --ts-mon-task-job-name is missing, an error message is written to stderr and
  the process exits with status 2 (mirroring ArgumentParser.error).

  Args:
    args (argparse.Namespace): the result of parsing the command line arguments
  """
  # Parse the config file if it exists.
  config = load_machine_config(args.ts_mon_config_file)
  endpoint = config.get('endpoint', '')
  credentials = config.get('credentials', '')
  autogen_hostname = config.get('autogen_hostname', False)

  # Command-line args override the values in the config file.
  if args.ts_mon_endpoint is not None:
    endpoint = args.ts_mon_endpoint
  if args.ts_mon_credentials is not None:
    credentials = args.ts_mon_credentials

  if args.ts_mon_target_type == 'device':
    hostname = args.ts_mon_device_hostname
    if args.ts_mon_autogen_hostname or autogen_hostname:
      hostname = 'autogen:' + hostname
    interface.state.target = targets.DeviceTarget(
        args.ts_mon_device_region,
        args.ts_mon_device_role,
        args.ts_mon_device_network,
        hostname)
  elif args.ts_mon_target_type == 'task':
    # Reimplement ArgumentParser.error, since we don't have access to the
    # parser. sys.stderr.write is used instead of the print statement so the
    # message is emitted correctly on both Python 2 and Python 3.
    if not args.ts_mon_task_service_name:
      sys.stderr.write('Argument --ts-mon-task-service-name must be '
                       'provided when the target type is "task".\n')
      sys.exit(2)
    if not args.ts_mon_task_job_name:
      sys.stderr.write('Argument --ts-mon-task-job-name must be provided '
                       'when the target type is "task".\n')
      sys.exit(2)
    hostname = args.ts_mon_task_hostname
    if args.ts_mon_autogen_hostname or autogen_hostname:
      hostname = 'autogen:' + hostname
    interface.state.target = targets.TaskTarget(
        args.ts_mon_task_service_name,
        args.ts_mon_task_job_name,
        args.ts_mon_task_region,
        hostname,
        args.ts_mon_task_number)

  interface.state.metric_name_prefix = args.ts_mon_metric_name_prefix
  # Start from a no-op monitor; it is only replaced for a supported endpoint.
  interface.state.global_monitor = monitors.NullMonitor()

  if endpoint.startswith('file://'):
    interface.state.global_monitor = monitors.DebugMonitor(
        endpoint[len('file://'):])
  elif endpoint.startswith('https://'):
    interface.state.global_monitor = monitors.HttpsMonitor(
        endpoint, monitors.CredentialFactory.from_string(credentials),
        ca_certs=args.ts_mon_ca_certs)
  elif endpoint.lower() == 'none' or not endpoint:
    logging.info('ts_mon monitoring has been explicitly disabled')
  else:
    logging.error('ts_mon monitoring is disabled because the endpoint provided'
                  ' is invalid or not supported: %s', endpoint)

  interface.state.flush_mode = args.ts_mon_flush

  if args.ts_mon_flush == 'auto':
    interface.state.flush_thread = interface._FlushThread(
        args.ts_mon_flush_interval_secs)
    interface.state.flush_thread.start()

  standard_metrics.init()
252