#!/usr/bin/env python3
# mypy: allow-untyped-defs

# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""Metrics API.

**Overview**:

The metrics API in torchelastic is used to publish telemetry metrics.
It is designed to be used by torchelastic's internal modules to
publish metrics for the end user with the goal of increasing visibility
and helping with debugging. However you may use the same API in your
jobs to publish metrics to the same metrics ``sink``.

A ``metric`` can be thought of as timeseries data
and is uniquely identified by the string-valued tuple
``(metric_group, metric_name)``.

torchelastic makes no assumptions about what a ``metric_group`` is
and what relationship it has with ``metric_name``. It is totally up
to the user to use these two fields to uniquely identify a metric.

.. note:: The metric group ``torchelastic`` is reserved by torchelastic for
          platform level metrics that it produces.
          For instance torchelastic may output the latency (in milliseconds)
          of a re-rendezvous operation from the agent as
          ``(torchelastic, agent.rendezvous.duration.ms)``

A sensible way to use metric groups is to map them to a stage or module
in your job. You may also encode certain high level properties of
the job such as the region or stage (dev vs prod).

**Publish Metrics**:

Using torchelastic's metrics API is similar to using python's logging
framework. You first have to configure a metrics handler before
trying to add metric data.

The example below measures the latency for the ``calculate()`` function.

::

  import time
  import torch.distributed.elastic.metrics as metrics

  # makes all metrics other than the ones from "my_module" go to /dev/null
  metrics.configure(metrics.NullMetricHandler())
  metrics.configure(metrics.ConsoleMetricHandler(), "my_module")

  def my_method():
    start = time.time()
    calculate()
    end = time.time()
    metrics.put_metric("calculate_latency", int(end-start), "my_module")

You may also use the ``torch.distributed.elastic.metrics.prof`` decorator
to conveniently and succinctly profile functions.

::

  # -- in module examples.foobar --

  import torch.distributed.elastic.metrics as metrics

  metrics.configure(metrics.ConsoleMetricHandler(), "foobar")
  metrics.configure(metrics.ConsoleMetricHandler(), "Bar")

  @metrics.prof
  def foo():
    pass

  class Bar():

    @metrics.prof
    def baz():
      pass

``@metrics.prof`` will publish the following metrics
::

  <leaf_module or classname>.success - 1 if the function finished successfully
  <leaf_module or classname>.failure - 1 if the function threw an exception
  <leaf_module or classname>.duration.ms - function duration in milliseconds

**Configuring Metrics Handler**:

`torch.distributed.elastic.metrics.MetricHandler` is responsible for emitting
the added metric values to a particular destination. Metric groups can be
configured with different metric handlers.

By default torchelastic emits all metrics to ``/dev/null``.
By adding the following configuration metrics,
``torchelastic`` and ``my_app`` metric groups will be printed out to
console.

::

  import torch.distributed.elastic.metrics as metrics

  metrics.configure(metrics.ConsoleMetricHandler(), group = "torchelastic")
  metrics.configure(metrics.ConsoleMetricHandler(), group = "my_app")

**Writing a Custom Metric Handler**:

If you want your metrics to be emitted to a custom location, implement
the `torch.distributed.elastic.metrics.MetricHandler` interface
and configure your job to use your custom metric handler.

Below is a toy example that prints the metrics to ``stdout``

::

  import torch.distributed.elastic.metrics as metrics

  class StdoutMetricHandler(metrics.MetricHandler):
     def emit(self, metric_data):
         ts = metric_data.timestamp
         group = metric_data.group_name
         name = metric_data.name
         value = metric_data.value
         print(f"[{ts}][{group}]: {name}={value}")

  metrics.configure(StdoutMetricHandler(), group="my_app")

Now all metrics in the group ``my_app`` will be printed to stdout as:

::

  [1574213883.4182858][my_app]: my_metric=<value>
  [1574213940.5237644][my_app]: my_metric=<value>

"""

from typing import Optional

from .api import (  # noqa: F401
    configure,
    ConsoleMetricHandler,
    get_elapsed_time_ms,
    getStream,
    MetricData,
    MetricHandler,
    MetricsConfig,
    NullMetricHandler,
    prof,
    profile,
    publish_metric,
    put_metric,
)


def initialize_metrics(cfg: Optional[MetricsConfig] = None):
    """Initialize the metrics subsystem.

    Default implementation is a no-op; internal builds may override it via
    the ``static_init`` hook imported below.
    """
    pass


# Optional hook: internal/platform builds can provide a ``static_init``
# module (which may replace ``initialize_metrics`` above); its absence in
# open-source builds is expected and silently ignored.
try:
    from torch.distributed.elastic.metrics.static_init import *  # type: ignore[import] # noqa: F401 F403
except ModuleNotFoundError:
    pass