xref: /aosp_15_r20/external/pytorch/torch/distributed/elastic/metrics/__init__.py (revision da0073e96a02ea20f0ac840b70461e3646d07c45)
#!/usr/bin/env python3
2# mypy: allow-untyped-defs
3
4# Copyright (c) Facebook, Inc. and its affiliates.
5# All rights reserved.
6#
7# This source code is licensed under the BSD-style license found in the
8# LICENSE file in the root directory of this source tree.
9
10"""Metrics API.
11
12**Overview**:
13
14The metrics API in torchelastic is used to publish telemetry metrics.
15It is designed to be used by torchelastic's internal modules to
16publish metrics for the end user with the goal of increasing visibility
17and helping with debugging. However you may use the same API in your
18jobs to publish metrics to the same metrics ``sink``.
19
20A ``metric`` can be thought of as timeseries data
21and is uniquely identified by the string-valued tuple
22``(metric_group, metric_name)``.
23
24torchelastic makes no assumptions about what a ``metric_group`` is
25and what relationship it has with ``metric_name``. It is totally up
26to the user to use these two fields to uniquely identify a metric.
27
28.. note:: The metric group ``torchelastic`` is reserved by torchelastic for
29          platform level metrics that it produces.
30          For instance torchelastic may output the latency (in milliseconds)
31          of a re-rendezvous operation from the agent as
32          ``(torchelastic, agent.rendezvous.duration.ms)``
33
34A sensible way to use metric groups is to map them to a stage or module
in your job. You may also encode certain high level properties of
the job such as the region or stage (dev vs prod).
37
38**Publish Metrics**:
39
40Using torchelastic's metrics API is similar to using python's logging
41framework. You first have to configure a metrics handler before
42trying to add metric data.
43
44The example below measures the latency for the ``calculate()`` function.
45
46::
47
48  import time
49  import torch.distributed.elastic.metrics as metrics
50
  # makes all metrics other than the ones from "my_module" go to /dev/null
  metrics.configure(metrics.NullMetricHandler())
  metrics.configure(metrics.ConsoleMetricHandler(), "my_module")
54
55  def my_method():
56    start = time.time()
57    calculate()
58    end = time.time()
59    metrics.put_metric("calculate_latency", int(end-start), "my_module")
60
You may also use the ``torch.distributed.elastic.metrics.prof`` decorator
to conveniently and succinctly profile functions.
63
64::
65
66  # -- in module examples.foobar --
67
68  import torch.distributed.elastic.metrics as metrics
69
  metrics.configure(metrics.ConsoleMetricHandler(), "foobar")
  metrics.configure(metrics.ConsoleMetricHandler(), "Bar")
72
73  @metrics.prof
74  def foo():
75    pass
76
77  class Bar():
78
79    @metrics.prof
80    def baz():
81        pass
82
83``@metrics.prof`` will publish the following metrics
84::
85
86  <leaf_module or classname>.success - 1 if the function finished successfully
87  <leaf_module or classname>.failure - 1 if the function threw an exception
88  <leaf_module or classname>.duration.ms - function duration in milliseconds
89
90**Configuring Metrics Handler**:
91
92`torch.distributed.elastic.metrics.MetricHandler` is responsible for emitting
93the added metric values to a particular destination. Metric groups can be
94configured with different metric handlers.
95
96By default torchelastic emits all metrics to ``/dev/null``.
By adding the following configuration, the
98``torchelastic`` and ``my_app`` metric groups will be printed out to
99console.
100
101::
102
103  import torch.distributed.elastic.metrics as metrics
104
105  metrics.configure(metrics.ConsoleMetricHandler(), group = "torchelastic")
106  metrics.configure(metrics.ConsoleMetricHandler(), group = "my_app")
107
108**Writing a Custom Metric Handler**:
109
110If you want your metrics to be emitted to a custom location, implement
111the `torch.distributed.elastic.metrics.MetricHandler` interface
112and configure your job to use your custom metric handler.
113
114Below is a toy example that prints the metrics to ``stdout``
115
116::
117
118  import torch.distributed.elastic.metrics as metrics
119
120  class StdoutMetricHandler(metrics.MetricHandler):
121     def emit(self, metric_data):
122         ts = metric_data.timestamp
123         group = metric_data.group_name
124         name = metric_data.name
125         value = metric_data.value
126         print(f"[{ts}][{group}]: {name}={value}")
127
128  metrics.configure(StdoutMetricHandler(), group="my_app")
129
130Now all metrics in the group ``my_app`` will be printed to stdout as:
131
132::
133
134  [1574213883.4182858][my_app]: my_metric=<value>
135  [1574213940.5237644][my_app]: my_metric=<value>
136
137"""
138
139from typing import Optional
140
141from .api import (  # noqa: F401
142    configure,
143    ConsoleMetricHandler,
144    get_elapsed_time_ms,
145    getStream,
146    MetricData,
147    MetricHandler,
148    MetricsConfig,
149    NullMetricHandler,
150    prof,
151    profile,
152    publish_metric,
153    put_metric,
154)
155
156
def initialize_metrics(cfg: Optional[MetricsConfig] = None):
    """Initialization hook for the metrics subsystem; a no-op by default.

    Accepts an optional ``MetricsConfig`` and does nothing with it.
    Deployments that need real setup are expected to supply it via the
    optional ``static_init`` module imported below — presumably this
    stub exists so callers can invoke it unconditionally (TODO confirm).
    """
159
160
# Best-effort import of an optional ``static_init`` module. If present, it is
# star-imported purely for its import-time side effects (nothing from it is
# referenced here) — presumably platform-specific metrics setup; TODO confirm.
# Its absence is expected and deliberately ignored; any error other than
# ModuleNotFoundError still propagates.
try:
    from torch.distributed.elastic.metrics.static_init import *  # type: ignore[import] # noqa: F401 F403
except ModuleNotFoundError:
    pass
165