xref: /aosp_15_r20/external/pytorch/torch/distributed/elastic/agent/server/health_check_server.py (revision da0073e96a02ea20f0ac840b70461e3646d07c45)
1#!/usr/bin/env python3
2
3# Copyright (c) Facebook, Inc. and its affiliates.
4# All rights reserved.
5#
6# This source code is licensed under the BSD-style license found in the
7# LICENSE file in the root directory of this source tree.
8
9from typing import Callable
10
11from torch.distributed.elastic.utils.logging import get_logger
12
13
14log = get_logger(__name__)
15
16__all__ = ["HealthCheckServer", "create_healthcheck_server"]
17
18
class HealthCheckServer:
    """
    Base interface for a health check monitoring server.

    This implementation is a no-op: it only keeps the values it was
    constructed with and logs a message when ``start`` or ``stop`` is
    called. It is meant to be extended by an implementation that starts a
    tcp/http server on the specified port.

    Args:

        alive_callback: Callable[[], int], callback to get the last progress time of the agent

        port: int, port number on which to start the tcp/http server

        timeout: int, timeout in seconds used to decide whether the agent is alive/dead
    """

    _alive_callback: Callable[[], int]
    _port: int
    _timeout: int

    def __init__(
        self, alive_callback: Callable[[], int], port: int, timeout: int
    ) -> None:
        # Only record the configuration; nothing is started at construction.
        self._alive_callback, self._port, self._timeout = (
            alive_callback,
            port,
            timeout,
        )

    def start(self) -> None:
        """
        No-op start: health check serving is unsupported functionality for
        Pytorch, so no server is started and only a warning is logged.
        """
        message = "No health check server started"
        log.warning(message)

    def stop(self) -> None:
        """
        Stop the health check server; this implementation only logs an info message.
        """
        message = "Stopping noop health check server."
        log.info(message)
55
56
def create_healthcheck_server(
    alive_callback: Callable[[], int],
    port: int,
    timeout: int,
) -> HealthCheckServer:
    """
    Factory for the health check server object.

    Builds a ``HealthCheckServer`` from the given ``alive_callback``,
    ``port`` and ``timeout`` (forwarded unchanged, in that order) and
    returns it. The server is not started here.
    """
    server = HealthCheckServer(alive_callback, port, timeout)
    return server
66