#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from typing import Callable

from torch.distributed.elastic.utils.logging import get_logger


log = get_logger(__name__)

__all__ = ["HealthCheckServer", "create_healthcheck_server"]


class HealthCheckServer:
    """
    Interface for a health check monitoring server, which can be extended
    by starting a tcp/http server on the specified port.

    This base implementation only stores its arguments; ``start`` and
    ``stop`` just emit log messages and never open a socket.

    Args:

        alive_callback: Callable[[], int], callback returning the last
            progress time of the agent

        port: int, port number on which to start the tcp/http server

        timeout: int, timeout in seconds used to decide whether the agent
            is alive or dead
    """

    _alive_callback: Callable[[], int]
    _port: int
    _timeout: int

    def __init__(
        self, alive_callback: Callable[[], int], port: int, timeout: int
    ) -> None:
        """
        Store the configuration; no server is created and the callback is
        not invoked here.
        """
        self._alive_callback = alive_callback
        self._port = port
        self._timeout = timeout

    def start(self) -> None:
        """
        Unsupported functionality for PyTorch: does not start any health
        check server and only logs a warning saying so.
        """
        log.warning("No health check server started")

    def stop(self) -> None:
        """
        Stop the health check server. For this no-op implementation there
        is nothing to shut down, so it only logs an informational message.
        """
        log.info("Stopping noop health check server.")


def create_healthcheck_server(
    alive_callback: Callable[[], int],
    port: int,
    timeout: int,
) -> HealthCheckServer:
    """
    Create a health check server object.

    Args:
        alive_callback: callback returning the last progress time of the agent
        port: port number passed through to the server
        timeout: timeout in seconds passed through to the server

    Returns:
        A new ``HealthCheckServer`` built from the given arguments.
    """
    return HealthCheckServer(alive_callback, port, timeout)