xref: /aosp_15_r20/external/autotest/client/common_lib/cros/retry.py (revision 9c5db1993ded3edbeafc8092d69fe5de2ee02df7)
1# Lint as: python2, python3
2# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6from __future__ import absolute_import
7from __future__ import division
8from __future__ import print_function
9
10import logging
11import random
12import signal
13import six
14import sys
15import threading
16import time
17
18from autotest_lib.client.common_lib import env
19from autotest_lib.client.common_lib import error
20
21
def install_sigalarm_handler(new_handler):
    """
    Attempt to register a SIGALRM handler and verify that it took effect.

    mod_wsgi protects apache by intercepting attempts to install a sigalarm
    handler, so a handler we believe we installed may never actually be in
    place and the process would receive the raw SIGALRM. The only reliable
    check is to read the handler back after installing it.

    @param new_handler: The handler to install. Must be a callable, or one of
                        signal.SIG_IGN/SIG_DFL (the numbers 1 and 0
                        respectively).
    @return: True if new_handler is the handler now registered for SIGALRM,
             False if it was rejected, or if running under mod_wsgi.
    """
    # Under mod_wsgi installing signal handlers never works; don't even try.
    if env.IN_MOD_WSGI:
        return False

    special_values = (signal.SIG_IGN, signal.SIG_DFL)
    acceptable = new_handler is not None and (
            callable(new_handler) or new_handler in special_values)
    if not acceptable:
        logging.warning('Trying to install an invalid sigalarm handler.')
        return False

    signal.signal(signal.SIGALRM, new_handler)
    # Read the handler back instead of trusting that signal.signal() worked.
    return signal.getsignal(signal.SIGALRM) == new_handler
51
52
def set_sigalarm_timeout(timeout_secs, default_timeout=60):
    """
    Arm the SIGALRM timer.

    The requested timeout is converted with int(). Any resulting value <= 0
    is treated as a probable mistake and replaced by default_timeout, since
    negative timeouts can have 'alarming' effects. 0 is a valid alarm value,
    but it means "cancel the pending alarm"; callers that really want that
    should call signal.alarm(0) directly.

    @param timeout_secs: The new timeout, in seconds.
    @param default_timeout: The timeout to use instead when the requested
                            timeout is <= 0 after integer conversion.
    @return: The old sigalarm timeout, as returned by signal.alarm().
    """
    requested = int(timeout_secs)
    effective = requested if requested > 0 else int(default_timeout)
    return signal.alarm(effective)
71
72
def sigalarm_wrapper(message):
    """
    Raise error.TimeoutException carrying the given message.

    This exists as a function because "raise" is a statement and therefore
    cannot appear in the body of a lambda, which only accepts an expression.

    @param message: the exception message.
    @raises error.TimeoutException: always.
    """
    timeout_error = error.TimeoutException(message)
    raise timeout_error
82
83
def custom_sigalarm_handler(func, timeout_sec):
    """
    Build a sigalarm handler whose exception names the timed-out function.

    The returned handler raises (via sigalarm_wrapper) an exception whose
    message contains the function name and the timeout length, instead of a
    generic one.

    @param func: the function that may time out
    @param timeout_sec: timeout length in seconds
    @return: a callable taking (signum, frame), suitable for signal.signal().
    """
    try:
        func_name = str(func.__name__)
    except Exception as e:
        # Not every callable has a usable __name__; still produce a message.
        func_name = '(unavailable function name: exception: %s)' % e
    message = "sigalarm timeout (%d seconds) in %s" % (timeout_sec, func_name)

    def _on_alarm(signum, frame):
        return sigalarm_wrapper(message)

    return _on_alarm
99
100
def timeout(func, args=(), kwargs=None, timeout_sec=60.0, default_result=None):
    """
    Run func(*args, **kwargs) under a SIGALRM-based timeout.

    If the alarm fires (or func itself raises error.TimeoutException), the
    call is reported as timed out and default_result is returned in place of
    a real result. Any previously pending alarm and its handler are restored
    afterwards when possible.

    If the sigalarm handler cannot be installed (see
    install_sigalarm_handler), func is still called, but no timeout is
    enforced.

    @param func: function to be called.
    @param args: arguments for function to be called.
    @param kwargs: keyword arguments for function to be called. None (the
                   default) means no keyword arguments.
    @param timeout_sec: timeout setting for call to exit, in seconds.
    @param default_result: default return value for the function call.

    @return: a tuple (is_timeout, result). If is_timeout is True the call
             timed out and result is default_result. If it is False the call
             finished on time and result is the value func returned.
    """
    # Use a None sentinel instead of a mutable {} default, which would be a
    # single dict object shared by every call of this function.
    if kwargs is None:
        kwargs = {}

    old_alarm_sec = 0
    old_handler = signal.getsignal(signal.SIGALRM)
    handler = custom_sigalarm_handler(func, timeout_sec)
    installed_handler = install_sigalarm_handler(handler)
    if installed_handler:
        old_alarm_sec = set_sigalarm_timeout(timeout_sec, default_timeout=60)

    # If old_timeout_time = 0 we either didn't install a handler, or sigalrm
    # had a signal.SIG_DFL handler with 0 timeout. In the latter case we still
    # need to restore the handler/timeout.
    old_timeout_time = (time.time() + old_alarm_sec) if old_alarm_sec > 0 else 0

    try:
        default_result = func(*args, **kwargs)
        return False, default_result
    except error.TimeoutException:
        return True, default_result
    finally:
        # If we installed a sigalarm handler, cancel it since our function
        # returned on time. If we can successfully restore the old handler,
        # reset the old timeout, or, if the old timeout's deadline has passed,
        # set the sigalarm to fire in one second. If the old_timeout_time is 0
        # we don't need to set the sigalarm timeout since we have already set it
        # as a byproduct of cancelling the current signal.
        if installed_handler:
            signal.alarm(0)
            # install_sigalarm_handler() rejects a None old_handler; in that
            # case neither the old handler nor the old alarm is restored.
            if install_sigalarm_handler(old_handler) and old_timeout_time:
                set_sigalarm_timeout(int(old_timeout_time - time.time()),
                                     default_timeout=1)
145
146
147
def retry(ExceptionToCheck, timeout_min=1.0, delay_sec=3, raiselist=None,
          exception_to_raise=None, label=None, callback=None, backoff=1):
    """Retry calling the decorated function using a delay with jitter.

    Will raise RPC ValidationError exceptions from the decorated
    function without retrying; a malformed RPC isn't going to
    magically become good. Will raise exceptions in raiselist as well.

    If the retry is done in a child thread, timeout may not be enforced as
    signal only works in main thread. Therefore, the retry inside a child
    thread may run longer than timeout or even hang.

    The returned wrapper keeps the decorated function's metadata (__name__,
    __doc__, ...) whenever that metadata can be copied.

    original from:
      http://www.saltycrane.com/blog/2009/11/trying-out-retry-decorator-python/

    @param ExceptionToCheck: the exception to check.  May be a tuple of
                             exceptions to check.
    @param timeout_min: timeout in minutes until giving up.
    @param delay_sec: pre-jittered base delay between retries in seconds. Actual
                      delays will be first calculated with exponential backoff,
                      then randomized around this new value, ranging up to 50%
                      off this midpoint.
    @param raiselist: a list of exceptions that will be raised without retrying.
    @param exception_to_raise: the exception to raise. Callers can specify the
                               exception they want to raise.
    @param label: a label added to the exception message to help debug.
    @param callback: a function to call before each retry.
    @param backoff: exponent to calculate exponential backoff for the actual
                    delay. Set to 1 to disable exponential backoff.
    @return: a decorator that wraps a function with the retry logic.
    """
    # Imported locally so this block does not rely on the file's import list.
    import functools

    def deco_retry(func):
        """
        Decorator wrapper.

        @param func: the function to be retried and timed-out.
        """
        random.seed()


        def delay(delay_with_backoff_sec):
            """
            'Jitter' the delay with backoff, up to 50% in either direction.
            """
            random_delay = random.uniform(0.5 * delay_with_backoff_sec,
                                          1.5 * delay_with_backoff_sec)
            logging.warning('Retrying in %f seconds...', random_delay)
            time.sleep(random_delay)


        def func_retry(*args, **kwargs):
            """
            Call func repeatedly until it succeeds or the timeout expires.

            The last ExceptionToCheck seen is cached so it can be raised once
            the retries are exhausted.
            """
            exc_info = None
            delayed_enabled = False
            exception_tuple = () if raiselist is None else tuple(raiselist)
            start_time = time.time()
            remaining_time = timeout_min * 60
            delay_with_backoff_sec = delay_sec
            # Signal-based timeouts only work in the main thread.
            is_main_thread = isinstance(threading.current_thread(),
                                        threading._MainThread)
            if label:
                details = 'label="%s"' % label
            elif hasattr(func, '__name__'):
                details = 'function="%s()"' % func.__name__
            else:
                details = 'unknown function'

            exception_message = ('retry exception (%s), timeout = %ds' %
                                 (details, timeout_min * 60))

            while remaining_time > 0:
                # No delay before the first attempt, only between attempts.
                if delayed_enabled:
                    delay(delay_with_backoff_sec)
                    delay_with_backoff_sec *= backoff
                else:
                    delayed_enabled = True
                try:
                    # Clear the cache
                    exc_info = None
                    if is_main_thread:
                        is_timeout, result = timeout(func, args, kwargs,
                                                     remaining_time)
                        if not is_timeout:
                            return result
                    else:
                        return func(*args, **kwargs)
                except exception_tuple:
                    raise
                except error.CrosDynamicSuiteException:
                    raise
                except ExceptionToCheck as e:
                    logging.warning('%s(%s)', e.__class__, e)
                    # Cache the exception to be raised later.
                    exc_info = sys.exc_info()

                remaining_time = int(timeout_min * 60 -
                                     (time.time() - start_time))

                if remaining_time > 0 and callback:
                    callback()
                    # The callback may have taken a while; recompute.
                    remaining_time = int(timeout_min * 60 -
                                         (time.time() - start_time))


            # The call must have timed out or raised ExceptionToCheck.
            if not exc_info:
                if exception_to_raise:
                    raise exception_to_raise(exception_message)
                else:
                    raise error.TimeoutException(exception_message)
            # Raise the cached exception with original backtrace.
            if exception_to_raise:
                raise exception_to_raise('%s: %s' % (exc_info[0], exc_info[1]))
            six.reraise(exc_info[0], exc_info[1], exc_info[2])


        try:
            # Copy __name__/__doc__ etc. so logs, stacked decorators and
            # introspection see the real function rather than 'func_retry'.
            return functools.wraps(func)(func_retry)  # true decorator
        except AttributeError:
            # Python 2's update_wrapper fails for callables that lack
            # __name__/__doc__/__module__; fall back to the bare wrapper.
            return func_retry
    return deco_retry
267