xref: /aosp_15_r20/external/autotest/client/common_lib/cros/retry.py (revision 9c5db1993ded3edbeafc8092d69fe5de2ee02df7)
1*9c5db199SXin Li# Lint as: python2, python3
2*9c5db199SXin Li# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
3*9c5db199SXin Li# Use of this source code is governed by a BSD-style license that can be
4*9c5db199SXin Li# found in the LICENSE file.
5*9c5db199SXin Li
6*9c5db199SXin Lifrom __future__ import absolute_import
7*9c5db199SXin Lifrom __future__ import division
8*9c5db199SXin Lifrom __future__ import print_function
9*9c5db199SXin Li
10*9c5db199SXin Liimport logging
11*9c5db199SXin Liimport random
12*9c5db199SXin Liimport signal
13*9c5db199SXin Liimport six
14*9c5db199SXin Liimport sys
15*9c5db199SXin Liimport threading
16*9c5db199SXin Liimport time
17*9c5db199SXin Li
18*9c5db199SXin Lifrom autotest_lib.client.common_lib import env
19*9c5db199SXin Lifrom autotest_lib.client.common_lib import error
20*9c5db199SXin Li
21*9c5db199SXin Li
def install_sigalarm_handler(new_handler):
    """
    Try installing a sigalarm handler.

    In order to protect apache, wsgi intercepts any attempt to install a
    sigalarm handler, so our function will feel the full force of a sigalarm
    even if we try to install a pacifying signal handler. To avoid this we
    need to confirm that the handler we tried to install really was installed.

    Installation is also reported as failed (instead of raising) when the
    interpreter refuses to install the handler, which is what happens when
    this is called from a thread other than the main thread.

    @param new_handler: The new handler to install. This must be a callable
                        object, or signal.SIG_IGN/SIG_DFL which correspond to
                        the numbers 1,0 respectively.
    @return: True if the installation of new_handler succeeded, False otherwise.
    """
    # Installing signal handlers does not and is never expected to work if we're
    # running in a mod_wsgi process.
    if env.IN_MOD_WSGI:
        return False

    if (new_handler is None or
        (not callable(new_handler) and
         new_handler != signal.SIG_IGN and
         new_handler != signal.SIG_DFL)):
        logging.warning('Trying to install an invalid sigalarm handler.')
        return False

    try:
        signal.signal(signal.SIGALRM, new_handler)
    except ValueError as e:
        # signal.signal() raises ValueError when invoked outside the main
        # thread. This function promises a True/False answer, so report the
        # failure to the caller rather than letting the exception escape.
        logging.warning('Unable to install sigalarm handler: %s', e)
        return False

    # Read the handler back: only trust the installation if the interpreter
    # now reports exactly the handler we asked for.
    installed_handler = signal.getsignal(signal.SIGALRM)
    return installed_handler == new_handler
51*9c5db199SXin Li
52*9c5db199SXin Li
def set_sigalarm_timeout(timeout_secs, default_timeout=60):
    """
    Set the sigalarm timeout.

    This method treats any timeout <= 0 as a possible error and falls back to
    using its default timeout, since negative timeouts can have 'alarming'
    effects. Though 0 is a valid timeout, it is often used to cancel signals; in
    order to set a sigalarm of 0 please call signal.alarm directly as there are
    many situations where a 0 timeout is considered invalid.

    signal.alarm only accepts whole seconds, so timeout_secs is truncated to
    an integer. A positive timeout shorter than one second is rounded up to
    one second instead of being truncated to 0 and mistaken for an invalid
    value (which would silently replace it with default_timeout).

    @param timeout_secs: The new timeout, in seconds.
    @param default_timeout: The default timeout to use, if timeout <= 0.
    @return: The old sigalarm timeout
    """
    timeout_sec_n = int(timeout_secs)
    if timeout_sec_n <= 0:
        if float(timeout_secs) > 0:
            # e.g. 0.5: a genuine, positive request that merely truncated
            # to zero. One second is the shortest alarm we can arm.
            timeout_sec_n = 1
        else:
            timeout_sec_n = int(default_timeout)
    return signal.alarm(timeout_sec_n)
71*9c5db199SXin Li
72*9c5db199SXin Li
def sigalarm_wrapper(message):
    """
    Report a timeout by raising error.TimeoutException.

    A lambda body has to be a single expression and "raise" is a statement,
    so lambda-based signal handlers call this helper to do the raising.

    @param message: the text carried by the raised exception.
    """
    timeout_error = error.TimeoutException(message)
    raise timeout_error
82*9c5db199SXin Li
83*9c5db199SXin Li
def custom_sigalarm_handler(func, timeout_sec):
    """
    Build a sigalarm handler whose exception names the function that timed
    out and the timeout that was in force, rather than a generic message.

    @param func: the function that may time out; its __name__ is used in the
                 message when it can be read.
    @param timeout_sec: timeout length in seconds
    @return: a callable taking (signum, frame), as signal handlers do, which
             passes the prepared message to sigalarm_wrapper.
    """
    try:
        func_label = str(func.__name__)
    except Exception as name_error:
        # Not every callable exposes a usable __name__; describe the failure
        # in place of the name so the message can still be built.
        func_label = ('(unavailable function name: exception: %s)'
                      % name_error)

    message = ("sigalarm timeout (%d seconds) in %s" %
               (timeout_sec, func_label))

    def _raise_timeout(signum, frame):
        """Signal-handler entry point; both arguments are ignored."""
        return sigalarm_wrapper(message)

    return _raise_timeout
99*9c5db199SXin Li
100*9c5db199SXin Li
def timeout(func, args=(), kwargs=None, timeout_sec=60.0, default_result=None):
    """
    Run func(*args, **kwargs) under a sigalarm-based timeout.

    A custom SIGALRM handler is installed for the duration of the call. If
    the handler cannot be installed, func is still called but no timeout is
    armed. Whatever handler and pending alarm were in place beforehand are
    restored, where possible, before this function returns.

    @param func: function to be called.
    @param args: arguments for function to be called.
    @param kwargs: keyword arguments for function to be called. None (the
                   default) means no keyword arguments.
    @param timeout_sec: timeout setting for call to exit, in seconds.
    @param default_result: value returned in place of the function's result
                           when the call times out.

    @return: a tuple (is_timeout, result). If is_timeout is False the call
             finished on time and result is what func returned. If
             is_timeout is True an error.TimeoutException was raised during
             the call and result is default_result.
    """
    # Use a None sentinel rather than a shared mutable {} default.
    if kwargs is None:
        kwargs = {}

    old_alarm_sec = 0
    old_handler = signal.getsignal(signal.SIGALRM)
    handler = custom_sigalarm_handler(func, timeout_sec)
    installed_handler = install_sigalarm_handler(handler)
    if installed_handler:
        old_alarm_sec = set_sigalarm_timeout(timeout_sec, default_timeout=60)

    # If old_timeout_time = 0 we either didn't install a handler, or sigalrm
    # had a signal.SIG_DFL handler with 0 timeout. In the latter case we still
    # need to restore the handler/timeout.
    old_timeout_time = (time.time() + old_alarm_sec) if old_alarm_sec > 0 else 0

    try:
        return False, func(*args, **kwargs)
    except error.TimeoutException:
        return True, default_result
    finally:
        # If we installed a sigalarm handler, cancel it since our function
        # returned on time. If we can successfully restore the old handler,
        # reset the old timeout, or, if the old timeout's deadline has passed,
        # set the sigalarm to fire in one second. If the old_timeout_time is 0
        # we don't need to set the sigalarm timeout since we have already set it
        # as a byproduct of cancelling the current signal.
        if installed_handler:
            signal.alarm(0)
            if install_sigalarm_handler(old_handler) and old_timeout_time:
                set_sigalarm_timeout(int(old_timeout_time - time.time()),
                                     default_timeout=1)
145*9c5db199SXin Li
146*9c5db199SXin Li
147*9c5db199SXin Li
def retry(ExceptionToCheck, timeout_min=1.0, delay_sec=3, raiselist=None,
          exception_to_raise=None, label=None, callback=None, backoff=1):
    """Retry calling the decorated function using a delay with jitter.

    Exceptions matching ExceptionToCheck are logged and the call is retried
    until timeout_min has elapsed. Exceptions listed in raiselist, and
    error.CrosDynamicSuiteException, are re-raised immediately without
    retrying. Any other exception propagates untouched.

    In the main thread every attempt runs under timeout(), with the time
    still remaining as its limit. If the retry is done in a child thread,
    timeout may not be enforced as signal only works in main thread.
    Therefore, the retry inside a child thread may run longer than timeout
    or even hang.

    Once the time is used up: if the last attempt raised ExceptionToCheck,
    that exception is re-raised with its original traceback (or wrapped in
    exception_to_raise, if given); otherwise exception_to_raise, or
    error.TimeoutException by default, is raised with a message describing
    the retry. If timeout_min is not positive the function is never called
    and the timeout exception is raised straight away.

    The returned wrapper carries the decorated function's metadata
    (__name__, __doc__, ...) whenever the function provides it.

    original from:
      http://www.saltycrane.com/blog/2009/11/trying-out-retry-decorator-python/

    @param ExceptionToCheck: the exception to check.  May be a tuple of
                             exceptions to check.
    @param timeout_min: timeout in minutes until giving up.
    @param delay_sec: pre-jittered base delay between retries in seconds. Actual
                      delays will be first calculated with exponential backoff,
                      then randomized around this new value, ranging up to 50%
                      off this midpoint.
    @param raiselist: a list of exceptions that will be raised without retrying.
    @param exception_to_raise: the exception to raise. Callers can specify the
                               exception they want to raise.
    @param label: a label added to the exception message to help debug.
    @param callback: a function to call before each retry.
    @param backoff: factor the pre-jitter delay is multiplied by after every
                    retry, so the Nth retry waits about
                    delay_sec * backoff ** (N - 1) seconds. Set to 1 to
                    disable exponential backoff.
    """
    # Imported here so this function does not depend on the module's import
    # block being changed.
    import functools

    def deco_retry(func):
        """
        Decorator wrapper.

        @param func: the function to be retried and timed-out.
        """
        random.seed()

        def delay(delay_with_backoff_sec):
            """
            'Jitter' the delay with backoff, up to 50% in either direction.
            """
            random_delay = random.uniform(0.5 * delay_with_backoff_sec,
                                          1.5 * delay_with_backoff_sec)
            logging.warning('Retrying in %f seconds...', random_delay)
            time.sleep(random_delay)

        def func_retry(*args, **kwargs):
            """
            Used to cache exception to be raised later.
            """
            exc_info = None
            delayed_enabled = False
            exception_tuple = () if raiselist is None else tuple(raiselist)
            start_time = time.time()
            remaining_time = timeout_min * 60
            delay_with_backoff_sec = delay_sec
            is_main_thread = isinstance(threading.current_thread(),
                                        threading._MainThread)
            if label:
                details = 'label="%s"' % label
            elif hasattr(func, '__name__'):
                details = 'function="%s()"' % func.__name__
            else:
                details = 'unknown function'

            exception_message = ('retry exception (%s), timeout = %ds' %
                                 (details, timeout_min * 60))

            while remaining_time > 0:
                # No delay before the very first attempt; every later
                # attempt is preceded by a jittered, backed-off sleep.
                if delayed_enabled:
                    delay(delay_with_backoff_sec)
                    delay_with_backoff_sec *= backoff
                else:
                    delayed_enabled = True
                try:
                    # Clear the cache
                    exc_info = None
                    if is_main_thread:
                        is_timeout, result = timeout(func, args, kwargs,
                                                     remaining_time)
                        if not is_timeout:
                            return result
                    else:
                        return func(*args, **kwargs)
                except exception_tuple:
                    raise
                except error.CrosDynamicSuiteException:
                    raise
                except ExceptionToCheck as e:
                    logging.warning('%s(%s)', e.__class__, e)
                    # Cache the exception to be raised later.
                    exc_info = sys.exc_info()

                remaining_time = int(timeout_min * 60 -
                                     (time.time() - start_time))

                if remaining_time > 0 and callback:
                    callback()
                    # The callback may have taken a while; account for it.
                    remaining_time = int(timeout_min * 60 -
                                         (time.time() - start_time))

            # The call must have timed out or raised ExceptionToCheck.
            if not exc_info:
                if exception_to_raise:
                    raise exception_to_raise(exception_message)
                else:
                    raise error.TimeoutException(exception_message)
            # Raise the cached exception with original backtrace.
            if exception_to_raise:
                raise exception_to_raise('%s: %s' % (exc_info[0], exc_info[1]))
            six.reraise(exc_info[0], exc_info[1], exc_info[2])

        # Make the wrapper look like the function it wraps. Only copy the
        # attributes func really has: callables without a __name__ are
        # accepted above, and Python 2's update_wrapper raises
        # AttributeError on a missing attribute.
        assigned = [attr for attr in functools.WRAPPER_ASSIGNMENTS
                    if hasattr(func, attr)]
        functools.update_wrapper(func_retry, func, assigned=assigned)

        return func_retry  # true decorator
    return deco_retry
267