xref: /aosp_15_r20/external/perfetto/src/base/watchdog_posix.cc (revision 6dbdd20afdafa5e3ca9b8809fa73465d530080dc)
1*6dbdd20aSAndroid Build Coastguard Worker /*
2*6dbdd20aSAndroid Build Coastguard Worker  * Copyright (C) 2018 The Android Open Source Project
3*6dbdd20aSAndroid Build Coastguard Worker  *
4*6dbdd20aSAndroid Build Coastguard Worker  * Licensed under the Apache License, Version 2.0 (the "License");
5*6dbdd20aSAndroid Build Coastguard Worker  * you may not use this file except in compliance with the License.
6*6dbdd20aSAndroid Build Coastguard Worker  * You may obtain a copy of the License at
7*6dbdd20aSAndroid Build Coastguard Worker  *
8*6dbdd20aSAndroid Build Coastguard Worker  *      http://www.apache.org/licenses/LICENSE-2.0
9*6dbdd20aSAndroid Build Coastguard Worker  *
10*6dbdd20aSAndroid Build Coastguard Worker  * Unless required by applicable law or agreed to in writing, software
11*6dbdd20aSAndroid Build Coastguard Worker  * distributed under the License is distributed on an "AS IS" BASIS,
12*6dbdd20aSAndroid Build Coastguard Worker  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*6dbdd20aSAndroid Build Coastguard Worker  * See the License for the specific language governing permissions and
14*6dbdd20aSAndroid Build Coastguard Worker  * limitations under the License.
15*6dbdd20aSAndroid Build Coastguard Worker  */
16*6dbdd20aSAndroid Build Coastguard Worker 
17*6dbdd20aSAndroid Build Coastguard Worker #include "perfetto/ext/base/platform.h"
18*6dbdd20aSAndroid Build Coastguard Worker #include "perfetto/ext/base/watchdog.h"
19*6dbdd20aSAndroid Build Coastguard Worker 
20*6dbdd20aSAndroid Build Coastguard Worker #if PERFETTO_BUILDFLAG(PERFETTO_WATCHDOG)
21*6dbdd20aSAndroid Build Coastguard Worker 
22*6dbdd20aSAndroid Build Coastguard Worker #include <fcntl.h>
23*6dbdd20aSAndroid Build Coastguard Worker #include <poll.h>
24*6dbdd20aSAndroid Build Coastguard Worker #include <signal.h>
25*6dbdd20aSAndroid Build Coastguard Worker #include <stdint.h>
26*6dbdd20aSAndroid Build Coastguard Worker #include <stdlib.h>
27*6dbdd20aSAndroid Build Coastguard Worker #include <sys/syscall.h>
28*6dbdd20aSAndroid Build Coastguard Worker #include <sys/timerfd.h>
29*6dbdd20aSAndroid Build Coastguard Worker #include <unistd.h>
30*6dbdd20aSAndroid Build Coastguard Worker 
31*6dbdd20aSAndroid Build Coastguard Worker #include <algorithm>
32*6dbdd20aSAndroid Build Coastguard Worker #include <cinttypes>
33*6dbdd20aSAndroid Build Coastguard Worker #include <fstream>
34*6dbdd20aSAndroid Build Coastguard Worker #include <thread>
35*6dbdd20aSAndroid Build Coastguard Worker 
36*6dbdd20aSAndroid Build Coastguard Worker #include "perfetto/base/build_config.h"
37*6dbdd20aSAndroid Build Coastguard Worker #include "perfetto/base/logging.h"
38*6dbdd20aSAndroid Build Coastguard Worker #include "perfetto/base/thread_utils.h"
39*6dbdd20aSAndroid Build Coastguard Worker #include "perfetto/base/time.h"
40*6dbdd20aSAndroid Build Coastguard Worker #include "perfetto/ext/base/crash_keys.h"
41*6dbdd20aSAndroid Build Coastguard Worker #include "perfetto/ext/base/file_utils.h"
42*6dbdd20aSAndroid Build Coastguard Worker #include "perfetto/ext/base/scoped_file.h"
43*6dbdd20aSAndroid Build Coastguard Worker #include "perfetto/ext/base/utils.h"
44*6dbdd20aSAndroid Build Coastguard Worker 
45*6dbdd20aSAndroid Build Coastguard Worker namespace perfetto {
46*6dbdd20aSAndroid Build Coastguard Worker namespace base {
47*6dbdd20aSAndroid Build Coastguard Worker 
48*6dbdd20aSAndroid Build Coastguard Worker namespace {
49*6dbdd20aSAndroid Build Coastguard Worker 
// Polling period, in milliseconds, used by the singleton returned from
// Watchdog::GetInstance() (30 seconds). It is passed to the Watchdog
// constructor as |polling_interval_ms| and ends up being the poll() timeout
// that paces the CPU/memory checks in Watchdog::ThreadMain().
constexpr uint32_t kDefaultPollingInterval = 30 * 1000;

// Crash key named "wdog_reason". It is registered when the watchdog thread
// starts and is set to the integer value of the WatchdogCrashReason right
// before the watchdog kills the process.
base::CrashKey g_crash_key_reason("wdog_reason");
53*6dbdd20aSAndroid Build Coastguard Worker 
// Returns true if |number| is a non-zero integral multiple of |divisor|
// (i.e. number == k * divisor for some k >= 1).
//
// A zero |divisor| has no multiples and yields false. It must be rejected
// explicitly: otherwise `number >= 0` always holds and the modulo below would
// divide by zero, which is undefined behaviour.
bool IsMultipleOf(uint32_t number, uint32_t divisor) {
  if (divisor == 0)
    return false;
  return number >= divisor && number % divisor == 0;
}
57*6dbdd20aSAndroid Build Coastguard Worker 
// Returns the arithmetic mean of the first |size| elements of |array|.
//
// The division is carried out in floating point so the fractional part of the
// mean is preserved (an integer division followed by a cast would silently
// truncate it). An empty range has a mean of 0 and never touches |array|, so
// passing a null pointer together with size == 0 is fine.
double MeanForArray(const uint64_t array[], size_t size) {
  if (size == 0)
    return 0.0;

  uint64_t total = 0;
  for (size_t i = 0; i < size; i++) {
    total += array[i];
  }
  return static_cast<double>(total) / static_cast<double>(size);
}
65*6dbdd20aSAndroid Build Coastguard Worker 
66*6dbdd20aSAndroid Build Coastguard Worker }  //  namespace
67*6dbdd20aSAndroid Build Coastguard Worker 
ReadProcStat(int fd,ProcStat * out)68*6dbdd20aSAndroid Build Coastguard Worker bool ReadProcStat(int fd, ProcStat* out) {
69*6dbdd20aSAndroid Build Coastguard Worker   char c[512];
70*6dbdd20aSAndroid Build Coastguard Worker   size_t c_pos = 0;
71*6dbdd20aSAndroid Build Coastguard Worker   while (c_pos < sizeof(c) - 1) {
72*6dbdd20aSAndroid Build Coastguard Worker     ssize_t rd = PERFETTO_EINTR(read(fd, c + c_pos, sizeof(c) - c_pos));
73*6dbdd20aSAndroid Build Coastguard Worker     if (rd < 0) {
74*6dbdd20aSAndroid Build Coastguard Worker       PERFETTO_ELOG("Failed to read stat file to enforce resource limits.");
75*6dbdd20aSAndroid Build Coastguard Worker       return false;
76*6dbdd20aSAndroid Build Coastguard Worker     }
77*6dbdd20aSAndroid Build Coastguard Worker     if (rd == 0)
78*6dbdd20aSAndroid Build Coastguard Worker       break;
79*6dbdd20aSAndroid Build Coastguard Worker     c_pos += static_cast<size_t>(rd);
80*6dbdd20aSAndroid Build Coastguard Worker   }
81*6dbdd20aSAndroid Build Coastguard Worker   PERFETTO_CHECK(c_pos < sizeof(c));
82*6dbdd20aSAndroid Build Coastguard Worker   c[c_pos] = '\0';
83*6dbdd20aSAndroid Build Coastguard Worker 
84*6dbdd20aSAndroid Build Coastguard Worker   if (sscanf(c,
85*6dbdd20aSAndroid Build Coastguard Worker              "%*d %*s %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u %lu "
86*6dbdd20aSAndroid Build Coastguard Worker              "%lu %*d %*d %*d %*d %*d %*d %*u %*u %ld",
87*6dbdd20aSAndroid Build Coastguard Worker              &out->utime, &out->stime, &out->rss_pages) != 3) {
88*6dbdd20aSAndroid Build Coastguard Worker     PERFETTO_ELOG("Invalid stat format: %s", c);
89*6dbdd20aSAndroid Build Coastguard Worker     return false;
90*6dbdd20aSAndroid Build Coastguard Worker   }
91*6dbdd20aSAndroid Build Coastguard Worker   return true;
92*6dbdd20aSAndroid Build Coastguard Worker }
93*6dbdd20aSAndroid Build Coastguard Worker 
// Constructs a watchdog that is not yet running: this only records
// |polling_interval_ms|, which is later used as the poll() timeout of the
// watchdog thread. No thread or timerfd is created until Start() is called.
Watchdog::Watchdog(uint32_t polling_interval_ms)
    : polling_interval_ms_(polling_interval_ms) {}
96*6dbdd20aSAndroid Build Coastguard Worker 
~Watchdog()97*6dbdd20aSAndroid Build Coastguard Worker Watchdog::~Watchdog() {
98*6dbdd20aSAndroid Build Coastguard Worker   if (!thread_.joinable()) {
99*6dbdd20aSAndroid Build Coastguard Worker     PERFETTO_DCHECK(!enabled_);
100*6dbdd20aSAndroid Build Coastguard Worker     return;
101*6dbdd20aSAndroid Build Coastguard Worker   }
102*6dbdd20aSAndroid Build Coastguard Worker   PERFETTO_DCHECK(enabled_);
103*6dbdd20aSAndroid Build Coastguard Worker   enabled_ = false;
104*6dbdd20aSAndroid Build Coastguard Worker 
105*6dbdd20aSAndroid Build Coastguard Worker   // Rearm the timer to 1ns from now. This will cause the watchdog thread to
106*6dbdd20aSAndroid Build Coastguard Worker   // wakeup from the poll() and see |enabled_| == false.
107*6dbdd20aSAndroid Build Coastguard Worker   // This code path is used only in tests. In production code the watchdog is
108*6dbdd20aSAndroid Build Coastguard Worker   // a singleton and is never destroyed.
109*6dbdd20aSAndroid Build Coastguard Worker   struct itimerspec ts {};
110*6dbdd20aSAndroid Build Coastguard Worker   ts.it_value.tv_sec = 0;
111*6dbdd20aSAndroid Build Coastguard Worker   ts.it_value.tv_nsec = 1;
112*6dbdd20aSAndroid Build Coastguard Worker   timerfd_settime(*timer_fd_, /*flags=*/0, &ts, nullptr);
113*6dbdd20aSAndroid Build Coastguard Worker 
114*6dbdd20aSAndroid Build Coastguard Worker   thread_.join();
115*6dbdd20aSAndroid Build Coastguard Worker }
116*6dbdd20aSAndroid Build Coastguard Worker 
GetInstance()117*6dbdd20aSAndroid Build Coastguard Worker Watchdog* Watchdog::GetInstance() {
118*6dbdd20aSAndroid Build Coastguard Worker   static Watchdog* watchdog = new Watchdog(kDefaultPollingInterval);
119*6dbdd20aSAndroid Build Coastguard Worker   return watchdog;
120*6dbdd20aSAndroid Build Coastguard Worker }
121*6dbdd20aSAndroid Build Coastguard Worker 
122*6dbdd20aSAndroid Build Coastguard Worker // Can be called from any thread.
CreateFatalTimer(uint32_t ms,WatchdogCrashReason crash_reason)123*6dbdd20aSAndroid Build Coastguard Worker Watchdog::Timer Watchdog::CreateFatalTimer(uint32_t ms,
124*6dbdd20aSAndroid Build Coastguard Worker                                            WatchdogCrashReason crash_reason) {
125*6dbdd20aSAndroid Build Coastguard Worker   if (!enabled_.load(std::memory_order_relaxed))
126*6dbdd20aSAndroid Build Coastguard Worker     return Watchdog::Timer(this, 0, crash_reason);
127*6dbdd20aSAndroid Build Coastguard Worker 
128*6dbdd20aSAndroid Build Coastguard Worker   return Watchdog::Timer(this, ms, crash_reason);
129*6dbdd20aSAndroid Build Coastguard Worker }
130*6dbdd20aSAndroid Build Coastguard Worker 
131*6dbdd20aSAndroid Build Coastguard Worker // Can be called from any thread.
AddFatalTimer(TimerData timer)132*6dbdd20aSAndroid Build Coastguard Worker void Watchdog::AddFatalTimer(TimerData timer) {
133*6dbdd20aSAndroid Build Coastguard Worker   std::lock_guard<std::mutex> guard(mutex_);
134*6dbdd20aSAndroid Build Coastguard Worker   timers_.emplace_back(std::move(timer));
135*6dbdd20aSAndroid Build Coastguard Worker   RearmTimerFd_Locked();
136*6dbdd20aSAndroid Build Coastguard Worker }
137*6dbdd20aSAndroid Build Coastguard Worker 
138*6dbdd20aSAndroid Build Coastguard Worker // Can be called from any thread.
RemoveFatalTimer(TimerData timer)139*6dbdd20aSAndroid Build Coastguard Worker void Watchdog::RemoveFatalTimer(TimerData timer) {
140*6dbdd20aSAndroid Build Coastguard Worker   std::lock_guard<std::mutex> guard(mutex_);
141*6dbdd20aSAndroid Build Coastguard Worker   for (auto it = timers_.begin(); it != timers_.end(); it++) {
142*6dbdd20aSAndroid Build Coastguard Worker     if (*it == timer) {
143*6dbdd20aSAndroid Build Coastguard Worker       timers_.erase(it);
144*6dbdd20aSAndroid Build Coastguard Worker       break;  // Remove only one. Doesn't matter which one.
145*6dbdd20aSAndroid Build Coastguard Worker     }
146*6dbdd20aSAndroid Build Coastguard Worker   }
147*6dbdd20aSAndroid Build Coastguard Worker   RearmTimerFd_Locked();
148*6dbdd20aSAndroid Build Coastguard Worker }
149*6dbdd20aSAndroid Build Coastguard Worker 
RearmTimerFd_Locked()150*6dbdd20aSAndroid Build Coastguard Worker void Watchdog::RearmTimerFd_Locked() {
151*6dbdd20aSAndroid Build Coastguard Worker   if (!enabled_)
152*6dbdd20aSAndroid Build Coastguard Worker     return;
153*6dbdd20aSAndroid Build Coastguard Worker   auto it = std::min_element(timers_.begin(), timers_.end());
154*6dbdd20aSAndroid Build Coastguard Worker 
155*6dbdd20aSAndroid Build Coastguard Worker   // We use one timerfd to handle all the oustanding |timers_|. Keep it armed
156*6dbdd20aSAndroid Build Coastguard Worker   // to the task expiring soonest.
157*6dbdd20aSAndroid Build Coastguard Worker   struct itimerspec ts {};
158*6dbdd20aSAndroid Build Coastguard Worker   if (it != timers_.end()) {
159*6dbdd20aSAndroid Build Coastguard Worker     ts.it_value = ToPosixTimespec(it->deadline);
160*6dbdd20aSAndroid Build Coastguard Worker   }
161*6dbdd20aSAndroid Build Coastguard Worker   // If |timers_| is empty (it == end()) |ts.it_value| will remain
162*6dbdd20aSAndroid Build Coastguard Worker   // zero-initialized and that will disarm the timer in the call below.
163*6dbdd20aSAndroid Build Coastguard Worker   int res = timerfd_settime(*timer_fd_, TFD_TIMER_ABSTIME, &ts, nullptr);
164*6dbdd20aSAndroid Build Coastguard Worker   PERFETTO_DCHECK(res == 0);
165*6dbdd20aSAndroid Build Coastguard Worker }
166*6dbdd20aSAndroid Build Coastguard Worker 
Start()167*6dbdd20aSAndroid Build Coastguard Worker void Watchdog::Start() {
168*6dbdd20aSAndroid Build Coastguard Worker   std::lock_guard<std::mutex> guard(mutex_);
169*6dbdd20aSAndroid Build Coastguard Worker   if (thread_.joinable()) {
170*6dbdd20aSAndroid Build Coastguard Worker     PERFETTO_DCHECK(enabled_);
171*6dbdd20aSAndroid Build Coastguard Worker   } else {
172*6dbdd20aSAndroid Build Coastguard Worker     PERFETTO_DCHECK(!enabled_);
173*6dbdd20aSAndroid Build Coastguard Worker 
174*6dbdd20aSAndroid Build Coastguard Worker #if PERFETTO_BUILDFLAG(PERFETTO_OS_LINUX) || \
175*6dbdd20aSAndroid Build Coastguard Worker     PERFETTO_BUILDFLAG(PERFETTO_OS_ANDROID)
176*6dbdd20aSAndroid Build Coastguard Worker     // Kick the thread to start running but only on Android or Linux.
177*6dbdd20aSAndroid Build Coastguard Worker     timer_fd_.reset(
178*6dbdd20aSAndroid Build Coastguard Worker         timerfd_create(CLOCK_MONOTONIC, TFD_CLOEXEC | TFD_NONBLOCK));
179*6dbdd20aSAndroid Build Coastguard Worker     if (!timer_fd_) {
180*6dbdd20aSAndroid Build Coastguard Worker       PERFETTO_PLOG(
181*6dbdd20aSAndroid Build Coastguard Worker           "timerfd_create failed, the Perfetto watchdog is not available");
182*6dbdd20aSAndroid Build Coastguard Worker       return;
183*6dbdd20aSAndroid Build Coastguard Worker     }
184*6dbdd20aSAndroid Build Coastguard Worker     enabled_ = true;
185*6dbdd20aSAndroid Build Coastguard Worker     RearmTimerFd_Locked();  // Deal with timers created before Start().
186*6dbdd20aSAndroid Build Coastguard Worker     thread_ = std::thread(&Watchdog::ThreadMain, this);
187*6dbdd20aSAndroid Build Coastguard Worker #endif
188*6dbdd20aSAndroid Build Coastguard Worker   }
189*6dbdd20aSAndroid Build Coastguard Worker }
190*6dbdd20aSAndroid Build Coastguard Worker 
SetMemoryLimit(uint64_t bytes,uint32_t window_ms)191*6dbdd20aSAndroid Build Coastguard Worker void Watchdog::SetMemoryLimit(uint64_t bytes, uint32_t window_ms) {
192*6dbdd20aSAndroid Build Coastguard Worker   // Update the fields under the lock.
193*6dbdd20aSAndroid Build Coastguard Worker   std::lock_guard<std::mutex> guard(mutex_);
194*6dbdd20aSAndroid Build Coastguard Worker 
195*6dbdd20aSAndroid Build Coastguard Worker   PERFETTO_CHECK(IsMultipleOf(window_ms, polling_interval_ms_) || bytes == 0);
196*6dbdd20aSAndroid Build Coastguard Worker 
197*6dbdd20aSAndroid Build Coastguard Worker   size_t size = bytes == 0 ? 0 : window_ms / polling_interval_ms_ + 1;
198*6dbdd20aSAndroid Build Coastguard Worker   memory_window_bytes_.Reset(size);
199*6dbdd20aSAndroid Build Coastguard Worker   memory_limit_bytes_ = bytes;
200*6dbdd20aSAndroid Build Coastguard Worker }
201*6dbdd20aSAndroid Build Coastguard Worker 
SetCpuLimit(uint32_t percentage,uint32_t window_ms)202*6dbdd20aSAndroid Build Coastguard Worker void Watchdog::SetCpuLimit(uint32_t percentage, uint32_t window_ms) {
203*6dbdd20aSAndroid Build Coastguard Worker   std::lock_guard<std::mutex> guard(mutex_);
204*6dbdd20aSAndroid Build Coastguard Worker 
205*6dbdd20aSAndroid Build Coastguard Worker   PERFETTO_CHECK(percentage <= 100);
206*6dbdd20aSAndroid Build Coastguard Worker   PERFETTO_CHECK(IsMultipleOf(window_ms, polling_interval_ms_) ||
207*6dbdd20aSAndroid Build Coastguard Worker                  percentage == 0);
208*6dbdd20aSAndroid Build Coastguard Worker 
209*6dbdd20aSAndroid Build Coastguard Worker   size_t size = percentage == 0 ? 0 : window_ms / polling_interval_ms_ + 1;
210*6dbdd20aSAndroid Build Coastguard Worker   cpu_window_time_ticks_.Reset(size);
211*6dbdd20aSAndroid Build Coastguard Worker   cpu_limit_percentage_ = percentage;
212*6dbdd20aSAndroid Build Coastguard Worker }
213*6dbdd20aSAndroid Build Coastguard Worker 
// Body of the watchdog thread spawned by Start().
//
// Each iteration blocks in poll() on the timerfd for at most
// |polling_interval_ms_|, then:
//  1. exits if |enabled_| has been cleared (done by the destructor);
//  2. looks for a fatal timer whose deadline has passed and, if there is one,
//     kills the thread that armed it;
//  3. samples /proc/self/stat and enforces the memory/CPU guardrails.
//
// NOTE(review): if /proc/self/stat cannot be opened this function returns
// without clearing |enabled_|, so timers keep being armed on the timerfd but
// nothing is left to act on them. Confirm whether that is intended.
void Watchdog::ThreadMain() {
  // Register crash keys explicitly to avoid running out of slots at crash time.
  g_crash_key_reason.Register();

  // Kept open for the lifetime of the thread and rewound before every read.
  base::ScopedFile stat_fd(base::OpenFile("/proc/self/stat", O_RDONLY));
  if (!stat_fd) {
    PERFETTO_ELOG("Failed to open stat file to enforce resource limits.");
    return;
  }

  PERFETTO_DCHECK(timer_fd_);

  constexpr uint8_t kFdCount = 1;
  struct pollfd fds[kFdCount]{};
  fds[0].fd = *timer_fd_;
  fds[0].events = POLLIN;

  for (;;) {
    // We use the poll() timeout to drive the periodic ticks for the cpu/memory
    // checks. The only other case when the poll() unblocks is when we crash
    // (or have to quit via enabled_ == false, but that happens only in tests).
    platform::BeforeMaybeBlockingSyscall();
    auto ret = poll(fds, kFdCount, static_cast<int>(polling_interval_ms_));
    platform::AfterMaybeBlockingSyscall();
    // Checked before looking at |ret|: once disabled the thread exits no
    // matter how poll() returned.
    if (!enabled_)
      return;
    if (ret < 0) {
      if (errno == ENOMEM || errno == EINTR) {
        // Should happen extremely rarely.
        // Back off briefly and retry rather than treating it as fatal.
        std::this_thread::sleep_for(std::chrono::milliseconds(100));
        continue;
      }
      PERFETTO_FATAL("watchdog poll() failed");
    }

    // If we get here either:
    // 1. poll() timed out, in which case we should process cpu/mem guardrails.
    // 2. A timer expired, in which case we shall crash.

    // Drain the expiration counter of the timerfd. The fd is created with
    // TFD_NONBLOCK in Start(), so when nothing expired the read fails with
    // EAGAIN instead of blocking.
    uint64_t expired = 0;  // Must be exactly 8 bytes.
    auto res = PERFETTO_EINTR(read(*timer_fd_, &expired, sizeof(expired)));
    PERFETTO_DCHECK((res < 0 && (errno == EAGAIN)) ||
                    (res == sizeof(expired) && expired > 0));
    const auto now = GetWallTimeMs();

    // Check if any of the timers expired.
    // Only the first expired entry found in |timers_| is acted upon.
    int tid_to_kill = 0;
    WatchdogCrashReason crash_reason{};
    {
      std::lock_guard<std::mutex> guard(mutex_);
      for (const auto& timer : timers_) {
        if (now >= timer.deadline) {
          tid_to_kill = timer.thread_id;
          crash_reason = timer.crash_reason;
          break;
        }
      }
    }

    // Done outside of the lock. This call returns only when the kill failsafe
    // has been disabled for testing; otherwise it ends in abort().
    if (tid_to_kill)
      SerializeLogsAndKillThread(tid_to_kill, crash_reason);

    // Check CPU and memory guardrails (if enabled).
    // Rewind first: the same fd is re-read on every iteration.
    lseek(stat_fd.get(), 0, SEEK_SET);
    ProcStat stat;
    if (!ReadProcStat(stat_fd.get(), &stat))
      continue;  // Skip this tick; no sample is pushed into the windows.
    uint64_t cpu_time = stat.utime + stat.stime;
    uint64_t rss_bytes =
        static_cast<uint64_t>(stat.rss_pages) * base::GetSysPageSize();

    bool threshold_exceeded = false;
    {
      std::lock_guard<std::mutex> guard(mutex_);
      // The memory sample is always recorded, but an exceeded memory limit is
      // ignored while IsSyncMemoryTaggingEnabled() returns true; in that case
      // the CPU check below still runs. When the memory limit does trigger,
      // the CPU sample for this tick is not recorded.
      if (CheckMemory_Locked(rss_bytes) && !IsSyncMemoryTaggingEnabled()) {
        threshold_exceeded = true;
        crash_reason = WatchdogCrashReason::kMemGuardrail;
      } else if (CheckCpu_Locked(cpu_time)) {
        threshold_exceeded = true;
        crash_reason = WatchdogCrashReason::kCpuGuardrail;
      }
    }

    // Guardrail violations are not tied to a specific thread: the process id
    // is passed as the thread id to signal.
    if (threshold_exceeded)
      SerializeLogsAndKillThread(getpid(), crash_reason);
  }
}
301*6dbdd20aSAndroid Build Coastguard Worker 
SerializeLogsAndKillThread(int tid,WatchdogCrashReason crash_reason)302*6dbdd20aSAndroid Build Coastguard Worker void Watchdog::SerializeLogsAndKillThread(int tid,
303*6dbdd20aSAndroid Build Coastguard Worker                                           WatchdogCrashReason crash_reason) {
304*6dbdd20aSAndroid Build Coastguard Worker   g_crash_key_reason.Set(static_cast<int>(crash_reason));
305*6dbdd20aSAndroid Build Coastguard Worker 
306*6dbdd20aSAndroid Build Coastguard Worker   // We are about to die. Serialize the logs into the crash buffer so the
307*6dbdd20aSAndroid Build Coastguard Worker   // debuggerd crash handler picks them up and attaches to the bugreport.
308*6dbdd20aSAndroid Build Coastguard Worker   // In the case of a PERFETTO_CHECK/PERFETTO_FATAL this is done in logging.h.
309*6dbdd20aSAndroid Build Coastguard Worker   // But in the watchdog case, we don't hit that codepath and must do ourselves.
310*6dbdd20aSAndroid Build Coastguard Worker   MaybeSerializeLastLogsForCrashReporting();
311*6dbdd20aSAndroid Build Coastguard Worker 
312*6dbdd20aSAndroid Build Coastguard Worker   // Send a SIGABRT to the thread that armed the timer. This is to see the
313*6dbdd20aSAndroid Build Coastguard Worker   // callstack of the thread that is stuck in a long task rather than the
314*6dbdd20aSAndroid Build Coastguard Worker   // watchdog thread.
315*6dbdd20aSAndroid Build Coastguard Worker   if (syscall(__NR_tgkill, getpid(), tid, SIGABRT) < 0) {
316*6dbdd20aSAndroid Build Coastguard Worker     // At this point the process must die. If for any reason the tgkill doesn't
317*6dbdd20aSAndroid Build Coastguard Worker     // work (e.g. the thread has disappeared), force a crash from here.
318*6dbdd20aSAndroid Build Coastguard Worker     abort();
319*6dbdd20aSAndroid Build Coastguard Worker   }
320*6dbdd20aSAndroid Build Coastguard Worker 
321*6dbdd20aSAndroid Build Coastguard Worker   if (disable_kill_failsafe_for_testing_)
322*6dbdd20aSAndroid Build Coastguard Worker     return;
323*6dbdd20aSAndroid Build Coastguard Worker 
324*6dbdd20aSAndroid Build Coastguard Worker   // The tgkill() above will take some milliseconds to cause a crash, as it
325*6dbdd20aSAndroid Build Coastguard Worker   // involves the kernel to queue the SIGABRT on the target thread (often the
326*6dbdd20aSAndroid Build Coastguard Worker   // main thread, which is != watchdog thread) and do a scheduling round.
327*6dbdd20aSAndroid Build Coastguard Worker   // If something goes wrong though (the target thread has signals masked or
328*6dbdd20aSAndroid Build Coastguard Worker   // is stuck in an uninterruptible+wakekill syscall) force quit from this
329*6dbdd20aSAndroid Build Coastguard Worker   // thread.
330*6dbdd20aSAndroid Build Coastguard Worker   std::this_thread::sleep_for(std::chrono::seconds(10));
331*6dbdd20aSAndroid Build Coastguard Worker   abort();
332*6dbdd20aSAndroid Build Coastguard Worker }
333*6dbdd20aSAndroid Build Coastguard Worker 
CheckMemory_Locked(uint64_t rss_bytes)334*6dbdd20aSAndroid Build Coastguard Worker bool Watchdog::CheckMemory_Locked(uint64_t rss_bytes) {
335*6dbdd20aSAndroid Build Coastguard Worker   if (memory_limit_bytes_ == 0)
336*6dbdd20aSAndroid Build Coastguard Worker     return false;
337*6dbdd20aSAndroid Build Coastguard Worker 
338*6dbdd20aSAndroid Build Coastguard Worker   // Add the current stat value to the ring buffer and check that the mean
339*6dbdd20aSAndroid Build Coastguard Worker   // remains under our threshold.
340*6dbdd20aSAndroid Build Coastguard Worker   if (memory_window_bytes_.Push(rss_bytes)) {
341*6dbdd20aSAndroid Build Coastguard Worker     if (memory_window_bytes_.Mean() >
342*6dbdd20aSAndroid Build Coastguard Worker         static_cast<double>(memory_limit_bytes_)) {
343*6dbdd20aSAndroid Build Coastguard Worker       PERFETTO_ELOG(
344*6dbdd20aSAndroid Build Coastguard Worker           "Memory watchdog trigger. Memory window of %f bytes is above the "
345*6dbdd20aSAndroid Build Coastguard Worker           "%" PRIu64 " bytes limit.",
346*6dbdd20aSAndroid Build Coastguard Worker           memory_window_bytes_.Mean(), memory_limit_bytes_);
347*6dbdd20aSAndroid Build Coastguard Worker       return true;
348*6dbdd20aSAndroid Build Coastguard Worker     }
349*6dbdd20aSAndroid Build Coastguard Worker   }
350*6dbdd20aSAndroid Build Coastguard Worker   return false;
351*6dbdd20aSAndroid Build Coastguard Worker }
352*6dbdd20aSAndroid Build Coastguard Worker 
CheckCpu_Locked(uint64_t cpu_time)353*6dbdd20aSAndroid Build Coastguard Worker bool Watchdog::CheckCpu_Locked(uint64_t cpu_time) {
354*6dbdd20aSAndroid Build Coastguard Worker   if (cpu_limit_percentage_ == 0)
355*6dbdd20aSAndroid Build Coastguard Worker     return false;
356*6dbdd20aSAndroid Build Coastguard Worker 
357*6dbdd20aSAndroid Build Coastguard Worker   // Add the cpu time to the ring buffer.
358*6dbdd20aSAndroid Build Coastguard Worker   if (cpu_window_time_ticks_.Push(cpu_time)) {
359*6dbdd20aSAndroid Build Coastguard Worker     // Compute the percentage over the whole window and check that it remains
360*6dbdd20aSAndroid Build Coastguard Worker     // under the threshold.
361*6dbdd20aSAndroid Build Coastguard Worker     uint64_t difference_ticks = cpu_window_time_ticks_.NewestWhenFull() -
362*6dbdd20aSAndroid Build Coastguard Worker                                 cpu_window_time_ticks_.OldestWhenFull();
363*6dbdd20aSAndroid Build Coastguard Worker     double window_interval_ticks =
364*6dbdd20aSAndroid Build Coastguard Worker         (static_cast<double>(WindowTimeForRingBuffer(cpu_window_time_ticks_)) /
365*6dbdd20aSAndroid Build Coastguard Worker          1000.0) *
366*6dbdd20aSAndroid Build Coastguard Worker         static_cast<double>(sysconf(_SC_CLK_TCK));
367*6dbdd20aSAndroid Build Coastguard Worker     double percentage = static_cast<double>(difference_ticks) /
368*6dbdd20aSAndroid Build Coastguard Worker                         static_cast<double>(window_interval_ticks) * 100;
369*6dbdd20aSAndroid Build Coastguard Worker     if (percentage > cpu_limit_percentage_) {
370*6dbdd20aSAndroid Build Coastguard Worker       PERFETTO_ELOG("CPU watchdog trigger. %f%% CPU use is above the %" PRIu32
371*6dbdd20aSAndroid Build Coastguard Worker                     "%% CPU limit.",
372*6dbdd20aSAndroid Build Coastguard Worker                     percentage, cpu_limit_percentage_);
373*6dbdd20aSAndroid Build Coastguard Worker       return true;
374*6dbdd20aSAndroid Build Coastguard Worker     }
375*6dbdd20aSAndroid Build Coastguard Worker   }
376*6dbdd20aSAndroid Build Coastguard Worker   return false;
377*6dbdd20aSAndroid Build Coastguard Worker }
378*6dbdd20aSAndroid Build Coastguard Worker 
WindowTimeForRingBuffer(const WindowedInterval & window)379*6dbdd20aSAndroid Build Coastguard Worker uint32_t Watchdog::WindowTimeForRingBuffer(const WindowedInterval& window) {
380*6dbdd20aSAndroid Build Coastguard Worker   return static_cast<uint32_t>(window.size() - 1) * polling_interval_ms_;
381*6dbdd20aSAndroid Build Coastguard Worker }
382*6dbdd20aSAndroid Build Coastguard Worker 
Push(uint64_t sample)383*6dbdd20aSAndroid Build Coastguard Worker bool Watchdog::WindowedInterval::Push(uint64_t sample) {
384*6dbdd20aSAndroid Build Coastguard Worker   // Add the sample to the current position in the ring buffer.
385*6dbdd20aSAndroid Build Coastguard Worker   buffer_[position_] = sample;
386*6dbdd20aSAndroid Build Coastguard Worker 
387*6dbdd20aSAndroid Build Coastguard Worker   // Update the position with next one circularily.
388*6dbdd20aSAndroid Build Coastguard Worker   position_ = (position_ + 1) % size_;
389*6dbdd20aSAndroid Build Coastguard Worker 
390*6dbdd20aSAndroid Build Coastguard Worker   // Set the filled flag the first time we wrap.
391*6dbdd20aSAndroid Build Coastguard Worker   filled_ = filled_ || position_ == 0;
392*6dbdd20aSAndroid Build Coastguard Worker   return filled_;
393*6dbdd20aSAndroid Build Coastguard Worker }
394*6dbdd20aSAndroid Build Coastguard Worker 
Mean() const395*6dbdd20aSAndroid Build Coastguard Worker double Watchdog::WindowedInterval::Mean() const {
396*6dbdd20aSAndroid Build Coastguard Worker   return MeanForArray(buffer_.get(), size_);
397*6dbdd20aSAndroid Build Coastguard Worker }
398*6dbdd20aSAndroid Build Coastguard Worker 
Clear()399*6dbdd20aSAndroid Build Coastguard Worker void Watchdog::WindowedInterval::Clear() {
400*6dbdd20aSAndroid Build Coastguard Worker   position_ = 0;
401*6dbdd20aSAndroid Build Coastguard Worker   buffer_.reset(new uint64_t[size_]());
402*6dbdd20aSAndroid Build Coastguard Worker }
403*6dbdd20aSAndroid Build Coastguard Worker 
Reset(size_t new_size)404*6dbdd20aSAndroid Build Coastguard Worker void Watchdog::WindowedInterval::Reset(size_t new_size) {
405*6dbdd20aSAndroid Build Coastguard Worker   position_ = 0;
406*6dbdd20aSAndroid Build Coastguard Worker   size_ = new_size;
407*6dbdd20aSAndroid Build Coastguard Worker   buffer_.reset(new_size == 0 ? nullptr : new uint64_t[new_size]());
408*6dbdd20aSAndroid Build Coastguard Worker }
409*6dbdd20aSAndroid Build Coastguard Worker 
Timer(Watchdog * watchdog,uint32_t ms,WatchdogCrashReason crash_reason)410*6dbdd20aSAndroid Build Coastguard Worker Watchdog::Timer::Timer(Watchdog* watchdog,
411*6dbdd20aSAndroid Build Coastguard Worker                        uint32_t ms,
412*6dbdd20aSAndroid Build Coastguard Worker                        WatchdogCrashReason crash_reason)
413*6dbdd20aSAndroid Build Coastguard Worker     : watchdog_(watchdog) {
414*6dbdd20aSAndroid Build Coastguard Worker   if (!ms)
415*6dbdd20aSAndroid Build Coastguard Worker     return;  // No-op timer created when the watchdog is disabled.
416*6dbdd20aSAndroid Build Coastguard Worker   timer_data_.deadline = GetWallTimeMs() + std::chrono::milliseconds(ms);
417*6dbdd20aSAndroid Build Coastguard Worker   timer_data_.thread_id = GetThreadId();
418*6dbdd20aSAndroid Build Coastguard Worker   timer_data_.crash_reason = crash_reason;
419*6dbdd20aSAndroid Build Coastguard Worker   PERFETTO_DCHECK(watchdog_);
420*6dbdd20aSAndroid Build Coastguard Worker   watchdog_->AddFatalTimer(timer_data_);
421*6dbdd20aSAndroid Build Coastguard Worker }
422*6dbdd20aSAndroid Build Coastguard Worker 
~Timer()423*6dbdd20aSAndroid Build Coastguard Worker Watchdog::Timer::~Timer() {
424*6dbdd20aSAndroid Build Coastguard Worker   if (timer_data_.deadline.count())
425*6dbdd20aSAndroid Build Coastguard Worker     watchdog_->RemoveFatalTimer(timer_data_);
426*6dbdd20aSAndroid Build Coastguard Worker }
427*6dbdd20aSAndroid Build Coastguard Worker 
Timer(Timer && other)428*6dbdd20aSAndroid Build Coastguard Worker Watchdog::Timer::Timer(Timer&& other) noexcept {
429*6dbdd20aSAndroid Build Coastguard Worker   watchdog_ = std::move(other.watchdog_);
430*6dbdd20aSAndroid Build Coastguard Worker   other.watchdog_ = nullptr;
431*6dbdd20aSAndroid Build Coastguard Worker   timer_data_ = std::move(other.timer_data_);
432*6dbdd20aSAndroid Build Coastguard Worker   other.timer_data_ = TimerData();
433*6dbdd20aSAndroid Build Coastguard Worker }
434*6dbdd20aSAndroid Build Coastguard Worker 
435*6dbdd20aSAndroid Build Coastguard Worker }  // namespace base
436*6dbdd20aSAndroid Build Coastguard Worker }  // namespace perfetto
437*6dbdd20aSAndroid Build Coastguard Worker 
438*6dbdd20aSAndroid Build Coastguard Worker #endif  // PERFETTO_BUILDFLAG(PERFETTO_WATCHDOG)
439