1*6dbdd20aSAndroid Build Coastguard Worker /*
2*6dbdd20aSAndroid Build Coastguard Worker * Copyright (C) 2018 The Android Open Source Project
3*6dbdd20aSAndroid Build Coastguard Worker *
4*6dbdd20aSAndroid Build Coastguard Worker * Licensed under the Apache License, Version 2.0 (the "License");
5*6dbdd20aSAndroid Build Coastguard Worker * you may not use this file except in compliance with the License.
6*6dbdd20aSAndroid Build Coastguard Worker * You may obtain a copy of the License at
7*6dbdd20aSAndroid Build Coastguard Worker *
8*6dbdd20aSAndroid Build Coastguard Worker * http://www.apache.org/licenses/LICENSE-2.0
9*6dbdd20aSAndroid Build Coastguard Worker *
10*6dbdd20aSAndroid Build Coastguard Worker * Unless required by applicable law or agreed to in writing, software
11*6dbdd20aSAndroid Build Coastguard Worker * distributed under the License is distributed on an "AS IS" BASIS,
12*6dbdd20aSAndroid Build Coastguard Worker * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*6dbdd20aSAndroid Build Coastguard Worker * See the License for the specific language governing permissions and
14*6dbdd20aSAndroid Build Coastguard Worker * limitations under the License.
15*6dbdd20aSAndroid Build Coastguard Worker */
16*6dbdd20aSAndroid Build Coastguard Worker
17*6dbdd20aSAndroid Build Coastguard Worker #include "perfetto/ext/base/platform.h"
18*6dbdd20aSAndroid Build Coastguard Worker #include "perfetto/ext/base/watchdog.h"
19*6dbdd20aSAndroid Build Coastguard Worker
20*6dbdd20aSAndroid Build Coastguard Worker #if PERFETTO_BUILDFLAG(PERFETTO_WATCHDOG)
21*6dbdd20aSAndroid Build Coastguard Worker
22*6dbdd20aSAndroid Build Coastguard Worker #include <fcntl.h>
23*6dbdd20aSAndroid Build Coastguard Worker #include <poll.h>
24*6dbdd20aSAndroid Build Coastguard Worker #include <signal.h>
25*6dbdd20aSAndroid Build Coastguard Worker #include <stdint.h>
26*6dbdd20aSAndroid Build Coastguard Worker #include <stdlib.h>
27*6dbdd20aSAndroid Build Coastguard Worker #include <sys/syscall.h>
28*6dbdd20aSAndroid Build Coastguard Worker #include <sys/timerfd.h>
29*6dbdd20aSAndroid Build Coastguard Worker #include <unistd.h>
30*6dbdd20aSAndroid Build Coastguard Worker
31*6dbdd20aSAndroid Build Coastguard Worker #include <algorithm>
32*6dbdd20aSAndroid Build Coastguard Worker #include <cinttypes>
33*6dbdd20aSAndroid Build Coastguard Worker #include <fstream>
34*6dbdd20aSAndroid Build Coastguard Worker #include <thread>
35*6dbdd20aSAndroid Build Coastguard Worker
36*6dbdd20aSAndroid Build Coastguard Worker #include "perfetto/base/build_config.h"
37*6dbdd20aSAndroid Build Coastguard Worker #include "perfetto/base/logging.h"
38*6dbdd20aSAndroid Build Coastguard Worker #include "perfetto/base/thread_utils.h"
39*6dbdd20aSAndroid Build Coastguard Worker #include "perfetto/base/time.h"
40*6dbdd20aSAndroid Build Coastguard Worker #include "perfetto/ext/base/crash_keys.h"
41*6dbdd20aSAndroid Build Coastguard Worker #include "perfetto/ext/base/file_utils.h"
42*6dbdd20aSAndroid Build Coastguard Worker #include "perfetto/ext/base/scoped_file.h"
43*6dbdd20aSAndroid Build Coastguard Worker #include "perfetto/ext/base/utils.h"
44*6dbdd20aSAndroid Build Coastguard Worker
45*6dbdd20aSAndroid Build Coastguard Worker namespace perfetto {
46*6dbdd20aSAndroid Build Coastguard Worker namespace base {
47*6dbdd20aSAndroid Build Coastguard Worker
48*6dbdd20aSAndroid Build Coastguard Worker namespace {
49*6dbdd20aSAndroid Build Coastguard Worker
// Polling interval, in milliseconds, handed to the Watchdog constructor by
// Watchdog::GetInstance().
50*6dbdd20aSAndroid Build Coastguard Worker constexpr uint32_t kDefaultPollingInterval = 30 * 1000;
51*6dbdd20aSAndroid Build Coastguard Worker
// Crash key named "wdog_reason". It is registered by the watchdog thread and
// set to the numeric WatchdogCrashReason right before the process is killed.
52*6dbdd20aSAndroid Build Coastguard Worker base::CrashKey g_crash_key_reason("wdog_reason");
53*6dbdd20aSAndroid Build Coastguard Worker
// Returns true iff |number| is a non-zero multiple of |divisor|.
//
// A |number| smaller than |divisor| (0 included) is never reported as a
// multiple. A zero |divisor| yields false rather than evaluating
// |number % 0|, which is undefined behaviour.
bool IsMultipleOf(uint32_t number, uint32_t divisor) {
  if (divisor == 0)
    return false;
  return number >= divisor && number % divisor == 0;
}
57*6dbdd20aSAndroid Build Coastguard Worker
// Returns the arithmetic mean of the first |size| elements of |array|, or 0
// when |size| is 0.
double MeanForArray(const uint64_t array[], size_t size) {
  if (size == 0)
    return 0.0;
  uint64_t total = 0;
  for (size_t i = 0; i < size; i++)
    total += array[i];
  // Divide in floating point: an integer division here would silently drop
  // the fractional part of the mean before the conversion to double.
  return static_cast<double>(total) / static_cast<double>(size);
}
65*6dbdd20aSAndroid Build Coastguard Worker
66*6dbdd20aSAndroid Build Coastguard Worker } // namespace
67*6dbdd20aSAndroid Build Coastguard Worker
ReadProcStat(int fd,ProcStat * out)68*6dbdd20aSAndroid Build Coastguard Worker bool ReadProcStat(int fd, ProcStat* out) {
69*6dbdd20aSAndroid Build Coastguard Worker char c[512];
70*6dbdd20aSAndroid Build Coastguard Worker size_t c_pos = 0;
71*6dbdd20aSAndroid Build Coastguard Worker while (c_pos < sizeof(c) - 1) {
72*6dbdd20aSAndroid Build Coastguard Worker ssize_t rd = PERFETTO_EINTR(read(fd, c + c_pos, sizeof(c) - c_pos));
73*6dbdd20aSAndroid Build Coastguard Worker if (rd < 0) {
74*6dbdd20aSAndroid Build Coastguard Worker PERFETTO_ELOG("Failed to read stat file to enforce resource limits.");
75*6dbdd20aSAndroid Build Coastguard Worker return false;
76*6dbdd20aSAndroid Build Coastguard Worker }
77*6dbdd20aSAndroid Build Coastguard Worker if (rd == 0)
78*6dbdd20aSAndroid Build Coastguard Worker break;
79*6dbdd20aSAndroid Build Coastguard Worker c_pos += static_cast<size_t>(rd);
80*6dbdd20aSAndroid Build Coastguard Worker }
81*6dbdd20aSAndroid Build Coastguard Worker PERFETTO_CHECK(c_pos < sizeof(c));
82*6dbdd20aSAndroid Build Coastguard Worker c[c_pos] = '\0';
83*6dbdd20aSAndroid Build Coastguard Worker
84*6dbdd20aSAndroid Build Coastguard Worker if (sscanf(c,
85*6dbdd20aSAndroid Build Coastguard Worker "%*d %*s %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u %lu "
86*6dbdd20aSAndroid Build Coastguard Worker "%lu %*d %*d %*d %*d %*d %*d %*u %*u %ld",
87*6dbdd20aSAndroid Build Coastguard Worker &out->utime, &out->stime, &out->rss_pages) != 3) {
88*6dbdd20aSAndroid Build Coastguard Worker PERFETTO_ELOG("Invalid stat format: %s", c);
89*6dbdd20aSAndroid Build Coastguard Worker return false;
90*6dbdd20aSAndroid Build Coastguard Worker }
91*6dbdd20aSAndroid Build Coastguard Worker return true;
92*6dbdd20aSAndroid Build Coastguard Worker }
93*6dbdd20aSAndroid Build Coastguard Worker
Watchdog(uint32_t polling_interval_ms)94*6dbdd20aSAndroid Build Coastguard Worker Watchdog::Watchdog(uint32_t polling_interval_ms)
95*6dbdd20aSAndroid Build Coastguard Worker : polling_interval_ms_(polling_interval_ms) {}
96*6dbdd20aSAndroid Build Coastguard Worker
~Watchdog()97*6dbdd20aSAndroid Build Coastguard Worker Watchdog::~Watchdog() {
98*6dbdd20aSAndroid Build Coastguard Worker if (!thread_.joinable()) {
99*6dbdd20aSAndroid Build Coastguard Worker PERFETTO_DCHECK(!enabled_);
100*6dbdd20aSAndroid Build Coastguard Worker return;
101*6dbdd20aSAndroid Build Coastguard Worker }
102*6dbdd20aSAndroid Build Coastguard Worker PERFETTO_DCHECK(enabled_);
103*6dbdd20aSAndroid Build Coastguard Worker enabled_ = false;
104*6dbdd20aSAndroid Build Coastguard Worker
105*6dbdd20aSAndroid Build Coastguard Worker // Rearm the timer to 1ns from now. This will cause the watchdog thread to
106*6dbdd20aSAndroid Build Coastguard Worker // wakeup from the poll() and see |enabled_| == false.
107*6dbdd20aSAndroid Build Coastguard Worker // This code path is used only in tests. In production code the watchdog is
108*6dbdd20aSAndroid Build Coastguard Worker // a singleton and is never destroyed.
109*6dbdd20aSAndroid Build Coastguard Worker struct itimerspec ts {};
110*6dbdd20aSAndroid Build Coastguard Worker ts.it_value.tv_sec = 0;
111*6dbdd20aSAndroid Build Coastguard Worker ts.it_value.tv_nsec = 1;
112*6dbdd20aSAndroid Build Coastguard Worker timerfd_settime(*timer_fd_, /*flags=*/0, &ts, nullptr);
113*6dbdd20aSAndroid Build Coastguard Worker
114*6dbdd20aSAndroid Build Coastguard Worker thread_.join();
115*6dbdd20aSAndroid Build Coastguard Worker }
116*6dbdd20aSAndroid Build Coastguard Worker
GetInstance()117*6dbdd20aSAndroid Build Coastguard Worker Watchdog* Watchdog::GetInstance() {
118*6dbdd20aSAndroid Build Coastguard Worker static Watchdog* watchdog = new Watchdog(kDefaultPollingInterval);
119*6dbdd20aSAndroid Build Coastguard Worker return watchdog;
120*6dbdd20aSAndroid Build Coastguard Worker }
121*6dbdd20aSAndroid Build Coastguard Worker
122*6dbdd20aSAndroid Build Coastguard Worker // Can be called from any thread.
CreateFatalTimer(uint32_t ms,WatchdogCrashReason crash_reason)123*6dbdd20aSAndroid Build Coastguard Worker Watchdog::Timer Watchdog::CreateFatalTimer(uint32_t ms,
124*6dbdd20aSAndroid Build Coastguard Worker WatchdogCrashReason crash_reason) {
125*6dbdd20aSAndroid Build Coastguard Worker if (!enabled_.load(std::memory_order_relaxed))
126*6dbdd20aSAndroid Build Coastguard Worker return Watchdog::Timer(this, 0, crash_reason);
127*6dbdd20aSAndroid Build Coastguard Worker
128*6dbdd20aSAndroid Build Coastguard Worker return Watchdog::Timer(this, ms, crash_reason);
129*6dbdd20aSAndroid Build Coastguard Worker }
130*6dbdd20aSAndroid Build Coastguard Worker
131*6dbdd20aSAndroid Build Coastguard Worker // Can be called from any thread.
AddFatalTimer(TimerData timer)132*6dbdd20aSAndroid Build Coastguard Worker void Watchdog::AddFatalTimer(TimerData timer) {
133*6dbdd20aSAndroid Build Coastguard Worker std::lock_guard<std::mutex> guard(mutex_);
134*6dbdd20aSAndroid Build Coastguard Worker timers_.emplace_back(std::move(timer));
135*6dbdd20aSAndroid Build Coastguard Worker RearmTimerFd_Locked();
136*6dbdd20aSAndroid Build Coastguard Worker }
137*6dbdd20aSAndroid Build Coastguard Worker
138*6dbdd20aSAndroid Build Coastguard Worker // Can be called from any thread.
RemoveFatalTimer(TimerData timer)139*6dbdd20aSAndroid Build Coastguard Worker void Watchdog::RemoveFatalTimer(TimerData timer) {
140*6dbdd20aSAndroid Build Coastguard Worker std::lock_guard<std::mutex> guard(mutex_);
141*6dbdd20aSAndroid Build Coastguard Worker for (auto it = timers_.begin(); it != timers_.end(); it++) {
142*6dbdd20aSAndroid Build Coastguard Worker if (*it == timer) {
143*6dbdd20aSAndroid Build Coastguard Worker timers_.erase(it);
144*6dbdd20aSAndroid Build Coastguard Worker break; // Remove only one. Doesn't matter which one.
145*6dbdd20aSAndroid Build Coastguard Worker }
146*6dbdd20aSAndroid Build Coastguard Worker }
147*6dbdd20aSAndroid Build Coastguard Worker RearmTimerFd_Locked();
148*6dbdd20aSAndroid Build Coastguard Worker }
149*6dbdd20aSAndroid Build Coastguard Worker
RearmTimerFd_Locked()150*6dbdd20aSAndroid Build Coastguard Worker void Watchdog::RearmTimerFd_Locked() {
151*6dbdd20aSAndroid Build Coastguard Worker if (!enabled_)
152*6dbdd20aSAndroid Build Coastguard Worker return;
153*6dbdd20aSAndroid Build Coastguard Worker auto it = std::min_element(timers_.begin(), timers_.end());
154*6dbdd20aSAndroid Build Coastguard Worker
155*6dbdd20aSAndroid Build Coastguard Worker // We use one timerfd to handle all the oustanding |timers_|. Keep it armed
156*6dbdd20aSAndroid Build Coastguard Worker // to the task expiring soonest.
157*6dbdd20aSAndroid Build Coastguard Worker struct itimerspec ts {};
158*6dbdd20aSAndroid Build Coastguard Worker if (it != timers_.end()) {
159*6dbdd20aSAndroid Build Coastguard Worker ts.it_value = ToPosixTimespec(it->deadline);
160*6dbdd20aSAndroid Build Coastguard Worker }
161*6dbdd20aSAndroid Build Coastguard Worker // If |timers_| is empty (it == end()) |ts.it_value| will remain
162*6dbdd20aSAndroid Build Coastguard Worker // zero-initialized and that will disarm the timer in the call below.
163*6dbdd20aSAndroid Build Coastguard Worker int res = timerfd_settime(*timer_fd_, TFD_TIMER_ABSTIME, &ts, nullptr);
164*6dbdd20aSAndroid Build Coastguard Worker PERFETTO_DCHECK(res == 0);
165*6dbdd20aSAndroid Build Coastguard Worker }
166*6dbdd20aSAndroid Build Coastguard Worker
Start()167*6dbdd20aSAndroid Build Coastguard Worker void Watchdog::Start() {
168*6dbdd20aSAndroid Build Coastguard Worker std::lock_guard<std::mutex> guard(mutex_);
169*6dbdd20aSAndroid Build Coastguard Worker if (thread_.joinable()) {
170*6dbdd20aSAndroid Build Coastguard Worker PERFETTO_DCHECK(enabled_);
171*6dbdd20aSAndroid Build Coastguard Worker } else {
172*6dbdd20aSAndroid Build Coastguard Worker PERFETTO_DCHECK(!enabled_);
173*6dbdd20aSAndroid Build Coastguard Worker
174*6dbdd20aSAndroid Build Coastguard Worker #if PERFETTO_BUILDFLAG(PERFETTO_OS_LINUX) || \
175*6dbdd20aSAndroid Build Coastguard Worker PERFETTO_BUILDFLAG(PERFETTO_OS_ANDROID)
176*6dbdd20aSAndroid Build Coastguard Worker // Kick the thread to start running but only on Android or Linux.
177*6dbdd20aSAndroid Build Coastguard Worker timer_fd_.reset(
178*6dbdd20aSAndroid Build Coastguard Worker timerfd_create(CLOCK_MONOTONIC, TFD_CLOEXEC | TFD_NONBLOCK));
179*6dbdd20aSAndroid Build Coastguard Worker if (!timer_fd_) {
180*6dbdd20aSAndroid Build Coastguard Worker PERFETTO_PLOG(
181*6dbdd20aSAndroid Build Coastguard Worker "timerfd_create failed, the Perfetto watchdog is not available");
182*6dbdd20aSAndroid Build Coastguard Worker return;
183*6dbdd20aSAndroid Build Coastguard Worker }
184*6dbdd20aSAndroid Build Coastguard Worker enabled_ = true;
185*6dbdd20aSAndroid Build Coastguard Worker RearmTimerFd_Locked(); // Deal with timers created before Start().
186*6dbdd20aSAndroid Build Coastguard Worker thread_ = std::thread(&Watchdog::ThreadMain, this);
187*6dbdd20aSAndroid Build Coastguard Worker #endif
188*6dbdd20aSAndroid Build Coastguard Worker }
189*6dbdd20aSAndroid Build Coastguard Worker }
190*6dbdd20aSAndroid Build Coastguard Worker
SetMemoryLimit(uint64_t bytes,uint32_t window_ms)191*6dbdd20aSAndroid Build Coastguard Worker void Watchdog::SetMemoryLimit(uint64_t bytes, uint32_t window_ms) {
192*6dbdd20aSAndroid Build Coastguard Worker // Update the fields under the lock.
193*6dbdd20aSAndroid Build Coastguard Worker std::lock_guard<std::mutex> guard(mutex_);
194*6dbdd20aSAndroid Build Coastguard Worker
195*6dbdd20aSAndroid Build Coastguard Worker PERFETTO_CHECK(IsMultipleOf(window_ms, polling_interval_ms_) || bytes == 0);
196*6dbdd20aSAndroid Build Coastguard Worker
197*6dbdd20aSAndroid Build Coastguard Worker size_t size = bytes == 0 ? 0 : window_ms / polling_interval_ms_ + 1;
198*6dbdd20aSAndroid Build Coastguard Worker memory_window_bytes_.Reset(size);
199*6dbdd20aSAndroid Build Coastguard Worker memory_limit_bytes_ = bytes;
200*6dbdd20aSAndroid Build Coastguard Worker }
201*6dbdd20aSAndroid Build Coastguard Worker
SetCpuLimit(uint32_t percentage,uint32_t window_ms)202*6dbdd20aSAndroid Build Coastguard Worker void Watchdog::SetCpuLimit(uint32_t percentage, uint32_t window_ms) {
203*6dbdd20aSAndroid Build Coastguard Worker std::lock_guard<std::mutex> guard(mutex_);
204*6dbdd20aSAndroid Build Coastguard Worker
205*6dbdd20aSAndroid Build Coastguard Worker PERFETTO_CHECK(percentage <= 100);
206*6dbdd20aSAndroid Build Coastguard Worker PERFETTO_CHECK(IsMultipleOf(window_ms, polling_interval_ms_) ||
207*6dbdd20aSAndroid Build Coastguard Worker percentage == 0);
208*6dbdd20aSAndroid Build Coastguard Worker
209*6dbdd20aSAndroid Build Coastguard Worker size_t size = percentage == 0 ? 0 : window_ms / polling_interval_ms_ + 1;
210*6dbdd20aSAndroid Build Coastguard Worker cpu_window_time_ticks_.Reset(size);
211*6dbdd20aSAndroid Build Coastguard Worker cpu_limit_percentage_ = percentage;
212*6dbdd20aSAndroid Build Coastguard Worker }
213*6dbdd20aSAndroid Build Coastguard Worker
ThreadMain()214*6dbdd20aSAndroid Build Coastguard Worker void Watchdog::ThreadMain() {
215*6dbdd20aSAndroid Build Coastguard Worker // Register crash keys explicitly to avoid running out of slots at crash time.
216*6dbdd20aSAndroid Build Coastguard Worker g_crash_key_reason.Register();
217*6dbdd20aSAndroid Build Coastguard Worker
218*6dbdd20aSAndroid Build Coastguard Worker base::ScopedFile stat_fd(base::OpenFile("/proc/self/stat", O_RDONLY));
219*6dbdd20aSAndroid Build Coastguard Worker if (!stat_fd) {
220*6dbdd20aSAndroid Build Coastguard Worker PERFETTO_ELOG("Failed to open stat file to enforce resource limits.");
221*6dbdd20aSAndroid Build Coastguard Worker return;
222*6dbdd20aSAndroid Build Coastguard Worker }
223*6dbdd20aSAndroid Build Coastguard Worker
224*6dbdd20aSAndroid Build Coastguard Worker PERFETTO_DCHECK(timer_fd_);
225*6dbdd20aSAndroid Build Coastguard Worker
226*6dbdd20aSAndroid Build Coastguard Worker constexpr uint8_t kFdCount = 1;
227*6dbdd20aSAndroid Build Coastguard Worker struct pollfd fds[kFdCount]{};
228*6dbdd20aSAndroid Build Coastguard Worker fds[0].fd = *timer_fd_;
229*6dbdd20aSAndroid Build Coastguard Worker fds[0].events = POLLIN;
230*6dbdd20aSAndroid Build Coastguard Worker
231*6dbdd20aSAndroid Build Coastguard Worker for (;;) {
232*6dbdd20aSAndroid Build Coastguard Worker // We use the poll() timeout to drive the periodic ticks for the cpu/memory
233*6dbdd20aSAndroid Build Coastguard Worker // checks. The only other case when the poll() unblocks is when we crash
234*6dbdd20aSAndroid Build Coastguard Worker // (or have to quit via enabled_ == false, but that happens only in tests).
235*6dbdd20aSAndroid Build Coastguard Worker platform::BeforeMaybeBlockingSyscall();
236*6dbdd20aSAndroid Build Coastguard Worker auto ret = poll(fds, kFdCount, static_cast<int>(polling_interval_ms_));
237*6dbdd20aSAndroid Build Coastguard Worker platform::AfterMaybeBlockingSyscall();
238*6dbdd20aSAndroid Build Coastguard Worker if (!enabled_)
239*6dbdd20aSAndroid Build Coastguard Worker return;
240*6dbdd20aSAndroid Build Coastguard Worker if (ret < 0) {
241*6dbdd20aSAndroid Build Coastguard Worker if (errno == ENOMEM || errno == EINTR) {
242*6dbdd20aSAndroid Build Coastguard Worker // Should happen extremely rarely.
243*6dbdd20aSAndroid Build Coastguard Worker std::this_thread::sleep_for(std::chrono::milliseconds(100));
244*6dbdd20aSAndroid Build Coastguard Worker continue;
245*6dbdd20aSAndroid Build Coastguard Worker }
246*6dbdd20aSAndroid Build Coastguard Worker PERFETTO_FATAL("watchdog poll() failed");
247*6dbdd20aSAndroid Build Coastguard Worker }
248*6dbdd20aSAndroid Build Coastguard Worker
249*6dbdd20aSAndroid Build Coastguard Worker // If we get here either:
250*6dbdd20aSAndroid Build Coastguard Worker // 1. poll() timed out, in which case we should process cpu/mem guardrails.
251*6dbdd20aSAndroid Build Coastguard Worker // 2. A timer expired, in which case we shall crash.
252*6dbdd20aSAndroid Build Coastguard Worker
253*6dbdd20aSAndroid Build Coastguard Worker uint64_t expired = 0; // Must be exactly 8 bytes.
254*6dbdd20aSAndroid Build Coastguard Worker auto res = PERFETTO_EINTR(read(*timer_fd_, &expired, sizeof(expired)));
255*6dbdd20aSAndroid Build Coastguard Worker PERFETTO_DCHECK((res < 0 && (errno == EAGAIN)) ||
256*6dbdd20aSAndroid Build Coastguard Worker (res == sizeof(expired) && expired > 0));
257*6dbdd20aSAndroid Build Coastguard Worker const auto now = GetWallTimeMs();
258*6dbdd20aSAndroid Build Coastguard Worker
259*6dbdd20aSAndroid Build Coastguard Worker // Check if any of the timers expired.
260*6dbdd20aSAndroid Build Coastguard Worker int tid_to_kill = 0;
261*6dbdd20aSAndroid Build Coastguard Worker WatchdogCrashReason crash_reason{};
262*6dbdd20aSAndroid Build Coastguard Worker {
263*6dbdd20aSAndroid Build Coastguard Worker std::lock_guard<std::mutex> guard(mutex_);
264*6dbdd20aSAndroid Build Coastguard Worker for (const auto& timer : timers_) {
265*6dbdd20aSAndroid Build Coastguard Worker if (now >= timer.deadline) {
266*6dbdd20aSAndroid Build Coastguard Worker tid_to_kill = timer.thread_id;
267*6dbdd20aSAndroid Build Coastguard Worker crash_reason = timer.crash_reason;
268*6dbdd20aSAndroid Build Coastguard Worker break;
269*6dbdd20aSAndroid Build Coastguard Worker }
270*6dbdd20aSAndroid Build Coastguard Worker }
271*6dbdd20aSAndroid Build Coastguard Worker }
272*6dbdd20aSAndroid Build Coastguard Worker
273*6dbdd20aSAndroid Build Coastguard Worker if (tid_to_kill)
274*6dbdd20aSAndroid Build Coastguard Worker SerializeLogsAndKillThread(tid_to_kill, crash_reason);
275*6dbdd20aSAndroid Build Coastguard Worker
276*6dbdd20aSAndroid Build Coastguard Worker // Check CPU and memory guardrails (if enabled).
277*6dbdd20aSAndroid Build Coastguard Worker lseek(stat_fd.get(), 0, SEEK_SET);
278*6dbdd20aSAndroid Build Coastguard Worker ProcStat stat;
279*6dbdd20aSAndroid Build Coastguard Worker if (!ReadProcStat(stat_fd.get(), &stat))
280*6dbdd20aSAndroid Build Coastguard Worker continue;
281*6dbdd20aSAndroid Build Coastguard Worker uint64_t cpu_time = stat.utime + stat.stime;
282*6dbdd20aSAndroid Build Coastguard Worker uint64_t rss_bytes =
283*6dbdd20aSAndroid Build Coastguard Worker static_cast<uint64_t>(stat.rss_pages) * base::GetSysPageSize();
284*6dbdd20aSAndroid Build Coastguard Worker
285*6dbdd20aSAndroid Build Coastguard Worker bool threshold_exceeded = false;
286*6dbdd20aSAndroid Build Coastguard Worker {
287*6dbdd20aSAndroid Build Coastguard Worker std::lock_guard<std::mutex> guard(mutex_);
288*6dbdd20aSAndroid Build Coastguard Worker if (CheckMemory_Locked(rss_bytes) && !IsSyncMemoryTaggingEnabled()) {
289*6dbdd20aSAndroid Build Coastguard Worker threshold_exceeded = true;
290*6dbdd20aSAndroid Build Coastguard Worker crash_reason = WatchdogCrashReason::kMemGuardrail;
291*6dbdd20aSAndroid Build Coastguard Worker } else if (CheckCpu_Locked(cpu_time)) {
292*6dbdd20aSAndroid Build Coastguard Worker threshold_exceeded = true;
293*6dbdd20aSAndroid Build Coastguard Worker crash_reason = WatchdogCrashReason::kCpuGuardrail;
294*6dbdd20aSAndroid Build Coastguard Worker }
295*6dbdd20aSAndroid Build Coastguard Worker }
296*6dbdd20aSAndroid Build Coastguard Worker
297*6dbdd20aSAndroid Build Coastguard Worker if (threshold_exceeded)
298*6dbdd20aSAndroid Build Coastguard Worker SerializeLogsAndKillThread(getpid(), crash_reason);
299*6dbdd20aSAndroid Build Coastguard Worker }
300*6dbdd20aSAndroid Build Coastguard Worker }
301*6dbdd20aSAndroid Build Coastguard Worker
SerializeLogsAndKillThread(int tid,WatchdogCrashReason crash_reason)302*6dbdd20aSAndroid Build Coastguard Worker void Watchdog::SerializeLogsAndKillThread(int tid,
303*6dbdd20aSAndroid Build Coastguard Worker WatchdogCrashReason crash_reason) {
304*6dbdd20aSAndroid Build Coastguard Worker g_crash_key_reason.Set(static_cast<int>(crash_reason));
305*6dbdd20aSAndroid Build Coastguard Worker
306*6dbdd20aSAndroid Build Coastguard Worker // We are about to die. Serialize the logs into the crash buffer so the
307*6dbdd20aSAndroid Build Coastguard Worker // debuggerd crash handler picks them up and attaches to the bugreport.
308*6dbdd20aSAndroid Build Coastguard Worker // In the case of a PERFETTO_CHECK/PERFETTO_FATAL this is done in logging.h.
309*6dbdd20aSAndroid Build Coastguard Worker // But in the watchdog case, we don't hit that codepath and must do ourselves.
310*6dbdd20aSAndroid Build Coastguard Worker MaybeSerializeLastLogsForCrashReporting();
311*6dbdd20aSAndroid Build Coastguard Worker
312*6dbdd20aSAndroid Build Coastguard Worker // Send a SIGABRT to the thread that armed the timer. This is to see the
313*6dbdd20aSAndroid Build Coastguard Worker // callstack of the thread that is stuck in a long task rather than the
314*6dbdd20aSAndroid Build Coastguard Worker // watchdog thread.
315*6dbdd20aSAndroid Build Coastguard Worker if (syscall(__NR_tgkill, getpid(), tid, SIGABRT) < 0) {
316*6dbdd20aSAndroid Build Coastguard Worker // At this point the process must die. If for any reason the tgkill doesn't
317*6dbdd20aSAndroid Build Coastguard Worker // work (e.g. the thread has disappeared), force a crash from here.
318*6dbdd20aSAndroid Build Coastguard Worker abort();
319*6dbdd20aSAndroid Build Coastguard Worker }
320*6dbdd20aSAndroid Build Coastguard Worker
321*6dbdd20aSAndroid Build Coastguard Worker if (disable_kill_failsafe_for_testing_)
322*6dbdd20aSAndroid Build Coastguard Worker return;
323*6dbdd20aSAndroid Build Coastguard Worker
324*6dbdd20aSAndroid Build Coastguard Worker // The tgkill() above will take some milliseconds to cause a crash, as it
325*6dbdd20aSAndroid Build Coastguard Worker // involves the kernel to queue the SIGABRT on the target thread (often the
326*6dbdd20aSAndroid Build Coastguard Worker // main thread, which is != watchdog thread) and do a scheduling round.
327*6dbdd20aSAndroid Build Coastguard Worker // If something goes wrong though (the target thread has signals masked or
328*6dbdd20aSAndroid Build Coastguard Worker // is stuck in an uninterruptible+wakekill syscall) force quit from this
329*6dbdd20aSAndroid Build Coastguard Worker // thread.
330*6dbdd20aSAndroid Build Coastguard Worker std::this_thread::sleep_for(std::chrono::seconds(10));
331*6dbdd20aSAndroid Build Coastguard Worker abort();
332*6dbdd20aSAndroid Build Coastguard Worker }
333*6dbdd20aSAndroid Build Coastguard Worker
CheckMemory_Locked(uint64_t rss_bytes)334*6dbdd20aSAndroid Build Coastguard Worker bool Watchdog::CheckMemory_Locked(uint64_t rss_bytes) {
335*6dbdd20aSAndroid Build Coastguard Worker if (memory_limit_bytes_ == 0)
336*6dbdd20aSAndroid Build Coastguard Worker return false;
337*6dbdd20aSAndroid Build Coastguard Worker
338*6dbdd20aSAndroid Build Coastguard Worker // Add the current stat value to the ring buffer and check that the mean
339*6dbdd20aSAndroid Build Coastguard Worker // remains under our threshold.
340*6dbdd20aSAndroid Build Coastguard Worker if (memory_window_bytes_.Push(rss_bytes)) {
341*6dbdd20aSAndroid Build Coastguard Worker if (memory_window_bytes_.Mean() >
342*6dbdd20aSAndroid Build Coastguard Worker static_cast<double>(memory_limit_bytes_)) {
343*6dbdd20aSAndroid Build Coastguard Worker PERFETTO_ELOG(
344*6dbdd20aSAndroid Build Coastguard Worker "Memory watchdog trigger. Memory window of %f bytes is above the "
345*6dbdd20aSAndroid Build Coastguard Worker "%" PRIu64 " bytes limit.",
346*6dbdd20aSAndroid Build Coastguard Worker memory_window_bytes_.Mean(), memory_limit_bytes_);
347*6dbdd20aSAndroid Build Coastguard Worker return true;
348*6dbdd20aSAndroid Build Coastguard Worker }
349*6dbdd20aSAndroid Build Coastguard Worker }
350*6dbdd20aSAndroid Build Coastguard Worker return false;
351*6dbdd20aSAndroid Build Coastguard Worker }
352*6dbdd20aSAndroid Build Coastguard Worker
CheckCpu_Locked(uint64_t cpu_time)353*6dbdd20aSAndroid Build Coastguard Worker bool Watchdog::CheckCpu_Locked(uint64_t cpu_time) {
354*6dbdd20aSAndroid Build Coastguard Worker if (cpu_limit_percentage_ == 0)
355*6dbdd20aSAndroid Build Coastguard Worker return false;
356*6dbdd20aSAndroid Build Coastguard Worker
357*6dbdd20aSAndroid Build Coastguard Worker // Add the cpu time to the ring buffer.
358*6dbdd20aSAndroid Build Coastguard Worker if (cpu_window_time_ticks_.Push(cpu_time)) {
359*6dbdd20aSAndroid Build Coastguard Worker // Compute the percentage over the whole window and check that it remains
360*6dbdd20aSAndroid Build Coastguard Worker // under the threshold.
361*6dbdd20aSAndroid Build Coastguard Worker uint64_t difference_ticks = cpu_window_time_ticks_.NewestWhenFull() -
362*6dbdd20aSAndroid Build Coastguard Worker cpu_window_time_ticks_.OldestWhenFull();
363*6dbdd20aSAndroid Build Coastguard Worker double window_interval_ticks =
364*6dbdd20aSAndroid Build Coastguard Worker (static_cast<double>(WindowTimeForRingBuffer(cpu_window_time_ticks_)) /
365*6dbdd20aSAndroid Build Coastguard Worker 1000.0) *
366*6dbdd20aSAndroid Build Coastguard Worker static_cast<double>(sysconf(_SC_CLK_TCK));
367*6dbdd20aSAndroid Build Coastguard Worker double percentage = static_cast<double>(difference_ticks) /
368*6dbdd20aSAndroid Build Coastguard Worker static_cast<double>(window_interval_ticks) * 100;
369*6dbdd20aSAndroid Build Coastguard Worker if (percentage > cpu_limit_percentage_) {
370*6dbdd20aSAndroid Build Coastguard Worker PERFETTO_ELOG("CPU watchdog trigger. %f%% CPU use is above the %" PRIu32
371*6dbdd20aSAndroid Build Coastguard Worker "%% CPU limit.",
372*6dbdd20aSAndroid Build Coastguard Worker percentage, cpu_limit_percentage_);
373*6dbdd20aSAndroid Build Coastguard Worker return true;
374*6dbdd20aSAndroid Build Coastguard Worker }
375*6dbdd20aSAndroid Build Coastguard Worker }
376*6dbdd20aSAndroid Build Coastguard Worker return false;
377*6dbdd20aSAndroid Build Coastguard Worker }
378*6dbdd20aSAndroid Build Coastguard Worker
WindowTimeForRingBuffer(const WindowedInterval & window)379*6dbdd20aSAndroid Build Coastguard Worker uint32_t Watchdog::WindowTimeForRingBuffer(const WindowedInterval& window) {
380*6dbdd20aSAndroid Build Coastguard Worker return static_cast<uint32_t>(window.size() - 1) * polling_interval_ms_;
381*6dbdd20aSAndroid Build Coastguard Worker }
382*6dbdd20aSAndroid Build Coastguard Worker
Push(uint64_t sample)383*6dbdd20aSAndroid Build Coastguard Worker bool Watchdog::WindowedInterval::Push(uint64_t sample) {
384*6dbdd20aSAndroid Build Coastguard Worker // Add the sample to the current position in the ring buffer.
385*6dbdd20aSAndroid Build Coastguard Worker buffer_[position_] = sample;
386*6dbdd20aSAndroid Build Coastguard Worker
387*6dbdd20aSAndroid Build Coastguard Worker // Update the position with next one circularily.
388*6dbdd20aSAndroid Build Coastguard Worker position_ = (position_ + 1) % size_;
389*6dbdd20aSAndroid Build Coastguard Worker
390*6dbdd20aSAndroid Build Coastguard Worker // Set the filled flag the first time we wrap.
391*6dbdd20aSAndroid Build Coastguard Worker filled_ = filled_ || position_ == 0;
392*6dbdd20aSAndroid Build Coastguard Worker return filled_;
393*6dbdd20aSAndroid Build Coastguard Worker }
394*6dbdd20aSAndroid Build Coastguard Worker
Mean() const395*6dbdd20aSAndroid Build Coastguard Worker double Watchdog::WindowedInterval::Mean() const {
396*6dbdd20aSAndroid Build Coastguard Worker return MeanForArray(buffer_.get(), size_);
397*6dbdd20aSAndroid Build Coastguard Worker }
398*6dbdd20aSAndroid Build Coastguard Worker
Clear()399*6dbdd20aSAndroid Build Coastguard Worker void Watchdog::WindowedInterval::Clear() {
400*6dbdd20aSAndroid Build Coastguard Worker position_ = 0;
401*6dbdd20aSAndroid Build Coastguard Worker buffer_.reset(new uint64_t[size_]());
402*6dbdd20aSAndroid Build Coastguard Worker }
403*6dbdd20aSAndroid Build Coastguard Worker
Reset(size_t new_size)404*6dbdd20aSAndroid Build Coastguard Worker void Watchdog::WindowedInterval::Reset(size_t new_size) {
405*6dbdd20aSAndroid Build Coastguard Worker position_ = 0;
406*6dbdd20aSAndroid Build Coastguard Worker size_ = new_size;
407*6dbdd20aSAndroid Build Coastguard Worker buffer_.reset(new_size == 0 ? nullptr : new uint64_t[new_size]());
408*6dbdd20aSAndroid Build Coastguard Worker }
409*6dbdd20aSAndroid Build Coastguard Worker
Timer(Watchdog * watchdog,uint32_t ms,WatchdogCrashReason crash_reason)410*6dbdd20aSAndroid Build Coastguard Worker Watchdog::Timer::Timer(Watchdog* watchdog,
411*6dbdd20aSAndroid Build Coastguard Worker uint32_t ms,
412*6dbdd20aSAndroid Build Coastguard Worker WatchdogCrashReason crash_reason)
413*6dbdd20aSAndroid Build Coastguard Worker : watchdog_(watchdog) {
414*6dbdd20aSAndroid Build Coastguard Worker if (!ms)
415*6dbdd20aSAndroid Build Coastguard Worker return; // No-op timer created when the watchdog is disabled.
416*6dbdd20aSAndroid Build Coastguard Worker timer_data_.deadline = GetWallTimeMs() + std::chrono::milliseconds(ms);
417*6dbdd20aSAndroid Build Coastguard Worker timer_data_.thread_id = GetThreadId();
418*6dbdd20aSAndroid Build Coastguard Worker timer_data_.crash_reason = crash_reason;
419*6dbdd20aSAndroid Build Coastguard Worker PERFETTO_DCHECK(watchdog_);
420*6dbdd20aSAndroid Build Coastguard Worker watchdog_->AddFatalTimer(timer_data_);
421*6dbdd20aSAndroid Build Coastguard Worker }
422*6dbdd20aSAndroid Build Coastguard Worker
~Timer()423*6dbdd20aSAndroid Build Coastguard Worker Watchdog::Timer::~Timer() {
424*6dbdd20aSAndroid Build Coastguard Worker if (timer_data_.deadline.count())
425*6dbdd20aSAndroid Build Coastguard Worker watchdog_->RemoveFatalTimer(timer_data_);
426*6dbdd20aSAndroid Build Coastguard Worker }
427*6dbdd20aSAndroid Build Coastguard Worker
Timer(Timer && other)428*6dbdd20aSAndroid Build Coastguard Worker Watchdog::Timer::Timer(Timer&& other) noexcept {
429*6dbdd20aSAndroid Build Coastguard Worker watchdog_ = std::move(other.watchdog_);
430*6dbdd20aSAndroid Build Coastguard Worker other.watchdog_ = nullptr;
431*6dbdd20aSAndroid Build Coastguard Worker timer_data_ = std::move(other.timer_data_);
432*6dbdd20aSAndroid Build Coastguard Worker other.timer_data_ = TimerData();
433*6dbdd20aSAndroid Build Coastguard Worker }
434*6dbdd20aSAndroid Build Coastguard Worker
435*6dbdd20aSAndroid Build Coastguard Worker } // namespace base
436*6dbdd20aSAndroid Build Coastguard Worker } // namespace perfetto
437*6dbdd20aSAndroid Build Coastguard Worker
438*6dbdd20aSAndroid Build Coastguard Worker #endif // PERFETTO_BUILDFLAG(PERFETTO_WATCHDOG)
439