xref: /aosp_15_r20/system/security/keystore2/watchdog/src/lib.rs (revision e1997b9af69e3155ead6e072d106a0077849ffba)
1 // Copyright 2021, The Android Open Source Project
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // Can be removed when instrumentations are added to keystore.
16 #![allow(dead_code)]
17 
18 //! This module implements a watchdog thread.
19 
20 use std::{
21     cmp::min,
22     collections::HashMap,
23     sync::Arc,
24     sync::{Condvar, Mutex, MutexGuard},
25     thread,
26 };
27 use std::{
28     marker::PhantomData,
29     time::{Duration, Instant},
30 };
31 
32 #[cfg(test)]
33 mod tests;
34 
35 /// Represents a Watchdog record. It can be created with `Watchdog::watch` or
36 /// `Watchdog::watch_with`. It disarms the record when dropped.
37 pub struct WatchPoint {
38     id: &'static str,
39     wd: Arc<Watchdog>,
40     not_send: PhantomData<*mut ()>, // WatchPoint must not be Send.
41 }
42 
43 impl Drop for WatchPoint {
drop(&mut self)44     fn drop(&mut self) {
45         self.wd.disarm(self.id)
46     }
47 }
48 
49 #[derive(Debug, PartialEq, Eq)]
50 enum State {
51     NotRunning,
52     Running,
53 }
54 
55 #[derive(Debug, Clone, Hash, PartialEq, Eq)]
56 struct Index {
57     tid: thread::ThreadId,
58     id: &'static str,
59 }
60 
61 struct Record {
62     started: Instant,
63     deadline: Instant,
64     context: Option<Box<dyn std::fmt::Debug + Send + 'static>>,
65 }
66 
67 struct WatchdogState {
68     state: State,
69     thread: Option<thread::JoinHandle<()>>,
70     /// How long to wait before dropping the watchdog thread when idle.
71     idle_timeout: Duration,
72     records: HashMap<Index, Record>,
73     last_report: Option<Instant>,
74     noisy_timeout: Duration,
75 }
76 
77 impl WatchdogState {
78     /// If we have overdue records, we want to log them but slowly backoff
79     /// so that we do not clog the logs. We start with logs every
80     /// `MIN_REPORT_TIMEOUT` sec then increment the timeout by 5 up
81     /// to a maximum of `MAX_REPORT_TIMEOUT`.
82     const MIN_REPORT_TIMEOUT: Duration = Duration::from_secs(1);
83     const MAX_REPORT_TIMEOUT: Duration = Duration::from_secs(30);
84 
reset_noisy_timeout(&mut self)85     fn reset_noisy_timeout(&mut self) {
86         self.noisy_timeout = Self::MIN_REPORT_TIMEOUT;
87     }
88 
update_noisy_timeout(&mut self)89     fn update_noisy_timeout(&mut self) {
90         let noisy_update = self.noisy_timeout + Duration::from_secs(5);
91         self.noisy_timeout = min(Self::MAX_REPORT_TIMEOUT, noisy_update);
92     }
93 
overdue_and_next_timeout(&self) -> (bool, Option<Duration>)94     fn overdue_and_next_timeout(&self) -> (bool, Option<Duration>) {
95         let now = Instant::now();
96         let mut next_timeout: Option<Duration> = None;
97         let mut has_overdue = false;
98         for (_, r) in self.records.iter() {
99             let timeout = r.deadline.saturating_duration_since(now);
100             if timeout == Duration::new(0, 0) {
101                 // This timeout has passed.
102                 has_overdue = true;
103             } else {
104                 // This timeout is still to come; see if it's the closest one to now.
105                 next_timeout = match next_timeout {
106                     Some(nt) if timeout < nt => Some(timeout),
107                     Some(nt) => Some(nt),
108                     None => Some(timeout),
109                 };
110             }
111         }
112         (has_overdue, next_timeout)
113     }
114 
log_report(&mut self, has_overdue: bool)115     fn log_report(&mut self, has_overdue: bool) {
116         if !has_overdue {
117             // Nothing to report.
118             self.last_report = None;
119             return;
120         }
121         // Something to report...
122         if let Some(reported_at) = self.last_report {
123             if reported_at.elapsed() < self.noisy_timeout {
124                 // .. but it's too soon since the last report.
125                 self.last_report = None;
126                 return;
127             }
128         }
129         self.update_noisy_timeout();
130         self.last_report = Some(Instant::now());
131         log::warn!("### Keystore Watchdog report - BEGIN ###");
132 
133         let now = Instant::now();
134         let mut overdue_records: Vec<(&Index, &Record)> = self
135             .records
136             .iter()
137             .filter(|(_, r)| r.deadline.saturating_duration_since(now) == Duration::new(0, 0))
138             .collect();
139 
140         log::warn!("When extracting from a bug report, please include this header");
141         log::warn!("and all {} records below.", overdue_records.len());
142 
143         // Watch points can be nested, i.e., a single thread may have multiple armed
144         // watch points. And the most recent on each thread (thread recent) is closest to the point
145         // where something is blocked. Furthermore, keystore2 has various critical section
146         // and common backend resources KeyMint that can only be entered serialized. So if one
147         // thread hangs, the others will soon follow suite. Thus the oldest "thread recent" watch
148         // point is most likely pointing toward the culprit.
149         // Thus, sort by start time first.
150         overdue_records.sort_unstable_by(|(_, r1), (_, r2)| r1.started.cmp(&r2.started));
151         // Then we groups all of the watch points per thread preserving the order within
152         // groups.
153         let groups = overdue_records.iter().fold(
154             HashMap::<thread::ThreadId, Vec<(&Index, &Record)>>::new(),
155             |mut acc, (i, r)| {
156                 acc.entry(i.tid).or_default().push((i, r));
157                 acc
158             },
159         );
160         // Put the groups back into a vector.
161         let mut groups: Vec<Vec<(&Index, &Record)>> = groups.into_values().collect();
162         // Sort the groups by start time of the most recent (.last()) of each group.
163         // It is panic safe to use unwrap() here because we never add empty vectors to
164         // the map.
165         groups.sort_by(|v1, v2| v1.last().unwrap().1.started.cmp(&v2.last().unwrap().1.started));
166 
167         for g in groups.iter() {
168             for (i, r) in g.iter() {
169                 match &r.context {
170                     Some(ctx) => {
171                         log::warn!(
172                             "{:?} {} Pending: {:?} Overdue {:?} for {:?}",
173                             i.tid,
174                             i.id,
175                             r.started.elapsed(),
176                             r.deadline.elapsed(),
177                             ctx
178                         );
179                     }
180                     None => {
181                         log::warn!(
182                             "{:?} {} Pending: {:?} Overdue {:?}",
183                             i.tid,
184                             i.id,
185                             r.started.elapsed(),
186                             r.deadline.elapsed()
187                         );
188                     }
189                 }
190             }
191         }
192         log::warn!("### Keystore Watchdog report - END ###");
193     }
194 
disarm(&mut self, index: Index)195     fn disarm(&mut self, index: Index) {
196         let result = self.records.remove(&index);
197         if let Some(record) = result {
198             let now = Instant::now();
199             let timeout_left = record.deadline.saturating_duration_since(now);
200             if timeout_left == Duration::new(0, 0) {
201                 match &record.context {
202                     Some(ctx) => log::info!(
203                         "Watchdog complete for: {:?} {} Pending: {:?} Overdue {:?} for {:?}",
204                         index.tid,
205                         index.id,
206                         record.started.elapsed(),
207                         record.deadline.elapsed(),
208                         ctx
209                     ),
210                     None => log::info!(
211                         "Watchdog complete for: {:?} {} Pending: {:?} Overdue {:?}",
212                         index.tid,
213                         index.id,
214                         record.started.elapsed(),
215                         record.deadline.elapsed()
216                     ),
217                 }
218             }
219         }
220     }
221 
arm(&mut self, index: Index, record: Record)222     fn arm(&mut self, index: Index, record: Record) {
223         if self.records.insert(index.clone(), record).is_some() {
224             log::warn!("Recursive watchdog record at \"{:?}\" replaces previous record.", index);
225         }
226     }
227 }
228 
229 /// Watchdog spawns a thread that logs records of all overdue watch points when a deadline
230 /// is missed and at least every second as long as overdue watch points exist.
231 /// The thread terminates when idle for a given period of time.
232 pub struct Watchdog {
233     state: Arc<(Condvar, Mutex<WatchdogState>)>,
234 }
235 
236 impl Watchdog {
237     /// Construct a [`Watchdog`]. When `idle_timeout` has elapsed since the watchdog thread became
238     /// idle, i.e., there are no more active or overdue watch points, the watchdog thread
239     /// terminates.
new(idle_timeout: Duration) -> Arc<Self>240     pub fn new(idle_timeout: Duration) -> Arc<Self> {
241         Arc::new(Self {
242             state: Arc::new((
243                 Condvar::new(),
244                 Mutex::new(WatchdogState {
245                     state: State::NotRunning,
246                     thread: None,
247                     idle_timeout,
248                     records: HashMap::new(),
249                     last_report: None,
250                     noisy_timeout: WatchdogState::MIN_REPORT_TIMEOUT,
251                 }),
252             )),
253         })
254     }
255 
watch_with_optional( wd: Arc<Self>, context: Option<Box<dyn std::fmt::Debug + Send + 'static>>, id: &'static str, timeout: Duration, ) -> Option<WatchPoint>256     fn watch_with_optional(
257         wd: Arc<Self>,
258         context: Option<Box<dyn std::fmt::Debug + Send + 'static>>,
259         id: &'static str,
260         timeout: Duration,
261     ) -> Option<WatchPoint> {
262         let Some(deadline) = Instant::now().checked_add(timeout) else {
263             log::warn!("Deadline computation failed for WatchPoint \"{}\"", id);
264             log::warn!("WatchPoint not armed.");
265             return None;
266         };
267         wd.arm(context, id, deadline);
268         Some(WatchPoint { id, wd, not_send: Default::default() })
269     }
270 
271     /// Create a new watch point. If the WatchPoint is not dropped before the timeout
272     /// expires, a report is logged at least every second, which includes the id string
273     /// and any provided context.
watch_with( wd: &Arc<Self>, id: &'static str, timeout: Duration, context: impl std::fmt::Debug + Send + 'static, ) -> Option<WatchPoint>274     pub fn watch_with(
275         wd: &Arc<Self>,
276         id: &'static str,
277         timeout: Duration,
278         context: impl std::fmt::Debug + Send + 'static,
279     ) -> Option<WatchPoint> {
280         Self::watch_with_optional(wd.clone(), Some(Box::new(context)), id, timeout)
281     }
282 
283     /// Like `watch_with`, but without context.
watch(wd: &Arc<Self>, id: &'static str, timeout: Duration) -> Option<WatchPoint>284     pub fn watch(wd: &Arc<Self>, id: &'static str, timeout: Duration) -> Option<WatchPoint> {
285         Self::watch_with_optional(wd.clone(), None, id, timeout)
286     }
287 
arm( &self, context: Option<Box<dyn std::fmt::Debug + Send + 'static>>, id: &'static str, deadline: Instant, )288     fn arm(
289         &self,
290         context: Option<Box<dyn std::fmt::Debug + Send + 'static>>,
291         id: &'static str,
292         deadline: Instant,
293     ) {
294         let tid = thread::current().id();
295         let index = Index { tid, id };
296         let record = Record { started: Instant::now(), deadline, context };
297 
298         let (ref condvar, ref state) = *self.state;
299 
300         let mut state = state.lock().unwrap();
301         state.arm(index, record);
302 
303         if state.state != State::Running {
304             self.spawn_thread(&mut state);
305         }
306         drop(state);
307         condvar.notify_all();
308     }
309 
disarm(&self, id: &'static str)310     fn disarm(&self, id: &'static str) {
311         let tid = thread::current().id();
312         let index = Index { tid, id };
313         let (_, ref state) = *self.state;
314 
315         let mut state = state.lock().unwrap();
316         state.disarm(index);
317         // There is no need to notify condvar. There is no action required for the
318         // watchdog thread before the next deadline.
319     }
320 
spawn_thread(&self, state: &mut MutexGuard<WatchdogState>)321     fn spawn_thread(&self, state: &mut MutexGuard<WatchdogState>) {
322         if let Some(t) = state.thread.take() {
323             t.join().expect("Watchdog thread panicked.");
324         }
325 
326         let cloned_state = self.state.clone();
327 
328         state.thread = Some(thread::spawn(move || {
329             let (ref condvar, ref state) = *cloned_state;
330 
331             let mut state = state.lock().unwrap();
332 
333             loop {
334                 let (has_overdue, next_timeout) = state.overdue_and_next_timeout();
335                 state.log_report(has_overdue);
336 
337                 let (next_timeout, idle) = match (has_overdue, next_timeout) {
338                     (true, Some(next_timeout)) => (min(next_timeout, state.noisy_timeout), false),
339                     (true, None) => (state.noisy_timeout, false),
340                     (false, Some(next_timeout)) => (next_timeout, false),
341                     (false, None) => (state.idle_timeout, true),
342                 };
343 
344                 // Wait until the closest timeout pops, but use a condition variable so that if a
345                 // new watchpoint is started in the meanwhile it will interrupt the wait so we can
346                 // recalculate.
347                 let (s, timeout) = condvar.wait_timeout(state, next_timeout).unwrap();
348                 state = s;
349 
350                 if idle && timeout.timed_out() && state.records.is_empty() {
351                     state.reset_noisy_timeout();
352                     state.state = State::NotRunning;
353                     break;
354                 }
355             }
356             log::info!("Watchdog thread idle -> terminating. Have a great day.");
357         }));
358         state.state = State::Running;
359     }
360 }
361