1 // Copyright 2021, The Android Open Source Project 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Can be removed when instrumentations are added to keystore. 16 #![allow(dead_code)] 17 18 //! This module implements a watchdog thread. 19 20 use std::{ 21 cmp::min, 22 collections::HashMap, 23 sync::Arc, 24 sync::{Condvar, Mutex, MutexGuard}, 25 thread, 26 }; 27 use std::{ 28 marker::PhantomData, 29 time::{Duration, Instant}, 30 }; 31 32 #[cfg(test)] 33 mod tests; 34 35 /// Represents a Watchdog record. It can be created with `Watchdog::watch` or 36 /// `Watchdog::watch_with`. It disarms the record when dropped. 37 pub struct WatchPoint { 38 id: &'static str, 39 wd: Arc<Watchdog>, 40 not_send: PhantomData<*mut ()>, // WatchPoint must not be Send. 41 } 42 43 impl Drop for WatchPoint { drop(&mut self)44 fn drop(&mut self) { 45 self.wd.disarm(self.id) 46 } 47 } 48 49 #[derive(Debug, PartialEq, Eq)] 50 enum State { 51 NotRunning, 52 Running, 53 } 54 55 #[derive(Debug, Clone, Hash, PartialEq, Eq)] 56 struct Index { 57 tid: thread::ThreadId, 58 id: &'static str, 59 } 60 61 struct Record { 62 started: Instant, 63 deadline: Instant, 64 context: Option<Box<dyn std::fmt::Debug + Send + 'static>>, 65 } 66 67 struct WatchdogState { 68 state: State, 69 thread: Option<thread::JoinHandle<()>>, 70 /// How long to wait before dropping the watchdog thread when idle. 71 idle_timeout: Duration, 72 records: HashMap<Index, Record>, 73 last_report: Option<Instant>, 74 noisy_timeout: Duration, 75 } 76 77 impl WatchdogState { 78 /// If we have overdue records, we want to log them but slowly backoff 79 /// so that we do not clog the logs. We start with logs every 80 /// `MIN_REPORT_TIMEOUT` sec then increment the timeout by 5 up 81 /// to a maximum of `MAX_REPORT_TIMEOUT`. 82 const MIN_REPORT_TIMEOUT: Duration = Duration::from_secs(1); 83 const MAX_REPORT_TIMEOUT: Duration = Duration::from_secs(30); 84 reset_noisy_timeout(&mut self)85 fn reset_noisy_timeout(&mut self) { 86 self.noisy_timeout = Self::MIN_REPORT_TIMEOUT; 87 } 88 update_noisy_timeout(&mut self)89 fn update_noisy_timeout(&mut self) { 90 let noisy_update = self.noisy_timeout + Duration::from_secs(5); 91 self.noisy_timeout = min(Self::MAX_REPORT_TIMEOUT, noisy_update); 92 } 93 overdue_and_next_timeout(&self) -> (bool, Option<Duration>)94 fn overdue_and_next_timeout(&self) -> (bool, Option<Duration>) { 95 let now = Instant::now(); 96 let mut next_timeout: Option<Duration> = None; 97 let mut has_overdue = false; 98 for (_, r) in self.records.iter() { 99 let timeout = r.deadline.saturating_duration_since(now); 100 if timeout == Duration::new(0, 0) { 101 // This timeout has passed. 102 has_overdue = true; 103 } else { 104 // This timeout is still to come; see if it's the closest one to now. 105 next_timeout = match next_timeout { 106 Some(nt) if timeout < nt => Some(timeout), 107 Some(nt) => Some(nt), 108 None => Some(timeout), 109 }; 110 } 111 } 112 (has_overdue, next_timeout) 113 } 114 log_report(&mut self, has_overdue: bool)115 fn log_report(&mut self, has_overdue: bool) { 116 if !has_overdue { 117 // Nothing to report. 118 self.last_report = None; 119 return; 120 } 121 // Something to report... 122 if let Some(reported_at) = self.last_report { 123 if reported_at.elapsed() < self.noisy_timeout { 124 // .. but it's too soon since the last report. 125 self.last_report = None; 126 return; 127 } 128 } 129 self.update_noisy_timeout(); 130 self.last_report = Some(Instant::now()); 131 log::warn!("### Keystore Watchdog report - BEGIN ###"); 132 133 let now = Instant::now(); 134 let mut overdue_records: Vec<(&Index, &Record)> = self 135 .records 136 .iter() 137 .filter(|(_, r)| r.deadline.saturating_duration_since(now) == Duration::new(0, 0)) 138 .collect(); 139 140 log::warn!("When extracting from a bug report, please include this header"); 141 log::warn!("and all {} records below.", overdue_records.len()); 142 143 // Watch points can be nested, i.e., a single thread may have multiple armed 144 // watch points. And the most recent on each thread (thread recent) is closest to the point 145 // where something is blocked. Furthermore, keystore2 has various critical section 146 // and common backend resources KeyMint that can only be entered serialized. So if one 147 // thread hangs, the others will soon follow suite. Thus the oldest "thread recent" watch 148 // point is most likely pointing toward the culprit. 149 // Thus, sort by start time first. 150 overdue_records.sort_unstable_by(|(_, r1), (_, r2)| r1.started.cmp(&r2.started)); 151 // Then we groups all of the watch points per thread preserving the order within 152 // groups. 153 let groups = overdue_records.iter().fold( 154 HashMap::<thread::ThreadId, Vec<(&Index, &Record)>>::new(), 155 |mut acc, (i, r)| { 156 acc.entry(i.tid).or_default().push((i, r)); 157 acc 158 }, 159 ); 160 // Put the groups back into a vector. 161 let mut groups: Vec<Vec<(&Index, &Record)>> = groups.into_values().collect(); 162 // Sort the groups by start time of the most recent (.last()) of each group. 163 // It is panic safe to use unwrap() here because we never add empty vectors to 164 // the map. 165 groups.sort_by(|v1, v2| v1.last().unwrap().1.started.cmp(&v2.last().unwrap().1.started)); 166 167 for g in groups.iter() { 168 for (i, r) in g.iter() { 169 match &r.context { 170 Some(ctx) => { 171 log::warn!( 172 "{:?} {} Pending: {:?} Overdue {:?} for {:?}", 173 i.tid, 174 i.id, 175 r.started.elapsed(), 176 r.deadline.elapsed(), 177 ctx 178 ); 179 } 180 None => { 181 log::warn!( 182 "{:?} {} Pending: {:?} Overdue {:?}", 183 i.tid, 184 i.id, 185 r.started.elapsed(), 186 r.deadline.elapsed() 187 ); 188 } 189 } 190 } 191 } 192 log::warn!("### Keystore Watchdog report - END ###"); 193 } 194 disarm(&mut self, index: Index)195 fn disarm(&mut self, index: Index) { 196 let result = self.records.remove(&index); 197 if let Some(record) = result { 198 let now = Instant::now(); 199 let timeout_left = record.deadline.saturating_duration_since(now); 200 if timeout_left == Duration::new(0, 0) { 201 match &record.context { 202 Some(ctx) => log::info!( 203 "Watchdog complete for: {:?} {} Pending: {:?} Overdue {:?} for {:?}", 204 index.tid, 205 index.id, 206 record.started.elapsed(), 207 record.deadline.elapsed(), 208 ctx 209 ), 210 None => log::info!( 211 "Watchdog complete for: {:?} {} Pending: {:?} Overdue {:?}", 212 index.tid, 213 index.id, 214 record.started.elapsed(), 215 record.deadline.elapsed() 216 ), 217 } 218 } 219 } 220 } 221 arm(&mut self, index: Index, record: Record)222 fn arm(&mut self, index: Index, record: Record) { 223 if self.records.insert(index.clone(), record).is_some() { 224 log::warn!("Recursive watchdog record at \"{:?}\" replaces previous record.", index); 225 } 226 } 227 } 228 229 /// Watchdog spawns a thread that logs records of all overdue watch points when a deadline 230 /// is missed and at least every second as long as overdue watch points exist. 231 /// The thread terminates when idle for a given period of time. 232 pub struct Watchdog { 233 state: Arc<(Condvar, Mutex<WatchdogState>)>, 234 } 235 236 impl Watchdog { 237 /// Construct a [`Watchdog`]. When `idle_timeout` has elapsed since the watchdog thread became 238 /// idle, i.e., there are no more active or overdue watch points, the watchdog thread 239 /// terminates. new(idle_timeout: Duration) -> Arc<Self>240 pub fn new(idle_timeout: Duration) -> Arc<Self> { 241 Arc::new(Self { 242 state: Arc::new(( 243 Condvar::new(), 244 Mutex::new(WatchdogState { 245 state: State::NotRunning, 246 thread: None, 247 idle_timeout, 248 records: HashMap::new(), 249 last_report: None, 250 noisy_timeout: WatchdogState::MIN_REPORT_TIMEOUT, 251 }), 252 )), 253 }) 254 } 255 watch_with_optional( wd: Arc<Self>, context: Option<Box<dyn std::fmt::Debug + Send + 'static>>, id: &'static str, timeout: Duration, ) -> Option<WatchPoint>256 fn watch_with_optional( 257 wd: Arc<Self>, 258 context: Option<Box<dyn std::fmt::Debug + Send + 'static>>, 259 id: &'static str, 260 timeout: Duration, 261 ) -> Option<WatchPoint> { 262 let Some(deadline) = Instant::now().checked_add(timeout) else { 263 log::warn!("Deadline computation failed for WatchPoint \"{}\"", id); 264 log::warn!("WatchPoint not armed."); 265 return None; 266 }; 267 wd.arm(context, id, deadline); 268 Some(WatchPoint { id, wd, not_send: Default::default() }) 269 } 270 271 /// Create a new watch point. If the WatchPoint is not dropped before the timeout 272 /// expires, a report is logged at least every second, which includes the id string 273 /// and any provided context. watch_with( wd: &Arc<Self>, id: &'static str, timeout: Duration, context: impl std::fmt::Debug + Send + 'static, ) -> Option<WatchPoint>274 pub fn watch_with( 275 wd: &Arc<Self>, 276 id: &'static str, 277 timeout: Duration, 278 context: impl std::fmt::Debug + Send + 'static, 279 ) -> Option<WatchPoint> { 280 Self::watch_with_optional(wd.clone(), Some(Box::new(context)), id, timeout) 281 } 282 283 /// Like `watch_with`, but without context. watch(wd: &Arc<Self>, id: &'static str, timeout: Duration) -> Option<WatchPoint>284 pub fn watch(wd: &Arc<Self>, id: &'static str, timeout: Duration) -> Option<WatchPoint> { 285 Self::watch_with_optional(wd.clone(), None, id, timeout) 286 } 287 arm( &self, context: Option<Box<dyn std::fmt::Debug + Send + 'static>>, id: &'static str, deadline: Instant, )288 fn arm( 289 &self, 290 context: Option<Box<dyn std::fmt::Debug + Send + 'static>>, 291 id: &'static str, 292 deadline: Instant, 293 ) { 294 let tid = thread::current().id(); 295 let index = Index { tid, id }; 296 let record = Record { started: Instant::now(), deadline, context }; 297 298 let (ref condvar, ref state) = *self.state; 299 300 let mut state = state.lock().unwrap(); 301 state.arm(index, record); 302 303 if state.state != State::Running { 304 self.spawn_thread(&mut state); 305 } 306 drop(state); 307 condvar.notify_all(); 308 } 309 disarm(&self, id: &'static str)310 fn disarm(&self, id: &'static str) { 311 let tid = thread::current().id(); 312 let index = Index { tid, id }; 313 let (_, ref state) = *self.state; 314 315 let mut state = state.lock().unwrap(); 316 state.disarm(index); 317 // There is no need to notify condvar. There is no action required for the 318 // watchdog thread before the next deadline. 319 } 320 spawn_thread(&self, state: &mut MutexGuard<WatchdogState>)321 fn spawn_thread(&self, state: &mut MutexGuard<WatchdogState>) { 322 if let Some(t) = state.thread.take() { 323 t.join().expect("Watchdog thread panicked."); 324 } 325 326 let cloned_state = self.state.clone(); 327 328 state.thread = Some(thread::spawn(move || { 329 let (ref condvar, ref state) = *cloned_state; 330 331 let mut state = state.lock().unwrap(); 332 333 loop { 334 let (has_overdue, next_timeout) = state.overdue_and_next_timeout(); 335 state.log_report(has_overdue); 336 337 let (next_timeout, idle) = match (has_overdue, next_timeout) { 338 (true, Some(next_timeout)) => (min(next_timeout, state.noisy_timeout), false), 339 (true, None) => (state.noisy_timeout, false), 340 (false, Some(next_timeout)) => (next_timeout, false), 341 (false, None) => (state.idle_timeout, true), 342 }; 343 344 // Wait until the closest timeout pops, but use a condition variable so that if a 345 // new watchpoint is started in the meanwhile it will interrupt the wait so we can 346 // recalculate. 347 let (s, timeout) = condvar.wait_timeout(state, next_timeout).unwrap(); 348 state = s; 349 350 if idle && timeout.timed_out() && state.records.is_empty() { 351 state.reset_noisy_timeout(); 352 state.state = State::NotRunning; 353 break; 354 } 355 } 356 log::info!("Watchdog thread idle -> terminating. Have a great day."); 357 })); 358 state.state = State::Running; 359 } 360 } 361