1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
#include "host/libs/process_monitor/process_monitor.h"

#ifdef __linux__
#include <sys/prctl.h>
#endif

#include <sys/types.h>
#include <sys/wait.h>

#include <assert.h>
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#include <algorithm>
#include <atomic>
#include <cstdint>
#include <future>
#include <memory>
#include <string>
#include <thread>
#include <vector>

#include <android-base/file.h>
#include <android-base/logging.h>

#include "common/libs/fs/shared_buf.h"
#include "common/libs/fs/shared_select.h"
#include "common/libs/utils/contains.h"
#include "common/libs/utils/files.h"
#include "common/libs/utils/result.h"
#include "common/libs/utils/subprocess.h"
#include "host/libs/command_util/runner/defs.h"
#include "host/libs/command_util/util.h"
#include "host/libs/config/cuttlefish_config.h"
#include "host/libs/config/known_paths.h"
#include "host/libs/process_monitor/process_monitor_channel.h"
54
55 namespace cuttlefish {
56
57 namespace {
58
LogSubprocessExit(const std::string & name,pid_t pid,int wstatus)59 void LogSubprocessExit(const std::string& name, pid_t pid, int wstatus) {
60 LOG(INFO) << "Detected unexpected exit of monitored subprocess " << name;
61 if (WIFEXITED(wstatus)) {
62 LOG(INFO) << "Subprocess " << name << " (" << pid
63 << ") has exited with exit code " << WEXITSTATUS(wstatus);
64 } else if (WIFSIGNALED(wstatus)) {
65 int sig_num = WTERMSIG(wstatus);
66 LOG(ERROR) << "Subprocess " << name << " (" << pid
67 << ") was interrupted by a signal '" << strsignal(sig_num)
68 << "' (" << sig_num << ")";
69 } else {
70 LOG(INFO) << "subprocess " << name << " (" << pid
71 << ") has exited for unknown reasons";
72 }
73 }
74
LogSubprocessExit(const std::string & name,const siginfo_t & infop)75 void LogSubprocessExit(const std::string& name, const siginfo_t& infop) {
76 LOG(INFO) << "Detected unexpected exit of monitored subprocess " << name;
77 if (infop.si_code == CLD_EXITED) {
78 LOG(INFO) << "Subprocess " << name << " (" << infop.si_pid
79 << ") has exited with exit code " << infop.si_status;
80 } else if (infop.si_code == CLD_KILLED) {
81 LOG(ERROR) << "Subprocess " << name << " (" << infop.si_pid
82 << ") was interrupted by a signal '"
83 << strsignal(infop.si_status) << "' (" << infop.si_status << ")";
84 } else {
85 LOG(INFO) << "subprocess " << name << " (" << infop.si_pid
86 << ") has exited for unknown reasons (code = " << infop.si_code
87 << ", status = " << infop.si_status << ")";
88 }
89 }
90
MonitorLoop(std::atomic_bool & running,std::mutex & properties_mutex,const bool restart_subprocesses,std::vector<MonitorEntry> & monitored)91 Result<void> MonitorLoop(std::atomic_bool& running,
92 std::mutex& properties_mutex,
93 const bool restart_subprocesses,
94 std::vector<MonitorEntry>& monitored) {
95 while (running.load()) {
96 int wstatus;
97 pid_t pid = wait(&wstatus);
98 int error_num = errno;
99 CF_EXPECT(pid != -1, "Wait failed: " << strerror(error_num));
100 if (!WIFSIGNALED(wstatus) && !WIFEXITED(wstatus)) {
101 LOG(DEBUG) << "Unexpected status from wait: " << wstatus << " for pid "
102 << pid;
103 continue;
104 }
105 if (!running.load()) { // Avoid extra restarts near the end
106 break;
107 }
108 auto matches = [pid](const auto& it) { return it.proc->pid() == pid; };
109 std::unique_lock lock(properties_mutex);
110 auto it = std::find_if(monitored.begin(), monitored.end(), matches);
111 if (it == monitored.end()) {
112 LogSubprocessExit("(unknown)", pid, wstatus);
113 } else {
114 LogSubprocessExit(it->cmd->GetShortName(), it->proc->pid(), wstatus);
115 if (restart_subprocesses) {
116 auto options = SubprocessOptions().InGroup(true);
117 // in the future, cmd->Start might not run exec()
118 it->proc.reset(new Subprocess(it->cmd->Start(std::move(options))));
119 } else {
120 bool is_critical = it->is_critical;
121 monitored.erase(it);
122 if (running.load() && is_critical) {
123 LOG(ERROR) << "Stopping all monitored processes due to unexpected "
124 "exit of critical process";
125 running.store(false);
126 break;
127 }
128 }
129 }
130 }
131 return {};
132 }
133
StopSubprocesses(std::vector<MonitorEntry> & monitored)134 Result<void> StopSubprocesses(std::vector<MonitorEntry>& monitored) {
135 LOG(DEBUG) << "Stopping monitored subprocesses";
136 auto stop = [](const auto& it) {
137 auto stop_result = it.proc->Stop();
138 if (stop_result == StopperResult::kStopFailure) {
139 LOG(WARNING) << "Error in stopping \"" << it.cmd->GetShortName() << "\"";
140 return false;
141 }
142 siginfo_t infop;
143 auto success = it.proc->Wait(&infop, WEXITED);
144 if (success < 0) {
145 LOG(WARNING) << "Failed to wait for process " << it.cmd->GetShortName();
146 return false;
147 }
148 if (stop_result == StopperResult::kStopCrash) {
149 LogSubprocessExit(it.cmd->GetShortName(), infop);
150 }
151 return true;
152 };
153 // Processes were started in the order they appear in the vector, stop them in
154 // reverse order for symmetry.
155 size_t stopped = std::count_if(monitored.rbegin(), monitored.rend(), stop);
156 CF_EXPECT(stopped == monitored.size(), "Didn't stop all subprocesses");
157 return {};
158 }
159
SuspendResumeImpl(std::vector<MonitorEntry> & monitor_entries,std::mutex & properties_mutex,const SharedFD & channel_to_secure_env,const bool is_suspend,SharedFD child_monitor_socket)160 Result<void> SuspendResumeImpl(std::vector<MonitorEntry>& monitor_entries,
161 std::mutex& properties_mutex,
162 const SharedFD& channel_to_secure_env,
163 const bool is_suspend,
164 SharedFD child_monitor_socket) {
165 std::lock_guard lock(properties_mutex);
166 auto secure_env_itr = std::find_if(
167 monitor_entries.begin(), monitor_entries.end(), [](MonitorEntry& entry) {
168 auto prog_name = android::base::Basename(entry.cmd->Executable());
169 return (prog_name == "secure_env");
170 });
171 if (secure_env_itr != monitor_entries.end()) {
172 CF_EXPECT(channel_to_secure_env->IsOpen(),
173 "channel to secure_env is not open.");
174 run_cvd::ExtendedLauncherAction extended_action;
175 if (is_suspend) {
176 extended_action.mutable_suspend();
177 } else {
178 extended_action.mutable_resume();
179 }
180 CF_EXPECT(RunLauncherAction(channel_to_secure_env, extended_action,
181 std::nullopt));
182 }
183
184 for (const auto& entry : monitor_entries) {
185 if (!entry.cmd) {
186 LOG(ERROR) << "Monitor Entry has a nullptr for cmd.";
187 continue;
188 }
189 if (!entry.proc) {
190 LOG(ERROR) << "Monitor Entry has a nullptr for proc.";
191 continue;
192 }
193 auto prog_name = android::base::Basename(entry.cmd->Executable());
194 auto process_restart_bin =
195 android::base::Basename(ProcessRestarterBinary());
196 if (prog_name == "log_tee") {
197 // Don't stop log_tee, we want to continue processing logs while
198 // suspended.
199 continue;
200 }
201 if (prog_name == "wmediumd") {
202 // wmediumd should be running while openWRT is saved using the
203 // guest snapshot logic
204 continue;
205 }
206 if (prog_name == "secure_env") {
207 // secure_env was handled above in a customized way
208 continue;
209 }
210
211 if (process_restart_bin == prog_name) {
212 if (is_suspend) {
213 CF_EXPECT(entry.proc->SendSignal(SIGTSTP));
214 } else {
215 CF_EXPECT(entry.proc->SendSignal(SIGCONT));
216 }
217 continue;
218 }
219 if (is_suspend) {
220 CF_EXPECT(entry.proc->SendSignalToGroup(SIGTSTP));
221 } else {
222 CF_EXPECT(entry.proc->SendSignalToGroup(SIGCONT));
223 }
224 }
225 using process_monitor_impl::ChildToParentResponse;
226 using process_monitor_impl::ChildToParentResponseType;
227 ChildToParentResponse response(ChildToParentResponseType::kSuccess);
228 CF_EXPECT(response.Write(child_monitor_socket));
229 return {};
230 }
231
232 } // namespace
233
StartSubprocesses(ProcessMonitor::Properties & properties)234 Result<void> ProcessMonitor::StartSubprocesses(
235 ProcessMonitor::Properties& properties) {
236 LOG(DEBUG) << "Starting monitored subprocesses";
237 for (auto& monitored : properties.entries_) {
238 LOG(INFO) << monitored.cmd->GetShortName();
239 auto options = SubprocessOptions().InGroup(true);
240 std::string short_name = monitored.cmd->GetShortName();
241 auto last_slash = short_name.find_last_of('/');
242 if (last_slash != std::string::npos) {
243 short_name = short_name.substr(last_slash + 1);
244 }
245 if (Contains(properties_.strace_commands_, short_name)) {
246 options.Strace(properties.strace_log_dir_ + "/strace-" + short_name);
247 }
248 monitored.proc.reset(
249 new Subprocess(monitored.cmd->Start(std::move(options))));
250 CF_EXPECT(monitored.proc->Started(), "Failed to start subprocess");
251 }
252 return {};
253 }
254
ReadMonitorSocketLoop(std::atomic_bool & running)255 Result<void> ProcessMonitor::ReadMonitorSocketLoop(std::atomic_bool& running) {
256 LOG(DEBUG) << "Waiting for a `stop` message from the parent";
257 while (running.load()) {
258 using process_monitor_impl::ParentToChildMessage;
259 auto message = CF_EXPECT(ParentToChildMessage::Read(child_monitor_socket_));
260 if (message.Stop()) {
261 running.store(false);
262 // Wake up the wait() loop by giving it an exited child process
263 if (fork() == 0) {
264 std::exit(0);
265 }
266 // will break the for-loop as running is now false
267 continue;
268 }
269 using process_monitor_impl::ParentToChildMessageType;
270 if (message.Type() == ParentToChildMessageType::kHostSuspend) {
271 CF_EXPECT(SuspendHostProcessesImpl());
272 continue;
273 }
274 if (message.Type() == ParentToChildMessageType::kHostResume) {
275 CF_EXPECT(ResumeHostProcessesImpl());
276 continue;
277 }
278 }
279 return {};
280 }
281
SuspendHostProcessesImpl()282 Result<void> ProcessMonitor::SuspendHostProcessesImpl() {
283 CF_EXPECT(SuspendResumeImpl(properties_.entries_, properties_mutex_,
284 channel_to_secure_env_, /* is_suspend */ true,
285 child_monitor_socket_),
286 "Failed suspend");
287 return {};
288 }
289
ResumeHostProcessesImpl()290 Result<void> ProcessMonitor::ResumeHostProcessesImpl() {
291 CF_EXPECT(SuspendResumeImpl(properties_.entries_, properties_mutex_,
292 channel_to_secure_env_, /* is_suspend */ false,
293 child_monitor_socket_),
294 "Failed resume");
295 return {};
296 }
297
RestartSubprocesses(bool r)298 ProcessMonitor::Properties& ProcessMonitor::Properties::RestartSubprocesses(
299 bool r) & {
300 restart_subprocesses_ = r;
301 return *this;
302 }
303
AddCommand(MonitorCommand cmd)304 ProcessMonitor::Properties& ProcessMonitor::Properties::AddCommand(
305 MonitorCommand cmd) & {
306 entries_.emplace_back(std::move(cmd.command), cmd.is_critical);
307 return *this;
308 }
309
StraceCommands(std::set<std::string> strace)310 ProcessMonitor::Properties& ProcessMonitor::Properties::StraceCommands(
311 std::set<std::string> strace) & {
312 strace_commands_ = std::move(strace);
313 return *this;
314 }
315
StraceLogDir(std::string log_dir)316 ProcessMonitor::Properties& ProcessMonitor::Properties::StraceLogDir(
317 std::string log_dir) & {
318 strace_log_dir_ = std::move(log_dir);
319 return *this;
320 }
321
ProcessMonitor(ProcessMonitor::Properties && properties,const SharedFD & secure_env_fd)322 ProcessMonitor::ProcessMonitor(ProcessMonitor::Properties&& properties,
323 const SharedFD& secure_env_fd)
324 : properties_(std::move(properties)),
325 channel_to_secure_env_(secure_env_fd),
326 monitor_(-1) {}
327
StopMonitoredProcesses()328 Result<void> ProcessMonitor::StopMonitoredProcesses() {
329 CF_EXPECT(monitor_ != -1, "The monitor process has already exited.");
330 CF_EXPECT(parent_monitor_socket_->IsOpen(),
331 "The monitor socket is already closed");
332 using process_monitor_impl::ParentToChildMessage;
333 using process_monitor_impl::ParentToChildMessageType;
334 ParentToChildMessage message(ParentToChildMessageType::kStop);
335 CF_EXPECT(message.Write(parent_monitor_socket_));
336
337 pid_t last_monitor = monitor_;
338 monitor_ = -1;
339 parent_monitor_socket_->Close();
340 int wstatus;
341 CF_EXPECT(waitpid(last_monitor, &wstatus, 0) == last_monitor,
342 "Failed to wait for monitor process");
343 CF_EXPECT(!WIFSIGNALED(wstatus), "Monitor process exited due to a signal");
344 CF_EXPECT(WIFEXITED(wstatus), "Monitor process exited for unknown reasons");
345 CF_EXPECT(WEXITSTATUS(wstatus) == 0,
346 "Monitor process exited with code " << WEXITSTATUS(wstatus));
347 return {};
348 }
349
SuspendMonitoredProcesses()350 Result<void> ProcessMonitor::SuspendMonitoredProcesses() {
351 CF_EXPECT(monitor_ != -1, "The monitor process has already exited.");
352 CF_EXPECT(parent_monitor_socket_->IsOpen(),
353 "The monitor socket is already closed");
354 using process_monitor_impl::ParentToChildMessage;
355 using process_monitor_impl::ParentToChildMessageType;
356 ParentToChildMessage message(ParentToChildMessageType::kHostSuspend);
357 CF_EXPECT(message.Write(parent_monitor_socket_));
358 using process_monitor_impl::ChildToParentResponse;
359 auto response =
360 CF_EXPECT(ChildToParentResponse::Read(parent_monitor_socket_));
361 CF_EXPECT(response.Success(),
362 "On kHostSuspend, the child run_cvd returned kFailure.");
363 return {};
364 }
365
ResumeMonitoredProcesses()366 Result<void> ProcessMonitor::ResumeMonitoredProcesses() {
367 CF_EXPECT(monitor_ != -1, "The monitor process has already exited.");
368 CF_EXPECT(parent_monitor_socket_->IsOpen(),
369 "The monitor socket is already closed");
370 using process_monitor_impl::ParentToChildMessage;
371 using process_monitor_impl::ParentToChildMessageType;
372 ParentToChildMessage message(ParentToChildMessageType::kHostResume);
373 CF_EXPECT(message.Write(parent_monitor_socket_));
374 using process_monitor_impl::ChildToParentResponse;
375 auto response =
376 CF_EXPECT(ChildToParentResponse::Read(parent_monitor_socket_));
377 CF_EXPECT(response.Success(),
378 "On kHostResume, the child run_cvd returned kFailure.");
379 return {};
380 }
381
StartAndMonitorProcesses()382 Result<void> ProcessMonitor::StartAndMonitorProcesses() {
383 CF_EXPECT(monitor_ == -1, "The monitor process was already started");
384 CF_EXPECT(!parent_monitor_socket_->IsOpen(),
385 "Parent monitor socket was already opened");
386 SharedFD parent_sock;
387 SharedFD child_sock;
388 SharedFD::SocketPair(AF_UNIX, SOCK_STREAM, 0, &parent_sock, &child_sock);
389 monitor_ = fork();
390 if (monitor_ == 0) {
391 child_monitor_socket_ = std::move(child_sock);
392 parent_sock->Close();
393 auto monitor_result = MonitorRoutine();
394 if (!monitor_result.ok()) {
395 LOG(ERROR) << "Monitoring processes failed:\n"
396 << monitor_result.error().FormatForEnv();
397 }
398 std::exit(monitor_result.ok() ? 0 : 1);
399 } else {
400 parent_monitor_socket_ = std::move(parent_sock);
401 child_sock->Close();
402 return {};
403 }
404 }
405
MonitorRoutine()406 Result<void> ProcessMonitor::MonitorRoutine() {
407 #ifdef __linux__
408 // Make this process a subreaper to reliably catch subprocess exits.
409 // See https://man7.org/linux/man-pages/man2/prctl.2.html
410 prctl(PR_SET_CHILD_SUBREAPER, 1);
411 prctl(PR_SET_PDEATHSIG, SIGHUP); // Die when parent dies
412 #endif
413
414 LOG(DEBUG) << "Monitoring subprocesses";
415 CF_EXPECT(StartSubprocesses(properties_));
416
417 std::atomic_bool running(true);
418
419 auto read_monitor_socket_loop =
420 [this](std::atomic_bool& running) -> Result<void> {
421 CF_EXPECT(this->ReadMonitorSocketLoop(running));
422 return {};
423 };
424 auto parent_comms = std::async(std::launch::async, read_monitor_socket_loop,
425 std::ref(running));
426
427 CF_EXPECT(MonitorLoop(running, properties_mutex_,
428 properties_.restart_subprocesses_,
429 properties_.entries_));
430 CF_EXPECT(parent_comms.get(), "Should have exited if monitoring stopped");
431
432 CF_EXPECT(StopSubprocesses(properties_.entries_));
433 LOG(DEBUG) << "Done monitoring subprocesses";
434 return {};
435 }
436
437 } // namespace cuttlefish
438