1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "host/libs/process_monitor/process_monitor.h"
18 
19 #ifdef __linux__
20 #include <sys/prctl.h>
21 #endif
22 
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#include <assert.h>
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
30 
31 #include <algorithm>
32 #include <atomic>
33 #include <cstdint>
34 #include <future>
35 #include <memory>
36 #include <string>
37 #include <thread>
38 #include <vector>
39 
40 #include <android-base/file.h>
41 #include <android-base/logging.h>
42 
43 #include "common/libs/fs/shared_buf.h"
44 #include "common/libs/fs/shared_select.h"
45 #include "common/libs/utils/contains.h"
46 #include "common/libs/utils/files.h"
47 #include "common/libs/utils/result.h"
48 #include "common/libs/utils/subprocess.h"
49 #include "host/libs/command_util/runner/defs.h"
50 #include "host/libs/command_util/util.h"
51 #include "host/libs/config/cuttlefish_config.h"
52 #include "host/libs/config/known_paths.h"
53 #include "host/libs/process_monitor/process_monitor_channel.h"
54 
55 namespace cuttlefish {
56 
57 namespace {
58 
LogSubprocessExit(const std::string & name,pid_t pid,int wstatus)59 void LogSubprocessExit(const std::string& name, pid_t pid, int wstatus) {
60   LOG(INFO) << "Detected unexpected exit of monitored subprocess " << name;
61   if (WIFEXITED(wstatus)) {
62     LOG(INFO) << "Subprocess " << name << " (" << pid
63               << ") has exited with exit code " << WEXITSTATUS(wstatus);
64   } else if (WIFSIGNALED(wstatus)) {
65     int sig_num = WTERMSIG(wstatus);
66     LOG(ERROR) << "Subprocess " << name << " (" << pid
67                << ") was interrupted by a signal '" << strsignal(sig_num)
68                << "' (" << sig_num << ")";
69   } else {
70     LOG(INFO) << "subprocess " << name << " (" << pid
71               << ") has exited for unknown reasons";
72   }
73 }
74 
LogSubprocessExit(const std::string & name,const siginfo_t & infop)75 void LogSubprocessExit(const std::string& name, const siginfo_t& infop) {
76   LOG(INFO) << "Detected unexpected exit of monitored subprocess " << name;
77   if (infop.si_code == CLD_EXITED) {
78     LOG(INFO) << "Subprocess " << name << " (" << infop.si_pid
79               << ") has exited with exit code " << infop.si_status;
80   } else if (infop.si_code == CLD_KILLED) {
81     LOG(ERROR) << "Subprocess " << name << " (" << infop.si_pid
82                << ") was interrupted by a signal '"
83                << strsignal(infop.si_status) << "' (" << infop.si_status << ")";
84   } else {
85     LOG(INFO) << "subprocess " << name << " (" << infop.si_pid
86               << ") has exited for unknown reasons (code = " << infop.si_code
87               << ", status = " << infop.si_status << ")";
88   }
89 }
90 
MonitorLoop(std::atomic_bool & running,std::mutex & properties_mutex,const bool restart_subprocesses,std::vector<MonitorEntry> & monitored)91 Result<void> MonitorLoop(std::atomic_bool& running,
92                          std::mutex& properties_mutex,
93                          const bool restart_subprocesses,
94                          std::vector<MonitorEntry>& monitored) {
95   while (running.load()) {
96     int wstatus;
97     pid_t pid = wait(&wstatus);
98     int error_num = errno;
99     CF_EXPECT(pid != -1, "Wait failed: " << strerror(error_num));
100     if (!WIFSIGNALED(wstatus) && !WIFEXITED(wstatus)) {
101       LOG(DEBUG) << "Unexpected status from wait: " << wstatus << " for pid "
102                  << pid;
103       continue;
104     }
105     if (!running.load()) {  // Avoid extra restarts near the end
106       break;
107     }
108     auto matches = [pid](const auto& it) { return it.proc->pid() == pid; };
109     std::unique_lock lock(properties_mutex);
110     auto it = std::find_if(monitored.begin(), monitored.end(), matches);
111     if (it == monitored.end()) {
112       LogSubprocessExit("(unknown)", pid, wstatus);
113     } else {
114       LogSubprocessExit(it->cmd->GetShortName(), it->proc->pid(), wstatus);
115       if (restart_subprocesses) {
116         auto options = SubprocessOptions().InGroup(true);
117         // in the future, cmd->Start might not run exec()
118         it->proc.reset(new Subprocess(it->cmd->Start(std::move(options))));
119       } else {
120         bool is_critical = it->is_critical;
121         monitored.erase(it);
122         if (running.load() && is_critical) {
123           LOG(ERROR) << "Stopping all monitored processes due to unexpected "
124                         "exit of critical process";
125           running.store(false);
126           break;
127         }
128       }
129     }
130   }
131   return {};
132 }
133 
StopSubprocesses(std::vector<MonitorEntry> & monitored)134 Result<void> StopSubprocesses(std::vector<MonitorEntry>& monitored) {
135   LOG(DEBUG) << "Stopping monitored subprocesses";
136   auto stop = [](const auto& it) {
137     auto stop_result = it.proc->Stop();
138     if (stop_result == StopperResult::kStopFailure) {
139       LOG(WARNING) << "Error in stopping \"" << it.cmd->GetShortName() << "\"";
140       return false;
141     }
142     siginfo_t infop;
143     auto success = it.proc->Wait(&infop, WEXITED);
144     if (success < 0) {
145       LOG(WARNING) << "Failed to wait for process " << it.cmd->GetShortName();
146       return false;
147     }
148     if (stop_result == StopperResult::kStopCrash) {
149       LogSubprocessExit(it.cmd->GetShortName(), infop);
150     }
151     return true;
152   };
153   // Processes were started in the order they appear in the vector, stop them in
154   // reverse order for symmetry.
155   size_t stopped = std::count_if(monitored.rbegin(), monitored.rend(), stop);
156   CF_EXPECT(stopped == monitored.size(), "Didn't stop all subprocesses");
157   return {};
158 }
159 
SuspendResumeImpl(std::vector<MonitorEntry> & monitor_entries,std::mutex & properties_mutex,const SharedFD & channel_to_secure_env,const bool is_suspend,SharedFD child_monitor_socket)160 Result<void> SuspendResumeImpl(std::vector<MonitorEntry>& monitor_entries,
161                                std::mutex& properties_mutex,
162                                const SharedFD& channel_to_secure_env,
163                                const bool is_suspend,
164                                SharedFD child_monitor_socket) {
165   std::lock_guard lock(properties_mutex);
166   auto secure_env_itr = std::find_if(
167       monitor_entries.begin(), monitor_entries.end(), [](MonitorEntry& entry) {
168         auto prog_name = android::base::Basename(entry.cmd->Executable());
169         return (prog_name == "secure_env");
170       });
171   if (secure_env_itr != monitor_entries.end()) {
172     CF_EXPECT(channel_to_secure_env->IsOpen(),
173               "channel to secure_env is not open.");
174     run_cvd::ExtendedLauncherAction extended_action;
175     if (is_suspend) {
176       extended_action.mutable_suspend();
177     } else {
178       extended_action.mutable_resume();
179     }
180     CF_EXPECT(RunLauncherAction(channel_to_secure_env, extended_action,
181                                 std::nullopt));
182   }
183 
184   for (const auto& entry : monitor_entries) {
185     if (!entry.cmd) {
186       LOG(ERROR) << "Monitor Entry has a nullptr for cmd.";
187       continue;
188     }
189     if (!entry.proc) {
190       LOG(ERROR) << "Monitor Entry has a nullptr for proc.";
191       continue;
192     }
193     auto prog_name = android::base::Basename(entry.cmd->Executable());
194     auto process_restart_bin =
195         android::base::Basename(ProcessRestarterBinary());
196     if (prog_name == "log_tee") {
197       // Don't stop log_tee, we want to continue processing logs while
198       // suspended.
199       continue;
200     }
201     if (prog_name == "wmediumd") {
202       // wmediumd should be running while openWRT is saved using the
203       // guest snapshot logic
204       continue;
205     }
206     if (prog_name == "secure_env") {
207       // secure_env was handled above in a customized way
208       continue;
209     }
210 
211     if (process_restart_bin == prog_name) {
212       if (is_suspend) {
213         CF_EXPECT(entry.proc->SendSignal(SIGTSTP));
214       } else {
215         CF_EXPECT(entry.proc->SendSignal(SIGCONT));
216       }
217       continue;
218     }
219     if (is_suspend) {
220       CF_EXPECT(entry.proc->SendSignalToGroup(SIGTSTP));
221     } else {
222       CF_EXPECT(entry.proc->SendSignalToGroup(SIGCONT));
223     }
224   }
225   using process_monitor_impl::ChildToParentResponse;
226   using process_monitor_impl::ChildToParentResponseType;
227   ChildToParentResponse response(ChildToParentResponseType::kSuccess);
228   CF_EXPECT(response.Write(child_monitor_socket));
229   return {};
230 }
231 
232 }  // namespace
233 
StartSubprocesses(ProcessMonitor::Properties & properties)234 Result<void> ProcessMonitor::StartSubprocesses(
235     ProcessMonitor::Properties& properties) {
236   LOG(DEBUG) << "Starting monitored subprocesses";
237   for (auto& monitored : properties.entries_) {
238     LOG(INFO) << monitored.cmd->GetShortName();
239     auto options = SubprocessOptions().InGroup(true);
240     std::string short_name = monitored.cmd->GetShortName();
241     auto last_slash = short_name.find_last_of('/');
242     if (last_slash != std::string::npos) {
243       short_name = short_name.substr(last_slash + 1);
244     }
245     if (Contains(properties_.strace_commands_, short_name)) {
246       options.Strace(properties.strace_log_dir_ + "/strace-" + short_name);
247     }
248     monitored.proc.reset(
249         new Subprocess(monitored.cmd->Start(std::move(options))));
250     CF_EXPECT(monitored.proc->Started(), "Failed to start subprocess");
251   }
252   return {};
253 }
254 
ReadMonitorSocketLoop(std::atomic_bool & running)255 Result<void> ProcessMonitor::ReadMonitorSocketLoop(std::atomic_bool& running) {
256   LOG(DEBUG) << "Waiting for a `stop` message from the parent";
257   while (running.load()) {
258     using process_monitor_impl::ParentToChildMessage;
259     auto message = CF_EXPECT(ParentToChildMessage::Read(child_monitor_socket_));
260     if (message.Stop()) {
261       running.store(false);
262       // Wake up the wait() loop by giving it an exited child process
263       if (fork() == 0) {
264         std::exit(0);
265       }
266       // will break the for-loop as running is now false
267       continue;
268     }
269     using process_monitor_impl::ParentToChildMessageType;
270     if (message.Type() == ParentToChildMessageType::kHostSuspend) {
271       CF_EXPECT(SuspendHostProcessesImpl());
272       continue;
273     }
274     if (message.Type() == ParentToChildMessageType::kHostResume) {
275       CF_EXPECT(ResumeHostProcessesImpl());
276       continue;
277     }
278   }
279   return {};
280 }
281 
SuspendHostProcessesImpl()282 Result<void> ProcessMonitor::SuspendHostProcessesImpl() {
283   CF_EXPECT(SuspendResumeImpl(properties_.entries_, properties_mutex_,
284                               channel_to_secure_env_, /* is_suspend */ true,
285                               child_monitor_socket_),
286             "Failed suspend");
287   return {};
288 }
289 
ResumeHostProcessesImpl()290 Result<void> ProcessMonitor::ResumeHostProcessesImpl() {
291   CF_EXPECT(SuspendResumeImpl(properties_.entries_, properties_mutex_,
292                               channel_to_secure_env_, /* is_suspend */ false,
293                               child_monitor_socket_),
294             "Failed resume");
295   return {};
296 }
297 
RestartSubprocesses(bool r)298 ProcessMonitor::Properties& ProcessMonitor::Properties::RestartSubprocesses(
299     bool r) & {
300   restart_subprocesses_ = r;
301   return *this;
302 }
303 
AddCommand(MonitorCommand cmd)304 ProcessMonitor::Properties& ProcessMonitor::Properties::AddCommand(
305     MonitorCommand cmd) & {
306   entries_.emplace_back(std::move(cmd.command), cmd.is_critical);
307   return *this;
308 }
309 
StraceCommands(std::set<std::string> strace)310 ProcessMonitor::Properties& ProcessMonitor::Properties::StraceCommands(
311     std::set<std::string> strace) & {
312   strace_commands_ = std::move(strace);
313   return *this;
314 }
315 
StraceLogDir(std::string log_dir)316 ProcessMonitor::Properties& ProcessMonitor::Properties::StraceLogDir(
317     std::string log_dir) & {
318   strace_log_dir_ = std::move(log_dir);
319   return *this;
320 }
321 
ProcessMonitor(ProcessMonitor::Properties && properties,const SharedFD & secure_env_fd)322 ProcessMonitor::ProcessMonitor(ProcessMonitor::Properties&& properties,
323                                const SharedFD& secure_env_fd)
324     : properties_(std::move(properties)),
325       channel_to_secure_env_(secure_env_fd),
326       monitor_(-1) {}
327 
StopMonitoredProcesses()328 Result<void> ProcessMonitor::StopMonitoredProcesses() {
329   CF_EXPECT(monitor_ != -1, "The monitor process has already exited.");
330   CF_EXPECT(parent_monitor_socket_->IsOpen(),
331             "The monitor socket is already closed");
332   using process_monitor_impl::ParentToChildMessage;
333   using process_monitor_impl::ParentToChildMessageType;
334   ParentToChildMessage message(ParentToChildMessageType::kStop);
335   CF_EXPECT(message.Write(parent_monitor_socket_));
336 
337   pid_t last_monitor = monitor_;
338   monitor_ = -1;
339   parent_monitor_socket_->Close();
340   int wstatus;
341   CF_EXPECT(waitpid(last_monitor, &wstatus, 0) == last_monitor,
342             "Failed to wait for monitor process");
343   CF_EXPECT(!WIFSIGNALED(wstatus), "Monitor process exited due to a signal");
344   CF_EXPECT(WIFEXITED(wstatus), "Monitor process exited for unknown reasons");
345   CF_EXPECT(WEXITSTATUS(wstatus) == 0,
346             "Monitor process exited with code " << WEXITSTATUS(wstatus));
347   return {};
348 }
349 
SuspendMonitoredProcesses()350 Result<void> ProcessMonitor::SuspendMonitoredProcesses() {
351   CF_EXPECT(monitor_ != -1, "The monitor process has already exited.");
352   CF_EXPECT(parent_monitor_socket_->IsOpen(),
353             "The monitor socket is already closed");
354   using process_monitor_impl::ParentToChildMessage;
355   using process_monitor_impl::ParentToChildMessageType;
356   ParentToChildMessage message(ParentToChildMessageType::kHostSuspend);
357   CF_EXPECT(message.Write(parent_monitor_socket_));
358   using process_monitor_impl::ChildToParentResponse;
359   auto response =
360       CF_EXPECT(ChildToParentResponse::Read(parent_monitor_socket_));
361   CF_EXPECT(response.Success(),
362             "On kHostSuspend, the child run_cvd returned kFailure.");
363   return {};
364 }
365 
ResumeMonitoredProcesses()366 Result<void> ProcessMonitor::ResumeMonitoredProcesses() {
367   CF_EXPECT(monitor_ != -1, "The monitor process has already exited.");
368   CF_EXPECT(parent_monitor_socket_->IsOpen(),
369             "The monitor socket is already closed");
370   using process_monitor_impl::ParentToChildMessage;
371   using process_monitor_impl::ParentToChildMessageType;
372   ParentToChildMessage message(ParentToChildMessageType::kHostResume);
373   CF_EXPECT(message.Write(parent_monitor_socket_));
374   using process_monitor_impl::ChildToParentResponse;
375   auto response =
376       CF_EXPECT(ChildToParentResponse::Read(parent_monitor_socket_));
377   CF_EXPECT(response.Success(),
378             "On kHostResume, the child run_cvd returned kFailure.");
379   return {};
380 }
381 
StartAndMonitorProcesses()382 Result<void> ProcessMonitor::StartAndMonitorProcesses() {
383   CF_EXPECT(monitor_ == -1, "The monitor process was already started");
384   CF_EXPECT(!parent_monitor_socket_->IsOpen(),
385             "Parent monitor socket was already opened");
386   SharedFD parent_sock;
387   SharedFD child_sock;
388   SharedFD::SocketPair(AF_UNIX, SOCK_STREAM, 0, &parent_sock, &child_sock);
389   monitor_ = fork();
390   if (monitor_ == 0) {
391     child_monitor_socket_ = std::move(child_sock);
392     parent_sock->Close();
393     auto monitor_result = MonitorRoutine();
394     if (!monitor_result.ok()) {
395       LOG(ERROR) << "Monitoring processes failed:\n"
396                  << monitor_result.error().FormatForEnv();
397     }
398     std::exit(monitor_result.ok() ? 0 : 1);
399   } else {
400     parent_monitor_socket_ = std::move(parent_sock);
401     child_sock->Close();
402     return {};
403   }
404 }
405 
MonitorRoutine()406 Result<void> ProcessMonitor::MonitorRoutine() {
407 #ifdef __linux__
408   // Make this process a subreaper to reliably catch subprocess exits.
409   // See https://man7.org/linux/man-pages/man2/prctl.2.html
410   prctl(PR_SET_CHILD_SUBREAPER, 1);
411   prctl(PR_SET_PDEATHSIG, SIGHUP);  // Die when parent dies
412 #endif
413 
414   LOG(DEBUG) << "Monitoring subprocesses";
415   CF_EXPECT(StartSubprocesses(properties_));
416 
417   std::atomic_bool running(true);
418 
419   auto read_monitor_socket_loop =
420       [this](std::atomic_bool& running) -> Result<void> {
421     CF_EXPECT(this->ReadMonitorSocketLoop(running));
422     return {};
423   };
424   auto parent_comms = std::async(std::launch::async, read_monitor_socket_loop,
425                                  std::ref(running));
426 
427   CF_EXPECT(MonitorLoop(running, properties_mutex_,
428                         properties_.restart_subprocesses_,
429                         properties_.entries_));
430   CF_EXPECT(parent_comms.get(), "Should have exited if monitoring stopped");
431 
432   CF_EXPECT(StopSubprocesses(properties_.entries_));
433   LOG(DEBUG) << "Done monitoring subprocesses";
434   return {};
435 }
436 
437 }  // namespace cuttlefish
438