xref: /aosp_15_r20/external/sandboxed-api/sandboxed_api/sandbox2/forkserver.cc (revision ec63e07ab9515d95e79c211197c445ef84cefa6a)
1 // Copyright 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // Implementation of the sandbox2::ForkServer class.
16 
17 #include "sandboxed_api/sandbox2/forkserver.h"
18 
19 #include <fcntl.h>
20 #include <linux/filter.h>
21 #include <linux/seccomp.h>
22 #include <sched.h>
23 #include <sys/eventfd.h>
24 #include <sys/prctl.h>
25 #include <sys/resource.h>
26 #include <sys/socket.h>
27 #include <sys/uio.h>
28 #include <sys/wait.h>
29 #include <syscall.h>
30 #include <unistd.h>
31 
32 #include <cerrno>
33 #include <csignal>
34 #include <cstdint>
35 #include <cstdlib>
36 #include <cstring>
37 #include <fstream>
38 #include <initializer_list>
39 #include <string>
40 #include <utility>
41 #include <vector>
42 
43 #include "absl/base/attributes.h"
44 #include "absl/container/flat_hash_map.h"
45 #include "absl/container/flat_hash_set.h"
46 #include "absl/status/status.h"
47 #include "absl/status/statusor.h"
48 #include "absl/strings/match.h"
49 #include "absl/strings/str_cat.h"
50 #include "absl/strings/str_join.h"
51 #include "absl/strings/str_split.h"
52 #include "absl/strings/string_view.h"
53 #include "sys/capability.h" // AOSP: match libcap exported includes
54 #include "sandboxed_api/sandbox2/client.h"
55 #include "sandboxed_api/sandbox2/comms.h"
56 #include "sandboxed_api/sandbox2/fork_client.h"
57 #include "sandboxed_api/sandbox2/forkserver.pb.h"
58 #include "sandboxed_api/sandbox2/namespace.h"
59 #include "sandboxed_api/sandbox2/policy.h"
60 #include "sandboxed_api/sandbox2/sanitizer.h"
61 #include "sandboxed_api/sandbox2/syscall.h"
62 #include "sandboxed_api/sandbox2/util.h"
63 #include "sandboxed_api/sandbox2/util/bpf_helper.h"
64 #include "sandboxed_api/util/fileops.h"
65 #include "sandboxed_api/util/raw_logging.h"
66 #include "sandboxed_api/util/strerror.h"
67 
68 namespace sandbox2 {
69 namespace {
70 
71 using ::sapi::StrError;
72 using ::sapi::file_util::fileops::FDCloser;
73 
74 // "Moves" FDs in move_fds from current to target FD number while keeping FDs
75 // in keep_fds open - potentially moving them to another FD number as well in
76 // case of collisions.
77 // Ignores invalid (-1) fds.
MoveFDs(std::initializer_list<std::pair<int *,int>> move_fds,std::initializer_list<int * > keep_fds)78 void MoveFDs(std::initializer_list<std::pair<int*, int>> move_fds,
79              std::initializer_list<int*> keep_fds) {
80   absl::flat_hash_map<int, int*> fd_map;
81   for (int* fd : keep_fds) {
82     if (*fd != -1) {
83       fd_map.emplace(*fd, fd);
84     }
85   }
86 
87   for (auto [old_fd, new_fd] : move_fds) {
88     if (*old_fd != -1) {
89       fd_map.emplace(*old_fd, old_fd);
90     }
91   }
92 
93   for (auto [old_fd, new_fd] : move_fds) {
94     if (*old_fd == -1 || *old_fd == new_fd) {
95       continue;
96     }
97 
98     // Make sure we won't override another fd
99     if (auto it = fd_map.find(new_fd); it != fd_map.end()) {
100       int fd = dup(new_fd);
101       SAPI_RAW_CHECK(fd != -1, "Duplicating an FD failed.");
102       *it->second = fd;
103       fd_map.emplace(fd, it->second);
104       fd_map.erase(it);
105     }
106 
107     if (dup2(*old_fd, new_fd) == -1) {
108       SAPI_RAW_PLOG(FATAL, "Moving temporary to proper FD failed.");
109     }
110 
111     close(*old_fd);
112     fd_map.erase(*old_fd);
113     *old_fd = new_fd;
114   }
115 }
116 
RunInitProcess(pid_t main_pid,FDCloser pipe_fd)117 ABSL_ATTRIBUTE_NORETURN void RunInitProcess(pid_t main_pid, FDCloser pipe_fd) {
118   if (prctl(PR_SET_NAME, "S2-INIT-PROC", 0, 0, 0) != 0) {
119     SAPI_RAW_PLOG(WARNING, "prctl(PR_SET_NAME, 'S2-INIT-PROC')");
120   }
121 
122   // Clear SA_NOCLDWAIT.
123   struct sigaction sa;
124   sa.sa_handler = SIG_DFL;
125   sa.sa_flags = 0;
126   sigemptyset(&sa.sa_mask);
127   SAPI_RAW_CHECK(sigaction(SIGCHLD, &sa, nullptr) == 0,
128                  "clearing SA_NOCLDWAIT");
129 
130   // Apply seccomp.
131   std::vector<sock_filter> code = {
132       LOAD_ARCH,
133       JNE32(sandbox2::Syscall::GetHostAuditArch(), DENY),
134 
135       LOAD_SYSCALL_NR,
136       SYSCALL(__NR_waitid, ALLOW),
137       SYSCALL(__NR_exit, ALLOW),
138   };
139   if (pipe_fd.get() >= 0) {
140     code.insert(code.end(),
141                 {SYSCALL(__NR_getrusage, ALLOW), SYSCALL(__NR_write, ALLOW)});
142   }
143   code.push_back(DENY);
144 
145   struct sock_fprog prog {
146     .len = static_cast<uint16_t>(code.size()), .filter = code.data(),
147   };
148 
149   SAPI_RAW_CHECK(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == 0,
150                  "Denying new privs");
151   SAPI_RAW_CHECK(prctl(PR_SET_KEEPCAPS, 0) == 0, "Dropping caps");
152   SAPI_RAW_CHECK(
153       syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
154               reinterpret_cast<uintptr_t>(&prog)) == 0,
155       "Enabling seccomp filter");
156 
157   siginfo_t info;
158   // Reap children.
159   for (;;) {
160     int rv = TEMP_FAILURE_RETRY(waitid(P_ALL, -1, &info, WEXITED | __WALL));
161     if (rv != 0) {
162       _exit(1);
163     }
164 
165     if (info.si_pid == main_pid) {
166       if (pipe_fd.get() >= 0) {
167         write(pipe_fd.get(), &info.si_code, sizeof(info.si_code));
168         write(pipe_fd.get(), &info.si_status, sizeof(info.si_status));
169 
170         rusage usage{};
171         getrusage(RUSAGE_CHILDREN, &usage);
172         write(pipe_fd.get(), &usage, sizeof(usage));
173       }
174       _exit(0);
175     }
176   }
177 }
178 
SendPid(int signaling_fd)179 absl::Status SendPid(int signaling_fd) {
180   // Send our PID (the actual sandboxee process) via SCM_CREDENTIALS.
181   // The ancillary message will be attached to the message as SO_PASSCRED is set
182   // on the socket.
183   char dummy = ' ';
184   if (TEMP_FAILURE_RETRY(send(signaling_fd, &dummy, 1, 0)) != 1) {
185     return absl::ErrnoToStatus(errno, "Sending PID: send()");
186   }
187   return absl::OkStatus();
188 }
189 
ReceivePid(int signaling_fd)190 absl::StatusOr<pid_t> ReceivePid(int signaling_fd) {
191   union {
192     struct cmsghdr cmh;
193     char ctrl[CMSG_SPACE(sizeof(struct ucred))];
194   } ucred_msg{};
195 
196   struct msghdr msgh {};
197   struct iovec iov {};
198 
199   msgh.msg_iov = &iov;
200   msgh.msg_iovlen = 1;
201   msgh.msg_control = ucred_msg.ctrl;
202   msgh.msg_controllen = sizeof(ucred_msg);
203 
204   char dummy;
205   iov.iov_base = &dummy;
206   iov.iov_len = sizeof(char);
207 
208   if (TEMP_FAILURE_RETRY(recvmsg(signaling_fd, &msgh, MSG_WAITALL)) != 1) {
209     return absl::ErrnoToStatus(errno, "Receiving pid failed: recvmsg");
210   }
211   struct cmsghdr* cmsgp = CMSG_FIRSTHDR(&msgh);
212   if (cmsgp->cmsg_len != CMSG_LEN(sizeof(struct ucred)) ||
213       cmsgp->cmsg_level != SOL_SOCKET || cmsgp->cmsg_type != SCM_CREDENTIALS) {
214     return absl::InternalError("Receiving pid failed");
215   }
216   auto* ucredp = reinterpret_cast<struct ucred*>(CMSG_DATA(cmsgp));
217   return ucredp->pid;
218 }
219 
GetRootMountId(const std::string & proc_id)220 absl::StatusOr<std::string> GetRootMountId(const std::string& proc_id) {
221   std::ifstream mounts(absl::StrCat("/proc/", proc_id, "/mountinfo"));
222   if (!mounts.good()) {
223     return absl::InternalError("Failed to open mountinfo");
224   }
225   std::string line;
226   while (std::getline(mounts, line)) {
227     std::vector<absl::string_view> parts =
228         absl::StrSplit(line, absl::MaxSplits(' ', 4));
229     if (parts.size() >= 4 && parts[3] == "/") {
230       return std::string(parts[0]);
231     }
232   }
233   return absl::NotFoundError("Root entry not found in mountinfo");
234 }
235 
IsLikelyChrooted()236 bool IsLikelyChrooted() {
237   absl::StatusOr<std::string> self_root_id = GetRootMountId("self");
238   if (!self_root_id.ok()) {
239     return absl::IsNotFound(self_root_id.status());
240   }
241   absl::StatusOr<std::string> init_root_id = GetRootMountId("1");
242   if (!init_root_id.ok()) {
243     return false;
244   }
245   return *self_root_id != *init_root_id;
246 }
247 
248 }  // namespace
249 
PrepareExecveArgs(const ForkRequest & request,std::vector<std::string> * args,std::vector<std::string> * envp)250 void ForkServer::PrepareExecveArgs(const ForkRequest& request,
251                                    std::vector<std::string>* args,
252                                    std::vector<std::string>* envp) {
253   // Prepare arguments for execve.
254   for (const auto& arg : request.args()) {
255     args->push_back(arg);
256   }
257 
258   // Prepare environment variables for execve.
259   for (const auto& env : request.envs()) {
260     envp->push_back(env);
261   }
262 
263   // The child process should not start any fork-servers.
264   envp->push_back(absl::StrCat(kForkServerDisableEnv, "=1"));
265 
266   constexpr char kSapiVlogLevel[] = "SAPI_VLOG_LEVEL";
267   char* sapi_vlog = getenv(kSapiVlogLevel);
268   if (sapi_vlog && strlen(sapi_vlog) > 0) {
269     envp->push_back(absl::StrCat(kSapiVlogLevel, "=", sapi_vlog));
270   }
271 
272   SAPI_RAW_VLOG(1, "Will execute args:['%s'], environment:['%s']",
273                 absl::StrJoin(*args, "', '").c_str(),
274                 absl::StrJoin(*envp, "', '").c_str());
275 }
276 
LaunchChild(const ForkRequest & request,int execve_fd,uid_t uid,gid_t gid,FDCloser signaling_fd,FDCloser status_fd,bool avoid_pivot_root) const277 void ForkServer::LaunchChild(const ForkRequest& request, int execve_fd,
278                              uid_t uid, gid_t gid, FDCloser signaling_fd,
279                              FDCloser status_fd, bool avoid_pivot_root) const {
280   SAPI_RAW_CHECK(request.mode() != FORKSERVER_FORK_UNSPECIFIED,
281                  "Forkserver mode is unspecified");
282 
283   const bool will_execve = execve_fd != -1;
284   const bool should_sandbox = request.mode() == FORKSERVER_FORK_EXECVE_SANDBOX;
285 
286   absl::StatusOr<absl::flat_hash_set<int>> open_fds = sanitizer::GetListOfFDs();
287   if (!open_fds.ok()) {
288     SAPI_RAW_LOG(WARNING, "Could not get list of current open FDs: %s",
289                  std::string(open_fds.status().message()).c_str());
290     open_fds = absl::flat_hash_set<int>();
291   }
292   SanitizeEnvironment();
293 
294   InitializeNamespaces(request, uid, gid, avoid_pivot_root);
295 
296   auto caps = cap_init();
297   SAPI_RAW_CHECK(cap_set_proc(caps) == 0, "while dropping capabilities");
298   cap_free(caps);
299 
300   // A custom init process is only needed if a new PID NS is created.
301   if (request.clone_flags() & CLONE_NEWPID) {
302     // Spawn a child process
303     pid_t child = util::ForkWithFlags(SIGCHLD);
304     if (child < 0) {
305       SAPI_RAW_PLOG(FATAL, "Could not spawn init process");
306     }
307     if (child != 0) {
308       if (status_fd.get() >= 0) {
309         open_fds->erase(status_fd.get());
310       }
311       // Close all open fds (equals to CloseAllFDsExcept but does not require
312       // /proc to be available).
313       for (const auto& fd : *open_fds) {
314         close(fd);
315       }
316       RunInitProcess(child, std::move(status_fd));
317     }
318     // Send sandboxee pid
319     auto status = SendPid(signaling_fd.get());
320     SAPI_RAW_CHECK(status.ok(),
321                    absl::StrCat("sending pid: ", status.message()).c_str());
322   }
323   signaling_fd.Close();
324   status_fd.Close();
325 
326   Client c(comms_);
327 
328   // Prepare the arguments before sandboxing (if needed), as doing it after
329   // sandoxing can cause syscall violations (e.g. related to memory management).
330   std::vector<std::string> args;
331   std::vector<std::string> envs;
332   if (will_execve) {
333     PrepareExecveArgs(request, &args, &envs);
334   }
335 
336   // Sandboxing can be enabled either here - just before execve, or somewhere
337   // inside the executed binary (e.g. after basic structures have been
338   // initialized, and resources acquired). In the latter case, it's up to the
339   // sandboxed binary to establish proper Comms channel (using
340   // Comms::kSandbox2ClientCommsFD) and call sandbox2::Client::SandboxMeHere()
341   if (should_sandbox) {
342     // The following client calls are basically SandboxMeHere. We split it so
343     // that we can set up the envp after we received the file descriptors but
344     // before we enable the syscall filter.
345     c.PrepareEnvironment(&execve_fd);
346     if (comms_->GetConnectionFD() != Comms::kSandbox2ClientCommsFD) {
347       envs.push_back(absl::StrCat(Comms::kSandbox2CommsFDEnvVar, "=",
348                                   comms_->GetConnectionFD()));
349     }
350     envs.push_back(c.GetFdMapEnvVar());
351   }
352 
353   // Convert args and envs before enabling sandbox (it'll allocate which might
354   // be blocked).
355   util::CharPtrArray argv = util::CharPtrArray::FromStringVector(args);
356   util::CharPtrArray envp = util::CharPtrArray::FromStringVector(envs);
357 
358   if (should_sandbox) {
359     c.EnableSandbox();
360   }
361 
362   if (will_execve) {
363     ExecuteProcess(execve_fd, argv.data(), envp.data());
364   }
365 }
366 
ServeRequest()367 pid_t ForkServer::ServeRequest() {
368   ForkRequest fork_request;
369   if (!comms_->RecvProtoBuf(&fork_request)) {
370     if (comms_->IsTerminated()) {
371       return -1;
372     }
373     SAPI_RAW_LOG(FATAL, "Failed to receive ForkServer request");
374   }
375   int comms_fd;
376   SAPI_RAW_CHECK(comms_->RecvFD(&comms_fd), "Failed to receive Comms FD");
377 
378   SAPI_RAW_CHECK(fork_request.mode() != FORKSERVER_FORK_UNSPECIFIED,
379                  "Forkserver mode is unspecified");
380 
381   int exec_fd = -1;
382   if (fork_request.mode() == FORKSERVER_FORK_EXECVE ||
383       fork_request.mode() == FORKSERVER_FORK_EXECVE_SANDBOX) {
384     SAPI_RAW_CHECK(comms_->RecvFD(&exec_fd), "Failed to receive Exec FD");
385   }
386 
387   // Make the kernel notify us with SIGCHLD when the process terminates.
388   // We use sigaction(SIGCHLD, flags=SA_NOCLDWAIT) in combination with
389   // this to make sure the zombie process is reaped immediately.
390   int clone_flags = fork_request.clone_flags() | SIGCHLD;
391 
392   // Store uid and gid since they will change if CLONE_NEWUSER is set.
393   uid_t uid = getuid();
394   uid_t gid = getgid();
395 
396   FDCloser pipe_fds[2];
397   {
398     int pfds[2] = {-1, -1};
399     if (fork_request.monitor_type() == FORKSERVER_MONITOR_UNOTIFY) {
400       SAPI_RAW_PCHECK(pipe(pfds) == 0, "creating status pipe");
401     }
402     pipe_fds[0] = FDCloser(pfds[0]);
403     pipe_fds[1] = FDCloser(pfds[1]);
404   }
405 
406   int socketpair_fds[2];
407   SAPI_RAW_PCHECK(
408       socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, socketpair_fds) == 0,
409       "creating signaling socketpair");
410   for (int i = 0; i < 2; i++) {
411     int val = 1;
412     SAPI_RAW_PCHECK(setsockopt(socketpair_fds[i], SOL_SOCKET, SO_PASSCRED, &val,
413                                sizeof(val)) == 0,
414                     "setsockopt failed");
415   }
416 
417   FDCloser signaling_fds[] = {FDCloser(socketpair_fds[0]),
418                               FDCloser(socketpair_fds[1])};
419 
420   // Note: init_pid will be overwritten with the actual init pid if the init
421   //       process was started or stays at 0 if that is not needed - no pidns.
422   pid_t init_pid = 0;
423   pid_t sandboxee_pid = -1;
424   bool avoid_pivot_root = clone_flags & (CLONE_NEWUSER | CLONE_NEWNS);
425   if (avoid_pivot_root) {
426     // Create initial namespaces only when they're first needed.
427     // This allows sandbox2 to be still used without any namespaces support
428     if (initial_mntns_fd_ == -1) {
429       CreateInitialNamespaces();
430     }
431     // We first just fork a child, which will join the initial namespaces
432     // Note: Not a regular fork() as one really needs to be single-threaded to
433     //       setns and this is not the case with TSAN.
434     pid_t pid = util::ForkWithFlags(SIGCHLD);
435     SAPI_RAW_PCHECK(pid != -1, "fork failed");
436     if (pid == 0) {
437       SAPI_RAW_PCHECK(setns(initial_userns_fd_, CLONE_NEWUSER) != -1,
438                       "joining initial user namespace");
439       SAPI_RAW_PCHECK(setns(initial_mntns_fd_, CLONE_NEWNS) != -1,
440                       "joining initial mnt namespace");
441       close(initial_userns_fd_);
442       close(initial_mntns_fd_);
443       // Do not create new userns it will be unshared later
444       sandboxee_pid =
445           util::ForkWithFlags((clone_flags & ~CLONE_NEWUSER) | CLONE_PARENT);
446       if (sandboxee_pid == -1) {
447         SAPI_RAW_LOG(ERROR, "util::ForkWithFlags(%x)", clone_flags);
448       }
449       if (sandboxee_pid != 0) {
450         _exit(0);
451       }
452       // Send sandboxee pid
453       absl::Status status = SendPid(signaling_fds[1].get());
454       SAPI_RAW_CHECK(status.ok(),
455                      absl::StrCat("sending pid: ", status.message()).c_str());
456     }
457   } else {
458     sandboxee_pid = util::ForkWithFlags(clone_flags);
459     if (sandboxee_pid == -1) {
460       SAPI_RAW_LOG(ERROR, "util::ForkWithFlags(%x)", clone_flags);
461     }
462     if (sandboxee_pid == 0) {
463       close(initial_userns_fd_);
464       close(initial_mntns_fd_);
465     }
466   }
467 
468   // Child.
469   if (sandboxee_pid == 0) {
470     signaling_fds[0].Close();
471     pipe_fds[0].Close();
472     // Make sure we override the forkserver's comms fd
473     comms_->Terminate();
474     if (exec_fd != -1) {
475       int signaling_fd = signaling_fds[1].Release();
476       int pipe_fd = pipe_fds[1].Release();
477       MoveFDs({{&exec_fd, Comms::kSandbox2TargetExecFD},
478                {&comms_fd, Comms::kSandbox2ClientCommsFD}},
479               {&signaling_fd, &pipe_fd});
480       signaling_fds[1] = FDCloser(signaling_fd);
481       pipe_fds[1] = FDCloser(pipe_fd);
482     }
483     *comms_ = Comms(comms_fd);
484     LaunchChild(fork_request, exec_fd, uid, gid, std::move(signaling_fds[1]),
485                 std::move(pipe_fds[1]), avoid_pivot_root);
486     return sandboxee_pid;
487   }
488 
489   signaling_fds[1].Close();
490 
491   if (avoid_pivot_root) {
492     if (auto pid = ReceivePid(signaling_fds[0].get()); !pid.ok()) {
493       SAPI_RAW_LOG(ERROR, "%s", std::string(pid.status().message()).c_str());
494     } else {
495       sandboxee_pid = pid.value();
496     }
497   }
498 
499   if (fork_request.clone_flags() & CLONE_NEWPID) {
500     // The pid of the init process is equal to the child process that we've
501     // previously forked.
502     init_pid = sandboxee_pid;
503     sandboxee_pid = -1;
504     // And the actual sandboxee is forked from the init process, so we need to
505     // receive the actual PID.
506     if (auto pid_or = ReceivePid(signaling_fds[0].get()); !pid_or.ok()) {
507       SAPI_RAW_LOG(ERROR, "%s", std::string(pid_or.status().message()).c_str());
508       if (init_pid != -1) {
509         kill(init_pid, SIGKILL);
510       }
511       init_pid = -1;
512     } else {
513       sandboxee_pid = pid_or.value();
514     }
515   }
516 
517   // Parent.
518   pipe_fds[1].Close();
519   close(comms_fd);
520   if (exec_fd >= 0) {
521     close(exec_fd);
522   }
523   SAPI_RAW_CHECK(comms_->SendInt32(init_pid),
524                  absl::StrCat("Failed to send init PID: ", init_pid).c_str());
525   SAPI_RAW_CHECK(
526       comms_->SendInt32(sandboxee_pid),
527       absl::StrCat("Failed to send sandboxee PID: ", sandboxee_pid).c_str());
528 
529   if (pipe_fds[0].get() >= 0) {
530     SAPI_RAW_CHECK(comms_->SendFD(pipe_fds[0].get()),
531                    "Failed to send status pipe");
532   }
533   return sandboxee_pid;
534 }
535 
IsTerminated() const536 bool ForkServer::IsTerminated() const { return comms_->IsTerminated(); }
537 
Initialize()538 bool ForkServer::Initialize() {
539   // For safety drop as many capabilities as possible.
540   // Note that cap_t is actually a pointer.
541   cap_t have_caps = cap_get_proc();  // caps we currently have
542   SAPI_RAW_CHECK(have_caps, "failed to cap_get_proc()");
543   cap_t wanted_caps = cap_init();  // starts as empty set, ie. no caps
544   SAPI_RAW_CHECK(wanted_caps, "failed to cap_init()");
545 
546   // CAP_SYS_PTRACE appears to be needed for apparmor (or possibly yama)
547   // CAP_SETFCAP is needed on newer kernels (5.10 needs it, 4.15 does not)
548   for (cap_value_t cap : {CAP_SYS_PTRACE, CAP_SETFCAP}) {
549     for (cap_flag_t flag : {CAP_EFFECTIVE, CAP_PERMITTED}) {
550       cap_flag_value_t value;
551       int rc = cap_get_flag(have_caps, cap, flag, &value);
552       SAPI_RAW_CHECK(!rc, "cap_get_flag");
553       if (value == CAP_SET) {
554         cap_value_t caps_to_set[1] = {
555             cap,
556         };
557         rc = cap_set_flag(wanted_caps, flag, 1, caps_to_set, CAP_SET);
558         SAPI_RAW_CHECK(!rc, "cap_set_flag");
559       }
560     }
561   }
562 
563   SAPI_RAW_CHECK(!cap_set_proc(wanted_caps), "while dropping capabilities");
564   SAPI_RAW_CHECK(!cap_free(wanted_caps), "while freeing wanted_caps");
565   SAPI_RAW_CHECK(!cap_free(have_caps), "while freeing have_caps");
566 
567   // All processes spawned by the fork'd/execute'd process will see this process
568   // as /sbin/init. Therefore it will receive (and ignore) their final status
569   // (see the next comment as well). PR_SET_CHILD_SUBREAPER is available since
570   // kernel version 3.4, so don't panic if it fails.
571   if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) == -1) {
572     SAPI_RAW_VLOG(3, "prctl(PR_SET_CHILD_SUBREAPER, 1): %s [%d]",
573                   StrError(errno).c_str(), errno);
574   }
575 
576   // Don't convert terminated child processes into zombies. It's up to the
577   // sandbox (Monitor) to track them and receive/report their final status.
578   struct sigaction sa;
579   sa.sa_handler = SIG_DFL;
580   sa.sa_flags = SA_NOCLDWAIT;
581   sigemptyset(&sa.sa_mask);
582   if (sigaction(SIGCHLD, &sa, nullptr) == -1) {
583     SAPI_RAW_PLOG(ERROR, "sigaction(SIGCHLD, flags=SA_NOCLDWAIT)");
584     return false;
585   }
586   return true;
587 }
588 
CreateInitialNamespaces()589 void ForkServer::CreateInitialNamespaces() {
590   // Spawn a new process to create initial user and mount namespaces to be used
591   // as a base for each namespaced sandboxee.
592 
593   // Store uid and gid to create mappings after CLONE_NEWUSER
594   uid_t uid = getuid();
595   gid_t gid = getgid();
596 
597   // Socket to synchronize so that we open ns fds before process dies
598   FDCloser create_efd(eventfd(0, EFD_CLOEXEC));
599   SAPI_RAW_PCHECK(create_efd.get() != -1, "creating eventfd");
600   FDCloser open_efd(eventfd(0, EFD_CLOEXEC));
601   SAPI_RAW_PCHECK(open_efd.get() != -1, "creating eventfd");
602   pid_t pid = util::ForkWithFlags(CLONE_NEWUSER | CLONE_NEWNS | SIGCHLD);
603   if (pid == -1 && errno == EPERM && IsLikelyChrooted()) {
604     SAPI_RAW_LOG(FATAL,
605                  "failed to fork initial namespaces process: parent process is "
606                  "likely chrooted");
607   }
608   SAPI_RAW_PCHECK(pid != -1, "failed to fork initial namespaces process");
609   uint64_t value = 1;
610   if (pid == 0) {
611     Namespace::InitializeInitialNamespaces(uid, gid);
612     SAPI_RAW_PCHECK(TEMP_FAILURE_RETRY(write(create_efd.get(), &value,
613                                              sizeof(value))) == sizeof(value),
614                     "synchronizing initial namespaces creation");
615     SAPI_RAW_PCHECK(TEMP_FAILURE_RETRY(read(open_efd.get(), &value,
616                                             sizeof(value))) == sizeof(value),
617                     "synchronizing initial namespaces creation");
618     SAPI_RAW_PCHECK(chroot("/realroot") == 0,
619                     "chrooting prior to dumping coverage");
620     util::DumpCoverageData();
621     _exit(0);
622   }
623   SAPI_RAW_PCHECK(TEMP_FAILURE_RETRY(read(create_efd.get(), &value,
624                                           sizeof(value))) == sizeof(value),
625                   "synchronizing initial namespaces creation");
626   initial_userns_fd_ = open(absl::StrCat("/proc/", pid, "/ns/user").c_str(),
627                             O_RDONLY | O_CLOEXEC);
628   SAPI_RAW_PCHECK(initial_userns_fd_ != -1, "getting initial userns fd");
629   initial_mntns_fd_ = open(absl::StrCat("/proc/", pid, "/ns/mnt").c_str(),
630                            O_RDONLY | O_CLOEXEC);
631   SAPI_RAW_PCHECK(initial_mntns_fd_ != -1, "getting initial mntns fd");
632   SAPI_RAW_PCHECK(TEMP_FAILURE_RETRY(write(open_efd.get(), &value,
633                                            sizeof(value))) == sizeof(value),
634                   "synchronizing initial namespaces creation");
635 }
636 
SanitizeEnvironment() const637 void ForkServer::SanitizeEnvironment() const {
638   // Mark all file descriptors, except the standard ones (needed
639   // for proper sandboxed process operations), as close-on-exec.
640   absl::Status status = sanitizer::SanitizeCurrentProcess(
641       {STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO, comms_->GetConnectionFD()},
642       /* close_fds = */ false);
643   SAPI_RAW_CHECK(
644       status.ok(),
645       absl::StrCat("while sanitizing process: ", status.message()).c_str());
646 }
647 
ExecuteProcess(int execve_fd,const char * const * argv,const char * const * envp)648 void ForkServer::ExecuteProcess(int execve_fd, const char* const* argv,
649                                 const char* const* envp) {
650   // Do not add any code before execve(), as it's subject to seccomp policies.
651   // Indicate that it's a special execve(), by setting 4th, 5th and 6th syscall
652   // argument to magic values.
653   util::Execveat(execve_fd, "", argv, envp, AT_EMPTY_PATH,
654                  internal::kExecveMagic);
655 
656   int saved_errno = errno;
657   SAPI_RAW_PLOG(ERROR, "execveat failed");
658   if (argv[0]) {
659     SAPI_RAW_LOG(ERROR, "argv[0]=%s", argv[0]);
660   }
661 
662   if (saved_errno == ENOSYS) {
663     SAPI_RAW_LOG(ERROR,
664                  "This is likely caused by running on a kernel that is too old."
665     );
666   } else if (saved_errno == ENOENT && execve_fd >= 0) {
667     // Since we know the file exists, it must be that the file is dynamically
668     // linked and the ELF interpreter is what's actually missing.
669     SAPI_RAW_LOG(
670         ERROR,
671         "This is likely caused by running dynamically-linked sandboxee without "
672         "calling .AddLibrariesForBinary() on the policy builder.");
673   }
674 
675   util::Syscall(__NR_exit_group, EXIT_FAILURE);
676   abort();
677 }
678 
InitializeNamespaces(const ForkRequest & request,uid_t uid,gid_t gid,bool avoid_pivot_root)679 void ForkServer::InitializeNamespaces(const ForkRequest& request, uid_t uid,
680                                       gid_t gid, bool avoid_pivot_root) {
681   if (!request.has_mount_tree()) {
682     return;
683   }
684   Namespace::InitializeNamespaces(
685       uid, gid, request.clone_flags(), Mounts(request.mount_tree()),
686       request.hostname(), avoid_pivot_root, request.allow_mount_propagation());
687 }
688 
689 }  // namespace sandbox2
690