1 #include "sandboxed_api/sandbox2/monitor_unotify.h"
2
3 #include <linux/audit.h>
4 #include <linux/seccomp.h>
5 #include <poll.h>
6 #include <sys/eventfd.h>
7 #include <sys/ioctl.h>
8 #include <sys/ptrace.h>
9 #include <sys/resource.h>
10 #include <sys/sysinfo.h>
11 #include <sys/uio.h>
12 #include <sys/wait.h>
13 #include <syscall.h>
14 #include <unistd.h>
15
16 #include <algorithm>
17 #include <atomic>
18 #include <cerrno>
19 #include <cstdint>
20 #include <cstdlib>
21 #include <cstring>
22 #include <memory>
23 #include <string>
24 #include <utility>
25 #include <vector>
26
27 #include "absl/base/macros.h"
28 #include "absl/cleanup/cleanup.h"
29 #include "absl/log/check.h"
30 #include "absl/log/log.h"
31 #include "absl/status/status.h"
32 #include "absl/status/statusor.h"
33 #include "absl/strings/str_cat.h"
34 #include "absl/synchronization/mutex.h"
35 #include "absl/synchronization/notification.h"
36 #include "absl/time/clock.h"
37 #include "absl/time/time.h"
38 #include "absl/types/span.h"
39 #include "sandboxed_api/config.h"
40 #include "sandboxed_api/sandbox2/client.h"
41 #include "sandboxed_api/sandbox2/executor.h"
42 #include "sandboxed_api/sandbox2/forkserver.pb.h"
43 #include "sandboxed_api/sandbox2/monitor_base.h"
44 #include "sandboxed_api/sandbox2/notify.h"
45 #include "sandboxed_api/sandbox2/policy.h"
46 #include "sandboxed_api/sandbox2/result.h"
47 #include "sandboxed_api/util/fileops.h"
48 #include "sandboxed_api/util/status_macros.h"
49 #include "sandboxed_api/util/raw_logging.h"
50
// Fallback used only when the system headers do not provide
// SECCOMP_GET_NOTIF_SIZES: supplies the operation number and the structure
// that seccomp(SECCOMP_GET_NOTIF_SIZES, ...) fills in.
#ifndef SECCOMP_GET_NOTIF_SIZES
#define SECCOMP_GET_NOTIF_SIZES 3

struct seccomp_notif_sizes {
  // Read by InitSetupUnotify() to size the notification request buffer.
  __u16 seccomp_notif;
  __u16 seccomp_notif_resp;
  __u16 seccomp_data;
};
#endif
60
// Fallback used only when the system headers do not provide the seccomp
// notification ioctl macros. The inner guard avoids redefining the
// SECCOMP_IO* helper macros if they already exist.
#ifndef SECCOMP_IOCTL_NOTIF_RECV
#ifndef SECCOMP_IOWR
#define SECCOMP_IOC_MAGIC '!'
#define SECCOMP_IO(nr) _IO(SECCOMP_IOC_MAGIC, nr)
#define SECCOMP_IOWR(nr, type) _IOWR(SECCOMP_IOC_MAGIC, nr, type)
#endif

/* Flags for seccomp notification fd ioctl. */
// Request code passed to ioctl() in HandleUnotify() to receive a notification.
#define SECCOMP_IOCTL_NOTIF_RECV SECCOMP_IOWR(0, struct seccomp_notif)
#endif
71
72 namespace sandbox2 {
73
74 namespace {
75
76 using ::sapi::file_util::fileops::FDCloser;
77
// Thin wrapper that issues the raw SYS_seccomp system call.
//
// `operation`, `flags` and `args` are forwarded unchanged; the syscall()
// result is returned narrowed to int.
int seccomp(unsigned int operation, unsigned int flags, void* args) {
  const long rc = syscall(SYS_seccomp, operation, flags, args);
  return static_cast<int>(rc);
}
81
AuditArchToCPUArch(uint32_t arch)82 sapi::cpu::Architecture AuditArchToCPUArch(uint32_t arch) {
83 switch (arch) {
84 case AUDIT_ARCH_AARCH64:
85 return sapi::cpu::Architecture::kArm64;
86 case AUDIT_ARCH_ARM:
87 return sapi::cpu::Architecture::kArm;
88 case AUDIT_ARCH_X86_64:
89 return sapi::cpu::Architecture::kX8664;
90 case AUDIT_ARCH_I386:
91 return sapi::cpu::Architecture::kX86;
92 case AUDIT_ARCH_PPC64LE:
93 return sapi::cpu::Architecture::kPPC64LE;
94 default:
95 return sapi::cpu::Architecture::kUnknown;
96 }
97 }
98
WaitForFdReadable(int fd,absl::Time deadline)99 absl::Status WaitForFdReadable(int fd, absl::Time deadline) {
100 pollfd pfds[] = {
101 {.fd = fd, .events = POLLIN},
102 };
103 for (absl::Duration remaining = deadline - absl::Now();
104 remaining > absl::ZeroDuration(); remaining = deadline - absl::Now()) {
105 int ret = poll(pfds, ABSL_ARRAYSIZE(pfds),
106 static_cast<int>(absl::ToInt64Milliseconds(remaining)));
107 if (ret > 0) {
108 if (pfds[0].revents & POLLIN) {
109 return absl::OkStatus();
110 }
111 if (pfds[0].revents & POLLHUP) {
112 return absl::UnavailableError("hangup");
113 }
114 return absl::InternalError("poll");
115 }
116 if (ret == -1 && errno != EINTR) {
117 return absl::ErrnoToStatus(errno, "poll");
118 }
119 }
120 return absl::DeadlineExceededError("waiting for fd");
121 }
122
ReadWholeWithDeadline(int fd,std::vector<iovec> vecs_vec,absl::Time deadline)123 absl::Status ReadWholeWithDeadline(int fd, std::vector<iovec> vecs_vec,
124 absl::Time deadline) {
125 absl::Span<iovec> vecs = absl::MakeSpan(vecs_vec);
126 while (!vecs.empty()) {
127 SAPI_RETURN_IF_ERROR(WaitForFdReadable(fd, deadline));
128 ssize_t r = readv(fd, vecs.data(), vecs.size());
129 if (r < 0 && errno != EINTR) {
130 return absl::ErrnoToStatus(errno, "readv");
131 }
132 while (r > 0) {
133 if (vecs.empty()) {
134 return absl::InternalError("readv return value too big");
135 }
136 iovec& vec = vecs.front();
137 if (r < vec.iov_len) {
138 vec.iov_len -= r;
139 vec.iov_base = reinterpret_cast<char*>(vec.iov_base) + r;
140 break;
141 }
142 r -= vec.iov_len;
143 vecs.remove_prefix(1);
144 }
145 }
146 return absl::OkStatus();
147 }
148
149 } // namespace
150
UnotifyMonitor(Executor * executor,Policy * policy,Notify * notify)151 UnotifyMonitor::UnotifyMonitor(Executor* executor, Policy* policy,
152 Notify* notify)
153 : MonitorBase(executor, policy, notify) {
154 type_ = FORKSERVER_MONITOR_UNOTIFY;
155 if (executor_->limits()->wall_time_limit() != absl::ZeroDuration()) {
156 auto deadline = absl::Now() + executor_->limits()->wall_time_limit();
157 deadline_millis_.store(absl::ToUnixMillis(deadline),
158 std::memory_order_relaxed);
159 }
160 external_kill_request_flag_.test_and_set(std::memory_order_relaxed);
161 dump_stack_request_flag_.test_and_set(std::memory_order_relaxed);
162 }
163
RunInternal()164 void UnotifyMonitor::RunInternal() {
165 thread_ = std::make_unique<std::thread>(&UnotifyMonitor::Run, this);
166
167 // Wait for the Monitor to set-up the sandboxee correctly (or fail while
168 // doing that). From here on, it is safe to use the IPC object for
169 // non-sandbox-related data exchange.
170 setup_notification_.WaitForNotification();
171 }
172
HandleUnotify()173 void UnotifyMonitor::HandleUnotify() {
174 memset(req_.get(), 0, req_size_);
175 if (ioctl(seccomp_notify_fd_.get(), SECCOMP_IOCTL_NOTIF_RECV, req_.get()) !=
176 0) {
177 if (errno == ENOENT) {
178 VLOG(1) << "Unotify recv failed with ENOENT";
179 } else {
180 SetExitStatusCode(Result::INTERNAL_ERROR, Result::FAILED_NOTIFY);
181 }
182 return;
183 }
184 Syscall syscall(AuditArchToCPUArch(req_->data.arch), req_->data.nr,
185 {req_->data.args[0], req_->data.args[1], req_->data.args[2],
186 req_->data.args[3], req_->data.args[4], req_->data.args[5]},
187 req_->pid, 0, req_->data.instruction_pointer);
188 ViolationType violation_type = syscall.arch() == Syscall::GetHostArch()
189 ? kSyscallViolation
190 : kArchitectureSwitchViolation;
191 LogSyscallViolation(syscall);
192 notify_->EventSyscallViolation(syscall, violation_type);
193 MaybeGetStackTrace(req_->pid, Result::VIOLATION);
194 SetExitStatusCode(Result::VIOLATION, syscall.nr());
195 notify_->EventSyscallViolation(syscall, violation_type);
196 result_.SetSyscall(std::make_unique<Syscall>(syscall));
197 KillSandboxee();
198 }
199
Run()200 void UnotifyMonitor::Run() {
201 absl::Cleanup monitor_done = [this] {
202 getrusage(RUSAGE_THREAD, result_.GetRUsageMonitor());
203 OnDone();
204 };
205
206 absl::Cleanup setup_notify = [this] { setup_notification_.Notify(); };
207 if (!InitSetupUnotify()) {
208 SetExitStatusCode(Result::SETUP_ERROR, Result::FAILED_NOTIFY);
209 return;
210 }
211 if (!InitSetupNotifyEventFd()) {
212 SetExitStatusCode(Result::SETUP_ERROR, Result::FAILED_NOTIFY);
213 return;
214 }
215
216 std::move(setup_notify).Invoke();
217
218 pollfd pfds[] = {
219 {.fd = process_.status_fd.get(), .events = POLLIN},
220 {.fd = seccomp_notify_fd_.get(), .events = POLLIN},
221 {.fd = monitor_notify_fd_.get(), .events = POLLIN},
222 };
223 while (result_.final_status() == Result::UNSET) {
224 int64_t deadline = deadline_millis_.load(std::memory_order_relaxed);
225 absl::Duration remaining = absl::FromUnixMillis(deadline) - absl::Now();
226 if (deadline != 0 && remaining < absl::ZeroDuration()) {
227 VLOG(1) << "Sandbox process hit timeout due to the walltime timer";
228 timed_out_ = true;
229 MaybeGetStackTrace(process_.main_pid, Result::TIMEOUT);
230 KillSandboxee();
231 SetExitStatusFromStatusPipe();
232 break;
233 }
234
235 if (!external_kill_request_flag_.test_and_set(std::memory_order_relaxed)) {
236 external_kill_ = true;
237 MaybeGetStackTrace(process_.main_pid, Result::EXTERNAL_KILL);
238 KillSandboxee();
239 SetExitStatusFromStatusPipe();
240 break;
241 }
242
243 if (network_proxy_server_ &&
244 network_proxy_server_->violation_occurred_.load(
245 std::memory_order_acquire) &&
246 !network_violation_) {
247 network_violation_ = true;
248 MaybeGetStackTrace(process_.main_pid, Result::VIOLATION);
249 KillSandboxee();
250 SetExitStatusFromStatusPipe();
251 break;
252 }
253 constexpr int64_t kMinWakeupMsec = 30000;
254 int timeout_msec = kMinWakeupMsec;
255 if (remaining > absl::ZeroDuration()) {
256 timeout_msec = static_cast<int>(
257 std::min(kMinWakeupMsec, absl::ToInt64Milliseconds(remaining)));
258 }
259 int ret = poll(pfds, ABSL_ARRAYSIZE(pfds), timeout_msec);
260 if (ret == 0 || (ret == -1 && errno == EINTR)) {
261 continue;
262 }
263 if (ret == -1) {
264 PLOG(ERROR) << "waiting for action failed";
265 SetExitStatusCode(Result::INTERNAL_ERROR, Result::FAILED_MONITOR);
266 break;
267 }
268 if (pfds[2].revents & POLLIN) {
269 uint64_t value = 0;
270 read(monitor_notify_fd_.get(), &value, sizeof(value));
271 continue;
272 }
273 if (pfds[0].revents & POLLIN) {
274 SetExitStatusFromStatusPipe();
275 break;
276 }
277 if (pfds[0].revents & POLLHUP) {
278 SetExitStatusCode(Result::INTERNAL_ERROR, Result::FAILED_MONITOR);
279 break;
280 }
281 if (pfds[1].revents & POLLIN) {
282 HandleUnotify();
283 }
284 }
285 KillInit();
286 }
287
SetExitStatusFromStatusPipe()288 void UnotifyMonitor::SetExitStatusFromStatusPipe() {
289 int code, status;
290 rusage usage;
291
292 std::vector<iovec> iov = {
293 {.iov_base = &code, .iov_len = sizeof(code)},
294 {.iov_base = &status, .iov_len = sizeof(status)},
295 {.iov_base = &usage, .iov_len = sizeof(usage)},
296 };
297
298 if (absl::Status status = ReadWholeWithDeadline(
299 process_.status_fd.get(), iov, absl::Now() + absl::Seconds(1));
300 !status.ok()) {
301 PLOG(ERROR) << "reading status pipe failed " << status;
302 SetExitStatusCode(Result::INTERNAL_ERROR, Result::FAILED_MONITOR);
303 return;
304 }
305
306 result_.SetRUsageSandboxee(usage);
307 if (code == CLD_EXITED) {
308 SetExitStatusCode(Result::OK, status);
309 } else if (code == CLD_KILLED || code == CLD_DUMPED) {
310 if (network_violation_) {
311 SetExitStatusCode(Result::VIOLATION, Result::VIOLATION_NETWORK);
312 result_.SetNetworkViolation(network_proxy_server_->violation_msg_);
313 } else if (external_kill_) {
314 SetExitStatusCode(Result::EXTERNAL_KILL, 0);
315 } else if (timed_out_) {
316 SetExitStatusCode(Result::TIMEOUT, 0);
317 } else {
318 SetExitStatusCode(Result::SIGNALED, status);
319 }
320 } else {
321 SetExitStatusCode(Result::INTERNAL_ERROR, Result::FAILED_MONITOR);
322 }
323 }
324
InitSetupUnotify()325 bool UnotifyMonitor::InitSetupUnotify() {
326 if (!comms_->SendUint32(Client::kSandbox2ClientUnotify)) {
327 LOG(ERROR) << "Couldn't send Client::kSandbox2ClientUnotify message";
328 return false;
329 }
330 int fd;
331 if (!comms_->RecvFD(&fd)) {
332 LOG(ERROR) << "Couldn't recv unotify fd";
333 return false;
334 }
335 seccomp_notify_fd_ = FDCloser(fd);
336 struct seccomp_notif_sizes sizes = {};
337 if (seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes) == -1) {
338 LOG(ERROR) << "Couldn't get seccomp_notif_sizes";
339 return false;
340 }
341 req_size_ = sizes.seccomp_notif;
342 req_.reset(static_cast<seccomp_notif*>(malloc(req_size_)));
343 return true;
344 }
345
InitSetupNotifyEventFd()346 bool UnotifyMonitor::InitSetupNotifyEventFd() {
347 int fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
348 if (fd == -1) {
349 PLOG(ERROR) << "failed creating monitor pipe";
350 return false;
351 }
352 monitor_notify_fd_ = FDCloser(fd);
353 return true;
354 }
355
NotifyMonitor()356 void UnotifyMonitor::NotifyMonitor() {
357 absl::ReaderMutexLock lock(¬ify_mutex_);
358 if (monitor_notify_fd_.get() < 0) {
359 return;
360 }
361 uint64_t value = 1;
362 write(monitor_notify_fd_.get(), &value, sizeof(value));
363 }
364
KillSandboxee()365 bool UnotifyMonitor::KillSandboxee() {
366 VLOG(1) << "Sending SIGKILL to the PID: " << process_.main_pid;
367 if (kill(process_.main_pid, SIGKILL) != 0) {
368 PLOG(ERROR) << "Could not send SIGKILL to PID " << process_.main_pid;
369 return false;
370 }
371 return true;
372 }
373
KillInit()374 void UnotifyMonitor::KillInit() {
375 VLOG(1) << "Sending SIGKILL to the PID: " << process_.init_pid;
376 if (kill(process_.init_pid, SIGKILL) != 0) {
377 PLOG(ERROR) << "Could not send SIGKILL to PID " << process_.init_pid;
378 }
379 }
380
Join()381 void UnotifyMonitor::Join() {
382 absl::MutexLock lock(¬ify_mutex_);
383 if (thread_) {
384 thread_->join();
385 CHECK(IsDone()) << "Monitor did not terminate";
386 VLOG(1) << "Final execution status: " << result_.ToString();
387 CHECK(result_.final_status() != Result::UNSET);
388 thread_.reset();
389 monitor_notify_fd_.Close();
390 }
391 }
392
MaybeGetStackTrace(pid_t pid,Result::StatusEnum status)393 void UnotifyMonitor::MaybeGetStackTrace(pid_t pid, Result::StatusEnum status) {
394 if (ShouldCollectStackTrace(status)) {
395 auto stack = GetStackTrace(pid);
396 if (stack.ok()) {
397 result_.set_stack_trace(*stack);
398 } else {
399 LOG(ERROR) << "Getting stack trace: " << stack.status();
400 }
401 }
402 }
403
GetStackTrace(pid_t pid)404 absl::StatusOr<std::vector<std::string>> UnotifyMonitor::GetStackTrace(
405 pid_t pid) {
406 if (ptrace(PTRACE_ATTACH, pid, 0, 0) != 0) {
407 return absl::ErrnoToStatus(errno,
408 absl::StrCat("could not attach to pid = ", pid));
409 }
410 int wstatus = 0;
411 while (!WIFSTOPPED(wstatus)) {
412 pid_t ret =
413 waitpid(pid, &wstatus, __WNOTHREAD | __WALL | WUNTRACED | WNOHANG);
414 if (ret == -1) {
415 return absl::ErrnoToStatus(errno,
416 absl::StrCat("waiting for stop, pid = ", pid));
417 }
418 }
419 absl::Cleanup cleanup = [pid] {
420 if (ptrace(PTRACE_DETACH, pid, 0, 0) != 0) {
421 LOG(ERROR) << "Could not detach after obtaining stack trace from pid = "
422 << pid;
423 }
424 };
425 Regs regs(pid);
426 absl::Status status = regs.Fetch();
427 if (!status.ok()) {
428 if (absl::IsNotFound(status)) {
429 LOG(WARNING) << "failed to fetch regs: " << status;
430 return status;
431 }
432 SetExitStatusCode(Result::INTERNAL_ERROR, Result::FAILED_FETCH);
433 return status;
434 }
435 return GetAndLogStackTrace(®s);
436 }
437
438 } // namespace sandbox2
439