1 // Copyright 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 // Implementation of the sandbox2::Policy class.
16
17 #include "sandboxed_api/sandbox2/policy.h"
18
19 #include <fcntl.h>
20 #include <linux/audit.h>
21 #include <linux/bpf_common.h>
22 #include <linux/filter.h>
23 #include <linux/seccomp.h>
24 #include <sched.h>
25 #include <syscall.h>
26
27 #include <cstdint>
28 #include <limits>
29 #include <optional>
30 #include <string>
31 #include <vector>
32
33 #include "absl/flags/flag.h"
34 #include "absl/log/log.h"
35 #include "absl/strings/string_view.h"
36 #include "sandboxed_api/config.h"
37 #include "sandboxed_api/sandbox2/bpfdisassembler.h"
38 #include "sandboxed_api/sandbox2/comms.h"
39 #include "sandboxed_api/sandbox2/syscall.h"
40 #include "sandboxed_api/sandbox2/util/bpf_helper.h"
41 #include "sandboxed_api/util/raw_logging.h"
42
43 #ifndef SECCOMP_FILTER_FLAG_NEW_LISTENER
44 #define SECCOMP_FILTER_FLAG_NEW_LISTENER (1UL << 3)
45 #endif
46
47 #ifndef SECCOMP_RET_USER_NOTIF
48 #define SECCOMP_RET_USER_NOTIF 0x7fc00000U /* notifies userspace */
49 #endif
50
51 #define DO_USER_NOTIF BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF)
52
53 ABSL_FLAG(bool, sandbox2_danger_danger_permit_all, false,
54 "Allow all syscalls, useful for testing");
55 ABSL_FLAG(std::string, sandbox2_danger_danger_permit_all_and_log, "",
56 "Allow all syscalls and log them into specified file");
57
58 namespace sandbox2 {
59
60 // The final policy is the concatenation of:
61 // 1. default policy (GetDefaultPolicy, private),
62 // 2. user policy (user_policy_, public),
63 // 3. default KILL action (avoid failing open if user policy did not do it).
GetPolicy(bool user_notif) const64 std::vector<sock_filter> Policy::GetPolicy(bool user_notif) const {
65 if (absl::GetFlag(FLAGS_sandbox2_danger_danger_permit_all) ||
66 !absl::GetFlag(FLAGS_sandbox2_danger_danger_permit_all_and_log).empty()) {
67 return GetTrackingPolicy();
68 }
69
70 // Now we can start building the policy.
71 // 1. Start with the default policy (e.g. syscall architecture checks).
72 auto policy = GetDefaultPolicy(user_notif);
73 VLOG(3) << "Default policy:\n" << bpf::Disasm(policy);
74
75 // 2. Append user policy.
76 VLOG(3) << "User policy:\n" << bpf::Disasm(user_policy_);
77 // Add default syscall_nr loading in case the user forgets.
78 policy.push_back(LOAD_SYSCALL_NR);
79 policy.insert(policy.end(), user_policy_.begin(), user_policy_.end());
80
81 // 3. Finish with default KILL action.
82 policy.push_back(KILL);
83
84 // In seccomp_unotify mode replace all KILLS with unotify
85 if (user_notif) {
86 for (sock_filter& filter : policy) {
87 if (filter.code == BPF_RET + BPF_K && filter.k == SECCOMP_RET_KILL) {
88 filter = DO_USER_NOTIF;
89 }
90 }
91 }
92
93 VLOG(2) << "Final policy:\n" << bpf::Disasm(policy);
94 return policy;
95 }
96
// If you modify this function, you should also modify
// Monitor::LogAccessViolation to keep them in sync.
99 //
100 // Produces a policy which returns SECCOMP_RET_TRACE instead of SECCOMP_RET_KILL
101 // for the __NR_execve syscall, so the tracer can make a decision to allow or
102 // disallow it depending on which occurrence of __NR_execve it was.
103 // LINT.IfChange
GetDefaultPolicy(bool user_notif) const104 std::vector<sock_filter> Policy::GetDefaultPolicy(bool user_notif) const {
105 bpf_labels l = {0};
106
107 std::vector<sock_filter> policy;
108 if (user_notif) {
109 policy = {
110 // If compiled arch is different from the runtime one, inform the
111 // Monitor.
112 LOAD_ARCH,
113 JNE32(Syscall::GetHostAuditArch(), DENY),
114 LOAD_SYSCALL_NR,
115 // TODO(b/271400371) Use NOTIF_FLAG_CONTINUE once generally available
116 JNE32(__NR_seccomp, JUMP(&l, past_seccomp_l)),
117 ARG_32(3),
118 JNE32(internal::kExecveMagic, JUMP(&l, past_seccomp_l)),
119 ALLOW,
120 LABEL(&l, past_seccomp_l),
121 LOAD_SYSCALL_NR,
122 JNE32(__NR_execveat, JUMP(&l, past_execveat_l)),
123 ARG_32(4),
124 JNE32(AT_EMPTY_PATH, JUMP(&l, past_execveat_l)),
125 ARG_32(5),
126 JNE32(internal::kExecveMagic, JUMP(&l, past_execveat_l)),
127 ALLOW,
128 LABEL(&l, past_execveat_l),
129
130 LOAD_SYSCALL_NR,
131 };
132 } else {
133 policy = {
134 // If compiled arch is different from the runtime one, inform the Monitor.
135 LOAD_ARCH,
136 JEQ32(Syscall::GetHostAuditArch(), JUMP(&l, past_arch_check_l)),
137 #if defined(SAPI_X86_64)
138 JEQ32(AUDIT_ARCH_I386, TRACE(sapi::cpu::kX86)), // 32-bit sandboxee
139 #endif
140 TRACE(sapi::cpu::kUnknown),
141 LABEL(&l, past_arch_check_l),
142
143 // After the policy is uploaded, forkserver will execve the sandboxee. We
144 // need to allow this execve but not others. Since BPF does not have
145 // state, we need to inform the Monitor to decide, and for that we use a
146 // magic value in syscall args 5. Note that this value is not supposed to
147 // be secret, but just an optimization so that the monitor is not
148 // triggered on every call to execveat.
149 LOAD_SYSCALL_NR,
150 JNE32(__NR_execveat, JUMP(&l, past_execveat_l)),
151 ARG_32(4),
152 JNE32(AT_EMPTY_PATH, JUMP(&l, past_execveat_l)),
153 ARG_32(5),
154 JNE32(internal::kExecveMagic, JUMP(&l, past_execveat_l)),
155 SANDBOX2_TRACE,
156 LABEL(&l, past_execveat_l),
157
158 LOAD_SYSCALL_NR,
159 };
160 }
161
162 // Forbid ptrace because it's unsafe or too risky. The user policy can only
163 // block (i.e. return an error instead of killing the process) but not allow
164 // ptrace. This uses LOAD_SYSCALL_NR from above.
165 if (!user_policy_handles_ptrace_) {
166 policy.insert(policy.end(), {JEQ32(__NR_ptrace, DENY)});
167 }
168
169 // If user policy doesn't mention it, then forbid bpf because it's unsafe or
170 // too risky. This uses LOAD_SYSCALL_NR from above.
171 if (!user_policy_handles_bpf_) {
172 policy.insert(policy.end(), {JEQ32(__NR_bpf, DENY)});
173 }
174 #ifndef CLONE_NEWCGROUP
175 #define CLONE_NEWCGROUP 0x02000000
176 #endif
177 constexpr uintptr_t kNewNamespacesFlags =
178 CLONE_NEWNS | CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWUTS |
179 CLONE_NEWCGROUP | CLONE_NEWIPC | CLONE_NEWPID;
180 static_assert(kNewNamespacesFlags <= std::numeric_limits<uint32_t>::max());
181 constexpr uintptr_t kUnsafeCloneFlags = kNewNamespacesFlags | CLONE_UNTRACED;
182 static_assert(kUnsafeCloneFlags <= std::numeric_limits<uint32_t>::max());
183 policy.insert(policy.end(),
184 {
185 #ifdef __NR_clone3
186 // Disallow clone3. Errno instead of DENY so that libraries
187 // can fallback to regular clone/clone2.
188 JEQ32(__NR_clone3, ERRNO(ENOSYS)),
189 #endif
190 // Disallow clone3 and clone with unsafe flags. This uses
191 // LOAD_SYSCALL_NR from above.
192 JNE32(__NR_clone, JUMP(&l, past_clone_unsafe_l)),
193 // Regardless of arch, we only care about the lower 32-bits
194 // of the flags.
195 ARG_32(0),
196 JA32(kUnsafeCloneFlags, DENY),
197 LABEL(&l, past_clone_unsafe_l),
198 // Disallow unshare with unsafe flags.
199 LOAD_SYSCALL_NR,
200 JNE32(__NR_unshare, JUMP(&l, past_unshare_unsafe_l)),
201 // Regardless of arch, we only care about the lower 32-bits
202 // of the flags.
203 ARG_32(0),
204 JA32(kNewNamespacesFlags, DENY),
205 LABEL(&l, past_unshare_unsafe_l),
206 // Disallow seccomp with SECCOMP_FILTER_FLAG_NEW_LISTENER
207 // flag.
208 LOAD_SYSCALL_NR,
209 JNE32(__NR_seccomp, JUMP(&l, past_seccomp_new_listener)),
210 // Regardless of arch, we only care about the lower 32-bits
211 // of the flags.
212 ARG_32(1),
213 JA32(SECCOMP_FILTER_FLAG_NEW_LISTENER, DENY),
214 LABEL(&l, past_seccomp_new_listener),
215 });
216
217 if (bpf_resolve_jumps(&l, policy.data(), policy.size()) != 0) {
218 LOG(FATAL) << "Cannot resolve bpf jumps";
219 }
220
221 return policy;
222 }
223 // LINT.ThenChange(monitor_ptrace.cc)
224
GetTrackingPolicy() const225 std::vector<sock_filter> Policy::GetTrackingPolicy() const {
226 return {
227 LOAD_ARCH,
228 #if defined(SAPI_X86_64)
229 JEQ32(AUDIT_ARCH_X86_64, TRACE(sapi::cpu::kX8664)),
230 JEQ32(AUDIT_ARCH_I386, TRACE(sapi::cpu::kX86)),
231 #elif defined(SAPI_PPC64_LE)
232 JEQ32(AUDIT_ARCH_PPC64LE, TRACE(sapi::cpu::kPPC64LE)),
233 #elif defined(SAPI_ARM64)
234 JEQ32(AUDIT_ARCH_AARCH64, TRACE(sapi::cpu::kArm64)),
235 #elif defined(SAPI_ARM)
236 JEQ32(AUDIT_ARCH_ARM, TRACE(sapi::cpu::kArm)),
237 #endif
238 TRACE(sapi::cpu::kUnknown),
239 };
240 }
241
SendPolicy(Comms * comms,bool user_notif) const242 bool Policy::SendPolicy(Comms* comms, bool user_notif) const {
243 auto policy = GetPolicy(user_notif);
244 if (!comms->SendBytes(
245 reinterpret_cast<uint8_t*>(policy.data()),
246 static_cast<uint64_t>(policy.size()) * sizeof(sock_filter))) {
247 LOG(ERROR) << "Couldn't send policy";
248 return false;
249 }
250
251 return true;
252 }
253
GetPolicyDescription(PolicyDescription * policy) const254 void Policy::GetPolicyDescription(PolicyDescription* policy) const {
255 policy->set_user_bpf_policy(user_policy_.data(),
256 user_policy_.size() * sizeof(sock_filter));
257 if (policy_builder_description_) {
258 *policy->mutable_policy_builder_description() =
259 *policy_builder_description_;
260 }
261
262 if (namespace_) {
263 namespace_->GetNamespaceDescription(
264 policy->mutable_namespace_description());
265 }
266 }
267
268 } // namespace sandbox2
269