1 // Copyright 2018 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #ifndef PARTITION_ALLOC_PAGE_ALLOCATOR_INTERNALS_POSIX_H_
6 #define PARTITION_ALLOC_PAGE_ALLOCATOR_INTERNALS_POSIX_H_
7
8 #include <algorithm>
9 #include <atomic>
10 #include <cerrno>
11 #include <cstdint>
12 #include <cstring>
13
14 #include <sys/mman.h>
15
16 #include "build/build_config.h"
17 #include "partition_alloc/oom.h"
18 #include "partition_alloc/page_allocator.h"
19 #include "partition_alloc/page_allocator_constants.h"
20 #include "partition_alloc/partition_alloc_base/debug/debugging_buildflags.h"
21 #include "partition_alloc/partition_alloc_base/notreached.h"
22 #include "partition_alloc/partition_alloc_base/posix/eintr_wrapper.h"
23 #include "partition_alloc/partition_alloc_check.h"
24 #include "partition_alloc/thread_isolation/thread_isolation.h"
25
26 #if BUILDFLAG(IS_APPLE)
27 #include "partition_alloc/partition_alloc_base/apple/foundation_util.h"
28 #if BUILDFLAG(IS_IOS)
29 #include "partition_alloc/partition_alloc_base/ios/ios_util.h"
30 #elif BUILDFLAG(IS_MAC)
31 #include "partition_alloc/partition_alloc_base/mac/mac_util.h"
32 #else
33 #error "Unknown platform"
34 #endif
35 #include "partition_alloc/partition_alloc_base/apple/scoped_cftyperef.h"
36
37 #include <Availability.h>
38 #include <Security/Security.h>
39 #include <mach/mach.h>
40 #endif
41 #if BUILDFLAG(IS_ANDROID) || BUILDFLAG(IS_LINUX)
42 #include <sys/prctl.h>
43 #endif
44 #if BUILDFLAG(IS_LINUX) || BUILDFLAG(IS_CHROMEOS)
45 #include <sys/resource.h>
46 #endif
47
48 #ifndef MAP_ANONYMOUS
49 #define MAP_ANONYMOUS MAP_ANON
50 #endif
51
52 #if BUILDFLAG(IS_MAC)
53
54 // SecTaskGetCodeSignStatus is marked as unavailable on macOS, although it’s
55 // available on iOS and other Apple operating systems. It is, in fact, present
56 // on the system since macOS 10.12.
57 #pragma clang diagnostic push
58 #pragma clang diagnostic ignored "-Wavailability"
59 uint32_t SecTaskGetCodeSignStatus(SecTaskRef task) API_AVAILABLE(macos(10.12));
60 #pragma clang diagnostic pop
61
62 #endif // BUILDFLAG(IS_MAC)
63
64 namespace partition_alloc::internal {
65
66 namespace {
67
68 #if defined(LINUX_NAME_REGION)
69
NameRegion(void * start,size_t length,PageTag page_tag)70 void NameRegion(void* start, size_t length, PageTag page_tag) {
71 // Important: All the names should be string literals. As per prctl.h in
72 // //third_party/android_toolchain/ndk the kernel keeps a pointer to the name
73 // instead of copying it.
74 //
75 // Having the name in .rodata ensures that the pointer remains valid as
76 // long as the mapping is alive.
77 const char* name = nullptr;
78 switch (page_tag) {
79 case PageTag::kSimulation:
80 name = "simulation";
81 break;
82 case PageTag::kBlinkGC:
83 name = "blink_gc";
84 break;
85 case PageTag::kPartitionAlloc:
86 name = "partition_alloc";
87 break;
88 case PageTag::kChromium:
89 name = "chromium";
90 break;
91 case PageTag::kV8:
92 name = "v8";
93 break;
94 default:
95 PA_NOTREACHED();
96 break;
97 }
98
99 // No error checking on purpose, testing only.
100 prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, length, name);
101 }
102
103 #endif // defined(LINUX_NAME_REGION)
104
105 #if BUILDFLAG(IS_MAC)
106 // Tests whether the version of macOS supports the MAP_JIT flag and if the
107 // current process is signed with the hardened runtime and the allow-jit
108 // entitlement, returning whether MAP_JIT should be used to allocate regions
109 // that will contain JIT-compiled executable code.
UseMapJit()110 bool UseMapJit() {
111 // Until determining that the hardened runtime is enabled, early returns will
112 // return true, so that MAP_JIT will be used. This is important on arm64,
113 // which only allows pages to be simultaneously writable and executable when
114 // in a region allocated with MAP_JIT, regardless of code signing options. On
115 // arm64, an attempt to set a non-MAP_JIT page as simultaneously writable and
116 // executable fails with EPERM. Although this is not enforced on x86_64,
117 // MAP_JIT is harmless in that case.
118
119 base::apple::ScopedCFTypeRef<SecTaskRef> task(
120 SecTaskCreateFromSelf(kCFAllocatorDefault));
121 if (!task) {
122 return true;
123 }
124
125 uint32_t flags = SecTaskGetCodeSignStatus(task);
126 if (!(flags & kSecCodeSignatureRuntime)) {
127 // The hardened runtime is not enabled. Note that kSecCodeSignatureRuntime
128 // == CS_RUNTIME.
129 return true;
130 }
131
132 // The hardened runtime is enabled. From this point on, early returns must
133 // return false, indicating that MAP_JIT is not to be used. It’s an error
134 // (EINVAL) to use MAP_JIT with the hardened runtime unless the JIT
135 // entitlement is specified.
136
137 base::apple::ScopedCFTypeRef<CFTypeRef> jit_entitlement(
138 SecTaskCopyValueForEntitlement(
139 task.get(), CFSTR("com.apple.security.cs.allow-jit"), nullptr));
140 if (!jit_entitlement) {
141 return false;
142 }
143
144 return base::apple::CFCast<CFBooleanRef>(jit_entitlement.get()) ==
145 kCFBooleanTrue;
146 }
147 #elif BUILDFLAG(IS_IOS)
// Reports whether MAP_JIT should be used on iOS: true in simulator builds,
// false otherwise.
bool UseMapJit() {
#if TARGET_IPHONE_SIMULATOR
  // The simulator supports MAP_JIT unconditionally, so always enable it.
  constexpr bool kMapJitEnabled = true;
#else
  // TODO(https://crbug.com/1413818): Fill this out once the API is available.
  constexpr bool kMapJitEnabled = false;
#endif  // TARGET_IPHONE_SIMULATOR
  return kMapJitEnabled;
}
158 #endif // BUILDFLAG(IS_IOS)
159 } // namespace
160
// The address hint given to |mmap| is advisory only: when the hinted address
// is blocked, a nearby address is used instead.
constexpr bool kHintIsAdvisory{true};

// Error code of a failed page allocation. Starts at 0;
// SystemAllocPagesInternal() stores errno here when mmap() fails.
std::atomic<int32_t> s_allocPageErrorCode(0);
164
165 int GetAccessFlags(PageAccessibilityConfiguration accessibility);
166
SystemAllocPagesInternal(uintptr_t hint,size_t length,PageAccessibilityConfiguration accessibility,PageTag page_tag,int file_descriptor_for_shared_alloc)167 uintptr_t SystemAllocPagesInternal(uintptr_t hint,
168 size_t length,
169 PageAccessibilityConfiguration accessibility,
170 PageTag page_tag,
171 int file_descriptor_for_shared_alloc) {
172 #if BUILDFLAG(IS_APPLE)
173 // Use a custom tag to make it easier to distinguish PartitionAlloc regions
174 // in vmmap(1). Tags between 240-255 are supported.
175 int fd = file_descriptor_for_shared_alloc == -1
176 ? VM_MAKE_TAG(static_cast<int>(page_tag))
177 : file_descriptor_for_shared_alloc;
178 #else
179 int fd = file_descriptor_for_shared_alloc;
180 #endif
181
182 int access_flag = GetAccessFlags(accessibility);
183 int map_flags = MAP_ANONYMOUS | MAP_PRIVATE;
184
185 #if BUILDFLAG(IS_APPLE)
186 // On macOS, executables that are code signed with the "runtime" option cannot
187 // execute writable memory by default. They can opt into this capability by
188 // specifying the "com.apple.security.cs.allow-jit" code signing entitlement
189 // and allocating the region with the MAP_JIT flag.
190 static const bool kUseMapJit = UseMapJit();
191 if (accessibility.permissions ==
192 PageAccessibilityConfiguration::kInaccessibleWillJitLater &&
193 kUseMapJit) {
194 map_flags |= MAP_JIT;
195 }
196 #endif
197
198 void* ret = mmap(reinterpret_cast<void*>(hint), length, access_flag,
199 map_flags, fd, 0);
200 if (ret == MAP_FAILED) {
201 s_allocPageErrorCode = errno;
202 ret = nullptr;
203 }
204
205 #if defined(LINUX_NAME_REGION)
206 if (ret) {
207 NameRegion(ret, length, page_tag);
208 }
209 #endif
210
211 return reinterpret_cast<uintptr_t>(ret);
212 }
213
TrySetSystemPagesAccessInternal(uintptr_t address,size_t length,PageAccessibilityConfiguration accessibility)214 bool TrySetSystemPagesAccessInternal(
215 uintptr_t address,
216 size_t length,
217 PageAccessibilityConfiguration accessibility) {
218 #if BUILDFLAG(ENABLE_THREAD_ISOLATION)
219 if (accessibility.thread_isolation.enabled) {
220 return 0 == MprotectWithThreadIsolation(reinterpret_cast<void*>(address),
221 length,
222 GetAccessFlags(accessibility),
223 accessibility.thread_isolation);
224 }
225 #endif // BUILDFLAG(ENABLE_THREAD_ISOLATION)
226 return 0 == WrapEINTR(mprotect)(reinterpret_cast<void*>(address), length,
227 GetAccessFlags(accessibility));
228 }
229
SetSystemPagesAccessInternal(uintptr_t address,size_t length,PageAccessibilityConfiguration accessibility)230 void SetSystemPagesAccessInternal(
231 uintptr_t address,
232 size_t length,
233 PageAccessibilityConfiguration accessibility) {
234 int access_flags = GetAccessFlags(accessibility);
235 int ret;
236 #if BUILDFLAG(ENABLE_THREAD_ISOLATION)
237 if (accessibility.thread_isolation.enabled) {
238 ret = MprotectWithThreadIsolation(reinterpret_cast<void*>(address), length,
239 GetAccessFlags(accessibility),
240 accessibility.thread_isolation);
241 } else
242 #endif // BUILDFLAG(ENABLE_THREAD_ISOLATION)
243 {
244 ret = WrapEINTR(mprotect)(reinterpret_cast<void*>(address), length,
245 GetAccessFlags(accessibility));
246 }
247
248 // On Linux, man mprotect(2) states that ENOMEM is returned when (1) internal
249 // kernel data structures cannot be allocated, (2) the address range is
250 // invalid, or (3) this would split an existing mapping in a way that would
251 // exceed the maximum number of allowed mappings.
252 //
253 // Neither are very likely, but we still get a lot of crashes here. This is
254 // because setrlimit(RLIMIT_DATA)'s limit is checked and enforced here, if the
255 // access flags match a "data" mapping, which in our case would be MAP_PRIVATE
256 // | MAP_ANONYMOUS, and PROT_WRITE. see the call to may_expand_vm() in
257 // mm/mprotect.c in the kernel for details.
258 //
259 // In this case, we are almost certainly bumping into the sandbox limit, mark
260 // the crash as OOM. See SandboxLinux::LimitAddressSpace() for details.
261 if (ret == -1 && errno == ENOMEM && (access_flags & PROT_WRITE)) {
262 OOM_CRASH(length);
263 }
264
265 PA_PCHECK(0 == ret);
266 }
267
FreePagesInternal(uintptr_t address,size_t length)268 void FreePagesInternal(uintptr_t address, size_t length) {
269 PA_PCHECK(0 == munmap(reinterpret_cast<void*>(address), length));
270 }
271
TrimMappingInternal(uintptr_t base_address,size_t base_length,size_t trim_length,PageAccessibilityConfiguration accessibility,size_t pre_slack,size_t post_slack)272 uintptr_t TrimMappingInternal(uintptr_t base_address,
273 size_t base_length,
274 size_t trim_length,
275 PageAccessibilityConfiguration accessibility,
276 size_t pre_slack,
277 size_t post_slack) {
278 uintptr_t ret = base_address;
279 // We can resize the allocation run. Release unneeded memory before and after
280 // the aligned range.
281 if (pre_slack) {
282 FreePages(base_address, pre_slack);
283 ret = base_address + pre_slack;
284 }
285 if (post_slack) {
286 FreePages(ret + trim_length, post_slack);
287 }
288 return ret;
289 }
290
DecommitSystemPagesInternal(uintptr_t address,size_t length,PageAccessibilityDisposition accessibility_disposition)291 void DecommitSystemPagesInternal(
292 uintptr_t address,
293 size_t length,
294 PageAccessibilityDisposition accessibility_disposition) {
295 // In POSIX, there is no decommit concept. Discarding is an effective way of
296 // implementing the Windows semantics where the OS is allowed to not swap the
297 // pages in the region.
298 DiscardSystemPages(address, length);
299
300 bool change_permissions =
301 accessibility_disposition == PageAccessibilityDisposition::kRequireUpdate;
302 #if BUILDFLAG(PA_DCHECK_IS_ON)
303 // This is not guaranteed, show that we're serious.
304 //
305 // More specifically, several callers have had issues with assuming that
306 // memory is zeroed, this would hopefully make these bugs more visible. We
307 // don't memset() everything, because ranges can be very large, and doing it
308 // over the entire range could make Chrome unusable with
309 // BUILDFLAG(PA_DCHECK_IS_ON).
310 //
311 // Only do it when we are about to change the permissions, since we don't know
312 // the previous permissions, and cannot restore them.
313 if (!DecommittedMemoryIsAlwaysZeroed() && change_permissions) {
314 // Memory may not be writable.
315 size_t size = std::min(length, 2 * SystemPageSize());
316 void* ptr = reinterpret_cast<void*>(address);
317 PA_CHECK(mprotect(ptr, size, PROT_WRITE) == 0);
318 memset(ptr, 0xcc, size);
319 }
320 #endif
321
322 // Make pages inaccessible, unless the caller requested to keep permissions.
323 //
324 // Note, there is a small window between these calls when the pages can be
325 // incorrectly touched and brought back to memory. Not ideal, but doing those
326 // operations in the opposite order resulted in PMF regression on Mac (see
327 // crbug.com/1153021).
328 if (change_permissions) {
329 SetSystemPagesAccess(address, length,
330 PageAccessibilityConfiguration(
331 PageAccessibilityConfiguration::kInaccessible));
332 }
333 }
334
DecommitAndZeroSystemPagesInternal(uintptr_t address,size_t length,PageTag page_tag)335 bool DecommitAndZeroSystemPagesInternal(uintptr_t address,
336 size_t length,
337 PageTag page_tag) {
338 int fd = -1;
339 #if BUILDFLAG(IS_APPLE)
340 fd = VM_MAKE_TAG(static_cast<int>(page_tag));
341 #endif
342
343 // https://pubs.opengroup.org/onlinepubs/9699919799/functions/mmap.html: "If
344 // a MAP_FIXED request is successful, then any previous mappings [...] for
345 // those whole pages containing any part of the address range [pa,pa+len)
346 // shall be removed, as if by an appropriate call to munmap(), before the
347 // new mapping is established." As a consequence, the memory will be
348 // zero-initialized on next access.
349 void* ptr = reinterpret_cast<void*>(address);
350 void* ret = mmap(ptr, length, PROT_NONE,
351 MAP_FIXED | MAP_ANONYMOUS | MAP_PRIVATE, fd, 0);
352 if (ret == MAP_FAILED) {
353 // Decomitting may create additional VMAs (e.g. if we're decommitting pages
354 // in the middle of a larger mapping) and so it can fail with ENOMEM if the
355 // limit of VMAs is exceeded.
356 PA_CHECK(errno == ENOMEM);
357 return false;
358 }
359 PA_CHECK(ret == ptr);
360 // Since we just remapped the region, need to set is name again.
361 #if defined(LINUX_NAME_REGION)
362 NameRegion(ret, length, page_tag);
363 #endif
364 return true;
365 }
366
RecommitSystemPagesInternal(uintptr_t address,size_t length,PageAccessibilityConfiguration accessibility,PageAccessibilityDisposition accessibility_disposition)367 void RecommitSystemPagesInternal(
368 uintptr_t address,
369 size_t length,
370 PageAccessibilityConfiguration accessibility,
371 PageAccessibilityDisposition accessibility_disposition) {
372 // On POSIX systems, the caller needs to simply read the memory to recommit
373 // it. However, if decommit changed the permissions, recommit has to change
374 // them back.
375 if (accessibility_disposition ==
376 PageAccessibilityDisposition::kRequireUpdate) {
377 SetSystemPagesAccess(address, length, accessibility);
378 }
379
380 #if BUILDFLAG(IS_APPLE)
381 // On macOS, to update accounting, we need to make another syscall. For more
382 // details, see https://crbug.com/823915.
383 madvise(reinterpret_cast<void*>(address), length, MADV_FREE_REUSE);
384 #endif
385 }
386
TryRecommitSystemPagesInternal(uintptr_t address,size_t length,PageAccessibilityConfiguration accessibility,PageAccessibilityDisposition accessibility_disposition)387 bool TryRecommitSystemPagesInternal(
388 uintptr_t address,
389 size_t length,
390 PageAccessibilityConfiguration accessibility,
391 PageAccessibilityDisposition accessibility_disposition) {
392 // On POSIX systems, the caller needs to simply read the memory to recommit
393 // it. However, if decommit changed the permissions, recommit has to change
394 // them back.
395 if (accessibility_disposition ==
396 PageAccessibilityDisposition::kRequireUpdate) {
397 bool ok = TrySetSystemPagesAccess(address, length, accessibility);
398 if (!ok) {
399 return false;
400 }
401 }
402
403 #if BUILDFLAG(IS_APPLE)
404 // On macOS, to update accounting, we need to make another syscall. For more
405 // details, see https://crbug.com/823915.
406 madvise(reinterpret_cast<void*>(address), length, MADV_FREE_REUSE);
407 #endif
408
409 return true;
410 }
411
DiscardSystemPagesInternal(uintptr_t address,size_t length)412 void DiscardSystemPagesInternal(uintptr_t address, size_t length) {
413 void* ptr = reinterpret_cast<void*>(address);
414 #if BUILDFLAG(IS_APPLE)
415 int ret = madvise(ptr, length, MADV_FREE_REUSABLE);
416 if (ret) {
417 // MADV_FREE_REUSABLE sometimes fails, so fall back to MADV_DONTNEED.
418 ret = madvise(ptr, length, MADV_DONTNEED);
419 }
420 PA_PCHECK(ret == 0);
421 #else // BUILDFLAG(IS_APPLE)
422 // We have experimented with other flags, but with suboptimal results.
423 //
424 // MADV_FREE (Linux): Makes our memory measurements less predictable;
425 // performance benefits unclear.
426 //
427 // Therefore, we just do the simple thing: MADV_DONTNEED.
428 PA_PCHECK(0 == madvise(ptr, length, MADV_DONTNEED));
429 #endif // BUILDFLAG(IS_APPLE)
430 }
431
432 } // namespace partition_alloc::internal
433
434 #endif // PARTITION_ALLOC_PAGE_ALLOCATOR_INTERNALS_POSIX_H_
435