1 // Copyright 2018 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef PARTITION_ALLOC_PAGE_ALLOCATOR_INTERNALS_POSIX_H_
6 #define PARTITION_ALLOC_PAGE_ALLOCATOR_INTERNALS_POSIX_H_
7 
8 #include <algorithm>
9 #include <atomic>
10 #include <cerrno>
11 #include <cstdint>
12 #include <cstring>
13 
14 #include <sys/mman.h>
15 
16 #include "build/build_config.h"
17 #include "partition_alloc/oom.h"
18 #include "partition_alloc/page_allocator.h"
19 #include "partition_alloc/page_allocator_constants.h"
20 #include "partition_alloc/partition_alloc_base/debug/debugging_buildflags.h"
21 #include "partition_alloc/partition_alloc_base/notreached.h"
22 #include "partition_alloc/partition_alloc_base/posix/eintr_wrapper.h"
23 #include "partition_alloc/partition_alloc_check.h"
24 #include "partition_alloc/thread_isolation/thread_isolation.h"
25 
26 #if BUILDFLAG(IS_APPLE)
27 #include "partition_alloc/partition_alloc_base/apple/foundation_util.h"
28 #if BUILDFLAG(IS_IOS)
29 #include "partition_alloc/partition_alloc_base/ios/ios_util.h"
30 #elif BUILDFLAG(IS_MAC)
31 #include "partition_alloc/partition_alloc_base/mac/mac_util.h"
32 #else
33 #error "Unknown platform"
34 #endif
35 #include "partition_alloc/partition_alloc_base/apple/scoped_cftyperef.h"
36 
37 #include <Availability.h>
38 #include <Security/Security.h>
39 #include <mach/mach.h>
40 #endif
41 #if BUILDFLAG(IS_ANDROID) || BUILDFLAG(IS_LINUX)
42 #include <sys/prctl.h>
43 #endif
44 #if BUILDFLAG(IS_LINUX) || BUILDFLAG(IS_CHROMEOS)
45 #include <sys/resource.h>
46 #endif
47 
48 #ifndef MAP_ANONYMOUS
49 #define MAP_ANONYMOUS MAP_ANON
50 #endif
51 
52 #if BUILDFLAG(IS_MAC)
53 
// SecTaskGetCodeSignStatus is marked as unavailable on macOS, although it’s
// available on iOS and other Apple operating systems. It is, in fact, present
// on the system since macOS 10.12.
//
// The function is re-declared here, with the availability diagnostic
// suppressed around the declaration, so that UseMapJit() below can call it to
// read the code-signing flags of the current task.
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wavailability"
uint32_t SecTaskGetCodeSignStatus(SecTaskRef task) API_AVAILABLE(macos(10.12));
#pragma clang diagnostic pop
61 
62 #endif  // BUILDFLAG(IS_MAC)
63 
64 namespace partition_alloc::internal {
65 
66 namespace {
67 
68 #if defined(LINUX_NAME_REGION)
69 
NameRegion(void * start,size_t length,PageTag page_tag)70 void NameRegion(void* start, size_t length, PageTag page_tag) {
71   // Important: All the names should be string literals. As per prctl.h in
72   // //third_party/android_toolchain/ndk the kernel keeps a pointer to the name
73   // instead of copying it.
74   //
75   // Having the name in .rodata ensures that the pointer remains valid as
76   // long as the mapping is alive.
77   const char* name = nullptr;
78   switch (page_tag) {
79     case PageTag::kSimulation:
80       name = "simulation";
81       break;
82     case PageTag::kBlinkGC:
83       name = "blink_gc";
84       break;
85     case PageTag::kPartitionAlloc:
86       name = "partition_alloc";
87       break;
88     case PageTag::kChromium:
89       name = "chromium";
90       break;
91     case PageTag::kV8:
92       name = "v8";
93       break;
94     default:
95       PA_NOTREACHED();
96       break;
97   }
98 
99   // No error checking on purpose, testing only.
100   prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, length, name);
101 }
102 
103 #endif  // defined(LINUX_NAME_REGION)
104 
105 #if BUILDFLAG(IS_MAC)
106 // Tests whether the version of macOS supports the MAP_JIT flag and if the
107 // current process is signed with the hardened runtime and the allow-jit
108 // entitlement, returning whether MAP_JIT should be used to allocate regions
109 // that will contain JIT-compiled executable code.
UseMapJit()110 bool UseMapJit() {
111   // Until determining that the hardened runtime is enabled, early returns will
112   // return true, so that MAP_JIT will be used. This is important on arm64,
113   // which only allows pages to be simultaneously writable and executable when
114   // in a region allocated with MAP_JIT, regardless of code signing options. On
115   // arm64, an attempt to set a non-MAP_JIT page as simultaneously writable and
116   // executable fails with EPERM. Although this is not enforced on x86_64,
117   // MAP_JIT is harmless in that case.
118 
119   base::apple::ScopedCFTypeRef<SecTaskRef> task(
120       SecTaskCreateFromSelf(kCFAllocatorDefault));
121   if (!task) {
122     return true;
123   }
124 
125   uint32_t flags = SecTaskGetCodeSignStatus(task);
126   if (!(flags & kSecCodeSignatureRuntime)) {
127     // The hardened runtime is not enabled. Note that kSecCodeSignatureRuntime
128     // == CS_RUNTIME.
129     return true;
130   }
131 
132   // The hardened runtime is enabled. From this point on, early returns must
133   // return false, indicating that MAP_JIT is not to be used. It’s an error
134   // (EINVAL) to use MAP_JIT with the hardened runtime unless the JIT
135   // entitlement is specified.
136 
137   base::apple::ScopedCFTypeRef<CFTypeRef> jit_entitlement(
138       SecTaskCopyValueForEntitlement(
139           task.get(), CFSTR("com.apple.security.cs.allow-jit"), nullptr));
140   if (!jit_entitlement) {
141     return false;
142   }
143 
144   return base::apple::CFCast<CFBooleanRef>(jit_entitlement.get()) ==
145          kCFBooleanTrue;
146 }
147 #elif BUILDFLAG(IS_IOS)
// Reports whether MAP_JIT should be used when allocating JIT regions on iOS.
bool UseMapJit() {
#if TARGET_IPHONE_SIMULATOR
  // The simulator supports MAP_JIT unconditionally, so always opt in there.
  constexpr bool kMapJitAvailable = true;
#else
  // TODO(https://crbug.com/1413818): Fill this out when the API is available.
  constexpr bool kMapJitAvailable = false;
#endif  // TARGET_IPHONE_SIMULATOR
  return kMapJitAvailable;
}
158 #endif  // BUILDFLAG(IS_IOS)
159 }  // namespace
160 
// |mmap| uses a nearby address if the hint address is blocked.
constexpr bool kHintIsAdvisory = true;
// Receives the value of errno whenever the mmap() call in
// SystemAllocPagesInternal() fails. Starts out as 0.
std::atomic<int32_t> s_allocPageErrorCode{0};

// Forward declaration; the definition is not part of this section of the
// file. The functions below pass its result as the protection argument to
// mmap() and mprotect().
int GetAccessFlags(PageAccessibilityConfiguration accessibility);
166 
SystemAllocPagesInternal(uintptr_t hint,size_t length,PageAccessibilityConfiguration accessibility,PageTag page_tag,int file_descriptor_for_shared_alloc)167 uintptr_t SystemAllocPagesInternal(uintptr_t hint,
168                                    size_t length,
169                                    PageAccessibilityConfiguration accessibility,
170                                    PageTag page_tag,
171                                    int file_descriptor_for_shared_alloc) {
172 #if BUILDFLAG(IS_APPLE)
173   // Use a custom tag to make it easier to distinguish PartitionAlloc regions
174   // in vmmap(1). Tags between 240-255 are supported.
175   int fd = file_descriptor_for_shared_alloc == -1
176                ? VM_MAKE_TAG(static_cast<int>(page_tag))
177                : file_descriptor_for_shared_alloc;
178 #else
179   int fd = file_descriptor_for_shared_alloc;
180 #endif
181 
182   int access_flag = GetAccessFlags(accessibility);
183   int map_flags = MAP_ANONYMOUS | MAP_PRIVATE;
184 
185 #if BUILDFLAG(IS_APPLE)
186   // On macOS, executables that are code signed with the "runtime" option cannot
187   // execute writable memory by default. They can opt into this capability by
188   // specifying the "com.apple.security.cs.allow-jit" code signing entitlement
189   // and allocating the region with the MAP_JIT flag.
190   static const bool kUseMapJit = UseMapJit();
191   if (accessibility.permissions ==
192           PageAccessibilityConfiguration::kInaccessibleWillJitLater &&
193       kUseMapJit) {
194     map_flags |= MAP_JIT;
195   }
196 #endif
197 
198   void* ret = mmap(reinterpret_cast<void*>(hint), length, access_flag,
199                    map_flags, fd, 0);
200   if (ret == MAP_FAILED) {
201     s_allocPageErrorCode = errno;
202     ret = nullptr;
203   }
204 
205 #if defined(LINUX_NAME_REGION)
206   if (ret) {
207     NameRegion(ret, length, page_tag);
208   }
209 #endif
210 
211   return reinterpret_cast<uintptr_t>(ret);
212 }
213 
TrySetSystemPagesAccessInternal(uintptr_t address,size_t length,PageAccessibilityConfiguration accessibility)214 bool TrySetSystemPagesAccessInternal(
215     uintptr_t address,
216     size_t length,
217     PageAccessibilityConfiguration accessibility) {
218 #if BUILDFLAG(ENABLE_THREAD_ISOLATION)
219   if (accessibility.thread_isolation.enabled) {
220     return 0 == MprotectWithThreadIsolation(reinterpret_cast<void*>(address),
221                                             length,
222                                             GetAccessFlags(accessibility),
223                                             accessibility.thread_isolation);
224   }
225 #endif  // BUILDFLAG(ENABLE_THREAD_ISOLATION)
226   return 0 == WrapEINTR(mprotect)(reinterpret_cast<void*>(address), length,
227                                   GetAccessFlags(accessibility));
228 }
229 
SetSystemPagesAccessInternal(uintptr_t address,size_t length,PageAccessibilityConfiguration accessibility)230 void SetSystemPagesAccessInternal(
231     uintptr_t address,
232     size_t length,
233     PageAccessibilityConfiguration accessibility) {
234   int access_flags = GetAccessFlags(accessibility);
235   int ret;
236 #if BUILDFLAG(ENABLE_THREAD_ISOLATION)
237   if (accessibility.thread_isolation.enabled) {
238     ret = MprotectWithThreadIsolation(reinterpret_cast<void*>(address), length,
239                                       GetAccessFlags(accessibility),
240                                       accessibility.thread_isolation);
241   } else
242 #endif  // BUILDFLAG(ENABLE_THREAD_ISOLATION)
243   {
244     ret = WrapEINTR(mprotect)(reinterpret_cast<void*>(address), length,
245                               GetAccessFlags(accessibility));
246   }
247 
248   // On Linux, man mprotect(2) states that ENOMEM is returned when (1) internal
249   // kernel data structures cannot be allocated, (2) the address range is
250   // invalid, or (3) this would split an existing mapping in a way that would
251   // exceed the maximum number of allowed mappings.
252   //
253   // Neither are very likely, but we still get a lot of crashes here. This is
254   // because setrlimit(RLIMIT_DATA)'s limit is checked and enforced here, if the
255   // access flags match a "data" mapping, which in our case would be MAP_PRIVATE
256   // | MAP_ANONYMOUS, and PROT_WRITE. see the call to may_expand_vm() in
257   // mm/mprotect.c in the kernel for details.
258   //
259   // In this case, we are almost certainly bumping into the sandbox limit, mark
260   // the crash as OOM. See SandboxLinux::LimitAddressSpace() for details.
261   if (ret == -1 && errno == ENOMEM && (access_flags & PROT_WRITE)) {
262     OOM_CRASH(length);
263   }
264 
265   PA_PCHECK(0 == ret);
266 }
267 
// Unmaps the |length| bytes starting at |address| with munmap(). A failing
// munmap() is treated as fatal via PA_PCHECK.
void FreePagesInternal(uintptr_t address, size_t length) {
  PA_PCHECK(0 == munmap(reinterpret_cast<void*>(address), length));
}
271 
TrimMappingInternal(uintptr_t base_address,size_t base_length,size_t trim_length,PageAccessibilityConfiguration accessibility,size_t pre_slack,size_t post_slack)272 uintptr_t TrimMappingInternal(uintptr_t base_address,
273                               size_t base_length,
274                               size_t trim_length,
275                               PageAccessibilityConfiguration accessibility,
276                               size_t pre_slack,
277                               size_t post_slack) {
278   uintptr_t ret = base_address;
279   // We can resize the allocation run. Release unneeded memory before and after
280   // the aligned range.
281   if (pre_slack) {
282     FreePages(base_address, pre_slack);
283     ret = base_address + pre_slack;
284   }
285   if (post_slack) {
286     FreePages(ret + trim_length, post_slack);
287   }
288   return ret;
289 }
290 
DecommitSystemPagesInternal(uintptr_t address,size_t length,PageAccessibilityDisposition accessibility_disposition)291 void DecommitSystemPagesInternal(
292     uintptr_t address,
293     size_t length,
294     PageAccessibilityDisposition accessibility_disposition) {
295   // In POSIX, there is no decommit concept. Discarding is an effective way of
296   // implementing the Windows semantics where the OS is allowed to not swap the
297   // pages in the region.
298   DiscardSystemPages(address, length);
299 
300   bool change_permissions =
301       accessibility_disposition == PageAccessibilityDisposition::kRequireUpdate;
302 #if BUILDFLAG(PA_DCHECK_IS_ON)
303   // This is not guaranteed, show that we're serious.
304   //
305   // More specifically, several callers have had issues with assuming that
306   // memory is zeroed, this would hopefully make these bugs more visible.  We
307   // don't memset() everything, because ranges can be very large, and doing it
308   // over the entire range could make Chrome unusable with
309   // BUILDFLAG(PA_DCHECK_IS_ON).
310   //
311   // Only do it when we are about to change the permissions, since we don't know
312   // the previous permissions, and cannot restore them.
313   if (!DecommittedMemoryIsAlwaysZeroed() && change_permissions) {
314     // Memory may not be writable.
315     size_t size = std::min(length, 2 * SystemPageSize());
316     void* ptr = reinterpret_cast<void*>(address);
317     PA_CHECK(mprotect(ptr, size, PROT_WRITE) == 0);
318     memset(ptr, 0xcc, size);
319   }
320 #endif
321 
322   // Make pages inaccessible, unless the caller requested to keep permissions.
323   //
324   // Note, there is a small window between these calls when the pages can be
325   // incorrectly touched and brought back to memory. Not ideal, but doing those
326   // operations in the opposite order resulted in PMF regression on Mac (see
327   // crbug.com/1153021).
328   if (change_permissions) {
329     SetSystemPagesAccess(address, length,
330                          PageAccessibilityConfiguration(
331                              PageAccessibilityConfiguration::kInaccessible));
332   }
333 }
334 
DecommitAndZeroSystemPagesInternal(uintptr_t address,size_t length,PageTag page_tag)335 bool DecommitAndZeroSystemPagesInternal(uintptr_t address,
336                                         size_t length,
337                                         PageTag page_tag) {
338   int fd = -1;
339 #if BUILDFLAG(IS_APPLE)
340   fd = VM_MAKE_TAG(static_cast<int>(page_tag));
341 #endif
342 
343   // https://pubs.opengroup.org/onlinepubs/9699919799/functions/mmap.html: "If
344   // a MAP_FIXED request is successful, then any previous mappings [...] for
345   // those whole pages containing any part of the address range [pa,pa+len)
346   // shall be removed, as if by an appropriate call to munmap(), before the
347   // new mapping is established." As a consequence, the memory will be
348   // zero-initialized on next access.
349   void* ptr = reinterpret_cast<void*>(address);
350   void* ret = mmap(ptr, length, PROT_NONE,
351                    MAP_FIXED | MAP_ANONYMOUS | MAP_PRIVATE, fd, 0);
352   if (ret == MAP_FAILED) {
353     // Decomitting may create additional VMAs (e.g. if we're decommitting pages
354     // in the middle of a larger mapping) and so it can fail with ENOMEM if the
355     // limit of VMAs is exceeded.
356     PA_CHECK(errno == ENOMEM);
357     return false;
358   }
359   PA_CHECK(ret == ptr);
360   // Since we just remapped the region, need to set is name again.
361 #if defined(LINUX_NAME_REGION)
362   NameRegion(ret, length, page_tag);
363 #endif
364   return true;
365 }
366 
RecommitSystemPagesInternal(uintptr_t address,size_t length,PageAccessibilityConfiguration accessibility,PageAccessibilityDisposition accessibility_disposition)367 void RecommitSystemPagesInternal(
368     uintptr_t address,
369     size_t length,
370     PageAccessibilityConfiguration accessibility,
371     PageAccessibilityDisposition accessibility_disposition) {
372   // On POSIX systems, the caller needs to simply read the memory to recommit
373   // it. However, if decommit changed the permissions, recommit has to change
374   // them back.
375   if (accessibility_disposition ==
376       PageAccessibilityDisposition::kRequireUpdate) {
377     SetSystemPagesAccess(address, length, accessibility);
378   }
379 
380 #if BUILDFLAG(IS_APPLE)
381   // On macOS, to update accounting, we need to make another syscall. For more
382   // details, see https://crbug.com/823915.
383   madvise(reinterpret_cast<void*>(address), length, MADV_FREE_REUSE);
384 #endif
385 }
386 
TryRecommitSystemPagesInternal(uintptr_t address,size_t length,PageAccessibilityConfiguration accessibility,PageAccessibilityDisposition accessibility_disposition)387 bool TryRecommitSystemPagesInternal(
388     uintptr_t address,
389     size_t length,
390     PageAccessibilityConfiguration accessibility,
391     PageAccessibilityDisposition accessibility_disposition) {
392   // On POSIX systems, the caller needs to simply read the memory to recommit
393   // it. However, if decommit changed the permissions, recommit has to change
394   // them back.
395   if (accessibility_disposition ==
396       PageAccessibilityDisposition::kRequireUpdate) {
397     bool ok = TrySetSystemPagesAccess(address, length, accessibility);
398     if (!ok) {
399       return false;
400     }
401   }
402 
403 #if BUILDFLAG(IS_APPLE)
404   // On macOS, to update accounting, we need to make another syscall. For more
405   // details, see https://crbug.com/823915.
406   madvise(reinterpret_cast<void*>(address), length, MADV_FREE_REUSE);
407 #endif
408 
409   return true;
410 }
411 
DiscardSystemPagesInternal(uintptr_t address,size_t length)412 void DiscardSystemPagesInternal(uintptr_t address, size_t length) {
413   void* ptr = reinterpret_cast<void*>(address);
414 #if BUILDFLAG(IS_APPLE)
415   int ret = madvise(ptr, length, MADV_FREE_REUSABLE);
416   if (ret) {
417     // MADV_FREE_REUSABLE sometimes fails, so fall back to MADV_DONTNEED.
418     ret = madvise(ptr, length, MADV_DONTNEED);
419   }
420   PA_PCHECK(ret == 0);
421 #else   // BUILDFLAG(IS_APPLE)
422   // We have experimented with other flags, but with suboptimal results.
423   //
424   // MADV_FREE (Linux): Makes our memory measurements less predictable;
425   // performance benefits unclear.
426   //
427   // Therefore, we just do the simple thing: MADV_DONTNEED.
428   PA_PCHECK(0 == madvise(ptr, length, MADV_DONTNEED));
429 #endif  // BUILDFLAG(IS_APPLE)
430 }
431 
432 }  // namespace partition_alloc::internal
433 
434 #endif  // PARTITION_ALLOC_PAGE_ALLOCATOR_INTERNALS_POSIX_H_
435