1 /*
2 * Copyright 2008 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "debuggerd/handler.h"
18
19 #include <errno.h>
20 #include <fcntl.h>
21 #include <inttypes.h>
22 #include <linux/futex.h>
23 #include <pthread.h>
24 #include <sched.h>
25 #include <signal.h>
26 #include <stddef.h>
27 #include <stdint.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <sys/capability.h>
32 #include <sys/mman.h>
33 #include <sys/prctl.h>
34 #include <sys/socket.h>
35 #include <sys/syscall.h>
36 #include <sys/uio.h>
37 #include <sys/un.h>
38 #include <sys/wait.h>
39 #include <time.h>
40 #include <unistd.h>
41
42 #include <android-base/macros.h>
43 #include <android-base/parsebool.h>
44 #include <android-base/parseint.h>
45 #include <android-base/properties.h>
46 #include <android-base/unique_fd.h>
47 #include <async_safe/log.h>
48 #include <bionic/reserved_signals.h>
49
50 #include <libdebuggerd/utility.h>
51
52 #include "dump_type.h"
53 #include "protocol.h"
54
55 #include "handler/fallback.h"
56
57 using ::android::base::ParseBool;
58 using ::android::base::ParseBoolResult;
59 using ::android::base::Pipe;
60
61 // We muck with our fds in a 'thread' that doesn't share the same fd table.
62 // Close fds in that thread with a raw close syscall instead of going through libc.
63 struct FdsanBypassCloser {
64   static void Close(int fd) {
65 syscall(__NR_close, fd);
66 }
67 };
68
69 using unique_fd = android::base::unique_fd_impl<FdsanBypassCloser>;
70
71 // see man(2) prctl, specifically the section about PR_GET_NAME
72 #define MAX_TASK_NAME_LEN (16)
73
74 #if defined(__LP64__)
75 #define CRASH_DUMP_NAME "crash_dump64"
76 #else
77 #define CRASH_DUMP_NAME "crash_dump32"
78 #endif
79
80 #define CRASH_DUMP_PATH "/apex/com.android.runtime/bin/" CRASH_DUMP_NAME
81
82 // Wrappers that directly invoke the respective syscalls, in case the cached values are invalid.
83 #pragma GCC poison getpid gettid
84 static pid_t __getpid() {
85 return syscall(__NR_getpid);
86 }
87
88 static pid_t __gettid() {
89 return syscall(__NR_gettid);
90 }
91
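// Reads a boolean system property through the raw __system_property_read_callback API with a
// stack-allocated cookie, so nothing is allocated; this can run from the signal handler.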
92 static bool property_parse_bool(const char* name) {
93 const prop_info* pi = __system_property_find(name);
94 if (!pi) return false;
95 bool cookie = false;
96 __system_property_read_callback(
97 pi,
98 [](void* cookie, const char*, const char* value, uint32_t) {
99 *reinterpret_cast<bool*>(cookie) = ParseBool(value) == ParseBoolResult::kTrue;
100 },
101 &cookie);
102 return cookie;
103 }
104
105 static bool is_permissive_mte() {
106 // Environment variable for testing or local use from shell.
107 char* permissive_env = getenv("MTE_PERMISSIVE");
108 char process_sysprop_name[512];
109 async_safe_format_buffer(process_sysprop_name, sizeof(process_sysprop_name),
110 "persist.device_config.memory_safety_native.permissive.process.%s",
111 getprogname());
112 // DO NOT REPLACE this with GetBoolProperty. That uses std::string which allocates, so it is
113 // not async-safe, and this function gets used in a signal handler.
114 return property_parse_bool("persist.sys.mte.permissive") ||
115 property_parse_bool("persist.device_config.memory_safety_native.permissive.default") ||
116 property_parse_bool(process_sysprop_name) ||
117 (permissive_env && ParseBool(permissive_env) == ParseBoolResult::kTrue);
118 }
119
120 static bool parse_uint_with_error_reporting(const char* s, const char* name, int* v) {
121 if (android::base::ParseInt(s, v) && *v >= 0) {
122 return true;
123 }
124 async_safe_format_log(ANDROID_LOG_ERROR, "libc", "invalid %s: %s", name, s);
125 return false;
126 }
127
128 // We cannot use base::GetIntProperty, because that internally uses
129 // std::string, which allocates.
130 static bool property_parse_int(const char* name, int* out) {
131 const prop_info* pi = __system_property_find(name);
132 if (!pi) return false;
133 struct cookie_t {
134 int* out;
135 bool empty;
136 } cookie{out, true};
137 __system_property_read_callback(
138 pi,
139 [](void* raw_cookie, const char* name, const char* value, uint32_t) {
140 // Property is set to empty value, ignoring.
141 if (!*value) return;
142 cookie_t* cookie = reinterpret_cast<cookie_t*>(raw_cookie);
143 if (parse_uint_with_error_reporting(value, name, cookie->out)) cookie->empty = false;
144 },
145 &cookie);
146 return !cookie.empty;
147 }
148
149 static int permissive_mte_renable_timer() {
150 if (char* env = getenv("MTE_PERMISSIVE_REENABLE_TIME_CPUMS")) {
151 int v;
152 if (parse_uint_with_error_reporting(env, "MTE_PERMISSIVE_REENABLE_TIME_CPUMS", &v)) return v;
153 }
154
155 char process_sysprop_name[512];
156 async_safe_format_buffer(process_sysprop_name, sizeof(process_sysprop_name),
157 "persist.sys.mte.permissive_reenable_timer.process.%s", getprogname());
158 int v;
159 if (property_parse_int(process_sysprop_name, &v)) return v;
160 if (property_parse_int("persist.sys.mte.permissive_reenable_timer.default", &v)) return v;
161 char process_deviceconf_sysprop_name[512];
162 async_safe_format_buffer(
163 process_deviceconf_sysprop_name, sizeof(process_deviceconf_sysprop_name),
164 "persist.device_config.memory_safety_native.permissive_reenable_timer.process.%s",
165 getprogname());
166 if (property_parse_int(process_deviceconf_sysprop_name, &v)) return v;
167 if (property_parse_int(
168 "persist.device_config.memory_safety_native.permissive_reenable_timer.default", &v))
169 return v;
170 return 0;
171 }
172
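// The pseudothread is created with CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID, so the kernel
// stores its tid into pseudothread_tid when it starts and clears it (with a futex wake) when it
// exits; futex_wait() lets the signal handler block on both transitions.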
173 static inline void futex_wait(volatile void* ftx, int value) {
174 syscall(__NR_futex, ftx, FUTEX_WAIT, value, nullptr, nullptr, 0);
175 }
176
177 class ErrnoRestorer {
178 public:
179   ErrnoRestorer() : saved_errno_(errno) {
180 }
181
182   ~ErrnoRestorer() {
183 errno = saved_errno_;
184 }
185
186 private:
187 int saved_errno_;
188 };
189
190 extern "C" void* android_fdsan_get_fd_table();
191 extern "C" void debuggerd_fallback_handler(siginfo_t*, ucontext_t*, void*);
192
193 static debuggerd_callbacks_t g_callbacks;
194
195 // Mutex to ensure only one crashing thread dumps itself.
196 static pthread_mutex_t crash_mutex = PTHREAD_MUTEX_INITIALIZER;
197
198 // Don't use async_safe_fatal because it exits via abort, which might put us back into
199 // a signal handler.
200 static void __noreturn __printflike(1, 2) fatal(const char* fmt, ...) {
201 va_list args;
202 va_start(args, fmt);
203 async_safe_format_log_va_list(ANDROID_LOG_FATAL, "libc", fmt, args);
204 _exit(1);
205 }
206
207 static void __noreturn __printflike(1, 2) fatal_errno(const char* fmt, ...) {
208 int err = errno;
209 va_list args;
210 va_start(args, fmt);
211
212 char buf[256];
213 async_safe_format_buffer_va_list(buf, sizeof(buf), fmt, args);
214 fatal("%s: %s", buf, strerror(err));
215 }
216
217 static bool get_main_thread_name(char* buf, size_t len) {
218 unique_fd fd(open("/proc/self/comm", O_RDONLY | O_CLOEXEC));
219 if (fd == -1) {
220 return false;
221 }
222
223 ssize_t rc = read(fd, buf, len);
224 if (rc == -1) {
225 return false;
226 } else if (rc == 0) {
227 // Should never happen?
228 return false;
229 }
230
231 // There's a trailing newline, replace it with a NUL.
232 buf[rc - 1] = '\0';
233 return true;
234 }
235
236 /*
237 * Writes a summary of the signal to the log file. We do this so that, if
238 * for some reason we're not able to contact debuggerd, there is still some
239 * indication of the failure in the log.
240 *
241 * We could be here as a result of native heap corruption, or while a
242 * mutex is being held, so we don't want to use any libc functions that
243 * could allocate memory or hold a lock.
244 */
245 static void log_signal_summary(const siginfo_t* si) {
246 char main_thread_name[MAX_TASK_NAME_LEN + 1];
247 if (!get_main_thread_name(main_thread_name, sizeof(main_thread_name))) {
248 strncpy(main_thread_name, "<unknown>", sizeof(main_thread_name));
249 }
250
251 if (si->si_signo == BIONIC_SIGNAL_DEBUGGER) {
252 async_safe_format_log(ANDROID_LOG_INFO, "libc", "Requested dump for pid %d (%s)", __getpid(),
253 main_thread_name);
254 return;
255 }
256
257 // Many signals don't have a sender or extra detail, but some do...
258 pid_t self_pid = __getpid();
259 char sender_desc[32] = {}; // " from pid 1234, uid 666"
260 if (signal_has_sender(si, self_pid)) {
261 get_signal_sender(sender_desc, sizeof(sender_desc), si);
262 }
263 char extra_desc[32] = {}; // ", fault addr 0x1234" or ", syscall 1234"
264 if (si->si_signo == SIGSYS && si->si_code == SYS_SECCOMP) {
265 async_safe_format_buffer(extra_desc, sizeof(extra_desc), ", syscall %d", si->si_syscall);
266 } else if (signal_has_si_addr(si)) {
267 async_safe_format_buffer(extra_desc, sizeof(extra_desc), ", fault addr %p", si->si_addr);
268 }
269
270 char thread_name[MAX_TASK_NAME_LEN + 1]; // one more for termination
271 if (prctl(PR_GET_NAME, reinterpret_cast<unsigned long>(thread_name), 0, 0, 0) != 0) {
272 strcpy(thread_name, "<name unknown>");
273 } else {
274 // short names are null terminated by prctl, but the man page
275 // implies that 16 byte names are not.
276 thread_name[MAX_TASK_NAME_LEN] = 0;
277 }
278
279 async_safe_format_log(ANDROID_LOG_FATAL, "libc",
280 "Fatal signal %d (%s), code %d (%s%s)%s in tid %d (%s), pid %d (%s)",
281 si->si_signo, get_signame(si), si->si_code, get_sigcode(si), sender_desc,
282 extra_desc, __gettid(), thread_name, self_pid, main_thread_name);
283 }
284
285 /*
286 * Returns true if the handler for signal "signum" has SA_SIGINFO set.
287 */
288 static bool have_siginfo(int signum) {
289 struct sigaction old_action;
290 if (sigaction(signum, nullptr, &old_action) < 0) {
291 async_safe_format_log(ANDROID_LOG_WARN, "libc", "Failed testing for SA_SIGINFO: %s",
292 strerror(errno));
293 return false;
294 }
295 return (old_action.sa_flags & SA_SIGINFO) != 0;
296 }
297
298 static void raise_caps() {
299 // Raise CapInh to match CapPrm, so that we can set the ambient bits.
300 __user_cap_header_struct capheader;
301 memset(&capheader, 0, sizeof(capheader));
302 capheader.version = _LINUX_CAPABILITY_VERSION_3;
303 capheader.pid = 0;
304
305 __user_cap_data_struct capdata[2];
306 if (capget(&capheader, &capdata[0]) == -1) {
307 fatal_errno("capget failed");
308 }
309
310 if (capdata[0].permitted != capdata[0].inheritable ||
311 capdata[1].permitted != capdata[1].inheritable) {
312 capdata[0].inheritable = capdata[0].permitted;
313 capdata[1].inheritable = capdata[1].permitted;
314
315 if (capset(&capheader, &capdata[0]) == -1) {
316 async_safe_format_log(ANDROID_LOG_ERROR, "libc", "capset failed: %s", strerror(errno));
317 }
318 }
319
320 // Set the ambient capability bits so that crash_dump gets all of our caps and can ptrace us.
321 uint64_t capmask = capdata[0].inheritable;
322 capmask |= static_cast<uint64_t>(capdata[1].inheritable) << 32;
323 for (unsigned long i = 0; i < 64; ++i) {
324 if (capmask & (1ULL << i)) {
325 if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, i, 0, 0) != 0) {
326 async_safe_format_log(ANDROID_LOG_ERROR, "libc",
327 "failed to raise ambient capability %lu: %s", i, strerror(errno));
328 }
329 }
330 }
331 }
332
333 // Double-clone, with CLONE_FILES to share the file descriptor table for kcmp validation.
334 // The intermediate child exits after the second clone; crash_dump (which is ptracing us) keeps the orphaned grandchild stopped so it can read memory from it. Any failure is fatal.
335 static void create_vm_process() {
336 pid_t first = clone(nullptr, nullptr, CLONE_FILES, nullptr);
337 if (first == -1) {
338 fatal_errno("failed to clone vm process");
339 } else if (first == 0) {
340 drop_capabilities();
341
342 if (clone(nullptr, nullptr, CLONE_FILES, nullptr) == -1) {
343 _exit(errno);
344 }
345
346 // crash_dump is ptracing both sides of the fork; it'll let the parent exit,
347 // but keep the orphan stopped to peek at its memory.
348
349 // There appears to be a bug in the kernel where our death causes SIGHUP to
350 // be sent to our process group if we exit while it has stopped jobs (e.g.
351 // because of wait_for_debugger). Use setsid to create a new process group to
352 // avoid hitting this.
353 setsid();
354
355 _exit(0);
356 }
357
358 int status;
359 if (TEMP_FAILURE_RETRY(waitpid(first, &status, __WCLONE)) != first) {
360 fatal_errno("failed to waitpid in double fork");
361 } else if (!WIFEXITED(status)) {
362 fatal("intermediate process didn't exit cleanly in double fork (status = %d)", status);
363 } else if (WEXITSTATUS(status)) {
364 fatal("second clone failed: %s", strerror(WEXITSTATUS(status)));
365 }
366 }
367
368 struct debugger_thread_info {
369 pid_t crashing_tid;
370 pid_t pseudothread_tid;
371 siginfo_t* siginfo;
372 void* ucontext;
373 debugger_process_info process_info;
374 };
375
376 // Logging and contacting debuggerd requires free file descriptors, which we might not have.
377 // Work around this by spawning a "thread" that shares its parent's address space, but not its file
378 // descriptor table, so that we can close random file descriptors without affecting the original
379 // process. Note that this doesn't go through pthread_create, so TLS is shared with the spawning
380 // process.
381 static void* pseudothread_stack;
382
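// A non-fatal dump is requested by queueing BIONIC_SIGNAL_DEBUGGER at this process with a sigval
// whose sival_int selects the dump type: nonzero asks for a backtrace, zero for a tombstone.
// A rough client-side sketch (the real sender lives in the debuggerd client code):
//   union sigval val = {};
//   val.sival_int = 1;  // nonzero => backtrace only
//   sigqueue(target_pid, BIONIC_SIGNAL_DEBUGGER, val);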
383 static DebuggerdDumpType get_dump_type(const debugger_thread_info* thread_info) {
384 if (thread_info->siginfo->si_signo == BIONIC_SIGNAL_DEBUGGER &&
385 thread_info->siginfo->si_value.sival_int) {
386 return kDebuggerdNativeBacktrace;
387 }
388
389 return kDebuggerdTombstoneProto;
390 }
391
392 static const char* get_unwind_type(const debugger_thread_info* thread_info) {
393 if (thread_info->siginfo->si_signo == BIONIC_SIGNAL_DEBUGGER) {
394 return "Unwind request";
395 }
396 return "Crash due to signal";
397 }
398
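// Runs on the pseudothread: close every inherited fd, re-point fds 0/1/2 at /dev/null, hand the
// crash info to crash_dump over a pipe, fork/exec the crash_dump helper, and wait for it to
// finish (or log why it could not).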
399 static int debuggerd_dispatch_pseudothread(void* arg) {
400 debugger_thread_info* thread_info = static_cast<debugger_thread_info*>(arg);
401
402 for (int i = 0; i < 1024; ++i) {
403 // Don't use close to avoid bionic's file descriptor ownership checks.
404 syscall(__NR_close, i);
405 }
406
407 int devnull = TEMP_FAILURE_RETRY(open("/dev/null", O_RDWR));
408 if (devnull == -1) {
409 fatal_errno("failed to open /dev/null");
410 } else if (devnull != 0) {
411     fatal("expected /dev/null fd to be 0, actually %d", devnull);
412 }
413
414 // devnull will be 0.
415 TEMP_FAILURE_RETRY(dup2(devnull, 1));
416 TEMP_FAILURE_RETRY(dup2(devnull, 2));
417
418 unique_fd input_read, input_write;
419 unique_fd output_read, output_write;
420   if (!Pipe(&input_read, &input_write) || !Pipe(&output_read, &output_write)) {
421 fatal_errno("failed to create pipe");
422 }
423
424 uint32_t version;
425 ssize_t expected;
426
427 // ucontext_t is absurdly large on AArch64, so piece it together manually with writev.
428 struct iovec iovs[4] = {
429 {.iov_base = &version, .iov_len = sizeof(version)},
430 {.iov_base = thread_info->siginfo, .iov_len = sizeof(siginfo_t)},
431 {.iov_base = thread_info->ucontext, .iov_len = sizeof(ucontext_t)},
432 };
433
434 constexpr size_t kHeaderSize = sizeof(version) + sizeof(siginfo_t) + sizeof(ucontext_t);
435
436 if (thread_info->process_info.fdsan_table) {
437 // Dynamic executables always use version 4. There is no need to increment the version number if
438 // the format changes, because the sender (linker) and receiver (crash_dump) are version locked.
439 version = 4;
440 expected = sizeof(CrashInfoHeader) + sizeof(CrashInfoDataDynamic);
441
442 static_assert(sizeof(CrashInfoHeader) + sizeof(CrashInfoDataDynamic) ==
443 kHeaderSize + sizeof(thread_info->process_info),
444 "Wire protocol structs do not match the data sent.");
445 #define ASSERT_SAME_OFFSET(MEMBER1, MEMBER2) \
446 static_assert(sizeof(CrashInfoHeader) + offsetof(CrashInfoDataDynamic, MEMBER1) == \
447 kHeaderSize + offsetof(debugger_process_info, MEMBER2), \
448 "Wire protocol offset does not match data sent: " #MEMBER1);
449 ASSERT_SAME_OFFSET(fdsan_table_address, fdsan_table);
450 ASSERT_SAME_OFFSET(gwp_asan_state, gwp_asan_state);
451 ASSERT_SAME_OFFSET(gwp_asan_metadata, gwp_asan_metadata);
452 ASSERT_SAME_OFFSET(scudo_stack_depot, scudo_stack_depot);
453 ASSERT_SAME_OFFSET(scudo_region_info, scudo_region_info);
454 ASSERT_SAME_OFFSET(scudo_ring_buffer, scudo_ring_buffer);
455 ASSERT_SAME_OFFSET(scudo_ring_buffer_size, scudo_ring_buffer_size);
456 ASSERT_SAME_OFFSET(scudo_stack_depot_size, scudo_stack_depot_size);
457 ASSERT_SAME_OFFSET(recoverable_crash, recoverable_crash);
458 ASSERT_SAME_OFFSET(crash_detail_page, crash_detail_page);
459 #undef ASSERT_SAME_OFFSET
460
461 iovs[3] = {.iov_base = &thread_info->process_info,
462 .iov_len = sizeof(thread_info->process_info)};
463 } else {
464 // Static executables always use version 1.
465 version = 1;
466 expected = sizeof(CrashInfoHeader) + sizeof(CrashInfoDataStatic);
467
468 static_assert(
469 sizeof(CrashInfoHeader) + sizeof(CrashInfoDataStatic) == kHeaderSize + sizeof(uintptr_t),
470 "Wire protocol structs do not match the data sent.");
471
472 iovs[3] = {.iov_base = &thread_info->process_info.abort_msg, .iov_len = sizeof(uintptr_t)};
473 }
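  // Grow the pipe so the entire payload fits in one writev() even though crash_dump has not
  // started reading yet. errno is cleared first because F_SETPIPE_SZ returns the resulting size
  // rather than 0, so a short-but-successful return would otherwise report a stale errno.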
474 errno = 0;
475 if (fcntl(output_write.get(), F_SETPIPE_SZ, expected) < static_cast<int>(expected)) {
476 fatal_errno("failed to set pipe buffer size");
477 }
478
479 ssize_t rc = TEMP_FAILURE_RETRY(writev(output_write.get(), iovs, arraysize(iovs)));
480 if (rc == -1) {
481 fatal_errno("failed to write crash info");
482 } else if (rc != expected) {
483 fatal("failed to write crash info, wrote %zd bytes, expected %zd", rc, expected);
484 }
485
486 // Don't use fork(2) to avoid calling pthread_atfork handlers.
487 pid_t crash_dump_pid = _Fork();
488 if (crash_dump_pid == -1) {
489 async_safe_format_log(ANDROID_LOG_FATAL, "libc",
490 "failed to fork in debuggerd signal handler: %s", strerror(errno));
491 } else if (crash_dump_pid == 0) {
492 TEMP_FAILURE_RETRY(dup2(input_write.get(), STDOUT_FILENO));
493 TEMP_FAILURE_RETRY(dup2(output_read.get(), STDIN_FILENO));
494 input_read.reset();
495 input_write.reset();
496 output_read.reset();
497 output_write.reset();
498
499 raise_caps();
500
501 char main_tid[10];
502 char pseudothread_tid[10];
503 char debuggerd_dump_type[10];
504 async_safe_format_buffer(main_tid, sizeof(main_tid), "%d", thread_info->crashing_tid);
505 async_safe_format_buffer(pseudothread_tid, sizeof(pseudothread_tid), "%d",
506 thread_info->pseudothread_tid);
507 async_safe_format_buffer(debuggerd_dump_type, sizeof(debuggerd_dump_type), "%d",
508 get_dump_type(thread_info));
509
510 execle(CRASH_DUMP_PATH, CRASH_DUMP_NAME, main_tid, pseudothread_tid, debuggerd_dump_type,
511 nullptr, nullptr);
512 async_safe_format_log(ANDROID_LOG_FATAL, "libc", "%s: failed to exec crash_dump helper: %s",
513 get_unwind_type(thread_info), strerror(errno));
514 return 1;
515 }
516
517 input_write.reset();
518 output_read.reset();
519
520 // crash_dump will ptrace and pause all of our threads, and then write to the pipe to tell
521 // us to fork off a process to read memory from.
522 char buf[4];
523 rc = TEMP_FAILURE_RETRY(read(input_read.get(), &buf, sizeof(buf)));
524
525 bool success = false;
526 if (rc == 1 && buf[0] == '\1') {
527 // crash_dump successfully started, and is ptracing us.
528 // Fork off a copy of our address space for it to use.
529 create_vm_process();
530 success = true;
531 } else {
532 // Something went wrong, log it.
533 if (rc == -1) {
534 async_safe_format_log(ANDROID_LOG_FATAL, "libc", "%s: read of IPC pipe failed: %s",
535 get_unwind_type(thread_info), strerror(errno));
536 } else if (rc == 0) {
537 async_safe_format_log(ANDROID_LOG_FATAL, "libc",
538 "%s: crash_dump helper failed to exec, or was killed",
539 get_unwind_type(thread_info));
540 } else if (rc != 1) {
541 async_safe_format_log(ANDROID_LOG_FATAL, "libc",
542 "%s: read of IPC pipe returned unexpected value: %zd",
543 get_unwind_type(thread_info), rc);
544 } else if (buf[0] != '\1') {
545 async_safe_format_log(ANDROID_LOG_FATAL, "libc", "%s: crash_dump helper reported failure",
546 get_unwind_type(thread_info));
547 }
548 }
549
550 // Don't leave a zombie child.
551 int status;
552 if (TEMP_FAILURE_RETRY(waitpid(crash_dump_pid, &status, 0)) == -1) {
553 async_safe_format_log(ANDROID_LOG_FATAL, "libc", "%s: failed to wait for crash_dump helper: %s",
554 get_unwind_type(thread_info), strerror(errno));
555 } else if (WIFSTOPPED(status) || WIFSIGNALED(status)) {
556 async_safe_format_log(ANDROID_LOG_FATAL, "libc", "%s: crash_dump helper crashed or stopped",
557 get_unwind_type(thread_info));
558 }
559
560 if (success) {
561 if (thread_info->siginfo->si_signo != BIONIC_SIGNAL_DEBUGGER) {
562 // For crashes, we don't need to minimize pause latency.
563 // Wait for the dump to complete before having the process exit, to avoid being murdered by
564 // ActivityManager or init.
565 TEMP_FAILURE_RETRY(read(input_read, &buf, sizeof(buf)));
566 }
567 }
568
569 return success ? 0 : 1;
570 }
571
572 static void resend_signal(siginfo_t* info) {
573 // Signals can either be fatal or nonfatal.
574 // For fatal signals, crash_dump will send us the signal we crashed with
575 // before resuming us, so that processes using waitpid on us will see that we
576 // exited with the correct exit status (e.g. so that sh will report
577 // "Segmentation fault" instead of "Killed"). For this to work, we need
578 // to deregister our signal handler for that signal before continuing.
579 if (info->si_signo != BIONIC_SIGNAL_DEBUGGER) {
580 signal(info->si_signo, SIG_DFL);
581 int rc = syscall(SYS_rt_tgsigqueueinfo, __getpid(), __gettid(), info->si_signo, info);
582 if (rc != 0) {
583 fatal_errno("failed to resend signal during crash");
584 }
585 }
586 }
587
588 // Handler that does crash dumping by forking and doing the processing in the child.
589 // Do this by ptracing the relevant thread, and then execing debuggerd to do the actual dump.
590 static void debuggerd_signal_handler(int signal_number, siginfo_t* info, void* context) {
591 // Make sure we don't change the value of errno, in case a signal comes in between the process
592 // making a syscall and checking errno.
593 ErrnoRestorer restorer;
594
595 auto *ucontext = static_cast<ucontext_t*>(context);
596
597 // It's possible somebody cleared the SA_SIGINFO flag, which would mean
598 // our "info" arg holds an undefined value.
599 if (!have_siginfo(signal_number)) {
600 info = nullptr;
601 }
602
603 struct siginfo dummy_info = {};
604 if (!info) {
605 memset(&dummy_info, 0, sizeof(dummy_info));
606 dummy_info.si_signo = signal_number;
607 dummy_info.si_code = SI_USER;
608 dummy_info.si_pid = __getpid();
609 dummy_info.si_uid = getuid();
610 info = &dummy_info;
611 } else if (info->si_code >= 0 || info->si_code == SI_TKILL) {
612 // rt_tgsigqueueinfo(2)'s documentation appears to be incorrect on kernels
613 // that contain commit 66dd34a (3.9+). The manpage claims to only allow
614 // negative si_code values that are not SI_TKILL, but 66dd34a changed the
615 // check to allow all si_code values in calls coming from inside the house.
616 }
617
618 debugger_process_info process_info = {};
619 if (g_callbacks.get_process_info) {
620 process_info = g_callbacks.get_process_info();
621 }
622 uintptr_t si_val = reinterpret_cast<uintptr_t>(info->si_ptr);
623 if (signal_number == BIONIC_SIGNAL_DEBUGGER) {
624 // Applications can set abort messages via android_set_abort_message without
625 // actually aborting; ignore those messages in non-fatal dumps.
626 process_info.abort_msg = nullptr;
627 if (info->si_code == SI_QUEUE && info->si_pid == __getpid()) {
628 // Allow for the abort message to be explicitly specified via the sigqueue value.
629 // Keep the bottom bit intact for representing whether we want a backtrace or a tombstone.
630 if (si_val != kDebuggerdFallbackSivalUintptrRequestDump) {
631 process_info.abort_msg = reinterpret_cast<void*>(si_val & ~1);
632 info->si_ptr = reinterpret_cast<void*>(si_val & 1);
633 }
634 }
635 }
636
637 gwp_asan_callbacks_t gwp_asan_callbacks = {};
638 bool recoverable_gwp_asan_crash = false;
639 if (g_callbacks.get_gwp_asan_callbacks != nullptr) {
640 // GWP-ASan catches use-after-free and heap-buffer-overflow by using PROT_NONE
641 // guard pages, which lead to SEGV. Normally, debuggerd prints a bug report
642 // and the process terminates, but in some cases, we actually want to print
643 // the bug report and let the signal handler return, and restart the process.
644 // In order to do that, we need to disable GWP-ASan's guard pages. The
645 // following callbacks handle this case.
646 gwp_asan_callbacks = g_callbacks.get_gwp_asan_callbacks();
647 if (signal_number == SIGSEGV && signal_has_si_addr(info) &&
648 gwp_asan_callbacks.debuggerd_needs_gwp_asan_recovery &&
649 gwp_asan_callbacks.debuggerd_gwp_asan_pre_crash_report &&
650 gwp_asan_callbacks.debuggerd_gwp_asan_post_crash_report &&
651 gwp_asan_callbacks.debuggerd_needs_gwp_asan_recovery(info->si_addr)) {
652 gwp_asan_callbacks.debuggerd_gwp_asan_pre_crash_report(info->si_addr);
653 recoverable_gwp_asan_crash = true;
654 process_info.recoverable_crash = true;
655 }
656 }
657
658 if (info->si_signo == SIGSEGV &&
659 (info->si_code == SEGV_MTESERR || info->si_code == SEGV_MTEAERR) && is_permissive_mte()) {
660 process_info.recoverable_crash = true;
661 // If we are in permissive MTE mode, we do not crash, but instead disable MTE on this thread,
662 // and then let the failing instruction be retried. The second time should work (except
663 // if there is another non-MTE fault).
664 int tagged_addr_ctrl = prctl(PR_GET_TAGGED_ADDR_CTRL, 0, 0, 0, 0);
665 if (tagged_addr_ctrl < 0) {
666 fatal_errno("failed to PR_GET_TAGGED_ADDR_CTRL");
667 }
668 int previous = tagged_addr_ctrl & PR_MTE_TCF_MASK;
669 tagged_addr_ctrl = (tagged_addr_ctrl & ~PR_MTE_TCF_MASK) | PR_MTE_TCF_NONE;
670 if (prctl(PR_SET_TAGGED_ADDR_CTRL, tagged_addr_ctrl, 0, 0, 0) < 0) {
671 fatal_errno("failed to PR_SET_TAGGED_ADDR_CTRL");
672 }
673 if (int reenable_timer = permissive_mte_renable_timer()) {
674 async_safe_format_log(ANDROID_LOG_ERROR, "libc",
675 "MTE ERROR DETECTED BUT RUNNING IN PERMISSIVE MODE. CONTINUING WITH "
676 "MTE DISABLED FOR %d MS OF CPU TIME.",
677 reenable_timer);
678 timer_t timerid{};
679 struct sigevent sev {};
680 sev.sigev_signo = BIONIC_ENABLE_MTE;
681 sev.sigev_notify = SIGEV_THREAD_ID;
682 sev.sigev_value.sival_int = previous;
683 sev.sigev_notify_thread_id = __gettid();
684 // This MUST be CLOCK_THREAD_CPUTIME_ID. If we used CLOCK_MONOTONIC we could get stuck
685 // in an endless loop of re-running the same instruction, calling this signal handler,
686 // and re-enabling MTE before we had a chance to re-run the instruction.
687 if (timer_create(CLOCK_THREAD_CPUTIME_ID, &sev, &timerid) == -1) {
688 fatal_errno("timer_create() failed");
689 }
690 struct itimerspec its {};
691 its.it_value.tv_sec = reenable_timer / 1000;
692 its.it_value.tv_nsec = (reenable_timer % 1000) * 1000000;
693
694 if (timer_settime(timerid, 0, &its, nullptr) == -1) {
695 fatal_errno("timer_settime() failed");
696 }
697 } else {
698 async_safe_format_log(
699 ANDROID_LOG_ERROR, "libc",
700 "MTE ERROR DETECTED BUT RUNNING IN PERMISSIVE MODE. CONTINUING WITH MTE DISABLED.");
701 }
702 pthread_mutex_unlock(&crash_mutex);
703 }
704
705 // If sival_int is ~0, it means that the fallback handler has been called
706 // once before and this function is being called again to dump the stack
707 // of a specific thread. It is possible that the prctl call might return 1,
708 // then return 0 in subsequent calls, so check the sival_int to determine if
709 // the fallback handler should be called first.
710 bool no_new_privs = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0) == 1;
711 if (si_val == kDebuggerdFallbackSivalUintptrRequestDump || no_new_privs) {
712 // This check might be racy if another thread sets NO_NEW_PRIVS, but this should be unlikely,
713 // you can only set NO_NEW_PRIVS to 1, and the effect should be at worst a single missing
714 // ANR trace.
715 debuggerd_fallback_handler(info, ucontext, process_info.abort_msg);
716 if (no_new_privs && recoverable_gwp_asan_crash) {
717 gwp_asan_callbacks.debuggerd_gwp_asan_post_crash_report(info->si_addr);
718 return;
719 }
720 resend_signal(info);
721 return;
722 }
723
724 // Only allow one thread to handle a signal at a time.
725 int ret = pthread_mutex_lock(&crash_mutex);
726 if (ret != 0) {
727 async_safe_format_log(ANDROID_LOG_INFO, "libc", "pthread_mutex_lock failed: %s", strerror(ret));
728 return;
729 }
730
731 log_signal_summary(info);
732
733 // If we got here due to the signal BIONIC_SIGNAL_DEBUGGER, it's possible
734 // this is not the main thread, which can cause the intercept logic to fail
735 // since the intercept is only looking for the main thread. In this case,
736 // setting crashing_tid to pid instead of the current thread's tid avoids
737 // the problem.
738 debugger_thread_info thread_info = {
739 .crashing_tid = (signal_number == BIONIC_SIGNAL_DEBUGGER) ? __getpid() : __gettid(),
740 .pseudothread_tid = -1,
741 .siginfo = info,
742 .ucontext = context,
743 .process_info = process_info,
744 };
745
746 // Set PR_SET_DUMPABLE to 1, so that crash_dump can ptrace us.
747 int orig_dumpable = prctl(PR_GET_DUMPABLE);
748 if (prctl(PR_SET_DUMPABLE, 1) != 0) {
749 fatal_errno("failed to set dumpable");
750 }
751
752 // On kernels with yama_ptrace enabled, also allow any process to attach.
753 bool restore_orig_ptracer = true;
754 if (prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY) != 0) {
755 if (errno == EINVAL) {
756 // This kernel does not support PR_SET_PTRACER_ANY, or Yama is not enabled.
757 restore_orig_ptracer = false;
758 } else {
759 fatal_errno("failed to set traceable");
760 }
761 }
762
763 // Essentially pthread_create without CLONE_FILES, so we still work during file descriptor
764 // exhaustion.
765 pid_t child_pid =
766 clone(debuggerd_dispatch_pseudothread, pseudothread_stack,
767 CLONE_THREAD | CLONE_SIGHAND | CLONE_VM | CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID,
768 &thread_info, nullptr, nullptr, &thread_info.pseudothread_tid);
769 if (child_pid == -1) {
770 fatal_errno("failed to spawn debuggerd dispatch thread");
771 }
772
773 // Wait for the child to start...
774 futex_wait(&thread_info.pseudothread_tid, -1);
775
776 // and then wait for it to terminate.
777 futex_wait(&thread_info.pseudothread_tid, child_pid);
778
779 // Restore PR_SET_DUMPABLE to its original value.
780 if (prctl(PR_SET_DUMPABLE, orig_dumpable) != 0) {
781 fatal_errno("failed to restore dumpable");
782 }
783
784 // Restore PR_SET_PTRACER to its original value.
785 if (restore_orig_ptracer && prctl(PR_SET_PTRACER, 0) != 0) {
786 fatal_errno("failed to restore traceable");
787 }
788
789 if (info->si_signo == BIONIC_SIGNAL_DEBUGGER) {
790 // If the signal is fatal, don't unlock the mutex to prevent other crashing threads from
791 // starting to dump right before our death.
792 pthread_mutex_unlock(&crash_mutex);
793 } else if (process_info.recoverable_crash) {
794 if (recoverable_gwp_asan_crash) {
795 gwp_asan_callbacks.debuggerd_gwp_asan_post_crash_report(info->si_addr);
796 }
797 pthread_mutex_unlock(&crash_mutex);
798 }
799 #ifdef __aarch64__
800 else if (info->si_signo == SIGSEGV && info->si_code == SEGV_MTEAERR && getppid() == 1) {
801 // Back channel to init (see system/core/init/service.cpp) to signal that
802 // this process crashed due to an ASYNC MTE fault and should be considered
803 // for upgrade to SYNC mode. We are re-using the ART profiler signal, which
804 // is always handled (ignored in native processes, handled for generating a
805 // dump in ART processes), so a process will never crash from this signal
806 // except from here.
807 // The kernel is not particularly receptive to adding this information:
808 // https://lore.kernel.org/all/[email protected]/, so we work around
809 // like this.
810 info->si_signo = BIONIC_SIGNAL_ART_PROFILER;
811 resend_signal(info);
812 }
813 #endif
814 else {
815 // Resend the signal, so that either the debugger or the parent's waitpid sees it.
816 resend_signal(info);
817 }
818 }
819
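// Wires the crash handler into the process. A minimal caller sketch (the callback names here are
// hypothetical; in practice bionic supplies the callbacks during libc startup):
//   debuggerd_callbacks_t callbacks = {};
//   callbacks.get_process_info = my_get_process_info;        // abort message, fdsan table, ...
//   callbacks.get_gwp_asan_callbacks = my_get_gwp_asan_cbs;  // optional recovery hooks
//   debuggerd_init(&callbacks);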
820 void debuggerd_init(debuggerd_callbacks_t* callbacks) {
821 if (callbacks) {
822 g_callbacks = *callbacks;
823 }
824
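  // Reserve the pseudothread's stack with an extra PROT_NONE page at each end; only the middle
  // thread_stack_pages are made readable/writable below, so running off either end faults
  // immediately instead of corrupting neighboring mappings.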
825 size_t thread_stack_pages = 8;
826 void* thread_stack_allocation = mmap(nullptr, getpagesize() * (thread_stack_pages + 2), PROT_NONE,
827 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
828 if (thread_stack_allocation == MAP_FAILED) {
829 fatal_errno("failed to allocate debuggerd thread stack");
830 }
831
832 char* stack = static_cast<char*>(thread_stack_allocation) + getpagesize();
833 if (mprotect(stack, getpagesize() * thread_stack_pages, PROT_READ | PROT_WRITE) != 0) {
834 fatal_errno("failed to mprotect debuggerd thread stack");
835 }
836
837 // Stack grows negatively, set it to the last byte in the page...
838 stack = (stack + thread_stack_pages * getpagesize() - 1);
839 // and align it.
840 stack -= 15;
841 pseudothread_stack = stack;
842
843 struct sigaction action;
844 memset(&action, 0, sizeof(action));
845 sigfillset(&action.sa_mask);
846 action.sa_sigaction = debuggerd_signal_handler;
847 action.sa_flags = SA_RESTART | SA_SIGINFO;
848
849 // Use the alternate signal stack if available so we can catch stack overflows.
850 action.sa_flags |= SA_ONSTACK;
851
852 // Request that the kernel set tag bits in the fault address. This is necessary for diagnosing MTE
853 // faults.
854 action.sa_flags |= SA_EXPOSE_TAGBITS;
855
856 debuggerd_register_handlers(&action);
857 }
858
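// Called (via debuggerd_handle_signal below) when sigchain sees a SIGSEGV that GWP-ASan says it
// can recover from. Returns true if the fault was handled here, false if normal crash handling
// should proceed.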
859 bool debuggerd_handle_gwp_asan_signal(int signal_number, siginfo_t* info, void* context) {
860 if (g_callbacks.get_gwp_asan_callbacks == nullptr) return false;
861 gwp_asan_callbacks_t gwp_asan_callbacks = g_callbacks.get_gwp_asan_callbacks();
862 if (gwp_asan_callbacks.debuggerd_needs_gwp_asan_recovery == nullptr ||
863 gwp_asan_callbacks.debuggerd_gwp_asan_pre_crash_report == nullptr ||
864 gwp_asan_callbacks.debuggerd_gwp_asan_post_crash_report == nullptr ||
865 !gwp_asan_callbacks.debuggerd_needs_gwp_asan_recovery(info->si_addr)) {
866 return false;
867 }
868
869 // Only dump a crash report for the first GWP-ASan crash. ActivityManager
870 // doesn't like it when an app crashes multiple times, and is even more strict
871 // about an app crashing multiple times in a short time period. While the app
872 // won't crash fully when we do GWP-ASan recovery, ActivityManager still gets
873 // the information about the crash through the DropBoxManager service. If an
874 // app has multiple back-to-back GWP-ASan crashes, this would lead to the app
875 // being killed, which defeats the purpose of having the recoverable mode. To
876 // mitigate against this, only generate a debuggerd crash report for the first
877 // GWP-ASan crash encountered. We still need to do the patching up of the
878 // allocator though, so do that.
879 static pthread_mutex_t first_crash_mutex = PTHREAD_MUTEX_INITIALIZER;
880 pthread_mutex_lock(&first_crash_mutex);
881 static bool first_crash = true;
882
883 if (first_crash) {
884 // `debuggerd_signal_handler` will call
885 // `debuggerd_gwp_asan_(pre|post)_crash_report`, so no need to manually call
886 // them here.
887 debuggerd_signal_handler(signal_number, info, context);
888 first_crash = false;
889 } else {
890 gwp_asan_callbacks.debuggerd_gwp_asan_pre_crash_report(info->si_addr);
891 gwp_asan_callbacks.debuggerd_gwp_asan_post_crash_report(info->si_addr);
892 }
893
894 pthread_mutex_unlock(&first_crash_mutex);
895 return true;
896 }
897
898 // When debuggerd's signal handler is the first handler called, it's great at
899 // handling the recoverable GWP-ASan and permissive MTE modes. For apps,
900 // sigchain (from libart) is always the first signal handler, and so the
901 // following function is what sigchain must call before processing the signal.
902 // This allows for processing of a potentially recoverable GWP-ASan or MTE
903 // crash. If the signal requires recovery, then dump a report (via the regular
904 // debuggerd handler), and patch up the allocator (in the case of GWP-ASan) or
905 // disable MTE on the thread, and allow the process to continue (indicated by
906 // returning 'true'). If the crash has nothing to do with GWP-ASan/MTE, or
907 // recovery isn't possible, return 'false'.
908 bool debuggerd_handle_signal(int signal_number, siginfo_t* info, void* context) {
909 if (signal_number != SIGSEGV) return false;
910 if (info->si_code == SEGV_MTEAERR || info->si_code == SEGV_MTESERR) {
911 if (!is_permissive_mte()) return false;
912 // Because permissive MTE disables MTE for the entire thread, we're less
913 // worried about getting a whole bunch of crashes in a row. ActivityManager
914 // doesn't like multiple native crashes for an app in a short period of time
915 // (see the comment about recoverable GWP-ASan in
916 // `debuggerd_handle_gwp_asan_signal`), but that shouldn't happen if MTE is
917 // disabled for the entire thread. This might need to be changed if there's
918 // some low-hanging bug that happens across multiple threads in quick
919 // succession.
920 debuggerd_signal_handler(signal_number, info, context);
921 return true;
922 }
923
924 if (!signal_has_si_addr(info)) return false;
925 return debuggerd_handle_gwp_asan_signal(signal_number, info, context);
926 }
927