
In this PR we are adding the functionality to collect all the process's threads' backtraces. ## Changes made in this PR ### **introduce threads mngr API** The **threads mngr API** which has 2 abilities: * `ThreadsManager_init() `- register to SIGUSR2. called on the server start-up. * ` ThreadsManager_runOnThreads()` - receives a list of a pid_t and a callback, tells every thread in the list to invoke the callback, and returns the output collected by each invocation. **Elaborating atomicvar API** * `atomicIncrGet(var,newvalue_var,count) `-- Increment and get the atomic counter new value * `atomicFlagGetSet` -- Get and set the atomic counter value to 1 ### **Always set SIGALRM handler** SIGALRM handler prints the process's stacktrace to the log file. Up until now, it was set only if the `server.watchdog_period` > 0. This can be also useful if debugging is needed. However, in situations where the server can't get requests, (a deadlock, for example) we weren't able to change the signal handler. To make it available at run time we set SIGALRM handler on server startup. The signal handler name was changed to a more general `sigalrmSignalHandler`. ### **Print all the process' threads' stacktraces** `logStackTrace()` now calls `writeStacktraces()`, instead of logging the current thread stacktrace. `writeStacktraces()`: * On Linux systems we use the threads manager API to collect the backtraces of all the process' threads. To get the `tids` list (threads ids) we read the `/proc/<redis-server-pid>/tasks` file which includes a list of directories. Each directory name corresponds to one tid (including the main thread). For each thread, we also need to check if it can get the signal from the threads manager (meaning it is not blocking/ignoring that signal). We send the threads manager this tids list and `collect_stacktrace_data()` callback, which collects the thread's backtrace addresses, its name, and tid. 
* On other systems, the behavior remained as it was (writing only the current thread stacktrace to the log file). ## compatibility notes 1. **The threads mngr API is only supported on Linux.** 2. glibc earlier than 2.3: We use `syscall(SYS_gettid)` and `syscall(SYS_tgkill...)` because their dedicated alternatives (`gettid()` and `tgkill`) were added in glibc 2.3. ## Output example Each thread backtrace will have the following format: `<tid> <thread_name> [additional_info]` * **tid**: as read from the `/proc/<redis-server-pid>/tasks` file * **thread_name**: the thread name as it is registered in the OS * **additional_info**: Sometimes we want to add specific information about one of the threads. Currently, it is only used to mark the thread that handles the backtraces collection by adding "*". In case of a crash, this also indicates which thread caused the crash. The handling thread won't necessarily appear first. ``` ------ STACK TRACE ------ EIP: /lib/aarch64-linux-gnu/libc.so.6(epoll_pwait+0x9c)[0xffffb9295ebc] 67089 redis-server * linux-vdso.so.1(__kernel_rt_sigreturn+0x0)[0xffffb9437790] /lib/aarch64-linux-gnu/libc.so.6(epoll_pwait+0x9c)[0xffffb9295ebc] redis-server *:6379(+0x75e0c)[0xaaaac2fe5e0c] redis-server *:6379(aeProcessEvents+0x18c)[0xaaaac2fe6c00] redis-server *:6379(aeMain+0x24)[0xaaaac2fe7038] redis-server *:6379(main+0xe0c)[0xaaaac3001afc] /lib/aarch64-linux-gnu/libc.so.6(+0x273fc)[0xffffb91d73fc] /lib/aarch64-linux-gnu/libc.so.6(__libc_start_main+0x98)[0xffffb91d74cc] redis-server *:6379(_start+0x30)[0xaaaac2fe0370] 67093 bio_lazy_free /lib/aarch64-linux-gnu/libc.so.6(+0x79dfc)[0xffffb9229dfc] /lib/aarch64-linux-gnu/libc.so.6(pthread_cond_wait+0x208)[0xffffb922c8fc] redis-server *:6379(bioProcessBackgroundJobs+0x174)[0xaaaac30976e8] /lib/aarch64-linux-gnu/libc.so.6(+0x7d5c8)[0xffffb922d5c8] /lib/aarch64-linux-gnu/libc.so.6(+0xe5d1c)[0xffffb9295d1c] 67091 bio_close_file /lib/aarch64-linux-gnu/libc.so.6(+0x79dfc)[0xffffb9229dfc] 
/lib/aarch64-linux-gnu/libc.so.6(pthread_cond_wait+0x208)[0xffffb922c8fc] redis-server *:6379(bioProcessBackgroundJobs+0x174)[0xaaaac30976e8] /lib/aarch64-linux-gnu/libc.so.6(+0x7d5c8)[0xffffb922d5c8] /lib/aarch64-linux-gnu/libc.so.6(+0xe5d1c)[0xffffb9295d1c] 67092 bio_aof /lib/aarch64-linux-gnu/libc.so.6(+0x79dfc)[0xffffb9229dfc] /lib/aarch64-linux-gnu/libc.so.6(pthread_cond_wait+0x208)[0xffffb922c8fc] redis-server *:6379(bioProcessBackgroundJobs+0x174)[0xaaaac30976e8] /lib/aarch64-linux-gnu/libc.so.6(+0x7d5c8)[0xffffb922d5c8] /lib/aarch64-linux-gnu/libc.so.6(+0xe5d1c)[0xffffb9295d1c] 67089:signal-handler (1693824528) -------- ```
198 lines
6.4 KiB
C
198 lines
6.4 KiB
C
/*
|
|
* Copyright (c) 2021, Redis Ltd.
|
|
* All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are met:
|
|
*
|
|
* * Redistributions of source code must retain the above copyright notice,
|
|
* this list of conditions and the following disclaimer.
|
|
* * Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* * Neither the name of Redis nor the names of its contributors may be used
|
|
* to endorse or promote products derived from this software without
|
|
* specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
#include "threads_mngr.h"
|
|
/* Silences "unused parameter/variable" compiler warnings by evaluating the
 * argument as a discarded void expression.
 * NOTE(review): a header included further down may define this same macro;
 * the replacement list is kept token-identical so that such a redefinition
 * stays benign -- confirm before changing it. */
#define UNUSED(V) ((void) V)
|
|
|
|
#ifdef __linux__
|
|
#include "zmalloc.h"
|
|
#include "atomicvar.h"
|
|
#include "server.h"
|
|
|
|
#include <signal.h>
|
|
#include <time.h>
|
|
#include <errno.h>
|
|
#include <semaphore.h>
|
|
#include <sys/syscall.h>
|
|
|
|
/* Value returned by test_and_start() when another run is already active. */
#define IN_PROGRESS 1

/* Maximum time, in seconds, that ThreadsManager_runOnThreads() waits for the
 * signaled threads. The value is added to timespec.tv_sec, hence time_t
 * (clock_t denotes processor-time ticks and is the wrong unit here). */
static const time_t RUN_ON_THREADS_TIMEOUT = 2;
|
|
|
|
/*================================= Globals ================================= */
|
|
|
|
static run_on_thread_cb g_callback = NULL;
|
|
static volatile size_t g_tids_len = 0;
|
|
static void **g_output_array = NULL;
|
|
static redisAtomic size_t g_thread_ids = 0;
|
|
static redisAtomic size_t g_num_threads_done = 0;
|
|
|
|
static sem_t wait_for_threads_sem;
|
|
|
|
/* This flag is set while ThreadsManager_runOnThreads is running */
|
|
static redisAtomic int g_in_progress = 0;
|
|
|
|
/*============================ Internal prototypes ========================== */
|
|
|
|
/* Tries to claim the manager for a new run.
 * Returns 0 when the caller may proceed, IN_PROGRESS when a run is already
 * active. */
static int test_and_start(void);

/* Signal handler executed by every signaled thread: runs the registered
 * callback and records its output. */
static void invoke_callback(int sig);

/* Blocks the initiating thread until the signaled threads have reported, or
 * until the timeout expires. */
static void wait_threads(void);

/* Resets the global run state.
 * Not thread-safe by itself: it relies on the caller still holding the
 * g_in_progress protection. */
static void ThreadsManager_cleanups(void);
|
|
|
|
/*============================ API functions implementations ========================== */
|
|
|
|
void ThreadsManager_init(void) {
|
|
/* Register signal handler */
|
|
struct sigaction act;
|
|
sigemptyset(&act.sa_mask);
|
|
/* Not setting SA_RESTART flag means that If a signal handler is invoked while a
|
|
system call or library function call is blocked, use the default behavior
|
|
i.e., the call fails with the error EINTR */
|
|
act.sa_flags = 0;
|
|
act.sa_handler = invoke_callback;
|
|
sigaction(SIGUSR2, &act, NULL);
|
|
}
|
|
|
|
/* Makes every thread listed in `tids` execute `callback` (from within the
 * THREADS_SIGNAL handler) and collects the values the callbacks return.
 *
 * tids      - thread ids, belonging to this process, to be signaled.
 * tids_len  - number of entries in `tids`.
 * callback  - function each signaled thread runs; its return value is stored
 *             in the output array.
 *
 * Returns an array of `tids_len` slots allocated with zmalloc(), or NULL if
 * `callback` is NULL or another run is already in progress. The array is not
 * released here: the caller receives the only remaining reference to it.
 * Slots are filled in the order the threads reach the handler, NOT in the
 * order of `tids`. Slots of threads that did not report before the timeout
 * are NULL.
 *
 * NOTE(review): a thread that reaches the handler after the timeout may still
 * be running while (or after) this function returns -- callers should keep
 * that in mind before releasing the returned array. */
void **ThreadsManager_runOnThreads(pid_t *tids, size_t tids_len, run_on_thread_cb callback) {
    /* Every signaled thread would call through this pointer; refuse a NULL
     * callback before any global state is touched. */
    if (callback == NULL) {
        return NULL;
    }

    /* Check if it is safe to start running. If not - return */
    if (test_and_start() == IN_PROGRESS) {
        return NULL;
    }

    /* Publish the parameters of this run for the signal handlers. */
    g_callback = callback;
    g_tids_len = tids_len;

    /* Allocate the output buffer and clear it, so that slots which are never
     * written (timeout, thread gone) read as NULL instead of garbage.
     * NOTE(review): assumes zmalloc() never returns NULL -- confirm. */
    g_output_array = zmalloc(sizeof(void*) * tids_len);
    for (size_t i = 0; i < tids_len; ++i) {
        g_output_array[i] = NULL;
    }

    /* Initialize the semaphore we will be waiting on:
     * pshared = 0 - shared between the threads of this process only,
     * value = 0   - the initial semaphore value. */
    sem_init(&wait_for_threads_sem, 0, 0);

    /* Send the signal to all the threads in tids */
    pid_t pid = getpid();
    for (size_t i = 0; i < tids_len; ++i) {
        syscall(SYS_tgkill, pid, tids[i], THREADS_SIGNAL);
    }

    /* Wait for all the threads to write to the output array, or until the
     * timeout is reached. With no threads signaled nobody would ever post the
     * semaphore, so waiting would only burn the whole timeout. */
    if (tids_len > 0) {
        wait_threads();
    }

    /* Keep the result: the cleanup below clears the global pointer. */
    void **ret = g_output_array;

    /* Cleanups to allow next execution */
    ThreadsManager_cleanups();

    return ret;
}
|
|
|
|
/*============================ Internal functions implementations ========================== */
|
|
|
|
|
|
static int test_and_start(void) {
|
|
/* atomicFlagGetSet sets the variable to 1 and returns the previous value */
|
|
int prev_state;
|
|
atomicFlagGetSet(g_in_progress, prev_state);
|
|
|
|
/* If prev_state is 1, g_in_progress was on. */
|
|
return prev_state;
|
|
}
|
|
|
|
static void invoke_callback(int sig) {
|
|
UNUSED(sig);
|
|
|
|
size_t thread_id;
|
|
atomicGetIncr(g_thread_ids, thread_id, 1);
|
|
g_output_array[thread_id] = g_callback();
|
|
size_t curr_done_count;
|
|
atomicIncrGet(g_num_threads_done, curr_done_count, 1);
|
|
|
|
/* last thread shuts down the light */
|
|
if (curr_done_count == g_tids_len) {
|
|
sem_post(&wait_for_threads_sem);
|
|
}
|
|
}
|
|
|
|
static void wait_threads(void) {
|
|
struct timespec ts;
|
|
clock_gettime(CLOCK_REALTIME, &ts);
|
|
|
|
/* calculate relative time until timeout */
|
|
ts.tv_sec += RUN_ON_THREADS_TIMEOUT;
|
|
|
|
int status = 0;
|
|
|
|
/* lock the semaphore until the semaphore value rises above zero or a signal
|
|
handler interrupts the call. In the later case continue to wait. */
|
|
while ((status = sem_timedwait(&wait_for_threads_sem, &ts)) == -1 && errno == EINTR) {
|
|
serverLog(LL_WARNING, "threads_mngr: waiting for threads' output was interrupted by signal. Continue waiting.");
|
|
continue;
|
|
}
|
|
|
|
if (status == -1) {
|
|
if (errno == ETIMEDOUT) {
|
|
serverLog(LL_WARNING, "threads_mngr: waiting for threads' output timed out");
|
|
}
|
|
}
|
|
}
|
|
|
|
static void ThreadsManager_cleanups(void) {
|
|
g_callback = NULL;
|
|
g_tids_len = 0;
|
|
g_output_array = NULL;
|
|
g_thread_ids = 0;
|
|
g_num_threads_done = 0;
|
|
sem_destroy(&wait_for_threads_sem);
|
|
|
|
/* Lastly, turn off g_in_progress */
|
|
atomicSet(g_in_progress, 0);
|
|
}
|
|
#else
|
|
|
|
/* Non-Linux build: the threads manager is not available, so there is no
 * signal handler to install. */
void ThreadsManager_init(void) {
    /* Intentionally empty. */
    return;
}
|
|
|
|
/* Non-Linux build: running a callback on other threads is not supported.
 * All arguments are ignored and NULL is always returned. */
void **ThreadsManager_runOnThreads(pid_t *tids, size_t tids_len, run_on_thread_cb callback) {
    /* Nothing is done with the arguments; mark them as used to keep the
     * compiler quiet. */
    UNUSED(callback);
    UNUSED(tids_len);
    UNUSED(tids);

    return NULL;
}
|
|
|
|
#endif /* __linux__ */
|