[lttng-dev] [PATCH lttng-tools] Fix: run-as thread deadlock on itself in restart error path
Jérémie Galarneau
jeremie.galarneau at efficios.com
Tue Feb 12 11:23:27 EST 2019
Merged in master and stable-2.11.
Thanks!
Jérémie
On Wed, 16 Jan 2019 at 13:39, Jonathan Rajotte
<jonathan.rajotte-julien at efficios.com> wrote:
>
> The deadlock was found using this backtrace
>
> Thread 5:
> 0 __lll_lock_wait () at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135
> 1 0x00007efc6b650023 in __GI___pthread_mutex_lock (mutex=mutex at entry=0x55fc37128400 <worker_lock>) at ../nptl/pthread_mutex_lock.c:78
> 2 0x000055fc36efbe05 in run_as_destroy_worker () at runas.c:1233
> 3 0x000055fc36efc2e7 in run_as_restart_worker (worker=<optimized out>) at runas.c:998
> 4 run_as (cmd=cmd at entry=RUN_AS_UNLINK, data=data at entry=0x7efc5b7fa630, ret_value=ret_value at entry=0x7efc5b7fa510, uid=uid at entry=1000, gid=gid at entry=1000) at runas.c:1033
> 5 0x000055fc36efc9ce in run_as_unlink (path=path at entry=0x7efc5b7fb690 "/home/joraj/lttng-traces/auto-20190116-111518/20190116T111729-0500-33/kernel/index/channel0_3.idx", uid=uid at entry=1000, gid=gid at entry=1000) at runas.c :1120
> 6 0x000055fc36ef7feb in utils_unlink_stream_file (path_name=path_name at entry=0x7efc5b7fc7e0 "/home/joraj/lttng-traces/auto-20190116-111518/20190116T111729-0500-33/kernel/index", file_name=file_name at entry=0x7efc500085d4 "channel0_3", size=size at entry=0, count=count at entry=0, uid=uid at entry=1000, gid=gid at entry=1000, suffix=0x55fc36f19b26 ".idx") at utils.c:929
> 7 0x000055fc36f01d4e in lttng_index_file_create (path_name=path_name at entry=0x7efc500087a0 "/home/joraj/lttng-traces/auto-20190116-111518/20190116T111729-0500-33/kernel", stream_name=stream_name at entry=0x7efc500085d4 "channel0_3", uid=1000, gid=1000, size=0, count=0, major=1, minor=1) at index.c:79
> 8 0x000055fc36ed9475 in rotate_local_stream (ctx=<optimized out>, stream=0x7efc50008460) at consumer.c:4105
> 9 0x000055fc36ed98b5 in lttng_consumer_rotate_stream (ctx=ctx at entry=0x55fc37428d80, stream=stream at entry=0x7efc50008460, rotated=rotated at entry=0x7efc5b7fdb27) at consumer.c:4181
> 10 0x000055fc36ee354e in lttng_kconsumer_read_subbuffer (stream=stream at entry=0x7efc50008460, ctx=ctx at entry=0x55fc37428d80, rotated=rotated at entry=0x7efc5b7fdb27) at kernel-consumer.c:1740
> 11 0x000055fc36ed7a30 in lttng_consumer_read_subbuffer (stream=0x7efc50008460, ctx=0x55fc37428d80) at consumer.c:3383
> 12 0x000055fc36ed4b74 in consumer_thread_data_poll (data=0x55fc37428d80) at consumer.c:2751
> 13 0x00007efc6b64d6db in start_thread (arg=0x7efc5b7fe700) at pthread_create.c:463
> 14 0x00007efc6af6488f in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
>
> The owner of the lock is itself:
> print worker_lock.__data.__owner
> $2 = 25725
> thread find 25725
> Thread 5 has target id 'Thread 0x7efc5b7fe700 (LWP 25725)'
>
> The worker_lock is first taken in frame #4: run_as runas.c:1033
>
> pthread_mutex_lock(&worker_lock);
> if (use_clone()) {
> ...
> /*
> * If the worker thread crashed the errno is set to EIO. we log
> * the error and start a new worker process.
> */
> if (ret == -1 && saved_errno == EIO) {
> DBG("Socket closed unexpectedly... "
> "Restarting the worker process");
> -> ret = run_as_restart_worker(global_worker);
> if (ret == -1) {
> ERR("Failed to restart worker process.");
> goto err;
> }
>
> Solution
> ========
>
> Create run_as_restart_worker_no_lock which does not to take the lock on
> execution.
> Use run_as_restart_worker_no_lock at the run_as error path call site.
> Use run_as_restart_worker_no_lock inside run_as_restart_worker while
> holding the worker lock to provide identical behaviour to other call sites.
>
> Signed-off-by: Jonathan Rajotte <jonathan.rajotte-julien at efficios.com>
> ---
> src/common/runas.c | 89 ++++++++++++++++++++++++----------------------
> 1 file changed, 47 insertions(+), 42 deletions(-)
>
> diff --git a/src/common/runas.c b/src/common/runas.c
> index d727cf417..9e6277078 100644
> --- a/src/common/runas.c
> +++ b/src/common/runas.c
> @@ -986,6 +986,51 @@ error_procname_alloc:
> return ret;
> }
>
> +static
> +void run_as_destroy_worker_no_lock(void)
> +{
> + struct run_as_worker *worker = global_worker;
> +
> + DBG("Destroying run_as worker");
> + if (!worker) {
> + return;
> + }
> + /* Close unix socket */
> + DBG("Closing run_as worker socket");
> + if (lttcomm_close_unix_sock(worker->sockpair[0])) {
> + PERROR("close");
> + }
> + worker->sockpair[0] = -1;
> + /* Wait for worker. */
> + for (;;) {
> + int status;
> + pid_t wait_ret;
> +
> + wait_ret = waitpid(worker->pid, &status, 0);
> + if (wait_ret < 0) {
> + if (errno == EINTR) {
> + continue;
> + }
> + PERROR("waitpid");
> + break;
> + }
> +
> + if (WIFEXITED(status)) {
> + LOG(WEXITSTATUS(status) == 0 ? PRINT_DBG : PRINT_ERR,
> + DEFAULT_RUN_AS_WORKER_NAME " terminated with status code %d",
> + WEXITSTATUS(status));
> + break;
> + } else if (WIFSIGNALED(status)) {
> + ERR(DEFAULT_RUN_AS_WORKER_NAME " was killed by signal %d",
> + WTERMSIG(status));
> + break;
> + }
> + }
> + free(worker->procname);
> + free(worker);
> + global_worker = NULL;
> +}
> +
> static
> int run_as_restart_worker(struct run_as_worker *worker)
> {
> @@ -995,7 +1040,7 @@ int run_as_restart_worker(struct run_as_worker *worker)
> procname = worker->procname;
>
> /* Close socket to run_as worker process and clean up the zombie process */
> - run_as_destroy_worker();
> + run_as_destroy_worker_no_lock();
>
> /* Create a new run_as worker process*/
> ret = run_as_create_worker_no_lock(procname);
> @@ -1227,47 +1272,7 @@ int run_as_create_worker(const char *procname)
> LTTNG_HIDDEN
> void run_as_destroy_worker(void)
> {
> - struct run_as_worker *worker = global_worker;
> -
> - DBG("Destroying run_as worker");
> pthread_mutex_lock(&worker_lock);
> - if (!worker) {
> - goto end;
> - }
> - /* Close unix socket */
> - DBG("Closing run_as worker socket");
> - if (lttcomm_close_unix_sock(worker->sockpair[0])) {
> - PERROR("close");
> - }
> - worker->sockpair[0] = -1;
> - /* Wait for worker. */
> - for (;;) {
> - int status;
> - pid_t wait_ret;
> -
> - wait_ret = waitpid(worker->pid, &status, 0);
> - if (wait_ret < 0) {
> - if (errno == EINTR) {
> - continue;
> - }
> - PERROR("waitpid");
> - break;
> - }
> -
> - if (WIFEXITED(status)) {
> - LOG(WEXITSTATUS(status) == 0 ? PRINT_DBG : PRINT_ERR,
> - DEFAULT_RUN_AS_WORKER_NAME " terminated with status code %d",
> - WEXITSTATUS(status));
> - break;
> - } else if (WIFSIGNALED(status)) {
> - ERR(DEFAULT_RUN_AS_WORKER_NAME " was killed by signal %d",
> - WTERMSIG(status));
> - break;
> - }
> - }
> - free(worker->procname);
> - free(worker);
> - global_worker = NULL;
> -end:
> + run_as_destroy_worker_no_lock();
> pthread_mutex_unlock(&worker_lock);
> }
> --
> 2.17.1
>
--
Jérémie Galarneau
EfficiOS Inc.
http://www.efficios.com
More information about the lttng-dev
mailing list