[lttng-dev] [PATCH lttng-tools] Fix: run-as thread deadlock on itself in restart error path

Jérémie Galarneau jeremie.galarneau@efficios.com
Tue Feb 12 11:23:27 EST 2019


Merged in master and stable-2.11.

Thanks!
Jérémie

On Wed, 16 Jan 2019 at 13:39, Jonathan Rajotte
<jonathan.rajotte-julien@efficios.com> wrote:
>
> The deadlock was found using this backtrace:
>
> Thread 5:
> 0  __lll_lock_wait () at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135
> 1  0x00007efc6b650023 in __GI___pthread_mutex_lock (mutex=mutex@entry=0x55fc37128400 <worker_lock>) at ../nptl/pthread_mutex_lock.c:78
> 2  0x000055fc36efbe05 in run_as_destroy_worker () at runas.c:1233
> 3  0x000055fc36efc2e7 in run_as_restart_worker (worker=<optimized out>) at runas.c:998
> 4  run_as (cmd=cmd@entry=RUN_AS_UNLINK, data=data@entry=0x7efc5b7fa630, ret_value=ret_value@entry=0x7efc5b7fa510, uid=uid@entry=1000, gid=gid@entry=1000) at runas.c:1033
> 5  0x000055fc36efc9ce in run_as_unlink (path=path@entry=0x7efc5b7fb690 "/home/joraj/lttng-traces/auto-20190116-111518/20190116T111729-0500-33/kernel/index/channel0_3.idx", uid=uid@entry=1000, gid=gid@entry=1000) at runas.c:1120
> 6  0x000055fc36ef7feb in utils_unlink_stream_file (path_name=path_name@entry=0x7efc5b7fc7e0 "/home/joraj/lttng-traces/auto-20190116-111518/20190116T111729-0500-33/kernel/index", file_name=file_name@entry=0x7efc500085d4 "channel0_3", size=size@entry=0, count=count@entry=0, uid=uid@entry=1000, gid=gid@entry=1000, suffix=0x55fc36f19b26 ".idx") at utils.c:929
> 7  0x000055fc36f01d4e in lttng_index_file_create (path_name=path_name@entry=0x7efc500087a0 "/home/joraj/lttng-traces/auto-20190116-111518/20190116T111729-0500-33/kernel", stream_name=stream_name@entry=0x7efc500085d4 "channel0_3", uid=1000, gid=1000, size=0, count=0, major=1, minor=1) at index.c:79
> 8  0x000055fc36ed9475 in rotate_local_stream (ctx=<optimized out>, stream=0x7efc50008460) at consumer.c:4105
> 9  0x000055fc36ed98b5 in lttng_consumer_rotate_stream (ctx=ctx@entry=0x55fc37428d80, stream=stream@entry=0x7efc50008460, rotated=rotated@entry=0x7efc5b7fdb27) at consumer.c:4181
> 10 0x000055fc36ee354e in lttng_kconsumer_read_subbuffer (stream=stream@entry=0x7efc50008460, ctx=ctx@entry=0x55fc37428d80, rotated=rotated@entry=0x7efc5b7fdb27) at kernel-consumer.c:1740
> 11 0x000055fc36ed7a30 in lttng_consumer_read_subbuffer (stream=0x7efc50008460, ctx=0x55fc37428d80) at consumer.c:3383
> 12 0x000055fc36ed4b74 in consumer_thread_data_poll (data=0x55fc37428d80) at consumer.c:2751
> 13 0x00007efc6b64d6db in start_thread (arg=0x7efc5b7fe700) at pthread_create.c:463
> 14 0x00007efc6af6488f in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
>
> The owner of the lock is the thread itself:
>   print worker_lock.__data.__owner
>     $2 = 25725
>   thread find 25725
>     Thread 5 has target id 'Thread 0x7efc5b7fe700 (LWP 25725)'
>
> The worker_lock is first taken in frame #4 (run_as, runas.c:1033):
>
>   pthread_mutex_lock(&worker_lock);
>   if (use_clone()) {
> ...
>     /*
>      * If the worker thread crashed, errno is set to EIO. We log
>      * the error and start a new worker process.
>      */
>     if (ret == -1 && saved_errno == EIO) {
>         DBG("Socket closed unexpectedly... "
>         "Restarting the worker process");
> ->      ret = run_as_restart_worker(global_worker);
>         if (ret == -1) {
>           ERR("Failed to restart worker process.");
>           goto err;
>         }
>
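> For reference, here is a minimal standalone sketch of the same pattern
> (hypothetical program, not part of this patch). worker_lock is a
> default, non-recursive pthread mutex, so the second lock attempt from
> the same thread blocks forever:
>
>   #include <pthread.h>
>
>   static pthread_mutex_t worker_lock = PTHREAD_MUTEX_INITIALIZER;
>
>   /* Stands in for run_as_destroy_worker(): takes the lock itself. */
>   static void destroy_worker(void)
>   {
>           pthread_mutex_lock(&worker_lock); /* Deadlock: already held. */
>           /* ... tear down the worker ... */
>           pthread_mutex_unlock(&worker_lock);
>   }
>
>   /* Stands in for run_as_restart_worker(), reached with the lock held. */
>   static void restart_worker(void)
>   {
>           destroy_worker();
>           /* ... spawn a replacement worker ... */
>   }
>
>   int main(void)
>   {
>           pthread_mutex_lock(&worker_lock); /* Taken by run_as(). */
>           restart_worker();                 /* Never returns. */
>           pthread_mutex_unlock(&worker_lock);
>           return 0;
>   }
>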
> Solution
> ========
>
> Create run_as_destroy_worker_no_lock, which does not take the lock on
> execution.
> Use run_as_destroy_worker_no_lock inside run_as_restart_worker, which
> is called from the run_as error path with the worker lock already held.
> Have run_as_destroy_worker take the worker lock and then call
> run_as_destroy_worker_no_lock, preserving the existing behaviour at all
> other call sites.
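>
> Condensed, the resulting shape is the usual split between an unlocked
> helper and a locking wrapper (bodies elided here, function names as in
> the patch below):
>
>   static
>   void run_as_destroy_worker_no_lock(void)
>   {
>           /* Tear down the worker; assumes worker_lock is held. */
>   }
>
>   LTTNG_HIDDEN
>   void run_as_destroy_worker(void)
>   {
>           pthread_mutex_lock(&worker_lock);
>           run_as_destroy_worker_no_lock();
>           pthread_mutex_unlock(&worker_lock);
>   }
>
> run_as_restart_worker, which always runs with worker_lock held, now
> calls run_as_destroy_worker_no_lock directly.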
>
> Signed-off-by: Jonathan Rajotte <jonathan.rajotte-julien@efficios.com>
> ---
>  src/common/runas.c | 89 ++++++++++++++++++++++++----------------------
>  1 file changed, 47 insertions(+), 42 deletions(-)
>
> diff --git a/src/common/runas.c b/src/common/runas.c
> index d727cf417..9e6277078 100644
> --- a/src/common/runas.c
> +++ b/src/common/runas.c
> @@ -986,6 +986,51 @@ error_procname_alloc:
>         return ret;
>  }
>
> +static
> +void run_as_destroy_worker_no_lock(void)
> +{
> +       struct run_as_worker *worker = global_worker;
> +
> +       DBG("Destroying run_as worker");
> +       if (!worker) {
> +               return;
> +       }
> +       /* Close unix socket */
> +       DBG("Closing run_as worker socket");
> +       if (lttcomm_close_unix_sock(worker->sockpair[0])) {
> +               PERROR("close");
> +       }
> +       worker->sockpair[0] = -1;
> +       /* Wait for worker. */
> +       for (;;) {
> +               int status;
> +               pid_t wait_ret;
> +
> +               wait_ret = waitpid(worker->pid, &status, 0);
> +               if (wait_ret < 0) {
> +                       if (errno == EINTR) {
> +                               continue;
> +                       }
> +                       PERROR("waitpid");
> +                       break;
> +               }
> +
> +               if (WIFEXITED(status)) {
> +                       LOG(WEXITSTATUS(status) == 0 ? PRINT_DBG : PRINT_ERR,
> +                                       DEFAULT_RUN_AS_WORKER_NAME " terminated with status code %d",
> +                                       WEXITSTATUS(status));
> +                       break;
> +               } else if (WIFSIGNALED(status)) {
> +                       ERR(DEFAULT_RUN_AS_WORKER_NAME " was killed by signal %d",
> +                                       WTERMSIG(status));
> +                       break;
> +               }
> +       }
> +       free(worker->procname);
> +       free(worker);
> +       global_worker = NULL;
> +}
> +
>  static
>  int run_as_restart_worker(struct run_as_worker *worker)
>  {
> @@ -995,7 +1040,7 @@ int run_as_restart_worker(struct run_as_worker *worker)
>         procname = worker->procname;
>
>         /* Close socket to run_as worker process and clean up the zombie process */
> -       run_as_destroy_worker();
> +       run_as_destroy_worker_no_lock();
>
>         /* Create a new run_as worker process*/
>         ret = run_as_create_worker_no_lock(procname);
> @@ -1227,47 +1272,7 @@ int run_as_create_worker(const char *procname)
>  LTTNG_HIDDEN
>  void run_as_destroy_worker(void)
>  {
> -       struct run_as_worker *worker = global_worker;
> -
> -       DBG("Destroying run_as worker");
>         pthread_mutex_lock(&worker_lock);
> -       if (!worker) {
> -               goto end;
> -       }
> -       /* Close unix socket */
> -       DBG("Closing run_as worker socket");
> -       if (lttcomm_close_unix_sock(worker->sockpair[0])) {
> -               PERROR("close");
> -       }
> -       worker->sockpair[0] = -1;
> -       /* Wait for worker. */
> -       for (;;) {
> -               int status;
> -               pid_t wait_ret;
> -
> -               wait_ret = waitpid(worker->pid, &status, 0);
> -               if (wait_ret < 0) {
> -                       if (errno == EINTR) {
> -                               continue;
> -                       }
> -                       PERROR("waitpid");
> -                       break;
> -               }
> -
> -               if (WIFEXITED(status)) {
> -                       LOG(WEXITSTATUS(status) == 0 ? PRINT_DBG : PRINT_ERR,
> -                                       DEFAULT_RUN_AS_WORKER_NAME " terminated with status code %d",
> -                                       WEXITSTATUS(status));
> -                       break;
> -               } else if (WIFSIGNALED(status)) {
> -                       ERR(DEFAULT_RUN_AS_WORKER_NAME " was killed by signal %d",
> -                                       WTERMSIG(status));
> -                       break;
> -               }
> -       }
> -       free(worker->procname);
> -       free(worker);
> -       global_worker = NULL;
> -end:
> +       run_as_destroy_worker_no_lock();
>         pthread_mutex_unlock(&worker_lock);
>  }
> --
> 2.17.1
>


-- 
Jérémie Galarneau
EfficiOS Inc.
http://www.efficios.com

