[lttng-dev] [PATCH lttng-tools] Fix: run-as thread deadlock on itself in restart error path

Jonathan Rajotte jonathan.rajotte-julien at efficios.com
Wed Jan 16 13:38:57 EST 2019


The deadlock was found using this backtrace

Thread 5:
0  __lll_lock_wait () at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135
1  0x00007efc6b650023 in __GI___pthread_mutex_lock (mutex=mutex at entry=0x55fc37128400 <worker_lock>) at ../nptl/pthread_mutex_lock.c:78
2  0x000055fc36efbe05 in run_as_destroy_worker () at runas.c:1233
3  0x000055fc36efc2e7 in run_as_restart_worker (worker=<optimized out>) at runas.c:998
4  run_as (cmd=cmd at entry=RUN_AS_UNLINK, data=data at entry=0x7efc5b7fa630, ret_value=ret_value at entry=0x7efc5b7fa510, uid=uid at entry=1000, gid=gid at entry=1000) at runas.c:1033
5  0x000055fc36efc9ce in run_as_unlink (path=path at entry=0x7efc5b7fb690 "/home/joraj/lttng-traces/auto-20190116-111518/20190116T111729-0500-33/kernel/index/channel0_3.idx", uid=uid at entry=1000, gid=gid at entry=1000) at runas.c :1120
6  0x000055fc36ef7feb in utils_unlink_stream_file (path_name=path_name at entry=0x7efc5b7fc7e0 "/home/joraj/lttng-traces/auto-20190116-111518/20190116T111729-0500-33/kernel/index", file_name=file_name at entry=0x7efc500085d4 "channel0_3", size=size at entry=0, count=count at entry=0, uid=uid at entry=1000, gid=gid at entry=1000, suffix=0x55fc36f19b26 ".idx") at utils.c:929
7  0x000055fc36f01d4e in lttng_index_file_create (path_name=path_name at entry=0x7efc500087a0 "/home/joraj/lttng-traces/auto-20190116-111518/20190116T111729-0500-33/kernel", stream_name=stream_name at entry=0x7efc500085d4 "channel0_3", uid=1000, gid=1000, size=0, count=0, major=1, minor=1) at index.c:79
8  0x000055fc36ed9475 in rotate_local_stream (ctx=<optimized out>, stream=0x7efc50008460) at consumer.c:4105
9  0x000055fc36ed98b5 in lttng_consumer_rotate_stream (ctx=ctx at entry=0x55fc37428d80, stream=stream at entry=0x7efc50008460, rotated=rotated at entry=0x7efc5b7fdb27) at consumer.c:4181
10 0x000055fc36ee354e in lttng_kconsumer_read_subbuffer (stream=stream at entry=0x7efc50008460, ctx=ctx at entry=0x55fc37428d80, rotated=rotated at entry=0x7efc5b7fdb27) at kernel-consumer.c:1740
11 0x000055fc36ed7a30 in lttng_consumer_read_subbuffer (stream=0x7efc50008460, ctx=0x55fc37428d80) at consumer.c:3383
12 0x000055fc36ed4b74 in consumer_thread_data_poll (data=0x55fc37428d80) at consumer.c:2751
13 0x00007efc6b64d6db in start_thread (arg=0x7efc5b7fe700) at pthread_create.c:463
14 0x00007efc6af6488f in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95

The owner of the lock is itself:
  print worker_lock.__data.__owner
    $2 = 25725
  thread find 25725
    Thread 5 has target id 'Thread 0x7efc5b7fe700 (LWP 25725)'

The worker_lock is first taken in frame #4: run_as runas.c:1033

  pthread_mutex_lock(&worker_lock);
  if (use_clone()) {
...
    /*
     * If the worker thread crashed the errno is set to EIO. we log
     * the error and  start a new worker process.
     */
    if (ret == -1 && saved_errno == EIO) {
        DBG("Socket closed unexpectedly... "
        "Restarting the worker process");
->      ret = run_as_restart_worker(global_worker);
        if (ret == -1) {
          ERR("Failed to restart worker process.");
          goto err;
        }

Solution
========

Create run_as_restart_worker_no_lock which does not to take the lock on
execution.
Use run_as_restart_worker_no_lock at the run_as error path call site.
Use run_as_restart_worker_no_lock inside run_as_restart_worker while
holding the worker lock to provide identical behaviour to other call sites.

Signed-off-by: Jonathan Rajotte <jonathan.rajotte-julien at efficios.com>
---
 src/common/runas.c | 89 ++++++++++++++++++++++++----------------------
 1 file changed, 47 insertions(+), 42 deletions(-)

diff --git a/src/common/runas.c b/src/common/runas.c
index d727cf417..9e6277078 100644
--- a/src/common/runas.c
+++ b/src/common/runas.c
@@ -986,6 +986,51 @@ error_procname_alloc:
 	return ret;
 }
 
+static
+void run_as_destroy_worker_no_lock(void)
+{
+	struct run_as_worker *worker = global_worker;
+
+	DBG("Destroying run_as worker");
+	if (!worker) {
+		return;
+	}
+	/* Close unix socket */
+	DBG("Closing run_as worker socket");
+	if (lttcomm_close_unix_sock(worker->sockpair[0])) {
+		PERROR("close");
+	}
+	worker->sockpair[0] = -1;
+	/* Wait for worker. */
+	for (;;) {
+		int status;
+		pid_t wait_ret;
+
+		wait_ret = waitpid(worker->pid, &status, 0);
+		if (wait_ret < 0) {
+			if (errno == EINTR) {
+				continue;
+			}
+			PERROR("waitpid");
+			break;
+		}
+
+		if (WIFEXITED(status)) {
+			LOG(WEXITSTATUS(status) == 0 ? PRINT_DBG : PRINT_ERR,
+					DEFAULT_RUN_AS_WORKER_NAME " terminated with status code %d",
+				        WEXITSTATUS(status));
+			break;
+		} else if (WIFSIGNALED(status)) {
+			ERR(DEFAULT_RUN_AS_WORKER_NAME " was killed by signal %d",
+					WTERMSIG(status));
+			break;
+		}
+	}
+	free(worker->procname);
+	free(worker);
+	global_worker = NULL;
+}
+
 static
 int run_as_restart_worker(struct run_as_worker *worker)
 {
@@ -995,7 +1040,7 @@ int run_as_restart_worker(struct run_as_worker *worker)
 	procname = worker->procname;
 
 	/* Close socket to run_as worker process and clean up the zombie process */
-	run_as_destroy_worker();
+	run_as_destroy_worker_no_lock();
 
 	/* Create a new run_as worker process*/
 	ret = run_as_create_worker_no_lock(procname);
@@ -1227,47 +1272,7 @@ int run_as_create_worker(const char *procname)
 LTTNG_HIDDEN
 void run_as_destroy_worker(void)
 {
-	struct run_as_worker *worker = global_worker;
-
-	DBG("Destroying run_as worker");
 	pthread_mutex_lock(&worker_lock);
-	if (!worker) {
-		goto end;
-	}
-	/* Close unix socket */
-	DBG("Closing run_as worker socket");
-	if (lttcomm_close_unix_sock(worker->sockpair[0])) {
-		PERROR("close");
-	}
-	worker->sockpair[0] = -1;
-	/* Wait for worker. */
-	for (;;) {
-		int status;
-		pid_t wait_ret;
-
-		wait_ret = waitpid(worker->pid, &status, 0);
-		if (wait_ret < 0) {
-			if (errno == EINTR) {
-				continue;
-			}
-			PERROR("waitpid");
-			break;
-		}
-
-		if (WIFEXITED(status)) {
-			LOG(WEXITSTATUS(status) == 0 ? PRINT_DBG : PRINT_ERR,
-					DEFAULT_RUN_AS_WORKER_NAME " terminated with status code %d",
-				        WEXITSTATUS(status));
-			break;
-		} else if (WIFSIGNALED(status)) {
-			ERR(DEFAULT_RUN_AS_WORKER_NAME " was killed by signal %d",
-					WTERMSIG(status));
-			break;
-		}
-	}
-	free(worker->procname);
-	free(worker);
-	global_worker = NULL;
-end:
+	run_as_destroy_worker_no_lock();
 	pthread_mutex_unlock(&worker_lock);
 }
-- 
2.17.1



More information about the lttng-dev mailing list