[lttng-dev] [PATCH lttng-tools] Fix: health monitoring (various fixes)

Mathieu Desnoyers mathieu.desnoyers at efficios.com
Mon Jul 23 14:00:42 EDT 2012


* Fix modulo operation bug on
  #define HEALTH_IS_IN_CODE(x) (x % HEALTH_POLL_VALUE)
  which is causing the check to think it is never within code.
  (x % 1 always equals 0). Simplify this by using a simple
  & on the poll value, and remove the IS_IN_CODE, using ! on IS_IN_POLL
  instead (which removes nothing to clarity).
* Atomic operations should apply to at most "unsigned long" (32-bit on
  32-bit arch) rather than uint64_t.
* Separate the "error" condition from the counters. We clearly cannot
  use the "0" value as an error on 32-bit counters anymore, because they
  can easily wrap.
* Introduce "exit" condition, will be useful for state tracking in the
  future. Error and exit conditions implemented as flags.
* Add "APP_MANAGE" in addition to "APP_REG" health check, to monitor the
  app registration thread (which was missing, only the app manager
  thread was checked, under the name "APP_REG", which was misleading).
* Comment the last, current and flag values in struct health state,
  especially how they are accessed.
* Remove bogus usage of uatomic_xchg() in health_check_state():
  It is not needed to update the "last" value, since the last value is
  read and written to by a single thread. Moreover, this specific use of
  xchg was not exchanging anything: it was just setting the last value
  to the "current" one, and doing nothing with the return value.
  Whatever was expected to be achieved by using uatomic_xchg() clearly
  wasn't.
* Because the health check thread could still be answering a request
  concurrently sessiond teardown, we need to ensure that all threads
  only set the "error" condition if they reach teardown paths due to an
  actual error, not on "normal" teardown condition (thread quit pipe
  being closed). Flagging threads as being in error condition upon all
  exit paths would lead to false "errors" sent to the client, which we
  want to avoid, since the client could then think it needs to kill a
  sessiond when the sessiond might be in the process of gracefully
  restarting.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers at efficios.com>
CC: David Goulet <dgoulet at efficios.com>
---
diff --git a/include/lttng/lttng.h b/include/lttng/lttng.h
index 6823579..7754bcd 100644
--- a/include/lttng/lttng.h
+++ b/include/lttng/lttng.h
@@ -134,6 +134,7 @@ enum lttng_calibrate_type {
 /* Health component for the health check function. */
 enum lttng_health_component {
 	LTTNG_HEALTH_CMD,
+	LTTNG_HEALTH_APP_MANAGE,
 	LTTNG_HEALTH_APP_REG,
 	LTTNG_HEALTH_KERNEL,
 	LTTNG_HEALTH_CONSUMER,
diff --git a/src/bin/lttng-sessiond/health.c b/src/bin/lttng-sessiond/health.c
index 58f804e..d7d25cf 100644
--- a/src/bin/lttng-sessiond/health.c
+++ b/src/bin/lttng-sessiond/health.c
@@ -32,33 +32,29 @@
  */
 int health_check_state(struct health_state *state)
 {
-	int ret;
-	uint64_t current;
-	uint64_t last;
+	unsigned long current, last;
+	int ret = 1;
 
 	assert(state);
 
+	last = state->last;
 	current = uatomic_read(&state->current);
-	last = uatomic_read(&state->last);
 
 	/*
-	 * Here are the conditions for a bad health. Current state set to 0 or the
-	 * current state is the same as the last one and we are NOT waiting for a
-	 * poll() call.
+	 * Here are the conditions for a bad health. Either flag
+	 * HEALTH_ERROR is set, or the progress counter is the same as
+	 * the last one and we are NOT waiting for a poll() call.
 	 */
-	if (current == 0 || (current == last && HEALTH_IS_IN_CODE(current))) {
+	if ((uatomic_read(&state->flags) & HEALTH_ERROR)
+			|| (current == last && !HEALTH_IS_IN_POLL(current))) {
+		/* error */
 		ret = 0;
-		goto error;
 	}
 
-	/* All good */
-	ret = 1;
-
-error:
 	DBG("Health state current %" PRIu64 ", last %" PRIu64 ", ret %d",
 			current, last, ret);
 
-	/* Exchange current state counter into last one */
-	uatomic_xchg(&state->last, state->current);
+	/* update last counter */
+	state->last = current;
 	return ret;
 }
diff --git a/src/bin/lttng-sessiond/health.h b/src/bin/lttng-sessiond/health.h
index 9a1ef39..b268860 100644
--- a/src/bin/lttng-sessiond/health.h
+++ b/src/bin/lttng-sessiond/health.h
@@ -25,20 +25,35 @@
  * These are the value added to the current state depending of the position in
  * the thread where is either waiting on a poll() or running in the code.
  */
-#define HEALTH_POLL_VALUE 1
-#define HEALTH_CODE_VALUE 2
+#define HEALTH_POLL_VALUE 	(1UL << 0)
+#define HEALTH_CODE_VALUE	(1UL << 1)
 
-#define HEALTH_IS_IN_POLL(x) (x % HEALTH_CODE_VALUE)
-#define HEALTH_IS_IN_CODE(x) (x % HEALTH_POLL_VALUE)
+#define HEALTH_IS_IN_POLL(x)	((x) & HEALTH_POLL_VALUE)
+
+enum health_flags {
+	HEALTH_EXIT = (1U << 0),
+	HEALTH_ERROR = (1U << 1),
+};
 
 struct health_state {
-	uint64_t last;
-	uint64_t current;
+	/*
+	 * last counter is only read and updated by the health_check
+	 * thread (single updater).
+	 */
+	unsigned long last;
+	/*
+	 * current and flags are updated by multiple threads concurrently.
+	 */
+	unsigned long current;		/* progress counter, updated atomically */
+	enum health_flags flags;	/* other flags, updated atomically */
 };
 
 /* Health state counters for the client command thread */
 extern struct health_state health_thread_cmd;
 
+/* Health state counters for the application management thread */
+extern struct health_state health_thread_app_manage;
+
 /* Health state counters for the application registration thread */
 extern struct health_state health_thread_app_reg;
 
@@ -46,36 +61,41 @@ extern struct health_state health_thread_app_reg;
 extern struct health_state health_thread_kernel;
 
 /*
- * Update current counter by 1 to indicate that the thread is in a blocking
- * state cause by a poll().
+ * Update current counter by 1 to indicate that the thread entered or
+ * left a blocking state caused by a poll().
  */
 static inline void health_poll_update(struct health_state *state)
 {
 	assert(state);
-
 	uatomic_add(&state->current, HEALTH_POLL_VALUE);
 }
 
 /*
- * Update current counter by 2 which indicates that we are currently running in
- * a thread and NOT blocked at a poll().
+ * Update current counter by 2 indicates progress in execution of a
+ * thread.
  */
 static inline void health_code_update(struct health_state *state)
 {
 	assert(state);
-
 	uatomic_add(&state->current, HEALTH_CODE_VALUE);
 }
 
 /*
- * Reset health state. A value of zero indicate a bad health state.
+ * Set health "exit" flag.
  */
-static inline void health_reset(struct health_state *state)
+static inline void health_exit(struct health_state *state)
 {
 	assert(state);
+	uatomic_or(&state->flags, HEALTH_EXIT);
+}
 
-	uatomic_set(&state->current, 0);
-	uatomic_set(&state->last, 0);
+/*
+ * Set health "error" flag.
+ */
+static inline void health_error(struct health_state *state)
+{
+	assert(state);
+	uatomic_or(&state->flags, HEALTH_ERROR);
 }
 
 /*
@@ -84,9 +104,9 @@ static inline void health_reset(struct health_state *state)
 static inline void health_init(struct health_state *state)
 {
 	assert(state);
-
 	uatomic_set(&state->last, 0);
-	uatomic_set(&state->current, HEALTH_CODE_VALUE);
+	uatomic_set(&state->current, 0);
+	uatomic_set(&state->flags, 0);
 }
 
 int health_check_state(struct health_state *state);
diff --git a/src/bin/lttng-sessiond/main.c b/src/bin/lttng-sessiond/main.c
index 5664b0b..c699f1b 100644
--- a/src/bin/lttng-sessiond/main.c
+++ b/src/bin/lttng-sessiond/main.c
@@ -216,6 +216,7 @@ static unsigned int relayd_net_seq_idx;
 
 /* Used for the health monitoring of the session daemon. See health.h */
 struct health_state health_thread_cmd;
+struct health_state health_thread_app_manage;
 struct health_state health_thread_app_reg;
 struct health_state health_thread_kernel;
 
@@ -716,7 +717,7 @@ static void update_ust_app(int app_sock)
  */
 static void *thread_manage_kernel(void *data)
 {
-	int ret, i, pollfd, update_poll_flag = 1;
+	int ret, i, pollfd, update_poll_flag = 1, err = -1;
 	uint32_t revents, nb_fd;
 	char tmp;
 	struct lttng_poll_event events;
@@ -789,7 +790,8 @@ static void *thread_manage_kernel(void *data)
 			/* Thread quit pipe has been closed. Killing thread. */
 			ret = check_thread_quit_pipe(pollfd, revents);
 			if (ret) {
-				goto error;
+				err = 0;
+				goto exit;
 			}
 
 			/* Check for data on kernel pipe */
@@ -817,10 +819,15 @@ static void *thread_manage_kernel(void *data)
 		}
 	}
 
+exit:
 error:
 	lttng_poll_clean(&events);
 error_poll_create:
-	health_reset(&health_thread_kernel);
+	if (err) {
+		DBG("An error occurred in %s", __func__);
+		health_error(&health_thread_kernel);
+	}
+	health_exit(&health_thread_kernel);
 	DBG("Kernel thread dying");
 	return NULL;
 }
@@ -830,7 +837,7 @@ error_poll_create:
  */
 static void *thread_manage_consumer(void *data)
 {
-	int sock = -1, i, ret, pollfd;
+	int sock = -1, i, ret, pollfd, err = -1;
 	uint32_t revents, nb_fd;
 	enum lttcomm_return_code code;
 	struct lttng_poll_event events;
@@ -888,7 +895,8 @@ restart:
 		/* Thread quit pipe has been closed. Killing thread. */
 		ret = check_thread_quit_pipe(pollfd, revents);
 		if (ret) {
-			goto error;
+			err = 0;
+			goto exit;
 		}
 
 		/* Event on the registration socket */
@@ -976,7 +984,8 @@ restart_poll:
 		/* Thread quit pipe has been closed. Killing thread. */
 		ret = check_thread_quit_pipe(pollfd, revents);
 		if (ret) {
-			goto error;
+			err = 0;
+			goto exit;
 		}
 
 		/* Event on the kconsumerd socket */
@@ -1000,6 +1009,7 @@ restart_poll:
 
 	ERR("consumer return code : %s", lttcomm_get_readable_code(-code));
 
+exit:
 error:
 	/* Immediately set the consumerd state to stopped */
 	if (consumer_data->type == LTTNG_CONSUMER_KERNEL) {
@@ -1038,7 +1048,11 @@ error:
 	lttng_poll_clean(&events);
 error_poll:
 error_listen:
-	health_reset(&consumer_data->health);
+	if (err) {
+		DBG("An error occurred in %s", __func__);
+		health_error(&consumer_data->health);
+	}
+	health_exit(&consumer_data->health);
 	DBG("consumer thread cleanup completed");
 
 	return NULL;
@@ -1049,7 +1063,7 @@ error_listen:
  */
 static void *thread_manage_apps(void *data)
 {
-	int i, ret, pollfd;
+	int i, ret, pollfd, err = -1;
 	uint32_t revents, nb_fd;
 	struct ust_command ust_cmd;
 	struct lttng_poll_event events;
@@ -1059,7 +1073,7 @@ static void *thread_manage_apps(void *data)
 	rcu_register_thread();
 	rcu_thread_online();
 
-	health_code_update(&health_thread_app_reg);
+	health_code_update(&health_thread_app_manage);
 
 	ret = create_thread_poll_set(&events, 2);
 	if (ret < 0) {
@@ -1071,7 +1085,7 @@ static void *thread_manage_apps(void *data)
 		goto error;
 	}
 
-	health_code_update(&health_thread_app_reg);
+	health_code_update(&health_thread_app_manage);
 
 	while (1) {
 		/* Zeroed the events structure */
@@ -1083,9 +1097,9 @@ static void *thread_manage_apps(void *data)
 
 		/* Inifinite blocking call, waiting for transmission */
 	restart:
-		health_poll_update(&health_thread_app_reg);
+		health_poll_update(&health_thread_app_manage);
 		ret = lttng_poll_wait(&events, -1);
-		health_poll_update(&health_thread_app_reg);
+		health_poll_update(&health_thread_app_manage);
 		if (ret < 0) {
 			/*
 			 * Restart interrupted system call.
@@ -1101,12 +1115,13 @@ static void *thread_manage_apps(void *data)
 			revents = LTTNG_POLL_GETEV(&events, i);
 			pollfd = LTTNG_POLL_GETFD(&events, i);
 
-			health_code_update(&health_thread_app_reg);
+			health_code_update(&health_thread_app_manage);
 
 			/* Thread quit pipe has been closed. Killing thread. */
 			ret = check_thread_quit_pipe(pollfd, revents);
 			if (ret) {
-				goto error;
+				err = 0;
+				goto exit;
 			}
 
 			/* Inspect the apps cmd pipe */
@@ -1122,7 +1137,7 @@ static void *thread_manage_apps(void *data)
 						goto error;
 					}
 
-					health_code_update(&health_thread_app_reg);
+					health_code_update(&health_thread_app_manage);
 
 					/* Register applicaton to the session daemon */
 					ret = ust_app_register(&ust_cmd.reg_msg,
@@ -1133,7 +1148,7 @@ static void *thread_manage_apps(void *data)
 						break;
 					}
 
-					health_code_update(&health_thread_app_reg);
+					health_code_update(&health_thread_app_manage);
 
 					/*
 					 * Validate UST version compatibility.
@@ -1147,7 +1162,7 @@ static void *thread_manage_apps(void *data)
 						update_ust_app(ust_cmd.sock);
 					}
 
-					health_code_update(&health_thread_app_reg);
+					health_code_update(&health_thread_app_manage);
 
 					ret = ust_app_register_done(ust_cmd.sock);
 					if (ret < 0) {
@@ -1172,7 +1187,7 @@ static void *thread_manage_apps(void *data)
 								ust_cmd.sock);
 					}
 
-					health_code_update(&health_thread_app_reg);
+					health_code_update(&health_thread_app_manage);
 
 					break;
 				}
@@ -1194,14 +1209,19 @@ static void *thread_manage_apps(void *data)
 				}
 			}
 
-			health_code_update(&health_thread_app_reg);
+			health_code_update(&health_thread_app_manage);
 		}
 	}
 
+exit:
 error:
 	lttng_poll_clean(&events);
 error_poll_create:
-	health_reset(&health_thread_app_reg);
+	if (err) {
+		DBG("An error occurred in %s", __func__);
+		health_error(&health_thread_app_manage);
+	}
+	health_exit(&health_thread_app_manage);
 	DBG("Application communication apps thread cleanup complete");
 	rcu_thread_offline();
 	rcu_unregister_thread();
@@ -1277,7 +1297,7 @@ error:
  */
 static void *thread_registration_apps(void *data)
 {
-	int sock = -1, i, ret, pollfd;
+	int sock = -1, i, ret, pollfd, err = -1;
 	uint32_t revents, nb_fd;
 	struct lttng_poll_event events;
 	/*
@@ -1323,7 +1343,9 @@ static void *thread_registration_apps(void *data)
 
 		/* Inifinite blocking call, waiting for transmission */
 	restart:
+		health_poll_update(&health_thread_app_reg);
 		ret = lttng_poll_wait(&events, -1);
+		health_poll_update(&health_thread_app_reg);
 		if (ret < 0) {
 			/*
 			 * Restart interrupted system call.
@@ -1335,6 +1357,8 @@ static void *thread_registration_apps(void *data)
 		}
 
 		for (i = 0; i < nb_fd; i++) {
+			health_code_update(&health_thread_app_reg);
+
 			/* Fetch once the poll data */
 			revents = LTTNG_POLL_GETEV(&events, i);
 			pollfd = LTTNG_POLL_GETFD(&events, i);
@@ -1342,7 +1366,8 @@ static void *thread_registration_apps(void *data)
 			/* Thread quit pipe has been closed. Killing thread. */
 			ret = check_thread_quit_pipe(pollfd, revents);
 			if (ret) {
-				goto error;
+				err = 0;
+				goto exit;
 			}
 
 			/* Event on the registration socket */
@@ -1378,6 +1403,7 @@ static void *thread_registration_apps(void *data)
 						sock = -1;
 						continue;
 					}
+					health_code_update(&health_thread_app_reg);
 					ret = lttcomm_recv_unix_sock(sock, &ust_cmd->reg_msg,
 							sizeof(struct ust_register_msg));
 					if (ret < 0 || ret < sizeof(struct ust_register_msg)) {
@@ -1395,6 +1421,7 @@ static void *thread_registration_apps(void *data)
 						sock = -1;
 						continue;
 					}
+					health_code_update(&health_thread_app_reg);
 
 					ust_cmd->sock = sock;
 					sock = -1;
@@ -1422,7 +1449,14 @@ static void *thread_registration_apps(void *data)
 		}
 	}
 
+exit:
 error:
+	if (err) {
+		DBG("An error occurred in %s", __func__);
+		health_error(&health_thread_app_reg);
+	}
+	health_exit(&health_thread_app_reg);
+
 	/* Notify that the registration thread is gone */
 	notify_ust_apps(0);
 
@@ -1742,15 +1776,15 @@ error:
 }
 
 /*
- * Compute health status of each consumer.
+ * Compute health status of each consumer. If one of them is zero (bad
+ * state), we return 0.
  */
 static int check_consumer_health(void)
 {
 	int ret;
 
-	ret =
-		health_check_state(&kconsumer_data.health) &
-		health_check_state(&ustconsumer32_data.health) &
+	ret = health_check_state(&kconsumer_data.health) &&
+		health_check_state(&ustconsumer32_data.health) &&
 		health_check_state(&ustconsumer64_data.health);
 
 	DBG3("Health consumer check %d", ret);
@@ -4627,7 +4661,7 @@ init_setup_error:
  */
 static void *thread_manage_health(void *data)
 {
-	int sock = -1, new_sock, ret, i, pollfd;
+	int sock = -1, new_sock, ret, i, pollfd, err = -1;
 	uint32_t revents, nb_fd;
 	struct lttng_poll_event events;
 	struct lttcomm_health_msg msg;
@@ -4691,7 +4725,8 @@ restart:
 			/* Thread quit pipe has been closed. Killing thread. */
 			ret = check_thread_quit_pipe(pollfd, revents);
 			if (ret) {
-				goto error;
+				err = 0;
+				goto exit;
 			}
 
 			/* Event on the registration socket */
@@ -4726,6 +4761,9 @@ restart:
 		case LTTNG_HEALTH_CMD:
 			reply.ret_code = health_check_state(&health_thread_cmd);
 			break;
+		case LTTNG_HEALTH_APP_MANAGE:
+			reply.ret_code = health_check_state(&health_thread_app_manage);
+			break;
 		case LTTNG_HEALTH_APP_REG:
 			reply.ret_code = health_check_state(&health_thread_app_reg);
 			break;
@@ -4736,13 +4774,12 @@ restart:
 			reply.ret_code = check_consumer_health();
 			break;
 		case LTTNG_HEALTH_ALL:
-			ret = check_consumer_health();
-
 			reply.ret_code =
-				health_check_state(&health_thread_app_reg) &
-				health_check_state(&health_thread_cmd) &
-				health_check_state(&health_thread_kernel) &
-				ret;
+				health_check_state(&health_thread_app_manage) &&
+				health_check_state(&health_thread_app_reg) &&
+				health_check_state(&health_thread_cmd) &&
+				health_check_state(&health_thread_kernel) &&
+				check_consumer_health();
 			break;
 		default:
 			reply.ret_code = LTTCOMM_UND;
@@ -4774,7 +4811,11 @@ restart:
 		new_sock = -1;
 	}
 
+exit:
 error:
+	if (err) {
+		DBG("An error occurred in %s", __func__);
+	}
 	DBG("Health check thread dying");
 	unlink(health_unix_sock_path);
 	if (sock >= 0) {
@@ -4802,7 +4843,7 @@ error:
  */
 static void *thread_manage_clients(void *data)
 {
-	int sock = -1, ret, i, pollfd;
+	int sock = -1, ret, i, pollfd, err = -1;
 	int sock_error;
 	uint32_t revents, nb_fd;
 	struct command_ctx *cmd_ctx = NULL;
@@ -4873,7 +4914,8 @@ static void *thread_manage_clients(void *data)
 			/* Thread quit pipe has been closed. Killing thread. */
 			ret = check_thread_quit_pipe(pollfd, revents);
 			if (ret) {
-				goto error;
+				err = 0;
+				goto exit;
 			}
 
 			/* Event on the registration socket */
@@ -4993,8 +5035,13 @@ static void *thread_manage_clients(void *data)
 		health_code_update(&health_thread_cmd);
 	}
 
+exit:
 error:
-	health_reset(&health_thread_cmd);
+	if (err) {
+		DBG("An error occurred in %s", __func__);
+		health_error(&health_thread_cmd);
+	}
+	health_exit(&health_thread_cmd);
 
 	DBG("Client thread dying");
 	unlink(client_unix_sock_path);
@@ -5740,6 +5787,7 @@ int main(int argc, char **argv)
 	/* Init all health thread counters. */
 	health_init(&health_thread_cmd);
 	health_init(&health_thread_kernel);
+	health_init(&health_thread_app_manage);
 	health_init(&health_thread_app_reg);
 
 	/*

-- 
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com



More information about the lttng-dev mailing list