[ltt-dev] [RFC PATCH v3] TRACE_CLOCK and TRACE_CLOCK_FREQ in clock_gettime

Mathieu Desnoyers mathieu.desnoyers at efficios.com
Sat Nov 27 11:37:20 EST 2010


Hi Julien,

I made a few modifications to your code. Please check the TODOs that are left
and update the benchmarks. Some information about what vget_cycles actually
does on Xen and KVM compared to get_cycles would be welcome.

Thanks,

Mathieu


These new options to clock_gettime allow the user to retrieve the TSC
frequency and the current TSC from userspace.
We use the LTTng infrastructure to make sure the TSC is synchronized. If
it is not, we fall back to a syscall (which for the moment does the same
thing, but in the future will be modified to ensure consistency of the
tracing between user and kernel space).

The main difference with using the TSC clocksource directly is that the
time starts at machine boot and not at Linux boot which makes it possible
to correlate user and kernelspace events. Also, we export the frequency and
the cycles; we don't do the conversion to sec.nsec in the kernel since we
don't need it.

The differences from v1 are:
- we validated that on 32 bits the clock_gettime vDSO doesn't exist, which
  cleans up the vDSO code;
- the syscall is now properly defined using the posix timer architecture
- we export the frequency to userspace so we don't need to convert the
  cycles to sec.nsec anymore, which means that on 64-bit machines the
  nsec field will contain the whole cycle counter and on 32 bits the
  value is split between the two fields sec and nsec.
- remove the rdtsc_barrier() which is overkill for tracing purposes
- trace_clock_is_sync field is updated as soon as the LTTng trace clock
  detects an inconsistency

Updated benchmarks (with 20000000 iterations reading the tsc before and
after each call on an i7 920):

64 bits with vDSO
average cycles for clock_realtime: 101
average cycles for clock_monotonic: 104
average cycles for clock_trace: 52

64 bits without vDSO (using syscall)
average cycles for clock_realtime: 240
average cycles for clock_monotonic: 256
average cycles for clock_trace: 219

32 bits (without vDSO)
average cycles for clock_realtime: 649
average cycles for clock_monotonic: 661
average cycles for clock_trace: 616

Signed-off-by: Julien Desfossez <julien.desfossez at polymtl.ca>
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers at efficios.com>
---
 arch/x86/include/asm/trace-clock.h |    7 ++++
 arch/x86/include/asm/vgtod.h       |    1 
 arch/x86/include/asm/vsyscall.h    |    8 +++++
 arch/x86/kernel/trace-clock.c      |   58 +++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/vsyscall_64.c      |   14 ++++++++
 arch/x86/vdso/vclock_gettime.c     |   48 ++++++++++++++++++++++++++++++
 include/linux/time.h               |    2 +
 7 files changed, 138 insertions(+)

Index: linux-2.6-lttng/arch/x86/include/asm/trace-clock.h
===================================================================
--- linux-2.6-lttng.orig/arch/x86/include/asm/trace-clock.h
+++ linux-2.6-lttng/arch/x86/include/asm/trace-clock.h
@@ -11,12 +11,19 @@
  */
 
 #include <linux/timex.h>
+#include <linux/time.h>
 #include <asm/system.h>
 #include <asm/processor.h>
 #include <asm/atomic.h>
 
 /* Minimum duration of a probe, in cycles */
 #define TRACE_CLOCK_MIN_PROBE_DURATION 200
+#define TRACE_CLOCK_RES TRACE_CLOCK_MIN_PROBE_DURATION
+
+union lttng_timespec {
+	struct timespec ts;
+	u64 lttng_ts;
+};
 
 extern cycles_t trace_clock_async_tsc_read(void);
 
Index: linux-2.6-lttng/arch/x86/include/asm/vgtod.h
===================================================================
--- linux-2.6-lttng.orig/arch/x86/include/asm/vgtod.h
+++ linux-2.6-lttng/arch/x86/include/asm/vgtod.h
@@ -12,6 +12,7 @@ struct vsyscall_gtod_data {
 	u32		wall_time_nsec;
 
 	int		sysctl_enabled;
+	int		trace_clock_is_sync;
 	struct timezone sys_tz;
 	struct { /* extract of a clocksource struct */
 		cycle_t (*vread)(void);
Index: linux-2.6-lttng/arch/x86/include/asm/vsyscall.h
===================================================================
--- linux-2.6-lttng.orig/arch/x86/include/asm/vsyscall.h
+++ linux-2.6-lttng/arch/x86/include/asm/vsyscall.h
@@ -39,6 +39,14 @@ extern struct timezone sys_tz;
 
 extern void map_vsyscall(void);
 
+#ifdef CONFIG_X86_64
+extern void update_trace_clock_is_sync_vdso(void);
+#else
+static inline void update_trace_clock_is_sync_vdso(void)
+{
+}
+#endif
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_X86_VSYSCALL_H */
Index: linux-2.6-lttng/arch/x86/kernel/trace-clock.c
===================================================================
--- linux-2.6-lttng.orig/arch/x86/kernel/trace-clock.c
+++ linux-2.6-lttng/arch/x86/kernel/trace-clock.c
@@ -11,6 +11,8 @@
 #include <linux/jiffies.h>
 #include <linux/timer.h>
 #include <linux/cpu.h>
+#include <linux/posix-timers.h>
+#include <asm/vgtod.h>
 
 static cycles_t trace_clock_last_tsc;
 static DEFINE_PER_CPU(struct timer_list, update_timer);
@@ -22,11 +24,19 @@ int _trace_clock_is_sync = 1;
 EXPORT_SYMBOL_GPL(_trace_clock_is_sync);
 
 /*
+ * Is the trace clock being used by user-space ? We leave the trace clock active
+ * as soon as user-space starts using it. We never unref the trace clock
+ * reference taken by user-space.
+ */
+static atomic_t user_trace_clock_ref;
+
+/*
  * Called by check_tsc_sync_source from CPU hotplug.
  */
 void set_trace_clock_is_sync(int state)
 {
 	_trace_clock_is_sync = state;
+	update_trace_clock_is_sync_vdso();
 }
 
 #if BITS_PER_LONG == 64
@@ -236,8 +246,56 @@ end:
 }
 EXPORT_SYMBOL_GPL(put_trace_clock);
 
+static int posix_get_trace(clockid_t which_clock, struct timespec *tp)
+{
+	union lttng_timespec *lts = (union lttng_timespec *) tp;
+	int ret;
+
+	/*
+	 * Yes, there is a race here that would lead to refcount being
+	 * incremented more than once, but all we care is to leave the trace
+	 * clock active forever, so precise accounting is not needed.
+	 */
+	if (unlikely(!atomic_read(&user_trace_clock_ref))) {
+		ret = get_trace_clock();
+		if (ret)
+			return ret;
+		atomic_inc(&user_trace_clock_ref);
+	}
+	lts->lttng_ts = trace_clock_read64();
+	return 0;
+}
+
+static int posix_get_trace_freq(clockid_t which_clock, struct timespec *tp)
+{
+	union lttng_timespec *lts = (union lttng_timespec *) tp;
+
+	lts->lttng_ts = trace_clock_frequency();
+	return 0;
+}
+
+static int posix_get_trace_res(const clockid_t which_clock, struct timespec *tp)
+{
+	union lttng_timespec *lts = (union lttng_timespec *) tp;
+
+	lts->lttng_ts = TRACE_CLOCK_RES;
+	return 0;
+}
+
 static __init int init_unsync_trace_clock(void)
 {
+	struct k_clock clock_trace = {
+		.clock_getres = posix_get_trace_res,
+		.clock_get = posix_get_trace,
+	};
+	struct k_clock clock_trace_freq = {
+		.clock_getres = posix_get_trace_res,
+		.clock_get = posix_get_trace_freq,
+	};
+
+	register_posix_clock(CLOCK_TRACE, &clock_trace);
+	register_posix_clock(CLOCK_TRACE_FREQ, &clock_trace_freq);
+
 	hotcpu_notifier(hotcpu_callback, 4);
 	return 0;
 }
Index: linux-2.6-lttng/arch/x86/kernel/vsyscall_64.c
===================================================================
--- linux-2.6-lttng.orig/arch/x86/kernel/vsyscall_64.c
+++ linux-2.6-lttng/arch/x86/kernel/vsyscall_64.c
@@ -44,6 +44,8 @@
 #include <asm/desc.h>
 #include <asm/topology.h>
 #include <asm/vgtod.h>
+#include <asm/trace-clock.h>
+#include <asm/timer.h>
 
 #define __vsyscall(nr) \
 		__attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
@@ -61,6 +63,7 @@ struct vsyscall_gtod_data __vsyscall_gto
 {
 	.lock = SEQLOCK_UNLOCKED,
 	.sysctl_enabled = 1,
+	.trace_clock_is_sync = 1,
 };
 
 void update_vsyscall_tz(void)
@@ -73,6 +76,16 @@ void update_vsyscall_tz(void)
 	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
 }
 
+void update_trace_clock_is_sync_vdso(void)
+{
+	unsigned long flags;
+
+	write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
+	vsyscall_gtod_data.trace_clock_is_sync = _trace_clock_is_sync;
+	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
+}
+EXPORT_SYMBOL_GPL(update_trace_clock_is_sync_vdso);
+
 void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
 			struct clocksource *clock, u32 mult)
 {
@@ -89,6 +102,7 @@ void update_vsyscall(struct timespec *wa
 	vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
 	vsyscall_gtod_data.wall_to_monotonic = *wtm;
 	vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
+	vsyscall_gtod_data.trace_clock_is_sync = _trace_clock_is_sync;
 	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
 }
 
Index: linux-2.6-lttng/arch/x86/vdso/vclock_gettime.c
===================================================================
--- linux-2.6-lttng.orig/arch/x86/vdso/vclock_gettime.c
+++ linux-2.6-lttng/arch/x86/vdso/vclock_gettime.c
@@ -22,6 +22,8 @@
 #include <asm/hpet.h>
 #include <asm/unistd.h>
 #include <asm/io.h>
+#include <asm/trace-clock.h>
+#include <asm/timer.h>
 #include "vextern.h"
 
 #define gtod vdso_vsyscall_gtod_data
@@ -111,6 +113,46 @@ notrace static noinline int do_monotonic
 	return 0;
 }
 
+/*
+ * If the TSC is synchronized across all CPUs, read the current TSC
+ * and store it in the timespec storage through union lttng_timespec.
+ */
+notrace static noinline int do_trace_clock(struct timespec *ts)
+{
+	unsigned long seq;
+	union lttng_timespec *lts = (union lttng_timespec *) ts;
+
+	do {
+		seq = read_seqbegin(&gtod->lock);
+		if (unlikely(!gtod->trace_clock_is_sync))
+			return vdso_fallback_gettime(CLOCK_TRACE, ts);
+		/*
+		 * We don't protect the rdtsc with the rdtsc_barrier because
+		 * we can't obtain with tracing that level of precision.
+		 * The operation of recording an event is not atomic therefore
+		 * the small chance of imprecision doesn't justify the overhead
+		 * of a barrier.
+		 */
+		/*
+		 * TODO: check that vget_cycles(), using paravirt ops, will
+		 * match the TSC read by get_cycles() at the kernel level.
+		 */
+		lts->lttng_ts = vget_cycles();
+	} while (unlikely(read_seqretry(&gtod->lock, seq)));
+
+	return 0;
+}
+
+/*
+ * Returns the cpu_khz, it needs to be a syscall because we can't access
+ * this value from userspace and it will only be called at the beginning
+ * of the tracing session
+ */
+notrace static noinline int do_trace_clock_freq(struct timespec *ts)
+{
+	return vdso_fallback_gettime(CLOCK_TRACE_FREQ, ts);
+}
+
 notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
 {
 	if (likely(gtod->sysctl_enabled))
@@ -127,6 +169,12 @@ notrace int __vdso_clock_gettime(clockid
 			return do_realtime_coarse(ts);
 		case CLOCK_MONOTONIC_COARSE:
 			return do_monotonic_coarse(ts);
+		case CLOCK_TRACE:
+			return do_trace_clock(ts);
+		case CLOCK_TRACE_FREQ:
+			return do_trace_clock_freq(ts);
+		default:
+			return -EINVAL;
 		}
 	return vdso_fallback_gettime(clock, ts);
 }
Index: linux-2.6-lttng/include/linux/time.h
===================================================================
--- linux-2.6-lttng.orig/include/linux/time.h
+++ linux-2.6-lttng/include/linux/time.h
@@ -290,6 +290,8 @@ struct itimerval {
 #define CLOCK_MONOTONIC_RAW		4
 #define CLOCK_REALTIME_COARSE		5
 #define CLOCK_MONOTONIC_COARSE		6
+#define CLOCK_TRACE_FREQ		14
+#define CLOCK_TRACE				15
 
 /*
  * The IDs of various hardware clocks:

-- 
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com




More information about the lttng-dev mailing list