[ltt-dev] [RFC PATCH v2] TRACE_CLOCK and TRACE_CLOCK_FREQ in clock_gettime

Julien Desfossez julien.desfossez at polymtl.ca
Fri Nov 26 22:27:55 EST 2010


These new options to clock_gettime allow the user to retrieve the TSC
frequency and the current TSC from userspace.
We use the LTTng infrastructure to make sure the TSC is synchronized. If
it is not, we fall back to a syscall (which for the moment does the same
thing, but in the future will be modified to ensure consistency between
user- and kernel-space tracing).

The main difference from using the TSC clocksource directly is that the
time starts at machine boot rather than at Linux boot, which makes it
possible to correlate user- and kernel-space events. Also, we export the
frequency and raw cycle counts; we do not convert to sec.nsec in the
kernel since we don't need it.

The differences from v1 are:
- we validated that the clock_gettime vDSO does not exist on 32-bit,
  which allowed us to clean up the vDSO code;
- the syscall is now properly defined using the posix timer architecture;
- we export the frequency to userspace, so we no longer need to convert
  the cycles to sec.nsec. This means that on 64-bit machines the nsec
  field contains the whole cycle counter, while on 32-bit the value is
  split between the two fields sec and nsec;
- removed the rdtsc_barrier(), which is overkill for tracing purposes;
- the trace_clock_is_sync field is updated as soon as the LTTng trace
  clock detects an inconsistency.

Updated benchmarks (with 20000000 iterations reading the tsc before and
after each call on an i7 920):

64 bits with vDSO
average cycles for clock_realtime: 101
average cycles for clock_monotonic: 104
average cycles for clock_trace: 52

64 bits without vDSO (using syscall)
average cycles for clock_realtime: 240
average cycles for clock_monotonic: 256
average cycles for clock_trace: 219

32 bits (without vDSO)
average cycles for clock_realtime: 649
average cycles for clock_monotonic: 661
average cycles for clock_trace: 616

Signed-off-by: Julien Desfossez <julien.desfossez at polymtl.ca>
---
 arch/x86/include/asm/trace-clock.h |    1 +
 arch/x86/include/asm/vgtod.h       |    1 +
 arch/x86/include/asm/vsyscall.h    |    2 +
 arch/x86/kernel/trace-clock.c      |   51 ++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/vsyscall_64.c      |   14 ++++++++++
 arch/x86/vdso/vclock_gettime.c     |   44 +++++++++++++++++++++++++++++++
 include/linux/time.h               |    5 +++
 7 files changed, 118 insertions(+), 0 deletions(-)

diff --git a/arch/x86/include/asm/trace-clock.h b/arch/x86/include/asm/trace-clock.h
index 01bc2f5..c1fd160 100644
--- a/arch/x86/include/asm/trace-clock.h
+++ b/arch/x86/include/asm/trace-clock.h
@@ -14,6 +14,7 @@
 #include <asm/system.h>
 #include <asm/processor.h>
 #include <asm/atomic.h>
+#include <asm/vgtod.h>
 
 /* Minimum duration of a probe, in cycles */
 #define TRACE_CLOCK_MIN_PROBE_DURATION 200
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index 3d61e20..06abe8f 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -12,6 +12,7 @@ struct vsyscall_gtod_data {
 	u32		wall_time_nsec;
 
 	int		sysctl_enabled;
+	int		trace_clock_is_sync;
 	struct timezone sys_tz;
 	struct { /* extract of a clocksource struct */
 		cycle_t (*vread)(void);
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index d0983d2..ace5576 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -39,6 +39,8 @@ extern struct timezone sys_tz;
 
 extern void map_vsyscall(void);
 
+extern void update_trace_clock_is_sync(void);
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_X86_VSYSCALL_H */
diff --git a/arch/x86/kernel/trace-clock.c b/arch/x86/kernel/trace-clock.c
index ec866aa..841a435 100644
--- a/arch/x86/kernel/trace-clock.c
+++ b/arch/x86/kernel/trace-clock.c
@@ -11,6 +11,8 @@
 #include <linux/jiffies.h>
 #include <linux/timer.h>
 #include <linux/cpu.h>
+#include <linux/posix-timers.h>
+#include <asm/vgtod.h>
 
 static cycles_t trace_clock_last_tsc;
 static DEFINE_PER_CPU(struct timer_list, update_timer);
@@ -27,6 +29,10 @@ EXPORT_SYMBOL_GPL(_trace_clock_is_sync);
 void set_trace_clock_is_sync(int state)
 {
 	_trace_clock_is_sync = state;
+#ifdef CONFIG_X86_64
+	/* Update the clock_gettime vDSO shared structure */
+	update_trace_clock_is_sync();
+#endif
 }
 
 #if BITS_PER_LONG == 64
@@ -238,8 +244,53 @@ end:
 }
 EXPORT_SYMBOL_GPL(put_trace_clock);
 
+/*
+ * If the TSC is synchronized across the CPUs
+ * encode it in the timespec fields
+ */
+static int posix_get_trace(clockid_t which_clock,
+		struct timespec *tp)
+{
+	if (!_trace_clock_is_sync)
+		return -EPERM;
+
+#ifdef CONFIG_X86_64
+	tp->tv_sec = 0;
+	rdtscll(tp->tv_nsec);
+#else
+	/* Need to cast the nsec in unsigned long in userspace */
+	asm volatile ("rdtsc" : "=d" (tp->tv_sec), "=a" (tp->tv_nsec));
+#endif
+	return 0;
+}
+
+/*
+ * Encode the cpu frequency in the nsec field of the timespec
+ */
+static int posix_get_trace_freq(clockid_t which_clock,
+		struct timespec *tp)
+{
+	tp->tv_sec = 0;
+	tp->tv_nsec = (long)cpu_khz;
+	return 0;
+}
+
 static __init int init_unsync_trace_clock(void)
 {
+	struct k_clock clock_trace = {
+		/* FIXME : placeholder, need to be not NULL but is not used */
+		.clock_getres = hrtimer_get_res,
+		.clock_get = posix_get_trace,
+	};
+	struct k_clock clock_trace_freq = {
+		/* FIXME : placeholder, need to be not NULL but is not used */
+		.clock_getres = hrtimer_get_res,
+		.clock_get = posix_get_trace_freq,
+	};
+
+	register_posix_clock(CLOCK_TRACE, &clock_trace);
+	register_posix_clock(CLOCK_TRACE_FREQ, &clock_trace_freq);
+
 	hotcpu_notifier(hotcpu_callback, 4);
 	return 0;
 }
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index dcbb28c..74a94cb 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -44,6 +44,8 @@
 #include <asm/desc.h>
 #include <asm/topology.h>
 #include <asm/vgtod.h>
+#include <asm/trace-clock.h>
+#include <asm/timer.h>
 
 #define __vsyscall(nr) \
 		__attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
@@ -61,6 +63,7 @@ struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
 {
 	.lock = SEQLOCK_UNLOCKED,
 	.sysctl_enabled = 1,
+	.trace_clock_is_sync = 0,
 };
 
 void update_vsyscall_tz(void)
@@ -73,6 +76,16 @@ void update_vsyscall_tz(void)
 	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
 }
 
+void update_trace_clock_is_sync(void)
+{
+	unsigned long flags;
+
+	write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
+	vsyscall_gtod_data.trace_clock_is_sync = _trace_clock_is_sync;
+	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
+}
+EXPORT_SYMBOL_GPL(update_trace_clock_is_sync);
+
 void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
 			struct clocksource *clock, u32 mult)
 {
@@ -89,6 +102,7 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
 	vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
 	vsyscall_gtod_data.wall_to_monotonic = *wtm;
 	vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
+	vsyscall_gtod_data.trace_clock_is_sync = _trace_clock_is_sync;
 	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
 }
 
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index ee55754..54031f9 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -22,6 +22,8 @@
 #include <asm/hpet.h>
 #include <asm/unistd.h>
 #include <asm/io.h>
+#include <asm/trace-clock.h>
+#include <asm/timer.h>
 #include "vextern.h"
 
 #define gtod vdso_vsyscall_gtod_data
@@ -111,6 +113,42 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts)
 	return 0;
 }
 
+/*
+ * If the TSC is synchronized across all CPUs, read the current TSC
+ * and export its value in the nsec field of the timespec
+ */
+notrace static noinline int do_trace_clock(struct timespec *ts)
+{
+	unsigned long seq;
+
+	do {
+		seq = read_seqbegin(&gtod->lock);
+		if (unlikely(!gtod->trace_clock_is_sync))
+			return vdso_fallback_gettime(CLOCK_TRACE, ts);
+		/*
+		 * We don't protect the rdtsc with the rdtsc_barrier because
+		 * we can't obtain with tracing that level of precision.
+		 * The operation of recording an event is not atomic therefore
+		 * the small chance of imprecision doesn't justify the overhead
+		 * of a barrier.
+		 */
+		ts->tv_nsec = (cycle_t)vget_cycles();
+		ts->tv_sec = 0;
+	} while (unlikely(read_seqretry(&gtod->lock, seq)));
+
+	return 0;
+}
+
+/*
+ * Returns the cpu_khz, it needs to be a syscall because we can't access
+ * this value from userspace and it will only be called at the beginning
+ * of the tracing session
+ */
+notrace static noinline int do_trace_clock_freq(struct timespec *ts)
+{
+	return vdso_fallback_gettime(CLOCK_TRACE_FREQ, ts);
+}
+
 notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
 {
 	if (likely(gtod->sysctl_enabled))
@@ -127,6 +165,12 @@ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
 			return do_realtime_coarse(ts);
 		case CLOCK_MONOTONIC_COARSE:
 			return do_monotonic_coarse(ts);
+		case CLOCK_TRACE:
+			return do_trace_clock(ts);
+		case CLOCK_TRACE_FREQ:
+			return do_trace_clock_freq(ts);
+		default:
+			return -EINVAL;
 		}
 	return vdso_fallback_gettime(clock, ts);
 }
diff --git a/include/linux/time.h b/include/linux/time.h
index 9f15ac7..ea72a9d 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -127,6 +127,9 @@ struct timespec current_kernel_time(void);
 struct timespec __current_kernel_time(void); /* does not take xtime_lock */
 struct timespec __get_wall_to_monotonic(void); /* does not take xtime_lock */
 struct timespec get_monotonic_coarse(void);
+#ifdef CONFIG_X86
+struct timespec get_trace(void);
+#endif
 
 #define CURRENT_TIME		(current_kernel_time())
 #define CURRENT_TIME_SEC	((struct timespec) { get_seconds(), 0 })
@@ -290,6 +293,8 @@ struct itimerval {
 #define CLOCK_MONOTONIC_RAW		4
 #define CLOCK_REALTIME_COARSE		5
 #define CLOCK_MONOTONIC_COARSE		6
+#define CLOCK_TRACE_FREQ		14
+#define CLOCK_TRACE				15
 
 /*
  * The IDs of various hardware clocks:
-- 
1.7.0.4





More information about the lttng-dev mailing list