[ltt-dev] [RFC PATCH] TRACE_CLOCK in clock_gettime vDSO and syscall

Julien Desfossez julien.desfossez at polymtl.ca
Tue Nov 23 17:22:04 EST 2010


This new option to clock_gettime allows the user to retrieve a precise
timestamp using the LTTng infrastructure. If the TSC is not synchronized
across the CPUs (validated at boot time), the function returns an error
to the user to make it clear that kernel and userspace traces cannot be
synchronized automatically, and that the user needs to fall back on
another clocksource.

The main difference with using the TSC clocksource directly is that the
time starts at machine boot rather than at Linux boot, which makes it
possible to correlate userspace and kernelspace events.
On 64-bit architectures we use only the nsec field of the timespec to
encode the time in nanoseconds; the sec field is always 0.
On 32-bit we have to perform the division and split the sec and nsec fields.

I am not particularly proud of the way the syscall is implemented (in
kernel/posix-timers.c), I would be really glad if anyone has an idea to
clean that up.

Some benchmarks (with 20000000 iterations reading the tsc before and after each
call on an i7 920):

64 bits with vDSO
average cycles for clock_realtime: 101
average cycles for clock_monotonic: 104
average cycles for clock_trace: 66

64 bits without vDSO (using syscall)
average cycles for clock_realtime: 240
average cycles for clock_monotonic: 256
average cycles for clock_trace: 231

32 bits with AND without vDSO
average cycles for clock_realtime: 649
average cycles for clock_monotonic: 661
average cycles for clock_trace: 630

Signed-off-by: Julien Desfossez <julien.desfossez at polymtl.ca>
---
 arch/x86/include/asm/trace-clock.h |    1 +
 arch/x86/include/asm/vgtod.h       |    2 +
 arch/x86/kernel/vsyscall_64.c      |    5 ++++
 arch/x86/vdso/vclock_gettime.c     |   44 ++++++++++++++++++++++++++++++++++++
 include/linux/time.h               |    1 +
 kernel/posix-timers.c              |   27 ++++++++++++++++++++++
 6 files changed, 80 insertions(+), 0 deletions(-)

diff --git a/arch/x86/include/asm/trace-clock.h b/arch/x86/include/asm/trace-clock.h
index 01bc2f5..c1fd160 100644
--- a/arch/x86/include/asm/trace-clock.h
+++ b/arch/x86/include/asm/trace-clock.h
@@ -14,6 +14,7 @@
 #include <asm/system.h>
 #include <asm/processor.h>
 #include <asm/atomic.h>
+#include <asm/vgtod.h>
 
 /* Minimum duration of a probe, in cycles */
 #define TRACE_CLOCK_MIN_PROBE_DURATION 200
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index 3d61e20..7367f1c 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -12,6 +12,8 @@ struct vsyscall_gtod_data {
 	u32		wall_time_nsec;
 
 	int		sysctl_enabled;
+	int		trace_clock_is_sync;
+	unsigned long		cpu_khz;
 	struct timezone sys_tz;
 	struct { /* extract of a clocksource struct */
 		cycle_t (*vread)(void);
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index dcbb28c..ce750b5 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -44,6 +44,8 @@
 #include <asm/desc.h>
 #include <asm/topology.h>
 #include <asm/vgtod.h>
+#include <asm/trace-clock.h>
+#include <asm/timer.h>
 
 #define __vsyscall(nr) \
 		__attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
@@ -61,6 +63,7 @@ struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
 {
 	.lock = SEQLOCK_UNLOCKED,
 	.sysctl_enabled = 1,
+	.trace_clock_is_sync = 0,
 };
 
 void update_vsyscall_tz(void)
@@ -89,6 +92,8 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
 	vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
 	vsyscall_gtod_data.wall_to_monotonic = *wtm;
 	vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
+	vsyscall_gtod_data.cpu_khz = cpu_khz >> CYC2NS_SCALE_FACTOR;
+	vsyscall_gtod_data.trace_clock_is_sync = _trace_clock_is_sync;
 	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
 }
 
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index ee55754..81bc9c7 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -22,6 +22,8 @@
 #include <asm/hpet.h>
 #include <asm/unistd.h>
 #include <asm/io.h>
+#include <asm/trace-clock.h>
+#include <asm/timer.h>
 #include "vextern.h"
 
 #define gtod vdso_vsyscall_gtod_data
@@ -69,6 +71,7 @@ vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
 		--sec;
 	}
 	ts->tv_sec = sec;
+
 	ts->tv_nsec = nsec;
 }
 
@@ -111,6 +114,41 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts)
 	return 0;
 }
 
+#ifdef CONFIG_X86
+notrace static noinline int do_trace_clock(struct timespec *ts)
+{
+	cycle_t cycles;
+	unsigned long tmp;
+
+	if (!gtod->trace_clock_is_sync)
+		return -EPERM;
+
+	/* Copy of the version in kernel/tsc.c which we cannot directly access
+	 *
+	 * Surround the RDTSC by barriers, to make sure it's not
+	 * speculated to outside the seqlock critical section and
+	 * does not cause time warps:
+	 */
+	rdtsc_barrier();
+	cycles = (cycle_t)vget_cycles();
+	rdtsc_barrier();
+
+	/* On 64 bits, the nsec field is enough to store the whole information,
+	 * on 32 bits we need to split it in two fields
+	 */
+	tmp = cycles * (USEC_PER_SEC / gtod->cpu_khz) >> CYC2NS_SCALE_FACTOR;
+#ifdef CONFIG_X86_64
+	ts->tv_nsec = tmp;
+	ts->tv_sec = 0;
+#elif defined CONFIG_X86_32
+	ts->tv_nsec = do_div(tmp, NSEC_PER_SEC);
+	ts->tv_sec = tmp;
+#endif
+
+	return 0;
+}
+#endif /* CONFIG_X86 */
+
 notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
 {
 	if (likely(gtod->sysctl_enabled))
@@ -127,6 +165,12 @@ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
 			return do_realtime_coarse(ts);
 		case CLOCK_MONOTONIC_COARSE:
 			return do_monotonic_coarse(ts);
+#ifdef CONFIG_X86
+		case CLOCK_TRACE:
+			return do_trace_clock(ts);
+#endif
+		default:
+			return -EINVAL;
 		}
 	return vdso_fallback_gettime(clock, ts);
 }
diff --git a/include/linux/time.h b/include/linux/time.h
index 9f15ac7..9e6d9e2 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -290,6 +290,7 @@ struct itimerval {
 #define CLOCK_MONOTONIC_RAW		4
 #define CLOCK_REALTIME_COARSE		5
 #define CLOCK_MONOTONIC_COARSE		6
+#define CLOCK_TRACE					32
 
 /*
  * The IDs of various hardware clocks:
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 9ca4973..4517f1d 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -37,6 +37,8 @@
 #include <linux/mutex.h>
 
 #include <asm/uaccess.h>
+#include <linux/trace-clock.h>
+#include <linux/timer.h>
 #include <linux/list.h>
 #include <linux/init.h>
 #include <linux/compiler.h>
@@ -957,6 +959,31 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
 {
 	struct timespec kernel_tp;
 	int error;
+#ifdef CONFIG_X86
+	u64 cycles, tmp, scaled_cpu_khz;
+
+	if (which_clock == CLOCK_TRACE) {
+		if (!_trace_clock_is_sync)
+			return -EPERM;
+
+		scaled_cpu_khz = cpu_khz >> CYC2NS_SCALE_FACTOR;
+
+		rdtsc_barrier();
+		rdtscll(cycles);
+		rdtsc_barrier();
+
+#ifdef CONFIG_X86_64
+		tmp = cycles*(USEC_PER_SEC/scaled_cpu_khz) >> CYC2NS_SCALE_FACTOR;
+		tp->tv_nsec = tmp;
+		tp->tv_sec = 0;
+#elif defined CONFIG_X86_32
+		tmp = cycles*(div64_u64(USEC_PER_SEC, scaled_cpu_khz)) >> CYC2NS_SCALE_FACTOR;
+		tp->tv_sec = div64_u64(tmp, NSEC_PER_SEC);
+		tp->tv_nsec = tmp - NSEC_PER_SEC * tp->tv_sec;
+#endif
+		return 0;
+	}
+#endif
 
 	if (invalid_clockid(which_clock))
 		return -EINVAL;
-- 
1.7.0.4





More information about the lttng-dev mailing list