[ltt-dev] [RFC PATCH] TRACE_CLOCK in clock_gettime vDSO and syscall

Mathieu Desnoyers mathieu.desnoyers at efficios.com
Tue Nov 23 17:46:22 EST 2010


* Julien Desfossez (julien.desfossez at polymtl.ca) wrote:
> This new option to clock_gettime allows the user to retrieve a precise
> timestamp using the LTTng infrastructure. If the TSC is not synchronized
> across the CPUs (validated at boot time), the function returns an error
> so the user knows that kernel and userspace traces cannot be synchronized
> automatically and that another clocksource must be used as a fallback.

I'm very happy to see the ball rolling! Comments below,

> 
> The main difference with using the TSC clocksource directly is that the
> time starts at machine boot and not at Linux boot, which makes it possible
> to correlate user and kernelspace events.
> On 64-bit architectures we only use the nsec field of the timespec to
> encode the time in nanoseconds; the sec field is always 0.
> On 32-bit we have to do the division and split the sec and nsec.
> 
> I am not particularly proud of the way the syscall is implemented (in
> kernel/posix-timers.c); I would be really glad if anyone has an idea on
> how to clean that up.
> 
> Some benchmarks (with 20000000 iterations reading the tsc before and after each
> call on an i7 920):
> 
> 64 bits with vDSO
> average cycles for clock_realtime: 101
> average cycles for clock_monotonic: 104
> average cycles for clock_trace: 66
> 
> 64 bits without vDSO (using syscall)
> average cycles for clock_realtime: 240
> average cycles for clock_monotonic: 256
> average cycles for clock_trace: 231
> 
> 32 bits with AND without vDSO
> average cycles for clock_realtime: 649
> average cycles for clock_monotonic: 661
> average cycles for clock_trace: 630
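
(For reference, I read the methodology as: rdtsc immediately before and after
each clock_gettime call, averaged over the 20000000 iterations. A minimal
sketch of such a measurement loop, assuming that reading and not code from
the patch:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

static inline uint64_t rdtsc(void)
{
	uint32_t lo, hi;

	asm volatile("rdtsc" : "=a" (lo), "=d" (hi));
	return ((uint64_t) hi << 32) | lo;
}

int main(void)
{
	struct timespec ts;
	uint64_t total = 0;
	unsigned long i, iter = 20000000UL;

	for (i = 0; i < iter; i++) {
		uint64_t t0 = rdtsc();

		clock_gettime(CLOCK_MONOTONIC, &ts);	/* or CLOCK_TRACE */
		total += rdtsc() - t0;
	}
	printf("average cycles: %llu\n", (unsigned long long) (total / iter));
	return 0;
}

Please correct me if the numbers were gathered differently.)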

Do you always fall back on the syscall on 32-bit?

> 
> Signed-off-by: Julien Desfossez <julien.desfossez at polymtl.ca>
> ---
>  arch/x86/include/asm/trace-clock.h |    1 +
>  arch/x86/include/asm/vgtod.h       |    2 +
>  arch/x86/kernel/vsyscall_64.c      |    5 ++++
>  arch/x86/vdso/vclock_gettime.c     |   44 ++++++++++++++++++++++++++++++++++++
>  include/linux/time.h               |    1 +
>  kernel/posix-timers.c              |   27 ++++++++++++++++++++++
>  6 files changed, 80 insertions(+), 0 deletions(-)
> 
> diff --git a/arch/x86/include/asm/trace-clock.h b/arch/x86/include/asm/trace-clock.h
> index 01bc2f5..c1fd160 100644
> --- a/arch/x86/include/asm/trace-clock.h
> +++ b/arch/x86/include/asm/trace-clock.h
> @@ -14,6 +14,7 @@
>  #include <asm/system.h>
>  #include <asm/processor.h>
>  #include <asm/atomic.h>
> +#include <asm/vgtod.h>
>  
>  /* Minimum duration of a probe, in cycles */
>  #define TRACE_CLOCK_MIN_PROBE_DURATION 200
> diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
> index 3d61e20..7367f1c 100644
> --- a/arch/x86/include/asm/vgtod.h
> +++ b/arch/x86/include/asm/vgtod.h
> @@ -12,6 +12,8 @@ struct vsyscall_gtod_data {
>  	u32		wall_time_nsec;
>  
>  	int		sysctl_enabled;
> +	int		trace_clock_is_sync;
> +	unsigned long		cpu_khz;
>  	struct timezone sys_tz;
>  	struct { /* extract of a clocksource struct */
>  		cycle_t (*vread)(void);
> diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
> index dcbb28c..ce750b5 100644
> --- a/arch/x86/kernel/vsyscall_64.c
> +++ b/arch/x86/kernel/vsyscall_64.c
> @@ -44,6 +44,8 @@
>  #include <asm/desc.h>
>  #include <asm/topology.h>
>  #include <asm/vgtod.h>
> +#include <asm/trace-clock.h>
> +#include <asm/timer.h>
>  
>  #define __vsyscall(nr) \
>  		__attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
> @@ -61,6 +63,7 @@ struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
>  {
>  	.lock = SEQLOCK_UNLOCKED,
>  	.sysctl_enabled = 1,
> +	.trace_clock_is_sync = 0,
>  };
>  
>  void update_vsyscall_tz(void)
> @@ -89,6 +92,8 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
>  	vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
>  	vsyscall_gtod_data.wall_to_monotonic = *wtm;
>  	vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
> +	vsyscall_gtod_data.cpu_khz = cpu_khz >> CYC2NS_SCALE_FACTOR;
> +	vsyscall_gtod_data.trace_clock_is_sync = _trace_clock_is_sync;
>  	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
>  }
>  
> diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
> index ee55754..81bc9c7 100644
> --- a/arch/x86/vdso/vclock_gettime.c
> +++ b/arch/x86/vdso/vclock_gettime.c
> @@ -22,6 +22,8 @@
>  #include <asm/hpet.h>
>  #include <asm/unistd.h>
>  #include <asm/io.h>
> +#include <asm/trace-clock.h>
> +#include <asm/timer.h>
>  #include "vextern.h"
>  
>  #define gtod vdso_vsyscall_gtod_data
> @@ -69,6 +71,7 @@ vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
>  		--sec;
>  	}
>  	ts->tv_sec = sec;
> +

extra whitespace to remove.

>  	ts->tv_nsec = nsec;
>  }
>  
> @@ -111,6 +114,41 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts)
>  	return 0;
>  }
>  
> +#ifdef CONFIG_X86
> +notrace static noinline int do_trace_clock(struct timespec *ts)
> +{
> +	cycle_t cycles;
> +	unsigned long tmp;
> +
> +	if (!gtod->trace_clock_is_sync)
> +		return -EPERM;
> +
> +	/* Copy of the version in kernel/tsc.c which we cannot directly access
> +	 *
> +	 * Surround the RDTSC by barriers, to make sure it's not
> +	 * speculated to outside the seqlock critical section and
> +	 * does not cause time warps:
> +	 */
> +	rdtsc_barrier();
> +	cycles = (cycle_t)vget_cycles();
> +	rdtsc_barrier();
> +
> +	/* On 64 bits, the nsec field is enough to store the whole information,
> +	 * on 32 bits we need to split it in two fields
> +	 */
> +	tmp = cycles * (USEC_PER_SEC / gtod->cpu_khz) >> CYC2NS_SCALE_FACTOR;

I'd go for creation of:

union aliased_ts {
  struct timespec ts;
  u64 count;
};

and then:

union aliased_ts *ats = (union aliased_ts *) ts;

ats->count = tmp;

both for x86 32 and 64.

Then you cast the returned struct timespec (in user-space) into the same aliased
structure.

Does it seem clean enough?

The goal is to trash the code below, especially the division.
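
Roughly (untested sketch, file placement to be decided):

/* kernel side, in do_trace_clock(): */
union aliased_ts {
	struct timespec ts;
	u64 count;
};

	union aliased_ts *ats = (union aliased_ts *) ts;

	ats->count = cycles * (USEC_PER_SEC / gtod->cpu_khz)
			>> CYC2NS_SCALE_FACTOR;

/* user side, reading it back through the same union (u64 spelled
 * uint64_t there): */
	struct timespec ts;
	uint64_t ns;

	if (clock_gettime(CLOCK_TRACE, &ts) == 0)
		ns = ((union aliased_ts *) &ts)->count;

sizeof(struct timespec) >= sizeof(u64) on both x86 variants (8 bytes on
32-bit, 16 on 64-bit), so the flat count always fits and neither side
needs a division.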

> +#ifdef CONFIG_X86_64
> +	ts->tv_nsec = tmp;
> +	ts->tv_sec = 0;
> +#elif defined CONFIG_X86_32
> +	ts->tv_nsec = do_div(tmp, NSEC_PER_SEC);
> +	ts->tv_sec = tmp;
> +#endif
> +
> +	return 0;
> +}
> +#endif /* CONFIG_X86 */
> +
>  notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
>  {
>  	if (likely(gtod->sysctl_enabled))
> @@ -127,6 +165,12 @@ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
>  			return do_realtime_coarse(ts);
>  		case CLOCK_MONOTONIC_COARSE:
>  			return do_monotonic_coarse(ts);

OK, so on kernels not supporting our clock ID, we can detect this because
"-EINVAL" is returned. Yep, looks good.

> +#ifdef CONFIG_X86
> +		case CLOCK_TRACE:
> +			return do_trace_clock(ts);
> +#endif
> +		default:
> +			return -EINVAL;
>  		}
>  	return vdso_fallback_gettime(clock, ts);
>  }
> diff --git a/include/linux/time.h b/include/linux/time.h
> index 9f15ac7..9e6d9e2 100644
> --- a/include/linux/time.h
> +++ b/include/linux/time.h
> @@ -290,6 +290,7 @@ struct itimerval {
>  #define CLOCK_MONOTONIC_RAW		4
>  #define CLOCK_REALTIME_COARSE		5
>  #define CLOCK_MONOTONIC_COARSE		6
> +#define CLOCK_TRACE					32
>  
>  /*
>   * The IDs of various hardware clocks:
> diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
> index 9ca4973..4517f1d 100644
> --- a/kernel/posix-timers.c
> +++ b/kernel/posix-timers.c
> @@ -37,6 +37,8 @@
>  #include <linux/mutex.h>
>  
>  #include <asm/uaccess.h>
> +#include <linux/trace-clock.h>
> +#include <linux/timer.h>
>  #include <linux/list.h>
>  #include <linux/init.h>
>  #include <linux/compiler.h>
> @@ -957,6 +959,31 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
>  {
>  	struct timespec kernel_tp;
>  	int error;
> +#ifdef CONFIG_X86

ifdefs in generic code are usually frowned upon.

You probably want to modify "invalid_clockid" so it calls a per-architecture
check, which will only succeed for the x86 arch.

This way, you don't even have to bypass the currently existing test.
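
Something along these lines (untested, names made up):

/* arch/x86/include/asm/trace-clock.h, for instance: */
#define arch_trace_clockid_valid(c)	((c) == CLOCK_TRACE)

/* generic fallback, e.g. in include/linux/posix-timers.h: */
#ifndef arch_trace_clockid_valid
#define arch_trace_clockid_valid(c)	(0)
#endif

/* then in invalid_clockid(), kernel/posix-timers.c: */
	if (arch_trace_clockid_valid(which_clock))
		return 0;
	/* ... existing checks unchanged ... */

An arch hook for the actual read-out could follow the same pattern, keeping
kernel/posix-timers.c free of x86 #ifdefs.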

Thanks,

Mathieu

> +	u64 cycles, tmp, scaled_cpu_khz;
> +
> +	if (which_clock == CLOCK_TRACE) {
> +		if (!_trace_clock_is_sync)
> +			return -EPERM;
> +
> +		scaled_cpu_khz = cpu_khz >> CYC2NS_SCALE_FACTOR;
> +
> +		rdtsc_barrier();
> +		rdtscll(cycles);
> +		rdtsc_barrier();
> +
> +#ifdef CONFIG_X86_64
> +		tmp = cycles*(USEC_PER_SEC/scaled_cpu_khz) >> CYC2NS_SCALE_FACTOR;
> +		tp->tv_nsec = tmp;
> +		tp->tv_sec = 0;
> +#elif defined CONFIG_X86_32
> +		tmp = cycles*(div64_u64(USEC_PER_SEC, scaled_cpu_khz)) >> CYC2NS_SCALE_FACTOR;
> +		tp->tv_sec = div64_u64(tmp, NSEC_PER_SEC);
> +		tp->tv_nsec = tmp - NSEC_PER_SEC * tp->tv_sec;
> +#endif
> +		return 0;
> +	}
> +#endif
>  
>  	if (invalid_clockid(which_clock))
>  		return -EINVAL;
> -- 
> 1.7.0.4
> 

-- 
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com