[ltt-dev] [RFC PATCH] TRACE_CLOCK in clock_gettime vDSO and syscall
Mathieu Desnoyers
mathieu.desnoyers at efficios.com
Tue Nov 23 17:46:22 EST 2010
* Julien Desfossez (julien.desfossez at polymtl.ca) wrote:
> This new option to clock_gettime allows the user to retrieve a precise
> timestamp using the LTTng infrastructure. If the TSC is not synchronized
> across CPUs (validated at boot time), the function returns an error so
> the user knows that kernel and userspace traces cannot be correlated
> automatically and that another clocksource must be used as a fallback.
I'm very happy to see the ball rolling! Comments below.
>
> The main difference with using the TSC clocksource directly is that the
> time starts at machine boot rather than at Linux boot, which makes it
> possible to correlate user and kernelspace events.
> On 64-bit architectures we only use the nsec field of the timespec to
> encode the time in nanoseconds; the sec field is always 0.
> On 32-bit we have to do the division and split the sec and nsec fields.
>
> I am not particularly proud of the way the syscall is implemented (in
> kernel/posix-timers.c); I would be really glad if anyone has an idea to
> clean that up.
>
> Some benchmarks (20000000 iterations on an i7 920, reading the TSC before
> and after each call):
>
> 64-bit with vDSO
> average cycles for clock_realtime: 101
> average cycles for clock_monotonic: 104
> average cycles for clock_trace: 66
>
> 64-bit without vDSO (using syscall)
> average cycles for clock_realtime: 240
> average cycles for clock_monotonic: 256
> average cycles for clock_trace: 231
>
> 32-bit with AND without vDSO
> average cycles for clock_realtime: 649
> average cycles for clock_monotonic: 661
> average cycles for clock_trace: 630
Do you always fall back on the syscall on 32-bit?
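By the way, for readers who want to reproduce these numbers, the measurement
loop would look something like this (my guess at the harness, untested; the
CLOCK_TRACE value is taken from the patch below, link with -lrt):

#include <stdio.h>
#include <stdint.h>
#include <time.h>

#define CLOCK_TRACE     32              /* from include/linux/time.h below */
#define ITERATIONS      20000000UL

static inline uint64_t rdtsc(void)
{
        uint32_t lo, hi;

        asm volatile("rdtsc" : "=a" (lo), "=d" (hi));
        return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
        struct timespec ts;
        uint64_t begin, total = 0;
        unsigned long i;

        for (i = 0; i < ITERATIONS; i++) {
                begin = rdtsc();
                clock_gettime(CLOCK_TRACE, &ts);
                total += rdtsc() - begin;
        }
        printf("average cycles for clock_trace: %llu\n",
               (unsigned long long)(total / ITERATIONS));
        return 0;
}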
>
> Signed-off-by: Julien Desfossez <julien.desfossez at polymtl.ca>
> ---
> arch/x86/include/asm/trace-clock.h | 1 +
> arch/x86/include/asm/vgtod.h | 2 +
> arch/x86/kernel/vsyscall_64.c | 5 ++++
> arch/x86/vdso/vclock_gettime.c | 44 ++++++++++++++++++++++++++++++++++++
> include/linux/time.h | 1 +
> kernel/posix-timers.c | 27 ++++++++++++++++++++++
> 6 files changed, 80 insertions(+), 0 deletions(-)
>
> diff --git a/arch/x86/include/asm/trace-clock.h b/arch/x86/include/asm/trace-clock.h
> index 01bc2f5..c1fd160 100644
> --- a/arch/x86/include/asm/trace-clock.h
> +++ b/arch/x86/include/asm/trace-clock.h
> @@ -14,6 +14,7 @@
> #include <asm/system.h>
> #include <asm/processor.h>
> #include <asm/atomic.h>
> +#include <asm/vgtod.h>
>
> /* Minimum duration of a probe, in cycles */
> #define TRACE_CLOCK_MIN_PROBE_DURATION 200
> diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
> index 3d61e20..7367f1c 100644
> --- a/arch/x86/include/asm/vgtod.h
> +++ b/arch/x86/include/asm/vgtod.h
> @@ -12,6 +12,8 @@ struct vsyscall_gtod_data {
> u32 wall_time_nsec;
>
> int sysctl_enabled;
> + int trace_clock_is_sync;
> + unsigned long cpu_khz;
> struct timezone sys_tz;
> struct { /* extract of a clocksource struct */
> cycle_t (*vread)(void);
> diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
> index dcbb28c..ce750b5 100644
> --- a/arch/x86/kernel/vsyscall_64.c
> +++ b/arch/x86/kernel/vsyscall_64.c
> @@ -44,6 +44,8 @@
> #include <asm/desc.h>
> #include <asm/topology.h>
> #include <asm/vgtod.h>
> +#include <asm/trace-clock.h>
> +#include <asm/timer.h>
>
> #define __vsyscall(nr) \
> __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
> @@ -61,6 +63,7 @@ struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
> {
> .lock = SEQLOCK_UNLOCKED,
> .sysctl_enabled = 1,
> + .trace_clock_is_sync = 0,
> };
>
> void update_vsyscall_tz(void)
> @@ -89,6 +92,8 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
> vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
> vsyscall_gtod_data.wall_to_monotonic = *wtm;
> vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
> + vsyscall_gtod_data.cpu_khz = cpu_khz >> CYC2NS_SCALE_FACTOR;
> + vsyscall_gtod_data.trace_clock_is_sync = _trace_clock_is_sync;
> write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
> }
>
> diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
> index ee55754..81bc9c7 100644
> --- a/arch/x86/vdso/vclock_gettime.c
> +++ b/arch/x86/vdso/vclock_gettime.c
> @@ -22,6 +22,8 @@
> #include <asm/hpet.h>
> #include <asm/unistd.h>
> #include <asm/io.h>
> +#include <asm/trace-clock.h>
> +#include <asm/timer.h>
> #include "vextern.h"
>
> #define gtod vdso_vsyscall_gtod_data
> @@ -69,6 +71,7 @@ vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
> --sec;
> }
> ts->tv_sec = sec;
> +
extra whitespace to remove.
> ts->tv_nsec = nsec;
> }
>
> @@ -111,6 +114,41 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts)
> return 0;
> }
>
> +#ifdef CONFIG_X86
> +notrace static noinline int do_trace_clock(struct timespec *ts)
> +{
> + cycle_t cycles;
> + unsigned long tmp;
> +
> + if (!gtod->trace_clock_is_sync)
> + return -EPERM;
> +
> + /* Copy of the version in kernel/tsc.c which we cannot directly access
> + *
> + * Surround the RDTSC by barriers, to make sure it's not
> + * speculated to outside the seqlock critical section and
> + * does not cause time warps:
> + */
> + rdtsc_barrier();
> + cycles = (cycle_t)vget_cycles();
> + rdtsc_barrier();
> +
> + /* On 64 bits, the nsec field is enough to store the whole information,
> + * on 32 bits we need to split it in two fields
> + */
> + tmp = cycles * (USEC_PER_SEC / gtod->cpu_khz) >> CYC2NS_SCALE_FACTOR;
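(For readers: since gtod->cpu_khz is pre-scaled to cpu_khz >>
CYC2NS_SCALE_FACTOR, this computes approximately ns = cycles * 10^6 /
cpu_khz. E.g., assuming CYC2NS_SCALE_FACTOR = 10 on a 2.67 GHz i7:
2670000 >> 10 = 2607, 10^6 / 2607 = 383, so cycles * 383 >> 10 =
cycles * 0.3740 ns, vs. the exact 0.3745 ns/cycle.)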
I'd go for creating:
union aliased_ts {
struct timespec ts;
u64 count;
};
and then:
union aliased_ts *ats = ts;
ats->count = tmp;
both for x86 32 and 64.
Then you cast the returned struct timespec (in user-space) into the same aliased
structure.
Does it seem clean enough?
The goal is to trash the code below, especially the division.
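Concretely, something along these lines (untested sketch; user-space would
use uint64_t instead of u64):

/* shared definition, kernel and user-space tracer */
union aliased_ts {
        struct timespec ts;
        u64 count;
};

/* kernel side, in do_trace_clock(), identical on x86 32 and 64: */
        union aliased_ts *ats = (union aliased_ts *)ts;

        ats->count = tmp;

/* user-space side, reading back through the same union: */
        struct timespec ts;
        union aliased_ts *ats = (union aliased_ts *)&ts;
        uint64_t ns;

        if (clock_gettime(CLOCK_TRACE, &ts) == 0)
                ns = ats->count;

On 32-bit the u64 exactly overlays the two 32-bit timespec fields; on
64-bit it aliases tv_sec, which is fine as long as user-space reads the
value back through the same union.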
> +#ifdef CONFIG_X86_64
> + ts->tv_nsec = tmp;
> + ts->tv_sec = 0;
> +#elif defined CONFIG_X86_32
> + ts->tv_nsec = do_div(tmp, NSEC_PER_SEC);
> + ts->tv_sec = tmp;
> +#endif
> +
> + return 0;
> +}
> +#endif /* CONFIG_X86 */
> +
> notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
> {
> if (likely(gtod->sysctl_enabled))
> @@ -127,6 +165,12 @@ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
> return do_realtime_coarse(ts);
> case CLOCK_MONOTONIC_COARSE:
> return do_monotonic_coarse(ts);
OK, so on kernels not supporting our clock ID, we can detect this because
"-EINVAL" is returned. Yep, looks good.
> +#ifdef CONFIG_X86
> + case CLOCK_TRACE:
> + return do_trace_clock(ts);
> +#endif
> + default:
> + return -EINVAL;
> }
> return vdso_fallback_gettime(clock, ts);
> }
> diff --git a/include/linux/time.h b/include/linux/time.h
> index 9f15ac7..9e6d9e2 100644
> --- a/include/linux/time.h
> +++ b/include/linux/time.h
> @@ -290,6 +290,7 @@ struct itimerval {
> #define CLOCK_MONOTONIC_RAW 4
> #define CLOCK_REALTIME_COARSE 5
> #define CLOCK_MONOTONIC_COARSE 6
> +#define CLOCK_TRACE 32
>
> /*
> * The IDs of various hardware clocks:
> diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
> index 9ca4973..4517f1d 100644
> --- a/kernel/posix-timers.c
> +++ b/kernel/posix-timers.c
> @@ -37,6 +37,8 @@
> #include <linux/mutex.h>
>
> #include <asm/uaccess.h>
> +#include <linux/trace-clock.h>
> +#include <linux/timer.h>
> #include <linux/list.h>
> #include <linux/init.h>
> #include <linux/compiler.h>
> @@ -957,6 +959,31 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
> {
> struct timespec kernel_tp;
> int error;
> +#ifdef CONFIG_X86
ifdefs in generic code are usually frowned upon.
You probably want to modify "invalid_clockid" so it calls a per-architecture
check, which will only succeed for the x86 arch.
This way, you don't even have to bypass the currently existing test.
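Something like the following, perhaps (untested sketch; the hook and config
names are made up):

/* generic fallback, e.g. in include/linux/trace-clock.h */
#ifndef CONFIG_HAVE_TRACE_CLOCK
static inline int arch_trace_clock_valid(void)
{
        return 0;
}
#endif

/* arch/x86/include/asm/trace-clock.h */
static inline int arch_trace_clock_valid(void)
{
        return 1;       /* clock id exists; sync is still checked at read time */
}

/* kernel/posix-timers.c, in invalid_clockid(): */
        if (which_clock == CLOCK_TRACE)
                return !arch_trace_clock_valid();

This keeps kernel/posix-timers.c free of arch ifdefs and lets the normal
-EINVAL path handle architectures without a trace clock.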
Thanks,
Mathieu
> + u64 cycles, tmp, scaled_cpu_khz;
> +
> + if (which_clock == CLOCK_TRACE) {
> + if (!_trace_clock_is_sync)
> + return -EPERM;
> +
> + scaled_cpu_khz = cpu_khz >> CYC2NS_SCALE_FACTOR;
> +
> + rdtsc_barrier();
> + rdtscll(cycles);
> + rdtsc_barrier();
> +
> +#ifdef CONFIG_X86_64
> + tmp = cycles*(USEC_PER_SEC/scaled_cpu_khz) >> CYC2NS_SCALE_FACTOR;
> + tp->tv_nsec = tmp;
> + tp->tv_sec = 0;
> +#elif defined CONFIG_X86_32
> + tmp = cycles*(div64_u64(USEC_PER_SEC, scaled_cpu_khz)) >> CYC2NS_SCALE_FACTOR;
> + tp->tv_sec = div64_u64(tmp, NSEC_PER_SEC);
> + tp->tv_nsec = tmp - NSEC_PER_SEC * tp->tv_sec;
> +#endif
> + return 0;
> + }
> +#endif
>
> if (invalid_clockid(which_clock))
> return -EINVAL;
> --
> 1.7.0.4
>
--
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com