[ltt-dev] [PATCH] LTTng optimize write to page function

Wed Feb 4 00:44:22 EST 2009

* KOSAKI Motohiro (kosaki.motohiro at jp.fujitsu.com) wrote:
> Hi
> 
> > +static inline void ltt_relay_do_copy(void *dest, const void *src, size_t len)
> > +{
> > +	switch (len) {
> > +	case 1:	*(u8 *)dest = *(const u8 *)src;
> > +		break;
> > +	case 2:	*(u16 *)dest = *(const u16 *)src;
> > +		break;
> > +	case 4:	*(u32 *)dest = *(const u32 *)src;
> > +		break;
> > +#if (BITS_PER_LONG == 64)
> > +	case 8:	*(u64 *)dest = *(const u64 *)src;
> > +		break;
> > +#endif
> > +	default:
> > +		memcpy(dest, src, len);
> > +	}
> > +}
> 
> hm, interesting.
> 
> IIRC, a few months ago, Linus said this optimization is not an optimization.
> The latest gcc does this inlining automatically.
> (but I can't point to its URL, sorry)
> 
> Is this result gcc version independent? And can you send
> the difference in the gcc assembly output?

Here we go :

x86_64
gcc (Debian 4.3.2-1) 4.3.2 (haven't tried other compiler versions)
kernel 2.6.29-rc3

/*
 * Destination and source buffers shared by all of the copy test functions.
 * They are globals, and the generated code in the listings refers to them
 * by symbol name.
 */
char dataout[100];
char datain[100];

/*
 * Copy length for the non-constant tests. It is a plain (non-const) global,
 * so its value is loaded at run time and the length is not statically known
 * to the compiler at the call site.
 */
int sizea = 8;

/*
 * Test case: copy from datain to dataout through ltt_relay_do_copy(), with
 * the length taken from the global sizea (not a compile-time constant).
 */
void testfct_ltt(void)
{
        /* The two asm statements only carry comment text; presumably they
         * mark the start and end of the region of interest in the
         * compiler's assembly output. */
        asm ("/* begin */");
        ltt_relay_do_copy(dataout, datain, sizea);
        asm ("/* end*/");
}

Turns into a jump table :

        movslq  sizea(%rip),%rdx
        cmpq    $8, %rdx
        jbe     .L15
.L6:
        movq    $datain, %rsi
        movq    $dataout, %rdi
        call    memcpy
        .p2align 4,,10
        .p2align 3
.L7:
[...]
.L15:
        jmp     *.L12(,%rdx,8)

        .section        .rodata
        .align 8
        .align 4
.L12:
        .quad   .L7
        .quad   .L8
        .quad   .L9
        .quad   .L6
        .quad   .L10
        .quad   .L6
        .quad   .L6
        .quad   .L6
        .quad   .L11
        .text
        .p2align 4,,10
        .p2align 3
.L11:
        movq    datain(%rip), %rax
        movq    %rax, dataout(%rip)
        jmp     .L7
        .p2align 4,,10
        .p2align 3
.L8:
        movzbl  datain(%rip), %eax
        movb    %al, dataout(%rip)
        jmp     .L7
        .p2align 4,,10
        .p2align 3
.L9:
        movzwl  datain(%rip), %eax
        movw    %ax, dataout(%rip)
        jmp     .L7
        .p2align 4,,10
        .p2align 3
.L10:
        movl    datain(%rip), %eax
        movl    %eax, dataout(%rip)
        jmp     .L7
        .size   testfct_ltt, .-testfct_ltt
        .p2align 4,,15

/*
 * Test case: copy from datain to dataout with a direct memcpy() call, with
 * the length taken from the global sizea (not a compile-time constant).
 */
void testfct_memcpy(void)
{
        /* The two asm statements only carry comment text; presumably they
         * mark the start and end of the region of interest in the
         * compiler's assembly output. */
        asm ("/* begin */");
        memcpy(dataout, datain, sizea);
        asm ("/* end */");
}

Turns into a function call because the size is not statically known :

        movslq  sizea(%rip),%rdx
        movq    $datain, %rsi
        movq    $dataout, %rdi
        call    memcpy

Below, when a constant is passed, both behave similarly :

/*
 * Test case: copy from datain to dataout through ltt_relay_do_copy(), with
 * the length given as the literal constant 8.
 */
void testfct_ltt_const(void)
{
        /* The two asm statements only carry comment text; presumably they
         * mark the start and end of the region of interest in the
         * compiler's assembly output. */
        asm ("/* begin */");
        ltt_relay_do_copy(dataout, datain, 8);
        asm ("/* end*/");
}

        movq    datain(%rip), %rax
        movq    %rax, dataout(%rip)

/*
 * Test case: copy from datain to dataout with a direct memcpy() call, with
 * the length given as the literal constant 8.
 */
void testfct_memcpy_const(void)
{
        /* The two asm statements only carry comment text; presumably they
         * mark the start and end of the region of interest in the
         * compiler's assembly output. */
        asm ("/* begin */");
        memcpy(dataout, datain, 8);
        asm ("/* end */");
}

        movq    datain(%rip), %rax
        movq    %rax, dataout(%rip)

Therefore, I agree that when memcpy is passed a compile-time constant
size, gcc generates the same code as for my ltt_relay_do_copy. However,
when we usually expect sizes of 1, 2, 4 and 8 bytes that are unknown at
compile-time, the jump table saves the costly function call to memcpy.

Mathieu

-- 
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68