[ltt-dev] [PATCH] LTTng optimize write to page function

Mathieu Desnoyers compudj at krystal.dyndns.org
Tue Feb 3 23:53:04 EST 2009


* Zhaolei (zhaolei at cn.fujitsu.com) wrote:
> Mathieu Desnoyers Wrote:
> > * Lai Jiangshan (laijs at cn.fujitsu.com) wrote:
> >> Mathieu Desnoyers wrote:
> >>> The functions in ltt-relay-alloc.c take care of writing the data into
> >>> the buffer pages. Those pages are allocated from the page allocator and
> >>> no virtual mapping is done so we can save precious TLB entries.
> >>> ltt-relay-alloc.c is the abstraction layer which makes the buffers
> >>> "look" like a contiguous memory area, although they are made from
> >>> physically discontiguous pages linked with a linked list. A caching
> >>> mechanism makes sure we never walk over more than 1-2 entries of the
> >>> list. We use a linked list rather than a table to make sure we don't
> >>> depend on vmalloc to allocate large pointer arrays.
> >>>
> >>> I did a bit of profiling with oprofile on LTTng and found out that write
> >>> functions in ltt-relay-alloc.c were taking a lot of CPU time. I thought it would
> >>> be good to improve them a bit.
> >>>
> >>> Running a 2.6.29-rc3 kernel
> >>>
> >>> Compiling a 2.6.25 kernel using make -j10 on an 8-core x86_64 with a vanilla
> >>> 2.6.29-rc3 kernel (all tests are cache-hot):
> >>> real 1m22.103s
> >>>
> >>> With dormant instrumentation
> >>> real 1m24.667s
> >>> (note: this 2s regression should eventually be identified by doing a bisection
> >>> of the LTTng tree.)
> >>>
> >>> ltt-armall
> >>>
> >>> Without modification, with flight recorder tracing active :
> >>> real 1m31.135s
> >>>
> >>> Replacing the memcpy call with a specialized call for 1, 2, 4 and 8 bytes :
> >>> real 1m30.440s
> >>>
> >>> Inlining the fast path of the write function :
> >>> real 1m29.614s
> >>>
> >>> Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers at polymtl.ca>
> >>> CC: Martin Bligh <mbligh at google.com>
> >>> CC: Zhaolei <zhaolei at cn.fujitsu.com>
> >>> ---
> >>>  include/linux/ltt-relay.h |   88 ++++++++++++++++++++++++++++++++++++++++++++--
> >>>  ltt/ltt-relay-alloc.c     |   66 ++++++----------------------------
> >>>  2 files changed, 98 insertions(+), 56 deletions(-)
> >>>
> >>> Index: linux-2.6-lttng/ltt/ltt-relay-alloc.c
> >>> ===================================================================
> >>> --- linux-2.6-lttng.orig/ltt/ltt-relay-alloc.c	2009-02-03 10:37:05.000000000 -0500
> >>> +++ linux-2.6-lttng/ltt/ltt-relay-alloc.c	2009-02-03 10:37:13.000000000 -0500
> >>> @@ -424,7 +424,7 @@ EXPORT_SYMBOL_GPL(ltt_relay_close);
> >>>  /*
> >>>   * Start iteration at the previous element. Skip the real list head.
> >>>   */
> >>> -static struct buf_page *ltt_relay_find_prev_page(struct rchan_buf *buf,
> >>> +struct buf_page *ltt_relay_find_prev_page(struct rchan_buf *buf,
> >>>  	struct buf_page *page, size_t offset, ssize_t diff_offset)
> >>>  {
> >>>  	struct buf_page *iter;
> >>> @@ -456,13 +456,15 @@ static struct buf_page *ltt_relay_find_p
> >>>  			return iter;
> >>>  		}
> >>>  	}
> >>> +	WARN_ON(1);
> >>>  	return NULL;
> >>>  }
> >>> +EXPORT_SYMBOL_GPL(ltt_relay_find_prev_page);
> >>>  
> >>>  /*
> >>>   * Start iteration at the next element. Skip the real list head.
> >>>   */
> >>> -static struct buf_page *ltt_relay_find_next_page(struct rchan_buf *buf,
> >>> +struct buf_page *ltt_relay_find_next_page(struct rchan_buf *buf,
> >>>  	struct buf_page *page, size_t offset, ssize_t diff_offset)
> >>>  {
> >>>  	struct buf_page *iter;
> >>> @@ -494,48 +496,10 @@ static struct buf_page *ltt_relay_find_n
> >>>  			return iter;
> >>>  		}
> >>>  	}
> >>> +	WARN_ON(1);
> >>>  	return NULL;
> >>>  }
> >>> -
> >>> -/*
> >>> - * Find the page containing "offset". Cache it if it is after the currently
> >>> - * cached page.
> >>> - */
> >>> -static struct buf_page *ltt_relay_cache_page(struct rchan_buf *buf,
> >>> -		struct buf_page **page_cache,
> >>> -		struct buf_page *page, size_t offset)
> >>> -{
> >>> -	ssize_t diff_offset;
> >>> -	ssize_t half_buf_size = buf->chan->alloc_size >> 1;
> >>> -
> >>> -	/*
> >>> -	 * Make sure this is the page we want to write into. The current
> >>> -	 * page is changed concurrently by other writers. [wrh]page are
> >>> -	 * used as a cache remembering the last page written
> >>> -	 * to/read/looked up for header address. No synchronization;
> >>> -	 * could have to find the previous page if a nested write
> >>> -	 * occurred. Finding the right page is done by comparing the
> >>> -	 * dest_offset with the buf_page offsets.
> >>> -	 * When at the exact opposite of the buffer, bias towards forward search
> >>> -	 * because it will be cached.
> >>> -	 */
> >>> -
> >>> -	diff_offset = (ssize_t)offset - (ssize_t)page->offset;
> >>> -	if (diff_offset <= -(ssize_t)half_buf_size)
> >>> -		diff_offset += buf->chan->alloc_size;
> >>> -	else if (diff_offset > half_buf_size)
> >>> -		diff_offset -= buf->chan->alloc_size;
> >>> -
> >>> -	if (unlikely(diff_offset >= (ssize_t)PAGE_SIZE)) {
> >>> -		page = ltt_relay_find_next_page(buf, page, offset, diff_offset);
> >>> -		WARN_ON(!page);
> >>> -		*page_cache = page;
> >>> -	} else if (unlikely(diff_offset < 0)) {
> >>> -		page = ltt_relay_find_prev_page(buf, page, offset, diff_offset);
> >>> -		WARN_ON(!page);
> >>> -	}
> >>> -	return page;
> >>> -}
> >>> +EXPORT_SYMBOL_GPL(ltt_relay_find_next_page);
> >>>  
> >>>  /**
> >>>   * ltt_relay_write - write data to a ltt_relay buffer.
> >>> @@ -543,22 +507,17 @@ static struct buf_page *ltt_relay_cache_
> >>>   * @offset : offset within the buffer
> >>>   * @src : source address
> >>>   * @len : length to write
> >>> + * @page : cached buffer page
> >>>   */
> >>> -int ltt_relay_write(struct rchan_buf *buf, size_t offset,
> >>> -	const void *src, size_t len)
> >>> +void _ltt_relay_write(struct rchan_buf *buf, size_t offset,
> >>> +	const void *src, size_t len, struct buf_page *page)
> >>>  {
> >>> -	struct buf_page *page;
> >>> -	ssize_t pagecpy, orig_len;
> >>> +	ssize_t pagecpy;
> >>>  
> >>> -	orig_len = len;
> >>> -	offset &= buf->chan->alloc_size - 1;
> >>> -	page = buf->wpage;
> >>> -	if (unlikely(!len))
> >>> -		return 0;
> >>>  	for (;;) {
> >>>  		page = ltt_relay_cache_page(buf, &buf->wpage, page, offset);
> >>>  		pagecpy = min_t(size_t, len, PAGE_SIZE - (offset & ~PAGE_MASK));
> >>> -		memcpy(page_address(page->page)
> >>> +		ltt_relay_do_copy(page_address(page->page)
> >>>  			+ (offset & ~PAGE_MASK), src, pagecpy);
> > 
> > Hi Lai,
> > 
> > Happy Chinese New Year :)
> > 
> > Please see the corrected v2 of this patch. This v1 is broken.
> > 
> >> I think memcpy() is better than ltt_relay_do_copy() here.
> >> (offset & ~PAGE_MASK) is unlikely to be 1, 2, 4, 8, or 16 here.
> > 
> > ltt_relay_do_copy has the switch for 1, 2, 4, 8 for the "len" parameter,
> > which corresponds to "pagecpy", not (offset & ~PAGE_MASK).
> > 
> >>>  		len -= pagecpy;
> >>>  		if (likely(!len))
> >>> @@ -571,9 +530,8 @@ int ltt_relay_write(struct rchan_buf *bu
> >>>  		 */
> >>>  		WARN_ON(offset >= buf->chan->alloc_size);
> >>>  	}
> >>> -	return orig_len;
> >>>  }
> >>> -EXPORT_SYMBOL_GPL(ltt_relay_write);
> >>> +EXPORT_SYMBOL_GPL(_ltt_relay_write);
> >>>  
> >>>  /**
> >>>   * ltt_relay_read - read data from ltt_relay_buffer.
> >>> Index: linux-2.6-lttng/include/linux/ltt-relay.h
> >>> ===================================================================
> >>> --- linux-2.6-lttng.orig/include/linux/ltt-relay.h	2009-02-03 10:37:06.000000000 -0500
> >>> +++ linux-2.6-lttng/include/linux/ltt-relay.h	2009-02-03 10:37:13.000000000 -0500
> >>> @@ -140,8 +140,14 @@ struct rchan_callbacks {
> >>>  	int (*remove_buf_file)(struct dentry *dentry);
> >>>  };
> >>>  
> >>> -extern int ltt_relay_write(struct rchan_buf *buf, size_t offset,
> >>> -	const void *src, size_t len);
> >>> +extern struct buf_page *ltt_relay_find_prev_page(struct rchan_buf *buf,
> >>> +	struct buf_page *page, size_t offset, ssize_t diff_offset);
> >>> +
> >>> +extern struct buf_page *ltt_relay_find_next_page(struct rchan_buf *buf,
> >>> +	struct buf_page *page, size_t offset, ssize_t diff_offset);
> >>> +
> >>> +extern void _ltt_relay_write(struct rchan_buf *buf, size_t offset,
> >>> +	const void *src, size_t len, struct buf_page *page);
> >>>  
> >>>  extern int ltt_relay_read(struct rchan_buf *buf, size_t offset,
> >>>  	void *dest, size_t len);
> >>> @@ -159,6 +165,84 @@ extern void *ltt_relay_offset_address(st
> >>>  	size_t offset);
> >>>  
> >>>  /*
> >>> + * Find the page containing "offset". Cache it if it is after the currently
> >>> + * cached page.
> >>> + */
> >>> +static inline struct buf_page *ltt_relay_cache_page(struct rchan_buf *buf,
> >>> +		struct buf_page **page_cache,
> >>> +		struct buf_page *page, size_t offset)
> >>> +{
> >>> +	ssize_t diff_offset;
> >>> +	ssize_t half_buf_size = buf->chan->alloc_size >> 1;
> >>> +
> >>> +	/*
> >>> +	 * Make sure this is the page we want to write into. The current
> >>> +	 * page is changed concurrently by other writers. [wrh]page are
> >>> +	 * used as a cache remembering the last page written
> >>> +	 * to/read/looked up for header address. No synchronization;
> >>> +	 * could have to find the previous page if a nested write
> >>> +	 * occurred. Finding the right page is done by comparing the
> >>> +	 * dest_offset with the buf_page offsets.
> >>> +	 * When at the exact opposite of the buffer, bias towards forward search
> >>> +	 * because it will be cached.
> >>> +	 */
> >>> +
> >>> +	diff_offset = (ssize_t)offset - (ssize_t)page->offset;
> >>> +	if (diff_offset <= -(ssize_t)half_buf_size)
> >>> +		diff_offset += buf->chan->alloc_size;
> >>> +	else if (diff_offset > half_buf_size)
> >>> +		diff_offset -= buf->chan->alloc_size;
> >>> +
> >>> +	if (unlikely(diff_offset >= (ssize_t)PAGE_SIZE)) {
> >>> +		page = ltt_relay_find_next_page(buf, page, offset, diff_offset);
> >>> +		*page_cache = page;
> >>> +	} else if (unlikely(diff_offset < 0)) {
> >>> +		page = ltt_relay_find_prev_page(buf, page, offset, diff_offset);
> >>> +	}
> >>> +	return page;
> >>> +}
> >>> +
> >>> +static inline void ltt_relay_do_copy(void *dest, const void *src, size_t len)
> >>> +{
> >>> +	switch (len) {
> >>> +	case 1:	*(u8 *)dest = *(const u8 *)src;
> >>> +		break;
> >>> +	case 2:	*(u16 *)dest = *(const u16 *)src;
> >>> +		break;
> >>> +	case 4:	*(u32 *)dest = *(const u32 *)src;
> >>> +		break;
> >>> +#if (BITS_PER_LONG == 64)
> >>> +	case 8:	*(u64 *)dest = *(const u64 *)src;
> >>> +		break;
> >>> +#endif
> >>> +	default:
> >>> +		memcpy(dest, src, len);
> >>> +	}
> >>> +}
> >> I think this function is not correct when @src is not aligned for
> >> 2, 4, 8, or 16.
> >>
> > 
> > Hrm, interesting. So if we need to copy 4 chars, e.g.
> > 
> > char data[4]
> > 
> > Then there are no requirements for alignment within the data.
> > 
> > This is normally not a problem for architectures with
> > CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS, but could be a problem for
> > architectures with slow unaligned accesses.
> > 
> > What do you think of this proposal for ltt_relay_do_copy ?
> > 
> > #if CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> > static inline void ltt_relay_do_copy(void *dest, const void *src, size_t len)
> > {
> > 	switch (len) {
> > 	case 0:	break;
> > 	case 1:	*(u8 *)dest = *(const u8 *)src;
> > 		break;
> > 	case 2:	*(u16 *)dest = *(const u16 *)src;
> > 		break;
> > 	case 4:	*(u32 *)dest = *(const u32 *)src;
> > 		break;
> > #if (BITS_PER_LONG == 64)
> > 	case 8:	*(u64 *)dest = *(const u64 *)src;
> > 		break;
> > #endif
> Hello, Mathieu
> 
> Why not use the instructions generated by gcc instead of memcpy on archs without 64-bit writes, as in:
> case 4:	*(u32 *)dest = *(const u32 *)src;
> 	break;
> case 8:	*(u64 *)dest = *(const u64 *)src;
> 	break;
> 
> IMHO, even on an arch without 64-bit writes, memcpy is more complex.
> 

Hi Zhaolei,

Good Chinese New Year to you :)

Hrm, that's an interesting idea. I wonder how those two compare on
32-bit x86:


#include <inttypes.h>

char dest[100];
char src[100];

typedef uint64_t u64;
typedef uint32_t u32;

void gcc_u64(void)
{
        asm("/* begin */");
        *(u64 *)dest = *(const u64 *)src;
        asm("/* end */");
}


        movl    src, %eax
        movl    src+4, %edx
        movl    %eax, dest
        movl    %edx, dest+4


void twice_u32(void)
{
        asm("/* begin */");
        ((u32 *)dest)[0] = ((const u32 *)src)[0];
        ((u32 *)dest)[1] = ((const u32 *)src)[1];
        asm("/* end */");
}
        movl    src, %eax
        movl    %eax, dest
        movl    src+4, %eax
        movl    %eax, dest+4

gcc seems to do better register scheduling than my code, so I think
it's not so bad. I will take your proposal.

As a side note: a side effect of doing this is to have, on 32-bit
architectures without efficient unaligned access, the following test:

        case 8: if (unlikely(!addr_aligned(dest, src, 8)))
                        goto memcpy_fallback;
                *(u64 *)dest = *(const u64 *)src;

which would succeed if the address is aligned on 4 bytes but not on 8
on a 32-bit arch (an access aligned on the pointer size is considered
aligned even if the requested alignment is bigger; see ltt_align()).
Therefore, the compiler has to generate code able to write a u64 to a
u32-aligned address. Given this is a 32-bit architecture, I cannot
foresee how in the world the compiler could mess this up, so I think
it's OK.
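
For illustration only, here is a minimal sketch of an alignment check
capped at the pointer size, which is what lets the 4-byte-aligned case
pass the "8-byte" test on a 32-bit arch. The ptr_aligned() name and its
exact form are my own; the real check goes through ltt_align() and
addr_aligned() as in the code above.

#include <stddef.h>

/*
 * Consider an address aligned when it is aligned on
 * min(sizeof(void *), len). On a 32-bit arch, min(4, 8) == 4, so a
 * u32-aligned address passes the check for an 8-byte access and the
 * compiler simply emits two 32-bit loads/stores.
 */
static inline int ptr_aligned(const void *p, size_t len)
{
        size_t align = len < sizeof(void *) ? len : sizeof(void *);

        return ((size_t)p & (align - 1)) == 0;
}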


> > 	default:
> > 		memcpy(dest, src, len);
> > 	}
> > }
> > #else
> > /*
> >  * Returns whether the dest and src addresses are aligned on
> >  * min(sizeof(void *), len). Call this with statically known len for efficiency.
> >  */
> > static inline int addr_aligned(const void *dest, const void *src, size_t len)
> > {
> > 	if (ltt_align((size_t)dest, len))
> > 		return 0;
> > 	if (ltt_align((size_t)src, len))
> > 		return 0;
> > 	return 1;
> > }
> > 
> > static inline void ltt_relay_do_copy(void *dest, const void *src, size_t len)
> > {
> > 	switch (len) {
> > 	case 0:	break;
> > 	case 1:	*(u8 *)dest = *(const u8 *)src;
> > 		break;
> > 	case 2:	if (unlikely(!addr_aligned(dest, src, 2)))
> > 			goto memcpy_fallback;
> > 		*(u16 *)dest = *(const u16 *)src;
> > 		break;
> > 	case 4:	if (unlikely(!addr_aligned(dest, src, 4)))
> > 			goto memcpy_fallback;
> > 		*(u32 *)dest = *(const u32 *)src;
> > 		break;
> > #if (BITS_PER_LONG == 64)
> > 	case 8:	if (unlikely(!addr_aligned(dest, src, 8)))
> > 			goto memcpy_fallback;
> > 		*(u64 *)dest = *(const u64 *)src;
> > 		break;
> > #endif
> > 	default:
> > 		goto memcpy_fallback;
> > 	}
> > 	return;
> > memcpy_fallback:
> > 	memcpy(dest, src, len);
> It seems the code is more complex on archs without CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS...
> 
> Because len is at most 8 here, copying the bytes one by one may be more efficient than a complex memcpy:
> memcpy_fallback:
> 	for(;len>0; len--)
> 		*(u8 *)dest++ = *(const u8 *)src++;
> 

except for default:, which should still be a call to memcpy(). I will
also integrate your suggestion, thanks. Some benchmarking would be
welcome, though.
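
As a starting point, here is a rough userspace micro-benchmark sketch
(hypothetical, not part of the patch): the ITERS count, the fixed
7-byte length and the empty asm with a "memory" clobber (to keep gcc
from optimizing the copies away) are all my own choices. Build with
gcc -O2 (add -lrt on older glibc) and compare the two timings.

#define _POSIX_C_SOURCE 199309L

#include <stdio.h>
#include <string.h>
#include <time.h>

#define ITERS 100000000UL

static char dst[16], src[16];

static long long elapsed_ns(struct timespec a, struct timespec b)
{
        return (long long)(b.tv_sec - a.tv_sec) * 1000000000LL
                + (b.tv_nsec - a.tv_nsec);
}

int main(void)
{
        struct timespec t0, t1;
        unsigned long i;

        clock_gettime(CLOCK_MONOTONIC, &t0);
        for (i = 0; i < ITERS; i++) {
                char *d = dst;
                const char *s = src;
                size_t len;

                /* Byte-by-byte fallback, as suggested above. */
                for (len = 7; len > 0; len--)
                        *d++ = *s++;
                asm volatile("" : : : "memory");
        }
        clock_gettime(CLOCK_MONOTONIC, &t1);
        printf("byte loop: %lld ns\n", elapsed_ns(t0, t1));

        clock_gettime(CLOCK_MONOTONIC, &t0);
        for (i = 0; i < ITERS; i++) {
                memcpy(dst, src, 7);
                asm volatile("" : : : "memory");
        }
        clock_gettime(CLOCK_MONOTONIC, &t1);
        printf("memcpy   : %lld ns\n", elapsed_ns(t0, t1));

        return 0;
}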

Also, this assumes that

   dest++ and src++

are done on the void *. Given that gcc treats arithmetic on void * like
arithmetic on char * (increment by 1, an extension the kernel relies
on), this is OK.
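
For reference, a minimal sketch of the same fallback written with
explicit u8 * locals (my own naming, assuming u8 from linux/types.h),
so the increments do not depend on the gcc void *-arithmetic extension:

static inline void ltt_relay_bytewise_copy(void *dest, const void *src,
        size_t len)
{
        u8 *d = dest;
        const u8 *s = src;

        /* Copy one byte at a time; len is small on this path. */
        for (; len > 0; len--)
                *d++ = *s++;
}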

Mathieu

Here is the resulting patch:


lttng-optimize-write-to-page-function-remove-some-memcpy-calls

Zhaolei :
> Hello, Mathieu
> 
> Why not use the instructions generated by gcc instead of memcpy on archs without
> 64-bit writes, as in:
> case 4: *(u32 *)dest = *(const u32 *)src;
>   break;
> case 8: *(u64 *)dest = *(const u64 *)src;
>   break;
> 
> IMHO, even on an arch without 64-bit writes, memcpy is more complex.

#include <inttypes.h>

char dest[100];
char src[100];

typedef uint64_t u64;
typedef uint32_t u32;

void gcc_u64(void)
{
        asm("/* begin */");
        *(u64 *)dest = *(const u64 *)src;
        asm("/* end */");
}


        movl    src, %eax
        movl    src+4, %edx
        movl    %eax, dest
        movl    %edx, dest+4


void twice_u32(void)
{
        asm("/* begin */");
        ((u32 *)dest)[0] = ((const u32 *)src)[0];
        ((u32 *)dest)[1] = ((const u32 *)src)[1];
        asm("/* end */");
}
        movl    src, %eax
        movl    %eax, dest
        movl    src+4, %eax
        movl    %eax, dest+4

gcc seems to do better register scheduling than my code, so I think
it's not so bad. I will take your proposal.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers at polymtl.ca>
CC: Zhaolei <zhaolei at cn.fujitsu.com>
---
 include/linux/ltt-relay.h |    9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

Index: linux-2.6-lttng/include/linux/ltt-relay.h
===================================================================
--- linux-2.6-lttng.orig/include/linux/ltt-relay.h	2009-02-03 23:21:16.000000000 -0500
+++ linux-2.6-lttng/include/linux/ltt-relay.h	2009-02-03 23:29:17.000000000 -0500
@@ -214,10 +214,8 @@ static inline void ltt_relay_do_copy(voi
 		break;
 	case 4:	*(u32 *)dest = *(const u32 *)src;
 		break;
-#if (BITS_PER_LONG == 64)
 	case 8:	*(u64 *)dest = *(const u64 *)src;
 		break;
-#endif
 	default:
 		memcpy(dest, src, len);
 	}
@@ -250,18 +248,17 @@ static inline void ltt_relay_do_copy(voi
 			goto memcpy_fallback;
 		*(u32 *)dest = *(const u32 *)src;
 		break;
-#if (BITS_PER_LONG == 64)
 	case 8:	if (unlikely(!addr_aligned(dest, src, 8)))
 			goto memcpy_fallback;
 		*(u64 *)dest = *(const u64 *)src;
 		break;
-#endif
 	default:
-		goto memcpy_fallback;
+		memcpy(dest, src, len);
 	}
 	return;
 memcpy_fallback:
-	memcpy(dest, src, len);
+	for(; len > 0; len--)
+		*(u8 *)dest++ = *(const u8 *)src++;
 }
 #endif
 

-- 
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68



