[ltt-dev] [PATCH] LTTng optimize write to page function
Mathieu Desnoyers
compudj at krystal.dyndns.org
Tue Feb 3 23:53:04 EST 2009
* Zhaolei (zhaolei at cn.fujitsu.com) wrote:
> Mathieu Desnoyers Wrote:
> > * Lai Jiangshan (laijs at cn.fujitsu.com) wrote:
> >> Mathieu Desnoyers wrote:
> >>> The functions in ltt-relay-alloc.c take care of writing the data into
> >>> the buffer pages. Those pages are allocated from the page allocator and
> >>> no virtual mapping is done so we can save precious TLB entries.
> >>> ltt-relay-alloc.c is the abstraction layer which makes the buffers
> >>> "look" like a contiguous memory area, although they are made from
> >>> physically discontiguous pages linked with a linked list. A caching
> >>> mechanism makes sure we never walk over more than 1-2 entries of the
> >>> list. We use a linked list rather than a table to make sure we don't
> >>> depend on vmalloc to allocate large pointer arrays.
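To make the above concrete, here is a minimal sketch of the page descriptor;
the field names are illustrative, see ltt-relay.h for the real definition.
buf->wpage caches the last page written to, so the list is almost never walked:

struct buf_page {
	struct page *page;	/* backing page from the page allocator */
	size_t offset;		/* offset of this page within the buffer */
	struct list_head list;	/* circular list linking the buffer pages */
};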
> >>>
> >>> I did a bit of profiling with oprofile on LTTng and found out that write
> >>> functions in ltt-relay-alloc.c were taking a lot of CPU time. I thought it would
> >>> be good to improve them a bit.
> >>>
> >>> Running a 2.6.29-rc3 kernel
> >>>
> >>> Compiling a 2.6.25 kernel using make -j10 on an 8-core x86_64 with a vanilla
> >>> 2.6.29-rc3 kernel (all tests are cache-hot) :
> >>> real 1m22.103s
> >>>
> >>> With dormant instrumentation
> >>> real 1m24.667s
> >>> (note : this 2s regression should be identified eventually by doing a bisection
> >>> of the LTTng tree.)
> >>>
> >>> ltt-armall
> >>>
> >>> Without modification, with flight recorder tracing active :
> >>> real 1m31.135s
> >>>
> >>> Replacing the memcpy call with a specialized call for 1, 2, 4 and 8 bytes :
> >>> real 1m30.440s
> >>>
> >>> Inlining the fast path of the write function :
> >>> real 1m29.614s
> >>>
> >>> Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers at polymtl.ca>
> >>> CC: Martin Bligh <mbligh at google.com>
> >>> CC: Zhaolei <zhaolei at cn.fujitsu.com>
> >>> ---
> >>> include/linux/ltt-relay.h | 88 ++++++++++++++++++++++++++++++++++++++++++++--
> >>> ltt/ltt-relay-alloc.c | 66 ++++++----------------------------
> >>> 2 files changed, 98 insertions(+), 56 deletions(-)
> >>>
> >>> Index: linux-2.6-lttng/ltt/ltt-relay-alloc.c
> >>> ===================================================================
> >>> --- linux-2.6-lttng.orig/ltt/ltt-relay-alloc.c 2009-02-03 10:37:05.000000000 -0500
> >>> +++ linux-2.6-lttng/ltt/ltt-relay-alloc.c 2009-02-03 10:37:13.000000000 -0500
> >>> @@ -424,7 +424,7 @@ EXPORT_SYMBOL_GPL(ltt_relay_close);
> >>> /*
> >>> * Start iteration at the previous element. Skip the real list head.
> >>> */
> >>> -static struct buf_page *ltt_relay_find_prev_page(struct rchan_buf *buf,
> >>> +struct buf_page *ltt_relay_find_prev_page(struct rchan_buf *buf,
> >>> struct buf_page *page, size_t offset, ssize_t diff_offset)
> >>> {
> >>> struct buf_page *iter;
> >>> @@ -456,13 +456,15 @@ static struct buf_page *ltt_relay_find_p
> >>> return iter;
> >>> }
> >>> }
> >>> + WARN_ON(1);
> >>> return NULL;
> >>> }
> >>> +EXPORT_SYMBOL_GPL(ltt_relay_find_prev_page);
> >>>
> >>> /*
> >>> * Start iteration at the next element. Skip the real list head.
> >>> */
> >>> -static struct buf_page *ltt_relay_find_next_page(struct rchan_buf *buf,
> >>> +struct buf_page *ltt_relay_find_next_page(struct rchan_buf *buf,
> >>> struct buf_page *page, size_t offset, ssize_t diff_offset)
> >>> {
> >>> struct buf_page *iter;
> >>> @@ -494,48 +496,10 @@ static struct buf_page *ltt_relay_find_n
> >>> return iter;
> >>> }
> >>> }
> >>> + WARN_ON(1);
> >>> return NULL;
> >>> }
> >>> -
> >>> -/*
> >>> - * Find the page containing "offset". Cache it if it is after the currently
> >>> - * cached page.
> >>> - */
> >>> -static struct buf_page *ltt_relay_cache_page(struct rchan_buf *buf,
> >>> - struct buf_page **page_cache,
> >>> - struct buf_page *page, size_t offset)
> >>> -{
> >>> - ssize_t diff_offset;
> >>> - ssize_t half_buf_size = buf->chan->alloc_size >> 1;
> >>> -
> >>> - /*
> >>> - * Make sure this is the page we want to write into. The current
> >>> - * page is changed concurrently by other writers. [wrh]page are
> >>> - * used as a cache remembering the last page written
> >>> - * to/read/looked up for header address. No synchronization;
> >>> - * could have to find the previous page is a nested write
> >>> - * occured. Finding the right page is done by comparing the
> >>> - * dest_offset with the buf_page offsets.
> >>> - * When at the exact opposite of the buffer, bias towards forward search
> >>> - * because it will be cached.
> >>> - */
> >>> -
> >>> - diff_offset = (ssize_t)offset - (ssize_t)page->offset;
> >>> - if (diff_offset <= -(ssize_t)half_buf_size)
> >>> - diff_offset += buf->chan->alloc_size;
> >>> - else if (diff_offset > half_buf_size)
> >>> - diff_offset -= buf->chan->alloc_size;
> >>> -
> >>> - if (unlikely(diff_offset >= (ssize_t)PAGE_SIZE)) {
> >>> - page = ltt_relay_find_next_page(buf, page, offset, diff_offset);
> >>> - WARN_ON(!page);
> >>> - *page_cache = page;
> >>> - } else if (unlikely(diff_offset < 0)) {
> >>> - page = ltt_relay_find_prev_page(buf, page, offset, diff_offset);
> >>> - WARN_ON(!page);
> >>> - }
> >>> - return page;
> >>> -}
> >>> +EXPORT_SYMBOL_GPL(ltt_relay_find_next_page);
> >>>
> >>> /**
> >>> * ltt_relay_write - write data to a ltt_relay buffer.
> >>> @@ -543,22 +507,17 @@ static struct buf_page *ltt_relay_cache_
> >>> * @offset : offset within the buffer
> >>> * @src : source address
> >>> * @len : length to write
> >>> + * @page : cached buffer page
> >>> */
> >>> -int ltt_relay_write(struct rchan_buf *buf, size_t offset,
> >>> - const void *src, size_t len)
> >>> +void _ltt_relay_write(struct rchan_buf *buf, size_t offset,
> >>> + const void *src, size_t len, struct buf_page *page)
> >>> {
> >>> - struct buf_page *page;
> >>> - ssize_t pagecpy, orig_len;
> >>> + ssize_t pagecpy;
> >>>
> >>> - orig_len = len;
> >>> - offset &= buf->chan->alloc_size - 1;
> >>> - page = buf->wpage;
> >>> - if (unlikely(!len))
> >>> - return 0;
> >>> for (;;) {
> >>> page = ltt_relay_cache_page(buf, &buf->wpage, page, offset);
> >>> pagecpy = min_t(size_t, len, PAGE_SIZE - (offset & ~PAGE_MASK));
> >>> - memcpy(page_address(page->page)
> >>> + ltt_relay_do_copy(page_address(page->page)
> >>> + (offset & ~PAGE_MASK), src, pagecpy);
> >
> > Hi Lai,
> >
> > Happy Chinese New Year :)
> >
> > Please see the corrected v2 of this patch. This v1 is broken.
> >
> >> I think memcpy() is better than ltt_relay_do_copy() here.
> >> (offset & ~PAGE_MASK) is unlikely to be 1, 2, 4, 8, or 16 here.
> >
> > ltt_relay_do_copy has the switch for 1, 2, 4, 8 for the "len" parameter,
> > which corresponds to "pagecpy", not (offset & ~PAGE_MASK).
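Concretely, for a small event that fits within the current page, pagecpy ends
up equal to len, so the specialized 1/2/4/8 cases are taken. Assuming 4k
pages:

	pagecpy = min_t(size_t, len, PAGE_SIZE - (offset & ~PAGE_MASK));
	/* e.g. len = 4, offset & ~PAGE_MASK = 16 -> pagecpy = min(4, 4080) = 4 */
	ltt_relay_do_copy(page_address(page->page)
		+ (offset & ~PAGE_MASK), src, pagecpy);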
> >
> >>> len -= pagecpy;
> >>> if (likely(!len))
> >>> @@ -571,9 +530,8 @@ int ltt_relay_write(struct rchan_buf *bu
> >>> */
> >>> WARN_ON(offset >= buf->chan->alloc_size);
> >>> }
> >>> - return orig_len;
> >>> }
> >>> -EXPORT_SYMBOL_GPL(ltt_relay_write);
> >>> +EXPORT_SYMBOL_GPL(_ltt_relay_write);
> >>>
> >>> /**
> >>> * ltt_relay_read - read data from ltt_relay_buffer.
> >>> Index: linux-2.6-lttng/include/linux/ltt-relay.h
> >>> ===================================================================
> >>> --- linux-2.6-lttng.orig/include/linux/ltt-relay.h 2009-02-03 10:37:06.000000000 -0500
> >>> +++ linux-2.6-lttng/include/linux/ltt-relay.h 2009-02-03 10:37:13.000000000 -0500
> >>> @@ -140,8 +140,14 @@ struct rchan_callbacks {
> >>> int (*remove_buf_file)(struct dentry *dentry);
> >>> };
> >>>
> >>> -extern int ltt_relay_write(struct rchan_buf *buf, size_t offset,
> >>> - const void *src, size_t len);
> >>> +extern struct buf_page *ltt_relay_find_prev_page(struct rchan_buf *buf,
> >>> + struct buf_page *page, size_t offset, ssize_t diff_offset);
> >>> +
> >>> +extern struct buf_page *ltt_relay_find_next_page(struct rchan_buf *buf,
> >>> + struct buf_page *page, size_t offset, ssize_t diff_offset);
> >>> +
> >>> +extern void _ltt_relay_write(struct rchan_buf *buf, size_t offset,
> >>> + const void *src, size_t len, struct buf_page *page);
> >>>
> >>> extern int ltt_relay_read(struct rchan_buf *buf, size_t offset,
> >>> void *dest, size_t len);
> >>> @@ -159,6 +165,84 @@ extern void *ltt_relay_offset_address(st
> >>> size_t offset);
> >>>
> >>> /*
> >>> + * Find the page containing "offset". Cache it if it is after the currently
> >>> + * cached page.
> >>> + */
> >>> +static inline struct buf_page *ltt_relay_cache_page(struct rchan_buf *buf,
> >>> + struct buf_page **page_cache,
> >>> + struct buf_page *page, size_t offset)
> >>> +{
> >>> + ssize_t diff_offset;
> >>> + ssize_t half_buf_size = buf->chan->alloc_size >> 1;
> >>> +
> >>> + /*
> >>> + * Make sure this is the page we want to write into. The current
> >>> + * page is changed concurrently by other writers. [wrh]page are
> >>> + * used as a cache remembering the last page written
> >>> + * to/read/looked up for header address. No synchronization;
> >>> + * could have to find the previous page is a nested write
> >>> + * occured. Finding the right page is done by comparing the
> >>> + * dest_offset with the buf_page offsets.
> >>> + * When at the exact opposite of the buffer, bias towards forward search
> >>> + * because it will be cached.
> >>> + */
> >>> +
> >>> + diff_offset = (ssize_t)offset - (ssize_t)page->offset;
> >>> + if (diff_offset <= -(ssize_t)half_buf_size)
> >>> + diff_offset += buf->chan->alloc_size;
> >>> + else if (diff_offset > half_buf_size)
> >>> + diff_offset -= buf->chan->alloc_size;
> >>> +
> >>> + if (unlikely(diff_offset >= (ssize_t)PAGE_SIZE)) {
> >>> + page = ltt_relay_find_next_page(buf, page, offset, diff_offset);
> >>> + *page_cache = page;
> >>> + } else if (unlikely(diff_offset < 0)) {
> >>> + page = ltt_relay_find_prev_page(buf, page, offset, diff_offset);
> >>> + }
> >>> + return page;
> >>> +}
> >>> +
> >>> +static inline void ltt_relay_do_copy(void *dest, const void *src, size_t len)
> >>> +{
> >>> + switch (len) {
> >>> + case 1: *(u8 *)dest = *(const u8 *)src;
> >>> + break;
> >>> + case 2: *(u16 *)dest = *(const u16 *)src;
> >>> + break;
> >>> + case 4: *(u32 *)dest = *(const u32 *)src;
> >>> + break;
> >>> +#if (BITS_PER_LONG == 64)
> >>> + case 8: *(u64 *)dest = *(const u64 *)src;
> >>> + break;
> >>> +#endif
> >>> + default:
> >>> + memcpy(dest, src, len);
> >>> + }
> >>> +}
> >> I think this function is not correct when @src is not aligned on
> >> 2, 4, 8, or 16.
> >>
> >
> > Hrm, interesting. So if we need to copy 4 chars, e.g.
> >
> > char data[4]
> >
> > Then there are no requirements for alignment within the data.
> >
> > This is normally not a problem for architectures with
> > CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS, but could be a problem for
> > architectures with slow unaligned accesses.
> >
> > What do you think of this proposal for ltt_relay_do_copy ?
> >
> > #if CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> > static inline void ltt_relay_do_copy(void *dest, const void *src, size_t len)
> > {
> > switch (len) {
> > case 0: break;
> > case 1: *(u8 *)dest = *(const u8 *)src;
> > break;
> > case 2: *(u16 *)dest = *(const u16 *)src;
> > break;
> > case 4: *(u32 *)dest = *(const u32 *)src;
> > break;
> > #if (BITS_PER_LONG == 64)
> > case 8: *(u64 *)dest = *(const u64 *)src;
> > break;
> > #endif
> Hello, Mathieu
>
> Why not use the instructions generated by gcc instead of memcpy on archs without 64-bit writes, as in:
> case 4: *(u32 *)dest = *(const u32 *)src;
> break;
> case 8: *(u64 *)dest = *(const u64 *)src;
> break;
>
> IMHO, even on an arch without 64-bit writes, memcpy is more complex.
>
Hi Zhaolei,
Happy Chinese New Year to you :)

Hrm, that's an interesting idea. I wonder how those two compare on
32-bit x86:
#include <inttypes.h>

char dest[100];
char src[100];

typedef uint64_t u64;
typedef uint32_t u32;

void gcc_u64(void)
{
	asm("/* begin */");
	*(u64 *)dest = *(const u64 *)src;
	asm("/* end */");
}

generates:

	movl	src, %eax
	movl	src+4, %edx
	movl	%eax, dest
	movl	%edx, dest+4

void twice_u32(void)
{
	asm("/* begin */");
	((u32 *)dest)[0] = ((const u32 *)src)[0];
	((u32 *)dest)[1] = ((const u32 *)src)[1];
	asm("/* end */");
}

generates:

	movl	src, %eax
	movl	%eax, dest
	movl	src+4, %eax
	movl	%eax, dest+4
gcc seems to do better register scheduling than my code, so I think
it's not so bad. I will take your proposal.
As a side-note: a side-effect of doing this is to have, on 32-bit
architectures without efficient unaligned access, the following test:

case 8: if (unlikely(!addr_aligned(dest, src, 8)))
		goto memcpy_fallback;
	*(u64 *)dest = *(const u64 *)src;

which would succeed if the address is aligned on 4 bytes but not on 8
(an access aligned on the pointer size is considered aligned even if
the requested alignment amount is bigger, see ltt_align()). Therefore,
the compiler would have to generate code able to write a u64 to a
u32-aligned address. Given this is on a 32-bit architecture, I cannot
foresee how in the world the compiler could mess this up, so I think
it's ok.
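For reference, the semantics assumed here for ltt_align() are roughly the
following sketch (not a verbatim copy of the tree): it returns the padding
needed to align on min(sizeof(void *), size_of_type), so 0 means "aligned
enough":

static inline unsigned int ltt_align(size_t align_drift, size_t size_of_type)
{
	size_t alignment = min(sizeof(void *), size_of_type);

	return (alignment - align_drift) & (alignment - 1);
}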
> > default:
> > memcpy(dest, src, len);
> > }
> > }
> > #else
> > /*
> > * Returns whether the dest and src addresses are aligned on
> > * min(sizeof(void *), len). Call this with statically known len for efficiency.
> > */
> > static inline int addr_aligned(const void *dest, const void *src, size_t len)
> > {
> > if (ltt_align((size_t)dest, len))
> > return 0;
> > if (ltt_align((size_t)src, len))
> > return 0;
> > return 1;
> > }
> >
> > static inline void ltt_relay_do_copy(void *dest, const void *src, size_t len)
> > {
> > switch (len) {
> > case 0: break;
> > case 1: *(u8 *)dest = *(const u8 *)src;
> > break;
> > case 2: if (unlikely(!addr_aligned(dest, src, 2)))
> > goto memcpy_fallback;
> > *(u16 *)dest = *(const u16 *)src;
> > break;
> > case 4: if (unlikely(!addr_aligned(dest, src, 4)))
> > goto memcpy_fallback;
> > *(u32 *)dest = *(const u32 *)src;
> > break;
> > #if (BITS_PER_LONG == 64)
> > case 8: if (unlikely(!addr_aligned(dest, src, 8)))
> > goto memcpy_fallback;
> > *(u64 *)dest = *(const u64 *)src;
> > break;
> > #endif
> > default:
> > goto memcpy_fallback;
> > }
> > return;
> > memcpy_fallback:
> > memcpy(dest, src, len);
> It seems the code is more complex on an arch without CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS...
>
> Because len is less than 8 here, copying the bytes one by one may be more effective than a complex memcpy:
> memcpy_fallback:
> 	for (; len > 0; len--)
> 		*(u8 *)dest++ = *(const u8 *)src++;
>
Except for default:, which should still be a call to memcpy(). I will
also integrate your suggestion, thanks. Some benchmarking would be
welcome, though.
Also, this assumes that dest++ and src++ are done on the void *. Given
that gcc treats an increment on a void * as an increment on a char *
(+1), this is ok.
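If we ever want to avoid arithmetic on void * altogether, the fallback can be
written with explicit u8 cursors instead, e.g. (sketch):

memcpy_fallback:
	{
		u8 *d = dest;
		const u8 *s = src;

		for (; len > 0; len--)
			*d++ = *s++;
	}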
Mathieu
Here is the resulting patch :
lttng-optimize-write-to-page-function-remove-some-memcpy-calls
Zhaolei :
> Hello, Mathieu
>
> Why not use the instructions generated by gcc instead of memcpy on archs
> without 64-bit writes, as in:
> case 4: *(u32 *)dest = *(const u32 *)src;
> break;
> case 8: *(u64 *)dest = *(const u64 *)src;
> break;
>
> IMHO, even on an arch without 64-bit writes, memcpy is more complex.
#include <inttypes.h>

char dest[100];
char src[100];

typedef uint64_t u64;
typedef uint32_t u32;

void gcc_u64(void)
{
	asm("/* begin */");
	*(u64 *)dest = *(const u64 *)src;
	asm("/* end */");
}

generates:

	movl	src, %eax
	movl	src+4, %edx
	movl	%eax, dest
	movl	%edx, dest+4

void twice_u32(void)
{
	asm("/* begin */");
	((u32 *)dest)[0] = ((const u32 *)src)[0];
	((u32 *)dest)[1] = ((const u32 *)src)[1];
	asm("/* end */");
}

generates:

	movl	src, %eax
	movl	%eax, dest
	movl	src+4, %eax
	movl	%eax, dest+4
gcc seems to do better register scheduling than my code, so I think
it's not so bad. I will take your proposal.
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers at polymtl.ca>
CC: Zhaolei <zhaolei at cn.fujitsu.com>
---
include/linux/ltt-relay.h | 9 +++------
1 file changed, 3 insertions(+), 6 deletions(-)
Index: linux-2.6-lttng/include/linux/ltt-relay.h
===================================================================
--- linux-2.6-lttng.orig/include/linux/ltt-relay.h 2009-02-03 23:21:16.000000000 -0500
+++ linux-2.6-lttng/include/linux/ltt-relay.h 2009-02-03 23:29:17.000000000 -0500
@@ -214,10 +214,8 @@ static inline void ltt_relay_do_copy(voi
break;
case 4: *(u32 *)dest = *(const u32 *)src;
break;
-#if (BITS_PER_LONG == 64)
case 8: *(u64 *)dest = *(const u64 *)src;
break;
-#endif
default:
memcpy(dest, src, len);
}
@@ -250,18 +248,17 @@ static inline void ltt_relay_do_copy(voi
goto memcpy_fallback;
*(u32 *)dest = *(const u32 *)src;
break;
-#if (BITS_PER_LONG == 64)
case 8: if (unlikely(!addr_aligned(dest, src, 8)))
goto memcpy_fallback;
*(u64 *)dest = *(const u64 *)src;
break;
-#endif
default:
- goto memcpy_fallback;
+ memcpy(dest, src, len);
}
return;
memcpy_fallback:
- memcpy(dest, src, len);
+ for(; len > 0; len--)
+ *(u8 *)dest++ = *(const u8 *)src++;
}
#endif
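
For reference, after this patch the slow-unaligned-access variant of
ltt_relay_do_copy() should read roughly as follows (reconstructed from the
hunks above, as a sketch rather than the exact tree contents):

static inline void ltt_relay_do_copy(void *dest, const void *src, size_t len)
{
	switch (len) {
	case 0:	break;
	case 1:	*(u8 *)dest = *(const u8 *)src;
		break;
	case 2:	if (unlikely(!addr_aligned(dest, src, 2)))
			goto memcpy_fallback;
		*(u16 *)dest = *(const u16 *)src;
		break;
	case 4:	if (unlikely(!addr_aligned(dest, src, 4)))
			goto memcpy_fallback;
		*(u32 *)dest = *(const u32 *)src;
		break;
	case 8:	if (unlikely(!addr_aligned(dest, src, 8)))
			goto memcpy_fallback;
		*(u64 *)dest = *(const u64 *)src;
		break;
	default:
		memcpy(dest, src, len);
	}
	return;
memcpy_fallback:
	for (; len > 0; len--)
		*(u8 *)dest++ = *(const u8 *)src++;
}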
--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68