[ltt-dev] [PATCH] LTTng optimize write to page function
Zhaolei
zhaolei at cn.fujitsu.com
Tue Feb 3 23:07:55 EST 2009
Mathieu Desnoyers wrote:
> * Lai Jiangshan (laijs at cn.fujitsu.com) wrote:
>> Mathieu Desnoyers wrote:
>>> The functions in ltt-relay-alloc.c take care of writing the data into
>>> the buffer pages. Those pages are allocated from the page allocator and
>>> no virtual mapping is done so we can save precious TLB entries.
>>> ltt-relay-alloc.c is the abstraction layer which makes the buffers
>>> "look" like a contiguous memory area, although they are made from
>>> physically discontiguous pages linked with a linked list. A caching
>>> mechanism makes sure we never walk over more than 1-2 entries of the
>>> list. We use a linked list rather than a table to make sure we don't
>>> depend on vmalloc to allocate large pointer arrays.
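>>>
>>> For reference, a rough sketch of the page-list node this describes (the
>>> real definition lives in ltt-relay.h; the exact field layout here is an
>>> assumption):
>>>
>>> struct buf_page {
>>>         struct page *page;      /* page from the page allocator */
>>>         size_t offset;          /* offset of this page within the buffer */
>>>         struct list_head list;  /* circular list of buffer pages */
>>> };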
>>>
>>> I did a bit of profiling with oprofile on LTTng and found out that the
>>> write functions in ltt-relay-alloc.c were taking a lot of CPU time. I
>>> thought it would be good to improve them a bit.
>>>
>>> Running a 2.6.29-rc3 kernel
>>>
>>> Compiling a 2.6.25 kernel using make -j10 on an 8-core x86_64 with a vanilla
>>> 2.6.29-rc3 kernel (all tests are cache-hot):
>>> real 1m22.103s
>>>
>>> With dormant instrumentation
>>> real 1m24.667s
>>> (note: this 2s regression should eventually be identified by doing a
>>> bisection of the LTTng tree.)
>>>
>>> ltt-armall
>>>
>>> Without modification, with flight recorder tracing active:
>>> real 1m31.135s
>>>
>>> Replacing the memcpy call with a specialized call for 1, 2, 4 and 8 bytes:
>>> real 1m30.440s
>>>
>>> Inlining the fast path of the write function:
>>> real 1m29.614s
>>>
>>> Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers at polymtl.ca>
>>> CC: Martin Bligh <mbligh at google.com>
>>> CC: Zhaolei <zhaolei at cn.fujitsu.com>
>>> ---
>>> include/linux/ltt-relay.h | 88 ++++++++++++++++++++++++++++++++++++++++++++--
>>> ltt/ltt-relay-alloc.c | 66 ++++++----------------------------
>>> 2 files changed, 98 insertions(+), 56 deletions(-)
>>>
>>> Index: linux-2.6-lttng/ltt/ltt-relay-alloc.c
>>> ===================================================================
>>> --- linux-2.6-lttng.orig/ltt/ltt-relay-alloc.c 2009-02-03 10:37:05.000000000 -0500
>>> +++ linux-2.6-lttng/ltt/ltt-relay-alloc.c 2009-02-03 10:37:13.000000000 -0500
>>> @@ -424,7 +424,7 @@ EXPORT_SYMBOL_GPL(ltt_relay_close);
>>> /*
>>> * Start iteration at the previous element. Skip the real list head.
>>> */
>>> -static struct buf_page *ltt_relay_find_prev_page(struct rchan_buf *buf,
>>> +struct buf_page *ltt_relay_find_prev_page(struct rchan_buf *buf,
>>> struct buf_page *page, size_t offset, ssize_t diff_offset)
>>> {
>>> struct buf_page *iter;
>>> @@ -456,13 +456,15 @@ static struct buf_page *ltt_relay_find_p
>>> return iter;
>>> }
>>> }
>>> + WARN_ON(1);
>>> return NULL;
>>> }
>>> +EXPORT_SYMBOL_GPL(ltt_relay_find_prev_page);
>>>
>>> /*
>>> * Start iteration at the next element. Skip the real list head.
>>> */
>>> -static struct buf_page *ltt_relay_find_next_page(struct rchan_buf *buf,
>>> +struct buf_page *ltt_relay_find_next_page(struct rchan_buf *buf,
>>> struct buf_page *page, size_t offset, ssize_t diff_offset)
>>> {
>>> struct buf_page *iter;
>>> @@ -494,48 +496,10 @@ static struct buf_page *ltt_relay_find_n
>>> return iter;
>>> }
>>> }
>>> + WARN_ON(1);
>>> return NULL;
>>> }
>>> -
>>> -/*
>>> - * Find the page containing "offset". Cache it if it is after the currently
>>> - * cached page.
>>> - */
>>> -static struct buf_page *ltt_relay_cache_page(struct rchan_buf *buf,
>>> - struct buf_page **page_cache,
>>> - struct buf_page *page, size_t offset)
>>> -{
>>> - ssize_t diff_offset;
>>> - ssize_t half_buf_size = buf->chan->alloc_size >> 1;
>>> -
>>> - /*
>>> - * Make sure this is the page we want to write into. The current
>>> - * page is changed concurrently by other writers. [wrh]page are
>>> - * used as a cache remembering the last page written
>>> - * to/read/looked up for header address. No synchronization;
>>> - * could have to find the previous page if a nested write
>>> - * occurred. Finding the right page is done by comparing the
>>> - * dest_offset with the buf_page offsets.
>>> - * When at the exact opposite of the buffer, bias towards forward search
>>> - * because it will be cached.
>>> - */
>>> -
>>> - diff_offset = (ssize_t)offset - (ssize_t)page->offset;
>>> - if (diff_offset <= -(ssize_t)half_buf_size)
>>> - diff_offset += buf->chan->alloc_size;
>>> - else if (diff_offset > half_buf_size)
>>> - diff_offset -= buf->chan->alloc_size;
>>> -
>>> - if (unlikely(diff_offset >= (ssize_t)PAGE_SIZE)) {
>>> - page = ltt_relay_find_next_page(buf, page, offset, diff_offset);
>>> - WARN_ON(!page);
>>> - *page_cache = page;
>>> - } else if (unlikely(diff_offset < 0)) {
>>> - page = ltt_relay_find_prev_page(buf, page, offset, diff_offset);
>>> - WARN_ON(!page);
>>> - }
>>> - return page;
>>> -}
>>> +EXPORT_SYMBOL_GPL(ltt_relay_find_next_page);
>>>
>>> /**
>>> * ltt_relay_write - write data to a ltt_relay buffer.
>>> @@ -543,22 +507,17 @@ static struct buf_page *ltt_relay_cache_
>>> * @offset : offset within the buffer
>>> * @src : source address
>>> * @len : length to write
>>> + * @page : cached buffer page
>>> */
>>> -int ltt_relay_write(struct rchan_buf *buf, size_t offset,
>>> - const void *src, size_t len)
>>> +void _ltt_relay_write(struct rchan_buf *buf, size_t offset,
>>> + const void *src, size_t len, struct buf_page *page)
>>> {
>>> - struct buf_page *page;
>>> - ssize_t pagecpy, orig_len;
>>> + ssize_t pagecpy;
>>>
>>> - orig_len = len;
>>> - offset &= buf->chan->alloc_size - 1;
>>> - page = buf->wpage;
>>> - if (unlikely(!len))
>>> - return 0;
>>> for (;;) {
>>> page = ltt_relay_cache_page(buf, &buf->wpage, page, offset);
>>> pagecpy = min_t(size_t, len, PAGE_SIZE - (offset & ~PAGE_MASK));
>>> - memcpy(page_address(page->page)
>>> + ltt_relay_do_copy(page_address(page->page)
>>> + (offset & ~PAGE_MASK), src, pagecpy);
>
> Hi Lai,
>
> Happy Chinese New Year :)
>
> Please see the corrected v2 of this patch. This v1 is broken.
>
>> I think memcpy() is better than ltt_relay_do_copy() here;
>> (offset & ~PAGE_MASK) is unlikely to be 1, 2, 4, 8, or 16.
>
> ltt_relay_do_copy has the switch for 1, 2, 4, 8 for the "len" parameter,
> which corresponds to "pagecpy", not (offset & ~PAGE_MASK).
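>
> For example, writing a 4-byte field at buffer offset 0x1005 gives
> pagecpy = min(4, PAGE_SIZE - 0x005) = 4, so the "case 4" branch is
> taken even though (offset & ~PAGE_MASK) = 0x005.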
>
>>> len -= pagecpy;
>>> if (likely(!len))
>>> @@ -571,9 +530,8 @@ int ltt_relay_write(struct rchan_buf *bu
>>> */
>>> WARN_ON(offset >= buf->chan->alloc_size);
>>> }
>>> - return orig_len;
>>> }
>>> -EXPORT_SYMBOL_GPL(ltt_relay_write);
>>> +EXPORT_SYMBOL_GPL(_ltt_relay_write);
>>>
>>> /**
>>> * ltt_relay_read - read data from ltt_relay_buffer.
>>> Index: linux-2.6-lttng/include/linux/ltt-relay.h
>>> ===================================================================
>>> --- linux-2.6-lttng.orig/include/linux/ltt-relay.h 2009-02-03 10:37:06.000000000 -0500
>>> +++ linux-2.6-lttng/include/linux/ltt-relay.h 2009-02-03 10:37:13.000000000 -0500
>>> @@ -140,8 +140,14 @@ struct rchan_callbacks {
>>> int (*remove_buf_file)(struct dentry *dentry);
>>> };
>>>
>>> -extern int ltt_relay_write(struct rchan_buf *buf, size_t offset,
>>> - const void *src, size_t len);
>>> +extern struct buf_page *ltt_relay_find_prev_page(struct rchan_buf *buf,
>>> + struct buf_page *page, size_t offset, ssize_t diff_offset);
>>> +
>>> +extern struct buf_page *ltt_relay_find_next_page(struct rchan_buf *buf,
>>> + struct buf_page *page, size_t offset, ssize_t diff_offset);
>>> +
>>> +extern void _ltt_relay_write(struct rchan_buf *buf, size_t offset,
>>> + const void *src, size_t len, struct buf_page *page);
>>>
>>> extern int ltt_relay_read(struct rchan_buf *buf, size_t offset,
>>> void *dest, size_t len);
>>> @@ -159,6 +165,84 @@ extern void *ltt_relay_offset_address(st
>>> size_t offset);
>>>
>>> /*
>>> + * Find the page containing "offset". Cache it if it is after the currently
>>> + * cached page.
>>> + */
>>> +static inline struct buf_page *ltt_relay_cache_page(struct rchan_buf *buf,
>>> + struct buf_page **page_cache,
>>> + struct buf_page *page, size_t offset)
>>> +{
>>> + ssize_t diff_offset;
>>> + ssize_t half_buf_size = buf->chan->alloc_size >> 1;
>>> +
>>> + /*
>>> + * Make sure this is the page we want to write into. The current
>>> + * page is changed concurrently by other writers. [wrh]page are
>>> + * used as a cache remembering the last page written
>>> + * to/read/looked up for header address. No synchronization;
>>> + * could have to find the previous page if a nested write
>>> + * occurred. Finding the right page is done by comparing the
>>> + * dest_offset with the buf_page offsets.
>>> + * When at the exact opposite of the buffer, bias towards forward search
>>> + * because it will be cached.
>>> + */
>>> +
>>> + diff_offset = (ssize_t)offset - (ssize_t)page->offset;
>>> + if (diff_offset <= -(ssize_t)half_buf_size)
>>> + diff_offset += buf->chan->alloc_size;
>>> + else if (diff_offset > half_buf_size)
>>> + diff_offset -= buf->chan->alloc_size;
>>> +
>>> + if (unlikely(diff_offset >= (ssize_t)PAGE_SIZE)) {
>>> + page = ltt_relay_find_next_page(buf, page, offset, diff_offset);
>>> + *page_cache = page;
>>> + } else if (unlikely(diff_offset < 0)) {
>>> + page = ltt_relay_find_prev_page(buf, page, offset, diff_offset);
>>> + }
>>> + return page;
>>> +}
>>> +
>>> +static inline void ltt_relay_do_copy(void *dest, const void *src, size_t len)
>>> +{
>>> + switch (len) {
>>> + case 1: *(u8 *)dest = *(const u8 *)src;
>>> + break;
>>> + case 2: *(u16 *)dest = *(const u16 *)src;
>>> + break;
>>> + case 4: *(u32 *)dest = *(const u32 *)src;
>>> + break;
>>> +#if (BITS_PER_LONG == 64)
>>> + case 8: *(u64 *)dest = *(const u64 *)src;
>>> + break;
>>> +#endif
>>> + default:
>>> + memcpy(dest, src, len);
>>> + }
>>> +}
>> I think this function is not correct when @src is not aligned
>> on 2, 4, 8, or 16 bytes.
>>
>
> Hrm, interesting. So if we need to copy 4 chars, e.g.
>
> char data[4]
>
> then there is no alignment requirement on the data.
>
> This is normally not a problem for architectures with
> CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS, but could be a problem for
> architectures with slow unaligned accesses.
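>
> For example, if data sits at an odd address, *(const u32 *)src on a
> strict-alignment architecture either faults or gets fixed up slowly by
> the unaligned-access trap handler.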
>
> What do you think of this proposal for ltt_relay_do_copy ?
>
> #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> static inline void ltt_relay_do_copy(void *dest, const void *src, size_t len)
> {
> switch (len) {
> case 0: break;
> case 1: *(u8 *)dest = *(const u8 *)src;
> break;
> case 2: *(u16 *)dest = *(const u16 *)src;
> break;
> case 4: *(u32 *)dest = *(const u32 *)src;
> break;
> #if (BITS_PER_LONG == 64)
> case 8: *(u64 *)dest = *(const u64 *)src;
> break;
> #endif
Hello, Mathieu

Why not keep the 64-bit case even on architectures without a 64-bit write,
and let gcc generate the instructions instead of calling memcpy:

case 4: *(u32 *)dest = *(const u32 *)src;
        break;
case 8: *(u64 *)dest = *(const u64 *)src;
        break;

IMHO, even on an architecture without a 64-bit write, memcpy is more
complex than the code gcc generates for these assignments.
> default:
> memcpy(dest, src, len);
> }
> }
> #else
> /*
> * Returns whether the dest and src addresses are aligned on
> * min(sizeof(void *), len). Call this with statically known len for efficiency.
> */
> static inline int addr_aligned(const void *dest, const void *src, size_t len)
> {
> if (ltt_align((size_t)dest, len))
> return 0;
> if (ltt_align((size_t)src, len))
> return 0;
> return 1;
> }
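>
> (ltt_align() is the existing LTTng alignment helper; for the purpose of
> reading this proposal, a rough behavioural stand-in would be:
>
> /* non-zero when addr is not aligned on size (a power of 2) */
> static inline size_t ltt_align(size_t addr, size_t size)
> {
>         return addr & (size - 1);
> }
>
> so addr_aligned() returns 1 only when both pointers are suitably aligned.)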
>
> static inline void ltt_relay_do_copy(void *dest, const void *src, size_t len)
> {
> switch (len) {
> case 0: break;
> case 1: *(u8 *)dest = *(const u8 *)src;
> break;
> case 2: if (unlikely(!addr_aligned(dest, src, 2)))
> goto memcpy_fallback;
> *(u16 *)dest = *(const u16 *)src;
> break;
> case 4: if (unlikely(!addr_aligned(dest, src, 4)))
> goto memcpy_fallback;
> *(u32 *)dest = *(const u32 *)src;
> break;
> #if (BITS_PER_LONG == 64)
> case 8: if (unlikely(!addr_aligned(dest, src, 8)))
> goto memcpy_fallback;
> *(u64 *)dest = *(const u64 *)src;
> break;
> #endif
> default:
> goto memcpy_fallback;
> }
> return;
> memcpy_fallback:
> memcpy(dest, src, len);
It seems the code is more complex on architectures without
CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS...
Because len is at most 8 here, copying the bytes one by one may be more
efficient than a complex memcpy:

memcpy_fallback:
        for (; len > 0; len--)
                *(u8 *)dest++ = *(const u8 *)src++;
B.R.
Zhaolei
> }
> #endif
>
>
> Mathieu
>
>>> +
>>> +static inline int ltt_relay_write(struct rchan_buf *buf, size_t offset,
>>> + const void *src, size_t len)
>>> +{
>>> + struct buf_page *page;
>>> + ssize_t pagecpy, orig_len;
>>> +
>>> + if (unlikely(!len))
>>> + return 0;
>>> + orig_len = len;
>>> + offset &= buf->chan->alloc_size - 1;
>>> + page = buf->wpage;
>>> +
>>> + page = ltt_relay_cache_page(buf, &buf->wpage, page, offset);
>>> + pagecpy = min_t(size_t, len, PAGE_SIZE - (offset & ~PAGE_MASK));
>>> + ltt_relay_do_copy(page_address(page->page)
>>> + + (offset & ~PAGE_MASK), src, pagecpy);
>>> + len -= pagecpy;
>>> + if (unlikely(len))
>>> + _ltt_relay_write(buf, offset, src, len, page);
>>> + return orig_len;
>>> +}
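
If I read the inline fast path right, a typical call would look like this
(a hypothetical call site, not from this patch): an 8-byte field that fits
in the current page takes one ltt_relay_cache_page() lookup plus the
"case 8" store, all inside the inline function:

        u64 field = 42; /* e.g. an 8-byte event field */
        ltt_relay_write(buf, write_offset, &field, sizeof(field));

Only a write crossing a page boundary falls back to the exported
_ltt_relay_write() slow path.
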
>>> +
>>> +/*
>>> * CONFIG_LTT_RELAY kernel API, ltt/ltt-relay-alloc.c
>>> */
>>>
>>>