[ltt-dev] [PATCH] LTTng optimize write to page function

Zhaolei zhaolei at cn.fujitsu.com
Tue Feb 3 23:07:55 EST 2009


Mathieu Desnoyers wrote:
> * Lai Jiangshan (laijs at cn.fujitsu.com) wrote:
>> Mathieu Desnoyers wrote:
>>> The functions in ltt-relay-alloc.c take care of writing the data into
>>> the buffer pages. Those pages are allocated from the page allocator and
>>> no virtual mapping is done so we can save precious TLB entries.
>>> ltt-relay-alloc.c is the abstraction layer which makes the buffers
>>> "look" like a contiguous memory area, although they are made from
>>> physically discontiguous pages linked with a linked list. A caching
>>> mechanism makes sure we never walk over more than 1-2 entries of the
>>> list. We use a linked list rather than a table to make sure we don't
>>> depend on vmalloc to allocate large pointer arrays.
>>>
>>> I did a bit of profiling with oprofile on LTTng and found out that the write
>>> functions in ltt-relay-alloc.c were taking a lot of CPU time. I thought it would
>>> be good to improve them a bit.
>>>
>>> Running a 2.6.29-rc3 kernel
>>>
>>> Compiling a 2.6.25 kernel using make -j10 on an 8-core x86_64 with a vanilla
>>> 2.6.29-rc3 kernel (all tests are cache-hot):
>>> real 1m22.103s
>>>
>>> With dormant instrumentation
>>> real 1m24.667s
>>> (note: this 2s regression should eventually be identified by doing a bisection
>>> of the LTTng tree.)
>>>
>>> ltt-armall
>>>
>>> Without modification, with flight recorder tracing active:
>>> real 1m31.135s
>>>
>>> Replacing the memcpy call with a specialized call for 1, 2, 4 and 8 bytes:
>>> real 1m30.440s
>>>
>>> Inlining the fast path of the write function:
>>> real 1m29.614s
>>>
>>> Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers at polymtl.ca>
>>> CC: Martin Bligh <mbligh at google.com>
>>> CC: Zhaolei <zhaolei at cn.fujitsu.com>
>>> ---
>>>  include/linux/ltt-relay.h |   88 ++++++++++++++++++++++++++++++++++++++++++++--
>>>  ltt/ltt-relay-alloc.c     |   66 ++++++----------------------------
>>>  2 files changed, 98 insertions(+), 56 deletions(-)
>>>
>>> Index: linux-2.6-lttng/ltt/ltt-relay-alloc.c
>>> ===================================================================
>>> --- linux-2.6-lttng.orig/ltt/ltt-relay-alloc.c	2009-02-03 10:37:05.000000000 -0500
>>> +++ linux-2.6-lttng/ltt/ltt-relay-alloc.c	2009-02-03 10:37:13.000000000 -0500
>>> @@ -424,7 +424,7 @@ EXPORT_SYMBOL_GPL(ltt_relay_close);
>>>  /*
>>>   * Start iteration at the previous element. Skip the real list head.
>>>   */
>>> -static struct buf_page *ltt_relay_find_prev_page(struct rchan_buf *buf,
>>> +struct buf_page *ltt_relay_find_prev_page(struct rchan_buf *buf,
>>>  	struct buf_page *page, size_t offset, ssize_t diff_offset)
>>>  {
>>>  	struct buf_page *iter;
>>> @@ -456,13 +456,15 @@ static struct buf_page *ltt_relay_find_p
>>>  			return iter;
>>>  		}
>>>  	}
>>> +	WARN_ON(1);
>>>  	return NULL;
>>>  }
>>> +EXPORT_SYMBOL_GPL(ltt_relay_find_prev_page);
>>>  
>>>  /*
>>>   * Start iteration at the next element. Skip the real list head.
>>>   */
>>> -static struct buf_page *ltt_relay_find_next_page(struct rchan_buf *buf,
>>> +struct buf_page *ltt_relay_find_next_page(struct rchan_buf *buf,
>>>  	struct buf_page *page, size_t offset, ssize_t diff_offset)
>>>  {
>>>  	struct buf_page *iter;
>>> @@ -494,48 +496,10 @@ static struct buf_page *ltt_relay_find_n
>>>  			return iter;
>>>  		}
>>>  	}
>>> +	WARN_ON(1);
>>>  	return NULL;
>>>  }
>>> -
>>> -/*
>>> - * Find the page containing "offset". Cache it if it is after the currently
>>> - * cached page.
>>> - */
>>> -static struct buf_page *ltt_relay_cache_page(struct rchan_buf *buf,
>>> -		struct buf_page **page_cache,
>>> -		struct buf_page *page, size_t offset)
>>> -{
>>> -	ssize_t diff_offset;
>>> -	ssize_t half_buf_size = buf->chan->alloc_size >> 1;
>>> -
>>> -	/*
>>> -	 * Make sure this is the page we want to write into. The current
>>> -	 * page is changed concurrently by other writers. [wrh]page are
>>> -	 * used as a cache remembering the last page written
>>> -	 * to/read/looked up for header address. No synchronization;
>>> -	 * could have to find the previous page if a nested write
>>> -	 * occurred. Finding the right page is done by comparing the
>>> -	 * dest_offset with the buf_page offsets.
>>> -	 * When at the exact opposite of the buffer, bias towards forward search
>>> -	 * because it will be cached.
>>> -	 */
>>> -
>>> -	diff_offset = (ssize_t)offset - (ssize_t)page->offset;
>>> -	if (diff_offset <= -(ssize_t)half_buf_size)
>>> -		diff_offset += buf->chan->alloc_size;
>>> -	else if (diff_offset > half_buf_size)
>>> -		diff_offset -= buf->chan->alloc_size;
>>> -
>>> -	if (unlikely(diff_offset >= (ssize_t)PAGE_SIZE)) {
>>> -		page = ltt_relay_find_next_page(buf, page, offset, diff_offset);
>>> -		WARN_ON(!page);
>>> -		*page_cache = page;
>>> -	} else if (unlikely(diff_offset < 0)) {
>>> -		page = ltt_relay_find_prev_page(buf, page, offset, diff_offset);
>>> -		WARN_ON(!page);
>>> -	}
>>> -	return page;
>>> -}
>>> +EXPORT_SYMBOL_GPL(ltt_relay_find_next_page);
>>>  
>>>  /**
>>>   * ltt_relay_write - write data to a ltt_relay buffer.
>>> @@ -543,22 +507,17 @@ static struct buf_page *ltt_relay_cache_
>>>   * @offset : offset within the buffer
>>>   * @src : source address
>>>   * @len : length to write
>>> + * @page : cached buffer page
>>>   */
>>> -int ltt_relay_write(struct rchan_buf *buf, size_t offset,
>>> -	const void *src, size_t len)
>>> +void _ltt_relay_write(struct rchan_buf *buf, size_t offset,
>>> +	const void *src, size_t len, struct buf_page *page)
>>>  {
>>> -	struct buf_page *page;
>>> -	ssize_t pagecpy, orig_len;
>>> +	ssize_t pagecpy;
>>>  
>>> -	orig_len = len;
>>> -	offset &= buf->chan->alloc_size - 1;
>>> -	page = buf->wpage;
>>> -	if (unlikely(!len))
>>> -		return 0;
>>>  	for (;;) {
>>>  		page = ltt_relay_cache_page(buf, &buf->wpage, page, offset);
>>>  		pagecpy = min_t(size_t, len, PAGE_SIZE - (offset & ~PAGE_MASK));
>>> -		memcpy(page_address(page->page)
>>> +		ltt_relay_do_copy(page_address(page->page)
>>>  			+ (offset & ~PAGE_MASK), src, pagecpy);
> 
> Hi Lai,
> 
> Happy Chinese New Year :)
> 
> Please see the corrected v2 of this patch. This v1 is broken.
> 
>> I think memcpy() is better than ltt_relay_do_copy() here.
>> (offset & ~PAGE_MASK) is unlikely to be 1, 2, 4, 8 or 16 here.
> 
> ltt_relay_do_copy has the switch for 1, 2, 4, 8 for the "len" parameter,
> which corresponds to "pagecpy", not (offset & ~PAGE_MASK).
> 
>>>  		len -= pagecpy;
>>>  		if (likely(!len))
>>> @@ -571,9 +530,8 @@ int ltt_relay_write(struct rchan_buf *bu
>>>  		 */
>>>  		WARN_ON(offset >= buf->chan->alloc_size);
>>>  	}
>>> -	return orig_len;
>>>  }
>>> -EXPORT_SYMBOL_GPL(ltt_relay_write);
>>> +EXPORT_SYMBOL_GPL(_ltt_relay_write);
>>>  
>>>  /**
>>>   * ltt_relay_read - read data from ltt_relay_buffer.
>>> Index: linux-2.6-lttng/include/linux/ltt-relay.h
>>> ===================================================================
>>> --- linux-2.6-lttng.orig/include/linux/ltt-relay.h	2009-02-03 10:37:06.000000000 -0500
>>> +++ linux-2.6-lttng/include/linux/ltt-relay.h	2009-02-03 10:37:13.000000000 -0500
>>> @@ -140,8 +140,14 @@ struct rchan_callbacks {
>>>  	int (*remove_buf_file)(struct dentry *dentry);
>>>  };
>>>  
>>> -extern int ltt_relay_write(struct rchan_buf *buf, size_t offset,
>>> -	const void *src, size_t len);
>>> +extern struct buf_page *ltt_relay_find_prev_page(struct rchan_buf *buf,
>>> +	struct buf_page *page, size_t offset, ssize_t diff_offset);
>>> +
>>> +extern struct buf_page *ltt_relay_find_next_page(struct rchan_buf *buf,
>>> +	struct buf_page *page, size_t offset, ssize_t diff_offset);
>>> +
>>> +extern void _ltt_relay_write(struct rchan_buf *buf, size_t offset,
>>> +	const void *src, size_t len, struct buf_page *page);
>>>  
>>>  extern int ltt_relay_read(struct rchan_buf *buf, size_t offset,
>>>  	void *dest, size_t len);
>>> @@ -159,6 +165,84 @@ extern void *ltt_relay_offset_address(st
>>>  	size_t offset);
>>>  
>>>  /*
>>> + * Find the page containing "offset". Cache it if it is after the currently
>>> + * cached page.
>>> + */
>>> +static inline struct buf_page *ltt_relay_cache_page(struct rchan_buf *buf,
>>> +		struct buf_page **page_cache,
>>> +		struct buf_page *page, size_t offset)
>>> +{
>>> +	ssize_t diff_offset;
>>> +	ssize_t half_buf_size = buf->chan->alloc_size >> 1;
>>> +
>>> +	/*
>>> +	 * Make sure this is the page we want to write into. The current
>>> +	 * page is changed concurrently by other writers. [wrh]page are
>>> +	 * used as a cache remembering the last page written
>>> +	 * to/read/looked up for header address. No synchronization;
>>> +	 * could have to find the previous page if a nested write
>>> +	 * occurred. Finding the right page is done by comparing the
>>> +	 * dest_offset with the buf_page offsets.
>>> +	 * When at the exact opposite of the buffer, bias towards forward search
>>> +	 * because it will be cached.
>>> +	 */
>>> +
>>> +	diff_offset = (ssize_t)offset - (ssize_t)page->offset;
>>> +	if (diff_offset <= -(ssize_t)half_buf_size)
>>> +		diff_offset += buf->chan->alloc_size;
>>> +	else if (diff_offset > half_buf_size)
>>> +		diff_offset -= buf->chan->alloc_size;
>>> +
>>> +	if (unlikely(diff_offset >= (ssize_t)PAGE_SIZE)) {
>>> +		page = ltt_relay_find_next_page(buf, page, offset, diff_offset);
>>> +		*page_cache = page;
>>> +	} else if (unlikely(diff_offset < 0)) {
>>> +		page = ltt_relay_find_prev_page(buf, page, offset, diff_offset);
>>> +	}
>>> +	return page;
>>> +}
>>> +
>>> +static inline void ltt_relay_do_copy(void *dest, const void *src, size_t len)
>>> +{
>>> +	switch (len) {
>>> +	case 1:	*(u8 *)dest = *(const u8 *)src;
>>> +		break;
>>> +	case 2:	*(u16 *)dest = *(const u16 *)src;
>>> +		break;
>>> +	case 4:	*(u32 *)dest = *(const u32 *)src;
>>> +		break;
>>> +#if (BITS_PER_LONG == 64)
>>> +	case 8:	*(u64 *)dest = *(const u64 *)src;
>>> +		break;
>>> +#endif
>>> +	default:
>>> +		memcpy(dest, src, len);
>>> +	}
>>> +}
>> I think this function is not correct when @src is not aligned on
>> 2, 4, 8 or 16.
>>
> 
> Hrm, interesting. So if we need to copy 4 chars, e.g.
> 
> char data[4]
> 
> then there are no alignment requirements within the data.
> 
> This is normally not a problem for architectures with
> CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS, but could be a problem for
> architectures with slow unaligned accesses.
> 
> What do you think of this proposal for ltt_relay_do_copy ?
> 
> #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> static inline void ltt_relay_do_copy(void *dest, const void *src, size_t len)
> {
> 	switch (len) {
> 	case 0:	break;
> 	case 1:	*(u8 *)dest = *(const u8 *)src;
> 		break;
> 	case 2:	*(u16 *)dest = *(const u16 *)src;
> 		break;
> 	case 4:	*(u32 *)dest = *(const u32 *)src;
> 		break;
> #if (BITS_PER_LONG == 64)
> 	case 8:	*(u64 *)dest = *(const u64 *)src;
> 		break;
> #endif
Hello, Mathieu

Why not use the instructions generated by gcc instead of memcpy on arches without a 64-bit write, as in:
case 4:	*(u32 *)dest = *(const u32 *)src;
	break;
case 8:	*(u64 *)dest = *(const u64 *)src;
	break;

IMHO, even on an arch without a 64-bit write, memcpy is more complex.
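
For illustration, a rough sketch of what I mean (untested; it just drops the
BITS_PER_LONG guard and lets gcc pick the instruction sequence, e.g. two
32-bit moves for the 8-byte case on a 32-bit target):

static inline void ltt_relay_do_copy(void *dest, const void *src, size_t len)
{
	switch (len) {
	case 0:	break;
	case 1:	*(u8 *)dest = *(const u8 *)src;
		break;
	case 2:	*(u16 *)dest = *(const u16 *)src;
		break;
	case 4:	*(u32 *)dest = *(const u32 *)src;
		break;
	case 8:	/* two 32-bit moves on a 32-bit arch, one move on 64-bit */
		*(u64 *)dest = *(const u64 *)src;
		break;
	default:
		memcpy(dest, src, len);
	}
}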

> 	default:
> 		memcpy(dest, src, len);
> 	}
> }
> #else
> /*
>  * Returns whether the dest and src addresses are aligned on
>  * min(sizeof(void *), len). Call this with statically known len for efficiency.
>  */
> static inline int addr_aligned(void *dest, const void *src, size_t len)
> {
> 	if (ltt_align((size_t)dest, len))
> 		return 0;
> 	if (ltt_align((size_t)src, len))
> 		return 0;
> 	return 1;
> }
> 
> static inline void ltt_relay_do_copy(void *dest, const void *src, size_t len)
> {
> 	switch (len) {
> 	case 0:	break;
> 	case 1:	*(u8 *)dest = *(const u8 *)src;
> 		break;
> 	case 2:	if (unlikely(!addr_aligned(dest, src, 2)))
> 			goto memcpy_fallback;
> 		*(u16 *)dest = *(const u16 *)src;
> 		break;
> 	case 4:	if (unlikely(!addr_aligned(dest, src, 4)))
> 			goto memcpy_fallback;
> 		*(u32 *)dest = *(const u32 *)src;
> 		break;
> #if (BITS_PER_LONG == 64)
> 	case 8:	if (unlikely(!addr_aligned(dest, src, 8)))
> 			goto memcpy_fallback;
> 		*(u64 *)dest = *(const u64 *)src;
> 		break;
> #endif
> 	default:
> 		goto memcpy_fallback;
> 	}
> 	return;
> memcpy_fallback:
> 	memcpy(dest, src, len);
It seems the code is more complex on arches without CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS...

Because len is at most 8 bytes here, copying them one by one may be more efficient than a complex memcpy:
memcpy_fallback:
	for (; len > 0; len--)
		*(u8 *)dest++ = *(const u8 *)src++;
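
Putting it together, the variant I have in mind would look roughly like this
(untested sketch; addr_aligned() as in your proposal above, and the default
case keeps memcpy because len can be large there):

static inline void ltt_relay_do_copy(void *dest, const void *src, size_t len)
{
	u8 *d = dest;
	const u8 *s = src;

	switch (len) {
	case 0:	break;
	case 1:	*(u8 *)dest = *(const u8 *)src;
		break;
	case 2:	if (unlikely(!addr_aligned(dest, src, 2)))
			goto byte_copy;
		*(u16 *)dest = *(const u16 *)src;
		break;
	case 4:	if (unlikely(!addr_aligned(dest, src, 4)))
			goto byte_copy;
		*(u32 *)dest = *(const u32 *)src;
		break;
#if (BITS_PER_LONG == 64)
	case 8:	if (unlikely(!addr_aligned(dest, src, 8)))
			goto byte_copy;
		*(u64 *)dest = *(const u64 *)src;
		break;
#endif
	default:
		memcpy(dest, src, len);
	}
	return;
byte_copy:
	/* at most 8 bytes and unaligned: copy them one by one */
	for (; len > 0; len--)
		*d++ = *s++;
}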

B.R.
Zhaolei

> }
> #endif
> 
> 
> Mathieu
> 
>>> +
>>> +static inline int ltt_relay_write(struct rchan_buf *buf, size_t offset,
>>> +	const void *src, size_t len)
>>> +{
>>> +	struct buf_page *page;
>>> +	ssize_t pagecpy, orig_len;
>>> +
>>> +	if (unlikely(!len))
>>> +		return 0;
>>> +	orig_len = len;
>>> +	offset &= buf->chan->alloc_size - 1;
>>> +	page = buf->wpage;
>>> +
>>> +	page = ltt_relay_cache_page(buf, &buf->wpage, page, offset);
>>> +	pagecpy = min_t(size_t, len, PAGE_SIZE - (offset & ~PAGE_MASK));
>>> +	ltt_relay_do_copy(page_address(page->page)
>>> +		+ (offset & ~PAGE_MASK), src, pagecpy);
>>> +	len -= pagecpy;
>>> +	if (unlikely(len))
>>> +		_ltt_relay_write(buf, offset, src, len, page);
>>> +	return orig_len;
>>> +}
>>> +
>>> +/*
>>>   * CONFIG_LTT_RELAY kernel API, ltt/ltt-relay-alloc.c
>>>   */
>>>  
>>>
>>
>>