[lttng-dev] lttng-consumerd can NOT get notification to consume ring buffer data
zhenyu.ren
zhenyu.ren at aliyun.com
Tue Dec 7 04:03:45 EST 2021
Hi, Lttng-dev:
I found a strange problem related to lttng-consumerd and CTF files recently. The CTF files belonging to some CPUs have stopped rotating, while the CTF files belonging to other CPUs kept working as usual. I am quite sure that all CPUs were producing spans the whole time.
#date; ls -ltrh channel0_0_* |tail -n 3; date;ls -ltrh channel0_1_* |tail -n 3
Tue Dec 7 16:25:45 CST 2021
-rw-rw---- 1 root root 1.8M Dec 7 16:20 channel0_0_17
-rw-rw---- 1 root root 2.0M Dec 7 16:23 channel0_0_18
-rw-rw---- 1 root root 916K Dec 7 16:24 channel0_0_19
Tue Dec 7 16:25:45 CST 2021
-rw-rw---- 1 root root 1.7M Dec 6 00:30 channel0_1_8
-rw-rw---- 1 root root 1.9M Dec 6 00:31 channel0_1_9
-rw-rw---- 1 root root 388K Dec 6 00:32 channel0_1_10
Notice that the CTF file for CPU0 (channel0_0_19) was modified just now (16:24), but the CTF file for CPU1 (channel0_1_10) stopped being updated on Dec 6.
I attached gdb to lttng-consumerd, breaking at lib_ring_buffer_channel_do_read() and lib_ring_buffer_poll_deliver() (I configured a read timer on the channels). See the following annotated disassembly of lib_ring_buffer_poll_deliver(), which is inlined into lib_ring_buffer_channel_do_read(), followed by its source:
0x00007f0f2418a445 <+213>: mov 0x0(%r13),%rcx rcx:0x7f0f0402e610(handle->table)
0x00007f0f2418a449 <+217>: mov 0x98(%rsi),%r9 rsi:0x7f0efb86f000(buf) r9:0x86c400000(consumed_old = buf->consumed)
0x00007f0f2418a450 <+224>: lea 0x150(%rsi),%rdi rdi:0x7f0efda75150
0x00007f0f2418a457 <+231>: mov 0x150(%rsi),%rax rax:3(cpu?)
0x00007f0f2418a45e <+238>: mov 0x78(%rbp),%rdx rbp:chan rdx:0x1000000(chan->backend.buf_size)
0x00007f0f2418a462 <+242>: mov 0x88(%rbp),%r8d r8d:0x14(20)(chan->backend.buf_size_order)
0x00007f0f2418a469 <+249>: cmp %rax,0x8(%rcx)
0x00007f0f2418a46d <+253>: jbe 0x7f0f2418a80c <lib_ring_buffer_channel_do_read+1180>
0x00007f0f2418a473 <+259>: shl $0x6,%rax rax:192
0x00007f0f2418a477 <+263>: sub $0x1,%rdx rdx:0xffffff 16777215 (chan->backend.buf_size - 1)
0x00007f0f2418a47b <+267>: lea 0x10(%rax,%rcx,1),%r10 rax:192 rcx:0x7f0f0402e610(handle->table) r10:0x7f0f0402e6e0
0x00007f0f2418a480 <+272>: and %r9,%rdx r9:0x8fdc00000 rdx:0xc00000 12582912 buf_trunc(consume_old, chan->backend.buf_size - 1)
0x00007f0f2418a483 <+275>: mov 0x8(%rdi),%rax rdi:0x7f0efda75150 rax:2688 ref_offset = (size_t) ref->offset
0x00007f0f2418a487 <+279>: mov %r8d,%ecx ecx:0x14(20)
0x00007f0f2418a48a <+282>: shr %cl,%rdx (....)
0x00007f0f2418a48d <+285>: shl $0x7,%rdx
0x00007f0f2418a491 <+289>: add %rax,%rdx
0x00007f0f2418a494 <+292>: lea 0x80(%rdx),%rax
0x00007f0f2418a49b <+299>: cmp 0x28(%r10),%rax
0x00007f0f2418a49f <+303>: ja 0x7f0f2418a80c <lib_ring_buffer_channel_do_read+1180>
0x00007f0f2418a4a5 <+309>: mov 0x20(%r10),%rax r10:0x7f0f0402e6e0 rax:0x7f0efb86f000(buf)
0x00007f0f2418a4a9 <+313>: add %rdx,%rax rdx:3200 rax:0x7f0efb86fc80
0x00007f0f2418a4ac <+316>: mov (%rax),%rax rax:0x86c00000(commit_count!!!!!!Incremented _once_ at sb switch cc_sb)
0x00007f0f2418a4af <+319>: mov 0x80(%rbp),%r8 r8:0x100000(chan->backend.subbuf_size)
0x00007f0f2418a4b6 <+326>: mov 0x78(%rbp),%rdx rdx:0x1000000(chan->backend.buf_size)
0x00007f0f2418a4ba <+330>: mov 0x8c(%rbp),%ecx ecx:4
0x00007f0f2418a4c0 <+336>: mov 0x80(%rsi),%rdi rdi:0x86d400000
0x00007f0f2418a4c7 <+343>: sub %r8,%rax rax:0x86b00000(commit_count - chan->backend.subbuf_size)
0x00007f0f2418a4ca <+346>: and 0x8(%rbp),%rax rax & chan->commit_count_mask = rax:0x86b00000
0x00007f0f2418a4ce <+350>: neg %rdx rdx:0x1000000(16M chan->backend.buf_size) --> 0xffffffffff000000(-16777216)
0x00007f0f2418a4d1 <+353>: and %r9,%rdx r9:0x86c400000(consume_old) rdx:0x86c000000
0x00007f0f2418a4d4 <+356>: shr %cl,%rdx cl:4 rdx:0x86c00000
0x00007f0f2418a4d7 <+359>: cmp %rdx,%rax rax:0x86b0000 rdx:0x86c00000
0x00007f0f2418a4da <+362>: jne 0x7f0f2418a411 <lib_ring_buffer_channel_do_read+161>
0x00007f0f2418a4e0 <+368>: mov %r8,%rax
209 static inline
210 int lib_ring_buffer_poll_deliver(const struct lttng_ust_lib_ring_buffer_config *config,
211 struct lttng_ust_lib_ring_buffer *buf,
212 struct channel *chan,
213 struct lttng_ust_shm_handle *handle)
214 {
215 unsigned long consumed_old, consumed_idx, commit_count, write_offset;
216
217 consumed_old = uatomic_read(&buf->consumed);
218 consumed_idx = subbuf_index(consumed_old, chan);
219 commit_count = v_read(config, &shmp_index(handle, buf->commit_cold, consumed_idx)->cc_sb);
220 /*
221 * No memory barrier here, since we are only interested
222 * in a statistically correct polling result. The next poll will
223 * get the data is we are racing. The mb() that ensures correct
224 * memory order is in get_subbuf.
225 */
226 write_offset = v_read(config, &buf->offset);
227
228 /*
229 * Check that the subbuffer we are trying to consume has been
230 * already fully committed.
231 */
232
233 if (((commit_count - chan->backend.subbuf_size)
234 & chan->commit_count_mask)
235 - (buf_trunc(consumed_old, chan)
236 >> chan->backend.num_subbuf_order)
237 != 0)
238 return 0;
239
240 /*
241 * Check that we are not about to read the same subbuffer in
242 * which the writer head is.
243 */
244 if (subbuf_trunc(write_offset, chan) - subbuf_trunc(consumed_old, chan)
245 == 0)
246 return 0;
247
248 return 1;
249
250 }
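To help map the disassembly to the source, here is my reading of the index/trunc helpers, reduced to plain parameters (the real ones take a struct channel *; this is only a sketch of how I understand the lttng-ust 2.7 ring buffer, not a copy of it):

#include <stdio.h>

/* Reduced stand-ins for the helpers used at lines 218, 235 and 244 above. */
static unsigned long buf_trunc(unsigned long off, unsigned long buf_size)
{
        return off & ~(buf_size - 1);           /* buffer-sized "epoch" part of the offset */
}

static unsigned long subbuf_trunc(unsigned long off, unsigned long subbuf_size)
{
        return off & ~(subbuf_size - 1);        /* sub-buffer-aligned part of the offset */
}

static unsigned long subbuf_index(unsigned long off, unsigned long buf_size,
                                  unsigned int subbuf_size_order)
{
        return (off & (buf_size - 1)) >> subbuf_size_order;     /* which sub-buffer */
}

int main(void)
{
        unsigned long consumed_old = 0x86c400000UL;     /* observed buf->consumed */

        printf("buf_trunc=0x%lx subbuf_trunc=0x%lx consumed_idx=%lu\n",
                buf_trunc(consumed_old, 0x1000000UL),   /* 16 MiB buffer */
                subbuf_trunc(consumed_old, 0x100000UL), /* 1 MiB sub-buffers */
                subbuf_index(consumed_old, 0x1000000UL, 20));
        return 0;
}

This prints consumed_idx = 4, which as far as I can tell matches the shl $0x7 in the disassembly (commit_cold entry at offset 4 * 128 = 512, i.e. 2688 + 512 = 3200).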
It seems that lib_ring_buffer_channel_do_read() -> lib_ring_buffer_poll_deliver() returned 0 at line 238 ($consumed_old: 0x86c400000, $commit_count: 0x86c00000, $write_offset: 0x86d400000).
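To make that concrete, here is a small standalone program that plugs the observed values into the two checks (only a sketch: the sub-buffer size and count come from the channel configuration below, and commit_count_mask is assumed to be ~0UL >> num_subbuf_order, which is my reading of the 2.7 sources):

#include <stdio.h>

int main(void)
{
        /* Values observed in gdb (see above). */
        unsigned long consumed_old = 0x86c400000UL;     /* buf->consumed */
        unsigned long commit_count = 0x86c00000UL;      /* commit_cold[consumed_idx].cc_sb */
        unsigned long write_offset = 0x86d400000UL;     /* buf->offset */

        /* Channel attributes: 16 sub-buffers of 1 MiB each. */
        unsigned long subbuf_size = 0x100000UL;
        unsigned long buf_size = 0x1000000UL;
        unsigned int num_subbuf_order = 4;                              /* log2(16) */
        unsigned long commit_count_mask = ~0UL >> num_subbuf_order;    /* assumed */

        /* Check at lines 233-237: has the sub-buffer at consumed_old been fully committed? */
        unsigned long lhs = (commit_count - subbuf_size) & commit_count_mask;           /* 0x86b00000 */
        unsigned long rhs = (consumed_old & ~(buf_size - 1)) >> num_subbuf_order;       /* 0x86c00000 */
        printf("committed check: 0x%lx vs 0x%lx -> %s\n", lhs, rhs,
                lhs != rhs ? "return 0 at line 238" : "would pass");

        /* Check at lines 244-246: is the reader still in the writer's sub-buffer? */
        printf("writer subbuf 0x%lx, reader subbuf 0x%lx\n",
                write_offset & ~(subbuf_size - 1),
                consumed_old & ~(subbuf_size - 1));
        return 0;
}

So the first check already fails: cc_sb is one sub-buffer's worth (0x100000) lower than the value the reader needs to see before delivering, and if I read the code correctly, every read-timer poll keeps returning 0 without consuming anything as long as cc_sb stays at that value.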
For reference, here is the channel configuration:
Buffer type: per UID
Channels:
-------------
- channel0: [enabled]
Attributes:
overwrite mode: 0
subbufers size: 1048576
number of subbufers: 16
switch timer interval: 20000000
read timer interval: 60000000
trace file count: 80
trace file size (bytes): 2097152
output: mmap()
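In case it helps to reproduce, the settings above correspond roughly to the following lttng-ctl sketch (my own reconstruction, not our real setup code; the session name "my_session" is made up, the timer intervals are in microseconds, and it links against liblttng-ctl):

#include <lttng/lttng.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        struct lttng_domain domain;
        struct lttng_channel chan;
        struct lttng_handle *handle;
        int ret;

        memset(&domain, 0, sizeof(domain));
        domain.type = LTTNG_DOMAIN_UST;
        domain.buf_type = LTTNG_BUFFER_PER_UID;         /* "Buffer type: per UID" */

        memset(&chan, 0, sizeof(chan));
        strncpy(chan.name, "channel0", sizeof(chan.name) - 1);
        lttng_channel_set_default_attr(&domain, &chan.attr);
        chan.attr.overwrite = 0;                        /* discard mode */
        chan.attr.subbuf_size = 1048576;                /* 1 MiB sub-buffers */
        chan.attr.num_subbuf = 16;
        chan.attr.switch_timer_interval = 20000000;     /* 20 s */
        chan.attr.read_timer_interval = 60000000;       /* 60 s */
        chan.attr.tracefile_size = 2097152;             /* 2 MiB per trace file */
        chan.attr.tracefile_count = 80;
        chan.attr.output = LTTNG_EVENT_MMAP;

        /* Assumes the session "my_session" already exists. */
        handle = lttng_create_handle("my_session", &domain);
        if (!handle)
                return 1;
        ret = lttng_enable_channel(handle, &chan);
        printf("lttng_enable_channel: %d\n", ret);
        lttng_destroy_handle(handle);
        return ret < 0;
}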
PS: The LTTng version is 2.7 (I know it is a very old version :) ).
Do you have any ideas about this problem? Thanks in advance.
Thanks a lot
zhenyu.ren