* [lttng-dev] lttng-consumerd can NOT get notification to consume ring buffer data
@ 2021-12-07 9:03 zhenyu.ren via lttng-dev
From: zhenyu.ren via lttng-dev @ 2021-12-07 9:03 UTC (permalink / raw)
To: lttng-dev
Hi, Lttng-dev:
I recently ran into a strange problem involving lttng-consumerd and CTF files. The CTF files belonging to some CPUs have stopped rotating, while the CTF files belonging to other CPUs keep working as usual. I am certain that all CPUs were producing spans the whole time.
#date; ls -ltrh channel0_0_* |tail -n 3; date;ls -ltrh channel0_1_* |tail -n 3
Tue Dec 7 16:25:45 CST 2021
-rw-rw---- 1 root root 1.8M Dec 7 16:20 channel0_0_17
-rw-rw---- 1 root root 2.0M Dec 7 16:23 channel0_0_18
-rw-rw---- 1 root root 916K Dec 7 16:24 channel0_0_19
Tue Dec 7 16:25:45 CST 2021
-rw-rw---- 1 root root 1.7M Dec 6 00:30 channel0_1_8
-rw-rw---- 1 root root 1.9M Dec 6 00:31 channel0_1_9
-rw-rw---- 1 root root 388K Dec 6 00:32 channel0_1_10
Notice that the CTF file for CPU 0 (channel0_0_19) was modified just now, but the CTF files for CPU 1 (channel0_1_10) stopped being updated on Dec 6.
I attached gdb to lttng-consumerd (breakpoints at lib_ring_buffer_channel_do_read() and lib_ring_buffer_poll_deliver() -- I configured a read timer on the channels). See the following annotated disassembly of lib_ring_buffer_poll_deliver():
0x00007f0f2418a445 <+213>: mov 0x0(%r13),%rcx rcx:0x7f0f0402e610(handle->table)
0x00007f0f2418a449 <+217>: mov 0x98(%rsi),%r9 rsi:0x7f0efb86f000(buf) r9:0x86c400000(consumed_old = buf->consumed)
0x00007f0f2418a450 <+224>: lea 0x150(%rsi),%rdi rdi:0x7f0efda75150
0x00007f0f2418a457 <+231>: mov 0x150(%rsi),%rax rax:3(cpu?)
0x00007f0f2418a45e <+238>: mov 0x78(%rbp),%rdx rbp:chan rdx:0x1000000(chan->backend.buf_size)
0x00007f0f2418a462 <+242>: mov 0x88(%rbp),%r8d r8d:0x14(20)(chan->backend.buf_size_order)
0x00007f0f2418a469 <+249>: cmp %rax,0x8(%rcx)
0x00007f0f2418a46d <+253>: jbe 0x7f0f2418a80c <lib_ring_buffer_channel_do_read+1180>
0x00007f0f2418a473 <+259>: shl $0x6,%rax rax:192
0x00007f0f2418a477 <+263>: sub $0x1,%rdx rdx:0xffffff 16777215 (chan->backend.buf_size - 1)
0x00007f0f2418a47b <+267>: lea 0x10(%rax,%rcx,1),%r10 rax:192 rcx:0x7f0f0402e610(handle->table) r10:0x7f0f0402e6e0
0x00007f0f2418a480 <+272>: and %r9,%rdx r9:0x8fdc00000 rdx:0xc00000 12582912 buf_trunc(consume_old, chan->backend.buf_size - 1)
0x00007f0f2418a483 <+275>: mov 0x8(%rdi),%rax rdi:0x7f0efda75150 rax:2688 ref_offset = (size_t) ref->offset
0x00007f0f2418a487 <+279>: mov %r8d,%ecx ecx:0x14(20)
0x00007f0f2418a48a <+282>: shr %cl,%rdx (....)
0x00007f0f2418a48d <+285>: shl $0x7,%rdx
0x00007f0f2418a491 <+289>: add %rax,%rdx
0x00007f0f2418a494 <+292>: lea 0x80(%rdx),%rax
0x00007f0f2418a49b <+299>: cmp 0x28(%r10),%rax
0x00007f0f2418a49f <+303>: ja 0x7f0f2418a80c <lib_ring_buffer_channel_do_read+1180>
0x00007f0f2418a4a5 <+309>: mov 0x20(%r10),%rax r10:0x7f0f0402e6e0 rax:0x7f0efb86f000(buf)
0x00007f0f2418a4a9 <+313>: add %rdx,%rax rdx:3200 rax:0x7f0efb86fc80
0x00007f0f2418a4ac <+316>: mov (%rax),%rax rax:0x86c00000(commit_count!!!!!!Incremented _once_ at sb switch cc_sb)
0x00007f0f2418a4af <+319>: mov 0x80(%rbp),%r8 r8:0x100000(chan->backend.subbuf_size)
0x00007f0f2418a4b6 <+326>: mov 0x78(%rbp),%rdx rdx:0x1000000(chan->backend.buf_size)
0x00007f0f2418a4ba <+330>: mov 0x8c(%rbp),%ecx ecx:4
0x00007f0f2418a4c0 <+336>: mov 0x80(%rsi),%rdi rdi:0x86d400000
0x00007f0f2418a4c7 <+343>: sub %r8,%rax rax:0x86b00000(commit_count - chan->backend.subbuf_size)
0x00007f0f2418a4ca <+346>: and 0x8(%rbp),%rax rax & chan->commit_count_mask = rax:0x86b00000
0x00007f0f2418a4ce <+350>: neg %rdx rdx:0x1000000(16M chan->backend.buf_size) --> 0xffffffffff000000(-16777216)
0x00007f0f2418a4d1 <+353>: and %r9,%rdx r9:0x86c400000(consume_old) rdx:0x86c000000
0x00007f0f2418a4d4 <+356>: shr %cl,%rdx cl:4 rdx:0x86c00000
0x00007f0f2418a4d7 <+359>: cmp %rdx,%rax rax:0x86b00000 rdx:0x86c00000
0x00007f0f2418a4da <+362>: jne 0x7f0f2418a411 <lib_ring_buffer_channel_do_read+161>
0x00007f0f2418a4e0 <+368>: mov %r8,%rax
209 static inline
210 int lib_ring_buffer_poll_deliver(const struct lttng_ust_lib_ring_buffer_config *config,
211 struct lttng_ust_lib_ring_buffer *buf,
212 struct channel *chan,
213 struct lttng_ust_shm_handle *handle)
214 {
215 unsigned long consumed_old, consumed_idx, commit_count, write_offset;
216
217 consumed_old = uatomic_read(&buf->consumed);
218 consumed_idx = subbuf_index(consumed_old, chan);
219 commit_count = v_read(config, &shmp_index(handle, buf->commit_cold, consumed_idx)->cc_sb);
220 /*
221 * No memory barrier here, since we are only interested
222 * in a statistically correct polling result. The next poll will
223 * get the data if we are racing. The mb() that ensures correct
224 * memory order is in get_subbuf.
225 */
226 write_offset = v_read(config, &buf->offset);
227
228 /*
229 * Check that the subbuffer we are trying to consume has been
230 * already fully committed.
231 */
232
233 if (((commit_count - chan->backend.subbuf_size)
234 & chan->commit_count_mask)
235 - (buf_trunc(consumed_old, chan)
236 >> chan->backend.num_subbuf_order)
237 != 0)
238 return 0;
239
240 /*
241 * Check that we are not about to read the same subbuffer in
242 * which the writer head is.
243 */
244 if (subbuf_trunc(write_offset, chan) - subbuf_trunc(consumed_old, chan)
245 == 0)
246 return 0;
247
248 return 1;
249
250 }
It seems that lib_ring_buffer_channel_do_read()->lib_ring_buffer_poll_deliver() returned 0 at line 238 ($consumed_old: 0x86c400000, $commit_count: 0x86c00000, $write_offset: 0x86d400000).
Buffer type: per UID
Channels:
-------------
- channel0: [enabled]
Attributes:
overwrite mode: 0
sub-buffer size: 1048576
number of sub-buffers: 16
switch timer interval: 20000000
read timer interval: 60000000
trace file count: 80
trace file size (bytes): 2097152
output: mmap()
PS: The LTTng version is 2.7 (I know it is a very old version :) ).
Do you have any ideas about this problem? Thanks in advance.
Thanks a lot
zhenyu.ren
_______________________________________________
lttng-dev mailing list
lttng-dev@lists.lttng.org
https://lists.lttng.org/cgi-bin/mailman/listinfo/lttng-dev
* Re: [lttng-dev] lttng-consumerd can NOT get notification to consume ring buffer data
From: Mathieu Desnoyers via lttng-dev @ 2021-12-07 20:07 UTC (permalink / raw)
To: zhenyu.ren; +Cc: lttng-dev
Hi,
Can you try:
- Reproduce with a more recent LTTng-UST/LTTng-Tools (e.g. 2.13)? LTTng 2.7 is not supported anymore.
- Try to reproduce with "per-pid" UST buffers rather than "per-uid".
- Try to reproduce without the "tracefile rotation" mode (without the --tracefile-count and --tracefile-size options on the channel).
Also, do you happen to have traced applications which are frequently killed asynchronously while some of their threads are actively tracing, by any chance?
Thanks,
Mathieu
----- On Dec 7, 2021, at 4:03 AM, lttng-dev <lttng-dev@lists.lttng.org> wrote:
> [original message quoted in full, trimmed]
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
* [lttng-dev] Re: lttng-consumerd can NOT get notification to consume ring buffer data
From: zhenyu.ren via lttng-dev @ 2021-12-08 1:47 UTC (permalink / raw)
To: Mathieu Desnoyers; +Cc: lttng-dev
Hi, Desnoyers
For various reasons, none of the three suggestions you made about reproducing the problem can be carried out easily.
I also wonder why the commit counter is not updated... It seems that the span producers did not complete their commit (were they killed? did they hang?). It's very hard to debug all the span producers since there are so many of them...
So, can I make lttng-consumerd just skip this sub-buffer and try to consume the next one?
Thanks
zhenyu.ren
------------------------------------------------------------------
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Sent: Wednesday, December 8, 2021, 04:17
To: zhenyu.ren <zhenyu.ren@aliyun.com>
Cc: lttng-dev <lttng-dev@lists.lttng.org>
Subject: Re: [lttng-dev] lttng-consumerd can NOT get notification to consume ring buffer data
[previous messages quoted in full, trimmed]
* Re: [lttng-dev] Re: lttng-consumerd can NOT get notification to consume ring buffer data
From: Mathieu Desnoyers via lttng-dev @ 2021-12-09 20:20 UTC (permalink / raw)
To: zhenyu.ren; +Cc: lttng-dev
----- On Dec 7, 2021, at 8:47 PM, zhenyu.ren <zhenyu.ren@aliyun.com> wrote:
> Hi, Desnoyers
> For various reasons, none of the three suggestions you made about
> reproducing the problem can be carried out easily.
> I also wonder why the commit counter is not updated... It seems that the
> span producers did not complete their commit (were they killed? did they
> hang?). It's very hard to debug all the span producers since there are so
> many of them...
> So, can I make lttng-consumerd just skip this sub-buffer and try to consume
> the next one?
No, it's not that simple. Once this state is reached (e.g. caused by a killed application), just skipping this sub-buffer
will not allow applications to continue using this buffer for tracing until it is destroyed.
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
* [lttng-dev] Re: Re: lttng-consumerd can NOT get notification to consume ring buffer data
From: zhenyu.ren via lttng-dev @ 2021-12-10 1:42 UTC (permalink / raw)
To: Mathieu Desnoyers; +Cc: lttng-dev
>No, it's not that simple
Can I update that sub-buffer's commit_cold counter (and what about the hot one?) to the "right" value, and then skip consuming this sub-buffer (while updating the consumed counter as if I had consumed it)?
It seems tracing would then continue, at the cost of losing all the spans in that sub-buffer.
Thanks
zhenyu.ren
------------------------------------------------------------------
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Sent: Friday, December 10, 2021, 04:27
To: zhenyu.ren <zhenyu.ren@aliyun.com>
Cc: lttng-dev <lttng-dev@lists.lttng.org>
Subject: Re: Re: [lttng-dev] lttng-consumerd can NOT get notification to consume ring buffer data
[previous messages quoted in full, trimmed]