Topics

NRF52 Scanning stops without indication


Jon Pry
 

Hi, 

    We are running continuous active scan on NRF52 with duplicate filtering disabled.  Under some circumstances all scan data stops being generated. This has been difficult to reproduce but is a serious bug for our application. It takes thousands of hours of device run time for us to create and identify this issue. I was able to capture a memory dump of such a device and am trying to examine it for any indication of the problem. I'm looking primarily at _radio state, but if anyone has any pointers I would appreciate it. The zephyr version that create this dump is master branch as of 10/26. 


(gdb) p 'ctrl.c'::_radio
$20 = {
  hf_clock = 0x20007668 <__device_clock_nrf5_m16src>, 
  entropy = 0x20007680 <__device_entropy_nrf5>, 
  ticks_anchor = 3215498, 
  remainder_anchor = 0, 
  is_k32src_stable = 1 '\001', 
  ticker_id_prepare = 5 '\005', 
  ticker_id_event = 0 '\000', 
  ticker_id_stop = 0 '\000', 
  role = ROLE_NONE, 
  state = STATE_NONE, 
  advertiser = {
    hdr = {
      ticks_xtal_to_start = 39, 
      ticks_active_to_start = 0, 
      ticks_preempt_to_start = 0, 
      ticks_slot = 150
    }, 
    chan_map_current = 0 '\000', 
    rfu = 0 '\000', 
    is_hdcd = 0 '\000', 
    is_enabled = 1 '\001', 
    phy_p = 1 '\001', 
    chan_map = 7 '\a', 
    filter_policy = 0 '\000', 
    rl_idx = 255 '\377', 
    adv_data = {
      data = {"`\033\070\r\230X't\002\001\004\021\a\236\312\334$\016\345\251\340\223\363\243\265\001\000@n\000\000\000\000\000\000\000\000\000", "`\033\070\r\230X't\002\001\004\021\a\236\312\334$\016\345\251\340\223\363\243\265\001\000@n\000\000\000\000\000\000\000\000\000"}, 
      first = 0 '\000', 
      last = 0 '\000'
    }, 
    scan_data = {
      data = {"D\022\070\r\230X't\v\tStarGateΛ", '\000' <repeats 18 times>, "D\022\070\r\230X't\v\tStarGateΛ", '\000' <repeats 18 times>}, 
      first = 0 '\000', 
      last = 0 '\000'
    }, 
    conn = 0x20002220 <_radio>
  }, 
  scanner = {
    hdr = {
      ticks_xtal_to_start = 39, 
      ticks_active_to_start = 0, 
      ticks_preempt_to_start = 10, 
      ticks_slot = 5948
    }, 
    is_enabled = 1 '\001', 
    state = 0 '\000', 
    chan = 0 '\000', 
    rfu = 0 '\000', 
    phy = 0 '\000', 
    type = 1 '\001', 
    filter_policy = 0 '\000', 
    adv_addr_type = 0 '\000', 
    init_addr_type = 1 '\001', 
    rpa_gen = 1 '\001', 
    rl_idx = 255 '\377', 
    init_addr = "8\r\230X't", 
    adv_addr = "\344\350\016\036\341", <incomplete sequence \364>, 
    ticks_window = 5939, 
    conn_interval = 40, 
    conn_latency = 0, 
    conn_timeout = 400, 
    ticks_conn_slot = 37, 
    conn = 0x0 <z_clock_driver_init>, 
    win_offset_us = 2
  }, 
  conn_pool = 0x20002220 <_radio>, 
  conn_free = 0x0 <z_clock_driver_init>, 
  connection_count = 1 '\001', 
  conn_curr = 0x0 <z_clock_driver_init>, 
  packet_counter = 1 '\001', 
  crc_expire = 0 '\000', 
  data_chan_map = "\377\377\377\377\037", 
  data_chan_count = 37 '%', 
  sca = 5 '\005', 
  default_tx_octets = 27, 
  default_tx_time = 328, 
  default_phy_tx = 3, 
  default_phy_rx = 3, 
  pkt_rx_data_pool = 0x200023d8 <_radio+440>, 
  pkt_rx_data_free = 0x0 <z_clock_driver_init>, 
  packet_data_octets_max = 27, 
  packet_rx_data_pool_size = 208, 
  packet_rx_data_size = 52, 
  packet_rx_data_count = 4 '\004', 
  packet_rx = 0x20002374 <_radio+340>, 
  packet_rx_count = 5 '\005', 
  packet_rx_last = 0 '\000', 
  packet_rx_acquire = 2 '\002', 
  link_rx_pool = 0x200024a8 <_radio+648>, 
  link_rx_free = 0x200024d0 <_radio+688>, 
  link_rx_head = 0x200024a8 <_radio+648>, 
  link_rx_tail = 0x200024a8 <_radio+648>, 
  link_rx_data_quota = 2 '\002', 
  pkt_tx_ctrl_pool = 0x200024d8 <_radio+696>, 
  pkt_tx_ctrl_free = 0x200024d8 <_radio+696>, 
  pkt_tx_data_pool = 0x20002520 <_radio+768>, 
  pkt_tx_data_free = 0x20002520 <_radio+768>, 
  packet_tx_data_size = 36, 
  pkt_tx = 0x20002388 <_radio+360>, 
  pkt_release = 0x200023b0 <_radio+400>, 
  packet_tx_count = 5 '\005', 
  packet_tx_first = 1 '\001', 
  packet_tx_last = 1 '\001', 
  packet_release_first = 1 '\001', 
  packet_release_last = 1 '\001', 
  fc_handle = {0, 0, 0}, 
  fc_req = 2 '\002', 
  fc_ack = 2 '\002', 
  fc_ena = 1 '\001', 
  ticks_active_to_start = 0, 
  conn_upd = 0x0 <z_clock_driver_init>
}

Thanks,

Jon Pry


Chettimada, Vinayak Kariappa
 

Hi Jon,

Could you please share a simple application or an upstream Zephyr sample that can reproduce this memory leak?

From the look of it, the controller's rx buffers have been leaked. Is there any chance, any of the threads aborted? (incorrect application code in direct call path of stack's callbacks can crash or stall the stack).

Something that can help me debug is the SHA hash of the Zephyr source code, a simple application that will reproduce it and an approximate duration you want me to wait for the issue be reproduced.

Regards,
Vinayak

-----Original Message-----
From: <devel@lists.zephyrproject.org> on behalf of Jon Pry <jon@stel.life>
Date: Thursday, 15 November 2018 at 5:54 PM
To: "devel@lists.zephyrproject.org" <devel@lists.zephyrproject.org>
Subject: [Zephyr-devel] NRF52 Scanning stops without indication




Hi,

We are running continuous active scan on NRF52 with duplicate filtering disabled. Under some circumstances all scan data stops being generated. This has been difficult to reproduce but is a serious bug for our application. It takes thousands of hours of
device run time for us to create and identify this issue. I was able to capture a memory dump of such a device and am trying to examine it for any indication of the problem. I'm looking primarily at _radio state, but if anyone has any pointers I would appreciate
it. The zephyr version that create this dump is master branch as of 10/26.


(gdb) p 'ctrl.c'::_radio
$20 = {
hf_clock = 0x20007668 <__device_clock_nrf5_m16src>,
entropy = 0x20007680 <__device_entropy_nrf5>,
ticks_anchor = 3215498,
remainder_anchor = 0,
is_k32src_stable = 1 '\001',
ticker_id_prepare = 5 '\005',
ticker_id_event = 0 '\000',
ticker_id_stop = 0 '\000',
role = ROLE_NONE,
state = STATE_NONE,
advertiser = {
hdr = {
ticks_xtal_to_start = 39,
ticks_active_to_start = 0,
ticks_preempt_to_start = 0,
ticks_slot = 150
},
chan_map_current = 0 '\000',
rfu = 0 '\000',
is_hdcd = 0 '\000',
is_enabled = 1 '\001',
phy_p = 1 '\001',
chan_map = 7 '\a',
filter_policy = 0 '\000',
rl_idx = 255 '\377',
adv_data = {
data = {"`\033\070\r\230X't\002\001\004\021\a\236\312\334$\016\345\251\340\223\363\243\265\001\000@n\000\000\000\000\000\000\000\000\000", "`\033\070\r\230X't\002\001\004\021\a\236\312\334$\016\345\251\340\223\363\243\265\001\000@n\000\000\000\000\000\000\000\000\000"},
first = 0 '\000',
last = 0 '\000'
},
scan_data = {
data = {"D\022\070\r\230X't\v\tStarGateΛ", '\000' <repeats 18 times>, "D\022\070\r\230X't\v\tStarGateΛ", '\000' <repeats 18 times>},
first = 0 '\000',
last = 0 '\000'
},
conn = 0x20002220 <_radio>
},
scanner = {
hdr = {
ticks_xtal_to_start = 39,
ticks_active_to_start = 0,
ticks_preempt_to_start = 10,
ticks_slot = 5948
},
is_enabled = 1 '\001',
state = 0 '\000',
chan = 0 '\000',
rfu = 0 '\000',
phy = 0 '\000',
type = 1 '\001',
filter_policy = 0 '\000',
adv_addr_type = 0 '\000',
init_addr_type = 1 '\001',
rpa_gen = 1 '\001',
rl_idx = 255 '\377',
init_addr = "8\r\230X't",
adv_addr = "\344\350\016\036\341", <incomplete sequence \364>,
ticks_window = 5939,
conn_interval = 40,
conn_latency = 0,
conn_timeout = 400,
ticks_conn_slot = 37,
conn = 0x0 <z_clock_driver_init>,
win_offset_us = 2
},
conn_pool = 0x20002220 <_radio>,
conn_free = 0x0 <z_clock_driver_init>,
connection_count = 1 '\001',
conn_curr = 0x0 <z_clock_driver_init>,
packet_counter = 1 '\001',
crc_expire = 0 '\000',
data_chan_map = "\377\377\377\377\037",
data_chan_count = 37 '%',
sca = 5 '\005',
default_tx_octets = 27,
default_tx_time = 328,
default_phy_tx = 3,
default_phy_rx = 3,
pkt_rx_data_pool = 0x200023d8 <_radio+440>,
pkt_rx_data_free = 0x0 <z_clock_driver_init>,
packet_data_octets_max = 27,
packet_rx_data_pool_size = 208,
packet_rx_data_size = 52,
packet_rx_data_count = 4 '\004',
packet_rx = 0x20002374 <_radio+340>,
packet_rx_count = 5 '\005',
packet_rx_last = 0 '\000',
packet_rx_acquire = 2 '\002',
link_rx_pool = 0x200024a8 <_radio+648>,
link_rx_free = 0x200024d0 <_radio+688>,
link_rx_head = 0x200024a8 <_radio+648>,
link_rx_tail = 0x200024a8 <_radio+648>,
link_rx_data_quota = 2 '\002',
pkt_tx_ctrl_pool = 0x200024d8 <_radio+696>,
pkt_tx_ctrl_free = 0x200024d8 <_radio+696>,
pkt_tx_data_pool = 0x20002520 <_radio+768>,
pkt_tx_data_free = 0x20002520 <_radio+768>,
packet_tx_data_size = 36,
pkt_tx = 0x20002388 <_radio+360>,
pkt_release = 0x200023b0 <_radio+400>,
packet_tx_count = 5 '\005',
packet_tx_first = 1 '\001',
packet_tx_last = 1 '\001',
packet_release_first = 1 '\001',
packet_release_last = 1 '\001',
fc_handle = {0, 0, 0},
fc_req = 2 '\002',
fc_ack = 2 '\002',
fc_ena = 1 '\001',
ticks_active_to_start = 0,
conn_upd = 0x0 <z_clock_driver_init>
}

Thanks,

Jon Pry