01 背景
小概率出现 kernel 异常重启问题:3 个月内共出现 3 例因 Linux kernel 访问异常地址而导致重启的问题。通过场景分析构造出测试场景,并打开 ramdump 配置进行复测,最终抓到了问题现场。
02 crash 解析 ramdump
crash ./vmlinux /dev/random@0x80000000,DDRCS0-1.bin@0x80970000,DDRCS0-2.bin@0x100970000,DDRCS0-3.bin@0x140970000,DDRCS0-4.bin@0x180970000,DDRCS0-5.bin@0x1c0970000,DDRCS0-6.bin@0x200970000,DDRCS0-7.bin@0x240970000 --machdep vabits_actual=48
2.1 查看出错 log
通过 dmesg 命令抓取 log 缓存区,找到出错的日志和调用栈:
crash> dmesg
...
[ 1134.509848] ==================================================================
[ 1134.509888] BUG: KASAN: user-memory-access in cimdma_swap_buffer+0x2a4/0x4e0 [hobot_cim_dma]
[ 1134.510057] Write of size 1 at addr 0000050100000780 by task ipi6_thread/4052
[ 1134.510080]
[ 1134.510092] CPU: 1 PID: 4052 Comm: ipi6_thread Tainted: P O 5.10.59-rt52-gbdf2977878dd-dirty #2
[ 1134.510119] Hardware name: Horizon AI Technologies, Inc. HOBOT j5 RHODE B2 & C & Ca & Cb & Cc & Cd & Ce (DT)
[ 1134.510138] Call trace:
[ 1134.510147] dump_backtrace+0x0/0x2e0
[ 1134.510192] show_stack+0x14/0x20
[ 1134.510224] dump_stack+0xf8/0x160
[ 1134.510256] kasan_report+0x1a8/0x200
[ 1134.510284] __asan_store1+0x9c/0xa8
[ 1134.510308] cimdma_swap_buffer+0x2a4/0x4e0 [hobot_cim_dma]
[ 1134.510443] kthread+0x258/0x260
[ 1134.510475] ret_from_fork+0x10/0x1c
[ 1134.510504] ==================================================================
...
[ 1134.690628] CPU: 6 PID: 4052 Comm: ipi6_thread Tainted: P B O 5.10.59-rt52-gbdf2977878dd-dirty #2
[ 1134.693905] Hardware name: Horizon AI Technologies, Inc. HOBOT j5 RHODE B2 & C & Ca & Cb & Cc & Cd & Ce (DT)
[ 1134.695165] pstate: 40c00005 (nZcv daif +PAN +UAO -TCO BTYPE=--)
[ 1134.695951] pc : cimdma_swap_buffer+0x2a4/0x4e0 [hobot_cim_dma]
[ 1134.696841] lr : cimdma_swap_buffer+0x2a4/0x4e0 [hobot_cim_dma]
[ 1134.697726] sp : ffff0001c0e5fc50
[ 1134.698163] pmr_save: 000000e0
[ 1134.698566] x29: ffff0001c0e5fc50 x28: 0000000000000000
[ 1134.699273] x27: 0000000000000000 x26: 0000000000000000
[ 1134.699977] x25: 000000000000004c x24: 0000050100000780
[ 1134.700682] x23: ffff000180e0b3c0 x22: ffff00020d4c0a50
[ 1134.701390] x21: ffff000180e0b1b0 x20: ffff000180e0b248
[ 1134.702099] x19: ffff000180e0b120 x18: 0000000000000000
[ 1134.702805] x17: 0000000000000000 x16: 0000000000000000
[ 1134.703509] x15: 0000000000000000 x14: 3d3d3d3d3d3d3d3d
[ 1134.704215] x13: 3d3d3d3d3d3d3d3d x12: ffff9400025cf1cf
[ 1134.704923] x11: 1ffff400025cf1ce x10: ffff9400025cf1ce
[ 1134.705631] x9 : dfffa00000000000 x8 : ffffa00012e78e70
[ 1134.706340] x7 : 0000000000000001 x6 : ffffa00012e78e70
[ 1134.707045] x5 : 00006bfffda30e32 x4 : dfffa00000000000
[ 1134.707754] x3 : ffffa00010c3c6a8 x2 : 0000000000000007
[ 1134.708459] x1 : ffff00017d1ec4c0 x0 : 0000000000000001
[ 1134.709166] Call trace:
[ 1134.709497] cimdma_swap_buffer+0x2a4/0x4e0 [hobot_cim_dma]
[ 1134.710340] kthread+0x258/0x260
[ 1134.710787] ret_from_fork+0x10/0x1c
[ 1134.711288] Code: f94033e1 8b190300 387b4839 95d23efc (383c4b19)
通过日志中的信息可知,出错位置是 cimdma_swap_buffer+0x2a4,以及出错的线程 ipi6_thread=>pipeline 8。
2.2 定位出错代码行
加载 hobot_cim_dma.ko 符号表,并反汇编 cimdma_swap_buffer, 找到偏移是 0x2a4(676)的指令行:
//加载符号表
crash> mod -s hobot_cim_dma /home/kaikai.sun/cimdma_ramdump/symbols/kernel/hobot_cim_dma.ko
MODULE NAME BASE SIZE OBJECT FILE
ffffa00008d97d80 hobot_cim_dma ffffa00008d70000 184320 /home/kaikai.sun/cimdma_ramdump/symbols/kernel/hobot_cim_dma.ko
//反汇编
crash> dis -l cimdma_swap_buffer
/home/ycj/work/adnoa/software/adpro_j5_acore_public_origin/kernel/drivers/media/platform/hobot/cim_dma/hobot_cim_dma_ops.c: 1124
0xffffa00008d724a4: ldr x0, [sp, #120]
0xffffa00008d724a8: bl 0xffffa00010202488 <__asan_load8>
0xffffa00008d724ac: ldr x0, [x22, #176]
0xffffa00008d724b0: str x0, [sp, #96]
0xffffa00008d724b4: ldr x0, [sp, #104]
0xffffa00008d724b8: bl 0xffffa00010202488 <__asan_load8>
0xffffa00008d724bc: ldr x0, [sp, #96]
0xffffa00008d724c0: ldr x24, [x23, #168]
0xffffa00008d724c4: add x0, x0, x26
0xffffa00008d724c8: bl 0xffffa00010202020 <__asan_load1>
0xffffa00008d724cc: ldr x1, [sp, #96]
0xffffa00008d724d0: add x0, x24, x25
0xffffa00008d724d4: ldrb w25, [x1, w27, uxtw]
0xffffa00008d724d8: bl 0xffffa000102020c8 <__asan_store1>
0xffffa00008d724dc: strb w25, [x24, w28, uxtw]
出错的指令行是 strb w25, [x24, w28, uxtw]。结合 log 中的寄存器信息 x24: 0000050100000780(与 KASAN 报告的出错地址 addr 0000050100000780 一致),说明该指令确实是出错点。
再来看 X24 是怎么赋值的:ldr x24, [x23, #168]。结合 log 中的寄存器信息和代码 hobot_cim_dma_ops.c:1124:
x23: ffff000180e0b3c0
emb_frame->buffer.info.addr[0][i] = frame->buffer.info.addr[1][i/2];
可知 X23 中保存的值便是 emb_frame 指针。
2.3 查看出错内存信息
查看 emb_frame 的内存信息:
crash> struct vio_frame 0xffff000180e0b3c0
struct vio_frame {
  list = {
    next = 0xffff00020d538000,
    prev = 0xffff00020d538a50
  },
  work = {
    node = {
      next = 0xffff000180e0b3d0,
      prev = 0xffff000180e0b3d0
    },
    func = 0xffff000180e0b3e0,
    worker = 0xffff000180e0b3e0,
    canceling = -2132757520
  },
  group = 0xffff000180e0b3f0,
  buffer = {
    ion_alloced = 30 '\036',
    ion_cached = 0 '\000',
    ion_cachesync = 0 '\000',
    consecutive_mode = 0 '\000',
    ion_mmap = 128 '\200',
    planeSize = {1281, 0, 1},
    info = {
      index = 0,
      planecount = 1,
      share_id = {0, 0, 0},
      planeSize = {1, 0, 1},
      paddr = {8589934595, 0, 4294967296},
      addr = {0x50100000780, 0x1e, 0x6}
    },
....
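从上面按 struct vio_frame 解析出的内容来看,各字段的数值基本上都不对(例如 addr[0] = 0x50100000780 并不是合法的内核地址),说明该地址处存放的并不是一个真正的 vio_frame。再查看 struct vio_frame 的布局以及该地址附近的原始内存:
crash> ptype /o struct vio_frame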
/* offset | size */ type = struct vio_frame {
/* 0 | 16 */ struct list_head {
/* 0 | 8 */ struct list_head *next;
/* 8 | 8 */ struct list_head *prev;
....
/* 488 | 12 */ u32 paddr_buffer[3];
/* 500 | 4 */ enum vio_frame_state state;
/* 504 | 4 */ u32 instance;
/* 508 | 4 */ u32 fcount;
/* 512 | 4 */ u32 index;
/* 516 | 2 */ u16 dispatch_cnt;
/* 518 | 1 */ u8 iommu_map;
/* 519 | 1 */ u8 remote_buf;
/* 520 | 8 */ void *ext_data;
/* total size (bytes): 528 */
}
crash> rd ffff000180e0b3c0 -e ffff000180e0b600
ffff000180e0b3c0: ffff00020d538000 ffff00020d538a50 ..S.....P.S.....
ffff000180e0b3d0: ffff000180e0b3d0 ffff000180e0b3d0 ................
ffff000180e0b3e0: ffff000180e0b3e0 ffff000180e0b3e0 ................
ffff000180e0b3f0: ffff000180e0b3f0 ffff000180e0b3f0 ................
ffff000180e0b400: 000007800000001e 0000000000000501 ................
ffff000180e0b410: 0000000000000000 0000000000000001 ................
ffff000180e0b420: 0000000100000000 0000000000000000 ................
ffff000180e0b430: 0000000100000000 0000000000000001 ................
ffff000180e0b440: 0000000000000000 0000000000000001 ................
ffff000180e0b450: 0000000200000003 0000000000000000 ................
ffff000180e0b460: 0000000100000000 0000050100000780 ................
ffff000180e0b470: 000000000000001e 0000000000000006 ................
ffff000180e0b480: 0000000000000000 0000000000000001 ................
ffff000180e0b490: 0000000100000001 0000000100000780 ................
ffff000180e0b4a0: 0000000000000000 0000000000000000 ................
ffff000180e0b4b0: 0000000000000000 0000000000000000 ................
ffff000180e0b4c0: 1234567800000000 0000000000000000 ....xV4.........
ffff000180e0b4d0: 0000000000000000 0000000000000000 ................
ffff000180e0b4e0: 0000000000000000 0000000000000000 ................
ffff000180e0b4f0: 0000000000000000 0000000000000000 ................
ffff000180e0b500: 0000000000000000 0000000000000000 ................
ffff000180e0b510: 0000000000000000 0000000000000000 ................
ffff000180e0b520: 0000000000000000 00000000000080ac ................
ffff000180e0b530: 000064b81c513364 0000000064b81c50 d3Q..d..P..d....
ffff000180e0b540: 00000000000e0011 0006000001010100 ................
ffff000180e0b550: 0000000000010001 0000000000000002 ................
ffff000180e0b560: 000080ad000080ad 0000000064b81c50 ........P..d....
ffff000180e0b570: 0000001c0000001e 0000000000000001 ................
ffff000180e0b580: 0000000064b81c50 0000001b0000001e P..d............
ffff000180e0b590: 0000000000000001 0000000000000000 ................
ffff000180e0b5a0: 0000000000000000 0000000000000000 ................
ffff000180e0b5b0: 0000000000000001 ffff00017d1ec4c0 ...........}....
ffff000180e0b5c0: 0000000000000000 0000000000000000 ................
ffff000180e0b5d0: 0000000000000000 0000000000000000 ................
ffff000180e0b5e0: 0000000000000000 0000000000000000 ................
ffff000180e0b5f0: ffff000180e0b5f0 ffff000180e0b5f0 ................
2.4 找到出错地址的保存位置
通过查看 0xffff000180e0b3c0 前后的内存信息,可以看出这里保存的是用户设置的配置信息;这些信息都保存在 struct cimdma_subdev 中,而 cimdma_subdev 是 struct j5_cimdma_dev 的成员变量。
struct j5_cimdma_dev {
    /* j5 cimdma information */
    struct platform_device *pdev;
    void __iomem *base_reg;
    resource_size_t regs_start;
    resource_size_t regs_end;
    s32 irq;
    unsigned long state;
    struct class *class;
    struct cdev cdev;
    dev_t devno;
    ...
    struct cimdma_subdev subdev[VIO_MAX_STREAM];
    struct vio_group *group[VIO_MAX_STREAM];
    struct vio_group_task gtask[VIO_MAX_STREAM];
    ...
}
//通过静态变量g_cimdma找到struct j5_cimdma_dev指针
crash> g_cimdma
g_cimdma = $2 = (struct j5_cimdma_dev *) 0xffff000180e08080
crash> struct j5_cimdma_dev 0xffff000180e08080 -o
struct j5_cimdma_dev {
  [ffff000180e08080] struct platform_device *pdev;
  [ffff000180e08088] void *base_reg;
  [ffff000180e08090] resource_size_t regs_start;
  [ffff000180e08098] resource_size_t regs_end;
  [ffff000180e080a0] s32 irq;
  [ffff000180e080a8] unsigned long state;
  [ffff000180e080b0] struct class *class;
  [ffff000180e080b8] struct cdev cdev;
  [ffff000180e08120] dev_t devno;
  [ffff000180e08124] atomic_t instance;
  [ffff000180e08128] atomic_t rsccount;
  [ffff000180e0812c] atomic_t open_cnt;
  [ffff000180e08130] u32 sw_drop_count[16];
  [ffff000180e08170] u32 hw_drop_count[16];
  [ffff000180e081b0] raw_spinlock_t raw_slock;
  [ffff000180e081b8] struct mutex mlock;
  [ffff000180e081e0] atomic_t sensor_fcount[8];
  [ffff000180e08200] atomic_t backup_fcount[8];
  [ffff000180e08220] atomic_t enable_cnt[8];
  [ffff000180e08240] u32 cur_output_flag[8];
  [ffff000180e08260] struct cimdma_subdev subdev[16];
  [ffff000180e0dfe0] struct vio_group *group[16];
  [ffff000180e0e060] struct vio_group_task gtask[16];
  [ffff000180e0e760] u32 fusa_enable;
  [ffff000180e0e768] u64 jiffi;
  [ffff000180e0e770] struct vio_stl stl;
  [ffff000180e0e7a8] u32 last_frameid[8];
  [ffff000180e0e7c8] u32 error_cnt[8];
}
SIZE: 26472
由于通过之前的 log 已知出错的通路是 pipeline 8,对应的结构体是 subdev[8],下一步查看 subdev[8]内存信息。
crash> struct cimdma_subdev ffff000180e08260 -o 9
....
struct cimdma_subdev {
  [ffff000180e0b120] struct vio_subdev vdev;
  [ffff000180e0b300] struct j5_cimdma_dev *cimdma;
  [ffff000180e0b308] wait_queue_head_t done_wq;
  [ffff000180e0b348] struct vio_framemgr emb_fmgr;
  [ffff000180e0b400] cim_dma_cfg_t cim_cfg;
  [ffff000180e0b4c8] struct frame_info preint_info;
  [ffff000180e0b548] u8 initial_frameid;
  [ffff000180e0b549] u8 yuv_format;
  [ffff000180e0b54a] u8 embeded_data;
  [ffff000180e0b54b] u8 embeded_dependence;
  [ffff000180e0b54c] u8 embeded_start_cnt;
  [ffff000180e0b54d] u8 pack_mode;
  [ffff000180e0b54e] u8 ipi_index;
  [ffff000180e0b54f] u8 tpg_en;
  [ffff000180e0b550] u8 reqbuf_flag;
  [ffff000180e0b551] u8 stop_flag;
  [ffff000180e0b552] u8 start_flag;
  [ffff000180e0b554] u32 cnt_shift;
  [ffff000180e0b558] u32 irq_status;
  [ffff000180e0b55c] u32 force_drop;
  [ffff000180e0b560] u32 sw_frameid;
  [ffff000180e0b564] u32 last_hw_frameid;
  [ffff000180e0b568] struct fps_debug fps[2];
  [ffff000180e0b598] fps_ctrl_t fps_ctrl;
  [ffff000180e0b5b0] u32 thread_run;
  [ffff000180e0b5b8] struct task_struct *cimdma_thread;
  [ffff000180e0b5c0] wait_queue_head_t cimdma_done_wq;
  [ffff000180e0b600] struct completion stop_complete;
  [ffff000180e0b620] struct vio_drop_mgr drop_mgr;
}
SIZE: 1496
X23:ffff000180e0b3c0 是在[ffff000180e0b348]struct vio_framemgr emb_fmgr 内,下一步查看 emb_fmgr 结构体信息。
crash> struct vio_framemgr ffff000180e0b348 -o
struct vio_framemgr {
  [ffff000180e0b348] u32 id;
  [ffff000180e0b34c] raw_spinlock_t raw_slock;
  [ffff000180e0b350] spinlock_t slock;
  [ffff000180e0b380] ulong sindex;
  [ffff000180e0b388] u32 num_frames;
  [ffff000180e0b38c] u32 num_buffers;
  [ffff000180e0b390] struct vio_frame *frames;
  [ffff000180e0b398] u32 queued_count[5];
  [ffff000180e0b3b0] struct list_head queued_list[5];
}
SIZE: 184
crash> list ffff000180e0b3f0
ffff000180e0b3f0
crash> struct list_head ffff000180e0b3b0 -o 5
struct list_head {
  [ffff000180e0b3b0] struct list_head *next;
  [ffff000180e0b3b8] struct list_head *prev;
}
SIZE: 16
struct list_head {
  [ffff000180e0b3c0] struct list_head *next;
  [ffff000180e0b3c8] struct list_head *prev;
}
SIZE: 16
struct list_head {
  [ffff000180e0b3d0] struct list_head *next;
  [ffff000180e0b3d8] struct list_head *prev;
}
SIZE: 16
struct list_head {
  [ffff000180e0b3e0] struct list_head *next;
  [ffff000180e0b3e8] struct list_head *prev;
}
SIZE: 16
struct list_head {
  [ffff000180e0b3f0] struct list_head *next;
  [ffff000180e0b3f8] struct list_head *prev;
}
SIZE: 16
2.5 定位原因
由此可知 X23:ffff000180e0b3c0 是 queued_list[1] 的起始地址。queued_list 是 5 个队列各自的 list head 组成的数组,其中 queued_list[1] 是 FS_REQUEST 队列。对应到代码中,emb_frame 是通过 peek_frame 从 FS_REQUEST 队列中获取的,也就是说 peek_frame 拿到的并不是一个真正的 vio_frame,而是 FS_REQUEST 队列的链表头(list head)本身。
static void cimdma_separate_embedded_data(struct cimdma_subdev *subdev)
{
    ....
    emb_fmgr = &subdev->emb_fmgr;
    vio_e_barrier_irqs(emb_fmgr, flags);/*PRQA S 2996*/
    emb_frame = peek_frame(emb_fmgr, FS_REQUEST);
    vio_x_barrier_irqr(emb_fmgr, flags);/*PRQA S 2996*/
    if (emb_frame == NULL) {
        vio_err("[S%d] emb FS_REQUEST queue has no member;\n", group->instance);
        framemgr_print_queues(emb_fmgr);
        return;
    }
    emb_frame->frameinfo.frame_id = frame->frameinfo.frame_id;
    emb_frame->frameinfo.timestamps = frame->frameinfo.timestamps;
    emb_frame->frameinfo.tv_sec = frame->frameinfo.tv_sec;
    emb_frame->frameinfo.tv_usec = frame->frameinfo.tv_usec;
    vio_frame_sync_for_cpu(frame);
    for (i = 0; i < subdev->cim_cfg.embeded_width; i++) {
        if (i % 2 == 0)
            emb_frame->buffer.info.addr[0][i] = frame->buffer.info.addr[1][i/2];
        else
            emb_frame->buffer.info.addr[0][i] = frame->buffer.info.addr[0][i/2];
    }
    ...
}
通过以上分析可知,emb_fmgr 的链表操作存在问题,应该是锁保护出了异常。重新 review 代码,发现确实是锁用错了:在对 emb_fmgr 的链表进行操作时,使用的却是 framemgr 的 spinlock,而不是 emb_fmgr 自己的 spinlock,因此 emb_fmgr 的链表实际上没有得到保护。
static void cimdma_separate_embedded_data(struct cimdma_subdev *subdev)
{
    ...
    vio_e_barrier_irqs(framemgr, flags);/*PRQA S 2996*/
    trans_frame(emb_fmgr, emb_frame, FS_COMPLETE);
    vio_x_barrier_irqr(framemgr, flags);/*PRQA S 2996*/
    wake_up(&subdev->done_wq);
}
修改此处的锁异常(即在操作 emb_fmgr 链表时改为使用 emb_fmgr 自己的锁),便能从根本上修复该问题。
03 结论与反思
用锁保护临界资源是解决多进程并发问题的常用手段,但是锁保护的范围是否正确,一直没有有效的手段进行检查。在后续的项目或者芯片平台上,使用锁保护时需要增加注释(说明该锁保护的是哪些资源),方便自己和其他同学检查,减少出错概率。