linux的crash之hardlock排查记录

不羁的心 提交于 2020-03-06 08:11:19

3.10.0-327的内核,crash记录如下:

KERNEL: vmlinux
DUMPFILE: vmcore [PARTIAL DUMP]
CPUS: 48
DATE: Wed Oct 18 20:37:18 2017
UPTIME: 1 days, 09:43:06
LOAD AVERAGE: 13.42, 10.66, 9.48
TASKS: 7329
NODENAME: host-10-229-143-10
RELEASE: 3.10.0-327.22.2.el7.x86_64
VERSION: #1 SMP Fri Sep 29 15:13:08 CST 2017
MACHINE: x86_64 (2199 Mhz)
MEMORY: 383.6 GB
PANIC: "Kernel panic - not syncing: Watchdog detected hard LOCKUP on cpu 10"
PID: 24023
COMMAND: "fas_readwriter"
TASK: ffff882f460a2e00 [THREAD_INFO: ffff882f10c44000]
CPU: 10
STATE: TASK_RUNNING (PANIC)-----------------------------------------R状态死锁,进程长时间处于TASK_RUNNING 状态抢占CPU而不发生切换,一般是,进程关抢占后一直执行任务,或者进程关抢占后处于死循环或者睡眠,此时往往会导致多个CPU互锁,整个系统异常。

crash> bt
PID: 24023 TASK: ffff882f460a2e00 CPU: 10 COMMAND: "fas_readwriter"
#0 [ffff882fbfd459c8] machine_kexec at ffffffff81051c5b
#1 [ffff882fbfd45a28] crash_kexec at ffffffff810f3ec2
#2 [ffff882fbfd45af8] panic at ffffffff816326d1
#3 [ffff882fbfd45b78] watchdog_overflow_callback at ffffffff8111d0e2
#4 [ffff882fbfd45b88] __perf_event_overflow at ffffffff811608d1
#5 [ffff882fbfd45c00] perf_event_overflow at ffffffff811613a4
#6 [ffff882fbfd45c10] intel_pmu_handle_irq at ffffffff81032628
#7 [ffff882fbfd45e60] perf_event_nmi_handler at ffffffff81642bcb
#8 [ffff882fbfd45e80] nmi_handle at ffffffff81642319
#9 [ffff882fbfd45ec8] do_nmi at ffffffff81642430
#10 [ffff882fbfd45ef0] end_repeat_nmi at ffffffff81641753
[exception RIP: put_compound_page+336]-----------------------------------------这个RIP
RIP: ffffffff81178b60 RSP: ffff882f10c47d80 RFLAGS: 00000006
RAX: 006016c60138402c RBX: ffffea0123302a40 RCX: 0000000000000022
RDX: 0000000000000246 RSI: 000000000a6a9000 RDI: ffffea0123300000
RBP: ffff882f10c47d98 R8: ffff882f10c47dc8 R9: ffff882f10c47d74
R10: ffff880000000298 R11: 000000000a6aa000 R12: ffffea0123300000
R13: 0000000000000246 R14: 0000000000000000 R15: ffffea0123302a40
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
--- <NMI exception stack> ---
#11 [ffff882f10c47d80] put_compound_page at ffffffff81178b60
#12 [ffff882f10c47da0] put_page at ffffffff81178bac
#13 [ffff882f10c47db0] get_futex_key at ffffffff810e3c86
#14 [ffff882f10c47e08] futex_wake at ffffffff810e3f1a
#15 [ffff882f10c47e70] do_futex at ffffffff810e6a12
#16 [ffff882f10c47f08] sys_futex at ffffffff810e6f20
#17 [ffff882f10c47f80] system_call_fastpath at ffffffff81649909

 

首先,一般hardlock的触发原因是因为关中断时间太长,那么需要查找对应的堆栈中是否有如此处理,而关中断的常见函数,如spinlock,irq_disable等。

根据堆栈,get_futex_key有一段代码如下:

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
page_head = page;
if (unlikely(PageTail(page))) {
put_page(page);
/* serialize against __split_huge_page_splitting() */
local_irq_disable();-------------------------------------------------------------------------关中断
if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {------------------调用了__get_user_pages_fast
page_head = compound_head(page);
/*
* page_head is valid pointer but we must pin
* it before taking the PG_lock and/or
* PG_compound_lock. The moment we re-enable
* irqs __split_huge_page_splitting() can
* return and the head page can be freed from
* under us. We can't take the PG_lock and/or
* PG_compound_lock on a page that could be
* freed from under us.
*/
if (page != page_head) {
get_page(page_head);
put_page(page);
}
local_irq_enable();
} else {
local_irq_enable();
goto again;
}
}
#else
page_head = compound_head(page);
if (page != page_head) {
get_page(page_head);
put_page(page);
}
#endif

确定下CONFIG_TRANSPARENT_HUGEPAGE 是否配置了:

grep CONFIG_TRANSPARENT_HUGEPAGE /boot/config-3.10.0-327.22.2.el7.x86_64
CONFIG_TRANSPARENT_HUGEPAGE=y

说明已经配置了,反汇编get_futex_key确认下,通过简单搜索__get_user_pages_fast是否编译确认了确实开启了透明巨页。

下一步,需要分析,为什么put_page调用put_compound_page会长时间不返回。

void put_page(struct page *page)
{
if (unlikely(PageCompound(page)))
put_compound_page(page);
else if (put_page_testzero(page))
__put_single_page(page);
}

首先需要获取page的值,这个是个指针,通过反汇编get_futex_key函数,

0xffffffff810e3b53 <get_futex_key+227>: je 0xffffffff810e3c10 <get_futex_key+416>
0xffffffff810e3b59 <get_futex_key+233>: lea -0x38(%rbp),%rcx
0xffffffff810e3b5d <get_futex_key+237>: mov $0x1,%edx
0xffffffff810e3b62 <get_futex_key+242>: mov $0x1,%esi
0xffffffff810e3b67 <get_futex_key+247>: mov %r12,%rdi
0xffffffff810e3b6a <get_futex_key+250>: callq 0xffffffff810653c0 <get_user_pages_fast>
0xffffffff810e3b6f <get_futex_key+255>: cmp $0xfffffff2,%eax
0xffffffff810e3b72 <get_futex_key+258>: jne 0xffffffff810e3b00 <get_futex_key+144>
0xffffffff810e3b74 <get_futex_key+260>: cmpb $0x0,-0x39(%rbp)
0xffffffff810e3b78 <get_futex_key+264>: je 0xffffffff810e3b00 <get_futex_key+144>
0xffffffff810e3b7a <get_futex_key+266>: lea -0x38(%rbp),%rcx -------------取page的地址,然后传参数
0xffffffff810e3b7e <get_futex_key+270>: xor %edx,%edx
0xffffffff810e3b80 <get_futex_key+272>: mov $0x1,%esi
0xffffffff810e3b85 <get_futex_key+277>: mov %r12,%rdi
0xffffffff810e3b88 <get_futex_key+280>: mov $0x1,%r14d
0xffffffff810e3b8e <get_futex_key+286>: callq 0xffffffff810653c0 <get_user_pages_fast>
0xffffffff810e3b93 <get_futex_key+291>: test %eax,%eax
0xffffffff810e3b95 <get_futex_key+293>: jns 0xffffffff810e3b08 <get_futex_key+152>

 在调用get_user_pages_fast前,需要获取page指针用来传参数,所以 -0x38(%rbp)肯定是page指针存放的地址,注意是page指针这个变量的地址,不是page指向的变量的地址。

查看堆栈:bt -f查看堆栈信息:

#13 [ffff882f10c47db0] get_futex_key at ffffffff810e3c86
ffff882f10c47db8: ffff885f706ae400 01ffc9004039e688
ffff882f10c47dc8: ffffea0123302a40 00000000dfe458bb
ffff882f10c47dd8: 000000000a6a9b28 00000000ffffffff
ffff882f10c47de8: 0000000000000001 000000000a6a9b28
ffff882f10c47df8: 0000000000000001 ffff882f10c47e68 ---------------------rbp就存放在这,因为它是除了返回地址之外第一个压栈的。
ffff882f10c47e08: ffffffff810e3f1a
#14 [ffff882f10c47e08] futex_wake at ffffffff810e3f1a---------------------这个是get_futex_key 返回上一层函数的地址,
ffff882f10c47e10: ffffc9004039e680 ffffc9004039e684
ffff882f10c47e20: 0000000000000000 0000000000000000
ffff882f10c47e30: 0000000000000b28 00000000dfe458bb
ffff882f10c47e40: 000000000a6a9b28 0000000000000001
ffff882f10c47e50: 0000000000000035 000000000a6a9b28

rbp存放在ffff882f10c47e00就是rbp存放的地方,然后根据:

0xffffffff810e3a75 <get_futex_key+5>: push %rbp
0xffffffff810e3a76 <get_futex_key+6>: mov %rsp,%rbp-----------------此时rsp就是rbp的值,而rsp此时指向栈顶,即ffff882f10c47e00.

crash> p /x 0xffff882f10c47e00-0x38
$3 = 0xffff882f10c47dc8

通过堆栈可以看出0xffff882f10c47dc8 位置的内容为ffffea0123302a40,它其实就是我们寻找的page指针。

crash> struct page -x ffffea0123302a40
struct page {
flags = 0x6fffff00008000,---------------这个标志,刚好包含tail标志。

使用kmem确定下:

crash> kmem ffffea0123302a40
PAGE PHYSICAL MAPPING INDEX CNT FLAGS
ffffea0123302a40 48cc0a9000 0 0 0 6fffff00008000 tail-----------------有tail标志

获取位于head的page

crash> struct page.first_page -x ffffea0123302a40
first_page = 0xffffea0123300000
crash> struct page -x 0xffffea0123300000
struct page {
flags = 0x6016c60138402c,

crash> kmem 0xffffea0123300000------------------确认当时head的标志如下:
PAGE PHYSICAL MAPPING INDEX CNT FLAGS
ffffea0123300000 48cc000000 ffff882f74a96701 a600 74 6016c60138402c referenced,uptodate,lru,head,swapbacked,unevictable,mlocked,compound_lock

 

下面开始查看put_compound_page的代码。

static void put_compound_page(struct page *page)
{
struct page *page_head;

/*
* We see the PageCompound set and PageTail not set, so @page maybe:
* 1. hugetlbfs head page, or
* 2. THP head page.
*/
if (likely(!PageTail(page))) {
if (put_page_testzero(page)) {
/*
* By the time all refcounts have been released
* split_huge_page cannot run anymore from under us.
*/
if (PageHead(page))
__put_compound_page(page);
else
__put_single_page(page);
}
return;
}

/*
* We see the PageCompound set and PageTail set, so @page maybe:
* 1. a tail hugetlbfs page, or
* 2. a tail THP page, or
* 3. a split THP page.
*
* Case 3 is possible, as we may race with
* __split_huge_page_refcount tearing down a THP page.
*/
page_head = compound_head_by_tail(page);page_head 即为:0xffffea0123300000
if (!__compound_tail_refcounted(page_head))-------------将首页传入作为参数,
put_unrefcounted_compound_page(page_head, page);
else
put_refcounted_compound_page(page_head, page);-----------------最终走的这个流程
}

从代码可以看出,他有两个流程,我们属于第二种,tail页。

解析来,需要判断是走 put_unrefcounted_compound_page 还是走 put_refcounted_compound_page。

static inline bool __compound_tail_refcounted(struct page *page)
{
return !PageSlab(page) && !PageHeadHuge(page);
}

首页的slab标志没有,需要查看PageHeadHuge ,代码如下:

int PageHeadHuge(struct page *page_head)
{
compound_page_dtor *dtor;

if (!PageHead(page_head))
return 0;

dtor = get_compound_page_dtor(page_head);-----------

return dtor == free_huge_page;
}

 

static inline compound_page_dtor *get_compound_page_dtor(struct page *page)--------------获取page的析构函数
{
return (compound_page_dtor *)page[1].lru.next;
}

crash> p ((struct page*)0xffffea0123300000 +1)
$5 = (struct page *) 0xffffea0123300040-------------先偏移一个指针

crash> struct page.lru.next 0xffffea0123300040
lru.next = 0xffffffff81173140 <free_compound_page>,

free_compound_page与free_huge_page 不是同一个函数,所以 PageHeadHuge 返回false,

那么 __compound_tail_refcounted返回true,所以下面需要分析 put_refcounted_compound_page 流程的代码。

 

void put_refcounted_compound_page(struct page *page_head, struct page *page)
{
if (likely(page != page_head && get_page_unless_zero(page_head))) {
unsigned long flags;

 

/*
* Try to grab a ref unless the page has a refcount of zero, return false if
* that is the case.
*/
static inline int get_page_unless_zero(struct page *page)
{
return atomic_inc_not_zero(&page->_count);---------------page_head->_count为74
}

crash> struct page._count 0xffffea0123300000
_count = {
counter = 74
}

那么,当时函数到底执行到哪一行被nmi狗咬死的呢?我们回过头看这个RIP,

[exception RIP: put_compound_page+336]-----------------------------------------这个RIP

反汇编结果:

/usr/src/debug/kernel-3.10.0-327.22.2.el7/linux-3.10.0-327.22.2.el7.x86_64/arch/x86/include/asm/processor.h: 689
0xffffffff81178b5a <put_compound_page+330>: pause
/usr/src/debug/kernel-3.10.0-327.22.2.el7/linux-3.10.0-327.22.2.el7.x86_64/arch/x86/include/asm/bitops.h: 319
0xffffffff81178b5c <put_compound_page+332>: mov (%r12),%rax
/usr/src/debug/kernel-3.10.0-327.22.2.el7/linux-3.10.0-327.22.2.el7.x86_64/include/linux/bit_spinlock.h: 30
0xffffffff81178b60 <put_compound_page+336>: test $0x1000000,%eax

说明调用了bit_spin_lock。

根据之前打印的flag:

crash> kmem 0xffffea0123300000------------------确认当时head的标志如下:
PAGE PHYSICAL MAPPING INDEX CNT FLAGS
ffffea0123300000 48cc000000 ffff882f74a96701 a600 74 6016c60138402c referenced,uptodate,lru,head,swapbacked,unevictable,mlocked,compound_lock

说明PG_compound_lock已经被设置,假设已经被设置的锁,这次再次自旋,岂不是死锁了?倒是挺符合故障现象的,那么,怎么确定这把锁是进来自来获取的,

还是进来之后获取的呢?也就是RIP执行的时候,到底执行到put_refcounted_compound_page下面的哪一行呢?

 

通过走查其他的堆栈,发现很多线程都是一模一样的堆栈,都在等待这把锁,唯独有一个不一样:

收集堆栈信息:

crash> bt -a >caq_bt_all.txt 

过滤put_compound_page 打印如下:

#11 [ffff882f10c47d80] put_compound_page at ffffffff81178b60
[exception RIP: put_compound_page+336]
#4 [ffff882ef1d1bd80] put_compound_page at ffffffff81178b60
[exception RIP: put_compound_page+336]
#4 [ffff882ef29fbd80] put_compound_page at ffffffff81178b60
[exception RIP: put_compound_page+336]
#4 [ffff882ef2d97d80] put_compound_page at ffffffff81178b60
[exception RIP: put_compound_page+332]
#4 [ffff882ef6d93d80] put_compound_page at ffffffff81178b5c
[exception RIP: put_compound_page+332]
#4 [ffff882ef2cb3d80] put_compound_page at ffffffff81178b5c
[exception RIP: put_compound_page+336]
#4 [ffff882ef1eafd80] put_compound_page at ffffffff81178b60
[exception RIP: put_compound_page+332]

然后获取堆栈具体内容:

[root@host-10-229-143-10 127.0.0.1-2017-10-18-20:38:02]# grep put_compound_page -B 10 -A 5 caq_bt_all.txt |grep fas_readwriter |wc -l
34
[root@host-10-229-143-10 127.0.0.1-2017-10-18-20:38:02]# grep put_compound_page -B 10 -A 5 caq_bt_all.txt |grep put_compound_page |wc -l
69

说明有一个堆栈是使用了put_compound_page ,但是rip不是put_compound_page ,找到这个task堆栈如下:

crash> bt 24194
PID: 24194 TASK: ffff882ef4e2e780 CPU: 3 COMMAND: "fas_readwriter"
#0 [ffff882ef3927c00] __schedule at ffffffff8163df9b------------------------------------被调度出去了。
#1 [ffff882ef3927c38] put_compound_page at ffffffff81178af5  ---------------------被调度出去的返回地址
#2 [ffff882ef3927c60] put_page at ffffffff81178bac
#3 [ffff882ef3927c70] get_futex_key at ffffffff810e3cfa
#4 [ffff882ef3927cc8] futex_wait_setup at ffffffff810e46fd
#5 [ffff882ef3927d28] futex_wait at ffffffff810e4940
#6 [ffff882ef3927e70] do_futex at ffffffff810e69ee
#7 [ffff882ef3927f08] sys_futex at ffffffff810e6f20
#8 [ffff882ef3927f80] system_call_fastpath at ffffffff81649909
RIP: 00007f544487379b RSP: 00007f4a59874868 RFLAGS: 00003206
RAX: 00000000000000ca RBX: ffffffff81649909 RCX: 0000000000010000
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 000000000a6a9b28
RBP: 000000000a6a9b28 R8: 0000000000000000 R9: 0000000000005e82
R10: 0000000000000000 R11: 0000000000003246 R12: 00007f5311b49000
R13: 00007f4a59874cd0 R14: 00007f5301dfb000 R15: fffffffeffffffff
ORIG_RAX: 00000000000000ca CS: 0033 SS: 002b

crash> dis -l ffffffff81178af5 4    -----查看从哪被调度出去的,从下面信息看,还是自旋锁,和上面hardlock的锁一样。
/usr/src/debug/kernel-3.10.0-327.22.2.el7/linux-3.10.0-327.22.2.el7.x86_64/include/linux/mm.h: 376
0xffffffff81178af5 <put_compound_page+229>: mov %rax,%r13
/usr/src/debug/kernel-3.10.0-327.22.2.el7/linux-3.10.0-327.22.2.el7.x86_64/arch/x86/include/asm/bitops.h: 199
0xffffffff81178af8 <put_compound_page+232>: lock btsl $0x18,(%r12)
0xffffffff81178aff <put_compound_page+239>: sbb %eax,%eax
/usr/src/debug/kernel-3.10.0-327.22.2.el7/linux-3.10.0-327.22.2.el7.x86_64/include/linux/bit_spinlock.h: 26
0xffffffff81178b01 <put_compound_page+241>: test %eax,%eax

crash> struct task_struct.stack ffff882ef4e2e780
stack = 0xffff882ef3924000

stack变量存储的是thread_info信息

crash> struct thread_info -x 0xffff882ef3924000
struct thread_info {
task = 0xffff882ef4e2e780,
exec_domain = 0xffffffff8197be20 <default_exec_domain>,
flags = 0x88,--------------------换算成2进制就是0x10001000

#define TIF_NEED_RESCHED 3 /* rescheduling necessary */说明设置了这个标志。

#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ 说明设置了这个标志。

 在__schedule函数中,会调用clear_tsk_need_resched来清除这个标志,说明当时还没走到这个。

 

经os同事文洋查看新的内核代码:

commit ddc58f27f9eee9117219936f77e90ad5b2e00e96
Author: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Date: Fri Jan 15 16:52:56 2016 -0800

mm: drop tail page refcounting

Tail page refcounting is utterly complicated and painful to support.

It uses ->_mapcount on tail pages to store how many times this page is
pinned. get_page() bumps ->_mapcount on tail page in addition to
->_count on head. This information is required by split_huge_page() to
be able to distribute pins from head of compound page to tails during
the split.

We will need ->_mapcount to account PTE mappings of subpages of the
compound page. We eliminate need in current meaning of ->_mapcount in
tail pages by forbidding split entirely if the page is pinned.

The only user of tail page refcounting is THP which is marked BROKEN for
now.

Let's drop all this mess. It makes get_page() and put_page() much
simpler.

 代码被重构了,问题记录到此结束。

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!