解析kernel 2.6.24使用NMI中断对Hard lock的处理

CPU的NMI（不可屏蔽中断）常用作Hard lock检测。无论CPU是否lock，硬件始终要保证NMI中断能够被响应。其检测依据是：当CPU发生硬件锁死后，时钟中断无法被响应，导致时钟中断计数值不再变化。

在NMI中断处理时，通过判断当前时钟计数值是否与前一次NMI中断的时钟计数值相同来判断CPU硬件死锁。该过程可以用下图表示：

一. 时钟计数值的处理方式

对于X86，每当时钟中断来临时，kernel都会在中断上下文中对时钟计数器+1。在低分辨率模式时，计数器+1的处理过程如下：

//arch/x86/kernel/time_32.c
/*
 * Timer interrupt handler (excerpt: everything after the counter
 * update is elided in this listing).
 *
 * The only step shown is the per-CPU irq0_irqs increment, one of the
 * two counters that nmi_watchdog_tick() sums to decide whether timer
 * interrupts are still being delivered on this CPU.
 */
irqreturn_t timer_interrupt(int irq, void *dev_id)
{
	/* Keep nmi watchdog up to date */
	per_cpu(irq_stat, smp_processor_id()).irq0_irqs++;

        //... (rest of the handler elided)
}

开启高分辨率模式后，在0xef本地APIC中断处理中对时钟计数值+1，其处理如下：

//arch/x86/kernel/apic_32.c
/*
 * The guts of the apic timer interrupt (excerpt: part of the body is
 * elided in this listing).
 *
 * Bumps the current CPU's apic_timer_irqs counter and then invokes the
 * event handler installed on that CPU's lapic clock event device.
 * apic_timer_irqs is the second counter that nmi_watchdog_tick() sums.
 */
static void local_apic_timer_interrupt(void)
{
	int cpu = smp_processor_id();
	struct clock_event_device *evt = &per_cpu(lapic_events, cpu);
	//.... (elided)
	per_cpu(irq_stat, cpu).apic_timer_irqs++;  // count this local APIC timer tick
	evt->event_handler(evt);
}

二. NMI中断的处理

NMI中断号为2，其中断门在trap_init函数中设置，中断入口函数是nmi。该函数是汇编函数，定义在arch/x86/kernel/entry_32.S中（为了突出重要部分，其中省略了部分内容）：

/* arch/x86/kernel/entry_32.S */
/* NMI is doubly nasty. It can happen _while_ we're handling
 * a debug fault, and the debug fault hasn't yet been able to
 * clear up the stack. So we first check whether we got  an
 * NMI on the sysenter entry path, but after that we need to
 * check whether we got an NMI on the debug path where the debug
 * fault happened on the sysenter path.
 *
 * NOTE: this listing is an excerpt. The "..." lines stand for code
 * that was left out, so not every jump target or label referenced
 * below is visible here.
 */
KPROBE_ENTRY(nmi)
	RING0_INT_FRAME
	pushl %eax
	CFI_ADJUST_CFA_OFFSET 4
	...
	je nmi_debug_stack_check
nmi_stack_correct:
	/* We have a RING0_INT_FRAME here */
	# NOTE(review): the article labels the next push as "the interrupt
	# number"; %eax is not loaded with a vector in the code shown, so
	# this presumably just fills the slot ahead of the SAVE_ALL frame
	# (orig_eax in pt_regs) -- confirm against the full source.
	pushl %eax
	CFI_ADJUST_CFA_OFFSET 4
	SAVE_ALL                             # save the register context
	xorl %edx,%edx		# zero error code: 2nd argument (error_code) of do_nmi
	movl %esp,%eax		# pt_regs pointer: 1st argument (regs) of do_nmi
	call do_nmi             # enter the C-level NMI handler
	jmp restore_nocheck_notrace
	CFI_ENDPROC

nmi_stack_fixup:
	RING0_INT_FRAME
	FIX_STACK(12,nmi_stack_correct, 1)
	jmp nmi_stack_correct
        ...
1:	INTERRUPT_RETURN
	CFI_ENDPROC
	# Exception-table entry pairing the instruction at label 1 with iret_exc.
.section __ex_table,"a"
	.align 4
	.long 1b,iret_exc
.previous
KPROBE_END(nmi)

NMI中断处理入口首先进行一些栈检查和栈修正处理。然后将eax入栈（此时eax中是被中断现场的寄存器值，并非中断号，该入栈用于占据pt_regs中orig_eax的位置），用SAVE_ALL保存寄存器现场，再将函数实参分别写入eax（regs）和edx（error_code），调用do_nmi函数开始处理。do_nmi函数定义如下：

//arch/x86/kernel/traps_32.c
/*
 * C-level NMI handler, called from the nmi assembly entry stub.
 *
 * @regs:       saved register frame built by the entry stub.
 * @error_code: passed in by the stub (zeroed there); not used in this body.
 *
 * Brackets the work with nmi_enter()/nmi_exit(), increments the NMI
 * count of the current CPU and, unless ignore_nmis is set, delegates
 * to default_do_nmi().
 */
fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code)
{
	int cpu;
	nmi_enter();
	cpu = smp_processor_id();
	++nmi_count(cpu);
	if (!ignore_nmis)
		default_do_nmi(regs);  // do the actual NMI handling

	nmi_exit();
}

do_nmi函数没什么可说明的，我们感兴趣其中的default_do_nmi函数，该函数定义如下：

//arch/x86/kernel/traps_32.c
/*
 * Dispatch an NMI according to its reason byte.
 *
 * The reason byte is read with get_nmi_reason() only on CPU 0; on any
 * other CPU it stays 0.  If neither of the 0xc0 bits is set, the NMI is
 * offered to the DIE_NMI_IPI notifiers, then (with
 * CONFIG_X86_LOCAL_APIC) to the NMI watchdog and do_nmi_callback(), and
 * finally reported through unknown_nmi_error().  Otherwise the DIE_NMI
 * notifiers run first, bit 0x80 is handled by mem_parity_error(),
 * bit 0x40 by io_check_error(), and the NMI is reasserted.
 */
static __kprobes void default_do_nmi(struct pt_regs * regs)
{
	unsigned char reason = 0;
	/* Only the BSP gets external NMIs from the system.  */
	if (!smp_processor_id())
		reason = get_nmi_reason();
	/* Neither the 0x80 nor the 0x40 reason bit is set. */
	if (!(reason & 0xc0)) {
		if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
							== NOTIFY_STOP)
			return;
		/*
		 * Note the preprocessor layout: with CONFIG_X86_LOCAL_APIC
		 * the unknown_nmi_error() call below is the body of the
		 * do_nmi_callback() test; without it the call is
		 * unconditional.
		 */
#ifdef CONFIG_X86_LOCAL_APIC
		/* Ok, so this is none of the documented NMI sources,
		 * so it must be the NMI watchdog. */
		if (nmi_watchdog_tick(regs, reason))  // hard-lockup check happens in here
			return;
		if (!do_nmi_callback(regs, smp_processor_id()))
#endif
			unknown_nmi_error(reason, regs);
		return;
	}
	if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
		return;
	if (reason & 0x80)
		mem_parity_error(reason, regs);
	if (reason & 0x40)
		io_check_error(reason, regs);
	/* Reassert NMI in case it became active meanwhile
	 * as it's edge-triggered. */
	reassert_nmi();
}

在配置了CONFIG_X86_LOCAL_APIC的情况下，default_do_nmi会调用nmi_watchdog_tick，后者是判断hard lock的关键函数（定义在arch/x86/kernel/nmi_32.c中），直接进入函数定义：


/*
 * NMI watchdog tick (excerpt: the "..." lines mark elided code).
 *
 * Compares the current CPU's combined timer-interrupt count with the
 * value recorded at an earlier NMI.  While the count stays unchanged
 * (and touched is 0), alert_counter[cpu] is incremented; when it
 * reaches 5*nmi_hz, die_nmi() is called.  Any change in the count
 * records the new sum and resets alert_counter[cpu].
 *
 * Returns rc, which is initialised to 0 here; the elided code may
 * change it (and touched) -- not visible in this excerpt.
 */
__kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
{
	/* Since current_thread_info()-> is always on the stack, and we
	 * always switch the stack NMI-atomically, it's safe to use
	 * smp_processor_id(). */
	unsigned int sum;
	int touched = 0;
	int cpu = smp_processor_id();
	int rc=0;
	...
	if (cpu_isset(cpu, backtrace_mask)) {  // backtrace was requested for this CPU
		...
	}
	/* Take the local apic timer and PIT/HPET into account. We don't
	 * know which one is active, when we have highres/dyntick on */
	sum = per_cpu(irq_stat, cpu).apic_timer_irqs +
		per_cpu(irq_stat, cpu).irq0_irqs;

	/* if none of the timers is firing, this cpu isn't doing much */
	if (!touched && last_irq_sums[cpu] == sum) {
		/*
		 * Ayiee, looks like this CPU is stuck ...
		 * wait a few IRQs (5 seconds) before doing the oops ...
		 */
		alert_counter[cpu]++;
		if (alert_counter[cpu] == 5*nmi_hz)
			/* die_nmi will return ONLY if NOTIFY_STOP happens..*/
			die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP");
	} else {
		/* Timer count moved (or touched is set): record it and start over. */
		last_irq_sums[cpu] = sum;
		alert_counter[cpu] = 0;
	}
	.....
	return rc;
}

在该函数内，首先通过notify_die(DIE_NMI)通知die通知链上注册的处理者（若有处理者返回NOTIFY_STOP，则认为该NMI已被处理，并置touched为1），再判断时钟计数器。这里使用了一个技巧：始终将低分辨率时钟计数值（irq0_irqs）+高分辨率时钟计数值（apic_timer_irqs）作为时钟计数值，这样无论是否开启高分辨率时钟模式，该值一定能反映时钟计数的变化。

若相邻两次NMI的时钟计数值不变，则怀疑发生了hard lock。为进一步确认，kernel会继续观察约5s（即alert_counter累计到5*nmi_hz次NMI），若这期间该状况一直保持不变，则进入die_nmi执行oops。否则更新上一次时钟计数值last_irq_sums并将alert_counter清零，至此hard lock判断完毕。

参考文档：

https://www.cnblogs.com/muahao/p/7595158.html （解释hard lock和soft lock）

《Professional Linux Kernel Architecture》 chapter14.1

Intel 64 and IA-32 Architectures Software Developer's Manual

来源：CSDN

作者：zhaojia92

链接：https://blog.csdn.net/zhaojia92/article/details/104731185

标签

中断处理

kernel