System Environment
- OS version: CentOS release 6.5
- Kernel version: 2.6.32-431.20.3.el6.x86_64
- Filesystem: XFS
Problem Description
The system panicked and printed the following call trace:
kvm: 16396: cpu1 unhandled wrmsr: 0x391 data 2000000f
BUG: scheduling while atomic: qemu-system-x86/27122/0xffff8811
BUG: unable to handle kernel paging request at 00000000dd7ed3a8
IP: [<ffffffff81058e5d>] task_rq_lock+0x4d/0xa8
PGD 0
Oops: 0000 [#1] SMP
last sysfs file: /sys/devices/pci0000:00/0000:00:02.2/0000:04:00.0/host0/target0:2:1/0:2:1/block/sdb/queue/logical_block_size
...
[<ffffffff81058e5d>] ? task_rq_lock+0x4d/0xa0
[<ffffffff8106195c>] ? try_to_wake_up+0x3c/0x3e0
[<ffffffff81061d55>] ? wake_up_process+0x15/0x20
[<ffffffff810a0f62>] ? __up+0x2a/0x40
[<ffffffffa03394c2>] ? xfs_buf_unlock+0x32/0x90 [xfs]
[<ffffffffa030297f>] ? xfs_buf_item_unpin+0xcf/0x1a0 [xfs]
[<ffffffffa032f18c>] ? xfs_trans_committed_bulk+0x29c/0x2b0 [xfs]
[<ffffffff81069f15>] ? enqueue_entity+0x125/0x450
[<ffffffff81060aa3>] ? perf_event_task_sched_out+0x33/0x70
[<ffffffff81069973>] ? dequeue_entity+0x113/0x2e0
[<ffffffffa032326d>] ? xlog_cil_committed+0x3d/0x100 [xfs]
[<ffffffffa031f79d>] ? xlog_state_do_callback+0x15d/0x2b0 [xfs]
[<ffffffffa031f96e>] ? xlog_state_done_syncing+0x7e/0xb0 [xfs]
[<ffffffffa03200e9>] ? xlog_iodone+0x59/0xb0 [xfs]
[<ffffffffa033ae50>] ? xfs_buf_iodone_work+0x0/0x50 [xfs]
[<ffffffffa033ae76>] ? xfs_buf_iodone_work+0x26/0x50 [xfs]
Error Tracing
unable to handle kernel paging request at 00000000dd7ed3a0
00000000dd7ed3a0 is a user-space address, which the kernel should normally never touch, so this can be classified as a kernel bug.
IP: [<ffffffff81058e5d>] task_rq_lock+0x4d/0xa8
Since kdump was not deployed on this system, the faulting instruction address could only be traced further through static analysis with objdump.
ffffffff81058e10 <task_rq_lock>:
 * interrupts. Note the ordering: we can safely lookup the task_rq without
 * explicitly disabling preemption.
 */
static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
    __acquires(rq->lock)
{
ffffffff81058e10:   55                      push   %rbp
ffffffff81058e11:   48 89 e5                mov    %rsp,%rbp
ffffffff81058e14:   48 83 ec 20             sub    $0x20,%rsp
ffffffff81058e18:   48 89 1c 24             mov    %rbx,(%rsp)
ffffffff81058e1c:   4c 89 64 24 08          mov    %r12,0x8(%rsp)
ffffffff81058e21:   4c 89 6c 24 10          mov    %r13,0x10(%rsp)
ffffffff81058e26:   4c 89 74 24 18          mov    %r14,0x18(%rsp)
ffffffff81058e2b:   e8 10 1f fb ff          callq  ffffffff8100ad40 <mcount>
ffffffff81058e30:   48 c7 c3 40 68 01 00    mov    $0x16840,%rbx
ffffffff81058e37:   49 89 fc                mov    %rdi,%r12
ffffffff81058e3a:   49 89 f5                mov    %rsi,%r13
ffffffff81058e3d:   ff 14 25 80 8b a9 81    callq  *0xffffffff81a98b80
ffffffff81058e44:   48 89 c2                mov    %rax,%rdx
    PVOP_VCALLEE1(pv_irq_ops.restore_fl, f);
}

static inline void raw_local_irq_disable(void)
{
    PVOP_VCALLEE0(pv_irq_ops.irq_disable);
ffffffff81058e47:   ff 14 25 90 8b a9 81    callq  *0xffffffff81a98b90
    struct rq *rq;

    for (;;) {
        local_irq_save(*flags);
ffffffff81058e4e:   49 89 55 00             mov    %rdx,0x0(%r13)
        rq = task_rq(p);
ffffffff81058e52:   49 8b 44 24 08          mov    0x8(%r12),%rax
ffffffff81058e57:   49 89 de                mov    %rbx,%r14
ffffffff81058e5a:   8b 40 18                mov    0x18(%rax),%eax
ffffffff81058e5d:   4c 03 34 c5 60 cf bf    add    -0x7e4030a0(,%rax,8),%r14
ffffffff81058e64:   81
        spin_lock(&rq->lock);
ffffffff81058e65:   4c 89 f7                mov    %r14,%rdi
ffffffff81058e68:   e8 a3 23 4d 00          callq  ffffffff8152b210 <_spin_lock>

Disassembling vmlinux with objdump locates the faulting instruction: the crash happens at address ffffffff81058e5d. The preceding instruction (mov 0x18(%rax),%eax) loads thread_info->cpu, and the faulting add then indexes the per-CPU offset table with that value, so a corrupted thread_info produces a wild address. Matching this back to the source shows that the fault occurs when task_rq_lock() evaluates task_rq().

kernel/sched.c
#define task_rq(p)		cpu_rq(task_cpu(p))

/*
 * task_rq_lock - lock the runqueue a given task resides on and disable
 * interrupts. Note the ordering: we can safely lookup the task_rq without
 * explicitly disabling preemption.
 */
static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
    __acquires(rq->lock)
{
    struct rq *rq;

    for (;;) {
        local_irq_save(*flags);
        rq = task_rq(p);
        spin_lock(&rq->lock);
        if (likely(rq == task_rq(p)))
            return rq;
        spin_unlock_irqrestore(&rq->lock, *flags);
    }
}

include/linux/sched.h
#define task_thread_info(task)	((struct thread_info *)(task)->stack)

static inline unsigned int task_cpu(const struct task_struct *p)
{
    return task_thread_info(p)->cpu;
}

union thread_union {
    struct thread_info thread_info;
    unsigned long stack[THREAD_SIZE/sizeof(long)];
};

At this point the picture becomes clear: the process's thread_info shares a union with its kernel stack, so a kernel stack overflow corrupts thread_info. Now look at the size of the kernel stack:
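To make the union layout concrete, here is a minimal user-space C sketch, illustration only and not kernel code: the two-field thread_info is an assumption trimmed down to the fields this bug cares about. It shows how writes past the bottom of the stack land on top of thread_info:

#include <stdio.h>
#include <string.h>

/* Simplified stand-ins for the kernel structures (illustration only). */
struct thread_info {
    int cpu;                     /* task_cpu() reads this field          */
    unsigned int preempt_count;  /* checked by "scheduling while atomic" */
};

#define THREAD_SIZE (8 * 1024)   /* 8 KB, i.e. PAGE_SIZE << THREAD_ORDER */

union thread_union {
    struct thread_info thread_info;                  /* at the low end of the area */
    unsigned long stack[THREAD_SIZE / sizeof(long)]; /* stack grows down toward it */
};

int main(void)
{
    union thread_union tu;

    memset(&tu, 0, sizeof(tu));
    tu.thread_info.cpu = 1;

    /* The kernel stack starts at the high end of this area and grows
     * downward; a call chain deeper than THREAD_SIZE bytes ends up
     * writing over thread_info at the low end: */
    memset(tu.stack, 0xdd, 16);   /* simulate the overflowing stack frames */

    printf("cpu=%#x preempt_count=%#x\n",
           tu.thread_info.cpu, tu.thread_info.preempt_count);
    /* Both fields now hold garbage: task_cpu() would return a bogus CPU
     * number and cpu_rq() would dereference a wild address, exactly the
     * kind of fault seen at task_rq_lock+0x4d above. */
    return 0;
}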
arch/x86/include/asm/page_64_types.h
#define THREAD_ORDER	1
#define THREAD_SIZE  (PAGE_SIZE << THREAD_ORDER)
#define CURRENT_MASK (~(THREAD_SIZE - 1))

On 64-bit x86 the kernel stack is 8 KB: the thread_info structure and the task's kernel-mode stack live in the same union, whose total size defaults to 8 KB. For some reason an XFS code path consumed too much stack space, the stack overflowed, and thread_info was corrupted. The "scheduling while atomic" message is most likely because the overflow overwrote the preempt count in the task's thread_info, so the next time the task was woken up its preempt count was non-zero, and the system eventually panicked.
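For intuition on that message: the scheduler refuses to switch away from a task whose preempt count is non-zero and logs a BUG line of the form comm/pid/preempt_count. The following is a small runnable sketch of that check with made-up names (schedule_debug_sketch, fake_thread_info), not the verbatim kernel/sched.c code:

#include <stdio.h>

/* Illustrative stand-in: in the real kernel the preempt count lives in the
 * task's thread_info and is read through preempt_count(). */
struct fake_thread_info {
    int cpu;
    unsigned int preempt_count;
};

static void schedule_debug_sketch(const char *comm, int pid,
                                  const struct fake_thread_info *ti)
{
    /* schedule() may only run with a zero preempt count.  When a stack
     * overflow has scribbled over thread_info, the count looks non-zero
     * and the kernel prints the familiar BUG line. */
    if (ti->preempt_count != 0)
        printf("BUG: scheduling while atomic: %s/%d/0x%08x\n",
               comm, pid, ti->preempt_count);
}

int main(void)
{
    struct fake_thread_info ti = { .cpu = 1, .preempt_count = 0 };

    ti.preempt_count = 0xffff8811u;   /* value left behind by the overflowing frames */
    schedule_debug_sketch("qemu-system-x86", 27122, &ti);
    return 0;
}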
Root Cause Analysis
Based on the objdump analysis, there are two plausible ways XFS could have overflowed the stack.

The first is that xfs_iomap_write_direct() does not pass the XFS_BMAPI_STACK_SWITCH flag, so xfs_bmapi_allocate() does not hand the allocation off to xfs_bmapi_allocate_worker() on a separate worker thread (which would have a fresh stack with plenty of room); instead the whole allocation runs on the process's own kernel stack and overflows it (a simplified sketch of this hand-off pattern is given below). This was fixed upstream in kernel 3.4 by commit c999a22 ("xfs: introduce an allocation workqueue"). There is a competing view, however: a dedicated allocation workqueue adds thread hand-offs that slow down I/O writeback, and an 8 KB stack is still helpless against call chains deeper than 8 KB, so kernel 3.16 instead expanded the x86_64 kernel stack to 16 KB (commit 6538b8e, "x86_64: expand kernel stack to 16K"). The discussion at https://lwn.net/Articles/600647/ compares the two approaches, pushing the writeback stack out to a worker thread (c999a22) versus expanding the kernel stack to 16 KB (6538b8e), and is worth reading. CentOS has already backported the 16 KB stack patch (6538b8e) from mainline into kernel-2.6.32-520.el6. The two patches do not conflict, so the suggestion is to upgrade the kernel first and see whether the 16 KB stack alone resolves the xfs_iomap_write_direct() problem; if not, commit c999a22 ("xfs: introduce an allocation workqueue") can be backported on top.

The other possibility is that xfs_buf_lock() issues a log force just before it blocks on a semaphore; the log-force call chain is deep and consumes a lot of stack, which leads to the panic. This is the same issue as bug 1028831 in the CentOS kernel changelog, and it was fixed in 2.6.32-495.el6.
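As a rough illustration of what commit c999a22 does, the sketch below mimics the idea in user space with POSIX threads: instead of running the deep allocation call chain on the caller's nearly full stack, the arguments are packed up and handed to a worker that has a fresh stack, and the caller waits for it to finish. The kernel patch uses a workqueue plus a completion rather than pthread_create/pthread_join, and all names here are made up:

#include <pthread.h>
#include <stdio.h>

/* Stand-in for the argument block the real code passes to its worker. */
struct alloc_args {
    int request;   /* what the caller wants allocated */
    int result;    /* filled in by the worker         */
};

static void *alloc_worker(void *arg)
{
    struct alloc_args *args = arg;

    /* The deep call chain (xfs_bmapi_allocate() and everything below it)
     * runs here, on the worker's own, empty stack. */
    args->result = args->request * 2;   /* placeholder for the real work */
    return NULL;
}

static int alloc_via_worker(struct alloc_args *args)
{
    pthread_t worker;

    if (pthread_create(&worker, NULL, alloc_worker, args) != 0)
        return -1;
    /* The caller sleeps until the worker is done, the moral equivalent of
     * queue_work() + wait_for_completion() in the kernel patch. */
    return pthread_join(worker, NULL) == 0 ? 0 : -1;
}

int main(void)
{
    struct alloc_args args = { .request = 21, .result = 0 };

    if (alloc_via_worker(&args) == 0)
        printf("result = %d\n", args.result);
    return 0;
}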
Solution
Upgrade the kernel to 2.6.32-520.el6 or later so that the relevant patches are included.
Changelog
[2.6.32-520.el6]
- [kernel] x86_64: expand kernel stack to 16K (Johannes Weiner) [1045190 1060721]
[2.6.32-495.el6]
- [fs] xfs: always do log forces via the workqueue (Eric Sandeen) [1028831]
- [fs] xfs: Do background CIL flushes via a workqueue (Eric Sandeen) [1028831]