Linux Block子系统——IO调度层

概述

本文主要来讨论Linux Block子系统中的IO调度层。我们知道应用层发起磁盘数据访问时内核并不会立即将请求下发到磁盘的驱动程序中进行响应，而是做适当的延迟，尝试能否扩展之前请求的磁盘范围来满足该请求。这样做的好处也很明显，以机械硬盘为例，访问不同位置的数据是通过磁头的移动实现的，如果下发给驱动程序的请求是按照磁头移动的方向进行了排序，那么磁盘只需要按照特定的方向连续地访问数据即可响应这些请求，节省了磁头移动定位的时间。对IO请求进行排序和合并就是IO调度层的主要工作，由于这种机制很像我们现实生活中的电梯（只朝着一个方向运行），因此IO调度层所使用的算法也被统称为电梯调度算法。

数据结构

IO调度层涉及到的数据结构主要为两种，request表示IO请求，由通用块层的bio初始化或者合并得到；request_queue表示请求队列，包含了对一个块设备的所有request。下面我们来看一下这两种数据结构中主要的成员。

/*
 * struct request - one IO request handled by the IO scheduling layer.
 *
 * A request is initialised from a bio handed down by the generic block
 * layer, or grows by having further bios merged into it.
 */
struct request {
#ifdef __GENKSYMS__
	union {
		struct list_head queuelist;
		struct llist_node ll_list;
	};
#else
	/* list linkage; elv_dispatch_add_tail() links it into q->queue_head */
	struct list_head queuelist;
#endif
	union {
		struct call_single_data csd;
		RH_KABI_REPLACE(struct work_struct mq_flush_work,
			        unsigned long fifo_time)
	};

	/* request queue this request belongs to */
	struct request_queue *q;
	struct blk_mq_ctx *mq_ctx;

	u64 cmd_flags;
	enum rq_cmd_type_bits cmd_type;
	unsigned long atomic_flags;

	int cpu;

	/* the following two fields are internal, NEVER access directly */
	unsigned int __data_len;	/* total data len */
	sector_t __sector;		/* sector cursor */

	/* NOTE(review): presumably head and tail of the bio chain - confirm */
	struct bio *bio;
	struct bio *biotail;

#ifdef __GENKSYMS__
	struct hlist_node hash;	/* merge hash */
#else
	/*
	 * The hash is used inside the scheduler, and killed once the
	 * request reaches the dispatch list. The ipi_list is only used
	 * to queue the request for softirq completion, which is long
	 * after the request has been unhashed (and even removed from
	 * the dispatch list).
	 */
	union {
		struct hlist_node hash;	/* merge hash */
		struct list_head ipi_list;
	};
#endif

	/*
	 * The rb_node is only used inside the io scheduler, requests
	 * are pruned when moved to the dispatch queue. So let the
	 * completion_data share space with the rb_node.
	 */
	union {
		struct rb_node rb_node;	/* sort/lookup */
		void *completion_data;
	};

	/*
	 * Three pointers are available for the IO schedulers, if they need
	 * more they have to dynamically allocate it.  Flush requests are
	 * never put on the IO scheduler. So let the flush fields share
	 * space with the elevator data.
	 */
	union {
		struct {
			struct io_cq		*icq;
			void			*priv[2];
		} elv;

		struct {
			unsigned int		seq;
			struct list_head	list;
			rq_end_io_fn		*saved_end_io;
		} flush;
	};

	struct gendisk *rq_disk;
	struct hd_struct *part;
	unsigned long start_time;
#ifdef CONFIG_BLK_CGROUP
	struct request_list *rl;		/* rl this rq is alloced from */
	unsigned long long start_time_ns;
	unsigned long long io_start_time_ns;    /* when passed to hardware */
#endif
	/* Number of scatter-gather DMA addr+len pairs after
	 * physical address coalescing is performed.
	 */
	unsigned short nr_phys_segments;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
	unsigned short nr_integrity_segments;
#endif

	unsigned short ioprio;

	void *special;		/* opaque pointer available for LLD use */
	char *buffer;		/* kaddr of the current segment if available */

	int tag;
	int errors;

	/*
	 * when request is used as a packet command carrier
	 */
	unsigned char __cmd[BLK_MAX_CDB];
	unsigned char *cmd;
	unsigned short cmd_len;

	unsigned int extra_len;	/* length of alignment and padding */
	unsigned int sense_len;
	unsigned int resid_len;	/* residual count */
	void *sense;

	unsigned long deadline;
	struct list_head timeout_list;
	unsigned int timeout;
	int retries;

	/*
	 * completion callback.
	 */
	rq_end_io_fn *end_io;
	void *end_io_data;

	/* for bidi */
	struct request *next_rq;
};

/*
 * struct request_queue - the request queue of one block device.
 *
 * Holds every request outstanding against the device, together with the
 * elevator (IO scheduler) instance and the driver callbacks used to
 * service them.
 */
struct request_queue {
	/* list that elv_dispatch_add_tail() appends dispatched requests to */
	struct list_head	queue_head;
	/* cleared by elv_dispatch_add_tail() when it points at the dispatched rq */
	struct request		*last_merge;
	struct elevator_queue	*elevator;
	int			nr_rqs[2];	/* # allocated [a]sync rqs */
	int			nr_rqs_elvpriv;	/* # allocated rqs w/ elvpriv */

	struct request_list	root_rl;

	/* driver-supplied callbacks */
	request_fn_proc		*request_fn;
	make_request_fn		*make_request_fn;
	prep_rq_fn		*prep_rq_fn;
	merge_bvec_fn		*merge_bvec_fn;
	softirq_done_fn		*softirq_done_fn;
	rq_timed_out_fn		*rq_timed_out_fn;
	dma_drain_needed_fn	*dma_drain_needed;
	lld_busy_fn		*lld_busy_fn;

	RH_KABI_CONST struct blk_mq_ops *mq_ops;

	unsigned int		*mq_map;

	/* sw queues */
	RH_KABI_REPLACE(struct blk_mq_ctx	*queue_ctx,
		          struct blk_mq_ctx __percpu	*queue_ctx)

	unsigned int		nr_queues;

	/* hw dispatch queues */
	struct blk_mq_hw_ctx	**queue_hw_ctx;
	unsigned int		nr_hw_queues;

	/* both updated by elv_dispatch_add_tail() from the dispatched request */
	sector_t		end_sector;
	struct request		*boundary_rq;

	/* work item whose handler, blk_delay_work(), runs the queue */
	struct delayed_work	delay_work;

	struct backing_dev_info	backing_dev_info;

	void			*queuedata;

	unsigned long		queue_flags;

	int			id;

	gfp_t			bounce_gfp;

	spinlock_t		__queue_lock;
	/* lock taken around __blk_run_queue() and while adding requests */
	spinlock_t		*queue_lock;

	struct kobject kobj;

	struct kobject mq_kobj;

#ifdef CONFIG_PM_RUNTIME
	struct device		*dev;
	int			rpm_status;
	unsigned int		nr_pending;
#endif

	unsigned long		nr_requests;	/* Max # of requests */
	unsigned int		nr_congestion_on;
	unsigned int		nr_congestion_off;
	unsigned int		nr_batching;

	unsigned int		dma_drain_size;
	void			*dma_drain_buffer;
	unsigned int		dma_pad_mask;
	unsigned int		dma_alignment;

	struct blk_queue_tag	*queue_tags;
	struct list_head	tag_busy_list;

	/* decremented by elv_dispatch_add_tail() for each dispatched request */
	unsigned int		nr_sorted;
	unsigned int		in_flight[2];

	unsigned int		request_fn_active;

	unsigned int		rq_timeout;
	struct timer_list	timeout;
	struct list_head	timeout_list;

	struct list_head	icq_list;
#ifdef CONFIG_BLK_CGROUP
	DECLARE_BITMAP		(blkcg_pols, BLKCG_MAX_POLS);
	struct blkcg_gq		*root_blkg;
	struct list_head	blkg_list;
#endif

	struct queue_limits	limits;

	unsigned int		sg_timeout;
	unsigned int		sg_reserved_size;
	int			node;
#ifdef CONFIG_BLK_DEV_IO_TRACE
	struct blk_trace	*blk_trace;
#endif
	/*
	 * for flush operations
	 */
	unsigned int		flush_flags;
	unsigned int		flush_not_queueable:1;
	RH_KABI_DEPRECATE(unsigned int,            flush_queue_delayed:1)
	RH_KABI_DEPRECATE(unsigned int,            flush_pending_idx:1)
	RH_KABI_DEPRECATE(unsigned int,            flush_running_idx:1)
	RH_KABI_DEPRECATE(unsigned long,           flush_pending_since)
	RH_KABI_DEPRECATE(struct list_head,        flush_queue[2])
	RH_KABI_DEPRECATE(struct list_head,        flush_data_in_flight)
	RH_KABI_DEPRECATE(struct request *,        flush_rq)
	RH_KABI_DEPRECATE(spinlock_t,              mq_flush_lock)

	struct mutex		sysfs_lock;

	int			bypass_depth;

#if defined(CONFIG_BLK_DEV_BSG)
	bsg_job_fn		*bsg_job_fn;
	int			bsg_job_size;
	struct bsg_class_device bsg_dev;
#endif

#ifdef CONFIG_BLK_DEV_THROTTLING
	/* Throttle data */
	struct throtl_data *td;
#endif
	struct rcu_head		rcu_head;
	wait_queue_head_t	mq_freeze_wq;
	RH_KABI_DEPRECATE(struct percpu_counter, mq_usage_counter)
	struct list_head	all_q_node;

	RH_KABI_EXTEND(unprep_rq_fn		*unprep_rq_fn)

	RH_KABI_EXTEND(struct blk_mq_tag_set	*tag_set)
	RH_KABI_EXTEND(struct list_head		tag_set_list)

	RH_KABI_EXTEND(struct list_head		requeue_list)
	RH_KABI_EXTEND(spinlock_t			requeue_lock)
	/* requeue_work's type is changed from 'work_struct' to 'delayed_work' below */
	RH_KABI_EXTEND(struct work_struct	rh_reserved_requeue_work)
	RH_KABI_EXTEND(atomic_t				mq_freeze_depth)
	RH_KABI_EXTEND(struct blk_flush_queue   *fq)
	RH_KABI_EXTEND(struct percpu_ref	q_usage_counter)
	RH_KABI_EXTEND(bool			mq_sysfs_init_done)
	RH_KABI_EXTEND(struct work_struct	timeout_work)
	RH_KABI_EXTEND(struct delayed_work	requeue_work)
	RH_KABI_EXTEND(struct blk_queue_stats	*stats)
	RH_KABI_EXTEND(struct blk_stat_callback	*poll_cb)
	RH_KABI_EXTEND(struct blk_rq_stat	poll_stat[2])
	RH_KABI_EXTEND(atomic_t		shared_hctx_restart)
	RH_KABI_EXTEND(unsigned int		queue_depth)

	RH_KABI_EXTEND(unsigned int         front_queue:1)

	RH_KABI_EXTEND(unsigned int         tail_queue:1)

	/* This flag is set if the driver can split bio */
	RH_KABI_EXTEND(unsigned int         can_split_bio:1)
#ifdef CONFIG_BLK_DEBUG_FS
	RH_KABI_EXTEND(struct dentry		*debugfs_dir)
	RH_KABI_EXTEND(struct dentry		*sched_debugfs_dir)
#endif
#ifdef CONFIG_BLK_DEV_IO_TRACE
	RH_KABI_EXTEND(struct mutex		blk_trace_mutex)
#endif

	RH_KABI_EXTEND(init_rq_fn		*init_rq_fn)
	RH_KABI_EXTEND(exit_rq_fn		*exit_rq_fn)
	RH_KABI_EXTEND(size_t			cmd_size)
	RH_KABI_EXTEND(void			*rq_alloc_data)
};

IO调度

IO调度层的核心就是IO请求的调度，涉及到IO请求合并与下发的时间点，如何提高IO请求效率等问题。接下来我们将讨论IO请求如何合并，IO调度层的调度器。

IO请求合并

在讨论IO请求如何合并前，需要明确为什么需要进行IO请求合并。前文已经说过合并有利于提高磁盘IO效率，具体是如何体现呢？假设现在应用层发起了一个数据访问请求，请求的扇区为5~10；此时另一个用户进程紧跟着也发起了一个请求，访问的扇区是1~4，在没有IO请求合并的情况下，驱动程序驱动磁盘读取5~10扇区的数据后将磁头移动回第1扇区，接着读取1~4扇区数据，这样做使得有一部分时间浪费在了磁头移动上（实际上磁头移动时间比数据访问时间要大）。因此Linux中引入了IO请求合并，将第一个请求向后延迟，第二个请求出现后发现可以和第一个请求合并，直接访问1~10扇区的数据，这样就节省了磁头移动的时间。

Linux中IO请求的合并也比较复杂，首先我们来参考一张前人总结的IO合并图，图中描述了应用层发起数据访问请求后IO请求经过了哪些merge的点，最终下发到块设备的请求队列中。

图1 IO合并

如图1所示，对IO请求进行merge的点有三处：

Cache：缓存机制层面的合并，本文不详细讨论。
Plug List：蓄流泄流机制，进程会进行蓄流操作，将本进程的IO请求合并，并在适当的时机将合并后的请求泄流进调度队列中。
Elevator Queue：在调度器将请求添加到调度队列时会进行IO请求的合并。

从图中我们可以看到应用层发起数据请求后不管是经过Cache还是跳过Cache机制，都需要经过上述三个merge点中的至少一个，这样就保证最终下发给驱动的请求经过了最大可能的合并和排序。接下来我们详细讨论一下蓄流泄流机制以及IO调度层的调度器，理解IO请求是如何进行合并的。

蓄流泄流

plug/unplug即蓄流和泄流，是Linux中提高IO合并效率的一种机制。从图1中可以看到一部分IO请求在被添加到调度队列中时会先经过Plug List一层，然后经过unplug操作才会被添加到调度队列中，这整个过程被称为蓄流和泄流。该机制主要涉及到的数据结构只有一种：

/*
 * struct blk_plug - per-task batch of IO requests awaiting an unplug.
 *
 * blk_start_plug() initialises one and stores it in the task's ->plug
 * pointer; blk_flush_plug_list() drains ->list into the scheduler.
 */
struct blk_plug {
	unsigned long magic; /* detect uninitialized use-cases */
	struct list_head list; /* head of the list of requests held in the plug */
	struct list_head mq_list; /* blk-mq requests */
	struct list_head cb_list; /* md requires an unplug callback */
};

每个进程都会有一个Plug List（如果支持plug机制），上层下发的IO请求进入Plug List后都会被链入list成员指向的链表中。因为同一个进程中的IO请求访问磁盘中相邻连续扇区的可能性更大，所以在Plug List中IO请求合并的几率也更大，而且没有把IO请求直接添加到调度队列中，这也使得调度队列的负荷降低。蓄流泄流机制的通用流程如图2所示：

图2 plug流程

如果支持蓄流泄流机制，内核一般会在IO调度层之上开启蓄流机制，并将bio提交给调度层并添加到Plug List中，最后进行泄流操作将Plug List中的request提交给调度器，由调度器进行进一步的合并和排序并添加到调度器的调度队列中。

图2中调用blk_start_plug开启蓄流，最后调用blk_finish_plug进行泄流操作，这两个函数一般都是成对出现。我们先来看一下blk_start_plug函数：

/*
 * blk_start_plug - prepare @plug and install it on the calling task.
 * @plug: caller-provided plug to initialise
 *
 * The three list heads are reset and the magic marker is written on
 * every call.  The plug is attached to current only if the task has no
 * plug yet.
 */
void blk_start_plug(struct blk_plug *plug)
{
	struct task_struct *task = current;

	INIT_LIST_HEAD(&plug->list);
	INIT_LIST_HEAD(&plug->mq_list);
	INIT_LIST_HEAD(&plug->cb_list);
	plug->magic = PLUG_MAGIC;

	/*
	 * A plug is already active, so this one is nested: do not assign
	 * it. It will be flushed on its own.
	 */
	if (task->plug)
		return;

	/*
	 * No store ordering should be required here, since a potential
	 * preempt will imply a full memory barrier.
	 */
	task->plug = plug;
}

该函数功能比较简单，在初始化调用者传入的plug之后，如果当前进程还没有plug（即不是嵌套调用），就将其记录到当前进程描述符current的plug字段中，后续该进程访问Plug List都是通过访问current->plug的方式实现的。此处也体现了前文所说的每个进程都有一个Plug List。

接着我们来分析一下泄流操作：

/*
 * blk_finish_plug - flush @plug and detach it from the calling task.
 * @plug: plug previously passed to blk_start_plug()
 *
 * The flush is requested with from_schedule == false.  current->plug is
 * cleared only when it still refers to @plug.
 */
void blk_finish_plug(struct blk_plug *plug)
{
	blk_flush_plug_list(plug, false);

	/* some other plug is installed on the task: leave it alone */
	if (current->plug != plug)
		return;

	current->plug = NULL;
}

从代码中我们可以看到泄流操作调用了blk_flush_plug_list将plug中的request泄流到调度队列中，并设置current->plug字段为NULL。我们接着往下分析blk_flush_plug_list是如何实现泄流的。

/*
 * blk_flush_plug_list - hand the requests held in @plug to the scheduler.
 * @plug:          plug whose ->list is drained
 * @from_schedule: forwarded unchanged to queue_unplugged()
 *
 * This is an excerpt: the "..." lines stand for code left out here,
 * including the declarations of list, q, depth, flags and rq.
 */
void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
	...

	/* nothing plugged, nothing to do */
	if (list_empty(&plug->list))
		return;

	/* take the plugged requests over onto the local list */
	list_splice_init(&plug->list, &list);

	/*
	 * NOTE(review): plug_rq_cmp is not shown; presumably it orders the
	 * requests so that those for the same queue are adjacent - confirm.
	 */
	list_sort(NULL, &list, plug_rq_cmp);

	q = NULL;
	depth = 0;

	/* interrupts stay off for the whole drain; queue locks use plain spin_lock */
	local_irq_save(flags);
	while (!list_empty(&list)) {
		rq = list_entry_rq(list.next);
		list_del_init(&rq->queuelist);
		BUG_ON(!rq->q);
		if (rq->q != q) {
			/*
			 * Switching queues: queue_unplugged() runs the previous
			 * queue and releases its lock, then the new queue's
			 * lock is taken and the batch counter restarts.
			 */
			if (q)
				queue_unplugged(q, depth, from_schedule);
			q = rq->q;
			depth = 0;
			spin_lock(q->queue_lock);
		}
                ...
	        elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE);

		/* count of requests added to the current queue in this batch */
		depth++;
	}

	/* run the last queue touched and drop its lock */
	if (q)
		queue_unplugged(q, depth, from_schedule);

	local_irq_restore(flags);
}

以上是blk_flush_plug_list核心的代码，主要是将Plug List中的request通过elv_add_request函数提交给调度器，由调度器进行排序和进一步的合并，最后调用queue_unplugged，该函数会调用块设备驱动程序提供的request响应函数，从调度队列中获取request进行数据传输，至此泄流操作完成。

从上面的分析中可以看出泄流最关键的一步就是调用queue_unplugged函数将IO请求下发给驱动程序，由驱动程序进行数据传输。我们来看看这个函数做了哪些操作：

/*
 * queue_unplugged - run @q after a plug flush and release its lock.
 * @q:             queue to run; q->queue_lock is unlocked before returning
 * @depth:         value reported to the block_unplug tracepoint
 * @from_schedule: false runs the queue directly via __blk_run_queue(),
 *                 true defers to blk_run_queue_async()
 */
static void queue_unplugged(struct request_queue *q, unsigned int depth,
			    bool from_schedule)
	__releases(q->queue_lock)
{
	trace_block_unplug(q, depth, !from_schedule);

	if (!from_schedule)
		__blk_run_queue(q);
	else
		blk_run_queue_async(q);

	spin_unlock(q->queue_lock);
}

这个函数通过判断from_schedule参数决定执行什么操作。这个参数的取值有两种：true和false，true表示异步执行，调用blk_run_queue_async异步地将IO请求下发给驱动程序；false表示同步执行，调用__blk_run_queue立即将调度队列中的IO请求下发给驱动程序。上文我们提到的blk_finish_plug最终在调用该函数时传递的参数就是false，即立即将IO请求下发给驱动程序进行响应，以及在新的request添加到Plug List的过程中发现Plug List已满，也会以同步的方式unplug，本文不再详细讲述，感兴趣的读者可以去查阅相关源码。但是这种方法有一个弊端就是需要等待IO传输完成，我们都知道IO操作是比较耗费CPU时间的，因此可能会造成进程的阻塞。异步操作就不会有这种问题，因此Linux内核中也提供了异步unplug的机制。

异步unplug是由工作队列实现的，在Linux初始化Block子系统时会为每个CPU都创建一个kblockd的工作队列，该队列的主要功能由blk_delay_work函数实现，并由定时器定时触发该工作队列：

/*
 * blk_delay_work - work handler that runs a request queue.
 * @work: the work_struct embedded in a request_queue's delay_work
 *
 * Recovers the owning request_queue from @work and calls
 * __blk_run_queue() on it with queue_lock held and interrupts disabled.
 */
static void blk_delay_work(struct work_struct *work)
{
	struct request_queue *queue =
		container_of(work, struct request_queue, delay_work.work);

	spin_lock_irq(queue->queue_lock);
	__blk_run_queue(queue);
	spin_unlock_irq(queue->queue_lock);
}

blk_delay_work最终也是调用了__blk_run_queue下发IO请求。那什么时候会进行异步unplug呢？在进程切换时调用schedule函数，该函数会调用blk_schedule_flush_plug，blk_schedule_flush_plug在调用blk_flush_plug_list时传递的就是true，表示异步unplug。

/*
 * blk_schedule_flush_plug - flush @tsk's plug, if it has one.
 * @tsk: task whose ->plug is examined
 *
 * The flush is requested with from_schedule == true.
 */
static inline void blk_schedule_flush_plug(struct task_struct *tsk)
{
	struct blk_plug *pending = tsk->plug;

	/* the task holds no plug */
	if (!pending)
		return;

	blk_flush_plug_list(pending, true);
}

我们来总结一下Linux内核在何时进行蓄流与泄流。

蓄流
- 调用blk_start_plug初始化Plug List
泄流
- 调用blk_finish_plug同步unplug，将request下发到驱动程序
- 新的request添加到Plug List中发现Plug List已满，调用blk_flush_plug_list同步unplug
- 进程切换时schedule调用blk_flush_plug_list异步unplug，最终定时器定时触发kblockd将request下发给驱动程序

调度器

从图1中可以看到，不管IO请求经过哪条路径，最后都会汇聚到调度器，由调度器统一下发给驱动程序进行处理，因此我们需要来探讨一下IO调度器的工作原理。

Linux中提供了多种调度器策略供用户选择，用户可以通过/sys接口动态调整调度器策略。Linux中主要的调度器策略有如下几种：

Noop算法：IO调度器最简单的算法，将IO请求放入队列中并顺序的执行这些IO请求，对于连续的IO请求也会做相应的合并。
Deadline算法：保证IO请求在一定时间内能够被服务，避免某个请求饥饿
Anticipatory算法：核心是局部性原理，它期望一个进程做完一次IO请求后还会继续在此处做IO请求
CFQ算法：即完全公平排队（Completely Fair Queuing）算法，试图为竞争块设备使用权的所有进程分配一个请求队列和一个时间片，在调度器分配给进程的时间片内，进程可以将其读写请求发送给底层块设备，当进程的时间片消耗完，进程的请求队列将被挂起，等待调度。

Linux通过将调度器抽象化提出了一个统一的规范接口，各个调度器算法通过该接口向调度器注册，这样便有利于调度器策略的扩展和切换。我们首先来了解一下调度器规范接口。Linux中调度器的接口由struct elevator_ops结构定义：

/*
 * struct elevator_ops - callback table an IO scheduler fills in.
 *
 * A scheduler supplies the hooks it implements and leaves the others
 * unset; iosched_deadline, for example, sets only nine of them.
 */
struct elevator_ops
{
	/* merge-related hooks */
	elevator_merge_fn *elevator_merge_fn;
	elevator_merged_fn *elevator_merged_fn;
	elevator_merge_req_fn *elevator_merge_req_fn;
	elevator_allow_merge_fn *elevator_allow_merge_fn;
	elevator_bio_merged_fn *elevator_bio_merged_fn;

	/* NOTE(review): presumably dispatch / add / (de)activate of requests - confirm */
	elevator_dispatch_fn *elevator_dispatch_fn;
	elevator_add_req_fn *elevator_add_req_fn;
	elevator_activate_req_fn *elevator_activate_req_fn;
	elevator_deactivate_req_fn *elevator_deactivate_req_fn;

	elevator_completed_req_fn *elevator_completed_req_fn;

	/* both hooks share the elevator_request_list_fn type */
	elevator_request_list_fn *elevator_former_req_fn;
	elevator_request_list_fn *elevator_latter_req_fn;

	elevator_init_icq_fn *elevator_init_icq_fn;	/* see iocontext.h */
	elevator_exit_icq_fn *elevator_exit_icq_fn;	/* ditto */

	elevator_set_req_fn *elevator_set_req_fn;
	elevator_put_req_fn *elevator_put_req_fn;

	elevator_may_queue_fn *elevator_may_queue_fn;

	/* scheduler instance setup and teardown */
	elevator_init_fn *elevator_init_fn;
	elevator_exit_fn *elevator_exit_fn;
};

elevator_ops结构定义了将IO请求添加到调度队列以及从调度队列中将IO请求下发给驱动的函数和相关数据结构。例如DeadLine调度算法定义了如下接口供调度器使用：

/*
 * Elevator type descriptor for the "deadline" IO scheduler: its name,
 * owning module, attribute table and the elevator_ops hooks it implements.
 */
static struct elevator_type iosched_deadline = {
	.elevator_name	= "deadline",
	.elevator_owner	= THIS_MODULE,
	.elevator_attrs	= deadline_attrs,

	.ops = {
		/* scheduler instance setup and teardown */
		.elevator_init_fn	= deadline_init_queue,
		.elevator_exit_fn	= deadline_exit_queue,

		/* merge hooks */
		.elevator_merge_fn	= deadline_merge,
		.elevator_merged_fn	= deadline_merged_request,
		.elevator_merge_req_fn	= deadline_merged_requests,

		/* adding and dispatching requests */
		.elevator_add_req_fn	= deadline_add_request,
		.elevator_dispatch_fn	= deadline_dispatch_requests,

		/* neighbour lookup, using the generic rb-tree helpers */
		.elevator_former_req_fn	= elv_rb_former_request,
		.elevator_latter_req_fn	= elv_rb_latter_request,
	},
};

名字中带有merge字段的方法是将IO请求通过不同的形式添加到调度器的调度队列中，具体实现本文不做详细讲述，我们主要来分析一下IO请求是如何从调度队列中下发给驱动的。以DeadLine算法为例，根据调度算法的策略计算得到当前最优的IO请求，并调用deadline_dispatch_requests函数将该请求调出调度队列。该函数经过层层调用，最终会调用elv_dispatch_add_tail将IO请求调度出队列：

/*
 * elv_dispatch_add_tail - move @rq to the tail of @q's queue_head list.
 * @q:  request queue receiving the request
 * @rq: request being dispatched
 *
 * Before linking @rq in, the queue's bookkeeping is updated: a
 * last_merge pointer that refers to @rq is dropped, @rq is removed from
 * the merge hash, nr_sorted is decremented, and end_sector/boundary_rq
 * are set from @rq.
 */
void elv_dispatch_add_tail(struct request_queue *q, struct request *rq)
{
	/* drop the cached merge candidate if it is the request being dispatched */
	if (rq == q->last_merge)
		q->last_merge = NULL;

	elv_rqhash_del(q, rq);
	q->nr_sorted -= 1;

	q->boundary_rq = rq;
	q->end_sector = rq_end_sector(rq);

	list_add_tail(&rq->queuelist, &q->queue_head);
}

我们可以从上面的代码中看到，该函数更新调度队列的信息后，通过IO请求rq的queuelist字段将其链入了块设备请求队列q的queue_head链表的尾部。在稍后的时间里内核会进行unplug操作，最终驱动程序会从请求队列的queue_head链表中读取IO请求进行响应。

来源：oschina

链接：https://my.oschina.net/u/4361306/blog/3477679

标签

unplug

RequestQueue

TSK