blktap
又回到那个问题,I/O请求从前端到后端blkback之后,如何从blkback到达tapdisk2进程?
pvops 内核源码的 drivers/xen/blktap/ 目录下是blktap的驱动代码,其中blktap的设备结构如下:
/*
 * Per-device state of one blktap instance. The driver keeps these in the
 * blktaps[] table, indexed by minor number.
 */
struct blktap {
/* Minor number of this tap; also its index into blktaps[]. */
int minor;
/* NOTE(review): presumably a bit mask of "in use" state flags -- confirm. */
unsigned long dev_inuse;
/* Ring side of the tap (the blktapX character device and its I/O ring). */
struct blktap_ring ring;
/* Block-device side of the tap (the tapdevX block device). */
struct blktap_device device;
/* Page pool this tap uses; NOTE(review): presumably backs request pages -- confirm. */
struct blktap_page_pool *pool;
/* NOTE(review): presumably used to wait for / defer device removal -- confirm. */
wait_queue_head_t remove_wait;
struct work_struct remove_work;
/* Name buffer, bounded by BLKTAP2_MAX_MESSAGE_LEN. */
char name[BLKTAP2_MAX_MESSAGE_LEN];
/* Per-tap counters (e.g. st_rd_req / st_wr_req / st_rd_sect / st_wr_sect). */
struct blktap_statistics stats;
};
int minor;
unsigned long dev_inuse;
struct blktap_ring ring;
struct blktap_device device;
struct blktap_page_pool *pool;
wait_queue_head_t remove_wait;
struct work_struct remove_work;
char name[BLKTAP2_MAX_MESSAGE_LEN];
struct blktap_statistics stats;
};
根据目前的理解,blktap包含了四个模块:blktap_device, blktap_ring, blktap_sysfs, blktap_control
static struct blktap_page_pool *default_pool 定义了blktap驱动和tapdisk2(??)进程的消息交换的内存页空间?? (个人推测,待求证)
blktap_page_pool_init 为slab allocator 分配 blktap_request 结构的 kmem_cache
在blktap_control这个device被初始化的时候,会通过blktap_page_pool_get("default") 创建一个大小为 2 * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST 个page的 page pool。个人认为这个pool的大小也可以通过 /sys/class/misc/blktap-control/ 下的 sysfs 接口来调整(hack),用于提高 blktap 的 I/O 吞吐。
blktap_control
blktap_control是一个字符设备
crw------- 1 root root 10, 55 Aug 16 03:35 /dev/blktap-control
/sys/class/misc/blktap-control/
这个字符设备只提供了一个ioctl的接口
/*
 * File operations of the blktap-control device: the only entry point
 * installed here is the ioctl handler.
 */
static struct file_operations blktap_control_file_operations = {
.owner = THIS_MODULE,
.ioctl = blktap_control_ioctl,
};
.owner = THIS_MODULE,
.ioctl = blktap_control_ioctl,
};
blktap_control_init 是 blktap_control 设备驱动的 init 函数
/*
 * Module init for the blktap-control device.
 *
 * Registers the misc device, allocates the blktaps[] minor table,
 * initialises the page-pool machinery, takes the "default" pool and
 * creates the default_pool attribute file on the control device.
 *
 * Returns 0 on success, or the error of the first step that failed
 * (-ENOMEM for the allocation / pool lookup failures).
 *
 * NOTE(review): every failure path below returns directly; none of them
 * undoes misc_register() or frees blktaps. Confirm whether cleanup is
 * performed by the caller, otherwise these are leaks on the error path.
 */
static int __init
blktap_control_init(void)
{
int err;
/* Register the blktap-control misc device. */
err = misc_register(&blktap_control);
if (err)
return err;
/* struct device created by misc_register() for the control node. */
control_device = blktap_control.this_device;
/* Cap the number of tap minors at 64. */
blktap_max_minor = min(64, MAX_BLKTAP_DEVICE);
/* Zeroed table of tap pointers, one slot per minor. */
blktaps = kzalloc(blktap_max_minor * sizeof(blktaps[0]), GFP_KERNEL);
if (!blktaps) {
BTERR("failed to allocate blktap minor map");
return -ENOMEM;
}
/* Set up page-pool support, parented at the control device's kobject. */
err = blktap_page_pool_init(&control_device->kobj);
if (err)
return err;
/* Look up (or create) the pool named "default". */
default_pool = blktap_page_pool_get("default");
if (!default_pool)
return -ENOMEM;
/* Expose the default_pool attribute on the control device. */
err = device_create_file(control_device, &dev_attr_default_pool);
if (err)
return err;
return 0;
}
blktap_control_init(void)
{
int err;
err = misc_register(&blktap_control);
if (err)
return err;
control_device = blktap_control.this_device;
blktap_max_minor = min(64, MAX_BLKTAP_DEVICE);
blktaps = kzalloc(blktap_max_minor * sizeof(blktaps[0]), GFP_KERNEL);
if (!blktaps) {
BTERR("failed to allocate blktap minor map");
return -ENOMEM;
}
err = blktap_page_pool_init(&control_device->kobj);
if (err)
return err;
default_pool = blktap_page_pool_get("default");
if (!default_pool)
return -ENOMEM;
err = device_create_file(control_device, &dev_attr_default_pool);
if (err)
return err;
return 0;
}
首先注册misc设备,blktap-control,可在/sys/class/misc/blktap-control 看到详细信息。control_device是blktap-control对应的struct device结构。
调用blktap_page_pool_init 初始化 request_cache, request_pool, pool_set
创建default_pool, 默认大小 704个page
调用device_create_file, 在sysfs里创建设备文件。
blktap-control设备允许外界通过ioctl对其进行操作。主要是创建和释放blktap设备。内核里维护了一个blktaps的 struct blktap** 的结构,为dom0创建的所有blktap设备的列表。其中用设备minor号来表示具体的blktap设备,e.g. blktaps[minor] 就表示设备minor号对应的 struct blktap* 指向的结构。
创建blktap设备 (BLKTAP2_IOCTL_ALLOC_TAP) :
调用blktap_control_create_tap,内部首先调用blktap_control_get_minor获取可用的minor号和struct blktap所需内存。然后生成blktap_handle h,其中
h.ring = blktap_ring_major
h.device = blktap_device_major
h.minor = tap->minor
这里blktap_ring_major是 blktapX这个字符设备创建时系统自动选取的主设备号,同样blktap_device_major是 tapdevX 这个块设备创建时系统自动选取的主设备号
最后copy_to_user把blktap_handle的值返回给用户态程序
释放blktap设备(BLKTAP2_IOCTL_FREE_TAP):
传入的参数是blktap设备的minor次设备号,通过minor找到 blktaps[minor]得到 struct blktap* 结构体tap。调用blktap_control_destroy_tap(tap)释放blktap相关设备。
blktap_ring
blktap的ring设备指的就是/dev/xen/blktapX所代表的字符设备。
blktap_ring_init 是字符设备的初始化函数,基本是字符设备的循规蹈矩那一套,其中
/*
 * File operations of the blktap ring character device: open/release,
 * ioctl, mmap of the ring area, and poll.
 */
static struct file_operations blktap_ring_file_operations = {
.owner = THIS_MODULE,
.open = blktap_ring_open,
.release = blktap_ring_release,
.ioctl = blktap_ring_ioctl,
.mmap = blktap_ring_mmap,
.poll = blktap_ring_poll,
};
.owner = THIS_MODULE,
.open = blktap_ring_open,
.release = blktap_ring_release,
.ioctl = blktap_ring_ioctl,
.mmap = blktap_ring_mmap,
.poll = blktap_ring_poll,
};
blktap_ring_debug 是个很有用的工具,传入一个blktap结构和一块足够大的内存,会把当前blktap->ring 的所有pending requests的信息写到内存里。
blktap_ring_open 传入 struct inode*, struct file* 都是device常用结构。 通过inode算出设备minor号,通过blktaps[minor] 得到struct blktap* 结构,把这个struct blktap* 结构赋值给 file->private_data,把当前进程(current,即打开该设备的进程)赋值给 blktap->ring.task
blktap_ring_release 最终调用blktap_device_destroy release掉这个blktap->device设备
blktap_ring_mmap 传入struct file*,可通过该指针得到struct blktap* 指针tap,以及struct blktap_ring 指针 &tap->ring。它先调用alloc_page分配一个内存页,用SetPageReserved把页设置为保留,再用vm_insert_page 把这个页插入到 vm_area_struct 结构中。 这个页的用处,和设备前端后端之间的IO环一样(??)。 最后调用SHARED_RING_INIT初始化IO环。blktap_ring->ring_vstart保存 IO环起始的虚拟地址,blktap_ring->ring是和IO环attach的前端设备环 frontend-ring。
blktap_ring_ioctl 提供给user space的应用程序(应该只是tapdisk2进程) 一个ioctl 的接口,目前支持的命令有:
BLKTAP2_IOCTL_CREATE_DEVICE:user space程序传入一个blktap_params结构,调用 blktap_device_create创建一个 tapdevX 的块设备。blktap_device_create内部是调用linux内核块设备接口,创建块设备。基本的数据结构为 struct blktap_device, struct gendisk等。其中gendisk->queue 就是块设备的请求队列,我们熟知的电梯算法,就是对这个队列上的请求进行调度,包括各种调度器算法,比如CFQ, Deadline, FIFO(noop)等等。
BLKTAP2_IOCTL_REMOVE_DEVICE: 调用blktap_device_destroy 销毁块设备
BLKTAP2_IOCTL_KICK_FE:调用blktap_read_ring。blktap_read_ring 查看blktap_ring设备还有哪些未处理的response请求,即rsp_prod - rsp_cons个response,对每一个调用 blktap_ring_read_response
blktap_ring_read_response做如下事情:对于传入的struct blkif_response* rsp,找出对应的id: rsp->id,这个response对应的request为blktap_ring->pending[rsp->id]
接下来是一系列check,确保response和request 一一对应之后,调用blktap_device_end_request(tap, request, err)
blktap_device_end_request 由一系列调用组成,最重要的是blktap_ring_unmap_request
/*
 * Release the user-space mapping that belongs to one request.
 *
 * tap:     the blktap instance whose ring/vma holds the mapping.
 * request: the request being completed; its usr_idx selects the slot in
 *          the user area and nr_pages gives the number of pages used.
 */
void blktap_ring_unmap_request(struct blktap *tap, struct blktap_request *request)
{
struct blktap_ring *ring = &tap->ring;
unsigned long uaddr;
unsigned size;
int seg, read;
/* User virtual address of segment 0 of this request's slot. */
uaddr = MMAP_VADDR(ring->user_vstart, request->usr_idx, 0);
/* Bytes actually mapped for this request: nr_pages whole pages. */
size = request->nr_pages << PAGE_SHIFT;
read = request->operation == BLKIF_OP_READ;
/*
 * For reads, bounce every segment. The last argument is !read, which is
 * always 0 inside this branch.
 * NOTE(review): presumably 0 selects the "copy back to the original
 * buffer" direction -- confirm against blktap_request_bounce().
 */
if (read)
for (seg = 0; seg < request->nr_pages; seg++)
blktap_request_bounce(tap, request, seg, !read);
/* Drop the pages of [uaddr, uaddr + size) from the ring's vma. */
zap_page_range(ring->vma, uaddr, size, NULL);
}
{
struct blktap_ring *ring = &tap->ring;
unsigned long uaddr;
unsigned size;
int seg, read;
uaddr = MMAP_VADDR(ring->user_vstart, request->usr_idx, 0);
size = request->nr_pages << PAGE_SHIFT;
read = request->operation == BLKIF_OP_READ;
if (read)
for (seg = 0; seg < request->nr_pages; seg++)
blktap_request_bounce(tap, request, seg, !read);
zap_page_range(ring->vma, uaddr, size, NULL);
}
uaddr 是真正request I/O读写segment用到的内存的起始地址,可以看到MMAP_VADDR宏的定义,可知对于每个request,无论请求多少个segment的读写,ring里都会划定同样的内存大小 BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE 。 size 是用到的内存大小,为request->nr_pages * 页大小。这里nr_pages应该就是request里读写segment的个数。xen为每个segment都分配了一个page大小的内存。
可以从blkif_request结构里看出,每个segment对应一段sectors
如果是读请求,调用blktap_request_bounce,好吧这个函数我暂时没看懂。。。
最后应该是清空相应内存了吧,不看了。。
所以blktap_device_end_request 就是做结束request的扫尾工作:如果传入的err为0,就认为这个request成功,否则返回错误(-EIO)
综上所述,BLKTAP2_IOCTL_KICK_FE应该就是处理所有没处理完的response,由于response没有处理完,因此之前对应的request都是pending_request,这个函数也可以理解为处理所有的pending_request。之后应该就可以 KICK FRONTEND设备了吧,呵呵
blktap_ring_map_request/blktap_ring_unmap_request/blktap_ring_map_segment
核心函数是blktap_ring_map_segment, blktap_ring_map_request也是对request每一个segment调用blktap_ring_map_segment。
blktap_ring->user_vstart 指向真正的请求segment 的内存地址, blktap_ring->ring_vstart 是请求IO环的内存地址, blktap_ring->user_vstart = blktap_ring->ring_vstart + PAGE_SIZE。 这块以blktap_ring->ring_vstart开头,长度最大为 (1 + BLKIF_MAX_SEGMENTS_PER_REQUEST * MAX_PENDING_REQS) * PAGE_SIZE 的内存由blktap_ring->vma来管理。
blktap_ring_map_segment 最终调用 vm_insert_page 把 uaddr 开头的第seg个segment对应的page插入到blktap_ring->vma里面。
blktap_ring_free_request/blktap_ring_make_request
这两个函数比较简单, blktap_ring_make_request 就是调用blktap_request_alloc 分配一个 blktap_request,在blktap_ring里找一个空的坑,把这个blktap_request 填进去完事儿。 blktap_ring_free_request就是把这个blktap_request释放了呗。
blktap_ring_submit_request
/*
 * Place one blktap request on the tap's I/O ring and account for it.
 *
 * tap:     the blktap instance whose ring receives the request.
 * request: the request to submit; usr_idx becomes the ring request id.
 *
 * Only the private producer index is advanced here; no push/notify of
 * the ring is visible in this function.
 */
void
blktap_ring_submit_request(struct blktap *tap,
struct blktap_request *request)
{
struct blktap_ring *ring = &tap->ring;
struct blkif_request *breq;
struct scatterlist *sg;
int i, nsecs = 0;
dev_dbg(ring->dev,
"request %d [%p] submit\n", request->usr_idx, request);
/* Ring slot at the current private producer index. */
breq = RING_GET_REQUEST(&ring->ring, ring->ring.req_prod_pvt);
/* The id is the request's usr_idx. */
breq->id = request->usr_idx;
breq->sector_number = blk_rq_pos(request->rq);
breq->handle = 0;
breq->operation = request->operation;
breq->nr_segments = request->nr_pages;
/* Describe each scatterlist entry as a sector range within its page. */
blktap_for_each_sg(sg, request, i) {
struct blkif_request_segment *seg = &breq->seg[i];
int first, count;
/* >> 9 converts a byte length/offset into 512-byte sectors. */
count = sg->length >> 9;
first = sg->offset >> 9;
seg->first_sect = first;
/* last_sect is inclusive. */
seg->last_sect = first + count - 1;
nsecs += count;
}
/* Consume the slot: advance the private producer index. */
ring->ring.req_prod_pvt++;
/* Record the submission time on the request. */
do_gettimeofday(&request->time);
/* Update per-tap write/read statistics (sectors and request count). */
if (request->operation == BLKIF_OP_WRITE) {
tap->stats.st_wr_sect += nsecs;
tap->stats.st_wr_req++;
}
if (request->operation == BLKIF_OP_READ) {
tap->stats.st_rd_sect += nsecs;
tap->stats.st_rd_req++;
}
}
blktap_ring_submit_request(struct blktap *tap,
struct blktap_request *request)
{
struct blktap_ring *ring = &tap->ring;
struct blkif_request *breq;
struct scatterlist *sg;
int i, nsecs = 0;
dev_dbg(ring->dev,
"request %d [%p] submit\n", request->usr_idx, request);
breq = RING_GET_REQUEST(&ring->ring, ring->ring.req_prod_pvt);
breq->id = request->usr_idx;
breq->sector_number = blk_rq_pos(request->rq);
breq->handle = 0;
breq->operation = request->operation;
breq->nr_segments = request->nr_pages;
blktap_for_each_sg(sg, request, i) {
struct blkif_request_segment *seg = &breq->seg[i];
int first, count;
count = sg->length >> 9;
first = sg->offset >> 9;
seg->first_sect = first;
seg->last_sect = first + count - 1;
nsecs += count;
}
ring->ring.req_prod_pvt++;
do_gettimeofday(&request->time);
if (request->operation == BLKIF_OP_WRITE) {
tap->stats.st_wr_sect += nsecs;
tap->stats.st_wr_req++;
}
if (request->operation == BLKIF_OP_READ) {
tap->stats.st_rd_sect += nsecs;
tap->stats.st_rd_req++;
}
}
breq,是从设备IO环(blktapX字符设备的成员ring,是一个blkif_front_ring的结构,所以我们可以把blktapX看做另一个前端,而对应的后端应该就是tapdevX设备)找一个可用的blkif_request,把blktap_request的内容填到这个空闲的blkif_request里面。同时对于blkif_request 的每个segment,计算出segment的first_sect, last_sect(以512字节的sector为单位)。
最后blktap_ring->ring.req_prod_pvt++相当于把这个request挂到了设备IO环里
来源:CSDN
作者:majieyue
链接:https://blog.csdn.net/majieyue/article/details/6709300