Ceph version: 0.94.1
QEMU version: 2.4.0
File under analysis: QEMU's rbd block driver, block/rbd.c
The qemu_rbd_open function operates on an important data structure, the struct BDRVRBDState, which holds the state the whole driver depends on.
typedef struct BDRVRBDState {
    rados_t cluster;                    /* handle to the RADOS cluster */
    rados_ioctx_t io_ctx;               /* IO context of the cluster (bound to one pool) */
    rbd_image_t image;                  /* handle to the opened rbd image */
    char name[RBD_MAX_IMAGE_NAME_SIZE]; /* name of the rbd image */
    char *snap;                         /* name of the rbd image snapshot, if any */
} BDRVRBDState;
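For context on where this state lives: the block layer allocates bs->opaque with the size the driver declares in its BlockDriver definition. An abridged sketch of the rbd driver registration, with most callbacks trimmed (quoted from memory of the QEMU 2.4 source, so treat the exact field list as approximate):

static BlockDriver bdrv_rbd = {
    .format_name     = "rbd",
    .instance_size   = sizeof(BDRVRBDState), /* size of bs->opaque */
    .bdrv_file_open  = qemu_rbd_open,
    .bdrv_close      = qemu_rbd_close,
    .bdrv_getlength  = qemu_rbd_getlength,
    .bdrv_aio_readv  = qemu_rbd_aio_readv,
    .bdrv_aio_writev = qemu_rbd_aio_writev,
    /* ... flush, discard, snapshot and create callbacks omitted ... */
};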
Analysis of the qemu_rbd_open function:
static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
                         Error **errp)
{
    BDRVRBDState *s = bs->opaque;
    char pool[RBD_MAX_POOL_NAME_SIZE];
    char snap_buf[RBD_MAX_SNAP_NAME_SIZE];
    char conf[RBD_MAX_CONF_SIZE];
    char clientname_buf[RBD_MAX_CONF_SIZE];
    char *clientname;
    QemuOpts *opts;
    Error *local_err = NULL;
    const char *filename;
    int r;

    /* This part appears to handle the runtime options */
    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
    qemu_opts_absorb_qdict(opts, options, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        qemu_opts_del(opts);
        return -EINVAL;
    }

    filename = qemu_opt_get(opts, "filename");

    /* Parse the filename and fill in the individual fields */
    if (qemu_rbd_parsename(filename, pool, sizeof(pool),
                           snap_buf, sizeof(snap_buf),
                           s->name, sizeof(s->name),
                           conf, sizeof(conf), errp) < 0) {
        r = -EINVAL;
        goto failed_opts;
    }
    /*
     * pool:     name of the pool
     * snap_buf: name of the image snapshot
     * s->name:  name of the rbd image
     * conf:     remaining configuration options
     */

    clientname = qemu_rbd_parse_clientname(conf, clientname_buf);
    /* clientname_buf: the client (RADOS user) name parsed out of conf */

    r = rados_create(&s->cluster, clientname); /* create the cluster handle */
    if (r < 0) {
        error_setg(errp, "error initializing");
        goto failed_opts;
    }

    /* Copy the contents of snap_buf into s->snap */
    s->snap = NULL;
    if (snap_buf[0] != '\0') {
        s->snap = g_strdup(snap_buf);
    }

    /* Read the configuration file; if the user did not specify one, try the
     * default location */
    if (strstr(conf, "conf=") == NULL) {
        /* try default location, but ignore failure */
        rados_conf_read_file(s->cluster, NULL);
    } else if (conf[0] != '\0') {
        r = qemu_rbd_set_conf(s->cluster, conf, true, errp);
        if (r < 0) {
            goto failed_shutdown;
        }
    }

    /* (The original author was unsure about this step.) With the conf file
     * already read above, this second pass applies the remaining key=value
     * options, so they take precedence over the file's settings */
    if (conf[0] != '\0') {
        r = qemu_rbd_set_conf(s->cluster, conf, false, errp);
        if (r < 0) {
            goto failed_shutdown;
        }
    }

    /*
     * Fallback to more conservative semantics if setting cache
     * options fails. Ignore errors from setting rbd_cache because the
     * only possible error is that the option does not exist, and
     * librbd defaults to no caching. If write through caching cannot
     * be set up, fall back to no caching.
     */
    /* Configure the rbd cache according to the QEMU cache flags */
    if (flags & BDRV_O_NOCACHE) {
        rados_conf_set(s->cluster, "rbd_cache", "false");
    } else {
        rados_conf_set(s->cluster, "rbd_cache", "true");
    }

    r = rados_connect(s->cluster); /* connect to the cluster */
    if (r < 0) {
        error_setg(errp, "error connecting");
        goto failed_shutdown;
    }

    r = rados_ioctx_create(s->cluster, pool, &s->io_ctx); /* create the IO context for the pool */
    if (r < 0) {
        error_setg(errp, "error opening pool %s", pool);
        goto failed_shutdown;
    }

    r = rbd_open(s->io_ctx, s->name, &s->image, s->snap); /* with the IO context in hand, open the rbd image */
    if (r < 0) {
        error_setg(errp, "error reading header from %s", s->name);
        goto failed_open;
    }

    bs->read_only = (s->snap != NULL);

    qemu_opts_del(opts);
    return 0;

failed_open:
    rados_ioctx_destroy(s->io_ctx);
failed_shutdown:
    rados_shutdown(s->cluster);
    g_free(s->snap);
failed_opts:
    qemu_opts_del(opts);
    return r;
}
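To make the parsing step at the top of the function concrete, here is a hypothetical filename of the kind QEMU passes in, and roughly what qemu_rbd_parsename extracts from it (the exact option handling lives in qemu_rbd_parsename, which is not reproduced here):

/* QEMU rbd filenames have the form rbd:<pool>/<image>[@<snapshot>][:<key>=<value>...] */
const char *filename = "rbd:ssd_pool/vm_disk@snap1:conf=/etc/ceph/ceph.conf:id=admin";
/* After qemu_rbd_parsename (roughly):
 *   pool     = "ssd_pool"
 *   s->name  = "vm_disk"
 *   snap_buf = "snap1"
 *   conf     = the remaining key=value options
 * qemu_rbd_parse_clientname then pulls "id=admin" out of conf, so
 * clientname becomes "admin".
 */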
As you can see, once qemu_rbd_open has finished, all the information about the rbd image is stored in bs->opaque, where the other driver callbacks can make use of it.
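For example, the driver's getlength callback only needs the image handle stashed there; roughly (paraphrased from the QEMU 2.4 source, so minor details may differ):

static int64_t qemu_rbd_getlength(BlockDriverState *bs)
{
    BDRVRBDState *s = bs->opaque;    /* state filled in by qemu_rbd_open */
    rbd_image_info_t info;
    int r;

    r = rbd_stat(s->image, &info, sizeof(info)); /* ask librbd for image metadata */
    if (r < 0) {
        return r;
    }

    return info.size;
}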
The qemu_rbd_close function is very simple and is not analysed line by line here.
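It essentially just undoes qemu_rbd_open; a rough sketch (again paraphrased from the QEMU source):

static void qemu_rbd_close(BlockDriverState *bs)
{
    BDRVRBDState *s = bs->opaque;

    rbd_close(s->image);             /* close the rbd image */
    rados_ioctx_destroy(s->io_ctx);  /* destroy the IO context */
    g_free(s->snap);
    rados_shutdown(s->cluster);      /* disconnect from the cluster */
}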
QEMU appears to perform its rbd reads and writes asynchronously. The read, write and flush functions are analysed below:
/* Asynchronous read */
static BlockAIOCB *qemu_rbd_aio_readv(BlockDriverState *bs,
                                      int64_t sector_num,
                                      QEMUIOVector *qiov,
                                      int nb_sectors,
                                      BlockCompletionFunc *cb,
                                      void *opaque)
{
    return rbd_start_aio(bs, sector_num, qiov, nb_sectors, cb, opaque,
                         RBD_AIO_READ);
}

/* Asynchronous write */
static BlockAIOCB *qemu_rbd_aio_writev(BlockDriverState *bs,
                                       int64_t sector_num,
                                       QEMUIOVector *qiov,
                                       int nb_sectors,
                                       BlockCompletionFunc *cb,
                                       void *opaque)
{
    return rbd_start_aio(bs, sector_num, qiov, nb_sectors, cb, opaque,
                         RBD_AIO_WRITE);
}

/* Flush */
#ifdef LIBRBD_SUPPORTS_AIO_FLUSH
static BlockAIOCB *qemu_rbd_aio_flush(BlockDriverState *bs,
                                      BlockCompletionFunc *cb,
                                      void *opaque)
{
    return rbd_start_aio(bs, 0, NULL, 0, cb, opaque, RBD_AIO_FLUSH);
}
#else
static int qemu_rbd_co_flush(BlockDriverState *bs)
{
#if LIBRBD_VERSION_CODE >= LIBRBD_VERSION(0, 1, 1)
    /* rbd_flush added in 0.1.1 */
    BDRVRBDState *s = bs->opaque;
    return rbd_flush(s->image);
#else
    return 0;
#endif
}
#endif

#ifdef LIBRBD_SUPPORTS_DISCARD
static BlockAIOCB *qemu_rbd_aio_discard(BlockDriverState *bs,
                                        int64_t sector_num,
                                        int nb_sectors,
                                        BlockCompletionFunc *cb,
                                        void *opaque)
{
    return rbd_start_aio(bs, sector_num, NULL, nb_sectors, cb, opaque,
                         RBD_AIO_DISCARD);
}
#endif
So reads, writes and flushes are all carried out through rbd_start_aio (flush, of course, only when librbd supports asynchronous flush; otherwise the synchronous rbd_flush fallback above is used). What does rbd_start_aio look like?
/* Function that kicks off an asynchronous IO operation.
 * In the parameter list, sector_num, qiov, nb_sectors and cb all appear to
 * come from QEMU's block layer and describe the disk IO request. */
static BlockAIOCB *rbd_start_aio(BlockDriverState *bs,
                                 int64_t sector_num,
                                 QEMUIOVector *qiov,
                                 int nb_sectors,
                                 BlockCompletionFunc *cb,
                                 void *opaque,
                                 RBDAIOCmd cmd)
{
    RBDAIOCB *acb;
    RADOSCB *rcb = NULL;
    rbd_completion_t c;
    int64_t off, size;
    char *buf;
    int r;

    BDRVRBDState *s = bs->opaque;

    acb = qemu_aio_get(&rbd_aiocb_info, bs, cb, opaque);
    acb->cmd = cmd;
    acb->qiov = qiov;
    if (cmd == RBD_AIO_DISCARD || cmd == RBD_AIO_FLUSH) {
        acb->bounce = NULL;
    } else {
        acb->bounce = qemu_try_blockalign(bs, qiov->size);
        if (acb->bounce == NULL) {
            goto failed;
        }
    }
    acb->ret = 0;
    acb->error = 0;
    acb->s = s;
    acb->bh = NULL;

    if (cmd == RBD_AIO_WRITE) {
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
    }

    buf = acb->bounce;

    off = sector_num * BDRV_SECTOR_SIZE;
    size = nb_sectors * BDRV_SECTOR_SIZE;

    rcb = g_new(RADOSCB, 1);
    rcb->acb = acb;
    rcb->buf = buf;
    rcb->s = acb->s;
    rcb->size = size;
    r = rbd_aio_create_completion(rcb, (rbd_callback_t) rbd_finish_aiocb, &c);
    if (r < 0) {
        goto failed;
    }
    /* acb and rcb are the bookkeeping structures for this disk IO: they record
     * the offset, the size and the data to be written. The code above has
     * filled them in. */

    /* Dispatch on the command type and issue the operation against the image */
    switch (cmd) {
    case RBD_AIO_WRITE:
        r = rbd_aio_write(s->image, off, size, buf, c);
        break;
    case RBD_AIO_READ:
        r = rbd_aio_read(s->image, off, size, buf, c);
        break;
    case RBD_AIO_DISCARD:
        r = rbd_aio_discard_wrapper(s->image, off, size, c);
        break;
    case RBD_AIO_FLUSH:
        r = rbd_aio_flush_wrapper(s->image, c);
        break;
    default:
        r = -EINVAL;
    }

    if (r < 0) {
        goto failed_completion;
    }

    return &acb->common;

failed_completion:
    rbd_aio_release(c);
failed:
    g_free(rcb);
    qemu_vfree(acb->bounce);
    qemu_aio_unref(acb);
    return NULL;
}
From this walk through rbd.c we can see that whether the request is a read or a write, everything ultimately operates on the image handle.
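The same asynchronous pattern that rbd_start_aio wraps can also be exercised directly against librbd, outside of QEMU. A minimal sketch, assuming the image has already been opened as in qemu_rbd_open and with error handling trimmed (the names read_done and read_first_4k are invented for illustration):

#include <stdio.h>
#include <rbd/librbd.h>

/* Plays the same role as rbd_finish_aiocb in rbd.c: runs when the IO completes. */
static void read_done(rbd_completion_t c, void *arg)
{
    printf("aio read returned %ld\n", (long)rbd_aio_get_return_value(c));
}

static int read_first_4k(rbd_image_t image, char *buf)
{
    rbd_completion_t c;
    int r;

    r = rbd_aio_create_completion(NULL, read_done, &c);
    if (r < 0) {
        return r;
    }
    r = rbd_aio_read(image, 0, 4096, buf, c);   /* offset 0, length 4 KiB */
    if (r < 0) {
        rbd_aio_release(c);
        return r;
    }
    rbd_aio_wait_for_complete(c);               /* or return and let the callback fire */
    r = rbd_aio_get_return_value(c);
    rbd_aio_release(c);
    return r;
}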
Next, let's look at how rbd_open uses the IO context and the snapshot name to open an rbd image.
The Ceph documentation does not explain how to use the C/C++ version of librbd, which is disappointing; the only option is to work it out by reading the QEMU and Ceph sources side by side.
Ceph source version: 0.94.1
The C interface of librbd is declared in src/include/rbd/librbd.h.
The declaration of rbd_open is:
CEPH_RBD_API int rbd_open(rados_ioctx_t io, const char *name, rbd_image_t *image, const char *snap_name);
The declaration of rbd_close is:
CEPH_RBD_API int rbd_close(rbd_image_t image);
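Putting the two declarations together, the basic lifecycle of an image through the C API looks roughly like the sketch below (the pool name "rbd", the image name "test" and the use of the default client/conf locations are made-up example values; error handling is omitted):

#include <rados/librados.h>
#include <rbd/librbd.h>

int open_image_example(void)
{
    rados_t cluster;
    rados_ioctx_t io_ctx;
    rbd_image_t image;

    rados_create(&cluster, NULL);                /* NULL: default "client.admin" */
    rados_conf_read_file(cluster, NULL);         /* NULL: default ceph.conf locations */
    rados_connect(cluster);

    rados_ioctx_create(cluster, "rbd", &io_ctx); /* bind the IO context to a pool */
    rbd_open(io_ctx, "test", &image, NULL);      /* NULL snap: open the image head */

    /* ... rbd_read()/rbd_write()/rbd_aio_*() calls go here ... */

    rbd_close(image);
    rados_ioctx_destroy(io_ctx);
    rados_shutdown(cluster);
    return 0;
}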
Our goal is to let one image be connected to two pools, an SSD pool and an HDD pool, while modifying the source code as little as possible. My current idea is:
keep a "current" default pool and map the image onto it; the pool the image corresponds to can then change at runtime, so the code that reads and writes the image does not have to be rewritten. A rough sketch of the idea follows.
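One hypothetical way to express that idea, with every name below invented purely for illustration (this is not code from QEMU or Ceph), is to keep one IO context per pool in the driver state and re-open the image from whichever pool is currently active:

/* Hypothetical extension of BDRVRBDState for the two-pool idea. */
typedef struct BDRVRBDDualState {
    rados_t cluster;
    rados_ioctx_t ssd_io_ctx;      /* IO context bound to the SSD pool */
    rados_ioctx_t hdd_io_ctx;      /* IO context bound to the HDD pool */
    rados_ioctx_t current_io_ctx;  /* pool the image currently lives in */
    rbd_image_t image;
    char name[RBD_MAX_IMAGE_NAME_SIZE];
} BDRVRBDDualState;

/* Switch the image over to another pool; callers keep using s->image as before,
 * so the read/write paths (rbd_start_aio etc.) stay untouched. */
static int rbd_switch_pool(BDRVRBDDualState *s, rados_ioctx_t new_ctx)
{
    rbd_image_t new_image;
    int r = rbd_open(new_ctx, s->name, &new_image, NULL);
    if (r < 0) {
        return r;
    }
    rbd_close(s->image);
    s->image = new_image;
    s->current_io_ctx = new_ctx;
    return 0;
}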
The functions declared in librbd.h are defined in src/librbd/librbd.cc:
extern "C" int rbd_open(rados_ioctx_t p, const char *name, rbd_image_t *image,
const char *snap_name)
{
librados::IoCtx io_ctx;
librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
librbd::ImageCtx *ictx = new librbd::ImageCtx(name, "", snap_name, io_ctx,
false); //创建镜像上下文
tracepoint(librbd, open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only); //创建回溯点?
int r = librbd::open_image(ictx);
//打开rbd镜像
if (r >= 0)
*image = (rbd_image_t)ictx;
将镜像上下文赋给image,提供给函数的调用者使用
tracepoint(librbd, open_image_exit, r);
return r;
}
extern "C" int rbd_close(rbd_image_t image)
{
librbd::ImageCtx *ctx = (librbd::ImageCtx *)image;
tracepoint(librbd, close_image_enter, ctx, ctx->name.c_str(), ctx->id.c_str());
librbd::close_image(ctx);
tracepoint(librbd, close_image_exit);
return 0;
}
Source: oschina
Link: https://my.oschina.net/u/1047616/blog/525156