接前一篇文章: QEMU源码全解析 —— 块设备虚拟化(21)
本文内容参考:
《 QEMU /KVM源码解析与应用》 —— 李强,机械工业出版社
特此致谢!
QEMU启动过程中的块设备虚拟化
上一回沿着bdrv_open函数 ---> bdrv_open_inherit函数 ---> bdrv_open_common函数,讲解了bdrv_open_common函数的两大关键步骤:
bdrv_open_driver函数也在block中,代码如下:
/*
* The caller must always hold @bs AioContext lock, because this function calls
* bdrv_refresh_total_sectors() which polls when called from non-coroutine
* context.
*/
static int no_coroutine_fn GRAPH_UNLOCKED
bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv, const char *node_name,
QDict *options, int open_flags, Error **errp)
{
AioContext *ctx;
Error *local_err = NULL;
int i, ret;
GLOBAL_STATE_CODE();
bdrv_assign_node_name(bs, node_name, &local_err);
if (local_err) {
error_propagate(errp, local_err);
return -EINVAL;
}
bs->drv = drv;
bs->opaque = g_malloc0(drv->instance_size);
if (drv->bdrv_file_open) {
assert(!drv->bdrv_needs_filename || bs->filename[0]);
ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
} else if (drv->bdrv_open) {
ret = drv->bdrv_open(bs, options, open_flags, &local_err);
} else {
ret = 0;
}
if (ret < 0) {
if (local_err) {
error_propagate(errp, local_err);
} else if (bs->filename[0]) {
error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
} else {
error_setg_errno(errp, -ret, "Could not open image");
}
goto open_failed;
}
assert(!(bs->supported_read_flags & ~BDRV_REQ_MASK));
assert(!(bs->supported_write_flags & ~BDRV_REQ_MASK));
/*
* Always allow the BDRV_REQ_REGISTERED_BUF optimization hint. This saves
* drivers that pass read/write requests through to a child the trouble of
* declaring support explicitly.
*
* Drivers must not propagate this flag accidentally when they initiate I/O
* to a bounce buffer. That case should be rare though.
*/
bs->supported_read_flags |= BDRV_REQ_REGISTERED_BUF;
bs->supported_write_flags |= BDRV_REQ_REGISTERED_BUF;
/* Get the context after .bdrv_open, it can change the context */
ctx = bdrv_get_aio_context(bs);
aio_context_acquire(ctx);
ret = bdrv_refresh_total_sectors(bs, bs->total_sectors);
if (ret < 0) {
error_setg_errno(errp, -ret, "Could not refresh total sector count");
aio_context_release(ctx);
return ret;
}
bdrv_graph_rdlock_main_loop();
bdrv_refresh_limits(bs, NULL, &local_err);
bdrv_graph_rdunlock_main_loop();
aio_context_release(ctx);
if (local_err) {
error_propagate(errp, local_err);
return -EINVAL;
}
assert(bdrv_opt_mem_align(bs) != 0);
assert(bdrv_min_mem_align(bs) != 0);
assert(is_power_of_2(bs->bl.request_alignment));
for (i = 0; i < bs->quiesce_counter; i++) {
if (drv->bdrv_drain_begin) {
drv->bdrv_drain_begin(bs);
}
}
return 0;
open_failed:
bs->drv = NULL;
if (bs->file != NULL) {
bdrv_unref_child(bs, bs->file);
assert(!bs->file);
}
g_free(bs->opaque);
bs->opaque = NULL;
return ret;
}
上一回也讲了QEMU支持的几种磁盘文件格式,包括raw、qcow2、vmdk、vdi等。对于不同的格式,打开的方式不尽相同。以raw和qcow2为例,它们的BlockDriver的定义分别如下:
- raw
block/raw-format.c中:
BlockDriver bdrv_raw = {
.format_name = "raw",
.instance_size = sizeof(BDRVRawState),
.supports_zoned_children = true,
.bdrv_probe = &raw_probe,
.bdrv_reopen_prepare = &raw_reopen_prepare,
.bdrv_reopen_commit = &raw_reopen_commit,
.bdrv_reopen_abort = &raw_reopen_abort,
.bdrv_open = &raw_open,
.bdrv_child_perm = raw_child_perm,
.bdrv_co_create_opts = &raw_co_create_opts,
.bdrv_co_preadv = &raw_co_preadv,
.bdrv_co_pwritev = &raw_co_pwritev,
.bdrv_co_pwrite_zeroes = &raw_co_pwrite_zeroes,
.bdrv_co_pdiscard = &raw_co_pdiscard,
.bdrv_co_zone_report = &raw_co_zone_report,
.bdrv_co_zone_mgmt = &raw_co_zone_mgmt,
.bdrv_co_zone_append = &raw_co_zone_append,
.bdrv_co_block_status = &raw_co_block_status,
.bdrv_co_copy_range_from = &raw_co_copy_range_from,
.bdrv_co_copy_range_to = &raw_co_copy_range_to,
.bdrv_co_truncate = &raw_co_truncate,
.bdrv_co_getlength = &raw_co_getlength,
.is_format = true,
.bdrv_measure = &raw_measure,
.bdrv_co_get_info = &raw_co_get_info,
.bdrv_refresh_limits = &raw_refresh_limits,
.bdrv_probe_blocksizes = &raw_probe_blocksizes,
.bdrv_probe_geometry = &raw_probe_geometry,
.bdrv_co_eject = &raw_co_eject,
.bdrv_co_lock_medium = &raw_co_lock_medium,
.bdrv_co_ioctl = &raw_co_ioctl,
.create_opts = &raw_create_opts,
.bdrv_has_zero_init = &raw_has_zero_init,
.strong_runtime_opts = raw_strong_runtime_opts,
.mutable_opts = mutable_opts,
.bdrv_cancel_in_flight = raw_cancel_in_flight,
};
static void bdrv_raw_init(void)
{
bdrv_register(&bdrv_raw);
}
block_init(bdrv_raw_init);
- qcow2
block/qcow2.c中:
BlockDriver bdrv_qcow2 = {
.format_name = "qcow2",
.instance_size = sizeof(BDRVQcow2State),
.bdrv_probe = qcow2_probe,
.bdrv_open = qcow2_open,
.bdrv_close = qcow2_close,
.bdrv_reopen_prepare = qcow2_reopen_prepare,
.bdrv_reopen_commit = qcow2_reopen_commit,
.bdrv_reopen_commit_post = qcow2_reopen_commit_post,
.bdrv_reopen_abort = qcow2_reopen_abort,
.bdrv_join_options = qcow2_join_options,
.bdrv_child_perm = bdrv_default_perms,
.bdrv_co_create_opts = qcow2_co_create_opts,
.bdrv_co_create = qcow2_co_create,
.bdrv_has_zero_init = qcow2_has_zero_init,
.bdrv_co_block_status = qcow2_co_block_status,
.bdrv_co_preadv_part = qcow2_co_preadv_part,
.bdrv_co_pwritev_part = qcow2_co_pwritev_part,
.bdrv_co_flush_to_os = qcow2_co_flush_to_os,
.bdrv_co_pwrite_zeroes = qcow2_co_pwrite_zeroes,
.bdrv_co_pdiscard = qcow2_co_pdiscard,
.bdrv_co_copy_range_from = qcow2_co_copy_range_from,
.bdrv_co_copy_range_to = qcow2_co_copy_range_to,
.bdrv_co_truncate = qcow2_co_truncate,
.bdrv_co_pwritev_compressed_part = qcow2_co_pwritev_compressed_part,
.bdrv_make_empty = qcow2_make_empty,
.bdrv_snapshot_create = qcow2_snapshot_create,
.bdrv_snapshot_goto = qcow2_snapshot_goto,
.bdrv_snapshot_delete = qcow2_snapshot_delete,
.bdrv_snapshot_list = qcow2_snapshot_list,
.bdrv_snapshot_load_tmp = qcow2_snapshot_load_tmp,
.bdrv_measure = qcow2_measure,
.bdrv_co_get_info = qcow2_co_get_info,
.bdrv_get_specific_info = qcow2_get_specific_info,
.bdrv_co_save_vmstate = qcow2_co_save_vmstate,
.bdrv_co_load_vmstate = qcow2_co_load_vmstate,
.is_format = true,
.supports_backing = true,
.bdrv_change_backing_file = qcow2_change_backing_file,
.bdrv_refresh_limits = qcow2_refresh_limits,
.bdrv_co_invalidate_cache = qcow2_co_invalidate_cache,
.bdrv_inactivate = qcow2_inactivate,
.create_opts = &qcow2_create_opts,
.amend_opts = &qcow2_amend_opts,
.strong_runtime_opts = qcow2_strong_runtime_opts,
.mutable_opts = mutable_opts,
.bdrv_co_check = qcow2_co_check,
.bdrv_amend_options = qcow2_amend_options,
.bdrv_co_amend = qcow2_co_amend,
.bdrv_detach_aio_context = qcow2_detach_aio_context,
.bdrv_attach_aio_context = qcow2_attach_aio_context,
.bdrv_supports_persistent_dirty_bitmap =
qcow2_supports_persistent_dirty_bitmap,
.bdrv_co_can_store_new_dirty_bitmap = qcow2_co_can_store_new_dirty_bitmap,
.bdrv_co_remove_persistent_dirty_bitmap =
qcow2_co_remove_persistent_dirty_bitmap,
};
static void bdrv_qcow2_init(void)
{
bdrv_register(&bdrv_qcow2);
}
block_init(bdrv_qcow2_init);
那么,对于raw格式来说,drv->bdrv_open()调用的就是raw_open函数;对于qcow2格式来说,drv->bdrv_open()调用的是qcow2_open函数。
BlockDriver bdrv_raw = {
.format_name = "raw",
.instance_size = sizeof(BDRVRawState),
.supports_zoned_children = true,
.bdrv_probe = &raw_probe,
.bdrv_reopen_prepare = &raw_reopen_prepare,
.bdrv_reopen_commit = &raw_reopen_commit,
.bdrv_reopen_abort = &raw_reopen_abort,
.bdrv_open = &raw_open,
……
};
BlockDriver bdrv_qcow2 = {
.format_name = "qcow2",
.instance_size = sizeof(BDRVQcow2State),
.bdrv_probe = qcow2_probe,
.bdrv_open = qcow2_open,
.bdrv_close = qcow2_close,
……
};
这次先来看qcow2格式对应的qcow2_open函数,在block/qcow2.c中,代码如下:
static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
Error **errp)
{
BDRVQcow2State *s = bs->opaque;
QCow2OpenCo qoc = {
.bs = bs,
.options = options,
.flags = flags,
.errp = errp,
.ret = -EINPROGRESS
};
int ret;
ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
if (ret < 0) {
return ret;
}
/* Initialise locks */
qemu_co_mutex_init(&s->lock);
assert(!qemu_in_coroutine());
assert(qemu_get_current_aio_context() == qemu_get_aio_context());
aio_co_enter(bdrv_get_aio_context(bs),
qemu_coroutine_create(qcow2_open_entry, &qoc));
AIO_WAIT_WHILE_UNLOCKED(NULL, qoc.ret == -EINPROGRESS);
return qoc.ret;
}
在qcow2_open函数中,会通过qemu_coroutine_create()以及aio_co_center()进入一个协程coroutine。什么是协程呢?知识补强一下。
协程是一种轻量级的线程,也可以称为用户态线程。传统线程的调度是由内核完成的,而协程则可以在用户层自主调度。协程的优点包括:
- 上下文切换效率高。因为切换对内核来说是无感知的,其切换开销基本和函数调用差不多。
- 协程间共享数据无需加锁。因为它们还是在一个线程上运行,并不会真正并行执行。
线程是一个内核的概念,创建的每一个线程,在内核中都对应一个task_struct对象,内核的调度也是以线程为单位的。这对于普通进程没有什么问题,但别忘了此处的场景是QEMU虚拟机。如果在用户态和内核态之间切来切去,那么代价会比较大(尤其还涉及到虚拟机的状态)。
但是,QEMU中的设备也需要有并发能力,那怎么办呢?就在用户态实现一个类似线程的机制,即引入协程,用它来实现并发,并且无须切换到内核,调度全部在用户态完成。
这里,打开一个qcow2格式的文件就使用一个协程。创建一个协程和创建一个线程很像,也需要指定一个函数(协程函数)来执行,上边代码中的qcow2_open_entry就是协程函数。
qcow2_open_entry函数也在block/qcow2.c中,代码如下:
static void coroutine_fn qcow2_open_entry(void *opaque)
{
QCow2OpenCo *qoc = opaque;
BDRVQcow2State *s = qoc->bs->opaque;
GRAPH_RDLOCK_GUARD();
qemu_co_mutex_lock(&s->lock);
qoc->ret = qcow2_do_open(qoc->bs, qoc->options, qoc->flags, true,
qoc->errp);
qemu_co_mutex_unlock(&s->lock);
aio_wait_kick();
}
看到函数名前边那个coroutine_fn了吧?这就说明该函数是一个协程函数。核心是调用了qcow2_do_open函数。qcow2_do_open函数也在block/qcow2.c中,代码很长(将近600行),在此就不深入解析了,只要知道其功能是根据qcow2的格式打开磁盘文件就可以了。
这样,qcow2格式对应的qcow2_open函数就基本解析完了,下一回解析raw格式对应的raw_open函数。