接前一篇文章: QEMU源码全解析 —— 块设备虚拟化(20)
本文内容参考:
《 QEMU /KVM源码解析与应用》 —— 李强,机械工业出版社
特此致谢!
QEMU启动过程中的块设备虚拟化
上一回开始解析blockdev_init函数,讲到了其中调用的blk_new_open函数,该函数的作用是打开宿主机上的硬盘文件。blk_new_open函数中又调用bdrv_open函数,其在block.c中,代码如下:
/* The caller must always hold the main AioContext lock. */
BlockDriverState *bdrv_open(const char *filename, const char *reference,
QDict *options, int flags, Error **errp)
{
GLOBAL_STATE_CODE();
return bdrv_open_inherit(filename, reference, options, flags, NULL,
NULL, 0, errp);
}
bdrv_open_inherit函数也在block.c中,代码如下:
/*
* Opens a disk image (raw, qcow2, vmdk, ...)
*
* options is a QDict of options to pass to the block drivers, or NULL for an
* empty set of options. The reference to the QDict belongs to the block layer
* after the call (even on failure), so if the caller intends to reuse the
* dictionary, it needs to use qobject_ref() before calling bdrv_open.
*
* If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
* If it is not NULL, the referenced BDS will be reused.
*
* The reference parameter may be used to specify an existing block device which
* should be opened. If specified, neither options nor a filename may be given,
* nor can an existing BDS be reused (that is, *pbs has to be NULL).
*
* The caller must always hold the main AioContext lock.
*/
static BlockDriverState * no_coroutine_fn
bdrv_open_inherit(const char *filename, const char *reference, QDict *options,
int flags, BlockDriverState *parent,
const BdrvChildClass *child_class, BdrvChildRole child_role,
Error **errp)
{
int ret;
BlockBackend *file = NULL;
BlockDriverState *bs;
BlockDriver *drv = NULL;
BdrvChild *child;
const char *drvname;
const char *backing;
Error *local_err = NULL;
QDict *snapshot_options = NULL;
int snapshot_flags = 0;
AioContext *ctx = qemu_get_aio_context();
assert(!child_class || !flags);
assert(!child_class == !parent);
GLOBAL_STATE_CODE();
assert(!qemu_in_coroutine());
if (reference) {
bool options_non_empty = options ? qdict_size(options) : false;
qobject_unref(options);
if (filename || options_non_empty) {
error_setg(errp, "Cannot reference an existing block device with "
"additional options or a new filename");
return NULL;
}
bs = bdrv_lookup_bs(reference, reference, errp);
if (!bs) {
return NULL;
}
bdrv_ref(bs);
return bs;
}
bs = bdrv_new();
/* NULL means an empty set of options */
if (options == NULL) {
options = qdict_new();
}
/* json: syntax counts as explicit options, as if in the QDict */
parse_json_protocol(options, &filename, &local_err);
if (local_err) {
goto fail;
}
bs->explicit_options = qdict_clone_shallow(options);
if (child_class) {
bool parent_is_format;
if (parent->drv) {
parent_is_format = parent->drv->is_format;
} else {
/*
* parent->drv is not set yet because this node is opened for
* (potential) format probing. That means that @parent is going
* to be a format node.
*/
parent_is_format = true;
}
bs->inherits_from = parent;
child_class->inherit_options(child_role, parent_is_format,
&flags, options,
parent->open_flags, parent->options);
}
ret = bdrv_fill_options(&options, filename, &flags, &local_err);
if (ret < 0) {
goto fail;
}
/*
* Set the BDRV_O_RDWR and BDRV_O_ALLOW_RDWR flags.
* Caution: getting a boolean member of @options requires care.
* When @options come from -blockdev or blockdev_add, members are
* typed according to the QAPI schema, but when they come from
* -drive, they're all QString.
*/
if (g_strcmp0(qdict_get_try_str(options, BDRV_OPT_READ_ONLY), "on") &&
!qdict_get_try_bool(options, BDRV_OPT_READ_ONLY, false)) {
flags |= (BDRV_O_RDWR | BDRV_O_ALLOW_RDWR);
} else {
flags &= ~BDRV_O_RDWR;
}
if (flags & BDRV_O_SNAPSHOT) {
snapshot_options = qdict_new();
bdrv_temp_snapshot_options(&snapshot_flags, snapshot_options,
flags, options);
/* Let bdrv_backing_options() override "read-only" */
qdict_del(options, BDRV_OPT_READ_ONLY);
bdrv_inherited_options(BDRV_CHILD_COW, true,
&flags, options, flags, options);
}
bs->open_flags = flags;
bs->options = options;
options = qdict_clone_shallow(options);
/* Find the right image format driver */
/* See cautionary note on accessing @options above */
drvname = qdict_get_try_str(options, "driver");
if (drvname) {
drv = bdrv_find_format(drvname);
if (!drv) {
error_setg(errp, "Unknown driver: '%s'", drvname);
goto fail;
}
}
assert(drvname || !(flags & BDRV_O_PROTOCOL));
/* See cautionary note on accessing @options above */
backing = qdict_get_try_str(options, "backing");
if (qobject_to(QNull, qdict_get(options, "backing")) != NULL ||
(backing && *backing == '\0'))
{
if (backing) {
warn_report("Use of \"backing\": \"\" is deprecated; "
"use \"backing\": null instead");
}
flags |= BDRV_O_NO_BACKING;
qdict_del(bs->explicit_options, "backing");
qdict_del(bs->options, "backing");
qdict_del(options, "backing");
}
/* Open image file without format layer. This BlockBackend is only used for
* probing, the block drivers will do their own bdrv_open_child() for the
* same BDS, which is why we put the node name back into options. */
if ((flags & BDRV_O_PROTOCOL) == 0) {
BlockDriverState *file_bs;
file_bs = bdrv_open_child_bs(filename, options, "file", bs,
&child_of_bds, BDRV_CHILD_IMAGE,
true, &local_err);
if (local_err) {
goto fail;
}
if (file_bs != NULL) {
/* Not requesting BLK_PERM_CONSISTENT_READ because we're only
* looking at the header to guess the image format. This works even
* in cases where a guest would not see a consistent state. */
ctx = bdrv_get_aio_context(file_bs);
aio_context_acquire(ctx);
file = blk_new(ctx, 0, BLK_PERM_ALL);
blk_insert_bs(file, file_bs, &local_err);
bdrv_unref(file_bs);
aio_context_release(ctx);
if (local_err) {
goto fail;
}
qdict_put_str(options, "file", bdrv_get_node_name(file_bs));
}
}
/* Image format probing */
bs->probed = !drv;
if (!drv && file) {
ret = find_image_format(file, filename, &drv, &local_err);
if (ret < 0) {
goto fail;
}
/*
* This option update would logically belong in bdrv_fill_options(),
* but we first need to open bs->file for the probing to work, while
* opening bs->file already requires the (mostly) final set of options
* so that cache mode etc. can be inherited.
*
* Adding the driver later is somewhat ugly, but it's not an option
* that would ever be inherited, so it's correct. We just need to make
* sure to update both bs->options (which has the full effective
* options for bs) and options (which has file.* already removed).
*/
qdict_put_str(bs->options, "driver", drv->format_name);
qdict_put_str(options, "driver", drv->format_name);
} else if (!drv) {
error_setg(errp, "Must specify either driver or file");
goto fail;
}
/* BDRV_O_PROTOCOL must be set iff a protocol BDS is about to be created */
assert(!!(flags & BDRV_O_PROTOCOL) == !!drv->bdrv_file_open);
/* file must be NULL if a protocol BDS is about to be created
* (the inverse results in an error message from bdrv_open_common()) */
assert(!(flags & BDRV_O_PROTOCOL) || !file);
/* Open the image */
ret = bdrv_open_common(bs, file, options, &local_err);
if (ret < 0) {
goto fail;
}
/* The AioContext could have changed during bdrv_open_common() */
ctx = bdrv_get_aio_context(bs);
if (file) {
aio_context_acquire(ctx);
blk_unref(file);
aio_context_release(ctx);
file = NULL;
}
/* If there is a backing file, use it */
if ((flags & BDRV_O_NO_BACKING) == 0) {
ret = bdrv_open_backing_file(bs, options, "backing", &local_err);
if (ret < 0) {
goto close_and_fail;
}
}
/* Remove all children options and references
* from bs->options and bs->explicit_options */
QLIST_FOREACH(child, &bs->children, next) {
char *child_key_dot;
child_key_dot = g_strdup_printf("%s.", child->name);
qdict_extract_subqdict(bs->explicit_options, NULL, child_key_dot);
qdict_extract_subqdict(bs->options, NULL, child_key_dot);
qdict_del(bs->explicit_options, child->name);
qdict_del(bs->options, child->name);
g_free(child_key_dot);
}
/* Check if any unknown options were used */
if (qdict_size(options) != 0) {
const QDictEntry *entry = qdict_first(options);
if (flags & BDRV_O_PROTOCOL) {
error_setg(errp, "Block protocol '%s' doesn't support the option "
"'%s'", drv->format_name, entry->key);
} else {
error_setg(errp,
"Block format '%s' does not support the option '%s'",
drv->format_name, entry->key);
}
goto close_and_fail;
}
bdrv_parent_cb_change_media(bs, true);
qobject_unref(options);
options = NULL;
/* For snapshot=on, create a temporary qcow2 overlay. bs points to the
* temporary snapshot afterwards. */
if (snapshot_flags) {
BlockDriverState *snapshot_bs;
snapshot_bs = bdrv_append_temp_snapshot(bs, snapshot_flags,
snapshot_options, &local_err);
snapshot_options = NULL;
if (local_err) {
goto close_and_fail;
}
/* We are not going to return bs but the overlay on top of it
* (snapshot_bs); thus, we have to drop the strong reference to bs
* (which we obtained by calling bdrv_new()). bs will not be deleted,
* though, because the overlay still has a reference to it. */
aio_context_acquire(ctx);
bdrv_unref(bs);
aio_context_release(ctx);
bs = snapshot_bs;
}
return bs;
fail:
aio_context_acquire(ctx);
blk_unref(file);
qobject_unref(snapshot_options);
qobject_unref(bs->explicit_options);
qobject_unref(bs->options);
qobject_unref(options);
bs->options = NULL;
bs->explicit_options = NULL;
bdrv_unref(bs);
aio_context_release(ctx);
error_propagate(errp, local_err);
return NULL;
close_and_fail:
aio_context_acquire(ctx);
bdrv_unref(bs);
aio_context_release(ctx);
qobject_unref(snapshot_options);
qobject_unref(options);
error_propagate(errp, local_err);
return NULL;
}
bdrv_open_inherit函数的作用是打开磁盘映像(raw、qcow2、vmdk等格式)。其中最为核心的一段是:
/* Open the image */
ret = bdrv_open_common(bs, file, options, &local_err);
if (ret < 0) {
goto fail;
}
bdrv_open_common函数也在block.c中,代码如下:
/*
* Common part for opening disk images and files
*
* Removes all processed options from *options.
*/
static int bdrv_open_common(BlockDriverState *bs, BlockBackend *file,
QDict *options, Error **errp)
{
int ret, open_flags;
const char *filename;
const char *driver_name = NULL;
const char *node_name = NULL;
const char *discard;
QemuOpts *opts;
BlockDriver *drv;
Error *local_err = NULL;
bool ro;
assert(bs->file == NULL);
assert(options != NULL && bs->options != options);
GLOBAL_STATE_CODE();
opts = qemu_opts_create(&bdrv_runtime_opts, NULL, 0, &error_abort);
if (!qemu_opts_absorb_qdict(opts, options, errp)) {
ret = -EINVAL;
goto fail_opts;
}
update_flags_from_options(&bs->open_flags, opts);
driver_name = qemu_opt_get(opts, "driver");
drv = bdrv_find_format(driver_name);
assert(drv != NULL);
bs->force_share = qemu_opt_get_bool(opts, BDRV_OPT_FORCE_SHARE, false);
if (bs->force_share && (bs->open_flags & BDRV_O_RDWR)) {
error_setg(errp,
BDRV_OPT_FORCE_SHARE
"=on can only be used with read-only images");
ret = -EINVAL;
goto fail_opts;
}
if (file != NULL) {
bdrv_refresh_filename(blk_bs(file));
filename = blk_bs(file)->filename;
} else {
/*
* Caution: while qdict_get_try_str() is fine, getting
* non-string types would require more care. When @options
* come from -blockdev or blockdev_add, its members are typed
* according to the QAPI schema, but when they come from
* -drive, they're all QString.
*/
filename = qdict_get_try_str(options, "filename");
}
if (drv->bdrv_needs_filename && (!filename || !filename[0])) {
error_setg(errp, "The '%s' block driver requires a file name",
drv->format_name);
ret = -EINVAL;
goto fail_opts;
}
trace_bdrv_open_common(bs, filename ?: "", bs->open_flags,
drv->format_name);
ro = bdrv_is_read_only(bs);
if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, ro)) {
if (!ro && bdrv_is_whitelisted(drv, true)) {
ret = bdrv_apply_auto_read_only(bs, NULL, NULL);
} else {
ret = -ENOTSUP;
}
if (ret < 0) {
error_setg(errp,
!ro && bdrv_is_whitelisted(drv, true)
? "Driver '%s' can only be used for read-only devices"
: "Driver '%s' is not whitelisted",
drv->format_name);
goto fail_opts;
}
}
/* bdrv_new() and bdrv_close() make it so */
assert(qatomic_read(&bs->copy_on_read) == 0);
if (bs->open_flags & BDRV_O_COPY_ON_READ) {
if (!ro) {
bdrv_enable_copy_on_read(bs);
} else {
error_setg(errp, "Can't use copy-on-read on read-only device");
ret = -EINVAL;
goto fail_opts;
}
}
discard = qemu_opt_get(opts, BDRV_OPT_DISCARD);
if (discard != NULL) {
if (bdrv_parse_discard_flags(discard, &bs->open_flags) != 0) {
error_setg(errp, "Invalid discard option");
ret = -EINVAL;
goto fail_opts;
}
}
bs->detect_zeroes =
bdrv_parse_detect_zeroes(opts, bs->open_flags, &local_err);
if (local_err) {
error_propagate(errp, local_err);
ret = -EINVAL;
goto fail_opts;
}
if (filename != NULL) {
pstrcpy(bs->filename, sizeof(bs->filename), filename);
} else {
bs->filename[0] = '\0';
}
pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);
/* Open the image, either directly or using a protocol */
open_flags = bdrv_open_flags(bs, bs->open_flags);
node_name = qemu_opt_get(opts, "node-name");
assert(!drv->bdrv_file_open || file == NULL);
ret = bdrv_open_driver(bs, drv, node_name, options, open_flags, errp);
if (ret < 0) {
goto fail_opts;
}
qemu_opts_del(opts);
return 0;
fail_opts:
qemu_opts_del(opts);
return ret;
}
在bdrv_open_common函数中,根据磁盘文件的格式,得到BlockDriver *drv。代码片段如下:
static int bdrv_open_common(BlockDriverState *bs, BlockBackend *file,
QDict *options, Error **errp)
{
int ret, open_flags;
const char *filename;
const char *driver_name = NULL;
const char *node_name = NULL;
const char *discard;
QemuOpts *opts;
BlockDriver *drv;
Error *local_err = NULL;
bool ro;
……
driver_name = qemu_opt_get(opts, "driver");
drv = bdrv_find_format(driver_name);
assert(drv != NULL);
……
}
然后再调用bdrv_open_driver函数,根据得到的BlockDriver *drv,打开磁盘文件。代码片段如下:
assert(!drv->bdrv_file_open || file == NULL);
ret = bdrv_open_driver(bs, drv, node_name, options, open_flags, errp);
if (ret < 0) {
goto fail_opts;
}
bdrv_open_driver函数也在block中,代码如下:
/*
* The caller must always hold @bs AioContext lock, because this function calls
* bdrv_refresh_total_sectors() which polls when called from non-coroutine
* context.
*/
static int no_coroutine_fn GRAPH_UNLOCKED
bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv, const char *node_name,
QDict *options, int open_flags, Error **errp)
{
AioContext *ctx;
Error *local_err = NULL;
int i, ret;
GLOBAL_STATE_CODE();
bdrv_assign_node_name(bs, node_name, &local_err);
if (local_err) {
error_propagate(errp, local_err);
return -EINVAL;
}
bs->drv = drv;
bs->opaque = g_malloc0(drv->instance_size);
if (drv->bdrv_file_open) {
assert(!drv->bdrv_needs_filename || bs->filename[0]);
ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
} else if (drv->bdrv_open) {
ret = drv->bdrv_open(bs, options, open_flags, &local_err);
} else {
ret = 0;
}
if (ret < 0) {
if (local_err) {
error_propagate(errp, local_err);
} else if (bs->filename[0]) {
error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
} else {
error_setg_errno(errp, -ret, "Could not open image");
}
goto open_failed;
}
assert(!(bs->supported_read_flags & ~BDRV_REQ_MASK));
assert(!(bs->supported_write_flags & ~BDRV_REQ_MASK));
/*
* Always allow the BDRV_REQ_REGISTERED_BUF optimization hint. This saves
* drivers that pass read/write requests through to a child the trouble of
* declaring support explicitly.
*
* Drivers must not propagate this flag accidentally when they initiate I/O
* to a bounce buffer. That case should be rare though.
*/
bs->supported_read_flags |= BDRV_REQ_REGISTERED_BUF;
bs->supported_write_flags |= BDRV_REQ_REGISTERED_BUF;
/* Get the context after .bdrv_open, it can change the context */
ctx = bdrv_get_aio_context(bs);
aio_context_acquire(ctx);
ret = bdrv_refresh_total_sectors(bs, bs->total_sectors);
if (ret < 0) {
error_setg_errno(errp, -ret, "Could not refresh total sector count");
aio_context_release(ctx);
return ret;
}
bdrv_graph_rdlock_main_loop();
bdrv_refresh_limits(bs, NULL, &local_err);
bdrv_graph_rdunlock_main_loop();
aio_context_release(ctx);
if (local_err) {
error_propagate(errp, local_err);
return -EINVAL;
}
assert(bdrv_opt_mem_align(bs) != 0);
assert(bdrv_min_mem_align(bs) != 0);
assert(is_power_of_2(bs->bl.request_alignment));
for (i = 0; i < bs->quiesce_counter; i++) {
if (drv->bdrv_drain_begin) {
drv->bdrv_drain_begin(bs);
}
}
return 0;
open_failed:
bs->drv = NULL;
if (bs->file != NULL) {
bdrv_unref_child(bs, bs->file);
assert(!bs->file);
}
g_free(bs->opaque);
bs->opaque = NULL;
return ret;
}
上边已提到,虚拟机的硬盘文件格式有很多种,如:raw、qcow2、vmdk等,每一种格式都有各自的优缺点。启动虚拟机的时候,可以自由指定。
这里对于文件格式进行一定的知识补强,参考以下文章:
虚拟机硬盘格式的选择:qcow2、 raw等_linux_qq_42533216-华为开发者空间
- raw
raw是原始磁盘镜像格式,也称裸磁盘格式,是默认的虚拟磁盘格式。
raw,原意就是“未经加工”,也指未格式化的磁盘。在Linux中,raw是磁盘映像的纯二进制映像格式。在支持稀疏文件的文件系统里,raw格式的映像只占用记录在其中的数据实际使用的空间。
这个格式的优点是简单,易于导出导入到其它模拟器(虚拟化环境)中使用。如果你的文件系统支持“空洞”(holes),例如linux上的ext2、ext3、ext4或windows上的ntfs,这种环境下对于raw磁盘来说只有已经写入的扇区才占用存储空间,可以使用“qemu-img info”命令来查看raw磁盘的真实大小,用“ls -ls”也可以看出来。
- qcow2
qcow2镜像格式是QEMU最通用的镜像文件格式,它也可以用一个文件的形式来表示一块固定大小的块设备磁盘。在不支持“空洞”的文件系统上能够占用更小的存储空间,并且具备可选的一些配置项,比如AES加密、zlib压缩、快照等。
qcow2是目前比较主流的一种虚拟化镜像格式,目前qcow2的性能上接近raw裸格式的性能,与普通的raw格式的镜像相比,它还有以下特性:
- 更小的存储空间占用,即使文件系统不支持空洞(holes)。qcow2格式的镜像比raw格式文件更小,只有在虚拟机实际占用了磁盘空间时,其文件才会增长,能方便地减少迁移花费的流量,更适用于云计算系统。
- 支持写时拷贝(COW,copy-on-write),镜像文件只反映底层磁盘的变化。
- 支持快照(snapshot),镜像文件能够包含多个快照的历史。
- 可选择基于zlib的压缩方式,它允许每个簇(cluster)单独使用zlib压缩。
- 可以选择AES加密,支持使用128位的AES密钥进行加密。