接前一篇文章: QEMU源码全解析 —— 块设备虚拟化(19)
本文内容参考:
《QEMU/KVM源码解析与应用》 —— 李强,机械工业出版社
特此致谢!
QEMU启动过程中的块设备虚拟化
上一回大致解析了drive_new函数,本回重点对drive_new函数中调用的blockdev_init函数进行解析。为了便于理解和回顾,再次贴出drive_new函数中调用blockdev_init函数的代码片段,如下所示:
/* Actual block device init: Functionality shared with blockdev-add */
blk = blockdev_init(filename, bs_opts, errp);
/*
 * blockdev_init() takes ownership of bs_opts (see its header comment), so
 * the local pointer is cleared right after the call, whether or not the
 * call succeeded.
 */
bs_opts = NULL;
/* A NULL BlockBackend means blockdev_init() failed. */
if (!blk) {
    goto fail;
}
blockdev_init函数也在blockdev.c中,代码如下:
/*
 * Create the BlockBackend for one drive from its option dictionary.
 *
 * @file:    image file name; may be NULL or an empty string
 * @bs_opts: option dictionary for the drive
 * @errp:    where errors are reported
 *
 * Options known to qemu_common_drive_opts are absorbed from @bs_opts into a
 * QemuOpts and handled here; whatever remains in @bs_opts is handed on to
 * blk_new_open().  When there is neither a file name nor any remaining
 * option, a BlockBackend without a BlockDriverState is created instead.
 *
 * Returns the new BlockBackend on success and NULL on failure.  Note that
 * the "format=help" path also returns NULL, after printing the format list
 * and without setting @errp in this function.
 *
 * Takes the ownership of bs_opts: every path in this function either
 * unrefs it or passes it to blk_new_open().
 */
static BlockBackend *blockdev_init(const char *file, QDict *bs_opts,
Error **errp)
{
    const char *buf;
    int bdrv_flags = 0;
    int on_read_error, on_write_error;
    OnOffAuto account_invalid, account_failed;
    bool writethrough, read_only;
    BlockBackend *blk;
    BlockDriverState *bs;
    ThrottleConfig cfg;
    int snapshot = 0;
    Error *error = NULL;
    QemuOpts *opts;
    QDict *interval_dict = NULL;
    QList *interval_list = NULL;
    const char *id;
    BlockdevDetectZeroesOptions detect_zeroes =
    BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF;
    const char *throttling_group = NULL;
    /* Check common options by copying from bs_opts to opts, all other options
    * stay in bs_opts for processing by bdrv_open(). */
    id = qdict_get_try_str(bs_opts, "id");
    opts = qemu_opts_create(&qemu_common_drive_opts, id, 1, errp);
    if (!opts) {
        /* No QemuOpts exists yet, so only bs_opts has to be released. */
        goto err_no_opts;
    }
    if (!qemu_opts_absorb_qdict(opts, bs_opts, errp)) {
        goto early_err;
    }
    /* "id" has already been used to create opts; drop it from bs_opts. */
    if (id) {
        qdict_del(bs_opts, "id");
    }
    /* extract parameters */
    snapshot = qemu_opt_get_bool(opts, "snapshot", 0);
    account_invalid = account_get_opt(opts, "stats-account-invalid");
    account_failed = account_get_opt(opts, "stats-account-failed");
    /* Write-through is the negation of the cache write-back option. */
    writethrough = !qemu_opt_get_bool(opts, BDRV_OPT_CACHE_WB, true);
    /* From here on use the id stored in opts rather than the one in bs_opts. */
    id = qemu_opts_id(opts);
    /*
     * Move the "stats-intervals.*" entries out of bs_opts and split them
     * into interval_list.  Anything still left in interval_dict afterwards
     * is rejected as an invalid option.
     */
    qdict_extract_subqdict(bs_opts, &interval_dict, "stats-intervals.");
    qdict_array_split(interval_dict, &interval_list);
    if (qdict_size(interval_dict) != 0) {
        error_setg(errp, "Invalid option stats-intervals.%s",
        qdict_first(interval_dict)->key);
        goto early_err;
    }
    /* Fills in bdrv_flags, throttling_group, cfg and detect_zeroes. */
    extract_common_blockdev_options(opts, &bdrv_flags, &throttling_group, &cfg,
    &detect_zeroes, &error);
    if (error) {
        error_propagate(errp, error);
        goto early_err;
    }
    if ((buf = qemu_opt_get(opts, "format")) != NULL) {
        /* "format=help": print the supported formats and bail out. */
        if (is_help_option(buf)) {
            qemu_printf("Supported formats:");
            bdrv_iterate_format(bdrv_format_print, NULL, false);
            qemu_printf("\nSupported formats (read-only):");
            bdrv_iterate_format(bdrv_format_print, NULL, true);
            qemu_printf("\n");
            goto early_err;
        }
        /* "format" is stored as "driver" in bs_opts; both at once is an error. */
        if (qdict_haskey(bs_opts, "driver")) {
            error_setg(errp, "Cannot specify both 'driver' and 'format'");
            goto early_err;
        }
        qdict_put_str(bs_opts, "driver", buf);
    }
    /* Write-error policy: BLOCKDEV_ON_ERROR_ENOSPC unless "werror" is given. */
    on_write_error = BLOCKDEV_ON_ERROR_ENOSPC;
    if ((buf = qemu_opt_get(opts, "werror")) != NULL) {
        on_write_error = parse_block_error_action(buf, 0, &error);
        if (error) {
            error_propagate(errp, error);
            goto early_err;
        }
    }
    /* Read-error policy: BLOCKDEV_ON_ERROR_REPORT unless "rerror" is given. */
    on_read_error = BLOCKDEV_ON_ERROR_REPORT;
    if ((buf = qemu_opt_get(opts, "rerror")) != NULL) {
        on_read_error = parse_block_error_action(buf, 1, &error);
        if (error) {
            error_propagate(errp, error);
            goto early_err;
        }
    }
    if (snapshot) {
        bdrv_flags |= BDRV_O_SNAPSHOT;
    }
    read_only = qemu_opt_get_bool(opts, BDRV_OPT_READ_ONLY, false);
    /* init */
    if ((!file || !*file) && !qdict_size(bs_opts)) {
        /*
         * No file name and no remaining options: create a BlockBackend
         * without a BlockDriverState and record the open flags and
         * detect-zeroes setting in its root state.
         */
        BlockBackendRootState *blk_rs;
        blk = blk_new(qemu_get_aio_context(), 0, BLK_PERM_ALL);
        blk_rs = blk_get_root_state(blk);
        blk_rs->open_flags = bdrv_flags | (read_only ? 0 : BDRV_O_RDWR);
        blk_rs->detect_zeroes = detect_zeroes;
        /* bs_opts is not passed on in this branch, so release it here. */
        qobject_unref(bs_opts);
    } else {
        /* Treat an empty file name like a missing one. */
        if (file && !*file) {
            file = NULL;
        }
        /* bdrv_open() defaults to the values in bdrv_flags (for compatibility
        * with other callers) rather than what we want as the real defaults.
        * Apply the defaults here instead. */
        qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_DIRECT, "off");
        qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_NO_FLUSH, "off");
        qdict_set_default_str(bs_opts, BDRV_OPT_READ_ONLY,
        read_only ? "on" : "off");
        qdict_set_default_str(bs_opts, BDRV_OPT_AUTO_READ_ONLY, "on");
        /* Cache settings travel in bs_opts, so no cache bit may be set in the flags. */
        assert((bdrv_flags & BDRV_O_CACHE_MASK) == 0);
        if (runstate_check(RUN_STATE_INMIGRATE)) {
            bdrv_flags |= BDRV_O_INACTIVE;
        }
        /* bs_opts is handed over to blk_new_open() here, success or not. */
        blk = blk_new_open(file, NULL, bs_opts, bdrv_flags, errp);
        if (!blk) {
            goto err_no_bs_opts;
        }
        bs = blk_bs(blk);
        bs->detect_zeroes = detect_zeroes;
        block_acct_setup(blk_get_stats(blk), account_invalid, account_failed);
        if (!parse_stats_intervals(blk_get_stats(blk), interval_list, errp)) {
            blk_unref(blk);
            blk = NULL;
            goto err_no_bs_opts;
        }
    }
    /* disk I/O throttling */
    if (throttle_enabled(&cfg)) {
        /* Without an explicit group, the drive id is used as the group name. */
        if (!throttling_group) {
            throttling_group = id;
        }
        blk_io_limits_enable(blk, throttling_group);
        blk_set_io_limits(blk, &cfg);
    }
    blk_set_enable_write_cache(blk, !writethrough);
    blk_set_on_error(blk, on_read_error, on_write_error);
    if (!monitor_add_blk(blk, id, errp)) {
        blk_unref(blk);
        blk = NULL;
        goto err_no_bs_opts;
    }
/*
 * Shared exit for success (blk != NULL) and for failures that happen after
 * bs_opts has been released or handed over (blk == NULL): bs_opts must not
 * be unref'd again here.
 */
err_no_bs_opts:
    qemu_opts_del(opts);
    qobject_unref(interval_dict);
    qobject_unref(interval_list);
    return blk;
/* Failure while this function still owns bs_opts and opts already exists. */
early_err:
    qemu_opts_del(opts);
    qobject_unref(interval_dict);
    qobject_unref(interval_list);
/* Failure before opts was created: only bs_opts is left to release. */
err_no_opts:
    qobject_unref(bs_opts);
    return NULL;
}
在blockdev_init函数中,如果file不为空(或者bs_opts中仍有未处理的选项),则会走else分支调用blk_new_open函数,打开宿主机上的硬盘文件。代码片段如下:
/* init */
if ((!file || !*file) && !qdict_size(bs_opts)) {
    /*
     * No file name and no remaining options: create a BlockBackend without
     * a BlockDriverState and record the open flags and detect-zeroes
     * setting in its root state.
     */
    BlockBackendRootState *blk_rs;
    blk = blk_new(qemu_get_aio_context(), 0, BLK_PERM_ALL);
    blk_rs = blk_get_root_state(blk);
    blk_rs->open_flags = bdrv_flags | (read_only ? 0 : BDRV_O_RDWR);
    blk_rs->detect_zeroes = detect_zeroes;
    /* bs_opts is not passed on in this branch, so release it here. */
    qobject_unref(bs_opts);
} else {
    /* Treat an empty file name like a missing one. */
    if (file && !*file) {
        file = NULL;
    }
    /* bdrv_open() defaults to the values in bdrv_flags (for compatibility
    * with other callers) rather than what we want as the real defaults.
    * Apply the defaults here instead. */
    qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_DIRECT, "off");
    qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_NO_FLUSH, "off");
    qdict_set_default_str(bs_opts, BDRV_OPT_READ_ONLY,
    read_only ? "on" : "off");
    qdict_set_default_str(bs_opts, BDRV_OPT_AUTO_READ_ONLY, "on");
    /* Cache settings travel in bs_opts, so no cache bit may be set in the flags. */
    assert((bdrv_flags & BDRV_O_CACHE_MASK) == 0);
    if (runstate_check(RUN_STATE_INMIGRATE)) {
        bdrv_flags |= BDRV_O_INACTIVE;
    }
    /* Open the image and wrap it in a new BlockBackend. */
    blk = blk_new_open(file, NULL, bs_opts, bdrv_flags, errp);
    if (!blk) {
        goto err_no_bs_opts;
    }
    bs = blk_bs(blk);
    bs->detect_zeroes = detect_zeroes;
    block_acct_setup(blk_get_stats(blk), account_invalid, account_failed);
    /* On failure drop the BlockBackend again before leaving. */
    if (!parse_stats_intervals(blk_get_stats(blk), interval_list, errp)) {
        blk_unref(blk);
        blk = NULL;
        goto err_no_bs_opts;
    }
}
这里,file是blockdev_init函数的入参:
static BlockBackend *blockdev_init(const char *file, QDict *bs_opts,
Error **errp)
其对应的实参是drive_new函数中的filename,上一回已经讲过(参见 QEMU源码全解析 —— 块设备虚拟化(19)-CSDN博客 )。
实参filename当然不是空,因此会调用blk_new_open函数,打开宿主机上的硬盘文件(应该就是命令行中的那个/var/lib/nova/instances/1f8e6f7e-5a70-4780-89c1-464dc0e7f308/disk)。
blk_new_open函数在block/block-backend.c中,代码如下:
/*
 * Open a new BlockDriverState, create a new BlockBackend, and attach the
 * former to the latter.
 *
 * The BlockBackend normally lives in the main AioContext; if the parameters
 * tie it to an existing node in another AioContext, it may end up in that
 * context instead.
 *
 * As with bdrv_open(), the reference to @options is owned by the block layer
 * once this function has been called, even when it fails.
 *
 * Must be called without holding an AioContext lock.
 *
 * Returns the new BlockBackend, or NULL on failure.
 *
 * TODO: Remove @filename and @flags; a whole BDS tree should be describable
 * by the @options QDict alone (or by @reference).  That was not possible
 * when this function was added, so callers still have to be able to pass
 * @filename and @flags.
 */
BlockBackend *blk_new_open(const char *filename, const char *reference,
                           QDict *options, int flags, Error **errp)
{
    uint64_t perm = 0;
    uint64_t shared;
    BlockDriverState *bs;
    BlockBackend *blk;
    AioContext *ctx;

    GLOBAL_STATE_CODE();

    /*
     * Most users of blk_new_open() are .bdrv_create implementations and the
     * tools.  Sharing hardly matters there: the BDS stays private and no
     * second process is expected to use the file, so the requested
     * permissions simply follow the flags.
     *
     * xen_disk and blockdev_init() are the exceptions.  They do not rely on
     * these permissions, but requesting them does no harm.  Sharing
     * everything is still fine because guest devices add their own blockers
     * when they cannot share.
     */
    if (!(flags & BDRV_O_NO_IO)) {
        perm = (flags & BDRV_O_RDWR)
            ? (BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE)
            : BLK_PERM_CONSISTENT_READ;
    }
    if (flags & BDRV_O_RESIZE) {
        perm |= BLK_PERM_RESIZE;
    }

    shared = (flags & BDRV_O_NO_SHARE)
        ? (BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED)
        : BLK_PERM_ALL;

    /* bdrv_open() has to run with the main AioContext lock held. */
    aio_context_acquire(qemu_get_aio_context());
    bs = bdrv_open(filename, reference, options, flags, errp);
    aio_context_release(qemu_get_aio_context());
    if (bs == NULL) {
        return NULL;
    }

    /* bs may have been moved to another AioContext by bdrv_open() */
    ctx = bdrv_get_aio_context(bs);

    blk = blk_new(ctx, perm, shared);
    blk->perm = perm;
    blk->shared_perm = shared;

    /* Attach bs under its own context's lock, then drop our reference. */
    aio_context_acquire(ctx);
    blk_insert_bs(blk, bs, errp);
    bdrv_unref(bs);
    aio_context_release(ctx);

    /* A root that is still unset means the attach above failed. */
    if (blk->root) {
        return blk;
    }

    blk_unref(blk);
    return NULL;
}
上边已提到,blk_new_open函数的作用是打开宿主机上的硬盘文件(就是命令行参数file对应的那个/var/lib/nova/instances/1f8e6f7e-5a70-4780-89c1-464dc0e7f308/disk文件)。其返回结果是BlockBackend *blk:函数先通过bdrv_open函数得到BlockDriverState *bs,再通过blk_new函数创建blk,最后由blk_insert_bs函数把bs挂接到blk上。
这里通过名字就能看到,已经属于后端设备的范畴了。blk_new_open函数的注释写得清楚:
创建新的BlockBackend,打开新的BlockDriverState,并连接两者。
bdrv_open函数在block.c中,代码如下:
/*
 * Open a block driver state by forwarding to bdrv_open_inherit().
 *
 * @filename, @reference, @options, @flags and @errp are passed through
 * unchanged; the remaining bdrv_open_inherit() arguments are fixed to
 * NULL, NULL and 0.
 *
 * The caller must always hold the main AioContext lock.
 */
BlockDriverState *bdrv_open(const char *filename, const char *reference,
                            QDict *options, int flags, Error **errp)
{
    BlockDriverState *bs;

    GLOBAL_STATE_CODE();

    bs = bdrv_open_inherit(filename, reference, options, flags,
                           NULL, NULL, 0, errp);
    return bs;
}
对于bdrv_open函数的解析,放在下一回中。