QEMU源码全解析 —— 块设备虚拟化(20)

接前一篇文章: QEMU源码全解析 —— 块设备虚拟化(19)

本文内容参考:

《趣谈 Linux操作系统 》 —— 刘超, 极客时间

QEMU /KVM源码解析与应用》 —— 李强,机械工业出版社

特此致谢!

QEMU启动过程中的块设备虚拟化

上一回大致解析了drive_new函数,本回重点对于drive_new函数中调用的blockdev_init函数进行解析。为了便于理解和回顾,再次贴出drive_new函数中调用blockdev_init函数的代码片段,如下所示:

    /* Actual block device init: Functionality shared with blockdev-add */
    blk = blockdev_init(filename, bs_opts, errp);
    bs_opts = NULL;
    if (!blk) {
        goto fail;
    }

blockdev_init函数也在blockdev.c中,代码如下:

/* Takes the ownership of bs_opts */
static BlockBackend *blockdev_init(const char *file, QDict *bs_opts,
                                   Error **errp)
{
    const char *buf;
    int bdrv_flags = 0;
    int on_read_error, on_write_error;
    OnOffAuto account_invalid, account_failed;
    bool writethrough, read_only;
    BlockBackend *blk;
    BlockDriverState *bs;
    ThrottleConfig cfg;
    int snapshot = 0;
    Error *error = NULL;
    QemuOpts *opts;
    QDict *interval_dict = NULL;
    QList *interval_list = NULL;
    const char *id;
    BlockdevDetectZeroesOptions detect_zeroes =
        BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF;
    const char *throttling_group = NULL;

    /* Check common options by copying from bs_opts to opts, all other options
     * stay in bs_opts for processing by bdrv_open(). */
    id = qdict_get_try_str(bs_opts, "id");
    opts = qemu_opts_create(&qemu_common_drive_opts, id, 1, errp);
    if (!opts) {
        goto err_no_opts;
    }

    if (!qemu_opts_absorb_qdict(opts, bs_opts, errp)) {
        goto early_err;
    }

    if (id) {
        qdict_del(bs_opts, "id");
    }

    /* extract parameters */
    snapshot = qemu_opt_get_bool(opts, "snapshot", 0);

    account_invalid = account_get_opt(opts, "stats-account-invalid");
    account_failed = account_get_opt(opts, "stats-account-failed");

    writethrough = !qemu_opt_get_bool(opts, BDRV_OPT_CACHE_WB, true);

    id = qemu_opts_id(opts);

    qdict_extract_subqdict(bs_opts, &interval_dict, "stats-intervals.");
    qdict_array_split(interval_dict, &interval_list);

    if (qdict_size(interval_dict) != 0) {
        error_setg(errp, "Invalid option stats-intervals.%s",
                   qdict_first(interval_dict)->key);
        goto early_err;
    }

    extract_common_blockdev_options(opts, &bdrv_flags, &throttling_group, &cfg,
                                    &detect_zeroes, &error);
    if (error) {
        error_propagate(errp, error);
        goto early_err;
    }

    if ((buf = qemu_opt_get(opts, "format")) != NULL) {
        if (is_help_option(buf)) {
            qemu_printf("Supported formats:");
            bdrv_iterate_format(bdrv_format_print, NULL, false);
            qemu_printf("\nSupported formats (read-only):");
            bdrv_iterate_format(bdrv_format_print, NULL, true);
            qemu_printf("\n");
            goto early_err;
        }

        if (qdict_haskey(bs_opts, "driver")) {
            error_setg(errp, "Cannot specify both 'driver' and 'format'");
            goto early_err;
        }
        qdict_put_str(bs_opts, "driver", buf);
    }

    on_write_error = BLOCKDEV_ON_ERROR_ENOSPC;
    if ((buf = qemu_opt_get(opts, "werror")) != NULL) {
        on_write_error = parse_block_error_action(buf, 0, &error);
        if (error) {
            error_propagate(errp, error);
            goto early_err;
        }
    }

    on_read_error = BLOCKDEV_ON_ERROR_REPORT;
    if ((buf = qemu_opt_get(opts, "rerror")) != NULL) {
        on_read_error = parse_block_error_action(buf, 1, &error);
        if (error) {
            error_propagate(errp, error);
            goto early_err;
        }
    }

    if (snapshot) {
        bdrv_flags |= BDRV_O_SNAPSHOT;
    }

    read_only = qemu_opt_get_bool(opts, BDRV_OPT_READ_ONLY, false);

    /* init */
    if ((!file || !*file) && !qdict_size(bs_opts)) {
        BlockBackendRootState *blk_rs;

        blk = blk_new(qemu_get_aio_context(), 0, BLK_PERM_ALL);
        blk_rs = blk_get_root_state(blk);
        blk_rs->open_flags    = bdrv_flags | (read_only ? 0 : BDRV_O_RDWR);
        blk_rs->detect_zeroes = detect_zeroes;

        qobject_unref(bs_opts);
    } else {
        if (file && !*file) {
            file = NULL;
        }

        /* bdrv_open() defaults to the values in bdrv_flags (for compatibility
         * with other callers) rather than what we want as the real defaults.
         * Apply the defaults here instead. */
        qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_DIRECT, "off");
        qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_NO_FLUSH, "off");
        qdict_set_default_str(bs_opts, BDRV_OPT_READ_ONLY,
                              read_only ? "on" : "off");
        qdict_set_default_str(bs_opts, BDRV_OPT_AUTO_READ_ONLY, "on");
        assert((bdrv_flags & BDRV_O_CACHE_MASK) == 0);

        if (runstate_check(RUN_STATE_INMIGRATE)) {
            bdrv_flags |= BDRV_O_INACTIVE;
        }

        blk = blk_new_open(file, NULL, bs_opts, bdrv_flags, errp);
        if (!blk) {
            goto err_no_bs_opts;
        }
        bs = blk_bs(blk);

        bs->detect_zeroes = detect_zeroes;

        block_acct_setup(blk_get_stats(blk), account_invalid, account_failed);

        if (!parse_stats_intervals(blk_get_stats(blk), interval_list, errp)) {
            blk_unref(blk);
            blk = NULL;
            goto err_no_bs_opts;
        }
    }

    /* disk I/O throttling */
    if (throttle_enabled(&cfg)) {
        if (!throttling_group) {
            throttling_group = id;
        }
        blk_io_limits_enable(blk, throttling_group);
        blk_set_io_limits(blk, &cfg);
    }

    blk_set_enable_write_cache(blk, !writethrough);
    blk_set_on_error(blk, on_read_error, on_write_error);

    if (!monitor_add_blk(blk, id, errp)) {
        blk_unref(blk);
        blk = NULL;
        goto err_no_bs_opts;
    }

err_no_bs_opts:
    qemu_opts_del(opts);
    qobject_unref(interval_dict);
    qobject_unref(interval_list);
    return blk;

early_err:
    qemu_opts_del(opts);
    qobject_unref(interval_dict);
    qobject_unref(interval_list);
err_no_opts:
    qobject_unref(bs_opts);
    return NULL;
}

在blockdev_init函数中,如果file不为空,则会调用blk_new_open函数,打开宿主机上的硬盘文件。代码片段如下:

    /* init */
    if ((!file || !*file) && !qdict_size(bs_opts)) {
        BlockBackendRootState *blk_rs;

        blk = blk_new(qemu_get_aio_context(), 0, BLK_PERM_ALL);
        blk_rs = blk_get_root_state(blk);
        blk_rs->open_flags    = bdrv_flags | (read_only ? 0 : BDRV_O_RDWR);
        blk_rs->detect_zeroes = detect_zeroes;

        qobject_unref(bs_opts);
    } else {
        if (file && !*file) {
            file = NULL;
        }

        /* bdrv_open() defaults to the values in bdrv_flags (for compatibility
         * with other callers) rather than what we want as the real defaults.
         * Apply the defaults here instead. */
        qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_DIRECT, "off");
        qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_NO_FLUSH, "off");
        qdict_set_default_str(bs_opts, BDRV_OPT_READ_ONLY,
                              read_only ? "on" : "off");
        qdict_set_default_str(bs_opts, BDRV_OPT_AUTO_READ_ONLY, "on");
        assert((bdrv_flags & BDRV_O_CACHE_MASK) == 0);

        if (runstate_check(RUN_STATE_INMIGRATE)) {
            bdrv_flags |= BDRV_O_INACTIVE;
        }

        blk = blk_new_open(file, NULL, bs_opts, bdrv_flags, errp);
        if (!blk) {
            goto err_no_bs_opts;
        }
        bs = blk_bs(blk);

        bs->detect_zeroes = detect_zeroes;

        block_acct_setup(blk_get_stats(blk), account_invalid, account_failed);

        if (!parse_stats_intervals(blk_get_stats(blk), interval_list, errp)) {
            blk_unref(blk);
            blk = NULL;
            goto err_no_bs_opts;
        }
    }

这里,file是blockdev_init函数的入参:

static BlockBackend *blockdev_init(const char *file, QDict *bs_opts,
                                   Error **errp)

其对应的实参上一回讲过(参见 QEMU源码全解析 —— 块设备虚拟化(19)-CSDN博客 ):

实参filename当然不是空,因此会调用blk_new_open函数,打开宿主机上的硬盘文件(应该就是命令行中的那个/var/lib/nova/instances/1f8e6f7e-5a70-4780-89c1-464dc0e7f308/disk)。

blk_new_open函数在block/block-backend.c中,代码如下:

/*
 * Creates a new BlockBackend, opens a new BlockDriverState, and connects both.
 * By default, the new BlockBackend is in the main AioContext, but if the
 * parameters connect it with any existing node in a different AioContext, it
 * may end up there instead.
 *
 * Just as with bdrv_open(), after having called this function the reference to
 * @options belongs to the block layer (even on failure).
 *
 * Called without holding an AioContext lock.
 *
 * TODO: Remove @filename and @flags; it should be possible to specify a whole
 * BDS tree just by specifying the @options QDict (or @reference,
 * alternatively). At the time of adding this function, this is not possible,
 * though, so callers of this function have to be able to specify @filename and
 * @flags.
 */
BlockBackend *blk_new_open(const char *filename, const char *reference,
                           QDict *options, int flags, Error **errp)
{
    BlockBackend *blk;
    BlockDriverState *bs;
    AioContext *ctx;
    uint64_t perm = 0;
    uint64_t shared = BLK_PERM_ALL;

    GLOBAL_STATE_CODE();

    /*
     * blk_new_open() is mainly used in .bdrv_create implementations and the
     * tools where sharing isn't a major concern because the BDS stays private
     * and the file is generally not supposed to be used by a second process,
     * so we just request permission according to the flags.
     *
     * The exceptions are xen_disk and blockdev_init(); in these cases, the
     * caller of blk_new_open() doesn't make use of the permissions, but they
     * shouldn't hurt either. We can still share everything here because the
     * guest devices will add their own blockers if they can't share.
     */
    if ((flags & BDRV_O_NO_IO) == 0) {
        perm |= BLK_PERM_CONSISTENT_READ;
        if (flags & BDRV_O_RDWR) {
            perm |= BLK_PERM_WRITE;
        }
    }
    if (flags & BDRV_O_RESIZE) {
        perm |= BLK_PERM_RESIZE;
    }
    if (flags & BDRV_O_NO_SHARE) {
        shared = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED;
    }

    aio_context_acquire(qemu_get_aio_context());
    bs = bdrv_open(filename, reference, options, flags, errp);
    aio_context_release(qemu_get_aio_context());
    if (!bs) {
        return NULL;
    }

    /* bdrv_open() could have moved bs to a different AioContext */
    ctx = bdrv_get_aio_context(bs);
    blk = blk_new(bdrv_get_aio_context(bs), perm, shared);
    blk->perm = perm;
    blk->shared_perm = shared;

    aio_context_acquire(ctx);
    blk_insert_bs(blk, bs, errp);
    bdrv_unref(bs);
    aio_context_release(ctx);

    if (!blk->root) {
        blk_unref(blk);
        return NULL;
    }

    return blk;
}

上边已提到,blk_new_open函数的作用是打开宿主机上的硬盘文件(就是命令行参数file对应的那个/var/lib/nova/instances/1f8e6f7e-5a70-4780-89c1-464dc0e7f308/disk文件)。其返回结果是BlockBackend *blk,而这个blk是在函数中通过bdrv_open函数得到的。

这里通过名字就能看到,已经属于是后端设备的范畴了。blk_new_open函数的注释写得清楚:

创建新的BlockBackend,打开新的BlockDriverState,并连接两者。

bdrv_open函数在block.c中,代码如下:

/* The caller must always hold the main AioContext lock. */
BlockDriverState *bdrv_open(const char *filename, const char *reference,
                            QDict *options, int flags, Error **errp)
{
    GLOBAL_STATE_CODE();

    return bdrv_open_inherit(filename, reference, options, flags, NULL,
                             NULL, 0, errp);
}

对于bdrv_open函数的解析,放在下一回中。