QEMU源码全解析 —— 块设备虚拟化(22)

接前一篇文章: QEMU源码全解析 —— 块设备虚拟化(21)

本文内容参考:

《趣谈 Linux操作系统 》 —— 刘超, 极客时间

QEMU /KVM源码解析与应用》 —— 李强,机械工业出版社

特此致谢!

QEMU启动过程中的块设备虚拟化

上一回沿着bdrv_open函数 ---> bdrv_open_inherit函数 ---> bdrv_open_common函数,讲解了bdrv_open_common函数的两大关键步骤:

bdrv_open_driver函数也在block中,代码如下:

/*
 * The caller must always hold @bs AioContext lock, because this function calls
 * bdrv_refresh_total_sectors() which polls when called from non-coroutine
 * context.
 */
static int no_coroutine_fn GRAPH_UNLOCKED
bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv, const char *node_name,
                 QDict *options, int open_flags, Error **errp)
{
    AioContext *ctx;
    Error *local_err = NULL;
    int i, ret;
    GLOBAL_STATE_CODE();
 
    bdrv_assign_node_name(bs, node_name, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }
 
    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);
 
    if (drv->bdrv_file_open) {
        assert(!drv->bdrv_needs_filename || bs->filename[0]);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else if (drv->bdrv_open) {
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    } else {
        ret = 0;
    }
 
    if (ret < 0) {
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto open_failed;
    }
 
    assert(!(bs->supported_read_flags & ~BDRV_REQ_MASK));
    assert(!(bs->supported_write_flags & ~BDRV_REQ_MASK));
 
    /*
     * Always allow the BDRV_REQ_REGISTERED_BUF optimization hint. This saves
     * drivers that pass read/write requests through to a child the trouble of
     * declaring support explicitly.
     *
     * Drivers must not propagate this flag accidentally when they initiate I/O
     * to a bounce buffer. That case should be rare though.
     */
    bs->supported_read_flags |= BDRV_REQ_REGISTERED_BUF;
    bs->supported_write_flags |= BDRV_REQ_REGISTERED_BUF;
 
    /* Get the context after .bdrv_open, it can change the context */
    ctx = bdrv_get_aio_context(bs);
    aio_context_acquire(ctx);
 
    ret = bdrv_refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        aio_context_release(ctx);
        return ret;
    }
 
    bdrv_graph_rdlock_main_loop();
    bdrv_refresh_limits(bs, NULL, &local_err);
    bdrv_graph_rdunlock_main_loop();
    aio_context_release(ctx);
 
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }
 
    assert(bdrv_opt_mem_align(bs) != 0);
    assert(bdrv_min_mem_align(bs) != 0);
    assert(is_power_of_2(bs->bl.request_alignment));
 
    for (i = 0; i < bs->quiesce_counter; i++) {
        if (drv->bdrv_drain_begin) {
            drv->bdrv_drain_begin(bs);
        }
    }
 
    return 0;
open_failed:
    bs->drv = NULL;
    if (bs->file != NULL) {
        bdrv_unref_child(bs, bs->file);
        assert(!bs->file);
    }
    g_free(bs->opaque);
    bs->opaque = NULL;
    return ret;
}

上一回也讲了QEMU支持的几种磁盘文件格式,包括raw、qcow2、vmdk、vdi等。对于不同的格式,打开的方式不尽相同。以raw和qcow2为例,它们的BlockDriver的定义分别如下:

  • raw

block/raw-format.c中:

BlockDriver bdrv_raw = {
    .format_name          = "raw",
    .instance_size        = sizeof(BDRVRawState),
    .supports_zoned_children = true,
    .bdrv_probe           = &raw_probe,
    .bdrv_reopen_prepare  = &raw_reopen_prepare,
    .bdrv_reopen_commit   = &raw_reopen_commit,
    .bdrv_reopen_abort    = &raw_reopen_abort,
    .bdrv_open            = &raw_open,
    .bdrv_child_perm      = raw_child_perm,
    .bdrv_co_create_opts  = &raw_co_create_opts,
    .bdrv_co_preadv       = &raw_co_preadv,
    .bdrv_co_pwritev      = &raw_co_pwritev,
    .bdrv_co_pwrite_zeroes = &raw_co_pwrite_zeroes,
    .bdrv_co_pdiscard     = &raw_co_pdiscard,
    .bdrv_co_zone_report  = &raw_co_zone_report,
    .bdrv_co_zone_mgmt  = &raw_co_zone_mgmt,
    .bdrv_co_zone_append = &raw_co_zone_append,
    .bdrv_co_block_status = &raw_co_block_status,
    .bdrv_co_copy_range_from = &raw_co_copy_range_from,
    .bdrv_co_copy_range_to  = &raw_co_copy_range_to,
    .bdrv_co_truncate     = &raw_co_truncate,
    .bdrv_co_getlength    = &raw_co_getlength,
    .is_format            = true,
    .bdrv_measure         = &raw_measure,
    .bdrv_co_get_info     = &raw_co_get_info,
    .bdrv_refresh_limits  = &raw_refresh_limits,
    .bdrv_probe_blocksizes = &raw_probe_blocksizes,
    .bdrv_probe_geometry  = &raw_probe_geometry,
    .bdrv_co_eject        = &raw_co_eject,
    .bdrv_co_lock_medium  = &raw_co_lock_medium,
    .bdrv_co_ioctl        = &raw_co_ioctl,
    .create_opts          = &raw_create_opts,
    .bdrv_has_zero_init   = &raw_has_zero_init,
    .strong_runtime_opts  = raw_strong_runtime_opts,
    .mutable_opts         = mutable_opts,
    .bdrv_cancel_in_flight = raw_cancel_in_flight,
};

static void bdrv_raw_init(void)
{
    bdrv_register(&bdrv_raw);
}

block_init(bdrv_raw_init);
  • qcow2

block/qcow2.c中:

BlockDriver bdrv_qcow2 = {
    .format_name        = "qcow2",
    .instance_size      = sizeof(BDRVQcow2State),
    .bdrv_probe         = qcow2_probe,
    .bdrv_open          = qcow2_open,
    .bdrv_close         = qcow2_close,
    .bdrv_reopen_prepare  = qcow2_reopen_prepare,
    .bdrv_reopen_commit   = qcow2_reopen_commit,
    .bdrv_reopen_commit_post = qcow2_reopen_commit_post,
    .bdrv_reopen_abort    = qcow2_reopen_abort,
    .bdrv_join_options    = qcow2_join_options,
    .bdrv_child_perm      = bdrv_default_perms,
    .bdrv_co_create_opts  = qcow2_co_create_opts,
    .bdrv_co_create       = qcow2_co_create,
    .bdrv_has_zero_init   = qcow2_has_zero_init,
    .bdrv_co_block_status = qcow2_co_block_status,

    .bdrv_co_preadv_part    = qcow2_co_preadv_part,
    .bdrv_co_pwritev_part   = qcow2_co_pwritev_part,
    .bdrv_co_flush_to_os    = qcow2_co_flush_to_os,

    .bdrv_co_pwrite_zeroes  = qcow2_co_pwrite_zeroes,
    .bdrv_co_pdiscard       = qcow2_co_pdiscard,
    .bdrv_co_copy_range_from = qcow2_co_copy_range_from,
    .bdrv_co_copy_range_to  = qcow2_co_copy_range_to,
    .bdrv_co_truncate       = qcow2_co_truncate,
    .bdrv_co_pwritev_compressed_part = qcow2_co_pwritev_compressed_part,
    .bdrv_make_empty        = qcow2_make_empty,

    .bdrv_snapshot_create   = qcow2_snapshot_create,
    .bdrv_snapshot_goto     = qcow2_snapshot_goto,
    .bdrv_snapshot_delete   = qcow2_snapshot_delete,
    .bdrv_snapshot_list     = qcow2_snapshot_list,
    .bdrv_snapshot_load_tmp = qcow2_snapshot_load_tmp,
    .bdrv_measure           = qcow2_measure,
    .bdrv_co_get_info       = qcow2_co_get_info,
    .bdrv_get_specific_info = qcow2_get_specific_info,

    .bdrv_co_save_vmstate   = qcow2_co_save_vmstate,
    .bdrv_co_load_vmstate   = qcow2_co_load_vmstate,

    .is_format                  = true,
    .supports_backing           = true,
    .bdrv_change_backing_file   = qcow2_change_backing_file,

    .bdrv_refresh_limits        = qcow2_refresh_limits,
    .bdrv_co_invalidate_cache   = qcow2_co_invalidate_cache,
    .bdrv_inactivate            = qcow2_inactivate,

    .create_opts         = &qcow2_create_opts,
    .amend_opts          = &qcow2_amend_opts,
    .strong_runtime_opts = qcow2_strong_runtime_opts,
    .mutable_opts        = mutable_opts,
    .bdrv_co_check       = qcow2_co_check,
    .bdrv_amend_options  = qcow2_amend_options,
    .bdrv_co_amend       = qcow2_co_amend,

    .bdrv_detach_aio_context  = qcow2_detach_aio_context,
    .bdrv_attach_aio_context  = qcow2_attach_aio_context,

    .bdrv_supports_persistent_dirty_bitmap =
            qcow2_supports_persistent_dirty_bitmap,
    .bdrv_co_can_store_new_dirty_bitmap = qcow2_co_can_store_new_dirty_bitmap,
    .bdrv_co_remove_persistent_dirty_bitmap =
            qcow2_co_remove_persistent_dirty_bitmap,
};

static void bdrv_qcow2_init(void)
{
    bdrv_register(&bdrv_qcow2);
}

block_init(bdrv_qcow2_init);

那么,对于raw格式来说,drv->bdrv_open()调用的就是raw_open函数;对于qcow2格式来说,drv->bdrv_open()调用的是qcow2_open函数。

BlockDriver bdrv_raw = {
    .format_name          = "raw",
    .instance_size        = sizeof(BDRVRawState),
    .supports_zoned_children = true,
    .bdrv_probe           = &raw_probe,
    .bdrv_reopen_prepare  = &raw_reopen_prepare,
    .bdrv_reopen_commit   = &raw_reopen_commit,
    .bdrv_reopen_abort    = &raw_reopen_abort,
    .bdrv_open            = &raw_open,
    ……
};
BlockDriver bdrv_qcow2 = {
    .format_name        = "qcow2",
    .instance_size      = sizeof(BDRVQcow2State),
    .bdrv_probe         = qcow2_probe,
    .bdrv_open          = qcow2_open,
    .bdrv_close         = qcow2_close,
    ……
};

这次先来看qcow2格式对应的qcow2_open函数,在block/qcow2.c中,代码如下:

static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
                      Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    QCow2OpenCo qoc = {
        .bs = bs,
        .options = options,
        .flags = flags,
        .errp = errp,
        .ret = -EINPROGRESS
    };
    int ret;

    ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
    if (ret < 0) {
        return ret;
    }

    /* Initialise locks */
    qemu_co_mutex_init(&s->lock);

    assert(!qemu_in_coroutine());
    assert(qemu_get_current_aio_context() == qemu_get_aio_context());

    aio_co_enter(bdrv_get_aio_context(bs),
                 qemu_coroutine_create(qcow2_open_entry, &qoc));
    AIO_WAIT_WHILE_UNLOCKED(NULL, qoc.ret == -EINPROGRESS);

    return qoc.ret;
}

在qcow2_open函数中,会通过qemu_coroutine_create()以及aio_co_center()进入一个协程coroutine。什么是协程呢?知识补强一下。

协程是一种轻量级的线程,也可以称为用户态线程。传统线程的调度是由内核完成的,而协程则可以在用户层自主调度。协程的优点包括:

  • 上下文切换效率高。因为切换对内核来说是无感知的,其切换开销基本和函数调用差不多。
  • 协程间共享数据无需加锁。因为它们还是在一个线程上运行,并不会真正并行执行。

线程是一个内核的概念,创建的每一个线程,在内核中都对应一个task_struct对象,内核的调度也是以线程为单位的。这对于普通进程没有什么问题,但别忘了此处的场景是QEMU虚拟机。如果在用户态和内核态之间切来切去,那么代价会比较大(尤其还涉及到虚拟机的状态)。

但是,QEMU中的设备也需要有并发能力,那怎么办呢?就在用户态实现一个类似线程的机制,即引入协程,用它来实现并发,并且无须切换到内核,调度全部在用户态完成。

这里,打开一个qcow2格式的文件就使用一个协程。创建一个协程和创建一个线程很像,也需要指定一个函数(协程函数)来执行,上边代码中的qcow2_open_entry就是协程函数。

qcow2_open_entry函数也在block/qcow2.c中,代码如下:

static void coroutine_fn qcow2_open_entry(void *opaque)
{
    QCow2OpenCo *qoc = opaque;
    BDRVQcow2State *s = qoc->bs->opaque;

    GRAPH_RDLOCK_GUARD();

    qemu_co_mutex_lock(&s->lock);
    qoc->ret = qcow2_do_open(qoc->bs, qoc->options, qoc->flags, true,
                             qoc->errp);
    qemu_co_mutex_unlock(&s->lock);

    aio_wait_kick();
}

看到函数名前边那个coroutine_fn了吧?这就说明该函数是一个协程函数。核心是调用了qcow2_do_open函数。qcow2_do_open函数也在block/qcow2.c中,代码很长(将近600行),在此就不深入解析了,只要知道其功能是根据qcow2的格式打开磁盘文件就可以了。

这样,qcow2格式对应的qcow2_open函数就基本解析完了,下一回解析raw格式对应的raw_open函数。