QEMU源码全解析 —— 内存虚拟化(5)

接前一篇文章: QEMU源码全解析 —— 内存虚拟化(4)

本文内容参考:

《趣谈 Linux操作系统 》 —— 刘超, 极客时间

QEMU /KVM源码解析与应用》 —— 李强,机械工业出版社

QEMU内存管理模型

浅谈QEMU Memory Region 与 Address Space

特此致谢!

距离上一篇文章( QEMU源码全解析 —— 内存虚拟化(4)_qemu 内存-CSDN博客 )的发表已经过去了半年多快一年了,并非笔者懈怠了,更不是对虚拟化没兴趣了,而是这半年太忙了。4月份入职新公司后一上来就“跑步前进”,再加上有其它“课题”需要研究,因此一直没有继续这一部分了,真的感觉很对不起虚拟化。不过忙碌的日子终于过去了,可以至少有一部分时间和精力回到这一块,继续向前进行探索和研究。

一上来先得“圆扣”,找到上一回讲到哪了。上一回或者说前几回解析了内存虚拟化的基础数据结构,基础有了,接下来该开始讲解具体代码了。

2. QEMU虚拟机内存初始化

内存作为虚拟机的基础部分,其初始化也是在pc_init1函数中进行的。为了便于理解和回顾,这里再次贴出pc_init1函数源码,在hw/i386/pc_piix.c中,如下:

/* PC hardware initialisation */
static void pc_init1(MachineState *machine,
                     const char *host_type, const char *pci_type)
{
    PCMachineState *pcms = PC_MACHINE(machine);
    PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
    X86MachineState *x86ms = X86_MACHINE(machine);
    MemoryRegion *system_memory = get_system_memory();
    MemoryRegion *system_io = get_system_io();
    PCIBus *pci_bus = NULL;
    ISABus *isa_bus;
    int piix3_devfn = -1;
    qemu_irq smi_irq;
    GSIState *gsi_state;
    BusState *idebus[MAX_IDE_BUS];
    ISADevice *rtc_state;
    MemoryRegion *ram_memory;
    MemoryRegion *pci_memory = NULL;
    MemoryRegion *rom_memory = system_memory;
    ram_addr_t lowmem;
    uint64_t hole64_size = 0;

    /*
     * Calculate ram split, for memory below and above 4G.  It's a bit
     * complicated for backward compatibility reasons ...
     *
     *  - Traditional split is 3.5G (lowmem = 0xe0000000).  This is the
     *    default value for max_ram_below_4g now.
     *
     *  - Then, to gigabyte align the memory, we move the split to 3G
     *    (lowmem = 0xc0000000).  But only in case we have to split in
     *    the first place, i.e. ram_size is larger than (traditional)
     *    lowmem.  And for new machine types (gigabyte_align = true)
     *    only, for live migration compatibility reasons.
     *
     *  - Next the max-ram-below-4g option was added, which allowed to
     *    reduce lowmem to a smaller value, to allow a larger PCI I/O
     *    window below 4G.  qemu doesn't enforce gigabyte alignment here,
     *    but prints a warning.
     *
     *  - Finally max-ram-below-4g got updated to also allow raising lowmem,
     *    so legacy non-PAE guests can get as much memory as possible in
     *    the 32bit address space below 4G.
     *
     *  - Note that Xen has its own ram setup code in xen_ram_init(),
     *    called via xen_hvm_init_pc().
     *
     * Examples:
     *    qemu -M pc-1.7 -m 4G    (old default)    -> 3584M low,  512M high
     *    qemu -M pc -m 4G        (new default)    -> 3072M low, 1024M high
     *    qemu -M pc,max-ram-below-4g=2G -m 4G     -> 2048M low, 2048M high
     *    qemu -M pc,max-ram-below-4g=4G -m 3968M  -> 3968M low (=4G-128M)
     */
    if (xen_enabled()) {
        xen_hvm_init_pc(pcms, &ram_memory);
    } else {
        ram_memory = machine->ram;
        if (!pcms->max_ram_below_4g) {
            pcms->max_ram_below_4g = 0xe0000000; /* default: 3.5G */
        }
        lowmem = pcms->max_ram_below_4g;
        if (machine->ram_size >= pcms->max_ram_below_4g) {
            if (pcmc->gigabyte_align) {
                if (lowmem > 0xc0000000) {
                    lowmem = 0xc0000000;
                }
                if (lowmem & (1 * GiB - 1)) {
                    warn_report("Large machine and max_ram_below_4g "
                                "(%" PRIu64 ") not a multiple of 1G; "
                                "possible bad performance.",
                                pcms->max_ram_below_4g);
                }
            }
        }

        if (machine->ram_size >= lowmem) {
            x86ms->above_4g_mem_size = machine->ram_size - lowmem;
            x86ms->below_4g_mem_size = lowmem;
        } else {
            x86ms->above_4g_mem_size = 0;
            x86ms->below_4g_mem_size = machine->ram_size;
        }
    }

    pc_machine_init_sgx_epc(pcms);
    x86_cpus_init(x86ms, pcmc->default_cpu_version);

    if (pcmc->kvmclock_enabled) {
        kvmclock_create(pcmc->kvmclock_create_always);
    }

    if (pcmc->pci_enabled) {
        Object *phb;

        pci_memory = g_new(MemoryRegion, 1);
        memory_region_init(pci_memory, NULL, "pci", UINT64_MAX);
        rom_memory = pci_memory;

        phb = OBJECT(qdev_new(host_type));
        object_property_add_child(OBJECT(machine), "i440fx", phb);
        object_property_set_link(phb, PCI_HOST_PROP_RAM_MEM,
                                 OBJECT(ram_memory), &error_fatal);
        object_property_set_link(phb, PCI_HOST_PROP_PCI_MEM,
                                 OBJECT(pci_memory), &error_fatal);
        object_property_set_link(phb, PCI_HOST_PROP_SYSTEM_MEM,
                                 OBJECT(system_memory), &error_fatal);
        object_property_set_link(phb, PCI_HOST_PROP_IO_MEM,
                                 OBJECT(system_io), &error_fatal);
        object_property_set_uint(phb, PCI_HOST_BELOW_4G_MEM_SIZE,
                                 x86ms->below_4g_mem_size, &error_fatal);
        object_property_set_uint(phb, PCI_HOST_ABOVE_4G_MEM_SIZE,
                                 x86ms->above_4g_mem_size, &error_fatal);
        object_property_set_str(phb, I440FX_HOST_PROP_PCI_TYPE, pci_type,
                                &error_fatal);
        sysbus_realize_and_unref(SYS_BUS_DEVICE(phb), &error_fatal);

        pci_bus = PCI_BUS(qdev_get_child_bus(DEVICE(phb), "pci.0"));
        pci_bus_map_irqs(pci_bus,
                         xen_enabled() ? xen_pci_slot_get_pirq
                                       : pc_pci_slot_get_pirq);
        pcms->bus = pci_bus;

        hole64_size = object_property_get_uint(phb,
                                               PCI_HOST_PROP_PCI_HOLE64_SIZE,
                                               &error_abort);
    }

    pc_guest_info_init(pcms);

    if (pcmc->smbios_defaults) {
        MachineClass *mc = MACHINE_GET_CLASS(machine);
        /* These values are guest ABI, do not change */
        smbios_set_defaults("QEMU", mc->desc,
                            mc->name, pcmc->smbios_legacy_mode,
                            pcmc->smbios_uuid_encoded,
                            pcms->smbios_entry_point_type);
    }

    /* allocate ram and load rom/bios */
    if (!xen_enabled()) {
        pc_memory_init(pcms, system_memory, rom_memory, hole64_size);
    } else {
        assert(machine->ram_size == x86ms->below_4g_mem_size +
                                    x86ms->above_4g_mem_size);

        pc_system_flash_cleanup_unused(pcms);
        if (machine->kernel_filename != NULL) {
            /* For xen HVM direct kernel boot, load linux here */
            xen_load_linux(pcms);
        }
    }

    gsi_state = pc_gsi_create(&x86ms->gsi, pcmc->pci_enabled);

    if (pcmc->pci_enabled) {
        PIIX3State *piix3;
        PCIDevice *pci_dev;

        pci_dev = pci_create_simple_multifunction(pci_bus, -1, TYPE_PIIX3_DEVICE);

        if (xen_enabled()) {
            pci_device_set_intx_routing_notifier(
                        pci_dev, piix_intx_routing_notifier_xen);

            /*
             * Xen supports additional interrupt routes from the PCI devices to
             * the IOAPIC: the four pins of each PCI device on the bus are also
             * connected to the IOAPIC directly.
             * These additional routes can be discovered through ACPI.
             */
            pci_bus_irqs(pci_bus, xen_intx_set_irq, pci_dev,
                         XEN_IOAPIC_NUM_PIRQS);
        }

        piix3 = PIIX3_PCI_DEVICE(pci_dev);
        piix3->pic = x86ms->gsi;
        piix3_devfn = piix3->dev.devfn;
        isa_bus = ISA_BUS(qdev_get_child_bus(DEVICE(piix3), "isa.0"));
        rtc_state = ISA_DEVICE(object_resolve_path_component(OBJECT(pci_dev),
                                                             "rtc"));
    } else {
        isa_bus = isa_bus_new(NULL, system_memory, system_io,
                              &error_abort);

        rtc_state = isa_new(TYPE_MC146818_RTC);
        qdev_prop_set_int32(DEVICE(rtc_state), "base_year", 2000);
        isa_realize_and_unref(rtc_state, isa_bus, &error_fatal);

        i8257_dma_init(isa_bus, 0);
        pcms->hpet_enabled = false;
    }
    isa_bus_register_input_irqs(isa_bus, x86ms->gsi);

    if (x86ms->pic == ON_OFF_AUTO_ON || x86ms->pic == ON_OFF_AUTO_AUTO) {
        pc_i8259_create(isa_bus, gsi_state->i8259_irq);
    }

    if (pcmc->pci_enabled) {
        ioapic_init_gsi(gsi_state, "i440fx");
    }

    if (tcg_enabled()) {
        x86_register_ferr_irq(x86ms->gsi[13]);
    }

    pc_vga_init(isa_bus, pcmc->pci_enabled ? pci_bus : NULL);

    assert(pcms->vmport != ON_OFF_AUTO__MAX);
    if (pcms->vmport == ON_OFF_AUTO_AUTO) {
        pcms->vmport = xen_enabled() ? ON_OFF_AUTO_OFF : ON_OFF_AUTO_ON;
    }

    /* init basic PC hardware */
    pc_basic_device_init(pcms, isa_bus, x86ms->gsi, rtc_state, true,
                         0x4);

    pc_nic_init(pcmc, isa_bus, pci_bus);

    if (pcmc->pci_enabled) {
        PCIDevice *dev;

        dev = pci_create_simple(pci_bus, piix3_devfn + 1, TYPE_PIIX3_IDE);
        pci_ide_create_devs(dev);
        idebus[0] = qdev_get_child_bus(&dev->qdev, "ide.0");
        idebus[1] = qdev_get_child_bus(&dev->qdev, "ide.1");
        pc_cmos_init(pcms, idebus[0], idebus[1], rtc_state);
    }
#ifdef CONFIG_IDE_ISA
    else {
        DriveInfo *hd[MAX_IDE_BUS * MAX_IDE_DEVS];
        int i;

        ide_drive_get(hd, ARRAY_SIZE(hd));
        for (i = 0; i < MAX_IDE_BUS; i++) {
            ISADevice *dev;
            char busname[] = "ide.0";
            dev = isa_ide_init(isa_bus, ide_iobase[i], ide_iobase2[i],
                               ide_irq[i],
                               hd[MAX_IDE_DEVS * i], hd[MAX_IDE_DEVS * i + 1]);
            /*
             * The ide bus name is ide.0 for the first bus and ide.1 for the
             * second one.
             */
            busname[4] = '0' + i;
            idebus[i] = qdev_get_child_bus(DEVICE(dev), busname);
        }
        pc_cmos_init(pcms, idebus[0], idebus[1], rtc_state);
    }
#endif

    if (pcmc->pci_enabled && machine_usb(machine)) {
        pci_create_simple(pci_bus, piix3_devfn + 2, TYPE_PIIX3_USB_UHCI);
    }

    if (pcmc->pci_enabled && x86_machine_is_acpi_enabled(X86_MACHINE(pcms))) {
        PCIDevice *piix4_pm;

        smi_irq = qemu_allocate_irq(pc_acpi_smi_interrupt, first_cpu, 0);
        piix4_pm = pci_new(piix3_devfn + 3, TYPE_PIIX4_PM);
        qdev_prop_set_uint32(DEVICE(piix4_pm), "smb_io_base", 0xb100);
        qdev_prop_set_bit(DEVICE(piix4_pm), "smm-enabled",
                          x86_machine_is_smm_enabled(x86ms));
        pci_realize_and_unref(piix4_pm, pci_bus, &error_fatal);

        qdev_connect_gpio_out(DEVICE(piix4_pm), 0, x86ms->gsi[9]);
        qdev_connect_gpio_out_named(DEVICE(piix4_pm), "smi-irq", 0, smi_irq);
        pcms->smbus = I2C_BUS(qdev_get_child_bus(DEVICE(piix4_pm), "i2c"));
        /* TODO: Populate SPD eeprom data.  */
        smbus_eeprom_init(pcms->smbus, 8, NULL, 0);

        object_property_add_link(OBJECT(machine), PC_MACHINE_ACPI_DEVICE_PROP,
                                 TYPE_HOTPLUG_HANDLER,
                                 (Object **)&x86ms->acpi_dev,
                                 object_property_allow_set_link,
                                 OBJ_PROP_LINK_STRONG);
        object_property_set_link(OBJECT(machine), PC_MACHINE_ACPI_DEVICE_PROP,
                                 OBJECT(piix4_pm), &error_abort);
    }

    if (machine->nvdimms_state->is_enabled) {
        nvdimm_init_acpi_state(machine->nvdimms_state, system_io,
                               x86_nvdimm_acpi_dsmio,
                               x86ms->fw_cfg, OBJECT(pcms));
    }
}

pc_init1函数很长,有将近300行,不过这里只关注与内存初始化相关的代码。

内存分为低端内存和高端内存,之所以会有这个区分,是因为一些有传统设备的虚拟机,其设备必须使用一些地址空间在4GB以下的内存。低端内存和高端内存的计算见以下代码片段:

        ram_memory = machine->ram;
        if (!pcms->max_ram_below_4g) {
            pcms->max_ram_below_4g = 0xe0000000; /* default: 3.5G */
        }
        lowmem = pcms->max_ram_below_4g;
        if (machine->ram_size >= pcms->max_ram_below_4g) {
            if (pcmc->gigabyte_align) {
                if (lowmem > 0xc0000000) {
                    lowmem = 0xc0000000;
                }
                if (lowmem & (1 * GiB - 1)) {
                    warn_report("Large machine and max_ram_below_4g "
                                "(%" PRIu64 ") not a multiple of 1G; "
                                "possible bad performance.",
                                pcms->max_ram_below_4g);
                }
            }
        }

        if (machine->ram_size >= lowmem) {
            x86ms->above_4g_mem_size = machine->ram_size - lowmem;
            x86ms->below_4g_mem_size = lowmem;
        } else {
            x86ms->above_4g_mem_size = 0;
            x86ms->below_4g_mem_size = machine->ram_size;
        }
    }

如果用户在命令行指定了max-ram-below-4g参数,则使用用户指定的参数,这可以让一些非传统的虚拟机使用更多的4GB以下的地址空间;如果用户并未指定max-ram-below-4g参数,则又要分为两种情况:

1)传统的虚拟机(qemu-2.5版本以下)使用3.5GB(0xe0000000)作为分界线;

        ram_memory = machine->ram;
        if (!pcms->max_ram_below_4g) {
            pcms->max_ram_below_4g = 0xe0000000; /* default: 3.5G */
        }
        lowmem = pcms->max_ram_below_4g;

2)在(更)高版本的虚拟机下,会设置gigabyte_align。当虚拟机的内存大于传统低端内存(3.5GB)时,会以3GB(0xc0000000)作为分界线。

        if (machine->ram_size >= pcms->max_ram_below_4g) {
            if (pcmc->gigabyte_align) {
                if (lowmem > 0xc0000000) {
                    lowmem = 0xc0000000;
                }
                if (lowmem & (1 * GiB - 1)) {
                    warn_report("Large machine and max_ram_below_4g "
                                "(%" PRIu64 ") not a multiple of 1G; "
                                "possible bad performance.",
                                pcms->max_ram_below_4g);
                }
            }
        }

超过lowmem的地址就是高端内存了,否则就是低端内存。

        if (machine->ram_size >= lowmem) {
            x86ms->above_4g_mem_size = machine->ram_size - lowmem;
            x86ms->below_4g_mem_size = lowmem;
        } else {
            x86ms->above_4g_mem_size = 0;
            x86ms->below_4g_mem_size = machine->ram_size;
        }

由于本回是隔了很长时间之后才回来继续之前研究的,所以更多以回顾和恢复状态为主,内容就只讲到这里了。随着状态的逐步恢复,下一回开始回到正轨。更多内容请看下回。