接前一篇文章: QEMU源码全解析 —— 内存虚拟化(4)
本文内容参考:
《 QEMU /KVM源码解析与应用》 —— 李强,机械工业出版社
浅谈QEMU Memory Region 与 Address Space
特此致谢!
距离上一篇文章( QEMU源码全解析 —— 内存虚拟化(4)_qemu 内存-CSDN博客 )的发表已经过去了半年多快一年了,并非笔者懈怠了,更不是对虚拟化没兴趣了,而是这半年太忙了。4月份入职新公司后一上来就“跑步前进”,再加上有其它“课题”需要研究,因此一直没有继续这一部分了,真的感觉很对不起虚拟化。不过忙碌的日子终于过去了,可以至少有一部分时间和精力回到这一块,继续向前进行探索和研究。
一上来先得“圆扣”,找到上一回讲到哪了。上一回或者说前几回解析了内存虚拟化的基础数据结构,基础有了,接下来该开始讲解具体代码了。
2. QEMU虚拟机内存初始化
内存作为虚拟机的基础部分,其初始化也是在pc_init1函数中进行的。为了便于理解和回顾,这里再次贴出pc_init1函数源码,在hw/i386/pc_piix.c中,如下:
/* PC hardware initialisation */
static void pc_init1(MachineState *machine,
const char *host_type, const char *pci_type)
{
PCMachineState *pcms = PC_MACHINE(machine);
PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
X86MachineState *x86ms = X86_MACHINE(machine);
MemoryRegion *system_memory = get_system_memory();
MemoryRegion *system_io = get_system_io();
PCIBus *pci_bus = NULL;
ISABus *isa_bus;
int piix3_devfn = -1;
qemu_irq smi_irq;
GSIState *gsi_state;
BusState *idebus[MAX_IDE_BUS];
ISADevice *rtc_state;
MemoryRegion *ram_memory;
MemoryRegion *pci_memory = NULL;
MemoryRegion *rom_memory = system_memory;
ram_addr_t lowmem;
uint64_t hole64_size = 0;
/*
* Calculate ram split, for memory below and above 4G. It's a bit
* complicated for backward compatibility reasons ...
*
* - Traditional split is 3.5G (lowmem = 0xe0000000). This is the
* default value for max_ram_below_4g now.
*
* - Then, to gigabyte align the memory, we move the split to 3G
* (lowmem = 0xc0000000). But only in case we have to split in
* the first place, i.e. ram_size is larger than (traditional)
* lowmem. And for new machine types (gigabyte_align = true)
* only, for live migration compatibility reasons.
*
* - Next the max-ram-below-4g option was added, which allowed to
* reduce lowmem to a smaller value, to allow a larger PCI I/O
* window below 4G. qemu doesn't enforce gigabyte alignment here,
* but prints a warning.
*
* - Finally max-ram-below-4g got updated to also allow raising lowmem,
* so legacy non-PAE guests can get as much memory as possible in
* the 32bit address space below 4G.
*
* - Note that Xen has its own ram setup code in xen_ram_init(),
* called via xen_hvm_init_pc().
*
* Examples:
* qemu -M pc-1.7 -m 4G (old default) -> 3584M low, 512M high
* qemu -M pc -m 4G (new default) -> 3072M low, 1024M high
* qemu -M pc,max-ram-below-4g=2G -m 4G -> 2048M low, 2048M high
* qemu -M pc,max-ram-below-4g=4G -m 3968M -> 3968M low (=4G-128M)
*/
if (xen_enabled()) {
xen_hvm_init_pc(pcms, &ram_memory);
} else {
ram_memory = machine->ram;
if (!pcms->max_ram_below_4g) {
pcms->max_ram_below_4g = 0xe0000000; /* default: 3.5G */
}
lowmem = pcms->max_ram_below_4g;
if (machine->ram_size >= pcms->max_ram_below_4g) {
if (pcmc->gigabyte_align) {
if (lowmem > 0xc0000000) {
lowmem = 0xc0000000;
}
if (lowmem & (1 * GiB - 1)) {
warn_report("Large machine and max_ram_below_4g "
"(%" PRIu64 ") not a multiple of 1G; "
"possible bad performance.",
pcms->max_ram_below_4g);
}
}
}
if (machine->ram_size >= lowmem) {
x86ms->above_4g_mem_size = machine->ram_size - lowmem;
x86ms->below_4g_mem_size = lowmem;
} else {
x86ms->above_4g_mem_size = 0;
x86ms->below_4g_mem_size = machine->ram_size;
}
}
pc_machine_init_sgx_epc(pcms);
x86_cpus_init(x86ms, pcmc->default_cpu_version);
if (pcmc->kvmclock_enabled) {
kvmclock_create(pcmc->kvmclock_create_always);
}
if (pcmc->pci_enabled) {
Object *phb;
pci_memory = g_new(MemoryRegion, 1);
memory_region_init(pci_memory, NULL, "pci", UINT64_MAX);
rom_memory = pci_memory;
phb = OBJECT(qdev_new(host_type));
object_property_add_child(OBJECT(machine), "i440fx", phb);
object_property_set_link(phb, PCI_HOST_PROP_RAM_MEM,
OBJECT(ram_memory), &error_fatal);
object_property_set_link(phb, PCI_HOST_PROP_PCI_MEM,
OBJECT(pci_memory), &error_fatal);
object_property_set_link(phb, PCI_HOST_PROP_SYSTEM_MEM,
OBJECT(system_memory), &error_fatal);
object_property_set_link(phb, PCI_HOST_PROP_IO_MEM,
OBJECT(system_io), &error_fatal);
object_property_set_uint(phb, PCI_HOST_BELOW_4G_MEM_SIZE,
x86ms->below_4g_mem_size, &error_fatal);
object_property_set_uint(phb, PCI_HOST_ABOVE_4G_MEM_SIZE,
x86ms->above_4g_mem_size, &error_fatal);
object_property_set_str(phb, I440FX_HOST_PROP_PCI_TYPE, pci_type,
&error_fatal);
sysbus_realize_and_unref(SYS_BUS_DEVICE(phb), &error_fatal);
pci_bus = PCI_BUS(qdev_get_child_bus(DEVICE(phb), "pci.0"));
pci_bus_map_irqs(pci_bus,
xen_enabled() ? xen_pci_slot_get_pirq
: pc_pci_slot_get_pirq);
pcms->bus = pci_bus;
hole64_size = object_property_get_uint(phb,
PCI_HOST_PROP_PCI_HOLE64_SIZE,
&error_abort);
}
pc_guest_info_init(pcms);
if (pcmc->smbios_defaults) {
MachineClass *mc = MACHINE_GET_CLASS(machine);
/* These values are guest ABI, do not change */
smbios_set_defaults("QEMU", mc->desc,
mc->name, pcmc->smbios_legacy_mode,
pcmc->smbios_uuid_encoded,
pcms->smbios_entry_point_type);
}
/* allocate ram and load rom/bios */
if (!xen_enabled()) {
pc_memory_init(pcms, system_memory, rom_memory, hole64_size);
} else {
assert(machine->ram_size == x86ms->below_4g_mem_size +
x86ms->above_4g_mem_size);
pc_system_flash_cleanup_unused(pcms);
if (machine->kernel_filename != NULL) {
/* For xen HVM direct kernel boot, load linux here */
xen_load_linux(pcms);
}
}
gsi_state = pc_gsi_create(&x86ms->gsi, pcmc->pci_enabled);
if (pcmc->pci_enabled) {
PIIX3State *piix3;
PCIDevice *pci_dev;
pci_dev = pci_create_simple_multifunction(pci_bus, -1, TYPE_PIIX3_DEVICE);
if (xen_enabled()) {
pci_device_set_intx_routing_notifier(
pci_dev, piix_intx_routing_notifier_xen);
/*
* Xen supports additional interrupt routes from the PCI devices to
* the IOAPIC: the four pins of each PCI device on the bus are also
* connected to the IOAPIC directly.
* These additional routes can be discovered through ACPI.
*/
pci_bus_irqs(pci_bus, xen_intx_set_irq, pci_dev,
XEN_IOAPIC_NUM_PIRQS);
}
piix3 = PIIX3_PCI_DEVICE(pci_dev);
piix3->pic = x86ms->gsi;
piix3_devfn = piix3->dev.devfn;
isa_bus = ISA_BUS(qdev_get_child_bus(DEVICE(piix3), "isa.0"));
rtc_state = ISA_DEVICE(object_resolve_path_component(OBJECT(pci_dev),
"rtc"));
} else {
isa_bus = isa_bus_new(NULL, system_memory, system_io,
&error_abort);
rtc_state = isa_new(TYPE_MC146818_RTC);
qdev_prop_set_int32(DEVICE(rtc_state), "base_year", 2000);
isa_realize_and_unref(rtc_state, isa_bus, &error_fatal);
i8257_dma_init(isa_bus, 0);
pcms->hpet_enabled = false;
}
isa_bus_register_input_irqs(isa_bus, x86ms->gsi);
if (x86ms->pic == ON_OFF_AUTO_ON || x86ms->pic == ON_OFF_AUTO_AUTO) {
pc_i8259_create(isa_bus, gsi_state->i8259_irq);
}
if (pcmc->pci_enabled) {
ioapic_init_gsi(gsi_state, "i440fx");
}
if (tcg_enabled()) {
x86_register_ferr_irq(x86ms->gsi[13]);
}
pc_vga_init(isa_bus, pcmc->pci_enabled ? pci_bus : NULL);
assert(pcms->vmport != ON_OFF_AUTO__MAX);
if (pcms->vmport == ON_OFF_AUTO_AUTO) {
pcms->vmport = xen_enabled() ? ON_OFF_AUTO_OFF : ON_OFF_AUTO_ON;
}
/* init basic PC hardware */
pc_basic_device_init(pcms, isa_bus, x86ms->gsi, rtc_state, true,
0x4);
pc_nic_init(pcmc, isa_bus, pci_bus);
if (pcmc->pci_enabled) {
PCIDevice *dev;
dev = pci_create_simple(pci_bus, piix3_devfn + 1, TYPE_PIIX3_IDE);
pci_ide_create_devs(dev);
idebus[0] = qdev_get_child_bus(&dev->qdev, "ide.0");
idebus[1] = qdev_get_child_bus(&dev->qdev, "ide.1");
pc_cmos_init(pcms, idebus[0], idebus[1], rtc_state);
}
#ifdef CONFIG_IDE_ISA
else {
DriveInfo *hd[MAX_IDE_BUS * MAX_IDE_DEVS];
int i;
ide_drive_get(hd, ARRAY_SIZE(hd));
for (i = 0; i < MAX_IDE_BUS; i++) {
ISADevice *dev;
char busname[] = "ide.0";
dev = isa_ide_init(isa_bus, ide_iobase[i], ide_iobase2[i],
ide_irq[i],
hd[MAX_IDE_DEVS * i], hd[MAX_IDE_DEVS * i + 1]);
/*
* The ide bus name is ide.0 for the first bus and ide.1 for the
* second one.
*/
busname[4] = '0' + i;
idebus[i] = qdev_get_child_bus(DEVICE(dev), busname);
}
pc_cmos_init(pcms, idebus[0], idebus[1], rtc_state);
}
#endif
if (pcmc->pci_enabled && machine_usb(machine)) {
pci_create_simple(pci_bus, piix3_devfn + 2, TYPE_PIIX3_USB_UHCI);
}
if (pcmc->pci_enabled && x86_machine_is_acpi_enabled(X86_MACHINE(pcms))) {
PCIDevice *piix4_pm;
smi_irq = qemu_allocate_irq(pc_acpi_smi_interrupt, first_cpu, 0);
piix4_pm = pci_new(piix3_devfn + 3, TYPE_PIIX4_PM);
qdev_prop_set_uint32(DEVICE(piix4_pm), "smb_io_base", 0xb100);
qdev_prop_set_bit(DEVICE(piix4_pm), "smm-enabled",
x86_machine_is_smm_enabled(x86ms));
pci_realize_and_unref(piix4_pm, pci_bus, &error_fatal);
qdev_connect_gpio_out(DEVICE(piix4_pm), 0, x86ms->gsi[9]);
qdev_connect_gpio_out_named(DEVICE(piix4_pm), "smi-irq", 0, smi_irq);
pcms->smbus = I2C_BUS(qdev_get_child_bus(DEVICE(piix4_pm), "i2c"));
/* TODO: Populate SPD eeprom data. */
smbus_eeprom_init(pcms->smbus, 8, NULL, 0);
object_property_add_link(OBJECT(machine), PC_MACHINE_ACPI_DEVICE_PROP,
TYPE_HOTPLUG_HANDLER,
(Object **)&x86ms->acpi_dev,
object_property_allow_set_link,
OBJ_PROP_LINK_STRONG);
object_property_set_link(OBJECT(machine), PC_MACHINE_ACPI_DEVICE_PROP,
OBJECT(piix4_pm), &error_abort);
}
if (machine->nvdimms_state->is_enabled) {
nvdimm_init_acpi_state(machine->nvdimms_state, system_io,
x86_nvdimm_acpi_dsmio,
x86ms->fw_cfg, OBJECT(pcms));
}
}
pc_init1函数很长,有将近300行,不过这里只关注与内存初始化相关的代码。
内存分为低端内存和高端内存,之所以会有这个区分,是因为一些有传统设备的虚拟机,其设备必须使用一些地址空间在4GB以下的内存。低端内存和高端内存的计算见以下代码片段:
ram_memory = machine->ram;
if (!pcms->max_ram_below_4g) {
pcms->max_ram_below_4g = 0xe0000000; /* default: 3.5G */
}
lowmem = pcms->max_ram_below_4g;
if (machine->ram_size >= pcms->max_ram_below_4g) {
if (pcmc->gigabyte_align) {
if (lowmem > 0xc0000000) {
lowmem = 0xc0000000;
}
if (lowmem & (1 * GiB - 1)) {
warn_report("Large machine and max_ram_below_4g "
"(%" PRIu64 ") not a multiple of 1G; "
"possible bad performance.",
pcms->max_ram_below_4g);
}
}
}
if (machine->ram_size >= lowmem) {
x86ms->above_4g_mem_size = machine->ram_size - lowmem;
x86ms->below_4g_mem_size = lowmem;
} else {
x86ms->above_4g_mem_size = 0;
x86ms->below_4g_mem_size = machine->ram_size;
}
}
如果用户在命令行指定了max-ram-below-4g参数,则使用用户指定的参数,这可以让一些非传统的虚拟机使用更多的4GB以下的地址空间;如果用户并未指定max-ram-below-4g参数,则又要分为两种情况:
1)传统的虚拟机(qemu-2.5版本以下)使用3.5GB(0xe0000000)作为分界线;
ram_memory = machine->ram;
if (!pcms->max_ram_below_4g) {
pcms->max_ram_below_4g = 0xe0000000; /* default: 3.5G */
}
lowmem = pcms->max_ram_below_4g;
2)在(更)高版本的虚拟机下,会设置gigabyte_align。当虚拟机的内存大于传统低端内存(3.5GB)时,会以3GB(0xc0000000)作为分界线。
if (machine->ram_size >= pcms->max_ram_below_4g) {
if (pcmc->gigabyte_align) {
if (lowmem > 0xc0000000) {
lowmem = 0xc0000000;
}
if (lowmem & (1 * GiB - 1)) {
warn_report("Large machine and max_ram_below_4g "
"(%" PRIu64 ") not a multiple of 1G; "
"possible bad performance.",
pcms->max_ram_below_4g);
}
}
}
超过lowmem的地址就是高端内存了,否则就是低端内存。
if (machine->ram_size >= lowmem) {
x86ms->above_4g_mem_size = machine->ram_size - lowmem;
x86ms->below_4g_mem_size = lowmem;
} else {
x86ms->above_4g_mem_size = 0;
x86ms->below_4g_mem_size = machine->ram_size;
}
由于本回是隔了很长时间之后才回来继续之前研究的,所以更多以回顾和恢复状态为主,内容就只讲到这里了。随着状态的逐步恢复,下一回开始回到正轨。更多内容请看下回。