接前一篇文章:
本文内容参考:
《QEMU/KVM源码解析与应用》—— 李强,机械工业出版社
《深度探索Linux系统虚拟化:原理与实现》—— 王柏生 谢广军,机械工业出版社
特此致谢!
三、KVM模块初始化介绍
2. KVM模块初始化
本回开始对于kvm_init函数进行解析。首先贴出Linux 6.1.10内核版本下该函数的源码,在Linux内核源码根目录/virt/kvm/kvm_main.c中,代码如下:
- int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
- struct module *module)
- {
- struct kvm_cpu_compat_check c;
- int r;
- int cpu;
-
- r = kvm_arch_init(opaque);
- if (r)
- goto out_fail;
-
- /*
- * kvm_arch_init makes sure there's at most one caller
- * for architectures that support multiple implementations,
- * like intel and amd on x86.
- * kvm_arch_init must be called before kvm_irqfd_init to avoid creating
- * conflicts in case kvm is already setup for another implementation.
- */
- r = kvm_irqfd_init();
- if (r)
- goto out_irqfd;
-
- if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
- r = -ENOMEM;
- goto out_free_0;
- }
-
- r = kvm_arch_hardware_setup(opaque);
- if (r < 0)
- goto out_free_1;
-
- c.ret = &r;
- c.opaque = opaque;
- for_each_online_cpu(cpu) {
- smp_call_function_single(cpu, check_processor_compat, &c, 1);
- if (r < 0)
- goto out_free_2;
- }
-
- r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
- kvm_starting_cpu, kvm_dying_cpu);
- if (r)
- goto out_free_2;
- register_reboot_notifier(&kvm_reboot_notifier);
-
- /* A kmem cache lets us meet the alignment requirements of fx_save. */
- if (!vcpu_align)
- vcpu_align = __alignof__(struct kvm_vcpu);
- kvm_vcpu_cache =
- kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
- SLAB_ACCOUNT,
- offsetof(struct kvm_vcpu, arch),
- offsetofend(struct kvm_vcpu, stats_id)
- - offsetof(struct kvm_vcpu, arch),
- NULL);
- if (!kvm_vcpu_cache) {
- r = -ENOMEM;
- goto out_free_3;
- }
-
- for_each_possible_cpu(cpu) {
- if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
- GFP_KERNEL, cpu_to_node(cpu))) {
- r = -ENOMEM;
- goto out_free_4;
- }
- }
-
- r = kvm_async_pf_init();
- if (r)
- goto out_free_4;
-
- kvm_chardev_ops.owner = module;
-
- r = misc_register(&kvm_dev);
- if (r) {
- pr_err("kvm: misc device register failed\n");
- goto out_unreg;
- }
-
- register_syscore_ops(&kvm_syscore_ops);
-
- kvm_preempt_ops.sched_in = kvm_sched_in;
- kvm_preempt_ops.sched_out = kvm_sched_out;
-
- kvm_init_debug();
-
- r = kvm_vfio_ops_init();
- WARN_ON(r);
-
- return 0;
-
- out_unreg:
- kvm_async_pf_deinit();
- out_free_4:
- for_each_possible_cpu(cpu)
- free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
- kmem_cache_destroy(kvm_vcpu_cache);
- out_free_3:
- unregister_reboot_notifier(&kvm_reboot_notifier);
- cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
- out_free_2:
- kvm_arch_hardware_unsetup();
- out_free_1:
- free_cpumask_var(cpus_hardware_enabled);
- out_free_0:
- kvm_irqfd_exit();
- out_irqfd:
- kvm_arch_exit();
- out_fail:
- return r;
- }
- EXPORT_SYMBOL_GPL(kvm_init);
为了便于理解,再次贴出vmx_init()中调用kvm_init函数的代码片段,如下:
- r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx),
- __alignof__(struct vcpu_vmx), THIS_MODULE);
- if (r)
- return r;
kvm_init函数总体调用的函数如下图所示:
一个函数一个函数来看。
(1)kvm_arch_init函数
代码片段如下:
- r = kvm_arch_init(opaque);
- if (r)
- goto out_fail;
kvm_arch_init函数用来初始化架构相关的代码。它是一个体系结构相关的函数,在不同的体系结构中均有各自的实现,具体内容因体系结构而异。
这里仍以x86为例。x86体系结构下的kvm_arch_init函数在Linux内核源码根目录/arch/x86/kvm/x86.c中,代码如下:
- int kvm_arch_init(void *opaque)
- {
- struct kvm_x86_init_ops *ops = opaque;
- u64 host_pat;
- int r;
-
- if (kvm_x86_ops.hardware_enable) {
- pr_err("kvm: already loaded vendor module '%s'\n", kvm_x86_ops.name);
- return -EEXIST;
- }
-
- if (!ops->cpu_has_kvm_support()) {
- pr_err_ratelimited("kvm: no hardware support for '%s'\n",
- ops->runtime_ops->name);
- return -EOPNOTSUPP;
- }
- if (ops->disabled_by_bios()) {
- pr_err_ratelimited("kvm: support for '%s' disabled by bios\n",
- ops->runtime_ops->name);
- return -EOPNOTSUPP;
- }
-
- /*
- * KVM explicitly assumes that the guest has an FPU and
- * FXSAVE/FXRSTOR. For example, the KVM_GET_FPU explicitly casts the
- * vCPU's FPU state as a fxregs_state struct.
- */
- if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) {
- printk(KERN_ERR "kvm: inadequate fpu\n");
- return -EOPNOTSUPP;
- }
-
- if (IS_ENABLED(CONFIG_PREEMPT_RT) && !boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
- pr_err("RT requires X86_FEATURE_CONSTANT_TSC\n");
- return -EOPNOTSUPP;
- }
-
- /*
- * KVM assumes that PAT entry '0' encodes WB memtype and simply zeroes
- * the PAT bits in SPTEs. Bail if PAT[0] is programmed to something
- * other than WB. Note, EPT doesn't utilize the PAT, but don't bother
- * with an exception. PAT[0] is set to WB on RESET and also by the
- * kernel, i.e. failure indicates a kernel bug or broken firmware.
- */
- if (rdmsrl_safe(MSR_IA32_CR_PAT, &host_pat) ||
- (host_pat & GENMASK(2, 0)) != 6) {
- pr_err("kvm: host PAT[0] is not WB\n");
- return -EIO;
- }
-
- x86_emulator_cache = kvm_alloc_emulator_cache();
- if (!x86_emulator_cache) {
- pr_err("kvm: failed to allocate cache for x86 emulator\n");
- return -ENOMEM;
- }
-
- user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
- if (!user_return_msrs) {
- printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n");
- r = -ENOMEM;
- goto out_free_x86_emulator_cache;
- }
- kvm_nr_uret_msrs = 0;
-
- r = kvm_mmu_vendor_module_init();
- if (r)
- goto out_free_percpu;
-
- kvm_timer_init();
-
- if (boot_cpu_has(X86_FEATURE_XSAVE)) {
- host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
- kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
- }
-
- if (pi_inject_timer == -1)
- pi_inject_timer = housekeeping_enabled(HK_TYPE_TIMER);
- #ifdef CONFIG_X86_64
- pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
-
- if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
- set_hv_tscchange_cb(kvm_hyperv_tsc_notifier);
- #endif
-
- return 0;
-
- out_free_percpu:
- free_percpu(user_return_msrs);
- out_free_x86_emulator_cache:
- kmem_cache_destroy(x86_emulator_cache);
- return r;
- }
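kvm_arch_init函数做了一些初始化的工作,并确保同一时刻只有一个KVM厂商实现模块(例如x86上的Intel实现或AMD实现)能够加载到内核:如果kvm_x86_ops.hardware_enable已经被设置,说明已有厂商模块加载,函数直接返回-EEXIST。KVM实现的结构体会被赋值到全局变量kvm_x86_ops中(如前文所述)。这里传递到kvm_arch_init的参数opaque是&vmx_init_ops(类型为struct kvm_x86_init_ops,即新版本中封装的一层),其runtime_ops成员指向vmx_x86_ops,后者存放了Intel CPU下KVM实现的各类回调函数。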
再来回顾一下vmx_x86_ops的相关代码,它与vmx_init()位于同一文件(arch/x86/kvm/vmx/vmx.c)中,如下:
- static struct kvm_x86_ops vmx_x86_ops __initdata = {
- .name = "kvm_intel",
-
- .hardware_unsetup = vmx_hardware_unsetup,
-
- .hardware_enable = vmx_hardware_enable,
- .hardware_disable = vmx_hardware_disable,
- .has_emulated_msr = vmx_has_emulated_msr,
-
- .vm_size = sizeof(struct kvm_vmx),
- .vm_init = vmx_vm_init,
- .vm_destroy = vmx_vm_destroy,
-
- .vcpu_precreate = vmx_vcpu_precreate,
- .vcpu_create = vmx_vcpu_create,
- .vcpu_free = vmx_vcpu_free,
- .vcpu_reset = vmx_vcpu_reset,
-
- .prepare_switch_to_guest = vmx_prepare_switch_to_guest,
- .vcpu_load = vmx_vcpu_load,
- .vcpu_put = vmx_vcpu_put,
-
- .update_exception_bitmap = vmx_update_exception_bitmap,
- .get_msr_feature = vmx_get_msr_feature,
- .get_msr = vmx_get_msr,
- .set_msr = vmx_set_msr,
- .get_segment_base = vmx_get_segment_base,
- .get_segment = vmx_get_segment,
- .set_segment = vmx_set_segment,
- .get_cpl = vmx_get_cpl,
- .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
- .set_cr0 = vmx_set_cr0,
- .is_valid_cr4 = vmx_is_valid_cr4,
- .set_cr4 = vmx_set_cr4,
- .set_efer = vmx_set_efer,
- .get_idt = vmx_get_idt,
- .set_idt = vmx_set_idt,
- .get_gdt = vmx_get_gdt,
- .set_gdt = vmx_set_gdt,
- .set_dr7 = vmx_set_dr7,
- .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
- .cache_reg = vmx_cache_reg,
- .get_rflags = vmx_get_rflags,
- .set_rflags = vmx_set_rflags,
- .get_if_flag = vmx_get_if_flag,
-
- .flush_tlb_all = vmx_flush_tlb_all,
- .flush_tlb_current = vmx_flush_tlb_current,
- .flush_tlb_gva = vmx_flush_tlb_gva,
- .flush_tlb_guest = vmx_flush_tlb_guest,
-
- .vcpu_pre_run = vmx_vcpu_pre_run,
- .vcpu_run = vmx_vcpu_run,
- .handle_exit = vmx_handle_exit,
- .skip_emulated_instruction = vmx_skip_emulated_instruction,
- .update_emulated_instruction = vmx_update_emulated_instruction,
- .set_interrupt_shadow = vmx_set_interrupt_shadow,
- .get_interrupt_shadow = vmx_get_interrupt_shadow,
- .patch_hypercall = vmx_patch_hypercall,
- .inject_irq = vmx_inject_irq,
- .inject_nmi = vmx_inject_nmi,
- .inject_exception = vmx_inject_exception,
- .cancel_injection = vmx_cancel_injection,
- .interrupt_allowed = vmx_interrupt_allowed,
- .nmi_allowed = vmx_nmi_allowed,
- .get_nmi_mask = vmx_get_nmi_mask,
- .set_nmi_mask = vmx_set_nmi_mask,
- .enable_nmi_window = vmx_enable_nmi_window,
- .enable_irq_window = vmx_enable_irq_window,
- .update_cr8_intercept = vmx_update_cr8_intercept,
- .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
- .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
- .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
- .load_eoi_exitmap = vmx_load_eoi_exitmap,
- .apicv_post_state_restore = vmx_apicv_post_state_restore,
- .check_apicv_inhibit_reasons = vmx_check_apicv_inhibit_reasons,
- .hwapic_irr_update = vmx_hwapic_irr_update,
- .hwapic_isr_update = vmx_hwapic_isr_update,
- .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
- .sync_pir_to_irr = vmx_sync_pir_to_irr,
- .deliver_interrupt = vmx_deliver_interrupt,
- .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
-
- .set_tss_addr = vmx_set_tss_addr,
- .set_identity_map_addr = vmx_set_identity_map_addr,
- .get_mt_mask = vmx_get_mt_mask,
-
- .get_exit_info = vmx_get_exit_info,
-
- .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,
-
- .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
-
- .get_l2_tsc_offset = vmx_get_l2_tsc_offset,
- .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier,
- .write_tsc_offset = vmx_write_tsc_offset,
- .write_tsc_multiplier = vmx_write_tsc_multiplier,
-
- .load_mmu_pgd = vmx_load_mmu_pgd,
-
- .check_intercept = vmx_check_intercept,
- .handle_exit_irqoff = vmx_handle_exit_irqoff,
-
- .request_immediate_exit = vmx_request_immediate_exit,
-
- .sched_in = vmx_sched_in,
-
- .cpu_dirty_log_size = PML_ENTITY_NUM,
- .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
-
- .nested_ops = &vmx_nested_ops,
-
- .pi_update_irte = vmx_pi_update_irte,
- .pi_start_assignment = vmx_pi_start_assignment,
-
- #ifdef CONFIG_X86_64
- .set_hv_timer = vmx_set_hv_timer,
- .cancel_hv_timer = vmx_cancel_hv_timer,
- #endif
-
- .setup_mce = vmx_setup_mce,
-
- .smi_allowed = vmx_smi_allowed,
- .enter_smm = vmx_enter_smm,
- .leave_smm = vmx_leave_smm,
- .enable_smi_window = vmx_enable_smi_window,
-
- .can_emulate_instruction = vmx_can_emulate_instruction,
- .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
- .migrate_timers = vmx_migrate_timers,
-
- .msr_filter_changed = vmx_msr_filter_changed,
- .complete_emulated_msr = kvm_complete_insn_gp,
-
- .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
- };
kvm_arch_init函数调用实现相关的回调函数cpu_has_kvm_support()和disabled_by_bios()。代码片段如下:
- if (!ops->cpu_has_kvm_support()) {
- pr_err_ratelimited("kvm: no hardware support for '%s'\n",
- ops->runtime_ops->name);
- return -EOPNOTSUPP;
- }
- if (ops->disabled_by_bios()) {
- pr_err_ratelimited("kvm: support for '%s' disabled by bios\n",
- ops->runtime_ops->name);
- return -EOPNOTSUPP;
- }
cpu_has_kvm_support()用来检测CPU是否支持VMX模式(对应开启条件1),通过CPUID指令的返回值判断;disabled_by_bios()用来检测VMX功能是否被BIOS关闭(对应开启条件7),通过读取MSR寄存器判断。
kvm_arch_init函数其它的初始化工作包括:
- 分配一个percpu变量user_return_msrs
代码片段如下:
- user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
- if (!user_return_msrs) {
- printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n");
- r = -ENOMEM;
- goto out_free_x86_emulator_cache;
- }
- kvm_nr_uret_msrs = 0;
- 完成内存虚拟化的初始化工作
代码片段如下:
- r = kvm_mmu_vendor_module_init();
- if (r)
- goto out_free_percpu;
- 完成timer的初始化
代码片段如下:
kvm_timer_init();
当然,还有其它一些功能,这里不一一详述。
这里提一下,在更早的内核版本中,kvm_arch_init函数中还有以下函数:
- kvm_set_mmio_spte_mask()
设置MMIO内存的标识符(该标识符通过shadow_mmio_mask表示)。
- kvm_lapic_init()
完成lapic的初始化。
但在6.1.10内核版本中,kvm_arch_init函数里已经不再调用这两个函数了(见上文贴出的源码)。
本回内容较多,暂告一段落,下一回继续解析kvm_arch_init函数。