QEMU源码全解析 —— CPU虚拟化(16)

123 篇文章 36 订阅 ¥49.90 ¥99.00

接前一篇文章:

本文内容参考:

《趣谈Linux操作系统》 —— 刘超,极客时间

《QEMU/KVM源码解析与应用》 —— 李强,机械工业出版社

《深度探索Linux系统虚拟化原理与实现》 —— 王柏生、谢广军,机械工业出版社

特此致谢!

三、KVM模块初始化介绍

2. KVM模块初始化

本回开始对kvm_init函数进行解析。首先贴出Linux 6.1.10内核版本下该函数的源码,位于Linux内核源码根目录下的virt/kvm/kvm_main.c中,代码如下:

  1. int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
  2. struct module *module)
  3. {
  4. struct kvm_cpu_compat_check c;
  5. int r;
  6. int cpu;
  7. r = kvm_arch_init(opaque);
  8. if (r)
  9. goto out_fail;
  10. /*
  11. * kvm_arch_init makes sure there's at most one caller
  12. * for architectures that support multiple implementations,
  13. * like intel and amd on x86.
  14. * kvm_arch_init must be called before kvm_irqfd_init to avoid creating
  15. * conflicts in case kvm is already setup for another implementation.
  16. */
  17. r = kvm_irqfd_init();
  18. if (r)
  19. goto out_irqfd;
  20. if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
  21. r = -ENOMEM;
  22. goto out_free_0;
  23. }
  24. r = kvm_arch_hardware_setup(opaque);
  25. if (r < 0)
  26. goto out_free_1;
  27. c.ret = &r;
  28. c.opaque = opaque;
  29. for_each_online_cpu(cpu) {
  30. smp_call_function_single(cpu, check_processor_compat, &c, 1);
  31. if (r < 0)
  32. goto out_free_2;
  33. }
  34. r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
  35. kvm_starting_cpu, kvm_dying_cpu);
  36. if (r)
  37. goto out_free_2;
  38. register_reboot_notifier(&kvm_reboot_notifier);
  39. /* A kmem cache lets us meet the alignment requirements of fx_save. */
  40. if (!vcpu_align)
  41. vcpu_align = __alignof__(struct kvm_vcpu);
  42. kvm_vcpu_cache =
  43. kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
  44. SLAB_ACCOUNT,
  45. offsetof(struct kvm_vcpu, arch),
  46. offsetofend(struct kvm_vcpu, stats_id)
  47. - offsetof(struct kvm_vcpu, arch),
  48. NULL);
  49. if (!kvm_vcpu_cache) {
  50. r = -ENOMEM;
  51. goto out_free_3;
  52. }
  53. for_each_possible_cpu(cpu) {
  54. if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
  55. GFP_KERNEL, cpu_to_node(cpu))) {
  56. r = -ENOMEM;
  57. goto out_free_4;
  58. }
  59. }
  60. r = kvm_async_pf_init();
  61. if (r)
  62. goto out_free_4;
  63. kvm_chardev_ops.owner = module;
  64. r = misc_register(&kvm_dev);
  65. if (r) {
  66. pr_err("kvm: misc device register failed\n");
  67. goto out_unreg;
  68. }
  69. register_syscore_ops(&kvm_syscore_ops);
  70. kvm_preempt_ops.sched_in = kvm_sched_in;
  71. kvm_preempt_ops.sched_out = kvm_sched_out;
  72. kvm_init_debug();
  73. r = kvm_vfio_ops_init();
  74. WARN_ON(r);
  75. return 0;
  76. out_unreg:
  77. kvm_async_pf_deinit();
  78. out_free_4:
  79. for_each_possible_cpu(cpu)
  80. free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
  81. kmem_cache_destroy(kvm_vcpu_cache);
  82. out_free_3:
  83. unregister_reboot_notifier(&kvm_reboot_notifier);
  84. cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
  85. out_free_2:
  86. kvm_arch_hardware_unsetup();
  87. out_free_1:
  88. free_cpumask_var(cpus_hardware_enabled);
  89. out_free_0:
  90. kvm_irqfd_exit();
  91. out_irqfd:
  92. kvm_arch_exit();
  93. out_fail:
  94. return r;
  95. }
  96. EXPORT_SYMBOL_GPL(kvm_init);

为了便于理解,再次贴出vmx_init()中调用kvm_init函数的代码片段,如下:

  1. r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx),
  2. __alignof__(struct vcpu_vmx), THIS_MODULE);
  3. if (r)
  4. return r;

kvm_init函数总体调用的函数如下图所示:

一个函数一个函数来看。

(1)kvm_arch_init函数

代码片段如下:

  1. r = kvm_arch_init(opaque);
  2. if (r)
  3. goto out_fail;

kvm_arch_init函数用来初始化架构相关的代码。它是一个体系结构相关的函数,在不同的体系结构中均有各自的实现,具体内容因体系结构而异。

这里仍以x86为例。x86体系结构下的kvm_arch_init函数在Linux内核源码根目录/arch/x86/kvm/x86.c中,代码如下:

  1. int kvm_arch_init(void *opaque)
  2. {
  3. struct kvm_x86_init_ops *ops = opaque;
  4. u64 host_pat;
  5. int r;
  6. if (kvm_x86_ops.hardware_enable) {
  7. pr_err("kvm: already loaded vendor module '%s'\n", kvm_x86_ops.name);
  8. return -EEXIST;
  9. }
  10. if (!ops->cpu_has_kvm_support()) {
  11. pr_err_ratelimited("kvm: no hardware support for '%s'\n",
  12. ops->runtime_ops->name);
  13. return -EOPNOTSUPP;
  14. }
  15. if (ops->disabled_by_bios()) {
  16. pr_err_ratelimited("kvm: support for '%s' disabled by bios\n",
  17. ops->runtime_ops->name);
  18. return -EOPNOTSUPP;
  19. }
  20. /*
  21. * KVM explicitly assumes that the guest has an FPU and
  22. * FXSAVE/FXRSTOR. For example, the KVM_GET_FPU explicitly casts the
  23. * vCPU's FPU state as a fxregs_state struct.
  24. */
  25. if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) {
  26. printk(KERN_ERR "kvm: inadequate fpu\n");
  27. return -EOPNOTSUPP;
  28. }
  29. if (IS_ENABLED(CONFIG_PREEMPT_RT) && !boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
  30. pr_err("RT requires X86_FEATURE_CONSTANT_TSC\n");
  31. return -EOPNOTSUPP;
  32. }
  33. /*
  34. * KVM assumes that PAT entry '0' encodes WB memtype and simply zeroes
  35. * the PAT bits in SPTEs. Bail if PAT[0] is programmed to something
  36. * other than WB. Note, EPT doesn't utilize the PAT, but don't bother
  37. * with an exception. PAT[0] is set to WB on RESET and also by the
  38. * kernel, i.e. failure indicates a kernel bug or broken firmware.
  39. */
  40. if (rdmsrl_safe(MSR_IA32_CR_PAT, &host_pat) ||
  41. (host_pat & GENMASK(2, 0)) != 6) {
  42. pr_err("kvm: host PAT[0] is not WB\n");
  43. return -EIO;
  44. }
  45. x86_emulator_cache = kvm_alloc_emulator_cache();
  46. if (!x86_emulator_cache) {
  47. pr_err("kvm: failed to allocate cache for x86 emulator\n");
  48. return -ENOMEM;
  49. }
  50. user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
  51. if (!user_return_msrs) {
  52. printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n");
  53. r = -ENOMEM;
  54. goto out_free_x86_emulator_cache;
  55. }
  56. kvm_nr_uret_msrs = 0;
  57. r = kvm_mmu_vendor_module_init();
  58. if (r)
  59. goto out_free_percpu;
  60. kvm_timer_init();
  61. if (boot_cpu_has(X86_FEATURE_XSAVE)) {
  62. host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
  63. kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
  64. }
  65. if (pi_inject_timer == -1)
  66. pi_inject_timer = housekeeping_enabled(HK_TYPE_TIMER);
  67. #ifdef CONFIG_X86_64
  68. pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
  69. if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
  70. set_hv_tscchange_cb(kvm_hyperv_tsc_notifier);
  71. #endif
  72. return 0;
  73. out_free_percpu:
  74. free_percpu(user_return_msrs);
  75. out_free_x86_emulator_cache:
  76. kmem_cache_destroy(x86_emulator_cache);
  77. return r;
  78. }

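kvm_arch_init函数做了一些初始化的工作,并确保同一时刻只有一个KVM厂商模块(如kvm_intel或kvm_amd)能够加载到内核。KVM实现的结构体会被赋值到全局变量kvm_x86_ops中(如前文所述)。这里传递到kvm_arch_init的参数opaque,在本版本中是&vmx_init_ops(类型为struct kvm_x86_init_ops,即在vmx_x86_ops之上封了一层,意思一样),其runtime_ops成员指向vmx_x86_ops,后者存放了Intel CPU下KVM实现的各类回调函数。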

再来回顾一下vmx_x86_ops的相关代码,它与vmx_init()位于同一文件(arch/x86/kvm/vmx/vmx.c)中,如下:

  1. static struct kvm_x86_ops vmx_x86_ops __initdata = {
  2. .name = "kvm_intel",
  3. .hardware_unsetup = vmx_hardware_unsetup,
  4. .hardware_enable = vmx_hardware_enable,
  5. .hardware_disable = vmx_hardware_disable,
  6. .has_emulated_msr = vmx_has_emulated_msr,
  7. .vm_size = sizeof(struct kvm_vmx),
  8. .vm_init = vmx_vm_init,
  9. .vm_destroy = vmx_vm_destroy,
  10. .vcpu_precreate = vmx_vcpu_precreate,
  11. .vcpu_create = vmx_vcpu_create,
  12. .vcpu_free = vmx_vcpu_free,
  13. .vcpu_reset = vmx_vcpu_reset,
  14. .prepare_switch_to_guest = vmx_prepare_switch_to_guest,
  15. .vcpu_load = vmx_vcpu_load,
  16. .vcpu_put = vmx_vcpu_put,
  17. .update_exception_bitmap = vmx_update_exception_bitmap,
  18. .get_msr_feature = vmx_get_msr_feature,
  19. .get_msr = vmx_get_msr,
  20. .set_msr = vmx_set_msr,
  21. .get_segment_base = vmx_get_segment_base,
  22. .get_segment = vmx_get_segment,
  23. .set_segment = vmx_set_segment,
  24. .get_cpl = vmx_get_cpl,
  25. .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
  26. .set_cr0 = vmx_set_cr0,
  27. .is_valid_cr4 = vmx_is_valid_cr4,
  28. .set_cr4 = vmx_set_cr4,
  29. .set_efer = vmx_set_efer,
  30. .get_idt = vmx_get_idt,
  31. .set_idt = vmx_set_idt,
  32. .get_gdt = vmx_get_gdt,
  33. .set_gdt = vmx_set_gdt,
  34. .set_dr7 = vmx_set_dr7,
  35. .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
  36. .cache_reg = vmx_cache_reg,
  37. .get_rflags = vmx_get_rflags,
  38. .set_rflags = vmx_set_rflags,
  39. .get_if_flag = vmx_get_if_flag,
  40. .flush_tlb_all = vmx_flush_tlb_all,
  41. .flush_tlb_current = vmx_flush_tlb_current,
  42. .flush_tlb_gva = vmx_flush_tlb_gva,
  43. .flush_tlb_guest = vmx_flush_tlb_guest,
  44. .vcpu_pre_run = vmx_vcpu_pre_run,
  45. .vcpu_run = vmx_vcpu_run,
  46. .handle_exit = vmx_handle_exit,
  47. .skip_emulated_instruction = vmx_skip_emulated_instruction,
  48. .update_emulated_instruction = vmx_update_emulated_instruction,
  49. .set_interrupt_shadow = vmx_set_interrupt_shadow,
  50. .get_interrupt_shadow = vmx_get_interrupt_shadow,
  51. .patch_hypercall = vmx_patch_hypercall,
  52. .inject_irq = vmx_inject_irq,
  53. .inject_nmi = vmx_inject_nmi,
  54. .inject_exception = vmx_inject_exception,
  55. .cancel_injection = vmx_cancel_injection,
  56. .interrupt_allowed = vmx_interrupt_allowed,
  57. .nmi_allowed = vmx_nmi_allowed,
  58. .get_nmi_mask = vmx_get_nmi_mask,
  59. .set_nmi_mask = vmx_set_nmi_mask,
  60. .enable_nmi_window = vmx_enable_nmi_window,
  61. .enable_irq_window = vmx_enable_irq_window,
  62. .update_cr8_intercept = vmx_update_cr8_intercept,
  63. .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
  64. .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
  65. .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
  66. .load_eoi_exitmap = vmx_load_eoi_exitmap,
  67. .apicv_post_state_restore = vmx_apicv_post_state_restore,
  68. .check_apicv_inhibit_reasons = vmx_check_apicv_inhibit_reasons,
  69. .hwapic_irr_update = vmx_hwapic_irr_update,
  70. .hwapic_isr_update = vmx_hwapic_isr_update,
  71. .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
  72. .sync_pir_to_irr = vmx_sync_pir_to_irr,
  73. .deliver_interrupt = vmx_deliver_interrupt,
  74. .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
  75. .set_tss_addr = vmx_set_tss_addr,
  76. .set_identity_map_addr = vmx_set_identity_map_addr,
  77. .get_mt_mask = vmx_get_mt_mask,
  78. .get_exit_info = vmx_get_exit_info,
  79. .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,
  80. .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
  81. .get_l2_tsc_offset = vmx_get_l2_tsc_offset,
  82. .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier,
  83. .write_tsc_offset = vmx_write_tsc_offset,
  84. .write_tsc_multiplier = vmx_write_tsc_multiplier,
  85. .load_mmu_pgd = vmx_load_mmu_pgd,
  86. .check_intercept = vmx_check_intercept,
  87. .handle_exit_irqoff = vmx_handle_exit_irqoff,
  88. .request_immediate_exit = vmx_request_immediate_exit,
  89. .sched_in = vmx_sched_in,
  90. .cpu_dirty_log_size = PML_ENTITY_NUM,
  91. .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
  92. .nested_ops = &vmx_nested_ops,
  93. .pi_update_irte = vmx_pi_update_irte,
  94. .pi_start_assignment = vmx_pi_start_assignment,
  95. #ifdef CONFIG_X86_64
  96. .set_hv_timer = vmx_set_hv_timer,
  97. .cancel_hv_timer = vmx_cancel_hv_timer,
  98. #endif
  99. .setup_mce = vmx_setup_mce,
  100. .smi_allowed = vmx_smi_allowed,
  101. .enter_smm = vmx_enter_smm,
  102. .leave_smm = vmx_leave_smm,
  103. .enable_smi_window = vmx_enable_smi_window,
  104. .can_emulate_instruction = vmx_can_emulate_instruction,
  105. .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
  106. .migrate_timers = vmx_migrate_timers,
  107. .msr_filter_changed = vmx_msr_filter_changed,
  108. .complete_emulated_msr = kvm_complete_insn_gp,
  109. .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
  110. };

kvm_arch_init函数调用与具体实现相关的回调函数cpu_has_kvm_support()和disabled_by_bios()。代码片段如下:

  1. if (!ops->cpu_has_kvm_support()) {
  2. pr_err_ratelimited("kvm: no hardware support for '%s'\n",
  3. ops->runtime_ops->name);
  4. return -EOPNOTSUPP;
  5. }
  6. if (ops->disabled_by_bios()) {
  7. pr_err_ratelimited("kvm: support for '%s' disabled by bios\n",
  8. ops->runtime_ops->name);
  9. return -EOPNOTSUPP;
  10. }

cpu_has_kvm_support()用来检测CPU是否支持VMX模式(对应开启条件1),通过CPUID指令的返回值判断;disabled_by_bios()用来检测VMX是否被BIOS关闭(对应开启条件7),通过读取MSR寄存器判断。

kvm_arch_init函数其它的初始化工作包括:

  • 分配一个percpu变量user_return_msrs

代码片段如下:

  1. user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
  2. if (!user_return_msrs) {
  3. printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n");
  4. r = -ENOMEM;
  5. goto out_free_x86_emulator_cache;
  6. }
  7. kvm_nr_uret_msrs = 0;
  • 完成内存虚拟化的初始化工作

代码片段如下:

  1. r = kvm_mmu_vendor_module_init();
  2. if (r)
  3. goto out_free_percpu;
  • 完成timer的初始化

代码片段如下:

    kvm_timer_init();

当然,还有其它一些功能,这里不一一详述。

这里提一下,在更早的内核版本中,kvm_arch_init函数中还有以下函数:

  • kvm_set_mmio_spte_mask()

设置MMIO内存的标识符(该标识符通过shadow_mmio_mask表示)。

  • kvm_lapic_init()

完成lapic的初始化。

但在6.1.10内核版本的kvm_arch_init函数中,已经看不到对这两个函数的调用了。

本回内容较多,暂告一段落,下一回继续解析kvm_arch_init函数。

举报

选择你想要举报的内容(必选)
  • 内容涉黄
  • 政治相关
  • 内容抄袭
  • 涉嫌广告
  • 内容侵权
  • 侮辱谩骂
  • 样式问题
  • 其他