QEMU源码全解析 —— CPU虚拟化(15)

123 篇文章 36 订阅 ¥49.90 ¥99.00

接前一篇文章:

本文内容参考:

《趣谈Linux操作系统》 —— 刘超, 极客时间

《QEMU/KVM源码解析与应用》 —— 李强,机械工业出版社

《深度探索Linux系统虚拟化原理与实现》—— 王柏生 谢广军, 机械工业出版社

特此致谢!

三、KVM模块初始化介绍

2. KVM模块初始化

上一回讲到为了更好、更方便地理解KVM模块的初始化部分(先有一、后有二),笔者找了一个较早版本的Linux内核linux-6.1.10。再来看一下其arch/x86/kvm/vmx/vmx.c中的vmx_init函数,代码如下:

  1. static int __init vmx_init(void)
  2. {
  3. int r, cpu;
  4. #if IS_ENABLED(CONFIG_HYPERV)
  5. /*
  6. * Enlightened VMCS usage should be recommended and the host needs
  7. * to support eVMCS v1 or above. We can also disable eVMCS support
  8. * with module parameter.
  9. */
  10. if (enlightened_vmcs &&
  11. ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
  12. (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
  13. KVM_EVMCS_VERSION) {
  14. /* Check that we have assist pages on all online CPUs */
  15. for_each_online_cpu(cpu) {
  16. if (!hv_get_vp_assist_page(cpu)) {
  17. enlightened_vmcs = false;
  18. break;
  19. }
  20. }
  21. if (enlightened_vmcs) {
  22. pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
  23. static_branch_enable(&enable_evmcs);
  24. }
  25. if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
  26. vmx_x86_ops.enable_direct_tlbflush
  27. = hv_enable_direct_tlbflush;
  28. } else {
  29. enlightened_vmcs = false;
  30. }
  31. #endif
  32. r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx),
  33. __alignof__(struct vcpu_vmx), THIS_MODULE);
  34. if (r)
  35. return r;
  36. /*
  37. * Must be called after kvm_init() so enable_ept is properly set
  38. * up. Hand the parameter mitigation value in which was stored in
  39. * the pre module init parser. If no parameter was given, it will
  40. * contain 'auto' which will be turned into the default 'cond'
  41. * mitigation mode.
  42. */
  43. r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
  44. if (r) {
  45. vmx_exit();
  46. return r;
  47. }
  48. vmx_setup_fb_clear_ctrl();
  49. for_each_possible_cpu(cpu) {
  50. INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
  51. pi_init_cpu(cpu);
  52. }
  53. #ifdef CONFIG_KEXEC_CORE
  54. rcu_assign_pointer(crash_vmclear_loaded_vmcss,
  55. crash_vmclear_local_loaded_vmcss);
  56. #endif
  57. vmx_check_vmcs12_offsets();
  58. /*
  59. * Shadow paging doesn't have a (further) performance penalty
  60. * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it
  61. * by default
  62. */
  63. if (!enable_ept)
  64. allow_smaller_maxphyaddr = true;
  65. return 0;
  66. }
  67. module_init(vmx_init);

vmx_init_ops的定义和初始化在同文件(Linux内核/arch/x86/kvm/vmx/vmx.c)中,代码如下:

  1. static struct kvm_x86_init_ops vmx_init_ops __initdata = {
  2. .cpu_has_kvm_support = cpu_has_kvm_support,
  3. .disabled_by_bios = vmx_disabled_by_bios,
  4. .check_processor_compatibility = vmx_check_processor_compat,
  5. .hardware_setup = hardware_setup,
  6. .handle_intel_pt_intr = NULL,
  7. .runtime_ops = &vmx_x86_ops,
  8. .pmu_ops = &intel_pmu_ops,
  9. };

前边反复提到,KVM的所有虚拟化实现(Intel和AMD)都会向KVM模块注册一个kvm_x86_ops(这个内核版本中已经改为了struct kvm_x86_init_ops)结构体(实例)。这样,KVM中的一些函数就是一个外壳,首先会调用kvm_arch_xxx函数,表示的是调用CPU架构相关的回调函数。如果kvm_arch_xxx函数需要调用到实现相关的代码,则会调用kvm_x86_ops结构中的相关回调函数。

vmx_init()中调用kvm_init函数的代码片段如下:

  1. r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx),
  2. __alignof__(struct vcpu_vmx), THIS_MODULE);
  3. if (r)
  4. return r;

其实这个版本的Linux内核代码也比较新了,也不是早先第一个参数直接传递&vmx_x86_ops的方式,而是又包了一层,将vmx_x86_ops封到了struct kvm_x86_init_ops vmx_init_ops中(参见上边代码)。

vmx_x86_ops的定义和初始化在同文件(arch/x86/kvm/vmx/vmx.c)中,如下:

  1. static struct kvm_x86_ops vmx_x86_ops __initdata = {
  2. .name = "kvm_intel",
  3. .hardware_unsetup = vmx_hardware_unsetup,
  4. .hardware_enable = vmx_hardware_enable,
  5. .hardware_disable = vmx_hardware_disable,
  6. .has_emulated_msr = vmx_has_emulated_msr,
  7. .vm_size = sizeof(struct kvm_vmx),
  8. .vm_init = vmx_vm_init,
  9. .vm_destroy = vmx_vm_destroy,
  10. .vcpu_precreate = vmx_vcpu_precreate,
  11. .vcpu_create = vmx_vcpu_create,
  12. .vcpu_free = vmx_vcpu_free,
  13. .vcpu_reset = vmx_vcpu_reset,
  14. .prepare_switch_to_guest = vmx_prepare_switch_to_guest,
  15. .vcpu_load = vmx_vcpu_load,
  16. .vcpu_put = vmx_vcpu_put,
  17. .update_exception_bitmap = vmx_update_exception_bitmap,
  18. .get_msr_feature = vmx_get_msr_feature,
  19. .get_msr = vmx_get_msr,
  20. .set_msr = vmx_set_msr,
  21. .get_segment_base = vmx_get_segment_base,
  22. .get_segment = vmx_get_segment,
  23. .set_segment = vmx_set_segment,
  24. .get_cpl = vmx_get_cpl,
  25. .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
  26. .set_cr0 = vmx_set_cr0,
  27. .is_valid_cr4 = vmx_is_valid_cr4,
  28. .set_cr4 = vmx_set_cr4,
  29. .set_efer = vmx_set_efer,
  30. .get_idt = vmx_get_idt,
  31. .set_idt = vmx_set_idt,
  32. .get_gdt = vmx_get_gdt,
  33. .set_gdt = vmx_set_gdt,
  34. .set_dr7 = vmx_set_dr7,
  35. .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
  36. .cache_reg = vmx_cache_reg,
  37. .get_rflags = vmx_get_rflags,
  38. .set_rflags = vmx_set_rflags,
  39. .get_if_flag = vmx_get_if_flag,
  40. .flush_tlb_all = vmx_flush_tlb_all,
  41. .flush_tlb_current = vmx_flush_tlb_current,
  42. .flush_tlb_gva = vmx_flush_tlb_gva,
  43. .flush_tlb_guest = vmx_flush_tlb_guest,
  44. .vcpu_pre_run = vmx_vcpu_pre_run,
  45. .vcpu_run = vmx_vcpu_run,
  46. .handle_exit = vmx_handle_exit,
  47. .skip_emulated_instruction = vmx_skip_emulated_instruction,
  48. .update_emulated_instruction = vmx_update_emulated_instruction,
  49. .set_interrupt_shadow = vmx_set_interrupt_shadow,
  50. .get_interrupt_shadow = vmx_get_interrupt_shadow,
  51. .patch_hypercall = vmx_patch_hypercall,
  52. .inject_irq = vmx_inject_irq,
  53. .inject_nmi = vmx_inject_nmi,
  54. .inject_exception = vmx_inject_exception,
  55. .cancel_injection = vmx_cancel_injection,
  56. .interrupt_allowed = vmx_interrupt_allowed,
  57. .nmi_allowed = vmx_nmi_allowed,
  58. .get_nmi_mask = vmx_get_nmi_mask,
  59. .set_nmi_mask = vmx_set_nmi_mask,
  60. .enable_nmi_window = vmx_enable_nmi_window,
  61. .enable_irq_window = vmx_enable_irq_window,
  62. .update_cr8_intercept = vmx_update_cr8_intercept,
  63. .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
  64. .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
  65. .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
  66. .load_eoi_exitmap = vmx_load_eoi_exitmap,
  67. .apicv_post_state_restore = vmx_apicv_post_state_restore,
  68. .check_apicv_inhibit_reasons = vmx_check_apicv_inhibit_reasons,
  69. .hwapic_irr_update = vmx_hwapic_irr_update,
  70. .hwapic_isr_update = vmx_hwapic_isr_update,
  71. .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
  72. .sync_pir_to_irr = vmx_sync_pir_to_irr,
  73. .deliver_interrupt = vmx_deliver_interrupt,
  74. .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
  75. .set_tss_addr = vmx_set_tss_addr,
  76. .set_identity_map_addr = vmx_set_identity_map_addr,
  77. .get_mt_mask = vmx_get_mt_mask,
  78. .get_exit_info = vmx_get_exit_info,
  79. .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,
  80. .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
  81. .get_l2_tsc_offset = vmx_get_l2_tsc_offset,
  82. .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier,
  83. .write_tsc_offset = vmx_write_tsc_offset,
  84. .write_tsc_multiplier = vmx_write_tsc_multiplier,
  85. .load_mmu_pgd = vmx_load_mmu_pgd,
  86. .check_intercept = vmx_check_intercept,
  87. .handle_exit_irqoff = vmx_handle_exit_irqoff,
  88. .request_immediate_exit = vmx_request_immediate_exit,
  89. .sched_in = vmx_sched_in,
  90. .cpu_dirty_log_size = PML_ENTITY_NUM,
  91. .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
  92. .nested_ops = &vmx_nested_ops,
  93. .pi_update_irte = vmx_pi_update_irte,
  94. .pi_start_assignment = vmx_pi_start_assignment,
  95. #ifdef CONFIG_X86_64
  96. .set_hv_timer = vmx_set_hv_timer,
  97. .cancel_hv_timer = vmx_cancel_hv_timer,
  98. #endif
  99. .setup_mce = vmx_setup_mce,
  100. .smi_allowed = vmx_smi_allowed,
  101. .enter_smm = vmx_enter_smm,
  102. .leave_smm = vmx_leave_smm,
  103. .enable_smi_window = vmx_enable_smi_window,
  104. .can_emulate_instruction = vmx_can_emulate_instruction,
  105. .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
  106. .migrate_timers = vmx_migrate_timers,
  107. .msr_filter_changed = vmx_msr_filter_changed,
  108. .complete_emulated_msr = kvm_complete_insn_gp,
  109. .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
  110. };

struct kvm_x86_ops vmx_x86_ops是一个非常大的结构,包括具体硬件检测、虚拟机创建VCPU的实现、一些寄存器的设置、虚拟机退出的处理函数等。

kvm_init函数的第2个参数为sizeof(struct vcpu_vmx),表示VMX实现的VCPU结构体的大小。struct vcpu_vmx的定义在Linux内核源码/arch/x86/kvm/vmx/vmx.h中,如下:

  1. struct vcpu_vmx {
  2. struct kvm_vcpu vcpu;
  3. u8 fail;
  4. u8 x2apic_msr_bitmap_mode;
  5. /*
  6. * If true, host state has been stored in vmx->loaded_vmcs for
  7. * the CPU registers that only need to be switched when transitioning
  8. * to/from the kernel, and the registers have been loaded with guest
  9. * values. If false, host state is loaded in the CPU registers
  10. * and vmx->loaded_vmcs->host_state is invalid.
  11. */
  12. bool guest_state_loaded;
  13. unsigned long exit_qualification;
  14. u32 exit_intr_info;
  15. u32 idt_vectoring_info;
  16. ulong rflags;
  17. /*
  18. * User return MSRs are always emulated when enabled in the guest, but
  19. * only loaded into hardware when necessary, e.g. SYSCALL #UDs outside
  20. * of 64-bit mode or if EFER.SCE=1, thus the SYSCALL MSRs don't need to
  21. * be loaded into hardware if those conditions aren't met.
  22. */
  23. struct vmx_uret_msr guest_uret_msrs[MAX_NR_USER_RETURN_MSRS];
  24. bool guest_uret_msrs_loaded;
  25. #ifdef CONFIG_X86_64
  26. u64 msr_host_kernel_gs_base;
  27. u64 msr_guest_kernel_gs_base;
  28. #endif
  29. u64 spec_ctrl;
  30. u32 msr_ia32_umwait_control;
  31. /*
  32. * loaded_vmcs points to the VMCS currently used in this vcpu. For a
  33. * non-nested (L1) guest, it always points to vmcs01. For a nested
  34. * guest (L2), it points to a different VMCS.
  35. */
  36. struct loaded_vmcs vmcs01;
  37. struct loaded_vmcs *loaded_vmcs;
  38. struct msr_autoload {
  39. struct vmx_msrs guest;
  40. struct vmx_msrs host;
  41. } msr_autoload;
  42. struct msr_autostore {
  43. struct vmx_msrs guest;
  44. } msr_autostore;
  45. struct {
  46. int vm86_active;
  47. ulong save_rflags;
  48. struct kvm_segment segs[8];
  49. } rmode;
  50. struct {
  51. u32 bitmask; /* 4 bits per segment (1 bit per field) */
  52. struct kvm_save_segment {
  53. u16 selector;
  54. unsigned long base;
  55. u32 limit;
  56. u32 ar;
  57. } seg[8];
  58. } segment_cache;
  59. int vpid;
  60. bool emulation_required;
  61. union vmx_exit_reason exit_reason;
  62. /* Posted interrupt descriptor */
  63. struct pi_desc pi_desc;
  64. /* Used if this vCPU is waiting for PI notification wakeup. */
  65. struct list_head pi_wakeup_list;
  66. /* Support for a guest hypervisor (nested VMX) */
  67. struct nested_vmx nested;
  68. /* Dynamic PLE window. */
  69. unsigned int ple_window;
  70. bool ple_window_dirty;
  71. bool req_immediate_exit;
  72. /* Support for PML */
  73. #define PML_ENTITY_NUM 512
  74. struct page *pml_pg;
  75. /* apic deadline value in host tsc */
  76. u64 hv_deadline_tsc;
  77. unsigned long host_debugctlmsr;
  78. /*
  79. * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
  80. * msr_ia32_feature_control. FEAT_CTL_LOCKED is always included
  81. * in msr_ia32_feature_control_valid_bits.
  82. */
  83. u64 msr_ia32_feature_control;
  84. u64 msr_ia32_feature_control_valid_bits;
  85. /* SGX Launch Control public key hash */
  86. u64 msr_ia32_sgxlepubkeyhash[4];
  87. u64 msr_ia32_mcu_opt_ctrl;
  88. bool disable_fb_clear;
  89. struct pt_desc pt_desc;
  90. struct lbr_desc lbr_desc;
  91. /* Save desired MSR intercept (read: pass-through) state */
  92. #define MAX_POSSIBLE_PASSTHROUGH_MSRS 15
  93. struct {
  94. DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
  95. DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
  96. } shadow_msr_intercept;
  97. };

对于kvm_init函数的讲解,请看下回。

举报

选择你想要举报的内容(必选)
  • 内容涉黄
  • 政治相关
  • 内容抄袭
  • 涉嫌广告
  • 内容侵权
  • 侮辱谩骂
  • 样式问题
  • 其他