接前一篇文章:
本文内容参考:
《 QEMU /KVM》 源码 解析与应用 —— 李强,机械工业出版社
《深度探索 Linux 系统 虚拟化 原理与实现》—— 王柏生 谢广军, 机械工业出版社
特此致谢!
三、KVM模块初始化介绍
2. KVM模块初始化
上一回讲到为了更好、更方便地理解KVM模块的初始化部分(先有一、后有二),笔者找了一个较早版本的Linux内核linux-6.1.10。再来看一下其arch/86/kvm/vmx/vmx.c中的vmx_init函数,代码如下:
- static int __init vmx_init(void)
- {
- int r, cpu;
-
- #if IS_ENABLED(CONFIG_HYPERV)
- /*
- * Enlightened VMCS usage should be recommended and the host needs
- * to support eVMCS v1 or above. We can also disable eVMCS support
- * with module parameter.
- */
- if (enlightened_vmcs &&
- ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
- (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
- KVM_EVMCS_VERSION) {
-
- /* Check that we have assist pages on all online CPUs */
- for_each_online_cpu(cpu) {
- if (!hv_get_vp_assist_page(cpu)) {
- enlightened_vmcs = false;
- break;
- }
- }
-
- if (enlightened_vmcs) {
- pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
- static_branch_enable(&enable_evmcs);
- }
-
- if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
- vmx_x86_ops.enable_direct_tlbflush
- = hv_enable_direct_tlbflush;
-
- } else {
- enlightened_vmcs = false;
- }
- #endif
-
- r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx),
- __alignof__(struct vcpu_vmx), THIS_MODULE);
- if (r)
- return r;
-
- /*
- * Must be called after kvm_init() so enable_ept is properly set
- * up. Hand the parameter mitigation value in which was stored in
- * the pre module init parser. If no parameter was given, it will
- * contain 'auto' which will be turned into the default 'cond'
- * mitigation mode.
- */
- r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
- if (r) {
- vmx_exit();
- return r;
- }
-
- vmx_setup_fb_clear_ctrl();
-
- for_each_possible_cpu(cpu) {
- INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
-
- pi_init_cpu(cpu);
- }
-
- #ifdef CONFIG_KEXEC_CORE
- rcu_assign_pointer(crash_vmclear_loaded_vmcss,
- crash_vmclear_local_loaded_vmcss);
- #endif
- vmx_check_vmcs12_offsets();
-
- /*
- * Shadow paging doesn't have a (further) performance penalty
- * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it
- * by default
- */
- if (!enable_ept)
- allow_smaller_maxphyaddr = true;
-
- return 0;
- }
- module_init(vmx_init);
vmx_init_ops的定义和初始化在同文件(Linux内核/arch/x86/kvm/vmx/vmx.c)中,代码如下:
- static struct kvm_x86_init_ops vmx_init_ops __initdata = {
- .cpu_has_kvm_support = cpu_has_kvm_support,
- .disabled_by_bios = vmx_disabled_by_bios,
- .check_processor_compatibility = vmx_check_processor_compat,
- .hardware_setup = hardware_setup,
- .handle_intel_pt_intr = NULL,
-
- .runtime_ops = &vmx_x86_ops,
- .pmu_ops = &intel_pmu_ops,
- };
前边反复提到,KVM的所有虚拟化实现(Intel和AMD)都会向KVM模块注册一个kvm_x86_ops(这个内核版本中已经改为了struct kvm_x86_init_ops)结构体(实例)。这样,KVM中的一些函数就是一个外壳,首先会调用kvm_arch_xxx函数,表示的是调用CPU架构相关的回调函数。如果kvm_arch_xxx函数需要调用到实现相关的代码,则会调用kvm_x86_ops结构中的相关回调函数。
VMX_init()中调用kvm_init函数的代码片段如下:
- r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx),
- __alignof__(struct vcpu_vmx), THIS_MODULE);
- if (r)
- return r;
其实这个版本的Linux内核代码也比较新了,也不是早先第一个参数直接传递&vmx_x86_ops的方式,而是又包了一层,将vmx_x86_ops封到了struct kvm_x86_init_ops vmx_init_ops中(参见上边代码)。
vmx_x86_ops的定义和初始化在同文件(arch/x86/kvm/vmx/vmx.c)中,如下:
- static struct kvm_x86_ops vmx_x86_ops __initdata = {
- .name = "kvm_intel",
-
- .hardware_unsetup = vmx_hardware_unsetup,
-
- .hardware_enable = vmx_hardware_enable,
- .hardware_disable = vmx_hardware_disable,
- .has_emulated_msr = vmx_has_emulated_msr,
-
- .vm_size = sizeof(struct kvm_vmx),
- .vm_init = vmx_vm_init,
- .vm_destroy = vmx_vm_destroy,
-
- .vcpu_precreate = vmx_vcpu_precreate,
- .vcpu_create = vmx_vcpu_create,
- .vcpu_free = vmx_vcpu_free,
- .vcpu_reset = vmx_vcpu_reset,
-
- .prepare_switch_to_guest = vmx_prepare_switch_to_guest,
- .vcpu_load = vmx_vcpu_load,
- .vcpu_put = vmx_vcpu_put,
-
- .update_exception_bitmap = vmx_update_exception_bitmap,
- .get_msr_feature = vmx_get_msr_feature,
- .get_msr = vmx_get_msr,
- .set_msr = vmx_set_msr,
- .get_segment_base = vmx_get_segment_base,
- .get_segment = vmx_get_segment,
- .set_segment = vmx_set_segment,
- .get_cpl = vmx_get_cpl,
- .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
- .set_cr0 = vmx_set_cr0,
- .is_valid_cr4 = vmx_is_valid_cr4,
- .set_cr4 = vmx_set_cr4,
- .set_efer = vmx_set_efer,
- .get_idt = vmx_get_idt,
- .set_idt = vmx_set_idt,
- .get_gdt = vmx_get_gdt,
- .set_gdt = vmx_set_gdt,
- .set_dr7 = vmx_set_dr7,
- .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
- .cache_reg = vmx_cache_reg,
- .get_rflags = vmx_get_rflags,
- .set_rflags = vmx_set_rflags,
- .get_if_flag = vmx_get_if_flag,
-
- .flush_tlb_all = vmx_flush_tlb_all,
- .flush_tlb_current = vmx_flush_tlb_current,
- .flush_tlb_gva = vmx_flush_tlb_gva,
- .flush_tlb_guest = vmx_flush_tlb_guest,
-
- .vcpu_pre_run = vmx_vcpu_pre_run,
- .vcpu_run = vmx_vcpu_run,
- .handle_exit = vmx_handle_exit,
- .skip_emulated_instruction = vmx_skip_emulated_instruction,
- .update_emulated_instruction = vmx_update_emulated_instruction,
- .set_interrupt_shadow = vmx_set_interrupt_shadow,
- .get_interrupt_shadow = vmx_get_interrupt_shadow,
- .patch_hypercall = vmx_patch_hypercall,
- .inject_irq = vmx_inject_irq,
- .inject_nmi = vmx_inject_nmi,
- .inject_exception = vmx_inject_exception,
- .cancel_injection = vmx_cancel_injection,
- .interrupt_allowed = vmx_interrupt_allowed,
- .nmi_allowed = vmx_nmi_allowed,
- .get_nmi_mask = vmx_get_nmi_mask,
- .set_nmi_mask = vmx_set_nmi_mask,
- .enable_nmi_window = vmx_enable_nmi_window,
- .enable_irq_window = vmx_enable_irq_window,
- .update_cr8_intercept = vmx_update_cr8_intercept,
- .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
- .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
- .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
- .load_eoi_exitmap = vmx_load_eoi_exitmap,
- .apicv_post_state_restore = vmx_apicv_post_state_restore,
- .check_apicv_inhibit_reasons = vmx_check_apicv_inhibit_reasons,
- .hwapic_irr_update = vmx_hwapic_irr_update,
- .hwapic_isr_update = vmx_hwapic_isr_update,
- .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
- .sync_pir_to_irr = vmx_sync_pir_to_irr,
- .deliver_interrupt = vmx_deliver_interrupt,
- .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
-
- .set_tss_addr = vmx_set_tss_addr,
- .set_identity_map_addr = vmx_set_identity_map_addr,
- .get_mt_mask = vmx_get_mt_mask,
-
- .get_exit_info = vmx_get_exit_info,
-
- .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,
-
- .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
-
- .get_l2_tsc_offset = vmx_get_l2_tsc_offset,
- .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier,
- .write_tsc_offset = vmx_write_tsc_offset,
- .write_tsc_multiplier = vmx_write_tsc_multiplier,
-
- .load_mmu_pgd = vmx_load_mmu_pgd,
-
- .check_intercept = vmx_check_intercept,
- .handle_exit_irqoff = vmx_handle_exit_irqoff,
-
- .request_immediate_exit = vmx_request_immediate_exit,
-
- .sched_in = vmx_sched_in,
-
- .cpu_dirty_log_size = PML_ENTITY_NUM,
- .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
-
- .nested_ops = &vmx_nested_ops,
-
- .pi_update_irte = vmx_pi_update_irte,
- .pi_start_assignment = vmx_pi_start_assignment,
-
- #ifdef CONFIG_X86_64
- .set_hv_timer = vmx_set_hv_timer,
- .cancel_hv_timer = vmx_cancel_hv_timer,
- #endif
-
- .setup_mce = vmx_setup_mce,
-
- .smi_allowed = vmx_smi_allowed,
- .enter_smm = vmx_enter_smm,
- .leave_smm = vmx_leave_smm,
- .enable_smi_window = vmx_enable_smi_window,
-
- .can_emulate_instruction = vmx_can_emulate_instruction,
- .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
- .migrate_timers = vmx_migrate_timers,
-
- .msr_filter_changed = vmx_msr_filter_changed,
- .complete_emulated_msr = kvm_complete_insn_gp,
-
- .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
- };
struct kvm_x86_ops vmx_x86_ops是一个非常大的结构,包括具体硬件检测、 虚拟机 创建VCPU的实现、一些寄存器的设置、虚拟机退出的处理函数等。
kvm_init函数的第2个参数为sizeof(struct vcpu_vmx),表示VMX实现的VCPU结构体的大小。struct vcpu_vmx的定义在Linux内核源码/arch/x86/kvm/vmx/vmx.h中,如下:
- struct vcpu_vmx {
- struct kvm_vcpu vcpu;
- u8 fail;
- u8 x2apic_msr_bitmap_mode;
-
- /*
- * If true, host state has been stored in vmx->loaded_vmcs for
- * the CPU registers that only need to be switched when transitioning
- * to/from the kernel, and the registers have been loaded with guest
- * values. If false, host state is loaded in the CPU registers
- * and vmx->loaded_vmcs->host_state is invalid.
- */
- bool guest_state_loaded;
-
- unsigned long exit_qualification;
- u32 exit_intr_info;
- u32 idt_vectoring_info;
- ulong rflags;
-
- /*
- * User return MSRs are always emulated when enabled in the guest, but
- * only loaded into hardware when necessary, e.g. SYSCALL #UDs outside
- * of 64-bit mode or if EFER.SCE=1, thus the SYSCALL MSRs don't need to
- * be loaded into hardware if those conditions aren't met.
- */
- struct vmx_uret_msr guest_uret_msrs[MAX_NR_USER_RETURN_MSRS];
- bool guest_uret_msrs_loaded;
- #ifdef CONFIG_X86_64
- u64 msr_host_kernel_gs_base;
- u64 msr_guest_kernel_gs_base;
- #endif
-
- u64 spec_ctrl;
- u32 msr_ia32_umwait_control;
-
- /*
- * loaded_vmcs points to the VMCS currently used in this vcpu. For a
- * non-nested (L1) guest, it always points to vmcs01. For a nested
- * guest (L2), it points to a different VMCS.
- */
- struct loaded_vmcs vmcs01;
- struct loaded_vmcs *loaded_vmcs;
-
- struct msr_autoload {
- struct vmx_msrs guest;
- struct vmx_msrs host;
- } msr_autoload;
-
- struct msr_autostore {
- struct vmx_msrs guest;
- } msr_autostore;
-
- struct {
- int vm86_active;
- ulong save_rflags;
- struct kvm_segment segs[8];
- } rmode;
- struct {
- u32 bitmask; /* 4 bits per segment (1 bit per field) */
- struct kvm_save_segment {
- u16 selector;
- unsigned long base;
- u32 limit;
- u32 ar;
- } seg[8];
- } segment_cache;
- int vpid;
- bool emulation_required;
-
- union vmx_exit_reason exit_reason;
-
- /* Posted interrupt descriptor */
- struct pi_desc pi_desc;
-
- /* Used if this vCPU is waiting for PI notification wakeup. */
- struct list_head pi_wakeup_list;
-
- /* Support for a guest hypervisor (nested VMX) */
- struct nested_vmx nested;
-
- /* Dynamic PLE window. */
- unsigned int ple_window;
- bool ple_window_dirty;
-
- bool req_immediate_exit;
-
- /* Support for PML */
- #define PML_ENTITY_NUM 512
- struct page *pml_pg;
-
- /* apic deadline value in host tsc */
- u64 hv_deadline_tsc;
-
- unsigned long host_debugctlmsr;
-
- /*
- * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
- * msr_ia32_feature_control. FEAT_CTL_LOCKED is always included
- * in msr_ia32_feature_control_valid_bits.
- */
- u64 msr_ia32_feature_control;
- u64 msr_ia32_feature_control_valid_bits;
- /* SGX Launch Control public key hash */
- u64 msr_ia32_sgxlepubkeyhash[4];
- u64 msr_ia32_mcu_opt_ctrl;
- bool disable_fb_clear;
-
- struct pt_desc pt_desc;
- struct lbr_desc lbr_desc;
-
- /* Save desired MSR intercept (read: pass-through) state */
- #define MAX_POSSIBLE_PASSTHROUGH_MSRS 15
- struct {
- DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
- DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
- } shadow_msr_intercept;
- };
对于kvm_init函数的讲解,请看下回。