As with the mode switch discussed above, where a system call from user mode to kernel mode uses the stack to save and restore the current state, VMX provides the VMCS (Virtual-Machine Control Structure) to hold state across host/guest context switches, and VT-x provides VM-Entry and VM-Exit as the two transitions between the modes. On a VM-Exit, the hardware automatically saves the current context into the guest-state area of the VMCS and loads the host-state area of the VMCS into the CPU; on a VM-Entry, the CPU automatically loads the guest-state area of the VMCS into the CPU (note that the host state is not saved at this point, since it is the same every time). The state switch is thus carried out entirely by hardware.
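To make the VM-Entry/VM-Exit cycle concrete, here is a minimal userspace sketch built on the standard KVM API: every ioctl(KVM_RUN) asks the kernel to VM-Enter the guest, and the call returns once a VM-Exit occurs that KVM cannot handle internally. The run_loop() helper is hypothetical; vcpu_fd and the mmap'd kvm_run structure are assumed to have been set up beforehand (see the vCPU creation path later in this section).

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Hypothetical sketch of the loop that drives VM-Entry/VM-Exit from
 * userspace. vcpu_fd and run (the mmap'd struct kvm_run) are assumed
 * to already exist. */
static void run_loop(int vcpu_fd, struct kvm_run *run)
{
        for (;;) {
                /* VM-Entry: the CPU loads the guest-state area of the VMCS */
                ioctl(vcpu_fd, KVM_RUN, 0);

                /*
                 * VM-Exit: the hardware has saved the guest state and
                 * reloaded the host state; exit_reason says why we are back.
                 */
                switch (run->exit_reason) {
                case KVM_EXIT_IO:
                        /* emulate the port I/O access, then re-enter */
                        break;
                case KVM_EXIT_MMIO:
                        /* emulate the MMIO access, then re-enter */
                        break;
                case KVM_EXIT_HLT:
                        return;
                }
        }
}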
/* The two memslot sets - active and inactive (per address space) */
/* memslots record the mapping from guest GPAs (guest physical
 * addresses) to host HVAs (host virtual addresses) */
struct kvm_memslots __memslots[KVM_ADDRESS_SPACE_NUM][2];
/* The current active memslot set for each address space */
struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM];

/* array of all vCPUs created for this VM */
struct xarray vcpu_array;

/*
 * Protected by slots_lock, but can be read outside if an
 * incorrect answer is acceptable.
 */
atomic_t nr_memslots_dirty_logging;

/* Used to wait for completion of MMU notifiers. */
spinlock_t mn_invalidate_lock;
unsigned long mn_active_invalidate_count;
struct rcuwait mn_memslots_update_rcuwait;

/* For management / invalidation of gfn_to_pfn_caches */
spinlock_t gpc_lock;
struct list_head gpc_list;
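These memslots are populated from userspace: each KVM_SET_USER_MEMORY_REGION ioctl installs one GPA-to-HVA mapping. A hedged sketch follows; the ioctl and struct are the stable KVM userspace ABI, but add_memslot() is a hypothetical helper and the 2 MiB size and GPA 0 are made up for illustration.

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

/* Hypothetical helper: back 2 MiB of guest physical memory at GPA 0
 * with anonymous host memory. vm_fd is the fd returned by KVM_CREATE_VM. */
static int add_memslot(int vm_fd)
{
        void *hva = mmap(NULL, 2 << 20, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        struct kvm_userspace_memory_region region = {
                .slot            = 0,
                .guest_phys_addr = 0x0,                         /* GPA */
                .memory_size     = 2 << 20,
                .userspace_addr  = (__u64)(unsigned long)hva,   /* HVA */
        };

        /* KVM records this GPA->HVA mapping in the memslots above */
        return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}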
The VM object itself is set up in kvm_create_vm():

        if (!kvm)
                return ERR_PTR(-ENOMEM);
        ...
        /* point mm at the current process's address space
         * (i.e. the userspace qemu-kvm process) */
        kvm->mm = current->mm;

        r = kvm_arch_init_vm(kvm, type);
        if (r)
                goto out_err_no_disable;

        r = hardware_enable_all();
        if (r)
                goto out_err_no_disable;

        /* allocate the memslot sets for the VM */
        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
                struct kvm_memslots *slots = kvm_alloc_memslots();

                if (!slots)
                        goto out_err_no_srcu;
                /*
                 * Generations must be different for each address space.
                 * Init kvm generation close to the maximum to easily test the
                 * code of handling generation number wrap-around.
                 */
                slots->generation = i * 2 - 150;
                rcu_assign_pointer(kvm->memslots[i], slots);
        }

        /* initialize the I/O buses, allocating memory for each bus */
        for (i = 0; i < KVM_NR_BUSES; i++) {
                kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
                                        GFP_KERNEL);
                if (!kvm->buses[i])
                        goto out_err;
        }
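kvm_create_vm() is reached from userspace through the KVM_CREATE_VM ioctl on /dev/kvm. A minimal sketch of that sequence (standard KVM ABI; create_vm() is a hypothetical helper and error handling is omitted):

#include <fcntl.h>
#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Hypothetical helper: open /dev/kvm and create a VM. */
static int create_vm(void)
{
        int kvm_fd = open("/dev/kvm", O_RDWR);

        /* sanity-check the ABI version before doing anything else */
        if (ioctl(kvm_fd, KVM_GET_API_VERSION, 0) != KVM_API_VERSION)
                return -1;

        /* this ioctl lands in kvm_create_vm() shown above; the
         * returned VM fd is what KVM_CREATE_VCPU below operates on */
        return ioctl(kvm_fd, KVM_CREATE_VM, 0);
}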
A vCPU is then created through the VM fd's ioctl handler:

static long kvm_vm_ioctl(struct file *filp,
                         unsigned int ioctl, unsigned long arg)
{
        ...
        switch (ioctl) {
        case KVM_CREATE_VCPU:
                r = kvm_vm_ioctl_create_vcpu(kvm, arg);
                break;
        ...
        }
}
static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
{
        int r;
        struct kvm_vcpu *vcpu;

        /* some sanity checks... */

kvm_arch_vcpu_create() initializes the vcpu structure; drilling down, it mainly sets up the register state the vCPU will use when running in non-root mode.

        r = kvm_arch_vcpu_setup(vcpu);
        if (r)
                goto vcpu_destroy;

        r = kvm_create_vcpu_debugfs(vcpu);
        if (r)
                goto vcpu_destroy;

        mutex_lock(&kvm->lock);
        if (kvm_get_vcpu_by_id(kvm, id)) {
                r = -EEXIST;
                goto unlock_vcpu_destroy;
        }

        /* Now it's all set up, let userspace reach it */
        kvm_get_kvm(kvm);
        /* create the vcpu fd handed back to userspace */
        r = create_vcpu_fd(vcpu);
        if (r < 0) {
                kvm_put_kvm(kvm);
                goto unlock_vcpu_destroy;
        }
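On the userspace side, the fd returned by create_vcpu_fd() is obtained with the KVM_CREATE_VCPU ioctl, and the shared kvm_run structure is then mapped into the process. A sketch using a hypothetical create_vcpu() helper, assuming kvm_fd and vm_fd from the KVM_CREATE_VM example above:

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

/* Hypothetical helper: create vCPU 0 and map its kvm_run area.
 * kvm_fd is the /dev/kvm fd, vm_fd the fd from KVM_CREATE_VM. */
static struct kvm_run *create_vcpu(int kvm_fd, int vm_fd, int *vcpu_fd)
{
        long mmap_size;

        /* ends up in kvm_vm_ioctl_create_vcpu() shown above */
        *vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);

        /*
         * kvm_run is shared with the kernel; after every VM-Exit that
         * reaches userspace it holds the exit reason and its payload.
         */
        mmap_size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
        return mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
                    MAP_SHARED, *vcpu_fd, 0);
}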
On ARM64 with nVHE, by contrast, the host state is restored in software; the tail of __kvm_vcpu_run() in the hyp code walks back to the host step by step:

        /*
         * Same thing as before the guest run: we're about to switch
         * the MMU context, so let's make sure we don't have any
         * ongoing EL1&0 translations.
         */
        dsb(nsh);

        __deactivate_traps(vcpu);
        __load_host_stage2();

        __sysreg_restore_state_nvhe(host_ctxt);

        if (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED)
                __fpsimd_save_fpexc32(vcpu);

        __debug_switch_to_host(vcpu);
        /*
         * This must come after restoring the host sysregs, since a non-VHE
         * system may enable SPE here and make use of the TTBRs.
         */
        __debug_restore_host_buffers_nvhe(vcpu);

        if (pmu_switch_needed)
                __pmu_switch_to_host(vcpu);

        /* Returning to host will clear PSR.I, remask PMR if needed */
        if (system_uses_irq_prio_masking())
                gic_write_pmr(GIC_PRIO_IRQOFF);

        host_ctxt->__hyp_running_vcpu = NULL;

        return exit_code;
}
The function that finally enters the guest is vcpu_enter_guest(); different kernel versions implement it differently, but along broadly similar lines.

static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
{
        /*
         * Handle pending requests, including timer migration, TLB flushes,
         * master-clock updates, MMU sync, and so on.
         */
        if (kvm_request_pending(vcpu)) {
                ...
        }
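Inside that block, each pending request bit is consumed with kvm_check_request(). A representative excerpt from one kernel version (the exact set of requests and their handlers varies across releases):

        if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
                __kvm_migrate_timers(vcpu);
        if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
                kvm_gen_update_masterclock(vcpu->kvm);
        if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
                kvm_mmu_sync_roots(vcpu);
        if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
                kvm_vcpu_flush_tlb_all(vcpu);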
        for (;;) {
                WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) !=
                              kvm_vcpu_apicv_active(vcpu)) &&
                             (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));

                exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
                if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
                        break;

                if (kvm_lapic_enabled(vcpu))
                        static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);

                if (unlikely(kvm_vcpu_exit_request(vcpu))) {
                        exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
                        break;
                }

                /* Note, VM-Exits that go down the "slow" path are accounted below. */
                ++vcpu->stat.exits;
        }
        /*
         * Do this here before restoring debug registers on the host. And
         * since we do this before handling the vmexit, a DR access vmexit
         * can (a) read the correct value of the debug registers, (b) set
         * KVM_DEBUGREG_WONT_EXIT again.
         */
        if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
                WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
                static_call(kvm_x86_sync_dirty_debug_regs)(vcpu);
                kvm_update_dr0123(vcpu);
                kvm_update_dr7(vcpu);
        }

        /*
         * If the guest has used debug registers, at least dr7
         * will be disabled while returning to the host.
         * If we don't have active breakpoints in the host, we don't
         * care about the messed up debug address registers. But if
         * we have some of them active, restore the old state.
         */
        if (hw_breakpoint_active())
                hw_breakpoint_restore();

        /*
         * Sync xfd before calling handle_exit_irqoff() which may
         * rely on the fact that guest_fpu::xfd is up-to-date (e.g.
         * in #NM irqoff handler).
         */
        if (vcpu->arch.xfd_no_write_intercept)
                fpu_sync_guest_vmexit_xfd_state();

        static_call(kvm_x86_handle_exit_irqoff)(vcpu);

        if (vcpu->arch.guest_fpu.xfd_err)
                wrmsrl(MSR_IA32_XFD_ERR, 0);

        /*
         * Consume any pending interrupts, including the possible source of
         * VM-Exit on SVM and any ticks that occur between VM-Exit and now.
         * An instruction is required after local_irq_enable() to fully unblock
         * interrupts on processors that implement an interrupt shadow, the
         * stat.exits increment will do nicely.
         */
        kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
        local_irq_enable();
        ++vcpu->stat.exits;
        local_irq_disable();
        kvm_after_interrupt(vcpu);