From cb678d6016510cc8c6572dd9f426fe74dd4eda84 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Wed, 30 Mar 2016 14:33:59 +0100 Subject: [PATCH 1/9] arm64: kvm: 4.6-rc1: Fix VTCR_EL2 VS setting When we detect support for 16bit VMID in ID_AA64MMFR1, we set the VTCR_EL2_VS field to 1 to make use of 16bit vmids. But, with commit 3a3604bc5eb4 ("arm64: KVM: Switch to C-based stage2 init") this is broken and we corrupt VTCR_EL2:T0SZ instead of updating the VS field. VTCR_EL2_VS was actually defined to the field shift (19) and not the real value for VS. This patch fixes the issue. Fixes: commit 3a3604bc5eb4 ("arm64: KVM: Switch to C-based stage2 init") Cc: Christoffer Dall Cc: Mark Rutland Acked-by: Marc Zyngier Signed-off-by: Suzuki K Poulose Signed-off-by: Christoffer Dall --- arch/arm64/include/asm/kvm_arm.h | 4 +++- arch/arm64/include/asm/sysreg.h | 3 +++ arch/arm64/kvm/hyp/s2-setup.c | 6 ++++-- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 0e391dbfc420..4150fd8bae01 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -124,7 +124,9 @@ #define VTCR_EL2_SL0_LVL1 (1 << 6) #define VTCR_EL2_T0SZ_MASK 0x3f #define VTCR_EL2_T0SZ_40B 24 -#define VTCR_EL2_VS 19 +#define VTCR_EL2_VS_SHIFT 19 +#define VTCR_EL2_VS_8BIT (0 << VTCR_EL2_VS_SHIFT) +#define VTCR_EL2_VS_16BIT (1 << VTCR_EL2_VS_SHIFT) /* * We configure the Stage-2 page tables to always restrict the IPA space to be diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 1a78d6e2a78b..12874164b0ae 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -141,6 +141,9 @@ #define ID_AA64MMFR1_VMIDBITS_SHIFT 4 #define ID_AA64MMFR1_HADBS_SHIFT 0 +#define ID_AA64MMFR1_VMIDBITS_8 0 +#define ID_AA64MMFR1_VMIDBITS_16 2 + /* id_aa64mmfr2 */ #define ID_AA64MMFR2_UAO_SHIFT 4 diff --git a/arch/arm64/kvm/hyp/s2-setup.c b/arch/arm64/kvm/hyp/s2-setup.c index bfc54fd82797..5a9f3bf542b0 100644 --- a/arch/arm64/kvm/hyp/s2-setup.c +++ b/arch/arm64/kvm/hyp/s2-setup.c @@ -36,8 +36,10 @@ void __hyp_text __init_stage2_translation(void) * Read the VMIDBits bits from ID_AA64MMFR1_EL1 and set the VS * bit in VTCR_EL2. */ - tmp = (read_sysreg(id_aa64mmfr1_el1) >> 4) & 0xf; - val |= (tmp == 2) ? VTCR_EL2_VS : 0; + tmp = (read_sysreg(id_aa64mmfr1_el1) >> ID_AA64MMFR1_VMIDBITS_SHIFT) & 0xf; + val |= (tmp == ID_AA64MMFR1_VMIDBITS_16) ? + VTCR_EL2_VS_16BIT : + VTCR_EL2_VS_8BIT; write_sysreg(val, vtcr_el2); } From 5f5560b1c5f3a80e91c6babb2da34a51943bbdec Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 30 Mar 2016 18:33:04 +0100 Subject: [PATCH 2/9] arm64: KVM: Register CPU notifiers when the kernel runs at HYP When the kernel is running at EL2, it doesn't need init_hyp_mode() to configure page tables for HYP. This function also registers the CPU hotplug and lower power notifiers that cause HYP to be re-initialised after the CPU has been reset. To avoid losing the register state that controls stage2 translation, move the registering of these notifiers into init_subsystems(), and add a is_kernel_in_hyp_mode() path to each callback. Acked-by: Marc Zyngier Acked-by: Christoffer Dall Fixes: 1e947bad0b6 ("arm64: KVM: Skip HYP setup when already running in HYP") Signed-off-by: James Morse Signed-off-by: Christoffer Dall --- arch/arm/kvm/arm.c | 52 +++++++++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 19 deletions(-) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 6accd66d26f0..b5384311dec4 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -1061,15 +1061,27 @@ static void cpu_init_hyp_mode(void *dummy) kvm_arm_init_debug(); } +static void cpu_hyp_reinit(void) +{ + if (is_kernel_in_hyp_mode()) { + /* + * cpu_init_stage2() is safe to call even if the PM + * event was cancelled before the CPU was reset. + */ + cpu_init_stage2(NULL); + } else { + if (__hyp_get_vectors() == hyp_default_vectors) + cpu_init_hyp_mode(NULL); + } +} + static int hyp_init_cpu_notify(struct notifier_block *self, unsigned long action, void *cpu) { switch (action) { case CPU_STARTING: case CPU_STARTING_FROZEN: - if (__hyp_get_vectors() == hyp_default_vectors) - cpu_init_hyp_mode(NULL); - break; + cpu_hyp_reinit(); } return NOTIFY_OK; @@ -1084,9 +1096,8 @@ static int hyp_init_cpu_pm_notifier(struct notifier_block *self, unsigned long cmd, void *v) { - if (cmd == CPU_PM_EXIT && - __hyp_get_vectors() == hyp_default_vectors) { - cpu_init_hyp_mode(NULL); + if (cmd == CPU_PM_EXIT) { + cpu_hyp_reinit(); return NOTIFY_OK; } @@ -1127,6 +1138,22 @@ static int init_subsystems(void) { int err; + /* + * Register CPU Hotplug notifier + */ + cpu_notifier_register_begin(); + err = __register_cpu_notifier(&hyp_init_cpu_nb); + cpu_notifier_register_done(); + if (err) { + kvm_err("Cannot register KVM init CPU notifier (%d)\n", err); + return err; + } + + /* + * Register CPU lower-power notifier + */ + hyp_cpu_pm_init(); + /* * Init HYP view of VGIC */ @@ -1270,19 +1297,6 @@ static int init_hyp_mode(void) free_boot_hyp_pgd(); #endif - cpu_notifier_register_begin(); - - err = __register_cpu_notifier(&hyp_init_cpu_nb); - - cpu_notifier_register_done(); - - if (err) { - kvm_err("Cannot register HYP init CPU notifier (%d)\n", err); - goto out_err; - } - - hyp_cpu_pm_init(); - /* set size of VMID supported by CPU */ kvm_vmid_bits = kvm_get_vmid_bits(); kvm_info("%d-bit VMID\n", kvm_vmid_bits); From 321c5658c5e9192dea0d58ab67cf1791e45b2b26 Mon Sep 17 00:00:00 2001 From: Yuki Shibuya Date: Thu, 24 Mar 2016 05:17:03 +0000 Subject: [PATCH 3/9] KVM: x86: Inject pending interrupt even if pending nmi exist Non maskable interrupts (NMI) are preferred to interrupts in current implementation. If a NMI is pending and NMI is blocked by the result of nmi_allowed(), pending interrupt is not injected and enable_irq_window() is not executed, even if interrupts injection is allowed. In old kernel (e.g. 2.6.32), schedule() is often called in NMI context. In this case, interrupts are needed to execute iret that intends end of NMI. The flag of blocking new NMI is not cleared until the guest execute the iret, and interrupts are blocked by pending NMI. Due to this, iret can't be invoked in the guest, and the guest is starved until block is cleared by some events (e.g. canceling injection). This patch injects pending interrupts, when it's allowed, even if NMI is blocked. And, If an interrupts is pending after executing inject_pending_event(), enable_irq_window() is executed regardless of NMI pending counter. Cc: stable@vger.kernel.org Signed-off-by: Yuki Shibuya Suggested-by: Paolo Bonzini Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 742d0f7d3556..0a2c70e43bc8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6095,12 +6095,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) } /* try to inject new event if pending */ - if (vcpu->arch.nmi_pending) { - if (kvm_x86_ops->nmi_allowed(vcpu)) { - --vcpu->arch.nmi_pending; - vcpu->arch.nmi_injected = true; - kvm_x86_ops->set_nmi(vcpu); - } + if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) { + --vcpu->arch.nmi_pending; + vcpu->arch.nmi_injected = true; + kvm_x86_ops->set_nmi(vcpu); } else if (kvm_cpu_has_injectable_intr(vcpu)) { /* * Because interrupts can be injected asynchronously, we are @@ -6569,10 +6567,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (inject_pending_event(vcpu, req_int_win) != 0) req_immediate_exit = true; /* enable NMI/IRQ window open exits if needed */ - else if (vcpu->arch.nmi_pending) - kvm_x86_ops->enable_nmi_window(vcpu); - else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) - kvm_x86_ops->enable_irq_window(vcpu); + else { + if (vcpu->arch.nmi_pending) + kvm_x86_ops->enable_nmi_window(vcpu); + if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) + kvm_x86_ops->enable_irq_window(vcpu); + } if (kvm_lapic_enabled(vcpu)) { update_cr8_intercept(vcpu); From a2b5c3c0c8eea2d5d0eefcfc0fc0bdf386daa260 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 29 Mar 2016 11:23:25 +0200 Subject: [PATCH 4/9] KVM: Hyper-V: do not do hypercall userspace exits if SynIC is disabled If SynIC is disabled, there is nothing that userspace can do to handle these exits; on the other hand, userspace probably will not know about KVM_EXIT_HYPERV_HCALL and complain about it or even exit. Just prevent anything bad from happening by handling the hypercall in KVM and returning an "invalid hypercall" code. Fixes: 83326e43f27e9a8a501427a0060f8af519a39bb2 Cc: Andrey Smetanin Reviewed-by: Roman Kagan Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 5ff3485acb60..01bd7b7a6866 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1116,6 +1116,11 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) break; case HVCALL_POST_MESSAGE: case HVCALL_SIGNAL_EVENT: + /* don't bother userspace if it has no way to handle it */ + if (!vcpu_to_synic(vcpu)->active) { + res = HV_STATUS_INVALID_HYPERCALL_CODE; + break; + } vcpu->run->exit_reason = KVM_EXIT_HYPERV; vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL; vcpu->run->hyperv.u.hcall.input = param; From 14ebda3394fd3e5388747e742e510b0802a65d24 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 29 Mar 2016 17:56:57 +0200 Subject: [PATCH 5/9] KVM: x86: reduce default value of halt_poll_ns parameter Windows lets applications choose the frequency of the timer tick, and in Windows 10 the maximum rate was changed from 1024 Hz to 2048 Hz. Unfortunately, because of the way the Windows API works, most applications who need a higher rate than the default 64 Hz will just do timeGetDevCaps(&tc, sizeof(tc)); timeBeginPeriod(tc.wPeriodMin); and pick the maximum rate. This causes very high CPU usage when playing media or games on Windows 10, even if the guest does not actually use the CPU very much, because the frequent timer tick causes halt_poll_ns to kick in. There is no really good solution, especially because Microsoft could sooner or later bump the limit to 4096 Hz, but for now the best we can do is lower a bit the upper limit for halt_poll_ns. :-( Reported-by: Jon Panozzo Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f62a9f37f79f..b7e394485a5f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -43,7 +43,7 @@ #define KVM_PIO_PAGE_OFFSET 1 #define KVM_COALESCED_MMIO_PAGE_OFFSET 2 -#define KVM_HALT_POLL_NS_DEFAULT 500000 +#define KVM_HALT_POLL_NS_DEFAULT 400000 #define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS From 14f4760562e41d50817d56b42c821d70ad10b483 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 30 Mar 2016 13:38:09 -0700 Subject: [PATCH 6/9] kvm: set page dirty only if page has been writable In absence of shadow dirty mask, there is no need to set page dirty if page has never been writable. This is a tiny optimization but good to have for people who care much about dirty page tracking. Signed-off-by: Yu Zhao Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 70e95d097ef1..1ff4dbb73fb7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -557,8 +557,15 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) !is_writable_pte(new_spte)) ret = true; - if (!shadow_accessed_mask) + if (!shadow_accessed_mask) { + /* + * We don't set page dirty when dropping non-writable spte. + * So do it now if the new spte is becoming non-writable. + */ + if (ret) + kvm_set_pfn_dirty(spte_to_pfn(old_spte)); return ret; + } /* * Flush TLB when accessed/dirty bits are changed in the page tables, @@ -605,7 +612,8 @@ static int mmu_spte_clear_track_bits(u64 *sptep) if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) kvm_set_pfn_accessed(pfn); - if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask)) + if (old_spte & (shadow_dirty_mask ? shadow_dirty_mask : + PT_WRITABLE_MASK)) kvm_set_pfn_dirty(pfn); return 1; } From 9c650d09a9c3029cc90cae5d2cd7ab131bdb86c2 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Mon, 4 Apr 2016 09:41:32 +0200 Subject: [PATCH 7/9] s390/mm/kvm: fix mis-merge in gmap handling commit 1e133ab296f3 ("s390/mm: split arch/s390/mm/pgtable.c") dropped some changes from commit a3a92c31bf0b ("KVM: s390: fix mismatch between user and in-kernel guest limit") - this breaks KVM for some memory sizes (kvm-s390: failed to commit memory region) like exactly 2GB. Cc: Dominik Dingel Cc: Martin Schwidefsky Acked-by: Heiko Carstens Signed-off-by: Christian Borntraeger Signed-off-by: Paolo Bonzini --- arch/s390/mm/gmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 69247b4dcc43..cace818d86eb 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -23,7 +23,7 @@ /** * gmap_alloc - allocate a guest address space * @mm: pointer to the parent mm_struct - * @limit: maximum size of the gmap address space + * @limit: maximum address of the gmap address space * * Returns a guest address space structure. */ @@ -292,7 +292,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from, if ((from | to | len) & (PMD_SIZE - 1)) return -EINVAL; if (len == 0 || from + len < from || to + len < to || - from + len > TASK_MAX_SIZE || to + len > gmap->asce_end) + from + len - 1 > TASK_MAX_SIZE || to + len - 1 > gmap->asce_end) return -EINVAL; flush = 0; From 61abdbe0bcc2b32745ab4479cc550f4c1f518ee2 Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Mon, 4 Apr 2016 16:46:07 -0400 Subject: [PATCH 8/9] kvm: x86: make lapic hrtimer pinned When a vCPU runs on a nohz_full core, the hrtimer used by the lapic emulation code can be migrated to another core. When this happens, it's possible to observe milisecond latency when delivering timer IRQs to KVM guests. The huge latency is mainly due to the fact that apic_timer_fn() expects to run during a kvm exit. It sets KVM_REQ_PENDING_TIMER and let it be handled on kvm entry. However, if the timer fires on a different core, we have to wait until the next kvm exit for the guest to see KVM_REQ_PENDING_TIMER set. This problem became visible after commit 9642d18ee. This commit changed the timer migration code to always attempt to migrate timers away from nohz_full cores. While it's discussable if this is correct/desirable (I don't think it is), it's clear that the lapic emulation code has a requirement on firing the hrtimer in the same core where it was started. This is achieved by making the hrtimer pinned. Lastly, note that KVM has code to migrate timers when a vCPU is scheduled to run in different core. However, this forced migration may fail. When this happens, we can have the same problem. If we want 100% correctness, we'll have to modify apic_timer_fn() to cause a kvm exit when it runs on a different core than the vCPU. Not sure if this is possible. Here's a reproducer for the issue being fixed: 1. Set all cores but core0 to be nohz_full cores 2. Start a guest with a single vCPU 3. Trace apic_timer_fn() and kvm_inject_apic_timer_irqs() You'll see that apic_timer_fn() will run in core0 while kvm_inject_apic_timer_irqs() runs in a different core. If you get both on core0, try running a program that takes 100% of the CPU and pin it to core0 to force the vCPU out. Signed-off-by: Luiz Capitulino Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 443d2a57ad3d..1a2da0e5a373 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1369,7 +1369,7 @@ static void start_apic_timer(struct kvm_lapic *apic) hrtimer_start(&apic->lapic_timer.timer, ktime_add_ns(now, apic->lapic_timer.period), - HRTIMER_MODE_ABS); + HRTIMER_MODE_ABS_PINNED); apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" PRIx64 ", " @@ -1402,7 +1402,7 @@ static void start_apic_timer(struct kvm_lapic *apic) expire = ktime_add_ns(now, ns); expire = ktime_sub_ns(expire, lapic_timer_advance_ns); hrtimer_start(&apic->lapic_timer.timer, - expire, HRTIMER_MODE_ABS); + expire, HRTIMER_MODE_ABS_PINNED); } else apic_timer_expired(apic); @@ -1868,7 +1868,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) apic->vcpu = vcpu; hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS); + HRTIMER_MODE_ABS_PINNED); apic->lapic_timer.timer.function = apic_timer_fn; /* @@ -2003,7 +2003,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) timer = &vcpu->arch.apic->lapic_timer.timer; if (hrtimer_cancel(timer)) - hrtimer_start_expires(timer, HRTIMER_MODE_ABS); + hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); } /* From 95272c29378ee7dc15f43fa2758cb28a5913a06d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 31 Mar 2016 09:38:51 +0200 Subject: [PATCH 9/9] compiler-gcc: disable -ftracer for __noclone functions -ftracer can duplicate asm blocks causing compilation to fail in noclone functions. For example, KVM declares a global variable in an asm like asm("2: ... \n .pushsection data \n .global vmx_return \n vmx_return: .long 2b"); and -ftracer causes a double declaration. Cc: Andrew Morton Cc: Michal Marek Cc: stable@vger.kernel.org Cc: kvm@vger.kernel.org Reported-by: Linda Walsh Signed-off-by: Paolo Bonzini --- include/linux/compiler-gcc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 22ab246feed3..eeae401a2412 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -199,7 +199,7 @@ #define unreachable() __builtin_unreachable() /* Mark a function definition as prohibited from being cloned. */ -#define __noclone __attribute__((__noclone__)) +#define __noclone __attribute__((__noclone__, __optimize__("no-tracer"))) #endif /* GCC_VERSION >= 40500 */