linux/arch/x86/kvm/x86.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Kernel-based Virtual Machine driver for Linux
   4 *
   5 * derived from drivers/kvm/kvm_main.c
   6 *
   7 * Copyright (C) 2006 Qumranet, Inc.
   8 * Copyright (C) 2008 Qumranet, Inc.
   9 * Copyright IBM Corporation, 2008
  10 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  11 *
  12 * Authors:
  13 *   Avi Kivity   <avi@qumranet.com>
  14 *   Yaniv Kamay  <yaniv@qumranet.com>
  15 *   Amit Shah    <amit.shah@qumranet.com>
  16 *   Ben-Ami Yassour <benami@il.ibm.com>
  17 */
  18
  19#include <linux/kvm_host.h>
  20#include "irq.h"
  21#include "ioapic.h"
  22#include "mmu.h"
  23#include "i8254.h"
  24#include "tss.h"
  25#include "kvm_cache_regs.h"
  26#include "kvm_emulate.h"
  27#include "x86.h"
  28#include "cpuid.h"
  29#include "pmu.h"
  30#include "hyperv.h"
  31#include "lapic.h"
  32#include "xen.h"
  33
  34#include <linux/clocksource.h>
  35#include <linux/interrupt.h>
  36#include <linux/kvm.h>
  37#include <linux/fs.h>
  38#include <linux/vmalloc.h>
  39#include <linux/export.h>
  40#include <linux/moduleparam.h>
  41#include <linux/mman.h>
  42#include <linux/highmem.h>
  43#include <linux/iommu.h>
  44#include <linux/intel-iommu.h>
  45#include <linux/cpufreq.h>
  46#include <linux/user-return-notifier.h>
  47#include <linux/srcu.h>
  48#include <linux/slab.h>
  49#include <linux/perf_event.h>
  50#include <linux/uaccess.h>
  51#include <linux/hash.h>
  52#include <linux/pci.h>
  53#include <linux/timekeeper_internal.h>
  54#include <linux/pvclock_gtod.h>
  55#include <linux/kvm_irqfd.h>
  56#include <linux/irqbypass.h>
  57#include <linux/sched/stat.h>
  58#include <linux/sched/isolation.h>
  59#include <linux/mem_encrypt.h>
  60#include <linux/entry-kvm.h>
  61#include <linux/suspend.h>
  62
  63#include <trace/events/kvm.h>
  64
  65#include <asm/debugreg.h>
  66#include <asm/msr.h>
  67#include <asm/desc.h>
  68#include <asm/mce.h>
  69#include <asm/pkru.h>
  70#include <linux/kernel_stat.h>
  71#include <asm/fpu/internal.h> /* Ugh! */
  72#include <asm/pvclock.h>
  73#include <asm/div64.h>
  74#include <asm/irq_remapping.h>
  75#include <asm/mshyperv.h>
  76#include <asm/hypervisor.h>
  77#include <asm/tlbflush.h>
  78#include <asm/intel_pt.h>
  79#include <asm/emulate_prefix.h>
  80#include <asm/sgx.h>
  81#include <clocksource/hyperv_timer.h>
  82
  83#define CREATE_TRACE_POINTS
  84#include "trace.h"
  85
  86#define MAX_IO_MSRS 256
  87#define KVM_MAX_MCE_BANKS 32
  88u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
  89EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
  90
  91#define emul_to_vcpu(ctxt) \
  92        ((struct kvm_vcpu *)(ctxt)->vcpu)
  93
  94/* EFER defaults:
   95 * - enable syscall by default because it's emulated by KVM
   96 * - enable LME and LMA by default on 64-bit KVM
  97 */
  98#ifdef CONFIG_X86_64
  99static
 100u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
 101#else
 102static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
 103#endif
 104
 105static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
 106
 107#define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE)
 108
 109#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
 110                                    KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
 111
 112static void update_cr8_intercept(struct kvm_vcpu *vcpu);
 113static void process_nmi(struct kvm_vcpu *vcpu);
 114static void process_smi(struct kvm_vcpu *vcpu);
 115static void enter_smm(struct kvm_vcpu *vcpu);
 116static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
 117static void store_regs(struct kvm_vcpu *vcpu);
 118static int sync_regs(struct kvm_vcpu *vcpu);
 119
 120static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
 121static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
 122
 123struct kvm_x86_ops kvm_x86_ops __read_mostly;
 124EXPORT_SYMBOL_GPL(kvm_x86_ops);
 125
 126#define KVM_X86_OP(func)                                             \
 127        DEFINE_STATIC_CALL_NULL(kvm_x86_##func,                      \
 128                                *(((struct kvm_x86_ops *)0)->func));
 129#define KVM_X86_OP_NULL KVM_X86_OP
 130#include <asm/kvm-x86-ops.h>
 131EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits);
 132EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg);
 133EXPORT_STATIC_CALL_GPL(kvm_x86_tlb_flush_current);
 134
 135static bool __read_mostly ignore_msrs = 0;
 136module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
 137
 138bool __read_mostly report_ignored_msrs = true;
 139module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);
 140EXPORT_SYMBOL_GPL(report_ignored_msrs);
 141
 142unsigned int min_timer_period_us = 200;
 143module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
 144
 145static bool __read_mostly kvmclock_periodic_sync = true;
 146module_param(kvmclock_periodic_sync, bool, S_IRUGO);
 147
 148bool __read_mostly kvm_has_tsc_control;
 149EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
 150u32  __read_mostly kvm_max_guest_tsc_khz;
 151EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
 152u8   __read_mostly kvm_tsc_scaling_ratio_frac_bits;
 153EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
 154u64  __read_mostly kvm_max_tsc_scaling_ratio;
 155EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
 156u64 __read_mostly kvm_default_tsc_scaling_ratio;
 157EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
 158bool __read_mostly kvm_has_bus_lock_exit;
 159EXPORT_SYMBOL_GPL(kvm_has_bus_lock_exit);
 160
 161/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
 162static u32 __read_mostly tsc_tolerance_ppm = 250;
 163module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
 164
 165/*
 166 * lapic timer advance (tscdeadline mode only) in nanoseconds.  '-1' enables
 167 * adaptive tuning starting from default advancement of 1000ns.  '0' disables
 168 * advancement entirely.  Any other value is used as-is and disables adaptive
 169 * tuning, i.e. allows privileged userspace to set an exact advancement time.
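      *
      * With S_IRUGO | S_IWUSR the parameter is also writable at run time,
      * typically via /sys/module/kvm/parameters/lapic_timer_advance_ns.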
 170 */
 171static int __read_mostly lapic_timer_advance_ns = -1;
 172module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);
 173
 174static bool __read_mostly vector_hashing = true;
 175module_param(vector_hashing, bool, S_IRUGO);
 176
 177bool __read_mostly enable_vmware_backdoor = false;
 178module_param(enable_vmware_backdoor, bool, S_IRUGO);
 179EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
 180
 181static bool __read_mostly force_emulation_prefix = false;
 182module_param(force_emulation_prefix, bool, S_IRUGO);
 183
 184int __read_mostly pi_inject_timer = -1;
 185module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
 186
 187/*
 188 * Restoring the host value for MSRs that are only consumed when running in
 189 * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
 190 * returns to userspace, i.e. the kernel can run with the guest's value.
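      *
      * Typical usage (illustrative; vendor code differs in detail): a slot is
      * reserved once at init with kvm_add_user_return_msr(msr), the guest
      * value is loaded before entering the guest with
      * kvm_set_user_return_msr(slot, guest_val, mask), and kvm_on_user_return()
      * restores the host value the next time the CPU returns to userspace.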
 191 */
 192#define KVM_MAX_NR_USER_RETURN_MSRS 16
 193
 194struct kvm_user_return_msrs {
 195        struct user_return_notifier urn;
 196        bool registered;
 197        struct kvm_user_return_msr_values {
 198                u64 host;
 199                u64 curr;
 200        } values[KVM_MAX_NR_USER_RETURN_MSRS];
 201};
 202
 203u32 __read_mostly kvm_nr_uret_msrs;
 204EXPORT_SYMBOL_GPL(kvm_nr_uret_msrs);
 205static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS];
 206static struct kvm_user_return_msrs __percpu *user_return_msrs;
 207
 208#define KVM_SUPPORTED_XCR0     (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
 209                                | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
 210                                | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
 211                                | XFEATURE_MASK_PKRU)
 212
 213u64 __read_mostly host_efer;
 214EXPORT_SYMBOL_GPL(host_efer);
 215
 216bool __read_mostly allow_smaller_maxphyaddr = 0;
 217EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
 218
 219bool __read_mostly enable_apicv = true;
 220EXPORT_SYMBOL_GPL(enable_apicv);
 221
 222u64 __read_mostly host_xss;
 223EXPORT_SYMBOL_GPL(host_xss);
 224u64 __read_mostly supported_xss;
 225EXPORT_SYMBOL_GPL(supported_xss);
 226
 227const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
 228        KVM_GENERIC_VM_STATS(),
 229        STATS_DESC_COUNTER(VM, mmu_shadow_zapped),
 230        STATS_DESC_COUNTER(VM, mmu_pte_write),
 231        STATS_DESC_COUNTER(VM, mmu_pde_zapped),
 232        STATS_DESC_COUNTER(VM, mmu_flooded),
 233        STATS_DESC_COUNTER(VM, mmu_recycled),
 234        STATS_DESC_COUNTER(VM, mmu_cache_miss),
 235        STATS_DESC_ICOUNTER(VM, mmu_unsync),
 236        STATS_DESC_ICOUNTER(VM, lpages),
 237        STATS_DESC_ICOUNTER(VM, nx_lpage_splits),
 238        STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions)
 239};
 240static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
 241                sizeof(struct kvm_vm_stat) / sizeof(u64));
 242
 243const struct kvm_stats_header kvm_vm_stats_header = {
 244        .name_size = KVM_STATS_NAME_SIZE,
 245        .num_desc = ARRAY_SIZE(kvm_vm_stats_desc),
 246        .id_offset = sizeof(struct kvm_stats_header),
 247        .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
 248        .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
 249                       sizeof(kvm_vm_stats_desc),
 250};
 251
 252const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
 253        KVM_GENERIC_VCPU_STATS(),
 254        STATS_DESC_COUNTER(VCPU, pf_fixed),
 255        STATS_DESC_COUNTER(VCPU, pf_guest),
 256        STATS_DESC_COUNTER(VCPU, tlb_flush),
 257        STATS_DESC_COUNTER(VCPU, invlpg),
 258        STATS_DESC_COUNTER(VCPU, exits),
 259        STATS_DESC_COUNTER(VCPU, io_exits),
 260        STATS_DESC_COUNTER(VCPU, mmio_exits),
 261        STATS_DESC_COUNTER(VCPU, signal_exits),
 262        STATS_DESC_COUNTER(VCPU, irq_window_exits),
 263        STATS_DESC_COUNTER(VCPU, nmi_window_exits),
 264        STATS_DESC_COUNTER(VCPU, l1d_flush),
 265        STATS_DESC_COUNTER(VCPU, halt_exits),
 266        STATS_DESC_COUNTER(VCPU, request_irq_exits),
 267        STATS_DESC_COUNTER(VCPU, irq_exits),
 268        STATS_DESC_COUNTER(VCPU, host_state_reload),
 269        STATS_DESC_COUNTER(VCPU, fpu_reload),
 270        STATS_DESC_COUNTER(VCPU, insn_emulation),
 271        STATS_DESC_COUNTER(VCPU, insn_emulation_fail),
 272        STATS_DESC_COUNTER(VCPU, hypercalls),
 273        STATS_DESC_COUNTER(VCPU, irq_injections),
 274        STATS_DESC_COUNTER(VCPU, nmi_injections),
 275        STATS_DESC_COUNTER(VCPU, req_event),
 276        STATS_DESC_COUNTER(VCPU, nested_run),
 277        STATS_DESC_COUNTER(VCPU, directed_yield_attempted),
 278        STATS_DESC_COUNTER(VCPU, directed_yield_successful),
 279        STATS_DESC_ICOUNTER(VCPU, guest_mode)
 280};
 281static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
 282                sizeof(struct kvm_vcpu_stat) / sizeof(u64));
 283
 284const struct kvm_stats_header kvm_vcpu_stats_header = {
 285        .name_size = KVM_STATS_NAME_SIZE,
 286        .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc),
 287        .id_offset = sizeof(struct kvm_stats_header),
 288        .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
 289        .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
 290                       sizeof(kvm_vcpu_stats_desc),
 291};
 292
 293u64 __read_mostly host_xcr0;
 294u64 __read_mostly supported_xcr0;
 295EXPORT_SYMBOL_GPL(supported_xcr0);
 296
 297static struct kmem_cache *x86_fpu_cache;
 298
 299static struct kmem_cache *x86_emulator_cache;
 300
 301/*
  302 * Called when the previous get/set MSR attempt hit an invalid MSR.
  303 * Return true if this failed MSR access should be ignored/silenced.
 304 */
 305static bool kvm_msr_ignored_check(u32 msr, u64 data, bool write)
 306{
 307        const char *op = write ? "wrmsr" : "rdmsr";
 308
 309        if (ignore_msrs) {
 310                if (report_ignored_msrs)
 311                        kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n",
 312                                      op, msr, data);
 313                /* Mask the error */
 314                return true;
 315        } else {
 316                kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n",
 317                                      op, msr, data);
 318                return false;
 319        }
 320}
 321
 322static struct kmem_cache *kvm_alloc_emulator_cache(void)
 323{
 324        unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src);
 325        unsigned int size = sizeof(struct x86_emulate_ctxt);
 326
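             /*
              * Only the tail of the context, from "src" to the end of the
              * struct, is whitelisted for usercopy; hardened usercopy will
              * reject copies that touch the rest of the structure.
              */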
 327        return kmem_cache_create_usercopy("x86_emulator", size,
 328                                          __alignof__(struct x86_emulate_ctxt),
 329                                          SLAB_ACCOUNT, useroffset,
 330                                          size - useroffset, NULL);
 331}
 332
 333static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
 334
 335static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
 336{
 337        int i;
 338        for (i = 0; i < ASYNC_PF_PER_VCPU; i++)
 339                vcpu->arch.apf.gfns[i] = ~0;
 340}
 341
 342static void kvm_on_user_return(struct user_return_notifier *urn)
 343{
 344        unsigned slot;
 345        struct kvm_user_return_msrs *msrs
 346                = container_of(urn, struct kvm_user_return_msrs, urn);
 347        struct kvm_user_return_msr_values *values;
 348        unsigned long flags;
 349
 350        /*
 351         * Disabling irqs at this point since the following code could be
 352         * interrupted and executed through kvm_arch_hardware_disable()
 353         */
 354        local_irq_save(flags);
 355        if (msrs->registered) {
 356                msrs->registered = false;
 357                user_return_notifier_unregister(urn);
 358        }
 359        local_irq_restore(flags);
 360        for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) {
 361                values = &msrs->values[slot];
 362                if (values->host != values->curr) {
 363                        wrmsrl(kvm_uret_msrs_list[slot], values->host);
 364                        values->curr = values->host;
 365                }
 366        }
 367}
 368
 369static int kvm_probe_user_return_msr(u32 msr)
 370{
 371        u64 val;
 372        int ret;
 373
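             /*
              * Probe with a read followed by a write-back of the same value,
              * with preemption disabled so both accesses hit the same CPU; if
              * either faults, the MSR cannot be used as a user-return MSR on
              * this host.
              */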
 374        preempt_disable();
 375        ret = rdmsrl_safe(msr, &val);
 376        if (ret)
 377                goto out;
 378        ret = wrmsrl_safe(msr, val);
 379out:
 380        preempt_enable();
 381        return ret;
 382}
 383
 384int kvm_add_user_return_msr(u32 msr)
 385{
 386        BUG_ON(kvm_nr_uret_msrs >= KVM_MAX_NR_USER_RETURN_MSRS);
 387
 388        if (kvm_probe_user_return_msr(msr))
 389                return -1;
 390
 391        kvm_uret_msrs_list[kvm_nr_uret_msrs] = msr;
 392        return kvm_nr_uret_msrs++;
 393}
 394EXPORT_SYMBOL_GPL(kvm_add_user_return_msr);
 395
 396int kvm_find_user_return_msr(u32 msr)
 397{
 398        int i;
 399
 400        for (i = 0; i < kvm_nr_uret_msrs; ++i) {
 401                if (kvm_uret_msrs_list[i] == msr)
 402                        return i;
 403        }
 404        return -1;
 405}
 406EXPORT_SYMBOL_GPL(kvm_find_user_return_msr);
 407
 408static void kvm_user_return_msr_cpu_online(void)
 409{
 410        unsigned int cpu = smp_processor_id();
 411        struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
 412        u64 value;
 413        int i;
 414
 415        for (i = 0; i < kvm_nr_uret_msrs; ++i) {
 416                rdmsrl_safe(kvm_uret_msrs_list[i], &value);
 417                msrs->values[i].host = value;
 418                msrs->values[i].curr = value;
 419        }
 420}
 421
 422int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
 423{
 424        unsigned int cpu = smp_processor_id();
 425        struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
 426        int err;
 427
 428        value = (value & mask) | (msrs->values[slot].host & ~mask);
 429        if (value == msrs->values[slot].curr)
 430                return 0;
 431        err = wrmsrl_safe(kvm_uret_msrs_list[slot], value);
 432        if (err)
 433                return 1;
 434
 435        msrs->values[slot].curr = value;
 436        if (!msrs->registered) {
 437                msrs->urn.on_user_return = kvm_on_user_return;
 438                user_return_notifier_register(&msrs->urn);
 439                msrs->registered = true;
 440        }
 441        return 0;
 442}
 443EXPORT_SYMBOL_GPL(kvm_set_user_return_msr);
 444
 445static void drop_user_return_notifiers(void)
 446{
 447        unsigned int cpu = smp_processor_id();
 448        struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
 449
 450        if (msrs->registered)
 451                kvm_on_user_return(&msrs->urn);
 452}
 453
 454u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
 455{
 456        return vcpu->arch.apic_base;
 457}
 458EXPORT_SYMBOL_GPL(kvm_get_apic_base);
 459
 460enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu)
 461{
 462        return kvm_apic_mode(kvm_get_apic_base(vcpu));
 463}
 464EXPORT_SYMBOL_GPL(kvm_get_apic_mode);
 465
 466int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 467{
 468        enum lapic_mode old_mode = kvm_get_apic_mode(vcpu);
 469        enum lapic_mode new_mode = kvm_apic_mode(msr_info->data);
 470        u64 reserved_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu) | 0x2ff |
 471                (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);
 472
 473        if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID)
 474                return 1;
 475        if (!msr_info->host_initiated) {
 476                if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC)
 477                        return 1;
 478                if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC)
 479                        return 1;
 480        }
 481
 482        kvm_lapic_set_base(vcpu, msr_info->data);
 483        kvm_recalculate_apic_map(vcpu->kvm);
 484        return 0;
 485}
 486EXPORT_SYMBOL_GPL(kvm_set_apic_base);
 487
 488asmlinkage __visible noinstr void kvm_spurious_fault(void)
 489{
 490        /* Fault while not rebooting.  We want the trace. */
 491        BUG_ON(!kvm_rebooting);
 492}
 493EXPORT_SYMBOL_GPL(kvm_spurious_fault);
 494
 495#define EXCPT_BENIGN            0
 496#define EXCPT_CONTRIBUTORY      1
 497#define EXCPT_PF                2
 498
 499static int exception_class(int vector)
 500{
 501        switch (vector) {
 502        case PF_VECTOR:
 503                return EXCPT_PF;
 504        case DE_VECTOR:
 505        case TS_VECTOR:
 506        case NP_VECTOR:
 507        case SS_VECTOR:
 508        case GP_VECTOR:
 509                return EXCPT_CONTRIBUTORY;
 510        default:
 511                break;
 512        }
 513        return EXCPT_BENIGN;
 514}
 515
 516#define EXCPT_FAULT             0
 517#define EXCPT_TRAP              1
 518#define EXCPT_ABORT             2
 519#define EXCPT_INTERRUPT         3
 520
 521static int exception_type(int vector)
 522{
 523        unsigned int mask;
 524
 525        if (WARN_ON(vector > 31 || vector == NMI_VECTOR))
 526                return EXCPT_INTERRUPT;
 527
 528        mask = 1 << vector;
 529
 530        /* #DB is trap, as instruction watchpoints are handled elsewhere */
 531        if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR)))
 532                return EXCPT_TRAP;
 533
 534        if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
 535                return EXCPT_ABORT;
 536
 537        /* Reserved exceptions will result in fault */
 538        return EXCPT_FAULT;
 539}
 540
 541void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
 542{
 543        unsigned nr = vcpu->arch.exception.nr;
 544        bool has_payload = vcpu->arch.exception.has_payload;
 545        unsigned long payload = vcpu->arch.exception.payload;
 546
 547        if (!has_payload)
 548                return;
 549
 550        switch (nr) {
 551        case DB_VECTOR:
 552                /*
  553                 * "Certain debug exceptions may clear bits 0-3.  The
 554                 * remaining contents of the DR6 register are never
 555                 * cleared by the processor".
 556                 */
 557                vcpu->arch.dr6 &= ~DR_TRAP_BITS;
 558                /*
  559                 * In order to reflect the #DB exception payload in guest
  560                 * dr6, three kinds of bits need to be considered: active-low
  561                 * bits, FIXED_1 bits and active-high bits (e.g. DR6_BD,
  562                 * DR6_BS and DR6_BT).
  563                 * DR6_ACTIVE_LOW contains the FIXED_1 and active-low bits.
  564                 * In the target guest dr6:
  565                 * FIXED_1 bits should always be set.
  566                 * Active-low bits should be cleared if set to 1 in the payload.
  567                 * Active-high bits should be set if set to 1 in the payload.
  568                 *
  569                 * Note, the payload is compatible with the pending debug
  570                 * exceptions/exit qualification under VMX, where active-low
  571                 * bits are reported as active-high in the payload; they
  572                 * therefore need to be flipped for DR6.
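                      *
                      * For example, starting from the reset value DR6_ACTIVE_LOW
                      * (0xffff0ff0), a payload of only DR6_BS (bit 14, single-step)
                      * leaves dr6 == 0xffff4ff0: FIXED_1 and active-low bits set,
                      * BS set, and bit 12 (reserved, cleared just below) clear.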
 573                 */
 574                vcpu->arch.dr6 |= DR6_ACTIVE_LOW;
 575                vcpu->arch.dr6 |= payload;
 576                vcpu->arch.dr6 ^= payload & DR6_ACTIVE_LOW;
 577
 578                /*
 579                 * The #DB payload is defined as compatible with the 'pending
 580                 * debug exceptions' field under VMX, not DR6. While bit 12 is
 581                 * defined in the 'pending debug exceptions' field (enabled
 582                 * breakpoint), it is reserved and must be zero in DR6.
 583                 */
 584                vcpu->arch.dr6 &= ~BIT(12);
 585                break;
 586        case PF_VECTOR:
 587                vcpu->arch.cr2 = payload;
 588                break;
 589        }
 590
 591        vcpu->arch.exception.has_payload = false;
 592        vcpu->arch.exception.payload = 0;
 593}
 594EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
 595
 596static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 597                unsigned nr, bool has_error, u32 error_code,
 598                bool has_payload, unsigned long payload, bool reinject)
 599{
 600        u32 prev_nr;
 601        int class1, class2;
 602
 603        kvm_make_request(KVM_REQ_EVENT, vcpu);
 604
 605        if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
 606        queue:
 607                if (reinject) {
 608                        /*
 609                         * On vmentry, vcpu->arch.exception.pending is only
 610                         * true if an event injection was blocked by
 611                         * nested_run_pending.  In that case, however,
 612                         * vcpu_enter_guest requests an immediate exit,
 613                         * and the guest shouldn't proceed far enough to
 614                         * need reinjection.
 615                         */
 616                        WARN_ON_ONCE(vcpu->arch.exception.pending);
 617                        vcpu->arch.exception.injected = true;
 618                        if (WARN_ON_ONCE(has_payload)) {
 619                                /*
 620                                 * A reinjected event has already
 621                                 * delivered its payload.
 622                                 */
 623                                has_payload = false;
 624                                payload = 0;
 625                        }
 626                } else {
 627                        vcpu->arch.exception.pending = true;
 628                        vcpu->arch.exception.injected = false;
 629                }
 630                vcpu->arch.exception.has_error_code = has_error;
 631                vcpu->arch.exception.nr = nr;
 632                vcpu->arch.exception.error_code = error_code;
 633                vcpu->arch.exception.has_payload = has_payload;
 634                vcpu->arch.exception.payload = payload;
 635                if (!is_guest_mode(vcpu))
 636                        kvm_deliver_exception_payload(vcpu);
 637                return;
 638        }
 639
  640        /* An exception is already queued; see how the new one combines with it. */
 641        prev_nr = vcpu->arch.exception.nr;
 642        if (prev_nr == DF_VECTOR) {
 643                /* triple fault -> shutdown */
 644                kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 645                return;
 646        }
 647        class1 = exception_class(prev_nr);
 648        class2 = exception_class(nr);
 649        if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
 650                || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
 651                /*
 652                 * Generate double fault per SDM Table 5-5.  Set
 653                 * exception.pending = true so that the double fault
 654                 * can trigger a nested vmexit.
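                      *
                      * For example (per SDM Table 5-5): a #GP hit while delivering
                      * #NP (contributory on contributory), or any contributory
                      * exception or #PF hit while delivering #PF, escalates to #DF;
                      * a #PF hit while delivering #GP is not escalated and instead
                      * replaces the queued exception via the "queue" path.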
 655                 */
 656                vcpu->arch.exception.pending = true;
 657                vcpu->arch.exception.injected = false;
 658                vcpu->arch.exception.has_error_code = true;
 659                vcpu->arch.exception.nr = DF_VECTOR;
 660                vcpu->arch.exception.error_code = 0;
 661                vcpu->arch.exception.has_payload = false;
 662                vcpu->arch.exception.payload = 0;
 663        } else
  664                /* replace previous exception with a new one in the hope
  665                   that instruction re-execution will regenerate the lost
  666                   exception */
 667                goto queue;
 668}
 669
 670void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 671{
 672        kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false);
 673}
 674EXPORT_SYMBOL_GPL(kvm_queue_exception);
 675
 676void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 677{
 678        kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true);
 679}
 680EXPORT_SYMBOL_GPL(kvm_requeue_exception);
 681
 682void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr,
 683                           unsigned long payload)
 684{
 685        kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false);
 686}
 687EXPORT_SYMBOL_GPL(kvm_queue_exception_p);
 688
 689static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
 690                                    u32 error_code, unsigned long payload)
 691{
 692        kvm_multiple_exception(vcpu, nr, true, error_code,
 693                               true, payload, false);
 694}
 695
 696int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
 697{
 698        if (err)
 699                kvm_inject_gp(vcpu, 0);
 700        else
 701                return kvm_skip_emulated_instruction(vcpu);
 702
 703        return 1;
 704}
 705EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
 706
 707void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
 708{
 709        ++vcpu->stat.pf_guest;
 710        vcpu->arch.exception.nested_apf =
 711                is_guest_mode(vcpu) && fault->async_page_fault;
 712        if (vcpu->arch.exception.nested_apf) {
 713                vcpu->arch.apf.nested_apf_token = fault->address;
 714                kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
 715        } else {
 716                kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
 717                                        fault->address);
 718        }
 719}
 720EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
 721
 722bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
 723                                    struct x86_exception *fault)
 724{
 725        struct kvm_mmu *fault_mmu;
 726        WARN_ON_ONCE(fault->vector != PF_VECTOR);
 727
 728        fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu :
 729                                               vcpu->arch.walk_mmu;
 730
 731        /*
 732         * Invalidate the TLB entry for the faulting address, if it exists,
 733         * else the access will fault indefinitely (and to emulate hardware).
 734         */
 735        if ((fault->error_code & PFERR_PRESENT_MASK) &&
 736            !(fault->error_code & PFERR_RSVD_MASK))
 737                kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,
 738                                       fault_mmu->root_hpa);
 739
 740        fault_mmu->inject_page_fault(vcpu, fault);
 741        return fault->nested_page_fault;
 742}
 743EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault);
 744
 745void kvm_inject_nmi(struct kvm_vcpu *vcpu)
 746{
 747        atomic_inc(&vcpu->arch.nmi_queued);
 748        kvm_make_request(KVM_REQ_NMI, vcpu);
 749}
 750EXPORT_SYMBOL_GPL(kvm_inject_nmi);
 751
 752void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 753{
 754        kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false);
 755}
 756EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
 757
 758void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 759{
 760        kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true);
 761}
 762EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
 763
 764/*
  765 * Check whether cpl <= required_cpl; if so, return true.  Otherwise queue
 766 * a #GP and return false.
 767 */
 768bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
 769{
 770        if (static_call(kvm_x86_get_cpl)(vcpu) <= required_cpl)
 771                return true;
 772        kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
 773        return false;
 774}
 775EXPORT_SYMBOL_GPL(kvm_require_cpl);
 776
 777bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
 778{
 779        if ((dr != 4 && dr != 5) || !kvm_read_cr4_bits(vcpu, X86_CR4_DE))
 780                return true;
 781
 782        kvm_queue_exception(vcpu, UD_VECTOR);
 783        return false;
 784}
 785EXPORT_SYMBOL_GPL(kvm_require_dr);
 786
 787/*
  788 * Read from the physical memory of the currently running guest.  Unlike
  789 * kvm_vcpu_read_guest_page, this function can read either from the guest's
  790 * physical memory or from the guest's guest (nested) physical memory.
 791 */
 792int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 793                            gfn_t ngfn, void *data, int offset, int len,
 794                            u32 access)
 795{
 796        struct x86_exception exception;
 797        gfn_t real_gfn;
 798        gpa_t ngpa;
 799
 800        ngpa     = gfn_to_gpa(ngfn);
 801        real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception);
 802        if (real_gfn == UNMAPPED_GVA)
 803                return -EFAULT;
 804
 805        real_gfn = gpa_to_gfn(real_gfn);
 806
 807        return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len);
 808}
 809EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
 810
 811static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
 812{
 813        return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2);
 814}
 815
 816/*
 817 * Load the pae pdptrs.  Return 1 if they are all valid, 0 otherwise.
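      *
      * Note: in PAE paging CR3 bits 31:5 give the 32-byte-aligned physical
      * address of the four PDPTEs, so the table may start at any 32-byte
      * boundary within a page; "offset" below is that starting position
      * expressed in u64 entries (i.e. offset * sizeof(u64) bytes into the page).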
 818 */
 819int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
 820{
 821        gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
 822        unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
 823        int i;
 824        int ret;
 825        u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
 826
 827        ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
 828                                      offset * sizeof(u64), sizeof(pdpte),
 829                                      PFERR_USER_MASK|PFERR_WRITE_MASK);
 830        if (ret < 0) {
 831                ret = 0;
 832                goto out;
 833        }
 834        for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
 835                if ((pdpte[i] & PT_PRESENT_MASK) &&
 836                    (pdpte[i] & pdptr_rsvd_bits(vcpu))) {
 837                        ret = 0;
 838                        goto out;
 839                }
 840        }
 841        ret = 1;
 842
 843        memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
 844        kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
 845        vcpu->arch.pdptrs_from_userspace = false;
 846
 847out:
 848
 849        return ret;
 850}
 851EXPORT_SYMBOL_GPL(load_pdptrs);
 852
 853void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
 854{
 855        if ((cr0 ^ old_cr0) & X86_CR0_PG) {
 856                kvm_clear_async_pf_completion_queue(vcpu);
 857                kvm_async_pf_hash_reset(vcpu);
 858        }
 859
 860        if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS)
 861                kvm_mmu_reset_context(vcpu);
 862
 863        if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
 864            kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
 865            !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
 866                kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
 867}
 868EXPORT_SYMBOL_GPL(kvm_post_set_cr0);
 869
 870int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 871{
 872        unsigned long old_cr0 = kvm_read_cr0(vcpu);
 873        unsigned long pdptr_bits = X86_CR0_CD | X86_CR0_NW | X86_CR0_PG;
 874
 875        cr0 |= X86_CR0_ET;
 876
 877#ifdef CONFIG_X86_64
 878        if (cr0 & 0xffffffff00000000UL)
 879                return 1;
 880#endif
 881
 882        cr0 &= ~CR0_RESERVED_BITS;
 883
 884        if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
 885                return 1;
 886
 887        if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
 888                return 1;
 889
 890#ifdef CONFIG_X86_64
 891        if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) &&
 892            (cr0 & X86_CR0_PG)) {
 893                int cs_db, cs_l;
 894
 895                if (!is_pae(vcpu))
 896                        return 1;
 897                static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
 898                if (cs_l)
 899                        return 1;
 900        }
 901#endif
 902        if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) &&
 903            is_pae(vcpu) && ((cr0 ^ old_cr0) & pdptr_bits) &&
 904            !load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)))
 905                return 1;
 906
 907        if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
 908                return 1;
 909
 910        static_call(kvm_x86_set_cr0)(vcpu, cr0);
 911
 912        kvm_post_set_cr0(vcpu, old_cr0, cr0);
 913
 914        return 0;
 915}
 916EXPORT_SYMBOL_GPL(kvm_set_cr0);
 917
 918void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 919{
 920        (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
 921}
 922EXPORT_SYMBOL_GPL(kvm_lmsw);
 923
 924void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
 925{
 926        if (vcpu->arch.guest_state_protected)
 927                return;
 928
 929        if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
 930
 931                if (vcpu->arch.xcr0 != host_xcr0)
 932                        xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
 933
 934                if (vcpu->arch.xsaves_enabled &&
 935                    vcpu->arch.ia32_xss != host_xss)
 936                        wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
 937        }
 938
 939        if (static_cpu_has(X86_FEATURE_PKU) &&
 940            (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
 941             (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) &&
 942            vcpu->arch.pkru != vcpu->arch.host_pkru)
 943                write_pkru(vcpu->arch.pkru);
 944}
 945EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
 946
 947void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
 948{
 949        if (vcpu->arch.guest_state_protected)
 950                return;
 951
 952        if (static_cpu_has(X86_FEATURE_PKU) &&
 953            (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
 954             (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) {
 955                vcpu->arch.pkru = rdpkru();
 956                if (vcpu->arch.pkru != vcpu->arch.host_pkru)
 957                        write_pkru(vcpu->arch.host_pkru);
 958        }
 959
 960        if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
 961
 962                if (vcpu->arch.xcr0 != host_xcr0)
 963                        xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
 964
 965                if (vcpu->arch.xsaves_enabled &&
 966                    vcpu->arch.ia32_xss != host_xss)
 967                        wrmsrl(MSR_IA32_XSS, host_xss);
 968        }
 969
 970}
 971EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state);
 972
 973static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 974{
 975        u64 xcr0 = xcr;
 976        u64 old_xcr0 = vcpu->arch.xcr0;
 977        u64 valid_bits;
 978
 979        /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now  */
 980        if (index != XCR_XFEATURE_ENABLED_MASK)
 981                return 1;
 982        if (!(xcr0 & XFEATURE_MASK_FP))
 983                return 1;
 984        if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE))
 985                return 1;
 986
 987        /*
 988         * Do not allow the guest to set bits that we do not support
 989         * saving.  However, xcr0 bit 0 is always set, even if the
 990         * emulated CPU does not support XSAVE (see fx_init).
 991         */
 992        valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
 993        if (xcr0 & ~valid_bits)
 994                return 1;
 995
 996        if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) !=
 997            (!(xcr0 & XFEATURE_MASK_BNDCSR)))
 998                return 1;
 999
1000        if (xcr0 & XFEATURE_MASK_AVX512) {
1001                if (!(xcr0 & XFEATURE_MASK_YMM))
1002                        return 1;
1003                if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
1004                        return 1;
1005        }
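             /*
              * Illustrative outcomes of the checks above: 0x2 (SSE without FP)
              * and 0x5 (FP|YMM without SSE) are rejected outright, while 0x7
              * (FP|SSE|YMM) is accepted only if SSE and YMM are present in
              * guest_supported_xcr0; AVX-512 state must be enabled as a whole
              * and only together with YMM.
              */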
1006        vcpu->arch.xcr0 = xcr0;
1007
1008        if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
1009                kvm_update_cpuid_runtime(vcpu);
1010        return 0;
1011}
1012
1013int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu)
1014{
1015        if (static_call(kvm_x86_get_cpl)(vcpu) != 0 ||
1016            __kvm_set_xcr(vcpu, kvm_rcx_read(vcpu), kvm_read_edx_eax(vcpu))) {
1017                kvm_inject_gp(vcpu, 0);
1018                return 1;
1019        }
1020
1021        return kvm_skip_emulated_instruction(vcpu);
1022}
1023EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv);
1024
1025bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1026{
1027        if (cr4 & cr4_reserved_bits)
1028                return false;
1029
1030        if (cr4 & vcpu->arch.cr4_guest_rsvd_bits)
1031                return false;
1032
1033        return static_call(kvm_x86_is_valid_cr4)(vcpu, cr4);
1034}
1035EXPORT_SYMBOL_GPL(kvm_is_valid_cr4);
1036
1037void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4)
1038{
1039        if (((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS) ||
1040            (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
1041                kvm_mmu_reset_context(vcpu);
1042}
1043EXPORT_SYMBOL_GPL(kvm_post_set_cr4);
1044
1045int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1046{
1047        unsigned long old_cr4 = kvm_read_cr4(vcpu);
1048        unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
1049                                   X86_CR4_SMEP;
1050
1051        if (!kvm_is_valid_cr4(vcpu, cr4))
1052                return 1;
1053
1054        if (is_long_mode(vcpu)) {
1055                if (!(cr4 & X86_CR4_PAE))
1056                        return 1;
1057                if ((cr4 ^ old_cr4) & X86_CR4_LA57)
1058                        return 1;
1059        } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
1060                   && ((cr4 ^ old_cr4) & pdptr_bits)
1061                   && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
1062                                   kvm_read_cr3(vcpu)))
1063                return 1;
1064
1065        if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
1066                if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
1067                        return 1;
1068
 1069                /* PCID cannot be enabled when cr3[11:0] != 000H or EFER.LMA = 0 */
1070                if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
1071                        return 1;
1072        }
1073
1074        static_call(kvm_x86_set_cr4)(vcpu, cr4);
1075
1076        kvm_post_set_cr4(vcpu, old_cr4, cr4);
1077
1078        return 0;
1079}
1080EXPORT_SYMBOL_GPL(kvm_set_cr4);
1081
1082static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
1083{
1084        struct kvm_mmu *mmu = vcpu->arch.mmu;
1085        unsigned long roots_to_free = 0;
1086        int i;
1087
1088        /*
1089         * If neither the current CR3 nor any of the prev_roots use the given
1090         * PCID, then nothing needs to be done here because a resync will
1091         * happen anyway before switching to any other CR3.
1092         */
1093        if (kvm_get_active_pcid(vcpu) == pcid) {
1094                kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
1095                kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
1096        }
1097
1098        for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
1099                if (kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd) == pcid)
1100                        roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
1101
1102        kvm_mmu_free_roots(vcpu, mmu, roots_to_free);
1103}
1104
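     /*
      * With CR4.PCIDE=1, CR3 bits 11:0 hold the PCID and CR3 bit 63
      * (X86_CR3_PCID_NOFLUSH) requests that TLB entries for that PCID not be
      * flushed; the NOFLUSH bit is stripped below before the value is stored.
      */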
1105int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1106{
1107        bool skip_tlb_flush = false;
1108        unsigned long pcid = 0;
1109#ifdef CONFIG_X86_64
1110        bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
1111
1112        if (pcid_enabled) {
1113                skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
1114                cr3 &= ~X86_CR3_PCID_NOFLUSH;
1115                pcid = cr3 & X86_CR3_PCID_MASK;
1116        }
1117#endif
1118
1119        /* PDPTRs are always reloaded for PAE paging. */
1120        if (cr3 == kvm_read_cr3(vcpu) && !is_pae_paging(vcpu))
1121                goto handle_tlb_flush;
1122
1123        /*
1124         * Do not condition the GPA check on long mode, this helper is used to
1125         * stuff CR3, e.g. for RSM emulation, and there is no guarantee that
1126         * the current vCPU mode is accurate.
1127         */
1128        if (kvm_vcpu_is_illegal_gpa(vcpu, cr3))
1129                return 1;
1130
1131        if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
1132                return 1;
1133
1134        if (cr3 != kvm_read_cr3(vcpu))
1135                kvm_mmu_new_pgd(vcpu, cr3);
1136
1137        vcpu->arch.cr3 = cr3;
1138        kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
1139
1140handle_tlb_flush:
1141        /*
1142         * A load of CR3 that flushes the TLB flushes only the current PCID,
1143         * even if PCID is disabled, in which case PCID=0 is flushed.  It's a
1144         * moot point in the end because _disabling_ PCID will flush all PCIDs,
1145         * and it's impossible to use a non-zero PCID when PCID is disabled,
1146         * i.e. only PCID=0 can be relevant.
1147         */
1148        if (!skip_tlb_flush)
1149                kvm_invalidate_pcid(vcpu, pcid);
1150
1151        return 0;
1152}
1153EXPORT_SYMBOL_GPL(kvm_set_cr3);
1154
1155int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
1156{
1157        if (cr8 & CR8_RESERVED_BITS)
1158                return 1;
1159        if (lapic_in_kernel(vcpu))
1160                kvm_lapic_set_tpr(vcpu, cr8);
1161        else
1162                vcpu->arch.cr8 = cr8;
1163        return 0;
1164}
1165EXPORT_SYMBOL_GPL(kvm_set_cr8);
1166
1167unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
1168{
1169        if (lapic_in_kernel(vcpu))
1170                return kvm_lapic_get_cr8(vcpu);
1171        else
1172                return vcpu->arch.cr8;
1173}
1174EXPORT_SYMBOL_GPL(kvm_get_cr8);
1175
1176static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
1177{
1178        int i;
1179
1180        if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
1181                for (i = 0; i < KVM_NR_DB_REGS; i++)
1182                        vcpu->arch.eff_db[i] = vcpu->arch.db[i];
1183                vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
1184        }
1185}
1186
1187void kvm_update_dr7(struct kvm_vcpu *vcpu)
1188{
1189        unsigned long dr7;
1190
1191        if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1192                dr7 = vcpu->arch.guest_debug_dr7;
1193        else
1194                dr7 = vcpu->arch.dr7;
1195        static_call(kvm_x86_set_dr7)(vcpu, dr7);
1196        vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
1197        if (dr7 & DR7_BP_EN_MASK)
1198                vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
1199}
1200EXPORT_SYMBOL_GPL(kvm_update_dr7);
1201
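     /*
      * DR6 bits for features the guest's CPUID does not advertise (RTM,
      * bus-lock detect) are treated as reserved and forced to 1, matching
      * hardware that lacks the corresponding feature.
      */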
1202static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
1203{
1204        u64 fixed = DR6_FIXED_1;
1205
1206        if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
1207                fixed |= DR6_RTM;
1208
1209        if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
1210                fixed |= DR6_BUS_LOCK;
1211        return fixed;
1212}
1213
1214int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
1215{
1216        size_t size = ARRAY_SIZE(vcpu->arch.db);
1217
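             /*
              * DR4 and DR5 are expected to reach this point only with CR4.DE
              * clear (callers reject them with #UD via kvm_require_dr()
              * otherwise); here they simply alias DR6 and DR7.
              */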
1218        switch (dr) {
1219        case 0 ... 3:
1220                vcpu->arch.db[array_index_nospec(dr, size)] = val;
1221                if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
1222                        vcpu->arch.eff_db[dr] = val;
1223                break;
1224        case 4:
1225        case 6:
1226                if (!kvm_dr6_valid(val))
1227                        return 1; /* #GP */
1228                vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
1229                break;
1230        case 5:
1231        default: /* 7 */
1232                if (!kvm_dr7_valid(val))
1233                        return 1; /* #GP */
1234                vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
1235                kvm_update_dr7(vcpu);
1236                break;
1237        }
1238
1239        return 0;
1240}
1241EXPORT_SYMBOL_GPL(kvm_set_dr);
1242
1243void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
1244{
1245        size_t size = ARRAY_SIZE(vcpu->arch.db);
1246
1247        switch (dr) {
1248        case 0 ... 3:
1249                *val = vcpu->arch.db[array_index_nospec(dr, size)];
1250                break;
1251        case 4:
1252        case 6:
1253                *val = vcpu->arch.dr6;
1254                break;
1255        case 5:
1256        default: /* 7 */
1257                *val = vcpu->arch.dr7;
1258                break;
1259        }
1260}
1261EXPORT_SYMBOL_GPL(kvm_get_dr);
1262
1263int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu)
1264{
1265        u32 ecx = kvm_rcx_read(vcpu);
1266        u64 data;
1267
1268        if (kvm_pmu_rdpmc(vcpu, ecx, &data)) {
1269                kvm_inject_gp(vcpu, 0);
1270                return 1;
1271        }
1272
1273        kvm_rax_write(vcpu, (u32)data);
1274        kvm_rdx_write(vcpu, data >> 32);
1275        return kvm_skip_emulated_instruction(vcpu);
1276}
1277EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc);
1278
1279/*
 1280 * List of MSR numbers which we expose to userspace through KVM_GET_MSRS,
 1281 * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
 1282 *
 1283 * The three MSR lists (msrs_to_save, emulated_msrs, msr_based_features)
 1284 * extract the supported MSRs from the related const lists.
 1285 * msrs_to_save is selected from msrs_to_save_all to reflect the
 1286 * capabilities of the host CPU. This capabilities test skips MSRs that are
 1287 * kvm-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs
 1288 * may depend on host virtualization features rather than host CPU features.
1289 */
1290
1291static const u32 msrs_to_save_all[] = {
1292        MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
1293        MSR_STAR,
1294#ifdef CONFIG_X86_64
1295        MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
1296#endif
1297        MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
1298        MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
1299        MSR_IA32_SPEC_CTRL,
1300        MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
1301        MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
1302        MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
1303        MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
1304        MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
1305        MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
1306        MSR_IA32_UMWAIT_CONTROL,
1307
1308        MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
1309        MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3,
1310        MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
1311        MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
1312        MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
1313        MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
1314        MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
1315        MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
1316        MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9,
1317        MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11,
1318        MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13,
1319        MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15,
1320        MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17,
1321        MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
1322        MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
1323        MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
1324        MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,
1325        MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9,
1326        MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11,
1327        MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
1328        MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
1329        MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
1330};
1331
1332static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)];
1333static unsigned num_msrs_to_save;
1334
1335static const u32 emulated_msrs_all[] = {
1336        MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
1337        MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
1338        HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
1339        HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
1340        HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
1341        HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
1342        HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
1343        HV_X64_MSR_RESET,
1344        HV_X64_MSR_VP_INDEX,
1345        HV_X64_MSR_VP_RUNTIME,
1346        HV_X64_MSR_SCONTROL,
1347        HV_X64_MSR_STIMER0_CONFIG,
1348        HV_X64_MSR_VP_ASSIST_PAGE,
1349        HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
1350        HV_X64_MSR_TSC_EMULATION_STATUS,
1351        HV_X64_MSR_SYNDBG_OPTIONS,
1352        HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS,
1353        HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER,
1354        HV_X64_MSR_SYNDBG_PENDING_BUFFER,
1355
1356        MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
1357        MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
1358
1359        MSR_IA32_TSC_ADJUST,
1360        MSR_IA32_TSC_DEADLINE,
1361        MSR_IA32_ARCH_CAPABILITIES,
1362        MSR_IA32_PERF_CAPABILITIES,
1363        MSR_IA32_MISC_ENABLE,
1364        MSR_IA32_MCG_STATUS,
1365        MSR_IA32_MCG_CTL,
1366        MSR_IA32_MCG_EXT_CTL,
1367        MSR_IA32_SMBASE,
1368        MSR_SMI_COUNT,
1369        MSR_PLATFORM_INFO,
1370        MSR_MISC_FEATURES_ENABLES,
1371        MSR_AMD64_VIRT_SPEC_CTRL,
1372        MSR_IA32_POWER_CTL,
1373        MSR_IA32_UCODE_REV,
1374
1375        /*
1376         * The following list leaves out MSRs whose values are determined
1377         * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs.
1378         * We always support the "true" VMX control MSRs, even if the host
1379         * processor does not, so I am putting these registers here rather
1380         * than in msrs_to_save_all.
1381         */
1382        MSR_IA32_VMX_BASIC,
1383        MSR_IA32_VMX_TRUE_PINBASED_CTLS,
1384        MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
1385        MSR_IA32_VMX_TRUE_EXIT_CTLS,
1386        MSR_IA32_VMX_TRUE_ENTRY_CTLS,
1387        MSR_IA32_VMX_MISC,
1388        MSR_IA32_VMX_CR0_FIXED0,
1389        MSR_IA32_VMX_CR4_FIXED0,
1390        MSR_IA32_VMX_VMCS_ENUM,
1391        MSR_IA32_VMX_PROCBASED_CTLS2,
1392        MSR_IA32_VMX_EPT_VPID_CAP,
1393        MSR_IA32_VMX_VMFUNC,
1394
1395        MSR_K7_HWCR,
1396        MSR_KVM_POLL_CONTROL,
1397};
1398
1399static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
1400static unsigned num_emulated_msrs;
1401
1402/*
1403 * List of msr numbers which are used to expose MSR-based features that
1404 * can be used by a hypervisor to validate requested CPU features.
1405 */
1406static const u32 msr_based_features_all[] = {
1407        MSR_IA32_VMX_BASIC,
1408        MSR_IA32_VMX_TRUE_PINBASED_CTLS,
1409        MSR_IA32_VMX_PINBASED_CTLS,
1410        MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
1411        MSR_IA32_VMX_PROCBASED_CTLS,
1412        MSR_IA32_VMX_TRUE_EXIT_CTLS,
1413        MSR_IA32_VMX_EXIT_CTLS,
1414        MSR_IA32_VMX_TRUE_ENTRY_CTLS,
1415        MSR_IA32_VMX_ENTRY_CTLS,
1416        MSR_IA32_VMX_MISC,
1417        MSR_IA32_VMX_CR0_FIXED0,
1418        MSR_IA32_VMX_CR0_FIXED1,
1419        MSR_IA32_VMX_CR4_FIXED0,
1420        MSR_IA32_VMX_CR4_FIXED1,
1421        MSR_IA32_VMX_VMCS_ENUM,
1422        MSR_IA32_VMX_PROCBASED_CTLS2,
1423        MSR_IA32_VMX_EPT_VPID_CAP,
1424        MSR_IA32_VMX_VMFUNC,
1425
1426        MSR_F10H_DECFG,
1427        MSR_IA32_UCODE_REV,
1428        MSR_IA32_ARCH_CAPABILITIES,
1429        MSR_IA32_PERF_CAPABILITIES,
1430};
1431
1432static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)];
1433static unsigned int num_msr_based_features;
1434
1435static u64 kvm_get_arch_capabilities(void)
1436{
1437        u64 data = 0;
1438
1439        if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
1440                rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data);
1441
1442        /*
1443         * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
1444         * the nested hypervisor runs with NX huge pages.  If it is not,
1445         * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other
1446         * L1 guests, so it need not worry about its own (L2) guests.
1447         */
1448        data |= ARCH_CAP_PSCHANGE_MC_NO;
1449
1450        /*
1451         * If we're doing cache flushes (either "always" or "cond")
1452         * we will do one whenever the guest does a vmlaunch/vmresume.
1453         * If an outer hypervisor is doing the cache flush for us
1454         * (VMENTER_L1D_FLUSH_NESTED_VM), we can safely pass that
1455         * capability to the guest too, and if EPT is disabled we're not
1456         * vulnerable.  Overall, only VMENTER_L1D_FLUSH_NEVER will
1457         * require a nested hypervisor to do a flush of its own.
1458         */
1459        if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
1460                data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;
1461
1462        if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
1463                data |= ARCH_CAP_RDCL_NO;
1464        if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
1465                data |= ARCH_CAP_SSB_NO;
1466        if (!boot_cpu_has_bug(X86_BUG_MDS))
1467                data |= ARCH_CAP_MDS_NO;
1468
1469        if (!boot_cpu_has(X86_FEATURE_RTM)) {
1470                /*
1471                 * If RTM=0 because the kernel has disabled TSX, the host might
1472                 * have TAA_NO or TSX_CTRL.  Clear TAA_NO (the guest sees RTM=0
1473                 * and therefore knows that there cannot be TAA) but keep
1474                 * TSX_CTRL: some buggy userspaces leave it set on tsx=on hosts,
1475                 * and we want to allow migrating those guests to tsx=off hosts.
1476                 */
1477                data &= ~ARCH_CAP_TAA_NO;
1478        } else if (!boot_cpu_has_bug(X86_BUG_TAA)) {
1479                data |= ARCH_CAP_TAA_NO;
1480        } else {
1481                /*
1482                 * Nothing to do here; we emulate TSX_CTRL if present on the
1483                 * host so the guest can choose between disabling TSX or
1484                 * using VERW to clear CPU buffers.
1485                 */
1486        }
1487
1488        return data;
1489}
1490
1491static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
1492{
1493        switch (msr->index) {
1494        case MSR_IA32_ARCH_CAPABILITIES:
1495                msr->data = kvm_get_arch_capabilities();
1496                break;
1497        case MSR_IA32_UCODE_REV:
1498                rdmsrl_safe(msr->index, &msr->data);
1499                break;
1500        default:
1501                return static_call(kvm_x86_get_msr_feature)(msr);
1502        }
1503        return 0;
1504}
1505
1506static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1507{
1508        struct kvm_msr_entry msr;
1509        int r;
1510
1511        msr.index = index;
1512        r = kvm_get_msr_feature(&msr);
1513
1514        if (r == KVM_MSR_RET_INVALID) {
1515                /* Unconditionally clear the output for simplicity */
1516                *data = 0;
1517                if (kvm_msr_ignored_check(index, 0, false))
1518                        r = 0;
1519        }
1520
1521        if (r)
1522                return r;
1523
1524        *data = msr.data;
1525
1526        return 0;
1527}
1528
1529static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
1530{
1531        if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT))
1532                return false;
1533
1534        if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM))
1535                return false;
1536
1537        if (efer & (EFER_LME | EFER_LMA) &&
1538            !guest_cpuid_has(vcpu, X86_FEATURE_LM))
1539                return false;
1540
1541        if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX))
1542                return false;
1543
1544        return true;
1545}
1546
1547bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
1548{
1549        if (efer & efer_reserved_bits)
1550                return false;
1551
1552        return __kvm_valid_efer(vcpu, efer);
1553}
1554EXPORT_SYMBOL_GPL(kvm_valid_efer);
1555
1556static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1557{
1558        u64 old_efer = vcpu->arch.efer;
1559        u64 efer = msr_info->data;
1560        int r;
1561
1562        if (efer & efer_reserved_bits)
1563                return 1;
1564
1565        if (!msr_info->host_initiated) {
1566                if (!__kvm_valid_efer(vcpu, efer))
1567                        return 1;
1568
1569                if (is_paging(vcpu) &&
1570                    (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
1571                        return 1;
1572        }
1573
1574        efer &= ~EFER_LMA;
1575        efer |= vcpu->arch.efer & EFER_LMA;
1576
1577        r = static_call(kvm_x86_set_efer)(vcpu, efer);
1578        if (r) {
1579                WARN_ON(r > 0);
1580                return r;
1581        }
1582
1583        /* Update reserved bits */
1584        if ((efer ^ old_efer) & EFER_NX)
1585                kvm_mmu_reset_context(vcpu);
1586
1587        return 0;
1588}
1589
1590void kvm_enable_efer_bits(u64 mask)
1591{
1592        efer_reserved_bits &= ~mask;
1593}
1594EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
1595
1596bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type)
1597{
1598        struct kvm_x86_msr_filter *msr_filter;
1599        struct msr_bitmap_range *ranges;
1600        struct kvm *kvm = vcpu->kvm;
1601        bool allowed;
1602        int idx;
1603        u32 i;
1604
1605        /* x2APIC MSRs do not support filtering. */
1606        if (index >= 0x800 && index <= 0x8ff)
1607                return true;
1608
1609        idx = srcu_read_lock(&kvm->srcu);
1610
1611        msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu);
1612        if (!msr_filter) {
1613                allowed = true;
1614                goto out;
1615        }
1616
1617        allowed = msr_filter->default_allow;
1618        ranges = msr_filter->ranges;
1619
1620        for (i = 0; i < msr_filter->count; i++) {
1621                u32 start = ranges[i].base;
1622                u32 end = start + ranges[i].nmsrs;
1623                u32 flags = ranges[i].flags;
1624                unsigned long *bitmap = ranges[i].bitmap;
1625
1626                if ((index >= start) && (index < end) && (flags & type)) {
1627                        allowed = !!test_bit(index - start, bitmap);
1628                        break;
1629                }
1630        }
1631
1632out:
1633        srcu_read_unlock(&kvm->srcu, idx);
1634
1635        return allowed;
1636}
1637EXPORT_SYMBOL_GPL(kvm_msr_allowed);
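
/*
 * Worked example (hypothetical filter): a range with base = 0xc0000000,
 * nmsrs = 0x2000 and flags = KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE
 * covers indices [0xc0000000, 0xc0002000); an access to MSR_EFER
 * (0xc0000080) is then decided by bit 0x80 of that range's bitmap.
 */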
1638
1639/*
1640 * Write @data into the MSR specified by @index.  Selected MSR-specific fault
1641 * checks are bypassed if @host_initiated is %true.
1642 * Returns 0 on success, non-0 otherwise.
1643 * Assumes vcpu_load() was already called.
1644 */
1645static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
1646                         bool host_initiated)
1647{
1648        struct msr_data msr;
1649
1650        if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
1651                return KVM_MSR_RET_FILTERED;
1652
1653        switch (index) {
1654        case MSR_FS_BASE:
1655        case MSR_GS_BASE:
1656        case MSR_KERNEL_GS_BASE:
1657        case MSR_CSTAR:
1658        case MSR_LSTAR:
1659                if (is_noncanonical_address(data, vcpu))
1660                        return 1;
1661                break;
1662        case MSR_IA32_SYSENTER_EIP:
1663        case MSR_IA32_SYSENTER_ESP:
1664                /*
1665                 * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if
1666                 * non-canonical address is written on Intel but not on
1667                 * AMD (which ignores the top 32-bits, because it does
1668                 * not implement 64-bit SYSENTER).
1669                 *
1670                 * 64-bit code should hence be able to write a non-canonical
1671                 * value on AMD.  Making the address canonical ensures that
1672                 * vmentry does not fail on Intel after writing a non-canonical
1673                 * value, and that something deterministic happens if the guest
1674                 * invokes 64-bit SYSENTER.
1675                 */
1676                data = get_canonical(data, vcpu_virt_addr_bits(vcpu));
1677                break;
1678        case MSR_TSC_AUX:
1679                if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
1680                        return 1;
1681
1682                if (!host_initiated &&
1683                    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) &&
1684                    !guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
1685                        return 1;
1686
1687                /*
1688                 * Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has
1689                 * incomplete and conflicting architectural behavior.  Current
1690                 * AMD CPUs completely ignore bits 63:32, i.e. they aren't
1691                 * reserved and always read as zeros.  Enforce Intel's reserved
1692                 * bits check if and only if the guest CPU is Intel, and clear
1693                 * the bits in all other cases.  This ensures cross-vendor
1694                 * migration will provide consistent behavior for the guest.
1695                 */
1696                if (guest_cpuid_is_intel(vcpu) && (data >> 32) != 0)
1697                        return 1;
1698
1699                data = (u32)data;
1700                break;
1701        }
1702
1703        msr.data = data;
1704        msr.index = index;
1705        msr.host_initiated = host_initiated;
1706
1707        return static_call(kvm_x86_set_msr)(vcpu, &msr);
1708}
1709
1710static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
1711                                     u32 index, u64 data, bool host_initiated)
1712{
1713        int ret = __kvm_set_msr(vcpu, index, data, host_initiated);
1714
1715        if (ret == KVM_MSR_RET_INVALID)
1716                if (kvm_msr_ignored_check(index, data, true))
1717                        ret = 0;
1718
1719        return ret;
1720}
1721
1722/*
1723 * Read the MSR specified by @index into @data.  Selected MSR-specific fault
1724 * checks are bypassed if @host_initiated is %true.
1725 * Returns 0 on success, non-0 otherwise.
1726 * Assumes vcpu_load() was already called.
1727 */
1728int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
1729                  bool host_initiated)
1730{
1731        struct msr_data msr;
1732        int ret;
1733
1734        if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
1735                return KVM_MSR_RET_FILTERED;
1736
1737        switch (index) {
1738        case MSR_TSC_AUX:
1739                if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
1740                        return 1;
1741
1742                if (!host_initiated &&
1743                    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) &&
1744                    !guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
1745                        return 1;
1746                break;
1747        }
1748
1749        msr.index = index;
1750        msr.host_initiated = host_initiated;
1751
1752        ret = static_call(kvm_x86_get_msr)(vcpu, &msr);
1753        if (!ret)
1754                *data = msr.data;
1755        return ret;
1756}
1757
1758static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
1759                                     u32 index, u64 *data, bool host_initiated)
1760{
1761        int ret = __kvm_get_msr(vcpu, index, data, host_initiated);
1762
1763        if (ret == KVM_MSR_RET_INVALID) {
1764                /* Unconditionally clear *data for simplicity */
1765                *data = 0;
1766                if (kvm_msr_ignored_check(index, 0, false))
1767                        ret = 0;
1768        }
1769
1770        return ret;
1771}
1772
1773int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
1774{
1775        return kvm_get_msr_ignored_check(vcpu, index, data, false);
1776}
1777EXPORT_SYMBOL_GPL(kvm_get_msr);
1778
1779int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
1780{
1781        return kvm_set_msr_ignored_check(vcpu, index, data, false);
1782}
1783EXPORT_SYMBOL_GPL(kvm_set_msr);
1784
1785static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
1786{
1787        int err = vcpu->run->msr.error;
1788        if (!err) {
1789                kvm_rax_write(vcpu, (u32)vcpu->run->msr.data);
1790                kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
1791        }
1792
1793        return static_call(kvm_x86_complete_emulated_msr)(vcpu, err);
1794}
1795
1796static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu)
1797{
1798        return static_call(kvm_x86_complete_emulated_msr)(vcpu, vcpu->run->msr.error);
1799}
1800
1801static u64 kvm_msr_reason(int r)
1802{
1803        switch (r) {
1804        case KVM_MSR_RET_INVALID:
1805                return KVM_MSR_EXIT_REASON_UNKNOWN;
1806        case KVM_MSR_RET_FILTERED:
1807                return KVM_MSR_EXIT_REASON_FILTER;
1808        default:
1809                return KVM_MSR_EXIT_REASON_INVAL;
1810        }
1811}
1812
1813static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index,
1814                              u32 exit_reason, u64 data,
1815                              int (*completion)(struct kvm_vcpu *vcpu),
1816                              int r)
1817{
1818        u64 msr_reason = kvm_msr_reason(r);
1819
1820        /* Check if the user wanted to know about this MSR fault */
1821        if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason))
1822                return 0;
1823
1824        vcpu->run->exit_reason = exit_reason;
1825        vcpu->run->msr.error = 0;
1826        memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad));
1827        vcpu->run->msr.reason = msr_reason;
1828        vcpu->run->msr.index = index;
1829        vcpu->run->msr.data = data;
1830        vcpu->arch.complete_userspace_io = completion;
1831
1832        return 1;
1833}
1834
1835static int kvm_get_msr_user_space(struct kvm_vcpu *vcpu, u32 index, int r)
1836{
1837        return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_RDMSR, 0,
1838                                   complete_emulated_rdmsr, r);
1839}
1840
1841static int kvm_set_msr_user_space(struct kvm_vcpu *vcpu, u32 index, u64 data, int r)
1842{
1843        return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_WRMSR, data,
1844                                   complete_emulated_wrmsr, r);
1845}
1846
1847int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
1848{
1849        u32 ecx = kvm_rcx_read(vcpu);
1850        u64 data;
1851        int r;
1852
1853        r = kvm_get_msr(vcpu, ecx, &data);
1854
1855        /* MSR read failed? See if we should ask user space */
1856        if (r && kvm_get_msr_user_space(vcpu, ecx, r)) {
1857                /* Bounce to user space */
1858                return 0;
1859        }
1860
1861        if (!r) {
1862                trace_kvm_msr_read(ecx, data);
1863
1864                kvm_rax_write(vcpu, data & -1u);
1865                kvm_rdx_write(vcpu, (data >> 32) & -1u);
1866        } else {
1867                trace_kvm_msr_read_ex(ecx);
1868        }
1869
1870        return static_call(kvm_x86_complete_emulated_msr)(vcpu, r);
1871}
1872EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr);
1873
1874int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
1875{
1876        u32 ecx = kvm_rcx_read(vcpu);
1877        u64 data = kvm_read_edx_eax(vcpu);
1878        int r;
1879
1880        r = kvm_set_msr(vcpu, ecx, data);
1881
1882        /* MSR write failed? See if we should ask user space */
1883        if (r && kvm_set_msr_user_space(vcpu, ecx, data, r))
1884                /* Bounce to user space */
1885                return 0;
1886
1887        /* Signal all other negative errors to userspace */
1888        if (r < 0)
1889                return r;
1890
1891        if (!r)
1892                trace_kvm_msr_write(ecx, data);
1893        else
1894                trace_kvm_msr_write_ex(ecx, data);
1895
1896        return static_call(kvm_x86_complete_emulated_msr)(vcpu, r);
1897}
1898EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
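
/*
 * Userspace side (illustrative): when one of the accesses above bounces to
 * userspace, KVM_RUN returns with exit_reason KVM_EXIT_X86_RDMSR or
 * KVM_EXIT_X86_WRMSR.  The VMM is expected to fill in run->msr.data (for a
 * read) and run->msr.error (non-zero if the access should #GP) before the
 * next KVM_RUN; the complete_emulated_{rd,wr}msr() callbacks then pick the
 * result up.
 */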
1899
1900int kvm_emulate_as_nop(struct kvm_vcpu *vcpu)
1901{
1902        return kvm_skip_emulated_instruction(vcpu);
1903}
1904EXPORT_SYMBOL_GPL(kvm_emulate_as_nop);
1905
1906int kvm_emulate_invd(struct kvm_vcpu *vcpu)
1907{
1908        /* Treat an INVD instruction as a NOP and just skip it. */
1909        return kvm_emulate_as_nop(vcpu);
1910}
1911EXPORT_SYMBOL_GPL(kvm_emulate_invd);
1912
1913int kvm_emulate_mwait(struct kvm_vcpu *vcpu)
1914{
1915        pr_warn_once("kvm: MWAIT instruction emulated as NOP!\n");
1916        return kvm_emulate_as_nop(vcpu);
1917}
1918EXPORT_SYMBOL_GPL(kvm_emulate_mwait);
1919
1920int kvm_handle_invalid_op(struct kvm_vcpu *vcpu)
1921{
1922        kvm_queue_exception(vcpu, UD_VECTOR);
1923        return 1;
1924}
1925EXPORT_SYMBOL_GPL(kvm_handle_invalid_op);
1926
1927int kvm_emulate_monitor(struct kvm_vcpu *vcpu)
1928{
1929        pr_warn_once("kvm: MONITOR instruction emulated as NOP!\n");
1930        return kvm_emulate_as_nop(vcpu);
1931}
1932EXPORT_SYMBOL_GPL(kvm_emulate_monitor);
1933
1934static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
1935{
1936        xfer_to_guest_mode_prepare();
1937        return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) ||
1938                xfer_to_guest_mode_work_pending();
1939}
1940
1941/*
1942 * The fast path for frequent and performance sensitive wrmsr emulation,
1943 * e.g. the sending of IPIs.  Handling the IPI early in the VM-Exit flow
1944 * reduces the latency of virtual IPIs by avoiding the expensive bits of
1945 * transitioning from guest to host, e.g. reacquiring KVM's SRCU lock, in
1946 * contrast to the other cases, which are handled only after host interrupts are enabled.
1947 */
1948static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data)
1949{
1950        if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic))
1951                return 1;
1952
1953        if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) &&
1954                ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
1955                ((data & APIC_MODE_MASK) == APIC_DM_FIXED) &&
1956                ((u32)(data >> 32) != X2APIC_BROADCAST)) {
1957
1958                data &= ~(1 << 12);     /* clear the ICR busy/delivery-status bit */
1959                kvm_apic_send_ipi(vcpu->arch.apic, (u32)data, (u32)(data >> 32));
1960                kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32));
1961                kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR, (u32)data);
1962                trace_kvm_apic_write(APIC_ICR, (u32)data);
1963                return 0;
1964        }
1965
1966        return 1;
1967}
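
/*
 * Example of a write that takes this fast path (illustrative): an x2APIC
 * ICR value such as
 *
 *	data = ((u64)dest_apic_id << 32) | APIC_DM_FIXED | vector;
 *
 * i.e. fixed delivery mode, physical destination, no shorthand and a
 * non-broadcast destination, is sent immediately via kvm_apic_send_ipi()
 * without leaving the VM-Exit fast path.
 */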
1968
1969static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data)
1970{
1971        if (!kvm_can_use_hv_timer(vcpu))
1972                return 1;
1973
1974        kvm_set_lapic_tscdeadline_msr(vcpu, data);
1975        return 0;
1976}
1977
1978fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
1979{
1980        u32 msr = kvm_rcx_read(vcpu);
1981        u64 data;
1982        fastpath_t ret = EXIT_FASTPATH_NONE;
1983
1984        switch (msr) {
1985        case APIC_BASE_MSR + (APIC_ICR >> 4):
1986                data = kvm_read_edx_eax(vcpu);
1987                if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) {
1988                        kvm_skip_emulated_instruction(vcpu);
1989                        ret = EXIT_FASTPATH_EXIT_HANDLED;
1990                }
1991                break;
1992        case MSR_IA32_TSC_DEADLINE:
1993                data = kvm_read_edx_eax(vcpu);
1994                if (!handle_fastpath_set_tscdeadline(vcpu, data)) {
1995                        kvm_skip_emulated_instruction(vcpu);
1996                        ret = EXIT_FASTPATH_REENTER_GUEST;
1997                }
1998                break;
1999        default:
2000                break;
2001        }
2002
2003        if (ret != EXIT_FASTPATH_NONE)
2004                trace_kvm_msr_write(msr, data);
2005
2006        return ret;
2007}
2008EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff);
2009
2010/*
2011 * Adapt set_msr() and get_msr() to msr_io()'s calling convention
2012 */
2013static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
2014{
2015        return kvm_get_msr_ignored_check(vcpu, index, data, true);
2016}
2017
2018static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
2019{
2020        return kvm_set_msr_ignored_check(vcpu, index, *data, true);
2021}
2022
2023#ifdef CONFIG_X86_64
2024struct pvclock_clock {
2025        int vclock_mode;
2026        u64 cycle_last;
2027        u64 mask;
2028        u32 mult;
2029        u32 shift;
2030        u64 base_cycles;
2031        u64 offset;
2032};
2033
2034struct pvclock_gtod_data {
2035        seqcount_t      seq;
2036
2037        struct pvclock_clock clock; /* extract of a clocksource struct */
2038        struct pvclock_clock raw_clock; /* extract of a clocksource struct */
2039
2040        ktime_t         offs_boot;
2041        u64             wall_time_sec;
2042};
2043
2044static struct pvclock_gtod_data pvclock_gtod_data;
2045
2046static void update_pvclock_gtod(struct timekeeper *tk)
2047{
2048        struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
2049
2050        write_seqcount_begin(&vdata->seq);
2051
2052        /* copy pvclock gtod data */
2053        vdata->clock.vclock_mode        = tk->tkr_mono.clock->vdso_clock_mode;
2054        vdata->clock.cycle_last         = tk->tkr_mono.cycle_last;
2055        vdata->clock.mask               = tk->tkr_mono.mask;
2056        vdata->clock.mult               = tk->tkr_mono.mult;
2057        vdata->clock.shift              = tk->tkr_mono.shift;
2058        vdata->clock.base_cycles        = tk->tkr_mono.xtime_nsec;
2059        vdata->clock.offset             = tk->tkr_mono.base;
2060
2061        vdata->raw_clock.vclock_mode    = tk->tkr_raw.clock->vdso_clock_mode;
2062        vdata->raw_clock.cycle_last     = tk->tkr_raw.cycle_last;
2063        vdata->raw_clock.mask           = tk->tkr_raw.mask;
2064        vdata->raw_clock.mult           = tk->tkr_raw.mult;
2065        vdata->raw_clock.shift          = tk->tkr_raw.shift;
2066        vdata->raw_clock.base_cycles    = tk->tkr_raw.xtime_nsec;
2067        vdata->raw_clock.offset         = tk->tkr_raw.base;
2068
2069        vdata->wall_time_sec            = tk->xtime_sec;
2070
2071        vdata->offs_boot                = tk->offs_boot;
2072
2073        write_seqcount_end(&vdata->seq);
2074}
2075
2076static s64 get_kvmclock_base_ns(void)
2077{
2078        /* Count up from boot time, but with the frequency of the raw clock.  */
2079        return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot));
2080}
2081#else
2082static s64 get_kvmclock_base_ns(void)
2083{
2084        /* Master clock not used, so we can just use CLOCK_BOOTTIME.  */
2085        return ktime_get_boottime_ns();
2086}
2087#endif
2088
2089void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs)
2090{
2091        int version;
2092        int r;
2093        struct pvclock_wall_clock wc;
2094        u32 wc_sec_hi;
2095        u64 wall_nsec;
2096
2097        if (!wall_clock)
2098                return;
2099
2100        r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
2101        if (r)
2102                return;
2103
2104        if (version & 1)
2105                ++version;  /* first time write, random junk */
2106
2107        ++version;
2108
2109        if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
2110                return;
2111
2112        /*
2113         * The guest calculates current wall clock time by adding
2114         * system time (updated by kvm_guest_time_update below) to the
2115         * wall clock specified here.  We do the reverse here.
2116         */
2117        wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
2118
2119        wc.nsec = do_div(wall_nsec, 1000000000);
2120        wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */
2121        wc.version = version;
2122
2123        kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
2124
2125        if (sec_hi_ofs) {
2126                wc_sec_hi = wall_nsec >> 32;
2127                kvm_write_guest(kvm, wall_clock + sec_hi_ofs,
2128                                &wc_sec_hi, sizeof(wc_sec_hi));
2129        }
2130
2131        version++;
2132        kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
2133}
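
/*
 * Guest-side arithmetic (illustrative): the guest reconstructs wall clock
 * time as
 *
 *	wall_time_ns = wc.sec * NSEC_PER_SEC + wc.nsec + kvmclock_ns
 *
 * which is why wall_nsec above is the host's CLOCK_REALTIME minus
 * get_kvmclock_ns(), i.e. the reverse of the guest's computation.
 */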
2134
2135static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
2136                                  bool old_msr, bool host_initiated)
2137{
2138        struct kvm_arch *ka = &vcpu->kvm->arch;
2139
2140        if (vcpu->vcpu_id == 0 && !host_initiated) {
2141                if (ka->boot_vcpu_runs_old_kvmclock != old_msr)
2142                        kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
2143
2144                ka->boot_vcpu_runs_old_kvmclock = old_msr;
2145        }
2146
2147        vcpu->arch.time = system_time;
2148        kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
2149
2150        /* Check whether the enable bit (bit 0 of the MSR value) is set... */
2151        vcpu->arch.pv_time_enabled = false;
2152        if (!(system_time & 1))
2153                return;
2154
2155        if (!kvm_gfn_to_hva_cache_init(vcpu->kvm,
2156                                       &vcpu->arch.pv_time, system_time & ~1ULL,
2157                                       sizeof(struct pvclock_vcpu_time_info)))
2158                vcpu->arch.pv_time_enabled = true;
2159
2160        return;
2161}
2162
2163static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
2164{
2165        do_shl32_div32(dividend, divisor);
2166        return dividend;
2167}
2168
2169static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
2170                               s8 *pshift, u32 *pmultiplier)
2171{
2172        uint64_t scaled64;
2173        int32_t  shift = 0;
2174        uint64_t tps64;
2175        uint32_t tps32;
2176
2177        tps64 = base_hz;
2178        scaled64 = scaled_hz;
2179        while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
2180                tps64 >>= 1;
2181                shift--;
2182        }
2183
2184        tps32 = (uint32_t)tps64;
2185        while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
2186                if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
2187                        scaled64 >>= 1;
2188                else
2189                        tps32 <<= 1;
2190                shift++;
2191        }
2192
2193        *pshift = shift;
2194        *pmultiplier = div_frac(scaled64, tps32);
2195}
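
/*
 * Worked example (approximate): converting a 3 GHz TSC to nanoseconds,
 * i.e. kvm_get_time_scale(NSEC_PER_SEC, 3000000000, &shift, &mult):
 *
 *	first loop:  tps64 = 3e9 > 2 * 1e9, so tps64 -> 1.5e9, shift = -1
 *	second loop: 1.5e9 > 1e9 and no high bits are set, so it exits at once
 *	mult = div_frac(1e9, 1.5e9) ~= 2/3 * 2^32 ~= 0xAAAAAAAA
 *
 * Consumers of the pair (e.g. pvclock_scale_delta()) then compute
 * ((delta >> 1) * 0xAAAAAAAA) >> 32 ~= delta / 3, as expected.
 */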
2196
2197#ifdef CONFIG_X86_64
2198static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
2199#endif
2200
2201static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
2202static unsigned long max_tsc_khz;
2203
2204static u32 adjust_tsc_khz(u32 khz, s32 ppm)
2205{
2206        u64 v = (u64)khz * (1000000 + ppm);
2207        do_div(v, 1000000);
2208        return v;
2209}
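
/*
 * Example (illustrative, for a hypothetical 2496000 kHz host TSC and a
 * tsc_tolerance_ppm of 250, the default):
 *
 *	adjust_tsc_khz(2496000, -250) = 2496000 * 999750 / 1000000 = 2495376
 *
 * i.e. the lower bound of the "close enough to the host rate" window used
 * by kvm_set_tsc_khz() below.
 */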
2210
2211static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier);
2212
2213static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
2214{
2215        u64 ratio;
2216
2217        /* Guest TSC same frequency as host TSC? */
2218        if (!scale) {
2219                kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio);
2220                return 0;
2221        }
2222
2223        /* TSC scaling supported? */
2224        if (!kvm_has_tsc_control) {
2225                if (user_tsc_khz > tsc_khz) {
2226                        vcpu->arch.tsc_catchup = 1;
2227                        vcpu->arch.tsc_always_catchup = 1;
2228                        return 0;
2229                } else {
2230                        pr_warn_ratelimited("user requested TSC rate below hardware speed\n");
2231                        return -1;
2232                }
2233        }
2234
2235        /* TSC scaling required  - calculate ratio */
2236        ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits,
2237                                user_tsc_khz, tsc_khz);
2238
2239        if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) {
2240                pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
2241                                    user_tsc_khz);
2242                return -1;
2243        }
2244
2245        kvm_vcpu_write_tsc_multiplier(vcpu, ratio);
2246        return 0;
2247}
2248
2249static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
2250{
2251        u32 thresh_lo, thresh_hi;
2252        int use_scaling = 0;
2253
2254        /* tsc_khz can be zero if TSC calibration fails */
2255        if (user_tsc_khz == 0) {
2256                /* set tsc_scaling_ratio to a safe value */
2257                kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio);
2258                return -1;
2259        }
2260
2261        /* Compute a scale to convert nanoseconds in TSC cycles */
2262        kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
2263                           &vcpu->arch.virtual_tsc_shift,
2264                           &vcpu->arch.virtual_tsc_mult);
2265        vcpu->arch.virtual_tsc_khz = user_tsc_khz;
2266
2267        /*
2268         * Compute the variation in TSC rate which is acceptable
2269         * within the range of tolerance and decide if the
2270         * rate being applied is within those bounds of the hardware
2271         * rate.  If so, no scaling or compensation need be done.
2272         */
2273        thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
2274        thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
2275        if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
2276                pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
2277                use_scaling = 1;
2278        }
2279        return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
2280}
2281
2282static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
2283{
2284        u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
2285                                      vcpu->arch.virtual_tsc_mult,
2286                                      vcpu->arch.virtual_tsc_shift);
2287        tsc += vcpu->arch.this_tsc_write;
2288        return tsc;
2289}
2290
2291static inline int gtod_is_based_on_tsc(int mode)
2292{
2293        return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK;
2294}
2295
2296static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
2297{
2298#ifdef CONFIG_X86_64
2299        bool vcpus_matched;
2300        struct kvm_arch *ka = &vcpu->kvm->arch;
2301        struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
2302
2303        vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
2304                         atomic_read(&vcpu->kvm->online_vcpus));
2305
2306        /*
2307         * Once the masterclock is enabled, always perform request in
2308         * order to update it.
2309         *
2310         * In order to enable masterclock, the host clocksource must be TSC
2311         * and the vcpus need to have matched TSCs.  When that happens,
2312         * perform request to enable masterclock.
2313         */
2314        if (ka->use_master_clock ||
2315            (gtod_is_based_on_tsc(gtod->clock.vclock_mode) && vcpus_matched))
2316                kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
2317
2318        trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
2319                            atomic_read(&vcpu->kvm->online_vcpus),
2320                            ka->use_master_clock, gtod->clock.vclock_mode);
2321#endif
2322}
2323
2324/*
2325 * Multiply tsc by a fixed point number represented by ratio.
2326 *
2327 * The most significant 64-N bits (mult) of ratio represent the
2328 * integral part of the fixed point number; the remaining N bits
2329 * (frac) represent the fractional part, i.e. ratio represents a fixed
2330 * point number (mult + frac * 2^(-N)).
2331 *
2332 * N equals kvm_tsc_scaling_ratio_frac_bits.
2333 */
2334static inline u64 __scale_tsc(u64 ratio, u64 tsc)
2335{
2336        return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
2337}
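
/*
 * Worked example (assuming the VMX case of kvm_tsc_scaling_ratio_frac_bits
 * == 48): running a 2500 MHz guest on a 2000 MHz host gives
 *
 *	ratio = (2500000 << 48) / 2000000 = 1.25 * 2^48 = 0x1400000000000
 *
 * (see set_tsc_khz() above), and __scale_tsc(ratio, tsc) then returns
 * (tsc * ratio) >> 48 = tsc * 1.25, i.e. the guest sees a TSC running
 * 25% faster than the host's.
 */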
2338
2339u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc, u64 ratio)
2340{
2341        u64 _tsc = tsc;
2342
2343        if (ratio != kvm_default_tsc_scaling_ratio)
2344                _tsc = __scale_tsc(ratio, tsc);
2345
2346        return _tsc;
2347}
2348EXPORT_SYMBOL_GPL(kvm_scale_tsc);
2349
2350static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
2351{
2352        u64 tsc;
2353
2354        tsc = kvm_scale_tsc(vcpu, rdtsc(), vcpu->arch.l1_tsc_scaling_ratio);
2355
2356        return target_tsc - tsc;
2357}
2358
2359u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
2360{
2361        return vcpu->arch.l1_tsc_offset +
2362                kvm_scale_tsc(vcpu, host_tsc, vcpu->arch.l1_tsc_scaling_ratio);
2363}
2364EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
2365
2366u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier)
2367{
2368        u64 nested_offset;
2369
2370        if (l2_multiplier == kvm_default_tsc_scaling_ratio)
2371                nested_offset = l1_offset;
2372        else
2373                nested_offset = mul_s64_u64_shr((s64) l1_offset, l2_multiplier,
2374                                                kvm_tsc_scaling_ratio_frac_bits);
2375
2376        nested_offset += l2_offset;
2377        return nested_offset;
2378}
2379EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_offset);
2380
2381u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier)
2382{
2383        if (l2_multiplier != kvm_default_tsc_scaling_ratio)
2384                return mul_u64_u64_shr(l1_multiplier, l2_multiplier,
2385                                       kvm_tsc_scaling_ratio_frac_bits);
2386
2387        return l1_multiplier;
2388}
2389EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_multiplier);
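
/*
 * Derivation (sketch): with N = kvm_tsc_scaling_ratio_frac_bits and
 *
 *	l1_tsc = (host_tsc * l1_multiplier >> N) + l1_offset
 *	l2_tsc = (l1_tsc   * l2_multiplier >> N) + l2_offset
 *
 * substituting the first equation into the second gives
 *
 *	l2_tsc = host_tsc * (l1_multiplier * l2_multiplier >> N) >> N
 *		 + (l1_offset * l2_multiplier >> N) + l2_offset
 *
 * which is exactly the combined multiplier and offset computed by the two
 * helpers above.
 */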
2390
2391static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset)
2392{
2393        trace_kvm_write_tsc_offset(vcpu->vcpu_id,
2394                                   vcpu->arch.l1_tsc_offset,
2395                                   l1_offset);
2396
2397        vcpu->arch.l1_tsc_offset = l1_offset;
2398
2399        /*
2400         * If we are here because L1 chose not to trap WRMSR to TSC then
2401         * according to the spec this should set L1's TSC (as opposed to
2402         * setting L1's offset for L2).
2403         */
2404        if (is_guest_mode(vcpu))
2405                vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
2406                        l1_offset,
2407                        static_call(kvm_x86_get_l2_tsc_offset)(vcpu),
2408                        static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
2409        else
2410                vcpu->arch.tsc_offset = l1_offset;
2411
2412        static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset);
2413}
2414
2415static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier)
2416{
2417        vcpu->arch.l1_tsc_scaling_ratio = l1_multiplier;
2418
2419        /* Userspace is changing the multiplier while L2 is active */
2420        if (is_guest_mode(vcpu))
2421                vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
2422                        l1_multiplier,
2423                        static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
2424        else
2425                vcpu->arch.tsc_scaling_ratio = l1_multiplier;
2426
2427        if (kvm_has_tsc_control)
2428                static_call(kvm_x86_write_tsc_multiplier)(
2429                        vcpu, vcpu->arch.tsc_scaling_ratio);
2430}
2431
2432static inline bool kvm_check_tsc_unstable(void)
2433{
2434#ifdef CONFIG_X86_64
2435        /*
2436         * TSC is marked unstable when we're running on Hyper-V, but the
2437         * 'TSC page' clocksource is still good.
2438         */
2439        if (pvclock_gtod_data.clock.vclock_mode == VDSO_CLOCKMODE_HVCLOCK)
2440                return false;
2441#endif
2442        return check_tsc_unstable();
2443}
2444
2445static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
2446{
2447        struct kvm *kvm = vcpu->kvm;
2448        u64 offset, ns, elapsed;
2449        unsigned long flags;
2450        bool matched;
2451        bool already_matched;
2452        bool synchronizing = false;
2453
2454        raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
2455        offset = kvm_compute_l1_tsc_offset(vcpu, data);
2456        ns = get_kvmclock_base_ns();
2457        elapsed = ns - kvm->arch.last_tsc_nsec;
2458
2459        if (vcpu->arch.virtual_tsc_khz) {
2460                if (data == 0) {
2461                        /*
2462                         * detection of vcpu initialization -- need to sync
2463                         * with other vCPUs. This particularly helps to keep
2464                         * kvm_clock stable after CPU hotplug
2465                         */
2466                        synchronizing = true;
2467                } else {
2468                        u64 tsc_exp = kvm->arch.last_tsc_write +
2469                                                nsec_to_cycles(vcpu, elapsed);
2470                        u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
2471                        /*
2472                         * Special case: a TSC write within one second of virtual
2473                         * cycle time of the expected value (|data - tsc_exp| <
2474                         * tsc_hz) is interpreted as an attempt to synchronize the CPU.
2475                         */
2476                        synchronizing = data < tsc_exp + tsc_hz &&
2477                                        data + tsc_hz > tsc_exp;
2478                }
2479        }
2480
2481        /*
2482         * For a reliable TSC, we can match TSC offsets, and for an unstable
2483         * TSC, we add elapsed time in this computation.  We could let the
2484         * compensation code attempt to catch up if we fall behind, but
2485         * it's better to try to match offsets from the beginning.
2486         */
2487        if (synchronizing &&
2488            vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
2489                if (!kvm_check_tsc_unstable()) {
2490                        offset = kvm->arch.cur_tsc_offset;
2491                } else {
2492                        u64 delta = nsec_to_cycles(vcpu, elapsed);
2493                        data += delta;
2494                        offset = kvm_compute_l1_tsc_offset(vcpu, data);
2495                }
2496                matched = true;
2497                already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
2498        } else {
2499                /*
2500                 * We split periods of matched TSC writes into generations.
2501                 * For each generation, we track the original measured
2502                 * nanosecond time, offset, and write, so if TSCs are in
2503                 * sync, we can match exact offset, and if not, we can match
2504                 * exact software computation in compute_guest_tsc()
2505                 *
2506                 * These values are tracked in kvm->arch.cur_xxx variables.
2507                 */
2508                kvm->arch.cur_tsc_generation++;
2509                kvm->arch.cur_tsc_nsec = ns;
2510                kvm->arch.cur_tsc_write = data;
2511                kvm->arch.cur_tsc_offset = offset;
2512                matched = false;
2513        }
2514
2515        /*
2516         * We also track the most recent recorded KHZ, write and time to
2517         * allow the matching interval to be extended at each write.
2518         */
2519        kvm->arch.last_tsc_nsec = ns;
2520        kvm->arch.last_tsc_write = data;
2521        kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
2522
2523        vcpu->arch.last_guest_tsc = data;
2524
2525        /* Keep track of which generation this VCPU has synchronized to */
2526        vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
2527        vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
2528        vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
2529
2530        kvm_vcpu_write_tsc_offset(vcpu, offset);
2531        raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
2532
2533        spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags);
2534        if (!matched) {
2535                kvm->arch.nr_vcpus_matched_tsc = 0;
2536        } else if (!already_matched) {
2537                kvm->arch.nr_vcpus_matched_tsc++;
2538        }
2539
2540        kvm_track_tsc_matching(vcpu);
2541        spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags);
2542}
2543
2544static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
2545                                           s64 adjustment)
2546{
2547        u64 tsc_offset = vcpu->arch.l1_tsc_offset;
2548        kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment);
2549}
2550
2551static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
2552{
2553        if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
2554                WARN_ON(adjustment < 0);
2555        adjustment = kvm_scale_tsc(vcpu, (u64) adjustment,
2556                                   vcpu->arch.l1_tsc_scaling_ratio);
2557        adjust_tsc_offset_guest(vcpu, adjustment);
2558}
2559
2560#ifdef CONFIG_X86_64
2561
2562static u64 read_tsc(void)
2563{
2564        u64 ret = (u64)rdtsc_ordered();
2565        u64 last = pvclock_gtod_data.clock.cycle_last;
2566
2567        if (likely(ret >= last))
2568                return ret;
2569
2570        /*
2571         * GCC likes to generate cmov here, but this branch is extremely
2572         * predictable (it's just a function of time and the likely is
2573         * very likely) and there's a data dependence, so force GCC
2574         * to generate a branch instead.  I don't barrier() because
2575         * we don't actually need a barrier, and if this function
2576         * ever gets inlined it will generate worse code.
2577         */
2578        asm volatile ("");
2579        return last;
2580}
2581
2582static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
2583                          int *mode)
2584{
2585        long v;
2586        u64 tsc_pg_val;
2587
2588        switch (clock->vclock_mode) {
2589        case VDSO_CLOCKMODE_HVCLOCK:
2590                tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
2591                                                  tsc_timestamp);
2592                if (tsc_pg_val != U64_MAX) {
2593                        /* TSC page valid */
2594                        *mode = VDSO_CLOCKMODE_HVCLOCK;
2595                        v = (tsc_pg_val - clock->cycle_last) &
2596                                clock->mask;
2597                } else {
2598                        /* TSC page invalid */
2599                        *mode = VDSO_CLOCKMODE_NONE;
2600                }
2601                break;
2602        case VDSO_CLOCKMODE_TSC:
2603                *mode = VDSO_CLOCKMODE_TSC;
2604                *tsc_timestamp = read_tsc();
2605                v = (*tsc_timestamp - clock->cycle_last) &
2606                        clock->mask;
2607                break;
2608        default:
2609                *mode = VDSO_CLOCKMODE_NONE;
2610        }
2611
2612        if (*mode == VDSO_CLOCKMODE_NONE)
2613                *tsc_timestamp = v = 0;
2614
2615        return v * clock->mult;
2616}
2617
2618static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
2619{
2620        struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
2621        unsigned long seq;
2622        int mode;
2623        u64 ns;
2624
2625        do {
2626                seq = read_seqcount_begin(&gtod->seq);
2627                ns = gtod->raw_clock.base_cycles;
2628                ns += vgettsc(&gtod->raw_clock, tsc_timestamp, &mode);
2629                ns >>= gtod->raw_clock.shift;
2630                ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot));
2631        } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
2632        *t = ns;
2633
2634        return mode;
2635}
2636
2637static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
2638{
2639        struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
2640        unsigned long seq;
2641        int mode;
2642        u64 ns;
2643
2644        do {
2645                seq = read_seqcount_begin(&gtod->seq);
2646                ts->tv_sec = gtod->wall_time_sec;
2647                ns = gtod->clock.base_cycles;
2648                ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
2649                ns >>= gtod->clock.shift;
2650        } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
2651
2652        ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
2653        ts->tv_nsec = ns;
2654
2655        return mode;
2656}
2657
2658/* returns true if host is using TSC based clocksource */
2659static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
2660{
2661        /* checked again under seqlock below */
2662        if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
2663                return false;
2664
2665        return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns,
2666                                                      tsc_timestamp));
2667}
2668
2669/* returns true if host is using TSC based clocksource */
2670static bool kvm_get_walltime_and_clockread(struct timespec64 *ts,
2671                                           u64 *tsc_timestamp)
2672{
2673        /* checked again under seqlock below */
2674        if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
2675                return false;
2676
2677        return gtod_is_based_on_tsc(do_realtime(ts, tsc_timestamp));
2678}
2679#endif
2680
2681/*
2682 *
2683 * Assuming a stable TSC across physical CPUs, and a stable TSC
2684 * across virtual CPUs, the following scenario is possible.
2685 * Each numbered line represents an event visible to both
2686 * CPUs at the next numbered event.
2687 *
2688 * "timespecX" represents host monotonic time. "tscX" represents
2689 * RDTSC value.
2690 *
2691 *              VCPU0 on CPU0           |       VCPU1 on CPU1
2692 *
2693 * 1.  read timespec0,tsc0
2694 * 2.                                   | timespec1 = timespec0 + N
2695 *                                      | tsc1 = tsc0 + M
2696 * 3. transition to guest               | transition to guest
2697 * 4. ret0 = timespec0 + (rdtsc - tsc0) |
2698 * 5.                                   | ret1 = timespec1 + (rdtsc - tsc1)
2699 *                                      | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
2700 *
2701 * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
2702 *
2703 *      - ret0 < ret1
2704 *      - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
2705 *              ...
2706 *      - 0 < N - M => M < N
2707 *
2708 * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
2709 * always the case (the difference between two distinct xtime instances
2710 * might be smaller than the difference between corresponding TSC reads,
2711 * when updating guest vcpus' pvclock areas).
2712 *
2713 * To avoid that problem, do not allow visibility of distinct
2714 * system_timestamp/tsc_timestamp values simultaneously: use a master
2715 * copy of host monotonic time values. Update that master copy
2716 * in lockstep.
2717 *
2718 * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
2719 *
2720 */
2721
2722static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
2723{
2724#ifdef CONFIG_X86_64
2725        struct kvm_arch *ka = &kvm->arch;
2726        int vclock_mode;
2727        bool host_tsc_clocksource, vcpus_matched;
2728
2729        vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
2730                        atomic_read(&kvm->online_vcpus));
2731
2732        /*
2733         * If the host uses the TSC clocksource, then pass the TSC
2734         * through to the guest as stable.
2735         */
2736        host_tsc_clocksource = kvm_get_time_and_clockread(
2737                                        &ka->master_kernel_ns,
2738                                        &ka->master_cycle_now);
2739
2740        ka->use_master_clock = host_tsc_clocksource && vcpus_matched
2741                                && !ka->backwards_tsc_observed
2742                                && !ka->boot_vcpu_runs_old_kvmclock;
2743
2744        if (ka->use_master_clock)
2745                atomic_set(&kvm_guest_has_master_clock, 1);
2746
2747        vclock_mode = pvclock_gtod_data.clock.vclock_mode;
2748        trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
2749                                        vcpus_matched);
2750#endif
2751}
2752
2753void kvm_make_mclock_inprogress_request(struct kvm *kvm)
2754{
2755        kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
2756}
2757
2758static void kvm_gen_update_masterclock(struct kvm *kvm)
2759{
2760#ifdef CONFIG_X86_64
2761        int i;
2762        struct kvm_vcpu *vcpu;
2763        struct kvm_arch *ka = &kvm->arch;
2764        unsigned long flags;
2765
2766        kvm_hv_invalidate_tsc_page(kvm);
2767
2768        kvm_make_mclock_inprogress_request(kvm);
2769
2770        /* no guest entries from this point */
2771        spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
2772        pvclock_update_vm_gtod_copy(kvm);
2773        spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
2774
2775        kvm_for_each_vcpu(i, vcpu, kvm)
2776                kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2777
2778        /* guest entries allowed */
2779        kvm_for_each_vcpu(i, vcpu, kvm)
2780                kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
2781#endif
2782}
2783
2784u64 get_kvmclock_ns(struct kvm *kvm)
2785{
2786        struct kvm_arch *ka = &kvm->arch;
2787        struct pvclock_vcpu_time_info hv_clock;
2788        unsigned long flags;
2789        u64 ret;
2790
2791        spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
2792        if (!ka->use_master_clock) {
2793                spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
2794                return get_kvmclock_base_ns() + ka->kvmclock_offset;
2795        }
2796
2797        hv_clock.tsc_timestamp = ka->master_cycle_now;
2798        hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
2799        spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
2800
2801        /* both __this_cpu_read() and rdtsc() should be on the same cpu */
2802        get_cpu();
2803
2804        if (__this_cpu_read(cpu_tsc_khz)) {
2805                kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
2806                                   &hv_clock.tsc_shift,
2807                                   &hv_clock.tsc_to_system_mul);
2808                ret = __pvclock_read_cycles(&hv_clock, rdtsc());
2809        } else
2810                ret = get_kvmclock_base_ns() + ka->kvmclock_offset;
2811
2812        put_cpu();
2813
2814        return ret;
2815}
2816
2817static void kvm_setup_pvclock_page(struct kvm_vcpu *v,
2818                                   struct gfn_to_hva_cache *cache,
2819                                   unsigned int offset)
2820{
2821        struct kvm_vcpu_arch *vcpu = &v->arch;
2822        struct pvclock_vcpu_time_info guest_hv_clock;
2823
2824        if (unlikely(kvm_read_guest_offset_cached(v->kvm, cache,
2825                &guest_hv_clock, offset, sizeof(guest_hv_clock))))
2826                return;
2827
2828        /* This VCPU is paused, but it's legal for a guest to read another
2829         * VCPU's kvmclock, so we really have to follow the specification where
2830         * it says that version is odd if data is being modified, and even after
2831         * it is consistent.
2832         *
2833         * Version field updates must be kept separate.  This is because
2834         * kvm_write_guest_cached might use a "rep movs" instruction, and
2835         * writes within a string instruction are weakly ordered.  So there
2836         * are three writes overall.
2837         *
2838         * As a small optimization, only write the version field in the first
2839         * and third write.  The vcpu->pv_time cache is still valid, because the
2840         * version field is the first in the struct.
2841         */
2842        BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
2843
2844        if (guest_hv_clock.version & 1)
2845                ++guest_hv_clock.version;  /* first time write, random junk */
2846
2847        vcpu->hv_clock.version = guest_hv_clock.version + 1;
2848        kvm_write_guest_offset_cached(v->kvm, cache,
2849                                      &vcpu->hv_clock, offset,
2850                                      sizeof(vcpu->hv_clock.version));
2851
2852        smp_wmb();
2853
2854        /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
2855        vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
2856
2857        if (vcpu->pvclock_set_guest_stopped_request) {
2858                vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
2859                vcpu->pvclock_set_guest_stopped_request = false;
2860        }
2861
2862        trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
2863
2864        kvm_write_guest_offset_cached(v->kvm, cache,
2865                                      &vcpu->hv_clock, offset,
2866                                      sizeof(vcpu->hv_clock));
2867
2868        smp_wmb();
2869
2870        vcpu->hv_clock.version++;
2871        kvm_write_guest_offset_cached(v->kvm, cache,
2872                                     &vcpu->hv_clock, offset,
2873                                     sizeof(vcpu->hv_clock.version));
2874}
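
/*
 * Guest-side counterpart (illustrative): a guest reading this structure is
 * expected to retry while the version is odd or changes under it, roughly:
 *
 *	do {
 *		version = src->version;
 *		rmb();
 *		... copy the time info fields ...
 *		rmb();
 *	} while ((version & 1) || version != src->version);
 *
 * which is why the two version updates above bracket the data write.
 */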
2875
2876static int kvm_guest_time_update(struct kvm_vcpu *v)
2877{
2878        unsigned long flags, tgt_tsc_khz;
2879        struct kvm_vcpu_arch *vcpu = &v->arch;
2880        struct kvm_arch *ka = &v->kvm->arch;
2881        s64 kernel_ns;
2882        u64 tsc_timestamp, host_tsc;
2883        u8 pvclock_flags;
2884        bool use_master_clock;
2885
2886        kernel_ns = 0;
2887        host_tsc = 0;
2888
2889        /*
2890         * If the host uses the TSC clocksource, then pass the TSC
2891         * through to the guest as stable.
2892         */
2893        spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
2894        use_master_clock = ka->use_master_clock;
2895        if (use_master_clock) {
2896                host_tsc = ka->master_cycle_now;
2897                kernel_ns = ka->master_kernel_ns;
2898        }
2899        spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
2900
2901        /* Keep irq disabled to prevent changes to the clock */
2902        local_irq_save(flags);
2903        tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
2904        if (unlikely(tgt_tsc_khz == 0)) {
2905                local_irq_restore(flags);
2906                kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
2907                return 1;
2908        }
2909        if (!use_master_clock) {
2910                host_tsc = rdtsc();
2911                kernel_ns = get_kvmclock_base_ns();
2912        }
2913
2914        tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
2915
2916        /*
2917         * We may have to catch up the TSC to match elapsed wall clock
2918         * time for two reasons, even if kvmclock is used.
2919         *   1) CPU could have been running below the maximum TSC rate
2920         *   2) Broken TSC compensation resets the base at each VCPU
2921         *      entry to avoid unknown leaps of TSC even when running
2922         *      again on the same CPU.  This may cause apparent elapsed
2923         *      time to disappear, and the guest to stand still or run
2924         *      very slowly.
2925         */
2926        if (vcpu->tsc_catchup) {
2927                u64 tsc = compute_guest_tsc(v, kernel_ns);
2928                if (tsc > tsc_timestamp) {
2929                        adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
2930                        tsc_timestamp = tsc;
2931                }
2932        }
2933
2934        local_irq_restore(flags);
2935
2936        /* With all the info we got, fill in the values */
2937
2938        if (kvm_has_tsc_control)
2939                tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz,
2940                                            v->arch.l1_tsc_scaling_ratio);
2941
2942        if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
2943                kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
2944                                   &vcpu->hv_clock.tsc_shift,
2945                                   &vcpu->hv_clock.tsc_to_system_mul);
2946                vcpu->hw_tsc_khz = tgt_tsc_khz;
2947        }
2948
2949        vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
2950        vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
2951        vcpu->last_guest_tsc = tsc_timestamp;
2952
2953        /* If the host uses TSC clocksource, then it is stable */
2954        pvclock_flags = 0;
2955        if (use_master_clock)
2956                pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
2957
2958        vcpu->hv_clock.flags = pvclock_flags;
2959
2960        if (vcpu->pv_time_enabled)
2961                kvm_setup_pvclock_page(v, &vcpu->pv_time, 0);
2962        if (vcpu->xen.vcpu_info_set)
2963                kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_info_cache,
2964                                       offsetof(struct compat_vcpu_info, time));
2965        if (vcpu->xen.vcpu_time_info_set)
2966                kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_time_info_cache, 0);
2967        if (v == kvm_get_vcpu(v->kvm, 0))
2968                kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
2969        return 0;
2970}
2971
2972/*
2973 * kvmclock updates which are isolated to a given vcpu, such as
2974 * vcpu->cpu migration, should not leave the system_timestamp of
2975 * the other vcpus stale. Otherwise NTP frequency correction
2976 * applies to one vcpu's system_timestamp but not to
2977 * the others.
2978 *
2979 * So in those cases, request a kvmclock update for all vcpus.
2980 * We need to rate-limit these requests though, as they can
2981 * considerably slow guests that have a large number of vcpus.
2982 * The time for a remote vcpu to update its kvmclock is bound
2983 * by the delay we use to rate-limit the updates.
2984 */
2985
2986#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100)
2987
2988static void kvmclock_update_fn(struct work_struct *work)
2989{
2990        int i;
2991        struct delayed_work *dwork = to_delayed_work(work);
2992        struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
2993                                           kvmclock_update_work);
2994        struct kvm *kvm = container_of(ka, struct kvm, arch);
2995        struct kvm_vcpu *vcpu;
2996
2997        kvm_for_each_vcpu(i, vcpu, kvm) {
2998                kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2999                kvm_vcpu_kick(vcpu);
3000        }
3001}
3002
3003static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
3004{
3005        struct kvm *kvm = v->kvm;
3006
3007        kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
3008        schedule_delayed_work(&kvm->arch.kvmclock_update_work,
3009                                        KVMCLOCK_UPDATE_DELAY);
3010}
3011
3012#define KVMCLOCK_SYNC_PERIOD (300 * HZ)
3013
3014static void kvmclock_sync_fn(struct work_struct *work)
3015{
3016        struct delayed_work *dwork = to_delayed_work(work);
3017        struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
3018                                           kvmclock_sync_work);
3019        struct kvm *kvm = container_of(ka, struct kvm, arch);
3020
3021        if (!kvmclock_periodic_sync)
3022                return;
3023
3024        schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
3025        schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
3026                                        KVMCLOCK_SYNC_PERIOD);
3027}
3028
3029/*
3030 * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP.
3031 */
3032static bool can_set_mci_status(struct kvm_vcpu *vcpu)
3033{
3034        /* McStatusWrEn enabled? */
3035        if (guest_cpuid_is_amd_or_hygon(vcpu))
3036                return !!(vcpu->arch.msr_hwcr & BIT_ULL(18));
3037
3038        return false;
3039}
3040
3041static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3042{
3043        u64 mcg_cap = vcpu->arch.mcg_cap;
3044        unsigned bank_num = mcg_cap & 0xff;
3045        u32 msr = msr_info->index;
3046        u64 data = msr_info->data;
3047
3048        switch (msr) {
3049        case MSR_IA32_MCG_STATUS:
3050                vcpu->arch.mcg_status = data;
3051                break;
3052        case MSR_IA32_MCG_CTL:
3053                if (!(mcg_cap & MCG_CTL_P) &&
3054                    (data || !msr_info->host_initiated))
3055                        return 1;
3056                if (data != 0 && data != ~(u64)0)
3057                        return 1;
3058                vcpu->arch.mcg_ctl = data;
3059                break;
3060        default:
3061                if (msr >= MSR_IA32_MC0_CTL &&
3062                    msr < MSR_IA32_MCx_CTL(bank_num)) {
3063                        u32 offset = array_index_nospec(
3064                                msr - MSR_IA32_MC0_CTL,
3065                                MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
3066
3067                        /* Only 0 or all 1s can be written to IA32_MCi_CTL.
3068                         * Some Linux kernels, though, clear bit 10 in bank 4
3069                         * to work around a BIOS/GART TBL issue on AMD K8s;
3070                         * ignore this to avoid an uncaught #GP in the guest.
3071                         */
3072                        if ((offset & 0x3) == 0 &&
3073                            data != 0 && (data | (1 << 10)) != ~(u64)0)
3074                                return -1;
3075
3076                        /* MCi_STATUS */
3077                        if (!msr_info->host_initiated &&
3078                            (offset & 0x3) == 1 && data != 0) {
3079                                if (!can_set_mci_status(vcpu))
3080                                        return -1;
3081                        }
3082
3083                        vcpu->arch.mce_banks[offset] = data;
3084                        break;
3085                }
3086                return 1;
3087        }
3088        return 0;
3089}
3090
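    /*
     * Async page faults are considered enabled only if both the enable bit
     * and interrupt-based delivery are set in MSR_KVM_ASYNC_PF_EN.
     */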
3091static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu)
3092{
3093        u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
3094
3095        return (vcpu->arch.apf.msr_en_val & mask) == mask;
3096}
3097
3098static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
3099{
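        /*
         * The low 6 bits of MSR_KVM_ASYNC_PF_EN are flag/reserved bits; the
         * remainder is the 64-byte aligned GPA of the shared APF data area.
         */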
3100        gpa_t gpa = data & ~0x3f;
3101
3102        /* Bits 4:5 are reserved, should be zero */
3103        if (data & 0x30)
3104                return 1;
3105
3106        if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) &&
3107            (data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT))
3108                return 1;
3109
3110        if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) &&
3111            (data & KVM_ASYNC_PF_DELIVERY_AS_INT))
3112                return 1;
3113
3114        if (!lapic_in_kernel(vcpu))
3115                return data ? 1 : 0;
3116
3117        vcpu->arch.apf.msr_en_val = data;
3118
3119        if (!kvm_pv_async_pf_enabled(vcpu)) {
3120                kvm_clear_async_pf_completion_queue(vcpu);
3121                kvm_async_pf_hash_reset(vcpu);
3122                return 0;
3123        }
3124
3125        if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
3126                                        sizeof(u64)))
3127                return 1;
3128
3129        vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
3130        vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
3131
3132        kvm_async_pf_wakeup_all(vcpu);
3133
3134        return 0;
3135}
3136
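    /*
     * MSR_KVM_ASYNC_PF_INT holds the interrupt vector used for 'page ready'
     * notifications when interrupt-based async PF delivery is enabled.
     */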
3137static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data)
3138{
3139        /* Bits 8-63 are reserved */
3140        if (data >> 8)
3141                return 1;
3142
3143        if (!lapic_in_kernel(vcpu))
3144                return 1;
3145
3146        vcpu->arch.apf.msr_int_val = data;
3147
3148        vcpu->arch.apf.vec = data & KVM_ASYNC_PF_VEC_MASK;
3149
3150        return 0;
3151}
3152
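    /* Stop updating the guest's kvmclock page and forget the MSR value. */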
3153static void kvmclock_reset(struct kvm_vcpu *vcpu)
3154{
3155        vcpu->arch.pv_time_enabled = false;
3156        vcpu->arch.time = 0;
3157}
3158
3159static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
3160{
3161        ++vcpu->stat.tlb_flush;
3162        static_call(kvm_x86_tlb_flush_all)(vcpu);
3163}
3164
3165static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
3166{
3167        ++vcpu->stat.tlb_flush;
3168
3169        if (!tdp_enabled) {
3170                /*
3171                 * A TLB flush on behalf of the guest is equivalent to
3172                 * INVPCID(all), toggling CR4.PGE, etc., which requires
3173                 * a forced sync of the shadow page tables.  Unload the
3174                 * entire MMU here and the subsequent load will sync the
3175                 * shadow page tables, and also flush the TLB.
3176                 */
3177                kvm_mmu_unload(vcpu);
3178                return;
3179        }
3180
3181        static_call(kvm_x86_tlb_flush_guest)(vcpu);
3182}
3183
3184static void record_steal_time(struct kvm_vcpu *vcpu)
3185{
3186        struct kvm_host_map map;
3187        struct kvm_steal_time *st;
3188
3189        if (kvm_xen_msr_enabled(vcpu->kvm)) {
3190                kvm_xen_runstate_set_running(vcpu);
3191                return;
3192        }
3193
3194        if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
3195                return;
3196
3197        /* -EAGAIN is returned in atomic context so we can just return. */
3198        if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT,
3199                        &map, &vcpu->arch.st.cache, false))
3200                return;
3201
3202        st = map.hva +
3203                offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
3204
3205        /*
3206         * Doing a TLB flush here, on the guest's behalf, can avoid
3207         * expensive IPIs.
3208         */
3209        if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
3210                u8 st_preempted = xchg(&st->preempted, 0);
3211
3212                trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
3213                                       st_preempted & KVM_VCPU_FLUSH_TLB);
3214                if (st_preempted & KVM_VCPU_FLUSH_TLB)
3215                        kvm_vcpu_flush_tlb_guest(vcpu);
3216        } else {
3217                st->preempted = 0;
3218        }
3219
3220        vcpu->arch.st.preempted = 0;
3221
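            /*
             * The version field works like a seqcount: it is made odd before
             * the record is updated and even again afterwards, so the guest
             * can detect and retry reads that race with this update.
             */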
3222        if (st->version & 1)
3223                st->version += 1;  /* first time write, random junk */
3224
3225        st->version += 1;
3226
3227        smp_wmb();
3228
3229        st->steal += current->sched_info.run_delay -
3230                vcpu->arch.st.last_steal;
3231        vcpu->arch.st.last_steal = current->sched_info.run_delay;
3232
3233        smp_wmb();
3234
3235        st->version += 1;
3236
3237        kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false);
3238}
3239
3240int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3241{
3242        bool pr = false;
3243        u32 msr = msr_info->index;
3244        u64 data = msr_info->data;
3245
3246        if (msr && msr == vcpu->kvm->arch.xen_hvm_config.msr)
3247                return kvm_xen_write_hypercall_page(vcpu, data);
3248
3249        switch (msr) {
3250        case MSR_AMD64_NB_CFG:
3251        case MSR_IA32_UCODE_WRITE:
3252        case MSR_VM_HSAVE_PA:
3253        case MSR_AMD64_PATCH_LOADER:
3254        case MSR_AMD64_BU_CFG2:
3255        case MSR_AMD64_DC_CFG:
3256        case MSR_F15H_EX_CFG:
3257                break;
3258
3259        case MSR_IA32_UCODE_REV:
3260                if (msr_info->host_initiated)
3261                        vcpu->arch.microcode_version = data;
3262                break;
3263        case MSR_IA32_ARCH_CAPABILITIES:
3264                if (!msr_info->host_initiated)
3265                        return 1;
3266                vcpu->arch.arch_capabilities = data;
3267                break;
3268        case MSR_IA32_PERF_CAPABILITIES: {
3269                struct kvm_msr_entry msr_ent = {.index = msr, .data = 0};
3270
3271                if (!msr_info->host_initiated)
3272                        return 1;
3273                if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM) && kvm_get_msr_feature(&msr_ent))
3274                        return 1;
3275                if (data & ~msr_ent.data)
3276                        return 1;
3277
3278                vcpu->arch.perf_capabilities = data;
3279
3280                return 0;
3281                }
3282        case MSR_EFER:
3283                return set_efer(vcpu, msr_info);
3284        case MSR_K7_HWCR:
3285                data &= ~(u64)0x40;     /* ignore flush filter disable */
3286                data &= ~(u64)0x100;    /* ignore ignne emulation enable */
3287                data &= ~(u64)0x8;      /* ignore TLB cache disable */
3288
3289                /* Handle McStatusWrEn */
3290                if (data == BIT_ULL(18)) {
3291                        vcpu->arch.msr_hwcr = data;
3292                } else if (data != 0) {
3293                        vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
3294                                    data);
3295                        return 1;
3296                }
3297                break;
3298        case MSR_FAM10H_MMIO_CONF_BASE:
3299                if (data != 0) {
3300                        vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
3301                                    "0x%llx\n", data);
3302                        return 1;
3303                }
3304                break;
3305        case 0x200 ... 0x2ff:
3306                return kvm_mtrr_set_msr(vcpu, msr, data);
3307        case MSR_IA32_APICBASE:
3308                return kvm_set_apic_base(vcpu, msr_info);
3309        case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
3310                return kvm_x2apic_msr_write(vcpu, msr, data);
3311        case MSR_IA32_TSC_DEADLINE:
3312                kvm_set_lapic_tscdeadline_msr(vcpu, data);
3313                break;
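        /*
         * A guest write to IA32_TSC_ADJUST moves the TSC by the delta from
         * the old value; host-initiated writes (e.g. on restore) only update
         * the saved MSR value so the guest TSC itself is not perturbed.
         */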
3314        case MSR_IA32_TSC_ADJUST:
3315                if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) {
3316                        if (!msr_info->host_initiated) {
3317                                s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
3318                                adjust_tsc_offset_guest(vcpu, adj);
3319                        }
3320                        vcpu->arch.ia32_tsc_adjust_msr = data;
3321                }
3322                break;
3323        case MSR_IA32_MISC_ENABLE:
3324                if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) &&
3325                    ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) {
3326                        if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3))
3327                                return 1;
3328                        vcpu->arch.ia32_misc_enable_msr = data;
3329                        kvm_update_cpuid_runtime(vcpu);
3330                } else {
3331                        vcpu->arch.ia32_misc_enable_msr = data;
3332                }
3333                break;
3334        case MSR_IA32_SMBASE:
3335                if (!msr_info->host_initiated)
3336                        return 1;
3337                vcpu->arch.smbase = data;
3338                break;
3339        case MSR_IA32_POWER_CTL:
3340                vcpu->arch.msr_ia32_power_ctl = data;
3341                break;
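        /*
         * Host-initiated IA32_TSC writes go through the TSC synchronization
         * logic; guest writes instead adjust the TSC offset (and the saved
         * IA32_TSC_ADJUST value) so the guest sees the value it wrote.
         */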
3342        case MSR_IA32_TSC:
3343                if (msr_info->host_initiated) {
3344                        kvm_synchronize_tsc(vcpu, data);
3345                } else {
3346                        u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
3347                        adjust_tsc_offset_guest(vcpu, adj);
3348                        vcpu->arch.ia32_tsc_adjust_msr += adj;
3349                }
3350                break;
3351        case MSR_IA32_XSS:
3352                if (!msr_info->host_initiated &&
3353                    !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
3354                        return 1;
3355                /*
3356                 * KVM supports exposing PT to the guest, but does not support
3357                 * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than
3358                 * XSAVES/XRSTORS to save/restore PT MSRs.
3359                 */
3360                if (data & ~supported_xss)
3361                        return 1;
3362                vcpu->arch.ia32_xss = data;
3363                break;
3364        case MSR_SMI_COUNT:
3365                if (!msr_info->host_initiated)
3366                        return 1;
3367                vcpu->arch.smi_count = data;
3368                break;
3369        case MSR_KVM_WALL_CLOCK_NEW:
3370                if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3371                        return 1;
3372
3373                vcpu->kvm->arch.wall_clock = data;
3374                kvm_write_wall_clock(vcpu->kvm, data, 0);
3375                break;
3376        case MSR_KVM_WALL_CLOCK:
3377                if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3378                        return 1;
3379
3380                vcpu->kvm->arch.wall_clock = data;
3381                kvm_write_wall_clock(vcpu->kvm, data, 0);
3382                break;
3383        case MSR_KVM_SYSTEM_TIME_NEW:
3384                if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3385                        return 1;
3386
3387                kvm_write_system_time(vcpu, data, false, msr_info->host_initiated);
3388                break;
3389        case MSR_KVM_SYSTEM_TIME:
3390                if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3391                        return 1;
3392
3393                kvm_write_system_time(vcpu, data, true,  msr_info->host_initiated);
3394                break;
3395        case MSR_KVM_ASYNC_PF_EN:
3396                if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
3397                        return 1;
3398
3399                if (kvm_pv_enable_async_pf(vcpu, data))
3400                        return 1;
3401                break;
3402        case MSR_KVM_ASYNC_PF_INT:
3403                if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
3404                        return 1;
3405
3406                if (kvm_pv_enable_async_pf_int(vcpu, data))
3407                        return 1;
3408                break;
3409        case MSR_KVM_ASYNC_PF_ACK:
3410                if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
3411                        return 1;
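                /*
                 * Writing bit 0 acknowledges a 'page ready' notification and
                 * lets KVM deliver any further pending completions.
                 */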
3412                if (data & 0x1) {
3413                        vcpu->arch.apf.pageready_pending = false;
3414                        kvm_check_async_pf_completion(vcpu);
3415                }
3416                break;
3417        case MSR_KVM_STEAL_TIME:
3418                if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
3419                        return 1;
3420
3421                if (unlikely(!sched_info_on()))
3422                        return 1;
3423
3424                if (data & KVM_STEAL_RESERVED_MASK)
3425                        return 1;
3426
3427                vcpu->arch.st.msr_val = data;
3428
3429                if (!(data & KVM_MSR_ENABLED))
3430                        break;
3431
3432                kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
3433
3434                break;
3435        case MSR_KVM_PV_EOI_EN:
3436                if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
3437                        return 1;
3438
3439                if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8)))
3440                        return 1;
3441                break;
3442
3443        case MSR_KVM_POLL_CONTROL:
3444                if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
3445                        return 1;
3446
3447                /* only enable bit supported */
3448                if (data & (-1ULL << 1))
3449                        return 1;
3450
3451                vcpu->arch.msr_kvm_poll_control = data;
3452                break;
3453
3454        case MSR_IA32_MCG_CTL:
3455        case MSR_IA32_MCG_STATUS:
3456        case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
3457                return set_msr_mce(vcpu, msr_info);
3458
3459        case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
3460        case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
3461                pr = true;
3462                fallthrough;
3463        case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
3464        case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
3465                if (kvm_pmu_is_valid_msr(vcpu, msr))
3466                        return kvm_pmu_set_msr(vcpu, msr_info);
3467
3468                if (pr || data != 0)
3469                        vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
3470                                    "0x%x data 0x%llx\n", msr, data);
3471                break;
3472        case MSR_K7_CLK_CTL:
3473                /*
3474                 * Ignore all writes to this no longer documented MSR.
3475                 * Writes are only relevant for old K7 processors,
3476                 * all pre-dating SVM, but a recommended workaround from
3477                 * AMD for these chips. It is possible to specify the
3478                 * affected processor models on the command line, hence
3479                 * the need to ignore the workaround.
3480                 */
3481                break;
3482        case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
3483        case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
3484        case HV_X64_MSR_SYNDBG_OPTIONS:
3485        case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
3486        case HV_X64_MSR_CRASH_CTL:
3487        case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
3488        case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
3489        case HV_X64_MSR_TSC_EMULATION_CONTROL:
3490        case HV_X64_MSR_TSC_EMULATION_STATUS:
3491                return kvm_hv_set_msr_common(vcpu, msr, data,
3492                                             msr_info->host_initiated);
3493        case MSR_IA32_BBL_CR_CTL3:
3494                /* Drop writes to this legacy MSR -- see rdmsr
3495                 * counterpart for further detail.
3496                 */
3497                if (report_ignored_msrs)
3498                        vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n",
3499                                msr, data);
3500                break;
3501        case MSR_AMD64_OSVW_ID_LENGTH:
3502                if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
3503                        return 1;
3504                vcpu->arch.osvw.length = data;
3505                break;
3506        case MSR_AMD64_OSVW_STATUS:
3507                if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
3508                        return 1;
3509                vcpu->arch.osvw.status = data;
3510                break;
3511        case MSR_PLATFORM_INFO:
3512                if (!msr_info->host_initiated ||
3513                    (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) &&
3514                     cpuid_fault_enabled(vcpu)))
3515                        return 1;
3516                vcpu->arch.msr_platform_info = data;
3517                break;
3518        case MSR_MISC_FEATURES_ENABLES:
3519                if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT ||
3520                    (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT &&
3521                     !supports_cpuid_fault(vcpu)))
3522                        return 1;
3523                vcpu->arch.msr_misc_features_enables = data;
3524                break;
3525        default:
3526                if (kvm_pmu_is_valid_msr(vcpu, msr))
3527                        return kvm_pmu_set_msr(vcpu, msr_info);
3528                return KVM_MSR_RET_INVALID;
3529        }
3530        return 0;
3531}
3532EXPORT_SYMBOL_GPL(kvm_set_msr_common);
3533
3534static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
3535{
3536        u64 data;
3537        u64 mcg_cap = vcpu->arch.mcg_cap;
3538        unsigned bank_num = mcg_cap & 0xff;
3539
3540        switch (msr) {
3541        case MSR_IA32_P5_MC_ADDR:
3542        case MSR_IA32_P5_MC_TYPE:
3543                data = 0;
3544                break;
3545        case MSR_IA32_MCG_CAP:
3546                data = vcpu->arch.mcg_cap;
3547                break;
3548        case MSR_IA32_MCG_CTL:
3549                if (!(mcg_cap & MCG_CTL_P) && !host)
3550                        return 1;
3551                data = vcpu->arch.mcg_ctl;
3552                break;
3553        case MSR_IA32_MCG_STATUS:
3554                data = vcpu->arch.mcg_status;
3555                break;
3556        default:
3557                if (msr >= MSR_IA32_MC0_CTL &&
3558                    msr < MSR_IA32_MCx_CTL(bank_num)) {
3559                        u32 offset = array_index_nospec(
3560                                msr - MSR_IA32_MC0_CTL,
3561                                MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
3562
3563                        data = vcpu->arch.mce_banks[offset];
3564                        break;
3565                }
3566                return 1;
3567        }
3568        *pdata = data;
3569        return 0;
3570}
3571
3572int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3573{
3574        switch (msr_info->index) {
3575        case MSR_IA32_PLATFORM_ID:
3576        case MSR_IA32_EBL_CR_POWERON:
3577        case MSR_IA32_LASTBRANCHFROMIP:
3578        case MSR_IA32_LASTBRANCHTOIP:
3579        case MSR_IA32_LASTINTFROMIP:
3580        case MSR_IA32_LASTINTTOIP:
3581        case MSR_AMD64_SYSCFG:
3582        case MSR_K8_TSEG_ADDR:
3583        case MSR_K8_TSEG_MASK:
3584        case MSR_VM_HSAVE_PA:
3585        case MSR_K8_INT_PENDING_MSG:
3586        case MSR_AMD64_NB_CFG:
3587        case MSR_FAM10H_MMIO_CONF_BASE:
3588        case MSR_AMD64_BU_CFG2:
3589        case MSR_IA32_PERF_CTL:
3590        case MSR_AMD64_DC_CFG:
3591        case MSR_F15H_EX_CFG:
3592        /*
3593         * Intel Sandy Bridge CPUs must support the RAPL (running average power
3594         * limit) MSRs. Just return 0, as we do not want to expose the host
3595         * data here. Do not conditionalize this on CPUID, as KVM does not do
3596         * so for existing CPU-specific MSRs.
3597         */
3598        case MSR_RAPL_POWER_UNIT:
3599        case MSR_PP0_ENERGY_STATUS:     /* Power plane 0 (core) */
3600        case MSR_PP1_ENERGY_STATUS:     /* Power plane 1 (graphics uncore) */
3601        case MSR_PKG_ENERGY_STATUS:     /* Total package */
3602        case MSR_DRAM_ENERGY_STATUS:    /* DRAM controller */
3603                msr_info->data = 0;
3604                break;
3605        case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
3606                if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
3607                        return kvm_pmu_get_msr(vcpu, msr_info);
3608                if (!msr_info->host_initiated)
3609                        return 1;
3610                msr_info->data = 0;
3611                break;
3612        case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
3613        case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
3614        case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
3615        case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
3616                if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
3617                        return kvm_pmu_get_msr(vcpu, msr_info);
3618                msr_info->data = 0;
3619                break;
3620        case MSR_IA32_UCODE_REV:
3621                msr_info->data = vcpu->arch.microcode_version;
3622                break;
3623        case MSR_IA32_ARCH_CAPABILITIES:
3624                if (!msr_info->host_initiated &&
3625                    !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
3626                        return 1;
3627                msr_info->data = vcpu->arch.arch_capabilities;
3628                break;
3629        case MSR_IA32_PERF_CAPABILITIES:
3630                if (!msr_info->host_initiated &&
3631                    !guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
3632                        return 1;
3633                msr_info->data = vcpu->arch.perf_capabilities;
3634                break;
3635        case MSR_IA32_POWER_CTL:
3636                msr_info->data = vcpu->arch.msr_ia32_power_ctl;
3637                break;
3638        case MSR_IA32_TSC: {
3639                /*
3640                 * Intel SDM states that MSR_IA32_TSC read adds the TSC offset
3641                 * even when not intercepted. AMD manual doesn't explicitly
3642                 * state this but appears to behave the same.
3643                 *
3644                 * On userspace reads and writes, however, we unconditionally
3645                 * return L1's TSC value to ensure backwards-compatible
3646                 * behavior for migration.
3647                 */
3648                u64 offset, ratio;
3649
3650                if (msr_info->host_initiated) {
3651                        offset = vcpu->arch.l1_tsc_offset;
3652                        ratio = vcpu->arch.l1_tsc_scaling_ratio;
3653                } else {
3654                        offset = vcpu->arch.tsc_offset;
3655                        ratio = vcpu->arch.tsc_scaling_ratio;
3656                }
3657
3658                msr_info->data = kvm_scale_tsc(vcpu, rdtsc(), ratio) + offset;
3659                break;
3660        }
3661        case MSR_MTRRcap:
3662        case 0x200 ... 0x2ff:
3663                return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
3664        case 0xcd: /* fsb frequency */
3665                msr_info->data = 3;
3666                break;
3667                /*
3668                 * MSR_EBC_FREQUENCY_ID
3669                 * Conservative value, valid even for the basic CPU models.
3670                 * Models 0,1: 000 in bits 23:21 indicating a bus speed of
3671                 * 100MHz; model 2: 000 in bits 18:16 indicating 100MHz;
3672                 * and 266MHz for models 3 and 4. Set the Core Clock
3673                 * Frequency to System Bus Frequency Ratio to 1 (bits
3674                 * 31:24) even though these bits are only valid for CPU
3675                 * models > 2; otherwise guests may end up dividing or
3676                 * multiplying by zero.
3677                 */
3678        case MSR_EBC_FREQUENCY_ID:
3679                msr_info->data = 1 << 24;
3680                break;
3681        case MSR_IA32_APICBASE:
3682                msr_info->data = kvm_get_apic_base(vcpu);
3683                break;
3684        case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
3685                return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
3686        case MSR_IA32_TSC_DEADLINE:
3687                msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
3688                break;
3689        case MSR_IA32_TSC_ADJUST:
3690                msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
3691                break;
3692        case MSR_IA32_MISC_ENABLE:
3693                msr_info->data = vcpu->arch.ia32_misc_enable_msr;
3694                break;
3695        case MSR_IA32_SMBASE:
3696                if (!msr_info->host_initiated)
3697                        return 1;
3698                msr_info->data = vcpu->arch.smbase;
3699                break;
3700        case MSR_SMI_COUNT:
3701                msr_info->data = vcpu->arch.smi_count;
3702                break;
3703        case MSR_IA32_PERF_STATUS:
3704                /* TSC increment by tick */
3705                msr_info->data = 1000ULL;
3706                /* CPU multiplier */
3707                msr_info->data |= (((uint64_t)4ULL) << 40);
3708                break;
3709        case MSR_EFER:
3710                msr_info->data = vcpu->arch.efer;
3711                break;
3712        case MSR_KVM_WALL_CLOCK:
3713                if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3714                        return 1;
3715
3716                msr_info->data = vcpu->kvm->arch.wall_clock;
3717                break;
3718        case MSR_KVM_WALL_CLOCK_NEW:
3719                if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3720                        return 1;
3721
3722                msr_info->data = vcpu->kvm->arch.wall_clock;
3723                break;
3724        case MSR_KVM_SYSTEM_TIME:
3725                if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3726                        return 1;
3727
3728                msr_info->data = vcpu->arch.time;
3729                break;
3730        case MSR_KVM_SYSTEM_TIME_NEW:
3731                if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3732                        return 1;
3733
3734                msr_info->data = vcpu->arch.time;
3735                break;
3736        case MSR_KVM_ASYNC_PF_EN:
3737                if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
3738                        return 1;
3739
3740                msr_info->data = vcpu->arch.apf.msr_en_val;
3741                break;
3742        case MSR_KVM_ASYNC_PF_INT:
3743                if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
3744                        return 1;
3745
3746                msr_info->data = vcpu->arch.apf.msr_int_val;
3747                break;
3748        case MSR_KVM_ASYNC_PF_ACK:
3749                if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
3750                        return 1;
3751
3752                msr_info->data = 0;
3753                break;
3754        case MSR_KVM_STEAL_TIME:
3755                if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
3756                        return 1;
3757
3758                msr_info->data = vcpu->arch.st.msr_val;
3759                break;
3760        case MSR_KVM_PV_EOI_EN:
3761                if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
3762                        return 1;
3763
3764                msr_info->data = vcpu->arch.pv_eoi.msr_val;
3765                break;
3766        case MSR_KVM_POLL_CONTROL:
3767                if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
3768                        return 1;
3769
3770                msr_info->data = vcpu->arch.msr_kvm_poll_control;
3771                break;
3772        case MSR_IA32_P5_MC_ADDR:
3773        case MSR_IA32_P5_MC_TYPE:
3774        case MSR_IA32_MCG_CAP:
3775        case MSR_IA32_MCG_CTL:
3776        case MSR_IA32_MCG_STATUS:
3777        case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
3778                return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
3779                                   msr_info->host_initiated);
3780        case MSR_IA32_XSS:
3781                if (!msr_info->host_initiated &&
3782                    !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
3783                        return 1;
3784                msr_info->data = vcpu->arch.ia32_xss;
3785                break;
3786        case MSR_K7_CLK_CTL:
3787                /*
3788                 * Provide expected ramp-up count for K7. All other fields
3789                 * are set to zero, indicating minimum divisors for
3790                 * every field.
3791                 *
3792                 * This prevents guest kernels on an AMD host with CPU
3793                 * type 6, model 8 and higher from exploding due to
3794                 * the rdmsr failing.
3795                 */
3796                msr_info->data = 0x20000000;
3797                break;
3798        case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
3799        case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
3800        case HV_X64_MSR_SYNDBG_OPTIONS:
3801        case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
3802        case HV_X64_MSR_CRASH_CTL:
3803        case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
3804        case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
3805        case HV_X64_MSR_TSC_EMULATION_CONTROL:
3806        case HV_X64_MSR_TSC_EMULATION_STATUS:
3807                return kvm_hv_get_msr_common(vcpu,
3808                                             msr_info->index, &msr_info->data,
3809                                             msr_info->host_initiated);
3810        case MSR_IA32_BBL_CR_CTL3:
3811                /* This legacy MSR exists but isn't fully documented in current
3812                 * silicon.  It is, however, accessed by winxp in very narrow
3813                 * scenarios where it sets bit #19, itself documented as
3814                 * a "reserved" bit.  Best-effort attempt to source coherent
3815                 * read data here should the balance of the register be
3816                 * interpreted by the guest:
3817                 *
3818                 * L2 cache control register 3: 64GB range, 256KB size,
3819                 * enabled, latency 0x1, configured
3820                 */
3821                msr_info->data = 0xbe702111;
3822                break;
3823        case MSR_AMD64_OSVW_ID_LENGTH:
3824                if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
3825                        return 1;
3826                msr_info->data = vcpu->arch.osvw.length;
3827                break;
3828        case MSR_AMD64_OSVW_STATUS:
3829                if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
3830                        return 1;
3831                msr_info->data = vcpu->arch.osvw.status;
3832                break;
3833        case MSR_PLATFORM_INFO:
3834                if (!msr_info->host_initiated &&
3835                    !vcpu->kvm->arch.guest_can_read_msr_platform_info)
3836                        return 1;
3837                msr_info->data = vcpu->arch.msr_platform_info;
3838                break;
3839        case MSR_MISC_FEATURES_ENABLES:
3840                msr_info->data = vcpu->arch.msr_misc_features_enables;
3841                break;
3842        case MSR_K7_HWCR:
3843                msr_info->data = vcpu->arch.msr_hwcr;
3844                break;
3845        default:
3846                if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
3847                        return kvm_pmu_get_msr(vcpu, msr_info);
3848                return KVM_MSR_RET_INVALID;
3849        }
3850        return 0;
3851}
3852EXPORT_SYMBOL_GPL(kvm_get_msr_common);
3853
3854/*
3855 * Read or write a bunch of msrs. All parameters are kernel addresses.
3856 *
3857 * @return number of msrs set successfully.
3858 */
3859static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
3860                    struct kvm_msr_entry *entries,
3861                    int (*do_msr)(struct kvm_vcpu *vcpu,
3862                                  unsigned index, u64 *data))
3863{
3864        int i;
3865
3866        for (i = 0; i < msrs->nmsrs; ++i)
3867                if (do_msr(vcpu, entries[i].index, &entries[i].data))
3868                        break;
3869
3870        return i;
3871}
3872
3873/*
3874 * Read or write a bunch of msrs. Parameters are user addresses.
3875 *
3876 * @return number of msrs set successfully.
3877 */
3878static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
3879                  int (*do_msr)(struct kvm_vcpu *vcpu,
3880                                unsigned index, u64 *data),
3881                  int writeback)
3882{
3883        struct kvm_msrs msrs;
3884        struct kvm_msr_entry *entries;
3885        int r, n;
3886        unsigned size;
3887
3888        r = -EFAULT;
3889        if (copy_from_user(&msrs, user_msrs, sizeof(msrs)))
3890                goto out;
3891
3892        r = -E2BIG;
3893        if (msrs.nmsrs >= MAX_IO_MSRS)
3894                goto out;
3895
3896        size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
3897        entries = memdup_user(user_msrs->entries, size);
3898        if (IS_ERR(entries)) {
3899                r = PTR_ERR(entries);
3900                goto out;
3901        }
3902
3903        r = n = __msr_io(vcpu, &msrs, entries, do_msr);
3904        if (r < 0)
3905                goto out_free;
3906
3907        r = -EFAULT;
3908        if (writeback && copy_to_user(user_msrs->entries, entries, size))
3909                goto out_free;
3910
3911        r = n;
3912
3913out_free:
3914        kfree(entries);
3915out:
3916        return r;
3917}
3918
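    /*
     * Letting a guest execute MWAIT natively is only safe if the host
     * implements MONITOR/MWAIT without the known erratum and has an
     * always-running APIC timer (ARAT), so host timers keep ticking while
     * the guest idles in deep C-states.
     */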
3919static inline bool kvm_can_mwait_in_guest(void)
3920{
3921        return boot_cpu_has(X86_FEATURE_MWAIT) &&
3922                !boot_cpu_has_bug(X86_BUG_MONITOR) &&
3923                boot_cpu_has(X86_FEATURE_ARAT);
3924}
3925
3926static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu,
3927                                            struct kvm_cpuid2 __user *cpuid_arg)
3928{
3929        struct kvm_cpuid2 cpuid;
3930        int r;
3931
3932        r = -EFAULT;
3933        if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
3934                return r;
3935
3936        r = kvm_get_hv_cpuid(vcpu, &cpuid, cpuid_arg->entries);
3937        if (r)
3938                return r;
3939
3940        r = -EFAULT;
3941        if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
3942                return r;
3943
3944        return 0;
3945}
3946
3947int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
3948{
3949        int r = 0;
3950
3951        switch (ext) {
3952        case KVM_CAP_IRQCHIP:
3953        case KVM_CAP_HLT:
3954        case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
3955        case KVM_CAP_SET_TSS_ADDR:
3956        case KVM_CAP_EXT_CPUID:
3957        case KVM_CAP_EXT_EMUL_CPUID:
3958        case KVM_CAP_CLOCKSOURCE:
3959        case KVM_CAP_PIT:
3960        case KVM_CAP_NOP_IO_DELAY:
3961        case KVM_CAP_MP_STATE:
3962        case KVM_CAP_SYNC_MMU:
3963        case KVM_CAP_USER_NMI:
3964        case KVM_CAP_REINJECT_CONTROL:
3965        case KVM_CAP_IRQ_INJECT_STATUS:
3966        case KVM_CAP_IOEVENTFD:
3967        case KVM_CAP_IOEVENTFD_NO_LENGTH:
3968        case KVM_CAP_PIT2:
3969        case KVM_CAP_PIT_STATE2:
3970        case KVM_CAP_SET_IDENTITY_MAP_ADDR:
3971        case KVM_CAP_VCPU_EVENTS:
3972        case KVM_CAP_HYPERV:
3973        case KVM_CAP_HYPERV_VAPIC:
3974        case KVM_CAP_HYPERV_SPIN:
3975        case KVM_CAP_HYPERV_SYNIC:
3976        case KVM_CAP_HYPERV_SYNIC2:
3977        case KVM_CAP_HYPERV_VP_INDEX:
3978        case KVM_CAP_HYPERV_EVENTFD:
3979        case KVM_CAP_HYPERV_TLBFLUSH:
3980        case KVM_CAP_HYPERV_SEND_IPI:
3981        case KVM_CAP_HYPERV_CPUID:
3982        case KVM_CAP_HYPERV_ENFORCE_CPUID:
3983        case KVM_CAP_SYS_HYPERV_CPUID:
3984        case KVM_CAP_PCI_SEGMENT:
3985        case KVM_CAP_DEBUGREGS:
3986        case KVM_CAP_X86_ROBUST_SINGLESTEP:
3987        case KVM_CAP_XSAVE:
3988        case KVM_CAP_ASYNC_PF:
3989        case KVM_CAP_ASYNC_PF_INT:
3990        case KVM_CAP_GET_TSC_KHZ:
3991        case KVM_CAP_KVMCLOCK_CTRL:
3992        case KVM_CAP_READONLY_MEM:
3993        case KVM_CAP_HYPERV_TIME:
3994        case KVM_CAP_IOAPIC_POLARITY_IGNORED:
3995        case KVM_CAP_TSC_DEADLINE_TIMER:
3996        case KVM_CAP_DISABLE_QUIRKS:
3997        case KVM_CAP_SET_BOOT_CPU_ID:
3998        case KVM_CAP_SPLIT_IRQCHIP:
3999        case KVM_CAP_IMMEDIATE_EXIT:
4000        case KVM_CAP_PMU_EVENT_FILTER:
4001        case KVM_CAP_GET_MSR_FEATURES:
4002        case KVM_CAP_MSR_PLATFORM_INFO:
4003        case KVM_CAP_EXCEPTION_PAYLOAD:
4004        case KVM_CAP_SET_GUEST_DEBUG:
4005        case KVM_CAP_LAST_CPU:
4006        case KVM_CAP_X86_USER_SPACE_MSR:
4007        case KVM_CAP_X86_MSR_FILTER:
4008        case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
4009#ifdef CONFIG_X86_SGX_KVM
4010        case KVM_CAP_SGX_ATTRIBUTE:
4011#endif
4012        case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
4013        case KVM_CAP_SREGS2:
4014        case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
4015                r = 1;
4016                break;
4017        case KVM_CAP_EXIT_HYPERCALL:
4018                r = KVM_EXIT_HYPERCALL_VALID_MASK;
4019                break;
4020        case KVM_CAP_SET_GUEST_DEBUG2:
4021                return KVM_GUESTDBG_VALID_MASK;
4022#ifdef CONFIG_KVM_XEN
4023        case KVM_CAP_XEN_HVM:
4024                r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
4025                    KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
4026                    KVM_XEN_HVM_CONFIG_SHARED_INFO;
4027                if (sched_info_on())
4028                        r |= KVM_XEN_HVM_CONFIG_RUNSTATE;
4029                break;
4030#endif
4031        case KVM_CAP_SYNC_REGS:
4032                r = KVM_SYNC_X86_VALID_FIELDS;
4033                break;
4034        case KVM_CAP_ADJUST_CLOCK:
4035                r = KVM_CLOCK_TSC_STABLE;
4036                break;
4037        case KVM_CAP_X86_DISABLE_EXITS:
4038                r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE |
4039                     KVM_X86_DISABLE_EXITS_CSTATE;
4040                if (kvm_can_mwait_in_guest())
4041                        r |= KVM_X86_DISABLE_EXITS_MWAIT;
4042                break;
4043        case KVM_CAP_X86_SMM:
4044                /* SMBASE is usually relocated above 1M on modern chipsets,
4045                 * and SMM handlers might indeed rely on 4G segment limits,
4046                 * so do not report SMM to be available if real mode is
4047                 * emulated via vm86 mode.  Still, do not go to great lengths
4048                 * to avoid userspace's usage of the feature, because it is a
4049                 * fringe case that is not enabled except via specific settings
4050                 * of the module parameters.
4051                 */
4052                r = static_call(kvm_x86_has_emulated_msr)(kvm, MSR_IA32_SMBASE);
4053                break;
4054        case KVM_CAP_VAPIC:
4055                r = !static_call(kvm_x86_cpu_has_accelerated_tpr)();
4056                break;
4057        case KVM_CAP_NR_VCPUS:
4058                r = KVM_SOFT_MAX_VCPUS;
4059                break;
4060        case KVM_CAP_MAX_VCPUS:
4061                r = KVM_MAX_VCPUS;
4062                break;
4063        case KVM_CAP_MAX_VCPU_ID:
4064                r = KVM_MAX_VCPU_ID;
4065                break;
4066        case KVM_CAP_PV_MMU:    /* obsolete */
4067                r = 0;
4068                break;
4069        case KVM_CAP_MCE:
4070                r = KVM_MAX_MCE_BANKS;
4071                break;
4072        case KVM_CAP_XCRS:
4073                r = boot_cpu_has(X86_FEATURE_XSAVE);
4074                break;
4075        case KVM_CAP_TSC_CONTROL:
4076                r = kvm_has_tsc_control;
4077                break;
4078        case KVM_CAP_X2APIC_API:
4079                r = KVM_X2APIC_API_VALID_FLAGS;
4080                break;
4081        case KVM_CAP_NESTED_STATE:
4082                r = kvm_x86_ops.nested_ops->get_state ?
4083                        kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0;
4084                break;
4085        case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
4086                r = kvm_x86_ops.enable_direct_tlbflush != NULL;
4087                break;
4088        case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
4089                r = kvm_x86_ops.nested_ops->enable_evmcs != NULL;
4090                break;
4091        case KVM_CAP_SMALLER_MAXPHYADDR:
4092                r = (int) allow_smaller_maxphyaddr;
4093                break;
4094        case KVM_CAP_STEAL_TIME:
4095                r = sched_info_on();
4096                break;
4097        case KVM_CAP_X86_BUS_LOCK_EXIT:
4098                if (kvm_has_bus_lock_exit)
4099                        r = KVM_BUS_LOCK_DETECTION_OFF |
4100                            KVM_BUS_LOCK_DETECTION_EXIT;
4101                else
4102                        r = 0;
4103                break;
4104        default:
4105                break;
4106        }
4107        return r;
4108
4109}
4110
4111long kvm_arch_dev_ioctl(struct file *filp,
4112                        unsigned int ioctl, unsigned long arg)
4113{
4114        void __user *argp = (void __user *)arg;
4115        long r;
4116
4117        switch (ioctl) {
4118        case KVM_GET_MSR_INDEX_LIST: {
4119                struct kvm_msr_list __user *user_msr_list = argp;
4120                struct kvm_msr_list msr_list;
4121                unsigned n;
4122
4123                r = -EFAULT;
4124                if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
4125                        goto out;
4126                n = msr_list.nmsrs;
4127                msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs;
4128                if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
4129                        goto out;
4130                r = -E2BIG;
4131                if (n < msr_list.nmsrs)
4132                        goto out;
4133                r = -EFAULT;
4134                if (copy_to_user(user_msr_list->indices, &msrs_to_save,
4135                                 num_msrs_to_save * sizeof(u32)))
4136                        goto out;
4137                if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
4138                                 &emulated_msrs,
4139                                 num_emulated_msrs * sizeof(u32)))
4140                        goto out;
4141                r = 0;
4142                break;
4143        }
4144        case KVM_GET_SUPPORTED_CPUID:
4145        case KVM_GET_EMULATED_CPUID: {
4146                struct kvm_cpuid2 __user *cpuid_arg = argp;
4147                struct kvm_cpuid2 cpuid;
4148
4149                r = -EFAULT;
4150                if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
4151                        goto out;
4152
4153                r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
4154                                            ioctl);
4155                if (r)
4156                        goto out;
4157
4158                r = -EFAULT;
4159                if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
4160                        goto out;
4161                r = 0;
4162                break;
4163        }
4164        case KVM_X86_GET_MCE_CAP_SUPPORTED:
4165                r = -EFAULT;
4166                if (copy_to_user(argp, &kvm_mce_cap_supported,
4167                                 sizeof(kvm_mce_cap_supported)))
4168                        goto out;
4169                r = 0;
4170                break;
4171        case KVM_GET_MSR_FEATURE_INDEX_LIST: {
4172                struct kvm_msr_list __user *user_msr_list = argp;
4173                struct kvm_msr_list msr_list;
4174                unsigned int n;
4175
4176                r = -EFAULT;
4177                if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
4178                        goto out;
4179                n = msr_list.nmsrs;
4180                msr_list.nmsrs = num_msr_based_features;
4181                if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
4182                        goto out;
4183                r = -E2BIG;
4184                if (n < msr_list.nmsrs)
4185                        goto out;
4186                r = -EFAULT;
4187                if (copy_to_user(user_msr_list->indices, &msr_based_features,
4188                                 num_msr_based_features * sizeof(u32)))
4189                        goto out;
4190                r = 0;
4191                break;
4192        }
4193        case KVM_GET_MSRS:
4194                r = msr_io(NULL, argp, do_get_msr_feature, 1);
4195                break;
4196        case KVM_GET_SUPPORTED_HV_CPUID:
4197                r = kvm_ioctl_get_supported_hv_cpuid(NULL, argp);
4198                break;
4199        default:
4200                r = -EINVAL;
4201                break;
4202        }
4203out:
4204        return r;
4205}
4206
4207static void wbinvd_ipi(void *garbage)
4208{
4209        wbinvd();
4210}
4211
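    /*
     * WBINVD only needs special handling when the VM has non-coherent DMA
     * devices assigned to it.
     */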
4212static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
4213{
4214        return kvm_arch_has_noncoherent_dma(vcpu->kvm);
4215}
4216
4217void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
4218{
4219        /* Address the fact that WBINVD may be executed by the guest */
4220        if (need_emulate_wbinvd(vcpu)) {
4221                if (static_call(kvm_x86_has_wbinvd_exit)())
4222                        cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
4223                else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
4224                        smp_call_function_single(vcpu->cpu,
4225                                        wbinvd_ipi, NULL, 1);
4226        }
4227
4228        static_call(kvm_x86_vcpu_load)(vcpu, cpu);
4229
4230        /* Save host pkru register if supported */
4231        vcpu->arch.host_pkru = read_pkru();
4232
4233        /* Apply any externally detected TSC adjustments (due to suspend) */
4234        if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
4235                adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
4236                vcpu->arch.tsc_offset_adjustment = 0;
4237                kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
4238        }
4239
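        /*
         * When the vcpu moves to a new physical CPU, or the host TSC is
         * unstable, detect a backwards host TSC and, if necessary, re-base
         * the guest TSC on the last value the guest observed and fall back
         * to catchup mode.
         */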
4240        if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) {
4241                s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
4242                                rdtsc() - vcpu->arch.last_host_tsc;
4243                if (tsc_delta < 0)
4244                        mark_tsc_unstable("KVM discovered backwards TSC");
4245
4246                if (kvm_check_tsc_unstable()) {
4247                        u64 offset = kvm_compute_l1_tsc_offset(vcpu,
4248                                                vcpu->arch.last_guest_tsc);
4249                        kvm_vcpu_write_tsc_offset(vcpu, offset);
4250                        vcpu->arch.tsc_catchup = 1;
4251                }
4252
4253                if (kvm_lapic_hv_timer_in_use(vcpu))
4254                        kvm_lapic_restart_hv_timer(vcpu);
4255
4256                /*
4257                 * On a host with synchronized TSC, there is no need to update
4258                 * kvmclock on vcpu->cpu migration
4259                 */
4260                if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
4261                        kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
4262                if (vcpu->cpu != cpu)
4263                        kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu);
4264                vcpu->cpu = cpu;
4265        }
4266
4267        kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
4268}
4269
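    /*
     * Mark the vcpu as preempted in its steal-time record so that the guest
     * can use the paravirtual hint (e.g. vcpu_is_preempted()) to make better
     * scheduling and locking decisions.
     */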
4270static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
4271{
4272        struct kvm_host_map map;
4273        struct kvm_steal_time *st;
4274
4275        if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
4276                return;
4277
4278        if (vcpu->arch.st.preempted)
4279                return;
4280
4281        if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
4282                        &vcpu->arch.st.cache, true))
4283                return;
4284
4285        st = map.hva +
4286                offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
4287
4288        st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
4289
4290        kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
4291}
4292
4293void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
4294{
4295        int idx;
4296
4297        if (vcpu->preempted && !vcpu->arch.guest_state_protected)
4298                vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu);
4299
4300        /*
4301         * Take the srcu lock as memslots will be accessed to check the gfn
4302         * cache generation against the memslots generation.
4303         */
4304        idx = srcu_read_lock(&vcpu->kvm->srcu);
4305        if (kvm_xen_msr_enabled(vcpu->kvm))
4306                kvm_xen_runstate_set_preempted(vcpu);
4307        else
4308                kvm_steal_time_set_preempted(vcpu);
4309        srcu_read_unlock(&vcpu->kvm->srcu, idx);
4310
4311        static_call(kvm_x86_vcpu_put)(vcpu);
4312        vcpu->arch.last_host_tsc = rdtsc();
4313        /*
4314         * If userspace has set any breakpoints or watchpoints, dr6 is restored
4315         * on every vmexit, but if not, we might have a stale dr6 from the
4316         * guest. do_debug expects dr6 to be cleared after it runs; do the same.
4317         */
4318        set_debugreg(0, 6);
4319}
4320
4321static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
4322                                    struct kvm_lapic_state *s)
4323{
4324        if (vcpu->arch.apicv_active)
4325                static_call(kvm_x86_sync_pir_to_irr)(vcpu);
4326
4327        return kvm_apic_get_state(vcpu, s);
4328}
4329
4330static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
4331                                    struct kvm_lapic_state *s)
4332{
4333        int r;
4334
4335        r = kvm_apic_set_state(vcpu, s);
4336        if (r)
4337                return r;
4338        update_cr8_intercept(vcpu);
4339
4340        return 0;
4341}
4342
4343static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu)
4344{
4345        /*
4346         * We can accept userspace's request for interrupt injection
4347         * as long as we have a place to store the interrupt number.
4348         * The actual injection will happen when the CPU is able to
4349         * deliver the interrupt.
4350         */
4351        if (kvm_cpu_has_extint(vcpu))
4352                return false;
4353
4354        /* Acknowledging ExtINT does not happen if LINT0 is masked.  */
4355        return (!lapic_in_kernel(vcpu) ||
4356                kvm_apic_accept_pic_intr(vcpu));
4357}
4358
4359static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
4360{
4361        /*
4362         * Do not cause an interrupt window exit if an exception
4363         * is pending or an event needs reinjection; userspace
4364         * might want to inject the interrupt manually using KVM_SET_REGS
4365         * or KVM_SET_SREGS.  For that to work, we must be at an
4366         * instruction boundary and with no events half-injected.
4367         */
4368        return (kvm_arch_interrupt_allowed(vcpu) &&
4369                kvm_cpu_accept_dm_intr(vcpu) &&
4370                !kvm_event_needs_reinjection(vcpu) &&
4371                !vcpu->arch.exception.pending);
4372}
4373
4374static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
4375                                    struct kvm_interrupt *irq)
4376{
4377        if (irq->irq >= KVM_NR_INTERRUPTS)
4378                return -EINVAL;
4379
4380        if (!irqchip_in_kernel(vcpu->kvm)) {
4381                kvm_queue_interrupt(vcpu, irq->irq, false);
4382                kvm_make_request(KVM_REQ_EVENT, vcpu);
4383                return 0;
4384        }
4385
4386        /*
4387         * With in-kernel LAPIC, we only use this to inject EXTINT, so
4388         * fail for in-kernel 8259.
4389         */
4390        if (pic_in_kernel(vcpu->kvm))
4391                return -ENXIO;
4392
4393        if (vcpu->arch.pending_external_vector != -1)
4394                return -EEXIST;
4395
4396        vcpu->arch.pending_external_vector = irq->irq;
4397        kvm_make_request(KVM_REQ_EVENT, vcpu);
4398        return 0;
4399}
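
/*
 * Illustrative sketch (editorial, not part of x86.c):
 * kvm_vcpu_ready_for_interrupt_injection() above feeds
 * kvm_run->ready_for_interrupt_injection, and this handler backs the
 * KVM_INTERRUPT vCPU ioctl.  A VMM with a userspace irqchip typically
 * checks that flag and the guest's IF before injecting, and otherwise
 * asks for an interrupt-window exit; helper and variable names are
 * assumptions of the example.
 */
#if 0
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int try_inject_extint(int vcpu_fd, struct kvm_run *run, unsigned int vector)
{
        struct kvm_interrupt irq = { .irq = vector };

        if (!run->ready_for_interrupt_injection || !run->if_flag) {
                /* have KVM exit to userspace once injection becomes possible */
                run->request_interrupt_window = 1;
                return -1;
        }
        return ioctl(vcpu_fd, KVM_INTERRUPT, &irq);
}
#endif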
4400
4401static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
4402{
4403        kvm_inject_nmi(vcpu);
4404
4405        return 0;
4406}
4407
4408static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu)
4409{
4410        kvm_make_request(KVM_REQ_SMI, vcpu);
4411
4412        return 0;
4413}
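
/*
 * Illustrative sketch (editorial, not part of x86.c): both handlers
 * above are argument-less vCPU ioctls; KVM_SMI additionally requires
 * KVM_CAP_X86_SMM.  Rough usage from a VMM:
 */
#if 0
#include <sys/ioctl.h>
#include <linux/kvm.h>

static void inject_nmi_and_smi(int vcpu_fd)
{
        ioctl(vcpu_fd, KVM_NMI);        /* queue an NMI for this vCPU */
        ioctl(vcpu_fd, KVM_SMI);        /* request an SMI (KVM_CAP_X86_SMM) */
}
#endif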
4414
4415static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
4416                                           struct kvm_tpr_access_ctl *tac)
4417{
4418        if (tac->flags)
4419                return -EINVAL;
4420        vcpu->arch.tpr_access_reporting = !!tac->enabled;
4421        return 0;
4422}
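
/*
 * Illustrative sketch (editorial, not part of x86.c): with reporting
 * enabled, guest TPR accesses cause KVM_EXIT_TPR_ACCESS exits, which
 * VMMs have used to patch guests that access the TPR very frequently.
 * Rough usage; vcpu_fd is an assumption of the example.
 */
#if 0
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int enable_tpr_reporting(int vcpu_fd)
{
        struct kvm_tpr_access_ctl tac;

        memset(&tac, 0, sizeof(tac));   /* flags must be zero, see above */
        tac.enabled = 1;
        return ioctl(vcpu_fd, KVM_TPR_ACCESS_REPORTING, &tac);
}
#endif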
4423
4424static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
4425                                        u64 mcg_cap)
4426{
4427        int r;
4428        unsigned bank_num = mcg_cap & 0xff, bank;
4429
4430        r = -EINVAL;
4431        if (!bank_num || bank_num > KVM_MAX_MCE_BANKS)
4432                goto out;
4433        if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
4434                goto out;
4435        r = 0;
4436        vcpu->arch.mcg_cap = mcg_cap;
4437        /* Init IA32_MCG_CTL to all 1s */
4438        if (mcg_cap & MCG_CTL_P)
4439                vcpu->arch.mcg_ctl = ~(u64)0;
4440        /* Init IA32_MCi_CTL to all 1s */
4441        for (bank = 0; bank < bank_num; bank++)
4442                vcpu->arch.mce_banks[bank*4] = ~(u64)0;
4443
4444        static_call(kvm_x86_setup_mce)(vcpu);
4445out:
4446        return r;
4447}
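
/*
 * Illustrative sketch (editorial, not part of x86.c): mcg_cap is
 * negotiated by userspace, which queries the supported bits on the
 * /dev/kvm fd and passes (a possibly trimmed copy of) them to each
 * vCPU.  Error handling is elided; kvm_fd/vcpu_fd are assumptions of
 * the example.
 */
#if 0
#include <sys/ioctl.h>
#include <linux/kvm.h>

static void setup_vcpu_mce(int kvm_fd, int vcpu_fd)
{
        __u64 mcg_cap = 0;

        ioctl(kvm_fd, KVM_X86_GET_MCE_CAP_SUPPORTED, &mcg_cap);
        /* bits 7:0 carry the bank count validated against KVM_MAX_MCE_BANKS above */
        ioctl(vcpu_fd, KVM_X86_SETUP_MCE, &mcg_cap);
}
#endif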
4448
4449static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
4450                                      struct kvm_x86_mce *mce)
4451{
4452        u64 mcg_cap = vcpu->arch.mcg_cap;
4453        unsigned bank_num = mcg_cap & 0xff;
4454        u64 *banks = vcpu->arch.mce_banks;
4455
4456        if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
4457                return -EINVAL;
4458        /*
4459         * If IA32_MCG_CTL is not all 1s, uncorrected error
4460         * reporting is disabled.
4461         */
4462        if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
4463            vcpu->arch.mcg_ctl != ~(u64)0)
4464                return 0;
4465        banks += 4 * mce->bank;
4466        /*
4467         * If IA32_MCi_CTL is not all 1s, uncorrected error
4468         * reporting is disabled for the bank.
4469         */
4470        if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
4471                return 0;
4472        if (mce->status & MCI_STATUS_UC) {
4473                if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
4474                    !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
4475                        kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
4476                        return 0;
4477                }
4478                if (banks[1] & MCI_STATUS_VAL)
4479                        mce->status |= MCI_STATUS_OVER;
4480                banks[2] = mce->addr;
4481                banks[3] = mce->misc;
4482                vcpu->arch.mcg_status = mce->mcg_status;
4483                banks[1] = mce->status;
4484                kvm_queue_exception(vcpu, MC_VECTOR);
4485        } else if (!(banks[1] & MCI_STATUS_VAL)
4486                   || !(banks[1] & MCI_STATUS_UC)) {
4487                if (banks[1] & MCI_STATUS_VAL)
4488                        mce->status |= MCI_STATUS_OVER;
4489                banks[2] = mce->addr;
4490                banks[3] = mce->misc;
4491                banks[1] = mce->status;
4492        } else
4493                banks[1] |= MCI_STATUS_OVER;
4494        return 0;
4495}
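
/*
 * Illustrative sketch (editorial, not part of x86.c): userspace injects
 * a machine check with the KVM_X86_SET_MCE vCPU ioctl, e.g. after the
 * host reported an uncorrected error in a page backing guest memory.
 * The status/mcg_status encodings below are the architectural bits,
 * redefined locally and shown only schematically; real injectors set
 * additional severity bits.  The bank choice, 'gpa' and the lack of
 * error handling are assumptions of the example.
 */
#if 0
#include <sys/ioctl.h>
#include <linux/kvm.h>

#define EX_MCI_STATUS_VAL       (1ULL << 63)    /* MCi_STATUS valid */
#define EX_MCI_STATUS_UC        (1ULL << 61)    /* uncorrected error */
#define EX_MCI_STATUS_EN        (1ULL << 60)    /* error reporting enabled */
#define EX_MCI_STATUS_ADDRV     (1ULL << 58)    /* MCi_ADDR holds the address */
#define EX_MCG_STATUS_MCIP      (1ULL << 2)     /* machine check in progress */

static int inject_uc_mce(int vcpu_fd, __u8 bank, __u64 gpa)
{
        struct kvm_x86_mce mce = {
                .bank       = bank,
                .status     = EX_MCI_STATUS_VAL | EX_MCI_STATUS_UC |
                              EX_MCI_STATUS_EN | EX_MCI_STATUS_ADDRV,
                .mcg_status = EX_MCG_STATUS_MCIP,
                .addr       = gpa,
        };

        return ioctl(vcpu_fd, KVM_X86_SET_MCE, &mce);
}
#endif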
4496
4497static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
4498                                               struct kvm_vcpu_events *events)
4499{
4500        process_nmi(vcpu);
4501
4502        if (kvm_check_request(KVM_REQ_SMI, vcpu))
4503                process_smi(vcpu);
4504
4505        /*
4506         * In guest mode, payload delivery should be deferred,
4507         * so that the L1 hypervisor can intercept #PF before
4508         * CR2 is modified (or intercept #DB before DR6 is
4509         * modified under nVMX).  However, unless the per-VM capability
4510         * KVM_CAP_EXCEPTION_PAYLOAD is enabled, userspace has no way to
4511         * receive a deferred payload through KVM_GET_VCPU_EVENTS and
4512         * re-inject it later.  Since KVM only defers the payload
4513         * opportunistically, deliver it to the architectural state now
4514         * if the capability hasn't been requested.
4515         */
4516        if (!vcpu->kvm->arch.exception_payload_enabled &&
4517            vcpu->arch.exception.pending && vcpu->arch.exception.has_payload)
4518                kvm_deliver_exception_payload(vcpu);
4519
4520        /*
4521         * The API doesn't provide the instruction length for software
4522         * exceptions, so don't report them. As long as the guest RIP
4523         * isn't advanced, we should expect to encounter the exception
4524         * again.
4525         */
4526        if (kvm_exception_is_soft(vcpu->arch.exception.nr)) {
4527                events->exception.injected = 0;
4528                events->exception.pending = 0;
4529        } else {
4530                events->exception.injected = vcpu->arch.exception.injected;
4531                events->exception.pending = vcpu->arch.exception.pending;
4532                /*
4533                 * For ABI compatibility, deliberately conflate
4534                 * pending and injected exceptions when
4535                 * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled.
4536                 */
4537                if (!vcpu->kvm->arch.exception_payload_enabled)
4538                        events->exception.injected |=
4539                                vcpu->arch.exception.pending;
4540        }
4541        events->exception.nr = vcpu->arch.exception.nr;
4542        events->exception.has_error_code = vcpu->arch.exception.has_error_code;
4543        events->exception.error_code = vcpu->arch.exception.error_code;
4544        events->exception_has_payload = vcpu->arch.exception.has_payload;
4545        events->exception_payload = vcpu->arch.exception.payload;
4546
4547        events->interrupt.injected =
4548                vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
4549        events->interrupt.nr = vcpu->arch.interrupt.nr;
4550        events->interrupt.soft = 0;
4551        events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
4552
4553        events->nmi.injected = vcpu->arch.nmi_injected;
4554        events->nmi.pending = vcpu->arch.nmi_pending != 0;
4555        events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu);
4556        events->nmi.pad = 0;
4557
4558        events->sipi_vector = 0; /* never valid when reporting to user space */
4559
4560        events->smi.smm = is_smm(vcpu);
4561        events->smi.pending = vcpu->arch.smi_pending;
4562        events->smi.smm_inside_nmi =
4563                !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK);
4564        events->smi.latched_init = kvm_lapic_latched_init(vcpu);
4565
4566        events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
4567                         | KVM_VCPUEVENT_VALID_SHADOW
4568                         | KVM_VCPUEVENT_VALID_SMM);
4569        if (vcpu->kvm->arch.exception_payload_enabled)
4570                events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
4571
4572        memset(&events->reserved, 0, sizeof(events->reserved));
4573}
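
/*
 * Illustrative sketch (editorial, not part of x86.c): the snapshot
 * filled in above is returned by the KVM_GET_VCPU_EVENTS vCPU ioctl;
 * a VMM saves it with the rest of the vCPU state and later restores
 * it via KVM_SET_VCPU_EVENTS.  Rough usage; anything beyond checking
 * the ioctl return is an assumption of the example.
 */
#if 0
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int save_vcpu_events(int vcpu_fd, struct kvm_vcpu_events *out)
{
        if (ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, out))
                return -1;

        /*
         * out->flags advertises which optional blocks are meaningful,
         * e.g. KVM_VCPUEVENT_VALID_PAYLOAD mirrors the per-VM
         * KVM_CAP_EXCEPTION_PAYLOAD setting checked above.
         */
        return 0;
}
#endif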
4574
4575static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm);
4576
4577static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
4578                                              struct kvm_vcpu_events *events)
4579{
4580        if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
4581                              | KVM_VCPUEVENT_VALID_SIPI_VECTOR
4582                              | KVM_VCPUEVENT_VALID_SHADOW
4583                              | KVM_VCPUEVENT_VALID_SMM
4584                              | KVM_VCPUEVENT_VALID_PAYLOAD))
4585