/* linux/arch/x86/kvm/svm/svm.c — AMD SVM (Secure Virtual Machine) support for KVM. */
   1#define pr_fmt(fmt) "SVM: " fmt
   2
   3#include <linux/kvm_host.h>
   4
   5#include "irq.h"
   6#include "mmu.h"
   7#include "kvm_cache_regs.h"
   8#include "x86.h"
   9#include "cpuid.h"
  10#include "pmu.h"
  11
  12#include <linux/module.h>
  13#include <linux/mod_devicetable.h>
  14#include <linux/kernel.h>
  15#include <linux/vmalloc.h>
  16#include <linux/highmem.h>
  17#include <linux/amd-iommu.h>
  18#include <linux/sched.h>
  19#include <linux/trace_events.h>
  20#include <linux/slab.h>
  21#include <linux/hashtable.h>
  22#include <linux/objtool.h>
  23#include <linux/psp-sev.h>
  24#include <linux/file.h>
  25#include <linux/pagemap.h>
  26#include <linux/swap.h>
  27#include <linux/rwsem.h>
  28
  29#include <asm/apic.h>
  30#include <asm/perf_event.h>
  31#include <asm/tlbflush.h>
  32#include <asm/desc.h>
  33#include <asm/debugreg.h>
  34#include <asm/kvm_para.h>
  35#include <asm/irq_remapping.h>
  36#include <asm/spec-ctrl.h>
  37#include <asm/cpu_device_id.h>
  38#include <asm/traps.h>
  39
  40#include <asm/virtext.h>
  41#include "trace.h"
  42
  43#include "svm.h"
  44#include "svm_ops.h"
  45
  46#include "kvm_onhyperv.h"
  47#include "svm_onhyperv.h"
  48
#define __ex(x) __kvm_handle_fault_on_reboot(x)

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

#ifdef MODULE
/* Allow automatic module loading on CPUs that advertise the SVM feature. */
static const struct x86_cpu_id svm_cpu_id[] = {
	X86_MATCH_FEATURE(X86_FEATURE_SVM, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
#endif

/* Segment descriptor type fields for LDT and 16-bit busy TSS. */
#define SEG_TYPE_LDT 2
#define SEG_TYPE_BUSY_TSS16 3

/* SVM feature bits (CPUID Fn8000_000A_EDX — see the PLE comment below). */
#define SVM_FEATURE_LBRV           (1 <<  1)
#define SVM_FEATURE_SVML           (1 <<  2)
#define SVM_FEATURE_TSC_RATE       (1 <<  4)
#define SVM_FEATURE_VMCB_CLEAN     (1 <<  5)
#define SVM_FEATURE_FLUSH_ASID     (1 <<  6)
#define SVM_FEATURE_DECODE_ASSIST  (1 <<  7)
#define SVM_FEATURE_PAUSE_FILTER   (1 << 10)

/* Only the low six bits of DEBUGCTL are writable; the rest are reserved. */
#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))

/* MSR_AMD64_TSC_RATIO: bits 39:0 hold the ratio, bits 63:40 are reserved. */
#define TSC_RATIO_RSVD          0xffffff0000000000ULL
#define TSC_RATIO_MIN           0x0000000000000001ULL
#define TSC_RATIO_MAX           0x000000ffffffffffULL

/* Set once by svm_init_erratum_383() when the workaround has been armed. */
static bool erratum_383_found __read_mostly;

/* Offsets into the MSR permission map touched by KVM; see add_msr_offset(). */
u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;

/*
 * Set osvw_len to higher value when updated Revision Guides
 * are published and we know what the new status bits are
 */
static uint64_t osvw_len = 4, osvw_status;

/* Last value written to MSR_AMD64_TSC_RATIO on this CPU (avoids redundant wrmsr). */
static DEFINE_PER_CPU(u64, current_tsc_ratio);
#define TSC_RATIO_DEFAULT       0x0100000000ULL
  91
/*
 * MSRs that may be passed through to the guest (i.e. accessed without a
 * VMEXIT).  Entries with .always == true get their intercept cleared for
 * every vCPU at msrpm init time; the others are only opened up on demand.
 * The list is terminated by an MSR_INVALID entry.
 */
static const struct svm_direct_access_msrs {
	u32 index;   /* Index of the MSR */
	bool always; /* True if intercept is initially cleared */
} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
	{ .index = MSR_STAR,				.always = true  },
	{ .index = MSR_IA32_SYSENTER_CS,		.always = true  },
	{ .index = MSR_IA32_SYSENTER_EIP,		.always = false },
	{ .index = MSR_IA32_SYSENTER_ESP,		.always = false },
#ifdef CONFIG_X86_64
	{ .index = MSR_GS_BASE,				.always = true  },
	{ .index = MSR_FS_BASE,				.always = true  },
	{ .index = MSR_KERNEL_GS_BASE,			.always = true  },
	{ .index = MSR_LSTAR,				.always = true  },
	{ .index = MSR_CSTAR,				.always = true  },
	{ .index = MSR_SYSCALL_MASK,			.always = true  },
#endif
	{ .index = MSR_IA32_SPEC_CTRL,			.always = false },
	{ .index = MSR_IA32_PRED_CMD,			.always = false },
	{ .index = MSR_IA32_LASTBRANCHFROMIP,		.always = false },
	{ .index = MSR_IA32_LASTBRANCHTOIP,		.always = false },
	{ .index = MSR_IA32_LASTINTFROMIP,		.always = false },
	{ .index = MSR_IA32_LASTINTTOIP,		.always = false },
	{ .index = MSR_EFER,				.always = false },
	{ .index = MSR_IA32_CR_PAT,			.always = false },
	{ .index = MSR_AMD64_SEV_ES_GHCB,		.always = true  },
	{ .index = MSR_INVALID,				.always = false },
};
 119
 120/*
 121 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
 122 * pause_filter_count: On processors that support Pause filtering(indicated
 123 *      by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter
 124 *      count value. On VMRUN this value is loaded into an internal counter.
 125 *      Each time a pause instruction is executed, this counter is decremented
 126 *      until it reaches zero at which time a #VMEXIT is generated if pause
 127 *      intercept is enabled. Refer to  AMD APM Vol 2 Section 15.14.4 Pause
 128 *      Intercept Filtering for more details.
 129 *      This also indicate if ple logic enabled.
 130 *
 131 * pause_filter_thresh: In addition, some processor families support advanced
 132 *      pause filtering (indicated by CPUID Fn8000_000A_EDX) upper bound on
 133 *      the amount of time a guest is allowed to execute in a pause loop.
 134 *      In this mode, a 16-bit pause filter threshold field is added in the
 135 *      VMCB. The threshold value is a cycle count that is used to reset the
 136 *      pause counter. As with simple pause filtering, VMRUN loads the pause
 137 *      count value from VMCB into an internal counter. Then, on each pause
 138 *      instruction the hardware checks the elapsed number of cycles since
 139 *      the most recent pause instruction against the pause filter threshold.
 140 *      If the elapsed cycle count is greater than the pause filter threshold,
 141 *      then the internal pause count is reloaded from the VMCB and execution
 142 *      continues. If the elapsed cycle count is less than the pause filter
 143 *      threshold, then the internal pause count is decremented. If the count
 144 *      value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
 145 *      triggered. If advanced pause filtering is supported and pause filter
 146 *      threshold field is set to zero, the filter will operate in the simpler,
 147 *      count only mode.
 148 */
 149
static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
module_param(pause_filter_thresh, ushort, 0444);

static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
module_param(pause_filter_count, ushort, 0444);

/* Default doubles per-vcpu window every exit. */
static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
module_param(pause_filter_count_grow, ushort, 0444);

/* Default resets per-vcpu window every exit to pause_filter_count. */
static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
module_param(pause_filter_count_shrink, ushort, 0444);

/* Default is to compute the maximum so we can never overflow. */
static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
module_param(pause_filter_count_max, ushort, 0444);

/*
 * Use nested page tables by default.  Note, NPT may get forced off by
 * svm_hardware_setup() if it's unsupported by hardware or the host kernel.
 */
bool npt_enabled = true;
module_param_named(npt, npt_enabled, bool, 0444);

/* allow nested virtualization in KVM/SVM */
static int nested = true;
module_param(nested, int, S_IRUGO);

/* enable/disable Next RIP Save */
static int nrips = true;
module_param(nrips, int, 0444);

/* enable/disable Virtual VMLOAD VMSAVE */
static int vls = true;
module_param(vls, int, 0444);

/* enable/disable Virtual GIF */
static int vgif = true;
module_param(vgif, int, 0444);

/*
 * enable / disable AVIC.  Because the defaults differ for APICv
 * support between VMX and SVM we cannot use module_param_named.
 */
static bool avic;
module_param(avic, bool, 0444);

bool __read_mostly dump_invalid_vmcb;
module_param(dump_invalid_vmcb, bool, 0644);


bool intercept_smi = true;
module_param(intercept_smi, bool, 0444);


/*
 * When true, intercept #GP while the guest has EFER.SVME enabled (see
 * svm_set_efer()).  NOTE(review): presumably an erratum workaround —
 * confirm which erratum against svm_hardware_setup().
 */
static bool svm_gp_erratum_intercept = true;

/* Opcode bytes of the RSM instruction (0F AA). */
static u8 rsm_ins_bytes[] = "\x0f\xaa";

/* Physical address of the I/O permission map; freed in svm_hardware_teardown(). */
static unsigned long iopm_base;
 211
/* In-memory layout of a 64-bit LDT/TSS segment descriptor. */
struct kvm_ldttss_desc {
	u16 limit0;
	u16 base0;
	unsigned base1:8, type:5, dpl:2, p:1;
	unsigned limit1:4, zero0:3, g:1, base2:8;
	u32 base3;
	u32 zero1;
} __attribute__((packed));

DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);

/*
 * Only MSR_TSC_AUX is switched via the user return hook.  EFER is switched via
 * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
 *
 * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
 * defer the restoration of TSC_AUX until the CPU returns to userspace.
 */
static int tsc_aux_uret_slot __read_mostly = -1;

/* Base MSR numbers of the three ranges covered by the MSR permission map. */
static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};

#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
#define MSRS_RANGE_SIZE 2048
/* Each MSR takes two bits (read + write), so 2K bytes cover 8192 MSRs. */
#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
 237
 238u32 svm_msrpm_offset(u32 msr)
 239{
 240        u32 offset;
 241        int i;
 242
 243        for (i = 0; i < NUM_MSR_MAPS; i++) {
 244                if (msr < msrpm_ranges[i] ||
 245                    msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
 246                        continue;
 247
 248                offset  = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
 249                offset += (i * MSRS_RANGE_SIZE);       /* add range offset */
 250
 251                /* Now we have the u8 offset - but need the u32 offset */
 252                return offset / 4;
 253        }
 254
 255        /* MSR not in any range */
 256        return MSR_INVALID;
 257}
 258
 259#define MAX_INST_SIZE 15
 260
/* Deepest page-table level usable for NPT: 4-level on x86-64, PAE on 32-bit. */
static int get_max_npt_level(void)
{
#ifdef CONFIG_X86_64
	return PT64_ROOT_4LEVEL;
#else
	return PT32E_ROOT_LEVEL;
#endif
}
 269
/*
 * Update the guest's EFER, allocating or freeing nested state when the
 * guest toggles EFER.SVME.  Returns 0 on success, or a negative error if
 * nested state allocation fails (in which case the old EFER is restored).
 */
int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u64 old_efer = vcpu->arch.efer;
	vcpu->arch.efer = efer;

	if (!npt_enabled) {
		/* Shadow paging assumes NX to be available.  */
		efer |= EFER_NX;

		if (!(efer & EFER_LMA))
			efer &= ~EFER_LME;
	}

	if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
		if (!(efer & EFER_SVME)) {
			/* Guest cleared SVME: force an exit from nested mode. */
			svm_leave_nested(svm);
			svm_set_gif(svm, true);
			/* #GP intercept is still needed for vmware backdoor */
			if (!enable_vmware_backdoor)
				clr_exception_intercept(svm, GP_VECTOR);

			/*
			 * Free the nested guest state, unless we are in SMM.
			 * In this case we will return to the nested guest
			 * as soon as we leave SMM.
			 */
			if (!is_smm(vcpu))
				svm_free_nested(svm);

		} else {
			int ret = svm_allocate_nested(svm);

			if (ret) {
				/* Roll back so callers observe a consistent EFER. */
				vcpu->arch.efer = old_efer;
				return ret;
			}

			if (svm_gp_erratum_intercept)
				set_exception_intercept(svm, GP_VECTOR);
		}
	}

	/* EFER.SVME is always kept set in the VMCB's copy of EFER. */
	svm->vmcb->save.efer = efer | EFER_SVME;
	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
	return 0;
}
 317
 318static int is_external_interrupt(u32 info)
 319{
 320        info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
 321        return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
 322}
 323
 324static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
 325{
 326        struct vcpu_svm *svm = to_svm(vcpu);
 327        u32 ret = 0;
 328
 329        if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
 330                ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
 331        return ret;
 332}
 333
 334static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
 335{
 336        struct vcpu_svm *svm = to_svm(vcpu);
 337
 338        if (mask == 0)
 339                svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
 340        else
 341                svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
 342
 343}
 344
/*
 * Advance RIP past the instruction that caused the current exit, using the
 * hardware-provided next_rip when NRIPS is available, and falling back to
 * instruction emulation otherwise.  Returns 1 on success, 0 if emulation
 * failed.  Always drops the interrupt shadow on success.
 */
static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/*
	 * SEV-ES does not expose the next RIP. The RIP update is controlled by
	 * the type of exit and the #VC handler in the guest.
	 */
	if (sev_es_guest(vcpu->kvm))
		goto done;

	if (nrips && svm->vmcb->control.next_rip != 0) {
		/* next_rip should only be populated when NRIPS exists. */
		WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
		svm->next_rip = svm->vmcb->control.next_rip;
	}

	if (!svm->next_rip) {
		/* No hardware assist: decode the instruction to skip it. */
		if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
			return 0;
	} else {
		kvm_rip_write(vcpu, svm->next_rip);
	}

done:
	svm_set_interrupt_shadow(vcpu, 0);

	return 1;
}
 373
/*
 * Inject the exception pending in vcpu->arch.exception into the guest via
 * the VMCB EVTINJ field, delivering any exception payload first.
 */
static void svm_queue_exception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned nr = vcpu->arch.exception.nr;
	bool has_error_code = vcpu->arch.exception.has_error_code;
	u32 error_code = vcpu->arch.exception.error_code;

	kvm_deliver_exception_payload(vcpu);

	if (nr == BP_VECTOR && !nrips) {
		unsigned long rip, old_rip = kvm_rip_read(vcpu);

		/*
		 * For guest debugging where we have to reinject #BP if some
		 * INT3 is guest-owned:
		 * Emulate nRIP by moving RIP forward. Will fail if injection
		 * raises a fault that is not intercepted. Still better than
		 * failing in all cases.
		 */
		(void)skip_emulated_instruction(vcpu);
		rip = kvm_rip_read(vcpu);
		/* Remember where the INT3 ended so it can be unwound later. */
		svm->int3_rip = rip + svm->vmcb->save.cs.base;
		svm->int3_injected = rip - old_rip;
	}

	svm->vmcb->control.event_inj = nr
		| SVM_EVTINJ_VALID
		| (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
		| SVM_EVTINJ_TYPE_EXEPT;
	svm->vmcb->control.event_inj_err = error_code;
}
 405
 406static void svm_init_erratum_383(void)
 407{
 408        u32 low, high;
 409        int err;
 410        u64 val;
 411
 412        if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
 413                return;
 414
 415        /* Use _safe variants to not break nested virtualization */
 416        val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
 417        if (err)
 418                return;
 419
 420        val |= (1ULL << 47);
 421
 422        low  = lower_32_bits(val);
 423        high = upper_32_bits(val);
 424
 425        native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
 426
 427        erratum_383_found = true;
 428}
 429
 430static void svm_init_osvw(struct kvm_vcpu *vcpu)
 431{
 432        /*
 433         * Guests should see errata 400 and 415 as fixed (assuming that
 434         * HLT and IO instructions are intercepted).
 435         */
 436        vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
 437        vcpu->arch.osvw.status = osvw_status & ~(6ULL);
 438
 439        /*
 440         * By increasing VCPU's osvw.length to 3 we are telling the guest that
 441         * all osvw.status bits inside that length, including bit 0 (which is
 442         * reserved for erratum 298), are valid. However, if host processor's
 443         * osvw_len is 0 then osvw_status[0] carries no information. We need to
 444         * be conservative here and therefore we tell the guest that erratum 298
 445         * is present (because we really don't know).
 446         */
 447        if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
 448                vcpu->arch.osvw.status |= 1;
 449}
 450
 451static int has_svm(void)
 452{
 453        const char *msg;
 454
 455        if (!cpu_has_svm(&msg)) {
 456                printk(KERN_INFO "has_svm: %s\n", msg);
 457                return 0;
 458        }
 459
 460        if (sev_active()) {
 461                pr_info("KVM is unsupported when running as an SEV guest\n");
 462                return 0;
 463        }
 464
 465        if (pgtable_l5_enabled()) {
 466                pr_info("KVM doesn't yet support 5-level paging on AMD SVM\n");
 467                return 0;
 468        }
 469
 470        return 1;
 471}
 472
/* Disable SVM on the current CPU (the inverse of svm_hardware_enable()). */
static void svm_hardware_disable(void)
{
	/* Make sure we clean up behind us */
	if (static_cpu_has(X86_FEATURE_TSCRATEMSR))
		wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);

	cpu_svm_disable();

	amd_pmu_disable_virt();
}
 483
/*
 * Enable SVM on the current CPU.  Returns 0 on success, -EBUSY when
 * EFER.SVME is already set (something else owns SVM), or -EINVAL when SVM
 * is unavailable or the per-CPU data was never allocated.
 */
static int svm_hardware_enable(void)
{

	struct svm_cpu_data *sd;
	uint64_t efer;
	struct desc_struct *gdt;
	int me = raw_smp_processor_id();

	rdmsrl(MSR_EFER, efer);
	if (efer & EFER_SVME)
		return -EBUSY;

	if (!has_svm()) {
		pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
		return -EINVAL;
	}
	sd = per_cpu(svm_data, me);
	if (!sd) {
		pr_err("%s: svm_data is NULL on %d\n", __func__, me);
		return -EINVAL;
	}

	/* Start a fresh ASID generation; ASIDs <= max_sev_asid are for SEV. */
	sd->asid_generation = 1;
	sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
	sd->next_asid = sd->max_asid + 1;
	sd->min_asid = max_sev_asid + 1;

	gdt = get_current_gdt_rw();
	sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);

	/* Turn on EFER.SVME before touching any other SVM MSR. */
	wrmsrl(MSR_EFER, efer | EFER_SVME);

	/* Point the host state-save area MSR at this CPU's save page. */
	wrmsrl(MSR_VM_HSAVE_PA, __sme_page_pa(sd->save_area));

	if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
		wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
		__this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT);
	}


	/*
	 * Get OSVW bits.
	 *
	 * Note that it is possible to have a system with mixed processor
	 * revisions and therefore different OSVW bits. If bits are not the same
	 * on different processors then choose the worst case (i.e. if erratum
	 * is present on one processor and not on another then assume that the
	 * erratum is present everywhere).
	 */
	if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
		uint64_t len, status = 0;
		int err;

		len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
		if (!err)
			status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
						      &err);

		if (err)
			osvw_status = osvw_len = 0;
		else {
			if (len < osvw_len)
				osvw_len = len;
			osvw_status |= status;
			osvw_status &= (1ULL << osvw_len) - 1;
		}
	} else
		osvw_status = osvw_len = 0;

	svm_init_erratum_383();

	amd_pmu_enable_virt();

	return 0;
}
 559
/* Free @cpu's per-CPU SVM data (the inverse of svm_cpu_init()). */
static void svm_cpu_uninit(int cpu)
{
	struct svm_cpu_data *sd = per_cpu(svm_data, cpu);

	if (!sd)
		return;

	/* Clear the per-CPU pointer before freeing the backing memory. */
	per_cpu(svm_data, cpu) = NULL;
	kfree(sd->sev_vmcbs);
	__free_page(sd->save_area);
	kfree(sd);
}
 572
 573static int svm_cpu_init(int cpu)
 574{
 575        struct svm_cpu_data *sd;
 576        int ret = -ENOMEM;
 577
 578        sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
 579        if (!sd)
 580                return ret;
 581        sd->cpu = cpu;
 582        sd->save_area = alloc_page(GFP_KERNEL);
 583        if (!sd->save_area)
 584                goto free_cpu_data;
 585
 586        clear_page(page_address(sd->save_area));
 587
 588        ret = sev_cpu_init(sd);
 589        if (ret)
 590                goto free_save_area;
 591
 592        per_cpu(svm_data, cpu) = sd;
 593
 594        return 0;
 595
 596free_save_area:
 597        __free_page(sd->save_area);
 598free_cpu_data:
 599        kfree(sd);
 600        return ret;
 601
 602}
 603
 604static int direct_access_msr_slot(u32 msr)
 605{
 606        u32 i;
 607
 608        for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
 609                if (direct_access_msrs[i].index == msr)
 610                        return i;
 611
 612        return -ENOENT;
 613}
 614
 615static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read,
 616                                     int write)
 617{
 618        struct vcpu_svm *svm = to_svm(vcpu);
 619        int slot = direct_access_msr_slot(msr);
 620
 621        if (slot == -ENOENT)
 622                return;
 623
 624        /* Set the shadow bitmaps to the desired intercept states */
 625        if (read)
 626                set_bit(slot, svm->shadow_msr_intercept.read);
 627        else
 628                clear_bit(slot, svm->shadow_msr_intercept.read);
 629
 630        if (write)
 631                set_bit(slot, svm->shadow_msr_intercept.write);
 632        else
 633                clear_bit(slot, svm->shadow_msr_intercept.write);
 634}
 635
 636static bool valid_msr_intercept(u32 index)
 637{
 638        return direct_access_msr_slot(index) != -ENOENT;
 639}
 640
 641static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
 642{
 643        u8 bit_write;
 644        unsigned long tmp;
 645        u32 offset;
 646        u32 *msrpm;
 647
 648        msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
 649                                      to_svm(vcpu)->msrpm;
 650
 651        offset    = svm_msrpm_offset(msr);
 652        bit_write = 2 * (msr & 0x0f) + 1;
 653        tmp       = msrpm[offset];
 654
 655        BUG_ON(offset == MSR_INVALID);
 656
 657        return !!test_bit(bit_write,  &tmp);
 658}
 659
 660static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
 661                                        u32 msr, int read, int write)
 662{
 663        u8 bit_read, bit_write;
 664        unsigned long tmp;
 665        u32 offset;
 666
 667        /*
 668         * If this warning triggers extend the direct_access_msrs list at the
 669         * beginning of the file
 670         */
 671        WARN_ON(!valid_msr_intercept(msr));
 672
 673        /* Enforce non allowed MSRs to trap */
 674        if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
 675                read = 0;
 676
 677        if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
 678                write = 0;
 679
 680        offset    = svm_msrpm_offset(msr);
 681        bit_read  = 2 * (msr & 0x0f);
 682        bit_write = 2 * (msr & 0x0f) + 1;
 683        tmp       = msrpm[offset];
 684
 685        BUG_ON(offset == MSR_INVALID);
 686
 687        read  ? clear_bit(bit_read,  &tmp) : set_bit(bit_read,  &tmp);
 688        write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
 689
 690        msrpm[offset] = tmp;
 691
 692        svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
 693
 694}
 695
/*
 * Update both the shadow (desired) intercept state — which survives MSR
 * filter changes — and the live MSR permission bitmap.
 */
void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
			  int read, int write)
{
	set_shadow_msr_intercept(vcpu, msr, read, write);
	set_msr_interception_bitmap(vcpu, msrpm, msr, read, write);
}
 702
 703u32 *svm_vcpu_alloc_msrpm(void)
 704{
 705        unsigned int order = get_order(MSRPM_SIZE);
 706        struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order);
 707        u32 *msrpm;
 708
 709        if (!pages)
 710                return NULL;
 711
 712        msrpm = page_address(pages);
 713        memset(msrpm, 0xff, PAGE_SIZE * (1 << order));
 714
 715        return msrpm;
 716}
 717
 718void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
 719{
 720        int i;
 721
 722        for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 723                if (!direct_access_msrs[i].always)
 724                        continue;
 725                set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1);
 726        }
 727}
 728
 729
/* Free an MSR permission map allocated by svm_vcpu_alloc_msrpm(). */
void svm_vcpu_free_msrpm(u32 *msrpm)
{
	__free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
}
 734
 735static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
 736{
 737        struct vcpu_svm *svm = to_svm(vcpu);
 738        u32 i;
 739
 740        /*
 741         * Set intercept permissions for all direct access MSRs again. They
 742         * will automatically get filtered through the MSR filter, so we are
 743         * back in sync after this.
 744         */
 745        for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 746                u32 msr = direct_access_msrs[i].index;
 747                u32 read = test_bit(i, svm->shadow_msr_intercept.read);
 748                u32 write = test_bit(i, svm->shadow_msr_intercept.write);
 749
 750                set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write);
 751        }
 752}
 753
 754static void add_msr_offset(u32 offset)
 755{
 756        int i;
 757
 758        for (i = 0; i < MSRPM_OFFSETS; ++i) {
 759
 760                /* Offset already in list? */
 761                if (msrpm_offsets[i] == offset)
 762                        return;
 763
 764                /* Slot used by another offset? */
 765                if (msrpm_offsets[i] != MSR_INVALID)
 766                        continue;
 767
 768                /* Add offset to list */
 769                msrpm_offsets[i] = offset;
 770
 771                return;
 772        }
 773
 774        /*
 775         * If this BUG triggers the msrpm_offsets table has an overflow. Just
 776         * increase MSRPM_OFFSETS in this case.
 777         */
 778        BUG();
 779}
 780
 781static void init_msrpm_offsets(void)
 782{
 783        int i;
 784
 785        memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
 786
 787        for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 788                u32 offset;
 789
 790                offset = svm_msrpm_offset(direct_access_msrs[i].index);
 791                BUG_ON(offset == MSR_INVALID);
 792
 793                add_msr_offset(offset);
 794        }
 795}
 796
/* Enable LBR virtualization and pass the LBR MSRs through to the guest. */
static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
}
 807
/* Disable LBR virtualization and re-intercept the LBR MSRs. */
static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
}
 818
/*
 * Stop single-stepping over an NMI window, restoring TF/RF in the guest's
 * RFLAGS to the values the guest itself had set.
 */
void disable_nmi_singlestep(struct vcpu_svm *svm)
{
	svm->nmi_singlestep = false;

	/* If userspace single-steps the guest, TF/RF must stay as-is. */
	if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
		/* Clear our flags if they were not set by the guest */
		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
			svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
			svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
	}
}
 831
 832static void grow_ple_window(struct kvm_vcpu *vcpu)
 833{
 834        struct vcpu_svm *svm = to_svm(vcpu);
 835        struct vmcb_control_area *control = &svm->vmcb->control;
 836        int old = control->pause_filter_count;
 837
 838        control->pause_filter_count = __grow_ple_window(old,
 839                                                        pause_filter_count,
 840                                                        pause_filter_count_grow,
 841                                                        pause_filter_count_max);
 842
 843        if (control->pause_filter_count != old) {
 844                vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
 845                trace_kvm_ple_window_update(vcpu->vcpu_id,
 846                                            control->pause_filter_count, old);
 847        }
 848}
 849
 850static void shrink_ple_window(struct kvm_vcpu *vcpu)
 851{
 852        struct vcpu_svm *svm = to_svm(vcpu);
 853        struct vmcb_control_area *control = &svm->vmcb->control;
 854        int old = control->pause_filter_count;
 855
 856        control->pause_filter_count =
 857                                __shrink_ple_window(old,
 858                                                    pause_filter_count,
 859                                                    pause_filter_count_shrink,
 860                                                    pause_filter_count);
 861        if (control->pause_filter_count != old) {
 862                vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
 863                trace_kvm_ple_window_update(vcpu->vcpu_id,
 864                                            control->pause_filter_count, old);
 865        }
 866}
 867
/*
 * The default MMIO mask is a single bit (excluding the present bit),
 * which could conflict with the memory encryption bit. Check for
 * memory encryption support and override the default MMIO mask if
 * memory encryption is enabled.
 */
static __init void svm_adjust_mmio_mask(void)
{
	unsigned int enc_bit, mask_bit;
	u64 msr, mask;

	/* If there is no memory encryption support, use existing mask */
	if (cpuid_eax(0x80000000) < 0x8000001f)
		return;

	/* If memory encryption is not enabled, use existing mask */
	rdmsrl(MSR_AMD64_SYSCFG, msr);
	if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
		return;

	/* CPUID Fn8000_001F_EBX[5:0] holds the C-bit (encryption bit) position. */
	enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
	mask_bit = boot_cpu_data.x86_phys_bits;

	/* Increment the mask bit if it is the same as the encryption bit */
	if (enc_bit == mask_bit)
		mask_bit++;

	/*
	 * If the mask bit location is below 52, then some bits above the
	 * physical addressing limit will always be reserved, so use the
	 * rsvd_bits() function to generate the mask. This mask, along with
	 * the present bit, will be used to generate a page fault with
	 * PFER.RSV = 1.
	 *
	 * If the mask bit location is 52 (or above), then clear the mask.
	 */
	mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;

	kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
}
 908
 909static void svm_hardware_teardown(void)
 910{
 911        int cpu;
 912
 913        sev_hardware_teardown();
 914
 915        for_each_possible_cpu(cpu)
 916                svm_cpu_uninit(cpu);
 917
 918        __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT),
 919        get_order(IOPM_SIZE));
 920        iopm_base = 0;
 921}
 922
 923static __init void svm_set_cpu_caps(void)
 924{
 925        kvm_set_cpu_caps();
 926
 927        supported_xss = 0;
 928
 929        /* CPUID 0x80000001 and 0x8000000A (SVM features) */
 930        if (nested) {
 931                kvm_cpu_cap_set(X86_FEATURE_SVM);
 932
 933                if (nrips)
 934                        kvm_cpu_cap_set(X86_FEATURE_NRIPS);
 935
 936                if (npt_enabled)
 937                        kvm_cpu_cap_set(X86_FEATURE_NPT);
 938
 939                /* Nested VM can receive #VMEXIT instead of triggering #GP */
 940                kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
 941        }
 942
 943        /* CPUID 0x80000008 */
 944        if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
 945            boot_cpu_has(X86_FEATURE_AMD_SSBD))
 946                kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
 947
 948        /* CPUID 0x8000001F (SME/SEV features) */
 949        sev_set_cpu_caps();
 950}
 951
/*
 * One-time, module-load-time hardware setup: allocates the I/O permission
 * map, probes CPU features, and configures module-wide knobs (NPT, AVIC,
 * nested, pause filtering, ...).  Returns 0 on success or -errno.
 */
static __init int svm_hardware_setup(void)
{
        int cpu;
        struct page *iopm_pages;
        void *iopm_va;
        int r;
        unsigned int order = get_order(IOPM_SIZE);

        /*
         * NX is required for shadow paging and for NPT if the NX huge pages
         * mitigation is enabled.
         */
        if (!boot_cpu_has(X86_FEATURE_NX)) {
                pr_err_ratelimited("NX (Execute Disable) not supported\n");
                return -EOPNOTSUPP;
        }
        kvm_enable_efer_bits(EFER_NX);

        iopm_pages = alloc_pages(GFP_KERNEL, order);

        if (!iopm_pages)
                return -ENOMEM;

        /* All bits set: intercept every I/O port by default. */
        iopm_va = page_address(iopm_pages);
        memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
        iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;

        init_msrpm_offsets();

        /* MPX state is not exposed to guests on SVM. */
        supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);

        if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
                kvm_enable_efer_bits(EFER_FFXSR);

        if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
                kvm_has_tsc_control = true;
                kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
                kvm_tsc_scaling_ratio_frac_bits = 32;
        }

        /* TSC_AUX is context switched via the user-return MSR mechanism. */
        tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);

        /* Check for pause filtering support */
        if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
                pause_filter_count = 0;
                pause_filter_thresh = 0;
        } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
                pause_filter_thresh = 0;
        }

        if (nested) {
                printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
                kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
        }

        /*
         * KVM's MMU doesn't support using 2-level paging for itself, and thus
         * NPT isn't supported if the host is using 2-level paging since host
         * CR4 is unchanged on VMRUN.
         */
        if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
                npt_enabled = false;

        if (!boot_cpu_has(X86_FEATURE_NPT))
                npt_enabled = false;

        kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G);
        pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");

        /* Note, SEV setup consumes npt_enabled. */
        sev_hardware_setup();

        svm_hv_hardware_setup();

        svm_adjust_mmio_mask();

        for_each_possible_cpu(cpu) {
                r = svm_cpu_init(cpu);
                if (r)
                        goto err;
        }

        /* nrips requires hardware support; silently disable if absent. */
        if (nrips) {
                if (!boot_cpu_has(X86_FEATURE_NRIPS))
                        nrips = false;
        }

        /* AVIC additionally requires NPT; enable_apicv tracks avic. */
        enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC);

        if (enable_apicv) {
                pr_info("AVIC enabled\n");

                amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
        }

        /* Virtual VMLOAD/VMSAVE requires NPT and a 64-bit host. */
        if (vls) {
                if (!npt_enabled ||
                    !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
                    !IS_ENABLED(CONFIG_X86_64)) {
                        vls = false;
                } else {
                        pr_info("Virtual VMLOAD VMSAVE supported\n");
                }
        }

        /* Hardware with SVME_ADDR_CHK doesn't need the #GP erratum workaround. */
        if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
                svm_gp_erratum_intercept = false;

        if (vgif) {
                if (!boot_cpu_has(X86_FEATURE_VGIF))
                        vgif = false;
                else
                        pr_info("Virtual GIF supported\n");
        }

        /* Must run after all module params above have been finalized. */
        svm_set_cpu_caps();

        /*
         * It seems that on AMD processors PTE's accessed bit is
         * being set by the CPU hardware before the NPF vmexit.
         * This is not expected behaviour and our tests fail because
         * of it.
         * A workaround here is to disable support for
         * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
         * In this case userspace can know if there is support using
         * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
         * it
         * If future AMD CPU models change the behaviour described above,
         * this variable can be changed accordingly
         */
        allow_smaller_maxphyaddr = !npt_enabled;

        return 0;

err:
        svm_hardware_teardown();
        return r;
}
1090
1091static void init_seg(struct vmcb_seg *seg)
1092{
1093        seg->selector = 0;
1094        seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
1095                      SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
1096        seg->limit = 0xffff;
1097        seg->base = 0;
1098}
1099
1100static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
1101{
1102        seg->selector = 0;
1103        seg->attrib = SVM_SELECTOR_P_MASK | type;
1104        seg->limit = 0xffff;
1105        seg->base = 0;
1106}
1107
1108static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
1109{
1110        struct vcpu_svm *svm = to_svm(vcpu);
1111
1112        return svm->nested.ctl.tsc_offset;
1113}
1114
1115static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
1116{
1117        return kvm_default_tsc_scaling_ratio;
1118}
1119
/*
 * Write the effective TSC offset into the active VMCB, keeping vmcb01
 * in sync with L1's offset.  Note the order: when vmcb01 is the active
 * VMCB both stores hit the same field and @offset must win.
 */
static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
        svm->vmcb->control.tsc_offset = offset;
        vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}
1128
1129static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
1130{
1131        wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
1132}
1133
1134/* Evaluate instruction intercepts that depend on guest CPUID features. */
1135static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
1136                                              struct vcpu_svm *svm)
1137{
1138        /*
1139         * Intercept INVPCID if shadow paging is enabled to sync/free shadow
1140         * roots, or if INVPCID is disabled in the guest to inject #UD.
1141         */
1142        if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
1143                if (!npt_enabled ||
1144                    !guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID))
1145                        svm_set_intercept(svm, INTERCEPT_INVPCID);
1146                else
1147                        svm_clr_intercept(svm, INTERCEPT_INVPCID);
1148        }
1149
1150        if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
1151                if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
1152                        svm_clr_intercept(svm, INTERCEPT_RDTSCP);
1153                else
1154                        svm_set_intercept(svm, INTERCEPT_RDTSCP);
1155        }
1156}
1157
/*
 * Program the active VMCB with KVM's baseline configuration: intercepts,
 * control fields, and the architectural reset state of the save area.
 * Called from vCPU creation and reset paths; ordering of the CR0/CR4/EFER
 * writes below matters because the setters consult each other's state.
 */
static void init_vmcb(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb_control_area *control = &svm->vmcb->control;
        struct vmcb_save_area *save = &svm->vmcb->save;

        vcpu->arch.hflags = 0;

        /* Intercept all CR0/CR3/CR4 accesses; CR3 is relaxed below for NPT. */
        svm_set_intercept(svm, INTERCEPT_CR0_READ);
        svm_set_intercept(svm, INTERCEPT_CR3_READ);
        svm_set_intercept(svm, INTERCEPT_CR4_READ);
        svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
        svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
        svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
        /* With AVIC active, CR8 (TPR) is virtualized by hardware. */
        if (!kvm_vcpu_apicv_active(vcpu))
                svm_set_intercept(svm, INTERCEPT_CR8_WRITE);

        set_dr_intercepts(svm);

        set_exception_intercept(svm, PF_VECTOR);
        set_exception_intercept(svm, UD_VECTOR);
        set_exception_intercept(svm, MC_VECTOR);
        set_exception_intercept(svm, AC_VECTOR);
        set_exception_intercept(svm, DB_VECTOR);
        /*
         * Guest access to VMware backdoor ports could legitimately
         * trigger #GP because of TSS I/O permission bitmap.
         * We intercept those #GP and allow access to them anyway
         * as VMware does.
         */
        if (enable_vmware_backdoor)
                set_exception_intercept(svm, GP_VECTOR);

        svm_set_intercept(svm, INTERCEPT_INTR);
        svm_set_intercept(svm, INTERCEPT_NMI);

        if (intercept_smi)
                svm_set_intercept(svm, INTERCEPT_SMI);

        svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
        svm_set_intercept(svm, INTERCEPT_RDPMC);
        svm_set_intercept(svm, INTERCEPT_CPUID);
        svm_set_intercept(svm, INTERCEPT_INVD);
        svm_set_intercept(svm, INTERCEPT_INVLPG);
        svm_set_intercept(svm, INTERCEPT_INVLPGA);
        svm_set_intercept(svm, INTERCEPT_IOIO_PROT);
        svm_set_intercept(svm, INTERCEPT_MSR_PROT);
        svm_set_intercept(svm, INTERCEPT_TASK_SWITCH);
        svm_set_intercept(svm, INTERCEPT_SHUTDOWN);
        svm_set_intercept(svm, INTERCEPT_VMRUN);
        svm_set_intercept(svm, INTERCEPT_VMMCALL);
        svm_set_intercept(svm, INTERCEPT_VMLOAD);
        svm_set_intercept(svm, INTERCEPT_VMSAVE);
        svm_set_intercept(svm, INTERCEPT_STGI);
        svm_set_intercept(svm, INTERCEPT_CLGI);
        svm_set_intercept(svm, INTERCEPT_SKINIT);
        svm_set_intercept(svm, INTERCEPT_WBINVD);
        svm_set_intercept(svm, INTERCEPT_XSETBV);
        svm_set_intercept(svm, INTERCEPT_RDPRU);
        svm_set_intercept(svm, INTERCEPT_RSM);

        if (!kvm_mwait_in_guest(vcpu->kvm)) {
                svm_set_intercept(svm, INTERCEPT_MONITOR);
                svm_set_intercept(svm, INTERCEPT_MWAIT);
        }

        if (!kvm_hlt_in_guest(vcpu->kvm))
                svm_set_intercept(svm, INTERCEPT_HLT);

        /* Physical addresses of the I/O and MSR permission bitmaps. */
        control->iopm_base_pa = __sme_set(iopm_base);
        control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
        control->int_ctl = V_INTR_MASKING_MASK;

        init_seg(&save->es);
        init_seg(&save->ss);
        init_seg(&save->ds);
        init_seg(&save->fs);
        init_seg(&save->gs);

        /* Architectural reset vector: CS:IP = f000:fff0. */
        save->cs.selector = 0xf000;
        save->cs.base = 0xffff0000;
        /* Executable/Readable Code Segment */
        save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
                SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
        save->cs.limit = 0xffff;

        save->gdtr.limit = 0xffff;
        save->idtr.limit = 0xffff;

        init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
        init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);

        svm_set_cr4(vcpu, 0);
        svm_set_efer(vcpu, 0);
        save->dr6 = 0xffff0ff0;
        kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
        save->rip = 0x0000fff0;
        vcpu->arch.regs[VCPU_REGS_RIP] = save->rip;

        /*
         * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
         * It also updates the guest-visible cr0 value.
         */
        svm_set_cr0(vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
        kvm_mmu_reset_context(vcpu);

        save->cr4 = X86_CR4_PAE;
        /* rdx = ?? */

        if (npt_enabled) {
                /* Setup VMCB for Nested Paging */
                control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
                /* With NPT the guest manages its own page tables. */
                svm_clr_intercept(svm, INTERCEPT_INVLPG);
                clr_exception_intercept(svm, PF_VECTOR);
                svm_clr_intercept(svm, INTERCEPT_CR3_READ);
                svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
                save->g_pat = vcpu->arch.pat;
                save->cr3 = 0;
                save->cr4 = 0;
        }
        /* Force a fresh ASID assignment on the next VMRUN. */
        svm->current_vmcb->asid_generation = 0;
        svm->asid = 0;

        svm->nested.vmcb12_gpa = INVALID_GPA;
        svm->nested.last_vmcb12_gpa = INVALID_GPA;
        /* NOTE(review): hflags is also cleared at function entry; redundant. */
        vcpu->arch.hflags = 0;

        if (!kvm_pause_in_guest(vcpu->kvm)) {
                control->pause_filter_count = pause_filter_count;
                if (pause_filter_thresh)
                        control->pause_filter_thresh = pause_filter_thresh;
                svm_set_intercept(svm, INTERCEPT_PAUSE);
        } else {
                svm_clr_intercept(svm, INTERCEPT_PAUSE);
        }

        svm_recalc_instruction_intercepts(vcpu, svm);

        /*
         * If the host supports V_SPEC_CTRL then disable the interception
         * of MSR_IA32_SPEC_CTRL.
         */
        if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
                set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);

        if (kvm_vcpu_apicv_active(vcpu))
                avic_init_vmcb(svm);

        /* With vGIF, STGI/CLGI run without exits; GIF lives in int_ctl. */
        if (vgif) {
                svm_clr_intercept(svm, INTERCEPT_STGI);
                svm_clr_intercept(svm, INTERCEPT_CLGI);
                svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
        }

        if (sev_guest(vcpu->kvm)) {
                svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
                /* #UD cannot be intercepted/emulated for encrypted guests. */
                clr_exception_intercept(svm, UD_VECTOR);

                if (sev_es_guest(vcpu->kvm)) {
                        /* Perform SEV-ES specific VMCB updates */
                        sev_es_init_vmcb(svm);
                }
        }

        svm_hv_init_vmcb(svm->vmcb);

        vmcb_mark_all_dirty(svm->vmcb);

        enable_gif(svm);

}
1329
/*
 * Reset the vCPU to architectural power-on/INIT state.  @init_event is
 * true for an INIT signal (which preserves the APIC base) and false for
 * the initial power-on reset.
 */
static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        u32 dummy;
        u32 eax = 1;

        svm->spec_ctrl = 0;
        svm->virt_spec_ctrl = 0;

        /* Only a full reset re-establishes the default APIC base / BSP bit. */
        if (!init_event) {
                vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE |
                                       MSR_IA32_APICBASE_ENABLE;
                if (kvm_vcpu_is_reset_bsp(vcpu))
                        vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
        }
        init_vmcb(vcpu);

        /* RDX at reset holds the family/model/stepping from CPUID.1:EAX. */
        kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, false);
        kvm_rdx_write(vcpu, eax);

        if (kvm_vcpu_apicv_active(vcpu) && !init_event)
                avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
}
1353
1354void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
1355{
1356        svm->current_vmcb = target_vmcb;
1357        svm->vmcb = target_vmcb->ptr;
1358}
1359
/*
 * Allocate and initialize the SVM-specific state for a new vCPU: the
 * vmcb01 page, an optional SEV-ES VMSA page, the MSR permission bitmap,
 * and AVIC backing.  Uses goto-based unwind on failure.
 */
static int svm_create_vcpu(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm;
        struct page *vmcb01_page;
        struct page *vmsa_page = NULL;
        int err;

        /* to_svm() relies on vcpu being the first member of vcpu_svm. */
        BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
        svm = to_svm(vcpu);

        err = -ENOMEM;
        vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
        if (!vmcb01_page)
                goto out;

        if (sev_es_guest(vcpu->kvm)) {
                /*
                 * SEV-ES guests require a separate VMSA page used to contain
                 * the encrypted register state of the guest.
                 */
                vmsa_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
                if (!vmsa_page)
                        goto error_free_vmcb_page;

                /*
                 * SEV-ES guests maintain an encrypted version of their FPU
                 * state which is restored and saved on VMRUN and VMEXIT.
                 * Free the fpu structure to prevent KVM from attempting to
                 * access the FPU state.
                 */
                kvm_free_guest_fpu(vcpu);
        }

        err = avic_init_vcpu(svm);
        if (err)
                goto error_free_vmsa_page;

        /* We initialize this flag to true to make sure that the is_running
         * bit would be set the first time the vcpu is loaded.
         */
        if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm))
                svm->avic_is_running = true;

        svm->msrpm = svm_vcpu_alloc_msrpm();
        if (!svm->msrpm) {
                err = -ENOMEM;
                goto error_free_vmsa_page;
        }

        svm->vmcb01.ptr = page_address(vmcb01_page);
        svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);

        if (vmsa_page)
                svm->vmsa = page_address(vmsa_page);

        svm->guest_state_loaded = false;

        /* vmcb01 becomes the active VMCB before it is programmed. */
        svm_switch_vmcb(svm, &svm->vmcb01);
        init_vmcb(vcpu);

        svm_vcpu_init_msrpm(vcpu, svm->msrpm);

        svm_init_osvw(vcpu);
        vcpu->arch.microcode_version = 0x01000065;

        if (sev_es_guest(vcpu->kvm))
                /* Perform SEV-ES specific VMCB creation updates */
                sev_es_create_vcpu(svm);

        return 0;

error_free_vmsa_page:
        if (vmsa_page)
                __free_page(vmsa_page);
error_free_vmcb_page:
        __free_page(vmcb01_page);
out:
        return err;
}
1439
1440static void svm_clear_current_vmcb(struct vmcb *vmcb)
1441{
1442        int i;
1443
1444        for_each_online_cpu(i)
1445                cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL);
1446}
1447
/* Tear down all SVM-specific vCPU state allocated in svm_create_vcpu(). */
static void svm_free_vcpu(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        /*
         * The vmcb page can be recycled, causing a false negative in
         * svm_vcpu_load(). So, ensure that no logical CPU has this
         * vmcb page recorded as its current vmcb.
         */
        svm_clear_current_vmcb(svm->vmcb);

        svm_free_nested(svm);

        sev_free_vcpu(vcpu);

        /* Strip the SME C-bit before converting back to a struct page. */
        __free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT));
        __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
}
1466
/*
 * Prepare for entering the guest: stash host state that VMRUN/VMEXIT or
 * a later host-save-area vmload will restore, and sync the TSC ratio and
 * TSC_AUX MSRs.  Skipped entirely if guest state is already loaded.
 */
static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);

        if (sev_es_guest(vcpu->kvm))
                sev_es_unmap_ghcb(svm);

        if (svm->guest_state_loaded)
                return;

        /*
         * Save additional host state that will be restored on VMEXIT (sev-es)
         * or subsequent vmload of host save area.
         */
        if (sev_es_guest(vcpu->kvm)) {
                sev_es_prepare_guest_switch(svm, vcpu->cpu);
        } else {
                vmsave(__sme_page_pa(sd->save_area));
        }

        /* Only write the ratio MSR when it differs from this pCPU's cache. */
        if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
                u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio;
                if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) {
                        __this_cpu_write(current_tsc_ratio, tsc_ratio);
                        wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio);
                }
        }

        if (likely(tsc_aux_uret_slot >= 0))
                kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);

        svm->guest_state_loaded = true;
}
1501
1502static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
1503{
1504        to_svm(vcpu)->guest_state_loaded = false;
1505}
1506
1507static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1508{
1509        struct vcpu_svm *svm = to_svm(vcpu);
1510        struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
1511
1512        if (sd->current_vmcb != svm->vmcb) {
1513                sd->current_vmcb = svm->vmcb;
1514                indirect_branch_prediction_barrier();
1515        }
1516        avic_vcpu_load(vcpu, cpu);
1517}
1518
1519static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1520{
1521        avic_vcpu_put(vcpu);
1522        svm_prepare_host_switch(vcpu);
1523
1524        ++vcpu->stat.host_state_reload;
1525}
1526
1527static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
1528{
1529        struct vcpu_svm *svm = to_svm(vcpu);
1530        unsigned long rflags = svm->vmcb->save.rflags;
1531
1532        if (svm->nmi_singlestep) {
1533                /* Hide our flags if they were not set by the guest */
1534                if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
1535                        rflags &= ~X86_EFLAGS_TF;
1536                if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
1537                        rflags &= ~X86_EFLAGS_RF;
1538        }
1539        return rflags;
1540}
1541
1542static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1543{
1544        if (to_svm(vcpu)->nmi_singlestep)
1545                rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
1546
1547       /*
1548        * Any change of EFLAGS.VM is accompanied by a reload of SS
1549        * (caused by either a task switch or an inter-privilege IRET),
1550        * so we do not need to update the CPL here.
1551        */
1552        to_svm(vcpu)->vmcb->save.rflags = rflags;
1553}
1554
1555static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1556{
1557        switch (reg) {
1558        case VCPU_EXREG_PDPTR:
1559                BUG_ON(!npt_enabled);
1560                load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
1561                break;
1562        default:
1563                WARN_ON_ONCE(1);
1564        }
1565}
1566
1567static void svm_set_vintr(struct vcpu_svm *svm)
1568{
1569        struct vmcb_control_area *control;
1570
1571        /*
1572         * The following fields are ignored when AVIC is enabled
1573         */
1574        WARN_ON(kvm_apicv_activated(svm->vcpu.kvm));
1575
1576        svm_set_intercept(svm, INTERCEPT_VINTR);
1577
1578        /*
1579         * This is just a dummy VINTR to actually cause a vmexit to happen.
1580         * Actual injection of virtual interrupts happens through EVENTINJ.
1581         */
1582        control = &svm->vmcb->control;
1583        control->int_vector = 0x0;
1584        control->int_ctl &= ~V_INTR_PRIO_MASK;
1585        control->int_ctl |= V_IRQ_MASK |
1586                ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
1587        vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
1588}
1589
/*
 * Retract the dummy VINTR installed by svm_set_vintr().  When L2 is
 * active, L1's vmcb01 must be scrubbed too, and any V_IRQ state L1
 * requested in vmcb12 is re-merged into the active VMCB.
 */
static void svm_clear_vintr(struct vcpu_svm *svm)
{
        svm_clr_intercept(svm, INTERCEPT_VINTR);

        /* Drop int_ctl fields related to VINTR injection.  */
        svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
        if (is_guest_mode(&svm->vcpu)) {
                svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;

                /* V_TPR is shared with vmcb12 and must already agree. */
                WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
                        (svm->nested.ctl.int_ctl & V_TPR_MASK));

                /* Restore L1's requested V_IRQ injection state. */
                svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl &
                        V_IRQ_INJECTION_BITS_MASK;
        }

        vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
}
1608
1609static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
1610{
1611        struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1612        struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save;
1613
1614        switch (seg) {
1615        case VCPU_SREG_CS: return &save->cs;
1616        case VCPU_SREG_DS: return &save->ds;
1617        case VCPU_SREG_ES: return &save->es;
1618        case VCPU_SREG_FS: return &save01->fs;
1619        case VCPU_SREG_GS: return &save01->gs;
1620        case VCPU_SREG_SS: return &save->ss;
1621        case VCPU_SREG_TR: return &save01->tr;
1622        case VCPU_SREG_LDTR: return &save01->ldtr;
1623        }
1624        BUG();
1625        return NULL;
1626}
1627
1628static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1629{
1630        struct vmcb_seg *s = svm_seg(vcpu, seg);
1631
1632        return s->base;
1633}
1634
1635static void svm_get_segment(struct kvm_vcpu *vcpu,
1636                            struct kvm_segment *var, int seg)
1637{
1638        struct vmcb_seg *s = svm_seg(vcpu, seg);
1639
1640        var->base = s->base;
1641        var->limit = s->limit;
1642        var->selector = s->selector;
1643        var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
1644        var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
1645        var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
1646        var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
1647        var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
1648        var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
1649        var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
1650
1651        /*
1652         * AMD CPUs circa 2014 track the G bit for all segments except CS.
1653         * However, the SVM spec states that the G bit is not observed by the
1654         * CPU, and some VMware virtual CPUs drop the G bit for all segments.
1655         * So let's synthesize a legal G bit for all segments, this helps
1656         * running KVM nested. It also helps cross-vendor migration, because
1657         * Intel's vmentry has a check on the 'G' bit.
1658         */
1659        var->g = s->limit > 0xfffff;
1660
1661        /*
1662         * AMD's VMCB does not have an explicit unusable field, so emulate it
1663         * for cross vendor migration purposes by "not present"
1664         */
1665        var->unusable = !var->present;
1666
1667        switch (seg) {
1668        case VCPU_SREG_TR:
1669                /*
1670                 * Work around a bug where the busy flag in the tr selector
1671                 * isn't exposed
1672                 */
1673                var->type |= 0x2;
1674                break;
1675        case VCPU_SREG_DS:
1676        case VCPU_SREG_ES:
1677        case VCPU_SREG_FS:
1678        case VCPU_SREG_GS:
1679                /*
1680                 * The accessed bit must always be set in the segment
1681                 * descriptor cache, although it can be cleared in the
1682                 * descriptor, the cached bit always remains at 1. Since
1683                 * Intel has a check on this, set it here to support
1684                 * cross-vendor migration.
1685                 */
1686                if (!var->unusable)
1687                        var->type |= 0x1;
1688                break;
1689        case VCPU_SREG_SS:
1690                /*
1691                 * On AMD CPUs sometimes the DB bit in the segment
1692                 * descriptor is left as 1, although the whole segment has
1693                 * been made unusable. Clear it here to pass an Intel VMX
1694                 * entry check when cross vendor migrating.
1695                 */
1696                if (var->unusable)
1697                        var->db = 0;
1698                /* This is symmetric with svm_set_segment() */
1699                var->dpl = to_svm(vcpu)->vmcb->save.cpl;
1700                break;
1701        }
1702}
1703
1704static int svm_get_cpl(struct kvm_vcpu *vcpu)
1705{
1706        struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1707
1708        return save->cpl;
1709}
1710
1711static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1712{
1713        struct vcpu_svm *svm = to_svm(vcpu);
1714
1715        dt->size = svm->vmcb->save.idtr.limit;
1716        dt->address = svm->vmcb->save.idtr.base;
1717}
1718
1719static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1720{
1721        struct vcpu_svm *svm = to_svm(vcpu);
1722
1723        svm->vmcb->save.idtr.limit = dt->size;
1724        svm->vmcb->save.idtr.base = dt->address ;
1725        vmcb_mark_dirty(svm->vmcb, VMCB_DT);
1726}
1727
1728static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1729{
1730        struct vcpu_svm *svm = to_svm(vcpu);
1731
1732        dt->size = svm->vmcb->save.gdtr.limit;
1733        dt->address = svm->vmcb->save.gdtr.base;
1734}
1735
1736static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1737{
1738        struct vcpu_svm *svm = to_svm(vcpu);
1739
1740        svm->vmcb->save.gdtr.limit = dt->size;
1741        svm->vmcb->save.gdtr.base = dt->address ;
1742        vmcb_mark_dirty(svm->vmcb, VMCB_DT);
1743}
1744
/*
 * Update guest CR0 and the hardware CR0 shadow in the VMCB.  Handles
 * EFER.LMA transitions when paging is toggled in long mode, and adjusts
 * CR0 read/write intercepts depending on whether the hardware value
 * diverges from the guest-visible one.
 */
void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        u64 hcr0 = cr0;

#ifdef CONFIG_X86_64
        /* Entering/leaving paged long mode toggles EFER.LMA (and LME in hw). */
        if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) {
                if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
                        vcpu->arch.efer |= EFER_LMA;
                        svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
                }

                if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
                        vcpu->arch.efer &= ~EFER_LMA;
                        svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
                }
        }
#endif
        vcpu->arch.cr0 = cr0;

        /* Shadow paging needs hardware paging + write protection always on. */
        if (!npt_enabled)
                hcr0 |= X86_CR0_PG | X86_CR0_WP;

        /*
         * re-enable caching here because the QEMU bios
         * does not do it - this results in some delay at
         * reboot
         */
        if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
                hcr0 &= ~(X86_CR0_CD | X86_CR0_NW);

        svm->vmcb->save.cr0 = hcr0;
        vmcb_mark_dirty(svm->vmcb, VMCB_CR);

        /*
         * SEV-ES guests must always keep the CR intercepts cleared. CR
         * tracking is done using the CR write traps.
         */
        if (sev_es_guest(vcpu->kvm))
                return;

        if (hcr0 == cr0) {
                /* Selective CR0 write remains on.  */
                svm_clr_intercept(svm, INTERCEPT_CR0_READ);
                svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
        } else {
                svm_set_intercept(svm, INTERCEPT_CR0_READ);
                svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
        }
}
1795
/* SVM imposes no extra CR4 restrictions beyond the common x86 checks. */
static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
        return true;
}
1800
/*
 * Propagate a guest CR4 write into the VMCB, flushing the TLB on PGE
 * changes under NPT and applying host-required fixups (PAE for shadow
 * paging, host MCE bit) to the hardware CR4 value.
 */
void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
        unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
        unsigned long old_cr4 = vcpu->arch.cr4;

        /* Toggling CR4.PGE requires flushing the TLB when NPT is in use. */
        if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
                svm_flush_tlb(vcpu);

        vcpu->arch.cr4 = cr4;
        if (!npt_enabled)
                cr4 |= X86_CR4_PAE;
        cr4 |= host_cr4_mce;
        to_svm(vcpu)->vmcb->save.cr4 = cr4;
        vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);

        /* OSXSAVE/PKE changes affect CPUID bits visible to the guest. */
        if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
                kvm_update_cpuid_runtime(vcpu);
}
1819
/*
 * Write a KVM segment descriptor into the corresponding VMCB segment,
 * packing the individual attribute bits into the VMCB attrib format.
 */
static void svm_set_segment(struct kvm_vcpu *vcpu,
                            struct kvm_segment *var, int seg)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb_seg *s = svm_seg(vcpu, seg);

        s->base = var->base;
        s->limit = var->limit;
        s->selector = var->selector;
        s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
        s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
        s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
        /* An unusable segment is reported as not-present. */
        s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
        s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
        s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
        s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
        s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;

        /*
         * This is always accurate, except if SYSRET returned to a segment
         * with SS.DPL != 3.  Intel does not have this quirk, and always
         * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
         * would entail passing the CPL to userspace and back.
         */
        if (seg == VCPU_SREG_SS)
                /* This is symmetric with svm_get_segment() */
                svm->vmcb->save.cpl = (var->dpl & 3);

        vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
}
1850
1851static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu)
1852{
1853        struct vcpu_svm *svm = to_svm(vcpu);
1854
1855        clr_exception_intercept(svm, BP_VECTOR);
1856
1857        if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
1858                if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
1859                        set_exception_intercept(svm, BP_VECTOR);
1860        }
1861}
1862
/*
 * Assign the vCPU a fresh ASID from this physical CPU's pool.  When the
 * pool is exhausted, bump the generation, restart at min_asid and force
 * a full TLB flush so stale translations from recycled ASIDs cannot leak.
 */
static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
{
        if (sd->next_asid > sd->max_asid) {
                ++sd->asid_generation;
                sd->next_asid = sd->min_asid;
                svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
                vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
        }

        svm->current_vmcb->asid_generation = sd->asid_generation;
        svm->asid = sd->next_asid++;
}
1875
/*
 * Update the VMCB's DR6 if it changed; skipped for protected guests
 * whose save area cannot be written by the host.
 */
static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
{
        struct vmcb *vmcb = svm->vmcb;

        if (svm->vcpu.arch.guest_state_protected)
                return;

        /* Only dirty the VMCB when the value actually changes. */
        if (unlikely(value != vmcb->save.dr6)) {
                vmcb->save.dr6 = value;
                vmcb_mark_dirty(vmcb, VMCB_DR);
        }
}
1888
/*
 * Re-read the hardware debug registers into vcpu->arch after the guest
 * ran with direct DR access, then re-enable DR intercepts.
 */
static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        if (vcpu->arch.guest_state_protected)
                return;

        get_debugreg(vcpu->arch.db[0], 0);
        get_debugreg(vcpu->arch.db[1], 1);
        get_debugreg(vcpu->arch.db[2], 2);
        get_debugreg(vcpu->arch.db[3], 3);
        /*
         * We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here,
         * because db_interception might need it.  We can do it before vmentry.
         */
        vcpu->arch.dr6 = svm->vmcb->save.dr6;
        vcpu->arch.dr7 = svm->vmcb->save.dr7;
        vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
        set_dr_intercepts(svm);
}
1909
1910static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
1911{
1912        struct vcpu_svm *svm = to_svm(vcpu);
1913
1914        if (vcpu->arch.guest_state_protected)
1915                return;
1916
1917        svm->vmcb->save.dr7 = value;
1918        vmcb_mark_dirty(svm->vmcb, VMCB_DR);
1919}
1920
/*
 * Handle a #PF intercept: forward fault address (exit_info_2) and error
 * code (exit_info_1) to the common page-fault handler, passing along the
 * pre-decoded instruction bytes when decode assists are available.
 */
static int pf_interception(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        u64 fault_address = svm->vmcb->control.exit_info_2;
        u64 error_code = svm->vmcb->control.exit_info_1;

        return kvm_handle_page_fault(vcpu, error_code, fault_address,
                        static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
                        svm->vmcb->control.insn_bytes : NULL,
                        svm->vmcb->control.insn_len);
}
1933
/*
 * Handle a nested (NPT) page-fault intercept: trace it and hand the
 * fault to the MMU, with decoded instruction bytes when available.
 */
static int npf_interception(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        u64 fault_address = svm->vmcb->control.exit_info_2;
        u64 error_code = svm->vmcb->control.exit_info_1;

        trace_kvm_page_fault(fault_address, error_code);
        return kvm_mmu_page_fault(vcpu, fault_address, error_code,
                        static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
                        svm->vmcb->control.insn_bytes : NULL,
                        svm->vmcb->control.insn_len);
}
1947
/*
 * Handle a #DB intercept.  Depending on who armed the debug exception,
 * either reflect it into the guest, finish NMI single-stepping, or exit
 * to userspace with the debug state.
 */
static int db_interception(struct kvm_vcpu *vcpu)
{
        struct kvm_run *kvm_run = vcpu->run;
        struct vcpu_svm *svm = to_svm(vcpu);

        /* Neither host debugging nor NMI single-step: it's the guest's #DB. */
        if (!(vcpu->guest_debug &
              (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
                !svm->nmi_singlestep) {
                /* Convert the VMCB's DR6 to the architectural payload form. */
                u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
                kvm_queue_exception_p(vcpu, DB_VECTOR, payload);
                return 1;
        }

        if (svm->nmi_singlestep) {
                disable_nmi_singlestep(svm);
                /* Make sure we check for pending NMIs upon entry */
                kvm_make_request(KVM_REQ_EVENT, vcpu);
        }

        /* Host-requested debugging: report the #DB to userspace. */
        if (vcpu->guest_debug &
            (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
                kvm_run->exit_reason = KVM_EXIT_DEBUG;
                kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
                kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7;
                kvm_run->debug.arch.pc =
                        svm->vmcb->save.cs.base + svm->vmcb->save.rip;
                kvm_run->debug.arch.exception = DB_VECTOR;
                return 0;
        }

        return 1;
}
1980
1981static int bp_interception(struct kvm_vcpu *vcpu)
1982{
1983        struct vcpu_svm *svm = to_svm(vcpu);
1984        struct kvm_run *kvm_run = vcpu->run;
1985
1986        kvm_run->exit_reason = KVM_EXIT_DEBUG;
1987        kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1988        kvm_run->debug.arch.exception = BP_VECTOR;
1989        return 0;
1990}
1991
/* Handle a #UD intercept via the common undefined-opcode handler. */
static int ud_interception(struct kvm_vcpu *vcpu)
{
        return handle_ud(vcpu);
}
1996
/* Handle an #AC intercept by reflecting it (error code 0) to the guest. */
static int ac_interception(struct kvm_vcpu *vcpu)
{
        kvm_queue_exception_e(vcpu, AC_VECTOR, 0);
        return 1;
}
2002
/*
 * Detect AMD erratum 383 by its MC0_STATUS signature and, if found,
 * clear the machine-check state and flush the TLB to recover.  Returns
 * true when the erratum was detected and cleaned up.
 */
static bool is_erratum_383(void)
{
        int err, i;
        u64 value;

        if (!erratum_383_found)
                return false;

        value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
        if (err)
                return false;

        /* Bit 62 may or may not be set for this mce */
        value &= ~(1ULL << 62);

        /* The erratum produces this exact MC0_STATUS signature. */
        if (value != 0xb600000000010015ULL)
                return false;

        /* Clear MCi_STATUS registers */
        for (i = 0; i < 6; ++i)
                native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);

        value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
        if (!err) {
                u32 low, high;

                /* Clear MCIP (bit 2) so the machine check is dismissed. */
                value &= ~(1ULL << 2);
                low    = lower_32_bits(value);
                high   = upper_32_bits(value);

                native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
        }

        /* Flush tlb to evict multi-match entries */
        __flush_tlb_all();

        return true;
}
2041
/*
 * Process a machine check that was intercepted while the guest ran:
 * either recover from erratum 383 (killing the guest) or forward the
 * #MC to the host handler.
 */
static void svm_handle_mce(struct kvm_vcpu *vcpu)
{
        if (is_erratum_383()) {
                /*
                 * Erratum 383 triggered. Guest state is corrupt so kill the
                 * guest.
                 */
                pr_err("KVM: Guest triggered AMD Erratum 383\n");

                kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);

                return;
        }

        /*
         * On an #MC intercept the MCE handler is not called automatically in
         * the host. So do it by hand here.
         */
        kvm_machine_check();
}
2062
/* #MC intercept: nothing to do here, the MCE is handled elsewhere. */
static int mc_interception(struct kvm_vcpu *vcpu)
{
        return 1;
}
2067
/*
 * Handle a SHUTDOWN intercept: reinitialize the (now undefined) VMCB
 * and report the shutdown to userspace.  SEV-ES guests cannot be
 * reinitialized, so they are terminated instead.
 */
static int shutdown_interception(struct kvm_vcpu *vcpu)
{
        struct kvm_run *kvm_run = vcpu->run;
        struct vcpu_svm *svm = to_svm(vcpu);

        /*
         * The VM save area has already been encrypted so it
         * cannot be reinitialized - just terminate.
         */
        if (sev_es_guest(vcpu->kvm))
                return -EINVAL;

        /*
         * VMCB is undefined after a SHUTDOWN intercept
         * so reinitialize it.
         */
        clear_page(svm->vmcb);
        init_vmcb(vcpu);

        kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
        return 0;
}
2090
/*
 * Handle an I/O instruction intercept.  The IOIO exit info encodes
 * direction, size, port and string-ness; string ops go through the
 * emulator (or the SEV-ES string I/O helper), plain ops use fast PIO.
 */
static int io_interception(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
        int size, in, string;
        unsigned port;

        ++vcpu->stat.io_exits;
        string = (io_info & SVM_IOIO_STR_MASK) != 0;
        in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
        port = io_info >> 16;
        size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;

        if (string) {
                if (sev_es_guest(vcpu->kvm))
                        return sev_es_string_io(svm, size, port, in);
                else
                        return kvm_emulate_instruction(vcpu, 0);
        }

        /* exit_info_2 holds the rIP of the instruction following the I/O. */
        svm->next_rip = svm->vmcb->control.exit_info_2;

        return kvm_fast_pio(vcpu, size, port, in);
}
2115
/* NMI intercept: the exit itself is sufficient, just resume the guest. */
static int nmi_interception(struct kvm_vcpu *vcpu)
{
        return 1;
}
2120
/* SMI intercept: nothing to do, resume the guest. */
static int smi_interception(struct kvm_vcpu *vcpu)
{
        return 1;
}
2125
/* External interrupt intercept: count it and resume the guest. */
static int intr_interception(struct kvm_vcpu *vcpu)
{
        ++vcpu->stat.irq_exits;
        return 1;
}
2131
/*
 * Common handler for VMLOAD/VMSAVE intercepts: map the vmcb12 page
 * addressed by guest RAX and copy the vmload/vmsave state in the
 * direction selected by @vmload.
 */
static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb *vmcb12;
        struct kvm_host_map map;
        int ret;

        if (nested_svm_check_permissions(vcpu))
                return 1;

        ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
        if (ret) {
                /* An unmappable GPA is the guest's fault: inject #GP. */
                if (ret == -EINVAL)
                        kvm_inject_gp(vcpu, 0);
                return 1;
        }

        vmcb12 = map.hva;

        ret = kvm_skip_emulated_instruction(vcpu);

        if (vmload) {
                svm_copy_vmloadsave_state(svm->vmcb, vmcb12);
                /* VMLOAD replaces the full 64-bit SYSENTER MSR values. */
                svm->sysenter_eip_hi = 0;
                svm->sysenter_esp_hi = 0;
        } else {
                svm_copy_vmloadsave_state(vmcb12, svm->vmcb);
        }

        kvm_vcpu_unmap(vcpu, &map, true);

        return ret;
}
2165
/* VMLOAD intercept: copy state from vmcb12 into the current VMCB. */
static int vmload_interception(struct kvm_vcpu *vcpu)
{
        return vmload_vmsave_interception(vcpu, true);
}
2170
/* VMSAVE intercept: copy state from the current VMCB into vmcb12. */
static int vmsave_interception(struct kvm_vcpu *vcpu)
{
        return vmload_vmsave_interception(vcpu, false);
}
2175
/* VMRUN intercept: after permission checks, enter the nested guest. */
static int vmrun_interception(struct kvm_vcpu *vcpu)
{
        if (nested_svm_check_permissions(vcpu))
                return 1;

        return nested_svm_vmrun(vcpu);
}
2183
/* Decoded SVM instruction identifiers used by the #GP erratum workaround. */
enum {
        NONE_SVM_INSTR,
        SVM_INSTR_VMRUN,
        SVM_INSTR_VMLOAD,
        SVM_INSTR_VMSAVE,
};
2190
2191/* Return NONE_SVM_INSTR if not SVM instrs, otherwise return decode result */
2192static int svm_instr_opcode(struct kvm_vcpu *vcpu)
2193{
2194        struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
2195
2196        if (ctxt->b != 0x1 || ctxt->opcode_len != 2)
2197                return NONE_SVM_INSTR;
2198
2199        switch (ctxt->modrm) {
2200        case 0xd8: /* VMRUN */
2201                return SVM_INSTR_VMRUN;
2202        case 0xda: /* VMLOAD */
2203                return SVM_INSTR_VMLOAD;
2204        case 0xdb: /* VMSAVE */
2205                return SVM_INSTR_VMSAVE;
2206        default:
2207                break;
2208        }
2209
2210        return NONE_SVM_INSTR;
2211}
2212
/*
 * Emulate a decoded SVM instruction (VMRUN/VMLOAD/VMSAVE).  In guest
 * mode the instruction becomes a synthesized vmexit to L1; otherwise
 * the corresponding L0 interception handler runs.
 */
static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
{
        const int guest_mode_exit_codes[] = {
                [SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN,
                [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
                [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
        };
        int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = {
                [SVM_INSTR_VMRUN] = vmrun_interception,
                [SVM_INSTR_VMLOAD] = vmload_interception,
                [SVM_INSTR_VMSAVE] = vmsave_interception,
        };
        struct vcpu_svm *svm = to_svm(vcpu);
        int ret;

        if (is_guest_mode(vcpu)) {
                /* Returns '1' or -errno on failure, '0' on success. */
                ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]);
                if (ret)
                        return ret;
                return 1;
        }
        return svm_instr_handlers[opcode](vcpu);
}
2237
/*
 * #GP handling code. Note that #GP can be triggered under the following two
 * cases:
 *   1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP on
 *      some AMD CPUs when EAX of these instructions are in the reserved memory
 *      regions (e.g. SMM memory on host).
 *   2) VMware backdoor
 */
static int gp_interception(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        u32 error_code = svm->vmcb->control.exit_info_1;
        int opcode;

        /* Both #GP cases have zero error_code */
        if (error_code)
                goto reinject;

        /* Decode the instruction for usage later */
        if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
                goto reinject;

        opcode = svm_instr_opcode(vcpu);

        if (opcode == NONE_SVM_INSTR) {
                if (!enable_vmware_backdoor)
                        goto reinject;

                /*
                 * VMware backdoor emulation on #GP interception only handles
                 * IN{S}, OUT{S}, and RDPMC.
                 */
                if (!is_guest_mode(vcpu))
                        return kvm_emulate_instruction(vcpu,
                                EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
        } else
                return emulate_svm_instr(vcpu, opcode);

reinject:
        /* Not one of the workaround cases: deliver the #GP to the guest. */
        kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
        return 1;
}
2280
/*
 * Set or clear the Global Interrupt Flag for the vCPU, adjusting the
 * STGI/VINTR intercepts and re-evaluating pending events when the
 * interrupt window opens.
 */
void svm_set_gif(struct vcpu_svm *svm, bool value)
{
        if (value) {
                /*
                 * If VGIF is enabled, the STGI intercept is only added to
                 * detect the opening of the SMI/NMI window; remove it now.
                 * Likewise, clear the VINTR intercept, we will set it
                 * again while processing KVM_REQ_EVENT if needed.
                 */
                if (vgif_enabled(svm))
                        svm_clr_intercept(svm, INTERCEPT_STGI);
                if (svm_is_intercept(svm, INTERCEPT_VINTR))
                        svm_clear_vintr(svm);

                enable_gif(svm);
                /* GIF just opened: anything pending must be re-injected. */
                if (svm->vcpu.arch.smi_pending ||
                    svm->vcpu.arch.nmi_pending ||
                    kvm_cpu_has_injectable_intr(&svm->vcpu))
                        kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
        } else {
                disable_gif(svm);

                /*
                 * After a CLGI no interrupts should come.  But if vGIF is
                 * in use, we still rely on the VINTR intercept (rather than
                 * STGI) to detect an open interrupt window.
                */
                if (!vgif_enabled(svm))
                        svm_clear_vintr(svm);
        }
}
2312
/* STGI intercept: skip the instruction and set GIF for the guest. */
static int stgi_interception(struct kvm_vcpu *vcpu)
{
        int ret;

        if (nested_svm_check_permissions(vcpu))
                return 1;

        ret = kvm_skip_emulated_instruction(vcpu);
        svm_set_gif(to_svm(vcpu), true);
        return ret;
}
2324
/* CLGI intercept: skip the instruction and clear GIF for the guest. */
static int clgi_interception(struct kvm_vcpu *vcpu)
{
        int ret;

        if (nested_svm_check_permissions(vcpu))
                return 1;

        ret = kvm_skip_emulated_instruction(vcpu);
        svm_set_gif(to_svm(vcpu), false);
        return ret;
}
2336
/*
 * INVLPGA intercept: the virtual address comes from RAX, the ASID from
 * RCX.  The ASID is currently ignored and the flush handled like INVLPG.
 */
static int invlpga_interception(struct kvm_vcpu *vcpu)
{
        gva_t gva = kvm_rax_read(vcpu);
        u32 asid = kvm_rcx_read(vcpu);

        /* FIXME: Handle an address size prefix. */
        if (!is_long_mode(vcpu))
                gva = (u32)gva;

        trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, asid, gva);

        /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
        kvm_mmu_invlpg(vcpu, gva);

        return kvm_skip_emulated_instruction(vcpu);
}
2353
/* SKINIT intercept: not supported, inject #UD after tracing. */
static int skinit_interception(struct kvm_vcpu *vcpu)
{
        trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu));

        kvm_queue_exception(vcpu, UD_VECTOR);
        return 1;
}
2361
/*
 * Handle a task-switch intercept: decode the switch reason and any
 * event (NMI/exception/interrupt) that triggered it from exit_int_info,
 * clear the now-consumed pending event, and emulate the switch.
 */
static int task_switch_interception(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        u16 tss_selector;
        int reason;
        int int_type = svm->vmcb->control.exit_int_info &
                SVM_EXITINTINFO_TYPE_MASK;
        int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
        uint32_t type =
                svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
        uint32_t idt_v =
                svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
        bool has_error_code = false;
        u32 error_code = 0;

        /* exit_info_1 carries the destination TSS selector. */
        tss_selector = (u16)svm->vmcb->control.exit_info_1;

        /* exit_info_2 flags distinguish IRET and far-JMP task switches. */
        if (svm->vmcb->control.exit_info_2 &
            (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
                reason = TASK_SWITCH_IRET;
        else if (svm->vmcb->control.exit_info_2 &
                 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
                reason = TASK_SWITCH_JMP;
        else if (idt_v)
                reason = TASK_SWITCH_GATE;
        else
                reason = TASK_SWITCH_CALL;

        /* A gate-initiated switch consumed the pending event: drop it. */
        if (reason == TASK_SWITCH_GATE) {
                switch (type) {
                case SVM_EXITINTINFO_TYPE_NMI:
                        vcpu->arch.nmi_injected = false;
                        break;
                case SVM_EXITINTINFO_TYPE_EXEPT:
                        if (svm->vmcb->control.exit_info_2 &
                            (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
                                has_error_code = true;
                                error_code =
                                        (u32)svm->vmcb->control.exit_info_2;
                        }
                        kvm_clear_exception_queue(vcpu);
                        break;
                case SVM_EXITINTINFO_TYPE_INTR:
                        kvm_clear_interrupt_queue(vcpu);
                        break;
                default:
                        break;
                }
        }

        /*
         * Skip the instruction unless the switch was triggered by a
         * hardware event delivered through a gate (RIP already points
         * at the right place in that case).
         */
        if (reason != TASK_SWITCH_GATE ||
            int_type == SVM_EXITINTINFO_TYPE_SOFT ||
            (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
             (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
                if (!skip_emulated_instruction(vcpu))
                        return 0;
        }

        if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
                int_vec = -1;

        return kvm_task_switch(vcpu, tss_selector, int_vec, reason,
                               has_error_code, error_code);
}
2426
/*
 * IRET intercept: the guest is completing its NMI handler, so the NMI
 * window is about to open.  Record that, drop the IRET intercept (not
 * possible for SEV-ES), and re-evaluate pending events.
 */
static int iret_interception(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        ++vcpu->stat.nmi_window_exits;
        vcpu->arch.hflags |= HF_IRET_MASK;
        if (!sev_es_guest(vcpu->kvm)) {
                svm_clr_intercept(svm, INTERCEPT_IRET);
                /* Remember where the IRET is; NMI unblocks once past it. */
                svm->nmi_iret_rip = kvm_rip_read(vcpu);
        }
        kvm_make_request(KVM_REQ_EVENT, vcpu);
        return 1;
}
2440
/*
 * INVLPG intercept: with decode assists the faulting linear address is
 * in exit_info_1; otherwise fall back to full instruction emulation.
 */
static int invlpg_interception(struct kvm_vcpu *vcpu)
{
        if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
                return kvm_emulate_instruction(vcpu, 0);

        kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1);
        return kvm_skip_emulated_instruction(vcpu);
}
2449
/* Generic fallback: emulate the intercepted instruction. */
static int emulate_on_interception(struct kvm_vcpu *vcpu)
{
        return kvm_emulate_instruction(vcpu, 0);
}
2454
/* RSM intercept: emulate using the canned RSM instruction bytes. */
static int rsm_interception(struct kvm_vcpu *vcpu)
{
        return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2);
}
2459
/*
 * For a nested guest, check whether a CR0 write hits L1's selective-CR0
 * intercept (i.e. changes bits outside SVM_CR0_SELECTIVE_MASK) and, if
 * so, synthesize the corresponding vmexit.  Returns true when the write
 * was handled by exiting to L1.
 */
static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
                                            unsigned long val)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        unsigned long cr0 = vcpu->arch.cr0;
        bool ret = false;

        if (!is_guest_mode(vcpu) ||
            (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
                return false;

        /* Only bits outside the selective mask trigger the intercept. */
        cr0 &= ~SVM_CR0_SELECTIVE_MASK;
        val &= ~SVM_CR0_SELECTIVE_MASK;

        if (cr0 ^ val) {
                svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
                ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
        }

        return ret;
}
2481
/* Decode-assist valid bit in exit_info_1 for CR intercepts. */
#define CR_VALID (1ULL << 63)

/*
 * Handle a CR read/write intercept using decode-assist information:
 * the exit code selects the CR and direction, exit_info_1 names the
 * GPR.  Falls back to full emulation without decode assists.
 */
static int cr_interception(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        int reg, cr;
        unsigned long val;
        int err;

        if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
                return emulate_on_interception(vcpu);

        if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
                return emulate_on_interception(vcpu);

        reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
        if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
                cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
        else
                cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;

        err = 0;
        if (cr >= 16) { /* mov to cr */
                cr -= 16;
                val = kvm_register_read(vcpu, reg);
                trace_kvm_cr_write(cr, val);
                switch (cr) {
                case 0:
                        if (!check_selective_cr0_intercepted(vcpu, val))
                                err = kvm_set_cr0(vcpu, val);
                        else
                                return 1;

                        break;
                case 3:
                        err = kvm_set_cr3(vcpu, val);
                        break;
                case 4:
                        err = kvm_set_cr4(vcpu, val);
                        break;
                case 8:
                        err = kvm_set_cr8(vcpu, val);
                        break;
                default:
                        WARN(1, "unhandled write to CR%d", cr);
                        kvm_queue_exception(vcpu, UD_VECTOR);
                        return 1;
                }
        } else { /* mov from cr */
                switch (cr) {
                case 0:
                        val = kvm_read_cr0(vcpu);
                        break;
                case 2:
                        val = vcpu->arch.cr2;
                        break;
                case 3:
                        val = kvm_read_cr3(vcpu);
                        break;
                case 4:
                        val = kvm_read_cr4(vcpu);
                        break;
                case 8:
                        val = kvm_get_cr8(vcpu);
                        break;
                default:
                        WARN(1, "unhandled read from CR%d", cr);
                        kvm_queue_exception(vcpu, UD_VECTOR);
                        return 1;
                }
                kvm_register_write(vcpu, reg, val);
                trace_kvm_cr_read(cr, val);
        }
        return kvm_complete_insn_gp(vcpu, err);
}
2557
/*
 * Handle a CR write trap (used by SEV-ES guests in place of CR
 * intercepts): the new CR value arrives in exit_info_1 and the CR
 * number is derived from the exit code.
 */
static int cr_trap(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        unsigned long old_value, new_value;
        unsigned int cr;
        int ret = 0;

        new_value = (unsigned long)svm->vmcb->control.exit_info_1;

        cr = svm->vmcb->control.exit_code - SVM_EXIT_CR0_WRITE_TRAP;
        switch (cr) {
        case 0:
                old_value = kvm_read_cr0(vcpu);
                svm_set_cr0(vcpu, new_value);

                kvm_post_set_cr0(vcpu, old_value, new_value);
                break;
        case 4:
                old_value = kvm_read_cr4(vcpu);
                svm_set_cr4(vcpu, new_value);

                kvm_post_set_cr4(vcpu, old_value, new_value);
                break;
        case 8:
                ret = kvm_set_cr8(vcpu, new_value);
                break;
        default:
                WARN(1, "unhandled CR%d write trap", cr);
                kvm_queue_exception(vcpu, UD_VECTOR);
                return 1;
        }

        return kvm_complete_insn_gp(vcpu, ret);
}
2592
/*
 * Handle a DR access intercept.  When userspace isn't debugging the
 * guest, give the guest direct DR access instead; otherwise move the
 * value between the named GPR and debug register.
 */
static int dr_interception(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        int reg, dr;
        unsigned long val;
        int err = 0;

        if (vcpu->guest_debug == 0) {
                /*
                 * No more DR vmexits; force a reload of the debug registers
                 * and reenter on this instruction.  The next vmexit will
                 * retrieve the full state of the debug registers.
                 */
                clr_dr_intercepts(svm);
                vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
                return 1;
        }

        if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
                return emulate_on_interception(vcpu);

        reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
        dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
        if (dr >= 16) { /* mov to DRn  */
                dr -= 16;
                val = kvm_register_read(vcpu, reg);
                err = kvm_set_dr(vcpu, dr, val);
        } else {
                kvm_get_dr(vcpu, dr, &val);
                kvm_register_write(vcpu, reg, val);
        }

        return kvm_complete_insn_gp(vcpu, err);
}
2627
/*
 * Handle a CR8 (TPR) write intercept.  With a userspace local APIC, a
 * TPR drop may unmask an interrupt, so exit to userspace in that case.
 */
static int cr8_write_interception(struct kvm_vcpu *vcpu)
{
        int r;

        u8 cr8_prev = kvm_get_cr8(vcpu);
        /* instruction emulation calls kvm_set_cr8() */
        r = cr_interception(vcpu);
        if (lapic_in_kernel(vcpu))
                return r;
        if (cr8_prev <= kvm_get_cr8(vcpu))
                return r;
        vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
        return 0;
}
2642
/*
 * Handle an EFER write trap (SEV-ES): the new EFER value arrives in
 * exit_info_1 and is applied through the common MSR write path.
 */
static int efer_trap(struct kvm_vcpu *vcpu)
{
        struct msr_data msr_info;
        int ret;

        /*
         * Clear the EFER_SVME bit from EFER. The SVM code always sets this
         * bit in svm_set_efer(), but __kvm_valid_efer() checks it against
         * whether the guest has X86_FEATURE_SVM - this avoids a failure if
         * the guest doesn't have X86_FEATURE_SVM.
         */
        msr_info.host_initiated = false;
        msr_info.index = MSR_EFER;
        msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME;
        ret = kvm_set_msr_common(vcpu, &msr_info);

        return kvm_complete_insn_gp(vcpu, ret);
}
2661
2662static int svm_get_msr_feature(struct kvm_msr_entry *msr)
2663{
2664        msr->data = 0;
2665
2666        switch (msr->index) {
2667        case MSR_F10H_DECFG:
2668                if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
2669                        msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE;
2670                break;
2671        case MSR_IA32_PERF_CAPABILITIES:
2672                return 0;
2673        default:
2674                return KVM_MSR_RET_INVALID;
2675        }
2676
2677        return 0;
2678}
2679
/*
 * Read an SVM-managed MSR into msr_info->data.  Returns 0 on success and
 * non-zero to fail the access; MSRs not handled here fall through to
 * kvm_get_msr_common().
 */
static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        switch (msr_info->index) {
        /* Syscall/sysenter state is read from vmcb01 (the L1 VMCB). */
        case MSR_STAR:
                msr_info->data = svm->vmcb01.ptr->save.star;
                break;
#ifdef CONFIG_X86_64
        case MSR_LSTAR:
                msr_info->data = svm->vmcb01.ptr->save.lstar;
                break;
        case MSR_CSTAR:
                msr_info->data = svm->vmcb01.ptr->save.cstar;
                break;
        case MSR_KERNEL_GS_BASE:
                msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base;
                break;
        case MSR_SYSCALL_MASK:
                msr_info->data = svm->vmcb01.ptr->save.sfmask;
                break;
#endif
        case MSR_IA32_SYSENTER_CS:
                msr_info->data = svm->vmcb01.ptr->save.sysenter_cs;
                break;
        case MSR_IA32_SYSENTER_EIP:
                /*
                 * The VMCB holds only the low 32 bits; when spoofing an
                 * Intel vendor ID the high bits are tracked in software
                 * (see the matching logic in svm_set_msr()).
                 */
                msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip;
                if (guest_cpuid_is_intel(vcpu))
                        msr_info->data |= (u64)svm->sysenter_eip_hi << 32;
                break;
        case MSR_IA32_SYSENTER_ESP:
                msr_info->data = svm->vmcb01.ptr->save.sysenter_esp;
                if (guest_cpuid_is_intel(vcpu))
                        msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
                break;
        case MSR_TSC_AUX:
                /* TSC_AUX is intercepted and shadowed, see svm_set_msr(). */
                msr_info->data = svm->tsc_aux;
                break;
        /*
         * DEBUGCTL and the last-branch/last-exception records are read
         * straight from the current VMCB's save area.
         */
        case MSR_IA32_DEBUGCTLMSR:
                msr_info->data = svm->vmcb->save.dbgctl;
                break;
        case MSR_IA32_LASTBRANCHFROMIP:
                msr_info->data = svm->vmcb->save.br_from;
                break;
        case MSR_IA32_LASTBRANCHTOIP:
                msr_info->data = svm->vmcb->save.br_to;
                break;
        case MSR_IA32_LASTINTFROMIP:
                msr_info->data = svm->vmcb->save.last_excp_from;
                break;
        case MSR_IA32_LASTINTTOIP:
                msr_info->data = svm->vmcb->save.last_excp_to;
                break;
        case MSR_VM_HSAVE_PA:
                msr_info->data = svm->nested.hsave_msr;
                break;
        case MSR_VM_CR:
                msr_info->data = svm->nested.vm_cr_msr;
                break;
        case MSR_IA32_SPEC_CTRL:
                if (!msr_info->host_initiated &&
                    !guest_has_spec_ctrl_msr(vcpu))
                        return 1;

                /*
                 * With hardware-virtualized SPEC_CTRL the value lives in
                 * the VMCB save area; otherwise KVM shadows it in software.
                 */
                if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
                        msr_info->data = svm->vmcb->save.spec_ctrl;
                else
                        msr_info->data = svm->spec_ctrl;
                break;
        case MSR_AMD64_VIRT_SPEC_CTRL:
                if (!msr_info->host_initiated &&
                    !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
                        return 1;

                msr_info->data = svm->virt_spec_ctrl;
                break;
        case MSR_F15H_IC_CFG: {

                int family, model;

                family = guest_cpuid_family(vcpu);
                model  = guest_cpuid_model(vcpu);

                if (family < 0 || model < 0)
                        return kvm_get_msr_common(vcpu, msr_info);

                msr_info->data = 0;

                /*
                 * Report the "disable instruction cache way access" value
                 * for the affected family 15h models.
                 */
                if (family == 0x15 &&
                    (model >= 0x2 && model < 0x20))
                        msr_info->data = 0x1E;
                }
                break;
        case MSR_F10H_DECFG:
                msr_info->data = svm->msr_decfg;
                break;
        default:
                return kvm_get_msr_common(vcpu, msr_info);
        }
        return 0;
}
2786
2787static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
2788{
2789        struct vcpu_svm *svm = to_svm(vcpu);
2790        if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->ghcb))
2791                return kvm_complete_insn_gp(vcpu, err);
2792
2793        ghcb_set_sw_exit_info_1(svm->ghcb, 1);
2794        ghcb_set_sw_exit_info_2(svm->ghcb,
2795                                X86_TRAP_GP |
2796                                SVM_EVTINJ_TYPE_EXEPT |
2797                                SVM_EVTINJ_VALID);
2798        return 1;
2799}
2800
2801static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
2802{
2803        struct vcpu_svm *svm = to_svm(vcpu);
2804        int svm_dis, chg_mask;
2805
2806        if (data & ~SVM_VM_CR_VALID_MASK)
2807                return 1;
2808
2809        chg_mask = SVM_VM_CR_VALID_MASK;
2810
2811        if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
2812                chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
2813
2814        svm->nested.vm_cr_msr &= ~chg_mask;
2815        svm->nested.vm_cr_msr |= (data & chg_mask);
2816
2817        svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
2818
2819        /* check for svm_disable while efer.svme is set */
2820        if (svm_dis && (vcpu->arch.efer & EFER_SVME))
2821                return 1;
2822
2823        return 0;
2824}
2825
/*
 * Write an SVM-managed MSR.  Returns 0 on success and non-zero to fail the
 * access (the caller injects #GP); MSRs not handled here fall through to
 * kvm_set_msr_common().
 */
static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        int r;

        u32 ecx = msr->index;
        u64 data = msr->data;
        switch (ecx) {
        case MSR_IA32_CR_PAT:
                if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
                        return 1;
                vcpu->arch.pat = data;
                svm->vmcb01.ptr->save.g_pat = data;
                /* L2's effective PAT is derived from L1's, recompute it. */
                if (is_guest_mode(vcpu))
                        nested_vmcb02_compute_g_pat(svm);
                vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
                break;
        case MSR_IA32_SPEC_CTRL:
                if (!msr->host_initiated &&
                    !guest_has_spec_ctrl_msr(vcpu))
                        return 1;

                if (kvm_spec_ctrl_test_value(data))
                        return 1;

                if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
                        svm->vmcb->save.spec_ctrl = data;
                else
                        svm->spec_ctrl = data;
                if (!data)
                        break;

                /*
                 * For non-nested:
                 * When it's written (to non-zero) for the first time, pass
                 * it through.
                 *
                 * For nested:
                 * The handling of the MSR bitmap for L2 guests is done in
                 * nested_svm_vmrun_msrpm.
                 * We update the L1 MSR bit as well since it will end up
                 * touching the MSR anyway now.
                 */
                set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
                break;
        case MSR_IA32_PRED_CMD:
                if (!msr->host_initiated &&
                    !guest_has_pred_cmd_msr(vcpu))
                        return 1;

                /* Only the IBPB command bit is defined. */
                if (data & ~PRED_CMD_IBPB)
                        return 1;
                if (!boot_cpu_has(X86_FEATURE_IBPB))
                        return 1;
                if (!data)
                        break;

                /* Issue the barrier now, then stop intercepting writes. */
                wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
                set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
                break;
        case MSR_AMD64_VIRT_SPEC_CTRL:
                if (!msr->host_initiated &&
                    !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
                        return 1;

                if (data & ~SPEC_CTRL_SSBD)
                        return 1;

                svm->virt_spec_ctrl = data;
                break;
        /* Syscall/sysenter state is kept in vmcb01 (the L1 VMCB). */
        case MSR_STAR:
                svm->vmcb01.ptr->save.star = data;
                break;
#ifdef CONFIG_X86_64
        case MSR_LSTAR:
                svm->vmcb01.ptr->save.lstar = data;
                break;
        case MSR_CSTAR:
                svm->vmcb01.ptr->save.cstar = data;
                break;
        case MSR_KERNEL_GS_BASE:
                svm->vmcb01.ptr->save.kernel_gs_base = data;
                break;
        case MSR_SYSCALL_MASK:
                svm->vmcb01.ptr->save.sfmask = data;
                break;
#endif
        case MSR_IA32_SYSENTER_CS:
                svm->vmcb01.ptr->save.sysenter_cs = data;
                break;
        case MSR_IA32_SYSENTER_EIP:
                svm->vmcb01.ptr->save.sysenter_eip = (u32)data;
                /*
                 * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} msrs
                 * when we spoof an Intel vendor ID (for cross vendor migration).
                 * In this case we use this intercept to track the high
                 * 32 bit part of these msrs to support Intel's
                 * implementation of SYSENTER/SYSEXIT.
                 */
                svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
                break;
        case MSR_IA32_SYSENTER_ESP:
                svm->vmcb01.ptr->save.sysenter_esp = (u32)data;
                svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
                break;
        case MSR_TSC_AUX:
                /*
                 * TSC_AUX is usually changed only during boot and never read
                 * directly.  Intercept TSC_AUX instead of exposing it to the
                 * guest via direct_access_msrs, and switch it via user return.
                 */
                preempt_disable();
                r = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
                preempt_enable();
                if (r)
                        return 1;

                svm->tsc_aux = data;
                break;
        case MSR_IA32_DEBUGCTLMSR:
                if (!boot_cpu_has(X86_FEATURE_LBRV)) {
                        vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
                                    __func__, data);
                        break;
                }
                if (data & DEBUGCTL_RESERVED_BITS)
                        return 1;

                svm->vmcb->save.dbgctl = data;
                vmcb_mark_dirty(svm->vmcb, VMCB_LBR);
                /* Bit 0 is DEBUGCTL.LBR: toggle LBR virtualization. */
                if (data & (1ULL<<0))
                        svm_enable_lbrv(vcpu);
                else
                        svm_disable_lbrv(vcpu);
                break;
        case MSR_VM_HSAVE_PA:
                /*
                 * Old kernels did not validate the value written to
                 * MSR_VM_HSAVE_PA.  Allow KVM_SET_MSR to set an invalid
                 * value to allow live migrating buggy or malicious guests
                 * originating from those kernels.
                 */
                if (!msr->host_initiated && !page_address_valid(vcpu, data))
                        return 1;

                svm->nested.hsave_msr = data & PAGE_MASK;
                break;
        case MSR_VM_CR:
                return svm_set_vm_cr(vcpu, data);
        case MSR_VM_IGNNE:
                vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
                break;
        case MSR_F10H_DECFG: {
                /* Validate against the feature-MSR value reported by KVM. */
                struct kvm_msr_entry msr_entry;

                msr_entry.index = msr->index;
                if (svm_get_msr_feature(&msr_entry))
                        return 1;

                /* Check the supported bits */
                if (data & ~msr_entry.data)
                        return 1;

                /* Don't allow the guest to change a bit, #GP */
                if (!msr->host_initiated && (data ^ msr_entry.data))
                        return 1;

                svm->msr_decfg = data;
                break;
        }
        case MSR_IA32_APICBASE:
                if (kvm_vcpu_apicv_active(vcpu))
                        avic_update_vapic_bar(to_svm(vcpu), data);
                fallthrough;
        default:
                return kvm_set_msr_common(vcpu, msr);
        }
        return 0;
}
3005
3006static int msr_interception(struct kvm_vcpu *vcpu)
3007{
3008        if (to_svm(vcpu)->vmcb->control.exit_info_1)
3009                return kvm_emulate_wrmsr(vcpu);
3010        else
3011                return kvm_emulate_rdmsr(vcpu);
3012}
3013
3014static int interrupt_window_interception(struct kvm_vcpu *vcpu)
3015{
3016        kvm_make_request(KVM_REQ_EVENT, vcpu);
3017        svm_clear_vintr(to_svm(vcpu));
3018
3019        /*
3020         * For AVIC, the only reason to end up here is ExtINTs.
3021         * In this case AVIC was temporarily disabled for
3022         * requesting the IRQ window and we have to re-enable it.
3023         */
3024        svm_toggle_avic_for_irq_window(vcpu, true);
3025
3026        ++vcpu->stat.irq_window_exits;
3027        return 1;
3028}
3029
3030static int pause_interception(struct kvm_vcpu *vcpu)
3031{
3032        bool in_kernel;
3033
3034        /*
3035         * CPL is not made available for an SEV-ES guest, therefore
3036         * vcpu->arch.preempted_in_kernel can never be true.  Just
3037         * set in_kernel to false as well.
3038         */
3039        in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0;
3040
3041        if (!kvm_pause_in_guest(vcpu->kvm))
3042                grow_ple_window(vcpu);
3043
3044        kvm_vcpu_on_spin(vcpu, in_kernel);
3045        return kvm_skip_emulated_instruction(vcpu);
3046}
3047
3048static int invpcid_interception(struct kvm_vcpu *vcpu)
3049{
3050        struct vcpu_svm *svm = to_svm(vcpu);
3051        unsigned long type;
3052        gva_t gva;
3053
3054        if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
3055                kvm_queue_exception(vcpu, UD_VECTOR);
3056                return 1;
3057        }
3058
3059        /*
3060         * For an INVPCID intercept:
3061         * EXITINFO1 provides the linear address of the memory operand.
3062         * EXITINFO2 provides the contents of the register operand.
3063         */
3064        type = svm->vmcb->control.exit_info_2;
3065        gva = svm->vmcb->control.exit_info_1;
3066
3067        if (type > 3) {
3068                kvm_inject_gp(vcpu, 0);
3069                return 1;
3070        }
3071
3072        return kvm_handle_invpcid(vcpu, type, gva);
3073}
3074
/*
 * Dispatch table mapping SVM exit codes to their handlers, indexed by
 * exit_code (see svm_invoke_exit_handler()).  Entries left NULL are
 * rejected by svm_handle_invalid_exit().
 */
static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [SVM_EXIT_READ_CR0]                     = cr_interception,
        [SVM_EXIT_READ_CR3]                     = cr_interception,
        [SVM_EXIT_READ_CR4]                     = cr_interception,
        [SVM_EXIT_READ_CR8]                     = cr_interception,
        [SVM_EXIT_CR0_SEL_WRITE]                = cr_interception,
        [SVM_EXIT_WRITE_CR0]                    = cr_interception,
        [SVM_EXIT_WRITE_CR3]                    = cr_interception,
        [SVM_EXIT_WRITE_CR4]                    = cr_interception,
        [SVM_EXIT_WRITE_CR8]                    = cr8_write_interception,
        [SVM_EXIT_READ_DR0]                     = dr_interception,
        [SVM_EXIT_READ_DR1]                     = dr_interception,
        [SVM_EXIT_READ_DR2]                     = dr_interception,
        [SVM_EXIT_READ_DR3]                     = dr_interception,
        [SVM_EXIT_READ_DR4]                     = dr_interception,
        [SVM_EXIT_READ_DR5]                     = dr_interception,
        [SVM_EXIT_READ_DR6]                     = dr_interception,
        [SVM_EXIT_READ_DR7]                     = dr_interception,
        [SVM_EXIT_WRITE_DR0]                    = dr_interception,
        [SVM_EXIT_WRITE_DR1]                    = dr_interception,
        [SVM_EXIT_WRITE_DR2]                    = dr_interception,
        [SVM_EXIT_WRITE_DR3]                    = dr_interception,
        [SVM_EXIT_WRITE_DR4]                    = dr_interception,
        [SVM_EXIT_WRITE_DR5]                    = dr_interception,
        [SVM_EXIT_WRITE_DR6]                    = dr_interception,
        [SVM_EXIT_WRITE_DR7]                    = dr_interception,
        [SVM_EXIT_EXCP_BASE + DB_VECTOR]        = db_interception,
        [SVM_EXIT_EXCP_BASE + BP_VECTOR]        = bp_interception,
        [SVM_EXIT_EXCP_BASE + UD_VECTOR]        = ud_interception,
        [SVM_EXIT_EXCP_BASE + PF_VECTOR]        = pf_interception,
        [SVM_EXIT_EXCP_BASE + MC_VECTOR]        = mc_interception,
        [SVM_EXIT_EXCP_BASE + AC_VECTOR]        = ac_interception,
        [SVM_EXIT_EXCP_BASE + GP_VECTOR]        = gp_interception,
        [SVM_EXIT_INTR]                         = intr_interception,
        [SVM_EXIT_NMI]                          = nmi_interception,
        [SVM_EXIT_SMI]                          = smi_interception,
        [SVM_EXIT_VINTR]                        = interrupt_window_interception,
        [SVM_EXIT_RDPMC]                        = kvm_emulate_rdpmc,
        [SVM_EXIT_CPUID]                        = kvm_emulate_cpuid,
        [SVM_EXIT_IRET]                         = iret_interception,
        [SVM_EXIT_INVD]                         = kvm_emulate_invd,
        [SVM_EXIT_PAUSE]                        = pause_interception,
        [SVM_EXIT_HLT]                          = kvm_emulate_halt,
        [SVM_EXIT_INVLPG]                       = invlpg_interception,
        [SVM_EXIT_INVLPGA]                      = invlpga_interception,
        [SVM_EXIT_IOIO]                         = io_interception,
        [SVM_EXIT_MSR]                          = msr_interception,
        [SVM_EXIT_TASK_SWITCH]                  = task_switch_interception,
        [SVM_EXIT_SHUTDOWN]                     = shutdown_interception,
        [SVM_EXIT_VMRUN]                        = vmrun_interception,
        [SVM_EXIT_VMMCALL]                      = kvm_emulate_hypercall,
        [SVM_EXIT_VMLOAD]                       = vmload_interception,
        [SVM_EXIT_VMSAVE]                       = vmsave_interception,
        [SVM_EXIT_STGI]                         = stgi_interception,
        [SVM_EXIT_CLGI]                         = clgi_interception,
        [SVM_EXIT_SKINIT]                       = skinit_interception,
        [SVM_EXIT_RDTSCP]                       = kvm_handle_invalid_op,
        [SVM_EXIT_WBINVD]                       = kvm_emulate_wbinvd,
        [SVM_EXIT_MONITOR]                      = kvm_emulate_monitor,
        [SVM_EXIT_MWAIT]                        = kvm_emulate_mwait,
        [SVM_EXIT_XSETBV]                       = kvm_emulate_xsetbv,
        [SVM_EXIT_RDPRU]                        = kvm_handle_invalid_op,
        [SVM_EXIT_EFER_WRITE_TRAP]              = efer_trap,
        [SVM_EXIT_CR0_WRITE_TRAP]               = cr_trap,
        [SVM_EXIT_CR4_WRITE_TRAP]               = cr_trap,
        [SVM_EXIT_CR8_WRITE_TRAP]               = cr_trap,
        [SVM_EXIT_INVPCID]                      = invpcid_interception,
        [SVM_EXIT_NPF]                          = npf_interception,
        [SVM_EXIT_RSM]                          = rsm_interception,
        [SVM_EXIT_AVIC_INCOMPLETE_IPI]          = avic_incomplete_ipi_interception,
        [SVM_EXIT_AVIC_UNACCELERATED_ACCESS]    = avic_unaccelerated_access_interception,
        [SVM_EXIT_VMGEXIT]                      = sev_handle_vmgexit,
};
3148
/*
 * Dump the current VMCB control and save areas to the kernel log for
 * debugging failed VM-entries and unexpected exits.  Gated behind the
 * kvm_amd.dump_invalid_vmcb module parameter since the VMCB may contain
 * guest data.  Note: fs/gs/ldtr/tr and the syscall/sysenter MSR state are
 * read from vmcb01 (the L1 VMCB), the rest from the current VMCB.
 */
static void dump_vmcb(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb_control_area *control = &svm->vmcb->control;
        struct vmcb_save_area *save = &svm->vmcb->save;
        struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save;

        if (!dump_invalid_vmcb) {
                pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
                return;
        }

        pr_err("VMCB %p, last attempted VMRUN on CPU %d\n",
               svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu);
        pr_err("VMCB Control Area:\n");
        pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff);
        pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16);
        pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff);
        pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16);
        pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]);
        pr_err("%-20s%08x %08x\n", "intercepts:",
              control->intercepts[INTERCEPT_WORD3],
               control->intercepts[INTERCEPT_WORD4]);
        pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
        pr_err("%-20s%d\n", "pause filter threshold:",
               control->pause_filter_thresh);
        pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
        pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
        pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
        pr_err("%-20s%d\n", "asid:", control->asid);
        pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
        pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
        pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
        pr_err("%-20s%08x\n", "int_state:", control->int_state);
        pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
        pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
        pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
        pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
        pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
        pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
        pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
        pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
        pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa);
        pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
        pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
        pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
        pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
        pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
        pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
        pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
        pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa);
        pr_err("VMCB State Save Area:\n");
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "es:",
               save->es.selector, save->es.attrib,
               save->es.limit, save->es.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "cs:",
               save->cs.selector, save->cs.attrib,
               save->cs.limit, save->cs.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "ss:",
               save->ss.selector, save->ss.attrib,
               save->ss.limit, save->ss.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "ds:",
               save->ds.selector, save->ds.attrib,
               save->ds.limit, save->ds.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "fs:",
               save01->fs.selector, save01->fs.attrib,
               save01->fs.limit, save01->fs.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "gs:",
               save01->gs.selector, save01->gs.attrib,
               save01->gs.limit, save01->gs.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "gdtr:",
               save->gdtr.selector, save->gdtr.attrib,
               save->gdtr.limit, save->gdtr.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "ldtr:",
               save01->ldtr.selector, save01->ldtr.attrib,
               save01->ldtr.limit, save01->ldtr.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "idtr:",
               save->idtr.selector, save->idtr.attrib,
               save->idtr.limit, save->idtr.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "tr:",
               save01->tr.selector, save01->tr.attrib,
               save01->tr.limit, save01->tr.base);
        pr_err("cpl:            %d                efer:         %016llx\n",
                save->cpl, save->efer);
        pr_err("%-15s %016llx %-13s %016llx\n",
               "cr0:", save->cr0, "cr2:", save->cr2);
        pr_err("%-15s %016llx %-13s %016llx\n",
               "cr3:", save->cr3, "cr4:", save->cr4);
        pr_err("%-15s %016llx %-13s %016llx\n",
               "dr6:", save->dr6, "dr7:", save->dr7);
        pr_err("%-15s %016llx %-13s %016llx\n",
               "rip:", save->rip, "rflags:", save->rflags);
        pr_err("%-15s %016llx %-13s %016llx\n",
               "rsp:", save->rsp, "rax:", save->rax);
        pr_err("%-15s %016llx %-13s %016llx\n",
               "star:", save01->star, "lstar:", save01->lstar);
        pr_err("%-15s %016llx %-13s %016llx\n",
               "cstar:", save01->cstar, "sfmask:", save01->sfmask);
        pr_err("%-15s %016llx %-13s %016llx\n",
               "kernel_gs_base:", save01->kernel_gs_base,
               "sysenter_cs:", save01->sysenter_cs);
        pr_err("%-15s %016llx %-13s %016llx\n",
               "sysenter_esp:", save01->sysenter_esp,
               "sysenter_eip:", save01->sysenter_eip);
        pr_err("%-15s %016llx %-13s %016llx\n",
               "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
        pr_err("%-15s %016llx %-13s %016llx\n",
               "br_from:", save->br_from, "br_to:", save->br_to);
        pr_err("%-15s %016llx %-13s %016llx\n",
               "excp_from:", save->last_excp_from,
               "excp_to:", save->last_excp_to);
}
3271
3272static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
3273{
3274        if (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
3275            svm_exit_handlers[exit_code])
3276                return 0;
3277
3278        vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code);
3279        dump_vmcb(vcpu);
3280        vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3281        vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
3282        vcpu->run->internal.ndata = 2;
3283        vcpu->run->internal.data[0] = exit_code;
3284        vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
3285
3286        return -EINVAL;
3287}
3288
3289int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
3290{
3291        if (svm_handle_invalid_exit(vcpu, exit_code))
3292                return 0;
3293
3294#ifdef CONFIG_RETPOLINE
3295        if (exit_code == SVM_EXIT_MSR)
3296                return msr_interception(vcpu);
3297        else if (exit_code == SVM_EXIT_VINTR)
3298                return interrupt_window_interception(vcpu);
3299        else if (exit_code == SVM_EXIT_INTR)
3300                return intr_interception(vcpu);
3301        else if (exit_code == SVM_EXIT_HLT)
3302                return kvm_emulate_halt(vcpu);
3303        else if (exit_code == SVM_EXIT_NPF)
3304                return npf_interception(vcpu);
3305#endif
3306        return svm_exit_handlers[exit_code](vcpu);
3307}
3308
3309static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
3310                              u32 *intr_info, u32 *error_code)
3311{
3312        struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
3313
3314        *info1 = control->exit_info_1;
3315        *info2 = control->exit_info_2;
3316        *intr_info = control->exit_int_info;
3317        if ((*intr_info & SVM_EXITINTINFO_VALID) &&
3318            (*intr_info & SVM_EXITINTINFO_VALID_ERR))
3319                *error_code = control->exit_int_info_err;
3320        else
3321                *error_code = 0;
3322}
3323
/*
 * Top-level #VMEXIT handler.  Syncs CR0/CR3 from the VMCB, gives a nested
 * (L1) hypervisor first claim on the exit, reports hardware VMRUN failures
 * to userspace, and finally dispatches to the per-exit-code handler unless
 * the exit was already completed on the fastpath.
 */
static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        struct kvm_run *kvm_run = vcpu->run;
        u32 exit_code = svm->vmcb->control.exit_code;

        trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);

        /* SEV-ES guests must use the CR write traps to track CR registers. */
        if (!sev_es_guest(vcpu->kvm)) {
                if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE))
                        vcpu->arch.cr0 = svm->vmcb->save.cr0;
                if (npt_enabled)
                        vcpu->arch.cr3 = svm->vmcb->save.cr3;
        }

        /* While running L2, check whether the exit belongs to L1. */
        if (is_guest_mode(vcpu)) {
                int vmexit;

                trace_kvm_nested_vmexit(exit_code, vcpu, KVM_ISA_SVM);

                vmexit = nested_svm_exit_special(svm);

                if (vmexit == NESTED_EXIT_CONTINUE)
                        vmexit = nested_svm_exit_handled(svm);

                /* The exit was reflected into L1; nothing more to do. */
                if (vmexit == NESTED_EXIT_DONE)
                        return 1;
        }

        if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
                kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
                kvm_run->fail_entry.hardware_entry_failure_reason
                        = svm->vmcb->control.exit_code;
                kvm_run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
                dump_vmcb(vcpu);
                return 0;
        }

        /*
         * Warn on exit_int_info (a pending event at exit time) for exit
         * codes that are not expected to carry one.
         */
        if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
            exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
            exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
            exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
                printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
                       "exit_code 0x%x\n",
                       __func__, svm->vmcb->control.exit_int_info,
                       exit_code);

        /* The fastpath already handled the exit in the VMRUN loop. */
        if (exit_fastpath != EXIT_FASTPATH_NONE)
                return 1;

        return svm_invoke_exit_handler(vcpu, exit_code);
}
3377
/*
 * Reload the host TR after a VMRUN.  LTR requires an available (non-busy)
 * TSS descriptor, so reset the type field before reloading the selector.
 */
static void reload_tss(struct kvm_vcpu *vcpu)
{
        struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);

        sd->tss_desc->type = 9; /* available 32/64-bit TSS */
        load_TR_desc();
}
3385
/*
 * Per-VMRUN bookkeeping: invalidate cached state if the vCPU migrated to a
 * different physical CPU since its last run, and refresh the ASID.
 */
static void pre_svm_run(struct kvm_vcpu *vcpu)
{
        struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
        struct vcpu_svm *svm = to_svm(vcpu);

        /*
         * If the previous vmrun of the vmcb occurred on a different physical
         * cpu, then mark the vmcb dirty and assign a new asid.  Hardware's
         * vmcb clean bits are per logical CPU, as are KVM's asid assignments.
         */
        if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) {
                svm->current_vmcb->asid_generation = 0;
                vmcb_mark_all_dirty(svm->vmcb);
                svm->current_vmcb->cpu = vcpu->cpu;
        }

        /* SEV guests get their ASID from the SEV firmware-managed pool. */
        if (sev_guest(vcpu->kvm))
                return pre_sev_run(svm, vcpu->cpu);

        /* FIXME: handle wraparound of asid_generation */
        if (svm->current_vmcb->asid_generation != sd->asid_generation)
                new_asid(svm, sd);
}
3409
3410static void svm_inject_nmi(struct kvm_vcpu *vcpu)
3411{
3412        struct vcpu_svm *svm = to_svm(vcpu);
3413
3414        svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
3415        vcpu->arch.hflags |= HF_NMI_MASK;
3416        if (!sev_es_guest(vcpu->kvm))
3417                svm_set_intercept(svm, INTERCEPT_IRET);
3418        ++vcpu->stat.nmi_injections;
3419}
3420
3421static void svm_set_irq(struct kvm_vcpu *vcpu)
3422{
3423        struct vcpu_svm *svm = to_svm(vcpu);
3424
3425        BUG_ON(!(gif_set(svm)));
3426
3427        trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
3428        ++vcpu->stat.irq_injections;
3429
3430        svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
3431                SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
3432}
3433
3434static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3435{
3436        struct vcpu_svm *svm = to_svm(vcpu);
3437
3438        /*
3439         * SEV-ES guests must always keep the CR intercepts cleared. CR
3440         * tracking is done using the CR write traps.
3441         */
3442        if (sev_es_guest(vcpu->kvm))
3443                return;
3444
3445        if (nested_svm_virtualize_tpr(vcpu))
3446                return;
3447
3448        svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
3449
3450        if (irr == -1)
3451                return;
3452
3453        if (tpr >= irr)
3454                svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
3455}
3456
3457bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
3458{
3459        struct vcpu_svm *svm = to_svm(vcpu);
3460        struct vmcb *vmcb = svm->vmcb;
3461        bool ret;
3462
3463        if (!gif_set(svm))
3464                return true;
3465
3466        if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
3467                return false;
3468
3469        ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
3470              (vcpu->arch.hflags & HF_NMI_MASK);
3471
3472        return ret;
3473}
3474
3475static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
3476{
3477        struct vcpu_svm *svm = to_svm(vcpu);
3478        if (svm->nested.nested_run_pending)
3479                return -EBUSY;
3480
3481        /* An NMI must not be injected into L2 if it's supposed to VM-Exit.  */
3482        if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
3483                return -EBUSY;
3484
3485        return !svm_nmi_blocked(vcpu);
3486}
3487
3488static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
3489{
3490        return !!(vcpu->arch.hflags & HF_NMI_MASK);
3491}
3492
3493static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
3494{
3495        struct vcpu_svm *svm = to_svm(vcpu);
3496
3497        if (masked) {
3498                vcpu->arch.hflags |= HF_NMI_MASK;
3499                if (!sev_es_guest(vcpu->kvm))
3500                        svm_set_intercept(svm, INTERCEPT_IRET);
3501        } else {
3502                vcpu->arch.hflags &= ~HF_NMI_MASK;
3503                if (!sev_es_guest(vcpu->kvm))
3504                        svm_clr_intercept(svm, INTERCEPT_IRET);
3505        }
3506}
3507
/* Return true if external interrupt injection is currently blocked. */
bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb *vmcb = svm->vmcb;

        /* With GIF clear, all interrupts are held off. */
        if (!gif_set(svm))
                return true;

        if (sev_es_guest(vcpu->kvm)) {
                /*
                 * SEV-ES guests do not expose RFLAGS. Use the VMCB interrupt mask
                 * bit to determine the state of the IF flag.
                 */
                if (!(vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK))
                        return true;
        } else if (is_guest_mode(vcpu)) {
                /* As long as interrupts are being delivered...  */
                if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
                    ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)
                    : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
                        return true;

                /* ... vmexits aren't blocked by the interrupt shadow  */
                if (nested_exit_on_intr(svm))
                        return false;
        } else {
                if (!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
                        return true;
        }

        return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK);
}
3540
3541static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
3542{
3543        struct vcpu_svm *svm = to_svm(vcpu);
3544        if (svm->nested.nested_run_pending)
3545                return -EBUSY;
3546
3547        /*
3548         * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
3549         * e.g. if the IRQ arrived asynchronously after checking nested events.
3550         */
3551        if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm))
3552                return -EBUSY;
3553
3554        return !svm_interrupt_blocked(vcpu);
3555}
3556
3557static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
3558{
3559        struct vcpu_svm *svm = to_svm(vcpu);
3560
3561        /*
3562         * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
3563         * 1, because that's a separate STGI/VMRUN intercept.  The next time we
3564         * get that intercept, this function will be called again though and
3565         * we'll get the vintr intercept. However, if the vGIF feature is
3566         * enabled, the STGI interception will not occur. Enable the irq
3567         * window under the assumption that the hardware will set the GIF.
3568         */
3569        if (vgif_enabled(svm) || gif_set(svm)) {
3570                /*
3571                 * IRQ window is not needed when AVIC is enabled,
3572                 * unless we have pending ExtINT since it cannot be injected
3573                 * via AVIC. In such case, we need to temporarily disable AVIC,
3574                 * and fallback to injecting IRQ via V_IRQ.
3575                 */
3576                svm_toggle_avic_for_irq_window(vcpu, false);
3577                svm_set_vintr(svm);
3578        }
3579}
3580
/* Arrange for a future #VMEXIT at which a pending NMI can be injected. */
static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        if ((vcpu->arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) == HF_NMI_MASK)
                return; /* IRET will cause a vm exit */

        if (!gif_set(svm)) {
                if (vgif_enabled(svm))
                        svm_set_intercept(svm, INTERCEPT_STGI);
                return; /* STGI will cause a vm exit */
        }

        /*
         * Something prevents NMI from being injected. Single step over possible
         * problem (IRET or exception injection or interrupt shadow)
         */
        svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
        svm->nmi_singlestep = true;
        svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
}
3602
/* No-op on SVM: no special TSS setup is needed (unlike VMX real mode). */
static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
{
        return 0;
}
3607
/* No-op on SVM: an identity map page is not required (VMX-only concept). */
static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
{
        return 0;
}
3612
/* Flush the guest's TLB entries for this vCPU's current ASID. */
void svm_flush_tlb(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        /*
         * Flush only the current ASID even if the TLB flush was invoked via
         * kvm_flush_remote_tlbs().  Although flushing remote TLBs requires all
         * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and
         * unconditionally does a TLB flush on both nested VM-Enter and nested
         * VM-Exit (via kvm_mmu_reset_context()).
         */
        if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
                svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
        else
                /* No FLUSHBYASID: force a fresh ASID on the next VMRUN. */
                svm->current_vmcb->asid_generation--;
}
3629
/* Flush a single guest virtual address in the vCPU's current ASID. */
static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        invlpga(gva, svm->vmcb->control.asid);
}
3636
3637static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
3638{
3639        struct vcpu_svm *svm = to_svm(vcpu);
3640
3641        if (nested_svm_virtualize_tpr(vcpu))
3642                return;
3643
3644        if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) {
3645                int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
3646                kvm_set_cr8(vcpu, cr8);
3647        }
3648}
3649
3650static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
3651{
3652        struct vcpu_svm *svm = to_svm(vcpu);
3653        u64 cr8;
3654
3655        if (nested_svm_virtualize_tpr(vcpu) ||
3656            kvm_vcpu_apicv_active(vcpu))
3657                return;
3658
3659        cr8 = kvm_get_cr8(vcpu);
3660        svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
3661        svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
3662}
3663
/*
 * Post-#VMEXIT event bookkeeping: decode EXITINTINFO (an event that was
 * being delivered when the exit occurred) and re-queue it so it is
 * re-injected on the next VM-Enter.  Also detects completion of an IRET
 * to lift NMI blocking.
 */
static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        u8 vector;
        int type;
        u32 exitintinfo = svm->vmcb->control.exit_int_info;
        unsigned int3_injected = svm->int3_injected;

        svm->int3_injected = 0;

        /*
         * If we've made progress since setting HF_IRET_MASK, we've
         * executed an IRET and can allow NMI injection.
         */
        if ((vcpu->arch.hflags & HF_IRET_MASK) &&
            (sev_es_guest(vcpu->kvm) ||
             kvm_rip_read(vcpu) != svm->nmi_iret_rip)) {
                vcpu->arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
                kvm_make_request(KVM_REQ_EVENT, vcpu);
        }

        /* Start from a clean slate; re-queue below if EXITINTINFO is valid. */
        vcpu->arch.nmi_injected = false;
        kvm_clear_exception_queue(vcpu);
        kvm_clear_interrupt_queue(vcpu);

        if (!(exitintinfo & SVM_EXITINTINFO_VALID))
                return;

        kvm_make_request(KVM_REQ_EVENT, vcpu);

        vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
        type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;

        switch (type) {
        case SVM_EXITINTINFO_TYPE_NMI:
                vcpu->arch.nmi_injected = true;
                break;
        case SVM_EXITINTINFO_TYPE_EXEPT:
                /*
                 * Never re-inject a #VC exception.
                 */
                if (vector == X86_TRAP_VC)
                        break;

                /*
                 * In case of software exceptions, do not reinject the vector,
                 * but re-execute the instruction instead. Rewind RIP first
                 * if we emulated INT3 before.
                 */
                if (kvm_exception_is_soft(vector)) {
                        if (vector == BP_VECTOR && int3_injected &&
                            kvm_is_linear_rip(vcpu, svm->int3_rip))
                                kvm_rip_write(vcpu,
                                              kvm_rip_read(vcpu) - int3_injected);
                        break;
                }
                if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
                        u32 err = svm->vmcb->control.exit_int_info_err;
                        kvm_requeue_exception_e(vcpu, vector, err);

                } else
                        kvm_requeue_exception(vcpu, vector);
                break;
        case SVM_EXITINTINFO_TYPE_INTR:
                kvm_queue_interrupt(vcpu, vector, false);
                break;
        default:
                break;
        }
}
3734
/*
 * Cancel a pending event injection: move the event from EVENTINJ to
 * EXITINTINFO so that svm_complete_interrupts() re-queues it for the
 * next VM-Enter instead of losing it.
 */
static void svm_cancel_injection(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb_control_area *control = &svm->vmcb->control;

        control->exit_int_info = control->event_inj;
        control->exit_int_info_err = control->event_inj_err;
        control->event_inj = 0;
        svm_complete_interrupts(vcpu);
}
3745
3746static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
3747{
3748        if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
3749            to_svm(vcpu)->vmcb->control.exit_info_1)
3750                return handle_fastpath_set_msr_irqoff(vcpu);
3751
3752        return EXIT_FASTPATH_NONE;
3753}
3754
/*
 * Enter the guest via VMRUN.  Runs with IRQs disabled in the noinstr
 * section, so nothing here may be traced or instrumented.
 */
static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        unsigned long vmcb_pa = svm->current_vmcb->pa;

        kvm_guest_enter_irqoff();

        if (sev_es_guest(vcpu->kvm)) {
                /* SEV-ES: hardware saves/restores guest state itself. */
                __svm_sev_es_vcpu_run(vmcb_pa);
        } else {
                struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);

                /*
                 * Use a single vmcb (vmcb01 because it's always valid) for
                 * context switching guest state via VMLOAD/VMSAVE, that way
                 * the state doesn't need to be copied between vmcb01 and
                 * vmcb02 when switching vmcbs for nested virtualization.
                 */
                vmload(svm->vmcb01.pa);
                __svm_vcpu_run(vmcb_pa, (unsigned long *)&vcpu->arch.regs);
                vmsave(svm->vmcb01.pa);

                /* Restore the host's state from this CPU's save area. */
                vmload(__sme_page_pa(sd->save_area));
        }

        kvm_guest_exit_irqoff();
}
3782
/*
 * The main VM-Enter/VM-Exit loop body: sync state into the VMCB, run the
 * guest, then sync state back out and complete any interrupted event
 * delivery.  Returns a fastpath verdict consumed by handle_exit().
 */
static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        trace_kvm_entry(vcpu);

        /* Propagate the registers that live in the VMCB save area. */
        svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
        svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
        svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];

        /*
         * Disable singlestep if we're injecting an interrupt/exception.
         * We don't want our modified rflags to be pushed on the stack where
         * we might not be able to easily reset them if we disabled NMI
         * singlestep later.
         */
        if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
                /*
                 * Event injection happens before external interrupts cause a
                 * vmexit and interrupts are disabled here, so smp_send_reschedule
                 * is enough to force an immediate vmexit.
                 */
                disable_nmi_singlestep(svm);
                smp_send_reschedule(vcpu->cpu);
        }

        pre_svm_run(vcpu);

        sync_lapic_to_cr8(vcpu);

        if (unlikely(svm->asid != svm->vmcb->control.asid)) {
                svm->vmcb->control.asid = svm->asid;
                vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
        }
        svm->vmcb->save.cr2 = vcpu->arch.cr2;

        svm_hv_update_vp_id(svm->vmcb, vcpu);

        /*
         * Run with all-zero DR6 unless needed, so that we can get the exact cause
         * of a #DB.
         */
        if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
                svm_set_dr6(svm, vcpu->arch.dr6);
        else
                svm_set_dr6(svm, DR6_ACTIVE_LOW);

        /* Clear GIF so host interrupts/NMIs are held off across the entry. */
        clgi();
        kvm_load_guest_xsave_state(vcpu);

        kvm_wait_lapic_expire(vcpu);

        /*
         * If this vCPU has touched SPEC_CTRL, restore the guest's value if
         * it's non-zero. Since vmentry is serialising on affected CPUs, there
         * is no need to worry about the conditional branch over the wrmsr
         * being speculatively taken.
         */
        if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
                x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);

        svm_vcpu_enter_exit(vcpu);

        /*
         * We do not use IBRS in the kernel. If this vCPU has used the
         * SPEC_CTRL MSR it may have left it on; save the value and
         * turn it off. This is much more efficient than blindly adding
         * it to the atomic save/restore list. Especially as the former
         * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
         *
         * For non-nested case:
         * If the L01 MSR bitmap does not intercept the MSR, then we need to
         * save it.
         *
         * For nested case:
         * If the L02 MSR bitmap does not intercept the MSR, then we need to
         * save it.
         */
        if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL) &&
            unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
                svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);

        if (!sev_es_guest(vcpu->kvm))
                reload_tss(vcpu);

        if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
                x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);

        /* For SEV-ES, register state is encrypted and not synced back. */
        if (!sev_es_guest(vcpu->kvm)) {
                vcpu->arch.cr2 = svm->vmcb->save.cr2;
                vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
                vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
                vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
        }

        if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
                kvm_before_interrupt(vcpu);

        kvm_load_host_xsave_state(vcpu);
        stgi();

        /* Any pending NMI will happen here */

        if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
                kvm_after_interrupt(vcpu);

        sync_cr8_to_lapic(vcpu);

        svm->next_rip = 0;
        if (is_guest_mode(vcpu)) {
                nested_sync_control_from_vmcb02(svm);

                /* Track VMRUNs that have made past consistency checking */
                if (svm->nested.nested_run_pending &&
                    svm->vmcb->control.exit_code != SVM_EXIT_ERR)
                        ++vcpu->stat.nested_run;

                svm->nested.nested_run_pending = 0;
        }

        svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
        vmcb_mark_all_clean(svm->vmcb);

        /* if exit due to PF check for async PF */
        if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
                vcpu->arch.apf.host_apf_flags =
                        kvm_read_and_reset_apf_flags();

        if (npt_enabled)
                kvm_register_clear_available(vcpu, VCPU_EXREG_PDPTR);

        /*
         * We need to handle MC intercepts here before the vcpu has a chance to
         * change the physical cpu
         */
        if (unlikely(svm->vmcb->control.exit_code ==
                     SVM_EXIT_EXCP_BASE + MC_VECTOR))
                svm_handle_mce(vcpu);

        svm_complete_interrupts(vcpu);

        if (is_guest_mode(vcpu))
                return EXIT_FASTPATH_NONE;

        return svm_exit_handlers_fastpath(vcpu);
}
3929
/*
 * Load a new MMU root: the nested page table root (nCR3) when NPT is
 * enabled, otherwise the shadow page table root goes into guest CR3.
 */
static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
                             int root_level)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        unsigned long cr3;

        if (npt_enabled) {
                svm->vmcb->control.nested_cr3 = __sme_set(root_hpa);
                vmcb_mark_dirty(svm->vmcb, VMCB_NPT);

                hv_track_root_tdp(vcpu, root_hpa);

                /* Loading L2's CR3 is handled by enter_svm_guest_mode.  */
                if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
                        return;
                cr3 = vcpu->arch.cr3;
        } else if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
                cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
        } else {
                /* PCID in the guest should be impossible with a 32-bit MMU. */
                WARN_ON_ONCE(kvm_get_active_pcid(vcpu));
                cr3 = root_hpa;
        }

        svm->vmcb->save.cr3 = cr3;
        vmcb_mark_dirty(svm->vmcb, VMCB_CR);
}
3957
3958static int is_disabled(void)
3959{
3960        u64 vm_cr;
3961
3962        rdmsrl(MSR_VM_CR, vm_cr);
3963        if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
3964                return 1;
3965
3966        return 0;
3967}
3968
static void
svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
        /* VMMCALL encoding: 0f 01 d9 */
        static const unsigned char vmmcall_insn[] = { 0x0f, 0x01, 0xd9 };
        unsigned int i;

        for (i = 0; i < sizeof(vmmcall_insn); i++)
                hypercall[i] = vmmcall_insn[i];
}
3979
/* Per-CPU compatibility check; SVM imposes no additional constraints. */
static int __init svm_check_processor_compat(void)
{
        return 0;
}
3984
/*
 * Always false: this hook reports hardware TPR-shadow style acceleration,
 * which KVM does not claim for SVM here.
 */
static bool svm_cpu_has_accelerated_tpr(void)
{
        return false;
}
3989
3990/*
3991 * The kvm parameter can be NULL (module initialization, or invocation before
3992 * VM creation). Be sure to check the kvm parameter before using it.
3993 */
3994static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
3995{
3996        switch (index) {
3997        case MSR_IA32_MCG_EXT_CTL:
3998        case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
3999                return false;
4000        case MSR_IA32_SMBASE:
4001                /* SEV-ES guests do not support SMM, so report false */
4002                if (kvm && sev_es_guest(kvm))
4003                        return false;
4004                break;
4005        default:
4006                break;
4007        }
4008
4009        return true;
4010}
4011
/* No memory-type bits are forced into NPT entries on SVM; always 0. */
static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
        return 0;
}
4016
/*
 * Recompute per-vCPU state that depends on the guest's CPUID: XSAVES and
 * NRIPS caches, instruction intercepts, SEV reserved GPA bits, AVIC
 * inhibits, and the Intel-vs-AMD SYSENTER/VMLOAD-VMSAVE handling.
 */
static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        struct kvm_cpuid_entry2 *best;

        /* XSAVES is usable only if both the host and the guest have XSAVE. */
        vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
                                    boot_cpu_has(X86_FEATURE_XSAVE) &&
                                    boot_cpu_has(X86_FEATURE_XSAVES);

        /* Update nrips enabled cache */
        svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
                             guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);

        svm_recalc_instruction_intercepts(vcpu, svm);

        /* For sev guests, the memory encryption bit is not reserved in CR3.  */
        if (sev_guest(vcpu->kvm)) {
                best = kvm_find_cpuid_entry(vcpu, 0x8000001F, 0);
                if (best)
                        vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
        }

        if (kvm_vcpu_apicv_active(vcpu)) {
                /*
                 * AVIC does not work with an x2APIC mode guest. If the X2APIC feature
                 * is exposed to the guest, disable AVIC.
                 */
                if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
                        kvm_request_apicv_update(vcpu->kvm, false,
                                                 APICV_INHIBIT_REASON_X2APIC);

                /*
                 * Currently, AVIC does not work with nested virtualization.
                 * So, we disable AVIC when cpuid for SVM is set in the L1 guest.
                 */
                if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM))
                        kvm_request_apicv_update(vcpu->kvm, false,
                                                 APICV_INHIBIT_REASON_NESTED);
        }

        if (guest_cpuid_is_intel(vcpu)) {
                /*
                 * We must intercept SYSENTER_EIP and SYSENTER_ESP
                 * accesses because the processor only stores 32 bits.
                 * For the same reason we cannot use virtual VMLOAD/VMSAVE.
                 */
                svm_set_intercept(svm, INTERCEPT_VMLOAD);
                svm_set_intercept(svm, INTERCEPT_VMSAVE);
                svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;

                set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
                set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
        } else {
                /*
                 * If hardware supports Virtual VMLOAD VMSAVE then enable it
                 * in VMCB and clear intercepts to avoid #VMEXIT.
                 */
                if (vls) {
                        svm_clr_intercept(svm, INTERCEPT_VMLOAD);
                        svm_clr_intercept(svm, INTERCEPT_VMSAVE);
                        svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
                }
                /* No need to intercept these MSRs */
                set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
                set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
        }
}
4084
/* SVM can always intercept WBINVD. */
static bool svm_has_wbinvd_exit(void)
{
        return true;
}
4089
/* Helpers to pair an SVM exit code with the emulator stage that checks it. */
#define PRE_EX(exit)  { .exit_code = (exit), \
                        .stage = X86_ICPT_PRE_EXCEPT, }
#define POST_EX(exit) { .exit_code = (exit), \
                        .stage = X86_ICPT_POST_EXCEPT, }
#define POST_MEM(exit) { .exit_code = (exit), \
                        .stage = X86_ICPT_POST_MEMACCESS, }

/*
 * Map each x86 emulator intercept id to the SVM exit code it corresponds
 * to, plus the emulation stage at which the intercept check is performed.
 * Used by svm_check_intercept() when emulating on behalf of a nested guest.
 */
static const struct __x86_intercept {
        u32 exit_code;
        enum x86_intercept_stage stage;
} x86_intercept_map[] = {
        [x86_intercept_cr_read]         = POST_EX(SVM_EXIT_READ_CR0),
        [x86_intercept_cr_write]        = POST_EX(SVM_EXIT_WRITE_CR0),
        [x86_intercept_clts]            = POST_EX(SVM_EXIT_WRITE_CR0),
        [x86_intercept_lmsw]            = POST_EX(SVM_EXIT_WRITE_CR0),
        [x86_intercept_smsw]            = POST_EX(SVM_EXIT_READ_CR0),
        [x86_intercept_dr_read]         = POST_EX(SVM_EXIT_READ_DR0),
        [x86_intercept_dr_write]        = POST_EX(SVM_EXIT_WRITE_DR0),
        [x86_intercept_sldt]            = POST_EX(SVM_EXIT_LDTR_READ),
        [x86_intercept_str]             = POST_EX(SVM_EXIT_TR_READ),
        [x86_intercept_lldt]            = POST_EX(SVM_EXIT_LDTR_WRITE),
        [x86_intercept_ltr]             = POST_EX(SVM_EXIT_TR_WRITE),
        [x86_intercept_sgdt]            = POST_EX(SVM_EXIT_GDTR_READ),
        [x86_intercept_sidt]            = POST_EX(SVM_EXIT_IDTR_READ),
        [x86_intercept_lgdt]            = POST_EX(SVM_EXIT_GDTR_WRITE),
        [x86_intercept_lidt]            = POST_EX(SVM_EXIT_IDTR_WRITE),
        [x86_intercept_vmrun]           = POST_EX(SVM_EXIT_VMRUN),
        [x86_intercept_vmmcall]         = POST_EX(SVM_EXIT_VMMCALL),
        [x86_intercept_vmload]          = POST_EX(SVM_EXIT_VMLOAD),
        [x86_intercept_vmsave]          = POST_EX(SVM_EXIT_VMSAVE),
        [x86_intercept_stgi]            = POST_EX(SVM_EXIT_STGI),
        [x86_intercept_clgi]            = POST_EX(SVM_EXIT_CLGI),
        [x86_intercept_skinit]          = POST_EX(SVM_EXIT_SKINIT),
        [x86_intercept_invlpga]         = POST_EX(SVM_EXIT_INVLPGA),
        [x86_intercept_rdtscp]          = POST_EX(SVM_EXIT_RDTSCP),
        [x86_intercept_monitor]         = POST_MEM(SVM_EXIT_MONITOR),
        [x86_intercept_mwait]           = POST_EX(SVM_EXIT_MWAIT),
        [x86_intercept_invlpg]          = POST_EX(SVM_EXIT_INVLPG),
        [x86_intercept_invd]            = POST_EX(SVM_EXIT_INVD),
        [x86_intercept_wbinvd]          = POST_EX(SVM_EXIT_WBINVD),
        [x86_intercept_wrmsr]           = POST_EX(SVM_EXIT_MSR),
        [x86_intercept_rdtsc]           = POST_EX(SVM_EXIT_RDTSC),
        [x86_intercept_rdmsr]           = POST_EX(SVM_EXIT_MSR),
        [x86_intercept_rdpmc]           = POST_EX(SVM_EXIT_RDPMC),
        [x86_intercept_cpuid]           = PRE_EX(SVM_EXIT_CPUID),
        [x86_intercept_rsm]             = PRE_EX(SVM_EXIT_RSM),
        [x86_intercept_pause]           = PRE_EX(SVM_EXIT_PAUSE),
        [x86_intercept_pushf]           = PRE_EX(SVM_EXIT_PUSHF),
        [x86_intercept_popf]            = PRE_EX(SVM_EXIT_POPF),
        [x86_intercept_intn]            = PRE_EX(SVM_EXIT_SWINT),
        [x86_intercept_iret]            = PRE_EX(SVM_EXIT_IRET),
        [x86_intercept_icebp]           = PRE_EX(SVM_EXIT_ICEBP),
        [x86_intercept_hlt]             = POST_EX(SVM_EXIT_HLT),
        [x86_intercept_in]              = POST_EX(SVM_EXIT_IOIO),
        [x86_intercept_ins]             = POST_EX(SVM_EXIT_IOIO),
        [x86_intercept_out]             = POST_EX(SVM_EXIT_IOIO),
        [x86_intercept_outs]            = POST_EX(SVM_EXIT_IOIO),
        [x86_intercept_xsetbv]          = PRE_EX(SVM_EXIT_XSETBV),
};

#undef PRE_EX
#undef POST_EX
#undef POST_MEM
4153
/*
 * Emulator callback: decide whether an instruction being emulated on
 * behalf of L2 would have been intercepted by the L1 hypervisor, and if
 * so, trigger a nested #VMEXIT instead of continuing emulation.
 *
 * The emulator's (intercept, stage) pair is translated to an SVM exit
 * code via x86_intercept_map, the exit_info fields are synthesized to
 * match what hardware would have produced, and nested_svm_exit_handled()
 * makes the final call based on L1's intercept settings.
 *
 * Returns X86EMUL_INTERCEPTED when L1 takes the exit (emulation stops),
 * X86EMUL_CONTINUE otherwise.
 */
static int svm_check_intercept(struct kvm_vcpu *vcpu,
			       struct x86_instruction_info *info,
			       enum x86_intercept_stage stage,
			       struct x86_exception *exception)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int vmexit, ret = X86EMUL_CONTINUE;
	struct __x86_intercept icpt_info;
	struct vmcb *vmcb = svm->vmcb;

	if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
		goto out;

	icpt_info = x86_intercept_map[info->intercept];

	/* Only check at the stage (pre/post decode, post memop) we mapped. */
	if (stage != icpt_info.stage)
		goto out;

	switch (icpt_info.exit_code) {
	case SVM_EXIT_READ_CR0:
		/* CR-read exit codes are consecutive; index by CR number. */
		if (info->intercept == x86_intercept_cr_read)
			icpt_info.exit_code += info->modrm_reg;
		break;
	case SVM_EXIT_WRITE_CR0: {
		unsigned long cr0, val;

		if (info->intercept == x86_intercept_cr_write)
			icpt_info.exit_code += info->modrm_reg;

		/*
		 * Only CR0 writes (not CLTS) can additionally match the
		 * selective CR0 write intercept below.
		 */
		if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
		    info->intercept == x86_intercept_clts)
			break;

		if (!(vmcb_is_intercept(&svm->nested.ctl,
					INTERCEPT_SELECTIVE_CR0)))
			break;

		/*
		 * The selective CR0 intercept fires only when a bit outside
		 * SVM_CR0_SELECTIVE_MASK changes; compare old and new values
		 * with those bits stripped.
		 */
		cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
		val = info->src_val  & ~SVM_CR0_SELECTIVE_MASK;

		if (info->intercept == x86_intercept_lmsw) {
			/* lmsw only writes the low four bits of CR0. */
			cr0 &= 0xfUL;
			val &= 0xfUL;
			/* lmsw can't clear PE - catch this here */
			if (cr0 & X86_CR0_PE)
				val |= X86_CR0_PE;
		}

		if (cr0 ^ val)
			icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;

		break;
	}
	case SVM_EXIT_READ_DR0:
	case SVM_EXIT_WRITE_DR0:
		/* DR exit codes are consecutive; index by DR number. */
		icpt_info.exit_code += info->modrm_reg;
		break;
	case SVM_EXIT_MSR:
		/* exit_info_1 distinguishes WRMSR (1) from RDMSR (0). */
		if (info->intercept == x86_intercept_wrmsr)
			vmcb->control.exit_info_1 = 1;
		else
			vmcb->control.exit_info_1 = 0;
		break;
	case SVM_EXIT_PAUSE:
		/*
		 * PAUSE is decoded by the emulator as REP NOP; only treat
		 * the REP-prefixed form as PAUSE so a plain NOP is ignored.
		 */
		if (info->rep_prefix != REPE_PREFIX)
			goto out;
		break;
	case SVM_EXIT_IOIO: {
		u64 exit_info;
		u32 bytes;

		/*
		 * Rebuild the IOIO exit_info_1 layout hardware would have
		 * produced: port in bits 31:16, direction, string/rep flags,
		 * operand size and address size.
		 */
		if (info->intercept == x86_intercept_in ||
		    info->intercept == x86_intercept_ins) {
			exit_info = ((info->src_val & 0xffff) << 16) |
				SVM_IOIO_TYPE_MASK;
			bytes = info->dst_bytes;
		} else {
			exit_info = (info->dst_val & 0xffff) << 16;
			bytes = info->src_bytes;
		}

		if (info->intercept == x86_intercept_outs ||
		    info->intercept == x86_intercept_ins)
			exit_info |= SVM_IOIO_STR_MASK;

		if (info->rep_prefix)
			exit_info |= SVM_IOIO_REP_MASK;

		bytes = min(bytes, 4u);

		exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;

		exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);

		vmcb->control.exit_info_1 = exit_info;
		vmcb->control.exit_info_2 = info->next_rip;

		break;
	}
	default:
		break;
	}

	/* TODO: Advertise NRIPS to guest hypervisor unconditionally */
	if (static_cpu_has(X86_FEATURE_NRIPS))
		vmcb->control.next_rip  = info->next_rip;
	vmcb->control.exit_code = icpt_info.exit_code;
	vmexit = nested_svm_exit_handled(svm);

	ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
					   : X86EMUL_CONTINUE;

out:
	return ret;
}
4273
/*
 * Callback invoked after #VMEXIT while host IRQs are still disabled.
 * Intentionally a no-op: SVM has no work that must run in this window
 * (contrast with VMX, which uses it to dispatch interrupts/NMIs).
 */
static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
}
4277
4278static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
4279{
4280        if (!kvm_pause_in_guest(vcpu->kvm))
4281                shrink_ple_window(vcpu);
4282}
4283
4284static void svm_setup_mce(struct kvm_vcpu *vcpu)
4285{
4286        /* [63:9] are reserved. */
4287        vcpu->arch.mcg_cap &= 0x1ff;
4288}
4289
4290bool svm_smi_blocked(struct kvm_vcpu *vcpu)
4291{
4292        struct vcpu_svm *svm = to_svm(vcpu);
4293
4294        /* Per APM Vol.2 15.22.2 "Response to SMI" */
4295        if (!gif_set(svm))
4296                return true;
4297
4298        return is_smm(vcpu);
4299}
4300
4301static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
4302{
4303        struct vcpu_svm *svm = to_svm(vcpu);
4304        if (svm->nested.nested_run_pending)
4305                return -EBUSY;
4306
4307        /* An SMI must not be injected into L2 if it's supposed to VM-Exit.  */
4308        if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm))
4309                return -EBUSY;
4310
4311        return !svm_smi_blocked(vcpu);
4312}
4313
/*
 * Save SVM-specific state into the SMM state-save area before the vCPU
 * enters System Management Mode.
 *
 * If the vCPU is running a nested (L2) guest, record that fact and the
 * vmcb12 GPA in the SMM state-save area, perform a synthetic #VMEXIT to
 * L1, and stash the non-VMLOAD/VMSAVE parts of VMCB01's save area into
 * the L1 HSAVE page so they survive SMM (see comment below).
 *
 * Returns 0 on success; a nonzero value from nested_svm_vmexit() is
 * propagated, and 1 is returned if the HSAVE page cannot be mapped.
 */
static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct kvm_host_map map_save;
	int ret;

	if (is_guest_mode(vcpu)) {
		/* FED8h - SVM Guest */
		put_smstate(u64, smstate, 0x7ed8, 1);
		/* FEE0h - SVM Guest VMCB Physical Address */
		put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa);

		/* Sync RAX/RSP/RIP into the VMCB before the nested exit. */
		svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
		svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
		svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];

		ret = nested_svm_vmexit(svm);
		if (ret)
			return ret;

		/*
		 * KVM uses VMCB01 to store L1 host state while L2 runs but
		 * VMCB01 is going to be used during SMM and thus the state will
		 * be lost. Temporary save non-VMLOAD/VMSAVE state to the host save
		 * area pointed to by MSR_VM_HSAVE_PA. APM guarantees that the
		 * format of the area is identical to guest save area offsetted
		 * by 0x400 (matches the offset of 'struct vmcb_save_area'
		 * within 'struct vmcb'). Note: HSAVE area may also be used by
		 * L1 hypervisor to save additional host context (e.g. KVM does
		 * that, see svm_prepare_guest_switch()) which must be
		 * preserved.
		 */
		if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr),
				 &map_save) == -EINVAL)
			return 1;

		BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);

		svm_copy_vmrun_state(map_save.hva + 0x400,
				     &svm->vmcb01.ptr->save);

		kvm_vcpu_unmap(vcpu, &map_save, true);
	}
	return 0;
}
4359
4360static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
4361{
4362        struct vcpu_svm *svm = to_svm(vcpu);
4363        struct kvm_host_map map, map_save;
4364        int ret = 0;
4365
4366        if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) {
4367                u64 saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0);
4368                u64 guest = GET_SMSTATE(u64, smstate, 0x7ed8);
4369                u64 vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0);
4370                struct vmcb *vmcb12;
4371
4372                if (guest) {
4373                        if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
4374                                return 1;
4375
4376                        if (!(saved_efer & EFER_SVME))
4377                                return 1;
4378
4379                        if (kvm_vcpu_map(vcpu,
4380                                         gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL)
4381                                return 1;
4382
4383                        if (svm_allocate_nested(svm))
4384                                return 1;
4385
4386                        vmcb12 = map.hva;
4387
4388                        nested_load_control_from_vmcb12(svm, &vmcb12->control);
4389
4390                        ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12);
4391                        kvm_vcpu_unmap(vcpu, &map, true);
4392
4393                        /*
4394                         * Restore L1 host state from L1 HSAVE area as VMCB01 was
4395                         * used during SMM (see svm_enter_smm())
4396                         */
4397                        if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr),
4398                                         &map_save) == -EINVAL)
4399                                return 1;
4400
4401                        svm_copy_vmrun_state(&svm->vmcb01.ptr->save,
4402                                             map_save.hva + 0x400);
4403
4404                        kvm_vcpu_unmap(vcpu, &map_save, true);
4405                }
4406        }
4407
4408        return ret;
4409}
4410
4411static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
4412{
4413        struct vcpu_svm *svm = to_svm(vcpu);
4414
4415        if (!gif_set(svm)) {
4416                if (vgif_enabled(svm))
4417                        svm_set_intercept(svm, INTERCEPT_STGI);
4418                /* STGI will cause a vm exit */
4419        } else {
4420                /* We must be in SMM; RSM will cause a vmexit anyway.  */
4421        }
4422}
4423
4424static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
4425{
4426        bool smep, smap, is_user;
4427        unsigned long cr4;
4428
4429        /*
4430         * When the guest is an SEV-ES guest, emulation is not possible.
4431         */
4432        if (sev_es_guest(vcpu->kvm))
4433                return false;
4434
4435        /*
4436         * Detect and workaround Errata 1096 Fam_17h_00_0Fh.
4437         *
4438         * Errata:
4439         * When CPU raise #NPF on guest data access and vCPU CR4.SMAP=1, it is
4440         * possible that CPU microcode implementing DecodeAssist will fail
4441         * to read bytes of instruction which caused #NPF. In this case,
4442         * GuestIntrBytes field of the VMCB on a VMEXIT will incorrectly
4443         * return 0 instead of the correct guest instruction bytes.
4444         *
4445         * This happens because CPU microcode reading instruction bytes
4446         * uses a special opcode wh