linux/arch/x86/kvm/vmx.c
<<
>>
Prefs
   1/*
   2 * Kernel-based Virtual Machine driver for Linux
   3 *
   4 * This module enables machines with Intel VT-x extensions to run virtual
   5 * machines without emulation or binary translation.
   6 *
   7 * Copyright (C) 2006 Qumranet, Inc.
   8 *
   9 * Authors:
  10 *   Avi Kivity   <avi@qumranet.com>
  11 *   Yaniv Kamay  <yaniv@qumranet.com>
  12 *
  13 * This work is licensed under the terms of the GNU GPL, version 2.  See
  14 * the COPYING file in the top-level directory.
  15 *
  16 */
  17
  18#include "irq.h"
  19#include "mmu.h"
  20
  21#include <linux/kvm_host.h>
  22#include <linux/module.h>
  23#include <linux/kernel.h>
  24#include <linux/mm.h>
  25#include <linux/highmem.h>
  26#include <linux/sched.h>
  27#include <linux/moduleparam.h>
  28#include "kvm_cache_regs.h"
  29#include "x86.h"
  30
  31#include <asm/io.h>
  32#include <asm/desc.h>
  33#include <asm/vmx.h>
  34#include <asm/virtext.h>
  35#include <asm/mce.h>
  36
  37#define __ex(x) __kvm_handle_fault_on_reboot(x)
  38
  39MODULE_AUTHOR("Qumranet");
  40MODULE_LICENSE("GPL");
  41
  42static int __read_mostly bypass_guest_pf = 1;
  43module_param(bypass_guest_pf, bool, S_IRUGO);
  44
  45static int __read_mostly enable_vpid = 1;
  46module_param_named(vpid, enable_vpid, bool, 0444);
  47
  48static int __read_mostly flexpriority_enabled = 1;
  49module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
  50
  51static int __read_mostly enable_ept = 1;
  52module_param_named(ept, enable_ept, bool, S_IRUGO);
  53
  54static int __read_mostly emulate_invalid_guest_state = 0;
  55module_param(emulate_invalid_guest_state, bool, S_IRUGO);
  56
  57struct vmcs {
  58        u32 revision_id;
  59        u32 abort;
  60        char data[0];
  61};
  62
  63struct vcpu_vmx {
  64        struct kvm_vcpu       vcpu;
  65        struct list_head      local_vcpus_link;
  66        unsigned long         host_rsp;
  67        int                   launched;
  68        u8                    fail;
  69        u32                   idt_vectoring_info;
  70        struct kvm_msr_entry *guest_msrs;
  71        struct kvm_msr_entry *host_msrs;
  72        int                   nmsrs;
  73        int                   save_nmsrs;
  74        int                   msr_offset_efer;
  75#ifdef CONFIG_X86_64
  76        int                   msr_offset_kernel_gs_base;
  77#endif
  78        struct vmcs          *vmcs;
  79        struct {
  80                int           loaded;
  81                u16           fs_sel, gs_sel, ldt_sel;
  82                int           gs_ldt_reload_needed;
  83                int           fs_reload_needed;
  84                int           guest_efer_loaded;
  85        } host_state;
  86        struct {
  87                struct {
  88                        bool pending;
  89                        u8 vector;
  90                        unsigned rip;
  91                } irq;
  92        } rmode;
  93        int vpid;
  94        bool emulation_required;
  95        enum emulation_result invalid_state_emulation_result;
  96
  97        /* Support for vnmi-less CPUs */
  98        int soft_vnmi_blocked;
  99        ktime_t entry_time;
 100        s64 vnmi_blocked_time;
 101        u32 exit_reason;
 102};
 103
 104static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
 105{
 106        return container_of(vcpu, struct vcpu_vmx, vcpu);
 107}
 108
 109static int init_rmode(struct kvm *kvm);
 110static u64 construct_eptp(unsigned long root_hpa);
 111
 112static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 113static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
 114static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
 115
 116static unsigned long *vmx_io_bitmap_a;
 117static unsigned long *vmx_io_bitmap_b;
 118static unsigned long *vmx_msr_bitmap_legacy;
 119static unsigned long *vmx_msr_bitmap_longmode;
 120
 121static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
 122static DEFINE_SPINLOCK(vmx_vpid_lock);
 123
 124static struct vmcs_config {
 125        int size;
 126        int order;
 127        u32 revision_id;
 128        u32 pin_based_exec_ctrl;
 129        u32 cpu_based_exec_ctrl;
 130        u32 cpu_based_2nd_exec_ctrl;
 131        u32 vmexit_ctrl;
 132        u32 vmentry_ctrl;
 133} vmcs_config;
 134
 135static struct vmx_capability {
 136        u32 ept;
 137        u32 vpid;
 138} vmx_capability;
 139
 140#define VMX_SEGMENT_FIELD(seg)                                  \
 141        [VCPU_SREG_##seg] = {                                   \
 142                .selector = GUEST_##seg##_SELECTOR,             \
 143                .base = GUEST_##seg##_BASE,                     \
 144                .limit = GUEST_##seg##_LIMIT,                   \
 145                .ar_bytes = GUEST_##seg##_AR_BYTES,             \
 146        }
 147
 148static struct kvm_vmx_segment_field {
 149        unsigned selector;
 150        unsigned base;
 151        unsigned limit;
 152        unsigned ar_bytes;
 153} kvm_vmx_segment_fields[] = {
 154        VMX_SEGMENT_FIELD(CS),
 155        VMX_SEGMENT_FIELD(DS),
 156        VMX_SEGMENT_FIELD(ES),
 157        VMX_SEGMENT_FIELD(FS),
 158        VMX_SEGMENT_FIELD(GS),
 159        VMX_SEGMENT_FIELD(SS),
 160        VMX_SEGMENT_FIELD(TR),
 161        VMX_SEGMENT_FIELD(LDTR),
 162};
 163
 164/*
 165 * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
 166 * away by decrementing the array size.
 167 */
 168static const u32 vmx_msr_index[] = {
 169#ifdef CONFIG_X86_64
 170        MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
 171#endif
 172        MSR_EFER, MSR_K6_STAR,
 173};
 174#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
 175
 176static void load_msrs(struct kvm_msr_entry *e, int n)
 177{
 178        int i;
 179
 180        for (i = 0; i < n; ++i)
 181                wrmsrl(e[i].index, e[i].data);
 182}
 183
 184static void save_msrs(struct kvm_msr_entry *e, int n)
 185{
 186        int i;
 187
 188        for (i = 0; i < n; ++i)
 189                rdmsrl(e[i].index, e[i].data);
 190}
 191
 192static inline int is_page_fault(u32 intr_info)
 193{
 194        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
 195                             INTR_INFO_VALID_MASK)) ==
 196                (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
 197}
 198
 199static inline int is_no_device(u32 intr_info)
 200{
 201        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
 202                             INTR_INFO_VALID_MASK)) ==
 203                (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
 204}
 205
 206static inline int is_invalid_opcode(u32 intr_info)
 207{
 208        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
 209                             INTR_INFO_VALID_MASK)) ==
 210                (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
 211}
 212
 213static inline int is_external_interrupt(u32 intr_info)
 214{
 215        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
 216                == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
 217}
 218
 219static inline int is_machine_check(u32 intr_info)
 220{
 221        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
 222                             INTR_INFO_VALID_MASK)) ==
 223                (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
 224}
 225
 226static inline int cpu_has_vmx_msr_bitmap(void)
 227{
 228        return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
 229}
 230
 231static inline int cpu_has_vmx_tpr_shadow(void)
 232{
 233        return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
 234}
 235
 236static inline int vm_need_tpr_shadow(struct kvm *kvm)
 237{
 238        return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
 239}
 240
 241static inline int cpu_has_secondary_exec_ctrls(void)
 242{
 243        return vmcs_config.cpu_based_exec_ctrl &
 244                CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
 245}
 246
 247static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
 248{
 249        return vmcs_config.cpu_based_2nd_exec_ctrl &
 250                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 251}
 252
 253static inline bool cpu_has_vmx_flexpriority(void)
 254{
 255        return cpu_has_vmx_tpr_shadow() &&
 256                cpu_has_vmx_virtualize_apic_accesses();
 257}
 258
 259static inline int cpu_has_vmx_invept_individual_addr(void)
 260{
 261        return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
 262}
 263
 264static inline int cpu_has_vmx_invept_context(void)
 265{
 266        return !!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT);
 267}
 268
 269static inline int cpu_has_vmx_invept_global(void)
 270{
 271        return !!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT);
 272}
 273
 274static inline int cpu_has_vmx_ept(void)
 275{
 276        return vmcs_config.cpu_based_2nd_exec_ctrl &
 277                SECONDARY_EXEC_ENABLE_EPT;
 278}
 279
 280static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
 281{
 282        return flexpriority_enabled &&
 283                (cpu_has_vmx_virtualize_apic_accesses()) &&
 284                (irqchip_in_kernel(kvm));
 285}
 286
 287static inline int cpu_has_vmx_vpid(void)
 288{
 289        return vmcs_config.cpu_based_2nd_exec_ctrl &
 290                SECONDARY_EXEC_ENABLE_VPID;
 291}
 292
 293static inline int cpu_has_virtual_nmis(void)
 294{
 295        return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
 296}
 297
 298static inline bool report_flexpriority(void)
 299{
 300        return flexpriority_enabled;
 301}
 302
 303static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
 304{
 305        int i;
 306
 307        for (i = 0; i < vmx->nmsrs; ++i)
 308                if (vmx->guest_msrs[i].index == msr)
 309                        return i;
 310        return -1;
 311}
 312
 313static inline void __invvpid(int ext, u16 vpid, gva_t gva)
 314{
 315    struct {
 316        u64 vpid : 16;
 317        u64 rsvd : 48;
 318        u64 gva;
 319    } operand = { vpid, 0, gva };
 320
 321    asm volatile (__ex(ASM_VMX_INVVPID)
 322                  /* CF==1 or ZF==1 --> rc = -1 */
 323                  "; ja 1f ; ud2 ; 1:"
 324                  : : "a"(&operand), "c"(ext) : "cc", "memory");
 325}
 326
 327static inline void __invept(int ext, u64 eptp, gpa_t gpa)
 328{
 329        struct {
 330                u64 eptp, gpa;
 331        } operand = {eptp, gpa};
 332
 333        asm volatile (__ex(ASM_VMX_INVEPT)
 334                        /* CF==1 or ZF==1 --> rc = -1 */
 335                        "; ja 1f ; ud2 ; 1:\n"
 336                        : : "a" (&operand), "c" (ext) : "cc", "memory");
 337}
 338
 339static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
 340{
 341        int i;
 342
 343        i = __find_msr_index(vmx, msr);
 344        if (i >= 0)
 345                return &vmx->guest_msrs[i];
 346        return NULL;
 347}
 348
 349static void vmcs_clear(struct vmcs *vmcs)
 350{
 351        u64 phys_addr = __pa(vmcs);
 352        u8 error;
 353
 354        asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
 355                      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
 356                      : "cc", "memory");
 357        if (error)
 358                printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
 359                       vmcs, phys_addr);
 360}
 361
 362static void __vcpu_clear(void *arg)
 363{
 364        struct vcpu_vmx *vmx = arg;
 365        int cpu = raw_smp_processor_id();
 366
 367        if (vmx->vcpu.cpu == cpu)
 368                vmcs_clear(vmx->vmcs);
 369        if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
 370                per_cpu(current_vmcs, cpu) = NULL;
 371        rdtscll(vmx->vcpu.arch.host_tsc);
 372        list_del(&vmx->local_vcpus_link);
 373        vmx->vcpu.cpu = -1;
 374        vmx->launched = 0;
 375}
 376
 377static void vcpu_clear(struct vcpu_vmx *vmx)
 378{
 379        if (vmx->vcpu.cpu == -1)
 380                return;
 381        smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
 382}
 383
 384static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx)
 385{
 386        if (vmx->vpid == 0)
 387                return;
 388
 389        __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
 390}
 391
 392static inline void ept_sync_global(void)
 393{
 394        if (cpu_has_vmx_invept_global())
 395                __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
 396}
 397
 398static inline void ept_sync_context(u64 eptp)
 399{
 400        if (enable_ept) {
 401                if (cpu_has_vmx_invept_context())
 402                        __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
 403                else
 404                        ept_sync_global();
 405        }
 406}
 407
 408static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
 409{
 410        if (enable_ept) {
 411                if (cpu_has_vmx_invept_individual_addr())
 412                        __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
 413                                        eptp, gpa);
 414                else
 415                        ept_sync_context(eptp);
 416        }
 417}
 418
 419static unsigned long vmcs_readl(unsigned long field)
 420{
 421        unsigned long value;
 422
 423        asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX)
 424                      : "=a"(value) : "d"(field) : "cc");
 425        return value;
 426}
 427
 428static u16 vmcs_read16(unsigned long field)
 429{
 430        return vmcs_readl(field);
 431}
 432
 433static u32 vmcs_read32(unsigned long field)
 434{
 435        return vmcs_readl(field);
 436}
 437
 438static u64 vmcs_read64(unsigned long field)
 439{
 440#ifdef CONFIG_X86_64
 441        return vmcs_readl(field);
 442#else
 443        return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
 444#endif
 445}
 446
 447static noinline void vmwrite_error(unsigned long field, unsigned long value)
 448{
 449        printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
 450               field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
 451        dump_stack();
 452}
 453
 454static void vmcs_writel(unsigned long field, unsigned long value)
 455{
 456        u8 error;
 457
 458        asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0"
 459                       : "=q"(error) : "a"(value), "d"(field) : "cc");
 460        if (unlikely(error))
 461                vmwrite_error(field, value);
 462}
 463
 464static void vmcs_write16(unsigned long field, u16 value)
 465{
 466        vmcs_writel(field, value);
 467}
 468
 469static void vmcs_write32(unsigned long field, u32 value)
 470{
 471        vmcs_writel(field, value);
 472}
 473
 474static void vmcs_write64(unsigned long field, u64 value)
 475{
 476        vmcs_writel(field, value);
 477#ifndef CONFIG_X86_64
 478        asm volatile ("");
 479        vmcs_writel(field+1, value >> 32);
 480#endif
 481}
 482
 483static void vmcs_clear_bits(unsigned long field, u32 mask)
 484{
 485        vmcs_writel(field, vmcs_readl(field) & ~mask);
 486}
 487
 488static void vmcs_set_bits(unsigned long field, u32 mask)
 489{
 490        vmcs_writel(field, vmcs_readl(field) | mask);
 491}
 492
 493static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 494{
 495        u32 eb;
 496
 497        eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR);
 498        if (!vcpu->fpu_active)
 499                eb |= 1u << NM_VECTOR;
 500        if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
 501                if (vcpu->guest_debug &
 502                    (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
 503                        eb |= 1u << DB_VECTOR;
 504                if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
 505                        eb |= 1u << BP_VECTOR;
 506        }
 507        if (vcpu->arch.rmode.vm86_active)
 508                eb = ~0;
 509        if (enable_ept)
 510                eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
 511        vmcs_write32(EXCEPTION_BITMAP, eb);
 512}
 513
 514static void reload_tss(void)
 515{
 516        /*
 517         * VT restores TR but not its size.  Useless.
 518         */
 519        struct descriptor_table gdt;
 520        struct desc_struct *descs;
 521
 522        kvm_get_gdt(&gdt);
 523        descs = (void *)gdt.base;
 524        descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
 525        load_TR_desc();
 526}
 527
 528static void load_transition_efer(struct vcpu_vmx *vmx)
 529{
 530        int efer_offset = vmx->msr_offset_efer;
 531        u64 host_efer = vmx->host_msrs[efer_offset].data;
 532        u64 guest_efer = vmx->guest_msrs[efer_offset].data;
 533        u64 ignore_bits;
 534
 535        if (efer_offset < 0)
 536                return;
 537        /*
 538         * NX is emulated; LMA and LME handled by hardware; SCE meaninless
 539         * outside long mode
 540         */
 541        ignore_bits = EFER_NX | EFER_SCE;
 542#ifdef CONFIG_X86_64
 543        ignore_bits |= EFER_LMA | EFER_LME;
 544        /* SCE is meaningful only in long mode on Intel */
 545        if (guest_efer & EFER_LMA)
 546                ignore_bits &= ~(u64)EFER_SCE;
 547#endif
 548        if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits))
 549                return;
 550
 551        vmx->host_state.guest_efer_loaded = 1;
 552        guest_efer &= ~ignore_bits;
 553        guest_efer |= host_efer & ignore_bits;
 554        wrmsrl(MSR_EFER, guest_efer);
 555        vmx->vcpu.stat.efer_reload++;
 556}
 557
 558static void reload_host_efer(struct vcpu_vmx *vmx)
 559{
 560        if (vmx->host_state.guest_efer_loaded) {
 561                vmx->host_state.guest_efer_loaded = 0;
 562                load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
 563        }
 564}
 565
 566static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 567{
 568        struct vcpu_vmx *vmx = to_vmx(vcpu);
 569
 570        if (vmx->host_state.loaded)
 571                return;
 572
 573        vmx->host_state.loaded = 1;
 574        /*
 575         * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
 576         * allow segment selectors with cpl > 0 or ti == 1.
 577         */
 578        vmx->host_state.ldt_sel = kvm_read_ldt();
 579        vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
 580        vmx->host_state.fs_sel = kvm_read_fs();
 581        if (!(vmx->host_state.fs_sel & 7)) {
 582                vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
 583                vmx->host_state.fs_reload_needed = 0;
 584        } else {
 585                vmcs_write16(HOST_FS_SELECTOR, 0);
 586                vmx->host_state.fs_reload_needed = 1;
 587        }
 588        vmx->host_state.gs_sel = kvm_read_gs();
 589        if (!(vmx->host_state.gs_sel & 7))
 590                vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
 591        else {
 592                vmcs_write16(HOST_GS_SELECTOR, 0);
 593                vmx->host_state.gs_ldt_reload_needed = 1;
 594        }
 595
 596#ifdef CONFIG_X86_64
 597        vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
 598        vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
 599#else
 600        vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
 601        vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
 602#endif
 603
 604#ifdef CONFIG_X86_64
 605        if (is_long_mode(&vmx->vcpu))
 606                save_msrs(vmx->host_msrs +
 607                          vmx->msr_offset_kernel_gs_base, 1);
 608
 609#endif
 610        load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
 611        load_transition_efer(vmx);
 612}
 613
 614static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 615{
 616        unsigned long flags;
 617
 618        if (!vmx->host_state.loaded)
 619                return;
 620
 621        ++vmx->vcpu.stat.host_state_reload;
 622        vmx->host_state.loaded = 0;
 623        if (vmx->host_state.fs_reload_needed)
 624                kvm_load_fs(vmx->host_state.fs_sel);
 625        if (vmx->host_state.gs_ldt_reload_needed) {
 626                kvm_load_ldt(vmx->host_state.ldt_sel);
 627                /*
 628                 * If we have to reload gs, we must take care to
 629                 * preserve our gs base.
 630                 */
 631                local_irq_save(flags);
 632                kvm_load_gs(vmx->host_state.gs_sel);
 633#ifdef CONFIG_X86_64
 634                wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
 635#endif
 636                local_irq_restore(flags);
 637        }
 638        reload_tss();
 639        save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
 640        load_msrs(vmx->host_msrs, vmx->save_nmsrs);
 641        reload_host_efer(vmx);
 642}
 643
 644static void vmx_load_host_state(struct vcpu_vmx *vmx)
 645{
 646        preempt_disable();
 647        __vmx_load_host_state(vmx);
 648        preempt_enable();
 649}
 650
 651/*
 652 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
 653 * vcpu mutex is already taken.
 654 */
 655static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 656{
 657        struct vcpu_vmx *vmx = to_vmx(vcpu);
 658        u64 phys_addr = __pa(vmx->vmcs);
 659        u64 tsc_this, delta, new_offset;
 660
 661        if (vcpu->cpu != cpu) {
 662                vcpu_clear(vmx);
 663                kvm_migrate_timers(vcpu);
 664                vpid_sync_vcpu_all(vmx);
 665                local_irq_disable();
 666                list_add(&vmx->local_vcpus_link,
 667                         &per_cpu(vcpus_on_cpu, cpu));
 668                local_irq_enable();
 669        }
 670
 671        if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
 672                u8 error;
 673
 674                per_cpu(current_vmcs, cpu) = vmx->vmcs;
 675                asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
 676                              : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
 677                              : "cc");
 678                if (error)
 679                        printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
 680                               vmx->vmcs, phys_addr);
 681        }
 682
 683        if (vcpu->cpu != cpu) {
 684                struct descriptor_table dt;
 685                unsigned long sysenter_esp;
 686
 687                vcpu->cpu = cpu;
 688                /*
 689                 * Linux uses per-cpu TSS and GDT, so set these when switching
 690                 * processors.
 691                 */
 692                vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
 693                kvm_get_gdt(&dt);
 694                vmcs_writel(HOST_GDTR_BASE, dt.base);   /* 22.2.4 */
 695
 696                rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
 697                vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
 698
 699                /*
 700                 * Make sure the time stamp counter is monotonous.
 701                 */
 702                rdtscll(tsc_this);
 703                if (tsc_this < vcpu->arch.host_tsc) {
 704                        delta = vcpu->arch.host_tsc - tsc_this;
 705                        new_offset = vmcs_read64(TSC_OFFSET) + delta;
 706                        vmcs_write64(TSC_OFFSET, new_offset);
 707                }
 708        }
 709}
 710
 711static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
 712{
 713        __vmx_load_host_state(to_vmx(vcpu));
 714}
 715
 716static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
 717{
 718        if (vcpu->fpu_active)
 719                return;
 720        vcpu->fpu_active = 1;
 721        vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
 722        if (vcpu->arch.cr0 & X86_CR0_TS)
 723                vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
 724        update_exception_bitmap(vcpu);
 725}
 726
 727static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
 728{
 729        if (!vcpu->fpu_active)
 730                return;
 731        vcpu->fpu_active = 0;
 732        vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
 733        update_exception_bitmap(vcpu);
 734}
 735
 736static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
 737{
 738        return vmcs_readl(GUEST_RFLAGS);
 739}
 740
 741static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 742{
 743        if (vcpu->arch.rmode.vm86_active)
 744                rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
 745        vmcs_writel(GUEST_RFLAGS, rflags);
 746}
 747
 748static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
 749{
 750        u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
 751        int ret = 0;
 752
 753        if (interruptibility & GUEST_INTR_STATE_STI)
 754                ret |= X86_SHADOW_INT_STI;
 755        if (interruptibility & GUEST_INTR_STATE_MOV_SS)
 756                ret |= X86_SHADOW_INT_MOV_SS;
 757
 758        return ret & mask;
 759}
 760
 761static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
 762{
 763        u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
 764        u32 interruptibility = interruptibility_old;
 765
 766        interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
 767
 768        if (mask & X86_SHADOW_INT_MOV_SS)
 769                interruptibility |= GUEST_INTR_STATE_MOV_SS;
 770        if (mask & X86_SHADOW_INT_STI)
 771                interruptibility |= GUEST_INTR_STATE_STI;
 772
 773        if ((interruptibility != interruptibility_old))
 774                vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
 775}
 776
 777static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 778{
 779        unsigned long rip;
 780
 781        rip = kvm_rip_read(vcpu);
 782        rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
 783        kvm_rip_write(vcpu, rip);
 784
 785        /* skipping an emulated instruction also counts */
 786        vmx_set_interrupt_shadow(vcpu, 0);
 787}
 788
 789static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 790                                bool has_error_code, u32 error_code)
 791{
 792        struct vcpu_vmx *vmx = to_vmx(vcpu);
 793        u32 intr_info = nr | INTR_INFO_VALID_MASK;
 794
 795        if (has_error_code) {
 796                vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
 797                intr_info |= INTR_INFO_DELIVER_CODE_MASK;
 798        }
 799
 800        if (vcpu->arch.rmode.vm86_active) {
 801                vmx->rmode.irq.pending = true;
 802                vmx->rmode.irq.vector = nr;
 803                vmx->rmode.irq.rip = kvm_rip_read(vcpu);
 804                if (nr == BP_VECTOR || nr == OF_VECTOR)
 805                        vmx->rmode.irq.rip++;
 806                intr_info |= INTR_TYPE_SOFT_INTR;
 807                vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
 808                vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
 809                kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
 810                return;
 811        }
 812
 813        if (kvm_exception_is_soft(nr)) {
 814                vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
 815                             vmx->vcpu.arch.event_exit_inst_len);
 816                intr_info |= INTR_TYPE_SOFT_EXCEPTION;
 817        } else
 818                intr_info |= INTR_TYPE_HARD_EXCEPTION;
 819
 820        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
 821}
 822
 823/*
 824 * Swap MSR entry in host/guest MSR entry array.
 825 */
 826#ifdef CONFIG_X86_64
 827static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
 828{
 829        struct kvm_msr_entry tmp;
 830
 831        tmp = vmx->guest_msrs[to];
 832        vmx->guest_msrs[to] = vmx->guest_msrs[from];
 833        vmx->guest_msrs[from] = tmp;
 834        tmp = vmx->host_msrs[to];
 835        vmx->host_msrs[to] = vmx->host_msrs[from];
 836        vmx->host_msrs[from] = tmp;
 837}
 838#endif
 839
 840/*
 841 * Set up the vmcs to automatically save and restore system
 842 * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
 843 * mode, as fiddling with msrs is very expensive.
 844 */
 845static void setup_msrs(struct vcpu_vmx *vmx)
 846{
 847        int save_nmsrs;
 848        unsigned long *msr_bitmap;
 849
 850        vmx_load_host_state(vmx);
 851        save_nmsrs = 0;
 852#ifdef CONFIG_X86_64
 853        if (is_long_mode(&vmx->vcpu)) {
 854                int index;
 855
 856                index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
 857                if (index >= 0)
 858                        move_msr_up(vmx, index, save_nmsrs++);
 859                index = __find_msr_index(vmx, MSR_LSTAR);
 860                if (index >= 0)
 861                        move_msr_up(vmx, index, save_nmsrs++);
 862                index = __find_msr_index(vmx, MSR_CSTAR);
 863                if (index >= 0)
 864                        move_msr_up(vmx, index, save_nmsrs++);
 865                index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
 866                if (index >= 0)
 867                        move_msr_up(vmx, index, save_nmsrs++);
 868                /*
 869                 * MSR_K6_STAR is only needed on long mode guests, and only
 870                 * if efer.sce is enabled.
 871                 */
 872                index = __find_msr_index(vmx, MSR_K6_STAR);
 873                if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE))
 874                        move_msr_up(vmx, index, save_nmsrs++);
 875        }
 876#endif
 877        vmx->save_nmsrs = save_nmsrs;
 878
 879#ifdef CONFIG_X86_64
 880        vmx->msr_offset_kernel_gs_base =
 881                __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
 882#endif
 883        vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
 884
 885        if (cpu_has_vmx_msr_bitmap()) {
 886                if (is_long_mode(&vmx->vcpu))
 887                        msr_bitmap = vmx_msr_bitmap_longmode;
 888                else
 889                        msr_bitmap = vmx_msr_bitmap_legacy;
 890
 891                vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
 892        }
 893}
 894
 895/*
 896 * reads and returns guest's timestamp counter "register"
 897 * guest_tsc = host_tsc + tsc_offset    -- 21.3
 898 */
 899static u64 guest_read_tsc(void)
 900{
 901        u64 host_tsc, tsc_offset;
 902
 903        rdtscll(host_tsc);
 904        tsc_offset = vmcs_read64(TSC_OFFSET);
 905        return host_tsc + tsc_offset;
 906}
 907
 908/*
 909 * writes 'guest_tsc' into guest's timestamp counter "register"
 910 * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc
 911 */
 912static void guest_write_tsc(u64 guest_tsc, u64 host_tsc)
 913{
 914        vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc);
 915}
 916
 917/*
 918 * Reads an msr value (of 'msr_index') into 'pdata'.
 919 * Returns 0 on success, non-0 otherwise.
 920 * Assumes vcpu_load() was already called.
 921 */
 922static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 923{
 924        u64 data;
 925        struct kvm_msr_entry *msr;
 926
 927        if (!pdata) {
 928                printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
 929                return -EINVAL;
 930        }
 931
 932        switch (msr_index) {
 933#ifdef CONFIG_X86_64
 934        case MSR_FS_BASE:
 935                data = vmcs_readl(GUEST_FS_BASE);
 936                break;
 937        case MSR_GS_BASE:
 938                data = vmcs_readl(GUEST_GS_BASE);
 939                break;
 940        case MSR_EFER:
 941                return kvm_get_msr_common(vcpu, msr_index, pdata);
 942#endif
 943        case MSR_IA32_TIME_STAMP_COUNTER:
 944                data = guest_read_tsc();
 945                break;
 946        case MSR_IA32_SYSENTER_CS:
 947                data = vmcs_read32(GUEST_SYSENTER_CS);
 948                break;
 949        case MSR_IA32_SYSENTER_EIP:
 950                data = vmcs_readl(GUEST_SYSENTER_EIP);
 951                break;
 952        case MSR_IA32_SYSENTER_ESP:
 953                data = vmcs_readl(GUEST_SYSENTER_ESP);
 954                break;
 955        default:
 956                vmx_load_host_state(to_vmx(vcpu));
 957                msr = find_msr_entry(to_vmx(vcpu), msr_index);
 958                if (msr) {
 959                        data = msr->data;
 960                        break;
 961                }
 962                return kvm_get_msr_common(vcpu, msr_index, pdata);
 963        }
 964
 965        *pdata = data;
 966        return 0;
 967}
 968
 969/*
 970 * Writes msr value into into the appropriate "register".
 971 * Returns 0 on success, non-0 otherwise.
 972 * Assumes vcpu_load() was already called.
 973 */
 974static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 975{
 976        struct vcpu_vmx *vmx = to_vmx(vcpu);
 977        struct kvm_msr_entry *msr;
 978        u64 host_tsc;
 979        int ret = 0;
 980
 981        switch (msr_index) {
 982        case MSR_EFER:
 983                vmx_load_host_state(vmx);
 984                ret = kvm_set_msr_common(vcpu, msr_index, data);
 985                break;
 986#ifdef CONFIG_X86_64
 987        case MSR_FS_BASE:
 988                vmcs_writel(GUEST_FS_BASE, data);
 989                break;
 990        case MSR_GS_BASE:
 991                vmcs_writel(GUEST_GS_BASE, data);
 992                break;
 993#endif
 994        case MSR_IA32_SYSENTER_CS:
 995                vmcs_write32(GUEST_SYSENTER_CS, data);
 996                break;
 997        case MSR_IA32_SYSENTER_EIP:
 998                vmcs_writel(GUEST_SYSENTER_EIP, data);
 999                break;
1000        case MSR_IA32_SYSENTER_ESP:
1001                vmcs_writel(GUEST_SYSENTER_ESP, data);
1002                break;
1003        case MSR_IA32_TIME_STAMP_COUNTER:
1004                rdtscll(host_tsc);
1005                guest_write_tsc(data, host_tsc);
1006                break;
1007        case MSR_P6_PERFCTR0:
1008        case MSR_P6_PERFCTR1:
1009        case MSR_P6_EVNTSEL0:
1010        case MSR_P6_EVNTSEL1:
1011                /*
1012                 * Just discard all writes to the performance counters; this
1013                 * should keep both older linux and windows 64-bit guests
1014                 * happy
1015                 */
1016                pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data);
1017
1018                break;
1019        case MSR_IA32_CR_PAT:
1020                if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
1021                        vmcs_write64(GUEST_IA32_PAT, data);
1022                        vcpu->arch.pat = data;
1023                        break;
1024                }
1025                /* Otherwise falls through to kvm_set_msr_common */
1026        default:
1027                vmx_load_host_state(vmx);
1028                msr = find_msr_entry(vmx, msr_index);
1029                if (msr) {
1030                        msr->data = data;
1031                        break;
1032                }
1033                ret = kvm_set_msr_common(vcpu, msr_index, data);
1034        }
1035
1036        return ret;
1037}
1038
1039static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1040{
1041        __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
1042        switch (reg) {
1043        case VCPU_REGS_RSP:
1044                vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
1045                break;
1046        case VCPU_REGS_RIP:
1047                vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
1048                break;
1049        default:
1050                break;
1051        }
1052}
1053
1054static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1055{
1056        int old_debug = vcpu->guest_debug;
1057        unsigned long flags;
1058
1059        vcpu->guest_debug = dbg->control;
1060        if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
1061                vcpu->guest_debug = 0;
1062
1063        if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1064                vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]);
1065        else
1066                vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
1067
1068        flags = vmcs_readl(GUEST_RFLAGS);
1069        if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
1070                flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
1071        else if (old_debug & KVM_GUESTDBG_SINGLESTEP)
1072                flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1073        vmcs_writel(GUEST_RFLAGS, flags);
1074
1075        update_exception_bitmap(vcpu);
1076
1077        return 0;
1078}
1079
1080static __init int cpu_has_kvm_support(void)
1081{
1082        return cpu_has_vmx();
1083}
1084
1085static __init int vmx_disabled_by_bios(void)
1086{
1087        u64 msr;
1088
1089        rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
1090        return (msr & (FEATURE_CONTROL_LOCKED |
1091                       FEATURE_CONTROL_VMXON_ENABLED))
1092            == FEATURE_CONTROL_LOCKED;
1093        /* locked but not enabled */
1094}
1095
1096static void hardware_enable(void *garbage)
1097{
1098        int cpu = raw_smp_processor_id();
1099        u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
1100        u64 old;
1101
1102        INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
1103        rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
1104        if ((old & (FEATURE_CONTROL_LOCKED |
1105                    FEATURE_CONTROL_VMXON_ENABLED))
1106            != (FEATURE_CONTROL_LOCKED |
1107                FEATURE_CONTROL_VMXON_ENABLED))
1108                /* enable and lock */
1109                wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
1110                       FEATURE_CONTROL_LOCKED |
1111                       FEATURE_CONTROL_VMXON_ENABLED);
1112        write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
1113        asm volatile (ASM_VMX_VMXON_RAX
1114                      : : "a"(&phys_addr), "m"(phys_addr)
1115                      : "memory", "cc");
1116}
1117
1118static void vmclear_local_vcpus(void)
1119{
1120        int cpu = raw_smp_processor_id();
1121        struct vcpu_vmx *vmx, *n;
1122
1123        list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu),
1124                                 local_vcpus_link)
1125                __vcpu_clear(vmx);
1126}
1127
1128
1129/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
1130 * tricks.
1131 */
1132static void kvm_cpu_vmxoff(void)
1133{
1134        asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
1135        write_cr4(read_cr4() & ~X86_CR4_VMXE);
1136}
1137
1138static void hardware_disable(void *garbage)
1139{
1140        vmclear_local_vcpus();
1141        kvm_cpu_vmxoff();
1142}
1143
1144static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
1145                                      u32 msr, u32 *result)
1146{
1147        u32 vmx_msr_low, vmx_msr_high;
1148        u32 ctl = ctl_min | ctl_opt;
1149
1150        rdmsr(msr, vmx_msr_low, vmx_msr_high);
1151
1152        ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
1153        ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
1154
1155        /* Ensure minimum (required) set of control bits are supported. */
1156        if (ctl_min & ~ctl)
1157                return -EIO;
1158
1159        *result = ctl;
1160        return 0;
1161}
1162
1163static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1164{
1165        u32 vmx_msr_low, vmx_msr_high;
1166        u32 min, opt, min2, opt2;
1167        u32 _pin_based_exec_control = 0;
1168        u32 _cpu_based_exec_control = 0;
1169        u32 _cpu_based_2nd_exec_control = 0;
1170        u32 _vmexit_control = 0;
1171        u32 _vmentry_control = 0;
1172
1173        min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
1174        opt = PIN_BASED_VIRTUAL_NMIS;
1175        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
1176                                &_pin_based_exec_control) < 0)
1177                return -EIO;
1178
1179        min = CPU_BASED_HLT_EXITING |
1180#ifdef CONFIG_X86_64
1181              CPU_BASED_CR8_LOAD_EXITING |
1182              CPU_BASED_CR8_STORE_EXITING |
1183#endif
1184              CPU_BASED_CR3_LOAD_EXITING |
1185              CPU_BASED_CR3_STORE_EXITING |
1186              CPU_BASED_USE_IO_BITMAPS |
1187              CPU_BASED_MOV_DR_EXITING |
1188              CPU_BASED_USE_TSC_OFFSETING |
1189              CPU_BASED_INVLPG_EXITING;
1190        opt = CPU_BASED_TPR_SHADOW |
1191              CPU_BASED_USE_MSR_BITMAPS |
1192              CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
1193        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
1194                                &_cpu_based_exec_control) < 0)
1195                return -EIO;
1196#ifdef CONFIG_X86_64
1197        if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
1198                _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
1199                                           ~CPU_BASED_CR8_STORE_EXITING;
1200#endif
1201        if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
1202                min2 = 0;
1203                opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
1204                        SECONDARY_EXEC_WBINVD_EXITING |
1205                        SECONDARY_EXEC_ENABLE_VPID |
1206                        SECONDARY_EXEC_ENABLE_EPT;
1207                if (adjust_vmx_controls(min2, opt2,
1208                                        MSR_IA32_VMX_PROCBASED_CTLS2,
1209                                        &_cpu_based_2nd_exec_control) < 0)
1210                        return -EIO;
1211        }
1212#ifndef CONFIG_X86_64
1213        if (!(_cpu_based_2nd_exec_control &
1214                                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
1215                _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
1216#endif
1217        if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
1218                /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
1219                   enabled */
1220                min &= ~(CPU_BASED_CR3_LOAD_EXITING |
1221                         CPU_BASED_CR3_STORE_EXITING |
1222                         CPU_BASED_INVLPG_EXITING);
1223                if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
1224                                        &_cpu_based_exec_control) < 0)
1225                        return -EIO;
1226                rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
1227                      vmx_capability.ept, vmx_capability.vpid);
1228        }
1229
1230        min = 0;
1231#ifdef CONFIG_X86_64
1232        min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
1233#endif
1234        opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT;
1235        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
1236                                &_vmexit_control) < 0)
1237                return -EIO;
1238
1239        min = 0;
1240        opt = VM_ENTRY_LOAD_IA32_PAT;
1241        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
1242                                &_vmentry_control) < 0)
1243                return -EIO;
1244
1245        rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
1246
1247        /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
1248        if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
1249                return -EIO;
1250
1251#ifdef CONFIG_X86_64
1252        /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
1253        if (vmx_msr_high & (1u<<16))
1254                return -EIO;
1255#endif
1256
1257        /* Require Write-Back (WB) memory type for VMCS accesses. */
1258        if (((vmx_msr_high >> 18) & 15) != 6)
1259                return -EIO;
1260
1261        vmcs_conf->size = vmx_msr_high & 0x1fff;
1262        vmcs_conf->order = get_order(vmcs_config.size);
1263        vmcs_conf->revision_id = vmx_msr_low;
1264
1265        vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
1266        vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
1267        vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
1268        vmcs_conf->vmexit_ctrl         = _vmexit_control;
1269        vmcs_conf->vmentry_ctrl        = _vmentry_control;
1270
1271        return 0;
1272}
1273
1274static struct vmcs *alloc_vmcs_cpu(int cpu)
1275{
1276        int node = cpu_to_node(cpu);
1277        struct page *pages;
1278        struct vmcs *vmcs;
1279
1280        pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order);
1281        if (!pages)
1282                return NULL;
1283        vmcs = page_address(pages);
1284        memset(vmcs, 0, vmcs_config.size);
1285        vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
1286        return vmcs;
1287}
1288
1289static struct vmcs *alloc_vmcs(void)
1290{
1291        return alloc_vmcs_cpu(raw_smp_processor_id());
1292}
1293
1294static void free_vmcs(struct vmcs *vmcs)
1295{
1296        free_pages((unsigned long)vmcs, vmcs_config.order);
1297}
1298
1299static void free_kvm_area(void)
1300{
1301        int cpu;
1302
1303        for_each_online_cpu(cpu)
1304                free_vmcs(per_cpu(vmxarea, cpu));
1305}
1306
1307static __init int alloc_kvm_area(void)
1308{
1309        int cpu;
1310
1311        for_each_online_cpu(cpu) {
1312                struct vmcs *vmcs;
1313
1314                vmcs = alloc_vmcs_cpu(cpu);
1315                if (!vmcs) {
1316                        free_kvm_area();
1317                        return -ENOMEM;
1318                }
1319
1320                per_cpu(vmxarea, cpu) = vmcs;
1321        }
1322        return 0;
1323}
1324
1325static __init int hardware_setup(void)
1326{
1327        if (setup_vmcs_config(&vmcs_config) < 0)
1328                return -EIO;
1329
1330        if (boot_cpu_has(X86_FEATURE_NX))
1331                kvm_enable_efer_bits(EFER_NX);
1332
1333        if (!cpu_has_vmx_vpid())
1334                enable_vpid = 0;
1335
1336        if (!cpu_has_vmx_ept())
1337                enable_ept = 0;
1338
1339        if (!cpu_has_vmx_flexpriority())
1340                flexpriority_enabled = 0;
1341
1342        if (!cpu_has_vmx_tpr_shadow())
1343                kvm_x86_ops->update_cr8_intercept = NULL;
1344
1345        return alloc_kvm_area();
1346}
1347
1348static __exit void hardware_unsetup(void)
1349{
1350        free_kvm_area();
1351}
1352
1353static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
1354{
1355        struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1356
1357        if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) {
1358                vmcs_write16(sf->selector, save->selector);
1359                vmcs_writel(sf->base, save->base);
1360                vmcs_write32(sf->limit, save->limit);
1361                vmcs_write32(sf->ar_bytes, save->ar);
1362        } else {
1363                u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK)
1364                        << AR_DPL_SHIFT;
1365                vmcs_write32(sf->ar_bytes, 0x93 | dpl);
1366        }
1367}
1368
1369static void enter_pmode(struct kvm_vcpu *vcpu)
1370{
1371        unsigned long flags;
1372        struct vcpu_vmx *vmx = to_vmx(vcpu);
1373
1374        vmx->emulation_required = 1;
1375        vcpu->arch.rmode.vm86_active = 0;
1376
1377        vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
1378        vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit);
1379        vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar);
1380
1381        flags = vmcs_readl(GUEST_RFLAGS);
1382        flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
1383        flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT);
1384        vmcs_writel(GUEST_RFLAGS, flags);
1385
1386        vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
1387                        (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
1388
1389        update_exception_bitmap(vcpu);
1390
1391        if (emulate_invalid_guest_state)
1392                return;
1393
1394        fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
1395        fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
1396        fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
1397        fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
1398
1399        vmcs_write16(GUEST_SS_SELECTOR, 0);
1400        vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
1401
1402        vmcs_write16(GUEST_CS_SELECTOR,
1403                     vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
1404        vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1405}
1406
1407static gva_t rmode_tss_base(struct kvm *kvm)
1408{
1409        if (!kvm->arch.tss_addr) {
1410                gfn_t base_gfn = kvm->memslots[0].base_gfn +
1411                                 kvm->memslots[0].npages - 3;
1412                return base_gfn << PAGE_SHIFT;
1413        }
1414        return kvm->arch.tss_addr;
1415}
1416
1417static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
1418{
1419        struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1420
1421        save->selector = vmcs_read16(sf->selector);
1422        save->base = vmcs_readl(sf->base);
1423        save->limit = vmcs_read32(sf->limit);
1424        save->ar = vmcs_read32(sf->ar_bytes);
1425        vmcs_write16(sf->selector, save->base >> 4);
1426        vmcs_write32(sf->base, save->base & 0xfffff);
1427        vmcs_write32(sf->limit, 0xffff);
1428        vmcs_write32(sf->ar_bytes, 0xf3);
1429}
1430
1431static void enter_rmode(struct kvm_vcpu *vcpu)
1432{
1433        unsigned long flags;
1434        struct vcpu_vmx *vmx = to_vmx(vcpu);
1435
1436        vmx->emulation_required = 1;
1437        vcpu->arch.rmode.vm86_active = 1;
1438
1439        vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
1440        vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
1441
1442        vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
1443        vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
1444
1445        vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
1446        vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1447
1448        flags = vmcs_readl(GUEST_RFLAGS);
1449        vcpu->arch.rmode.save_iopl
1450                = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1451
1452        flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
1453
1454        vmcs_writel(GUEST_RFLAGS, flags);
1455        vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
1456        update_exception_bitmap(vcpu);
1457
1458        if (emulate_invalid_guest_state)
1459                goto continue_rmode;
1460
1461        vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
1462        vmcs_write32(GUEST_SS_LIMIT, 0xffff);
1463        vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
1464
1465        vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
1466        vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1467        if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
1468                vmcs_writel(GUEST_CS_BASE, 0xf0000);
1469        vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
1470
1471        fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
1472        fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
1473        fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
1474        fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
1475
1476continue_rmode:
1477        kvm_mmu_reset_context(vcpu);
1478        init_rmode(vcpu->kvm);
1479}
1480
1481static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
1482{
1483        struct vcpu_vmx *vmx = to_vmx(vcpu);
1484        struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
1485
1486        vcpu->arch.shadow_efer = efer;
1487        if (!msr)
1488                return;
1489        if (efer & EFER_LMA) {
1490                vmcs_write32(VM_ENTRY_CONTROLS,
1491                             vmcs_read32(VM_ENTRY_CONTROLS) |
1492                             VM_ENTRY_IA32E_MODE);
1493                msr->data = efer;
1494        } else {
1495                vmcs_write32(VM_ENTRY_CONTROLS,
1496                             vmcs_read32(VM_ENTRY_CONTROLS) &
1497                             ~VM_ENTRY_IA32E_MODE);
1498
1499                msr->data = efer & ~EFER_LME;
1500        }
1501        setup_msrs(vmx);
1502}
1503
1504#ifdef CONFIG_X86_64
1505
1506static void enter_lmode(struct kvm_vcpu *vcpu)
1507{
1508        u32 guest_tr_ar;
1509
1510        guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
1511        if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
1512                printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
1513                       __func__);
1514                vmcs_write32(GUEST_TR_AR_BYTES,
1515                             (guest_tr_ar & ~AR_TYPE_MASK)
1516                             | AR_TYPE_BUSY_64_TSS);
1517        }
1518        vcpu->arch.shadow_efer |= EFER_LMA;
1519        vmx_set_efer(vcpu, vcpu->arch.shadow_efer);
1520}
1521
1522static void exit_lmode(struct kvm_vcpu *vcpu)
1523{
1524        vcpu->arch.shadow_efer &= ~EFER_LMA;
1525
1526        vmcs_write32(VM_ENTRY_CONTROLS,
1527                     vmcs_read32(VM_ENTRY_CONTROLS)
1528                     & ~VM_ENTRY_IA32E_MODE);
1529}
1530
1531#endif
1532
1533static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
1534{
1535        vpid_sync_vcpu_all(to_vmx(vcpu));
1536        if (enable_ept)
1537                ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
1538}
1539
1540static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1541{
1542        vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK;
1543        vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
1544}
1545
1546static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
1547{
1548        if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
1549                if (!load_pdptrs(vcpu, vcpu->arch.cr3)) {
1550                        printk(KERN_ERR "EPT: Fail to load pdptrs!\n");
1551                        return;
1552                }
1553                vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]);
1554                vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]);
1555                vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]);
1556                vmcs_write64(GUEST_PDPTR3, vcpu->arch.pdptrs[3]);
1557        }
1558}
1559
1560static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
1561
1562static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
1563                                        unsigned long cr0,
1564                                        struct kvm_vcpu *vcpu)
1565{
1566        if (!(cr0 & X86_CR0_PG)) {
1567                /* From paging/starting to nonpaging */
1568                vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
1569                             vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
1570                             (CPU_BASED_CR3_LOAD_EXITING |
1571                              CPU_BASED_CR3_STORE_EXITING));
1572                vcpu->arch.cr0 = cr0;
1573                vmx_set_cr4(vcpu, vcpu->arch.cr4);
1574                *hw_cr0 |= X86_CR0_PE | X86_CR0_PG;
1575                *hw_cr0 &= ~X86_CR0_WP;
1576        } else if (!is_paging(vcpu)) {
1577                /* From nonpaging to paging */
1578                vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
1579                             vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
1580                             ~(CPU_BASED_CR3_LOAD_EXITING |
1581                               CPU_BASED_CR3_STORE_EXITING));
1582                vcpu->arch.cr0 = cr0;
1583                vmx_set_cr4(vcpu, vcpu->arch.cr4);
1584                if (!(vcpu->arch.cr0 & X86_CR0_WP))
1585                        *hw_cr0 &= ~X86_CR0_WP;
1586        }
1587}
1588
1589static void ept_update_paging_mode_cr4(unsigned long *hw_cr4,
1590                                        struct kvm_vcpu *vcpu)
1591{
1592        if (!is_paging(vcpu)) {
1593                *hw_cr4 &= ~X86_CR4_PAE;
1594                *hw_cr4 |= X86_CR4_PSE;
1595        } else if (!(vcpu->arch.cr4 & X86_CR4_PAE))
1596                *hw_cr4 &= ~X86_CR4_PAE;
1597}
1598
1599static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1600{
1601        unsigned long hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) |
1602                                KVM_VM_CR0_ALWAYS_ON;
1603
1604        vmx_fpu_deactivate(vcpu);
1605
1606        if (vcpu->arch.rmode.vm86_active && (cr0 & X86_CR0_PE))
1607                enter_pmode(vcpu);
1608
1609        if (!vcpu->arch.rmode.vm86_active && !(cr0 & X86_CR0_PE))
1610                enter_rmode(vcpu);
1611
1612#ifdef CONFIG_X86_64
1613        if (vcpu->arch.shadow_efer & EFER_LME) {
1614                if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
1615                        enter_lmode(vcpu);
1616                if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
1617                        exit_lmode(vcpu);
1618        }
1619#endif
1620
1621        if (enable_ept)
1622                ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
1623
1624        vmcs_writel(CR0_READ_SHADOW, cr0);
1625        vmcs_writel(GUEST_CR0, hw_cr0);
1626        vcpu->arch.cr0 = cr0;
1627
1628        if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
1629                vmx_fpu_activate(vcpu);
1630}
1631
1632static u64 construct_eptp(unsigned long root_hpa)
1633{
1634        u64 eptp;
1635
1636        /* TODO write the value reading from MSR */
1637        eptp = VMX_EPT_DEFAULT_MT |
1638                VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
1639        eptp |= (root_hpa & PAGE_MASK);
1640
1641        return eptp;
1642}
1643
1644static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1645{
1646        unsigned long guest_cr3;
1647        u64 eptp;
1648
1649        guest_cr3 = cr3;
1650        if (enable_ept) {
1651                eptp = construct_eptp(cr3);
1652                vmcs_write64(EPT_POINTER, eptp);
1653                ept_sync_context(eptp);
1654                ept_load_pdptrs(vcpu);
1655                guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 :
1656                        VMX_EPT_IDENTITY_PAGETABLE_ADDR;
1657        }
1658
1659        vmx_flush_tlb(vcpu);
1660        vmcs_writel(GUEST_CR3, guest_cr3);
1661        if (vcpu->arch.cr0 & X86_CR0_PE)
1662                vmx_fpu_deactivate(vcpu);
1663}
1664
1665static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1666{
1667        unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.vm86_active ?
1668                    KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
1669
1670        vcpu->arch.cr4 = cr4;
1671        if (enable_ept)
1672                ept_update_paging_mode_cr4(&hw_cr4, vcpu);
1673
1674        vmcs_writel(CR4_READ_SHADOW, cr4);
1675        vmcs_writel(GUEST_CR4, hw_cr4);
1676}
1677
1678static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1679{
1680        struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1681
1682        return vmcs_readl(sf->base);
1683}
1684
1685static void vmx_get_segment(struct kvm_vcpu *vcpu,
1686                            struct kvm_segment *var, int seg)
1687{
1688        struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1689        u32 ar;
1690
1691        var->base = vmcs_readl(sf->base);
1692        var->limit = vmcs_read32(sf->limit);
1693        var->selector = vmcs_read16(sf->selector);
1694        ar = vmcs_read32(sf->ar_bytes);
1695        if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
1696                ar = 0;
1697        var->type = ar & 15;
1698        var->s = (ar >> 4) & 1;
1699        var->dpl = (ar >> 5) & 3;
1700        var->present = (ar >> 7) & 1;
1701        var->avl = (ar >> 12) & 1;
1702        var->l = (ar >> 13) & 1;
1703        var->db = (ar >> 14) & 1;
1704        var->g = (ar >> 15) & 1;
1705        var->unusable = (ar >> 16) & 1;
1706}
1707
1708static int vmx_get_cpl(struct kvm_vcpu *vcpu)
1709{
1710        struct kvm_segment kvm_seg;
1711
1712        if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */
1713                return 0;
1714
1715        if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */
1716                return 3;
1717
1718        vmx_get_segment(vcpu, &kvm_seg, VCPU_SREG_CS);
1719        return kvm_seg.selector & 3;
1720}
1721
1722static u32 vmx_segment_access_rights(struct kvm_segment *var)
1723{
1724        u32 ar;
1725
1726        if (var->unusable)
1727                ar = 1 << 16;
1728        else {
1729                ar = var->type & 15;
1730                ar |= (var->s & 1) << 4;
1731                ar |= (var->dpl & 3) << 5;
1732                ar |= (var->present & 1) << 7;
1733                ar |= (var->avl & 1) << 12;
1734                ar |= (var->l & 1) << 13;
1735                ar |= (var->db & 1) << 14;
1736                ar |= (var->g & 1) << 15;
1737        }
1738        if (ar == 0) /* a 0 value means unusable */
1739                ar = AR_UNUSABLE_MASK;
1740
1741        return ar;
1742}
1743
1744static void vmx_set_segment(struct kvm_vcpu *vcpu,
1745                            struct kvm_segment *var, int seg)
1746{
1747        struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1748        u32 ar;
1749
1750        if (vcpu->arch.rmode.vm86_active && seg == VCPU_SREG_TR) {
1751                vcpu->arch.rmode.tr.selector = var->selector;
1752                vcpu->arch.rmode.tr.base = var->base;
1753                vcpu->arch.rmode.tr.limit = var->limit;
1754                vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var);
1755                return;
1756        }
1757        vmcs_writel(sf->base, var->base);
1758        vmcs_write32(sf->limit, var->limit);
1759        vmcs_write16(sf->selector, var->selector);
1760        if (vcpu->arch.rmode.vm86_active && var->s) {
1761                /*
1762                 * Hack real-mode segments into vm86 compatibility.
1763                 */
1764                if (var->base == 0xffff0000 && var->selector == 0xf000)
1765                        vmcs_writel(sf->base, 0xf0000);
1766                ar = 0xf3;
1767        } else
1768                ar = vmx_segment_access_rights(var);
1769        vmcs_write32(sf->ar_bytes, ar);
1770}
1771
1772static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
1773{
1774        u32 ar = vmcs_read32(GUEST_CS_AR_BYTES);
1775
1776        *db = (ar >> 14) & 1;
1777        *l = (ar >> 13) & 1;
1778}
1779
1780static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1781{
1782        dt->limit = vmcs_read32(GUEST_IDTR_LIMIT);
1783        dt->base = vmcs_readl(GUEST_IDTR_BASE);
1784}
1785
1786static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1787{
1788        vmcs_write32(GUEST_IDTR_LIMIT, dt->limit);
1789        vmcs_writel(GUEST_IDTR_BASE, dt->base);
1790}
1791
1792static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1793{
1794        dt->limit = vmcs_read32(GUEST_GDTR_LIMIT);
1795        dt->base = vmcs_readl(GUEST_GDTR_BASE);
1796}
1797
1798static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1799{
1800        vmcs_write32(GUEST_GDTR_LIMIT, dt->limit);
1801        vmcs_writel(GUEST_GDTR_BASE, dt->base);
1802}
1803
1804static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
1805{
1806        struct kvm_segment var;
1807        u32 ar;
1808
1809        vmx_get_segment(vcpu, &var, seg);
1810        ar = vmx_segment_access_rights(&var);
1811
1812        if (var.base != (var.selector << 4))
1813                return false;
1814        if (var.limit != 0xffff)
1815                return false;
1816        if (ar != 0xf3)
1817                return false;
1818
1819        return true;
1820}
1821
1822static bool code_segment_valid(struct kvm_vcpu *vcpu)
1823{
1824        struct kvm_segment cs;
1825        unsigned int cs_rpl;
1826
1827        vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
1828        cs_rpl = cs.selector & SELECTOR_RPL_MASK;
1829
1830        if (cs.unusable)
1831                return false;
1832        if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK))
1833                return false;
1834        if (!cs.s)
1835                return false;
1836        if (cs.type & AR_TYPE_WRITEABLE_MASK) {
1837                if (cs.dpl > cs_rpl)
1838                        return false;
1839        } else {
1840                if (cs.dpl != cs_rpl)
1841                        return false;
1842        }
1843        if (!cs.present)
1844                return false;
1845
1846        /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
1847        return true;
1848}
1849
1850static bool stack_segment_valid(struct kvm_vcpu *vcpu)
1851{
1852        struct kvm_segment ss;
1853        unsigned int ss_rpl;
1854
1855        vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
1856        ss_rpl = ss.selector & SELECTOR_RPL_MASK;
1857
1858        if (ss.unusable)
1859                return true;
1860        if (ss.type != 3 && ss.type != 7)
1861                return false;
1862        if (!ss.s)
1863                return false;
1864        if (ss.dpl != ss_rpl) /* DPL != RPL */
1865                return false;
1866        if (!ss.present)
1867                return false;
1868
1869        return true;
1870}
1871
1872static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
1873{
1874        struct kvm_segment var;
1875        unsigned int rpl;
1876
1877        vmx_get_segment(vcpu, &var, seg);
1878        rpl = var.selector & SELECTOR_RPL_MASK;
1879
1880        if (var.unusable)
1881                return true;
1882        if (!var.s)
1883                return false;
1884        if (!var.present)
1885                return false;
1886        if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) {
1887                if (var.dpl < rpl) /* DPL < RPL */
1888                        return false;
1889        }
1890
1891        /* TODO: Add other members to kvm_segment_field to allow checking for other access
1892         * rights flags
1893         */
1894        return true;
1895}
1896
1897static bool tr_valid(struct kvm_vcpu *vcpu)
1898{
1899        struct kvm_segment tr;
1900
1901        vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
1902
1903        if (tr.unusable)
1904                return false;
1905        if (tr.selector & SELECTOR_TI_MASK)     /* TI = 1 */
1906                return false;
1907        if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
1908                return false;
1909        if (!tr.present)
1910                return false;
1911
1912        return true;
1913}
1914
1915static bool ldtr_valid(struct kvm_vcpu *vcpu)
1916{
1917        struct kvm_segment ldtr;
1918
1919        vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
1920
1921        if (ldtr.unusable)
1922                return true;
1923        if (ldtr.selector & SELECTOR_TI_MASK)   /* TI = 1 */
1924                return false;
1925        if (ldtr.type != 2)
1926                return false;
1927        if (!ldtr.present)
1928                return false;
1929
1930        return true;
1931}
1932
1933static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
1934{
1935        struct kvm_segment cs, ss;
1936
1937        vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
1938        vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
1939
1940        return ((cs.selector & SELECTOR_RPL_MASK) ==
1941                 (ss.selector & SELECTOR_RPL_MASK));
1942}
1943
1944/*
1945 * Check if guest state is valid. Returns true if valid, false if
1946 * not.
1947 * We assume that registers are always usable
1948 */
1949static bool guest_state_valid(struct kvm_vcpu *vcpu)
1950{
1951        /* real mode guest state checks */
1952        if (!(vcpu->arch.cr0 & X86_CR0_PE)) {
1953                if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
1954                        return false;
1955                if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
1956                        return false;
1957                if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
1958                        return false;
1959                if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
1960                        return false;
1961                if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
1962                        return false;
1963                if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
1964                        return false;
1965        } else {
1966        /* protected mode guest state checks */
1967                if (!cs_ss_rpl_check(vcpu))
1968                        return false;
1969                if (!code_segment_valid(vcpu))
1970                        return false;
1971                if (!stack_segment_valid(vcpu))
1972                        return false;
1973                if (!data_segment_valid(vcpu, VCPU_SREG_DS))
1974                        return false;
1975                if (!data_segment_valid(vcpu, VCPU_SREG_ES))
1976                        return false;
1977                if (!data_segment_valid(vcpu, VCPU_SREG_FS))
1978                        return false;
1979                if (!data_segment_valid(vcpu, VCPU_SREG_GS))
1980                        return false;
1981                if (!tr_valid(vcpu))
1982                        return false;
1983                if (!ldtr_valid(vcpu))
1984                        return false;
1985        }
1986        /* TODO:
1987         * - Add checks on RIP
1988         * - Add checks on RFLAGS
1989         */
1990
1991        return true;
1992}
1993
1994static int init_rmode_tss(struct kvm *kvm)
1995{
1996        gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
1997        u16 data = 0;
1998        int ret = 0;
1999        int r;
2000
2001        r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
2002        if (r < 0)
2003                goto out;
2004        data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
2005        r = kvm_write_guest_page(kvm, fn++, &data,
2006                        TSS_IOPB_BASE_OFFSET, sizeof(u16));
2007        if (r < 0)
2008                goto out;
2009        r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
2010        if (r < 0)
2011                goto out;
2012        r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
2013        if (r < 0)
2014                goto out;
2015        data = ~0;
2016        r = kvm_write_guest_page(kvm, fn, &data,
2017                                 RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
2018                                 sizeof(u8));
2019        if (r < 0)
2020                goto out;
2021
2022        ret = 1;
2023out:
2024        return ret;
2025}
2026
2027static int init_rmode_identity_map(struct kvm *kvm)
2028{
2029        int i, r, ret;
2030        pfn_t identity_map_pfn;
2031        u32 tmp;
2032
2033        if (!enable_ept)
2034                return 1;
2035        if (unlikely(!kvm->arch.ept_identity_pagetable)) {
2036                printk(KERN_ERR "EPT: identity-mapping pagetable "
2037                        "haven't been allocated!\n");
2038                return 0;
2039        }
2040        if (likely(kvm->arch.ept_identity_pagetable_done))
2041                return 1;
2042        ret = 0;
2043        identity_map_pfn = VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT;
2044        r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
2045        if (r < 0)
2046                goto out;
2047        /* Set up identity-mapping pagetable for EPT in real mode */
2048        for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
2049                tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
2050                        _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
2051                r = kvm_write_guest_page(kvm, identity_map_pfn,
2052                                &tmp, i * sizeof(tmp), sizeof(tmp));
2053                if (r < 0)
2054                        goto out;
2055        }
2056        kvm->arch.ept_identity_pagetable_done = true;
2057        ret = 1;
2058out:
2059        return ret;
2060}
2061
2062static void seg_setup(int seg)
2063{
2064        struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2065
2066        vmcs_write16(sf->selector, 0);
2067        vmcs_writel(sf->base, 0);
2068        vmcs_write32(sf->limit, 0xffff);
2069        vmcs_write32(sf->ar_bytes, 0xf3);
2070}
2071
2072static int alloc_apic_access_page(struct kvm *kvm)
2073{
2074        struct kvm_userspace_memory_region kvm_userspace_mem;
2075        int r = 0;
2076
2077        down_write(&kvm->slots_lock);
2078        if (kvm->arch.apic_access_page)
2079                goto out;
2080        kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
2081        kvm_userspace_mem.flags = 0;
2082        kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
2083        kvm_userspace_mem.memory_size = PAGE_SIZE;
2084        r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
2085        if (r)
2086                goto out;
2087
2088        kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
2089out:
2090        up_write(&kvm->slots_lock);
2091        return r;
2092}
2093
2094static int alloc_identity_pagetable(struct kvm *kvm)
2095{
2096        struct kvm_userspace_memory_region kvm_userspace_mem;
2097        int r = 0;
2098
2099        down_write(&kvm->slots_lock);
2100        if (kvm->arch.ept_identity_pagetable)
2101                goto out;
2102        kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
2103        kvm_userspace_mem.flags = 0;
2104        kvm_userspace_mem.guest_phys_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
2105        kvm_userspace_mem.memory_size = PAGE_SIZE;
2106        r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
2107        if (r)
2108                goto out;
2109
2110        kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
2111                        VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT);
2112out:
2113        up_write(&kvm->slots_lock);
2114        return r;
2115}
2116
2117static void allocate_vpid(struct vcpu_vmx *vmx)
2118{
2119        int vpid;
2120
2121        vmx->vpid = 0;
2122        if (!enable_vpid)
2123                return;
2124        spin_lock(&vmx_vpid_lock);
2125        vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
2126        if (vpid < VMX_NR_VPIDS) {
2127                vmx->vpid = vpid;
2128                __set_bit(vpid, vmx_vpid_bitmap);
2129        }
2130        spin_unlock(&vmx_vpid_lock);
2131}
2132
2133static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
2134{
2135        int f = sizeof(unsigned long);
2136
2137        if (!cpu_has_vmx_msr_bitmap())
2138                return;
2139
2140        /*
2141         * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
2142         * have the write-low and read-high bitmap offsets the wrong way round.
2143         * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
2144         */
2145        if (msr <= 0x1fff) {
2146                __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
2147                __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
2148        } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
2149                msr &= 0x1fff;
2150                __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
2151                __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
2152        }
2153}
2154
2155static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
2156{
2157        if (!longmode_only)
2158                __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr);
2159        __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
2160}
2161
2162/*
2163 * Sets up the vmcs for emulated real mode.
2164 */
2165static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2166{
2167        u32 host_sysenter_cs, msr_low, msr_high;
2168        u32 junk;
2169        u64 host_pat, tsc_this, tsc_base;
2170        unsigned long a;
2171        struct descriptor_table dt;
2172        int i;
2173        unsigned long kvm_vmx_return;
2174        u32 exec_control;
2175
2176        /* I/O */
2177        vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
2178        vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
2179
2180        if (cpu_has_vmx_msr_bitmap())
2181                vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
2182
2183        vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
2184
2185        /* Control */
2186        vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
2187                vmcs_config.pin_based_exec_ctrl);
2188
2189        exec_control = vmcs_config.cpu_based_exec_ctrl;
2190        if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
2191                exec_control &= ~CPU_BASED_TPR_SHADOW;
2192#ifdef CONFIG_X86_64
2193                exec_control |= CPU_BASED_CR8_STORE_EXITING |
2194                                CPU_BASED_CR8_LOAD_EXITING;
2195#endif
2196        }
2197        if (!enable_ept)
2198                exec_control |= CPU_BASED_CR3_STORE_EXITING |
2199                                CPU_BASED_CR3_LOAD_EXITING  |
2200                                CPU_BASED_INVLPG_EXITING;
2201        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
2202
2203        if (cpu_has_secondary_exec_ctrls()) {
2204                exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
2205                if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
2206                        exec_control &=
2207                                ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2208                if (vmx->vpid == 0)
2209                        exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
2210                if (!enable_ept)
2211                        exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
2212                vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
2213        }
2214
2215        vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
2216        vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
2217        vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
2218
2219        vmcs_writel(HOST_CR0, read_cr0());  /* 22.2.3 */
2220        vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */
2221        vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
2222
2223        vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
2224        vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
2225        vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
2226        vmcs_write16(HOST_FS_SELECTOR, kvm_read_fs());    /* 22.2.4 */
2227        vmcs_write16(HOST_GS_SELECTOR, kvm_read_gs());    /* 22.2.4 */
2228        vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
2229#ifdef CONFIG_X86_64
2230        rdmsrl(MSR_FS_BASE, a);
2231        vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
2232        rdmsrl(MSR_GS_BASE, a);
2233        vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
2234#else
2235        vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
2236        vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
2237#endif
2238
2239        vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
2240
2241        kvm_get_idt(&dt);
2242        vmcs_writel(HOST_IDTR_BASE, dt.base);   /* 22.2.4 */
2243
2244        asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
2245        vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
2246        vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
2247        vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
2248        vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
2249
2250        rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
2251        vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
2252        rdmsrl(MSR_IA32_SYSENTER_ESP, a);
2253        vmcs_writel(HOST_IA32_SYSENTER_ESP, a);   /* 22.2.3 */
2254        rdmsrl(MSR_IA32_SYSENTER_EIP, a);
2255        vmcs_writel(HOST_IA32_SYSENTER_EIP, a);   /* 22.2.3 */
2256
2257        if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
2258                rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
2259                host_pat = msr_low | ((u64) msr_high << 32);
2260                vmcs_write64(HOST_IA32_PAT, host_pat);
2261        }
2262        if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2263                rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
2264                host_pat = msr_low | ((u64) msr_high << 32);
2265                /* Write the default value follow host pat */
2266                vmcs_write64(GUEST_IA32_PAT, host_pat);
2267                /* Keep arch.pat sync with GUEST_IA32_PAT */
2268                vmx->vcpu.arch.pat = host_pat;
2269        }
2270
2271        for (i = 0; i < NR_VMX_MSR; ++i) {
2272                u32 index = vmx_msr_index[i];
2273                u32 data_low, data_high;
2274                u64 data;
2275                int j = vmx->nmsrs;
2276
2277                if (rdmsr_safe(index, &data_low, &data_high) < 0)
2278                        continue;
2279                if (wrmsr_safe(index, data_low, data_high) < 0)
2280                        continue;
2281                data = data_low | ((u64)data_high << 32);
2282                vmx->host_msrs[j].index = index;
2283                vmx->host_msrs[j].reserved = 0;
2284                vmx->host_msrs[j].data = data;
2285                vmx->guest_msrs[j] = vmx->host_msrs[j];
2286                ++vmx->nmsrs;
2287        }
2288
2289        vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
2290
2291        /* 22.2.1, 20.8.1 */
2292        vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
2293
2294        vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
2295        vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
2296
2297        tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc;
2298        rdtscll(tsc_this);
2299        if (tsc_this < vmx->vcpu.kvm->arch.vm_init_tsc)
2300                tsc_base = tsc_this;
2301
2302        guest_write_tsc(0, tsc_base);
2303
2304        return 0;
2305}
2306
2307static int init_rmode(struct kvm *kvm)
2308{
2309        if (!init_rmode_tss(kvm))
2310                return 0;
2311        if (!init_rmode_identity_map(kvm))
2312                return 0;
2313        return 1;
2314}
2315
2316static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2317{
2318        struct vcpu_vmx *vmx = to_vmx(vcpu);
2319        u64 msr;
2320        int ret;
2321
2322        vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
2323        down_read(&vcpu->kvm->slots_lock);
2324        if (!init_rmode(vmx->vcpu.kvm)) {
2325                ret = -ENOMEM;
2326                goto out;
2327        }
2328
2329        vmx->vcpu.arch.rmode.vm86_active = 0;
2330
2331        vmx->soft_vnmi_blocked = 0;
2332
2333        vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
2334        kvm_set_cr8(&vmx->vcpu, 0);
2335        msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
2336        if (vmx->vcpu.vcpu_id == 0)
2337                msr |= MSR_IA32_APICBASE_BSP;
2338        kvm_set_apic_base(&vmx->vcpu, msr);
2339
2340        fx_init(&vmx->vcpu);
2341
2342        seg_setup(VCPU_SREG_CS);
2343        /*
2344         * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
2345         * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
2346         */
2347        if (vmx->vcpu.vcpu_id == 0) {
2348                vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
2349                vmcs_writel(GUEST_CS_BASE, 0x000f0000);
2350        } else {
2351                vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
2352                vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
2353        }
2354
2355        seg_setup(VCPU_SREG_DS);
2356        seg_setup(VCPU_SREG_ES);
2357        seg_setup(VCPU_SREG_FS);
2358        seg_setup(VCPU_SREG_GS);
2359        seg_setup(VCPU_SREG_SS);
2360
2361        vmcs_write16(GUEST_TR_SELECTOR, 0);
2362        vmcs_writel(GUEST_TR_BASE, 0);
2363        vmcs_write32(GUEST_TR_LIMIT, 0xffff);
2364        vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
2365
2366        vmcs_write16(GUEST_LDTR_SELECTOR, 0);
2367        vmcs_writel(GUEST_LDTR_BASE, 0);
2368        vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
2369        vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
2370
2371        vmcs_write32(GUEST_SYSENTER_CS, 0);
2372        vmcs_writel(GUEST_SYSENTER_ESP, 0);
2373        vmcs_writel(GUEST_SYSENTER_EIP, 0);
2374
2375        vmcs_writel(GUEST_RFLAGS, 0x02);
2376        if (vmx->vcpu.vcpu_id == 0)
2377                kvm_rip_write(vcpu, 0xfff0);
2378        else
2379                kvm_rip_write(vcpu, 0);
2380        kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
2381
2382        vmcs_writel(GUEST_DR7, 0x400);
2383
2384        vmcs_writel(GUEST_GDTR_BASE, 0);
2385        vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
2386
2387        vmcs_writel(GUEST_IDTR_BASE, 0);
2388        vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
2389
2390        vmcs_write32(GUEST_ACTIVITY_STATE, 0);
2391        vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
2392        vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
2393
2394        /* Special registers */
2395        vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
2396
2397        setup_msrs(vmx);
2398
2399        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
2400
2401        if (cpu_has_vmx_tpr_shadow()) {
2402                vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
2403                if (vm_need_tpr_shadow(vmx->vcpu.kvm))
2404                        vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
2405                                page_to_phys(vmx->vcpu.arch.apic->regs_page));
2406                vmcs_write32(TPR_THRESHOLD, 0);
2407        }
2408
2409        if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
2410                vmcs_write64(APIC_ACCESS_ADDR,
2411                             page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
2412
2413        if (vmx->vpid != 0)
2414                vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
2415
2416        vmx->vcpu.arch.cr0 = 0x60000010;
2417        vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
2418        vmx_set_cr4(&vmx->vcpu, 0);
2419        vmx_set_efer(&vmx->vcpu, 0);
2420        vmx_fpu_activate(&vmx->vcpu);
2421        update_exception_bitmap(&vmx->vcpu);
2422
2423        vpid_sync_vcpu_all(vmx);
2424
2425        ret = 0;
2426
2427        /* HACK: Don't enable emulation on guest boot/reset */
2428        vmx->emulation_required = 0;
2429
2430out:
2431        up_read(&vcpu->kvm->slots_lock);
2432        return ret;
2433}
2434
2435static void enable_irq_window(struct kvm_vcpu *vcpu)
2436{
2437        u32 cpu_based_vm_exec_control;
2438
2439        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2440        cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
2441        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2442}
2443
2444static void enable_nmi_window(struct kvm_vcpu *vcpu)
2445{
2446        u32 cpu_based_vm_exec_control;
2447
2448        if (!cpu_has_virtual_nmis()) {
2449                enable_irq_window(vcpu);
2450                return;
2451        }
2452
2453        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2454        cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
2455        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2456}
2457
2458static void vmx_inject_irq(struct kvm_vcpu *vcpu)
2459{
2460        struct vcpu_vmx *vmx = to_vmx(vcpu);
2461        uint32_t intr;
2462        int irq = vcpu->arch.interrupt.nr;
2463
2464        KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler);
2465
2466        ++vcpu->stat.irq_injections;
2467        if (vcpu->arch.rmode.vm86_active) {
2468                vmx->rmode.irq.pending = true;
2469                vmx->rmode.irq.vector = irq;
2470                vmx->rmode.irq.rip = kvm_rip_read(vcpu);
2471                vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2472                             irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
2473                vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
2474                kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
2475                return;
2476        }
2477        intr = irq | INTR_INFO_VALID_MASK;
2478        if (vcpu->arch.interrupt.soft) {
2479                intr |= INTR_TYPE_SOFT_INTR;
2480                vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2481                             vmx->vcpu.arch.event_exit_inst_len);
2482        } else
2483                intr |= INTR_TYPE_EXT_INTR;
2484        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
2485}
2486
2487static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2488{
2489        struct vcpu_vmx *vmx = to_vmx(vcpu);
2490
2491        if (!cpu_has_virtual_nmis()) {
2492                /*
2493                 * Tracking the NMI-blocked state in software is built upon
2494                 * finding the next open IRQ window. This, in turn, depends on
2495                 * well-behaving guests: They have to keep IRQs disabled at
2496                 * least as long as the NMI handler runs. Otherwise we may
2497                 * cause NMI nesting, maybe breaking the guest. But as this is
2498                 * highly unlikely, we can live with the residual risk.
2499                 */
2500                vmx->soft_vnmi_blocked = 1;
2501                vmx->vnmi_blocked_time = 0;
2502        }
2503
2504        ++vcpu->stat.nmi_injections;
2505        if (vcpu->arch.rmode.vm86_active) {
2506                vmx->rmode.irq.pending = true;
2507                vmx->rmode.irq.vector = NMI_VECTOR;
2508                vmx->rmode.irq.rip = kvm_rip_read(vcpu);
2509                vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2510                             NMI_VECTOR | INTR_TYPE_SOFT_INTR |
2511                             INTR_INFO_VALID_MASK);
2512                vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
2513                kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
2514                return;
2515        }
2516        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2517                        INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
2518}
2519
2520static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
2521{
2522        if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
2523                return 0;
2524
2525        return  !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2526                        (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS |
2527                                GUEST_INTR_STATE_NMI));
2528}
2529
2530static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
2531{
2532        return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
2533                !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2534                        (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
2535}
2536
2537static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
2538{
2539        int ret;
2540        struct kvm_userspace_memory_region tss_mem = {
2541                .slot = TSS_PRIVATE_MEMSLOT,
2542                .guest_phys_addr = addr,
2543                .memory_size = PAGE_SIZE * 3,
2544                .flags = 0,
2545        };
2546
2547        ret = kvm_set_memory_region(kvm, &tss_mem, 0);
2548        if (ret)
2549                return ret;
2550        kvm->arch.tss_addr = addr;
2551        return 0;
2552}
2553
2554static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2555                                  int vec, u32 err_code)
2556{
2557        /*
2558         * Instruction with address size override prefix opcode 0x67
2559         * Cause the #SS fault with 0 error code in VM86 mode.
2560         */
2561        if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
2562                if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE)
2563                        return 1;
2564        /*
2565         * Forward all other exceptions that are valid in real mode.
2566         * FIXME: Breaks guest debugging in real mode, needs to be fixed with
2567         *        the required debugging infrastructure rework.
2568         */
2569        switch (vec) {
2570        case DB_VECTOR:
2571                if (vcpu->guest_debug &
2572                    (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
2573                        return 0;
2574                kvm_queue_exception(vcpu, vec);
2575                return 1;
2576        case BP_VECTOR:
2577                if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
2578                        return 0;
2579                /* fall through */
2580        case DE_VECTOR:
2581        case OF_VECTOR:
2582        case BR_VECTOR:
2583        case UD_VECTOR:
2584        case DF_VECTOR:
2585        case SS_VECTOR:
2586        case GP_VECTOR:
2587        case MF_VECTOR:
2588                kvm_queue_exception(vcpu, vec);
2589                return 1;
2590        }
2591        return 0;
2592}
2593
2594/*
2595 * Trigger machine check on the host. We assume all the MSRs are already set up
2596 * by the CPU and that we still run on the same CPU as the MCE occurred on.
2597 * We pass a fake environment to the machine check handler because we want
2598 * the guest to be always treated like user space, no matter what context
2599 * it used internally.
2600 */
2601static void kvm_machine_check(void)
2602{
2603#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
2604        struct pt_regs regs = {
2605                .cs = 3, /* Fake ring 3 no matter what the guest ran on */
2606                .flags = X86_EFLAGS_IF,
2607        };
2608
2609        do_machine_check(&regs, 0);
2610#endif
2611}
2612
2613static int handle_machine_check(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2614{
2615        /* already handled by vcpu_run */
2616        return 1;
2617}
2618
2619static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2620{
2621        struct vcpu_vmx *vmx = to_vmx(vcpu);
2622        u32 intr_info, ex_no, error_code;
2623        unsigned long cr2, rip, dr6;
2624        u32 vect_info;
2625        enum emulation_result er;
2626
2627        vect_info = vmx->idt_vectoring_info;
2628        intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2629
2630        if (is_machine_check(intr_info))
2631                return handle_machine_check(vcpu, kvm_run);
2632
2633        if ((vect_info & VECTORING_INFO_VALID_MASK) &&
2634                                                !is_page_fault(intr_info))
2635                printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
2636                       "intr info 0x%x\n", __func__, vect_info, intr_info);
2637
2638        if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
2639                return 1;  /* already handled by vmx_vcpu_run() */
2640
2641        if (is_no_device(intr_info)) {
2642                vmx_fpu_activate(vcpu);
2643                return 1;
2644        }
2645
2646        if (is_invalid_opcode(intr_info)) {
2647                er = emulate_instruction(vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD);
2648                if (er != EMULATE_DONE)
2649                        kvm_queue_exception(vcpu, UD_VECTOR);
2650                return 1;
2651        }
2652
2653        error_code = 0;
2654        rip = kvm_rip_read(vcpu);
2655        if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
2656                error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
2657        if (is_page_fault(intr_info)) {
2658                /* EPT won't cause page fault directly */
2659                if (enable_ept)
2660                        BUG();
2661                cr2 = vmcs_readl(EXIT_QUALIFICATION);
2662                KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
2663                            (u32)((u64)cr2 >> 32), handler);
2664                if (kvm_event_needs_reinjection(vcpu))
2665                        kvm_mmu_unprotect_page_virt(vcpu, cr2);
2666                return kvm_mmu_page_fault(vcpu, cr2, error_code);
2667        }
2668
2669        if (vcpu->arch.rmode.vm86_active &&
2670            handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
2671                                                                error_code)) {
2672                if (vcpu->arch.halt_request) {
2673                        vcpu->arch.halt_request = 0;
2674                        return kvm_emulate_halt(vcpu);
2675                }
2676                return 1;
2677        }
2678
2679        ex_no = intr_info & INTR_INFO_VECTOR_MASK;
2680        switch (ex_no) {
2681        case DB_VECTOR:
2682                dr6 = vmcs_readl(EXIT_QUALIFICATION);
2683                if (!(vcpu->guest_debug &
2684                      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
2685                        vcpu->arch.dr6 = dr6 | DR6_FIXED_1;
2686                        kvm_queue_exception(vcpu, DB_VECTOR);
2687                        return 1;
2688                }
2689                kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
2690                kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
2691                /* fall through */
2692        case BP_VECTOR:
2693                kvm_run->exit_reason = KVM_EXIT_DEBUG;
2694                kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
2695                kvm_run->debug.arch.exception = ex_no;
2696                break;
2697        default:
2698                kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
2699                kvm_run->ex.exception = ex_no;
2700                kvm_run->ex.error_code = error_code;
2701                break;
2702        }
2703        return 0;
2704}
2705
2706static int handle_external_interrupt(struct kvm_vcpu *vcpu,
2707                                     struct kvm_run *kvm_run)
2708{
2709        ++vcpu->stat.irq_exits;
2710        KVMTRACE_1D(INTR, vcpu, vmcs_read32(VM_EXIT_INTR_INFO), handler);
2711        return 1;
2712}
2713
2714static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2715{
2716        kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
2717        return 0;
2718}
2719
2720static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2721{
2722        unsigned long exit_qualification;
2723        int size, in, string;
2724        unsigned port;
2725
2726        ++vcpu->stat.io_exits;
2727        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
2728        string = (exit_qualification & 16) != 0;
2729
2730        if (string) {
2731                if (emulate_instruction(vcpu,
2732                                        kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
2733                        return 0;
2734                return 1;
2735        }
2736
2737        size = (exit_qualification & 7) + 1;
2738        in = (exit_qualification & 8) != 0;
2739        port = exit_qualification >> 16;
2740
2741        skip_emulated_instruction(vcpu);
2742        return kvm_emulate_pio(vcpu, kvm_run, in, size, port);
2743}
2744
2745static void
2746vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
2747{
2748        /*
2749         * Patch in the VMCALL instruction:
2750         */
2751        hypercall[0] = 0x0f;
2752        hypercall[1] = 0x01;
2753        hypercall[2] = 0xc1;
2754}
2755
2756static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2757{
2758        unsigned long exit_qualification;
2759        int cr;
2760        int reg;
2761
2762        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
2763        cr = exit_qualification & 15;
2764        reg = (exit_qualification >> 8) & 15;
2765        switch ((exit_qualification >> 4) & 3) {
2766        case 0: /* mov to cr */
2767                KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr,
2768                            (u32)kvm_register_read(vcpu, reg),
2769                            (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
2770                            handler);
2771                switch (cr) {
2772                case 0:
2773                        kvm_set_cr0(vcpu, kvm_register_read(vcpu, reg));
2774                        skip_emulated_instruction(vcpu);
2775                        return 1;
2776                case 3:
2777                        kvm_set_cr3(vcpu, kvm_register_read(vcpu, reg));
2778                        skip_emulated_instruction(vcpu);
2779                        return 1;
2780                case 4:
2781                        kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg));
2782                        skip_emulated_instruction(vcpu);
2783                        return 1;
2784                case 8: {
2785                                u8 cr8_prev = kvm_get_cr8(vcpu);
2786                                u8 cr8 = kvm_register_read(vcpu, reg);
2787                                kvm_set_cr8(vcpu, cr8);
2788                                skip_emulated_instruction(vcpu);
2789                                if (irqchip_in_kernel(vcpu->kvm))
2790                                        return 1;
2791                                if (cr8_prev <= cr8)
2792                                        return 1;
2793                                kvm_run->exit_reason = KVM_EXIT_SET_TPR;
2794                                return 0;
2795                        }
2796                };
2797                break;
2798        case 2: /* clts */
2799                vmx_fpu_deactivate(vcpu);
2800                vcpu->arch.cr0 &= ~X86_CR0_TS;
2801                vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
2802                vmx_fpu_activate(vcpu);
2803                KVMTRACE_0D(CLTS, vcpu, handler);
2804                skip_emulated_instruction(vcpu);
2805                return 1;
2806        case 1: /*mov from cr*/
2807                switch (cr) {
2808                case 3:
2809                        kvm_register_write(vcpu, reg, vcpu->arch.cr3);
2810                        KVMTRACE_3D(CR_READ, vcpu, (u32)cr,
2811                                    (u32)kvm_register_read(vcpu, reg),
2812                                    (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
2813                                    handler);
2814                        skip_emulated_instruction(vcpu);
2815                        return 1;
2816                case 8:
2817                        kvm_register_write(vcpu, reg, kvm_get_cr8(vcpu));
2818                        KVMTRACE_2D(CR_READ, vcpu, (u32)cr,
2819                                    (u32)kvm_register_read(vcpu, reg), handler);
2820                        skip_emulated_instruction(vcpu);
2821                        return 1;
2822                }
2823                break;
2824        case 3: /* lmsw */
2825                kvm_lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
2826
2827                skip_emulated_instruction(vcpu);
2828                return 1;
2829        default:
2830                break;
2831        }
2832        kvm_run->exit_reason = 0;
2833        pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
2834               (int)(exit_qualification >> 4) & 3, cr);
2835        return 0;
2836}
2837
2838static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2839{
2840        unsigned long exit_qualification;
2841        unsigned long val;
2842        int dr, reg;
2843
2844        dr = vmcs_readl(GUEST_DR7);
2845        if (dr & DR7_GD) {
2846                /*
2847                 * As the vm-exit takes precedence over the debug trap, we
2848                 * need to emulate the latter, either for the host or the
2849                 * guest debugging itself.
2850                 */
2851                if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
2852                        kvm_run->debug.arch.dr6 = vcpu->arch.dr6;
2853                        kvm_run->debug.arch.dr7 = dr;
2854                        kvm_run->debug.arch.pc =
2855                                vmcs_readl(GUEST_CS_BASE) +
2856                                vmcs_readl(GUEST_RIP);
2857                        kvm_run->debug.arch.exception = DB_VECTOR;
2858                        kvm_run->exit_reason = KVM_EXIT_DEBUG;
2859                        return 0;
2860                } else {
2861                        vcpu->arch.dr7 &= ~DR7_GD;
2862                        vcpu->arch.dr6 |= DR6_BD;
2863                        vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
2864                        kvm_queue_exception(vcpu, DB_VECTOR);
2865                        return 1;
2866                }
2867        }
2868
2869        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
2870        dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
2871        reg = DEBUG_REG_ACCESS_REG(exit_qualification);
2872        if (exit_qualification & TYPE_MOV_FROM_DR) {
2873                switch (dr) {
2874                case 0 ... 3:
2875                        val = vcpu->arch.db[dr];
2876                        break;
2877                case 6:
2878                        val = vcpu->arch.dr6;
2879                        break;
2880                case 7:
2881                        val = vcpu->arch.dr7;
2882                        break;
2883                default:
2884                        val = 0;
2885                }
2886                kvm_register_write(vcpu, reg, val);
2887                KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
2888        } else {
2889                val = vcpu->arch.regs[reg];
2890                switch (dr) {
2891                case 0 ... 3:
2892                        vcpu->arch.db[dr] = val;
2893                        if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
2894                                vcpu->arch.eff_db[dr] = val;
2895                        break;
2896                case 4 ... 5:
2897                        if (vcpu->arch.cr4 & X86_CR4_DE)
2898                                kvm_queue_exception(vcpu, UD_VECTOR);
2899                        break;
2900                case 6:
2901                        if (val & 0xffffffff00000000ULL) {
2902                                kvm_queue_exception(vcpu, GP_VECTOR);
2903                                break;
2904                        }
2905                        vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
2906                        break;
2907                case 7:
2908                        if (val & 0xffffffff00000000ULL) {
2909                                kvm_queue_exception(vcpu, GP_VECTOR);
2910                                break;
2911                        }
2912                        vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
2913                        if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
2914                                vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
2915                                vcpu->arch.switch_db_regs =
2916                                        (val & DR7_BP_EN_MASK);
2917                        }
2918                        break;
2919                }
2920                KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)val, handler);
2921        }
2922        skip_emulated_instruction(vcpu);
2923        return 1;
2924}
2925
2926static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2927{
2928        kvm_emulate_cpuid(vcpu);
2929        return 1;
2930}
2931
2932static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2933{
2934        u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
2935        u64 data;
2936
2937        if (vmx_get_msr(vcpu, ecx, &data)) {
2938                kvm_inject_gp(vcpu, 0);
2939                return 1;
2940        }
2941
2942        KVMTRACE_3D(MSR_READ, vcpu, ecx, (u32)data, (u32)(data >> 32),
2943                    handler);
2944
2945        /* FIXME: handling of bits 32:63 of rax, rdx */
2946        vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
2947        vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
2948        skip_emulated_instruction(vcpu);
2949        return 1;
2950}
2951
2952static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2953{
2954        u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
2955        u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
2956                | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
2957
2958        KVMTRACE_3D(MSR_WRITE, vcpu, ecx, (u32)data, (u32)(data >> 32),
2959                    handler);
2960
2961        if (vmx_set_msr(vcpu, ecx, data) != 0) {
2962                kvm_inject_gp(vcpu, 0);
2963                return 1;
2964        }
2965
2966        skip_emulated_instruction(vcpu);
2967        return 1;
2968}
2969
2970static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu,
2971                                      struct kvm_run *kvm_run)
2972{
2973        return 1;
2974}
2975
2976static int handle_interrupt_window(struct kvm_vcpu *vcpu,
2977                                   struct kvm_run *kvm_run)
2978{
2979        u32 cpu_based_vm_exec_control;
2980
2981        /* clear pending irq */
2982        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2983        cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2984        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2985
2986        KVMTRACE_0D(PEND_INTR, vcpu, handler);
2987        ++vcpu->stat.irq_window_exits;
2988
2989        /*
2990         * If the user space waits to inject interrupts, exit as soon as
2991         * possible
2992         */
2993        if (!irqchip_in_kernel(vcpu->kvm) &&
2994            kvm_run->request_interrupt_window &&
2995            !kvm_cpu_has_interrupt(vcpu)) {
2996                kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
2997                return 0;
2998        }
2999        return 1;
3000}
3001
3002static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3003{
3004        skip_emulated_instruction(vcpu);
3005        return kvm_emulate_halt(vcpu);
3006}
3007
3008static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3009{
3010        skip_emulated_instruction(vcpu);
3011        kvm_emulate_hypercall(vcpu);
3012        return 1;
3013}
3014
3015static int handle_vmx_insn(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3016{
3017        kvm_queue_exception(vcpu, UD_VECTOR);
3018        return 1;
3019}
3020
3021static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3022{
3023        unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3024
3025        kvm_mmu_invlpg(vcpu, exit_qualification);
3026        skip_emulated_instruction(vcpu);
3027        return 1;
3028}
3029
3030static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3031{
3032        skip_emulated_instruction(vcpu);
3033        /* TODO: Add support for VT-d/pass-through device */
3034        return 1;
3035}
3036
3037static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3038{
3039        unsigned long exit_qualification;
3040        enum emulation_result er;
3041        unsigned long offset;
3042
3043        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3044        offset = exit_qualification & 0xffful;
3045
3046        er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
3047
3048        if (er !=  EMULATE_DONE) {
3049                printk(KERN_ERR
3050                       "Fail to handle apic access vmexit! Offset is 0x%lx\n",
3051                       offset);
3052                return -ENOTSUPP;
3053        }
3054        return 1;
3055}
3056
3057static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3058{
3059        struct vcpu_vmx *vmx = to_vmx(vcpu);
3060        unsigned long exit_qualification;
3061        u16 tss_selector;
3062        int reason, type, idt_v;
3063
3064        idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
3065        type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
3066
3067        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3068
3069        reason = (u32)exit_qualification >> 30;
3070        if (reason == TASK_SWITCH_GATE && idt_v) {
3071                switch (type) {
3072                case INTR_TYPE_NMI_INTR:
3073                        vcpu->arch.nmi_injected = false;
3074                        if (cpu_has_virtual_nmis())
3075                                vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
3076                                              GUEST_INTR_STATE_NMI);
3077                        break;
3078                case INTR_TYPE_EXT_INTR:
3079                case INTR_TYPE_SOFT_INTR:
3080                        kvm_clear_interrupt_queue(vcpu);
3081                        break;
3082                case INTR_TYPE_HARD_EXCEPTION:
3083                case INTR_TYPE_SOFT_EXCEPTION:
3084                        kvm_clear_exception_queue(vcpu);
3085                        break;
3086                default:
3087                        break;
3088                }
3089        }
3090        tss_selector = exit_qualification;
3091
3092        if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
3093                       type != INTR_TYPE_EXT_INTR &&
3094                       type != INTR_TYPE_NMI_INTR))
3095                skip_emulated_instruction(vcpu);
3096
3097        if (!kvm_task_switch(vcpu, tss_selector, reason))
3098                return 0;
3099
3100        /* clear all local breakpoint enable flags */
3101        vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55);
3102
3103        /*
3104         * TODO: What about debug traps on tss switch?
3105         *       Are we supposed to inject them and update dr6?
3106         */
3107
3108        return 1;
3109}
3110
3111static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3112{
3113        unsigned long exit_qualification;
3114        gpa_t gpa;
3115        int gla_validity;
3116
3117        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3118
3119        if (exit_qualification & (1 << 6)) {
3120                printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
3121                return -ENOTSUPP;
3122        }
3123
3124        gla_validity = (exit_qualification >> 7) & 0x3;
3125        if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
3126                printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
3127                printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
3128                        (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
3129                        vmcs_readl(GUEST_LINEAR_ADDRESS));
3130                printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
3131                        (long unsigned int)exit_qualification);
3132                kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
3133                kvm_run->hw.hardware_exit_reason = 0;
3134                return -ENOTSUPP;
3135        }
3136
3137        gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
3138        return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0);
3139}
3140
3141static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3142{
3143        u32 cpu_based_vm_exec_control;
3144
3145        /* clear pending NMI */
3146        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
3147        cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
3148        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
3149        ++vcpu->stat.nmi_window_exits;
3150
3151        return 1;
3152}
3153
3154static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
3155                                struct kvm_run *kvm_run)
3156{
3157        struct vcpu_vmx *vmx = to_vmx(vcpu);
3158        enum emulation_result err = EMULATE_DONE;
3159
3160        local_irq_enable();
3161        preempt_enable();
3162
3163        while (!guest_state_valid(vcpu)) {
3164                err = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
3165
3166                if (err == EMULATE_DO_MMIO)
3167                        break;
3168
3169                if (err != EMULATE_DONE) {
3170                        kvm_report_emulation_failure(vcpu, "emulation failure");
3171                        break;
3172                }
3173
3174                if (signal_pending(current))
3175                        break;
3176                if (need_resched())
3177                        schedule();
3178        }
3179
3180        preempt_disable();
3181        local_irq_disable();
3182
3183        vmx->invalid_state_emulation_result = err;
3184}
3185
3186/*
3187 * The exit handlers return 1 if the exit was handled fully and guest execution
3188 * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
3189 * to be done to userspace and return 0.
3190 */
3191static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
3192                                      struct kvm_run *kvm_run) = {
3193        [EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
3194        [EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
3195        [EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
3196        [EXIT_REASON_NMI_WINDOW]              = handle_nmi_window,
3197        [EXIT_REASON_IO_INSTRUCTION]          = handle_io,
3198        [EXIT_REASON_CR_ACCESS]               = handle_cr,
3199        [EXIT_REASON_DR_ACCESS]               = handle_dr,
3200        [EXIT_REASON_CPUID]                   = handle_cpuid,
3201        [EXIT_REASON_MSR_READ]                = handle_rdmsr,
3202        [EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
3203        [EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
3204        [EXIT_REASON_HLT]                     = handle_halt,
3205        [EXIT_REASON_INVLPG]                  = handle_invlpg,
3206        [EXIT_REASON_VMCALL]                  = handle_vmcall,
3207        [EXIT_REASON_VMCLEAR]                 = handle_vmx_insn,
3208        [EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
3209        [EXIT_REASON_VMPTRLD]                 = handle_vmx_insn,
3210        [EXIT_REASON_VMPTRST]                 = handle_vmx_insn,
3211        [EXIT_REASON_VMREAD]                  = handle_vmx_insn,
3212        [EXIT_REASON_VMRESUME]                = handle_vmx_insn,
3213        [EXIT_REASON_VMWRITE]                 = handle_vmx_insn,
3214        [EXIT_REASON_VMOFF]                   = handle_vmx_insn,
3215        [EXIT_REASON_VMON]                    = handle_vmx_insn,
3216        [EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
3217        [EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
3218        [EXIT_REASON_WBINVD]                  = handle_wbinvd,
3219        [EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
3220        [EXIT_REASON_EPT_VIOLATION]           = handle_ept_violation,
3221        [EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
3222};
3223
3224static const int kvm_vmx_max_exit_handlers =
3225        ARRAY_SIZE(kvm_vmx_exit_handlers);
3226
3227/*
3228 * The guest has exited.  See if we can fix it or if we need userspace
3229 * assistance.
3230 */
3231static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3232{
3233        struct vcpu_vmx *vmx = to_vmx(vcpu);
3234        u32 exit_reason = vmx->exit_reason;
3235        u32 vectoring_info = vmx->idt_vectoring_info;
3236
3237        KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu),
3238                    (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit);
3239
3240        /* If we need to emulate an MMIO from handle_invalid_guest_state
3241         * we just return 0 */
3242        if (vmx->emulation_required && emulate_invalid_guest_state) {
3243                if (guest_state_valid(vcpu))
3244                        vmx->emulation_required = 0;
3245                return vmx->invalid_state_emulation_result != EMULATE_DO_MMIO;
3246        }
3247
3248        /* Access CR3 don't cause VMExit in paging mode, so we need
3249         * to sync with guest real CR3. */
3250        if (enable_ept && is_paging(vcpu)) {
3251                vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3252                ept_load_pdptrs(vcpu);
3253        }
3254
3255        if (unlikely(vmx->fail)) {
3256                kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3257                kvm_run->fail_entry.hardware_entry_failure_reason
3258                        = vmcs_read32(VM_INSTRUCTION_ERROR);
3259                return 0;
3260        }
3261
3262        if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
3263                        (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
3264                        exit_reason != EXIT_REASON_EPT_VIOLATION &&
3265                        exit_reason != EXIT_REASON_TASK_SWITCH))
3266                printk(KERN_WARNING "%s: unexpected, valid vectoring info "
3267                       "(0x%x) and exit reason is 0x%x\n",
3268                       __func__, vectoring_info, exit_reason);
3269
3270        if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
3271                if (vmx_interrupt_allowed(vcpu)) {
3272                        vmx->soft_vnmi_blocked = 0;
3273                } else if (vmx->vnmi_blocked_time > 1000000000LL &&
3274                           vcpu->arch.nmi_pending) {
3275                        /*
3276                         * This CPU don't support us in finding the end of an
3277                         * NMI-blocked window if the guest runs with IRQs
3278                         * disabled. So we pull the trigger after 1 s of
3279                         * futile waiting, but inform the user about this.
3280                         */
3281                        printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
3282                               "state on VCPU %d after 1 s timeout\n",
3283                               __func__, vcpu->vcpu_id);
3284                        vmx->soft_vnmi_blocked = 0;
3285                }
3286        }
3287
3288        if (exit_reason < kvm_vmx_max_exit_handlers
3289            && kvm_vmx_exit_handlers[exit_reason])
3290                return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
3291        else {
3292                kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
3293                kvm_run->hw.hardware_exit_reason = exit_reason;
3294        }
3295        return 0;
3296}
3297
3298static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3299{
3300        if (irr == -1 || tpr < irr) {
3301                vmcs_write32(TPR_THRESHOLD, 0);
3302                return;
3303        }
3304
3305        vmcs_write32(TPR_THRESHOLD, irr);
3306}
3307
3308static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3309{
3310        u32 exit_intr_info;
3311        u32 idt_vectoring_info = vmx->idt_vectoring_info;
3312        bool unblock_nmi;
3313        u8 vector;
3314        int type;
3315        bool idtv_info_valid;
3316
3317        exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
3318
3319        vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
3320
3321        /* Handle machine checks before interrupts are enabled */
3322        if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
3323            || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
3324                && is_machine_check(exit_intr_info)))
3325                kvm_machine_check();
3326
3327        /* We need to handle NMIs before interrupts are enabled */
3328        if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
3329            (exit_intr_info & INTR_INFO_VALID_MASK)) {
3330                KVMTRACE_0D(NMI, &vmx->vcpu, handler);
3331                asm("int $2");
3332        }
3333
3334        idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
3335
3336        if (cpu_has_virtual_nmis()) {
3337                unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
3338                vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
3339                /*
3340                 * SDM 3: 27.7.1.2 (September 2008)
3341                 * Re-set bit "block by NMI" before VM entry if vmexit caused by
3342                 * a guest IRET fault.
3343                 * SDM 3: 23.2.2 (September 2008)
3344                 * Bit 12 is undefined in any of the following cases:
3345                 *  If the VM exit sets the valid bit in the IDT-vectoring
3346                 *   information field.
3347                 *  If the VM exit is due to a double fault.
3348                 */
3349                if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
3350                    vector != DF_VECTOR && !idtv_info_valid)
3351                        vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
3352                                      GUEST_INTR_STATE_NMI);
3353        } else if (unlikely(vmx->soft_vnmi_blocked))
3354                vmx->vnmi_blocked_time +=
3355                        ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
3356
3357        vmx->vcpu.arch.nmi_injected = false;
3358        kvm_clear_exception_queue(&vmx->vcpu);
3359        kvm_clear_interrupt_queue(&vmx->vcpu);
3360
3361        if (!idtv_info_valid)
3362                return;
3363
3364        vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
3365        type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
3366
3367        switch (type) {
3368        case INTR_TYPE_NMI_INTR:
3369                vmx->vcpu.arch.nmi_injected = true;
3370                /*
3371                 * SDM 3: 27.7.1.2 (September 2008)
3372                 * Clear bit "block by NMI" before VM entry if a NMI
3373                 * delivery faulted.
3374                 */
3375                vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
3376                                GUEST_INTR_STATE_NMI);
3377                break;
3378        case INTR_TYPE_SOFT_EXCEPTION:
3379                vmx->vcpu.arch.event_exit_inst_len =
3380                        vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
3381                /* fall through */
3382        case INTR_TYPE_HARD_EXCEPTION:
3383                if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
3384                        u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE);
3385                        kvm_queue_exception_e(&vmx->vcpu, vector, err);
3386                } else
3387                        kvm_queue_exception(&vmx->vcpu, vector);
3388                break;
3389        case INTR_TYPE_SOFT_INTR:
3390                vmx->vcpu.arch.event_exit_inst_len =
3391                        vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
3392                /* fall through */
3393        case INTR_TYPE_EXT_INTR:
3394                kvm_queue_interrupt(&vmx->vcpu, vector,
3395                        type == INTR_TYPE_SOFT_INTR);
3396                break;
3397        default:
3398                break;
3399        }
3400}
3401
3402/*
3403 * Failure to inject an interrupt should give us the information
3404 * in IDT_VECTORING_INFO_FIELD.  However, if the failure occurs
3405 * when fetching the interrupt redirection bitmap in the real-mode
3406 * tss, this doesn't happen.  So we do it ourselves.
3407 */
3408static void fixup_rmode_irq(struct vcpu_vmx *vmx)
3409{
3410        vmx->rmode.irq.pending = 0;
3411        if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip)
3412                return;
3413        kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip);
3414        if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
3415                vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
3416                vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR;
3417                return;
3418        }
3419        vmx->idt_vectoring_info =
3420                VECTORING_INFO_VALID_MASK
3421                | INTR_TYPE_EXT_INTR
3422                | vmx->rmode.irq.vector;
3423}
3424
3425#ifdef CONFIG_X86_64
3426#define R "r"
3427#define Q "q"
3428#else
3429#define R "e"
3430#define Q "l"
3431#endif
3432
3433static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3434{
3435        struct vcpu_vmx *vmx = to_vmx(vcpu);
3436
3437        /* Record the guest's net vcpu time for enforced NMI injections. */
3438        if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
3439                vmx->entry_time = ktime_get();
3440
3441        /* Handle invalid guest state instead of entering VMX */
3442        if (vmx->emulation_required && emulate_invalid_guest_state) {
3443                handle_invalid_guest_state(vcpu, kvm_run);
3444                return;
3445        }
3446
3447        if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
3448                vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
3449        if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
3450                vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
3451
3452        /*
3453         * Loading guest fpu may have cleared host cr0.ts
3454         */
3455        vmcs_writel(HOST_CR0, read_cr0());
3456
3457        set_debugreg(vcpu->arch.dr6, 6);
3458
3459        asm(
3460                /* Store host registers */
3461                "push %%"R"dx; push %%"R"bp;"
3462                "push %%"R"cx \n\t"
3463                "cmp %%"R"sp, %c[host_rsp](%0) \n\t"
3464                "je 1f \n\t"
3465                "mov %%"R"sp, %c[host_rsp](%0) \n\t"
3466                __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
3467                "1: \n\t"
3468                /* Check if vmlaunch of vmresume is needed */
3469                "cmpl $0, %c[launched](%0) \n\t"
3470                /* Load guest registers.  Don't clobber flags. */
3471                "mov %c[cr2](%0), %%"R"ax \n\t"
3472                "mov %%"R"ax, %%cr2 \n\t"
3473                "mov %c[rax](%0), %%"R"ax \n\t"
3474                "mov %c[rbx](%0), %%"R"bx \n\t"
3475                "mov %c[rdx](%0), %%"R"dx \n\t"
3476                "mov %c[rsi](%0), %%"R"si \n\t"
3477                "mov %c[rdi](%0), %%"R"di \n\t"
3478                "mov %c[rbp](%0), %%"R"bp \n\t"
3479#ifdef CONFIG_X86_64
3480                "mov %c[r8](%0),  %%r8  \n\t"
3481                "mov %c[r9](%0),  %%r9  \n\t"
3482                "mov %c[r10](%0), %%r10 \n\t"
3483                "mov %c[r11](%0), %%r11 \n\t"
3484                "mov %c[r12](%0), %%r12 \n\t"
3485                "mov %c[r13](%0), %%r13 \n\t"
3486                "mov %c[r14](%0), %%r14 \n\t"
3487                "mov %c[r15](%0), %%r15 \n\t"
3488#endif
3489                "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */
3490
3491                /* Enter guest mode */
3492                "jne .Llaunched \n\t"
3493                __ex(ASM_VMX_VMLAUNCH) "\n\t"
3494                "jmp .Lkvm_vmx_return \n\t"
3495                ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
3496                ".Lkvm_vmx_return: "
3497                /* Save guest registers, load host registers, keep flags */
3498                "xchg %0,     (%%"R"sp) \n\t"
3499                "mov %%"R"ax, %c[rax](%0) \n\t"
3500                "mov %%"R"bx, %c[rbx](%0) \n\t"
3501                "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t"
3502                "mov %%"R"dx, %c[rdx](%0) \n\t"
3503                "mov %%"R"si, %c[rsi](%0) \n\t"
3504                "mov %%"R"di, %c[rdi](%0) \n\t"
3505                "mov %%"R"bp, %c[rbp](%0) \n\t"
3506#ifdef CONFIG_X86_64
3507                "mov %%r8,  %c[r8](%0) \n\t"
3508                "mov %%r9,  %c[r9](%0) \n\t"
3509                "mov %%r10, %c[r10](%0) \n\t"
3510                "mov %%r11, %c[r11](%0) \n\t"
3511                "mov %%r12, %c[r12](%0) \n\t"
3512                "mov %%r13, %c[r13](%0) \n\t"
3513                "mov %%r14, %c[r14](%0) \n\t"
3514                "mov %%r15, %c[r15](%0) \n\t"
3515#endif
3516                "mov %%cr2, %%"R"ax   \n\t"
3517                "mov %%"R"ax, %c[cr2](%0) \n\t"
3518
3519                "pop  %%"R"bp; pop  %%"R"bp; pop  %%"R"dx \n\t"
3520                "setbe %c[fail](%0) \n\t"
3521              : : "c"(vmx), "d"((unsigned long)HOST_RSP),
3522                [launched]"i"(offsetof(struct vcpu_vmx, launched)),
3523                [fail]"i"(offsetof(struct vcpu_vmx, fail)),
3524                [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
3525                [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
3526                [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
3527                [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
3528                [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
3529                [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
3530                [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
3531                [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
3532#ifdef CONFIG_X86_64
3533                [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
3534                [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
3535                [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
3536                [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
3537                [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
3538                [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
3539                [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
3540                [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
3541#endif
3542                [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
3543              : "cc", "memory"
3544                , R"bx", R"di", R"si"
3545#ifdef CONFIG_X86_64
3546                , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
3547#endif
3548              );
3549
3550        vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
3551        vcpu->arch.regs_dirty = 0;
3552
3553        get_debugreg(vcpu->arch.dr6, 6);
3554
3555        vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
3556        if (vmx->rmode.irq.pending)
3557                fixup_rmode_irq(vmx);
3558
3559        asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
3560        vmx->launched = 1;
3561
3562        vmx_complete_interrupts(vmx);
3563}
3564
3565#undef R
3566#undef Q
3567
3568static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
3569{
3570        struct vcpu_vmx *vmx = to_vmx(vcpu);
3571
3572        if (vmx->vmcs) {
3573                vcpu_clear(vmx);
3574                free_vmcs(vmx->vmcs);
3575                vmx->vmcs = NULL;
3576        }
3577}
3578
3579static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
3580{
3581        struct vcpu_vmx *vmx = to_vmx(vcpu);
3582
3583        spin_lock(&vmx_vpid_lock);
3584        if (vmx->vpid != 0)
3585                __clear_bit(vmx->vpid, vmx_vpid_bitmap);
3586        spin_unlock(&vmx_vpid_lock);
3587        vmx_free_vmcs(vcpu);
3588        kfree(vmx->host_msrs);
3589        kfree(vmx->guest_msrs);
3590        kvm_vcpu_uninit(vcpu);
3591        kmem_cache_free(kvm_vcpu_cache, vmx);
3592}
3593
3594static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
3595{
3596        int err;
3597        struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
3598        int cpu;
3599
3600        if (!vmx)
3601                return ERR_PTR(-ENOMEM);
3602
3603        allocate_vpid(vmx);
3604
3605        err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
3606        if (err)
3607                goto free_vcpu;
3608
3609        vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
3610        if (!vmx->guest_msrs) {
3611                err = -ENOMEM;
3612                goto uninit_vcpu;
3613        }
3614
3615        vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
3616        if (!vmx->host_msrs)
3617                goto free_guest_msrs;
3618
3619        vmx->vmcs = alloc_vmcs();
3620        if (!vmx->vmcs)
3621                goto free_msrs;
3622
3623        vmcs_clear(vmx->vmcs);
3624
3625        cpu = get_cpu();
3626        vmx_vcpu_load(&vmx->vcpu, cpu);
3627        err = vmx_vcpu_setup(vmx);
3628        vmx_vcpu_put(&vmx->vcpu);
3629        put_cpu();
3630        if (err)
3631                goto free_vmcs;
3632        if (vm_need_virtualize_apic_accesses(kvm))
3633                if (alloc_apic_access_page(kvm) != 0)
3634                        goto free_vmcs;
3635
3636        if (enable_ept)
3637                if (alloc_identity_pagetable(kvm) != 0)
3638                        goto free_vmcs;
3639
3640        return &vmx->vcpu;
3641
3642free_vmcs:
3643        free_vmcs(vmx->vmcs);
3644free_msrs:
3645        kfree(vmx->host_msrs);
3646free_guest_msrs:
3647        kfree(vmx->guest_msrs);
3648uninit_vcpu:
3649        kvm_vcpu_uninit(&vmx->vcpu);
3650free_vcpu:
3651        kmem_cache_free(kvm_vcpu_cache, vmx);
3652        return ERR_PTR(err);
3653}
3654
3655static void __init vmx_check_processor_compat(void *rtn)
3656{
3657        struct vmcs_config vmcs_conf;
3658
3659        *(int *)rtn = 0;
3660        if (setup_vmcs_config(&vmcs_conf) < 0)
3661                *(int *)rtn = -EIO;
3662        if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
3663                printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
3664                                smp_processor_id());
3665                *(int *)rtn = -EIO;
3666        }
3667}
3668
3669static int get_ept_level(void)
3670{
3671        return VMX_EPT_DEFAULT_GAW + 1;
3672}
3673
3674static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
3675{
3676        u64 ret;
3677
3678        /* For VT-d and EPT combination
3679         * 1. MMIO: always map as UC
3680         * 2. EPT with VT-d:
3681         *   a. VT-d without snooping control feature: can't guarantee the
3682         *      result, try to trust guest.
3683         *   b. VT-d with snooping control feature: snooping control feature of
3684         *      VT-d engine can guarantee the cache correctness. Just set it
3685         *      to WB to keep consistent with host. So the same as item 3.
3686         * 3. EPT without VT-d: always map as WB and set IGMT=1 to keep
3687         *    consistent with host MTRR
3688         */
3689        if (is_mmio)
3690                ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
3691        else if (vcpu->kvm->arch.iommu_domain &&
3692                !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY))
3693                ret = kvm_get_guest_memory_type(vcpu, gfn) <<
3694                      VMX_EPT_MT_EPTE_SHIFT;
3695        else
3696                ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT)
3697                        | VMX_EPT_IGMT_BIT;
3698
3699        return ret;
3700}
3701
3702static struct kvm_x86_ops vmx_x86_ops = {
3703        .cpu_has_kvm_support = cpu_has_kvm_support,
3704        .disabled_by_bios = vmx_disabled_by_bios,
3705        .hardware_setup = hardware_setup,
3706        .hardware_unsetup = hardware_unsetup,
3707        .check_processor_compatibility = vmx_check_processor_compat,
3708        .hardware_enable = hardware_enable,
3709        .hardware_disable = hardware_disable,
3710        .cpu_has_accelerated_tpr = report_flexpriority,
3711
3712        .vcpu_create = vmx_create_vcpu,
3713        .vcpu_free = vmx_free_vcpu,
3714        .vcpu_reset = vmx_vcpu_reset,
3715
3716        .prepare_guest_switch = vmx_save_host_state,
3717        .vcpu_load = vmx_vcpu_load,
3718        .vcpu_put = vmx_vcpu_put,
3719
3720        .set_guest_debug = set_guest_debug,
3721        .get_msr = vmx_get_msr,
3722        .set_msr = vmx_set_msr,
3723        .get_segment_base = vmx_get_segment_base,
3724        .get_segment = vmx_get_segment,
3725        .set_segment = vmx_set_segment,
3726        .get_cpl = vmx_get_cpl,
3727        .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
3728        .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
3729        .set_cr0 = vmx_set_cr0,
3730        .set_cr3 = vmx_set_cr3,
3731        .set_cr4 = vmx_set_cr4,
3732        .set_efer = vmx_set_efer,
3733        .get_idt = vmx_get_idt,
3734        .set_idt = vmx_set_idt,
3735        .get_gdt = vmx_get_gdt,
3736        .set_gdt = vmx_set_gdt,
3737        .cache_reg = vmx_cache_reg,
3738        .get_rflags = vmx_get_rflags,
3739        .set_rflags = vmx_set_rflags,
3740
3741        .tlb_flush = vmx_flush_tlb,
3742
3743        .run = vmx_vcpu_run,
3744        .handle_exit = vmx_handle_exit,
3745        .skip_emulated_instruction = skip_emulated_instruction,
3746        .set_interrupt_shadow = vmx_set_interrupt_shadow,
3747        .get_interrupt_shadow = vmx_get_interrupt_shadow,
3748        .patch_hypercall = vmx_patch_hypercall,
3749        .set_irq = vmx_inject_irq,
3750        .set_nmi = vmx_inject_nmi,
3751        .queue_exception = vmx_queue_exception,
3752        .interrupt_allowed = vmx_interrupt_allowed,
3753        .nmi_allowed = vmx_nmi_allowed,
3754        .enable_nmi_window = enable_nmi_window,
3755        .enable_irq_window = enable_irq_window,
3756        .update_cr8_intercept = update_cr8_intercept,
3757
3758        .set_tss_addr = vmx_set_tss_addr,
3759        .get_tdp_level = get_ept_level,
3760        .get_mt_mask = vmx_get_mt_mask,
3761};
3762
3763static int __init vmx_init(void)
3764{
3765        int r;
3766
3767        vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
3768        if (!vmx_io_bitmap_a)
3769                return -ENOMEM;
3770
3771        vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
3772        if (!vmx_io_bitmap_b) {
3773                r = -ENOMEM;
3774                goto out;
3775        }
3776
3777        vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
3778        if (!vmx_msr_bitmap_legacy) {
3779                r = -ENOMEM;
3780                goto out1;
3781        }
3782
3783        vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
3784        if (!vmx_msr_bitmap_longmode) {
3785                r = -ENOMEM;
3786                goto out2;
3787        }
3788
3789        /*
3790         * Allow direct access to the PC debug port (it is often used for I/O
3791         * delays, but the vmexits simply slow things down).
3792         */
3793        memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
3794        clear_bit(0x80, vmx_io_bitmap_a);
3795
3796        memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
3797
3798        memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
3799        memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
3800
3801        set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
3802
3803        r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
3804        if (r)
3805                goto out3;
3806
3807        vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
3808        vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
3809        vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
3810        vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
3811        vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
3812        vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
3813
3814        if (enable_ept) {
3815                bypass_guest_pf = 0;
3816                kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
3817                        VMX_EPT_WRITABLE_MASK);
3818                kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
3819                                VMX_EPT_EXECUTABLE_MASK);
3820                kvm_enable_tdp();
3821        } else
3822                kvm_disable_tdp();
3823
3824        if (bypass_guest_pf)
3825                kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
3826
3827        ept_sync_global();
3828
3829        return 0;
3830
3831out3:
3832        free_page((unsigned long)vmx_msr_bitmap_longmode);
3833out2:
3834        free_page((unsigned long)vmx_msr_bitmap_legacy);
3835out1:
3836        free_page((unsigned long)vmx_io_bitmap_b);
3837out:
3838        free_page((unsigned long)vmx_io_bitmap_a);
3839        return r;
3840}
3841
3842static void __exit vmx_exit(void)
3843{
3844        free_page((unsigned long)vmx_msr_bitmap_legacy);
3845        free_page((unsigned long)vmx_msr_bitmap_longmode);
3846        free_page((unsigned long)vmx_io_bitmap_b);
3847        free_page((unsigned long)vmx_io_bitmap_a);
3848
3849        kvm_exit();
3850}
3851
3852module_init(vmx_init)
3853module_exit(vmx_exit)
3854
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.