linux/arch/x86/kvm/vmx.c
<<
>>
Prefs
   1/*
   2 * Kernel-based Virtual Machine driver for Linux
   3 *
   4 * This module enables machines with Intel VT-x extensions to run virtual
   5 * machines without emulation or binary translation.
   6 *
   7 * Copyright (C) 2006 Qumranet, Inc.
   8 *
   9 * Authors:
  10 *   Avi Kivity   <avi@qumranet.com>
  11 *   Yaniv Kamay  <yaniv@qumranet.com>
  12 *
  13 * This work is licensed under the terms of the GNU GPL, version 2.  See
  14 * the COPYING file in the top-level directory.
  15 *
  16 */
  17
  18#include "irq.h"
  19#include "vmx.h"
  20#include "mmu.h"
  21
  22#include <linux/kvm_host.h>
  23#include <linux/module.h>
  24#include <linux/kernel.h>
  25#include <linux/mm.h>
  26#include <linux/highmem.h>
  27#include <linux/sched.h>
  28#include <linux/moduleparam.h>
  29#include "kvm_cache_regs.h"
  30#include "x86.h"
  31
  32#include <asm/io.h>
  33#include <asm/desc.h>
  34
  35#define __ex(x) __kvm_handle_fault_on_reboot(x)
  36
  37MODULE_AUTHOR("Qumranet");
  38MODULE_LICENSE("GPL");
  39
  40static int bypass_guest_pf = 1;
  41module_param(bypass_guest_pf, bool, 0);
  42
  43static int enable_vpid = 1;
  44module_param(enable_vpid, bool, 0);
  45
  46static int flexpriority_enabled = 1;
  47module_param(flexpriority_enabled, bool, 0);
  48
  49static int enable_ept = 1;
  50module_param(enable_ept, bool, 0);
  51
  52static int emulate_invalid_guest_state = 0;
  53module_param(emulate_invalid_guest_state, bool, 0);
  54
  55struct vmcs {
  56        u32 revision_id;
  57        u32 abort;
  58        char data[0];
  59};
  60
  61struct vcpu_vmx {
  62        struct kvm_vcpu       vcpu;
  63        struct list_head      local_vcpus_link;
  64        unsigned long         host_rsp;
  65        int                   launched;
  66        u8                    fail;
  67        u32                   idt_vectoring_info;
  68        struct kvm_msr_entry *guest_msrs;
  69        struct kvm_msr_entry *host_msrs;
  70        int                   nmsrs;
  71        int                   save_nmsrs;
  72        int                   msr_offset_efer;
  73#ifdef CONFIG_X86_64
  74        int                   msr_offset_kernel_gs_base;
  75#endif
  76        struct vmcs          *vmcs;
  77        struct {
  78                int           loaded;
  79                u16           fs_sel, gs_sel, ldt_sel;
  80                int           gs_ldt_reload_needed;
  81                int           fs_reload_needed;
  82                int           guest_efer_loaded;
  83        } host_state;
  84        struct {
  85                struct {
  86                        bool pending;
  87                        u8 vector;
  88                        unsigned rip;
  89                } irq;
  90        } rmode;
  91        int vpid;
  92        bool emulation_required;
  93};
  94
  95static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
  96{
  97        return container_of(vcpu, struct vcpu_vmx, vcpu);
  98}
  99
 100static int init_rmode(struct kvm *kvm);
 101static u64 construct_eptp(unsigned long root_hpa);
 102
 103static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 104static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
 105static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
 106
 107static struct page *vmx_io_bitmap_a;
 108static struct page *vmx_io_bitmap_b;
 109static struct page *vmx_msr_bitmap;
 110
 111static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
 112static DEFINE_SPINLOCK(vmx_vpid_lock);
 113
 114static struct vmcs_config {
 115        int size;
 116        int order;
 117        u32 revision_id;
 118        u32 pin_based_exec_ctrl;
 119        u32 cpu_based_exec_ctrl;
 120        u32 cpu_based_2nd_exec_ctrl;
 121        u32 vmexit_ctrl;
 122        u32 vmentry_ctrl;
 123} vmcs_config;
 124
 125struct vmx_capability {
 126        u32 ept;
 127        u32 vpid;
 128} vmx_capability;
 129
 130#define VMX_SEGMENT_FIELD(seg)                                  \
 131        [VCPU_SREG_##seg] = {                                   \
 132                .selector = GUEST_##seg##_SELECTOR,             \
 133                .base = GUEST_##seg##_BASE,                     \
 134                .limit = GUEST_##seg##_LIMIT,                   \
 135                .ar_bytes = GUEST_##seg##_AR_BYTES,             \
 136        }
 137
 138static struct kvm_vmx_segment_field {
 139        unsigned selector;
 140        unsigned base;
 141        unsigned limit;
 142        unsigned ar_bytes;
 143} kvm_vmx_segment_fields[] = {
 144        VMX_SEGMENT_FIELD(CS),
 145        VMX_SEGMENT_FIELD(DS),
 146        VMX_SEGMENT_FIELD(ES),
 147        VMX_SEGMENT_FIELD(FS),
 148        VMX_SEGMENT_FIELD(GS),
 149        VMX_SEGMENT_FIELD(SS),
 150        VMX_SEGMENT_FIELD(TR),
 151        VMX_SEGMENT_FIELD(LDTR),
 152};
 153
 154/*
 155 * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
 156 * away by decrementing the array size.
 157 */
 158static const u32 vmx_msr_index[] = {
 159#ifdef CONFIG_X86_64
 160        MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
 161#endif
 162        MSR_EFER, MSR_K6_STAR,
 163};
 164#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
 165
 166static void load_msrs(struct kvm_msr_entry *e, int n)
 167{
 168        int i;
 169
 170        for (i = 0; i < n; ++i)
 171                wrmsrl(e[i].index, e[i].data);
 172}
 173
 174static void save_msrs(struct kvm_msr_entry *e, int n)
 175{
 176        int i;
 177
 178        for (i = 0; i < n; ++i)
 179                rdmsrl(e[i].index, e[i].data);
 180}
 181
 182static inline int is_page_fault(u32 intr_info)
 183{
 184        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
 185                             INTR_INFO_VALID_MASK)) ==
 186                (INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
 187}
 188
 189static inline int is_no_device(u32 intr_info)
 190{
 191        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
 192                             INTR_INFO_VALID_MASK)) ==
 193                (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
 194}
 195
 196static inline int is_invalid_opcode(u32 intr_info)
 197{
 198        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
 199                             INTR_INFO_VALID_MASK)) ==
 200                (INTR_TYPE_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
 201}
 202
 203static inline int is_external_interrupt(u32 intr_info)
 204{
 205        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
 206                == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
 207}
 208
 209static inline int cpu_has_vmx_msr_bitmap(void)
 210{
 211        return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS);
 212}
 213
 214static inline int cpu_has_vmx_tpr_shadow(void)
 215{
 216        return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW);
 217}
 218
 219static inline int vm_need_tpr_shadow(struct kvm *kvm)
 220{
 221        return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)));
 222}
 223
 224static inline int cpu_has_secondary_exec_ctrls(void)
 225{
 226        return (vmcs_config.cpu_based_exec_ctrl &
 227                CPU_BASED_ACTIVATE_SECONDARY_CONTROLS);
 228}
 229
 230static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
 231{
 232        return flexpriority_enabled
 233                && (vmcs_config.cpu_based_2nd_exec_ctrl &
 234                    SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
 235}
 236
 237static inline int cpu_has_vmx_invept_individual_addr(void)
 238{
 239        return (!!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT));
 240}
 241
 242static inline int cpu_has_vmx_invept_context(void)
 243{
 244        return (!!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT));
 245}
 246
 247static inline int cpu_has_vmx_invept_global(void)
 248{
 249        return (!!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT));
 250}
 251
 252static inline int cpu_has_vmx_ept(void)
 253{
 254        return (vmcs_config.cpu_based_2nd_exec_ctrl &
 255                SECONDARY_EXEC_ENABLE_EPT);
 256}
 257
 258static inline int vm_need_ept(void)
 259{
 260        return (cpu_has_vmx_ept() && enable_ept);
 261}
 262
 263static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
 264{
 265        return ((cpu_has_vmx_virtualize_apic_accesses()) &&
 266                (irqchip_in_kernel(kvm)));
 267}
 268
 269static inline int cpu_has_vmx_vpid(void)
 270{
 271        return (vmcs_config.cpu_based_2nd_exec_ctrl &
 272                SECONDARY_EXEC_ENABLE_VPID);
 273}
 274
 275static inline int cpu_has_virtual_nmis(void)
 276{
 277        return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
 278}
 279
 280static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
 281{
 282        int i;
 283
 284        for (i = 0; i < vmx->nmsrs; ++i)
 285                if (vmx->guest_msrs[i].index == msr)
 286                        return i;
 287        return -1;
 288}
 289
 290static inline void __invvpid(int ext, u16 vpid, gva_t gva)
 291{
 292    struct {
 293        u64 vpid : 16;
 294        u64 rsvd : 48;
 295        u64 gva;
 296    } operand = { vpid, 0, gva };
 297
 298    asm volatile (__ex(ASM_VMX_INVVPID)
 299                  /* CF==1 or ZF==1 --> rc = -1 */
 300                  "; ja 1f ; ud2 ; 1:"
 301                  : : "a"(&operand), "c"(ext) : "cc", "memory");
 302}
 303
 304static inline void __invept(int ext, u64 eptp, gpa_t gpa)
 305{
 306        struct {
 307                u64 eptp, gpa;
 308        } operand = {eptp, gpa};
 309
 310        asm volatile (__ex(ASM_VMX_INVEPT)
 311                        /* CF==1 or ZF==1 --> rc = -1 */
 312                        "; ja 1f ; ud2 ; 1:\n"
 313                        : : "a" (&operand), "c" (ext) : "cc", "memory");
 314}
 315
 316static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
 317{
 318        int i;
 319
 320        i = __find_msr_index(vmx, msr);
 321        if (i >= 0)
 322                return &vmx->guest_msrs[i];
 323        return NULL;
 324}
 325
 326static void vmcs_clear(struct vmcs *vmcs)
 327{
 328        u64 phys_addr = __pa(vmcs);
 329        u8 error;
 330
 331        asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
 332                      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
 333                      : "cc", "memory");
 334        if (error)
 335                printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
 336                       vmcs, phys_addr);
 337}
 338
 339static void __vcpu_clear(void *arg)
 340{
 341        struct vcpu_vmx *vmx = arg;
 342        int cpu = raw_smp_processor_id();
 343
 344        if (vmx->vcpu.cpu == cpu)
 345                vmcs_clear(vmx->vmcs);
 346        if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
 347                per_cpu(current_vmcs, cpu) = NULL;
 348        rdtscll(vmx->vcpu.arch.host_tsc);
 349        list_del(&vmx->local_vcpus_link);
 350        vmx->vcpu.cpu = -1;
 351        vmx->launched = 0;
 352}
 353
 354static void vcpu_clear(struct vcpu_vmx *vmx)
 355{
 356        if (vmx->vcpu.cpu == -1)
 357                return;
 358        smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
 359}
 360
 361static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx)
 362{
 363        if (vmx->vpid == 0)
 364                return;
 365
 366        __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
 367}
 368
 369static inline void ept_sync_global(void)
 370{
 371        if (cpu_has_vmx_invept_global())
 372                __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
 373}
 374
 375static inline void ept_sync_context(u64 eptp)
 376{
 377        if (vm_need_ept()) {
 378                if (cpu_has_vmx_invept_context())
 379                        __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
 380                else
 381                        ept_sync_global();
 382        }
 383}
 384
 385static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
 386{
 387        if (vm_need_ept()) {
 388                if (cpu_has_vmx_invept_individual_addr())
 389                        __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
 390                                        eptp, gpa);
 391                else
 392                        ept_sync_context(eptp);
 393        }
 394}
 395
 396static unsigned long vmcs_readl(unsigned long field)
 397{
 398        unsigned long value;
 399
 400        asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX)
 401                      : "=a"(value) : "d"(field) : "cc");
 402        return value;
 403}
 404
 405static u16 vmcs_read16(unsigned long field)
 406{
 407        return vmcs_readl(field);
 408}
 409
 410static u32 vmcs_read32(unsigned long field)
 411{
 412        return vmcs_readl(field);
 413}
 414
 415static u64 vmcs_read64(unsigned long field)
 416{
 417#ifdef CONFIG_X86_64
 418        return vmcs_readl(field);
 419#else
 420        return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
 421#endif
 422}
 423
 424static noinline void vmwrite_error(unsigned long field, unsigned long value)
 425{
 426        printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
 427               field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
 428        dump_stack();
 429}
 430
 431static void vmcs_writel(unsigned long field, unsigned long value)
 432{
 433        u8 error;
 434
 435        asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0"
 436                       : "=q"(error) : "a"(value), "d"(field) : "cc");
 437        if (unlikely(error))
 438                vmwrite_error(field, value);
 439}
 440
 441static void vmcs_write16(unsigned long field, u16 value)
 442{
 443        vmcs_writel(field, value);
 444}
 445
 446static void vmcs_write32(unsigned long field, u32 value)
 447{
 448        vmcs_writel(field, value);
 449}
 450
 451static void vmcs_write64(unsigned long field, u64 value)
 452{
 453        vmcs_writel(field, value);
 454#ifndef CONFIG_X86_64
 455        asm volatile ("");
 456        vmcs_writel(field+1, value >> 32);
 457#endif
 458}
 459
 460static void vmcs_clear_bits(unsigned long field, u32 mask)
 461{
 462        vmcs_writel(field, vmcs_readl(field) & ~mask);
 463}
 464
 465static void vmcs_set_bits(unsigned long field, u32 mask)
 466{
 467        vmcs_writel(field, vmcs_readl(field) | mask);
 468}
 469
 470static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 471{
 472        u32 eb;
 473
 474        eb = (1u << PF_VECTOR) | (1u << UD_VECTOR);
 475        if (!vcpu->fpu_active)
 476                eb |= 1u << NM_VECTOR;
 477        if (vcpu->guest_debug.enabled)
 478                eb |= 1u << DB_VECTOR;
 479        if (vcpu->arch.rmode.active)
 480                eb = ~0;
 481        if (vm_need_ept())
 482                eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
 483        vmcs_write32(EXCEPTION_BITMAP, eb);
 484}
 485
 486static void reload_tss(void)
 487{
 488        /*
 489         * VT restores TR but not its size.  Useless.
 490         */
 491        struct descriptor_table gdt;
 492        struct desc_struct *descs;
 493
 494        kvm_get_gdt(&gdt);
 495        descs = (void *)gdt.base;
 496        descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
 497        load_TR_desc();
 498}
 499
 500static void load_transition_efer(struct vcpu_vmx *vmx)
 501{
 502        int efer_offset = vmx->msr_offset_efer;
 503        u64 host_efer = vmx->host_msrs[efer_offset].data;
 504        u64 guest_efer = vmx->guest_msrs[efer_offset].data;
 505        u64 ignore_bits;
 506
 507        if (efer_offset < 0)
 508                return;
 509        /*
 510         * NX is emulated; LMA and LME handled by hardware; SCE meaninless
 511         * outside long mode
 512         */
 513        ignore_bits = EFER_NX | EFER_SCE;
 514#ifdef CONFIG_X86_64
 515        ignore_bits |= EFER_LMA | EFER_LME;
 516        /* SCE is meaningful only in long mode on Intel */
 517        if (guest_efer & EFER_LMA)
 518                ignore_bits &= ~(u64)EFER_SCE;
 519#endif
 520        if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits))
 521                return;
 522
 523        vmx->host_state.guest_efer_loaded = 1;
 524        guest_efer &= ~ignore_bits;
 525        guest_efer |= host_efer & ignore_bits;
 526        wrmsrl(MSR_EFER, guest_efer);
 527        vmx->vcpu.stat.efer_reload++;
 528}
 529
 530static void reload_host_efer(struct vcpu_vmx *vmx)
 531{
 532        if (vmx->host_state.guest_efer_loaded) {
 533                vmx->host_state.guest_efer_loaded = 0;
 534                load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
 535        }
 536}
 537
 538static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 539{
 540        struct vcpu_vmx *vmx = to_vmx(vcpu);
 541
 542        if (vmx->host_state.loaded)
 543                return;
 544
 545        vmx->host_state.loaded = 1;
 546        /*
 547         * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
 548         * allow segment selectors with cpl > 0 or ti == 1.
 549         */
 550        vmx->host_state.ldt_sel = kvm_read_ldt();
 551        vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
 552        vmx->host_state.fs_sel = kvm_read_fs();
 553        if (!(vmx->host_state.fs_sel & 7)) {
 554                vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
 555                vmx->host_state.fs_reload_needed = 0;
 556        } else {
 557                vmcs_write16(HOST_FS_SELECTOR, 0);
 558                vmx->host_state.fs_reload_needed = 1;
 559        }
 560        vmx->host_state.gs_sel = kvm_read_gs();
 561        if (!(vmx->host_state.gs_sel & 7))
 562                vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
 563        else {
 564                vmcs_write16(HOST_GS_SELECTOR, 0);
 565                vmx->host_state.gs_ldt_reload_needed = 1;
 566        }
 567
 568#ifdef CONFIG_X86_64
 569        vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
 570        vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
 571#else
 572        vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
 573        vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
 574#endif
 575
 576#ifdef CONFIG_X86_64
 577        if (is_long_mode(&vmx->vcpu))
 578                save_msrs(vmx->host_msrs +
 579                          vmx->msr_offset_kernel_gs_base, 1);
 580
 581#endif
 582        load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
 583        load_transition_efer(vmx);
 584}
 585
 586static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 587{
 588        unsigned long flags;
 589
 590        if (!vmx->host_state.loaded)
 591                return;
 592
 593        ++vmx->vcpu.stat.host_state_reload;
 594        vmx->host_state.loaded = 0;
 595        if (vmx->host_state.fs_reload_needed)
 596                kvm_load_fs(vmx->host_state.fs_sel);
 597        if (vmx->host_state.gs_ldt_reload_needed) {
 598                kvm_load_ldt(vmx->host_state.ldt_sel);
 599                /*
 600                 * If we have to reload gs, we must take care to
 601                 * preserve our gs base.
 602                 */
 603                local_irq_save(flags);
 604                kvm_load_gs(vmx->host_state.gs_sel);
 605#ifdef CONFIG_X86_64
 606                wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
 607#endif
 608                local_irq_restore(flags);
 609        }
 610        reload_tss();
 611        save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
 612        load_msrs(vmx->host_msrs, vmx->save_nmsrs);
 613        reload_host_efer(vmx);
 614}
 615
 616static void vmx_load_host_state(struct vcpu_vmx *vmx)
 617{
 618        preempt_disable();
 619        __vmx_load_host_state(vmx);
 620        preempt_enable();
 621}
 622
 623/*
 624 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
 625 * vcpu mutex is already taken.
 626 */
 627static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 628{
 629        struct vcpu_vmx *vmx = to_vmx(vcpu);
 630        u64 phys_addr = __pa(vmx->vmcs);
 631        u64 tsc_this, delta, new_offset;
 632
 633        if (vcpu->cpu != cpu) {
 634                vcpu_clear(vmx);
 635                kvm_migrate_timers(vcpu);
 636                vpid_sync_vcpu_all(vmx);
 637                local_irq_disable();
 638                list_add(&vmx->local_vcpus_link,
 639                         &per_cpu(vcpus_on_cpu, cpu));
 640                local_irq_enable();
 641        }
 642
 643        if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
 644                u8 error;
 645
 646                per_cpu(current_vmcs, cpu) = vmx->vmcs;
 647                asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
 648                              : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
 649                              : "cc");
 650                if (error)
 651                        printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
 652                               vmx->vmcs, phys_addr);
 653        }
 654
 655        if (vcpu->cpu != cpu) {
 656                struct descriptor_table dt;
 657                unsigned long sysenter_esp;
 658
 659                vcpu->cpu = cpu;
 660                /*
 661                 * Linux uses per-cpu TSS and GDT, so set these when switching
 662                 * processors.
 663                 */
 664                vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
 665                kvm_get_gdt(&dt);
 666                vmcs_writel(HOST_GDTR_BASE, dt.base);   /* 22.2.4 */
 667
 668                rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
 669                vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
 670
 671                /*
 672                 * Make sure the time stamp counter is monotonous.
 673                 */
 674                rdtscll(tsc_this);
 675                if (tsc_this < vcpu->arch.host_tsc) {
 676                        delta = vcpu->arch.host_tsc - tsc_this;
 677                        new_offset = vmcs_read64(TSC_OFFSET) + delta;
 678                        vmcs_write64(TSC_OFFSET, new_offset);
 679                }
 680        }
 681}
 682
 683static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
 684{
 685        __vmx_load_host_state(to_vmx(vcpu));
 686}
 687
 688static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
 689{
 690        if (vcpu->fpu_active)
 691                return;
 692        vcpu->fpu_active = 1;
 693        vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
 694        if (vcpu->arch.cr0 & X86_CR0_TS)
 695                vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
 696        update_exception_bitmap(vcpu);
 697}
 698
 699static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
 700{
 701        if (!vcpu->fpu_active)
 702                return;
 703        vcpu->fpu_active = 0;
 704        vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
 705        update_exception_bitmap(vcpu);
 706}
 707
 708static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
 709{
 710        return vmcs_readl(GUEST_RFLAGS);
 711}
 712
 713static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 714{
 715        if (vcpu->arch.rmode.active)
 716                rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
 717        vmcs_writel(GUEST_RFLAGS, rflags);
 718}
 719
 720static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 721{
 722        unsigned long rip;
 723        u32 interruptibility;
 724
 725        rip = kvm_rip_read(vcpu);
 726        rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
 727        kvm_rip_write(vcpu, rip);
 728
 729        /*
 730         * We emulated an instruction, so temporary interrupt blocking
 731         * should be removed, if set.
 732         */
 733        interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
 734        if (interruptibility & 3)
 735                vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
 736                             interruptibility & ~3);
 737        vcpu->arch.interrupt_window_open = 1;
 738}
 739
 740static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 741                                bool has_error_code, u32 error_code)
 742{
 743        struct vcpu_vmx *vmx = to_vmx(vcpu);
 744
 745        if (has_error_code)
 746                vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
 747
 748        if (vcpu->arch.rmode.active) {
 749                vmx->rmode.irq.pending = true;
 750                vmx->rmode.irq.vector = nr;
 751                vmx->rmode.irq.rip = kvm_rip_read(vcpu);
 752                if (nr == BP_VECTOR)
 753                        vmx->rmode.irq.rip++;
 754                vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
 755                             nr | INTR_TYPE_SOFT_INTR
 756                             | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
 757                             | INTR_INFO_VALID_MASK);
 758                vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
 759                kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
 760                return;
 761        }
 762
 763        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
 764                     nr | INTR_TYPE_EXCEPTION
 765                     | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
 766                     | INTR_INFO_VALID_MASK);
 767}
 768
 769static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
 770{
 771        return false;
 772}
 773
 774/*
 775 * Swap MSR entry in host/guest MSR entry array.
 776 */
 777#ifdef CONFIG_X86_64
 778static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
 779{
 780        struct kvm_msr_entry tmp;
 781
 782        tmp = vmx->guest_msrs[to];
 783        vmx->guest_msrs[to] = vmx->guest_msrs[from];
 784        vmx->guest_msrs[from] = tmp;
 785        tmp = vmx->host_msrs[to];
 786        vmx->host_msrs[to] = vmx->host_msrs[from];
 787        vmx->host_msrs[from] = tmp;
 788}
 789#endif
 790
 791/*
 792 * Set up the vmcs to automatically save and restore system
 793 * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
 794 * mode, as fiddling with msrs is very expensive.
 795 */
 796static void setup_msrs(struct vcpu_vmx *vmx)
 797{
 798        int save_nmsrs;
 799
 800        vmx_load_host_state(vmx);
 801        save_nmsrs = 0;
 802#ifdef CONFIG_X86_64
 803        if (is_long_mode(&vmx->vcpu)) {
 804                int index;
 805
 806                index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
 807                if (index >= 0)
 808                        move_msr_up(vmx, index, save_nmsrs++);
 809                index = __find_msr_index(vmx, MSR_LSTAR);
 810                if (index >= 0)
 811                        move_msr_up(vmx, index, save_nmsrs++);
 812                index = __find_msr_index(vmx, MSR_CSTAR);
 813                if (index >= 0)
 814                        move_msr_up(vmx, index, save_nmsrs++);
 815                index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
 816                if (index >= 0)
 817                        move_msr_up(vmx, index, save_nmsrs++);
 818                /*
 819                 * MSR_K6_STAR is only needed on long mode guests, and only
 820                 * if efer.sce is enabled.
 821                 */
 822                index = __find_msr_index(vmx, MSR_K6_STAR);
 823                if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE))
 824                        move_msr_up(vmx, index, save_nmsrs++);
 825        }
 826#endif
 827        vmx->save_nmsrs = save_nmsrs;
 828
 829#ifdef CONFIG_X86_64
 830        vmx->msr_offset_kernel_gs_base =
 831                __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
 832#endif
 833        vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
 834}
 835
 836/*
 837 * reads and returns guest's timestamp counter "register"
 838 * guest_tsc = host_tsc + tsc_offset    -- 21.3
 839 */
 840static u64 guest_read_tsc(void)
 841{
 842        u64 host_tsc, tsc_offset;
 843
 844        rdtscll(host_tsc);
 845        tsc_offset = vmcs_read64(TSC_OFFSET);
 846        return host_tsc + tsc_offset;
 847}
 848
 849/*
 850 * writes 'guest_tsc' into guest's timestamp counter "register"
 851 * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc
 852 */
 853static void guest_write_tsc(u64 guest_tsc)
 854{
 855        u64 host_tsc;
 856
 857        rdtscll(host_tsc);
 858        vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc);
 859}
 860
 861/*
 862 * Reads an msr value (of 'msr_index') into 'pdata'.
 863 * Returns 0 on success, non-0 otherwise.
 864 * Assumes vcpu_load() was already called.
 865 */
 866static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 867{
 868        u64 data;
 869        struct kvm_msr_entry *msr;
 870
 871        if (!pdata) {
 872                printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
 873                return -EINVAL;
 874        }
 875
 876        switch (msr_index) {
 877#ifdef CONFIG_X86_64
 878        case MSR_FS_BASE:
 879                data = vmcs_readl(GUEST_FS_BASE);
 880                break;
 881        case MSR_GS_BASE:
 882                data = vmcs_readl(GUEST_GS_BASE);
 883                break;
 884        case MSR_EFER:
 885                return kvm_get_msr_common(vcpu, msr_index, pdata);
 886#endif
 887        case MSR_IA32_TIME_STAMP_COUNTER:
 888                data = guest_read_tsc();
 889                break;
 890        case MSR_IA32_SYSENTER_CS:
 891                data = vmcs_read32(GUEST_SYSENTER_CS);
 892                break;
 893        case MSR_IA32_SYSENTER_EIP:
 894                data = vmcs_readl(GUEST_SYSENTER_EIP);
 895                break;
 896        case MSR_IA32_SYSENTER_ESP:
 897                data = vmcs_readl(GUEST_SYSENTER_ESP);
 898                break;
 899        default:
 900                msr = find_msr_entry(to_vmx(vcpu), msr_index);
 901                if (msr) {
 902                        data = msr->data;
 903                        break;
 904                }
 905                return kvm_get_msr_common(vcpu, msr_index, pdata);
 906        }
 907
 908        *pdata = data;
 909        return 0;
 910}
 911
 912/*
 913 * Writes msr value into into the appropriate "register".
 914 * Returns 0 on success, non-0 otherwise.
 915 * Assumes vcpu_load() was already called.
 916 */
 917static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 918{
 919        struct vcpu_vmx *vmx = to_vmx(vcpu);
 920        struct kvm_msr_entry *msr;
 921        int ret = 0;
 922
 923        switch (msr_index) {
 924#ifdef CONFIG_X86_64
 925        case MSR_EFER:
 926                vmx_load_host_state(vmx);
 927                ret = kvm_set_msr_common(vcpu, msr_index, data);
 928                break;
 929        case MSR_FS_BASE:
 930                vmcs_writel(GUEST_FS_BASE, data);
 931                break;
 932        case MSR_GS_BASE:
 933                vmcs_writel(GUEST_GS_BASE, data);
 934                break;
 935#endif
 936        case MSR_IA32_SYSENTER_CS:
 937                vmcs_write32(GUEST_SYSENTER_CS, data);
 938                break;
 939        case MSR_IA32_SYSENTER_EIP:
 940                vmcs_writel(GUEST_SYSENTER_EIP, data);
 941                break;
 942        case MSR_IA32_SYSENTER_ESP:
 943                vmcs_writel(GUEST_SYSENTER_ESP, data);
 944                break;
 945        case MSR_IA32_TIME_STAMP_COUNTER:
 946                guest_write_tsc(data);
 947                break;
 948        case MSR_P6_PERFCTR0:
 949        case MSR_P6_PERFCTR1:
 950        case MSR_P6_EVNTSEL0:
 951        case MSR_P6_EVNTSEL1:
 952                /*
 953                 * Just discard all writes to the performance counters; this
 954                 * should keep both older linux and windows 64-bit guests
 955                 * happy
 956                 */
 957                pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data);
 958
 959                break;
 960        default:
 961                vmx_load_host_state(vmx);
 962                msr = find_msr_entry(vmx, msr_index);
 963                if (msr) {
 964                        msr->data = data;
 965                        break;
 966                }
 967                ret = kvm_set_msr_common(vcpu, msr_index, data);
 968        }
 969
 970        return ret;
 971}
 972
 973static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
 974{
 975        __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
 976        switch (reg) {
 977        case VCPU_REGS_RSP:
 978                vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
 979                break;
 980        case VCPU_REGS_RIP:
 981                vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
 982                break;
 983        default:
 984                break;
 985        }
 986}
 987
 988static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
 989{
 990        unsigned long dr7 = 0x400;
 991        int old_singlestep;
 992
 993        old_singlestep = vcpu->guest_debug.singlestep;
 994
 995        vcpu->guest_debug.enabled = dbg->enabled;
 996        if (vcpu->guest_debug.enabled) {
 997                int i;
 998
 999                dr7 |= 0x200;  /* exact */
1000                for (i = 0; i < 4; ++i) {
1001                        if (!dbg->breakpoints[i].enabled)
1002                                continue;
1003                        vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address;
1004                        dr7 |= 2 << (i*2);    /* global enable */
1005                        dr7 |= 0 << (i*4+16); /* execution breakpoint */
1006                }
1007
1008                vcpu->guest_debug.singlestep = dbg->singlestep;
1009        } else
1010                vcpu->guest_debug.singlestep = 0;
1011
1012        if (old_singlestep && !vcpu->guest_debug.singlestep) {
1013                unsigned long flags;
1014
1015                flags = vmcs_readl(GUEST_RFLAGS);
1016                flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1017                vmcs_writel(GUEST_RFLAGS, flags);
1018        }
1019
1020        update_exception_bitmap(vcpu);
1021        vmcs_writel(GUEST_DR7, dr7);
1022
1023        return 0;
1024}
1025
1026static int vmx_get_irq(struct kvm_vcpu *vcpu)
1027{
1028        if (!vcpu->arch.interrupt.pending)
1029                return -1;
1030        return vcpu->arch.interrupt.nr;
1031}
1032
1033static __init int cpu_has_kvm_support(void)
1034{
1035        unsigned long ecx = cpuid_ecx(1);
1036        return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
1037}
1038
1039static __init int vmx_disabled_by_bios(void)
1040{
1041        u64 msr;
1042
1043        rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
1044        return (msr & (FEATURE_CONTROL_LOCKED |
1045                       FEATURE_CONTROL_VMXON_ENABLED))
1046            == FEATURE_CONTROL_LOCKED;
1047        /* locked but not enabled */
1048}
1049
1050static void hardware_enable(void *garbage)
1051{
1052        int cpu = raw_smp_processor_id();
1053        u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
1054        u64 old;
1055
1056        INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
1057        rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
1058        if ((old & (FEATURE_CONTROL_LOCKED |
1059                    FEATURE_CONTROL_VMXON_ENABLED))
1060            != (FEATURE_CONTROL_LOCKED |
1061                FEATURE_CONTROL_VMXON_ENABLED))
1062                /* enable and lock */
1063                wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
1064                       FEATURE_CONTROL_LOCKED |
1065                       FEATURE_CONTROL_VMXON_ENABLED);
1066        write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
1067        asm volatile (ASM_VMX_VMXON_RAX
1068                      : : "a"(&phys_addr), "m"(phys_addr)
1069                      : "memory", "cc");
1070}
1071
1072static void vmclear_local_vcpus(void)
1073{
1074        int cpu = raw_smp_processor_id();
1075        struct vcpu_vmx *vmx, *n;
1076
1077        list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu),
1078                                 local_vcpus_link)
1079                __vcpu_clear(vmx);
1080}
1081
1082static void hardware_disable(void *garbage)
1083{
1084        vmclear_local_vcpus();
1085        asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
1086        write_cr4(read_cr4() & ~X86_CR4_VMXE);
1087}
1088
1089static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
1090                                      u32 msr, u32 *result)
1091{
1092        u32 vmx_msr_low, vmx_msr_high;
1093        u32 ctl = ctl_min | ctl_opt;
1094
1095        rdmsr(msr, vmx_msr_low, vmx_msr_high);
1096
1097        ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
1098        ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
1099
1100        /* Ensure minimum (required) set of control bits are supported. */
1101        if (ctl_min & ~ctl)
1102                return -EIO;
1103
1104        *result = ctl;
1105        return 0;
1106}
1107
1108static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1109{
1110        u32 vmx_msr_low, vmx_msr_high;
1111        u32 min, opt, min2, opt2;
1112        u32 _pin_based_exec_control = 0;
1113        u32 _cpu_based_exec_control = 0;
1114        u32 _cpu_based_2nd_exec_control = 0;
1115        u32 _vmexit_control = 0;
1116        u32 _vmentry_control = 0;
1117
1118        min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
1119        opt = PIN_BASED_VIRTUAL_NMIS;
1120        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
1121                                &_pin_based_exec_control) < 0)
1122                return -EIO;
1123
1124        min = CPU_BASED_HLT_EXITING |
1125#ifdef CONFIG_X86_64
1126              CPU_BASED_CR8_LOAD_EXITING |
1127              CPU_BASED_CR8_STORE_EXITING |
1128#endif
1129              CPU_BASED_CR3_LOAD_EXITING |
1130              CPU_BASED_CR3_STORE_EXITING |
1131              CPU_BASED_USE_IO_BITMAPS |
1132              CPU_BASED_MOV_DR_EXITING |
1133              CPU_BASED_USE_TSC_OFFSETING |
1134              CPU_BASED_INVLPG_EXITING;
1135        opt = CPU_BASED_TPR_SHADOW |
1136              CPU_BASED_USE_MSR_BITMAPS |
1137              CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
1138        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
1139                                &_cpu_based_exec_control) < 0)
1140                return -EIO;
1141#ifdef CONFIG_X86_64
1142        if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
1143                _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
1144                                           ~CPU_BASED_CR8_STORE_EXITING;
1145#endif
1146        if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
1147                min2 = 0;
1148                opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
1149                        SECONDARY_EXEC_WBINVD_EXITING |
1150                        SECONDARY_EXEC_ENABLE_VPID |
1151                        SECONDARY_EXEC_ENABLE_EPT;
1152                if (adjust_vmx_controls(min2, opt2,
1153                                        MSR_IA32_VMX_PROCBASED_CTLS2,
1154                                        &_cpu_based_2nd_exec_control) < 0)
1155                        return -EIO;
1156        }
1157#ifndef CONFIG_X86_64
1158        if (!(_cpu_based_2nd_exec_control &
1159                                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
1160                _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
1161#endif
1162        if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
1163                /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
1164                   enabled */
1165                min &= ~(CPU_BASED_CR3_LOAD_EXITING |
1166                         CPU_BASED_CR3_STORE_EXITING |
1167                         CPU_BASED_INVLPG_EXITING);
1168                if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
1169                                        &_cpu_based_exec_control) < 0)
1170                        return -EIO;
1171                rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
1172                      vmx_capability.ept, vmx_capability.vpid);
1173        }
1174
1175        min = 0;
1176#ifdef CONFIG_X86_64
1177        min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
1178#endif
1179        opt = 0;
1180        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
1181                                &_vmexit_control) < 0)
1182                return -EIO;
1183
1184        min = opt = 0;
1185        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
1186                                &_vmentry_control) < 0)
1187                return -EIO;
1188
1189        rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
1190
1191        /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
1192        if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
1193                return -EIO;
1194
1195#ifdef CONFIG_X86_64
1196        /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
1197        if (vmx_msr_high & (1u<<16))
1198                return -EIO;
1199#endif
1200
1201        /* Require Write-Back (WB) memory type for VMCS accesses. */
1202        if (((vmx_msr_high >> 18) & 15) != 6)
1203                return -EIO;
1204
1205        vmcs_conf->size = vmx_msr_high & 0x1fff;
1206        vmcs_conf->order = get_order(vmcs_config.size);
1207        vmcs_conf->revision_id = vmx_msr_low;
1208
1209        vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
1210        vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
1211        vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
1212        vmcs_conf->vmexit_ctrl         = _vmexit_control;
1213        vmcs_conf->vmentry_ctrl        = _vmentry_control;
1214
1215        return 0;
1216}
1217
1218static struct vmcs *alloc_vmcs_cpu(int cpu)
1219{
1220        int node = cpu_to_node(cpu);
1221        struct page *pages;
1222        struct vmcs *vmcs;
1223
1224        pages = alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
1225        if (!pages)
1226                return NULL;
1227        vmcs = page_address(pages);
1228        memset(vmcs, 0, vmcs_config.size);
1229        vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
1230        return vmcs;
1231}
1232
/* Allocate a VMCS region for the cpu this code is currently running on. */
static struct vmcs *alloc_vmcs(void)
{
        int cpu = raw_smp_processor_id();

        return alloc_vmcs_cpu(cpu);
}
1237
1238static void free_vmcs(struct vmcs *vmcs)
1239{
1240        free_pages((unsigned long)vmcs, vmcs_config.order);
1241}
1242
1243static void free_kvm_area(void)
1244{
1245        int cpu;
1246
1247        for_each_online_cpu(cpu)
1248                free_vmcs(per_cpu(vmxarea, cpu));
1249}
1250
1251static __init int alloc_kvm_area(void)
1252{
1253        int cpu;
1254
1255        for_each_online_cpu(cpu) {
1256                struct vmcs *vmcs;
1257
1258                vmcs = alloc_vmcs_cpu(cpu);
1259                if (!vmcs) {
1260                        free_kvm_area();
1261                        return -ENOMEM;
1262                }
1263
1264                per_cpu(vmxarea, cpu) = vmcs;
1265        }
1266        return 0;
1267}
1268
1269static __init int hardware_setup(void)
1270{
1271        if (setup_vmcs_config(&vmcs_config) < 0)
1272                return -EIO;
1273
1274        if (boot_cpu_has(X86_FEATURE_NX))
1275                kvm_enable_efer_bits(EFER_NX);
1276
1277        return alloc_kvm_area();
1278}
1279
1280static __exit void hardware_unsetup(void)
1281{
1282        free_kvm_area();
1283}
1284
1285static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
1286{
1287        struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1288
1289        if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) {
1290                vmcs_write16(sf->selector, save->selector);
1291                vmcs_writel(sf->base, save->base);
1292                vmcs_write32(sf->limit, save->limit);
1293                vmcs_write32(sf->ar_bytes, save->ar);
1294        } else {
1295                u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK)
1296                        << AR_DPL_SHIFT;
1297                vmcs_write32(sf->ar_bytes, 0x93 | dpl);
1298        }
1299}
1300
1301static void enter_pmode(struct kvm_vcpu *vcpu)
1302{
1303        unsigned long flags;
1304        struct vcpu_vmx *vmx = to_vmx(vcpu);
1305
1306        vmx->emulation_required = 1;
1307        vcpu->arch.rmode.active = 0;
1308
1309        vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
1310        vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit);
1311        vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar);
1312
1313        flags = vmcs_readl(GUEST_RFLAGS);
1314        flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
1315        flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT);
1316        vmcs_writel(GUEST_RFLAGS, flags);
1317
1318        vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
1319                        (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
1320
1321        update_exception_bitmap(vcpu);
1322
1323        if (emulate_invalid_guest_state)
1324                return;
1325
1326        fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
1327        fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
1328        fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
1329        fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
1330
1331        vmcs_write16(GUEST_SS_SELECTOR, 0);
1332        vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
1333
1334        vmcs_write16(GUEST_CS_SELECTOR,
1335                     vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
1336        vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1337}
1338
1339static gva_t rmode_tss_base(struct kvm *kvm)
1340{
1341        if (!kvm->arch.tss_addr) {
1342                gfn_t base_gfn = kvm->memslots[0].base_gfn +
1343                                 kvm->memslots[0].npages - 3;
1344                return base_gfn << PAGE_SHIFT;
1345        }
1346        return kvm->arch.tss_addr;
1347}
1348
1349static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
1350{
1351        struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1352
1353        save->selector = vmcs_read16(sf->selector);
1354        save->base = vmcs_readl(sf->base);
1355        save->limit = vmcs_read32(sf->limit);
1356        save->ar = vmcs_read32(sf->ar_bytes);
1357        vmcs_write16(sf->selector, save->base >> 4);
1358        vmcs_write32(sf->base, save->base & 0xfffff);
1359        vmcs_write32(sf->limit, 0xffff);
1360        vmcs_write32(sf->ar_bytes, 0xf3);
1361}
1362
1363static void enter_rmode(struct kvm_vcpu *vcpu)
1364{
1365        unsigned long flags;
1366        struct vcpu_vmx *vmx = to_vmx(vcpu);
1367
1368        vmx->emulation_required = 1;
1369        vcpu->arch.rmode.active = 1;
1370
1371        vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
1372        vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
1373
1374        vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
1375        vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
1376
1377        vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
1378        vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1379
1380        flags = vmcs_readl(GUEST_RFLAGS);
1381        vcpu->arch.rmode.save_iopl
1382                = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1383
1384        flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
1385
1386        vmcs_writel(GUEST_RFLAGS, flags);
1387        vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
1388        update_exception_bitmap(vcpu);
1389
1390        if (emulate_invalid_guest_state)
1391                goto continue_rmode;
1392
1393        vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
1394        vmcs_write32(GUEST_SS_LIMIT, 0xffff);
1395        vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
1396
1397        vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
1398        vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1399        if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
1400                vmcs_writel(GUEST_CS_BASE, 0xf0000);
1401        vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
1402
1403        fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
1404        fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
1405        fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
1406        fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
1407
1408continue_rmode:
1409        kvm_mmu_reset_context(vcpu);
1410        init_rmode(vcpu->kvm);
1411}
1412
1413#ifdef CONFIG_X86_64
1414
1415static void enter_lmode(struct kvm_vcpu *vcpu)
1416{
1417        u32 guest_tr_ar;
1418
1419        guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
1420        if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
1421                printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
1422                       __func__);
1423                vmcs_write32(GUEST_TR_AR_BYTES,
1424                             (guest_tr_ar & ~AR_TYPE_MASK)
1425                             | AR_TYPE_BUSY_64_TSS);
1426        }
1427
1428        vcpu->arch.shadow_efer |= EFER_LMA;
1429
1430        find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME;
1431        vmcs_write32(VM_ENTRY_CONTROLS,
1432                     vmcs_read32(VM_ENTRY_CONTROLS)
1433                     | VM_ENTRY_IA32E_MODE);
1434}
1435
1436static void exit_lmode(struct kvm_vcpu *vcpu)
1437{
1438        vcpu->arch.shadow_efer &= ~EFER_LMA;
1439
1440        vmcs_write32(VM_ENTRY_CONTROLS,
1441                     vmcs_read32(VM_ENTRY_CONTROLS)
1442                     & ~VM_ENTRY_IA32E_MODE);
1443}
1444
1445#endif
1446
1447static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
1448{
1449        vpid_sync_vcpu_all(to_vmx(vcpu));
1450        if (vm_need_ept())
1451                ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
1452}
1453
1454static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1455{
1456        vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK;
1457        vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
1458}
1459
1460static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
1461{
1462        if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
1463                if (!load_pdptrs(vcpu, vcpu->arch.cr3)) {
1464                        printk(KERN_ERR "EPT: Fail to load pdptrs!\n");
1465                        return;
1466                }
1467                vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]);
1468                vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]);
1469                vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]);
1470                vmcs_write64(GUEST_PDPTR3, vcpu->arch.pdptrs[3]);
1471        }
1472}
1473
1474static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
1475
1476static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
1477                                        unsigned long cr0,
1478                                        struct kvm_vcpu *vcpu)
1479{
1480        if (!(cr0 & X86_CR0_PG)) {
1481                /* From paging/starting to nonpaging */
1482                vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
1483                             vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
1484                             (CPU_BASED_CR3_LOAD_EXITING |
1485                              CPU_BASED_CR3_STORE_EXITING));
1486                vcpu->arch.cr0 = cr0;
1487                vmx_set_cr4(vcpu, vcpu->arch.cr4);
1488                *hw_cr0 |= X86_CR0_PE | X86_CR0_PG;
1489                *hw_cr0 &= ~X86_CR0_WP;
1490        } else if (!is_paging(vcpu)) {
1491                /* From nonpaging to paging */
1492                vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
1493                             vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
1494                             ~(CPU_BASED_CR3_LOAD_EXITING |
1495                               CPU_BASED_CR3_STORE_EXITING));
1496                vcpu->arch.cr0 = cr0;
1497                vmx_set_cr4(vcpu, vcpu->arch.cr4);
1498                if (!(vcpu->arch.cr0 & X86_CR0_WP))
1499                        *hw_cr0 &= ~X86_CR0_WP;
1500        }
1501}
1502
1503static void ept_update_paging_mode_cr4(unsigned long *hw_cr4,
1504                                        struct kvm_vcpu *vcpu)
1505{
1506        if (!is_paging(vcpu)) {
1507                *hw_cr4 &= ~X86_CR4_PAE;
1508                *hw_cr4 |= X86_CR4_PSE;
1509        } else if (!(vcpu->arch.cr4 & X86_CR4_PAE))
1510                *hw_cr4 &= ~X86_CR4_PAE;
1511}
1512
1513static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1514{
1515        unsigned long hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) |
1516                                KVM_VM_CR0_ALWAYS_ON;
1517
1518        vmx_fpu_deactivate(vcpu);
1519
1520        if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE))
1521                enter_pmode(vcpu);
1522
1523        if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE))
1524                enter_rmode(vcpu);
1525
1526#ifdef CONFIG_X86_64
1527        if (vcpu->arch.shadow_efer & EFER_LME) {
1528                if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
1529                        enter_lmode(vcpu);
1530                if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
1531                        exit_lmode(vcpu);
1532        }
1533#endif
1534
1535        if (vm_need_ept())
1536                ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
1537
1538        vmcs_writel(CR0_READ_SHADOW, cr0);
1539        vmcs_writel(GUEST_CR0, hw_cr0);
1540        vcpu->arch.cr0 = cr0;
1541
1542        if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
1543                vmx_fpu_activate(vcpu);
1544}
1545
1546static u64 construct_eptp(unsigned long root_hpa)
1547{
1548        u64 eptp;
1549
1550        /* TODO write the value reading from MSR */
1551        eptp = VMX_EPT_DEFAULT_MT |
1552                VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
1553        eptp |= (root_hpa & PAGE_MASK);
1554
1555        return eptp;
1556}
1557
1558static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1559{
1560        unsigned long guest_cr3;
1561        u64 eptp;
1562
1563        guest_cr3 = cr3;
1564        if (vm_need_ept()) {
1565                eptp = construct_eptp(cr3);
1566                vmcs_write64(EPT_POINTER, eptp);
1567                ept_sync_context(eptp);
1568                ept_load_pdptrs(vcpu);
1569                guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 :
1570                        VMX_EPT_IDENTITY_PAGETABLE_ADDR;
1571        }
1572
1573        vmx_flush_tlb(vcpu);
1574        vmcs_writel(GUEST_CR3, guest_cr3);
1575        if (vcpu->arch.cr0 & X86_CR0_PE)
1576                vmx_fpu_deactivate(vcpu);
1577}
1578
1579static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1580{
1581        unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.active ?
1582                    KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
1583
1584        vcpu->arch.cr4 = cr4;
1585        if (vm_need_ept())
1586                ept_update_paging_mode_cr4(&hw_cr4, vcpu);
1587
1588        vmcs_writel(CR4_READ_SHADOW, cr4);
1589        vmcs_writel(GUEST_CR4, hw_cr4);
1590}
1591
1592static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
1593{
1594        struct vcpu_vmx *vmx = to_vmx(vcpu);
1595        struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
1596
1597        vcpu->arch.shadow_efer = efer;
1598        if (!msr)
1599                return;
1600        if (efer & EFER_LMA) {
1601                vmcs_write32(VM_ENTRY_CONTROLS,
1602                                     vmcs_read32(VM_ENTRY_CONTROLS) |
1603                                     VM_ENTRY_IA32E_MODE);
1604                msr->data = efer;
1605
1606        } else {
1607                vmcs_write32(VM_ENTRY_CONTROLS,
1608                                     vmcs_read32(VM_ENTRY_CONTROLS) &
1609                                     ~VM_ENTRY_IA32E_MODE);
1610
1611                msr->data = efer & ~EFER_LME;
1612        }
1613        setup_msrs(vmx);
1614}
1615
1616static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1617{
1618        struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1619
1620        return vmcs_readl(sf->base);
1621}
1622
1623static void vmx_get_segment(struct kvm_vcpu *vcpu,
1624                            struct kvm_segment *var, int seg)
1625{
1626        struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1627        u32 ar;
1628
1629        var->base = vmcs_readl(sf->base);
1630        var->limit = vmcs_read32(sf->limit);
1631        var->selector = vmcs_read16(sf->selector);
1632        ar = vmcs_read32(sf->ar_bytes);
1633        if (ar & AR_UNUSABLE_MASK)
1634                ar = 0;
1635        var->type = ar & 15;
1636        var->s = (ar >> 4) & 1;
1637        var->dpl = (ar >> 5) & 3;
1638        var->present = (ar >> 7) & 1;
1639        var->avl = (ar >> 12) & 1;
1640        var->l = (ar >> 13) & 1;
1641        var->db = (ar >> 14) & 1;
1642        var->g = (ar >> 15) & 1;
1643        var->unusable = (ar >> 16) & 1;
1644}
1645
1646static int vmx_get_cpl(struct kvm_vcpu *vcpu)
1647{
1648        struct kvm_segment kvm_seg;
1649
1650        if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */
1651                return 0;
1652
1653        if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */
1654                return 3;
1655
1656        vmx_get_segment(vcpu, &kvm_seg, VCPU_SREG_CS);
1657        return kvm_seg.selector & 3;
1658}
1659
1660static u32 vmx_segment_access_rights(struct kvm_segment *var)
1661{
1662        u32 ar;
1663
1664        if (var->unusable)
1665                ar = 1 << 16;
1666        else {
1667                ar = var->type & 15;
1668                ar |= (var->s & 1) << 4;
1669                ar |= (var->dpl & 3) << 5;
1670                ar |= (var->present & 1) << 7;
1671                ar |= (var->avl & 1) << 12;
1672                ar |= (var->l & 1) << 13;
1673                ar |= (var->db & 1) << 14;
1674                ar |= (var->g & 1) << 15;
1675        }
1676        if (ar == 0) /* a 0 value means unusable */
1677                ar = AR_UNUSABLE_MASK;
1678
1679        return ar;
1680}
1681
1682static void vmx_set_segment(struct kvm_vcpu *vcpu,
1683                            struct kvm_segment *var, int seg)
1684{
1685        struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1686        u32 ar;
1687
1688        if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) {
1689                vcpu->arch.rmode.tr.selector = var->selector;
1690                vcpu->arch.rmode.tr.base = var->base;
1691                vcpu->arch.rmode.tr.limit = var->limit;
1692                vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var);
1693                return;
1694        }
1695        vmcs_writel(sf->base, var->base);
1696        vmcs_write32(sf->limit, var->limit);
1697        vmcs_write16(sf->selector, var->selector);
1698        if (vcpu->arch.rmode.active && var->s) {
1699                /*
1700                 * Hack real-mode segments into vm86 compatibility.
1701                 */
1702                if (var->base == 0xffff0000 && var->selector == 0xf000)
1703                        vmcs_writel(sf->base, 0xf0000);
1704                ar = 0xf3;
1705        } else
1706                ar = vmx_segment_access_rights(var);
1707        vmcs_write32(sf->ar_bytes, ar);
1708}
1709
1710static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
1711{
1712        u32 ar = vmcs_read32(GUEST_CS_AR_BYTES);
1713
1714        *db = (ar >> 14) & 1;
1715        *l = (ar >> 13) & 1;
1716}
1717
1718static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1719{
1720        dt->limit = vmcs_read32(GUEST_IDTR_LIMIT);
1721        dt->base = vmcs_readl(GUEST_IDTR_BASE);
1722}
1723
1724static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1725{
1726        vmcs_write32(GUEST_IDTR_LIMIT, dt->limit);
1727        vmcs_writel(GUEST_IDTR_BASE, dt->base);
1728}
1729
1730static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1731{
1732        dt->limit = vmcs_read32(GUEST_GDTR_LIMIT);
1733        dt->base = vmcs_readl(GUEST_GDTR_BASE);
1734}
1735
1736static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1737{
1738        vmcs_write32(GUEST_GDTR_LIMIT, dt->limit);
1739        vmcs_writel(GUEST_GDTR_BASE, dt->base);
1740}
1741
1742static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
1743{
1744        struct kvm_segment var;
1745        u32 ar;
1746
1747        vmx_get_segment(vcpu, &var, seg);
1748        ar = vmx_segment_access_rights(&var);
1749
1750        if (var.base != (var.selector << 4))
1751                return false;
1752        if (var.limit != 0xffff)
1753                return false;
1754        if (ar != 0xf3)
1755                return false;
1756
1757        return true;
1758}
1759
1760static bool code_segment_valid(struct kvm_vcpu *vcpu)
1761{
1762        struct kvm_segment cs;
1763        unsigned int cs_rpl;
1764
1765        vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
1766        cs_rpl = cs.selector & SELECTOR_RPL_MASK;
1767
1768        if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK))
1769                return false;
1770        if (!cs.s)
1771                return false;
1772        if (!(~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK))) {
1773                if (cs.dpl > cs_rpl)
1774                        return false;
1775        } else if (cs.type & AR_TYPE_CODE_MASK) {
1776                if (cs.dpl != cs_rpl)
1777                        return false;
1778        }
1779        if (!cs.present)
1780                return false;
1781
1782        /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
1783        return true;
1784}
1785
1786static bool stack_segment_valid(struct kvm_vcpu *vcpu)
1787{
1788        struct kvm_segment ss;
1789        unsigned int ss_rpl;
1790
1791        vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
1792        ss_rpl = ss.selector & SELECTOR_RPL_MASK;
1793
1794        if ((ss.type != 3) || (ss.type != 7))
1795                return false;
1796        if (!ss.s)
1797                return false;
1798        if (ss.dpl != ss_rpl) /* DPL != RPL */
1799                return false;
1800        if (!ss.present)
1801                return false;
1802
1803        return true;
1804}
1805
1806static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
1807{
1808        struct kvm_segment var;
1809        unsigned int rpl;
1810
1811        vmx_get_segment(vcpu, &var, seg);
1812        rpl = var.selector & SELECTOR_RPL_MASK;
1813
1814        if (!var.s)
1815                return false;
1816        if (!var.present)
1817                return false;
1818        if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) {
1819                if (var.dpl < rpl) /* DPL < RPL */
1820                        return false;
1821        }
1822
1823        /* TODO: Add other members to kvm_segment_field to allow checking for other access
1824         * rights flags
1825         */
1826        return true;
1827}
1828
1829static bool tr_valid(struct kvm_vcpu *vcpu)
1830{
1831        struct kvm_segment tr;
1832
1833        vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
1834
1835        if (tr.selector & SELECTOR_TI_MASK)     /* TI = 1 */
1836                return false;
1837        if ((tr.type != 3) || (tr.type != 11)) /* TODO: Check if guest is in IA32e mode */
1838                return false;
1839        if (!tr.present)
1840                return false;
1841
1842        return true;
1843}
1844
1845static bool ldtr_valid(struct kvm_vcpu *vcpu)
1846{
1847        struct kvm_segment ldtr;
1848
1849        vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
1850
1851        if (ldtr.selector & SELECTOR_TI_MASK)   /* TI = 1 */
1852                return false;
1853        if (ldtr.type != 2)
1854                return false;
1855        if (!ldtr.present)
1856                return false;
1857
1858        return true;
1859}
1860
1861static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
1862{
1863        struct kvm_segment cs, ss;
1864
1865        vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
1866        vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
1867
1868        return ((cs.selector & SELECTOR_RPL_MASK) ==
1869                 (ss.selector & SELECTOR_RPL_MASK));
1870}
1871
1872/*
1873 * Check if guest state is valid. Returns true if valid, false if
1874 * not.
1875 * We assume that registers are always usable
1876 */
1877static bool guest_state_valid(struct kvm_vcpu *vcpu)
1878{
1879        /* real mode guest state checks */
1880        if (!(vcpu->arch.cr0 & X86_CR0_PE)) {
1881                if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
1882                        return false;
1883                if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
1884                        return false;
1885                if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
1886                        return false;
1887                if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
1888                        return false;
1889                if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
1890                        return false;
1891                if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
1892                        return false;
1893        } else {
1894        /* protected mode guest state checks */
1895                if (!cs_ss_rpl_check(vcpu))
1896                        return false;
1897                if (!code_segment_valid(vcpu))
1898                        return false;
1899                if (!stack_segment_valid(vcpu))
1900                        return false;
1901                if (!data_segment_valid(vcpu, VCPU_SREG_DS))
1902                        return false;
1903                if (!data_segment_valid(vcpu, VCPU_SREG_ES))
1904                        return false;
1905                if (!data_segment_valid(vcpu, VCPU_SREG_FS))
1906                        return false;
1907                if (!data_segment_valid(vcpu, VCPU_SREG_GS))
1908                        return false;
1909                if (!tr_valid(vcpu))
1910                        return false;
1911                if (!ldtr_valid(vcpu))
1912                        return false;
1913        }
1914        /* TODO:
1915         * - Add checks on RIP
1916         * - Add checks on RFLAGS
1917         */
1918
1919        return true;
1920}
1921
1922static int init_rmode_tss(struct kvm *kvm)
1923{
1924        gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
1925        u16 data = 0;
1926        int ret = 0;
1927        int r;
1928
1929        r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
1930        if (r < 0)
1931                goto out;
1932        data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
1933        r = kvm_write_guest_page(kvm, fn++, &data,
1934                        TSS_IOPB_BASE_OFFSET, sizeof(u16));
1935        if (r < 0)
1936                goto out;
1937        r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
1938        if (r < 0)
1939                goto out;
1940        r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
1941        if (r < 0)
1942                goto out;
1943        data = ~0;
1944        r = kvm_write_guest_page(kvm, fn, &data,
1945                                 RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
1946                                 sizeof(u8));
1947        if (r < 0)
1948                goto out;
1949
1950        ret = 1;
1951out:
1952        return ret;
1953}
1954
1955static int init_rmode_identity_map(struct kvm *kvm)
1956{
1957        int i, r, ret;
1958        pfn_t identity_map_pfn;
1959        u32 tmp;
1960
1961        if (!vm_need_ept())
1962                return 1;
1963        if (unlikely(!kvm->arch.ept_identity_pagetable)) {
1964                printk(KERN_ERR "EPT: identity-mapping pagetable "
1965                        "haven't been allocated!\n");
1966                return 0;
1967        }
1968        if (likely(kvm->arch.ept_identity_pagetable_done))
1969                return 1;
1970        ret = 0;
1971        identity_map_pfn = VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT;
1972        r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
1973        if (r < 0)
1974                goto out;
1975        /* Set up identity-mapping pagetable for EPT in real mode */
1976        for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
1977                tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
1978                        _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
1979                r = kvm_write_guest_page(kvm, identity_map_pfn,
1980                                &tmp, i * sizeof(tmp), sizeof(tmp));
1981                if (r < 0)
1982                        goto out;
1983        }
1984        kvm->arch.ept_identity_pagetable_done = true;
1985        ret = 1;
1986out:
1987        return ret;
1988}
1989
1990static void seg_setup(int seg)
1991{
1992        struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1993
1994        vmcs_write16(sf->selector, 0);
1995        vmcs_writel(sf->base, 0);
1996        vmcs_write32(sf->limit, 0xffff);
1997        vmcs_write32(sf->ar_bytes, 0xf3);
1998}
1999
2000static int alloc_apic_access_page(struct kvm *kvm)
2001{
2002        struct kvm_userspace_memory_region kvm_userspace_mem;
2003        int r = 0;
2004
2005        down_write(&kvm->slots_lock);
2006        if (kvm->arch.apic_access_page)
2007                goto out;
2008        kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
2009        kvm_userspace_mem.flags = 0;
2010        kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
2011        kvm_userspace_mem.memory_size = PAGE_SIZE;
2012        r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
2013        if (r)
2014                goto out;
2015
2016        kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
2017out:
2018        up_write(&kvm->slots_lock);
2019        return r;
2020}
2021
2022static int alloc_identity_pagetable(struct kvm *kvm)
2023{
2024        struct kvm_userspace_memory_region kvm_userspace_mem;
2025        int r = 0;
2026
2027        down_write(&kvm->slots_lock);
2028        if (kvm->arch.ept_identity_pagetable)
2029                goto out;
2030        kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
2031        kvm_userspace_mem.flags = 0;
2032        kvm_userspace_mem.guest_phys_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
2033        kvm_userspace_mem.memory_size = PAGE_SIZE;
2034        r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
2035        if (r)
2036                goto out;
2037
2038        kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
2039                        VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT);
2040out:
2041        up_write(&kvm->slots_lock);
2042        return r;
2043}
2044
2045static void allocate_vpid(struct vcpu_vmx *vmx)
2046{
2047        int vpid;
2048
2049        vmx->vpid = 0;
2050        if (!enable_vpid || !cpu_has_vmx_vpid())
2051                return;
2052        spin_lock(&vmx_vpid_lock);
2053        vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
2054        if (vpid < VMX_NR_VPIDS) {
2055                vmx->vpid = vpid;
2056                __set_bit(vpid, vmx_vpid_bitmap);
2057        }
2058        spin_unlock(&vmx_vpid_lock);
2059}
2060
2061static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr)
2062{
2063        void *va;
2064
2065        if (!cpu_has_vmx_msr_bitmap())
2066                return;
2067
2068        /*
2069         * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
2070         * have the write-low and read-high bitmap offsets the wrong way round.
2071         * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
2072         */
2073        va = kmap(msr_bitmap);
2074        if (msr <= 0x1fff) {
2075                __clear_bit(msr, va + 0x000); /* read-low */
2076                __clear_bit(msr, va + 0x800); /* write-low */
2077        } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
2078                msr &= 0x1fff;
2079                __clear_bit(msr, va + 0x400); /* read-high */
2080                __clear_bit(msr, va + 0xc00); /* write-high */
2081        }
2082        kunmap(msr_bitmap);
2083}
2084
2085/*
2086 * Sets up the vmcs for emulated real mode.
2087 */
2088static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2089{
2090        u32 host_sysenter_cs;
2091        u32 junk;
2092        unsigned long a;
2093        struct descriptor_table dt;
2094        int i;
2095        unsigned long kvm_vmx_return;
2096        u32 exec_control;
2097
2098        /* I/O */
2099        vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a));
2100        vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b));
2101
2102        if (cpu_has_vmx_msr_bitmap())
2103                vmcs_write64(MSR_BITMAP, page_to_phys(vmx_msr_bitmap));
2104
2105        vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
2106
2107        /* Control */
2108        vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
2109                vmcs_config.pin_based_exec_ctrl);
2110
2111        exec_control = vmcs_config.cpu_based_exec_ctrl;
2112        if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
2113                exec_control &= ~CPU_BASED_TPR_SHADOW;
2114#ifdef CONFIG_X86_64
2115                exec_control |= CPU_BASED_CR8_STORE_EXITING |
2116                                CPU_BASED_CR8_LOAD_EXITING;
2117#endif
2118        }
2119        if (!vm_need_ept())
2120                exec_control |= CPU_BASED_CR3_STORE_EXITING |
2121                                CPU_BASED_CR3_LOAD_EXITING  |
2122                                CPU_BASED_INVLPG_EXITING;
2123        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
2124
2125        if (cpu_has_secondary_exec_ctrls()) {
2126                exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
2127                if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
2128                        exec_control &=
2129                                ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2130                if (vmx->vpid == 0)
2131                        exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
2132                if (!vm_need_ept())
2133                        exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
2134                vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
2135        }
2136
2137        vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
2138        vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
2139        vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
2140
2141        vmcs_writel(HOST_CR0, read_cr0());  /* 22.2.3 */
2142        vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */
2143        vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
2144
2145        vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
2146        vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
2147        vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
2148        vmcs_write16(HOST_FS_SELECTOR, kvm_read_fs());    /* 22.2.4 */
2149        vmcs_write16(HOST_GS_SELECTOR, kvm_read_gs());    /* 22.2.4 */
2150        vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
2151#ifdef CONFIG_X86_64
2152        rdmsrl(MSR_FS_BASE, a);
2153        vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
2154        rdmsrl(MSR_GS_BASE, a);
2155        vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
2156#else
2157        vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
2158        vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
2159#endif
2160
2161        vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
2162
2163        kvm_get_idt(&dt);
2164        vmcs_writel(HOST_IDTR_BASE, dt.base);   /* 22.2.4 */
2165
2166        asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
2167        vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
2168        vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
2169        vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
2170        vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
2171
2172        rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
2173        vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
2174        rdmsrl(MSR_IA32_SYSENTER_ESP, a);
2175        vmcs_writel(HOST_IA32_SYSENTER_ESP, a);   /* 22.2.3 */
2176        rdmsrl(MSR_IA32_SYSENTER_EIP, a);
2177        vmcs_writel(HOST_IA32_SYSENTER_EIP, a);   /* 22.2.3 */
2178
2179        for (i = 0; i < NR_VMX_MSR; ++i) {
2180                u32 index = vmx_msr_index[i];
2181                u32 data_low, data_high;
2182                u64 data;
2183                int j = vmx->nmsrs;
2184
2185                if (rdmsr_safe(index, &data_low, &data_high) < 0)
2186                        continue;
2187                if (wrmsr_safe(index, data_low, data_high) < 0)
2188                        continue;
2189                data = data_low | ((u64)data_high << 32);
2190                vmx->host_msrs[j].index = index;
2191                vmx->host_msrs[j].reserved = 0;
2192                vmx->host_msrs[j].data = data;
2193                vmx->guest_msrs[j] = vmx->host_msrs[j];
2194                ++vmx->nmsrs;
2195        }
2196
2197        vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
2198
2199        /* 22.2.1, 20.8.1 */
2200        vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
2201
2202        vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
2203        vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
2204
2205
2206        return 0;
2207}
2208
/*
 * Prepare the real-mode TSS and then the EPT identity map for @kvm.
 * Returns 1 if both helpers succeed, 0 otherwise; the identity map is
 * not attempted when the TSS setup fails.
 */
static int init_rmode(struct kvm *kvm)
{
	return init_rmode_tss(kvm) && init_rmode_identity_map(kvm);
}
2217
2218static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2219{
2220        struct vcpu_vmx *vmx = to_vmx(vcpu);
2221        u64 msr;
2222        int ret;
2223
2224        vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
2225        down_read(&vcpu->kvm->slots_lock);
2226        if (!init_rmode(vmx->vcpu.kvm)) {
2227                ret = -ENOMEM;
2228                goto out;
2229        }
2230
2231        vmx->vcpu.arch.rmode.active = 0;
2232
2233        vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
2234        kvm_set_cr8(&vmx->vcpu, 0);
2235        msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
2236        if (vmx->vcpu.vcpu_id == 0)
2237                msr |= MSR_IA32_APICBASE_BSP;
2238        kvm_set_apic_base(&vmx->vcpu, msr);
2239
2240        fx_init(&vmx->vcpu);
2241
2242        seg_setup(VCPU_SREG_CS);
2243        /*
2244         * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
2245         * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
2246         */
2247        if (vmx->vcpu.vcpu_id == 0) {
2248                vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
2249                vmcs_writel(GUEST_CS_BASE, 0x000f0000);
2250        } else {
2251                vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
2252                vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
2253        }
2254
2255        seg_setup(VCPU_SREG_DS);
2256        seg_setup(VCPU_SREG_ES);
2257        seg_setup(VCPU_SREG_FS);
2258        seg_setup(VCPU_SREG_GS);
2259        seg_setup(VCPU_SREG_SS);
2260
2261        vmcs_write16(GUEST_TR_SELECTOR, 0);
2262        vmcs_writel(GUEST_TR_BASE, 0);
2263        vmcs_write32(GUEST_TR_LIMIT, 0xffff);
2264        vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
2265
2266        vmcs_write16(GUEST_LDTR_SELECTOR, 0);
2267        vmcs_writel(GUEST_LDTR_BASE, 0);
2268        vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
2269        vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
2270
2271        vmcs_write32(GUEST_SYSENTER_CS, 0);
2272        vmcs_writel(GUEST_SYSENTER_ESP, 0);
2273        vmcs_writel(GUEST_SYSENTER_EIP, 0);
2274
2275        vmcs_writel(GUEST_RFLAGS, 0x02);
2276        if (vmx->vcpu.vcpu_id == 0)
2277                kvm_rip_write(vcpu, 0xfff0);
2278        else
2279                kvm_rip_write(vcpu, 0);
2280        kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
2281
2282        /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */
2283        vmcs_writel(GUEST_DR7, 0x400);
2284
2285        vmcs_writel(GUEST_GDTR_BASE, 0);
2286        vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
2287
2288        vmcs_writel(GUEST_IDTR_BASE, 0);
2289        vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
2290
2291        vmcs_write32(GUEST_ACTIVITY_STATE, 0);
2292        vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
2293        vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
2294
2295        guest_write_tsc(0);
2296
2297        /* Special registers */
2298        vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
2299
2300        setup_msrs(vmx);
2301
2302        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
2303
2304        if (cpu_has_vmx_tpr_shadow()) {
2305                vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
2306                if (vm_need_tpr_shadow(vmx->vcpu.kvm))
2307                        vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
2308                                page_to_phys(vmx->vcpu.arch.apic->regs_page));
2309                vmcs_write32(TPR_THRESHOLD, 0);
2310        }
2311
2312        if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
2313                vmcs_write64(APIC_ACCESS_ADDR,
2314                             page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
2315
2316        if (vmx->vpid != 0)
2317                vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
2318
2319        vmx->vcpu.arch.cr0 = 0x60000010;
2320        vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
2321        vmx_set_cr4(&vmx->vcpu, 0);
2322        vmx_set_efer(&vmx->vcpu, 0);
2323        vmx_fpu_activate(&vmx->vcpu);
2324        update_exception_bitmap(&vmx->vcpu);
2325
2326        vpid_sync_vcpu_all(vmx);
2327
2328        ret = 0;
2329
2330        /* HACK: Don't enable emulation on guest boot/reset */
2331        vmx->emulation_required = 0;
2332
2333out:
2334        up_read(&vcpu->kvm->slots_lock);
2335        return ret;
2336}
2337
2338static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
2339{
2340        struct vcpu_vmx *vmx = to_vmx(vcpu);
2341
2342        KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler);
2343
2344        ++vcpu->stat.irq_injections;
2345        if (vcpu->arch.rmode.active) {
2346                vmx->rmode.irq.pending = true;
2347                vmx->rmode.irq.vector = irq;
2348                vmx->rmode.irq.rip = kvm_rip_read(vcpu);
2349                vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2350                             irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
2351                vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
2352                kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
2353                return;
2354        }
2355        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2356                        irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
2357}
2358
2359static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2360{
2361        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2362                        INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
2363}
2364
2365static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
2366{
2367        int word_index = __ffs(vcpu->arch.irq_summary);
2368        int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
2369        int irq = word_index * BITS_PER_LONG + bit_index;
2370
2371        clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
2372        if (!vcpu->arch.irq_pending[word_index])
2373                clear_bit(word_index, &vcpu->arch.irq_summary);
2374        kvm_queue_interrupt(vcpu, irq);
2375}
2376
2377
2378static void do_interrupt_requests(struct kvm_vcpu *vcpu,
2379                                       struct kvm_run *kvm_run)
2380{
2381        u32 cpu_based_vm_exec_control;
2382
2383        vcpu->arch.interrupt_window_open =
2384                ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
2385                 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
2386
2387        if (vcpu->arch.interrupt_window_open &&
2388            vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
2389                kvm_do_inject_irq(vcpu);
2390
2391        if (vcpu->arch.interrupt_window_open && vcpu->arch.interrupt.pending)
2392                vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
2393
2394        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2395        if (!vcpu->arch.interrupt_window_open &&
2396            (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
2397                /*
2398                 * Interrupts blocked.  Wait for unblock.
2399                 */
2400                cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
2401        else
2402                cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2403        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2404}
2405
2406static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
2407{
2408        int ret;
2409        struct kvm_userspace_memory_region tss_mem = {
2410                .slot = 8,
2411                .guest_phys_addr = addr,
2412                .memory_size = PAGE_SIZE * 3,
2413                .flags = 0,
2414        };
2415
2416        ret = kvm_set_memory_region(kvm, &tss_mem, 0);
2417        if (ret)
2418                return ret;
2419        kvm->arch.tss_addr = addr;
2420        return 0;
2421}
2422
2423static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
2424{
2425        struct kvm_guest_debug *dbg = &vcpu->guest_debug;
2426
2427        set_debugreg(dbg->bp[0], 0);
2428        set_debugreg(dbg->bp[1], 1);
2429        set_debugreg(dbg->bp[2], 2);
2430        set_debugreg(dbg->bp[3], 3);
2431
2432        if (dbg->singlestep) {
2433                unsigned long flags;
2434
2435                flags = vmcs_readl(GUEST_RFLAGS);
2436                flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
2437                vmcs_writel(GUEST_RFLAGS, flags);
2438        }
2439}
2440
2441static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2442                                  int vec, u32 err_code)
2443{
2444        /*
2445         * Instruction with address size override prefix opcode 0x67
2446         * Cause the #SS fault with 0 error code in VM86 mode.
2447         */
2448        if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
2449                if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE)
2450                        return 1;
2451        /*
2452         * Forward all other exceptions that are valid in real mode.
2453         * FIXME: Breaks guest debugging in real mode, needs to be fixed with
2454         *        the required debugging infrastructure rework.
2455         */
2456        switch (vec) {
2457        case DE_VECTOR:
2458        case DB_VECTOR:
2459        case BP_VECTOR:
2460        case OF_VECTOR:
2461        case BR_VECTOR:
2462        case UD_VECTOR:
2463        case DF_VECTOR:
2464        case SS_VECTOR:
2465        case GP_VECTOR:
2466        case MF_VECTOR:
2467                kvm_queue_exception(vcpu, vec);
2468                return 1;
2469        }
2470        return 0;
2471}
2472
2473static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2474{
2475        struct vcpu_vmx *vmx = to_vmx(vcpu);
2476        u32 intr_info, error_code;
2477        unsigned long cr2, rip;
2478        u32 vect_info;
2479        enum emulation_result er;
2480
2481        vect_info = vmx->idt_vectoring_info;
2482        intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2483
2484        if ((vect_info & VECTORING_INFO_VALID_MASK) &&
2485                                                !is_page_fault(intr_info))
2486                printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
2487                       "intr info 0x%x\n", __func__, vect_info, intr_info);
2488
2489        if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
2490                int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
2491                set_bit(irq, vcpu->arch.irq_pending);
2492                set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
2493        }
2494
2495        if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
2496                return 1;  /* already handled by vmx_vcpu_run() */
2497
2498        if (is_no_device(intr_info)) {
2499                vmx_fpu_activate(vcpu);
2500                return 1;
2501        }
2502
2503        if (is_invalid_opcode(intr_info)) {
2504                er = emulate_instruction(vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD);
2505                if (er != EMULATE_DONE)
2506                        kvm_queue_exception(vcpu, UD_VECTOR);
2507                return 1;
2508        }
2509
2510        error_code = 0;
2511        rip = kvm_rip_read(vcpu);
2512        if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
2513                error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
2514        if (is_page_fault(intr_info)) {
2515                /* EPT won't cause page fault directly */
2516                if (vm_need_ept())
2517                        BUG();
2518                cr2 = vmcs_readl(EXIT_QUALIFICATION);
2519                KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
2520                            (u32)((u64)cr2 >> 32), handler);
2521                if (vcpu->arch.interrupt.pending || vcpu->arch.exception.pending)
2522                        kvm_mmu_unprotect_page_virt(vcpu, cr2);
2523                return kvm_mmu_page_fault(vcpu, cr2, error_code);
2524        }
2525
2526        if (vcpu->arch.rmode.active &&
2527            handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
2528                                                                error_code)) {
2529                if (vcpu->arch.halt_request) {
2530                        vcpu->arch.halt_request = 0;
2531                        return kvm_emulate_halt(vcpu);
2532                }
2533                return 1;
2534        }
2535
2536        if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) ==
2537            (INTR_TYPE_EXCEPTION | 1)) {
2538                kvm_run->exit_reason = KVM_EXIT_DEBUG;
2539                return 0;
2540        }
2541        kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
2542        kvm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
2543        kvm_run->ex.error_code = error_code;
2544        return 0;
2545}
2546
2547static int handle_external_interrupt(struct kvm_vcpu *vcpu,
2548                                     struct kvm_run *kvm_run)
2549{
2550        ++vcpu->stat.irq_exits;
2551        KVMTRACE_1D(INTR, vcpu, vmcs_read32(VM_EXIT_INTR_INFO), handler);
2552        return 1;
2553}
2554
2555static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2556{
2557        kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
2558        return 0;
2559}
2560
2561static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2562{
2563        unsigned long exit_qualification;
2564        int size, down, in, string, rep;
2565        unsigned port;
2566
2567        ++vcpu->stat.io_exits;
2568        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
2569        string = (exit_qualification & 16) != 0;
2570
2571        if (string) {
2572                if (emulate_instruction(vcpu,
2573                                        kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
2574                        return 0;
2575                return 1;
2576        }
2577
2578        size = (exit_qualification & 7) + 1;
2579        in = (exit_qualification & 8) != 0;
2580        down = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
2581        rep = (exit_qualification & 32) != 0;
2582        port = exit_qualification >> 16;
2583
2584        return kvm_emulate_pio(vcpu, kvm_run, in, size, port);
2585}
2586
/*
 * Write the three-byte VMCALL instruction (0f 01 c1) to the start of
 * @hypercall.  @vcpu is not used.
 */
static void
vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
	/*
	 * Patch in the VMCALL instruction:
	 */
	static const unsigned char vmcall_insn[] = { 0x0f, 0x01, 0xc1 };
	unsigned int i;

	for (i = 0; i < sizeof(vmcall_insn); i++)
		hypercall[i] = vmcall_insn[i];
}
2597
2598static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2599{
2600        unsigned long exit_qualification;
2601        int cr;
2602        int reg;
2603
2604        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
2605        cr = exit_qualification & 15;
2606        reg = (exit_qualification >> 8) & 15;
2607        switch ((exit_qualification >> 4) & 3) {
2608        case 0: /* mov to cr */
2609                KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr,
2610                            (u32)kvm_register_read(vcpu, reg),
2611                            (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
2612                            handler);
2613                switch (cr) {
2614                case 0:
2615                        kvm_set_cr0(vcpu, kvm_register_read(vcpu, reg));
2616                        skip_emulated_instruction(vcpu);
2617                        return 1;
2618                case 3:
2619                        kvm_set_cr3(vcpu, kvm_register_read(vcpu, reg));
2620                        skip_emulated_instruction(vcpu);
2621                        return 1;
2622                case 4:
2623                        kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg));
2624                        skip_emulated_instruction(vcpu);
2625                        return 1;
2626                case 8:
2627                        kvm_set_cr8(vcpu, kvm_register_read(vcpu, reg));
2628                        skip_emulated_instruction(vcpu);
2629                        if (irqchip_in_kernel(vcpu->kvm))
2630                                return 1;
2631                        kvm_run->exit_reason = KVM_EXIT_SET_TPR;
2632                        return 0;
2633                };
2634                break;
2635        case 2: /* clts */
2636                vmx_fpu_deactivate(vcpu);
2637                vcpu->arch.cr0 &= ~X86_CR0_TS;
2638                vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
2639                vmx_fpu_activate(vcpu);
2640                KVMTRACE_0D(CLTS, vcpu, handler);
2641                skip_emulated_instruction(vcpu);
2642                return 1;
2643        case 1: /*mov from cr*/
2644                switch (cr) {
2645                case 3:
2646                        kvm_register_write(vcpu, reg, vcpu->arch.cr3);
2647                        KVMTRACE_3D(CR_READ, vcpu, (u32)cr,
2648                                    (u32)kvm_register_read(vcpu, reg),
2649                                    (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
2650                                    handler);
2651                        skip_emulated_instruction(vcpu);
2652                        return 1;
2653                case 8:
2654                        kvm_register_write(vcpu, reg, kvm_get_cr8(vcpu));
2655                        KVMTRACE_2D(CR_READ, vcpu, (u32)cr,
2656                                    (u32)kvm_register_read(vcpu, reg), handler);
2657                        skip_emulated_instruction(vcpu);
2658                        return 1;
2659                }
2660                break;
2661        case 3: /* lmsw */
2662                kvm_lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
2663
2664                skip_emulated_instruction(vcpu);
2665                return 1;
2666        default:
2667                break;
2668        }
2669        kvm_run->exit_reason = 0;
2670        pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
2671               (int)(exit_qualification >> 4) & 3, cr);
2672        return 0;
2673}
2674
2675static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2676{
2677        unsigned long exit_qualification;
2678        unsigned long val;
2679        int dr, reg;
2680
2681        /*
2682         * FIXME: this code assumes the host is debugging the guest.
2683         *        need to deal with guest debugging itself too.
2684         */
2685        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
2686        dr = exit_qualification & 7;
2687        reg = (exit_qualification >> 8) & 15;
2688        if (exit_qualification & 16) {
2689                /* mov from dr */
2690                switch (dr) {
2691                case 6:
2692                        val = 0xffff0ff0;
2693                        break;
2694                case 7:
2695                        val = 0x400;
2696                        break;
2697                default:
2698                        val = 0;
2699                }
2700                kvm_register_write(vcpu, reg, val);
2701                KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
2702        } else {
2703                /* mov to dr */
2704        }
2705        skip_emulated_instruction(vcpu);
2706        return 1;
2707}
2708
/*
 * VM-exit handler for CPUID: hands the work to kvm_emulate_cpuid() and
 * always reports the exit as handled (returns 1).
 */
static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
        kvm_emulate_cpuid(vcpu);

        return 1;
}
2714
2715static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2716{
2717        u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
2718        u64 data;
2719
2720        if (vmx_get_msr(vcpu, ecx, &data)) {
2721                kvm_inject_gp(vcpu, 0);
2722                return 1;
2723        }
2724
2725        KVMTRACE_3D(MSR_READ, vcpu, ecx, (u32)data, (u32)(data >> 32),
2726                    handler);
2727
2728        /* FIXME: handling of bits 32:63 of rax, rdx */
2729        vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
2730        vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
2731        skip_emulated_instruction(vcpu);
2732        return 1;
2733}
2734
2735static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2736{
2737        u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
2738        u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
2739                | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
2740
2741        KVMTRACE_3D(MSR_WRITE, vcpu, ecx, (u32)data, (u32)(data >> 32),
2742                    handler);
2743
2744        if (vmx_set_msr(vcpu, ecx, data) != 0) {
2745                kvm_inject_gp(vcpu, 0);
2746                return 1;
2747        }
2748
2749        skip_emulated_instruction(vcpu);
2750        return 1;
2751}
2752
/*
 * VM-exit handler for "TPR below threshold": nothing is done here, the exit
 * is simply reported as handled so the guest is resumed.
 */
static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu,
                                      struct kvm_run *kvm_run)
{
        /* no work required in this handler */
        return 1;
}
2758
2759static int handle_interrupt_window(struct kvm_vcpu *vcpu,
2760                                   struct kvm_run *kvm_run)
2761{
2762        u32 cpu_based_vm_exec_control;
2763
2764        /* clear pending irq */
2765        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2766        cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2767        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2768
2769        KVMTRACE_0D(PEND_INTR, vcpu, handler);
2770
2771        /*
2772         * If the user space waits to inject interrupts, exit as soon as
2773         * possible
2774         */
2775        if (kvm_run->request_interrupt_window &&
2776            !vcpu->arch.irq_summary) {
2777                kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
2778                ++vcpu->stat.irq_window_exits;
2779                return 0;
2780        }
2781        return 1;
2782}
2783
/*
 * VM-exit handler for HLT: step past the instruction, then return whatever
 * kvm_emulate_halt() decides.
 */
static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
        int ret;

        skip_emulated_instruction(vcpu);
        ret = kvm_emulate_halt(vcpu);

        return ret;
}
2789
/*
 * VM-exit handler for VMCALL: step past the instruction, run the hypercall
 * emulation and report the exit as handled.  The result of
 * kvm_emulate_hypercall() is not used here.
 */
static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
        skip_emulated_instruction(vcpu);

        kvm_emulate_hypercall(vcpu);

        return 1;
}
2796
2797static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2798{
2799        u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2800
2801        kvm_mmu_invlpg(vcpu, exit_qualification);
2802        skip_emulated_instruction(vcpu);
2803        return 1;
2804}
2805
/*
 * VM-exit handler for WBINVD: the instruction is only stepped over, no
 * further emulation is performed.  Always returns 1.
 *
 * TODO: Add support for VT-d/pass-through device
 */
static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
        skip_emulated_instruction(vcpu);

        return 1;
}
2812
2813static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2814{
2815        u64 exit_qualification;
2816        enum emulation_result er;
2817        unsigned long offset;
2818
2819        exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2820        offset = exit_qualification & 0xffful;
2821
2822        er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
2823
2824        if (er !=  EMULATE_DONE) {
2825                printk(KERN_ERR
2826                       "Fail to handle apic access vmexit! Offset is 0x%lx\n",
2827                       offset);
2828                return -ENOTSUPP;
2829        }
2830        return 1;
2831}
2832
2833static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2834{
2835        unsigned long exit_qualification;
2836        u16 tss_selector;
2837        int reason;
2838
2839        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
2840
2841        reason = (u32)exit_qualification >> 30;
2842        tss_selector = exit_qualification;
2843
2844        return kvm_task_switch(vcpu, tss_selector, reason);
2845}
2846
2847static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2848{
2849        u64 exit_qualification;
2850        enum emulation_result er;
2851        gpa_t gpa;
2852        unsigned long hva;
2853        int gla_validity;
2854        int r;
2855
2856        exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2857
2858        if (exit_qualification & (1 << 6)) {
2859                printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
2860                return -ENOTSUPP;
2861        }
2862
2863        gla_validity = (exit_qualification >> 7) & 0x3;
2864        if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
2865                printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
2866                printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
2867                        (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
2868                        (long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS));
2869                printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
2870                        (long unsigned int)exit_qualification);
2871                kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
2872                kvm_run->hw.hardware_exit_reason = 0;
2873                return -ENOTSUPP;
2874        }
2875
2876        gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
2877        hva = gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT);
2878        if (!kvm_is_error_hva(hva)) {
2879                r = kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0);
2880                if (r < 0) {
2881                        printk(KERN_ERR "EPT: Not enough memory!\n");
2882                        return -ENOMEM;
2883                }
2884                return 1;
2885        } else {
2886                /* must be MMIO */
2887                er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
2888
2889                if (er == EMULATE_FAIL) {
2890                        printk(KERN_ERR
2891                         "EPT: Fail to handle EPT violation vmexit!er is %d\n",
2892                         er);
2893                        printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
2894                         (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
2895                         (long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS));
2896                        printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
2897                                (long unsigned int)exit_qualification);
2898                        return -ENOTSUPP;
2899                } else if (er == EMULATE_DO_MMIO)
2900                        return 0;
2901        }
2902        return 1;
2903}
2904
2905static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2906{
2907        u32 cpu_based_vm_exec_control;
2908
2909        /* clear pending NMI */
2910        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2911        cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
2912        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2913        ++vcpu->stat.nmi_window_exits;
2914
2915        return 1;
2916}
2917
2918static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
2919                                struct kvm_run *kvm_run)
2920{
2921        struct vcpu_vmx *vmx = to_vmx(vcpu);
2922        int err;
2923
2924        preempt_enable();
2925        local_irq_enable();
2926
2927        while (!guest_state_valid(vcpu)) {
2928                err = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
2929
2930                switch (err) {
2931                        case EMULATE_DONE:
2932                                break;
2933                        case EMULATE_DO_MMIO:
2934                                kvm_report_emulation_failure(vcpu, "mmio");
2935                                /* TODO: Handle MMIO */
2936                                return;
2937                        default:
2938                                kvm_report_emulation_failure(vcpu, "emulation failure");
2939                                return;
2940                }
2941
2942                if (signal_pending(current))
2943                        break;
2944                if (need_resched())
2945                        schedule();
2946        }
2947
2948        local_irq_disable();
2949        preempt_disable();
2950
2951        /* Guest state should be valid now, no more emulation should be needed */
2952        vmx->emulation_required = 0;
2953}
2954
2955/*
2956 * The exit handlers return 1 if the exit was handled fully and guest execution
2957 * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
2958 * to be done to userspace and return 0.
2959 */
2960static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
2961                                      struct kvm_run *kvm_run) = {
2962        [EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
2963        [EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
2964        [EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
2965        [EXIT_REASON_NMI_WINDOW]              = handle_nmi_window,
2966        [EXIT_REASON_IO_INSTRUCTION]          = handle_io,
2967        [EXIT_REASON_CR_ACCESS]               = handle_cr,
2968        [EXIT_REASON_DR_ACCESS]               = handle_dr,
2969        [EXIT_REASON_CPUID]                   = handle_cpuid,
2970        [EXIT_REASON_MSR_READ]                = handle_rdmsr,
2971        [EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
2972        [EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
2973        [EXIT_REASON_HLT]                     = handle_halt,
2974        [EXIT_REASON_INVLPG]                  = handle_invlpg,
2975        [EXIT_REASON_VMCALL]                  = handle_vmcall,
2976        [EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
2977        [EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
2978        [EXIT_REASON_WBINVD]                  = handle_wbinvd,
2979        [EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
2980        [EXIT_REASON_EPT_VIOLATION]           = handle_ept_violation,
2981};
2982
2983static const int kvm_vmx_max_exit_handlers =
2984        ARRAY_SIZE(kvm_vmx_exit_handlers);
2985
2986/*
2987 * The guest has exited.  See if we can fix it or if we need userspace
2988 * assistance.
2989 */
2990static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2991{
2992        u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
2993        struct vcpu_vmx *vmx = to_vmx(vcpu);
2994        u32 vectoring_info = vmx->idt_vectoring_info;
2995
2996        KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu),
2997                    (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit);
2998
2999        /* Access CR3 don't cause VMExit in paging mode, so we need
3000         * to sync with guest real CR3. */
3001        if (vm_need_ept() && is_paging(vcpu)) {
3002                vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3003                ept_load_pdptrs(vcpu);
3004        }
3005
3006        if (unlikely(vmx->fail)) {
3007                kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3008                kvm_run->fail_entry.hardware_entry_failure_reason
3009                        = vmcs_read32(VM_INSTRUCTION_ERROR);
3010                return 0;
3011        }
3012
3013        if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
3014                        (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
3015                        exit_reason != EXIT_REASON_EPT_VIOLATION))
3016                printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
3017                       "exit reason is 0x%x\n", __func__, exit_reason);
3018        if (exit_reason < kvm_vmx_max_exit_handlers
3019            && kvm_vmx_exit_handlers[exit_reason])
3020                return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
3021        else {
3022                kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
3023                kvm_run->hw.hardware_exit_reason = exit_reason;
3024        }
3025        return 0;
3026}
3027
3028static void update_tpr_threshold(struct kvm_vcpu *vcpu)
3029{
3030        int max_irr, tpr;
3031
3032        if (!vm_need_tpr_shadow(vcpu->kvm))
3033                return;
3034
3035        if (!kvm_lapic_enabled(vcpu) ||
3036            ((max_irr = kvm_lapic_find_highest_irr(vcpu)) == -1)) {
3037                vmcs_write32(TPR_THRESHOLD, 0);
3038                return;
3039        }
3040
3041        tpr = (kvm_lapic_get_cr8(vcpu) & 0x0f) << 4;
3042        vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4);
3043}
3044
3045static void enable_irq_window(struct kvm_vcpu *vcpu)
3046{
3047        u32 cpu_based_vm_exec_control;
3048
3049        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
3050        cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
3051        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
3052}
3053
3054static void enable_nmi_window(struct kvm_vcpu *vcpu)
3055{
3056        u32 cpu_based_vm_exec_control;
3057
3058        if (!cpu_has_virtual_nmis())
3059                return;
3060
3061        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
3062        cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
3063        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
3064}
3065
3066static int vmx_nmi_enabled(struct kvm_vcpu *vcpu)
3067{
3068        u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
3069        return !(guest_intr & (GUEST_INTR_STATE_NMI |
3070                               GUEST_INTR_STATE_MOV_SS |
3071                               GUEST_INTR_STATE_STI));
3072}
3073
3074static int vmx_irq_enabled(struct kvm_vcpu *vcpu)
3075{
3076        u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
3077        return (!(guest_intr & (GUEST_INTR_STATE_MOV_SS |
3078                               GUEST_INTR_STATE_STI)) &&
3079                (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF));
3080}
3081
3082static void enable_intr_window(struct kvm_vcpu *vcpu)
3083{
3084        if (vcpu->arch.nmi_pending)
3085                enable_nmi_window(vcpu);
3086        else if (kvm_cpu_has_interrupt(vcpu))
3087                enable_irq_window(vcpu);
3088}
3089
3090static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3091{
3092        u32 exit_intr_info;
3093        u32 idt_vectoring_info;
3094        bool unblock_nmi;
3095        u8 vector;
3096        int type;
3097        bool idtv_info_valid;
3098        u32 error;
3099
3100        exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
3101        if (cpu_has_virtual_nmis()) {
3102                unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
3103                vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
3104                /*
3105                 * SDM 3: 25.7.1.2
3106                 * Re-set bit "block by NMI" before VM entry if vmexit caused by
3107                 * a guest IRET fault.
3108                 */
3109                if (unblock_nmi && vector != DF_VECTOR)
3110                        vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
3111                                      GUEST_INTR_STATE_NMI);
3112        }
3113
3114        idt_vectoring_info = vmx->idt_vectoring_info;
3115        idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
3116        vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
3117        type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
3118        if (vmx->vcpu.arch.nmi_injected) {
3119                /*
3120                 * SDM 3: 25.7.1.2
3121                 * Clear bit "block by NMI" before VM entry if a NMI delivery
3122                 * faulted.
3123                 */
3124                if (idtv_info_valid && type == INTR_TYPE_NMI_INTR)
3125                        vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
3126                                        GUEST_INTR_STATE_NMI);
3127                else
3128                        vmx->vcpu.arch.nmi_injected = false;
3129        }
3130        kvm_clear_exception_queue(&vmx->vcpu);
3131        if (idtv_info_valid && type == INTR_TYPE_EXCEPTION) {
3132                if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
3133                        error = vmcs_read32(IDT_VECTORING_ERROR_CODE);
3134                        kvm_queue_exception_e(&vmx->vcpu, vector, error);
3135                } else
3136                        kvm_queue_exception(&vmx->vcpu, vector);
3137                vmx->idt_vectoring_info = 0;
3138        }
3139        kvm_clear_interrupt_queue(&vmx->vcpu);
3140        if (idtv_info_valid && type == INTR_TYPE_EXT_INTR) {
3141                kvm_queue_interrupt(&vmx->vcpu, vector);
3142                vmx->idt_vectoring_info = 0;
3143        }
3144}
3145
3146static void vmx_intr_assist(struct kvm_vcpu *vcpu)
3147{
3148        update_tpr_threshold(vcpu);
3149
3150        if (cpu_has_virtual_nmis()) {
3151                if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
3152                        if (vcpu->arch.interrupt.pending) {
3153                                enable_nmi_window(vcpu);
3154                        } else if (vmx_nmi_enabled(vcpu)) {
3155                                vcpu->arch.nmi_pending = false;
3156                                vcpu->arch.nmi_injected = true;
3157                        } else {
3158                                enable_intr_window(vcpu);
3159                                return;
3160                        }
3161                }
3162                if (vcpu->arch.nmi_injected) {
3163                        vmx_inject_nmi(vcpu);
3164                        enable_intr_window(vcpu);
3165                        return;
3166                }
3167        }
3168        if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) {
3169                if (vmx_irq_enabled(vcpu))
3170                        kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
3171                else
3172                        enable_irq_window(vcpu);
3173        }
3174        if (vcpu->arch.interrupt.pending) {
3175                vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
3176                kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr);
3177        }
3178}
3179
3180/*
3181 * Failure to inject an interrupt should give us the information
3182 * in IDT_VECTORING_INFO_FIELD.  However, if the failure occurs
3183 * when fetching the interrupt redirection bitmap in the real-mode
3184 * tss, this doesn't happen.  So we do it ourselves.
3185 */
3186static void fixup_rmode_irq(struct vcpu_vmx *vmx)
3187{
3188        vmx->rmode.irq.pending = 0;
3189        if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip)
3190                return;
3191        kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip);
3192        if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
3193                vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
3194                vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR;
3195                return;
3196        }
3197        vmx->idt_vectoring_info =
3198                VECTORING_INFO_VALID_MASK
3199                | INTR_TYPE_EXT_INTR
3200                | vmx->rmode.irq.vector;
3201}
3202
/*
 * String fragments spliced into the asm template of vmx_vcpu_run():
 * R is the register-name prefix ("r"ax / "e"ax) and Q the operand-size
 * suffix for push/pop ("q" / "l").  Both are undefined again right after
 * the function.
 */
#ifdef CONFIG_X86_64
#define R "r"
#define Q "q"
#else
#define R "e"
#define Q "l"
#endif

/*
 * Enter the guest and run it until the next VM exit.
 *
 * Flushes dirty RSP/RIP into the VMCS, loads the guest's general purpose
 * registers and CR2 from vcpu->arch, executes VMLAUNCH (first entry) or
 * VMRESUME, and on return saves the guest registers back into vcpu->arch.
 * Afterwards it caches the IDT-vectoring info, recomputes
 * interrupt_window_open, reloads %ds/%es, and re-raises an NMI that caused
 * the exit before completing interrupt bookkeeping.
 *
 * When invalid-guest-state emulation is required and enabled, the guest is
 * not entered at all; handle_invalid_guest_state() runs instead.
 */
static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 intr_info;

        /* Handle invalid guest state instead of entering VMX */
        if (vmx->emulation_required && emulate_invalid_guest_state) {
                handle_invalid_guest_state(vcpu, kvm_run);
                return;
        }

        /* Write back RSP/RIP only if they were modified in the register cache */
        if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
                vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
        if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
                vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);

        /*
         * Loading guest fpu may have cleared host cr0.ts
         */
        vmcs_writel(HOST_CR0, read_cr0());

        /*
         * Operand %0 is the vmx pointer, pinned to (r|e)cx by the "c"
         * constraint; (r|e)dx holds the HOST_RSP field encoding.
         */
        asm(
                /* Store host registers */
                "push %%"R"dx; push %%"R"bp;"
                "push %%"R"cx \n\t"
                /*
                 * Skip the HOST_RSP update when the stack pointer still
                 * matches the value cached in vmx->host_rsp.
                 */
                "cmp %%"R"sp, %c[host_rsp](%0) \n\t"
                "je 1f \n\t"
                "mov %%"R"sp, %c[host_rsp](%0) \n\t"
                __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
                "1: \n\t"
                /* Check if vmlaunch or vmresume is needed */
                "cmpl $0, %c[launched](%0) \n\t"
                /* Load guest registers.  Don't clobber flags. */
                "mov %c[cr2](%0), %%"R"ax \n\t"
                "mov %%"R"ax, %%cr2 \n\t"
                "mov %c[rax](%0), %%"R"ax \n\t"
                "mov %c[rbx](%0), %%"R"bx \n\t"
                "mov %c[rdx](%0), %%"R"dx \n\t"
                "mov %c[rsi](%0), %%"R"si \n\t"
                "mov %c[rdi](%0), %%"R"di \n\t"
                "mov %c[rbp](%0), %%"R"bp \n\t"
#ifdef CONFIG_X86_64
                "mov %c[r8](%0),  %%r8  \n\t"
                "mov %c[r9](%0),  %%r9  \n\t"
                "mov %c[r10](%0), %%r10 \n\t"
                "mov %c[r11](%0), %%r11 \n\t"
                "mov %c[r12](%0), %%r12 \n\t"
                "mov %c[r13](%0), %%r13 \n\t"
                "mov %c[r14](%0), %%r14 \n\t"
                "mov %c[r15](%0), %%r15 \n\t"
#endif
                "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */

                /* Enter guest mode */
                /* flags still come from the "cmpl $0, launched" above */
                "jne .Llaunched \n\t"
                __ex(ASM_VMX_VMLAUNCH) "\n\t"
                "jmp .Lkvm_vmx_return \n\t"
                ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
                ".Lkvm_vmx_return: "
                /* Save guest registers, load host registers, keep flags */
                /*
                 * Swap guest (r|e)cx with the vmx pointer pushed last above;
                 * the guest value now sits on top of the stack.
                 */
                "xchg %0,     (%%"R"sp) \n\t"
                "mov %%"R"ax, %c[rax](%0) \n\t"
                "mov %%"R"bx, %c[rbx](%0) \n\t"
                /* copy the guest (r|e)cx from the stack slot into regs[] */
                "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t"
                "mov %%"R"dx, %c[rdx](%0) \n\t"
                "mov %%"R"si, %c[rsi](%0) \n\t"
                "mov %%"R"di, %c[rdi](%0) \n\t"
                "mov %%"R"bp, %c[rbp](%0) \n\t"
#ifdef CONFIG_X86_64
                "mov %%r8,  %c[r8](%0) \n\t"
                "mov %%r9,  %c[r9](%0) \n\t"
                "mov %%r10, %c[r10](%0) \n\t"
                "mov %%r11, %c[r11](%0) \n\t"
                "mov %%r12, %c[r12](%0) \n\t"
                "mov %%r13, %c[r13](%0) \n\t"
                "mov %%r14, %c[r14](%0) \n\t"
                "mov %%r15, %c[r15](%0) \n\t"
#endif
                "mov %%cr2, %%"R"ax   \n\t"
                "mov %%"R"ax, %c[cr2](%0) \n\t"

                /*
                 * The first pop discards the slot that held guest (r|e)cx;
                 * the next two restore the host (r|e)bp and (r|e)dx.
                 */
                "pop  %%"R"bp; pop  %%"R"bp; pop  %%"R"dx \n\t"
                /*
                 * vmx->fail = (CF || ZF); kvm_handle_exit() reports a set
                 * flag as KVM_EXIT_FAIL_ENTRY.
                 */
                "setbe %c[fail](%0) \n\t"
              : : "c"(vmx), "d"((unsigned long)HOST_RSP),
                [launched]"i"(offsetof(struct vcpu_vmx, launched)),
                [fail]"i"(offsetof(struct vcpu_vmx, fail)),
                [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
                [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
                [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
                [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
                [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
                [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
                [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
                [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
#ifdef CONFIG_X86_64
                [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
                [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
                [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
                [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
                [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
                [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
                [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
                [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
#endif
                [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
              : "cc", "memory"
                , R"bx", R"di", R"si"
#ifdef CONFIG_X86_64
                , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
#endif
              );

        /*
         * Every register except RIP and RSP is flagged in regs_avail, and
         * nothing is dirty any more.
         */
        vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
        vcpu->arch.regs_dirty = 0;

        /* Cache the vectoring info for kvm_handle_exit() and friends */
        vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
        if (vmx->rmode.irq.pending)
                fixup_rmode_irq(vmx);

        /* The window is open when neither STI nor MOV-SS blocking is active */
        vcpu->arch.interrupt_window_open =
                (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
                 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)) == 0;

        /* Reload %ds and %es with __USER_DS */
        asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
        vmx->launched = 1;

        intr_info = vmcs_read32(VM_EXIT_INTR_INFO);

        /* We need to handle NMIs before interrupts are enabled */
        /*
         * NOTE(review): 0x200 is presumably the NMI interruption type
         * (cf. INTR_TYPE_NMI_INTR) - confirm and use the named constant.
         */
        if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200 &&
            (intr_info & INTR_INFO_VALID_MASK)) {
                KVMTRACE_0D(NMI, vcpu, handler);
                asm("int $2");
        }

        vmx_complete_interrupts(vmx);
}

#undef R
#undef Q
3351
3352static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
3353{
3354        struct vcpu_vmx *vmx = to_vmx(vcpu);
3355
3356        if (vmx->vmcs) {
3357                vcpu_clear(vmx);
3358                free_vmcs(vmx->vmcs);
3359                vmx->vmcs = NULL;
3360        }
3361}
3362
3363static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
3364{
3365        struct vcpu_vmx *vmx = to_vmx(vcpu);
3366
3367        spin_lock(&vmx_vpid_lock);
3368        if (vmx->vpid != 0)
3369                __clear_bit(vmx->vpid, vmx_vpid_bitmap);
3370        spin_unlock(&vmx_vpid_lock);
3371        vmx_free_vmcs(vcpu);
3372        kfree(vmx->host_msrs);
3373        kfree(vmx->guest_msrs);
3374        kvm_vcpu_uninit(vcpu);
3375        kmem_cache_free(kvm_vcpu_cache, vmx);
3376}
3377
3378static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
3379{
3380        int err;
3381        struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
3382        int cpu;
3383
3384        if (!vmx)
3385                return ERR_PTR(-ENOMEM);
3386
3387        allocate_vpid(vmx);
3388
3389        err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
3390        if (err)
3391                goto free_vcpu;
3392
3393        vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
3394        if (!vmx->guest_msrs) {
3395                err = -ENOMEM;
3396                goto uninit_vcpu;
3397        }
3398
3399        vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
3400        if (!vmx->host_msrs)
3401                goto free_guest_msrs;
3402
3403        vmx->vmcs = alloc_vmcs();
3404        if (!vmx->vmcs)
3405                goto free_msrs;
3406
3407        vmcs_clear(vmx->vmcs);
3408
3409        cpu = get_cpu();
3410        vmx_vcpu_load(&vmx->vcpu, cpu);
3411        err = vmx_vcpu_setup(vmx);
3412        vmx_vcpu_put(&vmx->vcpu);
3413        put_cpu();
3414        if (err)
3415                goto free_vmcs;
3416        if (vm_need_virtualize_apic_accesses(kvm))
3417                if (alloc_apic_access_page(kvm) != 0)
3418                        goto free_vmcs;
3419
3420        if (vm_need_ept())
3421                if (alloc_identity_pagetable(kvm) != 0)
3422                        goto free_vmcs;
3423
3424        return &vmx->vcpu;
3425
3426free_vmcs:
3427        free_vmcs(vmx->vmcs);
3428free_msrs:
3429        kfree(vmx->host_msrs);
3430free_guest_msrs:
3431        kfree(vmx->guest_msrs);
3432uninit_vcpu:
3433        kvm_vcpu_uninit(&vmx->vcpu);
3434free_vcpu:
3435        kmem_cache_free(kvm_vcpu_cache, vmx);
3436        return ERR_PTR(err);
3437}
3438
3439static void __init vmx_check_processor_compat(void *rtn)
3440{
3441        struct vmcs_config vmcs_conf;
3442
3443        *(int *)rtn = 0;
3444        if (setup_vmcs_config(&vmcs_conf) < 0)
3445                *(int *)rtn = -EIO;
3446        if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
3447                printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
3448                                smp_processor_id());
3449                *(int *)rtn = -EIO;
3450        }
3451}
3452
3453static int get_ept_level(void)
3454{
3455        return VMX_EPT_DEFAULT_GAW + 1;
3456}
3457
3458static struct kvm_x86_ops vmx_x86_ops = {
3459        .cpu_has_kvm_support = cpu_has_kvm_support,
3460        .disabled_by_bios = vmx_disabled_by_bios,
3461        .hardware_setup = hardware_setup,
3462        .hardware_unsetup = hardware_unsetup,
3463        .check_processor_compatibility = vmx_check_processor_compat,
3464        .hardware_enable = hardware_enable,
3465        .hardware_disable = hardware_disable,
3466        .cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses,
3467
3468        .vcpu_create = vmx_create_vcpu,
3469        .vcpu_free = vmx_free_vcpu,
3470        .vcpu_reset = vmx_vcpu_reset,
3471
3472        .prepare_guest_switch = vmx_save_host_state,
3473        .vcpu_load = vmx_vcpu_load,
3474        .vcpu_put = vmx_vcpu_put,
3475
3476        .set_guest_debug = set_guest_debug,
3477        .guest_debug_pre = kvm_guest_debug_pre,
3478        .get_msr = vmx_get_msr,
3479        .set_msr = vmx_set_msr,
3480        .get_segment_base = vmx_get_segment_base,
3481        .get_segment = vmx_get_segment,
3482        .set_segment = vmx_set_segment,
3483        .get_cpl = vmx_get_cpl,
3484        .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
3485        .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
3486        .set_cr0 = vmx_set_cr0,
3487        .set_cr3 = vmx_set_cr3,
3488        .set_cr4 = vmx_set_cr4,
3489        .set_efer = vmx_set_efer,
3490        .get_idt = vmx_get_idt,
3491        .set_idt = vmx_set_idt,
3492        .get_gdt = vmx_get_gdt,
3493        .set_gdt = vmx_set_gdt,
3494        .cache_reg = vmx_cache_reg,
3495        .get_rflags = vmx_get_rflags,
3496        .set_rflags = vmx_set_rflags,
3497
3498        .tlb_flush = vmx_flush_tlb,
3499
3500        .run = vmx_vcpu_run,
3501        .handle_exit = kvm_handle_exit,
3502        .skip_emulated_instruction = skip_emulated_instruction,
3503        .patch_hypercall = vmx_patch_hypercall,
3504        .get_irq = vmx_get_irq,
3505        .set_irq = vmx_inject_irq,
3506        .queue_exception = vmx_queue_exception,
3507        .exception_injected = vmx_exception_injected,
3508        .inject_pending_irq = vmx_intr_assist,
3509        .inject_pending_vectors = do_interrupt_requests,
3510
3511        .set_tss_addr = vmx_set_tss_addr,
3512        .get_tdp_level = get_ept_level,
3513};
3514
3515static int __init vmx_init(void)
3516{
3517        void *va;
3518        int r;
3519
3520        vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
3521        if (!vmx_io_bitmap_a)
3522                return -ENOMEM;
3523
3524        vmx_io_bitmap_b = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
3525        if (!vmx_io_bitmap_b) {
3526                r = -ENOMEM;
3527                goto out;
3528        }
3529
3530        vmx_msr_bitmap = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
3531        if (!vmx_msr_bitmap) {
3532                r = -ENOMEM;
3533                goto out1;
3534        }
3535
3536        /*
3537         * Allow direct access to the PC debug port (it is often used for I/O
3538         * delays, but the vmexits simply slow things down).
3539         */
3540        va = kmap(vmx_io_bitmap_a);
3541        memset(va, 0xff, PAGE_SIZE);
3542        clear_bit(0x80, va);
3543        kunmap(vmx_io_bitmap_a);
3544
3545        va = kmap(vmx_io_bitmap_b);
3546        memset(va, 0xff, PAGE_SIZE);
3547        kunmap(vmx_io_bitmap_b);
3548
3549        va = kmap(vmx_msr_bitmap);
3550        memset(va, 0xff, PAGE_SIZE);
3551        kunmap(vmx_msr_bitmap);
3552
3553        set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
3554
3555        r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
3556        if (r)
3557                goto out2;
3558
3559        vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_FS_BASE);
3560        vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_GS_BASE);
3561        vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_CS);
3562        vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP);
3563        vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP);
3564
3565        if (vm_need_ept()) {
3566                bypass_guest_pf = 0;
3567                kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
3568                        VMX_EPT_WRITABLE_MASK |
3569                        VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT |
3570                        VMX_EPT_IGMT_BIT);
3571                kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
3572                                VMX_EPT_EXECUTABLE_MASK);
3573                kvm_enable_tdp();
3574        } else
3575                kvm_disable_tdp();
3576
3577        if (bypass_guest_pf)
3578                kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
3579
3580        ept_sync_global();
3581
3582        return 0;
3583
3584out2:
3585        __free_page(vmx_msr_bitmap);
3586out1:
3587        __free_page(vmx_io_bitmap_b);
3588out:
3589        __free_page(vmx_io_bitmap_a);
3590        return r;
3591}
3592
3593static void __exit vmx_exit(void)
3594{
3595        __free_page(vmx_msr_bitmap);
3596        __free_page(vmx_io_bitmap_b);
3597        __free_page(vmx_io_bitmap_a);
3598
3599        kvm_exit();
3600}
3601
/* Register vmx_init() and vmx_exit() as this module's load and unload handlers. */
3602module_init(vmx_init)
3603module_exit(vmx_exit)
3604