linux/arch/x86/kvm/mmu.c
   1/*
   2 * Kernel-based Virtual Machine driver for Linux
   3 *
   4 * This module enables machines with Intel VT-x extensions to run virtual
   5 * machines without emulation or binary translation.
   6 *
   7 * MMU support
   8 *
   9 * Copyright (C) 2006 Qumranet, Inc.
  10 *
  11 * Authors:
  12 *   Yaniv Kamay  <yaniv@qumranet.com>
  13 *   Avi Kivity   <avi@qumranet.com>
  14 *
  15 * This work is licensed under the terms of the GNU GPL, version 2.  See
  16 * the COPYING file in the top-level directory.
  17 *
  18 */
  19
  20#include "vmx.h"
  21#include "mmu.h"
  22
  23#include <linux/kvm_host.h>
  24#include <linux/types.h>
  25#include <linux/string.h>
  26#include <linux/mm.h>
  27#include <linux/highmem.h>
  28#include <linux/module.h>
  29#include <linux/swap.h>
  30#include <linux/hugetlb.h>
  31#include <linux/compiler.h>
  32
  33#include <asm/page.h>
  34#include <asm/cmpxchg.h>
  35#include <asm/io.h>
  36
  37/*
   38 * When set to true, this variable enables Two-Dimensional Paging, in
   39 * which the hardware walks two page tables:
   40 * 1. the guest-virtual to guest-physical table
   41 * 2. while doing 1., the guest-physical to host-physical table
   42 * If the hardware supports that, we don't need to do shadow paging.
  43 */
  44bool tdp_enabled = false;
  45
  46#undef MMU_DEBUG
  47
  48#undef AUDIT
  49
  50#ifdef AUDIT
  51static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
  52#else
  53static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
  54#endif
  55
  56#ifdef MMU_DEBUG
  57
  58#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
  59#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
  60
  61#else
  62
  63#define pgprintk(x...) do { } while (0)
  64#define rmap_printk(x...) do { } while (0)
  65
  66#endif
  67
  68#if defined(MMU_DEBUG) || defined(AUDIT)
  69static int dbg = 0;
  70module_param(dbg, bool, 0644);
  71#endif
  72
  73#ifndef MMU_DEBUG
  74#define ASSERT(x) do { } while (0)
  75#else
  76#define ASSERT(x)                                                       \
  77        if (!(x)) {                                                     \
  78                printk(KERN_WARNING "assertion failed %s:%d: %s\n",     \
  79                       __FILE__, __LINE__, #x);                         \
  80        }
  81#endif
  82
  83#define PT_FIRST_AVAIL_BITS_SHIFT 9
  84#define PT64_SECOND_AVAIL_BITS_SHIFT 52
  85
  86#define VALID_PAGE(x) ((x) != INVALID_PAGE)
  87
  88#define PT64_LEVEL_BITS 9
  89
  90#define PT64_LEVEL_SHIFT(level) \
  91                (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
  92
  93#define PT64_LEVEL_MASK(level) \
  94                (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
  95
  96#define PT64_INDEX(address, level)\
  97        (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
  98
  99
 100#define PT32_LEVEL_BITS 10
 101
 102#define PT32_LEVEL_SHIFT(level) \
 103                (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
 104
 105#define PT32_LEVEL_MASK(level) \
 106                (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
 107
 108#define PT32_INDEX(address, level)\
 109        (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
 110
 111
 112#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
 113#define PT64_DIR_BASE_ADDR_MASK \
 114        (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
 115
 116#define PT32_BASE_ADDR_MASK PAGE_MASK
 117#define PT32_DIR_BASE_ADDR_MASK \
 118        (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
 119
 120#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
 121                        | PT64_NX_MASK)
 122
 123#define PFERR_PRESENT_MASK (1U << 0)
 124#define PFERR_WRITE_MASK (1U << 1)
 125#define PFERR_USER_MASK (1U << 2)
 126#define PFERR_FETCH_MASK (1U << 4)
 127
 128#define PT_DIRECTORY_LEVEL 2
 129#define PT_PAGE_TABLE_LEVEL 1
 130
 131#define RMAP_EXT 4
 132
 133#define ACC_EXEC_MASK    1
 134#define ACC_WRITE_MASK   PT_WRITABLE_MASK
 135#define ACC_USER_MASK    PT_USER_MASK
 136#define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
 137
 138struct kvm_rmap_desc {
 139        u64 *shadow_ptes[RMAP_EXT];
 140        struct kvm_rmap_desc *more;
 141};
 142
 143static struct kmem_cache *pte_chain_cache;
 144static struct kmem_cache *rmap_desc_cache;
 145static struct kmem_cache *mmu_page_header_cache;
 146
 147static u64 __read_mostly shadow_trap_nonpresent_pte;
 148static u64 __read_mostly shadow_notrap_nonpresent_pte;
 149static u64 __read_mostly shadow_base_present_pte;
 150static u64 __read_mostly shadow_nx_mask;
  151static u64 __read_mostly shadow_x_mask; /* mutually exclusive with nx_mask */
 152static u64 __read_mostly shadow_user_mask;
 153static u64 __read_mostly shadow_accessed_mask;
 154static u64 __read_mostly shadow_dirty_mask;
 155
 156void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
 157{
 158        shadow_trap_nonpresent_pte = trap_pte;
 159        shadow_notrap_nonpresent_pte = notrap_pte;
 160}
 161EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
 162
 163void kvm_mmu_set_base_ptes(u64 base_pte)
 164{
 165        shadow_base_present_pte = base_pte;
 166}
 167EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
 168
 169void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 170                u64 dirty_mask, u64 nx_mask, u64 x_mask)
 171{
 172        shadow_user_mask = user_mask;
 173        shadow_accessed_mask = accessed_mask;
 174        shadow_dirty_mask = dirty_mask;
 175        shadow_nx_mask = nx_mask;
 176        shadow_x_mask = x_mask;
 177}
 178EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 179
 180static int is_write_protection(struct kvm_vcpu *vcpu)
 181{
 182        return vcpu->arch.cr0 & X86_CR0_WP;
 183}
 184
 185static int is_cpuid_PSE36(void)
 186{
 187        return 1;
 188}
 189
 190static int is_nx(struct kvm_vcpu *vcpu)
 191{
 192        return vcpu->arch.shadow_efer & EFER_NX;
 193}
 194
 195static int is_present_pte(unsigned long pte)
 196{
 197        return pte & PT_PRESENT_MASK;
 198}
 199
 200static int is_shadow_present_pte(u64 pte)
 201{
 202        return pte != shadow_trap_nonpresent_pte
 203                && pte != shadow_notrap_nonpresent_pte;
 204}
 205
 206static int is_large_pte(u64 pte)
 207{
 208        return pte & PT_PAGE_SIZE_MASK;
 209}
 210
 211static int is_writeble_pte(unsigned long pte)
 212{
 213        return pte & PT_WRITABLE_MASK;
 214}
 215
 216static int is_dirty_pte(unsigned long pte)
 217{
 218        return pte & shadow_dirty_mask;
 219}
 220
 221static int is_rmap_pte(u64 pte)
 222{
 223        return is_shadow_present_pte(pte);
 224}
 225
 226static pfn_t spte_to_pfn(u64 pte)
 227{
 228        return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
 229}
 230
 231static gfn_t pse36_gfn_delta(u32 gpte)
 232{
 233        int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
 234
 235        return (gpte & PT32_DIR_PSE36_MASK) << shift;
 236}
 237
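/*
 * Write a shadow pte as a single atomic 64-bit store (set_64bit), so that
 * a concurrent hardware walk or another vcpu never observes a torn entry.
 */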
 238static void set_shadow_pte(u64 *sptep, u64 spte)
 239{
 240#ifdef CONFIG_X86_64
 241        set_64bit((unsigned long *)sptep, spte);
 242#else
 243        set_64bit((unsigned long long *)sptep, spte);
 244#endif
 245}
 246
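/*
 * The per-vcpu memory caches below are pre-filled with GFP_KERNEL
 * allocations while sleeping is still allowed; mmu_memory_cache_alloc()
 * then hands out objects without allocating, so the fault paths never
 * have to allocate memory under mmu_lock.
 */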
 247static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 248                                  struct kmem_cache *base_cache, int min)
 249{
 250        void *obj;
 251
 252        if (cache->nobjs >= min)
 253                return 0;
 254        while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
 255                obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
 256                if (!obj)
 257                        return -ENOMEM;
 258                cache->objects[cache->nobjs++] = obj;
 259        }
 260        return 0;
 261}
 262
 263static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
 264{
 265        while (mc->nobjs)
 266                kfree(mc->objects[--mc->nobjs]);
 267}
 268
 269static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
 270                                       int min)
 271{
 272        struct page *page;
 273
 274        if (cache->nobjs >= min)
 275                return 0;
 276        while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
 277                page = alloc_page(GFP_KERNEL);
 278                if (!page)
 279                        return -ENOMEM;
 280                set_page_private(page, 0);
 281                cache->objects[cache->nobjs++] = page_address(page);
 282        }
 283        return 0;
 284}
 285
 286static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
 287{
 288        while (mc->nobjs)
 289                free_page((unsigned long)mc->objects[--mc->nobjs]);
 290}
 291
 292static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
 293{
 294        int r;
 295
 296        r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
 297                                   pte_chain_cache, 4);
 298        if (r)
 299                goto out;
 300        r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
 301                                   rmap_desc_cache, 4);
 302        if (r)
 303                goto out;
 304        r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
 305        if (r)
 306                goto out;
 307        r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
 308                                   mmu_page_header_cache, 4);
 309out:
 310        return r;
 311}
 312
 313static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
 314{
 315        mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache);
 316        mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache);
 317        mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
 318        mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
 319}
 320
 321static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
 322                                    size_t size)
 323{
 324        void *p;
 325
 326        BUG_ON(!mc->nobjs);
 327        p = mc->objects[--mc->nobjs];
 328        memset(p, 0, size);
 329        return p;
 330}
 331
 332static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
 333{
 334        return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
 335                                      sizeof(struct kvm_pte_chain));
 336}
 337
 338static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
 339{
 340        kfree(pc);
 341}
 342
 343static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
 344{
 345        return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
 346                                      sizeof(struct kvm_rmap_desc));
 347}
 348
 349static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
 350{
 351        kfree(rd);
 352}
 353
 354/*
 355 * Return the pointer to the largepage write count for a given
 356 * gfn, handling slots that are not large page aligned.
 357 */
 358static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot)
 359{
 360        unsigned long idx;
 361
 362        idx = (gfn / KVM_PAGES_PER_HPAGE) -
 363              (slot->base_gfn / KVM_PAGES_PER_HPAGE);
 364        return &slot->lpage_info[idx].write_count;
 365}
 366
 367static void account_shadowed(struct kvm *kvm, gfn_t gfn)
 368{
 369        int *write_count;
 370
 371        write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
 372        *write_count += 1;
 373}
 374
 375static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
 376{
 377        int *write_count;
 378
 379        write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
 380        *write_count -= 1;
 381        WARN_ON(*write_count < 0);
 382}
 383
 384static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
 385{
 386        struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
 387        int *largepage_idx;
 388
 389        if (slot) {
 390                largepage_idx = slot_largepage_idx(gfn, slot);
 391                return *largepage_idx;
 392        }
 393
 394        return 1;
 395}
 396
 397static int host_largepage_backed(struct kvm *kvm, gfn_t gfn)
 398{
 399        struct vm_area_struct *vma;
 400        unsigned long addr;
 401
 402        addr = gfn_to_hva(kvm, gfn);
 403        if (kvm_is_error_hva(addr))
 404                return 0;
 405
 406        vma = find_vma(current->mm, addr);
 407        if (vma && is_vm_hugetlb_page(vma))
 408                return 1;
 409
 410        return 0;
 411}
 412
 413static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
 414{
 415        struct kvm_memory_slot *slot;
 416
 417        if (has_wrprotected_page(vcpu->kvm, large_gfn))
 418                return 0;
 419
 420        if (!host_largepage_backed(vcpu->kvm, large_gfn))
 421                return 0;
 422
 423        slot = gfn_to_memslot(vcpu->kvm, large_gfn);
 424        if (slot && slot->dirty_bitmap)
 425                return 0;
 426
 427        return 1;
 428}
 429
 430/*
 431 * Take gfn and return the reverse mapping to it.
  432 * Note: gfn must be unaliased before this function gets called
 433 */
 434
 435static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage)
 436{
 437        struct kvm_memory_slot *slot;
 438        unsigned long idx;
 439
 440        slot = gfn_to_memslot(kvm, gfn);
 441        if (!lpage)
 442                return &slot->rmap[gfn - slot->base_gfn];
 443
 444        idx = (gfn / KVM_PAGES_PER_HPAGE) -
 445              (slot->base_gfn / KVM_PAGES_PER_HPAGE);
 446
 447        return &slot->lpage_info[idx].rmap_pde;
 448}
 449
 450/*
 451 * Reverse mapping data structures:
 452 *
  453 * If rmapp bit zero is zero, then rmapp points to the shadow page table entry
 454 * that points to page_address(page).
 455 *
  456 * If rmapp bit zero is one, then (rmapp & ~1) points to a struct kvm_rmap_desc
 457 * containing more mappings.
 458 */
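/*
 * Illustrative example of the encoding above: after the first mapping is
 * added, *rmapp == (unsigned long)spte (bit zero clear, since sptes are
 * 8-byte aligned); once a second spte is added, *rmapp becomes
 * (unsigned long)desc | 1 and the sptes live in desc->shadow_ptes[].
 */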
 459static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
 460{
 461        struct kvm_mmu_page *sp;
 462        struct kvm_rmap_desc *desc;
 463        unsigned long *rmapp;
 464        int i;
 465
 466        if (!is_rmap_pte(*spte))
 467                return;
 468        gfn = unalias_gfn(vcpu->kvm, gfn);
 469        sp = page_header(__pa(spte));
 470        sp->gfns[spte - sp->spt] = gfn;
 471        rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage);
 472        if (!*rmapp) {
 473                rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
 474                *rmapp = (unsigned long)spte;
 475        } else if (!(*rmapp & 1)) {
 476                rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
 477                desc = mmu_alloc_rmap_desc(vcpu);
 478                desc->shadow_ptes[0] = (u64 *)*rmapp;
 479                desc->shadow_ptes[1] = spte;
 480                *rmapp = (unsigned long)desc | 1;
 481        } else {
 482                rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
 483                desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
 484                while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
 485                        desc = desc->more;
 486                if (desc->shadow_ptes[RMAP_EXT-1]) {
 487                        desc->more = mmu_alloc_rmap_desc(vcpu);
 488                        desc = desc->more;
 489                }
 490                for (i = 0; desc->shadow_ptes[i]; ++i)
 491                        ;
 492                desc->shadow_ptes[i] = spte;
 493        }
 494}
 495
 496static void rmap_desc_remove_entry(unsigned long *rmapp,
 497                                   struct kvm_rmap_desc *desc,
 498                                   int i,
 499                                   struct kvm_rmap_desc *prev_desc)
 500{
 501        int j;
 502
 503        for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
 504                ;
 505        desc->shadow_ptes[i] = desc->shadow_ptes[j];
 506        desc->shadow_ptes[j] = NULL;
 507        if (j != 0)
 508                return;
 509        if (!prev_desc && !desc->more)
 510                *rmapp = (unsigned long)desc->shadow_ptes[0];
 511        else
 512                if (prev_desc)
 513                        prev_desc->more = desc->more;
 514                else
 515                        *rmapp = (unsigned long)desc->more | 1;
 516        mmu_free_rmap_desc(desc);
 517}
 518
 519static void rmap_remove(struct kvm *kvm, u64 *spte)
 520{
 521        struct kvm_rmap_desc *desc;
 522        struct kvm_rmap_desc *prev_desc;
 523        struct kvm_mmu_page *sp;
 524        pfn_t pfn;
 525        unsigned long *rmapp;
 526        int i;
 527
 528        if (!is_rmap_pte(*spte))
 529                return;
 530        sp = page_header(__pa(spte));
 531        pfn = spte_to_pfn(*spte);
 532        if (*spte & shadow_accessed_mask)
 533                kvm_set_pfn_accessed(pfn);
 534        if (is_writeble_pte(*spte))
 535                kvm_release_pfn_dirty(pfn);
 536        else
 537                kvm_release_pfn_clean(pfn);
 538        rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], is_large_pte(*spte));
 539        if (!*rmapp) {
 540                printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
 541                BUG();
 542        } else if (!(*rmapp & 1)) {
 543                rmap_printk("rmap_remove:  %p %llx 1->0\n", spte, *spte);
 544                if ((u64 *)*rmapp != spte) {
 545                        printk(KERN_ERR "rmap_remove:  %p %llx 1->BUG\n",
 546                               spte, *spte);
 547                        BUG();
 548                }
 549                *rmapp = 0;
 550        } else {
 551                rmap_printk("rmap_remove:  %p %llx many->many\n", spte, *spte);
 552                desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
 553                prev_desc = NULL;
 554                while (desc) {
 555                        for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
 556                                if (desc->shadow_ptes[i] == spte) {
 557                                        rmap_desc_remove_entry(rmapp,
 558                                                               desc, i,
 559                                                               prev_desc);
 560                                        return;
 561                                }
 562                        prev_desc = desc;
 563                        desc = desc->more;
 564                }
 565                BUG();
 566        }
 567}
 568
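/*
 * Iterate over an rmap chain: pass spte == NULL to get the first shadow
 * pte for this rmapp, or a previously returned spte to get the next one.
 * Returns NULL when the chain is exhausted.
 */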
 569static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
 570{
 571        struct kvm_rmap_desc *desc;
 572        struct kvm_rmap_desc *prev_desc;
 573        u64 *prev_spte;
 574        int i;
 575
 576        if (!*rmapp)
 577                return NULL;
 578        else if (!(*rmapp & 1)) {
 579                if (!spte)
 580                        return (u64 *)*rmapp;
 581                return NULL;
 582        }
 583        desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
 584        prev_desc = NULL;
 585        prev_spte = NULL;
 586        while (desc) {
 587                for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) {
 588                        if (prev_spte == spte)
 589                                return desc->shadow_ptes[i];
 590                        prev_spte = desc->shadow_ptes[i];
 591                }
 592                desc = desc->more;
 593        }
 594        return NULL;
 595}
 596
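/*
 * Remove write access from every shadow pte that maps 'gfn'.  Large-page
 * sptes are zapped outright rather than made read-only.  Remote TLBs are
 * flushed if any spte was modified.
 */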
 597static void rmap_write_protect(struct kvm *kvm, u64 gfn)
 598{
 599        unsigned long *rmapp;
 600        u64 *spte;
 601        int write_protected = 0;
 602
 603        gfn = unalias_gfn(kvm, gfn);
 604        rmapp = gfn_to_rmap(kvm, gfn, 0);
 605
 606        spte = rmap_next(kvm, rmapp, NULL);
 607        while (spte) {
 608                BUG_ON(!spte);
 609                BUG_ON(!(*spte & PT_PRESENT_MASK));
 610                rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
 611                if (is_writeble_pte(*spte)) {
 612                        set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
 613                        write_protected = 1;
 614                }
 615                spte = rmap_next(kvm, rmapp, spte);
 616        }
 617        if (write_protected) {
 618                pfn_t pfn;
 619
 620                spte = rmap_next(kvm, rmapp, NULL);
 621                pfn = spte_to_pfn(*spte);
 622                kvm_set_pfn_dirty(pfn);
 623        }
 624
 625        /* check for huge page mappings */
 626        rmapp = gfn_to_rmap(kvm, gfn, 1);
 627        spte = rmap_next(kvm, rmapp, NULL);
 628        while (spte) {
 629                BUG_ON(!spte);
 630                BUG_ON(!(*spte & PT_PRESENT_MASK));
 631                BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
 632                pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
 633                if (is_writeble_pte(*spte)) {
 634                        rmap_remove(kvm, spte);
 635                        --kvm->stat.lpages;
 636                        set_shadow_pte(spte, shadow_trap_nonpresent_pte);
 637                        spte = NULL;
 638                        write_protected = 1;
 639                }
 640                spte = rmap_next(kvm, rmapp, spte);
 641        }
 642
 643        if (write_protected)
 644                kvm_flush_remote_tlbs(kvm);
 645
 646        account_shadowed(kvm, gfn);
 647}
 648
 649static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
 650{
 651        u64 *spte;
 652        int need_tlb_flush = 0;
 653
 654        while ((spte = rmap_next(kvm, rmapp, NULL))) {
 655                BUG_ON(!(*spte & PT_PRESENT_MASK));
 656                rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
 657                rmap_remove(kvm, spte);
 658                set_shadow_pte(spte, shadow_trap_nonpresent_pte);
 659                need_tlb_flush = 1;
 660        }
 661        return need_tlb_flush;
 662}
 663
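/*
 * Common helper for kvm_unmap_hva() and kvm_age_hva() below: find the
 * memslot containing 'hva' and apply 'handler' to both its 4K rmap entry
 * and the corresponding large-page rmap_pde.
 */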
 664static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
 665                          int (*handler)(struct kvm *kvm, unsigned long *rmapp))
 666{
 667        int i;
 668        int retval = 0;
 669
 670        /*
  671         * If mmap_sem isn't taken, we can look at the memslots with only
 672         * the mmu_lock by skipping over the slots with userspace_addr == 0.
 673         */
 674        for (i = 0; i < kvm->nmemslots; i++) {
 675                struct kvm_memory_slot *memslot = &kvm->memslots[i];
 676                unsigned long start = memslot->userspace_addr;
 677                unsigned long end;
 678
 679                /* mmu_lock protects userspace_addr */
 680                if (!start)
 681                        continue;
 682
 683                end = start + (memslot->npages << PAGE_SHIFT);
 684                if (hva >= start && hva < end) {
 685                        gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
 686                        retval |= handler(kvm, &memslot->rmap[gfn_offset]);
 687                        retval |= handler(kvm,
 688                                          &memslot->lpage_info[
 689                                                  gfn_offset /
 690                                                  KVM_PAGES_PER_HPAGE].rmap_pde);
 691                }
 692        }
 693
 694        return retval;
 695}
 696
 697int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
 698{
 699        return kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
 700}
 701
 702static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
 703{
 704        u64 *spte;
 705        int young = 0;
 706
 707        /* always return old for EPT */
 708        if (!shadow_accessed_mask)
 709                return 0;
 710
 711        spte = rmap_next(kvm, rmapp, NULL);
 712        while (spte) {
 713                int _young;
 714                u64 _spte = *spte;
 715                BUG_ON(!(_spte & PT_PRESENT_MASK));
 716                _young = _spte & PT_ACCESSED_MASK;
 717                if (_young) {
 718                        young = 1;
 719                        clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
 720                }
 721                spte = rmap_next(kvm, rmapp, spte);
 722        }
 723        return young;
 724}
 725
 726int kvm_age_hva(struct kvm *kvm, unsigned long hva)
 727{
 728        return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
 729}
 730
 731#ifdef MMU_DEBUG
 732static int is_empty_shadow_page(u64 *spt)
 733{
 734        u64 *pos;
 735        u64 *end;
 736
 737        for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
 738                if (is_shadow_present_pte(*pos)) {
 739                        printk(KERN_ERR "%s: %p %llx\n", __func__,
 740                               pos, *pos);
 741                        return 0;
 742                }
 743        return 1;
 744}
 745#endif
 746
 747static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 748{
 749        ASSERT(is_empty_shadow_page(sp->spt));
 750        list_del(&sp->link);
 751        __free_page(virt_to_page(sp->spt));
 752        __free_page(virt_to_page(sp->gfns));
 753        kfree(sp);
 754        ++kvm->arch.n_free_mmu_pages;
 755}
 756
 757static unsigned kvm_page_table_hashfn(gfn_t gfn)
 758{
 759        return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
 760}
 761
 762static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
 763                                               u64 *parent_pte)
 764{
 765        struct kvm_mmu_page *sp;
 766
 767        sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
 768        sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
 769        sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
 770        set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
 771        list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
 772        ASSERT(is_empty_shadow_page(sp->spt));
 773        sp->slot_bitmap = 0;
 774        sp->multimapped = 0;
 775        sp->parent_pte = parent_pte;
 776        --vcpu->kvm->arch.n_free_mmu_pages;
 777        return sp;
 778}
 779
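/*
 * A shadow page can be pointed to by several parent sptes.  A single
 * parent is kept inline in sp->parent_pte; once a second parent appears
 * the page becomes "multimapped" and the parents are tracked in a list
 * of kvm_pte_chain blocks.
 */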
 780static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
 781                                    struct kvm_mmu_page *sp, u64 *parent_pte)
 782{
 783        struct kvm_pte_chain *pte_chain;
 784        struct hlist_node *node;
 785        int i;
 786
 787        if (!parent_pte)
 788                return;
 789        if (!sp->multimapped) {
 790                u64 *old = sp->parent_pte;
 791
 792                if (!old) {
 793                        sp->parent_pte = parent_pte;
 794                        return;
 795                }
 796                sp->multimapped = 1;
 797                pte_chain = mmu_alloc_pte_chain(vcpu);
 798                INIT_HLIST_HEAD(&sp->parent_ptes);
 799                hlist_add_head(&pte_chain->link, &sp->parent_ptes);
 800                pte_chain->parent_ptes[0] = old;
 801        }
 802        hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
 803                if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
 804                        continue;
 805                for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
 806                        if (!pte_chain->parent_ptes[i]) {
 807                                pte_chain->parent_ptes[i] = parent_pte;
 808                                return;
 809                        }
 810        }
 811        pte_chain = mmu_alloc_pte_chain(vcpu);
 812        BUG_ON(!pte_chain);
 813        hlist_add_head(&pte_chain->link, &sp->parent_ptes);
 814        pte_chain->parent_ptes[0] = parent_pte;
 815}
 816
 817static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
 818                                       u64 *parent_pte)
 819{
 820        struct kvm_pte_chain *pte_chain;
 821        struct hlist_node *node;
 822        int i;
 823
 824        if (!sp->multimapped) {
 825                BUG_ON(sp->parent_pte != parent_pte);
 826                sp->parent_pte = NULL;
 827                return;
 828        }
 829        hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
 830                for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
 831                        if (!pte_chain->parent_ptes[i])
 832                                break;
 833                        if (pte_chain->parent_ptes[i] != parent_pte)
 834                                continue;
 835                        while (i + 1 < NR_PTE_CHAIN_ENTRIES
 836                                && pte_chain->parent_ptes[i + 1]) {
 837                                pte_chain->parent_ptes[i]
 838                                        = pte_chain->parent_ptes[i + 1];
 839                                ++i;
 840                        }
 841                        pte_chain->parent_ptes[i] = NULL;
 842                        if (i == 0) {
 843                                hlist_del(&pte_chain->link);
 844                                mmu_free_pte_chain(pte_chain);
 845                                if (hlist_empty(&sp->parent_ptes)) {
 846                                        sp->multimapped = 0;
 847                                        sp->parent_pte = NULL;
 848                                }
 849                        }
 850                        return;
 851                }
 852        BUG();
 853}
 854
 855static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
 856                                    struct kvm_mmu_page *sp)
 857{
 858        int i;
 859
 860        for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
 861                sp->spt[i] = shadow_trap_nonpresent_pte;
 862}
 863
 864static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
 865{
 866        unsigned index;
 867        struct hlist_head *bucket;
 868        struct kvm_mmu_page *sp;
 869        struct hlist_node *node;
 870
 871        pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
 872        index = kvm_page_table_hashfn(gfn);
 873        bucket = &kvm->arch.mmu_page_hash[index];
 874        hlist_for_each_entry(sp, node, bucket, hash_link)
 875                if (sp->gfn == gfn && !sp->role.metaphysical
 876                    && !sp->role.invalid) {
 877                        pgprintk("%s: found role %x\n",
 878                                 __func__, sp->role.word);
 879                        return sp;
 880                }
 881        return NULL;
 882}
 883
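/*
 * Look up the shadow page for (gfn, role) in the hash table, creating it
 * if none exists.  Newly created non-metaphysical pages have their gfn
 * write-protected so that guest modifications of the page table can be
 * intercepted.
 */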
 884static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 885                                             gfn_t gfn,
 886                                             gva_t gaddr,
 887                                             unsigned level,
 888                                             int metaphysical,
 889                                             unsigned access,
 890                                             u64 *parent_pte)
 891{
 892        union kvm_mmu_page_role role;
 893        unsigned index;
 894        unsigned quadrant;
 895        struct hlist_head *bucket;
 896        struct kvm_mmu_page *sp;
 897        struct hlist_node *node;
 898
 899        role.word = 0;
 900        role.glevels = vcpu->arch.mmu.root_level;
 901        role.level = level;
 902        role.metaphysical = metaphysical;
 903        role.access = access;
 904        if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
 905                quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
 906                quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
 907                role.quadrant = quadrant;
 908        }
 909        pgprintk("%s: looking gfn %lx role %x\n", __func__,
 910                 gfn, role.word);
 911        index = kvm_page_table_hashfn(gfn);
 912        bucket = &vcpu->kvm->arch.mmu_page_hash[index];
 913        hlist_for_each_entry(sp, node, bucket, hash_link)
 914                if (sp->gfn == gfn && sp->role.word == role.word) {
 915                        mmu_page_add_parent_pte(vcpu, sp, parent_pte);
 916                        pgprintk("%s: found\n", __func__);
 917                        return sp;
 918                }
 919        ++vcpu->kvm->stat.mmu_cache_miss;
 920        sp = kvm_mmu_alloc_page(vcpu, parent_pte);
 921        if (!sp)
 922                return sp;
 923        pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word);
 924        sp->gfn = gfn;
 925        sp->role = role;
 926        hlist_add_head(&sp->hash_link, bucket);
 927        if (!metaphysical)
 928                rmap_write_protect(vcpu->kvm, gfn);
 929        if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
 930                vcpu->arch.mmu.prefetch_page(vcpu, sp);
 931        else
 932                nonpaging_prefetch_page(vcpu, sp);
 933        return sp;
 934}
 935
 936static void kvm_mmu_page_unlink_children(struct kvm *kvm,
 937                                         struct kvm_mmu_page *sp)
 938{
 939        unsigned i;
 940        u64 *pt;
 941        u64 ent;
 942
 943        pt = sp->spt;
 944
 945        if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
 946                for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
 947                        if (is_shadow_present_pte(pt[i]))
 948                                rmap_remove(kvm, &pt[i]);
 949                        pt[i] = shadow_trap_nonpresent_pte;
 950                }
 951                kvm_flush_remote_tlbs(kvm);
 952                return;
 953        }
 954
 955        for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
 956                ent = pt[i];
 957
 958                if (is_shadow_present_pte(ent)) {
 959                        if (!is_large_pte(ent)) {
 960                                ent &= PT64_BASE_ADDR_MASK;
 961                                mmu_page_remove_parent_pte(page_header(ent),
 962                                                           &pt[i]);
 963                        } else {
 964                                --kvm->stat.lpages;
 965                                rmap_remove(kvm, &pt[i]);
 966                        }
 967                }
 968                pt[i] = shadow_trap_nonpresent_pte;
 969        }
 970        kvm_flush_remote_tlbs(kvm);
 971}
 972
 973static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
 974{
 975        mmu_page_remove_parent_pte(sp, parent_pte);
 976}
 977
 978static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
 979{
 980        int i;
 981
 982        for (i = 0; i < KVM_MAX_VCPUS; ++i)
 983                if (kvm->vcpus[i])
 984                        kvm->vcpus[i]->arch.last_pte_updated = NULL;
 985}
 986
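/*
 * Zap a shadow page: detach it from all parent sptes, unlink its
 * children, and either free it immediately or, if it is still in use as
 * a root, mark it invalid and ask the other vcpus to reload their mmu.
 */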
 987static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 988{
 989        u64 *parent_pte;
 990
 991        ++kvm->stat.mmu_shadow_zapped;
 992        while (sp->multimapped || sp->parent_pte) {
 993                if (!sp->multimapped)
 994                        parent_pte = sp->parent_pte;
 995                else {
 996                        struct kvm_pte_chain *chain;
 997
 998                        chain = container_of(sp->parent_ptes.first,
 999                                             struct kvm_pte_chain, link);
1000                        parent_pte = chain->parent_ptes[0];
1001                }
1002                BUG_ON(!parent_pte);
1003                kvm_mmu_put_page(sp, parent_pte);
1004                set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
1005        }
1006        kvm_mmu_page_unlink_children(kvm, sp);
1007        if (!sp->root_count) {
1008                if (!sp->role.metaphysical && !sp->role.invalid)
1009                        unaccount_shadowed(kvm, sp->gfn);
1010                hlist_del(&sp->hash_link);
1011                kvm_mmu_free_page(kvm, sp);
1012        } else {
1013                int invalid = sp->role.invalid;
1014                list_move(&sp->link, &kvm->arch.active_mmu_pages);
1015                sp->role.invalid = 1;
1016                kvm_reload_remote_mmus(kvm);
1017                if (!sp->role.metaphysical && !invalid)
1018                        unaccount_shadowed(kvm, sp->gfn);
1019        }
1020        kvm_mmu_reset_last_pte_updated(kvm);
1021}
1022
1023/*
 1024 * Changing the number of mmu pages allocated to the vm.
 1025 * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock.
1026 */
1027void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
1028{
1029        /*
 1030         * If we set the number of mmu pages to be smaller than the
 1031         * number of active pages, we must free some mmu pages before we
 1032         * change the value.
1033         */
1034
1035        if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) >
1036            kvm_nr_mmu_pages) {
1037                int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages
1038                                       - kvm->arch.n_free_mmu_pages;
1039
1040                while (n_used_mmu_pages > kvm_nr_mmu_pages) {
1041                        struct kvm_mmu_page *page;
1042
1043                        page = container_of(kvm->arch.active_mmu_pages.prev,
1044                                            struct kvm_mmu_page, link);
1045                        kvm_mmu_zap_page(kvm, page);
1046                        n_used_mmu_pages--;
1047                }
1048                kvm->arch.n_free_mmu_pages = 0;
1049        }
1050        else
1051                kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
1052                                         - kvm->arch.n_alloc_mmu_pages;
1053
1054        kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
1055}
1056
1057static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1058{
1059        unsigned index;
1060        struct hlist_head *bucket;
1061        struct kvm_mmu_page *sp;
1062        struct hlist_node *node, *n;
1063        int r;
1064
1065        pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
1066        r = 0;
1067        index = kvm_page_table_hashfn(gfn);
1068        bucket = &kvm->arch.mmu_page_hash[index];
1069        hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
1070                if (sp->gfn == gfn && !sp->role.metaphysical) {
1071                        pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
1072                                 sp->role.word);
1073                        kvm_mmu_zap_page(kvm, sp);
1074                        r = 1;
1075                }
1076        return r;
1077}
1078
1079static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1080{
1081        struct kvm_mmu_page *sp;
1082
1083        while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) {
1084                pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word);
1085                kvm_mmu_zap_page(kvm, sp);
1086        }
1087}
1088
1089static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
1090{
1091        int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn));
1092        struct kvm_mmu_page *sp = page_header(__pa(pte));
1093
1094        __set_bit(slot, &sp->slot_bitmap);
1095}
1096
1097struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
1098{
1099        struct page *page;
1100
1101        gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
1102
1103        if (gpa == UNMAPPED_GVA)
1104                return NULL;
1105
1106        down_read(&current->mm->mmap_sem);
1107        page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1108        up_read(&current->mm->mmap_sem);
1109
1110        return page;
1111}
1112
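/*
 * Install a shadow pte mapping gfn to pfn with the given access bits.
 * Handles replacing an existing mapping, demoting the pte to read-only
 * when the gfn is itself a shadowed guest page table, rmap and large
 * page accounting, and dirty tracking of the backing pfn.
 */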
1113static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1114                         unsigned pt_access, unsigned pte_access,
1115                         int user_fault, int write_fault, int dirty,
1116                         int *ptwrite, int largepage, gfn_t gfn,
1117                         pfn_t pfn, bool speculative)
1118{
1119        u64 spte;
1120        int was_rmapped = 0;
1121        int was_writeble = is_writeble_pte(*shadow_pte);
1122
1123        pgprintk("%s: spte %llx access %x write_fault %d"
1124                 " user_fault %d gfn %lx\n",
1125                 __func__, *shadow_pte, pt_access,
1126                 write_fault, user_fault, gfn);
1127
1128        if (is_rmap_pte(*shadow_pte)) {
1129                /*
1130                 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
1131                 * the parent of the now unreachable PTE.
1132                 */
1133                if (largepage && !is_large_pte(*shadow_pte)) {
1134                        struct kvm_mmu_page *child;
1135                        u64 pte = *shadow_pte;
1136
1137                        child = page_header(pte & PT64_BASE_ADDR_MASK);
1138                        mmu_page_remove_parent_pte(child, shadow_pte);
1139                } else if (pfn != spte_to_pfn(*shadow_pte)) {
1140                        pgprintk("hfn old %lx new %lx\n",
1141                                 spte_to_pfn(*shadow_pte), pfn);
1142                        rmap_remove(vcpu->kvm, shadow_pte);
1143                } else {
1144                        if (largepage)
1145                                was_rmapped = is_large_pte(*shadow_pte);
1146                        else
1147                                was_rmapped = 1;
1148                }
1149        }
1150
1151        /*
1152         * We don't set the accessed bit, since we sometimes want to see
1153         * whether the guest actually used the pte (in order to detect
1154         * demand paging).
1155         */
1156        spte = shadow_base_present_pte | shadow_dirty_mask;
1157        if (!speculative)
1158                spte |= shadow_accessed_mask;
1159        if (!dirty)
1160                pte_access &= ~ACC_WRITE_MASK;
1161        if (pte_access & ACC_EXEC_MASK)
1162                spte |= shadow_x_mask;
1163        else
1164                spte |= shadow_nx_mask;
1165        if (pte_access & ACC_USER_MASK)
1166                spte |= shadow_user_mask;
1167        if (largepage)
1168                spte |= PT_PAGE_SIZE_MASK;
1169
1170        spte |= (u64)pfn << PAGE_SHIFT;
1171
1172        if ((pte_access & ACC_WRITE_MASK)
1173            || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
1174                struct kvm_mmu_page *shadow;
1175
1176                spte |= PT_WRITABLE_MASK;
1177
1178                shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
1179                if (shadow ||
1180                   (largepage && has_wrprotected_page(vcpu->kvm, gfn))) {
1181                        pgprintk("%s: found shadow page for %lx, marking ro\n",
1182                                 __func__, gfn);
1183                        pte_access &= ~ACC_WRITE_MASK;
1184                        if (is_writeble_pte(spte)) {
1185                                spte &= ~PT_WRITABLE_MASK;
1186                                kvm_x86_ops->tlb_flush(vcpu);
1187                        }
1188                        if (write_fault)
1189                                *ptwrite = 1;
1190                }
1191        }
1192
1193        if (pte_access & ACC_WRITE_MASK)
1194                mark_page_dirty(vcpu->kvm, gfn);
1195
1196        pgprintk("%s: setting spte %llx\n", __func__, spte);
1197        pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
1198                 (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB",
1199                 (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte);
1200        set_shadow_pte(shadow_pte, spte);
1201        if (!was_rmapped && (spte & PT_PAGE_SIZE_MASK)
1202            && (spte & PT_PRESENT_MASK))
1203                ++vcpu->kvm->stat.lpages;
1204
1205        page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
1206        if (!was_rmapped) {
1207                rmap_add(vcpu, shadow_pte, gfn, largepage);
1208                if (!is_rmap_pte(*shadow_pte))
1209                        kvm_release_pfn_clean(pfn);
1210        } else {
1211                if (was_writeble)
1212                        kvm_release_pfn_dirty(pfn);
1213                else
1214                        kvm_release_pfn_clean(pfn);
1215        }
1216        if (speculative) {
1217                vcpu->arch.last_pte_updated = shadow_pte;
1218                vcpu->arch.last_pte_gfn = gfn;
1219        }
1220}
1221
1222static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
1223{
1224}
1225
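/*
 * Walk the shadow page table for a direct (real-mode or TDP) mapping,
 * allocating intermediate shadow pages as needed, and install the final
 * pte at the 4K level, or at the 2MB level when largepage is set.
 */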
1226static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1227                           int largepage, gfn_t gfn, pfn_t pfn,
1228                           int level)
1229{
1230        hpa_t table_addr = vcpu->arch.mmu.root_hpa;
1231        int pt_write = 0;
1232
1233        for (; ; level--) {
1234                u32 index = PT64_INDEX(v, level);
1235                u64 *table;
1236
1237                ASSERT(VALID_PAGE(table_addr));
1238                table = __va(table_addr);
1239
1240                if (level == 1) {
1241                        mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
1242                                     0, write, 1, &pt_write, 0, gfn, pfn, false);
1243                        return pt_write;
1244                }
1245
1246                if (largepage && level == 2) {
1247                        mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
1248                                     0, write, 1, &pt_write, 1, gfn, pfn, false);
1249                        return pt_write;
1250                }
1251
1252                if (table[index] == shadow_trap_nonpresent_pte) {
1253                        struct kvm_mmu_page *new_table;
1254                        gfn_t pseudo_gfn;
1255
1256                        pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
1257                                >> PAGE_SHIFT;
1258                        new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
1259                                                     v, level - 1,
1260                                                     1, ACC_ALL, &table[index]);
1261                        if (!new_table) {
1262                                pgprintk("nonpaging_map: ENOMEM\n");
1263                                kvm_release_pfn_clean(pfn);
1264                                return -ENOMEM;
1265                        }
1266
1267                        set_shadow_pte(&table[index],
1268                                       __pa(new_table->spt)
1269                                       | PT_PRESENT_MASK | PT_WRITABLE_MASK
1270                                       | shadow_user_mask | shadow_x_mask);
1271                }
1272                table_addr = table[index] & PT64_BASE_ADDR_MASK;
1273        }
1274}
1275
1276static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1277{
1278        int r;
1279        int largepage = 0;
1280        pfn_t pfn;
1281        unsigned long mmu_seq;
1282
1283        down_read(&current->mm->mmap_sem);
1284        if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
1285                gfn &= ~(KVM_PAGES_PER_HPAGE-1);
1286                largepage = 1;
1287        }
1288
1289        mmu_seq = vcpu->kvm->mmu_notifier_seq;
1290        /* implicit mb(), we'll read before PT lock is unlocked */
1291        pfn = gfn_to_pfn(vcpu->kvm, gfn);
1292        up_read(&current->mm->mmap_sem);
1293
1294        /* mmio */
1295        if (is_error_pfn(pfn)) {
1296                kvm_release_pfn_clean(pfn);
1297                return 1;
1298        }
1299
1300        spin_lock(&vcpu->kvm->mmu_lock);
1301        if (mmu_notifier_retry(vcpu, mmu_seq))
1302                goto out_unlock;
1303        kvm_mmu_free_some_pages(vcpu);
1304        r = __direct_map(vcpu, v, write, largepage, gfn, pfn,
1305                         PT32E_ROOT_LEVEL);
1306        spin_unlock(&vcpu->kvm->mmu_lock);
1307
1308
1309        return r;
1310
1311out_unlock:
1312        spin_unlock(&vcpu->kvm->mmu_lock);
1313        kvm_release_pfn_clean(pfn);
1314        return 0;
1315}
1316
1317
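/*
 * Drop the reference counts on the current root shadow pages (the single
 * root used with a 4-level shadow table, otherwise the four PAE roots)
 * and zap any root that has become invalid.
 */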
1318static void mmu_free_roots(struct kvm_vcpu *vcpu)
1319{
1320        int i;
1321        struct kvm_mmu_page *sp;
1322
1323        if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
1324                return;
1325        spin_lock(&vcpu->kvm->mmu_lock);
1326        if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
1327                hpa_t root = vcpu->arch.mmu.root_hpa;
1328
1329                sp = page_header(root);
1330                --sp->root_count;
1331                if (!sp->root_count && sp->role.invalid)
1332                        kvm_mmu_zap_page(vcpu->kvm, sp);
1333                vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1334                spin_unlock(&vcpu->kvm->mmu_lock);
1335                return;
1336        }
1337        for (i = 0; i < 4; ++i) {
1338                hpa_t root = vcpu->arch.mmu.pae_root[i];
1339
1340                if (root) {
1341                        root &= PT64_BASE_ADDR_MASK;
1342                        sp = page_header(root);
1343                        --sp->root_count;
1344                        if (!sp->root_count && sp->role.invalid)
1345                                kvm_mmu_zap_page(vcpu->kvm, sp);
1346                }
1347                vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
1348        }
1349        spin_unlock(&vcpu->kvm->mmu_lock);
1350        vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1351}
1352
1353static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
1354{
1355        int ret = 0;
1356
1357        if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
1358                set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
1359                ret = 1;
1360        }
1361
1362        return ret;
1363}
1364
1365static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
1366{
1367        int i;
1368        gfn_t root_gfn;
1369        struct kvm_mmu_page *sp;
1370        int metaphysical = 0;
1371
1372        root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
1373
1374        if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
1375                hpa_t root = vcpu->arch.mmu.root_hpa;
1376
1377                ASSERT(!VALID_PAGE(root));
1378                if (tdp_enabled)
1379                        metaphysical = 1;
1380                if (mmu_check_root(vcpu, root_gfn))
1381                        return 1;
1382                sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
1383                                      PT64_ROOT_LEVEL, metaphysical,
1384                                      ACC_ALL, NULL);
1385                root = __pa(sp->spt);
1386                ++sp->root_count;
1387                vcpu->arch.mmu.root_hpa = root;
1388                return 0;
1389        }
1390        metaphysical = !is_paging(vcpu);
1391        if (tdp_enabled)
1392                metaphysical = 1;
1393        for (i = 0; i < 4; ++i) {
1394                hpa_t root = vcpu->arch.mmu.pae_root[i];
1395
1396                ASSERT(!VALID_PAGE(root));
1397                if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
1398                        if (!is_present_pte(vcpu->arch.pdptrs[i])) {
1399                                vcpu->arch.mmu.pae_root[i] = 0;
1400                                continue;
1401                        }
1402                        root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT;
1403                } else if (vcpu->arch.mmu.root_level == 0)
1404                        root_gfn = 0;
1405                if (mmu_check_root(vcpu, root_gfn))
1406                        return 1;
1407                sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
1408                                      PT32_ROOT_LEVEL, metaphysical,
1409                                      ACC_ALL, NULL);
1410                root = __pa(sp->spt);
1411                ++sp->root_count;
1412                vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
1413        }
1414        vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
1415        return 0;
1416}
1417
1418static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
1419{
1420        return vaddr;
1421}
1422
1423static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
1424                                u32 error_code)
1425{
1426        gfn_t gfn;
1427        int r;
1428
1429        pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
1430        r = mmu_topup_memory_caches(vcpu);
1431        if (r)
1432                return r;
1433
1434        ASSERT(vcpu);
1435        ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
1436
1437        gfn = gva >> PAGE_SHIFT;
1438
1439        return nonpaging_map(vcpu, gva & PAGE_MASK,
1440                             error_code & PFERR_WRITE_MASK, gfn);
1441}
1442
1443static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
1444                                u32 error_code)
1445{
1446        pfn_t pfn;
1447        int r;
1448        int largepage = 0;
1449        gfn_t gfn = gpa >> PAGE_SHIFT;
1450        unsigned long mmu_seq;
1451
1452        ASSERT(vcpu);
1453        ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
1454
1455        r = mmu_topup_memory_caches(vcpu);
1456        if (r)
1457                return r;
1458
1459        down_read(&current->mm->mmap_sem);
1460        if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
1461                gfn &= ~(KVM_PAGES_PER_HPAGE-1);
1462                largepage = 1;
1463        }
1464        mmu_seq = vcpu->kvm->mmu_notifier_seq;
1465        /* implicit mb(), we'll read before PT lock is unlocked */
1466        pfn = gfn_to_pfn(vcpu->kvm, gfn);
1467        up_read(&current->mm->mmap_sem);
1468        if (is_error_pfn(pfn)) {
1469                kvm_release_pfn_clean(pfn);
1470                return 1;
1471        }
1472        spin_lock(&vcpu->kvm->mmu_lock);
1473        if (mmu_notifier_retry(vcpu, mmu_seq))
1474                goto out_unlock;
1475        kvm_mmu_free_some_pages(vcpu);
1476        r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
1477                         largepage, gfn, pfn, kvm_x86_ops->get_tdp_level());
1478        spin_unlock(&vcpu->kvm->mmu_lock);
1479
1480        return r;
1481
1482out_unlock:
1483        spin_unlock(&vcpu->kvm->mmu_lock);
1484        kvm_release_pfn_clean(pfn);
1485        return 0;
1486}
1487
1488static void nonpaging_free(struct kvm_vcpu *vcpu)
1489{
1490        mmu_free_roots(vcpu);
1491}
1492
1493static int nonpaging_init_context(struct kvm_vcpu *vcpu)
1494{
1495        struct kvm_mmu *context = &vcpu->arch.mmu;
1496
1497        context->new_cr3 = nonpaging_new_cr3;
1498        context->page_fault = nonpaging_page_fault;
1499        context->gva_to_gpa = nonpaging_gva_to_gpa;
1500        context->free = nonpaging_free;
1501        context->prefetch_page = nonpaging_prefetch_page;
1502        context->root_level = 0;
1503        context->shadow_root_level = PT32E_ROOT_LEVEL;
1504        context->root_hpa = INVALID_PAGE;
1505        return 0;
1506}
1507
1508void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
1509{
1510        ++vcpu->stat.tlb_flush;
1511        kvm_x86_ops->tlb_flush(vcpu);
1512}
1513
1514static void paging_new_cr3(struct kvm_vcpu *vcpu)
1515{
1516        pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3);
1517        mmu_free_roots(vcpu);
1518}
1519
1520static void inject_page_fault(struct kvm_vcpu *vcpu,
1521                              u64 addr,
1522                              u32 err_code)
1523{
1524        kvm_inject_page_fault(vcpu, addr, err_code);
1525}
1526
1527static void paging_free(struct kvm_vcpu *vcpu)
1528{
1529        nonpaging_free(vcpu);
1530}
1531
1532#define PTTYPE 64
1533#include "paging_tmpl.h"
1534#undef PTTYPE
1535
1536#define PTTYPE 32
1537#include "paging_tmpl.h"
1538#undef PTTYPE
1539
1540static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
1541{
1542        struct kvm_mmu *context = &vcpu->arch.mmu;
1543
1544        ASSERT(is_pae(vcpu));
1545        context->new_cr3 = paging_new_cr3;
1546        context->page_fault = paging64_page_fault;
1547        context->gva_to_gpa = paging64_gva_to_gpa;
1548        context->prefetch_page = paging64_prefetch_page;
1549        context->free = paging_free;
1550        context->root_level = level;
1551        context->shadow_root_level = level;
1552        context->root_hpa = INVALID_PAGE;
1553        return 0;
1554}
1555
1556static int paging64_init_context(struct kvm_vcpu *vcpu)
1557{
1558        return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
1559}
1560
1561static int paging32_init_context(struct kvm_vcpu *vcpu)
1562{
1563        struct kvm_mmu *context = &vcpu->arch.mmu;
1564
1565        context->new_cr3 = paging_new_cr3;
1566        context->page_fault = paging32_page_fault;
1567        context->gva_to_gpa = paging32_gva_to_gpa;
1568        context->free = paging_free;
1569        context->prefetch_page = paging32_prefetch_page;
1570        context->root_level = PT32_ROOT_LEVEL;
1571        context->shadow_root_level = PT32E_ROOT_LEVEL;
1572        context->root_hpa = INVALID_PAGE;
1573        return 0;
1574}
1575
1576static int paging32E_init_context(struct kvm_vcpu *vcpu)
1577{
1578        return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
1579}
1580
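/*
 * Set up the MMU context for two-dimensional paging (EPT/NPT).  The
 * hardware walks the guest page tables itself, so page faults are
 * handled by tdp_page_fault() and the shadow root level is whatever
 * the hardware reports.  gva_to_gpa still has to follow the guest's
 * current paging mode, since software translations (e.g. for
 * emulation) walk the guest's own tables.
 */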
1581static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
1582{
1583        struct kvm_mmu *context = &vcpu->arch.mmu;
1584
1585        context->new_cr3 = nonpaging_new_cr3;
1586        context->page_fault = tdp_page_fault;
1587        context->free = nonpaging_free;
1588        context->prefetch_page = nonpaging_prefetch_page;
1589        context->shadow_root_level = kvm_x86_ops->get_tdp_level();
1590        context->root_hpa = INVALID_PAGE;
1591
1592        if (!is_paging(vcpu)) {
1593                context->gva_to_gpa = nonpaging_gva_to_gpa;
1594                context->root_level = 0;
1595        } else if (is_long_mode(vcpu)) {
1596                context->gva_to_gpa = paging64_gva_to_gpa;
1597                context->root_level = PT64_ROOT_LEVEL;
1598        } else if (is_pae(vcpu)) {
1599                context->gva_to_gpa = paging64_gva_to_gpa;
1600                context->root_level = PT32E_ROOT_LEVEL;
1601        } else {
1602                context->gva_to_gpa = paging32_gva_to_gpa;
1603                context->root_level = PT32_ROOT_LEVEL;
1604        }
1605
1606        return 0;
1607}
1608
1609static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
1610{
1611        ASSERT(vcpu);
1612        ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
1613
1614        if (!is_paging(vcpu))
1615                return nonpaging_init_context(vcpu);
1616        else if (is_long_mode(vcpu))
1617                return paging64_init_context(vcpu);
1618        else if (is_pae(vcpu))
1619                return paging32E_init_context(vcpu);
1620        else
1621                return paging32_init_context(vcpu);
1622}
1623
1624static int init_kvm_mmu(struct kvm_vcpu *vcpu)
1625{
1626        vcpu->arch.update_pte.pfn = bad_pfn;
1627
1628        if (tdp_enabled)
1629                return init_kvm_tdp_mmu(vcpu);
1630        else
1631                return init_kvm_softmmu(vcpu);
1632}
1633
1634static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
1635{
1636        ASSERT(vcpu);
1637        if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
1638                vcpu->arch.mmu.free(vcpu);
1639                vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1640        }
1641}
1642
1643int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
1644{
1645        destroy_kvm_mmu(vcpu);
1646        return init_kvm_mmu(vcpu);
1647}
1648EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
1649
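/*
 * Make sure a valid root is loaded before entering the guest
 * (typically via kvm_mmu_reload() when root_hpa is invalid): top up
 * the memory caches outside the lock since they may sleep, allocate
 * the roots under mmu_lock, then hand the new root to the hardware
 * and flush the TLB.
 *
 * Sketch of the usual caller pattern (the real one lives in the vcpu
 * entry path):
 *
 *	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
 *		r = kvm_mmu_load(vcpu);
 */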
1650int kvm_mmu_load(struct kvm_vcpu *vcpu)
1651{
1652        int r;
1653
1654        r = mmu_topup_memory_caches(vcpu);
1655        if (r)
1656                goto out;
1657        spin_lock(&vcpu->kvm->mmu_lock);
1658        kvm_mmu_free_some_pages(vcpu);
1659        r = mmu_alloc_roots(vcpu);
1660        spin_unlock(&vcpu->kvm->mmu_lock);
1661        if (r)
1662                goto out;
1663        kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
1664        kvm_mmu_flush_tlb(vcpu);
1665out:
1666        return r;
1667}
1668EXPORT_SYMBOL_GPL(kvm_mmu_load);
1669
1670void kvm_mmu_unload(struct kvm_vcpu *vcpu)
1671{
1672        mmu_free_roots(vcpu);
1673}
1674
1675static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
1676                                  struct kvm_mmu_page *sp,
1677                                  u64 *spte)
1678{
1679        u64 pte;
1680        struct kvm_mmu_page *child;
1681
1682        pte = *spte;
1683        if (is_shadow_present_pte(pte)) {
1684                if (sp->role.level == PT_PAGE_TABLE_LEVEL ||
1685                    is_large_pte(pte))
1686                        rmap_remove(vcpu->kvm, spte);
1687                else {
1688                        child = page_header(pte & PT64_BASE_ADDR_MASK);
1689                        mmu_page_remove_parent_pte(child, spte);
1690                }
1691        }
1692        set_shadow_pte(spte, shadow_trap_nonpresent_pte);
1693        if (is_large_pte(pte))
1694                --vcpu->kvm->stat.lpages;
1695}
1696
1697static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
1698                                  struct kvm_mmu_page *sp,
1699                                  u64 *spte,
1700                                  const void *new)
1701{
1702        if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
1703                if (!vcpu->arch.update_pte.largepage ||
1704                    sp->role.glevels == PT32_ROOT_LEVEL) {
1705                        ++vcpu->kvm->stat.mmu_pde_zapped;
1706                        return;
1707                }
1708        }
1709
1710        ++vcpu->kvm->stat.mmu_pte_updated;
1711        if (sp->role.glevels == PT32_ROOT_LEVEL)
1712                paging32_update_pte(vcpu, sp, spte, new);
1713        else
1714                paging64_update_pte(vcpu, sp, spte, new);
1715}
1716
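/*
 * A remote TLB flush is only needed when the old spte was present and
 * the new one either vanishes, points at a different frame, or drops a
 * permission.  The NX bit is inverted first so that it behaves like an
 * "executable" permission bit: setting NX (removing execute permission)
 * also registers as a lost permission.  E.g. a writable -> read-only
 * change forces a remote flush, while read-only -> writable does not
 * (other vcpus will at worst take a fixable fault).
 */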
1717static bool need_remote_flush(u64 old, u64 new)
1718{
1719        if (!is_shadow_present_pte(old))
1720                return false;
1721        if (!is_shadow_present_pte(new))
1722                return true;
1723        if ((old ^ new) & PT64_BASE_ADDR_MASK)
1724                return true;
1725        old ^= PT64_NX_MASK;
1726        new ^= PT64_NX_MASK;
1727        return (old & ~new & PT64_PERM_MASK) != 0;
1728}
1729
1730static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new)
1731{
1732        if (need_remote_flush(old, new))
1733                kvm_flush_remote_tlbs(vcpu->kvm);
1734        else
1735                kvm_mmu_flush_tlb(vcpu);
1736}
1737
1738static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
1739{
1740        u64 *spte = vcpu->arch.last_pte_updated;
1741
1742        return !!(spte && (*spte & shadow_accessed_mask));
1743}
1744
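/*
 * Try to guess the guest pte being written so that the page it maps
 * can be looked up (and its largepage status decided) before mmu_lock
 * is taken.  A 32-bit write from a PAE guest only covers half of a
 * 64-bit gpte, so the other half is read back from guest memory.  The
 * prefetched gfn/pfn are consumed by the pte update path and released
 * at the end of kvm_mmu_pte_write() if they go unused.
 */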
1745static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1746                                          const u8 *new, int bytes)
1747{
1748        gfn_t gfn;
1749        int r;
1750        u64 gpte = 0;
1751        pfn_t pfn;
1752
1753        vcpu->arch.update_pte.largepage = 0;
1754
1755        if (bytes != 4 && bytes != 8)
1756                return;
1757
1758        /*
1759         * Assume that the pte write is on a page table of the same type
1760         * as the current vcpu paging mode.  This is nearly always true
1761         * (might be false while changing modes).  Note it is verified later
1762         * by update_pte().
1763         */
1764        if (is_pae(vcpu)) {
1765                /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
1766                if ((bytes == 4) && (gpa % 4 == 0)) {
1767                        r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
1768                        if (r)
1769                                return;
1770                        memcpy((void *)&gpte + (gpa % 8), new, 4);
1771                } else if ((bytes == 8) && (gpa % 8 == 0)) {
1772                        memcpy((void *)&gpte, new, 8);
1773                }
1774        } else {
1775                if ((bytes == 4) && (gpa % 4 == 0))
1776                        memcpy((void *)&gpte, new, 4);
1777        }
1778        if (!is_present_pte(gpte))
1779                return;
1780        gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
1781
1782        down_read(&current->mm->mmap_sem);
1783        if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
1784                gfn &= ~(KVM_PAGES_PER_HPAGE-1);
1785                vcpu->arch.update_pte.largepage = 1;
1786        }
1787        vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
1788        /* implicit mb(), we'll read before PT lock is unlocked */
1789        pfn = gfn_to_pfn(vcpu->kvm, gfn);
1790        up_read(&current->mm->mmap_sem);
1791
1792        if (is_error_pfn(pfn)) {
1793                kvm_release_pfn_clean(pfn);
1794                return;
1795        }
1796        vcpu->arch.update_pte.gfn = gfn;
1797        vcpu->arch.update_pte.pfn = pfn;
1798}
1799
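/*
 * If the emulated write targets the gfn mapped by the last updated
 * spte (i.e. that pte points at a page table), the access goes through
 * the emulator and the hardware never gets a chance to set the
 * accessed bit, so set it by hand.  Otherwise the flood detection in
 * kvm_mmu_pte_write() could wrongly conclude the page is unused.
 */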
1800static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
1801{
1802        u64 *spte = vcpu->arch.last_pte_updated;
1803
1804        if (spte
1805            && vcpu->arch.last_pte_gfn == gfn
1806            && shadow_accessed_mask
1807            && !(*spte & shadow_accessed_mask)
1808            && is_shadow_present_pte(*spte))
1809                set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
1810}
1811
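/*
 * Called after a write to a guest page that is shadowed as a page
 * table (such pages are write-protected, so the write was emulated).
 * Write flooding (three or more consecutive writes to the same gfn
 * without the previously updated spte ever being accessed) suggests
 * the page is no longer really a page table, so the shadow page is
 * zapped rather than kept in sync.  Misaligned or partial writes also
 * zap the shadow page; otherwise the affected sptes are zapped and,
 * when the new guest pte can be read back, updated in place.
 */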
1812void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1813                       const u8 *new, int bytes)
1814{
1815        gfn_t gfn = gpa >> PAGE_SHIFT;
1816        struct kvm_mmu_page *sp;
1817        struct hlist_node *node, *n;
1818        struct hlist_head *bucket;
1819        unsigned index;
1820        u64 entry, gentry;
1821        u64 *spte;
1822        unsigned offset = offset_in_page(gpa);
1823        unsigned pte_size;
1824        unsigned page_offset;
1825        unsigned misaligned;
1826        unsigned quadrant;
1827        int level;
1828        int flooded = 0;
1829        int npte;
1830        int r;
1831
1832        pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
1833        mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
1834        spin_lock(&vcpu->kvm->mmu_lock);
1835        kvm_mmu_access_page(vcpu, gfn);
1836        kvm_mmu_free_some_pages(vcpu);
1837        ++vcpu->kvm->stat.mmu_pte_write;
1838        kvm_mmu_audit(vcpu, "pre pte write");
1839        if (gfn == vcpu->arch.last_pt_write_gfn
1840            && !last_updated_pte_accessed(vcpu)) {
1841                ++vcpu->arch.last_pt_write_count;
1842                if (vcpu->arch.last_pt_write_count >= 3)
1843                        flooded = 1;
1844        } else {
1845                vcpu->arch.last_pt_write_gfn = gfn;
1846                vcpu->arch.last_pt_write_count = 1;
1847                vcpu->arch.last_pte_updated = NULL;
1848        }
1849        index = kvm_page_table_hashfn(gfn);
1850        bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1851        hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
1852                if (sp->gfn != gfn || sp->role.metaphysical)
1853                        continue;
1854                pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
1855                misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
1856                misaligned |= bytes < 4;
1857                if (misaligned || flooded) {
1858                        /*
1859                         * Misaligned accesses are too much trouble to fix
1860                         * up; also, they usually indicate a page is not used
1861                         * as a page table.
1862                         *
1863                         * If we're seeing too many writes to a page,
1864                         * it may no longer be a page table, or we may be
1865                         * forking, in which case it is better to unmap the
1866                         * page.
1867                         */
1868                        pgprintk("misaligned: gpa %llx bytes %d role %x\n",
1869                                 gpa, bytes, sp->role.word);
1870                        kvm_mmu_zap_page(vcpu->kvm, sp);
1871                        ++vcpu->kvm->stat.mmu_flooded;
1872                        continue;
1873                }
1874                page_offset = offset;
1875                level = sp->role.level;
1876                npte = 1;
1877                if (sp->role.glevels == PT32_ROOT_LEVEL) {
1878                        page_offset <<= 1;      /* 32->64 */
1879                        /*
1880                         * A 32-bit pde maps 4MB while the shadow pdes map
1881                         * only 2MB.  So we need to double the offset again
1882                         * and zap two pdes instead of one.
1883                         */
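                        /*
                         * Worked example: a 4-byte write at offset 0x14
                         * is entry 5 of the guest table; after the 32->64
                         * doubling that is byte 0x28, i.e. shadow entry 5.
                         * If the guest table is a page directory the offset
                         * is doubled again to 0x50, so shadow pdes 10 and
                         * 11 (npte = 2) are zapped, since one 4MB guest pde
                         * spans two 2MB shadow pdes.
                         */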
1884                        if (level == PT32_ROOT_LEVEL) {
1885                                page_offset &= ~7; /* kill rounding error */
1886                                page_offset <<= 1;
1887                                npte = 2;
1888                        }
1889                        quadrant = page_offset >> PAGE_SHIFT;
1890                        page_offset &= ~PAGE_MASK;
1891                        if (quadrant != sp->role.quadrant)
1892                                continue;
1893                }
1894                spte = &sp->spt[page_offset / sizeof(*spte)];
1895                if ((gpa & (pte_size - 1)) || (bytes < pte_size)) {
1896                        gentry = 0;
1897                        r = kvm_read_guest_atomic(vcpu->kvm,
1898                                                  gpa & ~(u64)(pte_size - 1),
1899                                                  &gentry, pte_size);
1900                        new = (const void *)&gentry;
1901                        if (r < 0)
1902                                new = NULL;
1903                }
1904                while (npte--) {
1905                        entry = *spte;
1906                        mmu_pte_write_zap_pte(vcpu, sp, spte);
1907                        if (new)
1908                                mmu_pte_write_new_pte(vcpu, sp, spte, new);
1909                        mmu_pte_write_flush_tlb(vcpu, entry, *spte);
1910                        ++spte;
1911                }
1912        }
1913        kvm_mmu_audit(vcpu, "post pte write");
1914        spin_unlock(&vcpu->kvm->mmu_lock);
1915        if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
1916                kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
1917                vcpu->arch.update_pte.pfn = bad_pfn;
1918        }
1919}
1920
1921int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
1922{
1923        gpa_t gpa;
1924        int r;
1925
1926        gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
1927
1928        spin_lock(&vcpu->kvm->mmu_lock);
1929        r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1930        spin_unlock(&vcpu->kvm->mmu_lock);
1931        return r;
1932}
1933EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
1934
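/*
 * Recycle shadow pages from the tail of the active list (the oldest
 * entries) until the free page pool is back above KVM_REFILL_PAGES.
 */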
1935void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
1936{
1937        while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) {
1938                struct kvm_mmu_page *sp;
1939
1940                sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
1941                                  struct kvm_mmu_page, link);
1942                kvm_mmu_zap_page(vcpu->kvm, sp);
1943                ++vcpu->kvm->stat.mmu_recycled;
1944        }
1945}
1946
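/*
 * Top level fault handler.  vcpu->arch.mmu.page_fault() returns a
 * negative error, 0 when the fault was fixed by the shadow code (the
 * guest just retries), or a positive value when the access has to be
 * emulated (mmio, or a write to a write-protected guest page table).
 * This function in turn returns 1 to resume the guest, 0 to exit to
 * userspace for mmio, or a negative error.
 */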
1947int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
1948{
1949        int r;
1950        enum emulation_result er;
1951
1952        r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
1953        if (r < 0)
1954                goto out;
1955
1956        if (!r) {
1957                r = 1;
1958                goto out;
1959        }
1960
1961        r = mmu_topup_memory_caches(vcpu);
1962        if (r)
1963                goto out;
1964
1965        er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0);
1966
1967        switch (er) {
1968        case EMULATE_DONE:
1969                return 1;
1970        case EMULATE_DO_MMIO:
1971                ++vcpu->stat.mmio_exits;
1972                return 0;
1973        case EMULATE_FAIL:
1974                kvm_report_emulation_failure(vcpu, "pagetable");
1975                return 1;
1976        default:
1977                BUG();
1978        }
1979out:
1980        return r;
1981}
1982EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
1983
1984void kvm_enable_tdp(void)
1985{
1986        tdp_enabled = true;
1987}
1988EXPORT_SYMBOL_GPL(kvm_enable_tdp);
1989
1990void kvm_disable_tdp(void)
1991{
1992        tdp_enabled = false;
1993}
1994EXPORT_SYMBOL_GPL(kvm_disable_tdp);
1995
1996static void free_mmu_pages(struct kvm_vcpu *vcpu)
1997{
1998        struct kvm_mmu_page *sp;
1999
2000        while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
2001                sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
2002                                  struct kvm_mmu_page, link);
2003                kvm_mmu_zap_page(vcpu->kvm, sp);
2004                cond_resched();
2005        }
2006        free_page((unsigned long)vcpu->arch.mmu.pae_root);
2007}
2008
2009static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
2010{
2011        struct page *page;
2012        int i;
2013
2014        ASSERT(vcpu);
2015
2016        if (vcpu->kvm->arch.n_requested_mmu_pages)
2017                vcpu->kvm->arch.n_free_mmu_pages =
2018                                        vcpu->kvm->arch.n_requested_mmu_pages;
2019        else
2020                vcpu->kvm->arch.n_free_mmu_pages =
2021                                        vcpu->kvm->arch.n_alloc_mmu_pages;
2022        /*
2023         * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
2024         * Therefore we need to allocate shadow page tables in the first
2025         * 4GB of memory, which happens to fit the DMA32 zone.
2026         */
2027        page = alloc_page(GFP_KERNEL | __GFP_DMA32);
2028        if (!page)
2029                goto error_1;
2030        vcpu->arch.mmu.pae_root = page_address(page);
2031        for (i = 0; i < 4; ++i)
2032                vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
2033
2034        return 0;
2035
2036error_1:
2037        free_mmu_pages(vcpu);
2038        return -ENOMEM;
2039}
2040
2041int kvm_mmu_create(struct kvm_vcpu *vcpu)
2042{
2043        ASSERT(vcpu);
2044        ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
2045
2046        return alloc_mmu_pages(vcpu);
2047}
2048
2049int kvm_mmu_setup(struct kvm_vcpu *vcpu)
2050{
2051        ASSERT(vcpu);
2052        ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
2053
2054        return init_kvm_mmu(vcpu);
2055}
2056
2057void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
2058{
2059        ASSERT(vcpu);
2060
2061        destroy_kvm_mmu(vcpu);
2062        free_mmu_pages(vcpu);
2063        mmu_free_memory_caches(vcpu);
2064}
2065
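/*
 * Strip the writable bit from every spte that maps a page in the given
 * slot (used when dirty logging is enabled for the slot), so the next
 * guest write faults and can be recorded; the remote flush makes the
 * change visible to all vcpus.
 */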
2066void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
2067{
2068        struct kvm_mmu_page *sp;
2069
2070        list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
2071                int i;
2072                u64 *pt;
2073
2074                if (!test_bit(slot, &sp->slot_bitmap))
2075                        continue;
2076
2077                pt = sp->spt;
2078                for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
2079                        /* avoid RMW */
2080                        if (pt[i] & PT_WRITABLE_MASK)
2081                                pt[i] &= ~PT_WRITABLE_MASK;
2082        }
2083        kvm_flush_remote_tlbs(kvm);
2084}
2085
2086void kvm_mmu_zap_all(struct kvm *kvm)
2087{
2088        struct kvm_mmu_page *sp, *node;
2089
2090        spin_lock(&kvm->mmu_lock);
2091        list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
2092                kvm_mmu_zap_page(kvm, sp);
2093        spin_unlock(&kvm->mmu_lock);
2094
2095        kvm_flush_remote_tlbs(kvm);
2096}
2097
2098static void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm)
2099{
2100        struct kvm_mmu_page *page;
2101
2102        page = container_of(kvm->arch.active_mmu_pages.prev,
2103                            struct kvm_mmu_page, link);
2104        kvm_mmu_zap_page(kvm, page);
2105}
2106
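/*
 * Memory-pressure callback, registered through mmu_shrinker below.
 * Walk the global vm_list, report the total number of shadow pages in
 * use, and zap at most one page, taken from the first VM that has
 * pages in use and whose slots_lock could be acquired; that VM is then
 * moved to the tail of vm_list so repeated shrink calls spread the
 * reclaim across VMs.
 */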
2107static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
2108{
2109        struct kvm *kvm;
2110        struct kvm *kvm_freed = NULL;
2111        int cache_count = 0;
2112
2113        spin_lock(&kvm_lock);
2114
2115        list_for_each_entry(kvm, &vm_list, vm_list) {
2116                int npages;
2117
2118                if (!down_read_trylock(&kvm->slots_lock))
2119                        continue;
2120                spin_lock(&kvm->mmu_lock);
2121                npages = kvm->arch.n_alloc_mmu_pages -
2122                         kvm->arch.n_free_mmu_pages;
2123                cache_count += npages;
2124                if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
2125                        kvm_mmu_remove_one_alloc_mmu_page(kvm);
2126                        cache_count--;
2127                        kvm_freed = kvm;
2128                }
2129                nr_to_scan--;
2130
2131                spin_unlock(&kvm->mmu_lock);
2132                up_read(&kvm->slots_lock);
2133        }
2134        if (kvm_freed)
2135                list_move_tail(&kvm_freed->vm_list, &vm_list);
2136
2137        spin_unlock(&kvm_lock);
2138
2139        return cache_count;
2140}
2141
2142static struct shrinker mmu_shrinker = {
2143        .shrink = mmu_shrink,
2144        .seeks = DEFAULT_SEEKS * 10,
2145};
2146
2147static void mmu_destroy_caches(void)
2148{
2149        if (pte_chain_cache)
2150                kmem_cache_destroy(pte_chain_cache);
2151        if (rmap_desc_cache)
2152                kmem_cache_destroy(rmap_desc_cache);
2153        if (mmu_page_header_cache)
2154                kmem_cache_destroy(mmu_page_header_cache);
2155}
2156
2157void kvm_mmu_module_exit(void)
2158{
2159        mmu_destroy_caches();
2160        unregister_shrinker(&mmu_shrinker);
2161}
2162
2163int kvm_mmu_module_init(void)
2164{
2165        pte_chain_cache = kmem_cache_create("kvm_pte_chain",
2166                                            sizeof(struct kvm_pte_chain),
2167                                            0, 0, NULL);
2168        if (!pte_chain_cache)
2169                goto nomem;
2170        rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
2171                                            sizeof(struct kvm_rmap_desc),
2172                                            0, 0, NULL);
2173        if (!rmap_desc_cache)
2174                goto nomem;
2175
2176        mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
2177                                                  sizeof(struct kvm_mmu_page),
2178                                                  0, 0, NULL);
2179        if (!mmu_page_header_cache)
2180                goto nomem;
2181
2182        register_shrinker(&mmu_shrinker);
2183
2184        return 0;
2185
2186nomem:
2187        mmu_destroy_caches();
2188        return -ENOMEM;
2189}
2190
2191/*
2192 * Calculate mmu pages needed for kvm.
2193 */
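/*
 * For example, with KVM_PERMILLE_MMU_PAGES at its usual value of 20
 * (2%), a guest with 1GB of memory (262144 4K pages) is given
 * 262144 * 20 / 1000 = 5242 shadow pages, comfortably above the
 * KVM_MIN_ALLOC_MMU_PAGES floor.
 */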
2194unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
2195{
2196        int i;
2197        unsigned int nr_mmu_pages;
2198        unsigned int  nr_pages = 0;
2199
2200        for (i = 0; i < kvm->nmemslots; i++)
2201                nr_pages += kvm->memslots[i].npages;
2202
2203        nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
2204        nr_mmu_pages = max(nr_mmu_pages,
2205                        (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
2206
2207        return nr_mmu_pages;
2208}
2209
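/*
 * Paravirtual MMU operations: the guest batches kvm_mmu_op_* records
 * in a buffer and hands over its gpa and length through the MMU-op
 * hypercall; kvm_pv_mmu_op() copies the buffer in and dispatches each
 * record (write a pte, flush the TLB, or release a shadowed page
 * table), reporting the number of bytes consumed back through *ret.
 */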
2210static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer,
2211                                unsigned len)
2212{
2213        if (len > buffer->len)
2214                return NULL;
2215        return buffer->ptr;
2216}
2217
2218static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer,
2219                                unsigned len)
2220{
2221        void *ret;
2222
2223        ret = pv_mmu_peek_buffer(buffer, len);
2224        if (!ret)
2225                return ret;
2226        buffer->ptr += len;
2227        buffer->len -= len;
2228        buffer->processed += len;
2229        return ret;
2230}
2231
2232static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
2233                             gpa_t addr, gpa_t value)
2234{
2235        int bytes = 8;
2236        int r;
2237
2238        if (!is_long_mode(vcpu) && !is_pae(vcpu))
2239                bytes = 4;
2240
2241        r = mmu_topup_memory_caches(vcpu);
2242        if (r)
2243                return r;
2244
2245        if (!emulator_write_phys(vcpu, addr, &value, bytes))
2246                return -EFAULT;
2247
2248        return 1;
2249}
2250
2251static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
2252{
2253        kvm_set_cr3(vcpu, vcpu->arch.cr3);
2254        return 1;
2255}
2256
2257static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr)
2258{
2259        spin_lock(&vcpu->kvm->mmu_lock);
2260        mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT);
2261        spin_unlock(&vcpu->kvm->mmu_lock);
2262        return 1;
2263}
2264
2265static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu,
2266                             struct kvm_pv_mmu_op_buffer *buffer)
2267{
2268        struct kvm_mmu_op_header *header;
2269
2270        header = pv_mmu_peek_buffer(buffer, sizeof *header);
2271        if (!header)
2272                return 0;
2273        switch (header->op) {
2274        case KVM_MMU_OP_WRITE_PTE: {
2275                struct kvm_mmu_op_write_pte *wpte;
2276
2277                wpte = pv_mmu_read_buffer(buffer, sizeof *wpte);
2278                if (!wpte)
2279                        return 0;
2280                return kvm_pv_mmu_write(vcpu, wpte->pte_phys,
2281                                        wpte->pte_val);
2282        }
2283        case KVM_MMU_OP_FLUSH_TLB: {
2284                struct kvm_mmu_op_flush_tlb *ftlb;
2285
2286                ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb);
2287                if (!ftlb)
2288                        return 0;
2289                return kvm_pv_mmu_flush_tlb(vcpu);
2290        }
2291        case KVM_MMU_OP_RELEASE_PT: {
2292                struct kvm_mmu_op_release_pt *rpt;
2293
2294                rpt = pv_mmu_read_buffer(buffer, sizeof *rpt);
2295                if (!rpt)
2296                        return 0;
2297                return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys);
2298        }
2299        default:
                return 0;
2300        }
2301}
2302
2303int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
2304                  gpa_t addr, unsigned long *ret)
2305{
2306        int r;
2307        struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer;
2308
2309        buffer->ptr = buffer->buf;
2310        buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf);
2311        buffer->processed = 0;
2312
2313        r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len);
2314        if (r)
2315                goto out;
2316
2317        while (buffer->len) {
2318                r = kvm_pv_mmu_op_one(vcpu, buffer);
2319                if (r < 0)
2320                        goto out;
2321                if (r == 0)
2322                        break;
2323        }
2324
2325        r = 1;
2326out:
2327        *ret = buffer->processed;
2328        return r;
2329}
2330
2331#ifdef AUDIT
2332
2333static const char *audit_msg;
2334
2335static gva_t canonicalize(gva_t gva)
2336{
2337#ifdef CONFIG_X86_64
2338        gva = (long long)(gva << 16) >> 16;
2339#endif
2340        return gva;
2341}
2342
2343static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
2344                                gva_t va, int level)
2345{
2346        u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
2347        int i;
2348        gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
2349
2350        for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
2351                u64 ent = pt[i];
2352
2353                if (ent == shadow_trap_nonpresent_pte)
2354                        continue;
2355
2356                va = canonicalize(va);
2357                if (level > 1) {
2358                        if (ent == shadow_notrap_nonpresent_pte)
2359                                printk(KERN_ERR "audit: (%s) nontrapping pte"
2360                                       " in nonleaf level: levels %d gva %lx"
2361                                       " level %d pte %llx\n", audit_msg,
2362                                       vcpu->arch.mmu.root_level, va, level, ent);
2363
2364                        audit_mappings_page(vcpu, ent, va, level - 1);
2365                } else {
2366                        gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
2367                        gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
                        pfn_t pfn = gpa_to_pfn(vcpu, gpa);
2368                        hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
2369                        if (is_shadow_present_pte(ent)
2370                            && (ent & PT64_BASE_ADDR_MASK) != hpa)
2371                                printk(KERN_ERR "xx audit error: (%s) levels %d"
2372                                       " gva %lx gpa %llx hpa %llx ent %llx %d\n",
2373                                       audit_msg, vcpu->arch.mmu.root_level,
2374                                       va, gpa, hpa, ent,
2375                                       is_shadow_present_pte(ent));
2376                        else if (ent == shadow_notrap_nonpresent_pte
2377                                 && !is_error_hpa(hpa))
2378                                printk(KERN_ERR "audit: (%s) notrap shadow,"
2379                                       " valid guest gva %lx\n", audit_msg, va);
2380                        kvm_release_pfn_clean(pfn);
2381
2382                }
2383        }
2384}
2385
2386static void audit_mappings(struct kvm_vcpu *vcpu)
2387{
2388        unsigned i;
2389
2390        if (vcpu->arch.mmu.root_level == 4)
2391                audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
2392        else
2393                for (i = 0; i < 4; ++i)
2394                        if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
2395                                audit_mappings_page(vcpu,
2396                                                    vcpu->arch.mmu.pae_root[i],
2397                                                    i << 30,
2398                                                    2);
2399}
2400
2401static int count_rmaps(struct kvm_vcpu *vcpu)
2402{
2403        int nmaps = 0;
2404        int i, j, k;
2405
2406        for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
2407                struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
2408                struct kvm_rmap_desc *d;
2409
2410                for (j = 0; j < m->npages; ++j) {
2411                        unsigned long *rmapp = &m->rmap[j];
2412
2413                        if (!*rmapp)
2414                                continue;
2415                        if (!(*rmapp & 1)) {
2416                                ++nmaps;
2417                                continue;
2418                        }
2419                        d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
2420                        while (d) {
2421                                for (k = 0; k < RMAP_EXT; ++k)
2422                                        if (d->shadow_ptes[k])
2423                                                ++nmaps;
2424                                        else
2425                                                break;
2426                                d = d->more;
2427                        }
2428                }
2429        }
2430        return nmaps;
2431}
2432
2433static int count_writable_mappings(struct kvm_vcpu *vcpu)
2434{
2435        int nmaps = 0;
2436        struct kvm_mmu_page *sp;
2437        int i;
2438
2439        list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
2440                u64 *pt = sp->spt;
2441
2442                if (sp->role.level != PT_PAGE_TABLE_LEVEL)
2443                        continue;
2444
2445                for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
2446                        u64 ent = pt[i];
2447
2448                        if (!(ent & PT_PRESENT_MASK))
2449                                continue;
2450                        if (!(ent & PT_WRITABLE_MASK))
2451                                continue;
2452                        ++nmaps;
2453                }
2454        }
2455        return nmaps;
2456}
2457
2458static void audit_rmap(struct kvm_vcpu *vcpu)
2459{
2460        int n_rmap = count_rmaps(vcpu);
2461        int n_actual = count_writable_mappings(vcpu);
2462
2463        if (n_rmap != n_actual)
2464                printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
2465                       __func__, audit_msg, n_rmap, n_actual);
2466}
2467
2468static void audit_write_protection(struct kvm_vcpu *vcpu)
2469{
2470        struct kvm_mmu_page *sp;
2471        struct kvm_memory_slot *slot;
2472        unsigned long *rmapp;
2473        gfn_t gfn;
2474
2475        list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
2476                if (sp->role.metaphysical)
2477                        continue;
2478
2479                slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
2480                gfn = unalias_gfn(vcpu->kvm, sp->gfn);
2481                rmapp = &slot->rmap[gfn - slot->base_gfn];
2482                if (*rmapp)
2483                        printk(KERN_ERR "%s: (%s) shadow page has writable"
2484                               " mappings: gfn %lx role %x\n",
2485                               __func__, audit_msg, sp->gfn,
2486                               sp->role.word);
2487        }
2488}
2489
2490static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
2491{
2492        int olddbg = dbg;
2493
2494        dbg = 0;
2495        audit_msg = msg;
2496        audit_rmap(vcpu);
2497        audit_write_protection(vcpu);
2498        audit_mappings(vcpu);
2499        dbg = olddbg;
2500}
2501
2502#endif
2503