linux/arch/x86/kvm/mmu.c
   1/*
   2 * Kernel-based Virtual Machine driver for Linux
   3 *
   4 * This module enables machines with Intel VT-x extensions to run virtual
   5 * machines without emulation or binary translation.
   6 *
   7 * MMU support
   8 *
   9 * Copyright (C) 2006 Qumranet, Inc.
  10 *
  11 * Authors:
  12 *   Yaniv Kamay  <yaniv@qumranet.com>
  13 *   Avi Kivity   <avi@qumranet.com>
  14 *
  15 * This work is licensed under the terms of the GNU GPL, version 2.  See
  16 * the COPYING file in the top-level directory.
  17 *
  18 */
  19
  20#include "vmx.h"
  21#include "mmu.h"
  22
  23#include <linux/kvm_host.h>
  24#include <linux/types.h>
  25#include <linux/string.h>
  26#include <linux/mm.h>
  27#include <linux/highmem.h>
  28#include <linux/module.h>
  29#include <linux/swap.h>
  30#include <linux/hugetlb.h>
  31#include <linux/compiler.h>
  32
  33#include <asm/page.h>
  34#include <asm/cmpxchg.h>
  35#include <asm/io.h>
  36
  37/*
  38 * Setting this variable to true enables Two-Dimensional Paging, where
  39 * the hardware walks two page tables:
  40 * 1. the guest-virtual to guest-physical translation
  41 * 2. while doing 1., the guest-physical to host-physical translation
  42 * If the hardware supports this, we don't need to do shadow paging.
  43 */
  44bool tdp_enabled = false;
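/*
 * Editorial note (illustration, not from the original source): with TDP a
 * guest memory access is translated twice by hardware, e.g. GVA
 * 0x7f0000001000 -> GPA 0x40201000 through the guest's own page tables,
 * then GPA 0x40201000 -> HPA through the EPT/NPT tables that KVM builds.
 * With tdp_enabled == false, KVM instead folds both steps into the shadow
 * page tables maintained in this file.
 */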
  45
  46#undef MMU_DEBUG
  47
  48#undef AUDIT
  49
  50#ifdef AUDIT
  51static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
  52#else
  53static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
  54#endif
  55
  56#ifdef MMU_DEBUG
  57
  58#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
  59#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
  60
  61#else
  62
  63#define pgprintk(x...) do { } while (0)
  64#define rmap_printk(x...) do { } while (0)
  65
  66#endif
  67
  68#if defined(MMU_DEBUG) || defined(AUDIT)
  69static int dbg = 0;
  70module_param(dbg, bool, 0644);
  71#endif
  72
  73static int oos_shadow = 1;
  74module_param(oos_shadow, bool, 0644);
  75
  76#ifndef MMU_DEBUG
  77#define ASSERT(x) do { } while (0)
  78#else
  79#define ASSERT(x)                                                       \
  80        if (!(x)) {                                                     \
  81                printk(KERN_WARNING "assertion failed %s:%d: %s\n",     \
  82                       __FILE__, __LINE__, #x);                         \
  83        }
  84#endif
  85
  86#define PT_FIRST_AVAIL_BITS_SHIFT 9
  87#define PT64_SECOND_AVAIL_BITS_SHIFT 52
  88
  89#define VALID_PAGE(x) ((x) != INVALID_PAGE)
  90
  91#define PT64_LEVEL_BITS 9
  92
  93#define PT64_LEVEL_SHIFT(level) \
  94                (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
  95
  96#define PT64_LEVEL_MASK(level) \
  97                (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
  98
  99#define PT64_INDEX(address, level)\
 100        (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
 101
 102
 103#define PT32_LEVEL_BITS 10
 104
 105#define PT32_LEVEL_SHIFT(level) \
 106                (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
 107
 108#define PT32_LEVEL_MASK(level) \
 109                (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
 110
 111#define PT32_INDEX(address, level)\
 112        (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
 113
 114
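/*
 * Editorial sketch (not part of the original file): a minimal, self-contained
 * userspace illustration of how the index macros above slice a virtual
 * address, assuming PAGE_SHIFT == 12 as on x86.  The DEMO_* names are local
 * copies of the PT64_* macros above; the block is guarded by "#if 0" so it
 * has no effect on the listing itself.
 */
#if 0
#include <stdio.h>

#define DEMO_PAGE_SHIFT       12
#define DEMO_PT64_LEVEL_BITS  9
#define DEMO_PT64_LEVEL_SHIFT(level) \
        (DEMO_PAGE_SHIFT + ((level) - 1) * DEMO_PT64_LEVEL_BITS)
#define DEMO_PT64_INDEX(address, level) \
        (((address) >> DEMO_PT64_LEVEL_SHIFT(level)) & \
         ((1 << DEMO_PT64_LEVEL_BITS) - 1))

int main(void)
{
        /* 0x40201000 = 1<<30 | 1<<21 | 1<<12: index 1 at levels 1-3, 0 at 4. */
        unsigned long long addr = 0x40201000ULL;
        int level;

        for (level = 1; level <= 4; ++level)
                printf("level %d index %llu\n", level,
                       (unsigned long long)DEMO_PT64_INDEX(addr, level));
        return 0;
}
#endif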
 115#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
 116#define PT64_DIR_BASE_ADDR_MASK \
 117        (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
 118
 119#define PT32_BASE_ADDR_MASK PAGE_MASK
 120#define PT32_DIR_BASE_ADDR_MASK \
 121        (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
 122
 123#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
 124                        | PT64_NX_MASK)
 125
 126#define PFERR_PRESENT_MASK (1U << 0)
 127#define PFERR_WRITE_MASK (1U << 1)
 128#define PFERR_USER_MASK (1U << 2)
 129#define PFERR_FETCH_MASK (1U << 4)
 130
 131#define PT_DIRECTORY_LEVEL 2
 132#define PT_PAGE_TABLE_LEVEL 1
 133
 134#define RMAP_EXT 4
 135
 136#define ACC_EXEC_MASK    1
 137#define ACC_WRITE_MASK   PT_WRITABLE_MASK
 138#define ACC_USER_MASK    PT_USER_MASK
 139#define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
 140
 141#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 142
 143struct kvm_rmap_desc {
 144        u64 *shadow_ptes[RMAP_EXT];
 145        struct kvm_rmap_desc *more;
 146};
 147
 148struct kvm_shadow_walk {
 149        int (*entry)(struct kvm_shadow_walk *walk, struct kvm_vcpu *vcpu,
 150                     u64 addr, u64 *spte, int level);
 151};
 152
 153struct kvm_unsync_walk {
 154        int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk);
 155};
 156
 157typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
 158
 159static struct kmem_cache *pte_chain_cache;
 160static struct kmem_cache *rmap_desc_cache;
 161static struct kmem_cache *mmu_page_header_cache;
 162
 163static u64 __read_mostly shadow_trap_nonpresent_pte;
 164static u64 __read_mostly shadow_notrap_nonpresent_pte;
 165static u64 __read_mostly shadow_base_present_pte;
 166static u64 __read_mostly shadow_nx_mask;
 167static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
 168static u64 __read_mostly shadow_user_mask;
 169static u64 __read_mostly shadow_accessed_mask;
 170static u64 __read_mostly shadow_dirty_mask;
 171
 172void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
 173{
 174        shadow_trap_nonpresent_pte = trap_pte;
 175        shadow_notrap_nonpresent_pte = notrap_pte;
 176}
 177EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
 178
 179void kvm_mmu_set_base_ptes(u64 base_pte)
 180{
 181        shadow_base_present_pte = base_pte;
 182}
 183EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
 184
 185void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 186                u64 dirty_mask, u64 nx_mask, u64 x_mask)
 187{
 188        shadow_user_mask = user_mask;
 189        shadow_accessed_mask = accessed_mask;
 190        shadow_dirty_mask = dirty_mask;
 191        shadow_nx_mask = nx_mask;
 192        shadow_x_mask = x_mask;
 193}
 194EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 195
 196static int is_write_protection(struct kvm_vcpu *vcpu)
 197{
 198        return vcpu->arch.cr0 & X86_CR0_WP;
 199}
 200
 201static int is_cpuid_PSE36(void)
 202{
 203        return 1;
 204}
 205
 206static int is_nx(struct kvm_vcpu *vcpu)
 207{
 208        return vcpu->arch.shadow_efer & EFER_NX;
 209}
 210
 211static int is_present_pte(unsigned long pte)
 212{
 213        return pte & PT_PRESENT_MASK;
 214}
 215
 216static int is_shadow_present_pte(u64 pte)
 217{
 218        return pte != shadow_trap_nonpresent_pte
 219                && pte != shadow_notrap_nonpresent_pte;
 220}
 221
 222static int is_large_pte(u64 pte)
 223{
 224        return pte & PT_PAGE_SIZE_MASK;
 225}
 226
 227static int is_writeble_pte(unsigned long pte)
 228{
 229        return pte & PT_WRITABLE_MASK;
 230}
 231
 232static int is_dirty_pte(unsigned long pte)
 233{
 234        return pte & shadow_dirty_mask;
 235}
 236
 237static int is_rmap_pte(u64 pte)
 238{
 239        return is_shadow_present_pte(pte);
 240}
 241
 242static pfn_t spte_to_pfn(u64 pte)
 243{
 244        return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
 245}
 246
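/*
 * Editorial note (illustration, not from the original source): PSE-36
 * large-page PTEs carry physical-address bits above bit 31 in a field in
 * the low half of the PTE.  Assuming PT32_DIR_PSE36_SHIFT == 13 and
 * PAGE_SHIFT == 12, the shift below is 32 - 13 - 12 = 7, so a bit stored
 * at PTE bit 13 ends up at gfn bit 20, i.e. physical address bit 32.
 */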
 247static gfn_t pse36_gfn_delta(u32 gpte)
 248{
 249        int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
 250
 251        return (gpte & PT32_DIR_PSE36_MASK) << shift;
 252}
 253
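/*
 * Editorial note (added comment): a shadow PTE is 64 bits wide even on
 * 32-bit PAE hosts, and hardware may be walking the shadow tables on
 * another CPU while we update it, so the store must be a single atomic
 * 64-bit write; set_64bit() provides that on both configurations.
 */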
 254static void set_shadow_pte(u64 *sptep, u64 spte)
 255{
 256#ifdef CONFIG_X86_64
 257        set_64bit((unsigned long *)sptep, spte);
 258#else
 259        set_64bit((unsigned long long *)sptep, spte);
 260#endif
 261}
 262
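/*
 * Editorial note (added comment): the per-vcpu memory caches below are
 * filled with GFP_KERNEL allocations before mmu_lock is taken (see
 * mmu_topup_memory_caches()), so the fault paths can "allocate" pte
 * chains, rmap descriptors and shadow pages without sleeping while
 * holding the spinlock.
 */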
 263static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 264                                  struct kmem_cache *base_cache, int min)
 265{
 266        void *obj;
 267
 268        if (cache->nobjs >= min)
 269                return 0;
 270        while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
 271                obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
 272                if (!obj)
 273                        return -ENOMEM;
 274                cache->objects[cache->nobjs++] = obj;
 275        }
 276        return 0;
 277}
 278
 279static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
 280{
 281        while (mc->nobjs)
 282                kfree(mc->objects[--mc->nobjs]);
 283}
 284
 285static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
 286                                       int min)
 287{
 288        struct page *page;
 289
 290        if (cache->nobjs >= min)
 291                return 0;
 292        while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
 293                page = alloc_page(GFP_KERNEL);
 294                if (!page)
 295                        return -ENOMEM;
 296                set_page_private(page, 0);
 297                cache->objects[cache->nobjs++] = page_address(page);
 298        }
 299        return 0;
 300}
 301
 302static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
 303{
 304        while (mc->nobjs)
 305                free_page((unsigned long)mc->objects[--mc->nobjs]);
 306}
 307
 308static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
 309{
 310        int r;
 311
 312        r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
 313                                   pte_chain_cache, 4);
 314        if (r)
 315                goto out;
 316        r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
 317                                   rmap_desc_cache, 4);
 318        if (r)
 319                goto out;
 320        r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
 321        if (r)
 322                goto out;
 323        r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
 324                                   mmu_page_header_cache, 4);
 325out:
 326        return r;
 327}
 328
 329static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
 330{
 331        mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache);
 332        mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache);
 333        mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
 334        mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
 335}
 336
 337static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
 338                                    size_t size)
 339{
 340        void *p;
 341
 342        BUG_ON(!mc->nobjs);
 343        p = mc->objects[--mc->nobjs];
 344        memset(p, 0, size);
 345        return p;
 346}
 347
 348static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
 349{
 350        return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
 351                                      sizeof(struct kvm_pte_chain));
 352}
 353
 354static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
 355{
 356        kfree(pc);
 357}
 358
 359static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
 360{
 361        return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
 362                                      sizeof(struct kvm_rmap_desc));
 363}
 364
 365static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
 366{
 367        kfree(rd);
 368}
 369
 370/*
 371 * Return the pointer to the largepage write count for a given
 372 * gfn, handling slots that are not large page aligned.
 373 */
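/*
 * Editorial example (not from the original source): with 4KB base pages
 * and 2MB huge pages, KVM_PAGES_PER_HPAGE is 512.  For a slot with
 * base_gfn == 0x100 (not 2MB aligned), gfns 0x100-0x1ff share write_count
 * index 0 and gfns 0x200-0x3ff share index 1.
 */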
 374static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot)
 375{
 376        unsigned long idx;
 377
 378        idx = (gfn / KVM_PAGES_PER_HPAGE) -
 379              (slot->base_gfn / KVM_PAGES_PER_HPAGE);
 380        return &slot->lpage_info[idx].write_count;
 381}
 382
 383static void account_shadowed(struct kvm *kvm, gfn_t gfn)
 384{
 385        int *write_count;
 386
 387        write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
 388        *write_count += 1;
 389}
 390
 391static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
 392{
 393        int *write_count;
 394
 395        write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
 396        *write_count -= 1;
 397        WARN_ON(*write_count < 0);
 398}
 399
 400static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
 401{
 402        struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
 403        int *largepage_idx;
 404
 405        if (slot) {
 406                largepage_idx = slot_largepage_idx(gfn, slot);
 407                return *largepage_idx;
 408        }
 409
 410        return 1;
 411}
 412
 413static int host_largepage_backed(struct kvm *kvm, gfn_t gfn)
 414{
 415        struct vm_area_struct *vma;
 416        unsigned long addr;
 417        int ret = 0;
 418
 419        addr = gfn_to_hva(kvm, gfn);
 420        if (kvm_is_error_hva(addr))
 421                return ret;
 422
 423        down_read(&current->mm->mmap_sem);
 424        vma = find_vma(current->mm, addr);
 425        if (vma && is_vm_hugetlb_page(vma))
 426                ret = 1;
 427        up_read(&current->mm->mmap_sem);
 428
 429        return ret;
 430}
 431
 432static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
 433{
 434        struct kvm_memory_slot *slot;
 435
 436        if (has_wrprotected_page(vcpu->kvm, large_gfn))
 437                return 0;
 438
 439        if (!host_largepage_backed(vcpu->kvm, large_gfn))
 440                return 0;
 441
 442        slot = gfn_to_memslot(vcpu->kvm, large_gfn);
 443        if (slot && slot->dirty_bitmap)
 444                return 0;
 445
 446        return 1;
 447}
 448
 449/*
 450 * Take gfn and return the reverse mapping to it.
 451 * Note: gfn must be unaliased before this function gets called.
 452 */
 453
 454static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage)
 455{
 456        struct kvm_memory_slot *slot;
 457        unsigned long idx;
 458
 459        slot = gfn_to_memslot(kvm, gfn);
 460        if (!lpage)
 461                return &slot->rmap[gfn - slot->base_gfn];
 462
 463        idx = (gfn / KVM_PAGES_PER_HPAGE) -
 464              (slot->base_gfn / KVM_PAGES_PER_HPAGE);
 465
 466        return &slot->lpage_info[idx].rmap_pde;
 467}
 468
 469/*
 470 * Reverse mapping data structures:
 471 *
 472 * If rmapp bit zero is zero, then rmapp points to the shadow page table
 473 * entry that points to page_address(page).
 474 *
 475 * If rmapp bit zero is one, then (rmapp & ~1) points to a struct kvm_rmap_desc
 476 * containing more mappings.
 477 */
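/*
 * Editorial example (not from the original source): for a gfn mapped by a
 * single spte, *rmapp is simply that spte pointer (bit zero clear).  When
 * rmap_add() below installs a second spte, it allocates a kvm_rmap_desc,
 * stores both pointers in it and sets *rmapp = (unsigned long)desc | 1.
 * Once a descriptor's RMAP_EXT (4) slots are full, further sptes go into
 * descriptors chained through desc->more.
 */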
 478static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
 479{
 480        struct kvm_mmu_page *sp;
 481        struct kvm_rmap_desc *desc;
 482        unsigned long *rmapp;
 483        int i;
 484
 485        if (!is_rmap_pte(*spte))
 486                return;
 487        gfn = unalias_gfn(vcpu->kvm, gfn);
 488        sp = page_header(__pa(spte));
 489        sp->gfns[spte - sp->spt] = gfn;
 490        rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage);
 491        if (!*rmapp) {
 492                rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
 493                *rmapp = (unsigned long)spte;
 494        } else if (!(*rmapp & 1)) {
 495                rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
 496                desc = mmu_alloc_rmap_desc(vcpu);
 497                desc->shadow_ptes[0] = (u64 *)*rmapp;
 498                desc->shadow_ptes[1] = spte;
 499                *rmapp = (unsigned long)desc | 1;
 500        } else {
 501                rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
 502                desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
 503                while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
 504                        desc = desc->more;
 505                if (desc->shadow_ptes[RMAP_EXT-1]) {
 506                        desc->more = mmu_alloc_rmap_desc(vcpu);
 507                        desc = desc->more;
 508                }
 509                for (i = 0; desc->shadow_ptes[i]; ++i)
 510                        ;
 511                desc->shadow_ptes[i] = spte;
 512        }
 513}
 514
 515static void rmap_desc_remove_entry(unsigned long *rmapp,
 516                                   struct kvm_rmap_desc *desc,
 517                                   int i,
 518                                   struct kvm_rmap_desc *prev_desc)
 519{
 520        int j;
 521
 522        for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
 523                ;
 524        desc->shadow_ptes[i] = desc->shadow_ptes[j];
 525        desc->shadow_ptes[j] = NULL;
 526        if (j != 0)
 527                return;
 528        if (!prev_desc && !desc->more)
 529                *rmapp = (unsigned long)desc->shadow_ptes[0];
 530        else
 531                if (prev_desc)
 532                        prev_desc->more = desc->more;
 533                else
 534                        *rmapp = (unsigned long)desc->more | 1;
 535        mmu_free_rmap_desc(desc);
 536}
 537
 538static void rmap_remove(struct kvm *kvm, u64 *spte)
 539{
 540        struct kvm_rmap_desc *desc;
 541        struct kvm_rmap_desc *prev_desc;
 542        struct kvm_mmu_page *sp;
 543        pfn_t pfn;
 544        unsigned long *rmapp;
 545        int i;
 546
 547        if (!is_rmap_pte(*spte))
 548                return;
 549        sp = page_header(__pa(spte));
 550        pfn = spte_to_pfn(*spte);
 551        if (*spte & shadow_accessed_mask)
 552                kvm_set_pfn_accessed(pfn);
 553        if (is_writeble_pte(*spte))
 554                kvm_release_pfn_dirty(pfn);
 555        else
 556                kvm_release_pfn_clean(pfn);
 557        rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], is_large_pte(*spte));
 558        if (!*rmapp) {
 559                printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
 560                BUG();
 561        } else if (!(*rmapp & 1)) {
 562                rmap_printk("rmap_remove:  %p %llx 1->0\n", spte, *spte);
 563                if ((u64 *)*rmapp != spte) {
 564                        printk(KERN_ERR "rmap_remove:  %p %llx 1->BUG\n",
 565                               spte, *spte);
 566                        BUG();
 567                }
 568                *rmapp = 0;
 569        } else {
 570                rmap_printk("rmap_remove:  %p %llx many->many\n", spte, *spte);
 571                desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
 572                prev_desc = NULL;
 573                while (desc) {
 574                        for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
 575                                if (desc->shadow_ptes[i] == spte) {
 576                                        rmap_desc_remove_entry(rmapp,
 577                                                               desc, i,
 578                                                               prev_desc);
 579                                        return;
 580                                }
 581                        prev_desc = desc;
 582                        desc = desc->more;
 583                }
 584                BUG();
 585        }
 586}
 587
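/*
 * Editorial note (added comment): rmap_next() iterates the rmap chain for
 * a gfn.  Passing spte == NULL returns the first shadow pte; passing the
 * previously returned pointer yields the next one, or NULL at the end, as
 * in the loops in rmap_write_protect() below.
 */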
 588static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
 589{
 590        struct kvm_rmap_desc *desc;
 591        struct kvm_rmap_desc *prev_desc;
 592        u64 *prev_spte;
 593        int i;
 594
 595        if (!*rmapp)
 596                return NULL;
 597        else if (!(*rmapp & 1)) {
 598                if (!spte)
 599                        return (u64 *)*rmapp;
 600                return NULL;
 601        }
 602        desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
 603        prev_desc = NULL;
 604        prev_spte = NULL;
 605        while (desc) {
 606                for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) {
 607                        if (prev_spte == spte)
 608                                return desc->shadow_ptes[i];
 609                        prev_spte = desc->shadow_ptes[i];
 610                }
 611                desc = desc->more;
 612        }
 613        return NULL;
 614}
 615
 616static void rmap_write_protect(struct kvm *kvm, u64 gfn)
 617{
 618        unsigned long *rmapp;
 619        u64 *spte;
 620        int write_protected = 0;
 621
 622        gfn = unalias_gfn(kvm, gfn);
 623        rmapp = gfn_to_rmap(kvm, gfn, 0);
 624
 625        spte = rmap_next(kvm, rmapp, NULL);
 626        while (spte) {
 627                BUG_ON(!spte);
 628                BUG_ON(!(*spte & PT_PRESENT_MASK));
 629                rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
 630                if (is_writeble_pte(*spte)) {
 631                        set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
 632                        write_protected = 1;
 633                }
 634                spte = rmap_next(kvm, rmapp, spte);
 635        }
 636        if (write_protected) {
 637                pfn_t pfn;
 638
 639                spte = rmap_next(kvm, rmapp, NULL);
 640                pfn = spte_to_pfn(*spte);
 641                kvm_set_pfn_dirty(pfn);
 642        }
 643
 644        /* check for huge page mappings */
 645        rmapp = gfn_to_rmap(kvm, gfn, 1);
 646        spte = rmap_next(kvm, rmapp, NULL);
 647        while (spte) {
 648                BUG_ON(!spte);
 649                BUG_ON(!(*spte & PT_PRESENT_MASK));
 650                BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
 651                pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
 652                if (is_writeble_pte(*spte)) {
 653                        rmap_remove(kvm, spte);
 654                        --kvm->stat.lpages;
 655                        set_shadow_pte(spte, shadow_trap_nonpresent_pte);
 656                        spte = NULL;
 657                        write_protected = 1;
 658                }
 659                spte = rmap_next(kvm, rmapp, spte);
 660        }
 661
 662        if (write_protected)
 663                kvm_flush_remote_tlbs(kvm);
 664}
 665
 666static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
 667{
 668        u64 *spte;
 669        int need_tlb_flush = 0;
 670
 671        while ((spte = rmap_next(kvm, rmapp, NULL))) {
 672                BUG_ON(!(*spte & PT_PRESENT_MASK));
 673                rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
 674                rmap_remove(kvm, spte);
 675                set_shadow_pte(spte, shadow_trap_nonpresent_pte);
 676                need_tlb_flush = 1;
 677        }
 678        return need_tlb_flush;
 679}
 680
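/*
 * Editorial note (added comment): kvm_handle_hva() is the common helper
 * behind the mmu-notifier entry points (kvm_unmap_hva() and kvm_age_hva()
 * below); it maps a host virtual address back to the memslot rmaps and
 * applies @handler to both the 4KB and the huge-page rmap for that gfn.
 */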
 681static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
 682                          int (*handler)(struct kvm *kvm, unsigned long *rmapp))
 683{
 684        int i;
 685        int retval = 0;
 686
 687        /*
 688          * If mmap_sem isn't taken, we can walk the memslots with only
 689          * the mmu_lock held, by skipping over slots with userspace_addr == 0.
 690         */
 691        for (i = 0; i < kvm->nmemslots; i++) {
 692                struct kvm_memory_slot *memslot = &kvm->memslots[i];
 693                unsigned long start = memslot->userspace_addr;
 694                unsigned long end;
 695
 696                /* mmu_lock protects userspace_addr */
 697                if (!start)
 698                        continue;
 699
 700                end = start + (memslot->npages << PAGE_SHIFT);
 701                if (hva >= start && hva < end) {
 702                        gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
 703                        retval |= handler(kvm, &memslot->rmap[gfn_offset]);
 704                        retval |= handler(kvm,
 705                                          &memslot->lpage_info[
 706                                                  gfn_offset /
 707                                                  KVM_PAGES_PER_HPAGE].rmap_pde);
 708                }
 709        }
 710
 711        return retval;
 712}
 713
 714int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
 715{
 716        return kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
 717}
 718
 719static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
 720{
 721        u64 *spte;
 722        int young = 0;
 723
 724        /* always return old for EPT */
 725        if (!shadow_accessed_mask)
 726                return 0;
 727
 728        spte = rmap_next(kvm, rmapp, NULL);
 729        while (spte) {
 730                int _young;
 731                u64 _spte = *spte;
 732                BUG_ON(!(_spte & PT_PRESENT_MASK));
 733                _young = _spte & PT_ACCESSED_MASK;
 734                if (_young) {
 735                        young = 1;
 736                        clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
 737                }
 738                spte = rmap_next(kvm, rmapp, spte);
 739        }
 740        return young;
 741}
 742
 743int kvm_age_hva(struct kvm *kvm, unsigned long hva)
 744{
 745        return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
 746}
 747
 748#ifdef MMU_DEBUG
 749static int is_empty_shadow_page(u64 *spt)
 750{
 751        u64 *pos;
 752        u64 *end;
 753
 754        for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
 755                if (is_shadow_present_pte(*pos)) {
 756                        printk(KERN_ERR "%s: %p %llx\n", __func__,
 757                               pos, *pos);
 758                        return 0;
 759                }
 760        return 1;
 761}
 762#endif
 763
 764static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 765{
 766        ASSERT(is_empty_shadow_page(sp->spt));
 767        list_del(&sp->link);
 768        __free_page(virt_to_page(sp->spt));
 769        __free_page(virt_to_page(sp->gfns));
 770        kfree(sp);
 771        ++kvm->arch.n_free_mmu_pages;
 772}
 773
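/*
 * Editorial example (not from the original source): the hash is just the
 * low KVM_MMU_HASH_SHIFT bits of the gfn; e.g. with a shift of 10 there
 * are 1024 buckets and gfn 0x12345 lands in bucket 0x345.
 */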
 774static unsigned kvm_page_table_hashfn(gfn_t gfn)
 775{
 776        return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
 777}
 778
 779static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
 780                                               u64 *parent_pte)
 781{
 782        struct kvm_mmu_page *sp;
 783
 784        sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
 785        sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
 786        sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
 787        set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
 788        list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
 789        ASSERT(is_empty_shadow_page(sp->spt));
 790        sp->slot_bitmap = 0;
 791        sp->multimapped = 0;
 792        sp->parent_pte = parent_pte;
 793        --vcpu->kvm->arch.n_free_mmu_pages;
 794        return sp;
 795}
 796
 797static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
 798                                    struct kvm_mmu_page *sp, u64 *parent_pte)
 799{
 800        struct kvm_pte_chain *pte_chain;
 801        struct hlist_node *node;
 802        int i;
 803
 804        if (!parent_pte)
 805                return;
 806        if (!sp->multimapped) {
 807                u64 *old = sp->parent_pte;
 808
 809                if (!old) {
 810                        sp->parent_pte = parent_pte;
 811                        return;
 812                }
 813                sp->multimapped = 1;
 814                pte_chain = mmu_alloc_pte_chain(vcpu);
 815                INIT_HLIST_HEAD(&sp->parent_ptes);
 816                hlist_add_head(&pte_chain->link, &sp->parent_ptes);
 817                pte_chain->parent_ptes[0] = old;
 818        }
 819        hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
 820                if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
 821                        continue;
 822                for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
 823                        if (!pte_chain->parent_ptes[i]) {
 824                                pte_chain->parent_ptes[i] = parent_pte;
 825                                return;
 826                        }
 827        }
 828        pte_chain = mmu_alloc_pte_chain(vcpu);
 829        BUG_ON(!pte_chain);
 830        hlist_add_head(&pte_chain->link, &sp->parent_ptes);
 831        pte_chain->parent_ptes[0] = parent_pte;
 832}
 833
 834static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
 835                                       u64 *parent_pte)
 836{
 837        struct kvm_pte_chain *pte_chain;
 838        struct hlist_node *node;
 839        int i;
 840
 841        if (!sp->multimapped) {
 842                BUG_ON(sp->parent_pte != parent_pte);
 843                sp->parent_pte = NULL;
 844                return;
 845        }
 846        hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
 847                for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
 848                        if (!pte_chain->parent_ptes[i])
 849                                break;
 850                        if (pte_chain->parent_ptes[i] != parent_pte)
 851                                continue;
 852                        while (i + 1 < NR_PTE_CHAIN_ENTRIES
 853                                && pte_chain->parent_ptes[i + 1]) {
 854                                pte_chain->parent_ptes[i]
 855                                        = pte_chain->parent_ptes[i + 1];
 856                                ++i;
 857                        }
 858                        pte_chain->parent_ptes[i] = NULL;
 859                        if (i == 0) {
 860                                hlist_del(&pte_chain->link);
 861                                mmu_free_pte_chain(pte_chain);
 862                                if (hlist_empty(&sp->parent_ptes)) {
 863                                        sp->multimapped = 0;
 864                                        sp->parent_pte = NULL;
 865                                }
 866                        }
 867                        return;
 868                }
 869        BUG();
 870}
 871
 872
 873static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 874                            mmu_parent_walk_fn fn)
 875{
 876        struct kvm_pte_chain *pte_chain;
 877        struct hlist_node *node;
 878        struct kvm_mmu_page *parent_sp;
 879        int i;
 880
 881        if (!sp->multimapped && sp->parent_pte) {
 882                parent_sp = page_header(__pa(sp->parent_pte));
 883                fn(vcpu, parent_sp);
 884                mmu_parent_walk(vcpu, parent_sp, fn);
 885                return;
 886        }
 887        hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
 888                for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
 889                        if (!pte_chain->parent_ptes[i])
 890                                break;
 891                        parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
 892                        fn(vcpu, parent_sp);
 893                        mmu_parent_walk(vcpu, parent_sp, fn);
 894                }
 895}
 896
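/*
 * Editorial note (added comment): the "unsync" machinery below lets a
 * shadow page go out of sync with the guest page table it shadows, so the
 * guest can update its own page tables without a write-protection fault
 * on every store.  Parents record which children may be stale in
 * unsync_child_bitmap, and pages are brought back in sync lazily via
 * kvm_sync_page()/mmu_sync_children().
 */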
 897static void kvm_mmu_update_unsync_bitmap(u64 *spte)
 898{
 899        unsigned int index;
 900        struct kvm_mmu_page *sp = page_header(__pa(spte));
 901
 902        index = spte - sp->spt;
 903        __set_bit(index, sp->unsync_child_bitmap);
 904        sp->unsync_children = 1;
 905}
 906
 907static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
 908{
 909        struct kvm_pte_chain *pte_chain;
 910        struct hlist_node *node;
 911        int i;
 912
 913        if (!sp->parent_pte)
 914                return;
 915
 916        if (!sp->multimapped) {
 917                kvm_mmu_update_unsync_bitmap(sp->parent_pte);
 918                return;
 919        }
 920
 921        hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
 922                for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
 923                        if (!pte_chain->parent_ptes[i])
 924                                break;
 925                        kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]);
 926                }
 927}
 928
 929static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 930{
 931        sp->unsync_children = 1;
 932        kvm_mmu_update_parents_unsync(sp);
 933        return 1;
 934}
 935
 936static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu,
 937                                        struct kvm_mmu_page *sp)
 938{
 939        mmu_parent_walk(vcpu, sp, unsync_walk_fn);
 940        kvm_mmu_update_parents_unsync(sp);
 941}
 942
 943static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
 944                                    struct kvm_mmu_page *sp)
 945{
 946        int i;
 947
 948        for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
 949                sp->spt[i] = shadow_trap_nonpresent_pte;
 950}
 951
 952static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
 953                               struct kvm_mmu_page *sp)
 954{
 955        return 1;
 956}
 957
 958static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
 959{
 960}
 961
 962#define for_each_unsync_children(bitmap, idx)           \
 963        for (idx = find_first_bit(bitmap, 512);         \
 964             idx < 512;                                 \
 965             idx = find_next_bit(bitmap, 512, idx+1))
 966
 967static int mmu_unsync_walk(struct kvm_mmu_page *sp,
 968                           struct kvm_unsync_walk *walker)
 969{
 970        int i, ret;
 971
 972        if (!sp->unsync_children)
 973                return 0;
 974
 975        for_each_unsync_children(sp->unsync_child_bitmap, i) {
 976                u64 ent = sp->spt[i];
 977
 978                if (is_shadow_present_pte(ent)) {
 979                        struct kvm_mmu_page *child;
 980                        child = page_header(ent & PT64_BASE_ADDR_MASK);
 981
 982                        if (child->unsync_children) {
 983                                ret = mmu_unsync_walk(child, walker);
 984                                if (ret)
 985                                        return ret;
 986                                __clear_bit(i, sp->unsync_child_bitmap);
 987                        }
 988
 989                        if (child->unsync) {
 990                                ret = walker->entry(child, walker);
 991                                __clear_bit(i, sp->unsync_child_bitmap);
 992                                if (ret)
 993                                        return ret;
 994                        }
 995                }
 996        }
 997
 998        if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
 999                sp->unsync_children = 0;
1000
1001        return 0;
1002}
1003
1004static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
1005{
1006        unsigned index;
1007        struct hlist_head *bucket;
1008        struct kvm_mmu_page *sp;
1009        struct hlist_node *node;
1010
1011        pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
1012        index = kvm_page_table_hashfn(gfn);
1013        bucket = &kvm->arch.mmu_page_hash[index];
1014        hlist_for_each_entry(sp, node, bucket, hash_link)
1015                if (sp->gfn == gfn && !sp->role.metaphysical
1016                    && !sp->role.invalid) {
1017                        pgprintk("%s: found role %x\n",
1018                                 __func__, sp->role.word);
1019                        return sp;
1020                }
1021        return NULL;
1022}
1023
1024static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1025{
1026        WARN_ON(!sp->unsync);
1027        sp->unsync = 0;
1028        --kvm->stat.mmu_unsync;
1029}
1030
1031static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
1032
1033static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1034{
1035        if (sp->role.glevels != vcpu->arch.mmu.root_level) {
1036                kvm_mmu_zap_page(vcpu->kvm, sp);
1037                return 1;
1038        }
1039
1040        rmap_write_protect(vcpu->kvm, sp->gfn);
1041        kvm_unlink_unsync_page(vcpu->kvm, sp);
1042        if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
1043                kvm_mmu_zap_page(vcpu->kvm, sp);
1044                return 1;
1045        }
1046
1047        kvm_mmu_flush_tlb(vcpu);
1048        return 0;
1049}
1050
1051struct sync_walker {
1052        struct kvm_vcpu *vcpu;
1053        struct kvm_unsync_walk walker;
1054};
1055
1056static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
1057{
1058        struct sync_walker *sync_walk = container_of(walk, struct sync_walker,
1059                                                     walker);
1060        struct kvm_vcpu *vcpu = sync_walk->vcpu;
1061
1062        kvm_sync_page(vcpu, sp);
1063        return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock));
1064}
1065
1066static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1067{
1068        struct sync_walker walker = {
1069                .walker = { .entry = mmu_sync_fn, },
1070                .vcpu = vcpu,
1071        };
1072
1073        while (mmu_unsync_walk(sp, &walker.walker))
1074                cond_resched_lock(&vcpu->kvm->mmu_lock);
1075}
1076
1077static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1078                                             gfn_t gfn,
1079                                             gva_t gaddr,
1080                                             unsigned level,
1081                                             int metaphysical,
1082                                             unsigned access,
1083                                             u64 *parent_pte)
1084{
1085        union kvm_mmu_page_role role;
1086        unsigned index;
1087        unsigned quadrant;
1088        struct hlist_head *bucket;
1089        struct kvm_mmu_page *sp;
1090        struct hlist_node *node, *tmp;
1091
1092        role.word = 0;
1093        role.glevels = vcpu->arch.mmu.root_level;
1094        role.level = level;
1095        role.metaphysical = metaphysical;
1096        role.access = access;
1097        if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
1098                quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
1099                quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
1100                role.quadrant = quadrant;
1101        }
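        /*
         * Editorial note (added comment): with a 32-bit non-PAE guest a
         * guest page table has 1024 entries but a shadow page holds only
         * 512, so one guest page is shadowed by several shadow pages;
         * role.quadrant records which 512-entry slice (2 choices at level
         * 1, 4 at level 2) this shadow page covers.
         */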
1102        pgprintk("%s: looking gfn %lx role %x\n", __func__,
1103                 gfn, role.word);
1104        index = kvm_page_table_hashfn(gfn);
1105        bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1106        hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
1107                if (sp->gfn == gfn) {
1108                        if (sp->unsync)
1109                                if (kvm_sync_page(vcpu, sp))
1110                                        continue;
1111
1112                        if (sp->role.word != role.word)
1113                                continue;
1114
1115                        mmu_page_add_parent_pte(vcpu, sp, parent_pte);
1116                        if (sp->unsync_children) {
1117                                set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
1118                                kvm_mmu_mark_parents_unsync(vcpu, sp);
1119                        }
1120                        pgprintk("%s: found\n", __func__);
1121                        return sp;
1122                }
1123        ++vcpu->kvm->stat.mmu_cache_miss;
1124        sp = kvm_mmu_alloc_page(vcpu, parent_pte);
1125        if (!sp)
1126                return sp;
1127        pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word);
1128        sp->gfn = gfn;
1129        sp->role = role;
1130        hlist_add_head(&sp->hash_link, bucket);
1131        if (!metaphysical) {
1132                rmap_write_protect(vcpu->kvm, gfn);
1133                account_shadowed(vcpu->kvm, gfn);
1134        }
1135        if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
1136                vcpu->arch.mmu.prefetch_page(vcpu, sp);
1137        else
1138                nonpaging_prefetch_page(vcpu, sp);
1139        return sp;
1140}
1141
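/*
 * Editorial note (added comment): walk_shadow() descends the shadow page
 * table from the root towards PT_PAGE_TABLE_LEVEL, invoking
 * walker->entry() with the spte for each level; a non-zero return from
 * the callback stops the walk (see direct_map_entry() below for a user).
 */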
1142static int walk_shadow(struct kvm_shadow_walk *walker,
1143                       struct kvm_vcpu *vcpu, u64 addr)
1144{
1145        hpa_t shadow_addr;
1146        int level;
1147        int r;
1148        u64 *sptep;
1149        unsigned index;
1150
1151        shadow_addr = vcpu->arch.mmu.root_hpa;
1152        level = vcpu->arch.mmu.shadow_root_level;
1153        if (level == PT32E_ROOT_LEVEL) {
1154                shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
1155                shadow_addr &= PT64_BASE_ADDR_MASK;
1156                --level;
1157        }
1158
1159        while (level >= PT_PAGE_TABLE_LEVEL) {
1160                index = SHADOW_PT_INDEX(addr, level);
1161                sptep = ((u64 *)__va(shadow_addr)) + index;
1162                r = walker->entry(walker, vcpu, addr, sptep, level);
1163                if (r)
1164                        return r;
1165                shadow_addr = *sptep & PT64_BASE_ADDR_MASK;
1166                --level;
1167        }
1168        return 0;
1169}
1170
1171static void kvm_mmu_page_unlink_children(struct kvm *kvm,
1172                                         struct kvm_mmu_page *sp)
1173{
1174        unsigned i;
1175        u64 *pt;
1176        u64 ent;
1177
1178        pt = sp->spt;
1179
1180        if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
1181                for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1182                        if (is_shadow_present_pte(pt[i]))
1183                                rmap_remove(kvm, &pt[i]);
1184                        pt[i] = shadow_trap_nonpresent_pte;
1185                }
1186                return;
1187        }
1188
1189        for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1190                ent = pt[i];
1191
1192                if (is_shadow_present_pte(ent)) {
1193                        if (!is_large_pte(ent)) {
1194                                ent &= PT64_BASE_ADDR_MASK;
1195                                mmu_page_remove_parent_pte(page_header(ent),
1196                                                           &pt[i]);
1197                        } else {
1198                                --kvm->stat.lpages;
1199                                rmap_remove(kvm, &pt[i]);
1200                        }
1201                }
1202                pt[i] = shadow_trap_nonpresent_pte;
1203        }
1204}
1205
1206static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
1207{
1208        mmu_page_remove_parent_pte(sp, parent_pte);
1209}
1210
1211static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
1212{
1213        int i;
1214
1215        for (i = 0; i < KVM_MAX_VCPUS; ++i)
1216                if (kvm->vcpus[i])
1217                        kvm->vcpus[i]->arch.last_pte_updated = NULL;
1218}
1219
1220static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
1221{
1222        u64 *parent_pte;
1223
1224        while (sp->multimapped || sp->parent_pte) {
1225                if (!sp->multimapped)
1226                        parent_pte = sp->parent_pte;
1227                else {
1228                        struct kvm_pte_chain *chain;
1229
1230                        chain = container_of(sp->parent_ptes.first,
1231                                             struct kvm_pte_chain, link);
1232                        parent_pte = chain->parent_ptes[0];
1233                }
1234                BUG_ON(!parent_pte);
1235                kvm_mmu_put_page(sp, parent_pte);
1236                set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
1237        }
1238}
1239
1240struct zap_walker {
1241        struct kvm_unsync_walk walker;
1242        struct kvm *kvm;
1243        int zapped;
1244};
1245
1246static int mmu_zap_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
1247{
1248        struct zap_walker *zap_walk = container_of(walk, struct zap_walker,
1249                                                     walker);
1250        kvm_mmu_zap_page(zap_walk->kvm, sp);
1251        zap_walk->zapped = 1;
1252        return 0;
1253}
1254
1255static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp)
1256{
1257        struct zap_walker walker = {
1258                .walker = { .entry = mmu_zap_fn, },
1259                .kvm = kvm,
1260                .zapped = 0,
1261        };
1262
1263        if (sp->role.level == PT_PAGE_TABLE_LEVEL)
1264                return 0;
1265        mmu_unsync_walk(sp, &walker.walker);
1266        return walker.zapped;
1267}
1268
1269static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1270{
1271        int ret;
1272        ++kvm->stat.mmu_shadow_zapped;
1273        ret = mmu_zap_unsync_children(kvm, sp);
1274        kvm_mmu_page_unlink_children(kvm, sp);
1275        kvm_mmu_unlink_parents(kvm, sp);
1276        kvm_flush_remote_tlbs(kvm);
1277        if (!sp->role.invalid && !sp->role.metaphysical)
1278                unaccount_shadowed(kvm, sp->gfn);
1279        if (sp->unsync)
1280                kvm_unlink_unsync_page(kvm, sp);
1281        if (!sp->root_count) {
1282                hlist_del(&sp->hash_link);
1283                kvm_mmu_free_page(kvm, sp);
1284        } else {
1285                sp->role.invalid = 1;
1286                list_move(&sp->link, &kvm->arch.active_mmu_pages);
1287                kvm_reload_remote_mmus(kvm);
1288        }
1289        kvm_mmu_reset_last_pte_updated(kvm);
1290        return ret;
1291}
1292
1293/*
1294 * Change the number of mmu pages allocated to the vm.
1295 * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock.
1296 */
1297void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
1298{
1299        /*
1300         * If we set the number of mmu pages to be smaller than the
1301         * number of active pages, we must free some mmu pages before we
1302         * can change the value.
1303         */
1304
1305        if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) >
1306            kvm_nr_mmu_pages) {
1307                int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages
1308                                       - kvm->arch.n_free_mmu_pages;
1309
1310                while (n_used_mmu_pages > kvm_nr_mmu_pages) {
1311                        struct kvm_mmu_page *page;
1312
1313                        page = container_of(kvm->arch.active_mmu_pages.prev,
1314                                            struct kvm_mmu_page, link);
1315                        kvm_mmu_zap_page(kvm, page);
1316                        n_used_mmu_pages--;
1317                }
1318                kvm->arch.n_free_mmu_pages = 0;
1319        }
1320        else
1321                kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
1322                                         - kvm->arch.n_alloc_mmu_pages;
1323
1324        kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
1325}
1326
1327static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1328{
1329        unsigned index;
1330        struct hlist_head *bucket;
1331        struct kvm_mmu_page *sp;
1332        struct hlist_node *node, *n;
1333        int r;
1334
1335        pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
1336        r = 0;
1337        index = kvm_page_table_hashfn(gfn);
1338        bucket = &kvm->arch.mmu_page_hash[index];
1339        hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
1340                if (sp->gfn == gfn && !sp->role.metaphysical) {
1341                        pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
1342                                 sp->role.word);
1343                        r = 1;
1344                        if (kvm_mmu_zap_page(kvm, sp))
1345                                n = bucket->first;
1346                }
1347        return r;
1348}
1349
1350static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1351{
1352        struct kvm_mmu_page *sp;
1353
1354        while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) {
1355                pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word);
1356                kvm_mmu_zap_page(kvm, sp);
1357        }
1358}
1359
1360static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
1361{
1362        int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn));
1363        struct kvm_mmu_page *sp = page_header(__pa(pte));
1364
1365        __set_bit(slot, &sp->slot_bitmap);
1366}
1367
1368static void mmu_convert_notrap(struct kvm_mmu_page *sp)
1369{
1370        int i;
1371        u64 *pt = sp->spt;
1372
1373        if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
1374                return;
1375
1376        for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1377                if (pt[i] == shadow_notrap_nonpresent_pte)
1378                        set_shadow_pte(&pt[i], shadow_trap_nonpresent_pte);
1379        }
1380}
1381
1382struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
1383{
1384        struct page *page;
1385
1386        gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
1387
1388        if (gpa == UNMAPPED_GVA)
1389                return NULL;
1390
1391        page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1392
1393        return page;
1394}
1395
1396static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1397{
1398        unsigned index;
1399        struct hlist_head *bucket;
1400        struct kvm_mmu_page *s;
1401        struct hlist_node *node, *n;
1402
1403        index = kvm_page_table_hashfn(sp->gfn);
1404        bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1405        /* don't unsync if pagetable is shadowed with multiple roles */
1406        hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
1407                if (s->gfn != sp->gfn || s->role.metaphysical)
1408                        continue;
1409                if (s->role.word != sp->role.word)
1410                        return 1;
1411        }
1412        kvm_mmu_mark_parents_unsync(vcpu, sp);
1413        ++vcpu->kvm->stat.mmu_unsync;
1414        sp->unsync = 1;
1415        mmu_convert_notrap(sp);
1416        return 0;
1417}
1418
1419static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
1420                                  bool can_unsync)
1421{
1422        struct kvm_mmu_page *shadow;
1423
1424        shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
1425        if (shadow) {
1426                if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
1427                        return 1;
1428                if (shadow->unsync)
1429                        return 0;
1430                if (can_unsync && oos_shadow)
1431                        return kvm_unsync_page(vcpu, shadow);
1432                return 1;
1433        }
1434        return 0;
1435}
1436
1437static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1438                    unsigned pte_access, int user_fault,
1439                    int write_fault, int dirty, int largepage,
1440                    gfn_t gfn, pfn_t pfn, bool speculative,
1441                    bool can_unsync)
1442{
1443        u64 spte;
1444        int ret = 0;
1445        /*
1446         * We don't set the accessed bit, since we sometimes want to see
1447         * whether the guest actually used the pte (in order to detect
1448         * demand paging).
1449         */
1450        spte = shadow_base_present_pte | shadow_dirty_mask;
1451        if (!speculative)
1452                spte |= shadow_accessed_mask;
1453        if (!dirty)
1454                pte_access &= ~ACC_WRITE_MASK;
1455        if (pte_access & ACC_EXEC_MASK)
1456                spte |= shadow_x_mask;
1457        else
1458                spte |= shadow_nx_mask;
1459        if (pte_access & ACC_USER_MASK)
1460                spte |= shadow_user_mask;
1461        if (largepage)
1462                spte |= PT_PAGE_SIZE_MASK;
1463
1464        spte |= (u64)pfn << PAGE_SHIFT;
1465
1466        if ((pte_access & ACC_WRITE_MASK)
1467            || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
1468
1469                if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) {
1470                        ret = 1;
1471                        spte = shadow_trap_nonpresent_pte;
1472                        goto set_pte;
1473                }
1474
1475                spte |= PT_WRITABLE_MASK;
1476
1477                if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
1478                        pgprintk("%s: found shadow page for %lx, marking ro\n",
1479                                 __func__, gfn);
1480                        ret = 1;
1481                        pte_access &= ~ACC_WRITE_MASK;
1482                        if (is_writeble_pte(spte))
1483                                spte &= ~PT_WRITABLE_MASK;
1484                }
1485        }
1486
1487        if (pte_access & ACC_WRITE_MASK)
1488                mark_page_dirty(vcpu->kvm, gfn);
1489
1490set_pte:
1491        set_shadow_pte(shadow_pte, spte);
1492        return ret;
1493}
1494
1495static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1496                         unsigned pt_access, unsigned pte_access,
1497                         int user_fault, int write_fault, int dirty,
1498                         int *ptwrite, int largepage, gfn_t gfn,
1499                         pfn_t pfn, bool speculative)
1500{
1501        int was_rmapped = 0;
1502        int was_writeble = is_writeble_pte(*shadow_pte);
1503
1504        pgprintk("%s: spte %llx access %x write_fault %d"
1505                 " user_fault %d gfn %lx\n",
1506                 __func__, *shadow_pte, pt_access,
1507                 write_fault, user_fault, gfn);
1508
1509        if (is_rmap_pte(*shadow_pte)) {
1510                /*
1511                 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
1512                 * the parent of the now unreachable PTE.
1513                 */
1514                if (largepage && !is_large_pte(*shadow_pte)) {
1515                        struct kvm_mmu_page *child;
1516                        u64 pte = *shadow_pte;
1517
1518                        child = page_header(pte & PT64_BASE_ADDR_MASK);
1519                        mmu_page_remove_parent_pte(child, shadow_pte);
1520                } else if (pfn != spte_to_pfn(*shadow_pte)) {
1521                        pgprintk("hfn old %lx new %lx\n",
1522                                 spte_to_pfn(*shadow_pte), pfn);
1523                        rmap_remove(vcpu->kvm, shadow_pte);
1524                } else {
1525                        if (largepage)
1526                                was_rmapped = is_large_pte(*shadow_pte);
1527                        else
1528                                was_rmapped = 1;
1529                }
1530        }
1531        if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
1532                      dirty, largepage, gfn, pfn, speculative, true)) {
1533                if (write_fault)
1534                        *ptwrite = 1;
1535                kvm_x86_ops->tlb_flush(vcpu);
1536        }
1537
1538        pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte);
1539        pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
1540                 is_large_pte(*shadow_pte)? "2MB" : "4kB",
1541                 is_present_pte(*shadow_pte)?"RW":"R", gfn,
1542                 *shadow_pte, shadow_pte);
1543        if (!was_rmapped && is_large_pte(*shadow_pte))
1544                ++vcpu->kvm->stat.lpages;
1545
1546        page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
1547        if (!was_rmapped) {
1548                rmap_add(vcpu, shadow_pte, gfn, largepage);
1549                if (!is_rmap_pte(*shadow_pte))
1550                        kvm_release_pfn_clean(pfn);
1551        } else {
1552                if (was_writeble)
1553                        kvm_release_pfn_dirty(pfn);
1554                else
1555                        kvm_release_pfn_clean(pfn);
1556        }
1557        if (speculative) {
1558                vcpu->arch.last_pte_updated = shadow_pte;
1559                vcpu->arch.last_pte_gfn = gfn;
1560        }
1561}
1562
1563static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
1564{
1565}
1566
1567struct direct_shadow_walk {
1568        struct kvm_shadow_walk walker;
1569        pfn_t pfn;
1570        int write;
1571        int largepage;
1572        int pt_write;
1573};
1574
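/*
 * Per-level callback for the shadow walk driven by __direct_map() below.
 * walk_shadow() is expected to invoke ->entry() once for each level of the
 * shadow page table covering @addr; returning 1 ends the walk with the
 * final pte installed, 0 descends to the next level, and a negative value
 * aborts with an error.
 */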
1575static int direct_map_entry(struct kvm_shadow_walk *_walk,
1576                            struct kvm_vcpu *vcpu,
1577                            u64 addr, u64 *sptep, int level)
1578{
1579        struct direct_shadow_walk *walk =
1580                container_of(_walk, struct direct_shadow_walk, walker);
1581        struct kvm_mmu_page *sp;
1582        gfn_t pseudo_gfn;
1583        gfn_t gfn = addr >> PAGE_SHIFT;
1584
1585        if (level == PT_PAGE_TABLE_LEVEL
1586            || (walk->largepage && level == PT_DIRECTORY_LEVEL)) {
1587                mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL,
1588                             0, walk->write, 1, &walk->pt_write,
1589                             walk->largepage, gfn, walk->pfn, false);
1590                ++vcpu->stat.pf_fixed;
1591                return 1;
1592        }
1593
1594        if (*sptep == shadow_trap_nonpresent_pte) {
1595                pseudo_gfn = (addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
1596                sp = kvm_mmu_get_page(vcpu, pseudo_gfn, (gva_t)addr, level - 1,
1597                                      1, ACC_ALL, sptep);
1598                if (!sp) {
1599                        pgprintk("%s: ENOMEM\n", __func__);
1600                        kvm_release_pfn_clean(walk->pfn);
1601                        return -ENOMEM;
1602                }
1603
1604                set_shadow_pte(sptep,
1605                               __pa(sp->spt)
1606                               | PT_PRESENT_MASK | PT_WRITABLE_MASK
1607                               | shadow_user_mask | shadow_x_mask);
1608        }
1609        return 0;
1610}
1611
1612static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1613                        int largepage, gfn_t gfn, pfn_t pfn)
1614{
1615        int r;
1616        struct direct_shadow_walk walker = {
1617                .walker = { .entry = direct_map_entry, },
1618                .pfn = pfn,
1619                .largepage = largepage,
1620                .write = write,
1621                .pt_write = 0,
1622        };
1623
1624        r = walk_shadow(&walker.walker, vcpu, gfn << PAGE_SHIFT);
1625        if (r < 0)
1626                return r;
1627        return walker.pt_write;
1628}
1629
1630static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1631{
1632        int r;
1633        int largepage = 0;
1634        pfn_t pfn;
1635        unsigned long mmu_seq;
1636
1637        if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
1638                gfn &= ~(KVM_PAGES_PER_HPAGE-1);
1639                largepage = 1;
1640        }
1641
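        /*
         * The pfn is looked up outside mmu_lock (gfn_to_pfn may sleep), so
         * sample the mmu notifier sequence first; mmu_notifier_retry()
         * rechecks it under the lock and makes us bail out and refault if
         * an invalidation raced with the lookup.
         */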
1642        mmu_seq = vcpu->kvm->mmu_notifier_seq;
1643        smp_rmb();
1644        pfn = gfn_to_pfn(vcpu->kvm, gfn);
1645
1646        /* mmio */
1647        if (is_error_pfn(pfn)) {
1648                kvm_release_pfn_clean(pfn);
1649                return 1;
1650        }
1651
1652        spin_lock(&vcpu->kvm->mmu_lock);
1653        if (mmu_notifier_retry(vcpu, mmu_seq))
1654                goto out_unlock;
1655        kvm_mmu_free_some_pages(vcpu);
1656        r = __direct_map(vcpu, v, write, largepage, gfn, pfn);
1657        spin_unlock(&vcpu->kvm->mmu_lock);
1658
1659
1660        return r;
1661
1662out_unlock:
1663        spin_unlock(&vcpu->kvm->mmu_lock);
1664        kvm_release_pfn_clean(pfn);
1665        return 0;
1666}
1667
1668
1669static void mmu_free_roots(struct kvm_vcpu *vcpu)
1670{
1671        int i;
1672        struct kvm_mmu_page *sp;
1673
1674        if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
1675                return;
1676        spin_lock(&vcpu->kvm->mmu_lock);
1677        if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
1678                hpa_t root = vcpu->arch.mmu.root_hpa;
1679
1680                sp = page_header(root);
1681                --sp->root_count;
1682                if (!sp->root_count && sp->role.invalid)
1683                        kvm_mmu_zap_page(vcpu->kvm, sp);
1684                vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1685                spin_unlock(&vcpu->kvm->mmu_lock);
1686                return;
1687        }
1688        for (i = 0; i < 4; ++i) {
1689                hpa_t root = vcpu->arch.mmu.pae_root[i];
1690
1691                if (root) {
1692                        root &= PT64_BASE_ADDR_MASK;
1693                        sp = page_header(root);
1694                        --sp->root_count;
1695                        if (!sp->root_count && sp->role.invalid)
1696                                kvm_mmu_zap_page(vcpu->kvm, sp);
1697                }
1698                vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
1699        }
1700        spin_unlock(&vcpu->kvm->mmu_lock);
1701        vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1702}
1703
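/*
 * Allocate the shadow root(s) for the current mode: a single root when the
 * shadow page table is 4 levels deep, otherwise four PAE roots, one per
 * 1GB region (hence the i << 30 pseudo-addresses), gathered in the
 * preallocated pae_root page.  Roots are "metaphysical" (not backed by a
 * guest page table) when TDP is enabled or the guest is not paging.
 */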
1704static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
1705{
1706        int i;
1707        gfn_t root_gfn;
1708        struct kvm_mmu_page *sp;
1709        int metaphysical = 0;
1710
1711        root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
1712
1713        if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
1714                hpa_t root = vcpu->arch.mmu.root_hpa;
1715
1716                ASSERT(!VALID_PAGE(root));
1717                if (tdp_enabled)
1718                        metaphysical = 1;
1719                sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
1720                                      PT64_ROOT_LEVEL, metaphysical,
1721                                      ACC_ALL, NULL);
1722                root = __pa(sp->spt);
1723                ++sp->root_count;
1724                vcpu->arch.mmu.root_hpa = root;
1725                return;
1726        }
1727        metaphysical = !is_paging(vcpu);
1728        if (tdp_enabled)
1729                metaphysical = 1;
1730        for (i = 0; i < 4; ++i) {
1731                hpa_t root = vcpu->arch.mmu.pae_root[i];
1732
1733                ASSERT(!VALID_PAGE(root));
1734                if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
1735                        if (!is_present_pte(vcpu->arch.pdptrs[i])) {
1736                                vcpu->arch.mmu.pae_root[i] = 0;
1737                                continue;
1738                        }
1739                        root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT;
1740                } else if (vcpu->arch.mmu.root_level == 0)
1741                        root_gfn = 0;
1742                sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
1743                                      PT32_ROOT_LEVEL, metaphysical,
1744                                      ACC_ALL, NULL);
1745                root = __pa(sp->spt);
1746                ++sp->root_count;
1747                vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
1748        }
1749        vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
1750}
1751
1752static void mmu_sync_roots(struct kvm_vcpu *vcpu)
1753{
1754        int i;
1755        struct kvm_mmu_page *sp;
1756
1757        if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
1758                return;
1759        if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
1760                hpa_t root = vcpu->arch.mmu.root_hpa;
1761                sp = page_header(root);
1762                mmu_sync_children(vcpu, sp);
1763                return;
1764        }
1765        for (i = 0; i < 4; ++i) {
1766                hpa_t root = vcpu->arch.mmu.pae_root[i];
1767
1768                if (root) {
1769                        root &= PT64_BASE_ADDR_MASK;
1770                        sp = page_header(root);
1771                        mmu_sync_children(vcpu, sp);
1772                }
1773        }
1774}
1775
1776void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
1777{
1778        spin_lock(&vcpu->kvm->mmu_lock);
1779        mmu_sync_roots(vcpu);
1780        spin_unlock(&vcpu->kvm->mmu_lock);
1781}
1782
1783static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
1784{
1785        return vaddr;
1786}
1787
1788static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
1789                                u32 error_code)
1790{
1791        gfn_t gfn;
1792        int r;
1793
1794        pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
1795        r = mmu_topup_memory_caches(vcpu);
1796        if (r)
1797                return r;
1798
1799        ASSERT(vcpu);
1800        ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
1801
1802        gfn = gva >> PAGE_SHIFT;
1803
1804        return nonpaging_map(vcpu, gva & PAGE_MASK,
1805                             error_code & PFERR_WRITE_MASK, gfn);
1806}
1807
1808static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
1809                                u32 error_code)
1810{
1811        pfn_t pfn;
1812        int r;
1813        int largepage = 0;
1814        gfn_t gfn = gpa >> PAGE_SHIFT;
1815        unsigned long mmu_seq;
1816
1817        ASSERT(vcpu);
1818        ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
1819
1820        r = mmu_topup_memory_caches(vcpu);
1821        if (r)
1822                return r;
1823
1824        if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
1825                gfn &= ~(KVM_PAGES_PER_HPAGE-1);
1826                largepage = 1;
1827        }
1828        mmu_seq = vcpu->kvm->mmu_notifier_seq;
1829        smp_rmb();
1830        pfn = gfn_to_pfn(vcpu->kvm, gfn);
1831        if (is_error_pfn(pfn)) {
1832                kvm_release_pfn_clean(pfn);
1833                return 1;
1834        }
1835        spin_lock(&vcpu->kvm->mmu_lock);
1836        if (mmu_notifier_retry(vcpu, mmu_seq))
1837                goto out_unlock;
1838        kvm_mmu_free_some_pages(vcpu);
1839        r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
1840                         largepage, gfn, pfn);
1841        spin_unlock(&vcpu->kvm->mmu_lock);
1842
1843        return r;
1844
1845out_unlock:
1846        spin_unlock(&vcpu->kvm->mmu_lock);
1847        kvm_release_pfn_clean(pfn);
1848        return 0;
1849}
1850
1851static void nonpaging_free(struct kvm_vcpu *vcpu)
1852{
1853        mmu_free_roots(vcpu);
1854}
1855
1856static int nonpaging_init_context(struct kvm_vcpu *vcpu)
1857{
1858        struct kvm_mmu *context = &vcpu->arch.mmu;
1859
1860        context->new_cr3 = nonpaging_new_cr3;
1861        context->page_fault = nonpaging_page_fault;
1862        context->gva_to_gpa = nonpaging_gva_to_gpa;
1863        context->free = nonpaging_free;
1864        context->prefetch_page = nonpaging_prefetch_page;
1865        context->sync_page = nonpaging_sync_page;
1866        context->invlpg = nonpaging_invlpg;
1867        context->root_level = 0;
1868        context->shadow_root_level = PT32E_ROOT_LEVEL;
1869        context->root_hpa = INVALID_PAGE;
1870        return 0;
1871}
1872
1873void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
1874{
1875        ++vcpu->stat.tlb_flush;
1876        kvm_x86_ops->tlb_flush(vcpu);
1877}
1878
1879static void paging_new_cr3(struct kvm_vcpu *vcpu)
1880{
1881        pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3);
1882        mmu_free_roots(vcpu);
1883}
1884
1885static void inject_page_fault(struct kvm_vcpu *vcpu,
1886                              u64 addr,
1887                              u32 err_code)
1888{
1889        kvm_inject_page_fault(vcpu, addr, err_code);
1890}
1891
1892static void paging_free(struct kvm_vcpu *vcpu)
1893{
1894        nonpaging_free(vcpu);
1895}
1896
1897#define PTTYPE 64
1898#include "paging_tmpl.h"
1899#undef PTTYPE
1900
1901#define PTTYPE 32
1902#include "paging_tmpl.h"
1903#undef PTTYPE
1904
1905static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
1906{
1907        struct kvm_mmu *context = &vcpu->arch.mmu;
1908
1909        ASSERT(is_pae(vcpu));
1910        context->new_cr3 = paging_new_cr3;
1911        context->page_fault = paging64_page_fault;
1912        context->gva_to_gpa = paging64_gva_to_gpa;
1913        context->prefetch_page = paging64_prefetch_page;
1914        context->sync_page = paging64_sync_page;
1915        context->invlpg = paging64_invlpg;
1916        context->free = paging_free;
1917        context->root_level = level;
1918        context->shadow_root_level = level;
1919        context->root_hpa = INVALID_PAGE;
1920        return 0;
1921}
1922
1923static int paging64_init_context(struct kvm_vcpu *vcpu)
1924{
1925        return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
1926}
1927
1928static int paging32_init_context(struct kvm_vcpu *vcpu)
1929{
1930        struct kvm_mmu *context = &vcpu->arch.mmu;
1931
1932        context->new_cr3 = paging_new_cr3;
1933        context->page_fault = paging32_page_fault;
1934        context->gva_to_gpa = paging32_gva_to_gpa;
1935        context->free = paging_free;
1936        context->prefetch_page = paging32_prefetch_page;
1937        context->sync_page = paging32_sync_page;
1938        context->invlpg = paging32_invlpg;
1939        context->root_level = PT32_ROOT_LEVEL;
1940        context->shadow_root_level = PT32E_ROOT_LEVEL;
1941        context->root_hpa = INVALID_PAGE;
1942        return 0;
1943}
1944
1945static int paging32E_init_context(struct kvm_vcpu *vcpu)
1946{
1947        return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
1948}
1949
1950static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
1951{
1952        struct kvm_mmu *context = &vcpu->arch.mmu;
1953
1954        context->new_cr3 = nonpaging_new_cr3;
1955        context->page_fault = tdp_page_fault;
1956        context->free = nonpaging_free;
1957        context->prefetch_page = nonpaging_prefetch_page;
1958        context->sync_page = nonpaging_sync_page;
1959        context->invlpg = nonpaging_invlpg;
1960        context->shadow_root_level = kvm_x86_ops->get_tdp_level();
1961        context->root_hpa = INVALID_PAGE;
1962
1963        if (!is_paging(vcpu)) {
1964                context->gva_to_gpa = nonpaging_gva_to_gpa;
1965                context->root_level = 0;
1966        } else if (is_long_mode(vcpu)) {
1967                context->gva_to_gpa = paging64_gva_to_gpa;
1968                context->root_level = PT64_ROOT_LEVEL;
1969        } else if (is_pae(vcpu)) {
1970                context->gva_to_gpa = paging64_gva_to_gpa;
1971                context->root_level = PT32E_ROOT_LEVEL;
1972        } else {
1973                context->gva_to_gpa = paging32_gva_to_gpa;
1974                context->root_level = PT32_ROOT_LEVEL;
1975        }
1976
1977        return 0;
1978}
1979
1980static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
1981{
1982        ASSERT(vcpu);
1983        ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
1984
1985        if (!is_paging(vcpu))
1986                return nonpaging_init_context(vcpu);
1987        else if (is_long_mode(vcpu))
1988                return paging64_init_context(vcpu);
1989        else if (is_pae(vcpu))
1990                return paging32E_init_context(vcpu);
1991        else
1992                return paging32_init_context(vcpu);
1993}
1994
1995static int init_kvm_mmu(struct kvm_vcpu *vcpu)
1996{
1997        vcpu->arch.update_pte.pfn = bad_pfn;
1998
1999        if (tdp_enabled)
2000                return init_kvm_tdp_mmu(vcpu);
2001        else
2002                return init_kvm_softmmu(vcpu);
2003}
2004
2005static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
2006{
2007        ASSERT(vcpu);
2008        if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
2009                vcpu->arch.mmu.free(vcpu);
2010                vcpu->arch.mmu.root_hpa = INVALID_PAGE;
2011        }
2012}
2013
2014int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
2015{
2016        destroy_kvm_mmu(vcpu);
2017        return init_kvm_mmu(vcpu);
2018}
2019EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
2020
2021int kvm_mmu_load(struct kvm_vcpu *vcpu)
2022{
2023        int r;
2024
2025        r = mmu_topup_memory_caches(vcpu);
2026        if (r)
2027                goto out;
2028        spin_lock(&vcpu->kvm->mmu_lock);
2029        kvm_mmu_free_some_pages(vcpu);
2030        mmu_alloc_roots(vcpu);
2031        mmu_sync_roots(vcpu);
2032        spin_unlock(&vcpu->kvm->mmu_lock);
2033        kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
2034        kvm_mmu_flush_tlb(vcpu);
2035out:
2036        return r;
2037}
2038EXPORT_SYMBOL_GPL(kvm_mmu_load);
2039
2040void kvm_mmu_unload(struct kvm_vcpu *vcpu)
2041{
2042        mmu_free_roots(vcpu);
2043}
2044
2045static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
2046                                  struct kvm_mmu_page *sp,
2047                                  u64 *spte)
2048{
2049        u64 pte;
2050        struct kvm_mmu_page *child;
2051
2052        pte = *spte;
2053        if (is_shadow_present_pte(pte)) {
2054                if (sp->role.level == PT_PAGE_TABLE_LEVEL ||
2055                    is_large_pte(pte))
2056                        rmap_remove(vcpu->kvm, spte);
2057                else {
2058                        child = page_header(pte & PT64_BASE_ADDR_MASK);
2059                        mmu_page_remove_parent_pte(child, spte);
2060                }
2061        }
2062        set_shadow_pte(spte, shadow_trap_nonpresent_pte);
2063        if (is_large_pte(pte))
2064                --vcpu->kvm->stat.lpages;
2065}
2066
2067static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
2068                                  struct kvm_mmu_page *sp,
2069                                  u64 *spte,
2070                                  const void *new)
2071{
2072        if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
2073                if (!vcpu->arch.update_pte.largepage ||
2074                    sp->role.glevels == PT32_ROOT_LEVEL) {
2075                        ++vcpu->kvm->stat.mmu_pde_zapped;
2076                        return;
2077                }
2078        }
2079
2080        ++vcpu->kvm->stat.mmu_pte_updated;
2081        if (sp->role.glevels == PT32_ROOT_LEVEL)
2082                paging32_update_pte(vcpu, sp, spte, new);
2083        else
2084                paging64_update_pte(vcpu, sp, spte, new);
2085}
2086
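/*
 * Decide whether remote TLBs must be flushed after a shadow pte update:
 * only when an existing translation now points somewhere else or loses
 * permissions.  NX has inverted polarity (set means less access), so both
 * values have it flipped before the "old had a permission the new one
 * lacks" test below.
 */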
2087static bool need_remote_flush(u64 old, u64 new)
2088{
2089        if (!is_shadow_present_pte(old))
2090                return false;
2091        if (!is_shadow_present_pte(new))
2092                return true;
2093        if ((old ^ new) & PT64_BASE_ADDR_MASK)
2094                return true;
2095        old ^= PT64_NX_MASK;
2096        new ^= PT64_NX_MASK;
2097        return (old & ~new & PT64_PERM_MASK) != 0;
2098}
2099
2100static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new)
2101{
2102        if (need_remote_flush(old, new))
2103                kvm_flush_remote_tlbs(vcpu->kvm);
2104        else
2105                kvm_mmu_flush_tlb(vcpu);
2106}
2107
2108static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
2109{
2110        u64 *spte = vcpu->arch.last_pte_updated;
2111
2112        return !!(spte && (*spte & shadow_accessed_mask));
2113}
2114
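/*
 * Guess which gfn a guest pte write is installing and grab its pfn ahead
 * of time, while sleeping is still allowed; the cached
 * vcpu->arch.update_pte.{gfn,pfn} can then be consumed by the pte update
 * path under mmu_lock, and an unused pfn is dropped at the end of
 * kvm_mmu_pte_write().
 */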
2115static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2116                                          const u8 *new, int bytes)
2117{
2118        gfn_t gfn;
2119        int r;
2120        u64 gpte = 0;
2121        pfn_t pfn;
2122
2123        vcpu->arch.update_pte.largepage = 0;
2124
2125        if (bytes != 4 && bytes != 8)
2126                return;
2127
2128        /*
2129         * Assume that the pte write is on a page table of the same type
2130         * as the current vcpu paging mode.  This is nearly always true
2131         * (it might be false while the guest is changing modes); it is
2132         * verified later by update_pte().
2133         */
2134        if (is_pae(vcpu)) {
2135                /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
2136                if ((bytes == 4) && (gpa % 4 == 0)) {
2137                        r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
2138                        if (r)
2139                                return;
2140                        memcpy((void *)&gpte + (gpa % 8), new, 4);
2141                } else if ((bytes == 8) && (gpa % 8 == 0)) {
2142                        memcpy((void *)&gpte, new, 8);
2143                }
2144        } else {
2145                if ((bytes == 4) && (gpa % 4 == 0))
2146                        memcpy((void *)&gpte, new, 4);
2147        }
2148        if (!is_present_pte(gpte))
2149                return;
2150        gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
2151
2152        if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
2153                gfn &= ~(KVM_PAGES_PER_HPAGE-1);
2154                vcpu->arch.update_pte.largepage = 1;
2155        }
2156        vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
2157        smp_rmb();
2158        pfn = gfn_to_pfn(vcpu->kvm, gfn);
2159
2160        if (is_error_pfn(pfn)) {
2161                kvm_release_pfn_clean(pfn);
2162                return;
2163        }
2164        vcpu->arch.update_pte.gfn = gfn;
2165        vcpu->arch.update_pte.pfn = pfn;
2166}
2167
2168static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
2169{
2170        u64 *spte = vcpu->arch.last_pte_updated;
2171
2172        if (spte
2173            && vcpu->arch.last_pte_gfn == gfn
2174            && shadow_accessed_mask
2175            && !(*spte & shadow_accessed_mask)
2176            && is_shadow_present_pte(*spte))
2177                set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
2178}
2179
2180void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2181                       const u8 *new, int bytes)
2182{
2183        gfn_t gfn = gpa >> PAGE_SHIFT;
2184        struct kvm_mmu_page *sp;
2185        struct hlist_node *node, *n;
2186        struct hlist_head *bucket;
2187        unsigned index;
2188        u64 entry, gentry;
2189        u64 *spte;
2190        unsigned offset = offset_in_page(gpa);
2191        unsigned pte_size;
2192        unsigned page_offset;
2193        unsigned misaligned;
2194        unsigned quadrant;
2195        int level;
2196        int flooded = 0;
2197        int npte;
2198        int r;
2199
2200        pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
2201        mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
2202        spin_lock(&vcpu->kvm->mmu_lock);
2203        kvm_mmu_access_page(vcpu, gfn);
2204        kvm_mmu_free_some_pages(vcpu);
2205        ++vcpu->kvm->stat.mmu_pte_write;
2206        kvm_mmu_audit(vcpu, "pre pte write");
2207        if (gfn == vcpu->arch.last_pt_write_gfn
2208            && !last_updated_pte_accessed(vcpu)) {
2209                ++vcpu->arch.last_pt_write_count;
2210                if (vcpu->arch.last_pt_write_count >= 3)
2211                        flooded = 1;
2212        } else {
2213                vcpu->arch.last_pt_write_gfn = gfn;
2214                vcpu->arch.last_pt_write_count = 1;
2215                vcpu->arch.last_pte_updated = NULL;
2216        }
2217        index = kvm_page_table_hashfn(gfn);
2218        bucket = &vcpu->kvm->arch.mmu_page_hash[index];
2219        hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
2220                if (sp->gfn != gfn || sp->role.metaphysical || sp->role.invalid)
2221                        continue;
2222                pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
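                /*
                 * A write is misaligned when it does not fall entirely
                 * within one pte_size-aligned slot: e.g. with 8-byte ptes,
                 * offset 6 and bytes 4 gives (6 ^ 9) & ~7 == 8, while
                 * offset 8 and bytes 8 gives (8 ^ 15) & ~7 == 0.  Writes
                 * shorter than 4 bytes can never cover a whole pte either.
                 */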
2223                misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
2224                misaligned |= bytes < 4;
2225                if (misaligned || flooded) {
2226                        /*
2227                         * Misaligned accesses are too much trouble to fix
2228                         * up; also, they usually indicate a page is not used
2229                         * as a page table.
2230                         *
2231                         * If we're seeing too many writes to a page,
2232                         * it may no longer be a page table, or we may be
2233                         * forking, in which case it is better to unmap the
2234                         * page.
2235                         */
2236                        pgprintk("misaligned: gpa %llx bytes %d role %x\n",
2237                                 gpa, bytes, sp->role.word);
2238                        if (kvm_mmu_zap_page(vcpu->kvm, sp))
2239                                n = bucket->first;
2240                        ++vcpu->kvm->stat.mmu_flooded;
2241                        continue;
2242                }
2243                page_offset = offset;
2244                level = sp->role.level;
2245                npte = 1;
2246                if (sp->role.glevels == PT32_ROOT_LEVEL) {
2247                        page_offset <<= 1;      /* 32->64 */
2248                        /*
2249                         * A 32-bit pde maps 4MB while the shadow pdes map
2250                         * only 2MB.  So we need to double the offset again
2251                         * and zap two pdes instead of one.
2252                         */
2253                        if (level == PT32_ROOT_LEVEL) {
2254                                page_offset &= ~7; /* kill rounding error */
2255                                page_offset <<= 1;
2256                                npte = 2;
2257                        }
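                        /*
                         * A 32-bit guest table has 1024 4-byte entries but
                         * a shadow page holds only 512 8-byte sptes, so
                         * each shadow page covers just part of the guest
                         * page; role.quadrant says which part, and writes
                         * landing in a different quadrant do not touch
                         * this sp.
                         */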
2258                        quadrant = page_offset >> PAGE_SHIFT;
2259                        page_offset &= ~PAGE_MASK;
2260                        if (quadrant != sp->role.quadrant)
2261                                continue;
2262                }
2263                spte = &sp->spt[page_offset / sizeof(*spte)];
2264                if ((gpa & (pte_size - 1)) || (bytes < pte_size)) {
2265                        gentry = 0;
2266                        r = kvm_read_guest_atomic(vcpu->kvm,
2267                                                  gpa & ~(u64)(pte_size - 1),
2268                                                  &gentry, pte_size);
2269                        new = (const void *)&gentry;
2270                        if (r < 0)
2271                                new = NULL;
2272                }
2273                while (npte--) {
2274                        entry = *spte;
2275                        mmu_pte_write_zap_pte(vcpu, sp, spte);
2276                        if (new)
2277                                mmu_pte_write_new_pte(vcpu, sp, spte, new);
2278                        mmu_pte_write_flush_tlb(vcpu, entry, *spte);
2279                        ++spte;
2280                }
2281        }
2282        kvm_mmu_audit(vcpu, "post pte write");
2283        spin_unlock(&vcpu->kvm->mmu_lock);
2284        if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
2285                kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
2286                vcpu->arch.update_pte.pfn = bad_pfn;
2287        }
2288}
2289
2290int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
2291{
2292        gpa_t gpa;
2293        int r;
2294
2295        gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
2296
2297        spin_lock(&vcpu->kvm->mmu_lock);
2298        r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
2299        spin_unlock(&vcpu->kvm->mmu_lock);
2300        return r;
2301}
2302EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
2303
2304void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
2305{
2306        while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) {
2307                struct kvm_mmu_page *sp;
2308
2309                sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
2310                                  struct kvm_mmu_page, link);
2311                kvm_mmu_zap_page(vcpu->kvm, sp);
2312                ++vcpu->kvm->stat.mmu_recycled;
2313        }
2314}
2315
2316int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
2317{
2318        int r;
2319        enum emulation_result er;
2320
2321        r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
2322        if (r < 0)
2323                goto out;
2324
2325        if (!r) {
2326                r = 1;
2327                goto out;
2328        }
2329
2330        r = mmu_topup_memory_caches(vcpu);
2331        if (r)
2332                goto out;
2333
2334        er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0);
2335
2336        switch (er) {
2337        case EMULATE_DONE:
2338                return 1;
2339        case EMULATE_DO_MMIO:
2340                ++vcpu->stat.mmio_exits;
2341                return 0;
2342        case EMULATE_FAIL:
2343                kvm_report_emulation_failure(vcpu, "pagetable");
2344                return 1;
2345        default:
2346                BUG();
2347        }
2348out:
2349        return r;
2350}
2351EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
2352
2353void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
2354{
2355        spin_lock(&vcpu->kvm->mmu_lock);
2356        vcpu->arch.mmu.invlpg(vcpu, gva);
2357        spin_unlock(&vcpu->kvm->mmu_lock);
2358        kvm_mmu_flush_tlb(vcpu);
2359        ++vcpu->stat.invlpg;
2360}
2361EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
2362
2363void kvm_enable_tdp(void)
2364{
2365        tdp_enabled = true;
2366}
2367EXPORT_SYMBOL_GPL(kvm_enable_tdp);
2368
2369void kvm_disable_tdp(void)
2370{
2371        tdp_enabled = false;
2372}
2373EXPORT_SYMBOL_GPL(kvm_disable_tdp);
2374
2375static void free_mmu_pages(struct kvm_vcpu *vcpu)
2376{
2377        struct kvm_mmu_page *sp;
2378
2379        while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
2380                sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
2381                                  struct kvm_mmu_page, link);
2382                kvm_mmu_zap_page(vcpu->kvm, sp);
2383                cond_resched();
2384        }
2385        free_page((unsigned long)vcpu->arch.mmu.pae_root);
2386}
2387
2388static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
2389{
2390        struct page *page;
2391        int i;
2392
2393        ASSERT(vcpu);
2394
2395        if (vcpu->kvm->arch.n_requested_mmu_pages)
2396                vcpu->kvm->arch.n_free_mmu_pages =
2397                                        vcpu->kvm->arch.n_requested_mmu_pages;
2398        else
2399                vcpu->kvm->arch.n_free_mmu_pages =
2400                                        vcpu->kvm->arch.n_alloc_mmu_pages;
2401        /*
2402         * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
2403         * Therefore we need to allocate shadow page tables in the first
2404         * 4GB of memory, which happens to fit the DMA32 zone.
2405         */
2406        page = alloc_page(GFP_KERNEL | __GFP_DMA32);
2407        if (!page)
2408                goto error_1;
2409        vcpu->arch.mmu.pae_root = page_address(page);
2410        for (i = 0; i < 4; ++i)
2411                vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
2412
2413        return 0;
2414
2415error_1:
2416        free_mmu_pages(vcpu);
2417        return -ENOMEM;
2418}
2419
2420int kvm_mmu_create(struct kvm_vcpu *vcpu)
2421{
2422        ASSERT(vcpu);
2423        ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
2424
2425        return alloc_mmu_pages(vcpu);
2426}
2427
2428int kvm_mmu_setup(struct kvm_vcpu *vcpu)
2429{
2430        ASSERT(vcpu);
2431        ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
2432
2433        return init_kvm_mmu(vcpu);
2434}
2435
2436void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
2437{
2438        ASSERT(vcpu);
2439
2440        destroy_kvm_mmu(vcpu);
2441        free_mmu_pages(vcpu);
2442        mmu_free_memory_caches(vcpu);
2443}
2444
2445void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
2446{
2447        struct kvm_mmu_page *sp;
2448
2449        spin_lock(&kvm->mmu_lock);
2450        list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
2451                int i;
2452                u64 *pt;
2453
2454                if (!test_bit(slot, &sp->slot_bitmap))
2455                        continue;
2456
2457                pt = sp->spt;
2458                for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
2459                        /* avoid RMW */
2460                        if (pt[i] & PT_WRITABLE_MASK)
2461                                pt[i] &= ~PT_WRITABLE_MASK;
2462        }
2463        kvm_flush_remote_tlbs(kvm);
2464        spin_unlock(&kvm->mmu_lock);
2465}
2466
2467void kvm_mmu_zap_all(struct kvm *kvm)
2468{
2469        struct kvm_mmu_page *sp, *node;
2470
2471        spin_lock(&kvm->mmu_lock);
2472        list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
2473                if (kvm_mmu_zap_page(kvm, sp))
2474                        node = container_of(kvm->arch.active_mmu_pages.next,
2475                                            struct kvm_mmu_page, link);
2476        spin_unlock(&kvm->mmu_lock);
2477
2478        kvm_flush_remote_tlbs(kvm);
2479}
2480
2481static void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm)
2482{
2483        struct kvm_mmu_page *page;
2484
2485        page = container_of(kvm->arch.active_mmu_pages.prev,
2486                            struct kvm_mmu_page, link);
2487        kvm_mmu_zap_page(kvm, page);
2488}
2489
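/*
 * Slab shrinker callback (registered below as mmu_shrinker): under memory
 * pressure, zap one shadow page from the first VM that has any, rotate
 * that VM to the tail of vm_list so the cost is spread across guests, and
 * report the total number of in-use shadow pages as the cache size.
 */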
2490static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
2491{
2492        struct kvm *kvm;
2493        struct kvm *kvm_freed = NULL;
2494        int cache_count = 0;
2495
2496        spin_lock(&kvm_lock);
2497
2498        list_for_each_entry(kvm, &vm_list, vm_list) {
2499                int npages;
2500
2501                if (!down_read_trylock(&kvm->slots_lock))
2502                        continue;
2503                spin_lock(&kvm->mmu_lock);
2504                npages = kvm->arch.n_alloc_mmu_pages -
2505                         kvm->arch.n_free_mmu_pages;
2506                cache_count += npages;
2507                if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
2508                        kvm_mmu_remove_one_alloc_mmu_page(kvm);
2509                        cache_count--;
2510                        kvm_freed = kvm;
2511                }
2512                nr_to_scan--;
2513
2514                spin_unlock(&kvm->mmu_lock);
2515                up_read(&kvm->slots_lock);
2516        }
2517        if (kvm_freed)
2518                list_move_tail(&kvm_freed->vm_list, &vm_list);
2519
2520        spin_unlock(&kvm_lock);
2521
2522        return cache_count;
2523}
2524
2525static struct shrinker mmu_shrinker = {
2526        .shrink = mmu_shrink,
2527        .seeks = DEFAULT_SEEKS * 10,
2528};
2529
2530static void mmu_destroy_caches(void)
2531{
2532        if (pte_chain_cache)
2533                kmem_cache_destroy(pte_chain_cache);
2534        if (rmap_desc_cache)
2535                kmem_cache_destroy(rmap_desc_cache);
2536        if (mmu_page_header_cache)
2537                kmem_cache_destroy(mmu_page_header_cache);
2538}
2539
2540void kvm_mmu_module_exit(void)
2541{
2542        mmu_destroy_caches();
2543        unregister_shrinker(&mmu_shrinker);
2544}
2545
2546int kvm_mmu_module_init(void)
2547{
2548        pte_chain_cache = kmem_cache_create("kvm_pte_chain",
2549                                            sizeof(struct kvm_pte_chain),
2550                                            0, 0, NULL);
2551        if (!pte_chain_cache)
2552                goto nomem;
2553        rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
2554                                            sizeof(struct kvm_rmap_desc),
2555                                            0, 0, NULL);
2556        if (!rmap_desc_cache)
2557                goto nomem;
2558
2559        mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
2560                                                  sizeof(struct kvm_mmu_page),
2561                                                  0, 0, NULL);
2562        if (!mmu_page_header_cache)
2563                goto nomem;
2564
2565        register_shrinker(&mmu_shrinker);
2566
2567        return 0;
2568
2569nomem:
2570        mmu_destroy_caches();
2571        return -ENOMEM;
2572}
2573
2574/*
2575 * Calculate the number of mmu pages needed for kvm.
2576 */
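/*
 * For example, assuming KVM_PERMILLE_MMU_PAGES is 20 and
 * KVM_MIN_ALLOC_MMU_PAGES is 64: a guest with 262144 pages (1GB) of memory
 * gets 262144 * 20 / 1000 = 5242 shadow pages, while a tiny guest still
 * gets the 64-page minimum.
 */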
2577unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
2578{
2579        int i;
2580        unsigned int nr_mmu_pages;
2581        unsigned int nr_pages = 0;
2582
2583        for (i = 0; i < kvm->nmemslots; i++)
2584                nr_pages += kvm->memslots[i].npages;
2585
2586        nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
2587        nr_mmu_pages = max(nr_mmu_pages,
2588                        (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
2589
2590        return nr_mmu_pages;
2591}
2592
2593static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer,
2594                                unsigned len)
2595{
2596        if (len > buffer->len)
2597                return NULL;
2598        return buffer->ptr;
2599}
2600
2601static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer,
2602                                unsigned len)
2603{
2604        void *ret;
2605
2606        ret = pv_mmu_peek_buffer(buffer, len);
2607        if (!ret)
2608                return ret;
2609        buffer->ptr += len;
2610        buffer->len -= len;
2611        buffer->processed += len;
2612        return ret;
2613}
2614
2615static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
2616                             gpa_t addr, gpa_t value)
2617{
2618        int bytes = 8;
2619        int r;
2620
2621        if (!is_long_mode(vcpu) && !is_pae(vcpu))
2622                bytes = 4;
2623
2624        r = mmu_topup_memory_caches(vcpu);
2625        if (r)
2626                return r;
2627
2628        if (!emulator_write_phys(vcpu, addr, &value, bytes))
2629                return -EFAULT;
2630
2631        return 1;
2632}
2633
2634static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
2635{
2636        kvm_x86_ops->tlb_flush(vcpu);
2637        set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
2638        return 1;
2639}
2640
2641static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr)
2642{
2643        spin_lock(&vcpu->kvm->mmu_lock);
2644        mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT);
2645        spin_unlock(&vcpu->kvm->mmu_lock);
2646        return 1;
2647}
2648
2649static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu,
2650                             struct kvm_pv_mmu_op_buffer *buffer)
2651{
2652        struct kvm_mmu_op_header *header;
2653
2654        header = pv_mmu_peek_buffer(buffer, sizeof *header);
2655        if (!header)
2656                return 0;
2657        switch (header->op) {
2658        case KVM_MMU_OP_WRITE_PTE: {
2659                struct kvm_mmu_op_write_pte *wpte;
2660
2661                wpte = pv_mmu_read_buffer(buffer, sizeof *wpte);
2662                if (!wpte)
2663                        return 0;
2664                return kvm_pv_mmu_write(vcpu, wpte->pte_phys,
2665                                        wpte->pte_val);
2666        }
2667        case KVM_MMU_OP_FLUSH_TLB: {
2668                struct kvm_mmu_op_flush_tlb *ftlb;
2669
2670                ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb);
2671                if (!ftlb)
2672                        return 0;
2673                return kvm_pv_mmu_flush_tlb(vcpu);
2674        }
2675        case KVM_MMU_OP_RELEASE_PT: {
2676                struct kvm_mmu_op_release_pt *rpt;
2677
2678                rpt = pv_mmu_read_buffer(buffer, sizeof *rpt);
2679                if (!rpt)
2680                        return 0;
2681                return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys);
2682        }
2683        default: return 0;
2684        }
2685}
2686
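/*
 * Entry point for the paravirtualized MMU op interface: copy up to
 * sizeof(buffer->buf) bytes of guest-supplied op records, then process
 * them one header at a time until the buffer is exhausted, an op fails
 * (r < 0) or asks to stop (r == 0).  *ret reports how many bytes were
 * consumed.
 */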
2687int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
2688                  gpa_t addr, unsigned long *ret)
2689{
2690        int r;
2691        struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer;
2692
2693        buffer->ptr = buffer->buf;
2694        buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf);
2695        buffer->processed = 0;
2696
2697        r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len);
2698        if (r)
2699                goto out;
2700
2701        while (buffer->len) {
2702                r = kvm_pv_mmu_op_one(vcpu, buffer);
2703                if (r < 0)
2704                        goto out;
2705                if (r == 0)
2706                        break;
2707        }
2708
2709        r = 1;
2710out:
2711        *ret = buffer->processed;
2712        return r;
2713}
2714
2715#ifdef AUDIT
2716
2717static const char *audit_msg;
2718
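/*
 * On x86_64 only bits 0-47 of a virtual address are significant and bit 47
 * is sign-extended; shifting left and then arithmetically right by 16
 * redoes that sign extension so the audit code compares canonical
 * addresses.
 */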
2719static gva_t canonicalize(gva_t gva)
2720{
2721#ifdef CONFIG_X86_64
2722        gva = (long long)(gva << 16) >> 16;
2723#endif
2724        return gva;
2725}
2726
2727static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
2728                                gva_t va, int level)
2729{
2730        u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
2731        int i;
2732        gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
2733
2734        for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
2735                u64 ent = pt[i];
2736
2737                if (ent == shadow_trap_nonpresent_pte)
2738                        continue;
2739
2740                va = canonicalize(va);
2741                if (level > 1) {
2742                        if (ent == shadow_notrap_nonpresent_pte)
2743                                printk(KERN_ERR "audit: (%s) nontrapping pte"
2744                                       " in nonleaf level: levels %d gva %lx"
2745                                       " level %d pte %llx\n", audit_msg,
2746                                       vcpu->arch.mmu.root_level, va, level, ent);
2747
2748                        audit_mappings_page(vcpu, ent, va, level - 1);
2749                } else {
2750                        gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
2751                        pfn_t pfn = gpa_to_pfn(vcpu, gpa);
                            hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
2752
2753                        if (is_shadow_present_pte(ent)
2754                            && (ent & PT64_BASE_ADDR_MASK) != hpa)
2755                                printk(KERN_ERR "xx audit error: (%s) levels %d"
2756                                       " gva %lx gpa %llx hpa %llx ent %llx %d\n",
2757                                       audit_msg, vcpu->arch.mmu.root_level,
2758                                       va, gpa, hpa, ent,
2759                                       is_shadow_present_pte(ent));
2760                        else if (ent == shadow_notrap_nonpresent_pte
2761                                 && !is_error_hpa(hpa))
2762                                printk(KERN_ERR "audit: (%s) notrap shadow,"
2763                                       " valid guest gva %lx\n", audit_msg, va);
2764                        kvm_release_pfn_clean(pfn);
2765
2766                }
2767        }
2768}
2769
2770static void audit_mappings(struct kvm_vcpu *vcpu)
2771{
2772        unsigned i;
2773
2774        if (vcpu->arch.mmu.root_level == 4)
2775                audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
2776        else
2777                for (i = 0; i < 4; ++i)
2778                        if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
2779                                audit_mappings_page(vcpu,
2780                                                    vcpu->arch.mmu.pae_root[i],
2781                                                    i << 30,
2782                                                    2);
2783}
2784
2785static int count_rmaps(struct kvm_vcpu *vcpu)
2786{
2787        int nmaps = 0;
2788        int i, j, k;
2789
2790        for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
2791                struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
2792                struct kvm_rmap_desc *d;
2793
2794                for (j = 0; j < m->npages; ++j) {
2795                        unsigned long *rmapp = &m->rmap[j];
2796
2797                        if (!*rmapp)
2798                                continue;
2799                        if (!(*rmapp & 1)) {
2800                                ++nmaps;
2801                                continue;
2802                        }
2803                        d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
2804                        while (d) {
2805                                for (k = 0; k < RMAP_EXT; ++k)
2806                                        if (d->shadow_ptes[k])
2807                                                ++nmaps;
2808                                        else
2809                                                break;
2810                                d = d->more;
2811                        }
2812                }
2813        }
2814        return nmaps;
2815}
2816
2817static int count_writable_mappings(struct kvm_vcpu *vcpu)
2818{
2819        int nmaps = 0;
2820        struct kvm_mmu_page *sp;
2821        int i;
2822
2823        list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
2824                u64 *pt = sp->spt;
2825
2826                if (sp->role.level != PT_PAGE_TABLE_LEVEL)
2827                        continue;
2828
2829                for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
2830                        u64 ent = pt[i];
2831
2832                        if (!(ent & PT_PRESENT_MASK))
2833                                continue;
2834                        if (!(ent & PT_WRITABLE_MASK))
2835                                continue;
2836                        ++nmaps;
2837                }
2838        }
2839        return nmaps;
2840}
2841
2842static void audit_rmap(struct kvm_vcpu *vcpu)
2843{
2844        int n_rmap = count_rmaps(vcpu);
2845        int n_actual = count_writable_mappings(vcpu);
2846
2847        if (n_rmap != n_actual)
2848                printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
2849                       __func__, audit_msg, n_rmap, n_actual);
2850}
2851
2852static void audit_write_protection(struct kvm_vcpu *vcpu)
2853{
2854        struct kvm_mmu_page *sp;
2855        struct kvm_memory_slot *slot;
2856        unsigned long *rmapp;
2857        gfn_t gfn;
2858
2859        list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
2860                if (sp->role.metaphysical)
2861                        continue;
2862
2863                slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
2864                gfn = unalias_gfn(vcpu->kvm, sp->gfn);
2865                rmapp = &slot->rmap[gfn - slot->base_gfn];
2866                if (*rmapp)
2867                        printk(KERN_ERR "%s: (%s) shadow page has writable"
2868                               " mappings: gfn %lx role %x\n",
2869                               __func__, audit_msg, sp->gfn,
2870                               sp->role.word);
2871        }
2872}
2873
2874static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
2875{
2876        int olddbg = dbg;
2877
2878        dbg = 0;
2879        audit_msg = msg;
2880        audit_rmap(vcpu);
2881        audit_write_protection(vcpu);
2882        audit_mappings(vcpu);
2883        dbg = olddbg;
2884}
2885
2886#endif
2887