linux/arch/s390/mm/gmap.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 *  KVM guest address space mapping code
   4 *
   5 *    Copyright IBM Corp. 2007, 2020
   6 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
   7 *               David Hildenbrand <david@redhat.com>
   8 *               Janosch Frank <frankja@linux.vnet.ibm.com>
   9 */
  10
  11#include <linux/kernel.h>
  12#include <linux/pagewalk.h>
  13#include <linux/swap.h>
  14#include <linux/smp.h>
  15#include <linux/spinlock.h>
  16#include <linux/slab.h>
  17#include <linux/swapops.h>
  18#include <linux/ksm.h>
  19#include <linux/mman.h>
  20#include <linux/pgtable.h>
  21
  22#include <asm/pgalloc.h>
  23#include <asm/gmap.h>
  24#include <asm/tlb.h>
  25
  26#define GMAP_SHADOW_FAKE_TABLE 1ULL
  27
  28/**
  29 * gmap_alloc - allocate and initialize a guest address space
  31 * @limit: maximum address of the gmap address space
  32 *
  33 * Returns a guest address space structure.
  34 */
  35static struct gmap *gmap_alloc(unsigned long limit)
  36{
  37        struct gmap *gmap;
  38        struct page *page;
  39        unsigned long *table;
  40        unsigned long etype, atype;
  41
  42        if (limit < _REGION3_SIZE) {
  43                limit = _REGION3_SIZE - 1;
  44                atype = _ASCE_TYPE_SEGMENT;
  45                etype = _SEGMENT_ENTRY_EMPTY;
  46        } else if (limit < _REGION2_SIZE) {
  47                limit = _REGION2_SIZE - 1;
  48                atype = _ASCE_TYPE_REGION3;
  49                etype = _REGION3_ENTRY_EMPTY;
  50        } else if (limit < _REGION1_SIZE) {
  51                limit = _REGION1_SIZE - 1;
  52                atype = _ASCE_TYPE_REGION2;
  53                etype = _REGION2_ENTRY_EMPTY;
  54        } else {
  55                limit = -1UL;
  56                atype = _ASCE_TYPE_REGION1;
  57                etype = _REGION1_ENTRY_EMPTY;
  58        }
  59        gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT);
  60        if (!gmap)
  61                goto out;
  62        INIT_LIST_HEAD(&gmap->crst_list);
  63        INIT_LIST_HEAD(&gmap->children);
  64        INIT_LIST_HEAD(&gmap->pt_list);
  65        INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT);
  66        INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT);
  67        INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT);
  68        spin_lock_init(&gmap->guest_table_lock);
  69        spin_lock_init(&gmap->shadow_lock);
  70        refcount_set(&gmap->ref_count, 1);
  71        page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
  72        if (!page)
  73                goto out_free;
  74        page->index = 0;
  75        list_add(&page->lru, &gmap->crst_list);
  76        table = (unsigned long *) page_to_phys(page);
  77        crst_table_init(table, etype);
  78        gmap->table = table;
  79        gmap->asce = atype | _ASCE_TABLE_LENGTH |
  80                _ASCE_USER_BITS | __pa(table);
  81        gmap->asce_end = limit;
  82        return gmap;
  83
  84out_free:
  85        kfree(gmap);
  86out:
  87        return NULL;
  88}
  89
  90/**
  91 * gmap_create - create a guest address space
  92 * @mm: pointer to the parent mm_struct
  93 * @limit: maximum size of the gmap address space
  94 *
  95 * Returns a guest address space structure.
  96 */
  97struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit)
  98{
  99        struct gmap *gmap;
 100        unsigned long gmap_asce;
 101
 102        gmap = gmap_alloc(limit);
 103        if (!gmap)
 104                return NULL;
 105        gmap->mm = mm;
 106        spin_lock(&mm->context.lock);
 107        list_add_rcu(&gmap->list, &mm->context.gmap_list);
 108        if (list_is_singular(&mm->context.gmap_list))
 109                gmap_asce = gmap->asce;
 110        else
 111                gmap_asce = -1UL;
 112        WRITE_ONCE(mm->context.gmap_asce, gmap_asce);
 113        spin_unlock(&mm->context.lock);
 114        return gmap;
 115}
 116EXPORT_SYMBOL_GPL(gmap_create);
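
/*
 * Illustrative sketch (not part of the original file): the typical life
 * cycle of a gmap as a hypothetical caller might use it. The 4 TB limit
 * and the error handling are assumptions made for this example only.
 */
static int __maybe_unused gmap_example_lifecycle(struct mm_struct *mm)
{
        struct gmap *gmap;

        /* Create a guest address space covering up to 4 TB. */
        gmap = gmap_create(mm, (1UL << 42) - 1);
        if (!gmap)
                return -ENOMEM;
        /*
         * ... map segments and resolve faults here ...
         *
         * gmap_remove() detaches the gmap from the mm and drops the
         * initial reference; gmap_free() runs once the last reference
         * is gone.
         */
        gmap_remove(gmap);
        return 0;
}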
 117
 118static void gmap_flush_tlb(struct gmap *gmap)
 119{
 120        if (MACHINE_HAS_IDTE)
 121                __tlb_flush_idte(gmap->asce);
 122        else
 123                __tlb_flush_global();
 124}
 125
 126static void gmap_radix_tree_free(struct radix_tree_root *root)
 127{
 128        struct radix_tree_iter iter;
 129        unsigned long indices[16];
 130        unsigned long index;
 131        void __rcu **slot;
 132        int i, nr;
 133
 134        /* A radix tree is freed by deleting all of its entries */
 135        index = 0;
 136        do {
 137                nr = 0;
 138                radix_tree_for_each_slot(slot, root, &iter, index) {
 139                        indices[nr] = iter.index;
 140                        if (++nr == 16)
 141                                break;
 142                }
 143                for (i = 0; i < nr; i++) {
 144                        index = indices[i];
 145                        radix_tree_delete(root, index);
 146                }
 147        } while (nr > 0);
 148}
 149
 150static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
 151{
 152        struct gmap_rmap *rmap, *rnext, *head;
 153        struct radix_tree_iter iter;
 154        unsigned long indices[16];
 155        unsigned long index;
 156        void __rcu **slot;
 157        int i, nr;
 158
 159        /* A radix tree is freed by deleting all of its entries */
 160        index = 0;
 161        do {
 162                nr = 0;
 163                radix_tree_for_each_slot(slot, root, &iter, index) {
 164                        indices[nr] = iter.index;
 165                        if (++nr == 16)
 166                                break;
 167                }
 168                for (i = 0; i < nr; i++) {
 169                        index = indices[i];
 170                        head = radix_tree_delete(root, index);
 171                        gmap_for_each_rmap_safe(rmap, rnext, head)
 172                                kfree(rmap);
 173                }
 174        } while (nr > 0);
 175}
 176
 177/**
 178 * gmap_free - free a guest address space
 179 * @gmap: pointer to the guest address space structure
 180 *
 181 * No locks required. There are no references to this gmap anymore.
 182 */
 183static void gmap_free(struct gmap *gmap)
 184{
 185        struct page *page, *next;
 186
 187        /* Flush tlb of all gmaps (if not already done for shadows) */
 188        if (!(gmap_is_shadow(gmap) && gmap->removed))
 189                gmap_flush_tlb(gmap);
 190        /* Free all segment & region tables. */
 191        list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
 192                __free_pages(page, CRST_ALLOC_ORDER);
 193        gmap_radix_tree_free(&gmap->guest_to_host);
 194        gmap_radix_tree_free(&gmap->host_to_guest);
 195
 196        /* Free additional data for a shadow gmap */
 197        if (gmap_is_shadow(gmap)) {
 198                /* Free all page tables. */
 199                list_for_each_entry_safe(page, next, &gmap->pt_list, lru)
 200                        page_table_free_pgste(page);
 201                gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
 202                /* Release reference to the parent */
 203                gmap_put(gmap->parent);
 204        }
 205
 206        kfree(gmap);
 207}
 208
 209/**
 210 * gmap_get - increase reference counter for guest address space
 211 * @gmap: pointer to the guest address space structure
 212 *
 213 * Returns the gmap pointer
 214 */
 215struct gmap *gmap_get(struct gmap *gmap)
 216{
 217        refcount_inc(&gmap->ref_count);
 218        return gmap;
 219}
 220EXPORT_SYMBOL_GPL(gmap_get);
 221
 222/**
 223 * gmap_put - decrease reference counter for guest address space
 224 * @gmap: pointer to the guest address space structure
 225 *
 226 * If the reference counter reaches zero the guest address space is freed.
 227 */
 228void gmap_put(struct gmap *gmap)
 229{
 230        if (refcount_dec_and_test(&gmap->ref_count))
 231                gmap_free(gmap);
 232}
 233EXPORT_SYMBOL_GPL(gmap_put);
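
/*
 * Illustrative sketch (not part of the original file): taking and dropping
 * a temporary reference while using a gmap from a context that does not
 * own it. Purely an assumed usage pattern.
 */
static void __maybe_unused gmap_example_get_put(struct gmap *gmap)
{
        gmap = gmap_get(gmap);  /* pin the gmap */
        /* ... use the gmap ... */
        gmap_put(gmap);         /* frees it if this was the last reference */
}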
 234
 235/**
 236 * gmap_remove - remove a guest address space but do not free it yet
 237 * @gmap: pointer to the guest address space structure
 238 */
 239void gmap_remove(struct gmap *gmap)
 240{
 241        struct gmap *sg, *next;
 242        unsigned long gmap_asce;
 243
 244        /* Remove all shadow gmaps linked to this gmap */
 245        if (!list_empty(&gmap->children)) {
 246                spin_lock(&gmap->shadow_lock);
 247                list_for_each_entry_safe(sg, next, &gmap->children, list) {
 248                        list_del(&sg->list);
 249                        gmap_put(sg);
 250                }
 251                spin_unlock(&gmap->shadow_lock);
 252        }
  253        /* Remove gmap from the per-mm list */
 254        spin_lock(&gmap->mm->context.lock);
 255        list_del_rcu(&gmap->list);
 256        if (list_empty(&gmap->mm->context.gmap_list))
 257                gmap_asce = 0;
 258        else if (list_is_singular(&gmap->mm->context.gmap_list))
 259                gmap_asce = list_first_entry(&gmap->mm->context.gmap_list,
 260                                             struct gmap, list)->asce;
 261        else
 262                gmap_asce = -1UL;
 263        WRITE_ONCE(gmap->mm->context.gmap_asce, gmap_asce);
 264        spin_unlock(&gmap->mm->context.lock);
 265        synchronize_rcu();
 266        /* Put reference */
 267        gmap_put(gmap);
 268}
 269EXPORT_SYMBOL_GPL(gmap_remove);
 270
 271/**
 272 * gmap_enable - switch primary space to the guest address space
 273 * @gmap: pointer to the guest address space structure
 274 */
 275void gmap_enable(struct gmap *gmap)
 276{
 277        S390_lowcore.gmap = (unsigned long) gmap;
 278}
 279EXPORT_SYMBOL_GPL(gmap_enable);
 280
 281/**
 282 * gmap_disable - switch back to the standard primary address space
 283 * @gmap: pointer to the guest address space structure
 284 */
 285void gmap_disable(struct gmap *gmap)
 286{
 287        S390_lowcore.gmap = 0UL;
 288}
 289EXPORT_SYMBOL_GPL(gmap_disable);
 290
 291/**
 292 * gmap_get_enabled - get a pointer to the currently enabled gmap
 293 *
  294 * Returns a pointer to the currently enabled gmap, or NULL if none is enabled.
 295 */
 296struct gmap *gmap_get_enabled(void)
 297{
 298        return (struct gmap *) S390_lowcore.gmap;
 299}
 300EXPORT_SYMBOL_GPL(gmap_get_enabled);
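
/*
 * Illustrative sketch (not part of the original file): how the lowcore gmap
 * pointer might be switched around guest execution. The calling context is
 * an assumption made for this example only.
 */
static void __maybe_unused gmap_example_enable_window(struct gmap *gmap)
{
        /* Make the gmap visible to the host fault handling code. */
        gmap_enable(gmap);
        WARN_ON(gmap_get_enabled() != gmap);
        /* ... enter SIE / run the guest ... */
        gmap_disable(gmap);
}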
 301
 302/*
 303 * gmap_alloc_table is assumed to be called with mmap_lock held
 304 */
 305static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
 306                            unsigned long init, unsigned long gaddr)
 307{
 308        struct page *page;
 309        unsigned long *new;
 310
  311        /* since we don't free the gmap table until gmap_free() we can unlock */
 312        page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
 313        if (!page)
 314                return -ENOMEM;
 315        new = (unsigned long *) page_to_phys(page);
 316        crst_table_init(new, init);
 317        spin_lock(&gmap->guest_table_lock);
 318        if (*table & _REGION_ENTRY_INVALID) {
 319                list_add(&page->lru, &gmap->crst_list);
 320                *table = (unsigned long) new | _REGION_ENTRY_LENGTH |
 321                        (*table & _REGION_ENTRY_TYPE_MASK);
 322                page->index = gaddr;
 323                page = NULL;
 324        }
 325        spin_unlock(&gmap->guest_table_lock);
 326        if (page)
 327                __free_pages(page, CRST_ALLOC_ORDER);
 328        return 0;
 329}
 330
 331/**
 332 * __gmap_segment_gaddr - find virtual address from segment pointer
 333 * @entry: pointer to a segment table entry in the guest address space
 334 *
 335 * Returns the virtual address in the guest address space for the segment
 336 */
 337static unsigned long __gmap_segment_gaddr(unsigned long *entry)
 338{
 339        struct page *page;
 340        unsigned long offset, mask;
 341
 342        offset = (unsigned long) entry / sizeof(unsigned long);
 343        offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE;
 344        mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);
 345        page = virt_to_page((void *)((unsigned long) entry & mask));
 346        return page->index + offset;
 347}
 348
 349/**
 350 * __gmap_unlink_by_vmaddr - unlink a single segment via a host address
 351 * @gmap: pointer to the guest address space structure
 352 * @vmaddr: address in the host process address space
 353 *
 354 * Returns 1 if a TLB flush is required
 355 */
 356static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
 357{
 358        unsigned long *entry;
 359        int flush = 0;
 360
 361        BUG_ON(gmap_is_shadow(gmap));
 362        spin_lock(&gmap->guest_table_lock);
 363        entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
 364        if (entry) {
 365                flush = (*entry != _SEGMENT_ENTRY_EMPTY);
 366                *entry = _SEGMENT_ENTRY_EMPTY;
 367        }
 368        spin_unlock(&gmap->guest_table_lock);
 369        return flush;
 370}
 371
 372/**
 373 * __gmap_unmap_by_gaddr - unmap a single segment via a guest address
 374 * @gmap: pointer to the guest address space structure
 375 * @gaddr: address in the guest address space
 376 *
 377 * Returns 1 if a TLB flush is required
 378 */
 379static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr)
 380{
 381        unsigned long vmaddr;
 382
 383        vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host,
 384                                                   gaddr >> PMD_SHIFT);
 385        return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0;
 386}
 387
 388/**
 389 * gmap_unmap_segment - unmap segment from the guest address space
 390 * @gmap: pointer to the guest address space structure
 391 * @to: address in the guest address space
 392 * @len: length of the memory area to unmap
 393 *
 394 * Returns 0 if the unmap succeeded, -EINVAL if not.
 395 */
 396int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
 397{
 398        unsigned long off;
 399        int flush;
 400
 401        BUG_ON(gmap_is_shadow(gmap));
 402        if ((to | len) & (PMD_SIZE - 1))
 403                return -EINVAL;
 404        if (len == 0 || to + len < to)
 405                return -EINVAL;
 406
 407        flush = 0;
 408        mmap_write_lock(gmap->mm);
 409        for (off = 0; off < len; off += PMD_SIZE)
 410                flush |= __gmap_unmap_by_gaddr(gmap, to + off);
 411        mmap_write_unlock(gmap->mm);
 412        if (flush)
 413                gmap_flush_tlb(gmap);
 414        return 0;
 415}
 416EXPORT_SYMBOL_GPL(gmap_unmap_segment);
 417
 418/**
 419 * gmap_map_segment - map a segment to the guest address space
 420 * @gmap: pointer to the guest address space structure
 421 * @from: source address in the parent address space
 422 * @to: target address in the guest address space
 423 * @len: length of the memory area to map
 424 *
  425 * Returns 0 if the map succeeded, -EINVAL or -ENOMEM if not.
 426 */
 427int gmap_map_segment(struct gmap *gmap, unsigned long from,
 428                     unsigned long to, unsigned long len)
 429{
 430        unsigned long off;
 431        int flush;
 432
 433        BUG_ON(gmap_is_shadow(gmap));
 434        if ((from | to | len) & (PMD_SIZE - 1))
 435                return -EINVAL;
 436        if (len == 0 || from + len < from || to + len < to ||
 437            from + len - 1 > TASK_SIZE_MAX || to + len - 1 > gmap->asce_end)
 438                return -EINVAL;
 439
 440        flush = 0;
 441        mmap_write_lock(gmap->mm);
 442        for (off = 0; off < len; off += PMD_SIZE) {
 443                /* Remove old translation */
 444                flush |= __gmap_unmap_by_gaddr(gmap, to + off);
 445                /* Store new translation */
 446                if (radix_tree_insert(&gmap->guest_to_host,
 447                                      (to + off) >> PMD_SHIFT,
 448                                      (void *) from + off))
 449                        break;
 450        }
 451        mmap_write_unlock(gmap->mm);
 452        if (flush)
 453                gmap_flush_tlb(gmap);
 454        if (off >= len)
 455                return 0;
 456        gmap_unmap_segment(gmap, to, len);
 457        return -ENOMEM;
 458}
 459EXPORT_SYMBOL_GPL(gmap_map_segment);
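
/*
 * Illustrative sketch (not part of the original file): mapping one segment
 * of the parent address space into the guest and tearing it down again.
 * The addresses are assumptions for this example; both must be segment
 * (PMD) aligned.
 */
static int __maybe_unused gmap_example_map(struct gmap *gmap,
                                           unsigned long from,
                                           unsigned long to)
{
        int rc;

        rc = gmap_map_segment(gmap, from, to, PMD_SIZE);
        if (rc)
                return rc;      /* -EINVAL or -ENOMEM */
        /* ... the guest may now use [to, to + PMD_SIZE) ... */
        return gmap_unmap_segment(gmap, to, PMD_SIZE);
}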
 460
 461/**
 462 * __gmap_translate - translate a guest address to a user space address
 463 * @gmap: pointer to guest mapping meta data structure
 464 * @gaddr: guest address
 465 *
 466 * Returns user space address which corresponds to the guest address or
 467 * -EFAULT if no such mapping exists.
 468 * This function does not establish potentially missing page table entries.
 469 * The mmap_lock of the mm that belongs to the address space must be held
 470 * when this function gets called.
 471 *
 472 * Note: Can also be called for shadow gmaps.
 473 */
 474unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
 475{
 476        unsigned long vmaddr;
 477
 478        vmaddr = (unsigned long)
 479                radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
 480        /* Note: guest_to_host is empty for a shadow gmap */
 481        return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
 482}
 483EXPORT_SYMBOL_GPL(__gmap_translate);
 484
 485/**
 486 * gmap_translate - translate a guest address to a user space address
 487 * @gmap: pointer to guest mapping meta data structure
 488 * @gaddr: guest address
 489 *
 490 * Returns user space address which corresponds to the guest address or
 491 * -EFAULT if no such mapping exists.
 492 * This function does not establish potentially missing page table entries.
 493 */
 494unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr)
 495{
 496        unsigned long rc;
 497
 498        mmap_read_lock(gmap->mm);
 499        rc = __gmap_translate(gmap, gaddr);
 500        mmap_read_unlock(gmap->mm);
 501        return rc;
 502}
 503EXPORT_SYMBOL_GPL(gmap_translate);
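
/*
 * Illustrative sketch (not part of the original file): translating a guest
 * address and telling the -EFAULT error encoding apart from a valid user
 * space address. The pr_debug() output is an assumed usage pattern.
 */
static void __maybe_unused gmap_example_translate(struct gmap *gmap,
                                                  unsigned long gaddr)
{
        unsigned long vmaddr;

        vmaddr = gmap_translate(gmap, gaddr);
        if (IS_ERR_VALUE(vmaddr))
                pr_debug("no mapping for guest address %lx\n", gaddr);
        else
                pr_debug("guest %lx -> host %lx\n", gaddr, vmaddr);
}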
 504
 505/**
 506 * gmap_unlink - disconnect a page table from the gmap shadow tables
  507 * @mm: pointer to the parent mm_struct
 508 * @table: pointer to the host page table
 509 * @vmaddr: vm address associated with the host page table
 510 */
 511void gmap_unlink(struct mm_struct *mm, unsigned long *table,
 512                 unsigned long vmaddr)
 513{
 514        struct gmap *gmap;
 515        int flush;
 516
 517        rcu_read_lock();
 518        list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
 519                flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
 520                if (flush)
 521                        gmap_flush_tlb(gmap);
 522        }
 523        rcu_read_unlock();
 524}
 525
 526static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new,
 527                           unsigned long gaddr);
 528
 529/**
  530 * __gmap_link - set up shadow page tables to connect a host to a guest address
 531 * @gmap: pointer to guest mapping meta data structure
 532 * @gaddr: guest address
 533 * @vmaddr: vm address
 534 *
 535 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
 536 * if the vm address is already mapped to a different guest segment.
 537 * The mmap_lock of the mm that belongs to the address space must be held
 538 * when this function gets called.
 539 */
 540int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
 541{
 542        struct mm_struct *mm;
 543        unsigned long *table;
 544        spinlock_t *ptl;
 545        pgd_t *pgd;
 546        p4d_t *p4d;
 547        pud_t *pud;
 548        pmd_t *pmd;
 549        u64 unprot;
 550        int rc;
 551
 552        BUG_ON(gmap_is_shadow(gmap));
 553        /* Create higher level tables in the gmap page table */
 554        table = gmap->table;
 555        if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
 556                table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT;
 557                if ((*table & _REGION_ENTRY_INVALID) &&
 558                    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY,
 559                                     gaddr & _REGION1_MASK))
 560                        return -ENOMEM;
 561                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
 562        }
 563        if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) {
 564                table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
 565                if ((*table & _REGION_ENTRY_INVALID) &&
 566                    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY,
 567                                     gaddr & _REGION2_MASK))
 568                        return -ENOMEM;
 569                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
 570        }
 571        if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) {
 572                table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
 573                if ((*table & _REGION_ENTRY_INVALID) &&
 574                    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY,
 575                                     gaddr & _REGION3_MASK))
 576                        return -ENOMEM;
 577                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
 578        }
 579        table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
 580        /* Walk the parent mm page table */
 581        mm = gmap->mm;
 582        pgd = pgd_offset(mm, vmaddr);
 583        VM_BUG_ON(pgd_none(*pgd));
 584        p4d = p4d_offset(pgd, vmaddr);
 585        VM_BUG_ON(p4d_none(*p4d));
 586        pud = pud_offset(p4d, vmaddr);
 587        VM_BUG_ON(pud_none(*pud));
 588        /* large puds cannot yet be handled */
 589        if (pud_large(*pud))
 590                return -EFAULT;
 591        pmd = pmd_offset(pud, vmaddr);
 592        VM_BUG_ON(pmd_none(*pmd));
 593        /* Are we allowed to use huge pages? */
 594        if (pmd_large(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m)
 595                return -EFAULT;
 596        /* Link gmap segment table entry location to page table. */
 597        rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
 598        if (rc)
 599                return rc;
 600        ptl = pmd_lock(mm, pmd);
 601        spin_lock(&gmap->guest_table_lock);
 602        if (*table == _SEGMENT_ENTRY_EMPTY) {
 603                rc = radix_tree_insert(&gmap->host_to_guest,
 604                                       vmaddr >> PMD_SHIFT, table);
 605                if (!rc) {
 606                        if (pmd_large(*pmd)) {
 607                                *table = (pmd_val(*pmd) &
 608                                          _SEGMENT_ENTRY_HARDWARE_BITS_LARGE)
 609                                        | _SEGMENT_ENTRY_GMAP_UC;
 610                        } else
 611                                *table = pmd_val(*pmd) &
 612                                        _SEGMENT_ENTRY_HARDWARE_BITS;
 613                }
 614        } else if (*table & _SEGMENT_ENTRY_PROTECT &&
 615                   !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) {
 616                unprot = (u64)*table;
 617                unprot &= ~_SEGMENT_ENTRY_PROTECT;
 618                unprot |= _SEGMENT_ENTRY_GMAP_UC;
 619                gmap_pmdp_xchg(gmap, (pmd_t *)table, __pmd(unprot), gaddr);
 620        }
 621        spin_unlock(&gmap->guest_table_lock);
 622        spin_unlock(ptl);
 623        radix_tree_preload_end();
 624        return rc;
 625}
 626
 627/**
 628 * gmap_fault - resolve a fault on a guest address
 629 * @gmap: pointer to guest mapping meta data structure
 630 * @gaddr: guest address
 631 * @fault_flags: flags to pass down to handle_mm_fault()
 632 *
 633 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
 634 * if the vm address is already mapped to a different guest segment.
 635 */
 636int gmap_fault(struct gmap *gmap, unsigned long gaddr,
 637               unsigned int fault_flags)
 638{
 639        unsigned long vmaddr;
 640        int rc;
 641        bool unlocked;
 642
 643        mmap_read_lock(gmap->mm);
 644
 645retry:
 646        unlocked = false;
 647        vmaddr = __gmap_translate(gmap, gaddr);
 648        if (IS_ERR_VALUE(vmaddr)) {
 649                rc = vmaddr;
 650                goto out_up;
 651        }
 652        if (fixup_user_fault(gmap->mm, vmaddr, fault_flags,
 653                             &unlocked)) {
 654                rc = -EFAULT;
 655                goto out_up;
 656        }
 657        /*
  658         * If fixup_user_fault() unlocked the mmap_lock during the fault-in,
  659         * redo __gmap_translate to avoid racing with a map/unmap_segment.
 660         */
 661        if (unlocked)
 662                goto retry;
 663
 664        rc = __gmap_link(gmap, gaddr, vmaddr);
 665out_up:
 666        mmap_read_unlock(gmap->mm);
 667        return rc;
 668}
 669EXPORT_SYMBOL_GPL(gmap_fault);
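
/*
 * Illustrative sketch (not part of the original file): resolving a guest
 * access fault by faulting in the backing host page and linking it into
 * the gmap tables. The write flag handling is an assumed usage pattern.
 */
static int __maybe_unused gmap_example_fault(struct gmap *gmap,
                                             unsigned long gaddr, bool write)
{
        unsigned int fault_flags = write ? FAULT_FLAG_WRITE : 0;

        return gmap_fault(gmap, gaddr, fault_flags);
}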
 670
 671/*
 672 * this function is assumed to be called with mmap_lock held
 673 */
 674void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
 675{
 676        unsigned long vmaddr;
 677        spinlock_t *ptl;
 678        pte_t *ptep;
 679
 680        /* Find the vm address for the guest address */
 681        vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host,
 682                                                   gaddr >> PMD_SHIFT);
 683        if (vmaddr) {
 684                vmaddr |= gaddr & ~PMD_MASK;
 685                /* Get pointer to the page table entry */
 686                ptep = get_locked_pte(gmap->mm, vmaddr, &ptl);
 687                if (likely(ptep))
 688                        ptep_zap_unused(gmap->mm, vmaddr, ptep, 0);
 689                pte_unmap_unlock(ptep, ptl);
 690        }
 691}
 692EXPORT_SYMBOL_GPL(__gmap_zap);
 693
 694void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
 695{
 696        unsigned long gaddr, vmaddr, size;
 697        struct vm_area_struct *vma;
 698
 699        mmap_read_lock(gmap->mm);
 700        for (gaddr = from; gaddr < to;
 701             gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
 702                /* Find the vm address for the guest address */
 703                vmaddr = (unsigned long)
 704                        radix_tree_lookup(&gmap->guest_to_host,
 705                                          gaddr >> PMD_SHIFT);
 706                if (!vmaddr)
 707                        continue;
 708                vmaddr |= gaddr & ~PMD_MASK;
 709                /* Find vma in the parent mm */
 710                vma = find_vma(gmap->mm, vmaddr);
 711                if (!vma)
 712                        continue;
 713                /*
 714                 * We do not discard pages that are backed by
 715                 * hugetlbfs, so we don't have to refault them.
 716                 */
 717                if (is_vm_hugetlb_page(vma))
 718                        continue;
 719                size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
 720                zap_page_range(vma, vmaddr, size);
 721        }
 722        mmap_read_unlock(gmap->mm);
 723}
 724EXPORT_SYMBOL_GPL(gmap_discard);
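
/*
 * Illustrative sketch (not part of the original file): releasing the host
 * memory backing a single guest segment. Rounding to segment boundaries is
 * an assumption made for this example.
 */
static void __maybe_unused gmap_example_discard(struct gmap *gmap,
                                                unsigned long gaddr)
{
        gmap_discard(gmap, gaddr & PMD_MASK, (gaddr & PMD_MASK) + PMD_SIZE);
}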
 725
 726static LIST_HEAD(gmap_notifier_list);
 727static DEFINE_SPINLOCK(gmap_notifier_lock);
 728
 729/**
 730 * gmap_register_pte_notifier - register a pte invalidation callback
 731 * @nb: pointer to the gmap notifier block
 732 */
 733void gmap_register_pte_notifier(struct gmap_notifier *nb)
 734{
 735        spin_lock(&gmap_notifier_lock);
 736        list_add_rcu(&nb->list, &gmap_notifier_list);
 737        spin_unlock(&gmap_notifier_lock);
 738}
 739EXPORT_SYMBOL_GPL(gmap_register_pte_notifier);
 740
 741/**
 742 * gmap_unregister_pte_notifier - remove a pte invalidation callback
 743 * @nb: pointer to the gmap notifier block
 744 */
 745void gmap_unregister_pte_notifier(struct gmap_notifier *nb)
 746{
 747        spin_lock(&gmap_notifier_lock);
 748        list_del_rcu(&nb->list);
 749        spin_unlock(&gmap_notifier_lock);
 750        synchronize_rcu();
 751}
 752EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier);
 753
 754/**
 755 * gmap_call_notifier - call all registered invalidation callbacks
 756 * @gmap: pointer to guest mapping meta data structure
 757 * @start: start virtual address in the guest address space
 758 * @end: end virtual address in the guest address space
 759 */
 760static void gmap_call_notifier(struct gmap *gmap, unsigned long start,
 761                               unsigned long end)
 762{
 763        struct gmap_notifier *nb;
 764
 765        list_for_each_entry(nb, &gmap_notifier_list, list)
 766                nb->notifier_call(gmap, start, end);
 767}
 768
 769/**
 770 * gmap_table_walk - walk the gmap page tables
 771 * @gmap: pointer to guest mapping meta data structure
 772 * @gaddr: virtual address in the guest address space
 773 * @level: page table level to stop at
 774 *
 775 * Returns a table entry pointer for the given guest address and @level
  776 * @level=0 : returns a pointer to a page table entry (or NULL)
 777 * @level=1 : returns a pointer to a segment table entry (or NULL)
 778 * @level=2 : returns a pointer to a region-3 table entry (or NULL)
 779 * @level=3 : returns a pointer to a region-2 table entry (or NULL)
 780 * @level=4 : returns a pointer to a region-1 table entry (or NULL)
 781 *
 782 * Returns NULL if the gmap page tables could not be walked to the
 783 * requested level.
 784 *
 785 * Note: Can also be called for shadow gmaps.
 786 */
 787static inline unsigned long *gmap_table_walk(struct gmap *gmap,
 788                                             unsigned long gaddr, int level)
 789{
 790        const int asce_type = gmap->asce & _ASCE_TYPE_MASK;
 791        unsigned long *table = gmap->table;
 792
 793        if (gmap_is_shadow(gmap) && gmap->removed)
 794                return NULL;
 795
 796        if (WARN_ON_ONCE(level > (asce_type >> 2) + 1))
 797                return NULL;
 798
 799        if (asce_type != _ASCE_TYPE_REGION1 &&
 800            gaddr & (-1UL << (31 + (asce_type >> 2) * 11)))
 801                return NULL;
 802
 803        switch (asce_type) {
 804        case _ASCE_TYPE_REGION1:
 805                table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT;
 806                if (level == 4)
 807                        break;
 808                if (*table & _REGION_ENTRY_INVALID)
 809                        return NULL;
 810                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
 811                fallthrough;
 812        case _ASCE_TYPE_REGION2:
 813                table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
 814                if (level == 3)
 815                        break;
 816                if (*table & _REGION_ENTRY_INVALID)
 817                        return NULL;
 818                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
 819                fallthrough;
 820        case _ASCE_TYPE_REGION3:
 821                table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
 822                if (level == 2)
 823                        break;
 824                if (*table & _REGION_ENTRY_INVALID)
 825                        return NULL;
 826                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
 827                fallthrough;
 828        case _ASCE_TYPE_SEGMENT:
 829                table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
 830                if (level == 1)
 831                        break;
 832                if (*table & _REGION_ENTRY_INVALID)
 833                        return NULL;
 834                table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN);
 835                table += (gaddr & _PAGE_INDEX) >> _PAGE_SHIFT;
 836        }
 837        return table;
 838}
 839
 840/**
 841 * gmap_pte_op_walk - walk the gmap page table, get the page table lock
 842 *                    and return the pte pointer
 843 * @gmap: pointer to guest mapping meta data structure
 844 * @gaddr: virtual address in the guest address space
 845 * @ptl: pointer to the spinlock pointer
 846 *
 847 * Returns a pointer to the locked pte for a guest address, or NULL
 848 */
 849static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr,
 850                               spinlock_t **ptl)
 851{
 852        unsigned long *table;
 853
 854        BUG_ON(gmap_is_shadow(gmap));
 855        /* Walk the gmap page table, lock and get pte pointer */
 856        table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */
 857        if (!table || *table & _SEGMENT_ENTRY_INVALID)
 858                return NULL;
 859        return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl);
 860}
 861
 862/**
 863 * gmap_pte_op_fixup - force a page in and connect the gmap page table
 864 * @gmap: pointer to guest mapping meta data structure
 865 * @gaddr: virtual address in the guest address space
 866 * @vmaddr: address in the host process address space
 867 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
 868 *
 869 * Returns 0 if the caller can retry __gmap_translate (might fail again),
 870 * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing
 871 * up or connecting the gmap page table.
 872 */
 873static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
 874                             unsigned long vmaddr, int prot)
 875{
 876        struct mm_struct *mm = gmap->mm;
 877        unsigned int fault_flags;
 878        bool unlocked = false;
 879
 880        BUG_ON(gmap_is_shadow(gmap));
 881        fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
 882        if (fixup_user_fault(mm, vmaddr, fault_flags, &unlocked))
 883                return -EFAULT;
 884        if (unlocked)
 885                /* lost mmap_lock, caller has to retry __gmap_translate */
 886                return 0;
 887        /* Connect the page tables */
 888        return __gmap_link(gmap, gaddr, vmaddr);
 889}
 890
 891/**
 892 * gmap_pte_op_end - release the page table lock
 893 * @ptl: pointer to the spinlock pointer
 894 */
 895static void gmap_pte_op_end(spinlock_t *ptl)
 896{
 897        if (ptl)
 898                spin_unlock(ptl);
 899}
 900
 901/**
 902 * gmap_pmd_op_walk - walk the gmap tables, get the guest table lock
 903 *                    and return the pmd pointer
 904 * @gmap: pointer to guest mapping meta data structure
 905 * @gaddr: virtual address in the guest address space
 906 *
 907 * Returns a pointer to the pmd for a guest address, or NULL
 908 */
 909static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr)
 910{
 911        pmd_t *pmdp;
 912
 913        BUG_ON(gmap_is_shadow(gmap));
 914        pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1);
 915        if (!pmdp)
 916                return NULL;
 917
 918        /* without huge pages, there is no need to take the table lock */
 919        if (!gmap->mm->context.allow_gmap_hpage_1m)
 920                return pmd_none(*pmdp) ? NULL : pmdp;
 921
 922        spin_lock(&gmap->guest_table_lock);
 923        if (pmd_none(*pmdp)) {
 924                spin_unlock(&gmap->guest_table_lock);
 925                return NULL;
 926        }
 927
 928        /* 4k page table entries are locked via the pte (pte_alloc_map_lock). */
 929        if (!pmd_large(*pmdp))
 930                spin_unlock(&gmap->guest_table_lock);
 931        return pmdp;
 932}
 933
 934/**
 935 * gmap_pmd_op_end - release the guest_table_lock if needed
 936 * @gmap: pointer to the guest mapping meta data structure
 937 * @pmdp: pointer to the pmd
 938 */
 939static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp)
 940{
 941        if (pmd_large(*pmdp))
 942                spin_unlock(&gmap->guest_table_lock);
 943}
 944
 945/*
 946 * gmap_protect_pmd - remove access rights to memory and set pmd notification bits
 947 * @pmdp: pointer to the pmd to be protected
 948 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
 949 * @bits: notification bits to set
 950 *
 951 * Returns:
 952 * 0 if successfully protected
 953 * -EAGAIN if a fixup is needed
 954 * -EINVAL if unsupported notifier bits have been specified
 955 *
 956 * Expected to be called with sg->mm->mmap_lock in read and
 957 * guest_table_lock held.
 958 */
 959static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr,
 960                            pmd_t *pmdp, int prot, unsigned long bits)
 961{
 962        int pmd_i = pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID;
 963        int pmd_p = pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT;
 964        pmd_t new = *pmdp;
 965
 966        /* Fixup needed */
 967        if ((pmd_i && (prot != PROT_NONE)) || (pmd_p && (prot == PROT_WRITE)))
 968                return -EAGAIN;
 969
 970        if (prot == PROT_NONE && !pmd_i) {
 971                pmd_val(new) |= _SEGMENT_ENTRY_INVALID;
 972                gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
 973        }
 974
 975        if (prot == PROT_READ && !pmd_p) {
 976                pmd_val(new) &= ~_SEGMENT_ENTRY_INVALID;
 977                pmd_val(new) |= _SEGMENT_ENTRY_PROTECT;
 978                gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
 979        }
 980
 981        if (bits & GMAP_NOTIFY_MPROT)
 982                pmd_val(*pmdp) |= _SEGMENT_ENTRY_GMAP_IN;
 983
 984        /* Shadow GMAP protection needs split PMDs */
 985        if (bits & GMAP_NOTIFY_SHADOW)
 986                return -EINVAL;
 987
 988        return 0;
 989}
 990
 991/*
 992 * gmap_protect_pte - remove access rights to memory and set pgste bits
 993 * @gmap: pointer to guest mapping meta data structure
 994 * @gaddr: virtual address in the guest address space
 995 * @pmdp: pointer to the pmd associated with the pte
 996 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
 997 * @bits: notification bits to set
 998 *
 999 * Returns 0 if successfully protected, -ENOMEM if out of memory and
1000 * -EAGAIN if a fixup is needed.
1001 *
1002 * Expected to be called with sg->mm->mmap_lock in read
1003 */
1004static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr,
1005                            pmd_t *pmdp, int prot, unsigned long bits)
1006{
1007        int rc;
1008        pte_t *ptep;
1009        spinlock_t *ptl = NULL;
1010        unsigned long pbits = 0;
1011
1012        if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
1013                return -EAGAIN;
1014
1015        ptep = pte_alloc_map_lock(gmap->mm, pmdp, gaddr, &ptl);
1016        if (!ptep)
1017                return -ENOMEM;
1018
1019        pbits |= (bits & GMAP_NOTIFY_MPROT) ? PGSTE_IN_BIT : 0;
1020        pbits |= (bits & GMAP_NOTIFY_SHADOW) ? PGSTE_VSIE_BIT : 0;
1021        /* Protect and unlock. */
1022        rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits);
1023        gmap_pte_op_end(ptl);
1024        return rc;
1025}
1026
1027/*
1028 * gmap_protect_range - remove access rights to memory and set pgste bits
1029 * @gmap: pointer to guest mapping meta data structure
1030 * @gaddr: virtual address in the guest address space
1031 * @len: size of area
1032 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
1033 * @bits: pgste notification bits to set
1034 *
1035 * Returns 0 if successfully protected, -ENOMEM if out of memory and
1036 * -EFAULT if gaddr is invalid (or mapping for shadows is missing).
1037 *
1038 * Called with sg->mm->mmap_lock in read.
1039 */
1040static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
1041                              unsigned long len, int prot, unsigned long bits)
1042{
1043        unsigned long vmaddr, dist;
1044        pmd_t *pmdp;
1045        int rc;
1046
1047        BUG_ON(gmap_is_shadow(gmap));
1048        while (len) {
1049                rc = -EAGAIN;
1050                pmdp = gmap_pmd_op_walk(gmap, gaddr);
1051                if (pmdp) {
1052                        if (!pmd_large(*pmdp)) {
1053                                rc = gmap_protect_pte(gmap, gaddr, pmdp, prot,
1054                                                      bits);
1055                                if (!rc) {
1056                                        len -= PAGE_SIZE;
1057                                        gaddr += PAGE_SIZE;
1058                                }
1059                        } else {
1060                                rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot,
1061                                                      bits);
1062                                if (!rc) {
1063                                        dist = HPAGE_SIZE - (gaddr & ~HPAGE_MASK);
1064                                        len = len < dist ? 0 : len - dist;
1065                                        gaddr = (gaddr & HPAGE_MASK) + HPAGE_SIZE;
1066                                }
1067                        }
1068                        gmap_pmd_op_end(gmap, pmdp);
1069                }
1070                if (rc) {
1071                        if (rc == -EINVAL)
1072                                return rc;
1073
1074                        /* -EAGAIN, fixup of userspace mm and gmap */
1075                        vmaddr = __gmap_translate(gmap, gaddr);
1076                        if (IS_ERR_VALUE(vmaddr))
1077                                return vmaddr;
1078                        rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot);
1079                        if (rc)
1080                                return rc;
1081                }
1082        }
1083        return 0;
1084}
1085
1086/**
1087 * gmap_mprotect_notify - change access rights for a range of ptes and
1088 *                        call the notifier if any pte changes again
1089 * @gmap: pointer to guest mapping meta data structure
1090 * @gaddr: virtual address in the guest address space
1091 * @len: size of area
1092 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
1093 *
1094 * Returns 0 if for each page in the given range a gmap mapping exists,
1095 * the new access rights could be set and the notifier could be armed.
1096 * If the gmap mapping is missing for one or more pages -EFAULT is
1097 * returned. If no memory could be allocated -ENOMEM is returned.
1098 * This function establishes missing page table entries.
1099 */
1100int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr,
1101                         unsigned long len, int prot)
1102{
1103        int rc;
1104
1105        if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap))
1106                return -EINVAL;
1107        if (!MACHINE_HAS_ESOP && prot == PROT_READ)
1108                return -EINVAL;
1109        mmap_read_lock(gmap->mm);
1110        rc = gmap_protect_range(gmap, gaddr, len, prot, GMAP_NOTIFY_MPROT);
1111        mmap_read_unlock(gmap->mm);
1112        return rc;
1113}
1114EXPORT_SYMBOL_GPL(gmap_mprotect_notify);
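
/*
 * Illustrative sketch (not part of the original file): arming the pte
 * invalidation notifier for one guest segment. The empty callback and the
 * choice of range and protection are assumptions made for this example.
 */
static void gmap_example_notifier_call(struct gmap *gmap,
                                       unsigned long start, unsigned long end)
{
        /* Called whenever a notification-armed pte in [start, end] changes. */
}

static struct gmap_notifier gmap_example_notifier = {
        .notifier_call = gmap_example_notifier_call,
};

static int __maybe_unused gmap_example_arm_notifier(struct gmap *gmap,
                                                    unsigned long gaddr)
{
        gmap_register_pte_notifier(&gmap_example_notifier);
        /* Write-protect the range and request notification on changes. */
        return gmap_mprotect_notify(gmap, gaddr, _SEGMENT_SIZE, PROT_READ);
}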
1115
1116/**
1117 * gmap_read_table - get an unsigned long value from a guest page table using
1118 *                   absolute addressing, without marking the page referenced.
1119 * @gmap: pointer to guest mapping meta data structure
1120 * @gaddr: virtual address in the guest address space
1121 * @val: pointer to the unsigned long value to return
1122 *
1123 * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT
1124 * if reading using the virtual address failed. -EINVAL if called on a gmap
1125 * shadow.
1126 *
1127 * Called with gmap->mm->mmap_lock in read.
1128 */
1129int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
1130{
1131        unsigned long address, vmaddr;
1132        spinlock_t *ptl;
1133        pte_t *ptep, pte;
1134        int rc;
1135
1136        if (gmap_is_shadow(gmap))
1137                return -EINVAL;
1138
1139        while (1) {
1140                rc = -EAGAIN;
1141                ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
1142                if (ptep) {
1143                        pte = *ptep;
1144                        if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) {
1145                                address = pte_val(pte) & PAGE_MASK;
1146                                address += gaddr & ~PAGE_MASK;
1147                                *val = *(unsigned long *) address;
1148                                pte_val(*ptep) |= _PAGE_YOUNG;
1149                                /* Do *NOT* clear the _PAGE_INVALID bit! */
1150                                rc = 0;
1151                        }
1152                        gmap_pte_op_end(ptl);
1153                }
1154                if (!rc)
1155                        break;
1156                vmaddr = __gmap_translate(gmap, gaddr);
1157                if (IS_ERR_VALUE(vmaddr)) {
1158                        rc = vmaddr;
1159                        break;
1160                }
1161                rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ);
1162                if (rc)
1163                        break;
1164        }
1165        return rc;
1166}
1167EXPORT_SYMBOL_GPL(gmap_read_table);
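
/*
 * Illustrative sketch (not part of the original file): peeking at a value
 * in guest memory without marking the page referenced for the guest. The
 * pr_debug() output is an assumed usage pattern.
 */
static void __maybe_unused gmap_example_read(struct gmap *gmap,
                                             unsigned long gaddr)
{
        unsigned long val;
        int rc;

        mmap_read_lock(gmap->mm);
        rc = gmap_read_table(gmap, gaddr, &val);
        mmap_read_unlock(gmap->mm);
        if (!rc)
                pr_debug("guest %lx contains %lx\n", gaddr, val);
}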
1168
1169/**
1170 * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree
1171 * @sg: pointer to the shadow guest address space structure
1172 * @vmaddr: vm address associated with the rmap
1173 * @rmap: pointer to the rmap structure
1174 *
1175 * Called with the sg->guest_table_lock
1176 */
1177static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
1178                                    struct gmap_rmap *rmap)
1179{
1180        void __rcu **slot;
1181
1182        BUG_ON(!gmap_is_shadow(sg));
1183        slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
1184        if (slot) {
1185                rmap->next = radix_tree_deref_slot_protected(slot,
1186                                                        &sg->guest_table_lock);
1187                radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap);
1188        } else {
1189                rmap->next = NULL;
1190                radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT,
1191                                  rmap);
1192        }
1193}
1194
1195/**
1196 * gmap_protect_rmap - restrict access rights to memory (RO) and create an rmap
1197 * @sg: pointer to the shadow guest address space structure
1198 * @raddr: rmap address in the shadow gmap
1199 * @paddr: address in the parent guest address space
1200 * @len: length of the memory area to protect
1201 *
1202 * Returns 0 if successfully protected and the rmap was created, -ENOMEM
1203 * if out of memory and -EFAULT if paddr is invalid.
1204 */
1205static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
1206                             unsigned long paddr, unsigned long len)
1207{
1208        struct gmap *parent;
1209        struct gmap_rmap *rmap;
1210        unsigned long vmaddr;
1211        spinlock_t *ptl;
1212        pte_t *ptep;
1213        int rc;
1214
1215        BUG_ON(!gmap_is_shadow(sg));
1216        parent = sg->parent;
1217        while (len) {
1218                vmaddr = __gmap_translate(parent, paddr);
1219                if (IS_ERR_VALUE(vmaddr))
1220                        return vmaddr;
1221                rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT);
1222                if (!rmap)
1223                        return -ENOMEM;
1224                rmap->raddr = raddr;
1225                rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
1226                if (rc) {
1227                        kfree(rmap);
1228                        return rc;
1229                }
1230                rc = -EAGAIN;
1231                ptep = gmap_pte_op_walk(parent, paddr, &ptl);
1232                if (ptep) {
1233                        spin_lock(&sg->guest_table_lock);
1234                        rc = ptep_force_prot(parent->mm, paddr, ptep, PROT_READ,
1235                                             PGSTE_VSIE_BIT);
1236                        if (!rc)
1237                                gmap_insert_rmap(sg, vmaddr, rmap);
1238                        spin_unlock(&sg->guest_table_lock);
1239                        gmap_pte_op_end(ptl);
1240                }
1241                radix_tree_preload_end();
1242                if (rc) {
1243                        kfree(rmap);
1244                        rc = gmap_pte_op_fixup(parent, paddr, vmaddr, PROT_READ);
1245                        if (rc)
1246                                return rc;
1247                        continue;
1248                }
1249                paddr += PAGE_SIZE;
1250                len -= PAGE_SIZE;
1251        }
1252        return 0;
1253}
1254
1255#define _SHADOW_RMAP_MASK       0x7
1256#define _SHADOW_RMAP_REGION1    0x5
1257#define _SHADOW_RMAP_REGION2    0x4
1258#define _SHADOW_RMAP_REGION3    0x3
1259#define _SHADOW_RMAP_SEGMENT    0x2
1260#define _SHADOW_RMAP_PGTABLE    0x1
1261
1262/**
1263 * gmap_idte_one - invalidate a single region or segment table entry
1264 * @asce: region or segment table *origin* + table-type bits
1265 * @vaddr: virtual address to identify the table entry to flush
1266 *
1267 * The invalid bit of a single region or segment table entry is set
1268 * and the associated TLB entries depending on the entry are flushed.
1269 * The table-type of the @asce identifies the portion of the @vaddr
1270 * that is used as the invalidation index.
1271 */
1272static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr)
1273{
1274        asm volatile(
1275                "       .insn   rrf,0xb98e0000,%0,%1,0,0"
1276                : : "a" (asce), "a" (vaddr) : "cc", "memory");
1277}
1278
1279/**
1280 * gmap_unshadow_page - remove a page from a shadow page table
1281 * @sg: pointer to the shadow guest address space structure
1282 * @raddr: rmap address in the shadow guest address space
1283 *
1284 * Called with the sg->guest_table_lock
1285 */
1286static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr)
1287{
1288        unsigned long *table;
1289
1290        BUG_ON(!gmap_is_shadow(sg));
1291        table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */
1292        if (!table || *table & _PAGE_INVALID)
1293                return;
1294        gmap_call_notifier(sg, raddr, raddr + _PAGE_SIZE - 1);
1295        ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table);
1296}
1297
1298/**
1299 * __gmap_unshadow_pgt - remove all entries from a shadow page table
1300 * @sg: pointer to the shadow guest address space structure
1301 * @raddr: rmap address in the shadow guest address space
1302 * @pgt: pointer to the start of a shadow page table
1303 *
1304 * Called with the sg->guest_table_lock
1305 */
1306static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr,
1307                                unsigned long *pgt)
1308{
1309        int i;
1310
1311        BUG_ON(!gmap_is_shadow(sg));
1312        for (i = 0; i < _PAGE_ENTRIES; i++, raddr += _PAGE_SIZE)
1313                pgt[i] = _PAGE_INVALID;
1314}
1315
1316/**
1317 * gmap_unshadow_pgt - remove a shadow page table from a segment entry
1318 * @sg: pointer to the shadow guest address space structure
1319 * @raddr: address in the shadow guest address space
1320 *
1321 * Called with the sg->guest_table_lock
1322 */
1323static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
1324{
1325        unsigned long sto, *ste, *pgt;
1326        struct page *page;
1327
1328        BUG_ON(!gmap_is_shadow(sg));
1329        ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */
1330        if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN))
1331                return;
1332        gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1);
1333        sto = (unsigned long) (ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT));
1334        gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr);
1335        pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN);
1336        *ste = _SEGMENT_ENTRY_EMPTY;
1337        __gmap_unshadow_pgt(sg, raddr, pgt);
1338        /* Free page table */
1339        page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
1340        list_del(&page->lru);
1341        page_table_free_pgste(page);
1342}
1343
1344/**
1345 * __gmap_unshadow_sgt - remove all entries from a shadow segment table
1346 * @sg: pointer to the shadow guest address space structure
1347 * @raddr: rmap address in the shadow guest address space
1348 * @sgt: pointer to the start of a shadow segment table
1349 *
1350 * Called with the sg->guest_table_lock
1351 */
1352static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
1353                                unsigned long *sgt)
1354{
1355        unsigned long *pgt;
1356        struct page *page;
1357        int i;
1358
1359        BUG_ON(!gmap_is_shadow(sg));
1360        for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) {
1361                if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN))
1362                        continue;
1363                pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN);
1364                sgt[i] = _SEGMENT_ENTRY_EMPTY;
1365                __gmap_unshadow_pgt(sg, raddr, pgt);
1366                /* Free page table */
1367                page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
1368                list_del(&page->lru);
1369                page_table_free_pgste(page);
1370        }
1371}
1372
1373/**
1374 * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry
1375 * @sg: pointer to the shadow guest address space structure
1376 * @raddr: rmap address in the shadow guest address space
1377 *
1378 * Called with the shadow->guest_table_lock
1379 */
1380static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
1381{
1382        unsigned long r3o, *r3e, *sgt;
1383        struct page *page;
1384
1385        BUG_ON(!gmap_is_shadow(sg));
1386        r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */
1387        if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN))
1388                return;
1389        gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1);
1390        r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT));
1391        gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr);
1392        sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN);
1393        *r3e = _REGION3_ENTRY_EMPTY;
1394        __gmap_unshadow_sgt(sg, raddr, sgt);
1395        /* Free segment table */
1396        page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
1397        list_del(&page->lru);
1398        __free_pages(page, CRST_ALLOC_ORDER);
1399}
1400
1401/**
1402 * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table
1403 * @sg: pointer to the shadow guest address space structure
1404 * @raddr: address in the shadow guest address space
1405 * @r3t: pointer to the start of a shadow region-3 table
1406 *
1407 * Called with the sg->guest_table_lock
1408 */
1409static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
1410                                unsigned long *r3t)
1411{
1412        unsigned long *sgt;
1413        struct page *page;
1414        int i;
1415
1416        BUG_ON(!gmap_is_shadow(sg));
1417        for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) {
1418                if (!(r3t[i] & _REGION_ENTRY_ORIGIN))
1419                        continue;
1420                sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN);
1421                r3t[i] = _REGION3_ENTRY_EMPTY;
1422                __gmap_unshadow_sgt(sg, raddr, sgt);
1423                /* Free segment table */
1424                page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
1425                list_del(&page->lru);
1426                __free_pages(page, CRST_ALLOC_ORDER);
1427        }
1428}
1429
1430/**
1431 * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry
1432 * @sg: pointer to the shadow guest address space structure
1433 * @raddr: rmap address in the shadow guest address space
1434 *
1435 * Called with the sg->guest_table_lock
1436 */
1437static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
1438{
1439        unsigned long r2o, *r2e, *r3t;
1440        struct page *page;
1441
1442        BUG_ON(!gmap_is_shadow(sg));
1443        r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */
1444        if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN))
1445                return;
1446        gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1);
1447        r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT));
1448        gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr);
1449        r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN);
1450        *r2e = _REGION2_ENTRY_EMPTY;
1451        __gmap_unshadow_r3t(sg, raddr, r3t);
1452        /* Free region 3 table */
1453        page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
1454        list_del(&page->lru);
1455        __free_pages(page, CRST_ALLOC_ORDER);
1456}
1457
1458/**
1459 * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table
1460 * @sg: pointer to the shadow guest address space structure
1461 * @raddr: rmap address in the shadow guest address space
1462 * @r2t: pointer to the start of a shadow region-2 table
1463 *
1464 * Called with the sg->guest_table_lock
1465 */
1466static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
1467                                unsigned long *r2t)
1468{
1469        unsigned long *r3t;
1470        struct page *page;
1471        int i;
1472
1473        BUG_ON(!gmap_is_shadow(sg));
1474        for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) {
1475                if (!(r2t[i] & _REGION_ENTRY_ORIGIN))
1476                        continue;
1477                r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN);
1478                r2t[i] = _REGION2_ENTRY_EMPTY;
1479                __gmap_unshadow_r3t(sg, raddr, r3t);
1480                /* Free region 3 table */
1481                page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
1482                list_del(&page->lru);
1483                __free_pages(page, CRST_ALLOC_ORDER);
1484        }
1485}
1486
1487/**
1488 * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry
1489 * @sg: pointer to the shadow guest address space structure
1490 * @raddr: rmap address in the shadow guest address space
1491 *
1492 * Called with the sg->guest_table_lock
1493 */
1494static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
1495{
1496        unsigned long r1o, *r1e, *r2t;
1497        struct page *page;
1498
1499        BUG_ON(!gmap_is_shadow(sg));
1500        r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */
1501        if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN))
1502                return;
1503        gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1);
1504        r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT));
1505        gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr);
1506        r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN);
1507        *r1e = _REGION1_ENTRY_EMPTY;
1508        __gmap_unshadow_r2t(sg, raddr, r2t);
1509        /* Free region 2 table */
1510        page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
1511        list_del(&page->lru);
1512        __free_pages(page, CRST_ALLOC_ORDER);
1513}
1514
1515/**
1516 * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table
1517 * @sg: pointer to the shadow guest address space structure
1518 * @raddr: rmap address in the shadow guest address space
1519 * @r1t: pointer to the start of a shadow region-1 table
1520 *
1521 * Called with the sg->guest_table_lock
1522 */
1523static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
1524                                unsigned long *r1t)
1525{
1526        unsigned long asce, *r2t;
1527        struct page *page;
1528        int i;
1529
1530        BUG_ON(!gmap_is_shadow(sg));
1531        asce = (unsigned long) r1t | _ASCE_TYPE_REGION1;
1532        for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) {
1533                if (!(r1t[i] & _REGION_ENTRY_ORIGIN))
1534                        continue;
1535                r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN);
1536                __gmap_unshadow_r2t(sg, raddr, r2t);
1537                /* Clear entry and flush translation r1t -> r2t */
1538                gmap_idte_one(asce, raddr);
1539                r1t[i] = _REGION1_ENTRY_EMPTY;
1540                /* Free region 2 table */
1541                page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
1542                list_del(&page->lru);
1543                __free_pages(page, CRST_ALLOC_ORDER);
1544        }
1545}
1546
1547/**
1548 * gmap_unshadow - remove a shadow page table completely
1549 * @sg: pointer to the shadow guest address space structure
1550 *
1551 * Called with sg->guest_table_lock
1552 */
1553static void gmap_unshadow(struct gmap *sg)
1554{
1555        unsigned long *table;
1556
1557        BUG_ON(!gmap_is_shadow(sg));
1558        if (sg->removed)
1559                return;
1560        sg->removed = 1;
1561        gmap_call_notifier(sg, 0, -1UL);
1562        gmap_flush_tlb(sg);
1563        table = (unsigned long *)(sg->asce & _ASCE_ORIGIN);
1564        switch (sg->asce & _ASCE_TYPE_MASK) {
1565        case _ASCE_TYPE_REGION1:
1566                __gmap_unshadow_r1t(sg, 0, table);
1567                break;
1568        case _ASCE_TYPE_REGION2:
1569                __gmap_unshadow_r2t(sg, 0, table);
1570                break;
1571        case _ASCE_TYPE_REGION3:
1572                __gmap_unshadow_r3t(sg, 0, table);
1573                break;
1574        case _ASCE_TYPE_SEGMENT:
1575                __gmap_unshadow_sgt(sg, 0, table);
1576                break;
1577        }
1578}
1579
1580/**
1581 * gmap_find_shadow - find a specific asce in the list of shadow tables
1582 * @parent: pointer to the parent gmap
1583 * @asce: ASCE for which the shadow table is created
1584 * @edat_level: edat level to be used for the shadow translation
1585 *
1586 * Returns the pointer to a gmap if a shadow table with the given asce is
1587 * already available, ERR_PTR(-EAGAIN) if another one is just being created,
1588 * otherwise NULL
1589 */
1590static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce,
1591                                     int edat_level)
1592{
1593        struct gmap *sg;
1594
1595        list_for_each_entry(sg, &parent->children, list) {
1596                if (sg->orig_asce != asce || sg->edat_level != edat_level ||
1597                    sg->removed)
1598                        continue;
1599                if (!sg->initialized)
1600                        return ERR_PTR(-EAGAIN);
1601                refcount_inc(&sg->ref_count);
1602                return sg;
1603        }
1604        return NULL;
1605}
1606
1607/**
1608 * gmap_shadow_valid - check if a shadow guest address space matches the
1609 *                     given properties and is still valid
1610 * @sg: pointer to the shadow guest address space structure
1611 * @asce: ASCE for which the shadow table is requested
1612 * @edat_level: edat level to be used for the shadow translation
1613 *
1614 * Returns 1 if the gmap shadow is still valid and matches the given
1615 * properties, the caller can continue using it. Returns 0 otherwise, the
1616 * caller has to request a new shadow gmap in this case.
1617 *
1618 */
1619int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level)
1620{
1621        if (sg->removed)
1622                return 0;
1623        return sg->orig_asce == asce && sg->edat_level == edat_level;
1624}
1625EXPORT_SYMBOL_GPL(gmap_shadow_valid);
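
/*
 * Illustrative sketch (hypothetical, not taken from any in-tree caller):
 * a user that caches a shadow gmap, similar to what the VSIE code keeps per
 * nested SIE control block, can use gmap_shadow_valid() to decide whether
 * the cached shadow may be reused.  Both the cache structure and the helper
 * below are made up for illustration only.
 */
struct gmap_shadow_cache_example {      /* hypothetical caller-side cache */
        struct gmap *sg;
};

static __maybe_unused bool
gmap_shadow_cache_usable_example(struct gmap_shadow_cache_example *cache,
                                 unsigned long asce, int edat_level)
{
        /* Nothing cached yet, the caller has to request a shadow gmap */
        if (!cache->sg)
                return false;
        /* Reusable only if the ASCE and EDAT level still match */
        return gmap_shadow_valid(cache->sg, asce, edat_level) == 1;
}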
1626
1627/**
1628 * gmap_shadow - create/find a shadow guest address space
1629 * @parent: pointer to the parent gmap
1630 * @asce: ASCE for which the shadow table is created
1631 * @edat_level: edat level to be used for the shadow translation
1632 *
1633 * The pages of the top level page table referred to by the asce parameter
1634 * will be set to read-only and marked in the PGSTEs of the kvm process.
1635 * The shadow table will be removed automatically on any change to the
1636 * PTE mapping for the source table.
1637 *
1638 * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory,
1639 * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the
1640 * parent gmap table could not be protected.
1641 */
1642struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
1643                         int edat_level)
1644{
1645        struct gmap *sg, *new;
1646        unsigned long limit;
1647        int rc;
1648
1649        BUG_ON(parent->mm->context.allow_gmap_hpage_1m);
1650        BUG_ON(gmap_is_shadow(parent));
1651        spin_lock(&parent->shadow_lock);
1652        sg = gmap_find_shadow(parent, asce, edat_level);
1653        spin_unlock(&parent->shadow_lock);
1654        if (sg)
1655                return sg;
1656        /* Create a new shadow gmap */
1657        limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11));
1658        if (asce & _ASCE_REAL_SPACE)
1659                limit = -1UL;
1660        new = gmap_alloc(limit);
1661        if (!new)
1662                return ERR_PTR(-ENOMEM);
1663        new->mm = parent->mm;
1664        new->parent = gmap_get(parent);
1665        new->orig_asce = asce;
1666        new->edat_level = edat_level;
1667        new->initialized = false;
1668        spin_lock(&parent->shadow_lock);
1669        /* Recheck if another CPU created the same shadow */
1670        sg = gmap_find_shadow(parent, asce, edat_level);
1671        if (sg) {
1672                spin_unlock(&parent->shadow_lock);
1673                gmap_free(new);
1674                return sg;
1675        }
1676        if (asce & _ASCE_REAL_SPACE) {
1677                /* only allow one real-space gmap shadow */
1678                list_for_each_entry(sg, &parent->children, list) {
1679                        if (sg->orig_asce & _ASCE_REAL_SPACE) {
1680                                spin_lock(&sg->guest_table_lock);
1681                                gmap_unshadow(sg);
1682                                spin_unlock(&sg->guest_table_lock);
1683                                list_del(&sg->list);
1684                                gmap_put(sg);
1685                                break;
1686                        }
1687                }
1688        }
1689        refcount_set(&new->ref_count, 2);
1690        list_add(&new->list, &parent->children);
1691        if (asce & _ASCE_REAL_SPACE) {
1692                /* nothing to protect, return right away */
1693                new->initialized = true;
1694                spin_unlock(&parent->shadow_lock);
1695                return new;
1696        }
1697        spin_unlock(&parent->shadow_lock);
1698        /* protect after insertion, so it will get properly invalidated */
1699        mmap_read_lock(parent->mm);
1700        rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN,
1701                                ((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE,
1702                                PROT_READ, GMAP_NOTIFY_SHADOW);
1703        mmap_read_unlock(parent->mm);
1704        spin_lock(&parent->shadow_lock);
1705        new->initialized = true;
1706        if (rc) {
1707                list_del(&new->list);
1708                gmap_free(new);
1709                new = ERR_PTR(rc);
1710        }
1711        spin_unlock(&parent->shadow_lock);
1712        return new;
1713}
1714EXPORT_SYMBOL_GPL(gmap_shadow);
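
/*
 * Illustrative sketch (hypothetical, not taken from any in-tree caller):
 * how the return value of gmap_shadow() is meant to be consumed.  Only
 * -EAGAIN is a transient condition worth retrying later; -ENOMEM and
 * -EFAULT are passed on to the caller.  The helper name is made up.
 */
static __maybe_unused struct gmap *gmap_shadow_get_example(struct gmap *parent,
                                                           unsigned long asce,
                                                           int edat_level)
{
        struct gmap *sg;

        sg = gmap_shadow(parent, asce, edat_level);
        if (!IS_ERR(sg))
                return sg;              /* found or created, reference held */
        if (PTR_ERR(sg) == -EAGAIN)
                return NULL;            /* transient race, retry later */
        return sg;                      /* -ENOMEM or -EFAULT */
}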
1715
1716/**
1717 * gmap_shadow_r2t - create an empty shadow region 2 table
1718 * @sg: pointer to the shadow guest address space structure
1719 * @saddr: faulting address in the shadow gmap
1720 * @r2t: parent gmap address of the region 2 table to get shadowed
1721 * @fake: r2t references contiguous guest memory block, not an r2t
1722 *
1723 * The r2t parameter specifies the address of the source table. The
1724 * four pages of the source table are made read-only in the parent gmap
1725 * address space. A write to the source table area @r2t will automatically
1726 * remove the shadow r2 table and all of its descendants.
1727 *
1728 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
1729 * shadow table structure is incomplete, -ENOMEM if out of memory and
1730 * -EFAULT if an address in the parent gmap could not be resolved.
1731 *
1732 * Called with sg->mm->mmap_lock in read.
1733 */
1734int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
1735                    int fake)
1736{
1737        unsigned long raddr, origin, offset, len;
1738        unsigned long *s_r2t, *table;
1739        struct page *page;
1740        int rc;
1741
1742        BUG_ON(!gmap_is_shadow(sg));
1743        /* Allocate a shadow region second table */
1744        page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
1745        if (!page)
1746                return -ENOMEM;
1747        page->index = r2t & _REGION_ENTRY_ORIGIN;
1748        if (fake)
1749                page->index |= GMAP_SHADOW_FAKE_TABLE;
1750        s_r2t = (unsigned long *) page_to_phys(page);
1751        /* Install shadow region second table */
1752        spin_lock(&sg->guest_table_lock);
1753        table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */
1754        if (!table) {
1755                rc = -EAGAIN;           /* Race with unshadow */
1756                goto out_free;
1757        }
1758        if (!(*table & _REGION_ENTRY_INVALID)) {
1759                rc = 0;                 /* Already established */
1760                goto out_free;
1761        } else if (*table & _REGION_ENTRY_ORIGIN) {
1762                rc = -EAGAIN;           /* Race with shadow */
1763                goto out_free;
1764        }
1765        crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY);
1766        /* mark as invalid as long as the parent table is not protected */
1767        *table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH |
1768                 _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID;
1769        if (sg->edat_level >= 1)
1770                *table |= (r2t & _REGION_ENTRY_PROTECT);
1771        list_add(&page->lru, &sg->crst_list);
1772        if (fake) {
1773                /* nothing to protect for fake tables */
1774                *table &= ~_REGION_ENTRY_INVALID;
1775                spin_unlock(&sg->guest_table_lock);
1776                return 0;
1777        }
1778        spin_unlock(&sg->guest_table_lock);
1779        /* Make r2t read-only in parent gmap page table */
1780        raddr = (saddr & _REGION1_MASK) | _SHADOW_RMAP_REGION1;
1781        origin = r2t & _REGION_ENTRY_ORIGIN;
1782        offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
1783        len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
1784        rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
1785        spin_lock(&sg->guest_table_lock);
1786        if (!rc) {
1787                table = gmap_table_walk(sg, saddr, 4);
1788                if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
1789                              (unsigned long) s_r2t)
1790                        rc = -EAGAIN;           /* Race with unshadow */
1791                else
1792                        *table &= ~_REGION_ENTRY_INVALID;
1793        } else {
1794                gmap_unshadow_r2t(sg, raddr);
1795        }
1796        spin_unlock(&sg->guest_table_lock);
1797        return rc;
1798out_free:
1799        spin_unlock(&sg->guest_table_lock);
1800        __free_pages(page, CRST_ALLOC_ORDER);
1801        return rc;
1802}
1803EXPORT_SYMBOL_GPL(gmap_shadow_r2t);
1804
1805/**
1806 * gmap_shadow_r3t - create a shadow region 3 table
1807 * @sg: pointer to the shadow guest address space structure
1808 * @saddr: faulting address in the shadow gmap
1809 * @r3t: parent gmap address of the region 3 table to get shadowed
1810 * @fake: r3t references contiguous guest memory block, not an r3t
1811 *
1812 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
1813 * shadow table structure is incomplete, -ENOMEM if out of memory and
1814 * -EFAULT if an address in the parent gmap could not be resolved.
1815 *
1816 * Called with sg->mm->mmap_lock in read.
1817 */
1818int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
1819                    int fake)
1820{
1821        unsigned long raddr, origin, offset, len;
1822        unsigned long *s_r3t, *table;
1823        struct page *page;
1824        int rc;
1825
1826        BUG_ON(!gmap_is_shadow(sg));
1827        /* Allocate a shadow region third table */
1828        page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
1829        if (!page)
1830                return -ENOMEM;
1831        page->index = r3t & _REGION_ENTRY_ORIGIN;
1832        if (fake)
1833                page->index |= GMAP_SHADOW_FAKE_TABLE;
1834        s_r3t = (unsigned long *) page_to_phys(page);
1835        /* Install shadow region third table */
1836        spin_lock(&sg->guest_table_lock);
1837        table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */
1838        if (!table) {
1839                rc = -EAGAIN;           /* Race with unshadow */
1840                goto out_free;
1841        }
1842        if (!(*table & _REGION_ENTRY_INVALID)) {
1843                rc = 0;                 /* Already established */
1844                goto out_free;
1845        } else if (*table & _REGION_ENTRY_ORIGIN) {
1846                rc = -EAGAIN;           /* Race with shadow */
1847                goto out_free;
1848        }
1849        crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY);
1850        /* mark as invalid as long as the parent table is not protected */
1851        *table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH |
1852                 _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID;
1853        if (sg->edat_level >= 1)
1854                *table |= (r3t & _REGION_ENTRY_PROTECT);
1855        list_add(&page->lru, &sg->crst_list);
1856        if (fake) {
1857                /* nothing to protect for fake tables */
1858                *table &= ~_REGION_ENTRY_INVALID;
1859                spin_unlock(&sg->guest_table_lock);
1860                return 0;
1861        }
1862        spin_unlock(&sg->guest_table_lock);
1863        /* Make r3t read-only in parent gmap page table */
1864        raddr = (saddr & _REGION2_MASK) | _SHADOW_RMAP_REGION2;
1865        origin = r3t & _REGION_ENTRY_ORIGIN;
1866        offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
1867        len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
1868        rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
1869        spin_lock(&sg->guest_table_lock);
1870        if (!rc) {
1871                table = gmap_table_walk(sg, saddr, 3);
1872                if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
1873                              (unsigned long) s_r3t)
1874                        rc = -EAGAIN;           /* Race with unshadow */
1875                else
1876                        *table &= ~_REGION_ENTRY_INVALID;
1877        } else {
1878                gmap_unshadow_r3t(sg, raddr);
1879        }
1880        spin_unlock(&sg->guest_table_lock);
1881        return rc;
1882out_free:
1883        spin_unlock(&sg->guest_table_lock);
1884        __free_pages(page, CRST_ALLOC_ORDER);
1885        return rc;
1886}
1887EXPORT_SYMBOL_GPL(gmap_shadow_r3t);
1888
1889/**
1890 * gmap_shadow_sgt - create a shadow segment table
1891 * @sg: pointer to the shadow guest address space structure
1892 * @saddr: faulting address in the shadow gmap
1893 * @sgt: parent gmap address of the segment table to get shadowed
1894 * @fake: sgt references contiguous guest memory block, not an sgt
1895 *
1896 * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the
1897 * shadow table structure is incomplete, -ENOMEM if out of memory and
1898 * -EFAULT if an address in the parent gmap could not be resolved.
1899 *
1900 * Called with sg->mm->mmap_lock in read.
1901 */
1902int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
1903                    int fake)
1904{
1905        unsigned long raddr, origin, offset, len;
1906        unsigned long *s_sgt, *table;
1907        struct page *page;
1908        int rc;
1909
1910        BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE));
1911        /* Allocate a shadow segment table */
1912        page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
1913        if (!page)
1914                return -ENOMEM;
1915        page->index = sgt & _REGION_ENTRY_ORIGIN;
1916        if (fake)
1917                page->index |= GMAP_SHADOW_FAKE_TABLE;
1918        s_sgt = (unsigned long *) page_to_phys(page);
1919        /* Install shadow segment table */
1920        spin_lock(&sg->guest_table_lock);
1921        table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */
1922        if (!table) {
1923                rc = -EAGAIN;           /* Race with unshadow */
1924                goto out_free;
1925        }
1926        if (!(*table & _REGION_ENTRY_INVALID)) {
1927                rc = 0;                 /* Already established */
1928                goto out_free;
1929        } else if (*table & _REGION_ENTRY_ORIGIN) {
1930                rc = -EAGAIN;           /* Race with shadow */
1931                goto out_free;
1932        }
1933        crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY);
1934        /* mark as invalid as long as the parent table is not protected */
1935        *table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH |
1936                 _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID;
1937        if (sg->edat_level >= 1)
1938                *table |= sgt & _REGION_ENTRY_PROTECT;
1939        list_add(&page->lru, &sg->crst_list);
1940        if (fake) {
1941                /* nothing to protect for fake tables */
1942                *table &= ~_REGION_ENTRY_INVALID;
1943                spin_unlock(&sg->guest_table_lock);
1944                return 0;
1945        }
1946        spin_unlock(&sg->guest_table_lock);
1947        /* Make sgt read-only in parent gmap page table */
1948        raddr = (saddr & _REGION3_MASK) | _SHADOW_RMAP_REGION3;
1949        origin = sgt & _REGION_ENTRY_ORIGIN;
1950        offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
1951        len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
1952        rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
1953        spin_lock(&sg->guest_table_lock);
1954        if (!rc) {
1955                table = gmap_table_walk(sg, saddr, 2);
1956                if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
1957                              (unsigned long) s_sgt)
1958                        rc = -EAGAIN;           /* Race with unshadow */
1959                else
1960                        *table &= ~_REGION_ENTRY_INVALID;
1961        } else {
1962                gmap_unshadow_sgt(sg, raddr);
1963        }
1964        spin_unlock(&sg->guest_table_lock);
1965        return rc;
1966out_free:
1967        spin_unlock(&sg->guest_table_lock);
1968        __free_pages(page, CRST_ALLOC_ORDER);
1969        return rc;
1970}
1971EXPORT_SYMBOL_GPL(gmap_shadow_sgt);
1972
1973/**
1974 * gmap_shadow_pgt_lookup - find a shadow page table
1975 * @sg: pointer to the shadow guest address space structure
1976 * @saddr: the address in the shadow guest address space
1977 * @pgt: parent gmap address of the page table to get shadowed
1978 * @dat_protection: if the pgtable is marked as protected by dat
1979 * @fake: pgt references contiguous guest memory block, not a pgtable
1980 *
1981 * Returns 0 if the shadow page table was found and -EAGAIN if the page
1982 * table was not found.
1983 *
1984 * Called with sg->mm->mmap_lock in read.
1985 */
1986int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
1987                           unsigned long *pgt, int *dat_protection,
1988                           int *fake)
1989{
1990        unsigned long *table;
1991        struct page *page;
1992        int rc;
1993
1994        BUG_ON(!gmap_is_shadow(sg));
1995        spin_lock(&sg->guest_table_lock);
1996        table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
1997        if (table && !(*table & _SEGMENT_ENTRY_INVALID)) {
1998                /* Shadow page tables are full pages (pte+pgste) */
1999                page = pfn_to_page(*table >> PAGE_SHIFT);
2000                *pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE;
2001                *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT);
2002                *fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE);
2003                rc = 0;
2004        } else {
2005                rc = -EAGAIN;
2006        }
2007        spin_unlock(&sg->guest_table_lock);
2008        return rc;
2009
2010}
2011EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup);
2012
2013/**
2014 * gmap_shadow_pgt - instantiate a shadow page table
2015 * @sg: pointer to the shadow guest address space structure
2016 * @saddr: faulting address in the shadow gmap
2017 * @pgt: parent gmap address of the page table to get shadowed
2018 * @fake: pgt references contiguous guest memory block, not a pgtable
2019 *
2020 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
2021 * shadow table structure is incomplete, -ENOMEM if out of memory and
2022 * -EFAULT if an address in the parent gmap could not be resolved.
2023 *
2024 * Called with sg->mm->mmap_lock in read.
2025 */
2026int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
2027                    int fake)
2028{
2029        unsigned long raddr, origin;
2030        unsigned long *s_pgt, *table;
2031        struct page *page;
2032        int rc;
2033
2034        BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE));
2035        /* Allocate a shadow page table */
2036        page = page_table_alloc_pgste(sg->mm);
2037        if (!page)
2038                return -ENOMEM;
2039        page->index = pgt & _SEGMENT_ENTRY_ORIGIN;
2040        if (fake)
2041                page->index |= GMAP_SHADOW_FAKE_TABLE;
2042        s_pgt = (unsigned long *) page_to_phys(page);
2043        /* Install shadow page table */
2044        spin_lock(&sg->guest_table_lock);
2045        table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
2046        if (!table) {
2047                rc = -EAGAIN;           /* Race with unshadow */
2048                goto out_free;
2049        }
2050        if (!(*table & _SEGMENT_ENTRY_INVALID)) {
2051                rc = 0;                 /* Already established */
2052                goto out_free;
2053        } else if (*table & _SEGMENT_ENTRY_ORIGIN) {
2054                rc = -EAGAIN;           /* Race with shadow */
2055                goto out_free;
2056        }
2057        /* mark as invalid as long as the parent table is not protected */
2058        *table = (unsigned long) s_pgt | _SEGMENT_ENTRY |
2059                 (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID;
2060        list_add(&page->lru, &sg->pt_list);
2061        if (fake) {
2062                /* nothing to protect for fake tables */
2063                *table &= ~_SEGMENT_ENTRY_INVALID;
2064                spin_unlock(&sg->guest_table_lock);
2065                return 0;
2066        }
2067        spin_unlock(&sg->guest_table_lock);
2068        /* Make pgt read-only in parent gmap page table (not the pgste) */
2069        raddr = (saddr & _SEGMENT_MASK) | _SHADOW_RMAP_SEGMENT;
2070        origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK;
2071        rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE);
2072        spin_lock(&sg->guest_table_lock);
2073        if (!rc) {
2074                table = gmap_table_walk(sg, saddr, 1);
2075                if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) !=
2076                              (unsigned long) s_pgt)
2077                        rc = -EAGAIN;           /* Race with unshadow */
2078                else
2079                        *table &= ~_SEGMENT_ENTRY_INVALID;
2080        } else {
2081                gmap_unshadow_pgt(sg, raddr);
2082        }
2083        spin_unlock(&sg->guest_table_lock);
2084        return rc;
2085out_free:
2086        spin_unlock(&sg->guest_table_lock);
2087        page_table_free_pgste(page);
2088        return rc;
2089
2090}
2091EXPORT_SYMBOL_GPL(gmap_shadow_pgt);
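
/*
 * Illustrative sketch (hypothetical, not taken from any in-tree caller):
 * a shadow fault handler typically combines gmap_shadow_pgt_lookup() and
 * gmap_shadow_pgt().  The pgt_origin and fake arguments are assumed to come
 * from the caller's own walk of the parent guest's DAT tables; the helper
 * name is made up.
 */
static __maybe_unused int gmap_shadow_pgt_example(struct gmap *sg,
                                                  unsigned long saddr,
                                                  unsigned long pgt_origin,
                                                  int fake)
{
        unsigned long pgt;
        int dat_protection, found_fake;

        /* Fast path: a shadow page table is already installed for saddr */
        if (!gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection,
                                    &found_fake))
                return 0;
        /* Otherwise instantiate one: 0, -EAGAIN, -ENOMEM or -EFAULT */
        return gmap_shadow_pgt(sg, saddr, pgt_origin, fake);
}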
2092
2093/**
2094 * gmap_shadow_page - create a shadow page mapping
2095 * @sg: pointer to the shadow guest address space structure
2096 * @saddr: faulting address in the shadow gmap
2097 * @pte: pte in parent gmap address space to get shadowed
2098 *
2099 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
2100 * shadow table structure is incomplete, -ENOMEM if out of memory and
2101 * -EFAULT if an address in the parent gmap could not be resolved.
2102 *
2103 * Called with sg->mm->mmap_lock in read.
2104 */
2105int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
2106{
2107        struct gmap *parent;
2108        struct gmap_rmap *rmap;
2109        unsigned long vmaddr, paddr;
2110        spinlock_t *ptl;
2111        pte_t *sptep, *tptep;
2112        int prot;
2113        int rc;
2114
2115        BUG_ON(!gmap_is_shadow(sg));
2116        parent = sg->parent;
2117        prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE;
2118
2119        rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT);
2120        if (!rmap)
2121                return -ENOMEM;
2122        rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE;
2123
2124        while (1) {
2125                paddr = pte_val(pte) & PAGE_MASK;
2126                vmaddr = __gmap_translate(parent, paddr);
2127                if (IS_ERR_VALUE(vmaddr)) {
2128                        rc = vmaddr;
2129                        break;
2130                }
2131                rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
2132                if (rc)
2133                        break;
2134                rc = -EAGAIN;
2135                sptep = gmap_pte_op_walk(parent, paddr, &ptl);
2136                if (sptep) {
2137                        spin_lock(&sg->guest_table_lock);
2138                        /* Get page table pointer */
2139                        tptep = (pte_t *) gmap_table_walk(sg, saddr, 0);
2140                        if (!tptep) {
2141                                spin_unlock(&sg->guest_table_lock);
2142                                gmap_pte_op_end(ptl);
2143                                radix_tree_preload_end();
2144                                break;
2145                        }
2146                        rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte);
2147                        if (rc > 0) {
2148                                /* Success and a new mapping */
2149                                gmap_insert_rmap(sg, vmaddr, rmap);
2150                                rmap = NULL;
2151                                rc = 0;
2152                        }
2153                        gmap_pte_op_end(ptl);
2154                        spin_unlock(&sg->guest_table_lock);
2155                }
2156                radix_tree_preload_end();
2157                if (!rc)
2158                        break;
2159                rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
2160                if (rc)
2161                        break;
2162        }
2163        kfree(rmap);
2164        return rc;
2165}
2166EXPORT_SYMBOL_GPL(gmap_shadow_page);
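
/*
 * Illustrative sketch (hypothetical, not taken from any in-tree caller):
 * the last step of a shadow fault mirrors a single pte of the parent guest
 * into the shadow page table.  The pte value is assumed to have been read
 * from the parent guest's page table by the caller; the helper name is
 * made up.
 */
static __maybe_unused int gmap_shadow_page_example(struct gmap *sg,
                                                   unsigned long saddr,
                                                   unsigned long parent_pte,
                                                   int read_only)
{
        pte_t pte = __pte(parent_pte);

        /* A read-only mapping in the guest is shadowed write-protected */
        if (read_only)
                pte = __pte(pte_val(pte) | _PAGE_PROTECT);
        /* 0 on success, or -EAGAIN, -ENOMEM, -EFAULT as documented above */
        return gmap_shadow_page(sg, saddr, pte);
}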
2167
2168/**
2169 * gmap_shadow_notify - handle notifications for shadow gmap
     * @sg: pointer to the shadow guest address space structure
     * @vmaddr: host virtual address that triggered the notification
     * @gaddr: guest address in the parent gmap that maps @vmaddr
2170 *
2171 * Called with sg->parent->shadow_lock.
2172 */
2173static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
2174                               unsigned long gaddr)
2175{
2176        struct gmap_rmap *rmap, *rnext, *head;
2177        unsigned long start, end, bits, raddr;
2178
2179        BUG_ON(!gmap_is_shadow(sg));
2180
2181        spin_lock(&sg->guest_table_lock);
2182        if (sg->removed) {
2183                spin_unlock(&sg->guest_table_lock);
2184                return;
2185        }
2186        /* Check for top level table */
2187        start = sg->orig_asce & _ASCE_ORIGIN;
2188        end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE;
2189        if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start &&
2190            gaddr < end) {
2191                /* The complete shadow table has to go */
2192                gmap_unshadow(sg);
2193                spin_unlock(&sg->guest_table_lock);
2194                list_del(&sg->list);
2195                gmap_put(sg);
2196                return;
2197        }
2198        /* Remove the page table tree from one specific entry */
2199        head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
2200        gmap_for_each_rmap_safe(rmap, rnext, head) {
2201                bits = rmap->raddr & _SHADOW_RMAP_MASK;
2202                raddr = rmap->raddr ^ bits;
2203                switch (bits) {
2204                case _SHADOW_RMAP_REGION1:
2205                        gmap_unshadow_r2t(sg, raddr);
2206                        break;
2207                case _SHADOW_RMAP_REGION2:
2208                        gmap_unshadow_r3t(sg, raddr);
2209                        break;
2210                case _SHADOW_RMAP_REGION3:
2211                        gmap_unshadow_sgt(sg, raddr);
2212                        break;
2213                case _SHADOW_RMAP_SEGMENT:
2214                        gmap_unshadow_pgt(sg, raddr);
2215                        break;
2216                case _SHADOW_RMAP_PGTABLE:
2217                        gmap_unshadow_page(sg, raddr);
2218                        break;
2219                }
2220                kfree(rmap);
2221        }
2222        spin_unlock(&sg->guest_table_lock);
2223}
2224
2225/**
2226 * ptep_notify - call all invalidation callbacks for a specific pte.
2227 * @mm: pointer to the process mm_struct
2228 * @vmaddr: virtual address in the process address space
2229 * @pte: pointer to the page table entry
2230 * @bits: bits from the pgste that caused the notify call
2231 *
2232 * This function is assumed to be called with the page table lock held
2233 * for the pte to notify.
2234 */
2235void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
2236                 pte_t *pte, unsigned long bits)
2237{
2238        unsigned long offset, gaddr = 0;
2239        unsigned long *table;
2240        struct gmap *gmap, *sg, *next;
2241
2242        offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
2243        offset = offset * (PAGE_SIZE / sizeof(pte_t));
2244        rcu_read_lock();
2245        list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
2246                spin_lock(&gmap->guest_table_lock);
2247                table = radix_tree_lookup(&gmap->host_to_guest,
2248                                          vmaddr >> PMD_SHIFT);
2249                if (table)
2250                        gaddr = __gmap_segment_gaddr(table) + offset;
2251                spin_unlock(&gmap->guest_table_lock);
2252                if (!table)
2253                        continue;
2254
2255                if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) {
2256                        spin_lock(&gmap->shadow_lock);
2257                        list_for_each_entry_safe(sg, next,
2258                                                 &gmap->children, list)
2259                                gmap_shadow_notify(sg, vmaddr, gaddr);
2260                        spin_unlock(&gmap->shadow_lock);
2261                }
2262                if (bits & PGSTE_IN_BIT)
2263                        gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1);
2264        }
2265        rcu_read_unlock();
2266}
2267EXPORT_SYMBOL_GPL(ptep_notify);
2268
2269static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp,
2270                             unsigned long gaddr)
2271{
2272        pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_IN;
2273        gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1);
2274}
2275
2276/**
2277 * gmap_pmdp_xchg - exchange a gmap pmd with another
2278 * @gmap: pointer to the guest address space structure
2279 * @pmdp: pointer to the pmd entry
2280 * @new: replacement entry
2281 * @gaddr: the affected guest address
2282 *
2283 * This function is assumed to be called with the guest_table_lock
2284 * held.
2285 */
2286static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new,
2287                           unsigned long gaddr)
2288{
2289        gaddr &= HPAGE_MASK;
2290        pmdp_notify_gmap(gmap, pmdp, gaddr);
2291        pmd_val(new) &= ~_SEGMENT_ENTRY_GMAP_IN;
2292        if (MACHINE_HAS_TLB_GUEST)
2293                __pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce,
2294                            IDTE_GLOBAL);
2295        else if (MACHINE_HAS_IDTE)
2296                __pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL);
2297        else
2298                __pmdp_csp(pmdp);
2299        *pmdp = new;
2300}
2301
2302static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr,
2303                            int purge)
2304{
2305        pmd_t *pmdp;
2306        struct gmap *gmap;
2307        unsigned long gaddr;
2308
2309        rcu_read_lock();
2310        list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
2311                spin_lock(&gmap->guest_table_lock);
2312                pmdp = (pmd_t *)radix_tree_delete(&gmap->host_to_guest,
2313                                                  vmaddr >> PMD_SHIFT);
2314                if (pmdp) {
2315                        gaddr = __gmap_segment_gaddr((unsigned long *)pmdp);
2316                        pmdp_notify_gmap(gmap, pmdp, gaddr);
2317                        WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
2318                                                   _SEGMENT_ENTRY_GMAP_UC));
2319                        if (purge)
2320                                __pmdp_csp(pmdp);
2321                        pmd_val(*pmdp) = _SEGMENT_ENTRY_EMPTY;
2322                }
2323                spin_unlock(&gmap->guest_table_lock);
2324        }
2325        rcu_read_unlock();
2326}
2327
2328/**
2329 * gmap_pmdp_invalidate - invalidate all affected guest pmd entries without
2330 *                        flushing
2331 * @mm: pointer to the process mm_struct
2332 * @vmaddr: virtual address in the process address space
2333 */
2334void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr)
2335{
2336        gmap_pmdp_clear(mm, vmaddr, 0);
2337}
2338EXPORT_SYMBOL_GPL(gmap_pmdp_invalidate);
2339
2340/**
2341 * gmap_pmdp_csp - csp all affected guest pmd entries
2342 * @mm: pointer to the process mm_struct
2343 * @vmaddr: virtual address in the process address space
2344 */
2345void gmap_pmdp_csp(struct mm_struct *mm, unsigned long vmaddr)
2346{
2347        gmap_pmdp_clear(mm, vmaddr, 1);
2348}
2349EXPORT_SYMBOL_GPL(gmap_pmdp_csp);
2350
2351/**
2352 * gmap_pmdp_idte_local - invalidate and clear a guest pmd entry
2353 * @mm: pointer to the process mm_struct
2354 * @vmaddr: virtual address in the process address space
2355 */
2356void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr)
2357{
2358        unsigned long *entry, gaddr;
2359        struct gmap *gmap;
2360        pmd_t *pmdp;
2361
2362        rcu_read_lock();
2363        list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
2364                spin_lock(&gmap->guest_table_lock);
2365                entry = radix_tree_delete(&gmap->host_to_guest,
2366                                          vmaddr >> PMD_SHIFT);
2367                if (entry) {
2368                        pmdp = (pmd_t *)entry;
2369                        gaddr = __gmap_segment_gaddr(entry);
2370                        pmdp_notify_gmap(gmap, pmdp, gaddr);
2371                        WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
2372                                           _SEGMENT_ENTRY_GMAP_UC));
2373                        if (MACHINE_HAS_TLB_GUEST)
2374                                __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
2375                                            gmap->asce, IDTE_LOCAL);
2376                        else if (MACHINE_HAS_IDTE)
2377                                __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL);
2378                        *entry = _SEGMENT_ENTRY_EMPTY;
2379                }
2380                spin_unlock(&gmap->guest_table_lock);
2381        }
2382        rcu_read_unlock();
2383}
2384EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local);
2385
2386/**
2387 * gmap_pmdp_idte_global - invalidate and clear a guest pmd entry
2388 * @mm: pointer to the process mm_struct
2389 * @vmaddr: virtual address in the process address space
2390 */
2391void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr)
2392{
2393        unsigned long *entry, gaddr;
2394        struct gmap *gmap;
2395        pmd_t *pmdp;
2396
2397        rcu_read_lock();
2398        list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
2399                spin_lock(&gmap->guest_table_lock);
2400                entry = radix_tree_delete(&gmap->host_to_guest,
2401                                          vmaddr >> PMD_SHIFT);
2402                if (entry) {
2403                        pmdp = (pmd_t *)entry;
2404                        gaddr = __gmap_segment_gaddr(entry);
2405                        pmdp_notify_gmap(gmap, pmdp, gaddr);
2406                        WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
2407                                           _SEGMENT_ENTRY_GMAP_UC));
2408                        if (MACHINE_HAS_TLB_GUEST)
2409                                __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
2410                                            gmap->asce, IDTE_GLOBAL);
2411                        else if (MACHINE_HAS_IDTE)
2412                                __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL);
2413                        else
2414                                __pmdp_csp(pmdp);
2415                        *entry = _SEGMENT_ENTRY_EMPTY;
2416                }
2417                spin_unlock(&gmap->guest_table_lock);
2418        }
2419        rcu_read_unlock();
2420}
2421EXPORT_SYMBOL_GPL(gmap_pmdp_idte_global);
2422
2423/**
2424 * gmap_test_and_clear_dirty_pmd - test and reset segment dirty status
2425 * @gmap: pointer to guest address space
2426 * @pmdp: pointer to the pmd to be tested
2427 * @gaddr: virtual address in the guest address space
2428 *
2429 * This function is assumed to be called with the guest_table_lock
2430 * held.
2431 */
2432static bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp,
2433                                          unsigned long gaddr)
2434{
2435        if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
2436                return false;
2437
2438        /* Already protected memory that did not change is clean */
2439        if (pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT &&
2440            !(pmd_val(*pmdp) & _SEGMENT_ENTRY_GMAP_UC))
2441                return false;
2442
2443        /* Clear UC indication and reset protection */
2444        pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_UC;
2445        gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0);
2446        return true;
2447}
2448
2449/**
2450 * gmap_sync_dirty_log_pmd - set bitmap based on dirty status of segment
2451 * @gmap: pointer to guest address space
2452 * @bitmap: dirty bitmap for this pmd
2453 * @gaddr: virtual address in the guest address space
2454 * @vmaddr: virtual address in the host address space
2455 *
2456 * This function is assumed to be called with the guest_table_lock
2457 * held.
2458 */
2459void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4],
2460                             unsigned long gaddr, unsigned long vmaddr)
2461{
2462        int i;
2463        pmd_t *pmdp;
2464        pte_t *ptep;
2465        spinlock_t *ptl;
2466
2467        pmdp = gmap_pmd_op_walk(gmap, gaddr);
2468        if (!pmdp)
2469                return;
2470
2471        if (pmd_large(*pmdp)) {
2472                if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr))
2473                        bitmap_fill(bitmap, _PAGE_ENTRIES);
2474        } else {
2475                for (i = 0; i < _PAGE_ENTRIES; i++, vmaddr += PAGE_SIZE) {
2476                        ptep = pte_alloc_map_lock(gmap->mm, pmdp, vmaddr, &ptl);
2477                        if (!ptep)
2478                                continue;
2479                        if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep))
2480                                set_bit(i, bitmap);
2481                        spin_unlock(ptl);
2482                }
2483        }
2484        gmap_pmd_op_end(gmap, pmdp);
2485}
2486EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd);
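
/*
 * Illustrative sketch (hypothetical, not taken from any in-tree caller):
 * one call covers a single 1M segment, so the caller provides a 256-bit
 * bitmap (4 unsigned longs) with one bit per 4K page.  gaddr and vmaddr are
 * assumed to refer to the same segment in the guest and the host address
 * space; the helper name is made up.
 */
static __maybe_unused void gmap_sync_dirty_log_example(struct gmap *gmap,
                                                       unsigned long gaddr,
                                                       unsigned long vmaddr)
{
        unsigned long bitmap[4];

        bitmap_zero(bitmap, _PAGE_ENTRIES);
        gmap_sync_dirty_log_pmd(gmap, bitmap, gaddr & HPAGE_MASK,
                                vmaddr & HPAGE_MASK);
        /* ... consume the bits set in bitmap[] for this segment ... */
}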
2487
2488#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2489static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
2490                                    unsigned long end, struct mm_walk *walk)
2491{
2492        struct vm_area_struct *vma = walk->vma;
2493
2494        split_huge_pmd(vma, pmd, addr);
2495        return 0;
2496}
2497
2498static const struct mm_walk_ops thp_split_walk_ops = {
2499        .pmd_entry      = thp_split_walk_pmd_entry,
2500};
2501
2502static inline void thp_split_mm(struct mm_struct *mm)
2503{
2504        struct vm_area_struct *vma;
2505
2506        for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
2507                vma->vm_flags &= ~VM_HUGEPAGE;
2508                vma->vm_flags |= VM_NOHUGEPAGE;
2509                walk_page_vma(vma, &thp_split_walk_ops, NULL);
2510        }
2511        mm->def_flags |= VM_NOHUGEPAGE;
2512}
2513#else
2514static inline void thp_split_mm(struct mm_struct *mm)
2515{
2516}
2517#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
2518
2519/*
2520 * Remove all empty zero pages from the mapping for lazy refaulting
2521 * - This must be called after mm->context.has_pgste is set, to avoid
2522 *   future creation of zero pages
2523 * - This must be called after THP was disabled
2524 */
2525static int __zap_zero_pages(pmd_t *pmd, unsigned long start,
2526                           unsigned long end, struct mm_walk *walk)
2527{
2528        unsigned long addr;
2529
2530        for (addr = start; addr != end; addr += PAGE_SIZE) {
2531                pte_t *ptep;
2532                spinlock_t *ptl;
2533
2534                ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
2535                if (is_zero_pfn(pte_pfn(*ptep)))
2536                        ptep_xchg_direct(walk->mm, addr, ptep, __pte(_PAGE_INVALID));
2537                pte_unmap_unlock(ptep, ptl);
2538        }
2539        return 0;
2540}
2541
2542static const struct mm_walk_ops zap_zero_walk_ops = {
2543        .pmd_entry      = __zap_zero_pages,
2544};
2545
2546/*
2547 * switch on pgstes for its userspace process (for kvm)
2548 */
2549int s390_enable_sie(void)
2550{
2551        struct mm_struct *mm = current->mm;
2552
2553        /* Do we have pgstes? if yes, we are done */
2554        if (mm_has_pgste(mm))
2555                return 0;
2556        /* Fail if the page tables are 2K */
2557        if (!mm_alloc_pgste(mm))
2558                return -EINVAL;
2559        mmap_write_lock(mm);
2560        mm->context.has_pgste = 1;
2561        /* split thp mappings and disable thp for future mappings */
2562        thp_split_mm(mm);
2563        walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL);
2564        mmap_write_unlock(mm);
2565        return 0;
2566}
2567EXPORT_SYMBOL_GPL(s390_enable_sie);
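
/*
 * Illustrative sketch (hypothetical, not taken from any in-tree caller):
 * a hypervisor backend enables pgstes once per process before creating its
 * guest address space.  The helper name and the way the limit is picked are
 * made up.
 */
static __maybe_unused struct gmap *gmap_setup_example(unsigned long mem_limit)
{
        /* Switch the current mm to pgste page tables; no-op if already done */
        if (s390_enable_sie())
                return NULL;
        /* Create the guest address space; returns NULL if out of memory */
        return gmap_create(current->mm, mem_limit);
}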
2568
2569int gmap_mark_unmergeable(void)
2570{
2571        struct mm_struct *mm = current->mm;
2572        struct vm_area_struct *vma;
2573        int ret;
2574
2575        for (vma = mm->mmap; vma; vma = vma->vm_next) {
2576                ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
2577                                  MADV_UNMERGEABLE, &vma->vm_flags);
2578                if (ret)
2579                        return ret;
2580        }
2581        mm->def_flags &= ~VM_MERGEABLE;
2582        return 0;
2583}
2584EXPORT_SYMBOL_GPL(gmap_mark_unmergeable);
2585
2586/*
2587 * Enable storage key handling from now on and initialize the storage
2588 * keys with the default key.
2589 */
2590static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr,
2591                                  unsigned long next, struct mm_walk *walk)
2592{
2593        /* Clear storage key */
2594        ptep_zap_key(walk->mm, addr, pte);
2595        return 0;
2596}
2597
2598static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr,
2599                                      unsigned long hmask, unsigned long next,
2600                                      struct mm_walk *walk)
2601{
2602        pmd_t *pmd = (pmd_t *)pte;
2603        unsigned long start, end;
2604        struct page *page = pmd_page(*pmd);
2605
2606        /*
2607         * The write check makes sure we do not set a key on shared
2608         * memory. This is needed as the walker does not differentiate
2609         * between actual guest memory and the process executable or
2610         * shared libraries.
2611         */
2612        if (pmd_val(*pmd) & _SEGMENT_ENTRY_INVALID ||
2613            !(pmd_val(*pmd) & _SEGMENT_ENTRY_WRITE))
2614                return 0;
2615
2616        start = pmd_val(*pmd) & HPAGE_MASK;
2617        end = start + HPAGE_SIZE - 1;
2618        __storage_key_init_range(start, end);
2619        set_bit(PG_arch_1, &page->flags);
2620        return 0;
2621}
2622
2623static const struct mm_walk_ops enable_skey_walk_ops = {
2624        .hugetlb_entry          = __s390_enable_skey_hugetlb,
2625        .pte_entry              = __s390_enable_skey_pte,
2626};
2627
2628int s390_enable_skey(void)
2629{
2630        struct mm_struct *mm = current->mm;
2631        int rc = 0;
2632
2633        mmap_write_lock(mm);
2634        if (mm_uses_skeys(mm))
2635                goto out_up;
2636
2637        mm->context.uses_skeys = 1;
2638        rc = gmap_mark_unmergeable();
2639        if (rc) {
2640                mm->context.uses_skeys = 0;
2641                goto out_up;
2642        }
2643        walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL);
2644
2645out_up:
2646        mmap_write_unlock(mm);
2647        return rc;
2648}
2649EXPORT_SYMBOL_GPL(s390_enable_skey);
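
/*
 * Illustrative sketch (hypothetical, not taken from any in-tree caller):
 * storage key handling is enabled lazily, for instance when a guest executes
 * its first storage key instruction.  The intercept context and the helper
 * name are made up.
 */
static __maybe_unused int gmap_handle_skey_intercept_example(void)
{
        /* Nothing to do if this mm already uses storage keys */
        if (mm_uses_skeys(current->mm))
                return 0;
        /*
         * First key operation of the guest: enable key handling for the
         * whole mm, then let the caller retry the intercepted instruction.
         */
        return s390_enable_skey();
}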
2650
2651/*
2652 * Reset CMMA state, make all pages stable again.
2653 */
2654static int __s390_reset_cmma(pte_t *pte, unsigned long addr,
2655                             unsigned long next, struct mm_walk *walk)
2656{
2657        ptep_zap_unused(walk->mm, addr, pte, 1);
2658        return 0;
2659}
2660
2661static const struct mm_walk_ops reset_cmma_walk_ops = {
2662        .pte_entry              = __s390_reset_cmma,
2663};
2664
2665void s390_reset_cmma(struct mm_struct *mm)
2666{
2667        mmap_write_lock(mm);
2668        walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL);
2669        mmap_write_unlock(mm);
2670}
2671EXPORT_SYMBOL_GPL(s390_reset_cmma);
2672
2673/*
2674 * make inaccessible pages accessible again
2675 */
2676static int __s390_reset_acc(pte_t *ptep, unsigned long addr,
2677                            unsigned long next, struct mm_walk *walk)
2678{
2679        pte_t pte = READ_ONCE(*ptep);
2680
2681        if (pte_present(pte))
2682                WARN_ON_ONCE(uv_destroy_page(pte_val(pte) & PAGE_MASK));
2683        return 0;
2684}
2685
2686static const struct mm_walk_ops reset_acc_walk_ops = {
2687        .pte_entry              = __s390_reset_acc,
2688};
2689
2690#include <linux/sched/mm.h>
2691void s390_reset_acc(struct mm_struct *mm)
2692{
2693        if (!mm_is_protected(mm))
2694                return;
2695        /*
2696         * We might be called during:
2697         * - reset:                             we walk the pages and clear
2698         * - close of all kvm file descriptors: we walk the pages and clear
2699         * - exit of process on fd closure:     vma already gone, do nothing
2700         */
2701        if (!mmget_not_zero(mm))
2702                return;
2703        mmap_read_lock(mm);
2704        walk_page_range(mm, 0, TASK_SIZE, &reset_acc_walk_ops, NULL);
2705        mmap_read_unlock(mm);
2706        mmput(mm);
2707}
2708EXPORT_SYMBOL_GPL(s390_reset_acc);
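
/*
 * Illustrative sketch (hypothetical, not taken from any in-tree caller):
 * depending on the kind of reset, a hypervisor may want to make previously
 * inaccessible pages accessible again and/or drop all CMMA state.  Which of
 * the two calls applies is up to the caller; the helper name is made up.
 */
static __maybe_unused void gmap_reset_guest_example(struct mm_struct *mm)
{
        /* No-op unless the mm belongs to a protected (secure) guest */
        s390_reset_acc(mm);
        /* Make all pages stable again and reset the CMMA state */
        s390_reset_cmma(mm);
}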
2709