linux/arch/s390/mm/pgtable.c
/*
 *  arch/s390/mm/pgtable.c
 *
 *    Copyright IBM Corp. 2007
 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

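/*
 * Each 4K page on mm->context.pgtable_list holds TABLES_PER_PAGE page
 * tables of 256 entries; the low bits of page->flags (FRAG_MASK) record
 * which of these fragments are in use.  For mms that need pgstes (KVM)
 * or noexec shadow tables, a page table always claims the fragment that
 * follows it as well; SECOND_HALVES names the flag bits of those second
 * halves.
 */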
#ifndef CONFIG_64BIT
#define ALLOC_ORDER     1
#define TABLES_PER_PAGE 4
#define FRAG_MASK       15UL
#define SECOND_HALVES   10UL

void clear_table_pgstes(unsigned long *table)
{
        clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/4);
        memset(table + 256, 0, PAGE_SIZE/4);
        clear_table(table + 512, _PAGE_TYPE_EMPTY, PAGE_SIZE/4);
        memset(table + 768, 0, PAGE_SIZE/4);
}

#else
#define ALLOC_ORDER     2
#define TABLES_PER_PAGE 2
#define FRAG_MASK       3UL
#define SECOND_HALVES   2UL

void clear_table_pgstes(unsigned long *table)
{
        clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
        memset(table + 256, 0, PAGE_SIZE/2);
}

#endif

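/*
 * Allocate a 2^ALLOC_ORDER page block for a region/segment (crst) table
 * and link it onto mm->context.crst_list.  With noexec a shadow table is
 * allocated as well; its physical address is remembered in page->index so
 * it can be found again when the table is freed.
 */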
unsigned long *crst_table_alloc(struct mm_struct *mm, int noexec)
{
        struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

        if (!page)
                return NULL;
        page->index = 0;
        if (noexec) {
                struct page *shadow = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
                if (!shadow) {
                        __free_pages(page, ALLOC_ORDER);
                        return NULL;
                }
                page->index = page_to_phys(shadow);
        }
        spin_lock(&mm->page_table_lock);
        list_add(&page->lru, &mm->context.crst_list);
        spin_unlock(&mm->page_table_lock);
        return (unsigned long *) page_to_phys(page);
}

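/*
 * Unlink a crst table from mm->context.crst_list and release it, together
 * with its shadow table if one was allocated.
 */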
void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
        unsigned long *shadow = get_shadow_table(table);
        struct page *page = virt_to_page(table);

        spin_lock(&mm->page_table_lock);
        list_del(&page->lru);
        spin_unlock(&mm->page_table_lock);
        if (shadow)
                free_pages((unsigned long) shadow, ALLOC_ORDER);
        free_pages((unsigned long) table, ALLOC_ORDER);
}

#ifdef CONFIG_64BIT
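/*
 * Grow the address space by adding another region table level on top of
 * the current one: 2GB (segment table) -> 4TB (region-third table) ->
 * 8PB (region-second table).  The new table is allocated outside the
 * lock; under the lock the old top level table becomes its first entry.
 * The loop repeats until the requested limit fits, then the new asce is
 * loaded via update_mm().
 */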
int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
        unsigned long *table, *pgd;
        unsigned long entry;

        BUG_ON(limit > (1UL << 53));
repeat:
        table = crst_table_alloc(mm, mm->context.noexec);
        if (!table)
                return -ENOMEM;
        spin_lock(&mm->page_table_lock);
        if (mm->context.asce_limit < limit) {
                pgd = (unsigned long *) mm->pgd;
                if (mm->context.asce_limit <= (1UL << 31)) {
                        entry = _REGION3_ENTRY_EMPTY;
                        mm->context.asce_limit = 1UL << 42;
                        mm->context.asce_bits = _ASCE_TABLE_LENGTH |
                                                _ASCE_USER_BITS |
                                                _ASCE_TYPE_REGION3;
                } else {
                        entry = _REGION2_ENTRY_EMPTY;
                        mm->context.asce_limit = 1UL << 53;
                        mm->context.asce_bits = _ASCE_TABLE_LENGTH |
                                                _ASCE_USER_BITS |
                                                _ASCE_TYPE_REGION2;
                }
                crst_table_init(table, entry);
                pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
                mm->pgd = (pgd_t *) table;
                mm->task_size = mm->context.asce_limit;
                table = NULL;
        }
        spin_unlock(&mm->page_table_lock);
        if (table)
                crst_table_free(mm, table);
        if (mm->context.asce_limit < limit)
                goto repeat;
        update_mm(mm, current);
        return 0;
}

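/*
 * Shrink the address space again by dropping the topmost region table
 * levels until asce_limit is no larger than the requested limit.  The mm
 * is flushed first because the tables are freed immediately.
 */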
void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
        pgd_t *pgd;

        if (mm->context.asce_limit <= limit)
                return;
        __tlb_flush_mm(mm);
        while (mm->context.asce_limit > limit) {
                pgd = mm->pgd;
                switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
                case _REGION_ENTRY_TYPE_R2:
                        mm->context.asce_limit = 1UL << 42;
                        mm->context.asce_bits = _ASCE_TABLE_LENGTH |
                                                _ASCE_USER_BITS |
                                                _ASCE_TYPE_REGION3;
                        break;
                case _REGION_ENTRY_TYPE_R3:
                        mm->context.asce_limit = 1UL << 31;
                        mm->context.asce_bits = _ASCE_TABLE_LENGTH |
                                                _ASCE_USER_BITS |
                                                _ASCE_TYPE_SEGMENT;
                        break;
                default:
                        BUG();
                }
                mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
                mm->task_size = mm->context.asce_limit;
                crst_table_free(mm, (unsigned long *) pgd);
        }
        update_mm(mm, current);
}
#endif

/*
 * page table entry allocation/free routines.
 */
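/*
 * Hand out a 256-entry page table fragment from a partially used page on
 * mm->context.pgtable_list, or allocate and initialize a fresh page if no
 * free fragment is available.  For mms with pgstes or noexec shadows two
 * adjacent fragments are claimed at once (bits = 3).  Pages with no free
 * fragments left are moved to the tail of the list.
 */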
unsigned long *page_table_alloc(struct mm_struct *mm)
{
        struct page *page;
        unsigned long *table;
        unsigned long bits;

        bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
        spin_lock(&mm->page_table_lock);
        page = NULL;
        if (!list_empty(&mm->context.pgtable_list)) {
                page = list_first_entry(&mm->context.pgtable_list,
                                        struct page, lru);
                if ((page->flags & FRAG_MASK) == ((1UL << TABLES_PER_PAGE) - 1))
                        page = NULL;
        }
        if (!page) {
                spin_unlock(&mm->page_table_lock);
                page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
                if (!page)
                        return NULL;
                pgtable_page_ctor(page);
                page->flags &= ~FRAG_MASK;
                table = (unsigned long *) page_to_phys(page);
                if (mm->context.has_pgste)
                        clear_table_pgstes(table);
                else
                        clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
                spin_lock(&mm->page_table_lock);
                list_add(&page->lru, &mm->context.pgtable_list);
        }
        table = (unsigned long *) page_to_phys(page);
        while (page->flags & bits) {
                table += 256;
                bits <<= 1;
        }
        page->flags |= bits;
        if ((page->flags & FRAG_MASK) == ((1UL << TABLES_PER_PAGE) - 1))
                list_move_tail(&page->lru, &mm->context.pgtable_list);
        spin_unlock(&mm->page_table_lock);
        return table;
}

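/*
 * Return a page table fragment: clear its bits in page->flags and free
 * the whole page once none of its fragments is in use anymore.
 */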
void page_table_free(struct mm_struct *mm, unsigned long *table)
{
        struct page *page;
        unsigned long bits;

        bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
        bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
        spin_lock(&mm->page_table_lock);
        page->flags ^= bits;
        if (page->flags & FRAG_MASK) {
                /* Page now has some free pgtable fragments. */
                list_move(&page->lru, &mm->context.pgtable_list);
                page = NULL;
        } else
                /* All fragments of the 4K page have been freed. */
                list_del(&page->lru);
        spin_unlock(&mm->page_table_lock);
        if (page) {
                pgtable_page_dtor(page);
                __free_page(page);
        }
}

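/*
 * Give up the noexec shadow tables for this mm: free all shadow crst
 * tables, mark the second halves of the page table pages as free again,
 * and reload the address space parameters with context.noexec cleared.
 */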
void disable_noexec(struct mm_struct *mm, struct task_struct *tsk)
{
        struct page *page;

        spin_lock(&mm->page_table_lock);
        /* Free shadow region and segment tables. */
        list_for_each_entry(page, &mm->context.crst_list, lru)
                if (page->index) {
                        free_pages((unsigned long) page->index, ALLOC_ORDER);
                        page->index = 0;
                }
        /* "Free" second halves of page tables. */
        list_for_each_entry(page, &mm->context.pgtable_list, lru)
                page->flags &= ~SECOND_HALVES;
        spin_unlock(&mm->page_table_lock);
        mm->context.noexec = 0;
        update_mm(mm, tsk);
}

/*
 * Switch on pgstes for the current userspace process (for KVM).
 */
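/*
 * Existing page tables cannot be converted in place, so the mm is
 * duplicated with alloc_pgste set, which makes dup_mm() allocate all page
 * tables with pgstes.  The copy then replaces the old mm, provided the
 * task still owns its mm exclusively (no other users, no outstanding aio
 * contexts).
 */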
int s390_enable_sie(void)
{
        struct task_struct *tsk = current;
        struct mm_struct *mm, *old_mm;

        /* Do we have switched amode? If not, we cannot do sie */
        if (!switch_amode)
                return -EINVAL;

        /* Do we have pgstes? If yes, we are done */
        if (tsk->mm->context.has_pgste)
                return 0;

        /* Let's check if we are allowed to replace the mm */
        task_lock(tsk);
        if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
            tsk->mm != tsk->active_mm || !hlist_empty(&tsk->mm->ioctx_list)) {
                task_unlock(tsk);
                return -EINVAL;
        }
        task_unlock(tsk);

        /* We copy the mm and let dup_mm create the page tables with pgstes */
        tsk->mm->context.alloc_pgste = 1;
        mm = dup_mm(tsk);
        tsk->mm->context.alloc_pgste = 0;
        if (!mm)
                return -ENOMEM;

        /* Now let's check again whether something happened in the meantime */
        task_lock(tsk);
        if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
            tsk->mm != tsk->active_mm || !hlist_empty(&tsk->mm->ioctx_list)) {
                mmput(mm);
                task_unlock(tsk);
                return -EINVAL;
        }

        /* OK, we are alone. No ptrace, no threads, etc. */
        old_mm = tsk->mm;
        tsk->mm = tsk->active_mm = mm;
        preempt_disable();
        update_mm(mm, tsk);
        cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
        preempt_enable();
        task_unlock(tsk);
        mmput(old_mm);
        return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);