linux/arch/powerpc/mm/fault.c
<<
>>
Prefs
   1/*
   2 *  PowerPC version
   3 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
   4 *
   5 *  Derived from "arch/i386/mm/fault.c"
   6 *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   7 *
   8 *  Modified by Cort Dougan and Paul Mackerras.
   9 *
  10 *  Modified for PPC64 by Dave Engebretsen (engebret@ibm.com)
  11 *
  12 *  This program is free software; you can redistribute it and/or
  13 *  modify it under the terms of the GNU General Public License
  14 *  as published by the Free Software Foundation; either version
  15 *  2 of the License, or (at your option) any later version.
  16 */
  17
  18#include <linux/signal.h>
  19#include <linux/sched.h>
  20#include <linux/kernel.h>
  21#include <linux/errno.h>
  22#include <linux/string.h>
  23#include <linux/types.h>
  24#include <linux/ptrace.h>
  25#include <linux/mman.h>
  26#include <linux/mm.h>
  27#include <linux/interrupt.h>
  28#include <linux/highmem.h>
  29#include <linux/module.h>
  30#include <linux/kprobes.h>
  31#include <linux/kdebug.h>
  32
  33#include <asm/page.h>
  34#include <asm/pgtable.h>
  35#include <asm/mmu.h>
  36#include <asm/mmu_context.h>
  37#include <asm/system.h>
  38#include <asm/uaccess.h>
  39#include <asm/tlbflush.h>
  40#include <asm/siginfo.h>
  41
  42
  #ifdef CONFIG_KPROBES
  /*
   * Give a running kprobe the first chance to deal with a fault taken in
   * kernel mode.
   *
   * Returns 1 when a kprobe is running and kprobe_fault_handler() reports a
   * non-zero result, 0 otherwise.  Faults taken in user mode are never
   * offered to kprobes here and always yield 0.
   */
  static inline int notify_page_fault(struct pt_regs *regs)
  {
          int handled;

          if (user_mode(regs))
                  return 0;

          /*
           * kprobe_running() needs smp_processor_id(), so preemption is kept
           * disabled across the check and the handler call.
           */
          preempt_disable();
          /*
           * NOTE(review): the literal 11 is handed to kprobe_fault_handler()
           * unchanged; its meaning is defined by that handler - confirm there.
           */
          handled = kprobe_running() && kprobe_fault_handler(regs, 11);
          preempt_enable();

          return handled;
  }
  #else
  /* Built without CONFIG_KPROBES: no fault is ever claimed, always returns 0. */
  static inline int notify_page_fault(struct pt_regs *regs)
  {
          return 0;
  }
  #endif
  64
  65/*
  66 * Check whether the instruction at regs->nip is a store using
  67 * an update addressing form which will update r1.
  68 */
  69static int store_updates_sp(struct pt_regs *regs)
  70{
  71        unsigned int inst;
  72
  73        if (get_user(inst, (unsigned int __user *)regs->nip))
  74                return 0;
  75        /* check for 1 in the rA field */
  76        if (((inst >> 16) & 0x1f) != 1)
  77                return 0;
  78        /* check major opcode */
  79        switch (inst >> 26) {
  80        case 37:        /* stwu */
  81        case 39:        /* stbu */
  82        case 45:        /* sthu */
  83        case 53:        /* stfsu */
  84        case 55:        /* stfdu */
  85                return 1;
  86        case 62:        /* std or stdu */
  87                return (inst & 3) == 1;
  88        case 31:
  89                /* check minor opcode */
  90                switch ((inst >> 1) & 0x3ff) {
  91                case 181:       /* stdux */
  92                case 183:       /* stwux */
  93                case 247:       /* stbux */
  94                case 439:       /* sthux */
  95                case 695:       /* stfsux */
  96                case 759:       /* stfdux */
  97                        return 1;
  98                }
  99        }
 100        return 0;
 101}
 102
 103/*
 104 * For 600- and 800-family processors, the error_code parameter is DSISR
 105 * for a data fault, SRR1 for an instruction fault. For 400-family processors
 106 * the error_code parameter is ESR for a data fault, 0 for an instruction
 107 * fault.
 108 * For 64-bit processors, the error_code parameter is
 109 *  - DSISR for a non-SLB data access fault,
 110 *  - SRR1 & 0x08000000 for a non-SLB instruction access fault
 111 *  - 0 any SLB fault.
 112 *
 113 * The return value is 0 if the fault was handled, or the signal
 114 * number if this is a kernel fault that can't be handled here.
     *
     * Parameters:
     *   regs       - register state saved for the fault; supplies the trap
     *                number, the user/kernel mode test and the faulting nip.
     *   address    - faulting address, looked up in the VMAs of current->mm.
     *   error_code - fault status bits as described above.
     *
     * Locking: mm->mmap_sem is taken for read before the VMA lookup and is
     * dropped again on every path that was reached with it held.  The
     * bad_area_nosemaphore label is the entry point for the one path that
     * fails before the semaphore has been acquired.
     *
     * Signals: for a user-mode fault SIGSEGV is raised through _exception()
     * and SIGBUS through force_sig_info(), and 0 is returned.  On an
     * out-of-memory result the global init task retries the fault; any other
     * user-mode task is passed to do_group_exit(SIGKILL).
 115 */
 116int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 117                            unsigned long error_code)
 118{
 119        struct vm_area_struct * vma;
 120        struct mm_struct *mm = current->mm;
 121        siginfo_t info;
 122        int code = SEGV_MAPERR;
 123        int is_write = 0, ret;
 124        int trap = TRAP(regs);
            /* trap 0x400 is treated as an instruction fault, anything else as a data access */
 125        int is_exec = trap == 0x400;
 126
 127#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
 128        /*
 129         * Fortunately the bit assignments in SRR1 for an instruction
 130         * fault and DSISR for a data fault are mostly the same for the
 131         * bits we are interested in.  But there are some bits which
 132         * indicate errors in DSISR but can validly be set in SRR1.
 133         */
 134        if (trap == 0x400)
 135                error_code &= 0x48200000;
 136        else
 137                is_write = error_code & DSISR_ISSTORE;
 138#else
 139        is_write = error_code & ESR_DST;
 140#endif /* CONFIG_4xx || CONFIG_BOOKE */
 141
 142        if (notify_page_fault(regs))
 143                return 0;
 144
 145        if (unlikely(debugger_fault_handler(regs)))
 146                return 0;
 147
 148        /* On a kernel SLB miss we can only check for a valid exception entry */
 149        if (!user_mode(regs) && (address >= TASK_SIZE))
 150                return SIGSEGV;
 151
 152#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
 153        if (error_code & DSISR_DABRMATCH) {
 154                /* DABR match */
 155                do_dabr(regs, address, error_code);
 156                return 0;
 157        }
 158#endif /* !(CONFIG_4xx || CONFIG_BOOKE)*/
 159
 160        if (in_atomic() || mm == NULL) {
 161                if (!user_mode(regs))
 162                        return SIGSEGV;
 163                /* in_atomic() in user mode is really bad,
 164                   as is current->mm == NULL. */
 165                printk(KERN_EMERG "Page fault in user mode with "
 166                       "in_atomic() = %d mm = %p\n", in_atomic(), mm);
 167                printk(KERN_EMERG "NIP = %lx  MSR = %lx\n",
 168                       regs->nip, regs->msr);
 169                die("Weird page fault", regs, SIGSEGV);
                    /*
                     * NOTE(review): if die() were ever to return, execution
                     * would continue into the mmap_sem code below with mm
                     * possibly NULL - presumably die() does not return on
                     * this path; confirm.
                     */
 170        }
 171
 172        /* When running in the kernel we expect faults to occur only to
 173         * addresses in user space.  All other faults represent errors in the
 174         * kernel and should generate an OOPS.  Unfortunately, in the case of an
 175         * erroneous fault occurring in a code path which already holds mmap_sem
 176         * we will deadlock attempting to validate the fault against the
 177         * address space.  Luckily the kernel only validly references user
 178         * space from well defined areas of code, which are listed in the
 179         * exceptions table.
 180         *
 181         * As the vast majority of faults will be valid we will only perform
 182         * the source reference check when there is a possibility of a deadlock.
 183         * Attempt to lock the address space, if we cannot we then validate the
 184         * source.  If this is invalid we can skip the address space check,
 185         * thus avoiding the deadlock.
 186         */
 187        if (!down_read_trylock(&mm->mmap_sem)) {
 188                if (!user_mode(regs) && !search_exception_tables(regs->nip))
 189                        goto bad_area_nosemaphore;
 190
 191                down_read(&mm->mmap_sem);
 192        }
 193
 194        vma = find_vma(mm, address);
 195        if (!vma)
 196                goto bad_area;
            /*
             * The VMA found may begin above address (hence the vm_start
             * test).  In that case only a VM_GROWSDOWN VMA is acceptable,
             * and the fault is treated as a request to expand the stack.
             */
 197        if (vma->vm_start <= address)
 198                goto good_area;
 199        if (!(vma->vm_flags & VM_GROWSDOWN))
 200                goto bad_area;
 201
 202        /*
 203         * N.B. The POWER/Open ABI allows programs to access up to
 204         * 288 bytes below the stack pointer.
 205         * The kernel signal delivery code writes up to about 1.5kB
 206         * below the stack pointer (r1) before decrementing it.
 207         * The exec code can write slightly over 640kB to the stack
 208         * before setting the user r1.  Thus we allow the stack to
 209         * expand to 1MB without further checks.
 210         */
 211        if (address + 0x100000 < vma->vm_end) {
 212                /* get user regs even if this fault is in kernel mode */
 213                struct pt_regs *uregs = current->thread.regs;
 214                if (uregs == NULL)
 215                        goto bad_area;
 216
 217                /*
 218                 * A user-mode access to an address a long way below
 219                 * the stack pointer is only valid if the instruction
 220                 * is one which would update the stack pointer to the
 221                 * address accessed if the instruction completed,
 222                 * i.e. either stwu rs,n(r1) or stwux rs,r1,rb
 223                 * (or the byte, halfword, float or double forms).
 224                 *
 225                 * If we don't check this then any write to the area
 226                 * between the last mapped region and the stack will
 227                 * expand the stack rather than segfaulting.
 228                 */
 229                if (address + 2048 < uregs->gpr[1]
 230                    && (!user_mode(regs) || !store_updates_sp(regs)))
 231                        goto bad_area;
 232        }
 233        if (expand_stack(vma, address))
 234                goto bad_area;
 235
 236good_area:
            /*
             * The address is now covered by a VMA (directly or after stack
             * expansion), so failures from here on are reported with
             * SEGV_ACCERR instead of the initial SEGV_MAPERR.
             */
 237        code = SEGV_ACCERR;
 238#if defined(CONFIG_6xx)
 239        if (error_code & 0x95700000)
 240                /* an error such as lwarx to I/O controller space,
 241                   address matching DABR, eciwx, etc. */
 242                goto bad_area;
 243#endif /* CONFIG_6xx */
 244#if defined(CONFIG_8xx)
 245        /* The MPC8xx seems to always set 0x80000000, which is
 246         * "undefined".  Of those that can be set, this is the only
 247         * one which seems bad.
 248         */
 249        if (error_code & 0x10000000)
 250                /* Guarded storage error. */
 251                goto bad_area;
 252#endif /* CONFIG_8xx */
 253
 254        if (is_exec) {
 255#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
 256                /* protection fault */
 257                if (error_code & DSISR_PROTFAULT)
 258                        goto bad_area;
 259                /*
 260                 * Allow execution from readable areas if the MMU does not
 261                 * provide separate controls over reading and executing.
 262                 */
 263                if (!(vma->vm_flags & VM_EXEC) &&
 264                    (cpu_has_feature(CPU_FTR_NOEXECUTE) ||
 265                     !(vma->vm_flags & (VM_READ | VM_WRITE))))
 266                        goto bad_area;
 267#else
 268                pte_t *ptep;
 269                pmd_t *pmdp;
 270
 271                /* Since 4xx/Book-E supports per-page execute permission,
 272                 * we lazily flush dcache to icache. */
 273                ptep = NULL;
 274                if (get_pteptr(mm, address, &ptep, &pmdp)) {
 275                        spinlock_t *ptl = pte_lockptr(mm, pmdp);
 276                        spin_lock(ptl);
 277                        if (pte_present(*ptep)) {
 278                                struct page *page = pte_page(*ptep);
 279
 280                                if (!test_bit(PG_arch_1, &page->flags)) {
 281                                        flush_dcache_icache_page(page);
 282                                        set_bit(PG_arch_1, &page->flags);
 283                                }
 284                                pte_update(ptep, 0, _PAGE_HWEXEC |
 285                                           _PAGE_ACCESSED);
 286                                _tlbie(address, mm->context.id);
 287                                pte_unmap_unlock(ptep, ptl);
 288                                up_read(&mm->mmap_sem);
 289                                return 0;
 290                        }
 291                        pte_unmap_unlock(ptep, ptl);
 292                }
 293#endif
 294        /* a write */
 295        } else if (is_write) {
 296                if (!(vma->vm_flags & VM_WRITE))
 297                        goto bad_area;
 298        /* a read */
 299        } else {
 300                /* protection fault */
 301                if (error_code & 0x08000000)
 302                        goto bad_area;
 303                if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
 304                        goto bad_area;
 305        }
 306
 307        /*
 308         * If for any reason at all we couldn't handle the fault,
 309         * make sure we exit gracefully rather than endlessly redo
 310         * the fault.
 311         */
 312 survive:
 313        ret = handle_mm_fault(mm, vma, address, is_write);
 314        if (unlikely(ret & VM_FAULT_ERROR)) {
 315                if (ret & VM_FAULT_OOM)
 316                        goto out_of_memory;
 317                else if (ret & VM_FAULT_SIGBUS)
 318                        goto do_sigbus;
                    /* an error bit other than OOM or SIGBUS is not expected here */
 319                BUG();
 320        }
 321        if (ret & VM_FAULT_MAJOR)
 322                current->maj_flt++;
 323        else
 324                current->min_flt++;
 325        up_read(&mm->mmap_sem);
 326        return 0;
 327
 328bad_area:
 329        up_read(&mm->mmap_sem);
 330
 331bad_area_nosemaphore:
 332        /* User mode accesses cause a SIGSEGV */
 333        if (user_mode(regs)) {
 334                _exception(SIGSEGV, regs, code, address);
 335                return 0;
 336        }
 337
 338        if (is_exec && (error_code & DSISR_PROTFAULT)
 339            && printk_ratelimit())
 340                printk(KERN_CRIT "kernel tried to execute NX-protected"
 341                       " page (%lx) - exploit attempt? (uid: %d)\n",
 342                       address, current->uid);
 343
 344        return SIGSEGV;
 345
 346/*
 347 * We ran out of memory, or some other thing happened to us that made
 348 * us unable to handle the page fault gracefully.
 349 */
 350out_of_memory:
 351        up_read(&mm->mmap_sem);
 352        if (is_global_init(current)) {
                    /*
                     * The global init task is not killed: give up the CPU,
                     * retake mmap_sem and retry the fault from survive.
                     */
 353                yield();
 354                down_read(&mm->mmap_sem);
 355                goto survive;
 356        }
 357        printk("VM: killing process %s\n", current->comm);
 358        if (user_mode(regs))
 359                do_group_exit(SIGKILL);
 360        return SIGKILL;
 361
 362do_sigbus:
 363        up_read(&mm->mmap_sem);
 364        if (user_mode(regs)) {
 365                info.si_signo = SIGBUS;
 366                info.si_errno = 0;
 367                info.si_code = BUS_ADRERR;
 368                info.si_addr = (void __user *)address;
 369                force_sig_info(SIGBUS, &info, current);
 370                return 0;
 371        }
 372        return SIGBUS;
 373}
 374
 375/*
 376 * bad_page_fault is called when we have a bad access from the kernel.
 377 * It is called from the DSI and ISI handlers in head.S and from some
 378 * of the procedures in traps.c.
 379 */
 380void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
 381{
 382        const struct exception_table_entry *entry;
 383
 384        /* Are we prepared to handle this fault?  */
 385        if ((entry = search_exception_tables(regs->nip)) != NULL) {
 386                regs->nip = entry->fixup;
 387                return;
 388        }
 389
 390        /* kernel has accessed a bad area */
 391
 392        switch (regs->trap) {
 393        case 0x300:
 394        case 0x380:
 395                printk(KERN_ALERT "Unable to handle kernel paging request for "
 396                        "data at address 0x%08lx\n", regs->dar);
 397                break;
 398        case 0x400:
 399        case 0x480:
 400                printk(KERN_ALERT "Unable to handle kernel paging request for "
 401                        "instruction fetch\n");
 402                break;
 403        default:
 404                printk(KERN_ALERT "Unable to handle kernel paging request for "
 405                        "unknown fault\n");
 406                break;
 407        }
 408        printk(KERN_ALERT "Faulting instruction address: 0x%08lx\n",
 409                regs->nip);
 410
 411        die("Kernel access of bad area", regs, sig);
 412}
 413
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.