linux/arch/x86/mm/fault.c
<<
>>
Prefs
   1/*
   2 *  Copyright (C) 1995  Linus Torvalds
   3 *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
   4 */
   5
   6#include <linux/signal.h>
   7#include <linux/sched.h>
   8#include <linux/kernel.h>
   9#include <linux/errno.h>
  10#include <linux/string.h>
  11#include <linux/types.h>
  12#include <linux/ptrace.h>
  13#include <linux/mmiotrace.h>
  14#include <linux/mman.h>
  15#include <linux/mm.h>
  16#include <linux/smp.h>
  17#include <linux/interrupt.h>
  18#include <linux/init.h>
  19#include <linux/tty.h>
  20#include <linux/vt_kern.h>              /* For unblank_screen() */
  21#include <linux/compiler.h>
  22#include <linux/highmem.h>
  23#include <linux/bootmem.h>              /* for max_low_pfn */
  24#include <linux/vmalloc.h>
  25#include <linux/module.h>
  26#include <linux/kprobes.h>
  27#include <linux/uaccess.h>
  28#include <linux/kdebug.h>
  29
  30#include <asm/system.h>
  31#include <asm/desc.h>
  32#include <asm/segment.h>
  33#include <asm/pgalloc.h>
  34#include <asm/smp.h>
  35#include <asm/tlbflush.h>
  36#include <asm/proto.h>
  37#include <asm-generic/sections.h>
  38
  39/*
  40 * Page fault error code bits
  41 *      bit 0 == 0 means no page found, 1 means protection fault
  42 *      bit 1 == 0 means read, 1 means write
  43 *      bit 2 == 0 means kernel, 1 means user-mode
  44 *      bit 3 == 1 means use of reserved bit detected
  45 *      bit 4 == 1 means fault was an instruction fetch
  46 */
  47#define PF_PROT         (1<<0)
  48#define PF_WRITE        (1<<1)
  49#define PF_USER         (1<<2)
  50#define PF_RSVD         (1<<3)
  51#define PF_INSTR        (1<<4)
  52
  53static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
  54{
  55#ifdef CONFIG_MMIOTRACE_HOOKS
  56        if (unlikely(is_kmmio_active()))
  57                if (kmmio_handler(regs, addr) == 1)
  58                        return -1;
  59#endif
  60        return 0;
  61}
  62
  63static inline int notify_page_fault(struct pt_regs *regs)
  64{
  65#ifdef CONFIG_KPROBES
  66        int ret = 0;
  67
  68        /* kprobe_running() needs smp_processor_id() */
  69        if (!user_mode_vm(regs)) {
  70                preempt_disable();
  71                if (kprobe_running() && kprobe_fault_handler(regs, 14))
  72                        ret = 1;
  73                preempt_enable();
  74        }
  75
  76        return ret;
  77#else
  78        return 0;
  79#endif
  80}
  81
  82/*
  83 * X86_32
  84 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
  85 * Check that here and ignore it.
  86 *
  87 * X86_64
  88 * Sometimes the CPU reports invalid exceptions on prefetch.
  89 * Check that here and ignore it.
  90 *
  91 * Opcode checker based on code by Richard Brunner
  92 */
  93static int is_prefetch(struct pt_regs *regs, unsigned long addr,
  94                       unsigned long error_code)
  95{
  96        unsigned char *instr;
  97        int scan_more = 1;
  98        int prefetch = 0;
  99        unsigned char *max_instr;
 100
 101        /*
 102         * If it was a exec (instruction fetch) fault on NX page, then
 103         * do not ignore the fault:
 104         */
 105        if (error_code & PF_INSTR)
 106                return 0;
 107
 108        instr = (unsigned char *)convert_ip_to_linear(current, regs);
 109        max_instr = instr + 15;
 110
 111        if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
 112                return 0;
 113
 114        while (scan_more && instr < max_instr) {
 115                unsigned char opcode;
 116                unsigned char instr_hi;
 117                unsigned char instr_lo;
 118
 119                if (probe_kernel_address(instr, opcode))
 120                        break;
 121
 122                instr_hi = opcode & 0xf0;
 123                instr_lo = opcode & 0x0f;
 124                instr++;
 125
 126                switch (instr_hi) {
 127                case 0x20:
 128                case 0x30:
 129                        /*
 130                         * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
 131                         * In X86_64 long mode, the CPU will signal invalid
 132                         * opcode if some of these prefixes are present so
 133                         * X86_64 will never get here anyway
 134                         */
 135                        scan_more = ((instr_lo & 7) == 0x6);
 136                        break;
 137#ifdef CONFIG_X86_64
 138                case 0x40:
 139                        /*
 140                         * In AMD64 long mode 0x40..0x4F are valid REX prefixes
 141                         * Need to figure out under what instruction mode the
 142                         * instruction was issued. Could check the LDT for lm,
 143                         * but for now it's good enough to assume that long
 144                         * mode only uses well known segments or kernel.
 145                         */
 146                        scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
 147                        break;
 148#endif
 149                case 0x60:
 150                        /* 0x64 thru 0x67 are valid prefixes in all modes. */
 151                        scan_more = (instr_lo & 0xC) == 0x4;
 152                        break;
 153                case 0xF0:
 154                        /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
 155                        scan_more = !instr_lo || (instr_lo>>1) == 1;
 156                        break;
 157                case 0x00:
 158                        /* Prefetch instruction is 0x0F0D or 0x0F18 */
 159                        scan_more = 0;
 160
 161                        if (probe_kernel_address(instr, opcode))
 162                                break;
 163                        prefetch = (instr_lo == 0xF) &&
 164                                (opcode == 0x0D || opcode == 0x18);
 165                        break;
 166                default:
 167                        scan_more = 0;
 168                        break;
 169                }
 170        }
 171        return prefetch;
 172}
 173
 174static void force_sig_info_fault(int si_signo, int si_code,
 175        unsigned long address, struct task_struct *tsk)
 176{
 177        siginfo_t info;
 178
 179        info.si_signo = si_signo;
 180        info.si_errno = 0;
 181        info.si_code = si_code;
 182        info.si_addr = (void __user *)address;
 183        force_sig_info(si_signo, &info, tsk);
 184}
 185
 186#ifdef CONFIG_X86_64
 187static int bad_address(void *p)
 188{
 189        unsigned long dummy;
 190        return probe_kernel_address((unsigned long *)p, dummy);
 191}
 192#endif
 193
 194static void dump_pagetable(unsigned long address)
 195{
 196#ifdef CONFIG_X86_32
 197        __typeof__(pte_val(__pte(0))) page;
 198
 199        page = read_cr3();
 200        page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
 201#ifdef CONFIG_X86_PAE
 202        printk("*pdpt = %016Lx ", page);
 203        if ((page >> PAGE_SHIFT) < max_low_pfn
 204            && page & _PAGE_PRESENT) {
 205                page &= PAGE_MASK;
 206                page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
 207                                                         & (PTRS_PER_PMD - 1)];
 208                printk(KERN_CONT "*pde = %016Lx ", page);
 209                page &= ~_PAGE_NX;
 210        }
 211#else
 212        printk("*pde = %08lx ", page);
 213#endif
 214
 215        /*
 216         * We must not directly access the pte in the highpte
 217         * case if the page table is located in highmem.
 218         * And let's rather not kmap-atomic the pte, just in case
 219         * it's allocated already.
 220         */
 221        if ((page >> PAGE_SHIFT) < max_low_pfn
 222            && (page & _PAGE_PRESENT)
 223            && !(page & _PAGE_PSE)) {
 224                page &= PAGE_MASK;
 225                page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
 226                                                         & (PTRS_PER_PTE - 1)];
 227                printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
 228        }
 229
 230        printk("\n");
 231#else /* CONFIG_X86_64 */
 232        pgd_t *pgd;
 233        pud_t *pud;
 234        pmd_t *pmd;
 235        pte_t *pte;
 236
 237        pgd = (pgd_t *)read_cr3();
 238
 239        pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
 240        pgd += pgd_index(address);
 241        if (bad_address(pgd)) goto bad;
 242        printk("PGD %lx ", pgd_val(*pgd));
 243        if (!pgd_present(*pgd)) goto ret;
 244
 245        pud = pud_offset(pgd, address);
 246        if (bad_address(pud)) goto bad;
 247        printk("PUD %lx ", pud_val(*pud));
 248        if (!pud_present(*pud) || pud_large(*pud))
 249                goto ret;
 250
 251        pmd = pmd_offset(pud, address);
 252        if (bad_address(pmd)) goto bad;
 253        printk("PMD %lx ", pmd_val(*pmd));
 254        if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
 255
 256        pte = pte_offset_kernel(pmd, address);
 257        if (bad_address(pte)) goto bad;
 258        printk("PTE %lx", pte_val(*pte));
 259ret:
 260        printk("\n");
 261        return;
 262bad:
 263        printk("BAD\n");
 264#endif
 265}
 266
 267#ifdef CONFIG_X86_32
 268static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
 269{
 270        unsigned index = pgd_index(address);
 271        pgd_t *pgd_k;
 272        pud_t *pud, *pud_k;
 273        pmd_t *pmd, *pmd_k;
 274
 275        pgd += index;
 276        pgd_k = init_mm.pgd + index;
 277
 278        if (!pgd_present(*pgd_k))
 279                return NULL;
 280
 281        /*
 282         * set_pgd(pgd, *pgd_k); here would be useless on PAE
 283         * and redundant with the set_pmd() on non-PAE. As would
 284         * set_pud.
 285         */
 286
 287        pud = pud_offset(pgd, address);
 288        pud_k = pud_offset(pgd_k, address);
 289        if (!pud_present(*pud_k))
 290                return NULL;
 291
 292        pmd = pmd_offset(pud, address);
 293        pmd_k = pmd_offset(pud_k, address);
 294        if (!pmd_present(*pmd_k))
 295                return NULL;
 296        if (!pmd_present(*pmd)) {
 297                set_pmd(pmd, *pmd_k);
 298                arch_flush_lazy_mmu_mode();
 299        } else
 300                BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
 301        return pmd_k;
 302}
 303#endif
 304
 305#ifdef CONFIG_X86_64
  306static const char errata93_warning[] = /* printed at most once, by is_errata93() */
  307KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
  308KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
  309KERN_ERR "******* Please consider a BIOS update.\n"
  310KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
 311#endif
 312
 313/* Workaround for K8 erratum #93 & buggy BIOS.
 314   BIOS SMM functions are required to use a specific workaround
 315   to avoid corruption of the 64bit RIP register on C stepping K8.
 316   A lot of BIOS that didn't get tested properly miss this.
 317   The OS sees this as a page fault with the upper 32bits of RIP cleared.
 318   Try to work around it here.
 319   Note we only handle faults in kernel here.
 320   Does nothing for X86_32
 321 */
 322static int is_errata93(struct pt_regs *regs, unsigned long address)
 323{
 324#ifdef CONFIG_X86_64
 325        static int warned;
 326        if (address != regs->ip)
 327                return 0;
 328        if ((address >> 32) != 0)
 329                return 0;
 330        address |= 0xffffffffUL << 32;
 331        if ((address >= (u64)_stext && address <= (u64)_etext) ||
 332            (address >= MODULES_VADDR && address <= MODULES_END)) {
 333                if (!warned) {
 334                        printk(errata93_warning);
 335                        warned = 1;
 336                }
 337                regs->ip = address;
 338                return 1;
 339        }
 340#endif
 341        return 0;
 342}
 343
 344/*
 345 * Work around K8 erratum #100 K8 in compat mode occasionally jumps to illegal
 346 * addresses >4GB.  We catch this in the page fault handler because these
 347 * addresses are not reachable. Just detect this case and return.  Any code
 348 * segment in LDT is compatibility mode.
 349 */
 350static int is_errata100(struct pt_regs *regs, unsigned long address)
 351{
 352#ifdef CONFIG_X86_64
 353        if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
 354            (address >> 32))
 355                return 1;
 356#endif
 357        return 0;
 358}
 359
 360void do_invalid_op(struct pt_regs *, unsigned long);
 361
 362static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
 363{
 364#ifdef CONFIG_X86_F00F_BUG
 365        unsigned long nr;
 366        /*
 367         * Pentium F0 0F C7 C8 bug workaround.
 368         */
 369        if (boot_cpu_data.f00f_bug) {
 370                nr = (address - idt_descr.address) >> 3;
 371
 372                if (nr == 6) {
 373                        do_invalid_op(regs, 0);
 374                        return 1;
 375                }
 376        }
 377#endif
 378        return 0;
 379}
 380
 381static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
 382                            unsigned long address)
 383{
 384#ifdef CONFIG_X86_32
 385        if (!oops_may_print())
 386                return;
 387#endif
 388
 389#ifdef CONFIG_X86_PAE
 390        if (error_code & PF_INSTR) {
 391                unsigned int level;
 392                pte_t *pte = lookup_address(address, &level);
 393
 394                if (pte && pte_present(*pte) && !pte_exec(*pte))
 395                        printk(KERN_CRIT "kernel tried to execute "
 396                                "NX-protected page - exploit attempt? "
 397                                "(uid: %d)\n", current->uid);
 398        }
 399#endif
 400
 401        printk(KERN_ALERT "BUG: unable to handle kernel ");
 402        if (address < PAGE_SIZE)
 403                printk(KERN_CONT "NULL pointer dereference");
 404        else
 405                printk(KERN_CONT "paging request");
 406        printk(KERN_CONT " at %p\n", (void *) address);
 407        printk(KERN_ALERT "IP:");
 408        printk_address(regs->ip, 1);
 409        dump_pagetable(address);
 410}
 411
 412#ifdef CONFIG_X86_64
 413static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
 414                                 unsigned long error_code)
 415{
 416        unsigned long flags = oops_begin();
 417        struct task_struct *tsk;
 418
 419        printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
 420               current->comm, address);
 421        dump_pagetable(address);
 422        tsk = current;
 423        tsk->thread.cr2 = address;
 424        tsk->thread.trap_no = 14;
 425        tsk->thread.error_code = error_code;
 426        if (__die("Bad pagetable", regs, error_code))
 427                regs = NULL;
 428        oops_end(flags, regs, SIGKILL);
 429}
 430#endif
 431
 432static int spurious_fault_check(unsigned long error_code, pte_t *pte)
 433{
 434        if ((error_code & PF_WRITE) && !pte_write(*pte))
 435                return 0;
 436        if ((error_code & PF_INSTR) && !pte_exec(*pte))
 437                return 0;
 438
 439        return 1;
 440}
 441
 442/*
 443 * Handle a spurious fault caused by a stale TLB entry.  This allows
 444 * us to lazily refresh the TLB when increasing the permissions of a
 445 * kernel page (RO -> RW or NX -> X).  Doing it eagerly is very
 446 * expensive since that implies doing a full cross-processor TLB
 447 * flush, even if no stale TLB entries exist on other processors.
 448 * There are no security implications to leaving a stale TLB when
 449 * increasing the permissions on a page.
 450 */
 451static int spurious_fault(unsigned long address,
 452                          unsigned long error_code)
 453{
 454        pgd_t *pgd;
 455        pud_t *pud;
 456        pmd_t *pmd;
 457        pte_t *pte;
 458
 459        /* Reserved-bit violation or user access to kernel space? */
 460        if (error_code & (PF_USER | PF_RSVD))
 461                return 0;
 462
 463        pgd = init_mm.pgd + pgd_index(address);
 464        if (!pgd_present(*pgd))
 465                return 0;
 466
 467        pud = pud_offset(pgd, address);
 468        if (!pud_present(*pud))
 469                return 0;
 470
 471        if (pud_large(*pud))
 472                return spurious_fault_check(error_code, (pte_t *) pud);
 473
 474        pmd = pmd_offset(pud, address);
 475        if (!pmd_present(*pmd))
 476                return 0;
 477
 478        if (pmd_large(*pmd))
 479                return spurious_fault_check(error_code, (pte_t *) pmd);
 480
 481        pte = pte_offset_kernel(pmd, address);
 482        if (!pte_present(*pte))
 483                return 0;
 484
 485        return spurious_fault_check(error_code, pte);
 486}
 487
 488/*
 489 * X86_32
 490 * Handle a fault on the vmalloc or module mapping area
 491 *
 492 * X86_64
 493 * Handle a fault on the vmalloc area
 494 *
 495 * This assumes no large pages in there.
 496 */
 497static int vmalloc_fault(unsigned long address)
 498{
 499#ifdef CONFIG_X86_32
 500        unsigned long pgd_paddr;
 501        pmd_t *pmd_k;
 502        pte_t *pte_k;
 503
 504        /* Make sure we are in vmalloc area */
 505        if (!(address >= VMALLOC_START && address < VMALLOC_END))
 506                return -1;
 507
 508        /*
 509         * Synchronize this task's top level page-table
 510         * with the 'reference' page table.
 511         *
 512         * Do _not_ use "current" here. We might be inside
 513         * an interrupt in the middle of a task switch..
 514         */
 515        pgd_paddr = read_cr3();
 516        pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
 517        if (!pmd_k)
 518                return -1;
 519        pte_k = pte_offset_kernel(pmd_k, address);
 520        if (!pte_present(*pte_k))
 521                return -1;
 522        return 0;
 523#else
 524        pgd_t *pgd, *pgd_ref;
 525        pud_t *pud, *pud_ref;
 526        pmd_t *pmd, *pmd_ref;
 527        pte_t *pte, *pte_ref;
 528
 529        /* Make sure we are in vmalloc area */
 530        if (!(address >= VMALLOC_START && address < VMALLOC_END))
 531                return -1;
 532
 533        /* Copy kernel mappings over when needed. This can also
 534           happen within a race in page table update. In the later
 535           case just flush. */
 536
 537        pgd = pgd_offset(current->active_mm, address);
 538        pgd_ref = pgd_offset_k(address);
 539        if (pgd_none(*pgd_ref))
 540                return -1;
 541        if (pgd_none(*pgd))
 542                set_pgd(pgd, *pgd_ref);
 543        else
 544                BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
 545
 546        /* Below here mismatches are bugs because these lower tables
 547           are shared */
 548
 549        pud = pud_offset(pgd, address);
 550        pud_ref = pud_offset(pgd_ref, address);
 551        if (pud_none(*pud_ref))
 552                return -1;
 553        if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
 554                BUG();
 555        pmd = pmd_offset(pud, address);
 556        pmd_ref = pmd_offset(pud_ref, address);
 557        if (pmd_none(*pmd_ref))
 558                return -1;
 559        if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
 560                BUG();
 561        pte_ref = pte_offset_kernel(pmd_ref, address);
 562        if (!pte_present(*pte_ref))
 563                return -1;
 564        pte = pte_offset_kernel(pmd, address);
 565        /* Don't use pte_page here, because the mappings can point
 566           outside mem_map, and the NUMA hash lookup cannot handle
 567           that. */
 568        if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
 569                BUG();
 570        return 0;
 571#endif
 572}
 573
  574int show_unhandled_signals = 1; /* non-zero: do_page_fault() may log unhandled user SIGSEGVs */
 575
  576/*
  577 * This routine handles page faults.  It determines the address,
  578 * and the problem, and then passes it off to one of the appropriate
  579 * routines.
      *
      * regs:       saved register state of the faulting context.
      * error_code: page fault error code, decoded with the PF_* bits
      *             defined at the top of this file.
      *
      * The faulting address is read from CR2.  Paths visible in this
      * function: the fault is resolved (vmalloc sync, spurious fault,
      * handle_mm_fault()), a SIGSEGV or SIGBUS is forced on the task,
      * an exception-table fixup is applied, or the kernel oopses.
  580 */
  581#ifdef CONFIG_X86_64
  582asmlinkage
  583#endif
  584void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
  585{
  586        struct task_struct *tsk;
  587        struct mm_struct *mm;
  588        struct vm_area_struct *vma;
  589        unsigned long address;
  590        int write, si_code;
  591        int fault;
  592#ifdef CONFIG_X86_64
  593        unsigned long flags;
  594#endif
  595
  596        /*
  597         * We can fault from pretty much anywhere, with unknown IRQ state.
  598         */
  599        trace_hardirqs_fixup();
  600
  601        tsk = current;
  602        mm = tsk->mm;
  603        prefetchw(&mm->mmap_sem);
  604
  605        /* get the address */
  606        address = read_cr2();
  607
             /* Default si_code; switched to SEGV_ACCERR once a vma is found. */
  608        si_code = SEGV_MAPERR;
  609
             /* mmiotrace gets the first chance to claim the fault. */
  610        if (unlikely(kmmio_fault(regs, address)))
  611                return;
  612
  613        /*
  614         * We fault-in kernel-space virtual memory on-demand. The
  615         * 'reference' page table is init_mm.pgd.
  616         *
  617         * NOTE! We MUST NOT take any locks for this case. We may
  618         * be in an interrupt or a critical region, and should
  619         * only copy the information from the master page table,
  620         * nothing more.
  621         *
  622         * This verifies that the fault happens in kernel space
  623         * (error_code & 4) == 0, and that the fault was not a
  624         * protection error (error_code & 9) == 0.
  625         */
  626#ifdef CONFIG_X86_32
  627        if (unlikely(address >= TASK_SIZE)) {
  628#else
  629        if (unlikely(address >= TASK_SIZE64)) {
  630#endif
  631                if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
  632                    vmalloc_fault(address) >= 0)
  633                        return;
  634
  635                /* Can handle a stale RO->RW TLB */
  636                if (spurious_fault(address, error_code))
  637                        return;
  638
  639                /* kprobes don't want to hook the spurious faults. */
  640                if (notify_page_fault(regs))
  641                        return;
  642                /*
  643                 * Don't take the mm semaphore here. If we fixup a prefetch
  644                 * fault we could otherwise deadlock.
  645                 */
  646                goto bad_area_nosemaphore;
  647        }
  648
  649        /* kprobes don't want to hook the spurious faults. */
  650        if (notify_page_fault(regs))
  651                return;
  652
  653#ifdef CONFIG_X86_32
  654        /* It's safe to allow irq's after cr2 has been saved and the vmalloc
  655           fault has been handled. */
  656        if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK))
  657                local_irq_enable();
  658
  659        /*
  660         * If we're in an interrupt, have no user context or are running in an
  661         * atomic region then we must not take the fault.
  662         */
  663        if (in_atomic() || !mm)
  664                goto bad_area_nosemaphore;
  665#else /* CONFIG_X86_64 */
  666        if (likely(regs->flags & X86_EFLAGS_IF))
  667                local_irq_enable();
  668
             /* A reserved bit set in a page-table entry is reported as corruption. */
  669        if (unlikely(error_code & PF_RSVD))
  670                pgtable_bad(address, regs, error_code);
  671
  672        /*
  673         * If we're in an interrupt, have no user context or are running in an
  674         * atomic region then we must not take the fault.
  675         */
  676        if (unlikely(in_atomic() || !mm))
  677                goto bad_area_nosemaphore;
  678
  679        /*
  680         * User-mode registers count as a user access even for any
  681         * potential system fault or CPU buglet.
  682         */
  683        if (user_mode_vm(regs))
  684                error_code |= PF_USER;
         /* X86_64 retry point: out_of_memory jumps back here for the init task. */
  685again:
  686#endif
  687        /* When running in the kernel we expect faults to occur only to
  688         * addresses in user space.  All other faults represent errors in the
  689         * kernel and should generate an OOPS.  Unfortunately, in the case of an
  690         * erroneous fault occurring in a code path which already holds mmap_sem
  691         * we will deadlock attempting to validate the fault against the
  692         * address space.  Luckily the kernel only validly references user
  693         * space from well defined areas of code, which are listed in the
  694         * exceptions table.
  695         *
  696         * As the vast majority of faults will be valid we will only perform
  697         * the source reference check when there is a possibility of a deadlock.
  698         * Attempt to lock the address space, if we cannot we then validate the
  699         * source.  If this is invalid we can skip the address space check,
  700         * thus avoiding the deadlock.
  701         */
  702        if (!down_read_trylock(&mm->mmap_sem)) {
  703                if ((error_code & PF_USER) == 0 &&
  704                    !search_exception_tables(regs->ip))
  705                        goto bad_area_nosemaphore;
  706                down_read(&mm->mmap_sem);
  707        }
  708
             /*
              * Locate the vma for the address.  An address below the vma
              * that find_vma() returns is only acceptable when that vma
              * is VM_GROWSDOWN and can be expanded to cover it.
              */
  709        vma = find_vma(mm, address);
  710        if (!vma)
  711                goto bad_area;
  712        if (vma->vm_start <= address)
  713                goto good_area;
  714        if (!(vma->vm_flags & VM_GROWSDOWN))
  715                goto bad_area;
  716        if (error_code & PF_USER) {
  717                /*
  718                 * Accessing the stack below %sp is always a bug.
  719                 * The large cushion allows instructions like enter
  720                 * and pusha to work.  ("enter $65535,$31" pushes
  721                 * 32 pointers and then decrements %sp by 65535.)
  722                 */
  723                if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
  724                        goto bad_area;
  725        }
  726        if (expand_stack(vma, address))
  727                goto bad_area;
  728/*
  729 * Ok, we have a good vm_area for this memory access, so
  730 * we can handle it..
  731 */
  732good_area:
             /* From here on a failure is an access error, not a missing mapping. */
  733        si_code = SEGV_ACCERR;
  734        write = 0;
  735        switch (error_code & (PF_PROT|PF_WRITE)) {
  736        default:        /* 3: write, present */
  737                /* fall through */
  738        case PF_WRITE:          /* write, not present */
  739                if (!(vma->vm_flags & VM_WRITE))
  740                        goto bad_area;
  741                write++;
  742                break;
  743        case PF_PROT:           /* read, present */
  744                goto bad_area;
  745        case 0:                 /* read, not present */
  746                if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
  747                        goto bad_area;
  748        }
  749
  750#ifdef CONFIG_X86_32
         /* X86_32 retry point: out_of_memory re-takes mmap_sem and jumps back here. */
  751survive:
  752#endif
  753        /*
  754         * If for any reason at all we couldn't handle the fault,
  755         * make sure we exit gracefully rather than endlessly redo
  756         * the fault.
  757         */
  758        fault = handle_mm_fault(mm, vma, address, write);
  759        if (unlikely(fault & VM_FAULT_ERROR)) {
  760                if (fault & VM_FAULT_OOM)
  761                        goto out_of_memory;
  762                else if (fault & VM_FAULT_SIGBUS)
  763                        goto do_sigbus;
  764                BUG();
  765        }
             /* Account the fault as major or minor on the task. */
  766        if (fault & VM_FAULT_MAJOR)
  767                tsk->maj_flt++;
  768        else
  769                tsk->min_flt++;
  770
  771#ifdef CONFIG_X86_32
  772        /*
  773         * Did it hit the DOS screen memory VA from vm86 mode?
  774         */
  775        if (v8086_mode(regs)) {
  776                unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
  777                if (bit < 32)
  778                        tsk->thread.screen_bitmap |= 1 << bit;
  779        }
  780#endif
  781        up_read(&mm->mmap_sem);
  782        return;
  783
  784/*
  785 * Something tried to access memory that isn't in our memory map..
  786 * Fix it, but check if it's kernel or user first..
  787 */
  788bad_area:
  789        up_read(&mm->mmap_sem);
  790
         /* Entered directly by the paths above that never took mmap_sem. */
  791bad_area_nosemaphore:
  792        /* User mode accesses just cause a SIGSEGV */
  793        if (error_code & PF_USER) {
  794                /*
  795                 * It's possible to have interrupts off here.
  796                 */
  797                local_irq_enable();
  798
  799                /*
  800                 * Valid to do another page fault here because this one came
  801                 * from user space.
  802                 */
  803                if (is_prefetch(regs, address, error_code))
  804                        return;
  805
  806                if (is_errata100(regs, address))
  807                        return;
  808
                     /* Rate-limited log line for a SIGSEGV the task does not handle. */
  809                if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
  810                    printk_ratelimit()) {
  811                        printk(
  812                        "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
  813                        task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
  814                        tsk->comm, task_pid_nr(tsk), address,
  815                        (void *) regs->ip, (void *) regs->sp, error_code);
  816                        print_vma_addr(" in ", regs->ip);
  817                        printk("\n");
  818                }
  819
  820                tsk->thread.cr2 = address;
  821                /* Kernel addresses are always protection faults */
  822                tsk->thread.error_code = error_code | (address >= TASK_SIZE);
  823                tsk->thread.trap_no = 14;
  824                force_sig_info_fault(SIGSEGV, si_code, address, tsk);
  825                return;
  826        }
  827
  828        if (is_f00f_bug(regs, address))
  829                return;
  830
         /* Kernel-mode fault; also reached from out_of_memory and do_sigbus. */
  831no_context:
  832        /* Are we prepared to handle this kernel fault?  */
  833        if (fixup_exception(regs))
  834                return;
  835
  836        /*
  837         * X86_32
  838         * Valid to do another page fault here, because if this fault
  839         * had been triggered by is_prefetch fixup_exception would have
  840         * handled it.
  841         *
  842         * X86_64
  843         * Hall of shame of CPU/BIOS bugs.
  844         */
  845        if (is_prefetch(regs, address, error_code))
  846                return;
  847
  848        if (is_errata93(regs, address))
  849                return;
  850
  851/*
  852 * Oops. The kernel tried to access some bad page. We'll have to
  853 * terminate things with extreme prejudice.
  854 */
  855#ifdef CONFIG_X86_32
  856        bust_spinlocks(1);
  857#else
  858        flags = oops_begin();
  859#endif
  860
  861        show_fault_oops(regs, error_code, address);
  862
  863        tsk->thread.cr2 = address;
  864        tsk->thread.trap_no = 14;
  865        tsk->thread.error_code = error_code;
  866
  867#ifdef CONFIG_X86_32
  868        die("Oops", regs, error_code);
  869        bust_spinlocks(0);
  870        do_exit(SIGKILL);
  871#else
  872        if (__die("Oops", regs, error_code))
  873                regs = NULL;
  874        /* Executive summary in case the body of the oops scrolled away */
  875        printk(KERN_EMERG "CR2: %016lx\n", address);
  876        oops_end(flags, regs, SIGKILL);
  877#endif
  878
  879/*
  880 * We ran out of memory, or some other thing happened to us that made
  881 * us unable to handle the page fault gracefully.
  882 */
  883out_of_memory:
  884        up_read(&mm->mmap_sem);
             /* The global init task is not killed: yield and retry the fault. */
  885        if (is_global_init(tsk)) {
  886                yield();
  887#ifdef CONFIG_X86_32
  888                down_read(&mm->mmap_sem);
  889                goto survive;
  890#else
  891                goto again;
  892#endif
  893        }
  894
  895        printk("VM: killing process %s\n", tsk->comm);
  896        if (error_code & PF_USER)
  897                do_group_exit(SIGKILL);
  898        goto no_context;
  899
  900do_sigbus:
  901        up_read(&mm->mmap_sem);
  902
  903        /* Kernel mode? Handle exceptions or die */
  904        if (!(error_code & PF_USER))
  905                goto no_context;
  906#ifdef CONFIG_X86_32
  907        /* User space => ok to do another page fault */
  908        if (is_prefetch(regs, address, error_code))
  909                return;
  910#endif
  911        tsk->thread.cr2 = address;
  912        tsk->thread.error_code = error_code;
  913        tsk->thread.trap_no = 14;
  914        force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
  915}
 916
  917DEFINE_SPINLOCK(pgd_lock); /* held by vmalloc_sync_all() while it walks pgd_list */
  918LIST_HEAD(pgd_list); /* list of pages (linked via page->lru) holding pgds, walked by vmalloc_sync_all() */
 919
 920void vmalloc_sync_all(void)
 921{
 922#ifdef CONFIG_X86_32
 923        unsigned long start = VMALLOC_START & PGDIR_MASK;
 924        unsigned long address;
 925
 926        if (SHARED_KERNEL_PMD)
 927                return;
 928
 929        BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
 930        for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
 931                unsigned long flags;
 932                struct page *page;
 933
 934                spin_lock_irqsave(&pgd_lock, flags);
 935                list_for_each_entry(page, &pgd_list, lru) {
 936                        if (!vmalloc_sync_one(page_address(page),
 937                                              address))
 938                                break;
 939                }
 940                spin_unlock_irqrestore(&pgd_lock, flags);
 941        }
 942#else /* CONFIG_X86_64 */
 943        unsigned long start = VMALLOC_START & PGDIR_MASK;
 944        unsigned long address;
 945
 946        for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
 947                const pgd_t *pgd_ref = pgd_offset_k(address);
 948                unsigned long flags;
 949                struct page *page;
 950
 951                if (pgd_none(*pgd_ref))
 952                        continue;
 953                spin_lock_irqsave(&pgd_lock, flags);
 954                list_for_each_entry(page, &pgd_list, lru) {
 955                        pgd_t *pgd;
 956                        pgd = (pgd_t *)page_address(page) + pgd_index(address);
 957                        if (pgd_none(*pgd))
 958                                set_pgd(pgd, *pgd_ref);
 959                        else
 960                                BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
 961                }
 962                spin_unlock_irqrestore(&pgd_lock, flags);
 963        }
 964#endif
 965}
 966
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.