linux/arch/x86/mm/fault.c
<<
>>
Prefs
   1/*
   2 *  Copyright (C) 1995  Linus Torvalds
   3 *  Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
   4 *  Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
   5 */
   6#include <linux/sched.h>                /* test_thread_flag(), ...      */
   7#include <linux/kdebug.h>               /* oops_begin/end, ...          */
   8#include <linux/module.h>               /* search_exception_table       */
   9#include <linux/bootmem.h>              /* max_low_pfn                  */
  10#include <linux/kprobes.h>              /* NOKPROBE_SYMBOL, ...         */
  11#include <linux/mmiotrace.h>            /* kmmio_handler, ...           */
  12#include <linux/perf_event.h>           /* perf_sw_event                */
  13#include <linux/hugetlb.h>              /* hstate_index_to_shift        */
  14#include <linux/prefetch.h>             /* prefetchw                    */
  15#include <linux/context_tracking.h>     /* exception_enter(), ...       */
  16
  17#include <asm/traps.h>                  /* dotraplinkage, ...           */
  18#include <asm/pgalloc.h>                /* pgd_*(), ...                 */
  19#include <asm/kmemcheck.h>              /* kmemcheck_*(), ...           */
  20#include <asm/fixmap.h>                 /* VSYSCALL_ADDR                */
  21#include <asm/vsyscall.h>               /* emulate_vsyscall             */
  22
  23#define CREATE_TRACE_POINTS
  24#include <asm/trace/exceptions.h>
  25
  26/*
  27 * Page fault error code bits:
  28 *
  29 *   bit 0 ==    0: no page found       1: protection fault
  30 *   bit 1 ==    0: read access         1: write access
  31 *   bit 2 ==    0: kernel-mode access  1: user-mode access
  32 *   bit 3 ==                           1: use of reserved bit detected
  33 *   bit 4 ==                           1: fault was an instruction fetch
  34 */
  35enum x86_pf_error_code {
  36
  37        PF_PROT         =               1 << 0,
  38        PF_WRITE        =               1 << 1,
  39        PF_USER         =               1 << 2,
  40        PF_RSVD         =               1 << 3,
  41        PF_INSTR        =               1 << 4,
  42};
  43
  44/*
  45 * Returns 0 if mmiotrace is disabled, or if the fault is not
  46 * handled by mmiotrace:
  47 */
  48static nokprobe_inline int
  49kmmio_fault(struct pt_regs *regs, unsigned long addr)
  50{
  51        if (unlikely(is_kmmio_active()))
  52                if (kmmio_handler(regs, addr) == 1)
  53                        return -1;
  54        return 0;
  55}
  56
  57static nokprobe_inline int kprobes_fault(struct pt_regs *regs)
  58{
  59        int ret = 0;
  60
  61        /* kprobe_running() needs smp_processor_id() */
  62        if (kprobes_built_in() && !user_mode_vm(regs)) {
  63                preempt_disable();
  64                if (kprobe_running() && kprobe_fault_handler(regs, 14))
  65                        ret = 1;
  66                preempt_enable();
  67        }
  68
  69        return ret;
  70}
  71
  72/*
  73 * Prefetch quirks:
  74 *
  75 * 32-bit mode:
  76 *
  77 *   Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
  78 *   Check that here and ignore it.
  79 *
  80 * 64-bit mode:
  81 *
  82 *   Sometimes the CPU reports invalid exceptions on prefetch.
  83 *   Check that here and ignore it.
  84 *
  85 * Opcode checker based on code by Richard Brunner.
  86 */
  87static inline int
  88check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
  89                      unsigned char opcode, int *prefetch)
  90{
  91        unsigned char instr_hi = opcode & 0xf0;
  92        unsigned char instr_lo = opcode & 0x0f;
  93
  94        switch (instr_hi) {
  95        case 0x20:
  96        case 0x30:
  97                /*
  98                 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
  99                 * In X86_64 long mode, the CPU will signal invalid
 100                 * opcode if some of these prefixes are present so
 101                 * X86_64 will never get here anyway
 102                 */
 103                return ((instr_lo & 7) == 0x6);
 104#ifdef CONFIG_X86_64
 105        case 0x40:
 106                /*
 107                 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
 108                 * Need to figure out under what instruction mode the
 109                 * instruction was issued. Could check the LDT for lm,
 110                 * but for now it's good enough to assume that long
 111                 * mode only uses well known segments or kernel.
 112                 */
 113                return (!user_mode(regs) || user_64bit_mode(regs));
 114#endif
 115        case 0x60:
 116                /* 0x64 thru 0x67 are valid prefixes in all modes. */
 117                return (instr_lo & 0xC) == 0x4;
 118        case 0xF0:
 119                /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
 120                return !instr_lo || (instr_lo>>1) == 1;
 121        case 0x00:
 122                /* Prefetch instruction is 0x0F0D or 0x0F18 */
 123                if (probe_kernel_address(instr, opcode))
 124                        return 0;
 125
 126                *prefetch = (instr_lo == 0xF) &&
 127                        (opcode == 0x0D || opcode == 0x18);
 128                return 0;
 129        default:
 130                return 0;
 131        }
 132}
 133
 134static int
 135is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
 136{
 137        unsigned char *max_instr;
 138        unsigned char *instr;
 139        int prefetch = 0;
 140
 141        /*
 142         * If it was a exec (instruction fetch) fault on NX page, then
 143         * do not ignore the fault:
 144         */
 145        if (error_code & PF_INSTR)
 146                return 0;
 147
 148        instr = (void *)convert_ip_to_linear(current, regs);
 149        max_instr = instr + 15;
 150
 151        if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
 152                return 0;
 153
 154        while (instr < max_instr) {
 155                unsigned char opcode;
 156
 157                if (probe_kernel_address(instr, opcode))
 158                        break;
 159
 160                instr++;
 161
 162                if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
 163                        break;
 164        }
 165        return prefetch;
 166}
 167
 168static void
 169force_sig_info_fault(int si_signo, int si_code, unsigned long address,
 170                     struct task_struct *tsk, int fault)
 171{
 172        unsigned lsb = 0;
 173        siginfo_t info;
 174
 175        info.si_signo   = si_signo;
 176        info.si_errno   = 0;
 177        info.si_code    = si_code;
 178        info.si_addr    = (void __user *)address;
 179        if (fault & VM_FAULT_HWPOISON_LARGE)
 180                lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); 
 181        if (fault & VM_FAULT_HWPOISON)
 182                lsb = PAGE_SHIFT;
 183        info.si_addr_lsb = lsb;
 184
 185        force_sig_info(si_signo, &info, tsk);
 186}
 187
/*
 * pgd_list is walked as a list of struct page (linked through ->lru) in
 * vmalloc_sync_all(), which holds pgd_lock for the duration of the walk.
 */
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);
 190
 191#ifdef CONFIG_X86_32
 192static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
 193{
 194        unsigned index = pgd_index(address);
 195        pgd_t *pgd_k;
 196        pud_t *pud, *pud_k;
 197        pmd_t *pmd, *pmd_k;
 198
 199        pgd += index;
 200        pgd_k = init_mm.pgd + index;
 201
 202        if (!pgd_present(*pgd_k))
 203                return NULL;
 204
 205        /*
 206         * set_pgd(pgd, *pgd_k); here would be useless on PAE
 207         * and redundant with the set_pmd() on non-PAE. As would
 208         * set_pud.
 209         */
 210        pud = pud_offset(pgd, address);
 211        pud_k = pud_offset(pgd_k, address);
 212        if (!pud_present(*pud_k))
 213                return NULL;
 214
 215        pmd = pmd_offset(pud, address);
 216        pmd_k = pmd_offset(pud_k, address);
 217        if (!pmd_present(*pmd_k))
 218                return NULL;
 219
 220        if (!pmd_present(*pmd))
 221                set_pmd(pmd, *pmd_k);
 222        else
 223                BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
 224
 225        return pmd_k;
 226}
 227
 228void vmalloc_sync_all(void)
 229{
 230        unsigned long address;
 231
 232        if (SHARED_KERNEL_PMD)
 233                return;
 234
 235        for (address = VMALLOC_START & PMD_MASK;
 236             address >= TASK_SIZE && address < FIXADDR_TOP;
 237             address += PMD_SIZE) {
 238                struct page *page;
 239
 240                spin_lock(&pgd_lock);
 241                list_for_each_entry(page, &pgd_list, lru) {
 242                        spinlock_t *pgt_lock;
 243                        pmd_t *ret;
 244
 245                        /* the pgt_lock only for Xen */
 246                        pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
 247
 248                        spin_lock(pgt_lock);
 249                        ret = vmalloc_sync_one(page_address(page), address);
 250                        spin_unlock(pgt_lock);
 251
 252                        if (!ret)
 253                                break;
 254                }
 255                spin_unlock(&pgd_lock);
 256        }
 257}
 258
 259/*
 260 * 32-bit:
 261 *
 262 *   Handle a fault on the vmalloc or module mapping area
 263 */
 264static noinline int vmalloc_fault(unsigned long address)
 265{
 266        unsigned long pgd_paddr;
 267        pmd_t *pmd_k;
 268        pte_t *pte_k;
 269
 270        /* Make sure we are in vmalloc area: */
 271        if (!(address >= VMALLOC_START && address < VMALLOC_END))
 272                return -1;
 273
 274        WARN_ON_ONCE(in_nmi());
 275
 276        /*
 277         * Synchronize this task's top level page-table
 278         * with the 'reference' page table.
 279         *
 280         * Do _not_ use "current" here. We might be inside
 281         * an interrupt in the middle of a task switch..
 282         */
 283        pgd_paddr = read_cr3();
 284        pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
 285        if (!pmd_k)
 286                return -1;
 287
 288        pte_k = pte_offset_kernel(pmd_k, address);
 289        if (!pte_present(*pte_k))
 290                return -1;
 291
 292        return 0;
 293}
 294NOKPROBE_SYMBOL(vmalloc_fault);
 295
 296/*
 297 * Did it hit the DOS screen memory VA from vm86 mode?
 298 */
 299static inline void
 300check_v8086_mode(struct pt_regs *regs, unsigned long address,
 301                 struct task_struct *tsk)
 302{
 303        unsigned long bit;
 304
 305        if (!v8086_mode(regs))
 306                return;
 307
 308        bit = (address - 0xA0000) >> PAGE_SHIFT;
 309        if (bit < 32)
 310                tsk->thread.screen_bitmap |= 1 << bit;
 311}
 312
 313static bool low_pfn(unsigned long pfn)
 314{
 315        return pfn < max_low_pfn;
 316}
 317
 318static void dump_pagetable(unsigned long address)
 319{
 320        pgd_t *base = __va(read_cr3());
 321        pgd_t *pgd = &base[pgd_index(address)];
 322        pmd_t *pmd;
 323        pte_t *pte;
 324
 325#ifdef CONFIG_X86_PAE
 326        printk("*pdpt = %016Lx ", pgd_val(*pgd));
 327        if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
 328                goto out;
 329#endif
 330        pmd = pmd_offset(pud_offset(pgd, address), address);
 331        printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
 332
 333        /*
 334         * We must not directly access the pte in the highpte
 335         * case if the page table is located in highmem.
 336         * And let's rather not kmap-atomic the pte, just in case
 337         * it's allocated already:
 338         */
 339        if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd))
 340                goto out;
 341
 342        pte = pte_offset_kernel(pmd, address);
 343        printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
 344out:
 345        printk("\n");
 346}
 347
 348#else /* CONFIG_X86_64: */
 349
 350void vmalloc_sync_all(void)
 351{
 352        sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END, 0);
 353}
 354
/*
 * 64-bit:
 *
 *   Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 *
 * Compares the page tables of current->active_mm against the reference
 * tables obtained through pgd_offset_k().  Returns 0 when the address is
 * mapped consistently in both (copying the pgd entry over first if it was
 * missing), and -1 when the address is outside the vmalloc range or the
 * reference tables have no mapping for it.  Any mismatch below the pgd
 * level is treated as a bug.
 */
static noinline int vmalloc_fault(unsigned long address)
{
        pgd_t *pgd, *pgd_ref;
        pud_t *pud, *pud_ref;
        pmd_t *pmd, *pmd_ref;
        pte_t *pte, *pte_ref;

        /* Make sure we are in vmalloc area: */
        if (!(address >= VMALLOC_START && address < VMALLOC_END))
                return -1;

        WARN_ON_ONCE(in_nmi());

        /*
         * Copy kernel mappings over when needed. This can also
         * happen within a race in page table update. In the latter
         * case just flush:
         */
        pgd = pgd_offset(current->active_mm, address);
        pgd_ref = pgd_offset_k(address);
        if (pgd_none(*pgd_ref))
                return -1;

        if (pgd_none(*pgd)) {
                set_pgd(pgd, *pgd_ref);
                arch_flush_lazy_mmu_mode();
        } else {
                /* An existing entry must point at the same pud page. */
                BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
        }

        /*
         * Below here mismatches are bugs because these lower tables
         * are shared:
         */

        pud = pud_offset(pgd, address);
        pud_ref = pud_offset(pgd_ref, address);
        if (pud_none(*pud_ref))
                return -1;

        if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
                BUG();

        pmd = pmd_offset(pud, address);
        pmd_ref = pmd_offset(pud_ref, address);
        if (pmd_none(*pmd_ref))
                return -1;

        if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
                BUG();

        pte_ref = pte_offset_kernel(pmd_ref, address);
        if (!pte_present(*pte_ref))
                return -1;

        pte = pte_offset_kernel(pmd, address);

        /*
         * Don't use pte_page here, because the mappings can point
         * outside mem_map, and the NUMA hash lookup cannot handle
         * that:
         */
        if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
                BUG();

        return 0;
}
NOKPROBE_SYMBOL(vmalloc_fault);
 430
#ifdef CONFIG_CPU_SUP_AMD
/*
 * Message printed (once, via printk_once()) by is_errata93() when the
 * K8 erratum #93 workaround is applied.
 */
static const char errata93_warning[] =
KERN_ERR 
"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
"******* Working around it, but it may cause SEGVs or burn power.\n"
"******* Please consider a BIOS update.\n"
"******* Disabling USB legacy in the BIOS may also help.\n";
#endif
 439
/*
 * No vm86 mode in 64-bit mode, so there is nothing to check: this is an
 * intentionally empty stub with the same signature as the 32-bit version.
 */
static inline void
check_v8086_mode(struct pt_regs *regs, unsigned long address,
                 struct task_struct *tsk)
{
}
 448
 449static int bad_address(void *p)
 450{
 451        unsigned long dummy;
 452
 453        return probe_kernel_address((unsigned long *)p, dummy);
 454}
 455
 456static void dump_pagetable(unsigned long address)
 457{
 458        pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
 459        pgd_t *pgd = base + pgd_index(address);
 460        pud_t *pud;
 461        pmd_t *pmd;
 462        pte_t *pte;
 463
 464        if (bad_address(pgd))
 465                goto bad;
 466
 467        printk("PGD %lx ", pgd_val(*pgd));
 468
 469        if (!pgd_present(*pgd))
 470                goto out;
 471
 472        pud = pud_offset(pgd, address);
 473        if (bad_address(pud))
 474                goto bad;
 475
 476        printk("PUD %lx ", pud_val(*pud));
 477        if (!pud_present(*pud) || pud_large(*pud))
 478                goto out;
 479
 480        pmd = pmd_offset(pud, address);
 481        if (bad_address(pmd))
 482                goto bad;
 483
 484        printk("PMD %lx ", pmd_val(*pmd));
 485        if (!pmd_present(*pmd) || pmd_large(*pmd))
 486                goto out;
 487
 488        pte = pte_offset_kernel(pmd, address);
 489        if (bad_address(pte))
 490                goto bad;
 491
 492        printk("PTE %lx", pte_val(*pte));
 493out:
 494        printk("\n");
 495        return;
 496bad:
 497        printk("BAD\n");
 498}
 499
 500#endif /* CONFIG_X86_64 */
 501
 502/*
 503 * Workaround for K8 erratum #93 & buggy BIOS.
 504 *
 505 * BIOS SMM functions are required to use a specific workaround
 506 * to avoid corruption of the 64bit RIP register on C stepping K8.
 507 *
 508 * A lot of BIOS that didn't get tested properly miss this.
 509 *
 510 * The OS sees this as a page fault with the upper 32bits of RIP cleared.
 511 * Try to work around it here.
 512 *
 513 * Note we only handle faults in kernel here.
 514 * Does nothing on 32-bit.
 515 */
 516static int is_errata93(struct pt_regs *regs, unsigned long address)
 517{
 518#if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD)
 519        if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
 520            || boot_cpu_data.x86 != 0xf)
 521                return 0;
 522
 523        if (address != regs->ip)
 524                return 0;
 525
 526        if ((address >> 32) != 0)
 527                return 0;
 528
 529        address |= 0xffffffffUL << 32;
 530        if ((address >= (u64)_stext && address <= (u64)_etext) ||
 531            (address >= MODULES_VADDR && address <= MODULES_END)) {
 532                printk_once(errata93_warning);
 533                regs->ip = address;
 534                return 1;
 535        }
 536#endif
 537        return 0;
 538}
 539
 540/*
 * Work around K8 erratum #100: K8 in compat mode occasionally jumps
 * to illegal addresses >4GB.
 543 *
 544 * We catch this in the page fault handler because these addresses
 545 * are not reachable. Just detect this case and return.  Any code
 546 * segment in LDT is compatibility mode.
 547 */
 548static int is_errata100(struct pt_regs *regs, unsigned long address)
 549{
 550#ifdef CONFIG_X86_64
 551        if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32))
 552                return 1;
 553#endif
 554        return 0;
 555}
 556
 557static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
 558{
 559#ifdef CONFIG_X86_F00F_BUG
 560        unsigned long nr;
 561
 562        /*
 563         * Pentium F0 0F C7 C8 bug workaround:
 564         */
 565        if (boot_cpu_has_bug(X86_BUG_F00F)) {
 566                nr = (address - idt_descr.address) >> 3;
 567
 568                if (nr == 6) {
 569                        do_invalid_op(regs, 0);
 570                        return 1;
 571                }
 572        }
 573#endif
 574        return 0;
 575}
 576
/*
 * printk() format strings used by show_fault_oops() for instruction
 * fetch faults; each takes the current uid as its only argument.
 */
static const char nx_warning[] = KERN_CRIT
"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n";
static const char smep_warning[] = KERN_CRIT
"unable to execute userspace code (SMEP?) (uid: %d)\n";
 581
 582static void
 583show_fault_oops(struct pt_regs *regs, unsigned long error_code,
 584                unsigned long address)
 585{
 586        if (!oops_may_print())
 587                return;
 588
 589        if (error_code & PF_INSTR) {
 590                unsigned int level;
 591                pgd_t *pgd;
 592                pte_t *pte;
 593
 594                pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
 595                pgd += pgd_index(address);
 596
 597                pte = lookup_address_in_pgd(pgd, address, &level);
 598
 599                if (pte && pte_present(*pte) && !pte_exec(*pte))
 600                        printk(nx_warning, from_kuid(&init_user_ns, current_uid()));
 601                if (pte && pte_present(*pte) && pte_exec(*pte) &&
 602                                (pgd_flags(*pgd) & _PAGE_USER) &&
 603                                (read_cr4() & X86_CR4_SMEP))
 604                        printk(smep_warning, from_kuid(&init_user_ns, current_uid()));
 605        }
 606
 607        printk(KERN_ALERT "BUG: unable to handle kernel ");
 608        if (address < PAGE_SIZE)
 609                printk(KERN_CONT "NULL pointer dereference");
 610        else
 611                printk(KERN_CONT "paging request");
 612
 613        printk(KERN_CONT " at %p\n", (void *) address);
 614        printk(KERN_ALERT "IP:");
 615        printk_address(regs->ip);
 616
 617        dump_pagetable(address);
 618}
 619
 620static noinline void
 621pgtable_bad(struct pt_regs *regs, unsigned long error_code,
 622            unsigned long address)
 623{
 624        struct task_struct *tsk;
 625        unsigned long flags;
 626        int sig;
 627
 628        flags = oops_begin();
 629        tsk = current;
 630        sig = SIGKILL;
 631
 632        printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
 633               tsk->comm, address);
 634        dump_pagetable(address);
 635
 636        tsk->thread.cr2         = address;
 637        tsk->thread.trap_nr     = X86_TRAP_PF;
 638        tsk->thread.error_code  = error_code;
 639
 640        if (__die("Bad pagetable", regs, error_code))
 641                sig = 0;
 642
 643        oops_end(flags, regs, sig);
 644}
 645
/*
 * Handle a page fault that cannot be resolved through the normal paths.
 *
 * If fixup_exception() handles the fault, return -- after forcing
 * @signal/@si_code on the current task when we are in task context,
 * sig_on_uaccess_error is set and @signal is non-zero.  Otherwise try the
 * is_prefetch() and is_errata93() workarounds, and if neither applies
 * print an oops and finish through oops_end().
 */
static noinline void
no_context(struct pt_regs *regs, unsigned long error_code,
           unsigned long address, int signal, int si_code)
{
        struct task_struct *tsk = current;
        unsigned long flags;
        int sig;

        /* Are we prepared to handle this kernel fault? */
        if (fixup_exception(regs)) {
                /*
                 * Any interrupt that takes a fault gets the fixup. This makes
                 * the below recursive fault logic only apply to a faults from
                 * task context.
                 */
                if (in_interrupt())
                        return;

                /*
                 * Per the above we're !in_interrupt(), aka. task context.
                 *
                 * In this case we need to make sure we're not recursively
                 * faulting through the emulate_vsyscall() logic.
                 */
                if (current_thread_info()->sig_on_uaccess_error && signal) {
                        tsk->thread.trap_nr = X86_TRAP_PF;
                        tsk->thread.error_code = error_code | PF_USER;
                        tsk->thread.cr2 = address;

                        /* XXX: hwpoison faults will set the wrong code. */
                        force_sig_info_fault(signal, si_code, address, tsk, 0);
                }

                /*
                 * Barring that, we can do the fixup and be happy.
                 */
                return;
        }

        /*
         * 32-bit:
         *
         *   Valid to do another page fault here, because if this fault
         *   had been triggered by is_prefetch fixup_exception would have
         *   handled it.
         *
         * 64-bit:
         *
         *   Hall of shame of CPU/BIOS bugs.
         */
        if (is_prefetch(regs, error_code, address))
                return;

        if (is_errata93(regs, address))
                return;

        /*
         * Oops. The kernel tried to access some bad page. We'll have to
         * terminate things with extreme prejudice:
         */
        flags = oops_begin();

        show_fault_oops(regs, error_code, address);

        if (task_stack_end_corrupted(tsk))
                printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");

        /* Record the fault details in the task's thread state. */
        tsk->thread.cr2         = address;
        tsk->thread.trap_nr     = X86_TRAP_PF;
        tsk->thread.error_code  = error_code;

        /* SIGKILL is passed to oops_end() unless __die() returns non-zero. */
        sig = SIGKILL;
        if (__die("Oops", regs, error_code))
                sig = 0;

        /* Executive summary in case the body of the oops scrolled away */
        printk(KERN_DEFAULT "CR2: %016lx\n", address);

        oops_end(flags, regs, sig);
}
 726
 727/*
 728 * Print out info about fatal segfaults, if the show_unhandled_signals
 729 * sysctl is set:
 730 */
 731static inline void
 732show_signal_msg(struct pt_regs *regs, unsigned long error_code,
 733                unsigned long address, struct task_struct *tsk)
 734{
 735        if (!unhandled_signal(tsk, SIGSEGV))
 736                return;
 737
 738        if (!printk_ratelimit())
 739                return;
 740
 741        printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
 742                task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
 743                tsk->comm, task_pid_nr(tsk), address,
 744                (void *)regs->ip, (void *)regs->sp, error_code);
 745
 746        print_vma_addr(KERN_CONT " in ", regs->ip);
 747
 748        printk(KERN_CONT "\n");
 749}
 750
 751static void
 752__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 753                       unsigned long address, int si_code)
 754{
 755        struct task_struct *tsk = current;
 756
 757        /* User mode accesses just cause a SIGSEGV */
 758        if (error_code & PF_USER) {
 759                /*
 760                 * It's possible to have interrupts off here:
 761                 */
 762                local_irq_enable();
 763
 764                /*
 765                 * Valid to do another page fault here because this one came
 766                 * from user space:
 767                 */
 768                if (is_prefetch(regs, error_code, address))
 769                        return;
 770
 771                if (is_errata100(regs, address))
 772                        return;
 773
 774#ifdef CONFIG_X86_64
 775                /*
 776                 * Instruction fetch faults in the vsyscall page might need
 777                 * emulation.
 778                 */
 779                if (unlikely((error_code & PF_INSTR) &&
 780                             ((address & ~0xfff) == VSYSCALL_ADDR))) {
 781                        if (emulate_vsyscall(regs, address))
 782                                return;
 783                }
 784#endif
 785                /* Kernel addresses are always protection faults: */
 786                if (address >= TASK_SIZE)
 787                        error_code |= PF_PROT;
 788
 789                if (likely(show_unhandled_signals))
 790                        show_signal_msg(regs, error_code, address, tsk);
 791
 792                tsk->thread.cr2         = address;
 793                tsk->thread.error_code  = error_code;
 794                tsk->thread.trap_nr     = X86_TRAP_PF;
 795
 796                force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);
 797
 798                return;
 799        }
 800
 801        if (is_f00f_bug(regs, address))
 802                return;
 803
 804        no_context(regs, error_code, address, SIGSEGV, si_code);
 805}
 806
/*
 * Bad-address handling without releasing mmap_sem, using SEGV_MAPERR as
 * the signal code.
 */
static noinline void
bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
                     unsigned long address)
{
        __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
}
 813
 814static void
 815__bad_area(struct pt_regs *regs, unsigned long error_code,
 816           unsigned long address, int si_code)
 817{
 818        struct mm_struct *mm = current->mm;
 819
 820        /*
 821         * Something tried to access memory that isn't in our memory map..
 822         * Fix it, but check if it's kernel or user first..
 823         */
 824        up_read(&mm->mmap_sem);
 825
 826        __bad_area_nosemaphore(regs, error_code, address, si_code);
 827}
 828
/*
 * Bad-address handling via __bad_area() (which releases mmap_sem), using
 * SEGV_MAPERR as the signal code.
 */
static noinline void
bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
        __bad_area(regs, error_code, address, SEGV_MAPERR);
}
 834
/*
 * Bad-address handling via __bad_area() (which releases mmap_sem), using
 * SEGV_ACCERR as the signal code.
 */
static noinline void
bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
                      unsigned long address)
{
        __bad_area(regs, error_code, address, SEGV_ACCERR);
}
 841
 842static void
 843do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
 844          unsigned int fault)
 845{
 846        struct task_struct *tsk = current;
 847        int code = BUS_ADRERR;
 848
 849        /* Kernel mode? Handle exceptions or die: */
 850        if (!(error_code & PF_USER)) {
 851                no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
 852                return;
 853        }
 854
 855        /* User-space => ok to do another page fault: */
 856        if (is_prefetch(regs, error_code, address))
 857                return;
 858
 859        tsk->thread.cr2         = address;
 860        tsk->thread.error_code  = error_code;
 861        tsk->thread.trap_nr     = X86_TRAP_PF;
 862
 863#ifdef CONFIG_MEMORY_FAILURE
 864        if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
 865                printk(KERN_ERR
 866        "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
 867                        tsk->comm, tsk->pid, address);
 868                code = BUS_MCEERR_AR;
 869        }
 870#endif
 871        force_sig_info_fault(SIGBUS, code, address, tsk, fault);
 872}
 873
 874static noinline void
 875mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 876               unsigned long address, unsigned int fault)
 877{
 878        if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
 879                no_context(regs, error_code, address, 0, 0);
 880                return;
 881        }
 882
 883        if (fault & VM_FAULT_OOM) {
 884                /* Kernel mode? Handle exceptions or die: */
 885                if (!(error_code & PF_USER)) {
 886                        no_context(regs, error_code, address,
 887                                   SIGSEGV, SEGV_MAPERR);
 888                        return;
 889                }
 890
 891                /*
 892                 * We ran out of memory, call the OOM killer, and return the
 893                 * userspace (which will retry the fault, or kill us if we got
 894                 * oom-killed):
 895                 */
 896                pagefault_out_of_memory();
 897        } else {
 898                if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
 899                             VM_FAULT_HWPOISON_LARGE))
 900                        do_sigbus(regs, error_code, address, fault);
 901                else if (fault & VM_FAULT_SIGSEGV)
 902                        bad_area_nosemaphore(regs, error_code, address);
 903                else
 904                        BUG();
 905        }
 906}
 907
 908static int spurious_fault_check(unsigned long error_code, pte_t *pte)
 909{
 910        if ((error_code & PF_WRITE) && !pte_write(*pte))
 911                return 0;
 912
 913        if ((error_code & PF_INSTR) && !pte_exec(*pte))
 914                return 0;
 915
 916        return 1;
 917}
 918
 919/*
 920 * Handle a spurious fault caused by a stale TLB entry.
 921 *
 922 * This allows us to lazily refresh the TLB when increasing the
 923 * permissions of a kernel page (RO -> RW or NX -> X).  Doing it
 924 * eagerly is very expensive since that implies doing a full
 925 * cross-processor TLB flush, even if no stale TLB entries exist
 926 * on other processors.
 927 *
 928 * Spurious faults may only occur if the TLB contains an entry with
 * fewer permissions than the page table entry.  Non-present (P = 0)
 930 * and reserved bit (R = 1) faults are never spurious.
 931 *
 932 * There are no security implications to leaving a stale TLB when
 933 * increasing the permissions on a page.
 934 *
 935 * Returns non-zero if a spurious fault was handled, zero otherwise.
 936 *
 937 * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3
 938 * (Optional Invalidation).
 939 */
 940static noinline int
 941spurious_fault(unsigned long error_code, unsigned long address)
 942{
 943        pgd_t *pgd;
 944        pud_t *pud;
 945        pmd_t *pmd;
 946        pte_t *pte;
 947        int ret;
 948
 949        /*
 950         * Only writes to RO or instruction fetches from NX may cause
 951         * spurious faults.
 952         *
 953         * These could be from user or supervisor accesses but the TLB
 954         * is only lazily flushed after a kernel mapping protection
 955         * change, so user accesses are not expected to cause spurious
 956         * faults.
 957         */
 958        if (error_code != (PF_WRITE | PF_PROT)
 959            && error_code != (PF_INSTR | PF_PROT))
 960                return 0;
 961
 962        pgd = init_mm.pgd + pgd_index(address);
 963        if (!pgd_present(*pgd))
 964                return 0;
 965
 966        pud = pud_offset(pgd, address);
 967        if (!pud_present(*pud))
 968                return 0;
 969
 970        if (pud_large(*pud))
 971                return spurious_fault_check(error_code, (pte_t *) pud);
 972
 973        pmd = pmd_offset(pud, address);
 974        if (!pmd_present(*pmd))
 975                return 0;
 976
 977        if (pmd_large(*pmd))
 978                return spurious_fault_check(error_code, (pte_t *) pmd);
 979
 980        pte = pte_offset_kernel(pmd, address);
 981        if (!pte_present(*pte))
 982                return 0;
 983
 984        ret = spurious_fault_check(error_code, pte);
 985        if (!ret)
 986                return 0;
 987
 988        /*
 989         * Make sure we have permissions in PMD.
 990         * If not, then there's a bug in the page tables:
 991         */
 992        ret = spurious_fault_check(error_code, (pte_t *) pmd);
 993        WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
 994
 995        return ret;
 996}
 997NOKPROBE_SYMBOL(spurious_fault);
 998
/*
 * Global (non-static) flag, initialised to 1.
 * NOTE(review): going by the name this presumably controls whether
 * unhandled signals are reported -- confirm against its users.
 */
int show_unhandled_signals = 1;
1000
1001static inline int
1002access_error(unsigned long error_code, struct vm_area_struct *vma)
1003{
1004        if (error_code & PF_WRITE) {
1005                /* write, present and write, not present: */
1006                if (unlikely(!(vma->vm_flags & VM_WRITE)))
1007                        return 1;
1008                return 0;
1009        }
1010
1011        /* read, present: */
1012        if (unlikely(error_code & PF_PROT))
1013                return 1;
1014
1015        /* read, not present: */
1016        if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
1017                return 1;
1018
1019        return 0;
1020}
1021
1022static int fault_in_kernel_space(unsigned long address)
1023{
1024        return address >= TASK_SIZE_MAX;
1025}
1026
1027static inline bool smap_violation(int error_code, struct pt_regs *regs)
1028{
1029        if (!IS_ENABLED(CONFIG_X86_SMAP))
1030                return false;
1031
1032        if (!static_cpu_has(X86_FEATURE_SMAP))
1033                return false;
1034
1035        if (error_code & PF_USER)
1036                return false;
1037
1038        if (!user_mode_vm(regs) && (regs->flags & X86_EFLAGS_AC))
1039                return false;
1040
1041        return true;
1042}
1043
1044/*
1045 * This routine handles page faults.  It determines the address,
1046 * and the problem, and then passes it off to one of the appropriate
1047 * routines.
1048 *
1049 * This function must have noinline because both callers
1050 * {,trace_}do_page_fault() have notrace on. Having this an actual function
1051 * guarantees there's a function trace entry.
1052 */
1053static noinline void
1054__do_page_fault(struct pt_regs *regs, unsigned long error_code,
1055                unsigned long address)
1056{
1057        struct vm_area_struct *vma;
1058        struct task_struct *tsk;
1059        struct mm_struct *mm;
1060        int fault, major = 0;
1061        unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
1062
1063        tsk = current;
1064        mm = tsk->mm;
1065
1066        /*
1067         * Detect and handle instructions that would cause a page fault for
1068         * both a tracked kernel page and a userspace page.
1069         */
1070        if (kmemcheck_active(regs))
1071                kmemcheck_hide(regs);
1072        prefetchw(&mm->mmap_sem);
1073
1074        if (unlikely(kmmio_fault(regs, address)))
1075                return;
1076
1077        /*
1078         * We fault-in kernel-space virtual memory on-demand. The
1079         * 'reference' page table is init_mm.pgd.
1080         *
1081         * NOTE! We MUST NOT take any locks for this case. We may
1082         * be in an interrupt or a critical region, and should
1083         * only copy the information from the master page table,
1084         * nothing more.
1085         *
1086         * This verifies that the fault happens in kernel space
1087         * (error_code & 4) == 0, and that the fault was not a
1088         * protection error (error_code & 9) == 0.
1089         */
1090        if (unlikely(fault_in_kernel_space(address))) {
1091                if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
1092                        if (vmalloc_fault(address) >= 0)
1093                                return;
1094
1095                        if (kmemcheck_fault(regs, address, error_code))
1096                                return;
1097                }
1098
1099                /* Can handle a stale RO->RW TLB: */
1100                if (spurious_fault(error_code, address))
1101                        return;
1102
1103                /* kprobes don't want to hook the spurious faults: */
1104                if (kprobes_fault(regs))
1105                        return;
1106                /*
1107                 * Don't take the mm semaphore here. If we fixup a prefetch
1108                 * fault we could otherwise deadlock:
1109                 */
1110                bad_area_nosemaphore(regs, error_code, address);
1111
1112                return;
1113        }
1114
1115        /* kprobes don't want to hook the spurious faults: */
1116        if (unlikely(kprobes_fault(regs)))
1117                return;
1118
1119        if (unlikely(error_code & PF_RSVD))
1120                pgtable_bad(regs, error_code, address);
1121
1122        if (unlikely(smap_violation(error_code, regs))) {
1123                bad_area_nosemaphore(regs, error_code, address);
1124                return;
1125        }
1126
1127        /*
1128         * If we're in an interrupt, have no user context or are running
1129         * in an atomic region then we must not take the fault:
1130         */
1131        if (unlikely(in_atomic() || !mm)) {
1132                bad_area_nosemaphore(regs, error_code, address);
1133                return;
1134        }
1135
1136        /*
1137         * It's safe to allow irq's after cr2 has been saved and the
1138         * vmalloc fault has been handled.
1139         *
1140         * User-mode registers count as a user access even for any
1141         * potential system fault or CPU buglet:
1142         */
1143        if (user_mode_vm(regs)) {
1144                local_irq_enable();
1145                error_code |= PF_USER;
1146                flags |= FAULT_FLAG_USER;
1147        } else {
1148                if (regs->flags & X86_EFLAGS_IF)
1149                        local_irq_enable();
1150        }
1151
1152        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
1153
1154        if (error_code & PF_WRITE)
1155                flags |= FAULT_FLAG_WRITE;
1156
1157        /*
1158         * When running in the kernel we expect faults to occur only to
1159         * addresses in user space.  All other faults represent errors in
1160         * the kernel and should generate an OOPS.  Unfortunately, in the
1161         * case of an erroneous fault occurring in a code path which already
1162         * holds mmap_sem we will deadlock attempting to validate the fault
1163         * against the address space.  Luckily the kernel only validly
1164         * references user space from well defined areas of code, which are
1165         * listed in the exceptions table.
1166         *
1167         * As the vast majority of faults will be valid we will only perform
1168         * the source reference check when there is a possibility of a
1169         * deadlock. Attempt to lock the address space, if we cannot we then
1170         * validate the source. If this is invalid we can skip the address
1171         * space check, thus avoiding the deadlock:
1172         */
1173        if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
1174                if ((error_code & PF_USER) == 0 &&
1175                    !search_exception_tables(regs->ip)) {
1176                        bad_area_nosemaphore(regs, error_code, address);
1177                        return;
1178                }
1179retry:
1180                down_read(&mm->mmap_sem);
1181        } else {
1182                /*
1183                 * The above down_read_trylock() might have succeeded in
1184                 * which case we'll have missed the might_sleep() from
1185                 * down_read():
1186                 */
1187                might_sleep();
1188        }
1189
1190        vma = find_vma(mm, address);
1191        if (unlikely(!vma)) {
1192                bad_area(regs, error_code, address);
1193                return;
1194        }
1195        if (likely(vma->vm_start <= address))
1196                goto good_area;
1197        if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
1198                bad_area(regs, error_code, address);
1199                return;
1200        }
1201        if (error_code & PF_USER) {
1202                /*
1203                 * Accessing the stack below %sp is always a bug.
1204                 * The large cushion allows instructions like enter
1205                 * and pusha to work. ("enter $65535, $31" pushes
1206                 * 32 pointers and then decrements %sp by 65535.)
1207                 */
1208                if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
1209                        bad_area(regs, error_code, address);
1210                        return;
1211                }
1212        }
1213        if (unlikely(expand_stack(vma, address))) {
1214                bad_area(regs, error_code, address);
1215                return;
1216        }
1217
1218        /*
1219         * Ok, we have a good vm_area for this memory access, so
1220         * we can handle it..
1221         */
1222good_area:
1223        if (unlikely(access_error(error_code, vma))) {
1224                bad_area_access_error(regs, error_code, address);
1225                return;
1226        }
1227
1228        /*
1229         * If for any reason at all we couldn't handle the fault,
1230         * make sure we exit gracefully rather than endlessly redo
1231         * the fault.  Since we never set FAULT_FLAG_RETRY_NOWAIT, if
1232         * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked.
1233         */
1234        fault = handle_mm_fault(mm, vma, address, flags);
1235        major |= fault & VM_FAULT_MAJOR;
1236
1237        /*
1238         * If we need to retry the mmap_sem has already been released,
1239         * and if there is a fatal signal pending there is no guarantee
1240         * that we made any progress. Handle this case first.
1241         */
1242        if (unlikely(fault & VM_FAULT_RETRY)) {
1243                /* Retry at most once */
1244                if (flags & FAULT_FLAG_ALLOW_RETRY) {
1245                        flags &= ~FAULT_FLAG_ALLOW_RETRY;
1246                        flags |= FAULT_FLAG_TRIED;
1247                        if (!fatal_signal_pending(tsk))
1248                                goto retry;
1249                }
1250
1251                /* User mode? Just return to handle the fatal exception */
1252                if (flags & FAULT_FLAG_USER)
1253                        return;
1254
1255                /* Not returning to user mode? Handle exceptions or die: */
1256                no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
1257                return;
1258        }
1259
1260        up_read(&mm->mmap_sem);
1261        if (unlikely(fault & VM_FAULT_ERROR)) {
1262                mm_fault_error(regs, error_code, address, fault);
1263                return;
1264        }
1265
1266        /*
1267         * Major/minor page fault accounting. If any of the events
1268         * returned VM_FAULT_MAJOR, we account it as a major fault.
1269         */
1270        if (major) {
1271                tsk->maj_flt++;
1272                perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
1273        } else {
1274                tsk->min_flt++;
1275                perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
1276        }
1277
1278        check_v8086_mode(regs, address, tsk);
1279}
1280NOKPROBE_SYMBOL(__do_page_fault);
1281
1282dotraplinkage void notrace
1283do_page_fault(struct pt_regs *regs, unsigned long error_code)
1284{
1285        unsigned long address = read_cr2(); /* Get the faulting address */
1286        enum ctx_state prev_state;
1287
1288        /*
1289         * We must have this function tagged with __kprobes, notrace and call
1290         * read_cr2() before calling anything else. To avoid calling any kind
1291         * of tracing machinery before we've observed the CR2 value.
1292         *
1293         * exception_{enter,exit}() contain all sorts of tracepoints.
1294         */
1295
1296        prev_state = exception_enter();
1297        __do_page_fault(regs, error_code, address);
1298        exception_exit(prev_state);
1299}
1300NOKPROBE_SYMBOL(do_page_fault);
1301
1302#ifdef CONFIG_TRACING
1303static nokprobe_inline void
1304trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
1305                         unsigned long error_code)
1306{
1307        if (user_mode(regs))
1308                trace_page_fault_user(address, regs, error_code);
1309        else
1310                trace_page_fault_kernel(address, regs, error_code);
1311}
1312
1313dotraplinkage void notrace
1314trace_do_page_fault(struct pt_regs *regs, unsigned long error_code)
1315{
1316        /*
1317         * The exception_enter and tracepoint processing could
1318         * trigger another page faults (user space callchain
1319         * reading) and destroy the original cr2 value, so read
1320         * the faulting address now.
1321         */
1322        unsigned long address = read_cr2();
1323        enum ctx_state prev_state;
1324
1325        prev_state = exception_enter();
1326        trace_page_fault_entries(address, regs, error_code);
1327        __do_page_fault(regs, error_code, address);
1328        exception_exit(prev_state);
1329}
1330NOKPROBE_SYMBOL(trace_do_page_fault);
1331#endif /* CONFIG_TRACING */
1332
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.