linux/arch/x86/mm/fault_32.c
/*
 *  linux/arch/i386/mm/fault.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>              /* For unblank_screen() */
#include <linux/highmem.h>
#include <linux/bootmem.h>              /* for max_low_pfn */
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>

#include <asm/system.h>
#include <asm/desc.h>
#include <asm/segment.h>

extern void die(const char *,struct pt_regs *,long);

#ifdef CONFIG_KPROBES
static inline int notify_page_fault(struct pt_regs *regs)
{
        int ret = 0;

        /* kprobe_running() needs smp_processor_id() */
        if (!user_mode_vm(regs)) {
                preempt_disable();
                if (kprobe_running() && kprobe_fault_handler(regs, 14))
                        ret = 1;
                preempt_enable();
        }

        return ret;
}
#else
static inline int notify_page_fault(struct pt_regs *regs)
{
        return 0;
}
#endif

/*
 * Return EIP plus the CS segment base.  The segment limit is also
 * adjusted, clamped to the kernel/user address space (whichever is
 * appropriate), and returned in *eip_limit.
 *
 * The segment is checked, because it might have been changed by another
 * task between the original faulting instruction and here.
 *
 * If CS is no longer a valid code segment, or if EIP is beyond the
 * limit, or if it is a kernel address when CS is not a kernel segment,
 * then the returned value will be greater than *eip_limit.
 *
 * This is slow, but is very rarely executed.
 */
static inline unsigned long get_segment_eip(struct pt_regs *regs,
                                            unsigned long *eip_limit)
{
        unsigned long eip = regs->eip;
        unsigned seg = regs->xcs & 0xffff;
        u32 seg_ar, seg_limit, base, *desc;

        /* Unlikely, but must come before segment checks. */
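        /* In vm86 mode the CPU forms addresses real-mode style:
           base = 16 * CS, with a fixed 64K segment limit. */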
        if (unlikely(regs->eflags & VM_MASK)) {
                base = seg << 4;
                *eip_limit = base + 0xffff;
                return base + (eip & 0xffff);
        }

        /* The standard kernel/user address space limit. */
        *eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;

        /* By far the most common cases. */
        if (likely(SEGMENT_IS_FLAT_CODE(seg)))
                return eip;

        /* Check the segment exists, is within the current LDT/GDT size,
           that kernel/user (ring 0..3) has the appropriate privilege,
           that it's a code segment, and get the limit. */
        __asm__ ("larl %3,%0; lsll %3,%1"
                 : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
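        /*
         * LAR loads the descriptor's access-rights dword, LSL its limit.
         * Mask 0x9800 covers the Present bit (bit 15), the S bit (bit 12)
         * and the executable type bit (bit 11): if any of them is clear,
         * the selector does not name a usable code segment.
         */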
        if ((~seg_ar & 0x9800) || eip > seg_limit) {
                *eip_limit = 0;
                return 1;        /* So that returned eip > *eip_limit. */
        }

        /* Get the GDT/LDT descriptor base.
           When you look for races in this code remember that
           LDT and other horrors are only used in user space. */
        if (seg & (1<<2)) {
                /* Must lock the LDT while reading it. */
                mutex_lock(&current->mm->context.lock);
                desc = current->mm->context.ldt;
                desc = (void *)desc + (seg & ~7);
        } else {
                /* Must disable preemption while reading the GDT. */
                desc = (u32 *)get_cpu_gdt_table(get_cpu());
                desc = (void *)desc + (seg & ~7);
        }

        /* Decode the code segment base from the descriptor */
        base = get_desc_base((unsigned long *)desc);

        if (seg & (1<<2)) {
                mutex_unlock(&current->mm->context.lock);
        } else
                put_cpu();

        /* Adjust EIP and segment limit, and clamp at the kernel limit.
           It's legitimate for segments to wrap at 0xffffffff. */
        seg_limit += base;
        if (seg_limit < *eip_limit && seg_limit >= base)
                *eip_limit = seg_limit;
        return eip + base;
}

/*
 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 * Check that here and ignore it.
 */
static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
{
        unsigned long limit;
        unsigned char *instr = (unsigned char *)get_segment_eip(regs, &limit);
        int scan_more = 1;
        int prefetch = 0;
        int i;

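        /*
         * Scan at most 15 bytes (the maximum x86 instruction length):
         * skip over legacy prefixes, then check whether the opcode is
         * 0F 0D (PREFETCH/PREFETCHW) or 0F 18 (PREFETCHNTA/T0/T1/T2).
         */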
        for (i = 0; scan_more && i < 15; i++) {
                unsigned char opcode;
                unsigned char instr_hi;
                unsigned char instr_lo;

                if (instr > (unsigned char *)limit)
                        break;
                if (probe_kernel_address(instr, opcode))
                        break;

                instr_hi = opcode & 0xf0;
                instr_lo = opcode & 0x0f;
                instr++;

                switch (instr_hi) {
                case 0x20:
                case 0x30:
                        /* Values 0x26, 0x2E, 0x36, 0x3E are valid x86 prefixes. */
                        scan_more = ((instr_lo & 7) == 0x6);
                        break;

                case 0x60:
                        /* 0x64 thru 0x67 are valid prefixes in all modes. */
                        scan_more = (instr_lo & 0xC) == 0x4;
                        break;
                case 0xF0:
                        /* 0xF0, 0xF2, and 0xF3 are valid prefixes */
                        scan_more = !instr_lo || (instr_lo>>1) == 1;
                        break;
                case 0x00:
                        /* Prefetch instruction is 0x0F0D or 0x0F18 */
                        scan_more = 0;
                        if (instr > (unsigned char *)limit)
                                break;
                        if (probe_kernel_address(instr, opcode))
                                break;
                        prefetch = (instr_lo == 0xF) &&
                                (opcode == 0x0D || opcode == 0x18);
                        break;
                default:
                        scan_more = 0;
                        break;
                }
        }
        return prefetch;
}

static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
                              unsigned long error_code)
{
        if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
                     boot_cpu_data.x86 >= 6)) {
                /* Catch an obscure case of prefetch inside an NX page. */
                if (nx_enabled && (error_code & 16))
                        return 0;
                return __is_prefetch(regs, addr);
        }
        return 0;
}

static noinline void force_sig_info_fault(int si_signo, int si_code,
        unsigned long address, struct task_struct *tsk)
{
        siginfo_t info;

        info.si_signo = si_signo;
        info.si_errno = 0;
        info.si_code = si_code;
        info.si_addr = (void __user *)address;
        force_sig_info(si_signo, &info, tsk);
}

fastcall void do_invalid_op(struct pt_regs *, unsigned long);

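/*
 * Copy the kernel pmd for 'address' from the reference page table
 * (init_mm.pgd) into the given pgd.  This is how a lazily created
 * vmalloc/module mapping becomes visible in a process page table.
 */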
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
        unsigned index = pgd_index(address);
        pgd_t *pgd_k;
        pud_t *pud, *pud_k;
        pmd_t *pmd, *pmd_k;

        pgd += index;
        pgd_k = init_mm.pgd + index;

        if (!pgd_present(*pgd_k))
                return NULL;

        /*
         * set_pgd(pgd, *pgd_k); here would be useless on PAE
         * and redundant with the set_pmd() on non-PAE. As would
         * set_pud.
         */

        pud = pud_offset(pgd, address);
        pud_k = pud_offset(pgd_k, address);
        if (!pud_present(*pud_k))
                return NULL;

        pmd = pmd_offset(pud, address);
        pmd_k = pmd_offset(pud_k, address);
        if (!pmd_present(*pmd_k))
                return NULL;
        if (!pmd_present(*pmd)) {
                set_pmd(pmd, *pmd_k);
                arch_flush_lazy_mmu_mode();
        } else
                BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
        return pmd_k;
}

/*
 * Handle a fault on the vmalloc or module mapping area
 *
 * This assumes no large pages in there.
 */
static inline int vmalloc_fault(unsigned long address)
{
        unsigned long pgd_paddr;
        pmd_t *pmd_k;
        pte_t *pte_k;
        /*
         * Synchronize this task's top level page-table
         * with the 'reference' page table.
         *
         * Do _not_ use "current" here. We might be inside
         * an interrupt in the middle of a task switch..
         */
        pgd_paddr = read_cr3();
        pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
        if (!pmd_k)
                return -1;
        pte_k = pte_offset_kernel(pmd_k, address);
        if (!pte_present(*pte_k))
                return -1;
        return 0;
}

int show_unhandled_signals = 1;

/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 *
 * error_code:
 *      bit 0 == 0 means no page found, 1 means protection fault
 *      bit 1 == 0 means read, 1 means write
 *      bit 2 == 0 means kernel, 1 means user-mode
 *      bit 3 == 1 means use of reserved bit detected
 *      bit 4 == 1 means fault was an instruction fetch
 */
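/*
 * Example: error_code == 6 (binary 110) is a user-mode write to a
 * not-present page; error_code == 5 (binary 101) is a user-mode read
 * that hit a protection fault.
 */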
fastcall void __kprobes do_page_fault(struct pt_regs *regs,
                                      unsigned long error_code)
{
        struct task_struct *tsk;
        struct mm_struct *mm;
        struct vm_area_struct * vma;
        unsigned long address;
        int write, si_code;
        int fault;

        /*
         * We can fault from pretty much anywhere, with unknown IRQ state.
         */
        trace_hardirqs_fixup();

        /* get the address */
        address = read_cr2();

        tsk = current;

        si_code = SEGV_MAPERR;

        /*
         * We fault-in kernel-space virtual memory on-demand. The
         * 'reference' page table is init_mm.pgd.
         *
         * NOTE! We MUST NOT take any locks for this case. We may
         * be in an interrupt or a critical region, and should
         * only copy the information from the master page table,
         * nothing more.
         *
         * This verifies that the fault happens in kernel space
         * (error_code & 4) == 0, and that the fault was not a
         * protection error (error_code & 9) == 0.
         */
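        /* 0xd == binary 1101: the protection-fault, user-mode and
           reserved-bit bits must all be clear for this fixup to apply. */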
        if (unlikely(address >= TASK_SIZE)) {
                if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
                        return;
                if (notify_page_fault(regs))
                        return;
                /*
                 * Don't take the mm semaphore here. If we fix up a prefetch
                 * fault we could otherwise deadlock.
                 */
                goto bad_area_nosemaphore;
        }

        if (notify_page_fault(regs))
                return;

        /* It's safe to allow IRQs after cr2 has been saved and the vmalloc
           fault has been handled. */
        if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
                local_irq_enable();

        mm = tsk->mm;

        /*
         * If we're in an interrupt, have no user context or are running in an
         * atomic region then we must not take the fault..
         */
        if (in_atomic() || !mm)
                goto bad_area_nosemaphore;

        /* When running in the kernel we expect faults to occur only to
         * addresses in user space.  All other faults represent errors in the
         * kernel and should generate an OOPS.  Unfortunately, in the case of an
         * erroneous fault occurring in a code path which already holds mmap_sem
         * we will deadlock attempting to validate the fault against the
         * address space.  Luckily the kernel only validly references user
         * space from well-defined areas of code, which are listed in the
         * exceptions table.
         *
         * As the vast majority of faults will be valid, we will only perform
         * the source reference check when there is a possibility of a deadlock.
         * Attempt to lock the address space; if we cannot, we then validate the
         * source.  If this is invalid we can skip the address space check,
         * thus avoiding the deadlock.
         */
        if (!down_read_trylock(&mm->mmap_sem)) {
                if ((error_code & 4) == 0 &&
                    !search_exception_tables(regs->eip))
                        goto bad_area_nosemaphore;
                down_read(&mm->mmap_sem);
        }

        vma = find_vma(mm, address);
        if (!vma)
                goto bad_area;
        if (vma->vm_start <= address)
                goto good_area;
        if (!(vma->vm_flags & VM_GROWSDOWN))
                goto bad_area;
        if (error_code & 4) {
                /*
                 * Accessing the stack below %esp is always a bug.
                 * The large cushion allows instructions like enter
                 * and pusha to work.  ("enter $65535,$31" pushes
                 * 32 pointers and then decrements %esp by 65535.)
                 */
                if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp)
                        goto bad_area;
        }
        if (expand_stack(vma, address))
                goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
        si_code = SEGV_ACCERR;
        write = 0;
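        /* Only the write (bit 1) and present (bit 0) bits of error_code
           matter for the access check below. */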
        switch (error_code & 3) {
                default:        /* 3: write, present */
                                /* fall through */
                case 2:         /* write, not present */
                        if (!(vma->vm_flags & VM_WRITE))
                                goto bad_area;
                        write++;
                        break;
                case 1:         /* read, present */
                        goto bad_area;
                case 0:         /* read, not present */
                        if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
                                goto bad_area;
        }

 survive:
        /*
         * If for any reason at all we couldn't handle the fault,
         * make sure we exit gracefully rather than endlessly redo
         * the fault.
         */
        fault = handle_mm_fault(mm, vma, address, write);
        if (unlikely(fault & VM_FAULT_ERROR)) {
                if (fault & VM_FAULT_OOM)
                        goto out_of_memory;
                else if (fault & VM_FAULT_SIGBUS)
                        goto do_sigbus;
                BUG();
        }
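        /* A major fault needed I/O (e.g. reading the page from disk);
           a minor fault was satisfied without it. */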
        if (fault & VM_FAULT_MAJOR)
                tsk->maj_flt++;
        else
                tsk->min_flt++;

        /*
         * Did it hit the DOS screen memory VA from vm86 mode?
         */
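        /* One bit per page of the legacy VGA window starting at 0xA0000;
           vm86 uses the bitmap to track which screen pages were touched. */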
        if (regs->eflags & VM_MASK) {
                unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
                if (bit < 32)
                        tsk->thread.screen_bitmap |= 1 << bit;
        }
        up_read(&mm->mmap_sem);
        return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
        up_read(&mm->mmap_sem);

bad_area_nosemaphore:
        /* User mode accesses just cause a SIGSEGV */
        if (error_code & 4) {
                /*
                 * It's possible to have interrupts off here.
                 */
                local_irq_enable();

                /*
                 * Valid to do another page fault here because this one came
                 * from user space.
                 */
                if (is_prefetch(regs, address, error_code))
                        return;

                if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
                    printk_ratelimit()) {
                        printk("%s%s[%d]: segfault at %08lx eip %08lx "
                            "esp %08lx error %lx\n",
                            task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
                            tsk->comm, task_pid_nr(tsk), address, regs->eip,
                            regs->esp, error_code);
                }
                tsk->thread.cr2 = address;
                /* Kernel addresses are always protection faults */
                tsk->thread.error_code = error_code | (address >= TASK_SIZE);
                tsk->thread.trap_no = 14;
                force_sig_info_fault(SIGSEGV, si_code, address, tsk);
                return;
        }

#ifdef CONFIG_X86_F00F_BUG
        /*
         * Pentium F0 0F C7 C8 bug workaround.
         */
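        /*
         * The workaround maps the IDT through a read-only page, so the
         * erratum produces a page fault on the IDT instead of locking up
         * the CPU.  IDT entries are 8 bytes; entry 6 is the invalid-opcode
         * vector, so hand the fault to do_invalid_op() instead.
         */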
        if (boot_cpu_data.f00f_bug) {
                unsigned long nr;

                nr = (address - idt_descr.address) >> 3;

                if (nr == 6) {
                        do_invalid_op(regs, 0);
                        return;
                }
        }
#endif

no_context:
        /* Are we prepared to handle this kernel fault?  */
        if (fixup_exception(regs))
                return;

        /*
         * Valid to do another page fault here, because if this fault
         * had been triggered by is_prefetch, fixup_exception would have
         * handled it.
         */
        if (is_prefetch(regs, address, error_code))
                return;

/*
 * Oops. The kernel tried to access some bad page. We'll have to
 * terminate things with extreme prejudice.
 */

        bust_spinlocks(1);

        if (oops_may_print()) {
                __typeof__(pte_val(__pte(0))) page;

#ifdef CONFIG_X86_PAE
                if (error_code & 16) {
                        pte_t *pte = lookup_address(address);

                        if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
                                printk(KERN_CRIT "kernel tried to execute "
                                        "NX-protected page - exploit attempt? "
                                        "(uid: %d)\n", current->uid);
                }
#endif
                if (address < PAGE_SIZE)
                        printk(KERN_ALERT "BUG: unable to handle kernel NULL "
                                        "pointer dereference");
                else
                        printk(KERN_ALERT "BUG: unable to handle kernel paging"
                                        " request");
                printk(" at virtual address %08lx\n", address);
                printk(KERN_ALERT "printing eip: %08lx ", regs->eip);

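                /* Dump the page-table entries that map the faulting address
                   by hand: the pgd/pdpt entry, then the pde, then (when it is
                   safe to read) the pte. */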
                page = read_cr3();
                page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
#ifdef CONFIG_X86_PAE
                printk("*pdpt = %016Lx ", page);
                if ((page >> PAGE_SHIFT) < max_low_pfn
                    && page & _PAGE_PRESENT) {
                        page &= PAGE_MASK;
                        page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
                                                                 & (PTRS_PER_PMD - 1)];
                        printk(KERN_CONT "*pde = %016Lx ", page);
                        page &= ~_PAGE_NX;
                }
#else
                printk("*pde = %08lx ", page);
#endif

                /*
                 * We must not directly access the pte in the highpte
                 * case if the page table is located in highmem.
                 * And let's rather not kmap-atomic the pte, just in case
                 * it's allocated already.
                 */
                if ((page >> PAGE_SHIFT) < max_low_pfn
                    && (page & _PAGE_PRESENT)
                    && !(page & _PAGE_PSE)) {
                        page &= PAGE_MASK;
                        page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
                                                                 & (PTRS_PER_PTE - 1)];
                        printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
                }

                printk("\n");
        }

        tsk->thread.cr2 = address;
        tsk->thread.trap_no = 14;
        tsk->thread.error_code = error_code;
        die("Oops", regs, error_code);
        bust_spinlocks(0);
        do_exit(SIGKILL);

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
        up_read(&mm->mmap_sem);
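        /* init must never be killed for running out of memory:
           give up the CPU, retake mmap_sem and retry the fault. */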
        if (is_global_init(tsk)) {
                yield();
                down_read(&mm->mmap_sem);
                goto survive;
        }
        printk("VM: killing process %s\n", tsk->comm);
        if (error_code & 4)
                do_group_exit(SIGKILL);
        goto no_context;

do_sigbus:
        up_read(&mm->mmap_sem);

        /* Kernel mode? Handle exceptions or die */
        if (!(error_code & 4))
                goto no_context;

        /* User space => ok to do another page fault */
        if (is_prefetch(regs, address, error_code))
                return;

        tsk->thread.cr2 = address;
        tsk->thread.error_code = error_code;
        tsk->thread.trap_no = 14;
        force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
}

void vmalloc_sync_all(void)
{
        /*
         * Note that races in the updates of insync and start aren't
         * problematic: insync can only get set bits added, and updates to
         * start are only improving performance (without affecting correctness
         * if undone).
         */
        static DECLARE_BITMAP(insync, PTRS_PER_PGD);
        static unsigned long start = TASK_SIZE;
        unsigned long address;

        if (SHARED_KERNEL_PMD)
                return;

        BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
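        /*
         * For every kernel pgd slot not yet marked in insync, walk all
         * page tables on pgd_list and copy the entry from init_mm; the
         * slot is only marked once every pgd on the list had it present.
         */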
        for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
                if (!test_bit(pgd_index(address), insync)) {
                        unsigned long flags;
                        struct page *page;

                        spin_lock_irqsave(&pgd_lock, flags);
                        for (page = pgd_list; page; page =
                                        (struct page *)page->index)
                                if (!vmalloc_sync_one(page_address(page),
                                                                address)) {
                                        BUG_ON(page != pgd_list);
                                        break;
                                }
                        spin_unlock_irqrestore(&pgd_lock, flags);
                        if (!page)
                                set_bit(pgd_index(address), insync);
                }
                if (address == start && test_bit(pgd_index(address), insync))
                        start = address + PGDIR_SIZE;
        }
}