linux/arch/x86/kernel/process_64.c
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *      Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *      Andi Kleen.
 *
 *      CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>

asmlinkage extern void ret_from_fork(void);

DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
        atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
        atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
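
/*
 * Illustrative sketch, not part of the original file: a driver that wants
 * to be told when this CPU enters or leaves the idle loop would hook the
 * chain registered above roughly as below.  The function and variable
 * names are made up for the example.
 */
#if 0
static int example_idle_notify(struct notifier_block *nb,
                               unsigned long action, void *data)
{
        if (action == IDLE_START)
                pr_debug("cpu %d entering idle\n", smp_processor_id());
        else if (action == IDLE_END)
                pr_debug("cpu %d leaving idle\n", smp_processor_id());
        return NOTIFY_OK;
}

static struct notifier_block example_idle_nb = {
        .notifier_call = example_idle_notify,
};

static int __init example_idle_hook_init(void)
{
        idle_notifier_register(&example_idle_nb);
        return 0;
}
#endif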

void enter_idle(void)
{
        percpu_write(is_idle, 1);
        atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
        if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
                return;
        atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
        /* idle loop has pid 0 */
        if (current->pid)
                return;
        __exit_idle();
}

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
        BUG();
}
#endif

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (i.e. sit in a loop waiting for
 * somebody to say that they'd like to reschedule).
 */
void cpu_idle(void)
{
        current_thread_info()->status |= TS_POLLING;

        /*
         * If we're the non-boot CPU, nothing set the stack canary up
         * for us.  CPU0 already has it initialized but no harm in
         * doing it again.  This is a good place for updating it, as
         * we won't ever return from this function (so the invalid
         * canaries already on the stack won't ever trigger).
         */
        boot_init_stack_canary();

        /* endless idle loop with no priority at all */
        while (1) {
                tick_nohz_stop_sched_tick(1);
                while (!need_resched()) {

                        rmb();

                        if (cpu_is_offline(smp_processor_id()))
                                play_dead();
                        /*
                         * Idle routines should keep interrupts disabled
                         * from here on, until they go to idle.
                         * Otherwise, idle callbacks can misfire.
                         */
                        local_irq_disable();
                        enter_idle();
                        /* Don't trace irqs off for idle */
                        stop_critical_timings();
                        pm_idle();
                        start_critical_timings();

                        /* In many cases the interrupt that ended idle
                           has already called exit_idle. But some idle
                           loops can be woken up without interrupt. */
                        __exit_idle();
                }

                tick_nohz_restart_sched_tick();
                preempt_enable_no_resched();
                schedule();
                preempt_disable();
        }
}
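
/*
 * Illustrative sketch, not part of the original file: pm_idle above is a
 * function pointer that platform code can repoint.  Because cpu_idle()
 * calls it with interrupts disabled, a minimal routine has to re-enable
 * them and halt atomically, along the lines of the stock default_idle():
 */
#if 0
static void example_idle(void)
{
        current_thread_info()->status &= ~TS_POLLING;
        /* make the need_resched() test visible before going to sleep */
        smp_mb();
        if (!need_resched())
                safe_halt();            /* sti; hlt - wakes on the next interrupt */
        else
                local_irq_enable();
        current_thread_info()->status |= TS_POLLING;
}
#endif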

/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
        unsigned long d0, d1, d2, d3, d6, d7;
        unsigned int fsindex, gsindex;
        unsigned int ds, cs, es;

        show_regs_common();
        printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
        printk_address(regs->ip, 1);
        printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
                        regs->sp, regs->flags);
        printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
               regs->ax, regs->bx, regs->cx);
        printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
               regs->dx, regs->si, regs->di);
        printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
               regs->bp, regs->r8, regs->r9);
        printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
               regs->r10, regs->r11, regs->r12);
        printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
               regs->r13, regs->r14, regs->r15);

        asm("movl %%ds,%0" : "=r" (ds));
        asm("movl %%cs,%0" : "=r" (cs));
        asm("movl %%es,%0" : "=r" (es));
        asm("movl %%fs,%0" : "=r" (fsindex));
        asm("movl %%gs,%0" : "=r" (gsindex));

        rdmsrl(MSR_FS_BASE, fs);
        rdmsrl(MSR_GS_BASE, gs);
        rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

        if (!all)
                return;

        cr0 = read_cr0();
        cr2 = read_cr2();
        cr3 = read_cr3();
        cr4 = read_cr4();

        printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
               fs, fsindex, gs, gsindex, shadowgs);
        printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
                        es, cr0);
        printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
                        cr4);

        get_debugreg(d0, 0);
        get_debugreg(d1, 1);
        get_debugreg(d2, 2);
        printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
        get_debugreg(d3, 3);
        get_debugreg(d6, 6);
        get_debugreg(d7, 7);
        printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

void release_thread(struct task_struct *dead_task)
{
        if (dead_task->mm) {
                if (dead_task->mm->context.size) {
                        printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
                                        dead_task->comm,
                                        dead_task->mm->context.ldt,
                                        dead_task->mm->context.size);
                        BUG();
                }
        }
}

static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
        struct user_desc ud = {
                .base_addr = addr,
                .limit = 0xfffff,
                .seg_32bit = 1,
                .limit_in_pages = 1,
                .useable = 1,
        };
        struct desc_struct *desc = t->thread.tls_array;
        desc += tls;
        fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
        return get_desc_base(&t->thread.tls_array[tls]);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
        unlazy_fpu(tsk);
}

int copy_thread(unsigned long clone_flags, unsigned long sp,
                unsigned long unused,
        struct task_struct *p, struct pt_regs *regs)
{
        int err;
        struct pt_regs *childregs;
        struct task_struct *me = current;

        childregs = ((struct pt_regs *)
                        (THREAD_SIZE + task_stack_page(p))) - 1;
        *childregs = *regs;

        childregs->ax = 0;
        if (user_mode(regs))
                childregs->sp = sp;
        else
                childregs->sp = (unsigned long)childregs;

        p->thread.sp = (unsigned long) childregs;
        p->thread.sp0 = (unsigned long) (childregs+1);
        p->thread.usersp = me->thread.usersp;

        set_tsk_thread_flag(p, TIF_FORK);

        p->thread.io_bitmap_ptr = NULL;

        savesegment(gs, p->thread.gsindex);
        p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
        savesegment(fs, p->thread.fsindex);
        p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
        savesegment(es, p->thread.es);
        savesegment(ds, p->thread.ds);

        err = -ENOMEM;
        memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

        if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
                p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
                if (!p->thread.io_bitmap_ptr) {
                        p->thread.io_bitmap_max = 0;
                        return -ENOMEM;
                }
                memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
                                IO_BITMAP_BYTES);
                set_tsk_thread_flag(p, TIF_IO_BITMAP);
        }

        /*
         * Set a new TLS for the child thread?
         */
        if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
                if (test_thread_flag(TIF_IA32))
                        err = do_set_thread_area(p, -1,
                                (struct user_desc __user *)childregs->si, 0);
                else
#endif
                        err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
                if (err)
                        goto out;
        }
        err = 0;
out:
        if (err && p->thread.io_bitmap_ptr) {
                kfree(p->thread.io_bitmap_ptr);
                p->thread.io_bitmap_max = 0;
        }

        return err;
}
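
/*
 * Illustrative sketch, not part of the original file: the CLONE_SETTLS
 * branch above fishes the new TLS value out of the child's register image
 * because the raw clone(2) ABI passes it as the fifth syscall argument
 * (%r8 for native 64-bit callers, a struct user_desc pointer in %esi for
 * 32-bit callers).  A 64-bit user-space caller would reach it roughly like
 * this; the wrapper name is made up.
 */
#if 0
#define _GNU_SOURCE
#include <sched.h>
#include <sys/syscall.h>
#include <unistd.h>

static long example_clone_with_tls(unsigned long flags, void *child_stack,
                                   int *ptid, int *ctid, unsigned long tls)
{
        /* raw x86-64 argument order: flags, new sp, parent tid, child tid, tls */
        return syscall(SYS_clone, flags | CLONE_SETTLS, child_stack, ptid, ctid, tls);
}
#endif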

static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
                    unsigned long new_sp,
                    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
        loadsegment(fs, 0);
        loadsegment(es, _ds);
        loadsegment(ds, _ds);
        load_gs_index(0);
        regs->ip                = new_ip;
        regs->sp                = new_sp;
        percpu_write(old_rsp, new_sp);
        regs->cs                = _cs;
        regs->ss                = _ss;
        regs->flags             = X86_EFLAGS_IF;
        set_fs(USER_DS);
        /*
         * Free the old FP and other extended state
         */
        free_thread_xstate(current);
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
        start_thread_common(regs, new_ip, new_sp,
                            __USER_CS, __USER_DS, 0);
}
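
/*
 * Illustrative note, not part of the original file: the binfmt loaders call
 * start_thread() at the tail end of exec to aim the first return to user
 * space at the new image, e.g. load_elf_binary() in fs/binfmt_elf.c does
 * roughly
 *
 *      start_thread(regs, elf_entry, bprm->p);
 *
 * with elf_entry being the program (or interpreter) entry point and bprm->p
 * the freshly built user stack pointer.
 */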

#ifdef CONFIG_IA32_EMULATION
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
        start_thread_common(regs, new_ip, new_sp,
                            __USER32_CS, __USER32_DS, __USER32_DS);
}
#endif

/*
 *      switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * Function graph tracer not supported either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
        struct thread_struct *prev = &prev_p->thread;
        struct thread_struct *next = &next_p->thread;
        int cpu = smp_processor_id();
        struct tss_struct *tss = &per_cpu(init_tss, cpu);
        unsigned fsindex, gsindex;
        bool preload_fpu;

        /*
         * If the task has used fpu the last 5 timeslices, just do a full
         * restore of the math state immediately to avoid the trap; the
         * chances of needing FPU soon are obviously high now
         */
        preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;

        /* we're going to use this soon, after a few expensive things */
        if (preload_fpu)
                prefetch(next->fpu.state);

        /*
         * Reload esp0, LDT and the page table pointer:
         */
        load_sp0(tss, next);

        /*
         * Switch DS and ES.
         * This won't pick up thread selector changes, but I guess that is ok.
         */
        savesegment(es, prev->es);
        if (unlikely(next->es | prev->es))
                loadsegment(es, next->es);

        savesegment(ds, prev->ds);
        if (unlikely(next->ds | prev->ds))
                loadsegment(ds, next->ds);


        /* We must save %fs and %gs before load_TLS() because
         * %fs and %gs may be cleared by load_TLS().
         *
         * (e.g. xen_load_tls())
         */
        savesegment(fs, fsindex);
        savesegment(gs, gsindex);

        load_TLS(next, cpu);

        /* Must be after DS reload */
        __unlazy_fpu(prev_p);

        /* Make sure cpu is ready for new context */
        if (preload_fpu)
                clts();

        /*
         * Leave lazy mode, flushing any hypercalls made here.
         * This must be done before restoring TLS segments so
         * the GDT and LDT are properly updated, and must be
         * done before math_state_restore, so the TS bit is up
         * to date.
         */
        arch_end_context_switch(next_p);

        /*
         * Switch FS and GS.
         *
         * Segment register != 0 always requires a reload.  Also
         * reload when it has changed.  When prev process used 64bit
         * base always reload to avoid an information leak.
         */
        if (unlikely(fsindex | next->fsindex | prev->fs)) {
                loadsegment(fs, next->fsindex);
                /*
                 * Check if the user used a selector != 0; if yes
                 *  clear 64bit base, since overloaded base is always
                 *  mapped to the Null selector
                 */
                if (fsindex)
                        prev->fs = 0;
        }
        /* when next process has a 64bit base use it */
        if (next->fs)
                wrmsrl(MSR_FS_BASE, next->fs);
        prev->fsindex = fsindex;

        if (unlikely(gsindex | next->gsindex | prev->gs)) {
                load_gs_index(next->gsindex);
                if (gsindex)
                        prev->gs = 0;
        }
        if (next->gs)
                wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
        prev->gsindex = gsindex;

        /*
         * Switch the PDA and FPU contexts.
         */
        prev->usersp = percpu_read(old_rsp);
        percpu_write(old_rsp, next->usersp);
        percpu_write(current_task, next_p);

        percpu_write(kernel_stack,
                  (unsigned long)task_stack_page(next_p) +
                  THREAD_SIZE - KERNEL_STACK_OFFSET);

        /*
         * Now maybe reload the debug registers and handle I/O bitmaps
         */
        if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
                     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
                __switch_to_xtra(prev_p, next_p, tss);

        /*
         * Preload the FPU context, now that we've determined that the
         * task is likely to be using it.
         */
        if (preload_fpu)
                __math_state_restore();

        return prev_p;
}

void set_personality_64bit(void)
{
        /* inherit personality from parent */

        /* Make sure to be in 64bit mode */
        clear_thread_flag(TIF_IA32);

        /* Ensure the corresponding mm is not marked. */
        if (current->mm)
                current->mm->context.ia32_compat = 0;

        /* TBD: overwrites user setup. Should have two bits.
           But 64bit processes have always behaved this way,
           so it's not too bad. The main problem is just that
           32bit children are affected again. */
        current->personality &= ~READ_IMPLIES_EXEC;
}

void set_personality_ia32(void)
{
        /* inherit personality from parent */

        /* Make sure to be in 32bit mode */
        set_thread_flag(TIF_IA32);
        current->personality |= force_personality32;

        /* Mark the associated mm as containing 32-bit tasks. */
        if (current->mm)
                current->mm->context.ia32_compat = 1;

        /* Prepare the first "return" to user space */
        current_thread_info()->status |= TS_COMPAT;
}

unsigned long get_wchan(struct task_struct *p)
{
        unsigned long stack;
        u64 fp, ip;
        int count = 0;

        if (!p || p == current || p->state == TASK_RUNNING)
                return 0;
        stack = (unsigned long)task_stack_page(p);
        if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
                return 0;
        fp = *(u64 *)(p->thread.sp);
        do {
                if (fp < (unsigned long)stack ||
                    fp >= (unsigned long)stack+THREAD_SIZE)
                        return 0;
                ip = *(u64 *)(fp+8);
                if (!in_sched_functions(ip))
                        return ip;
                fp = *(u64 *)fp;
        } while (count++ < 16);
        return 0;
}
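
/*
 * Illustrative note, not part of the original file: get_wchan() is what
 * backs the "wchan" value exported to user space, e.g. something like
 *
 *      $ cat /proc/$(pidof -s sleep)/wchan
 *      hrtimer_nanosleep
 *
 * The loop above just follows saved frame pointers on the sleeping task's
 * stack until it finds a return address outside the scheduler (the
 * in_sched_functions() test), giving up after 16 frames.
 */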

long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
        int ret = 0;
        int doit = task == current;
        int cpu;

        switch (code) {
        case ARCH_SET_GS:
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /* handle small bases via the GDT because that's faster to
                   switch. */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, GS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                load_gs_index(GS_TLS_SEL);
                        }
                        task->thread.gsindex = GS_TLS_SEL;
                        task->thread.gs = 0;
                } else {
                        task->thread.gsindex = 0;
                        task->thread.gs = addr;
                        if (doit) {
                                load_gs_index(0);
                                ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_SET_FS:
                /* Not strictly needed for fs, but do it for symmetry
                   with gs */
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /* handle small bases via the GDT because that's faster to
                   switch. */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, FS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                loadsegment(fs, FS_TLS_SEL);
                        }
                        task->thread.fsindex = FS_TLS_SEL;
                        task->thread.fs = 0;
                } else {
                        task->thread.fsindex = 0;
                        task->thread.fs = addr;
                        if (doit) {
                                /* set the selector to 0 to not confuse
                                   __switch_to */
                                loadsegment(fs, 0);
                                ret = checking_wrmsrl(MSR_FS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_GET_FS: {
                unsigned long base;
                if (task->thread.fsindex == FS_TLS_SEL)
                        base = read_32bit_tls(task, FS_TLS);
                else if (doit)
                        rdmsrl(MSR_FS_BASE, base);
                else
                        base = task->thread.fs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }
        case ARCH_GET_GS: {
                unsigned long base;
                unsigned gsindex;
                if (task->thread.gsindex == GS_TLS_SEL)
                        base = read_32bit_tls(task, GS_TLS);
                else if (doit) {
                        savesegment(gs, gsindex);
                        if (gsindex)
                                rdmsrl(MSR_KERNEL_GS_BASE, base);
                        else
                                base = task->thread.gs;
                } else
                        base = task->thread.gs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }

        default:
                ret = -EINVAL;
                break;
        }

        return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
        return do_arch_prctl(current, code, addr);
}
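
/*
 * Illustrative sketch, not part of the original file: from user space the
 * usual way into do_arch_prctl() is the arch_prctl(2) syscall, e.g. a
 * threading library pointing %fs at its thread control block.  The wrapper
 * names below are made up for the example.
 */
#if 0
#include <asm/prctl.h>
#include <sys/syscall.h>
#include <unistd.h>

static int example_set_fs_base(void *tcb)
{
        /* bases below 4GB land in the GDT, larger ones go to MSR_FS_BASE */
        return syscall(SYS_arch_prctl, ARCH_SET_FS, (unsigned long)tcb);
}

static int example_get_fs_base(unsigned long *base)
{
        /* ARCH_GET_FS writes the current base through the pointer */
        return syscall(SYS_arch_prctl, ARCH_GET_FS, base);
}
#endif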

unsigned long KSTK_ESP(struct task_struct *task)
{
        return (test_tsk_thread_flag(task, TIF_IA32)) ?
                        (task_pt_regs(task)->sp) : ((task)->thread.usersp);
}
