/* linux/arch/x86/kernel/process_64.c */
   1/*
   2 *  Copyright (C) 1995  Linus Torvalds
   3 *
   4 *  Pentium III FXSR, SSE support
   5 *      Gareth Hughes <gareth@valinux.com>, May 2000
   6 *
   7 *  X86-64 port
   8 *      Andi Kleen.
   9 *
  10 *      CPU hotplug support - ashok.raj@intel.com
  11 */
  12
  13/*
  14 * This file handles the architecture-dependent parts of process handling..
  15 */
  16
  17#include <linux/cpu.h>
  18#include <linux/errno.h>
  19#include <linux/sched.h>
  20#include <linux/fs.h>
  21#include <linux/kernel.h>
  22#include <linux/mm.h>
  23#include <linux/elfcore.h>
  24#include <linux/smp.h>
  25#include <linux/slab.h>
  26#include <linux/user.h>
  27#include <linux/interrupt.h>
  28#include <linux/delay.h>
  29#include <linux/module.h>
  30#include <linux/ptrace.h>
  31#include <linux/notifier.h>
  32#include <linux/kprobes.h>
  33#include <linux/kdebug.h>
  34#include <linux/prctl.h>
  35#include <linux/uaccess.h>
  36#include <linux/io.h>
  37#include <linux/ftrace.h>
  38
  39#include <asm/pgtable.h>
  40#include <asm/processor.h>
  41#include <asm/i387.h>
  42#include <asm/fpu-internal.h>
  43#include <asm/mmu_context.h>
  44#include <asm/prctl.h>
  45#include <asm/desc.h>
  46#include <asm/proto.h>
  47#include <asm/ia32.h>
  48#include <asm/idle.h>
  49#include <asm/syscalls.h>
  50#include <asm/debugreg.h>
  51#include <asm/switch_to.h>
  52
  53asmlinkage extern void ret_from_fork(void);
  54
  55DEFINE_PER_CPU(unsigned long, old_rsp);
  56
/*
 * Prints also some state that isn't saved in the pt_regs:
 * the data segment selectors, the FS/GS base MSRs and, when
 * @all is non-zero, the control and debug registers too.
 */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	show_regs_common();
	printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	/* Segment selectors are not saved in pt_regs; read them off the CPU. */
	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	/* On x86-64 the FS/GS bases live in MSRs, not in the selectors. */
	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	/* Without @all, stop before the control/debug register dump. */
	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}
 115
 116void release_thread(struct task_struct *dead_task)
 117{
 118        if (dead_task->mm) {
 119                if (dead_task->mm->context.size) {
 120                        pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
 121                                dead_task->comm,
 122                                dead_task->mm->context.ldt,
 123                                dead_task->mm->context.size);
 124                        BUG();
 125                }
 126        }
 127}
 128
 129static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
 130{
 131        struct user_desc ud = {
 132                .base_addr = addr,
 133                .limit = 0xfffff,
 134                .seg_32bit = 1,
 135                .limit_in_pages = 1,
 136                .useable = 1,
 137        };
 138        struct desc_struct *desc = t->thread.tls_array;
 139        desc += tls;
 140        fill_ldt(desc, &ud);
 141}
 142
 143static inline u32 read_32bit_tls(struct task_struct *t, int tls)
 144{
 145        return get_desc_base(&t->thread.tls_array[tls]);
 146}
 147
 148int copy_thread(unsigned long clone_flags, unsigned long sp,
 149                unsigned long arg, struct task_struct *p)
 150{
 151        int err;
 152        struct pt_regs *childregs;
 153        struct task_struct *me = current;
 154
 155        p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
 156        childregs = task_pt_regs(p);
 157        p->thread.sp = (unsigned long) childregs;
 158        p->thread.usersp = me->thread.usersp;
 159        set_tsk_thread_flag(p, TIF_FORK);
 160        p->fpu_counter = 0;
 161        p->thread.io_bitmap_ptr = NULL;
 162
 163        savesegment(gs, p->thread.gsindex);
 164        p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
 165        savesegment(fs, p->thread.fsindex);
 166        p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
 167        savesegment(es, p->thread.es);
 168        savesegment(ds, p->thread.ds);
 169        memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
 170
 171        if (unlikely(p->flags & PF_KTHREAD)) {
 172                /* kernel thread */
 173                memset(childregs, 0, sizeof(struct pt_regs));
 174                childregs->sp = (unsigned long)childregs;
 175                childregs->ss = __KERNEL_DS;
 176                childregs->bx = sp; /* function */
 177                childregs->bp = arg;
 178                childregs->orig_ax = -1;
 179                childregs->cs = __KERNEL_CS | get_kernel_rpl();
 180                childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1;
 181                return 0;
 182        }
 183        *childregs = *current_pt_regs();
 184
 185        childregs->ax = 0;
 186        if (sp)
 187                childregs->sp = sp;
 188
 189        err = -ENOMEM;
 190        memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
 191
 192        if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
 193                p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
 194                                                  IO_BITMAP_BYTES, GFP_KERNEL);
 195                if (!p->thread.io_bitmap_ptr) {
 196                        p->thread.io_bitmap_max = 0;
 197                        return -ENOMEM;
 198                }
 199                set_tsk_thread_flag(p, TIF_IO_BITMAP);
 200        }
 201
 202        /*
 203         * Set a new TLS for the child thread?
 204         */
 205        if (clone_flags & CLONE_SETTLS) {
 206#ifdef CONFIG_IA32_EMULATION
 207                if (test_thread_flag(TIF_IA32))
 208                        err = do_set_thread_area(p, -1,
 209                                (struct user_desc __user *)childregs->si, 0);
 210                else
 211#endif
 212                        err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
 213                if (err)
 214                        goto out;
 215        }
 216        err = 0;
 217out:
 218        if (err && p->thread.io_bitmap_ptr) {
 219                kfree(p->thread.io_bitmap_ptr);
 220                p->thread.io_bitmap_max = 0;
 221        }
 222
 223        return err;
 224}
 225
 226static void
 227start_thread_common(struct pt_regs *regs, unsigned long new_ip,
 228                    unsigned long new_sp,
 229                    unsigned int _cs, unsigned int _ss, unsigned int _ds)
 230{
 231        loadsegment(fs, 0);
 232        loadsegment(es, _ds);
 233        loadsegment(ds, _ds);
 234        load_gs_index(0);
 235        current->thread.usersp  = new_sp;
 236        regs->ip                = new_ip;
 237        regs->sp                = new_sp;
 238        this_cpu_write(old_rsp, new_sp);
 239        regs->cs                = _cs;
 240        regs->ss                = _ss;
 241        regs->flags             = X86_EFLAGS_IF;
 242}
 243
/*
 * Set up @regs so the task enters 64-bit user mode at @new_ip with
 * stack @new_sp, using the flat user code/data segments (DS/ES are
 * left at the null selector).
 */
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER_CS, __USER_DS, 0);
}
 250
 251#ifdef CONFIG_IA32_EMULATION
 252void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
 253{
 254        start_thread_common(regs, new_ip, new_sp,
 255                            test_thread_flag(TIF_X32)
 256                            ? __USER_CS : __USER32_CS,
 257                            __USER_DS, __USER_DS);
 258}
 259#endif
 260
/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * Switches the per-thread CPU state (FPU, segment registers, FS/GS
 * bases, TLS, per-cpu stack bookkeeping) from @prev_p to @next_p and
 * returns @prev_p.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * Function graph tracer not supported too.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;
	fpu_switch_t fpu;

	fpu = switch_fpu_prepare(prev_p, next_p, cpu);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);


	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_end_context_switch(next_p);

	/*
	 * Switch FS and GS.
	 *
	 * Segment register != 0 always requires a reload.  Also
	 * reload when it has changed.  When prev process used 64bit
	 * base always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes
		 *  clear 64bit base, since overloaded base is always
		 *  mapped to the Null selector
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	/* Same dance for GS; its user base lives in MSR_KERNEL_GS_BASE. */
	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	switch_fpu_finish(next_p, fpu);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = this_cpu_read(old_rsp);
	this_cpu_write(old_rsp, next->usersp);
	this_cpu_write(current_task, next_p);

	this_cpu_write(kernel_stack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	return prev_p;
}
 373
 374void set_personality_64bit(void)
 375{
 376        /* inherit personality from parent */
 377
 378        /* Make sure to be in 64bit mode */
 379        clear_thread_flag(TIF_IA32);
 380        clear_thread_flag(TIF_ADDR32);
 381        clear_thread_flag(TIF_X32);
 382
 383        /* Ensure the corresponding mm is not marked. */
 384        if (current->mm)
 385                current->mm->context.ia32_compat = 0;
 386
 387        /* TBD: overwrites user setup. Should have two bits.
 388           But 64bit processes have always behaved this way,
 389           so it's not too bad. The main problem is just that
 390           32bit childs are affected again. */
 391        current->personality &= ~READ_IMPLIES_EXEC;
 392}
 393
 394void set_personality_ia32(bool x32)
 395{
 396        /* inherit personality from parent */
 397
 398        /* Make sure to be in 32bit mode */
 399        set_thread_flag(TIF_ADDR32);
 400
 401        /* Mark the associated mm as containing 32-bit tasks. */
 402        if (current->mm)
 403                current->mm->context.ia32_compat = 1;
 404
 405        if (x32) {
 406                clear_thread_flag(TIF_IA32);
 407                set_thread_flag(TIF_X32);
 408                current->personality &= ~READ_IMPLIES_EXEC;
 409                /* is_compat_task() uses the presence of the x32
 410                   syscall bit flag to determine compat status */
 411                current_thread_info()->status &= ~TS_COMPAT;
 412        } else {
 413                set_thread_flag(TIF_IA32);
 414                clear_thread_flag(TIF_X32);
 415                current->personality |= force_personality32;
 416                /* Prepare the first "return" to user space */
 417                current_thread_info()->status |= TS_COMPAT;
 418        }
 419}
 420EXPORT_SYMBOL_GPL(set_personality_ia32);
 421
 422unsigned long get_wchan(struct task_struct *p)
 423{
 424        unsigned long stack;
 425        u64 fp, ip;
 426        int count = 0;
 427
 428        if (!p || p == current || p->state == TASK_RUNNING)
 429                return 0;
 430        stack = (unsigned long)task_stack_page(p);
 431        if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
 432                return 0;
 433        fp = *(u64 *)(p->thread.sp);
 434        do {
 435                if (fp < (unsigned long)stack ||
 436                    fp >= (unsigned long)stack+THREAD_SIZE)
 437                        return 0;
 438                ip = *(u64 *)(fp+8);
 439                if (!in_sched_functions(ip))
 440                        return ip;
 441                fp = *(u64 *)fp;
 442        } while (count++ < 16);
 443        return 0;
 444}
 445
/*
 * Get or set the 64-bit FS/GS base of @task.
 * @task: task to operate on (current for the syscall path)
 * @code: ARCH_SET_FS/ARCH_SET_GS install @addr as the new base;
 *        ARCH_GET_FS/ARCH_GET_GS store the base to the user pointer @addr.
 * @addr: new base value (SET) or user destination pointer (GET)
 *
 * Returns 0 on success or a negative errno.
 */
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	/* Only touch live CPU state when operating on the current task. */
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = wrmsrl_safe(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		/* Base may live in the TLS slot, the MSR, or the saved copy. */
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			/* The MSR is only authoritative for a null selector. */
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
 540
/* arch_prctl(2) syscall entry point: operate on the calling task. */
long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
 545
 546unsigned long KSTK_ESP(struct task_struct *task)
 547{
 548        return (test_tsk_thread_flag(task, TIF_IA32)) ?
 549                        (task_pt_regs(task)->sp) : ((task)->thread.usersp);
 550}
 551
/* lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995. */