linux/kernel/fork.c
<<
>>
Prefs
   1/*
   2 *  linux/kernel/fork.c
   3 *
   4 *  Copyright (C) 1991, 1992  Linus Torvalds
   5 */
   6
   7/*
   8 *  'fork.c' contains the help-routines for the 'fork' system call
   9 * (see also entry.S and others).
  10 * Fork is rather simple, once you get the hang of it, but the memory
  11 * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
  12 */
  13
  14#include <linux/slab.h>
  15#include <linux/init.h>
  16#include <linux/unistd.h>
  17#include <linux/module.h>
  18#include <linux/vmalloc.h>
  19#include <linux/completion.h>
  20#include <linux/mnt_namespace.h>
  21#include <linux/personality.h>
  22#include <linux/mempolicy.h>
  23#include <linux/sem.h>
  24#include <linux/file.h>
  25#include <linux/fdtable.h>
  26#include <linux/iocontext.h>
  27#include <linux/key.h>
  28#include <linux/binfmts.h>
  29#include <linux/mman.h>
  30#include <linux/mmu_notifier.h>
  31#include <linux/fs.h>
  32#include <linux/nsproxy.h>
  33#include <linux/capability.h>
  34#include <linux/cpu.h>
  35#include <linux/cgroup.h>
  36#include <linux/security.h>
  37#include <linux/hugetlb.h>
  38#include <linux/swap.h>
  39#include <linux/syscalls.h>
  40#include <linux/jiffies.h>
  41#include <linux/tracehook.h>
  42#include <linux/futex.h>
  43#include <linux/compat.h>
  44#include <linux/task_io_accounting_ops.h>
  45#include <linux/rcupdate.h>
  46#include <linux/ptrace.h>
  47#include <linux/mount.h>
  48#include <linux/audit.h>
  49#include <linux/memcontrol.h>
  50#include <linux/ftrace.h>
  51#include <linux/profile.h>
  52#include <linux/rmap.h>
  53#include <linux/acct.h>
  54#include <linux/tsacct_kern.h>
  55#include <linux/cn_proc.h>
  56#include <linux/freezer.h>
  57#include <linux/delayacct.h>
  58#include <linux/taskstats_kern.h>
  59#include <linux/random.h>
  60#include <linux/tty.h>
  61#include <linux/proc_fs.h>
  62#include <linux/blkdev.h>
  63#include <trace/sched.h>
  64
  65#include <asm/pgtable.h>
  66#include <asm/pgalloc.h>
  67#include <asm/uaccess.h>
  68#include <asm/mmu_context.h>
  69#include <asm/cacheflush.h>
  70#include <asm/tlbflush.h>
  71
/*
 * Counters protected by write_lock_irq(&tasklist_lock)
 */
unsigned long total_forks;      /* Handle normal Linux uptimes. */
int nr_threads;                 /* The idle threads do not count.. */

int max_threads;                /* tunable limit on nr_threads */

/* Per-CPU process counters; nr_processes() below adds them up. */
DEFINE_PER_CPU(unsigned long, process_counts) = 0;

__cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */

DEFINE_TRACE(sched_process_fork);
  85
  86int nr_processes(void)
  87{
  88        int cpu;
  89        int total = 0;
  90
  91        for_each_online_cpu(cpu)
  92                total += per_cpu(process_counts, cpu);
  93
  94        return total;
  95}
  96
#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
/*
 * Default task_struct allocator, used unless
 * __HAVE_ARCH_TASK_STRUCT_ALLOCATOR is defined: objects come from and
 * go back to task_struct_cachep.
 */
# define alloc_task_struct()    kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
# define free_task_struct(tsk)  kmem_cache_free(task_struct_cachep, (tsk))
/* Slab cache for task_struct objects; created in fork_init(). */
static struct kmem_cache *task_struct_cachep;
#endif
 102
 103#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR
 104static inline struct thread_info *alloc_thread_info(struct task_struct *tsk)
 105{
 106#ifdef CONFIG_DEBUG_STACK_USAGE
 107        gfp_t mask = GFP_KERNEL | __GFP_ZERO;
 108#else
 109        gfp_t mask = GFP_KERNEL;
 110#endif
 111        return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER);
 112}
 113
 114static inline void free_thread_info(struct thread_info *ti)
 115{
 116        free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
 117}
 118#endif
 119
/* SLAB cache for signal_struct structures (tsk->signal); private to this file */
static struct kmem_cache *signal_cachep;

/* SLAB cache for sighand_struct structures (tsk->sighand) */
struct kmem_cache *sighand_cachep;

/* SLAB cache for files_struct structures (tsk->files) */
struct kmem_cache *files_cachep;

/* SLAB cache for fs_struct structures (tsk->fs) */
struct kmem_cache *fs_cachep;

/* SLAB cache for vm_area_struct structures */
struct kmem_cache *vm_area_cachep;

/* SLAB cache for mm_struct structures (tsk->mm); private to this file */
static struct kmem_cache *mm_cachep;
 137
/*
 * free_task - release the memory that backs a task.
 * @tsk: task to free; must not be touched by the caller afterwards
 *
 * Destroys the state kept in tsk->dirties, frees the thread_info that
 * tsk->stack points to, runs the rt-mutex debug and ftrace-graph per-task
 * exit hooks, and finally frees the task_struct itself.
 */
void free_task(struct task_struct *tsk)
{
        prop_local_destroy_single(&tsk->dirties);
        free_thread_info(tsk->stack);
        rt_mutex_debug_task_free(tsk);
        ftrace_graph_exit_task(tsk);
        /* Must be last: everything above still dereferences tsk. */
        free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);
 147
 148void __put_task_struct(struct task_struct *tsk)
 149{
 150        WARN_ON(!tsk->exit_state);
 151        WARN_ON(atomic_read(&tsk->usage));
 152        WARN_ON(tsk == current);
 153
 154        put_cred(tsk->real_cred);
 155        put_cred(tsk->cred);
 156        delayacct_tsk_free(tsk);
 157
 158        if (!profile_handoff_task(tsk))
 159                free_task(tsk);
 160}
 161
/*
 * macro override instead of weak attribute alias, to workaround
 * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions.
 *
 * If nothing has defined arch_task_cache_init by this point, the call in
 * fork_init() expands to nothing.
 */
#ifndef arch_task_cache_init
#define arch_task_cache_init()
#endif
 169
 170void __init fork_init(unsigned long mempages)
 171{
 172#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
 173#ifndef ARCH_MIN_TASKALIGN
 174#define ARCH_MIN_TASKALIGN      L1_CACHE_BYTES
 175#endif
 176        /* create a slab on which task_structs can be allocated */
 177        task_struct_cachep =
 178                kmem_cache_create("task_struct", sizeof(struct task_struct),
 179                        ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL);
 180#endif
 181
 182        /* do the arch specific task caches init */
 183        arch_task_cache_init();
 184
 185        /*
 186         * The default maximum number of threads is set to a safe
 187         * value: the thread structures can take up at most half
 188         * of memory.
 189         */
 190        max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE);
 191
 192        /*
 193         * we need to allow at least 20 threads to boot a system
 194         */
 195        if(max_threads < 20)
 196                max_threads = 20;
 197
 198        init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
 199        init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
 200        init_task.signal->rlim[RLIMIT_SIGPENDING] =
 201                init_task.signal->rlim[RLIMIT_NPROC];
 202}
 203
/*
 * arch_dup_task_struct - default task_struct copy.
 * @dst: task_struct to fill in
 * @src: task_struct to copy from
 *
 * Declared weak, so a non-weak definition elsewhere replaces it.  This
 * generic version is a plain structure assignment and always returns 0;
 * dup_task_struct() treats a non-zero return as failure.
 */
int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst,
                                               struct task_struct *src)
{
        *dst = *src;
        return 0;
}
 210
 211static struct task_struct *dup_task_struct(struct task_struct *orig)
 212{
 213        struct task_struct *tsk;
 214        struct thread_info *ti;
 215        int err;
 216
 217        prepare_to_copy(orig);
 218
 219        tsk = alloc_task_struct();
 220        if (!tsk)
 221                return NULL;
 222
 223        ti = alloc_thread_info(tsk);
 224        if (!ti) {
 225                free_task_struct(tsk);
 226                return NULL;
 227        }
 228
 229        err = arch_dup_task_struct(tsk, orig);
 230        if (err)
 231                goto out;
 232
 233        tsk->stack = ti;
 234
 235        err = prop_local_init_single(&tsk->dirties);
 236        if (err)
 237                goto out;
 238
 239        setup_thread_stack(tsk, orig);
 240
 241#ifdef CONFIG_CC_STACKPROTECTOR
 242        tsk->stack_canary = get_random_int();
 243#endif
 244
 245        /* One for us, one for whoever does the "release_task()" (usually parent) */
 246        atomic_set(&tsk->usage,2);
 247        atomic_set(&tsk->fs_excl, 0);
 248#ifdef CONFIG_BLK_DEV_IO_TRACE
 249        tsk->btrace_seq = 0;
 250#endif
 251        tsk->splice_pipe = NULL;
 252        return tsk;
 253
 254out:
 255        free_thread_info(ti);
 256        free_task_struct(tsk);
 257        return NULL;
 258}
 259
 260#ifdef CONFIG_MMU
 261static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 262{
 263        struct vm_area_struct *mpnt, *tmp, **pprev;
 264        struct rb_node **rb_link, *rb_parent;
 265        int retval;
 266        unsigned long charge;
 267        struct mempolicy *pol;
 268
 269        down_write(&oldmm->mmap_sem);
 270        flush_cache_dup_mm(oldmm);
 271        /*
 272         * Not linked in yet - no deadlock potential:
 273         */
 274        down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
 275
 276        mm->locked_vm = 0;
 277        mm->mmap = NULL;
 278        mm->mmap_cache = NULL;
 279        mm->free_area_cache = oldmm->mmap_base;
 280        mm->cached_hole_size = ~0UL;
 281        mm->map_count = 0;
 282        cpus_clear(mm->cpu_vm_mask);
 283        mm->mm_rb = RB_ROOT;
 284        rb_link = &mm->mm_rb.rb_node;
 285        rb_parent = NULL;
 286        pprev = &mm->mmap;
 287
 288        for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
 289                struct file *file;
 290
 291                if (mpnt->vm_flags & VM_DONTCOPY) {
 292                        long pages = vma_pages(mpnt);
 293                        mm->total_vm -= pages;
 294                        vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
 295                                                                -pages);
 296                        continue;
 297                }
 298                charge = 0;
 299                if (mpnt->vm_flags & VM_ACCOUNT) {
 300                        unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
 301                        if (security_vm_enough_memory(len))
 302                                goto fail_nomem;
 303                        charge = len;
 304                }
 305                tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 306                if (!tmp)
 307                        goto fail_nomem;
 308                *tmp = *mpnt;
 309                pol = mpol_dup(vma_policy(mpnt));
 310                retval = PTR_ERR(pol);
 311                if (IS_ERR(pol))
 312                        goto fail_nomem_policy;
 313                vma_set_policy(tmp, pol);
 314                tmp->vm_flags &= ~VM_LOCKED;
 315                tmp->vm_mm = mm;
 316                tmp->vm_next = NULL;
 317                anon_vma_link(tmp);
 318                file = tmp->vm_file;
 319                if (file) {
 320                        struct inode *inode = file->f_path.dentry->d_inode;
 321                        struct address_space *mapping = file->f_mapping;
 322
 323                        get_file(file);
 324                        if (tmp->vm_flags & VM_DENYWRITE)
 325                                atomic_dec(&inode->i_writecount);
 326                        spin_lock(&mapping->i_mmap_lock);
 327                        if (tmp->vm_flags & VM_SHARED)
 328                                mapping->i_mmap_writable++;
 329                        tmp->vm_truncate_count = mpnt->vm_truncate_count;
 330                        flush_dcache_mmap_lock(mapping);
 331                        /* insert tmp into the share list, just after mpnt */
 332                        vma_prio_tree_add(tmp, mpnt);
 333                        flush_dcache_mmap_unlock(mapping);
 334                        spin_unlock(&mapping->i_mmap_lock);
 335                }
 336
 337                /*
 338                 * Clear hugetlb-related page reserves for children. This only
 339                 * affects MAP_PRIVATE mappings. Faults generated by the child
 340                 * are not guaranteed to succeed, even if read-only
 341                 */
 342                if (is_vm_hugetlb_page(tmp))
 343                        reset_vma_resv_huge_pages(tmp);
 344
 345                /*
 346                 * Link in the new vma and copy the page table entries.
 347                 */
 348                *pprev = tmp;
 349                pprev = &tmp->vm_next;
 350
 351                __vma_link_rb(mm, tmp, rb_link, rb_parent);
 352                rb_link = &tmp->vm_rb.rb_right;
 353                rb_parent = &tmp->vm_rb;
 354
 355                mm->map_count++;
 356                retval = copy_page_range(mm, oldmm, mpnt);
 357
 358                if (tmp->vm_ops && tmp->vm_ops->open)
 359                        tmp->vm_ops->open(tmp);
 360
 361                if (retval)
 362                        goto out;
 363        }
 364        /* a new mm has just been created */
 365        arch_dup_mmap(oldmm, mm);
 366        retval = 0;
 367out:
 368        up_write(&mm->mmap_sem);
 369        flush_tlb_mm(oldmm);
 370        up_write(&oldmm->mmap_sem);
 371        return retval;
 372fail_nomem_policy:
 373        kmem_cache_free(vm_area_cachep, tmp);
 374fail_nomem:
 375        retval = -ENOMEM;
 376        vm_unacct_memory(charge);
 377        goto out;
 378}
 379
 380static inline int mm_alloc_pgd(struct mm_struct * mm)
 381{
 382        mm->pgd = pgd_alloc(mm);
 383        if (unlikely(!mm->pgd))
 384                return -ENOMEM;
 385        return 0;
 386}
 387
/* Free the page directory stored in mm->pgd (counterpart of mm_alloc_pgd()). */
static inline void mm_free_pgd(struct mm_struct * mm)
{
        pgd_free(mm, mm->pgd);
}
 392#else
/* !CONFIG_MMU stubs: nothing to copy or allocate, so these always "succeed". */
#define dup_mmap(mm, oldmm)     (0)
#define mm_alloc_pgd(mm)        (0)
#define mm_free_pgd(mm)
 396#endif /* CONFIG_MMU */
 397
/* Taken by mmput() around removal of an mm from the mm->mmlist list. */
__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);

/* mm_struct objects come from, and are returned to, mm_cachep. */
#define allocate_mm()   (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
#define free_mm(mm)     (kmem_cache_free(mm_cachep, (mm)))

/*
 * mm->flags value that mm_init() uses when current has no mm; can be
 * overridden with the "coredump_filter=" boot parameter.
 */
static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
 404
 405static int __init coredump_filter_setup(char *s)
 406{
 407        default_dump_filter =
 408                (simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) &
 409                MMF_DUMP_FILTER_MASK;
 410        return 1;
 411}
 412
 413__setup("coredump_filter=", coredump_filter_setup);
 414
 415#include <linux/init_task.h>
 416
 417static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
 418{
 419        atomic_set(&mm->mm_users, 1);
 420        atomic_set(&mm->mm_count, 1);
 421        init_rwsem(&mm->mmap_sem);
 422        INIT_LIST_HEAD(&mm->mmlist);
 423        mm->flags = (current->mm) ? current->mm->flags : default_dump_filter;
 424        mm->core_state = NULL;
 425        mm->nr_ptes = 0;
 426        set_mm_counter(mm, file_rss, 0);
 427        set_mm_counter(mm, anon_rss, 0);
 428        spin_lock_init(&mm->page_table_lock);
 429        spin_lock_init(&mm->ioctx_lock);
 430        INIT_HLIST_HEAD(&mm->ioctx_list);
 431        mm->free_area_cache = TASK_UNMAPPED_BASE;
 432        mm->cached_hole_size = ~0UL;
 433        mm_init_owner(mm, p);
 434
 435        if (likely(!mm_alloc_pgd(mm))) {
 436                mm->def_flags = 0;
 437                mmu_notifier_mm_init(mm);
 438                return mm;
 439        }
 440
 441        free_mm(mm);
 442        return NULL;
 443}
 444
 445/*
 446 * Allocate and initialize an mm_struct.
 447 */
 448struct mm_struct * mm_alloc(void)
 449{
 450        struct mm_struct * mm;
 451
 452        mm = allocate_mm();
 453        if (mm) {
 454                memset(mm, 0, sizeof(*mm));
 455                mm = mm_init(mm, current);
 456        }
 457        return mm;
 458}
 459
/*
 * Called when the last reference to the mm
 * is dropped: either by a lazy thread or by
 * mmput. Free the page directory and the mm.
 *
 * It is a bug to get here for init_mm.
 */
void __mmdrop(struct mm_struct *mm)
{
        BUG_ON(mm == &init_mm);
        mm_free_pgd(mm);
        destroy_context(mm);
        mmu_notifier_mm_destroy(mm);
        /* Last: the calls above still use mm. */
        free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
 474
 475/*
 476 * Decrement the use count and release all resources for an mm.
 477 */
 478void mmput(struct mm_struct *mm)
 479{
 480        might_sleep();
 481
 482        if (atomic_dec_and_test(&mm->mm_users)) {
 483                exit_aio(mm);
 484                exit_mmap(mm);
 485                set_mm_exe_file(mm, NULL);
 486                if (!list_empty(&mm->mmlist)) {
 487                        spin_lock(&mmlist_lock);
 488                        list_del(&mm->mmlist);
 489                        spin_unlock(&mmlist_lock);
 490                }
 491                put_swap_token(mm);
 492                mmdrop(mm);
 493        }
 494}
 495EXPORT_SYMBOL_GPL(mmput);
 496
 497/**
 498 * get_task_mm - acquire a reference to the task's mm
 499 *
 500 * Returns %NULL if the task has no mm.  Checks PF_KTHREAD (meaning
 501 * this kernel workthread has transiently adopted a user mm with use_mm,
 502 * to do its AIO) is not set and if so returns a reference to it, after
 503 * bumping up the use count.  User must release the mm via mmput()
 504 * after use.  Typically used by /proc and ptrace.
 505 */
 506struct mm_struct *get_task_mm(struct task_struct *task)
 507{
 508        struct mm_struct *mm;
 509
 510        task_lock(task);
 511        mm = task->mm;
 512        if (mm) {
 513                if (task->flags & PF_KTHREAD)
 514                        mm = NULL;
 515                else
 516                        atomic_inc(&mm->mm_users);
 517        }
 518        task_unlock(task);
 519        return mm;
 520}
 521EXPORT_SYMBOL_GPL(get_task_mm);
 522
 523/* Please note the differences between mmput and mm_release.
 524 * mmput is called whenever we stop holding onto a mm_struct,
 525 * error success whatever.
 526 *
 527 * mm_release is called after a mm_struct has been removed
 528 * from the current process.
 529 *
 530 * This difference is important for error handling, when we
 531 * only half set up a mm_struct for a new process and need to restore
 532 * the old one.  Because we mmput the new mm_struct before
 533 * restoring the old one. . .
 534 * Eric Biederman 10 January 1998
 535 */
 536void mm_release(struct task_struct *tsk, struct mm_struct *mm)
 537{
 538        struct completion *vfork_done = tsk->vfork_done;
 539
 540        /* Get rid of any futexes when releasing the mm */
 541#ifdef CONFIG_FUTEX
 542        if (unlikely(tsk->robust_list))
 543                exit_robust_list(tsk);
 544#ifdef CONFIG_COMPAT
 545        if (unlikely(tsk->compat_robust_list))
 546                compat_exit_robust_list(tsk);
 547#endif
 548#endif
 549
 550        /* Get rid of any cached register state */
 551        deactivate_mm(tsk, mm);
 552
 553        /* notify parent sleeping on vfork() */
 554        if (vfork_done) {
 555                tsk->vfork_done = NULL;
 556                complete(vfork_done);
 557        }
 558
 559        /*
 560         * If we're exiting normally, clear a user-space tid field if
 561         * requested.  We leave this alone when dying by signal, to leave
 562         * the value intact in a core dump, and to save the unnecessary
 563         * trouble otherwise.  Userland only wants this done for a sys_exit.
 564         */
 565        if (tsk->clear_child_tid
 566            && !(tsk->flags & PF_SIGNALED)
 567            && atomic_read(&mm->mm_users) > 1) {
 568                u32 __user * tidptr = tsk->clear_child_tid;
 569                tsk->clear_child_tid = NULL;
 570
 571                /*
 572                 * We don't check the error code - if userspace has
 573                 * not set up a proper pointer then tough luck.
 574                 */
 575                put_user(0, tidptr);
 576                sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0);
 577        }
 578}
 579
/*
 * Allocate a new mm structure and copy contents from the
 * mm structure of the passed in task structure.
 *
 * NOTE(review): despite the sentence above, the mm that gets copied is
 * current->mm; @tsk is only passed to mm_init() and init_new_context().
 *
 * Returns the new mm, or NULL if current has no mm or any step fails.
 */
struct mm_struct *dup_mm(struct task_struct *tsk)
{
        struct mm_struct *mm, *oldmm = current->mm;
        int err;

        if (!oldmm)
                return NULL;

        mm = allocate_mm();
        if (!mm)
                goto fail_nomem;

        /* Start from a byte copy; the fields that must differ are reset below. */
        memcpy(mm, oldmm, sizeof(*mm));

        /* Initializing for Swap token stuff */
        mm->token_priority = 0;
        mm->last_interval = 0;

        /* On failure mm_init() has already freed mm, so just bail out. */
        if (!mm_init(mm, tsk))
                goto fail_nomem;

        if (init_new_context(tsk, mm))
                goto fail_nocontext;

        dup_mm_exe_file(oldmm, mm);

        err = dup_mmap(mm, oldmm);
        if (err)
                goto free_pt;

        mm->hiwater_rss = get_mm_rss(mm);
        mm->hiwater_vm = mm->total_vm;

        return mm;

free_pt:
        /* Drops the only mm_users reference, tearing the half-built mm down. */
        mmput(mm);

fail_nomem:
        return NULL;

fail_nocontext:
        /*
         * If init_new_context() failed, we cannot use mmput() to free the mm
         * because it calls destroy_context()
         */
        mm_free_pgd(mm);
        free_mm(mm);
        return NULL;
}
 634
 635static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
 636{
 637        struct mm_struct * mm, *oldmm;
 638        int retval;
 639
 640        tsk->min_flt = tsk->maj_flt = 0;
 641        tsk->nvcsw = tsk->nivcsw = 0;
 642
 643        tsk->mm = NULL;
 644        tsk->active_mm = NULL;
 645
 646        /*
 647         * Are we cloning a kernel thread?
 648         *
 649         * We need to steal a active VM for that..
 650         */
 651        oldmm = current->mm;
 652        if (!oldmm)
 653                return 0;
 654
 655        if (clone_flags & CLONE_VM) {
 656                atomic_inc(&oldmm->mm_users);
 657                mm = oldmm;
 658                goto good_mm;
 659        }
 660
 661        retval = -ENOMEM;
 662        mm = dup_mm(tsk);
 663        if (!mm)
 664                goto fail_nomem;
 665
 666good_mm:
 667        /* Initializing for Swap token stuff */
 668        mm->token_priority = 0;
 669        mm->last_interval = 0;
 670
 671        tsk->mm = mm;
 672        tsk->active_mm = mm;
 673        return 0;
 674
 675fail_nomem:
 676        return retval;
 677}
 678
 679static struct fs_struct *__copy_fs_struct(struct fs_struct *old)
 680{
 681        struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
 682        /* We don't need to lock fs - think why ;-) */
 683        if (fs) {
 684                atomic_set(&fs->count, 1);
 685                rwlock_init(&fs->lock);
 686                fs->umask = old->umask;
 687                read_lock(&old->lock);
 688                fs->root = old->root;
 689                path_get(&old->root);
 690                fs->pwd = old->pwd;
 691                path_get(&old->pwd);
 692                read_unlock(&old->lock);
 693        }
 694        return fs;
 695}
 696
/*
 * Exported wrapper around __copy_fs_struct(): returns a new fs_struct
 * copied from @old, or NULL if the allocation failed.
 */
struct fs_struct *copy_fs_struct(struct fs_struct *old)
{
        return __copy_fs_struct(old);
}

EXPORT_SYMBOL_GPL(copy_fs_struct);
 703
 704static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
 705{
 706        if (clone_flags & CLONE_FS) {
 707                atomic_inc(&current->fs->count);
 708                return 0;
 709        }
 710        tsk->fs = __copy_fs_struct(current->fs);
 711        if (!tsk->fs)
 712                return -ENOMEM;
 713        return 0;
 714}
 715
 716static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
 717{
 718        struct files_struct *oldf, *newf;
 719        int error = 0;
 720
 721        /*
 722         * A background process may not have any files ...
 723         */
 724        oldf = current->files;
 725        if (!oldf)
 726                goto out;
 727
 728        if (clone_flags & CLONE_FILES) {
 729                atomic_inc(&oldf->count);
 730                goto out;
 731        }
 732
 733        newf = dup_fd(oldf, &error);
 734        if (!newf)
 735                goto out;
 736
 737        tsk->files = newf;
 738        error = 0;
 739out:
 740        return error;
 741}
 742
 743static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
 744{
 745#ifdef CONFIG_BLOCK
 746        struct io_context *ioc = current->io_context;
 747
 748        if (!ioc)
 749                return 0;
 750        /*
 751         * Share io context with parent, if CLONE_IO is set
 752         */
 753        if (clone_flags & CLONE_IO) {
 754                tsk->io_context = ioc_task_link(ioc);
 755                if (unlikely(!tsk->io_context))
 756                        return -ENOMEM;
 757        } else if (ioprio_valid(ioc->ioprio)) {
 758                tsk->io_context = alloc_io_context(GFP_KERNEL, -1);
 759                if (unlikely(!tsk->io_context))
 760                        return -ENOMEM;
 761
 762                tsk->io_context->ioprio = ioc->ioprio;
 763        }
 764#endif
 765        return 0;
 766}
 767
/*
 * copy_sighand - set up tsk->sighand for a new task.
 *
 * With CLONE_SIGHAND only the count of current's sighand_struct is raised.
 * Otherwise a new one is allocated, published in tsk->sighand, and
 * current's action table is copied into it.
 *
 * Returns 0 on success or -ENOMEM if the allocation failed.
 */
static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
{
        struct sighand_struct *sig;

        if (clone_flags & CLONE_SIGHAND) {
                atomic_inc(&current->sighand->count);
                return 0;
        }
        sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
        /*
         * Assigned before the NULL check, so tsk->sighand is NULL (not the
         * pointer inherited from the parent) when the allocation failed.
         */
        rcu_assign_pointer(tsk->sighand, sig);
        if (!sig)
                return -ENOMEM;
        atomic_set(&sig->count, 1);
        memcpy(sig->action, current->sighand->action, sizeof(sig->action));
        return 0;
}
 784
 785void __cleanup_sighand(struct sighand_struct *sighand)
 786{
 787        if (atomic_dec_and_test(&sighand->count))
 788                kmem_cache_free(sighand_cachep, sighand);
 789}
 790
 791
 792/*
 793 * Initialize POSIX timer handling for a thread group.
 794 */
 795static void posix_cpu_timers_init_group(struct signal_struct *sig)
 796{
 797        /* Thread group counters. */
 798        thread_group_cputime_init(sig);
 799
 800        /* Expiration times and increments. */
 801        sig->it_virt_expires = cputime_zero;
 802        sig->it_virt_incr = cputime_zero;
 803        sig->it_prof_expires = cputime_zero;
 804        sig->it_prof_incr = cputime_zero;
 805
 806        /* Cached expiration times. */
 807        sig->cputime_expires.prof_exp = cputime_zero;
 808        sig->cputime_expires.virt_exp = cputime_zero;
 809        sig->cputime_expires.sched_exp = 0;
 810
 811        /* The timer lists. */
 812        INIT_LIST_HEAD(&sig->cpu_timers[0]);
 813        INIT_LIST_HEAD(&sig->cpu_timers[1]);
 814        INIT_LIST_HEAD(&sig->cpu_timers[2]);
 815}
 816
/*
 * copy_signal - set up tsk->signal for a new task.
 *
 * With CLONE_THREAD the new task joins current's thread group: only the
 * count and live counters of current's signal_struct are raised.
 * Otherwise a fresh signal_struct is allocated and initialised; the only
 * state taken from the parent is the rlimit table.
 *
 * Returns 0 on success or -ENOMEM if the allocation failed.
 */
static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
{
        struct signal_struct *sig;

        if (clone_flags & CLONE_THREAD) {
                atomic_inc(&current->signal->count);
                atomic_inc(&current->signal->live);
                return 0;
        }
        sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);

        if (sig)
                posix_cpu_timers_init_group(sig);

        /* Stored even when NULL, so tsk->signal never keeps the parent's pointer. */
        tsk->signal = sig;
        if (!sig)
                return -ENOMEM;

        atomic_set(&sig->count, 1);
        atomic_set(&sig->live, 1);
        init_waitqueue_head(&sig->wait_chldexit);
        sig->flags = 0;
        sig->group_exit_code = 0;
        sig->group_exit_task = NULL;
        sig->group_stop_count = 0;
        sig->curr_target = tsk;
        init_sigpending(&sig->shared_pending);
        INIT_LIST_HEAD(&sig->posix_timers);

        hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        sig->it_real_incr.tv64 = 0;
        sig->real_timer.function = it_real_fn;

        sig->leader = 0;        /* session leadership doesn't inherit */
        sig->tty_old_pgrp = NULL;
        sig->tty = NULL;

        /* All accumulated accounting starts from zero for the new group. */
        sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
        sig->gtime = cputime_zero;
        sig->cgtime = cputime_zero;
        sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
        sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
        sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
        task_io_accounting_init(&sig->ioac);
        sig->sum_sched_runtime = 0;
        taskstats_tgid_init(sig);

        /* The rlimits are copied under the group leader's task lock. */
        task_lock(current->group_leader);
        memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
        task_unlock(current->group_leader);

        acct_init_pacct(&sig->pacct);

        tty_audit_fork(sig);

        return 0;
}
 874
/*
 * Free a signal_struct: release its thread-group cputime state, drop the
 * reference on sig->tty, and return the structure to signal_cachep.
 */
void __cleanup_signal(struct signal_struct *sig)
{
        thread_group_cputime_free(sig);
        tty_kref_put(sig->tty);
        kmem_cache_free(signal_cachep, sig);
}
 881
 882static void cleanup_signal(struct task_struct *tsk)
 883{
 884        struct signal_struct *sig = tsk->signal;
 885
 886        atomic_dec(&sig->live);
 887
 888        if (atomic_dec_and_test(&sig->count))
 889                __cleanup_signal(sig);
 890}
 891
 892static void copy_flags(unsigned long clone_flags, struct task_struct *p)
 893{
 894        unsigned long new_flags = p->flags;
 895
 896        new_flags &= ~PF_SUPERPRIV;
 897        new_flags |= PF_FORKNOEXEC;
 898        new_flags |= PF_STARTING;
 899        p->flags = new_flags;
 900        clear_freeze_flag(p);
 901}
 902
/*
 * set_tid_address - record @tidptr as the calling task's clear_child_tid
 * pointer (consumed later by mm_release()).
 *
 * Returns the caller's pid as given by task_pid_vnr().
 */
SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
{
        current->clear_child_tid = tidptr;

        return task_pid_vnr(current);
}
 909
/*
 * Initialise the priority-inheritance state of a new task: pi_lock always,
 * plus the pi_waiters list and pi_blocked_on when CONFIG_RT_MUTEXES is set.
 */
static void rt_mutex_init_task(struct task_struct *p)
{
        spin_lock_init(&p->pi_lock);
#ifdef CONFIG_RT_MUTEXES
        plist_head_init(&p->pi_waiters, &p->pi_lock);
        p->pi_blocked_on = NULL;
#endif
}
 918
#ifdef CONFIG_MM_OWNER
/* Record @p as the owner of @mm.  Only built with CONFIG_MM_OWNER. */
void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
{
        mm->owner = p;
}
#endif /* CONFIG_MM_OWNER */
 925
 926/*
 927 * Initialize POSIX timer handling for a single task.
 928 */
 929static void posix_cpu_timers_init(struct task_struct *tsk)
 930{
 931        tsk->cputime_expires.prof_exp = cputime_zero;
 932        tsk->cputime_expires.virt_exp = cputime_zero;
 933        tsk->cputime_expires.sched_exp = 0;
 934        INIT_LIST_HEAD(&tsk->cpu_timers[0]);
 935        INIT_LIST_HEAD(&tsk->cpu_timers[1]);
 936        INIT_LIST_HEAD(&tsk->cpu_timers[2]);
 937}
 938
  939/*
  940 * This creates a new process as a copy of the old one,
  941 * but does not actually start it yet.
  942 *
  943 * It copies the registers, and all the appropriate
  944 * parts of the process environment (as per the clone
  945 * flags). The actual kick-off is left to the caller.
      *
      * Parameters:
      *   clone_flags  - CLONE_* flags. Three combinations are rejected up
      *                  front with -EINVAL: CLONE_NEWNS together with
      *                  CLONE_FS, CLONE_THREAD without CLONE_SIGHAND, and
      *                  CLONE_SIGHAND without CLONE_VM.
      *   stack_start, regs, stack_size
      *                - handed unchanged to copy_thread().
      *   child_tidptr - stored in the child's set_child_tid and/or
      *                  clear_child_tid when CLONE_CHILD_SETTID and/or
      *                  CLONE_CHILD_CLEARTID are set; otherwise those
      *                  fields are set to NULL.
      *   pid          - if this is &init_struct_pid it is used as is;
      *                  any other value is ignored and replaced by a pid
      *                  freshly allocated with alloc_pid().
      *   trace        - handed unchanged to tracehook_finish_clone().
      *
      * Returns the new task_struct on success, or an ERR_PTR() carrying a
      * negative errno on failure. On failure everything set up so far is
      * released through the bad_fork_* labels at the end of the function.
  946 */
  947static struct task_struct *copy_process(unsigned long clone_flags,
  948                                        unsigned long stack_start,
  949                                        struct pt_regs *regs,
  950                                        unsigned long stack_size,
  951                                        int __user *child_tidptr,
  952                                        struct pid *pid,
  953                                        int trace)
  954{
  955        int retval;
  956        struct task_struct *p;
  957        int cgroup_callbacks_done = 0;
  958
  959        if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
  960                return ERR_PTR(-EINVAL);
  961
  962        /*
  963         * Thread groups must share signals as well, and detached threads
  964         * can only be started up within the thread group.
  965         */
  966        if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
  967                return ERR_PTR(-EINVAL);
  968
  969        /*
  970         * Shared signal handlers imply shared VM. By way of the above,
  971         * thread groups also imply shared VM. Blocking this case allows
  972         * for various simplifications in other code.
  973         */
  974        if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
  975                return ERR_PTR(-EINVAL);
  976
  977        retval = security_task_create(clone_flags);
  978        if (retval)
  979                goto fork_out;
  980
  981        retval = -ENOMEM;
  982        p = dup_task_struct(current);
  983        if (!p)
  984                goto fork_out;
  985
  986        rt_mutex_init_task(p);
  987
  988#ifdef CONFIG_PROVE_LOCKING
  989        DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
  990        DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
  991#endif
  992        retval = -EAGAIN;
             /*
              * Enforce the per-user RLIMIT_NPROC limit, except for callers
              * with CAP_SYS_ADMIN or CAP_SYS_RESOURCE and for INIT_USER.
              */
  993        if (atomic_read(&p->real_cred->user->processes) >=
  994                        p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
  995                if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
  996                    p->real_cred->user != INIT_USER)
  997                        goto bad_fork_free;
  998        }
  999
 1000        retval = copy_creds(p, clone_flags);
 1001        if (retval < 0)
 1002                goto bad_fork_free;
 1003
 1004        /*
 1005         * If multiple threads are within copy_process(), then this check
 1006         * triggers too late. This doesn't hurt, the check is only there
 1007         * to stop root fork bombs.
 1008         */
 1009        retval = -EAGAIN;
 1010        if (nr_threads >= max_threads)
 1011                goto bad_fork_cleanup_count;
 1012
 1013        if (!try_module_get(task_thread_info(p)->exec_domain->module))
 1014                goto bad_fork_cleanup_count;
 1015
 1016        if (p->binfmt && !try_module_get(p->binfmt->module))
 1017                goto bad_fork_cleanup_put_domain;
 1018
 1019        p->did_exec = 0;
 1020        delayacct_tsk_init(p);  /* Must remain after dup_task_struct() */
 1021        copy_flags(clone_flags, p);
 1022        INIT_LIST_HEAD(&p->children);
 1023        INIT_LIST_HEAD(&p->sibling);
 1024#ifdef CONFIG_PREEMPT_RCU
 1025        p->rcu_read_lock_nesting = 0;
 1026        p->rcu_flipctr_idx = 0;
 1027#endif /* #ifdef CONFIG_PREEMPT_RCU */
 1028        p->vfork_done = NULL;
 1029        spin_lock_init(&p->alloc_lock);
 1030
 1031        clear_tsk_thread_flag(p, TIF_SIGPENDING);
 1032        init_sigpending(&p->pending);
 1033
 1034        p->utime = cputime_zero;
 1035        p->stime = cputime_zero;
 1036        p->gtime = cputime_zero;
 1037        p->utimescaled = cputime_zero;
 1038        p->stimescaled = cputime_zero;
 1039        p->prev_utime = cputime_zero;
 1040        p->prev_stime = cputime_zero;
 1041
 1042        p->default_timer_slack_ns = current->timer_slack_ns;
 1043
 1044#ifdef CONFIG_DETECT_SOFTLOCKUP
 1045        p->last_switch_count = 0;
 1046        p->last_switch_timestamp = 0;
 1047#endif
 1048
 1049        task_io_accounting_init(&p->ioac);
 1050        acct_clear_integrals(p);
 1051
 1052        posix_cpu_timers_init(p);
 1053
 1054        p->lock_depth = -1;             /* -1 = no lock */
 1055        do_posix_clock_monotonic_gettime(&p->start_time);
 1056        p->real_start_time = p->start_time;
 1057        monotonic_to_bootbased(&p->real_start_time);
 1058        p->io_context = NULL;
 1059        p->audit_context = NULL;
 1060        cgroup_fork(p);
 1061#ifdef CONFIG_NUMA
 1062        p->mempolicy = mpol_dup(p->mempolicy);
 1063        if (IS_ERR(p->mempolicy)) {
 1064                retval = PTR_ERR(p->mempolicy);
 1065                p->mempolicy = NULL;
 1066                goto bad_fork_cleanup_cgroup;
 1067        }
 1068        mpol_fix_fork_child_flag(p);
 1069#endif
 1070#ifdef CONFIG_TRACE_IRQFLAGS
 1071        p->irq_events = 0;
 1072#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
 1073        p->hardirqs_enabled = 1;
 1074#else
 1075        p->hardirqs_enabled = 0;
 1076#endif
 1077        p->hardirq_enable_ip = 0;
 1078        p->hardirq_enable_event = 0;
 1079        p->hardirq_disable_ip = _THIS_IP_;
 1080        p->hardirq_disable_event = 0;
 1081        p->softirqs_enabled = 1;
 1082        p->softirq_enable_ip = _THIS_IP_;
 1083        p->softirq_enable_event = 0;
 1084        p->softirq_disable_ip = 0;
 1085        p->softirq_disable_event = 0;
 1086        p->hardirq_context = 0;
 1087        p->softirq_context = 0;
 1088#endif
 1089#ifdef CONFIG_LOCKDEP
 1090        p->lockdep_depth = 0; /* no locks held yet */
 1091        p->curr_chain_key = 0;
 1092        p->lockdep_recursion = 0;
 1093#endif
 1094
 1095#ifdef CONFIG_DEBUG_MUTEXES
 1096        p->blocked_on = NULL; /* not blocked yet */
 1097#endif
 1098        if (unlikely(current->ptrace))
 1099                ptrace_fork(p, clone_flags);
 1100
 1101        /* Perform scheduler related setup. Assign this task to a CPU. */
 1102        sched_fork(p, clone_flags);
 1103
 1104        if ((retval = audit_alloc(p)))
 1105                goto bad_fork_cleanup_policy;
 1106        /* copy all the process information */
 1107        if ((retval = copy_semundo(clone_flags, p)))
 1108                goto bad_fork_cleanup_audit;
 1109        if ((retval = copy_files(clone_flags, p)))
 1110                goto bad_fork_cleanup_semundo;
 1111        if ((retval = copy_fs(clone_flags, p)))
 1112                goto bad_fork_cleanup_files;
 1113        if ((retval = copy_sighand(clone_flags, p)))
 1114                goto bad_fork_cleanup_fs;
 1115        if ((retval = copy_signal(clone_flags, p)))
 1116                goto bad_fork_cleanup_sighand;
 1117        if ((retval = copy_mm(clone_flags, p)))
 1118                goto bad_fork_cleanup_signal;
 1119        if ((retval = copy_namespaces(clone_flags, p)))
 1120                goto bad_fork_cleanup_mm;
 1121        if ((retval = copy_io(clone_flags, p)))
 1122                goto bad_fork_cleanup_namespaces;
 1123        retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
 1124        if (retval)
 1125                goto bad_fork_cleanup_io;
 1126
             /*
              * A caller-supplied &init_struct_pid is used as is; in every
              * other case allocate a new pid in the child's pid namespace.
              */
 1127        if (pid != &init_struct_pid) {
 1128                retval = -ENOMEM;
 1129                pid = alloc_pid(p->nsproxy->pid_ns);
 1130                if (!pid)
 1131                        goto bad_fork_cleanup_io;
 1132
 1133                if (clone_flags & CLONE_NEWPID) {
 1134                        retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
 1135                        if (retval < 0)
 1136                                goto bad_fork_free_pid;
 1137                }
 1138        }
 1139
 1140        ftrace_graph_init_task(p);
 1141
 1142        p->pid = pid_nr(pid);
 1143        p->tgid = p->pid;
 1144        if (clone_flags & CLONE_THREAD)
 1145                p->tgid = current->tgid;
 1146
 1147        if (current->nsproxy != p->nsproxy) {
 1148                retval = ns_cgroup_clone(p, pid);
 1149                if (retval)
 1150                        goto bad_fork_free_graph;
 1151        }
 1152
 1153        p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
 1154        /*
 1155         * Clear TID on mm_release()?
 1156         */
 1157        p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
 1158#ifdef CONFIG_FUTEX
 1159        p->robust_list = NULL;
 1160#ifdef CONFIG_COMPAT
 1161        p->compat_robust_list = NULL;
 1162#endif
 1163        INIT_LIST_HEAD(&p->pi_state_list);
 1164        p->pi_state_cache = NULL;
 1165#endif
 1166        /*
 1167         * sigaltstack should be cleared when sharing the same VM
 1168         */
 1169        if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
 1170                p->sas_ss_sp = p->sas_ss_size = 0;
 1171
 1172        /*
 1173         * Syscall tracing should be turned off in the child regardless
 1174         * of CLONE_PTRACE.
 1175         */
 1176        clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
 1177#ifdef TIF_SYSCALL_EMU
 1178        clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
 1179#endif
 1180        clear_all_latency_tracing(p);
 1181
 1182        /* ok, now we should be set up.. */
 1183        p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
 1184        p->pdeath_signal = 0;
 1185        p->exit_state = 0;
 1186
 1187        /*
 1188         * Ok, make it visible to the rest of the system.
 1189         * We don't wake it up yet.
 1190         */
 1191        p->group_leader = p;
 1192        INIT_LIST_HEAD(&p->thread_group);
 1193
 1194        /* Now that the task is set up, run cgroup callbacks if
 1195         * necessary. We need to run them before the task is visible
 1196         * on the tasklist. */
 1197        cgroup_fork_callbacks(p);
 1198        cgroup_callbacks_done = 1;
 1199
 1200        /* Need tasklist lock for parent etc handling! */
 1201        write_lock_irq(&tasklist_lock);
 1202
 1203        /*
 1204         * The task hasn't been attached yet, so its cpus_allowed mask will
 1205         * not be changed, nor will its assigned CPU.
 1206         *
 1207         * The cpus_allowed mask of the parent may have changed after it was
 1208         * copied first time - so re-copy it here, then check the child's CPU
 1209         * to ensure it is on a valid CPU (and if not, just force it back to
 1210         * parent's CPU). This avoids a lot of nasty races.
 1211         */
 1212        p->cpus_allowed = current->cpus_allowed;
 1213        p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
 1214        if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
 1215                        !cpu_online(task_cpu(p))))
 1216                set_task_cpu(p, smp_processor_id());
 1217
 1218        /* CLONE_PARENT re-uses the old parent */
 1219        if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
 1220                p->real_parent = current->real_parent;
 1221                p->parent_exec_id = current->parent_exec_id;
 1222        } else {
 1223                p->real_parent = current;
 1224                p->parent_exec_id = current->self_exec_id;
 1225        }
 1226
 1227        spin_lock(&current->sighand->siglock);
 1228
 1229        /*
 1230         * Process group and session signals need to be delivered to just the
 1231         * parent before the fork or both the parent and the child after the
 1232         * fork. Restart if a signal comes in before we add the new process to
 1233         * its process group.
 1234         * A fatal signal pending means that current will exit, so the new
 1235         * thread can't slip out of an OOM kill (or normal SIGKILL).
 1236         */
 1237        recalc_sigpending();
 1238        if (signal_pending(current)) {
 1239                spin_unlock(&current->sighand->siglock);
 1240                write_unlock_irq(&tasklist_lock);
 1241                retval = -ERESTARTNOINTR;
 1242                goto bad_fork_free_graph;
 1243        }
 1244
 1245        if (clone_flags & CLONE_THREAD) {
 1246                p->group_leader = current->group_leader;
 1247                list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
 1248        }
 1249
             /*
              * A task whose pid number is 0 is not linked under its parent,
              * not attached to any struct pid, and not counted in
              * nr_threads.
              */
 1250        if (likely(p->pid)) {
 1251                list_add_tail(&p->sibling, &p->real_parent->children);
 1252                tracehook_finish_clone(p, clone_flags, trace);
 1253
 1254                if (thread_group_leader(p)) {
 1255                        if (clone_flags & CLONE_NEWPID)
 1256                                p->nsproxy->pid_ns->child_reaper = p;
 1257
 1258                        p->signal->leader_pid = pid;
 1259                        tty_kref_put(p->signal->tty);
 1260                        p->signal->tty = tty_kref_get(current->signal->tty);
 1261                        set_task_pgrp(p, task_pgrp_nr(current));
 1262                        set_task_session(p, task_session_nr(current));
 1263                        attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
 1264                        attach_pid(p, PIDTYPE_SID, task_session(current));
 1265                        list_add_tail_rcu(&p->tasks, &init_task.tasks);
 1266                        __get_cpu_var(process_counts)++;
 1267                }
 1268                attach_pid(p, PIDTYPE_PID, pid);
 1269                nr_threads++;
 1270        }
 1271
 1272        total_forks++;
 1273        spin_unlock(&current->sighand->siglock);
 1274        write_unlock_irq(&tasklist_lock);
 1275        proc_fork_connector(p);
 1276        cgroup_post_fork(p);
 1277        return p;
 1278
             /*
              * Error unwinding. Each label undoes one setup step and then
              * falls through into the labels below it, so a failure jumps to
              * the label that releases everything acquired before the
              * failing step. retval must already hold the error code.
              */
 1279bad_fork_free_graph:
 1280        ftrace_graph_exit_task(p);
 1281bad_fork_free_pid:
 1282        if (pid != &init_struct_pid)
 1283                free_pid(pid);
 1284bad_fork_cleanup_io:
 1285        put_io_context(p->io_context);
 1286bad_fork_cleanup_namespaces:
 1287        exit_task_namespaces(p);
 1288bad_fork_cleanup_mm:
 1289        if (p->mm)
 1290                mmput(p->mm);
 1291bad_fork_cleanup_signal:
 1292        cleanup_signal(p);
 1293bad_fork_cleanup_sighand:
 1294        __cleanup_sighand(p->sighand);
 1295bad_fork_cleanup_fs:
 1296        exit_fs(p); /* blocking */
 1297bad_fork_cleanup_files:
 1298        exit_files(p); /* blocking */
 1299bad_fork_cleanup_semundo:
 1300        exit_sem(p);
 1301bad_fork_cleanup_audit:
 1302        audit_free(p);
 1303bad_fork_cleanup_policy:
 1304#ifdef CONFIG_NUMA
 1305        mpol_put(p->mempolicy);
 1306bad_fork_cleanup_cgroup:
 1307#endif
 1308        cgroup_exit(p, cgroup_callbacks_done);
 1309        delayacct_tsk_free(p);
 1310        if (p->binfmt)
 1311                module_put(p->binfmt->module);
 1312bad_fork_cleanup_put_domain:
 1313        module_put(task_thread_info(p)->exec_domain->module);
 1314bad_fork_cleanup_count:
 1315        atomic_dec(&p->cred->user->processes);
 1316        put_cred(p->real_cred);
 1317        put_cred(p->cred);
 1318bad_fork_free:
 1319        free_task(p);
 1320fork_out:
 1321        return ERR_PTR(retval);
 1322}
1323
1324noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
1325{
1326        memset(regs, 0, sizeof(struct pt_regs));
1327        return regs;
1328}
1329
1330struct task_struct * __cpuinit fork_idle(int cpu)
1331{
1332        struct task_struct *task;
1333        struct pt_regs regs;
1334
1335        task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
1336                            &init_struct_pid, 0);
1337        if (!IS_ERR(task))
1338                init_idle(task, cpu);
1339
1340        return task;
1341}
1342
1343/*
1344 *  Ok, this is the main fork-routine.
1345 *
1346 * It copies the process, and if successful kick-starts
1347 * it and waits for it to finish using the VM if required.
1348 */
1349long do_fork(unsigned long clone_flags,
1350              unsigned long stack_start,
1351              struct pt_regs *regs,
1352              unsigned long stack_size,
1353              int __user *parent_tidptr,
1354              int __user *child_tidptr)
1355{
1356        struct task_struct *p;
1357        int trace = 0;
1358        long nr;
1359
1360        /*
1361         * Do some preliminary argument and permissions checking before we
1362         * actually start allocating stuff
1363         */
1364        if (clone_flags & CLONE_NEWUSER) {
1365                if (clone_flags & CLONE_THREAD)
1366                        return -EINVAL;
1367                /* hopefully this check will go away when userns support is
1368                 * complete
1369                 */
1370                if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
1371                                !capable(CAP_SETGID))
1372                        return -EPERM;
1373        }
1374
1375        /*
1376         * We hope to recycle these flags after 2.6.26
1377         */
1378        if (unlikely(clone_flags & CLONE_STOPPED)) {
1379                static int __read_mostly count = 100;
1380
1381                if (count > 0 && printk_ratelimit()) {
1382                        char comm[TASK_COMM_LEN];
1383
1384                        count--;
1385                        printk(KERN_INFO "fork(): process `%s' used deprecated "
1386                                        "clone flags 0x%lx\n",
1387                                get_task_comm(comm, current),
1388                                clone_flags & CLONE_STOPPED);
1389                }
1390        }
1391
1392        /*
1393         * When called from kernel_thread, don't do user tracing stuff.
1394         */
1395        if (likely(user_mode(regs)))
1396                trace = tracehook_prepare_clone(clone_flags);
1397
1398        p = copy_process(clone_flags, stack_start, regs, stack_size,
1399                         child_tidptr, NULL, trace);
1400        /*
1401         * Do this prior waking up the new thread - the thread pointer
1402         * might get invalid after that point, if the thread exits quickly.
1403         */
1404        if (!IS_ERR(p)) {
1405                struct completion vfork;
1406
1407                trace_sched_process_fork(current, p);
1408
1409                nr = task_pid_vnr(p);
1410
1411                if (clone_flags & CLONE_PARENT_SETTID)
1412                        put_user(nr, parent_tidptr);
1413
1414                if (clone_flags & CLONE_VFORK) {
1415                        p->vfork_done = &vfork;
1416                        init_completion(&vfork);
1417                }
1418
1419                audit_finish_fork(p);
1420                tracehook_report_clone(trace, regs, clone_flags, nr, p);
1421
1422                /*
1423                 * We set PF_STARTING at creation in case tracing wants to
1424                 * use this to distinguish a fully live task from one that
1425                 * hasn't gotten to tracehook_report_clone() yet.  Now we
1426                 * clear it and set the child going.
1427                 */
1428                p->flags &= ~PF_STARTING;
1429
1430                if (unlikely(clone_flags & CLONE_STOPPED)) {
1431                        /*
1432                         * We'll start up with an immediate SIGSTOP.
1433                         */
1434                        sigaddset(&p->pending.signal, SIGSTOP);
1435                        set_tsk_thread_flag(p, TIF_SIGPENDING);
1436                        __set_task_state(p, TASK_STOPPED);
1437                } else {
1438                        wake_up_new_task(p, clone_flags);
1439                }
1440
1441                tracehook_report_clone_complete(trace, regs,
1442                                                clone_flags, nr, p);
1443
1444                if (clone_flags & CLONE_VFORK) {
1445                        freezer_do_not_count();
1446                        wait_for_completion(&vfork);
1447                        freezer_count();
1448                        tracehook_report_vfork_done(p, nr);
1449                }
1450        } else {
1451                nr = PTR_ERR(p);
1452        }
1453        return nr;
1454}
1455
1456#ifndef ARCH_MIN_MMSTRUCT_ALIGN
1457#define ARCH_MIN_MMSTRUCT_ALIGN 0
1458#endif
1459
1460static void sighand_ctor(void *data)
1461{
1462        struct sighand_struct *sighand = data;
1463
1464        spin_lock_init(&sighand->siglock);
1465        init_waitqueue_head(&sighand->signalfd_wqh);
1466}
1467
1468void __init proc_caches_init(void)
1469{
1470        sighand_cachep = kmem_cache_create("sighand_cache",
1471                        sizeof(struct sighand_struct), 0,
1472                        SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU,
1473                        sighand_ctor);
1474        signal_cachep = kmem_cache_create("signal_cache",
1475                        sizeof(struct signal_struct), 0,
1476                        SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1477        files_cachep = kmem_cache_create("files_cache",
1478                        sizeof(struct files_struct), 0,
1479                        SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1480        fs_cachep = kmem_cache_create("fs_cache",
1481                        sizeof(struct fs_struct), 0,
1482                        SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1483        mm_cachep = kmem_cache_create("mm_struct",
1484                        sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
1485                        SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1486        mmap_init();
1487}
1488
1489/*
1490 * Check constraints on flags passed to the unshare system call and
1491 * force unsharing of additional process context as appropriate.
1492 */
1493static void check_unshare_flags(unsigned long *flags_ptr)
1494{
1495        /*
1496         * If unsharing a thread from a thread group, must also
1497         * unshare vm.
1498         */
1499        if (*flags_ptr & CLONE_THREAD)
1500                *flags_ptr |= CLONE_VM;
1501
1502        /*
1503         * If unsharing vm, must also unshare signal handlers.
1504         */
1505        if (*flags_ptr & CLONE_VM)
1506                *flags_ptr |= CLONE_SIGHAND;
1507
1508        /*
1509         * If unsharing signal handlers and the task was created
1510         * using CLONE_THREAD, then must unshare the thread
1511         */
1512        if ((*flags_ptr & CLONE_SIGHAND) &&
1513            (atomic_read(&current->signal->count) > 1))
1514                *flags_ptr |= CLONE_THREAD;
1515
1516        /*
1517         * If unsharing namespace, must also unshare filesystem information.
1518         */
1519        if (*flags_ptr & CLONE_NEWNS)
1520                *flags_ptr |= CLONE_FS;
1521}
1522
1523/*
1524 * Unsharing of tasks created with CLONE_THREAD is not supported yet
1525 */
1526static int unshare_thread(unsigned long unshare_flags)
1527{
1528        if (unshare_flags & CLONE_THREAD)
1529                return -EINVAL;
1530
1531        return 0;
1532}
1533
1534/*
1535 * Unshare the filesystem structure if it is being shared
1536 */
1537static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
1538{
1539        struct fs_struct *fs = current->fs;
1540
1541        if ((unshare_flags & CLONE_FS) &&
1542            (fs && atomic_read(&fs->count) > 1)) {
1543                *new_fsp = __copy_fs_struct(current->fs);
1544                if (!*new_fsp)
1545                        return -ENOMEM;
1546        }
1547
1548        return 0;
1549}
1550
1551/*
1552 * Unsharing of sighand is not supported yet
1553 */
1554static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
1555{
1556        struct sighand_struct *sigh = current->sighand;
1557
1558        if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1)
1559                return -EINVAL;
1560        else
1561                return 0;
1562}
1563
1564/*
1565 * Unshare vm if it is being shared
1566 */
1567static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp)
1568{
1569        struct mm_struct *mm = current->mm;
1570
1571        if ((unshare_flags & CLONE_VM) &&
1572            (mm && atomic_read(&mm->mm_users) > 1)) {
1573                return -EINVAL;
1574        }
1575
1576        return 0;
1577}
1578
1579/*
1580 * Unshare file descriptor table if it is being shared
1581 */
1582static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
1583{
1584        struct files_struct *fd = current->files;
1585        int error = 0;
1586
1587        if ((unshare_flags & CLONE_FILES) &&
1588            (fd && atomic_read(&fd->count) > 1)) {
1589                *new_fdp = dup_fd(fd, &error);
1590                if (!*new_fdp)
1591                        return error;
1592        }
1593
1594        return 0;
1595}
1596
1597/*
1598 * unshare allows a process to 'unshare' part of the process
1599 * context which was originally shared using clone.  copy_*
1600 * functions used by do_fork() cannot be used here directly
1601 * because they modify an inactive task_struct that is being
1602 * constructed. Here we are modifying the current, active,
1603 * task_struct.
1604 */
1605SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1606{
1607        int err = 0;
1608        struct fs_struct *fs, *new_fs = NULL;
1609        struct sighand_struct *new_sigh = NULL;
1610        struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
1611        struct files_struct *fd, *new_fd = NULL;
1612        struct nsproxy *new_nsproxy = NULL;
1613        int do_sysvsem = 0;
1614
1615        check_unshare_flags(&unshare_flags);
1616
1617        /* Return -EINVAL for all unsupported flags */
1618        err = -EINVAL;
1619        if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1620                                CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
1621                                CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
1622                goto bad_unshare_out;
1623
1624        /*
1625         * CLONE_NEWIPC must also detach from the undolist: after switching
1626         * to a new ipc namespace, the semaphore arrays from the old
1627         * namespace are unreachable.
1628         */
1629        if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
1630                do_sysvsem = 1;
1631        if ((err = unshare_thread(unshare_flags)))
1632                goto bad_unshare_out;
1633        if ((err = unshare_fs(unshare_flags, &new_fs)))
1634                goto bad_unshare_cleanup_thread;
1635        if ((err = unshare_sighand(unshare_flags, &new_sigh)))
1636                goto bad_unshare_cleanup_fs;
1637        if ((err = unshare_vm(unshare_flags, &new_mm)))
1638                goto bad_unshare_cleanup_sigh;
1639        if ((err = unshare_fd(unshare_flags, &new_fd)))
1640                goto bad_unshare_cleanup_vm;
1641        if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
1642                        new_fs)))
1643                goto bad_unshare_cleanup_fd;
1644
1645        if (new_fs ||  new_mm || new_fd || do_sysvsem || new_nsproxy) {
1646                if (do_sysvsem) {
1647                        /*
1648                         * CLONE_SYSVSEM is equivalent to sys_exit().
1649                         */
1650                        exit_sem(current);
1651                }
1652
1653                if (new_nsproxy) {
1654                        switch_task_namespaces(current, new_nsproxy);
1655                        new_nsproxy = NULL;
1656                }
1657
1658                task_lock(current);
1659
1660                if (new_fs) {
1661                        fs = current->fs;
1662                        current->fs = new_fs;
1663                        new_fs = fs;
1664                }
1665
1666                if (new_mm) {
1667                        mm = current->mm;
1668                        active_mm = current->active_mm;
1669                        current->mm = new_mm;
1670                        current->active_mm = new_mm;
1671                        activate_mm(active_mm, new_mm);
1672                        new_mm = mm;
1673                }
1674
1675                if (new_fd) {
1676                        fd = current->files;
1677                        current->files = new_fd;
1678                        new_fd = fd;
1679                }
1680
1681                task_unlock(current);
1682        }
1683
1684        if (new_nsproxy)
1685                put_nsproxy(new_nsproxy);
1686
1687bad_unshare_cleanup_fd:
1688        if (new_fd)
1689                put_files_struct(new_fd);
1690
1691bad_unshare_cleanup_vm:
1692        if (new_mm)
1693                mmput(new_mm);
1694
1695bad_unshare_cleanup_sigh:
1696        if (new_sigh)
1697                if (atomic_dec_and_test(&new_sigh->count))
1698                        kmem_cache_free(sighand_cachep, new_sigh);
1699
1700bad_unshare_cleanup_fs:
1701        if (new_fs)
1702                put_fs_struct(new_fs);
1703
1704bad_unshare_cleanup_thread:
1705bad_unshare_out:
1706        return err;
1707}
1708
1709/*
1710 *      Helper to unshare the files of the current task.
1711 *      We don't want to expose copy_files internals to
1712 *      the exec layer of the kernel.
1713 */
1714
1715int unshare_files(struct files_struct **displaced)
1716{
1717        struct task_struct *task = current;
1718        struct files_struct *copy = NULL;
1719        int error;
1720
1721        error = unshare_fd(CLONE_FILES, &copy);
1722        if (error || !copy) {
1723                *displaced = NULL;
1724                return error;
1725        }
1726        *displaced = task->files;
1727        task_lock(task);
1728        task->files = copy;
1729        task_unlock(task);
1730        return 0;
1731}
1732
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.