linux/kernel/fork.c
<<
>>
Prefs
   1/*
   2 *  linux/kernel/fork.c
   3 *
   4 *  Copyright (C) 1991, 1992  Linus Torvalds
   5 */
   6
   7/*
   8 *  'fork.c' contains the help-routines for the 'fork' system call
   9 * (see also entry.S and others).
  10 * Fork is rather simple, once you get the hang of it, but the memory
  11 * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
  12 */
  13
  14#include <linux/config.h>
  15#include <linux/slab.h>
  16#include <linux/init.h>
  17#include <linux/unistd.h>
  18#include <linux/smp_lock.h>
  19#include <linux/module.h>
  20#include <linux/vmalloc.h>
  21#include <linux/completion.h>
  22#include <linux/namespace.h>
  23#include <linux/personality.h>
  24#include <linux/mempolicy.h>
  25#include <linux/sem.h>
  26#include <linux/file.h>
  27#include <linux/key.h>
  28#include <linux/binfmts.h>
  29#include <linux/mman.h>
  30#include <linux/fs.h>
  31#include <linux/capability.h>
  32#include <linux/cpu.h>
  33#include <linux/cpuset.h>
  34#include <linux/security.h>
  35#include <linux/swap.h>
  36#include <linux/syscalls.h>
  37#include <linux/jiffies.h>
  38#include <linux/futex.h>
  39#include <linux/rcupdate.h>
  40#include <linux/ptrace.h>
  41#include <linux/mount.h>
  42#include <linux/audit.h>
  43#include <linux/profile.h>
  44#include <linux/rmap.h>
  45#include <linux/acct.h>
  46#include <linux/cn_proc.h>
  47
  48#include <asm/pgtable.h>
  49#include <asm/pgalloc.h>
  50#include <asm/uaccess.h>
  51#include <asm/mmu_context.h>
  52#include <asm/cacheflush.h>
  53#include <asm/tlbflush.h>
  54
  55/*
  56 * Counters protected by write_lock_irq(&tasklist_lock)
  57 */
  58unsigned long total_forks;      /* Handle normal Linux uptimes. */
  59int nr_threads;                 /* The idle threads do not count.. */
  60
  61int max_threads;                /* tunable limit on nr_threads; default computed in fork_init() */
  62
    /* Per-CPU counter; nr_processes() adds it up over the online CPUs. */
  63DEFINE_PER_CPU(unsigned long, process_counts) = 0;
  64
  65 __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
  66
  67EXPORT_SYMBOL(tasklist_lock);
  68
  69int nr_processes(void)
  70{
  71        int cpu;
  72        int total = 0;
  73
  74        for_each_online_cpu(cpu)
  75                total += per_cpu(process_counts, cpu);
  76
  77        return total;
  78}
  79
  80#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
    /*
     * Default task_struct allocator: a dedicated slab cache, created in
     * fork_init().  Skipped when the architecture defines
     * __HAVE_ARCH_TASK_STRUCT_ALLOCATOR.
     */
  81# define alloc_task_struct()    kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
  82# define free_task_struct(tsk)  kmem_cache_free(task_struct_cachep, (tsk))
  83static kmem_cache_t *task_struct_cachep;
  84#endif
  85
  86/* SLAB cache for signal_struct structures (tsk->signal) */
  87static kmem_cache_t *signal_cachep;
  88
  89/* SLAB cache for sighand_struct structures (tsk->sighand) */
  90kmem_cache_t *sighand_cachep;
  91
  92/* SLAB cache for files_struct structures (tsk->files) */
  93kmem_cache_t *files_cachep;
  94
  95/* SLAB cache for fs_struct structures (tsk->fs) */
  96kmem_cache_t *fs_cachep;
  97
  98/* SLAB cache for vm_area_struct structures */
  99kmem_cache_t *vm_area_cachep;
 100
 101/* SLAB cache for mm_struct structures (tsk->mm); used via allocate_mm()/free_mm() */
 102static kmem_cache_t *mm_cachep;
 103
 104void free_task(struct task_struct *tsk)
 105{
 106        free_thread_info(tsk->thread_info);
 107        free_task_struct(tsk);
 108}
 109EXPORT_SYMBOL(free_task);
 110
 111void __put_task_struct(struct task_struct *tsk)
 112{
 113        WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)));
 114        WARN_ON(atomic_read(&tsk->usage));
 115        WARN_ON(tsk == current);
 116
 117        security_task_free(tsk);
 118        free_uid(tsk->user);
 119        put_group_info(tsk->group_info);
 120
 121        if (!profile_handoff_task(tsk))
 122                free_task(tsk);
 123}
 124
 125void __init fork_init(unsigned long mempages)
 126{
 127#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
 128#ifndef ARCH_MIN_TASKALIGN
 129#define ARCH_MIN_TASKALIGN      L1_CACHE_BYTES
 130#endif
 131        /* create a slab on which task_structs can be allocated */
 132        task_struct_cachep =
 133                kmem_cache_create("task_struct", sizeof(struct task_struct),
 134                        ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL, NULL);
 135#endif
 136
 137        /*
 138         * The default maximum number of threads is set to a safe
 139         * value: the thread structures can take up at most half
 140         * of memory.
 141         */
 142        max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE);
 143
 144        /*
 145         * we need to allow at least 20 threads to boot a system
 146         */
 147        if(max_threads < 20)
 148                max_threads = 20;
 149
 150        init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
 151        init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
 152        init_task.signal->rlim[RLIMIT_SIGPENDING] =
 153                init_task.signal->rlim[RLIMIT_NPROC];
 154}
 155
 156static struct task_struct *dup_task_struct(struct task_struct *orig)
 157{
 158        struct task_struct *tsk;
 159        struct thread_info *ti;
 160
 161        prepare_to_copy(orig);
 162
 163        tsk = alloc_task_struct();
 164        if (!tsk)
 165                return NULL;
 166
 167        ti = alloc_thread_info(tsk);
 168        if (!ti) {
 169                free_task_struct(tsk);
 170                return NULL;
 171        }
 172
 173        *tsk = *orig;
 174        tsk->thread_info = ti;
 175        setup_thread_stack(tsk, orig);
 176
 177        /* One for us, one for whoever does the "release_task()" (usually parent) */
 178        atomic_set(&tsk->usage,2);
 179        atomic_set(&tsk->fs_excl, 0);
 180        tsk->btrace_seq = 0;
 181        tsk->splice_pipe = NULL;
 182        return tsk;
 183}
 184
 185#ifdef CONFIG_MMU
    /*
     * Duplicate the address-space layout of @oldmm into @mm.  Every vma of
     * oldmm that is not VM_DONTCOPY is cloned, linked into mm's vma list and
     * rbtree, and has its page table entries copied with copy_page_range().
     * Both mmap_sems are held for writing for the whole walk.
     *
     * Returns 0 on success, -ENOMEM when memory accounting, the vma
     * allocation or the policy copy fails, or the error returned by
     * copy_page_range().
     */
 186static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 187{
 188        struct vm_area_struct *mpnt, *tmp, **pprev;
 189        struct rb_node **rb_link, *rb_parent;
 190        int retval;
 191        unsigned long charge;
 192        struct mempolicy *pol;
 193
 194        down_write(&oldmm->mmap_sem);
 195        flush_cache_mm(oldmm);
 196        down_write(&mm->mmap_sem);
 197
            /* Start the new mm with an empty vma list/tree and reset caches. */
 198        mm->locked_vm = 0;
 199        mm->mmap = NULL;
 200        mm->mmap_cache = NULL;
 201        mm->free_area_cache = oldmm->mmap_base;
 202        mm->cached_hole_size = ~0UL;
 203        mm->map_count = 0;
 204        cpus_clear(mm->cpu_vm_mask);
 205        mm->mm_rb = RB_ROOT;
 206        rb_link = &mm->mm_rb.rb_node;
 207        rb_parent = NULL;
 208        pprev = &mm->mmap;
 209
 210        for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
 211                struct file *file;
 212
                    /* Not copied: subtract its pages from the new mm's totals. */
 213                if (mpnt->vm_flags & VM_DONTCOPY) {
 214                        long pages = vma_pages(mpnt);
 215                        mm->total_vm -= pages;
 216                        vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
 217                                                                -pages);
 218                        continue;
 219                }
                    /* charge records what must be un-accounted if we fail below */
 220                charge = 0;
 221                if (mpnt->vm_flags & VM_ACCOUNT) {
 222                        unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
 223                        if (security_vm_enough_memory(len))
 224                                goto fail_nomem;
 225                        charge = len;
 226                }
 227                tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
 228                if (!tmp)
 229                        goto fail_nomem;
 230                *tmp = *mpnt;
 231                pol = mpol_copy(vma_policy(mpnt));
                    /* this retval is replaced by -ENOMEM on the failure path */
 232                retval = PTR_ERR(pol);
 233                if (IS_ERR(pol))
 234                        goto fail_nomem_policy;
 235                vma_set_policy(tmp, pol);
                    /* the copy does not keep VM_LOCKED (locked_vm was zeroed above) */
 236                tmp->vm_flags &= ~VM_LOCKED;
 237                tmp->vm_mm = mm;
 238                tmp->vm_next = NULL;
 239                anon_vma_link(tmp);
 240                file = tmp->vm_file;
 241                if (file) {
 242                        struct inode *inode = file->f_dentry->d_inode;
 243                        get_file(file);
 244                        if (tmp->vm_flags & VM_DENYWRITE)
 245                                atomic_dec(&inode->i_writecount);
 246      
 247                        /* insert tmp into the share list, just after mpnt */
 248                        spin_lock(&file->f_mapping->i_mmap_lock);
 249                        tmp->vm_truncate_count = mpnt->vm_truncate_count;
 250                        flush_dcache_mmap_lock(file->f_mapping);
 251                        vma_prio_tree_add(tmp, mpnt);
 252                        flush_dcache_mmap_unlock(file->f_mapping);
 253                        spin_unlock(&file->f_mapping->i_mmap_lock);
 254                }
 255
 256                /*
 257                 * Link in the new vma and copy the page table entries.
 258                 */
 259                *pprev = tmp;
 260                pprev = &tmp->vm_next;
 261
 262                __vma_link_rb(mm, tmp, rb_link, rb_parent);
 263                rb_link = &tmp->vm_rb.rb_right;
 264                rb_parent = &tmp->vm_rb;
 265
 266                mm->map_count++;
 267                retval = copy_page_range(mm, oldmm, mpnt);
 268
                    /* ->open() runs even if copy_page_range() failed; the error is returned afterwards */
 269                if (tmp->vm_ops && tmp->vm_ops->open)
 270                        tmp->vm_ops->open(tmp);
 271
 272                if (retval)
 273                        goto out;
 274        }
 275        retval = 0;
 276out:
 277        up_write(&mm->mmap_sem);
 278        flush_tlb_mm(oldmm);
 279        up_write(&oldmm->mmap_sem);
 280        return retval;
 281fail_nomem_policy:
 282        kmem_cache_free(vm_area_cachep, tmp);
 283fail_nomem:
 284        retval = -ENOMEM;
            /* undo the accounting charge taken for the vma being copied, if any */
 285        vm_unacct_memory(charge);
 286        goto out;
 287}
 288
 289static inline int mm_alloc_pgd(struct mm_struct * mm)
 290{
 291        mm->pgd = pgd_alloc(mm);
 292        if (unlikely(!mm->pgd))
 293                return -ENOMEM;
 294        return 0;
 295}
 296
 297static inline void mm_free_pgd(struct mm_struct * mm)
 298{
 299        pgd_free(mm->pgd);
 300}
 301#else
 302#define dup_mmap(mm, oldmm)     (0)      /* !CONFIG_MMU: nothing to copy, always "succeeds" */
 303#define mm_alloc_pgd(mm)        (0)      /* !CONFIG_MMU: no pgd is allocated, always "succeeds" */
 304#define mm_free_pgd(mm)                  /* !CONFIG_MMU: expands to nothing */
 305#endif /* CONFIG_MMU */
 306
 307 __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);      /* mmput() takes it to unlink mm->mmlist */
 308
    /* Raw mm_struct allocation/free from mm_cachep; allocate_mm() does not initialise the object. */
 309#define allocate_mm()   (kmem_cache_alloc(mm_cachep, SLAB_KERNEL))
 310#define free_mm(mm)     (kmem_cache_free(mm_cachep, (mm)))
 311
 312#include <linux/init_task.h>
 313
 314static struct mm_struct * mm_init(struct mm_struct * mm)
 315{
 316        atomic_set(&mm->mm_users, 1);
 317        atomic_set(&mm->mm_count, 1);
 318        init_rwsem(&mm->mmap_sem);
 319        INIT_LIST_HEAD(&mm->mmlist);
 320        mm->core_waiters = 0;
 321        mm->nr_ptes = 0;
 322        set_mm_counter(mm, file_rss, 0);
 323        set_mm_counter(mm, anon_rss, 0);
 324        spin_lock_init(&mm->page_table_lock);
 325        rwlock_init(&mm->ioctx_list_lock);
 326        mm->ioctx_list = NULL;
 327        mm->free_area_cache = TASK_UNMAPPED_BASE;
 328        mm->cached_hole_size = ~0UL;
 329
 330        if (likely(!mm_alloc_pgd(mm))) {
 331                mm->def_flags = 0;
 332                return mm;
 333        }
 334        free_mm(mm);
 335        return NULL;
 336}
 337
 338/*
 339 * Allocate and initialize an mm_struct.
 340 */
 341struct mm_struct * mm_alloc(void)
 342{
 343        struct mm_struct * mm;
 344
 345        mm = allocate_mm();
 346        if (mm) {
 347                memset(mm, 0, sizeof(*mm));
 348                mm = mm_init(mm);
 349        }
 350        return mm;
 351}
 352
 353/*
 354 * Called when the last reference to the mm
 355 * is dropped: either by a lazy thread or by
 356 * mmput. Free the page directory and the mm.
 357 */
 358void fastcall __mmdrop(struct mm_struct *mm)
 359{
 360        BUG_ON(mm == &init_mm);
 361        mm_free_pgd(mm);
 362        destroy_context(mm);
 363        free_mm(mm);
 364}
 365
 366/*
 367 * Decrement the use count and release all resources for an mm.
 368 */
 369void mmput(struct mm_struct *mm)
 370{
 371        if (atomic_dec_and_test(&mm->mm_users)) {
 372                exit_aio(mm);
 373                exit_mmap(mm);
 374                if (!list_empty(&mm->mmlist)) {
 375                        spin_lock(&mmlist_lock);
 376                        list_del(&mm->mmlist);
 377                        spin_unlock(&mmlist_lock);
 378                }
 379                put_swap_token(mm);
 380                mmdrop(mm);
 381        }
 382}
 383EXPORT_SYMBOL_GPL(mmput);
 384
 385/**
 386 * get_task_mm - acquire a reference to the task's mm
 387 *
 388 * Returns %NULL if the task has no mm.  Checks PF_BORROWED_MM (meaning
 389 * this kernel workthread has transiently adopted a user mm with use_mm,
 390 * to do its AIO) is not set and if so returns a reference to it, after
 391 * bumping up the use count.  User must release the mm via mmput()
 392 * after use.  Typically used by /proc and ptrace.
 393 */
 394struct mm_struct *get_task_mm(struct task_struct *task)
 395{
 396        struct mm_struct *mm;
 397
 398        task_lock(task);
 399        mm = task->mm;
 400        if (mm) {
 401                if (task->flags & PF_BORROWED_MM)
 402                        mm = NULL;
 403                else
 404                        atomic_inc(&mm->mm_users);
 405        }
 406        task_unlock(task);
 407        return mm;
 408}
 409EXPORT_SYMBOL_GPL(get_task_mm);
 410
 411/* Please note the differences between mmput and mm_release.
 412 * mmput is called whenever we stop holding onto a mm_struct,
 413 * error success whatever.
 414 *
 415 * mm_release is called after a mm_struct has been removed
 416 * from the current process.
 417 *
 418 * This difference is important for error handling, when we
 419 * only half set up a mm_struct for a new process and need to restore
 420 * the old one.  Because we mmput the new mm_struct before
 421 * restoring the old one. . .
 422 * Eric Biederman 10 January 1998
 423 */
 424void mm_release(struct task_struct *tsk, struct mm_struct *mm)
 425{
 426        struct completion *vfork_done = tsk->vfork_done;
 427
 428        /* Get rid of any cached register state */
 429        deactivate_mm(tsk, mm);
 430
 431        /* notify parent sleeping on vfork() */
 432        if (vfork_done) {
 433                tsk->vfork_done = NULL;
 434                complete(vfork_done);
 435        }
 436        if (tsk->clear_child_tid && atomic_read(&mm->mm_users) > 1) {
 437                u32 __user * tidptr = tsk->clear_child_tid;
 438                tsk->clear_child_tid = NULL;
 439
 440                /*
 441                 * We don't check the error code - if userspace has
 442                 * not set up a proper pointer then tough luck.
 443                 */
 444                put_user(0, tidptr);
 445                sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0);
 446        }
 447}
 448
 449/*
 450 * Allocate a new mm structure and copy contents from the
 451 * mm structure of the passed in task structure.
 452 */
 453static struct mm_struct *dup_mm(struct task_struct *tsk)
 454{
 455        struct mm_struct *mm, *oldmm = current->mm;
 456        int err;
 457
 458        if (!oldmm)
 459                return NULL;
 460
 461        mm = allocate_mm();
 462        if (!mm)
 463                goto fail_nomem;
 464
 465        memcpy(mm, oldmm, sizeof(*mm));
 466
 467        if (!mm_init(mm))
 468                goto fail_nomem;
 469
 470        if (init_new_context(tsk, mm))
 471                goto fail_nocontext;
 472
 473        err = dup_mmap(mm, oldmm);
 474        if (err)
 475                goto free_pt;
 476
 477        mm->hiwater_rss = get_mm_rss(mm);
 478        mm->hiwater_vm = mm->total_vm;
 479
 480        return mm;
 481
 482free_pt:
 483        mmput(mm);
 484
 485fail_nomem:
 486        return NULL;
 487
 488fail_nocontext:
 489        /*
 490         * If init_new_context() failed, we cannot use mmput() to free the mm
 491         * because it calls destroy_context()
 492         */
 493        mm_free_pgd(mm);
 494        free_mm(mm);
 495        return NULL;
 496}
 497
 498static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
 499{
 500        struct mm_struct * mm, *oldmm;
 501        int retval;
 502
 503        tsk->min_flt = tsk->maj_flt = 0;
 504        tsk->nvcsw = tsk->nivcsw = 0;
 505
 506        tsk->mm = NULL;
 507        tsk->active_mm = NULL;
 508
 509        /*
 510         * Are we cloning a kernel thread?
 511         *
 512         * We need to steal a active VM for that..
 513         */
 514        oldmm = current->mm;
 515        if (!oldmm)
 516                return 0;
 517
 518        if (clone_flags & CLONE_VM) {
 519                atomic_inc(&oldmm->mm_users);
 520                mm = oldmm;
 521                goto good_mm;
 522        }
 523
 524        retval = -ENOMEM;
 525        mm = dup_mm(tsk);
 526        if (!mm)
 527                goto fail_nomem;
 528
 529good_mm:
 530        tsk->mm = mm;
 531        tsk->active_mm = mm;
 532        return 0;
 533
 534fail_nomem:
 535        return retval;
 536}
 537
 538static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old)
 539{
 540        struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
 541        /* We don't need to lock fs - think why ;-) */
 542        if (fs) {
 543                atomic_set(&fs->count, 1);
 544                rwlock_init(&fs->lock);
 545                fs->umask = old->umask;
 546                read_lock(&old->lock);
 547                fs->rootmnt = mntget(old->rootmnt);
 548                fs->root = dget(old->root);
 549                fs->pwdmnt = mntget(old->pwdmnt);
 550                fs->pwd = dget(old->pwd);
 551                if (old->altroot) {
 552                        fs->altrootmnt = mntget(old->altrootmnt);
 553                        fs->altroot = dget(old->altroot);
 554                } else {
 555                        fs->altrootmnt = NULL;
 556                        fs->altroot = NULL;
 557                }
 558                read_unlock(&old->lock);
 559        }
 560        return fs;
 561}
 562
 563struct fs_struct *copy_fs_struct(struct fs_struct *old)
 564{
 565        return __copy_fs_struct(old);
 566}
 567
 568EXPORT_SYMBOL_GPL(copy_fs_struct);
 569
 570static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
 571{
 572        if (clone_flags & CLONE_FS) {
 573                atomic_inc(&current->fs->count);
 574                return 0;
 575        }
 576        tsk->fs = __copy_fs_struct(current->fs);
 577        if (!tsk->fs)
 578                return -ENOMEM;
 579        return 0;
 580}
 581
 582static int count_open_files(struct fdtable *fdt)
 583{
 584        int size = fdt->max_fdset;
 585        int i;
 586
 587        /* Find the last open fd */
 588        for (i = size/(8*sizeof(long)); i > 0; ) {
 589                if (fdt->open_fds->fds_bits[--i])
 590                        break;
 591        }
 592        i = (i+1) * 8 * sizeof(long);
 593        return i;
 594}
 595
 596static struct files_struct *alloc_files(void)
 597{
 598        struct files_struct *newf;
 599        struct fdtable *fdt;
 600
 601        newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
 602        if (!newf)
 603                goto out;
 604
 605        atomic_set(&newf->count, 1);
 606
 607        spin_lock_init(&newf->file_lock);
 608        newf->next_fd = 0;
 609        fdt = &newf->fdtab;
 610        fdt->max_fds = NR_OPEN_DEFAULT;
 611        fdt->max_fdset = EMBEDDED_FD_SET_SIZE;
 612        fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
 613        fdt->open_fds = (fd_set *)&newf->open_fds_init;
 614        fdt->fd = &newf->fd_array[0];
 615        INIT_RCU_HEAD(&fdt->rcu);
 616        fdt->free_files = NULL;
 617        fdt->next = NULL;
 618        rcu_assign_pointer(newf->fdt, fdt);
 619out:
 620        return newf;
 621}
 622
 623/*
 624 * Allocate a new files structure and copy contents from the
 * passed in files structure.
     *
     * Returns the new files_struct, or NULL on failure.  *errorp is written
     * here only with the result of expand_files(); the caller is expected
     * to preset it for the case where alloc_files() fails.
 626 */
 627static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 628{
 629        struct files_struct *newf;
 630        struct file **old_fds, **new_fds;
 631        int open_files, size, i, expand;
 632        struct fdtable *old_fdt, *new_fdt;
 633
 634        newf = alloc_files();
 635        if (!newf)
 636                goto out;
 637
 638        spin_lock(&oldf->file_lock);
 639        old_fdt = files_fdtable(oldf);
 640        new_fdt = files_fdtable(newf);
 641        size = old_fdt->max_fdset;
            /* rounded up to a whole long of bitmap, see count_open_files() */
 642        open_files = count_open_files(old_fdt);
 643        expand = 0;
 644
 645        /*
 646         * Check whether we need to allocate a larger fd array or fd set.
 647         * Note: we're not a clone task, so the open count won't  change.
 648         */
            /* NOTE(review): zeroing the limits presumably makes expand_files() allocate fresh arrays - confirm */
 649        if (open_files > new_fdt->max_fdset) {
 650                new_fdt->max_fdset = 0;
 651                expand = 1;
 652        }
 653        if (open_files > new_fdt->max_fds) {
 654                new_fdt->max_fds = 0;
 655                expand = 1;
 656        }
 657
 658        /* if the old fdset gets grown now, we'll only copy up to "size" fds */
 659        if (expand) {
                    /* oldf's lock is dropped while newf is expanded under its own lock */
 660                spin_unlock(&oldf->file_lock);
 661                spin_lock(&newf->file_lock);
 662                *errorp = expand_files(newf, open_files-1);
 663                spin_unlock(&newf->file_lock);
 664                if (*errorp < 0)
 665                        goto out_release;
 666                new_fdt = files_fdtable(newf);
 667                /*
 668                 * Reacquire the oldf lock and a pointer to its fd table
 669                 * who knows it may have a new bigger fd table. We need
 670                 * the latest pointer.
 671                 */
 672                spin_lock(&oldf->file_lock);
 673                old_fdt = files_fdtable(oldf);
 674        }
 675
 676        old_fds = old_fdt->fd;
 677        new_fds = new_fdt->fd;
 678
            /* open_files is a whole number of longs worth of bits, so open_files/8 bytes covers them exactly */
 679        memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8);
 680        memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8);
 681
            /* copy the file pointers, taking a reference on each one that is set */
 682        for (i = open_files; i != 0; i--) {
 683                struct file *f = *old_fds++;
 684                if (f) {
 685                        get_file(f);
 686                } else {
 687                        /*
 688                         * The fd may be claimed in the fd bitmap but not yet
 689                         * instantiated in the files array if a sibling thread
 690                         * is partway through open().  So make sure that this
 691                         * fd is available to the new process.
 692                         */
 693                        FD_CLR(open_files - i, new_fdt->open_fds);
 694                }
 695                rcu_assign_pointer(*new_fds++, f);
 696        }
 697        spin_unlock(&oldf->file_lock);
 698
 699        /* compute the remainder to be cleared */
 700        size = (new_fdt->max_fds - open_files) * sizeof(struct file *);
 701
 702        /* This is long word aligned thus could use a optimized version */ 
            /* new_fds now points just past the last copied slot */
 703        memset(new_fds, 0, size); 
 704
            /* clear the tail of both bitmaps beyond the copied part */
 705        if (new_fdt->max_fdset > open_files) {
 706                int left = (new_fdt->max_fdset-open_files)/8;
 707                int start = open_files / (8 * sizeof(unsigned long));
 708
 709                memset(&new_fdt->open_fds->fds_bits[start], 0, left);
 710                memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
 711        }
 712
 713out:
 714        return newf;
 715
 716out_release:
            /*
             * expand_files() failed; oldf->file_lock is not held here.
             * NOTE(review): new_fdt was read before expand_files() ran -
             * confirm free_fdset()/free_fd_array() cope with the state the
             * table is left in on failure.
             */
 717        free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset);
 718        free_fdset (new_fdt->open_fds, new_fdt->max_fdset);
 719        free_fd_array(new_fdt->fd, new_fdt->max_fds);
 720        kmem_cache_free(files_cachep, newf);
 721        return NULL;
 722}
 723
 724static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
 725{
 726        struct files_struct *oldf, *newf;
 727        int error = 0;
 728
 729        /*
 730         * A background process may not have any files ...
 731         */
 732        oldf = current->files;
 733        if (!oldf)
 734                goto out;
 735
 736        if (clone_flags & CLONE_FILES) {
 737                atomic_inc(&oldf->count);
 738                goto out;
 739        }
 740
 741        /*
 742         * Note: we may be using current for both targets (See exec.c)
 743         * This works because we cache current->files (old) as oldf. Don't
 744         * break this.
 745         */
 746        tsk->files = NULL;
 747        error = -ENOMEM;
 748        newf = dup_fd(oldf, &error);
 749        if (!newf)
 750                goto out;
 751
 752        tsk->files = newf;
 753        error = 0;
 754out:
 755        return error;
 756}
 757
 758/*
 759 *      Helper to unshare the files of the current task.
 760 *      We don't want to expose copy_files internals to
 761 *      the exec layer of the kernel.
 762 */
 763
 764int unshare_files(void)
 765{
 766        struct files_struct *files  = current->files;
 767        int rc;
 768
 769        BUG_ON(!files);
 770
 771        /* This can race but the race causes us to copy when we don't
 772           need to and drop the copy */
 773        if(atomic_read(&files->count) == 1)
 774        {
 775                atomic_inc(&files->count);
 776                return 0;
 777        }
 778        rc = copy_files(0, current);
 779        if(rc)
 780                current->files = files;
 781        return rc;
 782}
 783
 784EXPORT_SYMBOL(unshare_files);
 785
 786static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
 787{
 788        struct sighand_struct *sig;
 789
 790        if (clone_flags & (CLONE_SIGHAND | CLONE_THREAD)) {
 791                atomic_inc(&current->sighand->count);
 792                return 0;
 793        }
 794        sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
 795        rcu_assign_pointer(tsk->sighand, sig);
 796        if (!sig)
 797                return -ENOMEM;
 798        atomic_set(&sig->count, 1);
 799        memcpy(sig->action, current->sighand->action, sizeof(sig->action));
 800        return 0;
 801}
 802
 803void __cleanup_sighand(struct sighand_struct *sighand)
 804{
 805        if (atomic_dec_and_test(&sighand->count))
 806                kmem_cache_free(sighand_cachep, sighand);
 807}
 808
 809static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk)
 810{
 811        struct signal_struct *sig;
 812        int ret;
 813
 814        if (clone_flags & CLONE_THREAD) {
 815                atomic_inc(&current->signal->count);
 816                atomic_inc(&current->signal->live);
 817                return 0;
 818        }
 819        sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
 820        tsk->signal = sig;
 821        if (!sig)
 822                return -ENOMEM;
 823
 824        ret = copy_thread_group_keys(tsk);
 825        if (ret < 0) {
 826                kmem_cache_free(signal_cachep, sig);
 827                return ret;
 828        }
 829
 830        atomic_set(&sig->count, 1);
 831        atomic_set(&sig->live, 1);
 832        init_waitqueue_head(&sig->wait_chldexit);
 833        sig->flags = 0;
 834        sig->group_exit_code = 0;
 835        sig->group_exit_task = NULL;
 836        sig->group_stop_count = 0;
 837        sig->curr_target = NULL;
 838        init_sigpending(&sig->shared_pending);
 839        INIT_LIST_HEAD(&sig->posix_timers);
 840
 841        hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL);
 842        sig->it_real_incr.tv64 = 0;
 843        sig->real_timer.function = it_real_fn;
 844        sig->tsk = tsk;
 845
 846        sig->it_virt_expires = cputime_zero;
 847        sig->it_virt_incr = cputime_zero;
 848        sig->it_prof_expires = cputime_zero;
 849        sig->it_prof_incr = cputime_zero;
 850
 851        sig->leader = 0;        /* session leadership doesn't inherit */
 852        sig->tty_old_pgrp = 0;
 853
 854        sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
 855        sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
 856        sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
 857        sig->sched_time = 0;
 858        INIT_LIST_HEAD(&sig->cpu_timers[0]);
 859        INIT_LIST_HEAD(&sig->cpu_timers[1]);
 860        INIT_LIST_HEAD(&sig->cpu_timers[2]);
 861
 862        task_lock(current->group_leader);
 863        memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
 864        task_unlock(current->group_leader);
 865
 866        if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
 867                /*
 868                 * New sole thread in the process gets an expiry time
 869                 * of the whole CPU time limit.
 870                 */
 871                tsk->it_prof_expires =
 872                        secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
 873        }
 874
 875        return 0;
 876}
 877
 878void __cleanup_signal(struct signal_struct *sig)
 879{
 880        exit_thread_group_keys(sig);
 881        kmem_cache_free(signal_cachep, sig);
 882}
 883
 884static inline void cleanup_signal(struct task_struct *tsk)
 885{
 886        struct signal_struct *sig = tsk->signal;
 887
 888        atomic_dec(&sig->live);
 889
 890        if (atomic_dec_and_test(&sig->count))
 891                __cleanup_signal(sig);
 892}
 893
 894static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
 895{
 896        unsigned long new_flags = p->flags;
 897
 898        new_flags &= ~(PF_SUPERPRIV | PF_NOFREEZE);
 899        new_flags |= PF_FORKNOEXEC;
 900        if (!(clone_flags & CLONE_PTRACE))
 901                p->ptrace = 0;
 902        p->flags = new_flags;
 903}
 904
 905asmlinkage long sys_set_tid_address(int __user *tidptr)
 906{
 907        current->clear_child_tid = tidptr;
 908
 909        return current->pid;
 910}
 911
 912/*
 913 * This creates a new process as a copy of the old one,
 914 * but does not actually start it yet.
 915 *
 916 * It copies the registers, and all the appropriate
 917 * parts of the process environment (as per the clone
 918 * flags). The actual kick-off is left to the caller.
 919 */
 920static task_t *copy_process(unsigned long clone_flags,
 921                                 unsigned long stack_start,
 922                                 struct pt_regs *regs,
 923                                 unsigned long stack_size,
 924                                 int __user *parent_tidptr,
 925                                 int __user *child_tidptr,
 926                                 int pid)
 927{
 928        int retval;
 929        struct task_struct *p = NULL;
 930
 931        if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
 932                return ERR_PTR(-EINVAL);
 933
 934        /*
 935         * Thread groups must share signals as well, and detached threads
 936         * can only be started up within the thread group.
 937         */
 938        if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
 939                return ERR_PTR(-EINVAL);
 940
 941        /*
 942         * Shared signal handlers imply shared VM. By way of the above,
 943         * thread groups also imply shared VM. Blocking this case allows
 944         * for various simplifications in other code.
 945         */
 946        if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
 947                return ERR_PTR(-EINVAL);
 948
 949        retval = security_task_create(clone_flags);
 950        if (retval)
 951                goto fork_out;
 952
 953        retval = -ENOMEM;
 954        p = dup_task_struct(current);
 955        if (!p)
 956                goto fork_out;
 957
 958        retval = -EAGAIN;
 959        if (atomic_read(&p->user->processes) >=
 960                        p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
 961                if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
 962                                p->user != &root_user)
 963                        goto bad_fork_free;
 964        }
 965
 966        atomic_inc(&p->user->__count);
 967        atomic_inc(&p->user->processes);
 968        get_group_info(p->group_info);
 969
 970        /*
 971         * If multiple threads are within copy_process(), then this check
 972         * triggers too late. This doesn't hurt, the check is only there
 973         * to stop root fork bombs.
 974         */
 975        if (nr_threads >= max_threads)
 976                goto bad_fork_cleanup_count;
 977
 978        if (!try_module_get(task_thread_info(p)->exec_domain->module))
 979                goto bad_fork_cleanup_count;
 980
 981        if (p->binfmt && !try_module_get(p->binfmt->module))
 982                goto bad_fork_cleanup_put_domain;
 983
 984        p->did_exec = 0;
 985        copy_flags(clone_flags, p);
 986        p->pid = pid;
 987        retval = -EFAULT;
 988        if (clone_flags & CLONE_PARENT_SETTID)
 989                if (put_user(p->pid, parent_tidptr))
 990                        goto bad_fork_cleanup;
 991
 992        p->proc_dentry = NULL;
 993
 994        INIT_LIST_HEAD(&p->children);
 995        INIT_LIST_HEAD(&p->sibling);
 996        p->vfork_done = NULL;
 997        spin_lock_init(&p->alloc_lock);
 998        spin_lock_init(&p->proc_lock);
 999
1000        clear_tsk_thread_flag(p, TIF_SIGPENDING);
1001        init_sigpending(&p->pending);
1002
1003        p->utime = cputime_zero;
1004        p->stime = cputime_zero;
1005        p->sched_time = 0;
1006        p->rchar = 0;           /* I/O counter: bytes read */
1007        p->wchar = 0;           /* I/O counter: bytes written */
1008        p->syscr = 0;           /* I/O counter: read syscalls */
1009        p->syscw = 0;           /* I/O counter: write syscalls */
1010        acct_clear_integrals(p);
1011
1012        p->it_virt_expires = cputime_zero;
1013        p->it_prof_expires = cputime_zero;
1014        p->it_sched_expires = 0;
1015        INIT_LIST_HEAD(&p->cpu_timers[0]);
1016        INIT_LIST_HEAD(&p->cpu_timers[1]);
1017        INIT_LIST_HEAD(&p->cpu_timers[2]);
1018
1019        p->lock_depth = -1;             /* -1 = no lock */
1020        do_posix_clock_monotonic_gettime(&p->start_time);
1021        p->security = NULL;
1022        p->io_context = NULL;
1023        p->io_wait = NULL;
1024        p->audit_context = NULL;
1025        cpuset_fork(p);
1026#ifdef CONFIG_NUMA
1027        p->mempolicy = mpol_copy(p->mempolicy);
1028        if (IS_ERR(p->mempolicy)) {
1029                retval = PTR_ERR(p->mempolicy);
1030                p->mempolicy = NULL;
1031                goto bad_fork_cleanup_cpuset;
1032        }
1033        mpol_fix_fork_child_flag(p);
1034#endif
1035
1036#ifdef CONFIG_DEBUG_MUTEXES
1037        p->blocked_on = NULL; /* not blocked yet */
1038#endif
1039
1040        p->tgid = p->pid;
1041        if (clone_flags & CLONE_THREAD)
1042                p->tgid = current->tgid;
1043
1044        if ((retval = security_task_alloc(p)))
1045                goto bad_fork_cleanup_policy;
1046        if ((retval = audit_alloc(p)))
1047                goto bad_fork_cleanup_security;
1048        /* copy all the process information */
1049        if ((retval = copy_semundo(clone_flags, p)))
1050                goto bad_fork_cleanup_audit;
1051        if ((retval = copy_files(clone_flags, p)))
1052                goto bad_fork_cleanup_semundo;
1053        if ((retval = copy_fs(clone_flags, p)))
1054                goto bad_fork_cleanup_files;
1055        if ((retval = copy_sighand(clone_flags, p)))
1056                goto bad_fork_cleanup_fs;
1057        if ((retval = copy_signal(clone_flags, p)))
1058                goto bad_fork_cleanup_sighand;
1059        if ((retval = copy_mm(clone_flags, p)))
1060                goto bad_fork_cleanup_signal;
1061        if ((retval = copy_keys(clone_flags, p)))
1062                goto bad_fork_cleanup_mm;
1063        if ((retval = copy_namespace(clone_flags, p)))
1064                goto bad_fork_cleanup_keys;
1065        retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
1066        if (retval)
1067                goto bad_fork_cleanup_namespace;
1068
1069        p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
1070        /*
1071         * Clear TID on mm_release()?
1072         */
1073        p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
1074        p->robust_list = NULL;
1075#ifdef CONFIG_COMPAT
1076        p->compat_robust_list = NULL;
1077#endif
1078        /*
1079         * sigaltstack should be cleared when sharing the same VM
1080         */
1081        if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
1082                p->sas_ss_sp = p->sas_ss_size = 0;
1083
1084        /*
1085         * Syscall tracing should be turned off in the child regardless
1086         * of CLONE_PTRACE.
1087         */
1088        clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
1089#ifdef TIF_SYSCALL_EMU
1090        clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
1091#endif
1092
1093        /* Our parent execution domain becomes current domain
1094           These must match for thread signalling to apply */
1095           
1096        p->parent_exec_id = p->self_exec_id;
1097
1098        /* ok, now we should be set up.. */
1099        p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
1100        p->pdeath_signal = 0;
1101        p->exit_state = 0;
1102
1103        /*
1104         * Ok, make it visible to the rest of the system.
1105         * We dont wake it up yet.
1106         */
1107        p->group_leader = p;
1108        INIT_LIST_HEAD(&p->thread_group);
1109        INIT_LIST_HEAD(&p->ptrace_children);
1110        INIT_LIST_HEAD(&p->ptrace_list);
1111
1112        /* Perform scheduler related setup. Assign this task to a CPU. */
1113        sched_fork(p, clone_flags);
1114
1115        /* Need tasklist lock for parent etc handling! */
1116        write_lock_irq(&tasklist_lock);
1117
1118        /*
1119         * The task hasn't been attached yet, so its cpus_allowed mask will
1120         * not be changed, nor will its assigned CPU.
1121         *
1122         * The cpus_allowed mask of the parent may have changed after it was
1123         * copied first time - so re-copy it here, then check the child's CPU
1124         * to ensure it is on a valid CPU (and if not, just force it back to
1125         * parent's CPU). This avoids alot of nasty races.
1126         */
1127        p->cpus_allowed = current->cpus_allowed;
1128        if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
1129                        !cpu_online(task_cpu(p))))
1130                set_task_cpu(p, smp_processor_id());
1131
1132        /* CLONE_PARENT re-uses the old parent */
1133        if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
1134                p->real_parent = current->real_parent;
1135        else
1136                p->real_parent = current;
1137        p->parent = p->real_parent;
1138
1139        spin_lock(&current->sighand->siglock);
1140
1141        /*
1142         * Process group and session signals need to be delivered to just the
1143         * parent before the fork or both the parent and the child after the
1144         * fork. Restart if a signal comes in before we add the new process to
1145         * it's process group.
1146         * A fatal signal pending means that current will exit, so the new
1147         * thread can't slip out of an OOM kill (or normal SIGKILL).
1148         */
1149        recalc_sigpending();
1150        if (signal_pending(current)) {
1151                spin_unlock(&current->sighand->siglock);
1152                write_unlock_irq(&tasklist_lock);
1153                retval = -ERESTARTNOINTR;
1154                goto bad_fork_cleanup_namespace;
1155        }
1156
1157        if (clone_flags & CLONE_THREAD) {
1158                /*
1159                 * Important: if an exit-all has been started then
1160                 * do not create this new thread - the whole thread
1161                 * group is supposed to exit anyway.
1162                 */
1163                if (current->signal->flags & SIGNAL_GROUP_EXIT) {
1164                        spin_unlock(&current->sighand->siglock);
1165                        write_unlock_irq(&tasklist_lock);
1166                        retval = -EAGAIN;
1167                        goto bad_fork_cleanup_namespace;
1168                }
1169
1170                p->group_leader = current->group_leader;
1171                list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
1172
1173                if (!cputime_eq(current->signal->it_virt_expires,
1174                                cputime_zero) ||
1175                    !cputime_eq(current->signal->it_prof_expires,
1176                                cputime_zero) ||
1177                    current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY ||
1178                    !list_empty(&current->signal->cpu_timers[0]) ||
1179                    !list_empty(&current->signal->cpu_timers[1]) ||
1180                    !list_empty(&current->signal->cpu_timers[2])) {
1181                        /*
1182                         * Have child wake up on its first tick to check
1183                         * for process CPU timers.
1184                         */
1185                        p->it_prof_expires = jiffies_to_cputime(1);
1186                }
1187        }
1188
1189        /*
1190         * inherit ioprio
1191         */
1192        p->ioprio = current->ioprio;
1193
1194        if (likely(p->pid)) {
1195                add_parent(p);
1196                if (unlikely(p->ptrace & PT_PTRACED))
1197                        __ptrace_link(p, current->parent);
1198
1199                if (thread_group_leader(p)) {
1200                        p->signal->tty = current->signal->tty;
1201                        p->signal->pgrp = process_group(current);
1202                        p->signal->session = current->signal->session;
1203                        attach_pid(p, PIDTYPE_PGID, process_group(p));
1204                        attach_pid(p, PIDTYPE_SID, p->signal->session);
1205
1206                        list_add_tail_rcu(&p->tasks, &init_task.tasks);
1207                        __get_cpu_var(process_counts)++;
1208                }
1209                attach_pid(p, PIDTYPE_PID, p->pid);
1210                nr_threads++;
1211        }
1212
1213        total_forks++;
1214        spin_unlock(&current->sighand->siglock);
1215        write_unlock_irq(&tasklist_lock);
1216        proc_fork_connector(p);
1217        return p;
1218
1219bad_fork_cleanup_namespace:
1220        exit_namespace(p);
1221bad_fork_cleanup_keys:
1222        exit_keys(p);
1223bad_fork_cleanup_mm:
1224        if (p->mm)
1225                mmput(p->mm);
1226bad_fork_cleanup_signal:
1227        cleanup_signal(p);
1228bad_fork_cleanup_sighand:
1229        __cleanup_sighand(p->sighand);
1230bad_fork_cleanup_fs:
1231        exit_fs(p); /* blocking */
1232bad_fork_cleanup_files:
1233        exit_files(p); /* blocking */
1234bad_fork_cleanup_semundo:
1235        exit_sem(p);
1236bad_fork_cleanup_audit:
1237        audit_free(p);
1238bad_fork_cleanup_security:
1239        security_task_free(p);
1240bad_fork_cleanup_policy:
1241#ifdef CONFIG_NUMA
1242        mpol_free(p->mempolicy);
1243bad_fork_cleanup_cpuset:
1244#endif
1245        cpuset_exit(p);
1246bad_fork_cleanup:
1247        if (p->binfmt)
1248                module_put(p->binfmt->module);
1249bad_fork_cleanup_put_domain:
1250        module_put(task_thread_info(p)->exec_domain->module);
1251bad_fork_cleanup_count:
1252        put_group_info(p->group_info);
1253        atomic_dec(&p->user->processes);
1254        free_uid(p->user);
1255bad_fork_free:
1256        free_task(p);
1257fork_out:
1258        return ERR_PTR(retval);
1259}
1260
1261struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
1262{
1263        memset(regs, 0, sizeof(struct pt_regs));
1264        return regs;
1265}
1266
1267task_t * __devinit fork_idle(int cpu)
1268{
1269        task_t *task;
1270        struct pt_regs regs;
1271
1272        task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL, 0);
1273        if (!task)
1274                return ERR_PTR(-ENOMEM);
1275        init_idle(task, cpu);
1276
1277        return task;
1278}
1279
1280static inline int fork_traceflag (unsigned clone_flags)
1281{
1282        if (clone_flags & CLONE_UNTRACED)
1283                return 0;
1284        else if (clone_flags & CLONE_VFORK) {
1285                if (current->ptrace & PT_TRACE_VFORK)
1286                        return PTRACE_EVENT_VFORK;
1287        } else if ((clone_flags & CSIGNAL) != SIGCHLD) {
1288                if (current->ptrace & PT_TRACE_CLONE)
1289                        return PTRACE_EVENT_CLONE;
1290        } else if (current->ptrace & PT_TRACE_FORK)
1291                return PTRACE_EVENT_FORK;
1292
1293        return 0;
1294}
1295
1296/*
1297 *  Ok, this is the main fork-routine.
1298 *
1299 * It copies the process, and if successful kick-starts
1300 * it and waits for it to finish using the VM if required.
1301 */
1302long do_fork(unsigned long clone_flags,
1303              unsigned long stack_start,
1304              struct pt_regs *regs,
1305              unsigned long stack_size,
1306              int __user *parent_tidptr,
1307              int __user *child_tidptr)
1308{
1309        struct task_struct *p;
1310        int trace = 0;
1311        struct pid *pid = alloc_pid();
1312        long nr;
1313
1314        if (!pid)
1315                return -EAGAIN;
1316        nr = pid->nr;
1317        if (unlikely(current->ptrace)) {
1318                trace = fork_traceflag (clone_flags);
1319                if (trace)
1320                        clone_flags |= CLONE_PTRACE;
1321        }
1322
1323        p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, nr);
1324        /*
1325         * Do this prior waking up the new thread - the thread pointer
1326         * might get invalid after that point, if the thread exits quickly.
1327         */
1328        if (!IS_ERR(p)) {
1329                struct completion vfork;
1330
1331                if (clone_flags & CLONE_VFORK) {
1332                        p->vfork_done = &vfork;
1333                        init_completion(&vfork);
1334                }
1335
1336                if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {
1337                        /*
1338                         * We'll start up with an immediate SIGSTOP.
1339                         */
1340                        sigaddset(&p->pending.signal, SIGSTOP);
1341                        set_tsk_thread_flag(p, TIF_SIGPENDING);
1342                }
1343
1344                if (!(clone_flags & CLONE_STOPPED))
1345                        wake_up_new_task(p, clone_flags);
1346                else
1347                        p->state = TASK_STOPPED;
1348
1349                if (unlikely (trace)) {
1350                        current->ptrace_message = nr;
1351                        ptrace_notify ((trace << 8) | SIGTRAP);
1352                }
1353
1354                if (clone_flags & CLONE_VFORK) {
1355                        wait_for_completion(&vfork);
1356                        if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE))
1357                                ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
1358                }
1359        } else {
1360                free_pid(pid);
1361                nr = PTR_ERR(p);
1362        }
1363        return nr;
1364}
1365
/* Default to no extra mm_struct alignment unless one is already defined. */
#if !defined(ARCH_MIN_MMSTRUCT_ALIGN)
#define ARCH_MIN_MMSTRUCT_ALIGN 0
#endif
1369
1370static void sighand_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
1371{
1372        struct sighand_struct *sighand = data;
1373
1374        if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
1375                                        SLAB_CTOR_CONSTRUCTOR)
1376                spin_lock_init(&sighand->siglock);
1377}
1378
1379void __init proc_caches_init(void)
1380{
1381        sighand_cachep = kmem_cache_create("sighand_cache",
1382                        sizeof(struct sighand_struct), 0,
1383                        SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU,
1384                        sighand_ctor, NULL);
1385        signal_cachep = kmem_cache_create("signal_cache",
1386                        sizeof(struct signal_struct), 0,
1387                        SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
1388        files_cachep = kmem_cache_create("files_cache", 
1389                        sizeof(struct files_struct), 0,
1390                        SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
1391        fs_cachep = kmem_cache_create("fs_cache", 
1392                        sizeof(struct fs_struct), 0,
1393                        SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
1394        vm_area_cachep = kmem_cache_create("vm_area_struct",
1395                        sizeof(struct vm_area_struct), 0,
1396                        SLAB_PANIC, NULL, NULL);
1397        mm_cachep = kmem_cache_create("mm_struct",
1398                        sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
1399                        SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
1400}
1401
1402
1403/*
1404 * Check constraints on flags passed to the unshare system call and
1405 * force unsharing of additional process context as appropriate.
1406 */
1407static inline void check_unshare_flags(unsigned long *flags_ptr)
1408{
1409        /*
1410         * If unsharing a thread from a thread group, must also
1411         * unshare vm.
1412         */
1413        if (*flags_ptr & CLONE_THREAD)
1414                *flags_ptr |= CLONE_VM;
1415
1416        /*
1417         * If unsharing vm, must also unshare signal handlers.
1418         */
1419        if (*flags_ptr & CLONE_VM)
1420                *flags_ptr |= CLONE_SIGHAND;
1421
1422        /*
1423         * If unsharing signal handlers and the task was created
1424         * using CLONE_THREAD, then must unshare the thread
1425         */
1426        if ((*flags_ptr & CLONE_SIGHAND) &&
1427            (atomic_read(&current->signal->count) > 1))
1428                *flags_ptr |= CLONE_THREAD;
1429
1430        /*
1431         * If unsharing namespace, must also unshare filesystem information.
1432         */
1433        if (*flags_ptr & CLONE_NEWNS)
1434                *flags_ptr |= CLONE_FS;
1435}
1436
1437/*
1438 * Unsharing of tasks created with CLONE_THREAD is not supported yet
1439 */
1440static int unshare_thread(unsigned long unshare_flags)
1441{
1442        if (unshare_flags & CLONE_THREAD)
1443                return -EINVAL;
1444
1445        return 0;
1446}
1447
1448/*
1449 * Unshare the filesystem structure if it is being shared
1450 */
1451static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
1452{
1453        struct fs_struct *fs = current->fs;
1454
1455        if ((unshare_flags & CLONE_FS) &&
1456            (fs && atomic_read(&fs->count) > 1)) {
1457                *new_fsp = __copy_fs_struct(current->fs);
1458                if (!*new_fsp)
1459                        return -ENOMEM;
1460        }
1461
1462        return 0;
1463}
1464
1465/*
1466 * Unshare the namespace structure if it is being shared
1467 */
1468static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs)
1469{
1470        struct namespace *ns = current->namespace;
1471
1472        if ((unshare_flags & CLONE_NEWNS) &&
1473            (ns && atomic_read(&ns->count) > 1)) {
1474                if (!capable(CAP_SYS_ADMIN))
1475                        return -EPERM;
1476
1477                *new_nsp = dup_namespace(current, new_fs ? new_fs : current->fs);
1478                if (!*new_nsp)
1479                        return -ENOMEM;
1480        }
1481
1482        return 0;
1483}
1484
1485/*
1486 * Unsharing of sighand for tasks created with CLONE_SIGHAND is not
1487 * supported yet
1488 */
1489static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
1490{
1491        struct sighand_struct *sigh = current->sighand;
1492
1493        if ((unshare_flags & CLONE_SIGHAND) &&
1494            (sigh && atomic_read(&sigh->count) > 1))
1495                return -EINVAL;
1496        else
1497                return 0;
1498}
1499
1500/*
1501 * Unshare vm if it is being shared
1502 */
1503static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp)
1504{
1505        struct mm_struct *mm = current->mm;
1506
1507        if ((unshare_flags & CLONE_VM) &&
1508            (mm && atomic_read(&mm->mm_users) > 1)) {
1509                return -EINVAL;
1510        }
1511
1512        return 0;
1513}
1514
1515/*
1516 * Unshare file descriptor table if it is being shared
1517 */
1518static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
1519{
1520        struct files_struct *fd = current->files;
1521        int error = 0;
1522
1523        if ((unshare_flags & CLONE_FILES) &&
1524            (fd && atomic_read(&fd->count) > 1)) {
1525                *new_fdp = dup_fd(fd, &error);
1526                if (!*new_fdp)
1527                        return error;
1528        }
1529
1530        return 0;
1531}
1532
1533/*
1534 * Unsharing of semundo for tasks created with CLONE_SYSVSEM is not
1535 * supported yet
1536 */
1537static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **new_ulistp)
1538{
1539        if (unshare_flags & CLONE_SYSVSEM)
1540                return -EINVAL;
1541
1542        return 0;
1543}
1544
1545/*
1546 * unshare allows a process to 'unshare' part of the process
1547 * context which was originally shared using clone.  copy_*
1548 * functions used by do_fork() cannot be used here directly
1549 * because they modify an inactive task_struct that is being
1550 * constructed. Here we are modifying the current, active,
1551 * task_struct.
1552 */
1553asmlinkage long sys_unshare(unsigned long unshare_flags)
1554{
1555        int err = 0;
1556        struct fs_struct *fs, *new_fs = NULL;
1557        struct namespace *ns, *new_ns = NULL;
1558        struct sighand_struct *sigh, *new_sigh = NULL;
1559        struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
1560        struct files_struct *fd, *new_fd = NULL;
1561        struct sem_undo_list *new_ulist = NULL;
1562
1563        check_unshare_flags(&unshare_flags);
1564
1565        /* Return -EINVAL for all unsupported flags */
1566        err = -EINVAL;
1567        if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1568                                CLONE_VM|CLONE_FILES|CLONE_SYSVSEM))
1569                goto bad_unshare_out;
1570
1571        if ((err = unshare_thread(unshare_flags)))
1572                goto bad_unshare_out;
1573        if ((err = unshare_fs(unshare_flags, &new_fs)))
1574                goto bad_unshare_cleanup_thread;
1575        if ((err = unshare_namespace(unshare_flags, &new_ns, new_fs)))
1576                goto bad_unshare_cleanup_fs;
1577        if ((err = unshare_sighand(unshare_flags, &new_sigh)))
1578                goto bad_unshare_cleanup_ns;
1579        if ((err = unshare_vm(unshare_flags, &new_mm)))
1580                goto bad_unshare_cleanup_sigh;
1581        if ((err = unshare_fd(unshare_flags, &new_fd)))
1582                goto bad_unshare_cleanup_vm;
1583        if ((err = unshare_semundo(unshare_flags, &new_ulist)))
1584                goto bad_unshare_cleanup_fd;
1585
1586        if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist) {
1587
1588                task_lock(current);
1589
1590                if (new_fs) {
1591                        fs = current->fs;
1592                        current->fs = new_fs;
1593                        new_fs = fs;
1594                }
1595
1596                if (new_ns) {
1597                        ns = current->namespace;
1598                        current->namespace = new_ns;
1599                        new_ns = ns;
1600                }
1601
1602                if (new_sigh) {
1603                        sigh = current->sighand;
1604                        rcu_assign_pointer(current->sighand, new_sigh);
1605                        new_sigh = sigh;
1606                }
1607
1608                if (new_mm) {
1609                        mm = current->mm;
1610                        active_mm = current->active_mm;
1611                        current->mm = new_mm;
1612                        current->active_mm = new_mm;
1613                        activate_mm(active_mm, new_mm);
1614                        new_mm = mm;
1615                }
1616
1617                if (new_fd) {
1618                        fd = current->files;
1619                        current->files = new_fd;
1620                        new_fd = fd;
1621                }
1622
1623                task_unlock(current);
1624        }
1625
1626bad_unshare_cleanup_fd:
1627        if (new_fd)
1628                put_files_struct(new_fd);
1629
1630bad_unshare_cleanup_vm:
1631        if (new_mm)
1632                mmput(new_mm);
1633
1634bad_unshare_cleanup_sigh:
1635        if (new_sigh)
1636                if (atomic_dec_and_test(&new_sigh->count))
1637                        kmem_cache_free(sighand_cachep, new_sigh);
1638
1639bad_unshare_cleanup_ns:
1640        if (new_ns)
1641                put_namespace(new_ns);
1642
1643bad_unshare_cleanup_fs:
1644        if (new_fs)
1645                put_fs_struct(new_fs);
1646
1647bad_unshare_cleanup_thread:
1648bad_unshare_out:
1649        return err;
1650}
1651
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.