linux-old/kernel/fork.c
<<
>>
Prefs
   1/*
   2 *  linux/kernel/fork.c
   3 *
   4 *  Copyright (C) 1991, 1992  Linus Torvalds
   5 */
   6
   7/*
   8 *  'fork.c' contains the help-routines for the 'fork' system call
   9 * (see also system_call.s).
  10 * Fork is rather simple, once you get the hang of it, but the memory
  11 * management can be a bitch. See 'mm/mm.c': 'copy_page_tables()'
  12 */
  13
  14#include <linux/init.h>
  15#include <linux/errno.h>
  16#include <linux/sched.h>
  17#include <linux/kernel.h>
  18#include <linux/mm.h>
  19#include <linux/slab.h>
  20#include <linux/unistd.h>
  21#include <linux/ptrace.h>
  22#include <linux/malloc.h>
  23#include <linux/smp.h>
  24#include <linux/smp_lock.h>
  25#include <linux/module.h>
  26
  27#include <asm/system.h>
  28#include <asm/pgtable.h>
  29#include <asm/mmu_context.h>
  30#include <asm/uaccess.h>
  31
/*
 * Task bookkeeping shared with the rest of the kernel.
 *
 * nr_tasks is incremented by do_fork() once a new task has been linked in
 * and decremented again on its failure path.
 */
int nr_tasks=1;
/* Not modified anywhere in this file; presumably maintained by the
 * scheduler -- TODO confirm. */
int nr_running=1;
/* Incremented by do_fork() after each successful fork. */
unsigned long int total_forks=0;        /* Handle normal Linux uptimes. */
/* Most recently allocated pid; advanced by get_pid(). */
int last_pid=0;

/* SLAB cache for mm_struct's. */
kmem_cache_t *mm_cachep;

/* SLAB cache for files structs */
kmem_cache_t *files_cachep; 

/* Pid hash table.  do_fork() adds and removes tasks through hash_pid()
 * and unhash_pid(), which are defined outside this file. */
struct task_struct *pidhash[PIDHASH_SZ];
spinlock_t pidhash_lock = SPIN_LOCK_UNLOCKED;

/* Presumably the list of unused task[] slots consumed by
 * get_free_taskslot() and refilled by add_free_taskslot() -- both are
 * defined elsewhere, so confirm there. */
struct task_struct **tarray_freelist = NULL;
spinlock_t taskslot_lock = SPIN_LOCK_UNLOCKED;
  48
/* UID task count cache, to prevent walking entire process list every
 * single fork() operation.
 */
#define UIDHASH_SZ      (PIDHASH_SZ >> 2)

/*
 * One entry per uid that currently has a charged task count.  Entries are
 * chained per hash bucket in uidhash[]:
 *   next       - following entry in the same bucket, or NULL
 *   pprev      - address of the pointer that points at this entry (either
 *                the bucket head or the previous entry's 'next'), which
 *                lets uid_hash_remove() unlink without a bucket lookup
 *   uid        - the user id this entry accounts for
 *   task_count - running total adjusted by charge_uid()
 */
static struct uid_taskcount {
        struct uid_taskcount *next, **pprev;
        unsigned short uid;
        int task_count;
} *uidhash[UIDHASH_SZ];

/*
 * Taken around every uidhash[] walk or update below.
 * NOTE(review): the lock only exists on __SMP__ builds although it is
 * referenced unconditionally; presumably spin_lock() discards its argument
 * on uniprocessor builds -- confirm.
 */
#ifdef __SMP__
static spinlock_t uidhash_lock = SPIN_LOCK_UNLOCKED;
#endif

/* SLAB cache for struct uid_taskcount, created by uidcache_init(). */
kmem_cache_t *uid_cachep;
  65
  66#define uidhashfn(uid)  (((uid >> 8) ^ uid) & (UIDHASH_SZ - 1))
  67
  68static inline void uid_hash_insert(struct uid_taskcount *up, unsigned int hashent)
  69{
  70        spin_lock(&uidhash_lock);
  71        if((up->next = uidhash[hashent]) != NULL)
  72                uidhash[hashent]->pprev = &up->next;
  73        up->pprev = &uidhash[hashent];
  74        uidhash[hashent] = up;
  75        spin_unlock(&uidhash_lock);
  76}
  77
  78static inline void uid_hash_remove(struct uid_taskcount *up)
  79{
  80        spin_lock(&uidhash_lock);
  81        if(up->next)
  82                up->next->pprev = up->pprev;
  83        *up->pprev = up->next;
  84        spin_unlock(&uidhash_lock);
  85}
  86
  87static inline struct uid_taskcount *uid_find(unsigned short uid, unsigned int hashent)
  88{
  89        struct uid_taskcount *up;
  90
  91        spin_lock(&uidhash_lock);
  92        for(up = uidhash[hashent]; (up && up->uid != uid); up = up->next)
  93                ;
  94        spin_unlock(&uidhash_lock);
  95        return up;
  96}
  97
  98int charge_uid(struct task_struct *p, int count)
  99{
 100        unsigned int hashent = uidhashfn(p->uid);
 101        struct uid_taskcount *up = uid_find(p->uid, hashent);
 102
 103        if(up) {
 104                int limit = p->rlim[RLIMIT_NPROC].rlim_cur;
 105                int newcnt = up->task_count + count;
 106
 107                if(newcnt > limit)
 108                        return -EAGAIN;
 109                else if(newcnt == 0) {
 110                        uid_hash_remove(up);
 111                        kmem_cache_free(uid_cachep, up);
 112                        return 0;
 113                }
 114        } else {
 115                up = kmem_cache_alloc(uid_cachep, SLAB_KERNEL);
 116                if(!up)
 117                        return -EAGAIN;
 118                up->uid = p->uid;
 119                up->task_count = 0;
 120                uid_hash_insert(up, hashent);
 121        }
 122        up->task_count += count;
 123        return 0;
 124}
 125
 126__initfunc(void uidcache_init(void))
 127{
 128        int i;
 129
 130        uid_cachep = kmem_cache_create("uid_cache", sizeof(struct uid_taskcount),
 131                                       0,
 132                                       SLAB_HWCACHE_ALIGN, NULL, NULL);
 133        if(!uid_cachep)
 134                panic("Cannot create uid taskcount SLAB cache\n");
 135
 136        for(i = 0; i < UIDHASH_SZ; i++)
 137                uidhash[i] = 0;
 138}
 139
 140static inline int find_empty_process(void)
 141{
 142        struct task_struct **tslot;
 143
 144        if(current->uid) {
 145                int error;
 146
 147                if(nr_tasks >= NR_TASKS - MIN_TASKS_LEFT_FOR_ROOT)
 148                        return -EAGAIN;
 149                if((error = charge_uid(current, 1)) < 0)
 150                        return error;
 151        }
 152        tslot = get_free_taskslot();
 153        if(tslot)
 154                return tslot - &task[0];
 155        return -EAGAIN;
 156}
 157
#ifdef __SMP__
/* Protects next_safe and last_pid. */
static spinlock_t lastpid_lock = SPIN_LOCK_UNLOCKED;
#endif

/*
 * Pick the pid for a new task.
 *
 * With CLONE_PID set the child simply reuses current->pid.  Otherwise
 * last_pid is advanced to the next value that no task on the task list is
 * using as its pid, process group or session, and that value is returned.
 *
 * next_safe caches the smallest in-use pid/pgrp/session value above
 * last_pid that the most recent scan found, so the task list only has to
 * be walked again once last_pid reaches it (or wraps).
 */
static int get_pid(unsigned long flags)
{
        static int next_safe = PID_MAX;
        struct task_struct *p;

        if (flags & CLONE_PID)
                return current->pid;

        spin_lock(&lastpid_lock);
        /* Any bit above the low 15 set means we ran past 0x7fff: wrap. */
        if((++last_pid) & 0xffff8000) {
                last_pid = 300;         /* Skip daemons etc. */
                goto inside;
        }
        if(last_pid >= next_safe) {
inside:
                /* The cached bound is stale; rebuild it with a full scan. */
                next_safe = PID_MAX;
                read_lock(&tasklist_lock);
        repeat:
                for_each_task(p) {
                        /* Candidate is taken: try the next value. */
                        if(p->pid == last_pid   ||
                           p->pgrp == last_pid  ||
                           p->session == last_pid) {
                                /* Stepping onto or past the known bound (or
                                 * wrapping) invalidates it: start the scan over. */
                                if(++last_pid >= next_safe) {
                                        if(last_pid & 0xffff8000)
                                                last_pid = 300;
                                        next_safe = PID_MAX;
                                        goto repeat;
                                }
                        }
                        /* Track the lowest in-use value above the candidate. */
                        if(p->pid > last_pid && next_safe > p->pid)
                                next_safe = p->pid;
                        if(p->pgrp > last_pid && next_safe > p->pgrp)
                                next_safe = p->pgrp;
                        if(p->session > last_pid && next_safe > p->session)
                                next_safe = p->session;
                }
                read_unlock(&tasklist_lock);
        }
        spin_unlock(&lastpid_lock);

        /* NOTE(review): last_pid is read here after lastpid_lock has been
         * dropped; the only caller in this file, do_fork(), runs between
         * lock_kernel() and unlock_kernel() -- confirm that is what makes
         * this safe. */
        return last_pid;
}
 205
/*
 * Give 'mm' its own copy of every vm_area_struct on current->mm's mmap
 * list, copying the page ranges as it goes.
 *
 * Returns 0 on success.  On failure returns -ENOMEM (vma allocation) or
 * the error from copy_page_range(); any vmas built so far stay linked on
 * mm->mmap so the caller can tear them down.
 */
static inline int dup_mmap(struct mm_struct * mm)
{
        struct vm_area_struct * mpnt, *tmp, **pprev;
        int retval;

        mm->mmap = mm->mmap_cache = NULL;
        flush_cache_mm(current->mm);
        /* pprev always addresses the tail pointer of the new list. */
        pprev = &mm->mmap;
        for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
                struct dentry *dentry;

                retval = -ENOMEM;
                tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
                if (!tmp)
                        goto fail_nomem;
                /* Start from a bitwise copy of the parent's vma, then fix
                 * up the fields that must differ in the child. */
                *tmp = *mpnt;
                tmp->vm_flags &= ~VM_LOCKED;
                tmp->vm_mm = mm;
                tmp->vm_next = NULL;
                dentry = tmp->vm_dentry;
                if (dentry) {
                        /* The copy holds its own reference on the dentry. */
                        dget(dentry);
                        if (tmp->vm_flags & VM_DENYWRITE)
                                dentry->d_inode->i_writecount--;
      
                        /* insert tmp into the share list, just after mpnt */
                        if((tmp->vm_next_share = mpnt->vm_next_share) != NULL)
                                mpnt->vm_next_share->vm_pprev_share =
                                        &tmp->vm_next_share;
                        mpnt->vm_next_share = tmp;
                        tmp->vm_pprev_share = &mpnt->vm_next_share;
                }

                /* Copy the pages, but defer checking for errors */
                retval = copy_page_range(mm, current->mm, tmp);
                /* Only announce the new mapping to its owner if the copy worked. */
                if (!retval && tmp->vm_ops && tmp->vm_ops->open)
                        tmp->vm_ops->open(tmp);

                /*
                 * Link in the new vma even if an error occurred,
                 * so that exit_mmap() can clean up the mess.
                 */
                if((tmp->vm_next = *pprev) != NULL)
                        (*pprev)->vm_pprev = &tmp->vm_next;
                *pprev = tmp;
                tmp->vm_pprev = pprev;

                pprev = &tmp->vm_next;
                /* Now act on the deferred copy_page_range() result. */
                if (retval)
                        goto fail_nomem;
        }
        flush_tlb_mm(current->mm);
        return 0;

fail_nomem:
        /* The parent's TLB is flushed on the error path as well. */
        flush_tlb_mm(current->mm);
        return retval;
}
 264
 265/*
 266 * Allocate and initialize an mm_struct.
 267 */
 268struct mm_struct * mm_alloc(void)
 269{
 270        struct mm_struct * mm;
 271
 272        mm = kmem_cache_alloc(mm_cachep, SLAB_KERNEL);
 273        if (mm) {
 274                *mm = *current->mm;
 275                init_new_context(mm);
 276                mm->count = 1;
 277                mm->def_flags = 0;
 278                mm->mmap_sem = MUTEX;
 279                mm->pgd = NULL;
 280                mm->mmap = mm->mmap_cache = NULL;
 281
 282                /* It has not run yet, so cannot be present in anyone's
 283                 * cache or tlb.
 284                 */
 285                mm->cpu_vm_mask = 0;
 286        }
 287        return mm;
 288}
 289
 290/*
 291 * Decrement the use count and release all resources for an mm.
 292 */
 293void mmput(struct mm_struct *mm)
 294{
 295        if (!--mm->count) {
 296                exit_mmap(mm);
 297                free_page_tables(mm);
 298                kmem_cache_free(mm_cachep, mm);
 299        }
 300}
 301
 302static inline int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
 303{
 304        struct mm_struct * mm;
 305        int retval;
 306
 307        if (clone_flags & CLONE_VM) {
 308                mmget(current->mm);
 309                SET_PAGE_DIR(tsk, current->mm->pgd);
 310                return 0;
 311        }
 312
 313        retval = -ENOMEM;
 314        mm = mm_alloc();
 315        if (!mm)
 316                goto fail_nomem;
 317
 318        tsk->mm = mm;
 319        tsk->min_flt = tsk->maj_flt = 0;
 320        tsk->cmin_flt = tsk->cmaj_flt = 0;
 321        tsk->nswap = tsk->cnswap = 0;
 322        retval = new_page_tables(tsk);
 323        if (retval)
 324                goto free_mm;
 325        retval = dup_mmap(mm);
 326        if (retval)
 327                goto free_mm;
 328        return 0;
 329
 330free_mm:
 331        tsk->mm = NULL;
 332        mmput(mm);
 333fail_nomem:
 334        return retval;
 335}
 336
 337static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
 338{
 339        if (clone_flags & CLONE_FS) {
 340                current->fs->count++;
 341                return 0;
 342        }
 343        tsk->fs = kmalloc(sizeof(*tsk->fs), GFP_KERNEL);
 344        if (!tsk->fs)
 345                return -1;
 346        tsk->fs->count = 1;
 347        tsk->fs->umask = current->fs->umask;
 348        tsk->fs->root = dget(current->fs->root);
 349        tsk->fs->pwd = dget(current->fs->pwd);
 350        return 0;
 351}
 352
 353/* return value is only accurate by +-sizeof(long)*8 fds */ 
 354/* XXX make this architecture specific */
 355static inline int __copy_fdset(unsigned long *d, unsigned long *src)
 356{
 357        int i; 
 358        unsigned long *p = src; 
 359        unsigned long *max = src; 
 360
 361        for (i = __FDSET_LONGS; i; --i) {
 362                if ((*d++ = *p++) != 0) 
 363                        max = p; 
 364        }
 365        return (max - src)*sizeof(long)*8; 
 366}
 367
 368static inline int copy_fdset(fd_set *dst, fd_set *src)
 369{
 370        return __copy_fdset(dst->fds_bits, src->fds_bits);  
 371}
 372
 373static inline int copy_files(unsigned long clone_flags, struct task_struct * tsk)
 374{
 375        int i;  
 376        struct files_struct *oldf, *newf;
 377        struct file **old_fds, **new_fds;
 378
 379        oldf = current->files;
 380        if (clone_flags & CLONE_FILES) {
 381                oldf->count++;
 382                return 0;
 383        }
 384
 385        newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
 386        tsk->files = newf;
 387        if (!newf) 
 388                return -1;
 389
 390        newf->count = 1;
 391        newf->close_on_exec = oldf->close_on_exec;
 392        i = copy_fdset(&newf->open_fds,&oldf->open_fds);
 393
 394        old_fds = oldf->fd;
 395        new_fds = newf->fd;
 396        for (; i != 0; i--) {
 397                struct file * f = *old_fds;
 398                old_fds++;
 399                *new_fds = f;
 400                new_fds++;
 401                if (f)
 402                        f->f_count++;
 403        }
 404        return 0;
 405}
 406
 407static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
 408{
 409        if (clone_flags & CLONE_SIGHAND) {
 410                atomic_inc(&current->sig->count);
 411                return 0;
 412        }
 413        tsk->sig = kmalloc(sizeof(*tsk->sig), GFP_KERNEL);
 414        if (!tsk->sig)
 415                return -1;
 416        spin_lock_init(&tsk->sig->siglock);
 417        atomic_set(&tsk->sig->count, 1);
 418        memcpy(tsk->sig->action, current->sig->action, sizeof(tsk->sig->action));
 419        return 0;
 420}
 421
 422/*
 423 *  Ok, this is the main fork-routine. It copies the system process
 424 * information (task[nr]) and sets up the necessary registers. It
 425 * also copies the data segment in its entirety.
 426 */
 427int do_fork(unsigned long clone_flags, unsigned long usp, struct pt_regs *regs)
 428{
 429        int nr;
 430        int error = -ENOMEM;
 431        struct task_struct *p;
 432
 433        lock_kernel();
 434        p = alloc_task_struct();
 435        if (!p)
 436                goto bad_fork;
 437
 438        error = -EAGAIN;
 439        nr = find_empty_process();
 440        if (nr < 0)
 441                goto bad_fork_free;
 442
 443        *p = *current;
 444
 445        if (p->exec_domain && p->exec_domain->module)
 446                __MOD_INC_USE_COUNT(p->exec_domain->module);
 447        if (p->binfmt && p->binfmt->module)
 448                __MOD_INC_USE_COUNT(p->binfmt->module);
 449
 450        p->did_exec = 0;
 451        p->swappable = 0;
 452        p->state = TASK_UNINTERRUPTIBLE;
 453        p->flags &= ~(PF_PTRACED|PF_TRACESYS|PF_SUPERPRIV);
 454        p->flags |= PF_FORKNOEXEC;
 455        p->pid = get_pid(clone_flags);
 456        p->next_run = NULL;
 457        p->prev_run = NULL;
 458        p->p_pptr = p->p_opptr = current;
 459        p->p_cptr = NULL;
 460        init_waitqueue(&p->wait_chldexit);
 461        p->signal = 0;
 462        p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
 463        p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
 464        init_timer(&p->real_timer);
 465        p->real_timer.data = (unsigned long) p;
 466        p->leader = 0;          /* session leadership doesn't inherit */
 467        p->tty_old_pgrp = 0;
 468        p->times.tms_utime = p->times.tms_stime = 0;
 469        p->times.tms_cutime = p->times.tms_cstime = 0;
 470#ifdef __SMP__
 471        p->has_cpu = 0;
 472        p->processor = NO_PROC_ID;
 473#endif
 474        p->lock_depth = 0;
 475        p->start_time = jiffies;
 476        p->tarray_ptr = &task[nr];
 477        *p->tarray_ptr = p;
 478        SET_LINKS(p);
 479        hash_pid(p);
 480        nr_tasks++;
 481
 482        error = -ENOMEM;
 483        /* copy all the process information */
 484        if (copy_files(clone_flags, p))
 485                goto bad_fork_cleanup;
 486        if (copy_fs(clone_flags, p))
 487                goto bad_fork_cleanup_files;
 488        if (copy_sighand(clone_flags, p))
 489                goto bad_fork_cleanup_fs;
 490        if (copy_mm(clone_flags, p))
 491                goto bad_fork_cleanup_sighand;
 492        error = copy_thread(nr, clone_flags, usp, p, regs);
 493        if (error)
 494                goto bad_fork_cleanup_sighand;
 495        p->semundo = NULL;
 496
 497        /* ok, now we should be set up.. */
 498        p->swappable = 1;
 499        p->exit_signal = clone_flags & CSIGNAL;
 500
 501        /*
 502         * "share" dynamic priority between parent and child, thus the
 503         * total amount of dynamic priorities in the system doesnt change,
 504         * more scheduling fairness. This is only important in the first
 505         * timeslice, on the long run the scheduling behaviour is unchanged.
 506         */
 507        current->counter >>= 1;
 508        p->counter = current->counter;
 509
 510        if(p->pid) {
 511                wake_up_process(p);             /* do this last, just in case */
 512        } else {
 513                p->state = TASK_RUNNING;
 514                p->next_run = p->prev_run = p;
 515        }
 516        ++total_forks;
 517        error = p->pid;
 518        goto fork_out;
 519
 520bad_fork_cleanup_sighand:
 521        exit_sighand(p);
 522bad_fork_cleanup_fs:
 523        exit_fs(p); /* blocking */
 524bad_fork_cleanup_files:
 525        exit_files(p); /* blocking */
 526bad_fork_cleanup:
 527        charge_uid(current, -1);
 528        if (p->exec_domain && p->exec_domain->module)
 529                __MOD_DEC_USE_COUNT(p->exec_domain->module);
 530        if (p->binfmt && p->binfmt->module)
 531                __MOD_DEC_USE_COUNT(p->binfmt->module);
 532        add_free_taskslot(p->tarray_ptr);
 533        unhash_pid(p);
 534        REMOVE_LINKS(p);
 535        nr_tasks--;
 536bad_fork_free:
 537        free_task_struct(p);
 538bad_fork:
 539fork_out:
 540        unlock_kernel();
 541        return error;
 542}
 543
 544static void files_ctor(void *fp, kmem_cache_t *cachep, unsigned long flags)
 545{
 546        struct files_struct *f = fp;
 547
 548        memset(f, 0, sizeof(*f));
 549}
 550
 551__initfunc(void filescache_init(void))
 552{
 553        files_cachep = kmem_cache_create("files_cache", 
 554                                         sizeof(struct files_struct),
 555                                         0, 
 556                                         SLAB_HWCACHE_ALIGN,
 557                                         files_ctor, NULL);
 558        if (!files_cachep) 
 559                panic("Cannot create files cache"); 
 560}
 561
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.