linux/fs/exec.c
<<
>>
Prefs
   1/*
   2 *  linux/fs/exec.c
   3 *
   4 *  Copyright (C) 1991, 1992  Linus Torvalds
   5 */
   6
   7/*
   8 * #!-checking implemented by tytso.
   9 */
  10/*
  11 * Demand-loading implemented 01.12.91 - no need to read anything but
  12 * the header into memory. The inode of the executable is put into
  13 * "current->executable", and page faults do the actual loading. Clean.
  14 *
  15 * Once more I can proudly say that linux stood up to being changed: it
  16 * was less than 2 hours work to get demand-loading completely implemented.
  17 *
  18 * Demand loading changed July 1993 by Eric Youngdale.   Use mmap instead,
  19 * current->executable is only used by the procfs.  This allows a dispatch
  20 * table to check for several different types  of binary formats.  We keep
  21 * trying until we recognize the file or we run out of supported binary
  22 * formats. 
  23 */
  24
  25#include <linux/slab.h>
  26#include <linux/file.h>
  27#include <linux/mman.h>
  28#include <linux/a.out.h>
  29#include <linux/stat.h>
  30#include <linux/fcntl.h>
  31#include <linux/smp_lock.h>
  32#include <linux/string.h>
  33#include <linux/init.h>
  34#include <linux/pagemap.h>
  35#include <linux/highmem.h>
  36#include <linux/spinlock.h>
  37#include <linux/key.h>
  38#include <linux/personality.h>
  39#include <linux/binfmts.h>
  40#include <linux/swap.h>
  41#include <linux/utsname.h>
  42#include <linux/pid_namespace.h>
  43#include <linux/module.h>
  44#include <linux/namei.h>
  45#include <linux/proc_fs.h>
  46#include <linux/ptrace.h>
  47#include <linux/mount.h>
  48#include <linux/security.h>
  49#include <linux/syscalls.h>
  50#include <linux/rmap.h>
  51#include <linux/tsacct_kern.h>
  52#include <linux/cn_proc.h>
  53#include <linux/audit.h>
  54
  55#include <asm/uaccess.h>
  56#include <asm/mmu_context.h>
  57#include <asm/tlb.h>
  58
  59#ifdef CONFIG_KMOD
  60#include <linux/kmod.h>
  61#endif
  62
  63int core_uses_pid;
  64char core_pattern[CORENAME_MAX_SIZE] = "core";
  65int suid_dumpable = 0;
  66
  67/* The maximal length of core_pattern is also specified in sysctl.c */
  68
  69static LIST_HEAD(formats);
  70static DEFINE_RWLOCK(binfmt_lock);
  71
  72int register_binfmt(struct linux_binfmt * fmt)
  73{
  74        if (!fmt)
  75                return -EINVAL;
  76        write_lock(&binfmt_lock);
  77        list_add(&fmt->lh, &formats);
  78        write_unlock(&binfmt_lock);
  79        return 0;       
  80}
  81
  82EXPORT_SYMBOL(register_binfmt);
  83
  84void unregister_binfmt(struct linux_binfmt * fmt)
  85{
  86        write_lock(&binfmt_lock);
  87        list_del(&fmt->lh);
  88        write_unlock(&binfmt_lock);
  89}
  90
  91EXPORT_SYMBOL(unregister_binfmt);
  92
  93static inline void put_binfmt(struct linux_binfmt * fmt)
  94{
  95        module_put(fmt->module);
  96}
  97
  98/*
  99 * Note that a shared library must be both readable and executable due to
 100 * security reasons.
 101 *
 102 * Also note that we take the address to load from from the file itself.
 103 */
 104asmlinkage long sys_uselib(const char __user * library)
 105{
 106        struct file * file;
 107        struct nameidata nd;
 108        int error;
 109
 110        error = __user_path_lookup_open(library, LOOKUP_FOLLOW, &nd, FMODE_READ|FMODE_EXEC);
 111        if (error)
 112                goto out;
 113
 114        error = -EINVAL;
 115        if (!S_ISREG(nd.path.dentry->d_inode->i_mode))
 116                goto exit;
 117
 118        error = vfs_permission(&nd, MAY_READ | MAY_EXEC);
 119        if (error)
 120                goto exit;
 121
 122        file = nameidata_to_filp(&nd, O_RDONLY|O_LARGEFILE);
 123        error = PTR_ERR(file);
 124        if (IS_ERR(file))
 125                goto out;
 126
 127        error = -ENOEXEC;
 128        if(file->f_op) {
 129                struct linux_binfmt * fmt;
 130
 131                read_lock(&binfmt_lock);
 132                list_for_each_entry(fmt, &formats, lh) {
 133                        if (!fmt->load_shlib)
 134                                continue;
 135                        if (!try_module_get(fmt->module))
 136                                continue;
 137                        read_unlock(&binfmt_lock);
 138                        error = fmt->load_shlib(file);
 139                        read_lock(&binfmt_lock);
 140                        put_binfmt(fmt);
 141                        if (error != -ENOEXEC)
 142                                break;
 143                }
 144                read_unlock(&binfmt_lock);
 145        }
 146        fput(file);
 147out:
 148        return error;
 149exit:
 150        release_open_intent(&nd);
 151        path_put(&nd.path);
 152        goto out;
 153}
 154
 155#ifdef CONFIG_MMU
 156
 157static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 158                int write)
 159{
 160        struct page *page;
 161        int ret;
 162
 163#ifdef CONFIG_STACK_GROWSUP
 164        if (write) {
 165                ret = expand_stack_downwards(bprm->vma, pos);
 166                if (ret < 0)
 167                        return NULL;
 168        }
 169#endif
 170        ret = get_user_pages(current, bprm->mm, pos,
 171                        1, write, 1, &page, NULL);
 172        if (ret <= 0)
 173                return NULL;
 174
 175        if (write) {
 176                unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
 177                struct rlimit *rlim;
 178
 179                /*
 180                 * We've historically supported up to 32 pages (ARG_MAX)
 181                 * of argument strings even with small stacks
 182                 */
 183                if (size <= ARG_MAX)
 184                        return page;
 185
 186                /*
 187                 * Limit to 1/4-th the stack size for the argv+env strings.
 188                 * This ensures that:
 189                 *  - the remaining binfmt code will not run out of stack space,
 190                 *  - the program will have a reasonable amount of stack left
 191                 *    to work from.
 192                 */
 193                rlim = current->signal->rlim;
 194                if (size > rlim[RLIMIT_STACK].rlim_cur / 4) {
 195                        put_page(page);
 196                        return NULL;
 197                }
 198        }
 199
 200        return page;
 201}
 202
 203static void put_arg_page(struct page *page)
 204{
 205        put_page(page);
 206}
 207
 208static void free_arg_page(struct linux_binprm *bprm, int i)
 209{
 210}
 211
 212static void free_arg_pages(struct linux_binprm *bprm)
 213{
 214}
 215
 216static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
 217                struct page *page)
 218{
 219        flush_cache_page(bprm->vma, pos, page_to_pfn(page));
 220}
 221
 222static int __bprm_mm_init(struct linux_binprm *bprm)
 223{
 224        int err = -ENOMEM;
 225        struct vm_area_struct *vma = NULL;
 226        struct mm_struct *mm = bprm->mm;
 227
 228        bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
 229        if (!vma)
 230                goto err;
 231
 232        down_write(&mm->mmap_sem);
 233        vma->vm_mm = mm;
 234
 235        /*
 236         * Place the stack at the largest stack address the architecture
 237         * supports. Later, we'll move this to an appropriate place. We don't
 238         * use STACK_TOP because that can depend on attributes which aren't
 239         * configured yet.
 240         */
 241        vma->vm_end = STACK_TOP_MAX;
 242        vma->vm_start = vma->vm_end - PAGE_SIZE;
 243
 244        vma->vm_flags = VM_STACK_FLAGS;
 245        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
 246        err = insert_vm_struct(mm, vma);
 247        if (err) {
 248                up_write(&mm->mmap_sem);
 249                goto err;
 250        }
 251
 252        mm->stack_vm = mm->total_vm = 1;
 253        up_write(&mm->mmap_sem);
 254
 255        bprm->p = vma->vm_end - sizeof(void *);
 256
 257        return 0;
 258
 259err:
 260        if (vma) {
 261                bprm->vma = NULL;
 262                kmem_cache_free(vm_area_cachep, vma);
 263        }
 264
 265        return err;
 266}
 267
 268static bool valid_arg_len(struct linux_binprm *bprm, long len)
 269{
 270        return len <= MAX_ARG_STRLEN;
 271}
 272
 273#else
 274
 275static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 276                int write)
 277{
 278        struct page *page;
 279
 280        page = bprm->page[pos / PAGE_SIZE];
 281        if (!page && write) {
 282                page = alloc_page(GFP_HIGHUSER|__GFP_ZERO);
 283                if (!page)
 284                        return NULL;
 285                bprm->page[pos / PAGE_SIZE] = page;
 286        }
 287
 288        return page;
 289}
 290
 291static void put_arg_page(struct page *page)
 292{
 293}
 294
 295static void free_arg_page(struct linux_binprm *bprm, int i)
 296{
 297        if (bprm->page[i]) {
 298                __free_page(bprm->page[i]);
 299                bprm->page[i] = NULL;
 300        }
 301}
 302
 303static void free_arg_pages(struct linux_binprm *bprm)
 304{
 305        int i;
 306
 307        for (i = 0; i < MAX_ARG_PAGES; i++)
 308                free_arg_page(bprm, i);
 309}
 310
 311static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
 312                struct page *page)
 313{
 314}
 315
 316static int __bprm_mm_init(struct linux_binprm *bprm)
 317{
 318        bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
 319        return 0;
 320}
 321
 322static bool valid_arg_len(struct linux_binprm *bprm, long len)
 323{
 324        return len <= bprm->p;
 325}
 326
 327#endif /* CONFIG_MMU */
 328
 329/*
 330 * Create a new mm_struct and populate it with a temporary stack
 331 * vm_area_struct.  We don't have enough context at this point to set the stack
 332 * flags, permissions, and offset, so we use temporary values.  We'll update
 333 * them later in setup_arg_pages().
 334 */
 335int bprm_mm_init(struct linux_binprm *bprm)
 336{
 337        int err;
 338        struct mm_struct *mm = NULL;
 339
 340        bprm->mm = mm = mm_alloc();
 341        err = -ENOMEM;
 342        if (!mm)
 343                goto err;
 344
 345        err = init_new_context(current, mm);
 346        if (err)
 347                goto err;
 348
 349        err = __bprm_mm_init(bprm);
 350        if (err)
 351                goto err;
 352
 353        return 0;
 354
 355err:
 356        if (mm) {
 357                bprm->mm = NULL;
 358                mmdrop(mm);
 359        }
 360
 361        return err;
 362}
 363
 364/*
 365 * count() counts the number of strings in array ARGV.
 366 */
 367static int count(char __user * __user * argv, int max)
 368{
 369        int i = 0;
 370
 371        if (argv != NULL) {
 372                for (;;) {
 373                        char __user * p;
 374
 375                        if (get_user(p, argv))
 376                                return -EFAULT;
 377                        if (!p)
 378                                break;
 379                        argv++;
 380                        if(++i > max)
 381                                return -E2BIG;
 382                        cond_resched();
 383                }
 384        }
 385        return i;
 386}
 387
 388/*
 389 * 'copy_strings()' copies argument/environment strings from the old
 390 * processes's memory to the new process's stack.  The call to get_user_pages()
 391 * ensures the destination page is created and not swapped out.
 392 */
 393static int copy_strings(int argc, char __user * __user * argv,
 394                        struct linux_binprm *bprm)
 395{
 396        struct page *kmapped_page = NULL;
 397        char *kaddr = NULL;
 398        unsigned long kpos = 0;
 399        int ret;
 400
 401        while (argc-- > 0) {
 402                char __user *str;
 403                int len;
 404                unsigned long pos;
 405
 406                if (get_user(str, argv+argc) ||
 407                                !(len = strnlen_user(str, MAX_ARG_STRLEN))) {
 408                        ret = -EFAULT;
 409                        goto out;
 410                }
 411
 412                if (!valid_arg_len(bprm, len)) {
 413                        ret = -E2BIG;
 414                        goto out;
 415                }
 416
 417                /* We're going to work our way backwords. */
 418                pos = bprm->p;
 419                str += len;
 420                bprm->p -= len;
 421
 422                while (len > 0) {
 423                        int offset, bytes_to_copy;
 424
 425                        offset = pos % PAGE_SIZE;
 426                        if (offset == 0)
 427                                offset = PAGE_SIZE;
 428
 429                        bytes_to_copy = offset;
 430                        if (bytes_to_copy > len)
 431                                bytes_to_copy = len;
 432
 433                        offset -= bytes_to_copy;
 434                        pos -= bytes_to_copy;
 435                        str -= bytes_to_copy;
 436                        len -= bytes_to_copy;
 437
 438                        if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
 439                                struct page *page;
 440
 441                                page = get_arg_page(bprm, pos, 1);
 442                                if (!page) {
 443                                        ret = -E2BIG;
 444                                        goto out;
 445                                }
 446
 447                                if (kmapped_page) {
 448                                        flush_kernel_dcache_page(kmapped_page);
 449                                        kunmap(kmapped_page);
 450                                        put_arg_page(kmapped_page);
 451                                }
 452                                kmapped_page = page;
 453                                kaddr = kmap(kmapped_page);
 454                                kpos = pos & PAGE_MASK;
 455                                flush_arg_page(bprm, kpos, kmapped_page);
 456                        }
 457                        if (copy_from_user(kaddr+offset, str, bytes_to_copy)) {
 458                                ret = -EFAULT;
 459                                goto out;
 460                        }
 461                }
 462        }
 463        ret = 0;
 464out:
 465        if (kmapped_page) {
 466                flush_kernel_dcache_page(kmapped_page);
 467                kunmap(kmapped_page);
 468                put_arg_page(kmapped_page);
 469        }
 470        return ret;
 471}
 472
 473/*
 474 * Like copy_strings, but get argv and its values from kernel memory.
 475 */
 476int copy_strings_kernel(int argc,char ** argv, struct linux_binprm *bprm)
 477{
 478        int r;
 479        mm_segment_t oldfs = get_fs();
 480        set_fs(KERNEL_DS);
 481        r = copy_strings(argc, (char __user * __user *)argv, bprm);
 482        set_fs(oldfs);
 483        return r;
 484}
 485EXPORT_SYMBOL(copy_strings_kernel);
 486
 487#ifdef CONFIG_MMU
 488
 489/*
 490 * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX.  Once
 491 * the binfmt code determines where the new stack should reside, we shift it to
 492 * its final location.  The process proceeds as follows:
 493 *
 494 * 1) Use shift to calculate the new vma endpoints.
 495 * 2) Extend vma to cover both the old and new ranges.  This ensures the
 496 *    arguments passed to subsequent functions are consistent.
 497 * 3) Move vma's page tables to the new range.
 498 * 4) Free up any cleared pgd range.
 499 * 5) Shrink the vma to cover only the new range.
 500 */
 501static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
 502{
 503        struct mm_struct *mm = vma->vm_mm;
 504        unsigned long old_start = vma->vm_start;
 505        unsigned long old_end = vma->vm_end;
 506        unsigned long length = old_end - old_start;
 507        unsigned long new_start = old_start - shift;
 508        unsigned long new_end = old_end - shift;
 509        struct mmu_gather *tlb;
 510
 511        BUG_ON(new_start > new_end);
 512
 513        /*
 514         * ensure there are no vmas between where we want to go
 515         * and where we are
 516         */
 517        if (vma != find_vma(mm, new_start))
 518                return -EFAULT;
 519
 520        /*
 521         * cover the whole range: [new_start, old_end)
 522         */
 523        vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL);
 524
 525        /*
 526         * move the page tables downwards, on failure we rely on
 527         * process cleanup to remove whatever mess we made.
 528         */
 529        if (length != move_page_tables(vma, old_start,
 530                                       vma, new_start, length))
 531                return -ENOMEM;
 532
 533        lru_add_drain();
 534        tlb = tlb_gather_mmu(mm, 0);
 535        if (new_end > old_start) {
 536                /*
 537                 * when the old and new regions overlap clear from new_end.
 538                 */
 539                free_pgd_range(&tlb, new_end, old_end, new_end,
 540                        vma->vm_next ? vma->vm_next->vm_start : 0);
 541        } else {
 542                /*
 543                 * otherwise, clean from old_start; this is done to not touch
 544                 * the address space in [new_end, old_start) some architectures
 545                 * have constraints on va-space that make this illegal (IA64) -
 546                 * for the others its just a little faster.
 547                 */
 548                free_pgd_range(&tlb, old_start, old_end, new_end,
 549                        vma->vm_next ? vma->vm_next->vm_start : 0);
 550        }
 551        tlb_finish_mmu(tlb, new_end, old_end);
 552
 553        /*
 554         * shrink the vma to just the new range.
 555         */
 556        vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL);
 557
 558        return 0;
 559}
 560
 561#define EXTRA_STACK_VM_PAGES    20      /* random */
 562
 563/*
 564 * Finalizes the stack vm_area_struct. The flags and permissions are updated,
 565 * the stack is optionally relocated, and some extra space is added.
 566 */
 567int setup_arg_pages(struct linux_binprm *bprm,
 568                    unsigned long stack_top,
 569                    int executable_stack)
 570{
 571        unsigned long ret;
 572        unsigned long stack_shift;
 573        struct mm_struct *mm = current->mm;
 574        struct vm_area_struct *vma = bprm->vma;
 575        struct vm_area_struct *prev = NULL;
 576        unsigned long vm_flags;
 577        unsigned long stack_base;
 578
 579#ifdef CONFIG_STACK_GROWSUP
 580        /* Limit stack size to 1GB */
 581        stack_base = current->signal->rlim[RLIMIT_STACK].rlim_max;
 582        if (stack_base > (1 << 30))
 583                stack_base = 1 << 30;
 584
 585        /* Make sure we didn't let the argument array grow too large. */
 586        if (vma->vm_end - vma->vm_start > stack_base)
 587                return -ENOMEM;
 588
 589        stack_base = PAGE_ALIGN(stack_top - stack_base);
 590
 591        stack_shift = vma->vm_start - stack_base;
 592        mm->arg_start = bprm->p - stack_shift;
 593        bprm->p = vma->vm_end - stack_shift;
 594#else
 595        stack_top = arch_align_stack(stack_top);
 596        stack_top = PAGE_ALIGN(stack_top);
 597        stack_shift = vma->vm_end - stack_top;
 598
 599        bprm->p -= stack_shift;
 600        mm->arg_start = bprm->p;
 601#endif
 602
 603        if (bprm->loader)
 604                bprm->loader -= stack_shift;
 605        bprm->exec -= stack_shift;
 606
 607        down_write(&mm->mmap_sem);
 608        vm_flags = vma->vm_flags;
 609
 610        /*
 611         * Adjust stack execute permissions; explicitly enable for
 612         * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone
 613         * (arch default) otherwise.
 614         */
 615        if (unlikely(executable_stack == EXSTACK_ENABLE_X))
 616                vm_flags |= VM_EXEC;
 617        else if (executable_stack == EXSTACK_DISABLE_X)
 618                vm_flags &= ~VM_EXEC;
 619        vm_flags |= mm->def_flags;
 620
 621        ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
 622                        vm_flags);
 623        if (ret)
 624                goto out_unlock;
 625        BUG_ON(prev != vma);
 626
 627        /* Move stack pages down in memory. */
 628        if (stack_shift) {
 629                ret = shift_arg_pages(vma, stack_shift);
 630                if (ret) {
 631                        up_write(&mm->mmap_sem);
 632                        return ret;
 633                }
 634        }
 635
 636#ifdef CONFIG_STACK_GROWSUP
 637        stack_base = vma->vm_end + EXTRA_STACK_VM_PAGES * PAGE_SIZE;
 638#else
 639        stack_base = vma->vm_start - EXTRA_STACK_VM_PAGES * PAGE_SIZE;
 640#endif
 641        ret = expand_stack(vma, stack_base);
 642        if (ret)
 643                ret = -EFAULT;
 644
 645out_unlock:
 646        up_write(&mm->mmap_sem);
 647        return 0;
 648}
 649EXPORT_SYMBOL(setup_arg_pages);
 650
 651#endif /* CONFIG_MMU */
 652
 653struct file *open_exec(const char *name)
 654{
 655        struct nameidata nd;
 656        int err;
 657        struct file *file;
 658
 659        err = path_lookup_open(AT_FDCWD, name, LOOKUP_FOLLOW, &nd, FMODE_READ|FMODE_EXEC);
 660        file = ERR_PTR(err);
 661
 662        if (!err) {
 663                struct inode *inode = nd.path.dentry->d_inode;
 664                file = ERR_PTR(-EACCES);
 665                if (S_ISREG(inode->i_mode)) {
 666                        int err = vfs_permission(&nd, MAY_EXEC);
 667                        file = ERR_PTR(err);
 668                        if (!err) {
 669                                file = nameidata_to_filp(&nd,
 670                                                        O_RDONLY|O_LARGEFILE);
 671                                if (!IS_ERR(file)) {
 672                                        err = deny_write_access(file);
 673                                        if (err) {
 674                                                fput(file);
 675                                                file = ERR_PTR(err);
 676                                        }
 677                                }
 678out:
 679                                return file;
 680                        }
 681                }
 682                release_open_intent(&nd);
 683                path_put(&nd.path);
 684        }
 685        goto out;
 686}
 687
 688EXPORT_SYMBOL(open_exec);
 689
 690int kernel_read(struct file *file, unsigned long offset,
 691        char *addr, unsigned long count)
 692{
 693        mm_segment_t old_fs;
 694        loff_t pos = offset;
 695        int result;
 696
 697        old_fs = get_fs();
 698        set_fs(get_ds());
 699        /* The cast to a user pointer is valid due to the set_fs() */
 700        result = vfs_read(file, (void __user *)addr, count, &pos);
 701        set_fs(old_fs);
 702        return result;
 703}
 704
 705EXPORT_SYMBOL(kernel_read);
 706
 707static int exec_mmap(struct mm_struct *mm)
 708{
 709        struct task_struct *tsk;
 710        struct mm_struct * old_mm, *active_mm;
 711
 712        /* Notify parent that we're no longer interested in the old VM */
 713        tsk = current;
 714        old_mm = current->mm;
 715        mm_release(tsk, old_mm);
 716
 717        if (old_mm) {
 718                /*
 719                 * Make sure that if there is a core dump in progress
 720                 * for the old mm, we get out and die instead of going
 721                 * through with the exec.  We must hold mmap_sem around
 722                 * checking core_waiters and changing tsk->mm.  The
 723                 * core-inducing thread will increment core_waiters for
 724                 * each thread whose ->mm == old_mm.
 725                 */
 726                down_read(&old_mm->mmap_sem);
 727                if (unlikely(old_mm->core_waiters)) {
 728                        up_read(&old_mm->mmap_sem);
 729                        return -EINTR;
 730                }
 731        }
 732        task_lock(tsk);
 733        active_mm = tsk->active_mm;
 734        tsk->mm = mm;
 735        tsk->active_mm = mm;
 736        activate_mm(active_mm, mm);
 737        task_unlock(tsk);
 738        arch_pick_mmap_layout(mm);
 739        if (old_mm) {
 740                up_read(&old_mm->mmap_sem);
 741                BUG_ON(active_mm != old_mm);
 742                mmput(old_mm);
 743                return 0;
 744        }
 745        mmdrop(active_mm);
 746        return 0;
 747}
 748
 749/*
 750 * This function makes sure the current process has its own signal table,
 751 * so that flush_signal_handlers can later reset the handlers without
 752 * disturbing other processes.  (Other processes might share the signal
 753 * table via the CLONE_SIGHAND option to clone().)
 754 */
 755static int de_thread(struct task_struct *tsk)
 756{
 757        struct signal_struct *sig = tsk->signal;
 758        struct sighand_struct *oldsighand = tsk->sighand;
 759        spinlock_t *lock = &oldsighand->siglock;
 760        struct task_struct *leader = NULL;
 761        int count;
 762
 763        if (thread_group_empty(tsk))
 764                goto no_thread_group;
 765
 766        /*
 767         * Kill all other threads in the thread group.
 768         * We must hold tasklist_lock to call zap_other_threads.
 769         */
 770        read_lock(&tasklist_lock);
 771        spin_lock_irq(lock);
 772        if (signal_group_exit(sig)) {
 773                /*
 774                 * Another group action in progress, just
 775                 * return so that the signal is processed.
 776                 */
 777                spin_unlock_irq(lock);
 778                read_unlock(&tasklist_lock);
 779                return -EAGAIN;
 780        }
 781
 782        /*
 783         * child_reaper ignores SIGKILL, change it now.
 784         * Reparenting needs write_lock on tasklist_lock,
 785         * so it is safe to do it under read_lock.
 786         */
 787        if (unlikely(tsk->group_leader == task_child_reaper(tsk)))
 788                task_active_pid_ns(tsk)->child_reaper = tsk;
 789
 790        sig->group_exit_task = tsk;
 791        zap_other_threads(tsk);
 792        read_unlock(&tasklist_lock);
 793
 794        /* Account for the thread group leader hanging around: */
 795        count = thread_group_leader(tsk) ? 1 : 2;
 796        sig->notify_count = count;
 797        while (atomic_read(&sig->count) > count) {
 798                __set_current_state(TASK_UNINTERRUPTIBLE);
 799                spin_unlock_irq(lock);
 800                schedule();
 801                spin_lock_irq(lock);
 802        }
 803        spin_unlock_irq(lock);
 804
 805        /*
 806         * At this point all other threads have exited, all we have to
 807         * do is to wait for the thread group leader to become inactive,
 808         * and to assume its PID:
 809         */
 810        if (!thread_group_leader(tsk)) {
 811                leader = tsk->group_leader;
 812
 813                sig->notify_count = -1;
 814                for (;;) {
 815                        write_lock_irq(&tasklist_lock);
 816                        if (likely(leader->exit_state))
 817                                break;
 818                        __set_current_state(TASK_UNINTERRUPTIBLE);
 819                        write_unlock_irq(&tasklist_lock);
 820                        schedule();
 821                }
 822
 823                /*
 824                 * The only record we have of the real-time age of a
 825                 * process, regardless of execs it's done, is start_time.
 826                 * All the past CPU time is accumulated in signal_struct
 827                 * from sister threads now dead.  But in this non-leader
 828                 * exec, nothing survives from the original leader thread,
 829                 * whose birth marks the true age of this process now.
 830                 * When we take on its identity by switching to its PID, we
 831                 * also take its birthdate (always earlier than our own).
 832                 */
 833                tsk->start_time = leader->start_time;
 834
 835                BUG_ON(!same_thread_group(leader, tsk));
 836                BUG_ON(has_group_leader_pid(tsk));
 837                /*
 838                 * An exec() starts a new thread group with the
 839                 * TGID of the previous thread group. Rehash the
 840                 * two threads with a switched PID, and release
 841                 * the former thread group leader:
 842                 */
 843
 844                /* Become a process group leader with the old leader's pid.
 845                 * The old leader becomes a thread of the this thread group.
 846                 * Note: The old leader also uses this pid until release_task
 847                 *       is called.  Odd but simple and correct.
 848                 */
 849                detach_pid(tsk, PIDTYPE_PID);
 850                tsk->pid = leader->pid;
 851                attach_pid(tsk, PIDTYPE_PID,  task_pid(leader));
 852                transfer_pid(leader, tsk, PIDTYPE_PGID);
 853                transfer_pid(leader, tsk, PIDTYPE_SID);
 854                list_replace_rcu(&leader->tasks, &tsk->tasks);
 855
 856                tsk->group_leader = tsk;
 857                leader->group_leader = tsk;
 858
 859                tsk->exit_signal = SIGCHLD;
 860
 861                BUG_ON(leader->exit_state != EXIT_ZOMBIE);
 862                leader->exit_state = EXIT_DEAD;
 863
 864                write_unlock_irq(&tasklist_lock);
 865        }
 866
 867        sig->group_exit_task = NULL;
 868        sig->notify_count = 0;
 869
 870no_thread_group:
 871        exit_itimers(sig);
 872        if (leader)
 873                release_task(leader);
 874
 875        if (atomic_read(&oldsighand->count) != 1) {
 876                struct sighand_struct *newsighand;
 877                /*
 878                 * This ->sighand is shared with the CLONE_SIGHAND
 879                 * but not CLONE_THREAD task, switch to the new one.
 880                 */
 881                newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
 882                if (!newsighand)
 883                        return -ENOMEM;
 884
 885                atomic_set(&newsighand->count, 1);
 886                memcpy(newsighand->action, oldsighand->action,
 887                       sizeof(newsighand->action));
 888
 889                write_lock_irq(&tasklist_lock);
 890                spin_lock(&oldsighand->siglock);
 891                rcu_assign_pointer(tsk->sighand, newsighand);
 892                spin_unlock(&oldsighand->siglock);
 893                write_unlock_irq(&tasklist_lock);
 894
 895                __cleanup_sighand(oldsighand);
 896        }
 897
 898        BUG_ON(!thread_group_leader(tsk));
 899        return 0;
 900}
 901
 902/*
 903 * These functions flushes out all traces of the currently running executable
 904 * so that a new one can be started
 905 */
 906static void flush_old_files(struct files_struct * files)
 907{
 908        long j = -1;
 909        struct fdtable *fdt;
 910
 911        spin_lock(&files->file_lock);
 912        for (;;) {
 913                unsigned long set, i;
 914
 915                j++;
 916                i = j * __NFDBITS;
 917                fdt = files_fdtable(files);
 918                if (i >= fdt->max_fds)
 919                        break;
 920                set = fdt->close_on_exec->fds_bits[j];
 921                if (!set)
 922                        continue;
 923                fdt->close_on_exec->fds_bits[j] = 0;
 924                spin_unlock(&files->file_lock);
 925                for ( ; set ; i++,set >>= 1) {
 926                        if (set & 1) {
 927                                sys_close(i);
 928                        }
 929                }
 930                spin_lock(&files->file_lock);
 931
 932        }
 933        spin_unlock(&files->file_lock);
 934}
 935
 936char *get_task_comm(char *buf, struct task_struct *tsk)
 937{
 938        /* buf must be at least sizeof(tsk->comm) in size */
 939        task_lock(tsk);
 940        strncpy(buf, tsk->comm, sizeof(tsk->comm));
 941        task_unlock(tsk);
 942        return buf;
 943}
 944
 945void set_task_comm(struct task_struct *tsk, char *buf)
 946{
 947        task_lock(tsk);
 948        strlcpy(tsk->comm, buf, sizeof(tsk->comm));
 949        task_unlock(tsk);
 950}
 951
 952int flush_old_exec(struct linux_binprm * bprm)
 953{
 954        char * name;
 955        int i, ch, retval;
 956        struct files_struct *files;
 957        char tcomm[sizeof(current->comm)];
 958
 959        /*
 960         * Make sure we have a private signal table and that
 961         * we are unassociated from the previous thread group.
 962         */
 963        retval = de_thread(current);
 964        if (retval)
 965                goto out;
 966
 967        /*
 968         * Make sure we have private file handles. Ask the
 969         * fork helper to do the work for us and the exit
 970         * helper to do the cleanup of the old one.
 971         */
 972        files = current->files;         /* refcounted so safe to hold */
 973        retval = unshare_files();
 974        if (retval)
 975                goto out;
 976        /*
 977         * Release all of the old mmap stuff
 978         */
 979        retval = exec_mmap(bprm->mm);
 980        if (retval)
 981                goto mmap_failed;
 982
 983        bprm->mm = NULL;                /* We're using it now */
 984
 985        /* This is the point of no return */
 986        put_files_struct(files);
 987
 988        current->sas_ss_sp = current->sas_ss_size = 0;
 989
 990        if (current->euid == current->uid && current->egid == current->gid)
 991                set_dumpable(current->mm, 1);
 992        else
 993                set_dumpable(current->mm, suid_dumpable);
 994
 995        name = bprm->filename;
 996
 997        /* Copies the binary name from after last slash */
 998        for (i=0; (ch = *(name++)) != '\0';) {
 999                if (ch == '/')
1000                        i = 0; /* overwrite what we wrote */
1001                else
1002                        if (i < (sizeof(tcomm) - 1))
1003                                tcomm[i++] = ch;
1004        }
1005        tcomm[i] = '\0';
1006        set_task_comm(current, tcomm);
1007
1008        current->flags &= ~PF_RANDOMIZE;
1009        flush_thread();
1010
1011        /* Set the new mm task size. We have to do that late because it may
1012         * depend on TIF_32BIT which is only updated in flush_thread() on
1013         * some architectures like powerpc
1014         */
1015        current->mm->task_size = TASK_SIZE;
1016
1017        if (bprm->e_uid != current->euid || bprm->e_gid != current->egid) {
1018                suid_keys(current);
1019                set_dumpable(current->mm, suid_dumpable);
1020                current->pdeath_signal = 0;
1021        } else if (file_permission(bprm->file, MAY_READ) ||
1022                        (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)) {
1023                suid_keys(current);
1024                set_dumpable(current->mm, suid_dumpable);
1025        }
1026
1027        /* An exec changes our domain. We are no longer part of the thread
1028           group */
1029
1030        current->self_exec_id++;
1031                        
1032        flush_signal_handlers(current, 0);
1033        flush_old_files(current->files);
1034
1035        return 0;
1036
1037mmap_failed:
1038        reset_files_struct(current, files);
1039out:
1040        return retval;
1041}
1042
1043EXPORT_SYMBOL(flush_old_exec);
1044
1045/* 
1046 * Fill the binprm structure from the inode. 
1047 * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
1048 */
1049int prepare_binprm(struct linux_binprm *bprm)
1050{
1051        int mode;
1052        struct inode * inode = bprm->file->f_path.dentry->d_inode;
1053        int retval;
1054
1055        mode = inode->i_mode;
1056        if (bprm->file->f_op == NULL)
1057                return -EACCES;
1058
1059        bprm->e_uid = current->euid;
1060        bprm->e_gid = current->egid;
1061
1062        if(!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)) {
1063                /* Set-uid? */
1064                if (mode & S_ISUID) {
1065                        current->personality &= ~PER_CLEAR_ON_SETID;
1066                        bprm->e_uid = inode->i_uid;
1067                }
1068
1069                /* Set-gid? */
1070                /*
1071                 * If setgid is set but no group execute bit then this
1072                 * is a candidate for mandatory locking, not a setgid
1073                 * executable.
1074                 */
1075                if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
1076                        current->personality &= ~PER_CLEAR_ON_SETID;
1077                        bprm->e_gid = inode->i_gid;
1078                }
1079        }
1080
1081        /* fill in binprm security blob */
1082        retval = security_bprm_set(bprm);
1083        if (retval)
1084                return retval;
1085
1086        memset(bprm->buf,0,BINPRM_BUF_SIZE);
1087        return kernel_read(bprm->file,0,bprm->buf,BINPRM_BUF_SIZE);
1088}
1089
1090EXPORT_SYMBOL(prepare_binprm);
1091
1092static int unsafe_exec(struct task_struct *p)
1093{
1094        int unsafe = 0;
1095        if (p->ptrace & PT_PTRACED) {
1096                if (p->ptrace & PT_PTRACE_CAP)
1097                        unsafe |= LSM_UNSAFE_PTRACE_CAP;
1098                else
1099                        unsafe |= LSM_UNSAFE_PTRACE;
1100        }
1101        if (atomic_read(&p->fs->count) > 1 ||
1102            atomic_read(&p->files->count) > 1 ||
1103            atomic_read(&p->sighand->count) > 1)
1104                unsafe |= LSM_UNSAFE_SHARE;
1105
1106        return unsafe;
1107}
1108
1109void compute_creds(struct linux_binprm *bprm)
1110{
1111        int unsafe;
1112
1113        if (bprm->e_uid != current->uid) {
1114                suid_keys(current);
1115                current->pdeath_signal = 0;
1116        }
1117        exec_keys(current);
1118
1119        task_lock(current);
1120        unsafe = unsafe_exec(current);
1121        security_bprm_apply_creds(bprm, unsafe);
1122        task_unlock(current);
1123        security_bprm_post_apply_creds(bprm);
1124}
1125EXPORT_SYMBOL(compute_creds);
1126
1127/*
1128 * Arguments are '\0' separated strings found at the location bprm->p
1129 * points to; chop off the first by relocating brpm->p to right after
1130 * the first '\0' encountered.
1131 */
1132int remove_arg_zero(struct linux_binprm *bprm)
1133{
1134        int ret = 0;
1135        unsigned long offset;
1136        char *kaddr;
1137        struct page *page;
1138
1139        if (!bprm->argc)
1140                return 0;
1141
1142        do {
1143                offset = bprm->p & ~PAGE_MASK;
1144                page = get_arg_page(bprm, bprm->p, 0);
1145                if (!page) {
1146                        ret = -EFAULT;
1147                        goto out;
1148                }
1149                kaddr = kmap_atomic(page, KM_USER0);
1150
1151                for (; offset < PAGE_SIZE && kaddr[offset];
1152                                offset++, bprm->p++)
1153                        ;
1154
1155                kunmap_atomic(kaddr, KM_USER0);
1156                put_arg_page(page);
1157
1158                if (offset == PAGE_SIZE)
1159                        free_arg_page(bprm, (bprm->p >> PAGE_SHIFT) - 1);
1160        } while (offset == PAGE_SIZE);
1161
1162        bprm->p++;
1163        bprm->argc--;
1164        ret = 0;
1165
1166out:
1167        return ret;
1168}
1169EXPORT_SYMBOL(remove_arg_zero);
1170
1171/*
1172 * cycle the list of binary formats handler, until one recognizes the image
1173 */
1174int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
1175{
1176        int try,retval;
1177        struct linux_binfmt *fmt;
1178#if defined(__alpha__) && defined(CONFIG_ARCH_SUPPORTS_AOUT)
1179        /* handle /sbin/loader.. */
1180        {
1181            struct exec * eh = (struct exec *) bprm->buf;
1182
1183            if (!bprm->loader && eh->fh.f_magic == 0x183 &&
1184                (eh->fh.f_flags & 0x3000) == 0x3000)
1185            {
1186                struct file * file;
1187                unsigned long loader;
1188
1189                allow_write_access(bprm->file);
1190                fput(bprm->file);
1191                bprm->file = NULL;
1192
1193                loader = bprm->vma->vm_end - sizeof(void *);
1194
1195                file = open_exec("/sbin/loader");
1196                retval = PTR_ERR(file);
1197                if (IS_ERR(file))
1198                        return retval;
1199
1200                /* Remember if the application is TASO.  */
1201                bprm->sh_bang = eh->ah.entry < 0x100000000UL;
1202
1203                bprm->file = file;
1204                bprm->loader = loader;
1205                retval = prepare_binprm(bprm);
1206                if (retval<0)
1207                        return retval;
1208                /* should call search_binary_handler recursively here,
1209                   but it does not matter */
1210            }
1211        }
1212#endif
1213        retval = security_bprm_check(bprm);
1214        if (retval)
1215                return retval;
1216
1217        /* kernel module loader fixup */
1218        /* so we don't try to load run modprobe in kernel space. */
1219        set_fs(USER_DS);
1220
1221        retval = audit_bprm(bprm);
1222        if (retval)
1223                return retval;
1224
1225        retval = -ENOENT;
1226        for (try=0; try<2; try++) {
1227                read_lock(&binfmt_lock);
1228                list_for_each_entry(fmt, &formats, lh) {
1229                        int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary;
1230                        if (!fn)
1231                                continue;
1232                        if (!try_module_get(fmt->module))
1233                                continue;
1234                        read_unlock(&binfmt_lock);
1235                        retval = fn(bprm, regs);
1236                        if (retval >= 0) {
1237                                put_binfmt(fmt);
1238                                allow_write_access(bprm->file);
1239                                if (bprm->file)
1240                                        fput(bprm->file);
1241                                bprm->file = NULL;
1242                                current->did_exec = 1;
1243                                proc_exec_connector(current);
1244                                return retval;
1245                        }
1246                        read_lock(&binfmt_lock);
1247                        put_binfmt(fmt);
1248                        if (retval != -ENOEXEC || bprm->mm == NULL)
1249                                break;
1250                        if (!bprm->file) {
1251                                read_unlock(&binfmt_lock);
1252                                return retval;
1253                        }
1254                }
1255                read_unlock(&binfmt_lock);
1256                if (retval != -ENOEXEC || bprm->mm == NULL) {
1257                        break;
1258#ifdef CONFIG_KMOD
1259                }else{
1260#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
1261                        if (printable(bprm->buf[0]) &&
1262                            printable(bprm->buf[1]) &&
1263                            printable(bprm->buf[2]) &&
1264                            printable(bprm->buf[3]))
1265                                break; /* -ENOEXEC */
1266                        request_module("binfmt-%04x", *(unsigned short *)(&bprm->buf[2]));
1267#endif
1268                }
1269        }
1270        return retval;
1271}
1272
1273EXPORT_SYMBOL(search_binary_handler);
1274
1275/*
1276 * sys_execve() executes a new program.
1277 */
1278int do_execve(char * filename,
1279        char __user *__user *argv,
1280        char __user *__user *envp,
1281        struct pt_regs * regs)
1282{
1283        struct linux_binprm *bprm;
1284        struct file *file;
1285        unsigned long env_p;
1286        int retval;
1287
1288        retval = -ENOMEM;
1289        bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
1290        if (!bprm)
1291                goto out_ret;
1292
1293        file = open_exec(filename);
1294        retval = PTR_ERR(file);
1295        if (IS_ERR(file))
1296                goto out_kfree;
1297
1298        sched_exec();
1299
1300        bprm->file = file;
1301        bprm->filename = filename;
1302        bprm->interp = filename;
1303
1304        retval = bprm_mm_init(bprm);
1305        if (retval)
1306                goto out_file;
1307
1308        bprm->argc = count(argv, MAX_ARG_STRINGS);
1309        if ((retval = bprm->argc) < 0)
1310                goto out_mm;
1311
1312        bprm->envc = count(envp, MAX_ARG_STRINGS);
1313        if ((retval = bprm->envc) < 0)
1314                goto out_mm;
1315
1316        retval = security_bprm_alloc(bprm);
1317        if (retval)
1318                goto out;
1319
1320        retval = prepare_binprm(bprm);
1321        if (retval < 0)
1322                goto out;
1323
1324        retval = copy_strings_kernel(1, &bprm->filename, bprm);
1325        if (retval < 0)
1326                goto out;
1327
1328        bprm->exec = bprm->p;
1329        retval = copy_strings(bprm->envc, envp, bprm);
1330        if (retval < 0)
1331                goto out;
1332
1333        env_p = bprm->p;
1334        retval = copy_strings(bprm->argc, argv, bprm);
1335        if (retval < 0)
1336                goto out;
1337        bprm->argv_len = env_p - bprm->p;
1338
1339        retval = search_binary_handler(bprm,regs);
1340        if (retval >= 0) {
1341                /* execve success */
1342                free_arg_pages(bprm);
1343                security_bprm_free(bprm);
1344                acct_update_integrals(current);
1345                kfree(bprm);
1346                return retval;
1347        }
1348
1349out:
1350        free_arg_pages(bprm);
1351        if (bprm->security)
1352                security_bprm_free(bprm);
1353
1354out_mm:
1355        if (bprm->mm)
1356                mmput (bprm->mm);
1357
1358out_file:
1359        if (bprm->file) {
1360                allow_write_access(bprm->file);
1361                fput(bprm->file);
1362        }
1363out_kfree:
1364        kfree(bprm);
1365
1366out_ret:
1367        return retval;
1368}
1369
1370int set_binfmt(struct linux_binfmt *new)
1371{
1372        struct linux_binfmt *old = current->binfmt;
1373
1374        if (new) {
1375                if (!try_module_get(new->module))
1376                        return -1;
1377        }
1378        current->binfmt = new;
1379        if (old)
1380                module_put(old->module);
1381        return 0;
1382}
1383
1384EXPORT_SYMBOL(set_binfmt);
1385
1386/* format_corename will inspect the pattern parameter, and output a
1387 * name into corename, which must have space for at least
1388 * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
1389 */
1390static int format_corename(char *corename, const char *pattern, long signr)
1391{
1392        const char *pat_ptr = pattern;
1393        char *out_ptr = corename;
1394        char *const out_end = corename + CORENAME_MAX_SIZE;
1395        int rc;
1396        int pid_in_pattern = 0;
1397        int ispipe = 0;
1398
1399        if (*pattern == '|')
1400                ispipe = 1;
1401
1402        /* Repeat as long as we have more pattern to process and more output
1403           space */
1404        while (*pat_ptr) {
1405                if (*pat_ptr != '%') {
1406                        if (out_ptr == out_end)
1407                                goto out;
1408                        *out_ptr++ = *pat_ptr++;
1409                } else {
1410                        switch (*++pat_ptr) {
1411                        case 0:
1412                                goto out;
1413                        /* Double percent, output one percent */
1414                        case '%':
1415                                if (out_ptr == out_end)
1416                                        goto out;
1417                                *out_ptr++ = '%';
1418                                break;
1419                        /* pid */
1420                        case 'p':
1421                                pid_in_pattern = 1;
1422                                rc = snprintf(out_ptr, out_end - out_ptr,
1423                                              "%d", task_tgid_vnr(current));
1424                                if (rc > out_end - out_ptr)
1425                                        goto out;
1426                                out_ptr += rc;
1427                                break;
1428                        /* uid */
1429                        case 'u':
1430                                rc = snprintf(out_ptr, out_end - out_ptr,
1431                                              "%d", current->uid);
1432                                if (rc > out_end - out_ptr)
1433                                        goto out;
1434                                out_ptr += rc;
1435                                break;
1436                        /* gid */
1437                        case 'g':
1438                                rc = snprintf(out_ptr, out_end - out_ptr,
1439                                              "%d", current->gid);
1440                                if (rc > out_end - out_ptr)
1441                                        goto out;
1442                                out_ptr += rc;
1443                                break;
1444                        /* signal that caused the coredump */
1445                        case 's':
1446                                rc = snprintf(out_ptr, out_end - out_ptr,
1447                                              "%ld", signr);
1448                                if (rc > out_end - out_ptr)
1449                                        goto out;
1450                                out_ptr += rc;
1451                                break;
1452                        /* UNIX time of coredump */
1453                        case 't': {
1454                                struct timeval tv;
1455                                do_gettimeofday(&tv);
1456                                rc = snprintf(out_ptr, out_end - out_ptr,
1457                                              "%lu", tv.tv_sec);
1458                                if (rc > out_end - out_ptr)
1459                                        goto out;
1460                                out_ptr += rc;
1461                                break;
1462                        }
1463                        /* hostname */
1464                        case 'h':
1465                                down_read(&uts_sem);
1466                                rc = snprintf(out_ptr, out_end - out_ptr,
1467                                              "%s", utsname()->nodename);
1468                                up_read(&uts_sem);
1469                                if (rc > out_end - out_ptr)
1470                                        goto out;
1471                                out_ptr += rc;
1472                                break;
1473                        /* executable */
1474                        case 'e':
1475                                rc = snprintf(out_ptr, out_end - out_ptr,
1476                                              "%s", current->comm);
1477                                if (rc > out_end - out_ptr)
1478                                        goto out;
1479                                out_ptr += rc;
1480                                break;
1481                        /* core limit size */
1482                        case 'c':
1483                                rc = snprintf(out_ptr, out_end - out_ptr,
1484                                              "%lu", current->signal->rlim[RLIMIT_CORE].rlim_cur);
1485                                if (rc > out_end - out_ptr)
1486                                        goto out;
1487                                out_ptr += rc;
1488                                break;
1489                        default:
1490                                break;
1491                        }
1492                        ++pat_ptr;
1493                }
1494        }
1495        /* Backward compatibility with core_uses_pid:
1496         *
1497         * If core_pattern does not include a %p (as is the default)
1498         * and core_uses_pid is set, then .%pid will be appended to
1499         * the filename. Do not do this for piped commands. */
1500        if (!ispipe && !pid_in_pattern
1501            && (core_uses_pid || atomic_read(&current->mm->mm_users) != 1)) {
1502                rc = snprintf(out_ptr, out_end - out_ptr,
1503                              ".%d", task_tgid_vnr(current));
1504                if (rc > out_end - out_ptr)
1505                        goto out;
1506                out_ptr += rc;
1507        }
1508out:
1509        *out_ptr = 0;
1510        return ispipe;
1511}
1512
1513static void zap_process(struct task_struct *start)
1514{
1515        struct task_struct *t;
1516
1517        start->signal->flags = SIGNAL_GROUP_EXIT;
1518        start->signal->group_stop_count = 0;
1519
1520        t = start;
1521        do {
1522                if (t != current && t->mm) {
1523                        t->mm->core_waiters++;
1524                        sigaddset(&t->pending.signal, SIGKILL);
1525                        signal_wake_up(t, 1);
1526                }
1527        } while ((t = next_thread(t)) != start);
1528}
1529
1530static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
1531                                int exit_code)
1532{
1533        struct task_struct *g, *p;
1534        unsigned long flags;
1535        int err = -EAGAIN;
1536
1537        spin_lock_irq(&tsk->sighand->siglock);
1538        if (!signal_group_exit(tsk->signal)) {
1539                tsk->signal->group_exit_code = exit_code;
1540                zap_process(tsk);
1541                err = 0;
1542        }
1543        spin_unlock_irq(&tsk->sighand->siglock);
1544        if (err)
1545                return err;
1546
1547        if (atomic_read(&mm->mm_users) == mm->core_waiters + 1)
1548                goto done;
1549
1550        rcu_read_lock();
1551        for_each_process(g) {
1552                if (g == tsk->group_leader)
1553                        continue;
1554
1555                p = g;
1556                do {
1557                        if (p->mm) {
1558                                if (p->mm == mm) {
1559                                        /*
1560                                         * p->sighand can't disappear, but
1561                                         * may be changed by de_thread()
1562                                         */
1563                                        lock_task_sighand(p, &flags);
1564                                        zap_process(p);
1565                                        unlock_task_sighand(p, &flags);
1566                                }
1567                                break;
1568                        }
1569                } while ((p = next_thread(p)) != g);
1570        }
1571        rcu_read_unlock();
1572done:
1573        return mm->core_waiters;
1574}
1575
1576static int coredump_wait(int exit_code)
1577{
1578        struct task_struct *tsk = current;
1579        struct mm_struct *mm = tsk->mm;
1580        struct completion startup_done;
1581        struct completion *vfork_done;
1582        int core_waiters;
1583
1584        init_completion(&mm->core_done);
1585        init_completion(&startup_done);
1586        mm->core_startup_done = &startup_done;
1587
1588        core_waiters = zap_threads(tsk, mm, exit_code);
1589        up_write(&mm->mmap_sem);
1590
1591        if (unlikely(core_waiters < 0))
1592                goto fail;
1593
1594        /*
1595         * Make sure nobody is waiting for us to release the VM,
1596         * otherwise we can deadlock when we wait on each other
1597         */
1598        vfork_done = tsk->vfork_done;
1599        if (vfork_done) {
1600                tsk->vfork_done = NULL;
1601                complete(vfork_done);
1602        }
1603
1604        if (core_waiters)
1605                wait_for_completion(&startup_done);
1606fail:
1607        BUG_ON(mm->core_waiters);
1608        return core_waiters;
1609}
1610
1611/*
1612 * set_dumpable converts traditional three-value dumpable to two flags and
1613 * stores them into mm->flags.  It modifies lower two bits of mm->flags, but
1614 * these bits are not changed atomically.  So get_dumpable can observe the
1615 * intermediate state.  To avoid doing unexpected behavior, get get_dumpable
1616 * return either old dumpable or new one by paying attention to the order of
1617 * modifying the bits.
1618 *
1619 * dumpable |   mm->flags (binary)
1620 * old  new | initial interim  final
1621 * ---------+-----------------------
1622 *  0    1  |   00      01      01
1623 *  0    2  |   00      10(*)   11
1624 *  1    0  |   01      00      00
1625 *  1    2  |   01      11      11
1626 *  2    0  |   11      10(*)   00
1627 *  2    1  |   11      11      01
1628 *
1629 * (*) get_dumpable regards interim value of 10 as 11.
1630 */
1631void set_dumpable(struct mm_struct *mm, int value)
1632{
1633        switch (value) {
1634        case 0:
1635                clear_bit(MMF_DUMPABLE, &mm->flags);
1636                smp_wmb();
1637                clear_bit(MMF_DUMP_SECURELY, &mm->flags);
1638                break;
1639        case 1:
1640                set_bit(MMF_DUMPABLE, &mm->flags);
1641                smp_wmb();
1642                clear_bit(MMF_DUMP_SECURELY, &mm->flags);
1643                break;
1644        case 2:
1645                set_bit(MMF_DUMP_SECURELY, &mm->flags);
1646                smp_wmb();
1647                set_bit(MMF_DUMPABLE, &mm->flags);
1648                break;
1649        }
1650}
1651
1652int get_dumpable(struct mm_struct *mm)
1653{
1654        int ret;
1655
1656        ret = mm->flags & 0x3;
1657        return (ret >= 2) ? 2 : ret;
1658}
1659
1660int do_coredump(long signr, int exit_code, struct pt_regs * regs)
1661{
1662        char corename[CORENAME_MAX_SIZE + 1];
1663        struct mm_struct *mm = current->mm;
1664        struct linux_binfmt * binfmt;
1665        struct inode * inode;
1666        struct file * file;
1667        int retval = 0;
1668        int fsuid = current->fsuid;
1669        int flag = 0;
1670        int ispipe = 0;
1671        unsigned long core_limit = current->signal->rlim[RLIMIT_CORE].rlim_cur;
1672        char **helper_argv = NULL;
1673        int helper_argc = 0;
1674        char *delimit;
1675
1676        audit_core_dumps(signr);
1677
1678        binfmt = current->binfmt;
1679        if (!binfmt || !binfmt->core_dump)
1680                goto fail;
1681        down_write(&mm->mmap_sem);
1682        /*
1683         * If another thread got here first, or we are not dumpable, bail out.
1684         */
1685        if (mm->core_waiters || !get_dumpable(mm)) {
1686                up_write(&mm->mmap_sem);
1687                goto fail;
1688        }
1689
1690        /*
1691         *      We cannot trust fsuid as being the "true" uid of the
1692         *      process nor do we know its entire history. We only know it
1693         *      was tainted so we dump it as root in mode 2.
1694         */
1695        if (get_dumpable(mm) == 2) {    /* Setuid core dump mode */
1696                flag = O_EXCL;          /* Stop rewrite attacks */
1697                current->fsuid = 0;     /* Dump root private */
1698        }
1699
1700        retval = coredump_wait(exit_code);
1701        if (retval < 0)
1702                goto fail;
1703
1704        /*
1705         * Clear any false indication of pending signals that might
1706         * be seen by the filesystem code called to write the core file.
1707         */
1708        clear_thread_flag(TIF_SIGPENDING);
1709
1710        /*
1711         * lock_kernel() because format_corename() is controlled by sysctl, which
1712         * uses lock_kernel()
1713         */
1714        lock_kernel();
1715        ispipe = format_corename(corename, core_pattern, signr);
1716        unlock_kernel();
1717        /*
1718         * Don't bother to check the RLIMIT_CORE value if core_pattern points
1719         * to a pipe.  Since we're not writing directly to the filesystem
1720         * RLIMIT_CORE doesn't really apply, as no actual core file will be
1721         * created unless the pipe reader choses to write out the core file
1722         * at which point file size limits and permissions will be imposed
1723         * as it does with any other process
1724         */
1725        if ((!ispipe) && (core_limit < binfmt->min_coredump))
1726                goto fail_unlock;
1727
1728        if (ispipe) {
1729                helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc);
1730                /* Terminate the string before the first option */
1731                delimit = strchr(corename, ' ');
1732                if (delimit)
1733                        *delimit = '\0';
1734                delimit = strrchr(helper_argv[0], '/');
1735                if (delimit)
1736                        delimit++;
1737                else
1738                        delimit = helper_argv[0];
1739                if (!strcmp(delimit, current->comm)) {
1740                        printk(KERN_NOTICE "Recursive core dump detected, "
1741                                        "aborting\n");
1742                        goto fail_unlock;
1743                }
1744
1745                core_limit = RLIM_INFINITY;
1746
1747                /* SIGPIPE can happen, but it's just never processed */
1748                if (call_usermodehelper_pipe(corename+1, helper_argv, NULL,
1749                                &file)) {
1750                        printk(KERN_INFO "Core dump to %s pipe failed\n",
1751                               corename);
1752                        goto fail_unlock;
1753                }
1754        } else
1755                file = filp_open(corename,
1756                                 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
1757                                 0600);
1758        if (IS_ERR(file))
1759                goto fail_unlock;
1760        inode = file->f_path.dentry->d_inode;
1761        if (inode->i_nlink > 1)
1762                goto close_fail;        /* multiple links - don't dump */
1763        if (!ispipe && d_unhashed(file->f_path.dentry))
1764                goto close_fail;
1765
1766        /* AK: actually i see no reason to not allow this for named pipes etc.,
1767           but keep the previous behaviour for now. */
1768        if (!ispipe && !S_ISREG(inode->i_mode))
1769                goto close_fail;
1770        /*
1771         * Dont allow local users get cute and trick others to coredump
1772         * into their pre-created files:
1773         */
1774        if (inode->i_uid != current->fsuid)
1775                goto close_fail;
1776        if (!file->f_op)
1777                goto close_fail;
1778        if (!file->f_op->write)
1779                goto close_fail;
1780        if (!ispipe && do_truncate(file->f_path.dentry, 0, 0, file) != 0)
1781                goto close_fail;
1782
1783        retval = binfmt->core_dump(signr, regs, file, core_limit);
1784
1785        if (retval)
1786                current->signal->group_exit_code |= 0x80;
1787close_fail:
1788        filp_close(file, NULL);
1789fail_unlock:
1790        if (helper_argv)
1791                argv_free(helper_argv);
1792
1793        current->fsuid = fsuid;
1794        complete_all(&mm->core_done);
1795fail:
1796        return retval;
1797}
1798
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.