linux/fs/exec.c
   1/*
   2 *  linux/fs/exec.c
   3 *
   4 *  Copyright (C) 1991, 1992  Linus Torvalds
   5 */
   6
   7/*
   8 * #!-checking implemented by tytso.
   9 */
  10/*
  11 * Demand-loading implemented 01.12.91 - no need to read anything but
  12 * the header into memory. The inode of the executable is put into
  13 * "current->executable", and page faults do the actual loading. Clean.
  14 *
  15 * Once more I can proudly say that linux stood up to being changed: it
  16 * was less than 2 hours work to get demand-loading completely implemented.
  17 *
  18 * Demand loading changed July 1993 by Eric Youngdale.   Use mmap instead,
  19 * current->executable is only used by the procfs.  This allows a dispatch
   20 * table to check for several different types of binary formats.  We keep
  21 * trying until we recognize the file or we run out of supported binary
  22 * formats. 
  23 */
  24
  25#include <linux/slab.h>
  26#include <linux/file.h>
  27#include <linux/fdtable.h>
  28#include <linux/mm.h>
  29#include <linux/stat.h>
  30#include <linux/fcntl.h>
  31#include <linux/swap.h>
  32#include <linux/string.h>
  33#include <linux/init.h>
  34#include <linux/pagemap.h>
  35#include <linux/perf_event.h>
  36#include <linux/highmem.h>
  37#include <linux/spinlock.h>
  38#include <linux/key.h>
  39#include <linux/personality.h>
  40#include <linux/binfmts.h>
  41#include <linux/utsname.h>
  42#include <linux/pid_namespace.h>
  43#include <linux/module.h>
  44#include <linux/namei.h>
  45#include <linux/mount.h>
  46#include <linux/security.h>
  47#include <linux/syscalls.h>
  48#include <linux/tsacct_kern.h>
  49#include <linux/cn_proc.h>
  50#include <linux/audit.h>
  51#include <linux/tracehook.h>
  52#include <linux/kmod.h>
  53#include <linux/fsnotify.h>
  54#include <linux/fs_struct.h>
  55#include <linux/pipe_fs_i.h>
  56#include <linux/oom.h>
  57#include <linux/compat.h>
  58
  59#include <asm/uaccess.h>
  60#include <asm/mmu_context.h>
  61#include <asm/tlb.h>
  62#include <asm/exec.h>
  63
  64#include <trace/events/task.h>
  65#include "internal.h"
  66
  67#include <trace/events/sched.h>
  68
  69int core_uses_pid;
  70char core_pattern[CORENAME_MAX_SIZE] = "core";
  71unsigned int core_pipe_limit;
  72int suid_dumpable = 0;
  73
  74struct core_name {
  75        char *corename;
  76        int used, size;
  77};
  78static atomic_t call_count = ATOMIC_INIT(1);
  79
  80/* The maximal length of core_pattern is also specified in sysctl.c */
  81
  82static LIST_HEAD(formats);
  83static DEFINE_RWLOCK(binfmt_lock);
  84
  85void __register_binfmt(struct linux_binfmt * fmt, int insert)
  86{
  87        BUG_ON(!fmt);
  88        write_lock(&binfmt_lock);
  89        insert ? list_add(&fmt->lh, &formats) :
  90                 list_add_tail(&fmt->lh, &formats);
  91        write_unlock(&binfmt_lock);
  92}
  93
  94EXPORT_SYMBOL(__register_binfmt);
  95
  96void unregister_binfmt(struct linux_binfmt * fmt)
  97{
  98        write_lock(&binfmt_lock);
  99        list_del(&fmt->lh);
 100        write_unlock(&binfmt_lock);
 101}
 102
 103EXPORT_SYMBOL(unregister_binfmt);
 104
 105static inline void put_binfmt(struct linux_binfmt * fmt)
 106{
 107        module_put(fmt->module);
 108}
 109
 110/*
  111 * Note that a shared library must be both readable and executable for
  112 * security reasons.
  113 *
  114 * Also note that we take the address to load from the file itself.
 115 */
 116SYSCALL_DEFINE1(uselib, const char __user *, library)
 117{
 118        struct file *file;
 119        char *tmp = getname(library);
 120        int error = PTR_ERR(tmp);
 121        static const struct open_flags uselib_flags = {
 122                .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
 123                .acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN,
 124                .intent = LOOKUP_OPEN
 125        };
 126
 127        if (IS_ERR(tmp))
 128                goto out;
 129
 130        file = do_filp_open(AT_FDCWD, tmp, &uselib_flags, LOOKUP_FOLLOW);
 131        putname(tmp);
 132        error = PTR_ERR(file);
 133        if (IS_ERR(file))
 134                goto out;
 135
 136        error = -EINVAL;
 137        if (!S_ISREG(file->f_path.dentry->d_inode->i_mode))
 138                goto exit;
 139
 140        error = -EACCES;
 141        if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
 142                goto exit;
 143
 144        fsnotify_open(file);
 145
 146        error = -ENOEXEC;
  147        if (file->f_op) {
 148                struct linux_binfmt * fmt;
 149
 150                read_lock(&binfmt_lock);
 151                list_for_each_entry(fmt, &formats, lh) {
 152                        if (!fmt->load_shlib)
 153                                continue;
 154                        if (!try_module_get(fmt->module))
 155                                continue;
 156                        read_unlock(&binfmt_lock);
 157                        error = fmt->load_shlib(file);
 158                        read_lock(&binfmt_lock);
 159                        put_binfmt(fmt);
 160                        if (error != -ENOEXEC)
 161                                break;
 162                }
 163                read_unlock(&binfmt_lock);
 164        }
 165exit:
 166        fput(file);
 167out:
 168        return error;
 169}
 170
 171#ifdef CONFIG_MMU
 172/*
  173 * The nascent bprm->mm is not visible until exec_mmap(), but it can
  174 * use a lot of memory, so account these pages in current->mm temporarily
  175 * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we
  176 * change the counter back via acct_arg_size(0).
 177 */
 178static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
 179{
 180        struct mm_struct *mm = current->mm;
 181        long diff = (long)(pages - bprm->vma_pages);
 182
 183        if (!mm || !diff)
 184                return;
 185
 186        bprm->vma_pages = pages;
 187        add_mm_counter(mm, MM_ANONPAGES, diff);
 188}
 189
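     /*
      * Pin the page that backs the argument area at @pos via get_user_pages().
      * For writes, the grown stack is accounted through acct_arg_size() and
      * capped: up to ARG_MAX of strings is always allowed, anything beyond
      * that must fit within a quarter of RLIMIT_STACK.
      */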
 190static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 191                int write)
 192{
 193        struct page *page;
 194        int ret;
 195
 196#ifdef CONFIG_STACK_GROWSUP
 197        if (write) {
 198                ret = expand_downwards(bprm->vma, pos);
 199                if (ret < 0)
 200                        return NULL;
 201        }
 202#endif
 203        ret = get_user_pages(current, bprm->mm, pos,
 204                        1, write, 1, &page, NULL);
 205        if (ret <= 0)
 206                return NULL;
 207
 208        if (write) {
 209                unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
 210                struct rlimit *rlim;
 211
 212                acct_arg_size(bprm, size / PAGE_SIZE);
 213
 214                /*
 215                 * We've historically supported up to 32 pages (ARG_MAX)
 216                 * of argument strings even with small stacks
 217                 */
 218                if (size <= ARG_MAX)
 219                        return page;
 220
 221                /*
  222                 * Limit to 1/4 of the stack size for the argv+env strings.
 223                 * This ensures that:
 224                 *  - the remaining binfmt code will not run out of stack space,
 225                 *  - the program will have a reasonable amount of stack left
 226                 *    to work from.
 227                 */
 228                rlim = current->signal->rlim;
 229                if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur) / 4) {
 230                        put_page(page);
 231                        return NULL;
 232                }
 233        }
 234
 235        return page;
 236}
 237
 238static void put_arg_page(struct page *page)
 239{
 240        put_page(page);
 241}
 242
 243static void free_arg_page(struct linux_binprm *bprm, int i)
 244{
 245}
 246
 247static void free_arg_pages(struct linux_binprm *bprm)
 248{
 249}
 250
 251static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
 252                struct page *page)
 253{
 254        flush_cache_page(bprm->vma, pos, page_to_pfn(page));
 255}
 256
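     /*
      * Create the temporary argument stack for the new mm: a single page
      * ending at STACK_TOP_MAX, recorded in bprm->vma, with bprm->p set
      * just below the top of that page.
      */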
 257static int __bprm_mm_init(struct linux_binprm *bprm)
 258{
 259        int err;
 260        struct vm_area_struct *vma = NULL;
 261        struct mm_struct *mm = bprm->mm;
 262
 263        bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
 264        if (!vma)
 265                return -ENOMEM;
 266
 267        down_write(&mm->mmap_sem);
 268        vma->vm_mm = mm;
 269
 270        /*
 271         * Place the stack at the largest stack address the architecture
 272         * supports. Later, we'll move this to an appropriate place. We don't
 273         * use STACK_TOP because that can depend on attributes which aren't
 274         * configured yet.
 275         */
 276        BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
 277        vma->vm_end = STACK_TOP_MAX;
 278        vma->vm_start = vma->vm_end - PAGE_SIZE;
 279        vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
 280        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
 281        INIT_LIST_HEAD(&vma->anon_vma_chain);
 282
 283        err = insert_vm_struct(mm, vma);
 284        if (err)
 285                goto err;
 286
 287        mm->stack_vm = mm->total_vm = 1;
 288        up_write(&mm->mmap_sem);
 289        bprm->p = vma->vm_end - sizeof(void *);
 290        return 0;
 291err:
 292        up_write(&mm->mmap_sem);
 293        bprm->vma = NULL;
 294        kmem_cache_free(vm_area_cachep, vma);
 295        return err;
 296}
 297
 298static bool valid_arg_len(struct linux_binprm *bprm, long len)
 299{
 300        return len <= MAX_ARG_STRLEN;
 301}
 302
 303#else
 304
 305static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
 306{
 307}
 308
 309static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 310                int write)
 311{
 312        struct page *page;
 313
 314        page = bprm->page[pos / PAGE_SIZE];
 315        if (!page && write) {
 316                page = alloc_page(GFP_HIGHUSER|__GFP_ZERO);
 317                if (!page)
 318                        return NULL;
 319                bprm->page[pos / PAGE_SIZE] = page;
 320        }
 321
 322        return page;
 323}
 324
 325static void put_arg_page(struct page *page)
 326{
 327}
 328
 329static void free_arg_page(struct linux_binprm *bprm, int i)
 330{
 331        if (bprm->page[i]) {
 332                __free_page(bprm->page[i]);
 333                bprm->page[i] = NULL;
 334        }
 335}
 336
 337static void free_arg_pages(struct linux_binprm *bprm)
 338{
 339        int i;
 340
 341        for (i = 0; i < MAX_ARG_PAGES; i++)
 342                free_arg_page(bprm, i);
 343}
 344
 345static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
 346                struct page *page)
 347{
 348}
 349
 350static int __bprm_mm_init(struct linux_binprm *bprm)
 351{
 352        bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
 353        return 0;
 354}
 355
 356static bool valid_arg_len(struct linux_binprm *bprm, long len)
 357{
 358        return len <= bprm->p;
 359}
 360
 361#endif /* CONFIG_MMU */
 362
 363/*
 364 * Create a new mm_struct and populate it with a temporary stack
 365 * vm_area_struct.  We don't have enough context at this point to set the stack
 366 * flags, permissions, and offset, so we use temporary values.  We'll update
 367 * them later in setup_arg_pages().
 368 */
 369int bprm_mm_init(struct linux_binprm *bprm)
 370{
 371        int err;
 372        struct mm_struct *mm = NULL;
 373
 374        bprm->mm = mm = mm_alloc();
 375        err = -ENOMEM;
 376        if (!mm)
 377                goto err;
 378
 379        err = init_new_context(current, mm);
 380        if (err)
 381                goto err;
 382
 383        err = __bprm_mm_init(bprm);
 384        if (err)
 385                goto err;
 386
 387        return 0;
 388
 389err:
 390        if (mm) {
 391                bprm->mm = NULL;
 392                mmdrop(mm);
 393        }
 394
 395        return err;
 396}
 397
 398struct user_arg_ptr {
 399#ifdef CONFIG_COMPAT
 400        bool is_compat;
 401#endif
 402        union {
 403                const char __user *const __user *native;
 404#ifdef CONFIG_COMPAT
 405                compat_uptr_t __user *compat;
 406#endif
 407        } ptr;
 408};
 409
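     /*
      * Fetch the nr'th string pointer from the argv/envp array, using the
      * 32-bit compat layout when the caller is a compat task.
      */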
 410static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr)
 411{
 412        const char __user *native;
 413
 414#ifdef CONFIG_COMPAT
 415        if (unlikely(argv.is_compat)) {
 416                compat_uptr_t compat;
 417
 418                if (get_user(compat, argv.ptr.compat + nr))
 419                        return ERR_PTR(-EFAULT);
 420
 421                return compat_ptr(compat);
 422        }
 423#endif
 424
 425        if (get_user(native, argv.ptr.native + nr))
 426                return ERR_PTR(-EFAULT);
 427
 428        return native;
 429}
 430
 431/*
 432 * count() counts the number of strings in array ARGV.
 433 */
 434static int count(struct user_arg_ptr argv, int max)
 435{
 436        int i = 0;
 437
 438        if (argv.ptr.native != NULL) {
 439                for (;;) {
 440                        const char __user *p = get_user_arg_ptr(argv, i);
 441
 442                        if (!p)
 443                                break;
 444
 445                        if (IS_ERR(p))
 446                                return -EFAULT;
 447
 448                        if (i++ >= max)
 449                                return -E2BIG;
 450
 451                        if (fatal_signal_pending(current))
 452                                return -ERESTARTNOHAND;
 453                        cond_resched();
 454                }
 455        }
 456        return i;
 457}
 458
 459/*
 460 * 'copy_strings()' copies argument/environment strings from the old
  461 * process's memory to the new process's stack.  The call to get_user_pages()
 462 * ensures the destination page is created and not swapped out.
 463 */
 464static int copy_strings(int argc, struct user_arg_ptr argv,
 465                        struct linux_binprm *bprm)
 466{
 467        struct page *kmapped_page = NULL;
 468        char *kaddr = NULL;
 469        unsigned long kpos = 0;
 470        int ret;
 471
 472        while (argc-- > 0) {
 473                const char __user *str;
 474                int len;
 475                unsigned long pos;
 476
 477                ret = -EFAULT;
 478                str = get_user_arg_ptr(argv, argc);
 479                if (IS_ERR(str))
 480                        goto out;
 481
 482                len = strnlen_user(str, MAX_ARG_STRLEN);
 483                if (!len)
 484                        goto out;
 485
 486                ret = -E2BIG;
 487                if (!valid_arg_len(bprm, len))
 488                        goto out;
 489
  490                /* We're going to work our way backwards. */
 491                pos = bprm->p;
 492                str += len;
 493                bprm->p -= len;
 494
 495                while (len > 0) {
 496                        int offset, bytes_to_copy;
 497
 498                        if (fatal_signal_pending(current)) {
 499                                ret = -ERESTARTNOHAND;
 500                                goto out;
 501                        }
 502                        cond_resched();
 503
 504                        offset = pos % PAGE_SIZE;
 505                        if (offset == 0)
 506                                offset = PAGE_SIZE;
 507
 508                        bytes_to_copy = offset;
 509                        if (bytes_to_copy > len)
 510                                bytes_to_copy = len;
 511
 512                        offset -= bytes_to_copy;
 513                        pos -= bytes_to_copy;
 514                        str -= bytes_to_copy;
 515                        len -= bytes_to_copy;
 516
 517                        if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
 518                                struct page *page;
 519
 520                                page = get_arg_page(bprm, pos, 1);
 521                                if (!page) {
 522                                        ret = -E2BIG;
 523                                        goto out;
 524                                }
 525
 526                                if (kmapped_page) {
 527                                        flush_kernel_dcache_page(kmapped_page);
 528                                        kunmap(kmapped_page);
 529                                        put_arg_page(kmapped_page);
 530                                }
 531                                kmapped_page = page;
 532                                kaddr = kmap(kmapped_page);
 533                                kpos = pos & PAGE_MASK;
 534                                flush_arg_page(bprm, kpos, kmapped_page);
 535                        }
 536                        if (copy_from_user(kaddr+offset, str, bytes_to_copy)) {
 537                                ret = -EFAULT;
 538                                goto out;
 539                        }
 540                }
 541        }
 542        ret = 0;
 543out:
 544        if (kmapped_page) {
 545                flush_kernel_dcache_page(kmapped_page);
 546                kunmap(kmapped_page);
 547                put_arg_page(kmapped_page);
 548        }
 549        return ret;
 550}
 551
 552/*
 553 * Like copy_strings, but get argv and its values from kernel memory.
 554 */
 555int copy_strings_kernel(int argc, const char *const *__argv,
 556                        struct linux_binprm *bprm)
 557{
 558        int r;
 559        mm_segment_t oldfs = get_fs();
 560        struct user_arg_ptr argv = {
 561                .ptr.native = (const char __user *const  __user *)__argv,
 562        };
 563
 564        set_fs(KERNEL_DS);
 565        r = copy_strings(argc, argv, bprm);
 566        set_fs(oldfs);
 567
 568        return r;
 569}
 570EXPORT_SYMBOL(copy_strings_kernel);
 571
 572#ifdef CONFIG_MMU
 573
 574/*
 575 * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX.  Once
 576 * the binfmt code determines where the new stack should reside, we shift it to
 577 * its final location.  The process proceeds as follows:
 578 *
 579 * 1) Use shift to calculate the new vma endpoints.
 580 * 2) Extend vma to cover both the old and new ranges.  This ensures the
 581 *    arguments passed to subsequent functions are consistent.
 582 * 3) Move vma's page tables to the new range.
 583 * 4) Free up any cleared pgd range.
 584 * 5) Shrink the vma to cover only the new range.
 585 */
 586static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
 587{
 588        struct mm_struct *mm = vma->vm_mm;
 589        unsigned long old_start = vma->vm_start;
 590        unsigned long old_end = vma->vm_end;
 591        unsigned long length = old_end - old_start;
 592        unsigned long new_start = old_start - shift;
 593        unsigned long new_end = old_end - shift;
 594        struct mmu_gather tlb;
 595
 596        BUG_ON(new_start > new_end);
 597
 598        /*
 599         * ensure there are no vmas between where we want to go
 600         * and where we are
 601         */
 602        if (vma != find_vma(mm, new_start))
 603                return -EFAULT;
 604
 605        /*
 606         * cover the whole range: [new_start, old_end)
 607         */
 608        if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL))
 609                return -ENOMEM;
 610
 611        /*
  612         * move the page tables downwards; on failure we rely on
 613         * process cleanup to remove whatever mess we made.
 614         */
 615        if (length != move_page_tables(vma, old_start,
 616                                       vma, new_start, length))
 617                return -ENOMEM;
 618
 619        lru_add_drain();
 620        tlb_gather_mmu(&tlb, mm, 0);
 621        if (new_end > old_start) {
 622                /*
  623                 * when the old and new regions overlap, clear from new_end.
 624                 */
 625                free_pgd_range(&tlb, new_end, old_end, new_end,
 626                        vma->vm_next ? vma->vm_next->vm_start : 0);
 627        } else {
 628                /*
  629                 * otherwise, clean from old_start; this is done to not touch
  630                 * the address space in [new_end, old_start), because some
  631                 * architectures have constraints on va-space that make this
  632                 * illegal (IA64) - for the others it's just a little faster.
 633                 */
 634                free_pgd_range(&tlb, old_start, old_end, new_end,
 635                        vma->vm_next ? vma->vm_next->vm_start : 0);
 636        }
 637        tlb_finish_mmu(&tlb, new_end, old_end);
 638
 639        /*
 640         * Shrink the vma to just the new range.  Always succeeds.
 641         */
 642        vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL);
 643
 644        return 0;
 645}
 646
 647/*
 648 * Finalizes the stack vm_area_struct. The flags and permissions are updated,
 649 * the stack is optionally relocated, and some extra space is added.
 650 */
 651int setup_arg_pages(struct linux_binprm *bprm,
 652                    unsigned long stack_top,
 653                    int executable_stack)
 654{
 655        unsigned long ret;
 656        unsigned long stack_shift;
 657        struct mm_struct *mm = current->mm;
 658        struct vm_area_struct *vma = bprm->vma;
 659        struct vm_area_struct *prev = NULL;
 660        unsigned long vm_flags;
 661        unsigned long stack_base;
 662        unsigned long stack_size;
 663        unsigned long stack_expand;
 664        unsigned long rlim_stack;
 665
 666#ifdef CONFIG_STACK_GROWSUP
 667        /* Limit stack size to 1GB */
 668        stack_base = rlimit_max(RLIMIT_STACK);
 669        if (stack_base > (1 << 30))
 670                stack_base = 1 << 30;
 671
 672        /* Make sure we didn't let the argument array grow too large. */
 673        if (vma->vm_end - vma->vm_start > stack_base)
 674                return -ENOMEM;
 675
 676        stack_base = PAGE_ALIGN(stack_top - stack_base);
 677
 678        stack_shift = vma->vm_start - stack_base;
 679        mm->arg_start = bprm->p - stack_shift;
 680        bprm->p = vma->vm_end - stack_shift;
 681#else
 682        stack_top = arch_align_stack(stack_top);
 683        stack_top = PAGE_ALIGN(stack_top);
 684
 685        if (unlikely(stack_top < mmap_min_addr) ||
 686            unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr))
 687                return -ENOMEM;
 688
 689        stack_shift = vma->vm_end - stack_top;
 690
 691        bprm->p -= stack_shift;
 692        mm->arg_start = bprm->p;
 693#endif
 694
 695        if (bprm->loader)
 696                bprm->loader -= stack_shift;
 697        bprm->exec -= stack_shift;
 698
 699        down_write(&mm->mmap_sem);
 700        vm_flags = VM_STACK_FLAGS;
 701
 702        /*
 703         * Adjust stack execute permissions; explicitly enable for
 704         * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone
 705         * (arch default) otherwise.
 706         */
 707        if (unlikely(executable_stack == EXSTACK_ENABLE_X))
 708                vm_flags |= VM_EXEC;
 709        else if (executable_stack == EXSTACK_DISABLE_X)
 710                vm_flags &= ~VM_EXEC;
 711        vm_flags |= mm->def_flags;
 712        vm_flags |= VM_STACK_INCOMPLETE_SETUP;
 713
 714        ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
 715                        vm_flags);
 716        if (ret)
 717                goto out_unlock;
 718        BUG_ON(prev != vma);
 719
 720        /* Move stack pages down in memory. */
 721        if (stack_shift) {
 722                ret = shift_arg_pages(vma, stack_shift);
 723                if (ret)
 724                        goto out_unlock;
 725        }
 726
 727        /* mprotect_fixup is overkill to remove the temporary stack flags */
 728        vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP;
 729
 730        stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
 731        stack_size = vma->vm_end - vma->vm_start;
 732        /*
 733         * Align this down to a page boundary as expand_stack
 734         * will align it up.
 735         */
 736        rlim_stack = rlimit(RLIMIT_STACK) & PAGE_MASK;
 737#ifdef CONFIG_STACK_GROWSUP
 738        if (stack_size + stack_expand > rlim_stack)
 739                stack_base = vma->vm_start + rlim_stack;
 740        else
 741                stack_base = vma->vm_end + stack_expand;
 742#else
 743        if (stack_size + stack_expand > rlim_stack)
 744                stack_base = vma->vm_end - rlim_stack;
 745        else
 746                stack_base = vma->vm_start - stack_expand;
 747#endif
 748        current->mm->start_stack = bprm->p;
 749        ret = expand_stack(vma, stack_base);
 750        if (ret)
 751                ret = -EFAULT;
 752
 753out_unlock:
 754        up_write(&mm->mmap_sem);
 755        return ret;
 756}
 757EXPORT_SYMBOL(setup_arg_pages);
 758
 759#endif /* CONFIG_MMU */
 760
 761struct file *open_exec(const char *name)
 762{
 763        struct file *file;
 764        int err;
 765        static const struct open_flags open_exec_flags = {
 766                .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
 767                .acc_mode = MAY_EXEC | MAY_OPEN,
 768                .intent = LOOKUP_OPEN
 769        };
 770
 771        file = do_filp_open(AT_FDCWD, name, &open_exec_flags, LOOKUP_FOLLOW);
 772        if (IS_ERR(file))
 773                goto out;
 774
 775        err = -EACCES;
 776        if (!S_ISREG(file->f_path.dentry->d_inode->i_mode))
 777                goto exit;
 778
 779        if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
 780                goto exit;
 781
 782        fsnotify_open(file);
 783
 784        err = deny_write_access(file);
 785        if (err)
 786                goto exit;
 787
 788out:
 789        return file;
 790
 791exit:
 792        fput(file);
 793        return ERR_PTR(err);
 794}
 795EXPORT_SYMBOL(open_exec);
 796
 797int kernel_read(struct file *file, loff_t offset,
 798                char *addr, unsigned long count)
 799{
 800        mm_segment_t old_fs;
 801        loff_t pos = offset;
 802        int result;
 803
 804        old_fs = get_fs();
 805        set_fs(get_ds());
 806        /* The cast to a user pointer is valid due to the set_fs() */
 807        result = vfs_read(file, (void __user *)addr, count, &pos);
 808        set_fs(old_fs);
 809        return result;
 810}
 811
 812EXPORT_SYMBOL(kernel_read);
 813
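     /*
      * Install the new mm as the task's mm and active_mm and release the old
      * one.  Fails with -EINTR if a core dump is in progress on the old mm,
      * so the exec dies instead of racing with the dump.
      */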
 814static int exec_mmap(struct mm_struct *mm)
 815{
 816        struct task_struct *tsk;
 817        struct mm_struct * old_mm, *active_mm;
 818
 819        /* Notify parent that we're no longer interested in the old VM */
 820        tsk = current;
 821        old_mm = current->mm;
 822        mm_release(tsk, old_mm);
 823
 824        if (old_mm) {
 825                sync_mm_rss(old_mm);
 826                /*
 827                 * Make sure that if there is a core dump in progress
 828                 * for the old mm, we get out and die instead of going
 829                 * through with the exec.  We must hold mmap_sem around
 830                 * checking core_state and changing tsk->mm.
 831                 */
 832                down_read(&old_mm->mmap_sem);
 833                if (unlikely(old_mm->core_state)) {
 834                        up_read(&old_mm->mmap_sem);
 835                        return -EINTR;
 836                }
 837        }
 838        task_lock(tsk);
 839        active_mm = tsk->active_mm;
 840        tsk->mm = mm;
 841        tsk->active_mm = mm;
 842        activate_mm(active_mm, mm);
 843        task_unlock(tsk);
 844        arch_pick_mmap_layout(mm);
 845        if (old_mm) {
 846                up_read(&old_mm->mmap_sem);
 847                BUG_ON(active_mm != old_mm);
 848                setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
 849                mm_update_next_owner(old_mm);
 850                mmput(old_mm);
 851                return 0;
 852        }
 853        mmdrop(active_mm);
 854        return 0;
 855}
 856
 857/*
 858 * This function makes sure the current process has its own signal table,
 859 * so that flush_signal_handlers can later reset the handlers without
 860 * disturbing other processes.  (Other processes might share the signal
 861 * table via the CLONE_SIGHAND option to clone().)
 862 */
 863static int de_thread(struct task_struct *tsk)
 864{
 865        struct signal_struct *sig = tsk->signal;
 866        struct sighand_struct *oldsighand = tsk->sighand;
 867        spinlock_t *lock = &oldsighand->siglock;
 868
 869        if (thread_group_empty(tsk))
 870                goto no_thread_group;
 871
 872        /*
 873         * Kill all other threads in the thread group.
 874         */
 875        spin_lock_irq(lock);
 876        if (signal_group_exit(sig)) {
 877                /*
 878                 * Another group action in progress, just
 879                 * return so that the signal is processed.
 880                 */
 881                spin_unlock_irq(lock);
 882                return -EAGAIN;
 883        }
 884
 885        sig->group_exit_task = tsk;
 886        sig->notify_count = zap_other_threads(tsk);
 887        if (!thread_group_leader(tsk))
 888                sig->notify_count--;
 889
 890        while (sig->notify_count) {
 891                __set_current_state(TASK_UNINTERRUPTIBLE);
 892                spin_unlock_irq(lock);
 893                schedule();
 894                spin_lock_irq(lock);
 895        }
 896        spin_unlock_irq(lock);
 897
 898        /*
  899         * At this point all other threads have exited; all we have to
 900         * do is to wait for the thread group leader to become inactive,
 901         * and to assume its PID:
 902         */
 903        if (!thread_group_leader(tsk)) {
 904                struct task_struct *leader = tsk->group_leader;
 905
 906                sig->notify_count = -1; /* for exit_notify() */
 907                for (;;) {
 908                        write_lock_irq(&tasklist_lock);
 909                        if (likely(leader->exit_state))
 910                                break;
 911                        __set_current_state(TASK_UNINTERRUPTIBLE);
 912                        write_unlock_irq(&tasklist_lock);
 913                        schedule();
 914                }
 915
 916                /*
 917                 * The only record we have of the real-time age of a
 918                 * process, regardless of execs it's done, is start_time.
 919                 * All the past CPU time is accumulated in signal_struct
 920                 * from sister threads now dead.  But in this non-leader
 921                 * exec, nothing survives from the original leader thread,
 922                 * whose birth marks the true age of this process now.
 923                 * When we take on its identity by switching to its PID, we
 924                 * also take its birthdate (always earlier than our own).
 925                 */
 926                tsk->start_time = leader->start_time;
 927
 928                BUG_ON(!same_thread_group(leader, tsk));
 929                BUG_ON(has_group_leader_pid(tsk));
 930                /*
 931                 * An exec() starts a new thread group with the
 932                 * TGID of the previous thread group. Rehash the
 933                 * two threads with a switched PID, and release
 934                 * the former thread group leader:
 935                 */
 936
 937                /* Become a process group leader with the old leader's pid.
  938                 * The old leader becomes a thread of this thread group.
 939                 * Note: The old leader also uses this pid until release_task
 940                 *       is called.  Odd but simple and correct.
 941                 */
 942                detach_pid(tsk, PIDTYPE_PID);
 943                tsk->pid = leader->pid;
 944                attach_pid(tsk, PIDTYPE_PID,  task_pid(leader));
 945                transfer_pid(leader, tsk, PIDTYPE_PGID);
 946                transfer_pid(leader, tsk, PIDTYPE_SID);
 947
 948                list_replace_rcu(&leader->tasks, &tsk->tasks);
 949                list_replace_init(&leader->sibling, &tsk->sibling);
 950
 951                tsk->group_leader = tsk;
 952                leader->group_leader = tsk;
 953
 954                tsk->exit_signal = SIGCHLD;
 955                leader->exit_signal = -1;
 956
 957                BUG_ON(leader->exit_state != EXIT_ZOMBIE);
 958                leader->exit_state = EXIT_DEAD;
 959
 960                /*
  961                 * We are going to release_task()->ptrace_unlink() silently;
  962                 * the tracer can sleep in do_wait(). EXIT_DEAD guarantees
  963                 * the tracer won't block again waiting for this thread.
 964                 */
 965                if (unlikely(leader->ptrace))
 966                        __wake_up_parent(leader, leader->parent);
 967                write_unlock_irq(&tasklist_lock);
 968
 969                release_task(leader);
 970        }
 971
 972        sig->group_exit_task = NULL;
 973        sig->notify_count = 0;
 974
 975no_thread_group:
 976        /* we have changed execution domain */
 977        tsk->exit_signal = SIGCHLD;
 978
 979        exit_itimers(sig);
 980        flush_itimer_signals();
 981
 982        if (atomic_read(&oldsighand->count) != 1) {
 983                struct sighand_struct *newsighand;
 984                /*
  985                 * This ->sighand is shared with tasks created with
  986                 * CLONE_SIGHAND but not CLONE_THREAD, so switch to a new one.
 987                 */
 988                newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
 989                if (!newsighand)
 990                        return -ENOMEM;
 991
 992                atomic_set(&newsighand->count, 1);
 993                memcpy(newsighand->action, oldsighand->action,
 994                       sizeof(newsighand->action));
 995
 996                write_lock_irq(&tasklist_lock);
 997                spin_lock(&oldsighand->siglock);
 998                rcu_assign_pointer(tsk->sighand, newsighand);
 999                spin_unlock(&oldsighand->siglock);
1000                write_unlock_irq(&tasklist_lock);
1001
1002                __cleanup_sighand(oldsighand);
1003        }
1004
1005        BUG_ON(!thread_group_leader(tsk));
1006        return 0;
1007}
1008
1009/*
 1010 * These functions flush out all traces of the currently running executable
 1011 * so that a new one can be started.
1012 */
1013static void flush_old_files(struct files_struct * files)
1014{
1015        long j = -1;
1016        struct fdtable *fdt;
1017
1018        spin_lock(&files->file_lock);
1019        for (;;) {
1020                unsigned long set, i;
1021
1022                j++;
1023                i = j * BITS_PER_LONG;
1024                fdt = files_fdtable(files);
1025                if (i >= fdt->max_fds)
1026                        break;
1027                set = fdt->close_on_exec[j];
1028                if (!set)
1029                        continue;
1030                fdt->close_on_exec[j] = 0;
1031                spin_unlock(&files->file_lock);
1032                for ( ; set ; i++,set >>= 1) {
1033                        if (set & 1) {
1034                                sys_close(i);
1035                        }
1036                }
1037                spin_lock(&files->file_lock);
1038
1039        }
1040        spin_unlock(&files->file_lock);
1041}
1042
1043char *get_task_comm(char *buf, struct task_struct *tsk)
1044{
1045        /* buf must be at least sizeof(tsk->comm) in size */
1046        task_lock(tsk);
1047        strncpy(buf, tsk->comm, sizeof(tsk->comm));
1048        task_unlock(tsk);
1049        return buf;
1050}
1051EXPORT_SYMBOL_GPL(get_task_comm);
1052
1053void set_task_comm(struct task_struct *tsk, char *buf)
1054{
1055        task_lock(tsk);
1056
1057        trace_task_rename(tsk, buf);
1058
1059        /*
1060         * Threads may access current->comm without holding
1061         * the task lock, so write the string carefully.
1062         * Readers without a lock may see incomplete new
1063         * names but are safe from non-terminating string reads.
1064         */
1065        memset(tsk->comm, 0, TASK_COMM_LEN);
1066        wmb();
1067        strlcpy(tsk->comm, buf, sizeof(tsk->comm));
1068        task_unlock(tsk);
1069        perf_event_comm(tsk);
1070}
1071
1072static void filename_to_taskname(char *tcomm, const char *fn, unsigned int len)
1073{
1074        int i, ch;
1075
 1076        /* Copies the binary name from after the last slash */
1077        for (i = 0; (ch = *(fn++)) != '\0';) {
1078                if (ch == '/')
1079                        i = 0; /* overwrite what we wrote */
1080                else
1081                        if (i < len - 1)
1082                                tcomm[i++] = ch;
1083        }
1084        tcomm[i] = '\0';
1085}
1086
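     /*
      * Tear down the old program: give the task a private signal table via
      * de_thread(), switch to the new mm with exec_mmap() and reset the
      * per-thread state that the new image must not inherit.
      */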
1087int flush_old_exec(struct linux_binprm * bprm)
1088{
1089        int retval;
1090
1091        /*
1092         * Make sure we have a private signal table and that
1093         * we are unassociated from the previous thread group.
1094         */
1095        retval = de_thread(current);
1096        if (retval)
1097                goto out;
1098
1099        set_mm_exe_file(bprm->mm, bprm->file);
1100
1101        filename_to_taskname(bprm->tcomm, bprm->filename, sizeof(bprm->tcomm));
1102        /*
1103         * Release all of the old mmap stuff
1104         */
1105        acct_arg_size(bprm, 0);
1106        retval = exec_mmap(bprm->mm);
1107        if (retval)
1108                goto out;
1109
1110        bprm->mm = NULL;                /* We're using it now */
1111
1112        set_fs(USER_DS);
1113        current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD);
1114        flush_thread();
1115        current->personality &= ~bprm->per_clear;
1116
1117        return 0;
1118
1119out:
1120        return retval;
1121}
1122EXPORT_SYMBOL(flush_old_exec);
1123
1124void would_dump(struct linux_binprm *bprm, struct file *file)
1125{
1126        if (inode_permission(file->f_path.dentry->d_inode, MAY_READ) < 0)
1127                bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
1128}
1129EXPORT_SYMBOL(would_dump);
1130
1131void setup_new_exec(struct linux_binprm * bprm)
1132{
1133        arch_pick_mmap_layout(current->mm);
1134
1135        /* This is the point of no return */
1136        current->sas_ss_sp = current->sas_ss_size = 0;
1137
1138        if (uid_eq(current_euid(), current_uid()) && gid_eq(current_egid(), current_gid()))
1139                set_dumpable(current->mm, 1);
1140        else
1141                set_dumpable(current->mm, suid_dumpable);
1142
1143        set_task_comm(current, bprm->tcomm);
1144
 1145        /* Set the new mm task size. We have to do this late because it may
 1146         * depend on TIF_32BIT, which is only updated in flush_thread() on
 1147         * some architectures like powerpc.
1148         */
1149        current->mm->task_size = TASK_SIZE;
1150
1151        /* install the new credentials */
1152        if (!uid_eq(bprm->cred->uid, current_euid()) ||
1153            !gid_eq(bprm->cred->gid, current_egid())) {
1154                current->pdeath_signal = 0;
1155        } else {
1156                would_dump(bprm, bprm->file);
1157                if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)
1158                        set_dumpable(current->mm, suid_dumpable);
1159        }
1160
1161        /*
1162         * Flush performance counters when crossing a
1163         * security domain:
1164         */
1165        if (!get_dumpable(current->mm))
1166                perf_event_exit_task(current);
1167
1168        /* An exec changes our domain. We are no longer part of the thread
1169           group */
1170
1171        current->self_exec_id++;
 1172
1173        flush_signal_handlers(current, 0);
1174        flush_old_files(current->files);
1175}
1176EXPORT_SYMBOL(setup_new_exec);
1177
1178/*
1179 * Prepare credentials and lock ->cred_guard_mutex.
1180 * install_exec_creds() commits the new creds and drops the lock.
 1181 * Or, if exec fails before that, free_bprm() should release ->cred
 1182 * and unlock.
1183 */
1184int prepare_bprm_creds(struct linux_binprm *bprm)
1185{
1186        if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
1187                return -ERESTARTNOINTR;
1188
1189        bprm->cred = prepare_exec_creds();
1190        if (likely(bprm->cred))
1191                return 0;
1192
1193        mutex_unlock(&current->signal->cred_guard_mutex);
1194        return -ENOMEM;
1195}
1196
1197void free_bprm(struct linux_binprm *bprm)
1198{
1199        free_arg_pages(bprm);
1200        if (bprm->cred) {
1201                mutex_unlock(&current->signal->cred_guard_mutex);
1202                abort_creds(bprm->cred);
1203        }
1204        kfree(bprm);
1205}
1206
1207/*
1208 * install the new credentials for this executable
1209 */
1210void install_exec_creds(struct linux_binprm *bprm)
1211{
1212        security_bprm_committing_creds(bprm);
1213
1214        commit_creds(bprm->cred);
1215        bprm->cred = NULL;
1216        /*
1217         * cred_guard_mutex must be held at least to this point to prevent
1218         * ptrace_attach() from altering our determination of the task's
1219         * credentials; any time after this it may be unlocked.
1220         */
1221        security_bprm_committed_creds(bprm);
1222        mutex_unlock(&current->signal->cred_guard_mutex);
1223}
1224EXPORT_SYMBOL(install_exec_creds);
1225
1226/*
1227 * determine how safe it is to execute the proposed program
1228 * - the caller must hold ->cred_guard_mutex to protect against
1229 *   PTRACE_ATTACH
1230 */
1231static int check_unsafe_exec(struct linux_binprm *bprm)
1232{
1233        struct task_struct *p = current, *t;
1234        unsigned n_fs;
1235        int res = 0;
1236
1237        if (p->ptrace) {
1238                if (p->ptrace & PT_PTRACE_CAP)
1239                        bprm->unsafe |= LSM_UNSAFE_PTRACE_CAP;
1240                else
1241                        bprm->unsafe |= LSM_UNSAFE_PTRACE;
1242        }
1243
1244        /*
1245         * This isn't strictly necessary, but it makes it harder for LSMs to
1246         * mess up.
1247         */
1248        if (current->no_new_privs)
1249                bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;
1250
1251        n_fs = 1;
1252        spin_lock(&p->fs->lock);
1253        rcu_read_lock();
1254        for (t = next_thread(p); t != p; t = next_thread(t)) {
1255                if (t->fs == p->fs)
1256                        n_fs++;
1257        }
1258        rcu_read_unlock();
1259
1260        if (p->fs->users > n_fs) {
1261                bprm->unsafe |= LSM_UNSAFE_SHARE;
1262        } else {
1263                res = -EAGAIN;
1264                if (!p->fs->in_exec) {
1265                        p->fs->in_exec = 1;
1266                        res = 1;
1267                }
1268        }
1269        spin_unlock(&p->fs->lock);
1270
1271        return res;
1272}
1273
1274/* 
1275 * Fill the binprm structure from the inode. 
1276 * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
1277 *
1278 * This may be called multiple times for binary chains (scripts for example).
1279 */
1280int prepare_binprm(struct linux_binprm *bprm)
1281{
1282        umode_t mode;
1283        struct inode * inode = bprm->file->f_path.dentry->d_inode;
1284        int retval;
1285
1286        mode = inode->i_mode;
1287        if (bprm->file->f_op == NULL)
1288                return -EACCES;
1289
 1290        /* clear any set[ug]id data from a previous binary */
1291        bprm->cred->euid = current_euid();
1292        bprm->cred->egid = current_egid();
1293
1294        if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) &&
1295            !current->no_new_privs) {
1296                /* Set-uid? */
1297                if (mode & S_ISUID) {
1298                        if (!kuid_has_mapping(bprm->cred->user_ns, inode->i_uid))
1299                                return -EPERM;
1300                        bprm->per_clear |= PER_CLEAR_ON_SETID;
1301                        bprm->cred->euid = inode->i_uid;
1302
1303                }
1304
1305                /* Set-gid? */
1306                /*
1307                 * If setgid is set but no group execute bit then this
1308                 * is a candidate for mandatory locking, not a setgid
1309                 * executable.
1310                 */
1311                if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
1312                        if (!kgid_has_mapping(bprm->cred->user_ns, inode->i_gid))
1313                                return -EPERM;
1314                        bprm->per_clear |= PER_CLEAR_ON_SETID;
1315                        bprm->cred->egid = inode->i_gid;
1316                }
1317        }
1318
1319        /* fill in binprm security blob */
1320        retval = security_bprm_set_creds(bprm);
1321        if (retval)
1322                return retval;
1323        bprm->cred_prepared = 1;
1324
1325        memset(bprm->buf, 0, BINPRM_BUF_SIZE);
1326        return kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE);
1327}
1328
1329EXPORT_SYMBOL(prepare_binprm);
1330
1331/*
1332 * Arguments are '\0' separated strings found at the location bprm->p
 1333 * points to; chop off the first by relocating bprm->p to right after
1334 * the first '\0' encountered.
1335 */
1336int remove_arg_zero(struct linux_binprm *bprm)
1337{
1338        int ret = 0;
1339        unsigned long offset;
1340        char *kaddr;
1341        struct page *page;
1342
1343        if (!bprm->argc)
1344                return 0;
1345
1346        do {
1347                offset = bprm->p & ~PAGE_MASK;
1348                page = get_arg_page(bprm, bprm->p, 0);
1349                if (!page) {
1350                        ret = -EFAULT;
1351                        goto out;
1352                }
1353                kaddr = kmap_atomic(page);
1354
1355                for (; offset < PAGE_SIZE && kaddr[offset];
1356                                offset++, bprm->p++)
1357                        ;
1358
1359                kunmap_atomic(kaddr);
1360                put_arg_page(page);
1361
1362                if (offset == PAGE_SIZE)
1363                        free_arg_page(bprm, (bprm->p >> PAGE_SHIFT) - 1);
1364        } while (offset == PAGE_SIZE);
1365
1366        bprm->p++;
1367        bprm->argc--;
1368        ret = 0;
1369
1370out:
1371        return ret;
1372}
1373EXPORT_SYMBOL(remove_arg_zero);
1374
1375/*
 1376 * cycle through the list of binary format handlers until one recognizes the image
1377 */
 1378int search_binary_handler(struct linux_binprm *bprm, struct pt_regs *regs)
1379{
1380        unsigned int depth = bprm->recursion_depth;
 1381        int try, retval;
1382        struct linux_binfmt *fmt;
1383        pid_t old_pid, old_vpid;
1384
1385        retval = security_bprm_check(bprm);
1386        if (retval)
1387                return retval;
1388
1389        retval = audit_bprm(bprm);
1390        if (retval)
1391                return retval;
1392
1393        /* Need to fetch pid before load_binary changes it */
1394        old_pid = current->pid;
1395        rcu_read_lock();
1396        old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
1397        rcu_read_unlock();
1398
1399        retval = -ENOENT;
 1400        for (try = 0; try < 2; try++) {
1401                read_lock(&binfmt_lock);
1402                list_for_each_entry(fmt, &formats, lh) {
1403                        int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary;
1404                        if (!fn)
1405                                continue;
1406                        if (!try_module_get(fmt->module))
1407                                continue;
1408                        read_unlock(&binfmt_lock);
1409                        retval = fn(bprm, regs);
1410                        /*
1411                         * Restore the depth counter to its starting value
1412                         * in this call, so we don't have to rely on every
1413                         * load_binary function to restore it on return.
1414                         */
1415                        bprm->recursion_depth = depth;
1416                        if (retval >= 0) {
1417                                if (depth == 0) {
1418                                        trace_sched_process_exec(current, old_pid, bprm);
1419                                        ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
1420                                }
1421                                put_binfmt(fmt);
1422                                allow_write_access(bprm->file);
1423                                if (bprm->file)
1424                                        fput(bprm->file);
1425                                bprm->file = NULL;
1426                                current->did_exec = 1;
1427                                proc_exec_connector(current);
1428                                return retval;
1429                        }
1430                        read_lock(&binfmt_lock);
1431                        put_binfmt(fmt);
1432                        if (retval != -ENOEXEC || bprm->mm == NULL)
1433                                break;
1434                        if (!bprm->file) {
1435                                read_unlock(&binfmt_lock);
1436                                return retval;
1437                        }
1438                }
1439                read_unlock(&binfmt_lock);
1440#ifdef CONFIG_MODULES
1441                if (retval != -ENOEXEC || bprm->mm == NULL) {
1442                        break;
1443                } else {
1444#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
1445                        if (printable(bprm->buf[0]) &&
1446                            printable(bprm->buf[1]) &&
1447                            printable(bprm->buf[2]) &&
1448                            printable(bprm->buf[3]))
1449                                break; /* -ENOEXEC */
1450                        if (try)
1451                                break; /* -ENOEXEC */
1452                        request_module("binfmt-%04x", *(unsigned short *)(&bprm->buf[2]));
1453                }
1454#else
1455                break;
1456#endif
1457        }
1458        return retval;
1459}
1460
1461EXPORT_SYMBOL(search_binary_handler);
1462
1463/*
1464 * sys_execve() executes a new program.
1465 */
1466static int do_execve_common(const char *filename,
1467                                struct user_arg_ptr argv,
1468                                struct user_arg_ptr envp,
1469                                struct pt_regs *regs)
1470{
1471        struct linux_binprm *bprm;
1472        struct file *file;
1473        struct files_struct *displaced;
1474        bool clear_in_exec;
1475        int retval;
1476        const struct cred *cred = current_cred();
1477
1478        /*
1479         * We move the actual failure in case of RLIMIT_NPROC excess from
1480         * set*uid() to execve() because too many poorly written programs
1481         * don't check setuid() return code.  Here we additionally recheck
1482         * whether NPROC limit is still exceeded.
1483         */
1484        if ((current->flags & PF_NPROC_EXCEEDED) &&
1485            atomic_read(&cred->user->processes) > rlimit(RLIMIT_NPROC)) {
1486                retval = -EAGAIN;
1487                goto out_ret;
1488        }
1489
1490        /* We're below the limit (still or again), so we don't want to make
1491         * further execve() calls fail. */
1492        current->flags &= ~PF_NPROC_EXCEEDED;
1493
1494        retval = unshare_files(&displaced);
1495        if (retval)
1496                goto out_ret;
1497
1498        retval = -ENOMEM;
1499        bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
1500        if (!bprm)
1501                goto out_files;
1502
1503        retval = prepare_bprm_creds(bprm);
1504        if (retval)
1505                goto out_free;
1506
1507        retval = check_unsafe_exec(bprm);
1508        if (retval < 0)
1509                goto out_free;
1510        clear_in_exec = retval;
1511        current->in_execve = 1;
1512
1513        file = open_exec(filename);
1514        retval = PTR_ERR(file);
1515        if (IS_ERR(file))
1516                goto out_unmark;
1517
1518        sched_exec();
1519
1520        bprm->file = file;
1521        bprm->filename = filename;
1522        bprm->interp = filename;
1523
1524        retval = bprm_mm_init(bprm);
1525        if (retval)
1526                goto out_file;
1527
1528        bprm->argc = count(argv, MAX_ARG_STRINGS);
1529        if ((retval = bprm->argc) < 0)
1530                goto out;
1531
1532        bprm->envc = count(envp, MAX_ARG_STRINGS);
1533        if ((retval = bprm->envc) < 0)
1534                goto out;
1535
1536        retval = prepare_binprm(bprm);
1537        if (retval < 0)
1538                goto out;
1539
1540        retval = copy_strings_kernel(1, &bprm->filename, bprm);
1541        if (retval < 0)
1542                goto out;
1543
1544        bprm->exec = bprm->p;
1545        retval = copy_strings(bprm->envc, envp, bprm);
1546        if (retval < 0)
1547                goto out;
1548
1549        retval = copy_strings(bprm->argc, argv, bprm);
1550        if (retval < 0)
1551                goto out;
1552
 1553        retval = search_binary_handler(bprm, regs);
1554        if (retval < 0)
1555                goto out;
1556
1557        /* execve succeeded */
1558        current->fs->in_exec = 0;
1559        current->in_execve = 0;
1560        acct_update_integrals(current);
1561        free_bprm(bprm);
1562        if (displaced)
1563                put_files_struct(displaced);
1564        return retval;
1565
1566out:
1567        if (bprm->mm) {
1568                acct_arg_size(bprm, 0);
1569                mmput(bprm->mm);
1570        }
1571
1572out_file:
1573        if (bprm->file) {
1574                allow_write_access(bprm->file);
1575                fput(bprm->file);
1576        }
1577
1578out_unmark:
1579        if (clear_in_exec)
1580                current->fs->in_exec = 0;
1581        current->in_execve = 0;
1582
1583out_free:
1584        free_bprm(bprm);
1585
1586out_files:
1587        if (displaced)
1588                reset_files_struct(displaced);
1589out_ret:
1590        return retval;
1591}
1592
1593int do_execve(const char *filename,
1594        const char __user *const __user *__argv,
1595        const char __user *const __user *__envp,
1596        struct pt_regs *regs)
1597{
1598        struct user_arg_ptr argv = { .ptr.native = __argv };
1599        struct user_arg_ptr envp = { .ptr.native = __envp };
1600        return do_execve_common(filename, argv, envp, regs);
1601}
1602
1603#ifdef CONFIG_COMPAT
1604int compat_do_execve(char *filename,
1605        compat_uptr_t __user *__argv,
1606        compat_uptr_t __user *__envp,
1607        struct pt_regs *regs)
1608{
1609        struct user_arg_ptr argv = {
1610                .is_compat = true,
1611                .ptr.compat = __argv,
1612        };
1613        struct user_arg_ptr envp = {
1614                .is_compat = true,
1615                .ptr.compat = __envp,
1616        };
1617        return do_execve_common(filename, argv, envp, regs);
1618}
1619#endif
1620
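     /*
      * Record @new as the binary format handler that owns current->mm,
      * taking a module reference on it and dropping the reference held on
      * the previous handler.
      */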
1621void set_binfmt(struct linux_binfmt *new)
1622{
1623        struct mm_struct *mm = current->mm;
1624
1625        if (mm->binfmt)
1626                module_put(mm->binfmt->module);
1627
1628        mm->binfmt = new;
1629        if (new)
1630                __module_get(new->module);
1631}
1632
1633EXPORT_SYMBOL(set_binfmt);
1634
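     /*
      * Grow the corename buffer; on allocation failure the old buffer is
      * freed and -ENOMEM returned.
      */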
1635static int expand_corename(struct core_name *cn)
1636{
1637        char *old_corename = cn->corename;
1638
1639        cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count);
1640        cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL);
1641
1642        if (!cn->corename) {
1643                kfree(old_corename);
1644                return -ENOMEM;
1645        }
1646
1647        return 0;
1648}
1649
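    /*
     * printf-style append to cn->corename; the buffer is expanded first
     * if the formatted output would not fit.
     */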
1650static int cn_printf(struct core_name *cn, const char *fmt, ...)
1651{
1652        char *cur;
1653        int need;
1654        int ret;
1655        va_list arg;
1656
1657        va_start(arg, fmt);
1658        need = vsnprintf(NULL, 0, fmt, arg);
1659        va_end(arg);
1660
1661        if (likely(need < cn->size - cn->used - 1))
1662                goto out_printf;
1663
1664        ret = expand_corename(cn);
1665        if (ret)
1666                goto expand_fail;
1667
1668out_printf:
1669        cur = cn->corename + cn->used;
1670        va_start(arg, fmt);
1671        vsnprintf(cur, need + 1, fmt, arg);
1672        va_end(arg);
1673        cn->used += need;
1674        return 0;
1675
1676expand_fail:
1677        return ret;
1678}
1679
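    /* replace '/' with '!' so an expanded value cannot add path components */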
1680static void cn_escape(char *str)
1681{
1682        for (; *str; str++)
1683                if (*str == '/')
1684                        *str = '!';
1685}
1686
1687static int cn_print_exe_file(struct core_name *cn)
1688{
1689        struct file *exe_file;
1690        char *pathbuf, *path;
1691        int ret;
1692
1693        exe_file = get_mm_exe_file(current->mm);
1694        if (!exe_file) {
1695                char *commstart = cn->corename + cn->used;
1696                ret = cn_printf(cn, "%s (path unknown)", current->comm);
1697                cn_escape(commstart);
1698                return ret;
1699        }
1700
1701        pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
1702        if (!pathbuf) {
1703                ret = -ENOMEM;
1704                goto put_exe_file;
1705        }
1706
1707        path = d_path(&exe_file->f_path, pathbuf, PATH_MAX);
1708        if (IS_ERR(path)) {
1709                ret = PTR_ERR(path);
1710                goto free_buf;
1711        }
1712
1713        cn_escape(path);
1714
1715        ret = cn_printf(cn, "%s", path);
1716
1717free_buf:
1718        kfree(pathbuf);
1719put_exe_file:
1720        fput(exe_file);
1721        return ret;
1722}
1723
1724/* format_corename expands the core_pattern template into cn->corename,
1725 * allocating and growing the buffer as needed.  Returns ispipe (non-zero
1726 * when the pattern pipes to a user-mode helper) or a negative error code.
1727 */
1728static int format_corename(struct core_name *cn, long signr)
1729{
1730        const struct cred *cred = current_cred();
1731        const char *pat_ptr = core_pattern;
1732        int ispipe = (*pat_ptr == '|');
1733        int pid_in_pattern = 0;
1734        int err = 0;
1735
1736        cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count);
1737        cn->corename = kmalloc(cn->size, GFP_KERNEL);
1738        cn->used = 0;
1739
1740        if (!cn->corename)
1741                return -ENOMEM;
1742
1743        /* Repeat as long as we have more pattern to process and more output
1744           space */
1745        while (*pat_ptr) {
1746                if (*pat_ptr != '%') {
1747                        if (*pat_ptr == 0)
1748                                goto out;
1749                        err = cn_printf(cn, "%c", *pat_ptr++);
1750                } else {
1751                        switch (*++pat_ptr) {
1752                        /* single % at the end, drop that */
1753                        case 0:
1754                                goto out;
1755                        /* Double percent, output one percent */
1756                        case '%':
1757                                err = cn_printf(cn, "%c", '%');
1758                                break;
1759                        /* pid */
1760                        case 'p':
1761                                pid_in_pattern = 1;
1762                                err = cn_printf(cn, "%d",
1763                                              task_tgid_vnr(current));
1764                                break;
1765                        /* uid */
1766                        case 'u':
1767                                err = cn_printf(cn, "%d", cred->uid);
1768                                break;
1769                        /* gid */
1770                        case 'g':
1771                                err = cn_printf(cn, "%d", cred->gid);
1772                                break;
1773                        /* signal that caused the coredump */
1774                        case 's':
1775                                err = cn_printf(cn, "%ld", signr);
1776                                break;
1777                        /* UNIX time of coredump */
1778                        case 't': {
1779                                struct timeval tv;
1780                                do_gettimeofday(&tv);
1781                                err = cn_printf(cn, "%lu", tv.tv_sec);
1782                                break;
1783                        }
1784                        /* hostname */
1785                        case 'h': {
1786                                char *namestart = cn->corename + cn->used;
1787                                down_read(&uts_sem);
1788                                err = cn_printf(cn, "%s",
1789                                              utsname()->nodename);
1790                                up_read(&uts_sem);
1791                                cn_escape(namestart);
1792                                break;
1793                        }
1794                        /* executable */
1795                        case 'e': {
1796                                char *commstart = cn->corename + cn->used;
1797                                err = cn_printf(cn, "%s", current->comm);
1798                                cn_escape(commstart);
1799                                break;
1800                        }
1801                        case 'E':
1802                                err = cn_print_exe_file(cn);
1803                                break;
1804                        /* core limit size */
1805                        case 'c':
1806                                err = cn_printf(cn, "%lu",
1807                                              rlimit(RLIMIT_CORE));
1808                                break;
1809                        default:
1810                                break;
1811                        }
1812                        ++pat_ptr;
1813                }
1814
1815                if (err)
1816                        return err;
1817        }
1818
1819        /* Backward compatibility with core_uses_pid:
1820         *
1821         * If core_pattern does not include a %p (as is the default)
1822         * and core_uses_pid is set, then .%pid will be appended to
1823         * the filename. Do not do this for piped commands. */
1824        if (!ispipe && !pid_in_pattern && core_uses_pid) {
1825                err = cn_printf(cn, ".%d", task_tgid_vnr(current));
1826                if (err)
1827                        return err;
1828        }
1829out:
1830        return ispipe;
1831}
1832
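    /*
     * Mark start's thread group as exiting and queue SIGKILL for every
     * thread in it (other than current) that still has an mm; returns
     * the number of threads signalled.
     */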
1833static int zap_process(struct task_struct *start, int exit_code)
1834{
1835        struct task_struct *t;
1836        int nr = 0;
1837
1838        start->signal->flags = SIGNAL_GROUP_EXIT;
1839        start->signal->group_exit_code = exit_code;
1840        start->signal->group_stop_count = 0;
1841
1842        t = start;
1843        do {
1844                task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
1845                if (t != current && t->mm) {
1846                        sigaddset(&t->pending.signal, SIGKILL);
1847                        signal_wake_up(t, 1);
1848                        nr++;
1849                }
1850        } while_each_thread(start, t);
1851
1852        return nr;
1853}
1854
1855static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
1856                                struct core_state *core_state, int exit_code)
1857{
1858        struct task_struct *g, *p;
1859        unsigned long flags;
1860        int nr = -EAGAIN;
1861
1862        spin_lock_irq(&tsk->sighand->siglock);
1863        if (!signal_group_exit(tsk->signal)) {
1864                mm->core_state = core_state;
1865                nr = zap_process(tsk, exit_code);
1866        }
1867        spin_unlock_irq(&tsk->sighand->siglock);
1868        if (unlikely(nr < 0))
1869                return nr;
1870
1871        if (atomic_read(&mm->mm_users) == nr + 1)
1872                goto done;
1873        /*
1874         * We should find and kill all tasks which use this mm, and we should
1875         * count them correctly into ->nr_threads. We don't take tasklist
1876         * lock, but this is safe wrt:
1877         *
1878         * fork:
1879         *      None of the sub-threads can fork after zap_process(leader). All
1880         *      processes which were created before this point should be
1881         *      visible to zap_threads() because copy_process() adds the new
1882         *      process to the tail of init_task.tasks list, and lock/unlock
1883         *      of ->siglock provides a memory barrier.
1884         *
1885         * do_exit:
1886         *      The caller holds mm->mmap_sem. This means that the task which
1887         *      uses this mm can't pass exit_mm(), so it can't exit or clear
1888         *      its ->mm.
1889         *
1890         * de_thread:
1891         *      It does list_replace_rcu(&leader->tasks, &current->tasks);
1892         *      we will see either the old or the new leader, and either is fine.
1893         *      However, it can change p->sighand, so lock_task_sighand(p)
1894         *      must be used. Since p->mm != NULL and we hold ->mmap_sem
1895         *      it can't fail.
1896         *
1897         *      Note also that "g" can be the old leader with ->mm == NULL
1898         *      and already unhashed and thus removed from ->thread_group.
1899         *      This is OK, __unhash_process()->list_del_rcu() does not
1900         *      clear the ->next pointer, we will find the new leader via
1901         *      next_thread().
1902         */
1903        rcu_read_lock();
1904        for_each_process(g) {
1905                if (g == tsk->group_leader)
1906                        continue;
1907                if (g->flags & PF_KTHREAD)
1908                        continue;
1909                p = g;
1910                do {
1911                        if (p->mm) {
1912                                if (unlikely(p->mm == mm)) {
1913                                        lock_task_sighand(p, &flags);
1914                                        nr += zap_process(p, exit_code);
1915                                        unlock_task_sighand(p, &flags);
1916                                }
1917                                break;
1918                        }
1919                } while_each_thread(g, p);
1920        }
1921        rcu_read_unlock();
1922done:
1923        atomic_set(&core_state->nr_threads, nr);
1924        return nr;
1925}
1926
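    /*
     * Kill the other users of this mm and wait until they have all
     * entered the coredump wait path, so their state is stable while the
     * dump is written.  Returns the number of waiting threads, or a
     * negative error if a dump or group exit is already in progress.
     */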
1927static int coredump_wait(int exit_code, struct core_state *core_state)
1928{
1929        struct task_struct *tsk = current;
1930        struct mm_struct *mm = tsk->mm;
1931        int core_waiters = -EBUSY;
1932
1933        init_completion(&core_state->startup);
1934        core_state->dumper.task = tsk;
1935        core_state->dumper.next = NULL;
1936
1937        down_write(&mm->mmap_sem);
1938        if (!mm->core_state)
1939                core_waiters = zap_threads(tsk, mm, core_state, exit_code);
1940        up_write(&mm->mmap_sem);
1941
1942        if (core_waiters > 0) {
1943                struct core_thread *ptr;
1944
1945                wait_for_completion(&core_state->startup);
1946                /*
1947                 * Wait for all the threads to become inactive, so that
1948                 * all the thread context (extended register state, like
1949                 * fpu etc) gets copied to the memory.
1950                 * fpu etc) gets copied to memory.
1951                ptr = core_state->dumper.next;
1952                while (ptr != NULL) {
1953                        wait_task_inactive(ptr->task, 0);
1954                        ptr = ptr->next;
1955                }
1956        }
1957
1958        return core_waiters;
1959}
1960
1961static void coredump_finish(struct mm_struct *mm)
1962{
1963        struct core_thread *curr, *next;
1964        struct task_struct *task;
1965
1966        next = mm->core_state->dumper.next;
1967        while ((curr = next) != NULL) {
1968                next = curr->next;
1969                task = curr->task;
1970                /*
1971                 * see exit_mm(): the task behind curr->task must not
1972                 * observe ->task == NULL before we have read ->next.
1973                 */
1974                smp_mb();
1975                curr->task = NULL;
1976                wake_up_process(task);
1977        }
1978
1979        mm->core_state = NULL;
1980}
1981
1982/*
1983 * set_dumpable converts traditional three-value dumpable to two flags and
1984 * stores them into mm->flags.  It modifies lower two bits of mm->flags, but
1985 * these bits are not changed atomically.  So get_dumpable can observe an
1986 * intermediate state.  To avoid unexpected behaviour, get_dumpable returns
1987 * either the old or the new dumpable value; this is guaranteed by the order
1988 * in which the bits are modified.
1989 *
1990 * dumpable |   mm->flags (binary)
1991 * old  new | initial interim  final
1992 * ---------+-----------------------
1993 *  0    1  |   00      01      01
1994 *  0    2  |   00      10(*)   11
1995 *  1    0  |   01      00      00
1996 *  1    2  |   01      11      11
1997 *  2    0  |   11      10(*)   00
1998 *  2    1  |   11      11      01
1999 *
2000 * (*) get_dumpable regards interim value of 10 as 11.
2001 */
2002void set_dumpable(struct mm_struct *mm, int value)
2003{
2004        switch (value) {
2005        case SUID_DUMPABLE_DISABLED:
2006                clear_bit(MMF_DUMPABLE, &mm->flags);
2007                smp_wmb();
2008                clear_bit(MMF_DUMP_SECURELY, &mm->flags);
2009                break;
2010        case SUID_DUMPABLE_ENABLED:
2011                set_bit(MMF_DUMPABLE, &mm->flags);
2012                smp_wmb();
2013                clear_bit(MMF_DUMP_SECURELY, &mm->flags);
2014                break;
2015        case SUID_DUMPABLE_SAFE:
2016                set_bit(MMF_DUMP_SECURELY, &mm->flags);
2017                smp_wmb();
2018                set_bit(MMF_DUMPABLE, &mm->flags);
2019                break;
2020        }
2021}
2022
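    /*
     * Fold the two MMF_DUMPABLE bits back into the three-valued setting;
     * both 10 and 11 (see the table above) read back as SUID_DUMPABLE_SAFE.
     */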
2023static int __get_dumpable(unsigned long mm_flags)
2024{
2025        int ret;
2026
2027        ret = mm_flags & MMF_DUMPABLE_MASK;
2028        return (ret > SUID_DUMPABLE_ENABLED) ? SUID_DUMPABLE_SAFE : ret;
2029}
2030
2031int get_dumpable(struct mm_struct *mm)
2032{
2033        return __get_dumpable(mm->flags);
2034}
2035
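    /*
     * Pose as an extra reader on the core pipe and wait until the
     * user-space dump helper (the real reader) has finished with it.
     */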
2036static void wait_for_dump_helpers(struct file *file)
2037{
2038        struct pipe_inode_info *pipe;
2039
2040        pipe = file->f_path.dentry->d_inode->i_pipe;
2041
2042        pipe_lock(pipe);
2043        pipe->readers++;
2044        pipe->writers--;
2045
2046        while ((pipe->readers > 1) && (!signal_pending(current))) {
2047                wake_up_interruptible_sync(&pipe->wait);
2048                kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
2049                pipe_wait(pipe);
2050        }
2051
2052        pipe->readers--;
2053        pipe->writers++;
2054        pipe_unlock(pipe);
2055
2056}
2057
2058
2059/*
2060 * umh_pipe_setup
2061 * helper function to customize the process used
2062 * to collect the core in userspace.  Specifically
2063 * it sets up a pipe and installs it as fd 0 (stdin)
2064 * for the process.  Returns 0 on success, or
2065 * PTR_ERR on failure.
2066 * Note that it also sets the core limit to 1.  This
2067 * is a special value that we use to trap recursive
2068 * core dumps
2069 */
2070static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
2071{
2072        struct file *files[2];
2073        struct fdtable *fdt;
2074        struct coredump_params *cp = (struct coredump_params *)info->data;
2075        struct files_struct *cf = current->files;
2076        int err = create_pipe_files(files, 0);
2077        if (err)
2078                return err;
2079
2080        cp->file = files[1];
2081
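            /* install the read end of the pipe as the helper's stdin (fd 0) */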
2082        sys_close(0);
2083        fd_install(0, files[0]);
2084        spin_lock(&cf->file_lock);
2085        fdt = files_fdtable(cf);
2086        __set_open_fd(0, fdt);
2087        __clear_close_on_exec(0, fdt);
2088        spin_unlock(&cf->file_lock);
2089
2090        /* and disallow core files too */
2091        current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
2092
2093        return 0;
2094}
2095
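    /*
     * Main core-dump entry point, called from the signal delivery path
     * when a process takes a fatal, core-dumping signal.
     */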
2096void do_coredump(long signr, int exit_code, struct pt_regs *regs)
2097{
2098        struct core_state core_state;
2099        struct core_name cn;
2100        struct mm_struct *mm = current->mm;
2101        struct linux_binfmt * binfmt;
2102        const struct cred *old_cred;
2103        struct cred *cred;
2104        int retval = 0;
2105        int flag = 0;
2106        int ispipe;
2107        bool need_nonrelative = false;
2108        static atomic_t core_dump_count = ATOMIC_INIT(0);
2109        struct coredump_params cprm = {
2110                .signr = signr,
2111                .regs = regs,
2112                .limit = rlimit(RLIMIT_CORE),
2113                /*
2114                 * We must use the same mm->flags while dumping core to avoid
2115                 * inconsistency of bit flags, since this flag is not protected
2116                 * by any locks.
2117                 */
2118                .mm_flags = mm->flags,
2119        };
2120
2121        audit_core_dumps(signr);
2122
2123        binfmt = mm->binfmt;
2124        if (!binfmt || !binfmt->core_dump)
2125                goto fail;
2126        if (!__get_dumpable(cprm.mm_flags))
2127                goto fail;
2128
2129        cred = prepare_creds();
2130        if (!cred)
2131                goto fail;
2132        /*
2133         * We cannot trust fsuid as being the "true" uid of the process
2134         * nor do we know its entire history. We only know it was tainted
2135         * so we dump it as root in mode 2, and only into a controlled
2136         * environment (pipe handler or fully qualified path).
2137         */
2138        if (__get_dumpable(cprm.mm_flags) == SUID_DUMPABLE_SAFE) {
2139                /* Setuid core dump mode */
2140                flag = O_EXCL;          /* Stop rewrite attacks */
2141                cred->fsuid = GLOBAL_ROOT_UID;  /* Dump root private */
2142                need_nonrelative = true;
2143        }
2144
2145        retval = coredump_wait(exit_code, &core_state);
2146        if (retval < 0)
2147                goto fail_creds;
2148
2149        old_cred = override_creds(cred);
2150
2151        /*
2152         * Clear any false indication of pending signals that might
2153         * be seen by the filesystem code called to write the core file.
2154         */
2155        clear_thread_flag(TIF_SIGPENDING);
2156
2157        ispipe = format_corename(&cn, signr);
2158
2159        if (ispipe) {
2160                int dump_count;
2161                char **helper_argv;
2162
2163                if (ispipe < 0) {
2164                        printk(KERN_WARNING "format_corename failed\n");
2165                        printk(KERN_WARNING "Aborting core\n");
2166                        goto fail_corename;
2167                }
2168
2169                if (cprm.limit == 1) {
2170                        /* See umh_pipe_setup() which sets RLIMIT_CORE = 1.
2171                         *
2172                         * Normally core limits are irrelevant to pipes, since
2173                         * we're not writing to the file system, but we use
2174                         * cprm.limit of 1 here as a special value; it is a
2175                         * consistent way to catch recursive crashes.
2176                         * We can still crash if the core_pattern binary sets
2177                         * RLIMIT_CORE != 1, but it runs as root, and can do
2178                         * lots of stupid things.
2179                         *
2180                         * Note that we use task_tgid_vnr here to grab the pid
2181                         * of the thread group leader.  That way we get the
2182                         * right pid if a thread in a multi-threaded
2183                         * core_pattern process dies.
2184                         */
2185                        printk(KERN_WARNING
2186                                "Process %d(%s) has RLIMIT_CORE set to 1\n",
2187                                task_tgid_vnr(current), current->comm);
2188                        printk(KERN_WARNING "Aborting core\n");
2189                        goto fail_unlock;
2190                }
2191                cprm.limit = RLIM_INFINITY;
2192
2193                dump_count = atomic_inc_return(&core_dump_count);
2194                if (core_pipe_limit && (core_pipe_limit < dump_count)) {
2195                        printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n",
2196                               task_tgid_vnr(current), current->comm);
2197                        printk(KERN_WARNING "Skipping core dump\n");
2198                        goto fail_dropcount;
2199                }
2200
2201                helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL);
2202                if (!helper_argv) {
2203                        printk(KERN_WARNING "%s failed to allocate memory\n",
2204                               __func__);
2205                        goto fail_dropcount;
2206                }
2207
2208                retval = call_usermodehelper_fns(helper_argv[0], helper_argv,
2209                                        NULL, UMH_WAIT_EXEC, umh_pipe_setup,
2210                                        NULL, &cprm);
2211                argv_free(helper_argv);
2212                if (retval) {
2213                        printk(KERN_INFO "Core dump to %s pipe failed\n",
2214                               cn.corename);
2215                        goto close_fail;
2216                }
2217        } else {
2218                struct inode *inode;
2219
2220                if (cprm.limit < binfmt->min_coredump)
2221                        goto fail_unlock;
2222
2223                if (need_nonrelative && cn.corename[0] != '/') {
2224                        printk(KERN_WARNING "Pid %d(%s) can only dump core "\
2225                                "to fully qualified path!\n",
2226                                task_tgid_vnr(current), current->comm);
2227                        printk(KERN_WARNING "Skipping core dump\n");
2228                        goto fail_unlock;
2229                }
2230
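                    /* note: the bare 2 in the flags below is the numeric value of O_RDWR */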
2231                cprm.file = filp_open(cn.corename,
2232                                 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
2233                                 0600);
2234                if (IS_ERR(cprm.file))
2235                        goto fail_unlock;
2236
2237                inode = cprm.file->f_path.dentry->d_inode;
2238                if (inode->i_nlink > 1)
2239                        goto close_fail;
2240                if (d_unhashed(cprm.file->f_path.dentry))
2241                        goto close_fail;
2242                /*
2243                 * AK: actually I see no reason not to allow this for named
2244                 * pipes etc, but keep the previous behaviour for now.
2245                 */
2246                if (!S_ISREG(inode->i_mode))
2247                        goto close_fail;
2248                /*
2249                 * Don't allow local users to get cute and trick others into
2250                 * dumping core into their pre-created files.
2251                 */
2252                if (!uid_eq(inode->i_uid, current_fsuid()))
2253                        goto close_fail;
2254                if (!cprm.file->f_op || !cprm.file->f_op->write)
2255                        goto close_fail;
2256                if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
2257                        goto close_fail;
2258        }
2259
2260        retval = binfmt->core_dump(&cprm);
2261        if (retval)
2262                current->signal->group_exit_code |= 0x80;
2263
2264        if (ispipe && core_pipe_limit)
2265                wait_for_dump_helpers(cprm.file);
2266close_fail:
2267        if (cprm.file)
2268                filp_close(cprm.file, NULL);
2269fail_dropcount:
2270        if (ispipe)
2271                atomic_dec(&core_dump_count);
2272fail_unlock:
2273        kfree(cn.corename);
2274fail_corename:
2275        coredump_finish(mm);
2276        revert_creds(old_cred);
2277fail_creds:
2278        put_cred(cred);
2279fail:
2280        return;
2281}
2282
2283/*
2284 * Core dumping helper functions.  These are the only things you should
2285 * do on a core-file: use only these functions to write out all the
2286 * necessary info.
2287 */
2288int dump_write(struct file *file, const void *addr, int nr)
2289{
2290        return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr;
2291}
2292EXPORT_SYMBOL(dump_write);
2293
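    /*
     * Advance the core file position by 'off' bytes: lseek when the file
     * supports it, otherwise pad with zero-filled pages (e.g. when the
     * core is written to a pipe).  Returns 1 on success, 0 on failure.
     */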
2294int dump_seek(struct file *file, loff_t off)
2295{
2296        int ret = 1;
2297
2298        if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
2299                if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
2300                        return 0;
2301        } else {
2302                char *buf = (char *)get_zeroed_page(GFP_KERNEL);
2303
2304                if (!buf)
2305                        return 0;
2306                while (off > 0) {
2307                        unsigned long n = off;
2308
2309                        if (n > PAGE_SIZE)
2310                                n = PAGE_SIZE;
2311                        if (!dump_write(file, buf, n)) {
2312                                ret = 0;
2313                                break;
2314                        }
2315                        off -= n;
2316                }
2317                free_page((unsigned long)buf);
2318        }
2319        return ret;
2320}
2321EXPORT_SYMBOL(dump_seek);
2322