linux/fs/exec.c
   1/*
   2 *  linux/fs/exec.c
   3 *
   4 *  Copyright (C) 1991, 1992  Linus Torvalds
   5 */
   6
   7/*
   8 * #!-checking implemented by tytso.
   9 */
  10/*
  11 * Demand-loading implemented 01.12.91 - no need to read anything but
  12 * the header into memory. The inode of the executable is put into
  13 * "current->executable", and page faults do the actual loading. Clean.
  14 *
  15 * Once more I can proudly say that linux stood up to being changed: it
  16 * was less than 2 hours work to get demand-loading completely implemented.
  17 *
   18 * Demand loading changed July 1993 by Eric Youngdale.  Use mmap instead;
   19 * current->executable is only used by the procfs.  This allows a dispatch
   20 * table to check for several different types of binary formats.  We keep
   21 * trying until we recognize the file or we run out of supported binary
   22 * formats.
  23 */
  24
  25#include <linux/slab.h>
  26#include <linux/file.h>
  27#include <linux/fdtable.h>
  28#include <linux/mm.h>
  29#include <linux/stat.h>
  30#include <linux/fcntl.h>
  31#include <linux/swap.h>
  32#include <linux/string.h>
  33#include <linux/init.h>
  34#include <linux/pagemap.h>
  35#include <linux/perf_event.h>
  36#include <linux/highmem.h>
  37#include <linux/spinlock.h>
  38#include <linux/key.h>
  39#include <linux/personality.h>
  40#include <linux/binfmts.h>
  41#include <linux/utsname.h>
  42#include <linux/pid_namespace.h>
  43#include <linux/module.h>
  44#include <linux/namei.h>
  45#include <linux/mount.h>
  46#include <linux/security.h>
  47#include <linux/syscalls.h>
  48#include <linux/tsacct_kern.h>
  49#include <linux/cn_proc.h>
  50#include <linux/audit.h>
  51#include <linux/tracehook.h>
  52#include <linux/kmod.h>
  53#include <linux/fsnotify.h>
  54#include <linux/fs_struct.h>
  55#include <linux/pipe_fs_i.h>
  56#include <linux/oom.h>
  57#include <linux/compat.h>
  58
  59#include <asm/uaccess.h>
  60#include <asm/mmu_context.h>
  61#include <asm/tlb.h>
  62#include <asm/exec.h>
  63
  64#include <trace/events/task.h>
  65#include "internal.h"
  66
  67#include <trace/events/sched.h>
  68
  69int core_uses_pid;
  70char core_pattern[CORENAME_MAX_SIZE] = "core";
  71unsigned int core_pipe_limit;
  72int suid_dumpable = 0;
  73
  74struct core_name {
  75        char *corename;
  76        int used, size;
  77};
  78static atomic_t call_count = ATOMIC_INIT(1);
  79
  80/* The maximal length of core_pattern is also specified in sysctl.c */
  81
  82static LIST_HEAD(formats);
  83static DEFINE_RWLOCK(binfmt_lock);
  84
  85void __register_binfmt(struct linux_binfmt * fmt, int insert)
  86{
  87        BUG_ON(!fmt);
  88        write_lock(&binfmt_lock);
  89        insert ? list_add(&fmt->lh, &formats) :
  90                 list_add_tail(&fmt->lh, &formats);
  91        write_unlock(&binfmt_lock);
  92}
  93
  94EXPORT_SYMBOL(__register_binfmt);
  95
  96void unregister_binfmt(struct linux_binfmt * fmt)
  97{
  98        write_lock(&binfmt_lock);
  99        list_del(&fmt->lh);
 100        write_unlock(&binfmt_lock);
 101}
 102
 103EXPORT_SYMBOL(unregister_binfmt);
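
/*
 * Illustrative sketch, not part of this file: a binary-format handler
 * normally registers itself from its init routine through the
 * register_binfmt()/insert_binfmt() wrappers in <linux/binfmts.h>, which
 * call __register_binfmt() above.  All names below are made up.
 *
 *	static int example_load_binary(struct linux_binprm *bprm,
 *				       struct pt_regs *regs);
 *
 *	static struct linux_binfmt example_format = {
 *		.module		= THIS_MODULE,
 *		.load_binary	= example_load_binary,
 *	};
 *
 *	static int __init example_init(void)
 *	{
 *		register_binfmt(&example_format);
 *		return 0;
 *	}
 *
 *	static void __exit example_exit(void)
 *	{
 *		unregister_binfmt(&example_format);
 *	}
 */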
 104
 105static inline void put_binfmt(struct linux_binfmt * fmt)
 106{
 107        module_put(fmt->module);
 108}
 109
 110/*
  111 * Note that a shared library must be both readable and executable for
  112 * security reasons.
  113 *
  114 * Also note that we take the load address from the file itself.
 115 */
 116SYSCALL_DEFINE1(uselib, const char __user *, library)
 117{
 118        struct file *file;
 119        char *tmp = getname(library);
 120        int error = PTR_ERR(tmp);
 121        static const struct open_flags uselib_flags = {
 122                .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
 123                .acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN,
 124                .intent = LOOKUP_OPEN
 125        };
 126
 127        if (IS_ERR(tmp))
 128                goto out;
 129
 130        file = do_filp_open(AT_FDCWD, tmp, &uselib_flags, LOOKUP_FOLLOW);
 131        putname(tmp);
 132        error = PTR_ERR(file);
 133        if (IS_ERR(file))
 134                goto out;
 135
 136        error = -EINVAL;
 137        if (!S_ISREG(file->f_path.dentry->d_inode->i_mode))
 138                goto exit;
 139
 140        error = -EACCES;
 141        if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
 142                goto exit;
 143
 144        fsnotify_open(file);
 145
 146        error = -ENOEXEC;
  147        if (file->f_op) {
 148                struct linux_binfmt * fmt;
 149
 150                read_lock(&binfmt_lock);
 151                list_for_each_entry(fmt, &formats, lh) {
 152                        if (!fmt->load_shlib)
 153                                continue;
 154                        if (!try_module_get(fmt->module))
 155                                continue;
 156                        read_unlock(&binfmt_lock);
 157                        error = fmt->load_shlib(file);
 158                        read_lock(&binfmt_lock);
 159                        put_binfmt(fmt);
 160                        if (error != -ENOEXEC)
 161                                break;
 162                }
 163                read_unlock(&binfmt_lock);
 164        }
 165exit:
 166        fput(file);
 167out:
 168        return error;
 169}
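
/*
 * Illustrative sketch, not part of this file: from userspace the (rarely
 * used) syscall above is reached roughly like this, assuming the
 * architecture still defines __NR_uselib; the library path is made up and
 * must be a format whose handler implements load_shlib:
 *
 *	#include <unistd.h>
 *	#include <sys/syscall.h>
 *
 *	if (syscall(__NR_uselib, "/lib/libold.so") < 0)
 *		perror("uselib");
 */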
 170
 171#ifdef CONFIG_MMU
 172/*
  173 * The nascent bprm->mm is not visible until exec_mmap(), but it can
  174 * use a lot of memory, so account these pages in current->mm temporarily
  175 * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we
 176 * change the counter back via acct_arg_size(0).
 177 */
 178static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
 179{
 180        struct mm_struct *mm = current->mm;
 181        long diff = (long)(pages - bprm->vma_pages);
 182
 183        if (!mm || !diff)
 184                return;
 185
 186        bprm->vma_pages = pages;
 187        add_mm_counter(mm, MM_ANONPAGES, diff);
 188}
 189
 190static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 191                int write)
 192{
 193        struct page *page;
 194        int ret;
 195
 196#ifdef CONFIG_STACK_GROWSUP
 197        if (write) {
 198                ret = expand_downwards(bprm->vma, pos);
 199                if (ret < 0)
 200                        return NULL;
 201        }
 202#endif
 203        ret = get_user_pages(current, bprm->mm, pos,
 204                        1, write, 1, &page, NULL);
 205        if (ret <= 0)
 206                return NULL;
 207
 208        if (write) {
 209                unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
 210                struct rlimit *rlim;
 211
 212                acct_arg_size(bprm, size / PAGE_SIZE);
 213
 214                /*
 215                 * We've historically supported up to 32 pages (ARG_MAX)
 216                 * of argument strings even with small stacks
 217                 */
 218                if (size <= ARG_MAX)
 219                        return page;
 220
 221                /*
 222                 * Limit to 1/4-th the stack size for the argv+env strings.
 223                 * This ensures that:
 224                 *  - the remaining binfmt code will not run out of stack space,
 225                 *  - the program will have a reasonable amount of stack left
 226                 *    to work from.
 227                 */
 228                rlim = current->signal->rlim;
 229                if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur) / 4) {
 230                        put_page(page);
 231                        return NULL;
 232                }
 233        }
 234
 235        return page;
 236}
 237
 238static void put_arg_page(struct page *page)
 239{
 240        put_page(page);
 241}
 242
 243static void free_arg_page(struct linux_binprm *bprm, int i)
 244{
 245}
 246
 247static void free_arg_pages(struct linux_binprm *bprm)
 248{
 249}
 250
 251static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
 252                struct page *page)
 253{
 254        flush_cache_page(bprm->vma, pos, page_to_pfn(page));
 255}
 256
 257static int __bprm_mm_init(struct linux_binprm *bprm)
 258{
 259        int err;
 260        struct vm_area_struct *vma = NULL;
 261        struct mm_struct *mm = bprm->mm;
 262
 263        bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
 264        if (!vma)
 265                return -ENOMEM;
 266
 267        down_write(&mm->mmap_sem);
 268        vma->vm_mm = mm;
 269
 270        /*
 271         * Place the stack at the largest stack address the architecture
 272         * supports. Later, we'll move this to an appropriate place. We don't
 273         * use STACK_TOP because that can depend on attributes which aren't
 274         * configured yet.
 275         */
 276        BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
 277        vma->vm_end = STACK_TOP_MAX;
 278        vma->vm_start = vma->vm_end - PAGE_SIZE;
 279        vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
 280        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
 281        INIT_LIST_HEAD(&vma->anon_vma_chain);
 282
 283        err = insert_vm_struct(mm, vma);
 284        if (err)
 285                goto err;
 286
 287        mm->stack_vm = mm->total_vm = 1;
 288        up_write(&mm->mmap_sem);
 289        bprm->p = vma->vm_end - sizeof(void *);
 290        return 0;
 291err:
 292        up_write(&mm->mmap_sem);
 293        bprm->vma = NULL;
 294        kmem_cache_free(vm_area_cachep, vma);
 295        return err;
 296}
 297
 298static bool valid_arg_len(struct linux_binprm *bprm, long len)
 299{
 300        return len <= MAX_ARG_STRLEN;
 301}
 302
 303#else
 304
 305static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
 306{
 307}
 308
 309static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 310                int write)
 311{
 312        struct page *page;
 313
 314        page = bprm->page[pos / PAGE_SIZE];
 315        if (!page && write) {
 316                page = alloc_page(GFP_HIGHUSER|__GFP_ZERO);
 317                if (!page)
 318                        return NULL;
 319                bprm->page[pos / PAGE_SIZE] = page;
 320        }
 321
 322        return page;
 323}
 324
 325static void put_arg_page(struct page *page)
 326{
 327}
 328
 329static void free_arg_page(struct linux_binprm *bprm, int i)
 330{
 331        if (bprm->page[i]) {
 332                __free_page(bprm->page[i]);
 333                bprm->page[i] = NULL;
 334        }
 335}
 336
 337static void free_arg_pages(struct linux_binprm *bprm)
 338{
 339        int i;
 340
 341        for (i = 0; i < MAX_ARG_PAGES; i++)
 342                free_arg_page(bprm, i);
 343}
 344
 345static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
 346                struct page *page)
 347{
 348}
 349
 350static int __bprm_mm_init(struct linux_binprm *bprm)
 351{
 352        bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
 353        return 0;
 354}
 355
 356static bool valid_arg_len(struct linux_binprm *bprm, long len)
 357{
 358        return len <= bprm->p;
 359}
 360
 361#endif /* CONFIG_MMU */
 362
 363/*
 364 * Create a new mm_struct and populate it with a temporary stack
 365 * vm_area_struct.  We don't have enough context at this point to set the stack
 366 * flags, permissions, and offset, so we use temporary values.  We'll update
 367 * them later in setup_arg_pages().
 368 */
 369int bprm_mm_init(struct linux_binprm *bprm)
 370{
 371        int err;
 372        struct mm_struct *mm = NULL;
 373
 374        bprm->mm = mm = mm_alloc();
 375        err = -ENOMEM;
 376        if (!mm)
 377                goto err;
 378
 379        err = init_new_context(current, mm);
 380        if (err)
 381                goto err;
 382
 383        err = __bprm_mm_init(bprm);
 384        if (err)
 385                goto err;
 386
 387        return 0;
 388
 389err:
 390        if (mm) {
 391                bprm->mm = NULL;
 392                mmdrop(mm);
 393        }
 394
 395        return err;
 396}
 397
 398struct user_arg_ptr {
 399#ifdef CONFIG_COMPAT
 400        bool is_compat;
 401#endif
 402        union {
 403                const char __user *const __user *native;
 404#ifdef CONFIG_COMPAT
 405                compat_uptr_t __user *compat;
 406#endif
 407        } ptr;
 408};
 409
 410static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr)
 411{
 412        const char __user *native;
 413
 414#ifdef CONFIG_COMPAT
 415        if (unlikely(argv.is_compat)) {
 416                compat_uptr_t compat;
 417
 418                if (get_user(compat, argv.ptr.compat + nr))
 419                        return ERR_PTR(-EFAULT);
 420
 421                return compat_ptr(compat);
 422        }
 423#endif
 424
 425        if (get_user(native, argv.ptr.native + nr))
 426                return ERR_PTR(-EFAULT);
 427
 428        return native;
 429}
 430
 431/*
 432 * count() counts the number of strings in array ARGV.
 433 */
 434static int count(struct user_arg_ptr argv, int max)
 435{
 436        int i = 0;
 437
 438        if (argv.ptr.native != NULL) {
 439                for (;;) {
 440                        const char __user *p = get_user_arg_ptr(argv, i);
 441
 442                        if (!p)
 443                                break;
 444
 445                        if (IS_ERR(p))
 446                                return -EFAULT;
 447
 448                        if (i++ >= max)
 449                                return -E2BIG;
 450
 451                        if (fatal_signal_pending(current))
 452                                return -ERESTARTNOHAND;
 453                        cond_resched();
 454                }
 455        }
 456        return i;
 457}
 458
 459/*
 460 * 'copy_strings()' copies argument/environment strings from the old
  461 * process's memory to the new process's stack.  The call to get_user_pages()
 462 * ensures the destination page is created and not swapped out.
 463 */
 464static int copy_strings(int argc, struct user_arg_ptr argv,
 465                        struct linux_binprm *bprm)
 466{
 467        struct page *kmapped_page = NULL;
 468        char *kaddr = NULL;
 469        unsigned long kpos = 0;
 470        int ret;
 471
 472        while (argc-- > 0) {
 473                const char __user *str;
 474                int len;
 475                unsigned long pos;
 476
 477                ret = -EFAULT;
 478                str = get_user_arg_ptr(argv, argc);
 479                if (IS_ERR(str))
 480                        goto out;
 481
 482                len = strnlen_user(str, MAX_ARG_STRLEN);
 483                if (!len)
 484                        goto out;
 485
 486                ret = -E2BIG;
 487                if (!valid_arg_len(bprm, len))
 488                        goto out;
 489
  490                /* We're going to work our way backwards. */
 491                pos = bprm->p;
 492                str += len;
 493                bprm->p -= len;
 494
 495                while (len > 0) {
 496                        int offset, bytes_to_copy;
 497
 498                        if (fatal_signal_pending(current)) {
 499                                ret = -ERESTARTNOHAND;
 500                                goto out;
 501                        }
 502                        cond_resched();
 503
 504                        offset = pos % PAGE_SIZE;
 505                        if (offset == 0)
 506                                offset = PAGE_SIZE;
 507
 508                        bytes_to_copy = offset;
 509                        if (bytes_to_copy > len)
 510                                bytes_to_copy = len;
 511
 512                        offset -= bytes_to_copy;
 513                        pos -= bytes_to_copy;
 514                        str -= bytes_to_copy;
 515                        len -= bytes_to_copy;
 516
 517                        if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
 518                                struct page *page;
 519
 520                                page = get_arg_page(bprm, pos, 1);
 521                                if (!page) {
 522                                        ret = -E2BIG;
 523                                        goto out;
 524                                }
 525
 526                                if (kmapped_page) {
 527                                        flush_kernel_dcache_page(kmapped_page);
 528                                        kunmap(kmapped_page);
 529                                        put_arg_page(kmapped_page);
 530                                }
 531                                kmapped_page = page;
 532                                kaddr = kmap(kmapped_page);
 533                                kpos = pos & PAGE_MASK;
 534                                flush_arg_page(bprm, kpos, kmapped_page);
 535                        }
 536                        if (copy_from_user(kaddr+offset, str, bytes_to_copy)) {
 537                                ret = -EFAULT;
 538                                goto out;
 539                        }
 540                }
 541        }
 542        ret = 0;
 543out:
 544        if (kmapped_page) {
 545                flush_kernel_dcache_page(kmapped_page);
 546                kunmap(kmapped_page);
 547                put_arg_page(kmapped_page);
 548        }
 549        return ret;
 550}
 551
 552/*
 553 * Like copy_strings, but get argv and its values from kernel memory.
 554 */
 555int copy_strings_kernel(int argc, const char *const *__argv,
 556                        struct linux_binprm *bprm)
 557{
 558        int r;
 559        mm_segment_t oldfs = get_fs();
 560        struct user_arg_ptr argv = {
 561                .ptr.native = (const char __user *const  __user *)__argv,
 562        };
 563
 564        set_fs(KERNEL_DS);
 565        r = copy_strings(argc, argv, bprm);
 566        set_fs(oldfs);
 567
 568        return r;
 569}
 570EXPORT_SYMBOL(copy_strings_kernel);
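
/*
 * Illustrative sketch, not part of this file: callers use
 * copy_strings_kernel() to push strings that live in kernel memory onto
 * the new stack, e.g. do_execve_common() below pushes the filename:
 *
 *	retval = copy_strings_kernel(1, &bprm->filename, bprm);
 *	if (retval < 0)
 *		goto out;
 */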
 571
 572#ifdef CONFIG_MMU
 573
 574/*
 575 * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX.  Once
 576 * the binfmt code determines where the new stack should reside, we shift it to
 577 * its final location.  The process proceeds as follows:
 578 *
 579 * 1) Use shift to calculate the new vma endpoints.
 580 * 2) Extend vma to cover both the old and new ranges.  This ensures the
 581 *    arguments passed to subsequent functions are consistent.
 582 * 3) Move vma's page tables to the new range.
 583 * 4) Free up any cleared pgd range.
 584 * 5) Shrink the vma to cover only the new range.
 585 */
 586static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
 587{
 588        struct mm_struct *mm = vma->vm_mm;
 589        unsigned long old_start = vma->vm_start;
 590        unsigned long old_end = vma->vm_end;
 591        unsigned long length = old_end - old_start;
 592        unsigned long new_start = old_start - shift;
 593        unsigned long new_end = old_end - shift;
 594        struct mmu_gather tlb;
 595
 596        BUG_ON(new_start > new_end);
 597
 598        /*
 599         * ensure there are no vmas between where we want to go
 600         * and where we are
 601         */
 602        if (vma != find_vma(mm, new_start))
 603                return -EFAULT;
 604
 605        /*
 606         * cover the whole range: [new_start, old_end)
 607         */
 608        if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL))
 609                return -ENOMEM;
 610
 611        /*
 612         * move the page tables downwards, on failure we rely on
 613         * process cleanup to remove whatever mess we made.
 614         */
 615        if (length != move_page_tables(vma, old_start,
 616                                       vma, new_start, length))
 617                return -ENOMEM;
 618
 619        lru_add_drain();
 620        tlb_gather_mmu(&tlb, mm, 0);
 621        if (new_end > old_start) {
 622                /*
 623                 * when the old and new regions overlap clear from new_end.
 624                 */
 625                free_pgd_range(&tlb, new_end, old_end, new_end,
 626                        vma->vm_next ? vma->vm_next->vm_start : 0);
 627        } else {
 628                /*
  629                 * otherwise, clean from old_start; this is done to avoid touching
  630                 * the address space in [new_end, old_start). Some architectures
  631                 * have constraints on va-space that make this illegal (IA64);
  632                 * for the others it's just a little faster.
 633                 */
 634                free_pgd_range(&tlb, old_start, old_end, new_end,
 635                        vma->vm_next ? vma->vm_next->vm_start : 0);
 636        }
 637        tlb_finish_mmu(&tlb, new_end, old_end);
 638
 639        /*
 640         * Shrink the vma to just the new range.  Always succeeds.
 641         */
 642        vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL);
 643
 644        return 0;
 645}
 646
 647/*
 648 * Finalizes the stack vm_area_struct. The flags and permissions are updated,
 649 * the stack is optionally relocated, and some extra space is added.
 650 */
 651int setup_arg_pages(struct linux_binprm *bprm,
 652                    unsigned long stack_top,
 653                    int executable_stack)
 654{
 655        unsigned long ret;
 656        unsigned long stack_shift;
 657        struct mm_struct *mm = current->mm;
 658        struct vm_area_struct *vma = bprm->vma;
 659        struct vm_area_struct *prev = NULL;
 660        unsigned long vm_flags;
 661        unsigned long stack_base;
 662        unsigned long stack_size;
 663        unsigned long stack_expand;
 664        unsigned long rlim_stack;
 665
 666#ifdef CONFIG_STACK_GROWSUP
 667        /* Limit stack size to 1GB */
 668        stack_base = rlimit_max(RLIMIT_STACK);
 669        if (stack_base > (1 << 30))
 670                stack_base = 1 << 30;
 671
 672        /* Make sure we didn't let the argument array grow too large. */
 673        if (vma->vm_end - vma->vm_start > stack_base)
 674                return -ENOMEM;
 675
 676        stack_base = PAGE_ALIGN(stack_top - stack_base);
 677
 678        stack_shift = vma->vm_start - stack_base;
 679        mm->arg_start = bprm->p - stack_shift;
 680        bprm->p = vma->vm_end - stack_shift;
 681#else
 682        stack_top = arch_align_stack(stack_top);
 683        stack_top = PAGE_ALIGN(stack_top);
 684
 685        if (unlikely(stack_top < mmap_min_addr) ||
 686            unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr))
 687                return -ENOMEM;
 688
 689        stack_shift = vma->vm_end - stack_top;
 690
 691        bprm->p -= stack_shift;
 692        mm->arg_start = bprm->p;
 693#endif
 694
 695        if (bprm->loader)
 696                bprm->loader -= stack_shift;
 697        bprm->exec -= stack_shift;
 698
 699        down_write(&mm->mmap_sem);
 700        vm_flags = VM_STACK_FLAGS;
 701
 702        /*
 703         * Adjust stack execute permissions; explicitly enable for
 704         * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone
 705         * (arch default) otherwise.
 706         */
 707        if (unlikely(executable_stack == EXSTACK_ENABLE_X))
 708                vm_flags |= VM_EXEC;
 709        else if (executable_stack == EXSTACK_DISABLE_X)
 710                vm_flags &= ~VM_EXEC;
 711        vm_flags |= mm->def_flags;
 712        vm_flags |= VM_STACK_INCOMPLETE_SETUP;
 713
 714        ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
 715                        vm_flags);
 716        if (ret)
 717                goto out_unlock;
 718        BUG_ON(prev != vma);
 719
 720        /* Move stack pages down in memory. */
 721        if (stack_shift) {
 722                ret = shift_arg_pages(vma, stack_shift);
 723                if (ret)
 724                        goto out_unlock;
 725        }
 726
 727        /* mprotect_fixup is overkill to remove the temporary stack flags */
 728        vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP;
 729
 730        stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
 731        stack_size = vma->vm_end - vma->vm_start;
 732        /*
 733         * Align this down to a page boundary as expand_stack
 734         * will align it up.
 735         */
 736        rlim_stack = rlimit(RLIMIT_STACK) & PAGE_MASK;
 737#ifdef CONFIG_STACK_GROWSUP
 738        if (stack_size + stack_expand > rlim_stack)
 739                stack_base = vma->vm_start + rlim_stack;
 740        else
 741                stack_base = vma->vm_end + stack_expand;
 742#else
 743        if (stack_size + stack_expand > rlim_stack)
 744                stack_base = vma->vm_end - rlim_stack;
 745        else
 746                stack_base = vma->vm_start - stack_expand;
 747#endif
 748        current->mm->start_stack = bprm->p;
 749        ret = expand_stack(vma, stack_base);
 750        if (ret)
 751                ret = -EFAULT;
 752
 753out_unlock:
 754        up_write(&mm->mmap_sem);
 755        return ret;
 756}
 757EXPORT_SYMBOL(setup_arg_pages);
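
/*
 * Illustrative sketch, not part of this file: a load_binary handler
 * finalizes the argument stack once it knows where the stack should live.
 * binfmt_elf passes a randomized stack top rather than the plain STACK_TOP
 * used here:
 *
 *	retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
 *	if (retval < 0)
 *		return retval;
 */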
 758
 759#endif /* CONFIG_MMU */
 760
 761struct file *open_exec(const char *name)
 762{
 763        struct file *file;
 764        int err;
 765        static const struct open_flags open_exec_flags = {
 766                .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
 767                .acc_mode = MAY_EXEC | MAY_OPEN,
 768                .intent = LOOKUP_OPEN
 769        };
 770
 771        file = do_filp_open(AT_FDCWD, name, &open_exec_flags, LOOKUP_FOLLOW);
 772        if (IS_ERR(file))
 773                goto out;
 774
 775        err = -EACCES;
 776        if (!S_ISREG(file->f_path.dentry->d_inode->i_mode))
 777                goto exit;
 778
 779        if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
 780                goto exit;
 781
 782        fsnotify_open(file);
 783
 784        err = deny_write_access(file);
 785        if (err)
 786                goto exit;
 787
 788out:
 789        return file;
 790
 791exit:
 792        fput(file);
 793        return ERR_PTR(err);
 794}
 795EXPORT_SYMBOL(open_exec);
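
/*
 * Illustrative sketch, not part of this file: binfmt handlers use
 * open_exec() to open helper binaries such as an ELF interpreter (the
 * path below is only an example):
 *
 *	struct file *interpreter = open_exec("/lib/ld-linux.so.2");
 *	if (IS_ERR(interpreter))
 *		return PTR_ERR(interpreter);
 */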
 796
 797int kernel_read(struct file *file, loff_t offset,
 798                char *addr, unsigned long count)
 799{
 800        mm_segment_t old_fs;
 801        loff_t pos = offset;
 802        int result;
 803
 804        old_fs = get_fs();
 805        set_fs(get_ds());
 806        /* The cast to a user pointer is valid due to the set_fs() */
 807        result = vfs_read(file, (void __user *)addr, count, &pos);
 808        set_fs(old_fs);
 809        return result;
 810}
 811
 812EXPORT_SYMBOL(kernel_read);
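
/*
 * Illustrative sketch, not part of this file: handlers use kernel_read()
 * to pull in headers beyond the BINPRM_BUF_SIZE bytes already held in
 * bprm->buf; "file" and "offset" here stand for the caller's own values:
 *
 *	char hdr[128];
 *	int n = kernel_read(file, offset, hdr, sizeof(hdr));
 *	if (n != sizeof(hdr))
 *		return n < 0 ? n : -EIO;
 */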
 813
 814static int exec_mmap(struct mm_struct *mm)
 815{
 816        struct task_struct *tsk;
 817        struct mm_struct * old_mm, *active_mm;
 818
 819        /* Notify parent that we're no longer interested in the old VM */
 820        tsk = current;
 821        old_mm = current->mm;
 822        mm_release(tsk, old_mm);
 823
 824        if (old_mm) {
 825                sync_mm_rss(old_mm);
 826                /*
 827                 * Make sure that if there is a core dump in progress
 828                 * for the old mm, we get out and die instead of going
 829                 * through with the exec.  We must hold mmap_sem around
 830                 * checking core_state and changing tsk->mm.
 831                 */
 832                down_read(&old_mm->mmap_sem);
 833                if (unlikely(old_mm->core_state)) {
 834                        up_read(&old_mm->mmap_sem);
 835                        return -EINTR;
 836                }
 837        }
 838        task_lock(tsk);
 839        active_mm = tsk->active_mm;
 840        tsk->mm = mm;
 841        tsk->active_mm = mm;
 842        activate_mm(active_mm, mm);
 843        task_unlock(tsk);
 844        arch_pick_mmap_layout(mm);
 845        if (old_mm) {
 846                up_read(&old_mm->mmap_sem);
 847                BUG_ON(active_mm != old_mm);
 848                setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
 849                mm_update_next_owner(old_mm);
 850                mmput(old_mm);
 851                return 0;
 852        }
 853        mmdrop(active_mm);
 854        return 0;
 855}
 856
 857/*
 858 * This function makes sure the current process has its own signal table,
 859 * so that flush_signal_handlers can later reset the handlers without
 860 * disturbing other processes.  (Other processes might share the signal
 861 * table via the CLONE_SIGHAND option to clone().)
 862 */
 863static int de_thread(struct task_struct *tsk)
 864{
 865        struct signal_struct *sig = tsk->signal;
 866        struct sighand_struct *oldsighand = tsk->sighand;
 867        spinlock_t *lock = &oldsighand->siglock;
 868
 869        if (thread_group_empty(tsk))
 870                goto no_thread_group;
 871
 872        /*
 873         * Kill all other threads in the thread group.
 874         */
 875        spin_lock_irq(lock);
 876        if (signal_group_exit(sig)) {
 877                /*
 878                 * Another group action in progress, just
 879                 * return so that the signal is processed.
 880                 */
 881                spin_unlock_irq(lock);
 882                return -EAGAIN;
 883        }
 884
 885        sig->group_exit_task = tsk;
 886        sig->notify_count = zap_other_threads(tsk);
 887        if (!thread_group_leader(tsk))
 888                sig->notify_count--;
 889
 890        while (sig->notify_count) {
 891                __set_current_state(TASK_UNINTERRUPTIBLE);
 892                spin_unlock_irq(lock);
 893                schedule();
 894                spin_lock_irq(lock);
 895        }
 896        spin_unlock_irq(lock);
 897
 898        /*
 899         * At this point all other threads have exited, all we have to
 900         * do is to wait for the thread group leader to become inactive,
 901         * and to assume its PID:
 902         */
 903        if (!thread_group_leader(tsk)) {
 904                struct task_struct *leader = tsk->group_leader;
 905
 906                sig->notify_count = -1; /* for exit_notify() */
 907                for (;;) {
 908                        write_lock_irq(&tasklist_lock);
 909                        if (likely(leader->exit_state))
 910                                break;
 911                        __set_current_state(TASK_UNINTERRUPTIBLE);
 912                        write_unlock_irq(&tasklist_lock);
 913                        schedule();
 914                }
 915
 916                /*
 917                 * The only record we have of the real-time age of a
 918                 * process, regardless of execs it's done, is start_time.
 919                 * All the past CPU time is accumulated in signal_struct
 920                 * from sister threads now dead.  But in this non-leader
 921                 * exec, nothing survives from the original leader thread,
 922                 * whose birth marks the true age of this process now.
 923                 * When we take on its identity by switching to its PID, we
 924                 * also take its birthdate (always earlier than our own).
 925                 */
 926                tsk->start_time = leader->start_time;
 927
 928                BUG_ON(!same_thread_group(leader, tsk));
 929                BUG_ON(has_group_leader_pid(tsk));
 930                /*
 931                 * An exec() starts a new thread group with the
 932                 * TGID of the previous thread group. Rehash the
 933                 * two threads with a switched PID, and release
 934                 * the former thread group leader:
 935                 */
 936
 937                /* Become a process group leader with the old leader's pid.
  938                 * The old leader becomes a thread of this thread group.
 939                 * Note: The old leader also uses this pid until release_task
 940                 *       is called.  Odd but simple and correct.
 941                 */
 942                detach_pid(tsk, PIDTYPE_PID);
 943                tsk->pid = leader->pid;
 944                attach_pid(tsk, PIDTYPE_PID,  task_pid(leader));
 945                transfer_pid(leader, tsk, PIDTYPE_PGID);
 946                transfer_pid(leader, tsk, PIDTYPE_SID);
 947
 948                list_replace_rcu(&leader->tasks, &tsk->tasks);
 949                list_replace_init(&leader->sibling, &tsk->sibling);
 950
 951                tsk->group_leader = tsk;
 952                leader->group_leader = tsk;
 953
 954                tsk->exit_signal = SIGCHLD;
 955                leader->exit_signal = -1;
 956
 957                BUG_ON(leader->exit_state != EXIT_ZOMBIE);
 958                leader->exit_state = EXIT_DEAD;
 959
 960                /*
 961                 * We are going to release_task()->ptrace_unlink() silently,
 962                 * the tracer can sleep in do_wait(). EXIT_DEAD guarantees
  963                 * the tracer won't block again waiting for this thread.
 964                 */
 965                if (unlikely(leader->ptrace))
 966                        __wake_up_parent(leader, leader->parent);
 967                write_unlock_irq(&tasklist_lock);
 968
 969                release_task(leader);
 970        }
 971
 972        sig->group_exit_task = NULL;
 973        sig->notify_count = 0;
 974
 975no_thread_group:
 976        /* we have changed execution domain */
 977        tsk->exit_signal = SIGCHLD;
 978
 979        exit_itimers(sig);
 980        flush_itimer_signals();
 981
 982        if (atomic_read(&oldsighand->count) != 1) {
 983                struct sighand_struct *newsighand;
 984                /*
 985                 * This ->sighand is shared with the CLONE_SIGHAND
 986                 * but not CLONE_THREAD task, switch to the new one.
 987                 */
 988                newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
 989                if (!newsighand)
 990                        return -ENOMEM;
 991
 992                atomic_set(&newsighand->count, 1);
 993                memcpy(newsighand->action, oldsighand->action,
 994                       sizeof(newsighand->action));
 995
 996                write_lock_irq(&tasklist_lock);
 997                spin_lock(&oldsighand->siglock);
 998                rcu_assign_pointer(tsk->sighand, newsighand);
 999                spin_unlock(&oldsighand->siglock);
1000                write_unlock_irq(&tasklist_lock);
1001
1002                __cleanup_sighand(oldsighand);
1003        }
1004
1005        BUG_ON(!thread_group_leader(tsk));
1006        return 0;
1007}
1008
1009/*
 1010 * These functions flush out all traces of the currently running executable
1011 * so that a new one can be started
1012 */
1013static void flush_old_files(struct files_struct * files)
1014{
1015        long j = -1;
1016        struct fdtable *fdt;
1017
1018        spin_lock(&files->file_lock);
1019        for (;;) {
1020                unsigned long set, i;
1021
1022                j++;
1023                i = j * BITS_PER_LONG;
1024                fdt = files_fdtable(files);
1025                if (i >= fdt->max_fds)
1026                        break;
1027                set = fdt->close_on_exec[j];
1028                if (!set)
1029                        continue;
1030                fdt->close_on_exec[j] = 0;
1031                spin_unlock(&files->file_lock);
1032                for ( ; set ; i++,set >>= 1) {
1033                        if (set & 1) {
1034                                sys_close(i);
1035                        }
1036                }
1037                spin_lock(&files->file_lock);
1038
1039        }
1040        spin_unlock(&files->file_lock);
1041}
1042
1043char *get_task_comm(char *buf, struct task_struct *tsk)
1044{
1045        /* buf must be at least sizeof(tsk->comm) in size */
1046        task_lock(tsk);
1047        strncpy(buf, tsk->comm, sizeof(tsk->comm));
1048        task_unlock(tsk);
1049        return buf;
1050}
1051EXPORT_SYMBOL_GPL(get_task_comm);
1052
1053void set_task_comm(struct task_struct *tsk, char *buf)
1054{
1055        task_lock(tsk);
1056
1057        trace_task_rename(tsk, buf);
1058
1059        /*
1060         * Threads may access current->comm without holding
1061         * the task lock, so write the string carefully.
1062         * Readers without a lock may see incomplete new
1063         * names but are safe from non-terminating string reads.
1064         */
1065        memset(tsk->comm, 0, TASK_COMM_LEN);
1066        wmb();
1067        strlcpy(tsk->comm, buf, sizeof(tsk->comm));
1068        task_unlock(tsk);
1069        perf_event_comm(tsk);
1070}
1071
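/*
 * Illustrative sketch, not part of this file: task names are read and
 * written through the helpers above rather than by touching ->comm
 * directly:
 *
 *	char comm[TASK_COMM_LEN];
 *
 *	get_task_comm(comm, current);
 *	...
 *	set_task_comm(current, comm);
 */
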
1072static void filename_to_taskname(char *tcomm, const char *fn, unsigned int len)
1073{
1074        int i, ch;
1075
1076        /* Copies the binary name from after last slash */
1077        for (i = 0; (ch = *(fn++)) != '\0';) {
1078                if (ch == '/')
1079                        i = 0; /* overwrite what we wrote */
1080                else
1081                        if (i < len - 1)
1082                                tcomm[i++] = ch;
1083        }
1084        tcomm[i] = '\0';
1085}
1086
1087int flush_old_exec(struct linux_binprm * bprm)
1088{
1089        int retval;
1090
1091        /*
1092         * Make sure we have a private signal table and that
1093         * we are unassociated from the previous thread group.
1094         */
1095        retval = de_thread(current);
1096        if (retval)
1097                goto out;
1098
1099        set_mm_exe_file(bprm->mm, bprm->file);
1100
1101        filename_to_taskname(bprm->tcomm, bprm->filename, sizeof(bprm->tcomm));
1102        /*
1103         * Release all of the old mmap stuff
1104         */
1105        acct_arg_size(bprm, 0);
1106        retval = exec_mmap(bprm->mm);
1107        if (retval)
1108                goto out;
1109
1110        bprm->mm = NULL;                /* We're using it now */
1111
1112        set_fs(USER_DS);
1113        current->flags &=
1114                ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | PF_NOFREEZE);
1115        flush_thread();
1116        current->personality &= ~bprm->per_clear;
1117
1118        return 0;
1119
1120out:
1121        return retval;
1122}
1123EXPORT_SYMBOL(flush_old_exec);
1124
1125void would_dump(struct linux_binprm *bprm, struct file *file)
1126{
1127        if (inode_permission(file->f_path.dentry->d_inode, MAY_READ) < 0)
1128                bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
1129}
1130EXPORT_SYMBOL(would_dump);
1131
1132void setup_new_exec(struct linux_binprm * bprm)
1133{
1134        arch_pick_mmap_layout(current->mm);
1135
1136        /* This is the point of no return */
1137        current->sas_ss_sp = current->sas_ss_size = 0;
1138
1139        if (uid_eq(current_euid(), current_uid()) && gid_eq(current_egid(), current_gid()))
1140                set_dumpable(current->mm, 1);
1141        else
1142                set_dumpable(current->mm, suid_dumpable);
1143
1144        set_task_comm(current, bprm->tcomm);
1145
1146        /* Set the new mm task size. We have to do that late because it may
1147         * depend on TIF_32BIT which is only updated in flush_thread() on
1148         * some architectures like powerpc
1149         */
1150        current->mm->task_size = TASK_SIZE;
1151
1152        /* install the new credentials */
1153        if (!uid_eq(bprm->cred->uid, current_euid()) ||
1154            !gid_eq(bprm->cred->gid, current_egid())) {
1155                current->pdeath_signal = 0;
1156        } else {
1157                would_dump(bprm, bprm->file);
1158                if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)
1159                        set_dumpable(current->mm, suid_dumpable);
1160        }
1161
1162        /*
1163         * Flush performance counters when crossing a
1164         * security domain:
1165         */
1166        if (!get_dumpable(current->mm))
1167                perf_event_exit_task(current);
1168
1169        /* An exec changes our domain. We are no longer part of the thread
1170           group */
1171
1172        current->self_exec_id++;
1173                        
1174        flush_signal_handlers(current, 0);
1175        flush_old_files(current->files);
1176}
1177EXPORT_SYMBOL(setup_new_exec);
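
/*
 * Illustrative sketch, not part of this file: the usual order of calls in
 * a load_binary handler, once it has decided the image is acceptable, is
 * roughly (error handling trimmed):
 *
 *	retval = flush_old_exec(bprm);
 *	if (retval)
 *		return retval;
 *	setup_new_exec(bprm);
 *	... map the binary and build the new stack ...
 *	install_exec_creds(bprm);
 */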
1178
1179/*
1180 * Prepare credentials and lock ->cred_guard_mutex.
1181 * install_exec_creds() commits the new creds and drops the lock.
 1182 * Or, if exec fails before, free_bprm() should release ->cred
 1183 * and unlock.
1184 */
1185int prepare_bprm_creds(struct linux_binprm *bprm)
1186{
1187        if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
1188                return -ERESTARTNOINTR;
1189
1190        bprm->cred = prepare_exec_creds();
1191        if (likely(bprm->cred))
1192                return 0;
1193
1194        mutex_unlock(&current->signal->cred_guard_mutex);
1195        return -ENOMEM;
1196}
1197
1198void free_bprm(struct linux_binprm *bprm)
1199{
1200        free_arg_pages(bprm);
1201        if (bprm->cred) {
1202                mutex_unlock(&current->signal->cred_guard_mutex);
1203                abort_creds(bprm->cred);
1204        }
1205        kfree(bprm);
1206}
1207
1208/*
1209 * install the new credentials for this executable
1210 */
1211void install_exec_creds(struct linux_binprm *bprm)
1212{
1213        security_bprm_committing_creds(bprm);
1214
1215        commit_creds(bprm->cred);
1216        bprm->cred = NULL;
1217        /*
1218         * cred_guard_mutex must be held at least to this point to prevent
1219         * ptrace_attach() from altering our determination of the task's
1220         * credentials; any time after this it may be unlocked.
1221         */
1222        security_bprm_committed_creds(bprm);
1223        mutex_unlock(&current->signal->cred_guard_mutex);
1224}
1225EXPORT_SYMBOL(install_exec_creds);
1226
1227/*
1228 * determine how safe it is to execute the proposed program
1229 * - the caller must hold ->cred_guard_mutex to protect against
1230 *   PTRACE_ATTACH
1231 */
1232static int check_unsafe_exec(struct linux_binprm *bprm)
1233{
1234        struct task_struct *p = current, *t;
1235        unsigned n_fs;
1236        int res = 0;
1237
1238        if (p->ptrace) {
1239                if (p->ptrace & PT_PTRACE_CAP)
1240                        bprm->unsafe |= LSM_UNSAFE_PTRACE_CAP;
1241                else
1242                        bprm->unsafe |= LSM_UNSAFE_PTRACE;
1243        }
1244
1245        /*
1246         * This isn't strictly necessary, but it makes it harder for LSMs to
1247         * mess up.
1248         */
1249        if (current->no_new_privs)
1250                bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;
1251
1252        n_fs = 1;
1253        spin_lock(&p->fs->lock);
1254        rcu_read_lock();
1255        for (t = next_thread(p); t != p; t = next_thread(t)) {
1256                if (t->fs == p->fs)
1257                        n_fs++;
1258        }
1259        rcu_read_unlock();
1260
1261        if (p->fs->users > n_fs) {
1262                bprm->unsafe |= LSM_UNSAFE_SHARE;
1263        } else {
1264                res = -EAGAIN;
1265                if (!p->fs->in_exec) {
1266                        p->fs->in_exec = 1;
1267                        res = 1;
1268                }
1269        }
1270        spin_unlock(&p->fs->lock);
1271
1272        return res;
1273}
1274
1275/* 
1276 * Fill the binprm structure from the inode. 
1277 * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
1278 *
1279 * This may be called multiple times for binary chains (scripts for example).
1280 */
1281int prepare_binprm(struct linux_binprm *bprm)
1282{
1283        umode_t mode;
1284        struct inode * inode = bprm->file->f_path.dentry->d_inode;
1285        int retval;
1286
1287        mode = inode->i_mode;
1288        if (bprm->file->f_op == NULL)
1289                return -EACCES;
1290
 1291        /* clear any set[ug]id data from a previous binary */
1292        bprm->cred->euid = current_euid();
1293        bprm->cred->egid = current_egid();
1294
1295        if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) &&
1296            !current->no_new_privs) {
1297                /* Set-uid? */
1298                if (mode & S_ISUID) {
1299                        if (!kuid_has_mapping(bprm->cred->user_ns, inode->i_uid))
1300                                return -EPERM;
1301                        bprm->per_clear |= PER_CLEAR_ON_SETID;
1302                        bprm->cred->euid = inode->i_uid;
1303
1304                }
1305
1306                /* Set-gid? */
1307                /*
1308                 * If setgid is set but no group execute bit then this
1309                 * is a candidate for mandatory locking, not a setgid
1310                 * executable.
1311                 */
1312                if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
1313                        if (!kgid_has_mapping(bprm->cred->user_ns, inode->i_gid))
1314                                return -EPERM;
1315                        bprm->per_clear |= PER_CLEAR_ON_SETID;
1316                        bprm->cred->egid = inode->i_gid;
1317                }
1318        }
1319
1320        /* fill in binprm security blob */
1321        retval = security_bprm_set_creds(bprm);
1322        if (retval)
1323                return retval;
1324        bprm->cred_prepared = 1;
1325
1326        memset(bprm->buf, 0, BINPRM_BUF_SIZE);
1327        return kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE);
1328}
1329
1330EXPORT_SYMBOL(prepare_binprm);
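
/*
 * Illustrative sketch, not part of this file: for an interpreted binary a
 * handler such as binfmt_script swaps in the interpreter's file and runs
 * another round of preparation before recursing into
 * search_binary_handler() ("interp" being the parsed interpreter path):
 *
 *	file = open_exec(interp);
 *	if (IS_ERR(file))
 *		return PTR_ERR(file);
 *	bprm->file = file;
 *	retval = prepare_binprm(bprm);
 *	if (retval < 0)
 *		return retval;
 *	return search_binary_handler(bprm, regs);
 */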
1331
1332/*
1333 * Arguments are '\0' separated strings found at the location bprm->p
 1334 * points to; chop off the first by relocating bprm->p to right after
1335 * the first '\0' encountered.
1336 */
1337int remove_arg_zero(struct linux_binprm *bprm)
1338{
1339        int ret = 0;
1340        unsigned long offset;
1341        char *kaddr;
1342        struct page *page;
1343
1344        if (!bprm->argc)
1345                return 0;
1346
1347        do {
1348                offset = bprm->p & ~PAGE_MASK;
1349                page = get_arg_page(bprm, bprm->p, 0);
1350                if (!page) {
1351                        ret = -EFAULT;
1352                        goto out;
1353                }
1354                kaddr = kmap_atomic(page);
1355
1356                for (; offset < PAGE_SIZE && kaddr[offset];
1357                                offset++, bprm->p++)
1358                        ;
1359
1360                kunmap_atomic(kaddr);
1361                put_arg_page(page);
1362
1363                if (offset == PAGE_SIZE)
1364                        free_arg_page(bprm, (bprm->p >> PAGE_SHIFT) - 1);
1365        } while (offset == PAGE_SIZE);
1366
1367        bprm->p++;
1368        bprm->argc--;
1369        ret = 0;
1370
1371out:
1372        return ret;
1373}
1374EXPORT_SYMBOL(remove_arg_zero);
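
/*
 * Illustrative sketch, not part of this file: the #! handler drops the
 * script's argv[0] and pushes the interpreter name in its place before
 * re-running the format search; "i_name" stands for the interpreter path
 * parsed out of the "#!" line:
 *
 *	retval = remove_arg_zero(bprm);
 *	if (retval)
 *		return retval;
 *	retval = copy_strings_kernel(1, &i_name, bprm);
 *	if (retval < 0)
 *		return retval;
 *	bprm->argc++;
 */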
1375
1376/*
1377 * cycle the list of binary formats handler, until one recognizes the image
1378 */
 1379int search_binary_handler(struct linux_binprm *bprm, struct pt_regs *regs)
1380{
1381        unsigned int depth = bprm->recursion_depth;
 1382        int try, retval;
1383        struct linux_binfmt *fmt;
1384        pid_t old_pid, old_vpid;
1385
1386        retval = security_bprm_check(bprm);
1387        if (retval)
1388                return retval;
1389
1390        retval = audit_bprm(bprm);
1391        if (retval)
1392                return retval;
1393
1394        /* Need to fetch pid before load_binary changes it */
1395        old_pid = current->pid;
1396        rcu_read_lock();
1397        old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
1398        rcu_read_unlock();
1399
1400        retval = -ENOENT;
1401        for (try=0; try<2; try++) {
1402                read_lock(&binfmt_lock);
1403                list_for_each_entry(fmt, &formats, lh) {
1404                        int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary;
1405                        if (!fn)
1406                                continue;
1407                        if (!try_module_get(fmt->module))
1408                                continue;
1409                        read_unlock(&binfmt_lock);
1410                        retval = fn(bprm, regs);
1411                        /*
1412                         * Restore the depth counter to its starting value
1413                         * in this call, so we don't have to rely on every
1414                         * load_binary function to restore it on return.
1415                         */
1416                        bprm->recursion_depth = depth;
1417                        if (retval >= 0) {
1418                                if (depth == 0) {
1419                                        trace_sched_process_exec(current, old_pid, bprm);
1420                                        ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
1421                                }
1422                                put_binfmt(fmt);
1423                                allow_write_access(bprm->file);
1424                                if (bprm->file)
1425                                        fput(bprm->file);
1426                                bprm->file = NULL;
1427                                current->did_exec = 1;
1428                                proc_exec_connector(current);
1429                                return retval;
1430                        }
1431                        read_lock(&binfmt_lock);
1432                        put_binfmt(fmt);
1433                        if (retval != -ENOEXEC || bprm->mm == NULL)
1434                                break;
1435                        if (!bprm->file) {
1436                                read_unlock(&binfmt_lock);
1437                                return retval;
1438                        }
1439                }
1440                read_unlock(&binfmt_lock);
1441#ifdef CONFIG_MODULES
1442                if (retval != -ENOEXEC || bprm->mm == NULL) {
1443                        break;
1444                } else {
1445#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
1446                        if (printable(bprm->buf[0]) &&
1447                            printable(bprm->buf[1]) &&
1448                            printable(bprm->buf[2]) &&
1449                            printable(bprm->buf[3]))
1450                                break; /* -ENOEXEC */
1451                        if (try)
1452                                break; /* -ENOEXEC */
1453                        request_module("binfmt-%04x", *(unsigned short *)(&bprm->buf[2]));
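                        /*
                         * Example: an ELF image starts with 0x7f 'E' 'L' 'F',
                         * so on a little-endian machine buf[2..3] form 0x464c
                         * and the request above is for "binfmt-464c".
                         */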
1454                }
1455#else
1456                break;
1457#endif
1458        }
1459        return retval;
1460}
1461
1462EXPORT_SYMBOL(search_binary_handler);
1463
1464/*
1465 * sys_execve() executes a new program.
1466 */
1467static int do_execve_common(const char *filename,
1468                                struct user_arg_ptr argv,
1469                                struct user_arg_ptr envp,
1470                                struct pt_regs *regs)
1471{
1472        struct linux_binprm *bprm;
1473        struct file *file;
1474        struct files_struct *displaced;
1475        bool clear_in_exec;
1476        int retval;
1477        const struct cred *cred = current_cred();
1478
1479        /*
1480         * We move the actual failure in case of RLIMIT_NPROC excess from
1481         * set*uid() to execve() because too many poorly written programs
1482         * don't check setuid() return code.  Here we additionally recheck
1483         * whether NPROC limit is still exceeded.
1484         */
1485        if ((current->flags & PF_NPROC_EXCEEDED) &&
1486            atomic_read(&cred->user->processes) > rlimit(RLIMIT_NPROC)) {
1487                retval = -EAGAIN;
1488                goto out_ret;
1489        }
1490
1491        /* We're below the limit (still or again), so we don't want to make
1492         * further execve() calls fail. */
1493        current->flags &= ~PF_NPROC_EXCEEDED;
1494
1495        retval = unshare_files(&displaced);
1496        if (retval)
1497                goto out_ret;
1498
1499        retval = -ENOMEM;
1500        bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
1501        if (!bprm)
1502                goto out_files;
1503
1504        retval = prepare_bprm_creds(bprm);
1505        if (retval)
1506                goto out_free;
1507
1508        retval = check_unsafe_exec(bprm);
1509        if (retval < 0)
1510                goto out_free;
1511        clear_in_exec = retval;
1512        current->in_execve = 1;
1513
1514        file = open_exec(filename);
1515        retval = PTR_ERR(file);
1516        if (IS_ERR(file))
1517                goto out_unmark;
1518
1519        sched_exec();
1520
1521        bprm->file = file;
1522        bprm->filename = filename;
1523        bprm->interp = filename;
1524
1525        retval = bprm_mm_init(bprm);
1526        if (retval)
1527                goto out_file;
1528
1529        bprm->argc = count(argv, MAX_ARG_STRINGS);
1530        if ((retval = bprm->argc) < 0)
1531                goto out;
1532
1533        bprm->envc = count(envp, MAX_ARG_STRINGS);
1534        if ((retval = bprm->envc) < 0)
1535                goto out;
1536
1537        retval = prepare_binprm(bprm);
1538        if (retval < 0)
1539                goto out;
1540
1541        retval = copy_strings_kernel(1, &bprm->filename, bprm);
1542        if (retval < 0)
1543                goto out;
1544
1545        bprm->exec = bprm->p;
1546        retval = copy_strings(bprm->envc, envp, bprm);
1547        if (retval < 0)
1548                goto out;
1549
1550        retval = copy_strings(bprm->argc, argv, bprm);
1551        if (retval < 0)
1552                goto out;
1553
 1554        retval = search_binary_handler(bprm, regs);
1555        if (retval < 0)
1556                goto out;
1557
1558        /* execve succeeded */
1559        current->fs->in_exec = 0;
1560        current->in_execve = 0;
1561        acct_update_integrals(current);
1562        free_bprm(bprm);
1563        if (displaced)
1564                put_files_struct(displaced);
1565        return retval;
1566
1567out:
1568        if (bprm->mm) {
1569                acct_arg_size(bprm, 0);
1570                mmput(bprm->mm);
1571        }
1572
1573out_file:
1574        if (bprm->file) {
1575                allow_write_access(bprm->file);
1576                fput(bprm->file);
1577        }
1578
1579out_unmark:
1580        if (clear_in_exec)
1581                current->fs->in_exec = 0;
1582        current->in_execve = 0;
1583
1584out_free:
1585        free_bprm(bprm);
1586
1587out_files:
1588        if (displaced)
1589                reset_files_struct(displaced);
1590out_ret:
1591        return retval;
1592}
1593
1594int do_execve(const char *filename,
1595        const char __user *const __user *__argv,
1596        const char __user *const __user *__envp,
1597        struct pt_regs *regs)
1598{
1599        struct user_arg_ptr argv = { .ptr.native = __argv };
1600        struct user_arg_ptr envp = { .ptr.native = __envp };
1601        return do_execve_common(filename, argv, envp, regs);
1602}
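
/*
 * Illustrative sketch, not part of this file: architectures provide the
 * actual sys_execve() entry point, which copies the filename in and hands
 * off to do_execve(), roughly as x86 does in this era:
 *
 *	long sys_execve(const char __user *name,
 *			const char __user *const __user *argv,
 *			const char __user *const __user *envp,
 *			struct pt_regs *regs)
 *	{
 *		long error;
 *		char *filename;
 *
 *		filename = getname(name);
 *		error = PTR_ERR(filename);
 *		if (IS_ERR(filename))
 *			return error;
 *		error = do_execve(filename, argv, envp, regs);
 *		putname(filename);
 *		return error;
 *	}
 */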
1603
1604#ifdef CONFIG_COMPAT
1605int compat_do_execve(char *filename,
1606        compat_uptr_t __user *__argv,
1607        compat_uptr_t __user *__envp,
1608        struct pt_regs *regs)
1609{
1610        struct user_arg_ptr argv = {
1611                .is_compat = true,
1612                .ptr.compat = __argv,
1613        };
1614        struct user_arg_ptr envp = {
1615                .is_compat = true,
1616                .ptr.compat = __envp,
1617        };
1618        return do_execve_common(filename, argv, envp, regs);
1619}
1620#endif
1621
1622void set_binfmt(struct linux_binfmt *new)
1623{
1624        struct mm_struct *mm = current->mm;
1625
1626        if (mm->binfmt)
1627                module_put(mm->binfmt->module);
1628
1629        mm->binfmt = new;
1630        if (new)
1631                __module_get(new->module);
1632}
1633
1634EXPORT_SYMBOL(set_binfmt);
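
/*
 * Illustrative sketch, not part of this file: a handler calls set_binfmt()
 * once it has committed to its format, so core dumps and /proc know which
 * module owns the image; "example_format" is the handler's own
 * struct linux_binfmt (a made-up name here):
 *
 *	set_binfmt(&example_format);
 */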
1635
1636static int expand_corename(struct core_name *cn)
1637{
1638        char *old_corename = cn->corename;
1639
1640        cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count);
1641        cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL);
1642
1643        if (!cn->corename) {
1644                kfree(old_corename);
1645                return -ENOMEM;
1646        }
1647
1648        return 0;
1649}
1650
1651static int cn_printf(struct core_name *cn, const char *fmt, ...)
1652{
1653        char *cur;
1654        int need;
1655        int ret;
1656        va_list arg;
1657
1658        va_start(arg, fmt);
1659        need = vsnprintf(NULL, 0, fmt, arg);
1660        va_end(arg);
1661
1662        if (likely(need < cn->size - cn->used - 1))
1663                goto out_printf;
1664
1665        ret = expand_corename(cn);
1666        if (ret)
1667                goto expand_fail;
1668
1669out_printf:
1670        cur = cn->corename + cn->used;
1671        va_start(arg, fmt);
1672        vsnprintf(cur, need + 1, fmt, arg);
1673        va_end(arg);
1674        cn->used += need;
1675        return 0;
1676
1677expand_fail:
1678        return ret;
1679}
1680
1681static void cn_escape(char *str)
1682{
1683        for (; *str; str++)
1684                if (*str == '/')
1685                        *str = '!';
1686}
1687
1688static int cn_print_exe_file(struct core_name *cn)
1689{
1690        struct file *exe_file;
1691        char *pathbuf, *path;
1692        int ret;
1693
1694        exe_file = get_mm_exe_file(current->mm);
1695        if (!exe_file) {
1696                char *commstart = cn->corename + cn->used;
1697                ret = cn_printf(cn, "%s (path unknown)", current->comm);
1698                cn_escape(commstart);
1699                return ret;
1700        }
1701
1702        pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
1703        if (!pathbuf) {
1704                ret = -ENOMEM;
1705                goto put_exe_file;
1706        }
1707
1708        path = d_path(&exe_file->f_path, pathbuf, PATH_MAX);
1709        if (IS_ERR(path)) {
1710                ret = PTR_ERR(path);
1711                goto free_buf;
1712        }
1713
1714        cn_escape(path);
1715
1716        ret = cn_printf(cn, "%s", path);
1717
1718free_buf:
1719        kfree(pathbuf);
1720put_exe_file:
1721        fput(exe_file);
1722        return ret;
1723}
1724
1725/* format_corename will inspect the core_pattern parameter and output a
1726 * name into cn->corename, growing the buffer via expand_corename() as
1727 * needed.  Returns 1 for a pipe pattern, 0 otherwise, or a negative errno.
1728 */
1729static int format_corename(struct core_name *cn, long signr)
1730{
1731        const struct cred *cred = current_cred();
1732        const char *pat_ptr = core_pattern;
1733        int ispipe = (*pat_ptr == '|');
1734        int pid_in_pattern = 0;
1735        int err = 0;
1736
1737        cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count);
1738        cn->corename = kmalloc(cn->size, GFP_KERNEL);
1739        cn->used = 0;
1740
1741        if (!cn->corename)
1742                return -ENOMEM;
1743
1744        /* Repeat as long as we have more pattern to process and more output
1745           space */
1746        while (*pat_ptr) {
1747                if (*pat_ptr != '%') {
1748                        if (*pat_ptr == 0)
1749                                goto out;
1750                        err = cn_printf(cn, "%c", *pat_ptr++);
1751                } else {
1752                        switch (*++pat_ptr) {
1753                        /* single % at the end, drop that */
1754                        case 0:
1755                                goto out;
1756                        /* Double percent, output one percent */
1757                        case '%':
1758                                err = cn_printf(cn, "%c", '%');
1759                                break;
1760                        /* pid */
1761                        case 'p':
1762                                pid_in_pattern = 1;
1763                                err = cn_printf(cn, "%d",
1764                                              task_tgid_vnr(current));
1765                                break;
1766                        /* uid */
1767                        case 'u':
1768                                err = cn_printf(cn, "%d", cred->uid);
1769                                break;
1770                        /* gid */
1771                        case 'g':
1772                                err = cn_printf(cn, "%d", cred->gid);
1773                                break;
1774                        /* signal that caused the coredump */
1775                        case 's':
1776                                err = cn_printf(cn, "%ld", signr);
1777                                break;
1778                        /* UNIX time of coredump */
1779                        case 't': {
1780                                struct timeval tv;
1781                                do_gettimeofday(&tv);
1782                                err = cn_printf(cn, "%lu", tv.tv_sec);
1783                                break;
1784                        }
1785                        /* hostname */
1786                        case 'h': {
1787                                char *namestart = cn->corename + cn->used;
1788                                down_read(&uts_sem);
1789                                err = cn_printf(cn, "%s",
1790                                              utsname()->nodename);
1791                                up_read(&uts_sem);
1792                                cn_escape(namestart);
1793                                break;
1794                        }
1795                        /* executable */
1796                        case 'e': {
1797                                char *commstart = cn->corename + cn->used;
1798                                err = cn_printf(cn, "%s", current->comm);
1799                                cn_escape(commstart);
1800                                break;
1801                        }
1802                        case 'E':
1803                                err = cn_print_exe_file(cn);
1804                                break;
1805                        /* core limit size */
1806                        case 'c':
1807                                err = cn_printf(cn, "%lu",
1808                                              rlimit(RLIMIT_CORE));
1809                                break;
1810                        default:
1811                                break;
1812                        }
1813                        ++pat_ptr;
1814                }
1815
1816                if (err)
1817                        return err;
1818        }
1819
1820        /* Backward compatibility with core_uses_pid:
1821         *
1822         * If core_pattern does not include a %p (as is the default)
1823         * and core_uses_pid is set, then .%pid will be appended to
1824         * the filename. Do not do this for piped commands. */
1825        if (!ispipe && !pid_in_pattern && core_uses_pid) {
1826                err = cn_printf(cn, ".%d", task_tgid_vnr(current));
1827                if (err)
1828                        return err;
1829        }
1830out:
1831        return ispipe;
1832}
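
/*
 * For illustration only (not part of this file): with the specifiers
 * handled above, a core_pattern of "core.%e.%p.%t" for a task whose comm
 * is "myapp", whose tgid (as seen in its pid namespace) is 1234, crashing
 * at UNIX time 1000000000, expands to "core.myapp.1234.1000000000".
 * A pattern beginning with '|', e.g. "|/path/to/helper %p %s", is not
 * treated as a filename at all; format_corename() returns 1 and
 * do_coredump() pipes the dump to that helper instead.
 */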
1833
1834static int zap_process(struct task_struct *start, int exit_code)
1835{
1836        struct task_struct *t;
1837        int nr = 0;
1838
1839        start->signal->flags = SIGNAL_GROUP_EXIT;
1840        start->signal->group_exit_code = exit_code;
1841        start->signal->group_stop_count = 0;
1842
1843        t = start;
1844        do {
1845                task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
1846                if (t != current && t->mm) {
1847                        sigaddset(&t->pending.signal, SIGKILL);
1848                        signal_wake_up(t, 1);
1849                        nr++;
1850                }
1851        } while_each_thread(start, t);
1852
1853        return nr;
1854}
1855
1856static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
1857                                struct core_state *core_state, int exit_code)
1858{
1859        struct task_struct *g, *p;
1860        unsigned long flags;
1861        int nr = -EAGAIN;
1862
1863        spin_lock_irq(&tsk->sighand->siglock);
1864        if (!signal_group_exit(tsk->signal)) {
1865                mm->core_state = core_state;
1866                nr = zap_process(tsk, exit_code);
1867        }
1868        spin_unlock_irq(&tsk->sighand->siglock);
1869        if (unlikely(nr < 0))
1870                return nr;
1871
1872        if (atomic_read(&mm->mm_users) == nr + 1)
1873                goto done;
1874        /*
1875         * We should find and kill all tasks which use this mm, and we should
1876         * count them correctly into ->nr_threads. We don't take tasklist
1877         * lock, but this is safe wrt:
1878         *
1879         * fork:
1880         *      None of the sub-threads can fork after zap_process(leader). All
1881         *      processes which were created before this point should be
1882         *      visible to zap_threads() because copy_process() adds the new
1883         *      process to the tail of init_task.tasks list, and lock/unlock
1884         *      of ->siglock provides a memory barrier.
1885         *
1886         * do_exit:
1887         *      The caller holds mm->mmap_sem. This means that the task which
1888         *      uses this mm can't pass exit_mm(), so it can't exit or clear
1889         *      its ->mm.
1890         *
1891         * de_thread:
1892         *      It does list_replace_rcu(&leader->tasks, &current->tasks),
1893         *      so we will see either the old or the new leader; this does not matter.
1894         *      However, it can change p->sighand, so lock_task_sighand(p)
1895         *      must be used. Since p->mm != NULL and we hold ->mmap_sem
1896         *      it can't fail.
1897         *
1898         *      Note also that "g" can be the old leader with ->mm == NULL
1899         *      and already unhashed and thus removed from ->thread_group.
1900         *      This is OK, __unhash_process()->list_del_rcu() does not
1901         *      clear the ->next pointer, we will find the new leader via
1902         *      next_thread().
1903         */
1904        rcu_read_lock();
1905        for_each_process(g) {
1906                if (g == tsk->group_leader)
1907                        continue;
1908                if (g->flags & PF_KTHREAD)
1909                        continue;
1910                p = g;
1911                do {
1912                        if (p->mm) {
1913                                if (unlikely(p->mm == mm)) {
1914                                        lock_task_sighand(p, &flags);
1915                                        nr += zap_process(p, exit_code);
1916                                        unlock_task_sighand(p, &flags);
1917                                }
1918                                break;
1919                        }
1920                } while_each_thread(g, p);
1921        }
1922        rcu_read_unlock();
1923done:
1924        atomic_set(&core_state->nr_threads, nr);
1925        return nr;
1926}
1927
1928static int coredump_wait(int exit_code, struct core_state *core_state)
1929{
1930        struct task_struct *tsk = current;
1931        struct mm_struct *mm = tsk->mm;
1932        int core_waiters = -EBUSY;
1933
1934        init_completion(&core_state->startup);
1935        core_state->dumper.task = tsk;
1936        core_state->dumper.next = NULL;
1937
1938        down_write(&mm->mmap_sem);
1939        if (!mm->core_state)
1940                core_waiters = zap_threads(tsk, mm, core_state, exit_code);
1941        up_write(&mm->mmap_sem);
1942
1943        if (core_waiters > 0) {
1944                struct core_thread *ptr;
1945
1946                wait_for_completion(&core_state->startup);
1947                /*
1948                 * Wait for all the threads to become inactive, so that
1949                 * all the thread context (extended register state, like
1950                 * the FPU etc.) gets copied to memory.
1951                 */
1952                ptr = core_state->dumper.next;
1953                while (ptr != NULL) {
1954                        wait_task_inactive(ptr->task, 0);
1955                        ptr = ptr->next;
1956                }
1957        }
1958
1959        return core_waiters;
1960}
1961
1962static void coredump_finish(struct mm_struct *mm)
1963{
1964        struct core_thread *curr, *next;
1965        struct task_struct *task;
1966
1967        next = mm->core_state->dumper.next;
1968        while ((curr = next) != NULL) {
1969                next = curr->next;
1970                task = curr->task;
1971                /*
1972                 * see exit_mm(), curr->task must not see
1973                 * ->task == NULL before we read ->next.
1974                 */
1975                smp_mb();
1976                curr->task = NULL;
1977                wake_up_process(task);
1978        }
1979
1980        mm->core_state = NULL;
1981}
1982
1983/*
1984 * set_dumpable converts traditional three-value dumpable to two flags and
1985 * stores them into mm->flags.  It modifies lower two bits of mm->flags, but
1986 * these bits are not changed atomically.  So get_dumpable can observe the
1987 * intermediate state.  To avoid unexpected behavior, get_dumpable returns
1988 * either the old dumpable value or the new one by paying attention to the
1989 * order in which the bits are modified.
1990 *
1991 * dumpable |   mm->flags (binary)
1992 * old  new | initial interim  final
1993 * ---------+-----------------------
1994 *  0    1  |   00      01      01
1995 *  0    2  |   00      10(*)   11
1996 *  1    0  |   01      00      00
1997 *  1    2  |   01      11      11
1998 *  2    0  |   11      10(*)   00
1999 *  2    1  |   11      11      01
2000 *
2001 * (*) get_dumpable regards interim value of 10 as 11.
2002 */
2003void set_dumpable(struct mm_struct *mm, int value)
2004{
2005        switch (value) {
2006        case SUID_DUMPABLE_DISABLED:
2007                clear_bit(MMF_DUMPABLE, &mm->flags);
2008                smp_wmb();
2009                clear_bit(MMF_DUMP_SECURELY, &mm->flags);
2010                break;
2011        case SUID_DUMPABLE_ENABLED:
2012                set_bit(MMF_DUMPABLE, &mm->flags);
2013                smp_wmb();
2014                clear_bit(MMF_DUMP_SECURELY, &mm->flags);
2015                break;
2016        case SUID_DUMPABLE_SAFE:
2017                set_bit(MMF_DUMP_SECURELY, &mm->flags);
2018                smp_wmb();
2019                set_bit(MMF_DUMPABLE, &mm->flags);
2020                break;
2021        }
2022}
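
/*
 * For reference (not part of this file): the three values above are the
 * ones written to /proc/sys/fs/suid_dumpable -- 0 disables core dumps for
 * processes whose credentials have changed, 1 allows them as usual, and
 * 2 ("suidsafe") dumps such processes as root, and only to a pipe handler
 * or an absolute path (see do_coredump() below).
 */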
2023
2024static int __get_dumpable(unsigned long mm_flags)
2025{
2026        int ret;
2027
2028        ret = mm_flags & MMF_DUMPABLE_MASK;
2029        return (ret > SUID_DUMPABLE_ENABLED) ? SUID_DUMPABLE_SAFE : ret;
2030}
2031
2032int get_dumpable(struct mm_struct *mm)
2033{
2034        return __get_dumpable(mm->flags);
2035}
2036
2037static void wait_for_dump_helpers(struct file *file)
2038{
2039        struct pipe_inode_info *pipe;
2040
2041        pipe = file->f_path.dentry->d_inode->i_pipe;
2042
2043        pipe_lock(pipe);
2044        pipe->readers++;
2045        pipe->writers--;
2046
2047        while ((pipe->readers > 1) && (!signal_pending(current))) {
2048                wake_up_interruptible_sync(&pipe->wait);
2049                kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
2050                pipe_wait(pipe);
2051        }
2052
2053        pipe->readers--;
2054        pipe->writers++;
2055        pipe_unlock(pipe);
2056
2057}
2058
2059
2060/*
2061 * umh_pipe_setup
2062 * helper function to customize the process used
2063 * to collect the core in userspace.  Specifically
2064 * it sets up a pipe and installs it as fd 0 (stdin)
2065 * for the process.  Returns 0 on success, or
2066 * for the process.  Returns 0 on success, or a
2067 * negative error code on failure.
2068 * is a special value that we use to trap recursive
2069 * core dumps
2070 */
2071static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
2072{
2073        struct file *files[2];
2074        struct fdtable *fdt;
2075        struct coredump_params *cp = (struct coredump_params *)info->data;
2076        struct files_struct *cf = current->files;
2077        int err = create_pipe_files(files, 0);
2078        if (err)
2079                return err;
2080
2081        cp->file = files[1];
2082
2083        sys_close(0);
2084        fd_install(0, files[0]);
2085        spin_lock(&cf->file_lock);
2086        fdt = files_fdtable(cf);
2087        __set_open_fd(0, fdt);
2088        __clear_close_on_exec(0, fdt);
2089        spin_unlock(&cf->file_lock);
2090
2091        /* and disallow core files too */
2092        current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
2093
2094        return 0;
2095}
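
/*
 * For illustration only (not part of this file): with a core_pattern such
 * as "|/usr/local/bin/core-collector %p %s" (a hypothetical helper), the
 * helper is spawned by call_usermodehelper_fns() below with
 * umh_pipe_setup() as its init function, so it reads the core image from
 * fd 0 and receives the expanded %p/%s values as argv[1] and argv[2].
 * Its own RLIMIT_CORE is forced to 1 so that a crash of the helper itself
 * cannot recurse into another piped dump.
 */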
2096
2097void do_coredump(long signr, int exit_code, struct pt_regs *regs)
2098{
2099        struct core_state core_state;
2100        struct core_name cn;
2101        struct mm_struct *mm = current->mm;
2102        struct linux_binfmt * binfmt;
2103        const struct cred *old_cred;
2104        struct cred *cred;
2105        int retval = 0;
2106        int flag = 0;
2107        int ispipe;
2108        bool need_nonrelative = false;
2109        static atomic_t core_dump_count = ATOMIC_INIT(0);
2110        struct coredump_params cprm = {
2111                .signr = signr,
2112                .regs = regs,
2113                .limit = rlimit(RLIMIT_CORE),
2114                /*
2115                 * We must use the same mm->flags while dumping core to avoid
2116                 * inconsistency of bit flags, since this flag is not protected
2117                 * by any locks.
2118                 */
2119                .mm_flags = mm->flags,
2120        };
2121
2122        audit_core_dumps(signr);
2123
2124        binfmt = mm->binfmt;
2125        if (!binfmt || !binfmt->core_dump)
2126                goto fail;
2127        if (!__get_dumpable(cprm.mm_flags))
2128                goto fail;
2129
2130        cred = prepare_creds();
2131        if (!cred)
2132                goto fail;
2133        /*
2134         * We cannot trust fsuid as being the "true" uid of the process
2135         * nor do we know its entire history. We only know it was tainted
2136         * so we dump it as root in mode 2, and only into a controlled
2137         * environment (pipe handler or fully qualified path).
2138         */
2139        if (__get_dumpable(cprm.mm_flags) == SUID_DUMPABLE_SAFE) {
2140                /* Setuid core dump mode */
2141                flag = O_EXCL;          /* Stop rewrite attacks */
2142                cred->fsuid = GLOBAL_ROOT_UID;  /* Dump root private */
2143                need_nonrelative = true;
2144        }
2145
2146        retval = coredump_wait(exit_code, &core_state);
2147        if (retval < 0)
2148                goto fail_creds;
2149
2150        old_cred = override_creds(cred);
2151
2152        /*
2153         * Clear any false indication of pending signals that might
2154         * be seen by the filesystem code called to write the core file.
2155         */
2156        clear_thread_flag(TIF_SIGPENDING);
2157
2158        ispipe = format_corename(&cn, signr);
2159
2160        if (ispipe) {
2161                int dump_count;
2162                char **helper_argv;
2163
2164                if (ispipe < 0) {
2165                        printk(KERN_WARNING "format_corename failed\n");
2166                        printk(KERN_WARNING "Aborting core\n");
2167                        goto fail_corename;
2168                }
2169
2170                if (cprm.limit == 1) {
2171                        /* See umh_pipe_setup() which sets RLIMIT_CORE = 1.
2172                         *
2173                         * Normally core limits are irrelevant to pipes, since
2174                         * we're not writing to the file system, but we use
2175                         * cprm.limit of 1 here as a special value; this is a
2176                         * consistent way to catch recursive crashes.
2177                         * We can still crash if the core_pattern binary sets
2178                         * RLIMIT_CORE to a value other than 1, but it runs as root and can do
2179                         * lots of stupid things.
2180                         *
2181                         * Note that we use task_tgid_vnr here to grab the pid
2182                         * of the thread group leader.  That way we get the
2183                         * right pid if a thread in a multi-threaded
2184                         * core_pattern process dies.
2185                         */
2186                        printk(KERN_WARNING
2187                                "Process %d(%s) has RLIMIT_CORE set to 1\n",
2188                                task_tgid_vnr(current), current->comm);
2189                        printk(KERN_WARNING "Aborting core\n");
2190                        goto fail_unlock;
2191                }
2192                cprm.limit = RLIM_INFINITY;
2193
2194                dump_count = atomic_inc_return(&core_dump_count);
2195                if (core_pipe_limit && (core_pipe_limit < dump_count)) {
2196                        printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n",
2197                               task_tgid_vnr(current), current->comm);
2198                        printk(KERN_WARNING "Skipping core dump\n");
2199                        goto fail_dropcount;
2200                }
2201
2202                helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL);
2203                if (!helper_argv) {
2204                        printk(KERN_WARNING "%s failed to allocate memory\n",
2205                               __func__);
2206                        goto fail_dropcount;
2207                }
2208
2209                retval = call_usermodehelper_fns(helper_argv[0], helper_argv,
2210                                        NULL, UMH_WAIT_EXEC, umh_pipe_setup,
2211                                        NULL, &cprm);
2212                argv_free(helper_argv);
2213                if (retval) {
2214                        printk(KERN_INFO "Core dump to %s pipe failed\n",
2215                               cn.corename);
2216                        goto close_fail;
2217                }
2218        } else {
2219                struct inode *inode;
2220
2221                if (cprm.limit < binfmt->min_coredump)
2222                        goto fail_unlock;
2223
2224                if (need_nonrelative && cn.corename[0] != '/') {
2225                        printk(KERN_WARNING "Pid %d(%s) can only dump core "\
2226                                "to fully qualified path!\n",
2227                                task_tgid_vnr(current), current->comm);
2228                        printk(KERN_WARNING "Skipping core dump\n");
2229                        goto fail_unlock;
2230                }
2231
2232                cprm.file = filp_open(cn.corename,
2233                                 O_CREAT | 2 /* O_RDWR */ | O_NOFOLLOW | O_LARGEFILE | flag,
2234                                 0600);
2235                if (IS_ERR(cprm.file))
2236                        goto fail_unlock;
2237
2238                inode = cprm.file->f_path.dentry->d_inode;
2239                if (inode->i_nlink > 1)
2240                        goto close_fail;
2241                if (d_unhashed(cprm.file->f_path.dentry))
2242                        goto close_fail;
2243                /*
2244                 * AK: actually i see no reason to not allow this for named
2245                 * pipes etc, but keep the previous behaviour for now.
2246                 */
2247                if (!S_ISREG(inode->i_mode))
2248                        goto close_fail;
2249                /*
2250                 * Don't allow local users to get cute and trick others into dumping core
2251                 * into their pre-created files.
2252                 */
2253                if (!uid_eq(inode->i_uid, current_fsuid()))
2254                        goto close_fail;
2255                if (!cprm.file->f_op || !cprm.file->f_op->write)
2256                        goto close_fail;
2257                if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
2258                        goto close_fail;
2259        }
2260
2261        retval = binfmt->core_dump(&cprm);
2262        if (retval)
2263                current->signal->group_exit_code |= 0x80;
2264
2265        if (ispipe && core_pipe_limit)
2266                wait_for_dump_helpers(cprm.file);
2267close_fail:
2268        if (cprm.file)
2269                filp_close(cprm.file, NULL);
2270fail_dropcount:
2271        if (ispipe)
2272                atomic_dec(&core_dump_count);
2273fail_unlock:
2274        kfree(cn.corename);
2275fail_corename:
2276        coredump_finish(mm);
2277        revert_creds(old_cred);
2278fail_creds:
2279        put_cred(cred);
2280fail:
2281        return;
2282}
2283
2284/*
2285 * Core dumping helper functions.  These are the only things you should
2286 * do on a core-file: use only these functions to write out all the
2287 * necessary info.
2288 */
2289int dump_write(struct file *file, const void *addr, int nr)
2290{
2291        return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr;
2292}
2293EXPORT_SYMBOL(dump_write);
2294
2295int dump_seek(struct file *file, loff_t off)
2296{
2297        int ret = 1;
2298
2299        if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
2300                if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
2301                        return 0;
2302        } else {
2303                char *buf = (char *)get_zeroed_page(GFP_KERNEL);
2304
2305                if (!buf)
2306                        return 0;
2307                while (off > 0) {
2308                        unsigned long n = off;
2309
2310                        if (n > PAGE_SIZE)
2311                                n = PAGE_SIZE;
2312                        if (!dump_write(file, buf, n)) {
2313                                ret = 0;
2314                                break;
2315                        }
2316                        off -= n;
2317                }
2318                free_page((unsigned long)buf);
2319        }
2320        return ret;
2321}
2322EXPORT_SYMBOL(dump_seek);
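
/*
 * For illustration only (not part of this file): a binary format's
 * core_dump() method is expected to emit everything through the two
 * helpers above.  A rough sketch, with hypothetical names and no error
 * handling beyond the helpers' own return values:
 *
 *	static int foofmt_core_dump(struct coredump_params *cprm)
 *	{
 *		char hdr[64] = "FOOCORE";		// hypothetical magic
 *		loff_t written = sizeof(hdr);
 *
 *		if (!dump_write(cprm->file, hdr, sizeof(hdr)))
 *			return 0;			// 0 => dump failed
 *
 *		// dump_seek() advances the file position, writing zeroes
 *		// by hand if the target (e.g. a pipe) cannot seek
 *		if (!dump_seek(cprm->file, PAGE_SIZE - written))
 *			return 0;
 *
 *		// ... write out the memory segments the same way ...
 *		return 1;				// non-zero => success
 *	}
 */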
2323