/*
 *  linux/fs/exec.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * #!-checking implemented by tytso.
 */
/*
 * Demand-loading implemented 01.12.91 - no need to read anything but
 * the header into memory. The inode of the executable is put into
 * "current->executable", and page faults do the actual loading. Clean.
 *
 * Once more I can proudly say that linux stood up to being changed: it
 * was less than 2 hours work to get demand-loading completely implemented.
 *
 * Demand loading changed July 1993 by Eric Youngdale.  Use mmap instead,
 * current->executable is only used by the procfs.  This allows a dispatch
 * table to check for several different types of binary formats.  We keep
 * trying until we recognize the file or we run out of supported binary
 * formats.
 */

#include <linux/slab.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
#include <linux/vmacache.h>
#include <linux/stat.h>
#include <linux/fcntl.h>
#include <linux/swap.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/perf_event.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>
#include <linux/key.h>
#include <linux/personality.h>
#include <linux/binfmts.h>
#include <linux/utsname.h>
#include <linux/pid_namespace.h>
#include <linux/module.h>
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/audit.h>
#include <linux/tracehook.h>
#include <linux/kmod.h>
#include <linux/fsnotify.h>
#include <linux/fs_struct.h>
#include <linux/pipe_fs_i.h>
#include <linux/oom.h>
#include <linux/compat.h>

#include <asm/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/tlb.h>

#include <trace/events/task.h>
#include "internal.h"

#include <trace/events/sched.h>

int suid_dumpable = 0;

static LIST_HEAD(formats);
static DEFINE_RWLOCK(binfmt_lock);

void __register_binfmt(struct linux_binfmt * fmt, int insert)
{
        BUG_ON(!fmt);
        if (WARN_ON(!fmt->load_binary))
                return;
        write_lock(&binfmt_lock);
        insert ? list_add(&fmt->lh, &formats) :
                 list_add_tail(&fmt->lh, &formats);
        write_unlock(&binfmt_lock);
}

EXPORT_SYMBOL(__register_binfmt);

void unregister_binfmt(struct linux_binfmt * fmt)
{
        write_lock(&binfmt_lock);
        list_del(&fmt->lh);
        write_unlock(&binfmt_lock);
}

EXPORT_SYMBOL(unregister_binfmt);
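
/*
 * Illustrative sketch (not part of the original file): a binary-format
 * handler normally registers itself from module init via the
 * register_binfmt()/insert_binfmt() wrappers in <linux/binfmts.h>, which
 * call __register_binfmt() with insert == 0 and insert == 1 respectively.
 * The handler and callback names below are hypothetical:
 *
 *        static struct linux_binfmt example_format = {
 *                .module      = THIS_MODULE,
 *                .load_binary = load_example_binary,
 *        };
 *
 *        static int __init example_init(void)
 *        {
 *                register_binfmt(&example_format);
 *                return 0;
 *        }
 *
 *        static void __exit example_exit(void)
 *        {
 *                unregister_binfmt(&example_format);
 *        }
 */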

static inline void put_binfmt(struct linux_binfmt * fmt)
{
        module_put(fmt->module);
}

#ifdef CONFIG_USELIB
/*
 * Note that a shared library must be both readable and executable for
 * security reasons.
 *
 * Also note that we take the address to load from the file itself.
 */
SYSCALL_DEFINE1(uselib, const char __user *, library)
{
        struct linux_binfmt *fmt;
        struct file *file;
        struct filename *tmp = getname(library);
        int error = PTR_ERR(tmp);
        static const struct open_flags uselib_flags = {
                .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
                .acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN,
                .intent = LOOKUP_OPEN,
                .lookup_flags = LOOKUP_FOLLOW,
        };

        if (IS_ERR(tmp))
                goto out;

        file = do_filp_open(AT_FDCWD, tmp, &uselib_flags);
        putname(tmp);
        error = PTR_ERR(file);
        if (IS_ERR(file))
                goto out;

        error = -EINVAL;
        if (!S_ISREG(file_inode(file)->i_mode))
                goto exit;

        error = -EACCES;
        if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
                goto exit;

        fsnotify_open(file);

        error = -ENOEXEC;

        read_lock(&binfmt_lock);
        list_for_each_entry(fmt, &formats, lh) {
                if (!fmt->load_shlib)
                        continue;
                if (!try_module_get(fmt->module))
                        continue;
                read_unlock(&binfmt_lock);
                error = fmt->load_shlib(file);
                read_lock(&binfmt_lock);
                put_binfmt(fmt);
                if (error != -ENOEXEC)
                        break;
        }
        read_unlock(&binfmt_lock);
exit:
        fput(file);
out:
        return error;
}
#endif /* #ifdef CONFIG_USELIB */

#ifdef CONFIG_MMU
/*
 * The nascent bprm->mm is not visible until exec_mmap(), but it can
 * use a lot of memory, so account these pages in current->mm temporarily
 * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we
 * change the counter back via acct_arg_size(0).
 */
static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
{
        struct mm_struct *mm = current->mm;
        long diff = (long)(pages - bprm->vma_pages);

        if (!mm || !diff)
                return;

        bprm->vma_pages = pages;
        add_mm_counter(mm, MM_ANONPAGES, diff);
}

static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
                int write)
{
        struct page *page;
        int ret;

#ifdef CONFIG_STACK_GROWSUP
        if (write) {
                ret = expand_downwards(bprm->vma, pos);
                if (ret < 0)
                        return NULL;
        }
#endif
        ret = get_user_pages(current, bprm->mm, pos,
                        1, write, 1, &page, NULL);
        if (ret <= 0)
                return NULL;

        if (write) {
                unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
                struct rlimit *rlim;

                acct_arg_size(bprm, size / PAGE_SIZE);

                /*
                 * We've historically supported up to 32 pages (ARG_MAX)
                 * of argument strings even with small stacks
                 */
                if (size <= ARG_MAX)
                        return page;

                /*
                 * Limit to 1/4-th the stack size for the argv+env strings.
                 * This ensures that:
                 *  - the remaining binfmt code will not run out of stack space,
                 *  - the program will have a reasonable amount of stack left
                 *    to work from.
                 */
                rlim = current->signal->rlim;
                if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur) / 4) {
                        put_page(page);
                        return NULL;
                }
        }

        return page;
}
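
/*
 * Worked example of the check above (illustrative numbers): with the
 * common RLIMIT_STACK soft limit of 8 MiB, argv+env strings that have
 * outgrown the historical 32-page ARG_MAX allowance may occupy at most
 * 8 MiB / 4 = 2 MiB; a write that would grow the temporary stack vma
 * past that makes get_arg_page() return NULL, which copy_strings()
 * turns into -E2BIG.
 */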

static void put_arg_page(struct page *page)
{
        put_page(page);
}

static void free_arg_page(struct linux_binprm *bprm, int i)
{
}

static void free_arg_pages(struct linux_binprm *bprm)
{
}

static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
                struct page *page)
{
        flush_cache_page(bprm->vma, pos, page_to_pfn(page));
}

static int __bprm_mm_init(struct linux_binprm *bprm)
{
        int err;
        struct vm_area_struct *vma = NULL;
        struct mm_struct *mm = bprm->mm;

        bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
        if (!vma)
                return -ENOMEM;

        down_write(&mm->mmap_sem);
        vma->vm_mm = mm;

        /*
         * Place the stack at the largest stack address the architecture
         * supports. Later, we'll move this to an appropriate place. We don't
         * use STACK_TOP because that can depend on attributes which aren't
         * configured yet.
         */
        BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
        vma->vm_end = STACK_TOP_MAX;
        vma->vm_start = vma->vm_end - PAGE_SIZE;
        vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
        INIT_LIST_HEAD(&vma->anon_vma_chain);

        err = insert_vm_struct(mm, vma);
        if (err)
                goto err;

        mm->stack_vm = mm->total_vm = 1;
        up_write(&mm->mmap_sem);
        bprm->p = vma->vm_end - sizeof(void *);
        return 0;
err:
        up_write(&mm->mmap_sem);
        bprm->vma = NULL;
        kmem_cache_free(vm_area_cachep, vma);
        return err;
}

static bool valid_arg_len(struct linux_binprm *bprm, long len)
{
        return len <= MAX_ARG_STRLEN;
}

#else

static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
{
}

static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
                int write)
{
        struct page *page;

        page = bprm->page[pos / PAGE_SIZE];
        if (!page && write) {
                page = alloc_page(GFP_HIGHUSER|__GFP_ZERO);
                if (!page)
                        return NULL;
                bprm->page[pos / PAGE_SIZE] = page;
        }

        return page;
}

static void put_arg_page(struct page *page)
{
}

static void free_arg_page(struct linux_binprm *bprm, int i)
{
        if (bprm->page[i]) {
                __free_page(bprm->page[i]);
                bprm->page[i] = NULL;
        }
}

static void free_arg_pages(struct linux_binprm *bprm)
{
        int i;

        for (i = 0; i < MAX_ARG_PAGES; i++)
                free_arg_page(bprm, i);
}

static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
                struct page *page)
{
}

static int __bprm_mm_init(struct linux_binprm *bprm)
{
        bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
        return 0;
}

static bool valid_arg_len(struct linux_binprm *bprm, long len)
{
        return len <= bprm->p;
}

#endif /* CONFIG_MMU */

/*
 * Create a new mm_struct and populate it with a temporary stack
 * vm_area_struct.  We don't have enough context at this point to set the stack
 * flags, permissions, and offset, so we use temporary values.  We'll update
 * them later in setup_arg_pages().
 */
static int bprm_mm_init(struct linux_binprm *bprm)
{
        int err;
        struct mm_struct *mm = NULL;

        bprm->mm = mm = mm_alloc();
        err = -ENOMEM;
        if (!mm)
                goto err;

        err = __bprm_mm_init(bprm);
        if (err)
                goto err;

        return 0;

err:
        if (mm) {
                bprm->mm = NULL;
                mmdrop(mm);
        }

        return err;
}

struct user_arg_ptr {
#ifdef CONFIG_COMPAT
        bool is_compat;
#endif
        union {
                const char __user *const __user *native;
#ifdef CONFIG_COMPAT
                const compat_uptr_t __user *compat;
#endif
        } ptr;
};

static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr)
{
        const char __user *native;

#ifdef CONFIG_COMPAT
        if (unlikely(argv.is_compat)) {
                compat_uptr_t compat;

                if (get_user(compat, argv.ptr.compat + nr))
                        return ERR_PTR(-EFAULT);

                return compat_ptr(compat);
        }
#endif

        if (get_user(native, argv.ptr.native + nr))
                return ERR_PTR(-EFAULT);

        return native;
}

/*
 * count() counts the number of strings in array ARGV.
 */
static int count(struct user_arg_ptr argv, int max)
{
        int i = 0;

        if (argv.ptr.native != NULL) {
                for (;;) {
                        const char __user *p = get_user_arg_ptr(argv, i);

                        if (!p)
                                break;

                        if (IS_ERR(p))
                                return -EFAULT;

                        if (i >= max)
                                return -E2BIG;
                        ++i;

                        if (fatal_signal_pending(current))
                                return -ERESTARTNOHAND;
                        cond_resched();
                }
        }
        return i;
}

/*
 * 'copy_strings()' copies argument/environment strings from the old
 * process's memory to the new process's stack.  The call to get_user_pages()
 * ensures the destination page is created and not swapped out.
 */
static int copy_strings(int argc, struct user_arg_ptr argv,
                        struct linux_binprm *bprm)
{
        struct page *kmapped_page = NULL;
        char *kaddr = NULL;
        unsigned long kpos = 0;
        int ret;

        while (argc-- > 0) {
                const char __user *str;
                int len;
                unsigned long pos;

                ret = -EFAULT;
                str = get_user_arg_ptr(argv, argc);
                if (IS_ERR(str))
                        goto out;

                len = strnlen_user(str, MAX_ARG_STRLEN);
                if (!len)
                        goto out;

                ret = -E2BIG;
                if (!valid_arg_len(bprm, len))
                        goto out;

                /* We're going to work our way backwards. */
                pos = bprm->p;
                str += len;
                bprm->p -= len;

                while (len > 0) {
                        int offset, bytes_to_copy;

                        if (fatal_signal_pending(current)) {
                                ret = -ERESTARTNOHAND;
                                goto out;
                        }
                        cond_resched();

                        offset = pos % PAGE_SIZE;
                        if (offset == 0)
                                offset = PAGE_SIZE;

                        bytes_to_copy = offset;
                        if (bytes_to_copy > len)
                                bytes_to_copy = len;

                        offset -= bytes_to_copy;
                        pos -= bytes_to_copy;
                        str -= bytes_to_copy;
                        len -= bytes_to_copy;

                        if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
                                struct page *page;

                                page = get_arg_page(bprm, pos, 1);
                                if (!page) {
                                        ret = -E2BIG;
                                        goto out;
                                }

                                if (kmapped_page) {
                                        flush_kernel_dcache_page(kmapped_page);
                                        kunmap(kmapped_page);
                                        put_arg_page(kmapped_page);
                                }
                                kmapped_page = page;
                                kaddr = kmap(kmapped_page);
                                kpos = pos & PAGE_MASK;
                                flush_arg_page(bprm, kpos, kmapped_page);
                        }
                        if (copy_from_user(kaddr+offset, str, bytes_to_copy)) {
                                ret = -EFAULT;
                                goto out;
                        }
                }
        }
        ret = 0;
out:
        if (kmapped_page) {
                flush_kernel_dcache_page(kmapped_page);
                kunmap(kmapped_page);
                put_arg_page(kmapped_page);
        }
        return ret;
}

/*
 * Like copy_strings, but get argv and its values from kernel memory.
 */
int copy_strings_kernel(int argc, const char *const *__argv,
                        struct linux_binprm *bprm)
{
        int r;
        mm_segment_t oldfs = get_fs();
        struct user_arg_ptr argv = {
                .ptr.native = (const char __user *const __user *)__argv,
        };

        set_fs(KERNEL_DS);
        r = copy_strings(argc, argv, bprm);
        set_fs(oldfs);

        return r;
}
EXPORT_SYMBOL(copy_strings_kernel);
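
/*
 * Illustrative use (a sketch): this file itself pushes the executable's
 * path into the argument area this way from do_execve_common(), and
 * interpreter handlers push replacement argv[] entries the same way,
 * one kernel string per call:
 *
 *        retval = copy_strings_kernel(1, &bprm->filename, bprm);
 *        if (retval < 0)
 *                goto out;
 */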

#ifdef CONFIG_MMU

/*
 * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX.  Once
 * the binfmt code determines where the new stack should reside, we shift it to
 * its final location.  The process proceeds as follows:
 *
 * 1) Use shift to calculate the new vma endpoints.
 * 2) Extend vma to cover both the old and new ranges.  This ensures the
 *    arguments passed to subsequent functions are consistent.
 * 3) Move vma's page tables to the new range.
 * 4) Free up any cleared pgd range.
 * 5) Shrink the vma to cover only the new range.
 */
static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long old_start = vma->vm_start;
        unsigned long old_end = vma->vm_end;
        unsigned long length = old_end - old_start;
        unsigned long new_start = old_start - shift;
        unsigned long new_end = old_end - shift;
        struct mmu_gather tlb;

        BUG_ON(new_start > new_end);

        /*
         * ensure there are no vmas between where we want to go
         * and where we are
         */
        if (vma != find_vma(mm, new_start))
                return -EFAULT;

        /*
         * cover the whole range: [new_start, old_end)
         */
        if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL))
                return -ENOMEM;

        /*
         * move the page tables downwards, on failure we rely on
         * process cleanup to remove whatever mess we made.
         */
        if (length != move_page_tables(vma, old_start,
                                       vma, new_start, length, false))
                return -ENOMEM;

        lru_add_drain();
        tlb_gather_mmu(&tlb, mm, old_start, old_end);
        if (new_end > old_start) {
                /*
                 * when the old and new regions overlap, clear from new_end.
                 */
                free_pgd_range(&tlb, new_end, old_end, new_end,
                        vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
        } else {
                /*
                 * otherwise, clean from old_start; this is done to avoid
                 * touching the address space in [new_end, old_start), since
                 * some architectures have constraints on va-space that make
                 * this illegal (IA64).  For the others it's just a little
                 * faster.
                 */
                free_pgd_range(&tlb, old_start, old_end, new_end,
                        vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
        }
        tlb_finish_mmu(&tlb, old_start, old_end);

        /*
         * Shrink the vma to just the new range.  Always succeeds.
         */
        vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL);

        return 0;
}
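
/*
 * Worked example (illustrative addresses): for a one-page stack vma
 * [0xb000, 0xc000) and shift = 0x3000, the target range is
 * [0x8000, 0x9000).  The vma is first grown to [0x8000, 0xc000),
 * move_page_tables() relocates the PTEs for the single page down by
 * 0x3000, and since new_end (0x9000) does not reach old_start (0xb000)
 * the non-overlapping branch frees page tables starting at old_start.
 * Finally the vma is shrunk to [0x8000, 0x9000).
 */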

/*
 * Finalizes the stack vm_area_struct. The flags and permissions are updated,
 * the stack is optionally relocated, and some extra space is added.
 */
int setup_arg_pages(struct linux_binprm *bprm,
                    unsigned long stack_top,
                    int executable_stack)
{
        unsigned long ret;
        unsigned long stack_shift;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = bprm->vma;
        struct vm_area_struct *prev = NULL;
        unsigned long vm_flags;
        unsigned long stack_base;
        unsigned long stack_size;
        unsigned long stack_expand;
        unsigned long rlim_stack;

#ifdef CONFIG_STACK_GROWSUP
        /* Limit stack size */
        stack_base = rlimit_max(RLIMIT_STACK);
        if (stack_base > STACK_SIZE_MAX)
                stack_base = STACK_SIZE_MAX;

        /* Make sure we didn't let the argument array grow too large. */
        if (vma->vm_end - vma->vm_start > stack_base)
                return -ENOMEM;

        stack_base = PAGE_ALIGN(stack_top - stack_base);

        stack_shift = vma->vm_start - stack_base;
        mm->arg_start = bprm->p - stack_shift;
        bprm->p = vma->vm_end - stack_shift;
#else
        stack_top = arch_align_stack(stack_top);
        stack_top = PAGE_ALIGN(stack_top);

        if (unlikely(stack_top < mmap_min_addr) ||
            unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr))
                return -ENOMEM;

        stack_shift = vma->vm_end - stack_top;

        bprm->p -= stack_shift;
        mm->arg_start = bprm->p;
#endif

        if (bprm->loader)
                bprm->loader -= stack_shift;
        bprm->exec -= stack_shift;

        down_write(&mm->mmap_sem);
        vm_flags = VM_STACK_FLAGS;

        /*
         * Adjust stack execute permissions; explicitly enable for
         * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone
         * (arch default) otherwise.
         */
        if (unlikely(executable_stack == EXSTACK_ENABLE_X))
                vm_flags |= VM_EXEC;
        else if (executable_stack == EXSTACK_DISABLE_X)
                vm_flags &= ~VM_EXEC;
        vm_flags |= mm->def_flags;
        vm_flags |= VM_STACK_INCOMPLETE_SETUP;

        ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
                        vm_flags);
        if (ret)
                goto out_unlock;
        BUG_ON(prev != vma);

        /* Move stack pages down in memory. */
        if (stack_shift) {
                ret = shift_arg_pages(vma, stack_shift);
                if (ret)
                        goto out_unlock;
        }

        /* mprotect_fixup is overkill to remove the temporary stack flags */
        vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP;

        stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
        stack_size = vma->vm_end - vma->vm_start;
        /*
         * Align this down to a page boundary as expand_stack
         * will align it up.
         */
        rlim_stack = rlimit(RLIMIT_STACK) & PAGE_MASK;
#ifdef CONFIG_STACK_GROWSUP
        if (stack_size + stack_expand > rlim_stack)
                stack_base = vma->vm_start + rlim_stack;
        else
                stack_base = vma->vm_end + stack_expand;
#else
        if (stack_size + stack_expand > rlim_stack)
                stack_base = vma->vm_end - rlim_stack;
        else
                stack_base = vma->vm_start - stack_expand;
#endif
        current->mm->start_stack = bprm->p;
        ret = expand_stack(vma, stack_base);
        if (ret)
                ret = -EFAULT;

out_unlock:
        up_write(&mm->mmap_sem);
        return ret;
}
EXPORT_SYMBOL(setup_arg_pages);
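
/*
 * Illustrative call site (a sketch of what an ELF-style load_binary
 * implementation does; the randomize_stack_top() helper here is an
 * assumption borrowed from fs/binfmt_elf.c, not part of this file):
 *
 *        retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
 *                                 executable_stack);
 *        if (retval < 0)
 *                goto out_free;
 *
 * where executable_stack is EXSTACK_ENABLE_X, EXSTACK_DISABLE_X or
 * EXSTACK_DEFAULT, typically taken from the binary's PT_GNU_STACK
 * annotation.
 */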

#endif /* CONFIG_MMU */

static struct file *do_open_exec(struct filename *name)
{
        struct file *file;
        int err;
        static const struct open_flags open_exec_flags = {
                .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
                .acc_mode = MAY_EXEC | MAY_OPEN,
                .intent = LOOKUP_OPEN,
                .lookup_flags = LOOKUP_FOLLOW,
        };

        file = do_filp_open(AT_FDCWD, name, &open_exec_flags);
        if (IS_ERR(file))
                goto out;

        err = -EACCES;
        if (!S_ISREG(file_inode(file)->i_mode))
                goto exit;

        if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
                goto exit;

        fsnotify_open(file);

        err = deny_write_access(file);
        if (err)
                goto exit;

out:
        return file;

exit:
        fput(file);
        return ERR_PTR(err);
}

struct file *open_exec(const char *name)
{
        struct filename tmp = { .name = name };
        return do_open_exec(&tmp);
}
EXPORT_SYMBOL(open_exec);

int kernel_read(struct file *file, loff_t offset,
                char *addr, unsigned long count)
{
        mm_segment_t old_fs;
        loff_t pos = offset;
        int result;

        old_fs = get_fs();
        set_fs(get_ds());
        /* The cast to a user pointer is valid due to the set_fs() */
        result = vfs_read(file, (void __user *)addr, count, &pos);
        set_fs(old_fs);
        return result;
}

EXPORT_SYMBOL(kernel_read);

ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
{
        ssize_t res = vfs_read(file, (void __user *)addr, len, &pos);
        if (res > 0)
                flush_icache_range(addr, addr + len);
        return res;
}
EXPORT_SYMBOL(read_code);

static int exec_mmap(struct mm_struct *mm)
{
        struct task_struct *tsk;
        struct mm_struct *old_mm, *active_mm;

        /* Notify parent that we're no longer interested in the old VM */
        tsk = current;
        old_mm = current->mm;
        mm_release(tsk, old_mm);

        if (old_mm) {
                sync_mm_rss(old_mm);
                /*
                 * Make sure that if there is a core dump in progress
                 * for the old mm, we get out and die instead of going
                 * through with the exec.  We must hold mmap_sem around
                 * checking core_state and changing tsk->mm.
                 */
                down_read(&old_mm->mmap_sem);
                if (unlikely(old_mm->core_state)) {
                        up_read(&old_mm->mmap_sem);
                        return -EINTR;
                }
        }
        task_lock(tsk);
        active_mm = tsk->active_mm;
        tsk->mm = mm;
        tsk->active_mm = mm;
        activate_mm(active_mm, mm);
        tsk->mm->vmacache_seqnum = 0;
        vmacache_flush(tsk);
        task_unlock(tsk);
        if (old_mm) {
                up_read(&old_mm->mmap_sem);
                BUG_ON(active_mm != old_mm);
                setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
                mm_update_next_owner(old_mm);
                mmput(old_mm);
                return 0;
        }
        mmdrop(active_mm);
        return 0;
}

/*
 * This function makes sure the current process has its own signal table,
 * so that flush_signal_handlers can later reset the handlers without
 * disturbing other processes.  (Other processes might share the signal
 * table via the CLONE_SIGHAND option to clone().)
 */
static int de_thread(struct task_struct *tsk)
{
        struct signal_struct *sig = tsk->signal;
        struct sighand_struct *oldsighand = tsk->sighand;
        spinlock_t *lock = &oldsighand->siglock;

        if (thread_group_empty(tsk))
                goto no_thread_group;

        /*
         * Kill all other threads in the thread group.
         */
        spin_lock_irq(lock);
        if (signal_group_exit(sig)) {
                /*
                 * Another group action in progress, just
                 * return so that the signal is processed.
                 */
                spin_unlock_irq(lock);
                return -EAGAIN;
        }

        sig->group_exit_task = tsk;
        sig->notify_count = zap_other_threads(tsk);
        if (!thread_group_leader(tsk))
                sig->notify_count--;

        while (sig->notify_count) {
                __set_current_state(TASK_KILLABLE);
                spin_unlock_irq(lock);
                schedule();
                if (unlikely(__fatal_signal_pending(tsk)))
                        goto killed;
                spin_lock_irq(lock);
        }
        spin_unlock_irq(lock);

        /*
         * At this point all other threads have exited, all we have to
         * do is to wait for the thread group leader to become inactive,
         * and to assume its PID:
         */
        if (!thread_group_leader(tsk)) {
                struct task_struct *leader = tsk->group_leader;

                sig->notify_count = -1; /* for exit_notify() */
                for (;;) {
                        threadgroup_change_begin(tsk);
                        write_lock_irq(&tasklist_lock);
                        if (likely(leader->exit_state))
                                break;
                        __set_current_state(TASK_KILLABLE);
                        write_unlock_irq(&tasklist_lock);
                        threadgroup_change_end(tsk);
                        schedule();
                        if (unlikely(__fatal_signal_pending(tsk)))
                                goto killed;
                }

                /*
                 * The only record we have of the real-time age of a
                 * process, regardless of execs it's done, is start_time.
                 * All the past CPU time is accumulated in signal_struct
                 * from sister threads now dead.  But in this non-leader
                 * exec, nothing survives from the original leader thread,
                 * whose birth marks the true age of this process now.
                 * When we take on its identity by switching to its PID, we
                 * also take its birthdate (always earlier than our own).
                 */
                tsk->start_time = leader->start_time;
                tsk->real_start_time = leader->real_start_time;

                BUG_ON(!same_thread_group(leader, tsk));
                BUG_ON(has_group_leader_pid(tsk));
                /*
                 * An exec() starts a new thread group with the
                 * TGID of the previous thread group. Rehash the
                 * two threads with a switched PID, and release
                 * the former thread group leader:
                 */

                /* Become a process group leader with the old leader's pid.
                 * The old leader becomes a thread of this thread group.
                 * Note: The old leader also uses this pid until release_task
                 *       is called.  Odd but simple and correct.
                 */
                tsk->pid = leader->pid;
                change_pid(tsk, PIDTYPE_PID, task_pid(leader));
                transfer_pid(leader, tsk, PIDTYPE_PGID);
                transfer_pid(leader, tsk, PIDTYPE_SID);

                list_replace_rcu(&leader->tasks, &tsk->tasks);
                list_replace_init(&leader->sibling, &tsk->sibling);

                tsk->group_leader = tsk;
                leader->group_leader = tsk;

                tsk->exit_signal = SIGCHLD;
                leader->exit_signal = -1;

                BUG_ON(leader->exit_state != EXIT_ZOMBIE);
                leader->exit_state = EXIT_DEAD;

                /*
                 * We are going to release_task()->ptrace_unlink() silently,
                 * the tracer can sleep in do_wait(). EXIT_DEAD guarantees
                 * the tracer won't block again waiting for this thread.
                 */
                if (unlikely(leader->ptrace))
                        __wake_up_parent(leader, leader->parent);
                write_unlock_irq(&tasklist_lock);
                threadgroup_change_end(tsk);

                release_task(leader);
        }

        sig->group_exit_task = NULL;
        sig->notify_count = 0;

no_thread_group:
        /* we have changed execution domain */
        tsk->exit_signal = SIGCHLD;

        exit_itimers(sig);
        flush_itimer_signals();

        if (atomic_read(&oldsighand->count) != 1) {
                struct sighand_struct *newsighand;
                /*
                 * This ->sighand is shared with the CLONE_SIGHAND
                 * but not CLONE_THREAD task, switch to the new one.
                 */
                newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
                if (!newsighand)
                        return -ENOMEM;

                atomic_set(&newsighand->count, 1);
                memcpy(newsighand->action, oldsighand->action,
                       sizeof(newsighand->action));

                write_lock_irq(&tasklist_lock);
                spin_lock(&oldsighand->siglock);
                rcu_assign_pointer(tsk->sighand, newsighand);
                spin_unlock(&oldsighand->siglock);
                write_unlock_irq(&tasklist_lock);

                __cleanup_sighand(oldsighand);
        }

        BUG_ON(!thread_group_leader(tsk));
        return 0;

killed:
        /* protects against exit_notify() and __exit_signal() */
        read_lock(&tasklist_lock);
        sig->group_exit_task = NULL;
        sig->notify_count = 0;
        read_unlock(&tasklist_lock);
        return -EAGAIN;
}

char *get_task_comm(char *buf, struct task_struct *tsk)
{
        /* buf must be at least sizeof(tsk->comm) in size */
        task_lock(tsk);
        strncpy(buf, tsk->comm, sizeof(tsk->comm));
        task_unlock(tsk);
        return buf;
}
EXPORT_SYMBOL_GPL(get_task_comm);
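
/*
 * Illustrative use (a sketch): callers must pass a buffer of at least
 * TASK_COMM_LEN bytes, e.g.:
 *
 *        char comm[TASK_COMM_LEN];
 *
 *        get_task_comm(comm, task);
 *        pr_debug("running as %s\n", comm);
 */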

/*
 * These functions flush out all traces of the currently running executable
 * so that a new one can be started.
 */

void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec)
{
        task_lock(tsk);
        trace_task_rename(tsk, buf);
        strlcpy(tsk->comm, buf, sizeof(tsk->comm));
        task_unlock(tsk);
        perf_event_comm(tsk, exec);
}

int flush_old_exec(struct linux_binprm * bprm)
{
        int retval;

        /*
         * Make sure we have a private signal table and that
         * we are unassociated from the previous thread group.
         */
        retval = de_thread(current);
        if (retval)
                goto out;

        set_mm_exe_file(bprm->mm, bprm->file);
        /*
         * Release all of the old mmap stuff
         */
        acct_arg_size(bprm, 0);
        retval = exec_mmap(bprm->mm);
        if (retval)
                goto out;

        bprm->mm = NULL;                /* We're using it now */

        set_fs(USER_DS);
        current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
                                        PF_NOFREEZE | PF_NO_SETAFFINITY);
        flush_thread();
        current->personality &= ~bprm->per_clear;

        return 0;

out:
        return retval;
}
EXPORT_SYMBOL(flush_old_exec);

void would_dump(struct linux_binprm *bprm, struct file *file)
{
        if (inode_permission(file_inode(file), MAY_READ) < 0)
                bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
}
EXPORT_SYMBOL(would_dump);

void setup_new_exec(struct linux_binprm * bprm)
{
        arch_pick_mmap_layout(current->mm);

        /* This is the point of no return */
        current->sas_ss_sp = current->sas_ss_size = 0;

        if (uid_eq(current_euid(), current_uid()) && gid_eq(current_egid(), current_gid()))
                set_dumpable(current->mm, SUID_DUMP_USER);
        else
                set_dumpable(current->mm, suid_dumpable);

        perf_event_exec();
        __set_task_comm(current, kbasename(bprm->filename), true);

        /* Set the new mm task size. We have to do that late because it may
         * depend on TIF_32BIT which is only updated in flush_thread() on
         * some architectures like powerpc
         */
        current->mm->task_size = TASK_SIZE;

        /* install the new credentials */
        if (!uid_eq(bprm->cred->uid, current_euid()) ||
            !gid_eq(bprm->cred->gid, current_egid())) {
                current->pdeath_signal = 0;
        } else {
                would_dump(bprm, bprm->file);
                if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)
                        set_dumpable(current->mm, suid_dumpable);
        }

        /* An exec changes our domain. We are no longer part of the thread
           group */
        current->self_exec_id++;
        flush_signal_handlers(current, 0);
        do_close_on_exec(current->files);
}
EXPORT_SYMBOL(setup_new_exec);
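
/*
 * Illustrative ordering (a sketch of the contract a load_binary
 * implementation follows, error handling abbreviated): flush_old_exec()
 * is where teardown of the old image begins, setup_new_exec() finishes
 * per-task state for the new image, and install_exec_creds() commits
 * the prepared credentials once failure can no longer leak them:
 *
 *        retval = flush_old_exec(bprm);
 *        if (retval)
 *                return retval;
 *        setup_new_exec(bprm);
 *        ...map the new image and set up its stack...
 *        install_exec_creds(bprm);
 */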

/*
 * Prepare credentials and lock ->cred_guard_mutex.
 * install_exec_creds() commits the new creds and drops the lock.
 * Or, if exec fails before, free_bprm() should release ->cred
 * and unlock.
 */
int prepare_bprm_creds(struct linux_binprm *bprm)
{
        if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
                return -ERESTARTNOINTR;

        bprm->cred = prepare_exec_creds();
        if (likely(bprm->cred))
                return 0;

        mutex_unlock(&current->signal->cred_guard_mutex);
        return -ENOMEM;
}

static void free_bprm(struct linux_binprm *bprm)
{
        free_arg_pages(bprm);
        if (bprm->cred) {
                mutex_unlock(&current->signal->cred_guard_mutex);
                abort_creds(bprm->cred);
        }
        if (bprm->file) {
                allow_write_access(bprm->file);
                fput(bprm->file);
        }
        /* If a binfmt changed the interp, free it. */
        if (bprm->interp != bprm->filename)
                kfree(bprm->interp);
        kfree(bprm);
}

int bprm_change_interp(char *interp, struct linux_binprm *bprm)
{
        /* If a binfmt changed the interp, free it first. */
        if (bprm->interp != bprm->filename)
                kfree(bprm->interp);
        bprm->interp = kstrdup(interp, GFP_KERNEL);
        if (!bprm->interp)
                return -ENOMEM;
        return 0;
}
EXPORT_SYMBOL(bprm_change_interp);

/*
 * install the new credentials for this executable
 */
void install_exec_creds(struct linux_binprm *bprm)
{
        security_bprm_committing_creds(bprm);

        commit_creds(bprm->cred);
        bprm->cred = NULL;

        /*
         * Disable monitoring for regular users
         * when executing setuid binaries. Must
         * wait until new credentials are committed
         * by commit_creds() above
         */
        if (get_dumpable(current->mm) != SUID_DUMP_USER)
                perf_event_exit_task(current);
        /*
         * cred_guard_mutex must be held at least to this point to prevent
         * ptrace_attach() from altering our determination of the task's
         * credentials; any time after this it may be unlocked.
         */
        security_bprm_committed_creds(bprm);
        mutex_unlock(&current->signal->cred_guard_mutex);
}
EXPORT_SYMBOL(install_exec_creds);

/*
 * determine how safe it is to execute the proposed program
 * - the caller must hold ->cred_guard_mutex to protect against
 *   PTRACE_ATTACH or seccomp thread-sync
 */
static void check_unsafe_exec(struct linux_binprm *bprm)
{
        struct task_struct *p = current, *t;
        unsigned n_fs;

        if (p->ptrace) {
                if (p->ptrace & PT_PTRACE_CAP)
                        bprm->unsafe |= LSM_UNSAFE_PTRACE_CAP;
                else
                        bprm->unsafe |= LSM_UNSAFE_PTRACE;
        }

        /*
         * This isn't strictly necessary, but it makes it harder for LSMs to
         * mess up.
         */
        if (task_no_new_privs(current))
                bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;

        t = p;
        n_fs = 1;
        spin_lock(&p->fs->lock);
        rcu_read_lock();
        while_each_thread(p, t) {
                if (t->fs == p->fs)
                        n_fs++;
        }
        rcu_read_unlock();

        if (p->fs->users > n_fs)
                bprm->unsafe |= LSM_UNSAFE_SHARE;
        else
                p->fs->in_exec = 1;
        spin_unlock(&p->fs->lock);
}

/*
 * Fill the binprm structure from the inode.
 * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes.
 *
 * This may be called multiple times for binary chains (scripts for example).
 */
int prepare_binprm(struct linux_binprm *bprm)
{
        struct inode *inode = file_inode(bprm->file);
        umode_t mode = inode->i_mode;
        int retval;

        /* clear any previous set[ug]id data from a previous binary */
        bprm->cred->euid = current_euid();
        bprm->cred->egid = current_egid();

        if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) &&
            !task_no_new_privs(current) &&
            kuid_has_mapping(bprm->cred->user_ns, inode->i_uid) &&
            kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) {
                /* Set-uid? */
                if (mode & S_ISUID) {
                        bprm->per_clear |= PER_CLEAR_ON_SETID;
                        bprm->cred->euid = inode->i_uid;
                }

                /* Set-gid? */
                /*
                 * If setgid is set but no group execute bit then this
                 * is a candidate for mandatory locking, not a setgid
                 * executable.
                 */
                if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
                        bprm->per_clear |= PER_CLEAR_ON_SETID;
                        bprm->cred->egid = inode->i_gid;
                }
        }

        /* fill in binprm security blob */
        retval = security_bprm_set_creds(bprm);
        if (retval)
                return retval;
        bprm->cred_prepared = 1;

        memset(bprm->buf, 0, BINPRM_BUF_SIZE);
        return kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE);
}

EXPORT_SYMBOL(prepare_binprm);
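
/*
 * Illustrative consumer (a sketch): handlers inspect the BINPRM_BUF_SIZE
 * header bytes read here.  ELF loading, for instance, begins with a
 * magic-number check along these lines:
 *
 *        struct elfhdr *elf_ex = (struct elfhdr *)bprm->buf;
 *
 *        if (memcmp(elf_ex->e_ident, ELFMAG, SELFMAG) != 0)
 *                return -ENOEXEC;
 */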

/*
 * Arguments are '\0' separated strings found at the location bprm->p
 * points to; chop off the first by relocating bprm->p to right after
 * the first '\0' encountered.
 */
int remove_arg_zero(struct linux_binprm *bprm)
{
        int ret = 0;
        unsigned long offset;
        char *kaddr;
        struct page *page;

        if (!bprm->argc)
                return 0;

        do {
                offset = bprm->p & ~PAGE_MASK;
                page = get_arg_page(bprm, bprm->p, 0);
                if (!page) {
                        ret = -EFAULT;
                        goto out;
                }
                kaddr = kmap_atomic(page);

                for (; offset < PAGE_SIZE && kaddr[offset];
                                offset++, bprm->p++)
                        ;

                kunmap_atomic(kaddr);
                put_arg_page(page);

                if (offset == PAGE_SIZE)
                        free_arg_page(bprm, (bprm->p >> PAGE_SHIFT) - 1);
        } while (offset == PAGE_SIZE);

        bprm->p++;
        bprm->argc--;
        ret = 0;

out:
        return ret;
}
EXPORT_SYMBOL(remove_arg_zero);
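
/*
 * Illustrative pipeline (a sketch of how an interpreter handler such as
 * binfmt_script consumes this): drop the old argv[0], push replacement
 * strings with copy_strings_kernel(), then record the interpreter so the
 * binfmt search can restart on it.  i_name stands in for the handler's
 * parsed #! interpreter path:
 *
 *        retval = remove_arg_zero(bprm);
 *        if (retval)
 *                return retval;
 *        retval = copy_strings_kernel(1, &bprm->interp, bprm);
 *        if (retval < 0)
 *                return retval;
 *        bprm->argc++;
 *        retval = bprm_change_interp(i_name, bprm);
 */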

#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
/*
 * cycle through the list of binary format handlers, until one recognizes
 * the image
 */
int search_binary_handler(struct linux_binprm *bprm)
{
        bool need_retry = IS_ENABLED(CONFIG_MODULES);
        struct linux_binfmt *fmt;
        int retval;

        /* This allows up to 5 levels of binfmt rewrites before failing hard. */
        if (bprm->recursion_depth > 5)
                return -ELOOP;

        retval = security_bprm_check(bprm);
        if (retval)
                return retval;

        retval = -ENOENT;
 retry:
        read_lock(&binfmt_lock);
        list_for_each_entry(fmt, &formats, lh) {
                if (!try_module_get(fmt->module))
                        continue;
                read_unlock(&binfmt_lock);
                bprm->recursion_depth++;
                retval = fmt->load_binary(bprm);
                read_lock(&binfmt_lock);
                put_binfmt(fmt);
                bprm->recursion_depth--;
                if (retval < 0 && !bprm->mm) {
                        /* we got to flush_old_exec() and failed after it */
                        read_unlock(&binfmt_lock);
                        force_sigsegv(SIGSEGV, current);
                        return retval;
                }
                if (retval != -ENOEXEC || !bprm->file) {
                        read_unlock(&binfmt_lock);
                        return retval;
                }
        }
        read_unlock(&binfmt_lock);

        if (need_retry) {
                if (printable(bprm->buf[0]) && printable(bprm->buf[1]) &&
                    printable(bprm->buf[2]) && printable(bprm->buf[3]))
                        return retval;
                if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0)
                        return retval;
                need_retry = false;
                goto retry;
        }

        return retval;
}
EXPORT_SYMBOL(search_binary_handler);

static int exec_binprm(struct linux_binprm *bprm)
{
        pid_t old_pid, old_vpid;
        int ret;

        /* Need to fetch pid before load_binary changes it */
        old_pid = current->pid;
        rcu_read_lock();
        old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
        rcu_read_unlock();

        ret = search_binary_handler(bprm);
        if (ret >= 0) {
                audit_bprm(bprm);
                trace_sched_process_exec(current, old_pid, bprm);
                ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
                proc_exec_connector(current);
        }

        return ret;
}

/*
 * sys_execve() executes a new program.
 */
static int do_execve_common(struct filename *filename,
                                struct user_arg_ptr argv,
                                struct user_arg_ptr envp)
{
        struct linux_binprm *bprm;
        struct file *file;
        struct files_struct *displaced;
        int retval;

        if (IS_ERR(filename))
                return PTR_ERR(filename);

        /*
         * We move the actual failure in case of RLIMIT_NPROC excess from
         * set*uid() to execve() because too many poorly written programs
         * don't check setuid() return code.  Here we additionally recheck
         * whether NPROC limit is still exceeded.
         */
        if ((current->flags & PF_NPROC_EXCEEDED) &&
            atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
                retval = -EAGAIN;
                goto out_ret;
        }

        /* We're below the limit (still or again), so we don't want to make
         * further execve() calls fail. */
        current->flags &= ~PF_NPROC_EXCEEDED;

        retval = unshare_files(&displaced);
        if (retval)
                goto out_ret;

        retval = -ENOMEM;
        bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
        if (!bprm)
                goto out_files;

        retval = prepare_bprm_creds(bprm);
        if (retval)
                goto out_free;

        check_unsafe_exec(bprm);
        current->in_execve = 1;

        file = do_open_exec(filename);
        retval = PTR_ERR(file);
        if (IS_ERR(file))
                goto out_unmark;

        sched_exec();

        bprm->file = file;
        bprm->filename = bprm->interp = filename->name;

        retval = bprm_mm_init(bprm);
        if (retval)
                goto out_unmark;

        bprm->argc = count(argv, MAX_ARG_STRINGS);
        if ((retval = bprm->argc) < 0)
                goto out;

        bprm->envc = count(envp, MAX_ARG_STRINGS);
        if ((retval = bprm->envc) < 0)
                goto out;

        retval = prepare_binprm(bprm);
        if (retval < 0)
                goto out;

        retval = copy_strings_kernel(1, &bprm->filename, bprm);
        if (retval < 0)
                goto out;

        bprm->exec = bprm->p;
        retval = copy_strings(bprm->envc, envp, bprm);
        if (retval < 0)
                goto out;

        retval = copy_strings(bprm->argc, argv, bprm);
        if (retval < 0)
                goto out;

        retval = exec_binprm(bprm);
        if (retval < 0)
                goto out;

        /* execve succeeded */
        current->fs->in_exec = 0;
        current->in_execve = 0;
        acct_update_integrals(current);
        task_numa_free(current);
        free_bprm(bprm);
        putname(filename);
        if (displaced)
                put_files_struct(displaced);
        return retval;

out:
        if (bprm->mm) {
                acct_arg_size(bprm, 0);
                mmput(bprm->mm);
        }

out_unmark:
        current->fs->in_exec = 0;
        current->in_execve = 0;

out_free:
        free_bprm(bprm);

out_files:
        if (displaced)
                reset_files_struct(displaced);
out_ret:
        putname(filename);
        return retval;
}

int do_execve(struct filename *filename,
        const char __user *const __user *__argv,
        const char __user *const __user *__envp)
{
        struct user_arg_ptr argv = { .ptr.native = __argv };
        struct user_arg_ptr envp = { .ptr.native = __envp };
        return do_execve_common(filename, argv, envp);
}

#ifdef CONFIG_COMPAT
static int compat_do_execve(struct filename *filename,
        const compat_uptr_t __user *__argv,
        const compat_uptr_t __user *__envp)
{
        struct user_arg_ptr argv = {
                .is_compat = true,
                .ptr.compat = __argv,
        };
        struct user_arg_ptr envp = {
                .is_compat = true,
                .ptr.compat = __envp,
        };
        return do_execve_common(filename, argv, envp);
}
#endif

void set_binfmt(struct linux_binfmt *new)
{
        struct mm_struct *mm = current->mm;

        if (mm->binfmt)
                module_put(mm->binfmt->module);

        mm->binfmt = new;
        if (new)
                __module_get(new->module);
}
EXPORT_SYMBOL(set_binfmt);

/*
 * set_dumpable() stores the three-value SUID_DUMP_* setting into mm->flags.
 */
void set_dumpable(struct mm_struct *mm, int value)
{
        unsigned long old, new;

        if (WARN_ON((unsigned)value > SUID_DUMP_ROOT))
                return;

        do {
                old = ACCESS_ONCE(mm->flags);
                new = (old & ~MMF_DUMPABLE_MASK) | value;
        } while (cmpxchg(&mm->flags, old, new) != old);
}

SYSCALL_DEFINE3(execve,
                const char __user *, filename,
                const char __user *const __user *, argv,
                const char __user *const __user *, envp)
{
        return do_execve(getname(filename), argv, envp);
}
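
/*
 * Illustrative userspace view (a sketch): argv and envp are
 * NULL-terminated arrays of NUL-terminated strings, exactly what
 * count() and copy_strings() above consume:
 *
 *        char *argv[] = { "/bin/ls", "-l", NULL };
 *        char *envp[] = { "PATH=/bin:/usr/bin", NULL };
 *
 *        execve("/bin/ls", argv, envp);
 */
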
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
        const compat_uptr_t __user *, argv,
        const compat_uptr_t __user *, envp)
{
        return compat_do_execve(getname(filename), argv, envp);
}
#endif
