linux/fs/binfmt_misc.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * binfmt_misc.c
   4 *
 * Copyright (C) 1997 Richard Günther
   6 *
   7 * binfmt_misc detects binaries via a magic or filename extension and invokes
   8 * a specified wrapper. See Documentation/admin-guide/binfmt-misc.rst for more details.
   9 */
  10
  11#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  12
  13#include <linux/kernel.h>
  14#include <linux/module.h>
  15#include <linux/init.h>
  16#include <linux/sched/mm.h>
  17#include <linux/magic.h>
  18#include <linux/binfmts.h>
  19#include <linux/slab.h>
  20#include <linux/ctype.h>
  21#include <linux/string_helpers.h>
  22#include <linux/file.h>
  23#include <linux/pagemap.h>
  24#include <linux/namei.h>
  25#include <linux/mount.h>
  26#include <linux/fs_context.h>
  27#include <linux/syscalls.h>
  28#include <linux/fs.h>
  29#include <linux/uaccess.h>
  30
  31#include "internal.h"
  32
/*
 * USE_DEBUG mirrors whether DEBUG is defined as a 0/1 constant, so that
 * create_entry() can guard its hex dumps with a plain if () instead of
 * an #ifdef.
 */
#ifdef DEBUG
# define USE_DEBUG 1
#else
# define USE_DEBUG 0
#endif
  38
/*
 * When non-zero, entry_status() prints the interpreter, the special flags
 * and the extension or offset/magic/mask of an entry in addition to its
 * enabled/disabled state.
 */
enum {
        VERBOSE_STATUS = 1 /* make it zero to save 400 bytes kernel memory */
};
  42
/*
 * Bit numbers in Node.flags, used with test_bit()/set_bit()/clear_bit():
 *   Enabled - the handler takes part in matching
 *   Magic   - the handler matches on magic bytes rather than on the extension
 */
enum {Enabled, Magic};
/*
 * Bit masks in Node.flags for the special registration flags; they are set by
 * check_special_flags() from the 'P', 'O', 'C' and 'F' flag characters.
 */
#define MISC_FMT_PRESERVE_ARGV0 (1UL << 31)
#define MISC_FMT_OPEN_BINARY (1UL << 30)
#define MISC_FMT_CREDENTIALS (1UL << 29)
#define MISC_FMT_OPEN_FILE (1UL << 28)
  48
/*
 * One registered binary type handler.
 *
 * create_entry() allocates the Node together with a copy of the registration
 * string placed directly behind it; name, magic, mask and interpreter point
 * into that trailing copy, so a single kfree() of the Node releases them all.
 */
typedef struct {
        struct list_head list;          /* link in binfmt_misc.entries */
        unsigned long flags;            /* type, status, etc. */
        int offset;                     /* offset of magic */
        int size;                       /* size of magic/mask */
        char *magic;                    /* magic or filename extension */
        char *mask;                     /* mask, NULL for exact match */
        const char *interpreter;        /* filename of interpreter */
        char *name;                     /* entry name, also its file name */
        struct dentry *dentry;          /* dentry of the entry's file */
        struct file *interp_file;       /* pre-opened interpreter (MISC_FMT_OPEN_FILE) */
        refcount_t users;               /* sync removal with load_misc_binary() */
} Node;
  62
/* Declared ahead of use; presumably defined further down the file - TODO confirm. */
static struct file_system_type bm_fs_type;
  64
/*
 * Max length of the register string.  Determined by:
 *  - 7 delimiters
 *  - name:   ~50 bytes
 *  - type:   1 byte
 *  - offset: 3 bytes (has to be smaller than BINPRM_BUF_SIZE)
 *  - magic:  128 bytes (512 in escaped form)
 *  - mask:   128 bytes (512 in escaped form)
 *  - interp: ~50 bytes
 *  - flags:  5 bytes
 * Round that up a bit, and then back off to hold the internal data
 * (like struct Node).
 *
 * create_entry() rejects longer register strings with -EINVAL.
 */
#define MAX_REGISTER_LENGTH 1920
  79
  80/**
  81 * search_binfmt_handler - search for a binary handler for @bprm
  82 * @misc: handle to binfmt_misc instance
  83 * @bprm: binary for which we are looking for a handler
  84 *
  85 * Search for a binary type handler for @bprm in the list of registered binary
  86 * type handlers.
  87 *
  88 * Return: binary type list entry on success, NULL on failure
  89 */
  90static Node *search_binfmt_handler(struct binfmt_misc *misc,
  91                                   struct linux_binprm *bprm)
  92{
  93        char *p = strrchr(bprm->interp, '.');
  94        Node *e;
  95
  96        /* Walk all the registered handlers. */
  97        list_for_each_entry(e, &misc->entries, list) {
  98                char *s;
  99                int j;
 100
 101                /* Make sure this one is currently enabled. */
 102                if (!test_bit(Enabled, &e->flags))
 103                        continue;
 104
 105                /* Do matching based on extension if applicable. */
 106                if (!test_bit(Magic, &e->flags)) {
 107                        if (p && !strcmp(e->magic, p + 1))
 108                                return e;
 109                        continue;
 110                }
 111
 112                /* Do matching based on magic & mask. */
 113                s = bprm->buf + e->offset;
 114                if (e->mask) {
 115                        for (j = 0; j < e->size; j++)
 116                                if ((*s++ ^ e->magic[j]) & e->mask[j])
 117                                        break;
 118                } else {
 119                        for (j = 0; j < e->size; j++)
 120                                if ((*s++ ^ e->magic[j]))
 121                                        break;
 122                }
 123                if (j == e->size)
 124                        return e;
 125        }
 126
 127        return NULL;
 128}
 129
 130/**
 131 * get_binfmt_handler - try to find a binary type handler
 132 * @misc: handle to binfmt_misc instance
 133 * @bprm: binary for which we are looking for a handler
 134 *
 135 * Try to find a binfmt handler for the binary type. If one is found take a
 136 * reference to protect against removal via bm_{entry,status}_write().
 137 *
 138 * Return: binary type list entry on success, NULL on failure
 139 */
 140static Node *get_binfmt_handler(struct binfmt_misc *misc,
 141                                struct linux_binprm *bprm)
 142{
 143        Node *e;
 144
 145        read_lock(&misc->entries_lock);
 146        e = search_binfmt_handler(misc, bprm);
 147        if (e)
 148                refcount_inc(&e->users);
 149        read_unlock(&misc->entries_lock);
 150        return e;
 151}
 152
 153/**
 154 * put_binfmt_handler - put binary handler node
 155 * @e: node to put
 156 *
 157 * Free node syncing with load_misc_binary() and defer final free to
 158 * load_misc_binary() in case it is using the binary type handler we were
 159 * requested to remove.
 160 */
 161static void put_binfmt_handler(Node *e)
 162{
 163        if (refcount_dec_and_test(&e->users)) {
 164                if (e->flags & MISC_FMT_OPEN_FILE)
 165                        filp_close(e->interp_file, NULL);
 166                kfree(e);
 167        }
 168}
 169
 170/**
 171 * load_binfmt_misc - load the binfmt_misc of the caller's user namespace
 172 *
 173 * To be called in load_misc_binary() to load the relevant struct binfmt_misc.
 174 * If a user namespace doesn't have its own binfmt_misc mount it can make use
 175 * of its ancestor's binfmt_misc handlers. This mimicks the behavior of
 176 * pre-namespaced binfmt_misc where all registered binfmt_misc handlers where
 177 * available to all user and user namespaces on the system.
 178 *
 179 * Return: the binfmt_misc instance of the caller's user namespace
 180 */
 181static struct binfmt_misc *load_binfmt_misc(void)
 182{
 183        const struct user_namespace *user_ns;
 184        struct binfmt_misc *misc;
 185
 186        user_ns = current_user_ns();
 187        while (user_ns) {
 188                /* Pairs with smp_store_release() in bm_fill_super(). */
 189                misc = smp_load_acquire(&user_ns->binfmt_misc);
 190                if (misc)
 191                        return misc;
 192
 193                user_ns = user_ns->parent;
 194        }
 195
 196        return &init_binfmt_misc;
 197}
 198
/*
 * the loader itself
 *
 * Looks up a handler for @bprm in the binfmt_misc instance returned by
 * load_binfmt_misc() and, if one matches, rewrites the argument vector so
 * that the registered interpreter is run with the original binary's path as
 * an argument, then hands the opened interpreter to the exec machinery via
 * bprm->interpreter.
 *
 * Returns 0 on success, -ENOEXEC if binfmt_misc is disabled or no handler
 * matches, -ENOENT if BINPRM_FLAGS_PATH_INACCESSIBLE is set, or the error
 * reported by one of the helpers called below.
 */
static int load_misc_binary(struct linux_binprm *bprm)
{
        Node *fmt;
        struct file *interp_file = NULL;
        int retval = -ENOEXEC;
        struct binfmt_misc *misc;

        misc = load_binfmt_misc();
        if (!misc->enabled)
                return retval;

        /* Takes a reference on the handler; dropped at the 'ret' label. */
        fmt = get_binfmt_handler(misc, bprm);
        if (!fmt)
                return retval;

        /* Need to be able to load the file after exec */
        retval = -ENOENT;
        if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE)
                goto ret;

        /* 'P' flag: keep the original argv[0]; otherwise drop it. */
        if (fmt->flags & MISC_FMT_PRESERVE_ARGV0) {
                bprm->interp_flags |= BINPRM_FLAGS_PRESERVE_ARGV0;
        } else {
                retval = remove_arg_zero(bprm);
                if (retval)
                        goto ret;
        }

        /* 'O' flag (also implied by 'C'): request an exec file descriptor. */
        if (fmt->flags & MISC_FMT_OPEN_BINARY)
                bprm->have_execfd = 1;

        /* make argv[1] be the path to the binary */
        retval = copy_string_kernel(bprm->interp, bprm);
        if (retval < 0)
                goto ret;
        bprm->argc++;

        /* add the interp as argv[0] */
        retval = copy_string_kernel(fmt->interpreter, bprm);
        if (retval < 0)
                goto ret;
        bprm->argc++;

        /* Update interp in case binfmt_script needs it. */
        retval = bprm_change_interp(fmt->interpreter, bprm);
        if (retval < 0)
                goto ret;

        /*
         * 'F' flag: clone the file opened at registration time instead of
         * looking the interpreter up by path now.
         * NOTE(review): the return value of deny_write_access() is not
         * checked here - confirm that it cannot fail for the cloned file.
         */
        if (fmt->flags & MISC_FMT_OPEN_FILE) {
                interp_file = file_clone_open(fmt->interp_file);
                if (!IS_ERR(interp_file))
                        deny_write_access(interp_file);
        } else {
                interp_file = open_exec(fmt->interpreter);
        }
        retval = PTR_ERR(interp_file);
        if (IS_ERR(interp_file))
                goto ret;

        bprm->interpreter = interp_file;
        /* 'C' flag: set execfd_creds for this exec. */
        if (fmt->flags & MISC_FMT_CREDENTIALS)
                bprm->execfd_creds = 1;

        retval = 0;
ret:

        /*
         * If we actually put the node here all concurrent calls to
         * load_misc_binary() will have finished. We also know
         * that for the refcount to be zero someone must have concurrently
         * removed the binary type handler from the list and it's our job to
         * free it.
         */
        put_binfmt_handler(fmt);

        return retval;
}
 279
 280/* Command parsers */
 281
 282/*
 283 * parses and copies one argument enclosed in del from *sp to *dp,
 284 * recognising the \x special.
 285 * returns pointer to the copied argument or NULL in case of an
 286 * error (and sets err) or null argument length.
 287 */
 288static char *scanarg(char *s, char del)
 289{
 290        char c;
 291
 292        while ((c = *s++) != del) {
 293                if (c == '\\' && *s == 'x') {
 294                        s++;
 295                        if (!isxdigit(*s++))
 296                                return NULL;
 297                        if (!isxdigit(*s++))
 298                                return NULL;
 299                }
 300        }
 301        s[-1] ='\0';
 302        return s;
 303}
 304
 305static char *check_special_flags(char *sfs, Node *e)
 306{
 307        char *p = sfs;
 308        int cont = 1;
 309
 310        /* special flags */
 311        while (cont) {
 312                switch (*p) {
 313                case 'P':
 314                        pr_debug("register: flag: P (preserve argv0)\n");
 315                        p++;
 316                        e->flags |= MISC_FMT_PRESERVE_ARGV0;
 317                        break;
 318                case 'O':
 319                        pr_debug("register: flag: O (open binary)\n");
 320                        p++;
 321                        e->flags |= MISC_FMT_OPEN_BINARY;
 322                        break;
 323                case 'C':
 324                        pr_debug("register: flag: C (preserve creds)\n");
 325                        p++;
 326                        /* this flags also implies the
 327                           open-binary flag */
 328                        e->flags |= (MISC_FMT_CREDENTIALS |
 329                                        MISC_FMT_OPEN_BINARY);
 330                        break;
 331                case 'F':
 332                        pr_debug("register: flag: F: open interpreter file now\n");
 333                        p++;
 334                        e->flags |= MISC_FMT_OPEN_FILE;
 335                        break;
 336                default:
 337                        cont = 0;
 338                }
 339        }
 340
 341        return p;
 342}
 343
/*
 * Parse a registration string written by userspace into a new Node.
 *
 * The recognised syntax is
 * ':name:type:offset:magic:mask:interpreter:flags'
 * where the delimiter (':' above) is whatever character the string starts
 * with.
 *
 * The Node and a copy of the string are allocated as one block; the parser
 * splits the copy in place by overwriting delimiters with NUL bytes and lets
 * e->name, e->magic, e->mask and e->interpreter point into it.
 *
 * Returns the new Node, or an ERR_PTR(): -EINVAL if @count is out of range
 * or the string is malformed, -ENOMEM if the allocation fails, -EFAULT if
 * the user buffer cannot be read. On error nothing stays allocated.
 */
static Node *create_entry(const char __user *buffer, size_t count)
{
        Node *e;
        int memsize, err;
        char *buf, *p;
        char del;

        pr_debug("register: received %zu bytes\n", count);

        /* some sanity checks */
        err = -EINVAL;
        if ((count < 11) || (count > MAX_REGISTER_LENGTH))
                goto out;

        /* Node, then the string copy, then 8 bytes of delimiter padding. */
        err = -ENOMEM;
        memsize = sizeof(Node) + count + 8;
        e = kmalloc(memsize, GFP_KERNEL_ACCOUNT);
        if (!e)
                goto out;

        p = buf = (char *)e + sizeof(Node);

        /* Only the Node is zeroed; the string area is filled from userspace. */
        memset(e, 0, sizeof(Node));
        if (copy_from_user(buf, buffer, count))
                goto efault;

        del = *p++;     /* delimiter */

        pr_debug("register: delim: %#x {%c}\n", del, del);

        /* Pad the buffer with the delim to simplify parsing below. */
        memset(buf + count, del, 8);

        /* Parse the 'name' field. */
        e->name = p;
        p = strchr(p, del);
        if (!p)
                goto einval;
        *p++ = '\0';
        /* Reject empty names, "." and "..", and names containing '/'. */
        if (!e->name[0] ||
            !strcmp(e->name, ".") ||
            !strcmp(e->name, "..") ||
            strchr(e->name, '/'))
                goto einval;

        pr_debug("register: name: {%s}\n", e->name);

        /* Parse the 'type' field. */
        switch (*p++) {
        case 'E':
                pr_debug("register: type: E (extension)\n");
                e->flags = 1 << Enabled;
                break;
        case 'M':
                pr_debug("register: type: M (magic)\n");
                e->flags = (1 << Enabled) | (1 << Magic);
                break;
        default:
                goto einval;
        }
        if (*p++ != del)
                goto einval;

        if (test_bit(Magic, &e->flags)) {
                /* Handle the 'M' (magic) format. */
                char *s;

                /* Parse the 'offset' field. */
                s = strchr(p, del);
                if (!s)
                        goto einval;
                *s = '\0';
                /* An empty offset field leaves e->offset at 0. */
                if (p != s) {
                        int r = kstrtoint(p, 10, &e->offset);
                        if (r != 0 || e->offset < 0)
                                goto einval;
                }
                p = s;
                if (*p++)
                        goto einval;
                pr_debug("register: offset: %#x\n", e->offset);

                /* Parse the 'magic' field. */
                e->magic = p;
                p = scanarg(p, del);
                if (!p)
                        goto einval;
                if (!e->magic[0])
                        goto einval;
                if (USE_DEBUG)
                        print_hex_dump_bytes(
                                KBUILD_MODNAME ": register: magic[raw]: ",
                                DUMP_PREFIX_NONE, e->magic, p - e->magic);

                /* Parse the 'mask' field. */
                e->mask = p;
                p = scanarg(p, del);
                if (!p)
                        goto einval;
                if (!e->mask[0]) {
                        e->mask = NULL;
                        pr_debug("register:  mask[raw]: none\n");
                } else if (USE_DEBUG)
                        print_hex_dump_bytes(
                                KBUILD_MODNAME ": register:  mask[raw]: ",
                                DUMP_PREFIX_NONE, e->mask, p - e->mask);

                /*
                 * Decode the magic & mask fields.
                 * Note: while we might have accepted embedded NUL bytes from
                 * above, the unescape helpers here will stop at the first one
                 * it encounters.
                 */
                e->size = string_unescape_inplace(e->magic, UNESCAPE_HEX);
                /* A mask, if given, must decode to the same length as the magic. */
                if (e->mask &&
                    string_unescape_inplace(e->mask, UNESCAPE_HEX) != e->size)
                        goto einval;
                /* offset + size must stay within BINPRM_BUF_SIZE. */
                if (e->size > BINPRM_BUF_SIZE ||
                    BINPRM_BUF_SIZE - e->size < e->offset)
                        goto einval;
                pr_debug("register: magic/mask length: %i\n", e->size);
                if (USE_DEBUG) {
                        print_hex_dump_bytes(
                                KBUILD_MODNAME ": register: magic[decoded]: ",
                                DUMP_PREFIX_NONE, e->magic, e->size);

                        if (e->mask) {
                                int i;
                                char *masked = kmalloc(e->size, GFP_KERNEL_ACCOUNT);

                                print_hex_dump_bytes(
                                        KBUILD_MODNAME ": register:  mask[decoded]: ",
                                        DUMP_PREFIX_NONE, e->mask, e->size);

                                /* Debug aid only; silently skipped if the allocation failed. */
                                if (masked) {
                                        for (i = 0; i < e->size; ++i)
                                                masked[i] = e->magic[i] & e->mask[i];
                                        print_hex_dump_bytes(
                                                KBUILD_MODNAME ": register:  magic[masked]: ",
                                                DUMP_PREFIX_NONE, masked, e->size);

                                        kfree(masked);
                                }
                        }
                }
        } else {
                /* Handle the 'E' (extension) format. */

                /* Skip the 'offset' field. */
                p = strchr(p, del);
                if (!p)
                        goto einval;
                *p++ = '\0';

                /* Parse the 'magic' field. */
                e->magic = p;
                p = strchr(p, del);
                if (!p)
                        goto einval;
                *p++ = '\0';
                /* The extension must be non-empty and must not contain '/'. */
                if (!e->magic[0] || strchr(e->magic, '/'))
                        goto einval;
                pr_debug("register: extension: {%s}\n", e->magic);

                /* Skip the 'mask' field. */
                p = strchr(p, del);
                if (!p)
                        goto einval;
                *p++ = '\0';
        }

        /* Parse the 'interpreter' field. */
        e->interpreter = p;
        p = strchr(p, del);
        if (!p)
                goto einval;
        *p++ = '\0';
        if (!e->interpreter[0])
                goto einval;
        pr_debug("register: interpreter: {%s}\n", e->interpreter);

        /* Parse the 'flags' field. */
        p = check_special_flags(p, e);
        if (*p == '\n')
                p++;
        /*
         * The parse must end exactly at the end of the user data: anything
         * left over, or a parse that ran into the padding, is an error.
         */
        if (p != buf + count)
                goto einval;

        return e;

out:
        /* Nothing was allocated on this path. */
        return ERR_PTR(err);

efault:
        kfree(e);
        return ERR_PTR(-EFAULT);
einval:
        kfree(e);
        return ERR_PTR(-EINVAL);
}
 549
 550/*
 551 * Set status of entry/binfmt_misc:
 552 * '1' enables, '0' disables and '-1' clears entry/binfmt_misc
 553 */
 554static int parse_command(const char __user *buffer, size_t count)
 555{
 556        char s[4];
 557
 558        if (count > 3)
 559                return -EINVAL;
 560        if (copy_from_user(s, buffer, count))
 561                return -EFAULT;
 562        if (!count)
 563                return 0;
 564        if (s[count - 1] == '\n')
 565                count--;
 566        if (count == 1 && s[0] == '0')
 567                return 1;
 568        if (count == 1 && s[0] == '1')
 569                return 2;
 570        if (count == 2 && s[0] == '-' && s[1] == '1')
 571                return 3;
 572        return -EINVAL;
 573}
 574
 575/* generic stuff */
 576
 577static void entry_status(Node *e, char *page)
 578{
 579        char *dp = page;
 580        const char *status = "disabled";
 581
 582        if (test_bit(Enabled, &e->flags))
 583                status = "enabled";
 584
 585        if (!VERBOSE_STATUS) {
 586                sprintf(page, "%s\n", status);
 587                return;
 588        }
 589
 590        dp += sprintf(dp, "%s\ninterpreter %s\n", status, e->interpreter);
 591
 592        /* print the special flags */
 593        dp += sprintf(dp, "flags: ");
 594        if (e->flags & MISC_FMT_PRESERVE_ARGV0)
 595                *dp++ = 'P';
 596        if (e->flags & MISC_FMT_OPEN_BINARY)
 597                *dp++ = 'O';
 598        if (e->flags & MISC_FMT_CREDENTIALS)
 599                *dp++ = 'C';
 600        if (e->flags & MISC_FMT_OPEN_FILE)
 601                *dp++ = 'F';
 602        *dp++ = '\n';
 603
 604        if (!test_bit(Magic, &e->flags)) {
 605                sprintf(dp, "extension .%s\n", e->magic);
 606        } else {
 607                dp += sprintf(dp, "offset %i\nmagic ", e->offset);
 608                dp = bin2hex(dp, e->magic, e->size);
 609                if (e->mask) {
 610                        dp += sprintf(dp, "\nmask ");
 611                        dp = bin2hex(dp, e->mask, e->size);
 612                }
 613                *dp++ = '\n';
 614                *dp = '\0';
 615        }
 616}
 617
 618static struct inode *bm_get_inode(struct super_block *sb, int mode)
 619{
 620        struct inode *inode = new_inode(sb);
 621
 622        if (inode) {
 623                inode->i_ino = get_next_ino();
 624                inode->i_mode = mode;
 625                simple_inode_init_ts(inode);
 626        }
 627        return inode;
 628}
 629
 630/**
 631 * i_binfmt_misc - retrieve struct binfmt_misc from a binfmt_misc inode
 632 * @inode: inode of the relevant binfmt_misc instance
 633 *
 634 * This helper retrieves struct binfmt_misc from a binfmt_misc inode. This can
 635 * be done without any memory barriers because we are guaranteed that
 636 * user_ns->binfmt_misc is fully initialized. It was fully initialized when the
 637 * binfmt_misc mount was first created.
 638 *
 639 * Return: struct binfmt_misc of the relevant binfmt_misc instance
 640 */
 641static struct binfmt_misc *i_binfmt_misc(struct inode *inode)
 642{
 643        return inode->i_sb->s_user_ns->binfmt_misc;
 644}
 645
 646/**
 647 * bm_evict_inode - cleanup data associated with @inode
 648 * @inode: inode to which the data is attached
 649 *
 650 * Cleanup the binary type handler data associated with @inode if a binary type
 651 * entry is removed or the filesystem is unmounted and the super block is
 652 * shutdown.
 653 *
 654 * If the ->evict call was not caused by a super block shutdown but by a write
 655 * to remove the entry or all entries via bm_{entry,status}_write() the entry
 656 * will have already been removed from the list. We keep the list_empty() check
 657 * to make that explicit.
 658*/
 659static void bm_evict_inode(struct inode *inode)
 660{
 661        Node *e = inode->i_private;
 662
 663        clear_inode(inode);
 664
 665        if (e) {
 666                struct binfmt_misc *misc;
 667
 668                misc = i_binfmt_misc(inode);
 669                write_lock(&misc->entries_lock);
 670                if (!list_empty(&e->list))
 671                        list_del_init(&e->list);
 672                write_unlock(&misc->entries_lock);
 673                put_binfmt_handler(e);
 674        }
 675}
 676
 677/**
 678 * unlink_binfmt_dentry - remove the dentry for the binary type handler
 679 * @dentry: dentry associated with the binary type handler
 680 *
 681 * Do the actual filesystem work to remove a dentry for a registered binary
 682 * type handler. Since binfmt_misc only allows simple files to be created
 683 * directly under the root dentry of the filesystem we ensure that we are
 684 * indeed passed a dentry directly beneath the root dentry, that the inode
 685 * associated with the root dentry is locked, and that it is a regular file we
 686 * are asked to remove.
 687 */
 688static void unlink_binfmt_dentry(struct dentry *dentry)
 689{
 690        struct dentry *parent = dentry->d_parent;
 691        struct inode *inode, *parent_inode;
 692
 693        /* All entries are immediate descendants of the root dentry. */
 694        if (WARN_ON_ONCE(dentry->d_sb->s_root != parent))
 695                return;
 696
 697        /* We only expect to be called on regular files. */
 698        inode = d_inode(dentry);
 699        if (WARN_ON_ONCE(!S_ISREG(inode->i_mode)))
 700                return;
 701
 702        /* The parent inode must be locked. */
 703        parent_inode = d_inode(parent);
 704        if (WARN_ON_ONCE(!inode_is_locked(parent_inode)))
 705                return;
 706
 707        if (simple_positive(dentry)) {
 708                dget(dentry);
 709                simple_unlink(parent_inode, dentry);
 710                d_delete(dentry);
 711                dput(dentry);
 712        }
 713}
 714
 715/**
 716 * remove_binfmt_handler - remove a binary type handler
 717 * @misc: handle to binfmt_misc instance
 718 * @e: binary type handler to remove
 719 *
 720 * Remove a binary type handler from the list of binary type handlers and
 721 * remove its associated dentry. This is called from
 722 * binfmt_{entry,status}_write(). In the future, we might want to think about
 723 * adding a proper ->unlink() method to binfmt_misc instead of forcing caller's
 724 * to use writes to files in order to delete binary type handlers. But it has
 725 * worked for so long that it's not a pressing issue.
 726 */
 727static void remove_binfmt_handler(struct binfmt_misc *misc, Node *e)
 728{
 729        write_lock(&misc->entries_lock);
 730        list_del_init(&e->list);
 731        write_unlock(&misc->entries_lock);
 732        unlink_binfmt_dentry(e->dentry);
 733}
 734
 735/* /<entry> */
 736
 737static ssize_t
 738bm_entry_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
 739{
 740        Node *e = file_inode(file)->i_private;
 741        ssize_t res;
 742        char *page;
 743
 744        page = (char *) __get_free_page(GFP_KERNEL);
 745        if (!page)
 746                return -ENOMEM;
 747
 748        entry_status(e, page);
 749
 750        res = simple_read_from_buffer(buf, nbytes, ppos, page, strlen(page));
 751
 752        free_page((unsigned long) page);
 753        return res;
 754}
 755
 756static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
 757                                size_t count, loff_t *ppos)
 758{
 759        struct inode *inode = file_inode(file);
 760        Node *e = inode->i_private;
 761        int res = parse_command(buffer, count);
 762
 763        switch (res) {
 764        case 1:
 765                /* Disable this handler. */
 766                clear_bit(Enabled, &e->flags);
 767                break;
 768        case 2:
 769                /* Enable this handler. */
 770                set_bit(Enabled, &e->flags);
 771                break;
 772        case 3:
 773                /* Delete this handler. */
 774                inode = d_inode(inode->i_sb->s_root);
 775                inode_lock(inode);
 776
 777                /*
 778                 * In order to add new element or remove elements from the list
 779                 * via bm_{entry,register,status}_write() inode_lock() on the
 780                 * root inode must be held.
 781                 * The lock is exclusive ensuring that the list can't be
 782                 * modified. Only load_misc_binary() can access but does so
 783                 * read-only. So we only need to take the write lock when we
 784                 * actually remove the entry from the list.
 785                 */
 786                if (!list_empty(&e->list))
 787                        remove_binfmt_handler(i_binfmt_misc(inode), e);
 788
 789                inode_unlock(inode);
 790                break;
 791        default:
 792                return res;
 793        }
 794
 795        return count;
 796}
 797
 798static const struct file_operations bm_entry_operations = {
 799        .read           = bm_entry_read,
 800        .write          = bm_entry_write,
 801        .llseek         = default_llseek,
 802};
 803
 804/* /register */
 805
/*
 * Write handler for the register file: parse the registration string into a
 * new handler, create a file named after it directly under the filesystem
 * root and add the handler to the list of the binfmt_misc instance.
 *
 * Returns @count on success or a negative errno: whatever create_entry()
 * or open_exec() failed with, the error from the name lookup, -EEXIST if an
 * entry of that name already exists, or -ENOMEM if no inode could be
 * allocated. On failure the handler and, if opened, the interpreter file are
 * released again.
 */
static ssize_t bm_register_write(struct file *file, const char __user *buffer,
                               size_t count, loff_t *ppos)
{
        Node *e;
        struct inode *inode;
        struct super_block *sb = file_inode(file)->i_sb;
        struct dentry *root = sb->s_root, *dentry;
        struct binfmt_misc *misc;
        int err = 0;
        struct file *f = NULL;

        e = create_entry(buffer, count);

        if (IS_ERR(e))
                return PTR_ERR(e);

        /* 'F' flag: open the interpreter now and keep it in the entry. */
        if (e->flags & MISC_FMT_OPEN_FILE) {
                const struct cred *old_cred;

                /*
                 * Now that we support unprivileged binfmt_misc mounts make
                 * sure we use the credentials that the register @file was
                 * opened with to also open the interpreter. Before that this
                 * didn't matter much as only a privileged process could open
                 * the register file.
                 */
                old_cred = override_creds(file->f_cred);
                f = open_exec(e->interpreter);
                revert_creds(old_cred);
                if (IS_ERR(f)) {
                        pr_notice("register: failed to install interpreter file %s\n",
                                 e->interpreter);
                        kfree(e);
                        return PTR_ERR(f);
                }
                e->interp_file = f;
        }

        /* The root inode lock serialises all additions and removals. */
        inode_lock(d_inode(root));
        dentry = lookup_one_len(e->name, root, strlen(e->name));
        err = PTR_ERR(dentry);
        if (IS_ERR(dentry))
                goto out;

        /* An entry of that name is already registered. */
        err = -EEXIST;
        if (d_really_is_positive(dentry))
                goto out2;

        inode = bm_get_inode(sb, S_IFREG | 0644);

        err = -ENOMEM;
        if (!inode)
                goto out2;

        /* Initial reference; bm_evict_inode() drops it again. */
        refcount_set(&e->users, 1);
        e->dentry = dget(dentry);
        inode->i_private = e;
        inode->i_fop = &bm_entry_operations;

        d_instantiate(dentry, inode);
        /* Adding to the list makes the handler visible to load_misc_binary(). */
        misc = i_binfmt_misc(inode);
        write_lock(&misc->entries_lock);
        list_add(&e->list, &misc->entries);
        write_unlock(&misc->entries_lock);

        err = 0;
out2:
        /* Drops the lookup reference only; the dget() above keeps its own. */
        dput(dentry);
out:
        inode_unlock(d_inode(root));

        if (err) {
                /* The entry never became visible, so release it directly. */
                if (f)
                        filp_close(f, NULL);
                kfree(e);
                return err;
        }
        return count;
}
 885
 886static const struct file_operations bm_register_operations = {
 887        .write          = bm_register_write,
 888        .llseek         = noop_llseek,
 889};
 890
 891/* /status */
 892
 893static ssize_t
 894bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
 895{
 896        struct binfmt_misc *misc;
 897        char *s;
 898
 899        misc = i_binfmt_misc(file_inode(file));
 900        s = misc->enabled ? "enabled\n" : "disabled\n";
 901        return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s));
 902}
 903
 904static ssize_t bm_status_write(struct file *file, const char __user *buffer,
 905                size_t count, loff_t *ppos)
 906{
 907        struct binfmt_misc *misc;
 908        int res = parse_command(buffer, count);
 909        Node *e, *next;
 910        struct inode *inode;
 911
 912        misc = i_binfmt_misc(file_inode(file));
 913        switch (res) {
 914        case 1:
 915                /* Disable all handlers. */
 916                misc->enabled = false;
 917                break;
 918        case 2:
 919                /* Enable all handlers. */
 920                misc->enabled = true;
 921                break;
 922        case 3:
 923                /* Delete all handlers. */
 924                inode = d_inode(file_inode(file)->i_sb->s_root);
 925                inode_lock(inode);
 926
 927                /*
 928                 * In order to add new element or remove elements from the list
 929                 * via bm_{entry,register,status}_write() inode_lock() on the
 930                 * root inode must be held.
 931                 * The lock is exclusive ensuring that the list can't be
 932                 * modified. Only load_misc_binary() can access but does so
 933                 * read-only. So we only need to take the write lock when we
 934                 * actually remove the entry from the list.
 935                 */
 936                list_for_each_entry_safe(e, next, &misc->entries, list)
 937                        remove_binfmt_handler(misc, e);
 938
 939                inode_unlock(inode);
 940                break;
 941        default:
 942                return res;
 943        }
 944
 945        return count;
 946}
 947
 948static const struct file_operations bm_status_operations = {
 949        .read           = bm_status_read,
 950        .write          = bm_status_write,
 951        .llseek         = default_llseek,
 952};
 953
 954/* Superblock handling */
 955
 956static void bm_put_super(struct super_block *sb)
 957{
 958        struct user_namespace *user_ns = sb->s_fs_info;
 959
 960        sb->s_fs_info = NULL;
 961        put_user_ns(user_ns);
 962}
 963
 964static const struct super_operations s_ops = {
 965        .statfs         = simple_statfs,
 966        .evict_inode    = bm_evict_inode,
 967        .put_super      = bm_put_super,
 968};
 969
 970static int bm_fill_super(struct super_block *sb, struct fs_context *fc)
 971{
 972        int err;
 973        struct user_namespace *user_ns = sb->s_user_ns;
 974        struct binfmt_misc *misc;
 975        static const struct tree_descr bm_files[] = {
 976                [2] = {"status", &bm_status_operations, S_IWUSR|S_IRUGO},
 977                [3] = {"register", &bm_register_operations, S_IWUSR},
 978                /* last one */ {""}
 979        };
 980
 981        if (WARN_ON(user_ns != current_user_ns()))
 982                return -EINVAL;
 983
 984        /*
 985         * Lazily allocate a new binfmt_misc instance for this namespace, i.e.
 986         * do it here during the first mount of binfmt_misc. We don't need to
 987         * waste memory for every user namespace allocation. It's likely much
 988         * more common to not mount a separate binfmt_misc instance than it is
 989         * to mount one.
 990         *
 991         * While multiple superblocks can exist they are keyed by userns in
 992         * s_fs_info for binfmt_misc. Hence, the vfs guarantees that
 993         * bm_fill_super() is called exactly once whenever a binfmt_misc
 994         * superblock for a userns is created. This in turn lets us conclude
 995         * that when a binfmt_misc superblock is created for the first time for
 996         * a userns there's no one racing us. Therefore we don't need any
 997         * barriers when we dereference binfmt_misc.
 998         */
 999        misc = user_ns->binfmt_misc;
1000        if (!misc) {
1001                /*
1002                 * If it turns out that most user namespaces actually want to
1003                 * register their own binary type handler and therefore all
1004                 * create their own separate binfm_misc mounts we should
1005                 * consider turning this into a kmem cache.
1006                 */
1007                misc = kzalloc(sizeof(struct binfmt_misc), GFP_KERNEL);
1008                if (!misc)
1009                        return -ENOMEM;
1010
1011                INIT_LIST_HEAD(&misc->entries);
1012                rwlock_init(&misc->entries_lock);
1013
1014                /* Pairs with smp_load_acquire() in load_binfmt_misc(). */
1015                smp_store_release(&user_ns->binfmt_misc, misc);
1016        }
1017
1018        /*
1019         * When the binfmt_misc superblock for this userns is shutdown
1020         * ->enabled might have been set to false and we don't reinitialize
1021         * ->enabled again in put_super() as someone might already be mounting
1022         * binfmt_misc again. It also would be pointless since by the time
1023         * ->put_super() is called we know that the binary type list for this
1024         * bintfmt_misc mount is empty making load_misc_binary() return
1025         * -ENOEXEC independent of whether ->enabled is true. Instead, if
1026         * someone mounts binfmt_misc for the first time or again we simply
1027         * reset ->enabled to true.
1028         */
1029        misc->enabled = true;
1030
1031        err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files);
1032        if (!err)
1033                sb->s_op = &s_ops;
1034        return err;
1035}
1036
1037static void bm_free(struct fs_context *fc)
1038{
1039        if (fc->s_fs_info)
1040                put_user_ns(fc->s_fs_info);
1041}
1042
1043static int bm_get_tree(struct fs_context *fc)
1044{
1045        return get_tree_keyed(fc, bm_fill_super, get_user_ns(fc->user_ns));
1046}
1047
1048static const struct fs_context_operations bm_context_ops = {
1049        .free           = bm_free,
1050        .get_tree       = bm_get_tree,
1051};
1052
1053static int bm_init_fs_context(struct fs_context *fc)
1054{
1055        fc->ops = &bm_context_ops;
1056        return 0;
1057}
1058
1059static struct linux_binfmt misc_format = {
1060        .module = THIS_MODULE,
1061        .load_binary = load_misc_binary,
1062};
1063
1064static struct file_system_type bm_fs_type = {
1065        .owner          = THIS_MODULE,
1066        .name           = "binfmt_misc",
1067        .init_fs_context = bm_init_fs_context,
1068        .fs_flags       = FS_USERNS_MOUNT,
1069        .kill_sb        = kill_litter_super,
1070};
1071MODULE_ALIAS_FS("binfmt_misc");
1072
1073static int __init init_misc_binfmt(void)
1074{
1075        int err = register_filesystem(&bm_fs_type);
1076        if (!err)
1077                insert_binfmt(&misc_format);
1078        return err;
1079}
1080
1081static void __exit exit_misc_binfmt(void)
1082{
1083        unregister_binfmt(&misc_format);
1084        unregister_filesystem(&bm_fs_type);
1085}
1086
1087core_initcall(init_misc_binfmt);
1088module_exit(exit_misc_binfmt);
1089MODULE_LICENSE("GPL");
1090