/*
 * linux/kernel/seccomp.c
 *
 * Copyright 2004-2005  Andrea Arcangeli <andrea@cpushare.com>
 *
 * Copyright (C) 2012 Google, Inc.
 * Will Drewry <wad@chromium.org>
 *
 * This defines a simple but solid secure-computing facility.
 *
 * Mode 1 uses a fixed list of allowed system calls.
 * Mode 2 allows user-defined system call filters in the form
 *        of Berkeley Packet Filters/Linux Socket Filters.
 */

#include <linux/atomic.h>
#include <linux/audit.h>
#include <linux/compat.h>
#include <linux/sched.h>
#include <linux/seccomp.h>

/* #define SECCOMP_DEBUG 1 */

#ifdef CONFIG_SECCOMP_FILTER
#include <asm/syscall.h>
#include <linux/filter.h>
#include <linux/ptrace.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/tracehook.h>
#include <linux/uaccess.h>

/**
 * struct seccomp_filter - container for seccomp BPF programs
 *
 * @usage: reference count to manage the object lifetime.
 *         get/put helpers should be used when accessing an instance
 *         outside of a lifetime-guarded section.  In general, this
 *         is only needed for handling filters shared across tasks.
 * @prev: points to a previously installed, or inherited, filter
 * @len: the number of instructions in the program
 * @insns: the BPF program instructions to evaluate
 *
 * seccomp_filter objects are organized in a tree linked via the @prev
 * pointer.  For any task, it appears to be a singly-linked list starting
 * with current->seccomp.filter, the most recently attached or inherited filter.
 * However, multiple filters may share a @prev node, by way of fork(), which
 * results in a unidirectional tree existing in memory.  This is similar to
 * how namespaces work.
 *
 * seccomp_filter objects should never be modified after being attached
 * to a task_struct (other than @usage).
 */
struct seccomp_filter {
        atomic_t usage;
        struct seccomp_filter *prev;
        unsigned short len;  /* Instruction count */
        struct sock_filter insns[];
};
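
/*
 * Illustration (added commentary, not in the original source): if a
 * task attaches filter A, forks, and then parent and child each attach
 * their own filter, memory holds a tree rooted at A even though each
 * task sees only a list:
 *
 *                A          parent:  B -> A
 *               / \
 *              B   C        child:   C -> A
 *
 * A is freed only after both owners have dropped their references.
 */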

/* Limit any path through the tree to 256KB worth of instructions. */
#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))

/**
 * get_u32 - returns a u32 offset into data
 * @data: an unsigned 64-bit value
 * @index: 0 or 1 to return the first or second 32-bits
 *
 * This inline exists to hide the length of unsigned long.  If a 32-bit
 * unsigned long is passed in, it will be extended and the top 32-bits will be
 * 0. If it is a 64-bit unsigned long, then whatever data is resident will be
 * properly returned.
 *
 * Endianness is explicitly ignored and left for BPF program authors to manage
 * as per the specific architecture.
 */
static inline u32 get_u32(u64 data, int index)
{
        return ((u32 *)&data)[index];
}
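
/*
 * Example (added commentary; assumes a little-endian host):
 *
 *      u64 v = 0x1122334455667788ULL;
 *      get_u32(v, 0) == 0x55667788   (low half)
 *      get_u32(v, 1) == 0x11223344   (high half)
 *
 * On a big-endian host the halves swap, which is why filters should
 * key off the arch field rather than assume a layout.
 */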

/* Helper for bpf_load below. */
#define BPF_DATA(_name) offsetof(struct seccomp_data, _name)
/**
 * bpf_load: checks and returns the data at the requested offset
 * @off: offset into struct seccomp_data to load from
 *
 * Returns the requested 32-bits of data.
 * seccomp_check_filter() should ensure that @off is 32-bit aligned
 * and not out of bounds.  Failure to do so is a BUG.
 */
u32 seccomp_bpf_load(int off)
{
        struct pt_regs *regs = task_pt_regs(current);
        if (off == BPF_DATA(nr))
                return syscall_get_nr(current, regs);
        if (off == BPF_DATA(arch))
                return syscall_get_arch(current, regs);
        if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) {
                unsigned long value;
                int arg = (off - BPF_DATA(args[0])) / sizeof(u64);
                int index = !!(off % sizeof(u64));
                syscall_get_arguments(current, regs, arg, 1, &value);
                return get_u32(value, index);
        }
        if (off == BPF_DATA(instruction_pointer))
                return get_u32(KSTK_EIP(current), 0);
        if (off == BPF_DATA(instruction_pointer) + sizeof(u32))
                return get_u32(KSTK_EIP(current), 1);
        /* seccomp_check_filter should make this impossible. */
        BUG();
}
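
/*
 * For reference (added commentary): struct seccomp_data is all 32-bit
 * aligned fields, so every valid @off is a multiple of 4 below 64:
 *
 *      offset  0:  int   nr                     (system call number)
 *      offset  4:  __u32 arch                   (AUDIT_ARCH_* value)
 *      offset  8:  __u64 instruction_pointer
 *      offset 16:  __u64 args[6]                (ends at offset 64)
 */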

/**
 *      seccomp_check_filter - verify seccomp filter code
 *      @filter: filter to verify
 *      @flen: length of filter
 *
 * Takes a previously checked filter (by sk_chk_filter) and
 * redirects all filter code that loads struct sk_buff data
 * and related data through seccomp_bpf_load.  It also
 * enforces length and alignment checking of those loads.
 *
 * Returns 0 if the rule set is legal or -EINVAL if not.
 */
static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
{
        int pc;
        for (pc = 0; pc < flen; pc++) {
                struct sock_filter *ftest = &filter[pc];
                u16 code = ftest->code;
                u32 k = ftest->k;

                switch (code) {
                case BPF_S_LD_W_ABS:
                        ftest->code = BPF_S_ANC_SECCOMP_LD_W;
                        /* 32-bit aligned and not out of bounds. */
                        if (k >= sizeof(struct seccomp_data) || k & 3)
                                return -EINVAL;
                        continue;
                case BPF_S_LD_W_LEN:
                        ftest->code = BPF_S_LD_IMM;
                        ftest->k = sizeof(struct seccomp_data);
                        continue;
                case BPF_S_LDX_W_LEN:
                        ftest->code = BPF_S_LDX_IMM;
                        ftest->k = sizeof(struct seccomp_data);
                        continue;
                /* Explicitly include allowed calls. */
                case BPF_S_RET_K:
                case BPF_S_RET_A:
                case BPF_S_ALU_ADD_K:
                case BPF_S_ALU_ADD_X:
                case BPF_S_ALU_SUB_K:
                case BPF_S_ALU_SUB_X:
                case BPF_S_ALU_MUL_K:
                case BPF_S_ALU_MUL_X:
                case BPF_S_ALU_DIV_X:
                case BPF_S_ALU_AND_K:
                case BPF_S_ALU_AND_X:
                case BPF_S_ALU_OR_K:
                case BPF_S_ALU_OR_X:
                case BPF_S_ALU_XOR_K:
                case BPF_S_ALU_XOR_X:
                case BPF_S_ALU_LSH_K:
                case BPF_S_ALU_LSH_X:
                case BPF_S_ALU_RSH_K:
                case BPF_S_ALU_RSH_X:
                case BPF_S_ALU_NEG:
                case BPF_S_LD_IMM:
                case BPF_S_LDX_IMM:
                case BPF_S_MISC_TAX:
                case BPF_S_MISC_TXA:
                case BPF_S_ALU_DIV_K:
                case BPF_S_LD_MEM:
                case BPF_S_LDX_MEM:
                case BPF_S_ST:
                case BPF_S_STX:
                case BPF_S_JMP_JA:
                case BPF_S_JMP_JEQ_K:
                case BPF_S_JMP_JEQ_X:
                case BPF_S_JMP_JGE_K:
                case BPF_S_JMP_JGE_X:
                case BPF_S_JMP_JGT_K:
                case BPF_S_JMP_JGT_X:
                case BPF_S_JMP_JSET_K:
                case BPF_S_JMP_JSET_X:
                        continue;
                default:
                        return -EINVAL;
                }
        }
        return 0;
}
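
/*
 * Illustrative program (added commentary) that passes the checker: a
 * word load within bounds plus a return.  BPF_S_LD_W_ABS is rewritten
 * to BPF_S_ANC_SECCOMP_LD_W so the load is served by seccomp_bpf_load()
 * instead of touching a (NULL) skb:
 *
 *      struct sock_filter allow_all[] = {
 *              BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
 *                       offsetof(struct seccomp_data, nr)),
 *              BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
 *      };
 *
 * The same load with k = 2 (unaligned) or k >= 64 (out of bounds), or
 * any skb-dependent opcode such as BPF_S_LD_B_IND, yields -EINVAL.
 */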

/**
 * seccomp_run_filters - evaluates all seccomp filters against @syscall
 * @syscall: number of the current system call
 *
 * Returns valid seccomp BPF response codes.
 */
static u32 seccomp_run_filters(int syscall)
{
        struct seccomp_filter *f;
        u32 ret = SECCOMP_RET_ALLOW;

        /* Ensure unexpected behavior doesn't result in failing open. */
        if (WARN_ON(current->seccomp.filter == NULL))
                return SECCOMP_RET_KILL;

        /*
         * All filters in the list are evaluated and the lowest BPF return
         * value always takes priority (ignoring the DATA).
         */
        for (f = current->seccomp.filter; f; f = f->prev) {
                u32 cur_ret = sk_run_filter(NULL, f->insns);
                if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
                        ret = cur_ret;
        }
        return ret;
}
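
/*
 * Precedence (added commentary; action values from
 * include/linux/seccomp.h are ordered so "lowest wins" equals "most
 * severe wins"):
 *
 *      SECCOMP_RET_KILL   0x00000000U   most severe
 *      SECCOMP_RET_TRAP   0x00030000U
 *      SECCOMP_RET_ERRNO  0x00050000U
 *      SECCOMP_RET_TRACE  0x7ff00000U
 *      SECCOMP_RET_ALLOW  0x7fff0000U   least severe
 *
 * So a single KILL anywhere in the list overrides any ALLOW returned
 * by the other filters.
 */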

/**
 * seccomp_attach_filter: Attaches a seccomp filter to current.
 * @fprog: BPF program to install
 *
 * Returns 0 on success or an errno on failure.
 */
static long seccomp_attach_filter(struct sock_fprog *fprog)
{
        struct seccomp_filter *filter;
        unsigned long fp_size = fprog->len * sizeof(struct sock_filter);
        unsigned long total_insns = fprog->len;
        long ret;

        if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
                return -EINVAL;

        for (filter = current->seccomp.filter; filter; filter = filter->prev)
                total_insns += filter->len + 4;  /* include a 4 instr penalty */
        if (total_insns > MAX_INSNS_PER_PATH)
                return -ENOMEM;

        /*
         * Installing a seccomp filter requires that the task have
         * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
         * This avoids scenarios where unprivileged tasks can affect the
         * behavior of privileged children.
         */
        if (!current->no_new_privs &&
            security_capable_noaudit(current_cred(), current_user_ns(),
                                     CAP_SYS_ADMIN) != 0)
                return -EACCES;

        /* Allocate a new seccomp_filter */
        filter = kzalloc(sizeof(struct seccomp_filter) + fp_size,
                         GFP_KERNEL|__GFP_NOWARN);
        if (!filter)
                return -ENOMEM;
        atomic_set(&filter->usage, 1);
        filter->len = fprog->len;

        /* Copy the instructions from fprog. */
        ret = -EFAULT;
        if (copy_from_user(filter->insns, fprog->filter, fp_size))
                goto fail;

        /* Check and rewrite the fprog via the skb checker */
        ret = sk_chk_filter(filter->insns, filter->len);
        if (ret)
                goto fail;

        /* Check and rewrite the fprog for seccomp use */
        ret = seccomp_check_filter(filter->insns, filter->len);
        if (ret)
                goto fail;

        /*
         * If there is an existing filter, make it the prev and don't drop its
         * task reference.
         */
        filter->prev = current->seccomp.filter;
        current->seccomp.filter = filter;
        return 0;
fail:
        kfree(filter);
        return ret;
}
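
/*
 * Sizing note (added commentary): sizeof(struct sock_filter) is 8, so
 * MAX_INSNS_PER_PATH is 32768 instructions.  With BPF_MAXINSNS == 4096
 * and the 4-instruction per-filter penalty above, a task can stack
 * seven maximum-size filters; attaching an eighth fails with -ENOMEM.
 */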

/**
 * seccomp_attach_user_filter - attaches a user-supplied sock_fprog
 * @user_filter: pointer to the user data containing a sock_fprog.
 *
 * Returns 0 on success and non-zero otherwise.
 */
long seccomp_attach_user_filter(char __user *user_filter)
{
        struct sock_fprog fprog;
        long ret = -EFAULT;

#ifdef CONFIG_COMPAT
        if (is_compat_task()) {
                struct compat_sock_fprog fprog32;
                if (copy_from_user(&fprog32, user_filter, sizeof(fprog32)))
                        goto out;
                fprog.len = fprog32.len;
                fprog.filter = compat_ptr(fprog32.filter);
        } else /* falls through to the if below. */
#endif
        if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
                goto out;
        ret = seccomp_attach_filter(&fprog);
out:
        return ret;
}
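
/*
 * Hypothetical userspace counterpart (added commentary, not kernel
 * code): an unprivileged task sets no_new_privs and installs a filter
 * via prctl(), which reaches this function through prctl_set_seccomp().
 * A real filter should validate the arch field first:
 *
 *      struct sock_filter insns[] = {
 *              BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
 *                       offsetof(struct seccomp_data, nr)),
 *              BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_getpid, 0, 1),
 *              BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | EPERM),
 *              BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
 *      };
 *      struct sock_fprog prog = {
 *              .len = sizeof(insns) / sizeof(insns[0]),
 *              .filter = insns,
 *      };
 *      prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 *      prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
 *
 * Afterwards getpid() returns -1 with errno == EPERM while every other
 * system call behaves normally.
 */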

/* get_seccomp_filter - increments the reference count of the filter on @tsk */
void get_seccomp_filter(struct task_struct *tsk)
{
        struct seccomp_filter *orig = tsk->seccomp.filter;
        if (!orig)
                return;
        /* Reference count is bounded by the number of total processes. */
        atomic_inc(&orig->usage);
}

/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
void put_seccomp_filter(struct task_struct *tsk)
{
        struct seccomp_filter *orig = tsk->seccomp.filter;
        /* Clean up single-reference branches iteratively. */
        while (orig && atomic_dec_and_test(&orig->usage)) {
                struct seccomp_filter *freeme = orig;
                orig = orig->prev;
                kfree(freeme);
        }
}
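
/*
 * Note (added commentary): these helpers pair across task lifetimes; a
 * reference is taken when a new task inherits the filter list and
 * dropped when the task is freed.  Since the loop above only cascades
 * past nodes whose count reaches zero, a @prev tail shared with other
 * tasks survives until its last owner is gone.
 */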

/**
 * seccomp_send_sigsys - signals the task to allow in-process syscall emulation
 * @syscall: syscall number to send to userland
 * @reason: filter-supplied reason code to send to userland (via si_errno)
 *
 * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info.
 */
static void seccomp_send_sigsys(int syscall, int reason)
{
        struct siginfo info;
        memset(&info, 0, sizeof(info));
        info.si_signo = SIGSYS;
        info.si_code = SYS_SECCOMP;
        info.si_call_addr = (void __user *)KSTK_EIP(current);
        info.si_errno = reason;
        info.si_arch = syscall_get_arch(current, task_pt_regs(current));
        info.si_syscall = syscall;
        force_sig_info(SIGSYS, &info, current);
}
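
/*
 * Sketch of a matching userspace handler (added commentary, not kernel
 * code): a SA_SIGINFO handler can read back the fields set above and
 * emulate the trapped call in-process:
 *
 *      static void sigsys_handler(int sig, siginfo_t *info, void *ctx)
 *      {
 *              int nr   = info->si_syscall;    trapped syscall number
 *              int arch = info->si_arch;       AUDIT_ARCH_* of the caller
 *              int data = info->si_errno;      the SECCOMP_RET_DATA bits
 *      }
 *      struct sigaction act = { .sa_sigaction = sigsys_handler,
 *                               .sa_flags = SA_SIGINFO };
 *      sigaction(SIGSYS, &act, NULL);
 */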
#endif  /* CONFIG_SECCOMP_FILTER */

/*
 * Secure computing mode 1 allows only read/write/exit/sigreturn.
 * To be fully secure this must be combined with rlimit
 * to limit the stack allocations too.
 */
static int mode1_syscalls[] = {
        __NR_seccomp_read, __NR_seccomp_write,
        __NR_seccomp_exit, __NR_seccomp_sigreturn,
        0, /* null terminated */
};

#ifdef CONFIG_COMPAT
static int mode1_syscalls_32[] = {
        __NR_seccomp_read_32, __NR_seccomp_write_32,
        __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32,
        0, /* null terminated */
};
#endif

int __secure_computing(int this_syscall)
{
        int mode = current->seccomp.mode;
        int exit_sig = 0;
        int *syscall;
        u32 ret;

        switch (mode) {
        case SECCOMP_MODE_STRICT:
                syscall = mode1_syscalls;
#ifdef CONFIG_COMPAT
                if (is_compat_task())
                        syscall = mode1_syscalls_32;
#endif
                do {
                        if (*syscall == this_syscall)
                                return 0;
                } while (*++syscall);
                exit_sig = SIGKILL;
                ret = SECCOMP_RET_KILL;
                break;
#ifdef CONFIG_SECCOMP_FILTER
        case SECCOMP_MODE_FILTER: {
                int data;
                struct pt_regs *regs = task_pt_regs(current);
                ret = seccomp_run_filters(this_syscall);
                data = ret & SECCOMP_RET_DATA;
                ret &= SECCOMP_RET_ACTION;
                switch (ret) {
                case SECCOMP_RET_ERRNO:
                        /* Set the low-order 16-bits as an errno. */
                        syscall_set_return_value(current, regs,
                                                 -data, 0);
                        goto skip;
                case SECCOMP_RET_TRAP:
                        /* Show the handler the original registers. */
                        syscall_rollback(current, regs);
                        /* Let the filter pass back 16 bits of data. */
                        seccomp_send_sigsys(this_syscall, data);
                        goto skip;
                case SECCOMP_RET_TRACE:
                        /* Skip these calls if there is no tracer. */
                        if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
                                syscall_set_return_value(current, regs,
                                                         -ENOSYS, 0);
                                goto skip;
                        }
                        /* Allow the BPF to provide the event message */
                        ptrace_event(PTRACE_EVENT_SECCOMP, data);
                        /*
                         * The delivery of a fatal signal during event
                         * notification may silently skip tracer notification.
                         * Terminating the task now avoids executing a system
                         * call that may not be intended.
                         */
                        if (fatal_signal_pending(current))
                                break;
                        if (syscall_get_nr(current, regs) < 0)
                                goto skip;  /* Explicit request to skip. */

                        return 0;
                case SECCOMP_RET_ALLOW:
                        return 0;
                case SECCOMP_RET_KILL:
                default:
                        break;
                }
                exit_sig = SIGSYS;
                break;
        }
#endif
        default:
                BUG();
        }

#ifdef SECCOMP_DEBUG
        dump_stack();
#endif
        audit_seccomp(this_syscall, exit_sig, ret);
        do_exit(exit_sig);
#ifdef CONFIG_SECCOMP_FILTER
skip:
        audit_seccomp(this_syscall, exit_sig, ret);
#endif
        return -1;
}

long prctl_get_seccomp(void)
{
        return current->seccomp.mode;
}

/**
 * prctl_set_seccomp: configures current->seccomp.mode
 * @seccomp_mode: requested mode to use
 * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
 *
 * This function may be called repeatedly with a @seccomp_mode of
 * SECCOMP_MODE_FILTER to install additional filters.  Every filter
 * successfully installed will be evaluated (in reverse order) for each system
 * call the task makes.
 *
 * Once current->seccomp.mode is non-zero, it may not be changed.
 *
 * Returns 0 on success or an errno on failure.
 */
long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
{
        long ret = -EINVAL;

        if (current->seccomp.mode &&
            current->seccomp.mode != seccomp_mode)
                goto out;

        switch (seccomp_mode) {
        case SECCOMP_MODE_STRICT:
                ret = 0;
#ifdef TIF_NOTSC
                disable_TSC();
#endif
                break;
#ifdef CONFIG_SECCOMP_FILTER
        case SECCOMP_MODE_FILTER:
                ret = seccomp_attach_user_filter(filter);
                if (ret)
                        goto out;
                break;
#endif
        default:
                goto out;
        }

        current->seccomp.mode = seccomp_mode;
        set_thread_flag(TIF_SECCOMP);
out:
        return ret;
}
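
/*
 * Illustrative use of mode 1 (added commentary, not kernel code): once
 * the prctl() below succeeds, only read/write/exit/sigreturn remain
 * available and any other system call is fatal:
 *
 *      prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT);
 *      ... compute using already-open file descriptors only ...
 *      syscall(__NR_exit, 0);
 */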