linux/kernel/bpf/cgroup.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Functions to manage eBPF programs attached to cgroups
   4 *
   5 * Copyright (c) 2016 Daniel Mack
   6 */
   7
   8#include <linux/kernel.h>
   9#include <linux/atomic.h>
  10#include <linux/cgroup.h>
  11#include <linux/filter.h>
  12#include <linux/slab.h>
  13#include <linux/sysctl.h>
  14#include <linux/string.h>
  15#include <linux/bpf.h>
  16#include <linux/bpf-cgroup.h>
  17#include <linux/bpf_lsm.h>
  18#include <linux/bpf_verifier.h>
  19#include <net/sock.h>
  20#include <net/bpf_sk_storage.h>
  21
  22#include "../cgroup/cgroup-internal.h"
  23
  24DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
  25EXPORT_SYMBOL(cgroup_bpf_enabled_key);
  26
  27/* __always_inline is necessary to prevent indirect call through run_prog
  28 * function pointer.
  29 */
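/* Run all programs from the cgroup's effective array for @atype.  Bit 0 of
 * each program's return value is the verdict; when @ret_flags is non-NULL the
 * remaining bits are OR'ed into *ret_flags.  A zero verdict turns a non-error
 * @retval into -EPERM, and the folded run_ctx.retval is returned.
 */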
  30static __always_inline int
  31bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
  32                      enum cgroup_bpf_attach_type atype,
  33                      const void *ctx, bpf_prog_run_fn run_prog,
  34                      int retval, u32 *ret_flags)
  35{
  36        const struct bpf_prog_array_item *item;
  37        const struct bpf_prog *prog;
  38        const struct bpf_prog_array *array;
  39        struct bpf_run_ctx *old_run_ctx;
  40        struct bpf_cg_run_ctx run_ctx;
  41        u32 func_ret;
  42
  43        run_ctx.retval = retval;
  44        migrate_disable();
  45        rcu_read_lock();
  46        array = rcu_dereference(cgrp->effective[atype]);
  47        item = &array->items[0];
  48        old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
  49        while ((prog = READ_ONCE(item->prog))) {
  50                run_ctx.prog_item = item;
  51                func_ret = run_prog(prog, ctx);
  52                if (ret_flags) {
  53                        *(ret_flags) |= (func_ret >> 1);
  54                        func_ret &= 1;
  55                }
  56                if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval))
  57                        run_ctx.retval = -EPERM;
  58                item++;
  59        }
  60        bpf_reset_run_ctx(old_run_ctx);
  61        rcu_read_unlock();
  62        migrate_enable();
  63        return run_ctx.retval;
  64}
  65
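/* Entry points used as bpf_func of the BPF_LSM_CGROUP shim programs (see
 * bpf_trampoline_link_cgroup_shim()).  @insn points at the shim prog's
 * insnsi[], so the owning bpf_prog is recovered via offsetof().  The cgroup
 * whose effective array is run comes from the sock/socket argument, or from
 * the current task for the _current variant.
 */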
  66unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx,
  67                                       const struct bpf_insn *insn)
  68{
  69        const struct bpf_prog *shim_prog;
  70        struct sock *sk;
  71        struct cgroup *cgrp;
  72        int ret = 0;
  73        u64 *args;
  74
  75        args = (u64 *)ctx;
  76        sk = (void *)(unsigned long)args[0];
  77        /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
  78        shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
  79
  80        cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
  81        if (likely(cgrp))
  82                ret = bpf_prog_run_array_cg(&cgrp->bpf,
  83                                            shim_prog->aux->cgroup_atype,
  84                                            ctx, bpf_prog_run, 0, NULL);
  85        return ret;
  86}
  87
  88unsigned int __cgroup_bpf_run_lsm_socket(const void *ctx,
  89                                         const struct bpf_insn *insn)
  90{
  91        const struct bpf_prog *shim_prog;
  92        struct socket *sock;
  93        struct cgroup *cgrp;
  94        int ret = 0;
  95        u64 *args;
  96
  97        args = (u64 *)ctx;
  98        sock = (void *)(unsigned long)args[0];
  99        /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
 100        shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
 101
 102        cgrp = sock_cgroup_ptr(&sock->sk->sk_cgrp_data);
 103        if (likely(cgrp))
 104                ret = bpf_prog_run_array_cg(&cgrp->bpf,
 105                                            shim_prog->aux->cgroup_atype,
 106                                            ctx, bpf_prog_run, 0, NULL);
 107        return ret;
 108}
 109
 110unsigned int __cgroup_bpf_run_lsm_current(const void *ctx,
 111                                          const struct bpf_insn *insn)
 112{
 113        const struct bpf_prog *shim_prog;
 114        struct cgroup *cgrp;
 115        int ret = 0;
 116
 117        /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
 118        shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
 119
 120        /* We rely on trampoline's __bpf_prog_enter_lsm_cgroup to grab RCU read lock. */
 121        cgrp = task_dfl_cgroup(current);
 122        if (likely(cgrp))
 123                ret = bpf_prog_run_array_cg(&cgrp->bpf,
 124                                            shim_prog->aux->cgroup_atype,
 125                                            ctx, bpf_prog_run, 0, NULL);
 126        return ret;
 127}
 128
 129#ifdef CONFIG_BPF_LSM
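/* BPF_LSM_CGROUP programs have no fixed attach type slot.  The LSM hook's
 * attach_btf_id is mapped onto one of the shared CGROUP_LSM_START..END slots
 * below; each slot is reference counted while programs are attached to it.
 */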
 130struct cgroup_lsm_atype {
 131        u32 attach_btf_id;
 132        int refcnt;
 133};
 134
 135static struct cgroup_lsm_atype cgroup_lsm_atype[CGROUP_LSM_NUM];
 136
 137static enum cgroup_bpf_attach_type
 138bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id)
 139{
 140        int i;
 141
 142        lockdep_assert_held(&cgroup_mutex);
 143
 144        if (attach_type != BPF_LSM_CGROUP)
 145                return to_cgroup_bpf_attach_type(attach_type);
 146
 147        for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++)
 148                if (cgroup_lsm_atype[i].attach_btf_id == attach_btf_id)
 149                        return CGROUP_LSM_START + i;
 150
 151        for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++)
 152                if (cgroup_lsm_atype[i].attach_btf_id == 0)
 153                        return CGROUP_LSM_START + i;
 154
 155        return -E2BIG;
 156
 157}
 158
 159void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype)
 160{
 161        int i = cgroup_atype - CGROUP_LSM_START;
 162
 163        lockdep_assert_held(&cgroup_mutex);
 164
 165        WARN_ON_ONCE(cgroup_lsm_atype[i].attach_btf_id &&
 166                     cgroup_lsm_atype[i].attach_btf_id != attach_btf_id);
 167
 168        cgroup_lsm_atype[i].attach_btf_id = attach_btf_id;
 169        cgroup_lsm_atype[i].refcnt++;
 170}
 171
 172void bpf_cgroup_atype_put(int cgroup_atype)
 173{
 174        int i = cgroup_atype - CGROUP_LSM_START;
 175
 176        mutex_lock(&cgroup_mutex);
 177        if (--cgroup_lsm_atype[i].refcnt <= 0)
 178                cgroup_lsm_atype[i].attach_btf_id = 0;
 179        WARN_ON_ONCE(cgroup_lsm_atype[i].refcnt < 0);
 180        mutex_unlock(&cgroup_mutex);
 181}
 182#else
 183static enum cgroup_bpf_attach_type
 184bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id)
 185{
 186        if (attach_type != BPF_LSM_CGROUP)
 187                return to_cgroup_bpf_attach_type(attach_type);
 188        return -EOPNOTSUPP;
 189}
 190#endif /* CONFIG_BPF_LSM */
 191
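/* Called when the cgroup goes offline: pin the cgroup and kill its bpf percpu
 * refcount.  Once the refcount drains, cgroup_bpf_release_fn() schedules
 * cgroup_bpf_release() to drop all attached programs, links and storages.
 */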
 192void cgroup_bpf_offline(struct cgroup *cgrp)
 193{
 194        cgroup_get(cgrp);
 195        percpu_ref_kill(&cgrp->bpf.refcnt);
 196}
 197
 198static void bpf_cgroup_storages_free(struct bpf_cgroup_storage *storages[])
 199{
 200        enum bpf_cgroup_storage_type stype;
 201
 202        for_each_cgroup_storage_type(stype)
 203                bpf_cgroup_storage_free(storages[stype]);
 204}
 205
 206static int bpf_cgroup_storages_alloc(struct bpf_cgroup_storage *storages[],
 207                                     struct bpf_cgroup_storage *new_storages[],
 208                                     enum bpf_attach_type type,
 209                                     struct bpf_prog *prog,
 210                                     struct cgroup *cgrp)
 211{
 212        enum bpf_cgroup_storage_type stype;
 213        struct bpf_cgroup_storage_key key;
 214        struct bpf_map *map;
 215
 216        key.cgroup_inode_id = cgroup_id(cgrp);
 217        key.attach_type = type;
 218
 219        for_each_cgroup_storage_type(stype) {
 220                map = prog->aux->cgroup_storage[stype];
 221                if (!map)
 222                        continue;
 223
 224                storages[stype] = cgroup_storage_lookup((void *)map, &key, false);
 225                if (storages[stype])
 226                        continue;
 227
 228                storages[stype] = bpf_cgroup_storage_alloc(prog, stype);
 229                if (IS_ERR(storages[stype])) {
 230                        bpf_cgroup_storages_free(new_storages);
 231                        return -ENOMEM;
 232                }
 233
 234                new_storages[stype] = storages[stype];
 235        }
 236
 237        return 0;
 238}
 239
 240static void bpf_cgroup_storages_assign(struct bpf_cgroup_storage *dst[],
 241                                       struct bpf_cgroup_storage *src[])
 242{
 243        enum bpf_cgroup_storage_type stype;
 244
 245        for_each_cgroup_storage_type(stype)
 246                dst[stype] = src[stype];
 247}
 248
 249static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[],
 250                                     struct cgroup *cgrp,
 251                                     enum bpf_attach_type attach_type)
 252{
 253        enum bpf_cgroup_storage_type stype;
 254
 255        for_each_cgroup_storage_type(stype)
 256                bpf_cgroup_storage_link(storages[stype], cgrp, attach_type);
 257}
 258
 259/* Called when bpf_cgroup_link is auto-detached from dying cgroup.
 260 * It drops cgroup and bpf_prog refcounts, and marks bpf_link as defunct. It
 261 * doesn't free link memory, which will eventually be done by bpf_link's
 262 * release() callback, when its last FD is closed.
 263 */
 264static void bpf_cgroup_link_auto_detach(struct bpf_cgroup_link *link)
 265{
 266        cgroup_put(link->cgroup);
 267        link->cgroup = NULL;
 268}
 269
 270/**
 271 * cgroup_bpf_release() - put references of all bpf programs and
 272 *                        release all cgroup bpf data
 273 * @work: work structure embedded into the cgroup to modify
 274 */
 275static void cgroup_bpf_release(struct work_struct *work)
 276{
 277        struct cgroup *p, *cgrp = container_of(work, struct cgroup,
 278                                               bpf.release_work);
 279        struct bpf_prog_array *old_array;
 280        struct list_head *storages = &cgrp->bpf.storages;
 281        struct bpf_cgroup_storage *storage, *stmp;
 282
 283        unsigned int atype;
 284
 285        mutex_lock(&cgroup_mutex);
 286
 287        for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) {
 288                struct hlist_head *progs = &cgrp->bpf.progs[atype];
 289                struct bpf_prog_list *pl;
 290                struct hlist_node *pltmp;
 291
 292                hlist_for_each_entry_safe(pl, pltmp, progs, node) {
 293                        hlist_del(&pl->node);
 294                        if (pl->prog) {
 295                                if (pl->prog->expected_attach_type == BPF_LSM_CGROUP)
 296                                        bpf_trampoline_unlink_cgroup_shim(pl->prog);
 297                                bpf_prog_put(pl->prog);
 298                        }
 299                        if (pl->link) {
 300                                if (pl->link->link.prog->expected_attach_type == BPF_LSM_CGROUP)
 301                                        bpf_trampoline_unlink_cgroup_shim(pl->link->link.prog);
 302                                bpf_cgroup_link_auto_detach(pl->link);
 303                        }
 304                        kfree(pl);
 305                        static_branch_dec(&cgroup_bpf_enabled_key[atype]);
 306                }
 307                old_array = rcu_dereference_protected(
 308                                cgrp->bpf.effective[atype],
 309                                lockdep_is_held(&cgroup_mutex));
 310                bpf_prog_array_free(old_array);
 311        }
 312
 313        list_for_each_entry_safe(storage, stmp, storages, list_cg) {
 314                bpf_cgroup_storage_unlink(storage);
 315                bpf_cgroup_storage_free(storage);
 316        }
 317
 318        mutex_unlock(&cgroup_mutex);
 319
 320        for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
 321                cgroup_bpf_put(p);
 322
 323        percpu_ref_exit(&cgrp->bpf.refcnt);
 324        cgroup_put(cgrp);
 325}
 326
 327/**
 328 * cgroup_bpf_release_fn() - callback used to schedule releasing
 329 *                           of bpf cgroup data
 330 * @ref: percpu ref counter structure
 331 */
 332static void cgroup_bpf_release_fn(struct percpu_ref *ref)
 333{
 334        struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
 335
 336        INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
 337        queue_work(system_wq, &cgrp->bpf.release_work);
 338}
 339
  340/* Get the underlying bpf_prog of a bpf_prog_list entry, regardless of whether
  341 * it is attached through a link or as a direct prog.
 342 */
 343static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
 344{
 345        if (pl->prog)
 346                return pl->prog;
 347        if (pl->link)
 348                return pl->link->link.prog;
 349        return NULL;
 350}
 351
 352/* count number of elements in the list.
 353 * it's slow but the list cannot be long
 354 */
 355static u32 prog_list_length(struct hlist_head *head)
 356{
 357        struct bpf_prog_list *pl;
 358        u32 cnt = 0;
 359
 360        hlist_for_each_entry(pl, head, node) {
 361                if (!prog_list_prog(pl))
 362                        continue;
 363                cnt++;
 364        }
 365        return cnt;
 366}
 367
 368/* if parent has non-overridable prog attached,
  369 * disallow attaching new programs to the descendant cgroup.
 370 * if parent has overridable or multi-prog, allow attaching
 371 */
 372static bool hierarchy_allows_attach(struct cgroup *cgrp,
 373                                    enum cgroup_bpf_attach_type atype)
 374{
 375        struct cgroup *p;
 376
 377        p = cgroup_parent(cgrp);
 378        if (!p)
 379                return true;
 380        do {
 381                u32 flags = p->bpf.flags[atype];
 382                u32 cnt;
 383
 384                if (flags & BPF_F_ALLOW_MULTI)
 385                        return true;
 386                cnt = prog_list_length(&p->bpf.progs[atype]);
 387                WARN_ON_ONCE(cnt > 1);
 388                if (cnt == 1)
 389                        return !!(flags & BPF_F_ALLOW_OVERRIDE);
 390                p = cgroup_parent(p);
 391        } while (p);
 392        return true;
 393}
 394
 395/* compute a chain of effective programs for a given cgroup:
 396 * start from the list of programs in this cgroup and add
 397 * all parent programs.
  398 * Note that parent's F_ALLOW_OVERRIDE-type program yields
  399 * to programs in this cgroup
 400 */
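/* E.g. with root (ALLOW_MULTI, prog A) -> child (ALLOW_MULTI, prog B), the
 * child's effective array is built as [B, A]: its own programs first, then
 * those of each ancestor, subject to the override/multi rules above.
 */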
 401static int compute_effective_progs(struct cgroup *cgrp,
 402                                   enum cgroup_bpf_attach_type atype,
 403                                   struct bpf_prog_array **array)
 404{
 405        struct bpf_prog_array_item *item;
 406        struct bpf_prog_array *progs;
 407        struct bpf_prog_list *pl;
 408        struct cgroup *p = cgrp;
 409        int cnt = 0;
 410
 411        /* count number of effective programs by walking parents */
 412        do {
 413                if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
 414                        cnt += prog_list_length(&p->bpf.progs[atype]);
 415                p = cgroup_parent(p);
 416        } while (p);
 417
 418        progs = bpf_prog_array_alloc(cnt, GFP_KERNEL);
 419        if (!progs)
 420                return -ENOMEM;
 421
 422        /* populate the array with effective progs */
 423        cnt = 0;
 424        p = cgrp;
 425        do {
 426                if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
 427                        continue;
 428
 429                hlist_for_each_entry(pl, &p->bpf.progs[atype], node) {
 430                        if (!prog_list_prog(pl))
 431                                continue;
 432
 433                        item = &progs->items[cnt];
 434                        item->prog = prog_list_prog(pl);
 435                        bpf_cgroup_storages_assign(item->cgroup_storage,
 436                                                   pl->storage);
 437                        cnt++;
 438                }
 439        } while ((p = cgroup_parent(p)));
 440
 441        *array = progs;
 442        return 0;
 443}
 444
 445static void activate_effective_progs(struct cgroup *cgrp,
 446                                     enum cgroup_bpf_attach_type atype,
 447                                     struct bpf_prog_array *old_array)
 448{
 449        old_array = rcu_replace_pointer(cgrp->bpf.effective[atype], old_array,
 450                                        lockdep_is_held(&cgroup_mutex));
 451        /* free prog array after grace period, since __cgroup_bpf_run_*()
 452         * might be still walking the array
 453         */
 454        bpf_prog_array_free(old_array);
 455}
 456
 457/**
 458 * cgroup_bpf_inherit() - inherit effective programs from parent
 459 * @cgrp: the cgroup to modify
 460 */
 461int cgroup_bpf_inherit(struct cgroup *cgrp)
  462{
  463/* has to use macro instead of const int, since compiler thinks
  464 * that array below is variable length
  465 */
 466#define NR ARRAY_SIZE(cgrp->bpf.effective)
 467        struct bpf_prog_array *arrays[NR] = {};
 468        struct cgroup *p;
 469        int ret, i;
 470
 471        ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
 472                              GFP_KERNEL);
 473        if (ret)
 474                return ret;
 475
 476        for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
 477                cgroup_bpf_get(p);
 478
 479        for (i = 0; i < NR; i++)
 480                INIT_HLIST_HEAD(&cgrp->bpf.progs[i]);
 481
 482        INIT_LIST_HEAD(&cgrp->bpf.storages);
 483
 484        for (i = 0; i < NR; i++)
 485                if (compute_effective_progs(cgrp, i, &arrays[i]))
 486                        goto cleanup;
 487
 488        for (i = 0; i < NR; i++)
 489                activate_effective_progs(cgrp, i, arrays[i]);
 490
 491        return 0;
 492cleanup:
 493        for (i = 0; i < NR; i++)
 494                bpf_prog_array_free(arrays[i]);
 495
 496        for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
 497                cgroup_bpf_put(p);
 498
 499        percpu_ref_exit(&cgrp->bpf.refcnt);
 500
 501        return -ENOMEM;
 502}
 503
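/* Recompute and swap the effective prog arrays of @cgrp and all of its live
 * descendants.  Either every descendant is switched to a freshly computed
 * array, or (on allocation failure) nothing is activated and the old arrays
 * remain in place.
 */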
 504static int update_effective_progs(struct cgroup *cgrp,
 505                                  enum cgroup_bpf_attach_type atype)
 506{
 507        struct cgroup_subsys_state *css;
 508        int err;
 509
 510        /* allocate and recompute effective prog arrays */
 511        css_for_each_descendant_pre(css, &cgrp->self) {
 512                struct cgroup *desc = container_of(css, struct cgroup, self);
 513
 514                if (percpu_ref_is_zero(&desc->bpf.refcnt))
 515                        continue;
 516
 517                err = compute_effective_progs(desc, atype, &desc->bpf.inactive);
 518                if (err)
 519                        goto cleanup;
 520        }
 521
 522        /* all allocations were successful. Activate all prog arrays */
 523        css_for_each_descendant_pre(css, &cgrp->self) {
 524                struct cgroup *desc = container_of(css, struct cgroup, self);
 525
 526                if (percpu_ref_is_zero(&desc->bpf.refcnt)) {
 527                        if (unlikely(desc->bpf.inactive)) {
 528                                bpf_prog_array_free(desc->bpf.inactive);
 529                                desc->bpf.inactive = NULL;
 530                        }
 531                        continue;
 532                }
 533
 534                activate_effective_progs(desc, atype, desc->bpf.inactive);
 535                desc->bpf.inactive = NULL;
 536        }
 537
 538        return 0;
 539
 540cleanup:
 541        /* oom while computing effective. Free all computed effective arrays
 542         * since they were not activated
 543         */
 544        css_for_each_descendant_pre(css, &cgrp->self) {
 545                struct cgroup *desc = container_of(css, struct cgroup, self);
 546
 547                bpf_prog_array_free(desc->bpf.inactive);
 548                desc->bpf.inactive = NULL;
 549        }
 550
 551        return err;
 552}
 553
 554#define BPF_CGROUP_MAX_PROGS 64
 555
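/* Find the list slot that an attach operation should use: NULL means a new
 * entry has to be appended, an existing entry means it will be replaced, and
 * ERR_PTR() reports an error (prog/link already attached, or the requested
 * replace target is not attached to this cgroup).
 */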
 556static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
 557                                               struct bpf_prog *prog,
 558                                               struct bpf_cgroup_link *link,
 559                                               struct bpf_prog *replace_prog,
 560                                               bool allow_multi)
 561{
 562        struct bpf_prog_list *pl;
 563
 564        /* single-attach case */
 565        if (!allow_multi) {
 566                if (hlist_empty(progs))
 567                        return NULL;
 568                return hlist_entry(progs->first, typeof(*pl), node);
 569        }
 570
 571        hlist_for_each_entry(pl, progs, node) {
 572                if (prog && pl->prog == prog && prog != replace_prog)
 573                        /* disallow attaching the same prog twice */
 574                        return ERR_PTR(-EINVAL);
 575                if (link && pl->link == link)
 576                        /* disallow attaching the same link twice */
 577                        return ERR_PTR(-EINVAL);
 578        }
 579
 580        /* direct prog multi-attach w/ replacement case */
 581        if (replace_prog) {
 582                hlist_for_each_entry(pl, progs, node) {
 583                        if (pl->prog == replace_prog)
 584                                /* a match found */
 585                                return pl;
 586                }
 587                /* prog to replace not found for cgroup */
 588                return ERR_PTR(-ENOENT);
 589        }
 590
 591        return NULL;
 592}
 593
 594/**
 595 * __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and
 596 *                         propagate the change to descendants
  597 * @cgrp: The cgroup whose descendants to traverse
 598 * @prog: A program to attach
 599 * @link: A link to attach
 600 * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set
 601 * @type: Type of attach operation
 602 * @flags: Option flags
 603 *
 604 * Exactly one of @prog or @link can be non-null.
 605 * Must be called with cgroup_mutex held.
 606 */
 607static int __cgroup_bpf_attach(struct cgroup *cgrp,
 608                               struct bpf_prog *prog, struct bpf_prog *replace_prog,
 609                               struct bpf_cgroup_link *link,
 610                               enum bpf_attach_type type, u32 flags)
 611{
 612        u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
 613        struct bpf_prog *old_prog = NULL;
 614        struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
 615        struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
 616        struct bpf_prog *new_prog = prog ? : link->link.prog;
 617        enum cgroup_bpf_attach_type atype;
 618        struct bpf_prog_list *pl;
 619        struct hlist_head *progs;
 620        int err;
 621
 622        if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
 623            ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI)))
 624                /* invalid combination */
 625                return -EINVAL;
 626        if (link && (prog || replace_prog))
 627                /* only either link or prog/replace_prog can be specified */
 628                return -EINVAL;
 629        if (!!replace_prog != !!(flags & BPF_F_REPLACE))
 630                /* replace_prog implies BPF_F_REPLACE, and vice versa */
 631                return -EINVAL;
 632
 633        atype = bpf_cgroup_atype_find(type, new_prog->aux->attach_btf_id);
 634        if (atype < 0)
 635                return -EINVAL;
 636
 637        progs = &cgrp->bpf.progs[atype];
 638
 639        if (!hierarchy_allows_attach(cgrp, atype))
 640                return -EPERM;
 641
 642        if (!hlist_empty(progs) && cgrp->bpf.flags[atype] != saved_flags)
 643                /* Disallow attaching non-overridable on top
 644                 * of existing overridable in this cgroup.
 645                 * Disallow attaching multi-prog if overridable or none
 646                 */
 647                return -EPERM;
 648
 649        if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
 650                return -E2BIG;
 651
 652        pl = find_attach_entry(progs, prog, link, replace_prog,
 653                               flags & BPF_F_ALLOW_MULTI);
 654        if (IS_ERR(pl))
 655                return PTR_ERR(pl);
 656
 657        if (bpf_cgroup_storages_alloc(storage, new_storage, type,
 658                                      prog ? : link->link.prog, cgrp))
 659                return -ENOMEM;
 660
 661        if (pl) {
 662                old_prog = pl->prog;
 663        } else {
 664                struct hlist_node *last = NULL;
 665
 666                pl = kmalloc(sizeof(*pl), GFP_KERNEL);
 667                if (!pl) {
 668                        bpf_cgroup_storages_free(new_storage);
 669                        return -ENOMEM;
 670                }
 671                if (hlist_empty(progs))
 672                        hlist_add_head(&pl->node, progs);
 673                else
 674                        hlist_for_each(last, progs) {
 675                                if (last->next)
 676                                        continue;
 677                                hlist_add_behind(&pl->node, last);
 678                                break;
 679                        }
 680        }
 681
 682        pl->prog = prog;
 683        pl->link = link;
 684        bpf_cgroup_storages_assign(pl->storage, storage);
 685        cgrp->bpf.flags[atype] = saved_flags;
 686
 687        if (type == BPF_LSM_CGROUP) {
 688                err = bpf_trampoline_link_cgroup_shim(new_prog, atype);
 689                if (err)
 690                        goto cleanup;
 691        }
 692
 693        err = update_effective_progs(cgrp, atype);
 694        if (err)
 695                goto cleanup_trampoline;
 696
 697        if (old_prog) {
 698                if (type == BPF_LSM_CGROUP)
 699                        bpf_trampoline_unlink_cgroup_shim(old_prog);
 700                bpf_prog_put(old_prog);
 701        } else {
 702                static_branch_inc(&cgroup_bpf_enabled_key[atype]);
 703        }
 704        bpf_cgroup_storages_link(new_storage, cgrp, type);
 705        return 0;
 706
 707cleanup_trampoline:
 708        if (type == BPF_LSM_CGROUP)
 709                bpf_trampoline_unlink_cgroup_shim(new_prog);
 710
 711cleanup:
 712        if (old_prog) {
 713                pl->prog = old_prog;
 714                pl->link = NULL;
 715        }
 716        bpf_cgroup_storages_free(new_storage);
 717        if (!old_prog) {
 718                hlist_del(&pl->node);
 719                kfree(pl);
 720        }
 721        return err;
 722}
 723
 724static int cgroup_bpf_attach(struct cgroup *cgrp,
 725                             struct bpf_prog *prog, struct bpf_prog *replace_prog,
 726                             struct bpf_cgroup_link *link,
 727                             enum bpf_attach_type type,
 728                             u32 flags)
 729{
 730        int ret;
 731
 732        mutex_lock(&cgroup_mutex);
 733        ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
 734        mutex_unlock(&cgroup_mutex);
 735        return ret;
 736}
 737
 738/* Swap updated BPF program for given link in effective program arrays across
 739 * all descendant cgroups. This function is guaranteed to succeed.
 740 */
 741static void replace_effective_prog(struct cgroup *cgrp,
 742                                   enum cgroup_bpf_attach_type atype,
 743                                   struct bpf_cgroup_link *link)
 744{
 745        struct bpf_prog_array_item *item;
 746        struct cgroup_subsys_state *css;
 747        struct bpf_prog_array *progs;
 748        struct bpf_prog_list *pl;
 749        struct hlist_head *head;
 750        struct cgroup *cg;
 751        int pos;
 752
 753        css_for_each_descendant_pre(css, &cgrp->self) {
 754                struct cgroup *desc = container_of(css, struct cgroup, self);
 755
 756                if (percpu_ref_is_zero(&desc->bpf.refcnt))
 757                        continue;
 758
 759                /* find position of link in effective progs array */
 760                for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
 761                        if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
 762                                continue;
 763
 764                        head = &cg->bpf.progs[atype];
 765                        hlist_for_each_entry(pl, head, node) {
 766                                if (!prog_list_prog(pl))
 767                                        continue;
 768                                if (pl->link == link)
 769                                        goto found;
 770                                pos++;
 771                        }
 772                }
 773found:
 774                BUG_ON(!cg);
 775                progs = rcu_dereference_protected(
 776                                desc->bpf.effective[atype],
 777                                lockdep_is_held(&cgroup_mutex));
 778                item = &progs->items[pos];
 779                WRITE_ONCE(item->prog, link->link.prog);
 780        }
 781}
 782
 783/**
 784 * __cgroup_bpf_replace() - Replace link's program and propagate the change
 785 *                          to descendants
  786 * @cgrp: The cgroup whose descendants to traverse
 787 * @link: A link for which to replace BPF program
  788 * @new_prog: The BPF program that replaces the link's current program
 789 *
 790 * Must be called with cgroup_mutex held.
 791 */
 792static int __cgroup_bpf_replace(struct cgroup *cgrp,
 793                                struct bpf_cgroup_link *link,
 794                                struct bpf_prog *new_prog)
 795{
 796        enum cgroup_bpf_attach_type atype;
 797        struct bpf_prog *old_prog;
 798        struct bpf_prog_list *pl;
 799        struct hlist_head *progs;
 800        bool found = false;
 801
 802        atype = bpf_cgroup_atype_find(link->type, new_prog->aux->attach_btf_id);
 803        if (atype < 0)
 804                return -EINVAL;
 805
 806        progs = &cgrp->bpf.progs[atype];
 807
 808        if (link->link.prog->type != new_prog->type)
 809                return -EINVAL;
 810
 811        hlist_for_each_entry(pl, progs, node) {
 812                if (pl->link == link) {
 813                        found = true;
 814                        break;
 815                }
 816        }
 817        if (!found)
 818                return -ENOENT;
 819
 820        old_prog = xchg(&link->link.prog, new_prog);
 821        replace_effective_prog(cgrp, atype, link);
 822        bpf_prog_put(old_prog);
 823        return 0;
 824}
 825
 826static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog,
 827                              struct bpf_prog *old_prog)
 828{
 829        struct bpf_cgroup_link *cg_link;
 830        int ret;
 831
 832        cg_link = container_of(link, struct bpf_cgroup_link, link);
 833
 834        mutex_lock(&cgroup_mutex);
 835        /* link might have been auto-released by dying cgroup, so fail */
 836        if (!cg_link->cgroup) {
 837                ret = -ENOLINK;
 838                goto out_unlock;
 839        }
 840        if (old_prog && link->prog != old_prog) {
 841                ret = -EPERM;
 842                goto out_unlock;
 843        }
 844        ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog);
 845out_unlock:
 846        mutex_unlock(&cgroup_mutex);
 847        return ret;
 848}
 849
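/* Find the bpf_prog_list entry to detach.  Without BPF_F_ALLOW_MULTI the
 * single attached entry is returned (a NULL @prog is allowed for legacy
 * detach); with multi-attach an exact @prog/@link match is required.
 */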
 850static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs,
 851                                               struct bpf_prog *prog,
 852                                               struct bpf_cgroup_link *link,
 853                                               bool allow_multi)
 854{
 855        struct bpf_prog_list *pl;
 856
 857        if (!allow_multi) {
 858                if (hlist_empty(progs))
 859                        /* report error when trying to detach and nothing is attached */
 860                        return ERR_PTR(-ENOENT);
 861
 862                /* to maintain backward compatibility NONE and OVERRIDE cgroups
 863                 * allow detaching with invalid FD (prog==NULL) in legacy mode
 864                 */
 865                return hlist_entry(progs->first, typeof(*pl), node);
 866        }
 867
 868        if (!prog && !link)
 869                /* to detach MULTI prog the user has to specify valid FD
 870                 * of the program or link to be detached
 871                 */
 872                return ERR_PTR(-EINVAL);
 873
 874        /* find the prog or link and detach it */
 875        hlist_for_each_entry(pl, progs, node) {
 876                if (pl->prog == prog && pl->link == link)
 877                        return pl;
 878        }
 879        return ERR_PTR(-ENOENT);
 880}
 881
 882/**
  883 * purge_effective_progs() - Recover after compute_effective_progs() fails to
  884 *                           allocate a new cgrp->bpf.inactive table, by
  885 *                           recomputing the effective arrays in place.
  886 *
  887 * @cgrp: The cgroup whose descendants to traverse
 888 * @prog: A program to detach or NULL
 889 * @link: A link to detach or NULL
 890 * @atype: Type of detach operation
 891 */
 892static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
 893                                  struct bpf_cgroup_link *link,
 894                                  enum cgroup_bpf_attach_type atype)
 895{
 896        struct cgroup_subsys_state *css;
 897        struct bpf_prog_array *progs;
 898        struct bpf_prog_list *pl;
 899        struct hlist_head *head;
 900        struct cgroup *cg;
 901        int pos;
 902
 903        /* recompute effective prog array in place */
 904        css_for_each_descendant_pre(css, &cgrp->self) {
 905                struct cgroup *desc = container_of(css, struct cgroup, self);
 906
 907                if (percpu_ref_is_zero(&desc->bpf.refcnt))
 908                        continue;
 909
 910                /* find position of link or prog in effective progs array */
 911                for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
 912                        if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
 913                                continue;
 914
 915                        head = &cg->bpf.progs[atype];
 916                        hlist_for_each_entry(pl, head, node) {
 917                                if (!prog_list_prog(pl))
 918                                        continue;
 919                                if (pl->prog == prog && pl->link == link)
 920                                        goto found;
 921                                pos++;
 922                        }
 923                }
 924
 925                /* no link or prog match, skip the cgroup of this layer */
 926                continue;
 927found:
 928                progs = rcu_dereference_protected(
 929                                desc->bpf.effective[atype],
 930                                lockdep_is_held(&cgroup_mutex));
 931
 932                /* Remove the program from the array */
 933                WARN_ONCE(bpf_prog_array_delete_safe_at(progs, pos),
 934                          "Failed to purge a prog from array at index %d", pos);
 935        }
 936}
 937
 938/**
 939 * __cgroup_bpf_detach() - Detach the program or link from a cgroup, and
 940 *                         propagate the change to descendants
  941 * @cgrp: The cgroup whose descendants to traverse
 942 * @prog: A program to detach or NULL
 943 * @link: A link to detach or NULL
 944 * @type: Type of detach operation
 945 *
 946 * At most one of @prog or @link can be non-NULL.
 947 * Must be called with cgroup_mutex held.
 948 */
 949static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 950                               struct bpf_cgroup_link *link, enum bpf_attach_type type)
 951{
 952        enum cgroup_bpf_attach_type atype;
 953        struct bpf_prog *old_prog;
 954        struct bpf_prog_list *pl;
 955        struct hlist_head *progs;
 956        u32 attach_btf_id = 0;
 957        u32 flags;
 958
 959        if (prog)
 960                attach_btf_id = prog->aux->attach_btf_id;
 961        if (link)
 962                attach_btf_id = link->link.prog->aux->attach_btf_id;
 963
 964        atype = bpf_cgroup_atype_find(type, attach_btf_id);
 965        if (atype < 0)
 966                return -EINVAL;
 967
 968        progs = &cgrp->bpf.progs[atype];
 969        flags = cgrp->bpf.flags[atype];
 970
 971        if (prog && link)
 972                /* only one of prog or link can be specified */
 973                return -EINVAL;
 974
 975        pl = find_detach_entry(progs, prog, link, flags & BPF_F_ALLOW_MULTI);
 976        if (IS_ERR(pl))
 977                return PTR_ERR(pl);
 978
 979        /* mark it deleted, so it's ignored while recomputing effective */
 980        old_prog = pl->prog;
 981        pl->prog = NULL;
 982        pl->link = NULL;
 983
 984        if (update_effective_progs(cgrp, atype)) {
 985                /* if update effective array failed replace the prog with a dummy prog*/
 986                pl->prog = old_prog;
 987                pl->link = link;
 988                purge_effective_progs(cgrp, old_prog, link, atype);
 989        }
 990
 991        /* now can actually delete it from this cgroup list */
 992        hlist_del(&pl->node);
 993
 994        kfree(pl);
 995        if (hlist_empty(progs))
 996                /* last program was detached, reset flags to zero */
 997                cgrp->bpf.flags[atype] = 0;
 998        if (old_prog) {
 999                if (type == BPF_LSM_CGROUP)
1000                        bpf_trampoline_unlink_cgroup_shim(old_prog);
1001                bpf_prog_put(old_prog);
1002        }
1003        static_branch_dec(&cgroup_bpf_enabled_key[atype]);
1004        return 0;
1005}
1006
1007static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
1008                             enum bpf_attach_type type)
1009{
1010        int ret;
1011
1012        mutex_lock(&cgroup_mutex);
1013        ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
1014        mutex_unlock(&cgroup_mutex);
1015        return ret;
1016}
1017
1018/* Must be called with cgroup_mutex held to avoid races. */
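/* Implements BPF_PROG_QUERY for cgroups: report either the directly attached
 * programs or, with BPF_F_QUERY_EFFECTIVE, the effective array.  For
 * BPF_LSM_CGROUP every slot in CGROUP_LSM_START..CGROUP_LSM_END is walked.
 */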
1019static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
1020                              union bpf_attr __user *uattr)
1021{
1022        __u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
1023        bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE;
1024        __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
1025        enum bpf_attach_type type = attr->query.attach_type;
1026        enum cgroup_bpf_attach_type from_atype, to_atype;
1027        enum cgroup_bpf_attach_type atype;
1028        struct bpf_prog_array *effective;
1029        int cnt, ret = 0, i;
1030        int total_cnt = 0;
1031        u32 flags;
1032
1033        if (effective_query && prog_attach_flags)
1034                return -EINVAL;
1035
1036        if (type == BPF_LSM_CGROUP) {
1037                if (!effective_query && attr->query.prog_cnt &&
1038                    prog_ids && !prog_attach_flags)
1039                        return -EINVAL;
1040
1041                from_atype = CGROUP_LSM_START;
1042                to_atype = CGROUP_LSM_END;
1043                flags = 0;
1044        } else {
1045                from_atype = to_cgroup_bpf_attach_type(type);
1046                if (from_atype < 0)
1047                        return -EINVAL;
1048                to_atype = from_atype;
1049                flags = cgrp->bpf.flags[from_atype];
1050        }
1051
1052        for (atype = from_atype; atype <= to_atype; atype++) {
1053                if (effective_query) {
1054                        effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
1055                                                              lockdep_is_held(&cgroup_mutex));
1056                        total_cnt += bpf_prog_array_length(effective);
1057                } else {
1058                        total_cnt += prog_list_length(&cgrp->bpf.progs[atype]);
1059                }
1060        }
1061
1062        /* always output uattr->query.attach_flags as 0 during effective query */
1063        flags = effective_query ? 0 : flags;
1064        if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
1065                return -EFAULT;
1066        if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt)))
1067                return -EFAULT;
1068        if (attr->query.prog_cnt == 0 || !prog_ids || !total_cnt)
1069                /* return early if user requested only program count + flags */
1070                return 0;
1071
1072        if (attr->query.prog_cnt < total_cnt) {
1073                total_cnt = attr->query.prog_cnt;
1074                ret = -ENOSPC;
1075        }
1076
1077        for (atype = from_atype; atype <= to_atype && total_cnt; atype++) {
1078                if (effective_query) {
1079                        effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
1080                                                              lockdep_is_held(&cgroup_mutex));
1081                        cnt = min_t(int, bpf_prog_array_length(effective), total_cnt);
1082                        ret = bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
1083                } else {
1084                        struct hlist_head *progs;
1085                        struct bpf_prog_list *pl;
1086                        struct bpf_prog *prog;
1087                        u32 id;
1088
1089                        progs = &cgrp->bpf.progs[atype];
1090                        cnt = min_t(int, prog_list_length(progs), total_cnt);
1091                        i = 0;
1092                        hlist_for_each_entry(pl, progs, node) {
1093                                prog = prog_list_prog(pl);
1094                                id = prog->aux->id;
1095                                if (copy_to_user(prog_ids + i, &id, sizeof(id)))
1096                                        return -EFAULT;
1097                                if (++i == cnt)
1098                                        break;
1099                        }
1100
1101                        if (prog_attach_flags) {
1102                                flags = cgrp->bpf.flags[atype];
1103
1104                                for (i = 0; i < cnt; i++)
1105                                        if (copy_to_user(prog_attach_flags + i,
1106                                                         &flags, sizeof(flags)))
1107                                                return -EFAULT;
1108                                prog_attach_flags += cnt;
1109                        }
1110                }
1111
1112                prog_ids += cnt;
1113                total_cnt -= cnt;
1114        }
1115        return ret;
1116}
1117
1118static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
1119                            union bpf_attr __user *uattr)
1120{
1121        int ret;
1122
1123        mutex_lock(&cgroup_mutex);
1124        ret = __cgroup_bpf_query(cgrp, attr, uattr);
1125        mutex_unlock(&cgroup_mutex);
1126        return ret;
1127}
1128
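/* Attach path of the BPF_PROG_ATTACH syscall command for cgroups: resolve the
 * target cgroup (and, with BPF_F_REPLACE, the program to replace) from the
 * supplied FDs and forward to cgroup_bpf_attach().
 */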
1129int cgroup_bpf_prog_attach(const union bpf_attr *attr,
1130                           enum bpf_prog_type ptype, struct bpf_prog *prog)
1131{
1132        struct bpf_prog *replace_prog = NULL;
1133        struct cgroup *cgrp;
1134        int ret;
1135
1136        cgrp = cgroup_get_from_fd(attr->target_fd);
1137        if (IS_ERR(cgrp))
1138                return PTR_ERR(cgrp);
1139
1140        if ((attr->attach_flags & BPF_F_ALLOW_MULTI) &&
1141            (attr->attach_flags & BPF_F_REPLACE)) {
1142                replace_prog = bpf_prog_get_type(attr->replace_bpf_fd, ptype);
1143                if (IS_ERR(replace_prog)) {
1144                        cgroup_put(cgrp);
1145                        return PTR_ERR(replace_prog);
1146                }
1147        }
1148
1149        ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL,
1150                                attr->attach_type, attr->attach_flags);
1151
1152        if (replace_prog)
1153                bpf_prog_put(replace_prog);
1154        cgroup_put(cgrp);
1155        return ret;
1156}
1157
1158int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
1159{
1160        struct bpf_prog *prog;
1161        struct cgroup *cgrp;
1162        int ret;
1163
1164        cgrp = cgroup_get_from_fd(attr->target_fd);
1165        if (IS_ERR(cgrp))
1166                return PTR_ERR(cgrp);
1167
1168        prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
1169        if (IS_ERR(prog))
1170                prog = NULL;
1171
1172        ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type);
1173        if (prog)
1174                bpf_prog_put(prog);
1175
1176        cgroup_put(cgrp);
1177        return ret;
1178}
1179
1180static void bpf_cgroup_link_release(struct bpf_link *link)
1181{
1182        struct bpf_cgroup_link *cg_link =
1183                container_of(link, struct bpf_cgroup_link, link);
1184        struct cgroup *cg;
1185
1186        /* link might have been auto-detached by dying cgroup already,
1187         * in that case our work is done here
1188         */
1189        if (!cg_link->cgroup)
1190                return;
1191
1192        mutex_lock(&cgroup_mutex);
1193
1194        /* re-check cgroup under lock again */
1195        if (!cg_link->cgroup) {
1196                mutex_unlock(&cgroup_mutex);
1197                return;
1198        }
1199
1200        WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link,
1201                                    cg_link->type));
1202        if (cg_link->type == BPF_LSM_CGROUP)
1203                bpf_trampoline_unlink_cgroup_shim(cg_link->link.prog);
1204
1205        cg = cg_link->cgroup;
1206        cg_link->cgroup = NULL;
1207
1208        mutex_unlock(&cgroup_mutex);
1209
1210        cgroup_put(cg);
1211}
1212
1213static void bpf_cgroup_link_dealloc(struct bpf_link *link)
1214{
1215        struct bpf_cgroup_link *cg_link =
1216                container_of(link, struct bpf_cgroup_link, link);
1217
1218        kfree(cg_link);
1219}
1220
1221static int bpf_cgroup_link_detach(struct bpf_link *link)
1222{
1223        bpf_cgroup_link_release(link);
1224
1225        return 0;
1226}
1227
1228static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link,
1229                                        struct seq_file *seq)
1230{
1231        struct bpf_cgroup_link *cg_link =
1232                container_of(link, struct bpf_cgroup_link, link);
1233        u64 cg_id = 0;
1234
1235        mutex_lock(&cgroup_mutex);
1236        if (cg_link->cgroup)
1237                cg_id = cgroup_id(cg_link->cgroup);
1238        mutex_unlock(&cgroup_mutex);
1239
1240        seq_printf(seq,
1241                   "cgroup_id:\t%llu\n"
1242                   "attach_type:\t%d\n",
1243                   cg_id,
1244                   cg_link->type);
1245}
1246
1247static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link,
1248                                          struct bpf_link_info *info)
1249{
1250        struct bpf_cgroup_link *cg_link =
1251                container_of(link, struct bpf_cgroup_link, link);
1252        u64 cg_id = 0;
1253
1254        mutex_lock(&cgroup_mutex);
1255        if (cg_link->cgroup)
1256                cg_id = cgroup_id(cg_link->cgroup);
1257        mutex_unlock(&cgroup_mutex);
1258
1259        info->cgroup.cgroup_id = cg_id;
1260        info->cgroup.attach_type = cg_link->type;
1261        return 0;
1262}
1263
1264static const struct bpf_link_ops bpf_cgroup_link_lops = {
1265        .release = bpf_cgroup_link_release,
1266        .dealloc = bpf_cgroup_link_dealloc,
1267        .detach = bpf_cgroup_link_detach,
1268        .update_prog = cgroup_bpf_replace,
1269        .show_fdinfo = bpf_cgroup_link_show_fdinfo,
1270        .fill_link_info = bpf_cgroup_link_fill_link_info,
1271};
1272
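/* BPF_LINK_CREATE path for cgroup links: allocate a bpf_cgroup_link, prime it,
 * and attach it with BPF_F_ALLOW_MULTI semantics.  The link holds a reference
 * on the cgroup until it is released or auto-detached when the cgroup dies.
 */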
1273int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
1274{
1275        struct bpf_link_primer link_primer;
1276        struct bpf_cgroup_link *link;
1277        struct cgroup *cgrp;
1278        int err;
1279
1280        if (attr->link_create.flags)
1281                return -EINVAL;
1282
1283        cgrp = cgroup_get_from_fd(attr->link_create.target_fd);
1284        if (IS_ERR(cgrp))
1285                return PTR_ERR(cgrp);
1286
1287        link = kzalloc(sizeof(*link), GFP_USER);
1288        if (!link) {
1289                err = -ENOMEM;
1290                goto out_put_cgroup;
1291        }
1292        bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops,
1293                      prog);
1294        link->cgroup = cgrp;
1295        link->type = attr->link_create.attach_type;
1296
1297        err = bpf_link_prime(&link->link, &link_primer);
1298        if (err) {
1299                kfree(link);
1300                goto out_put_cgroup;
1301        }
1302
1303        err = cgroup_bpf_attach(cgrp, NULL, NULL, link,
1304                                link->type, BPF_F_ALLOW_MULTI);
1305        if (err) {
1306                bpf_link_cleanup(&link_primer);
1307                goto out_put_cgroup;
1308        }
1309
1310        return bpf_link_settle(&link_primer);
1311
1312out_put_cgroup:
1313        cgroup_put(cgrp);
1314        return err;
1315}
1316
1317int cgroup_bpf_prog_query(const union bpf_attr *attr,
1318                          union bpf_attr __user *uattr)
1319{
1320        struct cgroup *cgrp;
1321        int ret;
1322
1323        cgrp = cgroup_get_from_fd(attr->query.target_fd);
1324        if (IS_ERR(cgrp))
1325                return PTR_ERR(cgrp);
1326
1327        ret = cgroup_bpf_query(cgrp, attr, uattr);
1328
1329        cgroup_put(cgrp);
1330        return ret;
1331}
1332
1333/**
1334 * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
1335 * @sk: The socket sending or receiving traffic
1336 * @skb: The skb that is being sent or received
 1337 * @atype: The type of program to be executed
1338 *
1339 * If no socket is passed, or the socket is not of type INET or INET6,
1340 * this function does nothing and returns 0.
1341 *
1342 * The program type passed in via @type must be suitable for network
1343 * filtering. No further check is performed to assert that.
1344 *
1345 * For egress packets, this function can return:
1346 *   NET_XMIT_SUCCESS    (0)    - continue with packet output
1347 *   NET_XMIT_DROP       (1)    - drop packet and notify TCP to call cwr
1348 *   NET_XMIT_CN         (2)    - continue with packet output and notify TCP
1349 *                                to call cwr
1350 *   -err                       - drop packet
1351 *
 1352 * For ingress packets, this function will return -EPERM if any
 1353 * attached program was found and it returned a value != 1 during execution.
 1354 * Otherwise 0 is returned.
1355 */
1356int __cgroup_bpf_run_filter_skb(struct sock *sk,
1357                                struct sk_buff *skb,
1358                                enum cgroup_bpf_attach_type atype)
1359{
1360        unsigned int offset = skb->data - skb_network_header(skb);
1361        struct sock *save_sk;
1362        void *saved_data_end;
1363        struct cgroup *cgrp;
1364        int ret;
1365
1366        if (!sk || !sk_fullsock(sk))
1367                return 0;
1368
1369        if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
1370                return 0;
1371
1372        cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1373        save_sk = skb->sk;
1374        skb->sk = sk;
1375        __skb_push(skb, offset);
1376
1377        /* compute pointers for the bpf prog */
1378        bpf_compute_and_save_data_end(skb, &saved_data_end);
1379
1380        if (atype == CGROUP_INET_EGRESS) {
1381                u32 flags = 0;
1382                bool cn;
1383
1384                ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, skb,
1385                                            __bpf_prog_run_save_cb, 0, &flags);
1386
1387                /* Return values of CGROUP EGRESS BPF programs are:
1388                 *   0: drop packet
1389                 *   1: keep packet
1390                 *   2: drop packet and cn
1391                 *   3: keep packet and cn
1392                 *
1393                 * The returned value is then converted to one of the NET_XMIT
1394                 * or an error code that is then interpreted as drop packet
1395                 * (and no cn):
1396                 *   0: NET_XMIT_SUCCESS  skb should be transmitted
1397                 *   1: NET_XMIT_DROP     skb should be dropped and cn
1398                 *   2: NET_XMIT_CN       skb should be transmitted and cn
1399                 *   3: -err              skb should be dropped
1400                 */
1401
1402                cn = flags & BPF_RET_SET_CN;
1403                if (ret && !IS_ERR_VALUE((long)ret))
1404                        ret = -EFAULT;
1405                if (!ret)
1406                        ret = (cn ? NET_XMIT_CN : NET_XMIT_SUCCESS);
1407                else
1408                        ret = (cn ? NET_XMIT_DROP : ret);
1409        } else {
1410                ret = bpf_prog_run_array_cg(&cgrp->bpf, atype,
1411                                            skb, __bpf_prog_run_save_cb, 0,
1412                                            NULL);
1413                if (ret && !IS_ERR_VALUE((long)ret))
1414                        ret = -EFAULT;
1415        }
1416        bpf_restore_data_end(skb, saved_data_end);
1417        __skb_pull(skb, offset);
1418        skb->sk = save_sk;
1419
1420        return ret;
1421}
1422EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
1423
1424/**
1425 * __cgroup_bpf_run_filter_sk() - Run a program on a sock
1426 * @sk: sock structure to manipulate
 1427 * @atype: The type of program to be executed
1428 *
 1429 * The socket passed is expected to be of type INET or INET6.
1430 *
1431 * The program type passed in via @type must be suitable for sock
1432 * filtering. No further check is performed to assert that.
1433 *
 1434 * This function will return %-EPERM if an attached program was found
 1435 * and it returned a value != 1 during execution. In all other cases, 0 is returned.
1436 */
1437int __cgroup_bpf_run_filter_sk(struct sock *sk,
1438                               enum cgroup_bpf_attach_type atype)
1439{
1440        struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1441
1442        return bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0,
1443                                     NULL);
1444}
1445EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
1446
1447/**
 1448 * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and
 1449 *                                       a sockaddr provided by the user
1450 * @sk: sock struct that will use sockaddr
1451 * @uaddr: sockaddr struct provided by user
 1452 * @atype: The type of program to be executed
1453 * @t_ctx: Pointer to attach type specific context
1454 * @flags: Pointer to u32 which contains higher bits of BPF program
1455 *         return value (OR'ed together).
1456 *
 1457 * The socket is expected to be of type INET or INET6.
 1458 *
 1459 * This function will return %-EPERM if an attached program is found and
 1460 * returned a value != 1 during execution. In all other cases, 0 is returned.
1461 */
1462int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
1463                                      struct sockaddr *uaddr,
1464                                      enum cgroup_bpf_attach_type atype,
1465                                      void *t_ctx,
1466                                      u32 *flags)
1467{
1468        struct bpf_sock_addr_kern ctx = {
1469                .sk = sk,
1470                .uaddr = uaddr,
1471                .t_ctx = t_ctx,
1472        };
1473        struct sockaddr_storage unspec;
1474        struct cgroup *cgrp;
1475
1476        /* Check socket family since not all sockets represent a network
1477         * endpoint (e.g. AF_UNIX).
1478         */
1479        if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
1480                return 0;
1481
1482        if (!ctx.uaddr) {
1483                memset(&unspec, 0, sizeof(unspec));
1484                ctx.uaddr = (struct sockaddr *)&unspec;
1485        }
1486
1487        cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1488        return bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run,
1489                                     0, flags);
1490}
1491EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
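/* Illustrative sketch, not part of this file: a connect4 program consuming
 * the bpf_sock_addr context built above. Rewriting user_ip4/user_port (both
 * in network byte order) is the typical use; the addresses and ports here are
 * made up. Assumes libbpf SEC() naming and the bpf_endian.h helpers.
 *
 *   SEC("cgroup/connect4")
 *   int redirect_redis(struct bpf_sock_addr *ctx)
 *   {
 *           if (ctx->user_port == bpf_htons(6379)) {
 *                   ctx->user_ip4 = bpf_htonl(0x7f000001);  // 127.0.0.1
 *                   ctx->user_port = bpf_htons(16379);
 *           }
 *           return 1;       // let the (possibly rewritten) connect proceed
 *   }
 */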
1492
1493/**
1494 * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock
1495 * @sk: socket to get cgroup from
1496 * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
1497 * sk with connection information (IP addresses, etc.). May not contain
1498 * cgroup info if it is a req sock.
1499 * @atype: The type of program to be executed
1500 *
1501 * The socket passed is expected to be of type INET or INET6.
1502 *
1503 * The program type passed in via @atype must be suitable for sock_ops
1504 * filtering. No further check is performed to assert that.
1505 *
1506 * This function will return %-EPERM if an attached program was found and
1507 * it returned != 1 during execution. In all other cases, 0 is returned.
1508 */
1509int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
1510                                     struct bpf_sock_ops_kern *sock_ops,
1511                                     enum cgroup_bpf_attach_type atype)
1512{
1513        struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1514
1515        return bpf_prog_run_array_cg(&cgrp->bpf, atype, sock_ops, bpf_prog_run,
1516                                     0, NULL);
1517}
1518EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
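/* Illustrative sketch, not part of this file: a sock_ops program run via the
 * hook above. For ops such as BPF_SOCK_OPS_RWND_INIT the program writes its
 * answer into skops->reply and returns 1; the window value here is made up.
 * Assumes libbpf SEC() naming.
 *
 *   SEC("sockops")
 *   int clamp_init_rwnd(struct bpf_sock_ops *skops)
 *   {
 *           if (skops->op == BPF_SOCK_OPS_RWND_INIT)
 *                   skops->reply = 40;      // suggested initial rcv window
 *           return 1;                       // != 1 is reported as -EPERM
 *   }
 */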
1519
1520int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
1521                                      short access, enum cgroup_bpf_attach_type atype)
1522{
1523        struct cgroup *cgrp;
1524        struct bpf_cgroup_dev_ctx ctx = {
1525                .access_type = (access << 16) | dev_type,
1526                .major = major,
1527                .minor = minor,
1528        };
1529        int ret;
1530
1531        rcu_read_lock();
1532        cgrp = task_dfl_cgroup(current);
1533        ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0,
1534                                    NULL);
1535        rcu_read_unlock();
1536
1537        return ret;
1538}
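/* Illustrative sketch, not part of this file: a BPF_CGROUP_DEVICE program
 * consuming the bpf_cgroup_dev_ctx built above; access_type packs the access
 * mask in the upper 16 bits and the device type in the lower 16. Assumes
 * libbpf SEC() naming.
 *
 *   SEC("cgroup/dev")
 *   int deny_null_write(struct bpf_cgroup_dev_ctx *ctx)
 *   {
 *           int type = ctx->access_type & 0xFFFF;
 *           int access = ctx->access_type >> 16;
 *
 *           if (type == BPF_DEVCG_DEV_CHAR && (access & BPF_DEVCG_ACC_WRITE) &&
 *               ctx->major == 1 && ctx->minor == 3)
 *                   return 0;       // deny writes to /dev/null (char 1:3)
 *           return 1;               // allow everything else
 *   }
 */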
1539
1540BPF_CALL_0(bpf_get_retval)
1541{
1542        struct bpf_cg_run_ctx *ctx =
1543                container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
1544
1545        return ctx->retval;
1546}
1547
1548const struct bpf_func_proto bpf_get_retval_proto = {
1549        .func           = bpf_get_retval,
1550        .gpl_only       = false,
1551        .ret_type       = RET_INTEGER,
1552};
1553
1554BPF_CALL_1(bpf_set_retval, int, retval)
1555{
1556        struct bpf_cg_run_ctx *ctx =
1557                container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
1558
1559        ctx->retval = retval;
1560        return 0;
1561}
1562
1563const struct bpf_func_proto bpf_set_retval_proto = {
1564        .func           = bpf_set_retval,
1565        .gpl_only       = false,
1566        .ret_type       = RET_INTEGER,
1567        .arg1_type      = ARG_ANYTHING,
1568};
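/* Illustrative sketch, not part of this file: using bpf_set_retval() so a
 * rejection surfaces as a specific errno instead of the default -EPERM that
 * bpf_prog_run_array_cg() fills in. Shown for a cgroup/dev program, which
 * gets the helper via cgroup_base_func_proto(). Assumes libbpf SEC() naming.
 *
 *   SEC("cgroup/dev")
 *   int deny_with_eacces(struct bpf_cgroup_dev_ctx *ctx)
 *   {
 *           bpf_set_retval(-EACCES); // already an errno, so it is preserved
 *           return 0;                // reject the device access
 *   }
 */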
1569
1570static const struct bpf_func_proto *
1571cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
1572{
1573        switch (func_id) {
1574        case BPF_FUNC_get_current_uid_gid:
1575                return &bpf_get_current_uid_gid_proto;
1576        case BPF_FUNC_get_local_storage:
1577                return &bpf_get_local_storage_proto;
1578        case BPF_FUNC_get_current_cgroup_id:
1579                return &bpf_get_current_cgroup_id_proto;
1580        case BPF_FUNC_perf_event_output:
1581                return &bpf_event_output_data_proto;
1582        case BPF_FUNC_get_retval:
1583                return &bpf_get_retval_proto;
1584        case BPF_FUNC_set_retval:
1585                return &bpf_set_retval_proto;
1586        default:
1587                return bpf_base_func_proto(func_id);
1588        }
1589}
1590
1591static const struct bpf_func_proto *
1592cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
1593{
1594        return cgroup_base_func_proto(func_id, prog);
1595}
1596
1597static bool cgroup_dev_is_valid_access(int off, int size,
1598                                       enum bpf_access_type type,
1599                                       const struct bpf_prog *prog,
1600                                       struct bpf_insn_access_aux *info)
1601{
1602        const int size_default = sizeof(__u32);
1603
1604        if (type == BPF_WRITE)
1605                return false;
1606
1607        if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx))
1608                return false;
1609        /* The verifier guarantees that size > 0. */
1610        if (off % size != 0)
1611                return false;
1612
1613        switch (off) {
1614        case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type):
1615                bpf_ctx_record_field_size(info, size_default);
1616                if (!bpf_ctx_narrow_access_ok(off, size, size_default))
1617                        return false;
1618                break;
1619        default:
1620                if (size != size_default)
1621                        return false;
1622        }
1623
1624        return true;
1625}
1626
1627const struct bpf_prog_ops cg_dev_prog_ops = {
1628};
1629
1630const struct bpf_verifier_ops cg_dev_verifier_ops = {
1631        .get_func_proto         = cgroup_dev_func_proto,
1632        .is_valid_access        = cgroup_dev_is_valid_access,
1633};
1634
1635/**
1636 * __cgroup_bpf_run_filter_sysctl() - Run a program on sysctl
1637 *
1638 * @head: sysctl table header
1639 * @table: sysctl table
1640 * @write: sysctl is being read (= 0) or written (= 1)
1641 * @buf: pointer to buffer (in and out)
1642 * @pcount: value-result argument: value is size of buffer pointed to by @buf,
1643 *      result is size of the new value if the program set one, initial value
1644 *      otherwise
1645 * @ppos: value-result argument: value is position at which read from or write
1646 *      to sysctl is happening, result is new position if program overrode it,
1647 *      initial value otherwise
1648 * @atype: type of program to be executed
1649 *
1650 * Program is run when sysctl is being accessed, either read or written, and
1651 * can allow or deny such access.
1652 *
1653 * This function will return %-EPERM if an attached program is found and
1654 * its return value was != 1 during execution. In all other cases 0 is returned.
1655 */
1656int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
1657                                   struct ctl_table *table, int write,
1658                                   char **buf, size_t *pcount, loff_t *ppos,
1659                                   enum cgroup_bpf_attach_type atype)
1660{
1661        struct bpf_sysctl_kern ctx = {
1662                .head = head,
1663                .table = table,
1664                .write = write,
1665                .ppos = ppos,
1666                .cur_val = NULL,
1667                .cur_len = PAGE_SIZE,
1668                .new_val = NULL,
1669                .new_len = 0,
1670                .new_updated = 0,
1671        };
1672        struct cgroup *cgrp;
1673        loff_t pos = 0;
1674        int ret;
1675
1676        ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL);
1677        if (!ctx.cur_val ||
1678            table->proc_handler(table, 0, ctx.cur_val, &ctx.cur_len, &pos)) {
1679                /* Let BPF program decide how to proceed. */
1680                ctx.cur_len = 0;
1681        }
1682
1683        if (write && *buf && *pcount) {
1684                /* BPF program should be able to override the new value with a
1685                 * buffer bigger than the one provided by user.
1686                 */
1687                ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL);
1688                ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount);
1689                if (ctx.new_val) {
1690                        memcpy(ctx.new_val, *buf, ctx.new_len);
1691                } else {
1692                        /* Let BPF program decide how to proceed. */
1693                        ctx.new_len = 0;
1694                }
1695        }
1696
1697        rcu_read_lock();
1698        cgrp = task_dfl_cgroup(current);
1699        ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0,
1700                                    NULL);
1701        rcu_read_unlock();
1702
1703        kfree(ctx.cur_val);
1704
1705        if (ret == 1 && ctx.new_updated) {
1706                kfree(*buf);
1707                *buf = ctx.new_val;
1708                *pcount = ctx.new_len;
1709        } else {
1710                kfree(ctx.new_val);
1711        }
1712
1713        return ret;
1714}
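/* Illustrative sketch, not part of this file: a cgroup/sysctl program run by
 * the function above, using bpf_sysctl_get_name() (defined later in this
 * file) to veto writes to one knob; the knob choice is arbitrary. Assumes
 * libbpf SEC() naming.
 *
 *   SEC("cgroup/sysctl")
 *   int forbid_panic_writes(struct bpf_sysctl *ctx)
 *   {
 *           char name[32];
 *
 *           if (!ctx->write)
 *                   return 1;       // reads are always allowed here
 *           if (bpf_sysctl_get_name(ctx, name, sizeof(name), 0) < 0)
 *                   return 1;       // name truncated, do not interfere
 *           if (__builtin_memcmp(name, "kernel/panic", 13) == 0)
 *                   return 0;       // deny the write, caller sees -EPERM
 *           return 1;
 *   }
 */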
1715
1716#ifdef CONFIG_NET
1717static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
1718                             struct bpf_sockopt_buf *buf)
1719{
1720        if (unlikely(max_optlen < 0))
1721                return -EINVAL;
1722
1723        if (unlikely(max_optlen > PAGE_SIZE)) {
1724                /* We don't expose optvals that are greater than PAGE_SIZE
1725                 * to the BPF program.
1726                 */
1727                max_optlen = PAGE_SIZE;
1728        }
1729
1730        if (max_optlen <= sizeof(buf->data)) {
1731                /* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE
1732                 * bytes avoid the cost of kzalloc.
1733                 */
1734                ctx->optval = buf->data;
1735                ctx->optval_end = ctx->optval + max_optlen;
1736                return max_optlen;
1737        }
1738
1739        ctx->optval = kzalloc(max_optlen, GFP_USER);
1740        if (!ctx->optval)
1741                return -ENOMEM;
1742
1743        ctx->optval_end = ctx->optval + max_optlen;
1744
1745        return max_optlen;
1746}
1747
1748static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
1749                             struct bpf_sockopt_buf *buf)
1750{
1751        if (ctx->optval == buf->data)
1752                return;
1753        kfree(ctx->optval);
1754}
1755
1756static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
1757                                  struct bpf_sockopt_buf *buf)
1758{
1759        return ctx->optval != buf->data;
1760}
1761
1762int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
1763                                       int *optname, char __user *optval,
1764                                       int *optlen, char **kernel_optval)
1765{
1766        struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1767        struct bpf_sockopt_buf buf = {};
1768        struct bpf_sockopt_kern ctx = {
1769                .sk = sk,
1770                .level = *level,
1771                .optname = *optname,
1772        };
1773        int ret, max_optlen;
1774
1775        /* Allocate a bit more than the initial user buffer for
1776         * BPF program. The canonical use case is overriding
1777         * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
1778         */
1779        max_optlen = max_t(int, 16, *optlen);
1780        max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
1781        if (max_optlen < 0)
1782                return max_optlen;
1783
1784        ctx.optlen = *optlen;
1785
1786        if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) {
1787                ret = -EFAULT;
1788                goto out;
1789        }
1790
1791        lock_sock(sk);
1792        ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_SETSOCKOPT,
1793                                    &ctx, bpf_prog_run, 0, NULL);
1794        release_sock(sk);
1795
1796        if (ret)
1797                goto out;
1798
1799        if (ctx.optlen == -1) {
1800                /* optlen set to -1, bypass kernel */
1801                ret = 1;
1802        } else if (ctx.optlen > max_optlen || ctx.optlen < -1) {
1803                /* optlen is out of bounds */
1804                ret = -EFAULT;
1805        } else {
1806                /* optlen within bounds, run kernel handler */
1807                ret = 0;
1808
1809                /* export any potential modifications */
1810                *level = ctx.level;
1811                *optname = ctx.optname;
1812
1813                /* optlen == 0 from BPF indicates that we should
1814                 * use original userspace data.
1815                 */
1816                if (ctx.optlen != 0) {
1817                        *optlen = ctx.optlen;
1818                        /* We've used bpf_sockopt_kern->buf as an intermediary
1819                         * storage, but the BPF program indicates that we need
1820                         * to pass this data to the kernel setsockopt handler.
1821                         * No way to export on-stack buf, have to allocate a
1822                         * new buffer.
1823                         */
1824                        if (!sockopt_buf_allocated(&ctx, &buf)) {
1825                                void *p = kmalloc(ctx.optlen, GFP_USER);
1826
1827                                if (!p) {
1828                                        ret = -ENOMEM;
1829                                        goto out;
1830                                }
1831                                memcpy(p, ctx.optval, ctx.optlen);
1832                                *kernel_optval = p;
1833                        } else {
1834                                *kernel_optval = ctx.optval;
1835                        }
1836                        /* export and don't free sockopt buf */
1837                        return 0;
1838                }
1839        }
1840
1841out:
1842        sockopt_free_buf(&ctx, &buf);
1843        return ret;
1844}
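/* Illustrative sketch, not part of this file: the TCP_CONGESTION override
 * mentioned in the comment above, as a cgroup/setsockopt program. optval is
 * PTR_TO_PACKET, so the bounds check against optval_end is required by the
 * verifier. Assumes libbpf SEC() naming.
 *
 *   SEC("cgroup/setsockopt")
 *   int force_cubic(struct bpf_sockopt *ctx)
 *   {
 *           char *optval = ctx->optval;
 *           char *optval_end = ctx->optval_end;
 *
 *           if (ctx->level != SOL_TCP || ctx->optname != TCP_CONGESTION)
 *                   return 1;       // not ours, run the kernel handler as-is
 *           if (optval + 6 > optval_end)
 *                   return 0;       // no room for "cubic\0", reject
 *           __builtin_memcpy(optval, "cubic", 6);
 *           ctx->optlen = 6;
 *           return 1;               // kernel handler applies the new value
 *   }
 */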
1845
1846int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
1847                                       int optname, char __user *optval,
1848                                       int __user *optlen, int max_optlen,
1849                                       int retval)
1850{
1851        struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1852        struct bpf_sockopt_buf buf = {};
1853        struct bpf_sockopt_kern ctx = {
1854                .sk = sk,
1855                .level = level,
1856                .optname = optname,
1857                .current_task = current,
1858        };
1859        int ret;
1860
1861        ctx.optlen = max_optlen;
1862        max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
1863        if (max_optlen < 0)
1864                return max_optlen;
1865
1866        if (!retval) {
1867                /* If kernel getsockopt finished successfully,
1868                 * copy whatever was returned to the user back
1869                 * into our temporary buffer. Set optlen to the
1870                 * one that kernel returned as well to let
1871                 * BPF programs inspect the value.
1872                 */
1873
1874                if (get_user(ctx.optlen, optlen)) {
1875                        ret = -EFAULT;
1876                        goto out;
1877                }
1878
1879                if (ctx.optlen < 0) {
1880                        ret = -EFAULT;
1881                        goto out;
1882                }
1883
1884                if (copy_from_user(ctx.optval, optval,
1885                                   min(ctx.optlen, max_optlen)) != 0) {
1886                        ret = -EFAULT;
1887                        goto out;
1888                }
1889        }
1890
1891        lock_sock(sk);
1892        ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
1893                                    &ctx, bpf_prog_run, retval, NULL);
1894        release_sock(sk);
1895
1896        if (ret < 0)
1897                goto out;
1898
1899        if (ctx.optlen > max_optlen || ctx.optlen < 0) {
1900                ret = -EFAULT;
1901                goto out;
1902        }
1903
1904        if (ctx.optlen != 0) {
1905                if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
1906                    put_user(ctx.optlen, optlen)) {
1907                        ret = -EFAULT;
1908                        goto out;
1909                }
1910        }
1911
1912out:
1913        sockopt_free_buf(&ctx, &buf);
1914        return ret;
1915}
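/* Illustrative sketch, not part of this file: a cgroup/getsockopt program
 * that rewrites the result of one option and fakes success via the writable
 * retval field (wired up in cg_sockopt_convert_ctx_access() below). The
 * option choice is arbitrary. Assumes libbpf SEC() naming.
 *
 *   SEC("cgroup/getsockopt")
 *   int hide_tos(struct bpf_sockopt *ctx)
 *   {
 *           char *optval = ctx->optval;
 *           char *optval_end = ctx->optval_end;
 *
 *           if (ctx->level != SOL_IP || ctx->optname != IP_TOS)
 *                   return 1;       // leave other options untouched
 *           if (optval + 1 > optval_end)
 *                   return 0;
 *           optval[0] = 0;          // report TOS 0 regardless of the real value
 *           ctx->optlen = 1;
 *           ctx->retval = 0;        // pretend the kernel call succeeded
 *           return 1;
 *   }
 */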
1916
1917int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
1918                                            int optname, void *optval,
1919                                            int *optlen, int retval)
1920{
1921        struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1922        struct bpf_sockopt_kern ctx = {
1923                .sk = sk,
1924                .level = level,
1925                .optname = optname,
1926                .optlen = *optlen,
1927                .optval = optval,
1928                .optval_end = optval + *optlen,
1929                .current_task = current,
1930        };
1931        int ret;
1932
1933        /* Note that __cgroup_bpf_run_filter_getsockopt doesn't copy
1934         * user data back into BPF buffer when retval != 0. This is
1935         * done as an optimization to avoid extra copy, assuming
1936         * the kernel won't populate the data in case of an error.
1937         * Here we always pass the data and memset() should
1938         * be called if that data shouldn't be "exported".
1939         */
1940
1941        ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
1942                                    &ctx, bpf_prog_run, retval, NULL);
1943        if (ret < 0)
1944                return ret;
1945
1946        if (ctx.optlen > *optlen)
1947                return -EFAULT;
1948
1949        /* BPF programs can shrink the buffer, export the modifications.
1950         */
1951        if (ctx.optlen != 0)
1952                *optlen = ctx.optlen;
1953
1954        return ret;
1955}
1956#endif
1957
1958static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
1959                              size_t *lenp)
1960{
1961        ssize_t tmp_ret = 0, ret;
1962
1963        if (dir->header.parent) {
1964                tmp_ret = sysctl_cpy_dir(dir->header.parent, bufp, lenp);
1965                if (tmp_ret < 0)
1966                        return tmp_ret;
1967        }
1968
1969        ret = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp);
1970        if (ret < 0)
1971                return ret;
1972        *bufp += ret;
1973        *lenp -= ret;
1974        ret += tmp_ret;
1975
1976        /* Avoid leading slash. */
1977        if (!ret)
1978                return ret;
1979
1980        tmp_ret = strscpy(*bufp, "/", *lenp);
1981        if (tmp_ret < 0)
1982                return tmp_ret;
1983        *bufp += tmp_ret;
1984        *lenp -= tmp_ret;
1985
1986        return ret + tmp_ret;
1987}
1988
1989BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf,
1990           size_t, buf_len, u64, flags)
1991{
1992        ssize_t tmp_ret = 0, ret;
1993
1994        if (!buf)
1995                return -EINVAL;
1996
1997        if (!(flags & BPF_F_SYSCTL_BASE_NAME)) {
1998                if (!ctx->head)
1999                        return -EINVAL;
2000                tmp_ret = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len);
2001                if (tmp_ret < 0)
2002                        return tmp_ret;
2003        }
2004
2005        ret = strscpy(buf, ctx->table->procname, buf_len);
2006
2007        return ret < 0 ? ret : tmp_ret + ret;
2008}
2009
2010static const struct bpf_func_proto bpf_sysctl_get_name_proto = {
2011        .func           = bpf_sysctl_get_name,
2012        .gpl_only       = false,
2013        .ret_type       = RET_INTEGER,
2014        .arg1_type      = ARG_PTR_TO_CTX,
2015        .arg2_type      = ARG_PTR_TO_MEM,
2016        .arg3_type      = ARG_CONST_SIZE,
2017        .arg4_type      = ARG_ANYTHING,
2018};
2019
2020static int copy_sysctl_value(char *dst, size_t dst_len, char *src,
2021                             size_t src_len)
2022{
2023        if (!dst)
2024                return -EINVAL;
2025
2026        if (!dst_len)
2027                return -E2BIG;
2028
2029        if (!src || !src_len) {
2030                memset(dst, 0, dst_len);
2031                return -EINVAL;
2032        }
2033
2034        memcpy(dst, src, min(dst_len, src_len));
2035
2036        if (dst_len > src_len) {
2037                memset(dst + src_len, '\0', dst_len - src_len);
2038                return src_len;
2039        }
2040
2041        dst[dst_len - 1] = '\0';
2042
2043        return -E2BIG;
2044}
2045
2046BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx,
2047           char *, buf, size_t, buf_len)
2048{
2049        return copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len);
2050}
2051
2052static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = {
2053        .func           = bpf_sysctl_get_current_value,
2054        .gpl_only       = false,
2055        .ret_type       = RET_INTEGER,
2056        .arg1_type      = ARG_PTR_TO_CTX,
2057        .arg2_type      = ARG_PTR_TO_UNINIT_MEM,
2058        .arg3_type      = ARG_CONST_SIZE,
2059};
2060
2061BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf,
2062           size_t, buf_len)
2063{
2064        if (!ctx->write) {
2065                if (buf && buf_len)
2066                        memset(buf, '\0', buf_len);
2067                return -EINVAL;
2068        }
2069        return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len);
2070}
2071
2072static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = {
2073        .func           = bpf_sysctl_get_new_value,
2074        .gpl_only       = false,
2075        .ret_type       = RET_INTEGER,
2076        .arg1_type      = ARG_PTR_TO_CTX,
2077        .arg2_type      = ARG_PTR_TO_UNINIT_MEM,
2078        .arg3_type      = ARG_CONST_SIZE,
2079};
2080
2081BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx,
2082           const char *, buf, size_t, buf_len)
2083{
2084        if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len)
2085                return -EINVAL;
2086
2087        if (buf_len > PAGE_SIZE - 1)
2088                return -E2BIG;
2089
2090        memcpy(ctx->new_val, buf, buf_len);
2091        ctx->new_len = buf_len;
2092        ctx->new_updated = 1;
2093
2094        return 0;
2095}
2096
2097static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
2098        .func           = bpf_sysctl_set_new_value,
2099        .gpl_only       = false,
2100        .ret_type       = RET_INTEGER,
2101        .arg1_type      = ARG_PTR_TO_CTX,
2102        .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
2103        .arg3_type      = ARG_CONST_SIZE,
2104};
2105
2106static const struct bpf_func_proto *
2107sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
2108{
2109        switch (func_id) {
2110        case BPF_FUNC_strtol:
2111                return &bpf_strtol_proto;
2112        case BPF_FUNC_strtoul:
2113                return &bpf_strtoul_proto;
2114        case BPF_FUNC_sysctl_get_name:
2115                return &bpf_sysctl_get_name_proto;
2116        case BPF_FUNC_sysctl_get_current_value:
2117                return &bpf_sysctl_get_current_value_proto;
2118        case BPF_FUNC_sysctl_get_new_value:
2119                return &bpf_sysctl_get_new_value_proto;
2120        case BPF_FUNC_sysctl_set_new_value:
2121                return &bpf_sysctl_set_new_value_proto;
2122        case BPF_FUNC_ktime_get_coarse_ns:
2123                return &bpf_ktime_get_coarse_ns_proto;
2124        default:
2125                return cgroup_base_func_proto(func_id, prog);
2126        }
2127}
2128
2129static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type,
2130                                   const struct bpf_prog *prog,
2131                                   struct bpf_insn_access_aux *info)
2132{
2133        const int size_default = sizeof(__u32);
2134
2135        if (off < 0 || off + size > sizeof(struct bpf_sysctl) || off % size)
2136                return false;
2137
2138        switch (off) {
2139        case bpf_ctx_range(struct bpf_sysctl, write):
2140                if (type != BPF_READ)
2141                        return false;
2142                bpf_ctx_record_field_size(info, size_default);
2143                return bpf_ctx_narrow_access_ok(off, size, size_default);
2144        case bpf_ctx_range(struct bpf_sysctl, file_pos):
2145                if (type == BPF_READ) {
2146                        bpf_ctx_record_field_size(info, size_default);
2147                        return bpf_ctx_narrow_access_ok(off, size, size_default);
2148                } else {
2149                        return size == size_default;
2150                }
2151        default:
2152                return false;
2153        }
2154}
2155
2156static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
2157                                     const struct bpf_insn *si,
2158                                     struct bpf_insn *insn_buf,
2159                                     struct bpf_prog *prog, u32 *target_size)
2160{
2161        struct bpf_insn *insn = insn_buf;
2162        u32 read_size;
2163
2164        switch (si->off) {
2165        case offsetof(struct bpf_sysctl, write):
2166                *insn++ = BPF_LDX_MEM(
2167                        BPF_SIZE(si->code), si->dst_reg, si->src_reg,
2168                        bpf_target_off(struct bpf_sysctl_kern, write,
2169                                       sizeof_field(struct bpf_sysctl_kern,
2170                                                    write),
2171                                       target_size));
2172                break;
2173        case offsetof(struct bpf_sysctl, file_pos):
2174                /* ppos is a pointer so it should be accessed via indirect
2175                 * loads and stores. Also, for stores, an additional temporary
2176                 * register is used since neither src_reg nor dst_reg can be
2177                 * overridden.
2178                 */
2179                if (type == BPF_WRITE) {
2180                        int treg = BPF_REG_9;
2181
2182                        if (si->src_reg == treg || si->dst_reg == treg)
2183                                --treg;
2184                        if (si->src_reg == treg || si->dst_reg == treg)
2185                                --treg;
2186                        *insn++ = BPF_STX_MEM(
2187                                BPF_DW, si->dst_reg, treg,
2188                                offsetof(struct bpf_sysctl_kern, tmp_reg));
2189                        *insn++ = BPF_LDX_MEM(
2190                                BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
2191                                treg, si->dst_reg,
2192                                offsetof(struct bpf_sysctl_kern, ppos));
2193                        *insn++ = BPF_STX_MEM(
2194                                BPF_SIZEOF(u32), treg, si->src_reg,
2195                                bpf_ctx_narrow_access_offset(
2196                                        0, sizeof(u32), sizeof(loff_t)));
2197                        *insn++ = BPF_LDX_MEM(
2198                                BPF_DW, treg, si->dst_reg,
2199                                offsetof(struct bpf_sysctl_kern, tmp_reg));
2200                } else {
2201                        *insn++ = BPF_LDX_MEM(
2202                                BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
2203                                si->dst_reg, si->src_reg,
2204                                offsetof(struct bpf_sysctl_kern, ppos));
2205                        read_size = bpf_size_to_bytes(BPF_SIZE(si->code));
2206                        *insn++ = BPF_LDX_MEM(
2207                                BPF_SIZE(si->code), si->dst_reg, si->dst_reg,
2208                                bpf_ctx_narrow_access_offset(
2209                                        0, read_size, sizeof(loff_t)));
2210                }
2211                *target_size = sizeof(u32);
2212                break;
2213        }
2214
2215        return insn - insn_buf;
2216}
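/* Illustrative sketch, not part of this file: from the BPF side, the file_pos
 * store rewritten by the BPF_WRITE branch above looks like a plain field
 * assignment in a cgroup/sysctl program. Assumes libbpf SEC() naming.
 *
 *   SEC("cgroup/sysctl")
 *   int rewind_pos(struct bpf_sysctl *ctx)
 *   {
 *           ctx->file_pos = 0;      // force the access to start at offset 0
 *           return 1;
 *   }
 */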
2217
2218const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
2219        .get_func_proto         = sysctl_func_proto,
2220        .is_valid_access        = sysctl_is_valid_access,
2221        .convert_ctx_access     = sysctl_convert_ctx_access,
2222};
2223
2224const struct bpf_prog_ops cg_sysctl_prog_ops = {
2225};
2226
2227#ifdef CONFIG_NET
2228BPF_CALL_1(bpf_get_netns_cookie_sockopt, struct bpf_sockopt_kern *, ctx)
2229{
2230        const struct net *net = ctx ? sock_net(ctx->sk) : &init_net;
2231
2232        return net->net_cookie;
2233}
2234
2235static const struct bpf_func_proto bpf_get_netns_cookie_sockopt_proto = {
2236        .func           = bpf_get_netns_cookie_sockopt,
2237        .gpl_only       = false,
2238        .ret_type       = RET_INTEGER,
2239        .arg1_type      = ARG_PTR_TO_CTX_OR_NULL,
2240};
2241#endif
2242
2243static const struct bpf_func_proto *
2244cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
2245{
2246        switch (func_id) {
2247#ifdef CONFIG_NET
2248        case BPF_FUNC_get_netns_cookie:
2249                return &bpf_get_netns_cookie_sockopt_proto;
2250        case BPF_FUNC_sk_storage_get:
2251                return &bpf_sk_storage_get_proto;
2252        case BPF_FUNC_sk_storage_delete:
2253                return &bpf_sk_storage_delete_proto;
2254        case BPF_FUNC_setsockopt:
2255                if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT)
2256                        return &bpf_sk_setsockopt_proto;
2257                return NULL;
2258        case BPF_FUNC_getsockopt:
2259                if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT)
2260                        return &bpf_sk_getsockopt_proto;
2261                return NULL;
2262#endif
2263#ifdef CONFIG_INET
2264        case BPF_FUNC_tcp_sock:
2265                return &bpf_tcp_sock_proto;
2266#endif
2267        default:
2268                return cgroup_base_func_proto(func_id, prog);
2269        }
2270}
2271
2272static bool cg_sockopt_is_valid_access(int off, int size,
2273                                       enum bpf_access_type type,
2274                                       const struct bpf_prog *prog,
2275                                       struct bpf_insn_access_aux *info)
2276{
2277        const int size_default = sizeof(__u32);
2278
2279        if (off < 0 || off >= sizeof(struct bpf_sockopt))
2280                return false;
2281
2282        if (off % size != 0)
2283                return false;
2284
2285        if (type == BPF_WRITE) {
2286                switch (off) {
2287                case offsetof(struct bpf_sockopt, retval):
2288                        if (size != size_default)
2289                                return false;
2290                        return prog->expected_attach_type ==
2291                                BPF_CGROUP_GETSOCKOPT;
2292                case offsetof(struct bpf_sockopt, optname):
2293                        fallthrough;
2294                case offsetof(struct bpf_sockopt, level):
2295                        if (size != size_default)
2296                                return false;
2297                        return prog->expected_attach_type ==
2298                                BPF_CGROUP_SETSOCKOPT;
2299                case offsetof(struct bpf_sockopt, optlen):
2300                        return size == size_default;
2301                default:
2302                        return false;
2303                }
2304        }
2305
2306        switch (off) {
2307        case offsetof(struct bpf_sockopt, sk):
2308                if (size != sizeof(__u64))
2309                        return false;
2310                info->reg_type = PTR_TO_SOCKET;
2311                break;
2312        case offsetof(struct bpf_sockopt, optval):
2313                if (size != sizeof(__u64))
2314                        return false;
2315                info->reg_type = PTR_TO_PACKET;
2316                break;
2317        case offsetof(struct bpf_sockopt, optval_end):
2318                if (size != sizeof(__u64))
2319                        return false;
2320                info->reg_type = PTR_TO_PACKET_END;
2321                break;
2322        case offsetof(struct bpf_sockopt, retval):
2323                if (size != size_default)
2324                        return false;
2325                return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
2326        default:
2327                if (size != size_default)
2328                        return false;
2329                break;
2330        }
2331        return true;
2332}
2333
2334#define CG_SOCKOPT_ACCESS_FIELD(T, F)                                   \
2335        T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F),                 \
2336          si->dst_reg, si->src_reg,                                     \
2337          offsetof(struct bpf_sockopt_kern, F))
2338
2339static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
2340                                         const struct bpf_insn *si,
2341                                         struct bpf_insn *insn_buf,
2342                                         struct bpf_prog *prog,
2343                                         u32 *target_size)
2344{
2345        struct bpf_insn *insn = insn_buf;
2346
2347        switch (si->off) {
2348        case offsetof(struct bpf_sockopt, sk):
2349                *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk);
2350                break;
2351        case offsetof(struct bpf_sockopt, level):
2352                if (type == BPF_WRITE)
2353                        *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level);
2354                else
2355                        *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level);
2356                break;
2357        case offsetof(struct bpf_sockopt, optname):
2358                if (type == BPF_WRITE)
2359                        *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname);
2360                else
2361                        *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname);
2362                break;
2363        case offsetof(struct bpf_sockopt, optlen):
2364                if (type == BPF_WRITE)
2365                        *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen);
2366                else
2367                        *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen);
2368                break;
2369        case offsetof(struct bpf_sockopt, retval):
2370                BUILD_BUG_ON(offsetof(struct bpf_cg_run_ctx, run_ctx) != 0);
2371
2372                if (type == BPF_WRITE) {
2373                        int treg = BPF_REG_9;
2374
2375                        if (si->src_reg == treg || si->dst_reg == treg)
2376                                --treg;
2377                        if (si->src_reg == treg || si->dst_reg == treg)
2378                                --treg;
2379                        *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, treg,
2380                                              offsetof(struct bpf_sockopt_kern, tmp_reg));
2381                        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
2382                                              treg, si->dst_reg,
2383                                              offsetof(struct bpf_sockopt_kern, current_task));
2384                        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
2385                                              treg, treg,
2386                                              offsetof(struct task_struct, bpf_ctx));
2387                        *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
2388                                              treg, si->src_reg,
2389                                              offsetof(struct bpf_cg_run_ctx, retval));
2390                        *insn++ = BPF_LDX_MEM(BPF_DW, treg, si->dst_reg,
2391                                              offsetof(struct bpf_sockopt_kern, tmp_reg));
2392                } else {
2393                        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
2394                                              si->dst_reg, si->src_reg,
2395                                              offsetof(struct bpf_sockopt_kern, current_task));
2396                        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
2397                                              si->dst_reg, si->dst_reg,
2398                                              offsetof(struct task_struct, bpf_ctx));
2399                        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
2400                                              si->dst_reg, si->dst_reg,
2401                                              offsetof(struct bpf_cg_run_ctx, retval));
2402                }
2403                break;
2404        case offsetof(struct bpf_sockopt, optval):
2405                *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval);
2406                break;
2407        case offsetof(struct bpf_sockopt, optval_end):
2408                *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end);
2409                break;
2410        }
2411
2412        return insn - insn_buf;
2413}
2414
2415static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf,
2416                                   bool direct_write,
2417                                   const struct bpf_prog *prog)
2418{
2419        /* Nothing to do for the sockopt argument. The data is kzalloc'ed.
2420         */
2421        return 0;
2422}
2423
2424const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
2425        .get_func_proto         = cg_sockopt_func_proto,
2426        .is_valid_access        = cg_sockopt_is_valid_access,
2427        .convert_ctx_access     = cg_sockopt_convert_ctx_access,
2428        .gen_prologue           = cg_sockopt_get_prologue,
2429};
2430
2431const struct bpf_prog_ops cg_sockopt_prog_ops = {
2432};
2433