linux/kernel/bpf/core.c
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * Linux Socket Filter - Kernel level socket filtering
   4 *
   5 * Based on the design of the Berkeley Packet Filter. The new
   6 * internal format has been designed by PLUMgrid:
   7 *
   8 *      Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
   9 *
  10 * Authors:
  11 *
  12 *      Jay Schulist <jschlst@samba.org>
  13 *      Alexei Starovoitov <ast@plumgrid.com>
  14 *      Daniel Borkmann <dborkman@redhat.com>
  15 *
  16 * Andi Kleen - Fix a few bad bugs and races.
  17 * Kris Katterjohn - Added many additional checks in bpf_check_classic()
  18 */
  19
  20#include <uapi/linux/btf.h>
  21#include <linux/filter.h>
  22#include <linux/skbuff.h>
  23#include <linux/vmalloc.h>
  24#include <linux/random.h>
  25#include <linux/moduleloader.h>
  26#include <linux/bpf.h>
  27#include <linux/btf.h>
  28#include <linux/objtool.h>
  29#include <linux/rbtree_latch.h>
  30#include <linux/kallsyms.h>
  31#include <linux/rcupdate.h>
  32#include <linux/perf_event.h>
  33#include <linux/extable.h>
  34#include <linux/log2.h>
  35#include <linux/bpf_verifier.h>
  36#include <linux/nodemask.h>
  37
  38#include <asm/barrier.h>
  39#include <asm/unaligned.h>
  40
  41/* Registers */
  42#define BPF_R0  regs[BPF_REG_0]
  43#define BPF_R1  regs[BPF_REG_1]
  44#define BPF_R2  regs[BPF_REG_2]
  45#define BPF_R3  regs[BPF_REG_3]
  46#define BPF_R4  regs[BPF_REG_4]
  47#define BPF_R5  regs[BPF_REG_5]
  48#define BPF_R6  regs[BPF_REG_6]
  49#define BPF_R7  regs[BPF_REG_7]
  50#define BPF_R8  regs[BPF_REG_8]
  51#define BPF_R9  regs[BPF_REG_9]
  52#define BPF_R10 regs[BPF_REG_10]
  53
  54/* Named registers */
  55#define DST     regs[insn->dst_reg]
  56#define SRC     regs[insn->src_reg]
  57#define FP      regs[BPF_REG_FP]
  58#define AX      regs[BPF_REG_AX]
  59#define ARG1    regs[BPF_REG_ARG1]
  60#define CTX     regs[BPF_REG_CTX]
  61#define IMM     insn->imm
  62
  63/* No hurry in this branch
  64 *
  65 * Exported for the bpf jit load helper.
  66 */
  67void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size)
  68{
  69        u8 *ptr = NULL;
  70
  71        if (k >= SKF_NET_OFF) {
  72                ptr = skb_network_header(skb) + k - SKF_NET_OFF;
  73        } else if (k >= SKF_LL_OFF) {
  74                if (unlikely(!skb_mac_header_was_set(skb)))
  75                        return NULL;
  76                ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
  77        }
  78        if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
  79                return ptr;
  80
  81        return NULL;
  82}
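
/* For illustration: a classic BPF load that passes k == SKF_NET_OFF + 9
 * resolves to skb_network_header(skb) + 9, i.e. the IPv4 protocol field,
 * and the pointer is only returned if the whole [ptr, ptr + size) range
 * still lies within [skb->head, skb_tail_pointer(skb)].
 */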
  83
  84struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)
  85{
  86        gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags;
  87        struct bpf_prog_aux *aux;
  88        struct bpf_prog *fp;
  89
  90        size = round_up(size, PAGE_SIZE);
  91        fp = __vmalloc(size, gfp_flags);
  92        if (fp == NULL)
  93                return NULL;
  94
  95        aux = kzalloc(sizeof(*aux), GFP_KERNEL_ACCOUNT | gfp_extra_flags);
  96        if (aux == NULL) {
  97                vfree(fp);
  98                return NULL;
  99        }
 100        fp->active = alloc_percpu_gfp(int, GFP_KERNEL_ACCOUNT | gfp_extra_flags);
 101        if (!fp->active) {
 102                vfree(fp);
 103                kfree(aux);
 104                return NULL;
 105        }
 106
 107        fp->pages = size / PAGE_SIZE;
 108        fp->aux = aux;
 109        fp->aux->prog = fp;
 110        fp->jit_requested = ebpf_jit_enabled();
 111        fp->blinding_requested = bpf_jit_blinding_enabled(fp);
 112#ifdef CONFIG_CGROUP_BPF
 113        aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID;
 114#endif
 115
 116        INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode);
 117        mutex_init(&fp->aux->used_maps_mutex);
 118        mutex_init(&fp->aux->dst_mutex);
 119
 120        return fp;
 121}
 122
 123struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
 124{
 125        gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags;
 126        struct bpf_prog *prog;
 127        int cpu;
 128
 129        prog = bpf_prog_alloc_no_stats(size, gfp_extra_flags);
 130        if (!prog)
 131                return NULL;
 132
 133        prog->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags);
 134        if (!prog->stats) {
 135                free_percpu(prog->active);
 136                kfree(prog->aux);
 137                vfree(prog);
 138                return NULL;
 139        }
 140
 141        for_each_possible_cpu(cpu) {
 142                struct bpf_prog_stats *pstats;
 143
 144                pstats = per_cpu_ptr(prog->stats, cpu);
 145                u64_stats_init(&pstats->syncp);
 146        }
 147        return prog;
 148}
 149EXPORT_SYMBOL_GPL(bpf_prog_alloc);
 150
 151int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog)
 152{
 153        if (!prog->aux->nr_linfo || !prog->jit_requested)
 154                return 0;
 155
 156        prog->aux->jited_linfo = kvcalloc(prog->aux->nr_linfo,
 157                                          sizeof(*prog->aux->jited_linfo),
 158                                          GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
 159        if (!prog->aux->jited_linfo)
 160                return -ENOMEM;
 161
 162        return 0;
 163}
 164
 165void bpf_prog_jit_attempt_done(struct bpf_prog *prog)
 166{
 167        if (prog->aux->jited_linfo &&
 168            (!prog->jited || !prog->aux->jited_linfo[0])) {
 169                kvfree(prog->aux->jited_linfo);
 170                prog->aux->jited_linfo = NULL;
 171        }
 172
 173        kfree(prog->aux->kfunc_tab);
 174        prog->aux->kfunc_tab = NULL;
 175}
 176
 177/* The jit engine is responsible to provide an array
 178 * for insn_off to the jited_off mapping (insn_to_jit_off).
 179 *
 180 * The idx to this array is the insn_off.  Hence, the insn_off
 181 * here is relative to the prog itself instead of the main prog.
 182 * This array has one entry for each xlated bpf insn.
 183 *
 184 * jited_off is the byte off to the end of the jited insn.
 185 *
 186 * Hence, with
 187 * insn_start:
 188 *      The first bpf insn off of the prog.  The insn off
 189 *      here is relative to the main prog.
 190 *      e.g. if prog is a subprog, insn_start > 0
 191 * linfo_idx:
 192 *      The prog's idx to prog->aux->linfo and jited_linfo
 193 *
 194 * jited_linfo[linfo_idx] = prog->bpf_func
 195 *
 196 * For i > linfo_idx,
 197 *
 198 * jited_linfo[i] = prog->bpf_func +
 199 *      insn_to_jit_off[linfo[i].insn_off - insn_start - 1]
 200 */
 201void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
 202                               const u32 *insn_to_jit_off)
 203{
 204        u32 linfo_idx, insn_start, insn_end, nr_linfo, i;
 205        const struct bpf_line_info *linfo;
 206        void **jited_linfo;
 207
 208        if (!prog->aux->jited_linfo)
 209                /* Userspace did not provide linfo */
 210                return;
 211
 212        linfo_idx = prog->aux->linfo_idx;
 213        linfo = &prog->aux->linfo[linfo_idx];
 214        insn_start = linfo[0].insn_off;
 215        insn_end = insn_start + prog->len;
 216
 217        jited_linfo = &prog->aux->jited_linfo[linfo_idx];
 218        jited_linfo[0] = prog->bpf_func;
 219
 220        nr_linfo = prog->aux->nr_linfo - linfo_idx;
 221
 222        for (i = 1; i < nr_linfo && linfo[i].insn_off < insn_end; i++)
 223                /* The verifier ensures that linfo[i].insn_off is
 224                 * strictly increasing
 225                 */
 226                jited_linfo[i] = prog->bpf_func +
 227                        insn_to_jit_off[linfo[i].insn_off - insn_start - 1];
 228}
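
/* A small worked example with hypothetical numbers: for a subprog whose
 * first line_info entry sits at insn_start == 10, an entry with
 * linfo[2].insn_off == 14 maps to
 *
 *	jited_linfo[2] = prog->bpf_func + insn_to_jit_off[14 - 10 - 1];
 *
 * i.e. the byte offset just past the JITed code of xlated insn 13, which
 * is exactly where the JITed code for xlated insn 14 begins.
 */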
 229
 230struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
 231                                  gfp_t gfp_extra_flags)
 232{
 233        gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags;
 234        struct bpf_prog *fp;
 235        u32 pages;
 236
 237        size = round_up(size, PAGE_SIZE);
 238        pages = size / PAGE_SIZE;
 239        if (pages <= fp_old->pages)
 240                return fp_old;
 241
 242        fp = __vmalloc(size, gfp_flags);
 243        if (fp) {
 244                memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
 245                fp->pages = pages;
 246                fp->aux->prog = fp;
 247
 248                /* We keep fp->aux from fp_old around in the new
 249                 * reallocated structure.
 250                 */
 251                fp_old->aux = NULL;
 252                fp_old->stats = NULL;
 253                fp_old->active = NULL;
 254                __bpf_prog_free(fp_old);
 255        }
 256
 257        return fp;
 258}
 259
 260void __bpf_prog_free(struct bpf_prog *fp)
 261{
 262        if (fp->aux) {
 263                mutex_destroy(&fp->aux->used_maps_mutex);
 264                mutex_destroy(&fp->aux->dst_mutex);
 265                kfree(fp->aux->poke_tab);
 266                kfree(fp->aux);
 267        }
 268        free_percpu(fp->stats);
 269        free_percpu(fp->active);
 270        vfree(fp);
 271}
 272
 273int bpf_prog_calc_tag(struct bpf_prog *fp)
 274{
 275        const u32 bits_offset = SHA1_BLOCK_SIZE - sizeof(__be64);
 276        u32 raw_size = bpf_prog_tag_scratch_size(fp);
 277        u32 digest[SHA1_DIGEST_WORDS];
 278        u32 ws[SHA1_WORKSPACE_WORDS];
 279        u32 i, bsize, psize, blocks;
 280        struct bpf_insn *dst;
 281        bool was_ld_map;
 282        u8 *raw, *todo;
 283        __be32 *result;
 284        __be64 *bits;
 285
 286        raw = vmalloc(raw_size);
 287        if (!raw)
 288                return -ENOMEM;
 289
 290        sha1_init(digest);
 291        memset(ws, 0, sizeof(ws));
 292
 293        /* We need to take out the map fd for the digest calculation
   294         * since map fds are unstable from the user space side.
 295         */
 296        dst = (void *)raw;
 297        for (i = 0, was_ld_map = false; i < fp->len; i++) {
 298                dst[i] = fp->insnsi[i];
 299                if (!was_ld_map &&
 300                    dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) &&
 301                    (dst[i].src_reg == BPF_PSEUDO_MAP_FD ||
 302                     dst[i].src_reg == BPF_PSEUDO_MAP_VALUE)) {
 303                        was_ld_map = true;
 304                        dst[i].imm = 0;
 305                } else if (was_ld_map &&
 306                           dst[i].code == 0 &&
 307                           dst[i].dst_reg == 0 &&
 308                           dst[i].src_reg == 0 &&
 309                           dst[i].off == 0) {
 310                        was_ld_map = false;
 311                        dst[i].imm = 0;
 312                } else {
 313                        was_ld_map = false;
 314                }
 315        }
 316
 317        psize = bpf_prog_insn_size(fp);
 318        memset(&raw[psize], 0, raw_size - psize);
 319        raw[psize++] = 0x80;
 320
 321        bsize  = round_up(psize, SHA1_BLOCK_SIZE);
 322        blocks = bsize / SHA1_BLOCK_SIZE;
 323        todo   = raw;
 324        if (bsize - psize >= sizeof(__be64)) {
 325                bits = (__be64 *)(todo + bsize - sizeof(__be64));
 326        } else {
 327                bits = (__be64 *)(todo + bsize + bits_offset);
 328                blocks++;
 329        }
 330        *bits = cpu_to_be64((psize - 1) << 3);
 331
 332        while (blocks--) {
 333                sha1_transform(digest, todo, ws);
 334                todo += SHA1_BLOCK_SIZE;
 335        }
 336
 337        result = (__force __be32 *)digest;
 338        for (i = 0; i < SHA1_DIGEST_WORDS; i++)
 339                result[i] = cpu_to_be32(digest[i]);
 340        memcpy(fp->tag, result, sizeof(fp->tag));
 341
 342        vfree(raw);
 343        return 0;
 344}
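
/* Worked example of the padding math above, assuming a 16-insn program:
 * psize starts at 16 * sizeof(struct bpf_insn) == 128, the mandatory 0x80
 * byte bumps it to 129, so bsize == round_up(129, 64) == 192 and
 * blocks == 3. Since 192 - 129 >= sizeof(__be64), the big-endian bit count
 * (128 << 3 == 1024) lands in the last 8 bytes of the third block and no
 * extra block is needed.
 */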
 345
 346static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old,
 347                                s32 end_new, s32 curr, const bool probe_pass)
 348{
 349        const s64 imm_min = S32_MIN, imm_max = S32_MAX;
 350        s32 delta = end_new - end_old;
 351        s64 imm = insn->imm;
 352
 353        if (curr < pos && curr + imm + 1 >= end_old)
 354                imm += delta;
 355        else if (curr >= end_new && curr + imm + 1 < end_new)
 356                imm -= delta;
 357        if (imm < imm_min || imm > imm_max)
 358                return -ERANGE;
 359        if (!probe_pass)
 360                insn->imm = imm;
 361        return 0;
 362}
 363
 364static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old,
 365                                s32 end_new, s32 curr, const bool probe_pass)
 366{
 367        const s32 off_min = S16_MIN, off_max = S16_MAX;
 368        s32 delta = end_new - end_old;
 369        s32 off = insn->off;
 370
 371        if (curr < pos && curr + off + 1 >= end_old)
 372                off += delta;
 373        else if (curr >= end_new && curr + off + 1 < end_new)
 374                off -= delta;
 375        if (off < off_min || off > off_max)
 376                return -ERANGE;
 377        if (!probe_pass)
 378                insn->off = off;
 379        return 0;
 380}
 381
 382static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, s32 end_old,
 383                            s32 end_new, const bool probe_pass)
 384{
 385        u32 i, insn_cnt = prog->len + (probe_pass ? end_new - end_old : 0);
 386        struct bpf_insn *insn = prog->insnsi;
 387        int ret = 0;
 388
 389        for (i = 0; i < insn_cnt; i++, insn++) {
 390                u8 code;
 391
 392                /* In the probing pass we still operate on the original,
 393                 * unpatched image in order to check overflows before we
 394                 * do any other adjustments. Therefore skip the patchlet.
 395                 */
 396                if (probe_pass && i == pos) {
 397                        i = end_new;
 398                        insn = prog->insnsi + end_old;
 399                }
 400                if (bpf_pseudo_func(insn)) {
 401                        ret = bpf_adj_delta_to_imm(insn, pos, end_old,
 402                                                   end_new, i, probe_pass);
 403                        if (ret)
 404                                return ret;
 405                        continue;
 406                }
 407                code = insn->code;
 408                if ((BPF_CLASS(code) != BPF_JMP &&
 409                     BPF_CLASS(code) != BPF_JMP32) ||
 410                    BPF_OP(code) == BPF_EXIT)
 411                        continue;
 412                /* Adjust offset of jmps if we cross patch boundaries. */
 413                if (BPF_OP(code) == BPF_CALL) {
 414                        if (insn->src_reg != BPF_PSEUDO_CALL)
 415                                continue;
 416                        ret = bpf_adj_delta_to_imm(insn, pos, end_old,
 417                                                   end_new, i, probe_pass);
 418                } else {
 419                        ret = bpf_adj_delta_to_off(insn, pos, end_old,
 420                                                   end_new, i, probe_pass);
 421                }
 422                if (ret)
 423                        break;
 424        }
 425
 426        return ret;
 427}
 428
 429static void bpf_adj_linfo(struct bpf_prog *prog, u32 off, u32 delta)
 430{
 431        struct bpf_line_info *linfo;
 432        u32 i, nr_linfo;
 433
 434        nr_linfo = prog->aux->nr_linfo;
 435        if (!nr_linfo || !delta)
 436                return;
 437
 438        linfo = prog->aux->linfo;
 439
 440        for (i = 0; i < nr_linfo; i++)
 441                if (off < linfo[i].insn_off)
 442                        break;
 443
 444        /* Push all off < linfo[i].insn_off by delta */
 445        for (; i < nr_linfo; i++)
 446                linfo[i].insn_off += delta;
 447}
 448
 449struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 450                                       const struct bpf_insn *patch, u32 len)
 451{
 452        u32 insn_adj_cnt, insn_rest, insn_delta = len - 1;
 453        const u32 cnt_max = S16_MAX;
 454        struct bpf_prog *prog_adj;
 455        int err;
 456
 457        /* Since our patchlet doesn't expand the image, we're done. */
 458        if (insn_delta == 0) {
 459                memcpy(prog->insnsi + off, patch, sizeof(*patch));
 460                return prog;
 461        }
 462
 463        insn_adj_cnt = prog->len + insn_delta;
 464
 465        /* Reject anything that would potentially let the insn->off
 466         * target overflow when we have excessive program expansions.
 467         * We need to probe here before we do any reallocation where
 468         * we afterwards may not fail anymore.
 469         */
 470        if (insn_adj_cnt > cnt_max &&
 471            (err = bpf_adj_branches(prog, off, off + 1, off + len, true)))
 472                return ERR_PTR(err);
 473
 474        /* Several new instructions need to be inserted. Make room
 475         * for them. Likely, there's no need for a new allocation as
   476         * the last page could have large enough tailroom.
 477         */
 478        prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt),
 479                                    GFP_USER);
 480        if (!prog_adj)
 481                return ERR_PTR(-ENOMEM);
 482
 483        prog_adj->len = insn_adj_cnt;
 484
 485        /* Patching happens in 3 steps:
 486         *
 487         * 1) Move over tail of insnsi from next instruction onwards,
 488         *    so we can patch the single target insn with one or more
 489         *    new ones (patching is always from 1 to n insns, n > 0).
 490         * 2) Inject new instructions at the target location.
 491         * 3) Adjust branch offsets if necessary.
 492         */
 493        insn_rest = insn_adj_cnt - off - len;
 494
 495        memmove(prog_adj->insnsi + off + len, prog_adj->insnsi + off + 1,
 496                sizeof(*patch) * insn_rest);
 497        memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len);
 498
  499        /* We are guaranteed not to fail at this point; otherwise the
  500         * ship has sailed and we cannot revert to the original state. An
  501         * overflow cannot happen at this point.
 502         */
 503        BUG_ON(bpf_adj_branches(prog_adj, off, off + 1, off + len, false));
 504
 505        bpf_adj_linfo(prog_adj, off, insn_delta);
 506
 507        return prog_adj;
 508}
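
/* Example of the branch fixup this implies, with hypothetical indexes:
 * patching the single insn at off == 4 with a 3-insn patchlet gives
 * insn_delta == 2, so bpf_adj_branches() runs with end_old == 5 and
 * end_new == 7. A JMP at index 2 with ->off == 5 (target index 8) crosses
 * the patched region and becomes ->off == 7, still landing on the same
 * logical insn, which now lives at index 10.
 */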
 509
 510int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt)
 511{
 512        /* Branch offsets can't overflow when program is shrinking, no need
 513         * to call bpf_adj_branches(..., true) here
 514         */
 515        memmove(prog->insnsi + off, prog->insnsi + off + cnt,
 516                sizeof(struct bpf_insn) * (prog->len - off - cnt));
 517        prog->len -= cnt;
 518
 519        return WARN_ON_ONCE(bpf_adj_branches(prog, off, off + cnt, off, false));
 520}
 521
 522static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp)
 523{
 524        int i;
 525
 526        for (i = 0; i < fp->aux->func_cnt; i++)
 527                bpf_prog_kallsyms_del(fp->aux->func[i]);
 528}
 529
 530void bpf_prog_kallsyms_del_all(struct bpf_prog *fp)
 531{
 532        bpf_prog_kallsyms_del_subprogs(fp);
 533        bpf_prog_kallsyms_del(fp);
 534}
 535
 536#ifdef CONFIG_BPF_JIT
 537/* All BPF JIT sysctl knobs here. */
 538int bpf_jit_enable   __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
 539int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
 540int bpf_jit_harden   __read_mostly;
 541long bpf_jit_limit   __read_mostly;
 542long bpf_jit_limit_max __read_mostly;
 543
 544static void
 545bpf_prog_ksym_set_addr(struct bpf_prog *prog)
 546{
 547        WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog));
 548
 549        prog->aux->ksym.start = (unsigned long) prog->bpf_func;
 550        prog->aux->ksym.end   = prog->aux->ksym.start + prog->jited_len;
 551}
 552
 553static void
 554bpf_prog_ksym_set_name(struct bpf_prog *prog)
 555{
 556        char *sym = prog->aux->ksym.name;
 557        const char *end = sym + KSYM_NAME_LEN;
 558        const struct btf_type *type;
 559        const char *func_name;
 560
 561        BUILD_BUG_ON(sizeof("bpf_prog_") +
 562                     sizeof(prog->tag) * 2 +
 563                     /* name has been null terminated.
 564                      * We should need +1 for the '_' preceding
 565                      * the name.  However, the null character
 566                      * is double counted between the name and the
 567                      * sizeof("bpf_prog_") above, so we omit
 568                      * the +1 here.
 569                      */
 570                     sizeof(prog->aux->name) > KSYM_NAME_LEN);
 571
 572        sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_");
 573        sym  = bin2hex(sym, prog->tag, sizeof(prog->tag));
 574
 575        /* prog->aux->name will be ignored if full btf name is available */
 576        if (prog->aux->func_info_cnt) {
 577                type = btf_type_by_id(prog->aux->btf,
 578                                      prog->aux->func_info[prog->aux->func_idx].type_id);
 579                func_name = btf_name_by_offset(prog->aux->btf, type->name_off);
 580                snprintf(sym, (size_t)(end - sym), "_%s", func_name);
 581                return;
 582        }
 583
 584        if (prog->aux->name[0])
 585                snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name);
 586        else
 587                *sym = 0;
 588}
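
/* The resulting symbol has the shape "bpf_prog_<tag>_<name>": the 8-byte
 * tag rendered as 16 hex chars, followed by the BTF function name or
 * prog->aux->name, e.g. (hypothetical tag and name)
 *
 *	bpf_prog_6deef7357e7b4530_kprobe_example
 *
 * or just "bpf_prog_<tag>" when no name is available.
 */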
 589
 590static unsigned long bpf_get_ksym_start(struct latch_tree_node *n)
 591{
 592        return container_of(n, struct bpf_ksym, tnode)->start;
 593}
 594
 595static __always_inline bool bpf_tree_less(struct latch_tree_node *a,
 596                                          struct latch_tree_node *b)
 597{
 598        return bpf_get_ksym_start(a) < bpf_get_ksym_start(b);
 599}
 600
 601static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n)
 602{
 603        unsigned long val = (unsigned long)key;
 604        const struct bpf_ksym *ksym;
 605
 606        ksym = container_of(n, struct bpf_ksym, tnode);
 607
 608        if (val < ksym->start)
 609                return -1;
 610        if (val >= ksym->end)
 611                return  1;
 612
 613        return 0;
 614}
 615
 616static const struct latch_tree_ops bpf_tree_ops = {
 617        .less   = bpf_tree_less,
 618        .comp   = bpf_tree_comp,
 619};
 620
 621static DEFINE_SPINLOCK(bpf_lock);
 622static LIST_HEAD(bpf_kallsyms);
 623static struct latch_tree_root bpf_tree __cacheline_aligned;
 624
 625void bpf_ksym_add(struct bpf_ksym *ksym)
 626{
 627        spin_lock_bh(&bpf_lock);
 628        WARN_ON_ONCE(!list_empty(&ksym->lnode));
 629        list_add_tail_rcu(&ksym->lnode, &bpf_kallsyms);
 630        latch_tree_insert(&ksym->tnode, &bpf_tree, &bpf_tree_ops);
 631        spin_unlock_bh(&bpf_lock);
 632}
 633
 634static void __bpf_ksym_del(struct bpf_ksym *ksym)
 635{
 636        if (list_empty(&ksym->lnode))
 637                return;
 638
 639        latch_tree_erase(&ksym->tnode, &bpf_tree, &bpf_tree_ops);
 640        list_del_rcu(&ksym->lnode);
 641}
 642
 643void bpf_ksym_del(struct bpf_ksym *ksym)
 644{
 645        spin_lock_bh(&bpf_lock);
 646        __bpf_ksym_del(ksym);
 647        spin_unlock_bh(&bpf_lock);
 648}
 649
 650static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp)
 651{
 652        return fp->jited && !bpf_prog_was_classic(fp);
 653}
 654
 655void bpf_prog_kallsyms_add(struct bpf_prog *fp)
 656{
 657        if (!bpf_prog_kallsyms_candidate(fp) ||
 658            !bpf_capable())
 659                return;
 660
 661        bpf_prog_ksym_set_addr(fp);
 662        bpf_prog_ksym_set_name(fp);
 663        fp->aux->ksym.prog = true;
 664
 665        bpf_ksym_add(&fp->aux->ksym);
 666}
 667
 668void bpf_prog_kallsyms_del(struct bpf_prog *fp)
 669{
 670        if (!bpf_prog_kallsyms_candidate(fp))
 671                return;
 672
 673        bpf_ksym_del(&fp->aux->ksym);
 674}
 675
 676static struct bpf_ksym *bpf_ksym_find(unsigned long addr)
 677{
 678        struct latch_tree_node *n;
 679
 680        n = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops);
 681        return n ? container_of(n, struct bpf_ksym, tnode) : NULL;
 682}
 683
 684const char *__bpf_address_lookup(unsigned long addr, unsigned long *size,
 685                                 unsigned long *off, char *sym)
 686{
 687        struct bpf_ksym *ksym;
 688        char *ret = NULL;
 689
 690        rcu_read_lock();
 691        ksym = bpf_ksym_find(addr);
 692        if (ksym) {
 693                unsigned long symbol_start = ksym->start;
 694                unsigned long symbol_end = ksym->end;
 695
 696                strncpy(sym, ksym->name, KSYM_NAME_LEN);
 697
 698                ret = sym;
 699                if (size)
 700                        *size = symbol_end - symbol_start;
 701                if (off)
 702                        *off  = addr - symbol_start;
 703        }
 704        rcu_read_unlock();
 705
 706        return ret;
 707}
 708
 709bool is_bpf_text_address(unsigned long addr)
 710{
 711        bool ret;
 712
 713        rcu_read_lock();
 714        ret = bpf_ksym_find(addr) != NULL;
 715        rcu_read_unlock();
 716
 717        return ret;
 718}
 719
 720static struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
 721{
 722        struct bpf_ksym *ksym = bpf_ksym_find(addr);
 723
 724        return ksym && ksym->prog ?
 725               container_of(ksym, struct bpf_prog_aux, ksym)->prog :
 726               NULL;
 727}
 728
 729const struct exception_table_entry *search_bpf_extables(unsigned long addr)
 730{
 731        const struct exception_table_entry *e = NULL;
 732        struct bpf_prog *prog;
 733
 734        rcu_read_lock();
 735        prog = bpf_prog_ksym_find(addr);
 736        if (!prog)
 737                goto out;
 738        if (!prog->aux->num_exentries)
 739                goto out;
 740
 741        e = search_extable(prog->aux->extable, prog->aux->num_exentries, addr);
 742out:
 743        rcu_read_unlock();
 744        return e;
 745}
 746
 747int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 748                    char *sym)
 749{
 750        struct bpf_ksym *ksym;
 751        unsigned int it = 0;
 752        int ret = -ERANGE;
 753
 754        if (!bpf_jit_kallsyms_enabled())
 755                return ret;
 756
 757        rcu_read_lock();
 758        list_for_each_entry_rcu(ksym, &bpf_kallsyms, lnode) {
 759                if (it++ != symnum)
 760                        continue;
 761
 762                strncpy(sym, ksym->name, KSYM_NAME_LEN);
 763
 764                *value = ksym->start;
 765                *type  = BPF_SYM_ELF_TYPE;
 766
 767                ret = 0;
 768                break;
 769        }
 770        rcu_read_unlock();
 771
 772        return ret;
 773}
 774
 775int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
 776                                struct bpf_jit_poke_descriptor *poke)
 777{
 778        struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab;
 779        static const u32 poke_tab_max = 1024;
 780        u32 slot = prog->aux->size_poke_tab;
 781        u32 size = slot + 1;
 782
 783        if (size > poke_tab_max)
 784                return -ENOSPC;
 785        if (poke->tailcall_target || poke->tailcall_target_stable ||
 786            poke->tailcall_bypass || poke->adj_off || poke->bypass_addr)
 787                return -EINVAL;
 788
 789        switch (poke->reason) {
 790        case BPF_POKE_REASON_TAIL_CALL:
 791                if (!poke->tail_call.map)
 792                        return -EINVAL;
 793                break;
 794        default:
 795                return -EINVAL;
 796        }
 797
 798        tab = krealloc(tab, size * sizeof(*poke), GFP_KERNEL);
 799        if (!tab)
 800                return -ENOMEM;
 801
 802        memcpy(&tab[slot], poke, sizeof(*poke));
 803        prog->aux->size_poke_tab = size;
 804        prog->aux->poke_tab = tab;
 805
 806        return slot;
 807}
 808
 809/*
 810 * BPF program pack allocator.
 811 *
  812 * Most BPF programs are pretty small. Allocating a whole page for each
  813 * program is sometimes a waste. Many small BPF programs also add pressure
  814 * to the instruction TLB. To solve this issue, we introduce a BPF program pack
 815 * allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on x86)
 816 * to host BPF programs.
 817 */
 818#define BPF_PROG_CHUNK_SHIFT    6
 819#define BPF_PROG_CHUNK_SIZE     (1 << BPF_PROG_CHUNK_SHIFT)
 820#define BPF_PROG_CHUNK_MASK     (~(BPF_PROG_CHUNK_SIZE - 1))
 821
 822struct bpf_prog_pack {
 823        struct list_head list;
 824        void *ptr;
 825        unsigned long bitmap[];
 826};
 827
 828void bpf_jit_fill_hole_with_zero(void *area, unsigned int size)
 829{
 830        memset(area, 0, size);
 831}
 832
 833#define BPF_PROG_SIZE_TO_NBITS(size)    (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE)
 834
 835static DEFINE_MUTEX(pack_mutex);
 836static LIST_HEAD(pack_list);
 837
 838/* PMD_SIZE is not available in some special config, e.g. ARCH=arm with
 839 * CONFIG_MMU=n. Use PAGE_SIZE in these cases.
 840 */
 841#ifdef PMD_SIZE
 842#define BPF_PROG_PACK_SIZE (PMD_SIZE * num_possible_nodes())
 843#else
 844#define BPF_PROG_PACK_SIZE PAGE_SIZE
 845#endif
 846
 847#define BPF_PROG_CHUNK_COUNT (BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE)
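
/* Example of the resulting geometry, assuming x86_64 with 2 MiB PMDs and a
 * single NUMA node: BPF_PROG_PACK_SIZE == 2 MiB, each chunk is
 * BPF_PROG_CHUNK_SIZE == 64 bytes, so one pack holds
 * BPF_PROG_CHUNK_COUNT == 32768 chunks and a 300-byte image consumes
 * BPF_PROG_SIZE_TO_NBITS(300) == 5 consecutive bits in pack->bitmap.
 */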
 848
 849static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns)
 850{
 851        struct bpf_prog_pack *pack;
 852
 853        pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)),
 854                       GFP_KERNEL);
 855        if (!pack)
 856                return NULL;
 857        pack->ptr = module_alloc(BPF_PROG_PACK_SIZE);
 858        if (!pack->ptr) {
 859                kfree(pack);
 860                return NULL;
 861        }
 862        bpf_fill_ill_insns(pack->ptr, BPF_PROG_PACK_SIZE);
 863        bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE);
 864        list_add_tail(&pack->list, &pack_list);
 865
 866        set_vm_flush_reset_perms(pack->ptr);
 867        set_memory_ro((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
 868        set_memory_x((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
 869        return pack;
 870}
 871
 872void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
 873{
 874        unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size);
 875        struct bpf_prog_pack *pack;
 876        unsigned long pos;
 877        void *ptr = NULL;
 878
 879        mutex_lock(&pack_mutex);
 880        if (size > BPF_PROG_PACK_SIZE) {
 881                size = round_up(size, PAGE_SIZE);
 882                ptr = module_alloc(size);
 883                if (ptr) {
 884                        bpf_fill_ill_insns(ptr, size);
 885                        set_vm_flush_reset_perms(ptr);
 886                        set_memory_ro((unsigned long)ptr, size / PAGE_SIZE);
 887                        set_memory_x((unsigned long)ptr, size / PAGE_SIZE);
 888                }
 889                goto out;
 890        }
 891        list_for_each_entry(pack, &pack_list, list) {
 892                pos = bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
 893                                                 nbits, 0);
 894                if (pos < BPF_PROG_CHUNK_COUNT)
 895                        goto found_free_area;
 896        }
 897
 898        pack = alloc_new_pack(bpf_fill_ill_insns);
 899        if (!pack)
 900                goto out;
 901
 902        pos = 0;
 903
 904found_free_area:
 905        bitmap_set(pack->bitmap, pos, nbits);
 906        ptr = (void *)(pack->ptr) + (pos << BPF_PROG_CHUNK_SHIFT);
 907
 908out:
 909        mutex_unlock(&pack_mutex);
 910        return ptr;
 911}
 912
 913void bpf_prog_pack_free(struct bpf_binary_header *hdr)
 914{
 915        struct bpf_prog_pack *pack = NULL, *tmp;
 916        unsigned int nbits;
 917        unsigned long pos;
 918
 919        mutex_lock(&pack_mutex);
 920        if (hdr->size > BPF_PROG_PACK_SIZE) {
 921                module_memfree(hdr);
 922                goto out;
 923        }
 924
 925        list_for_each_entry(tmp, &pack_list, list) {
 926                if ((void *)hdr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > (void *)hdr) {
 927                        pack = tmp;
 928                        break;
 929                }
 930        }
 931
 932        if (WARN_ONCE(!pack, "bpf_prog_pack bug\n"))
 933                goto out;
 934
 935        nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size);
 936        pos = ((unsigned long)hdr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT;
 937
 938        WARN_ONCE(bpf_arch_text_invalidate(hdr, hdr->size),
 939                  "bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n");
 940
 941        bitmap_clear(pack->bitmap, pos, nbits);
 942        if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
 943                                       BPF_PROG_CHUNK_COUNT, 0) == 0) {
 944                list_del(&pack->list);
 945                module_memfree(pack->ptr);
 946                kfree(pack);
 947        }
 948out:
 949        mutex_unlock(&pack_mutex);
 950}
 951
 952static atomic_long_t bpf_jit_current;
 953
 954/* Can be overridden by an arch's JIT compiler if it has a custom,
 955 * dedicated BPF backend memory area, or if neither of the two
 956 * below apply.
 957 */
 958u64 __weak bpf_jit_alloc_exec_limit(void)
 959{
 960#if defined(MODULES_VADDR)
 961        return MODULES_END - MODULES_VADDR;
 962#else
 963        return VMALLOC_END - VMALLOC_START;
 964#endif
 965}
 966
 967static int __init bpf_jit_charge_init(void)
 968{
 969        /* Only used as heuristic here to derive limit. */
 970        bpf_jit_limit_max = bpf_jit_alloc_exec_limit();
 971        bpf_jit_limit = min_t(u64, round_up(bpf_jit_limit_max >> 2,
 972                                            PAGE_SIZE), LONG_MAX);
 973        return 0;
 974}
 975pure_initcall(bpf_jit_charge_init);
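
/* For a sense of scale (hypothetical arch numbers): if
 * bpf_jit_alloc_exec_limit() reports a 1 GiB module area, the default
 * charge limit comes out as round_up(1 GiB >> 2, PAGE_SIZE) == 256 MiB,
 * which unprivileged JIT users may not collectively exceed.
 */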
 976
 977int bpf_jit_charge_modmem(u32 size)
 978{
 979        if (atomic_long_add_return(size, &bpf_jit_current) > READ_ONCE(bpf_jit_limit)) {
 980                if (!bpf_capable()) {
 981                        atomic_long_sub(size, &bpf_jit_current);
 982                        return -EPERM;
 983                }
 984        }
 985
 986        return 0;
 987}
 988
 989void bpf_jit_uncharge_modmem(u32 size)
 990{
 991        atomic_long_sub(size, &bpf_jit_current);
 992}
 993
 994void *__weak bpf_jit_alloc_exec(unsigned long size)
 995{
 996        return module_alloc(size);
 997}
 998
 999void __weak bpf_jit_free_exec(void *addr)
1000{
1001        module_memfree(addr);
1002}
1003
1004struct bpf_binary_header *
1005bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
1006                     unsigned int alignment,
1007                     bpf_jit_fill_hole_t bpf_fill_ill_insns)
1008{
1009        struct bpf_binary_header *hdr;
1010        u32 size, hole, start;
1011
1012        WARN_ON_ONCE(!is_power_of_2(alignment) ||
1013                     alignment > BPF_IMAGE_ALIGNMENT);
1014
 1015        /* Most BPF filters are really small, but if some of them
1016         * fill a page, allow at least 128 extra bytes to insert a
1017         * random section of illegal instructions.
1018         */
1019        size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
1020
1021        if (bpf_jit_charge_modmem(size))
1022                return NULL;
1023        hdr = bpf_jit_alloc_exec(size);
1024        if (!hdr) {
1025                bpf_jit_uncharge_modmem(size);
1026                return NULL;
1027        }
1028
1029        /* Fill space with illegal/arch-dep instructions. */
1030        bpf_fill_ill_insns(hdr, size);
1031
1032        hdr->size = size;
1033        hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
1034                     PAGE_SIZE - sizeof(*hdr));
1035        start = (get_random_int() % hole) & ~(alignment - 1);
1036
1037        /* Leave a random number of instructions before BPF code. */
1038        *image_ptr = &hdr->image[start];
1039
1040        return hdr;
1041}
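
/* Rough example, assuming 4 KiB pages and a header of a few bytes: a
 * 2000-byte program rounds up to a single 4 KiB page, the hole is
 * roughly 4096 - 2000 - sizeof(*hdr) bytes, and the image starts at a
 * random, alignment-masked offset inside that hole so the code is not
 * always at a predictable place within the page.
 */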
1042
1043void bpf_jit_binary_free(struct bpf_binary_header *hdr)
1044{
1045        u32 size = hdr->size;
1046
1047        bpf_jit_free_exec(hdr);
1048        bpf_jit_uncharge_modmem(size);
1049}
1050
1051/* Allocate jit binary from bpf_prog_pack allocator.
1052 * Since the allocated memory is RO+X, the JIT engine cannot write directly
1053 * to the memory. To solve this problem, a RW buffer is also allocated at
 1054 * the same time. The JIT engine should calculate offsets based on the
1055 * RO memory address, but write JITed program to the RW buffer. Once the
1056 * JIT engine finishes, it calls bpf_jit_binary_pack_finalize, which copies
1057 * the JITed program to the RO memory.
1058 */
1059struct bpf_binary_header *
1060bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
1061                          unsigned int alignment,
1062                          struct bpf_binary_header **rw_header,
1063                          u8 **rw_image,
1064                          bpf_jit_fill_hole_t bpf_fill_ill_insns)
1065{
1066        struct bpf_binary_header *ro_header;
1067        u32 size, hole, start;
1068
1069        WARN_ON_ONCE(!is_power_of_2(alignment) ||
1070                     alignment > BPF_IMAGE_ALIGNMENT);
1071
1072        /* add 16 bytes for a random section of illegal instructions */
1073        size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_CHUNK_SIZE);
1074
1075        if (bpf_jit_charge_modmem(size))
1076                return NULL;
1077        ro_header = bpf_prog_pack_alloc(size, bpf_fill_ill_insns);
1078        if (!ro_header) {
1079                bpf_jit_uncharge_modmem(size);
1080                return NULL;
1081        }
1082
1083        *rw_header = kvmalloc(size, GFP_KERNEL);
1084        if (!*rw_header) {
1085                bpf_arch_text_copy(&ro_header->size, &size, sizeof(size));
1086                bpf_prog_pack_free(ro_header);
1087                bpf_jit_uncharge_modmem(size);
1088                return NULL;
1089        }
1090
1091        /* Fill space with illegal/arch-dep instructions. */
1092        bpf_fill_ill_insns(*rw_header, size);
1093        (*rw_header)->size = size;
1094
1095        hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)),
1096                     BPF_PROG_CHUNK_SIZE - sizeof(*ro_header));
1097        start = (get_random_int() % hole) & ~(alignment - 1);
1098
1099        *image_ptr = &ro_header->image[start];
1100        *rw_image = &(*rw_header)->image[start];
1101
1102        return ro_header;
1103}
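
/* A minimal usage sketch, with hypothetical JIT-side names (the emit step
 * is arch specific and only hinted at):
 *
 *	ro_header = bpf_jit_binary_pack_alloc(proglen, &image, align,
 *					      &rw_header, &rw_image,
 *					      jit_fill_hole);
 *	if (!ro_header)
 *		goto out;
 *	... emit instructions into rw_image, computing addresses
 *	... relative to the read-only image ...
 *	if (bpf_jit_binary_pack_finalize(prog, ro_header, rw_header))
 *		goto out;	(both buffers are freed on failure)
 */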
1104
1105/* Copy JITed text from rw_header to its final location, the ro_header. */
1106int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
1107                                 struct bpf_binary_header *ro_header,
1108                                 struct bpf_binary_header *rw_header)
1109{
1110        void *ptr;
1111
1112        ptr = bpf_arch_text_copy(ro_header, rw_header, rw_header->size);
1113
1114        kvfree(rw_header);
1115
1116        if (IS_ERR(ptr)) {
1117                bpf_prog_pack_free(ro_header);
1118                return PTR_ERR(ptr);
1119        }
1120        return 0;
1121}
1122
1123/* bpf_jit_binary_pack_free is called in two different scenarios:
 1124 *   1) when the program is freed after the JIT completed;
1125 *   2) when the JIT engine fails (before bpf_jit_binary_pack_finalize).
1126 * For case 2), we need to free both the RO memory and the RW buffer.
1127 *
1128 * bpf_jit_binary_pack_free requires proper ro_header->size. However,
1129 * bpf_jit_binary_pack_alloc does not set it. Therefore, ro_header->size
1130 * must be set with either bpf_jit_binary_pack_finalize (normal path) or
1131 * bpf_arch_text_copy (when jit fails).
1132 */
1133void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header,
1134                              struct bpf_binary_header *rw_header)
1135{
1136        u32 size = ro_header->size;
1137
1138        bpf_prog_pack_free(ro_header);
1139        kvfree(rw_header);
1140        bpf_jit_uncharge_modmem(size);
1141}
1142
1143struct bpf_binary_header *
1144bpf_jit_binary_pack_hdr(const struct bpf_prog *fp)
1145{
1146        unsigned long real_start = (unsigned long)fp->bpf_func;
1147        unsigned long addr;
1148
1149        addr = real_start & BPF_PROG_CHUNK_MASK;
1150        return (void *)addr;
1151}
1152
1153static inline struct bpf_binary_header *
1154bpf_jit_binary_hdr(const struct bpf_prog *fp)
1155{
1156        unsigned long real_start = (unsigned long)fp->bpf_func;
1157        unsigned long addr;
1158
1159        addr = real_start & PAGE_MASK;
1160        return (void *)addr;
1161}
1162
1163/* This symbol is only overridden by archs that have different
1164 * requirements than the usual eBPF JITs, f.e. when they only
1165 * implement cBPF JIT, do not set images read-only, etc.
1166 */
1167void __weak bpf_jit_free(struct bpf_prog *fp)
1168{
1169        if (fp->jited) {
1170                struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
1171
1172                bpf_jit_binary_free(hdr);
1173                WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
1174        }
1175
1176        bpf_prog_unlock_free(fp);
1177}
1178
1179int bpf_jit_get_func_addr(const struct bpf_prog *prog,
1180                          const struct bpf_insn *insn, bool extra_pass,
1181                          u64 *func_addr, bool *func_addr_fixed)
1182{
1183        s16 off = insn->off;
1184        s32 imm = insn->imm;
1185        u8 *addr;
1186
1187        *func_addr_fixed = insn->src_reg != BPF_PSEUDO_CALL;
1188        if (!*func_addr_fixed) {
1189                /* Place-holder address till the last pass has collected
1190                 * all addresses for JITed subprograms in which case we
1191                 * can pick them up from prog->aux.
1192                 */
1193                if (!extra_pass)
1194                        addr = NULL;
1195                else if (prog->aux->func &&
1196                         off >= 0 && off < prog->aux->func_cnt)
1197                        addr = (u8 *)prog->aux->func[off]->bpf_func;
1198                else
1199                        return -EINVAL;
1200        } else {
1201                /* Address of a BPF helper call. Since part of the core
1202                 * kernel, it's always at a fixed location. __bpf_call_base
1203                 * and the helper with imm relative to it are both in core
1204                 * kernel.
1205                 */
1206                addr = (u8 *)__bpf_call_base + imm;
1207        }
1208
1209        *func_addr = (unsigned long)addr;
1210        return 0;
1211}
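
/* Example for the fixed-address case: the verifier rewrites a helper call's
 * imm to "helper address - __bpf_call_base", so for a (hypothetical) call
 * to bpf_map_lookup_elem the JIT recovers the absolute address here simply
 * as __bpf_call_base + insn->imm.
 */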
1212
1213static int bpf_jit_blind_insn(const struct bpf_insn *from,
1214                              const struct bpf_insn *aux,
1215                              struct bpf_insn *to_buff,
1216                              bool emit_zext)
1217{
1218        struct bpf_insn *to = to_buff;
1219        u32 imm_rnd = get_random_int();
1220        s16 off;
1221
1222        BUILD_BUG_ON(BPF_REG_AX  + 1 != MAX_BPF_JIT_REG);
1223        BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG);
1224
1225        /* Constraints on AX register:
1226         *
1227         * AX register is inaccessible from user space. It is mapped in
1228         * all JITs, and used here for constant blinding rewrites. It is
1229         * typically "stateless" meaning its contents are only valid within
1230         * the executed instruction, but not across several instructions.
1231         * There are a few exceptions however which are further detailed
1232         * below.
1233         *
1234         * Constant blinding is only used by JITs, not in the interpreter.
1235         * The interpreter uses AX in some occasions as a local temporary
1236         * register e.g. in DIV or MOD instructions.
1237         *
1238         * In restricted circumstances, the verifier can also use the AX
1239         * register for rewrites as long as they do not interfere with
1240         * the above cases!
1241         */
1242        if (from->dst_reg == BPF_REG_AX || from->src_reg == BPF_REG_AX)
1243                goto out;
1244
1245        if (from->imm == 0 &&
1246            (from->code == (BPF_ALU   | BPF_MOV | BPF_K) ||
1247             from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) {
1248                *to++ = BPF_ALU64_REG(BPF_XOR, from->dst_reg, from->dst_reg);
1249                goto out;
1250        }
1251
1252        switch (from->code) {
1253        case BPF_ALU | BPF_ADD | BPF_K:
1254        case BPF_ALU | BPF_SUB | BPF_K:
1255        case BPF_ALU | BPF_AND | BPF_K:
1256        case BPF_ALU | BPF_OR  | BPF_K:
1257        case BPF_ALU | BPF_XOR | BPF_K:
1258        case BPF_ALU | BPF_MUL | BPF_K:
1259        case BPF_ALU | BPF_MOV | BPF_K:
1260        case BPF_ALU | BPF_DIV | BPF_K:
1261        case BPF_ALU | BPF_MOD | BPF_K:
1262                *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
1263                *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
1264                *to++ = BPF_ALU32_REG(from->code, from->dst_reg, BPF_REG_AX);
1265                break;
1266
1267        case BPF_ALU64 | BPF_ADD | BPF_K:
1268        case BPF_ALU64 | BPF_SUB | BPF_K:
1269        case BPF_ALU64 | BPF_AND | BPF_K:
1270        case BPF_ALU64 | BPF_OR  | BPF_K:
1271        case BPF_ALU64 | BPF_XOR | BPF_K:
1272        case BPF_ALU64 | BPF_MUL | BPF_K:
1273        case BPF_ALU64 | BPF_MOV | BPF_K:
1274        case BPF_ALU64 | BPF_DIV | BPF_K:
1275        case BPF_ALU64 | BPF_MOD | BPF_K:
1276                *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
1277                *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
1278                *to++ = BPF_ALU64_REG(from->code, from->dst_reg, BPF_REG_AX);
1279                break;
1280
1281        case BPF_JMP | BPF_JEQ  | BPF_K:
1282        case BPF_JMP | BPF_JNE  | BPF_K:
1283        case BPF_JMP | BPF_JGT  | BPF_K:
1284        case BPF_JMP | BPF_JLT  | BPF_K:
1285        case BPF_JMP | BPF_JGE  | BPF_K:
1286        case BPF_JMP | BPF_JLE  | BPF_K:
1287        case BPF_JMP | BPF_JSGT | BPF_K:
1288        case BPF_JMP | BPF_JSLT | BPF_K:
1289        case BPF_JMP | BPF_JSGE | BPF_K:
1290        case BPF_JMP | BPF_JSLE | BPF_K:
1291        case BPF_JMP | BPF_JSET | BPF_K:
1292                /* Accommodate for extra offset in case of a backjump. */
1293                off = from->off;
1294                if (off < 0)
1295                        off -= 2;
1296                *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
1297                *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
1298                *to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off);
1299                break;
1300
1301        case BPF_JMP32 | BPF_JEQ  | BPF_K:
1302        case BPF_JMP32 | BPF_JNE  | BPF_K:
1303        case BPF_JMP32 | BPF_JGT  | BPF_K:
1304        case BPF_JMP32 | BPF_JLT  | BPF_K:
1305        case BPF_JMP32 | BPF_JGE  | BPF_K:
1306        case BPF_JMP32 | BPF_JLE  | BPF_K:
1307        case BPF_JMP32 | BPF_JSGT | BPF_K:
1308        case BPF_JMP32 | BPF_JSLT | BPF_K:
1309        case BPF_JMP32 | BPF_JSGE | BPF_K:
1310        case BPF_JMP32 | BPF_JSLE | BPF_K:
1311        case BPF_JMP32 | BPF_JSET | BPF_K:
1312                /* Accommodate for extra offset in case of a backjump. */
1313                off = from->off;
1314                if (off < 0)
1315                        off -= 2;
1316                *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
1317                *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
1318                *to++ = BPF_JMP32_REG(from->code, from->dst_reg, BPF_REG_AX,
1319                                      off);
1320                break;
1321
1322        case BPF_LD | BPF_IMM | BPF_DW:
1323                *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm);
1324                *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
1325                *to++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32);
1326                *to++ = BPF_ALU64_REG(BPF_MOV, aux[0].dst_reg, BPF_REG_AX);
1327                break;
1328        case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. */
1329                *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm);
1330                *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
1331                if (emit_zext)
1332                        *to++ = BPF_ZEXT_REG(BPF_REG_AX);
1333                *to++ = BPF_ALU64_REG(BPF_OR,  aux[0].dst_reg, BPF_REG_AX);
1334                break;
1335
1336        case BPF_ST | BPF_MEM | BPF_DW:
1337        case BPF_ST | BPF_MEM | BPF_W:
1338        case BPF_ST | BPF_MEM | BPF_H:
1339        case BPF_ST | BPF_MEM | BPF_B:
1340                *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
1341                *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
1342                *to++ = BPF_STX_MEM(from->code, from->dst_reg, BPF_REG_AX, from->off);
1343                break;
1344        }
1345out:
1346        return to - to_buff;
1347}
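
/* For illustration, an ALU64 immediate such as
 *
 *	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0x1234)
 *
 * is rewritten (for some random imm_rnd) into
 *
 *	BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ 0x1234)
 *	BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd)
 *	BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_AX)
 *
 * so the attacker-chosen constant never appears verbatim in the JITed image.
 */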
1348
1349static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other,
1350                                              gfp_t gfp_extra_flags)
1351{
1352        gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
1353        struct bpf_prog *fp;
1354
1355        fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags);
1356        if (fp != NULL) {
1357                /* aux->prog still points to the fp_other one, so
1358                 * when promoting the clone to the real program,
1359                 * this still needs to be adapted.
1360                 */
1361                memcpy(fp, fp_other, fp_other->pages * PAGE_SIZE);
1362        }
1363
1364        return fp;
1365}
1366
1367static void bpf_prog_clone_free(struct bpf_prog *fp)
1368{
1369        /* aux was stolen by the other clone, so we cannot free
1370         * it from this path! It will be freed eventually by the
1371         * other program on release.
1372         *
1373         * At this point, we don't need a deferred release since
1374         * clone is guaranteed to not be locked.
1375         */
1376        fp->aux = NULL;
1377        fp->stats = NULL;
1378        fp->active = NULL;
1379        __bpf_prog_free(fp);
1380}
1381
1382void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other)
1383{
1384        /* We have to repoint aux->prog to self, as we don't
1385         * know whether fp here is the clone or the original.
1386         */
1387        fp->aux->prog = fp;
1388        bpf_prog_clone_free(fp_other);
1389}
1390
1391struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
1392{
1393        struct bpf_insn insn_buff[16], aux[2];
1394        struct bpf_prog *clone, *tmp;
1395        int insn_delta, insn_cnt;
1396        struct bpf_insn *insn;
1397        int i, rewritten;
1398
1399        if (!prog->blinding_requested || prog->blinded)
1400                return prog;
1401
1402        clone = bpf_prog_clone_create(prog, GFP_USER);
1403        if (!clone)
1404                return ERR_PTR(-ENOMEM);
1405
1406        insn_cnt = clone->len;
1407        insn = clone->insnsi;
1408
1409        for (i = 0; i < insn_cnt; i++, insn++) {
1410                if (bpf_pseudo_func(insn)) {
1411                        /* ld_imm64 with an address of bpf subprog is not
1412                         * a user controlled constant. Don't randomize it,
1413                         * since it will conflict with jit_subprogs() logic.
1414                         */
1415                        insn++;
1416                        i++;
1417                        continue;
1418                }
1419
1420                /* We temporarily need to hold the original ld64 insn
1421                 * so that we can still access the first part in the
1422                 * second blinding run.
1423                 */
1424                if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW) &&
1425                    insn[1].code == 0)
1426                        memcpy(aux, insn, sizeof(aux));
1427
1428                rewritten = bpf_jit_blind_insn(insn, aux, insn_buff,
1429                                                clone->aux->verifier_zext);
1430                if (!rewritten)
1431                        continue;
1432
1433                tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten);
1434                if (IS_ERR(tmp)) {
1435                        /* Patching may have repointed aux->prog during
1436                         * realloc from the original one, so we need to
1437                         * fix it up here on error.
1438                         */
1439                        bpf_jit_prog_release_other(prog, clone);
1440                        return tmp;
1441                }
1442
1443                clone = tmp;
1444                insn_delta = rewritten - 1;
1445
1446                /* Walk new program and skip insns we just inserted. */
1447                insn = clone->insnsi + i + insn_delta;
1448                insn_cnt += insn_delta;
1449                i        += insn_delta;
1450        }
1451
1452        clone->blinded = 1;
1453        return clone;
1454}
1455#endif /* CONFIG_BPF_JIT */
1456
1457/* Base function for offset calculation. Needs to go into .text section,
1458 * therefore keeping it non-static as well; will also be used by JITs
1459 * anyway later on, so do not let the compiler omit it. This also needs
1460 * to go into kallsyms for correlation from e.g. bpftool, so naming
1461 * must not change.
1462 */
1463noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
1464{
1465        return 0;
1466}
1467EXPORT_SYMBOL_GPL(__bpf_call_base);
1468
1469/* All UAPI available opcodes. */
1470#define BPF_INSN_MAP(INSN_2, INSN_3)            \
1471        /* 32 bit ALU operations. */            \
1472        /*   Register based. */                 \
1473        INSN_3(ALU, ADD,  X),                   \
1474        INSN_3(ALU, SUB,  X),                   \
1475        INSN_3(ALU, AND,  X),                   \
1476        INSN_3(ALU, OR,   X),                   \
1477        INSN_3(ALU, LSH,  X),                   \
1478        INSN_3(ALU, RSH,  X),                   \
1479        INSN_3(ALU, XOR,  X),                   \
1480        INSN_3(ALU, MUL,  X),                   \
1481        INSN_3(ALU, MOV,  X),                   \
1482        INSN_3(ALU, ARSH, X),                   \
1483        INSN_3(ALU, DIV,  X),                   \
1484        INSN_3(ALU, MOD,  X),                   \
1485        INSN_2(ALU, NEG),                       \
1486        INSN_3(ALU, END, TO_BE),                \
1487        INSN_3(ALU, END, TO_LE),                \
1488        /*   Immediate based. */                \
1489        INSN_3(ALU, ADD,  K),                   \
1490        INSN_3(ALU, SUB,  K),                   \
1491        INSN_3(ALU, AND,  K),                   \
1492        INSN_3(ALU, OR,   K),                   \
1493        INSN_3(ALU, LSH,  K),                   \
1494        INSN_3(ALU, RSH,  K),                   \
1495        INSN_3(ALU, XOR,  K),                   \
1496        INSN_3(ALU, MUL,  K),                   \
1497        INSN_3(ALU, MOV,  K),                   \
1498        INSN_3(ALU, ARSH, K),                   \
1499        INSN_3(ALU, DIV,  K),                   \
1500        INSN_3(ALU, MOD,  K),                   \
1501        /* 64 bit ALU operations. */            \
1502        /*   Register based. */                 \
1503        INSN_3(ALU64, ADD,  X),                 \
1504        INSN_3(ALU64, SUB,  X),                 \
1505        INSN_3(ALU64, AND,  X),                 \
1506        INSN_3(ALU64, OR,   X),                 \
1507        INSN_3(ALU64, LSH,  X),                 \
1508        INSN_3(ALU64, RSH,  X),                 \
1509        INSN_3(ALU64, XOR,  X),                 \
1510        INSN_3(ALU64, MUL,  X),                 \
1511        INSN_3(ALU64, MOV,  X),                 \
1512        INSN_3(ALU64, ARSH, X),                 \
1513        INSN_3(ALU64, DIV,  X),                 \
1514        INSN_3(ALU64, MOD,  X),                 \
1515        INSN_2(ALU64, NEG),                     \
1516        /*   Immediate based. */                \
1517        INSN_3(ALU64, ADD,  K),                 \
1518        INSN_3(ALU64, SUB,  K),                 \
1519        INSN_3(ALU64, AND,  K),                 \
1520        INSN_3(ALU64, OR,   K),                 \
1521        INSN_3(ALU64, LSH,  K),                 \
1522        INSN_3(ALU64, RSH,  K),                 \
1523        INSN_3(ALU64, XOR,  K),                 \
1524        INSN_3(ALU64, MUL,  K),                 \
1525        INSN_3(ALU64, MOV,  K),                 \
1526        INSN_3(ALU64, ARSH, K),                 \
1527        INSN_3(ALU64, DIV,  K),                 \
1528        INSN_3(ALU64, MOD,  K),                 \
1529        /* Call instruction. */                 \
1530        INSN_2(JMP, CALL),                      \
1531        /* Exit instruction. */                 \
1532        INSN_2(JMP, EXIT),                      \
1533        /* 32-bit Jump instructions. */         \
1534        /*   Register based. */                 \
1535        INSN_3(JMP32, JEQ,  X),                 \
1536        INSN_3(JMP32, JNE,  X),                 \
1537        INSN_3(JMP32, JGT,  X),                 \
1538        INSN_3(JMP32, JLT,  X),                 \
1539        INSN_3(JMP32, JGE,  X),                 \
1540        INSN_3(JMP32, JLE,  X),                 \
1541        INSN_3(JMP32, JSGT, X),                 \
1542        INSN_3(JMP32, JSLT, X),                 \
1543        INSN_3(JMP32, JSGE, X),                 \
1544        INSN_3(JMP32, JSLE, X),                 \
1545        INSN_3(JMP32, JSET, X),                 \
1546        /*   Immediate based. */                \
1547        INSN_3(JMP32, JEQ,  K),                 \
1548        INSN_3(JMP32, JNE,  K),                 \
1549        INSN_3(JMP32, JGT,  K),                 \
1550        INSN_3(JMP32, JLT,  K),                 \
1551        INSN_3(JMP32, JGE,  K),                 \
1552        INSN_3(JMP32, JLE,  K),                 \
1553        INSN_3(JMP32, JSGT, K),                 \
1554        INSN_3(JMP32, JSLT, K),                 \
1555        INSN_3(JMP32, JSGE, K),                 \
1556        INSN_3(JMP32, JSLE, K),                 \
1557        INSN_3(JMP32, JSET, K),                 \
1558        /* Jump instructions. */                \
1559        /*   Register based. */                 \
1560        INSN_3(JMP, JEQ,  X),                   \
1561        INSN_3(JMP, JNE,  X),                   \
1562        INSN_3(JMP, JGT,  X),                   \
1563        INSN_3(JMP, JLT,  X),                   \
1564        INSN_3(JMP, JGE,  X),                   \
1565        INSN_3(JMP, JLE,  X),                   \
1566        INSN_3(JMP, JSGT, X),                   \
1567        INSN_3(JMP, JSLT, X),                   \
1568        INSN_3(JMP, JSGE, X),                   \
1569        INSN_3(JMP, JSLE, X),                   \
1570        INSN_3(JMP, JSET, X),                   \
1571        /*   Immediate based. */                \
1572        INSN_3(JMP, JEQ,  K),                   \
1573        INSN_3(JMP, JNE,  K),                   \
1574        INSN_3(JMP, JGT,  K),                   \
1575        INSN_3(JMP, JLT,  K),                   \
1576        INSN_3(JMP, JGE,  K),                   \
1577        INSN_3(JMP, JLE,  K),                   \
1578        INSN_3(JMP, JSGT, K),                   \
1579        INSN_3(JMP, JSLT, K),                   \
1580        INSN_3(JMP, JSGE, K),                   \
1581        INSN_3(JMP, JSLE, K),                   \
1582        INSN_3(JMP, JSET, K),                   \
1583        INSN_2(JMP, JA),                        \
1584        /* Store instructions. */               \
1585        /*   Register based. */                 \
1586        INSN_3(STX, MEM,  B),                   \
1587        INSN_3(STX, MEM,  H),                   \
1588        INSN_3(STX, MEM,  W),                   \
1589        INSN_3(STX, MEM,  DW),                  \
1590        INSN_3(STX, ATOMIC, W),                 \
1591        INSN_3(STX, ATOMIC, DW),                \
1592        /*   Immediate based. */                \
1593        INSN_3(ST, MEM, B),                     \
1594        INSN_3(ST, MEM, H),                     \
1595        INSN_3(ST, MEM, W),                     \
1596        INSN_3(ST, MEM, DW),                    \
1597        /* Load instructions. */                \
1598        /*   Register based. */                 \
1599        INSN_3(LDX, MEM, B),                    \
1600        INSN_3(LDX, MEM, H),                    \
1601        INSN_3(LDX, MEM, W),                    \
1602        INSN_3(LDX, MEM, DW),                   \
1603        /*   Immediate based. */                \
1604        INSN_3(LD, IMM, DW)
1605
1606bool bpf_opcode_in_insntable(u8 code)
1607{
1608#define BPF_INSN_2_TBL(x, y)    [BPF_##x | BPF_##y] = true
1609#define BPF_INSN_3_TBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = true
1610        static const bool public_insntable[256] = {
1611                [0 ... 255] = false,
1612                /* Now overwrite non-defaults ... */
1613                BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL),
1614                /* UAPI exposed, but rewritten opcodes. cBPF carry-over. */
1615                [BPF_LD | BPF_ABS | BPF_B] = true,
1616                [BPF_LD | BPF_ABS | BPF_H] = true,
1617                [BPF_LD | BPF_ABS | BPF_W] = true,
1618                [BPF_LD | BPF_IND | BPF_B] = true,
1619                [BPF_LD | BPF_IND | BPF_H] = true,
1620                [BPF_LD | BPF_IND | BPF_W] = true,
1621        };
1622#undef BPF_INSN_3_TBL
1623#undef BPF_INSN_2_TBL
1624        return public_insntable[code];
1625}
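
/* Editor's note -- illustrative sketch, not part of the original core.c:
 * an opcode byte is the OR of instruction class, operation and source,
 * e.g. BPF_ALU64 | BPF_ADD | BPF_X. The hypothetical helper below shows
 * how a caller such as the verifier could consult the table above.
 */
static __maybe_unused bool bpf_example_insn_known(const struct bpf_insn *insn)
{
	/* e.g. true for BPF_ALU64 | BPF_ADD | BPF_X, false for undefined bytes */
	return bpf_opcode_in_insntable(insn->code);
}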
1626
1627#ifndef CONFIG_BPF_JIT_ALWAYS_ON
1628u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr)
1629{
1630        memset(dst, 0, size);
1631        return -EFAULT;
1632}
1633
1634/**
1635 *      ___bpf_prog_run - run eBPF program on a given context
1636 *      @regs: the array of MAX_BPF_EXT_REG eBPF pseudo-registers
1637 *      @insn: the array of eBPF instructions
1638 *
1639 * Decode and execute eBPF instructions.
1640 *
1641 * Return: whatever value is in %BPF_R0 at program exit
1642 */
1643static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
1644{
1645#define BPF_INSN_2_LBL(x, y)    [BPF_##x | BPF_##y] = &&x##_##y
1646#define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z
1647        static const void * const jumptable[256] __annotate_jump_table = {
1648                [0 ... 255] = &&default_label,
1649                /* Now overwrite non-defaults ... */
1650                BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL),
1651                /* Non-UAPI available opcodes. */
1652                [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS,
1653                [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
1654                [BPF_ST  | BPF_NOSPEC] = &&ST_NOSPEC,
1655                [BPF_LDX | BPF_PROBE_MEM | BPF_B] = &&LDX_PROBE_MEM_B,
1656                [BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H,
1657                [BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W,
1658                [BPF_LDX | BPF_PROBE_MEM | BPF_DW] = &&LDX_PROBE_MEM_DW,
1659        };
1660#undef BPF_INSN_3_LBL
1661#undef BPF_INSN_2_LBL
1662        u32 tail_call_cnt = 0;
1663
1664#define CONT     ({ insn++; goto select_insn; })
1665#define CONT_JMP ({ insn++; goto select_insn; })
1666
1667select_insn:
1668        goto *jumptable[insn->code];
1669
1670        /* Explicitly mask the register-based shift amounts with 63 or 31
1671         * to avoid undefined behavior. Normally this won't affect the
1672         * generated code: on native 64 bit archs such as x86-64 or
1673         * arm64, for example, the compiler optimizes the AND away for
1674         * the interpreter. In case of JITs, each of the JIT backends compiles
1675         * the BPF shift operations to machine instructions which produce
1676         * implementation-defined results in such a case; the resulting
1677         * contents of the register may be arbitrary, but program behaviour
1678         * as a whole remains defined. In other words, in case of JIT backends,
1679         * the AND must /not/ be added to the emitted LSH/RSH/ARSH translation.
1680         */
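        /* Worked example (editor's note, not in the original source): for
         * ALU64_LSH_X with SRC == 77, the handler below computes
         * DST << (77 & 63) == DST << 13, so the shift amount stays in range
         * even for out-of-range operands.
         */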
1681        /* ALU (shifts) */
1682#define SHT(OPCODE, OP)                                 \
1683        ALU64_##OPCODE##_X:                             \
1684                DST = DST OP (SRC & 63);                \
1685                CONT;                                   \
1686        ALU_##OPCODE##_X:                               \
1687                DST = (u32) DST OP ((u32) SRC & 31);    \
1688                CONT;                                   \
1689        ALU64_##OPCODE##_K:                             \
1690                DST = DST OP IMM;                       \
1691                CONT;                                   \
1692        ALU_##OPCODE##_K:                               \
1693                DST = (u32) DST OP (u32) IMM;           \
1694                CONT;
1695        /* ALU (rest) */
1696#define ALU(OPCODE, OP)                                 \
1697        ALU64_##OPCODE##_X:                             \
1698                DST = DST OP SRC;                       \
1699                CONT;                                   \
1700        ALU_##OPCODE##_X:                               \
1701                DST = (u32) DST OP (u32) SRC;           \
1702                CONT;                                   \
1703        ALU64_##OPCODE##_K:                             \
1704                DST = DST OP IMM;                       \
1705                CONT;                                   \
1706        ALU_##OPCODE##_K:                               \
1707                DST = (u32) DST OP (u32) IMM;           \
1708                CONT;
1709        ALU(ADD,  +)
1710        ALU(SUB,  -)
1711        ALU(AND,  &)
1712        ALU(OR,   |)
1713        ALU(XOR,  ^)
1714        ALU(MUL,  *)
1715        SHT(LSH, <<)
1716        SHT(RSH, >>)
1717#undef SHT
1718#undef ALU
1719        ALU_NEG:
1720                DST = (u32) -DST;
1721                CONT;
1722        ALU64_NEG:
1723                DST = -DST;
1724                CONT;
1725        ALU_MOV_X:
1726                DST = (u32) SRC;
1727                CONT;
1728        ALU_MOV_K:
1729                DST = (u32) IMM;
1730                CONT;
1731        ALU64_MOV_X:
1732                DST = SRC;
1733                CONT;
1734        ALU64_MOV_K:
1735                DST = IMM;
1736                CONT;
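        /* Editor's note (not in the original source): BPF_LD | BPF_IMM | BPF_DW
         * occupies two instruction slots; insn[0].imm holds the lower and
         * insn[1].imm the upper 32 bits, hence the extra insn++ below.
         */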
1737        LD_IMM_DW:
1738                DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32;
1739                insn++;
1740                CONT;
1741        ALU_ARSH_X:
1742                DST = (u64) (u32) (((s32) DST) >> (SRC & 31));
1743                CONT;
1744        ALU_ARSH_K:
1745                DST = (u64) (u32) (((s32) DST) >> IMM);
1746                CONT;
1747        ALU64_ARSH_X:
1748                (*(s64 *) &DST) >>= (SRC & 63);
1749                CONT;
1750        ALU64_ARSH_K:
1751                (*(s64 *) &DST) >>= IMM;
1752                CONT;
1753        ALU64_MOD_X:
1754                div64_u64_rem(DST, SRC, &AX);
1755                DST = AX;
1756                CONT;
1757        ALU_MOD_X:
1758                AX = (u32) DST;
1759                DST = do_div(AX, (u32) SRC);
1760                CONT;
1761        ALU64_MOD_K:
1762                div64_u64_rem(DST, IMM, &AX);
1763                DST = AX;
1764                CONT;
1765        ALU_MOD_K:
1766                AX = (u32) DST;
1767                DST = do_div(AX, (u32) IMM);
1768                CONT;
1769        ALU64_DIV_X:
1770                DST = div64_u64(DST, SRC);
1771                CONT;
1772        ALU_DIV_X:
1773                AX = (u32) DST;
1774                do_div(AX, (u32) SRC);
1775                DST = (u32) AX;
1776                CONT;
1777        ALU64_DIV_K:
1778                DST = div64_u64(DST, IMM);
1779                CONT;
1780        ALU_DIV_K:
1781                AX = (u32) DST;
1782                do_div(AX, (u32) IMM);
1783                DST = (u32) AX;
1784                CONT;
1785        ALU_END_TO_BE:
1786                switch (IMM) {
1787                case 16:
1788                        DST = (__force u16) cpu_to_be16(DST);
1789                        break;
1790                case 32:
1791                        DST = (__force u32) cpu_to_be32(DST);
1792                        break;
1793                case 64:
1794                        DST = (__force u64) cpu_to_be64(DST);
1795                        break;
1796                }
1797                CONT;
1798        ALU_END_TO_LE:
1799                switch (IMM) {
1800                case 16:
1801                        DST = (__force u16) cpu_to_le16(DST);
1802                        break;
1803                case 32:
1804                        DST = (__force u32) cpu_to_le32(DST);
1805                        break;
1806                case 64:
1807                        DST = (__force u64) cpu_to_le64(DST);
1808                        break;
1809                }
1810                CONT;
1811
1812        /* CALL */
1813        JMP_CALL:
1814                /* Function call scratches BPF_R1-BPF_R5 registers,
1815                 * preserves BPF_R6-BPF_R9, and stores return value
1816                 * into BPF_R0.
1817                 */
1818                BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3,
1819                                                       BPF_R4, BPF_R5);
1820                CONT;
1821
1822        JMP_CALL_ARGS:
1823                BPF_R0 = (__bpf_call_base_args + insn->imm)(BPF_R1, BPF_R2,
1824                                                            BPF_R3, BPF_R4,
1825                                                            BPF_R5,
1826                                                            insn + insn->off + 1);
1827                CONT;
1828
1829        JMP_TAIL_CALL: {
1830                struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
1831                struct bpf_array *array = container_of(map, struct bpf_array, map);
1832                struct bpf_prog *prog;
1833                u32 index = BPF_R3;
1834
1835                if (unlikely(index >= array->map.max_entries))
1836                        goto out;
1837
1838                if (unlikely(tail_call_cnt >= MAX_TAIL_CALL_CNT))
1839                        goto out;
1840
1841                tail_call_cnt++;
1842
1843                prog = READ_ONCE(array->ptrs[index]);
1844                if (!prog)
1845                        goto out;
1846
1847                /* ARG1 at this point is guaranteed to point to CTX from
1848                 * the verifier side due to the fact that the tail call is
1849                 * handled like a helper, that is, bpf_tail_call_proto,
1850                 * where arg1_type is ARG_PTR_TO_CTX.
1851                 */
1852                insn = prog->insnsi;
1853                goto select_insn;
1854out:
1855                CONT;
1856        }
1857        JMP_JA:
1858                insn += insn->off;
1859                CONT;
1860        JMP_EXIT:
1861                return BPF_R0;
1862        /* JMP */
1863#define COND_JMP(SIGN, OPCODE, CMP_OP)                          \
1864        JMP_##OPCODE##_X:                                       \
1865                if ((SIGN##64) DST CMP_OP (SIGN##64) SRC) {     \
1866                        insn += insn->off;                      \
1867                        CONT_JMP;                               \
1868                }                                               \
1869                CONT;                                           \
1870        JMP32_##OPCODE##_X:                                     \
1871                if ((SIGN##32) DST CMP_OP (SIGN##32) SRC) {     \
1872                        insn += insn->off;                      \
1873                        CONT_JMP;                               \
1874                }                                               \
1875                CONT;                                           \
1876        JMP_##OPCODE##_K:                                       \
1877                if ((SIGN##64) DST CMP_OP (SIGN##64) IMM) {     \
1878                        insn += insn->off;                      \
1879                        CONT_JMP;                               \
1880                }                                               \
1881                CONT;                                           \
1882        JMP32_##OPCODE##_K:                                     \
1883                if ((SIGN##32) DST CMP_OP (SIGN##32) IMM) {     \
1884                        insn += insn->off;                      \
1885                        CONT_JMP;                               \
1886                }                                               \
1887                CONT;
1888        COND_JMP(u, JEQ, ==)
1889        COND_JMP(u, JNE, !=)
1890        COND_JMP(u, JGT, >)
1891        COND_JMP(u, JLT, <)
1892        COND_JMP(u, JGE, >=)
1893        COND_JMP(u, JLE, <=)
1894        COND_JMP(u, JSET, &)
1895        COND_JMP(s, JSGT, >)
1896        COND_JMP(s, JSLT, <)
1897        COND_JMP(s, JSGE, >=)
1898        COND_JMP(s, JSLE, <=)
1899#undef COND_JMP
1900        /* ST, STX and LDX */
1901        ST_NOSPEC:
1902                /* Speculation barrier for mitigating Speculative Store Bypass.
1903                 * In case of arm64, we rely on the firmware mitigation as
1904                 * controlled via the ssbd kernel parameter. Whenever the
1905                 * mitigation is enabled, it works for all of the kernel code
1906                 * with no need to provide any additional instructions here.
1907                 * In case of x86, we use 'lfence' insn for mitigation. We
1908                 * reuse preexisting logic from Spectre v1 mitigation that
1909                 * happens to produce the required code on x86 for v4 as well.
1910                 */
1911#ifdef CONFIG_X86
1912                barrier_nospec();
1913#endif
1914                CONT;
1915#define LDST(SIZEOP, SIZE)                                              \
1916        STX_MEM_##SIZEOP:                                               \
1917                *(SIZE *)(unsigned long) (DST + insn->off) = SRC;       \
1918                CONT;                                                   \
1919        ST_MEM_##SIZEOP:                                                \
1920                *(SIZE *)(unsigned long) (DST + insn->off) = IMM;       \
1921                CONT;                                                   \
1922        LDX_MEM_##SIZEOP:                                               \
1923                DST = *(SIZE *)(unsigned long) (SRC + insn->off);       \
1924                CONT;                                                   \
1925        LDX_PROBE_MEM_##SIZEOP:                                         \
1926                bpf_probe_read_kernel(&DST, sizeof(SIZE),               \
1927                                      (const void *)(long) (SRC + insn->off));  \
1928                DST = *((SIZE *)&DST);                                  \
1929                CONT;
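        /* Editor's note (not in the original source): for the PROBE_MEM
         * variants, a faulting kernel read does not abort the program; with
         * the weak bpf_probe_read_kernel() above, the destination bytes are
         * zeroed and the load completes with a value of 0.
         */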
1930
1931        LDST(B,   u8)
1932        LDST(H,  u16)
1933        LDST(W,  u32)
1934        LDST(DW, u64)
1935#undef LDST
1936
1937#define ATOMIC_ALU_OP(BOP, KOP)                                         \
1938                case BOP:                                               \
1939                        if (BPF_SIZE(insn->code) == BPF_W)              \
1940                                atomic_##KOP((u32) SRC, (atomic_t *)(unsigned long) \
1941                                             (DST + insn->off));        \
1942                        else                                            \
1943                                atomic64_##KOP((u64) SRC, (atomic64_t *)(unsigned long) \
1944                                               (DST + insn->off));      \
1945                        break;                                          \
1946                case BOP | BPF_FETCH:                                   \
1947                        if (BPF_SIZE(insn->code) == BPF_W)              \
1948                                SRC = (u32) atomic_fetch_##KOP(         \
1949                                        (u32) SRC,                      \
1950                                        (atomic_t *)(unsigned long) (DST + insn->off)); \
1951                        else                                            \
1952                                SRC = (u64) atomic64_fetch_##KOP(       \
1953                                        (u64) SRC,                      \
1954                                        (atomic64_t *)(unsigned long) (DST + insn->off)); \
1955                        break;
1956
1957        STX_ATOMIC_DW:
1958        STX_ATOMIC_W:
1959                switch (IMM) {
1960                ATOMIC_ALU_OP(BPF_ADD, add)
1961                ATOMIC_ALU_OP(BPF_AND, and)
1962                ATOMIC_ALU_OP(BPF_OR, or)
1963                ATOMIC_ALU_OP(BPF_XOR, xor)
1964#undef ATOMIC_ALU_OP
1965
1966                case BPF_XCHG:
1967                        if (BPF_SIZE(insn->code) == BPF_W)
1968                                SRC = (u32) atomic_xchg(
1969                                        (atomic_t *)(unsigned long) (DST + insn->off),
1970                                        (u32) SRC);
1971                        else
1972                                SRC = (u64) atomic64_xchg(
1973                                        (atomic64_t *)(unsigned long) (DST + insn->off),
1974                                        (u64) SRC);
1975                        break;
1976                case BPF_CMPXCHG:
1977                        if (BPF_SIZE(insn->code) == BPF_W)
1978                                BPF_R0 = (u32) atomic_cmpxchg(
1979                                        (atomic_t *)(unsigned long) (DST + insn->off),
1980                                        (u32) BPF_R0, (u32) SRC);
1981                        else
1982                                BPF_R0 = (u64) atomic64_cmpxchg(
1983                                        (atomic64_t *)(unsigned long) (DST + insn->off),
1984                                        (u64) BPF_R0, (u64) SRC);
1985                        break;
1986
1987                default:
1988                        goto default_label;
1989                }
1990                CONT;
1991
1992        default_label:
1993                /* If we ever reach this, we have a bug somewhere. Die hard here
1994                 * instead of just returning 0; we could be somewhere in a subprog,
1995                 * so execution could continue otherwise, which we do /not/ want.
1996                 *
1997                 * Note, verifier whitelists all opcodes in bpf_opcode_in_insntable().
1998                 */
1999                pr_warn("BPF interpreter: unknown opcode %02x (imm: 0x%x)\n",
2000                        insn->code, insn->imm);
2001                BUG_ON(1);
2002                return 0;
2003}
2004
2005#define PROG_NAME(stack_size) __bpf_prog_run##stack_size
2006#define DEFINE_BPF_PROG_RUN(stack_size) \
2007static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \
2008{ \
2009        u64 stack[stack_size / sizeof(u64)]; \
2010        u64 regs[MAX_BPF_EXT_REG]; \
2011\
2012        FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
2013        ARG1 = (u64) (unsigned long) ctx; \
2014        return ___bpf_prog_run(regs, insn); \
2015}
2016
2017#define PROG_NAME_ARGS(stack_size) __bpf_prog_run_args##stack_size
2018#define DEFINE_BPF_PROG_RUN_ARGS(stack_size) \
2019static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \
2020                                      const struct bpf_insn *insn) \
2021{ \
2022        u64 stack[stack_size / sizeof(u64)]; \
2023        u64 regs[MAX_BPF_EXT_REG]; \
2024\
2025        FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
2026        BPF_R1 = r1; \
2027        BPF_R2 = r2; \
2028        BPF_R3 = r3; \
2029        BPF_R4 = r4; \
2030        BPF_R5 = r5; \
2031        return ___bpf_prog_run(regs, insn); \
2032}
2033
2034#define EVAL1(FN, X) FN(X)
2035#define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y)
2036#define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y)
2037#define EVAL4(FN, X, Y...) FN(X) EVAL3(FN, Y)
2038#define EVAL5(FN, X, Y...) FN(X) EVAL4(FN, Y)
2039#define EVAL6(FN, X, Y...) FN(X) EVAL5(FN, Y)
2040
2041EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192);
2042EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384);
2043EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512);
2044
2045EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 32, 64, 96, 128, 160, 192);
2046EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 224, 256, 288, 320, 352, 384);
2047EVAL4(DEFINE_BPF_PROG_RUN_ARGS, 416, 448, 480, 512);
2048
2049#define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size),
2050
2051static unsigned int (*interpreters[])(const void *ctx,
2052                                      const struct bpf_insn *insn) = {
2053EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
2054EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
2055EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
2056};
2057#undef PROG_NAME_LIST
2058#define PROG_NAME_LIST(stack_size) PROG_NAME_ARGS(stack_size),
2059static u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5,
2060                                  const struct bpf_insn *insn) = {
2061EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
2062EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
2063EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
2064};
2065#undef PROG_NAME_LIST
2066
2067void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth)
2068{
2069        stack_depth = max_t(u32, stack_depth, 1);
2070        insn->off = (s16) insn->imm;
2071        insn->imm = interpreters_args[(round_up(stack_depth, 32) / 32) - 1] -
2072                __bpf_call_base_args;
2073        insn->code = BPF_JMP | BPF_CALL_ARGS;
2074}
2075
2076#else
2077static unsigned int __bpf_prog_ret0_warn(const void *ctx,
2078                                         const struct bpf_insn *insn)
2079{
2080        /* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON
2081         * is not working properly, so warn about it!
2082         */
2083        WARN_ON_ONCE(1);
2084        return 0;
2085}
2086#endif
2087
2088bool bpf_prog_map_compatible(struct bpf_map *map,
2089                             const struct bpf_prog *fp)
2090{
2091        bool ret;
2092
2093        if (fp->kprobe_override)
2094                return false;
2095
2096        spin_lock(&map->owner.lock);
2097        if (!map->owner.type) {
2098                /* There's no owner yet where we could check for
2099                 * compatibility.
2100                 */
2101                map->owner.type  = fp->type;
2102                map->owner.jited = fp->jited;
2103                map->owner.xdp_has_frags = fp->aux->xdp_has_frags;
2104                ret = true;
2105        } else {
2106                ret = map->owner.type  == fp->type &&
2107                      map->owner.jited == fp->jited &&
2108                      map->owner.xdp_has_frags == fp->aux->xdp_has_frags;
2109        }
2110        spin_unlock(&map->owner.lock);
2111
2112        return ret;
2113}
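
/* Editor's note -- illustrative sketch, not part of the original core.c:
 * the first program associated with a prog-array map "owns" it; later
 * programs must match the owner's type, JITed-ness and xdp_has_frags
 * flag. The hypothetical helper below shows the intended call pattern.
 */
static __maybe_unused int bpf_example_check_map_owner(struct bpf_map *map,
						      const struct bpf_prog *fp)
{
	return bpf_prog_map_compatible(map, fp) ? 0 : -EINVAL;
}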
2114
2115static int bpf_check_tail_call(const struct bpf_prog *fp)
2116{
2117        struct bpf_prog_aux *aux = fp->aux;
2118        int i, ret = 0;
2119
2120        mutex_lock(&aux->used_maps_mutex);
2121        for (i = 0; i < aux->used_map_cnt; i++) {
2122                struct bpf_map *map = aux->used_maps[i];
2123
2124                if (!map_type_contains_progs(map))
2125                        continue;
2126
2127                if (!bpf_prog_map_compatible(map, fp)) {
2128                        ret = -EINVAL;
2129                        goto out;
2130                }
2131        }
2132
2133out:
2134        mutex_unlock(&aux->used_maps_mutex);
2135        return ret;
2136}
2137
2138static void bpf_prog_select_func(struct bpf_prog *fp)
2139{
2140#ifndef CONFIG_BPF_JIT_ALWAYS_ON
2141        u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);
2142
2143        fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1];
2144#else
2145        fp->bpf_func = __bpf_prog_ret0_warn;
2146#endif
2147}
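
/* Editor's note -- illustrative sketch, not part of the original core.c:
 * in the !CONFIG_BPF_JIT_ALWAYS_ON case, the bucket selection above maps
 * a program's stack depth to one of the 16 interpreters generated
 * earlier: depths 1..32 select index 0 (__bpf_prog_run32), 33..64 select
 * index 1, and so on up to 481..512 selecting index 15
 * (__bpf_prog_run512). The hypothetical helper below only restates that
 * arithmetic.
 */
static __maybe_unused u32 bpf_example_interp_index(u32 stack_depth)
{
	stack_depth = max_t(u32, stack_depth, 1);
	return (round_up(stack_depth, 32) / 32) - 1;
}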
2148
2149/**
2150 *      bpf_prog_select_runtime - select exec runtime for BPF program
2151 *      @fp: bpf_prog populated with BPF program
2152 *      @err: pointer to error variable
2153 *
2154 * Try to JIT the eBPF program; if the JIT is not available, use the interpreter.
2155 * The BPF program will be executed via bpf_prog_run() function.
2156 *
2157 * Return: the &fp argument along with &err set to 0 for success or
2158 * a negative errno code on failure
2159 */
2160struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
2161{
2162        /* In case of BPF to BPF calls, the verifier did all the prep
2163         * work with regard to JITing, etc.
2164         */
2165        bool jit_needed = false;
2166
2167        if (fp->bpf_func)
2168                goto finalize;
2169
2170        if (IS_ENABLED(CONFIG_BPF_JIT_ALWAYS_ON) ||
2171            bpf_prog_has_kfunc_call(fp))
2172                jit_needed = true;
2173
2174        bpf_prog_select_func(fp);
2175
2176        /* eBPF JITs can rewrite the program in case constant
2177         * blinding is active. However, in case of error during
2178         * blinding, bpf_int_jit_compile() must always return a
2179         * valid program, which in this case would simply not
2180         * be JITed, but fall back to the interpreter.
2181         */
2182        if (!bpf_prog_is_dev_bound(fp->aux)) {
2183                *err = bpf_prog_alloc_jited_linfo(fp);
2184                if (*err)
2185                        return fp;
2186
2187                fp = bpf_int_jit_compile(fp);
2188                bpf_prog_jit_attempt_done(fp);
2189                if (!fp->jited && jit_needed) {
2190                        *err = -ENOTSUPP;
2191                        return fp;
2192                }
2193        } else {
2194                *err = bpf_prog_offload_compile(fp);
2195                if (*err)
2196                        return fp;
2197        }
2198
2199finalize:
2200        bpf_prog_lock_ro(fp);
2201
2202        /* The tail call compatibility check can only be done at
2203         * this late stage, as we need to determine whether we deal
2204         * with JITed or non-JITed program concatenations, and not
2205         * all eBPF JITs might immediately support all features.
2206         */
2207        *err = bpf_check_tail_call(fp);
2208
2209        return fp;
2210}
2211EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
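
/* Editor's note -- illustrative sketch, not part of the original core.c:
 * a minimal caller following the kernel-doc contract above. The function
 * name is hypothetical and the error path is simplified; the real load
 * path (bpf_prog_load()) performs additional cleanup.
 */
static __maybe_unused struct bpf_prog *bpf_example_finalize_prog(struct bpf_prog *fp)
{
	int err;

	fp = bpf_prog_select_runtime(fp, &err);
	if (err) {
		bpf_prog_free(fp);
		return NULL;
	}
	return fp;
}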
2212
2213static unsigned int __bpf_prog_ret1(const void *ctx,
2214                                    const struct bpf_insn *insn)
2215{
2216        return 1;
2217}
2218
2219static struct bpf_prog_dummy {
2220        struct bpf_prog prog;
2221} dummy_bpf_prog = {
2222        .prog = {
2223                .bpf_func = __bpf_prog_ret1,
2224        },
2225};
2226
2227struct bpf_empty_prog_array bpf_empty_prog_array = {
2228        .null_prog = NULL,
2229};
2230EXPORT_SYMBOL(bpf_empty_prog_array);
2231
2232struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
2233{
2234        if (prog_cnt)
2235                return kzalloc(sizeof(struct bpf_prog_array) +
2236                               sizeof(struct bpf_prog_array_item) *
2237                               (prog_cnt + 1),
2238                               flags);
2239
2240        return &bpf_empty_prog_array.hdr;
2241}
2242
2243void bpf_prog_array_free(struct bpf_prog_array *progs)
2244{
2245        if (!progs || progs == &bpf_empty_prog_array.hdr)
2246                return;
2247        kfree_rcu(progs, rcu);
2248}
2249
2250static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu)
2251{
2252        struct bpf_prog_array *progs;
2253
2254        progs = container_of(rcu, struct bpf_prog_array, rcu);
2255        kfree_rcu(progs, rcu);
2256}
2257
2258void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs)
2259{
2260        if (!progs || progs == &bpf_empty_prog_array.hdr)
2261                return;
2262        call_rcu_tasks_trace(&progs->rcu, __bpf_prog_array_free_sleepable_cb);
2263}
2264
2265int bpf_prog_array_length(struct bpf_prog_array *array)
2266{
2267        struct bpf_prog_array_item *item;
2268        u32 cnt = 0;
2269
2270        for (item = array->items; item->prog; item++)
2271                if (item->prog != &dummy_bpf_prog.prog)
2272                        cnt++;
2273        return cnt;
2274}
2275
2276bool bpf_prog_array_is_empty(struct bpf_prog_array *array)
2277{
2278        struct bpf_prog_array_item *item;
2279
2280        for (item = array->items; item->prog; item++)
2281                if (item->prog != &dummy_bpf_prog.prog)
2282                        return false;
2283        return true;
2284}
2285
2286static bool bpf_prog_array_copy_core(struct bpf_prog_array *array,
2287                                     u32 *prog_ids,
2288                                     u32 request_cnt)
2289{
2290        struct bpf_prog_array_item *item;
2291        int i = 0;
2292
2293        for (item = array->items; item->prog; item++) {
2294                if (item->prog == &dummy_bpf_prog.prog)
2295                        continue;
2296                prog_ids[i] = item->prog->aux->id;
2297                if (++i == request_cnt) {
2298                        item++;
2299                        break;
2300                }
2301        }
2302
2303        return !!(item->prog);
2304}
2305
2306int bpf_prog_array_copy_to_user(struct bpf_prog_array *array,
2307                                __u32 __user *prog_ids, u32 cnt)
2308{
2309        unsigned long err = 0;
2310        bool nospc;
2311        u32 *ids;
2312
2313        /* users of this function are doing:
2314         * cnt = bpf_prog_array_length();
2315         * if (cnt > 0)
2316         *     bpf_prog_array_copy_to_user(..., cnt);
2317         * so the kcalloc below doesn't need an extra cnt > 0 check.
2318         */
2319        ids = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN);
2320        if (!ids)
2321                return -ENOMEM;
2322        nospc = bpf_prog_array_copy_core(array, ids, cnt);
2323        err = copy_to_user(prog_ids, ids, cnt * sizeof(u32));
2324        kfree(ids);
2325        if (err)
2326                return -EFAULT;
2327        if (nospc)
2328                return -ENOSPC;
2329        return 0;
2330}
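
/* Editor's note -- illustrative sketch, not part of the original core.c:
 * the calling convention described in the comment above -- query the
 * length first, then copy at most that many ids to user space. The
 * wrapper name is hypothetical.
 */
static __maybe_unused int bpf_example_query_prog_ids(struct bpf_prog_array *array,
						     __u32 __user *uids)
{
	u32 cnt = bpf_prog_array_length(array);

	if (!cnt)
		return 0;
	return bpf_prog_array_copy_to_user(array, uids, cnt);
}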
2331
2332void bpf_prog_array_delete_safe(struct bpf_prog_array *array,
2333                                struct bpf_prog *old_prog)
2334{
2335        struct bpf_prog_array_item *item;
2336
2337        for (item = array->items; item->prog; item++)
2338                if (item->prog == old_prog) {
2339                        WRITE_ONCE(item->prog, &dummy_bpf_prog.prog);
2340                        break;
2341                }
2342}
2343
2344/**
2345 * bpf_prog_array_delete_safe_at() - Replaces the program at the given
2346 *                                   index into the program array with
2347 *                                   a dummy no-op program.
2348 * @array: a bpf_prog_array
2349 * @index: the index of the program to replace
2350 *
2351 * Skips over dummy programs, by not counting them, when calculating
2352 * the position of the program to replace.
2353 *
2354 * Return:
2355 * * 0          - Success
2356 * * -EINVAL    - Invalid index value. Must be a non-negative integer.
2357 * * -ENOENT    - Index out of range
2358 */
2359int bpf_prog_array_delete_safe_at(struct bpf_prog_array *array, int index)
2360{
2361        return bpf_prog_array_update_at(array, index, &dummy_bpf_prog.prog);
2362}
2363
2364/**
2365 * bpf_prog_array_update_at() - Updates the program at the given index
2366 *                              into the program array.
2367 * @array: a bpf_prog_array
2368 * @index: the index of the program to update
2369 * @prog: the program to insert into the array
2370 *
2371 * Skips over dummy programs, by not counting them, when calculating
2372 * the position of the program to update.
2373 *
2374 * Return:
2375 * * 0          - Success
2376 * * -EINVAL    - Invalid index value. Must be a non-negative integer.
2377 * * -ENOENT    - Index out of range
2378 */
2379int bpf_prog_array_update_at(struct bpf_prog_array *array, int index,
2380                             struct bpf_prog *prog)
2381{
2382        struct bpf_prog_array_item *item;
2383
2384        if (unlikely(index < 0))
2385                return -EINVAL;
2386
2387        for (item = array->items; item->prog; item++) {
2388                if (item->prog == &dummy_bpf_prog.prog)
2389                        continue;
2390                if (!index) {
2391                        WRITE_ONCE(item->prog, prog);
2392                        return 0;
2393                }
2394                index--;
2395        }
2396        return -ENOENT;
2397}
2398
2399int bpf_prog_array_copy(struct bpf_prog_array *old_array,
2400                        struct bpf_prog *exclude_prog,
2401                        struct bpf_prog *include_prog,
2402                        u64 bpf_cookie,
2403                        struct bpf_prog_array **new_array)
2404{
2405        int new_prog_cnt, carry_prog_cnt = 0;
2406        struct bpf_prog_array_item *existing, *new;
2407        struct bpf_prog_array *array;
2408        bool found_exclude = false;
2409
2410        /* Figure out how many existing progs we need to carry over to
2411         * the new array.
2412         */
2413        if (old_array) {
2414                existing = old_array->items;
2415                for (; existing->prog; existing++) {
2416                        if (existing->prog == exclude_prog) {
2417                                found_exclude = true;
2418                                continue;
2419                        }
2420                        if (existing->prog != &dummy_bpf_prog.prog)
2421                                carry_prog_cnt++;
2422                        if (existing->prog == include_prog)
2423                                return -EEXIST;
2424                }
2425        }
2426
2427        if (exclude_prog && !found_exclude)
2428                return -ENOENT;
2429
2430        /* How many progs (not NULL) will be in the new array? */
2431        new_prog_cnt = carry_prog_cnt;
2432        if (include_prog)
2433                new_prog_cnt += 1;
2434
2435        /* Do we have any prog (not NULL) in the new array? */
2436        if (!new_prog_cnt) {
2437                *new_array = NULL;
2438                return 0;
2439        }
2440
2441        /* +1 as the end of prog_array is marked with NULL */
2442        array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL);
2443        if (!array)
2444                return -ENOMEM;
2445        new = array->items;
2446
2447        /* Fill in the new prog array */
2448        if (carry_prog_cnt) {
2449                existing = old_array->items;
2450                for (; existing->prog; existing++) {
2451                        if (existing->prog == exclude_prog ||
2452                            existing->prog == &dummy_bpf_prog.prog)
2453                                continue;
2454
2455                        new->prog = existing->prog;
2456                        new->bpf_cookie = existing->bpf_cookie;
2457                        new++;
2458                }
2459        }
2460        if (include_prog) {
2461                new->prog = include_prog;
2462                new->bpf_cookie = bpf_cookie;
2463                new++;
2464        }
2465        new->prog = NULL;
2466        *new_array = array;
2467        return 0;
2468}
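
/* Editor's note -- illustrative sketch, not part of the original core.c:
 * a hypothetical attach path using bpf_prog_array_copy(). The old array
 * is left untouched; a new one is built with @prog appended (and its
 * cookie recorded), published via RCU and the old array freed. Locking
 * around *slot is assumed to be provided by the caller.
 */
static __maybe_unused int bpf_example_attach_prog(struct bpf_prog_array __rcu **slot,
						  struct bpf_prog *prog, u64 cookie)
{
	struct bpf_prog_array *old_array, *new_array;
	int err;

	old_array = rcu_dereference_protected(*slot, 1);
	err = bpf_prog_array_copy(old_array, NULL, prog, cookie, &new_array);
	if (err)
		return err;
	rcu_assign_pointer(*slot, new_array);
	bpf_prog_array_free(old_array);
	return 0;
}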
2469
2470int bpf_prog_array_copy_info(struct bpf_prog_array *array,
2471                             u32 *prog_ids, u32 request_cnt,
2472                             u32 *prog_cnt)
2473{
2474        u32 cnt = 0;
2475
2476        if (array)
2477                cnt = bpf_prog_array_length(array);
2478
2479        *prog_cnt = cnt;
2480
2481        /* return early if user requested only program count or nothing to copy */
2482        if (!request_cnt || !cnt)
2483                return 0;
2484
2485        /* this function is called under trace/bpf_trace.c: bpf_event_mutex */
2486        return bpf_prog_array_copy_core(array, prog_ids, request_cnt) ? -ENOSPC
2487                                                                     : 0;
2488}
2489
2490void __bpf_free_used_maps(struct bpf_prog_aux *aux,
2491                          struct bpf_map **used_maps, u32 len)
2492{
2493        struct bpf_map *map;
2494        u32 i;
2495
2496        for (i = 0; i < len; i++) {
2497                map = used_maps[i];
2498                if (map->ops->map_poke_untrack)
2499                        map->ops->map_poke_untrack(map, aux);
2500                bpf_map_put(map);
2501        }
2502}
2503
2504static void bpf_free_used_maps(struct bpf_prog_aux *aux)
2505{
2506        __bpf_free_used_maps(aux, aux->used_maps, aux->used_map_cnt);
2507        kfree(aux->used_maps);
2508}
2509
2510void __bpf_free_used_btfs(struct bpf_prog_aux *aux,
2511                          struct btf_mod_pair *used_btfs, u32 len)
2512{
2513#ifdef CONFIG_BPF_SYSCALL
2514        struct btf_mod_pair *btf_mod;
2515        u32 i;
2516
2517        for (i = 0; i < len; i++) {
2518                btf_mod = &used_btfs[i];
2519                if (btf_mod->module)
2520                        module_put(btf_mod->module);
2521                btf_put(btf_mod->btf);
2522        }
2523#endif
2524}
2525
2526static void bpf_free_used_btfs(struct bpf_prog_aux *aux)
2527{
2528        __bpf_free_used_btfs(aux, aux->used_btfs, aux->used_btf_cnt);
2529        kfree(aux->used_btfs);
2530}
2531
2532static void bpf_prog_free_deferred(struct work_struct *work)
2533{
2534        struct bpf_prog_aux *aux;
2535        int i;
2536
2537        aux = container_of(work, struct bpf_prog_aux, work);
2538#ifdef CONFIG_BPF_SYSCALL
2539        bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab);
2540#endif
2541#ifdef CONFIG_CGROUP_BPF
2542        if (aux->cgroup_atype != CGROUP_BPF_ATTACH_TYPE_INVALID)
2543                bpf_cgroup_atype_put(aux->cgroup_atype);
2544#endif
2545        bpf_free_used_maps(aux);
2546        bpf_free_used_btfs(aux);
2547        if (bpf_prog_is_dev_bound(aux))
2548                bpf_prog_offload_destroy(aux->prog);
2549#ifdef CONFIG_PERF_EVENTS
2550        if (aux->prog->has_callchain_buf)
2551                put_callchain_buffers();
2552#endif
2553        if (aux->dst_trampoline)
2554                bpf_trampoline_put(aux->dst_trampoline);
2555        for (i = 0; i < aux->func_cnt; i++) {
2556                /* We can just unlink the subprog poke descriptor table as
2557                 * it was originally linked to the main program and is also
2558                 * released along with it.
2559                 */
2560                aux->func[i]->aux->poke_tab = NULL;
2561                bpf_jit_free(aux->func[i]);
2562        }
2563        if (aux->func_cnt) {
2564                kfree(aux->func);
2565                bpf_prog_unlock_free(aux->prog);
2566        } else {
2567                bpf_jit_free(aux->prog);
2568        }
2569}
2570
2571void bpf_prog_free(struct bpf_prog *fp)
2572{
2573        struct bpf_prog_aux *aux = fp->aux;
2574
2575        if (aux->dst_prog)
2576                bpf_prog_put(aux->dst_prog);
2577        INIT_WORK(&aux->work, bpf_prog_free_deferred);
2578        schedule_work(&aux->work);
2579}
2580EXPORT_SYMBOL_GPL(bpf_prog_free);
2581
2582/* RNG for unprivileged user space with separated state from prandom_u32(). */
2583static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state);
2584
2585void bpf_user_rnd_init_once(void)
2586{
2587        prandom_init_once(&bpf_user_rnd_state);
2588}
2589
2590BPF_CALL_0(bpf_user_rnd_u32)
2591{
2592        /* Should someone ever have the rather unwise idea to use some
2593         * of the registers passed into this function, then note that
2594         * this function is called from native eBPF and classic-to-eBPF
2595         * transformations. Register assignments from both sides are
2596         * different, e.g. classic always sets fn(ctx, A, X) here.
2597         */
2598        struct rnd_state *state;
2599        u32 res;
2600
2601        state = &get_cpu_var(bpf_user_rnd_state);
2602        res = prandom_u32_state(state);
2603        put_cpu_var(bpf_user_rnd_state);
2604
2605        return res;
2606}
2607
2608BPF_CALL_0(bpf_get_raw_cpu_id)
2609{
2610        return raw_smp_processor_id();
2611}
2612
2613/* Weak definitions of helper functions in case we don't have bpf syscall. */
2614const struct bpf_func_proto bpf_map_lookup_elem_proto __weak;
2615const struct bpf_func_proto bpf_map_update_elem_proto __weak;
2616const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
2617const struct bpf_func_proto bpf_map_push_elem_proto __weak;
2618const struct bpf_func_proto bpf_map_pop_elem_proto __weak;
2619const struct bpf_func_proto bpf_map_peek_elem_proto __weak;
2620const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto __weak;
2621const struct bpf_func_proto bpf_spin_lock_proto __weak;
2622const struct bpf_func_proto bpf_spin_unlock_proto __weak;
2623const struct bpf_func_proto bpf_jiffies64_proto __weak;
2624
2625const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
2626const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
2627const struct bpf_func_proto bpf_get_numa_node_id_proto __weak;
2628const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
2629const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak;
2630const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak;
2631
2632const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
2633const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
2634const struct bpf_func_proto bpf_get_current_comm_proto __weak;
2635const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
2636const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto __weak;
2637const struct bpf_func_proto bpf_get_local_storage_proto __weak;
2638const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak;
2639const struct bpf_func_proto bpf_snprintf_btf_proto __weak;
2640const struct bpf_func_proto bpf_seq_printf_btf_proto __weak;
2641const struct bpf_func_proto bpf_set_retval_proto __weak;
2642const struct bpf_func_proto bpf_get_retval_proto __weak;
2643
2644const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
2645{
2646        return NULL;
2647}
2648
2649const struct bpf_func_proto * __weak bpf_get_trace_vprintk_proto(void)
2650{
2651        return NULL;
2652}
2653
2654u64 __weak
2655bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
2656                 void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
2657{
2658        return -ENOTSUPP;
2659}
2660EXPORT_SYMBOL_GPL(bpf_event_output);
2661
2662/* Always built-in helper functions. */
2663const struct bpf_func_proto bpf_tail_call_proto = {
2664        .func           = NULL,
2665        .gpl_only       = false,
2666        .ret_type       = RET_VOID,
2667        .arg1_type      = ARG_PTR_TO_CTX,
2668        .arg2_type      = ARG_CONST_MAP_PTR,
2669        .arg3_type      = ARG_ANYTHING,
2670};
2671
2672/* Stub for JITs that only support cBPF. eBPF programs are interpreted.
2673 * It is encouraged to implement bpf_int_jit_compile() instead, so that
2674 * eBPF and implicitly also cBPF can get JITed!
2675 */
2676struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog)
2677{
2678        return prog;
2679}
2680
2681/* Stub for JITs that support eBPF. All cBPF code gets transformed into
2682 * eBPF by the kernel and is later compiled by bpf_int_jit_compile().
2683 */
2684void __weak bpf_jit_compile(struct bpf_prog *prog)
2685{
2686}
2687
2688bool __weak bpf_helper_changes_pkt_data(void *func)
2689{
2690        return false;
2691}
2692
2693/* Return TRUE if the JIT backend wants the verifier to enable sub-register usage
2694 * analysis code and wants explicit zero extension inserted by the verifier.
2695 * Otherwise, return FALSE.
2696 *
2697 * The verifier inserts an explicit zero extension after BPF_CMPXCHGs even if
2698 * you don't override this. JITs that don't want these extra insns can detect
2699 * them using insn_is_zext.
2700 */
2701bool __weak bpf_jit_needs_zext(void)
2702{
2703        return false;
2704}
2705
2706/* Return TRUE if the JIT backend supports mixing bpf2bpf and tailcalls. */
2707bool __weak bpf_jit_supports_subprog_tailcalls(void)
2708{
2709        return false;
2710}
2711
2712bool __weak bpf_jit_supports_kfunc_call(void)
2713{
2714        return false;
2715}
2716
2717/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
2718 * skb_copy_bits(), so provide a weak definition of it for NET-less config.
2719 */
2720int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
2721                         int len)
2722{
2723        return -EFAULT;
2724}
2725
2726int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
2727                              void *addr1, void *addr2)
2728{
2729        return -ENOTSUPP;
2730}
2731
2732void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len)
2733{
2734        return ERR_PTR(-ENOTSUPP);
2735}
2736
2737int __weak bpf_arch_text_invalidate(void *dst, size_t len)
2738{
2739        return -ENOTSUPP;
2740}
2741
2742DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
2743EXPORT_SYMBOL(bpf_stats_enabled_key);
2744
2745/* All definitions of tracepoints related to BPF. */
2746#define CREATE_TRACE_POINTS
2747#include <linux/bpf_trace.h>
2748
2749EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
2750EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx);
2751