linux/net/core/filter.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * Linux Socket Filter - Kernel level socket filtering
   4 *
   5 * Based on the design of the Berkeley Packet Filter. The new
   6 * internal format has been designed by PLUMgrid:
   7 *
   8 *      Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
   9 *
  10 * Authors:
  11 *
  12 *      Jay Schulist <jschlst@samba.org>
  13 *      Alexei Starovoitov <ast@plumgrid.com>
  14 *      Daniel Borkmann <dborkman@redhat.com>
  15 *
  16 * Andi Kleen - Fix a few bad bugs and races.
  17 * Kris Katterjohn - Added many additional checks in bpf_check_classic()
  18 */
  19
  20#include <linux/atomic.h>
  21#include <linux/module.h>
  22#include <linux/types.h>
  23#include <linux/mm.h>
  24#include <linux/fcntl.h>
  25#include <linux/socket.h>
  26#include <linux/sock_diag.h>
  27#include <linux/in.h>
  28#include <linux/inet.h>
  29#include <linux/netdevice.h>
  30#include <linux/if_packet.h>
  31#include <linux/if_arp.h>
  32#include <linux/gfp.h>
  33#include <net/inet_common.h>
  34#include <net/ip.h>
  35#include <net/protocol.h>
  36#include <net/netlink.h>
  37#include <linux/skbuff.h>
  38#include <linux/skmsg.h>
  39#include <net/sock.h>
  40#include <net/flow_dissector.h>
  41#include <linux/errno.h>
  42#include <linux/timer.h>
  43#include <linux/uaccess.h>
  44#include <asm/unaligned.h>
  45#include <linux/filter.h>
  46#include <linux/ratelimit.h>
  47#include <linux/seccomp.h>
  48#include <linux/if_vlan.h>
  49#include <linux/bpf.h>
  50#include <linux/btf.h>
  51#include <net/sch_generic.h>
  52#include <net/cls_cgroup.h>
  53#include <net/dst_metadata.h>
  54#include <net/dst.h>
  55#include <net/sock_reuseport.h>
  56#include <net/busy_poll.h>
  57#include <net/tcp.h>
  58#include <net/xfrm.h>
  59#include <net/udp.h>
  60#include <linux/bpf_trace.h>
  61#include <net/xdp_sock.h>
  62#include <linux/inetdevice.h>
  63#include <net/inet_hashtables.h>
  64#include <net/inet6_hashtables.h>
  65#include <net/ip_fib.h>
  66#include <net/nexthop.h>
  67#include <net/flow.h>
  68#include <net/arp.h>
  69#include <net/ipv6.h>
  70#include <net/net_namespace.h>
  71#include <linux/seg6_local.h>
  72#include <net/seg6.h>
  73#include <net/seg6_local.h>
  74#include <net/lwtunnel.h>
  75#include <net/ipv6_stubs.h>
  76#include <net/bpf_sk_storage.h>
  77#include <net/transp_v6.h>
  78#include <linux/btf_ids.h>
  79#include <net/tls.h>
  80
  81static const struct bpf_func_proto *
  82bpf_sk_base_func_proto(enum bpf_func_id func_id);
  83
  84int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len)
  85{
  86        if (in_compat_syscall()) {
  87                struct compat_sock_fprog f32;
  88
  89                if (len != sizeof(f32))
  90                        return -EINVAL;
  91                if (copy_from_sockptr(&f32, src, sizeof(f32)))
  92                        return -EFAULT;
  93                memset(dst, 0, sizeof(*dst));
  94                dst->len = f32.len;
  95                dst->filter = compat_ptr(f32.filter);
  96        } else {
  97                if (len != sizeof(*dst))
  98                        return -EINVAL;
  99                if (copy_from_sockptr(dst, src, sizeof(*dst)))
 100                        return -EFAULT;
 101        }
 102
 103        return 0;
 104}
 105EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user);
 106
 107/**
 108 *      sk_filter_trim_cap - run a packet through a socket filter
 109 *      @sk: sock associated with &sk_buff
 110 *      @skb: buffer to filter
 111 *      @cap: limit on how short the eBPF program may trim the packet
 112 *
 113 * Run the eBPF program and then cut skb->data to correct size returned by
 114 * the program. If pkt_len is 0 we toss packet. If skb->len is smaller
 115 * than pkt_len we keep whole skb->data. This is the socket level
 116 * wrapper to BPF_PROG_RUN. It returns 0 if the packet should
 117 * be accepted or -EPERM if the packet should be tossed.
 118 *
 119 */
 120int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
 121{
 122        int err;
 123        struct sk_filter *filter;
 124
 125        /*
 126         * If the skb was allocated from pfmemalloc reserves, only
 127         * allow SOCK_MEMALLOC sockets to use it as this socket is
 128         * helping free memory
 129         */
 130        if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) {
 131                NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
 132                return -ENOMEM;
 133        }
 134        err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb);
 135        if (err)
 136                return err;
 137
 138        err = security_sock_rcv_skb(sk, skb);
 139        if (err)
 140                return err;
 141
 142        rcu_read_lock();
 143        filter = rcu_dereference(sk->sk_filter);
 144        if (filter) {
 145                struct sock *save_sk = skb->sk;
 146                unsigned int pkt_len;
 147
 148                skb->sk = sk;
 149                pkt_len = bpf_prog_run_save_cb(filter->prog, skb);
 150                skb->sk = save_sk;
 151                err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM;
 152        }
 153        rcu_read_unlock();
 154
 155        return err;
 156}
 157EXPORT_SYMBOL(sk_filter_trim_cap);
 158
 159BPF_CALL_1(bpf_skb_get_pay_offset, struct sk_buff *, skb)
 160{
 161        return skb_get_poff(skb);
 162}
 163
 164BPF_CALL_3(bpf_skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
 165{
 166        struct nlattr *nla;
 167
 168        if (skb_is_nonlinear(skb))
 169                return 0;
 170
 171        if (skb->len < sizeof(struct nlattr))
 172                return 0;
 173
 174        if (a > skb->len - sizeof(struct nlattr))
 175                return 0;
 176
 177        nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x);
 178        if (nla)
 179                return (void *) nla - (void *) skb->data;
 180
 181        return 0;
 182}
 183
 184BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
 185{
 186        struct nlattr *nla;
 187
 188        if (skb_is_nonlinear(skb))
 189                return 0;
 190
 191        if (skb->len < sizeof(struct nlattr))
 192                return 0;
 193
 194        if (a > skb->len - sizeof(struct nlattr))
 195                return 0;
 196
 197        nla = (struct nlattr *) &skb->data[a];
 198        if (nla->nla_len > skb->len - a)
 199                return 0;
 200
 201        nla = nla_find_nested(nla, x);
 202        if (nla)
 203                return (void *) nla - (void *) skb->data;
 204
 205        return 0;
 206}
 207
 208BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *,
 209           data, int, headlen, int, offset)
 210{
 211        u8 tmp, *ptr;
 212        const int len = sizeof(tmp);
 213
 214        if (offset >= 0) {
 215                if (headlen - offset >= len)
 216                        return *(u8 *)(data + offset);
 217                if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
 218                        return tmp;
 219        } else {
 220                ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
 221                if (likely(ptr))
 222                        return *(u8 *)ptr;
 223        }
 224
 225        return -EFAULT;
 226}
 227
 228BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb,
 229           int, offset)
 230{
 231        return ____bpf_skb_load_helper_8(skb, skb->data, skb->len - skb->data_len,
 232                                         offset);
 233}
 234
 235BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *,
 236           data, int, headlen, int, offset)
 237{
 238        u16 tmp, *ptr;
 239        const int len = sizeof(tmp);
 240
 241        if (offset >= 0) {
 242                if (headlen - offset >= len)
 243                        return get_unaligned_be16(data + offset);
 244                if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
 245                        return be16_to_cpu(tmp);
 246        } else {
 247                ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
 248                if (likely(ptr))
 249                        return get_unaligned_be16(ptr);
 250        }
 251
 252        return -EFAULT;
 253}
 254
 255BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb,
 256           int, offset)
 257{
 258        return ____bpf_skb_load_helper_16(skb, skb->data, skb->len - skb->data_len,
 259                                          offset);
 260}
 261
 262BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *,
 263           data, int, headlen, int, offset)
 264{
 265        u32 tmp, *ptr;
 266        const int len = sizeof(tmp);
 267
 268        if (likely(offset >= 0)) {
 269                if (headlen - offset >= len)
 270                        return get_unaligned_be32(data + offset);
 271                if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
 272                        return be32_to_cpu(tmp);
 273        } else {
 274                ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
 275                if (likely(ptr))
 276                        return get_unaligned_be32(ptr);
 277        }
 278
 279        return -EFAULT;
 280}
 281
 282BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb,
 283           int, offset)
 284{
 285        return ____bpf_skb_load_helper_32(skb, skb->data, skb->len - skb->data_len,
 286                                          offset);
 287}
 288
 289static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
 290                              struct bpf_insn *insn_buf)
 291{
 292        struct bpf_insn *insn = insn_buf;
 293
 294        switch (skb_field) {
 295        case SKF_AD_MARK:
 296                BUILD_BUG_ON(sizeof_field(struct sk_buff, mark) != 4);
 297
 298                *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
 299                                      offsetof(struct sk_buff, mark));
 300                break;
 301
 302        case SKF_AD_PKTTYPE:
 303                *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET());
 304                *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX);
 305#ifdef __BIG_ENDIAN_BITFIELD
 306                *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5);
 307#endif
 308                break;
 309
 310        case SKF_AD_QUEUE:
 311                BUILD_BUG_ON(sizeof_field(struct sk_buff, queue_mapping) != 2);
 312
 313                *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
 314                                      offsetof(struct sk_buff, queue_mapping));
 315                break;
 316
 317        case SKF_AD_VLAN_TAG:
 318                BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_tci) != 2);
 319
 320                /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */
 321                *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
 322                                      offsetof(struct sk_buff, vlan_tci));
 323                break;
 324        case SKF_AD_VLAN_TAG_PRESENT:
 325                *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_VLAN_PRESENT_OFFSET());
 326                if (PKT_VLAN_PRESENT_BIT)
 327                        *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, PKT_VLAN_PRESENT_BIT);
 328                if (PKT_VLAN_PRESENT_BIT < 7)
 329                        *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1);
 330                break;
 331        }
 332
 333        return insn - insn_buf;
 334}
 335
 336static bool convert_bpf_extensions(struct sock_filter *fp,
 337                                   struct bpf_insn **insnp)
 338{
 339        struct bpf_insn *insn = *insnp;
 340        u32 cnt;
 341
 342        switch (fp->k) {
 343        case SKF_AD_OFF + SKF_AD_PROTOCOL:
 344                BUILD_BUG_ON(sizeof_field(struct sk_buff, protocol) != 2);
 345
 346                /* A = *(u16 *) (CTX + offsetof(protocol)) */
 347                *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
 348                                      offsetof(struct sk_buff, protocol));
 349                /* A = ntohs(A) [emitting a nop or swap16] */
 350                *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
 351                break;
 352
 353        case SKF_AD_OFF + SKF_AD_PKTTYPE:
 354                cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn);
 355                insn += cnt - 1;
 356                break;
 357
 358        case SKF_AD_OFF + SKF_AD_IFINDEX:
 359        case SKF_AD_OFF + SKF_AD_HATYPE:
 360                BUILD_BUG_ON(sizeof_field(struct net_device, ifindex) != 4);
 361                BUILD_BUG_ON(sizeof_field(struct net_device, type) != 2);
 362
 363                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
 364                                      BPF_REG_TMP, BPF_REG_CTX,
 365                                      offsetof(struct sk_buff, dev));
 366                /* if (tmp != 0) goto pc + 1 */
 367                *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1);
 368                *insn++ = BPF_EXIT_INSN();
 369                if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX)
 370                        *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP,
 371                                            offsetof(struct net_device, ifindex));
 372                else
 373                        *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP,
 374                                            offsetof(struct net_device, type));
 375                break;
 376
 377        case SKF_AD_OFF + SKF_AD_MARK:
 378                cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn);
 379                insn += cnt - 1;
 380                break;
 381
 382        case SKF_AD_OFF + SKF_AD_RXHASH:
 383                BUILD_BUG_ON(sizeof_field(struct sk_buff, hash) != 4);
 384
 385                *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
 386                                    offsetof(struct sk_buff, hash));
 387                break;
 388
 389        case SKF_AD_OFF + SKF_AD_QUEUE:
 390                cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn);
 391                insn += cnt - 1;
 392                break;
 393
 394        case SKF_AD_OFF + SKF_AD_VLAN_TAG:
 395                cnt = convert_skb_access(SKF_AD_VLAN_TAG,
 396                                         BPF_REG_A, BPF_REG_CTX, insn);
 397                insn += cnt - 1;
 398                break;
 399
 400        case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT:
 401                cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
 402                                         BPF_REG_A, BPF_REG_CTX, insn);
 403                insn += cnt - 1;
 404                break;
 405
 406        case SKF_AD_OFF + SKF_AD_VLAN_TPID:
 407                BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_proto) != 2);
 408
 409                /* A = *(u16 *) (CTX + offsetof(vlan_proto)) */
 410                *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
 411                                      offsetof(struct sk_buff, vlan_proto));
 412                /* A = ntohs(A) [emitting a nop or swap16] */
 413                *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
 414                break;
 415
 416        case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
 417        case SKF_AD_OFF + SKF_AD_NLATTR:
 418        case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
 419        case SKF_AD_OFF + SKF_AD_CPU:
 420        case SKF_AD_OFF + SKF_AD_RANDOM:
 421                /* arg1 = CTX */
 422                *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
 423                /* arg2 = A */
 424                *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A);
 425                /* arg3 = X */
 426                *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X);
 427                /* Emit call(arg1=CTX, arg2=A, arg3=X) */
 428                switch (fp->k) {
 429                case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
 430                        *insn = BPF_EMIT_CALL(bpf_skb_get_pay_offset);
 431                        break;
 432                case SKF_AD_OFF + SKF_AD_NLATTR:
 433                        *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr);
 434                        break;
 435                case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
 436                        *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr_nest);
 437                        break;
 438                case SKF_AD_OFF + SKF_AD_CPU:
 439                        *insn = BPF_EMIT_CALL(bpf_get_raw_cpu_id);
 440                        break;
 441                case SKF_AD_OFF + SKF_AD_RANDOM:
 442                        *insn = BPF_EMIT_CALL(bpf_user_rnd_u32);
 443                        bpf_user_rnd_init_once();
 444                        break;
 445                }
 446                break;
 447
 448        case SKF_AD_OFF + SKF_AD_ALU_XOR_X:
 449                /* A ^= X */
 450                *insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X);
 451                break;
 452
 453        default:
 454                /* This is just a dummy call to avoid letting the compiler
 455                 * evict __bpf_call_base() as an optimization. Placed here
 456                 * where no-one bothers.
 457                 */
 458                BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0);
 459                return false;
 460        }
 461
 462        *insnp = insn;
 463        return true;
 464}
 465
 466static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp)
 467{
 468        const bool unaligned_ok = IS_BUILTIN(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS);
 469        int size = bpf_size_to_bytes(BPF_SIZE(fp->code));
 470        bool endian = BPF_SIZE(fp->code) == BPF_H ||
 471                      BPF_SIZE(fp->code) == BPF_W;
 472        bool indirect = BPF_MODE(fp->code) == BPF_IND;
 473        const int ip_align = NET_IP_ALIGN;
 474        struct bpf_insn *insn = *insnp;
 475        int offset = fp->k;
 476
 477        if (!indirect &&
 478            ((unaligned_ok && offset >= 0) ||
 479             (!unaligned_ok && offset >= 0 &&
 480              offset + ip_align >= 0 &&
 481              offset + ip_align % size == 0))) {
 482                bool ldx_off_ok = offset <= S16_MAX;
 483
 484                *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H);
 485                if (offset)
 486                        *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset);
 487                *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP,
 488                                      size, 2 + endian + (!ldx_off_ok * 2));
 489                if (ldx_off_ok) {
 490                        *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A,
 491                                              BPF_REG_D, offset);
 492                } else {
 493                        *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_D);
 494                        *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_TMP, offset);
 495                        *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A,
 496                                              BPF_REG_TMP, 0);
 497                }
 498                if (endian)
 499                        *insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8);
 500                *insn++ = BPF_JMP_A(8);
 501        }
 502
 503        *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
 504        *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_D);
 505        *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_H);
 506        if (!indirect) {
 507                *insn++ = BPF_MOV64_IMM(BPF_REG_ARG4, offset);
 508        } else {
 509                *insn++ = BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_X);
 510                if (fp->k)
 511                        *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG4, offset);
 512        }
 513
 514        switch (BPF_SIZE(fp->code)) {
 515        case BPF_B:
 516                *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8);
 517                break;
 518        case BPF_H:
 519                *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16);
 520                break;
 521        case BPF_W:
 522                *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32);
 523                break;
 524        default:
 525                return false;
 526        }
 527
 528        *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_A, 0, 2);
 529        *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
 530        *insn   = BPF_EXIT_INSN();
 531
 532        *insnp = insn;
 533        return true;
 534}
 535
 536/**
 537 *      bpf_convert_filter - convert filter program
 538 *      @prog: the user passed filter program
 539 *      @len: the length of the user passed filter program
 540 *      @new_prog: allocated 'struct bpf_prog' or NULL
 541 *      @new_len: pointer to store length of converted program
 542 *      @seen_ld_abs: bool whether we've seen ld_abs/ind
 543 *
 544 * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn'
 545 * style extended BPF (eBPF).
 546 * Conversion workflow:
 547 *
 548 * 1) First pass for calculating the new program length:
 549 *   bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs)
 550 *
 551 * 2) 2nd pass to remap in two passes: 1st pass finds new
 552 *    jump offsets, 2nd pass remapping:
 553 *   bpf_convert_filter(old_prog, old_len, new_prog, &new_len, &seen_ld_abs)
 554 */
 555static int bpf_convert_filter(struct sock_filter *prog, int len,
 556                              struct bpf_prog *new_prog, int *new_len,
 557                              bool *seen_ld_abs)
 558{
 559        int new_flen = 0, pass = 0, target, i, stack_off;
 560        struct bpf_insn *new_insn, *first_insn = NULL;
 561        struct sock_filter *fp;
 562        int *addrs = NULL;
 563        u8 bpf_src;
 564
 565        BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK);
 566        BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
 567
 568        if (len <= 0 || len > BPF_MAXINSNS)
 569                return -EINVAL;
 570
 571        if (new_prog) {
 572                first_insn = new_prog->insnsi;
 573                addrs = kcalloc(len, sizeof(*addrs),
 574                                GFP_KERNEL | __GFP_NOWARN);
 575                if (!addrs)
 576                        return -ENOMEM;
 577        }
 578
 579do_pass:
 580        new_insn = first_insn;
 581        fp = prog;
 582
 583        /* Classic BPF related prologue emission. */
 584        if (new_prog) {
 585                /* Classic BPF expects A and X to be reset first. These need
 586                 * to be guaranteed to be the first two instructions.
 587                 */
 588                *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
 589                *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_X, BPF_REG_X);
 590
 591                /* All programs must keep CTX in callee saved BPF_REG_CTX.
 592                 * In eBPF case it's done by the compiler, here we need to
 593                 * do this ourself. Initial CTX is present in BPF_REG_ARG1.
 594                 */
 595                *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
 596                if (*seen_ld_abs) {
 597                        /* For packet access in classic BPF, cache skb->data
 598                         * in callee-saved BPF R8 and skb->len - skb->data_len
 599                         * (headlen) in BPF R9. Since classic BPF is read-only
 600                         * on CTX, we only need to cache it once.
 601                         */
 602                        *new_insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
 603                                                  BPF_REG_D, BPF_REG_CTX,
 604                                                  offsetof(struct sk_buff, data));
 605                        *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_H, BPF_REG_CTX,
 606                                                  offsetof(struct sk_buff, len));
 607                        *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_TMP, BPF_REG_CTX,
 608                                                  offsetof(struct sk_buff, data_len));
 609                        *new_insn++ = BPF_ALU32_REG(BPF_SUB, BPF_REG_H, BPF_REG_TMP);
 610                }
 611        } else {
 612                new_insn += 3;
 613        }
 614
 615        for (i = 0; i < len; fp++, i++) {
 616                struct bpf_insn tmp_insns[32] = { };
 617                struct bpf_insn *insn = tmp_insns;
 618
 619                if (addrs)
 620                        addrs[i] = new_insn - first_insn;
 621
 622                switch (fp->code) {
 623                /* All arithmetic insns and skb loads map as-is. */
 624                case BPF_ALU | BPF_ADD | BPF_X:
 625                case BPF_ALU | BPF_ADD | BPF_K:
 626                case BPF_ALU | BPF_SUB | BPF_X:
 627                case BPF_ALU | BPF_SUB | BPF_K:
 628                case BPF_ALU | BPF_AND | BPF_X:
 629                case BPF_ALU | BPF_AND | BPF_K:
 630                case BPF_ALU | BPF_OR | BPF_X:
 631                case BPF_ALU | BPF_OR | BPF_K:
 632                case BPF_ALU | BPF_LSH | BPF_X:
 633                case BPF_ALU | BPF_LSH | BPF_K:
 634                case BPF_ALU | BPF_RSH | BPF_X:
 635                case BPF_ALU | BPF_RSH | BPF_K:
 636                case BPF_ALU | BPF_XOR | BPF_X:
 637                case BPF_ALU | BPF_XOR | BPF_K:
 638                case BPF_ALU | BPF_MUL | BPF_X:
 639                case BPF_ALU | BPF_MUL | BPF_K:
 640                case BPF_ALU | BPF_DIV | BPF_X:
 641                case BPF_ALU | BPF_DIV | BPF_K:
 642                case BPF_ALU | BPF_MOD | BPF_X:
 643                case BPF_ALU | BPF_MOD | BPF_K:
 644                case BPF_ALU | BPF_NEG:
 645                case BPF_LD | BPF_ABS | BPF_W:
 646                case BPF_LD | BPF_ABS | BPF_H:
 647                case BPF_LD | BPF_ABS | BPF_B:
 648                case BPF_LD | BPF_IND | BPF_W:
 649                case BPF_LD | BPF_IND | BPF_H:
 650                case BPF_LD | BPF_IND | BPF_B:
 651                        /* Check for overloaded BPF extension and
 652                         * directly convert it if found, otherwise
 653                         * just move on with mapping.
 654                         */
 655                        if (BPF_CLASS(fp->code) == BPF_LD &&
 656                            BPF_MODE(fp->code) == BPF_ABS &&
 657                            convert_bpf_extensions(fp, &insn))
 658                                break;
 659                        if (BPF_CLASS(fp->code) == BPF_LD &&
 660                            convert_bpf_ld_abs(fp, &insn)) {
 661                                *seen_ld_abs = true;
 662                                break;
 663                        }
 664
 665                        if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) ||
 666                            fp->code == (BPF_ALU | BPF_MOD | BPF_X)) {
 667                                *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X);
 668                                /* Error with exception code on div/mod by 0.
 669                                 * For cBPF programs, this was always return 0.
 670                                 */
 671                                *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_X, 0, 2);
 672                                *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
 673                                *insn++ = BPF_EXIT_INSN();
 674                        }
 675
 676                        *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k);
 677                        break;
 678
 679                /* Jump transformation cannot use BPF block macros
 680                 * everywhere as offset calculation and target updates
 681                 * require a bit more work than the rest, i.e. jump
 682                 * opcodes map as-is, but offsets need adjustment.
 683                 */
 684
 685#define BPF_EMIT_JMP                                                    \
 686        do {                                                            \
 687                const s32 off_min = S16_MIN, off_max = S16_MAX;         \
 688                s32 off;                                                \
 689                                                                        \
 690                if (target >= len || target < 0)                        \
 691                        goto err;                                       \
 692                off = addrs ? addrs[target] - addrs[i] - 1 : 0;         \
 693                /* Adjust pc relative offset for 2nd or 3rd insn. */    \
 694                off -= insn - tmp_insns;                                \
 695                /* Reject anything not fitting into insn->off. */       \
 696                if (off < off_min || off > off_max)                     \
 697                        goto err;                                       \
 698                insn->off = off;                                        \
 699        } while (0)
 700
 701                case BPF_JMP | BPF_JA:
 702                        target = i + fp->k + 1;
 703                        insn->code = fp->code;
 704                        BPF_EMIT_JMP;
 705                        break;
 706
 707                case BPF_JMP | BPF_JEQ | BPF_K:
 708                case BPF_JMP | BPF_JEQ | BPF_X:
 709                case BPF_JMP | BPF_JSET | BPF_K:
 710                case BPF_JMP | BPF_JSET | BPF_X:
 711                case BPF_JMP | BPF_JGT | BPF_K:
 712                case BPF_JMP | BPF_JGT | BPF_X:
 713                case BPF_JMP | BPF_JGE | BPF_K:
 714                case BPF_JMP | BPF_JGE | BPF_X:
 715                        if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) {
 716                                /* BPF immediates are signed, zero extend
 717                                 * immediate into tmp register and use it
 718                                 * in compare insn.
 719                                 */
 720                                *insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k);
 721
 722                                insn->dst_reg = BPF_REG_A;
 723                                insn->src_reg = BPF_REG_TMP;
 724                                bpf_src = BPF_X;
 725                        } else {
 726                                insn->dst_reg = BPF_REG_A;
 727                                insn->imm = fp->k;
 728                                bpf_src = BPF_SRC(fp->code);
 729                                insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0;
 730                        }
 731
 732                        /* Common case where 'jump_false' is next insn. */
 733                        if (fp->jf == 0) {
 734                                insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
 735                                target = i + fp->jt + 1;
 736                                BPF_EMIT_JMP;
 737                                break;
 738                        }
 739
 740                        /* Convert some jumps when 'jump_true' is next insn. */
 741                        if (fp->jt == 0) {
 742                                switch (BPF_OP(fp->code)) {
 743                                case BPF_JEQ:
 744                                        insn->code = BPF_JMP | BPF_JNE | bpf_src;
 745                                        break;
 746                                case BPF_JGT:
 747                                        insn->code = BPF_JMP | BPF_JLE | bpf_src;
 748                                        break;
 749                                case BPF_JGE:
 750                                        insn->code = BPF_JMP | BPF_JLT | bpf_src;
 751                                        break;
 752                                default:
 753                                        goto jmp_rest;
 754                                }
 755
 756                                target = i + fp->jf + 1;
 757                                BPF_EMIT_JMP;
 758                                break;
 759                        }
 760jmp_rest:
 761                        /* Other jumps are mapped into two insns: Jxx and JA. */
 762                        target = i + fp->jt + 1;
 763                        insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
 764                        BPF_EMIT_JMP;
 765                        insn++;
 766
 767                        insn->code = BPF_JMP | BPF_JA;
 768                        target = i + fp->jf + 1;
 769                        BPF_EMIT_JMP;
 770                        break;
 771
 772                /* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */
 773                case BPF_LDX | BPF_MSH | BPF_B: {
 774                        struct sock_filter tmp = {
 775                                .code   = BPF_LD | BPF_ABS | BPF_B,
 776                                .k      = fp->k,
 777                        };
 778
 779                        *seen_ld_abs = true;
 780
 781                        /* X = A */
 782                        *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
 783                        /* A = BPF_R0 = *(u8 *) (skb->data + K) */
 784                        convert_bpf_ld_abs(&tmp, &insn);
 785                        insn++;
 786                        /* A &= 0xf */
 787                        *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf);
 788                        /* A <<= 2 */
 789                        *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2);
 790                        /* tmp = X */
 791                        *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_X);
 792                        /* X = A */
 793                        *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
 794                        /* A = tmp */
 795                        *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
 796                        break;
 797                }
 798                /* RET_K is remaped into 2 insns. RET_A case doesn't need an
 799                 * extra mov as BPF_REG_0 is already mapped into BPF_REG_A.
 800                 */
 801                case BPF_RET | BPF_A:
 802                case BPF_RET | BPF_K:
 803                        if (BPF_RVAL(fp->code) == BPF_K)
 804                                *insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0,
 805                                                        0, fp->k);
 806                        *insn = BPF_EXIT_INSN();
 807                        break;
 808
 809                /* Store to stack. */
 810                case BPF_ST:
 811                case BPF_STX:
 812                        stack_off = fp->k * 4  + 4;
 813                        *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) ==
 814                                            BPF_ST ? BPF_REG_A : BPF_REG_X,
 815                                            -stack_off);
 816                        /* check_load_and_stores() verifies that classic BPF can
 817                         * load from stack only after write, so tracking
 818                         * stack_depth for ST|STX insns is enough
 819                         */
 820                        if (new_prog && new_prog->aux->stack_depth < stack_off)
 821                                new_prog->aux->stack_depth = stack_off;
 822                        break;
 823
 824                /* Load from stack. */
 825                case BPF_LD | BPF_MEM:
 826                case BPF_LDX | BPF_MEM:
 827                        stack_off = fp->k * 4  + 4;
 828                        *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD  ?
 829                                            BPF_REG_A : BPF_REG_X, BPF_REG_FP,
 830                                            -stack_off);
 831                        break;
 832
 833                /* A = K or X = K */
 834                case BPF_LD | BPF_IMM:
 835                case BPF_LDX | BPF_IMM:
 836                        *insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ?
 837                                              BPF_REG_A : BPF_REG_X, fp->k);
 838                        break;
 839
 840                /* X = A */
 841                case BPF_MISC | BPF_TAX:
 842                        *insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
 843                        break;
 844
 845                /* A = X */
 846                case BPF_MISC | BPF_TXA:
 847                        *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X);
 848                        break;
 849
 850                /* A = skb->len or X = skb->len */
 851                case BPF_LD | BPF_W | BPF_LEN:
 852                case BPF_LDX | BPF_W | BPF_LEN:
 853                        *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
 854                                            BPF_REG_A : BPF_REG_X, BPF_REG_CTX,
 855                                            offsetof(struct sk_buff, len));
 856                        break;
 857
 858                /* Access seccomp_data fields. */
 859                case BPF_LDX | BPF_ABS | BPF_W:
 860                        /* A = *(u32 *) (ctx + K) */
 861                        *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k);
 862                        break;
 863
 864                /* Unknown instruction. */
 865                default:
 866                        goto err;
 867                }
 868
 869                insn++;
 870                if (new_prog)
 871                        memcpy(new_insn, tmp_insns,
 872                               sizeof(*insn) * (insn - tmp_insns));
 873                new_insn += insn - tmp_insns;
 874        }
 875
 876        if (!new_prog) {
 877                /* Only calculating new length. */
 878                *new_len = new_insn - first_insn;
 879                if (*seen_ld_abs)
 880                        *new_len += 4; /* Prologue bits. */
 881                return 0;
 882        }
 883
 884        pass++;
 885        if (new_flen != new_insn - first_insn) {
 886                new_flen = new_insn - first_insn;
 887                if (pass > 2)
 888                        goto err;
 889                goto do_pass;
 890        }
 891
 892        kfree(addrs);
 893        BUG_ON(*new_len != new_flen);
 894        return 0;
 895err:
 896        kfree(addrs);
 897        return -EINVAL;
 898}
 899
 900/* Security:
 901 *
 902 * As we dont want to clear mem[] array for each packet going through
 903 * __bpf_prog_run(), we check that filter loaded by user never try to read
 904 * a cell if not previously written, and we check all branches to be sure
 905 * a malicious user doesn't try to abuse us.
 906 */
 907static int check_load_and_stores(const struct sock_filter *filter, int flen)
 908{
 909        u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */
 910        int pc, ret = 0;
 911
 912        BUILD_BUG_ON(BPF_MEMWORDS > 16);
 913
 914        masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL);
 915        if (!masks)
 916                return -ENOMEM;
 917
 918        memset(masks, 0xff, flen * sizeof(*masks));
 919
 920        for (pc = 0; pc < flen; pc++) {
 921                memvalid &= masks[pc];
 922
 923                switch (filter[pc].code) {
 924                case BPF_ST:
 925                case BPF_STX:
 926                        memvalid |= (1 << filter[pc].k);
 927                        break;
 928                case BPF_LD | BPF_MEM:
 929                case BPF_LDX | BPF_MEM:
 930                        if (!(memvalid & (1 << filter[pc].k))) {
 931                                ret = -EINVAL;
 932                                goto error;
 933                        }
 934                        break;
 935                case BPF_JMP | BPF_JA:
 936                        /* A jump must set masks on target */
 937                        masks[pc + 1 + filter[pc].k] &= memvalid;
 938                        memvalid = ~0;
 939                        break;
 940                case BPF_JMP | BPF_JEQ | BPF_K:
 941                case BPF_JMP | BPF_JEQ | BPF_X:
 942                case BPF_JMP | BPF_JGE | BPF_K:
 943                case BPF_JMP | BPF_JGE | BPF_X:
 944                case BPF_JMP | BPF_JGT | BPF_K:
 945                case BPF_JMP | BPF_JGT | BPF_X:
 946                case BPF_JMP | BPF_JSET | BPF_K:
 947                case BPF_JMP | BPF_JSET | BPF_X:
 948                        /* A jump must set masks on targets */
 949                        masks[pc + 1 + filter[pc].jt] &= memvalid;
 950                        masks[pc + 1 + filter[pc].jf] &= memvalid;
 951                        memvalid = ~0;
 952                        break;
 953                }
 954        }
 955error:
 956        kfree(masks);
 957        return ret;
 958}
 959
 960static bool chk_code_allowed(u16 code_to_probe)
 961{
 962        static const bool codes[] = {
 963                /* 32 bit ALU operations */
 964                [BPF_ALU | BPF_ADD | BPF_K] = true,
 965                [BPF_ALU | BPF_ADD | BPF_X] = true,
 966                [BPF_ALU | BPF_SUB | BPF_K] = true,
 967                [BPF_ALU | BPF_SUB | BPF_X] = true,
 968                [BPF_ALU | BPF_MUL | BPF_K] = true,
 969                [BPF_ALU | BPF_MUL | BPF_X] = true,
 970                [BPF_ALU | BPF_DIV | BPF_K] = true,
 971                [BPF_ALU | BPF_DIV | BPF_X] = true,
 972                [BPF_ALU | BPF_MOD | BPF_K] = true,
 973                [BPF_ALU | BPF_MOD | BPF_X] = true,
 974                [BPF_ALU | BPF_AND | BPF_K] = true,
 975                [BPF_ALU | BPF_AND | BPF_X] = true,
 976                [BPF_ALU | BPF_OR | BPF_K] = true,
 977                [BPF_ALU | BPF_OR | BPF_X] = true,
 978                [BPF_ALU | BPF_XOR | BPF_K] = true,
 979                [BPF_ALU | BPF_XOR | BPF_X] = true,
 980                [BPF_ALU | BPF_LSH | BPF_K] = true,
 981                [BPF_ALU | BPF_LSH | BPF_X] = true,
 982                [BPF_ALU | BPF_RSH | BPF_K] = true,
 983                [BPF_ALU | BPF_RSH | BPF_X] = true,
 984                [BPF_ALU | BPF_NEG] = true,
 985                /* Load instructions */
 986                [BPF_LD | BPF_W | BPF_ABS] = true,
 987                [BPF_LD | BPF_H | BPF_ABS] = true,
 988                [BPF_LD | BPF_B | BPF_ABS] = true,
 989                [BPF_LD | BPF_W | BPF_LEN] = true,
 990                [BPF_LD | BPF_W | BPF_IND] = true,
 991                [BPF_LD | BPF_H | BPF_IND] = true,
 992                [BPF_LD | BPF_B | BPF_IND] = true,
 993                [BPF_LD | BPF_IMM] = true,
 994                [BPF_LD | BPF_MEM] = true,
 995                [BPF_LDX | BPF_W | BPF_LEN] = true,
 996                [BPF_LDX | BPF_B | BPF_MSH] = true,
 997                [BPF_LDX | BPF_IMM] = true,
 998                [BPF_LDX | BPF_MEM] = true,
 999                /* Store instructions */
1000                [BPF_ST] = true,
1001                [BPF_STX] = true,
1002                /* Misc instructions */
1003                [BPF_MISC | BPF_TAX] = true,
1004                [BPF_MISC | BPF_TXA] = true,
1005                /* Return instructions */
1006                [BPF_RET | BPF_K] = true,
1007                [BPF_RET | BPF_A] = true,
1008                /* Jump instructions */
1009                [BPF_JMP | BPF_JA] = true,
1010                [BPF_JMP | BPF_JEQ | BPF_K] = true,
1011                [BPF_JMP | BPF_JEQ | BPF_X] = true,
1012                [BPF_JMP | BPF_JGE | BPF_K] = true,
1013                [BPF_JMP | BPF_JGE | BPF_X] = true,
1014                [BPF_JMP | BPF_JGT | BPF_K] = true,
1015                [BPF_JMP | BPF_JGT | BPF_X] = true,
1016                [BPF_JMP | BPF_JSET | BPF_K] = true,
1017                [BPF_JMP | BPF_JSET | BPF_X] = true,
1018        };
1019
1020        if (code_to_probe >= ARRAY_SIZE(codes))
1021                return false;
1022
1023        return codes[code_to_probe];
1024}
1025
1026static bool bpf_check_basics_ok(const struct sock_filter *filter,
1027                                unsigned int flen)
1028{
1029        if (filter == NULL)
1030                return false;
1031        if (flen == 0 || flen > BPF_MAXINSNS)
1032                return false;
1033
1034        return true;
1035}
1036
1037/**
1038 *      bpf_check_classic - verify socket filter code
1039 *      @filter: filter to verify
1040 *      @flen: length of filter
1041 *
1042 * Check the user's filter code. If we let some ugly
1043 * filter code slip through kaboom! The filter must contain
1044 * no references or jumps that are out of range, no illegal
1045 * instructions, and must end with a RET instruction.
1046 *
1047 * All jumps are forward as they are not signed.
1048 *
1049 * Returns 0 if the rule set is legal or -EINVAL if not.
1050 */
1051static int bpf_check_classic(const struct sock_filter *filter,
1052                             unsigned int flen)
1053{
1054        bool anc_found;
1055        int pc;
1056
1057        /* Check the filter code now */
1058        for (pc = 0; pc < flen; pc++) {
1059                const struct sock_filter *ftest = &filter[pc];
1060
1061                /* May we actually operate on this code? */
1062                if (!chk_code_allowed(ftest->code))
1063                        return -EINVAL;
1064
1065                /* Some instructions need special checks */
1066                switch (ftest->code) {
1067                case BPF_ALU | BPF_DIV | BPF_K:
1068                case BPF_ALU | BPF_MOD | BPF_K:
1069                        /* Check for division by zero */
1070                        if (ftest->k == 0)
1071                                return -EINVAL;
1072                        break;
1073                case BPF_ALU | BPF_LSH | BPF_K:
1074                case BPF_ALU | BPF_RSH | BPF_K:
1075                        if (ftest->k >= 32)
1076                                return -EINVAL;
1077                        break;
1078                case BPF_LD | BPF_MEM:
1079                case BPF_LDX | BPF_MEM:
1080                case BPF_ST:
1081                case BPF_STX:
1082                        /* Check for invalid memory addresses */
1083                        if (ftest->k >= BPF_MEMWORDS)
1084                                return -EINVAL;
1085                        break;
1086                case BPF_JMP | BPF_JA:
1087                        /* Note, the large ftest->k might cause loops.
1088                         * Compare this with conditional jumps below,
1089                         * where offsets are limited. --ANK (981016)
1090                         */
1091                        if (ftest->k >= (unsigned int)(flen - pc - 1))
1092                                return -EINVAL;
1093                        break;
1094                case BPF_JMP | BPF_JEQ | BPF_K:
1095                case BPF_JMP | BPF_JEQ | BPF_X:
1096                case BPF_JMP | BPF_JGE | BPF_K:
1097                case BPF_JMP | BPF_JGE | BPF_X:
1098                case BPF_JMP | BPF_JGT | BPF_K:
1099                case BPF_JMP | BPF_JGT | BPF_X:
1100                case BPF_JMP | BPF_JSET | BPF_K:
1101                case BPF_JMP | BPF_JSET | BPF_X:
1102                        /* Both conditionals must be safe */
1103                        if (pc + ftest->jt + 1 >= flen ||
1104                            pc + ftest->jf + 1 >= flen)
1105                                return -EINVAL;
1106                        break;
1107                case BPF_LD | BPF_W | BPF_ABS:
1108                case BPF_LD | BPF_H | BPF_ABS:
1109                case BPF_LD | BPF_B | BPF_ABS:
1110                        anc_found = false;
1111                        if (bpf_anc_helper(ftest) & BPF_ANC)
1112                                anc_found = true;
1113                        /* Ancillary operation unknown or unsupported */
1114                        if (anc_found == false && ftest->k >= SKF_AD_OFF)
1115                                return -EINVAL;
1116                }
1117        }
1118
1119        /* Last instruction must be a RET code */
1120        switch (filter[flen - 1].code) {
1121        case BPF_RET | BPF_K:
1122        case BPF_RET | BPF_A:
1123                return check_load_and_stores(filter, flen);
1124        }
1125
1126        return -EINVAL;
1127}
1128
1129static int bpf_prog_store_orig_filter(struct bpf_prog *fp,
1130                                      const struct sock_fprog *fprog)
1131{
1132        unsigned int fsize = bpf_classic_proglen(fprog);
1133        struct sock_fprog_kern *fkprog;
1134
1135        fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL);
1136        if (!fp->orig_prog)
1137                return -ENOMEM;
1138
1139        fkprog = fp->orig_prog;
1140        fkprog->len = fprog->len;
1141
1142        fkprog->filter = kmemdup(fp->insns, fsize,
1143                                 GFP_KERNEL | __GFP_NOWARN);
1144        if (!fkprog->filter) {
1145                kfree(fp->orig_prog);
1146                return -ENOMEM;
1147        }
1148
1149        return 0;
1150}
1151
1152static void bpf_release_orig_filter(struct bpf_prog *fp)
1153{
1154        struct sock_fprog_kern *fprog = fp->orig_prog;
1155
1156        if (fprog) {
1157                kfree(fprog->filter);
1158                kfree(fprog);
1159        }
1160}
1161
1162static void __bpf_prog_release(struct bpf_prog *prog)
1163{
1164        if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) {
1165                bpf_prog_put(prog);
1166        } else {
1167                bpf_release_orig_filter(prog);
1168                bpf_prog_free(prog);
1169        }
1170}
1171
1172static void __sk_filter_release(struct sk_filter *fp)
1173{
1174        __bpf_prog_release(fp->prog);
1175        kfree(fp);
1176}
1177
1178/**
1179 *      sk_filter_release_rcu - Release a socket filter by rcu_head
1180 *      @rcu: rcu_head that contains the sk_filter to free
1181 */
1182static void sk_filter_release_rcu(struct rcu_head *rcu)
1183{
1184        struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);
1185
1186        __sk_filter_release(fp);
1187}
1188
1189/**
1190 *      sk_filter_release - release a socket filter
1191 *      @fp: filter to remove
1192 *
1193 *      Remove a filter from a socket and release its resources.
1194 */
1195static void sk_filter_release(struct sk_filter *fp)
1196{
1197        if (refcount_dec_and_test(&fp->refcnt))
1198                call_rcu(&fp->rcu, sk_filter_release_rcu);
1199}
1200
1201void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
1202{
1203        u32 filter_size = bpf_prog_size(fp->prog->len);
1204
1205        atomic_sub(filter_size, &sk->sk_omem_alloc);
1206        sk_filter_release(fp);
1207}
1208
1209/* try to charge the socket memory if there is space available
1210 * return true on success
1211 */
1212static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp)
1213{
1214        u32 filter_size = bpf_prog_size(fp->prog->len);
1215
1216        /* same check as in sock_kmalloc() */
1217        if (filter_size <= sysctl_optmem_max &&
1218            atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) {
1219                atomic_add(filter_size, &sk->sk_omem_alloc);
1220                return true;
1221        }
1222        return false;
1223}
1224
1225bool sk_filter_charge(struct sock *sk, struct sk_filter *fp)
1226{
1227        if (!refcount_inc_not_zero(&fp->refcnt))
1228                return false;
1229
1230        if (!__sk_filter_charge(sk, fp)) {
1231                sk_filter_release(fp);
1232                return false;
1233        }
1234        return true;
1235}
1236
1237static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
1238{
1239        struct sock_filter *old_prog;
1240        struct bpf_prog *old_fp;
1241        int err, new_len, old_len = fp->len;
1242        bool seen_ld_abs = false;
1243
1244        /* We are free to overwrite insns et al right here as it
1245         * won't be used at this point in time anymore internally
1246         * after the migration to the internal BPF instruction
1247         * representation.
1248         */
1249        BUILD_BUG_ON(sizeof(struct sock_filter) !=
1250                     sizeof(struct bpf_insn));
1251
1252        /* Conversion cannot happen on overlapping memory areas,
1253         * so we need to keep the user BPF around until the 2nd
1254         * pass. At this time, the user BPF is stored in fp->insns.
1255         */
1256        old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter),
1257                           GFP_KERNEL | __GFP_NOWARN);
1258        if (!old_prog) {
1259                err = -ENOMEM;
1260                goto out_err;
1261        }
1262
1263        /* 1st pass: calculate the new program length. */
1264        err = bpf_convert_filter(old_prog, old_len, NULL, &new_len,
1265                                 &seen_ld_abs);
1266        if (err)
1267                goto out_err_free;
1268
1269        /* Expand fp for appending the new filter representation. */
1270        old_fp = fp;
1271        fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0);
1272        if (!fp) {
1273                /* The old_fp is still around in case we couldn't
1274                 * allocate new memory, so uncharge on that one.
1275                 */
1276                fp = old_fp;
1277                err = -ENOMEM;
1278                goto out_err_free;
1279        }
1280
1281        fp->len = new_len;
1282
1283        /* 2nd pass: remap sock_filter insns into bpf_insn insns. */
1284        err = bpf_convert_filter(old_prog, old_len, fp, &new_len,
1285                                 &seen_ld_abs);
1286        if (err)
1287                /* 2nd bpf_convert_filter() can fail only if it fails
1288                 * to allocate memory, remapping must succeed. Note,
1289                 * that at this time old_fp has already been released
1290                 * by krealloc().
1291                 */
1292                goto out_err_free;
1293
1294        fp = bpf_prog_select_runtime(fp, &err);
1295        if (err)
1296                goto out_err_free;
1297
1298        kfree(old_prog);
1299        return fp;
1300
1301out_err_free:
1302        kfree(old_prog);
1303out_err:
1304        __bpf_prog_release(fp);
1305        return ERR_PTR(err);
1306}
1307
1308static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp,
1309                                           bpf_aux_classic_check_t trans)
1310{
1311        int err;
1312
1313        fp->bpf_func = NULL;
1314        fp->jited = 0;
1315
1316        err = bpf_check_classic(fp->insns, fp->len);
1317        if (err) {
1318                __bpf_prog_release(fp);
1319                return ERR_PTR(err);
1320        }
1321
1322        /* There might be additional checks and transformations
1323         * needed on classic filters, f.e. in case of seccomp.
1324         */
1325        if (trans) {
1326                err = trans(fp->insns, fp->len);
1327                if (err) {
1328                        __bpf_prog_release(fp);
1329                        return ERR_PTR(err);
1330                }
1331        }
1332
1333        /* Probe if we can JIT compile the filter and if so, do
1334         * the compilation of the filter.
1335         */
1336        bpf_jit_compile(fp);
1337
1338        /* JIT compiler couldn't process this filter, so do the
1339         * internal BPF translation for the optimized interpreter.
1340         */
1341        if (!fp->jited)
1342                fp = bpf_migrate_filter(fp);
1343
1344        return fp;
1345}
1346
1347/**
1348 *      bpf_prog_create - create an unattached filter
1349 *      @pfp: the unattached filter that is created
1350 *      @fprog: the filter program
1351 *
1352 * Create a filter independent of any socket. We first run some
1353 * sanity checks on it to make sure it does not explode on us later.
1354 * If an error occurs or there is insufficient memory for the filter
1355 * a negative errno code is returned. On success the return is zero.
1356 */
1357int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
1358{
1359        unsigned int fsize = bpf_classic_proglen(fprog);
1360        struct bpf_prog *fp;
1361
1362        /* Make sure new filter is there and in the right amounts. */
1363        if (!bpf_check_basics_ok(fprog->filter, fprog->len))
1364                return -EINVAL;
1365
1366        fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
1367        if (!fp)
1368                return -ENOMEM;
1369
1370        memcpy(fp->insns, fprog->filter, fsize);
1371
1372        fp->len = fprog->len;
1373        /* Since unattached filters are not copied back to user
1374         * space through sk_get_filter(), we do not need to hold
1375         * a copy here, and can spare us the work.
1376         */
1377        fp->orig_prog = NULL;
1378
1379        /* bpf_prepare_filter() already takes care of freeing
1380         * memory in case something goes wrong.
1381         */
1382        fp = bpf_prepare_filter(fp, NULL);
1383        if (IS_ERR(fp))
1384                return PTR_ERR(fp);
1385
1386        *pfp = fp;
1387        return 0;
1388}
1389EXPORT_SYMBOL_GPL(bpf_prog_create);
1390
1391/**
1392 *      bpf_prog_create_from_user - create an unattached filter from user buffer
1393 *      @pfp: the unattached filter that is created
1394 *      @fprog: the filter program
1395 *      @trans: post-classic verifier transformation handler
1396 *      @save_orig: save classic BPF program
1397 *
1398 * This function effectively does the same as bpf_prog_create(), only
1399 * that it builds up its insns buffer from user space provided buffer.
1400 * It also allows for passing a bpf_aux_classic_check_t handler.
1401 */
1402int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
1403                              bpf_aux_classic_check_t trans, bool save_orig)
1404{
1405        unsigned int fsize = bpf_classic_proglen(fprog);
1406        struct bpf_prog *fp;
1407        int err;
1408
1409        /* Make sure new filter is there and in the right amounts. */
1410        if (!bpf_check_basics_ok(fprog->filter, fprog->len))
1411                return -EINVAL;
1412
1413        fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
1414        if (!fp)
1415                return -ENOMEM;
1416
1417        if (copy_from_user(fp->insns, fprog->filter, fsize)) {
1418                __bpf_prog_free(fp);
1419                return -EFAULT;
1420        }
1421
1422        fp->len = fprog->len;
1423        fp->orig_prog = NULL;
1424
1425        if (save_orig) {
1426                err = bpf_prog_store_orig_filter(fp, fprog);
1427                if (err) {
1428                        __bpf_prog_free(fp);
1429                        return -ENOMEM;
1430                }
1431        }
1432
1433        /* bpf_prepare_filter() already takes care of freeing
1434         * memory in case something goes wrong.
1435         */
1436        fp = bpf_prepare_filter(fp, trans);
1437        if (IS_ERR(fp))
1438                return PTR_ERR(fp);
1439
1440        *pfp = fp;
1441        return 0;
1442}
1443EXPORT_SYMBOL_GPL(bpf_prog_create_from_user);
1444
1445void bpf_prog_destroy(struct bpf_prog *fp)
1446{
1447        __bpf_prog_release(fp);
1448}
1449EXPORT_SYMBOL_GPL(bpf_prog_destroy);
1450
1451static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
1452{
1453        struct sk_filter *fp, *old_fp;
1454
1455        fp = kmalloc(sizeof(*fp), GFP_KERNEL);
1456        if (!fp)
1457                return -ENOMEM;
1458
1459        fp->prog = prog;
1460
1461        if (!__sk_filter_charge(sk, fp)) {
1462                kfree(fp);
1463                return -ENOMEM;
1464        }
1465        refcount_set(&fp->refcnt, 1);
1466
1467        old_fp = rcu_dereference_protected(sk->sk_filter,
1468                                           lockdep_sock_is_held(sk));
1469        rcu_assign_pointer(sk->sk_filter, fp);
1470
1471        if (old_fp)
1472                sk_filter_uncharge(sk, old_fp);
1473
1474        return 0;
1475}
1476
1477static
1478struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
1479{
1480        unsigned int fsize = bpf_classic_proglen(fprog);
1481        struct bpf_prog *prog;
1482        int err;
1483
1484        if (sock_flag(sk, SOCK_FILTER_LOCKED))
1485                return ERR_PTR(-EPERM);
1486
1487        /* Make sure new filter is there and in the right amounts. */
1488        if (!bpf_check_basics_ok(fprog->filter, fprog->len))
1489                return ERR_PTR(-EINVAL);
1490
1491        prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
1492        if (!prog)
1493                return ERR_PTR(-ENOMEM);
1494
1495        if (copy_from_user(prog->insns, fprog->filter, fsize)) {
1496                __bpf_prog_free(prog);
1497                return ERR_PTR(-EFAULT);
1498        }
1499
1500        prog->len = fprog->len;
1501
1502        err = bpf_prog_store_orig_filter(prog, fprog);
1503        if (err) {
1504                __bpf_prog_free(prog);
1505                return ERR_PTR(-ENOMEM);
1506        }
1507
1508        /* bpf_prepare_filter() already takes care of freeing
1509         * memory in case something goes wrong.
1510         */
1511        return bpf_prepare_filter(prog, NULL);
1512}
1513
1514/**
1515 *      sk_attach_filter - attach a socket filter
1516 *      @fprog: the filter program
1517 *      @sk: the socket to use
1518 *
1519 * Attach the user's filter code. We first run some sanity checks on
1520 * it to make sure it does not explode on us later. If an error
1521 * occurs or there is insufficient memory for the filter a negative
1522 * errno code is returned. On success the return is zero.
1523 */
1524int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
1525{
1526        struct bpf_prog *prog = __get_filter(fprog, sk);
1527        int err;
1528
1529        if (IS_ERR(prog))
1530                return PTR_ERR(prog);
1531
1532        err = __sk_attach_prog(prog, sk);
1533        if (err < 0) {
1534                __bpf_prog_release(prog);
1535                return err;
1536        }
1537
1538        return 0;
1539}
1540EXPORT_SYMBOL_GPL(sk_attach_filter);
1541
1542int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
1543{
1544        struct bpf_prog *prog = __get_filter(fprog, sk);
1545        int err;
1546
1547        if (IS_ERR(prog))
1548                return PTR_ERR(prog);
1549
1550        if (bpf_prog_size(prog->len) > sysctl_optmem_max)
1551                err = -ENOMEM;
1552        else
1553                err = reuseport_attach_prog(sk, prog);
1554
1555        if (err)
1556                __bpf_prog_release(prog);
1557
1558        return err;
1559}
1560
1561static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)
1562{
1563        if (sock_flag(sk, SOCK_FILTER_LOCKED))
1564                return ERR_PTR(-EPERM);
1565
1566        return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
1567}
1568
1569int sk_attach_bpf(u32 ufd, struct sock *sk)
1570{
1571        struct bpf_prog *prog = __get_bpf(ufd, sk);
1572        int err;
1573
1574        if (IS_ERR(prog))
1575                return PTR_ERR(prog);
1576
1577        err = __sk_attach_prog(prog, sk);
1578        if (err < 0) {
1579                bpf_prog_put(prog);
1580                return err;
1581        }
1582
1583        return 0;
1584}
1585
1586int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
1587{
1588        struct bpf_prog *prog;
1589        int err;
1590
1591        if (sock_flag(sk, SOCK_FILTER_LOCKED))
1592                return -EPERM;
1593
1594        prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
1595        if (PTR_ERR(prog) == -EINVAL)
1596                prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SK_REUSEPORT);
1597        if (IS_ERR(prog))
1598                return PTR_ERR(prog);
1599
1600        if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) {
1601                /* Like other non BPF_PROG_TYPE_SOCKET_FILTER
1602                 * bpf prog (e.g. sockmap).  It depends on the
1603                 * limitation imposed by bpf_prog_load().
1604                 * Hence, sysctl_optmem_max is not checked.
1605                 */
1606                if ((sk->sk_type != SOCK_STREAM &&
1607                     sk->sk_type != SOCK_DGRAM) ||
1608                    (sk->sk_protocol != IPPROTO_UDP &&
1609                     sk->sk_protocol != IPPROTO_TCP) ||
1610                    (sk->sk_family != AF_INET &&
1611                     sk->sk_family != AF_INET6)) {
1612                        err = -ENOTSUPP;
1613                        goto err_prog_put;
1614                }
1615        } else {
1616                /* BPF_PROG_TYPE_SOCKET_FILTER */
1617                if (bpf_prog_size(prog->len) > sysctl_optmem_max) {
1618                        err = -ENOMEM;
1619                        goto err_prog_put;
1620                }
1621        }
1622
1623        err = reuseport_attach_prog(sk, prog);
1624err_prog_put:
1625        if (err)
1626                bpf_prog_put(prog);
1627
1628        return err;
1629}
1630
1631void sk_reuseport_prog_free(struct bpf_prog *prog)
1632{
1633        if (!prog)
1634                return;
1635
1636        if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
1637                bpf_prog_put(prog);
1638        else
1639                bpf_prog_destroy(prog);
1640}
1641
1642struct bpf_scratchpad {
1643        union {
1644                __be32 diff[MAX_BPF_STACK / sizeof(__be32)];
1645                u8     buff[MAX_BPF_STACK];
1646        };
1647};
1648
1649static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp);
1650
1651static inline int __bpf_try_make_writable(struct sk_buff *skb,
1652                                          unsigned int write_len)
1653{
1654        return skb_ensure_writable(skb, write_len);
1655}
1656
1657static inline int bpf_try_make_writable(struct sk_buff *skb,
1658                                        unsigned int write_len)
1659{
1660        int err = __bpf_try_make_writable(skb, write_len);
1661
1662        bpf_compute_data_pointers(skb);
1663        return err;
1664}
1665
1666static int bpf_try_make_head_writable(struct sk_buff *skb)
1667{
1668        return bpf_try_make_writable(skb, skb_headlen(skb));
1669}
1670
1671static inline void bpf_push_mac_rcsum(struct sk_buff *skb)
1672{
1673        if (skb_at_tc_ingress(skb))
1674                skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len);
1675}
1676
1677static inline void bpf_pull_mac_rcsum(struct sk_buff *skb)
1678{
1679        if (skb_at_tc_ingress(skb))
1680                skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len);
1681}
1682
1683BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset,
1684           const void *, from, u32, len, u64, flags)
1685{
1686        void *ptr;
1687
1688        if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH)))
1689                return -EINVAL;
1690        if (unlikely(offset > 0xffff))
1691                return -EFAULT;
1692        if (unlikely(bpf_try_make_writable(skb, offset + len)))
1693                return -EFAULT;
1694
1695        ptr = skb->data + offset;
1696        if (flags & BPF_F_RECOMPUTE_CSUM)
1697                __skb_postpull_rcsum(skb, ptr, len, offset);
1698
1699        memcpy(ptr, from, len);
1700
1701        if (flags & BPF_F_RECOMPUTE_CSUM)
1702                __skb_postpush_rcsum(skb, ptr, len, offset);
1703        if (flags & BPF_F_INVALIDATE_HASH)
1704                skb_clear_hash(skb);
1705
1706        return 0;
1707}
1708
1709static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
1710        .func           = bpf_skb_store_bytes,
1711        .gpl_only       = false,
1712        .ret_type       = RET_INTEGER,
1713        .arg1_type      = ARG_PTR_TO_CTX,
1714        .arg2_type      = ARG_ANYTHING,
1715        .arg3_type      = ARG_PTR_TO_MEM,
1716        .arg4_type      = ARG_CONST_SIZE,
1717        .arg5_type      = ARG_ANYTHING,
1718};
1719
1720BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset,
1721           void *, to, u32, len)
1722{
1723        void *ptr;
1724
1725        if (unlikely(offset > 0xffff))
1726                goto err_clear;
1727
1728        ptr = skb_header_pointer(skb, offset, len, to);
1729        if (unlikely(!ptr))
1730                goto err_clear;
1731        if (ptr != to)
1732                memcpy(to, ptr, len);
1733
1734        return 0;
1735err_clear:
1736        memset(to, 0, len);
1737        return -EFAULT;
1738}
1739
1740static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
1741        .func           = bpf_skb_load_bytes,
1742        .gpl_only       = false,
1743        .ret_type       = RET_INTEGER,
1744        .arg1_type      = ARG_PTR_TO_CTX,
1745        .arg2_type      = ARG_ANYTHING,
1746        .arg3_type      = ARG_PTR_TO_UNINIT_MEM,
1747        .arg4_type      = ARG_CONST_SIZE,
1748};
1749
1750BPF_CALL_4(bpf_flow_dissector_load_bytes,
1751           const struct bpf_flow_dissector *, ctx, u32, offset,
1752           void *, to, u32, len)
1753{
1754        void *ptr;
1755
1756        if (unlikely(offset > 0xffff))
1757                goto err_clear;
1758
1759        if (unlikely(!ctx->skb))
1760                goto err_clear;
1761
1762        ptr = skb_header_pointer(ctx->skb, offset, len, to);
1763        if (unlikely(!ptr))
1764                goto err_clear;
1765        if (ptr != to)
1766                memcpy(to, ptr, len);
1767
1768        return 0;
1769err_clear:
1770        memset(to, 0, len);
1771        return -EFAULT;
1772}
1773
1774static const struct bpf_func_proto bpf_flow_dissector_load_bytes_proto = {
1775        .func           = bpf_flow_dissector_load_bytes,
1776        .gpl_only       = false,
1777        .ret_type       = RET_INTEGER,
1778        .arg1_type      = ARG_PTR_TO_CTX,
1779        .arg2_type      = ARG_ANYTHING,
1780        .arg3_type      = ARG_PTR_TO_UNINIT_MEM,
1781        .arg4_type      = ARG_CONST_SIZE,
1782};
1783
1784BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb,
1785           u32, offset, void *, to, u32, len, u32, start_header)
1786{
1787        u8 *end = skb_tail_pointer(skb);
1788        u8 *start, *ptr;
1789
1790        if (unlikely(offset > 0xffff))
1791                goto err_clear;
1792
1793        switch (start_header) {
1794        case BPF_HDR_START_MAC:
1795                if (unlikely(!skb_mac_header_was_set(skb)))
1796                        goto err_clear;
1797                start = skb_mac_header(skb);
1798                break;
1799        case BPF_HDR_START_NET:
1800                start = skb_network_header(skb);
1801                break;
1802        default:
1803                goto err_clear;
1804        }
1805
1806        ptr = start + offset;
1807
1808        if (likely(ptr + len <= end)) {
1809                memcpy(to, ptr, len);
1810                return 0;
1811        }
1812
1813err_clear:
1814        memset(to, 0, len);
1815        return -EFAULT;
1816}
1817
1818static const struct bpf_func_proto bpf_skb_load_bytes_relative_proto = {
1819        .func           = bpf_skb_load_bytes_relative,
1820        .gpl_only       = false,
1821        .ret_type       = RET_INTEGER,
1822        .arg1_type      = ARG_PTR_TO_CTX,
1823        .arg2_type      = ARG_ANYTHING,
1824        .arg3_type      = ARG_PTR_TO_UNINIT_MEM,
1825        .arg4_type      = ARG_CONST_SIZE,
1826        .arg5_type      = ARG_ANYTHING,
1827};
1828
1829BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)
1830{
1831        /* Idea is the following: should the needed direct read/write
1832         * test fail during runtime, we can pull in more data and redo
1833         * again, since implicitly, we invalidate previous checks here.
1834         *
1835         * Or, since we know how much we need to make read/writeable,
1836         * this can be done once at the program beginning for direct
1837         * access case. By this we overcome limitations of only current
1838         * headroom being accessible.
1839         */
1840        return bpf_try_make_writable(skb, len ? : skb_headlen(skb));
1841}
1842
1843static const struct bpf_func_proto bpf_skb_pull_data_proto = {
1844        .func           = bpf_skb_pull_data,
1845        .gpl_only       = false,
1846        .ret_type       = RET_INTEGER,
1847        .arg1_type      = ARG_PTR_TO_CTX,
1848        .arg2_type      = ARG_ANYTHING,
1849};
1850
1851BPF_CALL_1(bpf_sk_fullsock, struct sock *, sk)
1852{
1853        return sk_fullsock(sk) ? (unsigned long)sk : (unsigned long)NULL;
1854}
1855
1856static const struct bpf_func_proto bpf_sk_fullsock_proto = {
1857        .func           = bpf_sk_fullsock,
1858        .gpl_only       = false,
1859        .ret_type       = RET_PTR_TO_SOCKET_OR_NULL,
1860        .arg1_type      = ARG_PTR_TO_SOCK_COMMON,
1861};
1862
1863static inline int sk_skb_try_make_writable(struct sk_buff *skb,
1864                                           unsigned int write_len)
1865{
1866        return __bpf_try_make_writable(skb, write_len);
1867}
1868
1869BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len)
1870{
1871        /* Idea is the following: should the needed direct read/write
1872         * test fail during runtime, we can pull in more data and redo
1873         * again, since implicitly, we invalidate previous checks here.
1874         *
1875         * Or, since we know how much we need to make read/writeable,
1876         * this can be done once at the program beginning for direct
1877         * access case. By this we overcome limitations of only current
1878         * headroom being accessible.
1879         */
1880        return sk_skb_try_make_writable(skb, len ? : skb_headlen(skb));
1881}
1882
1883static const struct bpf_func_proto sk_skb_pull_data_proto = {
1884        .func           = sk_skb_pull_data,
1885        .gpl_only       = false,
1886        .ret_type       = RET_INTEGER,
1887        .arg1_type      = ARG_PTR_TO_CTX,
1888        .arg2_type      = ARG_ANYTHING,
1889};
1890
1891BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset,
1892           u64, from, u64, to, u64, flags)
1893{
1894        __sum16 *ptr;
1895
1896        if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK)))
1897                return -EINVAL;
1898        if (unlikely(offset > 0xffff || offset & 1))
1899                return -EFAULT;
1900        if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr))))
1901                return -EFAULT;
1902
1903        ptr = (__sum16 *)(skb->data + offset);
1904        switch (flags & BPF_F_HDR_FIELD_MASK) {
1905        case 0:
1906                if (unlikely(from != 0))
1907                        return -EINVAL;
1908
1909                csum_replace_by_diff(ptr, to);
1910                break;
1911        case 2:
1912                csum_replace2(ptr, from, to);
1913                break;
1914        case 4:
1915                csum_replace4(ptr, from, to);
1916                break;
1917        default:
1918                return -EINVAL;
1919        }
1920
1921        return 0;
1922}
1923
1924static const struct bpf_func_proto bpf_l3_csum_replace_proto = {
1925        .func           = bpf_l3_csum_replace,
1926        .gpl_only       = false,
1927        .ret_type       = RET_INTEGER,
1928        .arg1_type      = ARG_PTR_TO_CTX,
1929        .arg2_type      = ARG_ANYTHING,
1930        .arg3_type      = ARG_ANYTHING,
1931        .arg4_type      = ARG_ANYTHING,
1932        .arg5_type      = ARG_ANYTHING,
1933};
1934
1935BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset,
1936           u64, from, u64, to, u64, flags)
1937{
1938        bool is_pseudo = flags & BPF_F_PSEUDO_HDR;
1939        bool is_mmzero = flags & BPF_F_MARK_MANGLED_0;
1940        bool do_mforce = flags & BPF_F_MARK_ENFORCE;
1941        __sum16 *ptr;
1942
1943        if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE |
1944                               BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK)))
1945                return -EINVAL;
1946        if (unlikely(offset > 0xffff || offset & 1))
1947                return -EFAULT;
1948        if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr))))
1949                return -EFAULT;
1950
1951        ptr = (__sum16 *)(skb->data + offset);
1952        if (is_mmzero && !do_mforce && !*ptr)
1953                return 0;
1954
1955        switch (flags & BPF_F_HDR_FIELD_MASK) {
1956        case 0:
1957                if (unlikely(from != 0))
1958                        return -EINVAL;
1959
1960                inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo);
1961                break;
1962        case 2:
1963                inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo);
1964                break;
1965        case 4:
1966                inet_proto_csum_replace4(ptr, skb, from, to, is_pseudo);
1967                break;
1968        default:
1969                return -EINVAL;
1970        }
1971
1972        if (is_mmzero && !*ptr)
1973                *ptr = CSUM_MANGLED_0;
1974        return 0;
1975}
1976
1977static const struct bpf_func_proto bpf_l4_csum_replace_proto = {
1978        .func           = bpf_l4_csum_replace,
1979        .gpl_only       = false,
1980        .ret_type       = RET_INTEGER,
1981        .arg1_type      = ARG_PTR_TO_CTX,
1982        .arg2_type      = ARG_ANYTHING,
1983        .arg3_type      = ARG_ANYTHING,
1984        .arg4_type      = ARG_ANYTHING,
1985        .arg5_type      = ARG_ANYTHING,
1986};
1987
1988BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size,
1989           __be32 *, to, u32, to_size, __wsum, seed)
1990{
1991        struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp);
1992        u32 diff_size = from_size + to_size;
1993        int i, j = 0;
1994
1995        /* This is quite flexible, some examples:
1996         *
1997         * from_size == 0, to_size > 0,  seed := csum --> pushing data
1998         * from_size > 0,  to_size == 0, seed := csum --> pulling data
1999         * from_size > 0,  to_size > 0,  seed := 0    --> diffing data
2000         *
2001         * Even for diffing, from_size and to_size don't need to be equal.
2002         */
2003        if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) ||
2004                     diff_size > sizeof(sp->diff)))
2005                return -EINVAL;
2006
2007        for (i = 0; i < from_size / sizeof(__be32); i++, j++)
2008                sp->diff[j] = ~from[i];
2009        for (i = 0; i <   to_size / sizeof(__be32); i++, j++)
2010                sp->diff[j] = to[i];
2011
2012        return csum_partial(sp->diff, diff_size, seed);
2013}
2014
2015static const struct bpf_func_proto bpf_csum_diff_proto = {
2016        .func           = bpf_csum_diff,
2017        .gpl_only       = false,
2018        .pkt_access     = true,
2019        .ret_type       = RET_INTEGER,
2020        .arg1_type      = ARG_PTR_TO_MEM_OR_NULL,
2021        .arg2_type      = ARG_CONST_SIZE_OR_ZERO,
2022        .arg3_type      = ARG_PTR_TO_MEM_OR_NULL,
2023        .arg4_type      = ARG_CONST_SIZE_OR_ZERO,
2024        .arg5_type      = ARG_ANYTHING,
2025};
2026
2027BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum)
2028{
2029        /* The interface is to be used in combination with bpf_csum_diff()
2030         * for direct packet writes. csum rotation for alignment as well
2031         * as emulating csum_sub() can be done from the eBPF program.
2032         */
2033        if (skb->ip_summed == CHECKSUM_COMPLETE)
2034                return (skb->csum = csum_add(skb->csum, csum));
2035
2036        return -ENOTSUPP;
2037}
2038
2039static const struct bpf_func_proto bpf_csum_update_proto = {
2040        .func           = bpf_csum_update,
2041        .gpl_only       = false,
2042        .ret_type       = RET_INTEGER,
2043        .arg1_type      = ARG_PTR_TO_CTX,
2044        .arg2_type      = ARG_ANYTHING,
2045};
2046
2047BPF_CALL_2(bpf_csum_level, struct sk_buff *, skb, u64, level)
2048{
2049        /* The interface is to be used in combination with bpf_skb_adjust_room()
2050         * for encap/decap of packet headers when BPF_F_ADJ_ROOM_NO_CSUM_RESET
2051         * is passed as flags, for example.
2052         */
2053        switch (level) {
2054        case BPF_CSUM_LEVEL_INC:
2055                __skb_incr_checksum_unnecessary(skb);
2056                break;
2057        case BPF_CSUM_LEVEL_DEC:
2058                __skb_decr_checksum_unnecessary(skb);
2059                break;
2060        case BPF_CSUM_LEVEL_RESET:
2061                __skb_reset_checksum_unnecessary(skb);
2062                break;
2063        case BPF_CSUM_LEVEL_QUERY:
2064                return skb->ip_summed == CHECKSUM_UNNECESSARY ?
2065                       skb->csum_level : -EACCES;
2066        default:
2067                return -EINVAL;
2068        }
2069
2070        return 0;
2071}
2072
2073static const struct bpf_func_proto bpf_csum_level_proto = {
2074        .func           = bpf_csum_level,
2075        .gpl_only       = false,
2076        .ret_type       = RET_INTEGER,
2077        .arg1_type      = ARG_PTR_TO_CTX,
2078        .arg2_type      = ARG_ANYTHING,
2079};
2080
2081static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
2082{
2083        return dev_forward_skb_nomtu(dev, skb);
2084}
2085
2086static inline int __bpf_rx_skb_no_mac(struct net_device *dev,
2087                                      struct sk_buff *skb)
2088{
2089        int ret = ____dev_forward_skb(dev, skb, false);
2090
2091        if (likely(!ret)) {
2092                skb->dev = dev;
2093                ret = netif_rx(skb);
2094        }
2095
2096        return ret;
2097}
2098
2099static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
2100{
2101        int ret;
2102
2103        if (dev_xmit_recursion()) {
2104                net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
2105                kfree_skb(skb);
2106                return -ENETDOWN;
2107        }
2108
2109        skb->dev = dev;
2110        skb->tstamp = 0;
2111
2112        dev_xmit_recursion_inc();
2113        ret = dev_queue_xmit(skb);
2114        dev_xmit_recursion_dec();
2115
2116        return ret;
2117}
2118
2119static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev,
2120                                 u32 flags)
2121{
2122        unsigned int mlen = skb_network_offset(skb);
2123
2124        if (mlen) {
2125                __skb_pull(skb, mlen);
2126
2127                /* At ingress, the mac header has already been pulled once.
2128                 * At egress, skb_pospull_rcsum has to be done in case that
2129                 * the skb is originated from ingress (i.e. a forwarded skb)
2130                 * to ensure that rcsum starts at net header.
2131                 */
2132                if (!skb_at_tc_ingress(skb))
2133                        skb_postpull_rcsum(skb, skb_mac_header(skb), mlen);
2134        }
2135        skb_pop_mac_header(skb);
2136        skb_reset_mac_len(skb);
2137        return flags & BPF_F_INGRESS ?
2138               __bpf_rx_skb_no_mac(dev, skb) : __bpf_tx_skb(dev, skb);
2139}
2140
2141static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,
2142                                 u32 flags)
2143{
2144        /* Verify that a link layer header is carried */
2145        if (unlikely(skb->mac_header >= skb->network_header)) {
2146                kfree_skb(skb);
2147                return -ERANGE;
2148        }
2149
2150        bpf_push_mac_rcsum(skb);
2151        return flags & BPF_F_INGRESS ?
2152               __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
2153}
2154
2155static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
2156                          u32 flags)
2157{
2158        if (dev_is_mac_header_xmit(dev))
2159                return __bpf_redirect_common(skb, dev, flags);
2160        else
2161                return __bpf_redirect_no_mac(skb, dev, flags);
2162}
2163
2164#if IS_ENABLED(CONFIG_IPV6)
2165static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
2166                            struct net_device *dev, struct bpf_nh_params *nh)
2167{
2168        u32 hh_len = LL_RESERVED_SPACE(dev);
2169        const struct in6_addr *nexthop;
2170        struct dst_entry *dst = NULL;
2171        struct neighbour *neigh;
2172
2173        if (dev_xmit_recursion()) {
2174                net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
2175                goto out_drop;
2176        }
2177
2178        skb->dev = dev;
2179        skb->tstamp = 0;
2180
2181        if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
2182                struct sk_buff *skb2;
2183
2184                skb2 = skb_realloc_headroom(skb, hh_len);
2185                if (unlikely(!skb2)) {
2186                        kfree_skb(skb);
2187                        return -ENOMEM;
2188                }
2189                if (skb->sk)
2190                        skb_set_owner_w(skb2, skb->sk);
2191                consume_skb(skb);
2192                skb = skb2;
2193        }
2194
2195        rcu_read_lock_bh();
2196        if (!nh) {
2197                dst = skb_dst(skb);
2198                nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst),
2199                                      &ipv6_hdr(skb)->daddr);
2200        } else {
2201                nexthop = &nh->ipv6_nh;
2202        }
2203        neigh = ip_neigh_gw6(dev, nexthop);
2204        if (likely(!IS_ERR(neigh))) {
2205                int ret;
2206
2207                sock_confirm_neigh(skb, neigh);
2208                dev_xmit_recursion_inc();
2209                ret = neigh_output(neigh, skb, false);
2210                dev_xmit_recursion_dec();
2211                rcu_read_unlock_bh();
2212                return ret;
2213        }
2214        rcu_read_unlock_bh();
2215        if (dst)
2216                IP6_INC_STATS(dev_net(dst->dev),
2217                              ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
2218out_drop:
2219        kfree_skb(skb);
2220        return -ENETDOWN;
2221}
2222
2223static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,
2224                                   struct bpf_nh_params *nh)
2225{
2226        const struct ipv6hdr *ip6h = ipv6_hdr(skb);
2227        struct net *net = dev_net(dev);
2228        int err, ret = NET_XMIT_DROP;
2229
2230        if (!nh) {
2231                struct dst_entry *dst;
2232                struct flowi6 fl6 = {
2233                        .flowi6_flags = FLOWI_FLAG_ANYSRC,
2234                        .flowi6_mark  = skb->mark,
2235                        .flowlabel    = ip6_flowinfo(ip6h),
2236                        .flowi6_oif   = dev->ifindex,
2237                        .flowi6_proto = ip6h->nexthdr,
2238                        .daddr        = ip6h->daddr,
2239                        .saddr        = ip6h->saddr,
2240                };
2241
2242                dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL);
2243                if (IS_ERR(dst))
2244                        goto out_drop;
2245
2246                skb_dst_set(skb, dst);
2247        } else if (nh->nh_family != AF_INET6) {
2248                goto out_drop;
2249        }
2250
2251        err = bpf_out_neigh_v6(net, skb, dev, nh);
2252        if (unlikely(net_xmit_eval(err)))
2253                dev->stats.tx_errors++;
2254        else
2255                ret = NET_XMIT_SUCCESS;
2256        goto out_xmit;
2257out_drop:
2258        dev->stats.tx_errors++;
2259        kfree_skb(skb);
2260out_xmit:
2261        return ret;
2262}
2263#else
2264static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,
2265                                   struct bpf_nh_params *nh)
2266{
2267        kfree_skb(skb);
2268        return NET_XMIT_DROP;
2269}
2270#endif /* CONFIG_IPV6 */
2271
2272#if IS_ENABLED(CONFIG_INET)
2273static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,
2274                            struct net_device *dev, struct bpf_nh_params *nh)
2275{
2276        u32 hh_len = LL_RESERVED_SPACE(dev);
2277        struct neighbour *neigh;
2278        bool is_v6gw = false;
2279
2280        if (dev_xmit_recursion()) {
2281                net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
2282                goto out_drop;
2283        }
2284
2285        skb->dev = dev;
2286        skb->tstamp = 0;
2287
2288        if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
2289                struct sk_buff *skb2;
2290
2291                skb2 = skb_realloc_headroom(skb, hh_len);
2292                if (unlikely(!skb2)) {
2293                        kfree_skb(skb);
2294                        return -ENOMEM;
2295                }
2296                if (skb->sk)
2297                        skb_set_owner_w(skb2, skb->sk);
2298                consume_skb(skb);
2299                skb = skb2;
2300        }
2301
2302        rcu_read_lock_bh();
2303        if (!nh) {
2304                struct dst_entry *dst = skb_dst(skb);
2305                struct rtable *rt = container_of(dst, struct rtable, dst);
2306
2307                neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
2308        } else if (nh->nh_family == AF_INET6) {
2309                neigh = ip_neigh_gw6(dev, &nh->ipv6_nh);
2310                is_v6gw = true;
2311        } else if (nh->nh_family == AF_INET) {
2312                neigh = ip_neigh_gw4(dev, nh->ipv4_nh);
2313        } else {
2314                rcu_read_unlock_bh();
2315                goto out_drop;
2316        }
2317
2318        if (likely(!IS_ERR(neigh))) {
2319                int ret;
2320
2321                sock_confirm_neigh(skb, neigh);
2322                dev_xmit_recursion_inc();
2323                ret = neigh_output(neigh, skb, is_v6gw);
2324                dev_xmit_recursion_dec();
2325                rcu_read_unlock_bh();
2326                return ret;
2327        }
2328        rcu_read_unlock_bh();
2329out_drop:
2330        kfree_skb(skb);
2331        return -ENETDOWN;
2332}
2333
2334static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
2335                                   struct bpf_nh_params *nh)
2336{
2337        const struct iphdr *ip4h = ip_hdr(skb);
2338        struct net *net = dev_net(dev);
2339        int err, ret = NET_XMIT_DROP;
2340
2341        if (!nh) {
2342                struct flowi4 fl4 = {
2343                        .flowi4_flags = FLOWI_FLAG_ANYSRC,
2344                        .flowi4_mark  = skb->mark,
2345                        .flowi4_tos   = RT_TOS(ip4h->tos),
2346                        .flowi4_oif   = dev->ifindex,
2347                        .flowi4_proto = ip4h->protocol,
2348                        .daddr        = ip4h->daddr,
2349                        .saddr        = ip4h->saddr,
2350                };
2351                struct rtable *rt;
2352
2353                rt = ip_route_output_flow(net, &fl4, NULL);
2354                if (IS_ERR(rt))
2355                        goto out_drop;
2356                if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) {
2357                        ip_rt_put(rt);
2358                        goto out_drop;
2359                }
2360
2361                skb_dst_set(skb, &rt->dst);
2362        }
2363
2364        err = bpf_out_neigh_v4(net, skb, dev, nh);
2365        if (unlikely(net_xmit_eval(err)))
2366                dev->stats.tx_errors++;
2367        else
2368                ret = NET_XMIT_SUCCESS;
2369        goto out_xmit;
2370out_drop:
2371        dev->stats.tx_errors++;
2372        kfree_skb(skb);
2373out_xmit:
2374        return ret;
2375}
2376#else
2377static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
2378                                   struct bpf_nh_params *nh)
2379{
2380        kfree_skb(skb);
2381        return NET_XMIT_DROP;
2382}
2383#endif /* CONFIG_INET */
2384
2385static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev,
2386                                struct bpf_nh_params *nh)
2387{
2388        struct ethhdr *ethh = eth_hdr(skb);
2389
2390        if (unlikely(skb->mac_header >= skb->network_header))
2391                goto out;
2392        bpf_push_mac_rcsum(skb);
2393        if (is_multicast_ether_addr(ethh->h_dest))
2394                goto out;
2395
2396        skb_pull(skb, sizeof(*ethh));
2397        skb_unset_mac_header(skb);
2398        skb_reset_network_header(skb);
2399
2400        if (skb->protocol == htons(ETH_P_IP))
2401                return __bpf_redirect_neigh_v4(skb, dev, nh);
2402        else if (skb->protocol == htons(ETH_P_IPV6))
2403                return __bpf_redirect_neigh_v6(skb, dev, nh);
2404out:
2405        kfree_skb(skb);
2406        return -ENOTSUPP;
2407}
2408
2409/* Internal, non-exposed redirect flags. */
2410enum {
2411        BPF_F_NEIGH     = (1ULL << 1),
2412        BPF_F_PEER      = (1ULL << 2),
2413        BPF_F_NEXTHOP   = (1ULL << 3),
2414#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP)
2415};
2416
2417BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
2418{
2419        struct net_device *dev;
2420        struct sk_buff *clone;
2421        int ret;
2422
2423        if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))
2424                return -EINVAL;
2425
2426        dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex);
2427        if (unlikely(!dev))
2428                return -EINVAL;
2429
2430        clone = skb_clone(skb, GFP_ATOMIC);
2431        if (unlikely(!clone))
2432                return -ENOMEM;
2433
2434        /* For direct write, we need to keep the invariant that the skbs
2435         * we're dealing with need to be uncloned. Should uncloning fail
2436         * here, we need to free the just generated clone to unclone once
2437         * again.
2438         */
2439        ret = bpf_try_make_head_writable(skb);
2440        if (unlikely(ret)) {
2441                kfree_skb(clone);
2442                return -ENOMEM;
2443        }
2444
2445        return __bpf_redirect(clone, dev, flags);
2446}
2447
2448static const struct bpf_func_proto bpf_clone_redirect_proto = {
2449        .func           = bpf_clone_redirect,
2450        .gpl_only       = false,
2451        .ret_type       = RET_INTEGER,
2452        .arg1_type      = ARG_PTR_TO_CTX,
2453        .arg2_type      = ARG_ANYTHING,
2454        .arg3_type      = ARG_ANYTHING,
2455};
2456
2457DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
2458EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info);
2459
2460int skb_do_redirect(struct sk_buff *skb)
2461{
2462        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
2463        struct net *net = dev_net(skb->dev);
2464        struct net_device *dev;
2465        u32 flags = ri->flags;
2466
2467        dev = dev_get_by_index_rcu(net, ri->tgt_index);
2468        ri->tgt_index = 0;
2469        ri->flags = 0;
2470        if (unlikely(!dev))
2471                goto out_drop;
2472        if (flags & BPF_F_PEER) {
2473                const struct net_device_ops *ops = dev->netdev_ops;
2474
2475                if (unlikely(!ops->ndo_get_peer_dev ||
2476                             !skb_at_tc_ingress(skb)))
2477                        goto out_drop;
2478                dev = ops->ndo_get_peer_dev(dev);
2479                if (unlikely(!dev ||
2480                             !(dev->flags & IFF_UP) ||
2481                             net_eq(net, dev_net(dev))))
2482                        goto out_drop;
2483                skb->dev = dev;
2484                return -EAGAIN;
2485        }
2486        return flags & BPF_F_NEIGH ?
2487               __bpf_redirect_neigh(skb, dev, flags & BPF_F_NEXTHOP ?
2488                                    &ri->nh : NULL) :
2489               __bpf_redirect(skb, dev, flags);
2490out_drop:
2491        kfree_skb(skb);
2492        return -EINVAL;
2493}
2494
2495BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
2496{
2497        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
2498
2499        if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))
2500                return TC_ACT_SHOT;
2501
2502        ri->flags = flags;
2503        ri->tgt_index = ifindex;
2504
2505        return TC_ACT_REDIRECT;
2506}
2507
2508static const struct bpf_func_proto bpf_redirect_proto = {
2509        .func           = bpf_redirect,
2510        .gpl_only       = false,
2511        .ret_type       = RET_INTEGER,
2512        .arg1_type      = ARG_ANYTHING,
2513        .arg2_type      = ARG_ANYTHING,
2514};
2515
2516BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags)
2517{
2518        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
2519
2520        if (unlikely(flags))
2521                return TC_ACT_SHOT;
2522
2523        ri->flags = BPF_F_PEER;
2524        ri->tgt_index = ifindex;
2525
2526        return TC_ACT_REDIRECT;
2527}
2528
2529static const struct bpf_func_proto bpf_redirect_peer_proto = {
2530        .func           = bpf_redirect_peer,
2531        .gpl_only       = false,
2532        .ret_type       = RET_INTEGER,
2533        .arg1_type      = ARG_ANYTHING,
2534        .arg2_type      = ARG_ANYTHING,
2535};
2536
2537BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params,
2538           int, plen, u64, flags)
2539{
2540        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
2541
2542        if (unlikely((plen && plen < sizeof(*params)) || flags))
2543                return TC_ACT_SHOT;
2544
2545        ri->flags = BPF_F_NEIGH | (plen ? BPF_F_NEXTHOP : 0);
2546        ri->tgt_index = ifindex;
2547
2548        BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params));
2549        if (plen)
2550                memcpy(&ri->nh, params, sizeof(ri->nh));
2551
2552        return TC_ACT_REDIRECT;
2553}
2554
2555static const struct bpf_func_proto bpf_redirect_neigh_proto = {
2556        .func           = bpf_redirect_neigh,
2557        .gpl_only       = false,
2558        .ret_type       = RET_INTEGER,
2559        .arg1_type      = ARG_ANYTHING,
2560        .arg2_type      = ARG_PTR_TO_MEM_OR_NULL,
2561        .arg3_type      = ARG_CONST_SIZE_OR_ZERO,
2562        .arg4_type      = ARG_ANYTHING,
2563};
2564
2565BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes)
2566{
2567        msg->apply_bytes = bytes;
2568        return 0;
2569}
2570
2571static const struct bpf_func_proto bpf_msg_apply_bytes_proto = {
2572        .func           = bpf_msg_apply_bytes,
2573        .gpl_only       = false,
2574        .ret_type       = RET_INTEGER,
2575        .arg1_type      = ARG_PTR_TO_CTX,
2576        .arg2_type      = ARG_ANYTHING,
2577};
2578
2579BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes)
2580{
2581        msg->cork_bytes = bytes;
2582        return 0;
2583}
2584
2585static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
2586        .func           = bpf_msg_cork_bytes,
2587        .gpl_only       = false,
2588        .ret_type       = RET_INTEGER,
2589        .arg1_type      = ARG_PTR_TO_CTX,
2590        .arg2_type      = ARG_ANYTHING,
2591};
2592
2593BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
2594           u32, end, u64, flags)
2595{
2596        u32 len = 0, offset = 0, copy = 0, poffset = 0, bytes = end - start;
2597        u32 first_sge, last_sge, i, shift, bytes_sg_total;
2598        struct scatterlist *sge;
2599        u8 *raw, *to, *from;
2600        struct page *page;
2601
2602        if (unlikely(flags || end <= start))
2603                return -EINVAL;
2604
2605        /* First find the starting scatterlist element */
2606        i = msg->sg.start;
2607        do {
2608                offset += len;
2609                len = sk_msg_elem(msg, i)->length;
2610                if (start < offset + len)
2611                        break;
2612                sk_msg_iter_var_next(i);
2613        } while (i != msg->sg.end);
2614
2615        if (unlikely(start >= offset + len))
2616                return -EINVAL;
2617
2618        first_sge = i;
2619        /* The start may point into the sg element so we need to also
2620         * account for the headroom.
2621         */
2622        bytes_sg_total = start - offset + bytes;
2623        if (!test_bit(i, &msg->sg.copy) && bytes_sg_total <= len)
2624                goto out;
2625
2626        /* At this point we need to linearize multiple scatterlist
2627         * elements or a single shared page. Either way we need to
2628         * copy into a linear buffer exclusively owned by BPF. Then
2629         * place the buffer in the scatterlist and fixup the original
2630         * entries by removing the entries now in the linear buffer
2631         * and shifting the remaining entries. For now we do not try
2632         * to copy partial entries to avoid complexity of running out
2633         * of sg_entry slots. The downside is reading a single byte
2634         * will copy the entire sg entry.
2635         */
2636        do {
2637                copy += sk_msg_elem(msg, i)->length;
2638                sk_msg_iter_var_next(i);
2639                if (bytes_sg_total <= copy)
2640                        break;
2641        } while (i != msg->sg.end);
2642        last_sge = i;
2643
2644        if (unlikely(bytes_sg_total > copy))
2645                return -EINVAL;
2646
2647        page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP,
2648                           get_order(copy));
2649        if (unlikely(!page))
2650                return -ENOMEM;
2651
2652        raw = page_address(page);
2653        i = first_sge;
2654        do {
2655                sge = sk_msg_elem(msg, i);
2656                from = sg_virt(sge);
2657                len = sge->length;
2658                to = raw + poffset;
2659
2660                memcpy(to, from, len);
2661                poffset += len;
2662                sge->length = 0;
2663                put_page(sg_page(sge));
2664
2665                sk_msg_iter_var_next(i);
2666        } while (i != last_sge);
2667
2668        sg_set_page(&msg->sg.data[first_sge], page, copy, 0);
2669
2670        /* To repair sg ring we need to shift entries. If we only
2671         * had a single entry though we can just replace it and
2672         * be done. Otherwise walk the ring and shift the entries.
2673         */
2674        WARN_ON_ONCE(last_sge == first_sge);
2675        shift = last_sge > first_sge ?
2676                last_sge - first_sge - 1 :
2677                NR_MSG_FRAG_IDS - first_sge + last_sge - 1;
2678        if (!shift)
2679                goto out;
2680
2681        i = first_sge;
2682        sk_msg_iter_var_next(i);
2683        do {
2684                u32 move_from;
2685
2686                if (i + shift >= NR_MSG_FRAG_IDS)
2687                        move_from = i + shift - NR_MSG_FRAG_IDS;
2688                else
2689                        move_from = i + shift;
2690                if (move_from == msg->sg.end)
2691                        break;
2692
2693                msg->sg.data[i] = msg->sg.data[move_from];
2694                msg->sg.data[move_from].length = 0;
2695                msg->sg.data[move_from].page_link = 0;
2696                msg->sg.data[move_from].offset = 0;
2697                sk_msg_iter_var_next(i);
2698        } while (1);
2699
2700        msg->sg.end = msg->sg.end - shift > msg->sg.end ?
2701                      msg->sg.end - shift + NR_MSG_FRAG_IDS :
2702                      msg->sg.end - shift;
2703out:
2704        msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset;
2705        msg->data_end = msg->data + bytes;
2706        return 0;
2707}
2708
2709static const struct bpf_func_proto bpf_msg_pull_data_proto = {
2710        .func           = bpf_msg_pull_data,
2711        .gpl_only       = false,
2712        .ret_type       = RET_INTEGER,
2713        .arg1_type      = ARG_PTR_TO_CTX,
2714        .arg2_type      = ARG_ANYTHING,
2715        .arg3_type      = ARG_ANYTHING,
2716        .arg4_type      = ARG_ANYTHING,
2717};
2718
2719BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
2720           u32, len, u64, flags)
2721{
2722        struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge;
2723        u32 new, i = 0, l = 0, space, copy = 0, offset = 0;
2724        u8 *raw, *to, *from;
2725        struct page *page;
2726
2727        if (unlikely(flags))
2728                return -EINVAL;
2729
2730        /* First find the starting scatterlist element */
2731        i = msg->sg.start;
2732        do {
2733                offset += l;
2734                l = sk_msg_elem(msg, i)->length;
2735
2736                if (start < offset + l)
2737                        break;
2738                sk_msg_iter_var_next(i);
2739        } while (i != msg->sg.end);
2740
2741        if (start >= offset + l)
2742                return -EINVAL;
2743
2744        space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);
2745
2746        /* If no space available will fallback to copy, we need at
2747         * least one scatterlist elem available to push data into
2748         * when start aligns to the beginning of an element or two
2749         * when it falls inside an element. We handle the start equals
2750         * offset case because its the common case for inserting a
2751         * header.
2752         */
2753        if (!space || (space == 1 && start != offset))
2754                copy = msg->sg.data[i].length;
2755
2756        page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP,
2757                           get_order(copy + len));
2758        if (unlikely(!page))
2759                return -ENOMEM;
2760
2761        if (copy) {
2762                int front, back;
2763
2764                raw = page_address(page);
2765
2766                psge = sk_msg_elem(msg, i);
2767                front = start - offset;
2768                back = psge->length - front;
2769                from = sg_virt(psge);
2770
2771                if (front)
2772                        memcpy(raw, from, front);
2773
2774                if (back) {
2775                        from += front;
2776                        to = raw + front + len;
2777
2778                        memcpy(to, from, back);
2779                }
2780
2781                put_page(sg_page(psge));
2782        } else if (start - offset) {
2783                psge = sk_msg_elem(msg, i);
2784                rsge = sk_msg_elem_cpy(msg, i);
2785
2786                psge->length = start - offset;
2787                rsge.length -= psge->length;
2788                rsge.offset += start;
2789
2790                sk_msg_iter_var_next(i);
2791                sg_unmark_end(psge);
2792                sg_unmark_end(&rsge);
2793                sk_msg_iter_next(msg, end);
2794        }
2795
2796        /* Slot(s) to place newly allocated data */
2797        new = i;
2798
2799        /* Shift one or two slots as needed */
2800        if (!copy) {
2801                sge = sk_msg_elem_cpy(msg, i);
2802
2803                sk_msg_iter_var_next(i);
2804                sg_unmark_end(&sge);
2805                sk_msg_iter_next(msg, end);
2806
2807                nsge = sk_msg_elem_cpy(msg, i);
2808                if (rsge.length) {
2809                        sk_msg_iter_var_next(i);
2810                        nnsge = sk_msg_elem_cpy(msg, i);
2811                }
2812
2813                while (i != msg->sg.end) {
2814                        msg->sg.data[i] = sge;
2815                        sge = nsge;
2816                        sk_msg_iter_var_next(i);
2817                        if (rsge.length) {
2818                                nsge = nnsge;
2819                                nnsge = sk_msg_elem_cpy(msg, i);
2820                        } else {
2821                                nsge = sk_msg_elem_cpy(msg, i);
2822                        }
2823                }
2824        }
2825
2826        /* Place newly allocated data buffer */
2827        sk_mem_charge(msg->sk, len);
2828        msg->sg.size += len;
2829        __clear_bit(new, &msg->sg.copy);
2830        sg_set_page(&msg->sg.data[new], page, len + copy, 0);
2831        if (rsge.length) {
2832                get_page(sg_page(&rsge));
2833                sk_msg_iter_var_next(new);
2834                msg->sg.data[new] = rsge;
2835        }
2836
2837        sk_msg_compute_data_pointers(msg);
2838        return 0;
2839}
2840
2841static const struct bpf_func_proto bpf_msg_push_data_proto = {
2842        .func           = bpf_msg_push_data,
2843        .gpl_only       = false,
2844        .ret_type       = RET_INTEGER,
2845        .arg1_type      = ARG_PTR_TO_CTX,
2846        .arg2_type      = ARG_ANYTHING,
2847        .arg3_type      = ARG_ANYTHING,
2848        .arg4_type      = ARG_ANYTHING,
2849};
2850
2851static void sk_msg_shift_left(struct sk_msg *msg, int i)
2852{
2853        int prev;
2854
2855        do {
2856                prev = i;
2857                sk_msg_iter_var_next(i);
2858                msg->sg.data[prev] = msg->sg.data[i];
2859        } while (i != msg->sg.end);
2860
2861        sk_msg_iter_prev(msg, end);
2862}
2863
2864static void sk_msg_shift_right(struct sk_msg *msg, int i)
2865{
2866        struct scatterlist tmp, sge;
2867
2868        sk_msg_iter_next(msg, end);
2869        sge = sk_msg_elem_cpy(msg, i);
2870        sk_msg_iter_var_next(i);
2871        tmp = sk_msg_elem_cpy(msg, i);
2872
2873        while (i != msg->sg.end) {
2874                msg->sg.data[i] = sge;
2875                sk_msg_iter_var_next(i);
2876                sge = tmp;
2877                tmp = sk_msg_elem_cpy(msg, i);
2878        }
2879}
2880
2881BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
2882           u32, len, u64, flags)
2883{
2884        u32 i = 0, l = 0, space, offset = 0;
2885        u64 last = start + len;
2886        int pop;
2887
2888        if (unlikely(flags))
2889                return -EINVAL;
2890
2891        /* First find the starting scatterlist element */
2892        i = msg->sg.start;
2893        do {
2894                offset += l;
2895                l = sk_msg_elem(msg, i)->length;
2896
2897                if (start < offset + l)
2898                        break;
2899                sk_msg_iter_var_next(i);
2900        } while (i != msg->sg.end);
2901
2902        /* Bounds checks: start and pop must be inside message */
2903        if (start >= offset + l || last >= msg->sg.size)
2904                return -EINVAL;
2905
2906        space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);
2907
2908        pop = len;
2909        /* --------------| offset
2910         * -| start      |-------- len -------|
2911         *
2912         *  |----- a ----|-------- pop -------|----- b ----|
2913         *  |______________________________________________| length
2914         *
2915         *
2916         * a:   region at front of scatter element to save
2917         * b:   region at back of scatter element to save when length > A + pop
2918         * pop: region to pop from element, same as input 'pop' here will be
2919         *      decremented below per iteration.
2920         *
2921         * Two top-level cases to handle when start != offset, first B is non
2922         * zero and second B is zero corresponding to when a pop includes more
2923         * than one element.
2924         *
2925         * Then if B is non-zero AND there is no space allocate space and
2926         * compact A, B regions into page. If there is space shift ring to
2927         * the rigth free'ing the next element in ring to place B, leaving
2928         * A untouched except to reduce length.
2929         */
2930        if (start != offset) {
2931                struct scatterlist *nsge, *sge = sk_msg_elem(msg, i);
2932                int a = start;
2933                int b = sge->length - pop - a;
2934
2935                sk_msg_iter_var_next(i);
2936
2937                if (pop < sge->length - a) {
2938                        if (space) {
2939                                sge->length = a;
2940                                sk_msg_shift_right(msg, i);
2941                                nsge = sk_msg_elem(msg, i);
2942                                get_page(sg_page(sge));
2943                                sg_set_page(nsge,
2944                                            sg_page(sge),
2945                                            b, sge->offset + pop + a);
2946                        } else {
2947                                struct page *page, *orig;
2948                                u8 *to, *from;
2949
2950                                page = alloc_pages(__GFP_NOWARN |
2951                                                   __GFP_COMP   | GFP_ATOMIC,
2952                                                   get_order(a + b));
2953                                if (unlikely(!page))
2954                                        return -ENOMEM;
2955
2956                                sge->length = a;
2957                                orig = sg_page(sge);
2958                                from = sg_virt(sge);
2959                                to = page_address(page);
2960                                memcpy(to, from, a);
2961                                memcpy(to + a, from + a + pop, b);
2962                                sg_set_page(sge, page, a + b, 0);
2963                                put_page(orig);
2964                        }
2965                        pop = 0;
2966                } else if (pop >= sge->length - a) {
2967                        pop -= (sge->length - a);
2968                        sge->length = a;
2969                }
2970        }
2971
2972        /* From above the current layout _must_ be as follows,
2973         *
2974         * -| offset
2975         * -| start
2976         *
2977         *  |---- pop ---|---------------- b ------------|
2978         *  |____________________________________________| length
2979         *
2980         * Offset and start of the current msg elem are equal because in the
2981         * previous case we handled offset != start and either consumed the
2982         * entire element and advanced to the next element OR pop == 0.
2983         *
2984         * Two cases to handle here are first pop is less than the length
2985         * leaving some remainder b above. Simply adjust the element's layout
2986         * in this case. Or pop >= length of the element so that b = 0. In this
2987         * case advance to next element decrementing pop.
2988         */
2989        while (pop) {
2990                struct scatterlist *sge = sk_msg_elem(msg, i);
2991
2992                if (pop < sge->length) {
2993                        sge->length -= pop;
2994                        sge->offset += pop;
2995                        pop = 0;
2996                } else {
2997                        pop -= sge->length;
2998                        sk_msg_shift_left(msg, i);
2999                }
3000                sk_msg_iter_var_next(i);
3001        }
3002
3003        sk_mem_uncharge(msg->sk, len - pop);
3004        msg->sg.size -= (len - pop);
3005        sk_msg_compute_data_pointers(msg);
3006        return 0;
3007}
3008
3009static const struct bpf_func_proto bpf_msg_pop_data_proto = {
3010        .func           = bpf_msg_pop_data,
3011        .gpl_only       = false,
3012        .ret_type       = RET_INTEGER,
3013        .arg1_type      = ARG_PTR_TO_CTX,
3014        .arg2_type      = ARG_ANYTHING,
3015        .arg3_type      = ARG_ANYTHING,
3016        .arg4_type      = ARG_ANYTHING,
3017};
3018
3019#ifdef CONFIG_CGROUP_NET_CLASSID
3020BPF_CALL_0(bpf_get_cgroup_classid_curr)
3021{
3022        return __task_get_classid(current);
3023}
3024
3025static const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = {
3026        .func           = bpf_get_cgroup_classid_curr,
3027        .gpl_only       = false,
3028        .ret_type       = RET_INTEGER,
3029};
3030
3031BPF_CALL_1(bpf_skb_cgroup_classid, const struct sk_buff *, skb)
3032{
3033        struct sock *sk = skb_to_full_sk(skb);
3034
3035        if (!sk || !sk_fullsock(sk))
3036                return 0;
3037
3038        return sock_cgroup_classid(&sk->sk_cgrp_data);
3039}
3040
3041static const struct bpf_func_proto bpf_skb_cgroup_classid_proto = {
3042        .func           = bpf_skb_cgroup_classid,
3043        .gpl_only       = false,
3044        .ret_type       = RET_INTEGER,
3045        .arg1_type      = ARG_PTR_TO_CTX,
3046};
3047#endif
3048
3049BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
3050{
3051        return task_get_classid(skb);
3052}
3053
3054static const struct bpf_func_proto bpf_get_cgroup_classid_proto = {
3055        .func           = bpf_get_cgroup_classid,
3056        .gpl_only       = false,
3057        .ret_type       = RET_INTEGER,
3058        .arg1_type      = ARG_PTR_TO_CTX,
3059};
3060
3061BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb)
3062{
3063        return dst_tclassid(skb);
3064}
3065
3066static const struct bpf_func_proto bpf_get_route_realm_proto = {
3067        .func           = bpf_get_route_realm,
3068        .gpl_only       = false,
3069        .ret_type       = RET_INTEGER,
3070        .arg1_type      = ARG_PTR_TO_CTX,
3071};
3072
3073BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb)
3074{
3075        /* If skb_clear_hash() was called due to mangling, we can
3076         * trigger SW recalculation here. Later access to hash
3077         * can then use the inline skb->hash via context directly
3078         * instead of calling this helper again.
3079         */
3080        return skb_get_hash(skb);
3081}
3082
3083static const struct bpf_func_proto bpf_get_hash_recalc_proto = {
3084        .func           = bpf_get_hash_recalc,
3085        .gpl_only       = false,
3086        .ret_type       = RET_INTEGER,
3087        .arg1_type      = ARG_PTR_TO_CTX,
3088};
3089
3090BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb)
3091{
3092        /* After all direct packet write, this can be used once for
3093         * triggering a lazy recalc on next skb_get_hash() invocation.
3094         */
3095        skb_clear_hash(skb);
3096        return 0;
3097}
3098
3099static const struct bpf_func_proto bpf_set_hash_invalid_proto = {
3100        .func           = bpf_set_hash_invalid,
3101        .gpl_only       = false,
3102        .ret_type       = RET_INTEGER,
3103        .arg1_type      = ARG_PTR_TO_CTX,
3104};
3105
3106BPF_CALL_2(bpf_set_hash, struct sk_buff *, skb, u32, hash)
3107{
3108        /* Set user specified hash as L4(+), so that it gets returned
3109         * on skb_get_hash() call unless BPF prog later on triggers a
3110         * skb_clear_hash().
3111         */
3112        __skb_set_sw_hash(skb, hash, true);
3113        return 0;
3114}
3115
3116static const struct bpf_func_proto bpf_set_hash_proto = {
3117        .func           = bpf_set_hash,
3118        .gpl_only       = false,
3119        .ret_type       = RET_INTEGER,
3120        .arg1_type      = ARG_PTR_TO_CTX,
3121        .arg2_type      = ARG_ANYTHING,
3122};
3123
3124BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,
3125           u16, vlan_tci)
3126{
3127        int ret;
3128
3129        if (unlikely(vlan_proto != htons(ETH_P_8021Q) &&
3130                     vlan_proto != htons(ETH_P_8021AD)))
3131                vlan_proto = htons(ETH_P_8021Q);
3132
3133        bpf_push_mac_rcsum(skb);
3134        ret = skb_vlan_push(skb, vlan_proto, vlan_tci);
3135        bpf_pull_mac_rcsum(skb);
3136
3137        bpf_compute_data_pointers(skb);
3138        return ret;
3139}
3140
3141static const struct bpf_func_proto bpf_skb_vlan_push_proto = {
3142        .func           = bpf_skb_vlan_push,
3143        .gpl_only       = false,
3144        .ret_type       = RET_INTEGER,
3145        .arg1_type      = ARG_PTR_TO_CTX,
3146        .arg2_type      = ARG_ANYTHING,
3147        .arg3_type      = ARG_ANYTHING,
3148};
3149
3150BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb)
3151{
3152        int ret;
3153
3154        bpf_push_mac_rcsum(skb);
3155        ret = skb_vlan_pop(skb);
3156        bpf_pull_mac_rcsum(skb);
3157
3158        bpf_compute_data_pointers(skb);
3159        return ret;
3160}
3161
3162static const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
3163        .func           = bpf_skb_vlan_pop,
3164        .gpl_only       = false,
3165        .ret_type       = RET_INTEGER,
3166        .arg1_type      = ARG_PTR_TO_CTX,
3167};
3168
3169static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
3170{
3171        /* Caller already did skb_cow() with len as headroom,
3172         * so no need to do it here.
3173         */
3174        skb_push(skb, len);
3175        memmove(skb->data, skb->data + len, off);
3176        memset(skb->data + off, 0, len);
3177
3178        /* No skb_postpush_rcsum(skb, skb->data + off, len)
3179         * needed here as it does not change the skb->csum
3180         * result for checksum complete when summing over
3181         * zeroed blocks.
3182         */
3183        return 0;
3184}
3185
3186static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len)
3187{
3188        /* skb_ensure_writable() is not needed here, as we're
3189         * already working on an uncloned skb.
3190         */
3191        if (unlikely(!pskb_may_pull(skb, off + len)))
3192                return -ENOMEM;
3193
3194        skb_postpull_rcsum(skb, skb->data + off, len);
3195        memmove(skb->data + len, skb->data, off);
3196        __skb_pull(skb, len);
3197
3198        return 0;
3199}
3200
3201static int bpf_skb_net_hdr_push(struct sk_buff *skb, u32 off, u32 len)
3202{
3203        bool trans_same = skb->transport_header == skb->network_header;
3204        int ret;
3205
3206        /* There's no need for __skb_push()/__skb_pull() pair to
3207         * get to the start of the mac header as we're guaranteed
3208         * to always start from here under eBPF.
3209         */
3210        ret = bpf_skb_generic_push(skb, off, len);
3211        if (likely(!ret)) {
3212                skb->mac_header -= len;
3213                skb->network_header -= len;
3214                if (trans_same)
3215                        skb->transport_header = skb->network_header;
3216        }
3217
3218        return ret;
3219}
3220
3221static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len)
3222{
3223        bool trans_same = skb->transport_header == skb->network_header;
3224        int ret;
3225
3226        /* Same here, __skb_push()/__skb_pull() pair not needed. */
3227        ret = bpf_skb_generic_pop(skb, off, len);
3228        if (likely(!ret)) {
3229                skb->mac_header += len;
3230                skb->network_header += len;
3231                if (trans_same)
3232                        skb->transport_header = skb->network_header;
3233        }
3234
3235        return ret;
3236}
3237
3238static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
3239{
3240        const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
3241        u32 off = skb_mac_header_len(skb);
3242        int ret;
3243
3244        ret = skb_cow(skb, len_diff);
3245        if (unlikely(ret < 0))
3246                return ret;
3247
3248        ret = bpf_skb_net_hdr_push(skb, off, len_diff);
3249        if (unlikely(ret < 0))
3250                return ret;
3251
3252        if (skb_is_gso(skb)) {
3253                struct skb_shared_info *shinfo = skb_shinfo(skb);
3254
3255                /* SKB_GSO_TCPV4 needs to be changed into SKB_GSO_TCPV6. */
3256                if (shinfo->gso_type & SKB_GSO_TCPV4) {
3257                        shinfo->gso_type &= ~SKB_GSO_TCPV4;
3258                        shinfo->gso_type |=  SKB_GSO_TCPV6;
3259                }
3260        }
3261
3262        skb->protocol = htons(ETH_P_IPV6);
3263        skb_clear_hash(skb);
3264
3265        return 0;
3266}
3267
3268static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
3269{
3270        const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
3271        u32 off = skb_mac_header_len(skb);
3272        int ret;
3273
3274        ret = skb_unclone(skb, GFP_ATOMIC);
3275        if (unlikely(ret < 0))
3276                return ret;
3277
3278        ret = bpf_skb_net_hdr_pop(skb, off, len_diff);
3279        if (unlikely(ret < 0))
3280                return ret;
3281
3282        if (skb_is_gso(skb)) {
3283                struct skb_shared_info *shinfo = skb_shinfo(skb);
3284
3285                /* SKB_GSO_TCPV6 needs to be changed into SKB_GSO_TCPV4. */
3286                if (shinfo->gso_type & SKB_GSO_TCPV6) {
3287                        shinfo->gso_type &= ~SKB_GSO_TCPV6;
3288                        shinfo->gso_type |=  SKB_GSO_TCPV4;
3289                }
3290        }
3291
3292        skb->protocol = htons(ETH_P_IP);
3293        skb_clear_hash(skb);
3294
3295        return 0;
3296}
3297
3298static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto)
3299{
3300        __be16 from_proto = skb->protocol;
3301
3302        if (from_proto == htons(ETH_P_IP) &&
3303              to_proto == htons(ETH_P_IPV6))
3304                return bpf_skb_proto_4_to_6(skb);
3305
3306        if (from_proto == htons(ETH_P_IPV6) &&
3307              to_proto == htons(ETH_P_IP))
3308                return bpf_skb_proto_6_to_4(skb);
3309
3310        return -ENOTSUPP;
3311}
3312
3313BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto,
3314           u64, flags)
3315{
3316        int ret;
3317
3318        if (unlikely(flags))
3319                return -EINVAL;
3320
3321        /* General idea is that this helper does the basic groundwork
3322         * needed for changing the protocol, and eBPF program fills the
3323         * rest through bpf_skb_store_bytes(), bpf_lX_csum_replace()
3324         * and other helpers, rather than passing a raw buffer here.
3325         *
3326         * The rationale is to keep this minimal and without a need to
3327         * deal with raw packet data. F.e. even if we would pass buffers
3328         * here, the program still needs to call the bpf_lX_csum_replace()
3329         * helpers anyway. Plus, this way we keep also separation of
3330         * concerns, since f.e. bpf_skb_store_bytes() should only take
3331         * care of stores.
3332         *
3333         * Currently, additional options and extension header space are
3334         * not supported, but flags register is reserved so we can adapt
3335         * that. For offloads, we mark packet as dodgy, so that headers
3336         * need to be verified first.
3337         */
3338        ret = bpf_skb_proto_xlat(skb, proto);
3339        bpf_compute_data_pointers(skb);
3340        return ret;
3341}
3342
3343static const struct bpf_func_proto bpf_skb_change_proto_proto = {
3344        .func           = bpf_skb_change_proto,
3345        .gpl_only       = false,
3346        .ret_type       = RET_INTEGER,
3347        .arg1_type      = ARG_PTR_TO_CTX,
3348        .arg2_type      = ARG_ANYTHING,
3349        .arg3_type      = ARG_ANYTHING,
3350};
3351
3352BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type)
3353{
3354        /* We only allow a restricted subset to be changed for now. */
3355        if (unlikely(!skb_pkt_type_ok(skb->pkt_type) ||
3356                     !skb_pkt_type_ok(pkt_type)))
3357                return -EINVAL;
3358
3359        skb->pkt_type = pkt_type;
3360        return 0;
3361}
3362
3363static const struct bpf_func_proto bpf_skb_change_type_proto = {
3364        .func           = bpf_skb_change_type,
3365        .gpl_only       = false,
3366        .ret_type       = RET_INTEGER,
3367        .arg1_type      = ARG_PTR_TO_CTX,
3368        .arg2_type      = ARG_ANYTHING,
3369};
3370
3371static u32 bpf_skb_net_base_len(const struct sk_buff *skb)
3372{
3373        switch (skb->protocol) {
3374        case htons(ETH_P_IP):
3375                return sizeof(struct iphdr);
3376        case htons(ETH_P_IPV6):
3377                return sizeof(struct ipv6hdr);
3378        default:
3379                return ~0U;
3380        }
3381}
3382
3383#define BPF_F_ADJ_ROOM_ENCAP_L3_MASK    (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \
3384                                         BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
3385
3386#define BPF_F_ADJ_ROOM_MASK             (BPF_F_ADJ_ROOM_FIXED_GSO | \
3387                                         BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \
3388                                         BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \
3389                                         BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \
3390                                         BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \
3391                                         BPF_F_ADJ_ROOM_ENCAP_L2( \
3392                                          BPF_ADJ_ROOM_ENCAP_L2_MASK))
3393
3394static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
3395                            u64 flags)
3396{
3397        u8 inner_mac_len = flags >> BPF_ADJ_ROOM_ENCAP_L2_SHIFT;
3398        bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK;
3399        u16 mac_len = 0, inner_net = 0, inner_trans = 0;
3400        unsigned int gso_type = SKB_GSO_DODGY;
3401        int ret;
3402
3403        if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) {
3404                /* udp gso_size delineates datagrams, only allow if fixed */
3405                if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) ||
3406                    !(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
3407                        return -ENOTSUPP;
3408        }
3409
3410        ret = skb_cow_head(skb, len_diff);
3411        if (unlikely(ret < 0))
3412                return ret;
3413
3414        if (encap) {
3415                if (skb->protocol != htons(ETH_P_IP) &&
3416                    skb->protocol != htons(ETH_P_IPV6))
3417                        return -ENOTSUPP;
3418
3419                if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 &&
3420                    flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
3421                        return -EINVAL;
3422
3423                if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE &&
3424                    flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
3425                        return -EINVAL;
3426
3427                if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH &&
3428                    inner_mac_len < ETH_HLEN)
3429                        return -EINVAL;
3430
3431                if (skb->encapsulation)
3432                        return -EALREADY;
3433
3434                mac_len = skb->network_header - skb->mac_header;
3435                inner_net = skb->network_header;
3436                if (inner_mac_len > len_diff)
3437                        return -EINVAL;
3438                inner_trans = skb->transport_header;
3439        }
3440
3441        ret = bpf_skb_net_hdr_push(skb, off, len_diff);
3442        if (unlikely(ret < 0))
3443                return ret;
3444
3445        if (encap) {
3446                skb->inner_mac_header = inner_net - inner_mac_len;
3447                skb->inner_network_header = inner_net;
3448                skb->inner_transport_header = inner_trans;
3449
3450                if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH)
3451                        skb_set_inner_protocol(skb, htons(ETH_P_TEB));
3452                else
3453                        skb_set_inner_protocol(skb, skb->protocol);
3454
3455                skb->encapsulation = 1;
3456                skb_set_network_header(skb, mac_len);
3457
3458                if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
3459                        gso_type |= SKB_GSO_UDP_TUNNEL;
3460                else if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE)
3461                        gso_type |= SKB_GSO_GRE;
3462                else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
3463                        gso_type |= SKB_GSO_IPXIP6;
3464                else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4)
3465                        gso_type |= SKB_GSO_IPXIP4;
3466
3467                if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE ||
3468                    flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) {
3469                        int nh_len = flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 ?
3470                                        sizeof(struct ipv6hdr) :
3471                                        sizeof(struct iphdr);
3472
3473                        skb_set_transport_header(skb, mac_len + nh_len);
3474                }
3475
3476                /* Match skb->protocol to new outer l3 protocol */
3477                if (skb->protocol == htons(ETH_P_IP) &&
3478                    flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
3479                        skb->protocol = htons(ETH_P_IPV6);
3480                else if (skb->protocol == htons(ETH_P_IPV6) &&
3481                         flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4)
3482                        skb->protocol = htons(ETH_P_IP);
3483        }
3484
3485        if (skb_is_gso(skb)) {
3486                struct skb_shared_info *shinfo = skb_shinfo(skb);
3487
3488                /* Due to header grow, MSS needs to be downgraded. */
3489                if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
3490                        skb_decrease_gso_size(shinfo, len_diff);
3491
3492                /* Header must be checked, and gso_segs recomputed. */
3493                shinfo->gso_type |= gso_type;
3494                shinfo->gso_segs = 0;
3495        }
3496
3497        return 0;
3498}
3499
3500static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff,
3501                              u64 flags)
3502{
3503        int ret;
3504
3505        if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO |
3506                               BPF_F_ADJ_ROOM_NO_CSUM_RESET)))
3507                return -EINVAL;
3508
3509        if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) {
3510                /* udp gso_size delineates datagrams, only allow if fixed */
3511                if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) ||
3512                    !(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
3513                        return -ENOTSUPP;
3514        }
3515
3516        ret = skb_unclone(skb, GFP_ATOMIC);
3517        if (unlikely(ret < 0))
3518                return ret;
3519
3520        ret = bpf_skb_net_hdr_pop(skb, off, len_diff);
3521        if (unlikely(ret < 0))
3522                return ret;
3523
3524        if (skb_is_gso(skb)) {
3525                struct skb_shared_info *shinfo = skb_shinfo(skb);
3526
3527                /* Due to header shrink, MSS can be upgraded. */
3528                if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
3529                        skb_increase_gso_size(shinfo, len_diff);
3530
3531                /* Header must be checked, and gso_segs recomputed. */
3532                shinfo->gso_type |= SKB_GSO_DODGY;
3533                shinfo->gso_segs = 0;
3534        }
3535
3536        return 0;
3537}
3538
3539#define BPF_SKB_MAX_LEN SKB_MAX_ALLOC
3540
3541BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
3542           u32, mode, u64, flags)
3543{
3544        u32 len_diff_abs = abs(len_diff);
3545        bool shrink = len_diff < 0;
3546        int ret = 0;
3547
3548        if (unlikely(flags || mode))
3549                return -EINVAL;
3550        if (unlikely(len_diff_abs > 0xfffU))
3551                return -EFAULT;
3552
3553        if (!shrink) {
3554                ret = skb_cow(skb, len_diff);
3555                if (unlikely(ret < 0))
3556                        return ret;
3557                __skb_push(skb, len_diff_abs);
3558                memset(skb->data, 0, len_diff_abs);
3559        } else {
3560                if (unlikely(!pskb_may_pull(skb, len_diff_abs)))
3561                        return -ENOMEM;
3562                __skb_pull(skb, len_diff_abs);
3563        }
3564        if (tls_sw_has_ctx_rx(skb->sk)) {
3565                struct strp_msg *rxm = strp_msg(skb);
3566
3567                rxm->full_len += len_diff;
3568        }
3569        return ret;
3570}
3571
3572static const struct bpf_func_proto sk_skb_adjust_room_proto = {
3573        .func           = sk_skb_adjust_room,
3574        .gpl_only       = false,
3575        .ret_type       = RET_INTEGER,
3576        .arg1_type      = ARG_PTR_TO_CTX,
3577        .arg2_type      = ARG_ANYTHING,
3578        .arg3_type      = ARG_ANYTHING,
3579        .arg4_type      = ARG_ANYTHING,
3580};
3581
3582BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
3583           u32, mode, u64, flags)
3584{
3585        u32 len_cur, len_diff_abs = abs(len_diff);
3586        u32 len_min = bpf_skb_net_base_len(skb);
3587        u32 len_max = BPF_SKB_MAX_LEN;
3588        __be16 proto = skb->protocol;
3589        bool shrink = len_diff < 0;
3590        u32 off;
3591        int ret;
3592
3593        if (unlikely(flags & ~(BPF_F_ADJ_ROOM_MASK |
3594                               BPF_F_ADJ_ROOM_NO_CSUM_RESET)))
3595                return -EINVAL;
3596        if (unlikely(len_diff_abs > 0xfffU))
3597                return -EFAULT;
3598        if (unlikely(proto != htons(ETH_P_IP) &&
3599                     proto != htons(ETH_P_IPV6)))
3600                return -ENOTSUPP;
3601
3602        off = skb_mac_header_len(skb);
3603        switch (mode) {
3604        case BPF_ADJ_ROOM_NET:
3605                off += bpf_skb_net_base_len(skb);
3606                break;
3607        case BPF_ADJ_ROOM_MAC:
3608                break;
3609        default:
3610                return -ENOTSUPP;
3611        }
3612
3613        len_cur = skb->len - skb_network_offset(skb);
3614        if ((shrink && (len_diff_abs >= len_cur ||
3615                        len_cur - len_diff_abs < len_min)) ||
3616            (!shrink && (skb->len + len_diff_abs > len_max &&
3617                         !skb_is_gso(skb))))
3618                return -ENOTSUPP;
3619
3620        ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs, flags) :
3621                       bpf_skb_net_grow(skb, off, len_diff_abs, flags);
3622        if (!ret && !(flags & BPF_F_ADJ_ROOM_NO_CSUM_RESET))
3623                __skb_reset_checksum_unnecessary(skb);
3624
3625        bpf_compute_data_pointers(skb);
3626        return ret;
3627}
3628
3629static const struct bpf_func_proto bpf_skb_adjust_room_proto = {
3630        .func           = bpf_skb_adjust_room,
3631        .gpl_only       = false,
3632        .ret_type       = RET_INTEGER,
3633        .arg1_type      = ARG_PTR_TO_CTX,
3634        .arg2_type      = ARG_ANYTHING,
3635        .arg3_type      = ARG_ANYTHING,
3636        .arg4_type      = ARG_ANYTHING,
3637};
3638
3639static u32 __bpf_skb_min_len(const struct sk_buff *skb)
3640{
3641        u32 min_len = skb_network_offset(skb);
3642
3643        if (skb_transport_header_was_set(skb))
3644                min_len = skb_transport_offset(skb);
3645        if (skb->ip_summed == CHECKSUM_PARTIAL)
3646                min_len = skb_checksum_start_offset(skb) +
3647                          skb->csum_offset + sizeof(__sum16);
3648        return min_len;
3649}
3650
3651static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len)
3652{
3653        unsigned int old_len = skb->len;
3654        int ret;
3655
3656        ret = __skb_grow_rcsum(skb, new_len);
3657        if (!ret)
3658                memset(skb->data + old_len, 0, new_len - old_len);
3659        return ret;
3660}
3661
3662static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len)
3663{
3664        return __skb_trim_rcsum(skb, new_len);
3665}
3666
3667static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len,
3668                                        u64 flags)
3669{
3670        u32 max_len = BPF_SKB_MAX_LEN;
3671        u32 min_len = __bpf_skb_min_len(skb);
3672        int ret;
3673
3674        if (unlikely(flags || new_len > max_len || new_len < min_len))
3675                return -EINVAL;
3676        if (skb->encapsulation)
3677                return -ENOTSUPP;
3678
3679        /* The basic idea of this helper is that it's performing the
3680         * needed work to either grow or trim an skb, and eBPF program
3681         * rewrites the rest via helpers like bpf_skb_store_bytes(),
3682         * bpf_lX_csum_replace() and others rather than passing a raw
3683         * buffer here. This one is a slow path helper and intended
3684         * for replies with control messages.
3685         *
3686         * Like in bpf_skb_change_proto(), we want to keep this rather
3687         * minimal and without protocol specifics so that we are able
3688         * to separate concerns as in bpf_skb_store_bytes() should only
3689         * be the one responsible for writing buffers.
3690         *
3691         * It's really expected to be a slow path operation here for
3692         * control message replies, so we're implicitly linearizing,
3693         * uncloning and drop offloads from the skb by this.
3694         */
3695        ret = __bpf_try_make_writable(skb, skb->len);
3696        if (!ret) {
3697                if (new_len > skb->len)
3698                        ret = bpf_skb_grow_rcsum(skb, new_len);
3699                else if (new_len < skb->len)
3700                        ret = bpf_skb_trim_rcsum(skb, new_len);
3701                if (!ret && skb_is_gso(skb))
3702                        skb_gso_reset(skb);
3703        }
3704        return ret;
3705}
3706
3707BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len,
3708           u64, flags)
3709{
3710        int ret = __bpf_skb_change_tail(skb, new_len, flags);
3711
3712        bpf_compute_data_pointers(skb);
3713        return ret;
3714}
3715
3716static const struct bpf_func_proto bpf_skb_change_tail_proto = {
3717        .func           = bpf_skb_change_tail,
3718        .gpl_only       = false,
3719        .ret_type       = RET_INTEGER,
3720        .arg1_type      = ARG_PTR_TO_CTX,
3721        .arg2_type      = ARG_ANYTHING,
3722        .arg3_type      = ARG_ANYTHING,
3723};
3724
3725BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len,
3726           u64, flags)
3727{
3728        return __bpf_skb_change_tail(skb, new_len, flags);
3729}
3730
3731static const struct bpf_func_proto sk_skb_change_tail_proto = {
3732        .func           = sk_skb_change_tail,
3733        .gpl_only       = false,
3734        .ret_type       = RET_INTEGER,
3735        .arg1_type      = ARG_PTR_TO_CTX,
3736        .arg2_type      = ARG_ANYTHING,
3737        .arg3_type      = ARG_ANYTHING,
3738};
3739
3740static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room,
3741                                        u64 flags)
3742{
3743        u32 max_len = BPF_SKB_MAX_LEN;
3744        u32 new_len = skb->len + head_room;
3745        int ret;
3746
3747        if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) ||
3748                     new_len < skb->len))
3749                return -EINVAL;
3750
3751        ret = skb_cow(skb, head_room);
3752        if (likely(!ret)) {
3753                /* Idea for this helper is that we currently only
3754                 * allow to expand on mac header. This means that
3755                 * skb->protocol network header, etc, stay as is.
3756                 * Compared to bpf_skb_change_tail(), we're more
3757                 * flexible due to not needing to linearize or
3758                 * reset GSO. Intention for this helper is to be
3759                 * used by an L3 skb that needs to push mac header
3760                 * for redirection into L2 device.
3761                 */
3762                __skb_push(skb, head_room);
3763                memset(skb->data, 0, head_room);
3764                skb_reset_mac_header(skb);
3765                skb_reset_mac_len(skb);
3766        }
3767
3768        return ret;
3769}
3770
3771BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room,
3772           u64, flags)
3773{
3774        int ret = __bpf_skb_change_head(skb, head_room, flags);
3775
3776        bpf_compute_data_pointers(skb);
3777        return ret;
3778}
3779
3780static const struct bpf_func_proto bpf_skb_change_head_proto = {
3781        .func           = bpf_skb_change_head,
3782        .gpl_only       = false,
3783        .ret_type       = RET_INTEGER,
3784        .arg1_type      = ARG_PTR_TO_CTX,
3785        .arg2_type      = ARG_ANYTHING,
3786        .arg3_type      = ARG_ANYTHING,
3787};
3788
3789BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room,
3790           u64, flags)
3791{
3792        return __bpf_skb_change_head(skb, head_room, flags);
3793}
3794
3795static const struct bpf_func_proto sk_skb_change_head_proto = {
3796        .func           = sk_skb_change_head,
3797        .gpl_only       = false,
3798        .ret_type       = RET_INTEGER,
3799        .arg1_type      = ARG_PTR_TO_CTX,
3800        .arg2_type      = ARG_ANYTHING,
3801        .arg3_type      = ARG_ANYTHING,
3802};
3803static unsigned long xdp_get_metalen(const struct xdp_buff *xdp)
3804{
3805        return xdp_data_meta_unsupported(xdp) ? 0 :
3806               xdp->data - xdp->data_meta;
3807}
3808
3809BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
3810{
3811        void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
3812        unsigned long metalen = xdp_get_metalen(xdp);
3813        void *data_start = xdp_frame_end + metalen;
3814        void *data = xdp->data + offset;
3815
3816        if (unlikely(data < data_start ||
3817                     data > xdp->data_end - ETH_HLEN))
3818                return -EINVAL;
3819
3820        if (metalen)
3821                memmove(xdp->data_meta + offset,
3822                        xdp->data_meta, metalen);
3823        xdp->data_meta += offset;
3824        xdp->data = data;
3825
3826        return 0;
3827}
3828
3829static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
3830        .func           = bpf_xdp_adjust_head,
3831        .gpl_only       = false,
3832        .ret_type       = RET_INTEGER,
3833        .arg1_type      = ARG_PTR_TO_CTX,
3834        .arg2_type      = ARG_ANYTHING,
3835};
3836
3837BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset)
3838{
3839        void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */
3840        void *data_end = xdp->data_end + offset;
3841
3842        /* Notice that xdp_data_hard_end have reserved some tailroom */
3843        if (unlikely(data_end > data_hard_end))
3844                return -EINVAL;
3845
3846        /* ALL drivers MUST init xdp->frame_sz, chicken check below */
3847        if (unlikely(xdp->frame_sz > PAGE_SIZE)) {
3848                WARN_ONCE(1, "Too BIG xdp->frame_sz = %d\n", xdp->frame_sz);
3849                return -EINVAL;
3850        }
3851
3852        if (unlikely(data_end < xdp->data + ETH_HLEN))
3853                return -EINVAL;
3854
3855        /* Clear memory area on grow, can contain uninit kernel memory */
3856        if (offset > 0)
3857                memset(xdp->data_end, 0, offset);
3858
3859        xdp->data_end = data_end;
3860
3861        return 0;
3862}
3863
3864static const struct bpf_func_proto bpf_xdp_adjust_tail_proto = {
3865        .func           = bpf_xdp_adjust_tail,
3866        .gpl_only       = false,
3867        .ret_type       = RET_INTEGER,
3868        .arg1_type      = ARG_PTR_TO_CTX,
3869        .arg2_type      = ARG_ANYTHING,
3870};
3871
3872BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
3873{
3874        void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
3875        void *meta = xdp->data_meta + offset;
3876        unsigned long metalen = xdp->data - meta;
3877
3878        if (xdp_data_meta_unsupported(xdp))
3879                return -ENOTSUPP;
3880        if (unlikely(meta < xdp_frame_end ||
3881                     meta > xdp->data))
3882                return -EINVAL;
3883        if (unlikely((metalen & (sizeof(__u32) - 1)) ||
3884                     (metalen > 32)))
3885                return -EACCES;
3886
3887        xdp->data_meta = meta;
3888
3889        return 0;
3890}
3891
3892static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
3893        .func           = bpf_xdp_adjust_meta,
3894        .gpl_only       = false,
3895        .ret_type       = RET_INTEGER,
3896        .arg1_type      = ARG_PTR_TO_CTX,
3897        .arg2_type      = ARG_ANYTHING,
3898};
3899
3900/* XDP_REDIRECT works by a three-step process, implemented in the functions
3901 * below:
3902 *
3903 * 1. The bpf_redirect() and bpf_redirect_map() helpers will lookup the target
3904 *    of the redirect and store it (along with some other metadata) in a per-CPU
3905 *    struct bpf_redirect_info.
3906 *
3907 * 2. When the program returns the XDP_REDIRECT return code, the driver will
3908 *    call xdp_do_redirect() which will use the information in struct
3909 *    bpf_redirect_info to actually enqueue the frame into a map type-specific
3910 *    bulk queue structure.
3911 *
3912 * 3. Before exiting its NAPI poll loop, the driver will call xdp_do_flush(),
3913 *    which will flush all the different bulk queues, thus completing the
3914 *    redirect.
3915 *
3916 * Pointers to the map entries will be kept around for this whole sequence of
3917 * steps, protected by RCU. However, there is no top-level rcu_read_lock() in
3918 * the core code; instead, the RCU protection relies on everything happening
3919 * inside a single NAPI poll sequence, which means it's between a pair of calls
3920 * to local_bh_disable()/local_bh_enable().
3921 *
3922 * The map entries are marked as __rcu and the map code makes sure to
3923 * dereference those pointers with rcu_dereference_check() in a way that works
3924 * for both sections that to hold an rcu_read_lock() and sections that are
3925 * called from NAPI without a separate rcu_read_lock(). The code below does not
3926 * use RCU annotations, but relies on those in the map code.
3927 */
3928void xdp_do_flush(void)
3929{
3930        __dev_flush();
3931        __cpu_map_flush();
3932        __xsk_map_flush();
3933}
3934EXPORT_SYMBOL_GPL(xdp_do_flush);
3935
3936void bpf_clear_redirect_map(struct bpf_map *map)
3937{
3938        struct bpf_redirect_info *ri;
3939        int cpu;
3940
3941        for_each_possible_cpu(cpu) {
3942                ri = per_cpu_ptr(&bpf_redirect_info, cpu);
3943                /* Avoid polluting remote cacheline due to writes if
3944                 * not needed. Once we pass this test, we need the
3945                 * cmpxchg() to make sure it hasn't been changed in
3946                 * the meantime by remote CPU.
3947                 */
3948                if (unlikely(READ_ONCE(ri->map) == map))
3949                        cmpxchg(&ri->map, map, NULL);
3950        }
3951}
3952
3953int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
3954                    struct bpf_prog *xdp_prog)
3955{
3956        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
3957        enum bpf_map_type map_type = ri->map_type;
3958        void *fwd = ri->tgt_value;
3959        u32 map_id = ri->map_id;
3960        struct bpf_map *map;
3961        int err;
3962
3963        ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
3964        ri->map_type = BPF_MAP_TYPE_UNSPEC;
3965
3966        switch (map_type) {
3967        case BPF_MAP_TYPE_DEVMAP:
3968                fallthrough;
3969        case BPF_MAP_TYPE_DEVMAP_HASH:
3970                map = READ_ONCE(ri->map);
3971                if (unlikely(map)) {
3972                        WRITE_ONCE(ri->map, NULL);
3973                        err = dev_map_enqueue_multi(xdp, dev, map,
3974                                                    ri->flags & BPF_F_EXCLUDE_INGRESS);
3975                } else {
3976                        err = dev_map_enqueue(fwd, xdp, dev);
3977                }
3978                break;
3979        case BPF_MAP_TYPE_CPUMAP:
3980                err = cpu_map_enqueue(fwd, xdp, dev);
3981                break;
3982        case BPF_MAP_TYPE_XSKMAP:
3983                err = __xsk_map_redirect(fwd, xdp);
3984                break;
3985        case BPF_MAP_TYPE_UNSPEC:
3986                if (map_id == INT_MAX) {
3987                        fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
3988                        if (unlikely(!fwd)) {
3989                                err = -EINVAL;
3990                                break;
3991                        }
3992                        err = dev_xdp_enqueue(fwd, xdp, dev);
3993                        break;
3994                }
3995                fallthrough;
3996        default:
3997                err = -EBADRQC;
3998        }
3999
4000        if (unlikely(err))
4001                goto err;
4002
4003        _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
4004        return 0;
4005err:
4006        _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
4007        return err;
4008}
4009EXPORT_SYMBOL_GPL(xdp_do_redirect);
4010
4011static int xdp_do_generic_redirect_map(struct net_device *dev,
4012                                       struct sk_buff *skb,
4013                                       struct xdp_buff *xdp,
4014                                       struct bpf_prog *xdp_prog,
4015                                       void *fwd,
4016                                       enum bpf_map_type map_type, u32 map_id)
4017{
4018        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
4019        struct bpf_map *map;
4020        int err;
4021
4022        switch (map_type) {
4023        case BPF_MAP_TYPE_DEVMAP:
4024                fallthrough;
4025        case BPF_MAP_TYPE_DEVMAP_HASH:
4026                map = READ_ONCE(ri->map);
4027                if (unlikely(map)) {
4028                        WRITE_ONCE(ri->map, NULL);
4029                        err = dev_map_redirect_multi(dev, skb, xdp_prog, map,
4030                                                     ri->flags & BPF_F_EXCLUDE_INGRESS);
4031                } else {
4032                        err = dev_map_generic_redirect(fwd, skb, xdp_prog);
4033                }
4034                if (unlikely(err))
4035                        goto err;
4036                break;
4037        case BPF_MAP_TYPE_XSKMAP:
4038                err = xsk_generic_rcv(fwd, xdp);
4039                if (err)
4040                        goto err;
4041                consume_skb(skb);
4042                break;
4043        default:
4044                /* TODO: Handle BPF_MAP_TYPE_CPUMAP */
4045                err = -EBADRQC;
4046                goto err;
4047        }
4048
4049        _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
4050        return 0;
4051err:
4052        _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
4053        return err;
4054}
4055
4056int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
4057                            struct xdp_buff *xdp, struct bpf_prog *xdp_prog)
4058{
4059        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
4060        enum bpf_map_type map_type = ri->map_type;
4061        void *fwd = ri->tgt_value;
4062        u32 map_id = ri->map_id;
4063        int err;
4064
4065        ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
4066        ri->map_type = BPF_MAP_TYPE_UNSPEC;
4067
4068        if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) {
4069                fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
4070                if (unlikely(!fwd)) {
4071                        err = -EINVAL;
4072                        goto err;
4073                }
4074
4075                err = xdp_ok_fwd_dev(fwd, skb->len);
4076                if (unlikely(err))
4077                        goto err;
4078
4079                skb->dev = fwd;
4080                _trace_xdp_redirect(dev, xdp_prog, ri->tgt_index);
4081                generic_xdp_tx(skb, xdp_prog);
4082                return 0;
4083        }
4084
4085        return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id);
4086err:
4087        _trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err);
4088        return err;
4089}
4090
4091BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
4092{
4093        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
4094
4095        if (unlikely(flags))
4096                return XDP_ABORTED;
4097
4098        /* NB! Map type UNSPEC and map_id == INT_MAX (never generated
4099         * by map_idr) is used for ifindex based XDP redirect.
4100         */
4101        ri->tgt_index = ifindex;
4102        ri->map_id = INT_MAX;
4103        ri->map_type = BPF_MAP_TYPE_UNSPEC;
4104
4105        return XDP_REDIRECT;
4106}
4107
4108static const struct bpf_func_proto bpf_xdp_redirect_proto = {
4109        .func           = bpf_xdp_redirect,
4110        .gpl_only       = false,
4111        .ret_type       = RET_INTEGER,
4112        .arg1_type      = ARG_ANYTHING,
4113        .arg2_type      = ARG_ANYTHING,
4114};
4115
4116BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex,
4117           u64, flags)
4118{
4119        return map->ops->map_redirect(map, ifindex, flags);
4120}
4121
4122static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
4123        .func           = bpf_xdp_redirect_map,
4124        .gpl_only       = false,
4125        .ret_type       = RET_INTEGER,
4126        .arg1_type      = ARG_CONST_MAP_PTR,
4127        .arg2_type      = ARG_ANYTHING,
4128        .arg3_type      = ARG_ANYTHING,
4129};
4130
4131static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
4132                                  unsigned long off, unsigned long len)
4133{
4134        void *ptr = skb_header_pointer(skb, off, len, dst_buff);
4135
4136        if (unlikely(!ptr))
4137                return len;
4138        if (ptr != dst_buff)
4139                memcpy(dst_buff, ptr, len);
4140
4141        return 0;
4142}
4143
4144BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map,
4145           u64, flags, void *, meta, u64, meta_size)
4146{
4147        u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
4148
4149        if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
4150                return -EINVAL;
4151        if (unlikely(!skb || skb_size > skb->len))
4152                return -EFAULT;
4153
4154        return bpf_event_output(map, flags, meta, meta_size, skb, skb_size,
4155                                bpf_skb_copy);
4156}
4157
4158static const struct bpf_func_proto bpf_skb_event_output_proto = {
4159        .func           = bpf_skb_event_output,
4160        .gpl_only       = true,
4161        .ret_type       = RET_INTEGER,
4162        .arg1_type      = ARG_PTR_TO_CTX,
4163        .arg2_type      = ARG_CONST_MAP_PTR,
4164        .arg3_type      = ARG_ANYTHING,
4165        .arg4_type      = ARG_PTR_TO_MEM,
4166        .arg5_type      = ARG_CONST_SIZE_OR_ZERO,
4167};
4168
4169BTF_ID_LIST_SINGLE(bpf_skb_output_btf_ids, struct, sk_buff)
4170
4171const struct bpf_func_proto bpf_skb_output_proto = {
4172        .func           = bpf_skb_event_output,
4173        .gpl_only       = true,
4174        .ret_type       = RET_INTEGER,
4175        .arg1_type      = ARG_PTR_TO_BTF_ID,
4176        .arg1_btf_id    = &bpf_skb_output_btf_ids[0],
4177        .arg2_type      = ARG_CONST_MAP_PTR,
4178        .arg3_type      = ARG_ANYTHING,
4179        .arg4_type      = ARG_PTR_TO_MEM,
4180        .arg5_type      = ARG_CONST_SIZE_OR_ZERO,
4181};
4182
4183static unsigned short bpf_tunnel_key_af(u64 flags)
4184{
4185        return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET;
4186}
4187
4188BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to,
4189           u32, size, u64, flags)
4190{
4191        const struct ip_tunnel_info *info = skb_tunnel_info(skb);
4192        u8 compat[sizeof(struct bpf_tunnel_key)];
4193        void *to_orig = to;
4194        int err;
4195
4196        if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6)))) {
4197                err = -EINVAL;
4198                goto err_clear;
4199        }
4200        if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) {
4201                err = -EPROTO;
4202                goto err_clear;
4203        }
4204        if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
4205                err = -EINVAL;
4206                switch (size) {
4207                case offsetof(struct bpf_tunnel_key, tunnel_label):
4208                case offsetof(struct bpf_tunnel_key, tunnel_ext):
4209                        goto set_compat;
4210                case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
4211                        /* Fixup deprecated structure layouts here, so we have
4212                         * a common path later on.
4213                         */
4214                        if (ip_tunnel_info_af(info) != AF_INET)
4215                                goto err_clear;
4216set_compat:
4217                        to = (struct bpf_tunnel_key *)compat;
4218                        break;
4219                default:
4220                        goto err_clear;
4221                }
4222        }
4223
4224        to->tunnel_id = be64_to_cpu(info->key.tun_id);
4225        to->tunnel_tos = info->key.tos;
4226        to->tunnel_ttl = info->key.ttl;
4227        to->tunnel_ext = 0;
4228
4229        if (flags & BPF_F_TUNINFO_IPV6) {
4230                memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
4231                       sizeof(to->remote_ipv6));
4232                to->tunnel_label = be32_to_cpu(info->key.label);
4233        } else {
4234                to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
4235                memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
4236                to->tunnel_label = 0;
4237        }
4238
4239        if (unlikely(size != sizeof(struct bpf_tunnel_key)))
4240                memcpy(to_orig, to, size);
4241
4242        return 0;
4243err_clear:
4244        memset(to_orig, 0, size);
4245        return err;
4246}
4247
4248static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
4249        .func           = bpf_skb_get_tunnel_key,
4250        .gpl_only       = false,
4251        .ret_type       = RET_INTEGER,
4252        .arg1_type      = ARG_PTR_TO_CTX,
4253        .arg2_type      = ARG_PTR_TO_UNINIT_MEM,
4254        .arg3_type      = ARG_CONST_SIZE,
4255        .arg4_type      = ARG_ANYTHING,
4256};
4257
4258BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size)
4259{
4260        const struct ip_tunnel_info *info = skb_tunnel_info(skb);
4261        int err;
4262
4263        if (unlikely(!info ||
4264                     !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) {
4265                err = -ENOENT;
4266                goto err_clear;
4267        }
4268        if (unlikely(size < info->options_len)) {
4269                err = -ENOMEM;
4270                goto err_clear;
4271        }
4272
4273        ip_tunnel_info_opts_get(to, info);
4274        if (size > info->options_len)
4275                memset(to + info->options_len, 0, size - info->options_len);
4276
4277        return info->options_len;
4278err_clear:
4279        memset(to, 0, size);
4280        return err;
4281}
4282
4283static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
4284        .func           = bpf_skb_get_tunnel_opt,
4285        .gpl_only       = false,
4286        .ret_type       = RET_INTEGER,
4287        .arg1_type      = ARG_PTR_TO_CTX,
4288        .arg2_type      = ARG_PTR_TO_UNINIT_MEM,
4289        .arg3_type      = ARG_CONST_SIZE,
4290};
4291
4292static struct metadata_dst __percpu *md_dst;
4293
4294BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
4295           const struct bpf_tunnel_key *, from, u32, size, u64, flags)
4296{
4297        struct metadata_dst *md = this_cpu_ptr(md_dst);
4298        u8 compat[sizeof(struct bpf_tunnel_key)];
4299        struct ip_tunnel_info *info;
4300
4301        if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX |
4302                               BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER)))
4303                return -EINVAL;
4304        if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
4305                switch (size) {
4306                case offsetof(struct bpf_tunnel_key, tunnel_label):
4307                case offsetof(struct bpf_tunnel_key, tunnel_ext):
4308                case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
4309                        /* Fixup deprecated structure layouts here, so we have
4310                         * a common path later on.
4311                         */
4312                        memcpy(compat, from, size);
4313                        memset(compat + size, 0, sizeof(compat) - size);
4314                        from = (const struct bpf_tunnel_key *) compat;
4315                        break;
4316                default:
4317                        return -EINVAL;
4318                }
4319        }
4320        if (unlikely((!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label) ||
4321                     from->tunnel_ext))
4322                return -EINVAL;
4323
4324        skb_dst_drop(skb);
4325        dst_hold((struct dst_entry *) md);
4326        skb_dst_set(skb, (struct dst_entry *) md);
4327
4328        info = &md->u.tun_info;
4329        memset(info, 0, sizeof(*info));
4330        info->mode = IP_TUNNEL_INFO_TX;
4331
4332        info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;
4333        if (flags & BPF_F_DONT_FRAGMENT)
4334                info->key.tun_flags |= TUNNEL_DONT_FRAGMENT;
4335        if (flags & BPF_F_ZERO_CSUM_TX)
4336                info->key.tun_flags &= ~TUNNEL_CSUM;
4337        if (flags & BPF_F_SEQ_NUMBER)
4338                info->key.tun_flags |= TUNNEL_SEQ;
4339
4340        info->key.tun_id = cpu_to_be64(from->tunnel_id);
4341        info->key.tos = from->tunnel_tos;
4342        info->key.ttl = from->tunnel_ttl;
4343
4344        if (flags & BPF_F_TUNINFO_IPV6) {
4345                info->mode |= IP_TUNNEL_INFO_IPV6;
4346                memcpy(&info->key.u.ipv6.dst, from->remote_ipv6,
4347                       sizeof(from->remote_ipv6));
4348                info->key.label = cpu_to_be32(from->tunnel_label) &
4349                                  IPV6_FLOWLABEL_MASK;
4350        } else {
4351                info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
4352        }
4353
4354        return 0;
4355}
4356
4357static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
4358        .func           = bpf_skb_set_tunnel_key,
4359        .gpl_only       = false,
4360        .ret_type       = RET_INTEGER,
4361        .arg1_type      = ARG_PTR_TO_CTX,
4362        .arg2_type      = ARG_PTR_TO_MEM,
4363        .arg3_type      = ARG_CONST_SIZE,
4364        .arg4_type      = ARG_ANYTHING,
4365};
4366
4367BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb,
4368           const u8 *, from, u32, size)
4369{
4370        struct ip_tunnel_info *info = skb_tunnel_info(skb);
4371        const struct metadata_dst *md = this_cpu_ptr(md_dst);
4372
4373        if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1))))
4374                return -EINVAL;
4375        if (unlikely(size > IP_TUNNEL_OPTS_MAX))
4376                return -ENOMEM;
4377
4378        ip_tunnel_info_opts_set(info, from, size, TUNNEL_OPTIONS_PRESENT);
4379
4380        return 0;
4381}
4382
4383static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
4384        .func           = bpf_skb_set_tunnel_opt,
4385        .gpl_only       = false,
4386        .ret_type       = RET_INTEGER,
4387        .arg1_type      = ARG_PTR_TO_CTX,
4388        .arg2_type      = ARG_PTR_TO_MEM,
4389        .arg3_type      = ARG_CONST_SIZE,
4390};
4391
4392static const struct bpf_func_proto *
4393bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
4394{
4395        if (!md_dst) {
4396                struct metadata_dst __percpu *tmp;
4397
4398                tmp = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX,
4399                                                METADATA_IP_TUNNEL,
4400                                                GFP_KERNEL);
4401                if (!tmp)
4402