linux/net/core/sock.c
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4 *              operating system.  INET is implemented using the  BSD Socket
   5 *              interface as the means of communication with the user level.
   6 *
   7 *              Generic socket support routines. Memory allocators, socket lock/release
   8 *              handlers for protocols to use and a generic option handler.
   9 *
  10 * Authors:     Ross Biro
  11 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *              Florian La Roche, <flla@stud.uni-sb.de>
  13 *              Alan Cox, <A.Cox@swansea.ac.uk>
  14 *
  15 * Fixes:
  16 *              Alan Cox        :       Numerous verify_area() problems
  17 *              Alan Cox        :       Connecting on a connecting socket
  18 *                                      now returns an error for tcp.
  19 *              Alan Cox        :       sock->protocol is set correctly.
  20 *                                      and is not sometimes left as 0.
  21 *              Alan Cox        :       connect handles icmp errors on a
  22 *                                      connect properly. Unfortunately there
  23 *                                      is a restart syscall nasty there. I
  24 *                                      can't match BSD without hacking the C
  25 *                                      library. Ideas urgently sought!
  26 *              Alan Cox        :       Disallow bind() to addresses that are
  27 *                                      not ours - especially broadcast ones!!
  28 *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29 *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30 *                                      instead they leave that for the DESTROY timer.
  31 *              Alan Cox        :       Clean up error flag in accept
  32 *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33 *                                      was buggy. Put a remove_sock() in the handler
  34 *                                      for memory when we hit 0. Also altered the timer
  35 *                                      code. The ACK stuff can wait and needs major
  36 *                                      TCP layer surgery.
  37 *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38 *                                      and fixed timer/inet_bh race.
  39 *              Alan Cox        :       Added zapped flag for TCP
  40 *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41 *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42 *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43 *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44 *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45 *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46 *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47 *      Pauline Middelink       :       identd support
  48 *              Alan Cox        :       Fixed connect() taking signals I think.
  49 *              Alan Cox        :       SO_LINGER supported
  50 *              Alan Cox        :       Error reporting fixes
  51 *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52 *              Alan Cox        :       inet sockets don't set sk->type!
  53 *              Alan Cox        :       Split socket option code
  54 *              Alan Cox        :       Callbacks
  55 *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56 *              Alex            :       Removed restriction on inet fioctl
  57 *              Alan Cox        :       Splitting INET from NET core
  58 *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59 *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60 *              Alan Cox        :       Split IP from generic code
  61 *              Alan Cox        :       New kfree_skbmem()
  62 *              Alan Cox        :       Make SO_DEBUG superuser only.
  63 *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64 *                                      (compatibility fix)
  65 *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66 *              Alan Cox        :       Allocator for a socket is settable.
  67 *              Alan Cox        :       SO_ERROR includes soft errors.
  68 *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69 *              Alan Cox        :       Generic socket allocation to make hooks
  70 *                                      easier (suggested by Craig Metz).
  71 *              Michael Pall    :       SO_ERROR returns positive errno again
  72 *              Steve Whitehouse:       Added default destructor to free
  73 *                                      protocol private data.
  74 *              Steve Whitehouse:       Added various other default routines
  75 *                                      common to several socket families.
  76 *              Chris Evans     :       Call suser() check last on F_SETOWN
  77 *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78 *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79 *              Andi Kleen      :       Fix write_space callback
  80 *              Chris Evans     :       Security fixes - signedness again
  81 *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82 *
  83 * To Fix:
  84 */
  85
  86#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  87
  88#include <asm/unaligned.h>
  89#include <linux/capability.h>
  90#include <linux/errno.h>
  91#include <linux/errqueue.h>
  92#include <linux/types.h>
  93#include <linux/socket.h>
  94#include <linux/in.h>
  95#include <linux/kernel.h>
  96#include <linux/module.h>
  97#include <linux/proc_fs.h>
  98#include <linux/seq_file.h>
  99#include <linux/sched.h>
 100#include <linux/sched/mm.h>
 101#include <linux/timer.h>
 102#include <linux/string.h>
 103#include <linux/sockios.h>
 104#include <linux/net.h>
 105#include <linux/mm.h>
 106#include <linux/slab.h>
 107#include <linux/interrupt.h>
 108#include <linux/poll.h>
 109#include <linux/tcp.h>
 110#include <linux/init.h>
 111#include <linux/highmem.h>
 112#include <linux/user_namespace.h>
 113#include <linux/static_key.h>
 114#include <linux/memcontrol.h>
 115#include <linux/prefetch.h>
 116#include <linux/compat.h>
 117#include <linux/mroute.h>
 118#include <linux/mroute6.h>
 119#include <linux/icmpv6.h>
 120
 121#include <linux/uaccess.h>
 122
 123#include <linux/netdevice.h>
 124#include <net/protocol.h>
 125#include <linux/skbuff.h>
 126#include <net/net_namespace.h>
 127#include <net/request_sock.h>
 128#include <net/sock.h>
 129#include <linux/net_tstamp.h>
 130#include <net/xfrm.h>
 131#include <linux/ipsec.h>
 132#include <net/cls_cgroup.h>
 133#include <net/netprio_cgroup.h>
 134#include <linux/sock_diag.h>
 135
 136#include <linux/filter.h>
 137#include <net/sock_reuseport.h>
 138#include <net/bpf_sk_storage.h>
 139
 140#include <trace/events/sock.h>
 141
 142#include <net/tcp.h>
 143#include <net/busy_poll.h>
 144#include <net/phonet/phonet.h>
 145
 146#include <linux/ethtool.h>
 147
 148#include "dev.h"
 149
 150static DEFINE_MUTEX(proto_list_mutex);
 151static LIST_HEAD(proto_list);
 152
 153static void sock_def_write_space_wfree(struct sock *sk);
 154static void sock_def_write_space(struct sock *sk);
 155
 156/**
 157 * sk_ns_capable - General socket capability test
 158 * @sk: Socket to use a capability on or through
 159 * @user_ns: The user namespace of the capability to use
 160 * @cap: The capability to use
 161 *
 162 * Test to see if the opener of the socket had the capability @cap
 163 * when the socket was created and the current process has the
 164 * capability @cap in the user namespace @user_ns.
 165 */
 166bool sk_ns_capable(const struct sock *sk,
 167                   struct user_namespace *user_ns, int cap)
 168{
 169        return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 170                ns_capable(user_ns, cap);
 171}
 172EXPORT_SYMBOL(sk_ns_capable);
 173
 174/**
 175 * sk_capable - Socket global capability test
 176 * @sk: Socket to use a capability on or through
 177 * @cap: The global capability to use
 178 *
 179 * Test to see if the opener of the socket had the capability @cap
 180 * when the socket was created and the current process has the
 181 * capability @cap in all user namespaces.
 182 */
 183bool sk_capable(const struct sock *sk, int cap)
 184{
 185        return sk_ns_capable(sk, &init_user_ns, cap);
 186}
 187EXPORT_SYMBOL(sk_capable);
 188
 189/**
 190 * sk_net_capable - Network namespace socket capability test
 191 * @sk: Socket to use a capability on or through
 192 * @cap: The capability to use
 193 *
 194 * Test to see if the opener of the socket had the capability @cap when the
 195 * socket was created and the current process has the capability @cap over
 196 * the network namespace the socket is a member of.
 197 */
 198bool sk_net_capable(const struct sock *sk, int cap)
 199{
 200        return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 201}
 202EXPORT_SYMBOL(sk_net_capable);
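/* Editorial note, illustrative sketch only (not part of the original file):
 * a protocol's setsockopt() handler might gate a privileged, hypothetical
 * option with the helpers above roughly like this:
 *
 *	static int proto_set_priv_opt(struct sock *sk, int val)
 *	{
 *		if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *			return -EPERM;
 *		WRITE_ONCE(sk->sk_priority, val);
 *		return 0;
 *	}
 */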
 203
 204/*
 205 * Each address family might have different locking rules, so we have
 206 * one slock key per address family and separate keys for internal and
 207 * userspace sockets.
 208 */
 209static struct lock_class_key af_family_keys[AF_MAX];
 210static struct lock_class_key af_family_kern_keys[AF_MAX];
 211static struct lock_class_key af_family_slock_keys[AF_MAX];
 212static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 213
 214/*
 215 * Make lock validator output more readable. (We pre-construct these
 216 * strings at build time, so that runtime initialization of socket
 217 * locks is fast):
 218 */
 219
 220#define _sock_locks(x)                                            \
 221  x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
 222  x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
 223  x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
 224  x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
 225  x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
 226  x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
 227  x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
 228  x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
 229  x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
 230  x "27"       ,        x "28"          ,       x "AF_CAN"      , \
 231  x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "IUCV"        , \
 232  x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
 233  x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
 234  x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
 235  x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_XDP"      , \
 236  x "AF_MCTP"  , \
 237  x "AF_MAX"
 238
 239static const char *const af_family_key_strings[AF_MAX+1] = {
 240        _sock_locks("sk_lock-")
 241};
 242static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 243        _sock_locks("slock-")
 244};
 245static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 246        _sock_locks("clock-")
 247};
 248
 249static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 250        _sock_locks("k-sk_lock-")
 251};
 252static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 253        _sock_locks("k-slock-")
 254};
 255static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 256        _sock_locks("k-clock-")
 257};
 258static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 259        _sock_locks("rlock-")
 260};
 261static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 262        _sock_locks("wlock-")
 263};
 264static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 265        _sock_locks("elock-")
 266};
 267
 268/*
 269 * sk_callback_lock and sk queues locking rules are per-address-family,
 270 * so split the lock classes by using a per-AF key:
 271 */
 272static struct lock_class_key af_callback_keys[AF_MAX];
 273static struct lock_class_key af_rlock_keys[AF_MAX];
 274static struct lock_class_key af_wlock_keys[AF_MAX];
 275static struct lock_class_key af_elock_keys[AF_MAX];
 276static struct lock_class_key af_kern_callback_keys[AF_MAX];
 277
 278/* Run time adjustable parameters. */
 279__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 280EXPORT_SYMBOL(sysctl_wmem_max);
 281__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 282EXPORT_SYMBOL(sysctl_rmem_max);
 283__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 284__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 285
 286/* Maximal space eaten by iovec or ancillary data plus some space */
 287int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 288EXPORT_SYMBOL(sysctl_optmem_max);
 289
 290int sysctl_tstamp_allow_data __read_mostly = 1;
 291
 292DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
 293EXPORT_SYMBOL_GPL(memalloc_socks_key);
 294
 295/**
 296 * sk_set_memalloc - sets %SOCK_MEMALLOC
 297 * @sk: socket to set it on
 298 *
 299 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 300 * It's the responsibility of the admin to adjust min_free_kbytes
 301 * to meet the requirements.
 302 */
 303void sk_set_memalloc(struct sock *sk)
 304{
 305        sock_set_flag(sk, SOCK_MEMALLOC);
 306        sk->sk_allocation |= __GFP_MEMALLOC;
 307        static_branch_inc(&memalloc_socks_key);
 308}
 309EXPORT_SYMBOL_GPL(sk_set_memalloc);
 310
 311void sk_clear_memalloc(struct sock *sk)
 312{
 313        sock_reset_flag(sk, SOCK_MEMALLOC);
 314        sk->sk_allocation &= ~__GFP_MEMALLOC;
 315        static_branch_dec(&memalloc_socks_key);
 316
 317        /*
 318         * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 319         * progress of swapping. SOCK_MEMALLOC may be cleared while
 320         * it has rmem allocations due to the last swapfile being deactivated
 321         * but there is a risk that the socket is unusable due to exceeding
 322         * the rmem limits. Reclaim the reserves and obey rmem limits again.
 323         */
 324        sk_mem_reclaim(sk);
 325}
 326EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 327
 328int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 329{
 330        int ret;
 331        unsigned int noreclaim_flag;
 332
 333        /* these should have been dropped before queueing */
 334        BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 335
 336        noreclaim_flag = memalloc_noreclaim_save();
 337        ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
 338                                 tcp_v6_do_rcv,
 339                                 tcp_v4_do_rcv,
 340                                 sk, skb);
 341        memalloc_noreclaim_restore(noreclaim_flag);
 342
 343        return ret;
 344}
 345EXPORT_SYMBOL(__sk_backlog_rcv);
 346
 347void sk_error_report(struct sock *sk)
 348{
 349        sk->sk_error_report(sk);
 350
 351        switch (sk->sk_family) {
 352        case AF_INET:
 353                fallthrough;
 354        case AF_INET6:
 355                trace_inet_sk_error_report(sk);
 356                break;
 357        default:
 358                break;
 359        }
 360}
 361EXPORT_SYMBOL(sk_error_report);
 362
 363int sock_get_timeout(long timeo, void *optval, bool old_timeval)
 364{
 365        struct __kernel_sock_timeval tv;
 366
 367        if (timeo == MAX_SCHEDULE_TIMEOUT) {
 368                tv.tv_sec = 0;
 369                tv.tv_usec = 0;
 370        } else {
 371                tv.tv_sec = timeo / HZ;
 372                tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
 373        }
 374
 375        if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 376                struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
 377                *(struct old_timeval32 *)optval = tv32;
 378                return sizeof(tv32);
 379        }
 380
 381        if (old_timeval) {
 382                struct __kernel_old_timeval old_tv;
 383                old_tv.tv_sec = tv.tv_sec;
 384                old_tv.tv_usec = tv.tv_usec;
 385                *(struct __kernel_old_timeval *)optval = old_tv;
 386                return sizeof(old_tv);
 387        }
 388
 389        *(struct __kernel_sock_timeval *)optval = tv;
 390        return sizeof(tv);
 391}
 392EXPORT_SYMBOL(sock_get_timeout);
 393
 394int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
 395                           sockptr_t optval, int optlen, bool old_timeval)
 396{
 397        if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 398                struct old_timeval32 tv32;
 399
 400                if (optlen < sizeof(tv32))
 401                        return -EINVAL;
 402
 403                if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
 404                        return -EFAULT;
 405                tv->tv_sec = tv32.tv_sec;
 406                tv->tv_usec = tv32.tv_usec;
 407        } else if (old_timeval) {
 408                struct __kernel_old_timeval old_tv;
 409
 410                if (optlen < sizeof(old_tv))
 411                        return -EINVAL;
 412                if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
 413                        return -EFAULT;
 414                tv->tv_sec = old_tv.tv_sec;
 415                tv->tv_usec = old_tv.tv_usec;
 416        } else {
 417                if (optlen < sizeof(*tv))
 418                        return -EINVAL;
 419                if (copy_from_sockptr(tv, optval, sizeof(*tv)))
 420                        return -EFAULT;
 421        }
 422
 423        return 0;
 424}
 425EXPORT_SYMBOL(sock_copy_user_timeval);
 426
 427static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
 428                            bool old_timeval)
 429{
 430        struct __kernel_sock_timeval tv;
 431        int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
 432        long val;
 433
 434        if (err)
 435                return err;
 436
 437        if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 438                return -EDOM;
 439
 440        if (tv.tv_sec < 0) {
 441                static int warned __read_mostly;
 442
 443                WRITE_ONCE(*timeo_p, 0);
 444                if (warned < 10 && net_ratelimit()) {
 445                        warned++;
 446                        pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 447                                __func__, current->comm, task_pid_nr(current));
 448                }
 449                return 0;
 450        }
 451        val = MAX_SCHEDULE_TIMEOUT;
 452        if ((tv.tv_sec || tv.tv_usec) &&
 453            (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
 454                val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
 455                                                    USEC_PER_SEC / HZ);
 456        WRITE_ONCE(*timeo_p, val);
 457        return 0;
 458}
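/* Editorial note (not part of the original file): a worked example of the
 * conversion above, assuming HZ == 1000. A user timeout of
 * { .tv_sec = 2, .tv_usec = 500 } becomes
 *
 *	val = 2 * HZ + DIV_ROUND_UP(500, USEC_PER_SEC / HZ)
 *	    = 2000 + DIV_ROUND_UP(500, 1000)
 *	    = 2001 jiffies,
 *
 * i.e. sub-tick microseconds round up to a whole jiffy, while a timeout of
 * { 0, 0 } is stored as MAX_SCHEDULE_TIMEOUT ("wait forever").
 */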
 459
 460static bool sock_needs_netstamp(const struct sock *sk)
 461{
 462        switch (sk->sk_family) {
 463        case AF_UNSPEC:
 464        case AF_UNIX:
 465                return false;
 466        default:
 467                return true;
 468        }
 469}
 470
 471static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 472{
 473        if (sk->sk_flags & flags) {
 474                sk->sk_flags &= ~flags;
 475                if (sock_needs_netstamp(sk) &&
 476                    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 477                        net_disable_timestamp();
 478        }
 479}
 480
 481
 482int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 483{
 484        unsigned long flags;
 485        struct sk_buff_head *list = &sk->sk_receive_queue;
 486
 487        if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 488                atomic_inc(&sk->sk_drops);
 489                trace_sock_rcvqueue_full(sk, skb);
 490                return -ENOMEM;
 491        }
 492
 493        if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 494                atomic_inc(&sk->sk_drops);
 495                return -ENOBUFS;
 496        }
 497
 498        skb->dev = NULL;
 499        skb_set_owner_r(skb, sk);
 500
 501        /* We escape from the RCU-protected region; make sure we don't
 502         * leak a non-refcounted dst.
 503         */
 504        skb_dst_force(skb);
 505
 506        spin_lock_irqsave(&list->lock, flags);
 507        sock_skb_set_dropcount(sk, skb);
 508        __skb_queue_tail(list, skb);
 509        spin_unlock_irqrestore(&list->lock, flags);
 510
 511        if (!sock_flag(sk, SOCK_DEAD))
 512                sk->sk_data_ready(sk);
 513        return 0;
 514}
 515EXPORT_SYMBOL(__sock_queue_rcv_skb);
 516
 517int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
 518                              enum skb_drop_reason *reason)
 519{
 520        enum skb_drop_reason drop_reason;
 521        int err;
 522
 523        err = sk_filter(sk, skb);
 524        if (err) {
 525                drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
 526                goto out;
 527        }
 528        err = __sock_queue_rcv_skb(sk, skb);
 529        switch (err) {
 530        case -ENOMEM:
 531                drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
 532                break;
 533        case -ENOBUFS:
 534                drop_reason = SKB_DROP_REASON_PROTO_MEM;
 535                break;
 536        default:
 537                drop_reason = SKB_NOT_DROPPED_YET;
 538                break;
 539        }
 540out:
 541        if (reason)
 542                *reason = drop_reason;
 543        return err;
 544}
 545EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
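/* Editorial note, illustrative sketch only (not part of the original file):
 * a typical caller passes the returned drop reason straight to
 * kfree_skb_reason() so the drop shows up in tracing ("sk" and "skb" are
 * assumed to be in scope):
 *
 *	enum skb_drop_reason reason;
 *
 *	if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0)
 *		kfree_skb_reason(skb, reason);
 */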
 546
 547int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 548                     const int nested, unsigned int trim_cap, bool refcounted)
 549{
 550        int rc = NET_RX_SUCCESS;
 551
 552        if (sk_filter_trim_cap(sk, skb, trim_cap))
 553                goto discard_and_relse;
 554
 555        skb->dev = NULL;
 556
 557        if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 558                atomic_inc(&sk->sk_drops);
 559                goto discard_and_relse;
 560        }
 561        if (nested)
 562                bh_lock_sock_nested(sk);
 563        else
 564                bh_lock_sock(sk);
 565        if (!sock_owned_by_user(sk)) {
 566                /*
 567                 * trylock + unlock semantics:
 568                 */
 569                mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 570
 571                rc = sk_backlog_rcv(sk, skb);
 572
 573                mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
 574        } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
 575                bh_unlock_sock(sk);
 576                atomic_inc(&sk->sk_drops);
 577                goto discard_and_relse;
 578        }
 579
 580        bh_unlock_sock(sk);
 581out:
 582        if (refcounted)
 583                sock_put(sk);
 584        return rc;
 585discard_and_relse:
 586        kfree_skb(skb);
 587        goto out;
 588}
 589EXPORT_SYMBOL(__sk_receive_skb);
 590
 591INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
 592                                                          u32));
 593INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
 594                                                           u32));
 595struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 596{
 597        struct dst_entry *dst = __sk_dst_get(sk);
 598
 599        if (dst && dst->obsolete &&
 600            INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 601                               dst, cookie) == NULL) {
 602                sk_tx_queue_clear(sk);
 603                WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
 604                RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 605                dst_release(dst);
 606                return NULL;
 607        }
 608
 609        return dst;
 610}
 611EXPORT_SYMBOL(__sk_dst_check);
 612
 613struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 614{
 615        struct dst_entry *dst = sk_dst_get(sk);
 616
 617        if (dst && dst->obsolete &&
 618            INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 619                               dst, cookie) == NULL) {
 620                sk_dst_reset(sk);
 621                dst_release(dst);
 622                return NULL;
 623        }
 624
 625        return dst;
 626}
 627EXPORT_SYMBOL(sk_dst_check);
 628
 629static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
 630{
 631        int ret = -ENOPROTOOPT;
 632#ifdef CONFIG_NETDEVICES
 633        struct net *net = sock_net(sk);
 634
 635        /* Sorry... */
 636        ret = -EPERM;
 637        if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
 638                goto out;
 639
 640        ret = -EINVAL;
 641        if (ifindex < 0)
 642                goto out;
 643
 644        /* Paired with all READ_ONCE() done locklessly. */
 645        WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
 646
 647        if (sk->sk_prot->rehash)
 648                sk->sk_prot->rehash(sk);
 649        sk_dst_reset(sk);
 650
 651        ret = 0;
 652
 653out:
 654#endif
 655
 656        return ret;
 657}
 658
 659int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
 660{
 661        int ret;
 662
 663        if (lock_sk)
 664                lock_sock(sk);
 665        ret = sock_bindtoindex_locked(sk, ifindex);
 666        if (lock_sk)
 667                release_sock(sk);
 668
 669        return ret;
 670}
 671EXPORT_SYMBOL(sock_bindtoindex);
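/* Editorial note, illustrative sketch only (not part of the original file):
 * an in-kernel caller that already knows an interface index can bind a
 * socket to it and let the helper take the socket lock itself ("sk" and
 * "ifindex" are assumed):
 *
 *	err = sock_bindtoindex(sk, ifindex, true);
 *	if (err)
 *		return err;
 */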
 672
 673static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
 674{
 675        int ret = -ENOPROTOOPT;
 676#ifdef CONFIG_NETDEVICES
 677        struct net *net = sock_net(sk);
 678        char devname[IFNAMSIZ];
 679        int index;
 680
 681        ret = -EINVAL;
 682        if (optlen < 0)
 683                goto out;
 684
 685        /* Bind this socket to a particular device like "eth0",
 686         * as specified in the passed interface name. If the
 687         * name is "" or the option length is zero the socket
 688         * is not bound.
 689         */
 690        if (optlen > IFNAMSIZ - 1)
 691                optlen = IFNAMSIZ - 1;
 692        memset(devname, 0, sizeof(devname));
 693
 694        ret = -EFAULT;
 695        if (copy_from_sockptr(devname, optval, optlen))
 696                goto out;
 697
 698        index = 0;
 699        if (devname[0] != '\0') {
 700                struct net_device *dev;
 701
 702                rcu_read_lock();
 703                dev = dev_get_by_name_rcu(net, devname);
 704                if (dev)
 705                        index = dev->ifindex;
 706                rcu_read_unlock();
 707                ret = -ENODEV;
 708                if (!dev)
 709                        goto out;
 710        }
 711
 712        sockopt_lock_sock(sk);
 713        ret = sock_bindtoindex_locked(sk, index);
 714        sockopt_release_sock(sk);
 715out:
 716#endif
 717
 718        return ret;
 719}
 720
 721static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
 722                                sockptr_t optlen, int len)
 723{
 724        int ret = -ENOPROTOOPT;
 725#ifdef CONFIG_NETDEVICES
 726        int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
 727        struct net *net = sock_net(sk);
 728        char devname[IFNAMSIZ];
 729
 730        if (bound_dev_if == 0) {
 731                len = 0;
 732                goto zero;
 733        }
 734
 735        ret = -EINVAL;
 736        if (len < IFNAMSIZ)
 737                goto out;
 738
 739        ret = netdev_get_name(net, devname, bound_dev_if);
 740        if (ret)
 741                goto out;
 742
 743        len = strlen(devname) + 1;
 744
 745        ret = -EFAULT;
 746        if (copy_to_sockptr(optval, devname, len))
 747                goto out;
 748
 749zero:
 750        ret = -EFAULT;
 751        if (copy_to_sockptr(optlen, &len, sizeof(int)))
 752                goto out;
 753
 754        ret = 0;
 755
 756out:
 757#endif
 758
 759        return ret;
 760}
 761
 762bool sk_mc_loop(const struct sock *sk)
 763{
 764        if (dev_recursion_level())
 765                return false;
 766        if (!sk)
 767                return true;
 768        /* IPV6_ADDRFORM can change sk->sk_family under us. */
 769        switch (READ_ONCE(sk->sk_family)) {
 770        case AF_INET:
 771                return inet_test_bit(MC_LOOP, sk);
 772#if IS_ENABLED(CONFIG_IPV6)
 773        case AF_INET6:
 774                return inet6_test_bit(MC6_LOOP, sk);
 775#endif
 776        }
 777        WARN_ON_ONCE(1);
 778        return true;
 779}
 780EXPORT_SYMBOL(sk_mc_loop);
 781
 782void sock_set_reuseaddr(struct sock *sk)
 783{
 784        lock_sock(sk);
 785        sk->sk_reuse = SK_CAN_REUSE;
 786        release_sock(sk);
 787}
 788EXPORT_SYMBOL(sock_set_reuseaddr);
 789
 790void sock_set_reuseport(struct sock *sk)
 791{
 792        lock_sock(sk);
 793        sk->sk_reuseport = true;
 794        release_sock(sk);
 795}
 796EXPORT_SYMBOL(sock_set_reuseport);
 797
 798void sock_no_linger(struct sock *sk)
 799{
 800        lock_sock(sk);
 801        WRITE_ONCE(sk->sk_lingertime, 0);
 802        sock_set_flag(sk, SOCK_LINGER);
 803        release_sock(sk);
 804}
 805EXPORT_SYMBOL(sock_no_linger);
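/* Editorial note, illustrative sketch only (not part of the original file):
 * kernel users that create their own sockets, e.g. with sock_create_kern(),
 * can tune them with the small helpers above instead of open-coding flag
 * updates ("sock" is an assumed struct socket pointer):
 *
 *	sock_set_reuseaddr(sock->sk);
 *	sock_no_linger(sock->sk);
 */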
 806
 807void sock_set_priority(struct sock *sk, u32 priority)
 808{
 809        WRITE_ONCE(sk->sk_priority, priority);
 810}
 811EXPORT_SYMBOL(sock_set_priority);
 812
 813void sock_set_sndtimeo(struct sock *sk, s64 secs)
 814{
 815        lock_sock(sk);
 816        if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
 817                WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
 818        else
 819                WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
 820        release_sock(sk);
 821}
 822EXPORT_SYMBOL(sock_set_sndtimeo);
 823
 824static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
 825{
 826        if (val)  {
 827                sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
 828                sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
 829                sock_set_flag(sk, SOCK_RCVTSTAMP);
 830                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 831        } else {
 832                sock_reset_flag(sk, SOCK_RCVTSTAMP);
 833                sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 834        }
 835}
 836
 837void sock_enable_timestamps(struct sock *sk)
 838{
 839        lock_sock(sk);
 840        __sock_set_timestamps(sk, true, false, true);
 841        release_sock(sk);
 842}
 843EXPORT_SYMBOL(sock_enable_timestamps);
 844
 845void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
 846{
 847        switch (optname) {
 848        case SO_TIMESTAMP_OLD:
 849                __sock_set_timestamps(sk, valbool, false, false);
 850                break;
 851        case SO_TIMESTAMP_NEW:
 852                __sock_set_timestamps(sk, valbool, true, false);
 853                break;
 854        case SO_TIMESTAMPNS_OLD:
 855                __sock_set_timestamps(sk, valbool, false, true);
 856                break;
 857        case SO_TIMESTAMPNS_NEW:
 858                __sock_set_timestamps(sk, valbool, true, true);
 859                break;
 860        }
 861}
 862
 863static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
 864{
 865        struct net *net = sock_net(sk);
 866        struct net_device *dev = NULL;
 867        bool match = false;
 868        int *vclock_index;
 869        int i, num;
 870
 871        if (sk->sk_bound_dev_if)
 872                dev = dev_get_by_index(net, sk->sk_bound_dev_if);
 873
 874        if (!dev) {
 875                pr_err("%s: socket is not bound to a device\n", __func__);
 876                return -EOPNOTSUPP;
 877        }
 878
 879        num = ethtool_get_phc_vclocks(dev, &vclock_index);
 880        dev_put(dev);
 881
 882        for (i = 0; i < num; i++) {
 883                if (*(vclock_index + i) == phc_index) {
 884                        match = true;
 885                        break;
 886                }
 887        }
 888
 889        if (num > 0)
 890                kfree(vclock_index);
 891
 892        if (!match)
 893                return -EINVAL;
 894
 895        WRITE_ONCE(sk->sk_bind_phc, phc_index);
 896
 897        return 0;
 898}
 899
 900int sock_set_timestamping(struct sock *sk, int optname,
 901                          struct so_timestamping timestamping)
 902{
 903        int val = timestamping.flags;
 904        int ret;
 905
 906        if (val & ~SOF_TIMESTAMPING_MASK)
 907                return -EINVAL;
 908
 909        if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
 910            !(val & SOF_TIMESTAMPING_OPT_ID))
 911                return -EINVAL;
 912
 913        if (val & SOF_TIMESTAMPING_OPT_ID &&
 914            !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 915                if (sk_is_tcp(sk)) {
 916                        if ((1 << sk->sk_state) &
 917                            (TCPF_CLOSE | TCPF_LISTEN))
 918                                return -EINVAL;
 919                        if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
 920                                atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
 921                        else
 922                                atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
 923                } else {
 924                        atomic_set(&sk->sk_tskey, 0);
 925                }
 926        }
 927
 928        if (val & SOF_TIMESTAMPING_OPT_STATS &&
 929            !(val & SOF_TIMESTAMPING_OPT_TSONLY))
 930                return -EINVAL;
 931
 932        if (val & SOF_TIMESTAMPING_BIND_PHC) {
 933                ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
 934                if (ret)
 935                        return ret;
 936        }
 937
 938        WRITE_ONCE(sk->sk_tsflags, val);
 939        sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
 940
 941        if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 942                sock_enable_timestamp(sk,
 943                                      SOCK_TIMESTAMPING_RX_SOFTWARE);
 944        else
 945                sock_disable_timestamp(sk,
 946                                       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 947        return 0;
 948}
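/* Editorial note, illustrative userspace sketch only (not part of the
 * original file): the struct so_timestamping form handled above, including
 * PHC binding, could be requested roughly like this ("fd" and "phc_index"
 * are assumed):
 *
 *	struct so_timestamping ts = {
 *		.flags = SOF_TIMESTAMPING_TX_HARDWARE |
 *			 SOF_TIMESTAMPING_RX_HARDWARE |
 *			 SOF_TIMESTAMPING_RAW_HARDWARE |
 *			 SOF_TIMESTAMPING_BIND_PHC,
 *		.bind_phc = phc_index,
 *	};
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &ts, sizeof(ts)) < 0)
 *		perror("SO_TIMESTAMPING");
 */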
 949
 950void sock_set_keepalive(struct sock *sk)
 951{
 952        lock_sock(sk);
 953        if (sk->sk_prot->keepalive)
 954                sk->sk_prot->keepalive(sk, true);
 955        sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
 956        release_sock(sk);
 957}
 958EXPORT_SYMBOL(sock_set_keepalive);
 959
 960static void __sock_set_rcvbuf(struct sock *sk, int val)
 961{
 962        /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
 963         * as a negative value.
 964         */
 965        val = min_t(int, val, INT_MAX / 2);
 966        sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 967
 968        /* We double it on the way in to account for "struct sk_buff" etc.
 969         * overhead.   Applications assume that the SO_RCVBUF setting they make
 970         * will allow that much actual data to be received on that socket.
 971         *
 972         * Applications are unaware that "struct sk_buff" and other overheads
 973         * allocate from the receive buffer during socket buffer allocation.
 974         *
 975         * And after considering the possible alternatives, returning the value
 976         * we actually used in getsockopt is the most desirable behavior.
 977         */
 978        WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
 979}
 980
 981void sock_set_rcvbuf(struct sock *sk, int val)
 982{
 983        lock_sock(sk);
 984        __sock_set_rcvbuf(sk, val);
 985        release_sock(sk);
 986}
 987EXPORT_SYMBOL(sock_set_rcvbuf);
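/* Editorial note (not part of the original file): because of the doubling
 * described in __sock_set_rcvbuf() above, a userspace request of 64 KiB
 * (below the default sysctl_rmem_max) ends up as a 128 KiB sk_rcvbuf, and
 * getsockopt() reports the doubled value ("fd" is assumed):
 *
 *	int val = 65536;
 *	socklen_t len = sizeof(val);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, &len);
 *
 * after which val reads back as 131072.
 */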
 988
 989static void __sock_set_mark(struct sock *sk, u32 val)
 990{
 991        if (val != sk->sk_mark) {
 992                WRITE_ONCE(sk->sk_mark, val);
 993                sk_dst_reset(sk);
 994        }
 995}
 996
 997void sock_set_mark(struct sock *sk, u32 val)
 998{
 999        lock_sock(sk);
1000        __sock_set_mark(sk, val);
1001        release_sock(sk);
1002}
1003EXPORT_SYMBOL(sock_set_mark);
1004
1005static void sock_release_reserved_memory(struct sock *sk, int bytes)
1006{
1007        /* Round down bytes to multiple of pages */
1008        bytes = round_down(bytes, PAGE_SIZE);
1009
1010        WARN_ON(bytes > sk->sk_reserved_mem);
1011        WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1012        sk_mem_reclaim(sk);
1013}
1014
1015static int sock_reserve_memory(struct sock *sk, int bytes)
1016{
1017        long allocated;
1018        bool charged;
1019        int pages;
1020
1021        if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1022                return -EOPNOTSUPP;
1023
1024        if (!bytes)
1025                return 0;
1026
1027        pages = sk_mem_pages(bytes);
1028
1029        /* pre-charge to memcg */
1030        charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1031                                          GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1032        if (!charged)
1033                return -ENOMEM;
1034
1035        /* pre-charge to forward_alloc */
1036        sk_memory_allocated_add(sk, pages);
1037        allocated = sk_memory_allocated(sk);
1038        /* If the system goes into memory pressure with this
1039         * precharge, give up and return an error.
1040         */
1041        if (allocated > sk_prot_mem_limits(sk, 1)) {
1042                sk_memory_allocated_sub(sk, pages);
1043                mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1044                return -ENOMEM;
1045        }
1046        sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
1047
1048        WRITE_ONCE(sk->sk_reserved_mem,
1049                   sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1050
1051        return 0;
1052}
1053
1054void sockopt_lock_sock(struct sock *sk)
1055{
1056        /* When current->bpf_ctx is set, the setsockopt is called from
1057         * a bpf prog.  bpf has ensured the sk lock has been
1058         * acquired before calling setsockopt().
1059         */
1060        if (has_current_bpf_ctx())
1061                return;
1062
1063        lock_sock(sk);
1064}
1065EXPORT_SYMBOL(sockopt_lock_sock);
1066
1067void sockopt_release_sock(struct sock *sk)
1068{
1069        if (has_current_bpf_ctx())
1070                return;
1071
1072        release_sock(sk);
1073}
1074EXPORT_SYMBOL(sockopt_release_sock);
1075
1076bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1077{
1078        return has_current_bpf_ctx() || ns_capable(ns, cap);
1079}
1080EXPORT_SYMBOL(sockopt_ns_capable);
1081
1082bool sockopt_capable(int cap)
1083{
1084        return has_current_bpf_ctx() || capable(cap);
1085}
1086EXPORT_SYMBOL(sockopt_capable);
1087
1088/*
1089 *      This is meant for all protocols to use and covers goings on
1090 *      at the socket level. Everything here is generic.
1091 */
1092
1093int sk_setsockopt(struct sock *sk, int level, int optname,
1094                  sockptr_t optval, unsigned int optlen)
1095{
1096        struct so_timestamping timestamping;
1097        struct socket *sock = sk->sk_socket;
1098        struct sock_txtime sk_txtime;
1099        int val;
1100        int valbool;
1101        struct linger ling;
1102        int ret = 0;
1103
1104        /*
1105         *      Options without arguments
1106         */
1107
1108        if (optname == SO_BINDTODEVICE)
1109                return sock_setbindtodevice(sk, optval, optlen);
1110
1111        if (optlen < sizeof(int))
1112                return -EINVAL;
1113
1114        if (copy_from_sockptr(&val, optval, sizeof(val)))
1115                return -EFAULT;
1116
1117        valbool = val ? 1 : 0;
1118
1119        /* handle options which do not require locking the socket. */
1120        switch (optname) {
1121        case SO_PRIORITY:
1122                if ((val >= 0 && val <= 6) ||
1123                    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1124                    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1125                        sock_set_priority(sk, val);
1126                        return 0;
1127                }
1128                return -EPERM;
1129        case SO_PASSSEC:
1130                assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
1131                return 0;
1132        case SO_PASSCRED:
1133                assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
1134                return 0;
1135        case SO_PASSPIDFD:
1136                assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
1137                return 0;
1138        case SO_TYPE:
1139        case SO_PROTOCOL:
1140        case SO_DOMAIN:
1141        case SO_ERROR:
1142                return -ENOPROTOOPT;
1143#ifdef CONFIG_NET_RX_BUSY_POLL
1144        case SO_BUSY_POLL:
1145                if (val < 0)
1146                        return -EINVAL;
1147                WRITE_ONCE(sk->sk_ll_usec, val);
1148                return 0;
1149        case SO_PREFER_BUSY_POLL:
1150                if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1151                        return -EPERM;
1152                WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1153                return 0;
1154        case SO_BUSY_POLL_BUDGET:
1155                if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
1156                    !sockopt_capable(CAP_NET_ADMIN))
1157                        return -EPERM;
1158                if (val < 0 || val > U16_MAX)
1159                        return -EINVAL;
1160                WRITE_ONCE(sk->sk_busy_poll_budget, val);
1161                return 0;
1162#endif
1163        case SO_MAX_PACING_RATE:
1164                {
1165                unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1166                unsigned long pacing_rate;
1167
1168                if (sizeof(ulval) != sizeof(val) &&
1169                    optlen >= sizeof(ulval) &&
1170                    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1171                        return -EFAULT;
1172                }
1173                if (ulval != ~0UL)
1174                        cmpxchg(&sk->sk_pacing_status,
1175                                SK_PACING_NONE,
1176                                SK_PACING_NEEDED);
1177                /* Pairs with READ_ONCE() from sk_getsockopt() */
1178                WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1179                pacing_rate = READ_ONCE(sk->sk_pacing_rate);
1180                if (ulval < pacing_rate)
1181                        WRITE_ONCE(sk->sk_pacing_rate, ulval);
1182                return 0;
1183                }
1184        case SO_TXREHASH:
1185                if (val < -1 || val > 1)
1186                        return -EINVAL;
1187                if ((u8)val == SOCK_TXREHASH_DEFAULT)
1188                        val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1189                /* Paired with READ_ONCE() in tcp_rtx_synack()
1190                 * and sk_getsockopt().
1191                 */
1192                WRITE_ONCE(sk->sk_txrehash, (u8)val);
1193                return 0;
1194        }
1195
1196        sockopt_lock_sock(sk);
1197
1198        switch (optname) {
1199        case SO_DEBUG:
1200                if (val && !sockopt_capable(CAP_NET_ADMIN))
1201                        ret = -EACCES;
1202                else
1203                        sock_valbool_flag(sk, SOCK_DBG, valbool);
1204                break;
1205        case SO_REUSEADDR:
1206                sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1207                break;
1208        case SO_REUSEPORT:
1209                sk->sk_reuseport = valbool;
1210                break;
1211        case SO_DONTROUTE:
1212                sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1213                sk_dst_reset(sk);
1214                break;
1215        case SO_BROADCAST:
1216                sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1217                break;
1218        case SO_SNDBUF:
1219                /* Don't error on this; BSD doesn't, and if you think
1220                 * about it, this is right. Otherwise apps have to
1221                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1222                 * are treated in BSD as hints.
1223                 */
1224                val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1225set_sndbuf:
1226                /* Ensure val * 2 fits into an int, to prevent max_t()
1227                 * from treating it as a negative value.
1228                 */
1229                val = min_t(int, val, INT_MAX / 2);
1230                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1231                WRITE_ONCE(sk->sk_sndbuf,
1232                           max_t(int, val * 2, SOCK_MIN_SNDBUF));
1233                /* Wake up sending tasks if we upped the value. */
1234                sk->sk_write_space(sk);
1235                break;
1236
1237        case SO_SNDBUFFORCE:
1238                if (!sockopt_capable(CAP_NET_ADMIN)) {
1239                        ret = -EPERM;
1240                        break;
1241                }
1242
1243                /* No negative values (to prevent underflow, as val will be
1244                 * multiplied by 2).
1245                 */
1246                if (val < 0)
1247                        val = 0;
1248                goto set_sndbuf;
1249
1250        case SO_RCVBUF:
1251                /* Don't error on this; BSD doesn't, and if you think
1252                 * about it, this is right. Otherwise apps have to
1253                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1254                 * are treated in BSD as hints.
1255                 */
1256                __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1257                break;
1258
1259        case SO_RCVBUFFORCE:
1260                if (!sockopt_capable(CAP_NET_ADMIN)) {
1261                        ret = -EPERM;
1262                        break;
1263                }
1264
1265                /* No negative values (to prevent underflow, as val will be
1266                 * multiplied by 2).
1267                 */
1268                __sock_set_rcvbuf(sk, max(val, 0));
1269                break;
1270
1271        case SO_KEEPALIVE:
1272                if (sk->sk_prot->keepalive)
1273                        sk->sk_prot->keepalive(sk, valbool);
1274                sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1275                break;
1276
1277        case SO_OOBINLINE:
1278                sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1279                break;
1280
1281        case SO_NO_CHECK:
1282                sk->sk_no_check_tx = valbool;
1283                break;
1284
1285        case SO_LINGER:
1286                if (optlen < sizeof(ling)) {
1287                        ret = -EINVAL;  /* 1003.1g */
1288                        break;
1289                }
1290                if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1291                        ret = -EFAULT;
1292                        break;
1293                }
1294                if (!ling.l_onoff) {
1295                        sock_reset_flag(sk, SOCK_LINGER);
1296                } else {
1297                        unsigned long t_sec = ling.l_linger;
1298
1299                        if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
1300                                WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
1301                        else
1302                                WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
1303                        sock_set_flag(sk, SOCK_LINGER);
1304                }
1305                break;
1306
1307        case SO_BSDCOMPAT:
1308                break;
1309
1310        case SO_TIMESTAMP_OLD:
1311        case SO_TIMESTAMP_NEW:
1312        case SO_TIMESTAMPNS_OLD:
1313        case SO_TIMESTAMPNS_NEW:
1314                sock_set_timestamp(sk, optname, valbool);
1315                break;
1316
1317        case SO_TIMESTAMPING_NEW:
1318        case SO_TIMESTAMPING_OLD:
1319                if (optlen == sizeof(timestamping)) {
1320                        if (copy_from_sockptr(&timestamping, optval,
1321                                              sizeof(timestamping))) {
1322                                ret = -EFAULT;
1323                                break;
1324                        }
1325                } else {
1326                        memset(&timestamping, 0, sizeof(timestamping));
1327                        timestamping.flags = val;
1328                }
1329                ret = sock_set_timestamping(sk, optname, timestamping);
1330                break;
1331
1332        case SO_RCVLOWAT:
1333                {
1334                int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
1335
1336                if (val < 0)
1337                        val = INT_MAX;
1338                if (sock)
1339                        set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
1340                if (set_rcvlowat)
1341                        ret = set_rcvlowat(sk, val);
1342                else
1343                        WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1344                break;
1345                }
1346        case SO_RCVTIMEO_OLD:
1347        case SO_RCVTIMEO_NEW:
1348                ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1349                                       optlen, optname == SO_RCVTIMEO_OLD);
1350                break;
1351
1352        case SO_SNDTIMEO_OLD:
1353        case SO_SNDTIMEO_NEW:
1354                ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1355                                       optlen, optname == SO_SNDTIMEO_OLD);
1356                break;
1357
1358        case SO_ATTACH_FILTER: {
1359                struct sock_fprog fprog;
1360
1361                ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1362                if (!ret)
1363                        ret = sk_attach_filter(&fprog, sk);
1364                break;
1365        }
1366        case SO_ATTACH_BPF:
1367                ret = -EINVAL;
1368                if (optlen == sizeof(u32)) {
1369                        u32 ufd;
1370
1371                        ret = -EFAULT;
1372                        if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1373                                break;
1374
1375                        ret = sk_attach_bpf(ufd, sk);
1376                }
1377                break;
1378
1379        case SO_ATTACH_REUSEPORT_CBPF: {
1380                struct sock_fprog fprog;
1381
1382                ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1383                if (!ret)
1384                        ret = sk_reuseport_attach_filter(&fprog, sk);
1385                break;
1386        }
1387        case SO_ATTACH_REUSEPORT_EBPF:
1388                ret = -EINVAL;
1389                if (optlen == sizeof(u32)) {
1390                        u32 ufd;
1391
1392                        ret = -EFAULT;
1393                        if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1394                                break;
1395
1396                        ret = sk_reuseport_attach_bpf(ufd, sk);
1397                }
1398                break;
1399
1400        case SO_DETACH_REUSEPORT_BPF:
1401                ret = reuseport_detach_prog(sk);
1402                break;
1403
1404        case SO_DETACH_FILTER:
1405                ret = sk_detach_filter(sk);
1406                break;
1407
1408        case SO_LOCK_FILTER:
1409                if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1410                        ret = -EPERM;
1411                else
1412                        sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1413                break;
1414
1415        case SO_MARK:
1416                if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1417                    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1418                        ret = -EPERM;
1419                        break;
1420                }
1421
1422                __sock_set_mark(sk, val);
1423                break;
1424        case SO_RCVMARK:
1425                sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1426                break;
1427
1428        case SO_RXQ_OVFL:
1429                sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1430                break;
1431
1432        case SO_WIFI_STATUS:
1433                sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1434                break;
1435
1436        case SO_PEEK_OFF:
1437                {
1438                int (*set_peek_off)(struct sock *sk, int val);
1439
1440                set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
1441                if (set_peek_off)
1442                        ret = set_peek_off(sk, val);
1443                else
1444                        ret = -EOPNOTSUPP;
1445                break;
1446                }
1447
1448        case SO_NOFCS:
1449                sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1450                break;
1451
1452        case SO_SELECT_ERR_QUEUE:
1453                sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1454                break;
1455
1456
1457        case SO_INCOMING_CPU:
1458                reuseport_update_incoming_cpu(sk, val);
1459                break;
1460
1461        case SO_CNX_ADVICE:
1462                if (val == 1)
1463                        dst_negative_advice(sk);
1464                break;
1465
1466        case SO_ZEROCOPY:
1467                if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1468                        if (!(sk_is_tcp(sk) ||
1469                              (sk->sk_type == SOCK_DGRAM &&
1470                               sk->sk_protocol == IPPROTO_UDP)))
1471                                ret = -EOPNOTSUPP;
1472                } else if (sk->sk_family != PF_RDS) {
1473                        ret = -EOPNOTSUPP;
1474                }
1475                if (!ret) {
1476                        if (val < 0 || val > 1)
1477                                ret = -EINVAL;
1478                        else
1479                                sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1480                }
1481                break;
1482
1483        case SO_TXTIME:
1484                if (optlen != sizeof(struct sock_txtime)) {
1485                        ret = -EINVAL;
1486                        break;
1487                } else if (copy_from_sockptr(&sk_txtime, optval,
1488                           sizeof(struct sock_txtime))) {
1489                        ret = -EFAULT;
1490                        break;
1491                } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1492                        ret = -EINVAL;
1493                        break;
1494                }
1495                /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1496                 * scheduler has enough safeguards.
1497                 */
1498                if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1499                    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1500                        ret = -EPERM;
1501                        break;
1502                }
1503                sock_valbool_flag(sk, SOCK_TXTIME, true);
1504                sk->sk_clockid = sk_txtime.clockid;
1505                sk->sk_txtime_deadline_mode =
1506                        !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1507                sk->sk_txtime_report_errors =
1508                        !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1509                break;
1510
1511        case SO_BINDTOIFINDEX:
1512                ret = sock_bindtoindex_locked(sk, val);
1513                break;
1514
1515        case SO_BUF_LOCK:
1516                if (val & ~SOCK_BUF_LOCK_MASK) {
1517                        ret = -EINVAL;
1518                        break;
1519                }
1520                sk->sk_userlocks = val | (sk->sk_userlocks &
1521                                          ~SOCK_BUF_LOCK_MASK);
1522                break;
1523
1524        case SO_RESERVE_MEM:
1525        {
1526                int delta;
1527
1528                if (val < 0) {
1529                        ret = -EINVAL;
1530                        break;
1531                }
1532
1533                delta = val - sk->sk_reserved_mem;
1534                if (delta < 0)
1535                        sock_release_reserved_memory(sk, -delta);
1536                else
1537                        ret = sock_reserve_memory(sk, delta);
1538                break;
1539        }
1540
1541        default:
1542                ret = -ENOPROTOOPT;
1543                break;
1544        }
1545        sockopt_release_sock(sk);
1546        return ret;
1547}
1548
1549int sock_setsockopt(struct socket *sock, int level, int optname,
1550                    sockptr_t optval, unsigned int optlen)
1551{
1552        return sk_setsockopt(sock->sk, level, optname,
1553                             optval, optlen);
1554}
1555EXPORT_SYMBOL(sock_setsockopt);
1556
1557static const struct cred *sk_get_peer_cred(struct sock *sk)
1558{
1559        const struct cred *cred;
1560
1561        spin_lock(&sk->sk_peer_lock);
1562        cred = get_cred(sk->sk_peer_cred);
1563        spin_unlock(&sk->sk_peer_lock);
1564
1565        return cred;
1566}
1567
1568static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1569                          struct ucred *ucred)
1570{
1571        ucred->pid = pid_vnr(pid);
1572        ucred->uid = ucred->gid = -1;
1573        if (cred) {
1574                struct user_namespace *current_ns = current_user_ns();
1575
1576                ucred->uid = from_kuid_munged(current_ns, cred->euid);
1577                ucred->gid = from_kgid_munged(current_ns, cred->egid);
1578        }
1579}
1580
1581static int groups_to_user(sockptr_t dst, const struct group_info *src)
1582{
1583        struct user_namespace *user_ns = current_user_ns();
1584        int i;
1585
1586        for (i = 0; i < src->ngroups; i++) {
1587                gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1588
1589                if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1590                        return -EFAULT;
1591        }
1592
1593        return 0;
1594}
1595
1596int sk_getsockopt(struct sock *sk, int level, int optname,
1597                  sockptr_t optval, sockptr_t optlen)
1598{
1599        struct socket *sock = sk->sk_socket;
1600
1601        union {
1602                int val;
1603                u64 val64;
1604                unsigned long ulval;
1605                struct linger ling;
1606                struct old_timeval32 tm32;
1607                struct __kernel_old_timeval tm;
1608                struct  __kernel_sock_timeval stm;
1609                struct sock_txtime txtime;
1610                struct so_timestamping timestamping;
1611        } v;
1612
1613        int lv = sizeof(int);
1614        int len;
1615
1616        if (copy_from_sockptr(&len, optlen, sizeof(int)))
1617                return -EFAULT;
1618        if (len < 0)
1619                return -EINVAL;
1620
1621        memset(&v, 0, sizeof(v));
1622
1623        switch (optname) {
1624        case SO_DEBUG:
1625                v.val = sock_flag(sk, SOCK_DBG);
1626                break;
1627
1628        case SO_DONTROUTE:
1629                v.val = sock_flag(sk, SOCK_LOCALROUTE);
1630                break;
1631
1632        case SO_BROADCAST:
1633                v.val = sock_flag(sk, SOCK_BROADCAST);
1634                break;
1635
1636        case SO_SNDBUF:
1637                v.val = READ_ONCE(sk->sk_sndbuf);
1638                break;
1639
1640        case SO_RCVBUF:
1641                v.val = READ_ONCE(sk->sk_rcvbuf);
1642                break;
1643
1644        case SO_REUSEADDR:
1645                v.val = sk->sk_reuse;
1646                break;
1647
1648        case SO_REUSEPORT:
1649                v.val = sk->sk_reuseport;
1650                break;
1651
1652        case SO_KEEPALIVE:
1653                v.val = sock_flag(sk, SOCK_KEEPOPEN);
1654                break;
1655
1656        case SO_TYPE:
1657                v.val = sk->sk_type;
1658                break;
1659
1660        case SO_PROTOCOL:
1661                v.val = sk->sk_protocol;
1662                break;
1663
1664        case SO_DOMAIN:
1665                v.val = sk->sk_family;
1666                break;
1667
1668        case SO_ERROR:
1669                v.val = -sock_error(sk);
1670                if (v.val == 0)
1671                        v.val = xchg(&sk->sk_err_soft, 0);
1672                break;
1673
1674        case SO_OOBINLINE:
1675                v.val = sock_flag(sk, SOCK_URGINLINE);
1676                break;
1677
1678        case SO_NO_CHECK:
1679                v.val = sk->sk_no_check_tx;
1680                break;
1681
1682        case SO_PRIORITY:
1683                v.val = READ_ONCE(sk->sk_priority);
1684                break;
1685
1686        case SO_LINGER:
1687                lv              = sizeof(v.ling);
1688                v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1689                v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ;
1690                break;
1691
1692        case SO_BSDCOMPAT:
1693                break;
1694
1695        case SO_TIMESTAMP_OLD:
1696                v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1697                                !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1698                                !sock_flag(sk, SOCK_RCVTSTAMPNS);
1699                break;
1700
1701        case SO_TIMESTAMPNS_OLD:
1702                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1703                break;
1704
1705        case SO_TIMESTAMP_NEW:
1706                v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1707                break;
1708
1709        case SO_TIMESTAMPNS_NEW:
1710                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1711                break;
1712
1713        case SO_TIMESTAMPING_OLD:
1714        case SO_TIMESTAMPING_NEW:
1715                lv = sizeof(v.timestamping);
1716                /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1717                 * returning the flags when they were set through the same option.
1718                 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
1719                 */
1720                if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
1721                        v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
1722                        v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
1723                }
1724                break;
1725
1726        case SO_RCVTIMEO_OLD:
1727        case SO_RCVTIMEO_NEW:
1728                lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
1729                                      SO_RCVTIMEO_OLD == optname);
1730                break;
1731
1732        case SO_SNDTIMEO_OLD:
1733        case SO_SNDTIMEO_NEW:
1734                lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
1735                                      SO_SNDTIMEO_OLD == optname);
1736                break;
1737
1738        case SO_RCVLOWAT:
1739                v.val = READ_ONCE(sk->sk_rcvlowat);
1740                break;
1741
1742        case SO_SNDLOWAT:
1743                v.val = 1;
1744                break;
1745
1746        case SO_PASSCRED:
1747                v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1748                break;
1749
1750        case SO_PASSPIDFD:
1751                v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
1752                break;
1753
1754        case SO_PEERCRED:
1755        {
1756                struct ucred peercred;
1757                if (len > sizeof(peercred))
1758                        len = sizeof(peercred);
1759
1760                spin_lock(&sk->sk_peer_lock);
1761                cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1762                spin_unlock(&sk->sk_peer_lock);
1763
1764                if (copy_to_sockptr(optval, &peercred, len))
1765                        return -EFAULT;
1766                goto lenout;
1767        }
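        /*
         * Illustrative user-space sketch (not part of this file): reading
         * the peer credentials of a connected AF_UNIX socket:
         *
         *   struct ucred peer;
         *   socklen_t len = sizeof(peer);
         *
         *   if (!getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &peer, &len))
         *           printf("pid=%d uid=%u gid=%u\n",
         *                  (int)peer.pid, peer.uid, peer.gid);
         */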
1768
1769        case SO_PEERPIDFD:
1770        {
1771                struct pid *peer_pid;
1772                struct file *pidfd_file = NULL;
1773                int pidfd;
1774
1775                if (len > sizeof(pidfd))
1776                        len = sizeof(pidfd);
1777
1778                spin_lock(&sk->sk_peer_lock);
1779                peer_pid = get_pid(sk->sk_peer_pid);
1780                spin_unlock(&sk->sk_peer_lock);
1781
1782                if (!peer_pid)
1783                        return -ENODATA;
1784
1785                pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
1786                put_pid(peer_pid);
1787                if (pidfd < 0)
1788                        return pidfd;
1789
1790                if (copy_to_sockptr(optval, &pidfd, len) ||
1791                    copy_to_sockptr(optlen, &len, sizeof(int))) {
1792                        put_unused_fd(pidfd);
1793                        fput(pidfd_file);
1794
1795                        return -EFAULT;
1796                }
1797
1798                fd_install(pidfd, pidfd_file);
1799                return 0;
1800        }
1801
1802        case SO_PEERGROUPS:
1803        {
1804                const struct cred *cred;
1805                int ret, n;
1806
1807                cred = sk_get_peer_cred(sk);
1808                if (!cred)
1809                        return -ENODATA;
1810
1811                n = cred->group_info->ngroups;
1812                if (len < n * sizeof(gid_t)) {
1813                        len = n * sizeof(gid_t);
1814                        put_cred(cred);
1815                        return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1816                }
1817                len = n * sizeof(gid_t);
1818
1819                ret = groups_to_user(optval, cred->group_info);
1820                put_cred(cred);
1821                if (ret)
1822                        return ret;
1823                goto lenout;
1824        }
1825
1826        case SO_PEERNAME:
1827        {
1828                struct sockaddr_storage address;
1829
1830                lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
1831                if (lv < 0)
1832                        return -ENOTCONN;
1833                if (lv < len)
1834                        return -EINVAL;
1835                if (copy_to_sockptr(optval, &address, len))
1836                        return -EFAULT;
1837                goto lenout;
1838        }
1839
1840        /* Dubious BSD thing... Probably nobody even uses it, but
1841         * the UNIX standard wants it for whatever reason... -DaveM
1842         */
1843        case SO_ACCEPTCONN:
1844                v.val = sk->sk_state == TCP_LISTEN;
1845                break;
1846
1847        case SO_PASSSEC:
1848                v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1849                break;
1850
1851        case SO_PEERSEC:
1852                return security_socket_getpeersec_stream(sock,
1853                                                         optval, optlen, len);
1854
1855        case SO_MARK:
1856                v.val = READ_ONCE(sk->sk_mark);
1857                break;
1858
1859        case SO_RCVMARK:
1860                v.val = sock_flag(sk, SOCK_RCVMARK);
1861                break;
1862
1863        case SO_RXQ_OVFL:
1864                v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1865                break;
1866
1867        case SO_WIFI_STATUS:
1868                v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1869                break;
1870
1871        case SO_PEEK_OFF:
1872                if (!READ_ONCE(sock->ops)->set_peek_off)
1873                        return -EOPNOTSUPP;
1874
1875                v.val = READ_ONCE(sk->sk_peek_off);
1876                break;
1877        case SO_NOFCS:
1878                v.val = sock_flag(sk, SOCK_NOFCS);
1879                break;
1880
1881        case SO_BINDTODEVICE:
1882                return sock_getbindtodevice(sk, optval, optlen, len);
1883
1884        case SO_GET_FILTER:
1885                len = sk_get_filter(sk, optval, len);
1886                if (len < 0)
1887                        return len;
1888
1889                goto lenout;
1890
1891        case SO_LOCK_FILTER:
1892                v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1893                break;
1894
1895        case SO_BPF_EXTENSIONS:
1896                v.val = bpf_tell_extensions();
1897                break;
1898
1899        case SO_SELECT_ERR_QUEUE:
1900                v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1901                break;
1902
1903#ifdef CONFIG_NET_RX_BUSY_POLL
1904        case SO_BUSY_POLL:
1905                v.val = READ_ONCE(sk->sk_ll_usec);
1906                break;
1907        case SO_PREFER_BUSY_POLL:
1908                v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1909                break;
1910#endif
1911
1912        case SO_MAX_PACING_RATE:
1913                /* The READ_ONCE() pairs with the WRITE_ONCE() in sk_setsockopt() */
1914                if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1915                        lv = sizeof(v.ulval);
1916                        v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
1917                } else {
1918                        /* 32bit version */
1919                        v.val = min_t(unsigned long, ~0U,
1920                                      READ_ONCE(sk->sk_max_pacing_rate));
1921                }
1922                break;
1923
1924        case SO_INCOMING_CPU:
1925                v.val = READ_ONCE(sk->sk_incoming_cpu);
1926                break;
1927
1928        case SO_MEMINFO:
1929        {
1930                u32 meminfo[SK_MEMINFO_VARS];
1931
1932                sk_get_meminfo(sk, meminfo);
1933
1934                len = min_t(unsigned int, len, sizeof(meminfo));
1935                if (copy_to_sockptr(optval, &meminfo, len))
1936                        return -EFAULT;
1937
1938                goto lenout;
1939        }
1940
1941#ifdef CONFIG_NET_RX_BUSY_POLL
1942        case SO_INCOMING_NAPI_ID:
1943                v.val = READ_ONCE(sk->sk_napi_id);
1944
1945                /* aggregate non-NAPI IDs down to 0 */
1946                if (v.val < MIN_NAPI_ID)
1947                        v.val = 0;
1948
1949                break;
1950#endif
1951
1952        case SO_COOKIE:
1953                lv = sizeof(u64);
1954                if (len < lv)
1955                        return -EINVAL;
1956                v.val64 = sock_gen_cookie(sk);
1957                break;
1958
1959        case SO_ZEROCOPY:
1960                v.val = sock_flag(sk, SOCK_ZEROCOPY);
1961                break;
1962
1963        case SO_TXTIME:
1964                lv = sizeof(v.txtime);
1965                v.txtime.clockid = sk->sk_clockid;
1966                v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1967                                  SOF_TXTIME_DEADLINE_MODE : 0;
1968                v.txtime.flags |= sk->sk_txtime_report_errors ?
1969                                  SOF_TXTIME_REPORT_ERRORS : 0;
1970                break;
1971
1972        case SO_BINDTOIFINDEX:
1973                v.val = READ_ONCE(sk->sk_bound_dev_if);
1974                break;
1975
1976        case SO_NETNS_COOKIE:
1977                lv = sizeof(u64);
1978                if (len != lv)
1979                        return -EINVAL;
1980                v.val64 = sock_net(sk)->net_cookie;
1981                break;
1982
1983        case SO_BUF_LOCK:
1984                v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1985                break;
1986
1987        case SO_RESERVE_MEM:
1988                v.val = READ_ONCE(sk->sk_reserved_mem);
1989                break;
1990
1991        case SO_TXREHASH:
1992                /* Paired with WRITE_ONCE() in sk_setsockopt() */
1993                v.val = READ_ONCE(sk->sk_txrehash);
1994                break;
1995
1996        default:
1997                /* We implement the SO_SNDLOWAT etc to not be settable
1998                 * (1003.1g 7).
1999                 */
2000                return -ENOPROTOOPT;
2001        }
2002
2003        if (len > lv)
2004                len = lv;
2005        if (copy_to_sockptr(optval, &v, len))
2006                return -EFAULT;
2007lenout:
2008        if (copy_to_sockptr(optlen, &len, sizeof(int)))
2009                return -EFAULT;
2010        return 0;
2011}
2012
2013/*
2014 * Initialize an sk_lock.
2015 *
2016 * (We also register the sk_lock with the lock validator.)
2017 */
2018static inline void sock_lock_init(struct sock *sk)
2019{
2020        if (sk->sk_kern_sock)
2021                sock_lock_init_class_and_name(
2022                        sk,
2023                        af_family_kern_slock_key_strings[sk->sk_family],
2024                        af_family_kern_slock_keys + sk->sk_family,
2025                        af_family_kern_key_strings[sk->sk_family],
2026                        af_family_kern_keys + sk->sk_family);
2027        else
2028                sock_lock_init_class_and_name(
2029                        sk,
2030                        af_family_slock_key_strings[sk->sk_family],
2031                        af_family_slock_keys + sk->sk_family,
2032                        af_family_key_strings[sk->sk_family],
2033                        af_family_keys + sk->sk_family);
2034}
2035
2036/*
2037 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2038 * even temporarily, because of RCU lookups. sk_node should also be left as is.
2039 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2040 */
2041static void sock_copy(struct sock *nsk, const struct sock *osk)
2042{
2043        const struct proto *prot = READ_ONCE(osk->sk_prot);
2044#ifdef CONFIG_SECURITY_NETWORK
2045        void *sptr = nsk->sk_security;
2046#endif
2047
2048        /* If we move sk_tx_queue_mapping out of the private section,
2049         * we must check if sk_tx_queue_clear() is called after
2050         * sock_copy() in sk_clone_lock().
2051         */
2052        BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2053                     offsetof(struct sock, sk_dontcopy_begin) ||
2054                     offsetof(struct sock, sk_tx_queue_mapping) >=
2055                     offsetof(struct sock, sk_dontcopy_end));
2056
2057        memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2058
2059        memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2060               prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
2061
2062#ifdef CONFIG_SECURITY_NETWORK
2063        nsk->sk_security = sptr;
2064        security_sk_clone(osk, nsk);
2065#endif
2066}
2067
2068static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2069                int family)
2070{
2071        struct sock *sk;
2072        struct kmem_cache *slab;
2073
2074        slab = prot->slab;
2075        if (slab != NULL) {
2076                sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2077                if (!sk)
2078                        return sk;
2079                if (want_init_on_alloc(priority))
2080                        sk_prot_clear_nulls(sk, prot->obj_size);
2081        } else
2082                sk = kmalloc(prot->obj_size, priority);
2083
2084        if (sk != NULL) {
2085                if (security_sk_alloc(sk, family, priority))
2086                        goto out_free;
2087
2088                if (!try_module_get(prot->owner))
2089                        goto out_free_sec;
2090        }
2091
2092        return sk;
2093
2094out_free_sec:
2095        security_sk_free(sk);
2096out_free:
2097        if (slab != NULL)
2098                kmem_cache_free(slab, sk);
2099        else
2100                kfree(sk);
2101        return NULL;
2102}
2103
2104static void sk_prot_free(struct proto *prot, struct sock *sk)
2105{
2106        struct kmem_cache *slab;
2107        struct module *owner;
2108
2109        owner = prot->owner;
2110        slab = prot->slab;
2111
2112        cgroup_sk_free(&sk->sk_cgrp_data);
2113        mem_cgroup_sk_free(sk);
2114        security_sk_free(sk);
2115        if (slab != NULL)
2116                kmem_cache_free(slab, sk);
2117        else
2118                kfree(sk);
2119        module_put(owner);
2120}
2121
2122/**
2123 *      sk_alloc - All socket objects are allocated here
2124 *      @net: the applicable net namespace
2125 *      @family: protocol family
2126 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2127 *      @prot: struct proto associated with this new sock instance
2128 *      @kern: is this to be a kernel socket?
2129 */
2130struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2131                      struct proto *prot, int kern)
2132{
2133        struct sock *sk;
2134
2135        sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2136        if (sk) {
2137                sk->sk_family = family;
2138                /*
2139                 * See comment in struct sock definition to understand
2140                 * why we need sk_prot_creator -acme
2141                 */
2142                sk->sk_prot = sk->sk_prot_creator = prot;
2143                sk->sk_kern_sock = kern;
2144                sock_lock_init(sk);
2145                sk->sk_net_refcnt = kern ? 0 : 1;
2146                if (likely(sk->sk_net_refcnt)) {
2147                        get_net_track(net, &sk->ns_tracker, priority);
2148                        sock_inuse_add(net, 1);
2149                } else {
2150                        __netns_tracker_alloc(net, &sk->ns_tracker,
2151                                              false, priority);
2152                }
2153
2154                sock_net_set(sk, net);
2155                refcount_set(&sk->sk_wmem_alloc, 1);
2156
2157                mem_cgroup_sk_alloc(sk);
2158                cgroup_sk_alloc(&sk->sk_cgrp_data);
2159                sock_update_classid(&sk->sk_cgrp_data);
2160                sock_update_netprioidx(&sk->sk_cgrp_data);
2161                sk_tx_queue_clear(sk);
2162        }
2163
2164        return sk;
2165}
2166EXPORT_SYMBOL(sk_alloc);
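/*
 * Illustrative sketch (not part of this file): a protocol family's
 * ->create() handler typically pairs sk_alloc() with sock_init_data(),
 * and releases the sock with sk_free() on a later error path.  PF_FOO
 * and foo_proto below are placeholders, not real symbols:
 *
 *   sk = sk_alloc(net, PF_FOO, GFP_KERNEL, &foo_proto, kern);
 *   if (!sk)
 *           return -ENOBUFS;
 *   sock_init_data(sock, sk);
 *   ...
 *   sk_free(sk);        // only on an error path before the sock is published
 */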
2167
2168/* Sockets having SOCK_RCU_FREE will call this function after one RCU
2169 * grace period. This is the case for UDP sockets and TCP listeners.
2170 */
2171static void __sk_destruct(struct rcu_head *head)
2172{
2173        struct sock *sk = container_of(head, struct sock, sk_rcu);
2174        struct sk_filter *filter;
2175
2176        if (sk->sk_destruct)
2177                sk->sk_destruct(sk);
2178
2179        filter = rcu_dereference_check(sk->sk_filter,
2180                                       refcount_read(&sk->sk_wmem_alloc) == 0);
2181        if (filter) {
2182                sk_filter_uncharge(sk, filter);
2183                RCU_INIT_POINTER(sk->sk_filter, NULL);
2184        }
2185
2186        sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2187
2188#ifdef CONFIG_BPF_SYSCALL
2189        bpf_sk_storage_free(sk);
2190#endif
2191
2192        if (atomic_read(&sk->sk_omem_alloc))
2193                pr_debug("%s: optmem leakage (%d bytes) detected\n",
2194                         __func__, atomic_read(&sk->sk_omem_alloc));
2195
2196        if (sk->sk_frag.page) {
2197                put_page(sk->sk_frag.page);
2198                sk->sk_frag.page = NULL;
2199        }
2200
2201        /* We do not need to acquire sk->sk_peer_lock; we are the last user. */
2202        put_cred(sk->sk_peer_cred);
2203        put_pid(sk->sk_peer_pid);
2204
2205        if (likely(sk->sk_net_refcnt))
2206                put_net_track(sock_net(sk), &sk->ns_tracker);
2207        else
2208                __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
2209
2210        sk_prot_free(sk->sk_prot_creator, sk);
2211}
2212
2213void sk_destruct(struct sock *sk)
2214{
2215        bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2216
2217        if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2218                reuseport_detach_sock(sk);
2219                use_call_rcu = true;
2220        }
2221
2222        if (use_call_rcu)
2223                call_rcu(&sk->sk_rcu, __sk_destruct);
2224        else
2225                __sk_destruct(&sk->sk_rcu);
2226}
2227
2228static void __sk_free(struct sock *sk)
2229{
2230        if (likely(sk->sk_net_refcnt))
2231                sock_inuse_add(sock_net(sk), -1);
2232
2233        if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2234                sock_diag_broadcast_destroy(sk);
2235        else
2236                sk_destruct(sk);
2237}
2238
2239void sk_free(struct sock *sk)
2240{
2241        /*
2242         * We subtract one from sk_wmem_alloc so we can tell whether
2243         * some packets are still in some tx queue.
2244         * If the count is not zero, sock_wfree() will call __sk_free(sk) later
2245         */
2246        if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2247                __sk_free(sk);
2248}
2249EXPORT_SYMBOL(sk_free);
2250
2251static void sk_init_common(struct sock *sk)
2252{
2253        skb_queue_head_init(&sk->sk_receive_queue);
2254        skb_queue_head_init(&sk->sk_write_queue);
2255        skb_queue_head_init(&sk->sk_error_queue);
2256
2257        rwlock_init(&sk->sk_callback_lock);
2258        lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2259                        af_rlock_keys + sk->sk_family,
2260                        af_family_rlock_key_strings[sk->sk_family]);
2261        lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2262                        af_wlock_keys + sk->sk_family,
2263                        af_family_wlock_key_strings[sk->sk_family]);
2264        lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2265                        af_elock_keys + sk->sk_family,
2266                        af_family_elock_key_strings[sk->sk_family]);
2267        lockdep_set_class_and_name(&sk->sk_callback_lock,
2268                        af_callback_keys + sk->sk_family,
2269                        af_family_clock_key_strings[sk->sk_family]);
2270}
2271
2272/**
2273 *      sk_clone_lock - clone a socket, and lock its clone
2274 *      @sk: the socket to clone
2275 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2276 *
2277 *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2278 */
2279struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2280{
2281        struct proto *prot = READ_ONCE(sk->sk_prot);
2282        struct sk_filter *filter;
2283        bool is_charged = true;
2284        struct sock *newsk;
2285
2286        newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2287        if (!newsk)
2288                goto out;
2289
2290        sock_copy(newsk, sk);
2291
2292        newsk->sk_prot_creator = prot;
2293
2294        /* SANITY */
2295        if (likely(newsk->sk_net_refcnt)) {
2296                get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2297                sock_inuse_add(sock_net(newsk), 1);
2298        } else {
2299                /* Kernel sockets do not elevate the struct net refcount.
2300                 * Instead, use a tracker to more easily detect if a layer
2301                 * is not properly dismantling its kernel sockets at netns
2302                 * destroy time.
2303                 */
2304                __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2305                                      false, priority);
2306        }
2307        sk_node_init(&newsk->sk_node);
2308        sock_lock_init(newsk);
2309        bh_lock_sock(newsk);
2310        newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
2311        newsk->sk_backlog.len = 0;
2312
2313        atomic_set(&newsk->sk_rmem_alloc, 0);
2314
2315        /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2316        refcount_set(&newsk->sk_wmem_alloc, 1);
2317
2318        atomic_set(&newsk->sk_omem_alloc, 0);
2319        sk_init_common(newsk);
2320
2321        newsk->sk_dst_cache     = NULL;
2322        newsk->sk_dst_pending_confirm = 0;
2323        newsk->sk_wmem_queued   = 0;
2324        newsk->sk_forward_alloc = 0;
2325        newsk->sk_reserved_mem  = 0;
2326        atomic_set(&newsk->sk_drops, 0);
2327        newsk->sk_send_head     = NULL;
2328        newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2329        atomic_set(&newsk->sk_zckey, 0);
2330
2331        sock_reset_flag(newsk, SOCK_DONE);
2332
2333        /* sk->sk_memcg will be populated at accept() time */
2334        newsk->sk_memcg = NULL;
2335
2336        cgroup_sk_clone(&newsk->sk_cgrp_data);
2337
2338        rcu_read_lock();
2339        filter = rcu_dereference(sk->sk_filter);
2340        if (filter != NULL)
2341                /* Even though it's an empty new sock, the charging may fail
2342                 * if sysctl_optmem_max was changed between creation of
2343                 * the original socket and cloning
2344                 */
2345                is_charged = sk_filter_charge(newsk, filter);
2346        RCU_INIT_POINTER(newsk->sk_filter, filter);
2347        rcu_read_unlock();
2348
2349        if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2350                /* We need to make sure that we don't uncharge the new
2351                 * socket if we couldn't charge it in the first place
2352                 * as otherwise we uncharge the parent's filter.
2353                 */
2354                if (!is_charged)
2355                        RCU_INIT_POINTER(newsk->sk_filter, NULL);
2356                sk_free_unlock_clone(newsk);
2357                newsk = NULL;
2358                goto out;
2359        }
2360        RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2361
2362        if (bpf_sk_storage_clone(sk, newsk)) {
2363                sk_free_unlock_clone(newsk);
2364                newsk = NULL;
2365                goto out;
2366        }
2367
2368        /* Clear sk_user_data if parent had the pointer tagged
2369         * as not suitable for copying when cloning.
2370         */
2371        if (sk_user_data_is_nocopy(newsk))
2372                newsk->sk_user_data = NULL;
2373
2374        newsk->sk_err      = 0;
2375        newsk->sk_err_soft = 0;
2376        newsk->sk_priority = 0;
2377        newsk->sk_incoming_cpu = raw_smp_processor_id();
2378
2379        /* Before updating sk_refcnt, we must commit prior changes to memory
2380         * (Documentation/RCU/rculist_nulls.rst for details)
2381         */
2382        smp_wmb();
2383        refcount_set(&newsk->sk_refcnt, 2);
2384
2385        sk_set_socket(newsk, NULL);
2386        sk_tx_queue_clear(newsk);
2387        RCU_INIT_POINTER(newsk->sk_wq, NULL);
2388
2389        if (newsk->sk_prot->sockets_allocated)
2390                sk_sockets_allocated_inc(newsk);
2391
2392        if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2393                net_enable_timestamp();
2394out:
2395        return newsk;
2396}
2397EXPORT_SYMBOL_GPL(sk_clone_lock);
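/*
 * Illustrative sketch (not part of this file): sk_clone_lock() returns the
 * clone locked (bh_lock_sock()) and with sk_refcnt == 2, so the caller must
 * unlock it even on its own error paths, roughly:
 *
 *   newsk = sk_clone_lock(sk, GFP_ATOMIC);
 *   if (newsk) {
 *           ...             // protocol specific initialisation
 *           bh_unlock_sock(newsk);
 *   }
 *
 * Failures inside sk_clone_lock() itself are unwound with
 * sk_free_unlock_clone() below.
 */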
2398
2399void sk_free_unlock_clone(struct sock *sk)
2400{
2401        /* It is still a raw copy of the parent, so invalidate
2402         * the destructor and do a plain sk_free() */
2403        sk->sk_destruct = NULL;
2404        bh_unlock_sock(sk);
2405        sk_free(sk);
2406}
2407EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2408
2409static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
2410{
2411        bool is_ipv6 = false;
2412        u32 max_size;
2413
2414#if IS_ENABLED(CONFIG_IPV6)
2415        is_ipv6 = (sk->sk_family == AF_INET6 &&
2416                   !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2417#endif
2418        /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2419        max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2420                        READ_ONCE(dst->dev->gso_ipv4_max_size);
2421        if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2422                max_size = GSO_LEGACY_MAX_SIZE;
2423
2424        return max_size - (MAX_TCP_HEADER + 1);
2425}
2426
2427void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2428{
2429        u32 max_segs = 1;
2430
2431        sk->sk_route_caps = dst->dev->features;
2432        if (sk_is_tcp(sk))
2433                sk->sk_route_caps |= NETIF_F_GSO;
2434        if (sk->sk_route_caps & NETIF_F_GSO)
2435                sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2436        if (unlikely(sk->sk_gso_disabled))
2437                sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2438        if (sk_can_gso(sk)) {
2439                if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2440                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2441                } else {
2442                        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2443                        sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
2444                        /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2445                        max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2446                }
2447        }
2448        sk->sk_gso_max_segs = max_segs;
2449        sk_dst_set(sk, dst);
2450}
2451EXPORT_SYMBOL_GPL(sk_setup_caps);
2452
2453/*
2454 *      Simple resource managers for sockets.
2455 */
2456
2457
2458/*
2459 * Write buffer destructor automatically called from kfree_skb.
2460 */
2461void sock_wfree(struct sk_buff *skb)
2462{
2463        struct sock *sk = skb->sk;
2464        unsigned int len = skb->truesize;
2465        bool free;
2466
2467        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2468                if (sock_flag(sk, SOCK_RCU_FREE) &&
2469                    sk->sk_write_space == sock_def_write_space) {
2470                        rcu_read_lock();
2471                        free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2472                        sock_def_write_space_wfree(sk);
2473                        rcu_read_unlock();
2474                        if (unlikely(free))
2475                                __sk_free(sk);
2476                        return;
2477                }
2478
2479                /*
2480                 * Keep a reference on sk_wmem_alloc; it will be released
2481                 * after the sk_write_space() call
2482                 */
2483                WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2484                sk->sk_write_space(sk);
2485                len = 1;
2486        }
2487        /*
2488         * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2489         * could not do because of in-flight packets
2490         */
2491        if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2492                __sk_free(sk);
2493}
2494EXPORT_SYMBOL(sock_wfree);
2495
2496/* This variant of sock_wfree() is used by TCP,
2497 * since it sets SOCK_USE_WRITE_QUEUE.
2498 */
2499void __sock_wfree(struct sk_buff *skb)
2500{
2501        struct sock *sk = skb->sk;
2502
2503        if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2504                __sk_free(sk);
2505}
2506
2507void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2508{
2509        skb_orphan(skb);
2510        skb->sk = sk;
2511#ifdef CONFIG_INET
2512        if (unlikely(!sk_fullsock(sk))) {
2513                skb->destructor = sock_edemux;
2514                sock_hold(sk);
2515                return;
2516        }
2517#endif
2518        skb->destructor = sock_wfree;
2519        skb_set_hash_from_sk(skb, sk);
2520        /*
2521         * We used to take a refcount on sk, but the following operation
2522         * is enough to guarantee sk_free() won't free this sock until
2523         * all in-flight packets are completed
2524         */
2525        refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2526}
2527EXPORT_SYMBOL(skb_set_owner_w);
2528
2529static bool can_skb_orphan_partial(const struct sk_buff *skb)
2530{
2531#ifdef CONFIG_TLS_DEVICE
2532        /* Drivers depend on in-order delivery for crypto offload;
2533         * a partial orphan breaks the out-of-order-OK logic.
2534         */
2535        if (skb->decrypted)
2536                return false;
2537#endif
2538        return (skb->destructor == sock_wfree ||
2539                (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2540}
2541
2542/* This helper is used by netem, as it can hold packets in its
2543 * delay queue. We want to allow the owner socket to send more
2544 * packets, as if they were already TX completed by a typical driver.
2545 * But we also want to keep skb->sk set because some packet schedulers
2546 * rely on it (sch_fq for example).
2547 */
2548void skb_orphan_partial(struct sk_buff *skb)
2549{
2550        if (skb_is_tcp_pure_ack(skb))
2551                return;
2552
2553        if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2554                return;
2555
2556        skb_orphan(skb);
2557}
2558EXPORT_SYMBOL(skb_orphan_partial);
2559
2560/*
2561 * Read buffer destructor automatically called from kfree_skb.
2562 */
2563void sock_rfree(struct sk_buff *skb)
2564{
2565        struct sock *sk = skb->sk;
2566        unsigned int len = skb->truesize;
2567
2568        atomic_sub(len, &sk->sk_rmem_alloc);
2569        sk_mem_uncharge(sk, len);
2570}
2571EXPORT_SYMBOL(sock_rfree);
2572
2573/*
2574 * Buffer destructor for skbs that are not used directly in read or write
2575 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2576 */
2577void sock_efree(struct sk_buff *skb)
2578{
2579        sock_put(skb->sk);
2580}
2581EXPORT_SYMBOL(sock_efree);
2582
2583/* Buffer destructor for prefetch/receive path where reference count may
2584 * not be held, e.g. for listen sockets.
2585 */
2586#ifdef CONFIG_INET
2587void sock_pfree(struct sk_buff *skb)
2588{
2589        if (sk_is_refcounted(skb->sk))
2590                sock_gen_put(skb->sk);
2591}
2592EXPORT_SYMBOL(sock_pfree);
2593#endif /* CONFIG_INET */
2594
2595kuid_t sock_i_uid(struct sock *sk)
2596{
2597        kuid_t uid;
2598
2599        read_lock_bh(&sk->sk_callback_lock);
2600        uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2601        read_unlock_bh(&sk->sk_callback_lock);
2602        return uid;
2603}
2604EXPORT_SYMBOL(sock_i_uid);
2605
2606unsigned long __sock_i_ino(struct sock *sk)
2607{
2608        unsigned long ino;
2609
2610        read_lock(&sk->sk_callback_lock);
2611        ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2612        read_unlock(&sk->sk_callback_lock);
2613        return ino;
2614}
2615EXPORT_SYMBOL(__sock_i_ino);
2616
2617unsigned long sock_i_ino(struct sock *sk)
2618{
2619        unsigned long ino;
2620
2621        local_bh_disable();
2622        ino = __sock_i_ino(sk);
2623        local_bh_enable();
2624        return ino;
2625}
2626EXPORT_SYMBOL(sock_i_ino);
2627
2628/*
2629 * Allocate a skb from the socket's send buffer.
2630 */
2631struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2632                             gfp_t priority)
2633{
2634        if (force ||
2635            refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2636                struct sk_buff *skb = alloc_skb(size, priority);
2637
2638                if (skb) {
2639                        skb_set_owner_w(skb, sk);
2640                        return skb;
2641                }
2642        }
2643        return NULL;
2644}
2645EXPORT_SYMBOL(sock_wmalloc);
2646
2647static void sock_ofree(struct sk_buff *skb)
2648{
2649        struct sock *sk = skb->sk;
2650
2651        atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2652}
2653
2654struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2655                             gfp_t priority)
2656{
2657        struct sk_buff *skb;
2658
2659        /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2660        if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2661            READ_ONCE(sysctl_optmem_max))
2662                return NULL;
2663
2664        skb = alloc_skb(size, priority);
2665        if (!skb)
2666                return NULL;
2667
2668        atomic_add(skb->truesize, &sk->sk_omem_alloc);
2669        skb->sk = sk;
2670        skb->destructor = sock_ofree;
2671        return skb;
2672}
2673
2674/*
2675 * Allocate a memory block from the socket's option memory buffer.
2676 */
2677void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2678{
2679        int optmem_max = READ_ONCE(sysctl_optmem_max);
2680
2681        if ((unsigned int)size <= optmem_max &&
2682            atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2683                void *mem;
2684                /* First do the add, to avoid the race if kmalloc
2685                 * might sleep.
2686                 */
2687                atomic_add(size, &sk->sk_omem_alloc);
2688                mem = kmalloc(size, priority);
2689                if (mem)
2690                        return mem;
2691                atomic_sub(size, &sk->sk_omem_alloc);
2692        }
2693        return NULL;
2694}
2695EXPORT_SYMBOL(sock_kmalloc);
2696
2697/* Free an option memory block. Note, we actually want the inline
2698 * here as this allows gcc to detect the nullify and fold away the
2699 * condition entirely.
2700 */
2701static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2702                                  const bool nullify)
2703{
2704        if (WARN_ON_ONCE(!mem))
2705                return;
2706        if (nullify)
2707                kfree_sensitive(mem);
2708        else
2709                kfree(mem);
2710        atomic_sub(size, &sk->sk_omem_alloc);
2711}
2712
2713void sock_kfree_s(struct sock *sk, void *mem, int size)
2714{
2715        __sock_kfree_s(sk, mem, size, false);
2716}
2717EXPORT_SYMBOL(sock_kfree_s);
2718
2719void sock_kzfree_s(struct sock *sk, void *mem, int size)
2720{
2721        __sock_kfree_s(sk, mem, size, true);
2722}
2723EXPORT_SYMBOL(sock_kzfree_s);
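/*
 * Illustrative sketch (not part of this file): option memory is charged
 * against sk->sk_omem_alloc, so an allocation must be released with the
 * same size it was charged with:
 *
 *   buf = sock_kmalloc(sk, len, GFP_KERNEL);
 *   if (!buf)
 *           return -ENOBUFS;
 *   ...
 *   sock_kfree_s(sk, buf, len);     // or sock_kzfree_s() for sensitive data
 */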
2724
2725/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2726   I think these locks should be removed for datagram sockets.
2727 */
2728static long sock_wait_for_wmem(struct sock *sk, long timeo)
2729{
2730        DEFINE_WAIT(wait);
2731
2732        sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2733        for (;;) {
2734                if (!timeo)
2735                        break;
2736                if (signal_pending(current))
2737                        break;
2738                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2739                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2740                if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2741                        break;
2742                if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2743                        break;
2744                if (READ_ONCE(sk->sk_err))
2745                        break;
2746                timeo = schedule_timeout(timeo);
2747        }
2748        finish_wait(sk_sleep(sk), &wait);
2749        return timeo;
2750}
2751
2752
2753/*
2754 *      Generic send/receive buffer handlers
2755 */
2756
2757struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2758                                     unsigned long data_len, int noblock,
2759                                     int *errcode, int max_page_order)
2760{
2761        struct sk_buff *skb;
2762        long timeo;
2763        int err;
2764
2765        timeo = sock_sndtimeo(sk, noblock);
2766        for (;;) {
2767                err = sock_error(sk);
2768                if (err != 0)
2769                        goto failure;
2770
2771                err = -EPIPE;
2772                if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2773                        goto failure;
2774
2775                if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2776                        break;
2777
2778                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2779                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2780                err = -EAGAIN;
2781                if (!timeo)
2782                        goto failure;
2783                if (signal_pending(current))
2784                        goto interrupted;
2785                timeo = sock_wait_for_wmem(sk, timeo);
2786        }
2787        skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2788                                   errcode, sk->sk_allocation);
2789        if (skb)
2790                skb_set_owner_w(skb, sk);
2791        return skb;
2792
2793interrupted:
2794        err = sock_intr_errno(timeo);
2795failure:
2796        *errcode = err;
2797        return NULL;
2798}
2799EXPORT_SYMBOL(sock_alloc_send_pskb);
2800
2801int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
2802                     struct sockcm_cookie *sockc)
2803{
2804        u32 tsflags;
2805
2806        switch (cmsg->cmsg_type) {
2807        case SO_MARK:
2808                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2809                    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2810                        return -EPERM;
2811                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2812                        return -EINVAL;
2813                sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2814                break;
2815        case SO_TIMESTAMPING_OLD:
2816        case SO_TIMESTAMPING_NEW:
2817                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2818                        return -EINVAL;
2819
2820                tsflags = *(u32 *)CMSG_DATA(cmsg);
2821                if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2822                        return -EINVAL;
2823
2824                sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2825                sockc->tsflags |= tsflags;
2826                break;
2827        case SCM_TXTIME:
2828                if (!sock_flag(sk, SOCK_TXTIME))
2829                        return -EINVAL;
2830                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2831                        return -EINVAL;
2832                sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2833                break;
2834        /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2835        case SCM_RIGHTS:
2836        case SCM_CREDENTIALS:
2837                break;
2838        default:
2839                return -EINVAL;
2840        }
2841        return 0;
2842}
2843EXPORT_SYMBOL(__sock_cmsg_send);
2844
2845int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2846                   struct sockcm_cookie *sockc)
2847{
2848        struct cmsghdr *cmsg;
2849        int ret;
2850
2851        for_each_cmsghdr(cmsg, msg) {
2852                if (!CMSG_OK(msg, cmsg))
2853                        return -EINVAL;
2854                if (cmsg->cmsg_level != SOL_SOCKET)
2855                        continue;
2856                ret = __sock_cmsg_send(sk, cmsg, sockc);
2857                if (ret)
2858                        return ret;
2859        }
2860        return 0;
2861}
2862EXPORT_SYMBOL(sock_cmsg_send);
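/*
 * Illustrative user-space sketch (not part of this file): the SOL_SOCKET
 * control messages parsed above ride on sendmsg().  For example, a per
 * packet transmit time on a socket that enabled SO_TXTIME:
 *
 *   char cbuf[CMSG_SPACE(sizeof(uint64_t))] = {};
 *   struct msghdr msg = { .msg_control = cbuf, .msg_controllen = sizeof(cbuf) };
 *   struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);
 *   uint64_t txtime_ns = ...;       // absolute time on the configured clock
 *
 *   cm->cmsg_level = SOL_SOCKET;
 *   cm->cmsg_type  = SCM_TXTIME;
 *   cm->cmsg_len   = CMSG_LEN(sizeof(txtime_ns));
 *   memcpy(CMSG_DATA(cm), &txtime_ns, sizeof(txtime_ns));
 *   // set msg_name/msg_iov for the payload, then sendmsg(fd, &msg, 0);
 */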
2863
2864static void sk_enter_memory_pressure(struct sock *sk)
2865{
2866        if (!sk->sk_prot->enter_memory_pressure)
2867                return;
2868
2869        sk->sk_prot->enter_memory_pressure(sk);
2870}
2871
2872static void sk_leave_memory_pressure(struct sock *sk)
2873{
2874        if (sk->sk_prot->leave_memory_pressure) {
2875                INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
2876                                     tcp_leave_memory_pressure, sk);
2877        } else {
2878                unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2879
2880                if (memory_pressure && READ_ONCE(*memory_pressure))
2881                        WRITE_ONCE(*memory_pressure, 0);
2882        }
2883}
2884
2885DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2886
2887/**
2888 * skb_page_frag_refill - check that a page_frag contains enough room
2889 * @sz: minimum size of the fragment we want to get
2890 * @pfrag: pointer to page_frag
2891 * @gfp: priority for memory allocation
2892 *
2893 * Note: While this allocator tries to use high order pages, there is
2894 * no guarantee that allocations succeed. Therefore, @sz MUST be
2895 * less than or equal to PAGE_SIZE.
2896 */
2897bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2898{
2899        if (pfrag->page) {
2900                if (page_ref_count(pfrag->page) == 1) {
2901                        pfrag->offset = 0;
2902                        return true;
2903                }
2904                if (pfrag->offset + sz <= pfrag->size)
2905                        return true;
2906                put_page(pfrag->page);
2907        }
2908
2909        pfrag->offset = 0;
2910        if (SKB_FRAG_PAGE_ORDER &&
2911            !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2912                /* Avoid direct reclaim but allow kswapd to wake */
2913                pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2914                                          __GFP_COMP | __GFP_NOWARN |
2915                                          __GFP_NORETRY,
2916                                          SKB_FRAG_PAGE_ORDER);
2917                if (likely(pfrag->page)) {
2918                        pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2919                        return true;
2920                }
2921        }
2922        pfrag->page = alloc_page(gfp);
2923        if (likely(pfrag->page)) {
2924                pfrag->size = PAGE_SIZE;
2925                return true;
2926        }
2927        return false;
2928}
2929EXPORT_SYMBOL(skb_page_frag_refill);
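/*
 * Illustrative sketch (not part of this file): a typical consumer keeps a
 * struct page_frag and carves chunks out of the current page, e.g.:
 *
 *   struct page_frag *pfrag = &sk->sk_frag;   // or a private page_frag
 *
 *   if (!skb_page_frag_refill(copy, pfrag, GFP_KERNEL))
 *           return -ENOMEM;
 *   // use 'copy' bytes at page_address(pfrag->page) + pfrag->offset,
 *   // take a page reference if attaching it to an skb, then:
 *   pfrag->offset += copy;
 */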
2930
2931bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2932{
2933        if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2934                return true;
2935
2936        sk_enter_memory_pressure(sk);
2937        sk_stream_moderate_sndbuf(sk);
2938        return false;
2939}
2940EXPORT_SYMBOL(sk_page_frag_refill);
2941
2942void __lock_sock(struct sock *sk)
2943        __releases(&sk->sk_lock.slock)
2944        __acquires(&sk->sk_lock.slock)
2945{
2946        DEFINE_WAIT(wait);
2947
2948        for (;;) {
2949                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2950                                        TASK_UNINTERRUPTIBLE);
2951                spin_unlock_bh(&sk->sk_lock.slock);
2952                schedule();
2953                spin_lock_bh(&sk->sk_lock.slock);
2954                if (!sock_owned_by_user(sk))
2955                        break;
2956        }
2957        finish_wait(&sk->sk_lock.wq, &wait);
2958}
2959
2960void __release_sock(struct sock *sk)
2961        __releases(&sk->sk_lock.slock)
2962        __acquires(&sk->sk_lock.slock)
2963{
2964        struct sk_buff *skb, *next;
2965
2966        while ((skb = sk->sk_backlog.head) != NULL) {
2967                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2968
2969                spin_unlock_bh(&sk->sk_lock.slock);
2970
2971                do {
2972                        next = skb->next;
2973                        prefetch(next);
2974                        DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
2975                        skb_mark_not_on_list(skb);
2976                        sk_backlog_rcv(sk, skb);
2977
2978                        cond_resched();
2979
2980                        skb = next;
2981                } while (skb != NULL);
2982
2983                spin_lock_bh(&sk->sk_lock.slock);
2984        }
2985
2986        /*
2987         * Doing the zeroing here guarantees we cannot loop forever
2988         * while a wild producer attempts to flood us.
2989         */
2990        sk->sk_backlog.len = 0;
2991}
2992
2993void __sk_flush_backlog(struct sock *sk)
2994{
2995        spin_lock_bh(&sk->sk_lock.slock);
2996        __release_sock(sk);
2997
2998        if (sk->sk_prot->release_cb)
2999                INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3000                                     tcp_release_cb, sk);
3001
3002        spin_unlock_bh(&sk->sk_lock.slock);
3003}
3004EXPORT_SYMBOL_GPL(__sk_flush_backlog);
3005
3006/**
3007 * sk_wait_data - wait for data to arrive at sk_receive_queue
3008 * @sk:    sock to wait on
3009 * @timeo: for how long
3010 * @skb:   last skb seen on sk_receive_queue
3011 *
3012 * Now socket state including sk->sk_err is changed only under lock,
3013 * hence we may omit checks after joining the wait queue.
3014 * We check the receive queue before schedule() only as an optimization;
3015 * it is very likely that release_sock() added new data.
3016 */
3017int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
3018{
3019        DEFINE_WAIT_FUNC(wait, woken_wake_function);
3020        int rc;
3021
3022        add_wait_queue(sk_sleep(sk), &wait);
3023        sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3024        rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
3025        sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3026        remove_wait_queue(sk_sleep(sk), &wait);
3027        return rc;
3028}
3029EXPORT_SYMBOL(sk_wait_data);
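/*
 * Illustrative sketch (not part of this file): a recvmsg() implementation
 * typically calls sk_wait_data() with the socket locked; the helper drops
 * and re-acquires the lock around the wait via sk_wait_event(), roughly:
 *
 *   lock_sock(sk);
 *   while (!(skb = skb_peek_tail(&sk->sk_receive_queue))) {
 *           if (!timeo || signal_pending(current))
 *                   break;
 *           sk_wait_data(sk, &timeo, NULL);
 *   }
 *   ...
 *   release_sock(sk);
 */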
3030
3031/**
3032 *      __sk_mem_raise_allocated - increase memory_allocated
3033 *      @sk: socket
3034 *      @size: memory size to allocate
3035 *      @amt: pages to allocate
3036 *      @kind: allocation type
3037 *
3038 *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
3039 *
3040 *      Unlike the globally shared limits among the sockets under the same protocol,
3041 *      consuming the budget of a memcg won't have a direct effect on other ones.
3042 *      So be optimistic about memcg's tolerance, and leave the callers to decide
3043 *      whether or not to raise allocated through sk_under_memory_pressure() or
3044 *      its variants.
3045 */
3046int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3047{
3048        struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
3049        struct proto *prot = sk->sk_prot;
3050        bool charged = false;
3051        long allocated;
3052
3053        sk_memory_allocated_add(sk, amt);
3054        allocated = sk_memory_allocated(sk);
3055
3056        if (memcg) {
3057                if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()))
3058                        goto suppress_allocation;
3059                charged = true;
3060        }
3061
3062        /* Under limit. */
3063        if (allocated <= sk_prot_mem_limits(sk, 0)) {
3064                sk_leave_memory_pressure(sk);
3065                return 1;
3066        }
3067
3068        /* Under pressure. */
3069        if (allocated > sk_prot_mem_limits(sk, 1))
3070                sk_enter_memory_pressure(sk);
3071
3072        /* Over hard limit. */
3073        if (allocated > sk_prot_mem_limits(sk, 2))
3074                goto suppress_allocation;
3075
3076        /* Guarantee minimum buffer size under pressure (either global
3077         * or memcg) to make sure features described in RFC 7323 (TCP
3078         * Extensions for High Performance) work properly.
3079         *
3080         * This rule does NOT stand when the usage exceeds the global or memcg's
3081         * hard limit, or else a DoS attack could take place by spawning
3082         * lots of sockets whose usage is under the minimum buffer size.
3083         */
3084        if (kind == SK_MEM_RECV) {
3085                if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3086                        return 1;
3087
3088        } else { /* SK_MEM_SEND */
3089                int wmem0 = sk_get_wmem0(sk, prot);
3090
3091                if (sk->sk_type == SOCK_STREAM) {
3092                        if (sk->sk_wmem_queued < wmem0)
3093                                return 1;
3094                } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3095                        return 1;
3096                }
3097        }
3098
3099        if (sk_has_memory_pressure(sk)) {
3100                u64 alloc;
3101
3102                /* The following 'average' heuristic is within the
3103                 * scope of global accounting, so it only makes
3104                 * sense for global memory pressure.
3105                 */
3106                if (!sk_under_global_memory_pressure(sk))
3107                        return 1;
3108
3109                /* Try to be fair among all the sockets under global
3110                 * pressure by allowing the ones whose usage is below
3111                 * average to raise allocated.
3112                 */
3113                alloc = sk_sockets_allocated_read_positive(sk);
3114                if (sk_prot_mem_limits(sk, 2) > alloc *
3115                    sk_mem_pages(sk->sk_wmem_queued +
3116                                 atomic_read(&sk->sk_rmem_alloc) +
3117                                 sk->sk_forward_alloc))
3118                        return 1;
3119        }
3120
3121suppress_allocation:
3122
3123        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
3124                sk_stream_moderate_sndbuf(sk);
3125
3126                /* Fail only if socket is _under_ its sndbuf.
3127                 * In this case we cannot block, so we have to fail.
3128                 */
3129                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
3130                        /* Force charge with __GFP_NOFAIL */
3131                        if (memcg && !charged) {
3132                                mem_cgroup_charge_skmem(memcg, amt,
3133                                        gfp_memcg_charge() | __GFP_NOFAIL);
3134                        }
3135                        return 1;
3136                }
3137        }
3138
3139        if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
3140                trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3141
3142        sk_memory_allocated_sub(sk, amt);
3143
3144        if (charged)
3145                mem_cgroup_uncharge_skmem(memcg, amt);
3146
3147        return 0;
3148}
3149
3150/**
3151 *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3152 *      @sk: socket
3153 *      @size: memory size to allocate
3154 *      @kind: allocation type
3155 *
3156 *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3157 *      rmem allocation. This function assumes that protocols which have
3158 *      memory_pressure use sk_wmem_queued as write buffer accounting.
3159 */
3160int __sk_mem_schedule(struct sock *sk, int size, int kind)
3161{
3162        int ret, amt = sk_mem_pages(size);
3163
3164        sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3165        ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3166        if (!ret)
3167                sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
3168        return ret;
3169}
3170EXPORT_SYMBOL(__sk_mem_schedule);
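
/*
 * Example (illustrative sketch, not part of sock.c): receive paths normally
 * go through the inline wrapper sk_rmem_schedule() (include/net/sock.h),
 * which consumes sk_forward_alloc first and only falls back to
 * __sk_mem_schedule() when the prepaid quota is too small.  The function
 * name below is hypothetical; on failure the caller is expected to drop
 * the skb.
 */
static int example_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
        if (!sk_rmem_schedule(sk, skb, skb->truesize))
                return -ENOBUFS;                /* over the memory limits */

        skb_set_owner_r(skb, sk);               /* charge sk_rmem_alloc/forward_alloc */
        skb_queue_tail(&sk->sk_receive_queue, skb);
        sk->sk_data_ready(sk);
        return 0;
}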
3171
3172/**
3173 *      __sk_mem_reduce_allocated - reclaim memory_allocated
3174 *      @sk: socket
3175 *      @amount: number of quanta
3176 *
3177 *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3178 */
3179void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3180{
3181        sk_memory_allocated_sub(sk, amount);
3182
3183        if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3184                mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3185
3186        if (sk_under_global_memory_pressure(sk) &&
3187            (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3188                sk_leave_memory_pressure(sk);
3189}
3190
3191/**
3192 *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3193 *      @sk: socket
3194 *      @amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3195 */
3196void __sk_mem_reclaim(struct sock *sk, int amount)
3197{
3198        amount >>= PAGE_SHIFT;
3199        sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
3200        __sk_mem_reduce_allocated(sk, amount);
3201}
3202EXPORT_SYMBOL(__sk_mem_reclaim);
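
/*
 * Example (illustrative sketch, not part of sock.c): __sk_mem_reclaim()
 * takes an amount in bytes and rounds it DOWN to whole pages, so only full
 * pages are returned to memory_allocated and the remainder stays accounted
 * in sk_forward_alloc.  The helper name below is hypothetical.
 */
static void example_return_full_pages(struct sock *sk, int bytes)
{
        /* e.g. bytes == 2 * PAGE_SIZE + 100 returns two pages; the extra
         * 100 bytes remain in sk_forward_alloc.
         */
        __sk_mem_reclaim(sk, bytes);
}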
3203
3204int sk_set_peek_off(struct sock *sk, int val)
3205{
3206        WRITE_ONCE(sk->sk_peek_off, val);
3207        return 0;
3208}
3209EXPORT_SYMBOL_GPL(sk_set_peek_off);
3210
3211/*
3212 * Set of default routines for initialising struct proto_ops when
3213 * the protocol does not support a particular function. In certain
3214 * cases where it makes no sense for a protocol to have a "do nothing"
3215 * function, some default processing is provided.
3216 */
3217
3218int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3219{
3220        return -EOPNOTSUPP;
3221}
3222EXPORT_SYMBOL(sock_no_bind);
3223
3224int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
3225                    int len, int flags)
3226{
3227        return -EOPNOTSUPP;
3228}
3229EXPORT_SYMBOL(sock_no_connect);
3230
3231int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3232{
3233        return -EOPNOTSUPP;
3234}
3235EXPORT_SYMBOL(sock_no_socketpair);
3236
3237int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
3238                   bool kern)
3239{
3240        return -EOPNOTSUPP;
3241}
3242EXPORT_SYMBOL(sock_no_accept);
3243
3244int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3245                    int peer)
3246{
3247        return -EOPNOTSUPP;
3248}
3249EXPORT_SYMBOL(sock_no_getname);
3250
3251int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3252{
3253        return -EOPNOTSUPP;
3254}
3255EXPORT_SYMBOL(sock_no_ioctl);
3256
3257int sock_no_listen(struct socket *sock, int backlog)
3258{
3259        return -EOPNOTSUPP;
3260}
3261EXPORT_SYMBOL(sock_no_listen);
3262
3263int sock_no_shutdown(struct socket *sock, int how)
3264{
3265        return -EOPNOTSUPP;
3266}
3267EXPORT_SYMBOL(sock_no_shutdown);
3268
3269int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3270{
3271        return -EOPNOTSUPP;
3272}
3273EXPORT_SYMBOL(sock_no_sendmsg);
3274
3275int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3276{
3277        return -EOPNOTSUPP;
3278}
3279EXPORT_SYMBOL(sock_no_sendmsg_locked);
3280
3281int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3282                    int flags)
3283{
3284        return -EOPNOTSUPP;
3285}
3286EXPORT_SYMBOL(sock_no_recvmsg);
3287
3288int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3289{
3290        /* Mirror missing mmap method error code */
3291        return -ENODEV;
3292}
3293EXPORT_SYMBOL(sock_no_mmap);
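
/*
 * Example (illustrative sketch, not part of sock.c): a datagram-style
 * protocol that has no notion of connecting, listening or mmap can point
 * the corresponding proto_ops members at the sock_no_*() stubs above and
 * only implement what it actually supports.  The struct below is
 * hypothetical and intentionally incomplete.
 */
static const struct proto_ops example_dgram_ops = {
        .family         = PF_UNSPEC,            /* placeholder family */
        .owner          = THIS_MODULE,
        .connect        = sock_no_connect,
        .socketpair     = sock_no_socketpair,
        .accept         = sock_no_accept,
        .listen         = sock_no_listen,
        .shutdown       = sock_no_shutdown,
        .mmap           = sock_no_mmap,
        /* .release, .bind, .sendmsg, .recvmsg, ... supplied by the protocol */
};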
3294
3295/*
3296 * When a file is received (via SCM_RIGHTS, etc), we must bump the
3297 * various sock-based usage counts.
3298 */
3299void __receive_sock(struct file *file)
3300{
3301        struct socket *sock;
3302
3303        sock = sock_from_file(file);
3304        if (sock) {
3305                sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3306                sock_update_classid(&sock->sk->sk_cgrp_data);
3307        }
3308}
3309
3310/*
3311 *      Default Socket Callbacks
3312 */
3313
3314static void sock_def_wakeup(struct sock *sk)
3315{
3316        struct socket_wq *wq;
3317
3318        rcu_read_lock();
3319        wq = rcu_dereference(sk->sk_wq);
3320        if (skwq_has_sleeper(wq))
3321                wake_up_interruptible_all(&wq->wait);
3322        rcu_read_unlock();
3323}
3324
3325static void sock_def_error_report(struct sock *sk)
3326{
3327        struct socket_wq *wq;
3328
3329        rcu_read_lock();
3330        wq = rcu_dereference(sk->sk_wq);
3331        if (skwq_has_sleeper(wq))
3332                wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3333        sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3334        rcu_read_unlock();
3335}
3336
3337void sock_def_readable(struct sock *sk)
3338{
3339        struct socket_wq *wq;
3340
3341        trace_sk_data_ready(sk);
3342
3343        rcu_read_lock();
3344        wq = rcu_dereference(sk->sk_wq);
3345        if (skwq_has_sleeper(wq))
3346                wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3347                                                EPOLLRDNORM | EPOLLRDBAND);
3348        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3349        rcu_read_unlock();
3350}
3351
3352static void sock_def_write_space(struct sock *sk)
3353{
3354        struct socket_wq *wq;
3355
3356        rcu_read_lock();
3357
3358        /* Do not wake up a writer until he can make "significant"
3359         * progress.  --DaveM
3360         */
3361        if (sock_writeable(sk)) {
3362                wq = rcu_dereference(sk->sk_wq);
3363                if (skwq_has_sleeper(wq))
3364                        wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3365                                                EPOLLWRNORM | EPOLLWRBAND);
3366
3367                /* Should agree with poll, otherwise some programs break */
3368                sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3369        }
3370
3371        rcu_read_unlock();
3372}
3373
3374/* An optimised version of sock_def_write_space(); it should only be called
3375 * for SOCK_RCU_FREE sockets, inside an RCU read-side section, and after
3376 * ->sk_wmem_alloc has been put.
3377 */
3378static void sock_def_write_space_wfree(struct sock *sk)
3379{
3380        /* Do not wake up a writer until he can make "significant"
3381         * progress.  --DaveM
3382         */
3383        if (sock_writeable(sk)) {
3384                struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3385
3386                /* rely on refcount_sub from sock_wfree() */
3387                smp_mb__after_atomic();
3388                if (wq && waitqueue_active(&wq->wait))
3389                        wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3390                                                EPOLLWRNORM | EPOLLWRBAND);
3391
3392                /* Should agree with poll, otherwise some programs break */
3393                sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3394        }
3395}
3396
3397static void sock_def_destruct(struct sock *sk)
3398{
3399}
3400
3401void sk_send_sigurg(struct sock *sk)
3402{
3403        if (sk->sk_socket && sk->sk_socket->file)
3404                if (send_sigurg(&sk->sk_socket->file->f_owner))
3405                        sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3406}
3407EXPORT_SYMBOL(sk_send_sigurg);
3408
3409void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3410                    unsigned long expires)
3411{
3412        if (!mod_timer(timer, expires))
3413                sock_hold(sk);
3414}
3415EXPORT_SYMBOL(sk_reset_timer);
3416
3417void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3418{
3419        if (del_timer(timer))
3420                __sock_put(sk);
3421}
3422EXPORT_SYMBOL(sk_stop_timer);
3423
3424void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3425{
3426        if (del_timer_sync(timer))
3427                __sock_put(sk);
3428}
3429EXPORT_SYMBOL(sk_stop_timer_sync);
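
/*
 * Example (illustrative sketch, not part of sock.c): sk_reset_timer() takes
 * a socket reference only when the timer was not already pending, so the
 * matching sock_put() happens either in the expiry handler or implicitly
 * via sk_stop_timer()/sk_stop_timer_sync().  The names below are
 * hypothetical and assume the protocol set the handler up with
 * timer_setup(&sk->sk_timer, example_sk_timer, 0).
 */
static void example_sk_timer(struct timer_list *t)
{
        struct sock *sk = from_timer(sk, t, sk_timer);

        bh_lock_sock(sk);
        /* ... protocol-specific timer work ... */
        bh_unlock_sock(sk);
        sock_put(sk);           /* pairs with sock_hold() in sk_reset_timer() */
}

static void example_arm_sk_timer(struct sock *sk)
{
        sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ);
}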
3430
3431void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3432{
3433        sk_init_common(sk);
3434        sk->sk_send_head        =       NULL;
3435
3436        timer_setup(&sk->sk_timer, NULL, 0);
3437
3438        sk->sk_allocation       =       GFP_KERNEL;
3439        sk->sk_rcvbuf           =       READ_ONCE(sysctl_rmem_default);
3440        sk->sk_sndbuf           =       READ_ONCE(sysctl_wmem_default);
3441        sk->sk_state            =       TCP_CLOSE;
3442        sk->sk_use_task_frag    =       true;
3443        sk_set_socket(sk, sock);
3444
3445        sock_set_flag(sk, SOCK_ZAPPED);
3446
3447        if (sock) {
3448                sk->sk_type     =       sock->type;
3449                RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3450                sock->sk        =       sk;
3451        } else {
3452                RCU_INIT_POINTER(sk->sk_wq, NULL);
3453        }
3454        sk->sk_uid      =       uid;
3455
3456        rwlock_init(&sk->sk_callback_lock);
3457        if (sk->sk_kern_sock)
3458                lockdep_set_class_and_name(
3459                        &sk->sk_callback_lock,
3460                        af_kern_callback_keys + sk->sk_family,
3461                        af_family_kern_clock_key_strings[sk->sk_family]);
3462        else
3463                lockdep_set_class_and_name(
3464                        &sk->sk_callback_lock,
3465                        af_callback_keys + sk->sk_family,
3466                        af_family_clock_key_strings[sk->sk_family]);
3467
3468        sk->sk_state_change     =       sock_def_wakeup;
3469        sk->sk_data_ready       =       sock_def_readable;
3470        sk->sk_write_space      =       sock_def_write_space;
3471        sk->sk_error_report     =       sock_def_error_report;
3472        sk->sk_destruct         =       sock_def_destruct;
3473
3474        sk->sk_frag.page        =       NULL;
3475        sk->sk_frag.offset      =       0;
3476        sk->sk_peek_off         =       -1;
3477
3478        sk->sk_peer_pid         =       NULL;
3479        sk->sk_peer_cred        =       NULL;
3480        spin_lock_init(&sk->sk_peer_lock);
3481
3482        sk->sk_write_pending    =       0;
3483        sk->sk_rcvlowat         =       1;
3484        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
3485        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
3486
3487        sk->sk_stamp = SK_DEFAULT_STAMP;
3488#if BITS_PER_LONG==32
3489        seqlock_init(&sk->sk_stamp_seq);
3490#endif
3491        atomic_set(&sk->sk_zckey, 0);
3492
3493#ifdef CONFIG_NET_RX_BUSY_POLL
3494        sk->sk_napi_id          =       0;
3495        sk->sk_ll_usec          =       READ_ONCE(sysctl_net_busy_read);
3496#endif
3497
3498        sk->sk_max_pacing_rate = ~0UL;
3499        sk->sk_pacing_rate = ~0UL;
3500        WRITE_ONCE(sk->sk_pacing_shift, 10);
3501        sk->sk_incoming_cpu = -1;
3502
3503        sk_rx_queue_clear(sk);
3504        /*
3505         * Before updating sk_refcnt, we must commit prior changes to memory
3506         * (Documentation/RCU/rculist_nulls.rst for details)
3507         */
3508        smp_wmb();
3509        refcount_set(&sk->sk_refcnt, 1);
3510        atomic_set(&sk->sk_drops, 0);
3511}
3512EXPORT_SYMBOL(sock_init_data_uid);
3513
3514void sock_init_data(struct socket *sock, struct sock *sk)
3515{
3516        kuid_t uid = sock ?
3517                SOCK_INODE(sock)->i_uid :
3518                make_kuid(sock_net(sk)->user_ns, 0);
3519
3520        sock_init_data_uid(sock, sk, uid);
3521}
3522EXPORT_SYMBOL(sock_init_data);
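
/*
 * Example (illustrative sketch, not part of sock.c): a protocol's create/init
 * hook typically calls sock_init_data() and then overrides whichever default
 * callbacks it needs; sock_def_readable() is non-static above, so built-in
 * code can chain back to it.  Both function names below are hypothetical.
 */
static void example_data_ready(struct sock *sk)
{
        /* protocol-specific bookkeeping would go here ... */
        sock_def_readable(sk);          /* then do the default wakeup */
}

static void example_init_sock(struct socket *sock, struct sock *sk)
{
        sock_init_data(sock, sk);

        sk->sk_data_ready = example_data_ready;
        /* sk_state_change, sk_write_space, sk_error_report and sk_destruct
         * keep their sock_def_*() defaults unless overridden the same way.
         */
}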
3523
3524void lock_sock_nested(struct sock *sk, int subclass)
3525{
3526        /* The sk_lock has mutex_lock() semantics here. */
3527        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3528
3529        might_sleep();
3530        spin_lock_bh(&sk->sk_lock.slock);
3531        if (sock_owned_by_user_nocheck(sk))
3532                __lock_sock(sk);
3533        sk->sk_lock.owned = 1;
3534        spin_unlock_bh(&sk->sk_lock.slock);
3535}
3536EXPORT_SYMBOL(lock_sock_nested);
3537
3538void release_sock(struct sock *sk)
3539{
3540        spin_lock_bh(&sk->sk_lock.slock);
3541        if (sk->sk_backlog.tail)
3542                __release_sock(sk);
3543
3544        if (sk->sk_prot->release_cb)
3545                INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3546                                     tcp_release_cb, sk);
3547
3548        sock_release_ownership(sk);
3549        if (waitqueue_active(&sk->sk_lock.wq))
3550                wake_up(&sk->sk_lock.wq);
3551        spin_unlock_bh(&sk->sk_lock.slock);
3552}
3553EXPORT_SYMBOL(release_sock);
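
/*
 * Example (illustrative sketch, not part of sock.c): the usual process-context
 * pattern around the socket lock.  While the lock is owned, softirq input is
 * diverted to the backlog, which release_sock() flushes above before invoking
 * the protocol's release_cb and waking other lockers.  The function name is
 * hypothetical.
 */
static void example_set_rcvlowat(struct sock *sk, int val)
{
        lock_sock(sk);                  /* i.e. lock_sock_nested(sk, 0) */
        WRITE_ONCE(sk->sk_rcvlowat, val ? val : 1);
        release_sock(sk);
}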
3554
3555bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3556{
3557        might_sleep();
3558        spin_lock_bh(&sk->sk_lock.slock);
3559
3560        if (!sock_owned_by_user_nocheck(sk)) {
3561                /*
3562                 * Fast path return with bottom halves disabled and
3563                 * sock::sk_lock.slock held.
3564                 *
3565                 * The 'mutex' is not contended and holding
3566                 * sock::sk_lock.slock prevents all other lockers to
3567                 * proceed so the corresponding unlock_sock_fast() can
3568                 * avoid the slow path of release_sock() completely and
3569                 * just release slock.
3570                 *
3571                 * From a semantical POV this is equivalent to 'acquiring'
3572                 * the 'mutex', hence the corresponding lockdep
3573                 * mutex_release() has to happen in the fast path of
3574                 * unlock_sock_fast().
3575                 */
3576                return false;
3577        }
3578
3579        __lock_sock(sk);
3580        sk->sk_lock.owned = 1;
3581        __acquire(&sk->sk_lock.slock);
3582        spin_unlock_bh(&sk->sk_lock.slock);
3583        return true;
3584}
3585EXPORT_SYMBOL(__lock_sock_fast);
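
/*
 * Example (illustrative sketch, not part of sock.c): callers use the inline
 * wrapper lock_sock_fast() (include/net/sock.h) around __lock_sock_fast() and
 * must feed its return value back to unlock_sock_fast(), which either just
 * drops slock (fast path) or does a full release_sock() (slow path).  The
 * function name is hypothetical.
 */
static int example_peek_sk_err(struct sock *sk)
{
        bool slow = lock_sock_fast(sk);
        int err = sk->sk_err;

        unlock_sock_fast(sk, slow);
        return err;
}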
3586
3587int sock_gettstamp(struct socket *sock, void __user *userstamp,
3588                   bool timeval, bool time32)
3589{
3590        struct sock *sk = sock->sk;
3591        struct timespec64 ts;
3592
3593        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3594        ts = ktime_to_timespec64(sock_read_timestamp(sk));
3595        if (ts.tv_sec == -1)
3596                return -ENOENT;
3597        if (ts.tv_sec == 0) {
3598                ktime_t kt = ktime_get_real();
3599                sock_write_timestamp(sk, kt);
3600                ts = ktime_to_timespec64(kt);
3601        }
3602
3603        if (timeval)
3604                ts.tv_nsec /= 1000;
3605
3606#ifdef CONFIG_COMPAT_32BIT_TIME
3607        if (time32)
3608                return put_old_timespec32(&ts, userstamp);
3609#endif
3610#ifdef CONFIG_SPARC64
3611        /* beware of padding in sparc64 timeval */
3612        if (timeval && !in_compat_syscall()) {
3613                struct __kernel_old_timeval __user tv = {
3614                        .tv_sec = ts.tv_sec,
3615                        .tv_usec = ts.tv_nsec,
3616                };
3617                if (copy_to_user(userstamp, &tv, sizeof(tv)))
3618                        return -EFAULT;
3619                return 0;
3620        }
3621#endif
3622        return put_timespec64(&ts, userstamp);
3623}
3624EXPORT_SYMBOL(sock_gettstamp);
3625
3626void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3627{
3628        if (!sock_flag(sk, flag)) {
3629                unsigned long previous_flags = sk->sk_flags;
3630
3631                sock_set_flag(sk, flag);
3632                /*
3633                 * we just set one of the two flags which require net
3634                 * time stamping, but time stamping might have been on
3635                 * already because of the other one
3636                 */
3637                if (sock_needs_netstamp(sk) &&
3638                    !(previous_flags & SK_FLAGS_TIMESTAMP))
3639                        net_enable_timestamp();
3640        }
3641}
3642
3643int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3644                       int level, int type)
3645{
3646        struct sock_exterr_skb *serr;
3647        struct sk_buff *skb;
3648        int copied, err;
3649
3650        err = -EAGAIN;
3651        skb = sock_dequeue_err_skb(sk);
3652        if (skb == NULL)
3653                goto out;
3654
3655        copied = skb->len;
3656        if (copied > len) {
3657                msg->msg_flags |= MSG_TRUNC;
3658                copied = len;
3659        }
3660        err = skb_copy_datagram_msg(skb, 0, msg, copied);
3661        if (err)
3662                goto out_free_skb;
3663
3664        sock_recv_timestamp(msg, sk, skb);
3665
3666        serr = SKB_EXT_ERR(skb);
3667        put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3668
3669        msg->msg_flags |= MSG_ERRQUEUE;
3670        err = copied;
3671
3672out_free_skb:
3673        kfree_skb(skb);
3674out:
3675        return err;
3676}
3677EXPORT_SYMBOL(sock_recv_errqueue);
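
/*
 * Example (illustrative sketch, not part of sock.c): protocols that queue
 * their errors on the generic sk_error_queue can satisfy MSG_ERRQUEUE reads
 * by delegating to sock_recv_errqueue().  The cmsg level/type shown follow
 * the AF_PACKET convention (and assume its headers); other families pass
 * their own values.  The function name is hypothetical.
 */
static int example_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
                           int flags, int *addr_len)
{
        if (flags & MSG_ERRQUEUE)
                return sock_recv_errqueue(sk, msg, len,
                                          SOL_PACKET, PACKET_TX_TIMESTAMP);

        /* ... normal receive path would follow here ... */
        return -EAGAIN;
}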
3678
3679/*
3680 *      Get a socket option on a socket.
3681 *
3682 *      FIX: POSIX 1003.1g is very ambiguous here. It states that
3683 *      asynchronous errors should be reported by getsockopt. We assume
3684 *      this means if you specify SO_ERROR (otherwise what's the point of it).
3685 */
3686int sock_common_getsockopt(struct socket *sock, int level, int optname,
3687                           char __user *optval, int __user *optlen)
3688{
3689        struct sock *sk = sock->sk;
3690
3691        /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3692        return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3693}
3694EXPORT_SYMBOL(sock_common_getsockopt);
3695
3696int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3697                        int flags)
3698{
3699        struct sock *sk = sock->sk;
3700        int addr_len = 0;
3701        int err;
3702
3703        err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3704        if (err >= 0)
3705                msg->msg_namelen = addr_len;
3706        return err;
3707}
3708EXPORT_SYMBOL(sock_common_recvmsg);
3709
3710/*
3711 *      Set socket options on an inet socket.
3712 */
3713int sock_common_setsockopt(struct socket *sock, int level, int optname,
3714                           sockptr_t optval, unsigned int optlen)
3715{
3716        struct sock *sk = sock->sk;
3717
3718        /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3719        return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3720}
3721EXPORT_SYMBOL(sock_common_setsockopt);
3722
3723void sk_common_release(struct sock *sk)
3724{
3725        if (sk->sk_prot->destroy)
3726                sk->sk_prot->destroy(sk);
3727
3728        /*
3729         * Observation: when sk_common_release() is called, processes have
3730         * no access to the socket any more, but the network stack still does.
3731         * Step one, detach it from networking:
3732         *
3733         * A. Remove from hash tables.
3734         */
3735
3736        sk->sk_prot->unhash(sk);
3737
3738        /*
3739         * At this point the socket cannot receive new packets, but it is
3740         * possible that some packets are in flight, because some CPU ran the
3741         * receiver and did the hash table lookup before we unhashed the socket.
3742         * They will reach the receive queue and be purged by the socket destructor.
3743         *
3744         * Also, we still have packets pending on the receive queue, and probably
3745         * our own packets waiting in device queues. sock_destroy() will drain the
3746         * receive queue, but transmitted packets will delay socket destruction
3747         * until the last reference is released.
3748         */
3749
3750        sock_orphan(sk);
3751
3752        xfrm_sk_free_policy(sk);
3753
3754        sock_put(sk);
3755}
3756EXPORT_SYMBOL(sk_common_release);
3757
3758void sk_get_meminfo(const struct sock *sk, u32 *mem)
3759{
3760        memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3761
3762        mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3763        mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3764        mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3765        mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3766        mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk);
3767        mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3768        mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3769        mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3770        mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3771}
3772
3773#ifdef CONFIG_PROC_FS
3774static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3775
3776int sock_prot_inuse_get(struct net *net, struct proto *prot)
3777{
3778        int cpu, idx = prot->inuse_idx;
3779        int res = 0;
3780
3781        for_each_possible_cpu(cpu)
3782                res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3783
3784        return res >= 0 ? res : 0;
3785}
3786EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3787
3788int sock_inuse_get(struct net *net)
3789{
3790        int cpu, res = 0;
3791
3792        for_each_possible_cpu(cpu)
3793                res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3794
3795        return res;
3796}
3797
3798EXPORT_SYMBOL_GPL(sock_inuse_get);
3799
3800static int __net_init sock_inuse_init_net(struct net *net)
3801{
3802        net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3803        if (net->core.prot_inuse == NULL)
3804                return -ENOMEM;
3805        return 0;
3806}
3807
3808static void __net_exit sock_inuse_exit_net(struct net *net)
3809{
3810        free_percpu(net->core.prot_inuse);
3811}
3812
3813static struct pernet_operations net_inuse_ops = {
3814        .init = sock_inuse_init_net,
3815        .exit = sock_inuse_exit_net,
3816};
3817
3818static __init int net_inuse_init(void)
3819{
3820        if (register_pernet_subsys(&net_inuse_ops))
3821                panic("Cannot initialize net inuse counters");
3822
3823        return 0;
3824}
3825
3826core_initcall(net_inuse_init);
3827
3828static int assign_proto_idx(struct proto *prot)
3829{
3830        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3831
3832        if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3833                pr_err("PROTO_INUSE_NR exhausted\n");
3834                return -ENOSPC;
3835        }
3836
3837        set_bit(prot->inuse_idx, proto_inuse_idx);
3838        return 0;
3839}
3840
3841static void release_proto_idx(struct proto *prot)
3842{
3843        if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3844                clear_bit(prot->inuse_idx, proto_inuse_idx);
3845}
3846#else
3847static inline int assign_proto_idx(struct proto *prot)
3848{
3849        return 0;
3850}
3851
3852static inline void release_proto_idx(struct proto *prot)
3853{
3854}
3855
3856#endif
3857
3858static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3859{
3860        if (!twsk_prot)
3861                return;
3862        kfree(twsk_prot->twsk_slab_name);
3863        twsk_prot->twsk_slab_name = NULL;
3864        kmem_cache_destroy(twsk_prot->twsk_slab);
3865        twsk_prot->twsk_slab = NULL;
3866}
3867
3868static int tw_prot_init(const struct proto *prot)
3869{
3870        struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3871
3872        if (!twsk_prot)
3873                return 0;
3874
3875        twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3876                                              prot->name);
3877        if (!twsk_prot->twsk_slab_name)
3878                return -ENOMEM;
3879
3880        twsk_prot->twsk_slab =
3881                kmem_cache_create(twsk_prot->twsk_slab_name,
3882                                  twsk_prot->twsk_obj_size, 0,
3883                                  SLAB_ACCOUNT | prot->slab_flags,
3884                                  NULL);
3885        if (!twsk_prot->twsk_slab) {
3886                pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3887                        prot->name);
3888                return -ENOMEM;
3889        }
3890
3891        return 0;
3892}
3893
3894static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3895{
3896        if (!rsk_prot)
3897                return;
3898        kfree(rsk_prot->slab_name);
3899        rsk_prot->slab_name = NULL;
3900        kmem_cache_destroy(rsk_prot->slab);
3901        rsk_prot->slab = NULL;
3902}
3903
3904static int req_prot_init(const struct proto *prot)
3905{
3906        struct request_sock_ops *rsk_prot = prot->rsk_prot;
3907
3908        if (!rsk_prot)
3909                return 0;
3910
3911        rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3912                                        prot->name);
3913        if (!rsk_prot->slab_name)
3914                return -ENOMEM;
3915
3916        rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3917                                           rsk_prot->obj_size, 0,
3918                                           SLAB_ACCOUNT | prot->slab_flags,
3919                                           NULL);
3920
3921        if (!rsk_prot->slab) {
3922                pr_crit("%s: Can't create request sock SLAB cache!\n",
3923                        prot->name);
3924                return -ENOMEM;
3925        }
3926        return 0;
3927}
3928
3929int proto_register(struct proto *prot, int alloc_slab)
3930{
3931        int ret = -ENOBUFS;
3932
3933        if (prot->memory_allocated && !prot->sysctl_mem) {
3934                pr_err("%s: missing sysctl_mem\n", prot->name);
3935                return -EINVAL;
3936        }
3937        if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
3938                pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
3939                return -EINVAL;
3940        }
3941        if (alloc_slab) {
3942                prot->slab = kmem_cache_create_usercopy(prot->name,
3943                                        prot->obj_size, 0,
3944                                        SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3945                                        prot->slab_flags,
3946                                        prot->useroffset, prot->usersize,
3947                                        NULL);
3948
3949                if (prot->slab == NULL) {
3950                        pr_crit("%s: Can't create sock SLAB cache!\n",
3951                                prot->name);
3952                        goto out;
3953                }
3954
3955                if (req_prot_init(prot))
3956                        goto out_free_request_sock_slab;
3957
3958                if (tw_prot_init(prot))
3959                        goto out_free_timewait_sock_slab;
3960        }
3961
3962        mutex_lock(&proto_list_mutex);
3963        ret = assign_proto_idx(prot);
3964        if (ret) {
3965                mutex_unlock(&proto_list_mutex);
3966                goto out_free_timewait_sock_slab;
3967        }
3968        list_add(&prot->node, &proto_list);
3969        mutex_unlock(&proto_list_mutex);
3970        return ret;
3971
3972out_free_timewait_sock_slab:
3973        if (alloc_slab)
3974                tw_prot_cleanup(prot->twsk_prot);
3975out_free_request_sock_slab:
3976        if (alloc_slab) {
3977                req_prot_cleanup(prot->rsk_prot);
3978
3979                kmem_cache_destroy(prot->slab);
3980                prot->slab = NULL;
3981        }
3982out:
3983        return ret;
3984}
3985EXPORT_SYMBOL(proto_register);
3986
3987void proto_unregister(struct proto *prot)
3988{
3989        mutex_lock(&proto_list_mutex);
3990        release_proto_idx(prot);
3991        list_del(&prot->node);
3992        mutex_unlock(&proto_list_mutex);
3993
3994        kmem_cache_destroy(prot->slab);
3995        prot->slab = NULL;
3996
3997        req_prot_cleanup(prot->rsk_prot);
3998        tw_prot_cleanup(prot->twsk_prot);
3999}
4000EXPORT_SYMBOL(proto_unregister);
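
/*
 * Example (illustrative sketch, not part of sock.c): a minimal protocol
 * registration.  Passing alloc_slab == 1 lets proto_register() create the
 * per-protocol sock slab (and request/timewait slabs when rsk_prot/twsk_prot
 * are set).  The struct, names and module hooks below are hypothetical.
 */
static struct proto example_proto = {
        .name           = "EXAMPLE",
        .owner          = THIS_MODULE,
        .obj_size       = sizeof(struct sock),  /* usually a larger protocol sock */
};

static int __init example_proto_init(void)
{
        return proto_register(&example_proto, 1);
}

static void __exit example_proto_exit(void)
{
        proto_unregister(&example_proto);
}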
4001
4002int sock_load_diag_module(int family, int protocol)
4003{
4004        if (!protocol) {
4005                if (!sock_is_registered(family))
4006                        return -ENOENT;
4007
4008                return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
4009                                      NETLINK_SOCK_DIAG, family);
4010        }
4011
4012#ifdef CONFIG_INET
4013        if (family == AF_INET &&
4014            protocol != IPPROTO_RAW &&
4015            protocol < MAX_INET_PROTOS &&
4016            !rcu_access_pointer(inet_protos[protocol]))
4017                return -ENOENT;
4018#endif
4019
4020        return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
4021                              NETLINK_SOCK_DIAG, family, protocol);
4022}
4023EXPORT_SYMBOL(sock_load_diag_module);
4024
4025#ifdef CONFIG_PROC_FS
4026static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
4027        __acquires(proto_list_mutex)
4028{
4029        mutex_lock(&proto_list_mutex);
4030        return seq_list_start_head(&proto_list, *pos);
4031}
4032
4033static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4034{
4035        return seq_list_next(v, &proto_list, pos);
4036}
4037
4038static void proto_seq_stop(struct seq_file *seq, void *v)
4039        __releases(proto_list_mutex)
4040{
4041        mutex_unlock(&proto_list_mutex);
4042}
4043
4044static char proto_method_implemented(const void *method)
4045{
4046        return method == NULL ? 'n' : 'y';
4047}
4048static long sock_prot_memory_allocated(struct proto *proto)
4049{
4050        return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4051}
4052
4053static const char *sock_prot_memory_pressure(struct proto *proto)
4054{
4055        return proto->memory_pressure != NULL ?
4056        proto_memory_pressure(proto) ? "yes" : "no" : "NI";
4057}
4058
4059static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
4060{
4061
4062        seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
4063                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4064                   proto->name,
4065                   proto->obj_size,
4066                   sock_prot_inuse_get(seq_file_net(seq), proto),
4067                   sock_prot_memory_allocated(proto),
4068                   sock_prot_memory_pressure(proto),
4069                   proto->max_header,
4070                   proto->slab == NULL ? "no" : "yes",
4071                   module_name(proto->owner),
4072                   proto_method_implemented(proto->close),
4073                   proto_method_implemented(proto->connect),
4074                   proto_method_implemented(proto->disconnect),
4075                   proto_method_implemented(proto->accept),
4076                   proto_method_implemented(proto->ioctl),
4077                   proto_method_implemented(proto->init),
4078                   proto_method_implemented(proto->destroy),
4079                   proto_method_implemented(proto->shutdown),
4080                   proto_method_implemented(proto->setsockopt),
4081                   proto_method_implemented(proto->getsockopt),
4082                   proto_method_implemented(proto->sendmsg),
4083                   proto_method_implemented(proto->recvmsg),
4084                   proto_method_implemented(proto->bind),
4085                   proto_method_implemented(proto->backlog_rcv),
4086                   proto_method_implemented(proto->hash),
4087                   proto_method_implemented(proto->unhash),
4088                   proto_method_implemented(proto->get_port),
4089                   proto_method_implemented(proto->enter_memory_pressure));
4090}
4091
4092static int proto_seq_show(struct seq_file *seq, void *v)
4093{
4094        if (v == &proto_list)
4095                seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4096                           "protocol",
4097                           "size",
4098                           "sockets",
4099                           "memory",
4100                           "press",
4101                           "maxhdr",
4102                           "slab",
4103                           "module",
4104                           "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4105        else
4106                proto_seq_printf(seq, list_entry(v, struct proto, node));
4107        return 0;
4108}
4109
4110static const struct seq_operations proto_seq_ops = {
4111        .start  = proto_seq_start,
4112        .next   = proto_seq_next,
4113        .stop   = proto_seq_stop,
4114        .show   = proto_seq_show,
4115};
4116
4117static __net_init int proto_init_net(struct net *net)
4118{
4119        if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4120                        sizeof(struct seq_net_private)))
4121                return -ENOMEM;
4122
4123        return 0;
4124}
4125
4126static __net_exit void proto_exit_net(struct net *net)
4127{
4128        remove_proc_entry("protocols", net->proc_net);
4129}
4130
4131
4132static __net_initdata struct pernet_operations proto_net_ops = {
4133        .init = proto_init_net,
4134        .exit = proto_exit_net,
4135};
4136
4137static int __init proto_init(void)
4138{
4139        return register_pernet_subsys(&proto_net_ops);
4140}
4141
4142subsys_initcall(proto_init);
4143
4144#endif /* PROC_FS */
4145
4146#ifdef CONFIG_NET_RX_BUSY_POLL
4147bool sk_busy_loop_end(void *p, unsigned long start_time)
4148{
4149        struct sock *sk = p;
4150
4151        return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
4152               sk_busy_loop_timeout(sk, start_time);
4153}
4154EXPORT_SYMBOL(sk_busy_loop_end);
4155#endif /* CONFIG_NET_RX_BUSY_POLL */
4156
4157int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4158{
4159        if (!sk->sk_prot->bind_add)
4160                return -EOPNOTSUPP;
4161        return sk->sk_prot->bind_add(sk, addr, addr_len);
4162}
4163EXPORT_SYMBOL(sock_bind_add);
4164
4165/* Copy 'size' bytes in from userspace and copy 'size' result bytes back out */
4166int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4167                     void __user *arg, void *karg, size_t size)
4168{
4169        int ret;
4170
4171        if (copy_from_user(karg, arg, size))
4172                return -EFAULT;
4173
4174        ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4175        if (ret)
4176                return ret;
4177
4178        if (copy_to_user(arg, karg, size))
4179                return -EFAULT;
4180
4181        return 0;
4182}
4183EXPORT_SYMBOL(sock_ioctl_inout);
4184
4185/* This is the most common ioctl prep function, where the result (4 bytes) is
4186 * copied back to userspace if the ioctl() returns successfully. No input is
4187 * copied from userspace.
4188 */
4189static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4190{
4191        int ret, karg = 0;
4192
4193        ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4194        if (ret)
4195                return ret;
4196
4197        return put_user(karg, (int __user *)arg);
4198}
4199
4200/* A wrapper around sock ioctls, which copies the data from userspace
4201 * (depending on the protocol/ioctl), and copies back the result to userspace.
4202 * The main motivation for this function is to pass kernel memory to the
4203 * protocol ioctl callbacks, instead of userspace memory.
4204 */
4205int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4206{
4207        int rc = 1;
4208
4209        if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4210                rc = ipmr_sk_ioctl(sk, cmd, arg);
4211        else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4212                rc = ip6mr_sk_ioctl(sk, cmd, arg);
4213        else if (sk_is_phonet(sk))
4214                rc = phonet_sk_ioctl(sk, cmd, arg);
4215
4216        /* If the ioctl was processed, return its value */
4217        if (rc <= 0)
4218                return rc;
4219
4220        /* Otherwise call the default handler */
4221        return sock_ioctl_out(sk, cmd, arg);
4222}
4223EXPORT_SYMBOL(sk_ioctl);
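
/*
 * Example (illustrative sketch, not part of sock.c): with sk_ioctl() in place,
 * a protocol's ->ioctl callback works purely on kernel memory; for the common
 * "return one int" commands the result is written through karg and
 * sock_ioctl_out() copies it to userspace.  The callback signature is assumed
 * to match struct proto's ->ioctl of this era; the handler name is
 * hypothetical.
 */
static int example_proto_ioctl(struct sock *sk, int cmd, int *karg)
{
        switch (cmd) {
        case SIOCINQ:
                *karg = sk_rmem_alloc_get(sk);  /* bytes queued for reading */
                return 0;
        default:
                return -ENOIOCTLCMD;
        }
}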
4224