linux/net/core/sock.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4 *              operating system.  INET is implemented using the  BSD Socket
   5 *              interface as the means of communication with the user level.
   6 *
   7 *              Generic socket support routines. Memory allocators, socket lock/release
   8 *              handler for protocols to use and generic option handler.
   9 *
  10 * Authors:     Ross Biro
  11 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *              Florian La Roche, <flla@stud.uni-sb.de>
  13 *              Alan Cox, <A.Cox@swansea.ac.uk>
  14 *
  15 * Fixes:
  16 *              Alan Cox        :       Numerous verify_area() problems
  17 *              Alan Cox        :       Connecting on a connecting socket
  18 *                                      now returns an error for tcp.
  19 *              Alan Cox        :       sock->protocol is set correctly.
  20 *                                      and is not sometimes left as 0.
  21 *              Alan Cox        :       connect handles icmp errors on a
  22 *                                      connect properly. Unfortunately there
  23 *                                      is a restart syscall nasty there. I
  24 *                                      can't match BSD without hacking the C
  25 *                                      library. Ideas urgently sought!
  26 *              Alan Cox        :       Disallow bind() to addresses that are
  27 *                                      not ours - especially broadcast ones!!
  28 *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29 *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30 *                                      instead they leave that for the DESTROY timer.
  31 *              Alan Cox        :       Clean up error flag in accept
  32 *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33 *                                      was buggy. Put a remove_sock() in the handler
  34 *                                      for memory when we hit 0. Also altered the timer
  35 *                                      code. The ACK stuff can wait and needs major
  36 *                                      TCP layer surgery.
  37 *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38 *                                      and fixed timer/inet_bh race.
  39 *              Alan Cox        :       Added zapped flag for TCP
  40 *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41 *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42 *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43 *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44 *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45 *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46 *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47 *      Pauline Middelink       :       identd support
  48 *              Alan Cox        :       Fixed connect() taking signals I think.
  49 *              Alan Cox        :       SO_LINGER supported
  50 *              Alan Cox        :       Error reporting fixes
  51 *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52 *              Alan Cox        :       inet sockets don't set sk->type!
  53 *              Alan Cox        :       Split socket option code
  54 *              Alan Cox        :       Callbacks
  55 *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56 *              Alex            :       Removed restriction on inet fioctl
  57 *              Alan Cox        :       Splitting INET from NET core
  58 *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59 *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60 *              Alan Cox        :       Split IP from generic code
  61 *              Alan Cox        :       New kfree_skbmem()
  62 *              Alan Cox        :       Make SO_DEBUG superuser only.
  63 *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64 *                                      (compatibility fix)
  65 *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66 *              Alan Cox        :       Allocator for a socket is settable.
  67 *              Alan Cox        :       SO_ERROR includes soft errors.
  68 *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69 *              Alan Cox        :       Generic socket allocation to make hooks
  70 *                                      easier (suggested by Craig Metz).
  71 *              Michael Pall    :       SO_ERROR returns positive errno again
  72 *              Steve Whitehouse:       Added default destructor to free
  73 *                                      protocol private data.
  74 *              Steve Whitehouse:       Added various other default routines
  75 *                                      common to several socket families.
  76 *              Chris Evans     :       Call suser() check last on F_SETOWN
  77 *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78 *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79 *              Andi Kleen      :       Fix write_space callback
  80 *              Chris Evans     :       Security fixes - signedness again
  81 *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82 *
  83 * To Fix:
  84 */
  85
  86#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  87
  88#include <asm/unaligned.h>
  89#include <linux/capability.h>
  90#include <linux/errno.h>
  91#include <linux/errqueue.h>
  92#include <linux/types.h>
  93#include <linux/socket.h>
  94#include <linux/in.h>
  95#include <linux/kernel.h>
  96#include <linux/module.h>
  97#include <linux/proc_fs.h>
  98#include <linux/seq_file.h>
  99#include <linux/sched.h>
 100#include <linux/sched/mm.h>
 101#include <linux/timer.h>
 102#include <linux/string.h>
 103#include <linux/sockios.h>
 104#include <linux/net.h>
 105#include <linux/mm.h>
 106#include <linux/slab.h>
 107#include <linux/interrupt.h>
 108#include <linux/poll.h>
 109#include <linux/tcp.h>
 110#include <linux/init.h>
 111#include <linux/highmem.h>
 112#include <linux/user_namespace.h>
 113#include <linux/static_key.h>
 114#include <linux/memcontrol.h>
 115#include <linux/prefetch.h>
 116#include <linux/compat.h>
 117
 118#include <linux/uaccess.h>
 119
 120#include <linux/netdevice.h>
 121#include <net/protocol.h>
 122#include <linux/skbuff.h>
 123#include <net/net_namespace.h>
 124#include <net/request_sock.h>
 125#include <net/sock.h>
 126#include <linux/net_tstamp.h>
 127#include <net/xfrm.h>
 128#include <linux/ipsec.h>
 129#include <net/cls_cgroup.h>
 130#include <net/netprio_cgroup.h>
 131#include <linux/sock_diag.h>
 132
 133#include <linux/filter.h>
 134#include <net/sock_reuseport.h>
 135#include <net/bpf_sk_storage.h>
 136
 137#include <trace/events/sock.h>
 138
 139#include <net/tcp.h>
 140#include <net/busy_poll.h>
 141
 142#include <linux/ethtool.h>
 143
 144static DEFINE_MUTEX(proto_list_mutex);
 145static LIST_HEAD(proto_list);
 146
 147static void sock_inuse_add(struct net *net, int val);
 148
 149/**
 150 * sk_ns_capable - General socket capability test
 151 * @sk: Socket to use a capability on or through
 152 * @user_ns: The user namespace of the capability to use
 153 * @cap: The capability to use
 154 *
 155 * Test to see if the opener of the socket had when the socket was
 156 * created and the current process has the capability @cap in the user
 157 * namespace @user_ns.
 158 */
 159bool sk_ns_capable(const struct sock *sk,
 160                   struct user_namespace *user_ns, int cap)
 161{
 162        return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 163                ns_capable(user_ns, cap);
 164}
 165EXPORT_SYMBOL(sk_ns_capable);
 166
 167/**
 168 * sk_capable - Socket global capability test
 169 * @sk: Socket to use a capability on or through
 170 * @cap: The global capability to use
 171 *
 172 * Test to see if the opener of the socket had when the socket was
 173 * created and the current process has the capability @cap in all user
 174 * namespaces.
 175 */
 176bool sk_capable(const struct sock *sk, int cap)
 177{
 178        return sk_ns_capable(sk, &init_user_ns, cap);
 179}
 180EXPORT_SYMBOL(sk_capable);
 181
 182/**
 183 * sk_net_capable - Network namespace socket capability test
 184 * @sk: Socket to use a capability on or through
 185 * @cap: The capability to use
 186 *
 187 * Test to see if the opener of the socket had when the socket was created
 188 * and the current process has the capability @cap over the network namespace
 189 * the socket is a member of.
 190 */
 191bool sk_net_capable(const struct sock *sk, int cap)
 192{
 193        return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 194}
 195EXPORT_SYMBOL(sk_net_capable);
 196
 197/*
 198 * Each address family might have different locking rules, so we have
 199 * one slock key per address family and separate keys for internal and
 200 * userspace sockets.
 201 */
 202static struct lock_class_key af_family_keys[AF_MAX];
 203static struct lock_class_key af_family_kern_keys[AF_MAX];
 204static struct lock_class_key af_family_slock_keys[AF_MAX];
 205static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 206
 207/*
 208 * Make lock validator output more readable. (we pre-construct these
 209 * strings build-time, so that runtime initialization of socket
 210 * locks is fast):
 211 */
 212
 213#define _sock_locks(x)                                            \
 214  x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
 215  x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
 216  x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
 217  x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
 218  x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
 219  x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
 220  x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
 221  x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
 222  x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
 223  x "27"       ,        x "28"          ,       x "AF_CAN"      , \
 224  x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "IUCV"        , \
 225  x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
 226  x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
 227  x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
 228  x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_XDP"      , \
 229  x "AF_MAX"
 230
 231static const char *const af_family_key_strings[AF_MAX+1] = {
 232        _sock_locks("sk_lock-")
 233};
 234static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 235        _sock_locks("slock-")
 236};
 237static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 238        _sock_locks("clock-")
 239};
 240
 241static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 242        _sock_locks("k-sk_lock-")
 243};
 244static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 245        _sock_locks("k-slock-")
 246};
 247static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 248        _sock_locks("k-clock-")
 249};
 250static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 251        _sock_locks("rlock-")
 252};
 253static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 254        _sock_locks("wlock-")
 255};
 256static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 257        _sock_locks("elock-")
 258};
 259
 260/*
 261 * sk_callback_lock and sk queues locking rules are per-address-family,
 262 * so split the lock classes by using a per-AF key:
 263 */
 264static struct lock_class_key af_callback_keys[AF_MAX];
 265static struct lock_class_key af_rlock_keys[AF_MAX];
 266static struct lock_class_key af_wlock_keys[AF_MAX];
 267static struct lock_class_key af_elock_keys[AF_MAX];
 268static struct lock_class_key af_kern_callback_keys[AF_MAX];
 269
 270/* Run time adjustable parameters. */
 271__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 272EXPORT_SYMBOL(sysctl_wmem_max);
 273__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 274EXPORT_SYMBOL(sysctl_rmem_max);
 275__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 276__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 277
 278/* Maximal space eaten by iovec or ancillary data plus some space */
 279int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 280EXPORT_SYMBOL(sysctl_optmem_max);
 281
 282int sysctl_tstamp_allow_data __read_mostly = 1;
 283
 284DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
 285EXPORT_SYMBOL_GPL(memalloc_socks_key);
 286
 287/**
 288 * sk_set_memalloc - sets %SOCK_MEMALLOC
 289 * @sk: socket to set it on
 290 *
 291 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 292 * It's the responsibility of the admin to adjust min_free_kbytes
 293 * to meet the requirements
 294 */
 295void sk_set_memalloc(struct sock *sk)
 296{
 297        sock_set_flag(sk, SOCK_MEMALLOC);
 298        sk->sk_allocation |= __GFP_MEMALLOC;
 299        static_branch_inc(&memalloc_socks_key);
 300}
 301EXPORT_SYMBOL_GPL(sk_set_memalloc);
 302
 303void sk_clear_memalloc(struct sock *sk)
 304{
 305        sock_reset_flag(sk, SOCK_MEMALLOC);
 306        sk->sk_allocation &= ~__GFP_MEMALLOC;
 307        static_branch_dec(&memalloc_socks_key);
 308
 309        /*
 310         * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 311         * progress of swapping. SOCK_MEMALLOC may be cleared while
 312         * it has rmem allocations due to the last swapfile being deactivated
 313         * but there is a risk that the socket is unusable due to exceeding
 314         * the rmem limits. Reclaim the reserves and obey rmem limits again.
 315         */
 316        sk_mem_reclaim(sk);
 317}
 318EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 319
 320int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 321{
 322        int ret;
 323        unsigned int noreclaim_flag;
 324
 325        /* these should have been dropped before queueing */
 326        BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 327
 328        noreclaim_flag = memalloc_noreclaim_save();
 329        ret = sk->sk_backlog_rcv(sk, skb);
 330        memalloc_noreclaim_restore(noreclaim_flag);
 331
 332        return ret;
 333}
 334EXPORT_SYMBOL(__sk_backlog_rcv);
 335
 336void sk_error_report(struct sock *sk)
 337{
 338        sk->sk_error_report(sk);
 339
 340        switch (sk->sk_family) {
 341        case AF_INET:
 342                fallthrough;
 343        case AF_INET6:
 344                trace_inet_sk_error_report(sk);
 345                break;
 346        default:
 347                break;
 348        }
 349}
 350EXPORT_SYMBOL(sk_error_report);
 351
 352static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
 353{
 354        struct __kernel_sock_timeval tv;
 355
 356        if (timeo == MAX_SCHEDULE_TIMEOUT) {
 357                tv.tv_sec = 0;
 358                tv.tv_usec = 0;
 359        } else {
 360                tv.tv_sec = timeo / HZ;
 361                tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
 362        }
 363
 364        if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 365                struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
 366                *(struct old_timeval32 *)optval = tv32;
 367                return sizeof(tv32);
 368        }
 369
 370        if (old_timeval) {
 371                struct __kernel_old_timeval old_tv;
 372                old_tv.tv_sec = tv.tv_sec;
 373                old_tv.tv_usec = tv.tv_usec;
 374                *(struct __kernel_old_timeval *)optval = old_tv;
 375                return sizeof(old_tv);
 376        }
 377
 378        *(struct __kernel_sock_timeval *)optval = tv;
 379        return sizeof(tv);
 380}
 381
 382static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
 383                            bool old_timeval)
 384{
 385        struct __kernel_sock_timeval tv;
 386
 387        if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 388                struct old_timeval32 tv32;
 389
 390                if (optlen < sizeof(tv32))
 391                        return -EINVAL;
 392
 393                if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
 394                        return -EFAULT;
 395                tv.tv_sec = tv32.tv_sec;
 396                tv.tv_usec = tv32.tv_usec;
 397        } else if (old_timeval) {
 398                struct __kernel_old_timeval old_tv;
 399
 400                if (optlen < sizeof(old_tv))
 401                        return -EINVAL;
 402                if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
 403                        return -EFAULT;
 404                tv.tv_sec = old_tv.tv_sec;
 405                tv.tv_usec = old_tv.tv_usec;
 406        } else {
 407                if (optlen < sizeof(tv))
 408                        return -EINVAL;
 409                if (copy_from_sockptr(&tv, optval, sizeof(tv)))
 410                        return -EFAULT;
 411        }
 412        if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 413                return -EDOM;
 414
 415        if (tv.tv_sec < 0) {
 416                static int warned __read_mostly;
 417
 418                *timeo_p = 0;
 419                if (warned < 10 && net_ratelimit()) {
 420                        warned++;
 421                        pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 422                                __func__, current->comm, task_pid_nr(current));
 423                }
 424                return 0;
 425        }
 426        *timeo_p = MAX_SCHEDULE_TIMEOUT;
 427        if (tv.tv_sec == 0 && tv.tv_usec == 0)
 428                return 0;
 429        if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
 430                *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
 431        return 0;
 432}
 433
 434static bool sock_needs_netstamp(const struct sock *sk)
 435{
 436        switch (sk->sk_family) {
 437        case AF_UNSPEC:
 438        case AF_UNIX:
 439                return false;
 440        default:
 441                return true;
 442        }
 443}
 444
 445static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 446{
 447        if (sk->sk_flags & flags) {
 448                sk->sk_flags &= ~flags;
 449                if (sock_needs_netstamp(sk) &&
 450                    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 451                        net_disable_timestamp();
 452        }
 453}
 454
 455
 456int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 457{
 458        unsigned long flags;
 459        struct sk_buff_head *list = &sk->sk_receive_queue;
 460
 461        if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 462                atomic_inc(&sk->sk_drops);
 463                trace_sock_rcvqueue_full(sk, skb);
 464                return -ENOMEM;
 465        }
 466
 467        if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 468                atomic_inc(&sk->sk_drops);
 469                return -ENOBUFS;
 470        }
 471
 472        skb->dev = NULL;
 473        skb_set_owner_r(skb, sk);
 474
 475        /* we escape from rcu protected region, make sure we dont leak
 476         * a norefcounted dst
 477         */
 478        skb_dst_force(skb);
 479
 480        spin_lock_irqsave(&list->lock, flags);
 481        sock_skb_set_dropcount(sk, skb);
 482        __skb_queue_tail(list, skb);
 483        spin_unlock_irqrestore(&list->lock, flags);
 484
 485        if (!sock_flag(sk, SOCK_DEAD))
 486                sk->sk_data_ready(sk);
 487        return 0;
 488}
 489EXPORT_SYMBOL(__sock_queue_rcv_skb);
 490
 491int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 492{
 493        int err;
 494
 495        err = sk_filter(sk, skb);
 496        if (err)
 497                return err;
 498
 499        return __sock_queue_rcv_skb(sk, skb);
 500}
 501EXPORT_SYMBOL(sock_queue_rcv_skb);
 502
 503int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 504                     const int nested, unsigned int trim_cap, bool refcounted)
 505{
 506        int rc = NET_RX_SUCCESS;
 507
 508        if (sk_filter_trim_cap(sk, skb, trim_cap))
 509                goto discard_and_relse;
 510
 511        skb->dev = NULL;
 512
 513        if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 514                atomic_inc(&sk->sk_drops);
 515                goto discard_and_relse;
 516        }
 517        if (nested)
 518                bh_lock_sock_nested(sk);
 519        else
 520                bh_lock_sock(sk);
 521        if (!sock_owned_by_user(sk)) {
 522                /*
 523                 * trylock + unlock semantics:
 524                 */
 525                mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 526
 527                rc = sk_backlog_rcv(sk, skb);
 528
 529                mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
 530        } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
 531                bh_unlock_sock(sk);
 532                atomic_inc(&sk->sk_drops);
 533                goto discard_and_relse;
 534        }
 535
 536        bh_unlock_sock(sk);
 537out:
 538        if (refcounted)
 539                sock_put(sk);
 540        return rc;
 541discard_and_relse:
 542        kfree_skb(skb);
 543        goto out;
 544}
 545EXPORT_SYMBOL(__sk_receive_skb);
 546
 547INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
 548                                                          u32));
 549INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
 550                                                           u32));
 551struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 552{
 553        struct dst_entry *dst = __sk_dst_get(sk);
 554
 555        if (dst && dst->obsolete &&
 556            INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 557                               dst, cookie) == NULL) {
 558                sk_tx_queue_clear(sk);
 559                sk->sk_dst_pending_confirm = 0;
 560                RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 561                dst_release(dst);
 562                return NULL;
 563        }
 564
 565        return dst;
 566}
 567EXPORT_SYMBOL(__sk_dst_check);
 568
 569struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 570{
 571        struct dst_entry *dst = sk_dst_get(sk);
 572
 573        if (dst && dst->obsolete &&
 574            INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 575                               dst, cookie) == NULL) {
 576                sk_dst_reset(sk);
 577                dst_release(dst);
 578                return NULL;
 579        }
 580
 581        return dst;
 582}
 583EXPORT_SYMBOL(sk_dst_check);
 584
 585static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
 586{
 587        int ret = -ENOPROTOOPT;
 588#ifdef CONFIG_NETDEVICES
 589        struct net *net = sock_net(sk);
 590
 591        /* Sorry... */
 592        ret = -EPERM;
 593        if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
 594                goto out;
 595
 596        ret = -EINVAL;
 597        if (ifindex < 0)
 598                goto out;
 599
 600        sk->sk_bound_dev_if = ifindex;
 601        if (sk->sk_prot->rehash)
 602                sk->sk_prot->rehash(sk);
 603        sk_dst_reset(sk);
 604
 605        ret = 0;
 606
 607out:
 608#endif
 609
 610        return ret;
 611}
 612
 613int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
 614{
 615        int ret;
 616
 617        if (lock_sk)
 618                lock_sock(sk);
 619        ret = sock_bindtoindex_locked(sk, ifindex);
 620        if (lock_sk)
 621                release_sock(sk);
 622
 623        return ret;
 624}
 625EXPORT_SYMBOL(sock_bindtoindex);
 626
 627static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
 628{
 629        int ret = -ENOPROTOOPT;
 630#ifdef CONFIG_NETDEVICES
 631        struct net *net = sock_net(sk);
 632        char devname[IFNAMSIZ];
 633        int index;
 634
 635        ret = -EINVAL;
 636        if (optlen < 0)
 637                goto out;
 638
 639        /* Bind this socket to a particular device like "eth0",
 640         * as specified in the passed interface name. If the
 641         * name is "" or the option length is zero the socket
 642         * is not bound.
 643         */
 644        if (optlen > IFNAMSIZ - 1)
 645                optlen = IFNAMSIZ - 1;
 646        memset(devname, 0, sizeof(devname));
 647
 648        ret = -EFAULT;
 649        if (copy_from_sockptr(devname, optval, optlen))
 650                goto out;
 651
 652        index = 0;
 653        if (devname[0] != '\0') {
 654                struct net_device *dev;
 655
 656                rcu_read_lock();
 657                dev = dev_get_by_name_rcu(net, devname);
 658                if (dev)
 659                        index = dev->ifindex;
 660                rcu_read_unlock();
 661                ret = -ENODEV;
 662                if (!dev)
 663                        goto out;
 664        }
 665
 666        return sock_bindtoindex(sk, index, true);
 667out:
 668#endif
 669
 670        return ret;
 671}
 672
 673static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 674                                int __user *optlen, int len)
 675{
 676        int ret = -ENOPROTOOPT;
 677#ifdef CONFIG_NETDEVICES
 678        struct net *net = sock_net(sk);
 679        char devname[IFNAMSIZ];
 680
 681        if (sk->sk_bound_dev_if == 0) {
 682                len = 0;
 683                goto zero;
 684        }
 685
 686        ret = -EINVAL;
 687        if (len < IFNAMSIZ)
 688                goto out;
 689
 690        ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 691        if (ret)
 692                goto out;
 693
 694        len = strlen(devname) + 1;
 695
 696        ret = -EFAULT;
 697        if (copy_to_user(optval, devname, len))
 698                goto out;
 699
 700zero:
 701        ret = -EFAULT;
 702        if (put_user(len, optlen))
 703                goto out;
 704
 705        ret = 0;
 706
 707out:
 708#endif
 709
 710        return ret;
 711}
 712
 713bool sk_mc_loop(struct sock *sk)
 714{
 715        if (dev_recursion_level())
 716                return false;
 717        if (!sk)
 718                return true;
 719        switch (sk->sk_family) {
 720        case AF_INET:
 721                return inet_sk(sk)->mc_loop;
 722#if IS_ENABLED(CONFIG_IPV6)
 723        case AF_INET6:
 724                return inet6_sk(sk)->mc_loop;
 725#endif
 726        }
 727        WARN_ON_ONCE(1);
 728        return true;
 729}
 730EXPORT_SYMBOL(sk_mc_loop);
 731
 732void sock_set_reuseaddr(struct sock *sk)
 733{
 734        lock_sock(sk);
 735        sk->sk_reuse = SK_CAN_REUSE;
 736        release_sock(sk);
 737}
 738EXPORT_SYMBOL(sock_set_reuseaddr);
 739
 740void sock_set_reuseport(struct sock *sk)
 741{
 742        lock_sock(sk);
 743        sk->sk_reuseport = true;
 744        release_sock(sk);
 745}
 746EXPORT_SYMBOL(sock_set_reuseport);
 747
 748void sock_no_linger(struct sock *sk)
 749{
 750        lock_sock(sk);
 751        sk->sk_lingertime = 0;
 752        sock_set_flag(sk, SOCK_LINGER);
 753        release_sock(sk);
 754}
 755EXPORT_SYMBOL(sock_no_linger);
 756
 757void sock_set_priority(struct sock *sk, u32 priority)
 758{
 759        lock_sock(sk);
 760        sk->sk_priority = priority;
 761        release_sock(sk);
 762}
 763EXPORT_SYMBOL(sock_set_priority);
 764
 765void sock_set_sndtimeo(struct sock *sk, s64 secs)
 766{
 767        lock_sock(sk);
 768        if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
 769                sk->sk_sndtimeo = secs * HZ;
 770        else
 771                sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
 772        release_sock(sk);
 773}
 774EXPORT_SYMBOL(sock_set_sndtimeo);
 775
 776static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
 777{
 778        if (val)  {
 779                sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
 780                sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
 781                sock_set_flag(sk, SOCK_RCVTSTAMP);
 782                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 783        } else {
 784                sock_reset_flag(sk, SOCK_RCVTSTAMP);
 785                sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 786        }
 787}
 788
 789void sock_enable_timestamps(struct sock *sk)
 790{
 791        lock_sock(sk);
 792        __sock_set_timestamps(sk, true, false, true);
 793        release_sock(sk);
 794}
 795EXPORT_SYMBOL(sock_enable_timestamps);
 796
 797void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
 798{
 799        switch (optname) {
 800        case SO_TIMESTAMP_OLD:
 801                __sock_set_timestamps(sk, valbool, false, false);
 802                break;
 803        case SO_TIMESTAMP_NEW:
 804                __sock_set_timestamps(sk, valbool, true, false);
 805                break;
 806        case SO_TIMESTAMPNS_OLD:
 807                __sock_set_timestamps(sk, valbool, false, true);
 808                break;
 809        case SO_TIMESTAMPNS_NEW:
 810                __sock_set_timestamps(sk, valbool, true, true);
 811                break;
 812        }
 813}
 814
 815static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
 816{
 817        struct net *net = sock_net(sk);
 818        struct net_device *dev = NULL;
 819        bool match = false;
 820        int *vclock_index;
 821        int i, num;
 822
 823        if (sk->sk_bound_dev_if)
 824                dev = dev_get_by_index(net, sk->sk_bound_dev_if);
 825
 826        if (!dev) {
 827                pr_err("%s: sock not bind to device\n", __func__);
 828                return -EOPNOTSUPP;
 829        }
 830
 831        num = ethtool_get_phc_vclocks(dev, &vclock_index);
 832        for (i = 0; i < num; i++) {
 833                if (*(vclock_index + i) == phc_index) {
 834                        match = true;
 835                        break;
 836                }
 837        }
 838
 839        if (num > 0)
 840                kfree(vclock_index);
 841
 842        if (!match)
 843                return -EINVAL;
 844
 845        sk->sk_bind_phc = phc_index;
 846
 847        return 0;
 848}
 849
 850int sock_set_timestamping(struct sock *sk, int optname,
 851                          struct so_timestamping timestamping)
 852{
 853        int val = timestamping.flags;
 854        int ret;
 855
 856        if (val & ~SOF_TIMESTAMPING_MASK)
 857                return -EINVAL;
 858
 859        if (val & SOF_TIMESTAMPING_OPT_ID &&
 860            !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 861                if (sk->sk_protocol == IPPROTO_TCP &&
 862                    sk->sk_type == SOCK_STREAM) {
 863                        if ((1 << sk->sk_state) &
 864                            (TCPF_CLOSE | TCPF_LISTEN))
 865                                return -EINVAL;
 866                        sk->sk_tskey = tcp_sk(sk)->snd_una;
 867                } else {
 868                        sk->sk_tskey = 0;
 869                }
 870        }
 871
 872        if (val & SOF_TIMESTAMPING_OPT_STATS &&
 873            !(val & SOF_TIMESTAMPING_OPT_TSONLY))
 874                return -EINVAL;
 875
 876        if (val & SOF_TIMESTAMPING_BIND_PHC) {
 877                ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
 878                if (ret)
 879                        return ret;
 880        }
 881
 882        sk->sk_tsflags = val;
 883        sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
 884
 885        if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 886                sock_enable_timestamp(sk,
 887                                      SOCK_TIMESTAMPING_RX_SOFTWARE);
 888        else
 889                sock_disable_timestamp(sk,
 890                                       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 891        return 0;
 892}
 893
 894void sock_set_keepalive(struct sock *sk)
 895{
 896        lock_sock(sk);
 897        if (sk->sk_prot->keepalive)
 898                sk->sk_prot->keepalive(sk, true);
 899        sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
 900        release_sock(sk);
 901}
 902EXPORT_SYMBOL(sock_set_keepalive);
 903
 904static void __sock_set_rcvbuf(struct sock *sk, int val)
 905{
 906        /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
 907         * as a negative value.
 908         */
 909        val = min_t(int, val, INT_MAX / 2);
 910        sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 911
 912        /* We double it on the way in to account for "struct sk_buff" etc.
 913         * overhead.   Applications assume that the SO_RCVBUF setting they make
 914         * will allow that much actual data to be received on that socket.
 915         *
 916         * Applications are unaware that "struct sk_buff" and other overheads
 917         * allocate from the receive buffer during socket buffer allocation.
 918         *
 919         * And after considering the possible alternatives, returning the value
 920         * we actually used in getsockopt is the most desirable behavior.
 921         */
 922        WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
 923}
 924
/*
 * Locked wrapper around __sock_set_rcvbuf(): takes the socket lock, applies
 * @val as the new receive buffer request, and releases the lock again.
 */
void sock_set_rcvbuf(struct sock *sk, int val)
{
        lock_sock(sk);
        __sock_set_rcvbuf(sk, val);
        release_sock(sk);
}
EXPORT_SYMBOL(sock_set_rcvbuf);
 932
 933static void __sock_set_mark(struct sock *sk, u32 val)
 934{
 935        if (val != sk->sk_mark) {
 936                sk->sk_mark = val;
 937                sk_dst_reset(sk);
 938        }
 939}
 940
/*
 * Locked wrapper around __sock_set_mark(): takes the socket lock, sets the
 * socket mark to @val, and releases the lock again.
 */
void sock_set_mark(struct sock *sk, u32 val)
{
        lock_sock(sk);
        __sock_set_mark(sk, val);
        release_sock(sk);
}
EXPORT_SYMBOL(sock_set_mark);
 948
 949/*
 950 *      This is meant for all protocols to use and covers goings on
 951 *      at the socket level. Everything here is generic.
 952 */
 953
 954int sock_setsockopt(struct socket *sock, int level, int optname,
 955                    sockptr_t optval, unsigned int optlen)
 956{
 957        struct so_timestamping timestamping;
 958        struct sock_txtime sk_txtime;
 959        struct sock *sk = sock->sk;
 960        int val;
 961        int valbool;
 962        struct linger ling;
 963        int ret = 0;
 964
 965        /*
 966         *      Options without arguments
 967         */
 968
 969        if (optname == SO_BINDTODEVICE)
 970                return sock_setbindtodevice(sk, optval, optlen);
 971
 972        if (optlen < sizeof(int))
 973                return -EINVAL;
 974
 975        if (copy_from_sockptr(&val, optval, sizeof(val)))
 976                return -EFAULT;
 977
 978        valbool = val ? 1 : 0;
 979
 980        lock_sock(sk);
 981
 982        switch (optname) {
 983        case SO_DEBUG:
 984                if (val && !capable(CAP_NET_ADMIN))
 985                        ret = -EACCES;
 986                else
 987                        sock_valbool_flag(sk, SOCK_DBG, valbool);
 988                break;
 989        case SO_REUSEADDR:
 990                sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 991                break;
 992        case SO_REUSEPORT:
 993                sk->sk_reuseport = valbool;
 994                break;
 995        case SO_TYPE:
 996        case SO_PROTOCOL:
 997        case SO_DOMAIN:
 998        case SO_ERROR:
 999                ret = -ENOPROTOOPT;
1000                break;
1001        case SO_DONTROUTE:
1002                sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1003                sk_dst_reset(sk);
1004                break;
1005        case SO_BROADCAST:
1006                sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1007                break;
1008        case SO_SNDBUF:
1009                /* Don't error on this BSD doesn't and if you think
1010                 * about it this is right. Otherwise apps have to
1011                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1012                 * are treated in BSD as hints
1013                 */
1014                val = min_t(u32, val, sysctl_wmem_max);
1015set_sndbuf:
1016                /* Ensure val * 2 fits into an int, to prevent max_t()
1017                 * from treating it as a negative value.
1018                 */
1019                val = min_t(int, val, INT_MAX / 2);
1020                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1021                WRITE_ONCE(sk->sk_sndbuf,
1022                           max_t(int, val * 2, SOCK_MIN_SNDBUF));
1023                /* Wake up sending tasks if we upped the value. */
1024                sk->sk_write_space(sk);
1025                break;
1026
1027        case SO_SNDBUFFORCE:
1028                if (!capable(CAP_NET_ADMIN)) {
1029                        ret = -EPERM;
1030                        break;
1031                }
1032
1033                /* No negative values (to prevent underflow, as val will be
1034                 * multiplied by 2).
1035                 */
1036                if (val < 0)
1037                        val = 0;
1038                goto set_sndbuf;
1039
1040        case SO_RCVBUF:
1041                /* Don't error on this BSD doesn't and if you think
1042                 * about it this is right. Otherwise apps have to
1043                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1044                 * are treated in BSD as hints
1045                 */
1046                __sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
1047                break;
1048
1049        case SO_RCVBUFFORCE:
1050                if (!capable(CAP_NET_ADMIN)) {
1051                        ret = -EPERM;
1052                        break;
1053                }
1054
1055                /* No negative values (to prevent underflow, as val will be
1056                 * multiplied by 2).
1057                 */
1058                __sock_set_rcvbuf(sk, max(val, 0));
1059                break;
1060
1061        case SO_KEEPALIVE:
1062                if (sk->sk_prot->keepalive)
1063                        sk->sk_prot->keepalive(sk, valbool);
1064                sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1065                break;
1066
1067        case SO_OOBINLINE:
1068                sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1069                break;
1070
1071        case SO_NO_CHECK:
1072                sk->sk_no_check_tx = valbool;
1073                break;
1074
1075        case SO_PRIORITY:
1076                if ((val >= 0 && val <= 6) ||
1077                    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1078                        sk->sk_priority = val;
1079                else
1080                        ret = -EPERM;
1081                break;
1082
1083        case SO_LINGER:
1084                if (optlen < sizeof(ling)) {
1085                        ret = -EINVAL;  /* 1003.1g */
1086                        break;
1087                }
1088                if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1089                        ret = -EFAULT;
1090                        break;
1091                }
1092                if (!ling.l_onoff)
1093                        sock_reset_flag(sk, SOCK_LINGER);
1094                else {
1095#if (BITS_PER_LONG == 32)
1096                        if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
1097                                sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
1098                        else
1099#endif
1100                                sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
1101                        sock_set_flag(sk, SOCK_LINGER);
1102                }
1103                break;
1104
1105        case SO_BSDCOMPAT:
1106                break;
1107
1108        case SO_PASSCRED:
1109                if (valbool)
1110                        set_bit(SOCK_PASSCRED, &sock->flags);
1111                else
1112                        clear_bit(SOCK_PASSCRED, &sock->flags);
1113                break;
1114
1115        case SO_TIMESTAMP_OLD:
1116        case SO_TIMESTAMP_NEW:
1117        case SO_TIMESTAMPNS_OLD:
1118        case SO_TIMESTAMPNS_NEW:
1119                sock_set_timestamp(sk, optname, valbool);
1120                break;
1121
1122        case SO_TIMESTAMPING_NEW:
1123        case SO_TIMESTAMPING_OLD:
1124                if (optlen == sizeof(timestamping)) {
1125                        if (copy_from_sockptr(&timestamping, optval,
1126                                              sizeof(timestamping))) {
1127                                ret = -EFAULT;
1128                                break;
1129                        }
1130                } else {
1131                        memset(&timestamping, 0, sizeof(timestamping));
1132                        timestamping.flags = val;
1133                }
1134                ret = sock_set_timestamping(sk, optname, timestamping);
1135                break;
1136
1137        case SO_RCVLOWAT:
1138                if (val < 0)
1139                        val = INT_MAX;
1140                if (sock->ops->set_rcvlowat)
1141                        ret = sock->ops->set_rcvlowat(sk, val);
1142                else
1143                        WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1144                break;
1145
1146        case SO_RCVTIMEO_OLD:
1147        case SO_RCVTIMEO_NEW:
1148                ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1149                                       optlen, optname == SO_RCVTIMEO_OLD);
1150                break;
1151
1152        case SO_SNDTIMEO_OLD:
1153        case SO_SNDTIMEO_NEW:
1154                ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1155                                       optlen, optname == SO_SNDTIMEO_OLD);
1156                break;
1157
1158        case SO_ATTACH_FILTER: {
1159                struct sock_fprog fprog;
1160
1161                ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1162                if (!ret)
1163                        ret = sk_attach_filter(&fprog, sk);
1164                break;
1165        }
1166        case SO_ATTACH_BPF:
1167                ret = -EINVAL;
1168                if (optlen == sizeof(u32)) {
1169                        u32 ufd;
1170
1171                        ret = -EFAULT;
1172                        if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1173                                break;
1174
1175                        ret = sk_attach_bpf(ufd, sk);
1176                }
1177                break;
1178
1179        case SO_ATTACH_REUSEPORT_CBPF: {
1180                struct sock_fprog fprog;
1181
1182                ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1183                if (!ret)
1184                        ret = sk_reuseport_attach_filter(&fprog, sk);
1185                break;
1186        }
1187        case SO_ATTACH_REUSEPORT_EBPF:
1188                ret = -EINVAL;
1189                if (optlen == sizeof(u32)) {
1190                        u32 ufd;
1191
1192                        ret = -EFAULT;
1193                        if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1194                                break;
1195
1196                        ret = sk_reuseport_attach_bpf(ufd, sk);
1197                }
1198                break;
1199
1200        case SO_DETACH_REUSEPORT_BPF:
1201                ret = reuseport_detach_prog(sk);
1202                break;
1203
1204        case SO_DETACH_FILTER:
1205                ret = sk_detach_filter(sk);
1206                break;
1207
1208        case SO_LOCK_FILTER:
1209                if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1210                        ret = -EPERM;
1211                else
1212                        sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1213                break;
1214
1215        case SO_PASSSEC:
1216                if (valbool)
1217                        set_bit(SOCK_PASSSEC, &sock->flags);
1218                else
1219                        clear_bit(SOCK_PASSSEC, &sock->flags);
1220                break;
1221        case SO_MARK:
1222                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1223                        ret = -EPERM;
1224                        break;
1225                }
1226
1227                __sock_set_mark(sk, val);
1228                break;
1229
1230        case SO_RXQ_OVFL:
1231                sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1232                break;
1233
1234        case SO_WIFI_STATUS:
1235                sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1236                break;
1237
1238        case SO_PEEK_OFF:
1239                if (sock->ops->set_peek_off)
1240                        ret = sock->ops->set_peek_off(sk, val);
1241                else
1242                        ret = -EOPNOTSUPP;
1243                break;
1244
1245        case SO_NOFCS:
1246                sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1247                break;
1248
1249        case SO_SELECT_ERR_QUEUE:
1250                sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1251                break;
1252
1253#ifdef CONFIG_NET_RX_BUSY_POLL
1254        case SO_BUSY_POLL:
1255                /* allow unprivileged users to decrease the value */
1256                if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1257                        ret = -EPERM;
1258                else {
1259                        if (val < 0)
1260                                ret = -EINVAL;
1261                        else
1262                                WRITE_ONCE(sk->sk_ll_usec, val);
1263                }
1264                break;
1265        case SO_PREFER_BUSY_POLL:
1266                if (valbool && !capable(CAP_NET_ADMIN))
1267                        ret = -EPERM;
1268                else
1269                        WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1270                break;
1271        case SO_BUSY_POLL_BUDGET:
1272                if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
1273                        ret = -EPERM;
1274                } else {
1275                        if (val < 0 || val > U16_MAX)
1276                                ret = -EINVAL;
1277                        else
1278                                WRITE_ONCE(sk->sk_busy_poll_budget, val);
1279                }
1280                break;
1281#endif
1282
1283        case SO_MAX_PACING_RATE:
1284                {
1285                unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1286
1287                if (sizeof(ulval) != sizeof(val) &&
1288                    optlen >= sizeof(ulval) &&
1289                    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1290                        ret = -EFAULT;
1291                        break;
1292                }
1293                if (ulval != ~0UL)
1294                        cmpxchg(&sk->sk_pacing_status,
1295                                SK_PACING_NONE,
1296                                SK_PACING_NEEDED);
1297                sk->sk_max_pacing_rate = ulval;
1298                sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1299                break;
1300                }
1301        case SO_INCOMING_CPU:
1302                WRITE_ONCE(sk->sk_incoming_cpu, val);
1303                break;
1304
1305        case SO_CNX_ADVICE:
1306                if (val == 1)
1307                        dst_negative_advice(sk);
1308                break;
1309
1310        case SO_ZEROCOPY:
1311                if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1312                        if (!((sk->sk_type == SOCK_STREAM &&
1313                               sk->sk_protocol == IPPROTO_TCP) ||
1314                              (sk->sk_type == SOCK_DGRAM &&
1315                               sk->sk_protocol == IPPROTO_UDP)))
1316                                ret = -ENOTSUPP;
1317                } else if (sk->sk_family != PF_RDS) {
1318                        ret = -ENOTSUPP;
1319                }
1320                if (!ret) {
1321                        if (val < 0 || val > 1)
1322                                ret = -EINVAL;
1323                        else
1324                                sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1325                }
1326                break;
1327
1328        case SO_TXTIME:
1329                if (optlen != sizeof(struct sock_txtime)) {
1330                        ret = -EINVAL;
1331                        break;
1332                } else if (copy_from_sockptr(&sk_txtime, optval,
1333                           sizeof(struct sock_txtime))) {
1334                        ret = -EFAULT;
1335                        break;
1336                } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1337                        ret = -EINVAL;
1338                        break;
1339                }
1340                /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1341                 * scheduler has enough safe guards.
1342                 */
1343                if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1344                    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1345                        ret = -EPERM;
1346                        break;
1347                }
1348                sock_valbool_flag(sk, SOCK_TXTIME, true);
1349                sk->sk_clockid = sk_txtime.clockid;
1350                sk->sk_txtime_deadline_mode =
1351                        !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1352                sk->sk_txtime_report_errors =
1353                        !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1354                break;
1355
1356        case SO_BINDTOIFINDEX:
1357                ret = sock_bindtoindex_locked(sk, val);
1358                break;
1359
1360        default:
1361                ret = -ENOPROTOOPT;
1362                break;
1363        }
1364        release_sock(sk);
1365        return ret;
1366}
1367EXPORT_SYMBOL(sock_setsockopt);
1368
1369
1370static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1371                          struct ucred *ucred)
1372{
1373        ucred->pid = pid_vnr(pid);
1374        ucred->uid = ucred->gid = -1;
1375        if (cred) {
1376                struct user_namespace *current_ns = current_user_ns();
1377
1378                ucred->uid = from_kuid_munged(current_ns, cred->euid);
1379                ucred->gid = from_kgid_munged(current_ns, cred->egid);
1380        }
1381}
1382
1383static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1384{
1385        struct user_namespace *user_ns = current_user_ns();
1386        int i;
1387
1388        for (i = 0; i < src->ngroups; i++)
1389                if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1390                        return -EFAULT;
1391
1392        return 0;
1393}
1394
1395int sock_getsockopt(struct socket *sock, int level, int optname,
1396                    char __user *optval, int __user *optlen)
1397{
1398        struct sock *sk = sock->sk;
1399
1400        union {
1401                int val;
1402                u64 val64;
1403                unsigned long ulval;
1404                struct linger ling;
1405                struct old_timeval32 tm32;
1406                struct __kernel_old_timeval tm;
1407                struct  __kernel_sock_timeval stm;
1408                struct sock_txtime txtime;
1409                struct so_timestamping timestamping;
1410        } v;
1411
1412        int lv = sizeof(int);
1413        int len;
1414
1415        if (get_user(len, optlen))
1416                return -EFAULT;
1417        if (len < 0)
1418                return -EINVAL;
1419
1420        memset(&v, 0, sizeof(v));
1421
1422        switch (optname) {
1423        case SO_DEBUG:
1424                v.val = sock_flag(sk, SOCK_DBG);
1425                break;
1426
1427        case SO_DONTROUTE:
1428                v.val = sock_flag(sk, SOCK_LOCALROUTE);
1429                break;
1430
1431        case SO_BROADCAST:
1432                v.val = sock_flag(sk, SOCK_BROADCAST);
1433                break;
1434
1435        case SO_SNDBUF:
1436                v.val = sk->sk_sndbuf;
1437                break;
1438
1439        case SO_RCVBUF:
1440                v.val = sk->sk_rcvbuf;
1441                break;
1442
1443        case SO_REUSEADDR:
1444                v.val = sk->sk_reuse;
1445                break;
1446
1447        case SO_REUSEPORT:
1448                v.val = sk->sk_reuseport;
1449                break;
1450
1451        case SO_KEEPALIVE:
1452                v.val = sock_flag(sk, SOCK_KEEPOPEN);
1453                break;
1454
1455        case SO_TYPE:
1456                v.val = sk->sk_type;
1457                break;
1458
1459        case SO_PROTOCOL:
1460                v.val = sk->sk_protocol;
1461                break;
1462
1463        case SO_DOMAIN:
1464                v.val = sk->sk_family;
1465                break;
1466
1467        case SO_ERROR:
1468                v.val = -sock_error(sk);
1469                if (v.val == 0)
1470                        v.val = xchg(&sk->sk_err_soft, 0);
1471                break;
1472
1473        case SO_OOBINLINE:
1474                v.val = sock_flag(sk, SOCK_URGINLINE);
1475                break;
1476
1477        case SO_NO_CHECK:
1478                v.val = sk->sk_no_check_tx;
1479                break;
1480
1481        case SO_PRIORITY:
1482                v.val = sk->sk_priority;
1483                break;
1484
1485        case SO_LINGER:
1486                lv              = sizeof(v.ling);
1487                v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1488                v.ling.l_linger = sk->sk_lingertime / HZ;
1489                break;
1490
1491        case SO_BSDCOMPAT:
1492                break;
1493
1494        case SO_TIMESTAMP_OLD:
1495                v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1496                                !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1497                                !sock_flag(sk, SOCK_RCVTSTAMPNS);
1498                break;
1499
1500        case SO_TIMESTAMPNS_OLD:
1501                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1502                break;
1503
1504        case SO_TIMESTAMP_NEW:
1505                v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1506                break;
1507
1508        case SO_TIMESTAMPNS_NEW:
1509                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1510                break;
1511
1512        case SO_TIMESTAMPING_OLD:
1513                lv = sizeof(v.timestamping);
1514                v.timestamping.flags = sk->sk_tsflags;
1515                v.timestamping.bind_phc = sk->sk_bind_phc;
1516                break;
1517
1518        case SO_RCVTIMEO_OLD:
1519        case SO_RCVTIMEO_NEW:
1520                lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1521                break;
1522
1523        case SO_SNDTIMEO_OLD:
1524        case SO_SNDTIMEO_NEW:
1525                lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1526                break;
1527
1528        case SO_RCVLOWAT:
1529                v.val = sk->sk_rcvlowat;
1530                break;
1531
1532        case SO_SNDLOWAT:
1533                v.val = 1;
1534                break;
1535
1536        case SO_PASSCRED:
1537                v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1538                break;
1539
1540        case SO_PEERCRED:
1541        {
1542                struct ucred peercred;
1543                if (len > sizeof(peercred))
1544                        len = sizeof(peercred);
1545                cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1546                if (copy_to_user(optval, &peercred, len))
1547                        return -EFAULT;
1548                goto lenout;
1549        }
1550
1551        case SO_PEERGROUPS:
1552        {
1553                int ret, n;
1554
1555                if (!sk->sk_peer_cred)
1556                        return -ENODATA;
1557
1558                n = sk->sk_peer_cred->group_info->ngroups;
1559                if (len < n * sizeof(gid_t)) {
1560                        len = n * sizeof(gid_t);
1561                        return put_user(len, optlen) ? -EFAULT : -ERANGE;
1562                }
1563                len = n * sizeof(gid_t);
1564
1565                ret = groups_to_user((gid_t __user *)optval,
1566                                     sk->sk_peer_cred->group_info);
1567                if (ret)
1568                        return ret;
1569                goto lenout;
1570        }
1571
1572        case SO_PEERNAME:
1573        {
1574                char address[128];
1575
1576                lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1577                if (lv < 0)
1578                        return -ENOTCONN;
1579                if (lv < len)
1580                        return -EINVAL;
1581                if (copy_to_user(optval, address, len))
1582                        return -EFAULT;
1583                goto lenout;
1584        }
1585
1586        /* Dubious BSD thing... Probably nobody even uses it, but
1587         * the UNIX standard wants it for whatever reason... -DaveM
1588         */
1589        case SO_ACCEPTCONN:
1590                v.val = sk->sk_state == TCP_LISTEN;
1591                break;
1592
1593        case SO_PASSSEC:
1594                v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1595                break;
1596
1597        case SO_PEERSEC:
1598                return security_socket_getpeersec_stream(sock, optval, optlen, len);
1599
1600        case SO_MARK:
1601                v.val = sk->sk_mark;
1602                break;
1603
1604        case SO_RXQ_OVFL:
1605                v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1606                break;
1607
1608        case SO_WIFI_STATUS:
1609                v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1610                break;
1611
1612        case SO_PEEK_OFF:
1613                if (!sock->ops->set_peek_off)
1614                        return -EOPNOTSUPP;
1615
1616                v.val = sk->sk_peek_off;
1617                break;
1618        case SO_NOFCS:
1619                v.val = sock_flag(sk, SOCK_NOFCS);
1620                break;
1621
1622        case SO_BINDTODEVICE:
1623                return sock_getbindtodevice(sk, optval, optlen, len);
1624
1625        case SO_GET_FILTER:
1626                len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1627                if (len < 0)
1628                        return len;
1629
1630                goto lenout;
1631
1632        case SO_LOCK_FILTER:
1633                v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1634                break;
1635
1636        case SO_BPF_EXTENSIONS:
1637                v.val = bpf_tell_extensions();
1638                break;
1639
1640        case SO_SELECT_ERR_QUEUE:
1641                v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1642                break;
1643
1644#ifdef CONFIG_NET_RX_BUSY_POLL
1645        case SO_BUSY_POLL:
1646                v.val = sk->sk_ll_usec;
1647                break;
1648        case SO_PREFER_BUSY_POLL:
1649                v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1650                break;
1651#endif
1652
1653        case SO_MAX_PACING_RATE:
1654                if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1655                        lv = sizeof(v.ulval);
1656                        v.ulval = sk->sk_max_pacing_rate;
1657                } else {
1658                        /* 32bit version */
1659                        v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1660                }
1661                break;
1662
1663        case SO_INCOMING_CPU:
1664                v.val = READ_ONCE(sk->sk_incoming_cpu);
1665                break;
1666
1667        case SO_MEMINFO:
1668        {
1669                u32 meminfo[SK_MEMINFO_VARS];
1670
1671                sk_get_meminfo(sk, meminfo);
1672
1673                len = min_t(unsigned int, len, sizeof(meminfo));
1674                if (copy_to_user(optval, &meminfo, len))
1675                        return -EFAULT;
1676
1677                goto lenout;
1678        }
1679
1680#ifdef CONFIG_NET_RX_BUSY_POLL
1681        case SO_INCOMING_NAPI_ID:
1682                v.val = READ_ONCE(sk->sk_napi_id);
1683
1684                /* aggregate non-NAPI IDs down to 0 */
1685                if (v.val < MIN_NAPI_ID)
1686                        v.val = 0;
1687
1688                break;
1689#endif
1690
1691        case SO_COOKIE:
1692                lv = sizeof(u64);
1693                if (len < lv)
1694                        return -EINVAL;
1695                v.val64 = sock_gen_cookie(sk);
1696                break;
1697
1698        case SO_ZEROCOPY:
1699                v.val = sock_flag(sk, SOCK_ZEROCOPY);
1700                break;
1701
1702        case SO_TXTIME:
1703                lv = sizeof(v.txtime);
1704                v.txtime.clockid = sk->sk_clockid;
1705                v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1706                                  SOF_TXTIME_DEADLINE_MODE : 0;
1707                v.txtime.flags |= sk->sk_txtime_report_errors ?
1708                                  SOF_TXTIME_REPORT_ERRORS : 0;
1709                break;
1710
1711        case SO_BINDTOIFINDEX:
1712                v.val = sk->sk_bound_dev_if;
1713                break;
1714
1715        case SO_NETNS_COOKIE:
1716                lv = sizeof(u64);
1717                if (len != lv)
1718                        return -EINVAL;
1719                v.val64 = sock_net(sk)->net_cookie;
1720                break;
1721
1722        default:
1723                /* We implement the SO_SNDLOWAT etc to not be settable
1724                 * (1003.1g 7).
1725                 */
1726                return -ENOPROTOOPT;
1727        }
1728
1729        if (len > lv)
1730                len = lv;
1731        if (copy_to_user(optval, &v, len))
1732                return -EFAULT;
1733lenout:
1734        if (put_user(len, optlen))
1735                return -EFAULT;
1736        return 0;
1737}
1738
1739/*
1740 * Initialize an sk_lock.
1741 *
1742 * (We also register the sk_lock with the lock validator.)
1743 */
1744static inline void sock_lock_init(struct sock *sk)
1745{
1746        if (sk->sk_kern_sock)
1747                sock_lock_init_class_and_name(
1748                        sk,
1749                        af_family_kern_slock_key_strings[sk->sk_family],
1750                        af_family_kern_slock_keys + sk->sk_family,
1751                        af_family_kern_key_strings[sk->sk_family],
1752                        af_family_kern_keys + sk->sk_family);
1753        else
1754                sock_lock_init_class_and_name(
1755                        sk,
1756                        af_family_slock_key_strings[sk->sk_family],
1757                        af_family_slock_keys + sk->sk_family,
1758                        af_family_key_strings[sk->sk_family],
1759                        af_family_keys + sk->sk_family);
1760}
1761
1762/*
1763 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1764 * even temporarly, because of RCU lookups. sk_node should also be left as is.
1765 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1766 */
1767static void sock_copy(struct sock *nsk, const struct sock *osk)
1768{
1769        const struct proto *prot = READ_ONCE(osk->sk_prot);
1770#ifdef CONFIG_SECURITY_NETWORK
1771        void *sptr = nsk->sk_security;
1772#endif
1773
1774        /* If we move sk_tx_queue_mapping out of the private section,
1775         * we must check if sk_tx_queue_clear() is called after
1776         * sock_copy() in sk_clone_lock().
1777         */
1778        BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
1779                     offsetof(struct sock, sk_dontcopy_begin) ||
1780                     offsetof(struct sock, sk_tx_queue_mapping) >=
1781                     offsetof(struct sock, sk_dontcopy_end));
1782
1783        memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1784
1785        memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1786               prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1787
1788#ifdef CONFIG_SECURITY_NETWORK
1789        nsk->sk_security = sptr;
1790        security_sk_clone(osk, nsk);
1791#endif
1792}
1793
1794static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1795                int family)
1796{
1797        struct sock *sk;
1798        struct kmem_cache *slab;
1799
1800        slab = prot->slab;
1801        if (slab != NULL) {
1802                sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1803                if (!sk)
1804                        return sk;
1805                if (want_init_on_alloc(priority))
1806                        sk_prot_clear_nulls(sk, prot->obj_size);
1807        } else
1808                sk = kmalloc(prot->obj_size, priority);
1809
1810        if (sk != NULL) {
1811                if (security_sk_alloc(sk, family, priority))
1812                        goto out_free;
1813
1814                if (!try_module_get(prot->owner))
1815                        goto out_free_sec;
1816        }
1817
1818        return sk;
1819
1820out_free_sec:
1821        security_sk_free(sk);
1822out_free:
1823        if (slab != NULL)
1824                kmem_cache_free(slab, sk);
1825        else
1826                kfree(sk);
1827        return NULL;
1828}
1829
1830static void sk_prot_free(struct proto *prot, struct sock *sk)
1831{
1832        struct kmem_cache *slab;
1833        struct module *owner;
1834
1835        owner = prot->owner;
1836        slab = prot->slab;
1837
1838        cgroup_sk_free(&sk->sk_cgrp_data);
1839        mem_cgroup_sk_free(sk);
1840        security_sk_free(sk);
1841        if (slab != NULL)
1842                kmem_cache_free(slab, sk);
1843        else
1844                kfree(sk);
1845        module_put(owner);
1846}
1847
1848/**
1849 *      sk_alloc - All socket objects are allocated here
1850 *      @net: the applicable net namespace
1851 *      @family: protocol family
1852 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1853 *      @prot: struct proto associated with this new sock instance
1854 *      @kern: is this to be a kernel socket?
1855 */
1856struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1857                      struct proto *prot, int kern)
1858{
1859        struct sock *sk;
1860
1861        sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1862        if (sk) {
1863                sk->sk_family = family;
1864                /*
1865                 * See comment in struct sock definition to understand
1866                 * why we need sk_prot_creator -acme
1867                 */
1868                sk->sk_prot = sk->sk_prot_creator = prot;
1869                sk->sk_kern_sock = kern;
1870                sock_lock_init(sk);
1871                sk->sk_net_refcnt = kern ? 0 : 1;
1872                if (likely(sk->sk_net_refcnt)) {
1873                        get_net(net);
1874                        sock_inuse_add(net, 1);
1875                }
1876
1877                sock_net_set(sk, net);
1878                refcount_set(&sk->sk_wmem_alloc, 1);
1879
1880                mem_cgroup_sk_alloc(sk);
1881                cgroup_sk_alloc(&sk->sk_cgrp_data);
1882                sock_update_classid(&sk->sk_cgrp_data);
1883                sock_update_netprioidx(&sk->sk_cgrp_data);
1884                sk_tx_queue_clear(sk);
1885        }
1886
1887        return sk;
1888}
1889EXPORT_SYMBOL(sk_alloc);
1890
1891/* Sockets having SOCK_RCU_FREE will call this function after one RCU
1892 * grace period. This is the case for UDP sockets and TCP listeners.
1893 */
1894static void __sk_destruct(struct rcu_head *head)
1895{
1896        struct sock *sk = container_of(head, struct sock, sk_rcu);
1897        struct sk_filter *filter;
1898
1899        if (sk->sk_destruct)
1900                sk->sk_destruct(sk);
1901
1902        filter = rcu_dereference_check(sk->sk_filter,
1903                                       refcount_read(&sk->sk_wmem_alloc) == 0);
1904        if (filter) {
1905                sk_filter_uncharge(sk, filter);
1906                RCU_INIT_POINTER(sk->sk_filter, NULL);
1907        }
1908
1909        sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1910
1911#ifdef CONFIG_BPF_SYSCALL
1912        bpf_sk_storage_free(sk);
1913#endif
1914
1915        if (atomic_read(&sk->sk_omem_alloc))
1916                pr_debug("%s: optmem leakage (%d bytes) detected\n",
1917                         __func__, atomic_read(&sk->sk_omem_alloc));
1918
1919        if (sk->sk_frag.page) {
1920                put_page(sk->sk_frag.page);
1921                sk->sk_frag.page = NULL;
1922        }
1923
1924        if (sk->sk_peer_cred)
1925                put_cred(sk->sk_peer_cred);
1926        put_pid(sk->sk_peer_pid);
1927        if (likely(sk->sk_net_refcnt))
1928                put_net(sock_net(sk));
1929        sk_prot_free(sk->sk_prot_creator, sk);
1930}
1931
1932void sk_destruct(struct sock *sk)
1933{
1934        bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1935
1936        if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1937                reuseport_detach_sock(sk);
1938                use_call_rcu = true;
1939        }
1940
1941        if (use_call_rcu)
1942                call_rcu(&sk->sk_rcu, __sk_destruct);
1943        else
1944                __sk_destruct(&sk->sk_rcu);
1945}
1946
1947static void __sk_free(struct sock *sk)
1948{
1949        if (likely(sk->sk_net_refcnt))
1950                sock_inuse_add(sock_net(sk), -1);
1951
1952        if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1953                sock_diag_broadcast_destroy(sk);
1954        else
1955                sk_destruct(sk);
1956}
1957
1958void sk_free(struct sock *sk)
1959{
1960        /*
1961         * We subtract one from sk_wmem_alloc and can know if
1962         * some packets are still in some tx queue.
1963         * If not null, sock_wfree() will call __sk_free(sk) later
1964         */
1965        if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1966                __sk_free(sk);
1967}
1968EXPORT_SYMBOL(sk_free);
1969
1970static void sk_init_common(struct sock *sk)
1971{
1972        skb_queue_head_init(&sk->sk_receive_queue);
1973        skb_queue_head_init(&sk->sk_write_queue);
1974        skb_queue_head_init(&sk->sk_error_queue);
1975
1976        rwlock_init(&sk->sk_callback_lock);
1977        lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1978                        af_rlock_keys + sk->sk_family,
1979                        af_family_rlock_key_strings[sk->sk_family]);
1980        lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1981                        af_wlock_keys + sk->sk_family,
1982                        af_family_wlock_key_strings[sk->sk_family]);
1983        lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1984                        af_elock_keys + sk->sk_family,
1985                        af_family_elock_key_strings[sk->sk_family]);
1986        lockdep_set_class_and_name(&sk->sk_callback_lock,
1987                        af_callback_keys + sk->sk_family,
1988                        af_family_clock_key_strings[sk->sk_family]);
1989}
1990
1991/**
1992 *      sk_clone_lock - clone a socket, and lock its clone
1993 *      @sk: the socket to clone
1994 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1995 *
1996 *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1997 */
1998struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1999{
2000        struct proto *prot = READ_ONCE(sk->sk_prot);
2001        struct sk_filter *filter;
2002        bool is_charged = true;
2003        struct sock *newsk;
2004
2005        newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2006        if (!newsk)
2007                goto out;
2008
2009        sock_copy(newsk, sk);
2010
2011        newsk->sk_prot_creator = prot;
2012
2013        /* SANITY */
2014        if (likely(newsk->sk_net_refcnt))
2015                get_net(sock_net(newsk));
2016        sk_node_init(&newsk->sk_node);
2017        sock_lock_init(newsk);
2018        bh_lock_sock(newsk);
2019        newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
2020        newsk->sk_backlog.len = 0;
2021
2022        atomic_set(&newsk->sk_rmem_alloc, 0);
2023
2024        /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2025        refcount_set(&newsk->sk_wmem_alloc, 1);
2026
2027        atomic_set(&newsk->sk_omem_alloc, 0);
2028        sk_init_common(newsk);
2029
2030        newsk->sk_dst_cache     = NULL;
2031        newsk->sk_dst_pending_confirm = 0;
2032        newsk->sk_wmem_queued   = 0;
2033        newsk->sk_forward_alloc = 0;
2034        atomic_set(&newsk->sk_drops, 0);
2035        newsk->sk_send_head     = NULL;
2036        newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2037        atomic_set(&newsk->sk_zckey, 0);
2038
2039        sock_reset_flag(newsk, SOCK_DONE);
2040
2041        /* sk->sk_memcg will be populated at accept() time */
2042        newsk->sk_memcg = NULL;
2043
2044        cgroup_sk_clone(&newsk->sk_cgrp_data);
2045
2046        rcu_read_lock();
2047        filter = rcu_dereference(sk->sk_filter);
2048        if (filter != NULL)
2049                /* though it's an empty new sock, the charging may fail
2050                 * if sysctl_optmem_max was changed between creation of
2051                 * original socket and cloning
2052                 */
2053                is_charged = sk_filter_charge(newsk, filter);
2054        RCU_INIT_POINTER(newsk->sk_filter, filter);
2055        rcu_read_unlock();
2056
2057        if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2058                /* We need to make sure that we don't uncharge the new
2059                 * socket if we couldn't charge it in the first place
2060                 * as otherwise we uncharge the parent's filter.
2061                 */
2062                if (!is_charged)
2063                        RCU_INIT_POINTER(newsk->sk_filter, NULL);
2064                sk_free_unlock_clone(newsk);
2065                newsk = NULL;
2066                goto out;
2067        }
2068        RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2069
2070        if (bpf_sk_storage_clone(sk, newsk)) {
2071                sk_free_unlock_clone(newsk);
2072                newsk = NULL;
2073                goto out;
2074        }
2075
2076        /* Clear sk_user_data if parent had the pointer tagged
2077         * as not suitable for copying when cloning.
2078         */
2079        if (sk_user_data_is_nocopy(newsk))
2080                newsk->sk_user_data = NULL;
2081
2082        newsk->sk_err      = 0;
2083        newsk->sk_err_soft = 0;
2084        newsk->sk_priority = 0;
2085        newsk->sk_incoming_cpu = raw_smp_processor_id();
2086        if (likely(newsk->sk_net_refcnt))
2087                sock_inuse_add(sock_net(newsk), 1);
2088
2089        /* Before updating sk_refcnt, we must commit prior changes to memory
2090         * (Documentation/RCU/rculist_nulls.rst for details)
2091         */
2092        smp_wmb();
2093        refcount_set(&newsk->sk_refcnt, 2);
2094
2095        /* Increment the counter in the same struct proto as the master
2096         * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
2097         * is the same as sk->sk_prot->socks, as this field was copied
2098         * with memcpy).
2099         *
2100         * This _changes_ the previous behaviour, where
2101         * tcp_create_openreq_child always was incrementing the
2102         * equivalent to tcp_prot->socks (inet_sock_nr), so this have
2103         * to be taken into account in all callers. -acme
2104         */
2105        sk_refcnt_debug_inc(newsk);
2106        sk_set_socket(newsk, NULL);
2107        sk_tx_queue_clear(newsk);
2108        RCU_INIT_POINTER(newsk->sk_wq, NULL);
2109
2110        if (newsk->sk_prot->sockets_allocated)
2111                sk_sockets_allocated_inc(newsk);
2112
2113        if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2114                net_enable_timestamp();
2115out:
2116        return newsk;
2117}
2118EXPORT_SYMBOL_GPL(sk_clone_lock);
2119
2120void sk_free_unlock_clone(struct sock *sk)
2121{
2122        /* It is still raw copy of parent, so invalidate
2123         * destructor and make plain sk_free() */
2124        sk->sk_destruct = NULL;
2125        bh_unlock_sock(sk);
2126        sk_free(sk);
2127}
2128EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2129
2130void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2131{
2132        u32 max_segs = 1;
2133
2134        sk_dst_set(sk, dst);
2135        sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
2136        if (sk->sk_route_caps & NETIF_F_GSO)
2137                sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2138        sk->sk_route_caps &= ~sk->sk_route_nocaps;
2139        if (sk_can_gso(sk)) {
2140                if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2141                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2142                } else {
2143                        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2144                        sk->sk_gso_max_size = dst->dev->gso_max_size;
2145                        max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
2146                }
2147        }
2148        sk->sk_gso_max_segs = max_segs;
2149}
2150EXPORT_SYMBOL_GPL(sk_setup_caps);
2151
2152/*
2153 *      Simple resource managers for sockets.
2154 */
2155
2156
2157/*
2158 * Write buffer destructor automatically called from kfree_skb.
2159 */
2160void sock_wfree(struct sk_buff *skb)
2161{
2162        struct sock *sk = skb->sk;
2163        unsigned int len = skb->truesize;
2164
2165        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2166                /*
2167                 * Keep a reference on sk_wmem_alloc, this will be released
2168                 * after sk_write_space() call
2169                 */
2170                WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2171                sk->sk_write_space(sk);
2172                len = 1;
2173        }
2174        /*
2175         * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2176         * could not do because of in-flight packets
2177         */
2178        if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2179                __sk_free(sk);
2180}
2181EXPORT_SYMBOL(sock_wfree);
2182
2183/* This variant of sock_wfree() is used by TCP,
2184 * since it sets SOCK_USE_WRITE_QUEUE.
2185 */
2186void __sock_wfree(struct sk_buff *skb)
2187{
2188        struct sock *sk = skb->sk;
2189
2190        if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2191                __sk_free(sk);
2192}
2193
2194void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2195{
2196        skb_orphan(skb);
2197        skb->sk = sk;
2198#ifdef CONFIG_INET
2199        if (unlikely(!sk_fullsock(sk))) {
2200                skb->destructor = sock_edemux;
2201                sock_hold(sk);
2202                return;
2203        }
2204#endif
2205        skb->destructor = sock_wfree;
2206        skb_set_hash_from_sk(skb, sk);
2207        /*
2208         * We used to take a refcount on sk, but following operation
2209         * is enough to guarantee sk_free() wont free this sock until
2210         * all in-flight packets are completed
2211         */
2212        refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2213}
2214EXPORT_SYMBOL(skb_set_owner_w);
2215
2216static bool can_skb_orphan_partial(const struct sk_buff *skb)
2217{
2218#ifdef CONFIG_TLS_DEVICE
2219        /* Drivers depend on in-order delivery for crypto offload,
2220         * partial orphan breaks out-of-order-OK logic.
2221         */
2222        if (skb->decrypted)
2223                return false;
2224#endif
2225        return (skb->destructor == sock_wfree ||
2226                (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2227}
2228
2229/* This helper is used by netem, as it can hold packets in its
2230 * delay queue. We want to allow the owner socket to send more
2231 * packets, as if they were already TX completed by a typical driver.
2232 * But we also want to keep skb->sk set because some packet schedulers
2233 * rely on it (sch_fq for example).
2234 */
2235void skb_orphan_partial(struct sk_buff *skb)
2236{
2237        if (skb_is_tcp_pure_ack(skb))
2238                return;
2239
2240        if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2241                return;
2242
2243        skb_orphan(skb);
2244}
2245EXPORT_SYMBOL(skb_orphan_partial);
2246
2247/*
2248 * Read buffer destructor automatically called from kfree_skb.
2249 */
2250void sock_rfree(struct sk_buff *skb)
2251{
2252        struct sock *sk = skb->sk;
2253        unsigned int len = skb->truesize;
2254
2255        atomic_sub(len, &sk->sk_rmem_alloc);
2256        sk_mem_uncharge(sk, len);
2257}
2258EXPORT_SYMBOL(sock_rfree);
2259
2260/*
2261 * Buffer destructor for skbs that are not used directly in read or write
2262 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2263 */
2264void sock_efree(struct sk_buff *skb)
2265{
2266        sock_put(skb->sk);
2267}
2268EXPORT_SYMBOL(sock_efree);
2269
/* Buffer destructor for prefetch/receive path where reference count may
 * not be held, e.g. for listen sockets.
 *
 * The reference is only dropped when sk_is_refcounted() reports that the
 * socket is reference counted.
 */
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;

        if (!sk_is_refcounted(sk))
                return;

        sock_gen_put(sk);
}
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */
2281
2282kuid_t sock_i_uid(struct sock *sk)
2283{
2284        kuid_t uid;
2285
2286        read_lock_bh(&sk->sk_callback_lock);
2287        uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2288        read_unlock_bh(&sk->sk_callback_lock);
2289        return uid;
2290}
2291EXPORT_SYMBOL(sock_i_uid);
2292
2293unsigned long sock_i_ino(struct sock *sk)
2294{
2295        unsigned long ino;
2296
2297        read_lock_bh(&sk->sk_callback_lock);
2298        ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2299        read_unlock_bh(&sk->sk_callback_lock);
2300        return ino;
2301}
2302EXPORT_SYMBOL(sock_i_ino);
2303
2304/*
2305 * Allocate a skb from the socket's send buffer.
2306 */
2307struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2308                             gfp_t priority)
2309{
2310        if (force ||
2311            refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2312                struct sk_buff *skb = alloc_skb(size, priority);
2313
2314                if (skb) {
2315                        skb_set_owner_w(skb, sk);
2316                        return skb;
2317                }
2318        }
2319        return NULL;
2320}
2321EXPORT_SYMBOL(sock_wmalloc);
2322
2323static void sock_ofree(struct sk_buff *skb)
2324{
2325        struct sock *sk = skb->sk;
2326
2327        atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2328}
2329
2330struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2331                             gfp_t priority)
2332{
2333        struct sk_buff *skb;
2334
2335        /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2336        if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2337            sysctl_optmem_max)
2338                return NULL;
2339
2340        skb = alloc_skb(size, priority);
2341        if (!skb)
2342                return NULL;
2343
2344        atomic_add(skb->truesize, &sk->sk_omem_alloc);
2345        skb->sk = sk;
2346        skb->destructor = sock_ofree;
2347        return skb;
2348}
2349
2350/*
2351 * Allocate a memory block from the socket's option memory buffer.
2352 */
2353void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2354{
2355        if ((unsigned int)size <= sysctl_optmem_max &&
2356            atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2357                void *mem;
2358                /* First do the add, to avoid the race if kmalloc
2359                 * might sleep.
2360                 */
2361                atomic_add(size, &sk->sk_omem_alloc);
2362                mem = kmalloc(size, priority);
2363                if (mem)
2364                        return mem;
2365                atomic_sub(size, &sk->sk_omem_alloc);
2366        }
2367        return NULL;
2368}
2369EXPORT_SYMBOL(sock_kmalloc);
2370
2371/* Free an option memory block. Note, we actually want the inline
2372 * here as this allows gcc to detect the nullify and fold away the
2373 * condition entirely.
2374 */
2375static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2376                                  const bool nullify)
2377{
2378        if (WARN_ON_ONCE(!mem))
2379                return;
2380        if (nullify)
2381                kfree_sensitive(mem);
2382        else
2383                kfree(mem);
2384        atomic_sub(size, &sk->sk_omem_alloc);
2385}
2386
2387void sock_kfree_s(struct sock *sk, void *mem, int size)
2388{
2389        __sock_kfree_s(sk, mem, size, false);
2390}
2391EXPORT_SYMBOL(sock_kfree_s);
2392
2393void sock_kzfree_s(struct sock *sk, void *mem, int size)
2394{
2395        __sock_kfree_s(sk, mem, size, true);
2396}
2397EXPORT_SYMBOL(sock_kzfree_s);
2398
2399/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2400   I think, these locks should be removed for datagram sockets.
2401 */
2402static long sock_wait_for_wmem(struct sock *sk, long timeo)
2403{
2404        DEFINE_WAIT(wait);
2405
2406        sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2407        for (;;) {
2408                if (!timeo)
2409                        break;
2410                if (signal_pending(current))
2411                        break;
2412                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2413                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2414                if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2415                        break;
2416                if (sk->sk_shutdown & SEND_SHUTDOWN)
2417                        break;
2418                if (sk->sk_err)
2419                        break;
2420                timeo = schedule_timeout(timeo);
2421        }
2422        finish_wait(sk_sleep(sk), &wait);
2423        return timeo;
2424}
2425
2426
2427/*
2428 *      Generic send/receive buffer handlers
2429 */
2430
2431struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2432                                     unsigned long data_len, int noblock,
2433                                     int *errcode, int max_page_order)
2434{
2435        struct sk_buff *skb;
2436        long timeo;
2437        int err;
2438
2439        timeo = sock_sndtimeo(sk, noblock);
2440        for (;;) {
2441                err = sock_error(sk);
2442                if (err != 0)
2443                        goto failure;
2444
2445                err = -EPIPE;
2446                if (sk->sk_shutdown & SEND_SHUTDOWN)
2447                        goto failure;
2448
2449                if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2450                        break;
2451
2452                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2453                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2454                err = -EAGAIN;
2455                if (!timeo)
2456                        goto failure;
2457                if (signal_pending(current))
2458                        goto interrupted;
2459                timeo = sock_wait_for_wmem(sk, timeo);
2460        }
2461        skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2462                                   errcode, sk->sk_allocation);
2463        if (skb)
2464                skb_set_owner_w(skb, sk);
2465        return skb;
2466
2467interrupted:
2468        err = sock_intr_errno(timeo);
2469failure:
2470        *errcode = err;
2471        return NULL;
2472}
2473EXPORT_SYMBOL(sock_alloc_send_pskb);
2474
/*
 * Allocate a fully linear send skb of @size bytes: a wrapper around
 * sock_alloc_send_pskb() with no fragment data and page order 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
                                    int noblock, int *errcode)
{
        const unsigned long frag_len = 0;
        const int page_order = 0;

        return sock_alloc_send_pskb(sk, size, frag_len, noblock, errcode,
                                    page_order);
}
EXPORT_SYMBOL(sock_alloc_send_skb);
2481
2482int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2483                     struct sockcm_cookie *sockc)
2484{
2485        u32 tsflags;
2486
2487        switch (cmsg->cmsg_type) {
2488        case SO_MARK:
2489                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2490                        return -EPERM;
2491                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2492                        return -EINVAL;
2493                sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2494                break;
2495        case SO_TIMESTAMPING_OLD:
2496                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2497                        return -EINVAL;
2498
2499                tsflags = *(u32 *)CMSG_DATA(cmsg);
2500                if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2501                        return -EINVAL;
2502
2503                sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2504                sockc->tsflags |= tsflags;
2505                break;
2506        case SCM_TXTIME:
2507                if (!sock_flag(sk, SOCK_TXTIME))
2508                        return -EINVAL;
2509                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2510                        return -EINVAL;
2511                sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2512                break;
2513        /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2514        case SCM_RIGHTS:
2515        case SCM_CREDENTIALS:
2516                break;
2517        default:
2518                return -EINVAL;
2519        }
2520        return 0;
2521}
2522EXPORT_SYMBOL(__sock_cmsg_send);
2523
2524int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2525                   struct sockcm_cookie *sockc)
2526{
2527        struct cmsghdr *cmsg;
2528        int ret;
2529
2530        for_each_cmsghdr(cmsg, msg) {
2531                if (!CMSG_OK(msg, cmsg))
2532                        return -EINVAL;
2533                if (cmsg->cmsg_level != SOL_SOCKET)
2534                        continue;
2535                ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2536                if (ret)
2537                        return ret;
2538        }
2539        return 0;
2540}
2541EXPORT_SYMBOL(sock_cmsg_send);
2542
2543static void sk_enter_memory_pressure(struct sock *sk)
2544{
2545        if (!sk->sk_prot->enter_memory_pressure)
2546                return;
2547
2548        sk->sk_prot->enter_memory_pressure(sk);
2549}
2550
2551static void sk_leave_memory_pressure(struct sock *sk)
2552{
2553        if (sk->sk_prot->leave_memory_pressure) {
2554                sk->sk_prot->leave_memory_pressure(sk);
2555        } else {
2556                unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2557
2558                if (memory_pressure && READ_ONCE(*memory_pressure))
2559                        WRITE_ONCE(*memory_pressure, 0);
2560        }
2561}
2562
2563#define SKB_FRAG_PAGE_ORDER     get_order(32768)
2564DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2565
2566/**
2567 * skb_page_frag_refill - check that a page_frag contains enough room
2568 * @sz: minimum size of the fragment we want to get
2569 * @pfrag: pointer to page_frag
2570 * @gfp: priority for memory allocation
2571 *
2572 * Note: While this allocator tries to use high order pages, there is
2573 * no guarantee that allocations succeed. Therefore, @sz MUST be
2574 * less or equal than PAGE_SIZE.
2575 */
2576bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2577{
2578        if (pfrag->page) {
2579                if (page_ref_count(pfrag->page) == 1) {
2580                        pfrag->offset = 0;
2581                        return true;
2582                }
2583                if (pfrag->offset + sz <= pfrag->size)
2584                        return true;
2585                put_page(pfrag->page);
2586        }
2587
2588        pfrag->offset = 0;
2589        if (SKB_FRAG_PAGE_ORDER &&
2590            !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2591                /* Avoid direct reclaim but allow kswapd to wake */
2592                pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2593                                          __GFP_COMP | __GFP_NOWARN |
2594                                          __GFP_NORETRY,
2595                                          SKB_FRAG_PAGE_ORDER);
2596                if (likely(pfrag->page)) {
2597                        pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2598                        return true;
2599                }
2600        }
2601        pfrag->page = alloc_page(gfp);
2602        if (likely(pfrag->page)) {
2603                pfrag->size = PAGE_SIZE;
2604                return true;
2605        }
2606        return false;
2607}
2608EXPORT_SYMBOL(skb_page_frag_refill);
2609
2610bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2611{
2612        if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2613                return true;
2614
2615        sk_enter_memory_pressure(sk);
2616        sk_stream_moderate_sndbuf(sk);
2617        return false;
2618}
2619EXPORT_SYMBOL(sk_page_frag_refill);
2620
2621void __lock_sock(struct sock *sk)
2622        __releases(&sk->sk_lock.slock)
2623        __acquires(&sk->sk_lock.slock)
2624{
2625        DEFINE_WAIT(wait);
2626
2627        for (;;) {
2628                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2629                                        TASK_UNINTERRUPTIBLE);
2630                spin_unlock_bh(&sk->sk_lock.slock);
2631                schedule();
2632                spin_lock_bh(&sk->sk_lock.slock);
2633                if (!sock_owned_by_user(sk))
2634                        break;
2635        }
2636        finish_wait(&sk->sk_lock.wq, &wait);
2637}
2638
2639void __release_sock(struct sock *sk)
2640        __releases(&sk->sk_lock.slock)
2641        __acquires(&sk->sk_lock.slock)
2642{
2643        struct sk_buff *skb, *next;
2644
2645        while ((skb = sk->sk_backlog.head) != NULL) {
2646                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2647
2648                spin_unlock_bh(&sk->sk_lock.slock);
2649
2650                do {
2651                        next = skb->next;
2652                        prefetch(next);
2653                        WARN_ON_ONCE(skb_dst_is_noref(skb));
2654                        skb_mark_not_on_list(skb);
2655                        sk_backlog_rcv(sk, skb);
2656
2657                        cond_resched();
2658
2659                        skb = next;
2660                } while (skb != NULL);
2661
2662                spin_lock_bh(&sk->sk_lock.slock);
2663        }
2664
2665        /*
2666         * Doing the zeroing here guarantee we can not loop forever
2667         * while a wild producer attempts to flood us.
2668         */
2669        sk->sk_backlog.len = 0;
2670}
2671
2672void __sk_flush_backlog(struct sock *sk)
2673{
2674        spin_lock_bh(&sk->sk_lock.slock);
2675        __release_sock(sk);
2676        spin_unlock_bh(&sk->sk_lock.slock);
2677}
2678
2679/**
2680 * sk_wait_data - wait for data to arrive at sk_receive_queue
2681 * @sk:    sock to wait on
2682 * @timeo: for how long
2683 * @skb:   last skb seen on sk_receive_queue
2684 *
2685 * Now socket state including sk->sk_err is changed only under lock,
2686 * hence we may omit checks after joining wait queue.
2687 * We check receive queue before schedule() only as optimization;
2688 * it is very likely that release_sock() added new data.
2689 */
2690int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2691{
2692        DEFINE_WAIT_FUNC(wait, woken_wake_function);
2693        int rc;
2694
2695        add_wait_queue(sk_sleep(sk), &wait);
2696        sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2697        rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2698        sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2699        remove_wait_queue(sk_sleep(sk), &wait);
2700        return rc;
2701}
2702EXPORT_SYMBOL(sk_wait_data);
2703
2704/**
2705 *      __sk_mem_raise_allocated - increase memory_allocated
2706 *      @sk: socket
2707 *      @size: memory size to allocate
2708 *      @amt: pages to allocate
2709 *      @kind: allocation type
2710 *
2711 *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2712 */
2713int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2714{
2715        struct proto *prot = sk->sk_prot;
2716        long allocated = sk_memory_allocated_add(sk, amt);
2717        bool charged = true;
2718
2719        if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2720            !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
2721                goto suppress_allocation;
2722
2723        /* Under limit. */
2724        if (allocated <= sk_prot_mem_limits(sk, 0)) {
2725                sk_leave_memory_pressure(sk);
2726                return 1;
2727        }
2728
2729        /* Under pressure. */
2730        if (allocated > sk_prot_mem_limits(sk, 1))
2731                sk_enter_memory_pressure(sk);
2732
2733        /* Over hard limit. */
2734        if (allocated > sk_prot_mem_limits(sk, 2))
2735                goto suppress_allocation;
2736
2737        /* guarantee minimum buffer size under pressure */
2738        if (kind == SK_MEM_RECV) {
2739                if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2740                        return 1;
2741
2742        } else { /* SK_MEM_SEND */
2743                int wmem0 = sk_get_wmem0(sk, prot);
2744
2745                if (sk->sk_type == SOCK_STREAM) {
2746                        if (sk->sk_wmem_queued < wmem0)
2747                                return 1;
2748                } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2749                                return 1;
2750                }
2751        }
2752
2753        if (sk_has_memory_pressure(sk)) {
2754                u64 alloc;
2755
2756                if (!sk_under_memory_pressure(sk))
2757                        return 1;
2758                alloc = sk_sockets_allocated_read_positive(sk);
2759                if (sk_prot_mem_limits(sk, 2) > alloc *
2760                    sk_mem_pages(sk->sk_wmem_queued +
2761                                 atomic_read(&sk->sk_rmem_alloc) +
2762                                 sk->sk_forward_alloc))
2763                        return 1;
2764        }
2765
2766suppress_allocation:
2767
2768        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2769                sk_stream_moderate_sndbuf(sk);
2770
2771                /* Fail only if socket is _under_ its sndbuf.
2772                 * In this case we cannot block, so that we have to fail.
2773                 */
2774                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2775                        return 1;
2776        }
2777
2778        if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2779                trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2780
2781        sk_memory_allocated_sub(sk, amt);
2782
2783        if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2784                mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2785
2786        return 0;
2787}
2788EXPORT_SYMBOL(__sk_mem_raise_allocated);
2789
2790/**
2791 *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2792 *      @sk: socket
2793 *      @size: memory size to allocate
2794 *      @kind: allocation type
2795 *
2796 *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2797 *      rmem allocation. This function assumes that protocols which have
2798 *      memory_pressure use sk_wmem_queued as write buffer accounting.
2799 */
2800int __sk_mem_schedule(struct sock *sk, int size, int kind)
2801{
2802        int ret, amt = sk_mem_pages(size);
2803
2804        sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2805        ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2806        if (!ret)
2807                sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2808        return ret;
2809}
2810EXPORT_SYMBOL(__sk_mem_schedule);
2811
2812/**
2813 *      __sk_mem_reduce_allocated - reclaim memory_allocated
2814 *      @sk: socket
2815 *      @amount: number of quanta
2816 *
2817 *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2818 */
2819void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2820{
2821        sk_memory_allocated_sub(sk, amount);
2822
2823        if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2824                mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2825
2826        if (sk_under_memory_pressure(sk) &&
2827            (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2828                sk_leave_memory_pressure(sk);
2829}
2830EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2831
2832/**
2833 *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2834 *      @sk: socket
2835 *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2836 */
2837void __sk_mem_reclaim(struct sock *sk, int amount)
2838{
2839        amount >>= SK_MEM_QUANTUM_SHIFT;
2840        sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2841        __sk_mem_reduce_allocated(sk, amount);
2842}
2843EXPORT_SYMBOL(__sk_mem_reclaim);
2844
2845int sk_set_peek_off(struct sock *sk, int val)
2846{
2847        sk->sk_peek_off = val;
2848        return 0;
2849}
2850EXPORT_SYMBOL_GPL(sk_set_peek_off);
2851
2852/*
2853 * Set of default routines for initialising struct proto_ops when
2854 * the protocol does not support a particular function. In certain
2855 * cases where it makes no sense for a protocol to have a "do nothing"
2856 * function, some default processing is provided.
2857 */
2858
/**
 * sock_no_bind - default bind() for protocols that do not support it
 * @sock: unused
 * @saddr: unused
 * @len: unused
 *
 * Return: always -EOPNOTSUPP.
 */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
2863EXPORT_SYMBOL(sock_no_bind);
2864
/**
 * sock_no_connect - default connect() for protocols that do not support it
 * @sock: unused
 * @saddr: unused
 * @len: unused
 * @flags: unused
 *
 * Return: always -EOPNOTSUPP.
 */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
2870EXPORT_SYMBOL(sock_no_connect);
2871
/**
 * sock_no_socketpair - default socketpair() for protocols without support
 * @sock1: unused
 * @sock2: unused
 *
 * Return: always -EOPNOTSUPP.
 */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
2876EXPORT_SYMBOL(sock_no_socketpair);
2877
/**
 * sock_no_accept - default accept() for protocols that do not support it
 * @sock: unused
 * @newsock: unused
 * @flags: unused
 * @kern: unused
 *
 * Return: always -EOPNOTSUPP.
 */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
		   bool kern)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
2883EXPORT_SYMBOL(sock_no_accept);
2884
/**
 * sock_no_getname - default getname() for protocols that do not support it
 * @sock: unused
 * @saddr: unused
 * @peer: unused
 *
 * Return: always -EOPNOTSUPP.
 */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int peer)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
2890EXPORT_SYMBOL(sock_no_getname);
2891
/**
 * sock_no_ioctl - default ioctl() for protocols that do not support it
 * @sock: unused
 * @cmd: unused
 * @arg: unused
 *
 * Return: always -EOPNOTSUPP.
 */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
2896EXPORT_SYMBOL(sock_no_ioctl);
2897
/**
 * sock_no_listen - default listen() for protocols that do not support it
 * @sock: unused
 * @backlog: unused
 *
 * Return: always -EOPNOTSUPP.
 */
int sock_no_listen(struct socket *sock, int backlog)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
2902EXPORT_SYMBOL(sock_no_listen);
2903
/**
 * sock_no_shutdown - default shutdown() for protocols that do not support it
 * @sock: unused
 * @how: unused
 *
 * Return: always -EOPNOTSUPP.
 */
int sock_no_shutdown(struct socket *sock, int how)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
2908EXPORT_SYMBOL(sock_no_shutdown);
2909
/**
 * sock_no_sendmsg - default sendmsg() for protocols that do not support it
 * @sock: unused
 * @m: unused
 * @len: unused
 *
 * Return: always -EOPNOTSUPP.
 */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
2914EXPORT_SYMBOL(sock_no_sendmsg);
2915
/**
 * sock_no_sendmsg_locked - default locked sendmsg for protocols without one
 * @sk: unused
 * @m: unused
 * @len: unused
 *
 * Return: always -EOPNOTSUPP.
 */
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
2920EXPORT_SYMBOL(sock_no_sendmsg_locked);
2921
/**
 * sock_no_recvmsg - default recvmsg() for protocols that do not support it
 * @sock: unused
 * @m: unused
 * @len: unused
 * @flags: unused
 *
 * Return: always -EOPNOTSUPP.
 */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
		    int flags)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
2927EXPORT_SYMBOL(sock_no_recvmsg);
2928
/**
 * sock_no_mmap - default mmap() for protocols that do not support it
 * @file: unused
 * @sock: unused
 * @vma: unused
 *
 * Return: always -ENODEV, mirroring the error code of a missing mmap
 * method.
 */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	const int no_mmap = -ENODEV;

	return no_mmap;
}
2934EXPORT_SYMBOL(sock_no_mmap);
2935
2936/*
2937 * When a file is received (via SCM_RIGHTS, etc), we must bump the
2938 * various sock-based usage counts.
2939 */
2940void __receive_sock(struct file *file)
2941{
2942        struct socket *sock;
2943
2944        sock = sock_from_file(file);
2945        if (sock) {
2946                sock_update_netprioidx(&sock->sk->sk_cgrp_data);
2947                sock_update_classid(&sock->sk->sk_cgrp_data);
2948        }
2949}
2950
2951ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2952{
2953        ssize_t res;
2954        struct msghdr msg = {.msg_flags = flags};
2955        struct kvec iov;
2956        char *kaddr = kmap(page);
2957        iov.iov_base = kaddr + offset;
2958        iov.iov_len = size;
2959        res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2960        kunmap(page);
2961        return res;
2962}
2963EXPORT_SYMBOL(sock_no_sendpage);
2964
2965ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2966                                int offset, size_t size, int flags)
2967{
2968        ssize_t res;
2969        struct msghdr msg = {.msg_flags = flags};
2970        struct kvec iov;
2971        char *kaddr = kmap(page);
2972
2973        iov.iov_base = kaddr + offset;
2974        iov.iov_len = size;
2975        res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2976        kunmap(page);
2977        return res;
2978}
2979EXPORT_SYMBOL(sock_no_sendpage_locked);
2980
2981/*
2982 *      Default Socket Callbacks
2983 */
2984
2985static void sock_def_wakeup(struct sock *sk)
2986{
2987        struct socket_wq *wq;
2988
2989        rcu_read_lock();
2990        wq = rcu_dereference(sk->sk_wq);
2991        if (skwq_has_sleeper(wq))
2992                wake_up_interruptible_all(&wq->wait);
2993        rcu_read_unlock();
2994}
2995
2996static void sock_def_error_report(struct sock *sk)
2997{
2998        struct socket_wq *wq;
2999
3000        rcu_read_lock();
3001        wq = rcu_dereference(sk->sk_wq);
3002        if (skwq_has_sleeper(wq))
3003                wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3004        sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3005        rcu_read_unlock();
3006}
3007
3008void sock_def_readable(struct sock *sk)
3009{
3010        struct socket_wq *wq;
3011
3012        rcu_read_lock();
3013        wq = rcu_dereference(sk->sk_wq);
3014        if (skwq_has_sleeper(wq))
3015                wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3016                                                EPOLLRDNORM | EPOLLRDBAND);
3017        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3018        rcu_read_unlock();
3019}
3020
3021static void sock_def_write_space(struct sock *sk)
3022{
3023        struct socket_wq *wq;
3024
3025        rcu_read_lock();
3026
3027        /* Do not wake up a writer until he can make "significant"
3028         * progress.  --DaveM
3029         */
3030        if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
3031                wq = rcu_dereference(sk->sk_wq);
3032                if (skwq_has_sleeper(wq))
3033                        wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3034                                                EPOLLWRNORM | EPOLLWRBAND);
3035
3036                /* Should agree with poll, otherwise some programs break */
3037                if (sock_writeable(sk))
3038                        sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3039        }
3040
3041        rcu_read_unlock();
3042}
3043
/* Default sk_destruct callback: intentionally does nothing. */
static void sock_def_destruct(struct sock *sk)
{
	return;
}
3047
3048void sk_send_sigurg(struct sock *sk)
3049{
3050        if (sk->sk_socket && sk->sk_socket->file)
3051                if (send_sigurg(&sk->sk_socket->file->f_owner))
3052                        sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3053}
3054EXPORT_SYMBOL(sk_send_sigurg);
3055
/**
 * sk_reset_timer - (re)arm a socket timer
 * @sk: socket owning @timer
 * @timer: timer to modify
 * @expires: new expiry, passed to mod_timer()
 *
 * A reference on @sk is taken when mod_timer() returns zero.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	int was_pending = mod_timer(timer, expires);

	if (!was_pending)
		sock_hold(sk);
}
3062EXPORT_SYMBOL(sk_reset_timer);
3063
/**
 * sk_stop_timer - cancel a socket timer
 * @sk: socket owning @timer
 * @timer: timer to delete with del_timer()
 *
 * A reference on @sk is dropped with __sock_put() when del_timer()
 * returns nonzero.
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	int was_pending = del_timer(timer);

	if (was_pending)
		__sock_put(sk);
}
3069EXPORT_SYMBOL(sk_stop_timer);
3070
/**
 * sk_stop_timer_sync - cancel a socket timer using del_timer_sync()
 * @sk: socket owning @timer
 * @timer: timer to delete
 *
 * A reference on @sk is dropped with __sock_put() when del_timer_sync()
 * returns nonzero.
 */
void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
	int was_pending = del_timer_sync(timer);

	if (was_pending)
		__sock_put(sk);
}
3076EXPORT_SYMBOL(sk_stop_timer_sync);
3077
3078void sock_init_data(struct socket *sock, struct sock *sk)
3079{
3080        sk_init_common(sk);
3081        sk->sk_send_head        =       NULL;
3082
3083        timer_setup(&sk->sk_timer, NULL, 0);
3084
3085        sk->sk_allocation       =       GFP_KERNEL;
3086        sk->sk_rcvbuf           =       sysctl_rmem_default;
3087        sk->sk_sndbuf           =       sysctl_wmem_default;
3088        sk->sk_state            =       TCP_CLOSE;
3089        sk_set_socket(sk, sock);
3090
3091        sock_set_flag(sk, SOCK_ZAPPED);
3092
3093        if (sock) {
3094                sk->sk_type     =       sock->type;
3095                RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3096                sock->sk        =       sk;
3097                sk->sk_uid      =       SOCK_INODE(sock)->i_uid;
3098        } else {
3099                RCU_INIT_POINTER(sk->sk_wq, NULL);
3100                sk->sk_uid      =       make_kuid(sock_net(sk)->user_ns, 0);
3101        }
3102
3103        rwlock_init(&sk->sk_callback_lock);
3104        if (sk->sk_kern_sock)
3105                lockdep_set_class_and_name(
3106                        &sk->sk_callback_lock,
3107                        af_kern_callback_keys + sk->sk_family,
3108                        af_family_kern_clock_key_strings[sk->sk_family]);
3109        else
3110                lockdep_set_class_and_name(
3111                        &sk->sk_callback_lock,
3112                        af_callback_keys + sk->sk_family,
3113                        af_family_clock_key_strings[sk->sk_family]);
3114
3115        sk->sk_state_change     =       sock_def_wakeup;
3116        sk->sk_data_ready       =       sock_def_readable;
3117        sk->sk_write_space      =       sock_def_write_space;
3118        sk->sk_error_report     =       sock_def_error_report;
3119        sk->sk_destruct         =       sock_def_destruct;
3120
3121        sk->sk_frag.page        =       NULL;
3122        sk->sk_frag.offset      =       0;
3123        sk->sk_peek_off         =       -1;
3124
3125        sk->sk_peer_pid         =       NULL;
3126        sk->sk_peer_cred        =       NULL;
3127        sk->sk_write_pending    =       0;
3128        sk->sk_rcvlowat         =       1;
3129        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
3130        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
3131
3132        sk->sk_stamp = SK_DEFAULT_STAMP;
3133#if BITS_PER_LONG==32
3134        seqlock_init(&sk->sk_stamp_seq);
3135#endif
3136        atomic_set(&sk->sk_zckey, 0);
3137
3138#ifdef CONFIG_NET_RX_BUSY_POLL
3139        sk->sk_napi_id          =       0;
3140        sk->sk_ll_usec          =       sysctl_net_busy_read;
3141#endif
3142
3143        sk->sk_max_pacing_rate = ~0UL;
3144        sk->sk_pacing_rate = ~0UL;
3145        WRITE_ONCE(sk->sk_pacing_shift, 10);
3146        sk->sk_incoming_cpu = -1;
3147
3148        sk_rx_queue_clear(sk);
3149        /*
3150         * Before updating sk_refcnt, we must commit prior changes to memory
3151         * (Documentation/RCU/rculist_nulls.rst for details)
3152         */
3153        smp_wmb();
3154        refcount_set(&sk->sk_refcnt, 1);
3155        atomic_set(&sk->sk_drops, 0);
3156}
3157EXPORT_SYMBOL(sock_init_data);
3158
3159void lock_sock_nested(struct sock *sk, int subclass)
3160{
3161        might_sleep();
3162        spin_lock_bh(&sk->sk_lock.slock);
3163        if (sk->sk_lock.owned)
3164                __lock_sock(sk);
3165        sk->sk_lock.owned = 1;
3166        spin_unlock(&sk->sk_lock.slock);
3167        /*
3168         * The sk_lock has mutex_lock() semantics here:
3169         */
3170        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3171        local_bh_enable();
3172}
3173EXPORT_SYMBOL(lock_sock_nested);
3174
3175void release_sock(struct sock *sk)
3176{
3177        spin_lock_bh(&sk->sk_lock.slock);
3178        if (sk->sk_backlog.tail)
3179                __release_sock(sk);
3180
3181        /* Warning : release_cb() might need to release sk ownership,
3182         * ie call sock_release_ownership(sk) before us.
3183         */
3184        if (sk->sk_prot->release_cb)
3185                sk->sk_prot->release_cb(sk);
3186
3187        sock_release_ownership(sk);
3188        if (waitqueue_active(&sk->sk_lock.wq))
3189                wake_up(&sk->sk_lock.wq);
3190        spin_unlock_bh(&sk->sk_lock.slock);
3191}
3192EXPORT_SYMBOL(release_sock);
3193
3194/**
3195 * lock_sock_fast - fast version of lock_sock
3196 * @sk: socket
3197 *
3198 * This version should be used for very small section, where process wont block
3199 * return false if fast path is taken:
3200 *
3201 *   sk_lock.slock locked, owned = 0, BH disabled
3202 *
3203 * return true if slow path is taken:
3204 *
3205 *   sk_lock.slock unlocked, owned = 1, BH enabled
3206 */
3207bool lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3208{
3209        might_sleep();
3210        spin_lock_bh(&sk->sk_lock.slock);
3211
3212        if (!sk->sk_lock.owned)
3213                /*
3214                 * Note : We must disable BH
3215                 */
3216                return false;
3217
3218        __lock_sock(sk);
3219        sk->sk_lock.owned = 1;
3220        spin_unlock(&sk->sk_lock.slock);
3221        /*
3222         * The sk_lock has mutex_lock() semantics here:
3223         */
3224        mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
3225        __acquire(&sk->sk_lock.slock);
3226        local_bh_enable();
3227        return true;
3228}
3229EXPORT_SYMBOL(lock_sock_fast);
3230
3231int sock_gettstamp(struct socket *sock, void __user *userstamp,
3232                   bool timeval, bool time32)
3233{
3234        struct sock *sk = sock->sk;
3235        struct timespec64 ts;
3236
3237        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3238        ts = ktime_to_timespec64(sock_read_timestamp(sk));
3239        if (ts.tv_sec == -1)
3240                return -ENOENT;
3241        if (ts.tv_sec == 0) {
3242                ktime_t kt = ktime_get_real();
3243                sock_write_timestamp(sk, kt);
3244                ts = ktime_to_timespec64(kt);
3245        }
3246
3247        if (timeval)
3248                ts.tv_nsec /= 1000;
3249
3250#ifdef CONFIG_COMPAT_32BIT_TIME
3251        if (time32)
3252                return put_old_timespec32(&ts, userstamp);
3253#endif
3254#ifdef CONFIG_SPARC64
3255        /* beware of padding in sparc64 timeval */
3256        if (timeval && !in_compat_syscall()) {
3257                struct __kernel_old_timeval __user tv = {
3258                        .tv_sec = ts.tv_sec,
3259                        .tv_usec = ts.tv_nsec,
3260                };
3261                if (copy_to_user(userstamp, &tv, sizeof(tv)))
3262                        return -EFAULT;
3263                return 0;
3264        }
3265#endif
3266        return put_timespec64(&ts, userstamp);
3267}
3268EXPORT_SYMBOL(sock_gettstamp);
3269
3270void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3271{
3272        if (!sock_flag(sk, flag)) {
3273                unsigned long previous_flags = sk->sk_flags;
3274
3275                sock_set_flag(sk, flag);
3276                /*
3277                 * we just set one of the two flags which require net
3278                 * time stamping, but time stamping might have been on
3279                 * already because of the other one
3280                 */
3281                if (sock_needs_netstamp(sk) &&
3282                    !(previous_flags & SK_FLAGS_TIMESTAMP))
3283                        net_enable_timestamp();
3284        }
3285}
3286
3287int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3288                       int level, int type)
3289{
3290        struct sock_exterr_skb *serr;
3291        struct sk_buff *skb;
3292        int copied, err;
3293
3294        err = -EAGAIN;
3295        skb = sock_dequeue_err_skb(sk);
3296        if (skb == NULL)
3297                goto out;
3298
3299        copied = skb->len;
3300        if (copied > len) {
3301                msg->msg_flags |= MSG_TRUNC;
3302                copied = len;
3303        }
3304        err = skb_copy_datagram_msg(skb, 0, msg, copied);
3305        if (err)
3306                goto out_free_skb;
3307
3308        sock_recv_timestamp(msg, sk, skb);
3309
3310        serr = SKB_EXT_ERR(skb);
3311        put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3312
3313        msg->msg_flags |= MSG_ERRQUEUE;
3314        err = copied;
3315
3316out_free_skb:
3317        kfree_skb(skb);
3318out:
3319        return err;
3320}
3321EXPORT_SYMBOL(sock_recv_errqueue);
3322
3323/*
3324 *      Get a socket option on an socket.
3325 *
3326 *      FIX: POSIX 1003.1g is very ambiguous here. It states that
3327 *      asynchronous errors should be reported by getsockopt. We assume
3328 *      this means if you specify SO_ERROR (otherwise whats the point of it).
3329 */
3330int sock_common_getsockopt(struct socket *sock, int level, int optname,
3331                           char __user *optval, int __user *optlen)
3332{
3333        struct sock *sk = sock->sk;
3334
3335        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3336}
3337EXPORT_SYMBOL(sock_common_getsockopt);
3338
3339int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3340                        int flags)
3341{
3342        struct sock *sk = sock->sk;
3343        int addr_len = 0;
3344        int err;
3345
3346        err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3347                                   flags & ~MSG_DONTWAIT, &addr_len);
3348        if (err >= 0)
3349                msg->msg_namelen = addr_len;
3350        return err;
3351}
3352EXPORT_SYMBOL(sock_common_recvmsg);
3353
3354/*
3355 *      Set socket options on an inet socket.
3356 */
3357int sock_common_setsockopt(struct socket *sock, int level, int optname,
3358                           sockptr_t optval, unsigned int optlen)
3359{
3360        struct sock *sk = sock->sk;
3361
3362        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3363}
3364EXPORT_SYMBOL(sock_common_setsockopt);
3365
3366void sk_common_release(struct sock *sk)
3367{
3368        if (sk->sk_prot->destroy)
3369                sk->sk_prot->destroy(sk);
3370
3371        /*
3372         * Observation: when sk_common_release is called, processes have
3373         * no access to socket. But net still has.
3374         * Step one, detach it from networking:
3375         *
3376         * A. Remove from hash tables.
3377         */
3378
3379        sk->sk_prot->unhash(sk);
3380
3381        /*
3382         * In this point socket cannot receive new packets, but it is possible
3383         * that some packets are in flight because some CPU runs receiver and
3384         * did hash table lookup before we unhashed socket. They will achieve
3385         * receive queue and will be purged by socket destructor.
3386         *
3387         * Also we still have packets pending on receive queue and probably,
3388         * our own packets waiting in device queues. sock_destroy will drain
3389         * receive queue, but transmitted packets will delay socket destruction
3390         * until the last reference will be released.
3391         */
3392
3393        sock_orphan(sk);
3394
3395        xfrm_sk_free_policy(sk);
3396
3397        sk_refcnt_debug_release(sk);
3398
3399        sock_put(sk);
3400}
3401EXPORT_SYMBOL(sk_common_release);
3402
3403void sk_get_meminfo(const struct sock *sk, u32 *mem)
3404{
3405        memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3406
3407        mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3408        mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3409        mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3410        mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3411        mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3412        mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3413        mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3414        mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3415        mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3416}
3417
3418#ifdef CONFIG_PROC_FS
/* Number of per-protocol in-use counter slots. */
#define PROTO_INUSE_NR	64	/* should be enough for the first time */

/* One in-use counter per slot, indexed by a protocol's inuse_idx. */
struct prot_inuse {
	int val[PROTO_INUSE_NR];
};
3423
3424static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3425
3426void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3427{
3428        __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3429}
3430EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3431
3432int sock_prot_inuse_get(struct net *net, struct proto *prot)
3433{
3434        int cpu, idx = prot->inuse_idx;
3435        int res = 0;
3436
3437        for_each_possible_cpu(cpu)
3438                res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3439
3440        return res >= 0 ? res : 0;
3441}
3442EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3443
3444static void sock_inuse_add(struct net *net, int val)
3445{
3446        this_cpu_add(*net->core.sock_inuse, val);
3447}
3448
3449int sock_inuse_get(struct net *net)
3450{
3451        int cpu, res = 0;
3452
3453        for_each_possible_cpu(cpu)
3454                res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3455
3456        return res;
3457}
3458
3459EXPORT_SYMBOL_GPL(sock_inuse_get);
3460
3461static int __net_init sock_inuse_init_net(struct net *net)
3462{
3463        net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3464        if (net->core.prot_inuse == NULL)
3465                return -ENOMEM;
3466
3467        net->core.sock_inuse = alloc_percpu(int);
3468        if (net->core.sock_inuse == NULL)
3469                goto out;
3470
3471        return 0;
3472
3473out:
3474        free_percpu(net->core.prot_inuse);
3475        return -ENOMEM;
3476}
3477
3478static void __net_exit sock_inuse_exit_net(struct net *net)
3479{
3480        free_percpu(net->core.prot_inuse);
3481        free_percpu(net->core.sock_inuse);
3482}
3483
3484static struct pernet_operations net_inuse_ops = {
3485        .init = sock_inuse_init_net,
3486        .exit = sock_inuse_exit_net,
3487};
3488
3489static __init int net_inuse_init(void)
3490{
3491        if (register_pernet_subsys(&net_inuse_ops))
3492                panic("Cannot initialize net inuse counters");
3493
3494        return 0;
3495}
3496
3497core_initcall(net_inuse_init);
3498
3499static int assign_proto_idx(struct proto *prot)
3500{
3501        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3502
3503        if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3504                pr_err("PROTO_INUSE_NR exhausted\n");
3505                return -ENOSPC;
3506        }
3507
3508        set_bit(prot->inuse_idx, proto_inuse_idx);
3509        return 0;
3510}
3511
3512static void release_proto_idx(struct proto *prot)
3513{
3514        if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3515                clear_bit(prot->inuse_idx, proto_inuse_idx);
3516}
3517#else
/* !CONFIG_PROC_FS: no in-use slots to hand out, always succeeds. */
static inline int assign_proto_idx(struct proto *prot)
{
	const int ok = 0;

	return ok;
}
3522
/* !CONFIG_PROC_FS: nothing was assigned, so nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
	return;
}
3526
/* !CONFIG_PROC_FS: sockets in use are not counted. */
static void sock_inuse_add(struct net *net, int val)
{
	return;
}
3530#endif
3531
3532static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3533{
3534        if (!twsk_prot)
3535                return;
3536        kfree(twsk_prot->twsk_slab_name);
3537        twsk_prot->twsk_slab_name = NULL;
3538        kmem_cache_destroy(twsk_prot->twsk_slab);
3539        twsk_prot->twsk_slab = NULL;
3540}
3541
3542static int tw_prot_init(const struct proto *prot)
3543{
3544        struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3545
3546        if (!twsk_prot)
3547                return 0;
3548
3549        twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3550                                              prot->name);
3551        if (!twsk_prot->twsk_slab_name)
3552                return -ENOMEM;
3553
3554        twsk_prot->twsk_slab =
3555                kmem_cache_create(twsk_prot->twsk_slab_name,
3556                                  twsk_prot->twsk_obj_size, 0,
3557                                  SLAB_ACCOUNT | prot->slab_flags,
3558                                  NULL);
3559        if (!twsk_prot->twsk_slab) {
3560                pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3561                        prot->name);
3562                return -ENOMEM;
3563        }
3564
3565        return 0;
3566}
3567
3568static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3569{
3570        if (!rsk_prot)
3571                return;
3572        kfree(rsk_prot->slab_name);
3573        rsk_prot->slab_name = NULL;
3574        kmem_cache_destroy(rsk_prot->slab);
3575        rsk_prot->slab = NULL;
3576}
3577
3578static int req_prot_init(const struct proto *prot)
3579{
3580        struct request_sock_ops *rsk_prot = prot->rsk_prot;
3581
3582        if (!rsk_prot)
3583                return 0;
3584
3585        rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3586                                        prot->name);
3587        if (!rsk_prot->slab_name)
3588                return -ENOMEM;
3589
3590        rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3591                                           rsk_prot->obj_size, 0,
3592                                           SLAB_ACCOUNT | prot->slab_flags,
3593                                           NULL);
3594
3595        if (!rsk_prot->slab) {
3596                pr_crit("%s: Can't create request sock SLAB cache!\n",
3597                        prot->name);
3598                return -ENOMEM;
3599        }
3600        return 0;
3601}
3602
3603int proto_register(struct proto *prot, int alloc_slab)
3604{
3605        int ret = -ENOBUFS;
3606
3607        if (alloc_slab) {
3608                prot->slab = kmem_cache_create_usercopy(prot->name,
3609                                        prot->obj_size, 0,
3610                                        SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3611                                        prot->slab_flags,
3612                                        prot->useroffset, prot->usersize,
3613                                        NULL);
3614
3615                if (prot->slab == NULL) {
3616                        pr_crit("%s: Can't create sock SLAB cache!\n",
3617                                prot->name);
3618                        goto out;
3619                }
3620
3621                if (req_prot_init(prot))
3622                        goto out_free_request_sock_slab;
3623
3624                if (tw_prot_init(prot))
3625                        goto out_free_timewait_sock_slab;
3626        }
3627
3628        mutex_lock(&proto_list_mutex);
3629        ret = assign_proto_idx(prot);
3630        if (ret) {
3631                mutex_unlock(&proto_list_mutex);
3632                goto out_free_timewait_sock_slab;
3633        }
3634        list_add(&prot->node, &proto_list);
3635        mutex_unlock(&proto_list_mutex);
3636        return ret;
3637
3638out_free_timewait_sock_slab:
3639        if (alloc_slab)
3640                tw_prot_cleanup(prot->twsk_prot);
3641out_free_request_sock_slab:
3642        if (alloc_slab) {
3643                req_prot_cleanup(prot->rsk_prot);
3644
3645                kmem_cache_destroy(prot->slab);
3646                prot->slab = NULL;
3647        }
3648out:
3649        return ret;
3650}
3651EXPORT_SYMBOL(proto_register);
3652
3653void proto_unregister(struct proto *prot)
3654{
3655        mutex_lock(&proto_list_mutex);
3656        release_proto_idx(prot);
3657        list_del(&prot->node);
3658        mutex_unlock(&proto_list_mutex);
3659
3660        kmem_cache_destroy(prot->slab);
3661        prot->slab = NULL;
3662
3663        req_prot_cleanup(prot->rsk_prot);
3664        tw_prot_cleanup(prot->twsk_prot);
3665}
3666EXPORT_SYMBOL(proto_unregister);
3667
3668int sock_load_diag_module(int family, int protocol)
3669{
3670        if (!protocol) {
3671                if (!sock_is_registered(family))
3672                        return -ENOENT;
3673
3674                return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3675                                      NETLINK_SOCK_DIAG, family);
3676        }
3677
3678#ifdef CONFIG_INET
3679        if (family == AF_INET &&
3680            protocol != IPPROTO_RAW &&
3681            protocol < MAX_INET_PROTOS &&
3682            !rcu_access_pointer(inet_protos[protocol]))
3683                return -ENOENT;
3684#endif
3685
3686        return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3687                              NETLINK_SOCK_DIAG, family, protocol);
3688}
3689EXPORT_SYMBOL(sock_load_diag_module);
3690
3691#ifdef CONFIG_PROC_FS
3692static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3693        __acquires(proto_list_mutex)
3694{
3695        mutex_lock(&proto_list_mutex);
3696        return seq_list_start_head(&proto_list, *pos);
3697}
3698
3699static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3700{
3701        return seq_list_next(v, &proto_list, pos);
3702}
3703
3704static void proto_seq_stop(struct seq_file *seq, void *v)
3705        __releases(proto_list_mutex)
3706{
3707        mutex_unlock(&proto_list_mutex);
3708}
3709
/* Map a method pointer to the flag character shown per method: 'y' or 'n'. */
static char proto_method_implemented(const void *method)
{
        if (method)
                return 'y';

        return 'n';
}
3714static long sock_prot_memory_allocated(struct proto *proto)
3715{
3716        return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3717}
3718
3719static const char *sock_prot_memory_pressure(struct proto *proto)
3720{
3721        return proto->memory_pressure != NULL ?
3722        proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3723}
3724
3725static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3726{
3727
3728        seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3729                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3730                   proto->name,
3731                   proto->obj_size,
3732                   sock_prot_inuse_get(seq_file_net(seq), proto),
3733                   sock_prot_memory_allocated(proto),
3734                   sock_prot_memory_pressure(proto),
3735                   proto->max_header,
3736                   proto->slab == NULL ? "no" : "yes",
3737                   module_name(proto->owner),
3738                   proto_method_implemented(proto->close),
3739                   proto_method_implemented(proto->connect),
3740                   proto_method_implemented(proto->disconnect),
3741                   proto_method_implemented(proto->accept),
3742                   proto_method_implemented(proto->ioctl),
3743                   proto_method_implemented(proto->init),
3744                   proto_method_implemented(proto->destroy),
3745                   proto_method_implemented(proto->shutdown),
3746                   proto_method_implemented(proto->setsockopt),
3747                   proto_method_implemented(proto->getsockopt),
3748                   proto_method_implemented(proto->sendmsg),
3749                   proto_method_implemented(proto->recvmsg),
3750                   proto_method_implemented(proto->sendpage),
3751                   proto_method_implemented(proto->bind),
3752                   proto_method_implemented(proto->backlog_rcv),
3753                   proto_method_implemented(proto->hash),
3754                   proto_method_implemented(proto->unhash),
3755                   proto_method_implemented(proto->get_port),
3756                   proto_method_implemented(proto->enter_memory_pressure));
3757}
3758
3759static int proto_seq_show(struct seq_file *seq, void *v)
3760{
3761        if (v == &proto_list)
3762                seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3763                           "protocol",
3764                           "size",
3765                           "sockets",
3766                           "memory",
3767                           "press",
3768                           "maxhdr",
3769                           "slab",
3770                           "module",
3771                           "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3772        else
3773                proto_seq_printf(seq, list_entry(v, struct proto, node));
3774        return 0;
3775}
3776
3777static const struct seq_operations proto_seq_ops = {
3778        .start  = proto_seq_start,
3779        .next   = proto_seq_next,
3780        .stop   = proto_seq_stop,
3781        .show   = proto_seq_show,
3782};
3783
3784static __net_init int proto_init_net(struct net *net)
3785{
3786        if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3787                        sizeof(struct seq_net_private)))
3788                return -ENOMEM;
3789
3790        return 0;
3791}
3792
3793static __net_exit void proto_exit_net(struct net *net)
3794{
3795        remove_proc_entry("protocols", net->proc_net);
3796}
3797
3798
3799static __net_initdata struct pernet_operations proto_net_ops = {
3800        .init = proto_init_net,
3801        .exit = proto_exit_net,
3802};
3803
3804static int __init proto_init(void)
3805{
3806        return register_pernet_subsys(&proto_net_ops);
3807}
3808
3809subsys_initcall(proto_init);
3810
3811#endif /* PROC_FS */
3812
3813#ifdef CONFIG_NET_RX_BUSY_POLL
3814bool sk_busy_loop_end(void *p, unsigned long start_time)
3815{
3816        struct sock *sk = p;
3817
3818        return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3819               sk_busy_loop_timeout(sk, start_time);
3820}
3821EXPORT_SYMBOL(sk_busy_loop_end);
3822#endif /* CONFIG_NET_RX_BUSY_POLL */
3823
3824int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
3825{
3826        if (!sk->sk_prot->bind_add)
3827                return -EOPNOTSUPP;
3828        return sk->sk_prot->bind_add(sk, addr, addr_len);
3829}
3830EXPORT_SYMBOL(sock_bind_add);
3831
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.