linux/net/core/sock.c
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4 *              operating system.  INET is implemented using the  BSD Socket
   5 *              interface as the means of communication with the user level.
   6 *
   7 *              Generic socket support routines. Memory allocators, socket lock/release
   8 *              handler for protocols to use and generic option handler.
   9 *
  10 * Authors:     Ross Biro
  11 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *              Florian La Roche, <flla@stud.uni-sb.de>
  13 *              Alan Cox, <A.Cox@swansea.ac.uk>
  14 *
  15 * Fixes:
  16 *              Alan Cox        :       Numerous verify_area() problems
  17 *              Alan Cox        :       Connecting on a connecting socket
  18 *                                      now returns an error for tcp.
  19 *              Alan Cox        :       sock->protocol is set correctly.
  20 *                                      and is not sometimes left as 0.
  21 *              Alan Cox        :       connect handles icmp errors on a
  22 *                                      connect properly. Unfortunately there
  23 *                                      is a restart syscall nasty there. I
  24 *                                      can't match BSD without hacking the C
  25 *                                      library. Ideas urgently sought!
  26 *              Alan Cox        :       Disallow bind() to addresses that are
  27 *                                      not ours - especially broadcast ones!!
  28 *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29 *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30 *                                      instead they leave that for the DESTROY timer.
  31 *              Alan Cox        :       Clean up error flag in accept
  32 *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33 *                                      was buggy. Put a remove_sock() in the handler
  34 *                                      for memory when we hit 0. Also altered the timer
  35 *                                      code. The ACK stuff can wait and needs major
  36 *                                      TCP layer surgery.
  37 *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38 *                                      and fixed timer/inet_bh race.
  39 *              Alan Cox        :       Added zapped flag for TCP
  40 *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41 *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42 *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43 *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44 *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45 *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46 *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47 *      Pauline Middelink       :       identd support
  48 *              Alan Cox        :       Fixed connect() taking signals I think.
  49 *              Alan Cox        :       SO_LINGER supported
  50 *              Alan Cox        :       Error reporting fixes
  51 *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52 *              Alan Cox        :       inet sockets don't set sk->type!
  53 *              Alan Cox        :       Split socket option code
  54 *              Alan Cox        :       Callbacks
  55 *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56 *              Alex            :       Removed restriction on inet fioctl
  57 *              Alan Cox        :       Splitting INET from NET core
  58 *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59 *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60 *              Alan Cox        :       Split IP from generic code
  61 *              Alan Cox        :       New kfree_skbmem()
  62 *              Alan Cox        :       Make SO_DEBUG superuser only.
  63 *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64 *                                      (compatibility fix)
  65 *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66 *              Alan Cox        :       Allocator for a socket is settable.
  67 *              Alan Cox        :       SO_ERROR includes soft errors.
  68 *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69 *              Alan Cox        :       Generic socket allocation to make hooks
  70 *                                      easier (suggested by Craig Metz).
  71 *              Michael Pall    :       SO_ERROR returns positive errno again
  72 *              Steve Whitehouse:       Added default destructor to free
  73 *                                      protocol private data.
  74 *              Steve Whitehouse:       Added various other default routines
  75 *                                      common to several socket families.
  76 *              Chris Evans     :       Call suser() check last on F_SETOWN
  77 *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78 *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79 *              Andi Kleen      :       Fix write_space callback
  80 *              Chris Evans     :       Security fixes - signedness again
  81 *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82 *
  83 * To Fix:
  84 */
  85
  86#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  87
  88#include <asm/unaligned.h>
  89#include <linux/capability.h>
  90#include <linux/errno.h>
  91#include <linux/errqueue.h>
  92#include <linux/types.h>
  93#include <linux/socket.h>
  94#include <linux/in.h>
  95#include <linux/kernel.h>
  96#include <linux/module.h>
  97#include <linux/proc_fs.h>
  98#include <linux/seq_file.h>
  99#include <linux/sched.h>
 100#include <linux/sched/mm.h>
 101#include <linux/timer.h>
 102#include <linux/string.h>
 103#include <linux/sockios.h>
 104#include <linux/net.h>
 105#include <linux/mm.h>
 106#include <linux/slab.h>
 107#include <linux/interrupt.h>
 108#include <linux/poll.h>
 109#include <linux/tcp.h>
 110#include <linux/init.h>
 111#include <linux/highmem.h>
 112#include <linux/user_namespace.h>
 113#include <linux/static_key.h>
 114#include <linux/memcontrol.h>
 115#include <linux/prefetch.h>
 116#include <linux/compat.h>
 117
 118#include <linux/uaccess.h>
 119
 120#include <linux/netdevice.h>
 121#include <net/protocol.h>
 122#include <linux/skbuff.h>
 123#include <net/net_namespace.h>
 124#include <net/request_sock.h>
 125#include <net/sock.h>
 126#include <linux/net_tstamp.h>
 127#include <net/xfrm.h>
 128#include <linux/ipsec.h>
 129#include <net/cls_cgroup.h>
 130#include <net/netprio_cgroup.h>
 131#include <linux/sock_diag.h>
 132
 133#include <linux/filter.h>
 134#include <net/sock_reuseport.h>
 135#include <net/bpf_sk_storage.h>
 136
 137#include <trace/events/sock.h>
 138
 139#include <net/tcp.h>
 140#include <net/busy_poll.h>
 141
 142#include <linux/ethtool.h>
 143
 144#include "dev.h"
 145
 146static DEFINE_MUTEX(proto_list_mutex);
 147static LIST_HEAD(proto_list);
 148
 149static void sock_def_write_space_wfree(struct sock *sk);
 150static void sock_def_write_space(struct sock *sk);
 151
 152/**
 153 * sk_ns_capable - General socket capability test
 154 * @sk: Socket to use a capability on or through
 155 * @user_ns: The user namespace of the capability to use
 156 * @cap: The capability to use
 157 *
  158 * Test to see if the opener of the socket had the capability @cap in the
  159 * user namespace @user_ns when the socket was created and if the current
  160 * process has it as well.
 161 */
 162bool sk_ns_capable(const struct sock *sk,
 163                   struct user_namespace *user_ns, int cap)
 164{
 165        return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 166                ns_capable(user_ns, cap);
 167}
 168EXPORT_SYMBOL(sk_ns_capable);
 169
 170/**
 171 * sk_capable - Socket global capability test
 172 * @sk: Socket to use a capability on or through
 173 * @cap: The global capability to use
 174 *
  175 * Test to see if the opener of the socket had the capability @cap in all
  176 * user namespaces when the socket was created and if the current process
  177 * has it as well.
 178 */
 179bool sk_capable(const struct sock *sk, int cap)
 180{
 181        return sk_ns_capable(sk, &init_user_ns, cap);
 182}
 183EXPORT_SYMBOL(sk_capable);
 184
 185/**
 186 * sk_net_capable - Network namespace socket capability test
 187 * @sk: Socket to use a capability on or through
 188 * @cap: The capability to use
 189 *
  190 * Test to see if the opener of the socket had the capability @cap over the
  191 * network namespace the socket is a member of when the socket was created
  192 * and if the current process has it as well.
 193 */
 194bool sk_net_capable(const struct sock *sk, int cap)
 195{
 196        return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 197}
 198EXPORT_SYMBOL(sk_net_capable);
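
/*
 * Illustrative sketch (not part of the original file): a protocol's
 * setsockopt handler can gate a privileged, per-netns option on the socket
 * opener's and the caller's capabilities with the helper above. The function
 * name and the option handling are hypothetical.
 */
static int __maybe_unused example_set_privileged_option(struct sock *sk, int val)
{
	if (!sk_net_capable(sk, CAP_NET_ADMIN))
		return -EPERM;

	/* ... apply the privileged per-socket setting here ... */
	return 0;
}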
 199
 200/*
 201 * Each address family might have different locking rules, so we have
 202 * one slock key per address family and separate keys for internal and
 203 * userspace sockets.
 204 */
 205static struct lock_class_key af_family_keys[AF_MAX];
 206static struct lock_class_key af_family_kern_keys[AF_MAX];
 207static struct lock_class_key af_family_slock_keys[AF_MAX];
 208static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 209
 210/*
 211 * Make lock validator output more readable. (we pre-construct these
 212 * strings build-time, so that runtime initialization of socket
 213 * locks is fast):
 214 */
 215
 216#define _sock_locks(x)                                            \
 217  x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
 218  x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
 219  x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
 220  x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
 221  x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
 222  x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
 223  x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
 224  x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
 225  x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
 226  x "27"       ,        x "28"          ,       x "AF_CAN"      , \
 227  x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "IUCV"        , \
 228  x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
 229  x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
 230  x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
 231  x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_XDP"      , \
 232  x "AF_MCTP"  , \
 233  x "AF_MAX"
 234
 235static const char *const af_family_key_strings[AF_MAX+1] = {
 236        _sock_locks("sk_lock-")
 237};
 238static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 239        _sock_locks("slock-")
 240};
 241static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 242        _sock_locks("clock-")
 243};
 244
 245static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 246        _sock_locks("k-sk_lock-")
 247};
 248static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 249        _sock_locks("k-slock-")
 250};
 251static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 252        _sock_locks("k-clock-")
 253};
 254static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 255        _sock_locks("rlock-")
 256};
 257static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 258        _sock_locks("wlock-")
 259};
 260static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 261        _sock_locks("elock-")
 262};
 263
 264/*
 265 * sk_callback_lock and sk queues locking rules are per-address-family,
 266 * so split the lock classes by using a per-AF key:
 267 */
 268static struct lock_class_key af_callback_keys[AF_MAX];
 269static struct lock_class_key af_rlock_keys[AF_MAX];
 270static struct lock_class_key af_wlock_keys[AF_MAX];
 271static struct lock_class_key af_elock_keys[AF_MAX];
 272static struct lock_class_key af_kern_callback_keys[AF_MAX];
 273
 274/* Run time adjustable parameters. */
 275__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 276EXPORT_SYMBOL(sysctl_wmem_max);
 277__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 278EXPORT_SYMBOL(sysctl_rmem_max);
 279__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 280__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 281
 282/* Maximal space eaten by iovec or ancillary data plus some space */
 283int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 284EXPORT_SYMBOL(sysctl_optmem_max);
 285
 286int sysctl_tstamp_allow_data __read_mostly = 1;
 287
 288DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
 289EXPORT_SYMBOL_GPL(memalloc_socks_key);
 290
 291/**
 292 * sk_set_memalloc - sets %SOCK_MEMALLOC
 293 * @sk: socket to set it on
 294 *
 295 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 296 * It's the responsibility of the admin to adjust min_free_kbytes
  297 * to meet the requirements.
 298 */
 299void sk_set_memalloc(struct sock *sk)
 300{
 301        sock_set_flag(sk, SOCK_MEMALLOC);
 302        sk->sk_allocation |= __GFP_MEMALLOC;
 303        static_branch_inc(&memalloc_socks_key);
 304}
 305EXPORT_SYMBOL_GPL(sk_set_memalloc);
 306
 307void sk_clear_memalloc(struct sock *sk)
 308{
 309        sock_reset_flag(sk, SOCK_MEMALLOC);
 310        sk->sk_allocation &= ~__GFP_MEMALLOC;
 311        static_branch_dec(&memalloc_socks_key);
 312
 313        /*
 314         * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 315         * progress of swapping. SOCK_MEMALLOC may be cleared while
 316         * it has rmem allocations due to the last swapfile being deactivated
 317         * but there is a risk that the socket is unusable due to exceeding
 318         * the rmem limits. Reclaim the reserves and obey rmem limits again.
 319         */
 320        sk_mem_reclaim(sk);
 321}
 322EXPORT_SYMBOL_GPL(sk_clear_memalloc);
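
/*
 * Illustrative sketch (not part of the original file): a swap-over-network
 * backend (e.g. an NFS or NBD transport) marks its socket while a swap file
 * is active and clears the flag again afterwards. The function names are
 * hypothetical.
 */
static void __maybe_unused example_swapon_transport(struct sock *sk)
{
	sk_set_memalloc(sk);	/* may dip into emergency reserves from now on */
}

static void __maybe_unused example_swapoff_transport(struct sock *sk)
{
	sk_clear_memalloc(sk);	/* reclaim reserves; rmem limits apply again */
}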
 323
 324int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 325{
 326        int ret;
 327        unsigned int noreclaim_flag;
 328
 329        /* these should have been dropped before queueing */
 330        BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 331
 332        noreclaim_flag = memalloc_noreclaim_save();
 333        ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
 334                                 tcp_v6_do_rcv,
 335                                 tcp_v4_do_rcv,
 336                                 sk, skb);
 337        memalloc_noreclaim_restore(noreclaim_flag);
 338
 339        return ret;
 340}
 341EXPORT_SYMBOL(__sk_backlog_rcv);
 342
 343void sk_error_report(struct sock *sk)
 344{
 345        sk->sk_error_report(sk);
 346
 347        switch (sk->sk_family) {
 348        case AF_INET:
 349                fallthrough;
 350        case AF_INET6:
 351                trace_inet_sk_error_report(sk);
 352                break;
 353        default:
 354                break;
 355        }
 356}
 357EXPORT_SYMBOL(sk_error_report);
 358
 359int sock_get_timeout(long timeo, void *optval, bool old_timeval)
 360{
 361        struct __kernel_sock_timeval tv;
 362
 363        if (timeo == MAX_SCHEDULE_TIMEOUT) {
 364                tv.tv_sec = 0;
 365                tv.tv_usec = 0;
 366        } else {
 367                tv.tv_sec = timeo / HZ;
 368                tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
 369        }
 370
 371        if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 372                struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
 373                *(struct old_timeval32 *)optval = tv32;
 374                return sizeof(tv32);
 375        }
 376
 377        if (old_timeval) {
 378                struct __kernel_old_timeval old_tv;
 379                old_tv.tv_sec = tv.tv_sec;
 380                old_tv.tv_usec = tv.tv_usec;
 381                *(struct __kernel_old_timeval *)optval = old_tv;
 382                return sizeof(old_tv);
 383        }
 384
 385        *(struct __kernel_sock_timeval *)optval = tv;
 386        return sizeof(tv);
 387}
 388EXPORT_SYMBOL(sock_get_timeout);
 389
 390int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
 391                           sockptr_t optval, int optlen, bool old_timeval)
 392{
 393        if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 394                struct old_timeval32 tv32;
 395
 396                if (optlen < sizeof(tv32))
 397                        return -EINVAL;
 398
 399                if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
 400                        return -EFAULT;
 401                tv->tv_sec = tv32.tv_sec;
 402                tv->tv_usec = tv32.tv_usec;
 403        } else if (old_timeval) {
 404                struct __kernel_old_timeval old_tv;
 405
 406                if (optlen < sizeof(old_tv))
 407                        return -EINVAL;
 408                if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
 409                        return -EFAULT;
 410                tv->tv_sec = old_tv.tv_sec;
 411                tv->tv_usec = old_tv.tv_usec;
 412        } else {
 413                if (optlen < sizeof(*tv))
 414                        return -EINVAL;
 415                if (copy_from_sockptr(tv, optval, sizeof(*tv)))
 416                        return -EFAULT;
 417        }
 418
 419        return 0;
 420}
 421EXPORT_SYMBOL(sock_copy_user_timeval);
 422
 423static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
 424                            bool old_timeval)
 425{
 426        struct __kernel_sock_timeval tv;
 427        int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
 428
 429        if (err)
 430                return err;
 431
 432        if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 433                return -EDOM;
 434
 435        if (tv.tv_sec < 0) {
 436                static int warned __read_mostly;
 437
 438                *timeo_p = 0;
 439                if (warned < 10 && net_ratelimit()) {
 440                        warned++;
 441                        pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 442                                __func__, current->comm, task_pid_nr(current));
 443                }
 444                return 0;
 445        }
 446        *timeo_p = MAX_SCHEDULE_TIMEOUT;
 447        if (tv.tv_sec == 0 && tv.tv_usec == 0)
 448                return 0;
 449        if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
 450                *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
 451        return 0;
 452}
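
/*
 * Worked example (illustrative, not part of the original file): with HZ=1000,
 * a user-supplied timeout of { .tv_sec = 2, .tv_usec = 500000 } is stored as
 *
 *	*timeo_p = 2 * HZ + DIV_ROUND_UP(500000, USEC_PER_SEC / HZ)
 *	         = 2000 + 500 = 2500 jiffies,
 *
 * while { 0, 0 } means "block forever" and is kept as MAX_SCHEDULE_TIMEOUT.
 */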
 453
 454static bool sock_needs_netstamp(const struct sock *sk)
 455{
 456        switch (sk->sk_family) {
 457        case AF_UNSPEC:
 458        case AF_UNIX:
 459                return false;
 460        default:
 461                return true;
 462        }
 463}
 464
 465static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 466{
 467        if (sk->sk_flags & flags) {
 468                sk->sk_flags &= ~flags;
 469                if (sock_needs_netstamp(sk) &&
 470                    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 471                        net_disable_timestamp();
 472        }
 473}
 474
 475
 476int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 477{
 478        unsigned long flags;
 479        struct sk_buff_head *list = &sk->sk_receive_queue;
 480
 481        if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 482                atomic_inc(&sk->sk_drops);
 483                trace_sock_rcvqueue_full(sk, skb);
 484                return -ENOMEM;
 485        }
 486
 487        if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 488                atomic_inc(&sk->sk_drops);
 489                return -ENOBUFS;
 490        }
 491
 492        skb->dev = NULL;
 493        skb_set_owner_r(skb, sk);
 494
  495        /* we escape from the RCU-protected region, make sure we don't leak
  496         * a non-refcounted dst
 497         */
 498        skb_dst_force(skb);
 499
 500        spin_lock_irqsave(&list->lock, flags);
 501        sock_skb_set_dropcount(sk, skb);
 502        __skb_queue_tail(list, skb);
 503        spin_unlock_irqrestore(&list->lock, flags);
 504
 505        if (!sock_flag(sk, SOCK_DEAD))
 506                sk->sk_data_ready(sk);
 507        return 0;
 508}
 509EXPORT_SYMBOL(__sock_queue_rcv_skb);
 510
 511int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
 512                              enum skb_drop_reason *reason)
 513{
 514        enum skb_drop_reason drop_reason;
 515        int err;
 516
 517        err = sk_filter(sk, skb);
 518        if (err) {
 519                drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
 520                goto out;
 521        }
 522        err = __sock_queue_rcv_skb(sk, skb);
 523        switch (err) {
 524        case -ENOMEM:
 525                drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
 526                break;
 527        case -ENOBUFS:
 528                drop_reason = SKB_DROP_REASON_PROTO_MEM;
 529                break;
 530        default:
 531                drop_reason = SKB_NOT_DROPPED_YET;
 532                break;
 533        }
 534out:
 535        if (reason)
 536                *reason = drop_reason;
 537        return err;
 538}
 539EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
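
/*
 * Illustrative sketch (not part of the original file): a protocol receive
 * path queues an skb to the socket and frees it with the reported reason if
 * that fails. The function name is hypothetical.
 */
static int __maybe_unused example_proto_rcv(struct sock *sk, struct sk_buff *skb)
{
	enum skb_drop_reason reason;
	int err;

	err = sock_queue_rcv_skb_reason(sk, skb, &reason);
	if (err < 0) {
		/* rejected by the socket filter, rcvbuf or protocol memory */
		kfree_skb_reason(skb, reason);
		return err;
	}
	return 0;
}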
 540
 541int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 542                     const int nested, unsigned int trim_cap, bool refcounted)
 543{
 544        int rc = NET_RX_SUCCESS;
 545
 546        if (sk_filter_trim_cap(sk, skb, trim_cap))
 547                goto discard_and_relse;
 548
 549        skb->dev = NULL;
 550
 551        if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 552                atomic_inc(&sk->sk_drops);
 553                goto discard_and_relse;
 554        }
 555        if (nested)
 556                bh_lock_sock_nested(sk);
 557        else
 558                bh_lock_sock(sk);
 559        if (!sock_owned_by_user(sk)) {
 560                /*
 561                 * trylock + unlock semantics:
 562                 */
 563                mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 564
 565                rc = sk_backlog_rcv(sk, skb);
 566
 567                mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
 568        } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
 569                bh_unlock_sock(sk);
 570                atomic_inc(&sk->sk_drops);
 571                goto discard_and_relse;
 572        }
 573
 574        bh_unlock_sock(sk);
 575out:
 576        if (refcounted)
 577                sock_put(sk);
 578        return rc;
 579discard_and_relse:
 580        kfree_skb(skb);
 581        goto out;
 582}
 583EXPORT_SYMBOL(__sk_receive_skb);
 584
 585INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
 586                                                          u32));
 587INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
 588                                                           u32));
 589struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 590{
 591        struct dst_entry *dst = __sk_dst_get(sk);
 592
 593        if (dst && dst->obsolete &&
 594            INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 595                               dst, cookie) == NULL) {
 596                sk_tx_queue_clear(sk);
 597                sk->sk_dst_pending_confirm = 0;
 598                RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 599                dst_release(dst);
 600                return NULL;
 601        }
 602
 603        return dst;
 604}
 605EXPORT_SYMBOL(__sk_dst_check);
 606
 607struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 608{
 609        struct dst_entry *dst = sk_dst_get(sk);
 610
 611        if (dst && dst->obsolete &&
 612            INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 613                               dst, cookie) == NULL) {
 614                sk_dst_reset(sk);
 615                dst_release(dst);
 616                return NULL;
 617        }
 618
 619        return dst;
 620}
 621EXPORT_SYMBOL(sk_dst_check);
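
/*
 * Illustrative sketch (not part of the original file): an output path can
 * validate the cached route before each transmission and re-route when the
 * cache has gone stale. The function name is hypothetical.
 */
static __maybe_unused struct dst_entry *example_route_for_output(struct sock *sk)
{
	struct dst_entry *dst = sk_dst_check(sk, 0);

	if (!dst) {
		/* The cached dst was obsolete and has been dropped; a real
		 * caller would perform a fresh route lookup here and attach
		 * the result with sk_dst_set().
		 */
	}
	return dst;
}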
 622
 623static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
 624{
 625        int ret = -ENOPROTOOPT;
 626#ifdef CONFIG_NETDEVICES
 627        struct net *net = sock_net(sk);
 628
 629        /* Sorry... */
 630        ret = -EPERM;
 631        if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
 632                goto out;
 633
 634        ret = -EINVAL;
 635        if (ifindex < 0)
 636                goto out;
 637
 638        /* Paired with all READ_ONCE() done locklessly. */
 639        WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
 640
 641        if (sk->sk_prot->rehash)
 642                sk->sk_prot->rehash(sk);
 643        sk_dst_reset(sk);
 644
 645        ret = 0;
 646
 647out:
 648#endif
 649
 650        return ret;
 651}
 652
 653int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
 654{
 655        int ret;
 656
 657        if (lock_sk)
 658                lock_sock(sk);
 659        ret = sock_bindtoindex_locked(sk, ifindex);
 660        if (lock_sk)
 661                release_sock(sk);
 662
 663        return ret;
 664}
 665EXPORT_SYMBOL(sock_bindtoindex);
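
/*
 * Illustrative sketch (not part of the original file): in-kernel users such
 * as tunnel or RPC code can pin a socket to one interface by index. The
 * function name is hypothetical.
 */
static int __maybe_unused example_pin_to_device(struct sock *sk,
						const struct net_device *dev)
{
	/* takes and releases the socket lock internally */
	return sock_bindtoindex(sk, dev->ifindex, true);
}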
 666
 667static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
 668{
 669        int ret = -ENOPROTOOPT;
 670#ifdef CONFIG_NETDEVICES
 671        struct net *net = sock_net(sk);
 672        char devname[IFNAMSIZ];
 673        int index;
 674
 675        ret = -EINVAL;
 676        if (optlen < 0)
 677                goto out;
 678
 679        /* Bind this socket to a particular device like "eth0",
 680         * as specified in the passed interface name. If the
 681         * name is "" or the option length is zero the socket
 682         * is not bound.
 683         */
 684        if (optlen > IFNAMSIZ - 1)
 685                optlen = IFNAMSIZ - 1;
 686        memset(devname, 0, sizeof(devname));
 687
 688        ret = -EFAULT;
 689        if (copy_from_sockptr(devname, optval, optlen))
 690                goto out;
 691
 692        index = 0;
 693        if (devname[0] != '\0') {
 694                struct net_device *dev;
 695
 696                rcu_read_lock();
 697                dev = dev_get_by_name_rcu(net, devname);
 698                if (dev)
 699                        index = dev->ifindex;
 700                rcu_read_unlock();
 701                ret = -ENODEV;
 702                if (!dev)
 703                        goto out;
 704        }
 705
 706        return sock_bindtoindex(sk, index, true);
 707out:
 708#endif
 709
 710        return ret;
 711}
 712
 713static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 714                                int __user *optlen, int len)
 715{
 716        int ret = -ENOPROTOOPT;
 717#ifdef CONFIG_NETDEVICES
 718        int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
 719        struct net *net = sock_net(sk);
 720        char devname[IFNAMSIZ];
 721
 722        if (bound_dev_if == 0) {
 723                len = 0;
 724                goto zero;
 725        }
 726
 727        ret = -EINVAL;
 728        if (len < IFNAMSIZ)
 729                goto out;
 730
 731        ret = netdev_get_name(net, devname, bound_dev_if);
 732        if (ret)
 733                goto out;
 734
 735        len = strlen(devname) + 1;
 736
 737        ret = -EFAULT;
 738        if (copy_to_user(optval, devname, len))
 739                goto out;
 740
 741zero:
 742        ret = -EFAULT;
 743        if (put_user(len, optlen))
 744                goto out;
 745
 746        ret = 0;
 747
 748out:
 749#endif
 750
 751        return ret;
 752}
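
/*
 * Usage example (illustrative, not part of the original file): from user
 * space the pair above is driven by
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0"));
 *
 * and read back with getsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, name, &len);
 * passing an empty name (or zero length) removes the binding again.
 */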
 753
 754bool sk_mc_loop(struct sock *sk)
 755{
 756        if (dev_recursion_level())
 757                return false;
 758        if (!sk)
 759                return true;
 760        switch (sk->sk_family) {
 761        case AF_INET:
 762                return inet_sk(sk)->mc_loop;
 763#if IS_ENABLED(CONFIG_IPV6)
 764        case AF_INET6:
 765                return inet6_sk(sk)->mc_loop;
 766#endif
 767        }
 768        WARN_ON_ONCE(1);
 769        return true;
 770}
 771EXPORT_SYMBOL(sk_mc_loop);
 772
 773void sock_set_reuseaddr(struct sock *sk)
 774{
 775        lock_sock(sk);
 776        sk->sk_reuse = SK_CAN_REUSE;
 777        release_sock(sk);
 778}
 779EXPORT_SYMBOL(sock_set_reuseaddr);
 780
 781void sock_set_reuseport(struct sock *sk)
 782{
 783        lock_sock(sk);
 784        sk->sk_reuseport = true;
 785        release_sock(sk);
 786}
 787EXPORT_SYMBOL(sock_set_reuseport);
 788
 789void sock_no_linger(struct sock *sk)
 790{
 791        lock_sock(sk);
 792        sk->sk_lingertime = 0;
 793        sock_set_flag(sk, SOCK_LINGER);
 794        release_sock(sk);
 795}
 796EXPORT_SYMBOL(sock_no_linger);
 797
 798void sock_set_priority(struct sock *sk, u32 priority)
 799{
 800        lock_sock(sk);
 801        sk->sk_priority = priority;
 802        release_sock(sk);
 803}
 804EXPORT_SYMBOL(sock_set_priority);
 805
 806void sock_set_sndtimeo(struct sock *sk, s64 secs)
 807{
 808        lock_sock(sk);
 809        if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
 810                sk->sk_sndtimeo = secs * HZ;
 811        else
 812                sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
 813        release_sock(sk);
 814}
 815EXPORT_SYMBOL(sock_set_sndtimeo);
 816
 817static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
 818{
 819        if (val)  {
 820                sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
 821                sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
 822                sock_set_flag(sk, SOCK_RCVTSTAMP);
 823                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 824        } else {
 825                sock_reset_flag(sk, SOCK_RCVTSTAMP);
 826                sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 827        }
 828}
 829
 830void sock_enable_timestamps(struct sock *sk)
 831{
 832        lock_sock(sk);
 833        __sock_set_timestamps(sk, true, false, true);
 834        release_sock(sk);
 835}
 836EXPORT_SYMBOL(sock_enable_timestamps);
 837
 838void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
 839{
 840        switch (optname) {
 841        case SO_TIMESTAMP_OLD:
 842                __sock_set_timestamps(sk, valbool, false, false);
 843                break;
 844        case SO_TIMESTAMP_NEW:
 845                __sock_set_timestamps(sk, valbool, true, false);
 846                break;
 847        case SO_TIMESTAMPNS_OLD:
 848                __sock_set_timestamps(sk, valbool, false, true);
 849                break;
 850        case SO_TIMESTAMPNS_NEW:
 851                __sock_set_timestamps(sk, valbool, true, true);
 852                break;
 853        }
 854}
 855
 856static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
 857{
 858        struct net *net = sock_net(sk);
 859        struct net_device *dev = NULL;
 860        bool match = false;
 861        int *vclock_index;
 862        int i, num;
 863
 864        if (sk->sk_bound_dev_if)
 865                dev = dev_get_by_index(net, sk->sk_bound_dev_if);
 866
 867        if (!dev) {
  868                pr_err("%s: socket is not bound to a device\n", __func__);
 869                return -EOPNOTSUPP;
 870        }
 871
 872        num = ethtool_get_phc_vclocks(dev, &vclock_index);
 873        dev_put(dev);
 874
 875        for (i = 0; i < num; i++) {
 876                if (*(vclock_index + i) == phc_index) {
 877                        match = true;
 878                        break;
 879                }
 880        }
 881
 882        if (num > 0)
 883                kfree(vclock_index);
 884
 885        if (!match)
 886                return -EINVAL;
 887
 888        sk->sk_bind_phc = phc_index;
 889
 890        return 0;
 891}
 892
 893int sock_set_timestamping(struct sock *sk, int optname,
 894                          struct so_timestamping timestamping)
 895{
 896        int val = timestamping.flags;
 897        int ret;
 898
 899        if (val & ~SOF_TIMESTAMPING_MASK)
 900                return -EINVAL;
 901
 902        if (val & SOF_TIMESTAMPING_OPT_ID &&
 903            !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 904                if (sk_is_tcp(sk)) {
 905                        if ((1 << sk->sk_state) &
 906                            (TCPF_CLOSE | TCPF_LISTEN))
 907                                return -EINVAL;
 908                        atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
 909                } else {
 910                        atomic_set(&sk->sk_tskey, 0);
 911                }
 912        }
 913
 914        if (val & SOF_TIMESTAMPING_OPT_STATS &&
 915            !(val & SOF_TIMESTAMPING_OPT_TSONLY))
 916                return -EINVAL;
 917
 918        if (val & SOF_TIMESTAMPING_BIND_PHC) {
 919                ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
 920                if (ret)
 921                        return ret;
 922        }
 923
 924        sk->sk_tsflags = val;
 925        sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
 926
 927        if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 928                sock_enable_timestamp(sk,
 929                                      SOCK_TIMESTAMPING_RX_SOFTWARE);
 930        else
 931                sock_disable_timestamp(sk,
 932                                       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 933        return 0;
 934}
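
/*
 * Usage example (illustrative, not part of the original file): user space
 * typically requests software TX timestamps with stable packet IDs via
 *
 *	struct so_timestamping ts = {
 *		.flags = SOF_TIMESTAMPING_TX_SOFTWARE |
 *			 SOF_TIMESTAMPING_SOFTWARE |
 *			 SOF_TIMESTAMPING_OPT_ID,
 *	};
 *	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &ts, sizeof(ts));
 *
 * which ends up in sock_set_timestamping() above; the timestamps are then
 * read back from the socket error queue with recvmsg() and MSG_ERRQUEUE.
 */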
 935
 936void sock_set_keepalive(struct sock *sk)
 937{
 938        lock_sock(sk);
 939        if (sk->sk_prot->keepalive)
 940                sk->sk_prot->keepalive(sk, true);
 941        sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
 942        release_sock(sk);
 943}
 944EXPORT_SYMBOL(sock_set_keepalive);
 945
 946static void __sock_set_rcvbuf(struct sock *sk, int val)
 947{
 948        /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
 949         * as a negative value.
 950         */
 951        val = min_t(int, val, INT_MAX / 2);
 952        sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 953
 954        /* We double it on the way in to account for "struct sk_buff" etc.
 955         * overhead.   Applications assume that the SO_RCVBUF setting they make
 956         * will allow that much actual data to be received on that socket.
 957         *
 958         * Applications are unaware that "struct sk_buff" and other overheads
 959         * allocate from the receive buffer during socket buffer allocation.
 960         *
 961         * And after considering the possible alternatives, returning the value
 962         * we actually used in getsockopt is the most desirable behavior.
 963         */
 964        WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
 965}
 966
 967void sock_set_rcvbuf(struct sock *sk, int val)
 968{
 969        lock_sock(sk);
 970        __sock_set_rcvbuf(sk, val);
 971        release_sock(sk);
 972}
 973EXPORT_SYMBOL(sock_set_rcvbuf);
 974
 975static void __sock_set_mark(struct sock *sk, u32 val)
 976{
 977        if (val != sk->sk_mark) {
 978                sk->sk_mark = val;
 979                sk_dst_reset(sk);
 980        }
 981}
 982
 983void sock_set_mark(struct sock *sk, u32 val)
 984{
 985        lock_sock(sk);
 986        __sock_set_mark(sk, val);
 987        release_sock(sk);
 988}
 989EXPORT_SYMBOL(sock_set_mark);
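
/*
 * Illustrative sketch (not part of the original file): kernel-internal socket
 * users (e.g. RPC transports) configure their sockets with the exported
 * helpers above instead of going through sock_setsockopt(). The function name
 * and the chosen values are hypothetical.
 */
static void __maybe_unused example_tune_kernel_socket(struct socket *sock)
{
	struct sock *sk = sock->sk;

	sock_set_reuseaddr(sk);		/* allow fast rebinding of the port */
	sock_set_keepalive(sk);		/* detect dead peers */
	sock_set_sndtimeo(sk, 30);	/* 30 second send timeout */
	sock_set_rcvbuf(sk, 1 << 20);	/* ask for 1 MiB; stored doubled */
}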
 990
 991static void sock_release_reserved_memory(struct sock *sk, int bytes)
 992{
 993        /* Round down bytes to multiple of pages */
 994        bytes = round_down(bytes, PAGE_SIZE);
 995
 996        WARN_ON(bytes > sk->sk_reserved_mem);
 997        sk->sk_reserved_mem -= bytes;
 998        sk_mem_reclaim(sk);
 999}
1000
1001static int sock_reserve_memory(struct sock *sk, int bytes)
1002{
1003        long allocated;
1004        bool charged;
1005        int pages;
1006
1007        if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1008                return -EOPNOTSUPP;
1009
1010        if (!bytes)
1011                return 0;
1012
1013        pages = sk_mem_pages(bytes);
1014
1015        /* pre-charge to memcg */
1016        charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1017                                          GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1018        if (!charged)
1019                return -ENOMEM;
1020
1021        /* pre-charge to forward_alloc */
1022        sk_memory_allocated_add(sk, pages);
1023        allocated = sk_memory_allocated(sk);
1024        /* If the system goes into memory pressure with this
1025         * precharge, give up and return error.
1026         */
1027        if (allocated > sk_prot_mem_limits(sk, 1)) {
1028                sk_memory_allocated_sub(sk, pages);
1029                mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1030                return -ENOMEM;
1031        }
1032        sk->sk_forward_alloc += pages << PAGE_SHIFT;
1033
1034        sk->sk_reserved_mem += pages << PAGE_SHIFT;
1035
1036        return 0;
1037}
1038
1039/*
1040 *      This is meant for all protocols to use and covers goings on
1041 *      at the socket level. Everything here is generic.
1042 */
1043
1044int sock_setsockopt(struct socket *sock, int level, int optname,
1045                    sockptr_t optval, unsigned int optlen)
1046{
1047        struct so_timestamping timestamping;
1048        struct sock_txtime sk_txtime;
1049        struct sock *sk = sock->sk;
1050        int val;
1051        int valbool;
1052        struct linger ling;
1053        int ret = 0;
1054
1055        /*
1056         *      Options without arguments
1057         */
1058
1059        if (optname == SO_BINDTODEVICE)
1060                return sock_setbindtodevice(sk, optval, optlen);
1061
1062        if (optlen < sizeof(int))
1063                return -EINVAL;
1064
1065        if (copy_from_sockptr(&val, optval, sizeof(val)))
1066                return -EFAULT;
1067
1068        valbool = val ? 1 : 0;
1069
1070        lock_sock(sk);
1071
1072        switch (optname) {
1073        case SO_DEBUG:
1074                if (val && !capable(CAP_NET_ADMIN))
1075                        ret = -EACCES;
1076                else
1077                        sock_valbool_flag(sk, SOCK_DBG, valbool);
1078                break;
1079        case SO_REUSEADDR:
1080                sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1081                break;
1082        case SO_REUSEPORT:
1083                sk->sk_reuseport = valbool;
1084                break;
1085        case SO_TYPE:
1086        case SO_PROTOCOL:
1087        case SO_DOMAIN:
1088        case SO_ERROR:
1089                ret = -ENOPROTOOPT;
1090                break;
1091        case SO_DONTROUTE:
1092                sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1093                sk_dst_reset(sk);
1094                break;
1095        case SO_BROADCAST:
1096                sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1097                break;
1098        case SO_SNDBUF:
 1099                /* Don't error on this; BSD doesn't, and if you think
 1100                 * about it, this is right. Otherwise apps have to
 1101                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
 1102                 * are treated in BSD as hints.
1103                 */
1104                val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1105set_sndbuf:
1106                /* Ensure val * 2 fits into an int, to prevent max_t()
1107                 * from treating it as a negative value.
1108                 */
1109                val = min_t(int, val, INT_MAX / 2);
1110                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1111                WRITE_ONCE(sk->sk_sndbuf,
1112                           max_t(int, val * 2, SOCK_MIN_SNDBUF));
1113                /* Wake up sending tasks if we upped the value. */
1114                sk->sk_write_space(sk);
1115                break;
1116
1117        case SO_SNDBUFFORCE:
1118                if (!capable(CAP_NET_ADMIN)) {
1119                        ret = -EPERM;
1120                        break;
1121                }
1122
1123                /* No negative values (to prevent underflow, as val will be
1124                 * multiplied by 2).
1125                 */
1126                if (val < 0)
1127                        val = 0;
1128                goto set_sndbuf;
1129
1130        case SO_RCVBUF:
 1131                /* Don't error on this; BSD doesn't, and if you think
 1132                 * about it, this is right. Otherwise apps have to
 1133                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
 1134                 * are treated in BSD as hints.
1135                 */
1136                __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1137                break;
1138
1139        case SO_RCVBUFFORCE:
1140                if (!capable(CAP_NET_ADMIN)) {
1141                        ret = -EPERM;
1142                        break;
1143                }
1144
1145                /* No negative values (to prevent underflow, as val will be
1146                 * multiplied by 2).
1147                 */
1148                __sock_set_rcvbuf(sk, max(val, 0));
1149                break;
1150
1151        case SO_KEEPALIVE:
1152                if (sk->sk_prot->keepalive)
1153                        sk->sk_prot->keepalive(sk, valbool);
1154                sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1155                break;
1156
1157        case SO_OOBINLINE:
1158                sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1159                break;
1160
1161        case SO_NO_CHECK:
1162                sk->sk_no_check_tx = valbool;
1163                break;
1164
1165        case SO_PRIORITY:
1166                if ((val >= 0 && val <= 6) ||
1167                    ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1168                    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1169                        sk->sk_priority = val;
1170                else
1171                        ret = -EPERM;
1172                break;
1173
1174        case SO_LINGER:
1175                if (optlen < sizeof(ling)) {
1176                        ret = -EINVAL;  /* 1003.1g */
1177                        break;
1178                }
1179                if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1180                        ret = -EFAULT;
1181                        break;
1182                }
1183                if (!ling.l_onoff)
1184                        sock_reset_flag(sk, SOCK_LINGER);
1185                else {
1186#if (BITS_PER_LONG == 32)
1187                        if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
1188                                sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
1189                        else
1190#endif
1191                                sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
1192                        sock_set_flag(sk, SOCK_LINGER);
1193                }
1194                break;
1195
1196        case SO_BSDCOMPAT:
1197                break;
1198
1199        case SO_PASSCRED:
1200                if (valbool)
1201                        set_bit(SOCK_PASSCRED, &sock->flags);
1202                else
1203                        clear_bit(SOCK_PASSCRED, &sock->flags);
1204                break;
1205
1206        case SO_TIMESTAMP_OLD:
1207        case SO_TIMESTAMP_NEW:
1208        case SO_TIMESTAMPNS_OLD:
1209        case SO_TIMESTAMPNS_NEW:
1210                sock_set_timestamp(sk, optname, valbool);
1211                break;
1212
1213        case SO_TIMESTAMPING_NEW:
1214        case SO_TIMESTAMPING_OLD:
1215                if (optlen == sizeof(timestamping)) {
1216                        if (copy_from_sockptr(&timestamping, optval,
1217                                              sizeof(timestamping))) {
1218                                ret = -EFAULT;
1219                                break;
1220                        }
1221                } else {
1222                        memset(&timestamping, 0, sizeof(timestamping));
1223                        timestamping.flags = val;
1224                }
1225                ret = sock_set_timestamping(sk, optname, timestamping);
1226                break;
1227
1228        case SO_RCVLOWAT:
1229                if (val < 0)
1230                        val = INT_MAX;
1231                if (sock->ops->set_rcvlowat)
1232                        ret = sock->ops->set_rcvlowat(sk, val);
1233                else
1234                        WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1235                break;
1236
1237        case SO_RCVTIMEO_OLD:
1238        case SO_RCVTIMEO_NEW:
1239                ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1240                                       optlen, optname == SO_RCVTIMEO_OLD);
1241                break;
1242
1243        case SO_SNDTIMEO_OLD:
1244        case SO_SNDTIMEO_NEW:
1245                ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1246                                       optlen, optname == SO_SNDTIMEO_OLD);
1247                break;
1248
1249        case SO_ATTACH_FILTER: {
1250                struct sock_fprog fprog;
1251
1252                ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1253                if (!ret)
1254                        ret = sk_attach_filter(&fprog, sk);
1255                break;
1256        }
1257        case SO_ATTACH_BPF:
1258                ret = -EINVAL;
1259                if (optlen == sizeof(u32)) {
1260                        u32 ufd;
1261
1262                        ret = -EFAULT;
1263                        if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1264                                break;
1265
1266                        ret = sk_attach_bpf(ufd, sk);
1267                }
1268                break;
1269
1270        case SO_ATTACH_REUSEPORT_CBPF: {
1271                struct sock_fprog fprog;
1272
1273                ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1274                if (!ret)
1275                        ret = sk_reuseport_attach_filter(&fprog, sk);
1276                break;
1277        }
1278        case SO_ATTACH_REUSEPORT_EBPF:
1279                ret = -EINVAL;
1280                if (optlen == sizeof(u32)) {
1281                        u32 ufd;
1282
1283                        ret = -EFAULT;
1284                        if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1285                                break;
1286
1287                        ret = sk_reuseport_attach_bpf(ufd, sk);
1288                }
1289                break;
1290
1291        case SO_DETACH_REUSEPORT_BPF:
1292                ret = reuseport_detach_prog(sk);
1293                break;
1294
1295        case SO_DETACH_FILTER:
1296                ret = sk_detach_filter(sk);
1297                break;
1298
1299        case SO_LOCK_FILTER:
1300                if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1301                        ret = -EPERM;
1302                else
1303                        sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1304                break;
1305
1306        case SO_PASSSEC:
1307                if (valbool)
1308                        set_bit(SOCK_PASSSEC, &sock->flags);
1309                else
1310                        clear_bit(SOCK_PASSSEC, &sock->flags);
1311                break;
1312        case SO_MARK:
1313                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1314                    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1315                        ret = -EPERM;
1316                        break;
1317                }
1318
1319                __sock_set_mark(sk, val);
1320                break;
1321        case SO_RCVMARK:
1322                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1323                    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1324                        ret = -EPERM;
1325                        break;
1326                }
1327
1328                sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1329                break;
1330
1331        case SO_RXQ_OVFL:
1332                sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1333                break;
1334
1335        case SO_WIFI_STATUS:
1336                sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1337                break;
1338
1339        case SO_PEEK_OFF:
1340                if (sock->ops->set_peek_off)
1341                        ret = sock->ops->set_peek_off(sk, val);
1342                else
1343                        ret = -EOPNOTSUPP;
1344                break;
1345
1346        case SO_NOFCS:
1347                sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1348                break;
1349
1350        case SO_SELECT_ERR_QUEUE:
1351                sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1352                break;
1353
1354#ifdef CONFIG_NET_RX_BUSY_POLL
1355        case SO_BUSY_POLL:
1356                /* allow unprivileged users to decrease the value */
1357                if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1358                        ret = -EPERM;
1359                else {
1360                        if (val < 0)
1361                                ret = -EINVAL;
1362                        else
1363                                WRITE_ONCE(sk->sk_ll_usec, val);
1364                }
1365                break;
1366        case SO_PREFER_BUSY_POLL:
1367                if (valbool && !capable(CAP_NET_ADMIN))
1368                        ret = -EPERM;
1369                else
1370                        WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1371                break;
1372        case SO_BUSY_POLL_BUDGET:
1373                if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
1374                        ret = -EPERM;
1375                } else {
1376                        if (val < 0 || val > U16_MAX)
1377                                ret = -EINVAL;
1378                        else
1379                                WRITE_ONCE(sk->sk_busy_poll_budget, val);
1380                }
1381                break;
1382#endif
1383
1384        case SO_MAX_PACING_RATE:
1385                {
1386                unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1387
1388                if (sizeof(ulval) != sizeof(val) &&
1389                    optlen >= sizeof(ulval) &&
1390                    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1391                        ret = -EFAULT;
1392                        break;
1393                }
1394                if (ulval != ~0UL)
1395                        cmpxchg(&sk->sk_pacing_status,
1396                                SK_PACING_NONE,
1397                                SK_PACING_NEEDED);
1398                sk->sk_max_pacing_rate = ulval;
1399                sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1400                break;
1401                }
1402        case SO_INCOMING_CPU:
1403                WRITE_ONCE(sk->sk_incoming_cpu, val);
1404                break;
1405
1406        case SO_CNX_ADVICE:
1407                if (val == 1)
1408                        dst_negative_advice(sk);
1409                break;
1410
1411        case SO_ZEROCOPY:
1412                if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1413                        if (!(sk_is_tcp(sk) ||
1414                              (sk->sk_type == SOCK_DGRAM &&
1415                               sk->sk_protocol == IPPROTO_UDP)))
1416                                ret = -EOPNOTSUPP;
1417                } else if (sk->sk_family != PF_RDS) {
1418                        ret = -EOPNOTSUPP;
1419                }
1420                if (!ret) {
1421                        if (val < 0 || val > 1)
1422                                ret = -EINVAL;
1423                        else
1424                                sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1425                }
1426                break;
1427
1428        case SO_TXTIME:
1429                if (optlen != sizeof(struct sock_txtime)) {
1430                        ret = -EINVAL;
1431                        break;
1432                } else if (copy_from_sockptr(&sk_txtime, optval,
1433                           sizeof(struct sock_txtime))) {
1434                        ret = -EFAULT;
1435                        break;
1436                } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1437                        ret = -EINVAL;
1438                        break;
1439                }
1440                /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
 1441                 * scheduler has enough safeguards.
1442                 */
1443                if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1444                    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1445                        ret = -EPERM;
1446                        break;
1447                }
1448                sock_valbool_flag(sk, SOCK_TXTIME, true);
1449                sk->sk_clockid = sk_txtime.clockid;
1450                sk->sk_txtime_deadline_mode =
1451                        !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1452                sk->sk_txtime_report_errors =
1453                        !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1454                break;
1455
1456        case SO_BINDTOIFINDEX:
1457                ret = sock_bindtoindex_locked(sk, val);
1458                break;
1459
1460        case SO_BUF_LOCK:
1461                if (val & ~SOCK_BUF_LOCK_MASK) {
1462                        ret = -EINVAL;
1463                        break;
1464                }
1465                sk->sk_userlocks = val | (sk->sk_userlocks &
1466                                          ~SOCK_BUF_LOCK_MASK);
1467                break;
1468
1469        case SO_RESERVE_MEM:
1470        {
1471                int delta;
1472
1473                if (val < 0) {
1474                        ret = -EINVAL;
1475                        break;
1476                }
1477
1478                delta = val - sk->sk_reserved_mem;
1479                if (delta < 0)
1480                        sock_release_reserved_memory(sk, -delta);
1481                else
1482                        ret = sock_reserve_memory(sk, delta);
1483                break;
1484        }
1485
1486        case SO_TXREHASH:
1487                if (val < -1 || val > 1) {
1488                        ret = -EINVAL;
1489                        break;
1490                }
1491                /* Paired with READ_ONCE() in tcp_rtx_synack() */
1492                WRITE_ONCE(sk->sk_txrehash, (u8)val);
1493                break;
1494
1495        default:
1496                ret = -ENOPROTOOPT;
1497                break;
1498        }
1499        release_sock(sk);
1500        return ret;
1501}
1502EXPORT_SYMBOL(sock_setsockopt);
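
/*
 * Illustrative sketch (not part of the original file): kernel callers can
 * reuse sock_setsockopt() with a kernel-space sockptr instead of a __user
 * pointer. The function name is hypothetical, and the caller needs
 * CAP_NET_ADMIN (or CAP_NET_RAW) in the socket's netns for SO_MARK.
 */
static int __maybe_unused example_set_mark(struct socket *sock, u32 mark)
{
	sockptr_t optval = KERNEL_SOCKPTR(&mark);

	return sock_setsockopt(sock, SOL_SOCKET, SO_MARK, optval, sizeof(mark));
}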
1503
1504static const struct cred *sk_get_peer_cred(struct sock *sk)
1505{
1506        const struct cred *cred;
1507
1508        spin_lock(&sk->sk_peer_lock);
1509        cred = get_cred(sk->sk_peer_cred);
1510        spin_unlock(&sk->sk_peer_lock);
1511
1512        return cred;
1513}
1514
1515static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1516                          struct ucred *ucred)
1517{
1518        ucred->pid = pid_vnr(pid);
1519        ucred->uid = ucred->gid = -1;
1520        if (cred) {
1521                struct user_namespace *current_ns = current_user_ns();
1522
1523                ucred->uid = from_kuid_munged(current_ns, cred->euid);
1524                ucred->gid = from_kgid_munged(current_ns, cred->egid);
1525        }
1526}
1527
1528static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1529{
1530        struct user_namespace *user_ns = current_user_ns();
1531        int i;
1532
1533        for (i = 0; i < src->ngroups; i++)
1534                if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1535                        return -EFAULT;
1536
1537        return 0;
1538}
1539
1540int sock_getsockopt(struct socket *sock, int level, int optname,
1541                    char __user *optval, int __user *optlen)
1542{
1543        struct sock *sk = sock->sk;
1544
1545        union {
1546                int val;
1547                u64 val64;
1548                unsigned long ulval;
1549                struct linger ling;
1550                struct old_timeval32 tm32;
1551                struct __kernel_old_timeval tm;
1552                struct  __kernel_sock_timeval stm;
1553                struct sock_txtime txtime;
1554                struct so_timestamping timestamping;
1555        } v;
1556
1557        int lv = sizeof(int);
1558        int len;
1559
1560        if (get_user(len, optlen))
1561                return -EFAULT;
1562        if (len < 0)
1563                return -EINVAL;
1564
1565        memset(&v, 0, sizeof(v));
1566
1567        switch (optname) {
1568        case SO_DEBUG:
1569                v.val = sock_flag(sk, SOCK_DBG);
1570                break;
1571
1572        case SO_DONTROUTE:
1573                v.val = sock_flag(sk, SOCK_LOCALROUTE);
1574                break;
1575
1576        case SO_BROADCAST:
1577                v.val = sock_flag(sk, SOCK_BROADCAST);
1578                break;
1579
1580        case SO_SNDBUF:
1581                v.val = sk->sk_sndbuf;
1582                break;
1583
1584        case SO_RCVBUF:
1585                v.val = sk->sk_rcvbuf;
1586                break;
1587
1588        case SO_REUSEADDR:
1589                v.val = sk->sk_reuse;
1590                break;
1591
1592        case SO_REUSEPORT:
1593                v.val = sk->sk_reuseport;
1594                break;
1595
1596        case SO_KEEPALIVE:
1597                v.val = sock_flag(sk, SOCK_KEEPOPEN);
1598                break;
1599
1600        case SO_TYPE:
1601                v.val = sk->sk_type;
1602                break;
1603
1604        case SO_PROTOCOL:
1605                v.val = sk->sk_protocol;
1606                break;
1607
1608        case SO_DOMAIN:
1609                v.val = sk->sk_family;
1610                break;
1611
1612        case SO_ERROR:
1613                v.val = -sock_error(sk);
1614                if (v.val == 0)
1615                        v.val = xchg(&sk->sk_err_soft, 0);
1616                break;
1617
1618        case SO_OOBINLINE:
1619                v.val = sock_flag(sk, SOCK_URGINLINE);
1620                break;
1621
1622        case SO_NO_CHECK:
1623                v.val = sk->sk_no_check_tx;
1624                break;
1625
1626        case SO_PRIORITY:
1627                v.val = sk->sk_priority;
1628                break;
1629
1630        case SO_LINGER:
1631                lv              = sizeof(v.ling);
1632                v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1633                v.ling.l_linger = sk->sk_lingertime / HZ;
1634                break;
1635
1636        case SO_BSDCOMPAT:
1637                break;
1638
1639        case SO_TIMESTAMP_OLD:
1640                v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1641                                !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1642                                !sock_flag(sk, SOCK_RCVTSTAMPNS);
1643                break;
1644
1645        case SO_TIMESTAMPNS_OLD:
1646                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1647                break;
1648
1649        case SO_TIMESTAMP_NEW:
1650                v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1651                break;
1652
1653        case SO_TIMESTAMPNS_NEW:
1654                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1655                break;
1656
1657        case SO_TIMESTAMPING_OLD:
1658                lv = sizeof(v.timestamping);
1659                v.timestamping.flags = sk->sk_tsflags;
1660                v.timestamping.bind_phc = sk->sk_bind_phc;
1661                break;
1662
1663        case SO_RCVTIMEO_OLD:
1664        case SO_RCVTIMEO_NEW:
1665                lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1666                break;
1667
1668        case SO_SNDTIMEO_OLD:
1669        case SO_SNDTIMEO_NEW:
1670                lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1671                break;
1672
1673        case SO_RCVLOWAT:
1674                v.val = sk->sk_rcvlowat;
1675                break;
1676
1677        case SO_SNDLOWAT:
1678                v.val = 1;
1679                break;
1680
1681        case SO_PASSCRED:
1682                v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1683                break;
1684
1685        case SO_PEERCRED:
1686        {
1687                struct ucred peercred;
1688                if (len > sizeof(peercred))
1689                        len = sizeof(peercred);
1690
1691                spin_lock(&sk->sk_peer_lock);
1692                cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1693                spin_unlock(&sk->sk_peer_lock);
1694
1695                if (copy_to_user(optval, &peercred, len))
1696                        return -EFAULT;
1697                goto lenout;
1698        }
1699
1700        case SO_PEERGROUPS:
1701        {
1702                const struct cred *cred;
1703                int ret, n;
1704
1705                cred = sk_get_peer_cred(sk);
1706                if (!cred)
1707                        return -ENODATA;
1708
1709                n = cred->group_info->ngroups;
1710                if (len < n * sizeof(gid_t)) {
1711                        len = n * sizeof(gid_t);
1712                        put_cred(cred);
1713                        return put_user(len, optlen) ? -EFAULT : -ERANGE;
1714                }
1715                len = n * sizeof(gid_t);
1716
1717                ret = groups_to_user((gid_t __user *)optval, cred->group_info);
1718                put_cred(cred);
1719                if (ret)
1720                        return ret;
1721                goto lenout;
1722        }
1723
1724        case SO_PEERNAME:
1725        {
1726                char address[128];
1727
1728                lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1729                if (lv < 0)
1730                        return -ENOTCONN;
1731                if (lv < len)
1732                        return -EINVAL;
1733                if (copy_to_user(optval, address, len))
1734                        return -EFAULT;
1735                goto lenout;
1736        }
1737
1738        /* Dubious BSD thing... Probably nobody even uses it, but
1739         * the UNIX standard wants it for whatever reason... -DaveM
1740         */
1741        case SO_ACCEPTCONN:
1742                v.val = sk->sk_state == TCP_LISTEN;
1743                break;
1744
1745        case SO_PASSSEC:
1746                v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1747                break;
1748
1749        case SO_PEERSEC:
1750                return security_socket_getpeersec_stream(sock, optval, optlen, len);
1751
1752        case SO_MARK:
1753                v.val = sk->sk_mark;
1754                break;
1755
1756        case SO_RCVMARK:
1757                v.val = sock_flag(sk, SOCK_RCVMARK);
1758                break;
1759
1760        case SO_RXQ_OVFL:
1761                v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1762                break;
1763
1764        case SO_WIFI_STATUS:
1765                v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1766                break;
1767
1768        case SO_PEEK_OFF:
1769                if (!sock->ops->set_peek_off)
1770                        return -EOPNOTSUPP;
1771
1772                v.val = sk->sk_peek_off;
1773                break;
1774        case SO_NOFCS:
1775                v.val = sock_flag(sk, SOCK_NOFCS);
1776                break;
1777
1778        case SO_BINDTODEVICE:
1779                return sock_getbindtodevice(sk, optval, optlen, len);
1780
1781        case SO_GET_FILTER:
1782                len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1783                if (len < 0)
1784                        return len;
1785
1786                goto lenout;
1787
1788        case SO_LOCK_FILTER:
1789                v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1790                break;
1791
1792        case SO_BPF_EXTENSIONS:
1793                v.val = bpf_tell_extensions();
1794                break;
1795
1796        case SO_SELECT_ERR_QUEUE:
1797                v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1798                break;
1799
1800#ifdef CONFIG_NET_RX_BUSY_POLL
1801        case SO_BUSY_POLL:
1802                v.val = sk->sk_ll_usec;
1803                break;
1804        case SO_PREFER_BUSY_POLL:
1805                v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1806                break;
1807#endif
1808
1809        case SO_MAX_PACING_RATE:
1810                if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1811                        lv = sizeof(v.ulval);
1812                        v.ulval = sk->sk_max_pacing_rate;
1813                } else {
1814                        /* 32bit version */
1815                        v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1816                }
1817                break;
1818
1819        case SO_INCOMING_CPU:
1820                v.val = READ_ONCE(sk->sk_incoming_cpu);
1821                break;
1822
1823        case SO_MEMINFO:
1824        {
1825                u32 meminfo[SK_MEMINFO_VARS];
1826
1827                sk_get_meminfo(sk, meminfo);
1828
1829                len = min_t(unsigned int, len, sizeof(meminfo));
1830                if (copy_to_user(optval, &meminfo, len))
1831                        return -EFAULT;
1832
1833                goto lenout;
1834        }
1835
1836#ifdef CONFIG_NET_RX_BUSY_POLL
1837        case SO_INCOMING_NAPI_ID:
1838                v.val = READ_ONCE(sk->sk_napi_id);
1839
1840                /* aggregate non-NAPI IDs down to 0 */
1841                if (v.val < MIN_NAPI_ID)
1842                        v.val = 0;
1843
1844                break;
1845#endif
1846
1847        case SO_COOKIE:
1848                lv = sizeof(u64);
1849                if (len < lv)
1850                        return -EINVAL;
1851                v.val64 = sock_gen_cookie(sk);
1852                break;
1853
1854        case SO_ZEROCOPY:
1855                v.val = sock_flag(sk, SOCK_ZEROCOPY);
1856                break;
1857
1858        case SO_TXTIME:
1859                lv = sizeof(v.txtime);
1860                v.txtime.clockid = sk->sk_clockid;
1861                v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1862                                  SOF_TXTIME_DEADLINE_MODE : 0;
1863                v.txtime.flags |= sk->sk_txtime_report_errors ?
1864                                  SOF_TXTIME_REPORT_ERRORS : 0;
1865                break;
1866
1867        case SO_BINDTOIFINDEX:
1868                v.val = READ_ONCE(sk->sk_bound_dev_if);
1869                break;
1870
1871        case SO_NETNS_COOKIE:
1872                lv = sizeof(u64);
1873                if (len != lv)
1874                        return -EINVAL;
1875                v.val64 = sock_net(sk)->net_cookie;
1876                break;
1877
1878        case SO_BUF_LOCK:
1879                v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1880                break;
1881
1882        case SO_RESERVE_MEM:
1883                v.val = sk->sk_reserved_mem;
1884                break;
1885
1886        case SO_TXREHASH:
1887                v.val = sk->sk_txrehash;
1888                break;
1889
1890        default:
1891                /* We implement the SO_SNDLOWAT etc to not be settable
1892                 * (1003.1g 7).
1893                 */
1894                return -ENOPROTOOPT;
1895        }
1896
1897        if (len > lv)
1898                len = lv;
1899        if (copy_to_user(optval, &v, len))
1900                return -EFAULT;
1901lenout:
1902        if (put_user(len, optlen))
1903                return -EFAULT;
1904        return 0;
1905}
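
/* Editor's note: an illustrative userspace sketch, not part of the kernel
 * source. The SO_PEERCRED case above copies out a struct ucred; on a
 * connected AF_UNIX socket ("fd", assumed) it can be read like this:
 *
 *	#define _GNU_SOURCE
 *	#include <stdio.h>
 *	#include <sys/socket.h>
 *
 *	struct ucred peer;
 *	socklen_t len = sizeof(peer);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &peer, &len) == 0)
 *		printf("pid=%d uid=%u gid=%u\n", peer.pid, peer.uid, peer.gid);
 */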
1906
1907/*
1908 * Initialize an sk_lock.
1909 *
1910 * (We also register the sk_lock with the lock validator.)
1911 */
1912static inline void sock_lock_init(struct sock *sk)
1913{
1914        if (sk->sk_kern_sock)
1915                sock_lock_init_class_and_name(
1916                        sk,
1917                        af_family_kern_slock_key_strings[sk->sk_family],
1918                        af_family_kern_slock_keys + sk->sk_family,
1919                        af_family_kern_key_strings[sk->sk_family],
1920                        af_family_kern_keys + sk->sk_family);
1921        else
1922                sock_lock_init_class_and_name(
1923                        sk,
1924                        af_family_slock_key_strings[sk->sk_family],
1925                        af_family_slock_keys + sk->sk_family,
1926                        af_family_key_strings[sk->sk_family],
1927                        af_family_keys + sk->sk_family);
1928}
1929
1930/*
1931 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1932 * even temporarily, because of RCU lookups. sk_node should also be left as is.
1933 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1934 */
1935static void sock_copy(struct sock *nsk, const struct sock *osk)
1936{
1937        const struct proto *prot = READ_ONCE(osk->sk_prot);
1938#ifdef CONFIG_SECURITY_NETWORK
1939        void *sptr = nsk->sk_security;
1940#endif
1941
1942        /* If we move sk_tx_queue_mapping out of the private section,
1943         * we must check if sk_tx_queue_clear() is called after
1944         * sock_copy() in sk_clone_lock().
1945         */
1946        BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
1947                     offsetof(struct sock, sk_dontcopy_begin) ||
1948                     offsetof(struct sock, sk_tx_queue_mapping) >=
1949                     offsetof(struct sock, sk_dontcopy_end));
1950
1951        memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1952
1953        memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1954               prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1955
1956#ifdef CONFIG_SECURITY_NETWORK
1957        nsk->sk_security = sptr;
1958        security_sk_clone(osk, nsk);
1959#endif
1960}
1961
1962static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1963                int family)
1964{
1965        struct sock *sk;
1966        struct kmem_cache *slab;
1967
1968        slab = prot->slab;
1969        if (slab != NULL) {
1970                sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1971                if (!sk)
1972                        return sk;
1973                if (want_init_on_alloc(priority))
1974                        sk_prot_clear_nulls(sk, prot->obj_size);
1975        } else
1976                sk = kmalloc(prot->obj_size, priority);
1977
1978        if (sk != NULL) {
1979                if (security_sk_alloc(sk, family, priority))
1980                        goto out_free;
1981
1982                if (!try_module_get(prot->owner))
1983                        goto out_free_sec;
1984        }
1985
1986        return sk;
1987
1988out_free_sec:
1989        security_sk_free(sk);
1990out_free:
1991        if (slab != NULL)
1992                kmem_cache_free(slab, sk);
1993        else
1994                kfree(sk);
1995        return NULL;
1996}
1997
1998static void sk_prot_free(struct proto *prot, struct sock *sk)
1999{
2000        struct kmem_cache *slab;
2001        struct module *owner;
2002
2003        owner = prot->owner;
2004        slab = prot->slab;
2005
2006        cgroup_sk_free(&sk->sk_cgrp_data);
2007        mem_cgroup_sk_free(sk);
2008        security_sk_free(sk);
2009        if (slab != NULL)
2010                kmem_cache_free(slab, sk);
2011        else
2012                kfree(sk);
2013        module_put(owner);
2014}
2015
2016/**
2017 *      sk_alloc - All socket objects are allocated here
2018 *      @net: the applicable net namespace
2019 *      @family: protocol family
2020 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2021 *      @prot: struct proto associated with this new sock instance
2022 *      @kern: is this to be a kernel socket?
2023 */
2024struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2025                      struct proto *prot, int kern)
2026{
2027        struct sock *sk;
2028
2029        sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2030        if (sk) {
2031                sk->sk_family = family;
2032                /*
2033                 * See comment in struct sock definition to understand
2034                 * why we need sk_prot_creator -acme
2035                 */
2036                sk->sk_prot = sk->sk_prot_creator = prot;
2037                sk->sk_kern_sock = kern;
2038                sock_lock_init(sk);
2039                sk->sk_net_refcnt = kern ? 0 : 1;
2040                if (likely(sk->sk_net_refcnt)) {
2041                        get_net_track(net, &sk->ns_tracker, priority);
2042                        sock_inuse_add(net, 1);
2043                }
2044
2045                sock_net_set(sk, net);
2046                refcount_set(&sk->sk_wmem_alloc, 1);
2047
2048                mem_cgroup_sk_alloc(sk);
2049                cgroup_sk_alloc(&sk->sk_cgrp_data);
2050                sock_update_classid(&sk->sk_cgrp_data);
2051                sock_update_netprioidx(&sk->sk_cgrp_data);
2052                sk_tx_queue_clear(sk);
2053        }
2054
2055        return sk;
2056}
2057EXPORT_SYMBOL(sk_alloc);
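
/* Editor's note: an illustrative sketch, not part of the kernel source, of
 * the common pattern in a protocol family's ->create() handler; the "foo"
 * names are hypothetical:
 *
 *	static int foo_create(struct net *net, struct socket *sock,
 *			      int protocol, int kern)
 *	{
 *		struct sock *sk;
 *
 *		sk = sk_alloc(net, PF_FOO, GFP_KERNEL, &foo_proto, kern);
 *		if (!sk)
 *			return -ENOBUFS;
 *		sock_init_data(sock, sk);	// generic init, defined elsewhere in this file
 *		sk->sk_protocol = protocol;
 *		return 0;
 *	}
 */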
2058
2059/* Sockets having SOCK_RCU_FREE will call this function after one RCU
2060 * grace period. This is the case for UDP sockets and TCP listeners.
2061 */
2062static void __sk_destruct(struct rcu_head *head)
2063{
2064        struct sock *sk = container_of(head, struct sock, sk_rcu);
2065        struct sk_filter *filter;
2066
2067        if (sk->sk_destruct)
2068                sk->sk_destruct(sk);
2069
2070        filter = rcu_dereference_check(sk->sk_filter,
2071                                       refcount_read(&sk->sk_wmem_alloc) == 0);
2072        if (filter) {
2073                sk_filter_uncharge(sk, filter);
2074                RCU_INIT_POINTER(sk->sk_filter, NULL);
2075        }
2076
2077        sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2078
2079#ifdef CONFIG_BPF_SYSCALL
2080        bpf_sk_storage_free(sk);
2081#endif
2082
2083        if (atomic_read(&sk->sk_omem_alloc))
2084                pr_debug("%s: optmem leakage (%d bytes) detected\n",
2085                         __func__, atomic_read(&sk->sk_omem_alloc));
2086
2087        if (sk->sk_frag.page) {
2088                put_page(sk->sk_frag.page);
2089                sk->sk_frag.page = NULL;
2090        }
2091
2092        /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2093        put_cred(sk->sk_peer_cred);
2094        put_pid(sk->sk_peer_pid);
2095
2096        if (likely(sk->sk_net_refcnt))
2097                put_net_track(sock_net(sk), &sk->ns_tracker);
2098        sk_prot_free(sk->sk_prot_creator, sk);
2099}
2100
2101void sk_destruct(struct sock *sk)
2102{
2103        bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2104
2105        if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2106                reuseport_detach_sock(sk);
2107                use_call_rcu = true;
2108        }
2109
2110        if (use_call_rcu)
2111                call_rcu(&sk->sk_rcu, __sk_destruct);
2112        else
2113                __sk_destruct(&sk->sk_rcu);
2114}
2115
2116static void __sk_free(struct sock *sk)
2117{
2118        if (likely(sk->sk_net_refcnt))
2119                sock_inuse_add(sock_net(sk), -1);
2120
2121        if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2122                sock_diag_broadcast_destroy(sk);
2123        else
2124                sk_destruct(sk);
2125}
2126
2127void sk_free(struct sock *sk)
2128{
2129        /*
2130         * We subtract one from sk_wmem_alloc; if the result is not zero,
2131         * some packets are still in a tx queue and sock_wfree() will
2132         * call __sk_free(sk) later.
2133         */
2134        if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2135                __sk_free(sk);
2136}
2137EXPORT_SYMBOL(sk_free);
2138
2139static void sk_init_common(struct sock *sk)
2140{
2141        skb_queue_head_init(&sk->sk_receive_queue);
2142        skb_queue_head_init(&sk->sk_write_queue);
2143        skb_queue_head_init(&sk->sk_error_queue);
2144
2145        rwlock_init(&sk->sk_callback_lock);
2146        lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2147                        af_rlock_keys + sk->sk_family,
2148                        af_family_rlock_key_strings[sk->sk_family]);
2149        lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2150                        af_wlock_keys + sk->sk_family,
2151                        af_family_wlock_key_strings[sk->sk_family]);
2152        lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2153                        af_elock_keys + sk->sk_family,
2154                        af_family_elock_key_strings[sk->sk_family]);
2155        lockdep_set_class_and_name(&sk->sk_callback_lock,
2156                        af_callback_keys + sk->sk_family,
2157                        af_family_clock_key_strings[sk->sk_family]);
2158}
2159
2160/**
2161 *      sk_clone_lock - clone a socket, and lock its clone
2162 *      @sk: the socket to clone
2163 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2164 *
2165 *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2166 */
2167struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2168{
2169        struct proto *prot = READ_ONCE(sk->sk_prot);
2170        struct sk_filter *filter;
2171        bool is_charged = true;
2172        struct sock *newsk;
2173
2174        newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2175        if (!newsk)
2176                goto out;
2177
2178        sock_copy(newsk, sk);
2179
2180        newsk->sk_prot_creator = prot;
2181
2182        /* SANITY */
2183        if (likely(newsk->sk_net_refcnt)) {
2184                get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2185                sock_inuse_add(sock_net(newsk), 1);
2186        }
2187        sk_node_init(&newsk->sk_node);
2188        sock_lock_init(newsk);
2189        bh_lock_sock(newsk);
2190        newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
2191        newsk->sk_backlog.len = 0;
2192
2193        atomic_set(&newsk->sk_rmem_alloc, 0);
2194
2195        /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2196        refcount_set(&newsk->sk_wmem_alloc, 1);
2197
2198        atomic_set(&newsk->sk_omem_alloc, 0);
2199        sk_init_common(newsk);
2200
2201        newsk->sk_dst_cache     = NULL;
2202        newsk->sk_dst_pending_confirm = 0;
2203        newsk->sk_wmem_queued   = 0;
2204        newsk->sk_forward_alloc = 0;
2205        newsk->sk_reserved_mem  = 0;
2206        atomic_set(&newsk->sk_drops, 0);
2207        newsk->sk_send_head     = NULL;
2208        newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2209        atomic_set(&newsk->sk_zckey, 0);
2210
2211        sock_reset_flag(newsk, SOCK_DONE);
2212
2213        /* sk->sk_memcg will be populated at accept() time */
2214        newsk->sk_memcg = NULL;
2215
2216        cgroup_sk_clone(&newsk->sk_cgrp_data);
2217
2218        rcu_read_lock();
2219        filter = rcu_dereference(sk->sk_filter);
2220        if (filter != NULL)
2221                /* Though it's an empty new sock, the charging may fail
2222                 * if sysctl_optmem_max was changed between creation of
2223                 * the original socket and cloning.
2224                 */
2225                is_charged = sk_filter_charge(newsk, filter);
2226        RCU_INIT_POINTER(newsk->sk_filter, filter);
2227        rcu_read_unlock();
2228
2229        if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2230                /* We need to make sure that we don't uncharge the new
2231                 * socket if we couldn't charge it in the first place
2232                 * as otherwise we uncharge the parent's filter.
2233                 */
2234                if (!is_charged)
2235                        RCU_INIT_POINTER(newsk->sk_filter, NULL);
2236                sk_free_unlock_clone(newsk);
2237                newsk = NULL;
2238                goto out;
2239        }
2240        RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2241
2242        if (bpf_sk_storage_clone(sk, newsk)) {
2243                sk_free_unlock_clone(newsk);
2244                newsk = NULL;
2245                goto out;
2246        }
2247
2248        /* Clear sk_user_data if parent had the pointer tagged
2249         * as not suitable for copying when cloning.
2250         */
2251        if (sk_user_data_is_nocopy(newsk))
2252                newsk->sk_user_data = NULL;
2253
2254        newsk->sk_err      = 0;
2255        newsk->sk_err_soft = 0;
2256        newsk->sk_priority = 0;
2257        newsk->sk_incoming_cpu = raw_smp_processor_id();
2258
2259        /* Before updating sk_refcnt, we must commit prior changes to memory
2260         * (Documentation/RCU/rculist_nulls.rst for details)
2261         */
2262        smp_wmb();
2263        refcount_set(&newsk->sk_refcnt, 2);
2264
2265        /* Increment the counter in the same struct proto as the master
2266         * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
2267         * is the same as sk->sk_prot->socks, as this field was copied
2268         * with memcpy).
2269         *
2270         * This _changes_ the previous behaviour, where
2271         * tcp_create_openreq_child always was incrementing the
2272                 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
2273         * to be taken into account in all callers. -acme
2274         */
2275        sk_refcnt_debug_inc(newsk);
2276        sk_set_socket(newsk, NULL);
2277        sk_tx_queue_clear(newsk);
2278        RCU_INIT_POINTER(newsk->sk_wq, NULL);
2279
2280        if (newsk->sk_prot->sockets_allocated)
2281                sk_sockets_allocated_inc(newsk);
2282
2283        if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2284                net_enable_timestamp();
2285out:
2286        return newsk;
2287}
2288EXPORT_SYMBOL_GPL(sk_clone_lock);
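
/* Editor's note: an illustrative sketch, not part of the kernel source.
 * As the kernel-doc above states, the clone is returned with its bh lock
 * held, so every caller path must drop it:
 *
 *	struct sock *newsk = sk_clone_lock(sk, GFP_ATOMIC);
 *
 *	if (newsk) {
 *		// ... protocol-specific initialisation of newsk ...
 *		bh_unlock_sock(newsk);
 *	}
 */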
2289
2290void sk_free_unlock_clone(struct sock *sk)
2291{
2292        /* It is still a raw copy of the parent, so invalidate the
2293         * destructor and do a plain sk_free(). */
2294        sk->sk_destruct = NULL;
2295        bh_unlock_sock(sk);
2296        sk_free(sk);
2297}
2298EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2299
2300static void sk_trim_gso_size(struct sock *sk)
2301{
2302        if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE)
2303                return;
2304#if IS_ENABLED(CONFIG_IPV6)
2305        if (sk->sk_family == AF_INET6 &&
2306            sk_is_tcp(sk) &&
2307            !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
2308                return;
2309#endif
2310        sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE;
2311}
2312
2313void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2314{
2315        u32 max_segs = 1;
2316
2317        sk_dst_set(sk, dst);
2318        sk->sk_route_caps = dst->dev->features;
2319        if (sk_is_tcp(sk))
2320                sk->sk_route_caps |= NETIF_F_GSO;
2321        if (sk->sk_route_caps & NETIF_F_GSO)
2322                sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2323        if (unlikely(sk->sk_gso_disabled))
2324                sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2325        if (sk_can_gso(sk)) {
2326                if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2327                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2328                } else {
2329                        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2330                        /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */
2331                        sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size);
2332                        sk_trim_gso_size(sk);
2333                        sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1);
2334                        /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2335                        max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2336                }
2337        }
2338        sk->sk_gso_max_segs = max_segs;
2339}
2340EXPORT_SYMBOL_GPL(sk_setup_caps);
2341
2342/*
2343 *      Simple resource managers for sockets.
2344 */
2345
2346
2347/*
2348 * Write buffer destructor automatically called from kfree_skb.
2349 */
2350void sock_wfree(struct sk_buff *skb)
2351{
2352        struct sock *sk = skb->sk;
2353        unsigned int len = skb->truesize;
2354        bool free;
2355
2356        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2357                if (sock_flag(sk, SOCK_RCU_FREE) &&
2358                    sk->sk_write_space == sock_def_write_space) {
2359                        rcu_read_lock();
2360                        free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2361                        sock_def_write_space_wfree(sk);
2362                        rcu_read_unlock();
2363                        if (unlikely(free))
2364                                __sk_free(sk);
2365                        return;
2366                }
2367
2368                /*
2369                 * Keep a reference on sk_wmem_alloc; it will be released
2370                 * after the sk_write_space() call.
2371                 */
2372                WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2373                sk->sk_write_space(sk);
2374                len = 1;
2375        }
2376        /*
2377         * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2378         * could not do because of in-flight packets
2379         */
2380        if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2381                __sk_free(sk);
2382}
2383EXPORT_SYMBOL(sock_wfree);
2384
2385/* This variant of sock_wfree() is used by TCP,
2386 * since it sets SOCK_USE_WRITE_QUEUE.
2387 */
2388void __sock_wfree(struct sk_buff *skb)
2389{
2390        struct sock *sk = skb->sk;
2391
2392        if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2393                __sk_free(sk);
2394}
2395
2396void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2397{
2398        skb_orphan(skb);
2399        skb->sk = sk;
2400#ifdef CONFIG_INET
2401        if (unlikely(!sk_fullsock(sk))) {
2402                skb->destructor = sock_edemux;
2403                sock_hold(sk);
2404                return;
2405        }
2406#endif
2407        skb->destructor = sock_wfree;
2408        skb_set_hash_from_sk(skb, sk);
2409        /*
2410         * We used to take a refcount on sk, but the following operation
2411         * is enough to guarantee sk_free() won't free this sock until
2412         * all in-flight packets are completed.
2413         */
2414        refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2415}
2416EXPORT_SYMBOL(skb_set_owner_w);
2417
2418static bool can_skb_orphan_partial(const struct sk_buff *skb)
2419{
2420#ifdef CONFIG_TLS_DEVICE
2421        /* Drivers depend on in-order delivery for crypto offload,
2422         * partial orphan breaks out-of-order-OK logic.
2423         */
2424        if (skb->decrypted)
2425                return false;
2426#endif
2427        return (skb->destructor == sock_wfree ||
2428                (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2429}
2430
2431/* This helper is used by netem, as it can hold packets in its
2432 * delay queue. We want to allow the owner socket to send more
2433 * packets, as if they were already TX completed by a typical driver.
2434 * But we also want to keep skb->sk set because some packet schedulers
2435 * rely on it (sch_fq for example).
2436 */
2437void skb_orphan_partial(struct sk_buff *skb)
2438{
2439        if (skb_is_tcp_pure_ack(skb))
2440                return;
2441
2442        if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2443                return;
2444
2445        skb_orphan(skb);
2446}
2447EXPORT_SYMBOL(skb_orphan_partial);
2448
2449/*
2450 * Read buffer destructor automatically called from kfree_skb.
2451 */
2452void sock_rfree(struct sk_buff *skb)
2453{
2454        struct sock *sk = skb->sk;
2455        unsigned int len = skb->truesize;
2456
2457        atomic_sub(len, &sk->sk_rmem_alloc);
2458        sk_mem_uncharge(sk, len);
2459}
2460EXPORT_SYMBOL(sock_rfree);
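
/* Editor's note: an illustrative sketch, not part of the kernel source.
 * sock_rfree() is installed by skb_set_owner_r() (include/net/sock.h),
 * which charges skb->truesize to sk_rmem_alloc before the skb is queued:
 *
 *	skb_set_owner_r(skb, sk);	// destructor becomes sock_rfree
 *	skb_queue_tail(&sk->sk_receive_queue, skb);
 */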
2461
2462/*
2463 * Buffer destructor for skbs that are not used directly in read or write
2464 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2465 */
2466void sock_efree(struct sk_buff *skb)
2467{
2468        sock_put(skb->sk);
2469}
2470EXPORT_SYMBOL(sock_efree);
2471
2472/* Buffer destructor for prefetch/receive path where reference count may
2473 * not be held, e.g. for listen sockets.
2474 */
2475#ifdef CONFIG_INET
2476void sock_pfree(struct sk_buff *skb)
2477{
2478        if (sk_is_refcounted(skb->sk))
2479                sock_gen_put(skb->sk);
2480}
2481EXPORT_SYMBOL(sock_pfree);
2482#endif /* CONFIG_INET */
2483
2484kuid_t sock_i_uid(struct sock *sk)
2485{
2486        kuid_t uid;
2487
2488        read_lock_bh(&sk->sk_callback_lock);
2489        uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2490        read_unlock_bh(&sk->sk_callback_lock);
2491        return uid;
2492}
2493EXPORT_SYMBOL(sock_i_uid);
2494
2495unsigned long sock_i_ino(struct sock *sk)
2496{
2497        unsigned long ino;
2498
2499        read_lock_bh(&sk->sk_callback_lock);
2500        ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2501        read_unlock_bh(&sk->sk_callback_lock);
2502        return ino;
2503}
2504EXPORT_SYMBOL(sock_i_ino);
2505
2506/*
2507 * Allocate a skb from the socket's send buffer.
2508 */
2509struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2510                             gfp_t priority)
2511{
2512        if (force ||
2513            refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2514                struct sk_buff *skb = alloc_skb(size, priority);
2515
2516                if (skb) {
2517                        skb_set_owner_w(skb, sk);
2518                        return skb;
2519                }
2520        }
2521        return NULL;
2522}
2523EXPORT_SYMBOL(sock_wmalloc);
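
/* Editor's note: an illustrative sketch, not part of the kernel source.
 * Because sock_wmalloc() calls skb_set_owner_w(), the skb is charged to
 * the sender's sk_wmem_alloc until it is freed; "hdr_len" is hypothetical:
 *
 *	struct sk_buff *skb = sock_wmalloc(sk, hdr_len + len, 0, GFP_KERNEL);
 *
 *	if (!skb)
 *		return -ENOBUFS;
 *	skb_reserve(skb, hdr_len);
 *	// fill in the payload, then hand the skb to the lower layer
 */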
2524
2525static void sock_ofree(struct sk_buff *skb)
2526{
2527        struct sock *sk = skb->sk;
2528
2529        atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2530}
2531
2532struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2533                             gfp_t priority)
2534{
2535        struct sk_buff *skb;
2536
2537        /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2538        if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2539            READ_ONCE(sysctl_optmem_max))
2540                return NULL;
2541
2542        skb = alloc_skb(size, priority);
2543        if (!skb)
2544                return NULL;
2545
2546        atomic_add(skb->truesize, &sk->sk_omem_alloc);
2547        skb->sk = sk;
2548        skb->destructor = sock_ofree;
2549        return skb;
2550}
2551
2552/*
2553 * Allocate a memory block from the socket's option memory buffer.
2554 */
2555void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2556{
2557        int optmem_max = READ_ONCE(sysctl_optmem_max);
2558
2559        if ((unsigned int)size <= optmem_max &&
2560            atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2561                void *mem;
2562                /* First do the add, to avoid the race if kmalloc
2563                 * might sleep.
2564                 */
2565                atomic_add(size, &sk->sk_omem_alloc);
2566                mem = kmalloc(size, priority);
2567                if (mem)
2568                        return mem;
2569                atomic_sub(size, &sk->sk_omem_alloc);
2570        }
2571        return NULL;
2572}
2573EXPORT_SYMBOL(sock_kmalloc);
2574
2575/* Free an option memory block. Note, we actually want the inline
2576 * here as this allows gcc to detect the nullify and fold away the
2577 * condition entirely.
2578 */
2579static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2580                                  const bool nullify)
2581{
2582        if (WARN_ON_ONCE(!mem))
2583                return;
2584        if (nullify)
2585                kfree_sensitive(mem);
2586        else
2587                kfree(mem);
2588        atomic_sub(size, &sk->sk_omem_alloc);
2589}
2590
2591void sock_kfree_s(struct sock *sk, void *mem, int size)
2592{
2593        __sock_kfree_s(sk, mem, size, false);
2594}
2595EXPORT_SYMBOL(sock_kfree_s);
2596
2597void sock_kzfree_s(struct sock *sk, void *mem, int size)
2598{
2599        __sock_kfree_s(sk, mem, size, true);
2600}
2601EXPORT_SYMBOL(sock_kzfree_s);
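
/* Editor's note: an illustrative sketch, not part of the kernel source, of
 * the usual pairing: memory charged to sk_omem_alloc by sock_kmalloc()
 * must be released with sock_kfree_s() (or sock_kzfree_s() for sensitive
 * data) using the same size:
 *
 *	void *opt = sock_kmalloc(sk, len, GFP_KERNEL);
 *
 *	if (!opt)
 *		return -ENOBUFS;
 *	// ... use opt ...
 *	sock_kfree_s(sk, opt, len);
 */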
2602
2603/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2604   I think these locks should be removed for datagram sockets.
2605 */
2606static long sock_wait_for_wmem(struct sock *sk, long timeo)
2607{
2608        DEFINE_WAIT(wait);
2609
2610        sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2611        for (;;) {
2612                if (!timeo)
2613                        break;
2614                if (signal_pending(current))
2615                        break;
2616                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2617                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2618                if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2619                        break;
2620                if (sk->sk_shutdown & SEND_SHUTDOWN)
2621                        break;
2622                if (sk->sk_err)
2623                        break;
2624                timeo = schedule_timeout(timeo);
2625        }
2626        finish_wait(sk_sleep(sk), &wait);
2627        return timeo;
2628}
2629
2630
2631/*
2632 *      Generic send/receive buffer handlers
2633 */
2634
2635struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2636                                     unsigned long data_len, int noblock,
2637                                     int *errcode, int max_page_order)
2638{
2639        struct sk_buff *skb;
2640        long timeo;
2641        int err;
2642
2643        timeo = sock_sndtimeo(sk, noblock);
2644        for (;;) {
2645                err = sock_error(sk);
2646                if (err != 0)
2647                        goto failure;
2648
2649                err = -EPIPE;
2650                if (sk->sk_shutdown & SEND_SHUTDOWN)
2651                        goto failure;
2652
2653                if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2654                        break;
2655
2656                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2657                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2658                err = -EAGAIN;
2659                if (!timeo)
2660                        goto failure;
2661                if (signal_pending(current))
2662                        goto interrupted;
2663                timeo = sock_wait_for_wmem(sk, timeo);
2664        }
2665        skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2666                                   errcode, sk->sk_allocation);
2667        if (skb)
2668                skb_set_owner_w(skb, sk);
2669        return skb;
2670
2671interrupted:
2672        err = sock_intr_errno(timeo);
2673failure:
2674        *errcode = err;
2675        return NULL;
2676}
2677EXPORT_SYMBOL(sock_alloc_send_pskb);
2678
2679int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2680                     struct sockcm_cookie *sockc)
2681{
2682        u32 tsflags;
2683
2684        switch (cmsg->cmsg_type) {
2685        case SO_MARK:
2686                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2687                    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2688                        return -EPERM;
2689                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2690                        return -EINVAL;
2691                sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2692                break;
2693        case SO_TIMESTAMPING_OLD:
2694                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2695                        return -EINVAL;
2696
2697                tsflags = *(u32 *)CMSG_DATA(cmsg);
2698                if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2699                        return -EINVAL;
2700
2701                sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2702                sockc->tsflags |= tsflags;
2703                break;
2704        case SCM_TXTIME:
2705                if (!sock_flag(sk, SOCK_TXTIME))
2706                        return -EINVAL;
2707                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2708                        return -EINVAL;
2709                sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2710                break;
2711        /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2712        case SCM_RIGHTS:
2713        case SCM_CREDENTIALS:
2714                break;
2715        default:
2716                return -EINVAL;
2717        }
2718        return 0;
2719}
2720EXPORT_SYMBOL(__sock_cmsg_send);
2721
2722int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2723                   struct sockcm_cookie *sockc)
2724{
2725        struct cmsghdr *cmsg;
2726        int ret;
2727
2728        for_each_cmsghdr(cmsg, msg) {
2729                if (!CMSG_OK(msg, cmsg))
2730                        return -EINVAL;
2731                if (cmsg->cmsg_level != SOL_SOCKET)
2732                        continue;
2733                ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2734                if (ret)
2735                        return ret;
2736        }
2737        return 0;
2738}
2739EXPORT_SYMBOL(sock_cmsg_send);
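
/* Editor's note: an illustrative userspace sketch, not part of the kernel
 * source, of a control message accepted by the SCM_TXTIME case above.
 * SO_TXTIME must already be enabled on the socket; "txtime_ns" is an
 * absolute time in nanoseconds on the configured clock (assumed; uses
 * <sys/socket.h> and <string.h>):
 *
 *	char cbuf[CMSG_SPACE(sizeof(__u64))] = {};
 *	struct msghdr msg = { .msg_control = cbuf,
 *			      .msg_controllen = sizeof(cbuf) };
 *	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);
 *
 *	cm->cmsg_level = SOL_SOCKET;
 *	cm->cmsg_type  = SCM_TXTIME;
 *	cm->cmsg_len   = CMSG_LEN(sizeof(__u64));
 *	memcpy(CMSG_DATA(cm), &txtime_ns, sizeof(__u64));
 *	// also set msg_name/msg_iov, then sendmsg(fd, &msg, 0);
 */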
2740
2741static void sk_enter_memory_pressure(struct sock *sk)
2742{
2743        if (!sk->sk_prot->enter_memory_pressure)
2744                return;
2745
2746        sk->sk_prot->enter_memory_pressure(sk);
2747}
2748
2749static void sk_leave_memory_pressure(struct sock *sk)
2750{
2751        if (sk->sk_prot->leave_memory_pressure) {
2752                sk->sk_prot->leave_memory_pressure(sk);
2753        } else {
2754                unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2755
2756                if (memory_pressure && READ_ONCE(*memory_pressure))
2757                        WRITE_ONCE(*memory_pressure, 0);
2758        }
2759}
2760
2761DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2762
2763/**
2764 * skb_page_frag_refill - check that a page_frag contains enough room
2765 * @sz: minimum size of the fragment we want to get
2766 * @pfrag: pointer to page_frag
2767 * @gfp: priority for memory allocation
2768 *
2769 * Note: While this allocator tries to use high order pages, there is
2770 * no guarantee that allocations succeed. Therefore, @sz MUST be
2771 * less than or equal to PAGE_SIZE.
2772 */
2773bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2774{
2775        if (pfrag->page) {
2776                if (page_ref_count(pfrag->page) == 1) {
2777                        pfrag->offset = 0;
2778                        return true;
2779                }
2780                if (pfrag->offset + sz <= pfrag->size)
2781                        return true;
2782                put_page(pfrag->page);
2783        }
2784
2785        pfrag->offset = 0;
2786        if (SKB_FRAG_PAGE_ORDER &&
2787            !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2788                /* Avoid direct reclaim but allow kswapd to wake */
2789                pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2790                                          __GFP_COMP | __GFP_NOWARN |
2791                                          __GFP_NORETRY,
2792                                          SKB_FRAG_PAGE_ORDER);
2793                if (likely(pfrag->page)) {
2794                        pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2795                        return true;
2796                }
2797        }
2798        pfrag->page = alloc_page(gfp);
2799        if (likely(pfrag->page)) {
2800                pfrag->size = PAGE_SIZE;
2801                return true;
2802        }
2803        return false;
2804}
2805EXPORT_SYMBOL(skb_page_frag_refill);
2806
2807bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2808{
2809        if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2810                return true;
2811
2812        sk_enter_memory_pressure(sk);
2813        sk_stream_moderate_sndbuf(sk);
2814        return false;
2815}
2816EXPORT_SYMBOL(sk_page_frag_refill);
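
/* Editor's note: an illustrative sketch, not part of the kernel source, of
 * how senders typically consume the per-socket page_frag after a
 * successful refill (tcp_sendmsg() follows this shape); "data" and "copy"
 * are the caller's buffer and byte count:
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;		// hypothetical label
 *	memcpy(page_address(pfrag->page) + pfrag->offset, data, copy);
 *	pfrag->offset += copy;			// consume the reserved room
 */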
2817
2818void __lock_sock(struct sock *sk)
2819        __releases(&sk->sk_lock.slock)
2820        __acquires(&sk->sk_lock.slock)
2821{
2822        DEFINE_WAIT(wait);
2823
2824        for (;;) {
2825                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2826                                        TASK_UNINTERRUPTIBLE);
2827                spin_unlock_bh(&sk->sk_lock.slock);
2828                schedule();
2829                spin_lock_bh(&sk->sk_lock.slock);
2830                if (!sock_owned_by_user(sk))
2831                        break;
2832        }
2833        finish_wait(&sk->sk_lock.wq, &wait);
2834}
2835
2836void __release_sock(struct sock *sk)
2837        __releases(&sk->sk_lock.slock)
2838        __acquires(&sk->sk_lock.slock)
2839{
2840        struct sk_buff *skb, *next;
2841
2842        while ((skb = sk->sk_backlog.head) != NULL) {
2843                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2844
2845                spin_unlock_bh(&sk->sk_lock.slock);
2846
2847                do {
2848                        next = skb->next;
2849                        prefetch(next);
2850                        DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
2851                        skb_mark_not_on_list(skb);
2852                        sk_backlog_rcv(sk, skb);
2853
2854                        cond_resched();
2855
2856                        skb = next;
2857                } while (skb != NULL);
2858
2859                spin_lock_bh(&sk->sk_lock.slock);
2860        }
2861
2862        /*
2863         * Doing the zeroing here guarantees we cannot loop forever
2864         * while a wild producer attempts to flood us.
2865         */
2866        sk->sk_backlog.len = 0;
2867}
2868
2869void __sk_flush_backlog(struct sock *sk)
2870{
2871        spin_lock_bh(&sk->sk_lock.slock);
2872        __release_sock(sk);
2873        spin_unlock_bh(&sk->sk_lock.slock);
2874}
2875EXPORT_SYMBOL_GPL(__sk_flush_backlog);
2876
2877/**
2878 * sk_wait_data - wait for data to arrive at sk_receive_queue
2879 * @sk:    sock to wait on
2880 * @timeo: for how long
2881 * @skb:   last skb seen on sk_receive_queue
2882 *
2883 * Now socket state including sk->sk_err is changed only under lock,
2884 * hence we may omit checks after joining the wait queue.
2885 * We check the receive queue before schedule() only as an optimization;
2886 * it is very likely that release_sock() added new data.
2887 */
2888int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2889{
2890        DEFINE_WAIT_FUNC(wait, woken_wake_function);
2891        int rc;
2892
2893        add_wait_queue(sk_sleep(sk), &wait);
2894        sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2895        rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2896        sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2897        remove_wait_queue(sk_sleep(sk), &wait);
2898        return rc;
2899}
2900EXPORT_SYMBOL(sk_wait_data);
2901
2902/**
2903 *      __sk_mem_raise_allocated - increase memory_allocated
2904 *      @sk: socket
2905 *      @size: memory size to allocate
2906 *      @amt: pages to allocate
2907 *      @kind: allocation type
2908 *
2909 *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2910 */
2911int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2912{
2913        bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
2914        struct proto *prot = sk->sk_prot;
2915        bool charged = true;
2916        long allocated;
2917
2918        sk_memory_allocated_add(sk, amt);
2919        allocated = sk_memory_allocated(sk);
2920        if (memcg_charge &&
2921            !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2922                                                gfp_memcg_charge())))
2923                goto suppress_allocation;
2924
2925        /* Under limit. */
2926        if (allocated <= sk_prot_mem_limits(sk, 0)) {
2927                sk_leave_memory_pressure(sk);
2928                return 1;
2929        }
2930
2931        /* Under pressure. */
2932        if (allocated > sk_prot_mem_limits(sk, 1))
2933                sk_enter_memory_pressure(sk);
2934
2935        /* Over hard limit. */
2936        if (allocated > sk_prot_mem_limits(sk, 2))
2937                goto suppress_allocation;
2938
2939        /* guarantee minimum buffer size under pressure */
2940        if (kind == SK_MEM_RECV) {
2941                if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2942                        return 1;
2943
2944        } else { /* SK_MEM_SEND */
2945                int wmem0 = sk_get_wmem0(sk, prot);
2946
2947                if (sk->sk_type == SOCK_STREAM) {
2948                        if (sk->sk_wmem_queued < wmem0)
2949                                return 1;
2950                } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2951                        return 1;
2952                }
2953        }
2954
2955        if (sk_has_memory_pressure(sk)) {
2956                u64 alloc;
2957
2958                if (!sk_under_memory_pressure(sk))
2959                        return 1;
2960                alloc = sk_sockets_allocated_read_positive(sk);
2961                if (sk_prot_mem_limits(sk, 2) > alloc *
2962                    sk_mem_pages(sk->sk_wmem_queued +
2963                                 atomic_read(&sk->sk_rmem_alloc) +
2964                                 sk->sk_forward_alloc))
2965                        return 1;
2966        }
2967
2968suppress_allocation:
2969
2970        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2971                sk_stream_moderate_sndbuf(sk);
2972
2973                /* Fail only if socket is _under_ its sndbuf.
2974                 * In this case we cannot block, so that we have to fail.
2975                 */
2976                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
2977                        /* Force charge with __GFP_NOFAIL */
2978                        if (memcg_charge && !charged) {
2979                                mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2980                                        gfp_memcg_charge() | __GFP_NOFAIL);
2981                        }
2982                        return 1;
2983                }
2984        }
2985
2986        if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2987                trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2988
2989        sk_memory_allocated_sub(sk, amt);
2990
2991        if (memcg_charge && charged)
2992                mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2993
2994        return 0;
2995}
2996
2997/**
2998 *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2999 *      @sk: socket
3000 *      @size: memory size to allocate
3001 *      @kind: allocation type
3002 *
3003 *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3004 *      rmem allocation. This function assumes that protocols which have
3005 *      memory_pressure use sk_wmem_queued as write buffer accounting.
3006 */
3007int __sk_mem_schedule(struct sock *sk, int size, int kind)
3008{
3009        int ret, amt = sk_mem_pages(size);
3010
3011        sk->sk_forward_alloc += amt << PAGE_SHIFT;
3012        ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3013        if (!ret)
3014                sk->sk_forward_alloc -= amt << PAGE_SHIFT;
3015        return ret;
3016}
3017EXPORT_SYMBOL(__sk_mem_schedule);
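
/* Editor's note: a worked example, not part of the kernel source, assuming
 * 4 KiB pages: a request of size = 3000 bytes gives
 * amt = sk_mem_pages(3000) = 1, so sk_forward_alloc grows by
 * 1 << PAGE_SHIFT = 4096 bytes; if __sk_mem_raise_allocated() then refuses
 * the page, the same 4096 bytes are subtracted again.
 */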
3018
3019/**
3020 *      __sk_mem_reduce_allocated - reclaim memory_allocated
3021 *      @sk: socket
3022 *      @amount: number of quanta
3023 *
3024 *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3025 */
3026void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3027{
3028        sk_memory_allocated_sub(sk, amount);
3029
3030        if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3031                mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3032
3033        if (sk_under_memory_pressure(sk) &&
3034            (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3035                sk_leave_memory_pressure(sk);
3036}
3037
3038/**
3039 *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3040 *      @sk: socket
3041 *      @amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3042 */
3043void __sk_mem_reclaim(struct sock *sk, int amount)
3044{
3045        amount >>= PAGE_SHIFT;
3046        sk->sk_forward_alloc -= amount << PAGE_SHIFT;
3047        __sk_mem_reduce_allocated(sk, amount);
3048}
3049EXPORT_SYMBOL(__sk_mem_reclaim);
3050
3051int sk_set_peek_off(struct sock *sk, int val)
3052{
3053        sk->sk_peek_off = val;
3054        return 0;
3055}
3056EXPORT_SYMBOL_GPL(sk_set_peek_off);
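
/* Editor's note: an illustrative userspace sketch, not part of the kernel
 * source. sk_set_peek_off() backs SO_PEEK_OFF for families that provide
 * ->set_peek_off (e.g. AF_UNIX); once enabled, successive MSG_PEEK reads
 * advance a private offset instead of re-reading from the start:
 *
 *	int off = 0;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_PEEK_OFF, &off, sizeof(off));
 *	recv(fd, buf, 16, MSG_PEEK);	// peeks bytes 0..15
 *	recv(fd, buf, 16, MSG_PEEK);	// peeks bytes 16..31
 */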
3057
3058/*
3059 * Set of default routines for initialising struct proto_ops when
3060 * the protocol does not support a particular function. In certain
3061 * cases where it makes no sense for a protocol to have a "do nothing"
3062 * function, some default processing is provided.
3063 */
3064
3065int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3066{
3067        return -EOPNOTSUPP;
3068}
3069EXPORT_SYMBOL(sock_no_bind);
3070
3071int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
3072                    int len, int flags)
3073{
3074        return -EOPNOTSUPP;
3075}
3076EXPORT_SYMBOL(sock_no_connect);
3077
3078int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3079{
3080        return -EOPNOTSUPP;
3081}
3082EXPORT_SYMBOL(sock_no_socketpair);
3083
3084int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
3085                   bool kern)
3086{
3087        return -EOPNOTSUPP;
3088}
3089EXPORT_SYMBOL(sock_no_accept);
3090
3091int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3092                    int peer)
3093{
3094        return -EOPNOTSUPP;
3095}
3096EXPORT_SYMBOL(sock_no_getname);
3097
3098int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3099{
3100        return -EOPNOTSUPP;
3101}
3102EXPORT_SYMBOL(sock_no_ioctl);
3103
3104int sock_no_listen(struct socket *sock, int backlog)
3105{
3106        return -EOPNOTSUPP;
3107}
3108EXPORT_SYMBOL(sock_no_listen);
3109
3110int sock_no_shutdown(struct socket *sock, int how)
3111{
3112        return -EOPNOTSUPP;
3113}
3114EXPORT_SYMBOL(sock_no_shutdown);
3115
3116int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3117{
3118        return -EOPNOTSUPP;
3119}
3120EXPORT_SYMBOL(sock_no_sendmsg);
3121
3122int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3123{
3124        return -EOPNOTSUPP;
3125}
3126EXPORT_SYMBOL(sock_no_sendmsg_locked);
3127
3128int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3129                    int flags)
3130{
3131        return -EOPNOTSUPP;
3132}
3133EXPORT_SYMBOL(sock_no_recvmsg);
3134
3135int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3136{
3137        /* Mirror missing mmap method error code */
3138        return -ENODEV;
3139}
3140EXPORT_SYMBOL(sock_no_mmap);
3141
3142/*
3143 * When a file is received (via SCM_RIGHTS, etc), we must bump the
3144 * various sock-based usage counts.
3145 */
3146void __receive_sock(struct file *file)
3147{
3148        struct socket *sock;
3149
3150        sock = sock_from_file(file);
3151        if (sock) {
3152                sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3153                sock_update_classid(&sock->sk->sk_cgrp_data);
3154        }
3155}
3156
3157ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
3158{
3159        ssize_t res;
3160        struct msghdr msg = {.msg_flags = flags};
3161        struct kvec iov;
3162        char *kaddr = kmap(page);
3163        iov.iov_base = kaddr + offset;
3164        iov.iov_len = size;
3165        res = kernel_sendmsg(sock, &msg, &iov, 1, size);
3166        kunmap(page);
3167        return res;
3168}
3169EXPORT_SYMBOL(sock_no_sendpage);
3170
3171ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
3172                                int offset, size_t size, int flags)
3173{
3174        ssize_t res;
3175        struct msghdr msg = {.msg_flags = flags};
3176        struct kvec iov;
3177        char *kaddr = kmap(page);
3178
3179        iov.iov_base = kaddr + offset;
3180        iov.iov_len = size;
3181        res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
3182        kunmap(page);
3183        return res;
3184}
3185EXPORT_SYMBOL(sock_no_sendpage_locked);
3186
3187/*
3188 *      Default Socket Callbacks
3189 */
3190
3191static void sock_def_wakeup(struct sock *sk)
3192{
3193        struct socket_wq *wq;
3194
3195        rcu_read_lock();
3196        wq = rcu_dereference(sk->sk_wq);
3197        if (skwq_has_sleeper(wq))
3198                wake_up_interruptible_all(&wq->wait);
3199        rcu_read_unlock();
3200}
3201
3202static void sock_def_error_report(struct sock *sk)
3203{
3204        struct socket_wq *wq;
3205
3206        rcu_read_lock();
3207        wq = rcu_dereference(sk->sk_wq);
3208        if (skwq_has_sleeper(wq))
3209                wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3210        sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3211        rcu_read_unlock();
3212}
3213
3214void sock_def_readable(struct sock *sk)
3215{
3216        struct socket_wq *wq;
3217
3218        rcu_read_lock();
3219        wq = rcu_dereference(sk->sk_wq);
3220        if (skwq_has_sleeper(wq))
3221                wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3222                                                EPOLLRDNORM | EPOLLRDBAND);
3223        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3224        rcu_read_unlock();
3225}
3226
3227static void sock_def_write_space(struct sock *sk)
3228{
3229        struct socket_wq *wq;
3230
3231        rcu_read_lock();
3232
3233        /* Do not wake up a writer until he can make "significant"
3234         * progress.  --DaveM
3235         */
3236        if (sock_writeable(sk)) {
3237                wq = rcu_dereference(sk->sk_wq);
3238                if (skwq_has_sleeper(wq))
3239                        wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3240                                                EPOLLWRNORM | EPOLLWRBAND);
3241
3242                /* Should agree with poll, otherwise some programs break */
3243                sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3244        }
3245
3246        rcu_read_unlock();
3247}
3248
3249/* An optimised version of sock_def_write_space(); it should only be called
3250 * for SOCK_RCU_FREE sockets, under an RCU read-side section, and after
3251 * putting ->sk_wmem_alloc.
3252 */
3253static void sock_def_write_space_wfree(struct sock *sk)
3254{
3255        /* Do not wake up a writer until he can make "significant"
3256         * progress.  --DaveM
3257         */
3258        if (sock_writeable(sk)) {
3259                struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3260
3261                /* rely on refcount_sub from sock_wfree() */
3262                smp_mb__after_atomic();
3263                if (wq && waitqueue_active(&wq->wait))
3264                        wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3265                                                EPOLLWRNORM | EPOLLWRBAND);
3266
3267                /* Should agree with poll, otherwise some programs break */
3268                sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3269        }
3270}
3271
3272static void sock_def_destruct(struct sock *sk)
3273{
3274}
3275
3276void sk_send_sigurg(struct sock *sk)
3277{
3278        if (sk->sk_socket && sk->sk_socket->file)
3279                if (send_sigurg(&sk->sk_socket->file->f_owner))
3280                        sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3281}
3282EXPORT_SYMBOL(sk_send_sigurg);
3283
3284void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3285                    unsigned long expires)
3286{
3287        if (!mod_timer(timer, expires))
3288                sock_hold(sk);
3289}
3290EXPORT_SYMBOL(sk_reset_timer);
3291
3292void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3293{
3294        if (del_timer(timer))
3295                __sock_put(sk);
3296}
3297EXPORT_SYMBOL(sk_stop_timer);
3298
3299void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3300{
3301        if (del_timer_sync(timer))
3302                __sock_put(sk);
3303}
3304EXPORT_SYMBOL(sk_stop_timer_sync);
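
/*
 * Example (illustrative sketch): how a protocol typically pairs these helpers
 * with a timer callback.  sk_reset_timer() holds a reference on the socket
 * whenever it arms a timer that was not already pending, and the handler
 * releases that reference with sock_put() once its work (including any
 * re-arming) is done.  example_timeout() and EXAMPLE_INTERVAL are
 * hypothetical names.
 *
 *	static void example_timeout(struct timer_list *t)
 *	{
 *		struct sock *sk = from_timer(sk, t, sk_timer);
 *
 *		bh_lock_sock(sk);
 *		// ... protocol-specific work, possibly re-arming via
 *		// sk_reset_timer(sk, &sk->sk_timer, jiffies + EXAMPLE_INTERVAL);
 *		bh_unlock_sock(sk);
 *		sock_put(sk);		// balances the hold taken when armed
 *	}
 *
 *	// setup:   timer_setup(&sk->sk_timer, example_timeout, 0);
 *	// arm:     sk_reset_timer(sk, &sk->sk_timer, jiffies + EXAMPLE_INTERVAL);
 *	// cancel:  sk_stop_timer(sk, &sk->sk_timer);   // drops the pending ref
 */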
3305
3306void sock_init_data(struct socket *sock, struct sock *sk)
3307{
3308        sk_init_common(sk);
3309        sk->sk_send_head        =       NULL;
3310
3311        timer_setup(&sk->sk_timer, NULL, 0);
3312
3313        sk->sk_allocation       =       GFP_KERNEL;
3314        sk->sk_rcvbuf           =       READ_ONCE(sysctl_rmem_default);
3315        sk->sk_sndbuf           =       READ_ONCE(sysctl_wmem_default);
3316        sk->sk_state            =       TCP_CLOSE;
3317        sk_set_socket(sk, sock);
3318
3319        sock_set_flag(sk, SOCK_ZAPPED);
3320
3321        if (sock) {
3322                sk->sk_type     =       sock->type;
3323                RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3324                sock->sk        =       sk;
3325                sk->sk_uid      =       SOCK_INODE(sock)->i_uid;
3326        } else {
3327                RCU_INIT_POINTER(sk->sk_wq, NULL);
3328                sk->sk_uid      =       make_kuid(sock_net(sk)->user_ns, 0);
3329        }
3330
3331        rwlock_init(&sk->sk_callback_lock);
3332        if (sk->sk_kern_sock)
3333                lockdep_set_class_and_name(
3334                        &sk->sk_callback_lock,
3335                        af_kern_callback_keys + sk->sk_family,
3336                        af_family_kern_clock_key_strings[sk->sk_family]);
3337        else
3338                lockdep_set_class_and_name(
3339                        &sk->sk_callback_lock,
3340                        af_callback_keys + sk->sk_family,
3341                        af_family_clock_key_strings[sk->sk_family]);
3342
3343        sk->sk_state_change     =       sock_def_wakeup;
3344        sk->sk_data_ready       =       sock_def_readable;
3345        sk->sk_write_space      =       sock_def_write_space;
3346        sk->sk_error_report     =       sock_def_error_report;
3347        sk->sk_destruct         =       sock_def_destruct;
3348
3349        sk->sk_frag.page        =       NULL;
3350        sk->sk_frag.offset      =       0;
3351        sk->sk_peek_off         =       -1;
3352
3353        sk->sk_peer_pid         =       NULL;
3354        sk->sk_peer_cred        =       NULL;
3355        spin_lock_init(&sk->sk_peer_lock);
3356
3357        sk->sk_write_pending    =       0;
3358        sk->sk_rcvlowat         =       1;
3359        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
3360        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
3361
3362        sk->sk_stamp = SK_DEFAULT_STAMP;
3363#if BITS_PER_LONG==32
3364        seqlock_init(&sk->sk_stamp_seq);
3365#endif
3366        atomic_set(&sk->sk_zckey, 0);
3367
3368#ifdef CONFIG_NET_RX_BUSY_POLL
3369        sk->sk_napi_id          =       0;
3370        sk->sk_ll_usec          =       READ_ONCE(sysctl_net_busy_read);
3371#endif
3372
3373        sk->sk_max_pacing_rate = ~0UL;
3374        sk->sk_pacing_rate = ~0UL;
3375        WRITE_ONCE(sk->sk_pacing_shift, 10);
3376        sk->sk_incoming_cpu = -1;
3377        sk->sk_txrehash = SOCK_TXREHASH_DEFAULT;
3378
3379        sk_rx_queue_clear(sk);
3380        /*
3381         * Before updating sk_refcnt, we must commit prior changes to memory
3382         * (Documentation/RCU/rculist_nulls.rst for details)
3383         */
3384        smp_wmb();
3385        refcount_set(&sk->sk_refcnt, 1);
3386        atomic_set(&sk->sk_drops, 0);
3387}
3388EXPORT_SYMBOL(sock_init_data);
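
/*
 * Example (illustrative sketch): a protocol's create hook usually calls
 * sock_init_data() right after sk_alloc() and then overrides whichever
 * defaults it needs, such as the destructor or the data-ready callback.
 * example_create(), example_proto, example_ops, example_destruct() and
 * example_data_ready() are hypothetical names.
 *
 *	static int example_create(struct net *net, struct socket *sock,
 *				  int protocol, int kern)
 *	{
 *		struct sock *sk;
 *
 *		sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto, kern);
 *		if (!sk)
 *			return -ENOBUFS;
 *
 *		sock->ops = &example_ops;
 *		sock_init_data(sock, sk);
 *
 *		sk->sk_destruct = example_destruct;	// replaces sock_def_destruct
 *		sk->sk_data_ready = example_data_ready;	// replaces sock_def_readable
 *		return 0;
 *	}
 */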
3389
3390void lock_sock_nested(struct sock *sk, int subclass)
3391{
3392        /* The sk_lock has mutex_lock() semantics here. */
3393        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3394
3395        might_sleep();
3396        spin_lock_bh(&sk->sk_lock.slock);
3397        if (sock_owned_by_user_nocheck(sk))
3398                __lock_sock(sk);
3399        sk->sk_lock.owned = 1;
3400        spin_unlock_bh(&sk->sk_lock.slock);
3401}
3402EXPORT_SYMBOL(lock_sock_nested);
3403
3404void release_sock(struct sock *sk)
3405{
3406        spin_lock_bh(&sk->sk_lock.slock);
3407        if (sk->sk_backlog.tail)
3408                __release_sock(sk);
3409
3410        /* Warning: release_cb() might need to release sk ownership,
3411         * i.e. call sock_release_ownership(sk) before us.
3412         */
3413        if (sk->sk_prot->release_cb)
3414                sk->sk_prot->release_cb(sk);
3415
3416        sock_release_ownership(sk);
3417        if (waitqueue_active(&sk->sk_lock.wq))
3418                wake_up(&sk->sk_lock.wq);
3419        spin_unlock_bh(&sk->sk_lock.slock);
3420}
3421EXPORT_SYMBOL(release_sock);
3422
3423bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3424{
3425        might_sleep();
3426        spin_lock_bh(&sk->sk_lock.slock);
3427
3428        if (!sock_owned_by_user_nocheck(sk)) {
3429                /*
3430                 * Fast path return with bottom halves disabled and
3431                 * sock::sk_lock.slock held.
3432                 *
3433                 * The 'mutex' is not contended and holding
3434                 * sock::sk_lock.slock prevents all other lockers from
3435                 * proceeding, so the corresponding unlock_sock_fast() can
3436                 * avoid the slow path of release_sock() completely and
3437                 * just release slock.
3438                 *
3439                 * From a semantic POV this is equivalent to 'acquiring'
3440                 * the 'mutex', hence the corresponding lockdep
3441                 * mutex_release() has to happen in the fast path of
3442                 * unlock_sock_fast().
3443                 */
3444                return false;
3445        }
3446
3447        __lock_sock(sk);
3448        sk->sk_lock.owned = 1;
3449        __acquire(&sk->sk_lock.slock);
3450        spin_unlock_bh(&sk->sk_lock.slock);
3451        return true;
3452}
3453EXPORT_SYMBOL(__lock_sock_fast);
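
/*
 * Example (illustrative sketch): process context takes socket ownership with
 * lock_sock()/release_sock(); short, non-sleeping sections can use the fast
 * variant, passing the value returned by lock_sock_fast() back to
 * unlock_sock_fast().  example_getvalue() is a hypothetical helper.
 *
 *	static int example_getvalue(struct sock *sk)
 *	{
 *		bool slow = lock_sock_fast(sk);
 *		int val = sk->sk_rcvlowat;	// sample some socket state
 *
 *		unlock_sock_fast(sk, slow);
 *		return val;
 *	}
 *
 *	// or, for longer sections that may sleep while owning the socket:
 *	//	lock_sock(sk);
 *	//	... modify socket state ...
 *	//	release_sock(sk);
 */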
3454
3455int sock_gettstamp(struct socket *sock, void __user *userstamp,
3456                   bool timeval, bool time32)
3457{
3458        struct sock *sk = sock->sk;
3459        struct timespec64 ts;
3460
3461        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3462        ts = ktime_to_timespec64(sock_read_timestamp(sk));
3463        if (ts.tv_sec == -1)
3464                return -ENOENT;
3465        if (ts.tv_sec == 0) {
3466                ktime_t kt = ktime_get_real();
3467                sock_write_timestamp(sk, kt);
3468                ts = ktime_to_timespec64(kt);
3469        }
3470
3471        if (timeval)
3472                ts.tv_nsec /= 1000;
3473
3474#ifdef CONFIG_COMPAT_32BIT_TIME
3475        if (time32)
3476                return put_old_timespec32(&ts, userstamp);
3477#endif
3478#ifdef CONFIG_SPARC64
3479        /* beware of padding in sparc64 timeval */
3480        if (timeval && !in_compat_syscall()) {
3481                struct __kernel_old_timeval __user tv = {
3482                        .tv_sec = ts.tv_sec,
3483                        .tv_usec = ts.tv_nsec,
3484                };
3485                if (copy_to_user(userstamp, &tv, sizeof(tv)))
3486                        return -EFAULT;
3487                return 0;
3488        }
3489#endif
3490        return put_timespec64(&ts, userstamp);
3491}
3492EXPORT_SYMBOL(sock_gettstamp);
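
/*
 * Example (userspace view, illustrative): the SIOCGSTAMP ioctl serviced
 * through this helper reports when the last packet was received on the
 * socket; per the code above, -ENOENT means nothing has been timestamped yet.
 *
 *	struct timeval tv;
 *
 *	if (ioctl(fd, SIOCGSTAMP, &tv) == 0)
 *		printf("last packet at %ld.%06ld\n",
 *		       (long)tv.tv_sec, (long)tv.tv_usec);
 */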
3493
3494void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3495{
3496        if (!sock_flag(sk, flag)) {
3497                unsigned long previous_flags = sk->sk_flags;
3498
3499                sock_set_flag(sk, flag);
3500                /*
3501                 * We just set one of the two flags that require net
3502                 * time stamping, but time stamping might already have
3503                 * been on because of the other one.
3504                 */
3505                if (sock_needs_netstamp(sk) &&
3506                    !(previous_flags & SK_FLAGS_TIMESTAMP))
3507                        net_enable_timestamp();
3508        }
3509}
3510
3511int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3512                       int level, int type)
3513{
3514        struct sock_exterr_skb *serr;
3515        struct sk_buff *skb;
3516        int copied, err;
3517
3518        err = -EAGAIN;
3519        skb = sock_dequeue_err_skb(sk);
3520        if (skb == NULL)
3521                goto out;
3522
3523        copied = skb->len;
3524        if (copied > len) {
3525                msg->msg_flags |= MSG_TRUNC;
3526                copied = len;
3527        }
3528        err = skb_copy_datagram_msg(skb, 0, msg, copied);
3529        if (err)
3530                goto out_free_skb;
3531
3532        sock_recv_timestamp(msg, sk, skb);
3533
3534        serr = SKB_EXT_ERR(skb);
3535        put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3536
3537        msg->msg_flags |= MSG_ERRQUEUE;
3538        err = copied;
3539
3540out_free_skb:
3541        kfree_skb(skb);
3542out:
3543        return err;
3544}
3545EXPORT_SYMBOL(sock_recv_errqueue);
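
/*
 * Example (illustrative sketch): a protocol's recvmsg handler typically
 * branches to sock_recv_errqueue() when userspace asks for MSG_ERRQUEUE,
 * passing the cmsg level/type under which the sock_extended_err should be
 * reported.  SOL_EXAMPLE and EXAMPLE_RECVERR are hypothetical values.
 *
 *	static int example_recvmsg(struct socket *sock, struct msghdr *msg,
 *				   size_t len, int flags)
 *	{
 *		struct sock *sk = sock->sk;
 *
 *		if (flags & MSG_ERRQUEUE)
 *			return sock_recv_errqueue(sk, msg, len,
 *						  SOL_EXAMPLE, EXAMPLE_RECVERR);
 *
 *		// ... normal receive path ...
 *		return 0;
 *	}
 */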
3546
3547/*
3548 *      Get a socket option on a socket.
3549 *
3550 *      FIX: POSIX 1003.1g is very ambiguous here. It states that
3551 *      asynchronous errors should be reported by getsockopt. We assume
3552 *      this means if you specify SO_ERROR (otherwise what's the point of it).
3553 */
3554int sock_common_getsockopt(struct socket *sock, int level, int optname,
3555                           char __user *optval, int __user *optlen)
3556{
3557        struct sock *sk = sock->sk;
3558
3559        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3560}
3561EXPORT_SYMBOL(sock_common_getsockopt);
3562
3563int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3564                        int flags)
3565{
3566        struct sock *sk = sock->sk;
3567        int addr_len = 0;
3568        int err;
3569
3570        err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3571        if (err >= 0)
3572                msg->msg_namelen = addr_len;
3573        return err;
3574}
3575EXPORT_SYMBOL(sock_common_recvmsg);
3576
3577/*
3578 *      Set socket options on an inet socket.
3579 */
3580int sock_common_setsockopt(struct socket *sock, int level, int optname,
3581                           sockptr_t optval, unsigned int optlen)
3582{
3583        struct sock *sk = sock->sk;
3584
3585        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3586}
3587EXPORT_SYMBOL(sock_common_setsockopt);
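
/*
 * Example (illustrative sketch): a proto_ops that merely forwards these
 * operations to the underlying struct proto can plug the sock_common_*()
 * helpers in directly, as several in-tree families do.  Continuing the
 * hypothetical example_ops sketch from earlier:
 *
 *	static const struct proto_ops example_ops = {
 *		.family		= PF_EXAMPLE,
 *		.owner		= THIS_MODULE,
 *		.getsockopt	= sock_common_getsockopt,
 *		.setsockopt	= sock_common_setsockopt,
 *		.recvmsg	= sock_common_recvmsg,
 *		// remaining members elided
 *	};
 */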
3588
3589void sk_common_release(struct sock *sk)
3590{
3591        if (sk->sk_prot->destroy)
3592                sk->sk_prot->destroy(sk);
3593
3594        /*
3595         * Observation: when sk_common_release() is called, processes no longer
3596         * have access to the socket, but the network stack still does.
3597         * Step one, detach it from networking:
3598         *
3599         * A. Remove from hash tables.
3600         */
3601
3602        sk->sk_prot->unhash(sk);
3603
3604        /*
3605         * At this point the socket cannot receive new packets, but some may
3606         * still be in flight because another CPU ran the receive path and did
3607         * its hash table lookup before we unhashed the socket. Those packets
3608         * will reach the receive queue and be purged by the socket destructor.
3609         *
3610         * We may also still have packets pending on the receive queue and,
3611         * probably, our own packets waiting in device queues. sock_destroy will
3612         * drain the receive queue, but transmitted packets delay socket
3613         * destruction until the last reference is released.
3614         */
3615
3616        sock_orphan(sk);
3617
3618        xfrm_sk_free_policy(sk);
3619
3620        sk_refcnt_debug_release(sk);
3621
3622        sock_put(sk);
3623}
3624EXPORT_SYMBOL(sk_common_release);
3625
3626void sk_get_meminfo(const struct sock *sk, u32 *mem)
3627{
3628        memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3629
3630        mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3631        mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3632        mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3633        mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3634        mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3635        mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3636        mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3637        mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3638        mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3639}
3640
3641#ifdef CONFIG_PROC_FS
3642static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3643
3644int sock_prot_inuse_get(struct net *net, struct proto *prot)
3645{
3646        int cpu, idx = prot->inuse_idx;
3647        int res = 0;
3648
3649        for_each_possible_cpu(cpu)
3650                res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3651
3652        return res >= 0 ? res : 0;
3653}
3654EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3655
3656int sock_inuse_get(struct net *net)
3657{
3658        int cpu, res = 0;
3659
3660        for_each_possible_cpu(cpu)
3661                res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3662
3663        return res;
3664}
3665
3666EXPORT_SYMBOL_GPL(sock_inuse_get);
3667
3668static int __net_init sock_inuse_init_net(struct net *net)
3669{
3670        net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3671        if (net->core.prot_inuse == NULL)
3672                return -ENOMEM;
3673        return 0;
3674}
3675
3676static void __net_exit sock_inuse_exit_net(struct net *net)
3677{
3678        free_percpu(net->core.prot_inuse);
3679}
3680
3681static struct pernet_operations net_inuse_ops = {
3682        .init = sock_inuse_init_net,
3683        .exit = sock_inuse_exit_net,
3684};
3685
3686static __init int net_inuse_init(void)
3687{
3688        if (register_pernet_subsys(&net_inuse_ops))
3689                panic("Cannot initialize net inuse counters");
3690
3691        return 0;
3692}
3693
3694core_initcall(net_inuse_init);
3695
3696static int assign_proto_idx(struct proto *prot)
3697{
3698        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3699
3700        if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3701                pr_err("PROTO_INUSE_NR exhausted\n");
3702                return -ENOSPC;
3703        }
3704
3705        set_bit(prot->inuse_idx, proto_inuse_idx);
3706        return 0;
3707}
3708
3709static void release_proto_idx(struct proto *prot)
3710{
3711        if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3712                clear_bit(prot->inuse_idx, proto_inuse_idx);
3713}
3714#else
3715static inline int assign_proto_idx(struct proto *prot)
3716{
3717        return 0;
3718}
3719
3720static inline void release_proto_idx(struct proto *prot)
3721{
3722}
3723
3724#endif
3725
3726static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3727{
3728        if (!twsk_prot)
3729                return;
3730        kfree(twsk_prot->twsk_slab_name);
3731        twsk_prot->twsk_slab_name = NULL;
3732        kmem_cache_destroy(twsk_prot->twsk_slab);
3733        twsk_prot->twsk_slab = NULL;
3734}
3735
3736static int tw_prot_init(const struct proto *prot)
3737{
3738        struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3739
3740        if (!twsk_prot)
3741                return 0;
3742
3743        twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3744                                              prot->name);
3745        if (!twsk_prot->twsk_slab_name)
3746                return -ENOMEM;
3747
3748        twsk_prot->twsk_slab =
3749                kmem_cache_create(twsk_prot->twsk_slab_name,
3750                                  twsk_prot->twsk_obj_size, 0,
3751                                  SLAB_ACCOUNT | prot->slab_flags,
3752                                  NULL);
3753        if (!twsk_prot->twsk_slab) {
3754                pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3755                        prot->name);
3756                return -ENOMEM;
3757        }
3758
3759        return 0;
3760}
3761
3762static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3763{
3764        if (!rsk_prot)
3765                return;
3766        kfree(rsk_prot->slab_name);
3767        rsk_prot->slab_name = NULL;
3768        kmem_cache_destroy(rsk_prot->slab);
3769        rsk_prot->slab = NULL;
3770}
3771
3772static int req_prot_init(const struct proto *prot)
3773{
3774        struct request_sock_ops *rsk_prot = prot->rsk_prot;
3775
3776        if (!rsk_prot)
3777                return 0;
3778
3779        rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3780                                        prot->name);
3781        if (!rsk_prot->slab_name)
3782                return -ENOMEM;
3783
3784        rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3785                                           rsk_prot->obj_size, 0,
3786                                           SLAB_ACCOUNT | prot->slab_flags,
3787                                           NULL);
3788
3789        if (!rsk_prot->slab) {
3790                pr_crit("%s: Can't create request sock SLAB cache!\n",
3791                        prot->name);
3792                return -ENOMEM;
3793        }
3794        return 0;
3795}
3796
3797int proto_register(struct proto *prot, int alloc_slab)
3798{
3799        int ret = -ENOBUFS;
3800
3801        if (prot->memory_allocated && !prot->sysctl_mem) {
3802                pr_err("%s: missing sysctl_mem\n", prot->name);
3803                return -EINVAL;
3804        }
3805        if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
3806                pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
3807                return -EINVAL;
3808        }
3809        if (alloc_slab) {
3810                prot->slab = kmem_cache_create_usercopy(prot->name,
3811                                        prot->obj_size, 0,
3812                                        SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3813                                        prot->slab_flags,
3814                                        prot->useroffset, prot->usersize,
3815                                        NULL);
3816
3817                if (prot->slab == NULL) {
3818                        pr_crit("%s: Can't create sock SLAB cache!\n",
3819                                prot->name);
3820                        goto out;
3821                }
3822
3823                if (req_prot_init(prot))
3824                        goto out_free_request_sock_slab;
3825
3826                if (tw_prot_init(prot))
3827                        goto out_free_timewait_sock_slab;
3828        }
3829
3830        mutex_lock(&proto_list_mutex);
3831        ret = assign_proto_idx(prot);
3832        if (ret) {
3833                mutex_unlock(&proto_list_mutex);
3834                goto out_free_timewait_sock_slab;
3835        }
3836        list_add(&prot->node, &proto_list);
3837        mutex_unlock(&proto_list_mutex);
3838        return ret;
3839
3840out_free_timewait_sock_slab:
3841        if (alloc_slab)
3842                tw_prot_cleanup(prot->twsk_prot);
3843out_free_request_sock_slab:
3844        if (alloc_slab) {
3845                req_prot_cleanup(prot->rsk_prot);
3846
3847                kmem_cache_destroy(prot->slab);
3848                prot->slab = NULL;
3849        }
3850out:
3851        return ret;
3852}
3853EXPORT_SYMBOL(proto_register);
3854
3855void proto_unregister(struct proto *prot)
3856{
3857        mutex_lock(&proto_list_mutex);
3858        release_proto_idx(prot);
3859        list_del(&prot->node);
3860        mutex_unlock(&proto_list_mutex);
3861
3862        kmem_cache_destroy(prot->slab);
3863        prot->slab = NULL;
3864
3865        req_prot_cleanup(prot->rsk_prot);
3866        tw_prot_cleanup(prot->twsk_prot);
3867}
3868EXPORT_SYMBOL(proto_unregister);
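
/*
 * Example (illustrative sketch): module init/exit for a protocol built on
 * these helpers.  Passing alloc_slab == 1 makes proto_register() create a
 * kmem cache of obj_size bytes for the protocol's sockets.  example_proto,
 * struct example_sock, example_family_ops and PF_EXAMPLE are hypothetical.
 *
 *	static struct proto example_proto = {
 *		.name		= "EXAMPLE",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct example_sock),
 *	};
 *
 *	static int __init example_init(void)
 *	{
 *		int err = proto_register(&example_proto, 1);
 *
 *		if (err)
 *			return err;
 *		err = sock_register(&example_family_ops);
 *		if (err)
 *			proto_unregister(&example_proto);
 *		return err;
 *	}
 *
 *	static void __exit example_exit(void)
 *	{
 *		sock_unregister(PF_EXAMPLE);
 *		proto_unregister(&example_proto);
 *	}
 */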
3869
3870int sock_load_diag_module(int family, int protocol)
3871{
3872        if (!protocol) {
3873                if (!sock_is_registered(family))
3874                        return -ENOENT;
3875
3876                return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3877                                      NETLINK_SOCK_DIAG, family);
3878        }
3879
3880#ifdef CONFIG_INET
3881        if (family == AF_INET &&
3882            protocol != IPPROTO_RAW &&
3883            protocol < MAX_INET_PROTOS &&
3884            !rcu_access_pointer(inet_protos[protocol]))
3885                return -ENOENT;
3886#endif
3887
3888        return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3889                              NETLINK_SOCK_DIAG, family, protocol);
3890}
3891EXPORT_SYMBOL(sock_load_diag_module);
3892
3893#ifdef CONFIG_PROC_FS
3894static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3895        __acquires(proto_list_mutex)
3896{
3897        mutex_lock(&proto_list_mutex);
3898        return seq_list_start_head(&proto_list, *pos);
3899}
3900
3901static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3902{
3903        return seq_list_next(v, &proto_list, pos);
3904}
3905
3906static void proto_seq_stop(struct seq_file *seq, void *v)
3907        __releases(proto_list_mutex)
3908{
3909        mutex_unlock(&proto_list_mutex);
3910}
3911
3912static char proto_method_implemented(const void *method)
3913{
3914        return method == NULL ? 'n' : 'y';
3915}
3916static long sock_prot_memory_allocated(struct proto *proto)
3917{
3918        return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3919}
3920
3921static const char *sock_prot_memory_pressure(struct proto *proto)
3922{
3923        return proto->memory_pressure != NULL ?
3924        proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3925}
3926
3927static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3928{
3929
3930        seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3931                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3932                   proto->name,
3933                   proto->obj_size,
3934                   sock_prot_inuse_get(seq_file_net(seq), proto),
3935                   sock_prot_memory_allocated(proto),
3936                   sock_prot_memory_pressure(proto),
3937                   proto->max_header,
3938                   proto->slab == NULL ? "no" : "yes",
3939                   module_name(proto->owner),
3940                   proto_method_implemented(proto->close),
3941                   proto_method_implemented(proto->connect),
3942                   proto_method_implemented(proto->disconnect),
3943                   proto_method_implemented(proto->accept),
3944                   proto_method_implemented(proto->ioctl),
3945                   proto_method_implemented(proto->init),
3946                   proto_method_implemented(proto->destroy),
3947                   proto_method_implemented(proto->shutdown),
3948                   proto_method_implemented(proto->setsockopt),
3949                   proto_method_implemented(proto->getsockopt),
3950                   proto_method_implemented(proto->sendmsg),
3951                   proto_method_implemented(proto->recvmsg),
3952                   proto_method_implemented(proto->sendpage),
3953                   proto_method_implemented(proto->bind),
3954                   proto_method_implemented(proto->backlog_rcv),
3955                   proto_method_implemented(proto->hash),
3956                   proto_method_implemented(proto->unhash),
3957                   proto_method_implemented(proto->get_port),
3958                   proto_method_implemented(proto->enter_memory_pressure));
3959}
3960
3961static int proto_seq_show(struct seq_file *seq, void *v)
3962{
3963        if (v == &proto_list)
3964                seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3965                           "protocol",
3966                           "size",
3967                           "sockets",
3968                           "memory",
3969                           "press",
3970                           "maxhdr",
3971                           "slab",
3972                           "module",
3973                           "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3974        else
3975                proto_seq_printf(seq, list_entry(v, struct proto, node));
3976        return 0;
3977}
3978
3979static const struct seq_operations proto_seq_ops = {
3980        .start  = proto_seq_start,
3981        .next   = proto_seq_next,
3982        .stop   = proto_seq_stop,
3983        .show   = proto_seq_show,
3984};
3985
3986static __net_init int proto_init_net(struct net *net)
3987{
3988        if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3989                        sizeof(struct seq_net_private)))
3990                return -ENOMEM;
3991
3992        return 0;
3993}
3994
3995static __net_exit void proto_exit_net(struct net *net)
3996{
3997        remove_proc_entry("protocols", net->proc_net);
3998}
3999
4000
4001static __net_initdata struct pernet_operations proto_net_ops = {
4002        .init = proto_init_net,
4003        .exit = proto_exit_net,
4004};
4005
4006static int __init proto_init(void)
4007{
4008        return register_pernet_subsys(&proto_net_ops);
4009}
4010
4011subsys_initcall(proto_init);
4012
4013#endif /* PROC_FS */
4014
4015#ifdef CONFIG_NET_RX_BUSY_POLL
4016bool sk_busy_loop_end(void *p, unsigned long start_time)
4017{
4018        struct sock *sk = p;
4019
4020        return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
4021               sk_busy_loop_timeout(sk, start_time);
4022}
4023EXPORT_SYMBOL(sk_busy_loop_end);
4024#endif /* CONFIG_NET_RX_BUSY_POLL */
4025
4026int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4027{
4028        if (!sk->sk_prot->bind_add)
4029                return -EOPNOTSUPP;
4030        return sk->sk_prot->bind_add(sk, addr, addr_len);
4031}
4032EXPORT_SYMBOL(sock_bind_add);
4033