linux/net/core/sock.c
<<
>>
Prefs
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Generic socket support routines. Memory allocators, socket lock/release
   7 *              handler for protocols to use and generic option handler.
   8 *
   9 *
  10 * Authors:     Ross Biro
  11 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *              Florian La Roche, <flla@stud.uni-sb.de>
  13 *              Alan Cox, <A.Cox@swansea.ac.uk>
  14 *
  15 * Fixes:
  16 *              Alan Cox        :       Numerous verify_area() problems
  17 *              Alan Cox        :       Connecting on a connecting socket
  18 *                                      now returns an error for tcp.
  19 *              Alan Cox        :       sock->protocol is set correctly.
  20 *                                      and is not sometimes left as 0.
  21 *              Alan Cox        :       connect handles icmp errors on a
  22 *                                      connect properly. Unfortunately there
  23 *                                      is a restart syscall nasty there. I
  24 *                                      can't match BSD without hacking the C
  25 *                                      library. Ideas urgently sought!
  26 *              Alan Cox        :       Disallow bind() to addresses that are
  27 *                                      not ours - especially broadcast ones!!
  28 *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29 *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30 *                                      instead they leave that for the DESTROY timer.
  31 *              Alan Cox        :       Clean up error flag in accept
  32 *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33 *                                      was buggy. Put a remove_sock() in the handler
  34 *                                      for memory when we hit 0. Also altered the timer
  35 *                                      code. The ACK stuff can wait and needs major
  36 *                                      TCP layer surgery.
  37 *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38 *                                      and fixed timer/inet_bh race.
  39 *              Alan Cox        :       Added zapped flag for TCP
  40 *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41 *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42 *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43 *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44 *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45 *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46 *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47 *      Pauline Middelink       :       identd support
  48 *              Alan Cox        :       Fixed connect() taking signals I think.
  49 *              Alan Cox        :       SO_LINGER supported
  50 *              Alan Cox        :       Error reporting fixes
  51 *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52 *              Alan Cox        :       inet sockets don't set sk->type!
  53 *              Alan Cox        :       Split socket option code
  54 *              Alan Cox        :       Callbacks
  55 *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56 *              Alex            :       Removed restriction on inet fioctl
  57 *              Alan Cox        :       Splitting INET from NET core
  58 *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59 *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60 *              Alan Cox        :       Split IP from generic code
  61 *              Alan Cox        :       New kfree_skbmem()
  62 *              Alan Cox        :       Make SO_DEBUG superuser only.
  63 *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64 *                                      (compatibility fix)
  65 *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66 *              Alan Cox        :       Allocator for a socket is settable.
  67 *              Alan Cox        :       SO_ERROR includes soft errors.
  68 *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69 *              Alan Cox        :       Generic socket allocation to make hooks
  70 *                                      easier (suggested by Craig Metz).
  71 *              Michael Pall    :       SO_ERROR returns positive errno again
  72 *              Steve Whitehouse:       Added default destructor to free
  73 *                                      protocol private data.
  74 *              Steve Whitehouse:       Added various other default routines
  75 *                                      common to several socket families.
  76 *              Chris Evans     :       Call suser() check last on F_SETOWN
  77 *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78 *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79 *              Andi Kleen      :       Fix write_space callback
  80 *              Chris Evans     :       Security fixes - signedness again
  81 *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82 *
  83 * To Fix:
  84 *
  85 *
  86 *              This program is free software; you can redistribute it and/or
  87 *              modify it under the terms of the GNU General Public License
  88 *              as published by the Free Software Foundation; either version
  89 *              2 of the License, or (at your option) any later version.
  90 */
  91
  92#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94#include <linux/capability.h>
  95#include <linux/errno.h>
  96#include <linux/errqueue.h>
  97#include <linux/types.h>
  98#include <linux/socket.h>
  99#include <linux/in.h>
 100#include <linux/kernel.h>
 101#include <linux/module.h>
 102#include <linux/proc_fs.h>
 103#include <linux/seq_file.h>
 104#include <linux/sched.h>
 105#include <linux/timer.h>
 106#include <linux/string.h>
 107#include <linux/sockios.h>
 108#include <linux/net.h>
 109#include <linux/mm.h>
 110#include <linux/slab.h>
 111#include <linux/interrupt.h>
 112#include <linux/poll.h>
 113#include <linux/tcp.h>
 114#include <linux/init.h>
 115#include <linux/highmem.h>
 116#include <linux/user_namespace.h>
 117#include <linux/static_key.h>
 118#include <linux/memcontrol.h>
 119#include <linux/prefetch.h>
 120
 121#include <asm/uaccess.h>
 122
 123#include <linux/netdevice.h>
 124#include <net/protocol.h>
 125#include <linux/skbuff.h>
 126#include <net/net_namespace.h>
 127#include <net/request_sock.h>
 128#include <net/sock.h>
 129#include <linux/net_tstamp.h>
 130#include <net/xfrm.h>
 131#include <linux/ipsec.h>
 132#include <net/cls_cgroup.h>
 133#include <net/netprio_cgroup.h>
 134#include <linux/sock_diag.h>
 135
 136#include <linux/filter.h>
 137#include <net/sock_reuseport.h>
 138
 139#include <trace/events/sock.h>
 140
 141#ifdef CONFIG_INET
 142#include <net/tcp.h>
 143#endif
 144
 145#include <net/busy_poll.h>
 146
 147static DEFINE_MUTEX(proto_list_mutex);
 148static LIST_HEAD(proto_list);
 149
 150/**
 151 * sk_ns_capable - General socket capability test
 152 * @sk: Socket to use a capability on or through
 153 * @user_ns: The user namespace of the capability to use
 154 * @cap: The capability to use
 155 *
 156 * Test to see if the opener of the socket had when the socket was
 157 * created and the current process has the capability @cap in the user
 158 * namespace @user_ns.
 159 */
 160bool sk_ns_capable(const struct sock *sk,
 161                   struct user_namespace *user_ns, int cap)
 162{
 163        return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 164                ns_capable(user_ns, cap);
 165}
 166EXPORT_SYMBOL(sk_ns_capable);
 167
 168/**
 169 * sk_capable - Socket global capability test
 170 * @sk: Socket to use a capability on or through
 171 * @cap: The global capability to use
 172 *
 173 * Test to see if the opener of the socket had when the socket was
 174 * created and the current process has the capability @cap in all user
 175 * namespaces.
 176 */
 177bool sk_capable(const struct sock *sk, int cap)
 178{
 179        return sk_ns_capable(sk, &init_user_ns, cap);
 180}
 181EXPORT_SYMBOL(sk_capable);
 182
 183/**
 184 * sk_net_capable - Network namespace socket capability test
 185 * @sk: Socket to use a capability on or through
 186 * @cap: The capability to use
 187 *
 188 * Test to see if the opener of the socket had when the socket was created
 189 * and the current process has the capability @cap over the network namespace
 190 * the socket is a member of.
 191 */
 192bool sk_net_capable(const struct sock *sk, int cap)
 193{
 194        return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 195}
 196EXPORT_SYMBOL(sk_net_capable);
 197
 198/*
 199 * Each address family might have different locking rules, so we have
 200 * one slock key per address family:
 201 */
 202static struct lock_class_key af_family_keys[AF_MAX];
 203static struct lock_class_key af_family_slock_keys[AF_MAX];
 204
 205/*
 206 * Make lock validator output more readable. (we pre-construct these
 207 * strings build-time, so that runtime initialization of socket
 208 * locks is fast):
 209 */
 210static const char *const af_family_key_strings[AF_MAX+1] = {
 211  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
 212  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
 213  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
 214  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
 215  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
 216  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
 217  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
 218  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
 219  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
 220  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
 221  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
 222  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
 223  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
 224  "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_KCM"      ,
 225  "sk_lock-AF_QIPCRTR", "sk_lock-AF_MAX"
 226};
 227static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 228  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
 229  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
 230  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
 231  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
 232  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
 233  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
 234  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
 235  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
 236  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
 237  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
 238  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
 239  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
 240  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
 241  "slock-AF_NFC"   , "slock-AF_VSOCK"    ,"slock-AF_KCM"       ,
 242  "slock-AF_QIPCRTR", "slock-AF_MAX"
 243};
 244static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 245  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
 246  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
 247  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
 248  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
 249  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
 250  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
 251  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
 252  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
 253  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
 254  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
 255  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
 256  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
 257  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
 258  "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_KCM"      ,
 259  "clock-AF_QIPCRTR", "clock-AF_MAX"
 260};
 261
 262/*
 263 * sk_callback_lock locking rules are per-address-family,
 264 * so split the lock classes by using a per-AF key:
 265 */
 266static struct lock_class_key af_callback_keys[AF_MAX];
 267
 268/* Take into consideration the size of the struct sk_buff overhead in the
 269 * determination of these values, since that is non-constant across
 270 * platforms.  This makes socket queueing behavior and performance
 271 * not depend upon such differences.
 272 */
 273#define _SK_MEM_PACKETS         256
 274#define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
 275#define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 276#define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 277
 278/* Run time adjustable parameters. */
 279__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 280EXPORT_SYMBOL(sysctl_wmem_max);
 281__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 282EXPORT_SYMBOL(sysctl_rmem_max);
 283__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 284__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 285
 286/* Maximal space eaten by iovec or ancillary data plus some space */
 287int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 288EXPORT_SYMBOL(sysctl_optmem_max);
 289
 290int sysctl_tstamp_allow_data __read_mostly = 1;
 291
 292struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
 293EXPORT_SYMBOL_GPL(memalloc_socks);
 294
 295/**
 296 * sk_set_memalloc - sets %SOCK_MEMALLOC
 297 * @sk: socket to set it on
 298 *
 299 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 300 * It's the responsibility of the admin to adjust min_free_kbytes
 301 * to meet the requirements
 302 */
 303void sk_set_memalloc(struct sock *sk)
 304{
 305        sock_set_flag(sk, SOCK_MEMALLOC);
 306        sk->sk_allocation |= __GFP_MEMALLOC;
 307        static_key_slow_inc(&memalloc_socks);
 308}
 309EXPORT_SYMBOL_GPL(sk_set_memalloc);
 310
 311void sk_clear_memalloc(struct sock *sk)
 312{
 313        sock_reset_flag(sk, SOCK_MEMALLOC);
 314        sk->sk_allocation &= ~__GFP_MEMALLOC;
 315        static_key_slow_dec(&memalloc_socks);
 316
 317        /*
 318         * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 319         * progress of swapping. SOCK_MEMALLOC may be cleared while
 320         * it has rmem allocations due to the last swapfile being deactivated
 321         * but there is a risk that the socket is unusable due to exceeding
 322         * the rmem limits. Reclaim the reserves and obey rmem limits again.
 323         */
 324        sk_mem_reclaim(sk);
 325}
 326EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 327
 328int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 329{
 330        int ret;
 331        unsigned long pflags = current->flags;
 332
 333        /* these should have been dropped before queueing */
 334        BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 335
 336        current->flags |= PF_MEMALLOC;
 337        ret = sk->sk_backlog_rcv(sk, skb);
 338        tsk_restore_flags(current, pflags, PF_MEMALLOC);
 339
 340        return ret;
 341}
 342EXPORT_SYMBOL(__sk_backlog_rcv);
 343
 344static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 345{
 346        struct timeval tv;
 347
 348        if (optlen < sizeof(tv))
 349                return -EINVAL;
 350        if (copy_from_user(&tv, optval, sizeof(tv)))
 351                return -EFAULT;
 352        if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 353                return -EDOM;
 354
 355        if (tv.tv_sec < 0) {
 356                static int warned __read_mostly;
 357
 358                *timeo_p = 0;
 359                if (warned < 10 && net_ratelimit()) {
 360                        warned++;
 361                        pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 362                                __func__, current->comm, task_pid_nr(current));
 363                }
 364                return 0;
 365        }
 366        *timeo_p = MAX_SCHEDULE_TIMEOUT;
 367        if (tv.tv_sec == 0 && tv.tv_usec == 0)
 368                return 0;
 369        if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 370                *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
 371        return 0;
 372}
 373
 374static void sock_warn_obsolete_bsdism(const char *name)
 375{
 376        static int warned;
 377        static char warncomm[TASK_COMM_LEN];
 378        if (strcmp(warncomm, current->comm) && warned < 5) {
 379                strcpy(warncomm,  current->comm);
 380                pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 381                        warncomm, name);
 382                warned++;
 383        }
 384}
 385
 386static bool sock_needs_netstamp(const struct sock *sk)
 387{
 388        switch (sk->sk_family) {
 389        case AF_UNSPEC:
 390        case AF_UNIX:
 391                return false;
 392        default:
 393                return true;
 394        }
 395}
 396
 397static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 398{
 399        if (sk->sk_flags & flags) {
 400                sk->sk_flags &= ~flags;
 401                if (sock_needs_netstamp(sk) &&
 402                    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 403                        net_disable_timestamp();
 404        }
 405}
 406
 407
 408int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 409{
 410        unsigned long flags;
 411        struct sk_buff_head *list = &sk->sk_receive_queue;
 412
 413        if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 414                atomic_inc(&sk->sk_drops);
 415                trace_sock_rcvqueue_full(sk, skb);
 416                return -ENOMEM;
 417        }
 418
 419        if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 420                atomic_inc(&sk->sk_drops);
 421                return -ENOBUFS;
 422        }
 423
 424        skb->dev = NULL;
 425        skb_set_owner_r(skb, sk);
 426
 427        /* we escape from rcu protected region, make sure we dont leak
 428         * a norefcounted dst
 429         */
 430        skb_dst_force(skb);
 431
 432        spin_lock_irqsave(&list->lock, flags);
 433        sock_skb_set_dropcount(sk, skb);
 434        __skb_queue_tail(list, skb);
 435        spin_unlock_irqrestore(&list->lock, flags);
 436
 437        if (!sock_flag(sk, SOCK_DEAD))
 438                sk->sk_data_ready(sk);
 439        return 0;
 440}
 441EXPORT_SYMBOL(__sock_queue_rcv_skb);
 442
 443int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 444{
 445        int err;
 446
 447        err = sk_filter(sk, skb);
 448        if (err)
 449                return err;
 450
 451        return __sock_queue_rcv_skb(sk, skb);
 452}
 453EXPORT_SYMBOL(sock_queue_rcv_skb);
 454
 455int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 456                     const int nested, unsigned int trim_cap, bool refcounted)
 457{
 458        int rc = NET_RX_SUCCESS;
 459
 460        if (sk_filter_trim_cap(sk, skb, trim_cap))
 461                goto discard_and_relse;
 462
 463        skb->dev = NULL;
 464
 465        if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 466                atomic_inc(&sk->sk_drops);
 467                goto discard_and_relse;
 468        }
 469        if (nested)
 470                bh_lock_sock_nested(sk);
 471        else
 472                bh_lock_sock(sk);
 473        if (!sock_owned_by_user(sk)) {
 474                /*
 475                 * trylock + unlock semantics:
 476                 */
 477                mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 478
 479                rc = sk_backlog_rcv(sk, skb);
 480
 481                mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 482        } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 483                bh_unlock_sock(sk);
 484                atomic_inc(&sk->sk_drops);
 485                goto discard_and_relse;
 486        }
 487
 488        bh_unlock_sock(sk);
 489out:
 490        if (refcounted)
 491                sock_put(sk);
 492        return rc;
 493discard_and_relse:
 494        kfree_skb(skb);
 495        goto out;
 496}
 497EXPORT_SYMBOL(__sk_receive_skb);
 498
 499struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 500{
 501        struct dst_entry *dst = __sk_dst_get(sk);
 502
 503        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 504                sk_tx_queue_clear(sk);
 505                RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 506                dst_release(dst);
 507                return NULL;
 508        }
 509
 510        return dst;
 511}
 512EXPORT_SYMBOL(__sk_dst_check);
 513
 514struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 515{
 516        struct dst_entry *dst = sk_dst_get(sk);
 517
 518        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 519                sk_dst_reset(sk);
 520                dst_release(dst);
 521                return NULL;
 522        }
 523
 524        return dst;
 525}
 526EXPORT_SYMBOL(sk_dst_check);
 527
 528static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 529                                int optlen)
 530{
 531        int ret = -ENOPROTOOPT;
 532#ifdef CONFIG_NETDEVICES
 533        struct net *net = sock_net(sk);
 534        char devname[IFNAMSIZ];
 535        int index;
 536
 537        /* Sorry... */
 538        ret = -EPERM;
 539        if (!ns_capable(net->user_ns, CAP_NET_RAW))
 540                goto out;
 541
 542        ret = -EINVAL;
 543        if (optlen < 0)
 544                goto out;
 545
 546        /* Bind this socket to a particular device like "eth0",
 547         * as specified in the passed interface name. If the
 548         * name is "" or the option length is zero the socket
 549         * is not bound.
 550         */
 551        if (optlen > IFNAMSIZ - 1)
 552                optlen = IFNAMSIZ - 1;
 553        memset(devname, 0, sizeof(devname));
 554
 555        ret = -EFAULT;
 556        if (copy_from_user(devname, optval, optlen))
 557                goto out;
 558
 559        index = 0;
 560        if (devname[0] != '\0') {
 561                struct net_device *dev;
 562
 563                rcu_read_lock();
 564                dev = dev_get_by_name_rcu(net, devname);
 565                if (dev)
 566                        index = dev->ifindex;
 567                rcu_read_unlock();
 568                ret = -ENODEV;
 569                if (!dev)
 570                        goto out;
 571        }
 572
 573        lock_sock(sk);
 574        sk->sk_bound_dev_if = index;
 575        sk_dst_reset(sk);
 576        release_sock(sk);
 577
 578        ret = 0;
 579
 580out:
 581#endif
 582
 583        return ret;
 584}
 585
 586static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 587                                int __user *optlen, int len)
 588{
 589        int ret = -ENOPROTOOPT;
 590#ifdef CONFIG_NETDEVICES
 591        struct net *net = sock_net(sk);
 592        char devname[IFNAMSIZ];
 593
 594        if (sk->sk_bound_dev_if == 0) {
 595                len = 0;
 596                goto zero;
 597        }
 598
 599        ret = -EINVAL;
 600        if (len < IFNAMSIZ)
 601                goto out;
 602
 603        ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 604        if (ret)
 605                goto out;
 606
 607        len = strlen(devname) + 1;
 608
 609        ret = -EFAULT;
 610        if (copy_to_user(optval, devname, len))
 611                goto out;
 612
 613zero:
 614        ret = -EFAULT;
 615        if (put_user(len, optlen))
 616                goto out;
 617
 618        ret = 0;
 619
 620out:
 621#endif
 622
 623        return ret;
 624}
 625
 626static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 627{
 628        if (valbool)
 629                sock_set_flag(sk, bit);
 630        else
 631                sock_reset_flag(sk, bit);
 632}
 633
 634bool sk_mc_loop(struct sock *sk)
 635{
 636        if (dev_recursion_level())
 637                return false;
 638        if (!sk)
 639                return true;
 640        switch (sk->sk_family) {
 641        case AF_INET:
 642                return inet_sk(sk)->mc_loop;
 643#if IS_ENABLED(CONFIG_IPV6)
 644        case AF_INET6:
 645                return inet6_sk(sk)->mc_loop;
 646#endif
 647        }
 648        WARN_ON(1);
 649        return true;
 650}
 651EXPORT_SYMBOL(sk_mc_loop);
 652
 653/*
 654 *      This is meant for all protocols to use and covers goings on
 655 *      at the socket level. Everything here is generic.
 656 */
 657
 658int sock_setsockopt(struct socket *sock, int level, int optname,
 659                    char __user *optval, unsigned int optlen)
 660{
 661        struct sock *sk = sock->sk;
 662        int val;
 663        int valbool;
 664        struct linger ling;
 665        int ret = 0;
 666
 667        /*
 668         *      Options without arguments
 669         */
 670
 671        if (optname == SO_BINDTODEVICE)
 672                return sock_setbindtodevice(sk, optval, optlen);
 673
 674        if (optlen < sizeof(int))
 675                return -EINVAL;
 676
 677        if (get_user(val, (int __user *)optval))
 678                return -EFAULT;
 679
 680        valbool = val ? 1 : 0;
 681
 682        lock_sock(sk);
 683
 684        switch (optname) {
 685        case SO_DEBUG:
 686                if (val && !capable(CAP_NET_ADMIN))
 687                        ret = -EACCES;
 688                else
 689                        sock_valbool_flag(sk, SOCK_DBG, valbool);
 690                break;
 691        case SO_REUSEADDR:
 692                sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 693                break;
 694        case SO_REUSEPORT:
 695                sk->sk_reuseport = valbool;
 696                break;
 697        case SO_TYPE:
 698        case SO_PROTOCOL:
 699        case SO_DOMAIN:
 700        case SO_ERROR:
 701                ret = -ENOPROTOOPT;
 702                break;
 703        case SO_DONTROUTE:
 704                sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 705                break;
 706        case SO_BROADCAST:
 707                sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 708                break;
 709        case SO_SNDBUF:
 710                /* Don't error on this BSD doesn't and if you think
 711                 * about it this is right. Otherwise apps have to
 712                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
 713                 * are treated in BSD as hints
 714                 */
 715                val = min_t(u32, val, sysctl_wmem_max);
 716set_sndbuf:
 717                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 718                sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
 719                /* Wake up sending tasks if we upped the value. */
 720                sk->sk_write_space(sk);
 721                break;
 722
 723        case SO_SNDBUFFORCE:
 724                if (!capable(CAP_NET_ADMIN)) {
 725                        ret = -EPERM;
 726                        break;
 727                }
 728                goto set_sndbuf;
 729
 730        case SO_RCVBUF:
 731                /* Don't error on this BSD doesn't and if you think
 732                 * about it this is right. Otherwise apps have to
 733                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
 734                 * are treated in BSD as hints
 735                 */
 736                val = min_t(u32, val, sysctl_rmem_max);
 737set_rcvbuf:
 738                sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 739                /*
 740                 * We double it on the way in to account for
 741                 * "struct sk_buff" etc. overhead.   Applications
 742                 * assume that the SO_RCVBUF setting they make will
 743                 * allow that much actual data to be received on that
 744                 * socket.
 745                 *
 746                 * Applications are unaware that "struct sk_buff" and
 747                 * other overheads allocate from the receive buffer
 748                 * during socket buffer allocation.
 749                 *
 750                 * And after considering the possible alternatives,
 751                 * returning the value we actually used in getsockopt
 752                 * is the most desirable behavior.
 753                 */
 754                sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
 755                break;
 756
 757        case SO_RCVBUFFORCE:
 758                if (!capable(CAP_NET_ADMIN)) {
 759                        ret = -EPERM;
 760                        break;
 761                }
 762                goto set_rcvbuf;
 763
 764        case SO_KEEPALIVE:
 765#ifdef CONFIG_INET
 766                if (sk->sk_protocol == IPPROTO_TCP &&
 767                    sk->sk_type == SOCK_STREAM)
 768                        tcp_set_keepalive(sk, valbool);
 769#endif
 770                sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 771                break;
 772
 773        case SO_OOBINLINE:
 774                sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 775                break;
 776
 777        case SO_NO_CHECK:
 778                sk->sk_no_check_tx = valbool;
 779                break;
 780
 781        case SO_PRIORITY:
 782                if ((val >= 0 && val <= 6) ||
 783                    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 784                        sk->sk_priority = val;
 785                else
 786                        ret = -EPERM;
 787                break;
 788
 789        case SO_LINGER:
 790                if (optlen < sizeof(ling)) {
 791                        ret = -EINVAL;  /* 1003.1g */
 792                        break;
 793                }
 794                if (copy_from_user(&ling, optval, sizeof(ling))) {
 795                        ret = -EFAULT;
 796                        break;
 797                }
 798                if (!ling.l_onoff)
 799                        sock_reset_flag(sk, SOCK_LINGER);
 800                else {
 801#if (BITS_PER_LONG == 32)
 802                        if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 803                                sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 804                        else
 805#endif
 806                                sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 807                        sock_set_flag(sk, SOCK_LINGER);
 808                }
 809                break;
 810
 811        case SO_BSDCOMPAT:
 812                sock_warn_obsolete_bsdism("setsockopt");
 813                break;
 814
 815        case SO_PASSCRED:
 816                if (valbool)
 817                        set_bit(SOCK_PASSCRED, &sock->flags);
 818                else
 819                        clear_bit(SOCK_PASSCRED, &sock->flags);
 820                break;
 821
 822        case SO_TIMESTAMP:
 823        case SO_TIMESTAMPNS:
 824                if (valbool)  {
 825                        if (optname == SO_TIMESTAMP)
 826                                sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 827                        else
 828                                sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 829                        sock_set_flag(sk, SOCK_RCVTSTAMP);
 830                        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 831                } else {
 832                        sock_reset_flag(sk, SOCK_RCVTSTAMP);
 833                        sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 834                }
 835                break;
 836
 837        case SO_TIMESTAMPING:
 838                if (val & ~SOF_TIMESTAMPING_MASK) {
 839                        ret = -EINVAL;
 840                        break;
 841                }
 842
 843                if (val & SOF_TIMESTAMPING_OPT_ID &&
 844                    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 845                        if (sk->sk_protocol == IPPROTO_TCP &&
 846                            sk->sk_type == SOCK_STREAM) {
 847                                if ((1 << sk->sk_state) &
 848                                    (TCPF_CLOSE | TCPF_LISTEN)) {
 849                                        ret = -EINVAL;
 850                                        break;
 851                                }
 852                                sk->sk_tskey = tcp_sk(sk)->snd_una;
 853                        } else {
 854                                sk->sk_tskey = 0;
 855                        }
 856                }
 857                sk->sk_tsflags = val;
 858                if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 859                        sock_enable_timestamp(sk,
 860                                              SOCK_TIMESTAMPING_RX_SOFTWARE);
 861                else
 862                        sock_disable_timestamp(sk,
 863                                               (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 864                break;
 865
 866        case SO_RCVLOWAT:
 867                if (val < 0)
 868                        val = INT_MAX;
 869                sk->sk_rcvlowat = val ? : 1;
 870                break;
 871
 872        case SO_RCVTIMEO:
 873                ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 874                break;
 875
 876        case SO_SNDTIMEO:
 877                ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 878                break;
 879
 880        case SO_ATTACH_FILTER:
 881                ret = -EINVAL;
 882                if (optlen == sizeof(struct sock_fprog)) {
 883                        struct sock_fprog fprog;
 884
 885                        ret = -EFAULT;
 886                        if (copy_from_user(&fprog, optval, sizeof(fprog)))
 887                                break;
 888
 889                        ret = sk_attach_filter(&fprog, sk);
 890                }
 891                break;
 892
 893        case SO_ATTACH_BPF:
 894                ret = -EINVAL;
 895                if (optlen == sizeof(u32)) {
 896                        u32 ufd;
 897
 898                        ret = -EFAULT;
 899                        if (copy_from_user(&ufd, optval, sizeof(ufd)))
 900                                break;
 901
 902                        ret = sk_attach_bpf(ufd, sk);
 903                }
 904                break;
 905
 906        case SO_ATTACH_REUSEPORT_CBPF:
 907                ret = -EINVAL;
 908                if (optlen == sizeof(struct sock_fprog)) {
 909                        struct sock_fprog fprog;
 910
 911                        ret = -EFAULT;
 912                        if (copy_from_user(&fprog, optval, sizeof(fprog)))
 913                                break;
 914
 915                        ret = sk_reuseport_attach_filter(&fprog, sk);
 916                }
 917                break;
 918
 919        case SO_ATTACH_REUSEPORT_EBPF:
 920                ret = -EINVAL;
 921                if (optlen == sizeof(u32)) {
 922                        u32 ufd;
 923
 924                        ret = -EFAULT;
 925                        if (copy_from_user(&ufd, optval, sizeof(ufd)))
 926                                break;
 927
 928                        ret = sk_reuseport_attach_bpf(ufd, sk);
 929                }
 930                break;
 931
 932        case SO_DETACH_FILTER:
 933                ret = sk_detach_filter(sk);
 934                break;
 935
 936        case SO_LOCK_FILTER:
 937                if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
 938                        ret = -EPERM;
 939                else
 940                        sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
 941                break;
 942
 943        case SO_PASSSEC:
 944                if (valbool)
 945                        set_bit(SOCK_PASSSEC, &sock->flags);
 946                else
 947                        clear_bit(SOCK_PASSSEC, &sock->flags);
 948                break;
 949        case SO_MARK:
 950                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 951                        ret = -EPERM;
 952                else
 953                        sk->sk_mark = val;
 954                break;
 955
 956        case SO_RXQ_OVFL:
 957                sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
 958                break;
 959
 960        case SO_WIFI_STATUS:
 961                sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
 962                break;
 963
 964        case SO_PEEK_OFF:
 965                if (sock->ops->set_peek_off)
 966                        ret = sock->ops->set_peek_off(sk, val);
 967                else
 968                        ret = -EOPNOTSUPP;
 969                break;
 970
 971        case SO_NOFCS:
 972                sock_valbool_flag(sk, SOCK_NOFCS, valbool);
 973                break;
 974
 975        case SO_SELECT_ERR_QUEUE:
 976                sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
 977                break;
 978
 979#ifdef CONFIG_NET_RX_BUSY_POLL
 980        case SO_BUSY_POLL:
 981                /* allow unprivileged users to decrease the value */
 982                if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
 983                        ret = -EPERM;
 984                else {
 985                        if (val < 0)
 986                                ret = -EINVAL;
 987                        else
 988                                sk->sk_ll_usec = val;
 989                }
 990                break;
 991#endif
 992
 993        case SO_MAX_PACING_RATE:
 994                sk->sk_max_pacing_rate = val;
 995                sk->sk_pacing_rate = min(sk->sk_pacing_rate,
 996                                         sk->sk_max_pacing_rate);
 997                break;
 998
 999        case SO_INCOMING_CPU:
1000                sk->sk_incoming_cpu = val;
1001                break;
1002
1003        case SO_CNX_ADVICE:
1004                if (val == 1)
1005                        dst_negative_advice(sk);
1006                break;
1007        default:
1008                ret = -ENOPROTOOPT;
1009                break;
1010        }
1011        release_sock(sk);
1012        return ret;
1013}
1014EXPORT_SYMBOL(sock_setsockopt);
1015
1016
1017static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1018                          struct ucred *ucred)
1019{
1020        ucred->pid = pid_vnr(pid);
1021        ucred->uid = ucred->gid = -1;
1022        if (cred) {
1023                struct user_namespace *current_ns = current_user_ns();
1024
1025                ucred->uid = from_kuid_munged(current_ns, cred->euid);
1026                ucred->gid = from_kgid_munged(current_ns, cred->egid);
1027        }
1028}
1029
1030int sock_getsockopt(struct socket *sock, int level, int optname,
1031                    char __user *optval, int __user *optlen)
1032{
1033        struct sock *sk = sock->sk;
1034
1035        union {
1036                int val;
1037                struct linger ling;
1038                struct timeval tm;
1039        } v;
1040
1041        int lv = sizeof(int);
1042        int len;
1043
1044        if (get_user(len, optlen))
1045                return -EFAULT;
1046        if (len < 0)
1047                return -EINVAL;
1048
1049        memset(&v, 0, sizeof(v));
1050
1051        switch (optname) {
1052        case SO_DEBUG:
1053                v.val = sock_flag(sk, SOCK_DBG);
1054                break;
1055
1056        case SO_DONTROUTE:
1057                v.val = sock_flag(sk, SOCK_LOCALROUTE);
1058                break;
1059
1060        case SO_BROADCAST:
1061                v.val = sock_flag(sk, SOCK_BROADCAST);
1062                break;
1063
1064        case SO_SNDBUF:
1065                v.val = sk->sk_sndbuf;
1066                break;
1067
1068        case SO_RCVBUF:
1069                v.val = sk->sk_rcvbuf;
1070                break;
1071
1072        case SO_REUSEADDR:
1073                v.val = sk->sk_reuse;
1074                break;
1075
1076        case SO_REUSEPORT:
1077                v.val = sk->sk_reuseport;
1078                break;
1079
1080        case SO_KEEPALIVE:
1081                v.val = sock_flag(sk, SOCK_KEEPOPEN);
1082                break;
1083
1084        case SO_TYPE:
1085                v.val = sk->sk_type;
1086                break;
1087
1088        case SO_PROTOCOL:
1089                v.val = sk->sk_protocol;
1090                break;
1091
1092        case SO_DOMAIN:
1093                v.val = sk->sk_family;
1094                break;
1095
1096        case SO_ERROR:
1097                v.val = -sock_error(sk);
1098                if (v.val == 0)
1099                        v.val = xchg(&sk->sk_err_soft, 0);
1100                break;
1101
1102        case SO_OOBINLINE:
1103                v.val = sock_flag(sk, SOCK_URGINLINE);
1104                break;
1105
1106        case SO_NO_CHECK:
1107                v.val = sk->sk_no_check_tx;
1108                break;
1109
1110        case SO_PRIORITY:
1111                v.val = sk->sk_priority;
1112                break;
1113
1114        case SO_LINGER:
1115                lv              = sizeof(v.ling);
1116                v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1117                v.ling.l_linger = sk->sk_lingertime / HZ;
1118                break;
1119
1120        case SO_BSDCOMPAT:
1121                sock_warn_obsolete_bsdism("getsockopt");
1122                break;
1123
1124        case SO_TIMESTAMP:
1125                v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1126                                !sock_flag(sk, SOCK_RCVTSTAMPNS);
1127                break;
1128
1129        case SO_TIMESTAMPNS:
1130                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1131                break;
1132
1133        case SO_TIMESTAMPING:
1134                v.val = sk->sk_tsflags;
1135                break;
1136
1137        case SO_RCVTIMEO:
1138                lv = sizeof(struct timeval);
1139                if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1140                        v.tm.tv_sec = 0;
1141                        v.tm.tv_usec = 0;
1142                } else {
1143                        v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1144                        v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1145                }
1146                break;
1147
1148        case SO_SNDTIMEO:
1149                lv = sizeof(struct timeval);
1150                if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1151                        v.tm.tv_sec = 0;
1152                        v.tm.tv_usec = 0;
1153                } else {
1154                        v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1155                        v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1156                }
1157                break;
1158
1159        case SO_RCVLOWAT:
1160                v.val = sk->sk_rcvlowat;
1161                break;
1162
1163        case SO_SNDLOWAT:
1164                v.val = 1;
1165                break;
1166
1167        case SO_PASSCRED:
1168                v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1169                break;
1170
1171        case SO_PEERCRED:
1172        {
1173                struct ucred peercred;
1174                if (len > sizeof(peercred))
1175                        len = sizeof(peercred);
1176                cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1177                if (copy_to_user(optval, &peercred, len))
1178                        return -EFAULT;
1179                goto lenout;
1180        }
1181
1182        case SO_PEERNAME:
1183        {
1184                char address[128];
1185
1186                if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1187                        return -ENOTCONN;
1188                if (lv < len)
1189                        return -EINVAL;
1190                if (copy_to_user(optval, address, len))
1191                        return -EFAULT;
1192                goto lenout;
1193        }
1194
1195        /* Dubious BSD thing... Probably nobody even uses it, but
1196         * the UNIX standard wants it for whatever reason... -DaveM
1197         */
1198        case SO_ACCEPTCONN:
1199                v.val = sk->sk_state == TCP_LISTEN;
1200                break;
1201
1202        case SO_PASSSEC:
1203                v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1204                break;
1205
1206        case SO_PEERSEC:
1207                return security_socket_getpeersec_stream(sock, optval, optlen, len);
1208
1209        case SO_MARK:
1210                v.val = sk->sk_mark;
1211                break;
1212
1213        case SO_RXQ_OVFL:
1214                v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1215                break;
1216
1217        case SO_WIFI_STATUS:
1218                v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1219                break;
1220
1221        case SO_PEEK_OFF:
1222                if (!sock->ops->set_peek_off)
1223                        return -EOPNOTSUPP;
1224
1225                v.val = sk->sk_peek_off;
1226                break;
1227        case SO_NOFCS:
1228                v.val = sock_flag(sk, SOCK_NOFCS);
1229                break;
1230
1231        case SO_BINDTODEVICE:
1232                return sock_getbindtodevice(sk, optval, optlen, len);
1233
1234        case SO_GET_FILTER:
1235                len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1236                if (len < 0)
1237                        return len;
1238
1239                goto lenout;
1240
1241        case SO_LOCK_FILTER:
1242                v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1243                break;
1244
1245        case SO_BPF_EXTENSIONS:
1246                v.val = bpf_tell_extensions();
1247                break;
1248
1249        case SO_SELECT_ERR_QUEUE:
1250                v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1251                break;
1252
1253#ifdef CONFIG_NET_RX_BUSY_POLL
1254        case SO_BUSY_POLL:
1255                v.val = sk->sk_ll_usec;
1256                break;
1257#endif
1258
1259        case SO_MAX_PACING_RATE:
1260                v.val = sk->sk_max_pacing_rate;
1261                break;
1262
1263        case SO_INCOMING_CPU:
1264                v.val = sk->sk_incoming_cpu;
1265                break;
1266
1267        default:
1268                /* We implement the SO_SNDLOWAT etc to not be settable
1269                 * (1003.1g 7).
1270                 */
1271                return -ENOPROTOOPT;
1272        }
1273
1274        if (len > lv)
1275                len = lv;
1276        if (copy_to_user(optval, &v, len))
1277                return -EFAULT;
1278lenout:
1279        if (put_user(len, optlen))
1280                return -EFAULT;
1281        return 0;
1282}
1283
1284/*
1285 * Initialize an sk_lock.
1286 *
1287 * (We also register the sk_lock with the lock validator.)
1288 */
1289static inline void sock_lock_init(struct sock *sk)
1290{
1291        sock_lock_init_class_and_name(sk,
1292                        af_family_slock_key_strings[sk->sk_family],
1293                        af_family_slock_keys + sk->sk_family,
1294                        af_family_key_strings[sk->sk_family],
1295                        af_family_keys + sk->sk_family);
1296}
1297
1298/*
1299 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1300 * even temporarly, because of RCU lookups. sk_node should also be left as is.
1301 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1302 */
1303static void sock_copy(struct sock *nsk, const struct sock *osk)
1304{
1305#ifdef CONFIG_SECURITY_NETWORK
1306        void *sptr = nsk->sk_security;
1307#endif
1308        memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1309
1310        memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1311               osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1312
1313#ifdef CONFIG_SECURITY_NETWORK
1314        nsk->sk_security = sptr;
1315        security_sk_clone(osk, nsk);
1316#endif
1317}
1318
1319static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1320                int family)
1321{
1322        struct sock *sk;
1323        struct kmem_cache *slab;
1324
1325        slab = prot->slab;
1326        if (slab != NULL) {
1327                sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1328                if (!sk)
1329                        return sk;
1330                if (priority & __GFP_ZERO)
1331                        sk_prot_clear_nulls(sk, prot->obj_size);
1332        } else
1333                sk = kmalloc(prot->obj_size, priority);
1334
1335        if (sk != NULL) {
1336                kmemcheck_annotate_bitfield(sk, flags);
1337
1338                if (security_sk_alloc(sk, family, priority))
1339                        goto out_free;
1340
1341                if (!try_module_get(prot->owner))
1342                        goto out_free_sec;
1343                sk_tx_queue_clear(sk);
1344        }
1345
1346        return sk;
1347
1348out_free_sec:
1349        security_sk_free(sk);
1350out_free:
1351        if (slab != NULL)
1352                kmem_cache_free(slab, sk);
1353        else
1354                kfree(sk);
1355        return NULL;
1356}
1357
1358static void sk_prot_free(struct proto *prot, struct sock *sk)
1359{
1360        struct kmem_cache *slab;
1361        struct module *owner;
1362
1363        owner = prot->owner;
1364        slab = prot->slab;
1365
1366        cgroup_sk_free(&sk->sk_cgrp_data);
1367        mem_cgroup_sk_free(sk);
1368        security_sk_free(sk);
1369        if (slab != NULL)
1370                kmem_cache_free(slab, sk);
1371        else
1372                kfree(sk);
1373        module_put(owner);
1374}
1375
1376/**
1377 *      sk_alloc - All socket objects are allocated here
1378 *      @net: the applicable net namespace
1379 *      @family: protocol family
1380 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1381 *      @prot: struct proto associated with this new sock instance
1382 *      @kern: is this to be a kernel socket?
1383 */
1384struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1385                      struct proto *prot, int kern)
1386{
1387        struct sock *sk;
1388
1389        sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1390        if (sk) {
1391                sk->sk_family = family;
1392                /*
1393                 * See comment in struct sock definition to understand
1394                 * why we need sk_prot_creator -acme
1395                 */
1396                sk->sk_prot = sk->sk_prot_creator = prot;
1397                sock_lock_init(sk);
1398                sk->sk_net_refcnt = kern ? 0 : 1;
1399                if (likely(sk->sk_net_refcnt))
1400                        get_net(net);
1401                sock_net_set(sk, net);
1402                atomic_set(&sk->sk_wmem_alloc, 1);
1403
1404                mem_cgroup_sk_alloc(sk);
1405                cgroup_sk_alloc(&sk->sk_cgrp_data);
1406                sock_update_classid(&sk->sk_cgrp_data);
1407                sock_update_netprioidx(&sk->sk_cgrp_data);
1408        }
1409
1410        return sk;
1411}
1412EXPORT_SYMBOL(sk_alloc);
1413
1414/* Sockets having SOCK_RCU_FREE will call this function after one RCU
1415 * grace period. This is the case for UDP sockets and TCP listeners.
1416 */
1417static void __sk_destruct(struct rcu_head *head)
1418{
1419        struct sock *sk = container_of(head, struct sock, sk_rcu);
1420        struct sk_filter *filter;
1421
1422        if (sk->sk_destruct)
1423                sk->sk_destruct(sk);
1424
1425        filter = rcu_dereference_check(sk->sk_filter,
1426                                       atomic_read(&sk->sk_wmem_alloc) == 0);
1427        if (filter) {
1428                sk_filter_uncharge(sk, filter);
1429                RCU_INIT_POINTER(sk->sk_filter, NULL);
1430        }
1431        if (rcu_access_pointer(sk->sk_reuseport_cb))
1432                reuseport_detach_sock(sk);
1433
1434        sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1435
1436        if (atomic_read(&sk->sk_omem_alloc))
1437                pr_debug("%s: optmem leakage (%d bytes) detected\n",
1438                         __func__, atomic_read(&sk->sk_omem_alloc));
1439
1440        if (sk->sk_peer_cred)
1441                put_cred(sk->sk_peer_cred);
1442        put_pid(sk->sk_peer_pid);
1443        if (likely(sk->sk_net_refcnt))
1444                put_net(sock_net(sk));
1445        sk_prot_free(sk->sk_prot_creator, sk);
1446}
1447
1448void sk_destruct(struct sock *sk)
1449{
1450        if (sock_flag(sk, SOCK_RCU_FREE))
1451                call_rcu(&sk->sk_rcu, __sk_destruct);
1452        else
1453                __sk_destruct(&sk->sk_rcu);
1454}
1455
1456static void __sk_free(struct sock *sk)
1457{
1458        if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
1459                sock_diag_broadcast_destroy(sk);
1460        else
1461                sk_destruct(sk);
1462}
1463
1464void sk_free(struct sock *sk)
1465{
1466        /*
1467         * We subtract one from sk_wmem_alloc and can know if
1468         * some packets are still in some tx queue.
1469         * If not null, sock_wfree() will call __sk_free(sk) later
1470         */
1471        if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1472                __sk_free(sk);
1473}
1474EXPORT_SYMBOL(sk_free);
1475
1476/**
1477 *      sk_clone_lock - clone a socket, and lock its clone
1478 *      @sk: the socket to clone
1479 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1480 *
1481 *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1482 */
1483struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1484{
1485        struct sock *newsk;
1486        bool is_charged = true;
1487
1488        newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1489        if (newsk != NULL) {
1490                struct sk_filter *filter;
1491
1492                sock_copy(newsk, sk);
1493
1494                /* SANITY */
1495                if (likely(newsk->sk_net_refcnt))
1496                        get_net(sock_net(newsk));
1497                sk_node_init(&newsk->sk_node);
1498                sock_lock_init(newsk);
1499                bh_lock_sock(newsk);
1500                newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1501                newsk->sk_backlog.len = 0;
1502
1503                atomic_set(&newsk->sk_rmem_alloc, 0);
1504                /*
1505                 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1506                 */
1507                atomic_set(&newsk->sk_wmem_alloc, 1);
1508                atomic_set(&newsk->sk_omem_alloc, 0);
1509                skb_queue_head_init(&newsk->sk_receive_queue);
1510                skb_queue_head_init(&newsk->sk_write_queue);
1511
1512                rwlock_init(&newsk->sk_callback_lock);
1513                lockdep_set_class_and_name(&newsk->sk_callback_lock,
1514                                af_callback_keys + newsk->sk_family,
1515                                af_family_clock_key_strings[newsk->sk_family]);
1516
1517                newsk->sk_dst_cache     = NULL;
1518                newsk->sk_wmem_queued   = 0;
1519                newsk->sk_forward_alloc = 0;
1520                atomic_set(&newsk->sk_drops, 0);
1521                newsk->sk_send_head     = NULL;
1522                newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1523
1524                sock_reset_flag(newsk, SOCK_DONE);
1525                skb_queue_head_init(&newsk->sk_error_queue);
1526
1527                filter = rcu_dereference_protected(newsk->sk_filter, 1);
1528                if (filter != NULL)
1529                        /* though it's an empty new sock, the charging may fail
1530                         * if sysctl_optmem_max was changed between creation of
1531                         * original socket and cloning
1532                         */
1533                        is_charged = sk_filter_charge(newsk, filter);
1534
1535                if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1536                        /* It is still raw copy of parent, so invalidate
1537                         * destructor and make plain sk_free() */
1538                        newsk->sk_destruct = NULL;
1539                        bh_unlock_sock(newsk);
1540                        sk_free(newsk);
1541                        newsk = NULL;
1542                        goto out;
1543                }
1544                RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1545
1546                newsk->sk_err      = 0;
1547                newsk->sk_err_soft = 0;
1548                newsk->sk_priority = 0;
1549                newsk->sk_incoming_cpu = raw_smp_processor_id();
1550                atomic64_set(&newsk->sk_cookie, 0);
1551
1552                mem_cgroup_sk_alloc(newsk);
1553                cgroup_sk_alloc(&newsk->sk_cgrp_data);
1554
1555                /*
1556                 * Before updating sk_refcnt, we must commit prior changes to memory
1557                 * (Documentation/RCU/rculist_nulls.txt for details)
1558                 */
1559                smp_wmb();
1560                atomic_set(&newsk->sk_refcnt, 2);
1561
1562                /*
1563                 * Increment the counter in the same struct proto as the master
1564                 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1565                 * is the same as sk->sk_prot->socks, as this field was copied
1566                 * with memcpy).
1567                 *
1568                 * This _changes_ the previous behaviour, where
1569                 * tcp_create_openreq_child always was incrementing the
1570                 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1571                 * to be taken into account in all callers. -acme
1572                 */
1573                sk_refcnt_debug_inc(newsk);
1574                sk_set_socket(newsk, NULL);
1575                newsk->sk_wq = NULL;
1576
1577                if (newsk->sk_prot->sockets_allocated)
1578                        sk_sockets_allocated_inc(newsk);
1579
1580                if (sock_needs_netstamp(sk) &&
1581                    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1582                        net_enable_timestamp();
1583        }
1584out:
1585        return newsk;
1586}
1587EXPORT_SYMBOL_GPL(sk_clone_lock);
1588
1589void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1590{
1591        u32 max_segs = 1;
1592
1593        sk_dst_set(sk, dst);
1594        sk->sk_route_caps = dst->dev->features;
1595        if (sk->sk_route_caps & NETIF_F_GSO)
1596                sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1597        sk->sk_route_caps &= ~sk->sk_route_nocaps;
1598        if (sk_can_gso(sk)) {
1599                if (dst->header_len) {
1600                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1601                } else {
1602                        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1603                        sk->sk_gso_max_size = dst->dev->gso_max_size;
1604                        max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1605                }
1606        }
1607        sk->sk_gso_max_segs = max_segs;
1608}
1609EXPORT_SYMBOL_GPL(sk_setup_caps);
1610
1611/*
1612 *      Simple resource managers for sockets.
1613 */
1614
1615
1616/*
1617 * Write buffer destructor automatically called from kfree_skb.
1618 */
1619void sock_wfree(struct sk_buff *skb)
1620{
1621        struct sock *sk = skb->sk;
1622        unsigned int len = skb->truesize;
1623
1624        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1625                /*
1626                 * Keep a reference on sk_wmem_alloc, this will be released
1627                 * after sk_write_space() call
1628                 */
1629                atomic_sub(len - 1, &sk->sk_wmem_alloc);
1630                sk->sk_write_space(sk);
1631                len = 1;
1632        }
1633        /*
1634         * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1635         * could not do because of in-flight packets
1636         */
1637        if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1638                __sk_free(sk);
1639}
1640EXPORT_SYMBOL(sock_wfree);
1641
1642/* This variant of sock_wfree() is used by TCP,
1643 * since it sets SOCK_USE_WRITE_QUEUE.
1644 */
1645void __sock_wfree(struct sk_buff *skb)
1646{
1647        struct sock *sk = skb->sk;
1648
1649        if (atomic_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1650                __sk_free(sk);
1651}
1652
1653void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1654{
1655        skb_orphan(skb);
1656        skb->sk = sk;
1657#ifdef CONFIG_INET
1658        if (unlikely(!sk_fullsock(sk))) {
1659                skb->destructor = sock_edemux;
1660                sock_hold(sk);
1661                return;
1662        }
1663#endif
1664        skb->destructor = sock_wfree;
1665        skb_set_hash_from_sk(skb, sk);
1666        /*
1667         * We used to take a refcount on sk, but following operation
1668         * is enough to guarantee sk_free() wont free this sock until
1669         * all in-flight packets are completed
1670         */
1671        atomic_add(skb->truesize, &sk->sk_wmem_alloc);
1672}
1673EXPORT_SYMBOL(skb_set_owner_w);
1674
1675/* This helper is used by netem, as it can hold packets in its
1676 * delay queue. We want to allow the owner socket to send more
1677 * packets, as if they were already TX completed by a typical driver.
1678 * But we also want to keep skb->sk set because some packet schedulers
1679 * rely on it (sch_fq for example). So we set skb->truesize to a small
1680 * amount (1) and decrease sk_wmem_alloc accordingly.
1681 */
1682void skb_orphan_partial(struct sk_buff *skb)
1683{
1684        /* If this skb is a TCP pure ACK or already went here,
1685         * we have nothing to do. 2 is already a very small truesize.
1686         */
1687        if (skb->truesize <= 2)
1688                return;
1689
1690        /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
1691         * so we do not completely orphan skb, but transfert all
1692         * accounted bytes but one, to avoid unexpected reorders.
1693         */
1694        if (skb->destructor == sock_wfree
1695#ifdef CONFIG_INET
1696            || skb->destructor == tcp_wfree
1697#endif
1698                ) {
1699                atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
1700                skb->truesize = 1;
1701        } else {
1702                skb_orphan(skb);
1703        }
1704}
1705EXPORT_SYMBOL(skb_orphan_partial);
1706
1707/*
1708 * Read buffer destructor automatically called from kfree_skb.
1709 */
1710void sock_rfree(struct sk_buff *skb)
1711{
1712        struct sock *sk = skb->sk;
1713        unsigned int len = skb->truesize;
1714
1715        atomic_sub(len, &sk->sk_rmem_alloc);
1716        sk_mem_uncharge(sk, len);
1717}
1718EXPORT_SYMBOL(sock_rfree);
1719
1720/*
1721 * Buffer destructor for skbs that are not used directly in read or write
1722 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1723 */
1724void sock_efree(struct sk_buff *skb)
1725{
1726        sock_put(skb->sk);
1727}
1728EXPORT_SYMBOL(sock_efree);
1729
1730kuid_t sock_i_uid(struct sock *sk)
1731{
1732        kuid_t uid;
1733
1734        read_lock_bh(&sk->sk_callback_lock);
1735        uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1736        read_unlock_bh(&sk->sk_callback_lock);
1737        return uid;
1738}
1739EXPORT_SYMBOL(sock_i_uid);
1740
1741unsigned long sock_i_ino(struct sock *sk)
1742{
1743        unsigned long ino;
1744
1745        read_lock_bh(&sk->sk_callback_lock);
1746        ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1747        read_unlock_bh(&sk->sk_callback_lock);
1748        return ino;
1749}
1750EXPORT_SYMBOL(sock_i_ino);
1751
1752/*
1753 * Allocate a skb from the socket's send buffer.
1754 */
1755struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1756                             gfp_t priority)
1757{
1758        if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1759                struct sk_buff *skb = alloc_skb(size, priority);
1760                if (skb) {
1761                        skb_set_owner_w(skb, sk);
1762                        return skb;
1763                }
1764        }
1765        return NULL;
1766}
1767EXPORT_SYMBOL(sock_wmalloc);
1768
1769/*
1770 * Allocate a memory block from the socket's option memory buffer.
1771 */
1772void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1773{
1774        if ((unsigned int)size <= sysctl_optmem_max &&
1775            atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1776                void *mem;
1777                /* First do the add, to avoid the race if kmalloc
1778                 * might sleep.
1779                 */
1780                atomic_add(size, &sk->sk_omem_alloc);
1781                mem = kmalloc(size, priority);
1782                if (mem)
1783                        return mem;
1784                atomic_sub(size, &sk->sk_omem_alloc);
1785        }
1786        return NULL;
1787}
1788EXPORT_SYMBOL(sock_kmalloc);
1789
1790/* Free an option memory block. Note, we actually want the inline
1791 * here as this allows gcc to detect the nullify and fold away the
1792 * condition entirely.
1793 */
1794static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1795                                  const bool nullify)
1796{
1797        if (WARN_ON_ONCE(!mem))
1798                return;
1799        if (nullify)
1800                kzfree(mem);
1801        else
1802                kfree(mem);
1803        atomic_sub(size, &sk->sk_omem_alloc);
1804}
1805
1806void sock_kfree_s(struct sock *sk, void *mem, int size)
1807{
1808        __sock_kfree_s(sk, mem, size, false);
1809}
1810EXPORT_SYMBOL(sock_kfree_s);
1811
1812void sock_kzfree_s(struct sock *sk, void *mem, int size)
1813{
1814        __sock_kfree_s(sk, mem, size, true);
1815}
1816EXPORT_SYMBOL(sock_kzfree_s);
1817
1818/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1819   I think, these locks should be removed for datagram sockets.
1820 */
1821static long sock_wait_for_wmem(struct sock *sk, long timeo)
1822{
1823        DEFINE_WAIT(wait);
1824
1825        sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1826        for (;;) {
1827                if (!timeo)
1828                        break;
1829                if (signal_pending(current))
1830                        break;
1831                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1832                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1833                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1834                        break;
1835                if (sk->sk_shutdown & SEND_SHUTDOWN)
1836                        break;
1837                if (sk->sk_err)
1838                        break;
1839                timeo = schedule_timeout(timeo);
1840        }
1841        finish_wait(sk_sleep(sk), &wait);
1842        return timeo;
1843}
1844
1845
1846/*
1847 *      Generic send/receive buffer handlers
1848 */
1849
1850struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1851                                     unsigned long data_len, int noblock,
1852                                     int *errcode, int max_page_order)
1853{
1854        struct sk_buff *skb;
1855        long timeo;
1856        int err;
1857
1858        timeo = sock_sndtimeo(sk, noblock);
1859        for (;;) {
1860                err = sock_error(sk);
1861                if (err != 0)
1862                        goto failure;
1863
1864                err = -EPIPE;
1865                if (sk->sk_shutdown & SEND_SHUTDOWN)
1866                        goto failure;
1867
1868                if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1869                        break;
1870
1871                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1872                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1873                err = -EAGAIN;
1874                if (!timeo)
1875                        goto failure;
1876                if (signal_pending(current))
1877                        goto interrupted;
1878                timeo = sock_wait_for_wmem(sk, timeo);
1879        }
1880        skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
1881                                   errcode, sk->sk_allocation);
1882        if (skb)
1883                skb_set_owner_w(skb, sk);
1884        return skb;
1885
1886interrupted:
1887        err = sock_intr_errno(timeo);
1888failure:
1889        *errcode = err;
1890        return NULL;
1891}
1892EXPORT_SYMBOL(sock_alloc_send_pskb);
1893
1894struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1895                                    int noblock, int *errcode)
1896{
1897        return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1898}
1899EXPORT_SYMBOL(sock_alloc_send_skb);
1900
1901int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
1902                     struct sockcm_cookie *sockc)
1903{
1904        u32 tsflags;
1905
1906        switch (cmsg->cmsg_type) {
1907        case SO_MARK:
1908                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1909                        return -EPERM;
1910                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1911                        return -EINVAL;
1912                sockc->mark = *(u32 *)CMSG_DATA(cmsg);
1913                break;
1914        case SO_TIMESTAMPING:
1915                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1916                        return -EINVAL;
1917
1918                tsflags = *(u32 *)CMSG_DATA(cmsg);
1919                if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
1920                        return -EINVAL;
1921
1922                sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
1923                sockc->tsflags |= tsflags;
1924                break;
1925        /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
1926        case SCM_RIGHTS:
1927        case SCM_CREDENTIALS:
1928                break;
1929        default:
1930                return -EINVAL;
1931        }
1932        return 0;
1933}
1934EXPORT_SYMBOL(__sock_cmsg_send);
1935
1936int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
1937                   struct sockcm_cookie *sockc)
1938{
1939        struct cmsghdr *cmsg;
1940        int ret;
1941
1942        for_each_cmsghdr(cmsg, msg) {
1943                if (!CMSG_OK(msg, cmsg))
1944                        return -EINVAL;
1945                if (cmsg->cmsg_level != SOL_SOCKET)
1946                        continue;
1947                ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
1948                if (ret)
1949                        return ret;
1950        }
1951        return 0;
1952}
1953EXPORT_SYMBOL(sock_cmsg_send);
1954
1955/* On 32bit arches, an skb frag is limited to 2^15 */
1956#define SKB_FRAG_PAGE_ORDER     get_order(32768)
1957
1958/**
1959 * skb_page_frag_refill - check that a page_frag contains enough room
1960 * @sz: minimum size of the fragment we want to get
1961 * @pfrag: pointer to page_frag
1962 * @gfp: priority for memory allocation
1963 *
1964 * Note: While this allocator tries to use high order pages, there is
1965 * no guarantee that allocations succeed. Therefore, @sz MUST be
1966 * less or equal than PAGE_SIZE.
1967 */
1968bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
1969{
1970        if (pfrag->page) {
1971                if (page_ref_count(pfrag->page) == 1) {
1972                        pfrag->offset = 0;
1973                        return true;
1974                }
1975                if (pfrag->offset + sz <= pfrag->size)
1976                        return true;
1977                put_page(pfrag->page);
1978        }
1979
1980        pfrag->offset = 0;
1981        if (SKB_FRAG_PAGE_ORDER) {
1982                /* Avoid direct reclaim but allow kswapd to wake */
1983                pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
1984                                          __GFP_COMP | __GFP_NOWARN |
1985                                          __GFP_NORETRY,
1986                                          SKB_FRAG_PAGE_ORDER);
1987                if (likely(pfrag->page)) {
1988                        pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
1989                        return true;
1990                }
1991        }
1992        pfrag->page = alloc_page(gfp);
1993        if (likely(pfrag->page)) {
1994                pfrag->size = PAGE_SIZE;
1995                return true;
1996        }
1997        return false;
1998}
1999EXPORT_SYMBOL(skb_page_frag_refill);
2000
2001bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2002{
2003        if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2004                return true;
2005
2006        sk_enter_memory_pressure(sk);
2007        sk_stream_moderate_sndbuf(sk);
2008        return false;
2009}
2010EXPORT_SYMBOL(sk_page_frag_refill);
2011
2012static void __lock_sock(struct sock *sk)
2013        __releases(&sk->sk_lock.slock)
2014        __acquires(&sk->sk_lock.slock)
2015{
2016        DEFINE_WAIT(wait);
2017
2018        for (;;) {
2019                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2020                                        TASK_UNINTERRUPTIBLE);
2021                spin_unlock_bh(&sk->sk_lock.slock);
2022                schedule();
2023                spin_lock_bh(&sk->sk_lock.slock);
2024                if (!sock_owned_by_user(sk))
2025                        break;
2026        }
2027        finish_wait(&sk->sk_lock.wq, &wait);
2028}
2029
2030static void __release_sock(struct sock *sk)
2031        __releases(&sk->sk_lock.slock)
2032        __acquires(&sk->sk_lock.slock)
2033{
2034        struct sk_buff *skb, *next;
2035
2036        while ((skb = sk->sk_backlog.head) != NULL) {
2037                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2038
2039                spin_unlock_bh(&sk->sk_lock.slock);
2040
2041                do {
2042                        next = skb->next;
2043                        prefetch(next);
2044                        WARN_ON_ONCE(skb_dst_is_noref(skb));
2045                        skb->next = NULL;
2046                        sk_backlog_rcv(sk, skb);
2047
2048                        cond_resched();
2049
2050                        skb = next;
2051                } while (skb != NULL);
2052
2053                spin_lock_bh(&sk->sk_lock.slock);
2054        }
2055
2056        /*
2057         * Doing the zeroing here guarantee we can not loop forever
2058         * while a wild producer attempts to flood us.
2059         */
2060        sk->sk_backlog.len = 0;
2061}
2062
2063void __sk_flush_backlog(struct sock *sk)
2064{
2065        spin_lock_bh(&sk->sk_lock.slock);
2066        __release_sock(sk);
2067        spin_unlock_bh(&sk->sk_lock.slock);
2068}
2069
2070/**
2071 * sk_wait_data - wait for data to arrive at sk_receive_queue
2072 * @sk:    sock to wait on
2073 * @timeo: for how long
2074 * @skb:   last skb seen on sk_receive_queue
2075 *
2076 * Now socket state including sk->sk_err is changed only under lock,
2077 * hence we may omit checks after joining wait queue.
2078 * We check receive queue before schedule() only as optimization;
2079 * it is very likely that release_sock() added new data.
2080 */
2081int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2082{
2083        int rc;
2084        DEFINE_WAIT(wait);
2085
2086        prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2087        sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2088        rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb);
2089        sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2090        finish_wait(sk_sleep(sk), &wait);
2091        return rc;
2092}
2093EXPORT_SYMBOL(sk_wait_data);
2094
2095/**
2096 *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2097 *      @sk: socket
2098 *      @size: memory size to allocate
2099 *      @kind: allocation type
2100 *
2101 *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2102 *      rmem allocation. This function assumes that protocols which have
2103 *      memory_pressure use sk_wmem_queued as write buffer accounting.
2104 */
2105int __sk_mem_schedule(struct sock *sk, int size, int kind)
2106{
2107        struct proto *prot = sk->sk_prot;
2108        int amt = sk_mem_pages(size);
2109        long allocated;
2110
2111        sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
2112
2113        allocated = sk_memory_allocated_add(sk, amt);
2114
2115        if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2116            !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2117                goto suppress_allocation;
2118
2119        /* Under limit. */
2120        if (allocated <= sk_prot_mem_limits(sk, 0)) {
2121                sk_leave_memory_pressure(sk);
2122                return 1;
2123        }
2124
2125        /* Under pressure. */
2126        if (allocated > sk_prot_mem_limits(sk, 1))
2127                sk_enter_memory_pressure(sk);
2128
2129        /* Over hard limit. */
2130        if (allocated > sk_prot_mem_limits(sk, 2))
2131                goto suppress_allocation;
2132
2133        /* guarantee minimum buffer size under pressure */
2134        if (kind == SK_MEM_RECV) {
2135                if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2136                        return 1;
2137
2138        } else { /* SK_MEM_SEND */
2139                if (sk->sk_type == SOCK_STREAM) {
2140                        if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2141                                return 1;
2142                } else if (atomic_read(&sk->sk_wmem_alloc) <
2143                           prot->sysctl_wmem[0])
2144                                return 1;
2145        }
2146
2147        if (sk_has_memory_pressure(sk)) {
2148                int alloc;
2149
2150                if (!sk_under_memory_pressure(sk))
2151                        return 1;
2152                alloc = sk_sockets_allocated_read_positive(sk);
2153                if (sk_prot_mem_limits(sk, 2) > alloc *
2154                    sk_mem_pages(sk->sk_wmem_queued +
2155                                 atomic_read(&sk->sk_rmem_alloc) +
2156                                 sk->sk_forward_alloc))
2157                        return 1;
2158        }
2159
2160suppress_allocation:
2161
2162        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2163                sk_stream_moderate_sndbuf(sk);
2164
2165                /* Fail only if socket is _under_ its sndbuf.
2166                 * In this case we cannot block, so that we have to fail.
2167                 */
2168                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2169                        return 1;
2170        }
2171
2172        trace_sock_exceed_buf_limit(sk, prot, allocated);
2173
2174        /* Alas. Undo changes. */
2175        sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
2176
2177        sk_memory_allocated_sub(sk, amt);
2178
2179        if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2180                mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2181
2182        return 0;
2183}
2184EXPORT_SYMBOL(__sk_mem_schedule);
2185
2186/**
2187 *      __sk_mem_reclaim - reclaim memory_allocated
2188 *      @sk: socket
2189 *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2190 */
2191void __sk_mem_reclaim(struct sock *sk, int amount)
2192{
2193        amount >>= SK_MEM_QUANTUM_SHIFT;
2194        sk_memory_allocated_sub(sk, amount);
2195        sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2196
2197        if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2198                mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2199
2200        if (sk_under_memory_pressure(sk) &&
2201            (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2202                sk_leave_memory_pressure(sk);
2203}
2204EXPORT_SYMBOL(__sk_mem_reclaim);
2205
2206int sk_set_peek_off(struct sock *sk, int val)
2207{
2208        if (val < 0)
2209                return -EINVAL;
2210
2211        sk->sk_peek_off = val;
2212        return 0;
2213}
2214EXPORT_SYMBOL_GPL(sk_set_peek_off);
2215
2216/*
2217 * Set of default routines for initialising struct proto_ops when
2218 * the protocol does not support a particular function. In certain
2219 * cases where it makes no sense for a protocol to have a "do nothing"
2220 * function, some default processing is provided.
2221 */
2222
2223int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2224{
2225        return -EOPNOTSUPP;
2226}
2227EXPORT_SYMBOL(sock_no_bind);
2228
2229int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2230                    int len, int flags)
2231{
2232        return -EOPNOTSUPP;
2233}
2234EXPORT_SYMBOL(sock_no_connect);
2235
2236int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2237{
2238        return -EOPNOTSUPP;
2239}
2240EXPORT_SYMBOL(sock_no_socketpair);
2241
2242int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2243{
2244        return -EOPNOTSUPP;
2245}
2246EXPORT_SYMBOL(sock_no_accept);
2247
2248int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2249                    int *len, int peer)
2250{
2251        return -EOPNOTSUPP;
2252}
2253EXPORT_SYMBOL(sock_no_getname);
2254
2255unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2256{
2257        return 0;
2258}
2259EXPORT_SYMBOL(sock_no_poll);
2260
2261int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2262{
2263        return -EOPNOTSUPP;
2264}
2265EXPORT_SYMBOL(sock_no_ioctl);
2266
2267int sock_no_listen(struct socket *sock, int backlog)
2268{
2269        return -EOPNOTSUPP;
2270}
2271EXPORT_SYMBOL(sock_no_listen);
2272
2273int sock_no_shutdown(struct socket *sock, int how)
2274{
2275        return -EOPNOTSUPP;
2276}
2277EXPORT_SYMBOL(sock_no_shutdown);
2278
2279int sock_no_setsockopt(struct socket *sock, int level, int optname,
2280                    char __user *optval, unsigned int optlen)
2281{
2282        return -EOPNOTSUPP;
2283}
2284EXPORT_SYMBOL(sock_no_setsockopt);
2285
2286int sock_no_getsockopt(struct socket *sock, int level, int optname,
2287                    char __user *optval, int __user *optlen)
2288{
2289        return -EOPNOTSUPP;
2290}
2291EXPORT_SYMBOL(sock_no_getsockopt);
2292
2293int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2294{
2295        return -EOPNOTSUPP;
2296}
2297EXPORT_SYMBOL(sock_no_sendmsg);
2298
2299int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2300                    int flags)
2301{
2302        return -EOPNOTSUPP;
2303}
2304EXPORT_SYMBOL(sock_no_recvmsg);
2305
2306int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2307{
2308        /* Mirror missing mmap method error code */
2309        return -ENODEV;
2310}
2311EXPORT_SYMBOL(sock_no_mmap);
2312
2313ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2314{
2315        ssize_t res;
2316        struct msghdr msg = {.msg_flags = flags};
2317        struct kvec iov;
2318        char *kaddr = kmap(page);
2319        iov.iov_base = kaddr + offset;
2320        iov.iov_len = size;
2321        res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2322        kunmap(page);
2323        return res;
2324}
2325EXPORT_SYMBOL(sock_no_sendpage);
2326
2327/*
2328 *      Default Socket Callbacks
2329 */
2330
2331static void sock_def_wakeup(struct sock *sk)
2332{
2333        struct socket_wq *wq;
2334
2335        rcu_read_lock();
2336        wq = rcu_dereference(sk->sk_wq);
2337        if (skwq_has_sleeper(wq))
2338                wake_up_interruptible_all(&wq->wait);
2339        rcu_read_unlock();
2340}
2341
2342static void sock_def_error_report(struct sock *sk)
2343{
2344        struct socket_wq *wq;
2345
2346        rcu_read_lock();
2347        wq = rcu_dereference(sk->sk_wq);
2348        if (skwq_has_sleeper(wq))
2349                wake_up_interruptible_poll(&wq->wait, POLLERR);
2350        sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2351        rcu_read_unlock();
2352}
2353
2354static void sock_def_readable(struct sock *sk)
2355{
2356        struct socket_wq *wq;
2357
2358        rcu_read_lock();
2359        wq = rcu_dereference(sk->sk_wq);
2360        if (skwq_has_sleeper(wq))
2361                wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2362                                                POLLRDNORM | POLLRDBAND);
2363        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2364        rcu_read_unlock();
2365}
2366
2367static void sock_def_write_space(struct sock *sk)
2368{
2369        struct socket_wq *wq;
2370
2371        rcu_read_lock();
2372
2373        /* Do not wake up a writer until he can make "significant"
2374         * progress.  --DaveM
2375         */
2376        if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2377                wq = rcu_dereference(sk->sk_wq);
2378                if (skwq_has_sleeper(wq))
2379                        wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2380                                                POLLWRNORM | POLLWRBAND);
2381
2382                /* Should agree with poll, otherwise some programs break */
2383                if (sock_writeable(sk))
2384                        sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2385        }
2386
2387        rcu_read_unlock();
2388}
2389
2390static void sock_def_destruct(struct sock *sk)
2391{
2392}
2393
2394void sk_send_sigurg(struct sock *sk)
2395{
2396        if (sk->sk_socket && sk->sk_socket->file)
2397                if (send_sigurg(&sk->sk_socket->file->f_owner))
2398                        sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2399}
2400EXPORT_SYMBOL(sk_send_sigurg);
2401
2402void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2403                    unsigned long expires)
2404{
2405        if (!mod_timer(timer, expires))
2406                sock_hold(sk);
2407}
2408EXPORT_SYMBOL(sk_reset_timer);
2409
2410void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2411{
2412        if (del_timer(timer))
2413                __sock_put(sk);
2414}
2415EXPORT_SYMBOL(sk_stop_timer);
2416
2417void sock_init_data(struct socket *sock, struct sock *sk)
2418{
2419        skb_queue_head_init(&sk->sk_receive_queue);
2420        skb_queue_head_init(&sk->sk_write_queue);
2421        skb_queue_head_init(&sk->sk_error_queue);
2422
2423        sk->sk_send_head        =       NULL;
2424
2425        init_timer(&sk->sk_timer);
2426
2427        sk->sk_allocation       =       GFP_KERNEL;
2428        sk->sk_rcvbuf           =       sysctl_rmem_default;
2429        sk->sk_sndbuf           =       sysctl_wmem_default;
2430        sk->sk_state            =       TCP_CLOSE;
2431        sk_set_socket(sk, sock);
2432
2433        sock_set_flag(sk, SOCK_ZAPPED);
2434
2435        if (sock) {
2436                sk->sk_type     =       sock->type;
2437                sk->sk_wq       =       sock->wq;
2438                sock->sk        =       sk;
2439        } else
2440                sk->sk_wq       =       NULL;
2441
2442        rwlock_init(&sk->sk_callback_lock);
2443        lockdep_set_class_and_name(&sk->sk_callback_lock,
2444                        af_callback_keys + sk->sk_family,
2445                        af_family_clock_key_strings[sk->sk_family]);
2446
2447        sk->sk_state_change     =       sock_def_wakeup;
2448        sk->sk_data_ready       =       sock_def_readable;
2449        sk->sk_write_space      =       sock_def_write_space;
2450        sk->sk_error_report     =       sock_def_error_report;
2451        sk->sk_destruct         =       sock_def_destruct;
2452
2453        sk->sk_frag.page        =       NULL;
2454        sk->sk_frag.offset      =       0;
2455        sk->sk_peek_off         =       -1;
2456
2457        sk->sk_peer_pid         =       NULL;
2458        sk->sk_peer_cred        =       NULL;
2459        sk->sk_write_pending    =       0;
2460        sk->sk_rcvlowat         =       1;
2461        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2462        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2463
2464        sk->sk_stamp = ktime_set(-1L, 0);
2465
2466#ifdef CONFIG_NET_RX_BUSY_POLL
2467        sk->sk_napi_id          =       0;
2468        sk->sk_ll_usec          =       sysctl_net_busy_read;
2469#endif
2470
2471        sk->sk_max_pacing_rate = ~0U;
2472        sk->sk_pacing_rate = ~0U;
2473        sk->sk_incoming_cpu = -1;
2474        /*
2475         * Before updating sk_refcnt, we must commit prior changes to memory
2476         * (Documentation/RCU/rculist_nulls.txt for details)
2477         */
2478        smp_wmb();
2479        atomic_set(&sk->sk_refcnt, 1);
2480        atomic_set(&sk->sk_drops, 0);
2481}
2482EXPORT_SYMBOL(sock_init_data);
2483
2484void lock_sock_nested(struct sock *sk, int subclass)
2485{
2486        might_sleep();
2487        spin_lock_bh(&sk->sk_lock.slock);
2488        if (sk->sk_lock.owned)
2489                __lock_sock(sk);
2490        sk->sk_lock.owned = 1;
2491        spin_unlock(&sk->sk_lock.slock);
2492        /*
2493         * The sk_lock has mutex_lock() semantics here:
2494         */
2495        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2496        local_bh_enable();
2497}
2498EXPORT_SYMBOL(lock_sock_nested);
2499
2500void release_sock(struct sock *sk)
2501{
2502        spin_lock_bh(&sk->sk_lock.slock);
2503        if (sk->sk_backlog.tail)
2504                __release_sock(sk);
2505
2506        /* Warning : release_cb() might need to release sk ownership,
2507         * ie call sock_release_ownership(sk) before us.
2508         */
2509        if (sk->sk_prot->release_cb)
2510                sk->sk_prot->release_cb(sk);
2511
2512        sock_release_ownership(sk);
2513        if (waitqueue_active(&sk->sk_lock.wq))
2514                wake_up(&sk->sk_lock.wq);
2515        spin_unlock_bh(&sk->sk_lock.slock);
2516}
2517EXPORT_SYMBOL(release_sock);
2518
2519/**
2520 * lock_sock_fast - fast version of lock_sock
2521 * @sk: socket
2522 *
2523 * This version should be used for very small section, where process wont block
2524 * return false if fast path is taken
2525 *   sk_lock.slock locked, owned = 0, BH disabled
2526 * return true if slow path is taken
2527 *   sk_lock.slock unlocked, owned = 1, BH enabled
2528 */
2529bool lock_sock_fast(struct sock *sk)
2530{
2531        might_sleep();
2532        spin_lock_bh(&sk->sk_lock.slock);
2533
2534        if (!sk->sk_lock.owned)
2535                /*
2536                 * Note : We must disable BH
2537                 */
2538                return false;
2539
2540        __lock_sock(sk);
2541        sk->sk_lock.owned = 1;
2542        spin_unlock(&sk->sk_lock.slock);
2543        /*
2544         * The sk_lock has mutex_lock() semantics here:
2545         */
2546        mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2547        local_bh_enable();
2548        return true;
2549}
2550EXPORT_SYMBOL(lock_sock_fast);
2551
2552int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2553{
2554        struct timeval tv;
2555        if (!sock_flag(sk, SOCK_TIMESTAMP))
2556                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2557        tv = ktime_to_timeval(sk->sk_stamp);
2558        if (tv.tv_sec == -1)
2559                return -ENOENT;
2560        if (tv.tv_sec == 0) {
2561                sk->sk_stamp = ktime_get_real();
2562                tv = ktime_to_timeval(sk->sk_stamp);
2563        }
2564        return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2565}
2566EXPORT_SYMBOL(sock_get_timestamp);
2567
2568int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2569{
2570        struct timespec ts;
2571        if (!sock_flag(sk, SOCK_TIMESTAMP))
2572                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2573        ts = ktime_to_timespec(sk->sk_stamp);
2574        if (ts.tv_sec == -1)
2575                return -ENOENT;
2576        if (ts.tv_sec == 0) {
2577                sk->sk_stamp = ktime_get_real();
2578                ts = ktime_to_timespec(sk->sk_stamp);
2579        }
2580        return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2581}
2582EXPORT_SYMBOL(sock_get_timestampns);
2583
2584void sock_enable_timestamp(struct sock *sk, int flag)
2585{
2586        if (!sock_flag(sk, flag)) {
2587                unsigned long previous_flags = sk->sk_flags;
2588
2589                sock_set_flag(sk, flag);
2590                /*
2591                 * we just set one of the two flags which require net
2592                 * time stamping, but time stamping might have been on
2593                 * already because of the other one
2594                 */
2595                if (sock_needs_netstamp(sk) &&
2596                    !(previous_flags & SK_FLAGS_TIMESTAMP))
2597                        net_enable_timestamp();
2598        }
2599}
2600
2601int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2602                       int level, int type)
2603{
2604        struct sock_exterr_skb *serr;
2605        struct sk_buff *skb;
2606        int copied, err;
2607
2608        err = -EAGAIN;
2609        skb = sock_dequeue_err_skb(sk);
2610        if (skb == NULL)
2611                goto out;
2612
2613        copied = skb->len;
2614        if (copied > len) {
2615                msg->msg_flags |= MSG_TRUNC;
2616                copied = len;
2617        }
2618        err = skb_copy_datagram_msg(skb, 0, msg, copied);
2619        if (err)
2620                goto out_free_skb;
2621
2622        sock_recv_timestamp(msg, sk, skb);
2623
2624        serr = SKB_EXT_ERR(skb);
2625        put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2626
2627        msg->msg_flags |= MSG_ERRQUEUE;
2628        err = copied;
2629
2630out_free_skb:
2631        kfree_skb(skb);
2632out:
2633        return err;
2634}
2635EXPORT_SYMBOL(sock_recv_errqueue);
2636
2637/*
2638 *      Get a socket option on an socket.
2639 *
2640 *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2641 *      asynchronous errors should be reported by getsockopt. We assume
2642 *      this means if you specify SO_ERROR (otherwise whats the point of it).
2643 */
2644int sock_common_getsockopt(struct socket *sock, int level, int optname,
2645                           char __user *optval, int __user *optlen)
2646{
2647        struct sock *sk = sock->sk;
2648
2649        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2650}
2651EXPORT_SYMBOL(sock_common_getsockopt);
2652
2653#ifdef CONFIG_COMPAT
2654int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2655                                  char __user *optval, int __user *optlen)
2656{
2657        struct sock *sk = sock->sk;
2658
2659        if (sk->sk_prot->compat_getsockopt != NULL)
2660                return sk->sk_prot->compat_getsockopt(sk, level, optname,
2661                                                      optval, optlen);
2662        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2663}
2664EXPORT_SYMBOL(compat_sock_common_getsockopt);
2665#endif
2666
2667int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2668                        int flags)
2669{
2670        struct sock *sk = sock->sk;
2671        int addr_len = 0;
2672        int err;
2673
2674        err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2675                                   flags & ~MSG_DONTWAIT, &addr_len);
2676        if (err >= 0)
2677                msg->msg_namelen = addr_len;
2678        return err;
2679}
2680EXPORT_SYMBOL(sock_common_recvmsg);
2681
2682/*
2683 *      Set socket options on an inet socket.
2684 */
2685int sock_common_setsockopt(struct socket *sock, int level, int optname,
2686                           char __user *optval, unsigned int optlen)
2687{
2688        struct sock *sk = sock->sk;
2689
2690        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2691}
2692EXPORT_SYMBOL(sock_common_setsockopt);
2693
2694#ifdef CONFIG_COMPAT
2695int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2696                                  char __user *optval, unsigned int optlen)
2697{
2698        struct sock *sk = sock->sk;
2699
2700        if (sk->sk_prot->compat_setsockopt != NULL)
2701                return sk->sk_prot->compat_setsockopt(sk, level, optname,
2702                                                      optval, optlen);
2703        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2704}
2705EXPORT_SYMBOL(compat_sock_common_setsockopt);
2706#endif
2707
2708void sk_common_release(struct sock *sk)
2709{
2710        if (sk->sk_prot->destroy)
2711                sk->sk_prot->destroy(sk);
2712
2713        /*
2714         * Observation: when sock_common_release is called, processes have
2715         * no access to socket. But net still has.
2716         * Step one, detach it from networking:
2717         *
2718         * A. Remove from hash tables.
2719         */
2720
2721        sk->sk_prot->unhash(sk);
2722
2723        /*
2724         * In this point socket cannot receive new packets, but it is possible
2725         * that some packets are in flight because some CPU runs receiver and
2726         * did hash table lookup before we unhashed socket. They will achieve
2727         * receive queue and will be purged by socket destructor.
2728         *
2729         * Also we still have packets pending on receive queue and probably,
2730         * our own packets waiting in device queues. sock_destroy will drain
2731         * receive queue, but transmitted packets will delay socket destruction
2732         * until the last reference will be released.
2733         */
2734
2735        sock_orphan(sk);
2736
2737        xfrm_sk_free_policy(sk);
2738
2739        sk_refcnt_debug_release(sk);
2740
2741        if (sk->sk_frag.page) {
2742                put_page(sk->sk_frag.page);
2743                sk->sk_frag.page = NULL;
2744        }
2745
2746        sock_put(sk);
2747}
2748EXPORT_SYMBOL(sk_common_release);
2749
2750#ifdef CONFIG_PROC_FS
2751#define PROTO_INUSE_NR  64      /* should be enough for the first time */
2752struct prot_inuse {
2753        int val[PROTO_INUSE_NR];
2754};
2755
2756static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2757
2758#ifdef CONFIG_NET_NS
2759void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2760{
2761        __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2762}
2763EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2764
2765int sock_prot_inuse_get(struct net *net, struct proto *prot)
2766{
2767        int cpu, idx = prot->inuse_idx;
2768        int res = 0;
2769
2770        for_each_possible_cpu(cpu)
2771                res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2772
2773        return res >= 0 ? res : 0;
2774}
2775EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2776
2777static int __net_init sock_inuse_init_net(struct net *net)
2778{
2779        net->core.inuse = alloc_percpu(struct prot_inuse);
2780        return net->core.inuse ? 0 : -ENOMEM;
2781}
2782
2783static void __net_exit sock_inuse_exit_net(struct net *net)
2784{
2785        free_percpu(net->core.inuse);
2786}
2787
2788static struct pernet_operations net_inuse_ops = {
2789        .init = sock_inuse_init_net,
2790        .exit = sock_inuse_exit_net,
2791};
2792
2793static __init int net_inuse_init(void)
2794{
2795        if (register_pernet_subsys(&net_inuse_ops))
2796                panic("Cannot initialize net inuse counters");
2797
2798        return 0;
2799}
2800
2801core_initcall(net_inuse_init);
2802#else
2803static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2804
2805void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2806{
2807        __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2808}
2809EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2810
2811int sock_prot_inuse_get(struct net *net, struct proto *prot)
2812{
2813        int cpu, idx = prot->inuse_idx;
2814        int res = 0;
2815
2816        for_each_possible_cpu(cpu)
2817                res += per_cpu(prot_inuse, cpu).val[idx];
2818
2819        return res >= 0 ? res : 0;
2820}
2821EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2822#endif
2823
2824static void assign_proto_idx(struct proto *prot)
2825{
2826        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2827
2828        if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2829                pr_err("PROTO_INUSE_NR exhausted\n");
2830                return;
2831        }
2832
2833        set_bit(prot->inuse_idx, proto_inuse_idx);
2834}
2835
2836static void release_proto_idx(struct proto *prot)
2837{
2838        if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2839                clear_bit(prot->inuse_idx, proto_inuse_idx);
2840}
2841#else
2842static inline void assign_proto_idx(struct proto *prot)
2843{
2844}
2845
2846static inline void release_proto_idx(struct proto *prot)
2847{
2848}
2849#endif
2850
2851static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
2852{
2853        if (!rsk_prot)
2854                return;
2855        kfree(rsk_prot->slab_name);
2856        rsk_prot->slab_name = NULL;
2857        kmem_cache_destroy(rsk_prot->slab);
2858        rsk_prot->slab = NULL;
2859}
2860
2861static int req_prot_init(const struct proto *prot)
2862{
2863        struct request_sock_ops *rsk_prot = prot->rsk_prot;
2864
2865        if (!rsk_prot)
2866                return 0;
2867
2868        rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
2869                                        prot->name);
2870        if (!rsk_prot->slab_name)
2871                return -ENOMEM;
2872
2873        rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
2874                                           rsk_prot->obj_size, 0,
2875                                           prot->slab_flags, NULL);
2876
2877        if (!rsk_prot->slab) {
2878                pr_crit("%s: Can't create request sock SLAB cache!\n",
2879                        prot->name);
2880                return -ENOMEM;
2881        }
2882        return 0;
2883}
2884
2885int proto_register(struct proto *prot, int alloc_slab)
2886{
2887        if (alloc_slab) {
2888                prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2889                                        SLAB_HWCACHE_ALIGN | prot->slab_flags,
2890                                        NULL);
2891
2892                if (prot->slab == NULL) {
2893                        pr_crit("%s: Can't create sock SLAB cache!\n",
2894                                prot->name);
2895                        goto out;
2896                }
2897
2898                if (req_prot_init(prot))
2899                        goto out_free_request_sock_slab;
2900
2901                if (prot->twsk_prot != NULL) {
2902                        prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2903
2904                        if (prot->twsk_prot->twsk_slab_name == NULL)
2905                                goto out_free_request_sock_slab;
2906
2907                        prot->twsk_prot->twsk_slab =
2908                                kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2909                                                  prot->twsk_prot->twsk_obj_size,
2910                                                  0,
2911                                                  prot->slab_flags,
2912                                                  NULL);
2913                        if (prot->twsk_prot->twsk_slab == NULL)
2914                                goto out_free_timewait_sock_slab_name;
2915                }
2916        }
2917
2918        mutex_lock(&proto_list_mutex);
2919        list_add(&prot->node, &proto_list);
2920        assign_proto_idx(prot);
2921        mutex_unlock(&proto_list_mutex);
2922        return 0;
2923
2924out_free_timewait_sock_slab_name:
2925        kfree(prot->twsk_prot->twsk_slab_name);
2926out_free_request_sock_slab:
2927        req_prot_cleanup(prot->rsk_prot);
2928
2929        kmem_cache_destroy(prot->slab);
2930        prot->slab = NULL;
2931out:
2932        return -ENOBUFS;
2933}
2934EXPORT_SYMBOL(proto_register);
2935
2936void proto_unregister(struct proto *prot)
2937{
2938        mutex_lock(&proto_list_mutex);
2939        release_proto_idx(prot);
2940        list_del(&prot->node);
2941        mutex_unlock(&proto_list_mutex);
2942
2943        kmem_cache_destroy(prot->slab);
2944        prot->slab = NULL;
2945
2946        req_prot_cleanup(prot->rsk_prot);
2947
2948        if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2949                kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2950                kfree(prot->twsk_prot->twsk_slab_name);
2951                prot->twsk_prot->twsk_slab = NULL;
2952        }
2953}
2954EXPORT_SYMBOL(proto_unregister);
2955
2956#ifdef CONFIG_PROC_FS
2957static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2958        __acquires(proto_list_mutex)
2959{
2960        mutex_lock(&proto_list_mutex);
2961        return seq_list_start_head(&proto_list, *pos);
2962}
2963
2964static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2965{
2966        return seq_list_next(v, &proto_list, pos);
2967}
2968
2969static void proto_seq_stop(struct seq_file *seq, void *v)
2970        __releases(proto_list_mutex)
2971{
2972        mutex_unlock(&proto_list_mutex);
2973}
2974
2975static char proto_method_implemented(const void *method)
2976{
2977        return method == NULL ? 'n' : 'y';
2978}
2979static long sock_prot_memory_allocated(struct proto *proto)
2980{
2981        return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2982}
2983
2984static char *sock_prot_memory_pressure(struct proto *proto)
2985{
2986        return proto->memory_pressure != NULL ?
2987        proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2988}
2989
2990static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2991{
2992
2993        seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2994                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2995                   proto->name,
2996                   proto->obj_size,
2997                   sock_prot_inuse_get(seq_file_net(seq), proto),
2998                   sock_prot_memory_allocated(proto),
2999                   sock_prot_memory_pressure(proto),
3000                   proto->max_header,
3001                   proto->slab == NULL ? "no" : "yes",
3002                   module_name(proto->owner),
3003                   proto_method_implemented(proto->close),
3004                   proto_method_implemented(proto->connect),
3005                   proto_method_implemented(proto->disconnect),
3006                   proto_method_implemented(proto->accept),
3007                   proto_method_implemented(proto->ioctl),
3008                   proto_method_implemented(proto->init),
3009                   proto_method_implemented(proto->destroy),
3010                   proto_method_implemented(proto->shutdown),
3011                   proto_method_implemented(proto->setsockopt),
3012                   proto_method_implemented(proto->getsockopt),
3013                   proto_method_implemented(proto->sendmsg),
3014                   proto_method_implemented(proto->recvmsg),
3015                   proto_method_implemented(proto->sendpage),
3016                   proto_method_implemented(proto->bind),
3017                   proto_method_implemented(proto->backlog_rcv),
3018                   proto_method_implemented(proto->hash),
3019                   proto_method_implemented(proto->unhash),
3020                   proto_method_implemented(proto->get_port),
3021                   proto_method_implemented(proto->enter_memory_pressure));
3022}
3023
3024static int proto_seq_show(struct seq_file *seq, void *v)
3025{
3026        if (v == &proto_list)
3027                seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3028                           "protocol",
3029                           "size",
3030                           "sockets",
3031                           "memory",
3032                           "press",
3033                           "maxhdr",
3034                           "slab",
3035                           "module",
3036                           "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3037        else
3038                proto_seq_printf(seq, list_entry(v, struct proto, node));
3039        return 0;
3040}
3041
3042static const struct seq_operations proto_seq_ops = {
3043        .start  = proto_seq_start,
3044        .next   = proto_seq_next,
3045        .stop   = proto_seq_stop,
3046        .show   = proto_seq_show,
3047};
3048
3049static int proto_seq_open(struct inode *inode, struct file *file)
3050{
3051        return seq_open_net(inode, file, &proto_seq_ops,
3052                            sizeof(struct seq_net_private));
3053}
3054
3055static const struct file_operations proto_seq_fops = {
3056        .owner          = THIS_MODULE,
3057        .open           = proto_seq_open,
3058        .read           = seq_read,
3059        .llseek         = seq_lseek,
3060        .release        = seq_release_net,
3061};
3062
3063static __net_init int proto_init_net(struct net *net)
3064{
3065        if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
3066                return -ENOMEM;
3067
3068        return 0;
3069}
3070
3071static __net_exit void proto_exit_net(struct net *net)
3072{
3073        remove_proc_entry("protocols", net->proc_net);
3074}
3075
3076
3077static __net_initdata struct pernet_operations proto_net_ops = {
3078        .init = proto_init_net,
3079        .exit = proto_exit_net,
3080};
3081
3082static int __init proto_init(void)
3083{
3084        return register_pernet_subsys(&proto_net_ops);
3085}
3086
3087subsys_initcall(proto_init);
3088
3089#endif /* PROC_FS */
3090
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.