linux/net/core/sock.c
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Generic socket support routines. Memory allocators, socket lock/release
   7 *              handler for protocols to use and generic option handler.
   8 *
   9 *
  10 * Authors:     Ross Biro
  11 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *              Florian La Roche, <flla@stud.uni-sb.de>
  13 *              Alan Cox, <A.Cox@swansea.ac.uk>
  14 *
  15 * Fixes:
  16 *              Alan Cox        :       Numerous verify_area() problems
  17 *              Alan Cox        :       Connecting on a connecting socket
  18 *                                      now returns an error for tcp.
  19 *              Alan Cox        :       sock->protocol is set correctly.
  20 *                                      and is not sometimes left as 0.
  21 *              Alan Cox        :       connect handles icmp errors on a
  22 *                                      connect properly. Unfortunately there
  23 *                                      is a restart syscall nasty there. I
  24 *                                      can't match BSD without hacking the C
  25 *                                      library. Ideas urgently sought!
  26 *              Alan Cox        :       Disallow bind() to addresses that are
  27 *                                      not ours - especially broadcast ones!!
  28 *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29 *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30 *                                      instead they leave that for the DESTROY timer.
  31 *              Alan Cox        :       Clean up error flag in accept
  32 *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33 *                                      was buggy. Put a remove_sock() in the handler
  34 *                                      for memory when we hit 0. Also altered the timer
  35 *                                      code. The ACK stuff can wait and needs major
  36 *                                      TCP layer surgery.
  37 *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38 *                                      and fixed timer/inet_bh race.
  39 *              Alan Cox        :       Added zapped flag for TCP
  40 *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41 *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42 *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43 *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44 *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45 *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46 *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47 *      Pauline Middelink       :       identd support
  48 *              Alan Cox        :       Fixed connect() taking signals I think.
  49 *              Alan Cox        :       SO_LINGER supported
  50 *              Alan Cox        :       Error reporting fixes
  51 *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52 *              Alan Cox        :       inet sockets don't set sk->type!
  53 *              Alan Cox        :       Split socket option code
  54 *              Alan Cox        :       Callbacks
  55 *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56 *              Alex            :       Removed restriction on inet fioctl
  57 *              Alan Cox        :       Splitting INET from NET core
  58 *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59 *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60 *              Alan Cox        :       Split IP from generic code
  61 *              Alan Cox        :       New kfree_skbmem()
  62 *              Alan Cox        :       Make SO_DEBUG superuser only.
  63 *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64 *                                      (compatibility fix)
  65 *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66 *              Alan Cox        :       Allocator for a socket is settable.
  67 *              Alan Cox        :       SO_ERROR includes soft errors.
  68 *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69 *              Alan Cox        :       Generic socket allocation to make hooks
  70 *                                      easier (suggested by Craig Metz).
  71 *              Michael Pall    :       SO_ERROR returns positive errno again
  72 *              Steve Whitehouse:       Added default destructor to free
  73 *                                      protocol private data.
  74 *              Steve Whitehouse:       Added various other default routines
  75 *                                      common to several socket families.
  76 *              Chris Evans     :       Call suser() check last on F_SETOWN
  77 *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78 *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79 *              Andi Kleen      :       Fix write_space callback
  80 *              Chris Evans     :       Security fixes - signedness again
  81 *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82 *
  83 * To Fix:
  84 *
  85 *
  86 *              This program is free software; you can redistribute it and/or
  87 *              modify it under the terms of the GNU General Public License
  88 *              as published by the Free Software Foundation; either version
  89 *              2 of the License, or (at your option) any later version.
  90 */
  91
  92#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94#include <linux/capability.h>
  95#include <linux/errno.h>
  96#include <linux/errqueue.h>
  97#include <linux/types.h>
  98#include <linux/socket.h>
  99#include <linux/in.h>
 100#include <linux/kernel.h>
 101#include <linux/module.h>
 102#include <linux/proc_fs.h>
 103#include <linux/seq_file.h>
 104#include <linux/sched.h>
 105#include <linux/timer.h>
 106#include <linux/string.h>
 107#include <linux/sockios.h>
 108#include <linux/net.h>
 109#include <linux/mm.h>
 110#include <linux/slab.h>
 111#include <linux/interrupt.h>
 112#include <linux/poll.h>
 113#include <linux/tcp.h>
 114#include <linux/init.h>
 115#include <linux/highmem.h>
 116#include <linux/user_namespace.h>
 117#include <linux/static_key.h>
 118#include <linux/memcontrol.h>
 119#include <linux/prefetch.h>
 120
 121#include <asm/uaccess.h>
 122
 123#include <linux/netdevice.h>
 124#include <net/protocol.h>
 125#include <linux/skbuff.h>
 126#include <net/net_namespace.h>
 127#include <net/request_sock.h>
 128#include <net/sock.h>
 129#include <linux/net_tstamp.h>
 130#include <net/xfrm.h>
 131#include <linux/ipsec.h>
 132#include <net/cls_cgroup.h>
 133#include <net/netprio_cgroup.h>
 134
 135#include <linux/filter.h>
 136
 137#include <trace/events/sock.h>
 138
 139#ifdef CONFIG_INET
 140#include <net/tcp.h>
 141#endif
 142
 143#include <net/busy_poll.h>
 144
 145static DEFINE_MUTEX(proto_list_mutex);
 146static LIST_HEAD(proto_list);
 147
 148/**
 149 * sk_ns_capable - General socket capability test
 150 * @sk: Socket to use a capability on or through
 151 * @user_ns: The user namespace of the capability to use
 152 * @cap: The capability to use
 153 *
 154 * Test to see if the opener of the socket had the capability @cap when
 155 * the socket was created and the current process has the capability
 156 * @cap in the user namespace @user_ns.
 157 */
 158bool sk_ns_capable(const struct sock *sk,
 159                   struct user_namespace *user_ns, int cap)
 160{
 161        return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 162                ns_capable(user_ns, cap);
 163}
 164EXPORT_SYMBOL(sk_ns_capable);
 165
 166/**
 167 * sk_capable - Socket global capability test
 168 * @sk: Socket to use a capability on or through
 169 * @cap: The global capability to use
 170 *
 171 * Test to see if the opener of the socket had the capability @cap when
 172 * the socket was created and the current process has the capability
 173 * @cap in all user namespaces.
 174 */
 175bool sk_capable(const struct sock *sk, int cap)
 176{
 177        return sk_ns_capable(sk, &init_user_ns, cap);
 178}
 179EXPORT_SYMBOL(sk_capable);
 180
 181/**
 182 * sk_net_capable - Network namespace socket capability test
 183 * @sk: Socket to use a capability on or through
 184 * @cap: The capability to use
 185 *
 186 * Test to see if the opener of the socket had the capability @cap when the
 187 * socket was created and the current process has the capability @cap over
 188 * the network namespace the socket is a member of.
 189 */
 190bool sk_net_capable(const struct sock *sk, int cap)
 191{
 192        return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 193}
 194EXPORT_SYMBOL(sk_net_capable);
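
/*
 * Illustrative sketch: a protocol that wants to gate a privileged socket
 * option on the socket's network namespace might use sk_net_capable()
 * roughly as below. The handler name is hypothetical.
 *
 *	static int hypothetical_set_priv_opt(struct sock *sk, int val)
 *	{
 *		if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *			return -EPERM;
 *		sk->sk_priority = val;
 *		return 0;
 *	}
 */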
 195
 196
 197#ifdef CONFIG_MEMCG_KMEM
 198int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 199{
 200        struct proto *proto;
 201        int ret = 0;
 202
 203        mutex_lock(&proto_list_mutex);
 204        list_for_each_entry(proto, &proto_list, node) {
 205                if (proto->init_cgroup) {
 206                        ret = proto->init_cgroup(memcg, ss);
 207                        if (ret)
 208                                goto out;
 209                }
 210        }
 211
 212        mutex_unlock(&proto_list_mutex);
 213        return ret;
 214out:
 215        list_for_each_entry_continue_reverse(proto, &proto_list, node)
 216                if (proto->destroy_cgroup)
 217                        proto->destroy_cgroup(memcg);
 218        mutex_unlock(&proto_list_mutex);
 219        return ret;
 220}
 221
 222void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
 223{
 224        struct proto *proto;
 225
 226        mutex_lock(&proto_list_mutex);
 227        list_for_each_entry_reverse(proto, &proto_list, node)
 228                if (proto->destroy_cgroup)
 229                        proto->destroy_cgroup(memcg);
 230        mutex_unlock(&proto_list_mutex);
 231}
 232#endif
 233
 234/*
 235 * Each address family might have different locking rules, so we have
 236 * one slock key per address family:
 237 */
 238static struct lock_class_key af_family_keys[AF_MAX];
 239static struct lock_class_key af_family_slock_keys[AF_MAX];
 240
 241#if defined(CONFIG_MEMCG_KMEM)
 242struct static_key memcg_socket_limit_enabled;
 243EXPORT_SYMBOL(memcg_socket_limit_enabled);
 244#endif
 245
 246/*
 247 * Make lock validator output more readable. (We pre-construct these
 248 * strings at build time, so that runtime initialization of socket
 249 * locks is fast):
 250 */
 251static const char *const af_family_key_strings[AF_MAX+1] = {
 252  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
 253  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
 254  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
 255  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
 256  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
 257  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
 258  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
 259  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
 260  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
 261  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
 262  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
 263  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
 264  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
 265  "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_MAX"
 266};
 267static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 268  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
 269  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
 270  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
 271  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
 272  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
 273  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
 274  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
 275  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
 276  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
 277  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
 278  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
 279  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
 280  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
 281  "slock-AF_NFC"   , "slock-AF_VSOCK"    ,"slock-AF_MAX"
 282};
 283static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 284  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
 285  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
 286  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
 287  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
 288  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
 289  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
 290  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
 291  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
 292  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
 293  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
 294  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
 295  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
 296  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
 297  "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_MAX"
 298};
 299
 300/*
 301 * sk_callback_lock locking rules are per-address-family,
 302 * so split the lock classes by using a per-AF key:
 303 */
 304static struct lock_class_key af_callback_keys[AF_MAX];
 305
 306/* Take into consideration the size of the struct sk_buff overhead in the
 307 * determination of these values, since that is non-constant across
 308 * platforms.  This makes socket queueing behavior and performance
 309 * not depend upon such differences.
 310 */
 311#define _SK_MEM_PACKETS         256
 312#define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
 313#define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 314#define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 315
 316/* Run time adjustable parameters. */
 317__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 318EXPORT_SYMBOL(sysctl_wmem_max);
 319__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 320EXPORT_SYMBOL(sysctl_rmem_max);
 321__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 322__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 323
 324/* Maximal space eaten by iovec or ancillary data plus some space */
 325int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 326EXPORT_SYMBOL(sysctl_optmem_max);
 327
 328struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
 329EXPORT_SYMBOL_GPL(memalloc_socks);
 330
 331/**
 332 * sk_set_memalloc - sets %SOCK_MEMALLOC
 333 * @sk: socket to set it on
 334 *
 335 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 336 * It's the responsibility of the admin to adjust min_free_kbytes
 337 * to meet the requirements.
 338 */
 339void sk_set_memalloc(struct sock *sk)
 340{
 341        sock_set_flag(sk, SOCK_MEMALLOC);
 342        sk->sk_allocation |= __GFP_MEMALLOC;
 343        static_key_slow_inc(&memalloc_socks);
 344}
 345EXPORT_SYMBOL_GPL(sk_set_memalloc);
 346
 347void sk_clear_memalloc(struct sock *sk)
 348{
 349        sock_reset_flag(sk, SOCK_MEMALLOC);
 350        sk->sk_allocation &= ~__GFP_MEMALLOC;
 351        static_key_slow_dec(&memalloc_socks);
 352
 353        /*
 354         * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 355         * progress of swapping. However, if SOCK_MEMALLOC is cleared while
 356         * it has rmem allocations there is a risk that the user of the
 357         * socket cannot make forward progress due to exceeding the rmem
 358         * limits. By rights, sk_clear_memalloc() should only be called
 359         * on sockets being torn down but warn and reset the accounting if
 360         * that assumption breaks.
 361         */
 362        if (WARN_ON(sk->sk_forward_alloc))
 363                sk_mem_reclaim(sk);
 364}
 365EXPORT_SYMBOL_GPL(sk_clear_memalloc);
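
/*
 * Illustrative sketch: a kernel user that must keep making progress under
 * memory pressure (for example, a hypothetical swap-over-network transport)
 * might bracket the lifetime of its socket with these helpers:
 *
 *	sk_set_memalloc(sk);	when the socket is set up, so allocations
 *				may dip into the emergency reserves
 *	...
 *	sk_clear_memalloc(sk);	on teardown, before the socket is released
 */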
 366
 367int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 368{
 369        int ret;
 370        unsigned long pflags = current->flags;
 371
 372        /* these should have been dropped before queueing */
 373        BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 374
 375        current->flags |= PF_MEMALLOC;
 376        ret = sk->sk_backlog_rcv(sk, skb);
 377        tsk_restore_flags(current, pflags, PF_MEMALLOC);
 378
 379        return ret;
 380}
 381EXPORT_SYMBOL(__sk_backlog_rcv);
 382
 383static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 384{
 385        struct timeval tv;
 386
 387        if (optlen < sizeof(tv))
 388                return -EINVAL;
 389        if (copy_from_user(&tv, optval, sizeof(tv)))
 390                return -EFAULT;
 391        if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 392                return -EDOM;
 393
 394        if (tv.tv_sec < 0) {
 395                static int warned __read_mostly;
 396
 397                *timeo_p = 0;
 398                if (warned < 10 && net_ratelimit()) {
 399                        warned++;
 400                        pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 401                                __func__, current->comm, task_pid_nr(current));
 402                }
 403                return 0;
 404        }
 405        *timeo_p = MAX_SCHEDULE_TIMEOUT;
 406        if (tv.tv_sec == 0 && tv.tv_usec == 0)
 407                return 0;
 408        if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 409                *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
 410        return 0;
 411}
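
/*
 * Worked example for the conversion above: the microsecond part is rounded
 * up to the next tick. With HZ == 1000, a timeout of
 * { tv_sec = 2, tv_usec = 500000 } becomes
 *	2 * 1000 + (500000 + 999) / 1000 = 2500 jiffies,
 * and with HZ == 100 the same timeout becomes
 *	2 * 100 + (500000 + 9999) / 10000 = 250 jiffies.
 */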
 412
 413static void sock_warn_obsolete_bsdism(const char *name)
 414{
 415        static int warned;
 416        static char warncomm[TASK_COMM_LEN];
 417        if (strcmp(warncomm, current->comm) && warned < 5) {
 418                strcpy(warncomm,  current->comm);
 419                pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 420                        warncomm, name);
 421                warned++;
 422        }
 423}
 424
 425#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
 426
 427static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 428{
 429        if (sk->sk_flags & flags) {
 430                sk->sk_flags &= ~flags;
 431                if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 432                        net_disable_timestamp();
 433        }
 434}
 435
 436
 437int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 438{
 439        int err;
 440        unsigned long flags;
 441        struct sk_buff_head *list = &sk->sk_receive_queue;
 442
 443        if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 444                atomic_inc(&sk->sk_drops);
 445                trace_sock_rcvqueue_full(sk, skb);
 446                return -ENOMEM;
 447        }
 448
 449        err = sk_filter(sk, skb);
 450        if (err)
 451                return err;
 452
 453        if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 454                atomic_inc(&sk->sk_drops);
 455                return -ENOBUFS;
 456        }
 457
 458        skb->dev = NULL;
 459        skb_set_owner_r(skb, sk);
 460
 461        /* we escape from the RCU-protected region, make sure we don't leak
 462         * a non-refcounted dst
 463         */
 464        skb_dst_force(skb);
 465
 466        spin_lock_irqsave(&list->lock, flags);
 467        skb->dropcount = atomic_read(&sk->sk_drops);
 468        __skb_queue_tail(list, skb);
 469        spin_unlock_irqrestore(&list->lock, flags);
 470
 471        if (!sock_flag(sk, SOCK_DEAD))
 472                sk->sk_data_ready(sk);
 473        return 0;
 474}
 475EXPORT_SYMBOL(sock_queue_rcv_skb);
 476
 477int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
 478{
 479        int rc = NET_RX_SUCCESS;
 480
 481        if (sk_filter(sk, skb))
 482                goto discard_and_relse;
 483
 484        skb->dev = NULL;
 485
 486        if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 487                atomic_inc(&sk->sk_drops);
 488                goto discard_and_relse;
 489        }
 490        if (nested)
 491                bh_lock_sock_nested(sk);
 492        else
 493                bh_lock_sock(sk);
 494        if (!sock_owned_by_user(sk)) {
 495                /*
 496                 * trylock + unlock semantics:
 497                 */
 498                mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 499
 500                rc = sk_backlog_rcv(sk, skb);
 501
 502                mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 503        } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 504                bh_unlock_sock(sk);
 505                atomic_inc(&sk->sk_drops);
 506                goto discard_and_relse;
 507        }
 508
 509        bh_unlock_sock(sk);
 510out:
 511        sock_put(sk);
 512        return rc;
 513discard_and_relse:
 514        kfree_skb(skb);
 515        goto out;
 516}
 517EXPORT_SYMBOL(sk_receive_skb);
 518
 519struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 520{
 521        struct dst_entry *dst = __sk_dst_get(sk);
 522
 523        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 524                sk_tx_queue_clear(sk);
 525                RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 526                dst_release(dst);
 527                return NULL;
 528        }
 529
 530        return dst;
 531}
 532EXPORT_SYMBOL(__sk_dst_check);
 533
 534struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 535{
 536        struct dst_entry *dst = sk_dst_get(sk);
 537
 538        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 539                sk_dst_reset(sk);
 540                dst_release(dst);
 541                return NULL;
 542        }
 543
 544        return dst;
 545}
 546EXPORT_SYMBOL(sk_dst_check);
 547
 548static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 549                                int optlen)
 550{
 551        int ret = -ENOPROTOOPT;
 552#ifdef CONFIG_NETDEVICES
 553        struct net *net = sock_net(sk);
 554        char devname[IFNAMSIZ];
 555        int index;
 556
 557        /* Sorry... */
 558        ret = -EPERM;
 559        if (!ns_capable(net->user_ns, CAP_NET_RAW))
 560                goto out;
 561
 562        ret = -EINVAL;
 563        if (optlen < 0)
 564                goto out;
 565
 566        /* Bind this socket to a particular device like "eth0",
 567         * as specified in the passed interface name. If the
 568         * name is "" or the option length is zero the socket
 569         * is not bound.
 570         */
 571        if (optlen > IFNAMSIZ - 1)
 572                optlen = IFNAMSIZ - 1;
 573        memset(devname, 0, sizeof(devname));
 574
 575        ret = -EFAULT;
 576        if (copy_from_user(devname, optval, optlen))
 577                goto out;
 578
 579        index = 0;
 580        if (devname[0] != '\0') {
 581                struct net_device *dev;
 582
 583                rcu_read_lock();
 584                dev = dev_get_by_name_rcu(net, devname);
 585                if (dev)
 586                        index = dev->ifindex;
 587                rcu_read_unlock();
 588                ret = -ENODEV;
 589                if (!dev)
 590                        goto out;
 591        }
 592
 593        lock_sock(sk);
 594        sk->sk_bound_dev_if = index;
 595        sk_dst_reset(sk);
 596        release_sock(sk);
 597
 598        ret = 0;
 599
 600out:
 601#endif
 602
 603        return ret;
 604}
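
/*
 * Illustrative userspace sketch: the handler above services a setsockopt()
 * call such as
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0") + 1);
 *
 * (requires CAP_NET_RAW); passing an empty name or a zero option length
 * removes the binding.
 */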
 605
 606static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 607                                int __user *optlen, int len)
 608{
 609        int ret = -ENOPROTOOPT;
 610#ifdef CONFIG_NETDEVICES
 611        struct net *net = sock_net(sk);
 612        char devname[IFNAMSIZ];
 613
 614        if (sk->sk_bound_dev_if == 0) {
 615                len = 0;
 616                goto zero;
 617        }
 618
 619        ret = -EINVAL;
 620        if (len < IFNAMSIZ)
 621                goto out;
 622
 623        ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 624        if (ret)
 625                goto out;
 626
 627        len = strlen(devname) + 1;
 628
 629        ret = -EFAULT;
 630        if (copy_to_user(optval, devname, len))
 631                goto out;
 632
 633zero:
 634        ret = -EFAULT;
 635        if (put_user(len, optlen))
 636                goto out;
 637
 638        ret = 0;
 639
 640out:
 641#endif
 642
 643        return ret;
 644}
 645
 646static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 647{
 648        if (valbool)
 649                sock_set_flag(sk, bit);
 650        else
 651                sock_reset_flag(sk, bit);
 652}
 653
 654/*
 655 *      This is meant for all protocols to use and covers goings on
 656 *      at the socket level. Everything here is generic.
 657 */
 658
 659int sock_setsockopt(struct socket *sock, int level, int optname,
 660                    char __user *optval, unsigned int optlen)
 661{
 662        struct sock *sk = sock->sk;
 663        int val;
 664        int valbool;
 665        struct linger ling;
 666        int ret = 0;
 667
 668        /*
 669         *      Options without arguments
 670         */
 671
 672        if (optname == SO_BINDTODEVICE)
 673                return sock_setbindtodevice(sk, optval, optlen);
 674
 675        if (optlen < sizeof(int))
 676                return -EINVAL;
 677
 678        if (get_user(val, (int __user *)optval))
 679                return -EFAULT;
 680
 681        valbool = val ? 1 : 0;
 682
 683        lock_sock(sk);
 684
 685        switch (optname) {
 686        case SO_DEBUG:
 687                if (val && !capable(CAP_NET_ADMIN))
 688                        ret = -EACCES;
 689                else
 690                        sock_valbool_flag(sk, SOCK_DBG, valbool);
 691                break;
 692        case SO_REUSEADDR:
 693                sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 694                break;
 695        case SO_REUSEPORT:
 696                sk->sk_reuseport = valbool;
 697                break;
 698        case SO_TYPE:
 699        case SO_PROTOCOL:
 700        case SO_DOMAIN:
 701        case SO_ERROR:
 702                ret = -ENOPROTOOPT;
 703                break;
 704        case SO_DONTROUTE:
 705                sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 706                break;
 707        case SO_BROADCAST:
 708                sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 709                break;
 710        case SO_SNDBUF:
 711                /* Don't return an error on this; BSD doesn't, and if you
 712                 * think about it, this is right. Otherwise apps have to
 713                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
 714                 * are treated in BSD as hints.
 715                 */
 716                val = min_t(u32, val, sysctl_wmem_max);
 717set_sndbuf:
 718                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 719                sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
 720                /* Wake up sending tasks if we upped the value. */
 721                sk->sk_write_space(sk);
 722                break;
 723
 724        case SO_SNDBUFFORCE:
 725                if (!capable(CAP_NET_ADMIN)) {
 726                        ret = -EPERM;
 727                        break;
 728                }
 729                goto set_sndbuf;
 730
 731        case SO_RCVBUF:
 732                /* Don't return an error on this; BSD doesn't, and if you
 733                 * think about it, this is right. Otherwise apps have to
 734                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
 735                 * are treated in BSD as hints.
 736                 */
 737                val = min_t(u32, val, sysctl_rmem_max);
 738set_rcvbuf:
 739                sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 740                /*
 741                 * We double it on the way in to account for
 742                 * "struct sk_buff" etc. overhead.   Applications
 743                 * assume that the SO_RCVBUF setting they make will
 744                 * allow that much actual data to be received on that
 745                 * socket.
 746                 *
 747                 * Applications are unaware that "struct sk_buff" and
 748                 * other overheads allocate from the receive buffer
 749                 * during socket buffer allocation.
 750                 *
 751                 * And after considering the possible alternatives,
 752                 * returning the value we actually used in getsockopt
 753                 * is the most desirable behavior.
 754                 */
 755                sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
 756                break;
 757
 758        case SO_RCVBUFFORCE:
 759                if (!capable(CAP_NET_ADMIN)) {
 760                        ret = -EPERM;
 761                        break;
 762                }
 763                goto set_rcvbuf;
 764
 765        case SO_KEEPALIVE:
 766#ifdef CONFIG_INET
 767                if (sk->sk_protocol == IPPROTO_TCP &&
 768                    sk->sk_type == SOCK_STREAM)
 769                        tcp_set_keepalive(sk, valbool);
 770#endif
 771                sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 772                break;
 773
 774        case SO_OOBINLINE:
 775                sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 776                break;
 777
 778        case SO_NO_CHECK:
 779                sk->sk_no_check_tx = valbool;
 780                break;
 781
 782        case SO_PRIORITY:
 783                if ((val >= 0 && val <= 6) ||
 784                    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 785                        sk->sk_priority = val;
 786                else
 787                        ret = -EPERM;
 788                break;
 789
 790        case SO_LINGER:
 791                if (optlen < sizeof(ling)) {
 792                        ret = -EINVAL;  /* 1003.1g */
 793                        break;
 794                }
 795                if (copy_from_user(&ling, optval, sizeof(ling))) {
 796                        ret = -EFAULT;
 797                        break;
 798                }
 799                if (!ling.l_onoff)
 800                        sock_reset_flag(sk, SOCK_LINGER);
 801                else {
 802#if (BITS_PER_LONG == 32)
 803                        if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 804                                sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 805                        else
 806#endif
 807                                sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 808                        sock_set_flag(sk, SOCK_LINGER);
 809                }
 810                break;
 811
 812        case SO_BSDCOMPAT:
 813                sock_warn_obsolete_bsdism("setsockopt");
 814                break;
 815
 816        case SO_PASSCRED:
 817                if (valbool)
 818                        set_bit(SOCK_PASSCRED, &sock->flags);
 819                else
 820                        clear_bit(SOCK_PASSCRED, &sock->flags);
 821                break;
 822
 823        case SO_TIMESTAMP:
 824        case SO_TIMESTAMPNS:
 825                if (valbool)  {
 826                        if (optname == SO_TIMESTAMP)
 827                                sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 828                        else
 829                                sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 830                        sock_set_flag(sk, SOCK_RCVTSTAMP);
 831                        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 832                } else {
 833                        sock_reset_flag(sk, SOCK_RCVTSTAMP);
 834                        sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 835                }
 836                break;
 837
 838        case SO_TIMESTAMPING:
 839                if (val & ~SOF_TIMESTAMPING_MASK) {
 840                        ret = -EINVAL;
 841                        break;
 842                }
 843                if (val & SOF_TIMESTAMPING_OPT_ID &&
 844                    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 845                        if (sk->sk_protocol == IPPROTO_TCP) {
 846                                if (sk->sk_state != TCP_ESTABLISHED) {
 847                                        ret = -EINVAL;
 848                                        break;
 849                                }
 850                                sk->sk_tskey = tcp_sk(sk)->snd_una;
 851                        } else {
 852                                sk->sk_tskey = 0;
 853                        }
 854                }
 855                sk->sk_tsflags = val;
 856                if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 857                        sock_enable_timestamp(sk,
 858                                              SOCK_TIMESTAMPING_RX_SOFTWARE);
 859                else
 860                        sock_disable_timestamp(sk,
 861                                               (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 862                break;
 863
 864        case SO_RCVLOWAT:
 865                if (val < 0)
 866                        val = INT_MAX;
 867                sk->sk_rcvlowat = val ? : 1;
 868                break;
 869
 870        case SO_RCVTIMEO:
 871                ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 872                break;
 873
 874        case SO_SNDTIMEO:
 875                ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 876                break;
 877
 878        case SO_ATTACH_FILTER:
 879                ret = -EINVAL;
 880                if (optlen == sizeof(struct sock_fprog)) {
 881                        struct sock_fprog fprog;
 882
 883                        ret = -EFAULT;
 884                        if (copy_from_user(&fprog, optval, sizeof(fprog)))
 885                                break;
 886
 887                        ret = sk_attach_filter(&fprog, sk);
 888                }
 889                break;
 890
 891        case SO_ATTACH_BPF:
 892                ret = -EINVAL;
 893                if (optlen == sizeof(u32)) {
 894                        u32 ufd;
 895
 896                        ret = -EFAULT;
 897                        if (copy_from_user(&ufd, optval, sizeof(ufd)))
 898                                break;
 899
 900                        ret = sk_attach_bpf(ufd, sk);
 901                }
 902                break;
 903
 904        case SO_DETACH_FILTER:
 905                ret = sk_detach_filter(sk);
 906                break;
 907
 908        case SO_LOCK_FILTER:
 909                if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
 910                        ret = -EPERM;
 911                else
 912                        sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
 913                break;
 914
 915        case SO_PASSSEC:
 916                if (valbool)
 917                        set_bit(SOCK_PASSSEC, &sock->flags);
 918                else
 919                        clear_bit(SOCK_PASSSEC, &sock->flags);
 920                break;
 921        case SO_MARK:
 922                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 923                        ret = -EPERM;
 924                else
 925                        sk->sk_mark = val;
 926                break;
 927
 928                /* We implement SO_SNDLOWAT etc. as not
 929                   settable (1003.1g 5.3). */
 930        case SO_RXQ_OVFL:
 931                sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
 932                break;
 933
 934        case SO_WIFI_STATUS:
 935                sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
 936                break;
 937
 938        case SO_PEEK_OFF:
 939                if (sock->ops->set_peek_off)
 940                        ret = sock->ops->set_peek_off(sk, val);
 941                else
 942                        ret = -EOPNOTSUPP;
 943                break;
 944
 945        case SO_NOFCS:
 946                sock_valbool_flag(sk, SOCK_NOFCS, valbool);
 947                break;
 948
 949        case SO_SELECT_ERR_QUEUE:
 950                sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
 951                break;
 952
 953#ifdef CONFIG_NET_RX_BUSY_POLL
 954        case SO_BUSY_POLL:
 955                /* allow unprivileged users to decrease the value */
 956                if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
 957                        ret = -EPERM;
 958                else {
 959                        if (val < 0)
 960                                ret = -EINVAL;
 961                        else
 962                                sk->sk_ll_usec = val;
 963                }
 964                break;
 965#endif
 966
 967        case SO_MAX_PACING_RATE:
 968                sk->sk_max_pacing_rate = val;
 969                sk->sk_pacing_rate = min(sk->sk_pacing_rate,
 970                                         sk->sk_max_pacing_rate);
 971                break;
 972
 973        default:
 974                ret = -ENOPROTOOPT;
 975                break;
 976        }
 977        release_sock(sk);
 978        return ret;
 979}
 980EXPORT_SYMBOL(sock_setsockopt);
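
/*
 * Illustrative userspace sketch of the SO_RCVBUF behaviour documented above:
 * the kernel doubles the requested value to cover bookkeeping overhead, and
 * getsockopt() reports the value actually in use. On a default configuration
 * one would therefore expect roughly:
 *
 *	int req = 65536, eff;
 *	socklen_t len = sizeof(eff);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &eff, &len);
 *
 * after which eff is 2 * req (131072), subject to sysctl_rmem_max and
 * SOCK_MIN_RCVBUF clamping.
 */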
 981
 982
 983static void cred_to_ucred(struct pid *pid, const struct cred *cred,
 984                          struct ucred *ucred)
 985{
 986        ucred->pid = pid_vnr(pid);
 987        ucred->uid = ucred->gid = -1;
 988        if (cred) {
 989                struct user_namespace *current_ns = current_user_ns();
 990
 991                ucred->uid = from_kuid_munged(current_ns, cred->euid);
 992                ucred->gid = from_kgid_munged(current_ns, cred->egid);
 993        }
 994}
 995
 996int sock_getsockopt(struct socket *sock, int level, int optname,
 997                    char __user *optval, int __user *optlen)
 998{
 999        struct sock *sk = sock->sk;
1000
1001        union {
1002                int val;
1003                struct linger ling;
1004                struct timeval tm;
1005        } v;
1006
1007        int lv = sizeof(int);
1008        int len;
1009
1010        if (get_user(len, optlen))
1011                return -EFAULT;
1012        if (len < 0)
1013                return -EINVAL;
1014
1015        memset(&v, 0, sizeof(v));
1016
1017        switch (optname) {
1018        case SO_DEBUG:
1019                v.val = sock_flag(sk, SOCK_DBG);
1020                break;
1021
1022        case SO_DONTROUTE:
1023                v.val = sock_flag(sk, SOCK_LOCALROUTE);
1024                break;
1025
1026        case SO_BROADCAST:
1027                v.val = sock_flag(sk, SOCK_BROADCAST);
1028                break;
1029
1030        case SO_SNDBUF:
1031                v.val = sk->sk_sndbuf;
1032                break;
1033
1034        case SO_RCVBUF:
1035                v.val = sk->sk_rcvbuf;
1036                break;
1037
1038        case SO_REUSEADDR:
1039                v.val = sk->sk_reuse;
1040                break;
1041
1042        case SO_REUSEPORT:
1043                v.val = sk->sk_reuseport;
1044                break;
1045
1046        case SO_KEEPALIVE:
1047                v.val = sock_flag(sk, SOCK_KEEPOPEN);
1048                break;
1049
1050        case SO_TYPE:
1051                v.val = sk->sk_type;
1052                break;
1053
1054        case SO_PROTOCOL:
1055                v.val = sk->sk_protocol;
1056                break;
1057
1058        case SO_DOMAIN:
1059                v.val = sk->sk_family;
1060                break;
1061
1062        case SO_ERROR:
1063                v.val = -sock_error(sk);
1064                if (v.val == 0)
1065                        v.val = xchg(&sk->sk_err_soft, 0);
1066                break;
1067
1068        case SO_OOBINLINE:
1069                v.val = sock_flag(sk, SOCK_URGINLINE);
1070                break;
1071
1072        case SO_NO_CHECK:
1073                v.val = sk->sk_no_check_tx;
1074                break;
1075
1076        case SO_PRIORITY:
1077                v.val = sk->sk_priority;
1078                break;
1079
1080        case SO_LINGER:
1081                lv              = sizeof(v.ling);
1082                v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1083                v.ling.l_linger = sk->sk_lingertime / HZ;
1084                break;
1085
1086        case SO_BSDCOMPAT:
1087                sock_warn_obsolete_bsdism("getsockopt");
1088                break;
1089
1090        case SO_TIMESTAMP:
1091                v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1092                                !sock_flag(sk, SOCK_RCVTSTAMPNS);
1093                break;
1094
1095        case SO_TIMESTAMPNS:
1096                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1097                break;
1098
1099        case SO_TIMESTAMPING:
1100                v.val = sk->sk_tsflags;
1101                break;
1102
1103        case SO_RCVTIMEO:
1104                lv = sizeof(struct timeval);
1105                if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1106                        v.tm.tv_sec = 0;
1107                        v.tm.tv_usec = 0;
1108                } else {
1109                        v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1110                        v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1111                }
1112                break;
1113
1114        case SO_SNDTIMEO:
1115                lv = sizeof(struct timeval);
1116                if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1117                        v.tm.tv_sec = 0;
1118                        v.tm.tv_usec = 0;
1119                } else {
1120                        v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1121                        v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1122                }
1123                break;
1124
1125        case SO_RCVLOWAT:
1126                v.val = sk->sk_rcvlowat;
1127                break;
1128
1129        case SO_SNDLOWAT:
1130                v.val = 1;
1131                break;
1132
1133        case SO_PASSCRED:
1134                v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1135                break;
1136
1137        case SO_PEERCRED:
1138        {
1139                struct ucred peercred;
1140                if (len > sizeof(peercred))
1141                        len = sizeof(peercred);
1142                cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1143                if (copy_to_user(optval, &peercred, len))
1144                        return -EFAULT;
1145                goto lenout;
1146        }
1147
1148        case SO_PEERNAME:
1149        {
1150                char address[128];
1151
1152                if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1153                        return -ENOTCONN;
1154                if (lv < len)
1155                        return -EINVAL;
1156                if (copy_to_user(optval, address, len))
1157                        return -EFAULT;
1158                goto lenout;
1159        }
1160
1161        /* Dubious BSD thing... Probably nobody even uses it, but
1162         * the UNIX standard wants it for whatever reason... -DaveM
1163         */
1164        case SO_ACCEPTCONN:
1165                v.val = sk->sk_state == TCP_LISTEN;
1166                break;
1167
1168        case SO_PASSSEC:
1169                v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1170                break;
1171
1172        case SO_PEERSEC:
1173                return security_socket_getpeersec_stream(sock, optval, optlen, len);
1174
1175        case SO_MARK:
1176                v.val = sk->sk_mark;
1177                break;
1178
1179        case SO_RXQ_OVFL:
1180                v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1181                break;
1182
1183        case SO_WIFI_STATUS:
1184                v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1185                break;
1186
1187        case SO_PEEK_OFF:
1188                if (!sock->ops->set_peek_off)
1189                        return -EOPNOTSUPP;
1190
1191                v.val = sk->sk_peek_off;
1192                break;
1193        case SO_NOFCS:
1194                v.val = sock_flag(sk, SOCK_NOFCS);
1195                break;
1196
1197        case SO_BINDTODEVICE:
1198                return sock_getbindtodevice(sk, optval, optlen, len);
1199
1200        case SO_GET_FILTER:
1201                len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1202                if (len < 0)
1203                        return len;
1204
1205                goto lenout;
1206
1207        case SO_LOCK_FILTER:
1208                v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1209                break;
1210
1211        case SO_BPF_EXTENSIONS:
1212                v.val = bpf_tell_extensions();
1213                break;
1214
1215        case SO_SELECT_ERR_QUEUE:
1216                v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1217                break;
1218
1219#ifdef CONFIG_NET_RX_BUSY_POLL
1220        case SO_BUSY_POLL:
1221                v.val = sk->sk_ll_usec;
1222                break;
1223#endif
1224
1225        case SO_MAX_PACING_RATE:
1226                v.val = sk->sk_max_pacing_rate;
1227                break;
1228
1229        case SO_INCOMING_CPU:
1230                v.val = sk->sk_incoming_cpu;
1231                break;
1232
1233        default:
1234                return -ENOPROTOOPT;
1235        }
1236
1237        if (len > lv)
1238                len = lv;
1239        if (copy_to_user(optval, &v, len))
1240                return -EFAULT;
1241lenout:
1242        if (put_user(len, optlen))
1243                return -EFAULT;
1244        return 0;
1245}
1246
1247/*
1248 * Initialize an sk_lock.
1249 *
1250 * (We also register the sk_lock with the lock validator.)
1251 */
1252static inline void sock_lock_init(struct sock *sk)
1253{
1254        sock_lock_init_class_and_name(sk,
1255                        af_family_slock_key_strings[sk->sk_family],
1256                        af_family_slock_keys + sk->sk_family,
1257                        af_family_key_strings[sk->sk_family],
1258                        af_family_keys + sk->sk_family);
1259}
1260
1261/*
1262 * Copy all fields from osk to nsk, but nsk->sk_refcnt must not change yet,
1263 * even temporarily, because of RCU lookups. sk_node should also be left as-is.
1264 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1265 */
1266static void sock_copy(struct sock *nsk, const struct sock *osk)
1267{
1268#ifdef CONFIG_SECURITY_NETWORK
1269        void *sptr = nsk->sk_security;
1270#endif
1271        memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1272
1273        memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1274               osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1275
1276#ifdef CONFIG_SECURITY_NETWORK
1277        nsk->sk_security = sptr;
1278        security_sk_clone(osk, nsk);
1279#endif
1280}
1281
1282void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1283{
1284        unsigned long nulls1, nulls2;
1285
1286        nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1287        nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1288        if (nulls1 > nulls2)
1289                swap(nulls1, nulls2);
1290
1291        if (nulls1 != 0)
1292                memset((char *)sk, 0, nulls1);
1293        memset((char *)sk + nulls1 + sizeof(void *), 0,
1294               nulls2 - nulls1 - sizeof(void *));
1295        memset((char *)sk + nulls2 + sizeof(void *), 0,
1296               size - nulls2 - sizeof(void *));
1297}
1298EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1299
1300static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1301                int family)
1302{
1303        struct sock *sk;
1304        struct kmem_cache *slab;
1305
1306        slab = prot->slab;
1307        if (slab != NULL) {
1308                sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1309                if (!sk)
1310                        return sk;
1311                if (priority & __GFP_ZERO) {
1312                        if (prot->clear_sk)
1313                                prot->clear_sk(sk, prot->obj_size);
1314                        else
1315                                sk_prot_clear_nulls(sk, prot->obj_size);
1316                }
1317        } else
1318                sk = kmalloc(prot->obj_size, priority);
1319
1320        if (sk != NULL) {
1321                kmemcheck_annotate_bitfield(sk, flags);
1322
1323                if (security_sk_alloc(sk, family, priority))
1324                        goto out_free;
1325
1326                if (!try_module_get(prot->owner))
1327                        goto out_free_sec;
1328                sk_tx_queue_clear(sk);
1329        }
1330
1331        return sk;
1332
1333out_free_sec:
1334        security_sk_free(sk);
1335out_free:
1336        if (slab != NULL)
1337                kmem_cache_free(slab, sk);
1338        else
1339                kfree(sk);
1340        return NULL;
1341}
1342
1343static void sk_prot_free(struct proto *prot, struct sock *sk)
1344{
1345        struct kmem_cache *slab;
1346        struct module *owner;
1347
1348        owner = prot->owner;
1349        slab = prot->slab;
1350
1351        security_sk_free(sk);
1352        if (slab != NULL)
1353                kmem_cache_free(slab, sk);
1354        else
1355                kfree(sk);
1356        module_put(owner);
1357}
1358
1359#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
1360void sock_update_netprioidx(struct sock *sk)
1361{
1362        if (in_interrupt())
1363                return;
1364
1365        sk->sk_cgrp_prioidx = task_netprioidx(current);
1366}
1367EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1368#endif
1369
1370/**
1371 *      sk_alloc - All socket objects are allocated here
1372 *      @net: the applicable net namespace
1373 *      @family: protocol family
1374 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1375 *      @prot: struct proto associated with this new sock instance
1376 */
1377struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1378                      struct proto *prot)
1379{
1380        struct sock *sk;
1381
1382        sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1383        if (sk) {
1384                sk->sk_family = family;
1385                /*
1386                 * See comment in struct sock definition to understand
1387                 * why we need sk_prot_creator -acme
1388                 */
1389                sk->sk_prot = sk->sk_prot_creator = prot;
1390                sock_lock_init(sk);
1391                sock_net_set(sk, get_net(net));
1392                atomic_set(&sk->sk_wmem_alloc, 1);
1393
1394                sock_update_classid(sk);
1395                sock_update_netprioidx(sk);
1396        }
1397
1398        return sk;
1399}
1400EXPORT_SYMBOL(sk_alloc);
1401
1402static void __sk_free(struct sock *sk)
1403{
1404        struct sk_filter *filter;
1405
1406        if (sk->sk_destruct)
1407                sk->sk_destruct(sk);
1408
1409        filter = rcu_dereference_check(sk->sk_filter,
1410                                       atomic_read(&sk->sk_wmem_alloc) == 0);
1411        if (filter) {
1412                sk_filter_uncharge(sk, filter);
1413                RCU_INIT_POINTER(sk->sk_filter, NULL);
1414        }
1415
1416        sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1417
1418        if (atomic_read(&sk->sk_omem_alloc))
1419                pr_debug("%s: optmem leakage (%d bytes) detected\n",
1420                         __func__, atomic_read(&sk->sk_omem_alloc));
1421
1422        if (sk->sk_peer_cred)
1423                put_cred(sk->sk_peer_cred);
1424        put_pid(sk->sk_peer_pid);
1425        put_net(sock_net(sk));
1426        sk_prot_free(sk->sk_prot_creator, sk);
1427}
1428
1429void sk_free(struct sock *sk)
1430{
1431        /*
1432         * We subtract one from sk_wmem_alloc so we can tell whether
1433         * some packets are still in some tx queue.
1434         * If the count is not zero, sock_wfree() will call __sk_free(sk) later.
1435         */
1436        if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1437                __sk_free(sk);
1438}
1439EXPORT_SYMBOL(sk_free);
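
/*
 * Illustrative note: sk_alloc() starts sk_wmem_alloc at 1, so a typical
 * lifetime looks like the sketch below ("some_proto" is a placeholder):
 *
 *	sk = sk_alloc(net, AF_INET, GFP_KERNEL, &some_proto);	wmem_alloc == 1
 *	...	transmitted skbs charged to the socket raise wmem_alloc
 *	sk_free(sk);	drops the initial reference
 *	...	when the last in-flight skb is freed, sock_wfree()
 *		drops the final reference and __sk_free(sk) runs
 */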
1440
1441/*
1442 * The last sock_put should drop the reference to sk->sk_net. It has already
1443 * been dropped in sk_change_net. Taking a reference to the stopping namespace
1444 * is not an option.
1445 * Instead, take a reference to the socket to remove it from the hash while it
1446 * is still _alive_, and after that destroy it in the context of init_net.
1447 */
1448void sk_release_kernel(struct sock *sk)
1449{
1450        if (sk == NULL || sk->sk_socket == NULL)
1451                return;
1452
1453        sock_hold(sk);
1454        sock_release(sk->sk_socket);
1455        release_net(sock_net(sk));
1456        sock_net_set(sk, get_net(&init_net));
1457        sock_put(sk);
1458}
1459EXPORT_SYMBOL(sk_release_kernel);
1460
1461static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1462{
1463        if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1464                sock_update_memcg(newsk);
1465}
1466
1467/**
1468 *      sk_clone_lock - clone a socket, and lock its clone
1469 *      @sk: the socket to clone
1470 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1471 *
1472 *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1473 */
1474struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1475{
1476        struct sock *newsk;
1477        bool is_charged = true;
1478
1479        newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1480        if (newsk != NULL) {
1481                struct sk_filter *filter;
1482
1483                sock_copy(newsk, sk);
1484
1485                /* SANITY */
1486                get_net(sock_net(newsk));
1487                sk_node_init(&newsk->sk_node);
1488                sock_lock_init(newsk);
1489                bh_lock_sock(newsk);
1490                newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1491                newsk->sk_backlog.len = 0;
1492
1493                atomic_set(&newsk->sk_rmem_alloc, 0);
1494                /*
1495                 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1496                 */
1497                atomic_set(&newsk->sk_wmem_alloc, 1);
1498                atomic_set(&newsk->sk_omem_alloc, 0);
1499                skb_queue_head_init(&newsk->sk_receive_queue);
1500                skb_queue_head_init(&newsk->sk_write_queue);
1501
1502                spin_lock_init(&newsk->sk_dst_lock);
1503                rwlock_init(&newsk->sk_callback_lock);
1504                lockdep_set_class_and_name(&newsk->sk_callback_lock,
1505                                af_callback_keys + newsk->sk_family,
1506                                af_family_clock_key_strings[newsk->sk_family]);
1507
1508                newsk->sk_dst_cache     = NULL;
1509                newsk->sk_wmem_queued   = 0;
1510                newsk->sk_forward_alloc = 0;
1511                newsk->sk_send_head     = NULL;
1512                newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1513
1514                sock_reset_flag(newsk, SOCK_DONE);
1515                skb_queue_head_init(&newsk->sk_error_queue);
1516
1517                filter = rcu_dereference_protected(newsk->sk_filter, 1);
1518                if (filter != NULL)
1519                        /* though it's an empty new sock, the charging may fail
1520                         * if sysctl_optmem_max was changed between the creation of
1521                         * the original socket and the cloning
1522                         */
1523                        is_charged = sk_filter_charge(newsk, filter);
1524
1525                if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk))) {
1526                        /* It is still a raw copy of the parent, so invalidate
1527                         * the destructor and do a plain sk_free() */
1528                        newsk->sk_destruct = NULL;
1529                        bh_unlock_sock(newsk);
1530                        sk_free(newsk);
1531                        newsk = NULL;
1532                        goto out;
1533                }
1534
1535                newsk->sk_err      = 0;
1536                newsk->sk_priority = 0;
1537                newsk->sk_incoming_cpu = raw_smp_processor_id();
1538                /*
1539                 * Before updating sk_refcnt, we must commit prior changes to memory
1540                 * (Documentation/RCU/rculist_nulls.txt for details)
1541                 */
1542                smp_wmb();
1543                atomic_set(&newsk->sk_refcnt, 2);
1544
1545                /*
1546                 * Increment the counter in the same struct proto as the master
1547                 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1548                 * is the same as sk->sk_prot->socks, as this field was copied
1549                 * with memcpy).
1550                 *
1551                 * This _changes_ the previous behaviour, where
1552                 * tcp_create_openreq_child always incremented the
1553                 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1554                 * to be taken into account in all callers. -acme
1555                 */
1556                sk_refcnt_debug_inc(newsk);
1557                sk_set_socket(newsk, NULL);
1558                newsk->sk_wq = NULL;
1559
1560                sk_update_clone(sk, newsk);
1561
1562                if (newsk->sk_prot->sockets_allocated)
1563                        sk_sockets_allocated_inc(newsk);
1564
1565                if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1566                        net_enable_timestamp();
1567        }
1568out:
1569        return newsk;
1570}
1571EXPORT_SYMBOL_GPL(sk_clone_lock);
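
/*
 * Illustrative sketch, not part of this file: a typical caller of
 * sk_clone_lock() in a protocol's accept/openreq path.  The clone is
 * returned bh-locked with sk_refcnt == 2, so the caller must unlock it
 * once protocol-specific initialisation is done:
 *
 *	newsk = sk_clone_lock(sk, GFP_ATOMIC);
 *	if (newsk) {
 *		// initialise protocol-private parts of newsk here
 *		bh_unlock_sock(newsk);
 *	}
 */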
1572
1573void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1574{
1575        __sk_dst_set(sk, dst);
1576        sk->sk_route_caps = dst->dev->features;
1577        if (sk->sk_route_caps & NETIF_F_GSO)
1578                sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1579        sk->sk_route_caps &= ~sk->sk_route_nocaps;
1580        if (sk_can_gso(sk)) {
1581                if (dst->header_len) {
1582                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1583                } else {
1584                        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1585                        sk->sk_gso_max_size = dst->dev->gso_max_size;
1586                        sk->sk_gso_max_segs = dst->dev->gso_max_segs;
1587                }
1588        }
1589}
1590EXPORT_SYMBOL_GPL(sk_setup_caps);
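
/*
 * Illustrative sketch, not part of this file (the route lookup shown is
 * only an assumed IPv4 example): after resolving a route, a protocol
 * installs the dst and lets sk_setup_caps() derive the GSO/SG/checksum
 * capabilities from the output device:
 *
 *	rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
 *	if (!IS_ERR(rt))
 *		sk_setup_caps(sk, &rt->dst);
 */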
1591
1592/*
1593 *      Simple resource managers for sockets.
1594 */
1595
1596
1597/*
1598 * Write buffer destructor automatically called from kfree_skb.
1599 */
1600void sock_wfree(struct sk_buff *skb)
1601{
1602        struct sock *sk = skb->sk;
1603        unsigned int len = skb->truesize;
1604
1605        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1606                /*
1607                 * Keep a reference on sk_wmem_alloc, this will be released
1608                 * after sk_write_space() call
1609                 */
1610                atomic_sub(len - 1, &sk->sk_wmem_alloc);
1611                sk->sk_write_space(sk);
1612                len = 1;
1613        }
1614        /*
1615         * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1616         * could not do because of in-flight packets
1617         */
1618        if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1619                __sk_free(sk);
1620}
1621EXPORT_SYMBOL(sock_wfree);
1622
1623void skb_orphan_partial(struct sk_buff *skb)
1624{
1625        /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
1626         * so we do not completely orphan the skb, but transfer all
1627         * accounted bytes but one, to avoid unexpected reorders.
1628         */
1629        if (skb->destructor == sock_wfree
1630#ifdef CONFIG_INET
1631            || skb->destructor == tcp_wfree
1632#endif
1633                ) {
1634                atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
1635                skb->truesize = 1;
1636        } else {
1637                skb_orphan(skb);
1638        }
1639}
1640EXPORT_SYMBOL(skb_orphan_partial);
1641
1642/*
1643 * Read buffer destructor automatically called from kfree_skb.
1644 */
1645void sock_rfree(struct sk_buff *skb)
1646{
1647        struct sock *sk = skb->sk;
1648        unsigned int len = skb->truesize;
1649
1650        atomic_sub(len, &sk->sk_rmem_alloc);
1651        sk_mem_uncharge(sk, len);
1652}
1653EXPORT_SYMBOL(sock_rfree);
1654
1655void sock_efree(struct sk_buff *skb)
1656{
1657        sock_put(skb->sk);
1658}
1659EXPORT_SYMBOL(sock_efree);
1660
1661#ifdef CONFIG_INET
1662void sock_edemux(struct sk_buff *skb)
1663{
1664        struct sock *sk = skb->sk;
1665
1666        if (sk->sk_state == TCP_TIME_WAIT)
1667                inet_twsk_put(inet_twsk(sk));
1668        else
1669                sock_put(sk);
1670}
1671EXPORT_SYMBOL(sock_edemux);
1672#endif
1673
1674kuid_t sock_i_uid(struct sock *sk)
1675{
1676        kuid_t uid;
1677
1678        read_lock_bh(&sk->sk_callback_lock);
1679        uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1680        read_unlock_bh(&sk->sk_callback_lock);
1681        return uid;
1682}
1683EXPORT_SYMBOL(sock_i_uid);
1684
1685unsigned long sock_i_ino(struct sock *sk)
1686{
1687        unsigned long ino;
1688
1689        read_lock_bh(&sk->sk_callback_lock);
1690        ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1691        read_unlock_bh(&sk->sk_callback_lock);
1692        return ino;
1693}
1694EXPORT_SYMBOL(sock_i_ino);
1695
1696/*
1697 * Allocate a skb from the socket's send buffer.
1698 */
1699struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1700                             gfp_t priority)
1701{
1702        if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1703                struct sk_buff *skb = alloc_skb(size, priority);
1704                if (skb) {
1705                        skb_set_owner_w(skb, sk);
1706                        return skb;
1707                }
1708        }
1709        return NULL;
1710}
1711EXPORT_SYMBOL(sock_wmalloc);
1712
1713/*
1714 * Allocate a memory block from the socket's option memory buffer.
1715 */
1716void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1717{
1718        if ((unsigned int)size <= sysctl_optmem_max &&
1719            atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1720                void *mem;
1721                /* First do the add, to avoid the race if kmalloc
1722                 * might sleep.
1723                 */
1724                atomic_add(size, &sk->sk_omem_alloc);
1725                mem = kmalloc(size, priority);
1726                if (mem)
1727                        return mem;
1728                atomic_sub(size, &sk->sk_omem_alloc);
1729        }
1730        return NULL;
1731}
1732EXPORT_SYMBOL(sock_kmalloc);
1733
1734/* Free an option memory block. Note, we actually want the inline
1735 * here as this allows gcc to detect the nullify and fold away the
1736 * condition entirely.
1737 */
1738static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1739                                  const bool nullify)
1740{
1741        if (WARN_ON_ONCE(!mem))
1742                return;
1743        if (nullify)
1744                kzfree(mem);
1745        else
1746                kfree(mem);
1747        atomic_sub(size, &sk->sk_omem_alloc);
1748}
1749
1750void sock_kfree_s(struct sock *sk, void *mem, int size)
1751{
1752        __sock_kfree_s(sk, mem, size, false);
1753}
1754EXPORT_SYMBOL(sock_kfree_s);
1755
1756void sock_kzfree_s(struct sock *sk, void *mem, int size)
1757{
1758        __sock_kfree_s(sk, mem, size, true);
1759}
1760EXPORT_SYMBOL(sock_kzfree_s);
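
/*
 * Illustrative sketch, not part of this file: sock_kmalloc() charges
 * sk_omem_alloc, so it must be paired with sock_kfree_s() (or
 * sock_kzfree_s() for key material) using the same size:
 *
 *	key = sock_kmalloc(sk, keylen, GFP_KERNEL);
 *	if (!key)
 *		return -ENOBUFS;
 *	// ... use the option memory ...
 *	sock_kzfree_s(sk, key, keylen);	// zero, free and uncharge
 */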
1761
1762/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1763   I think these locks should be removed for datagram sockets.
1764 */
1765static long sock_wait_for_wmem(struct sock *sk, long timeo)
1766{
1767        DEFINE_WAIT(wait);
1768
1769        clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1770        for (;;) {
1771                if (!timeo)
1772                        break;
1773                if (signal_pending(current))
1774                        break;
1775                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1776                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1777                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1778                        break;
1779                if (sk->sk_shutdown & SEND_SHUTDOWN)
1780                        break;
1781                if (sk->sk_err)
1782                        break;
1783                timeo = schedule_timeout(timeo);
1784        }
1785        finish_wait(sk_sleep(sk), &wait);
1786        return timeo;
1787}
1788
1789
1790/*
1791 *      Generic send/receive buffer handlers
1792 */
1793
1794struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1795                                     unsigned long data_len, int noblock,
1796                                     int *errcode, int max_page_order)
1797{
1798        struct sk_buff *skb;
1799        long timeo;
1800        int err;
1801
1802        timeo = sock_sndtimeo(sk, noblock);
1803        for (;;) {
1804                err = sock_error(sk);
1805                if (err != 0)
1806                        goto failure;
1807
1808                err = -EPIPE;
1809                if (sk->sk_shutdown & SEND_SHUTDOWN)
1810                        goto failure;
1811
1812                if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1813                        break;
1814
1815                set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1816                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1817                err = -EAGAIN;
1818                if (!timeo)
1819                        goto failure;
1820                if (signal_pending(current))
1821                        goto interrupted;
1822                timeo = sock_wait_for_wmem(sk, timeo);
1823        }
1824        skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
1825                                   errcode, sk->sk_allocation);
1826        if (skb)
1827                skb_set_owner_w(skb, sk);
1828        return skb;
1829
1830interrupted:
1831        err = sock_intr_errno(timeo);
1832failure:
1833        *errcode = err;
1834        return NULL;
1835}
1836EXPORT_SYMBOL(sock_alloc_send_pskb);
1837
1838struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1839                                    int noblock, int *errcode)
1840{
1841        return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1842}
1843EXPORT_SYMBOL(sock_alloc_send_skb);
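
/*
 * Illustrative sketch, not part of this file: a datagram sendmsg()
 * implementation allocating its skb with sock_alloc_send_skb(), which
 * honours the socket's send buffer and send timeout unless the caller
 * passed MSG_DONTWAIT:
 *
 *	skb = sock_alloc_send_skb(sk, hlen + len,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		return err;
 *	skb_reserve(skb, hlen);
 *	// copy the payload from msg into skb_put(skb, len), build headers,
 *	// then transmit; the skb is already owned by sk via skb_set_owner_w().
 */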
1844
1845/* On 32bit arches, an skb frag is limited to 2^15 */
1846#define SKB_FRAG_PAGE_ORDER     get_order(32768)
1847
1848/**
1849 * skb_page_frag_refill - check that a page_frag contains enough room
1850 * @sz: minimum size of the fragment we want to get
1851 * @pfrag: pointer to page_frag
1852 * @gfp: priority for memory allocation
1853 *
1854 * Note: While this allocator tries to use high order pages, there is
1855 * no guarantee that allocations succeed. Therefore, @sz MUST be
1856 * less than or equal to PAGE_SIZE.
1857 */
1858bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
1859{
1860        if (pfrag->page) {
1861                if (atomic_read(&pfrag->page->_count) == 1) {
1862                        pfrag->offset = 0;
1863                        return true;
1864                }
1865                if (pfrag->offset + sz <= pfrag->size)
1866                        return true;
1867                put_page(pfrag->page);
1868        }
1869
1870        pfrag->offset = 0;
1871        if (SKB_FRAG_PAGE_ORDER) {
1872                pfrag->page = alloc_pages(gfp | __GFP_COMP |
1873                                          __GFP_NOWARN | __GFP_NORETRY,
1874                                          SKB_FRAG_PAGE_ORDER);
1875                if (likely(pfrag->page)) {
1876                        pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
1877                        return true;
1878                }
1879        }
1880        pfrag->page = alloc_page(gfp);
1881        if (likely(pfrag->page)) {
1882                pfrag->size = PAGE_SIZE;
1883                return true;
1884        }
1885        return false;
1886}
1887EXPORT_SYMBOL(skb_page_frag_refill);
1888
1889bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1890{
1891        if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
1892                return true;
1893
1894        sk_enter_memory_pressure(sk);
1895        sk_stream_moderate_sndbuf(sk);
1896        return false;
1897}
1898EXPORT_SYMBOL(sk_page_frag_refill);
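
/*
 * Illustrative sketch, not part of this file: how a sendmsg() loop
 * might use the per-socket page fragment.  sk_page_frag() selects the
 * per-task or per-socket frag depending on sk->sk_allocation:
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, len, pfrag->size - pfrag->offset);
 *	// copy user data into pfrag->page at pfrag->offset, attach the
 *	// fragment to the skb, then advance pfrag->offset by copy.
 */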
1899
1900static void __lock_sock(struct sock *sk)
1901        __releases(&sk->sk_lock.slock)
1902        __acquires(&sk->sk_lock.slock)
1903{
1904        DEFINE_WAIT(wait);
1905
1906        for (;;) {
1907                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1908                                        TASK_UNINTERRUPTIBLE);
1909                spin_unlock_bh(&sk->sk_lock.slock);
1910                schedule();
1911                spin_lock_bh(&sk->sk_lock.slock);
1912                if (!sock_owned_by_user(sk))
1913                        break;
1914        }
1915        finish_wait(&sk->sk_lock.wq, &wait);
1916}
1917
1918static void __release_sock(struct sock *sk)
1919        __releases(&sk->sk_lock.slock)
1920        __acquires(&sk->sk_lock.slock)
1921{
1922        struct sk_buff *skb = sk->sk_backlog.head;
1923
1924        do {
1925                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1926                bh_unlock_sock(sk);
1927
1928                do {
1929                        struct sk_buff *next = skb->next;
1930
1931                        prefetch(next);
1932                        WARN_ON_ONCE(skb_dst_is_noref(skb));
1933                        skb->next = NULL;
1934                        sk_backlog_rcv(sk, skb);
1935
1936                        /*
1937                         * We are in process context here with softirqs
1938                         * disabled, use cond_resched_softirq() to preempt.
1939                         * This is safe to do because we've taken the backlog
1940                         * queue private:
1941                         */
1942                        cond_resched_softirq();
1943
1944                        skb = next;
1945                } while (skb != NULL);
1946
1947                bh_lock_sock(sk);
1948        } while ((skb = sk->sk_backlog.head) != NULL);
1949
1950        /*
1951         * Doing the zeroing here guarantees we cannot loop forever
1952         * while a wild producer attempts to flood us.
1953         */
1954        sk->sk_backlog.len = 0;
1955}
1956
1957/**
1958 * sk_wait_data - wait for data to arrive at sk_receive_queue
1959 * @sk:    sock to wait on
1960 * @timeo: for how long
1961 *
1962 * Now socket state including sk->sk_err is changed only under lock,
1963 * hence we may omit checks after joining wait queue.
1964 * We check receive queue before schedule() only as optimization;
1965 * it is very likely that release_sock() added new data.
1966 */
1967int sk_wait_data(struct sock *sk, long *timeo)
1968{
1969        int rc;
1970        DEFINE_WAIT(wait);
1971
1972        prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1973        set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1974        rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1975        clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1976        finish_wait(sk_sleep(sk), &wait);
1977        return rc;
1978}
1979EXPORT_SYMBOL(sk_wait_data);
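
/*
 * Illustrative sketch, not part of this file: a blocking receive loop
 * built on sk_wait_data(), in the style of the stream protocols.  The
 * socket lock is held on entry; sk_wait_event() releases and retakes it
 * around schedule():
 *
 *	while (skb_queue_empty(&sk->sk_receive_queue)) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo);
 *	}
 */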
1980
1981/**
1982 *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1983 *      @sk: socket
1984 *      @size: memory size to allocate
1985 *      @kind: allocation type
1986 *
1987 *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1988 *      rmem allocation. This function assumes that protocols which have
1989 *      memory_pressure use sk_wmem_queued as write buffer accounting.
1990 */
1991int __sk_mem_schedule(struct sock *sk, int size, int kind)
1992{
1993        struct proto *prot = sk->sk_prot;
1994        int amt = sk_mem_pages(size);
1995        long allocated;
1996        int parent_status = UNDER_LIMIT;
1997
1998        sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1999
2000        allocated = sk_memory_allocated_add(sk, amt, &parent_status);
2001
2002        /* Under limit. */
2003        if (parent_status == UNDER_LIMIT &&
2004                        allocated <= sk_prot_mem_limits(sk, 0)) {
2005                sk_leave_memory_pressure(sk);
2006                return 1;
2007        }
2008
2009        /* Under pressure. (we or our parents) */
2010        if ((parent_status > SOFT_LIMIT) ||
2011                        allocated > sk_prot_mem_limits(sk, 1))
2012                sk_enter_memory_pressure(sk);
2013
2014        /* Over hard limit (we or our parents) */
2015        if ((parent_status == OVER_LIMIT) ||
2016                        (allocated > sk_prot_mem_limits(sk, 2)))
2017                goto suppress_allocation;
2018
2019        /* guarantee minimum buffer size under pressure */
2020        if (kind == SK_MEM_RECV) {
2021                if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2022                        return 1;
2023
2024        } else { /* SK_MEM_SEND */
2025                if (sk->sk_type == SOCK_STREAM) {
2026                        if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2027                                return 1;
2028                } else if (atomic_read(&sk->sk_wmem_alloc) <
2029                           prot->sysctl_wmem[0])
2030                                return 1;
2031        }
2032
2033        if (sk_has_memory_pressure(sk)) {
2034                int alloc;
2035
2036                if (!sk_under_memory_pressure(sk))
2037                        return 1;
2038                alloc = sk_sockets_allocated_read_positive(sk);
2039                if (sk_prot_mem_limits(sk, 2) > alloc *
2040                    sk_mem_pages(sk->sk_wmem_queued +
2041                                 atomic_read(&sk->sk_rmem_alloc) +
2042                                 sk->sk_forward_alloc))
2043                        return 1;
2044        }
2045
2046suppress_allocation:
2047
2048        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2049                sk_stream_moderate_sndbuf(sk);
2050
2051                /* Fail only if socket is _under_ its sndbuf.
2052                 * In this case we cannot block, so we have to fail.
2053                 */
2054                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2055                        return 1;
2056        }
2057
2058        trace_sock_exceed_buf_limit(sk, prot, allocated);
2059
2060        /* Alas. Undo changes. */
2061        sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
2062
2063        sk_memory_allocated_sub(sk, amt);
2064
2065        return 0;
2066}
2067EXPORT_SYMBOL(__sk_mem_schedule);
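
/*
 * Illustrative sketch, not part of this file: protocols normally go
 * through the sk_wmem_schedule()/sk_rmem_schedule() wrappers in
 * net/sock.h, which only call __sk_mem_schedule() when sk_forward_alloc
 * cannot already cover the request:
 *
 *	if (!sk_rmem_schedule(sk, skb, skb->truesize))
 *		goto drop;			// protocol memory limits hit
 *	sk_mem_charge(sk, skb->truesize);	// consume forward_alloc
 */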
2068
2069/**
2070 *      __sk_mem_reclaim - reclaim memory_allocated
2071 *      @sk: socket
2072 */
2073void __sk_mem_reclaim(struct sock *sk)
2074{
2075        sk_memory_allocated_sub(sk,
2076                                sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
2077        sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
2078
2079        if (sk_under_memory_pressure(sk) &&
2080            (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2081                sk_leave_memory_pressure(sk);
2082}
2083EXPORT_SYMBOL(__sk_mem_reclaim);
2084
2085
2086/*
2087 * Set of default routines for initialising struct proto_ops when
2088 * the protocol does not support a particular function. In certain
2089 * cases where it makes no sense for a protocol to have a "do nothing"
2090 * function, some default processing is provided.
2091 */
2092
2093int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2094{
2095        return -EOPNOTSUPP;
2096}
2097EXPORT_SYMBOL(sock_no_bind);
2098
2099int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2100                    int len, int flags)
2101{
2102        return -EOPNOTSUPP;
2103}
2104EXPORT_SYMBOL(sock_no_connect);
2105
2106int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2107{
2108        return -EOPNOTSUPP;
2109}
2110EXPORT_SYMBOL(sock_no_socketpair);
2111
2112int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2113{
2114        return -EOPNOTSUPP;
2115}
2116EXPORT_SYMBOL(sock_no_accept);
2117
2118int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2119                    int *len, int peer)
2120{
2121        return -EOPNOTSUPP;
2122}
2123EXPORT_SYMBOL(sock_no_getname);
2124
2125unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2126{
2127        return 0;
2128}
2129EXPORT_SYMBOL(sock_no_poll);
2130
2131int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2132{
2133        return -EOPNOTSUPP;
2134}
2135EXPORT_SYMBOL(sock_no_ioctl);
2136
2137int sock_no_listen(struct socket *sock, int backlog)
2138{
2139        return -EOPNOTSUPP;
2140}
2141EXPORT_SYMBOL(sock_no_listen);
2142
2143int sock_no_shutdown(struct socket *sock, int how)
2144{
2145        return -EOPNOTSUPP;
2146}
2147EXPORT_SYMBOL(sock_no_shutdown);
2148
2149int sock_no_setsockopt(struct socket *sock, int level, int optname,
2150                    char __user *optval, unsigned int optlen)
2151{
2152        return -EOPNOTSUPP;
2153}
2154EXPORT_SYMBOL(sock_no_setsockopt);
2155
2156int sock_no_getsockopt(struct socket *sock, int level, int optname,
2157                    char __user *optval, int __user *optlen)
2158{
2159        return -EOPNOTSUPP;
2160}
2161EXPORT_SYMBOL(sock_no_getsockopt);
2162
2163int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2164                    size_t len)
2165{
2166        return -EOPNOTSUPP;
2167}
2168EXPORT_SYMBOL(sock_no_sendmsg);
2169
2170int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2171                    size_t len, int flags)
2172{
2173        return -EOPNOTSUPP;
2174}
2175EXPORT_SYMBOL(sock_no_recvmsg);
2176
2177int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2178{
2179        /* Mirror missing mmap method error code */
2180        return -ENODEV;
2181}
2182EXPORT_SYMBOL(sock_no_mmap);
2183
2184ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2185{
2186        ssize_t res;
2187        struct msghdr msg = {.msg_flags = flags};
2188        struct kvec iov;
2189        char *kaddr = kmap(page);
2190        iov.iov_base = kaddr + offset;
2191        iov.iov_len = size;
2192        res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2193        kunmap(page);
2194        return res;
2195}
2196EXPORT_SYMBOL(sock_no_sendpage);
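
/*
 * Illustrative sketch, not part of this file (names hypothetical): a
 * proto_ops table wiring the sock_no_*() stubs above into the slots a
 * protocol does not support:
 *
 *	static const struct proto_ops example_ops = {
 *		.family		= PF_EXAMPLE,
 *		.owner		= THIS_MODULE,
 *		.release	= example_release,
 *		.bind		= sock_no_bind,
 *		.accept		= sock_no_accept,
 *		.ioctl		= sock_no_ioctl,
 *		.listen		= sock_no_listen,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *	};
 */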
2197
2198/*
2199 *      Default Socket Callbacks
2200 */
2201
2202static void sock_def_wakeup(struct sock *sk)
2203{
2204        struct socket_wq *wq;
2205
2206        rcu_read_lock();
2207        wq = rcu_dereference(sk->sk_wq);
2208        if (wq_has_sleeper(wq))
2209                wake_up_interruptible_all(&wq->wait);
2210        rcu_read_unlock();
2211}
2212
2213static void sock_def_error_report(struct sock *sk)
2214{
2215        struct socket_wq *wq;
2216
2217        rcu_read_lock();
2218        wq = rcu_dereference(sk->sk_wq);
2219        if (wq_has_sleeper(wq))
2220                wake_up_interruptible_poll(&wq->wait, POLLERR);
2221        sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2222        rcu_read_unlock();
2223}
2224
2225static void sock_def_readable(struct sock *sk)
2226{
2227        struct socket_wq *wq;
2228
2229        rcu_read_lock();
2230        wq = rcu_dereference(sk->sk_wq);
2231        if (wq_has_sleeper(wq))
2232                wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2233                                                POLLRDNORM | POLLRDBAND);
2234        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2235        rcu_read_unlock();
2236}
2237
2238static void sock_def_write_space(struct sock *sk)
2239{
2240        struct socket_wq *wq;
2241
2242        rcu_read_lock();
2243
2244        /* Do not wake up a writer until he can make "significant"
2245         * progress.  --DaveM
2246         */
2247        if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2248                wq = rcu_dereference(sk->sk_wq);
2249                if (wq_has_sleeper(wq))
2250                        wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2251                                                POLLWRNORM | POLLWRBAND);
2252
2253                /* Should agree with poll, otherwise some programs break */
2254                if (sock_writeable(sk))
2255                        sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2256        }
2257
2258        rcu_read_unlock();
2259}
2260
2261static void sock_def_destruct(struct sock *sk)
2262{
2263        kfree(sk->sk_protinfo);
2264}
2265
2266void sk_send_sigurg(struct sock *sk)
2267{
2268        if (sk->sk_socket && sk->sk_socket->file)
2269                if (send_sigurg(&sk->sk_socket->file->f_owner))
2270                        sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2271}
2272EXPORT_SYMBOL(sk_send_sigurg);
2273
2274void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2275                    unsigned long expires)
2276{
2277        if (!mod_timer(timer, expires))
2278                sock_hold(sk);
2279}
2280EXPORT_SYMBOL(sk_reset_timer);
2281
2282void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2283{
2284        if (del_timer(timer))
2285                __sock_put(sk);
2286}
2287EXPORT_SYMBOL(sk_stop_timer);
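
/*
 * Illustrative sketch, not part of this file (handler name hypothetical):
 * sk_reset_timer() takes a socket reference when it arms an inactive
 * timer, and sk_stop_timer() drops it if the timer was still pending, so
 * a handler that runs to completion must drop the reference itself:
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ);
 *	...
 *	static void example_timer_handler(unsigned long data)
 *	{
 *		struct sock *sk = (struct sock *)data;
 *
 *		// do the timer work, then release the reference taken
 *		// by sk_reset_timer()
 *		sock_put(sk);
 *	}
 */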
2288
2289void sock_init_data(struct socket *sock, struct sock *sk)
2290{
2291        skb_queue_head_init(&sk->sk_receive_queue);
2292        skb_queue_head_init(&sk->sk_write_queue);
2293        skb_queue_head_init(&sk->sk_error_queue);
2294
2295        sk->sk_send_head        =       NULL;
2296
2297        init_timer(&sk->sk_timer);
2298
2299        sk->sk_allocation       =       GFP_KERNEL;
2300        sk->sk_rcvbuf           =       sysctl_rmem_default;
2301        sk->sk_sndbuf           =       sysctl_wmem_default;
2302        sk->sk_state            =       TCP_CLOSE;
2303        sk_set_socket(sk, sock);
2304
2305        sock_set_flag(sk, SOCK_ZAPPED);
2306
2307        if (sock) {
2308                sk->sk_type     =       sock->type;
2309                sk->sk_wq       =       sock->wq;
2310                sock->sk        =       sk;
2311        } else
2312                sk->sk_wq       =       NULL;
2313
2314        spin_lock_init(&sk->sk_dst_lock);
2315        rwlock_init(&sk->sk_callback_lock);
2316        lockdep_set_class_and_name(&sk->sk_callback_lock,
2317                        af_callback_keys + sk->sk_family,
2318                        af_family_clock_key_strings[sk->sk_family]);
2319
2320        sk->sk_state_change     =       sock_def_wakeup;
2321        sk->sk_data_ready       =       sock_def_readable;
2322        sk->sk_write_space      =       sock_def_write_space;
2323        sk->sk_error_report     =       sock_def_error_report;
2324        sk->sk_destruct         =       sock_def_destruct;
2325
2326        sk->sk_frag.page        =       NULL;
2327        sk->sk_frag.offset      =       0;
2328        sk->sk_peek_off         =       -1;
2329
2330        sk->sk_peer_pid         =       NULL;
2331        sk->sk_peer_cred        =       NULL;
2332        sk->sk_write_pending    =       0;
2333        sk->sk_rcvlowat         =       1;
2334        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2335        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2336
2337        sk->sk_stamp = ktime_set(-1L, 0);
2338
2339#ifdef CONFIG_NET_RX_BUSY_POLL
2340        sk->sk_napi_id          =       0;
2341        sk->sk_ll_usec          =       sysctl_net_busy_read;
2342#endif
2343
2344        sk->sk_max_pacing_rate = ~0U;
2345        sk->sk_pacing_rate = ~0U;
2346        /*
2347         * Before updating sk_refcnt, we must commit prior changes to memory
2348         * (Documentation/RCU/rculist_nulls.txt for details)
2349         */
2350        smp_wmb();
2351        atomic_set(&sk->sk_refcnt, 1);
2352        atomic_set(&sk->sk_drops, 0);
2353}
2354EXPORT_SYMBOL(sock_init_data);
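
/*
 * Illustrative sketch, not part of this file (family and proto names
 * hypothetical): a protocol family's create() hook allocates the sock
 * with sk_alloc() and then fills in the generic defaults with
 * sock_init_data() before doing its own setup:
 *
 *	sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *	sk->sk_protocol = protocol;
 */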
2355
2356void lock_sock_nested(struct sock *sk, int subclass)
2357{
2358        might_sleep();
2359        spin_lock_bh(&sk->sk_lock.slock);
2360        if (sk->sk_lock.owned)
2361                __lock_sock(sk);
2362        sk->sk_lock.owned = 1;
2363        spin_unlock(&sk->sk_lock.slock);
2364        /*
2365         * The sk_lock has mutex_lock() semantics here:
2366         */
2367        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2368        local_bh_enable();
2369}
2370EXPORT_SYMBOL(lock_sock_nested);
2371
2372void release_sock(struct sock *sk)
2373{
2374        /*
2375         * The sk_lock has mutex_unlock() semantics:
2376         */
2377        mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2378
2379        spin_lock_bh(&sk->sk_lock.slock);
2380        if (sk->sk_backlog.tail)
2381                __release_sock(sk);
2382
2383        /* Warning : release_cb() might need to release sk ownership,
2384         * ie call sock_release_ownership(sk) before us.
2385         */
2386        if (sk->sk_prot->release_cb)
2387                sk->sk_prot->release_cb(sk);
2388
2389        sock_release_ownership(sk);
2390        if (waitqueue_active(&sk->sk_lock.wq))
2391                wake_up(&sk->sk_lock.wq);
2392        spin_unlock_bh(&sk->sk_lock.slock);
2393}
2394EXPORT_SYMBOL(release_sock);
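
/*
 * Illustrative sketch, not part of this file: the canonical process
 * context locking pattern.  While the socket is owned, softirq input is
 * queued on the backlog and replayed by __release_sock() when the owner
 * calls release_sock():
 *
 *	lock_sock(sk);
 *	// modify socket state safely in process context
 *	release_sock(sk);
 */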
2395
2396/**
2397 * lock_sock_fast - fast version of lock_sock
2398 * @sk: socket
2399 *
2400 * This version should be used for very small sections, where the process won't block.
2401 * return false if fast path is taken
2402 *   sk_lock.slock locked, owned = 0, BH disabled
2403 * return true if slow path is taken
2404 *   sk_lock.slock unlocked, owned = 1, BH enabled
2405 */
2406bool lock_sock_fast(struct sock *sk)
2407{
2408        might_sleep();
2409        spin_lock_bh(&sk->sk_lock.slock);
2410
2411        if (!sk->sk_lock.owned)
2412                /*
2413                 * Note : We must disable BH
2414                 */
2415                return false;
2416
2417        __lock_sock(sk);
2418        sk->sk_lock.owned = 1;
2419        spin_unlock(&sk->sk_lock.slock);
2420        /*
2421         * The sk_lock has mutex_lock() semantics here:
2422         */
2423        mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2424        local_bh_enable();
2425        return true;
2426}
2427EXPORT_SYMBOL(lock_sock_fast);
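
/*
 * Illustrative sketch, not part of this file: lock_sock_fast() must be
 * paired with unlock_sock_fast(), which needs to know whether the slow
 * path (full ownership) was taken:
 *
 *	bool slow = lock_sock_fast(sk);
 *	// short critical section, e.g. purging a queue
 *	unlock_sock_fast(sk, slow);
 */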
2428
2429int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2430{
2431        struct timeval tv;
2432        if (!sock_flag(sk, SOCK_TIMESTAMP))
2433                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2434        tv = ktime_to_timeval(sk->sk_stamp);
2435        if (tv.tv_sec == -1)
2436                return -ENOENT;
2437        if (tv.tv_sec == 0) {
2438                sk->sk_stamp = ktime_get_real();
2439                tv = ktime_to_timeval(sk->sk_stamp);
2440        }
2441        return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2442}
2443EXPORT_SYMBOL(sock_get_timestamp);
2444
2445int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2446{
2447        struct timespec ts;
2448        if (!sock_flag(sk, SOCK_TIMESTAMP))
2449                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2450        ts = ktime_to_timespec(sk->sk_stamp);
2451        if (ts.tv_sec == -1)
2452                return -ENOENT;
2453        if (ts.tv_sec == 0) {
2454                sk->sk_stamp = ktime_get_real();
2455                ts = ktime_to_timespec(sk->sk_stamp);
2456        }
2457        return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2458}
2459EXPORT_SYMBOL(sock_get_timestampns);
2460
2461void sock_enable_timestamp(struct sock *sk, int flag)
2462{
2463        if (!sock_flag(sk, flag)) {
2464                unsigned long previous_flags = sk->sk_flags;
2465
2466                sock_set_flag(sk, flag);
2467                /*
2468                 * we just set one of the two flags which require net
2469                 * time stamping, but time stamping might have been on
2470                 * already because of the other one
2471                 */
2472                if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2473                        net_enable_timestamp();
2474        }
2475}
2476
2477int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2478                       int level, int type)
2479{
2480        struct sock_exterr_skb *serr;
2481        struct sk_buff *skb;
2482        int copied, err;
2483
2484        err = -EAGAIN;
2485        skb = sock_dequeue_err_skb(sk);
2486        if (skb == NULL)
2487                goto out;
2488
2489        copied = skb->len;
2490        if (copied > len) {
2491                msg->msg_flags |= MSG_TRUNC;
2492                copied = len;
2493        }
2494        err = skb_copy_datagram_msg(skb, 0, msg, copied);
2495        if (err)
2496                goto out_free_skb;
2497
2498        sock_recv_timestamp(msg, sk, skb);
2499
2500        serr = SKB_EXT_ERR(skb);
2501        put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2502
2503        msg->msg_flags |= MSG_ERRQUEUE;
2504        err = copied;
2505
2506out_free_skb:
2507        kfree_skb(skb);
2508out:
2509        return err;
2510}
2511EXPORT_SYMBOL(sock_recv_errqueue);
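
/*
 * Illustrative sketch, not part of this file: a protocol's recvmsg() can
 * delegate MSG_ERRQUEUE handling to sock_recv_errqueue(); the level and
 * type shown are the ones af_packet uses, other protocols pass their own:
 *
 *	if (flags & MSG_ERRQUEUE)
 *		return sock_recv_errqueue(sk, msg, len,
 *					  SOL_PACKET, PACKET_TX_TIMESTAMP);
 */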
2512
2513/*
2514 *      Get a socket option on a socket.
2515 *
2516 *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2517 *      asynchronous errors should be reported by getsockopt. We assume
2518 *      this means if you specify SO_ERROR (otherwise what's the point of it).
2519 */
2520int sock_common_getsockopt(struct socket *sock, int level, int optname,
2521                           char __user *optval, int __user *optlen)
2522{
2523        struct sock *sk = sock->sk;
2524
2525        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2526}
2527EXPORT_SYMBOL(sock_common_getsockopt);
2528
2529#ifdef CONFIG_COMPAT
2530int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2531                                  char __user *optval, int __user *optlen)
2532{
2533        struct sock *sk = sock->sk;
2534
2535        if (sk->sk_prot->compat_getsockopt != NULL)
2536                return sk->sk_prot->compat_getsockopt(sk, level, optname,
2537                                                      optval, optlen);
2538        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2539}
2540EXPORT_SYMBOL(compat_sock_common_getsockopt);
2541#endif
2542
2543int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2544                        struct msghdr *msg, size_t size, int flags)
2545{
2546        struct sock *sk = sock->sk;
2547        int addr_len = 0;
2548        int err;
2549
2550        err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2551                                   flags & ~MSG_DONTWAIT, &addr_len);
2552        if (err >= 0)
2553                msg->msg_namelen = addr_len;
2554        return err;
2555}
2556EXPORT_SYMBOL(sock_common_recvmsg);
2557
2558/*
2559 *      Set socket options on an inet socket.
2560 */
2561int sock_common_setsockopt(struct socket *sock, int level, int optname,
2562                           char __user *optval, unsigned int optlen)
2563{
2564        struct sock *sk = sock->sk;
2565
2566        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2567}
2568EXPORT_SYMBOL(sock_common_setsockopt);
2569
2570#ifdef CONFIG_COMPAT
2571int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2572                                  char __user *optval, unsigned int optlen)
2573{
2574        struct sock *sk = sock->sk;
2575
2576        if (sk->sk_prot->compat_setsockopt != NULL)
2577                return sk->sk_prot->compat_setsockopt(sk, level, optname,
2578                                                      optval, optlen);
2579        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2580}
2581EXPORT_SYMBOL(compat_sock_common_setsockopt);
2582#endif
2583
2584void sk_common_release(struct sock *sk)
2585{
2586        if (sk->sk_prot->destroy)
2587                sk->sk_prot->destroy(sk);
2588
2589        /*
2590         * Observation: when sk_common_release is called, processes have
2591         * no access to the socket, but the network stack still does.
2592         * Step one, detach it from networking:
2593         *
2594         * A. Remove from hash tables.
2595         */
2596
2597        sk->sk_prot->unhash(sk);
2598
2599        /*
2600         * At this point the socket cannot receive new packets, but it is possible
2601         * that some packets are in flight because some CPU runs the receiver and
2602         * did the hash table lookup before we unhashed the socket. They will reach
2603         * the receive queue and will be purged by the socket destructor.
2604         *
2605         * Also we still have packets pending on the receive queue and probably
2606         * our own packets waiting in device queues. sock_destroy will drain the
2607         * receive queue, but transmitted packets will delay socket destruction
2608         * until the last reference is released.
2609         */
2610
2611        sock_orphan(sk);
2612
2613        xfrm_sk_free_policy(sk);
2614
2615        sk_refcnt_debug_release(sk);
2616
2617        if (sk->sk_frag.page) {
2618                put_page(sk->sk_frag.page);
2619                sk->sk_frag.page = NULL;
2620        }
2621
2622        sock_put(sk);
2623}
2624EXPORT_SYMBOL(sk_common_release);
2625
2626#ifdef CONFIG_PROC_FS
2627#define PROTO_INUSE_NR  64      /* should be enough for the first time */
2628struct prot_inuse {
2629        int val[PROTO_INUSE_NR];
2630};
2631
2632static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2633
2634#ifdef CONFIG_NET_NS
2635void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2636{
2637        __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2638}
2639EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2640
2641int sock_prot_inuse_get(struct net *net, struct proto *prot)
2642{
2643        int cpu, idx = prot->inuse_idx;
2644        int res = 0;
2645
2646        for_each_possible_cpu(cpu)
2647                res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2648
2649        return res >= 0 ? res : 0;
2650}
2651EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2652
2653static int __net_init sock_inuse_init_net(struct net *net)
2654{
2655        net->core.inuse = alloc_percpu(struct prot_inuse);
2656        return net->core.inuse ? 0 : -ENOMEM;
2657}
2658
2659static void __net_exit sock_inuse_exit_net(struct net *net)
2660{
2661        free_percpu(net->core.inuse);
2662}
2663
2664static struct pernet_operations net_inuse_ops = {
2665        .init = sock_inuse_init_net,
2666        .exit = sock_inuse_exit_net,
2667};
2668
2669static __init int net_inuse_init(void)
2670{
2671        if (register_pernet_subsys(&net_inuse_ops))
2672                panic("Cannot initialize net inuse counters");
2673
2674        return 0;
2675}
2676
2677core_initcall(net_inuse_init);
2678#else
2679static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2680
2681void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2682{
2683        __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2684}
2685EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2686
2687int sock_prot_inuse_get(struct net *net, struct proto *prot)
2688{
2689        int cpu, idx = prot->inuse_idx;
2690        int res = 0;
2691
2692        for_each_possible_cpu(cpu)
2693                res += per_cpu(prot_inuse, cpu).val[idx];
2694
2695        return res >= 0 ? res : 0;
2696}
2697EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2698#endif
2699
2700static void assign_proto_idx(struct proto *prot)
2701{
2702        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2703
2704        if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2705                pr_err("PROTO_INUSE_NR exhausted\n");
2706                return;
2707        }
2708
2709        set_bit(prot->inuse_idx, proto_inuse_idx);
2710}
2711
2712static void release_proto_idx(struct proto *prot)
2713{
2714        if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2715                clear_bit(prot->inuse_idx, proto_inuse_idx);
2716}
2717#else
2718static inline void assign_proto_idx(struct proto *prot)
2719{
2720}
2721
2722static inline void release_proto_idx(struct proto *prot)
2723{
2724}
2725#endif
2726
2727int proto_register(struct proto *prot, int alloc_slab)
2728{
2729        if (alloc_slab) {
2730                prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2731                                        SLAB_HWCACHE_ALIGN | prot->slab_flags,
2732                                        NULL);
2733
2734                if (prot->slab == NULL) {
2735                        pr_crit("%s: Can't create sock SLAB cache!\n",
2736                                prot->name);
2737                        goto out;
2738                }
2739
2740                if (prot->rsk_prot != NULL) {
2741                        prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2742                        if (prot->rsk_prot->slab_name == NULL)
2743                                goto out_free_sock_slab;
2744
2745                        prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2746                                                                 prot->rsk_prot->obj_size, 0,
2747                                                                 SLAB_HWCACHE_ALIGN, NULL);
2748
2749                        if (prot->rsk_prot->slab == NULL) {
2750                                pr_crit("%s: Can't create request sock SLAB cache!\n",
2751                                        prot->name);
2752                                goto out_free_request_sock_slab_name;
2753                        }
2754                }
2755
2756                if (prot->twsk_prot != NULL) {
2757                        prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2758
2759                        if (prot->twsk_prot->twsk_slab_name == NULL)
2760                                goto out_free_request_sock_slab;
2761
2762                        prot->twsk_prot->twsk_slab =
2763                                kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2764                                                  prot->twsk_prot->twsk_obj_size,
2765                                                  0,
2766                                                  SLAB_HWCACHE_ALIGN |
2767                                                        prot->slab_flags,
2768                                                  NULL);
2769                        if (prot->twsk_prot->twsk_slab == NULL)
2770                                goto out_free_timewait_sock_slab_name;
2771                }
2772        }
2773
2774        mutex_lock(&proto_list_mutex);
2775        list_add(&prot->node, &proto_list);
2776        assign_proto_idx(prot);
2777        mutex_unlock(&proto_list_mutex);
2778        return 0;
2779
2780out_free_timewait_sock_slab_name:
2781        kfree(prot->twsk_prot->twsk_slab_name);
2782out_free_request_sock_slab:
2783        if (prot->rsk_prot && prot->rsk_prot->slab) {
2784                kmem_cache_destroy(prot->rsk_prot->slab);
2785                prot->rsk_prot->slab = NULL;
2786        }
2787out_free_request_sock_slab_name:
2788        if (prot->rsk_prot)
2789                kfree(prot->rsk_prot->slab_name);
2790out_free_sock_slab:
2791        kmem_cache_destroy(prot->slab);
2792        prot->slab = NULL;
2793out:
2794        return -ENOBUFS;
2795}
2796EXPORT_SYMBOL(proto_register);
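
/*
 * Illustrative sketch, not part of this file (names hypothetical): a
 * protocol registers its struct proto once at module init, asking for a
 * dedicated slab cache, and unregisters it on exit:
 *
 *	static struct proto example_proto = {
 *		.name		= "EXAMPLE",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct example_sock),
 *	};
 *
 *	rc = proto_register(&example_proto, 1);
 *	if (rc)
 *		return rc;
 *	// ...
 *	proto_unregister(&example_proto);
 */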
2797
2798void proto_unregister(struct proto *prot)
2799{
2800        mutex_lock(&proto_list_mutex);
2801        release_proto_idx(prot);
2802        list_del(&prot->node);
2803        mutex_unlock(&proto_list_mutex);
2804
2805        if (prot->slab != NULL) {
2806                kmem_cache_destroy(prot->slab);
2807                prot->slab = NULL;
2808        }
2809
2810        if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2811                kmem_cache_destroy(prot->rsk_prot->slab);
2812                kfree(prot->rsk_prot->slab_name);
2813                prot->rsk_prot->slab = NULL;
2814        }
2815
2816        if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2817                kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2818                kfree(prot->twsk_prot->twsk_slab_name);
2819                prot->twsk_prot->twsk_slab = NULL;
2820        }
2821}
2822EXPORT_SYMBOL(proto_unregister);
2823
2824#ifdef CONFIG_PROC_FS
2825static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2826        __acquires(proto_list_mutex)
2827{
2828        mutex_lock(&proto_list_mutex);
2829        return seq_list_start_head(&proto_list, *pos);
2830}
2831
2832static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2833{
2834        return seq_list_next(v, &proto_list, pos);
2835}
2836
2837static void proto_seq_stop(struct seq_file *seq, void *v)
2838        __releases(proto_list_mutex)
2839{
2840        mutex_unlock(&proto_list_mutex);
2841}
2842
2843static char proto_method_implemented(const void *method)
2844{
2845        return method == NULL ? 'n' : 'y';
2846}
2847static long sock_prot_memory_allocated(struct proto *proto)
2848{
2849        return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2850}
2851
2852static char *sock_prot_memory_pressure(struct proto *proto)
2853{
2854        return proto->memory_pressure != NULL ?
2855               (proto_memory_pressure(proto) ? "yes" : "no") : "NI";
2856}
2857
2858static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2859{
2860
2861        seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2862                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2863                   proto->name,
2864                   proto->obj_size,
2865                   sock_prot_inuse_get(seq_file_net(seq), proto),
2866                   sock_prot_memory_allocated(proto),
2867                   sock_prot_memory_pressure(proto),
2868                   proto->max_header,
2869                   proto->slab == NULL ? "no" : "yes",
2870                   module_name(proto->owner),
2871                   proto_method_implemented(proto->close),
2872                   proto_method_implemented(proto->connect),
2873                   proto_method_implemented(proto->disconnect),
2874                   proto_method_implemented(proto->accept),
2875                   proto_method_implemented(proto->ioctl),
2876                   proto_method_implemented(proto->init),
2877                   proto_method_implemented(proto->destroy),
2878                   proto_method_implemented(proto->shutdown),
2879                   proto_method_implemented(proto->setsockopt),
2880                   proto_method_implemented(proto->getsockopt),
2881                   proto_method_implemented(proto->sendmsg),
2882                   proto_method_implemented(proto->recvmsg),
2883                   proto_method_implemented(proto->sendpage),
2884                   proto_method_implemented(proto->bind),
2885                   proto_method_implemented(proto->backlog_rcv),
2886                   proto_method_implemented(proto->hash),
2887                   proto_method_implemented(proto->unhash),
2888                   proto_method_implemented(proto->get_port),
2889                   proto_method_implemented(proto->enter_memory_pressure));
2890}
2891
2892static int proto_seq_show(struct seq_file *seq, void *v)
2893{
2894        if (v == &proto_list)
2895                seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2896                           "protocol",
2897                           "size",
2898                           "sockets",
2899                           "memory",
2900                           "press",
2901                           "maxhdr",
2902                           "slab",
2903                           "module",
2904                           "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2905        else
2906                proto_seq_printf(seq, list_entry(v, struct proto, node));
2907        return 0;
2908}
2909
2910static const struct seq_operations proto_seq_ops = {
2911        .start  = proto_seq_start,
2912        .next   = proto_seq_next,
2913        .stop   = proto_seq_stop,
2914        .show   = proto_seq_show,
2915};
2916
2917static int proto_seq_open(struct inode *inode, struct file *file)
2918{
2919        return seq_open_net(inode, file, &proto_seq_ops,
2920                            sizeof(struct seq_net_private));
2921}
2922
2923static const struct file_operations proto_seq_fops = {
2924        .owner          = THIS_MODULE,
2925        .open           = proto_seq_open,
2926        .read           = seq_read,
2927        .llseek         = seq_lseek,
2928        .release        = seq_release_net,
2929};
2930
2931static __net_init int proto_init_net(struct net *net)
2932{
2933        if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
2934                return -ENOMEM;
2935
2936        return 0;
2937}
2938
2939static __net_exit void proto_exit_net(struct net *net)
2940{
2941        remove_proc_entry("protocols", net->proc_net);
2942}
2943
2944
2945static __net_initdata struct pernet_operations proto_net_ops = {
2946        .init = proto_init_net,
2947        .exit = proto_exit_net,
2948};
2949
2950static int __init proto_init(void)
2951{
2952        return register_pernet_subsys(&proto_net_ops);
2953}
2954
2955subsys_initcall(proto_init);
2956
2957#endif /* PROC_FS */
2958