linux/net/core/sock.c
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Generic socket support routines. Memory allocators, socket lock/release
   7 *              handler for protocols to use and generic option handler.
   8 *
   9 *
  10 * Authors:     Ross Biro
  11 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *              Florian La Roche, <flla@stud.uni-sb.de>
  13 *              Alan Cox, <A.Cox@swansea.ac.uk>
  14 *
  15 * Fixes:
  16 *              Alan Cox        :       Numerous verify_area() problems
  17 *              Alan Cox        :       Connecting on a connecting socket
  18 *                                      now returns an error for tcp.
  19 *              Alan Cox        :       sock->protocol is set correctly.
  20 *                                      and is not sometimes left as 0.
  21 *              Alan Cox        :       connect handles icmp errors on a
  22 *                                      connect properly. Unfortunately there
  23 *                                      is a restart syscall nasty there. I
  24 *                                      can't match BSD without hacking the C
  25 *                                      library. Ideas urgently sought!
  26 *              Alan Cox        :       Disallow bind() to addresses that are
  27 *                                      not ours - especially broadcast ones!!
  28 *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29 *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30 *                                      instead they leave that for the DESTROY timer.
  31 *              Alan Cox        :       Clean up error flag in accept
  32 *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33 *                                      was buggy. Put a remove_sock() in the handler
  34 *                                      for memory when we hit 0. Also altered the timer
  35 *                                      code. The ACK stuff can wait and needs major
  36 *                                      TCP layer surgery.
  37 *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38 *                                      and fixed timer/inet_bh race.
  39 *              Alan Cox        :       Added zapped flag for TCP
  40 *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41 *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42 *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43 *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44 *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45 *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46 *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47 *      Pauline Middelink       :       identd support
  48 *              Alan Cox        :       Fixed connect() taking signals I think.
  49 *              Alan Cox        :       SO_LINGER supported
  50 *              Alan Cox        :       Error reporting fixes
  51 *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52 *              Alan Cox        :       inet sockets don't set sk->type!
  53 *              Alan Cox        :       Split socket option code
  54 *              Alan Cox        :       Callbacks
  55 *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56 *              Alex            :       Removed restriction on inet fioctl
  57 *              Alan Cox        :       Splitting INET from NET core
  58 *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59 *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60 *              Alan Cox        :       Split IP from generic code
  61 *              Alan Cox        :       New kfree_skbmem()
  62 *              Alan Cox        :       Make SO_DEBUG superuser only.
  63 *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64 *                                      (compatibility fix)
  65 *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66 *              Alan Cox        :       Allocator for a socket is settable.
  67 *              Alan Cox        :       SO_ERROR includes soft errors.
  68 *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69 *              Alan Cox        :       Generic socket allocation to make hooks
  70 *                                      easier (suggested by Craig Metz).
  71 *              Michael Pall    :       SO_ERROR returns positive errno again
  72 *              Steve Whitehouse:       Added default destructor to free
  73 *                                      protocol private data.
  74 *              Steve Whitehouse:       Added various other default routines
  75 *                                      common to several socket families.
  76 *              Chris Evans     :       Call suser() check last on F_SETOWN
  77 *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78 *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79 *              Andi Kleen      :       Fix write_space callback
  80 *              Chris Evans     :       Security fixes - signedness again
  81 *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82 *
  83 * To Fix:
  84 *
  85 *
  86 *              This program is free software; you can redistribute it and/or
  87 *              modify it under the terms of the GNU General Public License
  88 *              as published by the Free Software Foundation; either version
  89 *              2 of the License, or (at your option) any later version.
  90 */
  91
  92#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94#include <linux/capability.h>
  95#include <linux/errno.h>
  96#include <linux/types.h>
  97#include <linux/socket.h>
  98#include <linux/in.h>
  99#include <linux/kernel.h>
 100#include <linux/module.h>
 101#include <linux/proc_fs.h>
 102#include <linux/seq_file.h>
 103#include <linux/sched.h>
 104#include <linux/timer.h>
 105#include <linux/string.h>
 106#include <linux/sockios.h>
 107#include <linux/net.h>
 108#include <linux/mm.h>
 109#include <linux/slab.h>
 110#include <linux/interrupt.h>
 111#include <linux/poll.h>
 112#include <linux/tcp.h>
 113#include <linux/init.h>
 114#include <linux/highmem.h>
 115#include <linux/user_namespace.h>
 116#include <linux/static_key.h>
 117#include <linux/memcontrol.h>
 118#include <linux/prefetch.h>
 119
 120#include <asm/uaccess.h>
 121
 122#include <linux/netdevice.h>
 123#include <net/protocol.h>
 124#include <linux/skbuff.h>
 125#include <net/net_namespace.h>
 126#include <net/request_sock.h>
 127#include <net/sock.h>
 128#include <linux/net_tstamp.h>
 129#include <net/xfrm.h>
 130#include <linux/ipsec.h>
 131#include <net/cls_cgroup.h>
 132#include <net/netprio_cgroup.h>
 133
 134#include <linux/filter.h>
 135
 136#include <trace/events/sock.h>
 137
 138#ifdef CONFIG_INET
 139#include <net/tcp.h>
 140#endif
 141
 142static DEFINE_MUTEX(proto_list_mutex);
 143static LIST_HEAD(proto_list);
 144
 145#ifdef CONFIG_MEMCG_KMEM
 146int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 147{
 148        struct proto *proto;
 149        int ret = 0;
 150
 151        mutex_lock(&proto_list_mutex);
 152        list_for_each_entry(proto, &proto_list, node) {
 153                if (proto->init_cgroup) {
 154                        ret = proto->init_cgroup(memcg, ss);
 155                        if (ret)
 156                                goto out;
 157                }
 158        }
 159
 160        mutex_unlock(&proto_list_mutex);
 161        return ret;
 162out:
 163        list_for_each_entry_continue_reverse(proto, &proto_list, node)
 164                if (proto->destroy_cgroup)
 165                        proto->destroy_cgroup(memcg);
 166        mutex_unlock(&proto_list_mutex);
 167        return ret;
 168}
 169
 170void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
 171{
 172        struct proto *proto;
 173
 174        mutex_lock(&proto_list_mutex);
 175        list_for_each_entry_reverse(proto, &proto_list, node)
 176                if (proto->destroy_cgroup)
 177                        proto->destroy_cgroup(memcg);
 178        mutex_unlock(&proto_list_mutex);
 179}
 180#endif
 181
 182/*
 183 * Each address family might have different locking rules, so we have
 184 * one slock key per address family:
 185 */
 186static struct lock_class_key af_family_keys[AF_MAX];
 187static struct lock_class_key af_family_slock_keys[AF_MAX];
 188
 189struct static_key memcg_socket_limit_enabled;
 190EXPORT_SYMBOL(memcg_socket_limit_enabled);
 191
 192/*
 193 * Make lock validator output more readable. (we pre-construct these
 194 * strings build-time, so that runtime initialization of socket
 195 * locks is fast):
 196 */
 197static const char *const af_family_key_strings[AF_MAX+1] = {
 198  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
 199  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
 200  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
 201  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
 202  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
 203  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
 204  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
 205  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
 206  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
 207  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
 208  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
 209  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
 210  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
 211  "sk_lock-AF_NFC"   , "sk_lock-AF_MAX"
 212};
 213static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 214  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
 215  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
 216  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
 217  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
 218  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
 219  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
 220  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
 221  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
 222  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
 223  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
 224  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
 225  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
 226  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
 227  "slock-AF_NFC"   , "slock-AF_MAX"
 228};
 229static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 230  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
 231  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
 232  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
 233  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
 234  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
 235  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
 236  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
 237  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
 238  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
 239  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
 240  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
 241  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
 242  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
 243  "clock-AF_NFC"   , "clock-AF_MAX"
 244};
 245
 246/*
 247 * sk_callback_lock locking rules are per-address-family,
 248 * so split the lock classes by using a per-AF key:
 249 */
 250static struct lock_class_key af_callback_keys[AF_MAX];
 251
 252/* Take into consideration the size of the struct sk_buff overhead in the
 253 * determination of these values, since that is non-constant across
 254 * platforms.  This makes socket queueing behavior and performance
 255 * not depend upon such differences.
 256 */
 257#define _SK_MEM_PACKETS         256
 258#define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
 259#define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 260#define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
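/*
 * Rough arithmetic behind the defaults above (a sketch only; exact numbers
 * depend on architecture and kernel configuration): SKB_TRUESIZE(256) is
 * 256 bytes of payload plus the aligned sizes of struct sk_buff and
 * struct skb_shared_info, roughly 800-900 bytes on a typical 64-bit build,
 * so SK_WMEM_MAX and SK_RMEM_MAX come out at about 256 * 850, i.e. on the
 * order of 210 KB of true memory per direction.
 */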
 261
 262/* Run time adjustable parameters. */
 263__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 264EXPORT_SYMBOL(sysctl_wmem_max);
 265__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 266EXPORT_SYMBOL(sysctl_rmem_max);
 267__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 268__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 269
 270/* Maximal space eaten by iovec or ancillary data plus some space */
 271int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 272EXPORT_SYMBOL(sysctl_optmem_max);
 273
 274struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
 275EXPORT_SYMBOL_GPL(memalloc_socks);
 276
 277/**
 278 * sk_set_memalloc - sets %SOCK_MEMALLOC
 279 * @sk: socket to set it on
 280 *
 281 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 282 * It's the responsibility of the admin to adjust min_free_kbytes
 283 * to meet the requirements
 284 */
 285void sk_set_memalloc(struct sock *sk)
 286{
 287        sock_set_flag(sk, SOCK_MEMALLOC);
 288        sk->sk_allocation |= __GFP_MEMALLOC;
 289        static_key_slow_inc(&memalloc_socks);
 290}
 291EXPORT_SYMBOL_GPL(sk_set_memalloc);
 292
 293void sk_clear_memalloc(struct sock *sk)
 294{
 295        sock_reset_flag(sk, SOCK_MEMALLOC);
 296        sk->sk_allocation &= ~__GFP_MEMALLOC;
 297        static_key_slow_dec(&memalloc_socks);
 298
 299        /*
 300         * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 301         * progress of swapping. However, if SOCK_MEMALLOC is cleared while
 302         * it has rmem allocations there is a risk that the user of the
 303         * socket cannot make forward progress due to exceeding the rmem
 304         * limits. By rights, sk_clear_memalloc() should only be called
 305         * on sockets being torn down but warn and reset the accounting if
 306         * that assumption breaks.
 307         */
 308        if (WARN_ON(sk->sk_forward_alloc))
 309                sk_mem_reclaim(sk);
 310}
 311EXPORT_SYMBOL_GPL(sk_clear_memalloc);
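/*
 * Illustrative caller sketch (not part of this file): a kernel user whose
 * socket services memory-reclaim traffic, e.g. a hypothetical network block
 * device used for swap, would bracket the socket's lifetime with these
 * helpers:
 *
 *	sk_set_memalloc(swap_sock->sk);		// swap_sock is hypothetical
 *	...					// socket may now dip into reserves
 *	sk_clear_memalloc(swap_sock->sk);	// before tearing the socket down
 */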
 312
 313int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 314{
 315        int ret;
 316        unsigned long pflags = current->flags;
 317
 318        /* these should have been dropped before queueing */
 319        BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 320
 321        current->flags |= PF_MEMALLOC;
 322        ret = sk->sk_backlog_rcv(sk, skb);
 323        tsk_restore_flags(current, pflags, PF_MEMALLOC);
 324
 325        return ret;
 326}
 327EXPORT_SYMBOL(__sk_backlog_rcv);
 328
 329static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 330{
 331        struct timeval tv;
 332
 333        if (optlen < sizeof(tv))
 334                return -EINVAL;
 335        if (copy_from_user(&tv, optval, sizeof(tv)))
 336                return -EFAULT;
 337        if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 338                return -EDOM;
 339
 340        if (tv.tv_sec < 0) {
 341                static int warned __read_mostly;
 342
 343                *timeo_p = 0;
 344                if (warned < 10 && net_ratelimit()) {
 345                        warned++;
 346                        pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 347                                __func__, current->comm, task_pid_nr(current));
 348                }
 349                return 0;
 350        }
 351        *timeo_p = MAX_SCHEDULE_TIMEOUT;
 352        if (tv.tv_sec == 0 && tv.tv_usec == 0)
 353                return 0;
 354        if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 355                *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
 356        return 0;
 357}
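/*
 * Worked example for the conversion above, assuming HZ == 100: a timeval of
 * { .tv_sec = 2, .tv_usec = 500000 } becomes
 * 2 * 100 + (500000 + 9999) / 10000 = 250 jiffies, while { 0, 0 } means
 * "no timeout" and leaves *timeo_p at MAX_SCHEDULE_TIMEOUT.
 */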
 358
 359static void sock_warn_obsolete_bsdism(const char *name)
 360{
 361        static int warned;
 362        static char warncomm[TASK_COMM_LEN];
 363        if (strcmp(warncomm, current->comm) && warned < 5) {
 364                strcpy(warncomm,  current->comm);
 365                pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 366                        warncomm, name);
 367                warned++;
 368        }
 369}
 370
 371#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
 372
 373static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 374{
 375        if (sk->sk_flags & flags) {
 376                sk->sk_flags &= ~flags;
 377                if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 378                        net_disable_timestamp();
 379        }
 380}
 381
 382
 383int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 384{
 385        int err;
 386        int skb_len;
 387        unsigned long flags;
 388        struct sk_buff_head *list = &sk->sk_receive_queue;
 389
 390        if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 391                atomic_inc(&sk->sk_drops);
 392                trace_sock_rcvqueue_full(sk, skb);
 393                return -ENOMEM;
 394        }
 395
 396        err = sk_filter(sk, skb);
 397        if (err)
 398                return err;
 399
 400        if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 401                atomic_inc(&sk->sk_drops);
 402                return -ENOBUFS;
 403        }
 404
 405        skb->dev = NULL;
 406        skb_set_owner_r(skb, sk);
 407
 408        /* Cache the SKB length before we tack it onto the receive
 409         * queue.  Once it is added it no longer belongs to us and
 410         * may be freed by other threads of control pulling packets
 411         * from the queue.
 412         */
 413        skb_len = skb->len;
 414
  415        /* We escape from the RCU-protected region here; make sure we
  416         * don't leak a non-refcounted dst.
 417         */
 418        skb_dst_force(skb);
 419
 420        spin_lock_irqsave(&list->lock, flags);
 421        skb->dropcount = atomic_read(&sk->sk_drops);
 422        __skb_queue_tail(list, skb);
 423        spin_unlock_irqrestore(&list->lock, flags);
 424
 425        if (!sock_flag(sk, SOCK_DEAD))
 426                sk->sk_data_ready(sk, skb_len);
 427        return 0;
 428}
 429EXPORT_SYMBOL(sock_queue_rcv_skb);
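/*
 * Typical caller pattern (a sketch, modeled on raw-socket style receive
 * paths): on failure the skb still belongs to the caller and must be freed
 * there.
 *
 *	if (sock_queue_rcv_skb(sk, skb) < 0) {
 *		kfree_skb(skb);
 *		return NET_RX_DROP;
 *	}
 *	return NET_RX_SUCCESS;
 */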
 430
 431int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
 432{
 433        int rc = NET_RX_SUCCESS;
 434
 435        if (sk_filter(sk, skb))
 436                goto discard_and_relse;
 437
 438        skb->dev = NULL;
 439
 440        if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
 441                atomic_inc(&sk->sk_drops);
 442                goto discard_and_relse;
 443        }
 444        if (nested)
 445                bh_lock_sock_nested(sk);
 446        else
 447                bh_lock_sock(sk);
 448        if (!sock_owned_by_user(sk)) {
 449                /*
 450                 * trylock + unlock semantics:
 451                 */
 452                mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 453
 454                rc = sk_backlog_rcv(sk, skb);
 455
 456                mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 457        } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 458                bh_unlock_sock(sk);
 459                atomic_inc(&sk->sk_drops);
 460                goto discard_and_relse;
 461        }
 462
 463        bh_unlock_sock(sk);
 464out:
 465        sock_put(sk);
 466        return rc;
 467discard_and_relse:
 468        kfree_skb(skb);
 469        goto out;
 470}
 471EXPORT_SYMBOL(sk_receive_skb);
 472
 473void sk_reset_txq(struct sock *sk)
 474{
 475        sk_tx_queue_clear(sk);
 476}
 477EXPORT_SYMBOL(sk_reset_txq);
 478
 479struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 480{
 481        struct dst_entry *dst = __sk_dst_get(sk);
 482
 483        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 484                sk_tx_queue_clear(sk);
 485                RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 486                dst_release(dst);
 487                return NULL;
 488        }
 489
 490        return dst;
 491}
 492EXPORT_SYMBOL(__sk_dst_check);
 493
 494struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 495{
 496        struct dst_entry *dst = sk_dst_get(sk);
 497
 498        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 499                sk_dst_reset(sk);
 500                dst_release(dst);
 501                return NULL;
 502        }
 503
 504        return dst;
 505}
 506EXPORT_SYMBOL(sk_dst_check);
 507
 508static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
 509{
 510        int ret = -ENOPROTOOPT;
 511#ifdef CONFIG_NETDEVICES
 512        struct net *net = sock_net(sk);
 513        char devname[IFNAMSIZ];
 514        int index;
 515
 516        /* Sorry... */
 517        ret = -EPERM;
 518        if (!capable(CAP_NET_RAW))
 519                goto out;
 520
 521        ret = -EINVAL;
 522        if (optlen < 0)
 523                goto out;
 524
 525        /* Bind this socket to a particular device like "eth0",
 526         * as specified in the passed interface name. If the
 527         * name is "" or the option length is zero the socket
 528         * is not bound.
 529         */
 530        if (optlen > IFNAMSIZ - 1)
 531                optlen = IFNAMSIZ - 1;
 532        memset(devname, 0, sizeof(devname));
 533
 534        ret = -EFAULT;
 535        if (copy_from_user(devname, optval, optlen))
 536                goto out;
 537
 538        index = 0;
 539        if (devname[0] != '\0') {
 540                struct net_device *dev;
 541
 542                rcu_read_lock();
 543                dev = dev_get_by_name_rcu(net, devname);
 544                if (dev)
 545                        index = dev->ifindex;
 546                rcu_read_unlock();
 547                ret = -ENODEV;
 548                if (!dev)
 549                        goto out;
 550        }
 551
 552        lock_sock(sk);
 553        sk->sk_bound_dev_if = index;
 554        sk_dst_reset(sk);
 555        release_sock(sk);
 556
 557        ret = 0;
 558
 559out:
 560#endif
 561
 562        return ret;
 563}
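/*
 * Illustrative userspace usage (a sketch): bind a socket to "eth0" and later
 * clear the binding again by passing an empty name; both calls need
 * CAP_NET_RAW, as checked above.
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0"));
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "", 0);
 */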
 564
 565static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 566{
 567        if (valbool)
 568                sock_set_flag(sk, bit);
 569        else
 570                sock_reset_flag(sk, bit);
 571}
 572
 573/*
 574 *      This is meant for all protocols to use and covers goings on
 575 *      at the socket level. Everything here is generic.
 576 */
 577
 578int sock_setsockopt(struct socket *sock, int level, int optname,
 579                    char __user *optval, unsigned int optlen)
 580{
 581        struct sock *sk = sock->sk;
 582        int val;
 583        int valbool;
 584        struct linger ling;
 585        int ret = 0;
 586
 587        /*
 588         *      Options without arguments
 589         */
 590
 591        if (optname == SO_BINDTODEVICE)
 592                return sock_bindtodevice(sk, optval, optlen);
 593
 594        if (optlen < sizeof(int))
 595                return -EINVAL;
 596
 597        if (get_user(val, (int __user *)optval))
 598                return -EFAULT;
 599
 600        valbool = val ? 1 : 0;
 601
 602        lock_sock(sk);
 603
 604        switch (optname) {
 605        case SO_DEBUG:
 606                if (val && !capable(CAP_NET_ADMIN))
 607                        ret = -EACCES;
 608                else
 609                        sock_valbool_flag(sk, SOCK_DBG, valbool);
 610                break;
 611        case SO_REUSEADDR:
 612                sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 613                break;
 614        case SO_TYPE:
 615        case SO_PROTOCOL:
 616        case SO_DOMAIN:
 617        case SO_ERROR:
 618                ret = -ENOPROTOOPT;
 619                break;
 620        case SO_DONTROUTE:
 621                sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 622                break;
 623        case SO_BROADCAST:
 624                sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 625                break;
 626        case SO_SNDBUF:
  627                /* Don't return an error on this; BSD doesn't, and if
  628                 * you think about it, this is right. Otherwise apps have
  629                 * to play 'guess the biggest size' games. RCVBUF/SNDBUF
  630                 * are treated in BSD as hints.
  631                 */
 632                val = min_t(u32, val, sysctl_wmem_max);
 633set_sndbuf:
 634                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 635                sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
 636                /* Wake up sending tasks if we upped the value. */
 637                sk->sk_write_space(sk);
 638                break;
 639
 640        case SO_SNDBUFFORCE:
 641                if (!capable(CAP_NET_ADMIN)) {
 642                        ret = -EPERM;
 643                        break;
 644                }
 645                goto set_sndbuf;
 646
 647        case SO_RCVBUF:
  648                /* Don't return an error on this; BSD doesn't, and if
  649                 * you think about it, this is right. Otherwise apps have
  650                 * to play 'guess the biggest size' games. RCVBUF/SNDBUF
  651                 * are treated in BSD as hints.
  652                 */
 653                val = min_t(u32, val, sysctl_rmem_max);
 654set_rcvbuf:
 655                sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 656                /*
 657                 * We double it on the way in to account for
 658                 * "struct sk_buff" etc. overhead.   Applications
 659                 * assume that the SO_RCVBUF setting they make will
 660                 * allow that much actual data to be received on that
 661                 * socket.
 662                 *
 663                 * Applications are unaware that "struct sk_buff" and
 664                 * other overheads allocate from the receive buffer
 665                 * during socket buffer allocation.
 666                 *
 667                 * And after considering the possible alternatives,
 668                 * returning the value we actually used in getsockopt
 669                 * is the most desirable behavior.
 670                 */
 671                sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
 672                break;
 673
 674        case SO_RCVBUFFORCE:
 675                if (!capable(CAP_NET_ADMIN)) {
 676                        ret = -EPERM;
 677                        break;
 678                }
 679                goto set_rcvbuf;
 680
 681        case SO_KEEPALIVE:
 682#ifdef CONFIG_INET
 683                if (sk->sk_protocol == IPPROTO_TCP &&
 684                    sk->sk_type == SOCK_STREAM)
 685                        tcp_set_keepalive(sk, valbool);
 686#endif
 687                sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 688                break;
 689
 690        case SO_OOBINLINE:
 691                sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 692                break;
 693
 694        case SO_NO_CHECK:
 695                sk->sk_no_check = valbool;
 696                break;
 697
 698        case SO_PRIORITY:
 699                if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
 700                        sk->sk_priority = val;
 701                else
 702                        ret = -EPERM;
 703                break;
 704
 705        case SO_LINGER:
 706                if (optlen < sizeof(ling)) {
 707                        ret = -EINVAL;  /* 1003.1g */
 708                        break;
 709                }
 710                if (copy_from_user(&ling, optval, sizeof(ling))) {
 711                        ret = -EFAULT;
 712                        break;
 713                }
 714                if (!ling.l_onoff)
 715                        sock_reset_flag(sk, SOCK_LINGER);
 716                else {
 717#if (BITS_PER_LONG == 32)
 718                        if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 719                                sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 720                        else
 721#endif
 722                                sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 723                        sock_set_flag(sk, SOCK_LINGER);
 724                }
 725                break;
 726
 727        case SO_BSDCOMPAT:
 728                sock_warn_obsolete_bsdism("setsockopt");
 729                break;
 730
 731        case SO_PASSCRED:
 732                if (valbool)
 733                        set_bit(SOCK_PASSCRED, &sock->flags);
 734                else
 735                        clear_bit(SOCK_PASSCRED, &sock->flags);
 736                break;
 737
 738        case SO_TIMESTAMP:
 739        case SO_TIMESTAMPNS:
 740                if (valbool)  {
 741                        if (optname == SO_TIMESTAMP)
 742                                sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 743                        else
 744                                sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 745                        sock_set_flag(sk, SOCK_RCVTSTAMP);
 746                        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 747                } else {
 748                        sock_reset_flag(sk, SOCK_RCVTSTAMP);
 749                        sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 750                }
 751                break;
 752
 753        case SO_TIMESTAMPING:
 754                if (val & ~SOF_TIMESTAMPING_MASK) {
 755                        ret = -EINVAL;
 756                        break;
 757                }
 758                sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
 759                                  val & SOF_TIMESTAMPING_TX_HARDWARE);
 760                sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
 761                                  val & SOF_TIMESTAMPING_TX_SOFTWARE);
 762                sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
 763                                  val & SOF_TIMESTAMPING_RX_HARDWARE);
 764                if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 765                        sock_enable_timestamp(sk,
 766                                              SOCK_TIMESTAMPING_RX_SOFTWARE);
 767                else
 768                        sock_disable_timestamp(sk,
 769                                               (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 770                sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
 771                                  val & SOF_TIMESTAMPING_SOFTWARE);
 772                sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
 773                                  val & SOF_TIMESTAMPING_SYS_HARDWARE);
 774                sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
 775                                  val & SOF_TIMESTAMPING_RAW_HARDWARE);
 776                break;
 777
 778        case SO_RCVLOWAT:
 779                if (val < 0)
 780                        val = INT_MAX;
 781                sk->sk_rcvlowat = val ? : 1;
 782                break;
 783
 784        case SO_RCVTIMEO:
 785                ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 786                break;
 787
 788        case SO_SNDTIMEO:
 789                ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 790                break;
 791
 792        case SO_ATTACH_FILTER:
 793                ret = -EINVAL;
 794                if (optlen == sizeof(struct sock_fprog)) {
 795                        struct sock_fprog fprog;
 796
 797                        ret = -EFAULT;
 798                        if (copy_from_user(&fprog, optval, sizeof(fprog)))
 799                                break;
 800
 801                        ret = sk_attach_filter(&fprog, sk);
 802                }
 803                break;
 804
 805        case SO_DETACH_FILTER:
 806                ret = sk_detach_filter(sk);
 807                break;
 808
 809        case SO_PASSSEC:
 810                if (valbool)
 811                        set_bit(SOCK_PASSSEC, &sock->flags);
 812                else
 813                        clear_bit(SOCK_PASSSEC, &sock->flags);
 814                break;
 815        case SO_MARK:
 816                if (!capable(CAP_NET_ADMIN))
 817                        ret = -EPERM;
 818                else
 819                        sk->sk_mark = val;
 820                break;
 821
  822                /* We implement SO_SNDLOWAT and friends
  823                   so that they are not settable (1003.1g 5.3). */
 824        case SO_RXQ_OVFL:
 825                sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
 826                break;
 827
 828        case SO_WIFI_STATUS:
 829                sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
 830                break;
 831
 832        case SO_PEEK_OFF:
 833                if (sock->ops->set_peek_off)
 834                        sock->ops->set_peek_off(sk, val);
 835                else
 836                        ret = -EOPNOTSUPP;
 837                break;
 838
 839        case SO_NOFCS:
 840                sock_valbool_flag(sk, SOCK_NOFCS, valbool);
 841                break;
 842
 843        default:
 844                ret = -ENOPROTOOPT;
 845                break;
 846        }
 847        release_sock(sk);
 848        return ret;
 849}
 850EXPORT_SYMBOL(sock_setsockopt);
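/*
 * Illustrative userspace view of the SO_RCVBUF doubling described above
 * (a sketch; the stored value is also clamped by sysctl_rmem_max and
 * SOCK_MIN_RCVBUF):
 *
 *	int val = 65536;
 *	socklen_t len = sizeof(val);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, &len);
 *	// val typically reads back as 131072: twice the requested size,
 *	// to cover struct sk_buff and other per-packet bookkeeping overhead.
 */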
 851
 852
 853void cred_to_ucred(struct pid *pid, const struct cred *cred,
 854                   struct ucred *ucred)
 855{
 856        ucred->pid = pid_vnr(pid);
 857        ucred->uid = ucred->gid = -1;
 858        if (cred) {
 859                struct user_namespace *current_ns = current_user_ns();
 860
 861                ucred->uid = from_kuid_munged(current_ns, cred->euid);
 862                ucred->gid = from_kgid_munged(current_ns, cred->egid);
 863        }
 864}
 865EXPORT_SYMBOL_GPL(cred_to_ucred);
 866
 867int sock_getsockopt(struct socket *sock, int level, int optname,
 868                    char __user *optval, int __user *optlen)
 869{
 870        struct sock *sk = sock->sk;
 871
 872        union {
 873                int val;
 874                struct linger ling;
 875                struct timeval tm;
 876        } v;
 877
 878        int lv = sizeof(int);
 879        int len;
 880
 881        if (get_user(len, optlen))
 882                return -EFAULT;
 883        if (len < 0)
 884                return -EINVAL;
 885
 886        memset(&v, 0, sizeof(v));
 887
 888        switch (optname) {
 889        case SO_DEBUG:
 890                v.val = sock_flag(sk, SOCK_DBG);
 891                break;
 892
 893        case SO_DONTROUTE:
 894                v.val = sock_flag(sk, SOCK_LOCALROUTE);
 895                break;
 896
 897        case SO_BROADCAST:
 898                v.val = sock_flag(sk, SOCK_BROADCAST);
 899                break;
 900
 901        case SO_SNDBUF:
 902                v.val = sk->sk_sndbuf;
 903                break;
 904
 905        case SO_RCVBUF:
 906                v.val = sk->sk_rcvbuf;
 907                break;
 908
 909        case SO_REUSEADDR:
 910                v.val = sk->sk_reuse;
 911                break;
 912
 913        case SO_KEEPALIVE:
 914                v.val = sock_flag(sk, SOCK_KEEPOPEN);
 915                break;
 916
 917        case SO_TYPE:
 918                v.val = sk->sk_type;
 919                break;
 920
 921        case SO_PROTOCOL:
 922                v.val = sk->sk_protocol;
 923                break;
 924
 925        case SO_DOMAIN:
 926                v.val = sk->sk_family;
 927                break;
 928
 929        case SO_ERROR:
 930                v.val = -sock_error(sk);
 931                if (v.val == 0)
 932                        v.val = xchg(&sk->sk_err_soft, 0);
 933                break;
 934
 935        case SO_OOBINLINE:
 936                v.val = sock_flag(sk, SOCK_URGINLINE);
 937                break;
 938
 939        case SO_NO_CHECK:
 940                v.val = sk->sk_no_check;
 941                break;
 942
 943        case SO_PRIORITY:
 944                v.val = sk->sk_priority;
 945                break;
 946
 947        case SO_LINGER:
 948                lv              = sizeof(v.ling);
 949                v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
 950                v.ling.l_linger = sk->sk_lingertime / HZ;
 951                break;
 952
 953        case SO_BSDCOMPAT:
 954                sock_warn_obsolete_bsdism("getsockopt");
 955                break;
 956
 957        case SO_TIMESTAMP:
 958                v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
 959                                !sock_flag(sk, SOCK_RCVTSTAMPNS);
 960                break;
 961
 962        case SO_TIMESTAMPNS:
 963                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
 964                break;
 965
 966        case SO_TIMESTAMPING:
 967                v.val = 0;
 968                if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
 969                        v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
 970                if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
 971                        v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
 972                if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
 973                        v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
 974                if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
 975                        v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
 976                if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
 977                        v.val |= SOF_TIMESTAMPING_SOFTWARE;
 978                if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
 979                        v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
 980                if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
 981                        v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
 982                break;
 983
 984        case SO_RCVTIMEO:
 985                lv = sizeof(struct timeval);
 986                if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
 987                        v.tm.tv_sec = 0;
 988                        v.tm.tv_usec = 0;
 989                } else {
 990                        v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
 991                        v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
 992                }
 993                break;
 994
 995        case SO_SNDTIMEO:
 996                lv = sizeof(struct timeval);
 997                if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
 998                        v.tm.tv_sec = 0;
 999                        v.tm.tv_usec = 0;
1000                } else {
1001                        v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1002                        v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1003                }
1004                break;
1005
1006        case SO_RCVLOWAT:
1007                v.val = sk->sk_rcvlowat;
1008                break;
1009
1010        case SO_SNDLOWAT:
1011                v.val = 1;
1012                break;
1013
1014        case SO_PASSCRED:
1015                v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1016                break;
1017
1018        case SO_PEERCRED:
1019        {
1020                struct ucred peercred;
1021                if (len > sizeof(peercred))
1022                        len = sizeof(peercred);
1023                cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1024                if (copy_to_user(optval, &peercred, len))
1025                        return -EFAULT;
1026                goto lenout;
1027        }
1028
1029        case SO_PEERNAME:
1030        {
1031                char address[128];
1032
1033                if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1034                        return -ENOTCONN;
1035                if (lv < len)
1036                        return -EINVAL;
1037                if (copy_to_user(optval, address, len))
1038                        return -EFAULT;
1039                goto lenout;
1040        }
1041
1042        /* Dubious BSD thing... Probably nobody even uses it, but
1043         * the UNIX standard wants it for whatever reason... -DaveM
1044         */
1045        case SO_ACCEPTCONN:
1046                v.val = sk->sk_state == TCP_LISTEN;
1047                break;
1048
1049        case SO_PASSSEC:
1050                v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1051                break;
1052
1053        case SO_PEERSEC:
1054                return security_socket_getpeersec_stream(sock, optval, optlen, len);
1055
1056        case SO_MARK:
1057                v.val = sk->sk_mark;
1058                break;
1059
1060        case SO_RXQ_OVFL:
1061                v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1062                break;
1063
1064        case SO_WIFI_STATUS:
1065                v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1066                break;
1067
1068        case SO_PEEK_OFF:
1069                if (!sock->ops->set_peek_off)
1070                        return -EOPNOTSUPP;
1071
1072                v.val = sk->sk_peek_off;
1073                break;
1074        case SO_NOFCS:
1075                v.val = sock_flag(sk, SOCK_NOFCS);
1076                break;
1077        default:
1078                return -ENOPROTOOPT;
1079        }
1080
1081        if (len > lv)
1082                len = lv;
1083        if (copy_to_user(optval, &v, len))
1084                return -EFAULT;
1085lenout:
1086        if (put_user(len, optlen))
1087                return -EFAULT;
1088        return 0;
1089}
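/*
 * Illustrative sketch: the classic consumer of SO_ERROR above is a
 * non-blocking connect(); once the socket is reported writable, userspace
 * fetches (and thereby clears) the pending error:
 *
 *	int err;
 *	socklen_t len = sizeof(err);
 *
 *	getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
 *	// err == 0 on success, otherwise the errno of the failed connect;
 *	// soft errors (sk_err_soft) are reported and cleared here as well.
 */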
1090
1091/*
1092 * Initialize an sk_lock.
1093 *
1094 * (We also register the sk_lock with the lock validator.)
1095 */
1096static inline void sock_lock_init(struct sock *sk)
1097{
1098        sock_lock_init_class_and_name(sk,
1099                        af_family_slock_key_strings[sk->sk_family],
1100                        af_family_slock_keys + sk->sk_family,
1101                        af_family_key_strings[sk->sk_family],
1102                        af_family_keys + sk->sk_family);
1103}
1104
1105/*
1106 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 1107 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 1108 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1109 */
1110static void sock_copy(struct sock *nsk, const struct sock *osk)
1111{
1112#ifdef CONFIG_SECURITY_NETWORK
1113        void *sptr = nsk->sk_security;
1114#endif
1115        memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1116
1117        memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1118               osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1119
1120#ifdef CONFIG_SECURITY_NETWORK
1121        nsk->sk_security = sptr;
1122        security_sk_clone(osk, nsk);
1123#endif
1124}
1125
1126/*
 1127 * Caches using SLAB_DESTROY_BY_RCU should leave the .next pointer of nulls
 1128 * nodes unmodified. Special care is taken when initializing the object to zero.
1129 */
1130static inline void sk_prot_clear_nulls(struct sock *sk, int size)
1131{
1132        if (offsetof(struct sock, sk_node.next) != 0)
1133                memset(sk, 0, offsetof(struct sock, sk_node.next));
1134        memset(&sk->sk_node.pprev, 0,
1135               size - offsetof(struct sock, sk_node.pprev));
1136}
1137
1138void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1139{
1140        unsigned long nulls1, nulls2;
1141
1142        nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1143        nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1144        if (nulls1 > nulls2)
1145                swap(nulls1, nulls2);
1146
1147        if (nulls1 != 0)
1148                memset((char *)sk, 0, nulls1);
1149        memset((char *)sk + nulls1 + sizeof(void *), 0,
1150               nulls2 - nulls1 - sizeof(void *));
1151        memset((char *)sk + nulls2 + sizeof(void *), 0,
1152               size - nulls2 - sizeof(void *));
1153}
1154EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1155
1156static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1157                int family)
1158{
1159        struct sock *sk;
1160        struct kmem_cache *slab;
1161
1162        slab = prot->slab;
1163        if (slab != NULL) {
1164                sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1165                if (!sk)
1166                        return sk;
1167                if (priority & __GFP_ZERO) {
1168                        if (prot->clear_sk)
1169                                prot->clear_sk(sk, prot->obj_size);
1170                        else
1171                                sk_prot_clear_nulls(sk, prot->obj_size);
1172                }
1173        } else
1174                sk = kmalloc(prot->obj_size, priority);
1175
1176        if (sk != NULL) {
1177                kmemcheck_annotate_bitfield(sk, flags);
1178
1179                if (security_sk_alloc(sk, family, priority))
1180                        goto out_free;
1181
1182                if (!try_module_get(prot->owner))
1183                        goto out_free_sec;
1184                sk_tx_queue_clear(sk);
1185        }
1186
1187        return sk;
1188
1189out_free_sec:
1190        security_sk_free(sk);
1191out_free:
1192        if (slab != NULL)
1193                kmem_cache_free(slab, sk);
1194        else
1195                kfree(sk);
1196        return NULL;
1197}
1198
1199static void sk_prot_free(struct proto *prot, struct sock *sk)
1200{
1201        struct kmem_cache *slab;
1202        struct module *owner;
1203
1204        owner = prot->owner;
1205        slab = prot->slab;
1206
1207        security_sk_free(sk);
1208        if (slab != NULL)
1209                kmem_cache_free(slab, sk);
1210        else
1211                kfree(sk);
1212        module_put(owner);
1213}
1214
1215#ifdef CONFIG_CGROUPS
1216#if IS_ENABLED(CONFIG_NET_CLS_CGROUP)
1217void sock_update_classid(struct sock *sk)
1218{
1219        u32 classid;
1220
1221        rcu_read_lock();  /* doing current task, which cannot vanish. */
1222        classid = task_cls_classid(current);
1223        rcu_read_unlock();
1224        if (classid != sk->sk_classid)
1225                sk->sk_classid = classid;
1226}
1227EXPORT_SYMBOL(sock_update_classid);
1228#endif
1229
1230#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
1231void sock_update_netprioidx(struct sock *sk, struct task_struct *task)
1232{
1233        if (in_interrupt())
1234                return;
1235
1236        sk->sk_cgrp_prioidx = task_netprioidx(task);
1237}
1238EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1239#endif
1240#endif
1241
1242/**
1243 *      sk_alloc - All socket objects are allocated here
1244 *      @net: the applicable net namespace
1245 *      @family: protocol family
1246 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1247 *      @prot: struct proto associated with this new sock instance
1248 */
1249struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1250                      struct proto *prot)
1251{
1252        struct sock *sk;
1253
1254        sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1255        if (sk) {
1256                sk->sk_family = family;
1257                /*
1258                 * See comment in struct sock definition to understand
1259                 * why we need sk_prot_creator -acme
1260                 */
1261                sk->sk_prot = sk->sk_prot_creator = prot;
1262                sock_lock_init(sk);
1263                sock_net_set(sk, get_net(net));
1264                atomic_set(&sk->sk_wmem_alloc, 1);
1265
1266                sock_update_classid(sk);
1267                sock_update_netprioidx(sk, current);
1268        }
1269
1270        return sk;
1271}
1272EXPORT_SYMBOL(sk_alloc);
1273
1274static void __sk_free(struct sock *sk)
1275{
1276        struct sk_filter *filter;
1277
1278        if (sk->sk_destruct)
1279                sk->sk_destruct(sk);
1280
1281        filter = rcu_dereference_check(sk->sk_filter,
1282                                       atomic_read(&sk->sk_wmem_alloc) == 0);
1283        if (filter) {
1284                sk_filter_uncharge(sk, filter);
1285                RCU_INIT_POINTER(sk->sk_filter, NULL);
1286        }
1287
1288        sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1289
1290        if (atomic_read(&sk->sk_omem_alloc))
1291                pr_debug("%s: optmem leakage (%d bytes) detected\n",
1292                         __func__, atomic_read(&sk->sk_omem_alloc));
1293
1294        if (sk->sk_peer_cred)
1295                put_cred(sk->sk_peer_cred);
1296        put_pid(sk->sk_peer_pid);
1297        put_net(sock_net(sk));
1298        sk_prot_free(sk->sk_prot_creator, sk);
1299}
1300
1301void sk_free(struct sock *sk)
1302{
1303        /*
 1304         * We subtract one from sk_wmem_alloc; if the result is not zero,
 1305         * some packets are still in a tx queue and sock_wfree() will
 1306         * call __sk_free(sk) later, once they have been freed.
1307         */
1308        if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1309                __sk_free(sk);
1310}
1311EXPORT_SYMBOL(sk_free);
1312
1313/*
 1314 * The last sock_put should drop the reference to sk->sk_net. That reference
 1315 * has already been dropped in sk_change_net, and taking a reference to the
 1316 * stopping namespace is not an option.
 1317 * Instead, take a reference to the socket so it can be removed from the hash
 1318 * while still _alive_, and then destroy it in the context of init_net.
1319 */
1320void sk_release_kernel(struct sock *sk)
1321{
1322        if (sk == NULL || sk->sk_socket == NULL)
1323                return;
1324
1325        sock_hold(sk);
1326        sock_release(sk->sk_socket);
1327        release_net(sock_net(sk));
1328        sock_net_set(sk, get_net(&init_net));
1329        sock_put(sk);
1330}
1331EXPORT_SYMBOL(sk_release_kernel);
1332
1333static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1334{
1335        if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1336                sock_update_memcg(newsk);
1337}
1338
1339/**
1340 *      sk_clone_lock - clone a socket, and lock its clone
1341 *      @sk: the socket to clone
1342 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1343 *
1344 *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1345 */
1346struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1347{
1348        struct sock *newsk;
1349
1350        newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1351        if (newsk != NULL) {
1352                struct sk_filter *filter;
1353
1354                sock_copy(newsk, sk);
1355
1356                /* SANITY */
1357                get_net(sock_net(newsk));
1358                sk_node_init(&newsk->sk_node);
1359                sock_lock_init(newsk);
1360                bh_lock_sock(newsk);
1361                newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1362                newsk->sk_backlog.len = 0;
1363
1364                atomic_set(&newsk->sk_rmem_alloc, 0);
1365                /*
1366                 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1367                 */
1368                atomic_set(&newsk->sk_wmem_alloc, 1);
1369                atomic_set(&newsk->sk_omem_alloc, 0);
1370                skb_queue_head_init(&newsk->sk_receive_queue);
1371                skb_queue_head_init(&newsk->sk_write_queue);
1372#ifdef CONFIG_NET_DMA
1373                skb_queue_head_init(&newsk->sk_async_wait_queue);
1374#endif
1375
1376                spin_lock_init(&newsk->sk_dst_lock);
1377                rwlock_init(&newsk->sk_callback_lock);
1378                lockdep_set_class_and_name(&newsk->sk_callback_lock,
1379                                af_callback_keys + newsk->sk_family,
1380                                af_family_clock_key_strings[newsk->sk_family]);
1381
1382                newsk->sk_dst_cache     = NULL;
1383                newsk->sk_wmem_queued   = 0;
1384                newsk->sk_forward_alloc = 0;
1385                newsk->sk_send_head     = NULL;
1386                newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1387
1388                sock_reset_flag(newsk, SOCK_DONE);
1389                skb_queue_head_init(&newsk->sk_error_queue);
1390
1391                filter = rcu_dereference_protected(newsk->sk_filter, 1);
1392                if (filter != NULL)
1393                        sk_filter_charge(newsk, filter);
1394
1395                if (unlikely(xfrm_sk_clone_policy(newsk))) {
 1396                        /* It is still a raw copy of the parent, so
 1397                         * invalidate the destructor and do a plain sk_free() */
1398                        newsk->sk_destruct = NULL;
1399                        bh_unlock_sock(newsk);
1400                        sk_free(newsk);
1401                        newsk = NULL;
1402                        goto out;
1403                }
1404
1405                newsk->sk_err      = 0;
1406                newsk->sk_priority = 0;
1407                /*
1408                 * Before updating sk_refcnt, we must commit prior changes to memory
1409                 * (Documentation/RCU/rculist_nulls.txt for details)
1410                 */
1411                smp_wmb();
1412                atomic_set(&newsk->sk_refcnt, 2);
1413
1414                /*
1415                 * Increment the counter in the same struct proto as the master
1416                 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1417                 * is the same as sk->sk_prot->socks, as this field was copied
1418                 * with memcpy).
1419                 *
1420                 * This _changes_ the previous behaviour, where
1421                 * tcp_create_openreq_child always was incrementing the
 1422                 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
1423                 * to be taken into account in all callers. -acme
1424                 */
1425                sk_refcnt_debug_inc(newsk);
1426                sk_set_socket(newsk, NULL);
1427                newsk->sk_wq = NULL;
1428
1429                sk_update_clone(sk, newsk);
1430
1431                if (newsk->sk_prot->sockets_allocated)
1432                        sk_sockets_allocated_inc(newsk);
1433
1434                if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1435                        net_enable_timestamp();
1436        }
1437out:
1438        return newsk;
1439}
1440EXPORT_SYMBOL_GPL(sk_clone_lock);
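/*
 * Typical caller pattern (a sketch): protocol code clones an existing sock
 * for a new child and is responsible for dropping the bh lock itself, on
 * both the success and the error paths of its own follow-up setup:
 *
 *	newsk = sk_clone_lock(sk, GFP_ATOMIC);
 *	if (newsk) {
 *		// ... protocol-specific initialisation of newsk ...
 *		bh_unlock_sock(newsk);
 *	}
 */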
1441
1442void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1443{
1444        __sk_dst_set(sk, dst);
1445        sk->sk_route_caps = dst->dev->features;
1446        if (sk->sk_route_caps & NETIF_F_GSO)
1447                sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1448        sk->sk_route_caps &= ~sk->sk_route_nocaps;
1449        if (sk_can_gso(sk)) {
1450                if (dst->header_len) {
1451                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1452                } else {
1453                        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1454                        sk->sk_gso_max_size = dst->dev->gso_max_size;
1455                        sk->sk_gso_max_segs = dst->dev->gso_max_segs;
1456                }
1457        }
1458}
1459EXPORT_SYMBOL_GPL(sk_setup_caps);
1460
1461/*
1462 *      Simple resource managers for sockets.
1463 */
1464
1465
1466/*
1467 * Write buffer destructor automatically called from kfree_skb.
1468 */
1469void sock_wfree(struct sk_buff *skb)
1470{
1471        struct sock *sk = skb->sk;
1472        unsigned int len = skb->truesize;
1473
1474        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1475                /*
1476                 * Keep a reference on sk_wmem_alloc; it will be released
1477                 * after the sk_write_space() call.
1478                 */
1479                atomic_sub(len - 1, &sk->sk_wmem_alloc);
1480                sk->sk_write_space(sk);
1481                len = 1;
1482        }
1483        /*
1484         * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1485         * could not do because of in-flight packets
1486         */
1487        if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1488                __sk_free(sk);
1489}
1490EXPORT_SYMBOL(sock_wfree);
1491
1492/*
1493 * Read buffer destructor automatically called from kfree_skb.
1494 */
1495void sock_rfree(struct sk_buff *skb)
1496{
1497        struct sock *sk = skb->sk;
1498        unsigned int len = skb->truesize;
1499
1500        atomic_sub(len, &sk->sk_rmem_alloc);
1501        sk_mem_uncharge(sk, len);
1502}
1503EXPORT_SYMBOL(sock_rfree);
1504
1505void sock_edemux(struct sk_buff *skb)
1506{
1507        struct sock *sk = skb->sk;
1508
1509#ifdef CONFIG_INET
1510        if (sk->sk_state == TCP_TIME_WAIT)
1511                inet_twsk_put(inet_twsk(sk));
1512        else
1513#endif
1514                sock_put(sk);
1515}
1516EXPORT_SYMBOL(sock_edemux);
1517
1518kuid_t sock_i_uid(struct sock *sk)
1519{
1520        kuid_t uid;
1521
1522        read_lock_bh(&sk->sk_callback_lock);
1523        uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1524        read_unlock_bh(&sk->sk_callback_lock);
1525        return uid;
1526}
1527EXPORT_SYMBOL(sock_i_uid);
1528
1529unsigned long sock_i_ino(struct sock *sk)
1530{
1531        unsigned long ino;
1532
1533        read_lock_bh(&sk->sk_callback_lock);
1534        ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1535        read_unlock_bh(&sk->sk_callback_lock);
1536        return ino;
1537}
1538EXPORT_SYMBOL(sock_i_ino);
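
/*
 * Illustrative sketch, not part of sock.c: a /proc "show" handler can report
 * the owning user and inode of a socket with the two helpers above.
 * "example_show_sock" and its seq_file plumbing are hypothetical.
 */
#if 0	/* example only */
static void example_show_sock(struct seq_file *seq, struct sock *sk)
{
	seq_printf(seq, "uid %u ino %lu\n",
		   from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)),
		   sock_i_ino(sk));
}
#endif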
1539
1540/*
1541 * Allocate a skb from the socket's send buffer.
1542 */
1543struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1544                             gfp_t priority)
1545{
1546        if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1547                struct sk_buff *skb = alloc_skb(size, priority);
1548                if (skb) {
1549                        skb_set_owner_w(skb, sk);
1550                        return skb;
1551                }
1552        }
1553        return NULL;
1554}
1555EXPORT_SYMBOL(sock_wmalloc);
1556
1557/*
1558 * Allocate a skb from the socket's receive buffer.
1559 */
1560struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1561                             gfp_t priority)
1562{
1563        if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1564                struct sk_buff *skb = alloc_skb(size, priority);
1565                if (skb) {
1566                        skb_set_owner_r(skb, sk);
1567                        return skb;
1568                }
1569        }
1570        return NULL;
1571}
1572
1573/*
1574 * Allocate a memory block from the socket's option memory buffer.
1575 */
1576void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1577{
1578        if ((unsigned int)size <= sysctl_optmem_max &&
1579            atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1580                void *mem;
1581                /* Do the add first, to avoid a race in case kmalloc
1582                 * sleeps.
1583                 */
1584                atomic_add(size, &sk->sk_omem_alloc);
1585                mem = kmalloc(size, priority);
1586                if (mem)
1587                        return mem;
1588                atomic_sub(size, &sk->sk_omem_alloc);
1589        }
1590        return NULL;
1591}
1592EXPORT_SYMBOL(sock_kmalloc);
1593
1594/*
1595 * Free an option memory block.
1596 */
1597void sock_kfree_s(struct sock *sk, void *mem, int size)
1598{
1599        kfree(mem);
1600        atomic_sub(size, &sk->sk_omem_alloc);
1601}
1602EXPORT_SYMBOL(sock_kfree_s);
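
/*
 * Illustrative sketch, not part of sock.c: memory charged with sock_kmalloc()
 * must be released with sock_kfree_s() using the same size so that
 * sk_omem_alloc stays balanced.  "struct example_opt" is hypothetical.
 */
#if 0	/* example only */
static struct example_opt *example_opt_alloc(struct sock *sk)
{
	return sock_kmalloc(sk, sizeof(struct example_opt), GFP_KERNEL);
}

static void example_opt_free(struct sock *sk, struct example_opt *opt)
{
	sock_kfree_s(sk, opt, sizeof(*opt));
}
#endif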
1603
1604/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1605 * I think these locks should be removed for datagram sockets.
1606 */
1607static long sock_wait_for_wmem(struct sock *sk, long timeo)
1608{
1609        DEFINE_WAIT(wait);
1610
1611        clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1612        for (;;) {
1613                if (!timeo)
1614                        break;
1615                if (signal_pending(current))
1616                        break;
1617                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1618                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1619                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1620                        break;
1621                if (sk->sk_shutdown & SEND_SHUTDOWN)
1622                        break;
1623                if (sk->sk_err)
1624                        break;
1625                timeo = schedule_timeout(timeo);
1626        }
1627        finish_wait(sk_sleep(sk), &wait);
1628        return timeo;
1629}
1630
1631
1632/*
1633 *      Generic send/receive buffer handlers
1634 */
1635
1636struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1637                                     unsigned long data_len, int noblock,
1638                                     int *errcode)
1639{
1640        struct sk_buff *skb;
1641        gfp_t gfp_mask;
1642        long timeo;
1643        int err;
1644        int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1645
1646        err = -EMSGSIZE;
1647        if (npages > MAX_SKB_FRAGS)
1648                goto failure;
1649
1650        gfp_mask = sk->sk_allocation;
1651        if (gfp_mask & __GFP_WAIT)
1652                gfp_mask |= __GFP_REPEAT;
1653
1654        timeo = sock_sndtimeo(sk, noblock);
1655        while (1) {
1656                err = sock_error(sk);
1657                if (err != 0)
1658                        goto failure;
1659
1660                err = -EPIPE;
1661                if (sk->sk_shutdown & SEND_SHUTDOWN)
1662                        goto failure;
1663
1664                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1665                        skb = alloc_skb(header_len, gfp_mask);
1666                        if (skb) {
1667                                int i;
1668
1669                                /* No pages, we're done... */
1670                                if (!data_len)
1671                                        break;
1672
1673                                skb->truesize += data_len;
1674                                skb_shinfo(skb)->nr_frags = npages;
1675                                for (i = 0; i < npages; i++) {
1676                                        struct page *page;
1677
1678                                        page = alloc_pages(sk->sk_allocation, 0);
1679                                        if (!page) {
1680                                                err = -ENOBUFS;
1681                                                skb_shinfo(skb)->nr_frags = i;
1682                                                kfree_skb(skb);
1683                                                goto failure;
1684                                        }
1685
1686                                        __skb_fill_page_desc(skb, i,
1687                                                        page, 0,
1688                                                        (data_len >= PAGE_SIZE ?
1689                                                         PAGE_SIZE :
1690                                                         data_len));
1691                                        data_len -= PAGE_SIZE;
1692                                }
1693
1694                                /* Full success... */
1695                                break;
1696                        }
1697                        err = -ENOBUFS;
1698                        goto failure;
1699                }
1700                set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1701                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1702                err = -EAGAIN;
1703                if (!timeo)
1704                        goto failure;
1705                if (signal_pending(current))
1706                        goto interrupted;
1707                timeo = sock_wait_for_wmem(sk, timeo);
1708        }
1709
1710        skb_set_owner_w(skb, sk);
1711        return skb;
1712
1713interrupted:
1714        err = sock_intr_errno(timeo);
1715failure:
1716        *errcode = err;
1717        return NULL;
1718}
1719EXPORT_SYMBOL(sock_alloc_send_pskb);
1720
1721struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1722                                    int noblock, int *errcode)
1723{
1724        return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1725}
1726EXPORT_SYMBOL(sock_alloc_send_skb);
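
/*
 * Illustrative sketch, not part of sock.c: a datagram sendmsg() path usually
 * lets sock_alloc_send_skb() do the send-buffer accounting, blocking and
 * signal handling.  "example_alloc_tx" and hdr_len are hypothetical.
 */
#if 0	/* example only */
static struct sk_buff *example_alloc_tx(struct sock *sk, size_t len,
					int hdr_len, struct msghdr *msg,
					int *err)
{
	struct sk_buff *skb;

	skb = sock_alloc_send_skb(sk, hdr_len + len,
				  msg->msg_flags & MSG_DONTWAIT, err);
	if (!skb)
		return NULL;		/* *err is -EAGAIN, -EPIPE, -EINTR, ... */
	skb_reserve(skb, hdr_len);	/* room for protocol headers */
	return skb;			/* already charged to sk via skb_set_owner_w() */
}
#endif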
1727
1728/* On 32bit arches, an skb frag is limited to 2^15 bytes */
1729#define SKB_FRAG_PAGE_ORDER     get_order(32768)
1730
1731bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1732{
1733        int order;
1734
1735        if (pfrag->page) {
1736                if (atomic_read(&pfrag->page->_count) == 1) {
1737                        pfrag->offset = 0;
1738                        return true;
1739                }
1740                if (pfrag->offset < pfrag->size)
1741                        return true;
1742                put_page(pfrag->page);
1743        }
1744
1745        /* We restrict high order allocations to users that can afford to wait */
1746        order = (sk->sk_allocation & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
1747
1748        do {
1749                gfp_t gfp = sk->sk_allocation;
1750
1751                if (order)
1752                        gfp |= __GFP_COMP | __GFP_NOWARN;
1753                pfrag->page = alloc_pages(gfp, order);
1754                if (likely(pfrag->page)) {
1755                        pfrag->offset = 0;
1756                        pfrag->size = PAGE_SIZE << order;
1757                        return true;
1758                }
1759        } while (--order >= 0);
1760
1761        sk_enter_memory_pressure(sk);
1762        sk_stream_moderate_sndbuf(sk);
1763        return false;
1764}
1765EXPORT_SYMBOL(sk_page_frag_refill);
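
/*
 * Illustrative sketch, not part of sock.c: callers of sk_page_frag_refill()
 * consume the fragment by advancing pfrag->offset; real users copy via
 * checksum/kmap aware helpers, the plain memcpy() here is only to show the
 * pattern.  "example_append_to_frag" is hypothetical.
 */
#if 0	/* example only */
static int example_append_to_frag(struct sock *sk, const char *data, int len)
{
	struct page_frag *pfrag = sk_page_frag(sk);
	int copy;

	if (!sk_page_frag_refill(sk, pfrag))
		return -EAGAIN;			/* under memory pressure */

	copy = min_t(int, len, pfrag->size - pfrag->offset);
	memcpy(page_address(pfrag->page) + pfrag->offset, data, copy);
	pfrag->offset += copy;
	return copy;
}
#endif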
1766
1767static void __lock_sock(struct sock *sk)
1768        __releases(&sk->sk_lock.slock)
1769        __acquires(&sk->sk_lock.slock)
1770{
1771        DEFINE_WAIT(wait);
1772
1773        for (;;) {
1774                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1775                                        TASK_UNINTERRUPTIBLE);
1776                spin_unlock_bh(&sk->sk_lock.slock);
1777                schedule();
1778                spin_lock_bh(&sk->sk_lock.slock);
1779                if (!sock_owned_by_user(sk))
1780                        break;
1781        }
1782        finish_wait(&sk->sk_lock.wq, &wait);
1783}
1784
1785static void __release_sock(struct sock *sk)
1786        __releases(&sk->sk_lock.slock)
1787        __acquires(&sk->sk_lock.slock)
1788{
1789        struct sk_buff *skb = sk->sk_backlog.head;
1790
1791        do {
1792                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1793                bh_unlock_sock(sk);
1794
1795                do {
1796                        struct sk_buff *next = skb->next;
1797
1798                        prefetch(next);
1799                        WARN_ON_ONCE(skb_dst_is_noref(skb));
1800                        skb->next = NULL;
1801                        sk_backlog_rcv(sk, skb);
1802
1803                        /*
1804                         * We are in process context here with softirqs
1805                         * disabled, so use cond_resched_softirq() to preempt.
1806                         * This is safe to do because we have made the backlog
1807                         * queue private:
1808                         */
1809                        cond_resched_softirq();
1810
1811                        skb = next;
1812                } while (skb != NULL);
1813
1814                bh_lock_sock(sk);
1815        } while ((skb = sk->sk_backlog.head) != NULL);
1816
1817        /*
1818         * Doing the zeroing here guarantees we cannot loop forever
1819         * while a wild producer attempts to flood us.
1820         */
1821        sk->sk_backlog.len = 0;
1822}
1823
1824/**
1825 * sk_wait_data - wait for data to arrive at sk_receive_queue
1826 * @sk:    sock to wait on
1827 * @timeo: for how long
1828 *
1829 * Now socket state, including sk->sk_err, is changed only under the socket
1830 * lock, hence we may omit checks after joining the wait queue.
1831 * We check the receive queue before schedule() only as an optimization;
1832 * it is very likely that release_sock() added new data.
1833 */
1834int sk_wait_data(struct sock *sk, long *timeo)
1835{
1836        int rc;
1837        DEFINE_WAIT(wait);
1838
1839        prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1840        set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1841        rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1842        clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1843        finish_wait(sk_sleep(sk), &wait);
1844        return rc;
1845}
1846EXPORT_SYMBOL(sk_wait_data);
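
/*
 * Illustrative sketch, not part of sock.c: a typical recvmsg() implementation
 * calls sk_wait_data() with the socket locked, after handling the
 * zero-timeout and pending-signal cases itself.  "example_wait_for_data" is
 * hypothetical and error handling is abbreviated.
 */
#if 0	/* example only */
static int example_wait_for_data(struct sock *sk, int flags)
{
	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
	int err = 0;

	lock_sock(sk);
	while (skb_queue_empty(&sk->sk_receive_queue)) {
		err = -EAGAIN;
		if (!timeo)
			break;
		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			break;
		err = 0;
		sk_wait_data(sk, &timeo);
	}
	release_sock(sk);
	return err;
}
#endif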
1847
1848/**
1849 *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1850 *      @sk: socket
1851 *      @size: memory size to allocate
1852 *      @kind: allocation type
1853 *
1854 *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1855 *      rmem allocation. This function assumes that protocols which have
1856 *      memory_pressure use sk_wmem_queued as write buffer accounting.
1857 */
1858int __sk_mem_schedule(struct sock *sk, int size, int kind)
1859{
1860        struct proto *prot = sk->sk_prot;
1861        int amt = sk_mem_pages(size);
1862        long allocated;
1863        int parent_status = UNDER_LIMIT;
1864
1865        sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1866
1867        allocated = sk_memory_allocated_add(sk, amt, &parent_status);
1868
1869        /* Under limit. */
1870        if (parent_status == UNDER_LIMIT &&
1871                        allocated <= sk_prot_mem_limits(sk, 0)) {
1872                sk_leave_memory_pressure(sk);
1873                return 1;
1874        }
1875
1876        /* Under pressure. (we or our parents) */
1877        if ((parent_status > SOFT_LIMIT) ||
1878                        allocated > sk_prot_mem_limits(sk, 1))
1879                sk_enter_memory_pressure(sk);
1880
1881        /* Over hard limit (we or our parents) */
1882        if ((parent_status == OVER_LIMIT) ||
1883                        (allocated > sk_prot_mem_limits(sk, 2)))
1884                goto suppress_allocation;
1885
1886        /* guarantee minimum buffer size under pressure */
1887        if (kind == SK_MEM_RECV) {
1888                if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1889                        return 1;
1890
1891        } else { /* SK_MEM_SEND */
1892                if (sk->sk_type == SOCK_STREAM) {
1893                        if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1894                                return 1;
1895                } else if (atomic_read(&sk->sk_wmem_alloc) <
1896                           prot->sysctl_wmem[0])
1897                                return 1;
1898        }
1899
1900        if (sk_has_memory_pressure(sk)) {
1901                int alloc;
1902
1903                if (!sk_under_memory_pressure(sk))
1904                        return 1;
1905                alloc = sk_sockets_allocated_read_positive(sk);
1906                if (sk_prot_mem_limits(sk, 2) > alloc *
1907                    sk_mem_pages(sk->sk_wmem_queued +
1908                                 atomic_read(&sk->sk_rmem_alloc) +
1909                                 sk->sk_forward_alloc))
1910                        return 1;
1911        }
1912
1913suppress_allocation:
1914
1915        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1916                sk_stream_moderate_sndbuf(sk);
1917
1918                /* Fail only if socket is _under_ its sndbuf.
1919                 * In this case we cannot block, so we have to fail.
1920                 */
1921                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1922                        return 1;
1923        }
1924
1925        trace_sock_exceed_buf_limit(sk, prot, allocated);
1926
1927        /* Alas. Undo changes. */
1928        sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1929
1930        sk_memory_allocated_sub(sk, amt);
1931
1932        return 0;
1933}
1934EXPORT_SYMBOL(__sk_mem_schedule);
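
/*
 * Illustrative sketch, not part of sock.c: a protocol with memory accounting
 * schedules forward-alloc before queueing an skb; sock_rfree() and
 * sk_mem_uncharge() return the memory when the skb is freed.  Most callers
 * use the sk_rmem_schedule()/sk_wmem_schedule() wrappers instead of calling
 * __sk_mem_schedule() directly.  "example_queue_rcv" is hypothetical.
 */
#if 0	/* example only */
static int example_queue_rcv(struct sock *sk, struct sk_buff *skb)
{
	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
	    (unsigned int)sk->sk_rcvbuf)
		return -ENOMEM;
	if (!__sk_mem_schedule(sk, skb->truesize, SK_MEM_RECV))
		return -ENOBUFS;
	skb_set_owner_r(skb, sk);	/* charges sk_rmem_alloc, uses forward_alloc */
	skb_queue_tail(&sk->sk_receive_queue, skb);
	sk->sk_data_ready(sk, skb->len);
	return 0;
}
#endif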
1935
1936/**
1937 *      __sk_mem_reclaim - reclaim memory_allocated
1938 *      @sk: socket
1939 */
1940void __sk_mem_reclaim(struct sock *sk)
1941{
1942        sk_memory_allocated_sub(sk,
1943                                sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
1944        sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1945
1946        if (sk_under_memory_pressure(sk) &&
1947            (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
1948                sk_leave_memory_pressure(sk);
1949}
1950EXPORT_SYMBOL(__sk_mem_reclaim);
1951
1952
1953/*
1954 * Set of default routines for initialising struct proto_ops when
1955 * the protocol does not support a particular function. In certain
1956 * cases where it makes no sense for a protocol to have a "do nothing"
1957 * function, some default processing is provided.
1958 */
1959
1960int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1961{
1962        return -EOPNOTSUPP;
1963}
1964EXPORT_SYMBOL(sock_no_bind);
1965
1966int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1967                    int len, int flags)
1968{
1969        return -EOPNOTSUPP;
1970}
1971EXPORT_SYMBOL(sock_no_connect);
1972
1973int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1974{
1975        return -EOPNOTSUPP;
1976}
1977EXPORT_SYMBOL(sock_no_socketpair);
1978
1979int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1980{
1981        return -EOPNOTSUPP;
1982}
1983EXPORT_SYMBOL(sock_no_accept);
1984
1985int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1986                    int *len, int peer)
1987{
1988        return -EOPNOTSUPP;
1989}
1990EXPORT_SYMBOL(sock_no_getname);
1991
1992unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1993{
1994        return 0;
1995}
1996EXPORT_SYMBOL(sock_no_poll);
1997
1998int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1999{
2000        return -EOPNOTSUPP;
2001}
2002EXPORT_SYMBOL(sock_no_ioctl);
2003
2004int sock_no_listen(struct socket *sock, int backlog)
2005{
2006        return -EOPNOTSUPP;
2007}
2008EXPORT_SYMBOL(sock_no_listen);
2009
2010int sock_no_shutdown(struct socket *sock, int how)
2011{
2012        return -EOPNOTSUPP;
2013}
2014EXPORT_SYMBOL(sock_no_shutdown);
2015
2016int sock_no_setsockopt(struct socket *sock, int level, int optname,
2017                    char __user *optval, unsigned int optlen)
2018{
2019        return -EOPNOTSUPP;
2020}
2021EXPORT_SYMBOL(sock_no_setsockopt);
2022
2023int sock_no_getsockopt(struct socket *sock, int level, int optname,
2024                    char __user *optval, int __user *optlen)
2025{
2026        return -EOPNOTSUPP;
2027}
2028EXPORT_SYMBOL(sock_no_getsockopt);
2029
2030int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2031                    size_t len)
2032{
2033        return -EOPNOTSUPP;
2034}
2035EXPORT_SYMBOL(sock_no_sendmsg);
2036
2037int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2038                    size_t len, int flags)
2039{
2040        return -EOPNOTSUPP;
2041}
2042EXPORT_SYMBOL(sock_no_recvmsg);
2043
2044int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2045{
2046        /* Mirror missing mmap method error code */
2047        return -ENODEV;
2048}
2049EXPORT_SYMBOL(sock_no_mmap);
2050
2051ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2052{
2053        ssize_t res;
2054        struct msghdr msg = {.msg_flags = flags};
2055        struct kvec iov;
2056        char *kaddr = kmap(page);
2057        iov.iov_base = kaddr + offset;
2058        iov.iov_len = size;
2059        res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2060        kunmap(page);
2061        return res;
2062}
2063EXPORT_SYMBOL(sock_no_sendpage);
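
/*
 * Illustrative sketch, not part of sock.c: a protocol that does not support
 * some operations simply points the corresponding proto_ops entries at the
 * sock_no_*() stubs above.  The "example_*" handlers and PF_EXAMPLE are
 * hypothetical.
 */
#if 0	/* example only */
static const struct proto_ops example_dgram_ops = {
	.family		= PF_EXAMPLE,
	.owner		= THIS_MODULE,
	.release	= example_release,
	.bind		= example_bind,
	.connect	= example_connect,
	.socketpair	= sock_no_socketpair,	/* not supported */
	.accept		= sock_no_accept,	/* not supported */
	.getname	= example_getname,
	.poll		= datagram_poll,
	.ioctl		= example_ioctl,
	.listen		= sock_no_listen,	/* not supported */
	.shutdown	= example_shutdown,
	.setsockopt	= sock_common_setsockopt,
	.getsockopt	= sock_common_getsockopt,
	.sendmsg	= example_sendmsg,
	.recvmsg	= sock_common_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= sock_no_sendpage,
};
#endif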
2064
2065/*
2066 *      Default Socket Callbacks
2067 */
2068
2069static void sock_def_wakeup(struct sock *sk)
2070{
2071        struct socket_wq *wq;
2072
2073        rcu_read_lock();
2074        wq = rcu_dereference(sk->sk_wq);
2075        if (wq_has_sleeper(wq))
2076                wake_up_interruptible_all(&wq->wait);
2077        rcu_read_unlock();
2078}
2079
2080static void sock_def_error_report(struct sock *sk)
2081{
2082        struct socket_wq *wq;
2083
2084        rcu_read_lock();
2085        wq = rcu_dereference(sk->sk_wq);
2086        if (wq_has_sleeper(wq))
2087                wake_up_interruptible_poll(&wq->wait, POLLERR);
2088        sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2089        rcu_read_unlock();
2090}
2091
2092static void sock_def_readable(struct sock *sk, int len)
2093{
2094        struct socket_wq *wq;
2095
2096        rcu_read_lock();
2097        wq = rcu_dereference(sk->sk_wq);
2098        if (wq_has_sleeper(wq))
2099                wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2100                                                POLLRDNORM | POLLRDBAND);
2101        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2102        rcu_read_unlock();
2103}
2104
2105static void sock_def_write_space(struct sock *sk)
2106{
2107        struct socket_wq *wq;
2108
2109        rcu_read_lock();
2110
2111        /* Do not wake up a writer until he can make "significant"
2112         * progress.  --DaveM
2113         */
2114        if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2115                wq = rcu_dereference(sk->sk_wq);
2116                if (wq_has_sleeper(wq))
2117                        wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2118                                                POLLWRNORM | POLLWRBAND);
2119
2120                /* Should agree with poll, otherwise some programs break */
2121                if (sock_writeable(sk))
2122                        sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2123        }
2124
2125        rcu_read_unlock();
2126}
2127
2128static void sock_def_destruct(struct sock *sk)
2129{
2130        kfree(sk->sk_protinfo);
2131}
2132
2133void sk_send_sigurg(struct sock *sk)
2134{
2135        if (sk->sk_socket && sk->sk_socket->file)
2136                if (send_sigurg(&sk->sk_socket->file->f_owner))
2137                        sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2138}
2139EXPORT_SYMBOL(sk_send_sigurg);
2140
2141void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2142                    unsigned long expires)
2143{
2144        if (!mod_timer(timer, expires))
2145                sock_hold(sk);
2146}
2147EXPORT_SYMBOL(sk_reset_timer);
2148
2149void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2150{
2151        if (timer_pending(timer) && del_timer(timer))
2152                __sock_put(sk);
2153}
2154EXPORT_SYMBOL(sk_stop_timer);
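
/*
 * Illustrative sketch, not part of sock.c: sk_reset_timer() takes a reference
 * on the sock when it arms a previously idle timer and sk_stop_timer() drops
 * it when it deletes a pending one, so the two must be used as a pair.
 * The "example_*" helpers are hypothetical.
 */
#if 0	/* example only */
static void example_start_keepalive(struct sock *sk)
{
	sk_reset_timer(sk, &sk->sk_timer, jiffies + 10 * HZ);
}

static void example_stop_keepalive(struct sock *sk)
{
	sk_stop_timer(sk, &sk->sk_timer);
}
#endif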
2155
2156void sock_init_data(struct socket *sock, struct sock *sk)
2157{
2158        skb_queue_head_init(&sk->sk_receive_queue);
2159        skb_queue_head_init(&sk->sk_write_queue);
2160        skb_queue_head_init(&sk->sk_error_queue);
2161#ifdef CONFIG_NET_DMA
2162        skb_queue_head_init(&sk->sk_async_wait_queue);
2163#endif
2164
2165        sk->sk_send_head        =       NULL;
2166
2167        init_timer(&sk->sk_timer);
2168
2169        sk->sk_allocation       =       GFP_KERNEL;
2170        sk->sk_rcvbuf           =       sysctl_rmem_default;
2171        sk->sk_sndbuf           =       sysctl_wmem_default;
2172        sk->sk_state            =       TCP_CLOSE;
2173        sk_set_socket(sk, sock);
2174
2175        sock_set_flag(sk, SOCK_ZAPPED);
2176
2177        if (sock) {
2178                sk->sk_type     =       sock->type;
2179                sk->sk_wq       =       sock->wq;
2180                sock->sk        =       sk;
2181        } else
2182                sk->sk_wq       =       NULL;
2183
2184        spin_lock_init(&sk->sk_dst_lock);
2185        rwlock_init(&sk->sk_callback_lock);
2186        lockdep_set_class_and_name(&sk->sk_callback_lock,
2187                        af_callback_keys + sk->sk_family,
2188                        af_family_clock_key_strings[sk->sk_family]);
2189
2190        sk->sk_state_change     =       sock_def_wakeup;
2191        sk->sk_data_ready       =       sock_def_readable;
2192        sk->sk_write_space      =       sock_def_write_space;
2193        sk->sk_error_report     =       sock_def_error_report;
2194        sk->sk_destruct         =       sock_def_destruct;
2195
2196        sk->sk_frag.page        =       NULL;
2197        sk->sk_frag.offset      =       0;
2198        sk->sk_peek_off         =       -1;
2199
2200        sk->sk_peer_pid         =       NULL;
2201        sk->sk_peer_cred        =       NULL;
2202        sk->sk_write_pending    =       0;
2203        sk->sk_rcvlowat         =       1;
2204        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2205        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2206
2207        sk->sk_stamp = ktime_set(-1L, 0);
2208
2209        /*
2210         * Before updating sk_refcnt, we must commit prior changes to memory
2211         * (Documentation/RCU/rculist_nulls.txt for details)
2212         */
2213        smp_wmb();
2214        atomic_set(&sk->sk_refcnt, 1);
2215        atomic_set(&sk->sk_drops, 0);
2216}
2217EXPORT_SYMBOL(sock_init_data);
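
/*
 * Illustrative sketch, not part of sock.c: an address family's create()
 * routine allocates the sock, initializes the generic fields with
 * sock_init_data() and then overrides whichever callbacks it needs.
 * "example_proto", "example_destruct", "example_dgram_ops" and PF_EXAMPLE
 * are hypothetical.
 */
#if 0	/* example only */
static int example_create(struct net *net, struct socket *sock, int protocol,
			  int kern)
{
	struct sock *sk;

	sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto);
	if (!sk)
		return -ENOMEM;

	sock->ops = &example_dgram_ops;
	sock_init_data(sock, sk);		/* queues, timers, default callbacks */
	sk->sk_destruct = example_destruct;	/* replaces sock_def_destruct */
	sk->sk_protocol = protocol;
	return 0;
}
#endif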
2218
2219void lock_sock_nested(struct sock *sk, int subclass)
2220{
2221        might_sleep();
2222        spin_lock_bh(&sk->sk_lock.slock);
2223        if (sk->sk_lock.owned)
2224                __lock_sock(sk);
2225        sk->sk_lock.owned = 1;
2226        spin_unlock(&sk->sk_lock.slock);
2227        /*
2228         * The sk_lock has mutex_lock() semantics here:
2229         */
2230        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2231        local_bh_enable();
2232}
2233EXPORT_SYMBOL(lock_sock_nested);
2234
2235void release_sock(struct sock *sk)
2236{
2237        /*
2238         * The sk_lock has mutex_unlock() semantics:
2239         */
2240        mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2241
2242        spin_lock_bh(&sk->sk_lock.slock);
2243        if (sk->sk_backlog.tail)
2244                __release_sock(sk);
2245
2246        if (sk->sk_prot->release_cb)
2247                sk->sk_prot->release_cb(sk);
2248
2249        sk->sk_lock.owned = 0;
2250        if (waitqueue_active(&sk->sk_lock.wq))
2251                wake_up(&sk->sk_lock.wq);
2252        spin_unlock_bh(&sk->sk_lock.slock);
2253}
2254EXPORT_SYMBOL(release_sock);
2255
2256/**
2257 * lock_sock_fast - fast version of lock_sock
2258 * @sk: socket
2259 *
2260 * This version should be used for very small sections, where the process won't block.
2261 * Returns false if the fast path is taken:
2262 *   sk_lock.slock locked, owned = 0, BH disabled
2263 * Returns true if the slow path is taken:
2264 *   sk_lock.slock unlocked, owned = 1, BH enabled
2265 */
2266bool lock_sock_fast(struct sock *sk)
2267{
2268        might_sleep();
2269        spin_lock_bh(&sk->sk_lock.slock);
2270
2271        if (!sk->sk_lock.owned)
2272                /*
2273                 * Note : we return with BH disabled and sk_lock.slock held
2274                 */
2275                return false;
2276
2277        __lock_sock(sk);
2278        sk->sk_lock.owned = 1;
2279        spin_unlock(&sk->sk_lock.slock);
2280        /*
2281         * The sk_lock has mutex_lock() semantics here:
2282         */
2283        mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2284        local_bh_enable();
2285        return true;
2286}
2287EXPORT_SYMBOL(lock_sock_fast);
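
/*
 * Illustrative sketch, not part of sock.c: lock_sock_fast() must be paired
 * with unlock_sock_fast(), passing back the returned "slow" flag so the
 * matching unlock (spin_unlock_bh() vs release_sock()) is used.
 * "example_short_section" is hypothetical.
 */
#if 0	/* example only */
static void example_short_section(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);

	/* ... a very small section that must not sleep ... */

	unlock_sock_fast(sk, slow);
}
#endif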
2288
2289int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2290{
2291        struct timeval tv;
2292        if (!sock_flag(sk, SOCK_TIMESTAMP))
2293                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2294        tv = ktime_to_timeval(sk->sk_stamp);
2295        if (tv.tv_sec == -1)
2296                return -ENOENT;
2297        if (tv.tv_sec == 0) {
2298                sk->sk_stamp = ktime_get_real();
2299                tv = ktime_to_timeval(sk->sk_stamp);
2300        }
2301        return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2302}
2303EXPORT_SYMBOL(sock_get_timestamp);
2304
2305int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2306{
2307        struct timespec ts;
2308        if (!sock_flag(sk, SOCK_TIMESTAMP))
2309                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2310        ts = ktime_to_timespec(sk->sk_stamp);
2311        if (ts.tv_sec == -1)
2312                return -ENOENT;
2313        if (ts.tv_sec == 0) {
2314                sk->sk_stamp = ktime_get_real();
2315                ts = ktime_to_timespec(sk->sk_stamp);
2316        }
2317        return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2318}
2319EXPORT_SYMBOL(sock_get_timestampns);
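
/*
 * Illustrative sketch, not part of sock.c: these two helpers back the
 * SIOCGSTAMP and SIOCGSTAMPNS ioctls; a protocol-level ioctl handler just
 * forwards to them.  "example_sock_ioctl" is hypothetical.
 */
#if 0	/* example only */
static int example_sock_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	switch (cmd) {
	case SIOCGSTAMP:
		return sock_get_timestamp(sk, (struct timeval __user *)arg);
	case SIOCGSTAMPNS:
		return sock_get_timestampns(sk, (struct timespec __user *)arg);
	default:
		return -ENOIOCTLCMD;
	}
}
#endif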
2320
2321void sock_enable_timestamp(struct sock *sk, int flag)
2322{
2323        if (!sock_flag(sk, flag)) {
2324                unsigned long previous_flags = sk->sk_flags;
2325
2326                sock_set_flag(sk, flag);
2327                /*
2328                 * we just set one of the two flags which require net
2329                 * time stamping, but time stamping might have been on
2330                 * already because of the other one
2331                 */
2332                if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2333                        net_enable_timestamp();
2334        }
2335}
2336
2337/*
2338 *      Get a socket option on a socket.
2339 *
2340 *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2341 *      asynchronous errors should be reported by getsockopt. We assume
2342 *      this means if you specify SO_ERROR (otherwise what's the point of it).
2343 */
2344int sock_common_getsockopt(struct socket *sock, int level, int optname,
2345                           char __user *optval, int __user *optlen)
2346{
2347        struct sock *sk = sock->sk;
2348
2349        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2350}
2351EXPORT_SYMBOL(sock_common_getsockopt);
2352
2353#ifdef CONFIG_COMPAT
2354int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2355                                  char __user *optval, int __user *optlen)
2356{
2357        struct sock *sk = sock->sk;
2358
2359        if (sk->sk_prot->compat_getsockopt != NULL)
2360                return sk->sk_prot->compat_getsockopt(sk, level, optname,
2361                                                      optval, optlen);
2362        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2363}
2364EXPORT_SYMBOL(compat_sock_common_getsockopt);
2365#endif
2366
2367int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2368                        struct msghdr *msg, size_t size, int flags)
2369{
2370        struct sock *sk = sock->sk;
2371        int addr_len = 0;
2372        int err;
2373
2374        err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2375                                   flags & ~MSG_DONTWAIT, &addr_len);
2376        if (err >= 0)
2377                msg->msg_namelen = addr_len;
2378        return err;
2379}
2380EXPORT_SYMBOL(sock_common_recvmsg);
2381
2382/*
2383 *      Set socket options on an inet socket.
2384 */
2385int sock_common_setsockopt(struct socket *sock, int level, int optname,
2386                           char __user *optval, unsigned int optlen)
2387{
2388        struct sock *sk = sock->sk;
2389
2390        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2391}
2392EXPORT_SYMBOL(sock_common_setsockopt);
2393
2394#ifdef CONFIG_COMPAT
2395int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2396                                  char __user *optval, unsigned int optlen)
2397{
2398        struct sock *sk = sock->sk;
2399
2400        if (sk->sk_prot->compat_setsockopt != NULL)
2401                return sk->sk_prot->compat_setsockopt(sk, level, optname,
2402                                                      optval, optlen);
2403        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2404}
2405EXPORT_SYMBOL(compat_sock_common_setsockopt);
2406#endif
2407
2408void sk_common_release(struct sock *sk)
2409{
2410        if (sk->sk_prot->destroy)
2411                sk->sk_prot->destroy(sk);
2412
2413        /*
2414         * Observation: when sk_common_release() is called, processes have
2415         * no access to the socket, but the network stack still does.
2416         * Step one, detach it from networking:
2417         *
2418         * A. Remove from hash tables.
2419         */
2420
2421        sk->sk_prot->unhash(sk);
2422
2423        /*
2424         * At this point the socket cannot receive new packets, but it is possible
2425         * that some packets are in flight because some CPU runs the receiver and
2426         * did a hash table lookup before we unhashed the socket. They will reach
2427         * the receive queue and be purged by the socket destructor.
2428         *
2429         * Also we still have packets pending on the receive queue and probably
2430         * our own packets waiting in device queues. sock_destroy will drain the
2431         * receive queue, but transmitted packets will delay socket destruction
2432         * until the last reference is released.
2433         */
2434
2435        sock_orphan(sk);
2436
2437        xfrm_sk_free_policy(sk);
2438
2439        sk_refcnt_debug_release(sk);
2440
2441        if (sk->sk_frag.page) {
2442                put_page(sk->sk_frag.page);
2443                sk->sk_frag.page = NULL;
2444        }
2445
2446        sock_put(sk);
2447}
2448EXPORT_SYMBOL(sk_common_release);
2449
2450#ifdef CONFIG_PROC_FS
2451#define PROTO_INUSE_NR  64      /* should be enough for the first time */
2452struct prot_inuse {
2453        int val[PROTO_INUSE_NR];
2454};
2455
2456static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2457
2458#ifdef CONFIG_NET_NS
2459void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2460{
2461        __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2462}
2463EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2464
2465int sock_prot_inuse_get(struct net *net, struct proto *prot)
2466{
2467        int cpu, idx = prot->inuse_idx;
2468        int res = 0;
2469
2470        for_each_possible_cpu(cpu)
2471                res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2472
2473        return res >= 0 ? res : 0;
2474}
2475EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2476
2477static int __net_init sock_inuse_init_net(struct net *net)
2478{
2479        net->core.inuse = alloc_percpu(struct prot_inuse);
2480        return net->core.inuse ? 0 : -ENOMEM;
2481}
2482
2483static void __net_exit sock_inuse_exit_net(struct net *net)
2484{
2485        free_percpu(net->core.inuse);
2486}
2487
2488static struct pernet_operations net_inuse_ops = {
2489        .init = sock_inuse_init_net,
2490        .exit = sock_inuse_exit_net,
2491};
2492
2493static __init int net_inuse_init(void)
2494{
2495        if (register_pernet_subsys(&net_inuse_ops))
2496                panic("Cannot initialize net inuse counters");
2497
2498        return 0;
2499}
2500
2501core_initcall(net_inuse_init);
2502#else
2503static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2504
2505void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2506{
2507        __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2508}
2509EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2510
2511int sock_prot_inuse_get(struct net *net, struct proto *prot)
2512{
2513        int cpu, idx = prot->inuse_idx;
2514        int res = 0;
2515
2516        for_each_possible_cpu(cpu)
2517                res += per_cpu(prot_inuse, cpu).val[idx];
2518
2519        return res >= 0 ? res : 0;
2520}
2521EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2522#endif
2523
2524static void assign_proto_idx(struct proto *prot)
2525{
2526        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2527
2528        if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2529                pr_err("PROTO_INUSE_NR exhausted\n");
2530                return;
2531        }
2532
2533        set_bit(prot->inuse_idx, proto_inuse_idx);
2534}
2535
2536static void release_proto_idx(struct proto *prot)
2537{
2538        if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2539                clear_bit(prot->inuse_idx, proto_inuse_idx);
2540}
2541#else
2542static inline void assign_proto_idx(struct proto *prot)
2543{
2544}
2545
2546static inline void release_proto_idx(struct proto *prot)
2547{
2548}
2549#endif
2550
2551int proto_register(struct proto *prot, int alloc_slab)
2552{
2553        if (alloc_slab) {
2554                prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2555                                        SLAB_HWCACHE_ALIGN | prot->slab_flags,
2556                                        NULL);
2557
2558                if (prot->slab == NULL) {
2559                        pr_crit("%s: Can't create sock SLAB cache!\n",
2560                                prot->name);
2561                        goto out;
2562                }
2563
2564                if (prot->rsk_prot != NULL) {
2565                        prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2566                        if (prot->rsk_prot->slab_name == NULL)
2567                                goto out_free_sock_slab;
2568
2569                        prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2570                                                                 prot->rsk_prot->obj_size, 0,
2571                                                                 SLAB_HWCACHE_ALIGN, NULL);
2572
2573                        if (prot->rsk_prot->slab == NULL) {
2574                                pr_crit("%s: Can't create request sock SLAB cache!\n",
2575                                        prot->name);
2576                                goto out_free_request_sock_slab_name;
2577                        }
2578                }
2579
2580                if (prot->twsk_prot != NULL) {
2581                        prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2582
2583                        if (prot->twsk_prot->twsk_slab_name == NULL)
2584                                goto out_free_request_sock_slab;
2585
2586                        prot->twsk_prot->twsk_slab =
2587                                kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2588                                                  prot->twsk_prot->twsk_obj_size,
2589                                                  0,
2590                                                  SLAB_HWCACHE_ALIGN |
2591                                                        prot->slab_flags,
2592                                                  NULL);
2593                        if (prot->twsk_prot->twsk_slab == NULL)
2594                                goto out_free_timewait_sock_slab_name;
2595                }
2596        }
2597
2598        mutex_lock(&proto_list_mutex);
2599        list_add(&prot->node, &proto_list);
2600        assign_proto_idx(prot);
2601        mutex_unlock(&proto_list_mutex);
2602        return 0;
2603
2604out_free_timewait_sock_slab_name:
2605        kfree(prot->twsk_prot->twsk_slab_name);
2606out_free_request_sock_slab:
2607        if (prot->rsk_prot && prot->rsk_prot->slab) {
2608                kmem_cache_destroy(prot->rsk_prot->slab);
2609                prot->rsk_prot->slab = NULL;
2610        }
2611out_free_request_sock_slab_name:
2612        if (prot->rsk_prot)
2613                kfree(prot->rsk_prot->slab_name);
2614out_free_sock_slab:
2615        kmem_cache_destroy(prot->slab);
2616        prot->slab = NULL;
2617out:
2618        return -ENOBUFS;
2619}
2620EXPORT_SYMBOL(proto_register);
2621
2622void proto_unregister(struct proto *prot)
2623{
2624        mutex_lock(&proto_list_mutex);
2625        release_proto_idx(prot);
2626        list_del(&prot->node);
2627        mutex_unlock(&proto_list_mutex);
2628
2629        if (prot->slab != NULL) {
2630                kmem_cache_destroy(prot->slab);
2631                prot->slab = NULL;
2632        }
2633
2634        if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2635                kmem_cache_destroy(prot->rsk_prot->slab);
2636                kfree(prot->rsk_prot->slab_name);
2637                prot->rsk_prot->slab = NULL;
2638        }
2639
2640        if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2641                kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2642                kfree(prot->twsk_prot->twsk_slab_name);
2643                prot->twsk_prot->twsk_slab = NULL;
2644        }
2645}
2646EXPORT_SYMBOL(proto_unregister);
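
/*
 * Illustrative sketch, not part of sock.c: protocols usually call
 * proto_register() from their module init (alloc_slab=1 gives the socks a
 * dedicated SLAB cache) and proto_unregister() on exit.  "example_prot",
 * "struct example_sock" and the family registration are hypothetical.
 */
#if 0	/* example only */
static struct proto example_prot = {
	.name		= "EXAMPLE",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct example_sock),
};

static int __init example_init(void)
{
	int rc = proto_register(&example_prot, 1);

	if (rc)
		return rc;
	/* ... sock_register() the address family here ... */
	return 0;
}

static void __exit example_exit(void)
{
	/* ... sock_unregister() the address family here ... */
	proto_unregister(&example_prot);
}
#endif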
2647
2648#ifdef CONFIG_PROC_FS
2649static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2650        __acquires(proto_list_mutex)
2651{
2652        mutex_lock(&proto_list_mutex);
2653        return seq_list_start_head(&proto_list, *pos);
2654}
2655
2656static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2657{
2658        return seq_list_next(v, &proto_list, pos);
2659}
2660
2661static void proto_seq_stop(struct seq_file *seq, void *v)
2662        __releases(proto_list_mutex)
2663{
2664        mutex_unlock(&proto_list_mutex);
2665}
2666
2667static char proto_method_implemented(const void *method)
2668{
2669        return method == NULL ? 'n' : 'y';
2670}
2671static long sock_prot_memory_allocated(struct proto *proto)
2672{
2673        return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2674}
2675
2676static char *sock_prot_memory_pressure(struct proto *proto)
2677{
2678        return proto->memory_pressure != NULL ?
2679        proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2680}
2681
2682static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2683{
2684
2685        seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2686                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2687                   proto->name,
2688                   proto->obj_size,
2689                   sock_prot_inuse_get(seq_file_net(seq), proto),
2690                   sock_prot_memory_allocated(proto),
2691                   sock_prot_memory_pressure(proto),
2692                   proto->max_header,
2693                   proto->slab == NULL ? "no" : "yes",
2694                   module_name(proto->owner),
2695                   proto_method_implemented(proto->close),
2696                   proto_method_implemented(proto->connect),
2697                   proto_method_implemented(proto->disconnect),
2698                   proto_method_implemented(proto->accept),
2699                   proto_method_implemented(proto->ioctl),
2700                   proto_method_implemented(proto->init),
2701                   proto_method_implemented(proto->destroy),
2702                   proto_method_implemented(proto->shutdown),
2703                   proto_method_implemented(proto->setsockopt),
2704                   proto_method_implemented(proto->getsockopt),
2705                   proto_method_implemented(proto->sendmsg),
2706                   proto_method_implemented(proto->recvmsg),
2707                   proto_method_implemented(proto->sendpage),
2708                   proto_method_implemented(proto->bind),
2709                   proto_method_implemented(proto->backlog_rcv),
2710                   proto_method_implemented(proto->hash),
2711                   proto_method_implemented(proto->unhash),
2712                   proto_method_implemented(proto->get_port),
2713                   proto_method_implemented(proto->enter_memory_pressure));
2714}
2715
2716static int proto_seq_show(struct seq_file *seq, void *v)
2717{
2718        if (v == &proto_list)
2719                seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2720                           "protocol",
2721                           "size",
2722                           "sockets",
2723                           "memory",
2724                           "press",
2725                           "maxhdr",
2726                           "slab",
2727                           "module",
2728                           "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2729        else
2730                proto_seq_printf(seq, list_entry(v, struct proto, node));
2731        return 0;
2732}
2733
2734static const struct seq_operations proto_seq_ops = {
2735        .start  = proto_seq_start,
2736        .next   = proto_seq_next,
2737        .stop   = proto_seq_stop,
2738        .show   = proto_seq_show,
2739};
2740
2741static int proto_seq_open(struct inode *inode, struct file *file)
2742{
2743        return seq_open_net(inode, file, &proto_seq_ops,
2744                            sizeof(struct seq_net_private));
2745}
2746
2747static const struct file_operations proto_seq_fops = {
2748        .owner          = THIS_MODULE,
2749        .open           = proto_seq_open,
2750        .read           = seq_read,
2751        .llseek         = seq_lseek,
2752        .release        = seq_release_net,
2753};
2754
2755static __net_init int proto_init_net(struct net *net)
2756{
2757        if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2758                return -ENOMEM;
2759
2760        return 0;
2761}
2762
2763static __net_exit void proto_exit_net(struct net *net)
2764{
2765        proc_net_remove(net, "protocols");
2766}
2767
2768
2769static __net_initdata struct pernet_operations proto_net_ops = {
2770        .init = proto_init_net,
2771        .exit = proto_exit_net,
2772};
2773
2774static int __init proto_init(void)
2775{
2776        return register_pernet_subsys(&proto_net_ops);
2777}
2778
2779subsys_initcall(proto_init);
2780
2781#endif /* PROC_FS */
2782