linux/net/core/sock.c
<<
>>
Prefs
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Generic socket support routines. Memory allocators, socket lock/release
   7 *              handler for protocols to use and generic option handler.
   8 *
   9 *
  10 * Authors:     Ross Biro
  11 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *              Florian La Roche, <flla@stud.uni-sb.de>
  13 *              Alan Cox, <A.Cox@swansea.ac.uk>
  14 *
  15 * Fixes:
  16 *              Alan Cox        :       Numerous verify_area() problems
  17 *              Alan Cox        :       Connecting on a connecting socket
  18 *                                      now returns an error for tcp.
  19 *              Alan Cox        :       sock->protocol is set correctly.
  20 *                                      and is not sometimes left as 0.
  21 *              Alan Cox        :       connect handles icmp errors on a
  22 *                                      connect properly. Unfortunately there
  23 *                                      is a restart syscall nasty there. I
  24 *                                      can't match BSD without hacking the C
  25 *                                      library. Ideas urgently sought!
  26 *              Alan Cox        :       Disallow bind() to addresses that are
  27 *                                      not ours - especially broadcast ones!!
  28 *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29 *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30 *                                      instead they leave that for the DESTROY timer.
  31 *              Alan Cox        :       Clean up error flag in accept
  32 *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33 *                                      was buggy. Put a remove_sock() in the handler
  34 *                                      for memory when we hit 0. Also altered the timer
  35 *                                      code. The ACK stuff can wait and needs major
  36 *                                      TCP layer surgery.
  37 *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38 *                                      and fixed timer/inet_bh race.
  39 *              Alan Cox        :       Added zapped flag for TCP
  40 *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41 *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42 *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43 *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44 *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45 *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46 *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47 *      Pauline Middelink       :       identd support
  48 *              Alan Cox        :       Fixed connect() taking signals I think.
  49 *              Alan Cox        :       SO_LINGER supported
  50 *              Alan Cox        :       Error reporting fixes
  51 *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52 *              Alan Cox        :       inet sockets don't set sk->type!
  53 *              Alan Cox        :       Split socket option code
  54 *              Alan Cox        :       Callbacks
  55 *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56 *              Alex            :       Removed restriction on inet fioctl
  57 *              Alan Cox        :       Splitting INET from NET core
  58 *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59 *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60 *              Alan Cox        :       Split IP from generic code
  61 *              Alan Cox        :       New kfree_skbmem()
  62 *              Alan Cox        :       Make SO_DEBUG superuser only.
  63 *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64 *                                      (compatibility fix)
  65 *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66 *              Alan Cox        :       Allocator for a socket is settable.
  67 *              Alan Cox        :       SO_ERROR includes soft errors.
  68 *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69 *              Alan Cox        :       Generic socket allocation to make hooks
  70 *                                      easier (suggested by Craig Metz).
  71 *              Michael Pall    :       SO_ERROR returns positive errno again
  72 *              Steve Whitehouse:       Added default destructor to free
  73 *                                      protocol private data.
  74 *              Steve Whitehouse:       Added various other default routines
  75 *                                      common to several socket families.
  76 *              Chris Evans     :       Call suser() check last on F_SETOWN
  77 *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78 *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79 *              Andi Kleen      :       Fix write_space callback
  80 *              Chris Evans     :       Security fixes - signedness again
  81 *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82 *
  83 * To Fix:
  84 *
  85 *
  86 *              This program is free software; you can redistribute it and/or
  87 *              modify it under the terms of the GNU General Public License
  88 *              as published by the Free Software Foundation; either version
  89 *              2 of the License, or (at your option) any later version.
  90 */
  91
  92#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94#include <linux/capability.h>
  95#include <linux/errno.h>
  96#include <linux/types.h>
  97#include <linux/socket.h>
  98#include <linux/in.h>
  99#include <linux/kernel.h>
 100#include <linux/module.h>
 101#include <linux/proc_fs.h>
 102#include <linux/seq_file.h>
 103#include <linux/sched.h>
 104#include <linux/timer.h>
 105#include <linux/string.h>
 106#include <linux/sockios.h>
 107#include <linux/net.h>
 108#include <linux/mm.h>
 109#include <linux/slab.h>
 110#include <linux/interrupt.h>
 111#include <linux/poll.h>
 112#include <linux/tcp.h>
 113#include <linux/init.h>
 114#include <linux/highmem.h>
 115#include <linux/user_namespace.h>
 116#include <linux/static_key.h>
 117#include <linux/memcontrol.h>
 118#include <linux/prefetch.h>
 119
 120#include <asm/uaccess.h>
 121
 122#include <linux/netdevice.h>
 123#include <net/protocol.h>
 124#include <linux/skbuff.h>
 125#include <net/net_namespace.h>
 126#include <net/request_sock.h>
 127#include <net/sock.h>
 128#include <linux/net_tstamp.h>
 129#include <net/xfrm.h>
 130#include <linux/ipsec.h>
 131#include <net/cls_cgroup.h>
 132#include <net/netprio_cgroup.h>
 133
 134#include <linux/filter.h>
 135
 136#include <trace/events/sock.h>
 137
 138#ifdef CONFIG_INET
 139#include <net/tcp.h>
 140#endif
 141
 142static DEFINE_MUTEX(proto_list_mutex);
 143static LIST_HEAD(proto_list);
 144
 145#ifdef CONFIG_MEMCG_KMEM
 146int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 147{
 148        struct proto *proto;
 149        int ret = 0;
 150
 151        mutex_lock(&proto_list_mutex);
 152        list_for_each_entry(proto, &proto_list, node) {
 153                if (proto->init_cgroup) {
 154                        ret = proto->init_cgroup(memcg, ss);
 155                        if (ret)
 156                                goto out;
 157                }
 158        }
 159
 160        mutex_unlock(&proto_list_mutex);
 161        return ret;
 162out:
 163        list_for_each_entry_continue_reverse(proto, &proto_list, node)
 164                if (proto->destroy_cgroup)
 165                        proto->destroy_cgroup(memcg);
 166        mutex_unlock(&proto_list_mutex);
 167        return ret;
 168}
 169
 170void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
 171{
 172        struct proto *proto;
 173
 174        mutex_lock(&proto_list_mutex);
 175        list_for_each_entry_reverse(proto, &proto_list, node)
 176                if (proto->destroy_cgroup)
 177                        proto->destroy_cgroup(memcg);
 178        mutex_unlock(&proto_list_mutex);
 179}
 180#endif
 181
 182/*
 183 * Each address family might have different locking rules, so we have
 184 * one slock key per address family:
 185 */
 186static struct lock_class_key af_family_keys[AF_MAX];
 187static struct lock_class_key af_family_slock_keys[AF_MAX];
 188
 189struct static_key memcg_socket_limit_enabled;
 190EXPORT_SYMBOL(memcg_socket_limit_enabled);
 191
 192/*
 193 * Make lock validator output more readable. (we pre-construct these
 194 * strings build-time, so that runtime initialization of socket
 195 * locks is fast):
 196 */
 197static const char *const af_family_key_strings[AF_MAX+1] = {
 198  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
 199  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
 200  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
 201  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
 202  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
 203  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
 204  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
 205  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
 206  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
 207  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
 208  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
 209  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
 210  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
 211  "sk_lock-AF_NFC"   , "sk_lock-AF_MAX"
 212};
 213static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 214  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
 215  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
 216  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
 217  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
 218  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
 219  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
 220  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
 221  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
 222  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
 223  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
 224  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
 225  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
 226  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
 227  "slock-AF_NFC"   , "slock-AF_MAX"
 228};
 229static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 230  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
 231  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
 232  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
 233  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
 234  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
 235  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
 236  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
 237  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
 238  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
 239  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
 240  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
 241  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
 242  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
 243  "clock-AF_NFC"   , "clock-AF_MAX"
 244};
 245
 246/*
 247 * sk_callback_lock locking rules are per-address-family,
 248 * so split the lock classes by using a per-AF key:
 249 */
 250static struct lock_class_key af_callback_keys[AF_MAX];
 251
 252/* Take into consideration the size of the struct sk_buff overhead in the
 253 * determination of these values, since that is non-constant across
 254 * platforms.  This makes socket queueing behavior and performance
 255 * not depend upon such differences.
 256 */
 257#define _SK_MEM_PACKETS         256
 258#define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
 259#define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 260#define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 261
 262/* Run time adjustable parameters. */
 263__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 264EXPORT_SYMBOL(sysctl_wmem_max);
 265__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 266EXPORT_SYMBOL(sysctl_rmem_max);
 267__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 268__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 269
 270/* Maximal space eaten by iovec or ancillary data plus some space */
 271int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 272EXPORT_SYMBOL(sysctl_optmem_max);
 273
 274struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
 275EXPORT_SYMBOL_GPL(memalloc_socks);
 276
 277/**
 278 * sk_set_memalloc - sets %SOCK_MEMALLOC
 279 * @sk: socket to set it on
 280 *
 281 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 282 * It's the responsibility of the admin to adjust min_free_kbytes
 283 * to meet the requirements
 284 */
 285void sk_set_memalloc(struct sock *sk)
 286{
 287        sock_set_flag(sk, SOCK_MEMALLOC);
 288        sk->sk_allocation |= __GFP_MEMALLOC;
 289        static_key_slow_inc(&memalloc_socks);
 290}
 291EXPORT_SYMBOL_GPL(sk_set_memalloc);
 292
 293void sk_clear_memalloc(struct sock *sk)
 294{
 295        sock_reset_flag(sk, SOCK_MEMALLOC);
 296        sk->sk_allocation &= ~__GFP_MEMALLOC;
 297        static_key_slow_dec(&memalloc_socks);
 298
 299        /*
 300         * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 301         * progress of swapping. However, if SOCK_MEMALLOC is cleared while
 302         * it has rmem allocations there is a risk that the user of the
 303         * socket cannot make forward progress due to exceeding the rmem
 304         * limits. By rights, sk_clear_memalloc() should only be called
 305         * on sockets being torn down but warn and reset the accounting if
 306         * that assumption breaks.
 307         */
 308        if (WARN_ON(sk->sk_forward_alloc))
 309                sk_mem_reclaim(sk);
 310}
 311EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 312
 313int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 314{
 315        int ret;
 316        unsigned long pflags = current->flags;
 317
 318        /* these should have been dropped before queueing */
 319        BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 320
 321        current->flags |= PF_MEMALLOC;
 322        ret = sk->sk_backlog_rcv(sk, skb);
 323        tsk_restore_flags(current, pflags, PF_MEMALLOC);
 324
 325        return ret;
 326}
 327EXPORT_SYMBOL(__sk_backlog_rcv);
 328
 329#if defined(CONFIG_CGROUPS)
 330#if !defined(CONFIG_NET_CLS_CGROUP)
 331int net_cls_subsys_id = -1;
 332EXPORT_SYMBOL_GPL(net_cls_subsys_id);
 333#endif
 334#if !defined(CONFIG_NETPRIO_CGROUP)
 335int net_prio_subsys_id = -1;
 336EXPORT_SYMBOL_GPL(net_prio_subsys_id);
 337#endif
 338#endif
 339
 340static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 341{
 342        struct timeval tv;
 343
 344        if (optlen < sizeof(tv))
 345                return -EINVAL;
 346        if (copy_from_user(&tv, optval, sizeof(tv)))
 347                return -EFAULT;
 348        if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 349                return -EDOM;
 350
 351        if (tv.tv_sec < 0) {
 352                static int warned __read_mostly;
 353
 354                *timeo_p = 0;
 355                if (warned < 10 && net_ratelimit()) {
 356                        warned++;
 357                        pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 358                                __func__, current->comm, task_pid_nr(current));
 359                }
 360                return 0;
 361        }
 362        *timeo_p = MAX_SCHEDULE_TIMEOUT;
 363        if (tv.tv_sec == 0 && tv.tv_usec == 0)
 364                return 0;
 365        if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 366                *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
 367        return 0;
 368}
 369
 370static void sock_warn_obsolete_bsdism(const char *name)
 371{
 372        static int warned;
 373        static char warncomm[TASK_COMM_LEN];
 374        if (strcmp(warncomm, current->comm) && warned < 5) {
 375                strcpy(warncomm,  current->comm);
 376                pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 377                        warncomm, name);
 378                warned++;
 379        }
 380}
 381
 382#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
 383
 384static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 385{
 386        if (sk->sk_flags & flags) {
 387                sk->sk_flags &= ~flags;
 388                if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 389                        net_disable_timestamp();
 390        }
 391}
 392
 393
 394int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 395{
 396        int err;
 397        int skb_len;
 398        unsigned long flags;
 399        struct sk_buff_head *list = &sk->sk_receive_queue;
 400
 401        if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 402                atomic_inc(&sk->sk_drops);
 403                trace_sock_rcvqueue_full(sk, skb);
 404                return -ENOMEM;
 405        }
 406
 407        err = sk_filter(sk, skb);
 408        if (err)
 409                return err;
 410
 411        if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 412                atomic_inc(&sk->sk_drops);
 413                return -ENOBUFS;
 414        }
 415
 416        skb->dev = NULL;
 417        skb_set_owner_r(skb, sk);
 418
 419        /* Cache the SKB length before we tack it onto the receive
 420         * queue.  Once it is added it no longer belongs to us and
 421         * may be freed by other threads of control pulling packets
 422         * from the queue.
 423         */
 424        skb_len = skb->len;
 425
 426        /* we escape from rcu protected region, make sure we dont leak
 427         * a norefcounted dst
 428         */
 429        skb_dst_force(skb);
 430
 431        spin_lock_irqsave(&list->lock, flags);
 432        skb->dropcount = atomic_read(&sk->sk_drops);
 433        __skb_queue_tail(list, skb);
 434        spin_unlock_irqrestore(&list->lock, flags);
 435
 436        if (!sock_flag(sk, SOCK_DEAD))
 437                sk->sk_data_ready(sk, skb_len);
 438        return 0;
 439}
 440EXPORT_SYMBOL(sock_queue_rcv_skb);
 441
 442int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
 443{
 444        int rc = NET_RX_SUCCESS;
 445
 446        if (sk_filter(sk, skb))
 447                goto discard_and_relse;
 448
 449        skb->dev = NULL;
 450
 451        if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
 452                atomic_inc(&sk->sk_drops);
 453                goto discard_and_relse;
 454        }
 455        if (nested)
 456                bh_lock_sock_nested(sk);
 457        else
 458                bh_lock_sock(sk);
 459        if (!sock_owned_by_user(sk)) {
 460                /*
 461                 * trylock + unlock semantics:
 462                 */
 463                mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 464
 465                rc = sk_backlog_rcv(sk, skb);
 466
 467                mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 468        } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 469                bh_unlock_sock(sk);
 470                atomic_inc(&sk->sk_drops);
 471                goto discard_and_relse;
 472        }
 473
 474        bh_unlock_sock(sk);
 475out:
 476        sock_put(sk);
 477        return rc;
 478discard_and_relse:
 479        kfree_skb(skb);
 480        goto out;
 481}
 482EXPORT_SYMBOL(sk_receive_skb);
 483
 484void sk_reset_txq(struct sock *sk)
 485{
 486        sk_tx_queue_clear(sk);
 487}
 488EXPORT_SYMBOL(sk_reset_txq);
 489
 490struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 491{
 492        struct dst_entry *dst = __sk_dst_get(sk);
 493
 494        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 495                sk_tx_queue_clear(sk);
 496                RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 497                dst_release(dst);
 498                return NULL;
 499        }
 500
 501        return dst;
 502}
 503EXPORT_SYMBOL(__sk_dst_check);
 504
 505struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 506{
 507        struct dst_entry *dst = sk_dst_get(sk);
 508
 509        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 510                sk_dst_reset(sk);
 511                dst_release(dst);
 512                return NULL;
 513        }
 514
 515        return dst;
 516}
 517EXPORT_SYMBOL(sk_dst_check);
 518
 519static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
 520{
 521        int ret = -ENOPROTOOPT;
 522#ifdef CONFIG_NETDEVICES
 523        struct net *net = sock_net(sk);
 524        char devname[IFNAMSIZ];
 525        int index;
 526
 527        /* Sorry... */
 528        ret = -EPERM;
 529        if (!capable(CAP_NET_RAW))
 530                goto out;
 531
 532        ret = -EINVAL;
 533        if (optlen < 0)
 534                goto out;
 535
 536        /* Bind this socket to a particular device like "eth0",
 537         * as specified in the passed interface name. If the
 538         * name is "" or the option length is zero the socket
 539         * is not bound.
 540         */
 541        if (optlen > IFNAMSIZ - 1)
 542                optlen = IFNAMSIZ - 1;
 543        memset(devname, 0, sizeof(devname));
 544
 545        ret = -EFAULT;
 546        if (copy_from_user(devname, optval, optlen))
 547                goto out;
 548
 549        index = 0;
 550        if (devname[0] != '\0') {
 551                struct net_device *dev;
 552
 553                rcu_read_lock();
 554                dev = dev_get_by_name_rcu(net, devname);
 555                if (dev)
 556                        index = dev->ifindex;
 557                rcu_read_unlock();
 558                ret = -ENODEV;
 559                if (!dev)
 560                        goto out;
 561        }
 562
 563        lock_sock(sk);
 564        sk->sk_bound_dev_if = index;
 565        sk_dst_reset(sk);
 566        release_sock(sk);
 567
 568        ret = 0;
 569
 570out:
 571#endif
 572
 573        return ret;
 574}
 575
 576static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 577{
 578        if (valbool)
 579                sock_set_flag(sk, bit);
 580        else
 581                sock_reset_flag(sk, bit);
 582}
 583
 584/*
 585 *      This is meant for all protocols to use and covers goings on
 586 *      at the socket level. Everything here is generic.
 587 */
 588
 589int sock_setsockopt(struct socket *sock, int level, int optname,
 590                    char __user *optval, unsigned int optlen)
 591{
 592        struct sock *sk = sock->sk;
 593        int val;
 594        int valbool;
 595        struct linger ling;
 596        int ret = 0;
 597
 598        /*
 599         *      Options without arguments
 600         */
 601
 602        if (optname == SO_BINDTODEVICE)
 603                return sock_bindtodevice(sk, optval, optlen);
 604
 605        if (optlen < sizeof(int))
 606                return -EINVAL;
 607
 608        if (get_user(val, (int __user *)optval))
 609                return -EFAULT;
 610
 611        valbool = val ? 1 : 0;
 612
 613        lock_sock(sk);
 614
 615        switch (optname) {
 616        case SO_DEBUG:
 617                if (val && !capable(CAP_NET_ADMIN))
 618                        ret = -EACCES;
 619                else
 620                        sock_valbool_flag(sk, SOCK_DBG, valbool);
 621                break;
 622        case SO_REUSEADDR:
 623                sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 624                break;
 625        case SO_TYPE:
 626        case SO_PROTOCOL:
 627        case SO_DOMAIN:
 628        case SO_ERROR:
 629                ret = -ENOPROTOOPT;
 630                break;
 631        case SO_DONTROUTE:
 632                sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 633                break;
 634        case SO_BROADCAST:
 635                sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 636                break;
 637        case SO_SNDBUF:
 638                /* Don't error on this BSD doesn't and if you think
 639                 * about it this is right. Otherwise apps have to
 640                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
 641                 * are treated in BSD as hints
 642                 */
 643                val = min_t(u32, val, sysctl_wmem_max);
 644set_sndbuf:
 645                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 646                sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
 647                /* Wake up sending tasks if we upped the value. */
 648                sk->sk_write_space(sk);
 649                break;
 650
 651        case SO_SNDBUFFORCE:
 652                if (!capable(CAP_NET_ADMIN)) {
 653                        ret = -EPERM;
 654                        break;
 655                }
 656                goto set_sndbuf;
 657
 658        case SO_RCVBUF:
 659                /* Don't error on this BSD doesn't and if you think
 660                 * about it this is right. Otherwise apps have to
 661                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
 662                 * are treated in BSD as hints
 663                 */
 664                val = min_t(u32, val, sysctl_rmem_max);
 665set_rcvbuf:
 666                sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 667                /*
 668                 * We double it on the way in to account for
 669                 * "struct sk_buff" etc. overhead.   Applications
 670                 * assume that the SO_RCVBUF setting they make will
 671                 * allow that much actual data to be received on that
 672                 * socket.
 673                 *
 674                 * Applications are unaware that "struct sk_buff" and
 675                 * other overheads allocate from the receive buffer
 676                 * during socket buffer allocation.
 677                 *
 678                 * And after considering the possible alternatives,
 679                 * returning the value we actually used in getsockopt
 680                 * is the most desirable behavior.
 681                 */
 682                sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
 683                break;
 684
 685        case SO_RCVBUFFORCE:
 686                if (!capable(CAP_NET_ADMIN)) {
 687                        ret = -EPERM;
 688                        break;
 689                }
 690                goto set_rcvbuf;
 691
 692        case SO_KEEPALIVE:
 693#ifdef CONFIG_INET
 694                if (sk->sk_protocol == IPPROTO_TCP &&
 695                    sk->sk_type == SOCK_STREAM)
 696                        tcp_set_keepalive(sk, valbool);
 697#endif
 698                sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 699                break;
 700
 701        case SO_OOBINLINE:
 702                sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 703                break;
 704
 705        case SO_NO_CHECK:
 706                sk->sk_no_check = valbool;
 707                break;
 708
 709        case SO_PRIORITY:
 710                if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
 711                        sk->sk_priority = val;
 712                else
 713                        ret = -EPERM;
 714                break;
 715
 716        case SO_LINGER:
 717                if (optlen < sizeof(ling)) {
 718                        ret = -EINVAL;  /* 1003.1g */
 719                        break;
 720                }
 721                if (copy_from_user(&ling, optval, sizeof(ling))) {
 722                        ret = -EFAULT;
 723                        break;
 724                }
 725                if (!ling.l_onoff)
 726                        sock_reset_flag(sk, SOCK_LINGER);
 727                else {
 728#if (BITS_PER_LONG == 32)
 729                        if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 730                                sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 731                        else
 732#endif
 733                                sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 734                        sock_set_flag(sk, SOCK_LINGER);
 735                }
 736                break;
 737
 738        case SO_BSDCOMPAT:
 739                sock_warn_obsolete_bsdism("setsockopt");
 740                break;
 741
 742        case SO_PASSCRED:
 743                if (valbool)
 744                        set_bit(SOCK_PASSCRED, &sock->flags);
 745                else
 746                        clear_bit(SOCK_PASSCRED, &sock->flags);
 747                break;
 748
 749        case SO_TIMESTAMP:
 750        case SO_TIMESTAMPNS:
 751                if (valbool)  {
 752                        if (optname == SO_TIMESTAMP)
 753                                sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 754                        else
 755                                sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 756                        sock_set_flag(sk, SOCK_RCVTSTAMP);
 757                        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 758                } else {
 759                        sock_reset_flag(sk, SOCK_RCVTSTAMP);
 760                        sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 761                }
 762                break;
 763
 764        case SO_TIMESTAMPING:
 765                if (val & ~SOF_TIMESTAMPING_MASK) {
 766                        ret = -EINVAL;
 767                        break;
 768                }
 769                sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
 770                                  val & SOF_TIMESTAMPING_TX_HARDWARE);
 771                sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
 772                                  val & SOF_TIMESTAMPING_TX_SOFTWARE);
 773                sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
 774                                  val & SOF_TIMESTAMPING_RX_HARDWARE);
 775                if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 776                        sock_enable_timestamp(sk,
 777                                              SOCK_TIMESTAMPING_RX_SOFTWARE);
 778                else
 779                        sock_disable_timestamp(sk,
 780                                               (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 781                sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
 782                                  val & SOF_TIMESTAMPING_SOFTWARE);
 783                sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
 784                                  val & SOF_TIMESTAMPING_SYS_HARDWARE);
 785                sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
 786                                  val & SOF_TIMESTAMPING_RAW_HARDWARE);
 787                break;
 788
 789        case SO_RCVLOWAT:
 790                if (val < 0)
 791                        val = INT_MAX;
 792                sk->sk_rcvlowat = val ? : 1;
 793                break;
 794
 795        case SO_RCVTIMEO:
 796                ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 797                break;
 798
 799        case SO_SNDTIMEO:
 800                ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 801                break;
 802
 803        case SO_ATTACH_FILTER:
 804                ret = -EINVAL;
 805                if (optlen == sizeof(struct sock_fprog)) {
 806                        struct sock_fprog fprog;
 807
 808                        ret = -EFAULT;
 809                        if (copy_from_user(&fprog, optval, sizeof(fprog)))
 810                                break;
 811
 812                        ret = sk_attach_filter(&fprog, sk);
 813                }
 814                break;
 815
 816        case SO_DETACH_FILTER:
 817                ret = sk_detach_filter(sk);
 818                break;
 819
 820        case SO_PASSSEC:
 821                if (valbool)
 822                        set_bit(SOCK_PASSSEC, &sock->flags);
 823                else
 824                        clear_bit(SOCK_PASSSEC, &sock->flags);
 825                break;
 826        case SO_MARK:
 827                if (!capable(CAP_NET_ADMIN))
 828                        ret = -EPERM;
 829                else
 830                        sk->sk_mark = val;
 831                break;
 832
 833                /* We implement the SO_SNDLOWAT etc to
 834                   not be settable (1003.1g 5.3) */
 835        case SO_RXQ_OVFL:
 836                sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
 837                break;
 838
 839        case SO_WIFI_STATUS:
 840                sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
 841                break;
 842
 843        case SO_PEEK_OFF:
 844                if (sock->ops->set_peek_off)
 845                        sock->ops->set_peek_off(sk, val);
 846                else
 847                        ret = -EOPNOTSUPP;
 848                break;
 849
 850        case SO_NOFCS:
 851                sock_valbool_flag(sk, SOCK_NOFCS, valbool);
 852                break;
 853
 854        default:
 855                ret = -ENOPROTOOPT;
 856                break;
 857        }
 858        release_sock(sk);
 859        return ret;
 860}
 861EXPORT_SYMBOL(sock_setsockopt);
 862
 863
 864void cred_to_ucred(struct pid *pid, const struct cred *cred,
 865                   struct ucred *ucred)
 866{
 867        ucred->pid = pid_vnr(pid);
 868        ucred->uid = ucred->gid = -1;
 869        if (cred) {
 870                struct user_namespace *current_ns = current_user_ns();
 871
 872                ucred->uid = from_kuid(current_ns, cred->euid);
 873                ucred->gid = from_kgid(current_ns, cred->egid);
 874        }
 875}
 876EXPORT_SYMBOL_GPL(cred_to_ucred);
 877
 878int sock_getsockopt(struct socket *sock, int level, int optname,
 879                    char __user *optval, int __user *optlen)
 880{
 881        struct sock *sk = sock->sk;
 882
 883        union {
 884                int val;
 885                struct linger ling;
 886                struct timeval tm;
 887        } v;
 888
 889        int lv = sizeof(int);
 890        int len;
 891
 892        if (get_user(len, optlen))
 893                return -EFAULT;
 894        if (len < 0)
 895                return -EINVAL;
 896
 897        memset(&v, 0, sizeof(v));
 898
 899        switch (optname) {
 900        case SO_DEBUG:
 901                v.val = sock_flag(sk, SOCK_DBG);
 902                break;
 903
 904        case SO_DONTROUTE:
 905                v.val = sock_flag(sk, SOCK_LOCALROUTE);
 906                break;
 907
 908        case SO_BROADCAST:
 909                v.val = sock_flag(sk, SOCK_BROADCAST);
 910                break;
 911
 912        case SO_SNDBUF:
 913                v.val = sk->sk_sndbuf;
 914                break;
 915
 916        case SO_RCVBUF:
 917                v.val = sk->sk_rcvbuf;
 918                break;
 919
 920        case SO_REUSEADDR:
 921                v.val = sk->sk_reuse;
 922                break;
 923
 924        case SO_KEEPALIVE:
 925                v.val = sock_flag(sk, SOCK_KEEPOPEN);
 926                break;
 927
 928        case SO_TYPE:
 929                v.val = sk->sk_type;
 930                break;
 931
 932        case SO_PROTOCOL:
 933                v.val = sk->sk_protocol;
 934                break;
 935
 936        case SO_DOMAIN:
 937                v.val = sk->sk_family;
 938                break;
 939
 940        case SO_ERROR:
 941                v.val = -sock_error(sk);
 942                if (v.val == 0)
 943                        v.val = xchg(&sk->sk_err_soft, 0);
 944                break;
 945
 946        case SO_OOBINLINE:
 947                v.val = sock_flag(sk, SOCK_URGINLINE);
 948                break;
 949
 950        case SO_NO_CHECK:
 951                v.val = sk->sk_no_check;
 952                break;
 953
 954        case SO_PRIORITY:
 955                v.val = sk->sk_priority;
 956                break;
 957
 958        case SO_LINGER:
 959                lv              = sizeof(v.ling);
 960                v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
 961                v.ling.l_linger = sk->sk_lingertime / HZ;
 962                break;
 963
 964        case SO_BSDCOMPAT:
 965                sock_warn_obsolete_bsdism("getsockopt");
 966                break;
 967
 968        case SO_TIMESTAMP:
 969                v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
 970                                !sock_flag(sk, SOCK_RCVTSTAMPNS);
 971                break;
 972
 973        case SO_TIMESTAMPNS:
 974                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
 975                break;
 976
 977        case SO_TIMESTAMPING:
 978                v.val = 0;
 979                if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
 980                        v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
 981                if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
 982                        v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
 983                if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
 984                        v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
 985                if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
 986                        v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
 987                if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
 988                        v.val |= SOF_TIMESTAMPING_SOFTWARE;
 989                if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
 990                        v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
 991                if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
 992                        v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
 993                break;
 994
 995        case SO_RCVTIMEO:
 996                lv = sizeof(struct timeval);
 997                if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
 998                        v.tm.tv_sec = 0;
 999                        v.tm.tv_usec = 0;
1000                } else {
1001                        v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1002                        v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1003                }
1004                break;
1005
1006        case SO_SNDTIMEO:
1007                lv = sizeof(struct timeval);
1008                if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1009                        v.tm.tv_sec = 0;
1010                        v.tm.tv_usec = 0;
1011                } else {
1012                        v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1013                        v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1014                }
1015                break;
1016
1017        case SO_RCVLOWAT:
1018                v.val = sk->sk_rcvlowat;
1019                break;
1020
1021        case SO_SNDLOWAT:
1022                v.val = 1;
1023                break;
1024
1025        case SO_PASSCRED:
1026                v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1027                break;
1028
1029        case SO_PEERCRED:
1030        {
1031                struct ucred peercred;
1032                if (len > sizeof(peercred))
1033                        len = sizeof(peercred);
1034                cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1035                if (copy_to_user(optval, &peercred, len))
1036                        return -EFAULT;
1037                goto lenout;
1038        }
1039
1040        case SO_PEERNAME:
1041        {
1042                char address[128];
1043
1044                if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1045                        return -ENOTCONN;
1046                if (lv < len)
1047                        return -EINVAL;
1048                if (copy_to_user(optval, address, len))
1049                        return -EFAULT;
1050                goto lenout;
1051        }
1052
1053        /* Dubious BSD thing... Probably nobody even uses it, but
1054         * the UNIX standard wants it for whatever reason... -DaveM
1055         */
1056        case SO_ACCEPTCONN:
1057                v.val = sk->sk_state == TCP_LISTEN;
1058                break;
1059
1060        case SO_PASSSEC:
1061                v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1062                break;
1063
1064        case SO_PEERSEC:
1065                return security_socket_getpeersec_stream(sock, optval, optlen, len);
1066
1067        case SO_MARK:
1068                v.val = sk->sk_mark;
1069                break;
1070
1071        case SO_RXQ_OVFL:
1072                v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1073                break;
1074
1075        case SO_WIFI_STATUS:
1076                v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1077                break;
1078
1079        case SO_PEEK_OFF:
1080                if (!sock->ops->set_peek_off)
1081                        return -EOPNOTSUPP;
1082
1083                v.val = sk->sk_peek_off;
1084                break;
1085        case SO_NOFCS:
1086                v.val = sock_flag(sk, SOCK_NOFCS);
1087                break;
1088        default:
1089                return -ENOPROTOOPT;
1090        }
1091
1092        if (len > lv)
1093                len = lv;
1094        if (copy_to_user(optval, &v, len))
1095                return -EFAULT;
1096lenout:
1097        if (put_user(len, optlen))
1098                return -EFAULT;
1099        return 0;
1100}
1101
1102/*
1103 * Initialize an sk_lock.
1104 *
1105 * (We also register the sk_lock with the lock validator.)
1106 */
1107static inline void sock_lock_init(struct sock *sk)
1108{
1109        sock_lock_init_class_and_name(sk,
1110                        af_family_slock_key_strings[sk->sk_family],
1111                        af_family_slock_keys + sk->sk_family,
1112                        af_family_key_strings[sk->sk_family],
1113                        af_family_keys + sk->sk_family);
1114}
1115
1116/*
1117 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1118 * even temporarly, because of RCU lookups. sk_node should also be left as is.
1119 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1120 */
1121static void sock_copy(struct sock *nsk, const struct sock *osk)
1122{
1123#ifdef CONFIG_SECURITY_NETWORK
1124        void *sptr = nsk->sk_security;
1125#endif
1126        memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1127
1128        memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1129               osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1130
1131#ifdef CONFIG_SECURITY_NETWORK
1132        nsk->sk_security = sptr;
1133        security_sk_clone(osk, nsk);
1134#endif
1135}
1136
1137/*
1138 * caches using SLAB_DESTROY_BY_RCU should let .next pointer from nulls nodes
1139 * un-modified. Special care is taken when initializing object to zero.
1140 */
1141static inline void sk_prot_clear_nulls(struct sock *sk, int size)
1142{
1143        if (offsetof(struct sock, sk_node.next) != 0)
1144                memset(sk, 0, offsetof(struct sock, sk_node.next));
1145        memset(&sk->sk_node.pprev, 0,
1146               size - offsetof(struct sock, sk_node.pprev));
1147}
1148
1149void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1150{
1151        unsigned long nulls1, nulls2;
1152
1153        nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1154        nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1155        if (nulls1 > nulls2)
1156                swap(nulls1, nulls2);
1157
1158        if (nulls1 != 0)
1159                memset((char *)sk, 0, nulls1);
1160        memset((char *)sk + nulls1 + sizeof(void *), 0,
1161               nulls2 - nulls1 - sizeof(void *));
1162        memset((char *)sk + nulls2 + sizeof(void *), 0,
1163               size - nulls2 - sizeof(void *));
1164}
1165EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1166
1167static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1168                int family)
1169{
1170        struct sock *sk;
1171        struct kmem_cache *slab;
1172
1173        slab = prot->slab;
1174        if (slab != NULL) {
1175                sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1176                if (!sk)
1177                        return sk;
1178                if (priority & __GFP_ZERO) {
1179                        if (prot->clear_sk)
1180                                prot->clear_sk(sk, prot->obj_size);
1181                        else
1182                                sk_prot_clear_nulls(sk, prot->obj_size);
1183                }
1184        } else
1185                sk = kmalloc(prot->obj_size, priority);
1186
1187        if (sk != NULL) {
1188                kmemcheck_annotate_bitfield(sk, flags);
1189
1190                if (security_sk_alloc(sk, family, priority))
1191                        goto out_free;
1192
1193                if (!try_module_get(prot->owner))
1194                        goto out_free_sec;
1195                sk_tx_queue_clear(sk);
1196        }
1197
1198        return sk;
1199
1200out_free_sec:
1201        security_sk_free(sk);
1202out_free:
1203        if (slab != NULL)
1204                kmem_cache_free(slab, sk);
1205        else
1206                kfree(sk);
1207        return NULL;
1208}
1209
1210static void sk_prot_free(struct proto *prot, struct sock *sk)
1211{
1212        struct kmem_cache *slab;
1213        struct module *owner;
1214
1215        owner = prot->owner;
1216        slab = prot->slab;
1217
1218        security_sk_free(sk);
1219        if (slab != NULL)
1220                kmem_cache_free(slab, sk);
1221        else
1222                kfree(sk);
1223        module_put(owner);
1224}
1225
1226#ifdef CONFIG_CGROUPS
1227void sock_update_classid(struct sock *sk)
1228{
1229        u32 classid;
1230
1231        rcu_read_lock();  /* doing current task, which cannot vanish. */
1232        classid = task_cls_classid(current);
1233        rcu_read_unlock();
1234        if (classid && classid != sk->sk_classid)
1235                sk->sk_classid = classid;
1236}
1237EXPORT_SYMBOL(sock_update_classid);
1238
1239void sock_update_netprioidx(struct sock *sk, struct task_struct *task)
1240{
1241        if (in_interrupt())
1242                return;
1243
1244        sk->sk_cgrp_prioidx = task_netprioidx(task);
1245}
1246EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1247#endif
1248
1249/**
1250 *      sk_alloc - All socket objects are allocated here
1251 *      @net: the applicable net namespace
1252 *      @family: protocol family
1253 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1254 *      @prot: struct proto associated with this new sock instance
1255 */
1256struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1257                      struct proto *prot)
1258{
1259        struct sock *sk;
1260
1261        sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1262        if (sk) {
1263                sk->sk_family = family;
1264                /*
1265                 * See comment in struct sock definition to understand
1266                 * why we need sk_prot_creator -acme
1267                 */
1268                sk->sk_prot = sk->sk_prot_creator = prot;
1269                sock_lock_init(sk);
1270                sock_net_set(sk, get_net(net));
1271                atomic_set(&sk->sk_wmem_alloc, 1);
1272
1273                sock_update_classid(sk);
1274                sock_update_netprioidx(sk, current);
1275        }
1276
1277        return sk;
1278}
1279EXPORT_SYMBOL(sk_alloc);
1280
1281static void __sk_free(struct sock *sk)
1282{
1283        struct sk_filter *filter;
1284
1285        if (sk->sk_destruct)
1286                sk->sk_destruct(sk);
1287
1288        filter = rcu_dereference_check(sk->sk_filter,
1289                                       atomic_read(&sk->sk_wmem_alloc) == 0);
1290        if (filter) {
1291                sk_filter_uncharge(sk, filter);
1292                RCU_INIT_POINTER(sk->sk_filter, NULL);
1293        }
1294
1295        sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1296
1297        if (atomic_read(&sk->sk_omem_alloc))
1298                pr_debug("%s: optmem leakage (%d bytes) detected\n",
1299                         __func__, atomic_read(&sk->sk_omem_alloc));
1300
1301        if (sk->sk_peer_cred)
1302                put_cred(sk->sk_peer_cred);
1303        put_pid(sk->sk_peer_pid);
1304        put_net(sock_net(sk));
1305        sk_prot_free(sk->sk_prot_creator, sk);
1306}
1307
1308void sk_free(struct sock *sk)
1309{
1310        /*
1311         * We subtract one from sk_wmem_alloc and can know if
1312         * some packets are still in some tx queue.
1313         * If not null, sock_wfree() will call __sk_free(sk) later
1314         */
1315        if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1316                __sk_free(sk);
1317}
1318EXPORT_SYMBOL(sk_free);
1319
1320/*
1321 * Last sock_put should drop reference to sk->sk_net. It has already
1322 * been dropped in sk_change_net. Taking reference to stopping namespace
1323 * is not an option.
1324 * Take reference to a socket to remove it from hash _alive_ and after that
1325 * destroy it in the context of init_net.
1326 */
1327void sk_release_kernel(struct sock *sk)
1328{
1329        if (sk == NULL || sk->sk_socket == NULL)
1330                return;
1331
1332        sock_hold(sk);
1333        sock_release(sk->sk_socket);
1334        release_net(sock_net(sk));
1335        sock_net_set(sk, get_net(&init_net));
1336        sock_put(sk);
1337}
1338EXPORT_SYMBOL(sk_release_kernel);
1339
1340static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1341{
1342        if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1343                sock_update_memcg(newsk);
1344}
1345
1346/**
1347 *      sk_clone_lock - clone a socket, and lock its clone
1348 *      @sk: the socket to clone
1349 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1350 *
1351 *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1352 */
1353struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1354{
1355        struct sock *newsk;
1356
1357        newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1358        if (newsk != NULL) {
1359                struct sk_filter *filter;
1360
1361                sock_copy(newsk, sk);
1362
1363                /* SANITY */
1364                get_net(sock_net(newsk));
1365                sk_node_init(&newsk->sk_node);
1366                sock_lock_init(newsk);
1367                bh_lock_sock(newsk);
1368                newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1369                newsk->sk_backlog.len = 0;
1370
1371                atomic_set(&newsk->sk_rmem_alloc, 0);
1372                /*
1373                 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1374                 */
1375                atomic_set(&newsk->sk_wmem_alloc, 1);
1376                atomic_set(&newsk->sk_omem_alloc, 0);
1377                skb_queue_head_init(&newsk->sk_receive_queue);
1378                skb_queue_head_init(&newsk->sk_write_queue);
1379#ifdef CONFIG_NET_DMA
1380                skb_queue_head_init(&newsk->sk_async_wait_queue);
1381#endif
1382
1383                spin_lock_init(&newsk->sk_dst_lock);
1384                rwlock_init(&newsk->sk_callback_lock);
1385                lockdep_set_class_and_name(&newsk->sk_callback_lock,
1386                                af_callback_keys + newsk->sk_family,
1387                                af_family_clock_key_strings[newsk->sk_family]);
1388
1389                newsk->sk_dst_cache     = NULL;
1390                newsk->sk_wmem_queued   = 0;
1391                newsk->sk_forward_alloc = 0;
1392                newsk->sk_send_head     = NULL;
1393                newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1394
1395                sock_reset_flag(newsk, SOCK_DONE);
1396                skb_queue_head_init(&newsk->sk_error_queue);
1397
1398                filter = rcu_dereference_protected(newsk->sk_filter, 1);
1399                if (filter != NULL)
1400                        sk_filter_charge(newsk, filter);
1401
1402                if (unlikely(xfrm_sk_clone_policy(newsk))) {
1403                        /* It is still raw copy of parent, so invalidate
1404                         * destructor and make plain sk_free() */
1405                        newsk->sk_destruct = NULL;
1406                        bh_unlock_sock(newsk);
1407                        sk_free(newsk);
1408                        newsk = NULL;
1409                        goto out;
1410                }
1411
1412                newsk->sk_err      = 0;
1413                newsk->sk_priority = 0;
1414                /*
1415                 * Before updating sk_refcnt, we must commit prior changes to memory
1416                 * (Documentation/RCU/rculist_nulls.txt for details)
1417                 */
1418                smp_wmb();
1419                atomic_set(&newsk->sk_refcnt, 2);
1420
1421                /*
1422                 * Increment the counter in the same struct proto as the master
1423                 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1424                 * is the same as sk->sk_prot->socks, as this field was copied
1425                 * with memcpy).
1426                 *
1427                 * This _changes_ the previous behaviour, where
1428                 * tcp_create_openreq_child always was incrementing the
1429                 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1430                 * to be taken into account in all callers. -acme
1431                 */
1432                sk_refcnt_debug_inc(newsk);
1433                sk_set_socket(newsk, NULL);
1434                newsk->sk_wq = NULL;
1435
1436                sk_update_clone(sk, newsk);
1437
1438                if (newsk->sk_prot->sockets_allocated)
1439                        sk_sockets_allocated_inc(newsk);
1440
1441                if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1442                        net_enable_timestamp();
1443        }
1444out:
1445        return newsk;
1446}
1447EXPORT_SYMBOL_GPL(sk_clone_lock);
1448
1449void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1450{
1451        __sk_dst_set(sk, dst);
1452        sk->sk_route_caps = dst->dev->features;
1453        if (sk->sk_route_caps & NETIF_F_GSO)
1454                sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1455        sk->sk_route_caps &= ~sk->sk_route_nocaps;
1456        if (sk_can_gso(sk)) {
1457                if (dst->header_len) {
1458                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1459                } else {
1460                        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1461                        sk->sk_gso_max_size = dst->dev->gso_max_size;
1462                        sk->sk_gso_max_segs = dst->dev->gso_max_segs;
1463                }
1464        }
1465}
1466EXPORT_SYMBOL_GPL(sk_setup_caps);
1467
1468void __init sk_init(void)
1469{
1470        if (totalram_pages <= 4096) {
1471                sysctl_wmem_max = 32767;
1472                sysctl_rmem_max = 32767;
1473                sysctl_wmem_default = 32767;
1474                sysctl_rmem_default = 32767;
1475        } else if (totalram_pages >= 131072) {
1476                sysctl_wmem_max = 131071;
1477                sysctl_rmem_max = 131071;
1478        }
1479}
1480
1481/*
1482 *      Simple resource managers for sockets.
1483 */
1484
1485
1486/*
1487 * Write buffer destructor automatically called from kfree_skb.
1488 */
1489void sock_wfree(struct sk_buff *skb)
1490{
1491        struct sock *sk = skb->sk;
1492        unsigned int len = skb->truesize;
1493
1494        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1495                /*
1496                 * Keep a reference on sk_wmem_alloc, this will be released
1497                 * after sk_write_space() call
1498                 */
1499                atomic_sub(len - 1, &sk->sk_wmem_alloc);
1500                sk->sk_write_space(sk);
1501                len = 1;
1502        }
1503        /*
1504         * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1505         * could not do because of in-flight packets
1506         */
1507        if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1508                __sk_free(sk);
1509}
1510EXPORT_SYMBOL(sock_wfree);
1511
1512/*
1513 * Read buffer destructor automatically called from kfree_skb.
1514 */
1515void sock_rfree(struct sk_buff *skb)
1516{
1517        struct sock *sk = skb->sk;
1518        unsigned int len = skb->truesize;
1519
1520        atomic_sub(len, &sk->sk_rmem_alloc);
1521        sk_mem_uncharge(sk, len);
1522}
1523EXPORT_SYMBOL(sock_rfree);
1524
1525void sock_edemux(struct sk_buff *skb)
1526{
1527        struct sock *sk = skb->sk;
1528
1529#ifdef CONFIG_INET
1530        if (sk->sk_state == TCP_TIME_WAIT)
1531                inet_twsk_put(inet_twsk(sk));
1532        else
1533#endif
1534                sock_put(sk);
1535}
1536EXPORT_SYMBOL(sock_edemux);
1537
1538int sock_i_uid(struct sock *sk)
1539{
1540        int uid;
1541
1542        read_lock_bh(&sk->sk_callback_lock);
1543        uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1544        read_unlock_bh(&sk->sk_callback_lock);
1545        return uid;
1546}
1547EXPORT_SYMBOL(sock_i_uid);
1548
1549unsigned long sock_i_ino(struct sock *sk)
1550{
1551        unsigned long ino;
1552
1553        read_lock_bh(&sk->sk_callback_lock);
1554        ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1555        read_unlock_bh(&sk->sk_callback_lock);
1556        return ino;
1557}
1558EXPORT_SYMBOL(sock_i_ino);
1559
1560/*
1561 * Allocate a skb from the socket's send buffer.
1562 */
1563struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1564                             gfp_t priority)
1565{
1566        if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1567                struct sk_buff *skb = alloc_skb(size, priority);
1568                if (skb) {
1569                        skb_set_owner_w(skb, sk);
1570                        return skb;
1571                }
1572        }
1573        return NULL;
1574}
1575EXPORT_SYMBOL(sock_wmalloc);
1576
1577/*
1578 * Allocate a skb from the socket's receive buffer.
1579 */
1580struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1581                             gfp_t priority)
1582{
1583        if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1584                struct sk_buff *skb = alloc_skb(size, priority);
1585                if (skb) {
1586                        skb_set_owner_r(skb, sk);
1587                        return skb;
1588                }
1589        }
1590        return NULL;
1591}
1592
1593/*
1594 * Allocate a memory block from the socket's option memory buffer.
1595 */
1596void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1597{
1598        if ((unsigned int)size <= sysctl_optmem_max &&
1599            atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1600                void *mem;
1601                /* First do the add, to avoid the race if kmalloc
1602                 * might sleep.
1603                 */
1604                atomic_add(size, &sk->sk_omem_alloc);
1605                mem = kmalloc(size, priority);
1606                if (mem)
1607                        return mem;
1608                atomic_sub(size, &sk->sk_omem_alloc);
1609        }
1610        return NULL;
1611}
1612EXPORT_SYMBOL(sock_kmalloc);
1613
1614/*
1615 * Free an option memory block.
1616 */
1617void sock_kfree_s(struct sock *sk, void *mem, int size)
1618{
1619        kfree(mem);
1620        atomic_sub(size, &sk->sk_omem_alloc);
1621}
1622EXPORT_SYMBOL(sock_kfree_s);
1623
1624/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1625   I think, these locks should be removed for datagram sockets.
1626 */
1627static long sock_wait_for_wmem(struct sock *sk, long timeo)
1628{
1629        DEFINE_WAIT(wait);
1630
1631        clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1632        for (;;) {
1633                if (!timeo)
1634                        break;
1635                if (signal_pending(current))
1636                        break;
1637                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1638                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1639                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1640                        break;
1641                if (sk->sk_shutdown & SEND_SHUTDOWN)
1642                        break;
1643                if (sk->sk_err)
1644                        break;
1645                timeo = schedule_timeout(timeo);
1646        }
1647        finish_wait(sk_sleep(sk), &wait);
1648        return timeo;
1649}
1650
1651
1652/*
1653 *      Generic send/receive buffer handlers
1654 */
1655
1656struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1657                                     unsigned long data_len, int noblock,
1658                                     int *errcode)
1659{
1660        struct sk_buff *skb;
1661        gfp_t gfp_mask;
1662        long timeo;
1663        int err;
1664        int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1665
1666        err = -EMSGSIZE;
1667        if (npages > MAX_SKB_FRAGS)
1668                goto failure;
1669
1670        gfp_mask = sk->sk_allocation;
1671        if (gfp_mask & __GFP_WAIT)
1672                gfp_mask |= __GFP_REPEAT;
1673
1674        timeo = sock_sndtimeo(sk, noblock);
1675        while (1) {
1676                err = sock_error(sk);
1677                if (err != 0)
1678                        goto failure;
1679
1680                err = -EPIPE;
1681                if (sk->sk_shutdown & SEND_SHUTDOWN)
1682                        goto failure;
1683
1684                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1685                        skb = alloc_skb(header_len, gfp_mask);
1686                        if (skb) {
1687                                int i;
1688
1689                                /* No pages, we're done... */
1690                                if (!data_len)
1691                                        break;
1692
1693                                skb->truesize += data_len;
1694                                skb_shinfo(skb)->nr_frags = npages;
1695                                for (i = 0; i < npages; i++) {
1696                                        struct page *page;
1697
1698                                        page = alloc_pages(sk->sk_allocation, 0);
1699                                        if (!page) {
1700                                                err = -ENOBUFS;
1701                                                skb_shinfo(skb)->nr_frags = i;
1702                                                kfree_skb(skb);
1703                                                goto failure;
1704                                        }
1705
1706                                        __skb_fill_page_desc(skb, i,
1707                                                        page, 0,
1708                                                        (data_len >= PAGE_SIZE ?
1709                                                         PAGE_SIZE :
1710                                                         data_len));
1711                                        data_len -= PAGE_SIZE;
1712                                }
1713
1714                                /* Full success... */
1715                                break;
1716                        }
1717                        err = -ENOBUFS;
1718                        goto failure;
1719                }
1720                set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1721                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1722                err = -EAGAIN;
1723                if (!timeo)
1724                        goto failure;
1725                if (signal_pending(current))
1726                        goto interrupted;
1727                timeo = sock_wait_for_wmem(sk, timeo);
1728        }
1729
1730        skb_set_owner_w(skb, sk);
1731        return skb;
1732
1733interrupted:
1734        err = sock_intr_errno(timeo);
1735failure:
1736        *errcode = err;
1737        return NULL;
1738}
1739EXPORT_SYMBOL(sock_alloc_send_pskb);
1740
1741struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1742                                    int noblock, int *errcode)
1743{
1744        return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1745}
1746EXPORT_SYMBOL(sock_alloc_send_skb);
1747
1748static void __lock_sock(struct sock *sk)
1749        __releases(&sk->sk_lock.slock)
1750        __acquires(&sk->sk_lock.slock)
1751{
1752        DEFINE_WAIT(wait);
1753
1754        for (;;) {
1755                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1756                                        TASK_UNINTERRUPTIBLE);
1757                spin_unlock_bh(&sk->sk_lock.slock);
1758                schedule();
1759                spin_lock_bh(&sk->sk_lock.slock);
1760                if (!sock_owned_by_user(sk))
1761                        break;
1762        }
1763        finish_wait(&sk->sk_lock.wq, &wait);
1764}
1765
1766static void __release_sock(struct sock *sk)
1767        __releases(&sk->sk_lock.slock)
1768        __acquires(&sk->sk_lock.slock)
1769{
1770        struct sk_buff *skb = sk->sk_backlog.head;
1771
1772        do {
1773                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1774                bh_unlock_sock(sk);
1775
1776                do {
1777                        struct sk_buff *next = skb->next;
1778
1779                        prefetch(next);
1780                        WARN_ON_ONCE(skb_dst_is_noref(skb));
1781                        skb->next = NULL;
1782                        sk_backlog_rcv(sk, skb);
1783
1784                        /*
1785                         * We are in process context here with softirqs
1786                         * disabled, use cond_resched_softirq() to preempt.
1787                         * This is safe to do because we've taken the backlog
1788                         * queue private:
1789                         */
1790                        cond_resched_softirq();
1791
1792                        skb = next;
1793                } while (skb != NULL);
1794
1795                bh_lock_sock(sk);
1796        } while ((skb = sk->sk_backlog.head) != NULL);
1797
1798        /*
1799         * Doing the zeroing here guarantee we can not loop forever
1800         * while a wild producer attempts to flood us.
1801         */
1802        sk->sk_backlog.len = 0;
1803}
1804
1805/**
1806 * sk_wait_data - wait for data to arrive at sk_receive_queue
1807 * @sk:    sock to wait on
1808 * @timeo: for how long
1809 *
1810 * Now socket state including sk->sk_err is changed only under lock,
1811 * hence we may omit checks after joining wait queue.
1812 * We check receive queue before schedule() only as optimization;
1813 * it is very likely that release_sock() added new data.
1814 */
1815int sk_wait_data(struct sock *sk, long *timeo)
1816{
1817        int rc;
1818        DEFINE_WAIT(wait);
1819
1820        prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1821        set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1822        rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1823        clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1824        finish_wait(sk_sleep(sk), &wait);
1825        return rc;
1826}
1827EXPORT_SYMBOL(sk_wait_data);
1828
1829/**
1830 *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1831 *      @sk: socket
1832 *      @size: memory size to allocate
1833 *      @kind: allocation type
1834 *
1835 *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1836 *      rmem allocation. This function assumes that protocols which have
1837 *      memory_pressure use sk_wmem_queued as write buffer accounting.
1838 */
1839int __sk_mem_schedule(struct sock *sk, int size, int kind)
1840{
1841        struct proto *prot = sk->sk_prot;
1842        int amt = sk_mem_pages(size);
1843        long allocated;
1844        int parent_status = UNDER_LIMIT;
1845
1846        sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1847
1848        allocated = sk_memory_allocated_add(sk, amt, &parent_status);
1849
1850        /* Under limit. */
1851        if (parent_status == UNDER_LIMIT &&
1852                        allocated <= sk_prot_mem_limits(sk, 0)) {
1853                sk_leave_memory_pressure(sk);
1854                return 1;
1855        }
1856
1857        /* Under pressure. (we or our parents) */
1858        if ((parent_status > SOFT_LIMIT) ||
1859                        allocated > sk_prot_mem_limits(sk, 1))
1860                sk_enter_memory_pressure(sk);
1861
1862        /* Over hard limit (we or our parents) */
1863        if ((parent_status == OVER_LIMIT) ||
1864                        (allocated > sk_prot_mem_limits(sk, 2)))
1865                goto suppress_allocation;
1866
1867        /* guarantee minimum buffer size under pressure */
1868        if (kind == SK_MEM_RECV) {
1869                if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1870                        return 1;
1871
1872        } else { /* SK_MEM_SEND */
1873                if (sk->sk_type == SOCK_STREAM) {
1874                        if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1875                                return 1;
1876                } else if (atomic_read(&sk->sk_wmem_alloc) <
1877                           prot->sysctl_wmem[0])
1878                                return 1;
1879        }
1880
1881        if (sk_has_memory_pressure(sk)) {
1882                int alloc;
1883
1884                if (!sk_under_memory_pressure(sk))
1885                        return 1;
1886                alloc = sk_sockets_allocated_read_positive(sk);
1887                if (sk_prot_mem_limits(sk, 2) > alloc *
1888                    sk_mem_pages(sk->sk_wmem_queued +
1889                                 atomic_read(&sk->sk_rmem_alloc) +
1890                                 sk->sk_forward_alloc))
1891                        return 1;
1892        }
1893
1894suppress_allocation:
1895
1896        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1897                sk_stream_moderate_sndbuf(sk);
1898
1899                /* Fail only if socket is _under_ its sndbuf.
1900                 * In this case we cannot block, so that we have to fail.
1901                 */
1902                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1903                        return 1;
1904        }
1905
1906        trace_sock_exceed_buf_limit(sk, prot, allocated);
1907
1908        /* Alas. Undo changes. */
1909        sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1910
1911        sk_memory_allocated_sub(sk, amt);
1912
1913        return 0;
1914}
1915EXPORT_SYMBOL(__sk_mem_schedule);
1916
1917/**
1918 *      __sk_reclaim - reclaim memory_allocated
1919 *      @sk: socket
1920 */
1921void __sk_mem_reclaim(struct sock *sk)
1922{
1923        sk_memory_allocated_sub(sk,
1924                                sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
1925        sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1926
1927        if (sk_under_memory_pressure(sk) &&
1928            (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
1929                sk_leave_memory_pressure(sk);
1930}
1931EXPORT_SYMBOL(__sk_mem_reclaim);
1932
1933
1934/*
1935 * Set of default routines for initialising struct proto_ops when
1936 * the protocol does not support a particular function. In certain
1937 * cases where it makes no sense for a protocol to have a "do nothing"
1938 * function, some default processing is provided.
1939 */
1940
1941int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1942{
1943        return -EOPNOTSUPP;
1944}
1945EXPORT_SYMBOL(sock_no_bind);
1946
1947int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1948                    int len, int flags)
1949{
1950        return -EOPNOTSUPP;
1951}
1952EXPORT_SYMBOL(sock_no_connect);
1953
1954int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1955{
1956        return -EOPNOTSUPP;
1957}
1958EXPORT_SYMBOL(sock_no_socketpair);
1959
1960int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1961{
1962        return -EOPNOTSUPP;
1963}
1964EXPORT_SYMBOL(sock_no_accept);
1965
1966int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1967                    int *len, int peer)
1968{
1969        return -EOPNOTSUPP;
1970}
1971EXPORT_SYMBOL(sock_no_getname);
1972
1973unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1974{
1975        return 0;
1976}
1977EXPORT_SYMBOL(sock_no_poll);
1978
1979int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1980{
1981        return -EOPNOTSUPP;
1982}
1983EXPORT_SYMBOL(sock_no_ioctl);
1984
1985int sock_no_listen(struct socket *sock, int backlog)
1986{
1987        return -EOPNOTSUPP;
1988}
1989EXPORT_SYMBOL(sock_no_listen);
1990
1991int sock_no_shutdown(struct socket *sock, int how)
1992{
1993        return -EOPNOTSUPP;
1994}
1995EXPORT_SYMBOL(sock_no_shutdown);
1996
1997int sock_no_setsockopt(struct socket *sock, int level, int optname,
1998                    char __user *optval, unsigned int optlen)
1999{
2000        return -EOPNOTSUPP;
2001}
2002EXPORT_SYMBOL(sock_no_setsockopt);
2003
2004int sock_no_getsockopt(struct socket *sock, int level, int optname,
2005                    char __user *optval, int __user *optlen)
2006{
2007        return -EOPNOTSUPP;
2008}
2009EXPORT_SYMBOL(sock_no_getsockopt);
2010
2011int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2012                    size_t len)
2013{
2014        return -EOPNOTSUPP;
2015}
2016EXPORT_SYMBOL(sock_no_sendmsg);
2017
2018int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2019                    size_t len, int flags)
2020{
2021        return -EOPNOTSUPP;
2022}
2023EXPORT_SYMBOL(sock_no_recvmsg);
2024
2025int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2026{
2027        /* Mirror missing mmap method error code */
2028        return -ENODEV;
2029}
2030EXPORT_SYMBOL(sock_no_mmap);
2031
2032ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2033{
2034        ssize_t res;
2035        struct msghdr msg = {.msg_flags = flags};
2036        struct kvec iov;
2037        char *kaddr = kmap(page);
2038        iov.iov_base = kaddr + offset;
2039        iov.iov_len = size;
2040        res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2041        kunmap(page);
2042        return res;
2043}
2044EXPORT_SYMBOL(sock_no_sendpage);
2045
2046/*
2047 *      Default Socket Callbacks
2048 */
2049
2050static void sock_def_wakeup(struct sock *sk)
2051{
2052        struct socket_wq *wq;
2053
2054        rcu_read_lock();
2055        wq = rcu_dereference(sk->sk_wq);
2056        if (wq_has_sleeper(wq))
2057                wake_up_interruptible_all(&wq->wait);
2058        rcu_read_unlock();
2059}
2060
2061static void sock_def_error_report(struct sock *sk)
2062{
2063        struct socket_wq *wq;
2064
2065        rcu_read_lock();
2066        wq = rcu_dereference(sk->sk_wq);
2067        if (wq_has_sleeper(wq))
2068                wake_up_interruptible_poll(&wq->wait, POLLERR);
2069        sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2070        rcu_read_unlock();
2071}
2072
2073static void sock_def_readable(struct sock *sk, int len)
2074{
2075        struct socket_wq *wq;
2076
2077        rcu_read_lock();
2078        wq = rcu_dereference(sk->sk_wq);
2079        if (wq_has_sleeper(wq))
2080                wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2081                                                POLLRDNORM | POLLRDBAND);
2082        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2083        rcu_read_unlock();
2084}
2085
2086static void sock_def_write_space(struct sock *sk)
2087{
2088        struct socket_wq *wq;
2089
2090        rcu_read_lock();
2091
2092        /* Do not wake up a writer until he can make "significant"
2093         * progress.  --DaveM
2094         */
2095        if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2096                wq = rcu_dereference(sk->sk_wq);
2097                if (wq_has_sleeper(wq))
2098                        wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2099                                                POLLWRNORM | POLLWRBAND);
2100
2101                /* Should agree with poll, otherwise some programs break */
2102                if (sock_writeable(sk))
2103                        sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2104        }
2105
2106        rcu_read_unlock();
2107}
2108
2109static void sock_def_destruct(struct sock *sk)
2110{
2111        kfree(sk->sk_protinfo);
2112}
2113
2114void sk_send_sigurg(struct sock *sk)
2115{
2116        if (sk->sk_socket && sk->sk_socket->file)
2117                if (send_sigurg(&sk->sk_socket->file->f_owner))
2118                        sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2119}
2120EXPORT_SYMBOL(sk_send_sigurg);
2121
2122void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2123                    unsigned long expires)
2124{
2125        if (!mod_timer(timer, expires))
2126                sock_hold(sk);
2127}
2128EXPORT_SYMBOL(sk_reset_timer);
2129
2130void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2131{
2132        if (timer_pending(timer) && del_timer(timer))
2133                __sock_put(sk);
2134}
2135EXPORT_SYMBOL(sk_stop_timer);
2136
2137void sock_init_data(struct socket *sock, struct sock *sk)
2138{
2139        skb_queue_head_init(&sk->sk_receive_queue);
2140        skb_queue_head_init(&sk->sk_write_queue);
2141        skb_queue_head_init(&sk->sk_error_queue);
2142#ifdef CONFIG_NET_DMA
2143        skb_queue_head_init(&sk->sk_async_wait_queue);
2144#endif
2145
2146        sk->sk_send_head        =       NULL;
2147
2148        init_timer(&sk->sk_timer);
2149
2150        sk->sk_allocation       =       GFP_KERNEL;
2151        sk->sk_rcvbuf           =       sysctl_rmem_default;
2152        sk->sk_sndbuf           =       sysctl_wmem_default;
2153        sk->sk_state            =       TCP_CLOSE;
2154        sk_set_socket(sk, sock);
2155
2156        sock_set_flag(sk, SOCK_ZAPPED);
2157
2158        if (sock) {
2159                sk->sk_type     =       sock->type;
2160                sk->sk_wq       =       sock->wq;
2161                sock->sk        =       sk;
2162        } else
2163                sk->sk_wq       =       NULL;
2164
2165        spin_lock_init(&sk->sk_dst_lock);
2166        rwlock_init(&sk->sk_callback_lock);
2167        lockdep_set_class_and_name(&sk->sk_callback_lock,
2168                        af_callback_keys + sk->sk_family,
2169                        af_family_clock_key_strings[sk->sk_family]);
2170
2171        sk->sk_state_change     =       sock_def_wakeup;
2172        sk->sk_data_ready       =       sock_def_readable;
2173        sk->sk_write_space      =       sock_def_write_space;
2174        sk->sk_error_report     =       sock_def_error_report;
2175        sk->sk_destruct         =       sock_def_destruct;
2176
2177        sk->sk_sndmsg_page      =       NULL;
2178        sk->sk_sndmsg_off       =       0;
2179        sk->sk_peek_off         =       -1;
2180
2181        sk->sk_peer_pid         =       NULL;
2182        sk->sk_peer_cred        =       NULL;
2183        sk->sk_write_pending    =       0;
2184        sk->sk_rcvlowat         =       1;
2185        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2186        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2187
2188        sk->sk_stamp = ktime_set(-1L, 0);
2189
2190        /*
2191         * Before updating sk_refcnt, we must commit prior changes to memory
2192         * (Documentation/RCU/rculist_nulls.txt for details)
2193         */
2194        smp_wmb();
2195        atomic_set(&sk->sk_refcnt, 1);
2196        atomic_set(&sk->sk_drops, 0);
2197}
2198EXPORT_SYMBOL(sock_init_data);
2199
2200void lock_sock_nested(struct sock *sk, int subclass)
2201{
2202        might_sleep();
2203        spin_lock_bh(&sk->sk_lock.slock);
2204        if (sk->sk_lock.owned)
2205                __lock_sock(sk);
2206        sk->sk_lock.owned = 1;
2207        spin_unlock(&sk->sk_lock.slock);
2208        /*
2209         * The sk_lock has mutex_lock() semantics here:
2210         */
2211        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2212        local_bh_enable();
2213}
2214EXPORT_SYMBOL(lock_sock_nested);
2215
2216void release_sock(struct sock *sk)
2217{
2218        /*
2219         * The sk_lock has mutex_unlock() semantics:
2220         */
2221        mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2222
2223        spin_lock_bh(&sk->sk_lock.slock);
2224        if (sk->sk_backlog.tail)
2225                __release_sock(sk);
2226
2227        if (sk->sk_prot->release_cb)
2228                sk->sk_prot->release_cb(sk);
2229
2230        sk->sk_lock.owned = 0;
2231        if (waitqueue_active(&sk->sk_lock.wq))
2232                wake_up(&sk->sk_lock.wq);
2233        spin_unlock_bh(&sk->sk_lock.slock);
2234}
2235EXPORT_SYMBOL(release_sock);
2236
2237/**
2238 * lock_sock_fast - fast version of lock_sock
2239 * @sk: socket
2240 *
2241 * This version should be used for very small section, where process wont block
2242 * return false if fast path is taken
2243 *   sk_lock.slock locked, owned = 0, BH disabled
2244 * return true if slow path is taken
2245 *   sk_lock.slock unlocked, owned = 1, BH enabled
2246 */
2247bool lock_sock_fast(struct sock *sk)
2248{
2249        might_sleep();
2250        spin_lock_bh(&sk->sk_lock.slock);
2251
2252        if (!sk->sk_lock.owned)
2253                /*
2254                 * Note : We must disable BH
2255                 */
2256                return false;
2257
2258        __lock_sock(sk);
2259        sk->sk_lock.owned = 1;
2260        spin_unlock(&sk->sk_lock.slock);
2261        /*
2262         * The sk_lock has mutex_lock() semantics here:
2263         */
2264        mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2265        local_bh_enable();
2266        return true;
2267}
2268EXPORT_SYMBOL(lock_sock_fast);
2269
2270int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2271{
2272        struct timeval tv;
2273        if (!sock_flag(sk, SOCK_TIMESTAMP))
2274                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2275        tv = ktime_to_timeval(sk->sk_stamp);
2276        if (tv.tv_sec == -1)
2277                return -ENOENT;
2278        if (tv.tv_sec == 0) {
2279                sk->sk_stamp = ktime_get_real();
2280                tv = ktime_to_timeval(sk->sk_stamp);
2281        }
2282        return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2283}
2284EXPORT_SYMBOL(sock_get_timestamp);
2285
2286int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2287{
2288        struct timespec ts;
2289        if (!sock_flag(sk, SOCK_TIMESTAMP))
2290                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2291        ts = ktime_to_timespec(sk->sk_stamp);
2292        if (ts.tv_sec == -1)
2293                return -ENOENT;
2294        if (ts.tv_sec == 0) {
2295                sk->sk_stamp = ktime_get_real();
2296                ts = ktime_to_timespec(sk->sk_stamp);
2297        }
2298        return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2299}
2300EXPORT_SYMBOL(sock_get_timestampns);
2301
2302void sock_enable_timestamp(struct sock *sk, int flag)
2303{
2304        if (!sock_flag(sk, flag)) {
2305                unsigned long previous_flags = sk->sk_flags;
2306
2307                sock_set_flag(sk, flag);
2308                /*
2309                 * we just set one of the two flags which require net
2310                 * time stamping, but time stamping might have been on
2311                 * already because of the other one
2312                 */
2313                if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2314                        net_enable_timestamp();
2315        }
2316}
2317
2318/*
2319 *      Get a socket option on an socket.
2320 *
2321 *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2322 *      asynchronous errors should be reported by getsockopt. We assume
2323 *      this means if you specify SO_ERROR (otherwise whats the point of it).
2324 */
2325int sock_common_getsockopt(struct socket *sock, int level, int optname,
2326                           char __user *optval, int __user *optlen)
2327{
2328        struct sock *sk = sock->sk;
2329
2330        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2331}
2332EXPORT_SYMBOL(sock_common_getsockopt);
2333
2334#ifdef CONFIG_COMPAT
2335int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2336                                  char __user *optval, int __user *optlen)
2337{
2338        struct sock *sk = sock->sk;
2339
2340        if (sk->sk_prot->compat_getsockopt != NULL)
2341                return sk->sk_prot->compat_getsockopt(sk, level, optname,
2342                                                      optval, optlen);
2343        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2344}
2345EXPORT_SYMBOL(compat_sock_common_getsockopt);
2346#endif
2347
2348int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2349                        struct msghdr *msg, size_t size, int flags)
2350{
2351        struct sock *sk = sock->sk;
2352        int addr_len = 0;
2353        int err;
2354
2355        err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2356                                   flags & ~MSG_DONTWAIT, &addr_len);
2357        if (err >= 0)
2358                msg->msg_namelen = addr_len;
2359        return err;
2360}
2361EXPORT_SYMBOL(sock_common_recvmsg);
2362
2363/*
2364 *      Set socket options on an inet socket.
2365 */
2366int sock_common_setsockopt(struct socket *sock, int level, int optname,
2367                           char __user *optval, unsigned int optlen)
2368{
2369        struct sock *sk = sock->sk;
2370
2371        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2372}
2373EXPORT_SYMBOL(sock_common_setsockopt);
2374
2375#ifdef CONFIG_COMPAT
2376int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2377                                  char __user *optval, unsigned int optlen)
2378{
2379        struct sock *sk = sock->sk;
2380
2381        if (sk->sk_prot->compat_setsockopt != NULL)
2382                return sk->sk_prot->compat_setsockopt(sk, level, optname,
2383                                                      optval, optlen);
2384        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2385}
2386EXPORT_SYMBOL(compat_sock_common_setsockopt);
2387#endif
2388
2389void sk_common_release(struct sock *sk)
2390{
2391        if (sk->sk_prot->destroy)
2392                sk->sk_prot->destroy(sk);
2393
2394        /*
2395         * Observation: when sock_common_release is called, processes have
2396         * no access to socket. But net still has.
2397         * Step one, detach it from networking:
2398         *
2399         * A. Remove from hash tables.
2400         */
2401
2402        sk->sk_prot->unhash(sk);
2403
2404        /*
2405         * In this point socket cannot receive new packets, but it is possible
2406         * that some packets are in flight because some CPU runs receiver and
2407         * did hash table lookup before we unhashed socket. They will achieve
2408         * receive queue and will be purged by socket destructor.
2409         *
2410         * Also we still have packets pending on receive queue and probably,
2411         * our own packets waiting in device queues. sock_destroy will drain
2412         * receive queue, but transmitted packets will delay socket destruction
2413         * until the last reference will be released.
2414         */
2415
2416        sock_orphan(sk);
2417
2418        xfrm_sk_free_policy(sk);
2419
2420        sk_refcnt_debug_release(sk);
2421        sock_put(sk);
2422}
2423EXPORT_SYMBOL(sk_common_release);
2424
2425#ifdef CONFIG_PROC_FS
2426#define PROTO_INUSE_NR  64      /* should be enough for the first time */
2427struct prot_inuse {
2428        int val[PROTO_INUSE_NR];
2429};
2430
2431static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2432
2433#ifdef CONFIG_NET_NS
2434void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2435{
2436        __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2437}
2438EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2439
2440int sock_prot_inuse_get(struct net *net, struct proto *prot)
2441{
2442        int cpu, idx = prot->inuse_idx;
2443        int res = 0;
2444
2445        for_each_possible_cpu(cpu)
2446                res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2447
2448        return res >= 0 ? res : 0;
2449}
2450EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2451
2452static int __net_init sock_inuse_init_net(struct net *net)
2453{
2454        net->core.inuse = alloc_percpu(struct prot_inuse);
2455        return net->core.inuse ? 0 : -ENOMEM;
2456}
2457
2458static void __net_exit sock_inuse_exit_net(struct net *net)
2459{
2460        free_percpu(net->core.inuse);
2461}
2462
2463static struct pernet_operations net_inuse_ops = {
2464        .init = sock_inuse_init_net,
2465        .exit = sock_inuse_exit_net,
2466};
2467
2468static __init int net_inuse_init(void)
2469{
2470        if (register_pernet_subsys(&net_inuse_ops))
2471                panic("Cannot initialize net inuse counters");
2472
2473        return 0;
2474}
2475
2476core_initcall(net_inuse_init);
2477#else
2478static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2479
2480void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2481{
2482        __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2483}
2484EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2485
2486int sock_prot_inuse_get(struct net *net, struct proto *prot)
2487{
2488        int cpu, idx = prot->inuse_idx;
2489        int res = 0;
2490
2491        for_each_possible_cpu(cpu)
2492                res += per_cpu(prot_inuse, cpu).val[idx];
2493
2494        return res >= 0 ? res : 0;
2495}
2496EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2497#endif
2498
2499static void assign_proto_idx(struct proto *prot)
2500{
2501        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2502
2503        if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2504                pr_err("PROTO_INUSE_NR exhausted\n");
2505                return;
2506        }
2507
2508        set_bit(prot->inuse_idx, proto_inuse_idx);
2509}
2510
2511static void release_proto_idx(struct proto *prot)
2512{
2513        if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2514                clear_bit(prot->inuse_idx, proto_inuse_idx);
2515}
2516#else
2517static inline void assign_proto_idx(struct proto *prot)
2518{
2519}
2520
2521static inline void release_proto_idx(struct proto *prot)
2522{
2523}
2524#endif
2525
2526int proto_register(struct proto *prot, int alloc_slab)
2527{
2528        if (alloc_slab) {
2529                prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2530                                        SLAB_HWCACHE_ALIGN | prot->slab_flags,
2531                                        NULL);
2532
2533                if (prot->slab == NULL) {
2534                        pr_crit("%s: Can't create sock SLAB cache!\n",
2535                                prot->name);
2536                        goto out;
2537                }
2538
2539                if (prot->rsk_prot != NULL) {
2540                        prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2541                        if (prot->rsk_prot->slab_name == NULL)
2542                                goto out_free_sock_slab;
2543
2544                        prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2545                                                                 prot->rsk_prot->obj_size, 0,
2546                                                                 SLAB_HWCACHE_ALIGN, NULL);
2547
2548                        if (prot->rsk_prot->slab == NULL) {
2549                                pr_crit("%s: Can't create request sock SLAB cache!\n",
2550                                        prot->name);
2551                                goto out_free_request_sock_slab_name;
2552                        }
2553                }
2554
2555                if (prot->twsk_prot != NULL) {
2556                        prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2557
2558                        if (prot->twsk_prot->twsk_slab_name == NULL)
2559                                goto out_free_request_sock_slab;
2560
2561                        prot->twsk_prot->twsk_slab =
2562                                kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2563                                                  prot->twsk_prot->twsk_obj_size,
2564                                                  0,
2565                                                  SLAB_HWCACHE_ALIGN |
2566                                                        prot->slab_flags,
2567                                                  NULL);
2568                        if (prot->twsk_prot->twsk_slab == NULL)
2569                                goto out_free_timewait_sock_slab_name;
2570                }
2571        }
2572
2573        mutex_lock(&proto_list_mutex);
2574        list_add(&prot->node, &proto_list);
2575        assign_proto_idx(prot);
2576        mutex_unlock(&proto_list_mutex);
2577        return 0;
2578
2579out_free_timewait_sock_slab_name:
2580        kfree(prot->twsk_prot->twsk_slab_name);
2581out_free_request_sock_slab:
2582        if (prot->rsk_prot && prot->rsk_prot->slab) {
2583                kmem_cache_destroy(prot->rsk_prot->slab);
2584                prot->rsk_prot->slab = NULL;
2585        }
2586out_free_request_sock_slab_name:
2587        if (prot->rsk_prot)
2588                kfree(prot->rsk_prot->slab_name);
2589out_free_sock_slab:
2590        kmem_cache_destroy(prot->slab);
2591        prot->slab = NULL;
2592out:
2593        return -ENOBUFS;
2594}
2595EXPORT_SYMBOL(proto_register);
2596
2597void proto_unregister(struct proto *prot)
2598{
2599        mutex_lock(&proto_list_mutex);
2600        release_proto_idx(prot);
2601        list_del(&prot->node);
2602        mutex_unlock(&proto_list_mutex);
2603
2604        if (prot->slab != NULL) {
2605                kmem_cache_destroy(prot->slab);
2606                prot->slab = NULL;
2607        }
2608
2609        if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2610                kmem_cache_destroy(prot->rsk_prot->slab);
2611                kfree(prot->rsk_prot->slab_name);
2612                prot->rsk_prot->slab = NULL;
2613        }
2614
2615        if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2616                kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2617                kfree(prot->twsk_prot->twsk_slab_name);
2618                prot->twsk_prot->twsk_slab = NULL;
2619        }
2620}
2621EXPORT_SYMBOL(proto_unregister);
2622
2623#ifdef CONFIG_PROC_FS
2624static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2625        __acquires(proto_list_mutex)
2626{
2627        mutex_lock(&proto_list_mutex);
2628        return seq_list_start_head(&proto_list, *pos);
2629}
2630
2631static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2632{
2633        return seq_list_next(v, &proto_list, pos);
2634}
2635
2636static void proto_seq_stop(struct seq_file *seq, void *v)
2637        __releases(proto_list_mutex)
2638{
2639        mutex_unlock(&proto_list_mutex);
2640}
2641
2642static char proto_method_implemented(const void *method)
2643{
2644        return method == NULL ? 'n' : 'y';
2645}
2646static long sock_prot_memory_allocated(struct proto *proto)
2647{
2648        return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2649}
2650
2651static char *sock_prot_memory_pressure(struct proto *proto)
2652{
2653        return proto->memory_pressure != NULL ?
2654        proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2655}
2656
2657static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2658{
2659
2660        seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2661                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2662                   proto->name,
2663                   proto->obj_size,
2664                   sock_prot_inuse_get(seq_file_net(seq), proto),
2665                   sock_prot_memory_allocated(proto),
2666                   sock_prot_memory_pressure(proto),
2667                   proto->max_header,
2668                   proto->slab == NULL ? "no" : "yes",
2669                   module_name(proto->owner),
2670                   proto_method_implemented(proto->close),
2671                   proto_method_implemented(proto->connect),
2672                   proto_method_implemented(proto->disconnect),
2673                   proto_method_implemented(proto->accept),
2674                   proto_method_implemented(proto->ioctl),
2675                   proto_method_implemented(proto->init),
2676                   proto_method_implemented(proto->destroy),
2677                   proto_method_implemented(proto->shutdown),
2678                   proto_method_implemented(proto->setsockopt),
2679                   proto_method_implemented(proto->getsockopt),
2680                   proto_method_implemented(proto->sendmsg),
2681                   proto_method_implemented(proto->recvmsg),
2682                   proto_method_implemented(proto->sendpage),
2683                   proto_method_implemented(proto->bind),
2684                   proto_method_implemented(proto->backlog_rcv),
2685                   proto_method_implemented(proto->hash),
2686                   proto_method_implemented(proto->unhash),
2687                   proto_method_implemented(proto->get_port),
2688                   proto_method_implemented(proto->enter_memory_pressure));
2689}
2690
2691static int proto_seq_show(struct seq_file *seq, void *v)
2692{
2693        if (v == &proto_list)
2694                seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2695                           "protocol",
2696                           "size",
2697                           "sockets",
2698                           "memory",
2699                           "press",
2700                           "maxhdr",
2701                           "slab",
2702                           "module",
2703                           "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2704        else
2705                proto_seq_printf(seq, list_entry(v, struct proto, node));
2706        return 0;
2707}
2708
2709static const struct seq_operations proto_seq_ops = {
2710        .start  = proto_seq_start,
2711        .next   = proto_seq_next,
2712        .stop   = proto_seq_stop,
2713        .show   = proto_seq_show,
2714};
2715
2716static int proto_seq_open(struct inode *inode, struct file *file)
2717{
2718        return seq_open_net(inode, file, &proto_seq_ops,
2719                            sizeof(struct seq_net_private));
2720}
2721
2722static const struct file_operations proto_seq_fops = {
2723        .owner          = THIS_MODULE,
2724        .open           = proto_seq_open,
2725        .read           = seq_read,
2726        .llseek         = seq_lseek,
2727        .release        = seq_release_net,
2728};
2729
2730static __net_init int proto_init_net(struct net *net)
2731{
2732        if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2733                return -ENOMEM;
2734
2735        return 0;
2736}
2737
2738static __net_exit void proto_exit_net(struct net *net)
2739{
2740        proc_net_remove(net, "protocols");
2741}
2742
2743
2744static __net_initdata struct pernet_operations proto_net_ops = {
2745        .init = proto_init_net,
2746        .exit = proto_exit_net,
2747};
2748
2749static int __init proto_init(void)
2750{
2751        return register_pernet_subsys(&proto_net_ops);
2752}
2753
2754subsys_initcall(proto_init);
2755
2756#endif /* PROC_FS */
2757
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.