linux/net/core/sock.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Generic socket support routines. Memory allocators, socket lock/release
 *              handler for protocols to use and generic option handler.
 *
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Florian La Roche, <flla@stud.uni-sb.de>
 *              Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *              Alan Cox        :       Numerous verify_area() problems
 *              Alan Cox        :       Connecting on a connecting socket
 *                                      now returns an error for tcp.
 *              Alan Cox        :       sock->protocol is set correctly.
 *                                      and is not sometimes left as 0.
 *              Alan Cox        :       connect handles icmp errors on a
 *                                      connect properly. Unfortunately there
 *                                      is a restart syscall nasty there. I
 *                                      can't match BSD without hacking the C
 *                                      library. Ideas urgently sought!
 *              Alan Cox        :       Disallow bind() to addresses that are
 *                                      not ours - especially broadcast ones!!
 *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
 *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
 *                                      instead they leave that for the DESTROY timer.
 *              Alan Cox        :       Clean up error flag in accept
 *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
 *                                      was buggy. Put a remove_sock() in the handler
 *                                      for memory when we hit 0. Also altered the timer
 *                                      code. The ACK stuff can wait and needs major
 *                                      TCP layer surgery.
 *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
 *                                      and fixed timer/inet_bh race.
 *              Alan Cox        :       Added zapped flag for TCP
 *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
 *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
 *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
 *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
 *      Pauline Middelink       :       identd support
 *              Alan Cox        :       Fixed connect() taking signals I think.
 *              Alan Cox        :       SO_LINGER supported
 *              Alan Cox        :       Error reporting fixes
 *              Anonymous       :       inet_create tidied up (sk->reuse setting)
 *              Alan Cox        :       inet sockets don't set sk->type!
 *              Alan Cox        :       Split socket option code
 *              Alan Cox        :       Callbacks
 *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
 *              Alex            :       Removed restriction on inet fioctl
 *              Alan Cox        :       Splitting INET from NET core
 *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
 *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
 *              Alan Cox        :       Split IP from generic code
 *              Alan Cox        :       New kfree_skbmem()
 *              Alan Cox        :       Make SO_DEBUG superuser only.
 *              Alan Cox        :       Allow anyone to clear SO_DEBUG
 *                                      (compatibility fix)
 *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
 *              Alan Cox        :       Allocator for a socket is settable.
 *              Alan Cox        :       SO_ERROR includes soft errors.
 *              Alan Cox        :       Allow NULL arguments on some SO_ opts
 *              Alan Cox        :       Generic socket allocation to make hooks
 *                                      easier (suggested by Craig Metz).
 *              Michael Pall    :       SO_ERROR returns positive errno again
 *              Steve Whitehouse:       Added default destructor to free
 *                                      protocol private data.
 *              Steve Whitehouse:       Added various other default routines
 *                                      common to several socket families.
 *              Chris Evans     :       Call suser() check last on F_SETOWN
 *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
 *              Andi Kleen      :       Fix write_space callback
 *              Chris Evans     :       Security fixes - signedness again
 *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>

#include <asm/uaccess.h>
#include <asm/system.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>

#include <linux/filter.h>

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family:
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings at build time, so that runtime initialization of socket
 * locks is fast):
 */
static const char *af_family_key_strings[AF_MAX+1] = {
  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
  "sk_lock-AF_MAX"
};
static const char *af_family_slock_key_strings[AF_MAX+1] = {
  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
  "slock-AF_MAX"
};
static const char *af_family_clock_key_strings[AF_MAX+1] = {
  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
  "clock-AF_MAX"
};

/*
 * sk_callback_lock locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS         256
#define _SK_MEM_OVERHEAD        (sizeof(struct sk_buff) + 256)
#define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
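
/*
 * A worked example of the sizing above (illustrative only): with a
 * hypothetical sizeof(struct sk_buff) of 256 bytes, _SK_MEM_OVERHEAD
 * comes to 512 bytes per packet, so SK_WMEM_MAX and SK_RMEM_MAX are
 * 512 * 256 = 128 KiB, regardless of the platform's real sk_buff size.
 */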

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
        struct timeval tv;

        if (optlen < sizeof(tv))
                return -EINVAL;
        if (copy_from_user(&tv, optval, sizeof(tv)))
                return -EFAULT;
        if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
                return -EDOM;

        if (tv.tv_sec < 0) {
                static int warned __read_mostly;

                *timeo_p = 0;
                if (warned < 10 && net_ratelimit()) {
                        warned++;
                        printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
                               "tries to set negative timeout\n",
                                current->comm, task_pid_nr(current));
                }
                return 0;
        }
        *timeo_p = MAX_SCHEDULE_TIMEOUT;
        if (tv.tv_sec == 0 && tv.tv_usec == 0)
                return 0;
        if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
                *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
        return 0;
}
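
/*
 * Example of the conversion above (illustrative, assuming HZ == 1000):
 * a user timeout of { .tv_sec = 2, .tv_usec = 500000 } becomes
 * 2 * 1000 + (500000 + 999) / 1000 = 2500 jiffies. The microseconds are
 * rounded up, so a small nonzero timeout never truncates to zero, and a
 * zero timeval means "wait forever" (MAX_SCHEDULE_TIMEOUT).
 */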

static void sock_warn_obsolete_bsdism(const char *name)
{
        static int warned;
        static char warncomm[TASK_COMM_LEN];
        if (strcmp(warncomm, current->comm) && warned < 5) {
                strcpy(warncomm,  current->comm);
                printk(KERN_WARNING "process `%s' is using obsolete "
                       "%s SO_BSDCOMPAT\n", warncomm, name);
                warned++;
        }
}

static void sock_disable_timestamp(struct sock *sk, int flag)
{
        if (sock_flag(sk, flag)) {
                sock_reset_flag(sk, flag);
                if (!sock_flag(sk, SOCK_TIMESTAMP) &&
                    !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) {
                        net_disable_timestamp();
                }
        }
}


int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
        int err = 0;
        int skb_len;

        /* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
           the number of warnings when compiling with -W --ANK
         */
        if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
            (unsigned)sk->sk_rcvbuf) {
                err = -ENOMEM;
                goto out;
        }

        err = sk_filter(sk, skb);
        if (err)
                goto out;

        if (!sk_rmem_schedule(sk, skb->truesize)) {
                err = -ENOBUFS;
                goto out;
        }

        skb->dev = NULL;
        skb_set_owner_r(skb, sk);

        /* Cache the SKB length before we tack it onto the receive
         * queue.  Once it is added it no longer belongs to us and
         * may be freed by other threads of control pulling packets
         * from the queue.
         */
        skb_len = skb->len;

        skb_queue_tail(&sk->sk_receive_queue, skb);

        if (!sock_flag(sk, SOCK_DEAD))
                sk->sk_data_ready(sk, skb_len);
out:
        return err;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
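
/*
 * A minimal sketch (not part of this file; "my_proto_queue_rcv" is a
 * hypothetical name) of how a protocol's receive path typically hands
 * a demultiplexed packet to the generic socket layer: on failure the
 * skb was never attached to the socket, so the caller must free it.
 */
static inline int my_proto_queue_rcv(struct sock *sk, struct sk_buff *skb)
{
        int rc = sock_queue_rcv_skb(sk, skb);

        if (rc < 0) {
                /* -ENOMEM / -ENOBUFS / filter verdict: drop the packet. */
                kfree_skb(skb);
                return NET_RX_DROP;
        }
        return NET_RX_SUCCESS;
}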

int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
{
        int rc = NET_RX_SUCCESS;

        if (sk_filter(sk, skb))
                goto discard_and_relse;

        skb->dev = NULL;

        if (nested)
                bh_lock_sock_nested(sk);
        else
                bh_lock_sock(sk);
        if (!sock_owned_by_user(sk)) {
                /*
                 * trylock + unlock semantics:
                 */
                mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

                rc = sk_backlog_rcv(sk, skb);

                mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
        } else
                sk_add_backlog(sk, skb);
        bh_unlock_sock(sk);
out:
        sock_put(sk);
        return rc;
discard_and_relse:
        kfree_skb(skb);
        goto out;
}
EXPORT_SYMBOL(sk_receive_skb);

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
        struct dst_entry *dst = sk->sk_dst_cache;

        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
                sk->sk_dst_cache = NULL;
                dst_release(dst);
                return NULL;
        }

        return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
        struct dst_entry *dst = sk_dst_get(sk);

        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
                sk_dst_reset(sk);
                dst_release(dst);
                return NULL;
        }

        return dst;
}
EXPORT_SYMBOL(sk_dst_check);

static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
{
        int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
        struct net *net = sock_net(sk);
        char devname[IFNAMSIZ];
        int index;

        /* Sorry... */
        ret = -EPERM;
        if (!capable(CAP_NET_RAW))
                goto out;

        ret = -EINVAL;
        if (optlen < 0)
                goto out;

        /* Bind this socket to a particular device like "eth0",
         * as specified in the passed interface name. If the
         * name is "" or the option length is zero the socket
         * is not bound.
         */
        if (optlen > IFNAMSIZ - 1)
                optlen = IFNAMSIZ - 1;
        memset(devname, 0, sizeof(devname));

        ret = -EFAULT;
        if (copy_from_user(devname, optval, optlen))
                goto out;

        if (devname[0] == '\0') {
                index = 0;
        } else {
                struct net_device *dev = dev_get_by_name(net, devname);

                ret = -ENODEV;
                if (!dev)
                        goto out;

                index = dev->ifindex;
                dev_put(dev);
        }

        lock_sock(sk);
        sk->sk_bound_dev_if = index;
        sk_dst_reset(sk);
        release_sock(sk);

        ret = 0;

out:
#endif

        return ret;
}
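
/*
 * Example from the user side (illustrative; requires CAP_NET_RAW, per
 * the check above): binding a socket to eth0, then unbinding it again
 * by passing an empty name or a zero option length.
 *
 *      setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", sizeof("eth0"));
 *      setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "", 0);
 */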

static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
        if (valbool)
                sock_set_flag(sk, bit);
        else
                sock_reset_flag(sk, bit);
}

/*
 *      This is meant for all protocols to use and covers goings on
 *      at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int optlen)
{
        struct sock *sk = sock->sk;
        int val;
        int valbool;
        struct linger ling;
        int ret = 0;

        /*
         *      Options without arguments
         */

        if (optname == SO_BINDTODEVICE)
                return sock_bindtodevice(sk, optval, optlen);

        if (optlen < sizeof(int))
                return -EINVAL;

        if (get_user(val, (int __user *)optval))
                return -EFAULT;

        valbool = val ? 1 : 0;

        lock_sock(sk);

        switch (optname) {
        case SO_DEBUG:
                if (val && !capable(CAP_NET_ADMIN)) {
                        ret = -EACCES;
                } else
                        sock_valbool_flag(sk, SOCK_DBG, valbool);
                break;
        case SO_REUSEADDR:
                sk->sk_reuse = valbool;
                break;
        case SO_TYPE:
        case SO_ERROR:
                ret = -ENOPROTOOPT;
                break;
        case SO_DONTROUTE:
                sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
                break;
        case SO_BROADCAST:
                sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
                break;
        case SO_SNDBUF:
                /* Don't return an error here; BSD doesn't, and if you
                   think about it this is right. Otherwise apps have to
                   play 'guess the biggest size' games. RCVBUF/SNDBUF
                   are treated in BSD as hints. */

                if (val > sysctl_wmem_max)
                        val = sysctl_wmem_max;
set_sndbuf:
                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
                if ((val * 2) < SOCK_MIN_SNDBUF)
                        sk->sk_sndbuf = SOCK_MIN_SNDBUF;
                else
                        sk->sk_sndbuf = val * 2;

                /*
                 *      Wake up sending tasks if we
                 *      upped the value.
                 */
                sk->sk_write_space(sk);
                break;

        case SO_SNDBUFFORCE:
                if (!capable(CAP_NET_ADMIN)) {
                        ret = -EPERM;
                        break;
                }
                goto set_sndbuf;

        case SO_RCVBUF:
                /* Don't return an error here; BSD doesn't, and if you
                   think about it this is right. Otherwise apps have to
                   play 'guess the biggest size' games. RCVBUF/SNDBUF
                   are treated in BSD as hints. */

                if (val > sysctl_rmem_max)
                        val = sysctl_rmem_max;
set_rcvbuf:
                sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
                /*
                 * We double it on the way in to account for
                 * "struct sk_buff" etc. overhead.   Applications
                 * assume that the SO_RCVBUF setting they make will
                 * allow that much actual data to be received on that
                 * socket.
                 *
                 * Applications are unaware that "struct sk_buff" and
                 * other overheads allocate from the receive buffer
                 * during socket buffer allocation.
                 *
                 * And after considering the possible alternatives,
                 * returning the value we actually used in getsockopt
                 * is the most desirable behavior.
                 */
                if ((val * 2) < SOCK_MIN_RCVBUF)
                        sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
                else
                        sk->sk_rcvbuf = val * 2;
                break;
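
                /*
                 * Worked example of the doubling above (illustrative):
                 * setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &(int){65536}, 4)
                 * leaves sk_rcvbuf at 131072, and a later getsockopt()
                 * reports 131072 -- the value actually used, overhead
                 * included.
                 */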

        case SO_RCVBUFFORCE:
                if (!capable(CAP_NET_ADMIN)) {
                        ret = -EPERM;
                        break;
                }
                goto set_rcvbuf;

        case SO_KEEPALIVE:
#ifdef CONFIG_INET
                if (sk->sk_protocol == IPPROTO_TCP)
                        tcp_set_keepalive(sk, valbool);
#endif
                sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
                break;

        case SO_OOBINLINE:
                sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
                break;

        case SO_NO_CHECK:
                sk->sk_no_check = valbool;
                break;

        case SO_PRIORITY:
                if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
                        sk->sk_priority = val;
                else
                        ret = -EPERM;
                break;

        case SO_LINGER:
                if (optlen < sizeof(ling)) {
                        ret = -EINVAL;  /* 1003.1g */
                        break;
                }
                if (copy_from_user(&ling, optval, sizeof(ling))) {
                        ret = -EFAULT;
                        break;
                }
                if (!ling.l_onoff)
                        sock_reset_flag(sk, SOCK_LINGER);
                else {
#if (BITS_PER_LONG == 32)
                        if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
                                sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
                        else
#endif
                                sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
                        sock_set_flag(sk, SOCK_LINGER);
                }
                break;

        case SO_BSDCOMPAT:
                sock_warn_obsolete_bsdism("setsockopt");
                break;

        case SO_PASSCRED:
                if (valbool)
                        set_bit(SOCK_PASSCRED, &sock->flags);
                else
                        clear_bit(SOCK_PASSCRED, &sock->flags);
                break;

        case SO_TIMESTAMP:
        case SO_TIMESTAMPNS:
                if (valbool)  {
                        if (optname == SO_TIMESTAMP)
                                sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
                        else
                                sock_set_flag(sk, SOCK_RCVTSTAMPNS);
                        sock_set_flag(sk, SOCK_RCVTSTAMP);
                        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
                } else {
                        sock_reset_flag(sk, SOCK_RCVTSTAMP);
                        sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
                }
                break;

        case SO_TIMESTAMPING:
                if (val & ~SOF_TIMESTAMPING_MASK) {
                        ret = -EINVAL;
                        break;
                }
                sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
                                  val & SOF_TIMESTAMPING_TX_HARDWARE);
                sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
                                  val & SOF_TIMESTAMPING_TX_SOFTWARE);
                sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
                                  val & SOF_TIMESTAMPING_RX_HARDWARE);
                if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
                        sock_enable_timestamp(sk,
                                              SOCK_TIMESTAMPING_RX_SOFTWARE);
                else
                        sock_disable_timestamp(sk,
                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
                sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
                                  val & SOF_TIMESTAMPING_SOFTWARE);
                sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
                                  val & SOF_TIMESTAMPING_SYS_HARDWARE);
                sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
                                  val & SOF_TIMESTAMPING_RAW_HARDWARE);
                break;

        case SO_RCVLOWAT:
                if (val < 0)
                        val = INT_MAX;
                sk->sk_rcvlowat = val ? : 1;
                break;

        case SO_RCVTIMEO:
                ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
                break;

        case SO_SNDTIMEO:
                ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
                break;

        case SO_ATTACH_FILTER:
                ret = -EINVAL;
                if (optlen == sizeof(struct sock_fprog)) {
                        struct sock_fprog fprog;

                        ret = -EFAULT;
                        if (copy_from_user(&fprog, optval, sizeof(fprog)))
                                break;

                        ret = sk_attach_filter(&fprog, sk);
                }
                break;

        case SO_DETACH_FILTER:
                ret = sk_detach_filter(sk);
                break;

        case SO_PASSSEC:
                if (valbool)
                        set_bit(SOCK_PASSSEC, &sock->flags);
                else
                        clear_bit(SOCK_PASSSEC, &sock->flags);
                break;
        case SO_MARK:
                if (!capable(CAP_NET_ADMIN))
                        ret = -EPERM;
                else
                        sk->sk_mark = val;
                break;

                /* We implement the SO_SNDLOWAT etc to
                   not be settable (1003.1g 5.3) */
        default:
                ret = -ENOPROTOOPT;
                break;
        }
        release_sock(sk);
        return ret;
}
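
/*
 * A minimal sketch (the helper name and the five-second value are
 * arbitrary) of an in-kernel user of this path: kernel_setsockopt()
 * from net/socket.c funnels SOL_SOCKET options into sock_setsockopt().
 */
static inline int example_set_rcvtimeo(struct socket *sock)
{
        struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };

        /* Hypothetical caller; kernel socket users do the equivalent. */
        return kernel_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO,
                                 (char *)&tv, sizeof(tv));
}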

int sock_getsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;

        union {
                int val;
                struct linger ling;
                struct timeval tm;
        } v;

        unsigned int lv = sizeof(int);
        int len;

        if (get_user(len, optlen))
                return -EFAULT;
        if (len < 0)
                return -EINVAL;

        memset(&v, 0, sizeof(v));

        switch (optname) {
        case SO_DEBUG:
                v.val = sock_flag(sk, SOCK_DBG);
                break;

        case SO_DONTROUTE:
                v.val = sock_flag(sk, SOCK_LOCALROUTE);
                break;

        case SO_BROADCAST:
                v.val = !!sock_flag(sk, SOCK_BROADCAST);
                break;

        case SO_SNDBUF:
                v.val = sk->sk_sndbuf;
                break;

        case SO_RCVBUF:
                v.val = sk->sk_rcvbuf;
                break;

        case SO_REUSEADDR:
                v.val = sk->sk_reuse;
                break;

        case SO_KEEPALIVE:
                v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
                break;

        case SO_TYPE:
                v.val = sk->sk_type;
                break;

        case SO_ERROR:
                v.val = -sock_error(sk);
                if (v.val == 0)
                        v.val = xchg(&sk->sk_err_soft, 0);
                break;

        case SO_OOBINLINE:
                v.val = !!sock_flag(sk, SOCK_URGINLINE);
                break;

        case SO_NO_CHECK:
                v.val = sk->sk_no_check;
                break;

        case SO_PRIORITY:
                v.val = sk->sk_priority;
                break;

        case SO_LINGER:
                lv              = sizeof(v.ling);
                v.ling.l_onoff  = !!sock_flag(sk, SOCK_LINGER);
                v.ling.l_linger = sk->sk_lingertime / HZ;
                break;

        case SO_BSDCOMPAT:
                sock_warn_obsolete_bsdism("getsockopt");
                break;

        case SO_TIMESTAMP:
                v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
                                !sock_flag(sk, SOCK_RCVTSTAMPNS);
                break;

        case SO_TIMESTAMPNS:
                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
                break;

        case SO_TIMESTAMPING:
                v.val = 0;
                if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
                        v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
                if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
                        v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
                if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
                        v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
                if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
                        v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
                if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
                        v.val |= SOF_TIMESTAMPING_SOFTWARE;
                if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
                        v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
                if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
                        v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
                break;

        case SO_RCVTIMEO:
                lv = sizeof(struct timeval);
                if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
                        v.tm.tv_sec = 0;
                        v.tm.tv_usec = 0;
                } else {
                        v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
                        v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
                }
                break;

        case SO_SNDTIMEO:
                lv = sizeof(struct timeval);
                if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
                        v.tm.tv_sec = 0;
                        v.tm.tv_usec = 0;
                } else {
                        v.tm.tv_sec = sk->sk_sndtimeo / HZ;
                        v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
                }
                break;

        case SO_RCVLOWAT:
                v.val = sk->sk_rcvlowat;
                break;

        case SO_SNDLOWAT:
                v.val = 1;
                break;

        case SO_PASSCRED:
                v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
                break;

        case SO_PEERCRED:
                if (len > sizeof(sk->sk_peercred))
                        len = sizeof(sk->sk_peercred);
                if (copy_to_user(optval, &sk->sk_peercred, len))
                        return -EFAULT;
                goto lenout;

        case SO_PEERNAME:
        {
                char address[128];

                if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
                        return -ENOTCONN;
                if (lv < len)
                        return -EINVAL;
                if (copy_to_user(optval, address, len))
                        return -EFAULT;
                goto lenout;
        }

        /* Dubious BSD thing... Probably nobody even uses it, but
         * the UNIX standard wants it for whatever reason... -DaveM
         */
        case SO_ACCEPTCONN:
                v.val = sk->sk_state == TCP_LISTEN;
                break;

        case SO_PASSSEC:
                v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
                break;

        case SO_PEERSEC:
                return security_socket_getpeersec_stream(sock, optval, optlen, len);

        case SO_MARK:
                v.val = sk->sk_mark;
                break;

        default:
                return -ENOPROTOOPT;
        }

        if (len > lv)
                len = lv;
        if (copy_to_user(optval, &v, len))
                return -EFAULT;
lenout:
        if (put_user(len, optlen))
                return -EFAULT;
        return 0;
}
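
/*
 * Example of the SO_ERROR semantics above (illustrative, user side):
 * after a nonblocking connect() returns EINPROGRESS and poll() signals
 * writability, the pending status is fetched -- and cleared -- with:
 *
 *      int err;
 *      socklen_t len = sizeof(err);
 *      getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
 */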

/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
        sock_lock_init_class_and_name(sk,
                        af_family_slock_key_strings[sk->sk_family],
                        af_family_slock_keys + sk->sk_family,
                        af_family_key_strings[sk->sk_family],
                        af_family_keys + sk->sk_family);
}

static void sock_copy(struct sock *nsk, const struct sock *osk)
{
#ifdef CONFIG_SECURITY_NETWORK
        void *sptr = nsk->sk_security;
#endif

        memcpy(nsk, osk, osk->sk_prot->obj_size);
#ifdef CONFIG_SECURITY_NETWORK
        nsk->sk_security = sptr;
        security_sk_clone(osk, nsk);
#endif
}

static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
                int family)
{
        struct sock *sk;
        struct kmem_cache *slab;

        slab = prot->slab;
        if (slab != NULL)
                sk = kmem_cache_alloc(slab, priority);
        else
                sk = kmalloc(prot->obj_size, priority);

        if (sk != NULL) {
                if (security_sk_alloc(sk, family, priority))
                        goto out_free;

                if (!try_module_get(prot->owner))
                        goto out_free_sec;
        }

        return sk;

out_free_sec:
        security_sk_free(sk);
out_free:
        if (slab != NULL)
                kmem_cache_free(slab, sk);
        else
                kfree(sk);
        return NULL;
}

static void sk_prot_free(struct proto *prot, struct sock *sk)
{
        struct kmem_cache *slab;
        struct module *owner;

        owner = prot->owner;
        slab = prot->slab;

        security_sk_free(sk);
        if (slab != NULL)
                kmem_cache_free(slab, sk);
        else
                kfree(sk);
        module_put(owner);
}

/**
 *      sk_alloc - All socket objects are allocated here
 *      @net: the applicable net namespace
 *      @family: protocol family
 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *      @prot: struct proto associated with this new sock instance
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
                      struct proto *prot)
{
        struct sock *sk;

        sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
        if (sk) {
                sk->sk_family = family;
                /*
                 * See comment in struct sock definition to understand
                 * why we need sk_prot_creator -acme
                 */
                sk->sk_prot = sk->sk_prot_creator = prot;
                sock_lock_init(sk);
                sock_net_set(sk, get_net(net));
        }

        return sk;
}
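
/*
 * A minimal sketch (hypothetical protocol; "example_create_sock" is not
 * part of this file) of the usual caller: a family's ->create() routine
 * allocates the sock, then runs the generic field setup before any
 * protocol-specific initialization.
 */
static inline struct sock *example_create_sock(struct net *net,
                                               struct proto *prot)
{
        struct sock *sk = sk_alloc(net, PF_INET, GFP_KERNEL, prot);

        if (!sk)
                return NULL;
        sock_init_data(NULL, sk);       /* generic sock field setup */
        return sk;
}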

void sk_free(struct sock *sk)
{
        struct sk_filter *filter;

        if (sk->sk_destruct)
                sk->sk_destruct(sk);

        filter = rcu_dereference(sk->sk_filter);
        if (filter) {
                sk_filter_uncharge(sk, filter);
                rcu_assign_pointer(sk->sk_filter, NULL);
        }

        sock_disable_timestamp(sk, SOCK_TIMESTAMP);
        sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);

        if (atomic_read(&sk->sk_omem_alloc))
                printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
                       __func__, atomic_read(&sk->sk_omem_alloc));

        put_net(sock_net(sk));
        sk_prot_free(sk->sk_prot_creator, sk);
}

/*
 * The last sock_put should drop a reference to sk->sk_net. It has
 * already been dropped in sk_change_net. Taking a reference to the
 * stopping namespace is not an option.
 * Take a reference to the socket to remove it from the hash _alive_,
 * and after that destroy it in the context of init_net.
 */
void sk_release_kernel(struct sock *sk)
{
        if (sk == NULL || sk->sk_socket == NULL)
                return;

        sock_hold(sk);
        sock_release(sk->sk_socket);
        release_net(sock_net(sk));
        sock_net_set(sk, get_net(&init_net));
        sock_put(sk);
}
EXPORT_SYMBOL(sk_release_kernel);

struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
{
        struct sock *newsk;

        newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
        if (newsk != NULL) {
                struct sk_filter *filter;

                sock_copy(newsk, sk);

                /* SANITY */
                get_net(sock_net(newsk));
                sk_node_init(&newsk->sk_node);
                sock_lock_init(newsk);
                bh_lock_sock(newsk);
                newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;

                atomic_set(&newsk->sk_rmem_alloc, 0);
                atomic_set(&newsk->sk_wmem_alloc, 0);
                atomic_set(&newsk->sk_omem_alloc, 0);
                skb_queue_head_init(&newsk->sk_receive_queue);
                skb_queue_head_init(&newsk->sk_write_queue);
#ifdef CONFIG_NET_DMA
                skb_queue_head_init(&newsk->sk_async_wait_queue);
#endif

                rwlock_init(&newsk->sk_dst_lock);
                rwlock_init(&newsk->sk_callback_lock);
                lockdep_set_class_and_name(&newsk->sk_callback_lock,
                                af_callback_keys + newsk->sk_family,
                                af_family_clock_key_strings[newsk->sk_family]);

                newsk->sk_dst_cache     = NULL;
                newsk->sk_wmem_queued   = 0;
                newsk->sk_forward_alloc = 0;
                newsk->sk_send_head     = NULL;
                newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

                sock_reset_flag(newsk, SOCK_DONE);
                skb_queue_head_init(&newsk->sk_error_queue);

                filter = newsk->sk_filter;
                if (filter != NULL)
                        sk_filter_charge(newsk, filter);

                if (unlikely(xfrm_sk_clone_policy(newsk))) {
                        /* It is still a raw copy of the parent, so
                         * invalidate the destructor and do a plain
                         * sk_free(). */
                        newsk->sk_destruct = NULL;
                        sk_free(newsk);
                        newsk = NULL;
                        goto out;
                }

                newsk->sk_err      = 0;
                newsk->sk_priority = 0;
                atomic_set(&newsk->sk_refcnt, 2);

                /*
                 * Increment the counter in the same struct proto as the master
                 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
                 * is the same as sk->sk_prot->socks, as this field was copied
                 * with memcpy).
                 *
                 * This _changes_ the previous behaviour, where
                 * tcp_create_openreq_child always incremented the
                 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
                 * to be taken into account in all callers. -acme
                 */
                sk_refcnt_debug_inc(newsk);
                sk_set_socket(newsk, NULL);
                newsk->sk_sleep  = NULL;

                if (newsk->sk_prot->sockets_allocated)
                        percpu_counter_inc(newsk->sk_prot->sockets_allocated);
        }
out:
        return newsk;
}

EXPORT_SYMBOL_GPL(sk_clone);

void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
        __sk_dst_set(sk, dst);
        sk->sk_route_caps = dst->dev->features;
        if (sk->sk_route_caps & NETIF_F_GSO)
                sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
        if (sk_can_gso(sk)) {
                if (dst->header_len) {
                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
                } else {
                        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
                        sk->sk_gso_max_size = dst->dev->gso_max_size;
                }
        }
}
EXPORT_SYMBOL_GPL(sk_setup_caps);

void __init sk_init(void)
{
        if (num_physpages <= 4096) {
                sysctl_wmem_max = 32767;
                sysctl_rmem_max = 32767;
                sysctl_wmem_default = 32767;
                sysctl_rmem_default = 32767;
        } else if (num_physpages >= 131072) {
                sysctl_wmem_max = 131071;
                sysctl_rmem_max = 131071;
        }
}

/*
 *      Simple resource managers for sockets.
 */


/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;

        /* In case it might be waiting for more memory. */
        atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
                sk->sk_write_space(sk);
        sock_put(sk);
}

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;

        atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
        sk_mem_uncharge(sk, skb->truesize);
}
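
/*
 * These destructors are normally installed by the skb_set_owner_w() and
 * skb_set_owner_r() helpers in <net/sock.h>, which do (roughly) the
 * mirror-image accounting on the way in; for the write side:
 *
 *      sock_hold(sk);
 *      skb->sk = sk;
 *      skb->destructor = sock_wfree;
 *      atomic_add(skb->truesize, &sk->sk_wmem_alloc);
 */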

int sock_i_uid(struct sock *sk)
{
        int uid;

        read_lock(&sk->sk_callback_lock);
        uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
        read_unlock(&sk->sk_callback_lock);
        return uid;
}

unsigned long sock_i_ino(struct sock *sk)
{
        unsigned long ino;

        read_lock(&sk->sk_callback_lock);
        ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
        read_unlock(&sk->sk_callback_lock);
        return ino;
}

/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
                             gfp_t priority)
{
        if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
                struct sk_buff *skb = alloc_skb(size, priority);
                if (skb) {
                        skb_set_owner_w(skb, sk);
                        return skb;
                }
        }
        return NULL;
}

/*
 * Allocate a skb from the socket's receive buffer.
 */
struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
                             gfp_t priority)
{
        if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
                struct sk_buff *skb = alloc_skb(size, priority);
                if (skb) {
                        skb_set_owner_r(skb, sk);
                        return skb;
                }
        }
        return NULL;
}

/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
        if ((unsigned)size <= sysctl_optmem_max &&
            atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
                void *mem;
                /* First do the add, to avoid the race if kmalloc
                 * might sleep.
                 */
                atomic_add(size, &sk->sk_omem_alloc);
                mem = kmalloc(size, priority);
                if (mem)
                        return mem;
                atomic_sub(size, &sk->sk_omem_alloc);
        }
        return NULL;
}

/*
 * Free an option memory block.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
        kfree(mem);
        atomic_sub(size, &sk->sk_omem_alloc);
}
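
/*
 * A minimal sketch (hypothetical helper, arbitrary option payload) of
 * the intended pairing: sock_kmalloc() charges sk_omem_alloc, and
 * sock_kfree_s() must be given the same size so the charge is returned
 * in full.
 */
static inline int example_store_opt(struct sock *sk, const void *data,
                                    int size)
{
        void *opt = sock_kmalloc(sk, size, GFP_KERNEL);

        if (!opt)
                return -ENOBUFS;
        memcpy(opt, data, size);
        /* ... later, the owner releases it with the original size: */
        sock_kfree_s(sk, opt, size);
        return 0;
}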

/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
   I think these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
        DEFINE_WAIT(wait);

        clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
        for (;;) {
                if (!timeo)
                        break;
                if (signal_pending(current))
                        break;
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
                        break;
                if (sk->sk_shutdown & SEND_SHUTDOWN)
                        break;
                if (sk->sk_err)
                        break;
                timeo = schedule_timeout(timeo);
        }
        finish_wait(sk->sk_sleep, &wait);
        return timeo;
}


/*
 *      Generic send/receive buffer handlers
 */

struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
                                     unsigned long data_len, int noblock,
                                     int *errcode)
{
        struct sk_buff *skb;
        gfp_t gfp_mask;
        long timeo;
        int err;

        gfp_mask = sk->sk_allocation;
        if (gfp_mask & __GFP_WAIT)
                gfp_mask |= __GFP_REPEAT;

        timeo = sock_sndtimeo(sk, noblock);
        while (1) {
                err = sock_error(sk);
                if (err != 0)
                        goto failure;

                err = -EPIPE;
                if (sk->sk_shutdown & SEND_SHUTDOWN)
                        goto failure;

                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
                        skb = alloc_skb(header_len, gfp_mask);
                        if (skb) {
                                int npages;
                                int i;

                                /* No pages, we're done... */
                                if (!data_len)
                                        break;

                                npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
                                skb->truesize += data_len;
                                skb_shinfo(skb)->nr_frags = npages;
                                for (i = 0; i < npages; i++) {
                                        struct page *page;
                                        skb_frag_t *frag;

                                        page = alloc_pages(sk->sk_allocation, 0);
                                        if (!page) {
                                                err = -ENOBUFS;
                                                skb_shinfo(skb)->nr_frags = i;
                                                kfree_skb(skb);
                                                goto failure;
                                        }

                                        frag = &skb_shinfo(skb)->frags[i];
                                        frag->page = page;
                                        frag->page_offset = 0;
                                        frag->size = (data_len >= PAGE_SIZE ?
                                                      PAGE_SIZE :
                                                      data_len);
                                        data_len -= PAGE_SIZE;
                                }

                                /* Full success... */
                                break;
                        }
                        err = -ENOBUFS;
                        goto failure;
                }
                set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                err = -EAGAIN;
                if (!timeo)
                        goto failure;
                if (signal_pending(current))
                        goto interrupted;
                timeo = sock_wait_for_wmem(sk, timeo);
        }

        skb_set_owner_w(skb, sk);
        return skb;

interrupted:
        err = sock_intr_errno(timeo);
failure:
        *errcode = err;
        return NULL;
}
EXPORT_SYMBOL(sock_alloc_send_pskb);

struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
                                    int noblock, int *errcode)
{
        return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
}
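
/*
 * Typical use (illustrative; "len" and "hlen" are placeholders): a
 * datagram sendmsg() implementation reserves one skb for the whole
 * message, blocking within the send timeout and failing with a
 * negative errno via the out parameter otherwise.
 *
 *      skb = sock_alloc_send_skb(sk, len + hlen,
 *                                msg->msg_flags & MSG_DONTWAIT, &err);
 *      if (!skb)
 *              return err;
 */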

static void __lock_sock(struct sock *sk)
{
        DEFINE_WAIT(wait);

        for (;;) {
                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
                                        TASK_UNINTERRUPTIBLE);
                spin_unlock_bh(&sk->sk_lock.slock);
                schedule();
                spin_lock_bh(&sk->sk_lock.slock);
                if (!sock_owned_by_user(sk))
                        break;
        }
        finish_wait(&sk->sk_lock.wq, &wait);
}

static void __release_sock(struct sock *sk)
{
        struct sk_buff *skb = sk->sk_backlog.head;

        do {
                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
                bh_unlock_sock(sk);

                do {
                        struct sk_buff *next = skb->next;

                        skb->next = NULL;
                        sk_backlog_rcv(sk, skb);

                        /*
                         * We are in process context here with softirqs
                         * disabled, use cond_resched_softirq() to preempt.
                         * This is safe to do because we've taken the backlog
                         * queue private:
                         */
                        cond_resched_softirq();

                        skb = next;
                } while (skb != NULL);

                bh_lock_sock(sk);
        } while ((skb = sk->sk_backlog.head) != NULL);
}

/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 *
 * Now socket state including sk->sk_err is changed only under the lock,
 * hence we may omit checks after joining the wait queue.
 * We check the receive queue before schedule() only as an optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo)
{
        int rc;
        DEFINE_WAIT(wait);

        prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
        set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
        rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
        clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
        finish_wait(sk->sk_sleep, &wait);
        return rc;
}

EXPORT_SYMBOL(sk_wait_data);
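
/*
 * Typical use (illustrative sketch; signal handling omitted): a
 * blocking recvmsg() loop sleeps here until release_sock() or
 * sk_data_ready() queues something, or the caller's timeout runs out.
 *
 *      timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *      while (skb_queue_empty(&sk->sk_receive_queue)) {
 *              if (!timeo)
 *                      return -EAGAIN;
 *              sk_wait_data(sk, &timeo);
 *      }
 */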
1465
1466/**
1467 *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1468 *      @sk: socket
1469 *      @size: memory size to allocate
1470 *      @kind: allocation type
1471 *
1472 *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1473 *      rmem allocation. This function assumes that protocols which have
1474 *      memory_pressure use sk_wmem_queued as write buffer accounting.
1475 */
1476int __sk_mem_schedule(struct sock *sk, int size, int kind)
1477{
1478        struct proto *prot = sk->sk_prot;
1479        int amt = sk_mem_pages(size);
1480        int allocated;
1481
1482        sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1483        allocated = atomic_add_return(amt, prot->memory_allocated);
1484
1485        /* Under limit. */
1486        if (allocated <= prot->sysctl_mem[0]) {
1487                if (prot->memory_pressure && *prot->memory_pressure)
1488                        *prot->memory_pressure = 0;
1489                return 1;
1490        }
1491
1492        /* Under pressure. */
1493        if (allocated > prot->sysctl_mem[1])
1494                if (prot->enter_memory_pressure)
1495                        prot->enter_memory_pressure(sk);
1496
1497        /* Over hard limit. */
1498        if (allocated > prot->sysctl_mem[2])
1499                goto suppress_allocation;
1500
1501        /* guarantee minimum buffer size under pressure */
1502        if (kind == SK_MEM_RECV) {
1503                if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1504                        return 1;
1505        } else { /* SK_MEM_SEND */
1506                if (sk->sk_type == SOCK_STREAM) {
1507                        if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1508                                return 1;
1509                } else if (atomic_read(&sk->sk_wmem_alloc) <
1510                           prot->sysctl_wmem[0])
1511                        return 1;
1512        }
1513
1514        if (prot->memory_pressure) {
1515                int alloc;
1516
1517                if (!*prot->memory_pressure)
1518                        return 1;
1519                alloc = percpu_counter_read_positive(prot->sockets_allocated);
1520                if (prot->sysctl_mem[2] > alloc *
1521                    sk_mem_pages(sk->sk_wmem_queued +
1522                                 atomic_read(&sk->sk_rmem_alloc) +
1523                                 sk->sk_forward_alloc))
1524                        return 1;
1525        }
1526
1527suppress_allocation:
1528
1529        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1530                sk_stream_moderate_sndbuf(sk);
1531
1532                /* Fail only if socket is _under_ its sndbuf.
1533                 * In this case we cannot block, so we have to fail.
1534                 */
1535                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1536                        return 1;
1537        }
1538
1539        /* Alas. Undo changes. */
1540        sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1541        atomic_sub(amt, prot->memory_allocated);
1542        return 0;
1543}
1544
1545EXPORT_SYMBOL(__sk_mem_schedule);
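
/*
 * Editorial sketch (hypothetical, simplified): protocols normally reach
 * __sk_mem_schedule() through the sk_rmem_schedule()/sk_wmem_schedule()
 * wrappers in <net/sock.h>, e.g. before charging a received skb:
 *
 *	if (!sk_rmem_schedule(sk, skb->truesize))
 *		goto drop;
 *	skb_set_owner_r(skb, sk);
 *	skb_queue_tail(&sk->sk_receive_queue, skb);
 */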
1546
1547/**
1548 *      __sk_mem_reclaim - reclaim memory_allocated
1549 *      @sk: socket
1550 */
1551void __sk_mem_reclaim(struct sock *sk)
1552{
1553        struct proto *prot = sk->sk_prot;
1554
1555        atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
1556                   prot->memory_allocated);
1557        sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1558
1559        if (prot->memory_pressure && *prot->memory_pressure &&
1560            (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
1561                *prot->memory_pressure = 0;
1562}
1563
1564EXPORT_SYMBOL(__sk_mem_reclaim);
1565
1566
1567/*
1568 * Set of default routines for initialising struct proto_ops when
1569 * the protocol does not support a particular function. In certain
1570 * cases where it makes no sense for a protocol to have a "do nothing"
1571 * function, some default processing is provided.
1572 */
1573
1574int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1575{
1576        return -EOPNOTSUPP;
1577}
1578
1579int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1580                    int len, int flags)
1581{
1582        return -EOPNOTSUPP;
1583}
1584
1585int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1586{
1587        return -EOPNOTSUPP;
1588}
1589
1590int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1591{
1592        return -EOPNOTSUPP;
1593}
1594
1595int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1596                    int *len, int peer)
1597{
1598        return -EOPNOTSUPP;
1599}
1600
1601unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1602{
1603        return 0;
1604}
1605
1606int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1607{
1608        return -EOPNOTSUPP;
1609}
1610
1611int sock_no_listen(struct socket *sock, int backlog)
1612{
1613        return -EOPNOTSUPP;
1614}
1615
1616int sock_no_shutdown(struct socket *sock, int how)
1617{
1618        return -EOPNOTSUPP;
1619}
1620
1621int sock_no_setsockopt(struct socket *sock, int level, int optname,
1622                    char __user *optval, int optlen)
1623{
1624        return -EOPNOTSUPP;
1625}
1626
1627int sock_no_getsockopt(struct socket *sock, int level, int optname,
1628                    char __user *optval, int __user *optlen)
1629{
1630        return -EOPNOTSUPP;
1631}
1632
1633int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1634                    size_t len)
1635{
1636        return -EOPNOTSUPP;
1637}
1638
1639int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1640                    size_t len, int flags)
1641{
1642        return -EOPNOTSUPP;
1643}
1644
1645int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1646{
1647        /* Mirror missing mmap method error code */
1648        return -ENODEV;
1649}
1650
1651ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1652{
1653        ssize_t res;
1654        struct msghdr msg = {.msg_flags = flags};
1655        struct kvec iov;
1656        char *kaddr = kmap(page);
1657        iov.iov_base = kaddr + offset;
1658        iov.iov_len = size;
1659        res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1660        kunmap(page);
1661        return res;
1662}
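
/*
 * Editorial sketch (hypothetical struct, field subset only): a protocol
 * that implements just a few operations can fill the remaining
 * proto_ops slots with the sock_no_*() stubs above:
 *
 *	static const struct proto_ops example_ops = {
 *		.family   = PF_EXAMPLE,
 *		.owner    = THIS_MODULE,
 *		.bind     = sock_no_bind,
 *		.accept   = sock_no_accept,
 *		.ioctl    = sock_no_ioctl,
 *		.mmap     = sock_no_mmap,
 *		.sendpage = sock_no_sendpage,
 *	};
 */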
1663
1664/*
1665 *      Default Socket Callbacks
1666 */
1667
1668static void sock_def_wakeup(struct sock *sk)
1669{
1670        read_lock(&sk->sk_callback_lock);
1671        if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1672                wake_up_interruptible_all(sk->sk_sleep);
1673        read_unlock(&sk->sk_callback_lock);
1674}
1675
1676static void sock_def_error_report(struct sock *sk)
1677{
1678        read_lock(&sk->sk_callback_lock);
1679        if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1680                wake_up_interruptible_poll(sk->sk_sleep, POLLERR);
1681        sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
1682        read_unlock(&sk->sk_callback_lock);
1683}
1684
1685static void sock_def_readable(struct sock *sk, int len)
1686{
1687        read_lock(&sk->sk_callback_lock);
1688        if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1689                wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN |
1690                                                POLLRDNORM | POLLRDBAND);
1691        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
1692        read_unlock(&sk->sk_callback_lock);
1693}
1694
1695static void sock_def_write_space(struct sock *sk)
1696{
1697        read_lock(&sk->sk_callback_lock);
1698
1699        /* Do not wake up a writer until he can make "significant"
1700         * progress, i.e. until at least half of sk_sndbuf is free. --DaveM
1701         */
1702        if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1703                if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1704                        wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT |
1705                                                POLLWRNORM | POLLWRBAND);
1706
1707                /* Should agree with poll, otherwise some programs break */
1708                if (sock_writeable(sk))
1709                        sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1710        }
1711
1712        read_unlock(&sk->sk_callback_lock);
1713}
1714
1715static void sock_def_destruct(struct sock *sk)
1716{
1717        kfree(sk->sk_protinfo);
1718}
1719
1720void sk_send_sigurg(struct sock *sk)
1721{
1722        if (sk->sk_socket && sk->sk_socket->file)
1723                if (send_sigurg(&sk->sk_socket->file->f_owner))
1724                        sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1725}
1726
1727void sk_reset_timer(struct sock *sk, struct timer_list *timer,
1728                    unsigned long expires)
1729{
1730        if (!mod_timer(timer, expires))
1731                sock_hold(sk);
1732}
1733
1734EXPORT_SYMBOL(sk_reset_timer);
1735
1736void sk_stop_timer(struct sock *sk, struct timer_list *timer)
1737{
1738        if (timer_pending(timer) && del_timer(timer))
1739                __sock_put(sk);
1740}
1741
1742EXPORT_SYMBOL(sk_stop_timer);
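
/*
 * Editorial sketch (hypothetical delay value): the pair above keeps a
 * reference on the socket while a timer is pending, so protocols use
 * them together:
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + delay);
 *	...
 *	sk_stop_timer(sk, &sk->sk_timer);  (drops the reference if pending)
 */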
1743
1744void sock_init_data(struct socket *sock, struct sock *sk)
1745{
1746        skb_queue_head_init(&sk->sk_receive_queue);
1747        skb_queue_head_init(&sk->sk_write_queue);
1748        skb_queue_head_init(&sk->sk_error_queue);
1749#ifdef CONFIG_NET_DMA
1750        skb_queue_head_init(&sk->sk_async_wait_queue);
1751#endif
1752
1753        sk->sk_send_head        =       NULL;
1754
1755        init_timer(&sk->sk_timer);
1756
1757        sk->sk_allocation       =       GFP_KERNEL;
1758        sk->sk_rcvbuf           =       sysctl_rmem_default;
1759        sk->sk_sndbuf           =       sysctl_wmem_default;
1760        sk->sk_state            =       TCP_CLOSE;
1761        sk_set_socket(sk, sock);
1762
1763        sock_set_flag(sk, SOCK_ZAPPED);
1764
1765        if (sock) {
1766                sk->sk_type     =       sock->type;
1767                sk->sk_sleep    =       &sock->wait;
1768                sock->sk        =       sk;
1769        } else
1770                sk->sk_sleep    =       NULL;
1771
1772        rwlock_init(&sk->sk_dst_lock);
1773        rwlock_init(&sk->sk_callback_lock);
1774        lockdep_set_class_and_name(&sk->sk_callback_lock,
1775                        af_callback_keys + sk->sk_family,
1776                        af_family_clock_key_strings[sk->sk_family]);
1777
1778        sk->sk_state_change     =       sock_def_wakeup;
1779        sk->sk_data_ready       =       sock_def_readable;
1780        sk->sk_write_space      =       sock_def_write_space;
1781        sk->sk_error_report     =       sock_def_error_report;
1782        sk->sk_destruct         =       sock_def_destruct;
1783
1784        sk->sk_sndmsg_page      =       NULL;
1785        sk->sk_sndmsg_off       =       0;
1786
1787        sk->sk_peercred.pid     =       0;
1788        sk->sk_peercred.uid     =       -1;
1789        sk->sk_peercred.gid     =       -1;
1790        sk->sk_write_pending    =       0;
1791        sk->sk_rcvlowat         =       1;
1792        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
1793        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
1794
1795        sk->sk_stamp = ktime_set(-1L, 0);
1796
1797        atomic_set(&sk->sk_refcnt, 1);
1798        atomic_set(&sk->sk_drops, 0);
1799}
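
/*
 * Editorial sketch (hypothetical names, simplified): creation paths call
 * sock_init_data() right after allocating the sock, and may then
 * override individual default callbacks:
 *
 *	sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_prot);
 *	if (sk == NULL)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *	sk->sk_data_ready = example_data_ready;
 */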
1800
1801void lock_sock_nested(struct sock *sk, int subclass)
1802{
1803        might_sleep();
1804        spin_lock_bh(&sk->sk_lock.slock);
1805        if (sk->sk_lock.owned)
1806                __lock_sock(sk);
1807        sk->sk_lock.owned = 1;
1808        spin_unlock(&sk->sk_lock.slock);
1809        /*
1810         * The sk_lock has mutex_lock() semantics here:
1811         */
1812        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
1813        local_bh_enable();
1814}
1815
1816EXPORT_SYMBOL(lock_sock_nested);
1817
1818void release_sock(struct sock *sk)
1819{
1820        /*
1821         * The sk_lock has mutex_unlock() semantics:
1822         */
1823        mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1824
1825        spin_lock_bh(&sk->sk_lock.slock);
1826        if (sk->sk_backlog.tail)
1827                __release_sock(sk);
1828        sk->sk_lock.owned = 0;
1829        if (waitqueue_active(&sk->sk_lock.wq))
1830                wake_up(&sk->sk_lock.wq);
1831        spin_unlock_bh(&sk->sk_lock.slock);
1832}
1833EXPORT_SYMBOL(release_sock);
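
/*
 * Editorial sketch: process context brackets socket state changes with
 * this pair; release_sock() also runs whatever softirqs queued to the
 * backlog in the meantime:
 *
 *	lock_sock(sk);
 *	... modify socket state ...
 *	release_sock(sk);
 */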
1834
1835int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1836{
1837        struct timeval tv;
1838        if (!sock_flag(sk, SOCK_TIMESTAMP))
1839                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
1840        tv = ktime_to_timeval(sk->sk_stamp);
1841        if (tv.tv_sec == -1)
1842                return -ENOENT;
1843        if (tv.tv_sec == 0) {
1844                sk->sk_stamp = ktime_get_real();
1845                tv = ktime_to_timeval(sk->sk_stamp);
1846        }
1847        return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
1848}
1849EXPORT_SYMBOL(sock_get_timestamp);
1850
1851int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
1852{
1853        struct timespec ts;
1854        if (!sock_flag(sk, SOCK_TIMESTAMP))
1855                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
1856        ts = ktime_to_timespec(sk->sk_stamp);
1857        if (ts.tv_sec == -1)
1858                return -ENOENT;
1859        if (ts.tv_sec == 0) {
1860                sk->sk_stamp = ktime_get_real();
1861                ts = ktime_to_timespec(sk->sk_stamp);
1862        }
1863        return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
1864}
1865EXPORT_SYMBOL(sock_get_timestampns);
1866
1867void sock_enable_timestamp(struct sock *sk, int flag)
1868{
1869        if (!sock_flag(sk, flag)) {
1870                sock_set_flag(sk, flag);
1871                /*
1872                 * We just set one of the two flags which require net
1873                 * time stamping, but time stamping might already have
1874                 * been on because of the other one.
1875                 */
1876                if (!sock_flag(sk,
1877                                flag == SOCK_TIMESTAMP ?
1878                                SOCK_TIMESTAMPING_RX_SOFTWARE :
1879                                SOCK_TIMESTAMP))
1880                        net_enable_timestamp();
1881        }
1882}
1883
1884/*
1885 *      Get a socket option on a socket.
1886 *
1887 *      FIX: POSIX 1003.1g is very ambiguous here. It states that
1888 *      asynchronous errors should be reported by getsockopt. We assume
1889 *      this means if you specify SO_ERROR (otherwise what's the point of it).
1890 */
1891int sock_common_getsockopt(struct socket *sock, int level, int optname,
1892                           char __user *optval, int __user *optlen)
1893{
1894        struct sock *sk = sock->sk;
1895
1896        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1897}
1898
1899EXPORT_SYMBOL(sock_common_getsockopt);
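
/*
 * Editorial sketch (userspace side of the SO_ERROR convention noted
 * above):
 *
 *	int err = 0;
 *	socklen_t len = sizeof(err);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) == 0 && err)
 *		errno = err;  (the pending asynchronous error)
 */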
1900
1901#ifdef CONFIG_COMPAT
1902int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
1903                                  char __user *optval, int __user *optlen)
1904{
1905        struct sock *sk = sock->sk;
1906
1907        if (sk->sk_prot->compat_getsockopt != NULL)
1908                return sk->sk_prot->compat_getsockopt(sk, level, optname,
1909                                                      optval, optlen);
1910        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1911}
1912EXPORT_SYMBOL(compat_sock_common_getsockopt);
1913#endif
1914
1915int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1916                        struct msghdr *msg, size_t size, int flags)
1917{
1918        struct sock *sk = sock->sk;
1919        int addr_len = 0;
1920        int err;
1921
1922        err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
1923                                   flags & ~MSG_DONTWAIT, &addr_len);
1924        if (err >= 0)
1925                msg->msg_namelen = addr_len;
1926        return err;
1927}
1928
1929EXPORT_SYMBOL(sock_common_recvmsg);
1930
1931/*
1932 *      Set socket options on an inet socket.
1933 */
1934int sock_common_setsockopt(struct socket *sock, int level, int optname,
1935                           char __user *optval, int optlen)
1936{
1937        struct sock *sk = sock->sk;
1938
1939        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1940}
1941
1942EXPORT_SYMBOL(sock_common_setsockopt);
1943
1944#ifdef CONFIG_COMPAT
1945int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
1946                                  char __user *optval, int optlen)
1947{
1948        struct sock *sk = sock->sk;
1949
1950        if (sk->sk_prot->compat_setsockopt != NULL)
1951                return sk->sk_prot->compat_setsockopt(sk, level, optname,
1952                                                      optval, optlen);
1953        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1954}
1955EXPORT_SYMBOL(compat_sock_common_setsockopt);
1956#endif
1957
1958void sk_common_release(struct sock *sk)
1959{
1960        if (sk->sk_prot->destroy)
1961                sk->sk_prot->destroy(sk);
1962
1963        /*
1964         * Observation: when sk_common_release() is called, processes have
1965         * no access to the socket, but the network stack still does.
1966         * Step one, detach it from networking:
1967         *
1968         * A. Remove from hash tables.
1969         */
1970
1971        sk->sk_prot->unhash(sk);
1972
1973        /*
1974         * At this point the socket cannot receive new packets, but some may
1975         * still be in flight, because another CPU running the receiver did
1976         * its hash-table lookup before we unhashed the socket. They will
1977         * reach the receive queue and be purged by the socket destructor.
1978         *
1979         * Also, we may still have packets pending on the receive queue and,
1980         * probably, our own packets waiting in device queues. sock_destroy
1981         * will drain the receive queue, but transmitted packets delay socket
1982         * destruction until the last reference is released.
1983         */
1984
1985        sock_orphan(sk);
1986
1987        xfrm_sk_free_policy(sk);
1988
1989        sk_refcnt_debug_release(sk);
1990        sock_put(sk);
1991}
1992
1993EXPORT_SYMBOL(sk_common_release);
1994
1995static DEFINE_RWLOCK(proto_list_lock);
1996static LIST_HEAD(proto_list);
1997
1998#ifdef CONFIG_PROC_FS
1999#define PROTO_INUSE_NR  64      /* should be enough for now */
2000struct prot_inuse {
2001        int val[PROTO_INUSE_NR];
2002};
2003
2004static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2005
2006#ifdef CONFIG_NET_NS
2007void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2008{
2009        int cpu = smp_processor_id();
2010        per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val;
2011}
2012EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2013
2014int sock_prot_inuse_get(struct net *net, struct proto *prot)
2015{
2016        int cpu, idx = prot->inuse_idx;
2017        int res = 0;
2018
2019        for_each_possible_cpu(cpu)
2020                res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2021
2022        return res >= 0 ? res : 0;
2023}
2024EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2025
2026static int sock_inuse_init_net(struct net *net)
2027{
2028        net->core.inuse = alloc_percpu(struct prot_inuse);
2029        return net->core.inuse ? 0 : -ENOMEM;
2030}
2031
2032static void sock_inuse_exit_net(struct net *net)
2033{
2034        free_percpu(net->core.inuse);
2035}
2036
2037static struct pernet_operations net_inuse_ops = {
2038        .init = sock_inuse_init_net,
2039        .exit = sock_inuse_exit_net,
2040};
2041
2042static __init int net_inuse_init(void)
2043{
2044        if (register_pernet_subsys(&net_inuse_ops))
2045                panic("Cannot initialize net inuse counters");
2046
2047        return 0;
2048}
2049
2050core_initcall(net_inuse_init);
2051#else
2052static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2053
2054void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2055{
2056        __get_cpu_var(prot_inuse).val[prot->inuse_idx] += val;
2057}
2058EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2059
2060int sock_prot_inuse_get(struct net *net, struct proto *prot)
2061{
2062        int cpu, idx = prot->inuse_idx;
2063        int res = 0;
2064
2065        for_each_possible_cpu(cpu)
2066                res += per_cpu(prot_inuse, cpu).val[idx];
2067
2068        return res >= 0 ? res : 0;
2069}
2070EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2071#endif
2072
2073static void assign_proto_idx(struct proto *prot)
2074{
2075        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2076
2077        if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2078                printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
2079                return;
2080        }
2081
2082        set_bit(prot->inuse_idx, proto_inuse_idx);
2083}
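
/*
 * Editorial note: the last index is never handed out; assign_proto_idx()
 * treats it as "table full", which lets PROTO_INUSE_NR - 1 double as the
 * "no index assigned" sentinel checked in release_proto_idx() below.
 */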
2084
2085static void release_proto_idx(struct proto *prot)
2086{
2087        if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2088                clear_bit(prot->inuse_idx, proto_inuse_idx);
2089}
2090#else
2091static inline void assign_proto_idx(struct proto *prot)
2092{
2093}
2094
2095static inline void release_proto_idx(struct proto *prot)
2096{
2097}
2098#endif
2099
2100int proto_register(struct proto *prot, int alloc_slab)
2101{
2102        if (alloc_slab) {
2103                prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2104                                        SLAB_HWCACHE_ALIGN | prot->slab_flags,
2105                                        NULL);
2106
2107                if (prot->slab == NULL) {
2108                        printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2109                               prot->name);
2110                        goto out;
2111                }
2112
2113                if (prot->rsk_prot != NULL) {
2114                        static const char mask[] = "request_sock_%s";
2115
2116                        prot->rsk_prot->slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2117                        if (prot->rsk_prot->slab_name == NULL)
2118                                goto out_free_sock_slab;
2119
2120                        sprintf(prot->rsk_prot->slab_name, mask, prot->name);
2121                        prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2122                                                                 prot->rsk_prot->obj_size, 0,
2123                                                                 SLAB_HWCACHE_ALIGN, NULL);
2124
2125                        if (prot->rsk_prot->slab == NULL) {
2126                                printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2127                                       prot->name);
2128                                goto out_free_request_sock_slab_name;
2129                        }
2130                }
2131
2132                if (prot->twsk_prot != NULL) {
2133                        static const char mask[] = "tw_sock_%s";
2134
2135                        prot->twsk_prot->twsk_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2136
2137                        if (prot->twsk_prot->twsk_slab_name == NULL)
2138                                goto out_free_request_sock_slab;
2139
2140                        sprintf(prot->twsk_prot->twsk_slab_name, mask, prot->name);
2141                        prot->twsk_prot->twsk_slab =
2142                                kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2143                                                  prot->twsk_prot->twsk_obj_size,
2144                                                  0,
2145                                                  SLAB_HWCACHE_ALIGN |
2146                                                        prot->slab_flags,
2147                                                  NULL);
2148                        if (prot->twsk_prot->twsk_slab == NULL)
2149                                goto out_free_timewait_sock_slab_name;
2150                }
2151        }
2152
2153        write_lock(&proto_list_lock);
2154        list_add(&prot->node, &proto_list);
2155        assign_proto_idx(prot);
2156        write_unlock(&proto_list_lock);
2157        return 0;
2158
2159out_free_timewait_sock_slab_name:
2160        kfree(prot->twsk_prot->twsk_slab_name);
2161out_free_request_sock_slab:
2162        if (prot->rsk_prot && prot->rsk_prot->slab) {
2163                kmem_cache_destroy(prot->rsk_prot->slab);
2164                prot->rsk_prot->slab = NULL;
2165        }
2166out_free_request_sock_slab_name:
2167        if (prot->rsk_prot) kfree(prot->rsk_prot->slab_name);
2168out_free_sock_slab:
2169        kmem_cache_destroy(prot->slab);
2170        prot->slab = NULL;
2171out:
2172        return -ENOBUFS;
2173}
2174
2175EXPORT_SYMBOL(proto_register);
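
/*
 * Editorial sketch (hypothetical names; compare e.g. udp_prot): a
 * protocol module registers its struct proto at init time and
 * unregisters it on exit:
 *
 *	static struct proto example_prot = {
 *		.name     = "EXAMPLE",
 *		.owner    = THIS_MODULE,
 *		.obj_size = sizeof(struct example_sock),
 *	};
 *
 *	err = proto_register(&example_prot, 1);  (1 => allocate a slab)
 *	...
 *	proto_unregister(&example_prot);
 */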
2176
2177void proto_unregister(struct proto *prot)
2178{
2179        write_lock(&proto_list_lock);
2180        release_proto_idx(prot);
2181        list_del(&prot->node);
2182        write_unlock(&proto_list_lock);
2183
2184        if (prot->slab != NULL) {
2185                kmem_cache_destroy(prot->slab);
2186                prot->slab = NULL;
2187        }
2188
2189        if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2190                kmem_cache_destroy(prot->rsk_prot->slab);
2191                kfree(prot->rsk_prot->slab_name);
2192                prot->rsk_prot->slab = NULL;
2193        }
2194
2195        if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2196                kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2197                kfree(prot->twsk_prot->twsk_slab_name);
2198                prot->twsk_prot->twsk_slab = NULL;
2199        }
2200}
2201
2202EXPORT_SYMBOL(proto_unregister);
2203
2204#ifdef CONFIG_PROC_FS
2205static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2206        __acquires(proto_list_lock)
2207{
2208        read_lock(&proto_list_lock);
2209        return seq_list_start_head(&proto_list, *pos);
2210}
2211
2212static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2213{
2214        return seq_list_next(v, &proto_list, pos);
2215}
2216
2217static void proto_seq_stop(struct seq_file *seq, void *v)
2218        __releases(proto_list_lock)
2219{
2220        read_unlock(&proto_list_lock);
2221}
2222
2223static char proto_method_implemented(const void *method)
2224{
2225        return method == NULL ? 'n' : 'y';
2226}
2227
2228static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2229{
2230        seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
2231                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2232                   proto->name,
2233                   proto->obj_size,
2234                   sock_prot_inuse_get(seq_file_net(seq), proto),
2235                   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
2236                   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
2237                   proto->max_header,
2238                   proto->slab == NULL ? "no" : "yes",
2239                   module_name(proto->owner),
2240                   proto_method_implemented(proto->close),
2241                   proto_method_implemented(proto->connect),
2242                   proto_method_implemented(proto->disconnect),
2243                   proto_method_implemented(proto->accept),
2244                   proto_method_implemented(proto->ioctl),
2245                   proto_method_implemented(proto->init),
2246                   proto_method_implemented(proto->destroy),
2247                   proto_method_implemented(proto->shutdown),
2248                   proto_method_implemented(proto->setsockopt),
2249                   proto_method_implemented(proto->getsockopt),
2250                   proto_method_implemented(proto->sendmsg),
2251                   proto_method_implemented(proto->recvmsg),
2252                   proto_method_implemented(proto->sendpage),
2253                   proto_method_implemented(proto->bind),
2254                   proto_method_implemented(proto->backlog_rcv),
2255                   proto_method_implemented(proto->hash),
2256                   proto_method_implemented(proto->unhash),
2257                   proto_method_implemented(proto->get_port),
2258                   proto_method_implemented(proto->enter_memory_pressure));
2259}
2260
2261static int proto_seq_show(struct seq_file *seq, void *v)
2262{
2263        if (v == &proto_list)
2264                seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2265                           "protocol",
2266                           "size",
2267                           "sockets",
2268                           "memory",
2269                           "press",
2270                           "maxhdr",
2271                           "slab",
2272                           "module",
2273                           "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2274        else
2275                proto_seq_printf(seq, list_entry(v, struct proto, node));
2276        return 0;
2277}
2278
2279static const struct seq_operations proto_seq_ops = {
2280        .start  = proto_seq_start,
2281        .next   = proto_seq_next,
2282        .stop   = proto_seq_stop,
2283        .show   = proto_seq_show,
2284};
2285
2286static int proto_seq_open(struct inode *inode, struct file *file)
2287{
2288        return seq_open_net(inode, file, &proto_seq_ops,
2289                            sizeof(struct seq_net_private));
2290}
2291
2292static const struct file_operations proto_seq_fops = {
2293        .owner          = THIS_MODULE,
2294        .open           = proto_seq_open,
2295        .read           = seq_read,
2296        .llseek         = seq_lseek,
2297        .release        = seq_release_net,
2298};
2299
2300static __net_init int proto_init_net(struct net *net)
2301{
2302        if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2303                return -ENOMEM;
2304
2305        return 0;
2306}
2307
2308static __net_exit void proto_exit_net(struct net *net)
2309{
2310        proc_net_remove(net, "protocols");
2311}
2312
2313
2314static __net_initdata struct pernet_operations proto_net_ops = {
2315        .init = proto_init_net,
2316        .exit = proto_exit_net,
2317};
2318
2319static int __init proto_init(void)
2320{
2321        return register_pernet_subsys(&proto_net_ops);
2322}
2323
2324subsys_initcall(proto_init);
2325
2326#endif /* PROC_FS */
2327
2328EXPORT_SYMBOL(sk_alloc);
2329EXPORT_SYMBOL(sk_free);
2330EXPORT_SYMBOL(sk_send_sigurg);
2331EXPORT_SYMBOL(sock_alloc_send_skb);
2332EXPORT_SYMBOL(sock_init_data);
2333EXPORT_SYMBOL(sock_kfree_s);
2334EXPORT_SYMBOL(sock_kmalloc);
2335EXPORT_SYMBOL(sock_no_accept);
2336EXPORT_SYMBOL(sock_no_bind);
2337EXPORT_SYMBOL(sock_no_connect);
2338EXPORT_SYMBOL(sock_no_getname);
2339EXPORT_SYMBOL(sock_no_getsockopt);
2340EXPORT_SYMBOL(sock_no_ioctl);
2341EXPORT_SYMBOL(sock_no_listen);
2342EXPORT_SYMBOL(sock_no_mmap);
2343EXPORT_SYMBOL(sock_no_poll);
2344EXPORT_SYMBOL(sock_no_recvmsg);
2345EXPORT_SYMBOL(sock_no_sendmsg);
2346EXPORT_SYMBOL(sock_no_sendpage);
2347EXPORT_SYMBOL(sock_no_setsockopt);
2348EXPORT_SYMBOL(sock_no_shutdown);
2349EXPORT_SYMBOL(sock_no_socketpair);
2350EXPORT_SYMBOL(sock_rfree);
2351EXPORT_SYMBOL(sock_setsockopt);
2352EXPORT_SYMBOL(sock_wfree);
2353EXPORT_SYMBOL(sock_wmalloc);
2354EXPORT_SYMBOL(sock_i_uid);
2355EXPORT_SYMBOL(sock_i_ino);
2356EXPORT_SYMBOL(sysctl_optmem_max);
2357