linux/net/packet/af_packet.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              PACKET - implements raw packet sockets.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *              Alan Cox        :       verify_area() now used correctly
 *              Alan Cox        :       new skbuff lists, look ma no backlogs!
 *              Alan Cox        :       tidied skbuff lists.
 *              Alan Cox        :       Now uses generic datagram routines I
 *                                      added. Also fixed the peek/read crash
 *                                      from all old Linux datagram code.
 *              Alan Cox        :       Uses the improved datagram code.
 *              Alan Cox        :       Added NULL's for socket options.
 *              Alan Cox        :       Re-commented the code.
 *              Alan Cox        :       Use new kernel side addressing
 *              Rob Janssen     :       Correct MTU usage.
 *              Dave Platt      :       Counter leaks caused by incorrect
 *                                      interrupt locking and some slightly
 *                                      dubious gcc output. Can you read
 *                                      compiler: it said _VOLATILE_
 *      Richard Kooijman        :       Timestamp fixes.
 *              Alan Cox        :       New buffers. Use sk->mac.raw.
 *              Alan Cox        :       sendmsg/recvmsg support.
 *              Alan Cox        :       Protocol setting support
 *      Alexey Kuznetsov        :       Untied from IPv4 stack.
 *      Cyrus Durgin            :       Fixed kerneld for kmod.
 *      Michal Ostrowski        :       Module initialization cleanup.
 *         Ulises Alonso        :       Frame number limit removal and
 *                                      packet_set_ring memory leak.
 *              Eric Biederman  :       Allow for > 8 byte hardware addresses.
 *                                      The convention is that longer addresses
 *                                      will simply extend the hardware address
 *                                      byte arrays at the end of sockaddr_ll
 *                                      and packet_mreq.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>

#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif

/*
   Assumptions:
   - if the device has no dev->hard_header routine, it adds and removes the
     ll header inside itself. In this case the ll header is invisible outside
     of the device, but higher levels still should reserve
     dev->hard_header_len.  Some devices are clever enough to reallocate the
     skb when the header does not fit into the reserved space (tunnels);
     others are silly (PPP).
   - a packet socket receives packets with the ll header pulled,
     so SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header != NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header == NULL
   mac_header -> UNKNOWN position. It very likely points to the ll header,
                 as PPP arranges, but that is wrong, because it introduces
                 asymmetry between the rx and tx paths.
   data       -> data

Outgoing, dev->hard_header == NULL
   mac_header -> data. The ll header is still not built!
   data       -> data

Summary:
  If dev->hard_header == NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by the device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position;
   the packet classifier depends on it.
 */
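
/*
 * Illustrative userspace sketch of the layout rules above (an assumption
 * of typical packet(7) usage, not part of this file; needs CAP_NET_RAW).
 * With SOCK_RAW the ll header is part of the data read; with SOCK_DGRAM
 * it is stripped on receive and rebuilt by the kernel on transmit, and is
 * only described by the accompanying sockaddr_ll:
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *	#include <linux/if_ether.h>
 *	#include <arpa/inet.h>
 *
 *	// SOCK_RAW: buf[0] is the first byte of the link-level header.
 *	int raw = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	// SOCK_DGRAM: buf[0] is the first byte of the network header.
 *	int dgram = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL));
 *
 *	unsigned char buf[2048];
 *	struct sockaddr_ll from;
 *	socklen_t fromlen = sizeof(from);
 *	ssize_t n = recvfrom(raw, buf, sizeof(buf), 0,
 *			     (struct sockaddr *)&from, &fromlen);
 */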

/* Private packet socket structures. */

struct packet_mclist {
        struct packet_mclist    *next;
        int                     ifindex;
        int                     count;
        unsigned short          type;
        unsigned short          alen;
        unsigned char           addr[MAX_ADDR_LEN];
};

/* Identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
        int             mr_ifindex;
        unsigned short  mr_type;
        unsigned short  mr_alen;
        unsigned char   mr_address[MAX_ADDR_LEN];
};

#ifdef CONFIG_PACKET_MMAP
static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing);
#endif

static void packet_flush_mclist(struct sock *sk);

struct packet_sock {
        /* struct sock has to be the first member of packet_sock */
        struct sock             sk;
        struct tpacket_stats    stats;
#ifdef CONFIG_PACKET_MMAP
        char                    **pg_vec;
        unsigned int            head;
        unsigned int            frames_per_block;
        unsigned int            frame_size;
        unsigned int            frame_max;
        int                     copy_thresh;
#endif
        struct packet_type      prot_hook;
        spinlock_t              bind_lock;
        struct mutex            pg_vec_lock;
        unsigned int            running:1,      /* prot_hook is attached */
                                auxdata:1,
                                origdev:1;
        int                     ifindex;        /* bound device         */
        __be16                  num;
        struct packet_mclist    *mclist;
#ifdef CONFIG_PACKET_MMAP
        atomic_t                mapped;
        unsigned int            pg_vec_order;
        unsigned int            pg_vec_pages;
        unsigned int            pg_vec_len;
        enum tpacket_versions   tp_version;
        unsigned int            tp_hdrlen;
        unsigned int            tp_reserve;
#endif
};

struct packet_skb_cb {
        unsigned int origlen;
        union {
                struct sockaddr_pkt pkt;
                struct sockaddr_ll ll;
        } sa;
};

#define PACKET_SKB_CB(__skb)    ((struct packet_skb_cb *)((__skb)->cb))

#ifdef CONFIG_PACKET_MMAP

static void *packet_lookup_frame(struct packet_sock *po, unsigned int position,
                                 int status)
{
        unsigned int pg_vec_pos, frame_offset;
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } h;

        pg_vec_pos = position / po->frames_per_block;
        frame_offset = position % po->frames_per_block;

        h.raw = po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size);
        switch (po->tp_version) {
        case TPACKET_V1:
                if (status != (h.h1->tp_status ? TP_STATUS_USER :
                                                TP_STATUS_KERNEL))
                        return NULL;
                break;
        case TPACKET_V2:
                if (status != (h.h2->tp_status ? TP_STATUS_USER :
                                                TP_STATUS_KERNEL))
                        return NULL;
                break;
        }
        return h.raw;
}

static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } h;

        h.raw = frame;
        switch (po->tp_version) {
        case TPACKET_V1:
                h.h1->tp_status = status;
                break;
        case TPACKET_V2:
                h.h2->tp_status = status;
                break;
        }
}
#endif
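
/*
 * Illustrative userspace counterpart to the status handshake above (an
 * assumption, not part of this file): tp_status is the producer/consumer
 * flag for each mmap'ed ring frame.  The kernel publishes a frame by
 * setting TP_STATUS_USER; userspace hands the slot back by storing
 * TP_STATUS_KERNEL.  The kernel side orders its stores with smp_mb(); a
 * userspace consumer needs a matching barrier on weakly ordered CPUs:
 *
 *	struct tpacket_hdr *hdr = frame;	// one slot of the ring
 *
 *	if (hdr->tp_status & TP_STATUS_USER) {
 *		// packet data lives at (char *)hdr + hdr->tp_mac,
 *		// hdr->tp_snaplen bytes of it
 *		hdr->tp_status = TP_STATUS_KERNEL;	// release the slot
 *	}
 */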

static inline struct packet_sock *pkt_sk(struct sock *sk)
{
        return (struct packet_sock *)sk;
}

static void packet_sock_destruct(struct sock *sk)
{
        WARN_ON(atomic_read(&sk->sk_rmem_alloc));
        WARN_ON(atomic_read(&sk->sk_wmem_alloc));

        if (!sock_flag(sk, SOCK_DEAD)) {
                printk(KERN_ERR "Attempt to release alive packet socket: %p\n",
                       sk);
                return;
        }

        sk_refcnt_debug_dec(sk);
}


static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;

static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
                           struct packet_type *pt, struct net_device *orig_dev)
{
        struct sock *sk;
        struct sockaddr_pkt *spkt;

        /*
         *      When we registered the protocol we saved the socket in the
         *      data field for just this event.
         */

        sk = pt->af_packet_priv;

        /*
         *      Yank back the headers [hope the device set this
         *      right or kerboom...]
         *
         *      Incoming packets have the ll header pulled;
         *      push it back.
         *
         *      For outgoing ones skb->data == skb_mac_header(skb),
         *      so this procedure is a no-op.
         */

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto out;

        if (dev_net(dev) != sock_net(sk))
                goto out;

        if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
                goto oom;

        /* drop any routing info */
        dst_release(skb->dst);
        skb->dst = NULL;

        /* drop conntrack reference */
        nf_reset(skb);

        spkt = &PACKET_SKB_CB(skb)->sa.pkt;

        skb_push(skb, skb->data - skb_mac_header(skb));

        /*
         *      The SOCK_PACKET socket receives _all_ frames.
         */

        spkt->spkt_family = dev->type;
        strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
        spkt->spkt_protocol = skb->protocol;

        /*
         *      Charge the memory to the socket. This is done specifically
         *      to prevent sockets using all the memory up.
         */

        if (sock_queue_rcv_skb(sk, skb) == 0)
                return 0;

out:
        kfree_skb(skb);
oom:
        return 0;
}


/*
 *      Output a raw packet to a device layer. This bypasses all the other
 *      protocol layers and you must therefore supply it with a complete frame
 */

static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
                               struct msghdr *msg, size_t len)
{
        struct sock *sk = sock->sk;
        struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
        struct sk_buff *skb;
        struct net_device *dev;
        __be16 proto = 0;
        int err;

        /*
         *      Get and verify the address.
         */

        if (saddr) {
                if (msg->msg_namelen < sizeof(struct sockaddr))
                        return -EINVAL;
                if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
                        proto = saddr->spkt_protocol;
        } else
                return -ENOTCONN;       /* SOCK_PACKET must be sent giving an address */

        /*
         *      Find the device first to size check it
         */

        saddr->spkt_device[13] = 0;
        dev = dev_get_by_name(sock_net(sk), saddr->spkt_device);
        err = -ENODEV;
        if (dev == NULL)
                goto out_unlock;

        err = -ENETDOWN;
        if (!(dev->flags & IFF_UP))
                goto out_unlock;

        /*
         *      You may not queue a frame bigger than the MTU. This is the
         *      lowest level raw protocol and you must do your own
         *      fragmentation at this level.
         */

        err = -EMSGSIZE;
        if (len > dev->mtu + dev->hard_header_len)
                goto out_unlock;

        err = -ENOBUFS;
        skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);

        /*
         *      If the write buffer is full, then tough. At this level the
         *      user gets to deal with the problem - do your own algorithmic
         *      backoffs. That's far more flexible.
         */

        if (skb == NULL)
                goto out_unlock;

        /*
         *      Fill it in
         */

        /* FIXME: Save some space for broken drivers that write a
         * hard header at transmission time by themselves. PPP is the
         * notable one here. This should really be fixed at the driver level.
         */
        skb_reserve(skb, LL_RESERVED_SPACE(dev));
        skb_reset_network_header(skb);

        /* Try to align data part correctly */
        if (dev->header_ops) {
                skb->data -= dev->hard_header_len;
                skb->tail -= dev->hard_header_len;
                if (len < dev->hard_header_len)
                        skb_reset_network_header(skb);
        }

        /* Returns -EFAULT on error */
        err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
        skb->protocol = proto;
        skb->dev = dev;
        skb->priority = sk->sk_priority;
        if (err)
                goto out_free;

        /*
         *      Now send it
         */

        dev_queue_xmit(skb);
        dev_put(dev);
        return len;

out_free:
        kfree_skb(skb);
out_unlock:
        if (dev)
                dev_put(dev);
        return err;
}
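
/*
 * Illustrative userspace sketch of this legacy transmit path (an
 * assumption; SOCK_PACKET is obsolete and kept for compatibility, error
 * handling omitted).  The destination is a device name, and frame[] must
 * already be a complete frame, ll header included:
 *
 *	int fd = socket(AF_PACKET, SOCK_PACKET, htons(ETH_P_ALL));
 *	struct sockaddr_pkt spkt;
 *
 *	memset(&spkt, 0, sizeof(spkt));
 *	spkt.spkt_family = AF_PACKET;
 *	strncpy((char *)spkt.spkt_device, "eth0", sizeof(spkt.spkt_device));
 *	spkt.spkt_protocol = htons(ETH_P_IP);
 *	// frame/frame_len are placeholders: a full ll header + payload,
 *	// no larger than dev->mtu + dev->hard_header_len (checked above)
 *	sendto(fd, frame, frame_len, 0,
 *	       (struct sockaddr *)&spkt, sizeof(spkt));
 */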

static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
                                      unsigned int res)
{
        struct sk_filter *filter;

        rcu_read_lock_bh();
        filter = rcu_dereference(sk->sk_filter);
        if (filter != NULL)
                res = sk_run_filter(skb, filter->insns, filter->len);
        rcu_read_unlock_bh();

        return res;
}
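
/*
 * Illustrative sketch of how the filter consulted by run_filter() gets
 * attached from userspace (an assumption, not part of this file).  The
 * filter's return value is the number of bytes to keep; 0 drops the
 * packet.  This one-instruction classic BPF program accepts every packet
 * but truncates it to 96 bytes:
 *
 *	#include <linux/filter.h>
 *
 *	struct sock_filter code[] = {
 *		{ BPF_RET | BPF_K, 0, 0, 96 },	// return 96
 *	};
 *	struct sock_fprog prog = {
 *		.len	= 1,
 *		.filter	= code,
 *	};
 *
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 */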

/*
   This function makes lazy skb cloning in the hope that most packets
   are discarded by BPF.

   Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
   and skb->cb are mangled. It works because (and until) packets
   falling here are owned by the current CPU. Output packets are cloned
   by dev_queue_xmit_nit(), input packets are processed by net_bh
   sequentially, so that if we return the skb to its original state on
   exit, we will not harm anyone.
 */

static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
                      struct packet_type *pt, struct net_device *orig_dev)
{
        struct sock *sk;
        struct sockaddr_ll *sll;
        struct packet_sock *po;
        u8 *skb_head = skb->data;
        int skb_len = skb->len;
        unsigned int snaplen, res;

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto drop;

        sk = pt->af_packet_priv;
        po = pkt_sk(sk);

        if (dev_net(dev) != sock_net(sk))
                goto drop;

        skb->dev = dev;
        if (dev->header_ops) {
                /* The device has an explicit notion of ll header,
                   exported to higher levels.

                   Otherwise, the device hides the details of its frame
                   structure, so that the corresponding packet head is
                   never delivered to the user.
                 */
                if (sk->sk_type != SOCK_DGRAM)
                        skb_push(skb, skb->data - skb_mac_header(skb));
                else if (skb->pkt_type == PACKET_OUTGOING) {
                        /* Special case: outgoing packets have ll header at head */
                        skb_pull(skb, skb_network_offset(skb));
                }
        }

        snaplen = skb->len;

        res = run_filter(skb, sk, snaplen);
        if (!res)
                goto drop_n_restore;
        if (snaplen > res)
                snaplen = res;

        if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
            (unsigned)sk->sk_rcvbuf)
                goto drop_n_acct;

        if (skb_shared(skb)) {
                struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
                if (nskb == NULL)
                        goto drop_n_acct;

                if (skb_head != skb->data) {
                        skb->data = skb_head;
                        skb->len = skb_len;
                }
                kfree_skb(skb);
                skb = nskb;
        }

        BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
                     sizeof(skb->cb));

        sll = &PACKET_SKB_CB(skb)->sa.ll;
        sll->sll_family = AF_PACKET;
        sll->sll_hatype = dev->type;
        sll->sll_protocol = skb->protocol;
        sll->sll_pkttype = skb->pkt_type;
        if (unlikely(po->origdev))
                sll->sll_ifindex = orig_dev->ifindex;
        else
                sll->sll_ifindex = dev->ifindex;

        sll->sll_halen = dev_parse_header(skb, sll->sll_addr);

        PACKET_SKB_CB(skb)->origlen = skb->len;

        if (pskb_trim(skb, snaplen))
                goto drop_n_acct;

        skb_set_owner_r(skb, sk);
        skb->dev = NULL;
        dst_release(skb->dst);
        skb->dst = NULL;

        /* drop conntrack reference */
        nf_reset(skb);

        spin_lock(&sk->sk_receive_queue.lock);
        po->stats.tp_packets++;
        __skb_queue_tail(&sk->sk_receive_queue, skb);
        spin_unlock(&sk->sk_receive_queue.lock);
        sk->sk_data_ready(sk, skb->len);
        return 0;

drop_n_acct:
        spin_lock(&sk->sk_receive_queue.lock);
        po->stats.tp_drops++;
        spin_unlock(&sk->sk_receive_queue.lock);

drop_n_restore:
        if (skb_head != skb->data && skb_shared(skb)) {
                skb->data = skb_head;
                skb->len = skb_len;
        }
drop:
        consume_skb(skb);
        return 0;
}

#ifdef CONFIG_PACKET_MMAP
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
                       struct packet_type *pt, struct net_device *orig_dev)
{
        struct sock *sk;
        struct packet_sock *po;
        struct sockaddr_ll *sll;
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } h;
        u8 *skb_head = skb->data;
        int skb_len = skb->len;
        unsigned int snaplen, res;
        unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
        unsigned short macoff, netoff, hdrlen;
        struct sk_buff *copy_skb = NULL;
        struct timeval tv;
        struct timespec ts;

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto drop;

        sk = pt->af_packet_priv;
        po = pkt_sk(sk);

        if (dev_net(dev) != sock_net(sk))
                goto drop;

        if (dev->header_ops) {
                if (sk->sk_type != SOCK_DGRAM)
                        skb_push(skb, skb->data - skb_mac_header(skb));
                else if (skb->pkt_type == PACKET_OUTGOING) {
                        /* Special case: outgoing packets have ll header at head */
                        skb_pull(skb, skb_network_offset(skb));
                }
        }

        if (skb->ip_summed == CHECKSUM_PARTIAL)
                status |= TP_STATUS_CSUMNOTREADY;

        snaplen = skb->len;

        res = run_filter(skb, sk, snaplen);
        if (!res)
                goto drop_n_restore;
        if (snaplen > res)
                snaplen = res;

        if (sk->sk_type == SOCK_DGRAM) {
                macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
                                  po->tp_reserve;
        } else {
                unsigned maclen = skb_network_offset(skb);
                netoff = TPACKET_ALIGN(po->tp_hdrlen +
                                       (maclen < 16 ? 16 : maclen)) +
                        po->tp_reserve;
                macoff = netoff - maclen;
        }

        if (macoff + snaplen > po->frame_size) {
                if (po->copy_thresh &&
                    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
                    (unsigned)sk->sk_rcvbuf) {
                        if (skb_shared(skb)) {
                                copy_skb = skb_clone(skb, GFP_ATOMIC);
                        } else {
                                copy_skb = skb_get(skb);
                                skb_head = skb->data;
                        }
                        if (copy_skb)
                                skb_set_owner_r(copy_skb, sk);
                }
                snaplen = po->frame_size - macoff;
                if ((int)snaplen < 0)
                        snaplen = 0;
        }

        spin_lock(&sk->sk_receive_queue.lock);
        h.raw = packet_lookup_frame(po, po->head, TP_STATUS_KERNEL);
        if (!h.raw)
                goto ring_is_full;
        po->head = po->head != po->frame_max ? po->head + 1 : 0;
        po->stats.tp_packets++;
        if (copy_skb) {
                status |= TP_STATUS_COPY;
                __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
        }
        if (!po->stats.tp_drops)
                status &= ~TP_STATUS_LOSING;
        spin_unlock(&sk->sk_receive_queue.lock);

        skb_copy_bits(skb, 0, h.raw + macoff, snaplen);

        switch (po->tp_version) {
        case TPACKET_V1:
                h.h1->tp_len = skb->len;
                h.h1->tp_snaplen = snaplen;
                h.h1->tp_mac = macoff;
                h.h1->tp_net = netoff;
                if (skb->tstamp.tv64)
                        tv = ktime_to_timeval(skb->tstamp);
                else
                        do_gettimeofday(&tv);
                h.h1->tp_sec = tv.tv_sec;
                h.h1->tp_usec = tv.tv_usec;
                hdrlen = sizeof(*h.h1);
                break;
        case TPACKET_V2:
                h.h2->tp_len = skb->len;
                h.h2->tp_snaplen = snaplen;
                h.h2->tp_mac = macoff;
                h.h2->tp_net = netoff;
                if (skb->tstamp.tv64)
                        ts = ktime_to_timespec(skb->tstamp);
                else
                        getnstimeofday(&ts);
                h.h2->tp_sec = ts.tv_sec;
                h.h2->tp_nsec = ts.tv_nsec;
                h.h2->tp_vlan_tci = skb->vlan_tci;
                hdrlen = sizeof(*h.h2);
                break;
        default:
                BUG();
        }

        sll = h.raw + TPACKET_ALIGN(hdrlen);
        sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
        sll->sll_family = AF_PACKET;
        sll->sll_hatype = dev->type;
        sll->sll_protocol = skb->protocol;
        sll->sll_pkttype = skb->pkt_type;
        if (unlikely(po->origdev))
                sll->sll_ifindex = orig_dev->ifindex;
        else
                sll->sll_ifindex = dev->ifindex;

        __packet_set_status(po, h.raw, status);
        smp_mb();

        {
                struct page *p_start, *p_end;
                u8 *h_end = h.raw + macoff + snaplen - 1;

                p_start = virt_to_page(h.raw);
                p_end = virt_to_page(h_end);
                while (p_start <= p_end) {
                        flush_dcache_page(p_start);
                        p_start++;
                }
        }

        sk->sk_data_ready(sk, 0);

drop_n_restore:
        if (skb_head != skb->data && skb_shared(skb)) {
                skb->data = skb_head;
                skb->len = skb_len;
        }
drop:
        kfree_skb(skb);
        return 0;

ring_is_full:
        po->stats.tp_drops++;
        spin_unlock(&sk->sk_receive_queue.lock);

        sk->sk_data_ready(sk, 0);
        kfree_skb(copy_skb);
        goto drop_n_restore;
}

#endif
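
/*
 * Illustrative userspace sketch of consuming the ring filled by
 * tpacket_rcv() above (an assumption; sizes are examples, the default
 * TPACKET_V1 header layout is assumed, error handling omitted):
 *
 *	#include <sys/mman.h>
 *	#include <poll.h>
 *
 *	struct tpacket_req req = {
 *		.tp_block_size	= 4096,		// multiple of PAGE_SIZE
 *		.tp_block_nr	= 64,
 *		.tp_frame_size	= 2048,		// must divide tp_block_size
 *		.tp_frame_nr	= 64 * (4096 / 2048),
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *
 *	char *ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *	unsigned int i = 0;		// consumer position, mirrors po->head
 *	for (;;) {
 *		// same addressing as packet_lookup_frame(): block, then frame
 *		struct tpacket_hdr *hdr = (struct tpacket_hdr *)
 *			(ring + (i / (4096 / 2048)) * 4096
 *			      + (i % (4096 / 2048)) * 2048);
 *		if (!(hdr->tp_status & TP_STATUS_USER)) {
 *			poll(&pfd, 1, -1);	// wait for sk_data_ready()
 *			continue;
 *		}
 *		// packet bytes are at (char *)hdr + hdr->tp_mac
 *		hdr->tp_status = TP_STATUS_KERNEL;	// give the slot back
 *		i = (i + 1) % req.tp_frame_nr;
 *	}
 */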


static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
                          struct msghdr *msg, size_t len)
{
        struct sock *sk = sock->sk;
        struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
        struct sk_buff *skb;
        struct net_device *dev;
        __be16 proto;
        unsigned char *addr;
        int ifindex, err, reserve = 0;

        /*
         *      Get and verify the address.
         */

        if (saddr == NULL) {
                struct packet_sock *po = pkt_sk(sk);

                ifindex = po->ifindex;
                proto   = po->num;
                addr    = NULL;
        } else {
                err = -EINVAL;
                if (msg->msg_namelen < sizeof(struct sockaddr_ll))
                        goto out;
                if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
                        goto out;
                ifindex = saddr->sll_ifindex;
                proto   = saddr->sll_protocol;
                addr    = saddr->sll_addr;
        }


        dev = dev_get_by_index(sock_net(sk), ifindex);
        err = -ENXIO;
        if (dev == NULL)
                goto out_unlock;
        if (sock->type == SOCK_RAW)
                reserve = dev->hard_header_len;

        err = -ENETDOWN;
        if (!(dev->flags & IFF_UP))
                goto out_unlock;

        err = -EMSGSIZE;
        if (len > dev->mtu + reserve)
                goto out_unlock;

        skb = sock_alloc_send_skb(sk, len + LL_ALLOCATED_SPACE(dev),
                                msg->msg_flags & MSG_DONTWAIT, &err);
        if (skb == NULL)
                goto out_unlock;

        skb_reserve(skb, LL_RESERVED_SPACE(dev));
        skb_reset_network_header(skb);

        err = -EINVAL;
        if (sock->type == SOCK_DGRAM &&
            dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0)
                goto out_free;

        /* Returns -EFAULT on error */
        err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
        if (err)
                goto out_free;

        skb->protocol = proto;
        skb->dev = dev;
        skb->priority = sk->sk_priority;

        /*
         *      Now send it
         */

        err = dev_queue_xmit(skb);
        if (err > 0 && (err = net_xmit_errno(err)) != 0)
                goto out_unlock;

        dev_put(dev);

        return len;

out_free:
        kfree_skb(skb);
out_unlock:
        if (dev)
                dev_put(dev);
out:
        return err;
}
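
/*
 * Illustrative userspace sketch of this transmit path (an assumption;
 * fd, payload, payload_len and dest_mac are placeholders).  An
 * unconnected packet socket names the device per call through
 * sockaddr_ll; with SOCK_DGRAM the kernel builds the ll header from
 * sll_addr via dev_hard_header(), while SOCK_RAW expects the caller to
 * have written it already:
 *
 *	#include <net/if.h>		// if_nametoindex()
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family	= AF_PACKET,
 *		.sll_ifindex	= if_nametoindex("eth0"),
 *		.sll_protocol	= htons(ETH_P_IP),
 *		.sll_halen	= ETH_ALEN,
 *	};
 *	memcpy(sll.sll_addr, dest_mac, ETH_ALEN);
 *	sendto(fd, payload, payload_len, 0,
 *	       (struct sockaddr *)&sll, sizeof(sll));
 */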

/*
 *      Close a PACKET socket. This is fairly simple. We immediately go
 *      to 'closed' state and remove our protocol entry in the device list.
 */

static int packet_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po;
        struct net *net;

        if (!sk)
                return 0;

        net = sock_net(sk);
        po = pkt_sk(sk);

        write_lock_bh(&net->packet.sklist_lock);
        sk_del_node_init(sk);
        sock_prot_inuse_add(net, sk->sk_prot, -1);
        write_unlock_bh(&net->packet.sklist_lock);

        /*
         *      Unhook packet receive handler.
         */

        if (po->running) {
                /*
                 *      Remove the protocol hook
                 */
                dev_remove_pack(&po->prot_hook);
                po->running = 0;
                po->num = 0;
                __sock_put(sk);
        }

        packet_flush_mclist(sk);

#ifdef CONFIG_PACKET_MMAP
        if (po->pg_vec) {
                struct tpacket_req req;
                memset(&req, 0, sizeof(req));
                packet_set_ring(sk, &req, 1);
        }
#endif

        /*
         *      Now the socket is dead. No more input will appear.
         */

        sock_orphan(sk);
        sock->sk = NULL;

        /* Purge queues */

        skb_queue_purge(&sk->sk_receive_queue);
        sk_refcnt_debug_release(sk);

        sock_put(sk);
        return 0;
}

/*
 *      Attach a packet hook.
 */

static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
{
        struct packet_sock *po = pkt_sk(sk);

        /*
         *      Detach an existing hook if present.
         */

        lock_sock(sk);

        spin_lock(&po->bind_lock);
        if (po->running) {
                __sock_put(sk);
                po->running = 0;
                po->num = 0;
                spin_unlock(&po->bind_lock);
                dev_remove_pack(&po->prot_hook);
                spin_lock(&po->bind_lock);
        }

        po->num = protocol;
        po->prot_hook.type = protocol;
        po->prot_hook.dev = dev;

        po->ifindex = dev ? dev->ifindex : 0;

        if (protocol == 0)
                goto out_unlock;

        if (!dev || (dev->flags & IFF_UP)) {
                dev_add_pack(&po->prot_hook);
                sock_hold(sk);
                po->running = 1;
        } else {
                sk->sk_err = ENETDOWN;
                if (!sock_flag(sk, SOCK_DEAD))
                        sk->sk_error_report(sk);
        }

out_unlock:
        spin_unlock(&po->bind_lock);
        release_sock(sk);
        return 0;
}

/*
 *      Bind a packet socket to a device
 */

static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
        struct sock *sk = sock->sk;
        char name[15];
        struct net_device *dev;
        int err = -ENODEV;

        /*
         *      Check legality
         */

        if (addr_len != sizeof(struct sockaddr))
                return -EINVAL;
        strlcpy(name, uaddr->sa_data, sizeof(name));

        dev = dev_get_by_name(sock_net(sk), name);
        if (dev) {
                err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
                dev_put(dev);
        }
        return err;
}

static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
        struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
        struct sock *sk = sock->sk;
        struct net_device *dev = NULL;
        int err;


        /*
         *      Check legality
         */

        if (addr_len < sizeof(struct sockaddr_ll))
                return -EINVAL;
        if (sll->sll_family != AF_PACKET)
                return -EINVAL;

        if (sll->sll_ifindex) {
                err = -ENODEV;
                dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
                if (dev == NULL)
                        goto out;
        }
        err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
        if (dev)
                dev_put(dev);

out:
        return err;
}
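
/*
 * Illustrative userspace sketch of binding (an assumption, not part of
 * this file).  Note that both the socket() protocol argument and
 * sll_protocol are in network byte order, and that a zero sll_protocol
 * keeps the currently bound protocol, matching the "?:" above:
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family	= AF_PACKET,
 *		.sll_protocol	= htons(ETH_P_ALL),
 *		.sll_ifindex	= if_nametoindex("eth0"),
 *	};
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */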

static struct proto packet_proto = {
        .name     = "PACKET",
        .owner    = THIS_MODULE,
        .obj_size = sizeof(struct packet_sock),
};

/*
 *      Create a packet of type SOCK_PACKET.
 */

static int packet_create(struct net *net, struct socket *sock, int protocol)
{
        struct sock *sk;
        struct packet_sock *po;
        __be16 proto = (__force __be16)protocol; /* weird, but documented */
        int err;

        if (!capable(CAP_NET_RAW))
                return -EPERM;
        if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
            sock->type != SOCK_PACKET)
                return -ESOCKTNOSUPPORT;

        sock->state = SS_UNCONNECTED;

        err = -ENOBUFS;
        sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
        if (sk == NULL)
                goto out;

        sock->ops = &packet_ops;
        if (sock->type == SOCK_PACKET)
                sock->ops = &packet_ops_spkt;

        sock_init_data(sock, sk);

        po = pkt_sk(sk);
        sk->sk_family = PF_PACKET;
        po->num = proto;

        sk->sk_destruct = packet_sock_destruct;
        sk_refcnt_debug_inc(sk);

        /*
         *      Attach a protocol block
         */

        spin_lock_init(&po->bind_lock);
        mutex_init(&po->pg_vec_lock);
        po->prot_hook.func = packet_rcv;

        if (sock->type == SOCK_PACKET)
                po->prot_hook.func = packet_rcv_spkt;

        po->prot_hook.af_packet_priv = sk;

        if (proto) {
                po->prot_hook.type = proto;
                dev_add_pack(&po->prot_hook);
                sock_hold(sk);
                po->running = 1;
        }

        write_lock_bh(&net->packet.sklist_lock);
        sk_add_node(sk, &net->packet.sklist);
        sock_prot_inuse_add(net, &packet_proto, 1);
        write_unlock_bh(&net->packet.sklist_lock);
        return 0;
out:
        return err;
}

/*
 *      Pull a packet from our receive queue and hand it to the user.
 *      If necessary we block.
 */

static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
                          struct msghdr *msg, size_t len, int flags)
{
        struct sock *sk = sock->sk;
        struct sk_buff *skb;
        int copied, err;
        struct sockaddr_ll *sll;

        err = -EINVAL;
        if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
                goto out;

#if 0
        /* What error should we return now? EUNATTACH? */
        if (pkt_sk(sk)->ifindex < 0)
                return -ENODEV;
#endif

        /*
         *      Call the generic datagram receiver. This handles all sorts
         *      of horrible races and re-entrancy so we can forget about it
         *      in the protocol layers.
         *
         *      Now it will return ENETDOWN, if the device has just gone
         *      down, but then it will block.
         */

        skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);

        /*
         *      An error occurred so return it. Because skb_recv_datagram()
         *      handles the blocking we don't see and worry about blocking
         *      retries.
         */

        if (skb == NULL)
                goto out;

        /*
         *      If the address length field is there to be filled in, we fill
         *      it in now.
         */

        sll = &PACKET_SKB_CB(skb)->sa.ll;
        if (sock->type == SOCK_PACKET)
                msg->msg_namelen = sizeof(struct sockaddr_pkt);
        else
                msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);

        /*
         *      You lose any data beyond the buffer you gave. If it worries a
         *      user program they can ask the device for its MTU anyway.
         */

        copied = skb->len;
        if (copied > len) {
                copied = len;
                msg->msg_flags |= MSG_TRUNC;
        }

        err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
        if (err)
                goto out_free;

        sock_recv_timestamp(msg, sk, skb);

        if (msg->msg_name)
                memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
                       msg->msg_namelen);

        if (pkt_sk(sk)->auxdata) {
                struct tpacket_auxdata aux;

                aux.tp_status = TP_STATUS_USER;
                if (skb->ip_summed == CHECKSUM_PARTIAL)
                        aux.tp_status |= TP_STATUS_CSUMNOTREADY;
                aux.tp_len = PACKET_SKB_CB(skb)->origlen;
                aux.tp_snaplen = skb->len;
                aux.tp_mac = 0;
                aux.tp_net = skb_network_offset(skb);
                aux.tp_vlan_tci = skb->vlan_tci;

                put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
        }

        /*
         *      Free or return the buffer as appropriate. Again this
         *      hides all the races and re-entrancy issues from us.
         */
        err = (flags & MSG_TRUNC) ? skb->len : copied;

out_free:
        skb_free_datagram(sk, skb);
out:
        return err;
}
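
/*
 * Illustrative userspace sketch of the auxdata path above (an
 * assumption): with PACKET_AUXDATA enabled, each recvmsg() carries a
 * tpacket_auxdata control message holding the original wire length,
 * checksum status and VLAN tag even when the read itself truncates:
 *
 *	int one = 1;
 *	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));
 *
 *	unsigned char buf[2048];
 *	char cbuf[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
 *	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
 *	struct msghdr msg = {
 *		.msg_iov	= &iov,
 *		.msg_iovlen	= 1,
 *		.msg_control	= cbuf,
 *		.msg_controllen	= sizeof(cbuf),
 *	};
 *	recvmsg(fd, &msg, 0);
 *
 *	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
 *	if (cmsg && cmsg->cmsg_level == SOL_PACKET &&
 *	    cmsg->cmsg_type == PACKET_AUXDATA) {
 *		struct tpacket_auxdata *aux = (void *)CMSG_DATA(cmsg);
 *		// aux->tp_len: length on the wire, aux->tp_snaplen: captured
 *	}
 */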

static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
                               int *uaddr_len, int peer)
{
        struct net_device *dev;
        struct sock *sk = sock->sk;

        if (peer)
                return -EOPNOTSUPP;

        uaddr->sa_family = AF_PACKET;
        dev = dev_get_by_index(sock_net(sk), pkt_sk(sk)->ifindex);
        if (dev) {
                strlcpy(uaddr->sa_data, dev->name, 15);
                dev_put(dev);
        } else
                memset(uaddr->sa_data, 0, 14);
        *uaddr_len = sizeof(*uaddr);

        return 0;
}

static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
                          int *uaddr_len, int peer)
{
        struct net_device *dev;
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;

        if (peer)
                return -EOPNOTSUPP;

        sll->sll_family = AF_PACKET;
        sll->sll_ifindex = po->ifindex;
        sll->sll_protocol = po->num;
        dev = dev_get_by_index(sock_net(sk), po->ifindex);
        if (dev) {
                sll->sll_hatype = dev->type;
                sll->sll_halen = dev->addr_len;
                memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
                dev_put(dev);
        } else {
                sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
                sll->sll_halen = 0;
        }
        *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;

        return 0;
}

static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
                         int what)
{
        switch (i->type) {
        case PACKET_MR_MULTICAST:
                if (what > 0)
                        dev_mc_add(dev, i->addr, i->alen, 0);
                else
                        dev_mc_delete(dev, i->addr, i->alen, 0);
                break;
        case PACKET_MR_PROMISC:
                return dev_set_promiscuity(dev, what);
        case PACKET_MR_ALLMULTI:
                return dev_set_allmulti(dev, what);
        default:
                break;
        }
        return 0;
}

static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
{
        for ( ; i; i = i->next) {
                if (i->ifindex == dev->ifindex)
                        packet_dev_mc(dev, i, what);
        }
}

static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
{
        struct packet_sock *po = pkt_sk(sk);
        struct packet_mclist *ml, *i;
        struct net_device *dev;
        int err;

        rtnl_lock();

        err = -ENODEV;
        dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
        if (!dev)
                goto done;

        err = -EINVAL;
        if (mreq->mr_alen > dev->addr_len)
                goto done;

        err = -ENOBUFS;
        i = kmalloc(sizeof(*i), GFP_KERNEL);
        if (i == NULL)
                goto done;

        err = 0;
        for (ml = po->mclist; ml; ml = ml->next) {
                if (ml->ifindex == mreq->mr_ifindex &&
                    ml->type == mreq->mr_type &&
                    ml->alen == mreq->mr_alen &&
                    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
                        ml->count++;
                        /* Free the new element ... */
                        kfree(i);
                        goto done;
                }
        }

        i->type = mreq->mr_type;
        i->ifindex = mreq->mr_ifindex;
        i->alen = mreq->mr_alen;
        memcpy(i->addr, mreq->mr_address, i->alen);
        i->count = 1;
        i->next = po->mclist;
        po->mclist = i;
        err = packet_dev_mc(dev, i, 1);
        if (err) {
                po->mclist = i->next;
                kfree(i);
        }

done:
        rtnl_unlock();
        return err;
}

static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
{
        struct packet_mclist *ml, **mlp;

        rtnl_lock();

        for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
                if (ml->ifindex == mreq->mr_ifindex &&
                    ml->type == mreq->mr_type &&
                    ml->alen == mreq->mr_alen &&
                    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
                        if (--ml->count == 0) {
                                struct net_device *dev;
                                *mlp = ml->next;
                                dev = dev_get_by_index(sock_net(sk), ml->ifindex);
                                if (dev) {
                                        packet_dev_mc(dev, ml, -1);
                                        dev_put(dev);
                                }
                                kfree(ml);
                        }
                        rtnl_unlock();
                        return 0;
                }
        }
        rtnl_unlock();
        return -EADDRNOTAVAIL;
}

static void packet_flush_mclist(struct sock *sk)
{
        struct packet_sock *po = pkt_sk(sk);
        struct packet_mclist *ml;

        if (!po->mclist)
                return;

        rtnl_lock();
        while ((ml = po->mclist) != NULL) {
                struct net_device *dev;

                po->mclist = ml->next;
                dev = dev_get_by_index(sock_net(sk), ml->ifindex);
                if (dev != NULL) {
                        packet_dev_mc(dev, ml, -1);
                        dev_put(dev);
                }
                kfree(ml);
        }
        rtnl_unlock();
}

static int
packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        int ret;

        if (level != SOL_PACKET)
                return -ENOPROTOOPT;

        switch (optname) {
        case PACKET_ADD_MEMBERSHIP:
        case PACKET_DROP_MEMBERSHIP:
        {
                struct packet_mreq_max mreq;
                int len = optlen;
                memset(&mreq, 0, sizeof(mreq));
                if (len < sizeof(struct packet_mreq))
                        return -EINVAL;
                if (len > sizeof(mreq))
                        len = sizeof(mreq);
                if (copy_from_user(&mreq, optval, len))
                        return -EFAULT;
                if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
                        return -EINVAL;
                if (optname == PACKET_ADD_MEMBERSHIP)
                        ret = packet_mc_add(sk, &mreq);
                else
                        ret = packet_mc_drop(sk, &mreq);
                return ret;
        }

#ifdef CONFIG_PACKET_MMAP
        case PACKET_RX_RING:
        {
                struct tpacket_req req;

                if (optlen < sizeof(req))
                        return -EINVAL;
                if (copy_from_user(&req, optval, sizeof(req)))
                        return -EFAULT;
                return packet_set_ring(sk, &req, 0);
        }
        case PACKET_COPY_THRESH:
        {
                int val;

                if (optlen != sizeof(val))
                        return -EINVAL;
                if (copy_from_user(&val, optval, sizeof(val)))
                        return -EFAULT;

                pkt_sk(sk)->copy_thresh = val;
                return 0;
        }
        case PACKET_VERSION:
        {
                int val;

                if (optlen != sizeof(val))
                        return -EINVAL;
                if (po->pg_vec)
                        return -EBUSY;
                if (copy_from_user(&val, optval, sizeof(val)))
                        return -EFAULT;
                switch (val) {
                case TPACKET_V1:
                case TPACKET_V2:
                        po->tp_version = val;
                        return 0;
                default:
                        return -EINVAL;
                }
        }
        case PACKET_RESERVE:
        {
                unsigned int val;

                if (optlen != sizeof(val))
                        return -EINVAL;
                if (po->pg_vec)
                        return -EBUSY;
                if (copy_from_user(&val, optval, sizeof(val)))
                        return -EFAULT;
                po->tp_reserve = val;
                return 0;
        }
#endif
        case PACKET_AUXDATA:
        {
                int val;

                if (optlen < sizeof(val))
                        return -EINVAL;
                if (copy_from_user(&val, optval, sizeof(val)))
                        return -EFAULT;

                po->auxdata = !!val;
                return 0;
        }
        case PACKET_ORIGDEV:
        {
                int val;

                if (optlen < sizeof(val))
                        return -EINVAL;
                if (copy_from_user(&val, optval, sizeof(val)))
                        return -EFAULT;

                po->origdev = !!val;
                return 0;
        }
        default:
                return -ENOPROTOOPT;
        }
}
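
/*
 * Illustrative userspace sketch of the membership options above (an
 * assumption).  Userspace passes a struct packet_mreq; the kernel copies
 * it into the longer packet_mreq_max.  PACKET_MR_PROMISC is the
 * refcounted way to enable promiscuous mode, preferable to flipping
 * IFF_PROMISC with ioctl(SIOCSIFFLAGS):
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex	= if_nametoindex("eth0"),
 *		.mr_type	= PACKET_MR_PROMISC,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
 *		   &mreq, sizeof(mreq));
 */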

static int packet_getsockopt(struct socket *sock, int level, int optname,
                             char __user *optval, int __user *optlen)
{
        int len;
        int val;
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        void *data;
        struct tpacket_stats st;

        if (level != SOL_PACKET)
                return -ENOPROTOOPT;

        if (get_user(len, optlen))
                return -EFAULT;

        if (len < 0)
                return -EINVAL;

        switch (optname) {
        case PACKET_STATISTICS:
                if (len > sizeof(struct tpacket_stats))
                        len = sizeof(struct tpacket_stats);
                spin_lock_bh(&sk->sk_receive_queue.lock);
                st = po->stats;
                memset(&po->stats, 0, sizeof(st));
                spin_unlock_bh(&sk->sk_receive_queue.lock);
                st.tp_packets += st.tp_drops;

                data = &st;
                break;
        case PACKET_AUXDATA:
                if (len > sizeof(int))
                        len = sizeof(int);
                val = po->auxdata;

                data = &val;
                break;
        case PACKET_ORIGDEV:
                if (len > sizeof(int))
                        len = sizeof(int);
                val = po->origdev;

                data = &val;
                break;
#ifdef CONFIG_PACKET_MMAP
        case PACKET_VERSION:
                if (len > sizeof(int))
                        len = sizeof(int);
                val = po->tp_version;
                data = &val;
                break;
        case PACKET_HDRLEN:
                if (len > sizeof(int))
                        len = sizeof(int);
                if (copy_from_user(&val, optval, len))
                        return -EFAULT;
                switch (val) {
                case TPACKET_V1:
                        val = sizeof(struct tpacket_hdr);
                        break;
                case TPACKET_V2:
                        val = sizeof(struct tpacket2_hdr);
                        break;
                default:
                        return -EINVAL;
                }
                data = &val;
                break;
        case PACKET_RESERVE:
                if (len > sizeof(unsigned int))
                        len = sizeof(unsigned int);
                val = po->tp_reserve;
                data = &val;
                break;
#endif
        default:
                return -ENOPROTOOPT;
        }

        if (put_user(len, optlen))
                return -EFAULT;
        if (copy_to_user(optval, data, len))
                return -EFAULT;
        return 0;
}
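
/*
 * Illustrative userspace sketch (an assumption): PACKET_STATISTICS is
 * read-and-clear, and as computed above tp_packets includes the dropped
 * packets, so tp_drops/tp_packets is the drop ratio for the interval
 * since the previous query:
 *
 *	struct tpacket_stats st;
 *	socklen_t len = sizeof(st);
 *	getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len);
 *	// st.tp_packets: seen (including drops); st.tp_drops: dropped
 */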
1587
1588
1589static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
1590{
1591        struct sock *sk;
1592        struct hlist_node *node;
1593        struct net_device *dev = data;
1594        struct net *net = dev_net(dev);
1595
1596        read_lock(&net->packet.sklist_lock);
1597        sk_for_each(sk, node, &net->packet.sklist) {
1598                struct packet_sock *po = pkt_sk(sk);
1599
1600                switch (msg) {
1601                case NETDEV_UNREGISTER:
1602                        if (po->mclist)
1603                                packet_dev_mclist(dev, po->mclist, -1);
1604                        /* fallthrough */
1605
1606                case NETDEV_DOWN:
1607                        if (dev->ifindex == po->ifindex) {
1608                                spin_lock(&po->bind_lock);
1609                                if (po->running) {
1610                                        __dev_remove_pack(&po->prot_hook);
1611                                        __sock_put(sk);
1612                                        po->running = 0;
1613                                        sk->sk_err = ENETDOWN;
1614                                        if (!sock_flag(sk, SOCK_DEAD))
1615                                                sk->sk_error_report(sk);
1616                                }
1617                                if (msg == NETDEV_UNREGISTER) {
1618                                        po->ifindex = -1;
1619                                        po->prot_hook.dev = NULL;
1620                                }
1621                                spin_unlock(&po->bind_lock);
1622                        }
1623                        break;
1624                case NETDEV_UP:
1625                        spin_lock(&po->bind_lock);
1626                        if (dev->ifindex == po->ifindex && po->num &&
1627                            !po->running) {
1628                                dev_add_pack(&po->prot_hook);
1629                                sock_hold(sk);
1630                                po->running = 1;
1631                        }
1632                        spin_unlock(&po->bind_lock);
1633                        break;
1634                }
1635        }
1636        read_unlock(&net->packet.sklist_lock);
1637        return NOTIFY_DONE;
1638}
1639
1640
1641static int packet_ioctl(struct socket *sock, unsigned int cmd,
1642                        unsigned long arg)
1643{
1644        struct sock *sk = sock->sk;
1645
1646        switch(cmd) {
1647                case SIOCOUTQ:
1648                {
1649                        int amount = atomic_read(&sk->sk_wmem_alloc);
1650                        return put_user(amount, (int __user *)arg);
1651                }
1652                case SIOCINQ:
1653                {
1654                        struct sk_buff *skb;
1655                        int amount = 0;
1656
1657                        spin_lock_bh(&sk->sk_receive_queue.lock);
1658                        skb = skb_peek(&sk->sk_receive_queue);
1659                        if (skb)
1660                                amount = skb->len;
1661                        spin_unlock_bh(&sk->sk_receive_queue.lock);
1662                        return put_user(amount, (int __user *)arg);
1663                }
1664                case SIOCGSTAMP:
1665                        return sock_get_timestamp(sk, (struct timeval __user *)arg);
1666                case SIOCGSTAMPNS:
1667                        return sock_get_timestampns(sk, (struct timespec __user *)arg);
1668
1669#ifdef CONFIG_INET
1670                case SIOCADDRT:
1671                case SIOCDELRT:
1672                case SIOCDARP:
1673                case SIOCGARP:
1674                case SIOCSARP:
1675                case SIOCGIFADDR:
1676                case SIOCSIFADDR:
1677                case SIOCGIFBRDADDR:
1678                case SIOCSIFBRDADDR:
1679                case SIOCGIFNETMASK:
1680                case SIOCSIFNETMASK:
1681                case SIOCGIFDSTADDR:
1682                case SIOCSIFDSTADDR:
1683                case SIOCSIFFLAGS:
1684                        if (!net_eq(sock_net(sk), &init_net))
1685                                return -ENOIOCTLCMD;
1686                        return inet_dgram_ops.ioctl(sock, cmd, arg);
1687#endif
1688
1689                default:
1690                        return -ENOIOCTLCMD;
1691        }
1692        return 0;
1693}
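
    /*
     * Illustrative userspace sketch (not part of this file): SIOCINQ on
     * a packet socket reports the length of the next queued frame, not
     * the whole backlog.
     *
     *     int pending = 0;
     *     ioctl(fd, SIOCINQ, &pending);
     *     (pending now holds the length of the frame at the queue head)
     */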
1694
1695#ifndef CONFIG_PACKET_MMAP
1696#define packet_mmap sock_no_mmap
1697#define packet_poll datagram_poll
1698#else
1699
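    /*
     * With a ring mapped, readability has to consider frames already
     * handed to user space: beyond the plain receive-queue test in
     * datagram_poll(), the slot just behind the kernel's head pointer
     * is checked for TP_STATUS_USER.
     */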
1700static unsigned int packet_poll(struct file *file, struct socket *sock,
1701                                poll_table *wait)
1702{
1703        struct sock *sk = sock->sk;
1704        struct packet_sock *po = pkt_sk(sk);
1705        unsigned int mask = datagram_poll(file, sock, wait);
1706
1707        spin_lock_bh(&sk->sk_receive_queue.lock);
1708        if (po->pg_vec) {
1709                unsigned int last = po->head ? po->head - 1 : po->frame_max;
1710
1711                if (packet_lookup_frame(po, last, TP_STATUS_USER))
1712                        mask |= POLLIN | POLLRDNORM;
1713        }
1714        spin_unlock_bh(&sk->sk_receive_queue.lock);
1715        return mask;
1716}
1717
1718
1719/* Dirty? Well, I still did not learn better way to account
1720 * for user mmaps.
1721 */
1722
1723static void packet_mm_open(struct vm_area_struct *vma)
1724{
1725        struct file *file = vma->vm_file;
1726        struct socket *sock = file->private_data;
1727        struct sock *sk = sock->sk;
1728
1729        if (sk)
1730                atomic_inc(&pkt_sk(sk)->mapped);
1731}
1732
1733static void packet_mm_close(struct vm_area_struct *vma)
1734{
1735        struct file *file = vma->vm_file;
1736        struct socket *sock = file->private_data;
1737        struct sock *sk = sock->sk;
1738
1739        if (sk)
1740                atomic_dec(&pkt_sk(sk)->mapped);
1741}
1742
1743static struct vm_operations_struct packet_mmap_ops = {
1744        .open = packet_mm_open,
1745        .close = packet_mm_close,
1746};
1747
1748static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
1749{
1750        int i;
1751
1752        for (i = 0; i < len; i++) {
1753                if (likely(pg_vec[i]))
1754                        free_pages((unsigned long) pg_vec[i], order);
1755        }
1756        kfree(pg_vec);
1757}
1758
1759static inline char *alloc_one_pg_vec_page(unsigned long order)
1760{
1761        gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
1762
1763        return (char *) __get_free_pages(gfp_flags, order);
1764}
1765
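    /*
     * The ring is allocated as tp_block_nr independent high-order page
     * blocks; if any allocation fails, the blocks obtained so far are
     * released again through free_pg_vec().
     */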
1766static char **alloc_pg_vec(struct tpacket_req *req, int order)
1767{
1768        unsigned int block_nr = req->tp_block_nr;
1769        char **pg_vec;
1770        int i;
1771
1772        pg_vec = kcalloc(block_nr, sizeof(char *), GFP_KERNEL);
1773        if (unlikely(!pg_vec))
1774                goto out;
1775
1776        for (i = 0; i < block_nr; i++) {
1777                pg_vec[i] = alloc_one_pg_vec_page(order);
1778                if (unlikely(!pg_vec[i]))
1779                        goto out_free_pgvec;
1780        }
1781
1782out:
1783        return pg_vec;
1784
1785out_free_pgvec:
1786        free_pg_vec(pg_vec, order, block_nr);
1787        pg_vec = NULL;
1788        goto out;
1789}
1790
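    /*
     * Install (or, on a zero request, tear down) the mmap ring: check
     * the block/frame geometry, allocate the page vector, then swap it
     * in under the receive-queue lock with the protocol hook detached
     * so no packets can be delivered mid-switch.
     */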
1791static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
1792{
1793        char **pg_vec = NULL;
1794        struct packet_sock *po = pkt_sk(sk);
1795        int was_running, order = 0;
1796        __be16 num;
1797        int err = 0;
1798
1799        if (req->tp_block_nr) {
1800                int i;
1801
1802                /* Sanity tests and some calculations */
1803
1804                if (unlikely(po->pg_vec))
1805                        return -EBUSY;
1806
1807                switch (po->tp_version) {
1808                case TPACKET_V1:
1809                        po->tp_hdrlen = TPACKET_HDRLEN;
1810                        break;
1811                case TPACKET_V2:
1812                        po->tp_hdrlen = TPACKET2_HDRLEN;
1813                        break;
1814                }
1815
1816                if (unlikely((int)req->tp_block_size <= 0))
1817                        return -EINVAL;
1818                if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
1819                        return -EINVAL;
1820                if (unlikely(req->tp_frame_size < po->tp_hdrlen +
1821                                                  po->tp_reserve))
1822                        return -EINVAL;
1823                if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
1824                        return -EINVAL;
1825
1826                po->frames_per_block = req->tp_block_size / req->tp_frame_size;
1827                if (unlikely(po->frames_per_block <= 0))
1828                        return -EINVAL;
1829                if (unlikely((po->frames_per_block * req->tp_block_nr) !=
1830                             req->tp_frame_nr))
1831                        return -EINVAL;
1832
1833                err = -ENOMEM;
1834                order = get_order(req->tp_block_size);
1835                pg_vec = alloc_pg_vec(req, order);
1836                if (unlikely(!pg_vec))
1837                        goto out;
1838
1839                for (i = 0; i < req->tp_block_nr; i++) {
1840                        void *ptr = pg_vec[i];
1841                        int k;
1842
1843                        for (k = 0; k < po->frames_per_block; k++) {
1844                                __packet_set_status(po, ptr, TP_STATUS_KERNEL);
1845                                ptr += req->tp_frame_size;
1846                        }
1847                }
1848                /* Done */
1849        } else {
1850                if (unlikely(req->tp_frame_nr))
1851                        return -EINVAL;
1852        }
1853
1854        lock_sock(sk);
1855
1856        /* Detach socket from network */
1857        spin_lock(&po->bind_lock);
1858        was_running = po->running;
1859        num = po->num;
1860        if (was_running) {
1861                __dev_remove_pack(&po->prot_hook);
1862                po->num = 0;
1863                po->running = 0;
1864                __sock_put(sk);
1865        }
1866        spin_unlock(&po->bind_lock);
1867
1868        synchronize_net();
1869
1870        err = -EBUSY;
1871        mutex_lock(&po->pg_vec_lock);
1872        if (closing || atomic_read(&po->mapped) == 0) {
1873                err = 0;
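    /* XC(a, b): store b in a and hand back the old value of a. */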
1874#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
1875
1876                spin_lock_bh(&sk->sk_receive_queue.lock);
1877                pg_vec = XC(po->pg_vec, pg_vec);
1878                po->frame_max = (req->tp_frame_nr - 1);
1879                po->head = 0;
1880                po->frame_size = req->tp_frame_size;
1881                spin_unlock_bh(&sk->sk_receive_queue.lock);
1882
1883                order = XC(po->pg_vec_order, order);
1884                req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr);
1885
1886                po->pg_vec_pages = req->tp_block_size / PAGE_SIZE;
1887                po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv;
1888                skb_queue_purge(&sk->sk_receive_queue);
1889#undef XC
1890                if (atomic_read(&po->mapped))
1891                        printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped));
1892        }
1893        mutex_unlock(&po->pg_vec_lock);
1894
1895        spin_lock(&po->bind_lock);
1896        if (was_running && !po->running) {
1897                sock_hold(sk);
1898                po->running = 1;
1899                po->num = num;
1900                dev_add_pack(&po->prot_hook);
1901        }
1902        spin_unlock(&po->bind_lock);
1903
1904        release_sock(sk);
1905
1906        if (pg_vec)
1907                free_pg_vec(pg_vec, order, req->tp_block_nr);
1908out:
1909        return err;
1910}
1911
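    /*
     * Map the entire ring into the caller's vma: the request must start
     * at offset zero and cover the ring exactly, and every page of each
     * block is inserted in order.
     */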
1912static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1913{
1914        struct sock *sk = sock->sk;
1915        struct packet_sock *po = pkt_sk(sk);
1916        unsigned long size;
1917        unsigned long start;
1918        int err = -EINVAL;
1919        int i;
1920
1921        if (vma->vm_pgoff)
1922                return -EINVAL;
1923
1924        size = vma->vm_end - vma->vm_start;
1925
1926        mutex_lock(&po->pg_vec_lock);
1927        if (po->pg_vec == NULL)
1928                goto out;
1929        if (size != po->pg_vec_len * po->pg_vec_pages * PAGE_SIZE)
1930                goto out;
1931
1932        start = vma->vm_start;
1933        for (i = 0; i < po->pg_vec_len; i++) {
1934                struct page *page = virt_to_page(po->pg_vec[i]);
1935                int pg_num;
1936
1937                for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) {
1938                        err = vm_insert_page(vma, start, page);
1939                        if (unlikely(err))
1940                                goto out;
1941                        start += PAGE_SIZE;
1942                }
1943        }
1944        atomic_inc(&po->mapped);
1945        vma->vm_ops = &packet_mmap_ops;
1946        err = 0;
1947
1948out:
1949        mutex_unlock(&po->pg_vec_lock);
1950        return err;
1951}
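
    /*
     * Illustrative userspace sketch (not part of this file): the intended
     * RX-ring sequence against the code above. The geometry is an
     * arbitrary example; tp_block_size must be a multiple of PAGE_SIZE
     * and tp_frame_nr must equal frames-per-block times tp_block_nr, as
     * checked in packet_set_ring().
     *
     *     struct tpacket_req req = {
     *             .tp_block_size = 4096,
     *             .tp_block_nr   = 64,
     *             .tp_frame_size = 2048,
     *             .tp_frame_nr   = 128,   (2 per block, 64 blocks)
     *     };
     *     int ver = TPACKET_V2;
     *     setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
     *     setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
     *     char *ring = mmap(NULL, req.tp_block_nr * req.tp_block_size,
     *                       PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
     *
     * Each frame begins with a tpacket2_hdr whose tp_status becomes
     * TP_STATUS_USER once the kernel fills it (tpacket_rcv is installed
     * as the protocol hook above); user space hands the frame back by
     * writing TP_STATUS_KERNEL.
     */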
1952#endif
1953
1954
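    /*
     * Ops for historical SOCK_PACKET sockets: no socket options and no
     * ring mmap; the full-featured AF_PACKET table follows below.
     */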
1955static const struct proto_ops packet_ops_spkt = {
1956        .family =       PF_PACKET,
1957        .owner =        THIS_MODULE,
1958        .release =      packet_release,
1959        .bind =         packet_bind_spkt,
1960        .connect =      sock_no_connect,
1961        .socketpair =   sock_no_socketpair,
1962        .accept =       sock_no_accept,
1963        .getname =      packet_getname_spkt,
1964        .poll =         datagram_poll,
1965        .ioctl =        packet_ioctl,
1966        .listen =       sock_no_listen,
1967        .shutdown =     sock_no_shutdown,
1968        .setsockopt =   sock_no_setsockopt,
1969        .getsockopt =   sock_no_getsockopt,
1970        .sendmsg =      packet_sendmsg_spkt,
1971        .recvmsg =      packet_recvmsg,
1972        .mmap =         sock_no_mmap,
1973        .sendpage =     sock_no_sendpage,
1974};
1975
1976static const struct proto_ops packet_ops = {
1977        .family =       PF_PACKET,
1978        .owner =        THIS_MODULE,
1979        .release =      packet_release,
1980        .bind =         packet_bind,
1981        .connect =      sock_no_connect,
1982        .socketpair =   sock_no_socketpair,
1983        .accept =       sock_no_accept,
1984        .getname =      packet_getname,
1985        .poll =         packet_poll,
1986        .ioctl =        packet_ioctl,
1987        .listen =       sock_no_listen,
1988        .shutdown =     sock_no_shutdown,
1989        .setsockopt =   packet_setsockopt,
1990        .getsockopt =   packet_getsockopt,
1991        .sendmsg =      packet_sendmsg,
1992        .recvmsg =      packet_recvmsg,
1993        .mmap =         packet_mmap,
1994        .sendpage =     sock_no_sendpage,
1995};
1996
1997static struct net_proto_family packet_family_ops = {
1998        .family =       PF_PACKET,
1999        .create =       packet_create,
2000        .owner =        THIS_MODULE,
2001};
2002
2003static struct notifier_block packet_netdev_notifier = {
2004        .notifier_call = packet_notifier,
2005};
2006
2007#ifdef CONFIG_PROC_FS
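    /*
     * /proc/net/packet: one line per packet socket in the namespace,
     * emitted through the seq_file iterators below under sklist_lock.
     */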
2008static inline struct sock *packet_seq_idx(struct net *net, loff_t off)
2009{
2010        struct sock *s;
2011        struct hlist_node *node;
2012
2013        sk_for_each(s, node, &net->packet.sklist) {
2014                if (!off--)
2015                        return s;
2016        }
2017        return NULL;
2018}
2019
2020static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
2021        __acquires(seq_file_net(seq)->packet.sklist_lock)
2022{
2023        struct net *net = seq_file_net(seq);
2024        read_lock(&net->packet.sklist_lock);
2025        return *pos ? packet_seq_idx(net, *pos - 1) : SEQ_START_TOKEN;
2026}
2027
2028static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2029{
2030        struct net *net = seq_file_net(seq);
2031        ++*pos;
2032        return (v == SEQ_START_TOKEN)
2033                ? sk_head(&net->packet.sklist)
2034                : sk_next((struct sock *)v);
2035}
2036
2037static void packet_seq_stop(struct seq_file *seq, void *v)
2038        __releases(seq_file_net(seq)->packet.sklist_lock)
2039{
2040        struct net *net = seq_file_net(seq);
2041        read_unlock(&net->packet.sklist_lock);
2042}
2043
2044static int packet_seq_show(struct seq_file *seq, void *v)
2045{
2046        if (v == SEQ_START_TOKEN)
2047                seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
2048        else {
2049                struct sock *s = v;
2050                const struct packet_sock *po = pkt_sk(s);
2051
2052                seq_printf(seq,
2053                           "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
2054                           s,
2055                           atomic_read(&s->sk_refcnt),
2056                           s->sk_type,
2057                           ntohs(po->num),
2058                           po->ifindex,
2059                           po->running,
2060                           atomic_read(&s->sk_rmem_alloc),
2061                           sock_i_uid(s),
2062                           sock_i_ino(s));
2063        }
2064
2065        return 0;
2066}
2067
2068static const struct seq_operations packet_seq_ops = {
2069        .start  = packet_seq_start,
2070        .next   = packet_seq_next,
2071        .stop   = packet_seq_stop,
2072        .show   = packet_seq_show,
2073};
2074
2075static int packet_seq_open(struct inode *inode, struct file *file)
2076{
2077        return seq_open_net(inode, file, &packet_seq_ops,
2078                            sizeof(struct seq_net_private));
2079}
2080
2081static const struct file_operations packet_seq_fops = {
2082        .owner          = THIS_MODULE,
2083        .open           = packet_seq_open,
2084        .read           = seq_read,
2085        .llseek         = seq_lseek,
2086        .release        = seq_release_net,
2087};
2088
2089#endif
2090
2091static int packet_net_init(struct net *net)
2092{
2093        rwlock_init(&net->packet.sklist_lock);
2094        INIT_HLIST_HEAD(&net->packet.sklist);
2095
2096        if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
2097                return -ENOMEM;
2098
2099        return 0;
2100}
2101
2102static void packet_net_exit(struct net *net)
2103{
2104        proc_net_remove(net, "packet");
2105}
2106
2107static struct pernet_operations packet_net_ops = {
2108        .init = packet_net_init,
2109        .exit = packet_net_exit,
2110};
2111
2112
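    /*
     * Module setup and teardown. Note that packet_init() only checks
     * the proto_register() result; failures of the three later
     * registrations are ignored here. packet_exit() unwinds all four
     * steps.
     */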
2113static void __exit packet_exit(void)
2114{
2115        unregister_netdevice_notifier(&packet_netdev_notifier);
2116        unregister_pernet_subsys(&packet_net_ops);
2117        sock_unregister(PF_PACKET);
2118        proto_unregister(&packet_proto);
2119}
2120
2121static int __init packet_init(void)
2122{
2123        int rc = proto_register(&packet_proto, 0);
2124
2125        if (rc != 0)
2126                goto out;
2127
2128        sock_register(&packet_family_ops);
2129        register_pernet_subsys(&packet_net_ops);
2130        register_netdevice_notifier(&packet_netdev_notifier);
2131out:
2132        return rc;
2133}
2134
2135module_init(packet_init);
2136module_exit(packet_exit);
2137MODULE_LICENSE("GPL");
2138MODULE_ALIAS_NETPROTO(PF_PACKET);
2139