linux/drivers/net/tun.c
<<
>>
Prefs
   1/*
   2 *  TUN - Universal TUN/TAP device driver.
   3 *  Copyright (C) 1999-2002 Maxim Krasnyansky <maxk@qualcomm.com>
   4 *
   5 *  This program is free software; you can redistribute it and/or modify
   6 *  it under the terms of the GNU General Public License as published by
   7 *  the Free Software Foundation; either version 2 of the License, or
   8 *  (at your option) any later version.
   9 *
  10 *  This program is distributed in the hope that it will be useful,
  11 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13 *  GNU General Public License for more details.
  14 *
  15 *  $Id: tun.c,v 1.15 2002/03/01 02:44:24 maxk Exp $
  16 */
  17
  18/*
  19 *  Changes:
  20 *
  21 *  Mike Kershaw <dragorn@kismetwireless.net> 2005/08/14
  22 *    Add TUNSETLINK ioctl to set the link encapsulation
  23 *
  24 *  Mark Smith <markzzzsmith@yahoo.com.au>
  25 *    Use eth_random_addr() for tap MAC address.
  26 *
  27 *  Harald Roelle <harald.roelle@ifi.lmu.de>  2004/04/20
  28 *    Fixes in packet dropping, queue length setting and queue wakeup.
  29 *    Increased default tx queue length.
  30 *    Added ethtool API.
  31 *    Minor cleanups
  32 *
  33 *  Daniel Podlejski <underley@underley.eu.org>
  34 *    Modifications for 2.3.99-pre5 kernel.
  35 */
  36
  37#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  38
  39#define DRV_NAME        "tun"
  40#define DRV_VERSION     "1.6"
  41#define DRV_DESCRIPTION "Universal TUN/TAP device driver"
  42#define DRV_COPYRIGHT   "(C) 1999-2004 Max Krasnyansky <maxk@qualcomm.com>"
  43
  44#include <linux/module.h>
  45#include <linux/errno.h>
  46#include <linux/kernel.h>
  47#include <linux/major.h>
  48#include <linux/slab.h>
  49#include <linux/poll.h>
  50#include <linux/fcntl.h>
  51#include <linux/init.h>
  52#include <linux/skbuff.h>
  53#include <linux/netdevice.h>
  54#include <linux/etherdevice.h>
  55#include <linux/miscdevice.h>
  56#include <linux/ethtool.h>
  57#include <linux/rtnetlink.h>
  58#include <linux/compat.h>
  59#include <linux/if.h>
  60#include <linux/if_arp.h>
  61#include <linux/if_ether.h>
  62#include <linux/if_tun.h>
  63#include <linux/crc32.h>
  64#include <linux/nsproxy.h>
  65#include <linux/virtio_net.h>
  66#include <linux/rcupdate.h>
  67#include <net/net_namespace.h>
  68#include <net/netns/generic.h>
  69#include <net/rtnetlink.h>
  70#include <net/sock.h>
  71#include <net/cls_cgroup.h>
  72
  73#include <asm/uaccess.h>
  74
  75/* Uncomment to enable debugging */
  76/* #define TUN_DEBUG 1 */
  77
  78#ifdef TUN_DEBUG
  79static int debug;
  80
  81#define tun_debug(level, tun, fmt, args...)                     \
  82do {                                                            \
  83        if (tun->debug)                                         \
  84                netdev_printk(level, tun->dev, fmt, ##args);    \
  85} while (0)
  86#define DBG1(level, fmt, args...)                               \
  87do {                                                            \
  88        if (debug == 2)                                         \
  89                printk(level fmt, ##args);                      \
  90} while (0)
  91#else
  92#define tun_debug(level, tun, fmt, args...)                     \
  93do {                                                            \
  94        if (0)                                                  \
  95                netdev_printk(level, tun->dev, fmt, ##args);    \
  96} while (0)
  97#define DBG1(level, fmt, args...)                               \
  98do {                                                            \
  99        if (0)                                                  \
 100                printk(level fmt, ##args);                      \
 101} while (0)
 102#endif
 103
 104#define GOODCOPY_LEN 128
 105
 106#define FLT_EXACT_COUNT 8
 107struct tap_filter {
 108        unsigned int    count;    /* Number of addrs. Zero means disabled */
 109        u32             mask[2];  /* Mask of the hashed addrs */
 110        unsigned char   addr[FLT_EXACT_COUNT][ETH_ALEN];
 111};
 112
 113struct tun_file {
 114        atomic_t count;
 115        struct tun_struct *tun;
 116        struct net *net;
 117};
 118
 119struct tun_sock;
 120
 121struct tun_struct {
 122        struct tun_file         *tfile;
 123        unsigned int            flags;
 124        kuid_t                  owner;
 125        kgid_t                  group;
 126
 127        struct net_device       *dev;
 128        netdev_features_t       set_features;
 129#define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \
 130                          NETIF_F_TSO6|NETIF_F_UFO)
 131        struct fasync_struct    *fasync;
 132
 133        struct tap_filter       txflt;
 134        struct socket           socket;
 135        struct socket_wq        wq;
 136
 137        int                     vnet_hdr_sz;
 138
 139#ifdef TUN_DEBUG
 140        int debug;
 141#endif
 142};
 143
 144struct tun_sock {
 145        struct sock             sk;
 146        struct tun_struct       *tun;
 147};
 148
 149static inline struct tun_sock *tun_sk(struct sock *sk)
 150{
 151        return container_of(sk, struct tun_sock, sk);
 152}
 153
 154static int tun_attach(struct tun_struct *tun, struct file *file)
 155{
 156        struct tun_file *tfile = file->private_data;
 157        int err;
 158
 159        ASSERT_RTNL();
 160
 161        netif_tx_lock_bh(tun->dev);
 162
 163        err = -EINVAL;
 164        if (tfile->tun)
 165                goto out;
 166
 167        err = -EBUSY;
 168        if (tun->tfile)
 169                goto out;
 170
 171        err = 0;
 172        tfile->tun = tun;
 173        tun->tfile = tfile;
 174        tun->socket.file = file;
 175        netif_carrier_on(tun->dev);
 176        dev_hold(tun->dev);
 177        sock_hold(tun->socket.sk);
 178        atomic_inc(&tfile->count);
 179
 180out:
 181        netif_tx_unlock_bh(tun->dev);
 182        return err;
 183}
 184
 185static void __tun_detach(struct tun_struct *tun)
 186{
 187        /* Detach from net device */
 188        netif_tx_lock_bh(tun->dev);
 189        netif_carrier_off(tun->dev);
 190        tun->tfile = NULL;
 191        netif_tx_unlock_bh(tun->dev);
 192
 193        /* Drop read queue */
 194        skb_queue_purge(&tun->socket.sk->sk_receive_queue);
 195
 196        /* Drop the extra count on the net device */
 197        dev_put(tun->dev);
 198}
 199
 200static void tun_detach(struct tun_struct *tun)
 201{
 202        rtnl_lock();
 203        __tun_detach(tun);
 204        rtnl_unlock();
 205}
 206
 207static struct tun_struct *__tun_get(struct tun_file *tfile)
 208{
 209        struct tun_struct *tun = NULL;
 210
 211        if (atomic_inc_not_zero(&tfile->count))
 212                tun = tfile->tun;
 213
 214        return tun;
 215}
 216
 217static struct tun_struct *tun_get(struct file *file)
 218{
 219        return __tun_get(file->private_data);
 220}
 221
 222static void tun_put(struct tun_struct *tun)
 223{
 224        struct tun_file *tfile = tun->tfile;
 225
 226        if (atomic_dec_and_test(&tfile->count))
 227                tun_detach(tfile->tun);
 228}
 229
 230/* TAP filtering */
 231static void addr_hash_set(u32 *mask, const u8 *addr)
 232{
 233        int n = ether_crc(ETH_ALEN, addr) >> 26;
 234        mask[n >> 5] |= (1 << (n & 31));
 235}
 236
 237static unsigned int addr_hash_test(const u32 *mask, const u8 *addr)
 238{
 239        int n = ether_crc(ETH_ALEN, addr) >> 26;
 240        return mask[n >> 5] & (1 << (n & 31));
 241}
 242
 243static int update_filter(struct tap_filter *filter, void __user *arg)
 244{
 245        struct { u8 u[ETH_ALEN]; } *addr;
 246        struct tun_filter uf;
 247        int err, alen, n, nexact;
 248
 249        if (copy_from_user(&uf, arg, sizeof(uf)))
 250                return -EFAULT;
 251
 252        if (!uf.count) {
 253                /* Disabled */
 254                filter->count = 0;
 255                return 0;
 256        }
 257
 258        alen = ETH_ALEN * uf.count;
 259        addr = kmalloc(alen, GFP_KERNEL);
 260        if (!addr)
 261                return -ENOMEM;
 262
 263        if (copy_from_user(addr, arg + sizeof(uf), alen)) {
 264                err = -EFAULT;
 265                goto done;
 266        }
 267
 268        /* The filter is updated without holding any locks. Which is
 269         * perfectly safe. We disable it first and in the worst
 270         * case we'll accept a few undesired packets. */
 271        filter->count = 0;
 272        wmb();
 273
 274        /* Use first set of addresses as an exact filter */
 275        for (n = 0; n < uf.count && n < FLT_EXACT_COUNT; n++)
 276                memcpy(filter->addr[n], addr[n].u, ETH_ALEN);
 277
 278        nexact = n;
 279
 280        /* Remaining multicast addresses are hashed,
 281         * unicast will leave the filter disabled. */
 282        memset(filter->mask, 0, sizeof(filter->mask));
 283        for (; n < uf.count; n++) {
 284                if (!is_multicast_ether_addr(addr[n].u)) {
 285                        err = 0; /* no filter */
 286                        goto done;
 287                }
 288                addr_hash_set(filter->mask, addr[n].u);
 289        }
 290
 291        /* For ALLMULTI just set the mask to all ones.
 292         * This overrides the mask populated above. */
 293        if ((uf.flags & TUN_FLT_ALLMULTI))
 294                memset(filter->mask, ~0, sizeof(filter->mask));
 295
 296        /* Now enable the filter */
 297        wmb();
 298        filter->count = nexact;
 299
 300        /* Return the number of exact filters */
 301        err = nexact;
 302
 303done:
 304        kfree(addr);
 305        return err;
 306}
 307
 308/* Returns: 0 - drop, !=0 - accept */
 309static int run_filter(struct tap_filter *filter, const struct sk_buff *skb)
 310{
 311        /* Cannot use eth_hdr(skb) here because skb_mac_hdr() is incorrect
 312         * at this point. */
 313        struct ethhdr *eh = (struct ethhdr *) skb->data;
 314        int i;
 315
 316        /* Exact match */
 317        for (i = 0; i < filter->count; i++)
 318                if (ether_addr_equal(eh->h_dest, filter->addr[i]))
 319                        return 1;
 320
 321        /* Inexact match (multicast only) */
 322        if (is_multicast_ether_addr(eh->h_dest))
 323                return addr_hash_test(filter->mask, eh->h_dest);
 324
 325        return 0;
 326}
 327
 328/*
 329 * Checks whether the packet is accepted or not.
 330 * Returns: 0 - drop, !=0 - accept
 331 */
 332static int check_filter(struct tap_filter *filter, const struct sk_buff *skb)
 333{
 334        if (!filter->count)
 335                return 1;
 336
 337        return run_filter(filter, skb);
 338}
 339
 340/* Network device part of the driver */
 341
 342static const struct ethtool_ops tun_ethtool_ops;
 343
 344/* Net device detach from fd. */
 345static void tun_net_uninit(struct net_device *dev)
 346{
 347        struct tun_struct *tun = netdev_priv(dev);
 348        struct tun_file *tfile = tun->tfile;
 349
 350        /* Inform the methods they need to stop using the dev.
 351         */
 352        if (tfile) {
 353                wake_up_all(&tun->wq.wait);
 354                if (atomic_dec_and_test(&tfile->count))
 355                        __tun_detach(tun);
 356        }
 357}
 358
 359static void tun_free_netdev(struct net_device *dev)
 360{
 361        struct tun_struct *tun = netdev_priv(dev);
 362
 363        BUG_ON(!test_bit(SOCK_EXTERNALLY_ALLOCATED, &tun->socket.flags));
 364
 365        sk_release_kernel(tun->socket.sk);
 366}
 367
 368/* Net device open. */
 369static int tun_net_open(struct net_device *dev)
 370{
 371        netif_start_queue(dev);
 372        return 0;
 373}
 374
 375/* Net device close. */
 376static int tun_net_close(struct net_device *dev)
 377{
 378        netif_stop_queue(dev);
 379        return 0;
 380}
 381
 382/* Net device start xmit */
 383static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 384{
 385        struct tun_struct *tun = netdev_priv(dev);
 386
 387        tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);
 388
 389        /* Drop packet if interface is not attached */
 390        if (!tun->tfile)
 391                goto drop;
 392
 393        /* Drop if the filter does not like it.
 394         * This is a noop if the filter is disabled.
 395         * Filter can be enabled only for the TAP devices. */
 396        if (!check_filter(&tun->txflt, skb))
 397                goto drop;
 398
 399        if (tun->socket.sk->sk_filter &&
 400            sk_filter(tun->socket.sk, skb))
 401                goto drop;
 402
 403        if (skb_queue_len(&tun->socket.sk->sk_receive_queue) >= dev->tx_queue_len) {
 404                if (!(tun->flags & TUN_ONE_QUEUE)) {
 405                        /* Normal queueing mode. */
 406                        /* Packet scheduler handles dropping of further packets. */
 407                        netif_stop_queue(dev);
 408
 409                        /* We won't see all dropped packets individually, so overrun
 410                         * error is more appropriate. */
 411                        dev->stats.tx_fifo_errors++;
 412                } else {
 413                        /* Single queue mode.
 414                         * Driver handles dropping of all packets itself. */
 415                        goto drop;
 416                }
 417        }
 418
 419        /* Orphan the skb - required as we might hang on to it
 420         * for indefinite time. */
 421        if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
 422                goto drop;
 423        skb_orphan(skb);
 424
 425        /* Enqueue packet */
 426        skb_queue_tail(&tun->socket.sk->sk_receive_queue, skb);
 427
 428        /* Notify and wake up reader process */
 429        if (tun->flags & TUN_FASYNC)
 430                kill_fasync(&tun->fasync, SIGIO, POLL_IN);
 431        wake_up_interruptible_poll(&tun->wq.wait, POLLIN |
 432                                   POLLRDNORM | POLLRDBAND);
 433        return NETDEV_TX_OK;
 434
 435drop:
 436        dev->stats.tx_dropped++;
 437        kfree_skb(skb);
 438        return NETDEV_TX_OK;
 439}
 440
 441static void tun_net_mclist(struct net_device *dev)
 442{
 443        /*
 444         * This callback is supposed to deal with mc filter in
 445         * _rx_ path and has nothing to do with the _tx_ path.
 446         * In rx path we always accept everything userspace gives us.
 447         */
 448}
 449
 450#define MIN_MTU 68
 451#define MAX_MTU 65535
 452
 453static int
 454tun_net_change_mtu(struct net_device *dev, int new_mtu)
 455{
 456        if (new_mtu < MIN_MTU || new_mtu + dev->hard_header_len > MAX_MTU)
 457                return -EINVAL;
 458        dev->mtu = new_mtu;
 459        return 0;
 460}
 461
 462static netdev_features_t tun_net_fix_features(struct net_device *dev,
 463        netdev_features_t features)
 464{
 465        struct tun_struct *tun = netdev_priv(dev);
 466
 467        return (features & tun->set_features) | (features & ~TUN_USER_FEATURES);
 468}
 469#ifdef CONFIG_NET_POLL_CONTROLLER
 470static void tun_poll_controller(struct net_device *dev)
 471{
 472        /*
 473         * Tun only receives frames when:
 474         * 1) the char device endpoint gets data from user space
 475         * 2) the tun socket gets a sendmsg call from user space
 476         * Since both of those are syncronous operations, we are guaranteed
 477         * never to have pending data when we poll for it
 478         * so theres nothing to do here but return.
 479         * We need this though so netpoll recognizes us as an interface that
 480         * supports polling, which enables bridge devices in virt setups to
 481         * still use netconsole
 482         */
 483        return;
 484}
 485#endif
 486static const struct net_device_ops tun_netdev_ops = {
 487        .ndo_uninit             = tun_net_uninit,
 488        .ndo_open               = tun_net_open,
 489        .ndo_stop               = tun_net_close,
 490        .ndo_start_xmit         = tun_net_xmit,
 491        .ndo_change_mtu         = tun_net_change_mtu,
 492        .ndo_fix_features       = tun_net_fix_features,
 493#ifdef CONFIG_NET_POLL_CONTROLLER
 494        .ndo_poll_controller    = tun_poll_controller,
 495#endif
 496};
 497
 498static const struct net_device_ops tap_netdev_ops = {
 499        .ndo_uninit             = tun_net_uninit,
 500        .ndo_open               = tun_net_open,
 501        .ndo_stop               = tun_net_close,
 502        .ndo_start_xmit         = tun_net_xmit,
 503        .ndo_change_mtu         = tun_net_change_mtu,
 504        .ndo_fix_features       = tun_net_fix_features,
 505        .ndo_set_rx_mode        = tun_net_mclist,
 506        .ndo_set_mac_address    = eth_mac_addr,
 507        .ndo_validate_addr      = eth_validate_addr,
 508#ifdef CONFIG_NET_POLL_CONTROLLER
 509        .ndo_poll_controller    = tun_poll_controller,
 510#endif
 511};
 512
 513/* Initialize net device. */
 514static void tun_net_init(struct net_device *dev)
 515{
 516        struct tun_struct *tun = netdev_priv(dev);
 517
 518        switch (tun->flags & TUN_TYPE_MASK) {
 519        case TUN_TUN_DEV:
 520                dev->netdev_ops = &tun_netdev_ops;
 521
 522                /* Point-to-Point TUN Device */
 523                dev->hard_header_len = 0;
 524                dev->addr_len = 0;
 525                dev->mtu = 1500;
 526
 527                /* Zero header length */
 528                dev->type = ARPHRD_NONE;
 529                dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
 530                dev->tx_queue_len = TUN_READQ_SIZE;  /* We prefer our own queue length */
 531                break;
 532
 533        case TUN_TAP_DEV:
 534                dev->netdev_ops = &tap_netdev_ops;
 535                /* Ethernet TAP Device */
 536                ether_setup(dev);
 537                dev->priv_flags &= ~IFF_TX_SKB_SHARING;
 538
 539                eth_hw_addr_random(dev);
 540
 541                dev->tx_queue_len = TUN_READQ_SIZE;  /* We prefer our own queue length */
 542                break;
 543        }
 544}
 545
 546/* Character device part */
 547
 548/* Poll */
 549static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
 550{
 551        struct tun_file *tfile = file->private_data;
 552        struct tun_struct *tun = __tun_get(tfile);
 553        struct sock *sk;
 554        unsigned int mask = 0;
 555
 556        if (!tun)
 557                return POLLERR;
 558
 559        sk = tun->socket.sk;
 560
 561        tun_debug(KERN_INFO, tun, "tun_chr_poll\n");
 562
 563        poll_wait(file, &tun->wq.wait, wait);
 564
 565        if (!skb_queue_empty(&sk->sk_receive_queue))
 566                mask |= POLLIN | POLLRDNORM;
 567
 568        if (sock_writeable(sk) ||
 569            (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
 570             sock_writeable(sk)))
 571                mask |= POLLOUT | POLLWRNORM;
 572
 573        if (tun->dev->reg_state != NETREG_REGISTERED)
 574                mask = POLLERR;
 575
 576        tun_put(tun);
 577        return mask;
 578}
 579
 580/* prepad is the amount to reserve at front.  len is length after that.
 581 * linear is a hint as to how much to copy (usually headers). */
 582static struct sk_buff *tun_alloc_skb(struct tun_struct *tun,
 583                                     size_t prepad, size_t len,
 584                                     size_t linear, int noblock)
 585{
 586        struct sock *sk = tun->socket.sk;
 587        struct sk_buff *skb;
 588        int err;
 589
 590        sock_update_classid(sk);
 591
 592        /* Under a page?  Don't bother with paged skb. */
 593        if (prepad + len < PAGE_SIZE || !linear)
 594                linear = len;
 595
 596        skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
 597                                   &err);
 598        if (!skb)
 599                return ERR_PTR(err);
 600
 601        skb_reserve(skb, prepad);
 602        skb_put(skb, linear);
 603        skb->data_len = len - linear;
 604        skb->len += len - linear;
 605
 606        return skb;
 607}
 608
 609/* set skb frags from iovec, this can move to core network code for reuse */
 610static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
 611                                  int offset, size_t count)
 612{
 613        int len = iov_length(from, count) - offset;
 614        int copy = skb_headlen(skb);
 615        int size, offset1 = 0;
 616        int i = 0;
 617
 618        /* Skip over from offset */
 619        while (count && (offset >= from->iov_len)) {
 620                offset -= from->iov_len;
 621                ++from;
 622                --count;
 623        }
 624
 625        /* copy up to skb headlen */
 626        while (count && (copy > 0)) {
 627                size = min_t(unsigned int, copy, from->iov_len - offset);
 628                if (copy_from_user(skb->data + offset1, from->iov_base + offset,
 629                                   size))
 630                        return -EFAULT;
 631                if (copy > size) {
 632                        ++from;
 633                        --count;
 634                        offset = 0;
 635                } else
 636                        offset += size;
 637                copy -= size;
 638                offset1 += size;
 639        }
 640
 641        if (len == offset1)
 642                return 0;
 643
 644        while (count--) {
 645                struct page *page[MAX_SKB_FRAGS];
 646                int num_pages;
 647                unsigned long base;
 648                unsigned long truesize;
 649
 650                len = from->iov_len - offset;
 651                if (!len) {
 652                        offset = 0;
 653                        ++from;
 654                        continue;
 655                }
 656                base = (unsigned long)from->iov_base + offset;
 657                size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
 658                if (i + size > MAX_SKB_FRAGS)
 659                        return -EMSGSIZE;
 660                num_pages = get_user_pages_fast(base, size, 0, &page[i]);
 661                if (num_pages != size) {
 662                        for (i = 0; i < num_pages; i++)
 663                                put_page(page[i]);
 664                        return -EFAULT;
 665                }
 666                truesize = size * PAGE_SIZE;
 667                skb->data_len += len;
 668                skb->len += len;
 669                skb->truesize += truesize;
 670                atomic_add(truesize, &skb->sk->sk_wmem_alloc);
 671                while (len) {
 672                        int off = base & ~PAGE_MASK;
 673                        int size = min_t(int, len, PAGE_SIZE - off);
 674                        __skb_fill_page_desc(skb, i, page[i], off, size);
 675                        skb_shinfo(skb)->nr_frags++;
 676                        /* increase sk_wmem_alloc */
 677                        base += size;
 678                        len -= size;
 679                        i++;
 680                }
 681                offset = 0;
 682                ++from;
 683        }
 684        return 0;
 685}
 686
 687/* Get packet from user space buffer */
 688static ssize_t tun_get_user(struct tun_struct *tun, void *msg_control,
 689                            const struct iovec *iv, size_t total_len,
 690                            size_t count, int noblock)
 691{
 692        struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) };
 693        struct sk_buff *skb;
 694        size_t len = total_len, align = NET_SKB_PAD;
 695        struct virtio_net_hdr gso = { 0 };
 696        int offset = 0;
 697        int copylen;
 698        bool zerocopy = false;
 699        int err;
 700
 701        if (!(tun->flags & TUN_NO_PI)) {
 702                if ((len -= sizeof(pi)) > total_len)
 703                        return -EINVAL;
 704
 705                if (memcpy_fromiovecend((void *)&pi, iv, 0, sizeof(pi)))
 706                        return -EFAULT;
 707                offset += sizeof(pi);
 708        }
 709
 710        if (tun->flags & TUN_VNET_HDR) {
 711                if ((len -= tun->vnet_hdr_sz) > total_len)
 712                        return -EINVAL;
 713
 714                if (memcpy_fromiovecend((void *)&gso, iv, offset, sizeof(gso)))
 715                        return -EFAULT;
 716
 717                if ((gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
 718                    gso.csum_start + gso.csum_offset + 2 > gso.hdr_len)
 719                        gso.hdr_len = gso.csum_start + gso.csum_offset + 2;
 720
 721                if (gso.hdr_len > len)
 722                        return -EINVAL;
 723                offset += tun->vnet_hdr_sz;
 724        }
 725
 726        if ((tun->flags & TUN_TYPE_MASK) == TUN_TAP_DEV) {
 727                align += NET_IP_ALIGN;
 728                if (unlikely(len < ETH_HLEN ||
 729                             (gso.hdr_len && gso.hdr_len < ETH_HLEN)))
 730                        return -EINVAL;
 731        }
 732
 733        if (msg_control)
 734                zerocopy = true;
 735
 736        if (zerocopy) {
 737                /* Userspace may produce vectors with count greater than
 738                 * MAX_SKB_FRAGS, so we need to linearize parts of the skb
 739                 * to let the rest of data to be fit in the frags.
 740                 */
 741                if (count > MAX_SKB_FRAGS) {
 742                        copylen = iov_length(iv, count - MAX_SKB_FRAGS);
 743                        if (copylen < offset)
 744                                copylen = 0;
 745                        else
 746                                copylen -= offset;
 747                } else
 748                                copylen = 0;
 749                /* There are 256 bytes to be copied in skb, so there is enough
 750                 * room for skb expand head in case it is used.
 751                 * The rest of the buffer is mapped from userspace.
 752                 */
 753                if (copylen < gso.hdr_len)
 754                        copylen = gso.hdr_len;
 755                if (!copylen)
 756                        copylen = GOODCOPY_LEN;
 757        } else
 758                copylen = len;
 759
 760        skb = tun_alloc_skb(tun, align, copylen, gso.hdr_len, noblock);
 761        if (IS_ERR(skb)) {
 762                if (PTR_ERR(skb) != -EAGAIN)
 763                        tun->dev->stats.rx_dropped++;
 764                return PTR_ERR(skb);
 765        }
 766
 767        if (zerocopy)
 768                err = zerocopy_sg_from_iovec(skb, iv, offset, count);
 769        else
 770                err = skb_copy_datagram_from_iovec(skb, 0, iv, offset, len);
 771
 772        if (err) {
 773                tun->dev->stats.rx_dropped++;
 774                kfree_skb(skb);
 775                return -EFAULT;
 776        }
 777
 778        if (gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
 779                if (!skb_partial_csum_set(skb, gso.csum_start,
 780                                          gso.csum_offset)) {
 781                        tun->dev->stats.rx_frame_errors++;
 782                        kfree_skb(skb);
 783                        return -EINVAL;
 784                }
 785        }
 786
 787        switch (tun->flags & TUN_TYPE_MASK) {
 788        case TUN_TUN_DEV:
 789                if (tun->flags & TUN_NO_PI) {
 790                        switch (skb->data[0] & 0xf0) {
 791                        case 0x40:
 792                                pi.proto = htons(ETH_P_IP);
 793                                break;
 794                        case 0x60:
 795                                pi.proto = htons(ETH_P_IPV6);
 796                                break;
 797                        default:
 798                                tun->dev->stats.rx_dropped++;
 799                                kfree_skb(skb);
 800                                return -EINVAL;
 801                        }
 802                }
 803
 804                skb_reset_mac_header(skb);
 805                skb->protocol = pi.proto;
 806                skb->dev = tun->dev;
 807                break;
 808        case TUN_TAP_DEV:
 809                skb->protocol = eth_type_trans(skb, tun->dev);
 810                break;
 811        }
 812
 813        if (gso.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
 814                pr_debug("GSO!\n");
 815                switch (gso.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
 816                case VIRTIO_NET_HDR_GSO_TCPV4:
 817                        skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
 818                        break;
 819                case VIRTIO_NET_HDR_GSO_TCPV6:
 820                        skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
 821                        break;
 822                case VIRTIO_NET_HDR_GSO_UDP:
 823                        skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
 824                        break;
 825                default:
 826                        tun->dev->stats.rx_frame_errors++;
 827                        kfree_skb(skb);
 828                        return -EINVAL;
 829                }
 830
 831                if (gso.gso_type & VIRTIO_NET_HDR_GSO_ECN)
 832                        skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
 833
 834                skb_shinfo(skb)->gso_size = gso.gso_size;
 835                if (skb_shinfo(skb)->gso_size == 0) {
 836                        tun->dev->stats.rx_frame_errors++;
 837                        kfree_skb(skb);
 838                        return -EINVAL;
 839                }
 840
 841                /* Header must be checked, and gso_segs computed. */
 842                skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
 843                skb_shinfo(skb)->gso_segs = 0;
 844        }
 845
 846        /* copy skb_ubuf_info for callback when skb has no error */
 847        if (zerocopy) {
 848                skb_shinfo(skb)->destructor_arg = msg_control;
 849                skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
 850        }
 851
 852        netif_rx_ni(skb);
 853
 854        tun->dev->stats.rx_packets++;
 855        tun->dev->stats.rx_bytes += len;
 856
 857        return total_len;
 858}
 859
 860static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
 861                              unsigned long count, loff_t pos)
 862{
 863        struct file *file = iocb->ki_filp;
 864        struct tun_struct *tun = tun_get(file);
 865        ssize_t result;
 866
 867        if (!tun)
 868                return -EBADFD;
 869
 870        tun_debug(KERN_INFO, tun, "tun_chr_write %ld\n", count);
 871
 872        result = tun_get_user(tun, NULL, iv, iov_length(iv, count), count,
 873                              file->f_flags & O_NONBLOCK);
 874
 875        tun_put(tun);
 876        return result;
 877}
 878
 879/* Put packet to the user space buffer */
 880static ssize_t tun_put_user(struct tun_struct *tun,
 881                            struct sk_buff *skb,
 882                            const struct iovec *iv, int len)
 883{
 884        struct tun_pi pi = { 0, skb->protocol };
 885        ssize_t total = 0;
 886
 887        if (!(tun->flags & TUN_NO_PI)) {
 888                if ((len -= sizeof(pi)) < 0)
 889                        return -EINVAL;
 890
 891                if (len < skb->len) {
 892                        /* Packet will be stripped */
 893                        pi.flags |= TUN_PKT_STRIP;
 894                }
 895
 896                if (memcpy_toiovecend(iv, (void *) &pi, 0, sizeof(pi)))
 897                        return -EFAULT;
 898                total += sizeof(pi);
 899        }
 900
 901        if (tun->flags & TUN_VNET_HDR) {
 902                struct virtio_net_hdr gso = { 0 }; /* no info leak */
 903                if ((len -= tun->vnet_hdr_sz) < 0)
 904                        return -EINVAL;
 905
 906                if (skb_is_gso(skb)) {
 907                        struct skb_shared_info *sinfo = skb_shinfo(skb);
 908
 909                        /* This is a hint as to how much should be linear. */
 910                        gso.hdr_len = skb_headlen(skb);
 911                        gso.gso_size = sinfo->gso_size;
 912                        if (sinfo->gso_type & SKB_GSO_TCPV4)
 913                                gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
 914                        else if (sinfo->gso_type & SKB_GSO_TCPV6)
 915                                gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
 916                        else if (sinfo->gso_type & SKB_GSO_UDP)
 917                                gso.gso_type = VIRTIO_NET_HDR_GSO_UDP;
 918                        else {
 919                                pr_err("unexpected GSO type: "
 920                                       "0x%x, gso_size %d, hdr_len %d\n",
 921                                       sinfo->gso_type, gso.gso_size,
 922                                       gso.hdr_len);
 923                                print_hex_dump(KERN_ERR, "tun: ",
 924                                               DUMP_PREFIX_NONE,
 925                                               16, 1, skb->head,
 926                                               min((int)gso.hdr_len, 64), true);
 927                                WARN_ON_ONCE(1);
 928                                return -EINVAL;
 929                        }
 930                        if (sinfo->gso_type & SKB_GSO_TCP_ECN)
 931                                gso.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
 932                } else
 933                        gso.gso_type = VIRTIO_NET_HDR_GSO_NONE;
 934
 935                if (skb->ip_summed == CHECKSUM_PARTIAL) {
 936                        gso.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
 937                        gso.csum_start = skb_checksum_start_offset(skb);
 938                        gso.csum_offset = skb->csum_offset;
 939                } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
 940                        gso.flags = VIRTIO_NET_HDR_F_DATA_VALID;
 941                } /* else everything is zero */
 942
 943                if (unlikely(memcpy_toiovecend(iv, (void *)&gso, total,
 944                                               sizeof(gso))))
 945                        return -EFAULT;
 946                total += tun->vnet_hdr_sz;
 947        }
 948
 949        len = min_t(int, skb->len, len);
 950
 951        skb_copy_datagram_const_iovec(skb, 0, iv, total, len);
 952        total += skb->len;
 953
 954        tun->dev->stats.tx_packets++;
 955        tun->dev->stats.tx_bytes += len;
 956
 957        return total;
 958}
 959
 960static ssize_t tun_do_read(struct tun_struct *tun,
 961                           struct kiocb *iocb, const struct iovec *iv,
 962                           ssize_t len, int noblock)
 963{
 964        DECLARE_WAITQUEUE(wait, current);
 965        struct sk_buff *skb;
 966        ssize_t ret = 0;
 967
 968        tun_debug(KERN_INFO, tun, "tun_chr_read\n");
 969
 970        if (unlikely(!noblock))
 971                add_wait_queue(&tun->wq.wait, &wait);
 972        while (len) {
 973                current->state = TASK_INTERRUPTIBLE;
 974
 975                /* Read frames from the queue */
 976                if (!(skb=skb_dequeue(&tun->socket.sk->sk_receive_queue))) {
 977                        if (noblock) {
 978                                ret = -EAGAIN;
 979                                break;
 980                        }
 981                        if (signal_pending(current)) {
 982                                ret = -ERESTARTSYS;
 983                                break;
 984                        }
 985                        if (tun->dev->reg_state != NETREG_REGISTERED) {
 986                                ret = -EIO;
 987                                break;
 988                        }
 989
 990                        /* Nothing to read, let's sleep */
 991                        schedule();
 992                        continue;
 993                }
 994                netif_wake_queue(tun->dev);
 995
 996                ret = tun_put_user(tun, skb, iv, len);
 997                kfree_skb(skb);
 998                break;
 999        }
1000
1001        current->state = TASK_RUNNING;
1002        if (unlikely(!noblock))
1003                remove_wait_queue(&tun->wq.wait, &wait);
1004
1005        return ret;
1006}
1007
1008static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
1009                            unsigned long count, loff_t pos)
1010{
1011        struct file *file = iocb->ki_filp;
1012        struct tun_file *tfile = file->private_data;
1013        struct tun_struct *tun = __tun_get(tfile);
1014        ssize_t len, ret;
1015
1016        if (!tun)
1017                return -EBADFD;
1018        len = iov_length(iv, count);
1019        if (len < 0) {
1020                ret = -EINVAL;
1021                goto out;
1022        }
1023
1024        ret = tun_do_read(tun, iocb, iv, len, file->f_flags & O_NONBLOCK);
1025        ret = min_t(ssize_t, ret, len);
1026out:
1027        tun_put(tun);
1028        return ret;
1029}
1030
1031static void tun_setup(struct net_device *dev)
1032{
1033        struct tun_struct *tun = netdev_priv(dev);
1034
1035        tun->owner = INVALID_UID;
1036        tun->group = INVALID_GID;
1037
1038        dev->ethtool_ops = &tun_ethtool_ops;
1039        dev->destructor = tun_free_netdev;
1040}
1041
/*
 * Trivial set of netlink ops to allow deleting tun or tap
 * device with netlink.
 *
 * The validate hook rejects every request with -EINVAL; neither
 * argument is examined.
 */
static int tun_validate(struct nlattr *tb[], struct nlattr *data[])
{
        const int rc = -EINVAL;

        return rc;
}
1049
1050static struct rtnl_link_ops tun_link_ops __read_mostly = {
1051        .kind           = DRV_NAME,
1052        .priv_size      = sizeof(struct tun_struct),
1053        .setup          = tun_setup,
1054        .validate       = tun_validate,
1055};
1056
1057static void tun_sock_write_space(struct sock *sk)
1058{
1059        struct tun_struct *tun;
1060        wait_queue_head_t *wqueue;
1061
1062        if (!sock_writeable(sk))
1063                return;
1064
1065        if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
1066                return;
1067
1068        wqueue = sk_sleep(sk);
1069        if (wqueue && waitqueue_active(wqueue))
1070                wake_up_interruptible_sync_poll(wqueue, POLLOUT |
1071                                                POLLWRNORM | POLLWRBAND);
1072
1073        tun = tun_sk(sk)->tun;
1074        kill_fasync(&tun->fasync, SIGIO, POLL_OUT);
1075}
1076
1077static void tun_sock_destruct(struct sock *sk)
1078{
1079        free_netdev(tun_sk(sk)->tun->dev);
1080}
1081
1082static int tun_sendmsg(struct kiocb *iocb, struct socket *sock,
1083                       struct msghdr *m, size_t total_len)
1084{
1085        struct tun_struct *tun = container_of(sock, struct tun_struct, socket);
1086        return tun_get_user(tun, m->msg_control, m->msg_iov, total_len,
1087                            m->msg_iovlen, m->msg_flags & MSG_DONTWAIT);
1088}
1089
1090static int tun_recvmsg(struct kiocb *iocb, struct socket *sock,
1091                       struct msghdr *m, size_t total_len,
1092                       int flags)
1093{
1094        struct tun_struct *tun = container_of(sock, struct tun_struct, socket);
1095        int ret;
1096        if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
1097                return -EINVAL;
1098        ret = tun_do_read(tun, iocb, m->msg_iov, total_len,
1099                          flags & MSG_DONTWAIT);
1100        if (ret > total_len) {
1101                m->msg_flags |= MSG_TRUNC;
1102                ret = flags & MSG_TRUNC ? ret : total_len;
1103        }
1104        return ret;
1105}
1106
1107static int tun_release(struct socket *sock)
1108{
1109        if (sock->sk)
1110                sock_put(sock->sk);
1111        return 0;
1112}
1113
1114/* Ops structure to mimic raw sockets with tun */
1115static const struct proto_ops tun_socket_ops = {
1116        .sendmsg = tun_sendmsg,
1117        .recvmsg = tun_recvmsg,
1118        .release = tun_release,
1119};
1120
1121static struct proto tun_proto = {
1122        .name           = "tun",
1123        .owner          = THIS_MODULE,
1124        .obj_size       = sizeof(struct tun_sock),
1125};
1126
1127static int tun_flags(struct tun_struct *tun)
1128{
1129        int flags = 0;
1130
1131        if (tun->flags & TUN_TUN_DEV)
1132                flags |= IFF_TUN;
1133        else
1134                flags |= IFF_TAP;
1135
1136        if (tun->flags & TUN_NO_PI)
1137                flags |= IFF_NO_PI;
1138
1139        if (tun->flags & TUN_ONE_QUEUE)
1140                flags |= IFF_ONE_QUEUE;
1141
1142        if (tun->flags & TUN_VNET_HDR)
1143                flags |= IFF_VNET_HDR;
1144
1145        return flags;
1146}
1147
1148static ssize_t tun_show_flags(struct device *dev, struct device_attribute *attr,
1149                              char *buf)
1150{
1151        struct tun_struct *tun = netdev_priv(to_net_dev(dev));
1152        return sprintf(buf, "0x%x\n", tun_flags(tun));
1153}
1154
1155static ssize_t tun_show_owner(struct device *dev, struct device_attribute *attr,
1156                              char *buf)
1157{
1158        struct tun_struct *tun = netdev_priv(to_net_dev(dev));
1159        return uid_valid(tun->owner)?
1160                sprintf(buf, "%u\n",
1161                        from_kuid_munged(current_user_ns(), tun->owner)):
1162                sprintf(buf, "-1\n");
1163}
1164
1165static ssize_t tun_show_group(struct device *dev, struct device_attribute *attr,
1166                              char *buf)
1167{
1168        struct tun_struct *tun = netdev_priv(to_net_dev(dev));
1169        return gid_valid(tun->group) ?
1170                sprintf(buf, "%u\n",
1171                        from_kgid_munged(current_user_ns(), tun->group)):
1172                sprintf(buf, "-1\n");
1173}
1174
/* Read-only (0444) sysfs attributes; created on the device in tun_set_iff(). */
static DEVICE_ATTR(tun_flags, 0444, tun_show_flags, NULL);
static DEVICE_ATTR(owner, 0444, tun_show_owner, NULL);
static DEVICE_ATTR(group, 0444, tun_show_group, NULL);
1178
1179static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
1180{
1181        struct sock *sk;
1182        struct tun_struct *tun;
1183        struct net_device *dev;
1184        int err;
1185
1186        dev = __dev_get_by_name(net, ifr->ifr_name);
1187        if (dev) {
1188                const struct cred *cred = current_cred();
1189
1190                if (ifr->ifr_flags & IFF_TUN_EXCL)
1191                        return -EBUSY;
1192                if ((ifr->ifr_flags & IFF_TUN) && dev->netdev_ops == &tun_netdev_ops)
1193                        tun = netdev_priv(dev);
1194                else if ((ifr->ifr_flags & IFF_TAP) && dev->netdev_ops == &tap_netdev_ops)
1195                        tun = netdev_priv(dev);
1196                else
1197                        return -EINVAL;
1198
1199                if (((uid_valid(tun->owner) && !uid_eq(cred->euid, tun->owner)) ||
1200                     (gid_valid(tun->group) && !in_egroup_p(tun->group))) &&
1201                    !capable(CAP_NET_ADMIN))
1202                        return -EPERM;
1203                err = security_tun_dev_attach(tun->socket.sk);
1204                if (err < 0)
1205                        return err;
1206
1207                err = tun_attach(tun, file);
1208                if (err < 0)
1209                        return err;
1210        }
1211        else {
1212                char *name;
1213                unsigned long flags = 0;
1214
1215                if (!capable(CAP_NET_ADMIN))
1216                        return -EPERM;
1217                err = security_tun_dev_create();
1218                if (err < 0)
1219                        return err;
1220
1221                /* Set dev type */
1222                if (ifr->ifr_flags & IFF_TUN) {
1223                        /* TUN device */
1224                        flags |= TUN_TUN_DEV;
1225                        name = "tun%d";
1226                } else if (ifr->ifr_flags & IFF_TAP) {
1227                        /* TAP device */
1228                        flags |= TUN_TAP_DEV;
1229                        name = "tap%d";
1230                } else
1231                        return -EINVAL;
1232
1233                if (*ifr->ifr_name)
1234                        name = ifr->ifr_name;
1235
1236                dev = alloc_netdev(sizeof(struct tun_struct), name,
1237                                   tun_setup);
1238                if (!dev)
1239                        return -ENOMEM;
1240
1241                dev_net_set(dev, net);
1242                dev->rtnl_link_ops = &tun_link_ops;
1243
1244                tun = netdev_priv(dev);
1245                tun->dev = dev;
1246                tun->flags = flags;
1247                tun->txflt.count = 0;
1248                tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
1249                set_bit(SOCK_EXTERNALLY_ALLOCATED, &tun->socket.flags);
1250
1251                err = -ENOMEM;
1252                sk = sk_alloc(&init_net, AF_UNSPEC, GFP_KERNEL, &tun_proto);
1253                if (!sk)
1254                        goto err_free_dev;
1255
1256                sk_change_net(sk, net);
1257                tun->socket.wq = &tun->wq;
1258                init_waitqueue_head(&tun->wq.wait);
1259                tun->socket.ops = &tun_socket_ops;
1260                sock_init_data(&tun->socket, sk);
1261                sk->sk_write_space = tun_sock_write_space;
1262                sk->sk_sndbuf = INT_MAX;
1263                sock_set_flag(sk, SOCK_ZEROCOPY);
1264
1265                tun_sk(sk)->tun = tun;
1266
1267                security_tun_dev_post_create(sk);
1268
1269                tun_net_init(dev);
1270
1271                dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
1272                        TUN_USER_FEATURES;
1273                dev->features = dev->hw_features;
1274
1275                err = register_netdevice(tun->dev);
1276                if (err < 0)
1277                        goto err_free_sk;
1278
1279                if (device_create_file(&tun->dev->dev, &dev_attr_tun_flags) ||
1280                    device_create_file(&tun->dev->dev, &dev_attr_owner) ||
1281                    device_create_file(&tun->dev->dev, &dev_attr_group))
1282                        pr_err("Failed to create tun sysfs files\n");
1283
1284                sk->sk_destruct = tun_sock_destruct;
1285
1286                err = tun_attach(tun, file);
1287                if (err < 0)
1288                        goto failed;
1289        }
1290
1291        tun_debug(KERN_INFO, tun, "tun_set_iff\n");
1292
1293        if (ifr->ifr_flags & IFF_NO_PI)
1294                tun->flags |= TUN_NO_PI;
1295        else
1296                tun->flags &= ~TUN_NO_PI;
1297
1298        if (ifr->ifr_flags & IFF_ONE_QUEUE)
1299                tun->flags |= TUN_ONE_QUEUE;
1300        else
1301                tun->flags &= ~TUN_ONE_QUEUE;
1302
1303        if (ifr->ifr_flags & IFF_VNET_HDR)
1304                tun->flags |= TUN_VNET_HDR;
1305        else
1306                tun->flags &= ~TUN_VNET_HDR;
1307
1308        /* Make sure persistent devices do not get stuck in
1309         * xoff state.
1310         */
1311        if (netif_running(tun->dev))
1312                netif_wake_queue(tun->dev);
1313
1314        strcpy(ifr->ifr_name, tun->dev->name);
1315        return 0;
1316
1317 err_free_sk:
1318        tun_free_netdev(dev);
1319 err_free_dev:
1320        free_netdev(dev);
1321 failed:
1322        return err;
1323}
1324
1325static int tun_get_iff(struct net *net, struct tun_struct *tun,
1326                       struct ifreq *ifr)
1327{
1328        tun_debug(KERN_INFO, tun, "tun_get_iff\n");
1329
1330        strcpy(ifr->ifr_name, tun->dev->name);
1331
1332        ifr->ifr_flags = tun_flags(tun);
1333
1334        return 0;
1335}
1336
1337/* This is like a cut-down ethtool ops, except done via tun fd so no
1338 * privs required. */
1339static int set_offload(struct tun_struct *tun, unsigned long arg)
1340{
1341        netdev_features_t features = 0;
1342
1343        if (arg & TUN_F_CSUM) {
1344                features |= NETIF_F_HW_CSUM;
1345                arg &= ~TUN_F_CSUM;
1346
1347                if (arg & (TUN_F_TSO4|TUN_F_TSO6)) {
1348                        if (arg & TUN_F_TSO_ECN) {
1349                                features |= NETIF_F_TSO_ECN;
1350                                arg &= ~TUN_F_TSO_ECN;
1351                        }
1352                        if (arg & TUN_F_TSO4)
1353                                features |= NETIF_F_TSO;
1354                        if (arg & TUN_F_TSO6)
1355                                features |= NETIF_F_TSO6;
1356                        arg &= ~(TUN_F_TSO4|TUN_F_TSO6);
1357                }
1358
1359                if (arg & TUN_F_UFO) {
1360                        features |= NETIF_F_UFO;
1361                        arg &= ~TUN_F_UFO;
1362                }
1363        }
1364
1365        /* This gives the user a way to test for new features in future by
1366         * trying to set them. */
1367        if (arg)
1368                return -EINVAL;
1369
1370        tun->set_features = features;
1371        netdev_update_features(tun->dev);
1372
1373        return 0;
1374}
1375
1376static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
1377                            unsigned long arg, int ifreq_len)
1378{
1379        struct tun_file *tfile = file->private_data;
1380        struct tun_struct *tun;
1381        void __user* argp = (void __user*)arg;
1382        struct sock_fprog fprog;
1383        struct ifreq ifr;
1384        kuid_t owner;
1385        kgid_t group;
1386        int sndbuf;
1387        int vnet_hdr_sz;
1388        int ret;
1389
1390        if (cmd == TUNSETIFF || _IOC_TYPE(cmd) == 0x89) {
1391                if (copy_from_user(&ifr, argp, ifreq_len))
1392                        return -EFAULT;
1393        } else {
1394                memset(&ifr, 0, sizeof(ifr));
1395        }
1396        if (cmd == TUNGETFEATURES) {
1397                /* Currently this just means: "what IFF flags are valid?".
1398                 * This is needed because we never checked for invalid flags on
1399                 * TUNSETIFF. */
1400                return put_user(IFF_TUN | IFF_TAP | IFF_NO_PI | IFF_ONE_QUEUE |
1401                                IFF_VNET_HDR,
1402                                (unsigned int __user*)argp);
1403        }
1404
1405        rtnl_lock();
1406
1407        tun = __tun_get(tfile);
1408        if (cmd == TUNSETIFF && !tun) {
1409                ifr.ifr_name[IFNAMSIZ-1] = '\0';
1410
1411                ret = tun_set_iff(tfile->net, file, &ifr);
1412
1413                if (ret)
1414                        goto unlock;
1415
1416                if (copy_to_user(argp, &ifr, ifreq_len))
1417                        ret = -EFAULT;
1418                goto unlock;
1419        }
1420
1421        ret = -EBADFD;
1422        if (!tun)
1423                goto unlock;
1424
1425        tun_debug(KERN_INFO, tun, "tun_chr_ioctl cmd %d\n", cmd);
1426
1427        ret = 0;
1428        switch (cmd) {
1429        case TUNGETIFF:
1430                ret = tun_get_iff(current->nsproxy->net_ns, tun, &ifr);
1431                if (ret)
1432                        break;
1433
1434                if (copy_to_user(argp, &ifr, ifreq_len))
1435                        ret = -EFAULT;
1436                break;
1437
1438        case TUNSETNOCSUM:
1439                /* Disable/Enable checksum */
1440
1441                /* [unimplemented] */
1442                tun_debug(KERN_INFO, tun, "ignored: set checksum %s\n",
1443                          arg ? "disabled" : "enabled");
1444                break;
1445
1446        case TUNSETPERSIST:
1447                /* Disable/Enable persist mode */
1448                if (arg)
1449                        tun->flags |= TUN_PERSIST;
1450                else
1451                        tun->flags &= ~TUN_PERSIST;
1452
1453                tun_debug(KERN_INFO, tun, "persist %s\n",
1454                          arg ? "enabled" : "disabled");
1455                break;
1456
1457        case TUNSETOWNER:
1458                /* Set owner of the device */
1459                owner = make_kuid(current_user_ns(), arg);
1460                if (!uid_valid(owner)) {
1461                        ret = -EINVAL;
1462                        break;
1463                }
1464                tun->owner = owner;
1465                tun_debug(KERN_INFO, tun, "owner set to %d\n",
1466                          from_kuid(&init_user_ns, tun->owner));
1467                break;
1468
1469        case TUNSETGROUP:
1470                /* Set group of the device */
1471                group = make_kgid(current_user_ns(), arg);
1472                if (!gid_valid(group)) {
1473                        ret = -EINVAL;
1474                        break;
1475                }
1476                tun->group = group;
1477                tun_debug(KERN_INFO, tun, "group set to %d\n",
1478                          from_kgid(&init_user_ns, tun->group));
1479                break;
1480
1481        case TUNSETLINK:
1482                /* Only allow setting the type when the interface is down */
1483                if (tun->dev->flags & IFF_UP) {
1484                        tun_debug(KERN_INFO, tun,
1485                                  "Linktype set failed because interface is up\n");
1486                        ret = -EBUSY;
1487                } else {
1488                        tun->dev->type = (int) arg;
1489                        tun_debug(KERN_INFO, tun, "linktype set to %d\n",
1490                                  tun->dev->type);
1491                        ret = 0;
1492                }
1493                break;
1494
1495#ifdef TUN_DEBUG
1496        case TUNSETDEBUG:
1497                tun->debug = arg;
1498                break;
1499#endif
1500        case TUNSETOFFLOAD:
1501                ret = set_offload(tun, arg);
1502                break;
1503
1504        case TUNSETTXFILTER:
1505                /* Can be set only for TAPs */
1506                ret = -EINVAL;
1507                if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV)
1508                        break;
1509                ret = update_filter(&tun->txflt, (void __user *)arg);
1510                break;
1511
1512        case SIOCGIFHWADDR:
1513                /* Get hw address */
1514                memcpy(ifr.ifr_hwaddr.sa_data, tun->dev->dev_addr, ETH_ALEN);
1515                ifr.ifr_hwaddr.sa_family = tun->dev->type;
1516                if (copy_to_user(argp, &ifr, ifreq_len))
1517                        ret = -EFAULT;
1518                break;
1519
1520        case SIOCSIFHWADDR:
1521                /* Set hw address */
1522                tun_debug(KERN_DEBUG, tun, "set hw address: %pM\n",
1523                          ifr.ifr_hwaddr.sa_data);
1524
1525                ret = dev_set_mac_address(tun->dev, &ifr.ifr_hwaddr);
1526                break;
1527
1528        case TUNGETSNDBUF:
1529                sndbuf = tun->socket.sk->sk_sndbuf;
1530                if (copy_to_user(argp, &sndbuf, sizeof(sndbuf)))
1531                        ret = -EFAULT;
1532                break;
1533
1534        case TUNSETSNDBUF:
1535                if (copy_from_user(&sndbuf, argp, sizeof(sndbuf))) {
1536                        ret = -EFAULT;
1537                        break;
1538                }
1539
1540                tun->socket.sk->sk_sndbuf = sndbuf;
1541                break;
1542
1543        case TUNGETVNETHDRSZ:
1544                vnet_hdr_sz = tun->vnet_hdr_sz;
1545                if (copy_to_user(argp, &vnet_hdr_sz, sizeof(vnet_hdr_sz)))
1546                        ret = -EFAULT;
1547                break;
1548
1549        case TUNSETVNETHDRSZ:
1550                if (copy_from_user(&vnet_hdr_sz, argp, sizeof(vnet_hdr_sz))) {
1551                        ret = -EFAULT;
1552                        break;
1553                }
1554                if (vnet_hdr_sz < (int)sizeof(struct virtio_net_hdr)) {
1555                        ret = -EINVAL;
1556                        break;
1557                }
1558
1559                tun->vnet_hdr_sz = vnet_hdr_sz;
1560                break;
1561
1562        case TUNATTACHFILTER:
1563                /* Can be set only for TAPs */
1564                ret = -EINVAL;
1565                if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV)
1566                        break;
1567                ret = -EFAULT;
1568                if (copy_from_user(&fprog, argp, sizeof(fprog)))
1569                        break;
1570
1571                ret = sk_attach_filter(&fprog, tun->socket.sk);
1572                break;
1573
1574        case TUNDETACHFILTER:
1575                /* Can be set only for TAPs */
1576                ret = -EINVAL;
1577                if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV)
1578                        break;
1579                ret = sk_detach_filter(tun->socket.sk);
1580                break;
1581
1582        default:
1583                ret = -EINVAL;
1584                break;
1585        }
1586
1587unlock:
1588        rtnl_unlock();
1589        if (tun)
1590                tun_put(tun);
1591        return ret;
1592}
1593
1594static long tun_chr_ioctl(struct file *file,
1595                          unsigned int cmd, unsigned long arg)
1596{
1597        return __tun_chr_ioctl(file, cmd, arg, sizeof (struct ifreq));
1598}
1599
1600#ifdef CONFIG_COMPAT
1601static long tun_chr_compat_ioctl(struct file *file,
1602                         unsigned int cmd, unsigned long arg)
1603{
1604        switch (cmd) {
1605        case TUNSETIFF:
1606        case TUNGETIFF:
1607        case TUNSETTXFILTER:
1608        case TUNGETSNDBUF:
1609        case TUNSETSNDBUF:
1610        case SIOCGIFHWADDR:
1611        case SIOCSIFHWADDR:
1612                arg = (unsigned long)compat_ptr(arg);
1613                break;
1614        default:
1615                arg = (compat_ulong_t)arg;
1616                break;
1617        }
1618
1619        /*
1620         * compat_ifreq is shorter than ifreq, so we must not access beyond
1621         * the end of that structure. All fields that are used in this
1622         * driver are compatible though, we don't need to convert the
1623         * contents.
1624         */
1625        return __tun_chr_ioctl(file, cmd, arg, sizeof(struct compat_ifreq));
1626}
1627#endif /* CONFIG_COMPAT */
1628
1629static int tun_chr_fasync(int fd, struct file *file, int on)
1630{
1631        struct tun_struct *tun = tun_get(file);
1632        int ret;
1633
1634        if (!tun)
1635                return -EBADFD;
1636
1637        tun_debug(KERN_INFO, tun, "tun_chr_fasync %d\n", on);
1638
1639        if ((ret = fasync_helper(fd, file, on, &tun->fasync)) < 0)
1640                goto out;
1641
1642        if (on) {
1643                ret = __f_setown(file, task_pid(current), PIDTYPE_PID, 0);
1644                if (ret)
1645                        goto out;
1646                tun->flags |= TUN_FASYNC;
1647        } else
1648                tun->flags &= ~TUN_FASYNC;
1649        ret = 0;
1650out:
1651        tun_put(tun);
1652        return ret;
1653}
1654
1655static int tun_chr_open(struct inode *inode, struct file * file)
1656{
1657        struct tun_file *tfile;
1658
1659        DBG1(KERN_INFO, "tunX: tun_chr_open\n");
1660
1661        tfile = kmalloc(sizeof(*tfile), GFP_KERNEL);
1662        if (!tfile)
1663                return -ENOMEM;
1664        atomic_set(&tfile->count, 0);
1665        tfile->tun = NULL;
1666        tfile->net = get_net(current->nsproxy->net_ns);
1667        file->private_data = tfile;
1668        return 0;
1669}
1670
1671static int tun_chr_close(struct inode *inode, struct file *file)
1672{
1673        struct tun_file *tfile = file->private_data;
1674        struct tun_struct *tun;
1675
1676        tun = __tun_get(tfile);
1677        if (tun) {
1678                struct net_device *dev = tun->dev;
1679
1680                tun_debug(KERN_INFO, tun, "tun_chr_close\n");
1681
1682                __tun_detach(tun);
1683
1684                /* If desirable, unregister the netdevice. */
1685                if (!(tun->flags & TUN_PERSIST)) {
1686                        rtnl_lock();
1687                        if (dev->reg_state == NETREG_REGISTERED)
1688                                unregister_netdevice(dev);
1689                        rtnl_unlock();
1690                }
1691        }
1692
1693        tun = tfile->tun;
1694        if (tun)
1695                sock_put(tun->socket.sk);
1696
1697        put_net(tfile->net);
1698        kfree(tfile);
1699
1700        return 0;
1701}
1702
1703static const struct file_operations tun_fops = {
1704        .owner  = THIS_MODULE,
1705        .llseek = no_llseek,
1706        .read  = do_sync_read,
1707        .aio_read  = tun_chr_aio_read,
1708        .write = do_sync_write,
1709        .aio_write = tun_chr_aio_write,
1710        .poll   = tun_chr_poll,
1711        .unlocked_ioctl = tun_chr_ioctl,
1712#ifdef CONFIG_COMPAT
1713        .compat_ioctl = tun_chr_compat_ioctl,
1714#endif
1715        .open   = tun_chr_open,
1716        .release = tun_chr_close,
1717        .fasync = tun_chr_fasync
1718};
1719
1720static struct miscdevice tun_miscdev = {
1721        .minor = TUN_MINOR,
1722        .name = "tun",
1723        .nodename = "net/tun",
1724        .fops = &tun_fops,
1725};
1726
1727/* ethtool interface */
1728
1729static int tun_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
1730{
1731        cmd->supported          = 0;
1732        cmd->advertising        = 0;
1733        ethtool_cmd_speed_set(cmd, SPEED_10);
1734        cmd->duplex             = DUPLEX_FULL;
1735        cmd->port               = PORT_TP;
1736        cmd->phy_address        = 0;
1737        cmd->transceiver        = XCVR_INTERNAL;
1738        cmd->autoneg            = AUTONEG_DISABLE;
1739        cmd->maxtxpkt           = 0;
1740        cmd->maxrxpkt           = 0;
1741        return 0;
1742}
1743
1744static void tun_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
1745{
1746        struct tun_struct *tun = netdev_priv(dev);
1747
1748        strlcpy(info->driver, DRV_NAME, sizeof(info->driver));
1749        strlcpy(info->version, DRV_VERSION, sizeof(info->version));
1750
1751        switch (tun->flags & TUN_TYPE_MASK) {
1752        case TUN_TUN_DEV:
1753                strlcpy(info->bus_info, "tun", sizeof(info->bus_info));
1754                break;
1755        case TUN_TAP_DEV:
1756                strlcpy(info->bus_info, "tap", sizeof(info->bus_info));
1757                break;
1758        }
1759}
1760
1761static u32 tun_get_msglevel(struct net_device *dev)
1762{
1763#ifdef TUN_DEBUG
1764        struct tun_struct *tun = netdev_priv(dev);
1765        return tun->debug;
1766#else
1767        return -EOPNOTSUPP;
1768#endif
1769}
1770
1771static void tun_set_msglevel(struct net_device *dev, u32 value)
1772{
1773#ifdef TUN_DEBUG
1774        struct tun_struct *tun = netdev_priv(dev);
1775        tun->debug = value;
1776#endif
1777}
1778
1779static const struct ethtool_ops tun_ethtool_ops = {
1780        .get_settings   = tun_get_settings,
1781        .get_drvinfo    = tun_get_drvinfo,
1782        .get_msglevel   = tun_get_msglevel,
1783        .set_msglevel   = tun_set_msglevel,
1784        .get_link       = ethtool_op_get_link,
1785};
1786
1787
1788static int __init tun_init(void)
1789{
1790        int ret = 0;
1791
1792        pr_info("%s, %s\n", DRV_DESCRIPTION, DRV_VERSION);
1793        pr_info("%s\n", DRV_COPYRIGHT);
1794
1795        ret = rtnl_link_register(&tun_link_ops);
1796        if (ret) {
1797                pr_err("Can't register link_ops\n");
1798                goto err_linkops;
1799        }
1800
1801        ret = misc_register(&tun_miscdev);
1802        if (ret) {
1803                pr_err("Can't register misc device %d\n", TUN_MINOR);
1804                goto err_misc;
1805        }
1806        return  0;
1807err_misc:
1808        rtnl_link_unregister(&tun_link_ops);
1809err_linkops:
1810        return ret;
1811}
1812
1813static void tun_cleanup(void)
1814{
1815        misc_deregister(&tun_miscdev);
1816        rtnl_link_unregister(&tun_link_ops);
1817}
1818
1819/* Get an underlying socket object from tun file.  Returns error unless file is
1820 * attached to a device.  The returned object works like a packet socket, it
1821 * can be used for sock_sendmsg/sock_recvmsg.  The caller is responsible for
1822 * holding a reference to the file for as long as the socket is in use. */
1823struct socket *tun_get_socket(struct file *file)
1824{
1825        struct tun_struct *tun;
1826        if (file->f_op != &tun_fops)
1827                return ERR_PTR(-EINVAL);
1828        tun = tun_get(file);
1829        if (!tun)
1830                return ERR_PTR(-EBADFD);
1831        tun_put(tun);
1832        return &tun->socket;
1833}
1834EXPORT_SYMBOL_GPL(tun_get_socket);
1835
1836module_init(tun_init);
1837module_exit(tun_cleanup);
1838MODULE_DESCRIPTION(DRV_DESCRIPTION);
1839MODULE_AUTHOR(DRV_COPYRIGHT);
1840MODULE_LICENSE("GPL");
1841MODULE_ALIAS_MISCDEV(TUN_MINOR);
1842MODULE_ALIAS("devname:net/tun");
1843
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.