linux/drivers/infiniband/ulp/ipoib/ipoib_main.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2004 Topspin Communications.  All rights reserved.
   3 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
   4 * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
   5 *
   6 * This software is available to you under a choice of one of two
   7 * licenses.  You may choose to be licensed under the terms of the GNU
   8 * General Public License (GPL) Version 2, available from the file
   9 * COPYING in the main directory of this source tree, or the
  10 * OpenIB.org BSD license below:
  11 *
  12 *     Redistribution and use in source and binary forms, with or
  13 *     without modification, are permitted provided that the following
  14 *     conditions are met:
  15 *
  16 *      - Redistributions of source code must retain the above
  17 *        copyright notice, this list of conditions and the following
  18 *        disclaimer.
  19 *
  20 *      - Redistributions in binary form must reproduce the above
  21 *        copyright notice, this list of conditions and the following
  22 *        disclaimer in the documentation and/or other materials
  23 *        provided with the distribution.
  24 *
  25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  26 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  27 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  28 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  29 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  30 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  31 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  32 * SOFTWARE.
  33 *
  34 * $Id: ipoib_main.c 1377 2004-12-23 19:57:12Z roland $
  35 */
  36
  37#include "ipoib.h"
  38
  39#include <linux/module.h>
  40
  41#include <linux/init.h>
  42#include <linux/slab.h>
  43#include <linux/vmalloc.h>
  44#include <linux/kernel.h>
  45
  46#include <linux/if_arp.h>       /* For ARPHRD_xxx */
  47
  48#include <linux/ip.h>
  49#include <linux/in.h>
  50
  51#include <net/dst.h>
  52
  53MODULE_AUTHOR("Roland Dreier");
  54MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
  55MODULE_LICENSE("Dual BSD/GPL");
  56
  57int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE;
  58int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE;
  59
  60module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);
  61MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");
  62module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
  63MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");
  64
  65#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
  66int ipoib_debug_level;
  67
  68module_param_named(debug_level, ipoib_debug_level, int, 0644);
  69MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
  70#endif
  71
  72struct ipoib_path_iter {
  73        struct net_device *dev;
  74        struct ipoib_path  path;
  75};
  76
  77static const u8 ipv4_bcast_addr[] = {
  78        0x00, 0xff, 0xff, 0xff,
  79        0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
  80        0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff
  81};
  82
  83struct workqueue_struct *ipoib_workqueue;
  84
  85static void ipoib_add_one(struct ib_device *device);
  86static void ipoib_remove_one(struct ib_device *device);
  87
  88static struct ib_client ipoib_client = {
  89        .name   = "ipoib",
  90        .add    = ipoib_add_one,
  91        .remove = ipoib_remove_one
  92};
  93
  94int ipoib_open(struct net_device *dev)
  95{
  96        struct ipoib_dev_priv *priv = netdev_priv(dev);
  97
  98        ipoib_dbg(priv, "bringing up interface\n");
  99
 100        set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
 101
 102        if (ipoib_pkey_dev_delay_open(dev))
 103                return 0;
 104
 105        if (ipoib_ib_dev_open(dev))
 106                return -EINVAL;
 107
 108        if (ipoib_ib_dev_up(dev)) {
 109                ipoib_ib_dev_stop(dev);
 110                return -EINVAL;
 111        }
 112
 113        if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
 114                struct ipoib_dev_priv *cpriv;
 115
 116                /* Bring up any child interfaces too */
 117                mutex_lock(&priv->vlan_mutex);
 118                list_for_each_entry(cpriv, &priv->child_intfs, list) {
 119                        int flags;
 120
 121                        flags = cpriv->dev->flags;
 122                        if (flags & IFF_UP)
 123                                continue;
 124
 125                        dev_change_flags(cpriv->dev, flags | IFF_UP);
 126                }
 127                mutex_unlock(&priv->vlan_mutex);
 128        }
 129
 130        netif_start_queue(dev);
 131
 132        return 0;
 133}
 134
 135static int ipoib_stop(struct net_device *dev)
 136{
 137        struct ipoib_dev_priv *priv = netdev_priv(dev);
 138
 139        ipoib_dbg(priv, "stopping interface\n");
 140
 141        clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
 142
 143        netif_stop_queue(dev);
 144
 145        /*
 146         * Now flush workqueue to make sure a scheduled task doesn't
 147         * bring our internal state back up.
 148         */
 149        flush_workqueue(ipoib_workqueue);
 150
 151        ipoib_ib_dev_down(dev, 1);
 152        ipoib_ib_dev_stop(dev);
 153
 154        if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
 155                struct ipoib_dev_priv *cpriv;
 156
 157                /* Bring down any child interfaces too */
 158                mutex_lock(&priv->vlan_mutex);
 159                list_for_each_entry(cpriv, &priv->child_intfs, list) {
 160                        int flags;
 161
 162                        flags = cpriv->dev->flags;
 163                        if (!(flags & IFF_UP))
 164                                continue;
 165
 166                        dev_change_flags(cpriv->dev, flags & ~IFF_UP);
 167                }
 168                mutex_unlock(&priv->vlan_mutex);
 169        }
 170
 171        return 0;
 172}
 173
 174static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
 175{
 176        struct ipoib_dev_priv *priv = netdev_priv(dev);
 177
 178        if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN)
 179                return -EINVAL;
 180
 181        priv->admin_mtu = new_mtu;
 182
 183        dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
 184
 185        return 0;
 186}
 187
 188static struct ipoib_path *__path_find(struct net_device *dev, void *gid)
 189{
 190        struct ipoib_dev_priv *priv = netdev_priv(dev);
 191        struct rb_node *n = priv->path_tree.rb_node;
 192        struct ipoib_path *path;
 193        int ret;
 194
 195        while (n) {
 196                path = rb_entry(n, struct ipoib_path, rb_node);
 197
 198                ret = memcmp(gid, path->pathrec.dgid.raw,
 199                             sizeof (union ib_gid));
 200
 201                if (ret < 0)
 202                        n = n->rb_left;
 203                else if (ret > 0)
 204                        n = n->rb_right;
 205                else
 206                        return path;
 207        }
 208
 209        return NULL;
 210}
 211
 212static int __path_add(struct net_device *dev, struct ipoib_path *path)
 213{
 214        struct ipoib_dev_priv *priv = netdev_priv(dev);
 215        struct rb_node **n = &priv->path_tree.rb_node;
 216        struct rb_node *pn = NULL;
 217        struct ipoib_path *tpath;
 218        int ret;
 219
 220        while (*n) {
 221                pn = *n;
 222                tpath = rb_entry(pn, struct ipoib_path, rb_node);
 223
 224                ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw,
 225                             sizeof (union ib_gid));
 226                if (ret < 0)
 227                        n = &pn->rb_left;
 228                else if (ret > 0)
 229                        n = &pn->rb_right;
 230                else
 231                        return -EEXIST;
 232        }
 233
 234        rb_link_node(&path->rb_node, pn, n);
 235        rb_insert_color(&path->rb_node, &priv->path_tree);
 236
 237        list_add_tail(&path->list, &priv->path_list);
 238
 239        return 0;
 240}
 241
 242static void path_free(struct net_device *dev, struct ipoib_path *path)
 243{
 244        struct ipoib_dev_priv *priv = netdev_priv(dev);
 245        struct ipoib_neigh *neigh, *tn;
 246        struct sk_buff *skb;
 247        unsigned long flags;
 248
 249        while ((skb = __skb_dequeue(&path->queue)))
 250                dev_kfree_skb_irq(skb);
 251
 252        spin_lock_irqsave(&priv->lock, flags);
 253
 254        list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
 255                /*
 256                 * It's safe to call ipoib_put_ah() inside priv->lock
 257                 * here, because we know that path->ah will always
 258                 * hold one more reference, so ipoib_put_ah() will
 259                 * never do more than decrement the ref count.
 260                 */
 261                if (neigh->ah)
 262                        ipoib_put_ah(neigh->ah);
 263
 264                ipoib_neigh_free(neigh);
 265        }
 266
 267        spin_unlock_irqrestore(&priv->lock, flags);
 268
 269        if (path->ah)
 270                ipoib_put_ah(path->ah);
 271
 272        kfree(path);
 273}
 274
 275#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
 276
 277struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev)
 278{
 279        struct ipoib_path_iter *iter;
 280
 281        iter = kmalloc(sizeof *iter, GFP_KERNEL);
 282        if (!iter)
 283                return NULL;
 284
 285        iter->dev = dev;
 286        memset(iter->path.pathrec.dgid.raw, 0, 16);
 287
 288        if (ipoib_path_iter_next(iter)) {
 289                kfree(iter);
 290                return NULL;
 291        }
 292
 293        return iter;
 294}
 295
 296int ipoib_path_iter_next(struct ipoib_path_iter *iter)
 297{
 298        struct ipoib_dev_priv *priv = netdev_priv(iter->dev);
 299        struct rb_node *n;
 300        struct ipoib_path *path;
 301        int ret = 1;
 302
 303        spin_lock_irq(&priv->lock);
 304
 305        n = rb_first(&priv->path_tree);
 306
 307        while (n) {
 308                path = rb_entry(n, struct ipoib_path, rb_node);
 309
 310                if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw,
 311                           sizeof (union ib_gid)) < 0) {
 312                        iter->path = *path;
 313                        ret = 0;
 314                        break;
 315                }
 316
 317                n = rb_next(n);
 318        }
 319
 320        spin_unlock_irq(&priv->lock);
 321
 322        return ret;
 323}
 324
 325void ipoib_path_iter_read(struct ipoib_path_iter *iter,
 326                          struct ipoib_path *path)
 327{
 328        *path = iter->path;
 329}
 330
 331#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
 332
 333void ipoib_flush_paths(struct net_device *dev)
 334{
 335        struct ipoib_dev_priv *priv = netdev_priv(dev);
 336        struct ipoib_path *path, *tp;
 337        LIST_HEAD(remove_list);
 338
 339        spin_lock_irq(&priv->lock);
 340
 341        list_splice(&priv->path_list, &remove_list);
 342        INIT_LIST_HEAD(&priv->path_list);
 343
 344        list_for_each_entry(path, &remove_list, list)
 345                rb_erase(&path->rb_node, &priv->path_tree);
 346
 347        list_for_each_entry_safe(path, tp, &remove_list, list) {
 348                if (path->query)
 349                        ib_sa_cancel_query(path->query_id, path->query);
 350                spin_unlock_irq(&priv->lock);
 351                wait_for_completion(&path->done);
 352                path_free(dev, path);
 353                spin_lock_irq(&priv->lock);
 354        }
 355        spin_unlock_irq(&priv->lock);
 356}
 357
 358static void path_rec_completion(int status,
 359                                struct ib_sa_path_rec *pathrec,
 360                                void *path_ptr)
 361{
 362        struct ipoib_path *path = path_ptr;
 363        struct net_device *dev = path->dev;
 364        struct ipoib_dev_priv *priv = netdev_priv(dev);
 365        struct ipoib_ah *ah = NULL;
 366        struct ipoib_neigh *neigh;
 367        struct sk_buff_head skqueue;
 368        struct sk_buff *skb;
 369        unsigned long flags;
 370
 371        if (pathrec)
 372                ipoib_dbg(priv, "PathRec LID 0x%04x for GID " IPOIB_GID_FMT "\n",
 373                          be16_to_cpu(pathrec->dlid), IPOIB_GID_ARG(pathrec->dgid));
 374        else
 375                ipoib_dbg(priv, "PathRec status %d for GID " IPOIB_GID_FMT "\n",
 376                          status, IPOIB_GID_ARG(path->pathrec.dgid));
 377
 378        skb_queue_head_init(&skqueue);
 379
 380        if (!status) {
 381                struct ib_ah_attr av = {
 382                        .dlid          = be16_to_cpu(pathrec->dlid),
 383                        .sl            = pathrec->sl,
 384                        .port_num      = priv->port,
 385                        .static_rate   = pathrec->rate
 386                };
 387
 388                ah = ipoib_create_ah(dev, priv->pd, &av);
 389        }
 390
 391        spin_lock_irqsave(&priv->lock, flags);
 392
 393        path->ah = ah;
 394
 395        if (ah) {
 396                path->pathrec = *pathrec;
 397
 398                ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n",
 399                          ah, be16_to_cpu(pathrec->dlid), pathrec->sl);
 400
 401                while ((skb = __skb_dequeue(&path->queue)))
 402                        __skb_queue_tail(&skqueue, skb);
 403
 404                list_for_each_entry(neigh, &path->neigh_list, list) {
 405                        kref_get(&path->ah->ref);
 406                        neigh->ah = path->ah;
 407                        memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw,
 408                               sizeof(union ib_gid));
 409
 410                        while ((skb = __skb_dequeue(&neigh->queue)))
 411                                __skb_queue_tail(&skqueue, skb);
 412                }
 413        }
 414
 415        path->query = NULL;
 416        complete(&path->done);
 417
 418        spin_unlock_irqrestore(&priv->lock, flags);
 419
 420        while ((skb = __skb_dequeue(&skqueue))) {
 421                skb->dev = dev;
 422                if (dev_queue_xmit(skb))
 423                        ipoib_warn(priv, "dev_queue_xmit failed "
 424                                   "to requeue packet\n");
 425        }
 426}
 427
 428static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid)
 429{
 430        struct ipoib_dev_priv *priv = netdev_priv(dev);
 431        struct ipoib_path *path;
 432
 433        path = kzalloc(sizeof *path, GFP_ATOMIC);
 434        if (!path)
 435                return NULL;
 436
 437        path->dev = dev;
 438
 439        skb_queue_head_init(&path->queue);
 440
 441        INIT_LIST_HEAD(&path->neigh_list);
 442
 443        memcpy(path->pathrec.dgid.raw, gid, sizeof (union ib_gid));
 444        path->pathrec.sgid      = priv->local_gid;
 445        path->pathrec.pkey      = cpu_to_be16(priv->pkey);
 446        path->pathrec.numb_path = 1;
 447
 448        return path;
 449}
 450
 451static int path_rec_start(struct net_device *dev,
 452                          struct ipoib_path *path)
 453{
 454        struct ipoib_dev_priv *priv = netdev_priv(dev);
 455
 456        ipoib_dbg(priv, "Start path record lookup for " IPOIB_GID_FMT "\n",
 457                  IPOIB_GID_ARG(path->pathrec.dgid));
 458
 459        init_completion(&path->done);
 460
 461        path->query_id =
 462                ib_sa_path_rec_get(priv->ca, priv->port,
 463                                   &path->pathrec,
 464                                   IB_SA_PATH_REC_DGID          |
 465                                   IB_SA_PATH_REC_SGID          |
 466                                   IB_SA_PATH_REC_NUMB_PATH     |
 467                                   IB_SA_PATH_REC_PKEY,
 468                                   1000, GFP_ATOMIC,
 469                                   path_rec_completion,
 470                                   path, &path->query);
 471        if (path->query_id < 0) {
 472                ipoib_warn(priv, "ib_sa_path_rec_get failed\n");
 473                path->query = NULL;
 474                return path->query_id;
 475        }
 476
 477        return 0;
 478}
 479
 480static void neigh_add_path(struct sk_buff *skb, struct net_device *dev)
 481{
 482        struct ipoib_dev_priv *priv = netdev_priv(dev);
 483        struct ipoib_path *path;
 484        struct ipoib_neigh *neigh;
 485
 486        neigh = ipoib_neigh_alloc(skb->dst->neighbour);
 487        if (!neigh) {
 488                ++priv->stats.tx_dropped;
 489                dev_kfree_skb_any(skb);
 490                return;
 491        }
 492
 493        skb_queue_head_init(&neigh->queue);
 494
 495        /*
 496         * We can only be called from ipoib_start_xmit, so we're
 497         * inside tx_lock -- no need to save/restore flags.
 498         */
 499        spin_lock(&priv->lock);
 500
 501        path = __path_find(dev, skb->dst->neighbour->ha + 4);
 502        if (!path) {
 503                path = path_rec_create(dev, skb->dst->neighbour->ha + 4);
 504                if (!path)
 505                        goto err_path;
 506
 507                __path_add(dev, path);
 508        }
 509
 510        list_add_tail(&neigh->list, &path->neigh_list);
 511
 512        if (path->ah) {
 513                kref_get(&path->ah->ref);
 514                neigh->ah = path->ah;
 515                memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw,
 516                       sizeof(union ib_gid));
 517
 518                ipoib_send(dev, skb, path->ah,
 519                           be32_to_cpup((__be32 *) skb->dst->neighbour->ha));
 520        } else {
 521                neigh->ah  = NULL;
 522                __skb_queue_tail(&neigh->queue, skb);
 523
 524                if (!path->query && path_rec_start(dev, path))
 525                        goto err_list;
 526        }
 527
 528        spin_unlock(&priv->lock);
 529        return;
 530
 531err_list:
 532        list_del(&neigh->list);
 533
 534err_path:
 535        ipoib_neigh_free(neigh);
 536        ++priv->stats.tx_dropped;
 537        dev_kfree_skb_any(skb);
 538
 539        spin_unlock(&priv->lock);
 540}
 541
 542static void ipoib_path_lookup(struct sk_buff *skb, struct net_device *dev)
 543{
 544        struct ipoib_dev_priv *priv = netdev_priv(skb->dev);
 545
 546        /* Look up path record for unicasts */
 547        if (skb->dst->neighbour->ha[4] != 0xff) {
 548                neigh_add_path(skb, dev);
 549                return;
 550        }
 551
 552        /* Add in the P_Key for multicasts */
 553        skb->dst->neighbour->ha[8] = (priv->pkey >> 8) & 0xff;
 554        skb->dst->neighbour->ha[9] = priv->pkey & 0xff;
 555        ipoib_mcast_send(dev, skb->dst->neighbour->ha + 4, skb);
 556}
 557
 558static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
 559                             struct ipoib_pseudoheader *phdr)
 560{
 561        struct ipoib_dev_priv *priv = netdev_priv(dev);
 562        struct ipoib_path *path;
 563
 564        /*
 565         * We can only be called from ipoib_start_xmit, so we're
 566         * inside tx_lock -- no need to save/restore flags.
 567         */
 568        spin_lock(&priv->lock);
 569
 570        path = __path_find(dev, phdr->hwaddr + 4);
 571        if (!path) {
 572                path = path_rec_create(dev, phdr->hwaddr + 4);
 573                if (path) {
 574                        /* put pseudoheader back on for next time */
 575                        skb_push(skb, sizeof *phdr);
 576                        __skb_queue_tail(&path->queue, skb);
 577
 578                        if (path_rec_start(dev, path)) {
 579                                spin_unlock(&priv->lock);
 580                                path_free(dev, path);
 581                                return;
 582                        } else
 583                                __path_add(dev, path);
 584                } else {
 585                        ++priv->stats.tx_dropped;
 586                        dev_kfree_skb_any(skb);
 587                }
 588
 589                spin_unlock(&priv->lock);
 590                return;
 591        }
 592
 593        if (path->ah) {
 594                ipoib_dbg(priv, "Send unicast ARP to %04x\n",
 595                          be16_to_cpu(path->pathrec.dlid));
 596
 597                ipoib_send(dev, skb, path->ah,
 598                           be32_to_cpup((__be32 *) phdr->hwaddr));
 599        } else if ((path->query || !path_rec_start(dev, path)) &&
 600                   skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
 601                /* put pseudoheader back on for next time */
 602                skb_push(skb, sizeof *phdr);
 603                __skb_queue_tail(&path->queue, skb);
 604        } else {
 605                ++priv->stats.tx_dropped;
 606                dev_kfree_skb_any(skb);
 607        }
 608
 609        spin_unlock(&priv->lock);
 610}
 611
 612static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 613{
 614        struct ipoib_dev_priv *priv = netdev_priv(dev);
 615        struct ipoib_neigh *neigh;
 616        unsigned long flags;
 617
 618        if (!spin_trylock_irqsave(&priv->tx_lock, flags))
 619                return NETDEV_TX_LOCKED;
 620
 621        /*
 622         * Check if our queue is stopped.  Since we have the LLTX bit
 623         * set, we can't rely on netif_stop_queue() preventing our
 624         * xmit function from being called with a full queue.
 625         */
 626        if (unlikely(netif_queue_stopped(dev))) {
 627                spin_unlock_irqrestore(&priv->tx_lock, flags);
 628                return NETDEV_TX_BUSY;
 629        }
 630
 631        if (skb->dst && skb->dst->neighbour) {
 632                if (unlikely(!*to_ipoib_neigh(skb->dst->neighbour))) {
 633                        ipoib_path_lookup(skb, dev);
 634                        goto out;
 635                }
 636
 637                neigh = *to_ipoib_neigh(skb->dst->neighbour);
 638
 639                if (likely(neigh->ah)) {
 640                        if (unlikely(memcmp(&neigh->dgid.raw,
 641                                            skb->dst->neighbour->ha + 4,
 642                                            sizeof(union ib_gid)))) {
 643                                spin_lock(&priv->lock);
 644                                /*
 645                                 * It's safe to call ipoib_put_ah() inside
 646                                 * priv->lock here, because we know that
 647                                 * path->ah will always hold one more reference,
 648                                 * so ipoib_put_ah() will never do more than
 649                                 * decrement the ref count.
 650                                 */
 651                                ipoib_put_ah(neigh->ah);
 652                                list_del(&neigh->list);
 653                                ipoib_neigh_free(neigh);
 654                                spin_unlock(&priv->lock);
 655                                ipoib_path_lookup(skb, dev);
 656                                goto out;
 657                        }
 658
 659                        ipoib_send(dev, skb, neigh->ah,
 660                                   be32_to_cpup((__be32 *) skb->dst->neighbour->ha));
 661                        goto out;
 662                }
 663
 664                if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
 665                        spin_lock(&priv->lock);
 666                        __skb_queue_tail(&neigh->queue, skb);
 667                        spin_unlock(&priv->lock);
 668                } else {
 669                        ++priv->stats.tx_dropped;
 670                        dev_kfree_skb_any(skb);
 671                }
 672        } else {
 673                struct ipoib_pseudoheader *phdr =
 674                        (struct ipoib_pseudoheader *) skb->data;
 675                skb_pull(skb, sizeof *phdr);
 676
 677                if (phdr->hwaddr[4] == 0xff) {
 678                        /* Add in the P_Key for multicast*/
 679                        phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff;
 680                        phdr->hwaddr[9] = priv->pkey & 0xff;
 681
 682                        ipoib_mcast_send(dev, phdr->hwaddr + 4, skb);
 683                } else {
 684                        /* unicast GID -- should be ARP or RARP reply */
 685
 686                        if ((be16_to_cpup((__be16 *) skb->data) != ETH_P_ARP) &&
 687                            (be16_to_cpup((__be16 *) skb->data) != ETH_P_RARP)) {
 688                                ipoib_warn(priv, "Unicast, no %s: type %04x, QPN %06x "
 689                                           IPOIB_GID_FMT "\n",
 690                                           skb->dst ? "neigh" : "dst",
 691                                           be16_to_cpup((__be16 *) skb->data),
 692                                           be32_to_cpup((__be32 *) phdr->hwaddr),
 693                                           IPOIB_GID_RAW_ARG(phdr->hwaddr + 4));
 694                                dev_kfree_skb_any(skb);
 695                                ++priv->stats.tx_dropped;
 696                                goto out;
 697                        }
 698
 699                        unicast_arp_send(skb, dev, phdr);
 700                }
 701        }
 702
 703out:
 704        spin_unlock_irqrestore(&priv->tx_lock, flags);
 705
 706        return NETDEV_TX_OK;
 707}
 708
 709static struct net_device_stats *ipoib_get_stats(struct net_device *dev)
 710{
 711        struct ipoib_dev_priv *priv = netdev_priv(dev);
 712
 713        return &priv->stats;
 714}
 715
 716static void ipoib_timeout(struct net_device *dev)
 717{
 718        struct ipoib_dev_priv *priv = netdev_priv(dev);
 719
 720        ipoib_warn(priv, "transmit timeout: latency %d msecs\n",
 721                   jiffies_to_msecs(jiffies - dev->trans_start));
 722        ipoib_warn(priv, "queue stopped %d, tx_head %u, tx_tail %u\n",
 723                   netif_queue_stopped(dev),
 724                   priv->tx_head, priv->tx_tail);
 725        /* XXX reset QP, etc. */
 726}
 727
 728static int ipoib_hard_header(struct sk_buff *skb,
 729                             struct net_device *dev,
 730                             unsigned short type,
 731                             void *daddr, void *saddr, unsigned len)
 732{
 733        struct ipoib_header *header;
 734
 735        header = (struct ipoib_header *) skb_push(skb, sizeof *header);
 736
 737        header->proto = htons(type);
 738        header->reserved = 0;
 739
 740        /*
 741         * If we don't have a neighbour structure, stuff the
 742         * destination address onto the front of the skb so we can
 743         * figure out where to send the packet later.
 744         */
 745        if ((!skb->dst || !skb->dst->neighbour) && daddr) {
 746                struct ipoib_pseudoheader *phdr =
 747                        (struct ipoib_pseudoheader *) skb_push(skb, sizeof *phdr);
 748                memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN);
 749        }
 750
 751        return 0;
 752}
 753
 754static void ipoib_set_mcast_list(struct net_device *dev)
 755{
 756        struct ipoib_dev_priv *priv = netdev_priv(dev);
 757
 758        if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
 759                ipoib_dbg(priv, "IPOIB_FLAG_OPER_UP not set");
 760                return;
 761        }
 762
 763        queue_work(ipoib_workqueue, &priv->restart_task);
 764}
 765
 766static void ipoib_neigh_destructor(struct neighbour *n)
 767{
 768        struct ipoib_neigh *neigh;
 769        struct ipoib_dev_priv *priv = netdev_priv(n->dev);
 770        unsigned long flags;
 771        struct ipoib_ah *ah = NULL;
 772
 773        ipoib_dbg(priv,
 774                  "neigh_destructor for %06x " IPOIB_GID_FMT "\n",
 775                  be32_to_cpup((__be32 *) n->ha),
 776                  IPOIB_GID_RAW_ARG(n->ha + 4));
 777
 778        spin_lock_irqsave(&priv->lock, flags);
 779
 780        neigh = *to_ipoib_neigh(n);
 781        if (neigh) {
 782                if (neigh->ah)
 783                        ah = neigh->ah;
 784                list_del(&neigh->list);
 785                ipoib_neigh_free(neigh);
 786        }
 787
 788        spin_unlock_irqrestore(&priv->lock, flags);
 789
 790        if (ah)
 791                ipoib_put_ah(ah);
 792}
 793
 794struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour)
 795{
 796        struct ipoib_neigh *neigh;
 797
 798        neigh = kmalloc(sizeof *neigh, GFP_ATOMIC);
 799        if (!neigh)
 800                return NULL;
 801
 802        neigh->neighbour = neighbour;
 803        *to_ipoib_neigh(neighbour) = neigh;
 804
 805        return neigh;
 806}
 807
 808void ipoib_neigh_free(struct ipoib_neigh *neigh)
 809{
 810        *to_ipoib_neigh(neigh->neighbour) = NULL;
 811        kfree(neigh);
 812}
 813
 814static int ipoib_neigh_setup_dev(struct net_device *dev, struct neigh_parms *parms)
 815{
 816        parms->neigh_destructor = ipoib_neigh_destructor;
 817
 818        return 0;
 819}
 820
 821int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
 822{
 823        struct ipoib_dev_priv *priv = netdev_priv(dev);
 824
 825        /* Allocate RX/TX "rings" to hold queued skbs */
 826        priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring,
 827                                GFP_KERNEL);
 828        if (!priv->rx_ring) {
 829                printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",
 830                       ca->name, ipoib_recvq_size);
 831                goto out;
 832        }
 833
 834        priv->tx_ring = kzalloc(ipoib_sendq_size * sizeof *priv->tx_ring,
 835                                GFP_KERNEL);
 836        if (!priv->tx_ring) {
 837                printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n",
 838                       ca->name, ipoib_sendq_size);
 839                goto out_rx_ring_cleanup;
 840        }
 841
 842        /* priv->tx_head & tx_tail are already 0 */
 843
 844        if (ipoib_ib_dev_init(dev, ca, port))
 845                goto out_tx_ring_cleanup;
 846
 847        return 0;
 848
 849out_tx_ring_cleanup:
 850        kfree(priv->tx_ring);
 851
 852out_rx_ring_cleanup:
 853        kfree(priv->rx_ring);
 854
 855out:
 856        return -ENOMEM;
 857}
 858
 859void ipoib_dev_cleanup(struct net_device *dev)
 860{
 861        struct ipoib_dev_priv *priv = netdev_priv(dev), *cpriv, *tcpriv;
 862
 863        ipoib_delete_debug_files(dev);
 864
 865        /* Delete any child interfaces first */
 866        list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {
 867                unregister_netdev(cpriv->dev);
 868                ipoib_dev_cleanup(cpriv->dev);
 869                free_netdev(cpriv->dev);
 870        }
 871
 872        ipoib_ib_dev_cleanup(dev);
 873
 874        kfree(priv->rx_ring);
 875        kfree(priv->tx_ring);
 876
 877        priv->rx_ring = NULL;
 878        priv->tx_ring = NULL;
 879}
 880
 881static void ipoib_setup(struct net_device *dev)
 882{
 883        struct ipoib_dev_priv *priv = netdev_priv(dev);
 884
 885        dev->open                = ipoib_open;
 886        dev->stop                = ipoib_stop;
 887        dev->change_mtu          = ipoib_change_mtu;
 888        dev->hard_start_xmit     = ipoib_start_xmit;
 889        dev->get_stats           = ipoib_get_stats;
 890        dev->tx_timeout          = ipoib_timeout;
 891        dev->hard_header         = ipoib_hard_header;
 892        dev->set_multicast_list  = ipoib_set_mcast_list;
 893        dev->neigh_setup         = ipoib_neigh_setup_dev;
 894
 895        dev->watchdog_timeo      = HZ;
 896
 897        dev->flags              |= IFF_BROADCAST | IFF_MULTICAST;
 898
 899        /*
 900         * We add in INFINIBAND_ALEN to allow for the destination
 901         * address "pseudoheader" for skbs without neighbour struct.
 902         */
 903        dev->hard_header_len     = IPOIB_ENCAP_LEN + INFINIBAND_ALEN;
 904        dev->addr_len            = INFINIBAND_ALEN;
 905        dev->type                = ARPHRD_INFINIBAND;
 906        dev->tx_queue_len        = ipoib_sendq_size * 2;
 907        dev->features            = NETIF_F_VLAN_CHALLENGED | NETIF_F_LLTX;
 908
 909        /* MTU will be reset when mcast join happens */
 910        dev->mtu                 = IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN;
 911        priv->mcast_mtu          = priv->admin_mtu = dev->mtu;
 912
 913        memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);
 914
 915        netif_carrier_off(dev);
 916
 917        SET_MODULE_OWNER(dev);
 918
 919        priv->dev = dev;
 920
 921        spin_lock_init(&priv->lock);
 922        spin_lock_init(&priv->tx_lock);
 923
 924        mutex_init(&priv->mcast_mutex);
 925        mutex_init(&priv->vlan_mutex);
 926
 927        INIT_LIST_HEAD(&priv->path_list);
 928        INIT_LIST_HEAD(&priv->child_intfs);
 929        INIT_LIST_HEAD(&priv->dead_ahs);
 930        INIT_LIST_HEAD(&priv->multicast_list);
 931
 932        INIT_WORK(&priv->pkey_task,    ipoib_pkey_poll,          priv->dev);
 933        INIT_WORK(&priv->mcast_task,   ipoib_mcast_join_task,    priv->dev);
 934        INIT_WORK(&priv->flush_task,   ipoib_ib_dev_flush,       priv->dev);
 935        INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task, priv->dev);
 936        INIT_WORK(&priv->ah_reap_task, ipoib_reap_ah,            priv->dev);
 937}
 938
 939struct ipoib_dev_priv *ipoib_intf_alloc(const char *name)
 940{
 941        struct net_device *dev;
 942
 943        dev = alloc_netdev((int) sizeof (struct ipoib_dev_priv), name,
 944                           ipoib_setup);
 945        if (!dev)
 946                return NULL;
 947
 948        return netdev_priv(dev);
 949}
 950
 951static ssize_t show_pkey(struct class_device *cdev, char *buf)
 952{
 953        struct ipoib_dev_priv *priv =
 954                netdev_priv(container_of(cdev, struct net_device, class_dev));
 955
 956        return sprintf(buf, "0x%04x\n", priv->pkey);
 957}
 958static CLASS_DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL);
 959
 960static ssize_t create_child(struct class_device *cdev,
 961                            const char *buf, size_t count)
 962{
 963        int pkey;
 964        int ret;
 965
 966        if (sscanf(buf, "%i", &pkey) != 1)
 967                return -EINVAL;
 968
 969        if (pkey < 0 || pkey > 0xffff)
 970                return -EINVAL;
 971
 972        /*
 973         * Set the full membership bit, so that we join the right
 974         * broadcast group, etc.
 975         */
 976        pkey |= 0x8000;
 977
 978        ret = ipoib_vlan_add(container_of(cdev, struct net_device, class_dev),
 979                             pkey);
 980
 981        return ret ? ret : count;
 982}
 983static CLASS_DEVICE_ATTR(create_child, S_IWUGO, NULL, create_child);
 984
 985static ssize_t delete_child(struct class_device *cdev,
 986                            const char *buf, size_t count)
 987{
 988        int pkey;
 989        int ret;
 990
 991        if (sscanf(buf, "%i", &pkey) != 1)
 992                return -EINVAL;
 993
 994        if (pkey < 0 || pkey > 0xffff)
 995                return -EINVAL;
 996
 997        ret = ipoib_vlan_delete(container_of(cdev, struct net_device, class_dev),
 998                                pkey);
 999
1000        return ret ? ret : count;
1001
1002}
1003static CLASS_DEVICE_ATTR(delete_child, S_IWUGO, NULL, delete_child);
1004
1005int ipoib_add_pkey_attr(struct net_device *dev)
1006{
1007        return class_device_create_file(&dev->class_dev,
1008                                        &class_device_attr_pkey);
1009}
1010
1011static struct net_device *ipoib_add_port(const char *format,
1012                                         struct ib_device *hca, u8 port)
1013{
1014        struct ipoib_dev_priv *priv;
1015        int result = -ENOMEM;
1016
1017        priv = ipoib_intf_alloc(format);
1018        if (!priv)
1019                goto alloc_mem_failed;
1020
1021        SET_NETDEV_DEV(priv->dev, hca->dma_device);
1022
1023        result = ib_query_pkey(hca, port, 0, &priv->pkey);
1024        if (result) {
1025                printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
1026                       hca->name, port, result);
1027                goto alloc_mem_failed;
1028        }
1029
1030        /*
1031         * Set the full membership bit, so that we join the right
1032         * broadcast group, etc.
1033         */
1034        priv->pkey |= 0x8000;
1035
1036        priv->dev->broadcast[8] = priv->pkey >> 8;
1037        priv->dev->broadcast[9] = priv->pkey & 0xff;
1038
1039        result = ib_query_gid(hca, port, 0, &priv->local_gid);
1040        if (result) {
1041                printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n",
1042                       hca->name, port, result);
1043                goto alloc_mem_failed;
1044        } else
1045                memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));
1046
1047
1048        result = ipoib_dev_init(priv->dev, hca, port);
1049        if (result < 0) {
1050                printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n",
1051                       hca->name, port, result);
1052                goto device_init_failed;
1053        }
1054
1055        INIT_IB_EVENT_HANDLER(&priv->event_handler,
1056                              priv->ca, ipoib_event);
1057        result = ib_register_event_handler(&priv->event_handler);
1058        if (result < 0) {
1059                printk(KERN_WARNING "%s: ib_register_event_handler failed for "
1060                       "port %d (ret = %d)\n",
1061                       hca->name, port, result);
1062                goto event_failed;
1063        }
1064
1065        result = register_netdev(priv->dev);
1066        if (result) {
1067                printk(KERN_WARNING "%s: couldn't register ipoib port %d; error %d\n",
1068                       hca->name, port, result);
1069                goto register_failed;
1070        }
1071
1072        ipoib_create_debug_files(priv->dev);
1073
1074        if (ipoib_add_pkey_attr(priv->dev))
1075                goto sysfs_failed;
1076        if (class_device_create_file(&priv->dev->class_dev,
1077                                     &class_device_attr_create_child))
1078                goto sysfs_failed;
1079        if (class_device_create_file(&priv->dev->class_dev,
1080                                     &class_device_attr_delete_child))
1081                goto sysfs_failed;
1082
1083        return priv->dev;
1084
1085sysfs_failed:
1086        ipoib_delete_debug_files(priv->dev);
1087        unregister_netdev(priv->dev);
1088
1089register_failed:
1090        ib_unregister_event_handler(&priv->event_handler);
1091        flush_scheduled_work();
1092
1093event_failed:
1094        ipoib_dev_cleanup(priv->dev);
1095
1096device_init_failed:
1097        free_netdev(priv->dev);
1098
1099alloc_mem_failed:
1100        return ERR_PTR(result);
1101}
1102
1103static void ipoib_add_one(struct ib_device *device)
1104{
1105        struct list_head *dev_list;
1106        struct net_device *dev;
1107        struct ipoib_dev_priv *priv;
1108        int s, e, p;
1109
1110        dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL);
1111        if (!dev_list)
1112                return;
1113
1114        INIT_LIST_HEAD(dev_list);
1115
1116        if (device->node_type == IB_NODE_SWITCH) {
1117                s = 0;
1118                e = 0;
1119        } else {
1120                s = 1;
1121                e = device->phys_port_cnt;
1122        }
1123
1124        for (p = s; p <= e; ++p) {
1125                dev = ipoib_add_port("ib%d", device, p);
1126                if (!IS_ERR(dev)) {
1127                        priv = netdev_priv(dev);
1128                        list_add_tail(&priv->list, dev_list);
1129                }
1130        }
1131
1132        ib_set_client_data(device, &ipoib_client, dev_list);
1133}
1134
1135static void ipoib_remove_one(struct ib_device *device)
1136{
1137        struct ipoib_dev_priv *priv, *tmp;
1138        struct list_head *dev_list;
1139
1140        dev_list = ib_get_client_data(device, &ipoib_client);
1141
1142        list_for_each_entry_safe(priv, tmp, dev_list, list) {
1143                ib_unregister_event_handler(&priv->event_handler);
1144                flush_scheduled_work();
1145
1146                unregister_netdev(priv->dev);
1147                ipoib_dev_cleanup(priv->dev);
1148                free_netdev(priv->dev);
1149        }
1150
1151        kfree(dev_list);
1152}
1153
1154static int __init ipoib_init_module(void)
1155{
1156        int ret;
1157
1158        ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size);
1159        ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE);
1160        ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE);
1161
1162        ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size);
1163        ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE);
1164        ipoib_sendq_size = max(ipoib_sendq_size, IPOIB_MIN_QUEUE_SIZE);
1165
1166        ret = ipoib_register_debugfs();
1167        if (ret)
1168                return ret;
1169
1170        /*
1171         * We create our own workqueue mainly because we want to be
1172         * able to flush it when devices are being removed.  We can't
1173         * use schedule_work()/flush_scheduled_work() because both
1174         * unregister_netdev() and linkwatch_event take the rtnl lock,
1175         * so flush_scheduled_work() can deadlock during device
1176         * removal.
1177         */
1178        ipoib_workqueue = create_singlethread_workqueue("ipoib");
1179        if (!ipoib_workqueue) {
1180                ret = -ENOMEM;
1181                goto err_fs;
1182        }
1183
1184        ret = ib_register_client(&ipoib_client);
1185        if (ret)
1186                goto err_wq;
1187
1188        return 0;
1189
1190err_wq:
1191        destroy_workqueue(ipoib_workqueue);
1192
1193err_fs:
1194        ipoib_unregister_debugfs();
1195
1196        return ret;
1197}
1198
1199static void __exit ipoib_cleanup_module(void)
1200{
1201        ib_unregister_client(&ipoib_client);
1202        ipoib_unregister_debugfs();
1203        destroy_workqueue(ipoib_workqueue);
1204}
1205
1206module_init(ipoib_init_module);
1207module_exit(ipoib_cleanup_module);
1208
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.