linux/drivers/net/ethernet/chelsio/cxgb4/sge.c
<<
>>
Prefs
   1/*
   2 * This file is part of the Chelsio T4 Ethernet driver for Linux.
   3 *
   4 * Copyright (c) 2003-2014 Chelsio Communications, Inc. All rights reserved.
   5 *
   6 * This software is available to you under a choice of one of two
   7 * licenses.  You may choose to be licensed under the terms of the GNU
   8 * General Public License (GPL) Version 2, available from the file
   9 * COPYING in the main directory of this source tree, or the
  10 * OpenIB.org BSD license below:
  11 *
  12 *     Redistribution and use in source and binary forms, with or
  13 *     without modification, are permitted provided that the following
  14 *     conditions are met:
  15 *
  16 *      - Redistributions of source code must retain the above
  17 *        copyright notice, this list of conditions and the following
  18 *        disclaimer.
  19 *
  20 *      - Redistributions in binary form must reproduce the above
  21 *        copyright notice, this list of conditions and the following
  22 *        disclaimer in the documentation and/or other materials
  23 *        provided with the distribution.
  24 *
  25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  26 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  27 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  28 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  29 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  30 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  31 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  32 * SOFTWARE.
  33 */
  34
  35#include <linux/skbuff.h>
  36#include <linux/netdevice.h>
  37#include <linux/etherdevice.h>
  38#include <linux/if_vlan.h>
  39#include <linux/ip.h>
  40#include <linux/dma-mapping.h>
  41#include <linux/jiffies.h>
  42#include <linux/prefetch.h>
  43#include <linux/export.h>
  44#include <net/xfrm.h>
  45#include <net/ipv6.h>
  46#include <net/tcp.h>
  47#include <net/busy_poll.h>
  48#ifdef CONFIG_CHELSIO_T4_FCOE
  49#include <scsi/fc/fc_fcoe.h>
  50#endif /* CONFIG_CHELSIO_T4_FCOE */
  51#include "cxgb4.h"
  52#include "t4_regs.h"
  53#include "t4_values.h"
  54#include "t4_msg.h"
  55#include "t4fw_api.h"
  56#include "cxgb4_ptp.h"
  57#include "cxgb4_uld.h"
  58#include "cxgb4_tc_mqprio.h"
  59#include "sched.h"
  60
  61/*
  62 * Rx buffer size.  We use largish buffers if possible but settle for single
  63 * pages under memory shortage.
  64 */
  65#if PAGE_SHIFT >= 16
  66# define FL_PG_ORDER 0
  67#else
  68# define FL_PG_ORDER (16 - PAGE_SHIFT)
  69#endif
  70
  71/* RX_PULL_LEN should be <= RX_COPY_THRES */
  72#define RX_COPY_THRES    256
  73#define RX_PULL_LEN      128
  74
  75/*
  76 * Main body length for sk_buffs used for Rx Ethernet packets with fragments.
  77 * Should be >= RX_PULL_LEN but possibly bigger to give pskb_may_pull some room.
  78 */
  79#define RX_PKT_SKB_LEN   512
  80
  81/*
  82 * Max number of Tx descriptors we clean up at a time.  Should be modest as
  83 * freeing skbs isn't cheap and it happens while holding locks.  We just need
  84 * to free packets faster than they arrive, we eventually catch up and keep
  85 * the amortized cost reasonable.  Must be >= 2 * TXQ_STOP_THRES.  It should
  86 * also match the CIDX Flush Threshold.
  87 */
  88#define MAX_TX_RECLAIM 32
  89
  90/*
  91 * Max number of Rx buffers we replenish at a time.  Again keep this modest,
  92 * allocating buffers isn't cheap either.
  93 */
  94#define MAX_RX_REFILL 16U
  95
  96/*
  97 * Period of the Rx queue check timer.  This timer is infrequent as it has
  98 * something to do only when the system experiences severe memory shortage.
  99 */
 100#define RX_QCHECK_PERIOD (HZ / 2)
 101
 102/*
 103 * Period of the Tx queue check timer.
 104 */
 105#define TX_QCHECK_PERIOD (HZ / 2)
 106
 107/*
 108 * Max number of Tx descriptors to be reclaimed by the Tx timer.
 109 */
 110#define MAX_TIMER_TX_RECLAIM 100
 111
 112/*
 113 * Timer index used when backing off due to memory shortage.
 114 */
 115#define NOMEM_TMR_IDX (SGE_NTIMERS - 1)
 116
 117/*
 118 * Suspension threshold for non-Ethernet Tx queues.  We require enough room
 119 * for a full sized WR.
 120 */
 121#define TXQ_STOP_THRES (SGE_MAX_WR_LEN / sizeof(struct tx_desc))
 122
 123/*
 124 * Max Tx descriptor space we allow for an Ethernet packet to be inlined
 125 * into a WR.
 126 */
 127#define MAX_IMM_TX_PKT_LEN 256
 128
 129/*
 130 * Max size of a WR sent through a control Tx queue.
 131 */
 132#define MAX_CTRL_WR_LEN SGE_MAX_WR_LEN
 133
 134struct rx_sw_desc {                /* SW state per Rx descriptor */
 135        struct page *page;
 136        dma_addr_t dma_addr;
 137};
 138
 139/*
 140 * Rx buffer sizes for "useskbs" Free List buffers (one ingress packet pe skb
 141 * buffer).  We currently only support two sizes for 1500- and 9000-byte MTUs.
 142 * We could easily support more but there doesn't seem to be much need for
 143 * that ...
 144 */
 145#define FL_MTU_SMALL 1500
 146#define FL_MTU_LARGE 9000
 147
 148static inline unsigned int fl_mtu_bufsize(struct adapter *adapter,
 149                                          unsigned int mtu)
 150{
 151        struct sge *s = &adapter->sge;
 152
 153        return ALIGN(s->pktshift + ETH_HLEN + VLAN_HLEN + mtu, s->fl_align);
 154}
 155
 156#define FL_MTU_SMALL_BUFSIZE(adapter) fl_mtu_bufsize(adapter, FL_MTU_SMALL)
 157#define FL_MTU_LARGE_BUFSIZE(adapter) fl_mtu_bufsize(adapter, FL_MTU_LARGE)
 158
 159/*
 160 * Bits 0..3 of rx_sw_desc.dma_addr have special meaning.  The hardware uses
 161 * these to specify the buffer size as an index into the SGE Free List Buffer
 162 * Size register array.  We also use bit 4, when the buffer has been unmapped
 163 * for DMA, but this is of course never sent to the hardware and is only used
 164 * to prevent double unmappings.  All of the above requires that the Free List
 165 * Buffers which we allocate have the bottom 5 bits free (0) -- i.e. are
 166 * 32-byte or or a power of 2 greater in alignment.  Since the SGE's minimal
 167 * Free List Buffer alignment is 32 bytes, this works out for us ...
 168 */
 169enum {
 170        RX_BUF_FLAGS     = 0x1f,   /* bottom five bits are special */
 171        RX_BUF_SIZE      = 0x0f,   /* bottom three bits are for buf sizes */
 172        RX_UNMAPPED_BUF  = 0x10,   /* buffer is not mapped */
 173
 174        /*
 175         * XXX We shouldn't depend on being able to use these indices.
 176         * XXX Especially when some other Master PF has initialized the
 177         * XXX adapter or we use the Firmware Configuration File.  We
 178         * XXX should really search through the Host Buffer Size register
 179         * XXX array for the appropriately sized buffer indices.
 180         */
 181        RX_SMALL_PG_BUF  = 0x0,   /* small (PAGE_SIZE) page buffer */
 182        RX_LARGE_PG_BUF  = 0x1,   /* buffer large (FL_PG_ORDER) page buffer */
 183
 184        RX_SMALL_MTU_BUF = 0x2,   /* small MTU buffer */
 185        RX_LARGE_MTU_BUF = 0x3,   /* large MTU buffer */
 186};
 187
 188static int timer_pkt_quota[] = {1, 1, 2, 3, 4, 5};
 189#define MIN_NAPI_WORK  1
 190
 191static inline dma_addr_t get_buf_addr(const struct rx_sw_desc *d)
 192{
 193        return d->dma_addr & ~(dma_addr_t)RX_BUF_FLAGS;
 194}
 195
 196static inline bool is_buf_mapped(const struct rx_sw_desc *d)
 197{
 198        return !(d->dma_addr & RX_UNMAPPED_BUF);
 199}
 200
 201/**
 202 *      txq_avail - return the number of available slots in a Tx queue
 203 *      @q: the Tx queue
 204 *
 205 *      Returns the number of descriptors in a Tx queue available to write new
 206 *      packets.
 207 */
 208static inline unsigned int txq_avail(const struct sge_txq *q)
 209{
 210        return q->size - 1 - q->in_use;
 211}
 212
 213/**
 214 *      fl_cap - return the capacity of a free-buffer list
 215 *      @fl: the FL
 216 *
 217 *      Returns the capacity of a free-buffer list.  The capacity is less than
 218 *      the size because one descriptor needs to be left unpopulated, otherwise
 219 *      HW will think the FL is empty.
 220 */
 221static inline unsigned int fl_cap(const struct sge_fl *fl)
 222{
 223        return fl->size - 8;   /* 1 descriptor = 8 buffers */
 224}
 225
 226/**
 227 *      fl_starving - return whether a Free List is starving.
 228 *      @adapter: pointer to the adapter
 229 *      @fl: the Free List
 230 *
 231 *      Tests specified Free List to see whether the number of buffers
 232 *      available to the hardware has falled below our "starvation"
 233 *      threshold.
 234 */
 235static inline bool fl_starving(const struct adapter *adapter,
 236                               const struct sge_fl *fl)
 237{
 238        const struct sge *s = &adapter->sge;
 239
 240        return fl->avail - fl->pend_cred <= s->fl_starve_thres;
 241}
 242
 243int cxgb4_map_skb(struct device *dev, const struct sk_buff *skb,
 244                  dma_addr_t *addr)
 245{
 246        const skb_frag_t *fp, *end;
 247        const struct skb_shared_info *si;
 248
 249        *addr = dma_map_single(dev, skb->data, skb_headlen(skb), DMA_TO_DEVICE);
 250        if (dma_mapping_error(dev, *addr))
 251                goto out_err;
 252
 253        si = skb_shinfo(skb);
 254        end = &si->frags[si->nr_frags];
 255
 256        for (fp = si->frags; fp < end; fp++) {
 257                *++addr = skb_frag_dma_map(dev, fp, 0, skb_frag_size(fp),
 258                                           DMA_TO_DEVICE);
 259                if (dma_mapping_error(dev, *addr))
 260                        goto unwind;
 261        }
 262        return 0;
 263
 264unwind:
 265        while (fp-- > si->frags)
 266                dma_unmap_page(dev, *--addr, skb_frag_size(fp), DMA_TO_DEVICE);
 267
 268        dma_unmap_single(dev, addr[-1], skb_headlen(skb), DMA_TO_DEVICE);
 269out_err:
 270        return -ENOMEM;
 271}
 272EXPORT_SYMBOL(cxgb4_map_skb);
 273
 274static void unmap_skb(struct device *dev, const struct sk_buff *skb,
 275                      const dma_addr_t *addr)
 276{
 277        const skb_frag_t *fp, *end;
 278        const struct skb_shared_info *si;
 279
 280        dma_unmap_single(dev, *addr++, skb_headlen(skb), DMA_TO_DEVICE);
 281
 282        si = skb_shinfo(skb);
 283        end = &si->frags[si->nr_frags];
 284        for (fp = si->frags; fp < end; fp++)
 285                dma_unmap_page(dev, *addr++, skb_frag_size(fp), DMA_TO_DEVICE);
 286}
 287
 288#ifdef CONFIG_NEED_DMA_MAP_STATE
 289/**
 290 *      deferred_unmap_destructor - unmap a packet when it is freed
 291 *      @skb: the packet
 292 *
 293 *      This is the packet destructor used for Tx packets that need to remain
 294 *      mapped until they are freed rather than until their Tx descriptors are
 295 *      freed.
 296 */
 297static void deferred_unmap_destructor(struct sk_buff *skb)
 298{
 299        unmap_skb(skb->dev->dev.parent, skb, (dma_addr_t *)skb->head);
 300}
 301#endif
 302
 303/**
 304 *      free_tx_desc - reclaims Tx descriptors and their buffers
 305 *      @adap: the adapter
 306 *      @q: the Tx queue to reclaim descriptors from
 307 *      @n: the number of descriptors to reclaim
 308 *      @unmap: whether the buffers should be unmapped for DMA
 309 *
 310 *      Reclaims Tx descriptors from an SGE Tx queue and frees the associated
 311 *      Tx buffers.  Called with the Tx queue lock held.
 312 */
 313void free_tx_desc(struct adapter *adap, struct sge_txq *q,
 314                  unsigned int n, bool unmap)
 315{
 316        unsigned int cidx = q->cidx;
 317        struct tx_sw_desc *d;
 318
 319        d = &q->sdesc[cidx];
 320        while (n--) {
 321                if (d->skb) {                       /* an SGL is present */
 322                        if (unmap && d->addr[0]) {
 323                                unmap_skb(adap->pdev_dev, d->skb, d->addr);
 324                                memset(d->addr, 0, sizeof(d->addr));
 325                        }
 326                        dev_consume_skb_any(d->skb);
 327                        d->skb = NULL;
 328                }
 329                ++d;
 330                if (++cidx == q->size) {
 331                        cidx = 0;
 332                        d = q->sdesc;
 333                }
 334        }
 335        q->cidx = cidx;
 336}
 337
 338/*
 339 * Return the number of reclaimable descriptors in a Tx queue.
 340 */
 341static inline int reclaimable(const struct sge_txq *q)
 342{
 343        int hw_cidx = ntohs(READ_ONCE(q->stat->cidx));
 344        hw_cidx -= q->cidx;
 345        return hw_cidx < 0 ? hw_cidx + q->size : hw_cidx;
 346}
 347
 348/**
 349 *      reclaim_completed_tx - reclaims completed TX Descriptors
 350 *      @adap: the adapter
 351 *      @q: the Tx queue to reclaim completed descriptors from
 352 *      @maxreclaim: the maximum number of TX Descriptors to reclaim or -1
 353 *      @unmap: whether the buffers should be unmapped for DMA
 354 *
 355 *      Reclaims Tx Descriptors that the SGE has indicated it has processed,
 356 *      and frees the associated buffers if possible.  If @max == -1, then
 357 *      we'll use a defaiult maximum.  Called with the TX Queue locked.
 358 */
 359static inline int reclaim_completed_tx(struct adapter *adap, struct sge_txq *q,
 360                                       int maxreclaim, bool unmap)
 361{
 362        int reclaim = reclaimable(q);
 363
 364        if (reclaim) {
 365                /*
 366                 * Limit the amount of clean up work we do at a time to keep
 367                 * the Tx lock hold time O(1).
 368                 */
 369                if (maxreclaim < 0)
 370                        maxreclaim = MAX_TX_RECLAIM;
 371                if (reclaim > maxreclaim)
 372                        reclaim = maxreclaim;
 373
 374                free_tx_desc(adap, q, reclaim, unmap);
 375                q->in_use -= reclaim;
 376        }
 377
 378        return reclaim;
 379}
 380
 381/**
 382 *      cxgb4_reclaim_completed_tx - reclaims completed Tx descriptors
 383 *      @adap: the adapter
 384 *      @q: the Tx queue to reclaim completed descriptors from
 385 *      @unmap: whether the buffers should be unmapped for DMA
 386 *
 387 *      Reclaims Tx descriptors that the SGE has indicated it has processed,
 388 *      and frees the associated buffers if possible.  Called with the Tx
 389 *      queue locked.
 390 */
 391void cxgb4_reclaim_completed_tx(struct adapter *adap, struct sge_txq *q,
 392                                bool unmap)
 393{
 394        (void)reclaim_completed_tx(adap, q, -1, unmap);
 395}
 396EXPORT_SYMBOL(cxgb4_reclaim_completed_tx);
 397
 398static inline int get_buf_size(struct adapter *adapter,
 399                               const struct rx_sw_desc *d)
 400{
 401        struct sge *s = &adapter->sge;
 402        unsigned int rx_buf_size_idx = d->dma_addr & RX_BUF_SIZE;
 403        int buf_size;
 404
 405        switch (rx_buf_size_idx) {
 406        case RX_SMALL_PG_BUF:
 407                buf_size = PAGE_SIZE;
 408                break;
 409
 410        case RX_LARGE_PG_BUF:
 411                buf_size = PAGE_SIZE << s->fl_pg_order;
 412                break;
 413
 414        case RX_SMALL_MTU_BUF:
 415                buf_size = FL_MTU_SMALL_BUFSIZE(adapter);
 416                break;
 417
 418        case RX_LARGE_MTU_BUF:
 419                buf_size = FL_MTU_LARGE_BUFSIZE(adapter);
 420                break;
 421
 422        default:
 423                BUG();
 424        }
 425
 426        return buf_size;
 427}
 428
 429/**
 430 *      free_rx_bufs - free the Rx buffers on an SGE free list
 431 *      @adap: the adapter
 432 *      @q: the SGE free list to free buffers from
 433 *      @n: how many buffers to free
 434 *
 435 *      Release the next @n buffers on an SGE free-buffer Rx queue.   The
 436 *      buffers must be made inaccessible to HW before calling this function.
 437 */
 438static void free_rx_bufs(struct adapter *adap, struct sge_fl *q, int n)
 439{
 440        while (n--) {
 441                struct rx_sw_desc *d = &q->sdesc[q->cidx];
 442
 443                if (is_buf_mapped(d))
 444                        dma_unmap_page(adap->pdev_dev, get_buf_addr(d),
 445                                       get_buf_size(adap, d),
 446                                       PCI_DMA_FROMDEVICE);
 447                put_page(d->page);
 448                d->page = NULL;
 449                if (++q->cidx == q->size)
 450                        q->cidx = 0;
 451                q->avail--;
 452        }
 453}
 454
 455/**
 456 *      unmap_rx_buf - unmap the current Rx buffer on an SGE free list
 457 *      @adap: the adapter
 458 *      @q: the SGE free list
 459 *
 460 *      Unmap the current buffer on an SGE free-buffer Rx queue.   The
 461 *      buffer must be made inaccessible to HW before calling this function.
 462 *
 463 *      This is similar to @free_rx_bufs above but does not free the buffer.
 464 *      Do note that the FL still loses any further access to the buffer.
 465 */
 466static void unmap_rx_buf(struct adapter *adap, struct sge_fl *q)
 467{
 468        struct rx_sw_desc *d = &q->sdesc[q->cidx];
 469
 470        if (is_buf_mapped(d))
 471                dma_unmap_page(adap->pdev_dev, get_buf_addr(d),
 472                               get_buf_size(adap, d), PCI_DMA_FROMDEVICE);
 473        d->page = NULL;
 474        if (++q->cidx == q->size)
 475                q->cidx = 0;
 476        q->avail--;
 477}
 478
 479static inline void ring_fl_db(struct adapter *adap, struct sge_fl *q)
 480{
 481        if (q->pend_cred >= 8) {
 482                u32 val = adap->params.arch.sge_fl_db;
 483
 484                if (is_t4(adap->params.chip))
 485                        val |= PIDX_V(q->pend_cred / 8);
 486                else
 487                        val |= PIDX_T5_V(q->pend_cred / 8);
 488
 489                /* Make sure all memory writes to the Free List queue are
 490                 * committed before we tell the hardware about them.
 491                 */
 492                wmb();
 493
 494                /* If we don't have access to the new User Doorbell (T5+), use
 495                 * the old doorbell mechanism; otherwise use the new BAR2
 496                 * mechanism.
 497                 */
 498                if (unlikely(q->bar2_addr == NULL)) {
 499                        t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL_A),
 500                                     val | QID_V(q->cntxt_id));
 501                } else {
 502                        writel(val | QID_V(q->bar2_qid),
 503                               q->bar2_addr + SGE_UDB_KDOORBELL);
 504
 505                        /* This Write memory Barrier will force the write to
 506                         * the User Doorbell area to be flushed.
 507                         */
 508                        wmb();
 509                }
 510                q->pend_cred &= 7;
 511        }
 512}
 513
 514static inline void set_rx_sw_desc(struct rx_sw_desc *sd, struct page *pg,
 515                                  dma_addr_t mapping)
 516{
 517        sd->page = pg;
 518        sd->dma_addr = mapping;      /* includes size low bits */
 519}
 520
 521/**
 522 *      refill_fl - refill an SGE Rx buffer ring
 523 *      @adap: the adapter
 524 *      @q: the ring to refill
 525 *      @n: the number of new buffers to allocate
 526 *      @gfp: the gfp flags for the allocations
 527 *
 528 *      (Re)populate an SGE free-buffer queue with up to @n new packet buffers,
 529 *      allocated with the supplied gfp flags.  The caller must assure that
 530 *      @n does not exceed the queue's capacity.  If afterwards the queue is
 531 *      found critically low mark it as starving in the bitmap of starving FLs.
 532 *
 533 *      Returns the number of buffers allocated.
 534 */
 535static unsigned int refill_fl(struct adapter *adap, struct sge_fl *q, int n,
 536                              gfp_t gfp)
 537{
 538        struct sge *s = &adap->sge;
 539        struct page *pg;
 540        dma_addr_t mapping;
 541        unsigned int cred = q->avail;
 542        __be64 *d = &q->desc[q->pidx];
 543        struct rx_sw_desc *sd = &q->sdesc[q->pidx];
 544        int node;
 545
 546#ifdef CONFIG_DEBUG_FS
 547        if (test_bit(q->cntxt_id - adap->sge.egr_start, adap->sge.blocked_fl))
 548                goto out;
 549#endif
 550
 551        gfp |= __GFP_NOWARN;
 552        node = dev_to_node(adap->pdev_dev);
 553
 554        if (s->fl_pg_order == 0)
 555                goto alloc_small_pages;
 556
 557        /*
 558         * Prefer large buffers
 559         */
 560        while (n) {
 561                pg = alloc_pages_node(node, gfp | __GFP_COMP, s->fl_pg_order);
 562                if (unlikely(!pg)) {
 563                        q->large_alloc_failed++;
 564                        break;       /* fall back to single pages */
 565                }
 566
 567                mapping = dma_map_page(adap->pdev_dev, pg, 0,
 568                                       PAGE_SIZE << s->fl_pg_order,
 569                                       PCI_DMA_FROMDEVICE);
 570                if (unlikely(dma_mapping_error(adap->pdev_dev, mapping))) {
 571                        __free_pages(pg, s->fl_pg_order);
 572                        q->mapping_err++;
 573                        goto out;   /* do not try small pages for this error */
 574                }
 575                mapping |= RX_LARGE_PG_BUF;
 576                *d++ = cpu_to_be64(mapping);
 577
 578                set_rx_sw_desc(sd, pg, mapping);
 579                sd++;
 580
 581                q->avail++;
 582                if (++q->pidx == q->size) {
 583                        q->pidx = 0;
 584                        sd = q->sdesc;
 585                        d = q->desc;
 586                }
 587                n--;
 588        }
 589
 590alloc_small_pages:
 591        while (n--) {
 592                pg = alloc_pages_node(node, gfp, 0);
 593                if (unlikely(!pg)) {
 594                        q->alloc_failed++;
 595                        break;
 596                }
 597
 598                mapping = dma_map_page(adap->pdev_dev, pg, 0, PAGE_SIZE,
 599                                       PCI_DMA_FROMDEVICE);
 600                if (unlikely(dma_mapping_error(adap->pdev_dev, mapping))) {
 601                        put_page(pg);
 602                        q->mapping_err++;
 603                        goto out;
 604                }
 605                *d++ = cpu_to_be64(mapping);
 606
 607                set_rx_sw_desc(sd, pg, mapping);
 608                sd++;
 609
 610                q->avail++;
 611                if (++q->pidx == q->size) {
 612                        q->pidx = 0;
 613                        sd = q->sdesc;
 614                        d = q->desc;
 615                }
 616        }
 617
 618out:    cred = q->avail - cred;
 619        q->pend_cred += cred;
 620        ring_fl_db(adap, q);
 621
 622        if (unlikely(fl_starving(adap, q))) {
 623                smp_wmb();
 624                q->low++;
 625                set_bit(q->cntxt_id - adap->sge.egr_start,
 626                        adap->sge.starving_fl);
 627        }
 628
 629        return cred;
 630}
 631
 632static inline void __refill_fl(struct adapter *adap, struct sge_fl *fl)
 633{
 634        refill_fl(adap, fl, min(MAX_RX_REFILL, fl_cap(fl) - fl->avail),
 635                  GFP_ATOMIC);
 636}
 637
 638/**
 639 *      alloc_ring - allocate resources for an SGE descriptor ring
 640 *      @dev: the PCI device's core device
 641 *      @nelem: the number of descriptors
 642 *      @elem_size: the size of each descriptor
 643 *      @sw_size: the size of the SW state associated with each ring element
 644 *      @phys: the physical address of the allocated ring
 645 *      @metadata: address of the array holding the SW state for the ring
 646 *      @stat_size: extra space in HW ring for status information
 647 *      @node: preferred node for memory allocations
 648 *
 649 *      Allocates resources for an SGE descriptor ring, such as Tx queues,
 650 *      free buffer lists, or response queues.  Each SGE ring requires
 651 *      space for its HW descriptors plus, optionally, space for the SW state
 652 *      associated with each HW entry (the metadata).  The function returns
 653 *      three values: the virtual address for the HW ring (the return value
 654 *      of the function), the bus address of the HW ring, and the address
 655 *      of the SW ring.
 656 */
 657static void *alloc_ring(struct device *dev, size_t nelem, size_t elem_size,
 658                        size_t sw_size, dma_addr_t *phys, void *metadata,
 659                        size_t stat_size, int node)
 660{
 661        size_t len = nelem * elem_size + stat_size;
 662        void *s = NULL;
 663        void *p = dma_alloc_coherent(dev, len, phys, GFP_KERNEL);
 664
 665        if (!p)
 666                return NULL;
 667        if (sw_size) {
 668                s = kcalloc_node(sw_size, nelem, GFP_KERNEL, node);
 669
 670                if (!s) {
 671                        dma_free_coherent(dev, len, p, *phys);
 672                        return NULL;
 673                }
 674        }
 675        if (metadata)
 676                *(void **)metadata = s;
 677        return p;
 678}
 679
 680/**
 681 *      sgl_len - calculates the size of an SGL of the given capacity
 682 *      @n: the number of SGL entries
 683 *
 684 *      Calculates the number of flits needed for a scatter/gather list that
 685 *      can hold the given number of entries.
 686 */
 687static inline unsigned int sgl_len(unsigned int n)
 688{
 689        /* A Direct Scatter Gather List uses 32-bit lengths and 64-bit PCI DMA
 690         * addresses.  The DSGL Work Request starts off with a 32-bit DSGL
 691         * ULPTX header, then Length0, then Address0, then, for 1 <= i <= N,
 692         * repeated sequences of { Length[i], Length[i+1], Address[i],
 693         * Address[i+1] } (this ensures that all addresses are on 64-bit
 694         * boundaries).  If N is even, then Length[N+1] should be set to 0 and
 695         * Address[N+1] is omitted.
 696         *
 697         * The following calculation incorporates all of the above.  It's
 698         * somewhat hard to follow but, briefly: the "+2" accounts for the
 699         * first two flits which include the DSGL header, Length0 and
 700         * Address0; the "(3*(n-1))/2" covers the main body of list entries (3
 701         * flits for every pair of the remaining N) +1 if (n-1) is odd; and
 702         * finally the "+((n-1)&1)" adds the one remaining flit needed if
 703         * (n-1) is odd ...
 704         */
 705        n--;
 706        return (3 * n) / 2 + (n & 1) + 2;
 707}
 708
 709/**
 710 *      flits_to_desc - returns the num of Tx descriptors for the given flits
 711 *      @n: the number of flits
 712 *
 713 *      Returns the number of Tx descriptors needed for the supplied number
 714 *      of flits.
 715 */
 716static inline unsigned int flits_to_desc(unsigned int n)
 717{
 718        BUG_ON(n > SGE_MAX_WR_LEN / 8);
 719        return DIV_ROUND_UP(n, 8);
 720}
 721
 722/**
 723 *      is_eth_imm - can an Ethernet packet be sent as immediate data?
 724 *      @skb: the packet
 725 *      @chip_ver: chip version
 726 *
 727 *      Returns whether an Ethernet packet is small enough to fit as
 728 *      immediate data. Return value corresponds to headroom required.
 729 */
 730static inline int is_eth_imm(const struct sk_buff *skb, unsigned int chip_ver)
 731{
 732        int hdrlen = 0;
 733
 734        if (skb->encapsulation && skb_shinfo(skb)->gso_size &&
 735            chip_ver > CHELSIO_T5) {
 736                hdrlen = sizeof(struct cpl_tx_tnl_lso);
 737                hdrlen += sizeof(struct cpl_tx_pkt_core);
 738        } else if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) {
 739                return 0;
 740        } else {
 741                hdrlen = skb_shinfo(skb)->gso_size ?
 742                         sizeof(struct cpl_tx_pkt_lso_core) : 0;
 743                hdrlen += sizeof(struct cpl_tx_pkt);
 744        }
 745        if (skb->len <= MAX_IMM_TX_PKT_LEN - hdrlen)
 746                return hdrlen;
 747        return 0;
 748}
 749
 750/**
 751 *      calc_tx_flits - calculate the number of flits for a packet Tx WR
 752 *      @skb: the packet
 753 *      @chip_ver: chip version
 754 *
 755 *      Returns the number of flits needed for a Tx WR for the given Ethernet
 756 *      packet, including the needed WR and CPL headers.
 757 */
 758static inline unsigned int calc_tx_flits(const struct sk_buff *skb,
 759                                         unsigned int chip_ver)
 760{
 761        unsigned int flits;
 762        int hdrlen = is_eth_imm(skb, chip_ver);
 763
 764        /* If the skb is small enough, we can pump it out as a work request
 765         * with only immediate data.  In that case we just have to have the
 766         * TX Packet header plus the skb data in the Work Request.
 767         */
 768
 769        if (hdrlen)
 770                return DIV_ROUND_UP(skb->len + hdrlen, sizeof(__be64));
 771
 772        /* Otherwise, we're going to have to construct a Scatter gather list
 773         * of the skb body and fragments.  We also include the flits necessary
 774         * for the TX Packet Work Request and CPL.  We always have a firmware
 775         * Write Header (incorporated as part of the cpl_tx_pkt_lso and
 776         * cpl_tx_pkt structures), followed by either a TX Packet Write CPL
 777         * message or, if we're doing a Large Send Offload, an LSO CPL message
 778         * with an embedded TX Packet Write CPL message.
 779         */
 780        flits = sgl_len(skb_shinfo(skb)->nr_frags + 1);
 781        if (skb_shinfo(skb)->gso_size) {
 782                if (skb->encapsulation && chip_ver > CHELSIO_T5) {
 783                        hdrlen = sizeof(struct fw_eth_tx_pkt_wr) +
 784                                 sizeof(struct cpl_tx_tnl_lso);
 785                } else if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) {
 786                        u32 pkt_hdrlen;
 787
 788                        pkt_hdrlen = eth_get_headlen(skb->dev, skb->data,
 789                                                     skb_headlen(skb));
 790                        hdrlen = sizeof(struct fw_eth_tx_eo_wr) +
 791                                 round_up(pkt_hdrlen, 16);
 792                } else {
 793                        hdrlen = sizeof(struct fw_eth_tx_pkt_wr) +
 794                                 sizeof(struct cpl_tx_pkt_lso_core);
 795                }
 796
 797                hdrlen += sizeof(struct cpl_tx_pkt_core);
 798                flits += (hdrlen / sizeof(__be64));
 799        } else {
 800                flits += (sizeof(struct fw_eth_tx_pkt_wr) +
 801                          sizeof(struct cpl_tx_pkt_core)) / sizeof(__be64);
 802        }
 803        return flits;
 804}
 805
 806/**
 807 *      calc_tx_descs - calculate the number of Tx descriptors for a packet
 808 *      @skb: the packet
 809 *      @chip_ver: chip version
 810 *
 811 *      Returns the number of Tx descriptors needed for the given Ethernet
 812 *      packet, including the needed WR and CPL headers.
 813 */
 814static inline unsigned int calc_tx_descs(const struct sk_buff *skb,
 815                                         unsigned int chip_ver)
 816{
 817        return flits_to_desc(calc_tx_flits(skb, chip_ver));
 818}
 819
 820/**
 821 *      cxgb4_write_sgl - populate a scatter/gather list for a packet
 822 *      @skb: the packet
 823 *      @q: the Tx queue we are writing into
 824 *      @sgl: starting location for writing the SGL
 825 *      @end: points right after the end of the SGL
 826 *      @start: start offset into skb main-body data to include in the SGL
 827 *      @addr: the list of bus addresses for the SGL elements
 828 *
 829 *      Generates a gather list for the buffers that make up a packet.
 830 *      The caller must provide adequate space for the SGL that will be written.
 831 *      The SGL includes all of the packet's page fragments and the data in its
 832 *      main body except for the first @start bytes.  @sgl must be 16-byte
 833 *      aligned and within a Tx descriptor with available space.  @end points
 834 *      right after the end of the SGL but does not account for any potential
 835 *      wrap around, i.e., @end > @sgl.
 836 */
 837void cxgb4_write_sgl(const struct sk_buff *skb, struct sge_txq *q,
 838                     struct ulptx_sgl *sgl, u64 *end, unsigned int start,
 839                     const dma_addr_t *addr)
 840{
 841        unsigned int i, len;
 842        struct ulptx_sge_pair *to;
 843        const struct skb_shared_info *si = skb_shinfo(skb);
 844        unsigned int nfrags = si->nr_frags;
 845        struct ulptx_sge_pair buf[MAX_SKB_FRAGS / 2 + 1];
 846
 847        len = skb_headlen(skb) - start;
 848        if (likely(len)) {
 849                sgl->len0 = htonl(len);
 850                sgl->addr0 = cpu_to_be64(addr[0] + start);
 851                nfrags++;
 852        } else {
 853                sgl->len0 = htonl(skb_frag_size(&si->frags[0]));
 854                sgl->addr0 = cpu_to_be64(addr[1]);
 855        }
 856
 857        sgl->cmd_nsge = htonl(ULPTX_CMD_V(ULP_TX_SC_DSGL) |
 858                              ULPTX_NSGE_V(nfrags));
 859        if (likely(--nfrags == 0))
 860                return;
 861        /*
 862         * Most of the complexity below deals with the possibility we hit the
 863         * end of the queue in the middle of writing the SGL.  For this case
 864         * only we create the SGL in a temporary buffer and then copy it.
 865         */
 866        to = (u8 *)end > (u8 *)q->stat ? buf : sgl->sge;
 867
 868        for (i = (nfrags != si->nr_frags); nfrags >= 2; nfrags -= 2, to++) {
 869                to->len[0] = cpu_to_be32(skb_frag_size(&si->frags[i]));
 870                to->len[1] = cpu_to_be32(skb_frag_size(&si->frags[++i]));
 871                to->addr[0] = cpu_to_be64(addr[i]);
 872                to->addr[1] = cpu_to_be64(addr[++i]);
 873        }
 874        if (nfrags) {
 875                to->len[0] = cpu_to_be32(skb_frag_size(&si->frags[i]));
 876                to->len[1] = cpu_to_be32(0);
 877                to->addr[0] = cpu_to_be64(addr[i + 1]);
 878        }
 879        if (unlikely((u8 *)end > (u8 *)q->stat)) {
 880                unsigned int part0 = (u8 *)q->stat - (u8 *)sgl->sge, part1;
 881
 882                if (likely(part0))
 883                        memcpy(sgl->sge, buf, part0);
 884                part1 = (u8 *)end - (u8 *)q->stat;
 885                memcpy(q->desc, (u8 *)buf + part0, part1);
 886                end = (void *)q->desc + part1;
 887        }
 888        if ((uintptr_t)end & 8)           /* 0-pad to multiple of 16 */
 889                *end = 0;
 890}
 891EXPORT_SYMBOL(cxgb4_write_sgl);
 892
 893/*      cxgb4_write_partial_sgl - populate SGL for partial packet
 894 *      @skb: the packet
 895 *      @q: the Tx queue we are writing into
 896 *      @sgl: starting location for writing the SGL
 897 *      @end: points right after the end of the SGL
 898 *      @addr: the list of bus addresses for the SGL elements
 899 *      @start: start offset in the SKB where partial data starts
 900 *      @len: length of data from @start to send out
 901 *
 902 *      This API will handle sending out partial data of a skb if required.
 903 *      Unlike cxgb4_write_sgl, @start can be any offset into the skb data,
 904 *      and @len will decide how much data after @start offset to send out.
 905 */
 906void cxgb4_write_partial_sgl(const struct sk_buff *skb, struct sge_txq *q,
 907                             struct ulptx_sgl *sgl, u64 *end,
 908                             const dma_addr_t *addr, u32 start, u32 len)
 909{
 910        struct ulptx_sge_pair buf[MAX_SKB_FRAGS / 2 + 1] = {0}, *to;
 911        u32 frag_size, skb_linear_data_len = skb_headlen(skb);
 912        struct skb_shared_info *si = skb_shinfo(skb);
 913        u8 i = 0, frag_idx = 0, nfrags = 0;
 914        skb_frag_t *frag;
 915
 916        /* Fill the first SGL either from linear data or from partial
 917         * frag based on @start.
 918         */
 919        if (unlikely(start < skb_linear_data_len)) {
 920                frag_size = min(len, skb_linear_data_len - start);
 921                sgl->len0 = htonl(frag_size);
 922                sgl->addr0 = cpu_to_be64(addr[0] + start);
 923                len -= frag_size;
 924                nfrags++;
 925        } else {
 926                start -= skb_linear_data_len;
 927                frag = &si->frags[frag_idx];
 928                frag_size = skb_frag_size(frag);
 929                /* find the first frag */
 930                while (start >= frag_size) {
 931                        start -= frag_size;
 932                        frag_idx++;
 933                        frag = &si->frags[frag_idx];
 934                        frag_size = skb_frag_size(frag);
 935                }
 936
 937                frag_size = min(len, skb_frag_size(frag) - start);
 938                sgl->len0 = cpu_to_be32(frag_size);
 939                sgl->addr0 = cpu_to_be64(addr[frag_idx + 1] + start);
 940                len -= frag_size;
 941                nfrags++;
 942                frag_idx++;
 943        }
 944
 945        /* If the entire partial data fit in one SGL, then send it out
 946         * now.
 947         */
 948        if (!len)
 949                goto done;
 950
 951        /* Most of the complexity below deals with the possibility we hit the
 952         * end of the queue in the middle of writing the SGL.  For this case
 953         * only we create the SGL in a temporary buffer and then copy it.
 954         */
 955        to = (u8 *)end > (u8 *)q->stat ? buf : sgl->sge;
 956
 957        /* If the skb couldn't fit in first SGL completely, fill the
 958         * rest of the frags in subsequent SGLs. Note that each SGL
 959         * pair can store 2 frags.
 960         */
 961        while (len) {
 962                frag_size = min(len, skb_frag_size(&si->frags[frag_idx]));
 963                to->len[i & 1] = cpu_to_be32(frag_size);
 964                to->addr[i & 1] = cpu_to_be64(addr[frag_idx + 1]);
 965                if (i && (i & 1))
 966                        to++;
 967                nfrags++;
 968                frag_idx++;
 969                i++;
 970                len -= frag_size;
 971        }
 972
 973        /* If we ended in an odd boundary, then set the second SGL's
 974         * length in the pair to 0.
 975         */
 976        if (i & 1)
 977                to->len[1] = cpu_to_be32(0);
 978
 979        /* Copy from temporary buffer to Tx ring, in case we hit the
 980         * end of the queue in the middle of writing the SGL.
 981         */
 982        if (unlikely((u8 *)end > (u8 *)q->stat)) {
 983                u32 part0 = (u8 *)q->stat - (u8 *)sgl->sge, part1;
 984
 985                if (likely(part0))
 986                        memcpy(sgl->sge, buf, part0);
 987                part1 = (u8 *)end - (u8 *)q->stat;
 988                memcpy(q->desc, (u8 *)buf + part0, part1);
 989                end = (void *)q->desc + part1;
 990        }
 991
 992        /* 0-pad to multiple of 16 */
 993        if ((uintptr_t)end & 8)
 994                *end = 0;
 995done:
 996        sgl->cmd_nsge = htonl(ULPTX_CMD_V(ULP_TX_SC_DSGL) |
 997                        ULPTX_NSGE_V(nfrags));
 998}
 999EXPORT_SYMBOL(cxgb4_write_partial_sgl);
1000
1001/* This function copies 64 byte coalesced work request to
1002 * memory mapped BAR2 space. For coalesced WR SGE fetches
1003 * data from the FIFO instead of from Host.
1004 */
1005static void cxgb_pio_copy(u64 __iomem *dst, u64 *src)
1006{
1007        int count = 8;
1008
1009        while (count) {
1010                writeq(*src, dst);
1011                src++;
1012                dst++;
1013                count--;
1014        }
1015}
1016
1017/**
1018 *      cxgb4_ring_tx_db - check and potentially ring a Tx queue's doorbell
1019 *      @adap: the adapter
1020 *      @q: the Tx queue
1021 *      @n: number of new descriptors to give to HW
1022 *
1023 *      Ring the doorbel for a Tx queue.
1024 */
1025inline void cxgb4_ring_tx_db(struct adapter *adap, struct sge_txq *q, int n)
1026{
1027        /* Make sure that all writes to the TX Descriptors are committed
1028         * before we tell the hardware about them.
1029         */
1030        wmb();
1031
1032        /* If we don't have access to the new User Doorbell (T5+), use the old
1033         * doorbell mechanism; otherwise use the new BAR2 mechanism.
1034         */
1035        if (unlikely(q->bar2_addr == NULL)) {
1036                u32 val = PIDX_V(n);
1037                unsigned long flags;
1038
1039                /* For T4 we need to participate in the Doorbell Recovery
1040                 * mechanism.
1041                 */
1042                spin_lock_irqsave(&q->db_lock, flags);
1043                if (!q->db_disabled)
1044                        t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL_A),
1045                                     QID_V(q->cntxt_id) | val);
1046                else
1047                        q->db_pidx_inc += n;
1048                q->db_pidx = q->pidx;
1049                spin_unlock_irqrestore(&q->db_lock, flags);
1050        } else {
1051                u32 val = PIDX_T5_V(n);
1052
1053                /* T4 and later chips share the same PIDX field offset within
1054                 * the doorbell, but T5 and later shrank the field in order to
1055                 * gain a bit for Doorbell Priority.  The field was absurdly
1056                 * large in the first place (14 bits) so we just use the T5
1057                 * and later limits and warn if a Queue ID is too large.
1058                 */
1059                WARN_ON(val & DBPRIO_F);
1060
1061                /* If we're only writing a single TX Descriptor and we can use
1062                 * Inferred QID registers, we can use the Write Combining
1063                 * Gather Buffer; otherwise we use the simple doorbell.
1064                 */
1065                if (n == 1 && q->bar2_qid == 0) {
1066                        int index = (q->pidx
1067                                     ? (q->pidx - 1)
1068                                     : (q->size - 1));
1069                        u64 *wr = (u64 *)&q->desc[index];
1070
1071                        cxgb_pio_copy((u64 __iomem *)
1072                                      (q->bar2_addr + SGE_UDB_WCDOORBELL),
1073                                      wr);
1074                } else {
1075                        writel(val | QID_V(q->bar2_qid),
1076                               q->bar2_addr + SGE_UDB_KDOORBELL);
1077                }
1078
1079                /* This Write Memory Barrier will force the write to the User
1080                 * Doorbell area to be flushed.  This is needed to prevent
1081                 * writes on different CPUs for the same queue from hitting
1082                 * the adapter out of order.  This is required when some Work
1083                 * Requests take the Write Combine Gather Buffer path (user
1084                 * doorbell area offset [SGE_UDB_WCDOORBELL..+63]) and some
1085                 * take the traditional path where we simply increment the
1086                 * PIDX (User Doorbell area SGE_UDB_KDOORBELL) and have the
1087                 * hardware DMA read the actual Work Request.
1088                 */
1089                wmb();
1090        }
1091}
1092EXPORT_SYMBOL(cxgb4_ring_tx_db);
1093
1094/**
1095 *      cxgb4_inline_tx_skb - inline a packet's data into Tx descriptors
1096 *      @skb: the packet
1097 *      @q: the Tx queue where the packet will be inlined
1098 *      @pos: starting position in the Tx queue where to inline the packet
1099 *
1100 *      Inline a packet's contents directly into Tx descriptors, starting at
1101 *      the given position within the Tx DMA ring.
1102 *      Most of the complexity of this operation is dealing with wrap arounds
1103 *      in the middle of the packet we want to inline.
1104 */
1105void cxgb4_inline_tx_skb(const struct sk_buff *skb,
1106                         const struct sge_txq *q, void *pos)
1107{
1108        int left = (void *)q->stat - pos;
1109        u64 *p;
1110
1111        if (likely(skb->len <= left)) {
1112                if (likely(!skb->data_len))
1113                        skb_copy_from_linear_data(skb, pos, skb->len);
1114                else
1115                        skb_copy_bits(skb, 0, pos, skb->len);
1116                pos += skb->len;
1117        } else {
1118                skb_copy_bits(skb, 0, pos, left);
1119                skb_copy_bits(skb, left, q->desc, skb->len - left);
1120                pos = (void *)q->desc + (skb->len - left);
1121        }
1122
1123        /* 0-pad to multiple of 16 */
1124        p = PTR_ALIGN(pos, 8);
1125        if ((uintptr_t)p & 8)
1126                *p = 0;
1127}
1128EXPORT_SYMBOL(cxgb4_inline_tx_skb);
1129
1130static void *inline_tx_skb_header(const struct sk_buff *skb,
1131                                  const struct sge_txq *q,  void *pos,
1132                                  int length)
1133{
1134        u64 *p;
1135        int left = (void *)q->stat - pos;
1136
1137        if (likely(length <= left)) {
1138                memcpy(pos, skb->data, length);
1139                pos += length;
1140        } else {
1141                memcpy(pos, skb->data, left);
1142                memcpy(q->desc, skb->data + left, length - left);
1143                pos = (void *)q->desc + (length - left);
1144        }
1145        /* 0-pad to multiple of 16 */
1146        p = PTR_ALIGN(pos, 8);
1147        if ((uintptr_t)p & 8) {
1148                *p = 0;
1149                return p + 1;
1150        }
1151        return p;
1152}
1153
1154/*
1155 * Figure out what HW csum a packet wants and return the appropriate control
1156 * bits.
1157 */
1158static u64 hwcsum(enum chip_type chip, const struct sk_buff *skb)
1159{
1160        int csum_type;
1161        bool inner_hdr_csum = false;
1162        u16 proto, ver;
1163
1164        if (skb->encapsulation &&
1165            (CHELSIO_CHIP_VERSION(chip) > CHELSIO_T5))
1166                inner_hdr_csum = true;
1167
1168        if (inner_hdr_csum) {
1169                ver = inner_ip_hdr(skb)->version;
1170                proto = (ver == 4) ? inner_ip_hdr(skb)->protocol :
1171                        inner_ipv6_hdr(skb)->nexthdr;
1172        } else {
1173                ver = ip_hdr(skb)->version;
1174                proto = (ver == 4) ? ip_hdr(skb)->protocol :
1175                        ipv6_hdr(skb)->nexthdr;
1176        }
1177
1178        if (ver == 4) {
1179                if (proto == IPPROTO_TCP)
1180                        csum_type = TX_CSUM_TCPIP;
1181                else if (proto == IPPROTO_UDP)
1182                        csum_type = TX_CSUM_UDPIP;
1183                else {
1184nocsum:                 /*
1185                         * unknown protocol, disable HW csum
1186                         * and hope a bad packet is detected
1187                         */
1188                        return TXPKT_L4CSUM_DIS_F;
1189                }
1190        } else {
1191                /*
1192                 * this doesn't work with extension headers
1193                 */
1194                if (proto == IPPROTO_TCP)
1195                        csum_type = TX_CSUM_TCPIP6;
1196                else if (proto == IPPROTO_UDP)
1197                        csum_type = TX_CSUM_UDPIP6;
1198                else
1199                        goto nocsum;
1200        }
1201
1202        if (likely(csum_type >= TX_CSUM_TCPIP)) {
1203                int eth_hdr_len, l4_len;
1204                u64 hdr_len;
1205
1206                if (inner_hdr_csum) {
1207                        /* This allows checksum offload for all encapsulated
1208                         * packets like GRE etc..
1209                         */
1210                        l4_len = skb_inner_network_header_len(skb);
1211                        eth_hdr_len = skb_inner_network_offset(skb) - ETH_HLEN;
1212                } else {
1213                        l4_len = skb_network_header_len(skb);
1214                        eth_hdr_len = skb_network_offset(skb) - ETH_HLEN;
1215                }
1216                hdr_len = TXPKT_IPHDR_LEN_V(l4_len);
1217
1218                if (CHELSIO_CHIP_VERSION(chip) <= CHELSIO_T5)
1219                        hdr_len |= TXPKT_ETHHDR_LEN_V(eth_hdr_len);
1220                else
1221                        hdr_len |= T6_TXPKT_ETHHDR_LEN_V(eth_hdr_len);
1222                return TXPKT_CSUM_TYPE_V(csum_type) | hdr_len;
1223        } else {
1224                int start = skb_transport_offset(skb);
1225
1226                return TXPKT_CSUM_TYPE_V(csum_type) |
1227                        TXPKT_CSUM_START_V(start) |
1228                        TXPKT_CSUM_LOC_V(start + skb->csum_offset);
1229        }
1230}
1231
1232static void eth_txq_stop(struct sge_eth_txq *q)
1233{
1234        netif_tx_stop_queue(q->txq);
1235        q->q.stops++;
1236}
1237
1238static inline void txq_advance(struct sge_txq *q, unsigned int n)
1239{
1240        q->in_use += n;
1241        q->pidx += n;
1242        if (q->pidx >= q->size)
1243                q->pidx -= q->size;
1244}
1245
#ifdef CONFIG_CHELSIO_T4_FCOE
/*
 * Set up FC CRC offload for an FCoE frame.
 *
 * Returns 0 with *@cntrl left untouched when FCoE is not enabled on the port
 * or @skb is not an FCoE frame.  For an FCoE frame the MAC, network and
 * transport header offsets of @skb are reset, and then either *@cntrl is
 * filled in with the FC CRC offload control bits and 0 is returned, or
 * -EOPNOTSUPP is returned if cxgb_fcoe_sof_eof_supported() rejects the frame.
 */
static inline int
cxgb_fcoe_offload(struct sk_buff *skb, struct adapter *adap,
		  const struct port_info *pi, u64 *cntrl)
{
	const struct cxgb_fcoe *fcoe = &pi->fcoe;

	if (!(fcoe->flags & CXGB_FCOE_ENABLED))
		return 0;

	if (skb->protocol != htons(ETH_P_FCOE))
		return 0;

	skb_reset_mac_header(skb);
	skb->mac_len = sizeof(struct ethhdr);

	skb_set_network_header(skb, skb->mac_len);
	skb_set_transport_header(skb, skb->mac_len + sizeof(struct fcoe_hdr));

	/* cxgb4_eth_xmit() drops the frame only when it sees -EOPNOTSUPP;
	 * returning -ENOTSUPP here meant that check could never match.
	 */
	if (!cxgb_fcoe_sof_eof_supported(adap, skb))
		return -EOPNOTSUPP;

	/* FC CRC offload */
	*cntrl = TXPKT_CSUM_TYPE_V(TX_CSUM_FCOE) |
		     TXPKT_L4CSUM_DIS_F | TXPKT_IPCSUM_DIS_F |
		     TXPKT_CSUM_START_V(CXGB_FCOE_TXPKT_CSUM_START) |
		     TXPKT_CSUM_END_V(CXGB_FCOE_TXPKT_CSUM_END) |
		     TXPKT_CSUM_LOC_V(CXGB_FCOE_TXPKT_CSUM_END);
	return 0;
}
#endif /* CONFIG_CHELSIO_T4_FCOE */
1277
1278/* Returns tunnel type if hardware supports offloading of the same.
1279 * It is called only for T5 and onwards.
1280 */
1281enum cpl_tx_tnl_lso_type cxgb_encap_offload_supported(struct sk_buff *skb)
1282{
1283        u8 l4_hdr = 0;
1284        enum cpl_tx_tnl_lso_type tnl_type = TX_TNL_TYPE_OPAQUE;
1285        struct port_info *pi = netdev_priv(skb->dev);
1286        struct adapter *adapter = pi->adapter;
1287
1288        if (skb->inner_protocol_type != ENCAP_TYPE_ETHER ||
1289            skb->inner_protocol != htons(ETH_P_TEB))
1290                return tnl_type;
1291
1292        switch (vlan_get_protocol(skb)) {
1293        case htons(ETH_P_IP):
1294                l4_hdr = ip_hdr(skb)->protocol;
1295                break;
1296        case htons(ETH_P_IPV6):
1297                l4_hdr = ipv6_hdr(skb)->nexthdr;
1298                break;
1299        default:
1300                return tnl_type;
1301        }
1302
1303        switch (l4_hdr) {
1304        case IPPROTO_UDP:
1305                if (adapter->vxlan_port == udp_hdr(skb)->dest)
1306                        tnl_type = TX_TNL_TYPE_VXLAN;
1307                else if (adapter->geneve_port == udp_hdr(skb)->dest)
1308                        tnl_type = TX_TNL_TYPE_GENEVE;
1309                break;
1310        default:
1311                return tnl_type;
1312        }
1313
1314        return tnl_type;
1315}
1316
1317static inline void t6_fill_tnl_lso(struct sk_buff *skb,
1318                                   struct cpl_tx_tnl_lso *tnl_lso,
1319                                   enum cpl_tx_tnl_lso_type tnl_type)
1320{
1321        u32 val;
1322        int in_eth_xtra_len;
1323        int l3hdr_len = skb_network_header_len(skb);
1324        int eth_xtra_len = skb_network_offset(skb) - ETH_HLEN;
1325        const struct skb_shared_info *ssi = skb_shinfo(skb);
1326        bool v6 = (ip_hdr(skb)->version == 6);
1327
1328        val = CPL_TX_TNL_LSO_OPCODE_V(CPL_TX_TNL_LSO) |
1329              CPL_TX_TNL_LSO_FIRST_F |
1330              CPL_TX_TNL_LSO_LAST_F |
1331              (v6 ? CPL_TX_TNL_LSO_IPV6OUT_F : 0) |
1332              CPL_TX_TNL_LSO_ETHHDRLENOUT_V(eth_xtra_len / 4) |
1333              CPL_TX_TNL_LSO_IPHDRLENOUT_V(l3hdr_len / 4) |
1334              (v6 ? 0 : CPL_TX_TNL_LSO_IPHDRCHKOUT_F) |
1335              CPL_TX_TNL_LSO_IPLENSETOUT_F |
1336              (v6 ? 0 : CPL_TX_TNL_LSO_IPIDINCOUT_F);
1337        tnl_lso->op_to_IpIdSplitOut = htonl(val);
1338
1339        tnl_lso->IpIdOffsetOut = 0;
1340
1341        /* Get the tunnel header length */
1342        val = skb_inner_mac_header(skb) - skb_mac_header(skb);
1343        in_eth_xtra_len = skb_inner_network_header(skb) -
1344                          skb_inner_mac_header(skb) - ETH_HLEN;
1345
1346        switch (tnl_type) {
1347        case TX_TNL_TYPE_VXLAN:
1348        case TX_TNL_TYPE_GENEVE:
1349                tnl_lso->UdpLenSetOut_to_TnlHdrLen =
1350                        htons(CPL_TX_TNL_LSO_UDPCHKCLROUT_F |
1351                        CPL_TX_TNL_LSO_UDPLENSETOUT_F);
1352                break;
1353        default:
1354                tnl_lso->UdpLenSetOut_to_TnlHdrLen = 0;
1355                break;
1356        }
1357
1358        tnl_lso->UdpLenSetOut_to_TnlHdrLen |=
1359                 htons(CPL_TX_TNL_LSO_TNLHDRLEN_V(val) |
1360                       CPL_TX_TNL_LSO_TNLTYPE_V(tnl_type));
1361
1362        tnl_lso->r1 = 0;
1363
1364        val = CPL_TX_TNL_LSO_ETHHDRLEN_V(in_eth_xtra_len / 4) |
1365              CPL_TX_TNL_LSO_IPV6_V(inner_ip_hdr(skb)->version == 6) |
1366              CPL_TX_TNL_LSO_IPHDRLEN_V(skb_inner_network_header_len(skb) / 4) |
1367              CPL_TX_TNL_LSO_TCPHDRLEN_V(inner_tcp_hdrlen(skb) / 4);
1368        tnl_lso->Flow_to_TcpHdrLen = htonl(val);
1369
1370        tnl_lso->IpIdOffset = htons(0);
1371
1372        tnl_lso->IpIdSplit_to_Mss = htons(CPL_TX_TNL_LSO_MSS_V(ssi->gso_size));
1373        tnl_lso->TCPSeqOffset = htonl(0);
1374        tnl_lso->EthLenOffset_Size = htonl(CPL_TX_TNL_LSO_SIZE_V(skb->len));
1375}
1376
1377static inline void *write_tso_wr(struct adapter *adap, struct sk_buff *skb,
1378                                 struct cpl_tx_pkt_lso_core *lso)
1379{
1380        int eth_xtra_len = skb_network_offset(skb) - ETH_HLEN;
1381        int l3hdr_len = skb_network_header_len(skb);
1382        const struct skb_shared_info *ssi;
1383        bool ipv6 = false;
1384
1385        ssi = skb_shinfo(skb);
1386        if (ssi->gso_type & SKB_GSO_TCPV6)
1387                ipv6 = true;
1388
1389        lso->lso_ctrl = htonl(LSO_OPCODE_V(CPL_TX_PKT_LSO) |
1390                              LSO_FIRST_SLICE_F | LSO_LAST_SLICE_F |
1391                              LSO_IPV6_V(ipv6) |
1392                              LSO_ETHHDR_LEN_V(eth_xtra_len / 4) |
1393                              LSO_IPHDR_LEN_V(l3hdr_len / 4) |
1394                              LSO_TCPHDR_LEN_V(tcp_hdr(skb)->doff));
1395        lso->ipid_ofst = htons(0);
1396        lso->mss = htons(ssi->gso_size);
1397        lso->seqno_offset = htonl(0);
1398        if (is_t4(adap->params.chip))
1399                lso->len = htonl(skb->len);
1400        else
1401                lso->len = htonl(LSO_T5_XFER_SIZE_V(skb->len));
1402
1403        return (void *)(lso + 1);
1404}
1405
1406/**
1407 *      t4_sge_eth_txq_egress_update - handle Ethernet TX Queue update
1408 *      @adap: the adapter
1409 *      @eq: the Ethernet TX Queue
1410 *      @maxreclaim: the maximum number of TX Descriptors to reclaim or -1
1411 *
1412 *      We're typically called here to update the state of an Ethernet TX
1413 *      Queue with respect to the hardware's progress in consuming the TX
1414 *      Work Requests that we've put on that Egress Queue.  This happens
1415 *      when we get Egress Queue Update messages and also prophylactically
1416 *      in regular timer-based Ethernet TX Queue maintenance.
1417 */
1418int t4_sge_eth_txq_egress_update(struct adapter *adap, struct sge_eth_txq *eq,
1419                                 int maxreclaim)
1420{
1421        unsigned int reclaimed, hw_cidx;
1422        struct sge_txq *q = &eq->q;
1423        int hw_in_use;
1424
1425        if (!q->in_use || !__netif_tx_trylock(eq->txq))
1426                return 0;
1427
1428        /* Reclaim pending completed TX Descriptors. */
1429        reclaimed = reclaim_completed_tx(adap, &eq->q, maxreclaim, true);
1430
1431        hw_cidx = ntohs(READ_ONCE(q->stat->cidx));
1432        hw_in_use = q->pidx - hw_cidx;
1433        if (hw_in_use < 0)
1434                hw_in_use += q->size;
1435
1436        /* If the TX Queue is currently stopped and there's now more than half
1437         * the queue available, restart it.  Otherwise bail out since the rest
1438         * of what we want do here is with the possibility of shipping any
1439         * currently buffered Coalesced TX Work Request.
1440         */
1441        if (netif_tx_queue_stopped(eq->txq) && hw_in_use < (q->size / 2)) {
1442                netif_tx_wake_queue(eq->txq);
1443                eq->q.restarts++;
1444        }
1445
1446        __netif_tx_unlock(eq->txq);
1447        return reclaimed;
1448}
1449
1450static inline int cxgb4_validate_skb(struct sk_buff *skb,
1451                                     struct net_device *dev,
1452                                     u32 min_pkt_len)
1453{
1454        u32 max_pkt_len;
1455
1456        /* The chip min packet length is 10 octets but some firmware
1457         * commands have a minimum packet length requirement. So, play
1458         * safe and reject anything shorter than @min_pkt_len.
1459         */
1460        if (unlikely(skb->len < min_pkt_len))
1461                return -EINVAL;
1462
1463        /* Discard the packet if the length is greater than mtu */
1464        max_pkt_len = ETH_HLEN + dev->mtu;
1465
1466        if (skb_vlan_tagged(skb))
1467                max_pkt_len += VLAN_HLEN;
1468
1469        if (!skb_shinfo(skb)->gso_size && (unlikely(skb->len > max_pkt_len)))
1470                return -EINVAL;
1471
1472        return 0;
1473}
1474
1475static void *write_eo_udp_wr(struct sk_buff *skb, struct fw_eth_tx_eo_wr *wr,
1476                             u32 hdr_len)
1477{
1478        wr->u.udpseg.type = FW_ETH_TX_EO_TYPE_UDPSEG;
1479        wr->u.udpseg.ethlen = skb_network_offset(skb);
1480        wr->u.udpseg.iplen = cpu_to_be16(skb_network_header_len(skb));
1481        wr->u.udpseg.udplen = sizeof(struct udphdr);
1482        wr->u.udpseg.rtplen = 0;
1483        wr->u.udpseg.r4 = 0;
1484        if (skb_shinfo(skb)->gso_size)
1485                wr->u.udpseg.mss = cpu_to_be16(skb_shinfo(skb)->gso_size);
1486        else
1487                wr->u.udpseg.mss = cpu_to_be16(skb->len - hdr_len);
1488        wr->u.udpseg.schedpktsize = wr->u.udpseg.mss;
1489        wr->u.udpseg.plen = cpu_to_be32(skb->len - hdr_len);
1490
1491        return (void *)(wr + 1);
1492}
1493
1494/**
1495 *      cxgb4_eth_xmit - add a packet to an Ethernet Tx queue
1496 *      @skb: the packet
1497 *      @dev: the egress net device
1498 *
1499 *      Add a packet to an SGE Ethernet Tx queue.  Runs with softirqs disabled.
1500 */
1501static netdev_tx_t cxgb4_eth_xmit(struct sk_buff *skb, struct net_device *dev)
1502{
1503        enum cpl_tx_tnl_lso_type tnl_type = TX_TNL_TYPE_OPAQUE;
1504        bool ptp_enabled = is_ptp_enabled(skb, dev);
1505        unsigned int last_desc, flits, ndesc;
1506        u32 wr_mid, ctrl0, op, sgl_off = 0;
1507        const struct skb_shared_info *ssi;
1508        int len, qidx, credits, ret, left;
1509        struct tx_sw_desc *sgl_sdesc;
1510        struct fw_eth_tx_eo_wr *eowr;
1511        struct fw_eth_tx_pkt_wr *wr;
1512        struct cpl_tx_pkt_core *cpl;
1513        const struct port_info *pi;
1514        bool immediate = false;
1515        u64 cntrl, *end, *sgl;
1516        struct sge_eth_txq *q;
1517        unsigned int chip_ver;
1518        struct adapter *adap;
1519
1520        ret = cxgb4_validate_skb(skb, dev, ETH_HLEN);
1521        if (ret)
1522                goto out_free;
1523
1524        pi = netdev_priv(dev);
1525        adap = pi->adapter;
1526        ssi = skb_shinfo(skb);
1527#if IS_ENABLED(CONFIG_CHELSIO_IPSEC_INLINE)
1528        if (xfrm_offload(skb) && !ssi->gso_size)
1529                return adap->uld[CXGB4_ULD_IPSEC].tx_handler(skb, dev);
1530#endif /* CHELSIO_IPSEC_INLINE */
1531
1532#if IS_ENABLED(CONFIG_CHELSIO_TLS_DEVICE)
1533        if (cxgb4_is_ktls_skb(skb) &&
1534            (skb->len - (skb_transport_offset(skb) + tcp_hdrlen(skb))))
1535                return adap->uld[CXGB4_ULD_KTLS].tx_handler(skb, dev);
1536#endif /* CHELSIO_TLS_DEVICE */
1537
1538        qidx = skb_get_queue_mapping(skb);
1539        if (ptp_enabled) {
1540                if (!(adap->ptp_tx_skb)) {
1541                        skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
1542                        adap->ptp_tx_skb = skb_get(skb);
1543                } else {
1544                        goto out_free;
1545                }
1546                q = &adap->sge.ptptxq;
1547        } else {
1548                q = &adap->sge.ethtxq[qidx + pi->first_qset];
1549        }
1550        skb_tx_timestamp(skb);
1551
1552        reclaim_completed_tx(adap, &q->q, -1, true);
1553        cntrl = TXPKT_L4CSUM_DIS_F | TXPKT_IPCSUM_DIS_F;
1554
1555#ifdef CONFIG_CHELSIO_T4_FCOE
1556        ret = cxgb_fcoe_offload(skb, adap, pi, &cntrl);
1557        if (unlikely(ret == -EOPNOTSUPP))
1558                goto out_free;
1559#endif /* CONFIG_CHELSIO_T4_FCOE */
1560
1561        chip_ver = CHELSIO_CHIP_VERSION(adap->params.chip);
1562        flits = calc_tx_flits(skb, chip_ver);
1563        ndesc = flits_to_desc(flits);
1564        credits = txq_avail(&q->q) - ndesc;
1565
1566        if (unlikely(credits < 0)) {
1567                eth_txq_stop(q);
1568                dev_err(adap->pdev_dev,
1569                        "%s: Tx ring %u full while queue awake!\n",
1570                        dev->name, qidx);
1571                return NETDEV_TX_BUSY;
1572        }
1573
1574        if (is_eth_imm(skb, chip_ver))
1575                immediate = true;
1576
1577        if (skb->encapsulation && chip_ver > CHELSIO_T5)
1578                tnl_type = cxgb_encap_offload_supported(skb);
1579
1580        last_desc = q->q.pidx + ndesc - 1;
1581        if (last_desc >= q->q.size)
1582                last_desc -= q->q.size;
1583        sgl_sdesc = &q->q.sdesc[last_desc];
1584
1585        if (!immediate &&
1586            unlikely(cxgb4_map_skb(adap->pdev_dev, skb, sgl_sdesc->addr) < 0)) {
1587                memset(sgl_sdesc->addr, 0, sizeof(sgl_sdesc->addr));
1588                q->mapping_err++;
1589                goto out_free;
1590        }
1591
1592        wr_mid = FW_WR_LEN16_V(DIV_ROUND_UP(flits, 2));
1593        if (unlikely(credits < ETHTXQ_STOP_THRES)) {
1594                /* After we're done injecting the Work Request for this
1595                 * packet, we'll be below our "stop threshold" so stop the TX
1596                 * Queue now and schedule a request for an SGE Egress Queue
1597                 * Update message. The queue will get started later on when
1598                 * the firmware processes this Work Request and sends us an
1599                 * Egress Queue Status Update message indicating that space
1600                 * has opened up.
1601                 */
1602                eth_txq_stop(q);
1603                if (chip_ver > CHELSIO_T5)
1604                        wr_mid |= FW_WR_EQUEQ_F | FW_WR_EQUIQ_F;
1605        }
1606
1607        wr = (void *)&q->q.desc[q->q.pidx];
1608        eowr = (void *)&q->q.desc[q->q.pidx];
1609        wr->equiq_to_len16 = htonl(wr_mid);
1610        wr->r3 = cpu_to_be64(0);
1611        if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
1612                end = (u64 *)eowr + flits;
1613        else
1614                end = (u64 *)wr + flits;
1615
1616        len = immediate ? skb->len : 0;
1617        len += sizeof(*cpl);
1618        if (ssi->gso_size && !(ssi->gso_type & SKB_GSO_UDP_L4)) {
1619                struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
1620                struct cpl_tx_tnl_lso *tnl_lso = (void *)(wr + 1);
1621
1622                if (tnl_type)
1623                        len += sizeof(*tnl_lso);
1624                else
1625                        len += sizeof(*lso);
1626
1627                wr->op_immdlen = htonl(FW_WR_OP_V(FW_ETH_TX_PKT_WR) |
1628                                       FW_WR_IMMDLEN_V(len));
1629                if (tnl_type) {
1630                        struct iphdr *iph = ip_hdr(skb);
1631
1632                        t6_fill_tnl_lso(skb, tnl_lso, tnl_type);
1633                        cpl = (void *)(tnl_lso + 1);
1634                        /* Driver is expected to compute partial checksum that
1635                         * does not include the IP Total Length.
1636                         */
1637                        if (iph->version == 4) {
1638                                iph->check = 0;
1639                                iph->tot_len = 0;
1640                                iph->check = ~ip_fast_csum((u8 *)iph, iph->ihl);
1641                        }
1642                        if (skb->ip_summed == CHECKSUM_PARTIAL)
1643                                cntrl = hwcsum(adap->params.chip, skb);
1644                } else {
1645                        cpl = write_tso_wr(adap, skb, lso);
1646                        cntrl = hwcsum(adap->params.chip, skb);
1647                }
1648                sgl = (u64 *)(cpl + 1); /* sgl start here */
1649                q->tso++;
1650                q->tx_cso += ssi->gso_segs;
1651        } else if (ssi->gso_size) {
1652                u64 *start;
1653                u32 hdrlen;
1654
1655                hdrlen = eth_get_headlen(dev, skb->data, skb_headlen(skb));
1656                len += hdrlen;
1657                wr->op_immdlen = cpu_to_be32(FW_WR_OP_V(FW_ETH_TX_EO_WR) |
1658                                             FW_ETH_TX_EO_WR_IMMDLEN_V(len));
1659                cpl = write_eo_udp_wr(skb, eowr, hdrlen);
1660                cntrl = hwcsum(adap->params.chip, skb);
1661
1662                start = (u64 *)(cpl + 1);
1663                sgl = (u64 *)inline_tx_skb_header(skb, &q->q, (void *)start,
1664                                                  hdrlen);
1665                if (unlikely(start > sgl)) {
1666                        left = (u8 *)end - (u8 *)q->q.stat;
1667                        end = (void *)q->q.desc + left;
1668                }
1669                sgl_off = hdrlen;
1670                q->uso++;
1671                q->tx_cso += ssi->gso_segs;
1672        } else {
1673                if (ptp_enabled)
1674                        op = FW_PTP_TX_PKT_WR;
1675                else
1676                        op = FW_ETH_TX_PKT_WR;
1677                wr->op_immdlen = htonl(FW_WR_OP_V(op) |
1678                                       FW_WR_IMMDLEN_V(len));
1679                cpl = (void *)(wr + 1);
1680                sgl = (u64 *)(cpl + 1);
1681                if (skb->ip_summed == CHECKSUM_PARTIAL) {
1682                        cntrl = hwcsum(adap->params.chip, skb) |
1683                                TXPKT_IPCSUM_DIS_F;
1684                        q->tx_cso++;
1685                }
1686        }
1687
1688        if (unlikely((u8 *)sgl >= (u8 *)q->q.stat)) {
1689                /* If current position is already at the end of the
1690                 * txq, reset the current to point to start of the queue
1691                 * and update the end ptr as well.
1692                 */
1693                left = (u8 *)end - (u8 *)q->q.stat;
1694                end = (void *)q->q.desc + left;
1695                sgl = (void *)q->q.desc;
1696        }
1697
1698        if (skb_vlan_tag_present(skb)) {
1699                q->vlan_ins++;
1700                cntrl |= TXPKT_VLAN_VLD_F | TXPKT_VLAN_V(skb_vlan_tag_get(skb));
1701#ifdef CONFIG_CHELSIO_T4_FCOE
1702                if (skb->protocol == htons(ETH_P_FCOE))
1703                        cntrl |= TXPKT_VLAN_V(
1704                                 ((skb->priority & 0x7) << VLAN_PRIO_SHIFT));
1705#endif /* CONFIG_CHELSIO_T4_FCOE */
1706        }
1707
1708        ctrl0 = TXPKT_OPCODE_V(CPL_TX_PKT_XT) | TXPKT_INTF_V(pi->tx_chan) |
1709                TXPKT_PF_V(adap->pf);
1710        if (ptp_enabled)
1711                ctrl0 |= TXPKT_TSTAMP_F;
1712#ifdef CONFIG_CHELSIO_T4_DCB
1713        if (is_t4(adap->params.chip))
1714                ctrl0 |= TXPKT_OVLAN_IDX_V(q->dcb_prio);
1715        else
1716                ctrl0 |= TXPKT_T5_OVLAN_IDX_V(q->dcb_prio);
1717#endif
1718        cpl->ctrl0 = htonl(ctrl0);
1719        cpl->pack = htons(0);
1720        cpl->len = htons(skb->len);
1721        cpl->ctrl1 = cpu_to_be64(cntrl);
1722
1723        if (immediate) {
1724                cxgb4_inline_tx_skb(skb, &q->q, sgl);
1725                dev_consume_skb_any(skb);
1726        } else {
1727                cxgb4_write_sgl(skb, &q->q, (void *)sgl, end, sgl_off,
1728                                sgl_sdesc->addr);
1729                skb_orphan(skb);
1730                sgl_sdesc->skb = skb;
1731        }
1732
1733        txq_advance(&q->q, ndesc);
1734
1735        cxgb4_ring_tx_db(adap, &q->q, ndesc);
1736        return NETDEV_TX_OK;
1737
1738out_free:
1739        dev_kfree_skb_any(skb);
1740        return NETDEV_TX_OK;
1741}
1742
1743/* Constants ... */
1744enum {
1745        /* Egress Queue sizes, producer and consumer indices are all in units
1746         * of Egress Context Units bytes.  Note that as far as the hardware is
1747         * concerned, the free list is an Egress Queue (the host produces free
1748         * buffers which the hardware consumes) and free list entries are
1749         * 64-bit PCI DMA addresses.
1750         */
1751        EQ_UNIT = SGE_EQ_IDXSIZE,
1752        FL_PER_EQ_UNIT = EQ_UNIT / sizeof(__be64),
1753        TXD_PER_EQ_UNIT = EQ_UNIT / sizeof(__be64),
1754
1755        T4VF_ETHTXQ_MAX_HDR = (sizeof(struct fw_eth_tx_pkt_vm_wr) +
1756                               sizeof(struct cpl_tx_pkt_lso_core) +
1757                               sizeof(struct cpl_tx_pkt_core)) / sizeof(__be64),
1758};
1759
/**
 *	t4vf_is_eth_imm - may an Ethernet packet go out as immediate data?
 *	@skb: the packet
 *
 *	Reports whether the packet is small enough to be carried entirely as
 *	immediate data inside the Work Request.  The answer is currently
 *	always "no" (zero).
 */
static inline int t4vf_is_eth_imm(const struct sk_buff *skb)
{
	/* FW_ETH_TX_PKT_VM_WR, the firmware Work Request the VF Driver uses,
	 * has no room for immediate data.  The immediate-data support code
	 * is nevertheless kept in place: removing it would tie our hands if
	 * the firmware is ever enhanced, and would widen the gap between the
	 * PF and VF Drivers.
	 */
	return 0;
}
1777
1778/**
1779 *      t4vf_calc_tx_flits - calculate the number of flits for a packet TX WR
1780 *      @skb: the packet
1781 *
1782 *      Returns the number of flits needed for a TX Work Request for the
1783 *      given Ethernet packet, including the needed WR and CPL headers.
1784 */
1785static inline unsigned int t4vf_calc_tx_flits(const struct sk_buff *skb)
1786{
1787        unsigned int flits;
1788
1789        /* If the skb is small enough, we can pump it out as a work request
1790         * with only immediate data.  In that case we just have to have the
1791         * TX Packet header plus the skb data in the Work Request.
1792         */
1793        if (t4vf_is_eth_imm(skb))
1794                return DIV_ROUND_UP(skb->len + sizeof(struct cpl_tx_pkt),
1795                                    sizeof(__be64));
1796
1797        /* Otherwise, we're going to have to construct a Scatter gather list
1798         * of the skb body and fragments.  We also include the flits necessary
1799         * for the TX Packet Work Request and CPL.  We always have a firmware
1800         * Write Header (incorporated as part of the cpl_tx_pkt_lso and
1801         * cpl_tx_pkt structures), followed by either a TX Packet Write CPL
1802         * message or, if we're doing a Large Send Offload, an LSO CPL message
1803         * with an embedded TX Packet Write CPL message.
1804         */
1805        flits = sgl_len(skb_shinfo(skb)->nr_frags + 1);
1806        if (skb_shinfo(skb)->gso_size)
1807                flits += (sizeof(struct fw_eth_tx_pkt_vm_wr) +
1808                          sizeof(struct cpl_tx_pkt_lso_core) +
1809                          sizeof(struct cpl_tx_pkt_core)) / sizeof(__be64);
1810        else
1811                flits += (sizeof(struct fw_eth_tx_pkt_vm_wr) +
1812                          sizeof(struct cpl_tx_pkt_core)) / sizeof(__be64);
1813        return flits;
1814}
1815
1816/**
1817 *      cxgb4_vf_eth_xmit - add a packet to an Ethernet TX queue
1818 *      @skb: the packet
1819 *      @dev: the egress net device
1820 *
1821 *      Add a packet to an SGE Ethernet TX queue.  Runs with softirqs disabled.
1822 */
1823static netdev_tx_t cxgb4_vf_eth_xmit(struct sk_buff *skb,
1824                                     struct net_device *dev)
1825{
1826        unsigned int last_desc, flits, ndesc;
1827        const struct skb_shared_info *ssi;
1828        struct fw_eth_tx_pkt_vm_wr *wr;
1829        struct tx_sw_desc *sgl_sdesc;
1830        struct cpl_tx_pkt_core *cpl;
1831        const struct port_info *pi;
1832        struct sge_eth_txq *txq;
1833        struct adapter *adapter;
1834        int qidx, credits, ret;
1835        size_t fw_hdr_copy_len;
1836        unsigned int chip_ver;
1837        u64 cntrl, *end;
1838        u32 wr_mid;
1839
1840        /* The chip minimum packet length is 10 octets but the firmware
1841         * command that we are using requires that we copy the Ethernet header
1842         * (including the VLAN tag) into the header so we reject anything
1843         * smaller than that ...
1844         */
1845        fw_hdr_copy_len = sizeof(wr->ethmacdst) + sizeof(wr->ethmacsrc) +
1846                          sizeof(wr->ethtype) + sizeof(wr->vlantci);
1847        ret = cxgb4_validate_skb(skb, dev, fw_hdr_copy_len);
1848        if (ret)
1849                goto out_free;
1850
1851        /* Figure out which TX Queue we're going to use. */
1852        pi = netdev_priv(dev);
1853        adapter = pi->adapter;
1854        qidx = skb_get_queue_mapping(skb);
1855        WARN_ON(qidx >= pi->nqsets);
1856        txq = &adapter->sge.ethtxq[pi->first_qset + qidx];
1857
1858        /* Take this opportunity to reclaim any TX Descriptors whose DMA
1859         * transfers have completed.
1860         */
1861        reclaim_completed_tx(adapter, &txq->q, -1, true);
1862
1863        /* Calculate the number of flits and TX Descriptors we're going to
1864         * need along with how many TX Descriptors will be left over after
1865         * we inject our Work Request.
1866         */
1867        flits = t4vf_calc_tx_flits(skb);
1868        ndesc = flits_to_desc(flits);
1869        credits = txq_avail(&txq->q) - ndesc;
1870
1871        if (unlikely(credits < 0)) {
1872                /* Not enough room for this packet's Work Request.  Stop the
1873                 * TX Queue and return a "busy" condition.  The queue will get
1874                 * started later on when the firmware informs us that space
1875                 * has opened up.
1876                 */
1877                eth_txq_stop(txq);
1878                dev_err(adapter->pdev_dev,
1879                        "%s: TX ring %u full while queue awake!\n",
1880                        dev->name, qidx);
1881                return NETDEV_TX_BUSY;
1882        }
1883
1884        last_desc = txq->q.pidx + ndesc - 1;
1885        if (last_desc >= txq->q.size)
1886                last_desc -= txq->q.size;
1887        sgl_sdesc = &txq->q.sdesc[last_desc];
1888
1889        if (!t4vf_is_eth_imm(skb) &&
1890            unlikely(cxgb4_map_skb(adapter->pdev_dev, skb,
1891                                   sgl_sdesc->addr) < 0)) {
1892                /* We need to map the skb into PCI DMA space (because it can't
1893                 * be in-lined directly into the Work Request) and the mapping
1894                 * operation failed.  Record the error and drop the packet.
1895                 */
1896                memset(sgl_sdesc->addr, 0, sizeof(sgl_sdesc->addr));
1897                txq->mapping_err++;
1898                goto out_free;
1899        }
1900
1901        chip_ver = CHELSIO_CHIP_VERSION(adapter->params.chip);
1902        wr_mid = FW_WR_LEN16_V(DIV_ROUND_UP(flits, 2));
1903        if (unlikely(credits < ETHTXQ_STOP_THRES)) {
1904                /* After we're done injecting the Work Request for this
1905                 * packet, we'll be below our "stop threshold" so stop the TX
1906                 * Queue now and schedule a request for an SGE Egress Queue
1907                 * Update message.  The queue will get started later on when
1908                 * the firmware processes this Work Request and sends us an
1909                 * Egress Queue Status Update message indicating that space
1910                 * has opened up.
1911                 */
1912                eth_txq_stop(txq);
1913                if (chip_ver > CHELSIO_T5)
1914                        wr_mid |= FW_WR_EQUEQ_F | FW_WR_EQUIQ_F;
1915        }
1916
1917        /* Start filling in our Work Request.  Note that we do _not_ handle
1918         * the WR Header wrapping around the TX Descriptor Ring.  If our
1919         * maximum header size ever exceeds one TX Descriptor, we'll need to
1920         * do something else here.
1921         */
1922        WARN_ON(DIV_ROUND_UP(T4VF_ETHTXQ_MAX_HDR, TXD_PER_EQ_UNIT) > 1);
1923        wr = (void *)&txq->q.desc[txq->q.pidx];
1924        wr->equiq_to_len16 = cpu_to_be32(wr_mid);
1925        wr->r3[0] = cpu_to_be32(0);
1926        wr->r3[1] = cpu_to_be32(0);
1927        skb_copy_from_linear_data(skb, (void *)wr->ethmacdst, fw_hdr_copy_len);
1928        end = (u64 *)wr + flits;
1929
1930        /* If this is a Large Send Offload packet we'll put in an LSO CPL
1931         * message with an encapsulated TX Packet CPL message.  Otherwise we
1932         * just use a TX Packet CPL message.
1933         */
1934        ssi = skb_shinfo(skb);
1935        if (ssi->gso_size) {
1936                struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
1937                bool v6 = (ssi->gso_type & SKB_GSO_TCPV6) != 0;
1938                int l3hdr_len = skb_network_header_len(skb);
1939                int eth_xtra_len = skb_network_offset(skb) - ETH_HLEN;
1940
1941                wr->op_immdlen =
1942                        cpu_to_be32(FW_WR_OP_V(FW_ETH_TX_PKT_VM_WR) |
1943                                    FW_WR_IMMDLEN_V(sizeof(*lso) +
1944                                                    sizeof(*cpl)));
1945                 /* Fill in the LSO CPL message. */
1946                lso->lso_ctrl =
1947                        cpu_to_be32(LSO_OPCODE_V(CPL_TX_PKT_LSO) |
1948                                    LSO_FIRST_SLICE_F |
1949                                    LSO_LAST_SLICE_F |
1950                                    LSO_IPV6_V(v6) |
1951                                    LSO_ETHHDR_LEN_V(eth_xtra_len / 4) |
1952                                    LSO_IPHDR_LEN_V(l3hdr_len / 4) |
1953                                    LSO_TCPHDR_LEN_V(tcp_hdr(skb)->doff));
1954                lso->ipid_ofst = cpu_to_be16(0);
1955                lso->mss = cpu_to_be16(ssi->gso_size);
1956                lso->seqno_offset = cpu_to_be32(0);
1957                if (is_t4(adapter->params.chip))
1958                        lso->len = cpu_to_be32(skb->len);
1959                else
1960                        lso->len = cpu_to_be32(LSO_T5_XFER_SIZE_V(skb->len));
1961
1962                /* Set up TX Packet CPL pointer, control word and perform
1963                 * accounting.
1964                 */
1965                cpl = (void *)(lso + 1);
1966
1967                if (chip_ver <= CHELSIO_T5)
1968                        cntrl = TXPKT_ETHHDR_LEN_V(eth_xtra_len);
1969                else
1970                        cntrl = T6_TXPKT_ETHHDR_LEN_V(eth_xtra_len);
1971
1972                cntrl |= TXPKT_CSUM_TYPE_V(v6 ?
1973                                           TX_CSUM_TCPIP6 : TX_CSUM_TCPIP) |
1974                         TXPKT_IPHDR_LEN_V(l3hdr_len);
1975                txq->tso++;
1976                txq->tx_cso += ssi->gso_segs;
1977        } else {
1978                int len;
1979
1980                len = (t4vf_is_eth_imm(skb)
1981                       ? skb->len + sizeof(*cpl)
1982                       : sizeof(*cpl));
1983                wr->op_immdlen =
1984                        cpu_to_be32(FW_WR_OP_V(FW_ETH_TX_PKT_VM_WR) |
1985                                    FW_WR_IMMDLEN_V(len));
1986
1987                /* Set up TX Packet CPL pointer, control word and perform
1988                 * accounting.
1989                 */
1990                cpl = (void *)(wr + 1);
1991                if (skb->ip_summed == CHECKSUM_PARTIAL) {
1992                        cntrl = hwcsum(adapter->params.chip, skb) |
1993                                TXPKT_IPCSUM_DIS_F;
1994                        txq->tx_cso++;
1995                } else {
1996                        cntrl = TXPKT_L4CSUM_DIS_F | TXPKT_IPCSUM_DIS_F;
1997                }
1998        }
1999
2000        /* If there's a VLAN tag present, add that to the list of things to
2001         * do in this Work Request.
2002         */
2003        if (skb_vlan_tag_present(skb)) {
2004                txq->vlan_ins++;
2005                cntrl |= TXPKT_VLAN_VLD_F | TXPKT_VLAN_V(skb_vlan_tag_get(skb));
2006        }
2007
2008         /* Fill in the TX Packet CPL message header. */
2009        cpl->ctrl0 = cpu_to_be32(TXPKT_OPCODE_V(CPL_TX_PKT_XT) |
2010                                 TXPKT_INTF_V(pi->port_id) |
2011                                 TXPKT_PF_V(0));
2012        cpl->pack = cpu_to_be16(0);
2013        cpl->len = cpu_to_be16(skb->len);
2014        cpl->ctrl1 = cpu_to_be64(cntrl);
2015
2016        /* Fill in the body of the TX Packet CPL message with either in-lined
2017         * data or a Scatter/Gather List.
2018         */
2019        if (t4vf_is_eth_imm(skb)) {
2020                /* In-line the packet's data and free the skb since we don't
2021                 * need it any longer.
2022                 */
2023                cxgb4_inline_tx_skb(skb, &txq->q, cpl + 1);
2024                dev_consume_skb_any(skb);
2025        } else {
2026                /* Write the skb's Scatter/Gather list into the TX Packet CPL
2027                 * message and retain a pointer to the skb so we can free it
2028                 * later when its DMA completes.  (We store the skb pointer
2029                 * in the Software Descriptor corresponding to the last TX
2030                 * Descriptor used by the Work Request.)
2031                 *
2032                 * The retained skb will be freed when the corresponding TX
2033                 * Descriptors are reclaimed after their DMAs complete.
2034                 * However, this could take quite a while since, in general,
2035                 * the hardware is set up to be lazy about sending DMA
2036                 * completion notifications to us and we mostly perform TX
2037                 * reclaims in the transmit routine.
2038                 *
2039                 * This is good for performamce but means that we rely on new
2040                 * TX packets arriving to run the destructors of completed
2041                 * packets, which open up space in their sockets' send queues.
2042                 * Sometimes we do not get such new packets causing TX to
2043                 * stall.  A single UDP transmitter is a good example of this
2044                 * situation.  We have a clean up timer that periodically
2045                 * reclaims completed packets but it doesn't run often enough
2046                 * (nor do we want it to) to prevent lengthy stalls.  A
2047                 * solution to this problem is to run the destructor early,
2048                 * after the packet is queued but before it's DMAd.  A con is
2049                 * that we lie to socket memory accounting, but the amount of
2050                 * extra memory is reasonable (limited by the number of TX
2051                 * descriptors), the packets do actually get freed quickly by
2052                 * new packets almost always, and for protocols like TCP that
2053                 * wait for acks to really free up the data the extra memory
2054                 * is even less.  On the positive side we run the destructors
2055                 * on the sending CPU rather than on a potentially different
2056                 * completing CPU, usually a good thing.
2057                 *
2058                 * Run the destructor before telling the DMA engine about the
2059                 * packet to make sure it doesn't complete and get freed
2060                 * prematurely.
2061                 */
2062                struct ulptx_sgl *sgl = (struct ulptx_sgl *)(cpl + 1);
2063                struct sge_txq *tq = &txq->q;
2064
2065                /* If the Work Request header was an exact multiple of our TX
2066                 * Descriptor length, then it's possible that the starting SGL
2067                 * pointer lines up exactly with the end of our TX Descriptor
2068                 * ring.  If that's the case, wrap around to the beginning
2069                 * here ...
2070                 */
2071                if (unlikely((void *)sgl == (void *)tq->stat)) {
2072                        sgl = (void *)tq->desc;
2073                        end = (void *)((void *)tq->desc +
2074                                       ((void *)end - (void *)tq->stat));
2075                }
2076
2077                cxgb4_write_sgl(skb, tq, sgl, end, 0, sgl_sdesc->addr);
2078                skb_orphan(skb);
2079                sgl_sdesc->skb = skb;
2080        }
2081
2082        /* Advance our internal TX Queue state, tell the hardware about
2083         * the new TX descriptors and return success.
2084         */
2085        txq_advance(&txq->q, ndesc);
2086
2087        cxgb4_ring_tx_db(adapter, &txq->q, ndesc);
2088        return NETDEV_TX_OK;
2089
2090out_free:
2091        /* An error of some sort happened.  Free the TX skb and tell the
2092         * OS that we've "dealt" with the packet ...
2093         */
2094        dev_kfree_skb_any(skb);
2095        return NETDEV_TX_OK;
2096}
2097
2098/**
2099 * reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
2100 * @q: the SGE control Tx queue
2101 *
2102 * This is a variant of cxgb4_reclaim_completed_tx() that is used
2103 * for Tx queues that send only immediate data (presently just
2104 * the control queues) and      thus do not have any sk_buffs to release.
2105 */
2106static inline void reclaim_completed_tx_imm(struct sge_txq *q)
2107{
2108        int hw_cidx = ntohs(READ_ONCE(q->stat->cidx));
2109        int reclaim = hw_cidx - q->cidx;
2110
2111        if (reclaim < 0)
2112                reclaim += q->size;
2113
2114        q->in_use -= reclaim;
2115        q->cidx = hw_cidx;
2116}
2117
2118static inline void eosw_txq_advance_index(u32 *idx, u32 n, u32 max)
2119{
2120        u32 val = *idx + n;
2121
2122        if (val >= max)
2123                val -= max;
2124
2125        *idx = val;
2126}
2127
2128void cxgb4_eosw_txq_free_desc(struct adapter *adap,
2129                              struct sge_eosw_txq *eosw_txq, u32 ndesc)
2130{
2131        struct tx_sw_desc *d;
2132
2133        d = &eosw_txq->desc[eosw_txq->last_cidx];
2134        while (ndesc--) {
2135                if (d->skb) {
2136                        if (d->addr[0]) {
2137                                unmap_skb(adap->pdev_dev, d->skb, d->addr);
2138                                memset(d->addr, 0, sizeof(d->addr));
2139                        }
2140                        dev_consume_skb_any(d->skb);
2141                        d->skb = NULL;
2142                }
2143                eosw_txq_advance_index(&eosw_txq->last_cidx, 1,
2144                                       eosw_txq->ndesc);
2145                d = &eosw_txq->desc[eosw_txq->last_cidx];
2146        }
2147}
2148
2149static inline void eosw_txq_advance(struct sge_eosw_txq *eosw_txq, u32 n)
2150{
2151        eosw_txq_advance_index(&eosw_txq->pidx, n, eosw_txq->ndesc);
2152        eosw_txq->inuse += n;
2153}
2154
2155static inline int eosw_txq_enqueue(struct sge_eosw_txq *eosw_txq,
2156                                   struct sk_buff *skb)
2157{
2158        if (eosw_txq->inuse == eosw_txq->ndesc)
2159                return -ENOMEM;
2160
2161        eosw_txq->desc[eosw_txq->pidx].skb = skb;
2162        return 0;
2163}
2164
2165static inline struct sk_buff *eosw_txq_peek(struct sge_eosw_txq *eosw_txq)
2166{
2167        return eosw_txq->desc[eosw_txq->last_pidx].skb;
2168}
2169
2170static inline u8 ethofld_calc_tx_flits(struct adapter *adap,
2171                                       struct sk_buff *skb, u32 hdr_len)
2172{
2173        u8 flits, nsgl = 0;
2174        u32 wrlen;
2175
2176        wrlen = sizeof(struct fw_eth_tx_eo_wr) + sizeof(struct cpl_tx_pkt_core);
2177        if (skb_shinfo(skb)->gso_size &&
2178            !(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4))
2179                wrlen += sizeof(struct cpl_tx_pkt_lso_core);
2180
2181        wrlen += roundup(hdr_len, 16);
2182
2183        /* Packet headers + WR + CPLs */
2184        flits = DIV_ROUND_UP(wrlen, 8);
2185
2186        if (skb_shinfo(skb)->nr_frags > 0) {
2187                if (skb_headlen(skb) - hdr_len)
2188                        nsgl = sgl_len(skb_shinfo(skb)->nr_frags + 1);
2189                else
2190                        nsgl = sgl_len(skb_shinfo(skb)->nr_frags);
2191        } else if (skb->len - hdr_len) {
2192                nsgl = sgl_len(1);
2193        }
2194
2195        return flits + nsgl;
2196}
2197
2198static void *write_eo_wr(struct adapter *adap, struct sge_eosw_txq *eosw_txq,
2199                         struct sk_buff *skb, struct fw_eth_tx_eo_wr *wr,
2200                         u32 hdr_len, u32 wrlen)
2201{
2202        const struct skb_shared_info *ssi = skb_shinfo(skb);
2203        struct cpl_tx_pkt_core *cpl;
2204        u32 immd_len, wrlen16;
2205        bool compl = false;
2206        u8 ver, proto;
2207
2208        ver = ip_hdr(skb)->version;
2209        proto = (ver == 6) ? ipv6_hdr(skb)->nexthdr : ip_hdr(skb)->protocol;
2210
2211        wrlen16 = DIV_ROUND_UP(wrlen, 16);
2212        immd_len = sizeof(struct cpl_tx_pkt_core);
2213        if (skb_shinfo(skb)->gso_size &&
2214            !(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4))
2215                immd_len += sizeof(struct cpl_tx_pkt_lso_core);
2216        immd_len += hdr_len;
2217
2218        if (!eosw_txq->ncompl ||
2219            (eosw_txq->last_compl + wrlen16) >=
2220            (adap->params.ofldq_wr_cred / 2)) {
2221                compl = true;
2222                eosw_txq->ncompl++;
2223                eosw_txq->last_compl = 0;
2224        }
2225
2226        wr->op_immdlen = cpu_to_be32(FW_WR_OP_V(FW_ETH_TX_EO_WR) |
2227                                     FW_ETH_TX_EO_WR_IMMDLEN_V(immd_len) |
2228                                     FW_WR_COMPL_V(compl));
2229        wr->equiq_to_len16 = cpu_to_be32(FW_WR_LEN16_V(wrlen16) |
2230                                         FW_WR_FLOWID_V(eosw_txq->hwtid));
2231        wr->r3 = 0;
2232        if (proto == IPPROTO_UDP) {
2233                cpl = write_eo_udp_wr(skb, wr, hdr_len);
2234        } else {
2235                wr->u.tcpseg.type = FW_ETH_TX_EO_TYPE_TCPSEG;
2236                wr->u.tcpseg.ethlen = skb_network_offset(skb);
2237                wr->u.tcpseg.iplen = cpu_to_be16(skb_network_header_len(skb));
2238                wr->u.tcpseg.tcplen = tcp_hdrlen(skb);
2239                wr->u.tcpseg.tsclk_tsoff = 0;
2240                wr->u.tcpseg.r4 = 0;
2241                wr->u.tcpseg.r5 = 0;
2242                wr->u.tcpseg.plen = cpu_to_be32(skb->len - hdr_len);
2243
2244                if (ssi->gso_size) {
2245                        struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
2246
2247                        wr->u.tcpseg.mss = cpu_to_be16(ssi->gso_size);
2248                        cpl = write_tso_wr(adap, skb, lso);
2249                } else {
2250                        wr->u.tcpseg.mss = cpu_to_be16(0xffff);
2251                        cpl = (void *)(wr + 1);
2252                }
2253        }
2254
2255        eosw_txq->cred -= wrlen16;
2256        eosw_txq->last_compl += wrlen16;
2257        return cpl;
2258}
2259
2260static int ethofld_hard_xmit(struct net_device *dev,
2261                             struct sge_eosw_txq *eosw_txq)
2262{
2263        struct port_info *pi = netdev2pinfo(dev);
2264        struct adapter *adap = netdev2adap(dev);
2265        u32 wrlen, wrlen16, hdr_len, data_len;
2266        enum sge_eosw_state next_state;
2267        u64 cntrl, *start, *end, *sgl;
2268        struct sge_eohw_txq *eohw_txq;
2269        struct cpl_tx_pkt_core *cpl;
2270        struct fw_eth_tx_eo_wr *wr;
2271        bool skip_eotx_wr = false;
2272        struct tx_sw_desc *d;
2273        struct sk_buff *skb;
2274        int left, ret = 0;
2275        u8 flits, ndesc;
2276
2277        eohw_txq = &adap->sge.eohw_txq[eosw_txq->hwqid];
2278        spin_lock(&eohw_txq->lock);
2279        reclaim_completed_tx_imm(&eohw_txq->q);
2280
2281        d = &eosw_txq->desc[eosw_txq->last_pidx];
2282        skb = d->skb;
2283        skb_tx_timestamp(skb);
2284
2285        wr = (struct fw_eth_tx_eo_wr *)&eohw_txq->q.desc[eohw_txq->q.pidx];
2286        if (unlikely(eosw_txq->state != CXGB4_EO_STATE_ACTIVE &&
2287                     eosw_txq->last_pidx == eosw_txq->flowc_idx)) {
2288                hdr_len = skb->len;
2289                data_len = 0;
2290                flits = DIV_ROUND_UP(hdr_len, 8);
2291                if (eosw_txq->state == CXGB4_EO_STATE_FLOWC_OPEN_SEND)
2292                        next_state = CXGB4_EO_STATE_FLOWC_OPEN_REPLY;
2293                else
2294                        next_state = CXGB4_EO_STATE_FLOWC_CLOSE_REPLY;
2295                skip_eotx_wr = true;
2296        } else {
2297                hdr_len = eth_get_headlen(dev, skb->data, skb_headlen(skb));
2298                data_len = skb->len - hdr_len;
2299                flits = ethofld_calc_tx_flits(adap, skb, hdr_len);
2300        }
2301        ndesc = flits_to_desc(flits);
2302        wrlen = flits * 8;
2303        wrlen16 = DIV_ROUND_UP(wrlen, 16);
2304
2305        left = txq_avail(&eohw_txq->q) - ndesc;
2306
2307        /* If there are no descriptors left in hardware queues or no
2308         * CPL credits left in software queues, then wait for them
2309         * to come back and retry again. Note that we always request
2310         * for credits update via interrupt for every half credits
2311         * consumed. So, the interrupt will eventually restore the
2312         * credits and invoke the Tx path again.
2313         */
2314        if (unlikely(left < 0 || wrlen16 > eosw_txq->cred)) {
2315                ret = -ENOMEM;
2316                goto out_unlock;
2317        }
2318
2319        if (unlikely(skip_eotx_wr)) {
2320                start = (u64 *)wr;
2321                eosw_txq->state = next_state;
2322                eosw_txq->cred -= wrlen16;
2323                eosw_txq->ncompl++;
2324                eosw_txq->last_compl = 0;
2325                goto write_wr_headers;
2326        }
2327
2328        cpl = write_eo_wr(adap, eosw_txq, skb, wr, hdr_len, wrlen);
2329        cntrl = hwcsum(adap->params.chip, skb);
2330        if (skb_vlan_tag_present(skb))
2331                cntrl |= TXPKT_VLAN_VLD_F | TXPKT_VLAN_V(skb_vlan_tag_get(skb));
2332
2333        cpl->ctrl0 = cpu_to_be32(TXPKT_OPCODE_V(CPL_TX_PKT_XT) |
2334                                 TXPKT_INTF_V(pi->tx_chan) |
2335                                 TXPKT_PF_V(adap->pf));
2336        cpl->pack = 0;
2337        cpl->len = cpu_to_be16(skb->len);
2338        cpl->ctrl1 = cpu_to_be64(cntrl);
2339
2340        start = (u64 *)(cpl + 1);
2341
2342write_wr_headers:
2343        sgl = (u64 *)inline_tx_skb_header(skb, &eohw_txq->q, (void *)start,
2344                                          hdr_len);
2345        if (data_len) {
2346                ret = cxgb4_map_skb(adap->pdev_dev, skb, d->addr);
2347                if (unlikely(ret)) {
2348                        memset(d->addr, 0, sizeof(d->addr));
2349                        eohw_txq->mapping_err++;
2350                        goto out_unlock;
2351                }
2352
2353                end = (u64 *)wr + flits;
2354                if (unlikely(start > sgl)) {
2355                        left = (u8 *)end - (u8 *)eohw_txq->q.stat;
2356                        end = (void *)eohw_txq->q.desc + left;
2357                }
2358
2359                if (unlikely((u8 *)sgl >= (u8 *)eohw_txq->q.stat)) {
2360                        /* If current position is already at the end of the
2361                         * txq, reset the current to point to start of the queue
2362                         * and update the end ptr as well.
2363                         */
2364                        left = (u8 *)end - (u8 *)eohw_txq->q.stat;
2365
2366                        end = (void *)eohw_txq->q.desc + left;
2367                        sgl = (void *)eohw_txq->q.desc;
2368                }
2369
2370                cxgb4_write_sgl(skb, &eohw_txq->q, (void *)sgl, end, hdr_len,
2371                                d->addr);
2372        }
2373
2374        if (skb_shinfo(skb)->gso_size) {
2375                if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
2376                        eohw_txq->uso++;
2377                else
2378                        eohw_txq->tso++;
2379                eohw_txq->tx_cso += skb_shinfo(skb)->gso_segs;
2380        } else if (skb->ip_summed == CHECKSUM_PARTIAL) {
2381                eohw_txq->tx_cso++;
2382        }
2383
2384        if (skb_vlan_tag_present(skb))
2385                eohw_txq->vlan_ins++;
2386
2387        txq_advance(&eohw_txq->q, ndesc);
2388        cxgb4_ring_tx_db(adap, &eohw_txq->q, ndesc);
2389        eosw_txq_advance_index(&eosw_txq->last_pidx, 1, eosw_txq->ndesc);
2390
2391out_unlock:
2392        spin_unlock(&eohw_txq->lock);
2393        return ret;
2394}
2395
2396static void ethofld_xmit(struct net_device *dev, struct sge_eosw_txq *eosw_txq)
2397{
2398        struct sk_buff *skb;
2399        int pktcount, ret;
2400
2401        switch (eosw_txq->state) {
2402        case CXGB4_EO_STATE_ACTIVE:
2403        case CXGB4_EO_STATE_FLOWC_OPEN_SEND:
2404        case CXGB4_EO_STATE_FLOWC_CLOSE_SEND:
2405                pktcount = eosw_txq->pidx - eosw_txq->last_pidx;
2406                if (pktcount < 0)
2407                        pktcount += eosw_txq->ndesc;
2408                break;
2409        case CXGB4_EO_STATE_FLOWC_OPEN_REPLY:
2410        case CXGB4_EO_STATE_FLOWC_CLOSE_REPLY:
2411        case CXGB4_EO_STATE_CLOSED:
2412        default:
2413                return;
2414        }
2415
2416        while (pktcount--) {
2417                skb = eosw_txq_peek(eosw_txq);
2418                if (!skb) {
2419                        eosw_txq_advance_index(&eosw_txq->last_pidx, 1,
2420                                               eosw_txq->ndesc);
2421                        continue;
2422                }
2423
2424                ret = ethofld_hard_xmit(dev, eosw_txq);
2425                if (ret)
2426                        break;
2427        }
2428}
2429
2430static netdev_tx_t cxgb4_ethofld_xmit(struct sk_buff *skb,
2431                                      struct net_device *dev)
2432{
2433        struct cxgb4_tc_port_mqprio *tc_port_mqprio;
2434        struct port_info *pi = netdev2pinfo(dev);
2435        struct adapter *adap = netdev2adap(dev);
2436        struct sge_eosw_txq *eosw_txq;
2437        u32 qid;
2438        int ret;
2439
2440        ret = cxgb4_validate_skb(skb, dev, ETH_HLEN);
2441        if (ret)
2442                goto out_free;
2443
2444        tc_port_mqprio = &adap->tc_mqprio->port_mqprio[pi->port_id];
2445        qid = skb_get_queue_mapping(skb) - pi->nqsets;
2446        eosw_txq = &tc_port_mqprio->eosw_txq[qid];
2447        spin_lock_bh(&eosw_txq->lock);
2448        if (eosw_txq->state != CXGB4_EO_STATE_ACTIVE)
2449                goto out_unlock;
2450
2451        ret = eosw_txq_enqueue(eosw_txq, skb);
2452        if (ret)
2453                goto out_unlock;
2454
2455        /* SKB is queued for processing until credits are available.
2456         * So, call the destructor now and we'll free the skb later
2457         * after it has been successfully transmitted.
2458         */
2459        skb_orphan(skb);
2460
2461        eosw_txq_advance(eosw_txq, 1);
2462        ethofld_xmit(dev, eosw_txq);
2463        spin_unlock_bh(&eosw_txq->lock);
2464        return NETDEV_TX_OK;
2465
2466out_unlock:
2467        spin_unlock_bh(&eosw_txq->lock);
2468out_free:
2469        dev_kfree_skb_any(skb);
2470        return NETDEV_TX_OK;
2471}
2472
2473netdev_tx_t t4_start_xmit(struct sk_buff *skb, struct net_device *dev)
2474{
2475        struct port_info *pi = netdev_priv(dev);
2476        u16 qid = skb_get_queue_mapping(skb);
2477
2478        if (unlikely(pi->eth_flags & PRIV_FLAG_PORT_TX_VM))
2479                return cxgb4_vf_eth_xmit(skb, dev);
2480
2481        if (unlikely(qid >= pi->nqsets))
2482                return cxgb4_ethofld_xmit(skb, dev);
2483
2484        if (is_ptp_enabled(skb, dev)) {
2485                struct adapter *adap = netdev2adap(dev);
2486                netdev_tx_t ret;
2487
2488                spin_lock(&adap->ptp_lock);
2489                ret = cxgb4_eth_xmit(skb, dev);
2490                spin_unlock(&adap->ptp_lock);
2491                return ret;
2492        }
2493
2494        return cxgb4_eth_xmit(skb, dev);
2495}
2496
2497static void eosw_txq_flush_pending_skbs(struct sge_eosw_txq *eosw_txq)
2498{
2499        int pktcount = eosw_txq->pidx - eosw_txq->last_pidx;
2500        int pidx = eosw_txq->pidx;
2501        struct sk_buff *skb;
2502
2503        if (!pktcount)
2504                return;
2505
2506        if (pktcount < 0)
2507                pktcount += eosw_txq->ndesc;
2508
2509        while (pktcount--) {
2510                pidx--;
2511                if (pidx < 0)
2512                        pidx += eosw_txq->ndesc;
2513
2514                skb = eosw_txq->desc[pidx].skb;
2515                if (skb) {
2516                        dev_consume_skb_any(skb);
2517                        eosw_txq->desc[pidx].skb = NULL;
2518                        eosw_txq->inuse--;
2519                }
2520        }
2521
2522        eosw_txq->pidx = eosw_txq->last_pidx + 1;
2523}
2524
2525/**
2526 * cxgb4_ethofld_send_flowc - Send ETHOFLD flowc request to bind eotid to tc.
2527 * @dev: netdevice
2528 * @eotid: ETHOFLD tid to bind/unbind
2529 * @tc: traffic class. If set to FW_SCHED_CLS_NONE, then unbinds the @eotid
2530 *
2531 * Send a FLOWC work request to bind an ETHOFLD TID to a traffic class.
2532 * If @tc is set to FW_SCHED_CLS_NONE, then the @eotid is unbound from
2533 * a traffic class.
2534 */
2535int cxgb4_ethofld_send_flowc(struct net_device *dev, u32 eotid, u32 tc)
2536{
2537        struct port_info *pi = netdev2pinfo(dev);
2538        struct adapter *adap = netdev2adap(dev);
2539        enum sge_eosw_state next_state;
2540        struct sge_eosw_txq *eosw_txq;
2541        u32 len, len16, nparams = 6;
2542        struct fw_flowc_wr *flowc;
2543        struct eotid_entry *entry;
2544        struct sge_ofld_rxq *rxq;
2545        struct sk_buff *skb;
2546        int ret = 0;
2547
2548        len = struct_size(flowc, mnemval, nparams);
2549        len16 = DIV_ROUND_UP(len, 16);
2550
2551        entry = cxgb4_lookup_eotid(&adap->tids, eotid);
2552        if (!entry)
2553                return -ENOMEM;
2554
2555        eosw_txq = (struct sge_eosw_txq *)entry->data;
2556        if (!eosw_txq)
2557                return -ENOMEM;
2558
2559        if (!(adap->flags & CXGB4_FW_OK)) {
2560                /* Don't stall caller when access to FW is lost */
2561                complete(&eosw_txq->completion);
2562                return -EIO;
2563        }
2564
2565        skb = alloc_skb(len, GFP_KERNEL);
2566        if (!skb)
2567                return -ENOMEM;
2568
2569        spin_lock_bh(&eosw_txq->lock);
2570        if (tc != FW_SCHED_CLS_NONE) {
2571                if (eosw_txq->state != CXGB4_EO_STATE_CLOSED)
2572                        goto out_free_skb;
2573
2574                next_state = CXGB4_EO_STATE_FLOWC_OPEN_SEND;
2575        } else {
2576                if (eosw_txq->state != CXGB4_EO_STATE_ACTIVE)
2577                        goto out_free_skb;
2578
2579                next_state = CXGB4_EO_STATE_FLOWC_CLOSE_SEND;
2580        }
2581
2582        flowc = __skb_put(skb, len);
2583        memset(flowc, 0, len);
2584
2585        rxq = &adap->sge.eohw_rxq[eosw_txq->hwqid];
2586        flowc->flowid_len16 = cpu_to_be32(FW_WR_LEN16_V(len16) |
2587                                          FW_WR_FLOWID_V(eosw_txq->hwtid));
2588        flowc->op_to_nparams = cpu_to_be32(FW_WR_OP_V(FW_FLOWC_WR) |
2589                                           FW_FLOWC_WR_NPARAMS_V(nparams) |
2590                                           FW_WR_COMPL_V(1));
2591        flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
2592        flowc->mnemval[0].val = cpu_to_be32(FW_PFVF_CMD_PFN_V(adap->pf));
2593        flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
2594        flowc->mnemval[1].val = cpu_to_be32(pi->tx_chan);
2595        flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
2596        flowc->mnemval[2].val = cpu_to_be32(pi->tx_chan);
2597        flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
2598        flowc->mnemval[3].val = cpu_to_be32(rxq->rspq.abs_id);
2599        flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS;
2600        flowc->mnemval[4].val = cpu_to_be32(tc);
2601        flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_EOSTATE;
2602        flowc->mnemval[5].val = cpu_to_be32(tc == FW_SCHED_CLS_NONE ?
2603                                            FW_FLOWC_MNEM_EOSTATE_CLOSING :
2604                                            FW_FLOWC_MNEM_EOSTATE_ESTABLISHED);
2605
2606        /* Free up any pending skbs to ensure there's room for
2607         * termination FLOWC.
2608         */
2609        if (tc == FW_SCHED_CLS_NONE)
2610                eosw_txq_flush_pending_skbs(eosw_txq);
2611
2612        ret = eosw_txq_enqueue(eosw_txq, skb);
2613        if (ret)
2614                goto out_free_skb;
2615
2616        eosw_txq->state = next_state;
2617        eosw_txq->flowc_idx = eosw_txq->pidx;
2618        eosw_txq_advance(eosw_txq, 1);
2619        ethofld_xmit(dev, eosw_txq);
2620
2621        spin_unlock_bh(&eosw_txq->lock);
2622        return 0;
2623
2624out_free_skb:
2625        dev_consume_skb_any(skb);
2626        spin_unlock_bh(&eosw_txq->lock);
2627        return ret;
2628}
2629
2630/**
2631 *      is_imm - check whether a packet can be sent as immediate data
2632 *      @skb: the packet
2633 *
2634 *      Returns true if a packet can be sent as a WR with immediate data.
2635 */
2636static inline int is_imm(const struct sk_buff *skb)
2637{
2638        return skb->len <= MAX_CTRL_WR_LEN;
2639}
2640
2641/**
2642 *      ctrlq_check_stop - check if a control queue is full and should stop
2643 *      @q: the queue
2644 *      @wr: most recent WR written to the queue
2645 *
2646 *      Check if a control queue has become full and should be stopped.
2647 *      We clean up control queue descriptors very lazily, only when we are out.
2648 *      If the queue is still full after reclaiming any completed descriptors
2649 *      we suspend it and have the last WR wake it up.
2650 */
2651static void ctrlq_check_stop(struct sge_ctrl_txq *q, struct fw_wr_hdr *wr)
2652{
2653        reclaim_completed_tx_imm(&q->q);
2654        if (unlikely(txq_avail(&q->q) < TXQ_STOP_THRES)) {
2655                wr->lo |= htonl(FW_WR_EQUEQ_F | FW_WR_EQUIQ_F);
2656                q->q.stops++;
2657                q->full = 1;
2658        }
2659}
2660
2661#define CXGB4_SELFTEST_LB_STR "CHELSIO_SELFTEST"
2662
2663int cxgb4_selftest_lb_pkt(struct net_device *netdev)
2664{
2665        struct port_info *pi = netdev_priv(netdev);
2666        struct adapter *adap = pi->adapter;
2667        struct cxgb4_ethtool_lb_test *lb;
2668        int ret, i = 0, pkt_len, credits;
2669        struct fw_eth_tx_pkt_wr *wr;
2670        struct cpl_tx_pkt_core *cpl;
2671        u32 ctrl0, ndesc, flits;
2672        struct sge_eth_txq *q;
2673        u8 *sgl;
2674
2675        pkt_len = ETH_HLEN + sizeof(CXGB4_SELFTEST_LB_STR);
2676
2677        flits = DIV_ROUND_UP(pkt_len + sizeof(*cpl) + sizeof(*wr),
2678                             sizeof(__be64));
2679        ndesc = flits_to_desc(flits);
2680
2681        lb = &pi->ethtool_lb;
2682        lb->loopback = 1;
2683
2684        q = &adap->sge.ethtxq[pi->first_qset];
2685        __netif_tx_lock(q->txq, smp_processor_id());
2686
2687        reclaim_completed_tx(adap, &q->q, -1, true);
2688        credits = txq_avail(&q->q) - ndesc;
2689        if (unlikely(credits < 0)) {
2690                __netif_tx_unlock(q->txq);
2691                return -ENOMEM;
2692        }
2693
2694        wr = (void *)&q->q.desc[q->q.pidx];
2695        memset(wr, 0, sizeof(struct tx_desc));
2696
2697        wr->op_immdlen = htonl(FW_WR_OP_V(FW_ETH_TX_PKT_WR) |
2698                               FW_WR_IMMDLEN_V(pkt_len +
2699                               sizeof(*cpl)));
2700        wr->equiq_to_len16 = htonl(FW_WR_LEN16_V(DIV_ROUND_UP(flits, 2)));
2701        wr->r3 = cpu_to_be64(0);
2702
2703        cpl = (void *)(wr + 1);
2704        sgl = (u8 *)(cpl + 1);
2705
2706        ctrl0 = TXPKT_OPCODE_V(CPL_TX_PKT_XT) | TXPKT_PF_V(adap->pf) |
2707                TXPKT_INTF_V(pi->tx_chan + 4);
2708
2709        cpl->ctrl0 = htonl(ctrl0);
2710        cpl->pack = htons(0);
2711        cpl->len = htons(pkt_len);
2712        cpl->ctrl1 = cpu_to_be64(TXPKT_L4CSUM_DIS_F | TXPKT_IPCSUM_DIS_F);
2713
2714        eth_broadcast_addr(sgl);
2715        i += ETH_ALEN;
2716        ether_addr_copy(&sgl[i], netdev->dev_addr);
2717        i += ETH_ALEN;
2718
2719        snprintf(&sgl[i], sizeof(CXGB4_SELFTEST_LB_STR), "%s",
2720                 CXGB4_SELFTEST_LB_STR);
2721
2722        init_completion(&lb->completion);
2723        txq_advance(&q->q, ndesc);
2724        cxgb4_ring_tx_db(adap, &q->q, ndesc);
2725        __netif_tx_unlock(q->txq);
2726
2727        /* wait for the pkt to return */
2728        ret = wait_for_completion_timeout(&lb->completion, 10 * HZ);
2729        if (!ret)
2730                ret = -ETIMEDOUT;
2731        else
2732                ret = lb->result;
2733
2734        lb->loopback = 0;
2735
2736        return ret;
2737}
2738
2739/**
2740 *      ctrl_xmit - send a packet through an SGE control Tx queue
2741 *      @q: the control queue
2742 *      @skb: the packet
2743 *
2744 *      Send a packet through an SGE control Tx queue.  Packets sent through
2745 *      a control queue must fit entirely as immediate data.
2746 */
2747static int ctrl_xmit(struct sge_ctrl_txq *q, struct sk_buff *skb)
2748{
2749        unsigned int ndesc;
2750        struct fw_wr_hdr *wr;
2751
2752        if (unlikely(!is_imm(skb))) {
2753                WARN_ON(1);
2754                dev_kfree_skb(skb);
2755                return NET_XMIT_DROP;
2756        }
2757
2758        ndesc = DIV_ROUND_UP(skb->len, sizeof(struct tx_desc));
2759        spin_lock(&q->sendq.lock);
2760
2761        if (unlikely(q->full)) {
2762                skb->priority = ndesc;                  /* save for restart */
2763                __skb_queue_tail(&q->sendq, skb);
2764                spin_unlock(&q->sendq.lock);
2765                return NET_XMIT_CN;
2766        }
2767
2768        wr = (struct fw_wr_hdr *)&q->q.desc[q->q.pidx];
2769        cxgb4_inline_tx_skb(skb, &q->q, wr);
2770
2771        txq_advance(&q->q, ndesc);
2772        if (unlikely(txq_avail(&q->q) < TXQ_STOP_THRES))
2773                ctrlq_check_stop(q, wr);
2774
2775        cxgb4_ring_tx_db(q->adap, &q->q, ndesc);
2776        spin_unlock(&q->sendq.lock);
2777
2778        kfree_skb(skb);
2779        return NET_XMIT_SUCCESS;
2780}
2781
2782/**
2783 *      restart_ctrlq - restart a suspended control queue
2784 *      @t: pointer to the tasklet associated with this handler
2785 *
2786 *      Resumes transmission on a suspended Tx control queue.
2787 */
2788static void restart_ctrlq(struct tasklet_struct *t)
2789{
2790        struct sk_buff *skb;
2791        unsigned int written = 0;
2792        struct sge_ctrl_txq *q = from_tasklet(q, t, qresume_tsk);
2793
2794        spin_lock(&q->sendq.lock);
2795        reclaim_completed_tx_imm(&q->q);
2796        BUG_ON(txq_avail(&q->q) < TXQ_STOP_THRES);  /* q should be empty */
2797
2798        while ((skb = __skb_dequeue(&q->sendq)) != NULL) {
2799                struct fw_wr_hdr *wr;
2800                unsigned int ndesc = skb->priority;     /* previously saved */
2801
2802                written += ndesc;
2803                /* Write descriptors and free skbs outside the lock to limit
2804                 * wait times.  q->full is still set so new skbs will be queued.
2805                 */
2806                wr = (struct fw_wr_hdr *)&q->q.desc[q->q.pidx];
2807                txq_advance(&q->q, ndesc);
2808                spin_unlock(&q->sendq.lock);
2809
2810                cxgb4_inline_tx_skb(skb, &q->q, wr);
2811                kfree_skb(skb);
2812
2813                if (unlikely(txq_avail(&q->q) < TXQ_STOP_THRES)) {
2814                        unsigned long old = q->q.stops;
2815
2816                        ctrlq_check_stop(q, wr);
2817                        if (q->q.stops != old) {          /* suspended anew */
2818                                spin_lock(&q->sendq.lock);
2819                                goto ringdb;
2820                        }
2821                }
2822                if (written > 16) {
2823                        cxgb4_ring_tx_db(q->adap, &q->q, written);
2824                        written = 0;
2825                }
2826                spin_lock(&q->sendq.lock);
2827        }
2828        q->full = 0;
2829ringdb:
2830        if (written)
2831                cxgb4_ring_tx_db(q->adap, &q->q, written);
2832        spin_unlock(&q->sendq.lock);
2833}
2834
2835/**
2836 *      t4_mgmt_tx - send a management message
2837 *      @adap: the adapter
2838 *      @skb: the packet containing the management message
2839 *
2840 *      Send a management message through control queue 0.
2841 */
2842int t4_mgmt_tx(struct adapter *adap, struct sk_buff *skb)
2843{
2844        int ret;
2845
2846        local_bh_disable();
2847        ret = ctrl_xmit(&adap->sge.ctrlq[0], skb);
2848        local_bh_enable();
2849        return ret;
2850}
2851
2852/**
2853 *      is_ofld_imm - check whether a packet can be sent as immediate data
2854 *      @skb: the packet
2855 *
2856 *      Returns true if a packet can be sent as an offload WR with immediate
2857 *      data.
2858 *      FW_OFLD_TX_DATA_WR limits the payload to 255 bytes due to 8-bit field.
2859 *      However, FW_ULPTX_WR commands have a 256 byte immediate only
2860 *      payload limit.
2861 */
2862static inline int is_ofld_imm(const struct sk_buff *skb)
2863{
2864        struct work_request_hdr *req = (struct work_request_hdr *)skb->data;
2865        unsigned long opcode = FW_WR_OP_G(ntohl(req->wr_hi));
2866
2867        if (unlikely(opcode == FW_ULPTX_WR))
2868                return skb->len <= MAX_IMM_ULPTX_WR_LEN;
2869        else if (opcode == FW_CRYPTO_LOOKASIDE_WR)
2870                return skb->len <= SGE_MAX_WR_LEN;
2871        else
2872                return skb->len <= MAX_IMM_OFLD_TX_DATA_WR_LEN;
2873}
2874
2875/**
2876 *      calc_tx_flits_ofld - calculate # of flits for an offload packet
2877 *      @skb: the packet
2878 *
2879 *      Returns the number of flits needed for the given offload packet.
2880 *      These packets are already fully constructed and no additional headers
2881 *      will be added.
2882 */
2883static inline unsigned int calc_tx_flits_ofld(const struct sk_buff *skb)
2884{
2885        unsigned int flits, cnt;
2886
2887        if (is_ofld_imm(skb))
2888                return DIV_ROUND_UP(skb->len, 8);
2889
2890        flits = skb_transport_offset(skb) / 8U;   /* headers */
2891        cnt = skb_shinfo(skb)->nr_frags;
2892        if (skb_tail_pointer(skb) != skb_transport_header(skb))
2893                cnt++;
2894        return flits + sgl_len(cnt);
2895}
2896
2897/**
2898 *      txq_stop_maperr - stop a Tx queue due to I/O MMU exhaustion
2899 *      @q: the queue to stop
2900 *
2901 *      Mark a Tx queue stopped due to I/O MMU exhaustion and resulting
2902 *      inability to map packets.  A periodic timer attempts to restart
2903 *      queues so marked.
2904 */
2905static void txq_stop_maperr(struct sge_uld_txq *q)
2906{
2907        q->mapping_err++;
2908        q->q.stops++;
2909        set_bit(q->q.cntxt_id - q->adap->sge.egr_start,
2910                q->adap->sge.txq_maperr);
2911}
2912
2913/**
2914 *      ofldtxq_stop - stop an offload Tx queue that has become full
2915 *      @q: the queue to stop
2916 *      @wr: the Work Request causing the queue to become full
2917 *
2918 *      Stops an offload Tx queue that has become full and modifies the packet
2919 *      being written to request a wakeup.
2920 */
2921static void ofldtxq_stop(struct sge_uld_txq *q, struct fw_wr_hdr *wr)
2922{
2923        wr->lo |= htonl(FW_WR_EQUEQ_F | FW_WR_EQUIQ_F);
2924        q->q.stops++;
2925        q->full = 1;
2926}
2927
2928/**
2929 *      service_ofldq - service/restart a suspended offload queue
2930 *      @q: the offload queue
2931 *
2932 *      Services an offload Tx queue by moving packets from its Pending Send
2933 *      Queue to the Hardware TX ring.  The function starts and ends with the
2934 *      Send Queue locked, but drops the lock while putting the skb at the
2935 *      head of the Send Queue onto the Hardware TX Ring.  Dropping the lock
2936 *      allows more skbs to be added to the Send Queue by other threads.
2937 *      The packet being processed at the head of the Pending Send Queue is
2938 *      left on the queue in case we experience DMA Mapping errors, etc.
2939 *      and need to give up and restart later.
2940 *
2941 *      service_ofldq() can be thought of as a task which opportunistically
2942 *      uses other threads execution contexts.  We use the Offload Queue
2943 *      boolean "service_ofldq_running" to make sure that only one instance
2944 *      is ever running at a time ...
2945 */
2946static void service_ofldq(struct sge_uld_txq *q)
2947        __must_hold(&q->sendq.lock)
2948{
2949        u64 *pos, *before, *end;
2950        int credits;
2951        struct sk_buff *skb;
2952        struct sge_txq *txq;
2953        unsigned int left;
2954        unsigned int written = 0;
2955        unsigned int flits, ndesc;
2956
2957        /* If another thread is currently in service_ofldq() processing the
2958         * Pending Send Queue then there's nothing to do. Otherwise, flag
2959         * that we're doing the work and continue.  Examining/modifying
2960         * the Offload Queue boolean "service_ofldq_running" must be done
2961         * while holding the Pending Send Queue Lock.
2962         */
2963        if (q->service_ofldq_running)
2964                return;
2965        q->service_ofldq_running = true;
2966
2967        while ((skb = skb_peek(&q->sendq)) != NULL && !q->full) {
2968                /* We drop the lock while we're working with the skb at the
2969                 * head of the Pending Send Queue.  This allows more skbs to
2970                 * be added to the Pending Send Queue while we're working on
2971                 * this one.  We don't need to lock to guard the TX Ring
2972                 * updates because only one thread of execution is ever
2973                 * allowed into service_ofldq() at a time.
2974                 */
2975                spin_unlock(&q->sendq.lock);
2976
2977                cxgb4_reclaim_completed_tx(q->adap, &q->q, false);
2978
2979                flits = skb->priority;                /* previously saved */
2980                ndesc = flits_to_desc(flits);
2981                credits = txq_avail(&q->q) - ndesc;
2982                BUG_ON(credits < 0);
2983                if (unlikely(credits < TXQ_STOP_THRES))
2984                        ofldtxq_stop(q, (struct fw_wr_hdr *)skb->data);
2985
2986                pos = (u64 *)&q->q.desc[q->q.pidx];
2987                if (is_ofld_imm(skb))
2988                        cxgb4_inline_tx_skb(skb, &q->q, pos);
2989                else if (cxgb4_map_skb(q->adap->pdev_dev, skb,
2990                                       (dma_addr_t *)skb->head)) {
2991                        txq_stop_maperr(q);
2992                        spin_lock(&q->sendq.lock);
2993                        break;
2994                } else {
2995                        int last_desc, hdr_len = skb_transport_offset(skb);
2996
2997                        /* The WR headers  may not fit within one descriptor.
2998                         * So we need to deal with wrap-around here.
2999                         */
3000                        before = (u64 *)pos;
3001                        end = (u64 *)pos + flits;
3002                        txq = &q->q;
3003                        pos = (void *)inline_tx_skb_header(skb, &q->q,
3004                                                           (void *)pos,
3005                                                           hdr_len);
3006                        if (before > (u64 *)pos) {
3007                                left = (u8 *)end - (u8 *)txq->stat;
3008                                end = (void *)txq->desc + left;
3009                        }
3010
3011                        /* If current position is already at the end of the
3012                         * ofld queue, reset the current to point to
3013                         * start of the queue and update the end ptr as well.
3014                         */
3015                        if (pos == (u64 *)txq->stat) {
3016                                left = (u8 *)end - (u8 *)txq->stat;
3017                                end = (void *)txq->desc + left;
3018                                pos = (void *)txq->desc;
3019                        }
3020
3021                        cxgb4_write_sgl(skb, &q->q, (void *)pos,
3022                                        end, hdr_len,
3023                                        (dma_addr_t *)skb->head);
3024#ifdef CONFIG_NEED_DMA_MAP_STATE
3025                        skb->dev = q->adap->port[0];
3026                        skb->destructor = deferred_unmap_destructor;
3027#endif
3028                        last_desc = q->q.pidx + ndesc - 1;
3029                        if (last_desc >= q->q.size)
3030                                last_desc -= q->q.size;
3031                        q->q.sdesc[last_desc].skb = skb;
3032                }
3033
3034                txq_advance(&q->q, ndesc);
3035                written += ndesc;
3036                if (unlikely(written > 32)) {
3037                        cxgb4_ring_tx_db(q->adap, &q->q, written);
3038                        written = 0;
3039                }
3040
3041                /* Reacquire the Pending Send Queue Lock so we can unlink the
3042                 * skb we've just successfully transferred to the TX Ring and
3043                 * loop for the next skb which may be at the head of the
3044                 * Pending Send Queue.
3045                 */
3046                spin_lock(&q->sendq.lock);
3047                __skb_unlink(skb, &q->sendq);
3048                if (is_ofld_imm(skb))
3049                        kfree_skb(skb);
3050        }
3051        if (likely(written))
3052                cxgb4_ring_tx_db(q->adap, &q->q, written);
3053
3054        /*Indicate that no thread is processing the Pending Send Queue
3055         * currently.
3056         */
3057        q->service_ofldq_running = false;
3058}
3059
3060/**
3061 *      ofld_xmit - send a packet through an offload queue
3062 *      @q: the Tx offload queue
3063 *      @skb: the packet
3064 *
3065 *      Send an offload packet through an SGE offload queue.
3066 */
3067static int ofld_xmit(struct sge_uld_txq *q, struct sk_buff *skb)
3068{
3069        skb->priority = calc_tx_flits_ofld(skb);       /* save for restart */
3070        spin_lock(&q->sendq.lock);
3071
3072        /* Queue the new skb onto the Offload Queue's Pending Send Queue.  If
3073         * that results in this new skb being the only one on the queue, start
3074         * servicing it.  If there are other skbs already on the list, then
3075         * either the queue is currently being processed or it's been stopped
3076         * for some reason and it'll be restarted at a later time.  Restart
3077         * paths are triggered by events like experiencing a DMA Mapping Error
3078         * or filling the Hardware TX Ring.
3079         */
3080        __skb_queue_tail(&q->sendq, skb);
3081        if (q->sendq.qlen == 1)
3082                service_ofldq(q);
3083
3084        spin_unlock(&q->sendq.lock);
3085        return NET_XMIT_SUCCESS;
3086}
3087
3088/**
3089 *      restart_ofldq - restart a suspended offload queue
3090 *      @t: pointer to the tasklet associated with this handler
3091 *
3092 *      Resumes transmission on a suspended Tx offload queue.
3093 */
3094static void restart_ofldq(struct tasklet_struct *t)
3095{
3096        struct sge_uld_txq *q = from_tasklet(q, t, qresume_tsk);
3097
3098        spin_lock(&q->sendq.lock);
3099        q->full = 0;            /* the queue actually is completely empty now */
3100        service_ofldq(q);
3101        spin_unlock(&q->sendq.lock);
3102}
3103
3104/**
3105 *      skb_txq - return the Tx queue an offload packet should use
3106 *      @skb: the packet
3107 *
3108 *      Returns the Tx queue an offload packet should use as indicated by bits
3109 *      1-15 in the packet's queue_mapping.
3110 */
3111static inline unsigned int skb_txq(const struct sk_buff *skb)
3112{
3113        return skb->queue_mapping >> 1;
3114}
3115
3116/**
3117 *      is_ctrl_pkt - return whether an offload packet is a control packet
3118 *      @skb: the packet
3119 *
3120 *      Returns whether an offload packet should use an OFLD or a CTRL
3121 *      Tx queue as indicated by bit 0 in the packet's queue_mapping.
3122 */
3123static inline unsigned int is_ctrl_pkt(const struct sk_buff *skb)
3124{
3125        return skb->queue_mapping & 1;
3126}
3127
3128static inline int uld_send(struct adapter *adap, struct sk_buff *skb,
3129                           unsigned int tx_uld_type)
3130{
3131        struct sge_uld_txq_info *txq_info;
3132        struct sge_uld_txq *txq;
3133        unsigned int idx = skb_txq(skb);
3134
3135        if (unlikely(is_ctrl_pkt(skb))) {
3136                /* Single ctrl queue is a requirement for LE workaround path */
3137                if (adap->tids.nsftids)
3138                        idx = 0;
3139                return ctrl_xmit(&adap->sge.ctrlq[idx], skb);
3140        }
3141
3142        txq_info = adap->sge.uld_txq_info[tx_uld_type];
3143        if (unlikely(!txq_info)) {
3144                WARN_ON(true);
3145                kfree_skb(skb);
3146                return NET_XMIT_DROP;
3147        }
3148
3149        txq = &txq_info->uldtxq[idx];
3150        return ofld_xmit(txq, skb);
3151}
3152
3153/**
3154 *      t4_ofld_send - send an offload packet
3155 *      @adap: the adapter
3156 *      @skb: the packet
3157 *
3158 *      Sends an offload packet.  We use the packet queue_mapping to select the
3159 *      appropriate Tx queue as follows: bit 0 indicates whether the packet
3160 *      should be sent as regular or control, bits 1-15 select the queue.
3161 */
3162int t4_ofld_send(struct adapter *adap, struct sk_buff *skb)
3163{
3164        int ret;
3165
3166        local_bh_disable();
3167        ret = uld_send(adap, skb, CXGB4_TX_OFLD);
3168        local_bh_enable();
3169        return ret;
3170}
3171
/**
 *      cxgb4_ofld_send - send an offload packet
 *      @dev: the net device
 *      @skb: the packet
 *
 *      Exported wrapper around @t4_ofld_send for use by ULDs; it resolves
 *      the adapter from @dev and forwards the packet.
 */
int cxgb4_ofld_send(struct net_device *dev, struct sk_buff *skb)
{
        struct adapter *adap = netdev2adap(dev);

        return t4_ofld_send(adap, skb);
}
EXPORT_SYMBOL(cxgb4_ofld_send);
3185
3186static void *inline_tx_header(const void *src,
3187                              const struct sge_txq *q,
3188                              void *pos, int length)
3189{
3190        int left = (void *)q->stat - pos;
3191        u64 *p;
3192
3193        if (likely(length <= left)) {
3194                memcpy(pos, src, length);
3195                pos += length;
3196        } else {
3197                memcpy(pos, src, left);
3198                memcpy(q->desc, src + left, length - left);
3199                pos = (void *)q->desc + (length - left);
3200        }
3201        /* 0-pad to multiple of 16 */
3202        p = PTR_ALIGN(pos, 8);
3203        if ((uintptr_t)p & 8) {
3204                *p = 0;
3205                return p + 1;
3206        }
3207        return p;
3208}
3209
3210/**
3211 *      ofld_xmit_direct - copy a WR into offload queue
3212 *      @q: the Tx offload queue
3213 *      @src: location of WR
3214 *      @len: WR length
3215 *
3216 *      Copy an immediate WR into an uncontended SGE offload queue.
3217 */
3218static int ofld_xmit_direct(struct sge_uld_txq *q, const void *src,
3219                            unsigned int len)
3220{
3221        unsigned int ndesc;
3222        int credits;
3223        u64 *pos;
3224
3225        /* Use the lower limit as the cut-off */
3226        if (len > MAX_IMM_OFLD_TX_DATA_WR_LEN) {
3227                WARN_ON(1);
3228                return NET_XMIT_DROP;
3229        }
3230
3231        /* Don't return NET_XMIT_CN here as the current
3232         * implementation doesn't queue the request
3233         * using an skb when the following conditions not met
3234         */
3235        if (!spin_trylock(&q->sendq.lock))
3236                return NET_XMIT_DROP;
3237
3238        if (q->full || !skb_queue_empty(&q->sendq) ||
3239            q->service_ofldq_running) {
3240                spin_unlock(&q->sendq.lock);
3241                return NET_XMIT_DROP;
3242        }
3243        ndesc = flits_to_desc(DIV_ROUND_UP(len, 8));
3244        credits = txq_avail(&q->q) - ndesc;
3245        pos = (u64 *)&q->q.desc[q->q.pidx];
3246
3247        /* ofldtxq_stop modifies WR header in-situ */
3248        inline_tx_header(src, &q->q, pos, len);
3249        if (unlikely(credits < TXQ_STOP_THRES))
3250                ofldtxq_stop(q, (struct fw_wr_hdr *)pos);
3251        txq_advance(&q->q, ndesc);
3252        cxgb4_ring_tx_db(q->adap, &q->q, ndesc);
3253
3254        spin_unlock(&q->sendq.lock);
3255        return NET_XMIT_SUCCESS;
3256}
3257
3258int cxgb4_immdata_send(struct net_device *dev, unsigned int idx,
3259                       const void *src, unsigned int len)
3260{
3261        struct sge_uld_txq_info *txq_info;
3262        struct sge_uld_txq *txq;
3263        struct adapter *adap;
3264        int ret;
3265
3266        adap = netdev2adap(dev);
3267
3268        local_bh_disable();
3269        txq_info = adap->sge.uld_txq_info[CXGB4_TX_OFLD];
3270        if (unlikely(!txq_info)) {
3271                WARN_ON(true);
3272                local_bh_enable();
3273                return NET_XMIT_DROP;
3274        }
3275        txq = &txq_info->uldtxq[idx];
3276
3277        ret = ofld_xmit_direct(txq, src, len);
3278        local_bh_enable();
3279        return net_xmit_eval(ret);
3280}
3281EXPORT_SYMBOL(cxgb4_immdata_send);
3282
3283/**
3284 *      t4_crypto_send - send crypto packet
3285 *      @adap: the adapter
3286 *      @skb: the packet
3287 *
3288 *      Sends crypto packet.  We use the packet queue_mapping to select the
3289 *      appropriate Tx queue as follows: bit 0 indicates whether the packet
3290 *      should be sent as regular or control, bits 1-15 select the queue.
3291 */
3292static int t4_crypto_send(struct adapter *adap, struct sk_buff *skb)
3293{
3294        int ret;
3295
3296        local_bh_disable();
3297        ret = uld_send(adap, skb, CXGB4_TX_CRYPTO);
3298        local_bh_enable();
3299        return ret;
3300}
3301
/**
 *      cxgb4_crypto_send - send crypto packet
 *      @dev: the net device
 *      @skb: the packet
 *
 *      Exported wrapper around @t4_crypto_send for use by ULDs; it resolves
 *      the adapter from @dev and forwards the packet.
 */
int cxgb4_crypto_send(struct net_device *dev, struct sk_buff *skb)
{
        struct adapter *adap = netdev2adap(dev);

        return t4_crypto_send(adap, skb);
}
EXPORT_SYMBOL(cxgb4_crypto_send);
3315
3316static inline void copy_frags(struct sk_buff *skb,
3317                              const struct pkt_gl *gl, unsigned int offset)
3318{
3319        int i;
3320
3321        /* usually there's just one frag */
3322        __skb_fill_page_desc(skb, 0, gl->frags[0].page,
3323                             gl->frags[0].offset + offset,
3324                             gl->frags[0].size - offset);
3325        skb_shinfo(skb)->nr_frags = gl->nfrags;
3326        for (i = 1; i < gl->nfrags; i++)
3327                __skb_fill_page_desc(skb, i, gl->frags[i].page,
3328                                     gl->frags[i].offset,
3329                                     gl->frags[i].size);
3330
3331        /* get a reference to the last page, we don't own it */
3332        get_page(gl->frags[gl->nfrags - 1].page);
3333}
3334
3335/**
3336 *      cxgb4_pktgl_to_skb - build an sk_buff from a packet gather list
3337 *      @gl: the gather list
3338 *      @skb_len: size of sk_buff main body if it carries fragments
3339 *      @pull_len: amount of data to move to the sk_buff's main body
3340 *
3341 *      Builds an sk_buff from the given packet gather list.  Returns the
3342 *      sk_buff or %NULL if sk_buff allocation failed.
3343 */
3344struct sk_buff *cxgb4_pktgl_to_skb(const struct pkt_gl *gl,
3345                                   unsigned int skb_len, unsigned int pull_len)
3346{
3347        struct sk_buff *skb;
3348
3349        /*
3350         * Below we rely on RX_COPY_THRES being less than the smallest Rx buffer
3351         * size, which is expected since buffers are at least PAGE_SIZEd.
3352         * In this case packets up to RX_COPY_THRES have only one fragment.
3353         */
3354        if (gl->tot_len <= RX_COPY_THRES) {
3355                skb = dev_alloc_skb(gl->tot_len);
3356                if (unlikely(!skb))
3357                        goto out;
3358                __skb_put(skb, gl->tot_len);
3359                skb_copy_to_linear_data(skb, gl->va, gl->tot_len);
3360        } else {
3361                skb = dev_alloc_skb(skb_len);
3362                if (unlikely(!skb))
3363                        goto out;
3364                __skb_put(skb, pull_len);
3365                skb_copy_to_linear_data(skb, gl->va, pull_len);
3366
3367                copy_frags(skb, gl, pull_len);
3368                skb->len = gl->tot_len;
3369                skb->data_len = skb->len - pull_len;
3370                skb->truesize += skb->data_len;
3371        }
3372out:    return skb;
3373}
3374EXPORT_SYMBOL(cxgb4_pktgl_to_skb);
3375
3376/**
3377 *      t4_pktgl_free - free a packet gather list
3378 *      @gl: the gather list
3379 *
3380 *      Releases the pages of a packet gather list.  We do not own the last
3381 *      page on the list and do not free it.
3382 */
3383static void t4_pktgl_free(const struct pkt_gl *gl)
3384{
3385        int n;
3386        const struct page_frag *p;
3387
3388        for (p = gl->frags, n = gl->nfrags - 1; n--; p++)
3389                put_page(p->page);
3390}
3391
3392/*
3393 * Process an MPS trace packet.  Give it an unused protocol number so it won't
3394 * be delivered to anyone and send it to the stack for capture.
3395 */
3396static noinline int handle_trace_pkt(struct adapter *adap,
3397                                     const struct pkt_gl *gl)
3398{
3399        struct sk_buff *skb;
3400
3401        skb = cxgb4_pktgl_to_skb(gl, RX_PULL_LEN, RX_PULL_LEN);
3402        if (unlikely(!skb)) {
3403                t4_pktgl_free(gl);
3404                return 0;
3405        }
3406
3407        if (is_t4(adap->params.chip))
3408                __skb_pull(skb, sizeof(struct cpl_trace_pkt));
3409        else
3410                __skb_pull(skb, sizeof(struct cpl_t5_trace_pkt));
3411
3412        skb_reset_mac_header(skb);
3413        skb->protocol = htons(0xffff);
3414        skb->dev = adap->port[0];
3415        netif_receive_skb(skb);
3416        return 0;
3417}
3418
3419/**
3420 * cxgb4_sgetim_to_hwtstamp - convert sge time stamp to hw time stamp
3421 * @adap: the adapter
3422 * @hwtstamps: time stamp structure to update
3423 * @sgetstamp: 60bit iqe timestamp
3424 *
3425 * Every ingress queue entry has the 60-bit timestamp, convert that timestamp
3426 * which is in Core Clock ticks into ktime_t and assign it
3427 **/
3428static void cxgb4_sgetim_to_hwtstamp(struct adapter *adap,
3429                                     struct skb_shared_hwtstamps *hwtstamps,
3430                                     u64 sgetstamp)
3431{
3432        u64 ns;
3433        u64 tmp = (sgetstamp * 1000 * 1000 + adap->params.vpd.cclk / 2);
3434
3435        ns = div_u64(tmp, adap->params.vpd.cclk);
3436
3437        memset(hwtstamps, 0, sizeof(*hwtstamps));
3438        hwtstamps->hwtstamp = ns_to_ktime(ns);
3439}
3440
3441static void do_gro(struct sge_eth_rxq *rxq, const struct pkt_gl *gl,
3442                   const struct cpl_rx_pkt *pkt, unsigned long tnl_hdr_len)
3443{
3444        struct adapter *adapter = rxq->rspq.adap;
3445        struct sge *s = &adapter->sge;
3446        struct port_info *pi;
3447        int ret;
3448        struct sk_buff *skb;
3449
3450        skb = napi_get_frags(&rxq->rspq.napi);
3451        if (unlikely(!skb)) {
3452                t4_pktgl_free(gl);
3453                rxq->stats.rx_drops++;
3454                return;
3455        }
3456
3457        copy_frags(skb, gl, s->pktshift);
3458        if (tnl_hdr_len)
3459                skb->csum_level = 1;
3460        skb->len = gl->tot_len - s->pktshift;
3461        skb->data_len = skb->len;
3462        skb->truesize += skb->data_len;
3463        skb->ip_summed = CHECKSUM_UNNECESSARY;
3464        skb_record_rx_queue(skb, rxq->rspq.idx);
3465        pi = netdev_priv(skb->dev);
3466        if (pi->rxtstamp)
3467                cxgb4_sgetim_to_hwtstamp(adapter, skb_hwtstamps(skb),
3468                                         gl->sgetstamp);
3469        if (rxq->rspq.netdev->features & NETIF_F_RXHASH)
3470                skb_set_hash(skb, (__force u32)pkt->rsshdr.hash_val,
3471                             PKT_HASH_TYPE_L3);
3472
3473        if (unlikely(pkt->vlan_ex)) {
3474                __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(pkt->vlan));
3475                rxq->stats.vlan_ex++;
3476        }
3477        ret = napi_gro_frags(&rxq->rspq.napi);
3478        if (ret == GRO_HELD)
3479                rxq->stats.lro_pkts++;
3480        else if (ret == GRO_MERGED || ret == GRO_MERGED_FREE)
3481                rxq->stats.lro_merged++;
3482        rxq->stats.pkts++;
3483        rxq->stats.rx_cso++;
3484}
3485
/* Classification of a received packet by the PTP Rx timestamp helpers */
enum {
        RX_NON_PTP_PKT = 0,     /* not handled as a PTP MPS packet */
        RX_PTP_PKT_SUC,         /* PTP packet, hardware timestamp stored */
        RX_PTP_PKT_ERR          /* PTP handling failed */
};
3491
3492/**
3493 *     t4_systim_to_hwstamp - read hardware time stamp
3494 *     @adapter: the adapter
3495 *     @skb: the packet
3496 *
3497 *     Read Time Stamp from MPS packet and insert in skb which
3498 *     is forwarded to PTP application
3499 */
3500static noinline int t4_systim_to_hwstamp(struct adapter *adapter,
3501                                         struct sk_buff *skb)
3502{
3503        struct skb_shared_hwtstamps *hwtstamps;
3504        struct cpl_rx_mps_pkt *cpl = NULL;
3505        unsigned char *data;
3506        int offset;
3507
3508        cpl = (struct cpl_rx_mps_pkt *)skb->data;
3509        if (!(CPL_RX_MPS_PKT_TYPE_G(ntohl(cpl->op_to_r1_hi)) &
3510             X_CPL_RX_MPS_PKT_TYPE_PTP))
3511                return RX_PTP_PKT_ERR;
3512
3513        data = skb->data + sizeof(*cpl);
3514        skb_pull(skb, 2 * sizeof(u64) + sizeof(struct cpl_rx_mps_pkt));
3515        offset = ETH_HLEN + IPV4_HLEN(skb->data) + UDP_HLEN;
3516        if (skb->len < offset + OFF_PTP_SEQUENCE_ID + sizeof(short))
3517                return RX_PTP_PKT_ERR;
3518
3519        hwtstamps = skb_hwtstamps(skb);
3520        memset(hwtstamps, 0, sizeof(*hwtstamps));
3521        hwtstamps->hwtstamp = ns_to_ktime(get_unaligned_be64(data));
3522
3523        return RX_PTP_PKT_SUC;
3524}
3525
3526/**
3527 *     t4_rx_hststamp - Recv PTP Event Message
3528 *     @adapter: the adapter
3529 *     @rsp: the response queue descriptor holding the RX_PKT message
3530 *     @rxq: the response queue holding the RX_PKT message
3531 *     @skb: the packet
3532 *
3533 *     PTP enabled and MPS packet, read HW timestamp
3534 */
3535static int t4_rx_hststamp(struct adapter *adapter, const __be64 *rsp,
3536                          struct sge_eth_rxq *rxq, struct sk_buff *skb)
3537{
3538        int ret;
3539
3540        if (unlikely((*(u8 *)rsp == CPL_RX_MPS_PKT) &&
3541                     !is_t4(adapter->params.chip))) {
3542                ret = t4_systim_to_hwstamp(adapter, skb);
3543                if (ret == RX_PTP_PKT_ERR) {
3544                        kfree_skb(skb);
3545                        rxq->stats.rx_drops++;
3546                }
3547                return ret;
3548        }
3549        return RX_NON_PTP_PKT;
3550}
3551
3552/**
3553 *      t4_tx_hststamp - Loopback PTP Transmit Event Message
3554 *      @adapter: the adapter
3555 *      @skb: the packet
3556 *      @dev: the ingress net device
3557 *
3558 *      Read hardware timestamp for the loopback PTP Tx event message
3559 */
3560static int t4_tx_hststamp(struct adapter *adapter, struct sk_buff *skb,
3561                          struct net_device *dev)
3562{
3563        struct port_info *pi = netdev_priv(dev);
3564
3565        if (!is_t4(adapter->params.chip) && adapter->ptp_tx_skb) {
3566                cxgb4_ptp_read_hwstamp(adapter, pi);
3567                kfree_skb(skb);
3568                return 0;
3569        }
3570        return 1;
3571}
3572
3573/**
3574 *      t4_tx_completion_handler - handle CPL_SGE_EGR_UPDATE messages
3575 *      @rspq: Ethernet RX Response Queue associated with Ethernet TX Queue
3576 *      @rsp: Response Entry pointer into Response Queue
3577 *      @gl: Gather List pointer
3578 *
3579 *      For adapters which support the SGE Doorbell Queue Timer facility,
3580 *      we configure the Ethernet TX Queues to send CIDX Updates to the
3581 *      Associated Ethernet RX Response Queue with CPL_SGE_EGR_UPDATE
3582 *      messages.  This adds a small load to PCIe Link RX bandwidth and,
3583 *      potentially, higher CPU Interrupt load, but allows us to respond
3584 *      much more quickly to the CIDX Updates.  This is important for
3585 *      Upper Layer Software which isn't willing to have a large amount
3586 *      of TX Data outstanding before receiving DMA Completions.
3587 */
3588static void t4_tx_completion_handler(struct sge_rspq *rspq,
3589                                     const __be64 *rsp,
3590                                     const struct pkt_gl *gl)
3591{
3592        u8 opcode = ((const struct rss_header *)rsp)->opcode;
3593        struct port_info *pi = netdev_priv(rspq->netdev);
3594        struct adapter *adapter = rspq->adap;
3595        struct sge *s = &adapter->sge;
3596        struct sge_eth_txq *txq;
3597
3598        /* skip RSS header */
3599        rsp++;
3600
3601        /* FW can send EGR_UPDATEs encapsulated in a CPL_FW4_MSG.
3602         */
3603        if (unlikely(opcode == CPL_FW4_MSG &&
3604                     ((const struct cpl_fw4_msg *)rsp)->type ==
3605                                                        FW_TYPE_RSSCPL)) {
3606                rsp++;
3607                opcode = ((const struct rss_header *)rsp)->opcode;
3608                rsp++;
3609        }
3610
3611        if (unlikely(opcode != CPL_SGE_EGR_UPDATE)) {
3612                pr_info("%s: unexpected FW4/CPL %#x on Rx queue\n",
3613                        __func__, opcode);
3614                return;
3615        }
3616
3617        txq = &s->ethtxq[pi->first_qset + rspq->idx];
3618
3619        /* We've got the Hardware Consumer Index Update in the Egress Update
3620         * message. These Egress Update messages will be our sole CIDX Updates
3621         * we get since we don't want to chew up PCIe bandwidth for both Ingress
3622         * Messages and Status Page writes.  However, The code which manages
3623         * reclaiming successfully DMA'ed TX Work Requests uses the CIDX value
3624         * stored in the Status Page at the end of the TX Queue.  It's easiest
3625         * to simply copy the CIDX Update value from the Egress Update message
3626         * to the Status Page.  Also note that no Endian issues need to be
3627         * considered here since both are Big Endian and we're just copying
3628         * bytes consistently ...
3629         */
3630        if (CHELSIO_CHIP_VERSION(adapter->params.chip) <= CHELSIO_T5) {
3631                struct cpl_sge_egr_update *egr;
3632
3633                egr = (struct cpl_sge_egr_update *)rsp;
3634                WRITE_ONCE(txq->q.stat->cidx, egr->cidx);
3635        }
3636
3637        t4_sge_eth_txq_egress_update(adapter, txq, -1);
3638}
3639
3640static int cxgb4_validate_lb_pkt(struct port_info *pi, const struct pkt_gl *si)
3641{
3642        struct adapter *adap = pi->adapter;
3643        struct cxgb4_ethtool_lb_test *lb;
3644        struct sge *s = &adap->sge;
3645        struct net_device *netdev;
3646        u8 *data;
3647        int i;
3648
3649        netdev = adap->port[pi->port_id];
3650        lb = &pi->ethtool_lb;
3651        data = si->va + s->pktshift;
3652
3653        i = ETH_ALEN;
3654        if (!ether_addr_equal(data + i, netdev->dev_addr))
3655                return -1;
3656
3657        i += ETH_ALEN;
3658        if (strcmp(&data[i], CXGB4_SELFTEST_LB_STR))
3659                lb->result = -EIO;
3660
3661        complete(&lb->completion);
3662        return 0;
3663}
3664
3665/**
3666 *      t4_ethrx_handler - process an ingress ethernet packet
3667 *      @q: the response queue that received the packet
3668 *      @rsp: the response queue descriptor holding the RX_PKT message
3669 *      @si: the gather list of packet fragments
3670 *
3671 *      Process an ingress ethernet packet and deliver it to the stack.
3672 */
3673int t4_ethrx_handler(struct sge_rspq *q, const __be64 *rsp,
3674                     const struct pkt_gl *si)
3675{
3676        bool csum_ok;
3677        struct sk_buff *skb;
3678        const struct cpl_rx_pkt *pkt;
3679        struct sge_eth_rxq *rxq = container_of(q, struct sge_eth_rxq, rspq);
3680        struct adapter *adapter = q->adap;
3681        struct sge *s = &q->adap->sge;
3682        int cpl_trace_pkt = is_t4(q->adap->params.chip) ?
3683                            CPL_TRACE_PKT : CPL_TRACE_PKT_T5;
3684        u16 err_vec, tnl_hdr_len = 0;
3685        struct port_info *pi;
3686        int ret = 0;
3687
3688        pi = netdev_priv(q->netdev);
3689        /* If we're looking at TX Queue CIDX Update, handle that separately
3690         * and return.
3691         */
3692        if (unlikely((*(u8 *)rsp == CPL_FW4_MSG) ||
3693                     (*(u8 *)rsp == CPL_SGE_EGR_UPDATE))) {
3694                t4_tx_completion_handler(q, rsp, si);
3695                return 0;
3696        }
3697
3698        if (unlikely(*(u8 *)rsp == cpl_trace_pkt))
3699                return handle_trace_pkt(q->adap, si);
3700
3701        pkt = (const struct cpl_rx_pkt *)rsp;
3702        /* Compressed error vector is enabled for T6 only */
3703        if (q->adap->params.tp.rx_pkt_encap) {
3704                err_vec = T6_COMPR_RXERR_VEC_G(be16_to_cpu(pkt->err_vec));
3705                tnl_hdr_len = T6_RX_TNLHDR_LEN_G(ntohs(pkt->err_vec));
3706        } else {
3707                err_vec = be16_to_cpu(pkt->err_vec);
3708        }
3709
3710        csum_ok = pkt->csum_calc && !err_vec &&
3711                  (q->netdev->features & NETIF_F_RXCSUM);
3712
3713        if (err_vec)
3714                rxq->stats.bad_rx_pkts++;
3715
3716        if (unlikely(pi->ethtool_lb.loopback && pkt->iff >= NCHAN)) {
3717                ret = cxgb4_validate_lb_pkt(pi, si);
3718                if (!ret)
3719                        return 0;
3720        }
3721
3722        if (((pkt->l2info & htonl(RXF_TCP_F)) ||
3723             tnl_hdr_len) &&
3724            (q->netdev->features & NETIF_F_GRO) && csum_ok && !pkt->ip_frag) {
3725                do_gro(rxq, si, pkt, tnl_hdr_len);
3726                return 0;
3727        }
3728
3729        skb = cxgb4_pktgl_to_skb(si, RX_PKT_SKB_LEN, RX_PULL_LEN);
3730        if (unlikely(!skb)) {
3731                t4_pktgl_free(si);
3732                rxq->stats.rx_drops++;
3733                return 0;
3734        }
3735
3736        /* Handle PTP Event Rx packet */
3737        if (unlikely(pi->ptp_enable)) {
3738                ret = t4_rx_hststamp(adapter, rsp, rxq, skb);
3739                if (ret == RX_PTP_PKT_ERR)
3740                        return 0;
3741        }
3742        if (likely(!ret))
3743                __skb_pull(skb, s->pktshift); /* remove ethernet header pad */
3744
3745        /* Handle the PTP Event Tx Loopback packet */
3746        if (unlikely(pi->ptp_enable && !ret &&
3747                     (pkt->l2info & htonl(RXF_UDP_F)) &&
3748                     cxgb4_ptp_is_ptp_rx(skb))) {
3749                if (!t4_tx_hststamp(adapter, skb, q->netdev))
3750                        return 0;
3751        }
3752
3753        skb->protocol = eth_type_trans(skb, q->netdev);
3754        skb_record_rx_queue(skb, q->idx);
3755        if (skb->dev->features & NETIF_F_RXHASH)
3756                skb_set_hash(skb, (__force u32)pkt->rsshdr.hash_val,
3757                             PKT_HASH_TYPE_L3);
3758
3759        rxq->stats.pkts++;
3760
3761        if (pi->rxtstamp)
3762                cxgb4_sgetim_to_hwtstamp(q->adap, skb_hwtstamps(skb),
3763                                         si->sgetstamp);
3764        if (csum_ok && (pkt->l2info & htonl(RXF_UDP_F | RXF_TCP_F))) {
3765                if (!pkt->ip_frag) {
3766                        skb->ip_summed = CHECKSUM_UNNECESSARY;
3767                        rxq->stats.rx_cso++;
3768                } else if (pkt->l2info & htonl(RXF_IP_F)) {
3769                        __sum16 c = (__force __sum16)pkt->csum;
3770                        skb->csum = csum_unfold(c);
3771
3772                        if (tnl_hdr_len) {
3773                                skb->ip_summed = CHECKSUM_UNNECESSARY;
3774                                skb->csum_level = 1;
3775                        } else {
3776                                skb->ip_summed = CHECKSUM_COMPLETE;
3777                        }
3778                        rxq->stats.rx_cso++;
3779                }
3780        } else {
3781                skb_checksum_none_assert(skb);
3782#ifdef CONFIG_CHELSIO_T4_FCOE
3783#define CPL_RX_PKT_FLAGS (RXF_PSH_F | RXF_SYN_F | RXF_UDP_F | \
3784                          RXF_TCP_F | RXF_IP_F | RXF_IP6_F | RXF_LRO_F)
3785
3786                if (!(pkt->l2info & cpu_to_be32(CPL_RX_PKT_FLAGS))) {
3787                        if ((pkt->l2info & cpu_to_be32(RXF_FCOE_F)) &&
3788                            (pi->fcoe.flags & CXGB_FCOE_ENABLED)) {
3789                                if (q->adap->params.tp.rx_pkt_encap)
3790                                        csum_ok = err_vec &
3791                                                  T6_COMPR_RXERR_SUM_F;
3792                                else
3793                                        csum_ok = err_vec & RXERR_CSUM_F;
3794                                if (!csum_ok)
3795                                        skb->ip_summed = CHECKSUM_UNNECESSARY;
3796                        }
3797                }
3798
3799#undef CPL_RX_PKT_FLAGS
3800#endif /* CONFIG_CHELSIO_T4_FCOE */
3801        }
3802
3803        if (unlikely(pkt->vlan_ex)) {
3804                __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(pkt->vlan));
3805                rxq->stats.vlan_ex++;
3806        }
3807        skb_mark_napi_id(skb, &q->napi);
3808        netif_receive_skb(skb);
3809        return 0;
3810}
3811
3812/**
3813 *      restore_rx_bufs - put back a packet's Rx buffers
3814 *      @si: the packet gather list
3815 *      @q: the SGE free list
3816 *      @frags: number of FL buffers to restore
3817 *
3818 *      Puts back on an FL the Rx buffers associated with @si.  The buffers
3819 *      have already been unmapped and are left unmapped, we mark them so to
3820 *      prevent further unmapping attempts.
3821 *
3822 *      This function undoes a series of @unmap_rx_buf calls when we find out
3823 *      that the current packet can't be processed right away afterall and we
3824 *      need to come back to it later.  This is a very rare event and there's
3825 *      no effort to make this particularly efficient.
3826 */
3827static void restore_rx_bufs(const struct pkt_gl *si, struct sge_fl *q,
3828                            int frags)
3829{
3830        struct rx_sw_desc *d;
3831
3832        while (frags--) {
3833                if (q->cidx == 0)
3834                        q->cidx = q->size - 1;
3835                else
3836                        q->cidx--;
3837                d = &q->sdesc[q->cidx];
3838                d->page = si->frags[frags].page;
3839                d->dma_addr |= RX_UNMAPPED_BUF;
3840                q->avail++;
3841        }
3842}
3843
3844/**
3845 *      is_new_response - check if a response is newly written
3846 *      @r: the response descriptor
3847 *      @q: the response queue
3848 *
3849 *      Returns true if a response descriptor contains a yet unprocessed
3850 *      response.
3851 */
3852static inline bool is_new_response(const struct rsp_ctrl *r,
3853                                   const struct sge_rspq *q)
3854{
3855        return (r->type_gen >> RSPD_GEN_S) == q->gen;
3856}
3857
3858/**
3859 *      rspq_next - advance to the next entry in a response queue
3860 *      @q: the queue
3861 *
3862 *      Updates the state of a response queue to advance it to the next entry.
3863 */
3864static inline void rspq_next(struct sge_rspq *q)
3865{
3866        q->cur_desc = (void *)q->cur_desc + q->iqe_len;
3867        if (unlikely(++q->cidx == q->size)) {
3868                q->cidx = 0;
3869                q->gen ^= 1;
3870                q->cur_desc = q->desc;
3871        }
3872}
3873
3874/**
3875 *      process_responses - process responses from an SGE response queue
3876 *      @q: the ingress queue to process
3877 *      @budget: how many responses can be processed in this round
3878 *
3879 *      Process responses from an SGE response queue up to the supplied budget.
3880 *      Responses include received packets as well as control messages from FW
3881 *      or HW.
3882 *
3883 *      Additionally choose the interrupt holdoff time for the next interrupt
3884 *      on this queue.  If the system is under memory shortage use a fairly
3885 *      long delay to help recovery.
3886 */
3887static int process_responses(struct sge_rspq *q, int budget)
3888{
3889        int ret, rsp_type;
3890        int budget_left = budget;
3891        const struct rsp_ctrl *rc;
3892        struct sge_eth_rxq *rxq = container_of(q, struct sge_eth_rxq, rspq);
3893        struct adapter *adapter = q->adap;
3894        struct sge *s = &adapter->sge;
3895
3896        while (likely(budget_left)) {
3897                rc = (void *)q->cur_desc + (q->iqe_len - sizeof(*rc));
3898                if (!is_new_response(rc, q)) {
3899                        if (q->flush_handler)
3900                                q->flush_handler(q);
3901                        break;
3902                }
3903
3904                dma_rmb();
3905                rsp_type = RSPD_TYPE_G(rc->type_gen);
3906                if (likely(rsp_type == RSPD_TYPE_FLBUF_X)) {
3907                        struct page_frag *fp;
3908                        struct pkt_gl si;
3909                        const struct rx_sw_desc *rsd;
3910                        u32 len = ntohl(rc->pldbuflen_qid), bufsz, frags;
3911
3912                        if (len & RSPD_NEWBUF_F) {
3913                                if (likely(q->offset > 0)) {
3914                                        free_rx_bufs(q->adap, &rxq->fl, 1);
3915                                        q->offset = 0;
3916                                }
3917                                len = RSPD_LEN_G(len);
3918                        }
3919                        si.tot_len = len;
3920
3921                        /* gather packet fragments */
3922                        for (frags = 0, fp = si.frags; ; frags++, fp++) {
3923                                rsd = &rxq->fl.sdesc[rxq->fl.cidx];
3924                                bufsz = get_buf_size(adapter, rsd);
3925                                fp->page = rsd->page;
3926                                fp->offset = q->offset;
3927                                fp->size = min(bufsz, len);
3928                                len -= fp->size;
3929                                if (!len)
3930                                        break;
3931                                unmap_rx_buf(q->adap, &rxq->fl);
3932                        }
3933
3934                        si.sgetstamp = SGE_TIMESTAMP_G(
3935                                        be64_to_cpu(rc->last_flit));
3936                        /*
3937                         * Last buffer remains mapped so explicitly make it
3938                         * coherent for CPU access.
3939                         */
3940                        dma_sync_single_for_cpu(q->adap->pdev_dev,
3941                                                get_buf_addr(rsd),
3942                                                fp->size, DMA_FROM_DEVICE);
3943
3944                        si.va = page_address(si.frags[0].page) +
3945                                si.frags[0].offset;
3946                        prefetch(si.va);
3947
3948                        si.nfrags = frags + 1;
3949                        ret = q->handler(q, q->cur_desc, &si);
3950                        if (likely(ret == 0))
3951                                q->offset += ALIGN(fp->size, s->fl_align);
3952                        else
3953                                restore_rx_bufs(&si, &rxq->fl, frags);
3954                } else if (likely(rsp_type == RSPD_TYPE_CPL_X)) {
3955                        ret = q->handler(q, q->cur_desc, NULL);
3956                } else {
3957                        ret = q->handler(q, (const __be64 *)rc, CXGB4_MSG_AN);
3958                }
3959
3960                if (unlikely(ret)) {
3961                        /* couldn't process descriptor, back off for recovery */
3962                        q->next_intr_params = QINTR_TIMER_IDX_V(NOMEM_TMR_IDX);
3963                        break;
3964                }
3965
3966                rspq_next(q);
3967                budget_left--;
3968        }
3969
3970        if (q->offset >= 0 && fl_cap(&rxq->fl) - rxq->fl.avail >= 16)
3971                __refill_fl(q->adap, &rxq->fl);
3972        return budget - budget_left;
3973}
3974
3975/**
3976 *      napi_rx_handler - the NAPI handler for Rx processing
3977 *      @napi: the napi instance
3978 *      @budget: how many packets we can process in this round
3979 *
3980 *      Handler for new data events when using NAPI.  This does not need any
3981 *      locking or protection from interrupts as data interrupts are off at
3982 *      this point and other adapter interrupts do not interfere (the latter
3983 *      is not a concern at all with MSI-X as non-data interrupts then have
3984 *      a separate handler).
3985 */
3986static int napi_rx_handler(struct napi_struct *napi, int budget)
3987{
3988        unsigned int params;
3989        struct sge_rspq *q = container_of(napi, struct sge_rspq, napi);
3990        int work_done;
3991        u32 val;
3992
3993        work_done = process_responses(q, budget);
3994        if (likely(work_done < budget)) {
3995                int timer_index;
3996
3997                napi_complete_done(napi, work_done);
3998                timer_index = QINTR_TIMER_IDX_G(q->next_intr_params);
3999
4000                if (q->adaptive_rx) {
4001                        if (work_done > max(timer_pkt_quota[timer_index],
4002                                            MIN_NAPI_WORK))
4003                                timer_index = (timer_index + 1);
4004                        else
4005                                timer_index = timer_index - 1;
4006
4007                        timer_index = clamp(timer_index, 0, SGE_TIMERREGS - 1);
4008                        q->next_intr_params =
4009                                        QINTR_TIMER_IDX_V(timer_index) |
4010                                        QINTR_CNT_EN_V(0);
4011                        params = q->next_intr_params;
4012                } else {
4013                        params = q->next_intr_params;
4014                        q->next_intr_params = q->intr_params;
4015                }
4016        } else
4017                params = QINTR_TIMER_IDX_V(7);
4018
4019        val = CIDXINC_V(work_done) | SEINTARM_V(params);