linux/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c
   1// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
   2/* Copyright (c) 2019 Mellanox Technologies. */
   3
   4#include <linux/smp.h>
   5#include "dr_types.h"
   6
   7#define QUEUE_SIZE 128
   8#define SIGNAL_PER_DIV_QUEUE 16
   9#define TH_NUMS_TO_DRAIN 2
  10
  11enum { CQ_OK = 0, CQ_EMPTY = -1, CQ_POLL_ERR = -2 };
  12
  13struct dr_data_seg {
  14        u64 addr;
  15        u32 length;
  16        u32 lkey;
  17        unsigned int send_flags;
  18};
  19
  20struct postsend_info {
  21        struct dr_data_seg write;
  22        struct dr_data_seg read;
  23        u64 remote_addr;
  24        u32 rkey;
  25};
  26
  27struct dr_qp_rtr_attr {
  28        struct mlx5dr_cmd_gid_attr dgid_attr;
  29        enum ib_mtu mtu;
  30        u32 qp_num;
  31        u16 port_num;
  32        u8 min_rnr_timer;
  33        u8 sgid_index;
  34        u16 udp_src_port;
  35        u8 fl:1;
  36};
  37
  38struct dr_qp_rts_attr {
  39        u8 timeout;
  40        u8 retry_cnt;
  41        u8 rnr_retry;
  42};
  43
  44struct dr_qp_init_attr {
  45        u32 cqn;
  46        u32 pdn;
  47        u32 max_send_wr;
  48        struct mlx5_uars_page *uar;
  49        u8 isolate_vl_tc:1;
  50};
  51
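     /* Consume one CQE. On a regular completion, or on a requester error,
      * advance the SQ consumer counter past the WQE that generated the CQE
      * (via the wqe_head mapping); a responder error just bumps the counter.
      * Only a regular completion returns CQ_OK.
      */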
  52static int dr_parse_cqe(struct mlx5dr_cq *dr_cq, struct mlx5_cqe64 *cqe64)
  53{
  54        unsigned int idx;
  55        u8 opcode;
  56
  57        opcode = get_cqe_opcode(cqe64);
  58        if (opcode == MLX5_CQE_REQ_ERR) {
  59                idx = be16_to_cpu(cqe64->wqe_counter) &
  60                        (dr_cq->qp->sq.wqe_cnt - 1);
  61                dr_cq->qp->sq.cc = dr_cq->qp->sq.wqe_head[idx] + 1;
  62        } else if (opcode == MLX5_CQE_RESP_ERR) {
  63                ++dr_cq->qp->sq.cc;
  64        } else {
  65                idx = be16_to_cpu(cqe64->wqe_counter) &
  66                        (dr_cq->qp->sq.wqe_cnt - 1);
  67                dr_cq->qp->sq.cc = dr_cq->qp->sq.wqe_head[idx] + 1;
  68
  69                return CQ_OK;
  70        }
  71
  72        return CQ_POLL_ERR;
  73}
  74
  75static int dr_cq_poll_one(struct mlx5dr_cq *dr_cq)
  76{
  77        struct mlx5_cqe64 *cqe64;
  78        int err;
  79
  80        cqe64 = mlx5_cqwq_get_cqe(&dr_cq->wq);
  81        if (!cqe64)
  82                return CQ_EMPTY;
  83
  84        mlx5_cqwq_pop(&dr_cq->wq);
  85        err = dr_parse_cqe(dr_cq, cqe64);
  86        mlx5_cqwq_update_db_record(&dr_cq->wq);
  87
  88        return err;
  89}
  90
  91static int dr_poll_cq(struct mlx5dr_cq *dr_cq, int ne)
  92{
  93        int npolled;
  94        int err = 0;
  95
  96        for (npolled = 0; npolled < ne; ++npolled) {
  97                err = dr_cq_poll_one(dr_cq);
  98                if (err != CQ_OK)
  99                        break;
 100        }
 101
 102        return err == CQ_POLL_ERR ? err : npolled;
 103}
 104
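     /* Create the RC QP used for writing to and reading from ICM.
      * The RQ is kept minimal (4 WQEs) since it is never posted to here;
      * the SQ is sized to max_send_wr rounded up to a power of two, and
      * both send and receive completions go to the same CQ (attr->cqn).
      */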
 105static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev,
 106                                         struct dr_qp_init_attr *attr)
 107{
 108        u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
 109        u32 temp_qpc[MLX5_ST_SZ_DW(qpc)] = {};
 110        struct mlx5_wq_param wqp;
 111        struct mlx5dr_qp *dr_qp;
 112        int inlen;
 113        void *qpc;
 114        void *in;
 115        int err;
 116
 117        dr_qp = kzalloc(sizeof(*dr_qp), GFP_KERNEL);
 118        if (!dr_qp)
 119                return NULL;
 120
 121        wqp.buf_numa_node = mdev->priv.numa_node;
 122        wqp.db_numa_node = mdev->priv.numa_node;
 123
 124        dr_qp->rq.pc = 0;
 125        dr_qp->rq.cc = 0;
 126        dr_qp->rq.wqe_cnt = 4;
 127        dr_qp->sq.pc = 0;
 128        dr_qp->sq.cc = 0;
 129        dr_qp->sq.wqe_cnt = roundup_pow_of_two(attr->max_send_wr);
 130
 131        MLX5_SET(qpc, temp_qpc, log_rq_stride, ilog2(MLX5_SEND_WQE_DS) - 4);
 132        MLX5_SET(qpc, temp_qpc, log_rq_size, ilog2(dr_qp->rq.wqe_cnt));
 133        MLX5_SET(qpc, temp_qpc, log_sq_size, ilog2(dr_qp->sq.wqe_cnt));
 134        err = mlx5_wq_qp_create(mdev, &wqp, temp_qpc, &dr_qp->wq,
 135                                &dr_qp->wq_ctrl);
 136        if (err) {
 137                mlx5_core_warn(mdev, "Can't create QP WQ\n");
 138                goto err_wq;
 139        }
 140
 141        dr_qp->sq.wqe_head = kcalloc(dr_qp->sq.wqe_cnt,
 142                                     sizeof(dr_qp->sq.wqe_head[0]),
 143                                     GFP_KERNEL);
 144
 145        if (!dr_qp->sq.wqe_head) {
 146                mlx5_core_warn(mdev, "Can't allocate wqe head\n");
 147                goto err_wqe_head;
 148        }
 149
 150        inlen = MLX5_ST_SZ_BYTES(create_qp_in) +
 151                MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) *
 152                dr_qp->wq_ctrl.buf.npages;
 153        in = kvzalloc(inlen, GFP_KERNEL);
 154        if (!in) {
 155                err = -ENOMEM;
 156                goto err_in;
 157        }
 158
 159        qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
 160        MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
 161        MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
 162        MLX5_SET(qpc, qpc, isolate_vl_tc, attr->isolate_vl_tc);
 163        MLX5_SET(qpc, qpc, pd, attr->pdn);
 164        MLX5_SET(qpc, qpc, uar_page, attr->uar->index);
 165        MLX5_SET(qpc, qpc, log_page_size,
 166                 dr_qp->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
 167        MLX5_SET(qpc, qpc, fre, 1);
 168        MLX5_SET(qpc, qpc, rlky, 1);
 169        MLX5_SET(qpc, qpc, cqn_snd, attr->cqn);
 170        MLX5_SET(qpc, qpc, cqn_rcv, attr->cqn);
 171        MLX5_SET(qpc, qpc, log_rq_stride, ilog2(MLX5_SEND_WQE_DS) - 4);
 172        MLX5_SET(qpc, qpc, log_rq_size, ilog2(dr_qp->rq.wqe_cnt));
 173        MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
 174        MLX5_SET(qpc, qpc, log_sq_size, ilog2(dr_qp->sq.wqe_cnt));
 175        MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev));
 176        MLX5_SET64(qpc, qpc, dbr_addr, dr_qp->wq_ctrl.db.dma);
 177        if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
 178                MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
 179        mlx5_fill_page_frag_array(&dr_qp->wq_ctrl.buf,
 180                                  (__be64 *)MLX5_ADDR_OF(create_qp_in,
 181                                                         in, pas));
 182
 183        MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
 184        err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
 185        dr_qp->qpn = MLX5_GET(create_qp_out, out, qpn);
 186        kvfree(in);
 187        if (err)
 188                goto err_in;
 189        dr_qp->uar = attr->uar;
 190
 191        return dr_qp;
 192
 193err_in:
 194        kfree(dr_qp->sq.wqe_head);
 195err_wqe_head:
 196        mlx5_wq_destroy(&dr_qp->wq_ctrl);
 197err_wq:
 198        kfree(dr_qp);
 199        return NULL;
 200}
 201
 202static void dr_destroy_qp(struct mlx5_core_dev *mdev,
 203                          struct mlx5dr_qp *dr_qp)
 204{
 205        u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
 206
 207        MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
 208        MLX5_SET(destroy_qp_in, in, qpn, dr_qp->qpn);
 209        mlx5_cmd_exec_in(mdev, destroy_qp, in);
 210
 211        kfree(dr_qp->sq.wqe_head);
 212        mlx5_wq_destroy(&dr_qp->wq_ctrl);
 213        kfree(dr_qp);
 214}
 215
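     /* Ring the doorbell for newly posted WQEs: update the SQ doorbell
      * record with the current producer counter, then write the first
      * 8 bytes of the WQE ctrl segment to the UAR doorbell register
      * (MLX5_BF_OFFSET). The barriers order the WQE writes, the doorbell
      * record update and the UAR write.
      */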
 216static void dr_cmd_notify_hw(struct mlx5dr_qp *dr_qp, void *ctrl)
 217{
 218        dma_wmb();
 219        *dr_qp->wq.sq.db = cpu_to_be32(dr_qp->sq.pc & 0xffff);
 220
  221        /* After wmb() the hw is aware of the new work */
 222        wmb();
 223
 224        mlx5_write64(ctrl, dr_qp->uar->map + MLX5_BF_OFFSET);
 225}
 226
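     /* Build one RDMA WQE: a ctrl segment followed by a remote address
      * segment and a single data segment (the ds count in qpn_ds is in
      * 16-byte units). The WQE index is saved in wqe_head so a completion
      * can be matched back to it, and the doorbell is rung only when
      * notify_hw is set.
      */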
 227static void dr_rdma_segments(struct mlx5dr_qp *dr_qp, u64 remote_addr,
 228                             u32 rkey, struct dr_data_seg *data_seg,
 229                             u32 opcode, bool notify_hw)
 230{
 231        struct mlx5_wqe_raddr_seg *wq_raddr;
 232        struct mlx5_wqe_ctrl_seg *wq_ctrl;
 233        struct mlx5_wqe_data_seg *wq_dseg;
 234        unsigned int size;
 235        unsigned int idx;
 236
 237        size = sizeof(*wq_ctrl) / 16 + sizeof(*wq_dseg) / 16 +
 238                sizeof(*wq_raddr) / 16;
 239
 240        idx = dr_qp->sq.pc & (dr_qp->sq.wqe_cnt - 1);
 241
 242        wq_ctrl = mlx5_wq_cyc_get_wqe(&dr_qp->wq.sq, idx);
 243        wq_ctrl->imm = 0;
 244        wq_ctrl->fm_ce_se = (data_seg->send_flags) ?
 245                MLX5_WQE_CTRL_CQ_UPDATE : 0;
 246        wq_ctrl->opmod_idx_opcode = cpu_to_be32(((dr_qp->sq.pc & 0xffff) << 8) |
 247                                                opcode);
 248        wq_ctrl->qpn_ds = cpu_to_be32(size | dr_qp->qpn << 8);
 249        wq_raddr = (void *)(wq_ctrl + 1);
 250        wq_raddr->raddr = cpu_to_be64(remote_addr);
 251        wq_raddr->rkey = cpu_to_be32(rkey);
 252        wq_raddr->reserved = 0;
 253
 254        wq_dseg = (void *)(wq_raddr + 1);
 255        wq_dseg->byte_count = cpu_to_be32(data_seg->length);
 256        wq_dseg->lkey = cpu_to_be32(data_seg->lkey);
 257        wq_dseg->addr = cpu_to_be64(data_seg->addr);
 258
 259        dr_qp->sq.wqe_head[idx] = dr_qp->sq.pc++;
 260
 261        if (notify_hw)
 262                dr_cmd_notify_hw(dr_qp, wq_ctrl);
 263}
 264
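     /* Each post consists of two WQEs: an RDMA_WRITE of the payload to ICM
      * followed by an RDMA_READ from the same remote address back into the
      * local write buffer. The read effectively acts as a flush for the
      * preceding write, and only the read rings the doorbell, so both WQEs
      * are handed to HW together.
      */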
 265static void dr_post_send(struct mlx5dr_qp *dr_qp, struct postsend_info *send_info)
 266{
 267        dr_rdma_segments(dr_qp, send_info->remote_addr, send_info->rkey,
 268                         &send_info->write, MLX5_OPCODE_RDMA_WRITE, false);
 269        dr_rdma_segments(dr_qp, send_info->remote_addr, send_info->rkey,
 270                         &send_info->read, MLX5_OPCODE_RDMA_READ, true);
 271}
 272
  273/**
  274 * mlx5dr_send_fill_and_append_ste_send_info: Add data to be sent
  275 * with the send_list. Parameters:
  276 *
  277 *     @ste:       The STE to which this data is attached
  278 *     @size:      Size of the data to write
  279 *     @offset:    Offset of the data from the start of the hw_ste entry
  280 *     @data:      The data itself
  281 *     @ste_info:  STE info to be sent with the send_list
  282 *     @send_list: The list to append the STE info to
  283 *     @copy_data: If true, the data is copied and kept, since it is not
  284 *                 backed up anywhere else (e.g. during re-hash).
  285 *                 If false, the data may still be updated after it
  286 *                 has been added to the list.
  287 */
 288void mlx5dr_send_fill_and_append_ste_send_info(struct mlx5dr_ste *ste, u16 size,
 289                                               u16 offset, u8 *data,
 290                                               struct mlx5dr_ste_send_info *ste_info,
 291                                               struct list_head *send_list,
 292                                               bool copy_data)
 293{
 294        ste_info->size = size;
 295        ste_info->ste = ste;
 296        ste_info->offset = offset;
 297
 298        if (copy_data) {
 299                memcpy(ste_info->data_cont, data, size);
 300                ste_info->data = ste_info->data_cont;
 301        } else {
 302                ste_info->data = data;
 303        }
 304
 305        list_add_tail(&ste_info->send_list, send_list);
 306}
 307
  308/* The function consumes one completion at a time, unless the queue is full,
  309 * in which case (meaning the HW is a full queue length behind the SW)
  310 * it drains the CQ until it is empty.
  311 */
 312static int dr_handle_pending_wc(struct mlx5dr_domain *dmn,
 313                                struct mlx5dr_send_ring *send_ring)
 314{
 315        bool is_drain = false;
 316        int ne;
 317
 318        if (send_ring->pending_wqe < send_ring->signal_th)
 319                return 0;
 320
  321        /* Queue is full, start draining it */
 322        if (send_ring->pending_wqe >=
 323            dmn->send_ring->signal_th * TH_NUMS_TO_DRAIN)
 324                is_drain = true;
 325
 326        do {
 327                ne = dr_poll_cq(send_ring->cq, 1);
 328                if (ne < 0)
 329                        return ne;
 330                else if (ne == 1)
 331                        send_ring->pending_wqe -= send_ring->signal_th;
 332        } while (is_drain && send_ring->pending_wqe);
 333
 334        return 0;
 335}
 336
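     /* Each post produces two WQEs (write + read), so pending_wqe is bumped
      * twice. A signaled completion is requested once every signal_th WQEs;
      * with the defaults (QUEUE_SIZE 128, SIGNAL_PER_DIV_QUEUE 16) that is
      * one completion per 8 WQEs, which is why dr_handle_pending_wc
      * decrements pending_wqe in units of signal_th.
      */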
 337static void dr_fill_data_segs(struct mlx5dr_send_ring *send_ring,
 338                              struct postsend_info *send_info)
 339{
 340        send_ring->pending_wqe++;
 341
 342        if (send_ring->pending_wqe % send_ring->signal_th == 0)
 343                send_info->write.send_flags |= IB_SEND_SIGNALED;
 344
 345        send_ring->pending_wqe++;
 346        send_info->read.length = send_info->write.length;
 347        /* Read into the same write area */
 348        send_info->read.addr = (uintptr_t)send_info->write.addr;
 349        send_info->read.lkey = send_ring->mr->mkey.key;
 350
 351        if (send_ring->pending_wqe % send_ring->signal_th == 0)
 352                send_info->read.send_flags = IB_SEND_SIGNALED;
 353        else
 354                send_info->read.send_flags = 0;
 355}
 356
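     /* Post a single write (+ read) towards ICM under the send ring lock.
      * A payload larger than the max inline size is first copied into the
      * ring buffer slot selected by tx_head (the slots rotate over signal_th
      * chunks of max_post_send_size bytes), so the HW can DMA it from the
      * registered MR.
      */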
 357static int dr_postsend_icm_data(struct mlx5dr_domain *dmn,
 358                                struct postsend_info *send_info)
 359{
 360        struct mlx5dr_send_ring *send_ring = dmn->send_ring;
 361        u32 buff_offset;
 362        int ret;
 363
 364        spin_lock(&send_ring->lock);
 365
 366        ret = dr_handle_pending_wc(dmn, send_ring);
 367        if (ret)
 368                goto out_unlock;
 369
 370        if (send_info->write.length > dmn->info.max_inline_size) {
 371                buff_offset = (send_ring->tx_head &
 372                               (dmn->send_ring->signal_th - 1)) *
 373                        send_ring->max_post_send_size;
 374                /* Copy to ring mr */
 375                memcpy(send_ring->buf + buff_offset,
 376                       (void *)(uintptr_t)send_info->write.addr,
 377                       send_info->write.length);
 378                send_info->write.addr = (uintptr_t)send_ring->mr->dma_addr + buff_offset;
 379                send_info->write.lkey = send_ring->mr->mkey.key;
 380        }
 381
 382        send_ring->tx_head++;
 383        dr_fill_data_segs(send_ring, send_info);
 384        dr_post_send(send_ring->qp, send_info);
 385
 386out_unlock:
 387        spin_unlock(&send_ring->lock);
 388        return ret;
 389}
 390
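     /* Decide how to copy a whole STE hash table to ICM: chunks larger than
      * max_post_send_size are sent in several iterations of that size,
      * smaller chunks in a single iteration covering all entries. Also
      * allocates the staging buffer used to build the data.
      */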
 391static int dr_get_tbl_copy_details(struct mlx5dr_domain *dmn,
 392                                   struct mlx5dr_ste_htbl *htbl,
 393                                   u8 **data,
 394                                   u32 *byte_size,
 395                                   int *iterations,
 396                                   int *num_stes)
 397{
 398        int alloc_size;
 399
 400        if (htbl->chunk->byte_size > dmn->send_ring->max_post_send_size) {
 401                *iterations = htbl->chunk->byte_size /
 402                        dmn->send_ring->max_post_send_size;
 403                *byte_size = dmn->send_ring->max_post_send_size;
 404                alloc_size = *byte_size;
 405                *num_stes = *byte_size / DR_STE_SIZE;
 406        } else {
 407                *iterations = 1;
 408                *num_stes = htbl->chunk->num_of_entries;
 409                alloc_size = *num_stes * DR_STE_SIZE;
 410        }
 411
 412        *data = kvzalloc(alloc_size, GFP_KERNEL);
 413        if (!*data)
 414                return -ENOMEM;
 415
 416        return 0;
 417}
 418
  419/**
  420 * mlx5dr_send_postsend_ste: write size bytes at offset into the HW ICM.
  421 *
  422 *     @dmn:    Domain
  423 *     @ste:    The STE struct that contains the data (or at
  424 *              least part of it)
  425 *     @data:   The actual data to send
  426 *     @size:   Number of bytes to write
  427 *     @offset: Offset from the start of the STE's ICM mapped
  428 *              data; use it to write only part of the
  429 *              buffer.
  430 *
  431 * Return: 0 on success.
  432 */
 433int mlx5dr_send_postsend_ste(struct mlx5dr_domain *dmn, struct mlx5dr_ste *ste,
 434                             u8 *data, u16 size, u16 offset)
 435{
 436        struct postsend_info send_info = {};
 437
 438        mlx5dr_ste_prepare_for_postsend(dmn->ste_ctx, data, size);
 439
 440        send_info.write.addr = (uintptr_t)data;
 441        send_info.write.length = size;
 442        send_info.write.lkey = 0;
 443        send_info.remote_addr = mlx5dr_ste_get_mr_addr(ste) + offset;
 444        send_info.rkey = ste->htbl->chunk->rkey;
 445
 446        return dr_postsend_icm_data(dmn, &send_info);
 447}
 448
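     /* Write an entire STE hash table to ICM. Unused entries are written as
      * the default formatted STE; used entries are written as their reduced
      * hw_ste copy with the table bit_mask appended.
      */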
 449int mlx5dr_send_postsend_htbl(struct mlx5dr_domain *dmn,
 450                              struct mlx5dr_ste_htbl *htbl,
 451                              u8 *formatted_ste, u8 *mask)
 452{
 453        u32 byte_size = htbl->chunk->byte_size;
 454        int num_stes_per_iter;
 455        int iterations;
 456        u8 *data;
 457        int ret;
 458        int i;
 459        int j;
 460
 461        ret = dr_get_tbl_copy_details(dmn, htbl, &data, &byte_size,
 462                                      &iterations, &num_stes_per_iter);
 463        if (ret)
 464                return ret;
 465
 466        mlx5dr_ste_prepare_for_postsend(dmn->ste_ctx, formatted_ste, DR_STE_SIZE);
 467
  468        /* Send the data 'iterations' times */
 469        for (i = 0; i < iterations; i++) {
 470                u32 ste_index = i * (byte_size / DR_STE_SIZE);
 471                struct postsend_info send_info = {};
 472
  473                /* Copy all STEs into the data buffer,
  474                 * appending the bit_mask where needed
  475                 */
 476                for (j = 0; j < num_stes_per_iter; j++) {
 477                        struct mlx5dr_ste *ste = &htbl->ste_arr[ste_index + j];
 478                        u32 ste_off = j * DR_STE_SIZE;
 479
 480                        if (mlx5dr_ste_is_not_used(ste)) {
 481                                memcpy(data + ste_off,
 482                                       formatted_ste, DR_STE_SIZE);
 483                        } else {
 484                                /* Copy data */
 485                                memcpy(data + ste_off,
 486                                       htbl->ste_arr[ste_index + j].hw_ste,
 487                                       DR_STE_SIZE_REDUCED);
 488                                /* Copy bit_mask */
 489                                memcpy(data + ste_off + DR_STE_SIZE_REDUCED,
 490                                       mask, DR_STE_SIZE_MASK);
  491                                /* Only when we have a mask do we need to re-arrange the STE */
 492                                mlx5dr_ste_prepare_for_postsend(dmn->ste_ctx,
 493                                                                data + (j * DR_STE_SIZE),
 494                                                                DR_STE_SIZE);
 495                        }
 496                }
 497
 498                send_info.write.addr = (uintptr_t)data;
 499                send_info.write.length = byte_size;
 500                send_info.write.lkey = 0;
 501                send_info.remote_addr =
 502                        mlx5dr_ste_get_mr_addr(htbl->ste_arr + ste_index);
 503                send_info.rkey = htbl->chunk->rkey;
 504
 505                ret = dr_postsend_icm_data(dmn, &send_info);
 506                if (ret)
 507                        goto out_free;
 508        }
 509
 510out_free:
 511        kvfree(data);
 512        return ret;
 513}
 514
  515/* Initialize htbl with default STEs */
 516int mlx5dr_send_postsend_formatted_htbl(struct mlx5dr_domain *dmn,
 517                                        struct mlx5dr_ste_htbl *htbl,
 518                                        u8 *ste_init_data,
 519                                        bool update_hw_ste)
 520{
 521        u32 byte_size = htbl->chunk->byte_size;
 522        int iterations;
 523        int num_stes;
 524        u8 *copy_dst;
 525        u8 *data;
 526        int ret;
 527        int i;
 528
 529        ret = dr_get_tbl_copy_details(dmn, htbl, &data, &byte_size,
 530                                      &iterations, &num_stes);
 531        if (ret)
 532                return ret;
 533
 534        if (update_hw_ste) {
 535                /* Copy the reduced STE to hash table ste_arr */
 536                for (i = 0; i < num_stes; i++) {
 537                        copy_dst = htbl->hw_ste_arr + i * DR_STE_SIZE_REDUCED;
 538                        memcpy(copy_dst, ste_init_data, DR_STE_SIZE_REDUCED);
 539                }
 540        }
 541
 542        mlx5dr_ste_prepare_for_postsend(dmn->ste_ctx, ste_init_data, DR_STE_SIZE);
 543
 544        /* Copy the same STE on the data buffer */
 545        for (i = 0; i < num_stes; i++) {
 546                copy_dst = data + i * DR_STE_SIZE;
 547                memcpy(copy_dst, ste_init_data, DR_STE_SIZE);
 548        }
 549
  550        /* Send the data 'iterations' times */
 551        for (i = 0; i < iterations; i++) {
  552                u32 ste_index = i * (byte_size / DR_STE_SIZE);
 553                struct postsend_info send_info = {};
 554
 555                send_info.write.addr = (uintptr_t)data;
 556                send_info.write.length = byte_size;
 557                send_info.write.lkey = 0;
 558                send_info.remote_addr =
 559                        mlx5dr_ste_get_mr_addr(htbl->ste_arr + ste_index);
 560                send_info.rkey = htbl->chunk->rkey;
 561
 562                ret = dr_postsend_icm_data(dmn, &send_info);
 563                if (ret)
 564                        goto out_free;
 565        }
 566
 567out_free:
 568        kvfree(data);
 569        return ret;
 570}
 571
 572int mlx5dr_send_postsend_action(struct mlx5dr_domain *dmn,
 573                                struct mlx5dr_action *action)
 574{
 575        struct postsend_info send_info = {};
 576        int ret;
 577
 578        send_info.write.addr = (uintptr_t)action->rewrite->data;
 579        send_info.write.length = action->rewrite->num_of_actions *
 580                                 DR_MODIFY_ACTION_SIZE;
 581        send_info.write.lkey = 0;
 582        send_info.remote_addr = action->rewrite->chunk->mr_addr;
 583        send_info.rkey = action->rewrite->chunk->rkey;
 584
 585        ret = dr_postsend_icm_data(dmn, &send_info);
 586
 587        return ret;
 588}
 589
 590static int dr_modify_qp_rst2init(struct mlx5_core_dev *mdev,
 591                                 struct mlx5dr_qp *dr_qp,
 592                                 int port)
 593{
 594        u32 in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {};
 595        void *qpc;
 596
 597        qpc = MLX5_ADDR_OF(rst2init_qp_in, in, qpc);
 598
 599        MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, port);
 600        MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
 601        MLX5_SET(qpc, qpc, rre, 1);
 602        MLX5_SET(qpc, qpc, rwe, 1);
 603
 604        MLX5_SET(rst2init_qp_in, in, opcode, MLX5_CMD_OP_RST2INIT_QP);
 605        MLX5_SET(rst2init_qp_in, in, qpn, dr_qp->qpn);
 606
 607        return mlx5_cmd_exec_in(mdev, rst2init_qp, in);
 608}
 609
 610static int dr_cmd_modify_qp_rtr2rts(struct mlx5_core_dev *mdev,
 611                                    struct mlx5dr_qp *dr_qp,
 612                                    struct dr_qp_rts_attr *attr)
 613{
 614        u32 in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {};
 615        void *qpc;
 616
 617        qpc  = MLX5_ADDR_OF(rtr2rts_qp_in, in, qpc);
 618
 619        MLX5_SET(rtr2rts_qp_in, in, qpn, dr_qp->qpn);
 620
 621        MLX5_SET(qpc, qpc, retry_count, attr->retry_cnt);
 622        MLX5_SET(qpc, qpc, rnr_retry, attr->rnr_retry);
 623
 624        MLX5_SET(rtr2rts_qp_in, in, opcode, MLX5_CMD_OP_RTR2RTS_QP);
 625        MLX5_SET(rtr2rts_qp_in, in, qpn, dr_qp->qpn);
 626
 627        return mlx5_cmd_exec_in(mdev, rtr2rts_qp, in);
 628}
 629
 630static int dr_cmd_modify_qp_init2rtr(struct mlx5_core_dev *mdev,
 631                                     struct mlx5dr_qp *dr_qp,
 632                                     struct dr_qp_rtr_attr *attr)
 633{
 634        u32 in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {};
 635        void *qpc;
 636
 637        qpc = MLX5_ADDR_OF(init2rtr_qp_in, in, qpc);
 638
 639        MLX5_SET(init2rtr_qp_in, in, qpn, dr_qp->qpn);
 640
 641        MLX5_SET(qpc, qpc, mtu, attr->mtu);
 642        MLX5_SET(qpc, qpc, log_msg_max, DR_CHUNK_SIZE_MAX - 1);
 643        MLX5_SET(qpc, qpc, remote_qpn, attr->qp_num);
 644        memcpy(MLX5_ADDR_OF(qpc, qpc, primary_address_path.rmac_47_32),
 645               attr->dgid_attr.mac, sizeof(attr->dgid_attr.mac));
 646        memcpy(MLX5_ADDR_OF(qpc, qpc, primary_address_path.rgid_rip),
 647               attr->dgid_attr.gid, sizeof(attr->dgid_attr.gid));
 648        MLX5_SET(qpc, qpc, primary_address_path.src_addr_index,
 649                 attr->sgid_index);
 650
 651        if (attr->dgid_attr.roce_ver == MLX5_ROCE_VERSION_2)
 652                MLX5_SET(qpc, qpc, primary_address_path.udp_sport,
 653                         attr->udp_src_port);
 654
 655        MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, attr->port_num);
 656        MLX5_SET(qpc, qpc, primary_address_path.fl, attr->fl);
 657        MLX5_SET(qpc, qpc, min_rnr_nak, 1);
 658
 659        MLX5_SET(init2rtr_qp_in, in, opcode, MLX5_CMD_OP_INIT2RTR_QP);
 660        MLX5_SET(init2rtr_qp_in, in, qpn, dr_qp->qpn);
 661
 662        return mlx5_cmd_exec_in(mdev, init2rtr_qp, in);
 663}
 664
 665static bool dr_send_allow_fl(struct mlx5dr_cmd_caps *caps)
 666{
 667        /* Check whether RC RoCE QP creation with force loopback is allowed.
 668         * There are two separate capability bits for this:
 669         *  - force loopback when RoCE is enabled
 670         *  - force loopback when RoCE is disabled
 671         */
 672        return ((caps->roce_caps.roce_en &&
 673                 caps->roce_caps.fl_rc_qp_when_roce_enabled) ||
 674                (!caps->roce_caps.roce_en &&
 675                 caps->roce_caps.fl_rc_qp_when_roce_disabled));
 676}
 677
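     /* Move the send ring QP through the RST -> INIT -> RTR -> RTS states.
      * The QP is connected to itself (remote_qpn is its own qpn) over port 1,
      * using force loopback when the device allows it, and GID index 0
      * otherwise.
      */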
 678static int dr_prepare_qp_to_rts(struct mlx5dr_domain *dmn)
 679{
 680        struct mlx5dr_qp *dr_qp = dmn->send_ring->qp;
 681        struct dr_qp_rts_attr rts_attr = {};
 682        struct dr_qp_rtr_attr rtr_attr = {};
 683        enum ib_mtu mtu = IB_MTU_1024;
 684        u16 gid_index = 0;
 685        int port = 1;
 686        int ret;
 687
 688        /* Init */
 689        ret = dr_modify_qp_rst2init(dmn->mdev, dr_qp, port);
 690        if (ret) {
 691                mlx5dr_err(dmn, "Failed modify QP rst2init\n");
 692                return ret;
 693        }
 694
 695        /* RTR */
 696        rtr_attr.mtu            = mtu;
 697        rtr_attr.qp_num         = dr_qp->qpn;
 698        rtr_attr.min_rnr_timer  = 12;
 699        rtr_attr.port_num       = port;
 700        rtr_attr.udp_src_port   = dmn->info.caps.roce_min_src_udp;
 701
 702        /* If QP creation with force loopback is allowed, then there
 703         * is no need for GID index when creating the QP.
 704         * Otherwise we query GID attributes and use GID index.
 705         */
 706        rtr_attr.fl = dr_send_allow_fl(&dmn->info.caps);
 707        if (!rtr_attr.fl) {
 708                ret = mlx5dr_cmd_query_gid(dmn->mdev, port, gid_index,
 709                                           &rtr_attr.dgid_attr);
 710                if (ret)
 711                        return ret;
 712
 713                rtr_attr.sgid_index = gid_index;
 714        }
 715
 716        ret = dr_cmd_modify_qp_init2rtr(dmn->mdev, dr_qp, &rtr_attr);
 717        if (ret) {
 718                mlx5dr_err(dmn, "Failed modify QP init2rtr\n");
 719                return ret;
 720        }
 721
 722        /* RTS */
 723        rts_attr.timeout        = 14;
 724        rts_attr.retry_cnt      = 7;
 725        rts_attr.rnr_retry      = 7;
 726
 727        ret = dr_cmd_modify_qp_rtr2rts(dmn->mdev, dr_qp, &rts_attr);
 728        if (ret) {
 729                mlx5dr_err(dmn, "Failed modify QP rtr2rts\n");
 730                return ret;
 731        }
 732
 733        return 0;
 734}
 735
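     /* The send ring CQ is used in polling mode only and is never armed,
      * so a completion event is not expected here; just log it.
      */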
 736static void dr_cq_complete(struct mlx5_core_cq *mcq,
 737                           struct mlx5_eqe *eqe)
 738{
 739        pr_err("CQ completion CQ: #%u\n", mcq->cqn);
 740}
 741
 742static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev,
 743                                      struct mlx5_uars_page *uar,
 744                                      size_t ncqe)
 745{
 746        u32 temp_cqc[MLX5_ST_SZ_DW(cqc)] = {};
 747        u32 out[MLX5_ST_SZ_DW(create_cq_out)];
 748        struct mlx5_wq_param wqp;
 749        struct mlx5_cqe64 *cqe;
 750        struct mlx5dr_cq *cq;
 751        int inlen, err, eqn;
 752        void *cqc, *in;
 753        __be64 *pas;
 754        int vector;
 755        u32 i;
 756
 757        cq = kzalloc(sizeof(*cq), GFP_KERNEL);
 758        if (!cq)
 759                return NULL;
 760
 761        ncqe = roundup_pow_of_two(ncqe);
 762        MLX5_SET(cqc, temp_cqc, log_cq_size, ilog2(ncqe));
 763
 764        wqp.buf_numa_node = mdev->priv.numa_node;
 765        wqp.db_numa_node = mdev->priv.numa_node;
 766
 767        err = mlx5_cqwq_create(mdev, &wqp, temp_cqc, &cq->wq,
 768                               &cq->wq_ctrl);
 769        if (err)
 770                goto out;
 771
 772        for (i = 0; i < mlx5_cqwq_get_size(&cq->wq); i++) {
 773                cqe = mlx5_cqwq_get_wqe(&cq->wq, i);
 774                cqe->op_own = MLX5_CQE_INVALID << 4 | MLX5_CQE_OWNER_MASK;
 775        }
 776
 777        inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
 778                sizeof(u64) * cq->wq_ctrl.buf.npages;
 779        in = kvzalloc(inlen, GFP_KERNEL);
 780        if (!in)
 781                goto err_cqwq;
 782
 783        vector = raw_smp_processor_id() % mlx5_comp_vectors_count(mdev);
 784        err = mlx5_vector2eqn(mdev, vector, &eqn);
 785        if (err) {
 786                kvfree(in);
 787                goto err_cqwq;
 788        }
 789
 790        cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
 791        MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe));
 792        MLX5_SET(cqc, cqc, c_eqn, eqn);
 793        MLX5_SET(cqc, cqc, uar_page, uar->index);
 794        MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift -
 795                 MLX5_ADAPTER_PAGE_SHIFT);
 796        MLX5_SET64(cqc, cqc, dbr_addr, cq->wq_ctrl.db.dma);
 797
 798        pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
 799        mlx5_fill_page_frag_array(&cq->wq_ctrl.buf, pas);
 800
 801        cq->mcq.comp  = dr_cq_complete;
 802
 803        err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out));
 804        kvfree(in);
 805
 806        if (err)
 807                goto err_cqwq;
 808
 809        cq->mcq.cqe_sz = 64;
 810        cq->mcq.set_ci_db = cq->wq_ctrl.db.db;
 811        cq->mcq.arm_db = cq->wq_ctrl.db.db + 1;
 812        *cq->mcq.set_ci_db = 0;
 813
  814        /* Set a non-zero value in order to prevent the HW from running
  815         * db-recovery on a CQ that is used in polling mode.
  816         */
 817        *cq->mcq.arm_db = cpu_to_be32(2 << 28);
 818
 819        cq->mcq.vector = 0;
 820        cq->mcq.uar = uar;
 821
 822        return cq;
 823
 824err_cqwq:
 825        mlx5_wq_destroy(&cq->wq_ctrl);
 826out:
 827        kfree(cq);
 828        return NULL;
 829}
 830
 831static void dr_destroy_cq(struct mlx5_core_dev *mdev, struct mlx5dr_cq *cq)
 832{
 833        mlx5_core_destroy_cq(mdev, &cq->mcq);
 834        mlx5_wq_destroy(&cq->wq_ctrl);
 835        kfree(cq);
 836}
 837
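     /* Create a physical-address (PA) mkey covering the whole address space
      * (length64) on the given PD, so the DMA addresses of the send ring
      * buffers can be used directly in the data segments.
      */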
 838static int
 839dr_create_mkey(struct mlx5_core_dev *mdev, u32 pdn, struct mlx5_core_mkey *mkey)
 840{
 841        u32 in[MLX5_ST_SZ_DW(create_mkey_in)] = {};
 842        void *mkc;
 843
 844        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
 845        MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
 846        MLX5_SET(mkc, mkc, a, 1);
 847        MLX5_SET(mkc, mkc, rw, 1);
 848        MLX5_SET(mkc, mkc, rr, 1);
 849        MLX5_SET(mkc, mkc, lw, 1);
 850        MLX5_SET(mkc, mkc, lr, 1);
 851
 852        MLX5_SET(mkc, mkc, pd, pdn);
 853        MLX5_SET(mkc, mkc, length64, 1);
 854        MLX5_SET(mkc, mkc, qpn, 0xffffff);
 855
 856        return mlx5_core_create_mkey(mdev, mkey, in, sizeof(in));
 857}
 858
 859static struct mlx5dr_mr *dr_reg_mr(struct mlx5_core_dev *mdev,
 860                                   u32 pdn, void *buf, size_t size)
 861{
 862        struct mlx5dr_mr *mr = kzalloc(sizeof(*mr), GFP_KERNEL);
 863        struct device *dma_device;
 864        dma_addr_t dma_addr;
 865        int err;
 866
 867        if (!mr)
 868                return NULL;
 869
 870        dma_device = mlx5_core_dma_dev(mdev);
 871        dma_addr = dma_map_single(dma_device, buf, size,
 872                                  DMA_BIDIRECTIONAL);
 873        err = dma_mapping_error(dma_device, dma_addr);
 874        if (err) {
 875                mlx5_core_warn(mdev, "Can't dma buf\n");
 876                kfree(mr);
 877                return NULL;
 878        }
 879
 880        err = dr_create_mkey(mdev, pdn, &mr->mkey);
 881        if (err) {
 882                mlx5_core_warn(mdev, "Can't create mkey\n");
 883                dma_unmap_single(dma_device, dma_addr, size,
 884                                 DMA_BIDIRECTIONAL);
 885                kfree(mr);
 886                return NULL;
 887        }
 888
 889        mr->dma_addr = dma_addr;
 890        mr->size = size;
 891        mr->addr = buf;
 892
 893        return mr;
 894}
 895
 896static void dr_dereg_mr(struct mlx5_core_dev *mdev, struct mlx5dr_mr *mr)
 897{
 898        mlx5_core_destroy_mkey(mdev, &mr->mkey);
 899        dma_unmap_single(mlx5_core_dma_dev(mdev), mr->dma_addr, mr->size,
 900                         DMA_BIDIRECTIONAL);
 901        kfree(mr);
 902}
 903
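     /* Bring up the send ring: a CQ sized for QUEUE_SIZE + 1 CQEs, an RC QP
      * with QUEUE_SIZE send WQEs brought to RTS, a signaled completion every
      * signal_th = QUEUE_SIZE / SIGNAL_PER_DIV_QUEUE WQEs, a write buffer of
      * signal_th * max_post_send_size bytes registered as an MR, and a small
      * sync MR used by mlx5dr_send_ring_force_drain().
      */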
 904int mlx5dr_send_ring_alloc(struct mlx5dr_domain *dmn)
 905{
 906        struct dr_qp_init_attr init_attr = {};
 907        int cq_size;
 908        int size;
 909        int ret;
 910
 911        dmn->send_ring = kzalloc(sizeof(*dmn->send_ring), GFP_KERNEL);
 912        if (!dmn->send_ring)
 913                return -ENOMEM;
 914
 915        cq_size = QUEUE_SIZE + 1;
 916        dmn->send_ring->cq = dr_create_cq(dmn->mdev, dmn->uar, cq_size);
 917        if (!dmn->send_ring->cq) {
 918                mlx5dr_err(dmn, "Failed creating CQ\n");
 919                ret = -ENOMEM;
 920                goto free_send_ring;
 921        }
 922
 923        init_attr.cqn = dmn->send_ring->cq->mcq.cqn;
 924        init_attr.pdn = dmn->pdn;
 925        init_attr.uar = dmn->uar;
 926        init_attr.max_send_wr = QUEUE_SIZE;
 927
 928        /* Isolated VL is applicable only if force loopback is supported */
 929        if (dr_send_allow_fl(&dmn->info.caps))
 930                init_attr.isolate_vl_tc = dmn->info.caps.isolate_vl_tc;
 931
 932        spin_lock_init(&dmn->send_ring->lock);
 933
 934        dmn->send_ring->qp = dr_create_rc_qp(dmn->mdev, &init_attr);
 935        if (!dmn->send_ring->qp)  {
 936                mlx5dr_err(dmn, "Failed creating QP\n");
 937                ret = -ENOMEM;
 938                goto clean_cq;
 939        }
 940
 941        dmn->send_ring->cq->qp = dmn->send_ring->qp;
 942
 943        dmn->info.max_send_wr = QUEUE_SIZE;
 944        dmn->info.max_inline_size = min(dmn->send_ring->qp->max_inline_data,
 945                                        DR_STE_SIZE);
 946
 947        dmn->send_ring->signal_th = dmn->info.max_send_wr /
 948                SIGNAL_PER_DIV_QUEUE;
 949
 950        /* Prepare qp to be used */
 951        ret = dr_prepare_qp_to_rts(dmn);
 952        if (ret)
 953                goto clean_qp;
 954
 955        dmn->send_ring->max_post_send_size =
 956                mlx5dr_icm_pool_chunk_size_to_byte(DR_CHUNK_SIZE_1K,
 957                                                   DR_ICM_TYPE_STE);
 958
 959        /* Allocating the max size as a buffer for writing */
 960        size = dmn->send_ring->signal_th * dmn->send_ring->max_post_send_size;
 961        dmn->send_ring->buf = kzalloc(size, GFP_KERNEL);
 962        if (!dmn->send_ring->buf) {
 963                ret = -ENOMEM;
 964                goto clean_qp;
 965        }
 966
 967        dmn->send_ring->buf_size = size;
 968
 969        dmn->send_ring->mr = dr_reg_mr(dmn->mdev,
 970                                       dmn->pdn, dmn->send_ring->buf, size);
 971        if (!dmn->send_ring->mr) {
 972                ret = -ENOMEM;
 973                goto free_mem;
 974        }
 975
 976        dmn->send_ring->sync_mr = dr_reg_mr(dmn->mdev,
 977                                            dmn->pdn, dmn->send_ring->sync_buff,
 978                                            MIN_READ_SYNC);
 979        if (!dmn->send_ring->sync_mr) {
 980                ret = -ENOMEM;
 981                goto clean_mr;
 982        }
 983
 984        return 0;
 985
 986clean_mr:
 987        dr_dereg_mr(dmn->mdev, dmn->send_ring->mr);
 988free_mem:
 989        kfree(dmn->send_ring->buf);
 990clean_qp:
 991        dr_destroy_qp(dmn->mdev, dmn->send_ring->qp);
 992clean_cq:
 993        dr_destroy_cq(dmn->mdev, dmn->send_ring->cq);
 994free_send_ring:
 995        kfree(dmn->send_ring);
 996
 997        return ret;
 998}
 999
1000void mlx5dr_send_ring_free(struct mlx5dr_domain *dmn,
1001                           struct mlx5dr_send_ring *send_ring)
1002{
1003        dr_destroy_qp(dmn->mdev, send_ring->qp);
1004        dr_destroy_cq(dmn->mdev, send_ring->cq);
1005        dr_dereg_mr(dmn->mdev, send_ring->sync_mr);
1006        dr_dereg_mr(dmn->mdev, send_ring->mr);
1007        kfree(send_ring->buf);
1008        kfree(send_ring);
1009}
1010
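     /* Flush all outstanding work on the send ring: post
      * signal_th * TH_NUMS_TO_DRAIN / 2 dummy requests targeting the sync MR
      * (each request produces a write and a read WQE) so that pending_wqe
      * crosses the drain threshold, then poll the CQ until everything
      * pending has completed.
      */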
1011int mlx5dr_send_ring_force_drain(struct mlx5dr_domain *dmn)
1012{
1013        struct mlx5dr_send_ring *send_ring = dmn->send_ring;
1014        struct postsend_info send_info = {};
1015        u8 data[DR_STE_SIZE];
1016        int num_of_sends_req;
1017        int ret;
1018        int i;
1019
 1020        /* Sending this number of requests ensures the queue will be drained */
1021        num_of_sends_req = send_ring->signal_th * TH_NUMS_TO_DRAIN / 2;
1022
1023        /* Send fake requests forcing the last to be signaled */
1024        send_info.write.addr = (uintptr_t)data;
1025        send_info.write.length = DR_STE_SIZE;
1026        send_info.write.lkey = 0;
1027        /* Using the sync_mr in order to write/read */
1028        send_info.remote_addr = (uintptr_t)send_ring->sync_mr->addr;
1029        send_info.rkey = send_ring->sync_mr->mkey.key;
1030
1031        for (i = 0; i < num_of_sends_req; i++) {
1032                ret = dr_postsend_icm_data(dmn, &send_info);
1033                if (ret)
1034                        return ret;
1035        }
1036
1037        spin_lock(&send_ring->lock);
1038        ret = dr_handle_pending_wc(dmn, send_ring);
1039        spin_unlock(&send_ring->lock);
1040
1041        return ret;
1042}
1043