linux/drivers/gpu/drm/i915/gt/intel_lrc.c
   1// SPDX-License-Identifier: MIT
   2/*
   3 * Copyright © 2014 Intel Corporation
   4 */
   5
   6#include "gem/i915_gem_lmem.h"
   7
   8#include "gen8_engine_cs.h"
   9#include "i915_drv.h"
  10#include "i915_perf.h"
  11#include "intel_engine.h"
  12#include "intel_gpu_commands.h"
  13#include "intel_gt.h"
  14#include "intel_lrc.h"
  15#include "intel_lrc_reg.h"
  16#include "intel_ring.h"
  17#include "shmem_utils.h"
  18
  19static void set_offsets(u32 *regs,
  20                        const u8 *data,
  21                        const struct intel_engine_cs *engine,
  22                        bool close)
  23#define NOP(x) (BIT(7) | (x))
  24#define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
  25#define POSTED BIT(0)
  26#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
  27#define REG16(x) \
  28        (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
  29        (((x) >> 2) & 0x7f)
  30#define END 0
  31{
  32        const u32 base = engine->mmio_base;
  33
  34        while (*data) {
  35                u8 count, flags;
  36
  37                if (*data & BIT(7)) { /* skip */
  38                        count = *data++ & ~BIT(7);
  39                        regs += count;
  40                        continue;
  41                }
  42
  43                count = *data & 0x3f;
  44                flags = *data >> 6;
  45                data++;
  46
  47                *regs = MI_LOAD_REGISTER_IMM(count);
  48                if (flags & POSTED)
  49                        *regs |= MI_LRI_FORCE_POSTED;
  50                if (GRAPHICS_VER(engine->i915) >= 11)
  51                        *regs |= MI_LRI_LRM_CS_MMIO;
  52                regs++;
  53
  54                GEM_BUG_ON(!count);
  55                do {
  56                        u32 offset = 0;
  57                        u8 v;
  58
  59                        do {
  60                                v = *data++;
  61                                offset <<= 7;
  62                                offset |= v & ~BIT(7);
  63                        } while (v & BIT(7));
  64
  65                        regs[0] = base + (offset << 2);
  66                        regs += 2;
  67                } while (--count);
  68        }
  69
  70        if (close) {
  71                /* Close the batch; used mainly by live_lrc_layout() */
  72                *regs = MI_BATCH_BUFFER_END;
  73                if (GRAPHICS_VER(engine->i915) >= 10)
  74                        *regs |= BIT(0);
  75        }
  76}
  77
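/*
 * Layout-table note (added for clarity, not taken from the HW spec): each
 * table below is a compact description of the MI_LOAD_REGISTER_IMM layout
 * of the default context image, expanded in place by set_offsets() above.
 * NOP(n) skips n dwords, LRI(count, flags) emits an
 * MI_LOAD_REGISTER_IMM(count) header (force-posted when POSTED, CS-relative
 * on gen11+), and each REG()/REG16() entry fills in only the register
 * offset, relative to engine->mmio_base, of the following (offset, value)
 * pair, leaving the value dword to the default state. As a rough example,
 * for the render engine (mmio_base 0x2000), LRI(1, POSTED) followed by
 * REG16(0x244) expands to approximately:
 *
 *        regs[n + 0] = MI_LOAD_REGISTER_IMM(1) | MI_LRI_FORCE_POSTED;
 *        regs[n + 1] = 0x2000 + 0x244;
 *        regs[n + 2] = <value dword, left untouched here>;
 */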
  78static const u8 gen8_xcs_offsets[] = {
  79        NOP(1),
  80        LRI(11, 0),
  81        REG16(0x244),
  82        REG(0x034),
  83        REG(0x030),
  84        REG(0x038),
  85        REG(0x03c),
  86        REG(0x168),
  87        REG(0x140),
  88        REG(0x110),
  89        REG(0x11c),
  90        REG(0x114),
  91        REG(0x118),
  92
  93        NOP(9),
  94        LRI(9, 0),
  95        REG16(0x3a8),
  96        REG16(0x28c),
  97        REG16(0x288),
  98        REG16(0x284),
  99        REG16(0x280),
 100        REG16(0x27c),
 101        REG16(0x278),
 102        REG16(0x274),
 103        REG16(0x270),
 104
 105        NOP(13),
 106        LRI(2, 0),
 107        REG16(0x200),
 108        REG(0x028),
 109
 110        END
 111};
 112
 113static const u8 gen9_xcs_offsets[] = {
 114        NOP(1),
 115        LRI(14, POSTED),
 116        REG16(0x244),
 117        REG(0x034),
 118        REG(0x030),
 119        REG(0x038),
 120        REG(0x03c),
 121        REG(0x168),
 122        REG(0x140),
 123        REG(0x110),
 124        REG(0x11c),
 125        REG(0x114),
 126        REG(0x118),
 127        REG(0x1c0),
 128        REG(0x1c4),
 129        REG(0x1c8),
 130
 131        NOP(3),
 132        LRI(9, POSTED),
 133        REG16(0x3a8),
 134        REG16(0x28c),
 135        REG16(0x288),
 136        REG16(0x284),
 137        REG16(0x280),
 138        REG16(0x27c),
 139        REG16(0x278),
 140        REG16(0x274),
 141        REG16(0x270),
 142
 143        NOP(13),
 144        LRI(1, POSTED),
 145        REG16(0x200),
 146
 147        NOP(13),
 148        LRI(44, POSTED),
 149        REG(0x028),
 150        REG(0x09c),
 151        REG(0x0c0),
 152        REG(0x178),
 153        REG(0x17c),
 154        REG16(0x358),
 155        REG(0x170),
 156        REG(0x150),
 157        REG(0x154),
 158        REG(0x158),
 159        REG16(0x41c),
 160        REG16(0x600),
 161        REG16(0x604),
 162        REG16(0x608),
 163        REG16(0x60c),
 164        REG16(0x610),
 165        REG16(0x614),
 166        REG16(0x618),
 167        REG16(0x61c),
 168        REG16(0x620),
 169        REG16(0x624),
 170        REG16(0x628),
 171        REG16(0x62c),
 172        REG16(0x630),
 173        REG16(0x634),
 174        REG16(0x638),
 175        REG16(0x63c),
 176        REG16(0x640),
 177        REG16(0x644),
 178        REG16(0x648),
 179        REG16(0x64c),
 180        REG16(0x650),
 181        REG16(0x654),
 182        REG16(0x658),
 183        REG16(0x65c),
 184        REG16(0x660),
 185        REG16(0x664),
 186        REG16(0x668),
 187        REG16(0x66c),
 188        REG16(0x670),
 189        REG16(0x674),
 190        REG16(0x678),
 191        REG16(0x67c),
 192        REG(0x068),
 193
 194        END
 195};
 196
 197static const u8 gen12_xcs_offsets[] = {
 198        NOP(1),
 199        LRI(13, POSTED),
 200        REG16(0x244),
 201        REG(0x034),
 202        REG(0x030),
 203        REG(0x038),
 204        REG(0x03c),
 205        REG(0x168),
 206        REG(0x140),
 207        REG(0x110),
 208        REG(0x1c0),
 209        REG(0x1c4),
 210        REG(0x1c8),
 211        REG(0x180),
 212        REG16(0x2b4),
 213
 214        NOP(5),
 215        LRI(9, POSTED),
 216        REG16(0x3a8),
 217        REG16(0x28c),
 218        REG16(0x288),
 219        REG16(0x284),
 220        REG16(0x280),
 221        REG16(0x27c),
 222        REG16(0x278),
 223        REG16(0x274),
 224        REG16(0x270),
 225
 226        END
 227};
 228
 229static const u8 gen8_rcs_offsets[] = {
 230        NOP(1),
 231        LRI(14, POSTED),
 232        REG16(0x244),
 233        REG(0x034),
 234        REG(0x030),
 235        REG(0x038),
 236        REG(0x03c),
 237        REG(0x168),
 238        REG(0x140),
 239        REG(0x110),
 240        REG(0x11c),
 241        REG(0x114),
 242        REG(0x118),
 243        REG(0x1c0),
 244        REG(0x1c4),
 245        REG(0x1c8),
 246
 247        NOP(3),
 248        LRI(9, POSTED),
 249        REG16(0x3a8),
 250        REG16(0x28c),
 251        REG16(0x288),
 252        REG16(0x284),
 253        REG16(0x280),
 254        REG16(0x27c),
 255        REG16(0x278),
 256        REG16(0x274),
 257        REG16(0x270),
 258
 259        NOP(13),
 260        LRI(1, 0),
 261        REG(0x0c8),
 262
 263        END
 264};
 265
 266static const u8 gen9_rcs_offsets[] = {
 267        NOP(1),
 268        LRI(14, POSTED),
 269        REG16(0x244),
 270        REG(0x34),
 271        REG(0x30),
 272        REG(0x38),
 273        REG(0x3c),
 274        REG(0x168),
 275        REG(0x140),
 276        REG(0x110),
 277        REG(0x11c),
 278        REG(0x114),
 279        REG(0x118),
 280        REG(0x1c0),
 281        REG(0x1c4),
 282        REG(0x1c8),
 283
 284        NOP(3),
 285        LRI(9, POSTED),
 286        REG16(0x3a8),
 287        REG16(0x28c),
 288        REG16(0x288),
 289        REG16(0x284),
 290        REG16(0x280),
 291        REG16(0x27c),
 292        REG16(0x278),
 293        REG16(0x274),
 294        REG16(0x270),
 295
 296        NOP(13),
 297        LRI(1, 0),
 298        REG(0xc8),
 299
 300        NOP(13),
 301        LRI(44, POSTED),
 302        REG(0x28),
 303        REG(0x9c),
 304        REG(0xc0),
 305        REG(0x178),
 306        REG(0x17c),
 307        REG16(0x358),
 308        REG(0x170),
 309        REG(0x150),
 310        REG(0x154),
 311        REG(0x158),
 312        REG16(0x41c),
 313        REG16(0x600),
 314        REG16(0x604),
 315        REG16(0x608),
 316        REG16(0x60c),
 317        REG16(0x610),
 318        REG16(0x614),
 319        REG16(0x618),
 320        REG16(0x61c),
 321        REG16(0x620),
 322        REG16(0x624),
 323        REG16(0x628),
 324        REG16(0x62c),
 325        REG16(0x630),
 326        REG16(0x634),
 327        REG16(0x638),
 328        REG16(0x63c),
 329        REG16(0x640),
 330        REG16(0x644),
 331        REG16(0x648),
 332        REG16(0x64c),
 333        REG16(0x650),
 334        REG16(0x654),
 335        REG16(0x658),
 336        REG16(0x65c),
 337        REG16(0x660),
 338        REG16(0x664),
 339        REG16(0x668),
 340        REG16(0x66c),
 341        REG16(0x670),
 342        REG16(0x674),
 343        REG16(0x678),
 344        REG16(0x67c),
 345        REG(0x68),
 346
 347        END
 348};
 349
 350static const u8 gen11_rcs_offsets[] = {
 351        NOP(1),
 352        LRI(15, POSTED),
 353        REG16(0x244),
 354        REG(0x034),
 355        REG(0x030),
 356        REG(0x038),
 357        REG(0x03c),
 358        REG(0x168),
 359        REG(0x140),
 360        REG(0x110),
 361        REG(0x11c),
 362        REG(0x114),
 363        REG(0x118),
 364        REG(0x1c0),
 365        REG(0x1c4),
 366        REG(0x1c8),
 367        REG(0x180),
 368
 369        NOP(1),
 370        LRI(9, POSTED),
 371        REG16(0x3a8),
 372        REG16(0x28c),
 373        REG16(0x288),
 374        REG16(0x284),
 375        REG16(0x280),
 376        REG16(0x27c),
 377        REG16(0x278),
 378        REG16(0x274),
 379        REG16(0x270),
 380
 381        LRI(1, POSTED),
 382        REG(0x1b0),
 383
 384        NOP(10),
 385        LRI(1, 0),
 386        REG(0x0c8),
 387
 388        END
 389};
 390
 391static const u8 gen12_rcs_offsets[] = {
 392        NOP(1),
 393        LRI(13, POSTED),
 394        REG16(0x244),
 395        REG(0x034),
 396        REG(0x030),
 397        REG(0x038),
 398        REG(0x03c),
 399        REG(0x168),
 400        REG(0x140),
 401        REG(0x110),
 402        REG(0x1c0),
 403        REG(0x1c4),
 404        REG(0x1c8),
 405        REG(0x180),
 406        REG16(0x2b4),
 407
 408        NOP(5),
 409        LRI(9, POSTED),
 410        REG16(0x3a8),
 411        REG16(0x28c),
 412        REG16(0x288),
 413        REG16(0x284),
 414        REG16(0x280),
 415        REG16(0x27c),
 416        REG16(0x278),
 417        REG16(0x274),
 418        REG16(0x270),
 419
 420        LRI(3, POSTED),
 421        REG(0x1b0),
 422        REG16(0x5a8),
 423        REG16(0x5ac),
 424
 425        NOP(6),
 426        LRI(1, 0),
 427        REG(0x0c8),
 428        NOP(3 + 9 + 1),
 429
 430        LRI(51, POSTED),
 431        REG16(0x588),
 432        REG16(0x588),
 433        REG16(0x588),
 434        REG16(0x588),
 435        REG16(0x588),
 436        REG16(0x588),
 437        REG(0x028),
 438        REG(0x09c),
 439        REG(0x0c0),
 440        REG(0x178),
 441        REG(0x17c),
 442        REG16(0x358),
 443        REG(0x170),
 444        REG(0x150),
 445        REG(0x154),
 446        REG(0x158),
 447        REG16(0x41c),
 448        REG16(0x600),
 449        REG16(0x604),
 450        REG16(0x608),
 451        REG16(0x60c),
 452        REG16(0x610),
 453        REG16(0x614),
 454        REG16(0x618),
 455        REG16(0x61c),
 456        REG16(0x620),
 457        REG16(0x624),
 458        REG16(0x628),
 459        REG16(0x62c),
 460        REG16(0x630),
 461        REG16(0x634),
 462        REG16(0x638),
 463        REG16(0x63c),
 464        REG16(0x640),
 465        REG16(0x644),
 466        REG16(0x648),
 467        REG16(0x64c),
 468        REG16(0x650),
 469        REG16(0x654),
 470        REG16(0x658),
 471        REG16(0x65c),
 472        REG16(0x660),
 473        REG16(0x664),
 474        REG16(0x668),
 475        REG16(0x66c),
 476        REG16(0x670),
 477        REG16(0x674),
 478        REG16(0x678),
 479        REG16(0x67c),
 480        REG(0x068),
 481        REG(0x084),
 482        NOP(1),
 483
 484        END
 485};
 486
 487#undef END
 488#undef REG16
 489#undef REG
 490#undef LRI
 491#undef NOP
 492
 493static const u8 *reg_offsets(const struct intel_engine_cs *engine)
 494{
 495        /*
 496         * The gen12+ lists only have the registers we program in the basic
 497         * default state. We rely on the context image using relative
  498         * addressing to automatically fix up the register state between the
  499         * physical engines of a virtual engine.
 500         */
 501        GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
 502                   !intel_engine_has_relative_mmio(engine));
 503
 504        if (engine->class == RENDER_CLASS) {
 505                if (GRAPHICS_VER(engine->i915) >= 12)
 506                        return gen12_rcs_offsets;
 507                else if (GRAPHICS_VER(engine->i915) >= 11)
 508                        return gen11_rcs_offsets;
 509                else if (GRAPHICS_VER(engine->i915) >= 9)
 510                        return gen9_rcs_offsets;
 511                else
 512                        return gen8_rcs_offsets;
 513        } else {
 514                if (GRAPHICS_VER(engine->i915) >= 12)
 515                        return gen12_xcs_offsets;
 516                else if (GRAPHICS_VER(engine->i915) >= 9)
 517                        return gen9_xcs_offsets;
 518                else
 519                        return gen8_xcs_offsets;
 520        }
 521}
 522
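/*
 * Descriptive note on the lrc_ring_*() helpers below: each returns the
 * dword index, within ce->lrc_reg_state, of the MI_LRI register-offset slot
 * for that register, so the value to be loaded lives at index + 1 (see e.g.
 * __reset_stop_ring()). A return of -1 means the register has no slot in
 * this engine's context image.
 */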
 523static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
 524{
 525        if (GRAPHICS_VER(engine->i915) >= 12)
 526                return 0x60;
 527        else if (GRAPHICS_VER(engine->i915) >= 9)
 528                return 0x54;
 529        else if (engine->class == RENDER_CLASS)
 530                return 0x58;
 531        else
 532                return -1;
 533}
 534
 535static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
 536{
 537        if (GRAPHICS_VER(engine->i915) >= 12)
 538                return 0x74;
 539        else if (GRAPHICS_VER(engine->i915) >= 9)
 540                return 0x68;
 541        else if (engine->class == RENDER_CLASS)
 542                return 0xd8;
 543        else
 544                return -1;
 545}
 546
 547static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
 548{
 549        if (GRAPHICS_VER(engine->i915) >= 12)
 550                return 0x12;
 551        else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
 552                return 0x18;
 553        else
 554                return -1;
 555}
 556
 557static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
 558{
 559        int x;
 560
 561        x = lrc_ring_wa_bb_per_ctx(engine);
 562        if (x < 0)
 563                return x;
 564
 565        return x + 2;
 566}
 567
 568static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
 569{
 570        int x;
 571
 572        x = lrc_ring_indirect_ptr(engine);
 573        if (x < 0)
 574                return x;
 575
 576        return x + 2;
 577}
 578
 579static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
 580{
 581        if (engine->class != RENDER_CLASS)
 582                return -1;
 583
 584        if (GRAPHICS_VER(engine->i915) >= 12)
 585                return 0xb6;
 586        else if (GRAPHICS_VER(engine->i915) >= 11)
 587                return 0xaa;
 588        else
 589                return -1;
 590}
 591
 592static u32
 593lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
 594{
 595        switch (GRAPHICS_VER(engine->i915)) {
 596        default:
 597                MISSING_CASE(GRAPHICS_VER(engine->i915));
 598                fallthrough;
 599        case 12:
 600                return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 601        case 11:
 602                return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 603        case 10:
 604                return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 605        case 9:
 606                return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 607        case 8:
 608                return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 609        }
 610}
 611
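/*
 * Sketch of the INDIRECT_CTX programming as done below: the pointer slot
 * packs the GGTT address of the indirect context batch together with its
 * size in cachelines (hence the CACHELINE_BYTES alignment requirement),
 * while the offset slot takes the per-gen default offset shifted up by 6
 * bits.
 */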
 612static void
 613lrc_setup_indirect_ctx(u32 *regs,
 614                       const struct intel_engine_cs *engine,
 615                       u32 ctx_bb_ggtt_addr,
 616                       u32 size)
 617{
 618        GEM_BUG_ON(!size);
 619        GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
 620        GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
 621        regs[lrc_ring_indirect_ptr(engine) + 1] =
 622                ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
 623
 624        GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
 625        regs[lrc_ring_indirect_offset(engine) + 1] =
 626                lrc_ring_indirect_offset_default(engine) << 6;
 627}
 628
 629static void init_common_regs(u32 * const regs,
 630                             const struct intel_context *ce,
 631                             const struct intel_engine_cs *engine,
 632                             bool inhibit)
 633{
 634        u32 ctl;
 635
 636        ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
 637        ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
 638        if (inhibit)
 639                ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
 640        if (GRAPHICS_VER(engine->i915) < 11)
 641                ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
 642                                           CTX_CTRL_RS_CTX_ENABLE);
 643        regs[CTX_CONTEXT_CONTROL] = ctl;
 644
 645        regs[CTX_TIMESTAMP] = ce->runtime.last;
 646}
 647
 648static void init_wa_bb_regs(u32 * const regs,
 649                            const struct intel_engine_cs *engine)
 650{
 651        const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
 652
 653        if (wa_ctx->per_ctx.size) {
 654                const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
 655
 656                GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
 657                regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
 658                        (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
 659        }
 660
 661        if (wa_ctx->indirect_ctx.size) {
 662                lrc_setup_indirect_ctx(regs, engine,
 663                                       i915_ggtt_offset(wa_ctx->vma) +
 664                                       wa_ctx->indirect_ctx.offset,
 665                                       wa_ctx->indirect_ctx.size);
 666        }
 667}
 668
 669static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
 670{
 671        if (i915_vm_is_4lvl(&ppgtt->vm)) {
  672                /*
  673                 * 64b PPGTT (48bit canonical): PDP0_DESCRIPTOR contains the
  674                 * base address of the PML4; other PDP descriptors are ignored.
  675                 */
 676                ASSIGN_CTX_PML4(ppgtt, regs);
 677        } else {
 678                ASSIGN_CTX_PDP(ppgtt, regs, 3);
 679                ASSIGN_CTX_PDP(ppgtt, regs, 2);
 680                ASSIGN_CTX_PDP(ppgtt, regs, 1);
 681                ASSIGN_CTX_PDP(ppgtt, regs, 0);
 682        }
 683}
 684
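/*
 * Note: for a context whose vm is the global GTT, the PPGTT register state
 * is taken from the aliasing ppGTT hanging off the GGTT; otherwise the
 * context's own full ppGTT is used.
 */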
 685static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
 686{
 687        if (i915_is_ggtt(vm))
 688                return i915_vm_to_ggtt(vm)->alias;
 689        else
 690                return i915_vm_to_ppgtt(vm);
 691}
 692
 693static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
 694{
 695        int x;
 696
 697        x = lrc_ring_mi_mode(engine);
 698        if (x != -1) {
 699                regs[x + 1] &= ~STOP_RING;
 700                regs[x + 1] |= STOP_RING << 16;
 701        }
 702}
 703
 704static void __lrc_init_regs(u32 *regs,
 705                            const struct intel_context *ce,
 706                            const struct intel_engine_cs *engine,
 707                            bool inhibit)
 708{
 709        /*
 710         * A context is actually a big batch buffer with several
 711         * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
 712         * values we are setting here are only for the first context restore:
 713         * on a subsequent save, the GPU will recreate this batchbuffer with new
 714         * values (including all the missing MI_LOAD_REGISTER_IMM commands that
 715         * we are not initializing here).
 716         *
  717         * Must be kept consistent with virtual_update_register_offsets().
 718         */
 719
 720        if (inhibit)
 721                memset(regs, 0, PAGE_SIZE);
 722
 723        set_offsets(regs, reg_offsets(engine), engine, inhibit);
 724
 725        init_common_regs(regs, ce, engine, inhibit);
 726        init_ppgtt_regs(regs, vm_alias(ce->vm));
 727
 728        init_wa_bb_regs(regs, engine);
 729
 730        __reset_stop_ring(regs, engine);
 731}
 732
 733void lrc_init_regs(const struct intel_context *ce,
 734                   const struct intel_engine_cs *engine,
 735                   bool inhibit)
 736{
 737        __lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
 738}
 739
 740void lrc_reset_regs(const struct intel_context *ce,
 741                    const struct intel_engine_cs *engine)
 742{
 743        __reset_stop_ring(ce->lrc_reg_state, engine);
 744}
 745
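/*
 * Debug-only redzone (CONFIG_DRM_I915_DEBUG_GEM): one extra GTT page past
 * engine->context_size is filled with CONTEXT_REDZONE when the state is
 * initialised and verified again on unpin, to catch writes running past
 * the end of the context image.
 */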
 746static void
 747set_redzone(void *vaddr, const struct intel_engine_cs *engine)
 748{
 749        if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
 750                return;
 751
 752        vaddr += engine->context_size;
 753
 754        memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
 755}
 756
 757static void
 758check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
 759{
 760        if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
 761                return;
 762
 763        vaddr += engine->context_size;
 764
 765        if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
 766                drm_err_once(&engine->i915->drm,
 767                             "%s context redzone overwritten!\n",
 768                             engine->name);
 769}
 770
 771void lrc_init_state(struct intel_context *ce,
 772                    struct intel_engine_cs *engine,
 773                    void *state)
 774{
 775        bool inhibit = true;
 776
 777        set_redzone(state, engine);
 778
 779        if (engine->default_state) {
 780                shmem_read(engine->default_state, 0,
 781                           state, engine->context_size);
 782                __set_bit(CONTEXT_VALID_BIT, &ce->flags);
 783                inhibit = false;
 784        }
 785
 786        /* Clear the ppHWSP (inc. per-context counters) */
 787        memset(state, 0, PAGE_SIZE);
 788
 789        /*
 790         * The second page of the context object contains some registers which
 791         * must be set up prior to the first execution.
 792         */
 793        __lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
 794}
 795
 796static struct i915_vma *
 797__lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
 798{
 799        struct drm_i915_gem_object *obj;
 800        struct i915_vma *vma;
 801        u32 context_size;
 802
 803        context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
 804
 805        if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
 806                context_size += I915_GTT_PAGE_SIZE; /* for redzone */
 807
 808        if (GRAPHICS_VER(engine->i915) == 12) {
 809                ce->wa_bb_page = context_size / PAGE_SIZE;
 810                context_size += PAGE_SIZE;
 811        }
 812
 813        obj = i915_gem_object_create_lmem(engine->i915, context_size, 0);
 814        if (IS_ERR(obj))
 815                obj = i915_gem_object_create_shmem(engine->i915, context_size);
 816        if (IS_ERR(obj))
 817                return ERR_CAST(obj);
 818
 819        vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
 820        if (IS_ERR(vma)) {
 821                i915_gem_object_put(obj);
 822                return vma;
 823        }
 824
 825        return vma;
 826}
 827
 828static struct intel_timeline *
 829pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
 830{
 831        struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
 832
 833        return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
 834}
 835
 836int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
 837{
 838        struct intel_ring *ring;
 839        struct i915_vma *vma;
 840        int err;
 841
 842        GEM_BUG_ON(ce->state);
 843
 844        vma = __lrc_alloc_state(ce, engine);
 845        if (IS_ERR(vma))
 846                return PTR_ERR(vma);
 847
 848        ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
 849        if (IS_ERR(ring)) {
 850                err = PTR_ERR(ring);
 851                goto err_vma;
 852        }
 853
 854        if (!page_mask_bits(ce->timeline)) {
 855                struct intel_timeline *tl;
 856
 857                /*
 858                 * Use the static global HWSP for the kernel context, and
 859                 * a dynamically allocated cacheline for everyone else.
 860                 */
 861                if (unlikely(ce->timeline))
 862                        tl = pinned_timeline(ce, engine);
 863                else
 864                        tl = intel_timeline_create(engine->gt);
 865                if (IS_ERR(tl)) {
 866                        err = PTR_ERR(tl);
 867                        goto err_ring;
 868                }
 869
 870                ce->timeline = tl;
 871        }
 872
 873        ce->ring = ring;
 874        ce->state = vma;
 875
 876        return 0;
 877
 878err_ring:
 879        intel_ring_put(ring);
 880err_vma:
 881        i915_vma_put(vma);
 882        return err;
 883}
 884
 885void lrc_reset(struct intel_context *ce)
 886{
 887        GEM_BUG_ON(!intel_context_is_pinned(ce));
 888
 889        intel_ring_reset(ce->ring, ce->ring->emit);
 890
 891        /* Scrub away the garbage */
 892        lrc_init_regs(ce, ce->engine, true);
 893        ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
 894}
 895
 896int
 897lrc_pre_pin(struct intel_context *ce,
 898            struct intel_engine_cs *engine,
 899            struct i915_gem_ww_ctx *ww,
 900            void **vaddr)
 901{
 902        GEM_BUG_ON(!ce->state);
 903        GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
 904
 905        *vaddr = i915_gem_object_pin_map(ce->state->obj,
 906                                         i915_coherent_map_type(ce->engine->i915,
 907                                                                ce->state->obj,
 908                                                                false) |
 909                                         I915_MAP_OVERRIDE);
 910
 911        return PTR_ERR_OR_ZERO(*vaddr);
 912}
 913
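/*
 * Layout reminder (see lrc_init_state() above): the first page of the
 * context object is the per-process HWSP, so the register state that
 * lrc_pin() points ce->lrc_reg_state at begins LRC_STATE_OFFSET bytes into
 * the pinned mapping.
 */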
 914int
 915lrc_pin(struct intel_context *ce,
 916        struct intel_engine_cs *engine,
 917        void *vaddr)
 918{
 919        ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
 920
 921        if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
 922                lrc_init_state(ce, engine, vaddr);
 923
 924        ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
 925        return 0;
 926}
 927
 928void lrc_unpin(struct intel_context *ce)
 929{
 930        check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
 931                      ce->engine);
 932}
 933
 934void lrc_post_unpin(struct intel_context *ce)
 935{
 936        i915_gem_object_unpin_map(ce->state->obj);
 937}
 938
 939void lrc_fini(struct intel_context *ce)
 940{
 941        if (!ce->state)
 942                return;
 943
 944        intel_ring_put(fetch_and_zero(&ce->ring));
 945        i915_vma_put(fetch_and_zero(&ce->state));
 946}
 947
 948void lrc_destroy(struct kref *kref)
 949{
 950        struct intel_context *ce = container_of(kref, typeof(*ce), ref);
 951
 952        GEM_BUG_ON(!i915_active_is_idle(&ce->active));
 953        GEM_BUG_ON(intel_context_is_pinned(ce));
 954
 955        lrc_fini(ce);
 956
 957        intel_context_fini(ce);
 958        intel_context_free(ce);
 959}
 960
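/*
 * The gen12_emit_*() helpers below build the per-context indirect context
 * workaround batch (wired up via setup_indirect_ctx_bb()): they reload
 * values saved in the context image into CS GPR0 with MI_LOAD_REGISTER_MEM
 * and, where needed, copy them into the live ring registers with
 * MI_LOAD_REGISTER_REG (gen12_emit_restore_scratch() simply restores GPR0
 * itself).
 */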
 961static u32 *
 962gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
 963{
 964        *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
 965                MI_SRM_LRM_GLOBAL_GTT |
 966                MI_LRI_LRM_CS_MMIO;
 967        *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
 968        *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
 969                CTX_TIMESTAMP * sizeof(u32);
 970        *cs++ = 0;
 971
 972        *cs++ = MI_LOAD_REGISTER_REG |
 973                MI_LRR_SOURCE_CS_MMIO |
 974                MI_LRI_LRM_CS_MMIO;
 975        *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
 976        *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
 977
 978        *cs++ = MI_LOAD_REGISTER_REG |
 979                MI_LRR_SOURCE_CS_MMIO |
 980                MI_LRI_LRM_CS_MMIO;
 981        *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
 982        *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
 983
 984        return cs;
 985}
 986
 987static u32 *
 988gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
 989{
 990        GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
 991
 992        *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
 993                MI_SRM_LRM_GLOBAL_GTT |
 994                MI_LRI_LRM_CS_MMIO;
 995        *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
 996        *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
 997                (lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
 998        *cs++ = 0;
 999
1000        return cs;
1001}
1002
1003static u32 *
1004gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
1005{
1006        GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
1007
1008        *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1009                MI_SRM_LRM_GLOBAL_GTT |
1010                MI_LRI_LRM_CS_MMIO;
1011        *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1012        *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1013                (lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
1014        *cs++ = 0;
1015
1016        *cs++ = MI_LOAD_REGISTER_REG |
1017                MI_LRR_SOURCE_CS_MMIO |
1018                MI_LRI_LRM_CS_MMIO;
1019        *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1020        *cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
1021
1022        return cs;
1023}
1024
1025static u32 *
1026gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
1027{
1028        cs = gen12_emit_timestamp_wa(ce, cs);
1029        cs = gen12_emit_cmd_buf_wa(ce, cs);
1030        cs = gen12_emit_restore_scratch(ce, cs);
1031
1032        return cs;
1033}
1034
1035static u32 *
1036gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
1037{
1038        cs = gen12_emit_timestamp_wa(ce, cs);
1039        cs = gen12_emit_restore_scratch(ce, cs);
1040
1041        return cs;
1042}
1043
1044static u32 context_wa_bb_offset(const struct intel_context *ce)
1045{
1046        return PAGE_SIZE * ce->wa_bb_page;
1047}
1048
1049static u32 *context_indirect_bb(const struct intel_context *ce)
1050{
1051        void *ptr;
1052
1053        GEM_BUG_ON(!ce->wa_bb_page);
1054
1055        ptr = ce->lrc_reg_state;
1056        ptr -= LRC_STATE_OFFSET; /* back to start of context image */
1057        ptr += context_wa_bb_offset(ce);
1058
1059        return ptr;
1060}
1061
1062static void
1063setup_indirect_ctx_bb(const struct intel_context *ce,
1064                      const struct intel_engine_cs *engine,
1065                      u32 *(*emit)(const struct intel_context *, u32 *))
1066{
1067        u32 * const start = context_indirect_bb(ce);
1068        u32 *cs;
1069
1070        cs = emit(ce, start);
1071        GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
1072        while ((unsigned long)cs % CACHELINE_BYTES)
1073                *cs++ = MI_NOOP;
1074
1075        lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
1076                               i915_ggtt_offset(ce->state) +
1077                               context_wa_bb_offset(ce),
1078                               (cs - start) * sizeof(*cs));
1079}
1080
1081/*
1082 * The context descriptor encodes various attributes of a context,
1083 * including its GTT address and some flags. Because it's fairly
1084 * expensive to calculate, we'll just do it once and cache the result,
1085 * which remains valid until the context is unpinned.
1086 *
1087 * This is what a descriptor looks like, from LSB to MSB::
1088 *
1089 *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
1090 *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
1091 *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
1092 *      bits 53-54:    mbz, reserved for use by hardware
1093 *      bits 55-63:    group ID, currently unused and set to 0
1094 *
1095 * Starting from Gen11, the upper dword of the descriptor has a new format:
1096 *
1097 *      bits 32-36:    reserved
1098 *      bits 37-47:    SW context ID
 1099 *      bits 48-53:    engine instance
1100 *      bit 54:        mbz, reserved for use by hardware
1101 *      bits 55-60:    SW counter
1102 *      bits 61-63:    engine class
1103 *
1104 * engine info, SW context ID and SW counter need to form a unique number
1105 * (Context ID) per lrc.
1106 */
1107static u32 lrc_descriptor(const struct intel_context *ce)
1108{
1109        u32 desc;
1110
1111        desc = INTEL_LEGACY_32B_CONTEXT;
1112        if (i915_vm_is_4lvl(ce->vm))
1113                desc = INTEL_LEGACY_64B_CONTEXT;
1114        desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
1115
1116        desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
1117        if (GRAPHICS_VER(ce->vm->i915) == 8)
1118                desc |= GEN8_CTX_L3LLC_COHERENT;
1119
1120        return i915_ggtt_offset(ce->state) | desc;
1121}
1122
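/*
 * lrc_update_regs() refreshes the ring start/head/tail/ctl in the context
 * image (plus RPCS and OA state for the render class) and, when the context
 * has a wa_bb page, re-emits the gen12 indirect context BB. The returned
 * value is the context descriptor with CTX_DESC_FORCE_RESTORE set, which
 * callers store in ce->lrc.lrca.
 */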
1123u32 lrc_update_regs(const struct intel_context *ce,
1124                    const struct intel_engine_cs *engine,
1125                    u32 head)
1126{
1127        struct intel_ring *ring = ce->ring;
1128        u32 *regs = ce->lrc_reg_state;
1129
1130        GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
1131        GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
1132
1133        regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1134        regs[CTX_RING_HEAD] = head;
1135        regs[CTX_RING_TAIL] = ring->tail;
1136        regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1137
1138        /* RPCS */
1139        if (engine->class == RENDER_CLASS) {
1140                regs[CTX_R_PWR_CLK_STATE] =
1141                        intel_sseu_make_rpcs(engine->gt, &ce->sseu);
1142
1143                i915_oa_init_reg_state(ce, engine);
1144        }
1145
1146        if (ce->wa_bb_page) {
1147                u32 *(*fn)(const struct intel_context *ce, u32 *cs);
1148
1149                fn = gen12_emit_indirect_ctx_xcs;
1150                if (ce->engine->class == RENDER_CLASS)
1151                        fn = gen12_emit_indirect_ctx_rcs;
1152
 1153                /* Mutually exclusive with the global indirect bb */
1154                GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
1155                setup_indirect_ctx_bb(ce, engine, fn);
1156        }
1157
1158        return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
1159}
1160
1161void lrc_update_offsets(struct intel_context *ce,
1162                        struct intel_engine_cs *engine)
1163{
1164        set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
1165}
1166
1167void lrc_check_regs(const struct intel_context *ce,
1168                    const struct intel_engine_cs *engine,
1169                    const char *when)
1170{
1171        const struct intel_ring *ring = ce->ring;
1172        u32 *regs = ce->lrc_reg_state;
1173        bool valid = true;
1174        int x;
1175
1176        if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1177                pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1178                       engine->name,
1179                       regs[CTX_RING_START],
1180                       i915_ggtt_offset(ring->vma));
1181                regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1182                valid = false;
1183        }
1184
1185        if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1186            (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1187                pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1188                       engine->name,
1189                       regs[CTX_RING_CTL],
1190                       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1191                regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1192                valid = false;
1193        }
1194
1195        x = lrc_ring_mi_mode(engine);
1196        if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1197                pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1198                       engine->name, regs[x + 1]);
1199                regs[x + 1] &= ~STOP_RING;
1200                regs[x + 1] |= STOP_RING << 16;
1201                valid = false;
1202        }
1203
1204        WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
1205}
1206
1207/*
1208 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
1209 * PIPE_CONTROL instruction. This is required for the flush to happen correctly
1210 * but there is a slight complication as this is applied in WA batch where the
1211 * values are only initialized once so we cannot take register value at the
1212 * beginning and reuse it further; hence we save its value to memory, upload a
1213 * constant value with bit21 set and then we restore it back with the saved value.
1214 * To simplify the WA, a constant value is formed by using the default value
1215 * of this register. This shouldn't be a problem because we are only modifying
 1216 * it for a short period and this batch is non-preemptible. We can of course
1217 * use additional instructions that read the actual value of the register
1218 * at that time and set our bit of interest but it makes the WA complicated.
1219 *
1220 * This WA is also required for Gen9 so extracting as a function avoids
1221 * code duplication.
1222 */
1223static u32 *
1224gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
1225{
1226        /* NB no one else is allowed to scribble over scratch + 256! */
1227        *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1228        *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1229        *batch++ = intel_gt_scratch_offset(engine->gt,
1230                                           INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1231        *batch++ = 0;
1232
1233        *batch++ = MI_LOAD_REGISTER_IMM(1);
1234        *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1235        *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
1236
1237        batch = gen8_emit_pipe_control(batch,
1238                                       PIPE_CONTROL_CS_STALL |
1239                                       PIPE_CONTROL_DC_FLUSH_ENABLE,
1240                                       0);
1241
1242        *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1243        *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1244        *batch++ = intel_gt_scratch_offset(engine->gt,
1245                                           INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1246        *batch++ = 0;
1247
1248        return batch;
1249}
1250
1251/*
 1252 * Typically we only have one indirect_ctx and per_ctx batch buffer, which are
 1253 * initialized at the beginning and shared across all contexts, but this field
 1254 * helps us to have multiple batches at different offsets and to select them
 1255 * based on a criterion. At the moment this batch always starts at the beginning of the page
1256 * and at this point we don't have multiple wa_ctx batch buffers.
1257 *
 1258 * The number of WAs applied is not known at the beginning; we use this field
 1259 * to return the number of DWORDs written.
1260 *
 1261 * Note that this batch does not contain MI_BATCH_BUFFER_END,
 1262 * so NOOPs are added as padding to make it cacheline aligned.
 1263 * MI_BATCH_BUFFER_END will be added to the per-ctx batch; both of them together
 1264 * make a complete batch buffer.
1265 */
1266static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1267{
1268        /* WaDisableCtxRestoreArbitration:bdw,chv */
1269        *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1270
1271        /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
1272        if (IS_BROADWELL(engine->i915))
1273                batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1274
1275        /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
1276        /* Actual scratch location is at 128 bytes offset */
1277        batch = gen8_emit_pipe_control(batch,
1278                                       PIPE_CONTROL_FLUSH_L3 |
1279                                       PIPE_CONTROL_STORE_DATA_INDEX |
1280                                       PIPE_CONTROL_CS_STALL |
1281                                       PIPE_CONTROL_QW_WRITE,
1282                                       LRC_PPHWSP_SCRATCH_ADDR);
1283
1284        *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1285
1286        /* Pad to end of cacheline */
1287        while ((unsigned long)batch % CACHELINE_BYTES)
1288                *batch++ = MI_NOOP;
1289
1290        /*
1291         * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
1292         * execution depends on the length specified in terms of cache lines
1293         * in the register CTX_RCS_INDIRECT_CTX
1294         */
1295
1296        return batch;
1297}
1298
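/*
 * Helper for the gen9 indirect ctx BB below: emit_lri() packs up to 63
 * (reg, value) pairs behind a single MI_LOAD_REGISTER_IMM header and
 * appends an MI_NOOP, keeping the emitted dword count even.
 */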
1299struct lri {
1300        i915_reg_t reg;
1301        u32 value;
1302};
1303
1304static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
1305{
1306        GEM_BUG_ON(!count || count > 63);
1307
1308        *batch++ = MI_LOAD_REGISTER_IMM(count);
1309        do {
1310                *batch++ = i915_mmio_reg_offset(lri->reg);
1311                *batch++ = lri->value;
1312        } while (lri++, --count);
1313        *batch++ = MI_NOOP;
1314
1315        return batch;
1316}
1317
1318static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1319{
1320        static const struct lri lri[] = {
1321                /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
1322                {
1323                        COMMON_SLICE_CHICKEN2,
1324                        __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
1325                                       0),
1326                },
1327
1328                /* BSpec: 11391 */
1329                {
1330                        FF_SLICE_CHICKEN,
1331                        __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
1332                                       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
1333                },
1334
1335                /* BSpec: 11299 */
1336                {
1337                        _3D_CHICKEN3,
1338                        __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
1339                                       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
1340                }
1341        };
1342
1343        *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1344
1345        /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
1346        batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1347
1348        /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
1349        batch = gen8_emit_pipe_control(batch,
1350                                       PIPE_CONTROL_FLUSH_L3 |
1351                                       PIPE_CONTROL_STORE_DATA_INDEX |
1352                                       PIPE_CONTROL_CS_STALL |
1353                                       PIPE_CONTROL_QW_WRITE,
1354                                       LRC_PPHWSP_SCRATCH_ADDR);
1355
1356        batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
1357
1358        /* WaMediaPoolStateCmdInWABB:bxt,glk */
1359        if (HAS_POOLED_EU(engine->i915)) {
1360                /*
 1361                 * EU pool configuration is set up along with the golden context
 1362                 * during context initialization. This value depends on the
 1363                 * device type (2x6 or 3x6) and needs to be updated based
 1364                 * on which subslices are disabled, especially for 2x6
 1365                 * devices; however, it is safe to load the default
 1366                 * configuration of a 3x6 device instead of masking off the
 1367                 * corresponding bits, because the HW ignores the bits of a
 1368                 * disabled subslice and drops down to the appropriate config.
 1369                 * Please see render_state_setup() in i915_gem_render_state.c
 1370                 * for the possible configurations; to avoid duplication they
 1371                 * are not shown here again.
1372                 */
1373                *batch++ = GEN9_MEDIA_POOL_STATE;
1374                *batch++ = GEN9_MEDIA_POOL_ENABLE;
1375                *batch++ = 0x00777000;
1376                *batch++ = 0;
1377                *batch++ = 0;
1378                *batch++ = 0;
1379        }
1380
1381        *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1382
1383        /* Pad to end of cacheline */
1384        while ((unsigned long)batch % CACHELINE_BYTES)
1385                *batch++ = MI_NOOP;
1386
1387        return batch;
1388}
1389
1390static u32 *
1391gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1392{
1393        int i;
1394
1395        /*
1396         * WaPipeControlBefore3DStateSamplePattern: cnl
1397         *
1398         * Ensure the engine is idle prior to programming a
1399         * 3DSTATE_SAMPLE_PATTERN during a context restore.
1400         */
1401        batch = gen8_emit_pipe_control(batch,
1402                                       PIPE_CONTROL_CS_STALL,
1403                                       0);
1404        /*
1405         * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
1406         * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
1407         * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
1408         * confusing. Since gen8_emit_pipe_control() already advances the
1409         * batch by 6 dwords, we advance the other 10 here, completing a
1410         * cacheline. It's not clear if the workaround requires this padding
1411         * before other commands, or if it's just the regular padding we would
1412         * already have for the workaround bb, so leave it here for now.
1413         */
1414        for (i = 0; i < 10; i++)
1415                *batch++ = MI_NOOP;
1416
1417        /* Pad to end of cacheline */
1418        while ((unsigned long)batch % CACHELINE_BYTES)
1419                *batch++ = MI_NOOP;
1420
1421        return batch;
1422}
1423
1424#define CTX_WA_BB_SIZE (PAGE_SIZE)
1425
1426static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
1427{
1428        struct drm_i915_gem_object *obj;
1429        struct i915_vma *vma;
1430        int err;
1431
1432        obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
1433        if (IS_ERR(obj))
1434                return PTR_ERR(obj);
1435
1436        vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1437        if (IS_ERR(vma)) {
1438                err = PTR_ERR(vma);
1439                goto err;
1440        }
1441
1442        engine->wa_ctx.vma = vma;
1443        return 0;
1444
1445err:
1446        i915_gem_object_put(obj);
1447        return err;
1448}
1449
1450void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
1451{
1452        i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
1453}
1454
1455typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
1456
1457void lrc_init_wa_ctx(struct intel_engine_cs *engine)
1458{
1459        struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
1460        struct i915_wa_ctx_bb *wa_bb[] = {
1461                &wa_ctx->indirect_ctx, &wa_ctx->per_ctx
1462        };
1463        wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
1464        struct i915_gem_ww_ctx ww;
1465        void *batch, *batch_ptr;
1466        unsigned int i;
1467        int err;
1468
1469        if (engine->class != RENDER_CLASS)
1470                return;
1471
1472        switch (GRAPHICS_VER(engine->i915)) {
1473        case 12:
1474        case 11:
1475                return;
1476        case 10:
1477                wa_bb_fn[0] = gen10_init_indirectctx_bb;
1478                wa_bb_fn[1] = NULL;
1479                break;
1480        case 9:
1481                wa_bb_fn[0] = gen9_init_indirectctx_bb;
1482                wa_bb_fn[1] = NULL;
1483                break;
1484        case 8:
1485                wa_bb_fn[0] = gen8_init_indirectctx_bb;
1486                wa_bb_fn[1] = NULL;
1487                break;
1488        default:
1489                MISSING_CASE(GRAPHICS_VER(engine->i915));
1490                return;
1491        }
1492
1493        err = lrc_create_wa_ctx(engine);
1494        if (err) {
1495                /*
 1496                 * We continue even if we fail to initialize the WA batch,
 1497                 * because we only expect rare glitches and nothing critical
 1498                 * enough to prevent us from using the GPU.
1499                 */
1500                drm_err(&engine->i915->drm,
1501                        "Ignoring context switch w/a allocation error:%d\n",
1502                        err);
1503                return;
1504        }
1505
1506        if (!engine->wa_ctx.vma)
1507                return;
1508
1509        i915_gem_ww_ctx_init(&ww, true);
1510retry:
1511        err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
1512        if (!err)
1513                err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
1514        if (err)
1515                goto err;
1516
1517        batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
1518        if (IS_ERR(batch)) {
1519                err = PTR_ERR(batch);
1520                goto err_unpin;
1521        }
1522
1523        /*
1524         * Emit the two workaround batch buffers, recording the offset from the
1525         * start of the workaround batch buffer object for each and their
1526         * respective sizes.
1527         */
1528        batch_ptr = batch;
1529        for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
1530                wa_bb[i]->offset = batch_ptr - batch;
1531                if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
1532                                                  CACHELINE_BYTES))) {
1533                        err = -EINVAL;
1534                        break;
1535                }
1536                if (wa_bb_fn[i])
1537                        batch_ptr = wa_bb_fn[i](engine, batch_ptr);
1538                wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
1539        }
1540        GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);
1541
1542        __i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
1543        __i915_gem_object_release_map(wa_ctx->vma->obj);
1544
 1545        /* Verify that we can handle failure to set up the wa_ctx */
1546        if (!err)
1547                err = i915_inject_probe_error(engine->i915, -ENODEV);
1548
1549err_unpin:
1550        if (err)
1551                i915_vma_unpin(wa_ctx->vma);
1552err:
1553        if (err == -EDEADLK) {
1554                err = i915_gem_ww_ctx_backoff(&ww);
1555                if (!err)
1556                        goto retry;
1557        }
1558        i915_gem_ww_ctx_fini(&ww);
1559
1560        if (err) {
1561                i915_vma_put(engine->wa_ctx.vma);
1562
1563                /* Clear all flags to prevent further use */
1564                memset(wa_ctx, 0, sizeof(*wa_ctx));
1565        }
1566}
1567
1568static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1569{
1570#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1571        ce->runtime.num_underflow++;
1572        ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1573#endif
1574}
1575
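/*
 * Note: CTX_TIMESTAMP deltas are accumulated in 32 bits; a negative delta
 * means the saved timestamp went backwards, so it is traced (and counted
 * by the selftests) rather than folded into the average and total.
 */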
1576void lrc_update_runtime(struct intel_context *ce)
1577{
1578        u32 old;
1579        s32 dt;
1580
1581        if (intel_context_is_barrier(ce))
1582                return;
1583
1584        old = ce->runtime.last;
1585        ce->runtime.last = lrc_get_runtime(ce);
1586        dt = ce->runtime.last - old;
1587
1588        if (unlikely(dt < 0)) {
1589                CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1590                         old, ce->runtime.last, dt);
1591                st_update_runtime_underflow(ce, dt);
1592                return;
1593        }
1594
1595        ewma_runtime_add(&ce->runtime.avg, dt);
1596        ce->runtime.total += dt;
1597}
1598
1599#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1600#include "selftest_lrc.c"
1601#endif
1602