linux/arch/s390/kernel/nmi.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 *   Machine check handler
   4 *
   5 *    Copyright IBM Corp. 2000, 2009
   6 *    Author(s): Ingo Adlung <adlung@de.ibm.com>,
   7 *               Martin Schwidefsky <schwidefsky@de.ibm.com>,
   8 *               Cornelia Huck <cornelia.huck@de.ibm.com>,
   9 *               Heiko Carstens <heiko.carstens@de.ibm.com>,
  10 */
  11
  12#include <linux/kernel_stat.h>
  13#include <linux/init.h>
  14#include <linux/errno.h>
  15#include <linux/hardirq.h>
  16#include <linux/log2.h>
  17#include <linux/kprobes.h>
  18#include <linux/kmemleak.h>
  19#include <linux/time.h>
  20#include <linux/module.h>
  21#include <linux/sched/signal.h>
  22
  23#include <linux/export.h>
  24#include <asm/lowcore.h>
  25#include <asm/smp.h>
  26#include <asm/stp.h>
  27#include <asm/cputime.h>
  28#include <asm/nmi.h>
  29#include <asm/crw.h>
  30#include <asm/switch_to.h>
  31#include <asm/ctl_reg.h>
  32#include <asm/asm-offsets.h>
  33#include <linux/kvm_host.h>
  34
  35struct mcck_struct {
  36        unsigned int kill_task : 1;
  37        unsigned int channel_report : 1;
  38        unsigned int warning : 1;
  39        unsigned int stp_queue : 1;
  40        unsigned long mcck_code;
  41};
  42
  43static DEFINE_PER_CPU(struct mcck_struct, cpu_mcck);
  44static struct kmem_cache *mcesa_cache;
  45static unsigned long mcesa_origin_lc;
  46
  47static inline int nmi_needs_mcesa(void)
  48{
  49        return MACHINE_HAS_VX || MACHINE_HAS_GS;
  50}
  51
  52static inline unsigned long nmi_get_mcesa_size(void)
  53{
  54        if (MACHINE_HAS_GS)
  55                return MCESA_MAX_SIZE;
  56        return MCESA_MIN_SIZE;
  57}
  58
  59/*
  60 * The initial machine check extended save area for the boot CPU.
  61 * It will be replaced by nmi_init() with an allocated structure.
  62 * The structure is required for machine check happening early in
  63 * the boot process.
  64 */
  65static struct mcesa boot_mcesa __initdata __aligned(MCESA_MAX_SIZE);
  66
  67void __init nmi_alloc_boot_cpu(struct lowcore *lc)
  68{
  69        if (!nmi_needs_mcesa())
  70                return;
  71        lc->mcesad = (unsigned long) &boot_mcesa;
  72        if (MACHINE_HAS_GS)
  73                lc->mcesad |= ilog2(MCESA_MAX_SIZE);
  74}
  75
  76static int __init nmi_init(void)
  77{
  78        unsigned long origin, cr0, size;
  79
  80        if (!nmi_needs_mcesa())
  81                return 0;
  82        size = nmi_get_mcesa_size();
  83        if (size > MCESA_MIN_SIZE)
  84                mcesa_origin_lc = ilog2(size);
  85        /* create slab cache for the machine-check-extended-save-areas */
  86        mcesa_cache = kmem_cache_create("nmi_save_areas", size, size, 0, NULL);
  87        if (!mcesa_cache)
  88                panic("Couldn't create nmi save area cache");
  89        origin = (unsigned long) kmem_cache_alloc(mcesa_cache, GFP_KERNEL);
  90        if (!origin)
  91                panic("Couldn't allocate nmi save area");
  92        /* The pointer is stored with mcesa_bits ORed in */
  93        kmemleak_not_leak((void *) origin);
  94        __ctl_store(cr0, 0, 0);
  95        __ctl_clear_bit(0, 28); /* disable lowcore protection */
  96        /* Replace boot_mcesa on the boot CPU */
  97        S390_lowcore.mcesad = origin | mcesa_origin_lc;
  98        __ctl_load(cr0, 0, 0);
  99        return 0;
 100}
 101early_initcall(nmi_init);
 102
 103int nmi_alloc_per_cpu(struct lowcore *lc)
 104{
 105        unsigned long origin;
 106
 107        if (!nmi_needs_mcesa())
 108                return 0;
 109        origin = (unsigned long) kmem_cache_alloc(mcesa_cache, GFP_KERNEL);
 110        if (!origin)
 111                return -ENOMEM;
 112        /* The pointer is stored with mcesa_bits ORed in */
 113        kmemleak_not_leak((void *) origin);
 114        lc->mcesad = origin | mcesa_origin_lc;
 115        return 0;
 116}
 117
 118void nmi_free_per_cpu(struct lowcore *lc)
 119{
 120        if (!nmi_needs_mcesa())
 121                return;
 122        kmem_cache_free(mcesa_cache, (void *)(lc->mcesad & MCESA_ORIGIN_MASK));
 123}
 124
 125static notrace void s390_handle_damage(void)
 126{
 127        smp_emergency_stop();
 128        disabled_wait();
 129        while (1);
 130}
 131NOKPROBE_SYMBOL(s390_handle_damage);
 132
 133/*
 134 * Main machine check handler function. Will be called with interrupts disabled
 135 * and machine checks enabled.
 136 */
 137void __s390_handle_mcck(void)
 138{
 139        struct mcck_struct mcck;
 140
 141        /*
 142         * Disable machine checks and get the current state of accumulated
 143         * machine checks. Afterwards delete the old state and enable machine
 144         * checks again.
 145         */
 146        local_mcck_disable();
 147        mcck = *this_cpu_ptr(&cpu_mcck);
 148        memset(this_cpu_ptr(&cpu_mcck), 0, sizeof(mcck));
 149        local_mcck_enable();
 150
 151        if (mcck.channel_report)
 152                crw_handle_channel_report();
 153        /*
 154         * A warning may remain for a prolonged period on the bare iron.
 155         * (actually until the machine is powered off, or the problem is gone)
 156         * So we just stop listening for the WARNING MCH and avoid continuously
 157         * being interrupted.  One caveat is however, that we must do this per
 158         * processor and cannot use the smp version of ctl_clear_bit().
 159         * On VM we only get one interrupt per virtally presented machinecheck.
 160         * Though one suffices, we may get one interrupt per (virtual) cpu.
 161         */
 162        if (mcck.warning) {     /* WARNING pending ? */
 163                static int mchchk_wng_posted = 0;
 164
 165                /* Use single cpu clear, as we cannot handle smp here. */
 166                __ctl_clear_bit(14, 24);        /* Disable WARNING MCH */
 167                if (xchg(&mchchk_wng_posted, 1) == 0)
 168                        kill_cad_pid(SIGPWR, 1);
 169        }
 170        if (mcck.stp_queue)
 171                stp_queue_work();
 172        if (mcck.kill_task) {
 173                local_irq_enable();
 174                printk(KERN_EMERG "mcck: Terminating task because of machine "
 175                       "malfunction (code 0x%016lx).\n", mcck.mcck_code);
 176                printk(KERN_EMERG "mcck: task: %s, pid: %d.\n",
 177                       current->comm, current->pid);
 178                do_exit(SIGSEGV);
 179        }
 180}
 181
 182void noinstr s390_handle_mcck(void)
 183{
 184        trace_hardirqs_off();
 185        __s390_handle_mcck();
 186        trace_hardirqs_on();
 187}
 188/*
 189 * returns 0 if all required registers are available
 190 * returns 1 otherwise
 191 */
 192static int notrace s390_validate_registers(union mci mci, int umode)
 193{
 194        struct mcesa *mcesa;
 195        void *fpt_save_area;
 196        union ctlreg2 cr2;
 197        int kill_task;
 198        u64 zero;
 199
 200        kill_task = 0;
 201        zero = 0;
 202
 203        if (!mci.gr) {
 204                /*
 205                 * General purpose registers couldn't be restored and have
 206                 * unknown contents. Stop system or terminate process.
 207                 */
 208                if (!umode)
 209                        s390_handle_damage();
 210                kill_task = 1;
 211        }
 212        if (!mci.fp) {
 213                /*
 214                 * Floating point registers can't be restored. If the
 215                 * kernel currently uses floating point registers the
 216                 * system is stopped. If the process has its floating
 217                 * pointer registers loaded it is terminated.
 218                 */
 219                if (S390_lowcore.fpu_flags & KERNEL_VXR_V0V7)
 220                        s390_handle_damage();
 221                if (!test_cpu_flag(CIF_FPU))
 222                        kill_task = 1;
 223        }
 224        fpt_save_area = &S390_lowcore.floating_pt_save_area;
 225        if (!mci.fc) {
 226                /*
 227                 * Floating point control register can't be restored.
 228                 * If the kernel currently uses the floating pointer
 229                 * registers and needs the FPC register the system is
 230                 * stopped. If the process has its floating pointer
 231                 * registers loaded it is terminated. Otherwise the
 232                 * FPC is just validated.
 233                 */
 234                if (S390_lowcore.fpu_flags & KERNEL_FPC)
 235                        s390_handle_damage();
 236                asm volatile(
 237                        "       lfpc    %0\n"
 238                        :
 239                        : "Q" (zero));
 240                if (!test_cpu_flag(CIF_FPU))
 241                        kill_task = 1;
 242        } else {
 243                asm volatile(
 244                        "       lfpc    %0\n"
 245                        :
 246                        : "Q" (S390_lowcore.fpt_creg_save_area));
 247        }
 248
 249        mcesa = (struct mcesa *)(S390_lowcore.mcesad & MCESA_ORIGIN_MASK);
 250        if (!MACHINE_HAS_VX) {
 251                /* Validate floating point registers */
 252                asm volatile(
 253                        "       ld      0,0(%0)\n"
 254                        "       ld      1,8(%0)\n"
 255                        "       ld      2,16(%0)\n"
 256                        "       ld      3,24(%0)\n"
 257                        "       ld      4,32(%0)\n"
 258                        "       ld      5,40(%0)\n"
 259                        "       ld      6,48(%0)\n"
 260                        "       ld      7,56(%0)\n"
 261                        "       ld      8,64(%0)\n"
 262                        "       ld      9,72(%0)\n"
 263                        "       ld      10,80(%0)\n"
 264                        "       ld      11,88(%0)\n"
 265                        "       ld      12,96(%0)\n"
 266                        "       ld      13,104(%0)\n"
 267                        "       ld      14,112(%0)\n"
 268                        "       ld      15,120(%0)\n"
 269                        :
 270                        : "a" (fpt_save_area)
 271                        : "memory");
 272        } else {
 273                /* Validate vector registers */
 274                union ctlreg0 cr0;
 275
 276                if (!mci.vr) {
 277                        /*
 278                         * Vector registers can't be restored. If the kernel
 279                         * currently uses vector registers the system is
 280                         * stopped. If the process has its vector registers
 281                         * loaded it is terminated. Otherwise just validate
 282                         * the registers.
 283                         */
 284                        if (S390_lowcore.fpu_flags & KERNEL_VXR)
 285                                s390_handle_damage();
 286                        if (!test_cpu_flag(CIF_FPU))
 287                                kill_task = 1;
 288                }
 289                cr0.val = S390_lowcore.cregs_save_area[0];
 290                cr0.afp = cr0.vx = 1;
 291                __ctl_load(cr0.val, 0, 0);
 292                asm volatile(
 293                        "       la      1,%0\n"
 294                        "       .word   0xe70f,0x1000,0x0036\n" /* vlm 0,15,0(1) */
 295                        "       .word   0xe70f,0x1100,0x0c36\n" /* vlm 16,31,256(1) */
 296                        :
 297                        : "Q" (*(struct vx_array *)mcesa->vector_save_area)
 298                        : "1");
 299                __ctl_load(S390_lowcore.cregs_save_area[0], 0, 0);
 300        }
 301        /* Validate access registers */
 302        asm volatile(
 303                "       lam     0,15,0(%0)\n"
 304                :
 305                : "a" (&S390_lowcore.access_regs_save_area)
 306                : "memory");
 307        if (!mci.ar) {
 308                /*
 309                 * Access registers have unknown contents.
 310                 * Terminating task.
 311                 */
 312                kill_task = 1;
 313        }
 314        /* Validate guarded storage registers */
 315        cr2.val = S390_lowcore.cregs_save_area[2];
 316        if (cr2.gse) {
 317                if (!mci.gs) {
 318                        /*
 319                         * Guarded storage register can't be restored and
 320                         * the current processes uses guarded storage.
 321                         * It has to be terminated.
 322                         */
 323                        kill_task = 1;
 324                } else {
 325                        load_gs_cb((struct gs_cb *)mcesa->guarded_storage_save_area);
 326                }
 327        }
 328        /*
 329         * The getcpu vdso syscall reads CPU number from the programmable
 330         * field of the TOD clock. Disregard the TOD programmable register
 331         * validity bit and load the CPU number into the TOD programmable
 332         * field unconditionally.
 333         */
 334        set_tod_programmable_field(raw_smp_processor_id());
 335        /* Validate clock comparator register */
 336        set_clock_comparator(S390_lowcore.clock_comparator);
 337
 338        if (!mci.ms || !mci.pm || !mci.ia)
 339                kill_task = 1;
 340
 341        return kill_task;
 342}
 343NOKPROBE_SYMBOL(s390_validate_registers);
 344
 345/*
 346 * Backup the guest's machine check info to its description block
 347 */
 348static void notrace s390_backup_mcck_info(struct pt_regs *regs)
 349{
 350        struct mcck_volatile_info *mcck_backup;
 351        struct sie_page *sie_page;
 352
 353        /* r14 contains the sie block, which was set in sie64a */
 354        struct kvm_s390_sie_block *sie_block =
 355                        (struct kvm_s390_sie_block *) regs->gprs[14];
 356
 357        if (sie_block == NULL)
 358                /* Something's seriously wrong, stop system. */
 359                s390_handle_damage();
 360
 361        sie_page = container_of(sie_block, struct sie_page, sie_block);
 362        mcck_backup = &sie_page->mcck_info;
 363        mcck_backup->mcic = S390_lowcore.mcck_interruption_code &
 364                                ~(MCCK_CODE_CP | MCCK_CODE_EXT_DAMAGE);
 365        mcck_backup->ext_damage_code = S390_lowcore.external_damage_code;
 366        mcck_backup->failing_storage_address
 367                        = S390_lowcore.failing_storage_address;
 368}
 369NOKPROBE_SYMBOL(s390_backup_mcck_info);
 370
 371#define MAX_IPD_COUNT   29
 372#define MAX_IPD_TIME    (5 * 60 * USEC_PER_SEC) /* 5 minutes */
 373
 374#define ED_STP_ISLAND   6       /* External damage STP island check */
 375#define ED_STP_SYNC     7       /* External damage STP sync check */
 376
 377#define MCCK_CODE_NO_GUEST      (MCCK_CODE_CP | MCCK_CODE_EXT_DAMAGE)
 378
 379/*
 380 * machine check handler.
 381 */
 382int notrace s390_do_machine_check(struct pt_regs *regs)
 383{
 384        static int ipd_count;
 385        static DEFINE_SPINLOCK(ipd_lock);
 386        static unsigned long long last_ipd;
 387        struct mcck_struct *mcck;
 388        unsigned long long tmp;
 389        union mci mci;
 390        unsigned long mcck_dam_code;
 391        int mcck_pending = 0;
 392
 393        nmi_enter();
 394
 395        if (user_mode(regs))
 396                update_timer_mcck();
 397        inc_irq_stat(NMI_NMI);
 398        mci.val = S390_lowcore.mcck_interruption_code;
 399        mcck = this_cpu_ptr(&cpu_mcck);
 400
 401        /*
 402         * Reinject the instruction processing damages' machine checks
 403         * including Delayed Access Exception into the guest
 404         * instead of damaging the host if they happen in the guest.
 405         */
 406        if (mci.pd && !test_cpu_flag(CIF_MCCK_GUEST)) {
 407                if (mci.b) {
 408                        /* Processing backup -> verify if we can survive this */
 409                        u64 z_mcic, o_mcic, t_mcic;
 410                        z_mcic = (1ULL<<63 | 1ULL<<59 | 1ULL<<29);
 411                        o_mcic = (1ULL<<43 | 1ULL<<42 | 1ULL<<41 | 1ULL<<40 |
 412                                  1ULL<<36 | 1ULL<<35 | 1ULL<<34 | 1ULL<<32 |
 413                                  1ULL<<30 | 1ULL<<21 | 1ULL<<20 | 1ULL<<17 |
 414                                  1ULL<<16);
 415                        t_mcic = mci.val;
 416
 417                        if (((t_mcic & z_mcic) != 0) ||
 418                            ((t_mcic & o_mcic) != o_mcic)) {
 419                                s390_handle_damage();
 420                        }
 421
 422                        /*
 423                         * Nullifying exigent condition, therefore we might
 424                         * retry this instruction.
 425                         */
 426                        spin_lock(&ipd_lock);
 427                        tmp = get_tod_clock();
 428                        if (((tmp - last_ipd) >> 12) < MAX_IPD_TIME)
 429                                ipd_count++;
 430                        else
 431                                ipd_count = 1;
 432                        last_ipd = tmp;
 433                        if (ipd_count == MAX_IPD_COUNT)
 434                                s390_handle_damage();
 435                        spin_unlock(&ipd_lock);
 436                } else {
 437                        /* Processing damage -> stopping machine */
 438                        s390_handle_damage();
 439                }
 440        }
 441        if (s390_validate_registers(mci, user_mode(regs))) {
 442                /*
 443                 * Couldn't restore all register contents for the
 444                 * user space process -> mark task for termination.
 445                 */
 446                mcck->kill_task = 1;
 447                mcck->mcck_code = mci.val;
 448                mcck_pending = 1;
 449        }
 450
 451        /*
 452         * Backup the machine check's info if it happens when the guest
 453         * is running.
 454         */
 455        if (test_cpu_flag(CIF_MCCK_GUEST))
 456                s390_backup_mcck_info(regs);
 457
 458        if (mci.cd) {
 459                /* Timing facility damage */
 460                s390_handle_damage();
 461        }
 462        if (mci.ed && mci.ec) {
 463                /* External damage */
 464                if (S390_lowcore.external_damage_code & (1U << ED_STP_SYNC))
 465                        mcck->stp_queue |= stp_sync_check();
 466                if (S390_lowcore.external_damage_code & (1U << ED_STP_ISLAND))
 467                        mcck->stp_queue |= stp_island_check();
 468                mcck_pending = 1;
 469        }
 470
 471        if (mci.cp) {
 472                /* Channel report word pending */
 473                mcck->channel_report = 1;
 474                mcck_pending = 1;
 475        }
 476        if (mci.w) {
 477                /* Warning pending */
 478                mcck->warning = 1;
 479                mcck_pending = 1;
 480        }
 481
 482        /*
 483         * If there are only Channel Report Pending and External Damage
 484         * machine checks, they will not be reinjected into the guest
 485         * because they refer to host conditions only.
 486         */
 487        mcck_dam_code = (mci.val & MCIC_SUBCLASS_MASK);
 488        if (test_cpu_flag(CIF_MCCK_GUEST) &&
 489        (mcck_dam_code & MCCK_CODE_NO_GUEST) != mcck_dam_code) {
 490                /* Set exit reason code for host's later handling */
 491                *((long *)(regs->gprs[15] + __SF_SIE_REASON)) = -EINTR;
 492        }
 493        clear_cpu_flag(CIF_MCCK_GUEST);
 494
 495        if (user_mode(regs) && mcck_pending) {
 496                nmi_exit();
 497                return 1;
 498        }
 499
 500        if (mcck_pending)
 501                schedule_mcck_handler();
 502
 503        nmi_exit();
 504        return 0;
 505}
 506NOKPROBE_SYMBOL(s390_do_machine_check);
 507
 508static int __init machine_check_init(void)
 509{
 510        ctl_set_bit(14, 25);    /* enable external damage MCH */
 511        ctl_set_bit(14, 27);    /* enable system recovery MCH */
 512        ctl_set_bit(14, 24);    /* enable warning MCH */
 513        return 0;
 514}
 515early_initcall(machine_check_init);
 516