linux/arch/x86/kernel/cpu/mcheck/mce.c
   1/*
   2 * Machine check handler.
   3 *
   4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
   5 * Rest from unknown author(s).
   6 * 2004 Andi Kleen. Rewrote most of it.
   7 * Copyright 2008 Intel Corporation
   8 * Author: Andi Kleen
   9 */
  10
  11#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  12
  13#include <linux/thread_info.h>
  14#include <linux/capability.h>
  15#include <linux/miscdevice.h>
  16#include <linux/ratelimit.h>
  17#include <linux/kallsyms.h>
  18#include <linux/rcupdate.h>
  19#include <linux/kobject.h>
  20#include <linux/uaccess.h>
  21#include <linux/kdebug.h>
  22#include <linux/kernel.h>
  23#include <linux/percpu.h>
  24#include <linux/string.h>
  25#include <linux/device.h>
  26#include <linux/syscore_ops.h>
  27#include <linux/delay.h>
  28#include <linux/ctype.h>
  29#include <linux/sched.h>
  30#include <linux/sysfs.h>
  31#include <linux/types.h>
  32#include <linux/slab.h>
  33#include <linux/init.h>
  34#include <linux/kmod.h>
  35#include <linux/poll.h>
  36#include <linux/nmi.h>
  37#include <linux/cpu.h>
  38#include <linux/smp.h>
  39#include <linux/fs.h>
  40#include <linux/mm.h>
  41#include <linux/debugfs.h>
  42#include <linux/irq_work.h>
  43#include <linux/export.h>
  44
  45#include <asm/processor.h>
  46#include <asm/mce.h>
  47#include <asm/msr.h>
  48
  49#include "mce-internal.h"
  50
  51static DEFINE_MUTEX(mce_chrdev_read_mutex);
  52
  53#define rcu_dereference_check_mce(p) \
  54        rcu_dereference_index_check((p), \
  55                              rcu_read_lock_sched_held() || \
  56                              lockdep_is_held(&mce_chrdev_read_mutex))
  57
  58#define CREATE_TRACE_POINTS
  59#include <trace/events/mce.h>
  60
  61int mce_disabled __read_mostly;
  62
  63#define SPINUNIT 100    /* 100ns */
  64
  65atomic_t mce_entry;
  66
  67DEFINE_PER_CPU(unsigned, mce_exception_count);
  68
  69/*
  70 * Tolerant levels:
  71 *   0: always panic on uncorrected errors, log corrected errors
  72 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
  73 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
  74 *   3: never panic or SIGBUS, log all errors (for testing only)
  75 */
  76static int                      tolerant                __read_mostly = 1;
  77static int                      banks                   __read_mostly;
  78static int                      rip_msr                 __read_mostly;
  79static int                      mce_bootlog             __read_mostly = -1;
  80static int                      monarch_timeout         __read_mostly = -1;
  81static int                      mce_panic_timeout       __read_mostly;
  82static int                      mce_dont_log_ce         __read_mostly;
  83int                             mce_cmci_disabled       __read_mostly;
  84int                             mce_ignore_ce           __read_mostly;
  85int                             mce_ser                 __read_mostly;
  86int                             mce_bios_cmci_threshold __read_mostly;
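/*
 * Most of the switches above ("tolerant", mce_bootlog, monarch_timeout, the
 * CMCI controls, ...) are meant to be tuned from userspace rather than edited
 * here; in kernels of this era that is done through the mce= boot options and
 * the per-CPU sysfs attributes (e.g.
 * /sys/devices/system/machinecheck/machinecheck0/tolerant) registered later
 * in this file.
 */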
  87
  88struct mce_bank                *mce_banks               __read_mostly;
  89
  90/* User mode helper program triggered by machine check event */
  91static unsigned long            mce_need_notify;
  92static char                     mce_helper[128];
  93static char                     *mce_helper_argv[2] = { mce_helper, NULL };
  94
  95static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
  96
  97static DEFINE_PER_CPU(struct mce, mces_seen);
  98static int                      cpu_missing;
  99
  100/* MCA banks polled by the periodic polling timer for corrected events */
 101DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
 102        [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
 103};
 104
 105static DEFINE_PER_CPU(struct work_struct, mce_work);
 106
 107static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
 108
 109/*
 110 * CPU/chipset specific EDAC code can register a notifier call here to print
 111 * MCE errors in a human-readable form.
 112 */
 113ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
 114
 115/* Do initial initialization of a struct mce */
 116void mce_setup(struct mce *m)
 117{
 118        memset(m, 0, sizeof(struct mce));
 119        m->cpu = m->extcpu = smp_processor_id();
 120        rdtscll(m->tsc);
 121        /* We hope get_seconds stays lockless */
 122        m->time = get_seconds();
 123        m->cpuvendor = boot_cpu_data.x86_vendor;
 124        m->cpuid = cpuid_eax(1);
 125        m->socketid = cpu_data(m->extcpu).phys_proc_id;
 126        m->apicid = cpu_data(m->extcpu).initial_apicid;
 127        rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
 128}
 129
 130DEFINE_PER_CPU(struct mce, injectm);
 131EXPORT_PER_CPU_SYMBOL_GPL(injectm);
 132
 133/*
 134 * Lockless MCE logging infrastructure.
  135 * This avoids deadlocks on printk locks without having to break locks. It also
  136 * separates MCEs from kernel messages to avoid bogus bug reports.
 137 */
 138
 139static struct mce_log mcelog = {
 140        .signature      = MCE_LOG_SIGNATURE,
 141        .len            = MCE_LOG_LEN,
 142        .recordlen      = sizeof(struct mce),
 143};
 144
 145void mce_log(struct mce *mce)
 146{
 147        unsigned next, entry;
 148        int ret = 0;
 149
 150        /* Emit the trace record: */
 151        trace_mce_record(mce);
 152
 153        ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
 154        if (ret == NOTIFY_STOP)
 155                return;
 156
 157        mce->finished = 0;
 158        wmb();
 159        for (;;) {
 160                entry = rcu_dereference_check_mce(mcelog.next);
 161                for (;;) {
 162
 163                        /*
 164                         * When the buffer fills up discard new entries.
 165                         * Assume that the earlier errors are the more
 166                         * interesting ones:
 167                         */
 168                        if (entry >= MCE_LOG_LEN) {
 169                                set_bit(MCE_OVERFLOW,
 170                                        (unsigned long *)&mcelog.flags);
 171                                return;
 172                        }
 173                        /* Old left over entry. Skip: */
 174                        if (mcelog.entry[entry].finished) {
 175                                entry++;
 176                                continue;
 177                        }
 178                        break;
 179                }
 180                smp_rmb();
 181                next = entry + 1;
 182                if (cmpxchg(&mcelog.next, entry, next) == entry)
 183                        break;
 184        }
 185        memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
 186        wmb();
 187        mcelog.entry[entry].finished = 1;
 188        wmb();
 189
 190        mce->finished = 1;
 191        set_bit(0, &mce_need_notify);
 192}
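/*
 * A note on the lockless scheme above: a writer first claims a free slot by
 * advancing mcelog.next with cmpxchg(), copies its record in, and only then
 * sets ->finished behind a wmb().  A consumer must therefore treat an entry
 * as valid only once ->finished is set -- see drain_mcelog_buffer() below,
 * which briefly spins on that flag before handing the record to the decoder
 * chain.
 */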
 193
 194static void drain_mcelog_buffer(void)
 195{
 196        unsigned int next, i, prev = 0;
 197
 198        next = ACCESS_ONCE(mcelog.next);
 199
 200        do {
 201                struct mce *m;
 202
 203                /* drain what was logged during boot */
 204                for (i = prev; i < next; i++) {
 205                        unsigned long start = jiffies;
 206                        unsigned retries = 1;
 207
 208                        m = &mcelog.entry[i];
 209
 210                        while (!m->finished) {
 211                                if (time_after_eq(jiffies, start + 2*retries))
 212                                        retries++;
 213
 214                                cpu_relax();
 215
 216                                if (!m->finished && retries >= 4) {
 217                                        pr_err("skipping error being logged currently!\n");
 218                                        break;
 219                                }
 220                        }
 221                        smp_rmb();
 222                        atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
 223                }
 224
 225                memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m));
 226                prev = next;
 227                next = cmpxchg(&mcelog.next, prev, 0);
 228        } while (next != prev);
 229}
 230
 231
 232void mce_register_decode_chain(struct notifier_block *nb)
 233{
 234        atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
 235        drain_mcelog_buffer();
 236}
 237EXPORT_SYMBOL_GPL(mce_register_decode_chain);
 238
 239void mce_unregister_decode_chain(struct notifier_block *nb)
 240{
 241        atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
 242}
 243EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
 244
 245static void print_mce(struct mce *m)
 246{
 247        int ret = 0;
 248
 249        pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
 250               m->extcpu, m->mcgstatus, m->bank, m->status);
 251
 252        if (m->ip) {
 253                pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
 254                        !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
 255                                m->cs, m->ip);
 256
 257                if (m->cs == __KERNEL_CS)
 258                        print_symbol("{%s}", m->ip);
 259                pr_cont("\n");
 260        }
 261
 262        pr_emerg(HW_ERR "TSC %llx ", m->tsc);
 263        if (m->addr)
 264                pr_cont("ADDR %llx ", m->addr);
 265        if (m->misc)
 266                pr_cont("MISC %llx ", m->misc);
 267
 268        pr_cont("\n");
 269        /*
 270         * Note this output is parsed by external tools and old fields
 271         * should not be changed.
 272         */
 273        pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
 274                m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
 275                cpu_data(m->extcpu).microcode);
 276
 277        /*
  278         * Print out human-readable details about the MCE error
  279         * (if the CPU has an implementation for that).
 280         */
 281        ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
 282        if (ret == NOTIFY_STOP)
 283                return;
 284
 285        pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
 286}
 287
 288#define PANIC_TIMEOUT 5 /* 5 seconds */
 289
 290static atomic_t mce_paniced;
 291
 292static int fake_panic;
 293static atomic_t mce_fake_paniced;
 294
 295/* Panic in progress. Enable interrupts and wait for final IPI */
 296static void wait_for_panic(void)
 297{
 298        long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
 299
 300        preempt_disable();
 301        local_irq_enable();
 302        while (timeout-- > 0)
 303                udelay(1);
 304        if (panic_timeout == 0)
 305                panic_timeout = mce_panic_timeout;
 306        panic("Panicing machine check CPU died");
 307}
 308
 309static void mce_panic(char *msg, struct mce *final, char *exp)
 310{
 311        int i, apei_err = 0;
 312
 313        if (!fake_panic) {
 314                /*
 315                 * Make sure only one CPU runs in machine check panic
 316                 */
 317                if (atomic_inc_return(&mce_paniced) > 1)
 318                        wait_for_panic();
 319                barrier();
 320
 321                bust_spinlocks(1);
 322                console_verbose();
 323        } else {
 324                /* Don't log too much for fake panic */
 325                if (atomic_inc_return(&mce_fake_paniced) > 1)
 326                        return;
 327        }
 328        /* First print corrected ones that are still unlogged */
 329        for (i = 0; i < MCE_LOG_LEN; i++) {
 330                struct mce *m = &mcelog.entry[i];
 331                if (!(m->status & MCI_STATUS_VAL))
 332                        continue;
 333                if (!(m->status & MCI_STATUS_UC)) {
 334                        print_mce(m);
 335                        if (!apei_err)
 336                                apei_err = apei_write_mce(m);
 337                }
 338        }
 339        /* Now print uncorrected but with the final one last */
 340        for (i = 0; i < MCE_LOG_LEN; i++) {
 341                struct mce *m = &mcelog.entry[i];
 342                if (!(m->status & MCI_STATUS_VAL))
 343                        continue;
 344                if (!(m->status & MCI_STATUS_UC))
 345                        continue;
 346                if (!final || memcmp(m, final, sizeof(struct mce))) {
 347                        print_mce(m);
 348                        if (!apei_err)
 349                                apei_err = apei_write_mce(m);
 350                }
 351        }
 352        if (final) {
 353                print_mce(final);
 354                if (!apei_err)
 355                        apei_err = apei_write_mce(final);
 356        }
 357        if (cpu_missing)
 358                pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
 359        if (exp)
 360                pr_emerg(HW_ERR "Machine check: %s\n", exp);
 361        if (!fake_panic) {
 362                if (panic_timeout == 0)
 363                        panic_timeout = mce_panic_timeout;
 364                panic(msg);
 365        } else
 366                pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
 367}
 368
 369/* Support code for software error injection */
 370
 371static int msr_to_offset(u32 msr)
 372{
 373        unsigned bank = __this_cpu_read(injectm.bank);
 374
 375        if (msr == rip_msr)
 376                return offsetof(struct mce, ip);
 377        if (msr == MSR_IA32_MCx_STATUS(bank))
 378                return offsetof(struct mce, status);
 379        if (msr == MSR_IA32_MCx_ADDR(bank))
 380                return offsetof(struct mce, addr);
 381        if (msr == MSR_IA32_MCx_MISC(bank))
 382                return offsetof(struct mce, misc);
 383        if (msr == MSR_IA32_MCG_STATUS)
 384                return offsetof(struct mce, mcgstatus);
 385        return -1;
 386}
 387
 388/* MSR access wrappers used for error injection */
 389static u64 mce_rdmsrl(u32 msr)
 390{
 391        u64 v;
 392
 393        if (__this_cpu_read(injectm.finished)) {
 394                int offset = msr_to_offset(msr);
 395
 396                if (offset < 0)
 397                        return 0;
 398                return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
 399        }
 400
 401        if (rdmsrl_safe(msr, &v)) {
 402                WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
 403                /*
 404                 * Return zero in case the access faulted. This should
 405                 * not happen normally but can happen if the CPU does
 406                 * something weird, or if the code is buggy.
 407                 */
 408                v = 0;
 409        }
 410
 411        return v;
 412}
 413
 414static void mce_wrmsrl(u32 msr, u64 v)
 415{
 416        if (__this_cpu_read(injectm.finished)) {
 417                int offset = msr_to_offset(msr);
 418
 419                if (offset >= 0)
 420                        *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
 421                return;
 422        }
 423        wrmsrl(msr, v);
 424}
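/*
 * Together with the exported per-CPU "injectm" above, these wrappers are what
 * allows software error injection (e.g. the mce-inject module) to exercise the
 * normal code paths: while injectm.finished is set on this CPU, MSR reads and
 * writes are redirected into the fields of the injected struct mce instead of
 * touching the real hardware registers.
 */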
 425
 426/*
 427 * Collect all global (w.r.t. this processor) status about this machine
 428 * check into our "mce" struct so that we can use it later to assess
 429 * the severity of the problem as we read per-bank specific details.
 430 */
 431static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
 432{
 433        mce_setup(m);
 434
 435        m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
 436        if (regs) {
 437                /*
 438                 * Get the address of the instruction at the time of
 439                 * the machine check error.
 440                 */
 441                if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
 442                        m->ip = regs->ip;
 443                        m->cs = regs->cs;
 444
 445                        /*
 446                         * When in VM86 mode make the cs look like ring 3
 447                         * always. This is a lie, but it's better than passing
 448                         * the additional vm86 bit around everywhere.
 449                         */
 450                        if (v8086_mode(regs))
 451                                m->cs |= 3;
 452                }
 453                /* Use accurate RIP reporting if available. */
 454                if (rip_msr)
 455                        m->ip = mce_rdmsrl(rip_msr);
 456        }
 457}
 458
 459/*
  460 * Simple lockless ring to communicate PFNs from the exception handler to the
 461 * process context work function. This is vastly simplified because there's
 462 * only a single reader and a single writer.
 463 */
 464#define MCE_RING_SIZE 16        /* we use one entry less */
 465
 466struct mce_ring {
 467        unsigned short start;
 468        unsigned short end;
 469        unsigned long ring[MCE_RING_SIZE];
 470};
 471static DEFINE_PER_CPU(struct mce_ring, mce_ring);
 472
 473/* Runs with CPU affinity in workqueue */
 474static int mce_ring_empty(void)
 475{
 476        struct mce_ring *r = &__get_cpu_var(mce_ring);
 477
 478        return r->start == r->end;
 479}
 480
 481static int mce_ring_get(unsigned long *pfn)
 482{
 483        struct mce_ring *r;
 484        int ret = 0;
 485
 486        *pfn = 0;
 487        get_cpu();
 488        r = &__get_cpu_var(mce_ring);
 489        if (r->start == r->end)
 490                goto out;
 491        *pfn = r->ring[r->start];
 492        r->start = (r->start + 1) % MCE_RING_SIZE;
 493        ret = 1;
 494out:
 495        put_cpu();
 496        return ret;
 497}
 498
 499/* Always runs in MCE context with preempt off */
 500static int mce_ring_add(unsigned long pfn)
 501{
 502        struct mce_ring *r = &__get_cpu_var(mce_ring);
 503        unsigned next;
 504
 505        next = (r->end + 1) % MCE_RING_SIZE;
 506        if (next == r->start)
 507                return -1;
 508        r->ring[r->end] = pfn;
 509        wmb();
 510        r->end = next;
 511        return 0;
 512}
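/*
 * The ring above deliberately keeps one slot unused (hence the MCE_RING_SIZE
 * comment): r->start == r->end means "empty", so the producer refuses to
 * advance r->end onto r->start when the ring is full.  With a single producer
 * (MCE context) and a single consumer (the workqueue) no locking is needed,
 * only the wmb() that orders the PFN store before publishing the new r->end.
 */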
 513
 514int mce_available(struct cpuinfo_x86 *c)
 515{
 516        if (mce_disabled)
 517                return 0;
 518        return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 519}
 520
 521static void mce_schedule_work(void)
 522{
 523        if (!mce_ring_empty()) {
 524                struct work_struct *work = &__get_cpu_var(mce_work);
 525                if (!work_pending(work))
 526                        schedule_work(work);
 527        }
 528}
 529
 530DEFINE_PER_CPU(struct irq_work, mce_irq_work);
 531
 532static void mce_irq_work_cb(struct irq_work *entry)
 533{
 534        mce_notify_irq();
 535        mce_schedule_work();
 536}
 537
 538static void mce_report_event(struct pt_regs *regs)
 539{
 540        if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
 541                mce_notify_irq();
 542                /*
 543                 * Triggering the work queue here is just an insurance
 544                 * policy in case the syscall exit notify handler
 545                 * doesn't run soon enough or ends up running on the
 546                 * wrong CPU (can happen when audit sleeps)
 547                 */
 548                mce_schedule_work();
 549                return;
 550        }
 551
 552        irq_work_queue(&__get_cpu_var(mce_irq_work));
 553}
 554
 555/*
 556 * Read ADDR and MISC registers.
 557 */
 558static void mce_read_aux(struct mce *m, int i)
 559{
 560        if (m->status & MCI_STATUS_MISCV)
 561                m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
 562        if (m->status & MCI_STATUS_ADDRV) {
 563                m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
 564
 565                /*
 566                 * Mask the reported address by the reported granularity.
 567                 */
 568                if (mce_ser && (m->status & MCI_STATUS_MISCV)) {
 569                        u8 shift = MCI_MISC_ADDR_LSB(m->misc);
 570                        m->addr >>= shift;
 571                        m->addr <<= shift;
 572                }
 573        }
 574}
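/*
 * Illustration of the masking above (values assumed, not taken from real
 * hardware): if MCI_MISC_ADDR_LSB reports 6, the low six address bits are
 * cleared, i.e. the logged address identifies a 64-byte cache line rather
 * than an exact byte within it.
 */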
 575
 576DEFINE_PER_CPU(unsigned, mce_poll_count);
 577
 578/*
 579 * Poll for corrected events or events that happened before reset.
 580 * Those are just logged through /dev/mcelog.
 581 *
 582 * This is executed in standard interrupt context.
 583 *
  584 * Note: the spec recommends panicking for fatal unsignalled
  585 * errors here. However this would be quite problematic --
  586 * we would need to reimplement the Monarch handling and
  587 * it would mess up the exclusion between the exception handler
  588 * and the poll handler -- so we skip this for now.
  589 * These cases should not happen anyway, or only when the CPU
  590 * is already totally confused. In this case it's likely it will
 591 * not fully execute the machine check handler either.
 592 */
 593void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 594{
 595        struct mce m;
 596        int i;
 597
 598        this_cpu_inc(mce_poll_count);
 599
 600        mce_gather_info(&m, NULL);
 601
 602        for (i = 0; i < banks; i++) {
 603                if (!mce_banks[i].ctl || !test_bit(i, *b))
 604                        continue;
 605
 606                m.misc = 0;
 607                m.addr = 0;
 608                m.bank = i;
 609                m.tsc = 0;
 610
 611                barrier();
 612                m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
 613                if (!(m.status & MCI_STATUS_VAL))
 614                        continue;
 615
 616                /*
 617                 * Uncorrected or signalled events are handled by the exception
 618                 * handler when it is enabled, so don't process those here.
 619                 *
 620                 * TBD do the same check for MCI_STATUS_EN here?
 621                 */
 622                if (!(flags & MCP_UC) &&
 623                    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
 624                        continue;
 625
 626                mce_read_aux(&m, i);
 627
 628                if (!(flags & MCP_TIMESTAMP))
 629                        m.tsc = 0;
 630                /*
 631                 * Don't get the IP here because it's unlikely to
 632                 * have anything to do with the actual error location.
 633                 */
 634                if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce)
 635                        mce_log(&m);
 636
 637                /*
 638                 * Clear state for this bank.
 639                 */
 640                mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
 641        }
 642
 643        /*
 644         * Don't clear MCG_STATUS here because it's only defined for
 645         * exceptions.
 646         */
 647
 648        sync_core();
 649}
 650EXPORT_SYMBOL_GPL(machine_check_poll);
 651
 652/*
 653 * Do a quick check if any of the events requires a panic.
 654 * This decides if we keep the events around or clear them.
 655 */
 656static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
 657                          struct pt_regs *regs)
 658{
 659        int i, ret = 0;
 660
 661        for (i = 0; i < banks; i++) {
 662                m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
 663                if (m->status & MCI_STATUS_VAL) {
 664                        __set_bit(i, validp);
 665                        if (quirk_no_way_out)
 666                                quirk_no_way_out(i, m, regs);
 667                }
 668                if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
 669                        ret = 1;
 670        }
 671        return ret;
 672}
 673
 674/*
 675 * Variable to establish order between CPUs while scanning.
  676 * Each CPU spins initially until mce_executing is equal to its number.
 677 */
 678static atomic_t mce_executing;
 679
 680/*
 681 * Defines order of CPUs on entry. First CPU becomes Monarch.
 682 */
 683static atomic_t mce_callin;
 684
 685/*
 686 * Check if a timeout waiting for other CPUs happened.
 687 */
 688static int mce_timed_out(u64 *t)
 689{
 690        /*
 691         * The others already did panic for some reason.
 692         * Bail out like in a timeout.
 693         * rmb() to tell the compiler that system_state
 694         * might have been modified by someone else.
 695         */
 696        rmb();
 697        if (atomic_read(&mce_paniced))
 698                wait_for_panic();
 699        if (!monarch_timeout)
 700                goto out;
 701        if ((s64)*t < SPINUNIT) {
 702                /* CHECKME: Make panic default for 1 too? */
 703                if (tolerant < 1)
 704                        mce_panic("Timeout synchronizing machine check over CPUs",
 705                                  NULL, NULL);
 706                cpu_missing = 1;
 707                return 1;
 708        }
 709        *t -= SPINUNIT;
 710out:
 711        touch_nmi_watchdog();
 712        return 0;
 713}
 714
 715/*
 716 * The Monarch's reign.  The Monarch is the CPU who entered
 717 * the machine check handler first. It waits for the others to
  718 * raise the exception too and then grades them. If any
  719 * error is fatal, it panics. Only then does it let the others continue.
 720 *
 721 * The other CPUs entering the MCE handler will be controlled by the
 722 * Monarch. They are called Subjects.
 723 *
  724 * This way we prevent any potential data corruption in an unrecoverable case
  725 * and also make sure that all CPUs' errors are always examined.
 726 *
 727 * Also this detects the case of a machine check event coming from outer
  728 * space (not detected by any CPU). In this case some external agent wants
 729 * us to shut down, so panic too.
 730 *
 731 * The other CPUs might still decide to panic if the handler happens
  732 * in an unrecoverable place, but in this case the system is in a semi-stable
 733 * state and won't corrupt anything by itself. It's ok to let the others
 734 * continue for a bit first.
 735 *
 736 * All the spin loops have timeouts; when a timeout happens a CPU
 737 * typically elects itself to be Monarch.
 738 */
 739static void mce_reign(void)
 740{
 741        int cpu;
 742        struct mce *m = NULL;
 743        int global_worst = 0;
 744        char *msg = NULL;
 745        char *nmsg = NULL;
 746
 747        /*
 748         * This CPU is the Monarch and the other CPUs have run
 749         * through their handlers.
 750         * Grade the severity of the errors of all the CPUs.
 751         */
 752        for_each_possible_cpu(cpu) {
 753                int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
 754                                            &nmsg);
 755                if (severity > global_worst) {
 756                        msg = nmsg;
 757                        global_worst = severity;
 758                        m = &per_cpu(mces_seen, cpu);
 759                }
 760        }
 761
 762        /*
 763         * Cannot recover? Panic here then.
 764         * This dumps all the mces in the log buffer and stops the
 765         * other CPUs.
 766         */
 767        if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
 768                mce_panic("Fatal Machine check", m, msg);
 769
 770        /*
  771         * For a UC error somewhere we let the CPU that detects it handle it.
  772         * We also must let the others continue, otherwise the handling
 773         * CPU could deadlock on a lock.
 774         */
 775
 776        /*
 777         * No machine check event found. Must be some external
 778         * source or one CPU is hung. Panic.
 779         */
 780        if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3)
 781                mce_panic("Machine check from unknown source", NULL, NULL);
 782
 783        /*
 784         * Now clear all the mces_seen so that they don't reappear on
 785         * the next mce.
 786         */
 787        for_each_possible_cpu(cpu)
 788                memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
 789}
 790
 791static atomic_t global_nwo;
 792
 793/*
 794 * Start of Monarch synchronization. This waits until all CPUs have
 795 * entered the exception handler and then determines if any of them
  796 * saw a fatal event that requires a panic. Then it lets them execute
  797 * in their entry order.
 798 * TBD double check parallel CPU hotunplug
 799 */
 800static int mce_start(int *no_way_out)
 801{
 802        int order;
 803        int cpus = num_online_cpus();
 804        u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
 805
 806        if (!timeout)
 807                return -1;
 808
 809        atomic_add(*no_way_out, &global_nwo);
 810        /*
 811         * global_nwo should be updated before mce_callin
 812         */
 813        smp_wmb();
 814        order = atomic_inc_return(&mce_callin);
 815
 816        /*
 817         * Wait for everyone.
 818         */
 819        while (atomic_read(&mce_callin) != cpus) {
 820                if (mce_timed_out(&timeout)) {
 821                        atomic_set(&global_nwo, 0);
 822                        return -1;
 823                }
 824                ndelay(SPINUNIT);
 825        }
 826
 827        /*
 828         * mce_callin should be read before global_nwo
 829         */
 830        smp_rmb();
 831
 832        if (order == 1) {
 833                /*
 834                 * Monarch: Starts executing now, the others wait.
 835                 */
 836                atomic_set(&mce_executing, 1);
 837        } else {
 838                /*
 839                 * Subject: Now start the scanning loop one by one in
 840                 * the original callin order.
  841                 * This way any shared banks are seen by only one CPU
  842                 * before being cleared, avoiding duplicates.
 843                 */
 844                while (atomic_read(&mce_executing) < order) {
 845                        if (mce_timed_out(&timeout)) {
 846                                atomic_set(&global_nwo, 0);
 847                                return -1;
 848                        }
 849                        ndelay(SPINUNIT);
 850                }
 851        }
 852
 853        /*
 854         * Cache the global no_way_out state.
 855         */
 856        *no_way_out = atomic_read(&global_nwo);
 857
 858        return order;
 859}
 860
 861/*
 862 * Synchronize between CPUs after main scanning loop.
 863 * This invokes the bulk of the Monarch processing.
 864 */
 865static int mce_end(int order)
 866{
 867        int ret = -1;
 868        u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
 869
 870        if (!timeout)
 871                goto reset;
 872        if (order < 0)
 873                goto reset;
 874
 875        /*
 876         * Allow others to run.
 877         */
 878        atomic_inc(&mce_executing);
 879
 880        if (order == 1) {
 881                /* CHECKME: Can this race with a parallel hotplug? */
 882                int cpus = num_online_cpus();
 883
 884                /*
 885                 * Monarch: Wait for everyone to go through their scanning
 886                 * loops.
 887                 */
 888                while (atomic_read(&mce_executing) <= cpus) {
 889                        if (mce_timed_out(&timeout))
 890                                goto reset;
 891                        ndelay(SPINUNIT);
 892                }
 893
 894                mce_reign();
 895                barrier();
 896                ret = 0;
 897        } else {
 898                /*
 899                 * Subject: Wait for Monarch to finish.
 900                 */
 901                while (atomic_read(&mce_executing) != 0) {
 902                        if (mce_timed_out(&timeout))
 903                                goto reset;
 904                        ndelay(SPINUNIT);
 905                }
 906
 907                /*
 908                 * Don't reset anything. That's done by the Monarch.
 909                 */
 910                return 0;
 911        }
 912
 913        /*
 914         * Reset all global state.
 915         */
 916reset:
 917        atomic_set(&global_nwo, 0);
 918        atomic_set(&mce_callin, 0);
 919        barrier();
 920
 921        /*
 922         * Let others run again.
 923         */
 924        atomic_set(&mce_executing, 0);
 925        return ret;
 926}
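/*
 * To illustrate the rendezvous with, say, three online CPUs: each CPU bumps
 * mce_callin in mce_start() and spins until it equals num_online_cpus().
 * The CPU that drew order == 1 (the Monarch) scans its banks first; each
 * Subject starts scanning once mce_executing reaches its own order, which
 * happens when the previous CPU increments it in mce_end().  The Monarch
 * then waits for mce_executing to pass the CPU count, runs mce_reign(), and
 * finally resets the global state so the Subjects can return.
 */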
 927
 928/*
 929 * Check if the address reported by the CPU is in a format we can parse.
 930 * It would be possible to add code for most other cases, but all would
 931 * be somewhat complicated (e.g. segment offset would require an instruction
  932 * parser). So only support physical addresses up to page granularity for now.
 933 */
 934static int mce_usable_address(struct mce *m)
 935{
 936        if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
 937                return 0;
 938        if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
 939                return 0;
 940        if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
 941                return 0;
 942        return 1;
 943}
 944
 945static void mce_clear_state(unsigned long *toclear)
 946{
 947        int i;
 948
 949        for (i = 0; i < banks; i++) {
 950                if (test_bit(i, toclear))
 951                        mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
 952        }
 953}
 954
 955/*
  956 * We need to save the faulting physical address associated with a process
  957 * in the machine check handler some place where we can grab it back
  958 * later in mce_notify_process().
 959 */
 960#define MCE_INFO_MAX    16
 961
 962struct mce_info {
 963        atomic_t                inuse;
 964        struct task_struct      *t;
 965        __u64                   paddr;
 966        int                     restartable;
 967} mce_info[MCE_INFO_MAX];
 968
 969static void mce_save_info(__u64 addr, int c)
 970{
 971        struct mce_info *mi;
 972
 973        for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
 974                if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
 975                        mi->t = current;
 976                        mi->paddr = addr;
 977                        mi->restartable = c;
 978                        return;
 979                }
 980        }
 981
 982        mce_panic("Too many concurrent recoverable errors", NULL, NULL);
 983}
 984
 985static struct mce_info *mce_find_info(void)
 986{
 987        struct mce_info *mi;
 988
 989        for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
 990                if (atomic_read(&mi->inuse) && mi->t == current)
 991                        return mi;
 992        return NULL;
 993}
 994
 995static void mce_clear_info(struct mce_info *mi)
 996{
 997        atomic_set(&mi->inuse, 0);
 998}
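/*
 * The three helpers above form a small per-task stash: do_machine_check()
 * calls mce_save_info() when it sees an action-required fault, and the
 * affected task later finds its own entry in mce_notify_process() by
 * matching ->t against current, then releases the slot with
 * mce_clear_info().
 */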
 999
1000/*
1001 * The actual machine check handler. This only handles real
1002 * exceptions when something got corrupted coming in through int 18.
1003 *
1004 * This is executed in NMI context not subject to normal locking rules. This
1005 * implies that most kernel services cannot be safely used. Don't even
1006 * think about putting a printk in there!
1007 *
1008 * On Intel systems this is entered on all CPUs in parallel through
1009 * MCE broadcast. However some CPUs might be broken beyond repair,
1010 * so always be careful when synchronizing with others.
1011 */
1012void do_machine_check(struct pt_regs *regs, long error_code)
1013{
1014        struct mce m, *final;
1015        int i;
1016        int worst = 0;
1017        int severity;
1018        /*
1019         * Establish sequential order between the CPUs entering the machine
1020         * check handler.
1021         */
1022        int order;
1023        /*
1024         * If no_way_out gets set, there is no safe way to recover from this
1025         * MCE.  If tolerant is cranked up, we'll try anyway.
1026         */
1027        int no_way_out = 0;
1028        /*
1029         * If kill_it gets set, there might be a way to recover from this
1030         * error.
1031         */
1032        int kill_it = 0;
1033        DECLARE_BITMAP(toclear, MAX_NR_BANKS);
1034        DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
1035        char *msg = "Unknown";
1036
1037        atomic_inc(&mce_entry);
1038
1039        this_cpu_inc(mce_exception_count);
1040
1041        if (!banks)
1042                goto out;
1043
1044        mce_gather_info(&m, regs);
1045
1046        final = &__get_cpu_var(mces_seen);
1047        *final = m;
1048
1049        memset(valid_banks, 0, sizeof(valid_banks));
1050        no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
1051
1052        barrier();
1053
1054        /*
1055         * If there is no restart IP we might need to kill or panic.
1056         * Assume the worst for now, but if we find the
1057         * severity is MCE_AR_SEVERITY we have other options.
1058         */
1059        if (!(m.mcgstatus & MCG_STATUS_RIPV))
1060                kill_it = 1;
1061
1062        /*
1063         * Go through all the banks, in mutual exclusion with the other CPUs.
1064         * This way we don't report duplicated events on shared banks
1065         * because the first one to see it will clear it.
1066         */
1067        order = mce_start(&no_way_out);
1068        for (i = 0; i < banks; i++) {
1069                __clear_bit(i, toclear);
1070                if (!test_bit(i, valid_banks))
1071                        continue;
1072                if (!mce_banks[i].ctl)
1073                        continue;
1074
1075                m.misc = 0;
1076                m.addr = 0;
1077                m.bank = i;
1078
1079                m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
1080                if ((m.status & MCI_STATUS_VAL) == 0)
1081                        continue;
1082
1083                /*
1084                 * Errors that are neither uncorrected nor signaled are handled
1085                 * by machine_check_poll(). Leave them alone, unless this panics.
1086                 */
1087                if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
1088                        !no_way_out)
1089                        continue;
1090
1091                /*
1092                 * Set taint even when machine check was not enabled.
1093                 */
1094                add_taint(TAINT_MACHINE_CHECK);
1095
1096                severity = mce_severity(&m, tolerant, NULL);
1097
1098                /*
1099                 * When the machine check was left for the corrected (poll) handler,
1100                 * don't touch it unless we're panicking.
1101                 */
1102                if (severity == MCE_KEEP_SEVERITY && !no_way_out)
1103                        continue;
1104                __set_bit(i, toclear);
1105                if (severity == MCE_NO_SEVERITY) {
1106                        /*
1107                         * Machine check event was not enabled. Clear, but
1108                         * ignore.
1109                         */
1110                        continue;
1111                }
1112
1113                mce_read_aux(&m, i);
1114
1115                /*
1116                 * Action optional error. Queue address for later processing.
1117                 * When the ring overflows we just ignore the AO error.
1118                 * RED-PEN add some logging mechanism when
1119                 * mce_usable_address() or mce_ring_add() fails.
1120                 * RED-PEN don't ignore overflow for tolerant == 0
1121                 */
1122                if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
1123                        mce_ring_add(m.addr >> PAGE_SHIFT);
1124
1125                mce_log(&m);
1126
1127                if (severity > worst) {
1128                        *final = m;
1129                        worst = severity;
1130                }
1131        }
1132
1133        /* mce_clear_state will clear *final, save locally for use later */
1134        m = *final;
1135
1136        if (!no_way_out)
1137                mce_clear_state(toclear);
1138
1139        /*
1140         * Do most of the synchronization with other CPUs.
1141         * When there's any problem use only local no_way_out state.
1142         */
1143        if (mce_end(order) < 0)
1144                no_way_out = worst >= MCE_PANIC_SEVERITY;
1145
1146        /*
1147         * At insane "tolerant" levels we take no action. Otherwise
1148         * we only die if we have no other choice. For less serious
1149         * issues we try to recover, or limit damage to the current
1150         * process.
1151         */
1152        if (tolerant < 3) {
1153                if (no_way_out)
1154                        mce_panic("Fatal machine check on current CPU", &m, msg);
1155                if (worst == MCE_AR_SEVERITY) {
1156                        /* schedule action before return to userland */
1157                        mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV);
1158                        set_thread_flag(TIF_MCE_NOTIFY);
1159                } else if (kill_it) {
1160                        force_sig(SIGBUS, current);
1161                }
1162        }
1163
1164        if (worst > 0)
1165                mce_report_event(regs);
1166        mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1167out:
1168        atomic_dec(&mce_entry);
1169        sync_core();
1170}
1171EXPORT_SYMBOL_GPL(do_machine_check);
1172
1173#ifndef CONFIG_MEMORY_FAILURE
1174int memory_failure(unsigned long pfn, int vector, int flags)
1175{
1176        /* mce_severity() should not hand us an ACTION_REQUIRED error */
1177        BUG_ON(flags & MF_ACTION_REQUIRED);
1178        pr_err("Uncorrected memory error in page 0x%lx ignored\n"
1179               "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
1180               pfn);
1181
1182        return 0;
1183}
1184#endif
1185
1186/*
1187 * Called in the process context that was interrupted by the MCE and marked with
1188 * TIF_MCE_NOTIFY, just before returning to erroneous userland.
1189 * This code is allowed to sleep.
1190 * Attempt possible recovery such as calling the high level VM handler to
1191 * process any corrupted pages, and kill/signal current process if required.
1192 * Action required errors are handled here.
1193 */
1194void mce_notify_process(void)
1195{
1196        unsigned long pfn;
1197        struct mce_info *mi = mce_find_info();
1198        int flags = MF_ACTION_REQUIRED;
1199
1200        if (!mi)
1201                mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
1202        pfn = mi->paddr >> PAGE_SHIFT;
1203
1204        clear_thread_flag(TIF_MCE_NOTIFY);
1205
1206        pr_err("Uncorrected hardware memory error in user-access at %llx",
1207                 mi->paddr);
1208        /*
1209         * We must call memory_failure() here even if the current process is
1210         * doomed. We still need to mark the page as poisoned and alert any
1211         * other users of the page.
1212         */
1213        if (!mi->restartable)
1214                flags |= MF_MUST_KILL;
1215        if (memory_failure(pfn, MCE_VECTOR, flags) < 0) {
1216                pr_err("Memory error not recovered");
1217                force_sig(SIGBUS, current);
1218        }
1219        mce_clear_info(mi);
1220}
1221
1222/*
1223 * Action optional processing happens here (picking up
1224 * from the list of faulting pages that do_machine_check()
1225 * placed into the "ring").
1226 */
1227static void mce_process_work(struct work_struct *dummy)
1228{
1229        unsigned long pfn;
1230
1231        while (mce_ring_get(&pfn))
1232                memory_failure(pfn, MCE_VECTOR, 0);
1233}
1234
1235#ifdef CONFIG_X86_MCE_INTEL
1236/***
1237 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
1238 * @status: Event status information
1240 *
1241 * This function should be called by the thermal interrupt after the
1242 * event has been processed and the decision was made to log the event
1243 * further.
1244 *
1245 * The status parameter will be saved to the 'status' field of 'struct mce'
1246 * and historically has been the register value of the
1247 * MSR_IA32_THERMAL_STATUS (Intel) msr.
1248 */
1249void mce_log_therm_throt_event(__u64 status)
1250{
1251        struct mce m;
1252
1253        mce_setup(&m);
1254        m.bank = MCE_THERMAL_BANK;
1255        m.status = status;
1256        mce_log(&m);
1257}
1258#endif /* CONFIG_X86_MCE_INTEL */
1259
1260/*
1261 * Periodic polling timer for "silent" machine check errors.  If the
1262 * poller finds an MCE, poll 2x faster.  When the poller finds no more
1263 * errors, poll 2x slower (up to check_interval seconds).
1264 */
1265static unsigned long check_interval = 5 * 60; /* 5 minutes */
1266
1267static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
1268static DEFINE_PER_CPU(struct timer_list, mce_timer);
1269
1270static unsigned long mce_adjust_timer_default(unsigned long interval)
1271{
1272        return interval;
1273}
1274
1275static unsigned long (*mce_adjust_timer)(unsigned long interval) =
1276        mce_adjust_timer_default;
1277
1278static void mce_timer_fn(unsigned long data)
1279{
1280        struct timer_list *t = &__get_cpu_var(mce_timer);
1281        unsigned long iv;
1282
1283        WARN_ON(smp_processor_id() != data);
1284
1285        if (mce_available(__this_cpu_ptr(&cpu_info))) {
1286                machine_check_poll(MCP_TIMESTAMP,
1287                                &__get_cpu_var(mce_poll_banks));
1288                mce_intel_cmci_poll();
1289        }
1290
1291        /*
1292         * Alert userspace if needed.  If we logged an MCE, reduce the
1293         * polling interval, otherwise increase the polling interval.
1294         */
1295        iv = __this_cpu_read(mce_next_interval);
1296        if (mce_notify_irq()) {
1297                iv = max(iv / 2, (unsigned long) HZ/100);
1298        } else {
1299                iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
1300                iv = mce_adjust_timer(iv);
1301        }
1302        __this_cpu_write(mce_next_interval, iv);
1303        /* Might have become 0 after CMCI storm subsided */
1304        if (iv) {
1305                t->expires = jiffies + iv;
1306                add_timer_on(t, smp_processor_id());
1307        }
1308}
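/*
 * Example of the adaptive polling above, assuming HZ=1000 and the default
 * check_interval of five minutes: a round that finds freshly logged events
 * halves the interval (but never below HZ/100, i.e. 10ms), while a quiet
 * round doubles it again, up to the five-minute ceiling.
 */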
1309
1310/*
1311 * Ensure that the timer is firing in @interval from now.
1312 */
1313void mce_timer_kick(unsigned long interval)
1314{
1315        struct timer_list *t = &__get_cpu_var(mce_timer);
1316        unsigned long when = jiffies + interval;
1317        unsigned long iv = __this_cpu_read(mce_next_interval);
1318
1319        if (timer_pending(t)) {
1320                if (time_before(when, t->expires))
1321                        mod_timer_pinned(t, when);
1322        } else {
1323                t->expires = round_jiffies(when);
1324                add_timer_on(t, smp_processor_id());
1325        }
1326        if (interval < iv)
1327                __this_cpu_write(mce_next_interval, interval);
1328}
1329
1330/* Must not be called in IRQ context where del_timer_sync() can deadlock */
1331static void mce_timer_delete_all(void)
1332{
1333        int cpu;
1334
1335        for_each_online_cpu(cpu)
1336                del_timer_sync(&per_cpu(mce_timer, cpu));
1337}
1338
1339static void mce_do_trigger(struct work_struct *work)
1340{
1341        call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
1342}
1343
1344static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
1345
1346/*
1347 * Notify the user(s) about new machine check events.
1348 * Can be called from interrupt context, but not from machine check/NMI
1349 * context.
1350 */
1351int mce_notify_irq(void)
1352{
1353        /* Not more than two messages every minute */
1354        static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1355
1356        if (test_and_clear_bit(0, &mce_need_notify)) {
1357                /* wake processes polling /dev/mcelog */
1358                wake_up_interruptible(&mce_chrdev_wait);
1359
1360                /*
1361                 * There is no risk of missing notifications because
1362                 * work_pending is always cleared before the function is
1363                 * executed.
1364                 */
1365                if (mce_helper[0] && !work_pending(&mce_trigger_work))
1366                        schedule_work(&mce_trigger_work);
1367
1368                if (__ratelimit(&ratelimit))
1369                        pr_info(HW_ERR "Machine check events logged\n");
1370
1371                return 1;
1372        }
1373        return 0;
1374}
1375EXPORT_SYMBOL_GPL(mce_notify_irq);
1376
1377static int __cpuinit __mcheck_cpu_mce_banks_init(void)
1378{
1379        int i;
1380
1381        mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
1382        if (!mce_banks)
1383                return -ENOMEM;
1384        for (i = 0; i < banks; i++) {
1385                struct mce_bank *b = &mce_banks[i];
1386
1387                b->ctl = -1ULL;
1388                b->init = 1;
1389        }
1390        return 0;
1391}
1392
1393/*
1394 * Initialize Machine Checks for a CPU.
1395 */
1396static int __cpuinit __mcheck_cpu_cap_init(void)
1397{
1398        unsigned b;
1399        u64 cap;
1400
1401        rdmsrl(MSR_IA32_MCG_CAP, cap);
1402
1403        b = cap & MCG_BANKCNT_MASK;
1404        if (!banks)
1405                pr_info("CPU supports %d MCE banks\n", b);
1406
1407        if (b > MAX_NR_BANKS) {
1408                pr_warn("Using only %u machine check banks out of %u\n",
1409                        MAX_NR_BANKS, b);
1410                b = MAX_NR_BANKS;
1411        }
1412
1413        /* Don't support asymmetric configurations today */
1414        WARN_ON(banks != 0 && b != banks);
1415        banks = b;
1416        if (!mce_banks) {
1417                int err = __mcheck_cpu_mce_banks_init();
1418
1419                if (err)
1420                        return err;
1421        }
1422
1423        /* Use accurate RIP reporting if available. */
1424        if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1425                rip_msr = MSR_IA32_MCG_EIP;
1426
1427        if (cap & MCG_SER_P)
1428                mce_ser = 1;
1429
1430        return 0;
1431}
1432
1433static void __mcheck_cpu_init_generic(void)
1434{
1435        mce_banks_t all_banks;
1436        u64 cap;
1437        int i;
1438
1439        /*
1440         * Log the machine checks left over from the previous reset.
1441         */
1442        bitmap_fill(all_banks, MAX_NR_BANKS);
1443        machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
1444
1445        set_in_cr4(X86_CR4_MCE);
1446
1447        rdmsrl(MSR_IA32_MCG_CAP, cap);
1448        if (cap & MCG_CTL_P)
1449                wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1450
1451        for (i = 0; i < banks; i++) {
1452                struct mce_bank *b = &mce_banks[i];
1453
1454                if (!b->init)
1455                        continue;
1456                wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
1457                wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
1458        }
1459}
1460
1461/*
1462 * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
1463 * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
1464 * Vol 3B Table 15-20). But this confuses both the code that determines
1465 * whether the machine check occurred in kernel or user mode, and also
1466 * the severity assessment code. Pretend that EIPV was set, and take the
1467 * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
1468 */
1469static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
1470{
1471        if (bank != 0)
1472                return;
1473        if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
1474                return;
1475        if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
1476                          MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
1477                          MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
1478                          MCACOD)) !=
1479                         (MCI_STATUS_UC|MCI_STATUS_EN|
1480                          MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
1481                          MCI_STATUS_AR|MCACOD_INSTR))
1482                return;
1483
1484        m->mcgstatus |= MCG_STATUS_EIPV;
1485        m->ip = regs->ip;
1486        m->cs = regs->cs;
1487}
1488
1489/* Add per CPU specific workarounds here */
1490static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1491{
1492        if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1493                pr_info("unknown CPU type - not enabling MCE support\n");
1494                return -EOPNOTSUPP;
1495        }
1496
1497        /* This should be disabled by the BIOS, but isn't always */
1498        if (c->x86_vendor == X86_VENDOR_AMD) {
1499                if (c->x86 == 15 && banks > 4) {
1500                        /*
1501                         * disable GART TBL walk error reporting, which
1502                         * trips off incorrectly with the IOMMU & 3ware
1503                         * & Cerberus:
1504                         */
1505                        clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
1506                }
1507                if (c->x86 <= 17 && mce_bootlog < 0) {
1508                        /*
1509                         * Lots of broken BIOSes around that don't clear them
1510                         * by default and leave crap in there. Don't log:
1511                         */
1512                        mce_bootlog = 0;
1513                }
1514                /*
1515                 * Various K7s with broken bank 0 around. Always disable
1516                 * by default.
1517                 */
1518                 if (c->x86 == 6 && banks > 0)
1519                        mce_banks[0].ctl = 0;
1520
1521                 /*
1522                  * Turn off MC4_MISC thresholding banks on those models since
1523                  * they're not supported there.
1524                  */
1525                 if (c->x86 == 0x15 &&
1526                     (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
1527                         int i;
1528                         u64 val, hwcr;
1529                         bool need_toggle;
1530                         u32 msrs[] = {
1531                                0x00000413, /* MC4_MISC0 */
1532                                0xc0000408, /* MC4_MISC1 */
1533                         };
1534
1535                         rdmsrl(MSR_K7_HWCR, hwcr);
1536
1537                         /* McStatusWrEn has to be set */
1538                         need_toggle = !(hwcr & BIT(18));
1539
1540                         if (need_toggle)
1541                                 wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
1542
1543                         for (i = 0; i < ARRAY_SIZE(msrs); i++) {
1544                                 rdmsrl(msrs[i], val);
1545
1546                                 /* CntP bit set? */
1547                                 if (val & BIT_64(62)) {
1548                                        val &= ~BIT_64(62);
1549                                        wrmsrl(msrs[i], val);
1550                                 }
1551                         }
1552
1553                         /* restore old settings */
1554                         if (need_toggle)
1555                                 wrmsrl(MSR_K7_HWCR, hwcr);
1556                 }
1557        }
1558
1559        if (c->x86_vendor == X86_VENDOR_INTEL) {
1560                /*
1561                 * The SDM documents that on family 6, bank 0 should not be written
1562                 * because it aliases to another special BIOS controlled
1563                 * register.
1564                 * But it's not aliased anymore on model 0x1a and later.
1565                 * Don't ignore bank 0 completely because there could be a
1566                 * valid event later, merely don't write CTL0.
1567                 */
1568
1569                if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0)
1570                        mce_banks[0].init = 0;
1571
1572                /*
1573                 * All newer Intel systems support MCE broadcasting. Enable
1574                 * synchronization with a one second timeout.
1575                 */
1576                if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1577                        monarch_timeout < 0)
1578                        monarch_timeout = USEC_PER_SEC;
1579
1580                /*
1581                 * There are also broken BIOSes on some Pentium M and
1582                 * earlier systems:
1583                 */
1584                if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0)
1585                        mce_bootlog = 0;
1586
1587                if (c->x86 == 6 && c->x86_model == 45)
1588                        quirk_no_way_out = quirk_sandybridge_ifu;
1589        }
1590        if (monarch_timeout < 0)
1591                monarch_timeout = 0;
1592        if (mce_bootlog != 0)
1593                mce_panic_timeout = 30;
1594
1595        return 0;
1596}
1597
1598static int __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1599{
1600        if (c->x86 != 5)
1601                return 0;
1602
1603        switch (c->x86_vendor) {
1604        case X86_VENDOR_INTEL:
1605                intel_p5_mcheck_init(c);
1606                return 1;
1607                break;
1608        case X86_VENDOR_CENTAUR:
1609                winchip_mcheck_init(c);
1610                return 1;
1611                break;
1612        }
1613
1614        return 0;
1615}
1616
1617static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1618{
1619        switch (c->x86_vendor) {
1620        case X86_VENDOR_INTEL:
1621                mce_intel_feature_init(c);
1622                mce_adjust_timer = mce_intel_adjust_timer;
1623                break;
1624        case X86_VENDOR_AMD:
1625                mce_amd_feature_init(c);
1626                break;
1627        default:
1628                break;
1629        }
1630}
1631
1632static void mce_start_timer(unsigned int cpu, struct timer_list *t)
1633{
1634        unsigned long iv = mce_adjust_timer(check_interval * HZ);
1635
1636        __this_cpu_write(mce_next_interval, iv);
1637
1638        if (mce_ignore_ce || !iv)
1639                return;
1640
1641        t->expires = round_jiffies(jiffies + iv);
1642        add_timer_on(t, smp_processor_id());
1643}
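    /*
     * The interval above starts from check_interval (seconds) converted
     * to jiffies and may then be shortened or stretched by the vendor
     * hook installed in __mcheck_cpu_init_vendor().  With mce=ignore_ce,
     * or when the adjusted interval comes back as zero, the polling
     * timer is simply left disarmed.
     */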
1644
1645static void __mcheck_cpu_init_timer(void)
1646{
1647        struct timer_list *t = &__get_cpu_var(mce_timer);
1648        unsigned int cpu = smp_processor_id();
1649
1650        setup_timer(t, mce_timer_fn, cpu);
1651        mce_start_timer(cpu, t);
1652}
1653
1654/* Handle unconfigured int18 (should never happen) */
1655static void unexpected_machine_check(struct pt_regs *regs, long error_code)
1656{
1657        pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
1658               smp_processor_id());
1659}
1660
1661/* Call the installed machine check handler for this CPU setup. */
1662void (*machine_check_vector)(struct pt_regs *, long error_code) =
1663                                                unexpected_machine_check;
1664
1665/*
1666 * Called for each booted CPU to set up machine checks.
1667 * Must be called with preempt off:
1668 */
1669void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
1670{
1671        if (mce_disabled)
1672                return;
1673
1674        if (__mcheck_cpu_ancient_init(c))
1675                return;
1676
1677        if (!mce_available(c))
1678                return;
1679
1680        if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
1681                mce_disabled = 1;
1682                return;
1683        }
1684
1685        machine_check_vector = do_machine_check;
1686
1687        __mcheck_cpu_init_generic();
1688        __mcheck_cpu_init_vendor(c);
1689        __mcheck_cpu_init_timer();
1690        INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
1691        init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb);
1692}
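    /*
     * Init order in short: the P5/WinChip path above short-circuits
     * everything else; otherwise capability detection and the quirks
     * run first (either can disable MCE entirely on failure), followed
     * by the generic MSR setup, the vendor-specific features, the
     * polling timer, and finally the work item and irq_work used to
     * push event processing out of the machine check context.
     */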
1693
1694/*
1695 * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
1696 */
1697
1698static DEFINE_SPINLOCK(mce_chrdev_state_lock);
1699static int mce_chrdev_open_count;       /* #times opened */
1700static int mce_chrdev_open_exclu;       /* already open exclusive? */
1701
1702static int mce_chrdev_open(struct inode *inode, struct file *file)
1703{
1704        spin_lock(&mce_chrdev_state_lock);
1705
1706        if (mce_chrdev_open_exclu ||
1707            (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
1708                spin_unlock(&mce_chrdev_state_lock);
1709
1710                return -EBUSY;
1711        }
1712
1713        if (file->f_flags & O_EXCL)
1714                mce_chrdev_open_exclu = 1;
1715        mce_chrdev_open_count++;
1716
1717        spin_unlock(&mce_chrdev_state_lock);
1718
1719        return nonseekable_open(inode, file);
1720}
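    /*
     * Open semantics: any number of plain readers may hold /dev/mcelog
     * open at once, but an O_EXCL opener excludes everybody else and is
     * itself refused while other openers exist.  A sketch of how a
     * dedicated logging daemon might claim the device (hypothetical
     * userspace code, not part of this file):
     *
     *        int fd = open("/dev/mcelog", O_RDONLY | O_EXCL);
     *
     * which fails with EBUSY while any other reader has the device open.
     */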
1721
1722static int mce_chrdev_release(struct inode *inode, struct file *file)
1723{
1724        spin_lock(&mce_chrdev_state_lock);
1725
1726        mce_chrdev_open_count--;
1727        mce_chrdev_open_exclu = 0;
1728
1729        spin_unlock(&mce_chrdev_state_lock);
1730
1731        return 0;
1732}
1733
1734static void collect_tscs(void *data)
1735{
1736        unsigned long *cpu_tsc = (unsigned long *)data;
1737
1738        rdtscll(cpu_tsc[smp_processor_id()]);
1739}
1740
1741static int mce_apei_read_done;
1742
1743/* Collect MCE records of a previous boot from persistent storage via APEI ERST. */
1744static int __mce_read_apei(char __user **ubuf, size_t usize)
1745{
1746        int rc;
1747        u64 record_id;
1748        struct mce m;
1749
1750        if (usize < sizeof(struct mce))
1751                return -EINVAL;
1752
1753        rc = apei_read_mce(&m, &record_id);
1754        /* Error or no more MCE record */
1755        if (rc <= 0) {
1756                mce_apei_read_done = 1;
1757                /*
1758                 * When ERST is disabled, mce_chrdev_read() should return
1759                 * "no record" instead of "no device."
1760                 */
1761                if (rc == -ENODEV)
1762                        return 0;
1763                return rc;
1764        }
1765        rc = -EFAULT;
1766        if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
1767                return rc;
1768        /*
1769         * Ideally the record would be cleared only after it has been
1770         * flushed to disk or sent over the network by /sbin/mcelog,
1771         * but there is no interface to support that yet, so just
1772         * clear it now to avoid duplication.
1773         */
1774        rc = apei_clear_mce(record_id);
1775        if (rc) {
1776                mce_apei_read_done = 1;
1777                return rc;
1778        }
1779        *ubuf += sizeof(struct mce);
1780
1781        return 0;
1782}
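    /*
     * mce_chrdev_read() drains these APEI/ERST records first: it keeps
     * consuming one per read() call until mce_apei_read_done is set,
     * and only then falls through to the in-memory mcelog ring.
     */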
1783
1784static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
1785                                size_t usize, loff_t *off)
1786{
1787        char __user *buf = ubuf;
1788        unsigned long *cpu_tsc;
1789        unsigned prev, next;
1790        int i, err;
1791
1792        cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
1793        if (!cpu_tsc)
1794                return -ENOMEM;
1795
1796        mutex_lock(&mce_chrdev_read_mutex);
1797
1798        if (!mce_apei_read_done) {
1799                err = __mce_read_apei(&buf, usize);
1800                if (err || buf != ubuf)
1801                        goto out;
1802        }
1803
1804        next = rcu_dereference_check_mce(mcelog.next);
1805
1806        /* Only supports full reads right now */
1807        err = -EINVAL;
1808        if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
1809                goto out;
1810
1811        err = 0;
1812        prev = 0;
1813        do {
1814                for (i = prev; i < next; i++) {
1815                        unsigned long start = jiffies;
1816                        struct mce *m = &mcelog.entry[i];
1817
1818                        while (!m->finished) {
1819                                if (time_after_eq(jiffies, start + 2)) {
1820                                        memset(m, 0, sizeof(*m));
1821                                        goto timeout;
1822                                }
1823                                cpu_relax();
1824                        }
1825                        smp_rmb();
1826                        err |= copy_to_user(buf, m, sizeof(*m));
1827                        buf += sizeof(*m);
1828timeout:
1829                        ;
1830                }
1831
1832                memset(mcelog.entry + prev, 0,
1833                       (next - prev) * sizeof(struct mce));
1834                prev = next;
1835                next = cmpxchg(&mcelog.next, prev, 0);
1836        } while (next != prev);
1837
1838        synchronize_sched();
1839
1840        /*
1841         * Collect entries that were still getting written before the
1842         * synchronize.
1843         */
1844        on_each_cpu(collect_tscs, cpu_tsc, 1);
1845
1846        for (i = next; i < MCE_LOG_LEN; i++) {
1847                struct mce *m = &mcelog.entry[i];
1848
1849                if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
1850                        err |= copy_to_user(buf, m, sizeof(*m));
1851                        smp_rmb();
1852                        buf += sizeof(*m);
1853                        memset(m, 0, sizeof(*m));
1854                }
1855        }
1856
1857        if (err)
1858                err = -EFAULT;
1859
1860out:
1861        mutex_unlock(&mce_chrdev_read_mutex);
1862        kfree(cpu_tsc);
1863
1864        return err ? err : buf - ubuf;
1865}
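    /*
     * Read protocol notes: only full reads are accepted (*off must be
     * zero and usize must cover MCE_LOG_LEN records), entries are
     * cleared as they are copied out, and the second pass after
     * synchronize_sched() uses the per-CPU TSC snapshot taken by
     * collect_tscs() to pick up records that were still being written
     * during the first pass without stealing ones logged afterwards.
     */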
1866
1867static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
1868{
1869        poll_wait(file, &mce_chrdev_wait, wait);
1870        if (rcu_access_index(mcelog.next))
1871                return POLLIN | POLLRDNORM;
1872        if (!mce_apei_read_done && apei_check_mce())
1873                return POLLIN | POLLRDNORM;
1874        return 0;
1875}
1876
1877static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
1878                                unsigned long arg)
1879{
1880        int __user *p = (int __user *)arg;
1881
1882        if (!capable(CAP_SYS_ADMIN))
1883                return -EPERM;
1884
1885        switch (cmd) {
1886        case MCE_GET_RECORD_LEN:
1887                return put_user(sizeof(struct mce), p);
1888        case MCE_GET_LOG_LEN:
1889                return put_user(MCE_LOG_LEN, p);
1890        case MCE_GETCLEAR_FLAGS: {
1891                unsigned flags;
1892
1893                do {
1894                        flags = mcelog.flags;
1895                } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
1896
1897                return put_user(flags, p);
1898        }
1899        default:
1900                return -ENOTTY;
1901        }
1902}
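    /*
     * A minimal userspace sketch (hypothetical, not part of this file)
     * showing how a consumer such as mcelog could size its buffer from
     * the ioctls above and then issue the full-log read that
     * mce_chrdev_read() insists on.  The ioctls need CAP_SYS_ADMIN:
     *
     *        int fd = open("/dev/mcelog", O_RDONLY);
     *        int recsz, loglen;
     *        ioctl(fd, MCE_GET_RECORD_LEN, &recsz);
     *        ioctl(fd, MCE_GET_LOG_LEN, &loglen);
     *        char *buf = malloc(recsz * loglen);
     *        ssize_t n = read(fd, buf, recsz * loglen);
     *
     * On success n is the number of bytes copied and every recsz-sized
     * chunk is one struct mce.
     */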
1903
1904static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
1905                            size_t usize, loff_t *off);
1906
1907void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
1908                             const char __user *ubuf,
1909                             size_t usize, loff_t *off))
1910{
1911        mce_write = fn;
1912}
1913EXPORT_SYMBOL_GPL(register_mce_write_callback);
1914
1915ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
1916                         size_t usize, loff_t *off)
1917{
1918        if (mce_write)
1919                return mce_write(filp, ubuf, usize, off);
1920        else
1921                return -EINVAL;
1922}
1923
1924static const struct file_operations mce_chrdev_ops = {
1925        .open                   = mce_chrdev_open,
1926        .release                = mce_chrdev_release,
1927        .read                   = mce_chrdev_read,
1928        .write                  = mce_chrdev_write,
1929        .poll                   = mce_chrdev_poll,
1930        .unlocked_ioctl         = mce_chrdev_ioctl,
1931        .llseek                 = no_llseek,
1932};
1933
1934static struct miscdevice mce_chrdev_device = {
1935        MISC_MCELOG_MINOR,
1936        "mcelog",
1937        &mce_chrdev_ops,
1938};
1939
1940/*
1941 * mce=off Disables machine check
1942 * mce=no_cmci Disables CMCI
1943 * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
1944 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
1945 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
1946 *      monarchtimeout is how long to wait for other CPUs on machine
1947 *      check, or 0 to not wait
1948 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
1949 * mce=nobootlog Don't log MCEs from before booting.
1950 * mce=bios_cmci_threshold Don't program the CMCI threshold
1951 */
1952static int __init mcheck_enable(char *str)
1953{
1954        if (*str == 0) {
1955                enable_p5_mce();
1956                return 1;
1957        }
1958        if (*str == '=')
1959                str++;
1960        if (!strcmp(str, "off"))
1961                mce_disabled = 1;
1962        else if (!strcmp(str, "no_cmci"))
1963                mce_cmci_disabled = 1;
1964        else if (!strcmp(str, "dont_log_ce"))
1965                mce_dont_log_ce = 1;
1966        else if (!strcmp(str, "ignore_ce"))
1967                mce_ignore_ce = 1;
1968        else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
1969                mce_bootlog = (str[0] == 'b');
1970        else if (!strcmp(str, "bios_cmci_threshold"))
1971                mce_bios_cmci_threshold = 1;
1972        else if (isdigit(str[0])) {
1973                get_option(&str, &tolerant);
1974                if (*str == ',') {
1975                        ++str;
1976                        get_option(&str, &monarch_timeout);
1977                }
1978        } else {
1979                pr_info("mce argument %s ignored. Please use /sys\n", str);
1980                return 0;
1981        }
1982        return 1;
1983}
1984__setup("mce", mcheck_enable);
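    /*
     * Example command lines accepted by the parser above: "mce=off"
     * disables machine checks entirely, "mce=no_cmci" keeps MCE but
     * turns CMCI off, and "mce=2,500000" sets tolerant=2 with a
     * 500000 us monarch timeout.  Anything unrecognized is ignored
     * with a hint to use the sysfs interface instead.
     */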
1985
1986int __init mcheck_init(void)
1987{
1988        mcheck_intel_therm_init();
1989
1990        return 0;
1991}
1992
1993/*
1994 * mce_syscore: PM support
1995 */
1996
1997/*
1998 * Disable machine checks on suspend and shutdown. We can't really handle
1999 * them later.
2000 */
2001static int mce_disable_error_reporting(void)
2002{
2003        int i;
2004
2005        for (i = 0; i < banks; i++) {
2006                struct mce_bank *b = &mce_banks[i];
2007
2008                if (b->init)
2009                        wrmsrl(MSR_IA32_MCx_CTL(i), 0);
2010        }
2011        return 0;
2012}
2013
2014static int mce_syscore_suspend(void)
2015{
2016        return mce_disable_error_reporting();
2017}
2018
2019static void mce_syscore_shutdown(void)
2020{
2021        mce_disable_error_reporting();
2022}
2023
2024/*
2025 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
2026 * Only one CPU is active at this time, the others get re-added later using
2027 * CPU hotplug:
2028 */
2029static void mce_syscore_resume(void)
2030{
2031        __mcheck_cpu_init_generic();
2032        __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
2033}
2034
2035static struct syscore_ops mce_syscore_ops = {
2036        .suspend        = mce_syscore_suspend,
2037        .shutdown       = mce_syscore_shutdown,
2038        .resume         = mce_syscore_resume,
2039};
2040
2041/*
2042 * mce_device: Sysfs support
2043 */
2044
2045static void mce_cpu_restart(void *data)
2046{
2047        if (!mce_available(__this_cpu_ptr(&cpu_info)))
2048                return;
2049        __mcheck_cpu_init_generic();
2050        __mcheck_cpu_init_timer();
2051}
2052
2053/* Reinit MCEs after user configuration changes */
2054static void mce_restart(void)
2055{
2056        mce_timer_delete_all();
2057        on_each_cpu(mce_cpu_restart, NULL, 1);
2058}
2059
2060/* Toggle features for corrected errors */
2061static void mce_disable_cmci(void *data)
2062{
2063        if (!mce_available(__this_cpu_ptr(&cpu_info)))
2064                return;
2065        cmci_clear();
2066}
2067
2068static void mce_enable_ce(void *all)
2069{
2070        if (!mce_available(__this_cpu_ptr(&cpu_info)))
2071                return;
2072        cmci_reenable();
2073        cmci_recheck();
2074        if (all)
2075                __mcheck_cpu_init_timer();
2076}
2077
2078static struct bus_type mce_subsys = {
2079        .name           = "machinecheck",
2080        .dev_name       = "machinecheck",
2081};
2082
2083DEFINE_PER_CPU(struct device *, mce_device);
2084
2085__cpuinitdata
2086void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
2087
2088static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
2089{
2090        return container_of(attr, struct mce_bank, attr);
2091}
2092
2093static ssize_t show_bank(struct device *s, struct device_attribute *attr,
2094                         char *buf)
2095{
2096        return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
2097}
2098
2099static ssize_t set_bank(struct device *s, struct device_attribute *attr,
2100                        const char *buf, size_t size)
2101{
2102        u64 new;
2103
2104        if (strict_strtoull(buf, 0, &new) < 0)
2105                return -EINVAL;
2106
2107        attr_to_bank(attr)->ctl = new;
2108        mce_restart();
2109
2110        return size;
2111}
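    /*
     * The bank controls are shared by every CPU, so a write through any
     * per-CPU sysfs directory updates the global mce_banks[] entry and
     * then re-inits all CPUs via mce_restart().  Illustrative shell use
     * (path assumed from the "machinecheck" subsystem defined above):
     *
     *        echo 0 > /sys/devices/system/machinecheck/machinecheck0/bank4
     */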
2112
2113static ssize_t
2114show_trigger(struct device *s, struct device_attribute *attr, char *buf)
2115{
2116        strcpy(buf, mce_helper);
2117        strcat(buf, "\n");
2118        return strlen(mce_helper) + 1;
2119}
2120
2121static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
2122                                const char *buf, size_t siz)
2123{
2124        char *p;
2125
2126        strncpy(mce_helper, buf, sizeof(mce_helper));
2127        mce_helper[sizeof(mce_helper)-1] = 0;
2128        p = strchr(mce_helper, '\n');
2129
2130        if (p)
2131                *p = 0;
2132
2133        return strlen(mce_helper) + !!p;
2134}
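    /*
     * The string written here lands in mce_helper[], truncated to the
     * buffer size and with a trailing newline stripped; it names the
     * user mode helper that is spawned elsewhere in this file when new
     * events are reported.  Writing an empty string clears it.
     */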
2135
2136static ssize_t set_ignore_ce(struct device *s,
2137                             struct device_attribute *attr,
2138                             const char *buf, size_t size)
2139{
2140        u64 new;
2141
2142        if (strict_strtoull(buf, 0, &new) < 0)
2143                return -EINVAL;
2144
2145        if (mce_ignore_ce ^ !!new) {
2146                if (new) {
2147                        /* disable ce features */
2148                        mce_timer_delete_all();
2149                        on_each_cpu(mce_disable_cmci, NULL, 1);
2150                        mce_ignore_ce = 1;
2151                } else {
2152                        /* enable ce features */
2153                        mce_ignore_ce = 0;
2154                        on_each_cpu(mce_enable_ce, (void *)1, 1);
2155                }
2156        }
2157        return size;
2158}
2159
2160static ssize_t set_cmci_disabled(struct device *s,
2161                                 struct device_attribute *attr,
2162                                 const char *buf, size_t size)
2163{
2164        u64 new;
2165
2166        if (strict_strtoull(buf, 0, &new) < 0)
2167                return -EINVAL;
2168
2169        if (mce_cmci_disabled ^ !!new) {
2170                if (new) {
2171                        /* disable cmci */
2172                        on_each_cpu(mce_disable_cmci, NULL, 1);
2173                        mce_cmci_disabled = 1;
2174                } else {
2175                        /* enable cmci */
2176                        mce_cmci_disabled = 0;
2177                        on_each_cpu(mce_enable_ce, NULL, 1);
2178                }
2179        }
2180        return size;
2181}
2182
2183static ssize_t store_int_with_restart(struct device *s,
2184                                      struct device_attribute *attr,
2185                                      const char *buf, size_t size)
2186{
2187        ssize_t ret = device_store_int(s, attr, buf, size);
2188        mce_restart();
2189        return ret;
2190}
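    /*
     * check_interval is wired to this wrapper (see
     * dev_attr_check_interval below) so that changing the poll interval
     * from sysfs immediately tears down and re-arms the per-CPU timers
     * instead of waiting for the old interval to expire.
     */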
2191
2192static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
2193static DEVICE_INT_ATTR(tolerant, 0644, tolerant);
2194static DEVICE_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
2195static DEVICE_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
2196
2197static struct dev_ext_attribute dev_attr_check_interval = {
2198        __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
2199        &check_interval
2200};
2201
2202static struct dev_ext_attribute dev_attr_ignore_ce = {
2203        __ATTR(ignore_ce, 0644, device_show_int, set_ignore_ce),
2204        &mce_ignore_ce
2205};
2206
2207static struct dev_ext_attribute dev_attr_cmci_disabled = {
2208        __ATTR(cmci_disabled, 0644, device_show_int, set_cmci_disabled),
2209        &mce_cmci_disabled
2210};
2211
2212static struct device_attribute *mce_device_attrs[] = {
2213        &dev_attr_tolerant.attr,
2214        &dev_attr_check_interval.attr,
2215        &dev_attr_trigger,
2216        &dev_attr_monarch_timeout.attr,
2217        &dev_attr_dont_log_ce.attr,
2218        &dev_attr_ignore_ce.attr,
2219        &dev_attr_cmci_disabled.attr,
2220        NULL
2221};
2222
2223static cpumask_var_t mce_device_initialized;
2224
2225static void mce_device_release(struct device *dev)
2226{
2227        kfree(dev);
2228}
2229
2230/* Per-CPU device init. All of the CPUs still share the same bank controls: */
2231static __cpuinit int mce_device_create(unsigned int cpu)
2232{
2233        struct device *dev;
2234        int err;
2235        int i, j;
2236
2237        if (!mce_available(&boot_cpu_data))
2238                return -EIO;
2239
2240        dev = kzalloc(sizeof *dev, GFP_KERNEL);
2241        if (!dev)
2242                return -ENOMEM;
2243        dev->id  = cpu;
2244        dev->bus = &mce_subsys;
2245        dev->release = &mce_device_release;
2246
2247        err = device_register(dev);
2248        if (err)
2249                return err;
2250
2251        for (i = 0; mce_device_attrs[i]; i++) {
2252                err = device_create_file(dev, mce_device_attrs[i]);
2253                if (err)
2254                        goto error;
2255        }
2256        for (j = 0; j < banks; j++) {
2257                err = device_create_file(dev, &mce_banks[j].attr);
2258                if (err)
2259                        goto error2;
2260        }
2261        cpumask_set_cpu(cpu, mce_device_initialized);
2262        per_cpu(mce_device, cpu) = dev;
2263
2264        return 0;
2265error2:
2266        while (--j >= 0)
2267                device_remove_file(dev, &mce_banks[j].attr);
2268error:
2269        while (--i >= 0)
2270                device_remove_file(dev, mce_device_attrs[i]);
2271
2272        device_unregister(dev);
2273
2274        return err;
2275}
2276
2277static __cpuinit void mce_device_remove(unsigned int cpu)
2278{
2279        struct device *dev = per_cpu(mce_device, cpu);
2280        int i;
2281
2282        if (!cpumask_test_cpu(cpu, mce_device_initialized))
2283                return;
2284
2285        for (i = 0; mce_device_attrs[i]; i++)
2286                device_remove_file(dev, mce_device_attrs[i]);
2287
2288        for (i = 0; i < banks; i++)
2289                device_remove_file(dev, &mce_banks[i].attr);
2290
2291        device_unregister(dev);
2292        cpumask_clear_cpu(cpu, mce_device_initialized);
2293        per_cpu(mce_device, cpu) = NULL;
2294}
2295
2296/* Make sure there are no machine checks on offlined CPUs. */
2297static void __cpuinit mce_disable_cpu(void *h)
2298{
2299        unsigned long action = *(unsigned long *)h;
2300        int i;
2301
2302        if (!mce_available(__this_cpu_ptr(&cpu_info)))
2303                return;
2304
2305        if (!(action & CPU_TASKS_FROZEN))
2306                cmci_clear();
2307        for (i = 0; i < banks; i++) {
2308                struct mce_bank *b = &mce_banks[i];
2309
2310                if (b->init)
2311                        wrmsrl(MSR_IA32_MCx_CTL(i), 0);
2312        }
2313}
2314
2315static void __cpuinit mce_reenable_cpu(void *h)
2316{
2317        unsigned long action = *(unsigned long *)h;
2318        int i;
2319
2320        if (!mce_available(__this_cpu_ptr(&cpu_info)))
2321                return;
2322
2323        if (!(action & CPU_TASKS_FROZEN))
2324                cmci_reenable();
2325        for (i = 0; i < banks; i++) {
2326                struct mce_bank *b = &mce_banks[i];
2327
2328                if (b->init)
2329                        wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
2330        }
2331}
2332
2333/* Get notified when a cpu comes on/off. Be hotplug friendly. */
2334static int __cpuinit
2335mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2336{
2337        unsigned int cpu = (unsigned long)hcpu;
2338        struct timer_list *t = &per_cpu(mce_timer, cpu);
2339
2340        switch (action & ~CPU_TASKS_FROZEN) {
2341        case CPU_ONLINE:
2342                mce_device_create(cpu);
2343                if (threshold_cpu_callback)
2344                        threshold_cpu_callback(action, cpu);
2345                break;
2346        case CPU_DEAD:
2347                if (threshold_cpu_callback)
2348                        threshold_cpu_callback(action, cpu);
2349                mce_device_remove(cpu);
2350                mce_intel_hcpu_update(cpu);
2351                break;
2352        case CPU_DOWN_PREPARE:
2353                smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
2354                del_timer_sync(t);
2355                break;
2356        case CPU_DOWN_FAILED:
2357                smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
2358                mce_start_timer(cpu, t);
2359                break;
2360        }
2361
2362        if (action == CPU_POST_DEAD) {
2363                /* intentionally ignoring frozen here */
2364                cmci_rediscover(cpu);
2365        }
2366
2367        return NOTIFY_OK;
2368}
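    /*
     * Hotplug flow in short: CPU_ONLINE creates the sysfs device and
     * gives threshold_cpu_callback a chance to hook in, CPU_DOWN_PREPARE
     * silences the banks and stops the poll timer, CPU_DOWN_FAILED
     * undoes both, CPU_DEAD tears the device down and updates the Intel
     * per-CPU state, and CPU_POST_DEAD hands the dead CPU's CMCI banks
     * to a surviving CPU via cmci_rediscover().
     */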
2369
2370static struct notifier_block mce_cpu_notifier __cpuinitdata = {
2371        .notifier_call = mce_cpu_callback,
2372};
2373
2374static __init void mce_init_banks(void)
2375{
2376        int i;
2377
2378        for (i = 0; i < banks; i++) {
2379                struct mce_bank *b = &mce_banks[i];
2380                struct device_attribute *a = &b->attr;
2381
2382                sysfs_attr_init(&a->attr);
2383                a->attr.name    = b->attrname;
2384                snprintf(b->attrname, ATTR_LEN, "bank%d", i);
2385
2386                a->attr.mode    = 0644;
2387                a->show         = show_bank;
2388                a->store        = set_bank;
2389        }
2390}
2391
2392static __init int mcheck_init_device(void)
2393{
2394        int err;
2395        int i = 0;
2396
2397        if (!mce_available(&boot_cpu_data))
2398                return -EIO;
2399
2400        zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
2401
2402        mce_init_banks();
2403
2404        err = subsys_system_register(&mce_subsys, NULL);
2405        if (err)
2406                return err;
2407
2408        for_each_online_cpu(i) {
2409                err = mce_device_create(i);
2410                if (err)
2411                        return err;
2412        }
2413
2414        register_syscore_ops(&mce_syscore_ops);
2415        register_hotcpu_notifier(&mce_cpu_notifier);
2416
2417        /* register character device /dev/mcelog */
2418        misc_register(&mce_chrdev_device);
2419
2420        return err;
2421}
2422device_initcall_sync(mcheck_init_device);
2423
2424/*
2425 * Old style boot options parsing. Only for compatibility.
2426 */
2427static int __init mcheck_disable(char *str)
2428{
2429        mce_disabled = 1;
2430        return 1;
2431}
2432__setup("nomce", mcheck_disable);
2433
2434#ifdef CONFIG_DEBUG_FS
2435struct dentry *mce_get_debugfs_dir(void)
2436{
2437        static struct dentry *dmce;
2438
2439        if (!dmce)
2440                dmce = debugfs_create_dir("mce", NULL);
2441
2442        return dmce;
2443}
2444
2445static void mce_reset(void)
2446{
2447        cpu_missing = 0;
2448        atomic_set(&mce_fake_paniced, 0);
2449        atomic_set(&mce_executing, 0);
2450        atomic_set(&mce_callin, 0);
2451        atomic_set(&global_nwo, 0);
2452}
2453
2454static int fake_panic_get(void *data, u64 *val)
2455{
2456        *val = fake_panic;
2457        return 0;
2458}
2459
2460static int fake_panic_set(void *data, u64 val)
2461{
2462        mce_reset();
2463        fake_panic = val;
2464        return 0;
2465}
2466
2467DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
2468                        fake_panic_set, "%llu\n");
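    /*
     * This creates <debugfs>/mce/fake_panic (see mcheck_debugfs_init
     * below), read back as a decimal value.  Setting it also clears the
     * rendezvous bookkeeping (mce_callin, mce_executing, global_nwo)
     * via mce_reset(), so repeated fake-panic experiments start from a
     * clean state; the flag itself is consulted by the panic path
     * elsewhere in this file.
     */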
2469
2470static int __init mcheck_debugfs_init(void)
2471{
2472        struct dentry *dmce, *ffake_panic;
2473
2474        dmce = mce_get_debugfs_dir();
2475        if (!dmce)
2476                return -ENOMEM;
2477        ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
2478                                          &fake_panic_fops);
2479        if (!ffake_panic)
2480                return -ENOMEM;
2481
2482        return 0;
2483}
2484late_initcall(mcheck_debugfs_init);
2485#endif
2486