linux-old/arch/i386/kernel/smp.c
<<
>>
Prefs
   1/*
   2 *      Intel SMP support routines.
   3 *
   4 *      (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
   5 *      (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
   6 *
   7 *      This code is released under the GNU General Public License version 2 or
   8 *      later.
   9 */
  10
  11#include <linux/init.h>
  12
  13#include <linux/mm.h>
  14#include <linux/irq.h>
  15#include <linux/delay.h>
  16#include <linux/spinlock.h>
  17#include <linux/smp_lock.h>
  18#include <linux/kernel_stat.h>
  19#include <linux/mc146818rtc.h>
  20#include <linux/cache.h>
  21
  22#include <asm/mtrr.h>
  23#include <asm/pgalloc.h>
  24#include <asm/smpboot.h>
  25
  26/*
  27 *      Some notes on x86 processor bugs affecting SMP operation:
  28 *
  29 *      Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
  30 *      The Linux implications for SMP are handled as follows:
  31 *
  32 *      Pentium III / [Xeon]
  33 *              None of the E1AP-E3AP errata are visible to the user.
  34 *
  35 *      E1AP.   see PII A1AP
  36 *      E2AP.   see PII A2AP
  37 *      E3AP.   see PII A3AP
  38 *
  39 *      Pentium II / [Xeon]
  40 *              None of the A1AP-A3AP errata are visible to the user.
  41 *
  42 *      A1AP.   see PPro 1AP
  43 *      A2AP.   see PPro 2AP
  44 *      A3AP.   see PPro 7AP
  45 *
  46 *      Pentium Pro
  47 *              None of 1AP-9AP errata are visible to the normal user,
  48 *      except occasional delivery of 'spurious interrupt' as trap #15.
  49 *      This is very rare and a non-problem.
  50 *
  51 *      1AP.    Linux maps APIC as non-cacheable
  52 *      2AP.    worked around in hardware
  53 *      3AP.    fixed in C0 and above steppings microcode update.
  54 *              Linux does not use excessive STARTUP_IPIs.
  55 *      4AP.    worked around in hardware
  56 *      5AP.    symmetric IO mode (normal Linux operation) not affected.
  57 *              'noapic' mode has vector 0xf filled out properly.
  58 *      6AP.    'noapic' mode might be affected - fixed in later steppings
   59 *      7AP.    We do not assume writes to the LVT deasserting IRQs
  60 *      8AP.    We do not enable low power mode (deep sleep) during MP bootup
  61 *      9AP.    We do not use mixed mode
  62 *
  63 *      Pentium
  64 *              There is a marginal case where REP MOVS on 100MHz SMP
  65 *      machines with B stepping processors can fail. XXX should provide
  66 *      an L1cache=Writethrough or L1cache=off option.
  67 *
  68 *              B stepping CPUs may hang. There are hardware work arounds
  69 *      for this. We warn about it in case your board doesn't have the work
   70 *      arounds. Basically that's so I can tell anyone with a B stepping
  71 *      CPU and SMP problems "tough".
  72 *
  73 *      Specific items [From Pentium Processor Specification Update]
  74 *
  75 *      1AP.    Linux doesn't use remote read
  76 *      2AP.    Linux doesn't trust APIC errors
  77 *      3AP.    We work around this
  78 *      4AP.    Linux never generated 3 interrupts of the same priority
  79 *              to cause a lost local interrupt.
  80 *      5AP.    Remote read is never used
  81 *      6AP.    not affected - worked around in hardware
  82 *      7AP.    not affected - worked around in hardware
  83 *      8AP.    worked around in hardware - we get explicit CS errors if not
  84 *      9AP.    only 'noapic' mode affected. Might generate spurious
  85 *              interrupts, we log only the first one and count the
  86 *              rest silently.
  87 *      10AP.   not affected - worked around in hardware
  88 *      11AP.   Linux reads the APIC between writes to avoid this, as per
  89 *              the documentation. Make sure you preserve this as it affects
  90 *              the C stepping chips too.
  91 *      12AP.   not affected - worked around in hardware
  92 *      13AP.   not affected - worked around in hardware
  93 *      14AP.   we always deassert INIT during bootup
  94 *      15AP.   not affected - worked around in hardware
  95 *      16AP.   not affected - worked around in hardware
  96 *      17AP.   not affected - worked around in hardware
  97 *      18AP.   not affected - worked around in hardware
  98 *      19AP.   not affected - worked around in BIOS
  99 *
 100 *      If this sounds worrying believe me these bugs are either ___RARE___,
 101 *      or are signal timing bugs worked around in hardware and there's
 102 *      about nothing of note with C stepping upwards.
 103 */
 104
 105/* The 'big kernel lock' */
 106spinlock_cacheline_t kernel_flag_cacheline = {SPIN_LOCK_UNLOCKED};
 107
 108struct tlb_state cpu_tlbstate[NR_CPUS] __cacheline_aligned = {[0 ... NR_CPUS-1] = { &init_mm, 0, }};
 109
 110/*
 111 * the following functions deal with sending IPIs between CPUs.
 112 *
 113 * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
 114 */
 115
 116static inline int __prepare_ICR (unsigned int shortcut, int vector)
 117{
 118        return APIC_DM_FIXED | shortcut | vector | INT_DEST_ADDR_MODE;
 119}
 120
 121static inline int __prepare_ICR2 (unsigned int mask)
 122{
 123        return SET_APIC_DEST_FIELD(mask);
 124}
 125
 126static inline void __send_IPI_shortcut(unsigned int shortcut, int vector)
 127{
 128        /*
 129         * Subtle. In the case of the 'never do double writes' workaround
 130         * we have to lock out interrupts to be safe.  As we don't care
 131         * of the value read we use an atomic rmw access to avoid costly
 132         * cli/sti.  Otherwise we use an even cheaper single atomic write
 133         * to the APIC.
 134         */
 135        unsigned int cfg;
 136
 137        /*
 138         * Wait for idle.
 139         */
 140        apic_wait_icr_idle();
 141
 142        /*
 143         * No need to touch the target chip field
 144         */
 145        cfg = __prepare_ICR(shortcut, vector);
 146
 147        /*
 148         * Send the IPI. The write to APIC_ICR fires this off.
 149         */
 150        apic_write_around(APIC_ICR, cfg);
 151}
 152
 153void send_IPI_self(int vector)
 154{
 155        __send_IPI_shortcut(APIC_DEST_SELF, vector);
 156}
 157
 158static inline void send_IPI_mask_bitmask(int mask, int vector)
 159{
 160        unsigned long cfg;
 161        unsigned long flags;
 162
 163        __save_flags(flags);
 164        __cli();
 165
 166                
 167        /*
 168         * Wait for idle.
 169         */
 170        apic_wait_icr_idle();
 171                
 172        /*
 173         * prepare target chip field
 174         */
 175        cfg = __prepare_ICR2(mask);
 176        apic_write_around(APIC_ICR2, cfg);
 177                
 178        /*
 179         * program the ICR 
 180         */
 181        cfg = __prepare_ICR(0, vector);
 182                        
 183        /*
 184         * Send the IPI. The write to APIC_ICR fires this off.
 185         */
 186        apic_write_around(APIC_ICR, cfg);
 187
 188        __restore_flags(flags);
 189}
 190
 191static inline void send_IPI_mask_sequence(int mask, int vector)
 192{
 193        unsigned long cfg, flags;
 194        unsigned int query_cpu, query_mask;
 195
 196        /*
 197         * Hack. The clustered APIC addressing mode doesn't allow us to send 
 198         * to an arbitrary mask, so I do a unicasts to each CPU instead. This 
 199         * should be modified to do 1 message per cluster ID - mbligh
 200         */ 
 201
 202        __save_flags(flags);
 203        __cli();
 204
 205        for (query_cpu = 0; query_cpu < NR_CPUS; ++query_cpu) {
 206                query_mask = 1 << query_cpu;
 207                if (query_mask & mask) {
 208                
 209                        /*
 210                         * Wait for idle.
 211                         */
 212                        apic_wait_icr_idle();
 213                
 214                        /*
 215                         * prepare target chip field
 216                         */
 217                        if(clustered_apic_mode == CLUSTERED_APIC_XAPIC)
 218                                cfg = __prepare_ICR2(cpu_to_physical_apicid(query_cpu));
 219                        else
 220                                cfg = __prepare_ICR2(cpu_to_logical_apicid(query_cpu));
 221                        apic_write_around(APIC_ICR2, cfg);
 222                
 223                        /*
 224                         * program the ICR 
 225                         */
 226                        cfg = __prepare_ICR(0, vector);
 227                        
 228                        /*
 229                         * Send the IPI. The write to APIC_ICR fires this off.
 230                         */
 231                        apic_write_around(APIC_ICR, cfg);
 232                }
 233        }
 234        __restore_flags(flags);
 235}
 236
 237static inline void send_IPI_mask(int mask, int vector)
 238{
 239        if (clustered_apic_mode) 
 240                send_IPI_mask_sequence(mask, vector);
 241        else
 242                send_IPI_mask_bitmask(mask, vector);
 243}
 244
 245static inline void send_IPI_allbutself(int vector)
 246{
 247        /*
 248         * if there are no other CPUs in the system then
 249         * we get an APIC send error if we try to broadcast.
 250         * thus we have to avoid sending IPIs in this case.
 251         */
 252        if (!(smp_num_cpus > 1))
 253                return;
 254
 255        if (clustered_apic_mode) {
 256                // Pointless. Use send_IPI_mask to do this instead
 257                int cpu;
 258
 259                if (smp_num_cpus > 1) {
 260                        for (cpu = 0; cpu < smp_num_cpus; ++cpu) {
 261                                if (cpu != smp_processor_id())
 262                                        send_IPI_mask(1 << cpu, vector);
 263                        }
 264                }
 265        } else {
 266                __send_IPI_shortcut(APIC_DEST_ALLBUT, vector);
 267                return;
 268        }
 269}
 270
 271static inline void send_IPI_all(int vector)
 272{
 273        if (clustered_apic_mode) {
 274                // Pointless. Use send_IPI_mask to do this instead
 275                int cpu;
 276
 277                for (cpu = 0; cpu < smp_num_cpus; ++cpu) {
 278                        send_IPI_mask(1 << cpu, vector);
 279                }
 280        } else {
 281                __send_IPI_shortcut(APIC_DEST_ALLINC, vector);
 282        }
 283}
 284
 285/*
 286 *      Smarter SMP flushing macros. 
 287 *              c/o Linus Torvalds.
 288 *
 289 *      These mean you can really definitely utterly forget about
 290 *      writing to user space from interrupts. (Its not allowed anyway).
 291 *
 292 *      Optimizations Manfred Spraul <manfred@colorfullife.com>
 293 */
 294
 295static volatile unsigned long flush_cpumask;
 296static struct mm_struct * flush_mm;
 297static unsigned long flush_va;
 298static spinlock_t tlbstate_lock = SPIN_LOCK_UNLOCKED;
 299#define FLUSH_ALL       0xffffffff
 300
 301/*
 302 * We cannot call mmdrop() because we are in interrupt context, 
 303 * instead update mm->cpu_vm_mask.
 304 *
 305 * We need to reload %cr3 since the page tables may be going
 306 * away frm under us...
 307 */
 308static void inline leave_mm (unsigned long cpu)
 309{
 310        BUG_ON(cpu_tlbstate[cpu].state == TLBSTATE_OK);
 311        clear_bit(cpu, &cpu_tlbstate[cpu].active_mm->cpu_vm_mask);
 312        load_cr3(swapper_pg_dir);
 313}
 314
 315/*
 316 *
 317 * The flush IPI assumes that a thread switch happens in this order:
 318 * [cpu0: the cpu that switches]
 319 * 1) switch_mm() either 1a) or 1b)
 320 * 1a) thread switch to a different mm
 321 * 1a1) clear_bit(cpu, &old_mm->cpu_vm_mask);
 322 *      Stop ipi delivery for the old mm. This is not synchronized with
  323 *      the other cpus, but smp_invalidate_interrupt ignores flush ipis
  324 *      for the wrong mm, and in the worst case we perform a superfluous
 325 *      tlb flush.
 326 * 1a2) set cpu_tlbstate to TLBSTATE_OK
 327 *      Now the smp_invalidate_interrupt won't call leave_mm if cpu0
 328 *      was in lazy tlb mode.
 329 * 1a3) update cpu_tlbstate[].active_mm
 330 *      Now cpu0 accepts tlb flushes for the new mm.
 331 * 1a4) set_bit(cpu, &new_mm->cpu_vm_mask);
 332 *      Now the other cpus will send tlb flush ipis.
  333 * 1a5) change cr3.
 334 * 1b) thread switch without mm change
 335 *      cpu_tlbstate[].active_mm is correct, cpu0 already handles
 336 *      flush ipis.
 337 * 1b1) set cpu_tlbstate to TLBSTATE_OK
 338 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
 339 *      Atomically set the bit [other cpus will start sending flush ipis],
 340 *      and test the bit.
 341 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
  342 * 2) switch %esp, ie current
 343 *
 344 * The interrupt must handle 2 special cases:
  345 * - cr3 is changed before %esp, ie. it cannot use current->{active_,}mm.
 346 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
 347 *   runs in kernel space, the cpu could load tlb entries for user space
 348 *   pages.
 349 *
 350 * The good news is that cpu_tlbstate is local to each cpu, no
 351 * write/read ordering problems.
 352 */
 353
 354/*
 355 * TLB flush IPI:
 356 *
 357 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
 358 * 2) Leave the mm if we are in the lazy tlb mode.
 359 */
 360
 361asmlinkage void smp_invalidate_interrupt (void)
 362{
 363        unsigned long cpu = smp_processor_id();
 364
 365        if (!test_bit(cpu, &flush_cpumask))
 366                return;
 367                /* 
 368                 * This was a BUG() but until someone can quote me the
 369                 * line from the intel manual that guarantees an IPI to
 370                 * multiple CPUs is retried _only_ on the erroring CPUs
 371                 * its staying as a return
 372                 *
 373                 * BUG();
 374                 */
 375                 
 376        if (flush_mm == cpu_tlbstate[cpu].active_mm) {
 377                if (cpu_tlbstate[cpu].state == TLBSTATE_OK) {
 378                        if (flush_va == FLUSH_ALL)
 379                                local_flush_tlb();
 380                        else
 381                                __flush_tlb_one(flush_va);
 382                } else
 383                        leave_mm(cpu);
 384        }
 385        ack_APIC_irq();
 386        clear_bit(cpu, &flush_cpumask);
 387}
 388
 389static void flush_tlb_others (unsigned long cpumask, struct mm_struct *mm,
 390                                                unsigned long va)
 391{
 392        /*
 393         * A couple of (to be removed) sanity checks:
 394         *
 395         * - we do not send IPIs to not-yet booted CPUs.
 396         * - current CPU must not be in mask
 397         * - mask must exist :)
 398         */
 399        if (!cpumask)
 400                BUG();
 401        if ((cpumask & cpu_online_map) != cpumask)
 402                BUG();
 403        if (cpumask & (1 << smp_processor_id()))
 404                BUG();
 405        if (!mm)
 406                BUG();
 407
 408        /*
 409         * i'm not happy about this global shared spinlock in the
 410         * MM hot path, but we'll see how contended it is.
 411         * Temporarily this turns IRQs off, so that lockups are
 412         * detected by the NMI watchdog.
 413         */
 414        spin_lock(&tlbstate_lock);
 415        
 416        flush_mm = mm;
 417        flush_va = va;
 418        atomic_set_mask(cpumask, &flush_cpumask);
 419        /*
 420         * We have to send the IPI only to
 421         * CPUs affected.
 422         */
 423        send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
 424
 425        while (flush_cpumask)
 426                /* nothing. lockup detection does not belong here */;
 427
 428        flush_mm = NULL;
 429        flush_va = 0;
 430        spin_unlock(&tlbstate_lock);
 431}
 432        
 433void flush_tlb_current_task(void)
 434{
 435        struct mm_struct *mm = current->mm;
 436        unsigned long cpu_mask = mm->cpu_vm_mask & ~(1 << smp_processor_id());
 437
 438        local_flush_tlb();
 439        if (cpu_mask)
 440                flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
 441}
 442
 443void flush_tlb_mm (struct mm_struct * mm)
 444{
 445        unsigned long cpu_mask = mm->cpu_vm_mask & ~(1 << smp_processor_id());
 446
 447        if (current->active_mm == mm) {
 448                if (current->mm)
 449                        local_flush_tlb();
 450                else
 451                        leave_mm(smp_processor_id());
 452        }
 453        if (cpu_mask)
 454                flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
 455}
 456
 457void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
 458{
 459        struct mm_struct *mm = vma->vm_mm;
 460        unsigned long cpu_mask = mm->cpu_vm_mask & ~(1 << smp_processor_id());
 461
 462        if (current->active_mm == mm) {
 463                if(current->mm)
 464                        __flush_tlb_one(va);
 465                 else
 466                        leave_mm(smp_processor_id());
 467        }
 468
 469        if (cpu_mask)
 470                flush_tlb_others(cpu_mask, mm, va);
 471}
 472
 473static inline void do_flush_tlb_all_local(void)
 474{
 475        unsigned long cpu = smp_processor_id();
 476
 477        __flush_tlb_all();
 478        if (cpu_tlbstate[cpu].state == TLBSTATE_LAZY)
 479                leave_mm(cpu);
 480}
 481
 482static void flush_tlb_all_ipi(void* info)
 483{
 484        do_flush_tlb_all_local();
 485}
 486
 487void flush_tlb_all(void)
 488{
 489        smp_call_function (flush_tlb_all_ipi,0,1,1);
 490
 491        do_flush_tlb_all_local();
 492}
 493
 494/*
 495 * this function sends a 'reschedule' IPI to another CPU.
 496 * it goes straight through and wastes no time serializing
 497 * anything. Worst case is that we lose a reschedule ...
 498 */
 499
 500void smp_send_reschedule(int cpu)
 501{
 502        send_IPI_mask(1 << cpu, RESCHEDULE_VECTOR);
 503}
 504
 505/*
 506 * Structure and data for smp_call_function(). This is designed to minimise
 507 * static memory requirements. It also looks cleaner.
 508 */
 509static spinlock_t call_lock = SPIN_LOCK_UNLOCKED;
 510
 511struct call_data_struct {
 512        void (*func) (void *info);
 513        void *info;
 514        atomic_t started;
 515        atomic_t finished;
 516        int wait;
 517};
 518
 519static struct call_data_struct * call_data;
 520
 521/*
 522 * this function sends a 'generic call function' IPI to all other CPUs
 523 * in the system.
 524 */
 525
 526int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
 527                        int wait)
 528/*
 529 * [SUMMARY] Run a function on all other CPUs.
 530 * <func> The function to run. This must be fast and non-blocking.
 531 * <info> An arbitrary pointer to pass to the function.
 532 * <nonatomic> currently unused.
 533 * <wait> If true, wait (atomically) until function has completed on other CPUs.
 534 * [RETURNS] 0 on success, else a negative status code. Does not return until
 535 * remote CPUs are nearly ready to execute <<func>> or are or have executed.
 536 *
 537 * You must not call this function with disabled interrupts or from a
 538 * hardware interrupt handler or from a bottom half handler.
 539 */
 540{
 541        struct call_data_struct data;
 542        int cpus = smp_num_cpus-1;
 543
 544        if (!cpus)
 545                return 0;
 546
 547        data.func = func;
 548        data.info = info;
 549        atomic_set(&data.started, 0);
 550        data.wait = wait;
 551        if (wait)
 552                atomic_set(&data.finished, 0);
 553
 554        spin_lock(&call_lock);
 555        call_data = &data;
 556        wmb();
 557        /* Send a message to all other CPUs and wait for them to respond */
 558        send_IPI_allbutself(CALL_FUNCTION_VECTOR);
 559
 560        /* Wait for response */
 561        while (atomic_read(&data.started) != cpus)
 562                barrier();
 563
 564        if (wait)
 565                while (atomic_read(&data.finished) != cpus)
 566                        barrier();
 567        spin_unlock(&call_lock);
 568
 569        return 0;
 570}
 571
 572static void stop_this_cpu (void * dummy)
 573{
 574        /*
 575         * Remove this CPU:
 576         */
 577        clear_bit(smp_processor_id(), &cpu_online_map);
 578        __cli();
 579        disable_local_APIC();
 580        if (cpu_data[smp_processor_id()].hlt_works_ok)
 581                for(;;) __asm__("hlt");
 582        for (;;);
 583}
 584
 585/*
 586 * this function calls the 'stop' function on all other CPUs in the system.
 587 */
 588
 589void smp_send_stop(void)
 590{
 591        smp_call_function(stop_this_cpu, NULL, 1, 0);
 592        smp_num_cpus = 1;
 593
 594        __cli();
 595        disable_local_APIC();
 596        __sti();
 597}
 598
 599/*
 600 * Reschedule call back. Nothing to do,
 601 * all the work is done automatically when
 602 * we return from the interrupt.
 603 */
 604asmlinkage void smp_reschedule_interrupt(void)
 605{
 606        ack_APIC_irq();
 607}
 608
 609asmlinkage void smp_call_function_interrupt(void)
 610{
 611        void (*func) (void *info) = call_data->func;
 612        void *info = call_data->info;
 613        int wait = call_data->wait;
 614
 615        ack_APIC_irq();
 616        /*
 617         * Notify initiating CPU that I've grabbed the data and am
 618         * about to execute the function
 619         */
 620        mb();
 621        atomic_inc(&call_data->started);
 622        /*
 623         * At this point the info structure may be out of scope unless wait==1
 624         */
 625        (*func)(info);
 626        if (wait) {
 627                mb();
 628                atomic_inc(&call_data->finished);
 629        }
 630}
 631
 632
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.