linux/arch/ppc64/mm/numa.c
<<
>>
Prefs
   1/*
   2 * pSeries NUMA support
   3 *
   4 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
   5 *
   6 * This program is free software; you can redistribute it and/or
   7 * modify it under the terms of the GNU General Public License
   8 * as published by the Free Software Foundation; either version
   9 * 2 of the License, or (at your option) any later version.
  10 */
  11#include <linux/threads.h>
  12#include <linux/bootmem.h>
  13#include <linux/init.h>
  14#include <linux/mm.h>
  15#include <linux/mmzone.h>
  16#include <linux/module.h>
  17#include <linux/nodemask.h>
  18#include <linux/cpu.h>
  19#include <linux/notifier.h>
  20#include <asm/lmb.h>
  21#include <asm/machdep.h>
  22#include <asm/abs_addr.h>
  23
  24static int numa_enabled = 1;
  25
  26static int numa_debug;
  27#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }
  28
  29#ifdef DEBUG_NUMA
  30#define ARRAY_INITIALISER -1
  31#else
  32#define ARRAY_INITIALISER 0
  33#endif
  34
  35int numa_cpu_lookup_table[NR_CPUS] = { [ 0 ... (NR_CPUS - 1)] =
  36        ARRAY_INITIALISER};
  37char *numa_memory_lookup_table;
  38cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
  39int nr_cpus_in_node[MAX_NUMNODES] = { [0 ... (MAX_NUMNODES -1)] = 0};
  40
  41struct pglist_data *node_data[MAX_NUMNODES];
  42bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES];
  43static unsigned long node0_io_hole_size;
  44static int min_common_depth;
  45
  46/*
  47 * We need somewhere to store start/span for each node until we have
  48 * allocated the real node_data structures.
  49 */
  50static struct {
  51        unsigned long node_start_pfn;
  52        unsigned long node_spanned_pages;
  53} init_node_data[MAX_NUMNODES] __initdata;
  54
  55EXPORT_SYMBOL(node_data);
  56EXPORT_SYMBOL(numa_cpu_lookup_table);
  57EXPORT_SYMBOL(numa_memory_lookup_table);
  58EXPORT_SYMBOL(numa_cpumask_lookup_table);
  59EXPORT_SYMBOL(nr_cpus_in_node);
  60
  61static inline void map_cpu_to_node(int cpu, int node)
  62{
  63        numa_cpu_lookup_table[cpu] = node;
  64        if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node]))) {
  65                cpu_set(cpu, numa_cpumask_lookup_table[node]);
  66                nr_cpus_in_node[node]++;
  67        }
  68}
  69
  70#ifdef CONFIG_HOTPLUG_CPU
  71static void unmap_cpu_from_node(unsigned long cpu)
  72{
  73        int node = numa_cpu_lookup_table[cpu];
  74
  75        dbg("removing cpu %lu from node %d\n", cpu, node);
  76
  77        if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
  78                cpu_clear(cpu, numa_cpumask_lookup_table[node]);
  79                nr_cpus_in_node[node]--;
  80        } else {
  81                printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
  82                       cpu, node);
  83        }
  84}
  85#endif /* CONFIG_HOTPLUG_CPU */
  86
  87static struct device_node * __devinit find_cpu_node(unsigned int cpu)
  88{
  89        unsigned int hw_cpuid = get_hard_smp_processor_id(cpu);
  90        struct device_node *cpu_node = NULL;
  91        unsigned int *interrupt_server, *reg;
  92        int len;
  93
  94        while ((cpu_node = of_find_node_by_type(cpu_node, "cpu")) != NULL) {
  95                /* Try interrupt server first */
  96                interrupt_server = (unsigned int *)get_property(cpu_node,
  97                                        "ibm,ppc-interrupt-server#s", &len);
  98
  99                len = len / sizeof(u32);
 100
 101                if (interrupt_server && (len > 0)) {
 102                        while (len--) {
 103                                if (interrupt_server[len] == hw_cpuid)
 104                                        return cpu_node;
 105                        }
 106                } else {
 107                        reg = (unsigned int *)get_property(cpu_node,
 108                                                           "reg", &len);
 109                        if (reg && (len > 0) && (reg[0] == hw_cpuid))
 110                                return cpu_node;
 111                }
 112        }
 113
 114        return NULL;
 115}
 116
 117/* must hold reference to node during call */
 118static int *of_get_associativity(struct device_node *dev)
 119{
 120        return (unsigned int *)get_property(dev, "ibm,associativity", NULL);
 121}
 122
 123static int of_node_numa_domain(struct device_node *device)
 124{
 125        int numa_domain;
 126        unsigned int *tmp;
 127
 128        if (min_common_depth == -1)
 129                return 0;
 130
 131        tmp = of_get_associativity(device);
 132        if (tmp && (tmp[0] >= min_common_depth)) {
 133                numa_domain = tmp[min_common_depth];
 134        } else {
 135                dbg("WARNING: no NUMA information for %s\n",
 136                    device->full_name);
 137                numa_domain = 0;
 138        }
 139        return numa_domain;
 140}
 141
 142/*
 143 * In theory, the "ibm,associativity" property may contain multiple
 144 * associativity lists because a resource may be multiply connected
 145 * into the machine.  This resource then has different associativity
 146 * characteristics relative to its multiple connections.  We ignore
 147 * this for now.  We also assume that all cpu and memory sets have
 148 * their distances represented at a common level.  This won't be
 149 * true for heirarchical NUMA.
 150 *
 151 * In any case the ibm,associativity-reference-points should give
 152 * the correct depth for a normal NUMA system.
 153 *
 154 * - Dave Hansen <haveblue@us.ibm.com>
 155 */
 156static int __init find_min_common_depth(void)
 157{
 158        int depth;
 159        unsigned int *ref_points;
 160        struct device_node *rtas_root;
 161        unsigned int len;
 162
 163        rtas_root = of_find_node_by_path("/rtas");
 164
 165        if (!rtas_root)
 166                return -1;
 167
 168        /*
 169         * this property is 2 32-bit integers, each representing a level of
 170         * depth in the associativity nodes.  The first is for an SMP
 171         * configuration (should be all 0's) and the second is for a normal
 172         * NUMA configuration.
 173         */
 174        ref_points = (unsigned int *)get_property(rtas_root,
 175                        "ibm,associativity-reference-points", &len);
 176
 177        if ((len >= 1) && ref_points) {
 178                depth = ref_points[1];
 179        } else {
 180                dbg("WARNING: could not find NUMA "
 181                    "associativity reference point\n");
 182                depth = -1;
 183        }
 184        of_node_put(rtas_root);
 185
 186        return depth;
 187}
 188
 189static unsigned long read_cell_ul(struct device_node *device, unsigned int **buf)
 190{
 191        int i;
 192        unsigned long result = 0;
 193
 194        i = prom_n_size_cells(device);
 195        /* bug on i>2 ?? */
 196        while (i--) {
 197                result = (result << 32) | **buf;
 198                (*buf)++;
 199        }
 200        return result;
 201}
 202
 203/*
 204 * Figure out to which domain a cpu belongs and stick it there.
 205 * Return the id of the domain used.
 206 */
 207static int numa_setup_cpu(unsigned long lcpu)
 208{
 209        int numa_domain = 0;
 210        struct device_node *cpu = find_cpu_node(lcpu);
 211
 212        if (!cpu) {
 213                WARN_ON(1);
 214                goto out;
 215        }
 216
 217        numa_domain = of_node_numa_domain(cpu);
 218
 219        if (numa_domain >= num_online_nodes()) {
 220                /*
 221                 * POWER4 LPAR uses 0xffff as invalid node,
 222                 * dont warn in this case.
 223                 */
 224                if (numa_domain != 0xffff)
 225                        printk(KERN_ERR "WARNING: cpu %ld "
 226                               "maps to invalid NUMA node %d\n",
 227                               lcpu, numa_domain);
 228                numa_domain = 0;
 229        }
 230out:
 231        node_set_online(numa_domain);
 232
 233        map_cpu_to_node(lcpu, numa_domain);
 234
 235        of_node_put(cpu);
 236
 237        return numa_domain;
 238}
 239
 240static int cpu_numa_callback(struct notifier_block *nfb,
 241                             unsigned long action,
 242                             void *hcpu)
 243{
 244        unsigned long lcpu = (unsigned long)hcpu;
 245        int ret = NOTIFY_DONE;
 246
 247        switch (action) {
 248        case CPU_UP_PREPARE:
 249                if (min_common_depth == -1 || !numa_enabled)
 250                        map_cpu_to_node(lcpu, 0);
 251                else
 252                        numa_setup_cpu(lcpu);
 253                ret = NOTIFY_OK;
 254                break;
 255#ifdef CONFIG_HOTPLUG_CPU
 256        case CPU_DEAD:
 257        case CPU_UP_CANCELED:
 258                unmap_cpu_from_node(lcpu);
 259                break;
 260                ret = NOTIFY_OK;
 261#endif
 262        }
 263        return ret;
 264}
 265
 266static int __init parse_numa_properties(void)
 267{
 268        struct device_node *cpu = NULL;
 269        struct device_node *memory = NULL;
 270        int max_domain = 0;
 271        long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT;
 272        unsigned long i;
 273
 274        if (numa_enabled == 0) {
 275                printk(KERN_WARNING "NUMA disabled by user\n");
 276                return -1;
 277        }
 278
 279        numa_memory_lookup_table =
 280                (char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
 281        memset(numa_memory_lookup_table, 0, entries * sizeof(char));
 282
 283        for (i = 0; i < entries ; i++)
 284                numa_memory_lookup_table[i] = ARRAY_INITIALISER;
 285
 286        min_common_depth = find_min_common_depth();
 287
 288        dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
 289        if (min_common_depth < 0)
 290                return min_common_depth;
 291
 292        max_domain = numa_setup_cpu(boot_cpuid);
 293
 294        /*
 295         * Even though we connect cpus to numa domains later in SMP init,
 296         * we need to know the maximum node id now. This is because each
 297         * node id must have NODE_DATA etc backing it.
 298         * As a result of hotplug we could still have cpus appear later on
 299         * with larger node ids. In that case we force the cpu into node 0.
 300         */
 301        for_each_cpu(i) {
 302                int numa_domain;
 303
 304                cpu = find_cpu_node(i);
 305
 306                if (cpu) {
 307                        numa_domain = of_node_numa_domain(cpu);
 308                        of_node_put(cpu);
 309
 310                        if (numa_domain < MAX_NUMNODES &&
 311                            max_domain < numa_domain)
 312                                max_domain = numa_domain;
 313                }
 314        }
 315
 316        memory = NULL;
 317        while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
 318                unsigned long start;
 319                unsigned long size;
 320                int numa_domain;
 321                int ranges;
 322                unsigned int *memcell_buf;
 323                unsigned int len;
 324
 325                memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
 326                if (!memcell_buf || len <= 0)
 327                        continue;
 328
 329                ranges = memory->n_addrs;
 330new_range:
 331                /* these are order-sensitive, and modify the buffer pointer */
 332                start = read_cell_ul(memory, &memcell_buf);
 333                size = read_cell_ul(memory, &memcell_buf);
 334
 335                start = _ALIGN_DOWN(start, MEMORY_INCREMENT);
 336                size = _ALIGN_UP(size, MEMORY_INCREMENT);
 337
 338                numa_domain = of_node_numa_domain(memory);
 339
 340                if (numa_domain >= MAX_NUMNODES) {
 341                        if (numa_domain != 0xffff)
 342                                printk(KERN_ERR "WARNING: memory at %lx maps "
 343                                       "to invalid NUMA node %d\n", start,
 344                                       numa_domain);
 345                        numa_domain = 0;
 346                }
 347
 348                if (max_domain < numa_domain)
 349                        max_domain = numa_domain;
 350
 351                /* 
 352                 * For backwards compatibility, OF splits the first node
 353                 * into two regions (the first being 0-4GB). Check for
 354                 * this simple case and complain if there is a gap in
 355                 * memory
 356                 */
 357                if (init_node_data[numa_domain].node_spanned_pages) {
 358                        unsigned long shouldstart =
 359                                init_node_data[numa_domain].node_start_pfn +
 360                                init_node_data[numa_domain].node_spanned_pages;
 361                        if (shouldstart != (start / PAGE_SIZE)) {
 362                                /* Revert to non-numa for now */
 363                                printk(KERN_ERR
 364                                       "WARNING: Unexpected node layout: "
 365                                       "region start %lx length %lx\n",
 366                                       start, size);
 367                                printk(KERN_ERR "NUMA is disabled\n");
 368                                goto err;
 369                        }
 370                        init_node_data[numa_domain].node_spanned_pages +=
 371                                size / PAGE_SIZE;
 372                } else {
 373                        node_set_online(numa_domain);
 374
 375                        init_node_data[numa_domain].node_start_pfn =
 376                                start / PAGE_SIZE;
 377                        init_node_data[numa_domain].node_spanned_pages =
 378                                size / PAGE_SIZE;
 379                }
 380
 381                for (i = start ; i < (start+size); i += MEMORY_INCREMENT)
 382                        numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] =
 383                                numa_domain;
 384
 385                ranges--;
 386                if (ranges)
 387                        goto new_range;
 388        }
 389
 390        for (i = 0; i <= max_domain; i++)
 391                node_set_online(i);
 392
 393        return 0;
 394err:
 395        /* Something has gone wrong; revert any setup we've done */
 396        for_each_node(i) {
 397                node_set_offline(i);
 398                init_node_data[i].node_start_pfn = 0;
 399                init_node_data[i].node_spanned_pages = 0;
 400        }
 401        return -1;
 402}
 403
 404static void __init setup_nonnuma(void)
 405{
 406        unsigned long top_of_ram = lmb_end_of_DRAM();
 407        unsigned long total_ram = lmb_phys_mem_size();
 408        unsigned long i;
 409
 410        printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
 411               top_of_ram, total_ram);
 412        printk(KERN_INFO "Memory hole size: %ldMB\n",
 413               (top_of_ram - total_ram) >> 20);
 414
 415        if (!numa_memory_lookup_table) {
 416                long entries = top_of_ram >> MEMORY_INCREMENT_SHIFT;
 417                numa_memory_lookup_table =
 418                        (char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
 419                memset(numa_memory_lookup_table, 0, entries * sizeof(char));
 420                for (i = 0; i < entries ; i++)
 421                        numa_memory_lookup_table[i] = ARRAY_INITIALISER;
 422        }
 423
 424        map_cpu_to_node(boot_cpuid, 0);
 425
 426        node_set_online(0);
 427
 428        init_node_data[0].node_start_pfn = 0;
 429        init_node_data[0].node_spanned_pages = lmb_end_of_DRAM() / PAGE_SIZE;
 430
 431        for (i = 0 ; i < top_of_ram; i += MEMORY_INCREMENT)
 432                numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = 0;
 433
 434        node0_io_hole_size = top_of_ram - total_ram;
 435}
 436
 437static void __init dump_numa_topology(void)
 438{
 439        unsigned int node;
 440        unsigned int count;
 441
 442        if (min_common_depth == -1 || !numa_enabled)
 443                return;
 444
 445        for_each_online_node(node) {
 446                unsigned long i;
 447
 448                printk(KERN_INFO "Node %d Memory:", node);
 449
 450                count = 0;
 451
 452                for (i = 0; i < lmb_end_of_DRAM(); i += MEMORY_INCREMENT) {
 453                        if (numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] == node) {
 454                                if (count == 0)
 455                                        printk(" 0x%lx", i);
 456                                ++count;
 457                        } else {
 458                                if (count > 0)
 459                                        printk("-0x%lx", i);
 460                                count = 0;
 461                        }
 462                }
 463
 464                if (count > 0)
 465                        printk("-0x%lx", i);
 466                printk("\n");
 467        }
 468        return;
 469}
 470
 471/*
 472 * Allocate some memory, satisfying the lmb or bootmem allocator where
 473 * required. nid is the preferred node and end is the physical address of
 474 * the highest address in the node.
 475 *
 476 * Returns the physical address of the memory.
 477 */
 478static unsigned long careful_allocation(int nid, unsigned long size,
 479                                        unsigned long align, unsigned long end)
 480{
 481        unsigned long ret = lmb_alloc_base(size, align, end);
 482
 483        /* retry over all memory */
 484        if (!ret)
 485                ret = lmb_alloc_base(size, align, lmb_end_of_DRAM());
 486
 487        if (!ret)
 488                panic("numa.c: cannot allocate %lu bytes on node %d",
 489                      size, nid);
 490
 491        /*
 492         * If the memory came from a previously allocated node, we must
 493         * retry with the bootmem allocator.
 494         */
 495        if (pa_to_nid(ret) < nid) {
 496                nid = pa_to_nid(ret);
 497                ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(nid),
 498                                size, align, 0);
 499
 500                if (!ret)
 501                        panic("numa.c: cannot allocate %lu bytes on node %d",
 502                              size, nid);
 503
 504                ret = virt_to_abs(ret);
 505
 506                dbg("alloc_bootmem %lx %lx\n", ret, size);
 507        }
 508
 509        return ret;
 510}
 511
 512void __init do_init_bootmem(void)
 513{
 514        int nid;
 515        static struct notifier_block ppc64_numa_nb = {
 516                .notifier_call = cpu_numa_callback,
 517                .priority = 1 /* Must run before sched domains notifier. */
 518        };
 519
 520        min_low_pfn = 0;
 521        max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
 522        max_pfn = max_low_pfn;
 523
 524        if (parse_numa_properties())
 525                setup_nonnuma();
 526        else
 527                dump_numa_topology();
 528
 529        register_cpu_notifier(&ppc64_numa_nb);
 530
 531        for_each_online_node(nid) {
 532                unsigned long start_paddr, end_paddr;
 533                int i;
 534                unsigned long bootmem_paddr;
 535                unsigned long bootmap_pages;
 536
 537                start_paddr = init_node_data[nid].node_start_pfn * PAGE_SIZE;
 538                end_paddr = start_paddr + (init_node_data[nid].node_spanned_pages * PAGE_SIZE);
 539
 540                /* Allocate the node structure node local if possible */
 541                NODE_DATA(nid) = (struct pglist_data *)careful_allocation(nid,
 542                                        sizeof(struct pglist_data),
 543                                        SMP_CACHE_BYTES, end_paddr);
 544                NODE_DATA(nid) = abs_to_virt(NODE_DATA(nid));
 545                memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
 546
 547                dbg("node %d\n", nid);
 548                dbg("NODE_DATA() = %p\n", NODE_DATA(nid));
 549
 550                NODE_DATA(nid)->bdata = &plat_node_bdata[nid];
 551                NODE_DATA(nid)->node_start_pfn =
 552                        init_node_data[nid].node_start_pfn;
 553                NODE_DATA(nid)->node_spanned_pages =
 554                        init_node_data[nid].node_spanned_pages;
 555
 556                if (init_node_data[nid].node_spanned_pages == 0)
 557                        continue;
 558
 559                dbg("start_paddr = %lx\n", start_paddr);
 560                dbg("end_paddr = %lx\n", end_paddr);
 561
 562                bootmap_pages = bootmem_bootmap_pages((end_paddr - start_paddr) >> PAGE_SHIFT);
 563
 564                bootmem_paddr = careful_allocation(nid,
 565                                bootmap_pages << PAGE_SHIFT,
 566                                PAGE_SIZE, end_paddr);
 567                memset(abs_to_virt(bootmem_paddr), 0,
 568                       bootmap_pages << PAGE_SHIFT);
 569                dbg("bootmap_paddr = %lx\n", bootmem_paddr);
 570
 571                init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT,
 572                                  start_paddr >> PAGE_SHIFT,
 573                                  end_paddr >> PAGE_SHIFT);
 574
 575                for (i = 0; i < lmb.memory.cnt; i++) {
 576                        unsigned long physbase, size;
 577
 578                        physbase = lmb.memory.region[i].physbase;
 579                        size = lmb.memory.region[i].size;
 580
 581                        if (physbase < end_paddr &&
 582                            (physbase+size) > start_paddr) {
 583                                /* overlaps */
 584                                if (physbase < start_paddr) {
 585                                        size -= start_paddr - physbase;
 586                                        physbase = start_paddr;
 587                                }
 588
 589                                if (size > end_paddr - physbase)
 590                                        size = end_paddr - physbase;
 591
 592                                dbg("free_bootmem %lx %lx\n", physbase, size);
 593                                free_bootmem_node(NODE_DATA(nid), physbase,
 594                                                  size);
 595                        }
 596                }
 597
 598                for (i = 0; i < lmb.reserved.cnt; i++) {
 599                        unsigned long physbase = lmb.reserved.region[i].physbase;
 600                        unsigned long size = lmb.reserved.region[i].size;
 601
 602                        if (physbase < end_paddr &&
 603                            (physbase+size) > start_paddr) {
 604                                /* overlaps */
 605                                if (physbase < start_paddr) {
 606                                        size -= start_paddr - physbase;
 607                                        physbase = start_paddr;
 608                                }
 609
 610                                if (size > end_paddr - physbase)
 611                                        size = end_paddr - physbase;
 612
 613                                dbg("reserve_bootmem %lx %lx\n", physbase,
 614                                    size);
 615                                reserve_bootmem_node(NODE_DATA(nid), physbase,
 616                                                     size);
 617                        }
 618                }
 619        }
 620}
 621
 622void __init paging_init(void)
 623{
 624        unsigned long zones_size[MAX_NR_ZONES];
 625        unsigned long zholes_size[MAX_NR_ZONES];
 626        int nid;
 627
 628        memset(zones_size, 0, sizeof(zones_size));
 629        memset(zholes_size, 0, sizeof(zholes_size));
 630
 631        for_each_online_node(nid) {
 632                unsigned long start_pfn;
 633                unsigned long end_pfn;
 634
 635                start_pfn = plat_node_bdata[nid].node_boot_start >> PAGE_SHIFT;
 636                end_pfn = plat_node_bdata[nid].node_low_pfn;
 637
 638                zones_size[ZONE_DMA] = end_pfn - start_pfn;
 639                zholes_size[ZONE_DMA] = 0;
 640                if (nid == 0)
 641                        zholes_size[ZONE_DMA] = node0_io_hole_size >> PAGE_SHIFT;
 642
 643                dbg("free_area_init node %d %lx %lx (hole: %lx)\n", nid,
 644                    zones_size[ZONE_DMA], start_pfn, zholes_size[ZONE_DMA]);
 645
 646                free_area_init_node(nid, NODE_DATA(nid), zones_size,
 647                                                        start_pfn, zholes_size);
 648        }
 649}
 650
 651static int __init early_numa(char *p)
 652{
 653        if (!p)
 654                return 0;
 655
 656        if (strstr(p, "off"))
 657                numa_enabled = 0;
 658
 659        if (strstr(p, "debug"))
 660                numa_debug = 1;
 661
 662        return 0;
 663}
 664early_param("numa", early_numa);
 665
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.