linux-bk/arch/ppc64/mm/numa.c
<<
>>
Prefs
   1/*
   2 * pSeries NUMA support
   3 *
   4 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
   5 *
   6 * This program is free software; you can redistribute it and/or
   7 * modify it under the terms of the GNU General Public License
   8 * as published by the Free Software Foundation; either version
   9 * 2 of the License, or (at your option) any later version.
  10 */
  11#include <linux/threads.h>
  12#include <linux/bootmem.h>
  13#include <linux/init.h>
  14#include <linux/mm.h>
  15#include <linux/mmzone.h>
  16#include <linux/module.h>
  17#include <asm/lmb.h>
  18#include <asm/machdep.h>
  19#include <asm/abs_addr.h>
  20
/*
 * Debug output helper.  The "#if 1" leaves it unconditionally enabled, so
 * every dbg() call expands to a printk() at KERN_INFO level; flip it to
 * "#if 0" to compile the messages out.
 */
#if 1
#define dbg(args...) printk(KERN_INFO args)
#else
#define dbg(args...)
#endif

/*
 * Fill value for the lookup tables below: -1 when DEBUG_NUMA is defined,
 * 0 (i.e. node 0) otherwise.
 */
#ifdef DEBUG_NUMA
#define ARRAY_INITIALISER -1
#else
#define ARRAY_INITIALISER 0
#endif

/* Indexed by logical cpu number; holds the node written by map_cpu_to_node(). */
int numa_cpu_lookup_table[NR_CPUS] = { [ 0 ... (NR_CPUS - 1)] =
        ARRAY_INITIALISER};
/*
 * Indexed by (address >> MEMORY_INCREMENT_SHIFT); holds the node owning that
 * chunk of memory.  Allocated from the lmb allocator during early boot.
 */
char *numa_memory_lookup_table;
/* Per node: mask of the cpus mapped to it by map_cpu_to_node(). */
cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
/* Per node: number of distinct cpus set in the mask above. */
int nr_cpus_in_node[MAX_NUMNODES] = { [0 ... (MAX_NUMNODES -1)] = 0};

/* Per node page data; node_start_pfn/node_spanned_pages are filled in here. */
struct pglist_data node_data[MAX_NUMNODES];
/* Per node bootmem state, attached to NODE_DATA(nid)->bdata in do_init_bootmem(). */
bootmem_data_t plat_node_bdata[MAX_NUMNODES];
/*
 * Bytes between the top of RAM and the total RAM size; only set by
 * setup_nonnuma() and reported as the node 0 zone hole in paging_init().
 */
static unsigned long node0_io_hole_size;

EXPORT_SYMBOL(node_data);
EXPORT_SYMBOL(numa_cpu_lookup_table);
EXPORT_SYMBOL(numa_memory_lookup_table);
EXPORT_SYMBOL(numa_cpumask_lookup_table);
EXPORT_SYMBOL(nr_cpus_in_node);
  48
  49static inline void map_cpu_to_node(int cpu, int node)
  50{
  51        dbg("cpu %d maps to domain %d\n", cpu, node);
  52        numa_cpu_lookup_table[cpu] = node;
  53        if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node]))) {
  54                cpu_set(cpu, numa_cpumask_lookup_table[node]);
  55                nr_cpus_in_node[node]++;
  56        }
  57}
  58
  59static struct device_node * __init find_cpu_node(unsigned int cpu)
  60{
  61        unsigned int hw_cpuid = get_hard_smp_processor_id(cpu);
  62        struct device_node *cpu_node = NULL;
  63        unsigned int *interrupt_server, *reg;
  64        int len;
  65
  66        while ((cpu_node = of_find_node_by_type(cpu_node, "cpu")) != NULL) {
  67                /* Try interrupt server first */
  68                interrupt_server = (unsigned int *)get_property(cpu_node,
  69                                        "ibm,ppc-interrupt-server#s", &len);
  70
  71                if (interrupt_server && (len > 0)) {
  72                        while (len--) {
  73                                if (interrupt_server[len-1] == hw_cpuid)
  74                                        return cpu_node;
  75                        }
  76                } else {
  77                        reg = (unsigned int *)get_property(cpu_node,
  78                                                           "reg", &len);
  79                        if (reg && (len > 0) && (reg[0] == hw_cpuid))
  80                                return cpu_node;
  81                }
  82        }
  83
  84        return NULL;
  85}
  86
  87/* must hold reference to node during call */
  88static int *of_get_associativity(struct device_node *dev)
  89 {
  90        unsigned int *result;
  91        int len;
  92
  93        result = (unsigned int *)get_property(dev, "ibm,associativity", &len);
  94
  95        if (len <= 0)
  96                return NULL;
  97
  98        return result;
  99}
 100
 101static int of_node_numa_domain(struct device_node *device, int depth)
 102{
 103        int numa_domain;
 104        unsigned int *tmp;
 105
 106        tmp = of_get_associativity(device);
 107        if (tmp && (tmp[0] >= depth)) {
 108                numa_domain = tmp[depth];
 109        } else {
 110                printk(KERN_ERR "WARNING: no NUMA information for "
 111                       "%s\n", device->full_name);
 112                numa_domain = 0;
 113        }
 114        return numa_domain;
 115}
 116
 117/*
 118 * In theory, the "ibm,associativity" property may contain multiple
 119 * associativity lists because a resource may be multiply connected
 120 * into the machine.  This resource then has different associativity
 121 * characteristics relative to its multiple connections.  We ignore
 122 * this for now.  We also assume that all cpu and memory sets have
 123 * their distances represented at a common level.  This won't be
 124 * true for heirarchical NUMA.
 125 *
 126 * In any case the ibm,associativity-reference-points should give
 127 * the correct depth for a normal NUMA system.
 128 *
 129 * - Dave Hansen <haveblue@us.ibm.com>
 130 */
 131static int find_min_common_depth(void)
 132{
 133        int depth;
 134        unsigned int *ref_points;
 135        struct device_node *rtas_root;
 136        unsigned int len;
 137
 138        rtas_root = of_find_node_by_path("/rtas");
 139
 140        if (!rtas_root) {
 141                printk(KERN_ERR "WARNING: %s() could not find rtas root\n",
 142                                __FUNCTION__);
 143                return -1;
 144        }
 145
 146        /*
 147         * this property is 2 32-bit integers, each representing a level of
 148         * depth in the associativity nodes.  The first is for an SMP
 149         * configuration (should be all 0's) and the second is for a normal
 150         * NUMA configuration.
 151         */
 152        ref_points = (unsigned int *)get_property(rtas_root,
 153                        "ibm,associativity-reference-points", &len);
 154
 155        if ((len >= 1) && ref_points) {
 156                depth = ref_points[1];
 157        } else {
 158                printk(KERN_ERR "WARNING: could not find NUMA "
 159                                "associativity reference point\n");
 160                depth = -1;
 161        }
 162        of_node_put(rtas_root);
 163
 164        return depth;
 165}
 166
 167static unsigned long read_cell_ul(struct device_node *device, unsigned int **buf)
 168{
 169        int i;
 170        unsigned long result = 0;
 171
 172        i = prom_n_size_cells(device);
 173        /* bug on i>2 ?? */
 174        while (i--) {
 175                result = (result << 32) | **buf;
 176                (*buf)++;
 177        }
 178        return result;
 179}
 180
 181static int __init parse_numa_properties(void)
 182{
 183        struct device_node *cpu = NULL;
 184        struct device_node *memory = NULL;
 185        int depth;
 186        int max_domain = 0;
 187        long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT;
 188        unsigned long i;
 189
 190        if (strstr(saved_command_line, "numa=off")) {
 191                printk(KERN_WARNING "NUMA disabled by user\n");
 192                return -1;
 193        }
 194
 195        numa_memory_lookup_table =
 196                (char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
 197
 198        for (i = 0; i < entries ; i++)
 199                numa_memory_lookup_table[i] = ARRAY_INITIALISER;
 200
 201        depth = find_min_common_depth();
 202
 203        printk(KERN_INFO "NUMA associativity depth for CPU/Memory: %d\n", depth);
 204        if (depth < 0)
 205                return depth;
 206
 207        for_each_cpu(i) {
 208                int numa_domain;
 209
 210                cpu = find_cpu_node(i);
 211
 212                if (cpu) {
 213                        numa_domain = of_node_numa_domain(cpu, depth);
 214                        of_node_put(cpu);
 215
 216                        if (numa_domain >= MAX_NUMNODES) {
 217                                /*
 218                                 * POWER4 LPAR uses 0xffff as invalid node,
 219                                 * dont warn in this case.
 220                                 */
 221                                if (numa_domain != 0xffff)
 222                                        printk(KERN_ERR "WARNING: cpu %ld "
 223                                               "maps to invalid NUMA node %d\n",
 224                                               i, numa_domain);
 225                                numa_domain = 0;
 226                        }
 227                } else {
 228                        printk(KERN_ERR "WARNING: no NUMA information for "
 229                               "cpu %ld\n", i);
 230                        numa_domain = 0;
 231                }
 232
 233                node_set_online(numa_domain);
 234
 235                if (max_domain < numa_domain)
 236                        max_domain = numa_domain;
 237
 238                map_cpu_to_node(i, numa_domain);
 239        }
 240
 241        memory = NULL;
 242        while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
 243                unsigned long start;
 244                unsigned long size;
 245                int numa_domain;
 246                int ranges;
 247                unsigned int *memcell_buf;
 248                unsigned int len;
 249
 250                memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
 251                if (!memcell_buf || len <= 0)
 252                        continue;
 253
 254                ranges = memory->n_addrs;
 255new_range:
 256                /* these are order-sensitive, and modify the buffer pointer */
 257                start = read_cell_ul(memory, &memcell_buf);
 258                size = read_cell_ul(memory, &memcell_buf);
 259
 260                start = _ALIGN_DOWN(start, MEMORY_INCREMENT);
 261                size = _ALIGN_UP(size, MEMORY_INCREMENT);
 262
 263                numa_domain = of_node_numa_domain(memory, depth);
 264
 265                if (numa_domain >= MAX_NUMNODES) {
 266                        if (numa_domain != 0xffff)
 267                                printk(KERN_ERR "WARNING: memory at %lx maps "
 268                                       "to invalid NUMA node %d\n", start,
 269                                       numa_domain);
 270                        numa_domain = 0;
 271                }
 272
 273                node_set_online(numa_domain);
 274
 275                if (max_domain < numa_domain)
 276                        max_domain = numa_domain;
 277
 278                /* 
 279                 * For backwards compatibility, OF splits the first node
 280                 * into two regions (the first being 0-4GB). Check for
 281                 * this simple case and complain if there is a gap in
 282                 * memory
 283                 */
 284                if (node_data[numa_domain].node_spanned_pages) {
 285                        unsigned long shouldstart =
 286                                node_data[numa_domain].node_start_pfn + 
 287                                node_data[numa_domain].node_spanned_pages;
 288                        if (shouldstart != (start / PAGE_SIZE)) {
 289                                printk(KERN_ERR "Hole in node, disabling "
 290                                                "region start %lx length %lx\n",
 291                                                start, size);
 292                                continue;
 293                        }
 294                        node_data[numa_domain].node_spanned_pages +=
 295                                size / PAGE_SIZE;
 296                } else {
 297                        node_data[numa_domain].node_start_pfn =
 298                                start / PAGE_SIZE;
 299                        node_data[numa_domain].node_spanned_pages =
 300                                size / PAGE_SIZE;
 301                }
 302
 303                for (i = start ; i < (start+size); i += MEMORY_INCREMENT)
 304                        numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] =
 305                                numa_domain;
 306
 307                dbg("memory region %lx to %lx maps to domain %d\n",
 308                    start, start+size, numa_domain);
 309
 310                ranges--;
 311                if (ranges)
 312                        goto new_range;
 313        }
 314
 315        numnodes = max_domain + 1;
 316
 317        return 0;
 318}
 319
 320static void __init setup_nonnuma(void)
 321{
 322        unsigned long top_of_ram = lmb_end_of_DRAM();
 323        unsigned long total_ram = lmb_phys_mem_size();
 324        unsigned long i;
 325
 326        printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
 327               top_of_ram, total_ram);
 328        printk(KERN_INFO "Memory hole size: %ldMB\n",
 329               (top_of_ram - total_ram) >> 20);
 330
 331        if (!numa_memory_lookup_table) {
 332                long entries = top_of_ram >> MEMORY_INCREMENT_SHIFT;
 333                numa_memory_lookup_table =
 334                        (char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
 335                for (i = 0; i < entries ; i++)
 336                        numa_memory_lookup_table[i] = ARRAY_INITIALISER;
 337        }
 338
 339        for (i = 0; i < NR_CPUS; i++)
 340                map_cpu_to_node(i, 0);
 341
 342        node_set_online(0);
 343
 344        node_data[0].node_start_pfn = 0;
 345        node_data[0].node_spanned_pages = lmb_end_of_DRAM() / PAGE_SIZE;
 346
 347        for (i = 0 ; i < top_of_ram; i += MEMORY_INCREMENT)
 348                numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = 0;
 349
 350        node0_io_hole_size = top_of_ram - total_ram;
 351}
 352
 353void __init do_init_bootmem(void)
 354{
 355        int nid;
 356
 357        min_low_pfn = 0;
 358        max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
 359        max_pfn = max_low_pfn;
 360
 361        if (parse_numa_properties())
 362                setup_nonnuma();
 363
 364        for (nid = 0; nid < numnodes; nid++) {
 365                unsigned long start_paddr, end_paddr;
 366                int i;
 367                unsigned long bootmem_paddr;
 368                unsigned long bootmap_pages;
 369
 370                if (node_data[nid].node_spanned_pages == 0)
 371                        continue;
 372
 373                start_paddr = node_data[nid].node_start_pfn * PAGE_SIZE;
 374                end_paddr = start_paddr + 
 375                                (node_data[nid].node_spanned_pages * PAGE_SIZE);
 376
 377                dbg("node %d\n", nid);
 378                dbg("start_paddr = %lx\n", start_paddr);
 379                dbg("end_paddr = %lx\n", end_paddr);
 380
 381                NODE_DATA(nid)->bdata = &plat_node_bdata[nid];
 382
 383                bootmap_pages = bootmem_bootmap_pages((end_paddr - start_paddr) >> PAGE_SHIFT);
 384                dbg("bootmap_pages = %lx\n", bootmap_pages);
 385
 386                bootmem_paddr = lmb_alloc_base(bootmap_pages << PAGE_SHIFT,
 387                                PAGE_SIZE, end_paddr);
 388                dbg("bootmap_paddr = %lx\n", bootmem_paddr);
 389
 390                init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT,
 391                                  start_paddr >> PAGE_SHIFT,
 392                                  end_paddr >> PAGE_SHIFT);
 393
 394                for (i = 0; i < lmb.memory.cnt; i++) {
 395                        unsigned long physbase, size;
 396
 397                        physbase = lmb.memory.region[i].physbase;
 398                        size = lmb.memory.region[i].size;
 399
 400                        if (physbase < end_paddr &&
 401                            (physbase+size) > start_paddr) {
 402                                /* overlaps */
 403                                if (physbase < start_paddr) {
 404                                        size -= start_paddr - physbase;
 405                                        physbase = start_paddr;
 406                                }
 407
 408                                if (size > end_paddr - physbase)
 409                                        size = end_paddr - physbase;
 410
 411                                dbg("free_bootmem %lx %lx\n", physbase, size);
 412                                free_bootmem_node(NODE_DATA(nid), physbase,
 413                                                  size);
 414                        }
 415                }
 416
 417                for (i = 0; i < lmb.reserved.cnt; i++) {
 418                        unsigned long physbase = lmb.reserved.region[i].physbase;
 419                        unsigned long size = lmb.reserved.region[i].size;
 420
 421                        if (physbase < end_paddr &&
 422                            (physbase+size) > start_paddr) {
 423                                /* overlaps */
 424                                if (physbase < start_paddr) {
 425                                        size -= start_paddr - physbase;
 426                                        physbase = start_paddr;
 427                                }
 428
 429                                if (size > end_paddr - physbase)
 430                                        size = end_paddr - physbase;
 431
 432                                dbg("reserve_bootmem %lx %lx\n", physbase,
 433                                    size);
 434                                reserve_bootmem_node(NODE_DATA(nid), physbase,
 435                                                     size);
 436                        }
 437                }
 438        }
 439}
 440
 441void __init paging_init(void)
 442{
 443        unsigned long zones_size[MAX_NR_ZONES];
 444        unsigned long zholes_size[MAX_NR_ZONES];
 445        struct page *node_mem_map; 
 446        int nid;
 447
 448        memset(zones_size, 0, sizeof(zones_size));
 449        memset(zholes_size, 0, sizeof(zholes_size));
 450
 451        for (nid = 0; nid < numnodes; nid++) {
 452                unsigned long start_pfn;
 453                unsigned long end_pfn;
 454
 455                start_pfn = plat_node_bdata[nid].node_boot_start >> PAGE_SHIFT;
 456                end_pfn = plat_node_bdata[nid].node_low_pfn;
 457
 458                zones_size[ZONE_DMA] = end_pfn - start_pfn;
 459                zholes_size[ZONE_DMA] = 0;
 460                if (nid == 0)
 461                        zholes_size[ZONE_DMA] = node0_io_hole_size >> PAGE_SHIFT;
 462
 463                dbg("free_area_init node %d %lx %lx (hole: %lx)\n", nid,
 464                    zones_size[ZONE_DMA], start_pfn, zholes_size[ZONE_DMA]);
 465
 466                /* 
 467                 * Give this empty node a dummy struct page to avoid
 468                 * us from trying to allocate a node local mem_map
 469                 * in free_area_init_node (which will fail).
 470                 */
 471                if (!node_data[nid].node_spanned_pages)
 472                        node_mem_map = alloc_bootmem(sizeof(struct page));
 473                else
 474                        node_mem_map = NULL;
 475
 476                free_area_init_node(nid, NODE_DATA(nid), node_mem_map,
 477                                    zones_size, start_pfn, zholes_size);
 478        }
 479}
 480
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.