linux/arch/powerpc/mm/numa.c
<<
>>
Prefs
   1/*
   2 * pSeries NUMA support
   3 *
   4 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
   5 *
   6 * This program is free software; you can redistribute it and/or
   7 * modify it under the terms of the GNU General Public License
   8 * as published by the Free Software Foundation; either version
   9 * 2 of the License, or (at your option) any later version.
  10 */
  11#include <linux/threads.h>
  12#include <linux/bootmem.h>
  13#include <linux/init.h>
  14#include <linux/mm.h>
  15#include <linux/mmzone.h>
  16#include <linux/module.h>
  17#include <linux/nodemask.h>
  18#include <linux/cpu.h>
  19#include <linux/notifier.h>
  20#include <asm/sparsemem.h>
  21#include <asm/lmb.h>
  22#include <asm/system.h>
  23#include <asm/smp.h>
  24
  25static int numa_enabled = 1;
  26
  27static char *cmdline __initdata;
  28
  29static int numa_debug;
  30#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }
  31
  32int numa_cpu_lookup_table[NR_CPUS];
  33cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
  34struct pglist_data *node_data[MAX_NUMNODES];
  35
  36EXPORT_SYMBOL(numa_cpu_lookup_table);
  37EXPORT_SYMBOL(numa_cpumask_lookup_table);
  38EXPORT_SYMBOL(node_data);
  39
  40static bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES];
  41static int min_common_depth;
  42static int n_mem_addr_cells, n_mem_size_cells;
  43
  44static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn,
  45                                                unsigned int *nid)
  46{
  47        unsigned long long mem;
  48        char *p = cmdline;
  49        static unsigned int fake_nid;
  50        static unsigned long long curr_boundary;
  51
  52        /*
  53         * Modify node id, iff we started creating NUMA nodes
  54         * We want to continue from where we left of the last time
  55         */
  56        if (fake_nid)
  57                *nid = fake_nid;
  58        /*
  59         * In case there are no more arguments to parse, the
  60         * node_id should be the same as the last fake node id
  61         * (we've handled this above).
  62         */
  63        if (!p)
  64                return 0;
  65
  66        mem = memparse(p, &p);
  67        if (!mem)
  68                return 0;
  69
  70        if (mem < curr_boundary)
  71                return 0;
  72
  73        curr_boundary = mem;
  74
  75        if ((end_pfn << PAGE_SHIFT) > mem) {
  76                /*
  77                 * Skip commas and spaces
  78                 */
  79                while (*p == ',' || *p == ' ' || *p == '\t')
  80                        p++;
  81
  82                cmdline = p;
  83                fake_nid++;
  84                *nid = fake_nid;
  85                dbg("created new fake_node with id %d\n", fake_nid);
  86                return 1;
  87        }
  88        return 0;
  89}
  90
  91static void __cpuinit map_cpu_to_node(int cpu, int node)
  92{
  93        numa_cpu_lookup_table[cpu] = node;
  94
  95        dbg("adding cpu %d to node %d\n", cpu, node);
  96
  97        if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node])))
  98                cpu_set(cpu, numa_cpumask_lookup_table[node]);
  99}
 100
 101#ifdef CONFIG_HOTPLUG_CPU
 102static void unmap_cpu_from_node(unsigned long cpu)
 103{
 104        int node = numa_cpu_lookup_table[cpu];
 105
 106        dbg("removing cpu %lu from node %d\n", cpu, node);
 107
 108        if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
 109                cpu_clear(cpu, numa_cpumask_lookup_table[node]);
 110        } else {
 111                printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
 112                       cpu, node);
 113        }
 114}
 115#endif /* CONFIG_HOTPLUG_CPU */
 116
 117static struct device_node * __cpuinit find_cpu_node(unsigned int cpu)
 118{
 119        unsigned int hw_cpuid = get_hard_smp_processor_id(cpu);
 120        struct device_node *cpu_node = NULL;
 121        const unsigned int *interrupt_server, *reg;
 122        int len;
 123
 124        while ((cpu_node = of_find_node_by_type(cpu_node, "cpu")) != NULL) {
 125                /* Try interrupt server first */
 126                interrupt_server = of_get_property(cpu_node,
 127                                        "ibm,ppc-interrupt-server#s", &len);
 128
 129                len = len / sizeof(u32);
 130
 131                if (interrupt_server && (len > 0)) {
 132                        while (len--) {
 133                                if (interrupt_server[len] == hw_cpuid)
 134                                        return cpu_node;
 135                        }
 136                } else {
 137                        reg = of_get_property(cpu_node, "reg", &len);
 138                        if (reg && (len > 0) && (reg[0] == hw_cpuid))
 139                                return cpu_node;
 140                }
 141        }
 142
 143        return NULL;
 144}
 145
 146/* must hold reference to node during call */
 147static const int *of_get_associativity(struct device_node *dev)
 148{
 149        return of_get_property(dev, "ibm,associativity", NULL);
 150}
 151
 152/* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
 153 * info is found.
 154 */
 155static int of_node_to_nid_single(struct device_node *device)
 156{
 157        int nid = -1;
 158        const unsigned int *tmp;
 159
 160        if (min_common_depth == -1)
 161                goto out;
 162
 163        tmp = of_get_associativity(device);
 164        if (!tmp)
 165                goto out;
 166
 167        if (tmp[0] >= min_common_depth)
 168                nid = tmp[min_common_depth];
 169
 170        /* POWER4 LPAR uses 0xffff as invalid node */
 171        if (nid == 0xffff || nid >= MAX_NUMNODES)
 172                nid = -1;
 173out:
 174        return nid;
 175}
 176
 177/* Walk the device tree upwards, looking for an associativity id */
 178int of_node_to_nid(struct device_node *device)
 179{
 180        struct device_node *tmp;
 181        int nid = -1;
 182
 183        of_node_get(device);
 184        while (device) {
 185                nid = of_node_to_nid_single(device);
 186                if (nid != -1)
 187                        break;
 188
 189                tmp = device;
 190                device = of_get_parent(tmp);
 191                of_node_put(tmp);
 192        }
 193        of_node_put(device);
 194
 195        return nid;
 196}
 197EXPORT_SYMBOL_GPL(of_node_to_nid);
 198
 199/*
 200 * In theory, the "ibm,associativity" property may contain multiple
 201 * associativity lists because a resource may be multiply connected
 202 * into the machine.  This resource then has different associativity
 203 * characteristics relative to its multiple connections.  We ignore
 204 * this for now.  We also assume that all cpu and memory sets have
 205 * their distances represented at a common level.  This won't be
 206 * true for hierarchical NUMA.
 207 *
 208 * In any case the ibm,associativity-reference-points should give
 209 * the correct depth for a normal NUMA system.
 210 *
 211 * - Dave Hansen <haveblue@us.ibm.com>
 212 */
 213static int __init find_min_common_depth(void)
 214{
 215        int depth;
 216        const unsigned int *ref_points;
 217        struct device_node *rtas_root;
 218        unsigned int len;
 219
 220        rtas_root = of_find_node_by_path("/rtas");
 221
 222        if (!rtas_root)
 223                return -1;
 224
 225        /*
 226         * this property is 2 32-bit integers, each representing a level of
 227         * depth in the associativity nodes.  The first is for an SMP
 228         * configuration (should be all 0's) and the second is for a normal
 229         * NUMA configuration.
 230         */
 231        ref_points = of_get_property(rtas_root,
 232                        "ibm,associativity-reference-points", &len);
 233
 234        if ((len >= 1) && ref_points) {
 235                depth = ref_points[1];
 236        } else {
 237                dbg("NUMA: ibm,associativity-reference-points not found.\n");
 238                depth = -1;
 239        }
 240        of_node_put(rtas_root);
 241
 242        return depth;
 243}
 244
 245static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
 246{
 247        struct device_node *memory = NULL;
 248
 249        memory = of_find_node_by_type(memory, "memory");
 250        if (!memory)
 251                panic("numa.c: No memory nodes found!");
 252
 253        *n_addr_cells = of_n_addr_cells(memory);
 254        *n_size_cells = of_n_size_cells(memory);
 255        of_node_put(memory);
 256}
 257
 258static unsigned long __devinit read_n_cells(int n, const unsigned int **buf)
 259{
 260        unsigned long result = 0;
 261
 262        while (n--) {
 263                result = (result << 32) | **buf;
 264                (*buf)++;
 265        }
 266        return result;
 267}
 268
 269/*
 270 * Figure out to which domain a cpu belongs and stick it there.
 271 * Return the id of the domain used.
 272 */
 273static int __cpuinit numa_setup_cpu(unsigned long lcpu)
 274{
 275        int nid = 0;
 276        struct device_node *cpu = find_cpu_node(lcpu);
 277
 278        if (!cpu) {
 279                WARN_ON(1);
 280                goto out;
 281        }
 282
 283        nid = of_node_to_nid_single(cpu);
 284
 285        if (nid < 0 || !node_online(nid))
 286                nid = any_online_node(NODE_MASK_ALL);
 287out:
 288        map_cpu_to_node(lcpu, nid);
 289
 290        of_node_put(cpu);
 291
 292        return nid;
 293}
 294
 295static int __cpuinit cpu_numa_callback(struct notifier_block *nfb,
 296                             unsigned long action,
 297                             void *hcpu)
 298{
 299        unsigned long lcpu = (unsigned long)hcpu;
 300        int ret = NOTIFY_DONE;
 301
 302        switch (action) {
 303        case CPU_UP_PREPARE:
 304        case CPU_UP_PREPARE_FROZEN:
 305                numa_setup_cpu(lcpu);
 306                ret = NOTIFY_OK;
 307                break;
 308#ifdef CONFIG_HOTPLUG_CPU
 309        case CPU_DEAD:
 310        case CPU_DEAD_FROZEN:
 311        case CPU_UP_CANCELED:
 312        case CPU_UP_CANCELED_FROZEN:
 313                unmap_cpu_from_node(lcpu);
 314                break;
 315                ret = NOTIFY_OK;
 316#endif
 317        }
 318        return ret;
 319}
 320
 321/*
 322 * Check and possibly modify a memory region to enforce the memory limit.
 323 *
 324 * Returns the size the region should have to enforce the memory limit.
 325 * This will either be the original value of size, a truncated value,
 326 * or zero. If the returned value of size is 0 the region should be
 327 * discarded as it lies wholy above the memory limit.
 328 */
 329static unsigned long __init numa_enforce_memory_limit(unsigned long start,
 330                                                      unsigned long size)
 331{
 332        /*
 333         * We use lmb_end_of_DRAM() in here instead of memory_limit because
 334         * we've already adjusted it for the limit and it takes care of
 335         * having memory holes below the limit.
 336         */
 337
 338        if (! memory_limit)
 339                return size;
 340
 341        if (start + size <= lmb_end_of_DRAM())
 342                return size;
 343
 344        if (start >= lmb_end_of_DRAM())
 345                return 0;
 346
 347        return lmb_end_of_DRAM() - start;
 348}
 349
 350/*
 351 * Extract NUMA information from the ibm,dynamic-reconfiguration-memory
 352 * node.  This assumes n_mem_{addr,size}_cells have been set.
 353 */
 354static void __init parse_drconf_memory(struct device_node *memory)
 355{
 356        const unsigned int *lm, *dm, *aa;
 357        unsigned int ls, ld, la;
 358        unsigned int n, aam, aalen;
 359        unsigned long lmb_size, size, start;
 360        int nid, default_nid = 0;
 361        unsigned int ai, flags;
 362
 363        lm = of_get_property(memory, "ibm,lmb-size", &ls);
 364        dm = of_get_property(memory, "ibm,dynamic-memory", &ld);
 365        aa = of_get_property(memory, "ibm,associativity-lookup-arrays", &la);
 366        if (!lm || !dm || !aa ||
 367            ls < sizeof(unsigned int) || ld < sizeof(unsigned int) ||
 368            la < 2 * sizeof(unsigned int))
 369                return;
 370
 371        lmb_size = read_n_cells(n_mem_size_cells, &lm);
 372        n = *dm++;              /* number of LMBs */
 373        aam = *aa++;            /* number of associativity lists */
 374        aalen = *aa++;          /* length of each associativity list */
 375        if (ld < (n * (n_mem_addr_cells + 4) + 1) * sizeof(unsigned int) ||
 376            la < (aam * aalen + 2) * sizeof(unsigned int))
 377                return;
 378
 379        for (; n != 0; --n) {
 380                start = read_n_cells(n_mem_addr_cells, &dm);
 381                ai = dm[2];
 382                flags = dm[3];
 383                dm += 4;
 384                /* 0x80 == reserved, 0x8 = assigned to us */
 385                if ((flags & 0x80) || !(flags & 0x8))
 386                        continue;
 387                nid = default_nid;
 388                /* flags & 0x40 means associativity index is invalid */
 389                if (min_common_depth > 0 && min_common_depth <= aalen &&
 390                    (flags & 0x40) == 0 && ai < aam) {
 391                        /* this is like of_node_to_nid_single */
 392                        nid = aa[ai * aalen + min_common_depth - 1];
 393                        if (nid == 0xffff || nid >= MAX_NUMNODES)
 394                                nid = default_nid;
 395                }
 396
 397                fake_numa_create_new_node(((start + lmb_size) >> PAGE_SHIFT),
 398                                                &nid);
 399                node_set_online(nid);
 400
 401                size = numa_enforce_memory_limit(start, lmb_size);
 402                if (!size)
 403                        continue;
 404
 405                add_active_range(nid, start >> PAGE_SHIFT,
 406                                 (start >> PAGE_SHIFT) + (size >> PAGE_SHIFT));
 407        }
 408}
 409
 410static int __init parse_numa_properties(void)
 411{
 412        struct device_node *cpu = NULL;
 413        struct device_node *memory = NULL;
 414        int default_nid = 0;
 415        unsigned long i;
 416
 417        if (numa_enabled == 0) {
 418                printk(KERN_WARNING "NUMA disabled by user\n");
 419                return -1;
 420        }
 421
 422        min_common_depth = find_min_common_depth();
 423
 424        if (min_common_depth < 0)
 425                return min_common_depth;
 426
 427        dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
 428
 429        /*
 430         * Even though we connect cpus to numa domains later in SMP
 431         * init, we need to know the node ids now. This is because
 432         * each node to be onlined must have NODE_DATA etc backing it.
 433         */
 434        for_each_present_cpu(i) {
 435                int nid;
 436
 437                cpu = find_cpu_node(i);
 438                BUG_ON(!cpu);
 439                nid = of_node_to_nid_single(cpu);
 440                of_node_put(cpu);
 441
 442                /*
 443                 * Don't fall back to default_nid yet -- we will plug
 444                 * cpus into nodes once the memory scan has discovered
 445                 * the topology.
 446                 */
 447                if (nid < 0)
 448                        continue;
 449                node_set_online(nid);
 450        }
 451
 452        get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);
 453        memory = NULL;
 454        while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
 455                unsigned long start;
 456                unsigned long size;
 457                int nid;
 458                int ranges;
 459                const unsigned int *memcell_buf;
 460                unsigned int len;
 461
 462                memcell_buf = of_get_property(memory,
 463                        "linux,usable-memory", &len);
 464                if (!memcell_buf || len <= 0)
 465                        memcell_buf = of_get_property(memory, "reg", &len);
 466                if (!memcell_buf || len <= 0)
 467                        continue;
 468
 469                /* ranges in cell */
 470                ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
 471new_range:
 472                /* these are order-sensitive, and modify the buffer pointer */
 473                start = read_n_cells(n_mem_addr_cells, &memcell_buf);
 474                size = read_n_cells(n_mem_size_cells, &memcell_buf);
 475
 476                /*
 477                 * Assumption: either all memory nodes or none will
 478                 * have associativity properties.  If none, then
 479                 * everything goes to default_nid.
 480                 */
 481                nid = of_node_to_nid_single(memory);
 482                if (nid < 0)
 483                        nid = default_nid;
 484
 485                fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
 486                node_set_online(nid);
 487
 488                if (!(size = numa_enforce_memory_limit(start, size))) {
 489                        if (--ranges)
 490                                goto new_range;
 491                        else
 492                                continue;
 493                }
 494
 495                add_active_range(nid, start >> PAGE_SHIFT,
 496                                (start >> PAGE_SHIFT) + (size >> PAGE_SHIFT));
 497
 498                if (--ranges)
 499                        goto new_range;
 500        }
 501
 502        /*
 503         * Now do the same thing for each LMB listed in the ibm,dynamic-memory
 504         * property in the ibm,dynamic-reconfiguration-memory node.
 505         */
 506        memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
 507        if (memory)
 508                parse_drconf_memory(memory);
 509
 510        return 0;
 511}
 512
 513static void __init setup_nonnuma(void)
 514{
 515        unsigned long top_of_ram = lmb_end_of_DRAM();
 516        unsigned long total_ram = lmb_phys_mem_size();
 517        unsigned long start_pfn, end_pfn;
 518        unsigned int i, nid = 0;
 519
 520        printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
 521               top_of_ram, total_ram);
 522        printk(KERN_DEBUG "Memory hole size: %ldMB\n",
 523               (top_of_ram - total_ram) >> 20);
 524
 525        for (i = 0; i < lmb.memory.cnt; ++i) {
 526                start_pfn = lmb.memory.region[i].base >> PAGE_SHIFT;
 527                end_pfn = start_pfn + lmb_size_pages(&lmb.memory, i);
 528
 529                fake_numa_create_new_node(end_pfn, &nid);
 530                add_active_range(nid, start_pfn, end_pfn);
 531                node_set_online(nid);
 532        }
 533}
 534
 535void __init dump_numa_cpu_topology(void)
 536{
 537        unsigned int node;
 538        unsigned int cpu, count;
 539
 540        if (min_common_depth == -1 || !numa_enabled)
 541                return;
 542
 543        for_each_online_node(node) {
 544                printk(KERN_DEBUG "Node %d CPUs:", node);
 545
 546                count = 0;
 547                /*
 548                 * If we used a CPU iterator here we would miss printing
 549                 * the holes in the cpumap.
 550                 */
 551                for (cpu = 0; cpu < NR_CPUS; cpu++) {
 552                        if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
 553                                if (count == 0)
 554                                        printk(" %u", cpu);
 555                                ++count;
 556                        } else {
 557                                if (count > 1)
 558                                        printk("-%u", cpu - 1);
 559                                count = 0;
 560                        }
 561                }
 562
 563                if (count > 1)
 564                        printk("-%u", NR_CPUS - 1);
 565                printk("\n");
 566        }
 567}
 568
 569static void __init dump_numa_memory_topology(void)
 570{
 571        unsigned int node;
 572        unsigned int count;
 573
 574        if (min_common_depth == -1 || !numa_enabled)
 575                return;
 576
 577        for_each_online_node(node) {
 578                unsigned long i;
 579
 580                printk(KERN_DEBUG "Node %d Memory:", node);
 581
 582                count = 0;
 583
 584                for (i = 0; i < lmb_end_of_DRAM();
 585                     i += (1 << SECTION_SIZE_BITS)) {
 586                        if (early_pfn_to_nid(i >> PAGE_SHIFT) == node) {
 587                                if (count == 0)
 588                                        printk(" 0x%lx", i);
 589                                ++count;
 590                        } else {
 591                                if (count > 0)
 592                                        printk("-0x%lx", i);
 593                                count = 0;
 594                        }
 595                }
 596
 597                if (count > 0)
 598                        printk("-0x%lx", i);
 599                printk("\n");
 600        }
 601}
 602
 603/*
 604 * Allocate some memory, satisfying the lmb or bootmem allocator where
 605 * required. nid is the preferred node and end is the physical address of
 606 * the highest address in the node.
 607 *
 608 * Returns the physical address of the memory.
 609 */
 610static void __init *careful_allocation(int nid, unsigned long size,
 611                                       unsigned long align,
 612                                       unsigned long end_pfn)
 613{
 614        int new_nid;
 615        unsigned long ret = __lmb_alloc_base(size, align, end_pfn << PAGE_SHIFT);
 616
 617        /* retry over all memory */
 618        if (!ret)
 619                ret = __lmb_alloc_base(size, align, lmb_end_of_DRAM());
 620
 621        if (!ret)
 622                panic("numa.c: cannot allocate %lu bytes on node %d",
 623                      size, nid);
 624
 625        /*
 626         * If the memory came from a previously allocated node, we must
 627         * retry with the bootmem allocator.
 628         */
 629        new_nid = early_pfn_to_nid(ret >> PAGE_SHIFT);
 630        if (new_nid < nid) {
 631                ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(new_nid),
 632                                size, align, 0);
 633
 634                if (!ret)
 635                        panic("numa.c: cannot allocate %lu bytes on node %d",
 636                              size, new_nid);
 637
 638                ret = __pa(ret);
 639
 640                dbg("alloc_bootmem %lx %lx\n", ret, size);
 641        }
 642
 643        return (void *)ret;
 644}
 645
 646static struct notifier_block __cpuinitdata ppc64_numa_nb = {
 647        .notifier_call = cpu_numa_callback,
 648        .priority = 1 /* Must run before sched domains notifier. */
 649};
 650
 651void __init do_init_bootmem(void)
 652{
 653        int nid;
 654        unsigned int i;
 655
 656        min_low_pfn = 0;
 657        max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
 658        max_pfn = max_low_pfn;
 659
 660        if (parse_numa_properties())
 661                setup_nonnuma();
 662        else
 663                dump_numa_memory_topology();
 664
 665        register_cpu_notifier(&ppc64_numa_nb);
 666        cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
 667                          (void *)(unsigned long)boot_cpuid);
 668
 669        for_each_online_node(nid) {
 670                unsigned long start_pfn, end_pfn;
 671                unsigned long bootmem_paddr;
 672                unsigned long bootmap_pages;
 673
 674                get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
 675
 676                /* Allocate the node structure node local if possible */
 677                NODE_DATA(nid) = careful_allocation(nid,
 678                                        sizeof(struct pglist_data),
 679                                        SMP_CACHE_BYTES, end_pfn);
 680                NODE_DATA(nid) = __va(NODE_DATA(nid));
 681                memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
 682
 683                dbg("node %d\n", nid);
 684                dbg("NODE_DATA() = %p\n", NODE_DATA(nid));
 685
 686                NODE_DATA(nid)->bdata = &plat_node_bdata[nid];
 687                NODE_DATA(nid)->node_start_pfn = start_pfn;
 688                NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
 689
 690                if (NODE_DATA(nid)->node_spanned_pages == 0)
 691                        continue;
 692
 693                dbg("start_paddr = %lx\n", start_pfn << PAGE_SHIFT);
 694                dbg("end_paddr = %lx\n", end_pfn << PAGE_SHIFT);
 695
 696                bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
 697                bootmem_paddr = (unsigned long)careful_allocation(nid,
 698                                        bootmap_pages << PAGE_SHIFT,
 699                                        PAGE_SIZE, end_pfn);
 700                memset(__va(bootmem_paddr), 0, bootmap_pages << PAGE_SHIFT);
 701
 702                dbg("bootmap_paddr = %lx\n", bootmem_paddr);
 703
 704                init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT,
 705                                  start_pfn, end_pfn);
 706
 707                free_bootmem_with_active_regions(nid, end_pfn);
 708
 709                /* Mark reserved regions on this node */
 710                for (i = 0; i < lmb.reserved.cnt; i++) {
 711                        unsigned long physbase = lmb.reserved.region[i].base;
 712                        unsigned long size = lmb.reserved.region[i].size;
 713                        unsigned long start_paddr = start_pfn << PAGE_SHIFT;
 714                        unsigned long end_paddr = end_pfn << PAGE_SHIFT;
 715
 716                        if (early_pfn_to_nid(physbase >> PAGE_SHIFT) != nid &&
 717                            early_pfn_to_nid((physbase+size-1) >> PAGE_SHIFT) != nid)
 718                                continue;
 719
 720                        if (physbase < end_paddr &&
 721                            (physbase+size) > start_paddr) {
 722                                /* overlaps */
 723                                if (physbase < start_paddr) {
 724                                        size -= start_paddr - physbase;
 725                                        physbase = start_paddr;
 726                                }
 727
 728                                if (size > end_paddr - physbase)
 729                                        size = end_paddr - physbase;
 730
 731                                dbg("reserve_bootmem %lx %lx\n", physbase,
 732                                    size);
 733                                reserve_bootmem_node(NODE_DATA(nid), physbase,
 734                                                     size, BOOTMEM_DEFAULT);
 735                        }
 736                }
 737
 738                sparse_memory_present_with_active_regions(nid);
 739        }
 740}
 741
 742void __init paging_init(void)
 743{
 744        unsigned long max_zone_pfns[MAX_NR_ZONES];
 745        memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
 746        max_zone_pfns[ZONE_DMA] = lmb_end_of_DRAM() >> PAGE_SHIFT;
 747        free_area_init_nodes(max_zone_pfns);
 748}
 749
 750static int __init early_numa(char *p)
 751{
 752        if (!p)
 753                return 0;
 754
 755        if (strstr(p, "off"))
 756                numa_enabled = 0;
 757
 758        if (strstr(p, "debug"))
 759                numa_debug = 1;
 760
 761        p = strstr(p, "fake=");
 762        if (p)
 763                cmdline = p + strlen("fake=");
 764
 765        return 0;
 766}
 767early_param("numa", early_numa);
 768
 769#ifdef CONFIG_MEMORY_HOTPLUG
 770/*
 771 * Find the node associated with a hot added memory section.  Section
 772 * corresponds to a SPARSEMEM section, not an LMB.  It is assumed that
 773 * sections are fully contained within a single LMB.
 774 */
 775int hot_add_scn_to_nid(unsigned long scn_addr)
 776{
 777        struct device_node *memory = NULL;
 778        nodemask_t nodes;
 779        int default_nid = any_online_node(NODE_MASK_ALL);
 780        int nid;
 781
 782        if (!numa_enabled || (min_common_depth < 0))
 783                return default_nid;
 784
 785        while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
 786                unsigned long start, size;
 787                int ranges;
 788                const unsigned int *memcell_buf;
 789                unsigned int len;
 790
 791                memcell_buf = of_get_property(memory, "reg", &len);
 792                if (!memcell_buf || len <= 0)
 793                        continue;
 794
 795                /* ranges in cell */
 796                ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
 797ha_new_range:
 798                start = read_n_cells(n_mem_addr_cells, &memcell_buf);
 799                size = read_n_cells(n_mem_size_cells, &memcell_buf);
 800                nid = of_node_to_nid_single(memory);
 801
 802                /* Domains not present at boot default to 0 */
 803                if (nid < 0 || !node_online(nid))
 804                        nid = default_nid;
 805
 806                if ((scn_addr >= start) && (scn_addr < (start + size))) {
 807                        of_node_put(memory);
 808                        goto got_nid;
 809                }
 810
 811                if (--ranges)           /* process all ranges in cell */
 812                        goto ha_new_range;
 813        }
 814        BUG();  /* section address should be found above */
 815        return 0;
 816
 817        /* Temporary code to ensure that returned node is not empty */
 818got_nid:
 819        nodes_setall(nodes);
 820        while (NODE_DATA(nid)->node_spanned_pages == 0) {
 821                node_clear(nid, nodes);
 822                nid = any_online_node(nodes);
 823        }
 824        return nid;
 825}
 826#endif /* CONFIG_MEMORY_HOTPLUG */
 827
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.