/* linux/arch/x86/mm/srat_64.c */
   1/*
   2 * ACPI 3.0 based NUMA setup
   3 * Copyright 2004 Andi Kleen, SuSE Labs.
   4 *
   5 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
   6 *
   7 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
   8 * Assumes all memory regions belonging to a single proximity domain
   9 * are in one chunk. Holes between them will be included in the node.
  10 */
  11
  12#include <linux/kernel.h>
  13#include <linux/acpi.h>
  14#include <linux/mmzone.h>
  15#include <linux/bitmap.h>
  16#include <linux/module.h>
  17#include <linux/topology.h>
  18#include <linux/bootmem.h>
  19#include <linux/mm.h>
  20#include <asm/proto.h>
  21#include <asm/numa.h>
  22#include <asm/e820.h>
  23#include <asm/genapic.h>
  24
  25int acpi_numa __initdata;
  26
  27static struct acpi_table_slit *acpi_slit;
  28
  29static nodemask_t nodes_parsed __initdata;
  30static struct bootnode nodes[MAX_NUMNODES] __initdata;
  31static struct bootnode nodes_add[MAX_NUMNODES];
  32static int found_add_area __initdata;
  33int hotadd_percent __initdata = 0;
  34
  35static int num_node_memblks __initdata;
  36static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
  37static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;
  38
  39/* Too small nodes confuse the VM badly. Usually they result
  40   from BIOS bugs. */
  41#define NODE_MIN_SIZE (4*1024*1024)
  42
  43static __init int setup_node(int pxm)
  44{
  45        return acpi_map_pxm_to_node(pxm);
  46}
  47
  48static __init int conflicting_memblks(unsigned long start, unsigned long end)
  49{
  50        int i;
  51        for (i = 0; i < num_node_memblks; i++) {
  52                struct bootnode *nd = &node_memblk_range[i];
  53                if (nd->start == nd->end)
  54                        continue;
  55                if (nd->end > start && nd->start < end)
  56                        return memblk_nodeid[i];
  57                if (nd->end == end && nd->start == start)
  58                        return memblk_nodeid[i];
  59        }
  60        return -1;
  61}
  62
  63static __init void cutoff_node(int i, unsigned long start, unsigned long end)
  64{
  65        struct bootnode *nd = &nodes[i];
  66
  67        if (found_add_area)
  68                return;
  69
  70        if (nd->start < start) {
  71                nd->start = start;
  72                if (nd->end < nd->start)
  73                        nd->start = nd->end;
  74        }
  75        if (nd->end > end) {
  76                nd->end = end;
  77                if (nd->start > nd->end)
  78                        nd->start = nd->end;
  79        }
  80}
  81
  82static __init void bad_srat(void)
  83{
  84        int i;
  85        printk(KERN_ERR "SRAT: SRAT not used.\n");
  86        acpi_numa = -1;
  87        found_add_area = 0;
  88        for (i = 0; i < MAX_LOCAL_APIC; i++)
  89                apicid_to_node[i] = NUMA_NO_NODE;
  90        for (i = 0; i < MAX_NUMNODES; i++)
  91                nodes_add[i].start = nodes[i].end = 0;
  92        remove_all_active_ranges();
  93}
  94
  95static __init inline int srat_disabled(void)
  96{
  97        return numa_off || acpi_numa < 0;
  98}
  99
 100/* Callback for SLIT parsing */
 101void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
 102{
 103        acpi_slit = slit;
 104}
 105
 106/* Callback for Proximity Domain -> LAPIC mapping */
 107void __init
 108acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
 109{
 110        int pxm, node;
 111        int apic_id;
 112
 113        if (srat_disabled())
 114                return;
 115        if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
 116                bad_srat();
 117                return;
 118        }
 119        if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
 120                return;
 121        pxm = pa->proximity_domain_lo;
 122        node = setup_node(pxm);
 123        if (node < 0) {
 124                printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
 125                bad_srat();
 126                return;
 127        }
 128
 129        if (is_uv_system())
 130                apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
 131        else
 132                apic_id = pa->apic_id;
 133        apicid_to_node[apic_id] = node;
 134        acpi_numa = 1;
 135        printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
 136               pxm, apic_id, node);
 137}
 138
 139static int update_end_of_memory(unsigned long end) {return -1;}
 140static int hotadd_enough_memory(struct bootnode *nd) {return 1;}
 141#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
 142static inline int save_add_info(void) {return 1;}
 143#else
 144static inline int save_add_info(void) {return 0;}
 145#endif
 146/*
 147 * Update nodes_add and decide if to include add are in the zone.
 148 * Both SPARSE and RESERVE need nodes_add information.
 149 * This code supports one contiguous hot add area per node.
 150 */
 151static int __init
 152reserve_hotadd(int node, unsigned long start, unsigned long end)
 153{
 154        unsigned long s_pfn = start >> PAGE_SHIFT;
 155        unsigned long e_pfn = end >> PAGE_SHIFT;
 156        int ret = 0, changed = 0;
 157        struct bootnode *nd = &nodes_add[node];
 158
 159        /* I had some trouble with strange memory hotadd regions breaking
 160           the boot. Be very strict here and reject anything unexpected.
 161           If you want working memory hotadd write correct SRATs.
 162
 163           The node size check is a basic sanity check to guard against
 164           mistakes */
 165        if ((signed long)(end - start) < NODE_MIN_SIZE) {
 166                printk(KERN_ERR "SRAT: Hotplug area too small\n");
 167                return -1;
 168        }
 169
 170        /* This check might be a bit too strict, but I'm keeping it for now. */
 171        if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
 172                printk(KERN_ERR
 173                        "SRAT: Hotplug area %lu -> %lu has existing memory\n",
 174                        s_pfn, e_pfn);
 175                return -1;
 176        }
 177
 178        if (!hotadd_enough_memory(&nodes_add[node]))  {
 179                printk(KERN_ERR "SRAT: Hotplug area too large\n");
 180                return -1;
 181        }
 182
 183        /* Looks good */
 184
 185        if (nd->start == nd->end) {
 186                nd->start = start;
 187                nd->end = end;
 188                changed = 1;
 189        } else {
 190                if (nd->start == end) {
 191                        nd->start = start;
 192                        changed = 1;
 193                }
 194                if (nd->end == start) {
 195                        nd->end = end;
 196                        changed = 1;
 197                }
 198                if (!changed)
 199                        printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
 200        }
 201
 202        ret = update_end_of_memory(nd->end);
 203
 204        if (changed)
 205                printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
 206        return ret;
 207}
 208
 209/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
 210void __init
 211acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 212{
 213        struct bootnode *nd, oldnode;
 214        unsigned long start, end;
 215        int node, pxm;
 216        int i;
 217
 218        if (srat_disabled())
 219                return;
 220        if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
 221                bad_srat();
 222                return;
 223        }
 224        if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
 225                return;
 226
 227        if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
 228                return;
 229        start = ma->base_address;
 230        end = start + ma->length;
 231        pxm = ma->proximity_domain;
 232        node = setup_node(pxm);
 233        if (node < 0) {
 234                printk(KERN_ERR "SRAT: Too many proximity domains.\n");
 235                bad_srat();
 236                return;
 237        }
 238        i = conflicting_memblks(start, end);
 239        if (i == node) {
 240                printk(KERN_WARNING
 241                "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
 242                        pxm, start, end, nodes[i].start, nodes[i].end);
 243        } else if (i >= 0) {
 244                printk(KERN_ERR
 245                       "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
 246                       pxm, start, end, node_to_pxm(i),
 247                        nodes[i].start, nodes[i].end);
 248                bad_srat();
 249                return;
 250        }
 251        nd = &nodes[node];
 252        oldnode = *nd;
 253        if (!node_test_and_set(node, nodes_parsed)) {
 254                nd->start = start;
 255                nd->end = end;
 256        } else {
 257                if (start < nd->start)
 258                        nd->start = start;
 259                if (nd->end < end)
 260                        nd->end = end;
 261        }
 262
 263        printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
 264               start, end);
 265        e820_register_active_regions(node, start >> PAGE_SHIFT,
 266                                     end >> PAGE_SHIFT);
 267        push_node_boundaries(node, nd->start >> PAGE_SHIFT,
 268                                                nd->end >> PAGE_SHIFT);
 269
 270        if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) &&
 271            (reserve_hotadd(node, start, end) < 0)) {
 272                /* Ignore hotadd region. Undo damage */
 273                printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
 274                *nd = oldnode;
 275                if ((nd->start | nd->end) == 0)
 276                        node_clear(node, nodes_parsed);
 277        }
 278
 279        node_memblk_range[num_node_memblks].start = start;
 280        node_memblk_range[num_node_memblks].end = end;
 281        memblk_nodeid[num_node_memblks] = node;
 282        num_node_memblks++;
 283}
 284
 285/* Sanity check to catch more bad SRATs (they are amazingly common).
 286   Make sure the PXMs cover all memory. */
 287static int __init nodes_cover_memory(const struct bootnode *nodes)
 288{
 289        int i;
 290        unsigned long pxmram, e820ram;
 291
 292        pxmram = 0;
 293        for_each_node_mask(i, nodes_parsed) {
 294                unsigned long s = nodes[i].start >> PAGE_SHIFT;
 295                unsigned long e = nodes[i].end >> PAGE_SHIFT;
 296                pxmram += e - s;
 297                pxmram -= absent_pages_in_range(s, e);
 298                if ((long)pxmram < 0)
 299                        pxmram = 0;
 300        }
 301
 302        e820ram = end_pfn - absent_pages_in_range(0, end_pfn);
 303        /* We seem to lose 3 pages somewhere. Allow a bit of slack. */
 304        if ((long)(e820ram - pxmram) >= 1*1024*1024) {
 305                printk(KERN_ERR
 306        "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
 307                        (pxmram << PAGE_SHIFT) >> 20,
 308                        (e820ram << PAGE_SHIFT) >> 20);
 309                return 0;
 310        }
 311        return 1;
 312}
 313
 314static void __init unparse_node(int node)
 315{
 316        int i;
 317        node_clear(node, nodes_parsed);
 318        for (i = 0; i < MAX_LOCAL_APIC; i++) {
 319                if (apicid_to_node[i] == node)
 320                        apicid_to_node[i] = NUMA_NO_NODE;
 321        }
 322}
 323
 324void __init acpi_numa_arch_fixup(void) {}
 325
 326/* Use the information discovered above to actually set up the nodes. */
 327int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 328{
 329        int i;
 330
 331        if (acpi_numa <= 0)
 332                return -1;
 333
 334        /* First clean up the node list */
 335        for (i = 0; i < MAX_NUMNODES; i++) {
 336                cutoff_node(i, start, end);
 337                /*
 338                 * don't confuse VM with a node that doesn't have the
 339                 * minimum memory.
 340                 */
 341                if (nodes[i].end &&
 342                        (nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
 343                        unparse_node(i);
 344                        node_set_offline(i);
 345                }
 346        }
 347
 348        if (!nodes_cover_memory(nodes)) {
 349                bad_srat();
 350                return -1;
 351        }
 352
 353        memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
 354                                           memblk_nodeid);
 355        if (memnode_shift < 0) {
 356                printk(KERN_ERR
 357                     "SRAT: No NUMA node hash function found. Contact maintainer\n");
 358                bad_srat();
 359                return -1;
 360        }
 361
 362        node_possible_map = nodes_parsed;
 363
 364        /* Finally register nodes */
 365        for_each_node_mask(i, node_possible_map)
 366                setup_node_bootmem(i, nodes[i].start, nodes[i].end);
 367        /* Try again in case setup_node_bootmem missed one due
 368           to missing bootmem */
 369        for_each_node_mask(i, node_possible_map)
 370                if (!node_online(i))
 371                        setup_node_bootmem(i, nodes[i].start, nodes[i].end);
 372
 373        for (i = 0; i < NR_CPUS; i++) {
 374                int node = early_cpu_to_node(i);
 375
 376                if (node == NUMA_NO_NODE)
 377                        continue;
 378                if (!node_isset(node, node_possible_map))
 379                        numa_set_node(i, NUMA_NO_NODE);
 380        }
 381        numa_init_array();
 382        return 0;
 383}
 384
 385#ifdef CONFIG_NUMA_EMU
 386static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
 387        [0 ... MAX_NUMNODES-1] = PXM_INVAL
 388};
 389static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
 390        [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
 391};
 392static int __init find_node_by_addr(unsigned long addr)
 393{
 394        int ret = NUMA_NO_NODE;
 395        int i;
 396
 397        for_each_node_mask(i, nodes_parsed) {
 398                /*
 399                 * Find the real node that this emulated node appears on.  For
 400                 * the sake of simplicity, we only use a real node's starting
 401                 * address to determine which emulated node it appears on.
 402                 */
 403                if (addr >= nodes[i].start && addr < nodes[i].end) {
 404                        ret = i;
 405                        break;
 406                }
 407        }
 408        return ret;
 409}
 410
 411/*
 412 * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID
 413 * mappings that respect the real ACPI topology but reflect our emulated
 414 * environment.  For each emulated node, we find which real node it appears on
 415 * and create PXM to NID mappings for those fake nodes which mirror that
 416 * locality.  SLIT will now represent the correct distances between emulated
 417 * nodes as a result of the real topology.
 418 */
 419void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
 420{
 421        int i, j;
 422
 423        printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
 424                         "topology.\n");
 425        for (i = 0; i < num_nodes; i++) {
 426                int nid, pxm;
 427
 428                nid = find_node_by_addr(fake_nodes[i].start);
 429                if (nid == NUMA_NO_NODE)
 430                        continue;
 431                pxm = node_to_pxm(nid);
 432                if (pxm == PXM_INVAL)
 433                        continue;
 434                fake_node_to_pxm_map[i] = pxm;
 435                /*
 436                 * For each apicid_to_node mapping that exists for this real
 437                 * node, it must now point to the fake node ID.
 438                 */
 439                for (j = 0; j < MAX_LOCAL_APIC; j++)
 440                        if (apicid_to_node[j] == nid)
 441                                fake_apicid_to_node[j] = i;
 442        }
 443        for (i = 0; i < num_nodes; i++)
 444                __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
 445        memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
 446
 447        nodes_clear(nodes_parsed);
 448        for (i = 0; i < num_nodes; i++)
 449                if (fake_nodes[i].start != fake_nodes[i].end)
 450                        node_set(i, nodes_parsed);
 451        WARN_ON(!nodes_cover_memory(fake_nodes));
 452}
 453
 454static int null_slit_node_compare(int a, int b)
 455{
 456        return node_to_pxm(a) == node_to_pxm(b);
 457}
 458#else
 459static int null_slit_node_compare(int a, int b)
 460{
 461        return a == b;
 462}
 463#endif /* CONFIG_NUMA_EMU */
 464
 465void __init srat_reserve_add_area(int nodeid)
 466{
 467        if (found_add_area && nodes_add[nodeid].end) {
 468                u64 total_mb;
 469
 470                printk(KERN_INFO "SRAT: Reserving hot-add memory space "
 471                                "for node %d at %Lx-%Lx\n",
 472                        nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
 473                total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
 474                                        >> PAGE_SHIFT;
 475                total_mb *= sizeof(struct page);
 476                total_mb >>= 20;
 477                printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
 478                                "pre-allocated memory.\n", (unsigned long long)total_mb);
 479                reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
 480                               nodes_add[nodeid].end - nodes_add[nodeid].start,
 481                               BOOTMEM_DEFAULT);
 482        }
 483}
 484
 485int __node_distance(int a, int b)
 486{
 487        int index;
 488
 489        if (!acpi_slit)
 490                return null_slit_node_compare(a, b) ? LOCAL_DISTANCE :
 491                                                      REMOTE_DISTANCE;
 492        index = acpi_slit->locality_count * node_to_pxm(a);
 493        return acpi_slit->entry[index + node_to_pxm(b)];
 494}
 495
 496EXPORT_SYMBOL(__node_distance);
 497
 498int memory_add_physaddr_to_nid(u64 start)
 499{
 500        int i, ret = 0;
 501
 502        for_each_node(i)
 503                if (nodes_add[i].start <= start && nodes_add[i].end > start)
 504                        ret = i;
 505
 506        return ret;
 507}
 508EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 509
 510
/* lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995. */