linux/arch/x86/mm/srat_64.c
/*
 * ACPI 3.0 based NUMA setup
 * Copyright 2004 Andi Kleen, SuSE Labs.
 *
 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
 *
 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
 * Assumes all memory regions belonging to a single proximity domain
 * are in one chunk. Holes between them will be included in the node.
 */

#include <linux/kernel.h>
#include <linux/acpi.h>
#include <linux/mmzone.h>
#include <linux/bitmap.h>
#include <linux/module.h>
#include <linux/topology.h>
#include <linux/bootmem.h>
#include <linux/mm.h>
#include <asm/proto.h>
#include <asm/numa.h>
#include <asm/e820.h>
#include <asm/genapic.h>

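/* 1 once a usable SRAT entry has been parsed, -1 after the table was rejected */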
int acpi_numa __initdata;

static struct acpi_table_slit *acpi_slit;

static nodemask_t nodes_parsed __initdata;
static struct bootnode nodes[MAX_NUMNODES] __initdata;
static struct bootnode nodes_add[MAX_NUMNODES];
static int found_add_area __initdata;
int hotadd_percent __initdata = 0;

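/*
 * Raw SRAT memory affinity entries, preserved so acpi_scan_nodes() can
 * feed them to compute_hash_shift() for the pfn -> node lookup.
 */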
static int num_node_memblks __initdata;
static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;

/* Too small nodes confuse the VM badly. Usually they result
   from BIOS bugs. */
#define NODE_MIN_SIZE (4*1024*1024)

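/*
 * Map an ACPI proximity domain to a logical node id, allocating a new
 * id the first time a PXM is seen.
 */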
static __init int setup_node(int pxm)
{
        return acpi_map_pxm_to_node(pxm);
}

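/*
 * Return the node id of an already-parsed memblk that overlaps the
 * range [start, end), or -1 if there is no conflict.
 */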
static __init int conflicting_memblks(unsigned long start, unsigned long end)
{
        int i;
        for (i = 0; i < num_node_memblks; i++) {
                struct bootnode *nd = &node_memblk_range[i];
                if (nd->start == nd->end)
                        continue;
                if (nd->end > start && nd->start < end)
                        return memblk_nodeid[i];
                if (nd->end == end && nd->start == start)
                        return memblk_nodeid[i];
        }
        return -1;
}

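/*
 * Clamp node i's range to [start, end), collapsing it to an empty
 * range if it falls entirely outside. Skipped once a hotadd area has
 * been found, since hotplug ranges may extend past current memory.
 */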
static __init void cutoff_node(int i, unsigned long start, unsigned long end)
{
        struct bootnode *nd = &nodes[i];

        if (found_add_area)
                return;

        if (nd->start < start) {
                nd->start = start;
                if (nd->end < nd->start)
                        nd->start = nd->end;
        }
        if (nd->end > end) {
                nd->end = end;
                if (nd->start > nd->end)
                        nd->start = nd->end;
        }
}

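/* Reject the SRAT: throw away everything parsed from it so far. */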
static __init void bad_srat(void)
{
        int i;
        printk(KERN_ERR "SRAT: SRAT not used.\n");
        acpi_numa = -1;
        found_add_area = 0;
        for (i = 0; i < MAX_LOCAL_APIC; i++)
                apicid_to_node[i] = NUMA_NO_NODE;
        for (i = 0; i < MAX_NUMNODES; i++)
                nodes_add[i].start = nodes[i].end = 0;
        remove_all_active_ranges();
}

static __init inline int srat_disabled(void)
{
        return numa_off || acpi_numa < 0;
}

/*
 * Callback for SLIT parsing. Copy the table into permanently reserved
 * memory, since the ACPI-provided mapping is not kept after early boot.
 */
void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
{
        unsigned length;
        unsigned long phys;

        length = slit->header.length;
        phys = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, length,
                 PAGE_SIZE);

        if (phys == -1L)
                panic("Cannot save SLIT!\n");

        acpi_slit = __va(phys);
        memcpy(acpi_slit, slit, length);
        reserve_early(phys, phys + length, "ACPI SLIT");
}

/* Callback for Proximity Domain -> LAPIC mapping */
void __init
acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
{
        int pxm, node;
        int apic_id;

        if (srat_disabled())
                return;
        if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
                bad_srat();
                return;
        }
        if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
                return;
        pxm = pa->proximity_domain_lo;
        node = setup_node(pxm);
        if (node < 0) {
                printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
                bad_srat();
                return;
        }

        if (get_uv_system_type() >= UV_X2APIC)
                apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
        else
                apic_id = pa->apic_id;
        apicid_to_node[apic_id] = node;
        acpi_numa = 1;
        printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
               pxm, apic_id, node);
}

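/*
 * Hotadd bookkeeping stubs: update_end_of_memory() always reports
 * failure, so reserve_hotadd() below records the range in nodes_add
 * but its caller still drops it from the regular node span.
 */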
static int update_end_of_memory(unsigned long end) {return -1;}
static int hotadd_enough_memory(struct bootnode *nd) {return 1;}
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
static inline int save_add_info(void) {return 1;}
#else
static inline int save_add_info(void) {return 0;}
#endif
/*
 * Update nodes_add and decide whether to include the hotadd area in
 * the zone. Both SPARSE and RESERVE need nodes_add information.
 * This code supports one contiguous hot add area per node.
 */
static int __init
reserve_hotadd(int node, unsigned long start, unsigned long end)
{
        unsigned long s_pfn = start >> PAGE_SHIFT;
        unsigned long e_pfn = end >> PAGE_SHIFT;
        int ret = 0, changed = 0;
        struct bootnode *nd = &nodes_add[node];

        /* I had some trouble with strange memory hotadd regions breaking
           the boot. Be very strict here and reject anything unexpected.
           If you want working memory hotadd, write correct SRATs.

           The node size check is a basic sanity check to guard against
           mistakes */
        if ((signed long)(end - start) < NODE_MIN_SIZE) {
                printk(KERN_ERR "SRAT: Hotplug area too small\n");
                return -1;
        }

        /* This check might be a bit too strict, but I'm keeping it for now. */
        if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
                printk(KERN_ERR
                        "SRAT: Hotplug area %lu -> %lu has existing memory\n",
                        s_pfn, e_pfn);
                return -1;
        }

        if (!hotadd_enough_memory(&nodes_add[node])) {
                printk(KERN_ERR "SRAT: Hotplug area too large\n");
                return -1;
        }

        /* Looks good */

        if (nd->start == nd->end) {
                nd->start = start;
                nd->end = end;
                changed = 1;
        } else {
                if (nd->start == end) {
                        nd->start = start;
                        changed = 1;
                }
                if (nd->end == start) {
                        nd->end = end;
                        changed = 1;
                }
                if (!changed)
                        printk(KERN_ERR "SRAT: Hotplug zone not contiguous. Partly ignored\n");
        }

        ret = update_end_of_memory(nd->end);

        if (changed)
                printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
        return ret;
}

/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
void __init
acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
{
        struct bootnode *nd, oldnode;
        unsigned long start, end;
        int node, pxm;
        int i;

        if (srat_disabled())
                return;
        if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
                bad_srat();
                return;
        }
        if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
                return;

        if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
                return;
        start = ma->base_address;
        end = start + ma->length;
        pxm = ma->proximity_domain;
        node = setup_node(pxm);
        if (node < 0) {
                printk(KERN_ERR "SRAT: Too many proximity domains.\n");
                bad_srat();
                return;
        }
        i = conflicting_memblks(start, end);
        if (i == node) {
                printk(KERN_WARNING
                "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
                        pxm, start, end, nodes[i].start, nodes[i].end);
        } else if (i >= 0) {
                printk(KERN_ERR
                       "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
                       pxm, start, end, node_to_pxm(i),
                       nodes[i].start, nodes[i].end);
                bad_srat();
                return;
        }
        nd = &nodes[node];
        oldnode = *nd;
        if (!node_test_and_set(node, nodes_parsed)) {
                nd->start = start;
                nd->end = end;
        } else {
                if (start < nd->start)
                        nd->start = start;
                if (nd->end < end)
                        nd->end = end;
        }

        printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
               start, end);
        e820_register_active_regions(node, start >> PAGE_SHIFT,
                                     end >> PAGE_SHIFT);
        push_node_boundaries(node, nd->start >> PAGE_SHIFT,
                             nd->end >> PAGE_SHIFT);

        if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) &&
            (reserve_hotadd(node, start, end) < 0)) {
                /* Ignore hotadd region. Undo damage */
                printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
                *nd = oldnode;
                if ((nd->start | nd->end) == 0)
                        node_clear(node, nodes_parsed);
        }

        node_memblk_range[num_node_memblks].start = start;
        node_memblk_range[num_node_memblks].end = end;
        memblk_nodeid[num_node_memblks] = node;
        num_node_memblks++;
}

/* Sanity check to catch more bad SRATs (they are amazingly common).
   Make sure the PXMs cover all memory. */
static int __init nodes_cover_memory(const struct bootnode *nodes)
{
        int i;
        unsigned long pxmram, e820ram;

        pxmram = 0;
        for_each_node_mask(i, nodes_parsed) {
                unsigned long s = nodes[i].start >> PAGE_SHIFT;
                unsigned long e = nodes[i].end >> PAGE_SHIFT;
                pxmram += e - s;
                pxmram -= absent_pages_in_range(s, e);
                if ((long)pxmram < 0)
                        pxmram = 0;
        }

        e820ram = max_pfn - absent_pages_in_range(0, max_pfn);
        /* We seem to lose 3 pages somewhere. Allow a bit of slack. */
        if ((long)(e820ram - pxmram) >= 1*1024*1024) {
                printk(KERN_ERR
        "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
                        (pxmram << PAGE_SHIFT) >> 20,
                        (e820ram << PAGE_SHIFT) >> 20);
                return 0;
        }
        return 1;
}

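/*
 * Forget everything learned about a node: drop it from nodes_parsed
 * and detach any APIC ids that were mapped to it.
 */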
static void __init unparse_node(int node)
{
        int i;
        node_clear(node, nodes_parsed);
        for (i = 0; i < MAX_LOCAL_APIC; i++) {
                if (apicid_to_node[i] == node)
                        apicid_to_node[i] = NUMA_NO_NODE;
        }
}

void __init acpi_numa_arch_fixup(void) {}

/* Use the information discovered above to actually set up the nodes. */
int __init acpi_scan_nodes(unsigned long start, unsigned long end)
{
        int i;

        if (acpi_numa <= 0)
                return -1;

        /* First clean up the node list */
        for (i = 0; i < MAX_NUMNODES; i++) {
                cutoff_node(i, start, end);
                /*
                 * don't confuse VM with a node that doesn't have the
                 * minimum memory.
                 */
                if (nodes[i].end &&
                    (nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
                        unparse_node(i);
                        node_set_offline(i);
                }
        }

        if (!nodes_cover_memory(nodes)) {
                bad_srat();
                return -1;
        }

        memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
                                           memblk_nodeid);
        if (memnode_shift < 0) {
                printk(KERN_ERR
                     "SRAT: No NUMA node hash function found. Contact maintainer\n");
                bad_srat();
                return -1;
        }

        node_possible_map = nodes_parsed;

        /* Finally register nodes */
        for_each_node_mask(i, node_possible_map)
                setup_node_bootmem(i, nodes[i].start, nodes[i].end);
        /* Try again in case setup_node_bootmem missed one due
           to missing bootmem */
        for_each_node_mask(i, node_possible_map)
                if (!node_online(i))
                        setup_node_bootmem(i, nodes[i].start, nodes[i].end);

        for (i = 0; i < NR_CPUS; i++) {
                int node = early_cpu_to_node(i);

                if (node == NUMA_NO_NODE)
                        continue;
                if (!node_isset(node, node_possible_map))
                        numa_clear_node(i);
        }
        numa_init_array();
        return 0;
}

#ifdef CONFIG_NUMA_EMU
static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
        [0 ... MAX_NUMNODES-1] = PXM_INVAL
};
static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
        [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
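/* Find the parsed (real) node whose memory range contains addr, if any. */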
static int __init find_node_by_addr(unsigned long addr)
{
        int ret = NUMA_NO_NODE;
        int i;

        for_each_node_mask(i, nodes_parsed) {
                /*
                 * Find the real node that this emulated node appears on.  For
                 * the sake of simplicity, we only use a real node's starting
                 * address to determine which emulated node it appears on.
                 */
                if (addr >= nodes[i].start && addr < nodes[i].end) {
                        ret = i;
                        break;
                }
        }
        return ret;
}

/*
 * In NUMA emulation, we need to set up proximity domain (_PXM) to node ID
 * mappings that respect the real ACPI topology but reflect our emulated
 * environment.  For each emulated node, we find which real node it appears on
 * and create PXM to NID mappings for those fake nodes which mirror that
 * locality.  SLIT will now represent the correct distances between emulated
 * nodes as a result of the real topology.
 */
void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
{
        int i, j;

        printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
                         "topology.\n");
        for (i = 0; i < num_nodes; i++) {
                int nid, pxm;

                nid = find_node_by_addr(fake_nodes[i].start);
                if (nid == NUMA_NO_NODE)
                        continue;
                pxm = node_to_pxm(nid);
                if (pxm == PXM_INVAL)
                        continue;
                fake_node_to_pxm_map[i] = pxm;
                /*
                 * For each apicid_to_node mapping that exists for this real
                 * node, it must now point to the fake node ID.
                 */
                for (j = 0; j < MAX_LOCAL_APIC; j++)
                        if (apicid_to_node[j] == nid)
                                fake_apicid_to_node[j] = i;
        }
        for (i = 0; i < num_nodes; i++)
                __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
        memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));

        nodes_clear(nodes_parsed);
        for (i = 0; i < num_nodes; i++)
                if (fake_nodes[i].start != fake_nodes[i].end)
                        node_set(i, nodes_parsed);
        WARN_ON(!nodes_cover_memory(fake_nodes));
}

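/*
 * Used by __node_distance() when firmware supplied no SLIT: decide
 * whether two nodes count as local to each other.
 */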
static int null_slit_node_compare(int a, int b)
{
        return node_to_pxm(a) == node_to_pxm(b);
}
#else
static int null_slit_node_compare(int a, int b)
{
        return a == b;
}
#endif /* CONFIG_NUMA_EMU */

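/*
 * Reserve a node's recorded hot-add window in bootmem so it stays
 * untouched until the memory is actually hot-added.
 */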
void __init srat_reserve_add_area(int nodeid)
{
        if (found_add_area && nodes_add[nodeid].end) {
                u64 total_mb;

                printk(KERN_INFO "SRAT: Reserving hot-add memory space "
                                "for node %d at %Lx-%Lx\n",
                        nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
                total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
                                        >> PAGE_SHIFT;
                total_mb *= sizeof(struct page);
                total_mb >>= 20;
                printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
                                "pre-allocated memory.\n", (unsigned long long)total_mb);
                reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
                               nodes_add[nodeid].end - nodes_add[nodeid].start,
                               BOOTMEM_DEFAULT);
        }
}

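/*
 * Distance between two nodes in SLIT units; without a SLIT, fall back
 * to LOCAL_DISTANCE/REMOTE_DISTANCE based on null_slit_node_compare().
 */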
int __node_distance(int a, int b)
{
        int index;

        if (!acpi_slit)
                return null_slit_node_compare(a, b) ? LOCAL_DISTANCE :
                                                      REMOTE_DISTANCE;
        index = acpi_slit->locality_count * node_to_pxm(a);
        return acpi_slit->entry[index + node_to_pxm(b)];
}

EXPORT_SYMBOL(__node_distance);

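/*
 * Memory hotplug helper: pick the node whose recorded hot-add window
 * covers the new physical address, defaulting to node 0.
 */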
#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
int memory_add_physaddr_to_nid(u64 start)
{
        int i, ret = 0;

        for_each_node(i)
                if (nodes_add[i].start <= start && nodes_add[i].end > start)
                        ret = i;

        return ret;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif