linux/arch/x86/xen/setup.c
<<
>>
Prefs
   1/*
   2 * Machine specific setup for xen
   3 *
   4 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
   5 */
   6
   7#include <linux/module.h>
   8#include <linux/sched.h>
   9#include <linux/mm.h>
  10#include <linux/pm.h>
  11#include <linux/memblock.h>
  12#include <linux/cpuidle.h>
  13
  14#include <asm/elf.h>
  15#include <asm/vdso.h>
  16#include <asm/e820.h>
  17#include <asm/setup.h>
  18#include <asm/acpi.h>
  19#include <asm/xen/hypervisor.h>
  20#include <asm/xen/hypercall.h>
  21
  22#include <xen/xen.h>
  23#include <xen/page.h>
  24#include <xen/interface/callback.h>
  25#include <xen/interface/memory.h>
  26#include <xen/interface/physdev.h>
  27#include <xen/features.h>
  28
  29#include "xen-ops.h"
  30#include "vdso.h"
  31
/*
 * These are code, but not functions.  Defined in entry.S
 *
 * Their addresses are handed to the hypervisor via register_callback()
 * later in this file: the event and failsafe callbacks from
 * xen_arch_setup(), the sysenter target from xen_enable_sysenter() and
 * the syscall/syscall32 targets from xen_enable_syscall().
 */
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
extern void xen_sysenter_target(void);
extern void xen_syscall_target(void);
extern void xen_syscall32_target(void);

/*
 * Amount of extra memory space we add to the e820 ranges.
 *
 * Filled in by xen_add_extra_mem(); an entry whose size is 0 is treated
 * there as an unused slot.
 */
struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;

/*
 * Number of pages released from the initial allocation.
 * Assigned in xen_memory_setup() from the return value of
 * xen_set_identity_and_release().
 */
unsigned long xen_released_pages;

/*
 * The maximum amount of extra memory compared to the base size.  The
 * main scaling factor is the size of struct page.  At extreme ratios
 * of base:extra, all the base memory can be filled with page
 * structures for the extra memory, leaving no space for anything
 * else.
 *
 * 10x seems like a reasonable balance between scaling flexibility and
 * leaving a practically usable system.
 *
 * Used in xen_memory_setup() to clamp the number of extra pages.
 */
#define EXTRA_MEM_RATIO         (10)
  56
  57static void __init xen_add_extra_mem(u64 start, u64 size)
  58{
  59        unsigned long pfn;
  60        int i;
  61
  62        for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
  63                /* Add new region. */
  64                if (xen_extra_mem[i].size == 0) {
  65                        xen_extra_mem[i].start = start;
  66                        xen_extra_mem[i].size  = size;
  67                        break;
  68                }
  69                /* Append to existing region. */
  70                if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) {
  71                        xen_extra_mem[i].size += size;
  72                        break;
  73                }
  74        }
  75        if (i == XEN_EXTRA_MEM_MAX_REGIONS)
  76                printk(KERN_WARNING "Warning: not enough extra memory regions\n");
  77
  78        memblock_reserve(start, size);
  79
  80        xen_max_p2m_pfn = PFN_DOWN(start + size);
  81
  82        for (pfn = PFN_DOWN(start); pfn <= xen_max_p2m_pfn; pfn++)
  83                __set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
  84}
  85
  86static unsigned long __init xen_release_chunk(unsigned long start,
  87                                              unsigned long end)
  88{
  89        struct xen_memory_reservation reservation = {
  90                .address_bits = 0,
  91                .extent_order = 0,
  92                .domid        = DOMID_SELF
  93        };
  94        unsigned long len = 0;
  95        unsigned long pfn;
  96        int ret;
  97
  98        for(pfn = start; pfn < end; pfn++) {
  99                unsigned long mfn = pfn_to_mfn(pfn);
 100
 101                /* Make sure pfn exists to start with */
 102                if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
 103                        continue;
 104
 105                set_xen_guest_handle(reservation.extent_start, &mfn);
 106                reservation.nr_extents = 1;
 107
 108                ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
 109                                           &reservation);
 110                WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret);
 111                if (ret == 1) {
 112                        __set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
 113                        len++;
 114                }
 115        }
 116        printk(KERN_INFO "Freeing  %lx-%lx pfn range: %lu pages freed\n",
 117               start, end, len);
 118
 119        return len;
 120}
 121
 122static unsigned long __init xen_set_identity_and_release(
 123        const struct e820entry *list, size_t map_size, unsigned long nr_pages)
 124{
 125        phys_addr_t start = 0;
 126        unsigned long released = 0;
 127        unsigned long identity = 0;
 128        const struct e820entry *entry;
 129        int i;
 130
 131        /*
 132         * Combine non-RAM regions and gaps until a RAM region (or the
 133         * end of the map) is reached, then set the 1:1 map and
 134         * release the pages (if available) in those non-RAM regions.
 135         *
 136         * The combined non-RAM regions are rounded to a whole number
 137         * of pages so any partial pages are accessible via the 1:1
 138         * mapping.  This is needed for some BIOSes that put (for
 139         * example) the DMI tables in a reserved region that begins on
 140         * a non-page boundary.
 141         */
 142        for (i = 0, entry = list; i < map_size; i++, entry++) {
 143                phys_addr_t end = entry->addr + entry->size;
 144
 145                if (entry->type == E820_RAM || i == map_size - 1) {
 146                        unsigned long start_pfn = PFN_DOWN(start);
 147                        unsigned long end_pfn = PFN_UP(end);
 148
 149                        if (entry->type == E820_RAM)
 150                                end_pfn = PFN_UP(entry->addr);
 151
 152                        if (start_pfn < end_pfn) {
 153                                if (start_pfn < nr_pages)
 154                                        released += xen_release_chunk(
 155                                                start_pfn, min(end_pfn, nr_pages));
 156
 157                                identity += set_phys_range_identity(
 158                                        start_pfn, end_pfn);
 159                        }
 160                        start = end;
 161                }
 162        }
 163
 164        printk(KERN_INFO "Released %lu pages of unused memory\n", released);
 165        printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity);
 166
 167        return released;
 168}
 169
 170static unsigned long __init xen_get_max_pages(void)
 171{
 172        unsigned long max_pages = MAX_DOMAIN_PAGES;
 173        domid_t domid = DOMID_SELF;
 174        int ret;
 175
 176        /*
 177         * For the initial domain we use the maximum reservation as
 178         * the maximum page.
 179         *
 180         * For guest domains the current maximum reservation reflects
 181         * the current maximum rather than the static maximum. In this
 182         * case the e820 map provided to us will cover the static
 183         * maximum region.
 184         */
 185        if (xen_initial_domain()) {
 186                ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid);
 187                if (ret > 0)
 188                        max_pages = ret;
 189        }
 190
 191        return min(max_pages, MAX_DOMAIN_PAGES);
 192}
 193
 194static void xen_align_and_add_e820_region(u64 start, u64 size, int type)
 195{
 196        u64 end = start + size;
 197
 198        /* Align RAM regions to page boundaries. */
 199        if (type == E820_RAM) {
 200                start = PAGE_ALIGN(start);
 201                end &= ~((u64)PAGE_SIZE - 1);
 202        }
 203
 204        e820_add_region(start, end - start, type);
 205}
 206
 207/**
 208 * machine_specific_memory_setup - Hook for machine specific memory setup.
 209 **/
 210char * __init xen_memory_setup(void)
 211{
 212        static struct e820entry map[E820MAX] __initdata;
 213
 214        unsigned long max_pfn = xen_start_info->nr_pages;
 215        unsigned long long mem_end;
 216        int rc;
 217        struct xen_memory_map memmap;
 218        unsigned long max_pages;
 219        unsigned long extra_pages = 0;
 220        int i;
 221        int op;
 222
 223        max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
 224        mem_end = PFN_PHYS(max_pfn);
 225
 226        memmap.nr_entries = E820MAX;
 227        set_xen_guest_handle(memmap.buffer, map);
 228
 229        op = xen_initial_domain() ?
 230                XENMEM_machine_memory_map :
 231                XENMEM_memory_map;
 232        rc = HYPERVISOR_memory_op(op, &memmap);
 233        if (rc == -ENOSYS) {
 234                BUG_ON(xen_initial_domain());
 235                memmap.nr_entries = 1;
 236                map[0].addr = 0ULL;
 237                map[0].size = mem_end;
 238                /* 8MB slack (to balance backend allocations). */
 239                map[0].size += 8ULL << 20;
 240                map[0].type = E820_RAM;
 241                rc = 0;
 242        }
 243        BUG_ON(rc);
 244
 245        /* Make sure the Xen-supplied memory map is well-ordered. */
 246        sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries);
 247
 248        max_pages = xen_get_max_pages();
 249        if (max_pages > max_pfn)
 250                extra_pages += max_pages - max_pfn;
 251
 252        /*
 253         * Set P2M for all non-RAM pages and E820 gaps to be identity
 254         * type PFNs.  Any RAM pages that would be made inaccesible by
 255         * this are first released.
 256         */
 257        xen_released_pages = xen_set_identity_and_release(
 258                map, memmap.nr_entries, max_pfn);
 259        extra_pages += xen_released_pages;
 260
 261        /*
 262         * Clamp the amount of extra memory to a EXTRA_MEM_RATIO
 263         * factor the base size.  On non-highmem systems, the base
 264         * size is the full initial memory allocation; on highmem it
 265         * is limited to the max size of lowmem, so that it doesn't
 266         * get completely filled.
 267         *
 268         * In principle there could be a problem in lowmem systems if
 269         * the initial memory is also very large with respect to
 270         * lowmem, but we won't try to deal with that here.
 271         */
 272        extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
 273                          extra_pages);
 274
 275        i = 0;
 276        while (i < memmap.nr_entries) {
 277                u64 addr = map[i].addr;
 278                u64 size = map[i].size;
 279                u32 type = map[i].type;
 280
 281                if (type == E820_RAM) {
 282                        if (addr < mem_end) {
 283                                size = min(size, mem_end - addr);
 284                        } else if (extra_pages) {
 285                                size = min(size, (u64)extra_pages * PAGE_SIZE);
 286                                extra_pages -= size / PAGE_SIZE;
 287                                xen_add_extra_mem(addr, size);
 288                        } else
 289                                type = E820_UNUSABLE;
 290                }
 291
 292                xen_align_and_add_e820_region(addr, size, type);
 293
 294                map[i].addr += size;
 295                map[i].size -= size;
 296                if (map[i].size == 0)
 297                        i++;
 298        }
 299
 300        /*
 301         * In domU, the ISA region is normal, usable memory, but we
 302         * reserve ISA memory anyway because too many things poke
 303         * about in there.
 304         */
 305        e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
 306                        E820_RESERVED);
 307
 308        /*
 309         * Reserve Xen bits:
 310         *  - mfn_list
 311         *  - xen_start_info
 312         * See comment above "struct start_info" in <xen/interface/xen.h>
 313         */
 314        memblock_reserve(__pa(xen_start_info->mfn_list),
 315                         xen_start_info->pt_base - xen_start_info->mfn_list);
 316
 317        sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
 318
 319        return "Xen";
 320}
 321
 322/*
 323 * Set the bit indicating "nosegneg" library variants should be used.
 324 * We only need to bother in pure 32-bit mode; compat 32-bit processes
 325 * can have un-truncated segments, so wrapping around is allowed.
 326 */
 327static void __init fiddle_vdso(void)
 328{
 329#ifdef CONFIG_X86_32
 330        u32 *mask;
 331        mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK);
 332        *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
 333        mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK);
 334        *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
 335#endif
 336}
 337
 338static int __cpuinit register_callback(unsigned type, const void *func)
 339{
 340        struct callback_register callback = {
 341                .type = type,
 342                .address = XEN_CALLBACK(__KERNEL_CS, func),
 343                .flags = CALLBACKF_mask_events,
 344        };
 345
 346        return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
 347}
 348
 349void __cpuinit xen_enable_sysenter(void)
 350{
 351        int ret;
 352        unsigned sysenter_feature;
 353
 354#ifdef CONFIG_X86_32
 355        sysenter_feature = X86_FEATURE_SEP;
 356#else
 357        sysenter_feature = X86_FEATURE_SYSENTER32;
 358#endif
 359
 360        if (!boot_cpu_has(sysenter_feature))
 361                return;
 362
 363        ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
 364        if(ret != 0)
 365                setup_clear_cpu_cap(sysenter_feature);
 366}
 367
 368void __cpuinit xen_enable_syscall(void)
 369{
 370#ifdef CONFIG_X86_64
 371        int ret;
 372
 373        ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
 374        if (ret != 0) {
 375                printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
 376                /* Pretty fatal; 64-bit userspace has no other
 377                   mechanism for syscalls. */
 378        }
 379
 380        if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
 381                ret = register_callback(CALLBACKTYPE_syscall32,
 382                                        xen_syscall32_target);
 383                if (ret != 0)
 384                        setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
 385        }
 386#endif /* CONFIG_X86_64 */
 387}
 388
 389void __init xen_arch_setup(void)
 390{
 391        xen_panic_handler_init();
 392
 393        HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
 394        HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
 395
 396        if (!xen_feature(XENFEAT_auto_translated_physmap))
 397                HYPERVISOR_vm_assist(VMASST_CMD_enable,
 398                                     VMASST_TYPE_pae_extended_cr3);
 399
 400        if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
 401            register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
 402                BUG();
 403
 404        xen_enable_sysenter();
 405        xen_enable_syscall();
 406
 407#ifdef CONFIG_ACPI
 408        if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
 409                printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
 410                disable_acpi();
 411        }
 412#endif
 413
 414        memcpy(boot_command_line, xen_start_info->cmd_line,
 415               MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
 416               COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
 417
 418        /* Set up idle, making sure it calls safe_halt() pvop */
 419#ifdef CONFIG_X86_32
 420        boot_cpu_data.hlt_works_ok = 1;
 421#endif
 422        disable_cpuidle();
 423        boot_option_idle_override = IDLE_HALT;
 424        WARN_ON(set_pm_idle_to_default());
 425        fiddle_vdso();
 426}
 427
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.