linux/arch/x86/kernel/machine_kexec_64.c
<<
>>
Prefs
   1/*
   2 * handle transition of Linux booting another kernel
   3 * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
   4 *
   5 * This source code is licensed under the GNU General Public License,
   6 * Version 2.  See the file COPYING for more details.
   7 */
   8
   9#include <linux/mm.h>
  10#include <linux/kexec.h>
  11#include <linux/string.h>
  12#include <linux/gfp.h>
  13#include <linux/reboot.h>
  14#include <linux/numa.h>
  15#include <linux/ftrace.h>
  16#include <linux/io.h>
  17#include <linux/suspend.h>
  18
  19#include <asm/pgtable.h>
  20#include <asm/tlbflush.h>
  21#include <asm/mmu_context.h>
  22#include <asm/debugreg.h>
  23
  24static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
  25                                unsigned long addr)
  26{
  27        pud_t *pud;
  28        pmd_t *pmd;
  29        struct page *page;
  30        int result = -ENOMEM;
  31
  32        addr &= PMD_MASK;
  33        pgd += pgd_index(addr);
  34        if (!pgd_present(*pgd)) {
  35                page = kimage_alloc_control_pages(image, 0);
  36                if (!page)
  37                        goto out;
  38                pud = (pud_t *)page_address(page);
  39                clear_page(pud);
  40                set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
  41        }
  42        pud = pud_offset(pgd, addr);
  43        if (!pud_present(*pud)) {
  44                page = kimage_alloc_control_pages(image, 0);
  45                if (!page)
  46                        goto out;
  47                pmd = (pmd_t *)page_address(page);
  48                clear_page(pmd);
  49                set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
  50        }
  51        pmd = pmd_offset(pud, addr);
  52        if (!pmd_present(*pmd))
  53                set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
  54        result = 0;
  55out:
  56        return result;
  57}
  58
  59static void init_level2_page(pmd_t *level2p, unsigned long addr)
  60{
  61        unsigned long end_addr;
  62
  63        addr &= PAGE_MASK;
  64        end_addr = addr + PUD_SIZE;
  65        while (addr < end_addr) {
  66                set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
  67                addr += PMD_SIZE;
  68        }
  69}
  70
  71static int init_level3_page(struct kimage *image, pud_t *level3p,
  72                                unsigned long addr, unsigned long last_addr)
  73{
  74        unsigned long end_addr;
  75        int result;
  76
  77        result = 0;
  78        addr &= PAGE_MASK;
  79        end_addr = addr + PGDIR_SIZE;
  80        while ((addr < last_addr) && (addr < end_addr)) {
  81                struct page *page;
  82                pmd_t *level2p;
  83
  84                page = kimage_alloc_control_pages(image, 0);
  85                if (!page) {
  86                        result = -ENOMEM;
  87                        goto out;
  88                }
  89                level2p = (pmd_t *)page_address(page);
  90                init_level2_page(level2p, addr);
  91                set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
  92                addr += PUD_SIZE;
  93        }
  94        /* clear the unused entries */
  95        while (addr < end_addr) {
  96                pud_clear(level3p++);
  97                addr += PUD_SIZE;
  98        }
  99out:
 100        return result;
 101}
 102
 103
 104static int init_level4_page(struct kimage *image, pgd_t *level4p,
 105                                unsigned long addr, unsigned long last_addr)
 106{
 107        unsigned long end_addr;
 108        int result;
 109
 110        result = 0;
 111        addr &= PAGE_MASK;
 112        end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE);
 113        while ((addr < last_addr) && (addr < end_addr)) {
 114                struct page *page;
 115                pud_t *level3p;
 116
 117                page = kimage_alloc_control_pages(image, 0);
 118                if (!page) {
 119                        result = -ENOMEM;
 120                        goto out;
 121                }
 122                level3p = (pud_t *)page_address(page);
 123                result = init_level3_page(image, level3p, addr, last_addr);
 124                if (result)
 125                        goto out;
 126                set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
 127                addr += PGDIR_SIZE;
 128        }
 129        /* clear the unused entries */
 130        while (addr < end_addr) {
 131                pgd_clear(level4p++);
 132                addr += PGDIR_SIZE;
 133        }
 134out:
 135        return result;
 136}
 137
 138static void free_transition_pgtable(struct kimage *image)
 139{
 140        free_page((unsigned long)image->arch.pud);
 141        free_page((unsigned long)image->arch.pmd);
 142        free_page((unsigned long)image->arch.pte);
 143}
 144
 145static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
 146{
 147        pud_t *pud;
 148        pmd_t *pmd;
 149        pte_t *pte;
 150        unsigned long vaddr, paddr;
 151        int result = -ENOMEM;
 152
 153        vaddr = (unsigned long)relocate_kernel;
 154        paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE);
 155        pgd += pgd_index(vaddr);
 156        if (!pgd_present(*pgd)) {
 157                pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
 158                if (!pud)
 159                        goto err;
 160                image->arch.pud = pud;
 161                set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
 162        }
 163        pud = pud_offset(pgd, vaddr);
 164        if (!pud_present(*pud)) {
 165                pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
 166                if (!pmd)
 167                        goto err;
 168                image->arch.pmd = pmd;
 169                set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
 170        }
 171        pmd = pmd_offset(pud, vaddr);
 172        if (!pmd_present(*pmd)) {
 173                pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
 174                if (!pte)
 175                        goto err;
 176                image->arch.pte = pte;
 177                set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
 178        }
 179        pte = pte_offset_kernel(pmd, vaddr);
 180        set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
 181        return 0;
 182err:
 183        free_transition_pgtable(image);
 184        return result;
 185}
 186
 187
 188static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
 189{
 190        pgd_t *level4p;
 191        int result;
 192        level4p = (pgd_t *)__va(start_pgtable);
 193        result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT);
 194        if (result)
 195                return result;
 196        /*
 197         * image->start may be outside 0 ~ max_pfn, for example when
 198         * jump back to original kernel from kexeced kernel
 199         */
 200        result = init_one_level2_page(image, level4p, image->start);
 201        if (result)
 202                return result;
 203        return init_transition_pgtable(image, level4p);
 204}
 205
 206static void set_idt(void *newidt, u16 limit)
 207{
 208        struct desc_ptr curidt;
 209
 210        /* x86-64 supports unaliged loads & stores */
 211        curidt.size    = limit;
 212        curidt.address = (unsigned long)newidt;
 213
 214        __asm__ __volatile__ (
 215                "lidtq %0\n"
 216                : : "m" (curidt)
 217                );
 218};
 219
 220
 221static void set_gdt(void *newgdt, u16 limit)
 222{
 223        struct desc_ptr curgdt;
 224
 225        /* x86-64 supports unaligned loads & stores */
 226        curgdt.size    = limit;
 227        curgdt.address = (unsigned long)newgdt;
 228
 229        __asm__ __volatile__ (
 230                "lgdtq %0\n"
 231                : : "m" (curgdt)
 232                );
 233};
 234
/*
 * Reload the ds, es, ss, fs and gs segment registers with the
 * __KERNEL_DS selector.
 */
static void load_segments(void)
{
        /* The selector is passed in eax ("a" constraint) and copied to each register. */
        __asm__ __volatile__ (
                "\tmovl %0,%%ds\n"
                "\tmovl %0,%%es\n"
                "\tmovl %0,%%ss\n"
                "\tmovl %0,%%fs\n"
                "\tmovl %0,%%gs\n"
                : : "a" (__KERNEL_DS) : "memory"
                );
}
 246
 247int machine_kexec_prepare(struct kimage *image)
 248{
 249        unsigned long start_pgtable;
 250        int result;
 251
 252        /* Calculate the offsets */
 253        start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
 254
 255        /* Setup the identity mapped 64bit page table */
 256        result = init_pgtable(image, start_pgtable);
 257        if (result)
 258                return result;
 259
 260        return 0;
 261}
 262
/*
 * Undo the architecture specific setup for @image: release the pages
 * recorded in image->arch for the transition page table.
 */
void machine_kexec_cleanup(struct kimage *image)
{
        free_transition_pgtable(image);
}
 267
/*
 * Hand control over to the kernel described by @image.
 *
 * Do not allocate memory (or fail in any way) in machine_kexec().
 * We are past the point of no return, committed to rebooting now.
 *
 * The value returned by relocate_kernel() is stored back into image->start;
 * with CONFIG_KEXEC_JUMP and image->preserve_context set, the processor
 * state saved on entry is restored afterwards.
 */
void machine_kexec(struct kimage *image)
{
        unsigned long page_list[PAGES_NR];
        void *control_page;
        int save_ftrace_enabled;

#ifdef CONFIG_KEXEC_JUMP
        if (image->preserve_context)
                save_processor_state();
#endif

        save_ftrace_enabled = __ftrace_enabled_save();

        /* Interrupts aren't acceptable while we reboot */
        local_irq_disable();
        hw_breakpoint_disable();

        if (image->preserve_context) {
#ifdef CONFIG_X86_IO_APIC
                /*
                 * We need to put APICs in legacy mode so that we can
                 * get timer interrupts in second kernel. kexec/kdump
                 * paths already have calls to disable_IO_APIC() in
                 * one form or other. kexec jump path also need
                 * one.
                 */
                disable_IO_APIC();
#endif
        }

        /*
         * Copy the relocation code into the page that follows the first
         * page of control_code_page; the first page itself is handed over
         * below as PA_TABLE_PAGE.
         */
        control_page = page_address(image->control_code_page) + PAGE_SIZE;
        memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);

        /* Addresses passed to relocate_kernel() through page_list */
        page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
        page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
        page_list[PA_TABLE_PAGE] =
          (unsigned long)__pa(page_address(image->control_code_page));

        /* PA_SWAP_PAGE is only filled in for KEXEC_TYPE_DEFAULT images */
        if (image->type == KEXEC_TYPE_DEFAULT)
                page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
                                                << PAGE_SHIFT);

        /*
         * The segment registers are funny things, they have both a
         * visible and an invisible part.  Whenever the visible part is
         * set to a specific selector, the invisible part is loaded
         * from a table in memory.  At no other time is the
         * descriptor table in memory accessed.
         *
         * I take advantage of this here by force loading the
         * segments, before I zap the gdt with an invalid value.
         */
        load_segments();
        /*
         * The gdt & idt are now invalid.
         * If you want to load them you must set up your own idt & gdt.
         */
        set_gdt(phys_to_virt(0), 0);
        set_idt(phys_to_virt(0), 0);

        /* now call it */
        image->start = relocate_kernel((unsigned long)image->head,
                                       (unsigned long)page_list,
                                       image->start,
                                       image->preserve_context);

        /* NOTE(review): presumably only reached on the preserve_context (kexec jump) path - confirm */
#ifdef CONFIG_KEXEC_JUMP
        if (image->preserve_context)
                restore_processor_state();
#endif

        __ftrace_enabled_restore(save_ftrace_enabled);
}
 345
/*
 * Record the x86-64 specific entries in vmcoreinfo: the phys_base and
 * init_level4_pgt symbols and, when CONFIG_NUMA is enabled, the node_data
 * symbol together with its length (MAX_NUMNODES).
 */
void arch_crash_save_vmcoreinfo(void)
{
        VMCOREINFO_SYMBOL(phys_base);
        VMCOREINFO_SYMBOL(init_level4_pgt);

#ifdef CONFIG_NUMA
        VMCOREINFO_SYMBOL(node_data);
        VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
#endif
}
 356
 357
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.