linux/kernel/kexec.c
   1/*
   2 * kexec.c - kexec system call
   3 * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
   4 *
   5 * This source code is licensed under the GNU General Public License,
   6 * Version 2.  See the file COPYING for more details.
   7 */
   8
   9#include <linux/capability.h>
  10#include <linux/mm.h>
  11#include <linux/file.h>
  12#include <linux/slab.h>
  13#include <linux/fs.h>
  14#include <linux/kexec.h>
  15#include <linux/mutex.h>
  16#include <linux/list.h>
  17#include <linux/highmem.h>
  18#include <linux/syscalls.h>
  19#include <linux/reboot.h>
  20#include <linux/ioport.h>
  21#include <linux/hardirq.h>
  22#include <linux/elf.h>
  23#include <linux/elfcore.h>
  24#include <generated/utsrelease.h>
  25#include <linux/utsname.h>
  26#include <linux/numa.h>
  27#include <linux/suspend.h>
  28#include <linux/device.h>
  29#include <linux/freezer.h>
  30#include <linux/pm.h>
  31#include <linux/cpu.h>
  32#include <linux/console.h>
  33#include <linux/vmalloc.h>
  34#include <linux/swap.h>
  35#include <linux/syscore_ops.h>
  36
  37#include <asm/page.h>
  38#include <asm/uaccess.h>
  39#include <asm/io.h>
  40#include <asm/system.h>
  41#include <asm/sections.h>
  42
  43/* Per cpu memory for storing cpu states in case of system crash. */
  44note_buf_t __percpu *crash_notes;
  45
  46/* vmcoreinfo stuff */
  47static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
  48u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
  49size_t vmcoreinfo_size;
  50size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
  51
  52/* Location of the reserved area for the crash kernel */
  53struct resource crashk_res = {
  54        .name  = "Crash kernel",
  55        .start = 0,
  56        .end   = 0,
  57        .flags = IORESOURCE_BUSY | IORESOURCE_MEM
  58};
  59
  60int kexec_should_crash(struct task_struct *p)
  61{
  62        if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
  63                return 1;
  64        return 0;
  65}
  66
  67/*
  68 * When kexec transitions to the new kernel there is a one-to-one
  69 * mapping between physical and virtual addresses.  On processors
  70 * where you can disable the MMU this is trivial, and easy.  For
  71 * others it is still a simple predictable page table to setup.
  72 *
  73 * In that environment kexec copies the new kernel to its final
  74 * resting place.  This means I can only support memory whose
  75 * physical address can fit in an unsigned long.  In particular
  76 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
  77 * If the assembly stub has more restrictive requirements
  78 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
  79 * defined more restrictively in <asm/kexec.h>.
  80 *
  81 * The code for the transition from the current kernel to the
   82 * new kernel is placed in the control_code_buffer, whose size
  83 * is given by KEXEC_CONTROL_PAGE_SIZE.  In the best case only a single
  84 * page of memory is necessary, but some architectures require more.
  85 * Because this memory must be identity mapped in the transition from
  86 * virtual to physical addresses it must live in the range
  87 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
  88 * modifiable.
  89 *
  90 * The assembly stub in the control code buffer is passed a linked list
  91 * of descriptor pages detailing the source pages of the new kernel,
  92 * and the destination addresses of those source pages.  As this data
  93 * structure is not used in the context of the current OS, it must
  94 * be self-contained.
  95 *
  96 * The code has been made to work with highmem pages and will use a
  97 * destination page in its final resting place (if it happens
  98 * to allocate it).  The end product of this is that most of the
  99 * physical address space, and most of RAM can be used.
 100 *
 101 * Future directions include:
 102 *  - allocating a page table with the control code buffer identity
 103 *    mapped, to simplify machine_kexec and make kexec_on_panic more
 104 *    reliable.
 105 */
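     /*
      * Illustrative sketch (hypothetical addresses): the descriptor list
      * handed to the assembly stub is a chain of kimage_entry_t values,
      * each a physical address tagged in its low bits with IND_DESTINATION,
      * IND_SOURCE, IND_INDIRECTION or IND_DONE.  A two-page image destined
      * for 0x00100000 might be encoded as:
      *
      *   0x00100000 | IND_DESTINATION   start copying at 0x00100000
      *   0x0089a000 | IND_SOURCE        copy this page to 0x00100000
      *   0x008b3000 | IND_SOURCE        copy this page to 0x00101000
      *   0x007ff000 | IND_INDIRECTION   continue with the entries there
      *   ...
      *   IND_DONE                       end of the list
      *
      * The implicit destination advances by PAGE_SIZE after every
      * IND_SOURCE entry, which is exactly what kimage_add_page() below
      * maintains.
      */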
 106
 107/*
  108 * KIMAGE_NO_DEST is an impossible destination address, used for
 109 * allocating pages whose destination address we do not care about.
 110 */
 111#define KIMAGE_NO_DEST (-1UL)
 112
 113static int kimage_is_destination_range(struct kimage *image,
 114                                       unsigned long start, unsigned long end);
 115static struct page *kimage_alloc_page(struct kimage *image,
 116                                       gfp_t gfp_mask,
 117                                       unsigned long dest);
 118
 119static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
 120                            unsigned long nr_segments,
 121                            struct kexec_segment __user *segments)
 122{
 123        size_t segment_bytes;
 124        struct kimage *image;
 125        unsigned long i;
 126        int result;
 127
 128        /* Allocate a controlling structure */
 129        result = -ENOMEM;
 130        image = kzalloc(sizeof(*image), GFP_KERNEL);
 131        if (!image)
 132                goto out;
 133
 134        image->head = 0;
 135        image->entry = &image->head;
 136        image->last_entry = &image->head;
 137        image->control_page = ~0; /* By default this does not apply */
 138        image->start = entry;
 139        image->type = KEXEC_TYPE_DEFAULT;
 140
 141        /* Initialize the list of control pages */
 142        INIT_LIST_HEAD(&image->control_pages);
 143
 144        /* Initialize the list of destination pages */
 145        INIT_LIST_HEAD(&image->dest_pages);
 146
 147        /* Initialize the list of unusable pages */
 148        INIT_LIST_HEAD(&image->unuseable_pages);
 149
 150        /* Read in the segments */
 151        image->nr_segments = nr_segments;
 152        segment_bytes = nr_segments * sizeof(*segments);
 153        result = copy_from_user(image->segment, segments, segment_bytes);
 154        if (result) {
 155                result = -EFAULT;
 156                goto out;
 157        }
 158
 159        /*
 160         * Verify we have good destination addresses.  The caller is
 161         * responsible for making certain we don't attempt to load
 162         * the new image into invalid or reserved areas of RAM.  This
 163         * just verifies it is an address we can use.
 164         *
  165         * Since the kernel does everything in page size chunks, ensure
  166         * the destination addresses are page aligned.  Too many
  167         * special cases crop up when we don't do this.  The most
 168         * insidious is getting overlapping destination addresses
 169         * simply because addresses are changed to page size
 170         * granularity.
 171         */
 172        result = -EADDRNOTAVAIL;
 173        for (i = 0; i < nr_segments; i++) {
 174                unsigned long mstart, mend;
 175
 176                mstart = image->segment[i].mem;
 177                mend   = mstart + image->segment[i].memsz;
 178                if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
 179                        goto out;
 180                if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
 181                        goto out;
 182        }
 183
 184        /* Verify our destination addresses do not overlap.
  185         * If we allowed overlapping destination addresses
  186         * through, very weird things can happen with no
  187         * easy explanation as one segment stomps on another.
 188         */
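             /* A concrete instance of the overlap test below, with made-up
              * values: segments [0x200000, 0x204000) and [0x203000, 0x208000)
              * satisfy (mend > pstart) && (mstart < pend), so the load is
              * rejected.
              */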
 189        result = -EINVAL;
 190        for (i = 0; i < nr_segments; i++) {
 191                unsigned long mstart, mend;
 192                unsigned long j;
 193
 194                mstart = image->segment[i].mem;
 195                mend   = mstart + image->segment[i].memsz;
 196                for (j = 0; j < i; j++) {
 197                        unsigned long pstart, pend;
 198                        pstart = image->segment[j].mem;
 199                        pend   = pstart + image->segment[j].memsz;
 200                        /* Do the segments overlap ? */
 201                        if ((mend > pstart) && (mstart < pend))
 202                                goto out;
 203                }
 204        }
 205
 206        /* Ensure our buffer sizes are strictly less than
 207         * our memory sizes.  This should always be the case,
 208         * and it is easier to check up front than to be surprised
 209         * later on.
 210         */
 211        result = -EINVAL;
 212        for (i = 0; i < nr_segments; i++) {
 213                if (image->segment[i].bufsz > image->segment[i].memsz)
 214                        goto out;
 215        }
 216
 217        result = 0;
 218out:
 219        if (result == 0)
 220                *rimage = image;
 221        else
 222                kfree(image);
 223
 224        return result;
 225
 226}
 227
 228static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
 229                                unsigned long nr_segments,
 230                                struct kexec_segment __user *segments)
 231{
 232        int result;
 233        struct kimage *image;
 234
 235        /* Allocate and initialize a controlling structure */
 236        image = NULL;
 237        result = do_kimage_alloc(&image, entry, nr_segments, segments);
 238        if (result)
 239                goto out;
 240
 241        *rimage = image;
 242
 243        /*
  244         * Find a location for the control code buffer, and add it to
  245         * the vector of segments so that its pages will also be
 246         * counted as destination pages.
 247         */
 248        result = -ENOMEM;
 249        image->control_code_page = kimage_alloc_control_pages(image,
 250                                           get_order(KEXEC_CONTROL_PAGE_SIZE));
 251        if (!image->control_code_page) {
 252                printk(KERN_ERR "Could not allocate control_code_buffer\n");
 253                goto out;
 254        }
 255
 256        image->swap_page = kimage_alloc_control_pages(image, 0);
 257        if (!image->swap_page) {
 258                printk(KERN_ERR "Could not allocate swap buffer\n");
 259                goto out;
 260        }
 261
 262        result = 0;
 263 out:
 264        if (result == 0)
 265                *rimage = image;
 266        else
 267                kfree(image);
 268
 269        return result;
 270}
 271
 272static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
 273                                unsigned long nr_segments,
 274                                struct kexec_segment __user *segments)
 275{
 276        int result;
 277        struct kimage *image;
 278        unsigned long i;
 279
 280        image = NULL;
 281        /* Verify we have a valid entry point */
 282        if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
 283                result = -EADDRNOTAVAIL;
 284                goto out;
 285        }
 286
 287        /* Allocate and initialize a controlling structure */
 288        result = do_kimage_alloc(&image, entry, nr_segments, segments);
 289        if (result)
 290                goto out;
 291
 292        /* Enable the special crash kernel control page
 293         * allocation policy.
 294         */
 295        image->control_page = crashk_res.start;
 296        image->type = KEXEC_TYPE_CRASH;
 297
 298        /*
 299         * Verify we have good destination addresses.  Normally
 300         * the caller is responsible for making certain we don't
 301         * attempt to load the new image into invalid or reserved
 302         * areas of RAM.  But crash kernels are preloaded into a
  303         * reserved area of RAM.  We must ensure the addresses
  304         * are in the reserved area, otherwise preloading the
 305         * kernel could corrupt things.
 306         */
 307        result = -EADDRNOTAVAIL;
 308        for (i = 0; i < nr_segments; i++) {
 309                unsigned long mstart, mend;
 310
 311                mstart = image->segment[i].mem;
 312                mend = mstart + image->segment[i].memsz - 1;
 313                /* Ensure we are within the crash kernel limits */
 314                if ((mstart < crashk_res.start) || (mend > crashk_res.end))
 315                        goto out;
 316        }
 317
 318        /*
  319         * Find a location for the control code buffer, and add it to
  320         * the vector of segments so that its pages will also be
 321         * counted as destination pages.
 322         */
 323        result = -ENOMEM;
 324        image->control_code_page = kimage_alloc_control_pages(image,
 325                                           get_order(KEXEC_CONTROL_PAGE_SIZE));
 326        if (!image->control_code_page) {
 327                printk(KERN_ERR "Could not allocate control_code_buffer\n");
 328                goto out;
 329        }
 330
 331        result = 0;
 332out:
 333        if (result == 0)
 334                *rimage = image;
 335        else
 336                kfree(image);
 337
 338        return result;
 339}
 340
 341static int kimage_is_destination_range(struct kimage *image,
 342                                        unsigned long start,
 343                                        unsigned long end)
 344{
 345        unsigned long i;
 346
 347        for (i = 0; i < image->nr_segments; i++) {
 348                unsigned long mstart, mend;
 349
 350                mstart = image->segment[i].mem;
 351                mend = mstart + image->segment[i].memsz;
 352                if ((end > mstart) && (start < mend))
 353                        return 1;
 354        }
 355
 356        return 0;
 357}
 358
 359static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
 360{
 361        struct page *pages;
 362
 363        pages = alloc_pages(gfp_mask, order);
 364        if (pages) {
 365                unsigned int count, i;
 366                pages->mapping = NULL;
 367                set_page_private(pages, order);
 368                count = 1 << order;
 369                for (i = 0; i < count; i++)
 370                        SetPageReserved(pages + i);
 371        }
 372
 373        return pages;
 374}
 375
 376static void kimage_free_pages(struct page *page)
 377{
 378        unsigned int order, count, i;
 379
 380        order = page_private(page);
 381        count = 1 << order;
 382        for (i = 0; i < count; i++)
 383                ClearPageReserved(page + i);
 384        __free_pages(page, order);
 385}
 386
 387static void kimage_free_page_list(struct list_head *list)
 388{
 389        struct list_head *pos, *next;
 390
 391        list_for_each_safe(pos, next, list) {
 392                struct page *page;
 393
 394                page = list_entry(pos, struct page, lru);
 395                list_del(&page->lru);
 396                kimage_free_pages(page);
 397        }
 398}
 399
 400static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
 401                                                        unsigned int order)
 402{
 403        /* Control pages are special, they are the intermediaries
 404         * that are needed while we copy the rest of the pages
 405         * to their final resting place.  As such they must
 406         * not conflict with either the destination addresses
 407         * or memory the kernel is already using.
 408         *
 409         * The only case where we really need more than one of
  410         * these is for architectures where we cannot disable
 411         * the MMU and must instead generate an identity mapped
 412         * page table for all of the memory.
 413         *
 414         * At worst this runs in O(N) of the image size.
 415         */
 416        struct list_head extra_pages;
 417        struct page *pages;
 418        unsigned int count;
 419
 420        count = 1 << order;
 421        INIT_LIST_HEAD(&extra_pages);
 422
 423        /* Loop while I can allocate a page and the page allocated
 424         * is a destination page.
 425         */
 426        do {
 427                unsigned long pfn, epfn, addr, eaddr;
 428
 429                pages = kimage_alloc_pages(GFP_KERNEL, order);
 430                if (!pages)
 431                        break;
 432                pfn   = page_to_pfn(pages);
 433                epfn  = pfn + count;
 434                addr  = pfn << PAGE_SHIFT;
 435                eaddr = epfn << PAGE_SHIFT;
 436                if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
 437                              kimage_is_destination_range(image, addr, eaddr)) {
 438                        list_add(&pages->lru, &extra_pages);
 439                        pages = NULL;
 440                }
 441        } while (!pages);
 442
 443        if (pages) {
 444                /* Remember the allocated page... */
 445                list_add(&pages->lru, &image->control_pages);
 446
  447                /* Because the page is already in its destination
 448                 * location we will never allocate another page at
 449                 * that address.  Therefore kimage_alloc_pages
 450                 * will not return it (again) and we don't need
 451                 * to give it an entry in image->segment[].
 452                 */
 453        }
 454        /* Deal with the destination pages I have inadvertently allocated.
 455         *
 456         * Ideally I would convert multi-page allocations into single
 457         * page allocations, and add everything to image->dest_pages.
 458         *
 459         * For now it is simpler to just free the pages.
 460         */
 461        kimage_free_page_list(&extra_pages);
 462
 463        return pages;
 464}
 465
 466static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
 467                                                      unsigned int order)
 468{
 469        /* Control pages are special, they are the intermediaries
 470         * that are needed while we copy the rest of the pages
 471         * to their final resting place.  As such they must
 472         * not conflict with either the destination addresses
 473         * or memory the kernel is already using.
 474         *
  475         * Control pages are also the only pages we must allocate
 476         * when loading a crash kernel.  All of the other pages
 477         * are specified by the segments and we just memcpy
 478         * into them directly.
 479         *
 480         * The only case where we really need more than one of
  481         * these is for architectures where we cannot disable
 482         * the MMU and must instead generate an identity mapped
 483         * page table for all of the memory.
 484         *
 485         * Given the low demand this implements a very simple
 486         * allocator that finds the first hole of the appropriate
 487         * size in the reserved memory region, and allocates all
 488         * of the memory up to and including the hole.
 489         */
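             /* Rounding sketch with made-up numbers, assuming 4 KiB pages:
              * for order = 1 (size = 0x2000) and image->control_page =
              * 0x1001000, hole_start = (0x1001000 + 0x1fff) & ~0x1fff =
              * 0x1002000 and hole_end = 0x1003fff, i.e. the candidate hole
              * is aligned to its own size before being checked against the
              * segments.
              */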
 490        unsigned long hole_start, hole_end, size;
 491        struct page *pages;
 492
 493        pages = NULL;
 494        size = (1 << order) << PAGE_SHIFT;
 495        hole_start = (image->control_page + (size - 1)) & ~(size - 1);
 496        hole_end   = hole_start + size - 1;
 497        while (hole_end <= crashk_res.end) {
 498                unsigned long i;
 499
 500                if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
 501                        break;
 502                if (hole_end > crashk_res.end)
 503                        break;
 504                /* See if I overlap any of the segments */
 505                for (i = 0; i < image->nr_segments; i++) {
 506                        unsigned long mstart, mend;
 507
 508                        mstart = image->segment[i].mem;
 509                        mend   = mstart + image->segment[i].memsz - 1;
 510                        if ((hole_end >= mstart) && (hole_start <= mend)) {
 511                                /* Advance the hole to the end of the segment */
 512                                hole_start = (mend + (size - 1)) & ~(size - 1);
 513                                hole_end   = hole_start + size - 1;
 514                                break;
 515                        }
 516                }
 517                /* If I don't overlap any segments I have found my hole! */
 518                if (i == image->nr_segments) {
 519                        pages = pfn_to_page(hole_start >> PAGE_SHIFT);
 520                        break;
 521                }
 522        }
 523        if (pages)
 524                image->control_page = hole_end;
 525
 526        return pages;
 527}
 528
 529
 530struct page *kimage_alloc_control_pages(struct kimage *image,
 531                                         unsigned int order)
 532{
 533        struct page *pages = NULL;
 534
 535        switch (image->type) {
 536        case KEXEC_TYPE_DEFAULT:
 537                pages = kimage_alloc_normal_control_pages(image, order);
 538                break;
 539        case KEXEC_TYPE_CRASH:
 540                pages = kimage_alloc_crash_control_pages(image, order);
 541                break;
 542        }
 543
 544        return pages;
 545}
 546
 547static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
 548{
 549        if (*image->entry != 0)
 550                image->entry++;
 551
 552        if (image->entry == image->last_entry) {
 553                kimage_entry_t *ind_page;
 554                struct page *page;
 555
 556                page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
 557                if (!page)
 558                        return -ENOMEM;
 559
 560                ind_page = page_address(page);
 561                *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
 562                image->entry = ind_page;
 563                image->last_entry = ind_page +
 564                                      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
 565        }
 566        *image->entry = entry;
 567        image->entry++;
 568        *image->entry = 0;
 569
 570        return 0;
 571}
 572
 573static int kimage_set_destination(struct kimage *image,
 574                                   unsigned long destination)
 575{
 576        int result;
 577
 578        destination &= PAGE_MASK;
 579        result = kimage_add_entry(image, destination | IND_DESTINATION);
 580        if (result == 0)
 581                image->destination = destination;
 582
 583        return result;
 584}
 585
 586
 587static int kimage_add_page(struct kimage *image, unsigned long page)
 588{
 589        int result;
 590
 591        page &= PAGE_MASK;
 592        result = kimage_add_entry(image, page | IND_SOURCE);
 593        if (result == 0)
 594                image->destination += PAGE_SIZE;
 595
 596        return result;
 597}
 598
 599
 600static void kimage_free_extra_pages(struct kimage *image)
 601{
 602        /* Walk through and free any extra destination pages I may have */
 603        kimage_free_page_list(&image->dest_pages);
 604
 605        /* Walk through and free any unusable pages I have cached */
 606        kimage_free_page_list(&image->unuseable_pages);
 607
 608}
 609static void kimage_terminate(struct kimage *image)
 610{
 611        if (*image->entry != 0)
 612                image->entry++;
 613
 614        *image->entry = IND_DONE;
 615}
 616
 617#define for_each_kimage_entry(image, ptr, entry) \
 618        for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
 619                ptr = (entry & IND_INDIRECTION)? \
 620                        phys_to_virt((entry & PAGE_MASK)): ptr +1)
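     /*
      * Usage sketch (hypothetical, not used elsewhere in this file):
      * counting the source pages of an image with the iterator above.
      *
      *   kimage_entry_t *ptr, entry;
      *   unsigned long nr_source = 0;
      *
      *   for_each_kimage_entry(image, ptr, entry)
      *           if (entry & IND_SOURCE)
      *                   nr_source++;
      */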
 621
 622static void kimage_free_entry(kimage_entry_t entry)
 623{
 624        struct page *page;
 625
 626        page = pfn_to_page(entry >> PAGE_SHIFT);
 627        kimage_free_pages(page);
 628}
 629
 630static void kimage_free(struct kimage *image)
 631{
 632        kimage_entry_t *ptr, entry;
 633        kimage_entry_t ind = 0;
 634
 635        if (!image)
 636                return;
 637
 638        kimage_free_extra_pages(image);
 639        for_each_kimage_entry(image, ptr, entry) {
 640                if (entry & IND_INDIRECTION) {
 641                        /* Free the previous indirection page */
 642                        if (ind & IND_INDIRECTION)
 643                                kimage_free_entry(ind);
 644                        /* Save this indirection page until we are
 645                         * done with it.
 646                         */
 647                        ind = entry;
 648                }
 649                else if (entry & IND_SOURCE)
 650                        kimage_free_entry(entry);
 651        }
 652        /* Free the final indirection page */
 653        if (ind & IND_INDIRECTION)
 654                kimage_free_entry(ind);
 655
 656        /* Handle any machine specific cleanup */
 657        machine_kexec_cleanup(image);
 658
 659        /* Free the kexec control pages... */
 660        kimage_free_page_list(&image->control_pages);
 661        kfree(image);
 662}
 663
 664static kimage_entry_t *kimage_dst_used(struct kimage *image,
 665                                        unsigned long page)
 666{
 667        kimage_entry_t *ptr, entry;
 668        unsigned long destination = 0;
 669
 670        for_each_kimage_entry(image, ptr, entry) {
 671                if (entry & IND_DESTINATION)
 672                        destination = entry & PAGE_MASK;
 673                else if (entry & IND_SOURCE) {
 674                        if (page == destination)
 675                                return ptr;
 676                        destination += PAGE_SIZE;
 677                }
 678        }
 679
 680        return NULL;
 681}
 682
 683static struct page *kimage_alloc_page(struct kimage *image,
 684                                        gfp_t gfp_mask,
 685                                        unsigned long destination)
 686{
 687        /*
 688         * Here we implement safeguards to ensure that a source page
 689         * is not copied to its destination page before the data on
 690         * the destination page is no longer useful.
 691         *
 692         * To do this we maintain the invariant that a source page is
 693         * either its own destination page, or it is not a
 694         * destination page at all.
 695         *
 696         * That is slightly stronger than required, but the proof
  697         * that no problems will occur is trivial, and the
  698         * implementation is simple to verify.
 699         *
 700         * When allocating all pages normally this algorithm will run
 701         * in O(N) time, but in the worst case it will run in O(N^2)
 702         * time.   If the runtime is a problem the data structures can
 703         * be fixed.
 704         */
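             /* Worked example of the swap handled further down, with
              * hypothetical addresses: we need destination 0x300000 but the
              * allocator returns the page at 0x500000, and 0x500000 is itself
              * the destination of an already loaded source page sitting at
              * 0x700000.  We copy 0x700000 into 0x500000, rewrite that
              * IND_SOURCE entry to point at 0x500000 (the source now lives at
              * its own destination), and reuse the page at 0x700000 for the
              * current request.
              */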
 705        struct page *page;
 706        unsigned long addr;
 707
 708        /*
 709         * Walk through the list of destination pages, and see if I
 710         * have a match.
 711         */
 712        list_for_each_entry(page, &image->dest_pages, lru) {
 713                addr = page_to_pfn(page) << PAGE_SHIFT;
 714                if (addr == destination) {
 715                        list_del(&page->lru);
 716                        return page;
 717                }
 718        }
 719        page = NULL;
 720        while (1) {
 721                kimage_entry_t *old;
 722
 723                /* Allocate a page, if we run out of memory give up */
 724                page = kimage_alloc_pages(gfp_mask, 0);
 725                if (!page)
 726                        return NULL;
  727                /* If the page cannot be used, file it away */
 728                if (page_to_pfn(page) >
 729                                (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
 730                        list_add(&page->lru, &image->unuseable_pages);
 731                        continue;
 732                }
 733                addr = page_to_pfn(page) << PAGE_SHIFT;
 734
  735                /* If it is the destination page we want, use it */
 736                if (addr == destination)
 737                        break;
 738
  739                /* If the page is not a destination page, use it */
 740                if (!kimage_is_destination_range(image, addr,
 741                                                  addr + PAGE_SIZE))
 742                        break;
 743
 744                /*
  745                 * I know that the page is someone's destination page.
 746                 * See if there is already a source page for this
 747                 * destination page.  And if so swap the source pages.
 748                 */
 749                old = kimage_dst_used(image, addr);
 750                if (old) {
 751                        /* If so move it */
 752                        unsigned long old_addr;
 753                        struct page *old_page;
 754
 755                        old_addr = *old & PAGE_MASK;
 756                        old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
 757                        copy_highpage(page, old_page);
 758                        *old = addr | (*old & ~PAGE_MASK);
 759
 760                        /* The old page I have found cannot be a
  761                         * destination page, so return it only if its
 762                         * gfp_flags honor the ones passed in.
 763                         */
 764                        if (!(gfp_mask & __GFP_HIGHMEM) &&
 765                            PageHighMem(old_page)) {
 766                                kimage_free_pages(old_page);
 767                                continue;
 768                        }
 769                        addr = old_addr;
 770                        page = old_page;
 771                        break;
 772                }
 773                else {
  774                        /* Place the page on the destination list; I
 775                         * will use it later.
 776                         */
 777                        list_add(&page->lru, &image->dest_pages);
 778                }
 779        }
 780
 781        return page;
 782}
 783
 784static int kimage_load_normal_segment(struct kimage *image,
 785                                         struct kexec_segment *segment)
 786{
 787        unsigned long maddr;
 788        unsigned long ubytes, mbytes;
 789        int result;
 790        unsigned char __user *buf;
 791
 792        result = 0;
 793        buf = segment->buf;
 794        ubytes = segment->bufsz;
 795        mbytes = segment->memsz;
 796        maddr = segment->mem;
 797
 798        result = kimage_set_destination(image, maddr);
 799        if (result < 0)
 800                goto out;
 801
 802        while (mbytes) {
 803                struct page *page;
 804                char *ptr;
 805                size_t uchunk, mchunk;
 806
 807                page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
 808                if (!page) {
 809                        result  = -ENOMEM;
 810                        goto out;
 811                }
 812                result = kimage_add_page(image, page_to_pfn(page)
 813                                                                << PAGE_SHIFT);
 814                if (result < 0)
 815                        goto out;
 816
 817                ptr = kmap(page);
 818                /* Start with a clear page */
 819                clear_page(ptr);
 820                ptr += maddr & ~PAGE_MASK;
 821                mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
 822                if (mchunk > mbytes)
 823                        mchunk = mbytes;
 824
 825                uchunk = mchunk;
 826                if (uchunk > ubytes)
 827                        uchunk = ubytes;
 828
 829                result = copy_from_user(ptr, buf, uchunk);
 830                kunmap(page);
 831                if (result) {
 832                        result = -EFAULT;
 833                        goto out;
 834                }
 835                ubytes -= uchunk;
 836                maddr  += mchunk;
 837                buf    += mchunk;
 838                mbytes -= mchunk;
 839        }
 840out:
 841        return result;
 842}
 843
 844static int kimage_load_crash_segment(struct kimage *image,
 845                                        struct kexec_segment *segment)
 846{
  847        /* For crash dump kernels we simply copy the data from
  848         * user space to its destination.
 849         * We do things a page at a time for the sake of kmap.
 850         */
 851        unsigned long maddr;
 852        unsigned long ubytes, mbytes;
 853        int result;
 854        unsigned char __user *buf;
 855
 856        result = 0;
 857        buf = segment->buf;
 858        ubytes = segment->bufsz;
 859        mbytes = segment->memsz;
 860        maddr = segment->mem;
 861        while (mbytes) {
 862                struct page *page;
 863                char *ptr;
 864                size_t uchunk, mchunk;
 865
 866                page = pfn_to_page(maddr >> PAGE_SHIFT);
 867                if (!page) {
 868                        result  = -ENOMEM;
 869                        goto out;
 870                }
 871                ptr = kmap(page);
 872                ptr += maddr & ~PAGE_MASK;
 873                mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
 874                if (mchunk > mbytes)
 875                        mchunk = mbytes;
 876
 877                uchunk = mchunk;
 878                if (uchunk > ubytes) {
 879                        uchunk = ubytes;
 880                        /* Zero the trailing part of the page */
 881                        memset(ptr + uchunk, 0, mchunk - uchunk);
 882                }
 883                result = copy_from_user(ptr, buf, uchunk);
 884                kexec_flush_icache_page(page);
 885                kunmap(page);
 886                if (result) {
 887                        result = -EFAULT;
 888                        goto out;
 889                }
 890                ubytes -= uchunk;
 891                maddr  += mchunk;
 892                buf    += mchunk;
 893                mbytes -= mchunk;
 894        }
 895out:
 896        return result;
 897}
 898
 899static int kimage_load_segment(struct kimage *image,
 900                                struct kexec_segment *segment)
 901{
 902        int result = -ENOMEM;
 903
 904        switch (image->type) {
 905        case KEXEC_TYPE_DEFAULT:
 906                result = kimage_load_normal_segment(image, segment);
 907                break;
 908        case KEXEC_TYPE_CRASH:
 909                result = kimage_load_crash_segment(image, segment);
 910                break;
 911        }
 912
 913        return result;
 914}
 915
 916/*
 917 * Exec Kernel system call: for obvious reasons only root may call it.
 918 *
 919 * This call breaks up into three pieces.
 920 * - A generic part which loads the new kernel from the current
 921 *   address space, and very carefully places the data in the
 922 *   allocated pages.
 923 *
 924 * - A generic part that interacts with the kernel and tells all of
  925 *   the devices to shut down, preventing on-going DMAs and placing
  926 *   the devices in a consistent state so a later kernel can
  927 *   reinitialize them.
  928 *
  929 * - A machine specific part that includes the syscall number
  930 *   and then copies the image to its final destination and
  931 *   jumps into the image at entry.
  932 *
  933 * kexec does not sync or unmount filesystems, so if you need
  934 * that to happen you need to do it yourself.
 935 */
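     /*
      * Userspace sketch of driving this interface (illustrative only; the
      * buffer, entry point and load address below are made up and error
      * handling is omitted).  A caller with CAP_SYS_BOOT loads one
      * page-aligned segment and later boots it via reboot(2):
      *
      *   struct kexec_segment seg = {
      *           .buf   = image_buf,
      *           .bufsz = 4096,
      *           .mem   = (void *)0x100000,
      *           .memsz = 4096,
      *   };
      *
      *   syscall(__NR_kexec_load, entry, 1UL, &seg, KEXEC_ARCH_DEFAULT);
      *   ...
      *   reboot(LINUX_REBOOT_CMD_KEXEC);
      */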
 936struct kimage *kexec_image;
 937struct kimage *kexec_crash_image;
 938
 939static DEFINE_MUTEX(kexec_mutex);
 940
 941SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
 942                struct kexec_segment __user *, segments, unsigned long, flags)
 943{
 944        struct kimage **dest_image, *image;
 945        int result;
 946
 947        /* We only trust the superuser with rebooting the system. */
 948        if (!capable(CAP_SYS_BOOT))
 949                return -EPERM;
 950
 951        /*
  952         * Verify we have a legal set of flags.
 953         * This leaves us room for future extensions.
 954         */
 955        if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
 956                return -EINVAL;
 957
 958        /* Verify we are on the appropriate architecture */
 959        if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
 960                ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
 961                return -EINVAL;
 962
 963        /* Put an artificial cap on the number
 964         * of segments passed to kexec_load.
 965         */
 966        if (nr_segments > KEXEC_SEGMENT_MAX)
 967                return -EINVAL;
 968
 969        image = NULL;
 970        result = 0;
 971
 972        /* Because we write directly to the reserved memory
 973         * region when loading crash kernels we need a mutex here to
  974         * prevent multiple crash kernels from attempting to load
  975         * simultaneously, and to prevent a crash kernel from loading
  976         * over the top of an in-use crash kernel.
 977         *
 978         * KISS: always take the mutex.
 979         */
 980        if (!mutex_trylock(&kexec_mutex))
 981                return -EBUSY;
 982
 983        dest_image = &kexec_image;
 984        if (flags & KEXEC_ON_CRASH)
 985                dest_image = &kexec_crash_image;
 986        if (nr_segments > 0) {
 987                unsigned long i;
 988
 989                /* Loading another kernel to reboot into */
 990                if ((flags & KEXEC_ON_CRASH) == 0)
 991                        result = kimage_normal_alloc(&image, entry,
 992                                                        nr_segments, segments);
 993                /* Loading another kernel to switch to if this one crashes */
 994                else if (flags & KEXEC_ON_CRASH) {
 995                        /* Free any current crash dump kernel before
 996                         * we corrupt it.
 997                         */
 998                        kimage_free(xchg(&kexec_crash_image, NULL));
 999                        result = kimage_crash_alloc(&image, entry,
1000                                                     nr_segments, segments);
1001                        crash_map_reserved_pages();
1002                }
1003                if (result)
1004                        goto out;
1005
1006                if (flags & KEXEC_PRESERVE_CONTEXT)
1007                        image->preserve_context = 1;
1008                result = machine_kexec_prepare(image);
1009                if (result)
1010                        goto out;
1011
1012                for (i = 0; i < nr_segments; i++) {
1013                        result = kimage_load_segment(image, &image->segment[i]);
1014                        if (result)
1015                                goto out;
1016                }
1017                kimage_terminate(image);
1018                if (flags & KEXEC_ON_CRASH)
1019                        crash_unmap_reserved_pages();
1020        }
 1021        /* Install the new kernel and uninstall the old */
1022        image = xchg(dest_image, image);
1023
1024out:
1025        mutex_unlock(&kexec_mutex);
1026        kimage_free(image);
1027
1028        return result;
1029}
1030
1031/*
1032 * Add and remove page tables for crashkernel memory
1033 *
1034 * Provide an empty default implementation here -- architecture
1035 * code may override this
1036 */
1037void __weak crash_map_reserved_pages(void)
1038{}
1039
1040void __weak crash_unmap_reserved_pages(void)
1041{}
1042
1043#ifdef CONFIG_COMPAT
1044asmlinkage long compat_sys_kexec_load(unsigned long entry,
1045                                unsigned long nr_segments,
1046                                struct compat_kexec_segment __user *segments,
1047                                unsigned long flags)
1048{
1049        struct compat_kexec_segment in;
1050        struct kexec_segment out, __user *ksegments;
1051        unsigned long i, result;
1052
1053        /* Don't allow clients that don't understand the native
1054         * architecture to do anything.
1055         */
1056        if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
1057                return -EINVAL;
1058
1059        if (nr_segments > KEXEC_SEGMENT_MAX)
1060                return -EINVAL;
1061
1062        ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
1063        for (i=0; i < nr_segments; i++) {
1064                result = copy_from_user(&in, &segments[i], sizeof(in));
1065                if (result)
1066                        return -EFAULT;
1067
1068                out.buf   = compat_ptr(in.buf);
1069                out.bufsz = in.bufsz;
1070                out.mem   = in.mem;
1071                out.memsz = in.memsz;
1072
1073                result = copy_to_user(&ksegments[i], &out, sizeof(out));
1074                if (result)
1075                        return -EFAULT;
1076        }
1077
1078        return sys_kexec_load(entry, nr_segments, ksegments, flags);
1079}
1080#endif
1081
1082void crash_kexec(struct pt_regs *regs)
1083{
1084        /* Take the kexec_mutex here to prevent sys_kexec_load
1085         * running on one cpu from replacing the crash kernel
1086         * we are using after a panic on a different cpu.
1087         *
1088         * If the crash kernel was not located in a fixed area
1089         * of memory the xchg(&kexec_crash_image) would be
1090         * sufficient.  But since I reuse the memory...
1091         */
1092        if (mutex_trylock(&kexec_mutex)) {
1093                if (kexec_crash_image) {
1094                        struct pt_regs fixed_regs;
1095
1096                        crash_setup_regs(&fixed_regs, regs);
1097                        crash_save_vmcoreinfo();
1098                        machine_crash_shutdown(&fixed_regs);
1099                        machine_kexec(kexec_crash_image);
1100                }
1101                mutex_unlock(&kexec_mutex);
1102        }
1103}
1104
1105size_t crash_get_memory_size(void)
1106{
1107        size_t size = 0;
1108        mutex_lock(&kexec_mutex);
1109        if (crashk_res.end != crashk_res.start)
1110                size = resource_size(&crashk_res);
1111        mutex_unlock(&kexec_mutex);
1112        return size;
1113}
1114
1115void __weak crash_free_reserved_phys_range(unsigned long begin,
1116                                           unsigned long end)
1117{
1118        unsigned long addr;
1119
1120        for (addr = begin; addr < end; addr += PAGE_SIZE) {
1121                ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT));
1122                init_page_count(pfn_to_page(addr >> PAGE_SHIFT));
1123                free_page((unsigned long)__va(addr));
1124                totalram_pages++;
1125        }
1126}
1127
1128int crash_shrink_memory(unsigned long new_size)
1129{
1130        int ret = 0;
1131        unsigned long start, end;
1132        unsigned long old_size;
1133        struct resource *ram_res;
1134
1135        mutex_lock(&kexec_mutex);
1136
1137        if (kexec_crash_image) {
1138                ret = -ENOENT;
1139                goto unlock;
1140        }
1141        start = crashk_res.start;
1142        end = crashk_res.end;
1143        old_size = (end == 0) ? 0 : end - start + 1;
1144        if (new_size >= old_size) {
1145                ret = (new_size == old_size) ? 0 : -EINVAL;
1146                goto unlock;
1147        }
1148
1149        ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
1150        if (!ram_res) {
1151                ret = -ENOMEM;
1152                goto unlock;
1153        }
1154
1155        start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
1156        end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
1157
1158        crash_map_reserved_pages();
1159        crash_free_reserved_phys_range(end, crashk_res.end);
1160
1161        if ((start == end) && (crashk_res.parent != NULL))
1162                release_resource(&crashk_res);
1163
1164        ram_res->start = end;
1165        ram_res->end = crashk_res.end;
1166        ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
1167        ram_res->name = "System RAM";
1168
1169        crashk_res.end = end - 1;
1170
1171        insert_resource(&iomem_resource, ram_res);
1172        crash_unmap_reserved_pages();
1173
1174unlock:
1175        mutex_unlock(&kexec_mutex);
1176        return ret;
1177}
1178
1179static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
1180                            size_t data_len)
1181{
1182        struct elf_note note;
1183
1184        note.n_namesz = strlen(name) + 1;
1185        note.n_descsz = data_len;
1186        note.n_type   = type;
1187        memcpy(buf, &note, sizeof(note));
1188        buf += (sizeof(note) + 3)/4;
1189        memcpy(buf, name, note.n_namesz);
1190        buf += (note.n_namesz + 3)/4;
1191        memcpy(buf, data, note.n_descsz);
1192        buf += (note.n_descsz + 3)/4;
1193
1194        return buf;
1195}
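     /*
      * Resulting buffer layout (each piece padded up to a multiple of four
      * bytes, matching the pointer arithmetic above):
      *
      *   | n_namesz | n_descsz | n_type | name (padded) | desc (padded) |
      *
      * final_note() below then terminates the chain of notes with an
      * all-zero header.
      */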
1196
1197static void final_note(u32 *buf)
1198{
1199        struct elf_note note;
1200
1201        note.n_namesz = 0;
1202        note.n_descsz = 0;
1203        note.n_type   = 0;
1204        memcpy(buf, &note, sizeof(note));
1205}
1206
1207void crash_save_cpu(struct pt_regs *regs, int cpu)
1208{
1209        struct elf_prstatus prstatus;
1210        u32 *buf;
1211
1212        if ((cpu < 0) || (cpu >= nr_cpu_ids))
1213                return;
1214
1215        /* Using ELF notes here is opportunistic.
1216         * I need a well defined structure format
1217         * for the data I pass, and I need tags
1218         * on the data to indicate what information I have
1219         * squirrelled away.  ELF notes happen to provide
1220         * all of that, so there is no need to invent something new.
1221         */
1222        buf = (u32*)per_cpu_ptr(crash_notes, cpu);
1223        if (!buf)
1224                return;
1225        memset(&prstatus, 0, sizeof(prstatus));
1226        prstatus.pr_pid = current->pid;
1227        elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
1228        buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
1229                              &prstatus, sizeof(prstatus));
1230        final_note(buf);
1231}
1232
1233static int __init crash_notes_memory_init(void)
1234{
1235        /* Allocate memory for saving cpu registers. */
1236        crash_notes = alloc_percpu(note_buf_t);
1237        if (!crash_notes) {
1238                printk("Kexec: Memory allocation for saving cpu register"
1239                " states failed\n");
1240                return -ENOMEM;
1241        }
1242        return 0;
1243}
1244module_init(crash_notes_memory_init)
1245
1246
1247/*
 1248 * Parsing of the "crashkernel" command line
 1249 *
 1250 * This code is intended to be called from architecture-specific code.
1251 */
1252
1253
1254/*
1255 * This function parses command lines in the format
1256 *
1257 *   crashkernel=ramsize-range:size[,...][@offset]
1258 *
1259 * The function returns 0 on success and -EINVAL on failure.
1260 */
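     /*
      * Example (illustrative):
      *
      *   crashkernel=512M-2G:64M,2G-:128M@16M
      *
      * reserves 64M on a machine with 1G of RAM and 128M on a machine with
      * 4G; in both cases the reservation is placed at physical address 16M
      * because of the trailing @offset.
      */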
1261static int __init parse_crashkernel_mem(char                    *cmdline,
1262                                        unsigned long long      system_ram,
1263                                        unsigned long long      *crash_size,
1264                                        unsigned long long      *crash_base)
1265{
1266        char *cur = cmdline, *tmp;
1267
1268        /* for each entry of the comma-separated list */
1269        do {
1270                unsigned long long start, end = ULLONG_MAX, size;
1271
1272                /* get the start of the range */
1273                start = memparse(cur, &tmp);
1274                if (cur == tmp) {
1275                        pr_warning("crashkernel: Memory value expected\n");
1276                        return -EINVAL;
1277                }
1278                cur = tmp;
1279                if (*cur != '-') {
1280                        pr_warning("crashkernel: '-' expected\n");
1281                        return -EINVAL;
1282                }
1283                cur++;
1284
 1285                /* if no ':' is here, then we read the end */
1286                if (*cur != ':') {
1287                        end = memparse(cur, &tmp);
1288                        if (cur == tmp) {
1289                                pr_warning("crashkernel: Memory "
1290                                                "value expected\n");
1291                                return -EINVAL;
1292                        }
1293                        cur = tmp;
1294                        if (end <= start) {
1295                                pr_warning("crashkernel: end <= start\n");
1296                                return -EINVAL;
1297                        }
1298                }
1299
1300                if (*cur != ':') {
1301                        pr_warning("crashkernel: ':' expected\n");
1302                        return -EINVAL;
1303                }
1304                cur++;
1305
1306                size = memparse(cur, &tmp);
1307                if (cur == tmp) {
1308                        pr_warning("Memory value expected\n");
1309                        return -EINVAL;
1310                }
1311                cur = tmp;
1312                if (size >= system_ram) {
1313                        pr_warning("crashkernel: invalid size\n");
1314                        return -EINVAL;
1315                }
1316
1317                /* match ? */
1318                if (system_ram >= start && system_ram < end) {
1319                        *crash_size = size;
1320                        break;
1321                }
1322        } while (*cur++ == ',');
1323
1324        if (*crash_size > 0) {
1325                while (*cur && *cur != ' ' && *cur != '@')
1326                        cur++;
1327                if (*cur == '@') {
1328                        cur++;
1329                        *crash_base = memparse(cur, &tmp);
1330                        if (cur == tmp) {
1331                                pr_warning("Memory value expected "
1332                                                "after '@'\n");
1333                                return -EINVAL;
1334                        }
1335                }
1336        }
1337
1338        return 0;
1339}
1340
1341/*
 1342 * This function parses "simple" (old) crashkernel command lines like
1343 *
1344 *      crashkernel=size[@offset]
1345 *
1346 * It returns 0 on success and -EINVAL on failure.
1347 */
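     /*
      * Example (illustrative): "crashkernel=64M@16M" asks for a 64M
      * reservation starting at physical address 16M; without the @offset
      * part the architecture code picks the base address itself.
      */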
1348static int __init parse_crashkernel_simple(char                 *cmdline,
1349                                           unsigned long long   *crash_size,
1350                                           unsigned long long   *crash_base)
1351{
1352        char *cur = cmdline;
1353
1354        *crash_size = memparse(cmdline, &cur);
1355        if (cmdline == cur) {
1356                pr_warning("crashkernel: memory value expected\n");
1357                return -EINVAL;
1358        }
1359
1360        if (*cur == '@')
1361                *crash_base = memparse(cur+1, &cur);
1362
1363        return 0;
1364}
1365
1366/*
 1367 * This function is the entry point for command line parsing and should be
1368 * called from the arch-specific code.
1369 */
1370int __init parse_crashkernel(char                *cmdline,
1371                             unsigned long long system_ram,
1372                             unsigned long long *crash_size,
1373                             unsigned long long *crash_base)
1374{
1375        char    *p = cmdline, *ck_cmdline = NULL;
1376        char    *first_colon, *first_space;
1377
1378        BUG_ON(!crash_size || !crash_base);
1379        *crash_size = 0;
1380        *crash_base = 0;
1381
1382        /* find crashkernel and use the last one if there are more */
1383        p = strstr(p, "crashkernel=");
1384        while (p) {
1385                ck_cmdline = p;
1386                p = strstr(p+1, "crashkernel=");
1387        }
1388
1389        if (!ck_cmdline)
1390                return -EINVAL;
1391
1392        ck_cmdline += 12; /* strlen("crashkernel=") */
1393
1394        /*
1395         * if the commandline contains a ':', then that's the extended
1396         * syntax -- if not, it must be the classic syntax
1397         */
1398        first_colon = strchr(ck_cmdline, ':');
1399        first_space = strchr(ck_cmdline, ' ');
1400        if (first_colon && (!first_space || first_colon < first_space))
1401                return parse_crashkernel_mem(ck_cmdline, system_ram,
1402                                crash_size, crash_base);
1403        else
1404                return parse_crashkernel_simple(ck_cmdline, crash_size,
1405                                crash_base);
1406
1407        return 0;
1408}
1409
1410
1411static void update_vmcoreinfo_note(void)
1412{
1413        u32 *buf = vmcoreinfo_note;
1414
1415        if (!vmcoreinfo_size)
1416                return;
1417        buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
1418                              vmcoreinfo_size);
1419        final_note(buf);
1420}
1421
1422void crash_save_vmcoreinfo(void)
1423{
1424        vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds());
1425        update_vmcoreinfo_note();
1426}
1427
1428void vmcoreinfo_append_str(const char *fmt, ...)
1429{
1430        va_list args;
1431        char buf[0x50];
1432        int r;
1433
1434        va_start(args, fmt);
1435        r = vsnprintf(buf, sizeof(buf), fmt, args);
1436        va_end(args);
1437
1438        if (r + vmcoreinfo_size > vmcoreinfo_max_size)
1439                r = vmcoreinfo_max_size - vmcoreinfo_size;
1440
1441        memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
1442
1443        vmcoreinfo_size += r;
1444}
1445
1446/*
 1447 * Provide an empty default implementation here -- architecture
1448 * code may override this
1449 */
1450void __attribute__ ((weak)) arch_crash_save_vmcoreinfo(void)
1451{}
1452
1453unsigned long __attribute__ ((weak)) paddr_vmcoreinfo_note(void)
1454{
1455        return __pa((unsigned long)(char *)&vmcoreinfo_note);
1456}
1457
1458static int __init crash_save_vmcoreinfo_init(void)
1459{
1460        VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
1461        VMCOREINFO_PAGESIZE(PAGE_SIZE);
1462
1463        VMCOREINFO_SYMBOL(init_uts_ns);
1464        VMCOREINFO_SYMBOL(node_online_map);
1465        VMCOREINFO_SYMBOL(swapper_pg_dir);
1466        VMCOREINFO_SYMBOL(_stext);
1467        VMCOREINFO_SYMBOL(vmlist);
1468
1469#ifndef CONFIG_NEED_MULTIPLE_NODES
1470        VMCOREINFO_SYMBOL(mem_map);
1471        VMCOREINFO_SYMBOL(contig_page_data);
1472#endif
1473#ifdef CONFIG_SPARSEMEM
1474        VMCOREINFO_SYMBOL(mem_section);
1475        VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
1476        VMCOREINFO_STRUCT_SIZE(mem_section);
1477        VMCOREINFO_OFFSET(mem_section, section_mem_map);
1478#endif
1479        VMCOREINFO_STRUCT_SIZE(page);
1480        VMCOREINFO_STRUCT_SIZE(pglist_data);
1481        VMCOREINFO_STRUCT_SIZE(zone);
1482        VMCOREINFO_STRUCT_SIZE(free_area);
1483        VMCOREINFO_STRUCT_SIZE(list_head);
1484        VMCOREINFO_SIZE(nodemask_t);
1485        VMCOREINFO_OFFSET(page, flags);
1486        VMCOREINFO_OFFSET(page, _count);
1487        VMCOREINFO_OFFSET(page, mapping);
1488        VMCOREINFO_OFFSET(page, lru);
1489        VMCOREINFO_OFFSET(pglist_data, node_zones);
1490        VMCOREINFO_OFFSET(pglist_data, nr_zones);
1491#ifdef CONFIG_FLAT_NODE_MEM_MAP
1492        VMCOREINFO_OFFSET(pglist_data, node_mem_map);
1493#endif
1494        VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
1495        VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
1496        VMCOREINFO_OFFSET(pglist_data, node_id);
1497        VMCOREINFO_OFFSET(zone, free_area);
1498        VMCOREINFO_OFFSET(zone, vm_stat);
1499        VMCOREINFO_OFFSET(zone, spanned_pages);
1500        VMCOREINFO_OFFSET(free_area, free_list);
1501        VMCOREINFO_OFFSET(list_head, next);
1502        VMCOREINFO_OFFSET(list_head, prev);
1503        VMCOREINFO_OFFSET(vm_struct, addr);
1504        VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
1505        log_buf_kexec_setup();
1506        VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
1507        VMCOREINFO_NUMBER(NR_FREE_PAGES);
1508        VMCOREINFO_NUMBER(PG_lru);
1509        VMCOREINFO_NUMBER(PG_private);
1510        VMCOREINFO_NUMBER(PG_swapcache);
1511
1512        arch_crash_save_vmcoreinfo();
1513        update_vmcoreinfo_note();
1514
1515        return 0;
1516}
1517
1518module_init(crash_save_vmcoreinfo_init)
1519
1520/*
1521 * Move into place and start executing a preloaded standalone
1522 * executable.  If nothing was preloaded return an error.
1523 */
1524int kernel_kexec(void)
1525{
1526        int error = 0;
1527
1528        if (!mutex_trylock(&kexec_mutex))
1529                return -EBUSY;
1530        if (!kexec_image) {
1531                error = -EINVAL;
1532                goto Unlock;
1533        }
1534
1535#ifdef CONFIG_KEXEC_JUMP
1536        if (kexec_image->preserve_context) {
1537                lock_system_sleep();
1538                pm_prepare_console();
1539                error = freeze_processes();
1540                if (error) {
1541                        error = -EBUSY;
1542                        goto Restore_console;
1543                }
1544                suspend_console();
1545                error = dpm_suspend_start(PMSG_FREEZE);
1546                if (error)
1547                        goto Resume_console;
1548                /* At this point, dpm_suspend_start() has been called,
1549                 * but *not* dpm_suspend_noirq(). We *must* call
1550                 * dpm_suspend_noirq() now.  Otherwise, drivers for
1551                 * some devices (e.g. interrupt controllers) become
1552                 * desynchronized with the actual state of the
1553                 * hardware at resume time, and evil weirdness ensues.
1554                 */
1555                error = dpm_suspend_noirq(PMSG_FREEZE);
1556                if (error)
1557                        goto Resume_devices;
1558                error = disable_nonboot_cpus();
1559                if (error)
1560                        goto Enable_cpus;
1561                local_irq_disable();
1562                error = syscore_suspend();
1563                if (error)
1564                        goto Enable_irqs;
1565        } else
1566#endif
1567        {
1568                kernel_restart_prepare(NULL);
1569                printk(KERN_EMERG "Starting new kernel\n");
1570                machine_shutdown();
1571        }
1572
1573        machine_kexec(kexec_image);
1574
1575#ifdef CONFIG_KEXEC_JUMP
1576        if (kexec_image->preserve_context) {
1577                syscore_resume();
1578 Enable_irqs:
1579                local_irq_enable();
1580 Enable_cpus:
1581                enable_nonboot_cpus();
1582                dpm_resume_noirq(PMSG_RESTORE);
1583 Resume_devices:
1584                dpm_resume_end(PMSG_RESTORE);
1585 Resume_console:
1586                resume_console();
1587                thaw_processes();
1588 Restore_console:
1589                pm_restore_console();
1590                unlock_system_sleep();
1591        }
1592#endif
1593
1594 Unlock:
1595        mutex_unlock(&kexec_mutex);
1596        return error;
1597}
1598