linux/kernel/kexec.c
<<
>>
Prefs
   1/*
   2 * kexec.c - kexec system call
   3 * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
   4 *
   5 * This source code is licensed under the GNU General Public License,
   6 * Version 2.  See the file COPYING for more details.
   7 */
   8
   9#include <linux/capability.h>
  10#include <linux/mm.h>
  11#include <linux/file.h>
  12#include <linux/slab.h>
  13#include <linux/fs.h>
  14#include <linux/kexec.h>
  15#include <linux/spinlock.h>
  16#include <linux/list.h>
  17#include <linux/highmem.h>
  18#include <linux/syscalls.h>
  19#include <linux/reboot.h>
  20#include <linux/ioport.h>
  21#include <linux/hardirq.h>
  22#include <linux/elf.h>
  23#include <linux/elfcore.h>
  24#include <linux/utsrelease.h>
  25#include <linux/utsname.h>
  26#include <linux/numa.h>
  27
  28#include <asm/page.h>
  29#include <asm/uaccess.h>
  30#include <asm/io.h>
  31#include <asm/system.h>
  32#include <asm/sections.h>
  33
  34/* Per cpu memory for storing cpu states in case of system crash. */
  35note_buf_t* crash_notes;
  36
  37/* vmcoreinfo stuff */
  38unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
  39u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
  40size_t vmcoreinfo_size;
  41size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
  42
  43/* Location of the reserved area for the crash kernel */
  44struct resource crashk_res = {
  45        .name  = "Crash kernel",
  46        .start = 0,
  47        .end   = 0,
  48        .flags = IORESOURCE_BUSY | IORESOURCE_MEM
  49};
  50
  51int kexec_should_crash(struct task_struct *p)
  52{
  53        if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
  54                return 1;
  55        return 0;
  56}
  57
  58/*
  59 * When kexec transitions to the new kernel there is a one-to-one
  60 * mapping between physical and virtual addresses.  On processors
  61 * where you can disable the MMU this is trivial, and easy.  For
  62 * others it is still a simple predictable page table to setup.
  63 *
  64 * In that environment kexec copies the new kernel to its final
  65 * resting place.  This means I can only support memory whose
  66 * physical address can fit in an unsigned long.  In particular
  67 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
  68 * If the assembly stub has more restrictive requirements
  69 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
  70 * defined more restrictively in <asm/kexec.h>.
  71 *
  72 * The code for the transition from the current kernel to the
  73 * new kernel is placed in the control_code_buffer, whose size
  74 * is given by KEXEC_CONTROL_CODE_SIZE.  In the best case only a single
  75 * page of memory is necessary, but some architectures require more.
  76 * Because this memory must be identity mapped in the transition from
  77 * virtual to physical addresses it must live in the range
  78 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
  79 * modifiable.
  80 *
  81 * The assembly stub in the control code buffer is passed a linked list
  82 * of descriptor pages detailing the source pages of the new kernel,
  83 * and the destination addresses of those source pages.  As this data
  84 * structure is not used in the context of the current OS, it must
  85 * be self-contained.
  86 *
  87 * The code has been made to work with highmem pages and will use a
  88 * destination page in its final resting place (if it happens
  89 * to allocate it).  The end product of this is that most of the
  90 * physical address space, and most of RAM can be used.
  91 *
  92 * Future directions include:
  93 *  - allocating a page table with the control code buffer identity
  94 *    mapped, to simplify machine_kexec and make kexec_on_panic more
  95 *    reliable.
  96 */
  97
  98/*
  99 * KIMAGE_NO_DEST is an impossible destination address..., for
 100 * allocating pages whose destination address we do not care about.
 101 */
 102#define KIMAGE_NO_DEST (-1UL)
 103
 104static int kimage_is_destination_range(struct kimage *image,
 105                                       unsigned long start, unsigned long end);
 106static struct page *kimage_alloc_page(struct kimage *image,
 107                                       gfp_t gfp_mask,
 108                                       unsigned long dest);
 109
 110static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
 111                            unsigned long nr_segments,
 112                            struct kexec_segment __user *segments)
 113{
 114        size_t segment_bytes;
 115        struct kimage *image;
 116        unsigned long i;
 117        int result;
 118
 119        /* Allocate a controlling structure */
 120        result = -ENOMEM;
 121        image = kzalloc(sizeof(*image), GFP_KERNEL);
 122        if (!image)
 123                goto out;
 124
 125        image->head = 0;
 126        image->entry = &image->head;
 127        image->last_entry = &image->head;
 128        image->control_page = ~0; /* By default this does not apply */
 129        image->start = entry;
 130        image->type = KEXEC_TYPE_DEFAULT;
 131
 132        /* Initialize the list of control pages */
 133        INIT_LIST_HEAD(&image->control_pages);
 134
 135        /* Initialize the list of destination pages */
 136        INIT_LIST_HEAD(&image->dest_pages);
 137
 138        /* Initialize the list of unuseable pages */
 139        INIT_LIST_HEAD(&image->unuseable_pages);
 140
 141        /* Read in the segments */
 142        image->nr_segments = nr_segments;
 143        segment_bytes = nr_segments * sizeof(*segments);
 144        result = copy_from_user(image->segment, segments, segment_bytes);
 145        if (result)
 146                goto out;
 147
 148        /*
 149         * Verify we have good destination addresses.  The caller is
 150         * responsible for making certain we don't attempt to load
 151         * the new image into invalid or reserved areas of RAM.  This
 152         * just verifies it is an address we can use.
 153         *
 154         * Since the kernel does everything in page size chunks ensure
 155         * the destination addreses are page aligned.  Too many
 156         * special cases crop of when we don't do this.  The most
 157         * insidious is getting overlapping destination addresses
 158         * simply because addresses are changed to page size
 159         * granularity.
 160         */
 161        result = -EADDRNOTAVAIL;
 162        for (i = 0; i < nr_segments; i++) {
 163                unsigned long mstart, mend;
 164
 165                mstart = image->segment[i].mem;
 166                mend   = mstart + image->segment[i].memsz;
 167                if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
 168                        goto out;
 169                if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
 170                        goto out;
 171        }
 172
 173        /* Verify our destination addresses do not overlap.
 174         * If we alloed overlapping destination addresses
 175         * through very weird things can happen with no
 176         * easy explanation as one segment stops on another.
 177         */
 178        result = -EINVAL;
 179        for (i = 0; i < nr_segments; i++) {
 180                unsigned long mstart, mend;
 181                unsigned long j;
 182
 183                mstart = image->segment[i].mem;
 184                mend   = mstart + image->segment[i].memsz;
 185                for (j = 0; j < i; j++) {
 186                        unsigned long pstart, pend;
 187                        pstart = image->segment[j].mem;
 188                        pend   = pstart + image->segment[j].memsz;
 189                        /* Do the segments overlap ? */
 190                        if ((mend > pstart) && (mstart < pend))
 191                                goto out;
 192                }
 193        }
 194
 195        /* Ensure our buffer sizes are strictly less than
 196         * our memory sizes.  This should always be the case,
 197         * and it is easier to check up front than to be surprised
 198         * later on.
 199         */
 200        result = -EINVAL;
 201        for (i = 0; i < nr_segments; i++) {
 202                if (image->segment[i].bufsz > image->segment[i].memsz)
 203                        goto out;
 204        }
 205
 206        result = 0;
 207out:
 208        if (result == 0)
 209                *rimage = image;
 210        else
 211                kfree(image);
 212
 213        return result;
 214
 215}
 216
 217static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
 218                                unsigned long nr_segments,
 219                                struct kexec_segment __user *segments)
 220{
 221        int result;
 222        struct kimage *image;
 223
 224        /* Allocate and initialize a controlling structure */
 225        image = NULL;
 226        result = do_kimage_alloc(&image, entry, nr_segments, segments);
 227        if (result)
 228                goto out;
 229
 230        *rimage = image;
 231
 232        /*
 233         * Find a location for the control code buffer, and add it
 234         * the vector of segments so that it's pages will also be
 235         * counted as destination pages.
 236         */
 237        result = -ENOMEM;
 238        image->control_code_page = kimage_alloc_control_pages(image,
 239                                           get_order(KEXEC_CONTROL_CODE_SIZE));
 240        if (!image->control_code_page) {
 241                printk(KERN_ERR "Could not allocate control_code_buffer\n");
 242                goto out;
 243        }
 244
 245        result = 0;
 246 out:
 247        if (result == 0)
 248                *rimage = image;
 249        else
 250                kfree(image);
 251
 252        return result;
 253}
 254
 255static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
 256                                unsigned long nr_segments,
 257                                struct kexec_segment __user *segments)
 258{
 259        int result;
 260        struct kimage *image;
 261        unsigned long i;
 262
 263        image = NULL;
 264        /* Verify we have a valid entry point */
 265        if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
 266                result = -EADDRNOTAVAIL;
 267                goto out;
 268        }
 269
 270        /* Allocate and initialize a controlling structure */
 271        result = do_kimage_alloc(&image, entry, nr_segments, segments);
 272        if (result)
 273                goto out;
 274
 275        /* Enable the special crash kernel control page
 276         * allocation policy.
 277         */
 278        image->control_page = crashk_res.start;
 279        image->type = KEXEC_TYPE_CRASH;
 280
 281        /*
 282         * Verify we have good destination addresses.  Normally
 283         * the caller is responsible for making certain we don't
 284         * attempt to load the new image into invalid or reserved
 285         * areas of RAM.  But crash kernels are preloaded into a
 286         * reserved area of ram.  We must ensure the addresses
 287         * are in the reserved area otherwise preloading the
 288         * kernel could corrupt things.
 289         */
 290        result = -EADDRNOTAVAIL;
 291        for (i = 0; i < nr_segments; i++) {
 292                unsigned long mstart, mend;
 293
 294                mstart = image->segment[i].mem;
 295                mend = mstart + image->segment[i].memsz - 1;
 296                /* Ensure we are within the crash kernel limits */
 297                if ((mstart < crashk_res.start) || (mend > crashk_res.end))
 298                        goto out;
 299        }
 300
 301        /*
 302         * Find a location for the control code buffer, and add
 303         * the vector of segments so that it's pages will also be
 304         * counted as destination pages.
 305         */
 306        result = -ENOMEM;
 307        image->control_code_page = kimage_alloc_control_pages(image,
 308                                           get_order(KEXEC_CONTROL_CODE_SIZE));
 309        if (!image->control_code_page) {
 310                printk(KERN_ERR "Could not allocate control_code_buffer\n");
 311                goto out;
 312        }
 313
 314        result = 0;
 315out:
 316        if (result == 0)
 317                *rimage = image;
 318        else
 319                kfree(image);
 320
 321        return result;
 322}
 323
 324static int kimage_is_destination_range(struct kimage *image,
 325                                        unsigned long start,
 326                                        unsigned long end)
 327{
 328        unsigned long i;
 329
 330        for (i = 0; i < image->nr_segments; i++) {
 331                unsigned long mstart, mend;
 332
 333                mstart = image->segment[i].mem;
 334                mend = mstart + image->segment[i].memsz;
 335                if ((end > mstart) && (start < mend))
 336                        return 1;
 337        }
 338
 339        return 0;
 340}
 341
 342static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
 343{
 344        struct page *pages;
 345
 346        pages = alloc_pages(gfp_mask, order);
 347        if (pages) {
 348                unsigned int count, i;
 349                pages->mapping = NULL;
 350                set_page_private(pages, order);
 351                count = 1 << order;
 352                for (i = 0; i < count; i++)
 353                        SetPageReserved(pages + i);
 354        }
 355
 356        return pages;
 357}
 358
 359static void kimage_free_pages(struct page *page)
 360{
 361        unsigned int order, count, i;
 362
 363        order = page_private(page);
 364        count = 1 << order;
 365        for (i = 0; i < count; i++)
 366                ClearPageReserved(page + i);
 367        __free_pages(page, order);
 368}
 369
 370static void kimage_free_page_list(struct list_head *list)
 371{
 372        struct list_head *pos, *next;
 373
 374        list_for_each_safe(pos, next, list) {
 375                struct page *page;
 376
 377                page = list_entry(pos, struct page, lru);
 378                list_del(&page->lru);
 379                kimage_free_pages(page);
 380        }
 381}
 382
 383static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
 384                                                        unsigned int order)
 385{
 386        /* Control pages are special, they are the intermediaries
 387         * that are needed while we copy the rest of the pages
 388         * to their final resting place.  As such they must
 389         * not conflict with either the destination addresses
 390         * or memory the kernel is already using.
 391         *
 392         * The only case where we really need more than one of
 393         * these are for architectures where we cannot disable
 394         * the MMU and must instead generate an identity mapped
 395         * page table for all of the memory.
 396         *
 397         * At worst this runs in O(N) of the image size.
 398         */
 399        struct list_head extra_pages;
 400        struct page *pages;
 401        unsigned int count;
 402
 403        count = 1 << order;
 404        INIT_LIST_HEAD(&extra_pages);
 405
 406        /* Loop while I can allocate a page and the page allocated
 407         * is a destination page.
 408         */
 409        do {
 410                unsigned long pfn, epfn, addr, eaddr;
 411
 412                pages = kimage_alloc_pages(GFP_KERNEL, order);
 413                if (!pages)
 414                        break;
 415                pfn   = page_to_pfn(pages);
 416                epfn  = pfn + count;
 417                addr  = pfn << PAGE_SHIFT;
 418                eaddr = epfn << PAGE_SHIFT;
 419                if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
 420                              kimage_is_destination_range(image, addr, eaddr)) {
 421                        list_add(&pages->lru, &extra_pages);
 422                        pages = NULL;
 423                }
 424        } while (!pages);
 425
 426        if (pages) {
 427                /* Remember the allocated page... */
 428                list_add(&pages->lru, &image->control_pages);
 429
 430                /* Because the page is already in it's destination
 431                 * location we will never allocate another page at
 432                 * that address.  Therefore kimage_alloc_pages
 433                 * will not return it (again) and we don't need
 434                 * to give it an entry in image->segment[].
 435                 */
 436        }
 437        /* Deal with the destination pages I have inadvertently allocated.
 438         *
 439         * Ideally I would convert multi-page allocations into single
 440         * page allocations, and add everyting to image->dest_pages.
 441         *
 442         * For now it is simpler to just free the pages.
 443         */
 444        kimage_free_page_list(&extra_pages);
 445
 446        return pages;
 447}
 448
 449static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
 450                                                      unsigned int order)
 451{
 452        /* Control pages are special, they are the intermediaries
 453         * that are needed while we copy the rest of the pages
 454         * to their final resting place.  As such they must
 455         * not conflict with either the destination addresses
 456         * or memory the kernel is already using.
 457         *
 458         * Control pages are also the only pags we must allocate
 459         * when loading a crash kernel.  All of the other pages
 460         * are specified by the segments and we just memcpy
 461         * into them directly.
 462         *
 463         * The only case where we really need more than one of
 464         * these are for architectures where we cannot disable
 465         * the MMU and must instead generate an identity mapped
 466         * page table for all of the memory.
 467         *
 468         * Given the low demand this implements a very simple
 469         * allocator that finds the first hole of the appropriate
 470         * size in the reserved memory region, and allocates all
 471         * of the memory up to and including the hole.
 472         */
 473        unsigned long hole_start, hole_end, size;
 474        struct page *pages;
 475
 476        pages = NULL;
 477        size = (1 << order) << PAGE_SHIFT;
 478        hole_start = (image->control_page + (size - 1)) & ~(size - 1);
 479        hole_end   = hole_start + size - 1;
 480        while (hole_end <= crashk_res.end) {
 481                unsigned long i;
 482
 483                if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT)
 484                        break;
 485                if (hole_end > crashk_res.end)
 486                        break;
 487                /* See if I overlap any of the segments */
 488                for (i = 0; i < image->nr_segments; i++) {
 489                        unsigned long mstart, mend;
 490
 491                        mstart = image->segment[i].mem;
 492                        mend   = mstart + image->segment[i].memsz - 1;
 493                        if ((hole_end >= mstart) && (hole_start <= mend)) {
 494                                /* Advance the hole to the end of the segment */
 495                                hole_start = (mend + (size - 1)) & ~(size - 1);
 496                                hole_end   = hole_start + size - 1;
 497                                break;
 498                        }
 499                }
 500                /* If I don't overlap any segments I have found my hole! */
 501                if (i == image->nr_segments) {
 502                        pages = pfn_to_page(hole_start >> PAGE_SHIFT);
 503                        break;
 504                }
 505        }
 506        if (pages)
 507                image->control_page = hole_end;
 508
 509        return pages;
 510}
 511
 512
 513struct page *kimage_alloc_control_pages(struct kimage *image,
 514                                         unsigned int order)
 515{
 516        struct page *pages = NULL;
 517
 518        switch (image->type) {
 519        case KEXEC_TYPE_DEFAULT:
 520                pages = kimage_alloc_normal_control_pages(image, order);
 521                break;
 522        case KEXEC_TYPE_CRASH:
 523                pages = kimage_alloc_crash_control_pages(image, order);
 524                break;
 525        }
 526
 527        return pages;
 528}
 529
 530static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
 531{
 532        if (*image->entry != 0)
 533                image->entry++;
 534
 535        if (image->entry == image->last_entry) {
 536                kimage_entry_t *ind_page;
 537                struct page *page;
 538
 539                page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
 540                if (!page)
 541                        return -ENOMEM;
 542
 543                ind_page = page_address(page);
 544                *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
 545                image->entry = ind_page;
 546                image->last_entry = ind_page +
 547                                      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
 548        }
 549        *image->entry = entry;
 550        image->entry++;
 551        *image->entry = 0;
 552
 553        return 0;
 554}
 555
 556static int kimage_set_destination(struct kimage *image,
 557                                   unsigned long destination)
 558{
 559        int result;
 560
 561        destination &= PAGE_MASK;
 562        result = kimage_add_entry(image, destination | IND_DESTINATION);
 563        if (result == 0)
 564                image->destination = destination;
 565
 566        return result;
 567}
 568
 569
 570static int kimage_add_page(struct kimage *image, unsigned long page)
 571{
 572        int result;
 573
 574        page &= PAGE_MASK;
 575        result = kimage_add_entry(image, page | IND_SOURCE);
 576        if (result == 0)
 577                image->destination += PAGE_SIZE;
 578
 579        return result;
 580}
 581
 582
 583static void kimage_free_extra_pages(struct kimage *image)
 584{
 585        /* Walk through and free any extra destination pages I may have */
 586        kimage_free_page_list(&image->dest_pages);
 587
 588        /* Walk through and free any unuseable pages I have cached */
 589        kimage_free_page_list(&image->unuseable_pages);
 590
 591}
 592static int kimage_terminate(struct kimage *image)
 593{
 594        if (*image->entry != 0)
 595                image->entry++;
 596
 597        *image->entry = IND_DONE;
 598
 599        return 0;
 600}
 601
 602#define for_each_kimage_entry(image, ptr, entry) \
 603        for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
 604                ptr = (entry & IND_INDIRECTION)? \
 605                        phys_to_virt((entry & PAGE_MASK)): ptr +1)
 606
 607static void kimage_free_entry(kimage_entry_t entry)
 608{
 609        struct page *page;
 610
 611        page = pfn_to_page(entry >> PAGE_SHIFT);
 612        kimage_free_pages(page);
 613}
 614
 615static void kimage_free(struct kimage *image)
 616{
 617        kimage_entry_t *ptr, entry;
 618        kimage_entry_t ind = 0;
 619
 620        if (!image)
 621                return;
 622
 623        kimage_free_extra_pages(image);
 624        for_each_kimage_entry(image, ptr, entry) {
 625                if (entry & IND_INDIRECTION) {
 626                        /* Free the previous indirection page */
 627                        if (ind & IND_INDIRECTION)
 628                                kimage_free_entry(ind);
 629                        /* Save this indirection page until we are
 630                         * done with it.
 631                         */
 632                        ind = entry;
 633                }
 634                else if (entry & IND_SOURCE)
 635                        kimage_free_entry(entry);
 636        }
 637        /* Free the final indirection page */
 638        if (ind & IND_INDIRECTION)
 639                kimage_free_entry(ind);
 640
 641        /* Handle any machine specific cleanup */
 642        machine_kexec_cleanup(image);
 643
 644        /* Free the kexec control pages... */
 645        kimage_free_page_list(&image->control_pages);
 646        kfree(image);
 647}
 648
 649static kimage_entry_t *kimage_dst_used(struct kimage *image,
 650                                        unsigned long page)
 651{
 652        kimage_entry_t *ptr, entry;
 653        unsigned long destination = 0;
 654
 655        for_each_kimage_entry(image, ptr, entry) {
 656                if (entry & IND_DESTINATION)
 657                        destination = entry & PAGE_MASK;
 658                else if (entry & IND_SOURCE) {
 659                        if (page == destination)
 660                                return ptr;
 661                        destination += PAGE_SIZE;
 662                }
 663        }
 664
 665        return NULL;
 666}
 667
 668static struct page *kimage_alloc_page(struct kimage *image,
 669                                        gfp_t gfp_mask,
 670                                        unsigned long destination)
 671{
 672        /*
 673         * Here we implement safeguards to ensure that a source page
 674         * is not copied to its destination page before the data on
 675         * the destination page is no longer useful.
 676         *
 677         * To do this we maintain the invariant that a source page is
 678         * either its own destination page, or it is not a
 679         * destination page at all.
 680         *
 681         * That is slightly stronger than required, but the proof
 682         * that no problems will not occur is trivial, and the
 683         * implementation is simply to verify.
 684         *
 685         * When allocating all pages normally this algorithm will run
 686         * in O(N) time, but in the worst case it will run in O(N^2)
 687         * time.   If the runtime is a problem the data structures can
 688         * be fixed.
 689         */
 690        struct page *page;
 691        unsigned long addr;
 692
 693        /*
 694         * Walk through the list of destination pages, and see if I
 695         * have a match.
 696         */
 697        list_for_each_entry(page, &image->dest_pages, lru) {
 698                addr = page_to_pfn(page) << PAGE_SHIFT;
 699                if (addr == destination) {
 700                        list_del(&page->lru);
 701                        return page;
 702                }
 703        }
 704        page = NULL;
 705        while (1) {
 706                kimage_entry_t *old;
 707
 708                /* Allocate a page, if we run out of memory give up */
 709                page = kimage_alloc_pages(gfp_mask, 0);
 710                if (!page)
 711                        return NULL;
 712                /* If the page cannot be used file it away */
 713                if (page_to_pfn(page) >
 714                                (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
 715                        list_add(&page->lru, &image->unuseable_pages);
 716                        continue;
 717                }
 718                addr = page_to_pfn(page) << PAGE_SHIFT;
 719
 720                /* If it is the destination page we want use it */
 721                if (addr == destination)
 722                        break;
 723
 724                /* If the page is not a destination page use it */
 725                if (!kimage_is_destination_range(image, addr,
 726                                                  addr + PAGE_SIZE))
 727                        break;
 728
 729                /*
 730                 * I know that the page is someones destination page.
 731                 * See if there is already a source page for this
 732                 * destination page.  And if so swap the source pages.
 733                 */
 734                old = kimage_dst_used(image, addr);
 735                if (old) {
 736                        /* If so move it */
 737                        unsigned long old_addr;
 738                        struct page *old_page;
 739
 740                        old_addr = *old & PAGE_MASK;
 741                        old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
 742                        copy_highpage(page, old_page);
 743                        *old = addr | (*old & ~PAGE_MASK);
 744
 745                        /* The old page I have found cannot be a
 746                         * destination page, so return it.
 747                         */
 748                        addr = old_addr;
 749                        page = old_page;
 750                        break;
 751                }
 752                else {
 753                        /* Place the page on the destination list I
 754                         * will use it later.
 755                         */
 756                        list_add(&page->lru, &image->dest_pages);
 757                }
 758        }
 759
 760        return page;
 761}
 762
 763static int kimage_load_normal_segment(struct kimage *image,
 764                                         struct kexec_segment *segment)
 765{
 766        unsigned long maddr;
 767        unsigned long ubytes, mbytes;
 768        int result;
 769        unsigned char __user *buf;
 770
 771        result = 0;
 772        buf = segment->buf;
 773        ubytes = segment->bufsz;
 774        mbytes = segment->memsz;
 775        maddr = segment->mem;
 776
 777        result = kimage_set_destination(image, maddr);
 778        if (result < 0)
 779                goto out;
 780
 781        while (mbytes) {
 782                struct page *page;
 783                char *ptr;
 784                size_t uchunk, mchunk;
 785
 786                page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
 787                if (!page) {
 788                        result  = -ENOMEM;
 789                        goto out;
 790                }
 791                result = kimage_add_page(image, page_to_pfn(page)
 792                                                                << PAGE_SHIFT);
 793                if (result < 0)
 794                        goto out;
 795
 796                ptr = kmap(page);
 797                /* Start with a clear page */
 798                memset(ptr, 0, PAGE_SIZE);
 799                ptr += maddr & ~PAGE_MASK;
 800                mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
 801                if (mchunk > mbytes)
 802                        mchunk = mbytes;
 803
 804                uchunk = mchunk;
 805                if (uchunk > ubytes)
 806                        uchunk = ubytes;
 807
 808                result = copy_from_user(ptr, buf, uchunk);
 809                kunmap(page);
 810                if (result) {
 811                        result = (result < 0) ? result : -EIO;
 812                        goto out;
 813                }
 814                ubytes -= uchunk;
 815                maddr  += mchunk;
 816                buf    += mchunk;
 817                mbytes -= mchunk;
 818        }
 819out:
 820        return result;
 821}
 822
 823static int kimage_load_crash_segment(struct kimage *image,
 824                                        struct kexec_segment *segment)
 825{
 826        /* For crash dumps kernels we simply copy the data from
 827         * user space to it's destination.
 828         * We do things a page at a time for the sake of kmap.
 829         */
 830        unsigned long maddr;
 831        unsigned long ubytes, mbytes;
 832        int result;
 833        unsigned char __user *buf;
 834
 835        result = 0;
 836        buf = segment->buf;
 837        ubytes = segment->bufsz;
 838        mbytes = segment->memsz;
 839        maddr = segment->mem;
 840        while (mbytes) {
 841                struct page *page;
 842                char *ptr;
 843                size_t uchunk, mchunk;
 844
 845                page = pfn_to_page(maddr >> PAGE_SHIFT);
 846                if (!page) {
 847                        result  = -ENOMEM;
 848                        goto out;
 849                }
 850                ptr = kmap(page);
 851                ptr += maddr & ~PAGE_MASK;
 852                mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
 853                if (mchunk > mbytes)
 854                        mchunk = mbytes;
 855
 856                uchunk = mchunk;
 857                if (uchunk > ubytes) {
 858                        uchunk = ubytes;
 859                        /* Zero the trailing part of the page */
 860                        memset(ptr + uchunk, 0, mchunk - uchunk);
 861                }
 862                result = copy_from_user(ptr, buf, uchunk);
 863                kexec_flush_icache_page(page);
 864                kunmap(page);
 865                if (result) {
 866                        result = (result < 0) ? result : -EIO;
 867                        goto out;
 868                }
 869                ubytes -= uchunk;
 870                maddr  += mchunk;
 871                buf    += mchunk;
 872                mbytes -= mchunk;
 873        }
 874out:
 875        return result;
 876}
 877
 878static int kimage_load_segment(struct kimage *image,
 879                                struct kexec_segment *segment)
 880{
 881        int result = -ENOMEM;
 882
 883        switch (image->type) {
 884        case KEXEC_TYPE_DEFAULT:
 885                result = kimage_load_normal_segment(image, segment);
 886                break;
 887        case KEXEC_TYPE_CRASH:
 888                result = kimage_load_crash_segment(image, segment);
 889                break;
 890        }
 891
 892        return result;
 893}
 894
 /*
  * Exec Kernel system call: for obvious reasons only root may call it.
  *
  * This call breaks up into three pieces.
  * - A generic part which loads the new kernel from the current
  *   address space, and very carefully places the data in the
  *   allocated pages.
  *
  * - A generic part that interacts with the kernel and tells all of
  *   the devices to shut down, preventing ongoing DMAs and placing
  *   the devices in a consistent state so a later kernel can
  *   reinitialize them.
  *
  * - A machine specific part that includes the syscall number,
  *   copies the image to its final destination, and jumps into
  *   the image at entry.
  *
  * kexec does not sync or unmount filesystems, so if you need
  * that to happen you need to do it yourself.
  */
 /* Image installed by sys_kexec_load() when KEXEC_ON_CRASH is not set. */
 struct kimage *kexec_image;
 /* Image installed with KEXEC_ON_CRASH; the one crash_kexec() boots. */
 struct kimage *kexec_crash_image;
 /*
  * Hand-rolled binary lock: 0 means free, 1 means held.
  * It is only ever taken with xchg() as a try-lock, so nobody sleeps or
  * spins on it, which is what makes it usable from interrupt context.
  */
 static int kexec_lock;
 923
 924asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
 925                                struct kexec_segment __user *segments,
 926                                unsigned long flags)
 927{
 928        struct kimage **dest_image, *image;
 929        int locked;
 930        int result;
 931
 932        /* We only trust the superuser with rebooting the system. */
 933        if (!capable(CAP_SYS_BOOT))
 934                return -EPERM;
 935
 936        /*
 937         * Verify we have a legal set of flags
 938         * This leaves us room for future extensions.
 939         */
 940        if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
 941                return -EINVAL;
 942
 943        /* Verify we are on the appropriate architecture */
 944        if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
 945                ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
 946                return -EINVAL;
 947
 948        /* Put an artificial cap on the number
 949         * of segments passed to kexec_load.
 950         */
 951        if (nr_segments > KEXEC_SEGMENT_MAX)
 952                return -EINVAL;
 953
 954        image = NULL;
 955        result = 0;
 956
 957        /* Because we write directly to the reserved memory
 958         * region when loading crash kernels we need a mutex here to
 959         * prevent multiple crash  kernels from attempting to load
 960         * simultaneously, and to prevent a crash kernel from loading
 961         * over the top of a in use crash kernel.
 962         *
 963         * KISS: always take the mutex.
 964         */
 965        locked = xchg(&kexec_lock, 1);
 966        if (locked)
 967                return -EBUSY;
 968
 969        dest_image = &kexec_image;
 970        if (flags & KEXEC_ON_CRASH)
 971                dest_image = &kexec_crash_image;
 972        if (nr_segments > 0) {
 973                unsigned long i;
 974
 975                /* Loading another kernel to reboot into */
 976                if ((flags & KEXEC_ON_CRASH) == 0)
 977                        result = kimage_normal_alloc(&image, entry,
 978                                                        nr_segments, segments);
 979                /* Loading another kernel to switch to if this one crashes */
 980                else if (flags & KEXEC_ON_CRASH) {
 981                        /* Free any current crash dump kernel before
 982                         * we corrupt it.
 983                         */
 984                        kimage_free(xchg(&kexec_crash_image, NULL));
 985                        result = kimage_crash_alloc(&image, entry,
 986                                                     nr_segments, segments);
 987                }
 988                if (result)
 989                        goto out;
 990
 991                result = machine_kexec_prepare(image);
 992                if (result)
 993                        goto out;
 994
 995                for (i = 0; i < nr_segments; i++) {
 996                        result = kimage_load_segment(image, &image->segment[i]);
 997                        if (result)
 998                                goto out;
 999                }
1000                result = kimage_terminate(image);
1001                if (result)
1002                        goto out;
1003        }
1004        /* Install the new kernel, and  Uninstall the old */
1005        image = xchg(dest_image, image);
1006
1007out:
1008        locked = xchg(&kexec_lock, 0); /* Release the mutex */
1009        BUG_ON(!locked);
1010        kimage_free(image);
1011
1012        return result;
1013}
1014
#ifdef CONFIG_COMPAT
/*
 * Compat entry point for kexec_load.
 *
 * Each compat segment descriptor is converted into a native
 * struct kexec_segment, placed in memory obtained from
 * compat_alloc_user_space(), and the native syscall is then invoked on
 * the converted array.
 *
 * Returns -EINVAL for callers that request KEXEC_ARCH_DEFAULT or pass too
 * many segments, -EFAULT when a descriptor cannot be copied, otherwise the
 * result of sys_kexec_load().
 */
asmlinkage long compat_sys_kexec_load(unsigned long entry,
                                unsigned long nr_segments,
                                struct compat_kexec_segment __user *segments,
                                unsigned long flags)
{
        struct compat_kexec_segment in;
        struct kexec_segment out, __user *ksegments;
        unsigned long idx;

        /*
         * Clients that do not name the native architecture explicitly
         * are not allowed to do anything.
         */
        if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
                return -EINVAL;

        if (nr_segments > KEXEC_SEGMENT_MAX)
                return -EINVAL;

        ksegments = compat_alloc_user_space(nr_segments * sizeof(out));

        for (idx = 0; idx < nr_segments; idx++) {
                if (copy_from_user(&in, &segments[idx], sizeof(in)))
                        return -EFAULT;

                out.buf   = compat_ptr(in.buf);
                out.bufsz = in.bufsz;
                out.mem   = in.mem;
                out.memsz = in.memsz;

                if (copy_to_user(&ksegments[idx], &out, sizeof(out)))
                        return -EFAULT;
        }

        return sys_kexec_load(entry, nr_segments, ksegments, flags);
}
#endif
1053
1054void crash_kexec(struct pt_regs *regs)
1055{
1056        int locked;
1057
1058
1059        /* Take the kexec_lock here to prevent sys_kexec_load
1060         * running on one cpu from replacing the crash kernel
1061         * we are using after a panic on a different cpu.
1062         *
1063         * If the crash kernel was not located in a fixed area
1064         * of memory the xchg(&kexec_crash_image) would be
1065         * sufficient.  But since I reuse the memory...
1066         */
1067        locked = xchg(&kexec_lock, 1);
1068        if (!locked) {
1069                if (kexec_crash_image) {
1070                        struct pt_regs fixed_regs;
1071                        crash_setup_regs(&fixed_regs, regs);
1072                        crash_save_vmcoreinfo();
1073                        machine_crash_shutdown(&fixed_regs);
1074                        machine_kexec(kexec_crash_image);
1075                }
1076                locked = xchg(&kexec_lock, 0);
1077                BUG_ON(!locked);
1078        }
1079}
1080
1081static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
1082                            size_t data_len)
1083{
1084        struct elf_note note;
1085
1086        note.n_namesz = strlen(name) + 1;
1087        note.n_descsz = data_len;
1088        note.n_type   = type;
1089        memcpy(buf, &note, sizeof(note));
1090        buf += (sizeof(note) + 3)/4;
1091        memcpy(buf, name, note.n_namesz);
1092        buf += (note.n_namesz + 3)/4;
1093        memcpy(buf, data, note.n_descsz);
1094        buf += (note.n_descsz + 3)/4;
1095
1096        return buf;
1097}
1098
1099static void final_note(u32 *buf)
1100{
1101        struct elf_note note;
1102
1103        note.n_namesz = 0;
1104        note.n_descsz = 0;
1105        note.n_type   = 0;
1106        memcpy(buf, &note, sizeof(note));
1107}
1108
1109void crash_save_cpu(struct pt_regs *regs, int cpu)
1110{
1111        struct elf_prstatus prstatus;
1112        u32 *buf;
1113
1114        if ((cpu < 0) || (cpu >= NR_CPUS))
1115                return;
1116
1117        /* Using ELF notes here is opportunistic.
1118         * I need a well defined structure format
1119         * for the data I pass, and I need tags
1120         * on the data to indicate what information I have
1121         * squirrelled away.  ELF notes happen to provide
1122         * all of that, so there is no need to invent something new.
1123         */
1124        buf = (u32*)per_cpu_ptr(crash_notes, cpu);
1125        if (!buf)
1126                return;
1127        memset(&prstatus, 0, sizeof(prstatus));
1128        prstatus.pr_pid = current->pid;
1129        elf_core_copy_regs(&prstatus.pr_reg, regs);
1130        buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
1131                              &prstatus, sizeof(prstatus));
1132        final_note(buf);
1133}
1134
1135static int __init crash_notes_memory_init(void)
1136{
1137        /* Allocate memory for saving cpu registers. */
1138        crash_notes = alloc_percpu(note_buf_t);
1139        if (!crash_notes) {
1140                printk("Kexec: Memory allocation for saving cpu register"
1141                " states failed\n");
1142                return -ENOMEM;
1143        }
1144        return 0;
1145}
1146module_init(crash_notes_memory_init)
1147
1148
1149/*
1150 * parsing the "crashkernel" commandline
1151 *
1152 * this code is intended to be called from architecture specific code
1153 */
1154
1155
1156/*
1157 * This function parses command lines in the format
1158 *
1159 *   crashkernel=ramsize-range:size[,...][@offset]
1160 *
1161 * The function returns 0 on success and -EINVAL on failure.
1162 */
1163static int __init parse_crashkernel_mem(char                    *cmdline,
1164                                        unsigned long long      system_ram,
1165                                        unsigned long long      *crash_size,
1166                                        unsigned long long      *crash_base)
1167{
1168        char *cur = cmdline, *tmp;
1169
1170        /* for each entry of the comma-separated list */
1171        do {
1172                unsigned long long start, end = ULLONG_MAX, size;
1173
1174                /* get the start of the range */
1175                start = memparse(cur, &tmp);
1176                if (cur == tmp) {
1177                        pr_warning("crashkernel: Memory value expected\n");
1178                        return -EINVAL;
1179                }
1180                cur = tmp;
1181                if (*cur != '-') {
1182                        pr_warning("crashkernel: '-' expected\n");
1183                        return -EINVAL;
1184                }
1185                cur++;
1186
1187                /* if no ':' is here, than we read the end */
1188                if (*cur != ':') {
1189                        end = memparse(cur, &tmp);
1190                        if (cur == tmp) {
1191                                pr_warning("crashkernel: Memory "
1192                                                "value expected\n");
1193                                return -EINVAL;
1194                        }
1195                        cur = tmp;
1196                        if (end <= start) {
1197                                pr_warning("crashkernel: end <= start\n");
1198                                return -EINVAL;
1199                        }
1200                }
1201
1202                if (*cur != ':') {
1203                        pr_warning("crashkernel: ':' expected\n");
1204                        return -EINVAL;
1205                }
1206                cur++;
1207
1208                size = memparse(cur, &tmp);
1209                if (cur == tmp) {
1210                        pr_warning("Memory value expected\n");
1211                        return -EINVAL;
1212                }
1213                cur = tmp;
1214                if (size >= system_ram) {
1215                        pr_warning("crashkernel: invalid size\n");
1216                        return -EINVAL;
1217                }
1218
1219                /* match ? */
1220                if (system_ram >= start && system_ram < end) {
1221                        *crash_size = size;
1222                        break;
1223                }
1224        } while (*cur++ == ',');
1225
1226        if (*crash_size > 0) {
1227                while (*cur != ' ' && *cur != '@')
1228                        cur++;
1229                if (*cur == '@') {
1230                        cur++;
1231                        *crash_base = memparse(cur, &tmp);
1232                        if (cur == tmp) {
1233                                pr_warning("Memory value expected "
1234                                                "after '@'\n");
1235                                return -EINVAL;
1236                        }
1237                }
1238        }
1239
1240        return 0;
1241}
1242
1243/*
1244 * That function parses "simple" (old) crashkernel command lines like
1245 *
1246 *      crashkernel=size[@offset]
1247 *
1248 * It returns 0 on success and -EINVAL on failure.
1249 */
1250static int __init parse_crashkernel_simple(char                 *cmdline,
1251                                           unsigned long long   *crash_size,
1252                                           unsigned long long   *crash_base)
1253{
1254        char *cur = cmdline;
1255
1256        *crash_size = memparse(cmdline, &cur);
1257        if (cmdline == cur) {
1258                pr_warning("crashkernel: memory value expected\n");
1259                return -EINVAL;
1260        }
1261
1262        if (*cur == '@')
1263                *crash_base = memparse(cur+1, &cur);
1264
1265        return 0;
1266}
1267
1268/*
1269 * That function is the entry point for command line parsing and should be
1270 * called from the arch-specific code.
1271 */
1272int __init parse_crashkernel(char                *cmdline,
1273                             unsigned long long system_ram,
1274                             unsigned long long *crash_size,
1275                             unsigned long long *crash_base)
1276{
1277        char    *p = cmdline, *ck_cmdline = NULL;
1278        char    *first_colon, *first_space;
1279
1280        BUG_ON(!crash_size || !crash_base);
1281        *crash_size = 0;
1282        *crash_base = 0;
1283
1284        /* find crashkernel and use the last one if there are more */
1285        p = strstr(p, "crashkernel=");
1286        while (p) {
1287                ck_cmdline = p;
1288                p = strstr(p+1, "crashkernel=");
1289        }
1290
1291        if (!ck_cmdline)
1292                return -EINVAL;
1293
1294        ck_cmdline += 12; /* strlen("crashkernel=") */
1295
1296        /*
1297         * if the commandline contains a ':', then that's the extended
1298         * syntax -- if not, it must be the classic syntax
1299         */
1300        first_colon = strchr(ck_cmdline, ':');
1301        first_space = strchr(ck_cmdline, ' ');
1302        if (first_colon && (!first_space || first_colon < first_space))
1303                return parse_crashkernel_mem(ck_cmdline, system_ram,
1304                                crash_size, crash_base);
1305        else
1306                return parse_crashkernel_simple(ck_cmdline, crash_size,
1307                                crash_base);
1308
1309        return 0;
1310}
1311
1312
1313
1314void crash_save_vmcoreinfo(void)
1315{
1316        u32 *buf;
1317
1318        if (!vmcoreinfo_size)
1319                return;
1320
1321        vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds());
1322
1323        buf = (u32 *)vmcoreinfo_note;
1324
1325        buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
1326                              vmcoreinfo_size);
1327
1328        final_note(buf);
1329}
1330
1331void vmcoreinfo_append_str(const char *fmt, ...)
1332{
1333        va_list args;
1334        char buf[0x50];
1335        int r;
1336
1337        va_start(args, fmt);
1338        r = vsnprintf(buf, sizeof(buf), fmt, args);
1339        va_end(args);
1340
1341        if (r + vmcoreinfo_size > vmcoreinfo_max_size)
1342                r = vmcoreinfo_max_size - vmcoreinfo_size;
1343
1344        memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
1345
1346        vmcoreinfo_size += r;
1347}
1348
/*
 * Default hook that does nothing.  It is a weak symbol, so architecture
 * code can supply its own definition to replace it.
 */
void __attribute__ ((weak)) arch_crash_save_vmcoreinfo(void)
{
}
1355
1356unsigned long __attribute__ ((weak)) paddr_vmcoreinfo_note(void)
1357{
1358        return __pa((unsigned long)(char *)&vmcoreinfo_note);
1359}
1360
1361static int __init crash_save_vmcoreinfo_init(void)
1362{
1363        VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
1364        VMCOREINFO_PAGESIZE(PAGE_SIZE);
1365
1366        VMCOREINFO_SYMBOL(init_uts_ns);
1367        VMCOREINFO_SYMBOL(node_online_map);
1368        VMCOREINFO_SYMBOL(swapper_pg_dir);
1369        VMCOREINFO_SYMBOL(_stext);
1370
1371#ifndef CONFIG_NEED_MULTIPLE_NODES
1372        VMCOREINFO_SYMBOL(mem_map);
1373        VMCOREINFO_SYMBOL(contig_page_data);
1374#endif
1375#ifdef CONFIG_SPARSEMEM
1376        VMCOREINFO_SYMBOL(mem_section);
1377        VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
1378        VMCOREINFO_STRUCT_SIZE(mem_section);
1379        VMCOREINFO_OFFSET(mem_section, section_mem_map);
1380#endif
1381        VMCOREINFO_STRUCT_SIZE(page);
1382        VMCOREINFO_STRUCT_SIZE(pglist_data);
1383        VMCOREINFO_STRUCT_SIZE(zone);
1384        VMCOREINFO_STRUCT_SIZE(free_area);
1385        VMCOREINFO_STRUCT_SIZE(list_head);
1386        VMCOREINFO_SIZE(nodemask_t);
1387        VMCOREINFO_OFFSET(page, flags);
1388        VMCOREINFO_OFFSET(page, _count);
1389        VMCOREINFO_OFFSET(page, mapping);
1390        VMCOREINFO_OFFSET(page, lru);
1391        VMCOREINFO_OFFSET(pglist_data, node_zones);
1392        VMCOREINFO_OFFSET(pglist_data, nr_zones);
1393#ifdef CONFIG_FLAT_NODE_MEM_MAP
1394        VMCOREINFO_OFFSET(pglist_data, node_mem_map);
1395#endif
1396        VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
1397        VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
1398        VMCOREINFO_OFFSET(pglist_data, node_id);
1399        VMCOREINFO_OFFSET(zone, free_area);
1400        VMCOREINFO_OFFSET(zone, vm_stat);
1401        VMCOREINFO_OFFSET(zone, spanned_pages);
1402        VMCOREINFO_OFFSET(free_area, free_list);
1403        VMCOREINFO_OFFSET(list_head, next);
1404        VMCOREINFO_OFFSET(list_head, prev);
1405        VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
1406        VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
1407        VMCOREINFO_NUMBER(NR_FREE_PAGES);
1408        VMCOREINFO_NUMBER(PG_lru);
1409        VMCOREINFO_NUMBER(PG_private);
1410        VMCOREINFO_NUMBER(PG_swapcache);
1411
1412        arch_crash_save_vmcoreinfo();
1413
1414        return 0;
1415}
1416
1417module_init(crash_save_vmcoreinfo_init)
1418
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.