linux-bk/arch/x86_64/mm/init.c
<<
>>
Prefs
   1/*
   2 *  linux/arch/x86_64/mm/init.c
   3 *
   4 *  Copyright (C) 1995  Linus Torvalds
   5 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
   6 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
   7 */
   8
   9#include <linux/config.h>
  10#include <linux/signal.h>
  11#include <linux/sched.h>
  12#include <linux/kernel.h>
  13#include <linux/errno.h>
  14#include <linux/string.h>
  15#include <linux/types.h>
  16#include <linux/ptrace.h>
  17#include <linux/mman.h>
  18#include <linux/mm.h>
  19#include <linux/swap.h>
  20#include <linux/smp.h>
  21#include <linux/init.h>
  22#include <linux/pagemap.h>
  23#include <linux/bootmem.h>
  24#include <linux/proc_fs.h>
  25
  26#include <asm/processor.h>
  27#include <asm/system.h>
  28#include <asm/uaccess.h>
  29#include <asm/pgtable.h>
  30#include <asm/pgalloc.h>
  31#include <asm/dma.h>
  32#include <asm/fixmap.h>
  33#include <asm/e820.h>
  34#include <asm/apic.h>
  35#include <asm/tlb.h>
  36#include <asm/mmu_context.h>
  37#include <asm/proto.h>
  38#include <asm/smp.h>
  39
  40#ifndef Dprintk
  41#define Dprintk(x...)
  42#endif
  43
  44extern char _stext[];
  45
  46DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
  47
  48/*
  49 * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
  50 * physical space so we can cache the place of the first one and move
  51 * around without checking the pgd every time.
  52 */
  53
  54void show_mem(void)
  55{
  56        int i, total = 0, reserved = 0;
  57        int shared = 0, cached = 0;
  58        pg_data_t *pgdat;
  59        struct page *page;
  60
  61        printk("Mem-info:\n");
  62        show_free_areas();
  63        printk("Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
  64
  65        for_each_pgdat(pgdat) {
  66               for (i = 0; i < pgdat->node_spanned_pages; ++i) {
  67                       page = pgdat->node_mem_map + i;
  68                total++;
  69                       if (PageReserved(page))
  70                        reserved++;
  71                       else if (PageSwapCache(page))
  72                        cached++;
  73                       else if (page_count(page))
  74                               shared += page_count(page) - 1;
  75               }
  76        }
  77        printk("%d pages of RAM\n", total);
  78        printk("%d reserved pages\n",reserved);
  79        printk("%d pages shared\n",shared);
  80        printk("%d pages swap cached\n",cached);
  81}
  82
  83/* References to section boundaries */
  84
  85extern char _text, _etext, _edata, __bss_start, _end[];
  86extern char __init_begin, __init_end;
  87
  88int after_bootmem;
  89
  90static void *spp_getpage(void)
  91{ 
  92        void *ptr;
  93        if (after_bootmem)
  94                ptr = (void *) get_zeroed_page(GFP_ATOMIC); 
  95        else
  96                ptr = alloc_bootmem_pages(PAGE_SIZE);
  97        if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
  98                panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
  99
 100        Dprintk("spp_getpage %p\n", ptr);
 101        return ptr;
 102} 
 103
 104static void set_pte_phys(unsigned long vaddr,
 105                         unsigned long phys, pgprot_t prot)
 106{
 107        pml4_t *level4;
 108        pgd_t *pgd;
 109        pmd_t *pmd;
 110        pte_t *pte, new_pte;
 111
 112        Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
 113
 114        level4 = pml4_offset_k(vaddr);
 115        if (pml4_none(*level4)) {
 116                printk("PML4 FIXMAP MISSING, it should be setup in head.S!\n");
 117                return;
 118        }
 119        pgd = level3_offset_k(level4, vaddr);
 120        if (pgd_none(*pgd)) {
 121                pmd = (pmd_t *) spp_getpage(); 
 122                set_pgd(pgd, __pgd(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
 123                if (pmd != pmd_offset(pgd, 0)) {
 124                        printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pgd,0));
 125                        return;
 126                }
 127        }
 128        pmd = pmd_offset(pgd, vaddr);
 129        if (pmd_none(*pmd)) {
 130                pte = (pte_t *) spp_getpage();
 131                set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
 132                if (pte != pte_offset_kernel(pmd, 0)) {
 133                        printk("PAGETABLE BUG #02!\n");
 134                        return;
 135                }
 136        }
 137        new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
 138
 139        pte = pte_offset_kernel(pmd, vaddr);
 140        if (!pte_none(*pte) &&
 141            pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
 142                pte_ERROR(*pte);
 143        set_pte(pte, new_pte);
 144
 145        /*
 146         * It's enough to flush this one mapping.
 147         * (PGE mappings get flushed as well)
 148         */
 149        __flush_tlb_one(vaddr);
 150}
 151
 152/* NOTE: this is meant to be run only at boot */
 153void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
 154{
 155        unsigned long address = __fix_to_virt(idx);
 156
 157        if (idx >= __end_of_fixed_addresses) {
 158                printk("Invalid __set_fixmap\n");
 159                return;
 160        }
 161        set_pte_phys(address, phys, prot);
 162}
 163
 164unsigned long __initdata table_start, table_end; 
 165
 166extern pmd_t temp_boot_pmds[]; 
 167
 168static  struct temp_map { 
 169        pmd_t *pmd;
 170        void  *address; 
 171        int    allocated; 
 172} temp_mappings[] __initdata = { 
 173        { &temp_boot_pmds[0], (void *)(40UL * 1024 * 1024) },
 174        { &temp_boot_pmds[1], (void *)(42UL * 1024 * 1024) }, 
 175        {}
 176}; 
 177
 178static __init void *alloc_low_page(int *index, unsigned long *phys) 
 179{ 
 180        struct temp_map *ti;
 181        int i; 
 182        unsigned long pfn = table_end++, paddr; 
 183        void *adr;
 184
 185        if (pfn >= end_pfn) 
 186                panic("alloc_low_page: ran out of memory"); 
 187        for (i = 0; temp_mappings[i].allocated; i++) {
 188                if (!temp_mappings[i].pmd) 
 189                        panic("alloc_low_page: ran out of temp mappings"); 
 190        } 
 191        ti = &temp_mappings[i];
 192        paddr = (pfn << PAGE_SHIFT) & PMD_MASK; 
 193        set_pmd(ti->pmd, __pmd(paddr | _KERNPG_TABLE | _PAGE_PSE)); 
 194        ti->allocated = 1; 
 195        __flush_tlb();         
 196        adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK); 
 197        *index = i; 
 198        *phys  = pfn * PAGE_SIZE;  
 199        return adr; 
 200} 
 201
 202static __init void unmap_low_page(int i)
 203{ 
 204        struct temp_map *ti = &temp_mappings[i];
 205        set_pmd(ti->pmd, __pmd(0));
 206        ti->allocated = 0; 
 207} 
 208
 209static void __init phys_pgd_init(pgd_t *pgd, unsigned long address, unsigned long end)
 210{ 
 211        long i, j; 
 212
 213        i = pgd_index(address);
 214        pgd = pgd + i;
 215        for (; i < PTRS_PER_PGD; pgd++, i++) {
 216                int map; 
 217                unsigned long paddr, pmd_phys;
 218                pmd_t *pmd;
 219
 220                paddr = (address & PML4_MASK) + i*PGDIR_SIZE;
 221                if (paddr >= end) { 
 222                        for (; i < PTRS_PER_PGD; i++, pgd++) 
 223                                set_pgd(pgd, __pgd(0)); 
 224                        break;
 225                } 
 226
 227                if (!e820_mapped(paddr, paddr+PGDIR_SIZE, 0)) { 
 228                        set_pgd(pgd, __pgd(0)); 
 229                        continue;
 230                } 
 231
 232                pmd = alloc_low_page(&map, &pmd_phys);
 233                set_pgd(pgd, __pgd(pmd_phys | _KERNPG_TABLE));
 234                for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
 235                        unsigned long pe;
 236
 237                        if (paddr >= end) { 
 238                                for (; j < PTRS_PER_PMD; j++, pmd++)
 239                                        set_pmd(pmd,  __pmd(0)); 
 240                                break;
 241                }
 242                        pe = _PAGE_NX|_PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | paddr;
 243                        pe &= __supported_pte_mask;
 244                        set_pmd(pmd, __pmd(pe));
 245                }
 246                unmap_low_page(map);
 247        }
 248        __flush_tlb();
 249} 
 250
 251/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
 252   This runs before bootmem is initialized and gets pages directly from the 
 253   physical memory. To access them they are temporarily mapped. */
 254void __init init_memory_mapping(void) 
 255{ 
 256        unsigned long adr;             
 257        unsigned long end;
 258        unsigned long next; 
 259        unsigned long pgds, pmds, tables; 
 260
 261        Dprintk("init_memory_mapping\n");
 262
 263        end = end_pfn_map << PAGE_SHIFT;
 264
 265        /* 
 266         * Find space for the kernel direct mapping tables.
 267         * Later we should allocate these tables in the local node of the memory
 268         * mapped.  Unfortunately this is done currently before the nodes are 
 269         * discovered.
 270         */
 271
 272        pgds = (end + PGDIR_SIZE - 1) >> PGDIR_SHIFT;
 273        pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; 
 274        tables = round_up(pgds*8, PAGE_SIZE) + round_up(pmds * 8, PAGE_SIZE); 
 275
 276        table_start = find_e820_area(0x8000, __pa_symbol(&_text), tables); 
 277        if (table_start == -1UL) 
 278                panic("Cannot find space for the kernel page tables"); 
 279
 280        table_start >>= PAGE_SHIFT; 
 281        table_end = table_start;
 282       
 283        end += __PAGE_OFFSET; /* turn virtual */        
 284
 285        for (adr = PAGE_OFFSET; adr < end; adr = next) { 
 286                int map;
 287                unsigned long pgd_phys; 
 288                pgd_t *pgd = alloc_low_page(&map, &pgd_phys);
 289                next = adr + PML4_SIZE;
 290                if (next > end) 
 291                        next = end; 
 292                phys_pgd_init(pgd, adr-PAGE_OFFSET, next-PAGE_OFFSET); 
 293                set_pml4(init_level4_pgt + pml4_index(adr), mk_kernel_pml4(pgd_phys));
 294                unmap_low_page(map);   
 295        } 
 296        asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
 297        __flush_tlb_all();
 298        early_printk("kernel direct mapping tables upto %lx @ %lx-%lx\n", end, 
 299               table_start<<PAGE_SHIFT, 
 300               table_end<<PAGE_SHIFT);
 301}
 302
 303extern struct x8664_pda cpu_pda[NR_CPUS];
 304
 305static unsigned long low_pml4[NR_CPUS];
 306
 307void swap_low_mappings(void)
 308{
 309        int i;
 310        for (i = 0; i < NR_CPUS; i++) {
 311                unsigned long t;
 312                if (!cpu_pda[i].level4_pgt) 
 313                        continue;
 314                t = cpu_pda[i].level4_pgt[0];
 315                cpu_pda[i].level4_pgt[0] = low_pml4[i];
 316                low_pml4[i] = t;
 317        }
 318        flush_tlb_all();
 319}
 320
 321void zap_low_mappings(void)
 322{
 323        swap_low_mappings();
 324}
 325
 326#ifndef CONFIG_DISCONTIGMEM
 327void __init paging_init(void)
 328{
 329        {
 330                unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
 331                unsigned int max_dma;
 332
 333                max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
 334
 335                if (end_pfn < max_dma)
 336                        zones_size[ZONE_DMA] = end_pfn;
 337                else {
 338                        zones_size[ZONE_DMA] = max_dma;
 339                        zones_size[ZONE_NORMAL] = end_pfn - max_dma;
 340                }
 341                free_area_init(zones_size);
 342        }
 343        return;
 344}
 345#endif
 346
 347/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
 348   from the CPU leading to inconsistent cache lines. address and size
 349   must be aligned to 2MB boundaries. 
 350   Does nothing when the mapping doesn't exist. */
 351void __init clear_kernel_mapping(unsigned long address, unsigned long size) 
 352{
 353        unsigned long end = address + size;
 354
 355        BUG_ON(address & ~LARGE_PAGE_MASK);
 356        BUG_ON(size & ~LARGE_PAGE_MASK); 
 357        
 358        for (; address < end; address += LARGE_PAGE_SIZE) { 
 359                pgd_t *pgd = pgd_offset_k(address);
 360               pmd_t *pmd;
 361                if (!pgd || pgd_none(*pgd))
 362                        continue; 
 363               pmd = pmd_offset(pgd, address);
 364                if (!pmd || pmd_none(*pmd))
 365                        continue; 
 366                if (0 == (pmd_val(*pmd) & _PAGE_PSE)) { 
 367                        /* Could handle this, but it should not happen currently. */
 368                        printk(KERN_ERR 
 369               "clear_kernel_mapping: mapping has been split. will leak memory\n"); 
 370                        pmd_ERROR(*pmd); 
 371                }
 372                set_pmd(pmd, __pmd(0));                 
 373        }
 374        __flush_tlb_all();
 375} 
 376
 377static inline int page_is_ram (unsigned long pagenr)
 378{
 379        int i;
 380
 381        for (i = 0; i < e820.nr_map; i++) {
 382                unsigned long addr, end;
 383
 384                if (e820.map[i].type != E820_RAM)       /* not usable memory */
 385                        continue;
 386                /*
 387                 *      !!!FIXME!!! Some BIOSen report areas as RAM that
 388                 *      are not. Notably the 640->1Mb area. We need a sanity
 389                 *      check here.
 390                 */
 391                addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
 392                end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
 393                if  ((pagenr >= addr) && (pagenr < end))
 394                        return 1;
 395        }
 396        return 0;
 397}
 398
 399static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
 400                         kcore_vsyscall;
 401
 402void __init mem_init(void)
 403{
 404        int codesize, reservedpages, datasize, initsize;
 405        int tmp;
 406
 407#ifdef CONFIG_SWIOTLB
 408        if (!iommu_aperture && end_pfn >= 0xffffffff>>PAGE_SHIFT)
 409               swiotlb = 1;
 410        if (swiotlb)
 411                swiotlb_init(); 
 412#endif
 413
 414        /* How many end-of-memory variables you have, grandma! */
 415        max_low_pfn = end_pfn;
 416        max_pfn = end_pfn;
 417        num_physpages = end_pfn;
 418        high_memory = (void *) __va(end_pfn * PAGE_SIZE);
 419
 420        /* clear the zero-page */
 421        memset(empty_zero_page, 0, PAGE_SIZE);
 422
 423        reservedpages = 0;
 424
 425        /* this will put all low memory onto the freelists */
 426#ifdef CONFIG_DISCONTIGMEM
 427        totalram_pages += numa_free_all_bootmem();
 428        tmp = 0;
 429        /* should count reserved pages here for all nodes */ 
 430#else
 431        max_mapnr = end_pfn;
 432        if (!mem_map) BUG();
 433
 434        totalram_pages += free_all_bootmem();
 435
 436        for (tmp = 0; tmp < end_pfn; tmp++)
 437                /*
 438                 * Only count reserved RAM pages
 439                 */
 440                if (page_is_ram(tmp) && PageReserved(mem_map+tmp))
 441                        reservedpages++;
 442#endif
 443
 444        after_bootmem = 1;
 445
 446        codesize =  (unsigned long) &_etext - (unsigned long) &_text;
 447        datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
 448        initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
 449
 450        /* Register memory areas for /proc/kcore */
 451        kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); 
 452        kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, 
 453                   VMALLOC_END-VMALLOC_START);
 454        kclist_add(&kcore_kernel, &_stext, _end - _stext);
 455        kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
 456        kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, 
 457                                 VSYSCALL_END - VSYSCALL_START);
 458
 459        printk("Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init)\n",
 460                (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
 461                end_pfn << (PAGE_SHIFT-10),
 462                codesize >> 10,
 463                reservedpages << (PAGE_SHIFT-10),
 464                datasize >> 10,
 465                initsize >> 10);
 466
 467        /*
 468         * Subtle. SMP is doing its boot stuff late (because it has to
 469         * fork idle threads) - but it also needs low mappings for the
 470         * protected-mode entry to work. We zap these entries only after
 471         * the WP-bit has been tested.
 472         */
 473#ifndef CONFIG_SMP
 474        zap_low_mappings();
 475#endif
 476}
 477
 478void free_initmem(void)
 479{
 480        unsigned long addr;
 481
 482        addr = (unsigned long)(&__init_begin);
 483        for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
 484                ClearPageReserved(virt_to_page(addr));
 485                set_page_count(virt_to_page(addr), 1);
 486#ifdef CONFIG_INIT_DEBUG
 487                memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE); 
 488#endif
 489                free_page(addr);
 490                totalram_pages++;
 491        }
 492        printk ("Freeing unused kernel memory: %luk freed\n", (&__init_end - &__init_begin) >> 10);
 493}
 494
 495#ifdef CONFIG_BLK_DEV_INITRD
 496void free_initrd_mem(unsigned long start, unsigned long end)
 497{
 498        if (start < (unsigned long)&_end)
 499                return;
 500        printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
 501        for (; start < end; start += PAGE_SIZE) {
 502                ClearPageReserved(virt_to_page(start));
 503                set_page_count(virt_to_page(start), 1);
 504                free_page(start);
 505                totalram_pages++;
 506        }
 507}
 508#endif
 509
 510void __init reserve_bootmem_generic(unsigned long phys, unsigned len) 
 511{ 
 512        /* Should check here against the e820 map to avoid double free */ 
 513#ifdef CONFIG_DISCONTIGMEM
 514        int nid = phys_to_nid(phys);
 515        reserve_bootmem_node(NODE_DATA(nid), phys, len);
 516#else                   
 517        reserve_bootmem(phys, len);    
 518#endif
 519}
 520
 521int kern_addr_valid(unsigned long addr) 
 522{ 
 523        unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
 524       pml4_t *pml4;
 525       pgd_t *pgd;
 526       pmd_t *pmd;
 527       pte_t *pte;
 528
 529        if (above != 0 && above != -1UL)
 530                return 0; 
 531        
 532       pml4 = pml4_offset_k(addr);
 533        if (pml4_none(*pml4))
 534                return 0;
 535
 536       pgd = pgd_offset_k(addr);
 537        if (pgd_none(*pgd))
 538                return 0; 
 539
 540       pmd = pmd_offset(pgd, addr);
 541        if (pmd_none(*pmd))
 542                return 0;
 543        if (pmd_large(*pmd))
 544                return pfn_valid(pmd_pfn(*pmd));
 545
 546       pte = pte_offset_kernel(pmd, addr);
 547        if (pte_none(*pte))
 548                return 0;
 549        return pfn_valid(pte_pfn(*pte));
 550}
 551
 552#ifdef CONFIG_SYSCTL
 553#include <linux/sysctl.h>
 554
 555extern int exception_trace, page_fault_trace;
 556
 557static ctl_table debug_table2[] = {
 558        { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
 559          proc_dointvec },
 560#ifdef CONFIG_CHECKING
 561        { 100, "page-fault-trace", &page_fault_trace, sizeof(int), 0644, NULL,
 562          proc_dointvec },
 563#endif
 564        { 0, }
 565}; 
 566
 567static ctl_table debug_root_table2[] = { 
 568        { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555, 
 569           .child = debug_table2 }, 
 570        { 0 }, 
 571}; 
 572
 573static __init int x8664_sysctl_init(void)
 574{ 
 575        register_sysctl_table(debug_root_table2, 1);
 576        return 0;
 577}
 578__initcall(x8664_sysctl_init);
 579#endif
 580
 581/* Pseudo VMAs to allow ptrace access for the vsyscall pages.  x86-64 has two
 582   different ones: one for 32bit and one for 64bit. Use the appropiate
 583   for the target task. */
 584
 585static struct vm_area_struct gate_vma = {
 586        .vm_start = VSYSCALL_START,
 587        .vm_end = VSYSCALL_END,
 588        .vm_page_prot = PAGE_READONLY
 589};
 590
 591static struct vm_area_struct gate32_vma = {
 592        .vm_start = VSYSCALL32_BASE,
 593        .vm_end = VSYSCALL32_END,
 594        .vm_page_prot = PAGE_READONLY
 595};
 596
 597struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
 598{
 599        return test_tsk_thread_flag(tsk, TIF_IA32) ? &gate32_vma : &gate_vma;
 600}
 601
 602int in_gate_area(struct task_struct *task, unsigned long addr)
 603{
 604        struct vm_area_struct *vma = get_gate_vma(task);
 605        return (addr >= vma->vm_start) && (addr < vma->vm_end);
 606}
 607
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.