linux/arch/x86_64/mm/init.c
<<
>>
Prefs
   1/*
   2 *  linux/arch/x86_64/mm/init.c
   3 *
   4 *  Copyright (C) 1995  Linus Torvalds
   5 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
   6 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
   7 */
   8
   9#include <linux/config.h>
  10#include <linux/signal.h>
  11#include <linux/sched.h>
  12#include <linux/kernel.h>
  13#include <linux/errno.h>
  14#include <linux/string.h>
  15#include <linux/types.h>
  16#include <linux/ptrace.h>
  17#include <linux/mman.h>
  18#include <linux/mm.h>
  19#include <linux/swap.h>
  20#include <linux/smp.h>
  21#include <linux/init.h>
  22#include <linux/pagemap.h>
  23#include <linux/bootmem.h>
  24#include <linux/proc_fs.h>
  25
  26#include <asm/processor.h>
  27#include <asm/system.h>
  28#include <asm/uaccess.h>
  29#include <asm/pgtable.h>
  30#include <asm/pgalloc.h>
  31#include <asm/dma.h>
  32#include <asm/fixmap.h>
  33#include <asm/e820.h>
  34#include <asm/apic.h>
  35#include <asm/tlb.h>
  36#include <asm/mmu_context.h>
  37#include <asm/proto.h>
  38#include <asm/smp.h>
  39
  40#ifndef Dprintk
  41#define Dprintk(x...)
  42#endif
  43
  44#ifdef CONFIG_GART_IOMMU
  45extern int swiotlb;
  46#endif
  47
  48extern char _stext[];
  49
  50DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
  51
  52/*
   53 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously
   54 * in physical memory, so we can cache the location of the first one and
   55 * move around without checking the pgd every time.
  56 */
  57
  58void show_mem(void)
  59{
  60        int i, total = 0, reserved = 0;
  61        int shared = 0, cached = 0;
  62        pg_data_t *pgdat;
  63        struct page *page;
  64
  65        printk("Mem-info:\n");
  66        show_free_areas();
  67        printk("Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
  68
  69        for_each_pgdat(pgdat) {
  70               for (i = 0; i < pgdat->node_spanned_pages; ++i) {
  71                        page = pfn_to_page(pgdat->node_start_pfn + i);
  72                        total++;
  73                       if (PageReserved(page))
  74                        reserved++;
  75                       else if (PageSwapCache(page))
  76                        cached++;
  77                       else if (page_count(page))
  78                               shared += page_count(page) - 1;
  79               }
  80        }
  81        printk("%d pages of RAM\n", total);
  82        printk("%d reserved pages\n",reserved);
  83        printk("%d pages shared\n",shared);
  84        printk("%d pages swap cached\n",cached);
  85}
  86
  87/* References to section boundaries */
  88
  89extern char _text, _etext, _edata, __bss_start, _end[];
  90extern char __init_begin, __init_end;
  91
  92int after_bootmem;
  93
  94static void *spp_getpage(void)
  95{ 
  96        void *ptr;
  97        if (after_bootmem)
  98                ptr = (void *) get_zeroed_page(GFP_ATOMIC); 
  99        else
 100                ptr = alloc_bootmem_pages(PAGE_SIZE);
 101        if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
 102                panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
 103
 104        Dprintk("spp_getpage %p\n", ptr);
 105        return ptr;
 106} 
 107
 108static void set_pte_phys(unsigned long vaddr,
 109                         unsigned long phys, pgprot_t prot)
 110{
 111        pgd_t *pgd;
 112        pud_t *pud;
 113        pmd_t *pmd;
 114        pte_t *pte, new_pte;
 115
 116        Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
 117
 118        pgd = pgd_offset_k(vaddr);
 119        if (pgd_none(*pgd)) {
 120                printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
 121                return;
 122        }
 123        pud = pud_offset(pgd, vaddr);
 124        if (pud_none(*pud)) {
 125                pmd = (pmd_t *) spp_getpage(); 
 126                set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
 127                if (pmd != pmd_offset(pud, 0)) {
 128                        printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
 129                        return;
 130                }
 131        }
 132        pmd = pmd_offset(pud, vaddr);
 133        if (pmd_none(*pmd)) {
 134                pte = (pte_t *) spp_getpage();
 135                set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
 136                if (pte != pte_offset_kernel(pmd, 0)) {
 137                        printk("PAGETABLE BUG #02!\n");
 138                        return;
 139                }
 140        }
 141        new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
 142
 143        pte = pte_offset_kernel(pmd, vaddr);
 144        if (!pte_none(*pte) &&
 145            pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
 146                pte_ERROR(*pte);
 147        set_pte(pte, new_pte);
 148
 149        /*
 150         * It's enough to flush this one mapping.
 151         * (PGE mappings get flushed as well)
 152         */
 153        __flush_tlb_one(vaddr);
 154}
 155
 156/* NOTE: this is meant to be run only at boot */
 157void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
 158{
 159        unsigned long address = __fix_to_virt(idx);
 160
 161        if (idx >= __end_of_fixed_addresses) {
 162                printk("Invalid __set_fixmap\n");
 163                return;
 164        }
 165        set_pte_phys(address, phys, prot);
 166}
 167
 168unsigned long __initdata table_start, table_end; 
 169
 170extern pmd_t temp_boot_pmds[]; 
 171
 172static  struct temp_map { 
 173        pmd_t *pmd;
 174        void  *address; 
 175        int    allocated; 
 176} temp_mappings[] __initdata = { 
 177        { &temp_boot_pmds[0], (void *)(40UL * 1024 * 1024) },
 178        { &temp_boot_pmds[1], (void *)(42UL * 1024 * 1024) }, 
 179        {}
 180}; 
 181
 182static __init void *alloc_low_page(int *index, unsigned long *phys) 
 183{ 
 184        struct temp_map *ti;
 185        int i; 
 186        unsigned long pfn = table_end++, paddr; 
 187        void *adr;
 188
 189        if (pfn >= end_pfn) 
 190                panic("alloc_low_page: ran out of memory"); 
 191        for (i = 0; temp_mappings[i].allocated; i++) {
 192                if (!temp_mappings[i].pmd) 
 193                        panic("alloc_low_page: ran out of temp mappings"); 
 194        } 
 195        ti = &temp_mappings[i];
 196        paddr = (pfn << PAGE_SHIFT) & PMD_MASK; 
 197        set_pmd(ti->pmd, __pmd(paddr | _KERNPG_TABLE | _PAGE_PSE)); 
 198        ti->allocated = 1; 
 199        __flush_tlb();         
 200        adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK); 
 201        *index = i; 
 202        *phys  = pfn * PAGE_SIZE;  
 203        return adr; 
 204} 
 205
 206static __init void unmap_low_page(int i)
 207{ 
 208        struct temp_map *ti = &temp_mappings[i];
 209        set_pmd(ti->pmd, __pmd(0));
 210        ti->allocated = 0; 
 211} 
 212
 213static void __init phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
 214{ 
 215        long i, j; 
 216
 217        i = pud_index(address);
 218        pud = pud + i;
 219        for (; i < PTRS_PER_PUD; pud++, i++) {
 220                int map; 
 221                unsigned long paddr, pmd_phys;
 222                pmd_t *pmd;
 223
 224                paddr = address + i*PUD_SIZE;
 225                if (paddr >= end) { 
 226                        for (; i < PTRS_PER_PUD; i++, pud++) 
 227                                set_pud(pud, __pud(0)); 
 228                        break;
 229                } 
 230
 231                if (!e820_mapped(paddr, paddr+PUD_SIZE, 0)) { 
 232                        set_pud(pud, __pud(0)); 
 233                        continue;
 234                } 
 235
 236                pmd = alloc_low_page(&map, &pmd_phys);
 237                set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
 238                for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
 239                        unsigned long pe;
 240
 241                        if (paddr >= end) { 
 242                                for (; j < PTRS_PER_PMD; j++, pmd++)
 243                                        set_pmd(pmd,  __pmd(0)); 
 244                                break;
 245                }
 246                        pe = _PAGE_NX|_PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | paddr;
 247                        pe &= __supported_pte_mask;
 248                        set_pmd(pmd, __pmd(pe));
 249                }
 250                unmap_low_page(map);
 251        }
 252        __flush_tlb();
 253} 
 254
 255static void __init find_early_table_space(unsigned long end)
 256{
 257        unsigned long puds, pmds, tables;
 258
 259        puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
 260        pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
 261        tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
 262                 round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
 263
 264        table_start = find_e820_area(0x8000, __pa_symbol(&_text), tables);
 265        if (table_start == -1UL)
 266                panic("Cannot find space for the kernel page tables");
 267
 268        table_start >>= PAGE_SHIFT;
 269        table_end = table_start;
 270}
 271
 272/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
 273   This runs before bootmem is initialized and gets pages directly from the 
 274   physical memory. To access them they are temporarily mapped. */
 275void __init init_memory_mapping(unsigned long start, unsigned long end)
 276{ 
 277        unsigned long next; 
 278
 279        Dprintk("init_memory_mapping\n");
 280
 281        /* 
 282         * Find space for the kernel direct mapping tables.
 283         * Later we should allocate these tables in the local node of the memory
 284         * mapped.  Unfortunately this is done currently before the nodes are 
 285         * discovered.
 286         */
 287        find_early_table_space(end);
 288
 289        start = (unsigned long)__va(start);
 290        end = (unsigned long)__va(end);
 291
 292        for (; start < end; start = next) {
 293                int map;
 294                unsigned long pud_phys; 
 295                pud_t *pud = alloc_low_page(&map, &pud_phys);
 296                next = start + PGDIR_SIZE;
 297                if (next > end) 
 298                        next = end; 
 299                phys_pud_init(pud, __pa(start), __pa(next));
 300                set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
 301                unmap_low_page(map);   
 302        } 
 303
 304        asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
 305        __flush_tlb_all();
 306        early_printk("kernel direct mapping tables upto %lx @ %lx-%lx\n", end, 
 307               table_start<<PAGE_SHIFT, 
 308               table_end<<PAGE_SHIFT);
 309}
 310
 311extern struct x8664_pda cpu_pda[NR_CPUS];
 312
 313/* Assumes all CPUs still execute in init_mm */
 314void zap_low_mappings(void)
 315{
 316        pgd_t *pgd = pgd_offset_k(0UL);
 317        pgd_clear(pgd);
 318        flush_tlb_all();
 319}
 320
 321#ifndef CONFIG_DISCONTIGMEM
 322void __init paging_init(void)
 323{
 324        {
 325                unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
 326                unsigned int max_dma;
 327
 328                max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
 329
 330                if (end_pfn < max_dma)
 331                        zones_size[ZONE_DMA] = end_pfn;
 332                else {
 333                        zones_size[ZONE_DMA] = max_dma;
 334                        zones_size[ZONE_NORMAL] = end_pfn - max_dma;
 335                }
 336                free_area_init(zones_size);
 337        }
 338        return;
 339}
 340#endif
 341
 342/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
 343   from the CPU leading to inconsistent cache lines. address and size
 344   must be aligned to 2MB boundaries. 
 345   Does nothing when the mapping doesn't exist. */
 346void __init clear_kernel_mapping(unsigned long address, unsigned long size) 
 347{
 348        unsigned long end = address + size;
 349
 350        BUG_ON(address & ~LARGE_PAGE_MASK);
 351        BUG_ON(size & ~LARGE_PAGE_MASK); 
 352        
 353        for (; address < end; address += LARGE_PAGE_SIZE) { 
 354                pgd_t *pgd = pgd_offset_k(address);
 355                pud_t *pud;
 356                pmd_t *pmd;
 357                if (pgd_none(*pgd))
 358                        continue;
 359                pud = pud_offset(pgd, address);
 360                if (pud_none(*pud))
 361                        continue; 
 362                pmd = pmd_offset(pud, address);
 363                if (!pmd || pmd_none(*pmd))
 364                        continue; 
 365                if (0 == (pmd_val(*pmd) & _PAGE_PSE)) { 
 366                        /* Could handle this, but it should not happen currently. */
 367                        printk(KERN_ERR 
 368               "clear_kernel_mapping: mapping has been split. will leak memory\n"); 
 369                        pmd_ERROR(*pmd); 
 370                }
 371                set_pmd(pmd, __pmd(0));                 
 372        }
 373        __flush_tlb_all();
 374} 
 375
 376static inline int page_is_ram (unsigned long pagenr)
 377{
 378        int i;
 379
 380        for (i = 0; i < e820.nr_map; i++) {
 381                unsigned long addr, end;
 382
 383                if (e820.map[i].type != E820_RAM)       /* not usable memory */
 384                        continue;
 385                /*
 386                 *      !!!FIXME!!! Some BIOSen report areas as RAM that
 387                 *      are not. Notably the 640->1Mb area. We need a sanity
 388                 *      check here.
 389                 */
 390                addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
 391                end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
 392                if  ((pagenr >= addr) && (pagenr < end))
 393                        return 1;
 394        }
 395        return 0;
 396}
 397
 398extern int swiotlb_force;
 399
 400static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
 401                         kcore_vsyscall;
 402
 403void __init mem_init(void)
 404{
 405        int codesize, reservedpages, datasize, initsize;
 406        int tmp;
 407
 408#ifdef CONFIG_SWIOTLB
 409        if (swiotlb_force)
 410                swiotlb = 1;
 411        if (!iommu_aperture &&
 412            (end_pfn >= 0xffffffff>>PAGE_SHIFT || force_iommu))
 413               swiotlb = 1;
 414        if (swiotlb)
 415                swiotlb_init(); 
 416#endif
 417
 418        /* How many end-of-memory variables you have, grandma! */
 419        max_low_pfn = end_pfn;
 420        max_pfn = end_pfn;
 421        num_physpages = end_pfn;
 422        high_memory = (void *) __va(end_pfn * PAGE_SIZE);
 423
 424        /* clear the zero-page */
 425        memset(empty_zero_page, 0, PAGE_SIZE);
 426
 427        reservedpages = 0;
 428
 429        /* this will put all low memory onto the freelists */
 430#ifdef CONFIG_DISCONTIGMEM
 431        totalram_pages += numa_free_all_bootmem();
 432        tmp = 0;
 433        /* should count reserved pages here for all nodes */ 
 434#else
 435        max_mapnr = end_pfn;
 436        if (!mem_map) BUG();
 437
 438        totalram_pages += free_all_bootmem();
 439
 440        for (tmp = 0; tmp < end_pfn; tmp++)
 441                /*
 442                 * Only count reserved RAM pages
 443                 */
 444                if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
 445                        reservedpages++;
 446#endif
 447
 448        after_bootmem = 1;
 449
 450        codesize =  (unsigned long) &_etext - (unsigned long) &_text;
 451        datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
 452        initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
 453
 454        /* Register memory areas for /proc/kcore */
 455        kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); 
 456        kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, 
 457                   VMALLOC_END-VMALLOC_START);
 458        kclist_add(&kcore_kernel, &_stext, _end - _stext);
 459        kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
 460        kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, 
 461                                 VSYSCALL_END - VSYSCALL_START);
 462
 463        printk("Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init)\n",
 464                (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
 465                end_pfn << (PAGE_SHIFT-10),
 466                codesize >> 10,
 467                reservedpages << (PAGE_SHIFT-10),
 468                datasize >> 10,
 469                initsize >> 10);
 470
 471        /*
 472         * Subtle. SMP is doing its boot stuff late (because it has to
 473         * fork idle threads) - but it also needs low mappings for the
 474         * protected-mode entry to work. We zap these entries only after
 475         * the WP-bit has been tested.
 476         */
 477#ifndef CONFIG_SMP
 478        zap_low_mappings();
 479#endif
 480}
 481
 482extern char __initdata_begin[], __initdata_end[];
 483
 484void free_initmem(void)
 485{
 486        unsigned long addr;
 487
 488        addr = (unsigned long)(&__init_begin);
 489        for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
 490                ClearPageReserved(virt_to_page(addr));
 491                set_page_count(virt_to_page(addr), 1);
 492                memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE); 
 493                free_page(addr);
 494                totalram_pages++;
 495        }
 496        memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin);
 497        printk ("Freeing unused kernel memory: %luk freed\n", (&__init_end - &__init_begin) >> 10);
 498}
 499
 500#ifdef CONFIG_BLK_DEV_INITRD
 501void free_initrd_mem(unsigned long start, unsigned long end)
 502{
 503        if (start < (unsigned long)&_end)
 504                return;
 505        printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
 506        for (; start < end; start += PAGE_SIZE) {
 507                ClearPageReserved(virt_to_page(start));
 508                set_page_count(virt_to_page(start), 1);
 509                free_page(start);
 510                totalram_pages++;
 511        }
 512}
 513#endif
 514
 515void __init reserve_bootmem_generic(unsigned long phys, unsigned len) 
 516{ 
 517        /* Should check here against the e820 map to avoid double free */ 
 518#ifdef CONFIG_DISCONTIGMEM
 519        int nid = phys_to_nid(phys);
 520        reserve_bootmem_node(NODE_DATA(nid), phys, len);
 521#else                   
 522        reserve_bootmem(phys, len);    
 523#endif
 524}
 525
 526int kern_addr_valid(unsigned long addr) 
 527{ 
 528        unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
 529       pgd_t *pgd;
 530       pud_t *pud;
 531       pmd_t *pmd;
 532       pte_t *pte;
 533
 534        if (above != 0 && above != -1UL)
 535                return 0; 
 536        
 537        pgd = pgd_offset_k(addr);
 538        if (pgd_none(*pgd))
 539                return 0;
 540
 541        pud = pud_offset(pgd, addr);
 542        if (pud_none(*pud))
 543                return 0; 
 544
 545        pmd = pmd_offset(pud, addr);
 546        if (pmd_none(*pmd))
 547                return 0;
 548        if (pmd_large(*pmd))
 549                return pfn_valid(pmd_pfn(*pmd));
 550
 551        pte = pte_offset_kernel(pmd, addr);
 552        if (pte_none(*pte))
 553                return 0;
 554        return pfn_valid(pte_pfn(*pte));
 555}
 556
 557#ifdef CONFIG_SYSCTL
 558#include <linux/sysctl.h>
 559
 560extern int exception_trace, page_fault_trace;
 561
 562static ctl_table debug_table2[] = {
 563        { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
 564          proc_dointvec },
 565#ifdef CONFIG_CHECKING
 566        { 100, "page-fault-trace", &page_fault_trace, sizeof(int), 0644, NULL,
 567          proc_dointvec },
 568#endif
 569        { 0, }
 570}; 
 571
 572static ctl_table debug_root_table2[] = { 
 573        { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555, 
 574           .child = debug_table2 }, 
 575        { 0 }, 
 576}; 
 577
 578static __init int x8664_sysctl_init(void)
 579{ 
 580        register_sysctl_table(debug_root_table2, 1);
 581        return 0;
 582}
 583__initcall(x8664_sysctl_init);
 584#endif
 585
 586/* Pseudo VMAs to allow ptrace access for the vsyscall pages.  x86-64 has two
 587   different ones: one for 32bit and one for 64bit. Use the appropiate
 588   for the target task. */
 589
 590static struct vm_area_struct gate_vma = {
 591        .vm_start = VSYSCALL_START,
 592        .vm_end = VSYSCALL_END,
 593        .vm_page_prot = PAGE_READONLY
 594};
 595
 596static struct vm_area_struct gate32_vma = {
 597        .vm_start = VSYSCALL32_BASE,
 598        .vm_end = VSYSCALL32_END,
 599        .vm_page_prot = PAGE_READONLY
 600};
 601
 602struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
 603{
 604#ifdef CONFIG_IA32_EMULATION
 605        if (test_tsk_thread_flag(tsk, TIF_IA32)) {
 606                /* lookup code assumes the pages are present. set them up
 607                   now */
 608                if (__map_syscall32(tsk->mm, VSYSCALL32_BASE) < 0)
 609                        return NULL;
 610                return &gate32_vma;
 611        }
 612#endif
 613        return &gate_vma;
 614}
 615
 616int in_gate_area(struct task_struct *task, unsigned long addr)
 617{
 618        struct vm_area_struct *vma = get_gate_vma(task);
 619        return (addr >= vma->vm_start) && (addr < vma->vm_end);
 620}
 621
 622/* Use this when you have no reliable task/vma, typically from interrupt
 623 * context.  It is less reliable than using the task's vma and may give
 624 * false positives.
 625 */
 626int in_gate_area_no_task(unsigned long addr)
 627{
 628        return (((addr >= VSYSCALL_START) && (addr < VSYSCALL_END)) ||
 629                ((addr >= VSYSCALL32_BASE) && (addr < VSYSCALL32_END)));
 630}
 631
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.