1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41#include <linux/kernel_stat.h>
42#include <linux/mm.h>
43#include <linux/hugetlb.h>
44#include <linux/mman.h>
45#include <linux/swap.h>
46#include <linux/highmem.h>
47#include <linux/pagemap.h>
48#include <linux/rmap.h>
49#include <linux/module.h>
50#include <linux/delayacct.h>
51#include <linux/init.h>
52#include <linux/writeback.h>
53#include <linux/memcontrol.h>
54#include <linux/mmu_notifier.h>
55#include <linux/kallsyms.h>
56#include <linux/swapops.h>
57#include <linux/elf.h>
58
59#include <asm/pgalloc.h>
60#include <asm/uaccess.h>
61#include <asm/tlb.h>
62#include <asm/tlbflush.h>
63#include <asm/pgtable.h>
64
65#include "internal.h"
66
#ifndef CONFIG_NEED_MULTIPLE_NODES
/*
 * These two globals are only defined (and exported) when
 * CONFIG_NEED_MULTIPLE_NODES is not set.
 * NOTE(review): presumably mem_map is the single flat array of struct page
 * and max_mapnr its upper index bound -- confirm against the arch setup code.
 */
unsigned long max_mapnr;
EXPORT_SYMBOL(max_mapnr);

struct page *mem_map;
EXPORT_SYMBOL(mem_map);
#endif
75
/*
 * Exported memory-size globals; both are only defined here and filled in
 * elsewhere.
 * NOTE(review): presumably num_physpages counts physical page frames and
 * high_memory marks the end of directly mapped memory -- confirm against
 * the code that initialises them.
 */
unsigned long num_physpages;
EXPORT_SYMBOL(num_physpages);

void *high_memory;
EXPORT_SYMBOL(high_memory);
88
89
90
91
92
93
94
95int randomize_va_space __read_mostly =
96#ifdef CONFIG_COMPAT_BRK
97 1;
98#else
99 2;
100#endif
101
102static int __init disable_randmaps(char *s)
103{
104 randomize_va_space = 0;
105 return 1;
106}
107__setup("norandmaps", disable_randmaps);
108
109
110
111
112
113
114
115
116void pgd_clear_bad(pgd_t *pgd)
117{
118 pgd_ERROR(*pgd);
119 pgd_clear(pgd);
120}
121
122void pud_clear_bad(pud_t *pud)
123{
124 pud_ERROR(*pud);
125 pud_clear(pud);
126}
127
128void pmd_clear_bad(pmd_t *pmd)
129{
130 pmd_ERROR(*pmd);
131 pmd_clear(pmd);
132}
133
134
135
136
137
138static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
139{
140 pgtable_t token = pmd_pgtable(*pmd);
141 pmd_clear(pmd);
142 pte_free_tlb(tlb, token);
143 tlb->mm->nr_ptes--;
144}
145
146static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
147 unsigned long addr, unsigned long end,
148 unsigned long floor, unsigned long ceiling)
149{
150 pmd_t *pmd;
151 unsigned long next;
152 unsigned long start;
153
154 start = addr;
155 pmd = pmd_offset(pud, addr);
156 do {
157 next = pmd_addr_end(addr, end);
158 if (pmd_none_or_clear_bad(pmd))
159 continue;
160 free_pte_range(tlb, pmd);
161 } while (pmd++, addr = next, addr != end);
162
163 start &= PUD_MASK;
164 if (start < floor)
165 return;
166 if (ceiling) {
167 ceiling &= PUD_MASK;
168 if (!ceiling)
169 return;
170 }
171 if (end - 1 > ceiling - 1)
172 return;
173
174 pmd = pmd_offset(pud, start);
175 pud_clear(pud);
176 pmd_free_tlb(tlb, pmd);
177}
178
179static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
180 unsigned long addr, unsigned long end,
181 unsigned long floor, unsigned long ceiling)
182{
183 pud_t *pud;
184 unsigned long next;
185 unsigned long start;
186
187 start = addr;
188 pud = pud_offset(pgd, addr);
189 do {
190 next = pud_addr_end(addr, end);
191 if (pud_none_or_clear_bad(pud))
192 continue;
193 free_pmd_range(tlb, pud, addr, next, floor, ceiling);
194 } while (pud++, addr = next, addr != end);
195
196 start &= PGDIR_MASK;
197 if (start < floor)
198 return;
199 if (ceiling) {
200 ceiling &= PGDIR_MASK;
201 if (!ceiling)
202 return;
203 }
204 if (end - 1 > ceiling - 1)
205 return;
206
207 pud = pud_offset(pgd, start);
208 pgd_clear(pgd);
209 pud_free_tlb(tlb, pud);
210}
211
212
213
214
215
216
217void free_pgd_range(struct mmu_gather *tlb,
218 unsigned long addr, unsigned long end,
219 unsigned long floor, unsigned long ceiling)
220{
221 pgd_t *pgd;
222 unsigned long next;
223 unsigned long start;
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251 addr &= PMD_MASK;
252 if (addr < floor) {
253 addr += PMD_SIZE;
254 if (!addr)
255 return;
256 }
257 if (ceiling) {
258 ceiling &= PMD_MASK;
259 if (!ceiling)
260 return;
261 }
262 if (end - 1 > ceiling - 1)
263 end -= PMD_SIZE;
264 if (addr > end - 1)
265 return;
266
267 start = addr;
268 pgd = pgd_offset(tlb->mm, addr);
269 do {
270 next = pgd_addr_end(addr, end);
271 if (pgd_none_or_clear_bad(pgd))
272 continue;
273 free_pud_range(tlb, pgd, addr, next, floor, ceiling);
274 } while (pgd++, addr = next, addr != end);
275}
276
277void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
278 unsigned long floor, unsigned long ceiling)
279{
280 while (vma) {
281 struct vm_area_struct *next = vma->vm_next;
282 unsigned long addr = vma->vm_start;
283
284
285
286
287 anon_vma_unlink(vma);
288 unlink_file_vma(vma);
289
290 if (is_vm_hugetlb_page(vma)) {
291 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
292 floor, next? next->vm_start: ceiling);
293 } else {
294
295
296
297 while (next && next->vm_start <= vma->vm_end + PMD_SIZE
298 && !is_vm_hugetlb_page(next)) {
299 vma = next;
300 next = vma->vm_next;
301 anon_vma_unlink(vma);
302 unlink_file_vma(vma);
303 }
304 free_pgd_range(tlb, addr, vma->vm_end,
305 floor, next? next->vm_start: ceiling);
306 }
307 vma = next;
308 }
309}
310
311int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
312{
313 pgtable_t new = pte_alloc_one(mm, address);
314 if (!new)
315 return -ENOMEM;
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330 smp_wmb();
331
332 spin_lock(&mm->page_table_lock);
333 if (!pmd_present(*pmd)) {
334 mm->nr_ptes++;
335 pmd_populate(mm, pmd, new);
336 new = NULL;
337 }
338 spin_unlock(&mm->page_table_lock);
339 if (new)
340 pte_free(mm, new);
341 return 0;
342}
343
344int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
345{
346 pte_t *new = pte_alloc_one_kernel(&init_mm, address);
347 if (!new)
348 return -ENOMEM;
349
350 smp_wmb();
351
352 spin_lock(&init_mm.page_table_lock);
353 if (!pmd_present(*pmd)) {
354 pmd_populate_kernel(&init_mm, pmd, new);
355 new = NULL;
356 }
357 spin_unlock(&init_mm.page_table_lock);
358 if (new)
359 pte_free_kernel(&init_mm, new);
360 return 0;
361}
362
/*
 * Apply accumulated rss deltas to @mm, skipping counters whose delta is 0.
 *
 * @mm:       address space whose counters are adjusted.
 * @file_rss: signed delta for the file_rss counter.
 * @anon_rss: signed delta for the anon_rss counter.
 *
 * NOTE(review): add_mm_counter() is given the counter name as its second
 * argument and the delta as its third; the parameter names here double as
 * those counter names, so they must not be renamed -- confirm against the
 * macro definition.
 */
static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
{
	if (file_rss != 0)
		add_mm_counter(mm, file_rss, file_rss);

	if (anon_rss != 0)
		add_mm_counter(mm, anon_rss, anon_rss);
}
370
371
372
373
374
375
376
377
378static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
379 pte_t pte, struct page *page)
380{
381 pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
382 pud_t *pud = pud_offset(pgd, addr);
383 pmd_t *pmd = pmd_offset(pud, addr);
384 struct address_space *mapping;
385 pgoff_t index;
386 static unsigned long resume;
387 static unsigned long nr_shown;
388 static unsigned long nr_unshown;
389
390
391
392
393
394 if (nr_shown == 60) {
395 if (time_before(jiffies, resume)) {
396 nr_unshown++;
397 return;
398 }
399 if (nr_unshown) {
400 printk(KERN_ALERT
401 "BUG: Bad page map: %lu messages suppressed\n",
402 nr_unshown);
403 nr_unshown = 0;
404 }
405 nr_shown = 0;
406 }
407 if (nr_shown++ == 0)
408 resume = jiffies + 60 * HZ;
409
410 mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
411 index = linear_page_index(vma, addr);
412
413 printk(KERN_ALERT
414 "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
415 current->comm,
416 (long long)pte_val(pte), (long long)pmd_val(*pmd));
417 if (page) {
418 printk(KERN_ALERT
419 "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
420 page, (void *)page->flags, page_count(page),
421 page_mapcount(page), page->mapping, page->index);
422 }
423 printk(KERN_ALERT
424 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
425 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
426
427
428
429 if (vma->vm_ops)
430 print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n",
431 (unsigned long)vma->vm_ops->fault);
432 if (vma->vm_file && vma->vm_file->f_op)
433 print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n",
434 (unsigned long)vma->vm_file->f_op->mmap);
435 dump_stack();
436 add_taint(TAINT_BAD_PAGE);
437}
438
439static inline int is_cow_mapping(unsigned int flags)
440{
441 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
442}
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486#ifdef __HAVE_ARCH_PTE_SPECIAL
487# define HAVE_PTE_SPECIAL 1
488#else
489# define HAVE_PTE_SPECIAL 0
490#endif
491struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
492 pte_t pte)
493{
494 unsigned long pfn = pte_pfn(pte);
495
496 if (HAVE_PTE_SPECIAL) {
497 if (likely(!pte_special(pte)))
498 goto check_pfn;
499 if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)))
500 print_bad_pte(vma, addr, pte, NULL);
501 return NULL;
502 }
503
504
505
506 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
507 if (vma->vm_flags & VM_MIXEDMAP) {
508 if (!pfn_valid(pfn))
509 return NULL;
510 goto out;
511 } else {
512 unsigned long off;
513 off = (addr - vma->vm_start) >> PAGE_SHIFT;
514 if (pfn == vma->vm_pgoff + off)
515 return NULL;
516 if (!is_cow_mapping(vma->vm_flags))
517 return NULL;
518 }
519 }
520
521check_pfn:
522 if (unlikely(pfn > highest_memmap_pfn)) {
523 print_bad_pte(vma, addr, pte, NULL);
524 return NULL;
525 }
526
527
528
529
530
531out:
532 return pfn_to_page(pfn);
533}
534
535
536
537
538
539
540
541static inline void
542copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
543 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
544 unsigned long addr, int *rss)
545{
546 unsigned long vm_flags = vma->vm_flags;
547 pte_t pte = *src_pte;
548 struct page *page;
549
550
551 if (unlikely(!pte_present(pte))) {
552 if (!pte_file(pte)) {
553 swp_entry_t entry = pte_to_swp_entry(pte);
554
555 swap_duplicate(entry);
556
557 if (unlikely(list_empty(&dst_mm->mmlist))) {
558 spin_lock(&mmlist_lock);
559 if (list_empty(&dst_mm->mmlist))
560 list_add(&dst_mm->mmlist,
561 &src_mm->mmlist);
562 spin_unlock(&mmlist_lock);
563 }
564 if (is_write_migration_entry(entry) &&
565 is_cow_mapping(vm_flags)) {
566
567
568
569
570 make_migration_entry_read(&entry);
571 pte = swp_entry_to_pte(entry);
572 set_pte_at(src_mm, addr, src_pte, pte);
573 }
574 }
575 goto out_set_pte;
576 }
577
578
579
580
581
582 if (is_cow_mapping(vm_flags)) {
583 ptep_set_wrprotect(src_mm, addr, src_pte);
584 pte = pte_wrprotect(pte);
585 }
586
587
588
589
590
591 if (vm_flags & VM_SHARED)
592 pte = pte_mkclean(pte);
593 pte = pte_mkold(pte);
594
595 page = vm_normal_page(vma, addr, pte);
596 if (page) {
597 get_page(page);
598 page_dup_rmap(page, vma, addr);
599 rss[!!PageAnon(page)]++;
600 }
601
602out_set_pte:
603 set_pte_at(dst_mm, addr, dst_pte, pte);
604}
605
606static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
607 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
608 unsigned long addr, unsigned long end)
609{
610 pte_t *src_pte, *dst_pte;
611 spinlock_t *src_ptl, *dst_ptl;
612 int progress = 0;
613 int rss[2];
614
615again:
616 rss[1] = rss[0] = 0;
617 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
618 if (!dst_pte)
619 return -ENOMEM;
620 src_pte = pte_offset_map_nested(src_pmd, addr);
621 src_ptl = pte_lockptr(src_mm, src_pmd);
622 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
623 arch_enter_lazy_mmu_mode();
624
625 do {
626
627
628
629
630 if (progress >= 32) {
631 progress = 0;
632 if (need_resched() ||
633 spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
634 break;
635 }
636 if (pte_none(*src_pte)) {
637 progress++;
638 continue;
639 }
640 copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
641 progress += 8;
642 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
643
644 arch_leave_lazy_mmu_mode();
645 spin_unlock(src_ptl);
646 pte_unmap_nested(src_pte - 1);
647 add_mm_rss(dst_mm, rss[0], rss[1]);
648 pte_unmap_unlock(dst_pte - 1, dst_ptl);
649 cond_resched();
650 if (addr != end)
651 goto again;
652 return 0;
653}
654
655static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
656 pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
657 unsigned long addr, unsigned long end)
658{
659 pmd_t *src_pmd, *dst_pmd;
660 unsigned long next;
661
662 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
663 if (!dst_pmd)
664 return -ENOMEM;
665 src_pmd = pmd_offset(src_pud, addr);
666 do {
667 next = pmd_addr_end(addr, end);
668 if (pmd_none_or_clear_bad(src_pmd))
669 continue;
670 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
671 vma, addr, next))
672 return -ENOMEM;
673 } while (dst_pmd++, src_pmd++, addr = next, addr != end);
674 return 0;
675}
676
677static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
678 pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
679 unsigned long addr, unsigned long end)
680{
681 pud_t *src_pud, *dst_pud;
682 unsigned long next;
683
684 dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
685 if (!dst_pud)
686 return -ENOMEM;
687 src_pud = pud_offset(src_pgd, addr);
688 do {
689 next = pud_addr_end(addr, end);
690 if (pud_none_or_clear_bad(src_pud))
691 continue;
692 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
693 vma, addr, next))
694 return -ENOMEM;
695 } while (dst_pud++, src_pud++, addr = next, addr != end);
696 return 0;
697}
698
699int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
700 struct vm_area_struct *vma)
701{
702 pgd_t *src_pgd, *dst_pgd;
703 unsigned long next;
704 unsigned long addr = vma->vm_start;
705 unsigned long end = vma->vm_end;
706 int ret;
707
708
709
710
711
712
713
714 if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
715 if (!vma->anon_vma)
716 return 0;
717 }
718
719 if (is_vm_hugetlb_page(vma))
720 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
721
722 if (unlikely(is_pfn_mapping(vma))) {
723
724
725
726
727 ret = track_pfn_vma_copy(vma);
728 if (ret)
729 return ret;
730 }
731
732
733
734
735
736
737
738 if (is_cow_mapping(vma->vm_flags))
739 mmu_notifier_invalidate_range_start(src_mm, addr, end);
740
741 ret = 0;
742 dst_pgd = pgd_offset(dst_mm, addr);
743 src_pgd = pgd_offset(src_mm, addr);
744 do {
745 next = pgd_addr_end(addr, end);
746 if (pgd_none_or_clear_bad(src_pgd))
747 continue;
748 if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
749 vma, addr, next))) {
750 ret = -ENOMEM;
751 break;
752 }
753 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
754
755 if (is_cow_mapping(vma->vm_flags))
756 mmu_notifier_invalidate_range_end(src_mm,
757 vma->vm_start, end);
758 return ret;
759}
760
761static unsigned long zap_pte_range(struct mmu_gather *tlb,
762 struct vm_area_struct *vma, pmd_t *pmd,
763 unsigned long addr, unsigned long end,
764 long *zap_work, struct zap_details *details)
765{
766 struct mm_struct *mm = tlb->mm;
767 pte_t *pte;
768 spinlock_t *ptl;
769 int file_rss = 0;
770 int anon_rss = 0;
771
772 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
773 arch_enter_lazy_mmu_mode();
774 do {
775 pte_t ptent = *pte;
776 if (pte_none(ptent)) {
777 (*zap_work)--;
778 continue;
779 }
780
781 (*zap_work) -= PAGE_SIZE;
782
783 if (pte_present(ptent)) {
784 struct page *page;
785
786 page = vm_normal_page(vma, addr, ptent);
787 if (unlikely(details) && page) {
788
789
790
791
792
793 if (details->check_mapping &&
794 details->check_mapping != page->mapping)
795 continue;
796
797
798
799
800 if (details->nonlinear_vma &&
801 (page->index < details->first_index ||
802 page->index > details->last_index))
803 continue;
804 }
805 ptent = ptep_get_and_clear_full(mm, addr, pte,
806 tlb->fullmm);
807 tlb_remove_tlb_entry(tlb, pte, addr);
808 if (unlikely(!page))
809 continue;
810 if (unlikely(details) && details->nonlinear_vma
811 && linear_page_index(details->nonlinear_vma,
812 addr) != page->index)
813 set_pte_at(mm, addr, pte,
814 pgoff_to_pte(page->index));
815 if (PageAnon(page))
816 anon_rss--;
817 else {
818 if (pte_dirty(ptent))
819 set_page_dirty(page);
820 if (pte_young(ptent) &&
821 likely(!VM_SequentialReadHint(vma)))
822 mark_page_accessed(page);
823 file_rss--;
824 }
825 page_remove_rmap(page);
826 if (unlikely(page_mapcount(page) < 0))
827 print_bad_pte(vma, addr, ptent, page);
828 tlb_remove_page(tlb, page);
829 continue;
830 }
831
832
833
834
835 if (unlikely(details))
836 continue;
837 if (pte_file(ptent)) {
838 if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
839 print_bad_pte(vma, addr, ptent, NULL);
840 } else if
841 (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent))))
842 print_bad_pte(vma, addr, ptent, NULL);
843 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
844 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
845
846 add_mm_rss(mm, file_rss, anon_rss);
847 arch_leave_lazy_mmu_mode();
848 pte_unmap_unlock(pte - 1, ptl);
849
850 return addr;
851}
852
853static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
854 struct vm_area_struct *vma, pud_t *pud,
855 unsigned long addr, unsigned long end,
856 long *zap_work, struct zap_details *details)
857{
858 pmd_t *pmd;
859 unsigned long next;
860
861 pmd = pmd_offset(pud, addr);
862 do {
863 next = pmd_addr_end(addr, end);
864 if (pmd_none_or_clear_bad(pmd)) {
865 (*zap_work)--;
866 continue;
867 }
868 next = zap_pte_range(tlb, vma, pmd, addr, next,
869 zap_work, details);
870 } while (pmd++, addr = next, (addr != end && *zap_work > 0));
871
872 return addr;
873}
874
875static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
876 struct vm_area_struct *vma, pgd_t *pgd,
877 unsigned long addr, unsigned long end,
878 long *zap_work, struct zap_details *details)
879{
880 pud_t *pud;
881 unsigned long next;
882
883 pud = pud_offset(pgd, addr);
884 do {
885 next = pud_addr_end(addr, end);
886 if (pud_none_or_clear_bad(pud)) {
887 (*zap_work)--;
888 continue;
889 }
890 next = zap_pmd_range(tlb, vma, pud, addr, next,
891 zap_work, details);
892 } while (pud++, addr = next, (addr != end && *zap_work > 0));
893
894 return addr;
895}
896
897static unsigned long unmap_page_range(struct mmu_gather *tlb,
898 struct vm_area_struct *vma,
899 unsigned long addr, unsigned long end,
900 long *zap_work, struct zap_details *details)
901{
902 pgd_t *pgd;
903 unsigned long next;
904
905 if (details && !details->check_mapping && !details->nonlinear_vma)
906 details = NULL;
907
908 BUG_ON(addr >= end);
909 tlb_start_vma(tlb, vma);
910 pgd = pgd_offset(vma->vm_mm, addr);
911 do {
912 next = pgd_addr_end(addr, end);
913 if (pgd_none_or_clear_bad(pgd)) {
914 (*zap_work)--;
915 continue;
916 }
917 next = zap_pud_range(tlb, vma, pgd, addr, next,
918 zap_work, details);
919 } while (pgd++, addr = next, (addr != end && *zap_work > 0));
920 tlb_end_vma(tlb, vma);
921
922 return addr;
923}
924
/*
 * Work budget (in bytes of address space) that unmap_vmas() hands out
 * before it flushes the gather and considers rescheduling: 1024 pages
 * normally, only 8 pages when CONFIG_PREEMPT is set.
 */
#ifndef CONFIG_PREEMPT
# define ZAP_BLOCK_SIZE	(1024 * PAGE_SIZE)
#else
# define ZAP_BLOCK_SIZE	(8 * PAGE_SIZE)
#endif
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958unsigned long unmap_vmas(struct mmu_gather **tlbp,
959 struct vm_area_struct *vma, unsigned long start_addr,
960 unsigned long end_addr, unsigned long *nr_accounted,
961 struct zap_details *details)
962{
963 long zap_work = ZAP_BLOCK_SIZE;
964 unsigned long tlb_start = 0;
965 int tlb_start_valid = 0;
966 unsigned long start = start_addr;
967 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
968 int fullmm = (*tlbp)->fullmm;
969 struct mm_struct *mm = vma->vm_mm;
970
971 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
972 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
973 unsigned long end;
974
975 start = max(vma->vm_start, start_addr);
976 if (start >= vma->vm_end)
977 continue;
978 end = min(vma->vm_end, end_addr);
979 if (end <= vma->vm_start)
980 continue;
981
982 if (vma->vm_flags & VM_ACCOUNT)
983 *nr_accounted += (end - start) >> PAGE_SHIFT;
984
985 if (unlikely(is_pfn_mapping(vma)))
986 untrack_pfn_vma(vma, 0, 0);
987
988 while (start != end) {
989 if (!tlb_start_valid) {
990 tlb_start = start;
991 tlb_start_valid = 1;
992 }
993
994 if (unlikely(is_vm_hugetlb_page(vma))) {
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006 if (vma->vm_file) {
1007 unmap_hugepage_range(vma, start, end, NULL);
1008 zap_work -= (end - start) /
1009 pages_per_huge_page(hstate_vma(vma));
1010 }
1011
1012 start = end;
1013 } else
1014 start = unmap_page_range(*tlbp, vma,
1015 start, end, &zap_work, details);
1016
1017 if (zap_work > 0) {
1018 BUG_ON(start != end);
1019 break;
1020 }
1021
1022 tlb_finish_mmu(*tlbp, tlb_start, start);
1023
1024 if (need_resched() ||
1025 (i_mmap_lock && spin_needbreak(i_mmap_lock))) {
1026 if (i_mmap_lock) {
1027 *tlbp = NULL;
1028 goto out;
1029 }
1030 cond_resched();
1031 }
1032
1033 *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
1034 tlb_start_valid = 0;
1035 zap_work = ZAP_BLOCK_SIZE;
1036 }
1037 }
1038out:
1039 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
1040 return start;
1041}
1042
1043
1044
1045
1046
1047
1048
1049
1050unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
1051 unsigned long size, struct zap_details *details)
1052{
1053 struct mm_struct *mm = vma->vm_mm;
1054 struct mmu_gather *tlb;
1055 unsigned long end = address + size;
1056 unsigned long nr_accounted = 0;
1057
1058 lru_add_drain();
1059 tlb = tlb_gather_mmu(mm, 0);
1060 update_hiwater_rss(mm);
1061 end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
1062 if (tlb)
1063 tlb_finish_mmu(tlb, address, end);
1064 return end;
1065}
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1080 unsigned long size)
1081{
1082 if (address < vma->vm_start || address + size > vma->vm_end ||
1083 !(vma->vm_flags & VM_PFNMAP))
1084 return -1;
1085 zap_page_range(vma, address, size, NULL);
1086 return 0;
1087}
1088EXPORT_SYMBOL_GPL(zap_vma_ptes);
1089
1090
1091
1092
1093struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1094 unsigned int flags)
1095{
1096 pgd_t *pgd;
1097 pud_t *pud;
1098 pmd_t *pmd;
1099 pte_t *ptep, pte;
1100 spinlock_t *ptl;
1101 struct page *page;
1102 struct mm_struct *mm = vma->vm_mm;
1103
1104 page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
1105 if (!IS_ERR(page)) {
1106 BUG_ON(flags & FOLL_GET);
1107 goto out;
1108 }
1109
1110 page = NULL;
1111 pgd = pgd_offset(mm, address);
1112 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
1113 goto no_page_table;
1114
1115 pud = pud_offset(pgd, address);
1116 if (pud_none(*pud))
1117 goto no_page_table;
1118 if (pud_huge(*pud)) {
1119 BUG_ON(flags & FOLL_GET);
1120 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
1121 goto out;
1122 }
1123 if (unlikely(pud_bad(*pud)))
1124 goto no_page_table;
1125
1126 pmd = pmd_offset(pud, address);
1127 if (pmd_none(*pmd))
1128 goto no_page_table;
1129 if (pmd_huge(*pmd)) {
1130 BUG_ON(flags & FOLL_GET);
1131 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
1132 goto out;
1133 }
1134 if (unlikely(pmd_bad(*pmd)))
1135 goto no_page_table;
1136
1137 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
1138
1139 pte = *ptep;
1140 if (!pte_present(pte))
1141 goto no_page;
1142 if ((flags & FOLL_WRITE) && !pte_write(pte))
1143 goto unlock;
1144 page = vm_normal_page(vma, address, pte);
1145 if (unlikely(!page))
1146 goto bad_page;
1147
1148 if (flags & FOLL_GET)
1149 get_page(page);
1150 if (flags & FOLL_TOUCH) {
1151 if ((flags & FOLL_WRITE) &&
1152 !pte_dirty(pte) && !PageDirty(page))
1153 set_page_dirty(page);
1154
1155
1156
1157
1158
1159 mark_page_accessed(page);
1160 }
1161unlock:
1162 pte_unmap_unlock(ptep, ptl);
1163out:
1164 return page;
1165
1166bad_page:
1167 pte_unmap_unlock(ptep, ptl);
1168 return ERR_PTR(-EFAULT);
1169
1170no_page:
1171 pte_unmap_unlock(ptep, ptl);
1172 if (!pte_none(pte))
1173 return page;
1174
1175no_page_table:
1176
1177
1178
1179
1180 if (flags & FOLL_ANON) {
1181 page = ZERO_PAGE(0);
1182 if (flags & FOLL_GET)
1183 get_page(page);
1184 BUG_ON(flags & FOLL_WRITE);
1185 }
1186 return page;
1187}
1188
1189
1190static inline int use_zero_page(struct vm_area_struct *vma)
1191{
1192
1193
1194
1195
1196
1197
1198
1199 if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
1200 return 0;
1201
1202
1203
1204 return !vma->vm_ops || !vma->vm_ops->fault;
1205}
1206
1207
1208
1209int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1210 unsigned long start, int len, int flags,
1211 struct page **pages, struct vm_area_struct **vmas)
1212{
1213 int i;
1214 unsigned int vm_flags = 0;
1215 int write = !!(flags & GUP_FLAGS_WRITE);
1216 int force = !!(flags & GUP_FLAGS_FORCE);
1217 int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
1218 int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL);
1219
1220 if (len <= 0)
1221 return 0;
1222
1223
1224
1225
1226 vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
1227 vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
1228 i = 0;
1229
1230 do {
1231 struct vm_area_struct *vma;
1232 unsigned int foll_flags;
1233
1234 vma = find_extend_vma(mm, start);
1235 if (!vma && in_gate_area(tsk, start)) {
1236 unsigned long pg = start & PAGE_MASK;
1237 struct vm_area_struct *gate_vma = get_gate_vma(tsk);
1238 pgd_t *pgd;
1239 pud_t *pud;
1240 pmd_t *pmd;
1241 pte_t *pte;
1242
1243
1244 if (!ignore && write)
1245 return i ? : -EFAULT;
1246 if (pg > TASK_SIZE)
1247 pgd = pgd_offset_k(pg);
1248 else
1249 pgd = pgd_offset_gate(mm, pg);
1250 BUG_ON(pgd_none(*pgd));
1251 pud = pud_offset(pgd, pg);
1252 BUG_ON(pud_none(*pud));
1253 pmd = pmd_offset(pud, pg);
1254 if (pmd_none(*pmd))
1255 return i ? : -EFAULT;
1256 pte = pte_offset_map(pmd, pg);
1257 if (pte_none(*pte)) {
1258 pte_unmap(pte);
1259 return i ? : -EFAULT;
1260 }
1261 if (pages) {
1262 struct page *page = vm_normal_page(gate_vma, start, *pte);
1263 pages[i] = page;
1264 if (page)
1265 get_page(page);
1266 }
1267 pte_unmap(pte);
1268 if (vmas)
1269 vmas[i] = gate_vma;
1270 i++;
1271 start += PAGE_SIZE;
1272 len--;
1273 continue;
1274 }
1275
1276 if (!vma ||
1277 (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
1278 (!ignore && !(vm_flags & vma->vm_flags)))
1279 return i ? : -EFAULT;
1280
1281 if (is_vm_hugetlb_page(vma)) {
1282 i = follow_hugetlb_page(mm, vma, pages, vmas,
1283 &start, &len, i, write);
1284 continue;
1285 }
1286
1287 foll_flags = FOLL_TOUCH;
1288 if (pages)
1289 foll_flags |= FOLL_GET;
1290 if (!write && use_zero_page(vma))
1291 foll_flags |= FOLL_ANON;
1292
1293 do {
1294 struct page *page;
1295
1296
1297
1298
1299
1300
1301
1302
1303 if (unlikely(!ignore_sigkill &&
1304 fatal_signal_pending(current)))
1305 return i ? i : -ERESTARTSYS;
1306
1307 if (write)
1308 foll_flags |= FOLL_WRITE;
1309
1310 cond_resched();
1311 while (!(page = follow_page(vma, start, foll_flags))) {
1312 int ret;
1313 ret = handle_mm_fault(mm, vma, start,
1314 foll_flags & FOLL_WRITE);
1315 if (ret & VM_FAULT_ERROR) {
1316 if (ret & VM_FAULT_OOM)
1317 return i ? i : -ENOMEM;
1318 else if (ret & VM_FAULT_SIGBUS)
1319 return i ? i : -EFAULT;
1320 BUG();
1321 }
1322 if (ret & VM_FAULT_MAJOR)
1323 tsk->maj_flt++;
1324 else
1325 tsk->min_flt++;
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339 if ((ret & VM_FAULT_WRITE) &&
1340 !(vma->vm_flags & VM_WRITE))
1341 foll_flags &= ~FOLL_WRITE;
1342
1343 cond_resched();
1344 }
1345 if (IS_ERR(page))
1346 return i ? i : PTR_ERR(page);
1347 if (pages) {
1348 pages[i] = page;
1349
1350 flush_anon_page(vma, page, start);
1351 flush_dcache_page(page);
1352 }
1353 if (vmas)
1354 vmas[i] = vma;
1355 i++;
1356 start += PAGE_SIZE;
1357 len--;
1358 } while (len && start < vma->vm_end);
1359 } while (len);
1360 return i;
1361}
1362
1363int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1364 unsigned long start, int len, int write, int force,
1365 struct page **pages, struct vm_area_struct **vmas)
1366{
1367 int flags = 0;
1368
1369 if (write)
1370 flags |= GUP_FLAGS_WRITE;
1371 if (force)
1372 flags |= GUP_FLAGS_FORCE;
1373
1374 return __get_user_pages(tsk, mm,
1375 start, len, flags,
1376 pages, vmas);
1377}
1378
1379EXPORT_SYMBOL(get_user_pages);
1380
1381pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
1382 spinlock_t **ptl)
1383{
1384 pgd_t * pgd = pgd_offset(mm, addr);
1385 pud_t * pud = pud_alloc(mm, pgd, addr);
1386 if (pud) {
1387 pmd_t * pmd = pmd_alloc(mm, pud, addr);
1388 if (pmd)
1389 return pte_alloc_map_lock(mm, pmd, addr, ptl);
1390 }
1391 return NULL;
1392}
1393
1394
1395
1396
1397
1398
1399
1400
1401static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1402 struct page *page, pgprot_t prot)
1403{
1404 struct mm_struct *mm = vma->vm_mm;
1405 int retval;
1406 pte_t *pte;
1407 spinlock_t *ptl;
1408
1409 retval = -EINVAL;
1410 if (PageAnon(page))
1411 goto out;
1412 retval = -ENOMEM;
1413 flush_dcache_page(page);
1414 pte = get_locked_pte(mm, addr, &ptl);
1415 if (!pte)
1416 goto out;
1417 retval = -EBUSY;
1418 if (!pte_none(*pte))
1419 goto out_unlock;
1420
1421
1422 get_page(page);
1423 inc_mm_counter(mm, file_rss);
1424 page_add_file_rmap(page);
1425 set_pte_at(mm, addr, pte, mk_pte(page, prot));
1426
1427 retval = 0;
1428 pte_unmap_unlock(pte, ptl);
1429 return retval;
1430out_unlock:
1431 pte_unmap_unlock(pte, ptl);
1432out:
1433 return retval;
1434}
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
1459 struct page *page)
1460{
1461 if (addr < vma->vm_start || addr >= vma->vm_end)
1462 return -EFAULT;
1463 if (!page_count(page))
1464 return -EINVAL;
1465 vma->vm_flags |= VM_INSERTPAGE;
1466 return insert_page(vma, addr, page, vma->vm_page_prot);
1467}
1468EXPORT_SYMBOL(vm_insert_page);
1469
1470static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1471 unsigned long pfn, pgprot_t prot)
1472{
1473 struct mm_struct *mm = vma->vm_mm;
1474 int retval;
1475 pte_t *pte, entry;
1476 spinlock_t *ptl;
1477
1478 retval = -ENOMEM;
1479 pte = get_locked_pte(mm, addr, &ptl);
1480 if (!pte)
1481 goto out;
1482 retval = -EBUSY;
1483 if (!pte_none(*pte))
1484 goto out_unlock;
1485
1486
1487 entry = pte_mkspecial(pfn_pte(pfn, prot));
1488 set_pte_at(mm, addr, pte, entry);
1489 update_mmu_cache(vma, addr, entry);
1490
1491 retval = 0;
1492out_unlock:
1493 pte_unmap_unlock(pte, ptl);
1494out:
1495 return retval;
1496}
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1516 unsigned long pfn)
1517{
1518 int ret;
1519 pgprot_t pgprot = vma->vm_page_prot;
1520
1521
1522
1523
1524
1525
1526 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
1527 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1528 (VM_PFNMAP|VM_MIXEDMAP));
1529 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1530 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
1531
1532 if (addr < vma->vm_start || addr >= vma->vm_end)
1533 return -EFAULT;
1534 if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE))
1535 return -EINVAL;
1536
1537 ret = insert_pfn(vma, addr, pfn, pgprot);
1538
1539 if (ret)
1540 untrack_pfn_vma(vma, pfn, PAGE_SIZE);
1541
1542 return ret;
1543}
1544EXPORT_SYMBOL(vm_insert_pfn);
1545
1546int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1547 unsigned long pfn)
1548{
1549 BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
1550
1551 if (addr < vma->vm_start || addr >= vma->vm_end)
1552 return -EFAULT;
1553
1554
1555
1556
1557
1558
1559
1560 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
1561 struct page *page;
1562
1563 page = pfn_to_page(pfn);
1564 return insert_page(vma, addr, page, vma->vm_page_prot);
1565 }
1566 return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
1567}
1568EXPORT_SYMBOL(vm_insert_mixed);
1569
1570
1571
1572
1573
1574
1575static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1576 unsigned long addr, unsigned long end,
1577 unsigned long pfn, pgprot_t prot)
1578{
1579 pte_t *pte;
1580 spinlock_t *ptl;
1581
1582 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1583 if (!pte)
1584 return -ENOMEM;
1585 arch_enter_lazy_mmu_mode();
1586 do {
1587 BUG_ON(!pte_none(*pte));
1588 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
1589 pfn++;
1590 } while (pte++, addr += PAGE_SIZE, addr != end);
1591 arch_leave_lazy_mmu_mode();
1592 pte_unmap_unlock(pte - 1, ptl);
1593 return 0;
1594}
1595
1596static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
1597 unsigned long addr, unsigned long end,
1598 unsigned long pfn, pgprot_t prot)
1599{
1600 pmd_t *pmd;
1601 unsigned long next;
1602
1603 pfn -= addr >> PAGE_SHIFT;
1604 pmd = pmd_alloc(mm, pud, addr);
1605 if (!pmd)
1606 return -ENOMEM;
1607 do {
1608 next = pmd_addr_end(addr, end);
1609 if (remap_pte_range(mm, pmd, addr, next,
1610 pfn + (addr >> PAGE_SHIFT), prot))
1611 return -ENOMEM;
1612 } while (pmd++, addr = next, addr != end);
1613 return 0;
1614}
1615
1616static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
1617 unsigned long addr, unsigned long end,
1618 unsigned long pfn, pgprot_t prot)
1619{
1620 pud_t *pud;
1621 unsigned long next;
1622
1623 pfn -= addr >> PAGE_SHIFT;
1624 pud = pud_alloc(mm, pgd, addr);
1625 if (!pud)
1626 return -ENOMEM;
1627 do {
1628 next = pud_addr_end(addr, end);
1629 if (remap_pmd_range(mm, pud, addr, next,
1630 pfn + (addr >> PAGE_SHIFT), prot))
1631 return -ENOMEM;
1632 } while (pud++, addr = next, addr != end);
1633 return 0;
1634}
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1647 unsigned long pfn, unsigned long size, pgprot_t prot)
1648{
1649 pgd_t *pgd;
1650 unsigned long next;
1651 unsigned long end = addr + PAGE_ALIGN(size);
1652 struct mm_struct *mm = vma->vm_mm;
1653 int err;
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673 if (addr == vma->vm_start && end == vma->vm_end) {
1674 vma->vm_pgoff = pfn;
1675 vma->vm_flags |= VM_PFN_AT_MMAP;
1676 } else if (is_cow_mapping(vma->vm_flags))
1677 return -EINVAL;
1678
1679 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
1680
1681 err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size));
1682 if (err) {
1683
1684
1685
1686
1687 vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP);
1688 vma->vm_flags &= ~VM_PFN_AT_MMAP;
1689 return -EINVAL;
1690 }
1691
1692 BUG_ON(addr >= end);
1693 pfn -= addr >> PAGE_SHIFT;
1694 pgd = pgd_offset(mm, addr);
1695 flush_cache_range(vma, addr, end);
1696 do {
1697 next = pgd_addr_end(addr, end);
1698 err = remap_pud_range(mm, pgd, addr, next,
1699 pfn + (addr >> PAGE_SHIFT), prot);
1700 if (err)
1701 break;
1702 } while (pgd++, addr = next, addr != end);
1703
1704 if (err)
1705 untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size));
1706
1707 return err;
1708}
1709EXPORT_SYMBOL(remap_pfn_range);
1710
1711static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
1712 unsigned long addr, unsigned long end,
1713 pte_fn_t fn, void *data)
1714{
1715 pte_t *pte;
1716 int err;
1717 pgtable_t token;
1718 spinlock_t *uninitialized_var(ptl);
1719
1720 pte = (mm == &init_mm) ?
1721 pte_alloc_kernel(pmd, addr) :
1722 pte_alloc_map_lock(mm, pmd, addr, &ptl);
1723 if (!pte)
1724 return -ENOMEM;
1725
1726 BUG_ON(pmd_huge(*pmd));
1727
1728 arch_enter_lazy_mmu_mode();
1729
1730 token = pmd_pgtable(*pmd);
1731
1732 do {
1733 err = fn(pte, token, addr, data);
1734 if (err)
1735 break;
1736 } while (pte++, addr += PAGE_SIZE, addr != end);
1737
1738 arch_leave_lazy_mmu_mode();
1739
1740 if (mm != &init_mm)
1741 pte_unmap_unlock(pte-1, ptl);
1742 return err;
1743}
1744
1745static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
1746 unsigned long addr, unsigned long end,
1747 pte_fn_t fn, void *data)
1748{
1749 pmd_t *pmd;
1750 unsigned long next;
1751 int err;
1752
1753 BUG_ON(pud_huge(*pud));
1754
1755 pmd = pmd_alloc(mm, pud, addr);
1756 if (!pmd)
1757 return -ENOMEM;
1758 do {
1759 next = pmd_addr_end(addr, end);
1760 err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
1761 if (err)
1762 break;
1763 } while (pmd++, addr = next, addr != end);
1764 return err;
1765}
1766
1767static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
1768 unsigned long addr, unsigned long end,
1769 pte_fn_t fn, void *data)
1770{
1771 pud_t *pud;
1772 unsigned long next;
1773 int err;
1774
1775 pud = pud_alloc(mm, pgd, addr);
1776 if (!pud)
1777 return -ENOMEM;
1778 do {
1779 next = pud_addr_end(addr, end);
1780 err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
1781 if (err)
1782 break;
1783 } while (pud++, addr = next, addr != end);
1784 return err;
1785}
1786
1787
1788
1789
1790
1791int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
1792 unsigned long size, pte_fn_t fn, void *data)
1793{
1794 pgd_t *pgd;
1795 unsigned long next;
1796 unsigned long start = addr, end = addr + size;
1797 int err;
1798
1799 BUG_ON(addr >= end);
1800 mmu_notifier_invalidate_range_start(mm, start, end);
1801 pgd = pgd_offset(mm, addr);
1802 do {
1803 next = pgd_addr_end(addr, end);
1804 err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
1805 if (err)
1806 break;
1807 } while (pgd++, addr = next, addr != end);
1808 mmu_notifier_invalidate_range_end(mm, start, end);
1809 return err;
1810}
1811EXPORT_SYMBOL_GPL(apply_to_page_range);
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
1823 pte_t *page_table, pte_t orig_pte)
1824{
1825 int same = 1;
1826#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
1827 if (sizeof(pte_t) > sizeof(unsigned long)) {
1828 spinlock_t *ptl = pte_lockptr(mm, pmd);
1829 spin_lock(ptl);
1830 same = pte_same(*page_table, orig_pte);
1831 spin_unlock(ptl);
1832 }
1833#endif
1834 pte_unmap(page_table);
1835 return same;
1836}
1837
1838
1839
1840
1841
1842
1843
1844static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
1845{
1846 if (likely(vma->vm_flags & VM_WRITE))
1847 pte = pte_mkwrite(pte);
1848 return pte;
1849}
1850
1851static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
1852{
1853
1854
1855
1856
1857
1858
1859 if (unlikely(!src)) {
1860 void *kaddr = kmap_atomic(dst, KM_USER0);
1861 void __user *uaddr = (void __user *)(va & PAGE_MASK);
1862
1863
1864
1865
1866
1867
1868
1869 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
1870 memset(kaddr, 0, PAGE_SIZE);
1871 kunmap_atomic(kaddr, KM_USER0);
1872 flush_dcache_page(dst);
1873 } else
1874 copy_user_highpage(dst, src, va, vma);
1875}
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
/*
 * do_wp_page - handle a write fault on a present, write-protected pte
 * @mm:         address space of the fault
 * @vma:        vma containing @address
 * @address:    faulting user address
 * @page_table: the pte; mapped and locked (with @ptl) on entry
 * @pmd:        pmd the pte lives under (used to re-take the pte lock)
 * @ptl:        pte lock held on entry
 * @orig_pte:   pte value the caller observed
 *
 * Either makes the existing pte writable in place ("reuse") or allocates a
 * new page, copies the old contents into it and installs that instead.
 * Every return path has dropped the pte lock and unmapped the pte.
 *
 * Returns a VM_FAULT_* mask: VM_FAULT_WRITE is set when the pte was made
 * writable, VM_FAULT_OOM on allocation/charge failure, 0 when the pte changed
 * under us, or whatever error ->page_mkwrite() reported.
 */
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		spinlock_t *ptl, pte_t orig_pte)
{
	struct page *old_page, *new_page;
	pte_t entry;
	int reuse = 0, ret = 0;
	int page_mkwrite = 0;
	struct page *dirty_page = NULL;

	old_page = vm_normal_page(vma, address, orig_pte);
	if (!old_page) {
		/*
		 * No struct page behind this pte.  A shared writable mapping
		 * can simply have the pte made writable; anything else gets a
		 * private copy (cow_user_page() copes with a NULL source).
		 */
		if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
				     (VM_WRITE|VM_SHARED))
			goto reuse;
		goto gotten;
	}

	/*
	 * Anonymous page: ask reuse_swap_page() (under the page lock) whether
	 * the page can be written in place.
	 */
	if (PageAnon(old_page)) {
		if (!trylock_page(old_page)) {
			/*
			 * Could not lock without sleeping: pin the page, drop
			 * the pte lock, sleep on the page lock, then re-take
			 * the pte lock and make sure the pte did not change.
			 */
			page_cache_get(old_page);
			pte_unmap_unlock(page_table, ptl);
			lock_page(old_page);
			page_table = pte_offset_map_lock(mm, pmd, address,
							 &ptl);
			if (!pte_same(*page_table, orig_pte)) {
				unlock_page(old_page);
				page_cache_release(old_page);
				goto unlock;
			}
			page_cache_release(old_page);
		}
		reuse = reuse_swap_page(old_page);
		unlock_page(old_page);
	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
					(VM_WRITE|VM_SHARED))) {
		/*
		 * Shared writable file page: always written in place, but the
		 * owner gets a ->page_mkwrite() notification first if it has
		 * asked for one.
		 */
		if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
			struct vm_fault vmf;
			int tmp;

			vmf.virtual_address = (void __user *)(address &
								PAGE_MASK);
			vmf.pgoff = old_page->index;
			vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
			vmf.page = old_page;

			/*
			 * The callback is invoked without the pte lock, so
			 * hold a page reference across it.  That reference is
			 * dropped at 'unwritable_page', on the pte-changed
			 * path below, or in the dirty_page handling at
			 * 'unlock'.
			 */
			page_cache_get(old_page);
			pte_unmap_unlock(page_table, ptl);

			tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
			if (unlikely(tmp &
					(VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
				ret = tmp;
				goto unwritable_page;
			}
			if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
				/* Callback left the page unlocked: lock it. */
				lock_page(old_page);
				if (!old_page->mapping) {
					/* Page lost its mapping: return 0. */
					ret = 0;
					unlock_page(old_page);
					goto unwritable_page;
				}
			} else
				VM_BUG_ON(!PageLocked(old_page));

			/*
			 * The pte lock was dropped above; re-take it and
			 * verify the pte is still the one we started with.
			 */
			page_table = pte_offset_map_lock(mm, pmd, address,
							 &ptl);
			if (!pte_same(*page_table, orig_pte)) {
				unlock_page(old_page);
				page_cache_release(old_page);
				goto unlock;
			}

			page_mkwrite = 1;
		}
		dirty_page = old_page;
		get_page(dirty_page);
		reuse = 1;
	}

	if (reuse) {
reuse:
		/* Write in place: mark young + dirty (+ writable if allowed). */
		flush_cache_page(vma, address, pte_pfn(orig_pte));
		entry = pte_mkyoung(orig_pte);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		if (ptep_set_access_flags(vma, address, page_table, entry,1))
			update_mmu_cache(vma, address, entry);
		ret |= VM_FAULT_WRITE;
		goto unlock;
	}

	/*
	 * Copy needed.  Pin the old page so it stays around while the pte
	 * lock is dropped for the allocation and copy.
	 */
	page_cache_get(old_page);
gotten:
	pte_unmap_unlock(page_table, ptl);

	if (unlikely(anon_vma_prepare(vma)))
		goto oom;
	VM_BUG_ON(old_page == ZERO_PAGE(0));
	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
	if (!new_page)
		goto oom;

	/*
	 * In a VM_LOCKED vma the old page has its mlock state cleared (under
	 * the page lock) before it is replaced.
	 */
	if ((vma->vm_flags & VM_LOCKED) && old_page) {
		lock_page(old_page);
		clear_page_mlock(old_page);
		unlock_page(old_page);
	}
	cow_user_page(new_page, old_page, address, vma);
	__SetPageUptodate(new_page);

	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
		goto oom_free_new;

	/*
	 * Re-take the pte lock and only install the copy if the pte is still
	 * what we copied from.
	 */
	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (likely(pte_same(*page_table, orig_pte))) {
		/* rss accounting: the mapping becomes an anonymous one. */
		if (old_page) {
			if (!PageAnon(old_page)) {
				dec_mm_counter(mm, file_rss);
				inc_mm_counter(mm, anon_rss);
			}
		} else
			inc_mm_counter(mm, anon_rss);
		flush_cache_page(vma, address, pte_pfn(orig_pte));
		entry = mk_pte(new_page, vma->vm_page_prot);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);

		/*
		 * The old pte is cleared and flushed before the new one is
		 * set, rather than being overwritten directly.
		 */
		ptep_clear_flush_notify(vma, address, page_table);
		page_add_new_anon_rmap(new_page, vma, address);
		set_pte_at(mm, address, page_table, entry);
		update_mmu_cache(vma, address, entry);
		if (old_page) {
			/*
			 * NOTE(review): the rmap of the old page is removed
			 * only after the new pte is visible; the ordering
			 * looks deliberate - confirm before reordering.
			 */
			page_remove_rmap(old_page);
		}

		/*
		 * The new page is now owned by the pte.  Point new_page at
		 * the old page so the release below drops the old page's
		 * mapping reference instead.
		 */
		new_page = old_page;
		ret |= VM_FAULT_WRITE;
	} else
		mem_cgroup_uncharge_page(new_page);

	/* Drop the unused copy (or old mapping ref) and our pin on old_page. */
	if (new_page)
		page_cache_release(new_page);
	if (old_page)
		page_cache_release(old_page);
unlock:
	pte_unmap_unlock(page_table, ptl);
	if (dirty_page) {
		/*
		 * Shared file page was made writable.  Without ->page_mkwrite
		 * the page is not locked by us: wait for any holder and then
		 * dirty + balance.  With ->page_mkwrite we still hold the
		 * page lock and the extra reference taken before the callback.
		 */
		if (!page_mkwrite) {
			wait_on_page_locked(dirty_page);
			set_page_dirty_balance(dirty_page, page_mkwrite);
		}
		put_page(dirty_page);
		if (page_mkwrite) {
			/* Sample ->mapping while the page is still locked. */
			struct address_space *mapping = dirty_page->mapping;

			set_page_dirty(dirty_page);
			unlock_page(dirty_page);
			page_cache_release(dirty_page);
			if (mapping) {
				/* Throttle only after the page is unlocked. */
				balance_dirty_pages_ratelimited(mapping);
			}
		}

		/* Timestamp update happens after the page lock is released. */
		if (vma->vm_file)
			file_update_time(vma->vm_file);
	}
	return ret;
oom_free_new:
	page_cache_release(new_page);
oom:
	if (old_page) {
		if (page_mkwrite) {
			unlock_page(old_page);
			page_cache_release(old_page);
		}
		page_cache_release(old_page);
	}
	return VM_FAULT_OOM;

unwritable_page:
	/* Drop the reference taken before calling ->page_mkwrite(). */
	page_cache_release(old_page);
	return ret;
}
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
/*
 * True when @truncate_count has no bits set below the page boundary, i.e. it
 * looks like a page-aligned address.  vma->vm_truncate_count holds either a
 * truncate generation number or a page-aligned restart address (see
 * unmap_mapping_range_vma()); unmap_mapping_range() skips generation numbers
 * for which this test is true so the two cannot be confused.
 */
#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK))
2191
2192static void reset_vma_truncate_counts(struct address_space *mapping)
2193{
2194 struct vm_area_struct *vma;
2195 struct prio_tree_iter iter;
2196
2197 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
2198 vma->vm_truncate_count = 0;
2199 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
2200 vma->vm_truncate_count = 0;
2201}
2202
2203static int unmap_mapping_range_vma(struct vm_area_struct *vma,
2204 unsigned long start_addr, unsigned long end_addr,
2205 struct zap_details *details)
2206{
2207 unsigned long restart_addr;
2208 int need_break;
2209
2210
2211
2212
2213
2214
2215
2216
2217again:
2218 restart_addr = vma->vm_truncate_count;
2219 if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
2220 start_addr = restart_addr;
2221 if (start_addr >= end_addr) {
2222
2223 vma->vm_truncate_count = details->truncate_count;
2224 return 0;
2225 }
2226 }
2227
2228 restart_addr = zap_page_range(vma, start_addr,
2229 end_addr - start_addr, details);
2230 need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
2231
2232 if (restart_addr >= end_addr) {
2233
2234 vma->vm_truncate_count = details->truncate_count;
2235 if (!need_break)
2236 return 0;
2237 } else {
2238
2239 vma->vm_truncate_count = restart_addr;
2240 if (!need_break)
2241 goto again;
2242 }
2243
2244 spin_unlock(details->i_mmap_lock);
2245 cond_resched();
2246 spin_lock(details->i_mmap_lock);
2247 return -EINTR;
2248}
2249
2250static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
2251 struct zap_details *details)
2252{
2253 struct vm_area_struct *vma;
2254 struct prio_tree_iter iter;
2255 pgoff_t vba, vea, zba, zea;
2256
2257restart:
2258 vma_prio_tree_foreach(vma, &iter, root,
2259 details->first_index, details->last_index) {
2260
2261 if (vma->vm_truncate_count == details->truncate_count)
2262 continue;
2263
2264 vba = vma->vm_pgoff;
2265 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
2266
2267 zba = details->first_index;
2268 if (zba < vba)
2269 zba = vba;
2270 zea = details->last_index;
2271 if (zea > vea)
2272 zea = vea;
2273
2274 if (unmap_mapping_range_vma(vma,
2275 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
2276 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
2277 details) < 0)
2278 goto restart;
2279 }
2280}
2281
2282static inline void unmap_mapping_range_list(struct list_head *head,
2283 struct zap_details *details)
2284{
2285 struct vm_area_struct *vma;
2286
2287
2288
2289
2290
2291
2292
2293restart:
2294 list_for_each_entry(vma, head, shared.vm_set.list) {
2295
2296 if (vma->vm_truncate_count == details->truncate_count)
2297 continue;
2298 details->nonlinear_vma = vma;
2299 if (unmap_mapping_range_vma(vma, vma->vm_start,
2300 vma->vm_end, details) < 0)
2301 goto restart;
2302 }
2303}
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319void unmap_mapping_range(struct address_space *mapping,
2320 loff_t const holebegin, loff_t const holelen, int even_cows)
2321{
2322 struct zap_details details;
2323 pgoff_t hba = holebegin >> PAGE_SHIFT;
2324 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2325
2326
2327 if (sizeof(holelen) > sizeof(hlen)) {
2328 long long holeend =
2329 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2330 if (holeend & ~(long long)ULONG_MAX)
2331 hlen = ULONG_MAX - hba + 1;
2332 }
2333
2334 details.check_mapping = even_cows? NULL: mapping;
2335 details.nonlinear_vma = NULL;
2336 details.first_index = hba;
2337 details.last_index = hba + hlen - 1;
2338 if (details.last_index < details.first_index)
2339 details.last_index = ULONG_MAX;
2340 details.i_mmap_lock = &mapping->i_mmap_lock;
2341
2342 spin_lock(&mapping->i_mmap_lock);
2343
2344
2345 mapping->truncate_count++;
2346 if (unlikely(is_restart_addr(mapping->truncate_count))) {
2347 if (mapping->truncate_count == 0)
2348 reset_vma_truncate_counts(mapping);
2349 mapping->truncate_count++;
2350 }
2351 details.truncate_count = mapping->truncate_count;
2352
2353 if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
2354 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2355 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
2356 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
2357 spin_unlock(&mapping->i_mmap_lock);
2358}
2359EXPORT_SYMBOL(unmap_mapping_range);
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370int vmtruncate(struct inode * inode, loff_t offset)
2371{
2372 if (inode->i_size < offset) {
2373 unsigned long limit;
2374
2375 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
2376 if (limit != RLIM_INFINITY && offset > limit)
2377 goto out_sig;
2378 if (offset > inode->i_sb->s_maxbytes)
2379 goto out_big;
2380 i_size_write(inode, offset);
2381 } else {
2382 struct address_space *mapping = inode->i_mapping;
2383
2384
2385
2386
2387
2388
2389 if (IS_SWAPFILE(inode))
2390 return -ETXTBSY;
2391 i_size_write(inode, offset);
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
2403 truncate_inode_pages(mapping, offset);
2404 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
2405 }
2406
2407 if (inode->i_op->truncate)
2408 inode->i_op->truncate(inode);
2409 return 0;
2410
2411out_sig:
2412 send_sig(SIGXFSZ, current, 0);
2413out_big:
2414 return -EFBIG;
2415}
2416EXPORT_SYMBOL(vmtruncate);
2417
2418int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
2419{
2420 struct address_space *mapping = inode->i_mapping;
2421
2422
2423
2424
2425
2426
2427 if (!inode->i_op->truncate_range)
2428 return -ENOSYS;
2429
2430 mutex_lock(&inode->i_mutex);
2431 down_write(&inode->i_alloc_sem);
2432 unmap_mapping_range(mapping, offset, (end - offset), 1);
2433 truncate_inode_pages_range(mapping, offset, end);
2434 unmap_mapping_range(mapping, offset, (end - offset), 1);
2435 inode->i_op->truncate_range(inode, offset, end);
2436 up_write(&inode->i_alloc_sem);
2437 mutex_unlock(&inode->i_mutex);
2438
2439 return 0;
2440}
2441
2442
2443
2444
2445
2446
/*
 * do_swap_page - handle a fault on a non-present, non-file (swap) pte
 * @mm:           address space of the fault
 * @vma:          vma containing @address
 * @address:      faulting user address
 * @page_table:   the pte; mapped but not locked on entry (it is unmapped by
 *                pte_unmap_same() right away)
 * @pmd:          pmd the pte lives under
 * @write_access: non-zero for a write fault
 * @orig_pte:     pte value the caller observed
 *
 * Looks the page up in the swap cache (reading it in if necessary), maps it
 * at @address and, for a write fault that cannot reuse the page in place,
 * chains into do_wp_page().
 *
 * Returns a VM_FAULT_* mask: 0 or VM_FAULT_MAJOR on success (possibly with
 * bits from do_wp_page()), VM_FAULT_OOM or VM_FAULT_SIGBUS on failure.
 */
static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		int write_access, pte_t orig_pte)
{
	spinlock_t *ptl;
	struct page *page;
	swp_entry_t entry;
	pte_t pte;
	struct mem_cgroup *ptr = NULL;
	int ret = 0;

	/* Pte already changed: nothing to do, let the fault be retried. */
	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
		goto out;

	entry = pte_to_swp_entry(orig_pte);
	if (is_migration_entry(entry)) {
		/* Migration entry: wait for it, then return. */
		migration_entry_wait(mm, pmd, address);
		goto out;
	}
	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
	page = lookup_swap_cache(entry);
	if (!page) {
		grab_swap_token();
		page = swapin_readahead(entry,
					GFP_HIGHUSER_MOVABLE, vma, address);
		if (!page) {
			/*
			 * Read-in produced no page.  Only report OOM if the
			 * pte is still the swap entry we started from;
			 * otherwise somebody else changed it meanwhile.
			 */
			page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
			if (likely(pte_same(*page_table, orig_pte)))
				ret = VM_FAULT_OOM;
			delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
			goto unlock;
		}

		/* Page was not in the swap cache: count a major fault. */
		ret = VM_FAULT_MAJOR;
		count_vm_event(PGMAJFAULT);
	}

	lock_page(page);
	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);

	if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
		ret = VM_FAULT_OOM;
		goto out_page;
	}

	/*
	 * Take the pte lock and re-check: the pte may have changed while the
	 * page was being looked up, read in or locked.
	 */
	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (unlikely(!pte_same(*page_table, orig_pte)))
		goto out_nomap;

	if (unlikely(!PageUptodate(page))) {
		ret = VM_FAULT_SIGBUS;
		goto out_nomap;
	}

	/*
	 * Install the pte.  For a write fault where reuse_swap_page() allows
	 * it, the pte is made dirty/writable right here and the fault is then
	 * treated as fully handled (write_access cleared).
	 * NOTE(review): the pte and rmap are set up before swap_free(); the
	 * ordering looks deliberate - confirm before rearranging.
	 */
	inc_mm_counter(mm, anon_rss);
	pte = mk_pte(page, vma->vm_page_prot);
	if (write_access && reuse_swap_page(page)) {
		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
		write_access = 0;
	}
	flush_icache_page(vma, page);
	set_pte_at(mm, address, page_table, pte);
	page_add_anon_rmap(page, vma, address);
	/* The charge tried above becomes final now that the page is mapped. */
	mem_cgroup_commit_charge_swapin(page, ptr);

	swap_free(entry);
	if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
		try_to_free_swap(page);
	unlock_page(page);

	if (write_access) {
		/*
		 * Still a write fault on a read-only pte: do_wp_page() takes
		 * over, including dropping the pte lock.  If it reports an
		 * error, return only the error bits.
		 */
		ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
		if (ret & VM_FAULT_ERROR)
			ret &= VM_FAULT_ERROR;
		goto out;
	}

	update_mmu_cache(vma, address, pte);
unlock:
	pte_unmap_unlock(page_table, ptl);
out:
	return ret;
out_nomap:
	/* Not mapping the page after all: cancel the pending charge. */
	mem_cgroup_cancel_charge_swapin(ptr);
	pte_unmap_unlock(page_table, ptl);
out_page:
	unlock_page(page);
	page_cache_release(page);
	return ret;
}
2561
2562
2563
2564
2565
2566
2567static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2568 unsigned long address, pte_t *page_table, pmd_t *pmd,
2569 int write_access)
2570{
2571 struct page *page;
2572 spinlock_t *ptl;
2573 pte_t entry;
2574
2575
2576 pte_unmap(page_table);
2577
2578 if (unlikely(anon_vma_prepare(vma)))
2579 goto oom;
2580 page = alloc_zeroed_user_highpage_movable(vma, address);
2581 if (!page)
2582 goto oom;
2583 __SetPageUptodate(page);
2584
2585 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
2586 goto oom_free_page;
2587
2588 entry = mk_pte(page, vma->vm_page_prot);
2589 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2590
2591 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2592 if (!pte_none(*page_table))
2593 goto release;
2594 inc_mm_counter(mm, anon_rss);
2595 page_add_new_anon_rmap(page, vma, address);
2596 set_pte_at(mm, address, page_table, entry);
2597
2598
2599 update_mmu_cache(vma, address, entry);
2600unlock:
2601 pte_unmap_unlock(page_table, ptl);
2602 return 0;
2603release:
2604 mem_cgroup_uncharge_page(page);
2605 page_cache_release(page);
2606 goto unlock;
2607oom_free_page:
2608 page_cache_release(page);
2609oom:
2610 return VM_FAULT_OOM;
2611}
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
/*
 * __do_fault - fault in a page through the vma's ->fault() handler
 * @mm:       address space of the fault
 * @vma:      vma containing @address (must have vm_ops->fault)
 * @address:  faulting user address
 * @pmd:      pmd under which the pte lives
 * @pgoff:    page offset passed to the handler
 * @flags:    FAULT_FLAG_* bits (FAULT_FLAG_WRITE for a write fault)
 * @orig_pte: pte value the caller observed; the page is only installed if
 *            the pte still has this value
 *
 * For a write fault on a private mapping the handler's page is copied into a
 * newly allocated anonymous page.  For a write fault on a shared mapping the
 * handler's page is mapped writable, after ->page_mkwrite() (if provided) has
 * agreed.  Read faults map the handler's page as is.
 *
 * Returns the VM_FAULT_* value from the handler, VM_FAULT_OOM on allocation
 * or charge failure, or the ->page_mkwrite() error.
 */
static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmd,
		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
	pte_t *page_table;
	spinlock_t *ptl;
	struct page *page;
	pte_t entry;
	int anon = 0;
	int charged = 0;
	struct page *dirty_page = NULL;
	struct vm_fault vmf;
	int ret;
	int page_mkwrite = 0;

	vmf.virtual_address = (void __user *)(address & PAGE_MASK);
	vmf.pgoff = pgoff;
	vmf.flags = flags;
	vmf.page = NULL;

	ret = vma->vm_ops->fault(vma, &vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
		return ret;

	/*
	 * From here on vmf.page is kept locked: lock it ourselves unless the
	 * handler says (VM_FAULT_LOCKED) that it already did.
	 */
	if (unlikely(!(ret & VM_FAULT_LOCKED)))
		lock_page(vmf.page);
	else
		VM_BUG_ON(!PageLocked(vmf.page));

	/*
	 * 'page' is what ends up in the pte: vmf.page itself, or a private
	 * copy of it for a write fault on a non-shared mapping.
	 */
	page = vmf.page;
	if (flags & FAULT_FLAG_WRITE) {
		if (!(vma->vm_flags & VM_SHARED)) {
			/* Private write fault: break COW right away. */
			anon = 1;
			if (unlikely(anon_vma_prepare(vma))) {
				ret = VM_FAULT_OOM;
				goto out;
			}
			page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
						vma, address);
			if (!page) {
				ret = VM_FAULT_OOM;
				goto out;
			}
			if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
				ret = VM_FAULT_OOM;
				page_cache_release(page);
				goto out;
			}
			charged = 1;
			/*
			 * In a VM_LOCKED vma the source page has its mlock
			 * state cleared before being copied from.
			 */
			if (vma->vm_flags & VM_LOCKED)
				clear_page_mlock(vmf.page);
			copy_user_highpage(page, vmf.page, address, vma);
			__SetPageUptodate(page);
		} else {
			/*
			 * Shared write fault: give the owner a chance to
			 * prepare for (or veto) the page becoming writable.
			 */
			if (vma->vm_ops->page_mkwrite) {
				int tmp;

				/* The callback is entered with the page unlocked. */
				unlock_page(page);
				vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
				tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
				if (unlikely(tmp &
					  (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
					ret = tmp;
					goto unwritable_page;
				}
				if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
					lock_page(page);
					if (!page->mapping) {
						/* Page lost its mapping: return 0. */
						ret = 0;
						unlock_page(page);
						goto unwritable_page;
					}
				} else
					VM_BUG_ON(!PageLocked(page));
				page_mkwrite = 1;
			}
		}

	}

	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);

	/*
	 * Only install the page if the pte is still what the caller saw;
	 * otherwise someone else handled the fault and our page is dropped.
	 */
	if (likely(pte_same(*page_table, orig_pte))) {
		flush_icache_page(vma, page);
		entry = mk_pte(page, vma->vm_page_prot);
		if (flags & FAULT_FLAG_WRITE)
			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		if (anon) {
			inc_mm_counter(mm, anon_rss);
			page_add_new_anon_rmap(page, vma, address);
		} else {
			inc_mm_counter(mm, file_rss);
			page_add_file_rmap(page);
			if (flags & FAULT_FLAG_WRITE) {
				/* Dirtied after the pte lock is dropped. */
				dirty_page = page;
				get_page(dirty_page);
			}
		}
		set_pte_at(mm, address, page_table, entry);

		update_mmu_cache(vma, address, entry);
	} else {
		if (charged)
			mem_cgroup_uncharge_page(page);
		if (anon)
			page_cache_release(page);
		else
			anon = 1; /* so that vmf.page is released at 'out' */
	}

	pte_unmap_unlock(page_table, ptl);

out:
	if (dirty_page) {
		/* Sample ->mapping while the page is still locked. */
		struct address_space *mapping = page->mapping;

		if (set_page_dirty(dirty_page))
			page_mkwrite = 1;
		unlock_page(dirty_page);
		put_page(dirty_page);
		if (page_mkwrite && mapping) {
			/* Throttle only after the page has been unlocked. */
			balance_dirty_pages_ratelimited(mapping);
		}

		/* Timestamp update happens after the page lock is released. */
		if (vma->vm_file)
			file_update_time(vma->vm_file);
	} else {
		unlock_page(vmf.page);
		/*
		 * 'anon' set here means the handler's page did not end up in
		 * the pte (copied from, or lost the race): drop its reference.
		 */
		if (anon)
			page_cache_release(vmf.page);
	}

	return ret;

unwritable_page:
	page_cache_release(page);
	return ret;
}
2797
2798static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2799 unsigned long address, pte_t *page_table, pmd_t *pmd,
2800 int write_access, pte_t orig_pte)
2801{
2802 pgoff_t pgoff = (((address & PAGE_MASK)
2803 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2804 unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
2805
2806 pte_unmap(page_table);
2807 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
2808}
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2820 unsigned long address, pte_t *page_table, pmd_t *pmd,
2821 int write_access, pte_t orig_pte)
2822{
2823 unsigned int flags = FAULT_FLAG_NONLINEAR |
2824 (write_access ? FAULT_FLAG_WRITE : 0);
2825 pgoff_t pgoff;
2826
2827 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
2828 return 0;
2829
2830 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
2831
2832
2833
2834 print_bad_pte(vma, address, orig_pte, NULL);
2835 return VM_FAULT_OOM;
2836 }
2837
2838 pgoff = pte_to_pgoff(orig_pte);
2839 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
2840}
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855static inline int handle_pte_fault(struct mm_struct *mm,
2856 struct vm_area_struct *vma, unsigned long address,
2857 pte_t *pte, pmd_t *pmd, int write_access)
2858{
2859 pte_t entry;
2860 spinlock_t *ptl;
2861
2862 entry = *pte;
2863 if (!pte_present(entry)) {
2864 if (pte_none(entry)) {
2865 if (vma->vm_ops) {
2866 if (likely(vma->vm_ops->fault))
2867 return do_linear_fault(mm, vma, address,
2868 pte, pmd, write_access, entry);
2869 }
2870 return do_anonymous_page(mm, vma, address,
2871 pte, pmd, write_access);
2872 }
2873 if (pte_file(entry))
2874 return do_nonlinear_fault(mm, vma, address,
2875 pte, pmd, write_access, entry);
2876 return do_swap_page(mm, vma, address,
2877 pte, pmd, write_access, entry);
2878 }
2879
2880 ptl = pte_lockptr(mm, pmd);
2881 spin_lock(ptl);
2882 if (unlikely(!pte_same(*pte, entry)))
2883 goto unlock;
2884 if (write_access) {
2885 if (!pte_write(entry))
2886 return do_wp_page(mm, vma, address,
2887 pte, pmd, ptl, entry);
2888 entry = pte_mkdirty(entry);
2889 }
2890 entry = pte_mkyoung(entry);
2891 if (ptep_set_access_flags(vma, address, pte, entry, write_access)) {
2892 update_mmu_cache(vma, address, entry);
2893 } else {
2894
2895
2896
2897
2898
2899
2900 if (write_access)
2901 flush_tlb_page(vma, address);
2902 }
2903unlock:
2904 pte_unmap_unlock(pte, ptl);
2905 return 0;
2906}
2907
2908
2909
2910
2911int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2912 unsigned long address, int write_access)
2913{
2914 pgd_t *pgd;
2915 pud_t *pud;
2916 pmd_t *pmd;
2917 pte_t *pte;
2918
2919 __set_current_state(TASK_RUNNING);
2920
2921 count_vm_event(PGFAULT);
2922
2923 if (unlikely(is_vm_hugetlb_page(vma)))
2924 return hugetlb_fault(mm, vma, address, write_access);
2925
2926 pgd = pgd_offset(mm, address);
2927 pud = pud_alloc(mm, pgd, address);
2928 if (!pud)
2929 return VM_FAULT_OOM;
2930 pmd = pmd_alloc(mm, pud, address);
2931 if (!pmd)
2932 return VM_FAULT_OOM;
2933 pte = pte_alloc_map(mm, pmd, address);
2934 if (!pte)
2935 return VM_FAULT_OOM;
2936
2937 return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
2938}
2939
2940#ifndef __PAGETABLE_PUD_FOLDED
2941
2942
2943
2944
2945int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
2946{
2947 pud_t *new = pud_alloc_one(mm, address);
2948 if (!new)
2949 return -ENOMEM;
2950
2951 smp_wmb();
2952
2953 spin_lock(&mm->page_table_lock);
2954 if (pgd_present(*pgd))
2955 pud_free(mm, new);
2956 else
2957 pgd_populate(mm, pgd, new);
2958 spin_unlock(&mm->page_table_lock);
2959 return 0;
2960}
2961#endif
2962
2963#ifndef __PAGETABLE_PMD_FOLDED
2964
2965
2966
2967
2968int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
2969{
2970 pmd_t *new = pmd_alloc_one(mm, address);
2971 if (!new)
2972 return -ENOMEM;
2973
2974 smp_wmb();
2975
2976 spin_lock(&mm->page_table_lock);
2977#ifndef __ARCH_HAS_4LEVEL_HACK
2978 if (pud_present(*pud))
2979 pmd_free(mm, new);
2980 else
2981 pud_populate(mm, pud, new);
2982#else
2983 if (pgd_present(*pud))
2984 pmd_free(mm, new);
2985 else
2986 pgd_populate(mm, pud, new);
2987#endif
2988 spin_unlock(&mm->page_table_lock);
2989 return 0;
2990}
2991#endif
2992
2993int make_pages_present(unsigned long addr, unsigned long end)
2994{
2995 int ret, len, write;
2996 struct vm_area_struct * vma;
2997
2998 vma = find_vma(current->mm, addr);
2999 if (!vma)
3000 return -ENOMEM;
3001 write = (vma->vm_flags & VM_WRITE) != 0;
3002 BUG_ON(addr >= end);
3003 BUG_ON(end > vma->vm_end);
3004 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
3005 ret = get_user_pages(current, current->mm, addr,
3006 len, write, 0, NULL, NULL);
3007 if (ret < 0)
3008 return ret;
3009 return ret == len ? 0 : -EFAULT;
3010}
3011
3012#if !defined(__HAVE_ARCH_GATE_AREA)
3013
3014#if defined(AT_SYSINFO_EHDR)
3015static struct vm_area_struct gate_vma;
3016
3017static int __init gate_vma_init(void)
3018{
3019 gate_vma.vm_mm = NULL;
3020 gate_vma.vm_start = FIXADDR_USER_START;
3021 gate_vma.vm_end = FIXADDR_USER_END;
3022 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
3023 gate_vma.vm_page_prot = __P101;
3024
3025
3026
3027
3028
3029
3030 gate_vma.vm_flags |= VM_ALWAYSDUMP;
3031 return 0;
3032}
3033__initcall(gate_vma_init);
3034#endif
3035
3036struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
3037{
3038#ifdef AT_SYSINFO_EHDR
3039 return &gate_vma;
3040#else
3041 return NULL;
3042#endif
3043}
3044
/*
 * Return 1 if @addr lies in [FIXADDR_USER_START, FIXADDR_USER_END), else 0.
 * Always 0 when AT_SYSINFO_EHDR is not defined.
 */
int in_gate_area_no_task(unsigned long addr)
{
	int inside = 0;

#ifdef AT_SYSINFO_EHDR
	if (addr >= FIXADDR_USER_START && addr < FIXADDR_USER_END)
		inside = 1;
#endif
	return inside;
}
3053
3054#endif
3055
3056#ifdef CONFIG_HAVE_IOREMAP_PROT
3057int follow_phys(struct vm_area_struct *vma,
3058 unsigned long address, unsigned int flags,
3059 unsigned long *prot, resource_size_t *phys)
3060{
3061 pgd_t *pgd;
3062 pud_t *pud;
3063 pmd_t *pmd;
3064 pte_t *ptep, pte;
3065 spinlock_t *ptl;
3066 resource_size_t phys_addr = 0;
3067 struct mm_struct *mm = vma->vm_mm;
3068 int ret = -EINVAL;
3069
3070 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3071 goto out;
3072
3073 pgd = pgd_offset(mm, address);
3074 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
3075 goto out;
3076
3077 pud = pud_offset(pgd, address);
3078 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
3079 goto out;
3080
3081 pmd = pmd_offset(pud, address);
3082 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
3083 goto out;
3084
3085
3086 if (pmd_huge(*pmd))
3087 goto out;
3088
3089 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
3090 if (!ptep)
3091 goto out;
3092
3093 pte = *ptep;
3094 if (!pte_present(pte))
3095 goto unlock;
3096 if ((flags & FOLL_WRITE) && !pte_write(pte))
3097 goto unlock;
3098 phys_addr = pte_pfn(pte);
3099 phys_addr <<= PAGE_SHIFT;
3100
3101 *prot = pgprot_val(pte_pgprot(pte));
3102 *phys = phys_addr;
3103 ret = 0;
3104
3105unlock:
3106 pte_unmap_unlock(ptep, ptl);
3107out:
3108 return ret;
3109}
3110
3111int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
3112 void *buf, int len, int write)
3113{
3114 resource_size_t phys_addr;
3115 unsigned long prot = 0;
3116 void __iomem *maddr;
3117 int offset = addr & (PAGE_SIZE-1);
3118
3119 if (follow_phys(vma, addr, write, &prot, &phys_addr))
3120 return -EINVAL;
3121
3122 maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
3123 if (write)
3124 memcpy_toio(maddr + offset, buf, len);
3125 else
3126 memcpy_fromio(buf, maddr + offset, len);
3127 iounmap(maddr);
3128
3129 return len;
3130}
3131#endif
3132
3133
3134
3135
3136
3137
3138int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
3139{
3140 struct mm_struct *mm;
3141 struct vm_area_struct *vma;
3142 void *old_buf = buf;
3143
3144 mm = get_task_mm(tsk);
3145 if (!mm)
3146 return 0;
3147
3148 down_read(&mm->mmap_sem);
3149
3150 while (len) {
3151 int bytes, ret, offset;
3152 void *maddr;
3153 struct page *page = NULL;
3154
3155 ret = get_user_pages(tsk, mm, addr, 1,
3156 write, 1, &page, &vma);
3157 if (ret <= 0) {
3158
3159
3160
3161
3162#ifdef CONFIG_HAVE_IOREMAP_PROT
3163 vma = find_vma(mm, addr);
3164 if (!vma)
3165 break;
3166 if (vma->vm_ops && vma->vm_ops->access)
3167 ret = vma->vm_ops->access(vma, addr, buf,
3168 len, write);
3169 if (ret <= 0)
3170#endif
3171 break;
3172 bytes = ret;
3173 } else {
3174 bytes = len;
3175 offset = addr & (PAGE_SIZE-1);
3176 if (bytes > PAGE_SIZE-offset)
3177 bytes = PAGE_SIZE-offset;
3178
3179 maddr = kmap(page);
3180 if (write) {
3181 copy_to_user_page(vma, page, addr,
3182 maddr + offset, buf, bytes);
3183 set_page_dirty_lock(page);
3184 } else {
3185 copy_from_user_page(vma, page, addr,
3186 buf, maddr + offset, bytes);
3187 }
3188 kunmap(page);
3189 page_cache_release(page);
3190 }
3191 len -= bytes;
3192 buf += bytes;
3193 addr += bytes;
3194 }
3195 up_read(&mm->mmap_sem);
3196 mmput(mm);
3197
3198 return buf - old_buf;
3199}
3200
3201
3202
3203
3204void print_vma_addr(char *prefix, unsigned long ip)
3205{
3206 struct mm_struct *mm = current->mm;
3207 struct vm_area_struct *vma;
3208
3209
3210
3211
3212
3213 if (preempt_count())
3214 return;
3215
3216 down_read(&mm->mmap_sem);
3217 vma = find_vma(mm, ip);
3218 if (vma && vma->vm_file) {
3219 struct file *f = vma->vm_file;
3220 char *buf = (char *)__get_free_page(GFP_KERNEL);
3221 if (buf) {
3222 char *p, *s;
3223
3224 p = d_path(&f->f_path, buf, PAGE_SIZE);
3225 if (IS_ERR(p))
3226 p = "?";
3227 s = strrchr(p, '/');
3228 if (s)
3229 p = s+1;
3230 printk("%s%s[%lx+%lx]", prefix, p,
3231 vma->vm_start,
3232 vma->vm_end - vma->vm_start);
3233 free_page((unsigned long)buf);
3234 }
3235 }
3236 up_read(¤t->mm->mmap_sem);
3237}
3238
3239#ifdef CONFIG_PROVE_LOCKING
3240void might_fault(void)
3241{
3242
3243
3244
3245
3246
3247
3248 if (segment_eq(get_fs(), KERNEL_DS))
3249 return;
3250
3251 might_sleep();
3252
3253
3254
3255
3256
3257 if (!in_atomic() && current->mm)
3258 might_lock_read(¤t->mm->mmap_sem);
3259}
3260EXPORT_SYMBOL(might_fault);
3261#endif
3262