1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41#include <linux/kernel_stat.h>
42#include <linux/mm.h>
43#include <linux/hugetlb.h>
44#include <linux/mman.h>
45#include <linux/swap.h>
46#include <linux/highmem.h>
47#include <linux/pagemap.h>
48#include <linux/ksm.h>
49#include <linux/rmap.h>
50#include <linux/export.h>
51#include <linux/delayacct.h>
52#include <linux/init.h>
53#include <linux/writeback.h>
54#include <linux/memcontrol.h>
55#include <linux/mmu_notifier.h>
56#include <linux/kallsyms.h>
57#include <linux/swapops.h>
58#include <linux/elf.h>
59#include <linux/gfp.h>
60
61#include <asm/io.h>
62#include <asm/pgalloc.h>
63#include <asm/uaccess.h>
64#include <asm/tlb.h>
65#include <asm/tlbflush.h>
66#include <asm/pgtable.h>
67
68#include "internal.h"
69
/*
 * Flat memory-map globals.  They are defined and exported from this file
 * only when CONFIG_NEED_MULTIPLE_NODES is not set.
 */
70#ifndef CONFIG_NEED_MULTIPLE_NODES
71
72unsigned long max_mapnr;
73struct page *mem_map;
74
75EXPORT_SYMBOL(max_mapnr);
76EXPORT_SYMBOL(mem_map);
77#endif
78
/* Exported global; defined here without an initialiser and set elsewhere. */
79unsigned long num_physpages;
80
81
82
83
84
85
86
/*
 * Exported global; defined here without an initialiser.
 * NOTE(review): presumably the end of directly mapped low memory - confirm
 * against the architecture setup code that assigns it.
 */
87void * high_memory;
88
89EXPORT_SYMBOL(num_physpages);
90EXPORT_SYMBOL(high_memory);
91
92
93
94
95
96
97
/*
 * Address-space randomisation level.  Defaults to 1 when CONFIG_COMPAT_BRK
 * is set and to 2 otherwise; the "norandmaps" boot parameter handler in this
 * file forces it to 0.
 * NOTE(review): the meaning of each level is defined by the users of this
 * variable, which are not visible here.
 */
98int randomize_va_space __read_mostly =
99#ifdef CONFIG_COMPAT_BRK
100 1;
101#else
102 2;
103#endif
104
/*
 * Boot-parameter handler for "norandmaps": turns randomisation off by
 * setting randomize_va_space to 0.  The argument string is ignored.
 * Always returns 1.
 */
105static int __init disable_randmaps(char *s)
106{
107 randomize_va_space = 0;
108 return 1;
109}
110__setup("norandmaps", disable_randmaps);
111
/* PFN of ZERO_PAGE(0); filled in by init_zero_pfn() at core_initcall time. */
112unsigned long zero_pfn __read_mostly;
/* Upper bound used by vm_normal_page() to reject PFNs with no struct page. */
113unsigned long highest_memmap_pfn __read_mostly;
114
115
116
117
/*
 * Cache the page frame number of ZERO_PAGE(0) in zero_pfn.
 * Registered as a core_initcall; always returns 0.
 */
118static int __init init_zero_pfn(void)
119{
120 zero_pfn = page_to_pfn(ZERO_PAGE(0));
121 return 0;
122}
123core_initcall(init_zero_pfn);
124
125
126#if defined(SPLIT_RSS_COUNTING)
127
128void sync_mm_rss(struct mm_struct *mm)
129{
130 int i;
131
132 for (i = 0; i < NR_MM_COUNTERS; i++) {
133 if (current->rss_stat.count[i]) {
134 add_mm_counter(mm, i, current->rss_stat.count[i]);
135 current->rss_stat.count[i] = 0;
136 }
137 }
138 current->rss_stat.events = 0;
139}
140
141static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
142{
143 struct task_struct *task = current;
144
145 if (likely(task->mm == mm))
146 task->rss_stat.count[member] += val;
147 else
148 add_mm_counter(mm, member, val);
149}
150#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
151#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
152
153
154#define TASK_RSS_EVENTS_THRESH (64)
155static void check_sync_rss_stat(struct task_struct *task)
156{
157 if (unlikely(task != current))
158 return;
159 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
160 sync_mm_rss(task->mm);
161}
162#else
163
/*
 * Without SPLIT_RSS_COUNTING the "fast" helpers are plain aliases of the
 * ordinary mm counter helpers.
 */
164#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
165#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
166
/* Nothing is cached per task in this configuration, so this is a no-op. */
167static void check_sync_rss_stat(struct task_struct *task)
168{
169}
170
171#endif
172
173#ifdef HAVE_GENERIC_MMU_GATHER
174
175static int tlb_next_batch(struct mmu_gather *tlb)
176{
177 struct mmu_gather_batch *batch;
178
179 batch = tlb->active;
180 if (batch->next) {
181 tlb->active = batch->next;
182 return 1;
183 }
184
185 batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
186 if (!batch)
187 return 0;
188
189 batch->next = NULL;
190 batch->nr = 0;
191 batch->max = MAX_GATHER_BATCH;
192
193 tlb->active->next = batch;
194 tlb->active = batch;
195
196 return 1;
197}
198
199
200
201
202
203
/*
 * Initialise @tlb for a teardown operation on @mm.
 * @fullmm is stored as-is in tlb->fullmm.  The embedded "local" batch
 * (backed by tlb->__pages) becomes the active batch, and fast mode is
 * selected when only one CPU is possible.
 */
204void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
205{
206 tlb->mm = mm;
207
208 tlb->fullmm = fullmm;
209 tlb->need_flush = 0;
 /* With a single possible CPU pages are freed immediately, not batched. */
210 tlb->fast_mode = (num_possible_cpus() == 1);
211 tlb->local.next = NULL;
212 tlb->local.nr = 0;
213 tlb->local.max = ARRAY_SIZE(tlb->__pages);
214 tlb->active = &tlb->local;
215
216#ifdef CONFIG_HAVE_RCU_TABLE_FREE
 /* No page-table batch is pending yet. */
217 tlb->batch = NULL;
218#endif
219}
220
221void tlb_flush_mmu(struct mmu_gather *tlb)
222{
223 struct mmu_gather_batch *batch;
224
225 if (!tlb->need_flush)
226 return;
227 tlb->need_flush = 0;
228 tlb_flush(tlb);
229#ifdef CONFIG_HAVE_RCU_TABLE_FREE
230 tlb_table_flush(tlb);
231#endif
232
233 if (tlb_fast_mode(tlb))
234 return;
235
236 for (batch = &tlb->local; batch; batch = batch->next) {
237 free_pages_and_swap_cache(batch->pages, batch->nr);
238 batch->nr = 0;
239 }
240 tlb->active = &tlb->local;
241}
242
243
244
245
246
247void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
248{
249 struct mmu_gather_batch *batch, *next;
250
251 tlb_flush_mmu(tlb);
252
253
254 check_pgt_cache();
255
256 for (batch = tlb->local.next; batch; batch = next) {
257 next = batch->next;
258 free_pages((unsigned long)batch, 0);
259 }
260 tlb->local.next = NULL;
261}
262
263
264
265
266
267
268
269int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
270{
271 struct mmu_gather_batch *batch;
272
273 VM_BUG_ON(!tlb->need_flush);
274
275 if (tlb_fast_mode(tlb)) {
276 free_page_and_swap_cache(page);
277 return 1;
278 }
279
280 batch = tlb->active;
281 batch->pages[batch->nr++] = page;
282 if (batch->nr == batch->max) {
283 if (!tlb_next_batch(tlb))
284 return 0;
285 batch = tlb->active;
286 }
287 VM_BUG_ON(batch->nr > batch->max);
288
289 return batch->max - batch->nr;
290}
291
292#endif
293
294#ifdef CONFIG_HAVE_RCU_TABLE_FREE
295
296
297
298
299
/*
 * Intentionally empty cross-CPU callback used by tlb_remove_table_one();
 * only the fact that every other CPU ran it matters.
 */
300static void tlb_remove_table_smp_sync(void *arg)
301{
302
303}
304
/*
 * Free a single page table without batching.  Used by tlb_remove_table()
 * when no batch page could be allocated.
 */
305static void tlb_remove_table_one(void *table)
306{
307
308
309
310
311
312
313
 /*
  * Run an empty function on all other CPUs and wait for it (wait == 1)
  * before freeing the table.
  * NOTE(review): presumably this waits out lockless page-table walkers
  * that run with interrupts disabled - confirm.
  */
314 smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
315 __tlb_remove_table(table);
316}
317
318static void tlb_remove_table_rcu(struct rcu_head *head)
319{
320 struct mmu_table_batch *batch;
321 int i;
322
323 batch = container_of(head, struct mmu_table_batch, rcu);
324
325 for (i = 0; i < batch->nr; i++)
326 __tlb_remove_table(batch->tables[i]);
327
328 free_page((unsigned long)batch);
329}
330
331void tlb_table_flush(struct mmu_gather *tlb)
332{
333 struct mmu_table_batch **batch = &tlb->batch;
334
335 if (*batch) {
336 call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
337 *batch = NULL;
338 }
339}
340
341void tlb_remove_table(struct mmu_gather *tlb, void *table)
342{
343 struct mmu_table_batch **batch = &tlb->batch;
344
345 tlb->need_flush = 1;
346
347
348
349
350
351 if (atomic_read(&tlb->mm->mm_users) < 2) {
352 __tlb_remove_table(table);
353 return;
354 }
355
356 if (*batch == NULL) {
357 *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
358 if (*batch == NULL) {
359 tlb_remove_table_one(table);
360 return;
361 }
362 (*batch)->nr = 0;
363 }
364 (*batch)->tables[(*batch)->nr++] = table;
365 if ((*batch)->nr == MAX_TABLE_BATCH)
366 tlb_table_flush(tlb);
367}
368
369#endif
370
371
372
373
374
375
376
/* Report the contents of a bad pgd entry via pgd_ERROR(), then clear it. */
377void pgd_clear_bad(pgd_t *pgd)
378{
379 pgd_ERROR(*pgd);
380 pgd_clear(pgd);
381}
382
/* Report the contents of a bad pud entry via pud_ERROR(), then clear it. */
383void pud_clear_bad(pud_t *pud)
384{
385 pud_ERROR(*pud);
386 pud_clear(pud);
387}
388
/* Report the contents of a bad pmd entry via pmd_ERROR(), then clear it. */
389void pmd_clear_bad(pmd_t *pmd)
390{
391 pmd_ERROR(*pmd);
392 pmd_clear(pmd);
393}
394
395
396
397
398
/*
 * Detach the pte page referenced by *pmd, queue it on @tlb for freeing and
 * decrement the mm's nr_ptes count.
 */
399static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
400 unsigned long addr)
401{
 /* Fetch the page-table token before the pmd entry is wiped. */
402 pgtable_t token = pmd_pgtable(*pmd);
403 pmd_clear(pmd);
404 pte_free_tlb(tlb, token, addr);
405 tlb->mm->nr_ptes--;
406}
407
/*
 * Free the pte pages below @pud for [addr, end), and then the pmd page
 * itself if the whole PUD-aligned region around it lies within
 * [floor, ceiling).  A @ceiling of 0 means "no upper limit".
 */
408static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
409 unsigned long addr, unsigned long end,
410 unsigned long floor, unsigned long ceiling)
411{
412 pmd_t *pmd;
413 unsigned long next;
414 unsigned long start;
415
416 start = addr;
417 pmd = pmd_offset(pud, addr);
418 do {
419 next = pmd_addr_end(addr, end);
420 if (pmd_none_or_clear_bad(pmd))
421 continue;
422 free_pte_range(tlb, pmd, addr);
423 } while (pmd++, addr = next, addr != end);
424
 /* The pmd page may only go if nothing below floor shares it ... */
425 start &= PUD_MASK;
426 if (start < floor)
427 return;
428 if (ceiling) {
429 ceiling &= PUD_MASK;
 /* Rounding down to 0 would turn a real limit into "no limit". */
430 if (!ceiling)
431 return;
432 }
 /*
  * ... and nothing at or above ceiling shares it.  The "- 1" on both
  * sides makes ceiling == 0 compare as the largest possible value.
  */
433 if (end - 1 > ceiling - 1)
434 return;
435
436 pmd = pmd_offset(pud, start);
437 pud_clear(pud);
438 pmd_free_tlb(tlb, pmd, start);
439}
440
/*
 * Free the pmd-level tables below @pgd for [addr, end), and then the pud
 * page itself if the whole PGDIR-aligned region around it lies within
 * [floor, ceiling).  A @ceiling of 0 means "no upper limit".
 */
441static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
442 unsigned long addr, unsigned long end,
443 unsigned long floor, unsigned long ceiling)
444{
445 pud_t *pud;
446 unsigned long next;
447 unsigned long start;
448
449 start = addr;
450 pud = pud_offset(pgd, addr);
451 do {
452 next = pud_addr_end(addr, end);
453 if (pud_none_or_clear_bad(pud))
454 continue;
455 free_pmd_range(tlb, pud, addr, next, floor, ceiling);
456 } while (pud++, addr = next, addr != end);
457
 /* Same floor/ceiling test as one level down, at PGDIR granularity. */
458 start &= PGDIR_MASK;
459 if (start < floor)
460 return;
461 if (ceiling) {
462 ceiling &= PGDIR_MASK;
463 if (!ceiling)
464 return;
465 }
466 if (end - 1 > ceiling - 1)
467 return;
468
469 pud = pud_offset(pgd, start);
470 pgd_clear(pgd);
471 pud_free_tlb(tlb, pud, start);
472}
473
474
475
476
477
478
/*
 * Free the page tables of tlb->mm covering [addr, end).
 *
 * @floor and @ceiling bound the region whose page tables may be freed:
 * a table is only released when the aligned span it maps lies entirely
 * inside [floor, ceiling).  A @ceiling of 0 means "no upper limit".
 */
479void free_pgd_range(struct mmu_gather *tlb,
480 unsigned long addr, unsigned long end,
481 unsigned long floor, unsigned long ceiling)
482{
483 pgd_t *pgd;
484 unsigned long next;
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
 /*
  * Trim [addr, end) to whole PMD-sized spans that lie inside
  * [floor, ceiling).  Each step also checks for wrap-around to 0.
  */
512 addr &= PMD_MASK;
513 if (addr < floor) {
 /* Rounding down went below floor: skip the first, shared, span. */
514 addr += PMD_SIZE;
515 if (!addr)
516 return;
517 }
518 if (ceiling) {
519 ceiling &= PMD_MASK;
520 if (!ceiling)
521 return;
522 }
 /* "- 1" on both sides makes ceiling == 0 act as the maximum address. */
523 if (end - 1 > ceiling - 1)
524 end -= PMD_SIZE;
525 if (addr > end - 1)
526 return;
527
528 pgd = pgd_offset(tlb->mm, addr);
529 do {
530 next = pgd_addr_end(addr, end);
531 if (pgd_none_or_clear_bad(pgd))
532 continue;
533 free_pud_range(tlb, pgd, addr, next, floor, ceiling);
534 } while (pgd++, addr = next, addr != end);
535}
536
/*
 * Walk the vma list starting at @vma, unlinking each vma from its anon and
 * file reverse mappings and freeing the page tables that covered it.
 * @floor and @ceiling are passed down as the outer limits; between vmas
 * the start of the next vma is used as the ceiling.
 */
537void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
538 unsigned long floor, unsigned long ceiling)
539{
540 while (vma) {
541 struct vm_area_struct *next = vma->vm_next;
542 unsigned long addr = vma->vm_start;
543
544
545
546
547
 /* Unlink from rmap before the page tables are torn down. */
548 unlink_anon_vmas(vma);
549 unlink_file_vma(vma);
550
551 if (is_vm_hugetlb_page(vma)) {
552 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
553 floor, next? next->vm_start: ceiling);
554 } else {
555
556
557
 /*
  * Merge following non-hugetlb vmas that start within PMD_SIZE of
  * the end of this one, so a single free_pgd_range() call covers
  * the whole run.
  */
558 while (next && next->vm_start <= vma->vm_end + PMD_SIZE
559 && !is_vm_hugetlb_page(next)) {
560 vma = next;
561 next = vma->vm_next;
562 unlink_anon_vmas(vma);
563 unlink_file_vma(vma);
564 }
565 free_pgd_range(tlb, addr, vma->vm_end,
566 floor, next? next->vm_start: ceiling);
567 }
568 vma = next;
569 }
570}
571
/*
 * Allocate a pte page for @address and install it in *pmd if the pmd is
 * still empty once mm->page_table_lock is held.
 *
 * Returns -ENOMEM if the pte page cannot be allocated, 0 otherwise.  A
 * return of 0 does not guarantee that this call populated the pmd: if the
 * pmd was already populated the new page is freed again, and if a huge pmd
 * was being split the function waits for the split to finish.
 */
572int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
573 pmd_t *pmd, unsigned long address)
574{
575 pgtable_t new = pte_alloc_one(mm, address);
576 int wait_split_huge_page;
577 if (!new)
578 return -ENOMEM;
579
580
581
582
583
584
585
586
587
588
589
590
591
592
 /*
  * Write barrier between allocating the pte page and publishing it.
  * NOTE(review): presumably orders the page's initialisation before the
  * pmd_populate() below becomes visible to other CPUs - confirm.
  */
593 smp_wmb();
594
595 spin_lock(&mm->page_table_lock);
596 wait_split_huge_page = 0;
597 if (likely(pmd_none(*pmd))) {
598 mm->nr_ptes++;
599 pmd_populate(mm, pmd, new);
 /* Ownership passed to the page table; nothing left to free. */
600 new = NULL;
601 } else if (unlikely(pmd_trans_splitting(*pmd)))
602 wait_split_huge_page = 1;
603 spin_unlock(&mm->page_table_lock);
 /* Someone else populated the pmd first: drop our unused page. */
604 if (new)
605 pte_free(mm, new);
 /* The wait is done only after the spinlock has been dropped. */
606 if (wait_split_huge_page)
607 wait_split_huge_page(vma->anon_vma, pmd);
608 return 0;
609}
610
611int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
612{
613 pte_t *new = pte_alloc_one_kernel(&init_mm, address);
614 if (!new)
615 return -ENOMEM;
616
617 smp_wmb();
618
619 spin_lock(&init_mm.page_table_lock);
620 if (likely(pmd_none(*pmd))) {
621 pmd_populate_kernel(&init_mm, pmd, new);
622 new = NULL;
623 } else
624 VM_BUG_ON(pmd_trans_splitting(*pmd));
625 spin_unlock(&init_mm.page_table_lock);
626 if (new)
627 pte_free_kernel(&init_mm, new);
628 return 0;
629}
630
631static inline void init_rss_vec(int *rss)
632{
633 memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
634}
635
636static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
637{
638 int i;
639
640 if (current->mm == mm)
641 sync_mm_rss(mm);
642 for (i = 0; i < NR_MM_COUNTERS; i++)
643 if (rss[i])
644 add_mm_counter(mm, i, rss[i]);
645}
646
647
648
649
650
651
652
653
/*
 * Report an inconsistent page-table entry: prints the pte and pmd values,
 * dumps @page when one is given, prints vma details, dumps the stack and
 * taints the kernel with TAINT_BAD_PAGE.
 *
 * Output is rate limited through function-static state: after 60 reports
 * further ones are only counted until "resume" (set 60 * HZ jiffies after
 * the first report of the burst) has passed.
 */
654static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
655 pte_t pte, struct page *page)
656{
657 pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
658 pud_t *pud = pud_offset(pgd, addr);
659 pmd_t *pmd = pmd_offset(pud, addr);
660 struct address_space *mapping;
661 pgoff_t index;
662 static unsigned long resume;
663 static unsigned long nr_shown;
664 static unsigned long nr_unshown;
665
666
667
668
669
 /* Burst limit reached: stay quiet until the resume time has passed. */
670 if (nr_shown == 60) {
671 if (time_before(jiffies, resume)) {
672 nr_unshown++;
673 return;
674 }
675 if (nr_unshown) {
676 printk(KERN_ALERT
677 "BUG: Bad page map: %lu messages suppressed\n",
678 nr_unshown);
679 nr_unshown = 0;
680 }
681 nr_shown = 0;
682 }
 /* First report of a burst starts the 60 second window. */
683 if (nr_shown++ == 0)
684 resume = jiffies + 60 * HZ;
685
686 mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
687 index = linear_page_index(vma, addr);
688
689 printk(KERN_ALERT
690 "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
691 current->comm,
692 (long long)pte_val(pte), (long long)pmd_val(*pmd));
693 if (page)
694 dump_page(page);
695 printk(KERN_ALERT
696 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
697 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
698
699
700
 /* Name the fault and mmap handlers to help identify the owning driver. */
701 if (vma->vm_ops)
702 print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n",
703 (unsigned long)vma->vm_ops->fault);
704 if (vma->vm_file && vma->vm_file->f_op)
705 print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n",
706 (unsigned long)vma->vm_file->f_op->mmap);
707 dump_stack();
708 add_taint(TAINT_BAD_PAGE);
709}
710
711static inline int is_cow_mapping(vm_flags_t flags)
712{
713 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
714}
715
/*
 * Default is_zero_pfn(): true when @pfn equals zero_pfn.  Compiled only if
 * no is_zero_pfn macro has been defined before this point.
 */
716#ifndef is_zero_pfn
717static inline int is_zero_pfn(unsigned long pfn)
718{
719 return pfn == zero_pfn;
720}
721#endif
722
/*
 * Default my_zero_pfn(): returns zero_pfn regardless of @addr.  Compiled
 * only if no my_zero_pfn macro has been defined before this point.
 */
723#ifndef my_zero_pfn
724static inline unsigned long my_zero_pfn(unsigned long addr)
725{
726 return zero_pfn;
727}
728#endif
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
/* Compile-time flag: 1 if the architecture provides pte_special(), else 0. */
772#ifdef __HAVE_ARCH_PTE_SPECIAL
773# define HAVE_PTE_SPECIAL 1
774#else
775# define HAVE_PTE_SPECIAL 0
776#endif
/*
 * Return the struct page that @pte (found at @addr in @vma) maps, or NULL
 * if the pte should not be treated as mapping an ordinary page: special
 * ptes, raw PFN mappings, the zero page, or a PFN above
 * highest_memmap_pfn (which is also reported via print_bad_pte()).
 */
777struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
778 pte_t pte)
779{
780 unsigned long pfn = pte_pfn(pte);
781
782 if (HAVE_PTE_SPECIAL) {
 /* Not marked special: only the PFN range check remains. */
783 if (likely(!pte_special(pte)))
784 goto check_pfn;
 /* Special ptes are expected in PFNMAP/MIXEDMAP vmas ... */
785 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
786 return NULL;
 /* ... and for the zero page; anything else is reported. */
787 if (!is_zero_pfn(pfn))
788 print_bad_pte(vma, addr, pte, NULL);
789 return NULL;
790 }
791
792
793
 /* From here on: architectures without pte_special(). */
794 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
795 if (vma->vm_flags & VM_MIXEDMAP) {
 /* MIXEDMAP: a valid PFN is returned as a page without further checks. */
796 if (!pfn_valid(pfn))
797 return NULL;
798 goto out;
799 } else {
800 unsigned long off;
 /* PFNMAP: a pfn at its linear position (vm_pgoff + offset) is raw. */
801 off = (addr - vma->vm_start) >> PAGE_SHIFT;
802 if (pfn == vma->vm_pgoff + off)
803 return NULL;
 /* Off the linear position: only a COW mapping can hold a normal page. */
804 if (!is_cow_mapping(vma->vm_flags))
805 return NULL;
806 }
807 }
808
809 if (is_zero_pfn(pfn))
810 return NULL;
811check_pfn:
812 if (unlikely(pfn > highest_memmap_pfn)) {
813 print_bad_pte(vma, addr, pte, NULL);
814 return NULL;
815 }
816
817
818
819
820
821out:
822 return pfn_to_page(pfn);
823}
824
825
826
827
828
829
830
/*
 * Copy one pte from @src_mm to @dst_mm at @addr, updating the RSS delta
 * vector @rss for the destination.
 *
 * Returns 0 on success.  If the source pte is a swap entry whose
 * swap_duplicate() fails, the raw entry value is returned instead and
 * nothing is written to the destination.
 */
831static inline unsigned long
832copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
833 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
834 unsigned long addr, int *rss)
835{
836 unsigned long vm_flags = vma->vm_flags;
837 pte_t pte = *src_pte;
838 struct page *page;
839
840
 /* Non-present pte: a file pte, swap entry or migration entry. */
841 if (unlikely(!pte_present(pte))) {
842 if (!pte_file(pte)) {
843 swp_entry_t entry = pte_to_swp_entry(pte);
844
845 if (swap_duplicate(entry) < 0)
846 return entry.val;
847
848
 /* Put dst_mm on the mmlist (next to src_mm) if not there yet. */
849 if (unlikely(list_empty(&dst_mm->mmlist))) {
850 spin_lock(&mmlist_lock);
 /* Re-check under the lock. */
851 if (list_empty(&dst_mm->mmlist))
852 list_add(&dst_mm->mmlist,
853 &src_mm->mmlist);
854 spin_unlock(&mmlist_lock);
855 }
856 if (likely(!non_swap_entry(entry)))
857 rss[MM_SWAPENTS]++;
858 else if (is_migration_entry(entry)) {
859 page = migration_entry_to_page(entry);
860
861 if (PageAnon(page))
862 rss[MM_ANONPAGES]++;
863 else
864 rss[MM_FILEPAGES]++;
865
866 if (is_write_migration_entry(entry) &&
867 is_cow_mapping(vm_flags)) {
868
869
870
871
 /*
  * COW mapping: downgrade the write migration entry to
  * a read entry in the source as well as in the copy.
  */
872 make_migration_entry_read(&entry);
873 pte = swp_entry_to_pte(entry);
874 set_pte_at(src_mm, addr, src_pte, pte);
875 }
876 }
877 }
878 goto out_set_pte;
879 }
880
881
882
883
884
 /* COW mapping: write-protect the pte in both source and copy. */
885 if (is_cow_mapping(vm_flags)) {
886 ptep_set_wrprotect(src_mm, addr, src_pte);
887 pte = pte_wrprotect(pte);
888 }
889
890
891
892
893
 /* The copy starts clean for shared mappings, and always old. */
894 if (vm_flags & VM_SHARED)
895 pte = pte_mkclean(pte);
896 pte = pte_mkold(pte);
897
898 page = vm_normal_page(vma, addr, pte);
899 if (page) {
 /* Account for the additional mapping of a normal page. */
900 get_page(page);
901 page_dup_rmap(page);
902 if (PageAnon(page))
903 rss[MM_ANONPAGES]++;
904 else
905 rss[MM_FILEPAGES]++;
906 }
907
908out_set_pte:
909 set_pte_at(dst_mm, addr, dst_pte, pte);
910 return 0;
911}
912
/*
 * Copy the ptes for [addr, end) below one pmd from @src_mm to @dst_mm.
 * Both pte locks are held while copying; the loop periodically breaks out
 * to allow rescheduling or lock hand-over and then restarts where it left
 * off.
 *
 * Returns 0 on success, -ENOMEM if a destination pte page or a swap count
 * continuation cannot be allocated.
 */
913int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
914 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
915 unsigned long addr, unsigned long end)
916{
917 pte_t *orig_src_pte, *orig_dst_pte;
918 pte_t *src_pte, *dst_pte;
919 spinlock_t *src_ptl, *dst_ptl;
920 int progress = 0;
921 int rss[NR_MM_COUNTERS];
922 swp_entry_t entry = (swp_entry_t){0};
923
924again:
925 init_rss_vec(rss);
926
927 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
928 if (!dst_pte)
929 return -ENOMEM;
930 src_pte = pte_offset_map(src_pmd, addr);
931 src_ptl = pte_lockptr(src_mm, src_pmd);
 /* Second pte lock taken while dst_ptl is held, hence the nesting annotation. */
932 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
933 orig_src_pte = src_pte;
934 orig_dst_pte = dst_pte;
935 arch_enter_lazy_mmu_mode();
936
937 do {
938
939
940
941
 /*
  * "progress" weights work done (1 per empty pte, 8 per copied one);
  * every 32 units check whether the locks should be dropped.
  */
942 if (progress >= 32) {
943 progress = 0;
944 if (need_resched() ||
945 spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
946 break;
947 }
948 if (pte_none(*src_pte)) {
949 progress++;
950 continue;
951 }
952 entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
953 vma, addr, rss);
 /* Non-zero: swap_duplicate() failed for this entry; handled below. */
954 if (entry.val)
955 break;
956 progress += 8;
957 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
958
959 arch_leave_lazy_mmu_mode();
960 spin_unlock(src_ptl);
961 pte_unmap(orig_src_pte);
962 add_mm_rss_vec(dst_mm, rss);
963 pte_unmap_unlock(orig_dst_pte, dst_ptl);
964 cond_resched();
965
966 if (entry.val) {
 /* Allocate a continuation with the locks dropped, then retry the pte. */
967 if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
968 return -ENOMEM;
969 progress = 0;
970 }
971 if (addr != end)
972 goto again;
973 return 0;
974}
975
/*
 * Copy the pmd-level entries for [addr, end) below one pud from @src_mm to
 * @dst_mm, allocating the destination pmd page as needed.
 * Returns 0 on success, -ENOMEM on allocation failure.
 */
976static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
977 pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
978 unsigned long addr, unsigned long end)
979{
980 pmd_t *src_pmd, *dst_pmd;
981 unsigned long next;
982
983 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
984 if (!dst_pmd)
985 return -ENOMEM;
986 src_pmd = pmd_offset(src_pud, addr);
987 do {
988 next = pmd_addr_end(addr, end);
989 if (pmd_trans_huge(*src_pmd)) {
990 int err;
 /* A transparent huge pmd is expected to cover the whole step. */
991 VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
992 err = copy_huge_pmd(dst_mm, src_mm,
993 dst_pmd, src_pmd, addr, vma);
994 if (err == -ENOMEM)
995 return -ENOMEM;
996 if (!err)
997 continue;
998
 /* Any other result: fall through to the regular pte copy. */
999 }
1000 if (pmd_none_or_clear_bad(src_pmd))
1001 continue;
1002 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
1003 vma, addr, next))
1004 return -ENOMEM;
1005 } while (dst_pmd++, src_pmd++, addr = next, addr != end);
1006 return 0;
1007}
1008
1009static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1010 pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
1011 unsigned long addr, unsigned long end)
1012{
1013 pud_t *src_pud, *dst_pud;
1014 unsigned long next;
1015
1016 dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
1017 if (!dst_pud)
1018 return -ENOMEM;
1019 src_pud = pud_offset(src_pgd, addr);
1020 do {
1021 next = pud_addr_end(addr, end);
1022 if (pud_none_or_clear_bad(src_pud))
1023 continue;
1024 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
1025 vma, addr, next))
1026 return -ENOMEM;
1027 } while (dst_pud++, src_pud++, addr = next, addr != end);
1028 return 0;
1029}
1030
/*
 * Copy the page-table entries covering @vma from @src_mm to @dst_mm.
 *
 * Returns 0 on success (including when the copy is skipped), -ENOMEM if a
 * page-table level cannot be copied, or the error returned by
 * track_pfn_vma_copy() or copy_hugetlb_page_range().
 */
1031int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1032 struct vm_area_struct *vma)
1033{
1034 pgd_t *src_pgd, *dst_pgd;
1035 unsigned long next;
1036 unsigned long addr = vma->vm_start;
1037 unsigned long end = vma->vm_end;
1038 int ret;
1039
1040
1041
1042
1043
1044
1045
 /*
  * Nothing is copied for a vma that has none of these flags and no
  * anon_vma.
  * NOTE(review): presumably such mappings are refilled by page faults in
  * the destination - confirm.
  */
1046 if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
1047 if (!vma->anon_vma)
1048 return 0;
1049 }
1050
1051 if (is_vm_hugetlb_page(vma))
1052 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
1053
1054 if (unlikely(is_pfn_mapping(vma))) {
1055
1056
1057
1058
 /* PFN mappings need their tracking state duplicated first. */
1059 ret = track_pfn_vma_copy(vma);
1060 if (ret)
1061 return ret;
1062 }
1063
1064
1065
1066
1067
1068
1069
 /*
  * Only COW mappings have their source ptes modified (write-protected)
  * by the copy, so only they notify mmu notifiers.
  */
1070 if (is_cow_mapping(vma->vm_flags))
1071 mmu_notifier_invalidate_range_start(src_mm, addr, end);
1072
1073 ret = 0;
1074 dst_pgd = pgd_offset(dst_mm, addr);
1075 src_pgd = pgd_offset(src_mm, addr);
1076 do {
1077 next = pgd_addr_end(addr, end);
1078 if (pgd_none_or_clear_bad(src_pgd))
1079 continue;
1080 if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
1081 vma, addr, next))) {
1082 ret = -ENOMEM;
1083 break;
1084 }
1085 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
1086
 /* addr has advanced by now, so the range start is taken from the vma. */
1087 if (is_cow_mapping(vma->vm_flags))
1088 mmu_notifier_invalidate_range_end(src_mm,
1089 vma->vm_start, end);
1090 return ret;
1091}
1092
/*
 * Remove the ptes for [addr, end) below @pmd, queueing the mapped pages on
 * @tlb and adjusting the mm's RSS counters.  @details, when non-NULL,
 * restricts which pages are removed.
 *
 * Returns the address at which zapping stopped (equal to @end when the
 * whole range was processed).
 */
1093static unsigned long zap_pte_range(struct mmu_gather *tlb,
1094 struct vm_area_struct *vma, pmd_t *pmd,
1095 unsigned long addr, unsigned long end,
1096 struct zap_details *details)
1097{
1098 struct mm_struct *mm = tlb->mm;
1099 int force_flush = 0;
1100 int rss[NR_MM_COUNTERS];
1101 spinlock_t *ptl;
1102 pte_t *start_pte;
1103 pte_t *pte;
1104
1105again:
1106 init_rss_vec(rss);
1107 start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
1108 pte = start_pte;
1109 arch_enter_lazy_mmu_mode();
1110 do {
1111 pte_t ptent = *pte;
1112 if (pte_none(ptent)) {
1113 continue;
1114 }
1115
1116 if (pte_present(ptent)) {
1117 struct page *page;
1118
1119 page = vm_normal_page(vma, addr, ptent);
1120 if (unlikely(details) && page) {
1121
1122
1123
1124
1125
 /* Leave pages that belong to a different mapping alone. */
1126 if (details->check_mapping &&
1127 details->check_mapping != page->mapping)
1128 continue;
1129
1130
1131
1132
 /* Nonlinear vma: only zap pages inside the index window. */
1133 if (details->nonlinear_vma &&
1134 (page->index < details->first_index ||
1135 page->index > details->last_index))
1136 continue;
1137 }
1138 ptent = ptep_get_and_clear_full(mm, addr, pte,
1139 tlb->fullmm);
1140 tlb_remove_tlb_entry(tlb, pte, addr);
 /* No struct page behind the pte: clearing it was all there is to do. */
1141 if (unlikely(!page))
1142 continue;
 /* Nonlinear placement: leave a file pte recording the page offset. */
1143 if (unlikely(details) && details->nonlinear_vma
1144 && linear_page_index(details->nonlinear_vma,
1145 addr) != page->index)
1146 set_pte_at(mm, addr, pte,
1147 pgoff_to_pte(page->index));
1148 if (PageAnon(page))
1149 rss[MM_ANONPAGES]--;
1150 else {
 /* Transfer dirty/accessed state from the pte to the page. */
1151 if (pte_dirty(ptent))
1152 set_page_dirty(page);
1153 if (pte_young(ptent) &&
1154 likely(!VM_SequentialReadHint(vma)))
1155 mark_page_accessed(page);
1156 rss[MM_FILEPAGES]--;
1157 }
1158 page_remove_rmap(page);
1159 if (unlikely(page_mapcount(page) < 0))
1160 print_bad_pte(vma, addr, ptent, page);
 /* 0 from __tlb_remove_page() means the gather is full: flush first. */
1161 force_flush = !__tlb_remove_page(tlb, page);
1162 if (force_flush)
1163 break;
1164 continue;
1165 }
1166
1167
1168
1169
 /* With zap details in effect, non-present entries are left alone. */
1170 if (unlikely(details))
1171 continue;
1172 if (pte_file(ptent)) {
1173 if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
1174 print_bad_pte(vma, addr, ptent, NULL);
1175 } else {
1176 swp_entry_t entry = pte_to_swp_entry(ptent);
1177
1178 if (!non_swap_entry(entry))
1179 rss[MM_SWAPENTS]--;
1180 else if (is_migration_entry(entry)) {
1181 struct page *page;
1182
1183 page = migration_entry_to_page(entry);
1184
1185 if (PageAnon(page))
1186 rss[MM_ANONPAGES]--;
1187 else
1188 rss[MM_FILEPAGES]--;
1189 }
1190 if (unlikely(!free_swap_and_cache(entry)))
1191 print_bad_pte(vma, addr, ptent, NULL);
1192 }
1193 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1194 } while (pte++, addr += PAGE_SIZE, addr != end);
1195
1196 add_mm_rss_vec(mm, rss);
1197 arch_leave_lazy_mmu_mode();
1198 pte_unmap_unlock(start_pte, ptl);
1199
1200
1201
1202
1203
1204
 /*
  * The gather filled up mid-range: flush with the pte lock dropped and
  * resume.  addr was not advanced past the pte that triggered the break;
  * that pte has already been cleared, so the retry passes over it.
  */
1205 if (force_flush) {
1206 force_flush = 0;
1207 tlb_flush_mmu(tlb);
1208 if (addr != end)
1209 goto again;
1210 }
1211
1212 return addr;
1213}
1214
/*
 * Zap the pmd-level entries for [addr, end) below @pud.  Transparent huge
 * pmds are either zapped whole or split when only partly covered; regular
 * pmds are handed to zap_pte_range().  Returns the address reached.
 */
1215static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1216 struct vm_area_struct *vma, pud_t *pud,
1217 unsigned long addr, unsigned long end,
1218 struct zap_details *details)
1219{
1220 pmd_t *pmd;
1221 unsigned long next;
1222
1223 pmd = pmd_offset(pud, addr);
1224 do {
1225 next = pmd_addr_end(addr, end);
1226 if (pmd_trans_huge(*pmd)) {
 /* Range covers only part of the huge page: split it first. */
1227 if (next - addr != HPAGE_PMD_SIZE) {
1228#ifdef CONFIG_DEBUG_VM
 /* Debug check: mmap_sem must be held when splitting here. */
1229 if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
1230 pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
1231 __func__, addr, end,
1232 vma->vm_start,
1233 vma->vm_end);
1234 BUG();
1235 }
1236#endif
1237 split_huge_page_pmd(vma->vm_mm, pmd);
1238 } else if (zap_huge_pmd(tlb, vma, pmd, addr))
1239 goto next;
1240
 /* Otherwise fall through to the pte-level path. */
1241 }
1242
1243
1244
1245
1246
1247
1248
 /*
  * Skip empty, bad or (again) huge pmds.
  * NOTE(review): presumably the combined check guards against the pmd
  * changing under us - confirm.
  */
1249 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1250 goto next;
1251 next = zap_pte_range(tlb, vma, pmd, addr, next, details);
1252next:
1253 cond_resched();
1254 } while (pmd++, addr = next, addr != end);
1255
1256 return addr;
1257}
1258
1259static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1260 struct vm_area_struct *vma, pgd_t *pgd,
1261 unsigned long addr, unsigned long end,
1262 struct zap_details *details)
1263{
1264 pud_t *pud;
1265 unsigned long next;
1266
1267 pud = pud_offset(pgd, addr);
1268 do {
1269 next = pud_addr_end(addr, end);
1270 if (pud_none_or_clear_bad(pud))
1271 continue;
1272 next = zap_pmd_range(tlb, vma, pud, addr, next, details);
1273 } while (pud++, addr = next, addr != end);
1274
1275 return addr;
1276}
1277
1278static void unmap_page_range(struct mmu_gather *tlb,
1279 struct vm_area_struct *vma,
1280 unsigned long addr, unsigned long end,
1281 struct zap_details *details)
1282{
1283 pgd_t *pgd;
1284 unsigned long next;
1285
1286 if (details && !details->check_mapping && !details->nonlinear_vma)
1287 details = NULL;
1288
1289 BUG_ON(addr >= end);
1290 mem_cgroup_uncharge_start();
1291 tlb_start_vma(tlb, vma);
1292 pgd = pgd_offset(vma->vm_mm, addr);
1293 do {
1294 next = pgd_addr_end(addr, end);
1295 if (pgd_none_or_clear_bad(pgd))
1296 continue;
1297 next = zap_pud_range(tlb, vma, pgd, addr, next, details);
1298 } while (pgd++, addr = next, addr != end);
1299 tlb_end_vma(tlb, vma);
1300 mem_cgroup_uncharge_end();
1301}
1302
1303
1304static void unmap_single_vma(struct mmu_gather *tlb,
1305 struct vm_area_struct *vma, unsigned long start_addr,
1306 unsigned long end_addr,
1307 struct zap_details *details)
1308{
1309 unsigned long start = max(vma->vm_start, start_addr);
1310 unsigned long end;
1311
1312 if (start >= vma->vm_end)
1313 return;
1314 end = min(vma->vm_end, end_addr);
1315 if (end <= vma->vm_start)
1316 return;
1317
1318 if (vma->vm_file)
1319 uprobe_munmap(vma, start, end);
1320
1321 if (unlikely(is_pfn_mapping(vma)))
1322 untrack_pfn_vma(vma, 0, 0);
1323
1324 if (start != end) {
1325 if (unlikely(is_vm_hugetlb_page(vma))) {
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337 if (vma->vm_file)
1338 unmap_hugepage_range(vma, start, end, NULL);
1339 } else
1340 unmap_page_range(tlb, vma, start, end, details);
1341 }
1342}
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362void unmap_vmas(struct mmu_gather *tlb,
1363 struct vm_area_struct *vma, unsigned long start_addr,
1364 unsigned long end_addr)
1365{
1366 struct mm_struct *mm = vma->vm_mm;
1367
1368 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
1369 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
1370 unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
1371 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
1372}
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383void zap_page_range(struct vm_area_struct *vma, unsigned long start,
1384 unsigned long size, struct zap_details *details)
1385{
1386 struct mm_struct *mm = vma->vm_mm;
1387 struct mmu_gather tlb;
1388 unsigned long end = start + size;
1389
1390 lru_add_drain();
1391 tlb_gather_mmu(&tlb, mm, 0);
1392 update_hiwater_rss(mm);
1393 mmu_notifier_invalidate_range_start(mm, start, end);
1394 for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
1395 unmap_single_vma(&tlb, vma, start, end, details);
1396 mmu_notifier_invalidate_range_end(mm, start, end);
1397 tlb_finish_mmu(&tlb, start, end);
1398}
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
1410 unsigned long size, struct zap_details *details)
1411{
1412 struct mm_struct *mm = vma->vm_mm;
1413 struct mmu_gather tlb;
1414 unsigned long end = address + size;
1415
1416 lru_add_drain();
1417 tlb_gather_mmu(&tlb, mm, 0);
1418 update_hiwater_rss(mm);
1419 mmu_notifier_invalidate_range_start(mm, address, end);
1420 unmap_single_vma(&tlb, vma, address, end, details);
1421 mmu_notifier_invalidate_range_end(mm, address, end);
1422 tlb_finish_mmu(&tlb, address, end);
1423}
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1438 unsigned long size)
1439{
1440 if (address < vma->vm_start || address + size > vma->vm_end ||
1441 !(vma->vm_flags & VM_PFNMAP))
1442 return -1;
1443 zap_page_range_single(vma, address, size, NULL);
1444 return 0;
1445}
1446EXPORT_SYMBOL_GPL(zap_vma_ptes);
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
/*
 * Look up the page mapped at user address @address in @vma.
 * @flags is a set of FOLL_* bits controlling the lookup (write access
 * required, take a reference, touch, mlock, split huge pages, dump mode).
 *
 * Returns the page, NULL when no suitable page is mapped, or
 * ERR_PTR(-EFAULT) for a mapping that must not be followed.
 */
1460struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1461 unsigned int flags)
1462{
1463 pgd_t *pgd;
1464 pud_t *pud;
1465 pmd_t *pmd;
1466 pte_t *ptep, pte;
1467 spinlock_t *ptl;
1468 struct page *page;
1469 struct mm_struct *mm = vma->vm_mm;
1470
 /* Try the hugetlb address lookup first; FOLL_GET is not allowed there. */
1471 page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
1472 if (!IS_ERR(page)) {
1473 BUG_ON(flags & FOLL_GET);
1474 goto out;
1475 }
1476
1477 page = NULL;
1478 pgd = pgd_offset(mm, address);
1479 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
1480 goto no_page_table;
1481
1482 pud = pud_offset(pgd, address);
1483 if (pud_none(*pud))
1484 goto no_page_table;
1485 if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
1486 BUG_ON(flags & FOLL_GET);
1487 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
1488 goto out;
1489 }
1490 if (unlikely(pud_bad(*pud)))
1491 goto no_page_table;
1492
1493 pmd = pmd_offset(pud, address);
1494 if (pmd_none(*pmd))
1495 goto no_page_table;
1496 if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
1497 BUG_ON(flags & FOLL_GET);
1498 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
1499 goto out;
1500 }
1501 if (pmd_trans_huge(*pmd)) {
 /* Caller asked for the huge page to be split into small ptes. */
1502 if (flags & FOLL_SPLIT) {
1503 split_huge_page_pmd(mm, pmd);
1504 goto split_fallthrough;
1505 }
1506 spin_lock(&mm->page_table_lock);
 /* Re-check under the lock: the pmd may have changed meanwhile. */
1507 if (likely(pmd_trans_huge(*pmd))) {
1508 if (unlikely(pmd_trans_splitting(*pmd))) {
1509 spin_unlock(&mm->page_table_lock);
1510 wait_split_huge_page(vma->anon_vma, pmd);
1511 } else {
1512 page = follow_trans_huge_pmd(mm, address,
1513 pmd, flags);
1514 spin_unlock(&mm->page_table_lock);
1515 goto out;
1516 }
1517 } else
1518 spin_unlock(&mm->page_table_lock);
1519
 /* No longer a stable huge pmd: continue with the pte lookup. */
1520 }
1521split_fallthrough:
1522 if (unlikely(pmd_bad(*pmd)))
1523 goto no_page_table;
1524
1525 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
1526
1527 pte = *ptep;
1528 if (!pte_present(pte))
1529 goto no_page;
 /* Write access wanted but the pte is read-only: return NULL. */
1530 if ((flags & FOLL_WRITE) && !pte_write(pte))
1531 goto unlock;
1532
1533 page = vm_normal_page(vma, address, pte);
1534 if (unlikely(!page)) {
 /* Only the zero page is still returned, and never in dump mode. */
1535 if ((flags & FOLL_DUMP) ||
1536 !is_zero_pfn(pte_pfn(pte)))
1537 goto bad_page;
1538 page = pte_page(pte);
1539 }
1540
1541 if (flags & FOLL_GET)
1542 get_page_foll(page);
1543 if (flags & FOLL_TOUCH) {
1544 if ((flags & FOLL_WRITE) &&
1545 !pte_dirty(pte) && !PageDirty(page))
1546 set_page_dirty(page);
1547
1548
1549
1550
1551
1552 mark_page_accessed(page);
1553 }
1554 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
1555
1556
1557
1558
1559
1560
1561
1562
1563
 /*
  * Best effort: the page is mlocked only if its lock can be taken
  * without sleeping (the pte lock is still held here).
  */
1564 if (page->mapping && trylock_page(page)) {
1565 lru_add_drain();
1566
1567
1568
1569
1570
 /* mapping is re-checked now that the page lock is held. */
1571 if (page->mapping)
1572 mlock_vma_page(page);
1573 unlock_page(page);
1574 }
1575 }
1576unlock:
1577 pte_unmap_unlock(ptep, ptl);
1578out:
1579 return page;
1580
1581bad_page:
1582 pte_unmap_unlock(ptep, ptl);
1583 return ERR_PTR(-EFAULT);
1584
1585no_page:
1586 pte_unmap_unlock(ptep, ptl);
 /* Non-present but not empty (e.g. swapped out): return NULL. */
1587 if (!pte_none(pte))
1588 return page;
1589
1590no_page_table:
1591
1592
1593
1594
1595
1596
1597
1598
 /*
  * Nothing mapped.  In dump mode, a vma without a fault handler yields
  * -EFAULT instead of NULL.
  * NOTE(review): presumably so that a core dump can skip such holes
  * rather than fault pages in - confirm.
  */
1599 if ((flags & FOLL_DUMP) &&
1600 (!vma->vm_ops || !vma->vm_ops->fault))
1601 return ERR_PTR(-EFAULT);
1602 return page;
1603}
1604
1605static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
1606{
1607 return stack_guard_page_start(vma, addr) ||
1608 stack_guard_page_end(vma, addr+PAGE_SIZE);
1609}
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
/**
 * __get_user_pages() - look up, faulting in when needed, a run of user pages
 * @tsk:	task whose maj_flt/min_flt counters are bumped for every fault
 *		taken here; may be NULL, in which case nothing is accounted
 * @mm:		address space to walk
 * @start:	first user virtual address of the range
 * @nr_pages:	number of pages wanted; a value <= 0 returns 0 immediately
 * @gup_flags:	FOLL_* flags steering the lookup and the faulting
 * @pages:	optional output array of page pointers; it must be non-NULL
 *		exactly when FOLL_GET is set (enforced by VM_BUG_ON below)
 * @vmas:	optional output array receiving the VMA backing each page
 * @nonblocking: optional; when non-NULL, faults are issued with
 *		FAULT_FLAG_ALLOW_RETRY and *nonblocking is cleared if the
 *		fault handler answers VM_FAULT_RETRY
 *
 * Returns the number of pages handled, which may be less than @nr_pages.
 * When an error strikes before any page was handled, a negative errno is
 * returned instead (-EFAULT, -ENOMEM, -ERESTARTSYS, -EHWPOISON, or the
 * error encoded in follow_page()'s return).  A VM_FAULT_RETRY result
 * returns the current count even when that count is 0.
 *
 * NOTE(review): presumably the caller holds mm->mmap_sem; nothing in this
 * function takes it - confirm against the callers.
 */
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
		     unsigned long start, int nr_pages, unsigned int gup_flags,
		     struct page **pages, struct vm_area_struct **vmas,
		     int *nonblocking)
{
	int i;
	unsigned long vm_flags;

	if (nr_pages <= 0)
		return 0;

	VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));

	/*
	 * Build the mask of VMA permission bits that make the access
	 * acceptable: start from the read or write pair, then keep only the
	 * MAY* bits when FOLL_FORCE is set, or only the plain bits otherwise.
	 */
	vm_flags  = (gup_flags & FOLL_WRITE) ?
			(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
	vm_flags &= (gup_flags & FOLL_FORCE) ?
			(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
	i = 0;

	/* Outer loop: one iteration per VMA (or per gate-area page). */
	do {
		struct vm_area_struct *vma;

		vma = find_extend_vma(mm, start);
		if (!vma && in_gate_area(mm, start)) {
			/*
			 * No VMA, but the address lies in the gate area:
			 * walk the page tables by hand for this one page.
			 */
			unsigned long pg = start & PAGE_MASK;
			pgd_t *pgd;
			pud_t *pud;
			pmd_t *pmd;
			pte_t *pte;

			/* The gate area is never handed out for writing. */
			if (gup_flags & FOLL_WRITE)
				return i ? : -EFAULT;
			if (pg > TASK_SIZE)
				pgd = pgd_offset_k(pg);
			else
				pgd = pgd_offset_gate(mm, pg);
			BUG_ON(pgd_none(*pgd));
			pud = pud_offset(pgd, pg);
			BUG_ON(pud_none(*pud));
			pmd = pmd_offset(pud, pg);
			if (pmd_none(*pmd))
				return i ? : -EFAULT;
			VM_BUG_ON(pmd_trans_huge(*pmd));
			pte = pte_offset_map(pmd, pg);
			if (pte_none(*pte)) {
				pte_unmap(pte);
				return i ? : -EFAULT;
			}
			vma = get_gate_vma(mm);
			if (pages) {
				struct page *page;

				page = vm_normal_page(vma, start, *pte);
				if (!page) {
					/*
					 * Without a normal page, only the
					 * zero page is acceptable, and only
					 * when FOLL_DUMP is not set.
					 */
					if (!(gup_flags & FOLL_DUMP) &&
					     is_zero_pfn(pte_pfn(*pte)))
						page = pte_page(*pte);
					else {
						pte_unmap(pte);
						return i ? : -EFAULT;
					}
				}
				pages[i] = page;
				get_page(page);
			}
			pte_unmap(pte);
			goto next_page;
		}

		/*
		 * Refuse missing VMAs, VM_IO/VM_PFNMAP mappings, and VMAs
		 * lacking every permission bit computed above.
		 */
		if (!vma ||
		    (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
		    !(vm_flags & vma->vm_flags))
			return i ? : -EFAULT;

		/*
		 * hugetlb VMAs are delegated wholesale; the helper receives
		 * pointers to start and nr_pages and its return value
		 * replaces the running count i.
		 */
		if (is_vm_hugetlb_page(vma)) {
			i = follow_hugetlb_page(mm, vma, pages, vmas,
					&start, &nr_pages, i, gup_flags);
			continue;
		}

		/* Inner loop: one iteration per page inside this VMA. */
		do {
			struct page *page;
			unsigned int foll_flags = gup_flags;

			/* Give up early when the caller is being killed. */
			if (unlikely(fatal_signal_pending(current)))
				return i ? i : -ERESTARTSYS;

			cond_resched();
			/* Fault the page in until follow_page() finds it. */
			while (!(page = follow_page(vma, start, foll_flags))) {
				int ret;
				unsigned int fault_flags = 0;

				/* With FOLL_MLOCK, skip stack guard pages. */
				if (foll_flags & FOLL_MLOCK) {
					if (stack_guard_page(vma, start))
						goto next_page;
				}
				if (foll_flags & FOLL_WRITE)
					fault_flags |= FAULT_FLAG_WRITE;
				if (nonblocking)
					fault_flags |= FAULT_FLAG_ALLOW_RETRY;
				if (foll_flags & FOLL_NOWAIT)
					fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT);

				ret = handle_mm_fault(mm, vma, start,
							fault_flags);

				/* Translate fault errors into errno values. */
				if (ret & VM_FAULT_ERROR) {
					if (ret & VM_FAULT_OOM)
						return i ? i : -ENOMEM;
					if (ret & (VM_FAULT_HWPOISON |
						   VM_FAULT_HWPOISON_LARGE)) {
						if (i)
							return i;
						else if (gup_flags & FOLL_HWPOISON)
							return -EHWPOISON;
						else
							return -EFAULT;
					}
					if (ret & VM_FAULT_SIGBUS)
						return i ? i : -EFAULT;
					/* Any other error bit is unexpected. */
					BUG();
				}

				if (tsk) {
					if (ret & VM_FAULT_MAJOR)
						tsk->maj_flt++;
					else
						tsk->min_flt++;
				}

				if (ret & VM_FAULT_RETRY) {
					if (nonblocking)
						*nonblocking = 0;
					return i;
				}

				/*
				 * The fault reported VM_FAULT_WRITE on a VMA
				 * without VM_WRITE: stop asking follow_page()
				 * for a writable PTE on the next attempt.
				 *
				 * NOTE(review): once FOLL_WRITE is dropped the
				 * retried follow_page() no longer checks
				 * pte_write(); confirm the PTE cannot change
				 * between the fault and the retry in a way
				 * that hands back a page the caller must not
				 * write to.
				 */
				if ((ret & VM_FAULT_WRITE) &&
				    !(vma->vm_flags & VM_WRITE))
					foll_flags &= ~FOLL_WRITE;

				cond_resched();
			}
			if (IS_ERR(page))
				return i ? i : PTR_ERR(page);
			if (pages) {
				pages[i] = page;

				flush_anon_page(vma, page, start);
				flush_dcache_page(page);
			}
next_page:
			/* Record the VMA and advance to the next page. */
			if (vmas)
				vmas[i] = vma;
			i++;
			start += PAGE_SIZE;
			nr_pages--;
		} while (nr_pages && start < vma->vm_end);
	} while (nr_pages);
	return i;
}
EXPORT_SYMBOL(__get_user_pages);
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
1872 unsigned long address, unsigned int fault_flags)
1873{
1874 struct vm_area_struct *vma;
1875 int ret;
1876
1877 vma = find_extend_vma(mm, address);
1878 if (!vma || address < vma->vm_start)
1879 return -EFAULT;
1880
1881 ret = handle_mm_fault(mm, vma, address, fault_flags);
1882 if (ret & VM_FAULT_ERROR) {
1883 if (ret & VM_FAULT_OOM)
1884 return -ENOMEM;
1885 if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
1886 return -EHWPOISON;
1887 if (ret & VM_FAULT_SIGBUS)
1888 return -EFAULT;
1889 BUG();
1890 }
1891 if (tsk) {
1892 if (ret & VM_FAULT_MAJOR)
1893 tsk->maj_flt++;
1894 else
1895 tsk->min_flt++;
1896 }
1897 return 0;
1898}
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1952 unsigned long start, int nr_pages, int write, int force,
1953 struct page **pages, struct vm_area_struct **vmas)
1954{
1955 int flags = FOLL_TOUCH;
1956
1957 if (pages)
1958 flags |= FOLL_GET;
1959 if (write)
1960 flags |= FOLL_WRITE;
1961 if (force)
1962 flags |= FOLL_FORCE;
1963
1964 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
1965 NULL);
1966}
1967EXPORT_SYMBOL(get_user_pages);
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
#ifdef CONFIG_ELF_CORE
/**
 * get_dump_page() - fetch one page of the current task for a core dump
 * @addr: user virtual address of the page
 *
 * Asks __get_user_pages() for exactly one page of current->mm with
 * FOLL_FORCE | FOLL_DUMP | FOLL_GET.  Returns NULL when fewer than one
 * page came back; otherwise flushes the cache for the page and returns it
 * (with the reference FOLL_GET implies).
 */
struct page *get_dump_page(unsigned long addr)
{
	struct vm_area_struct *vma;
	struct page *page;
	int nr_got;

	nr_got = __get_user_pages(current, current->mm, addr, 1,
				  FOLL_FORCE | FOLL_DUMP | FOLL_GET,
				  &page, &vma, NULL);
	if (nr_got < 1)
		return NULL;

	flush_cache_page(vma, addr, page_to_pfn(page));
	return page;
}
#endif
1997
1998pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
1999 spinlock_t **ptl)
2000{
2001 pgd_t * pgd = pgd_offset(mm, addr);
2002 pud_t * pud = pud_alloc(mm, pgd, addr);
2003 if (pud) {
2004 pmd_t * pmd = pmd_alloc(mm, pud, addr);
2005 if (pmd) {
2006 VM_BUG_ON(pmd_trans_huge(*pmd));
2007 return pte_alloc_map_lock(mm, pmd, addr, ptl);
2008 }
2009 }
2010 return NULL;
2011}
2012
2013
2014
2015
2016
2017
2018
2019
2020static int insert_page(struct vm_area_struct *vma, unsigned long addr,
2021 struct page *page, pgprot_t prot)
2022{
2023 struct mm_struct *mm = vma->vm_mm;
2024 int retval;
2025 pte_t *pte;
2026 spinlock_t *ptl;
2027
2028 retval = -EINVAL;
2029 if (PageAnon(page))
2030 goto out;
2031 retval = -ENOMEM;
2032 flush_dcache_page(page);
2033 pte = get_locked_pte(mm, addr, &ptl);
2034 if (!pte)
2035 goto out;
2036 retval = -EBUSY;
2037 if (!pte_none(*pte))
2038 goto out_unlock;
2039
2040
2041 get_page(page);
2042 inc_mm_counter_fast(mm, MM_FILEPAGES);
2043 page_add_file_rmap(page);
2044 set_pte_at(mm, addr, pte, mk_pte(page, prot));
2045
2046 retval = 0;
2047 pte_unmap_unlock(pte, ptl);
2048 return retval;
2049out_unlock:
2050 pte_unmap_unlock(pte, ptl);
2051out:
2052 return retval;
2053}
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
2078 struct page *page)
2079{
2080 if (addr < vma->vm_start || addr >= vma->vm_end)
2081 return -EFAULT;
2082 if (!page_count(page))
2083 return -EINVAL;
2084 vma->vm_flags |= VM_INSERTPAGE;
2085 return insert_page(vma, addr, page, vma->vm_page_prot);
2086}
2087EXPORT_SYMBOL(vm_insert_page);
2088
2089static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2090 unsigned long pfn, pgprot_t prot)
2091{
2092 struct mm_struct *mm = vma->vm_mm;
2093 int retval;
2094 pte_t *pte, entry;
2095 spinlock_t *ptl;
2096
2097 retval = -ENOMEM;
2098 pte = get_locked_pte(mm, addr, &ptl);
2099 if (!pte)
2100 goto out;
2101 retval = -EBUSY;
2102 if (!pte_none(*pte))
2103 goto out_unlock;
2104
2105
2106 entry = pte_mkspecial(pfn_pte(pfn, prot));
2107 set_pte_at(mm, addr, pte, entry);
2108 update_mmu_cache(vma, addr, pte);
2109
2110 retval = 0;
2111out_unlock:
2112 pte_unmap_unlock(pte, ptl);
2113out:
2114 return retval;
2115}
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2135 unsigned long pfn)
2136{
2137 int ret;
2138 pgprot_t pgprot = vma->vm_page_prot;
2139
2140
2141
2142
2143
2144
2145 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
2146 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
2147 (VM_PFNMAP|VM_MIXEDMAP));
2148 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
2149 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
2150
2151 if (addr < vma->vm_start || addr >= vma->vm_end)
2152 return -EFAULT;
2153 if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE))
2154 return -EINVAL;
2155
2156 ret = insert_pfn(vma, addr, pfn, pgprot);
2157
2158 if (ret)
2159 untrack_pfn_vma(vma, pfn, PAGE_SIZE);
2160
2161 return ret;
2162}
2163EXPORT_SYMBOL(vm_insert_pfn);
2164
2165int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
2166 unsigned long pfn)
2167{
2168 BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
2169
2170 if (addr < vma->vm_start || addr >= vma->vm_end)
2171 return -EFAULT;
2172
2173
2174
2175
2176
2177
2178
2179
2180 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
2181 struct page *page;
2182
2183 page = pfn_to_page(pfn);
2184 return insert_page(vma, addr, page, vma->vm_page_prot);
2185 }
2186 return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
2187}
2188EXPORT_SYMBOL(vm_insert_mixed);
2189
2190
2191
2192
2193
2194
2195static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
2196 unsigned long addr, unsigned long end,
2197 unsigned long pfn, pgprot_t prot)
2198{
2199 pte_t *pte;
2200 spinlock_t *ptl;
2201
2202 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
2203 if (!pte)
2204 return -ENOMEM;
2205 arch_enter_lazy_mmu_mode();
2206 do {
2207 BUG_ON(!pte_none(*pte));
2208 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
2209 pfn++;
2210 } while (pte++, addr += PAGE_SIZE, addr != end);
2211 arch_leave_lazy_mmu_mode();
2212 pte_unmap_unlock(pte - 1, ptl);
2213 return 0;
2214}
2215
2216static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
2217 unsigned long addr, unsigned long end,
2218 unsigned long pfn, pgprot_t prot)
2219{
2220 pmd_t *pmd;
2221 unsigned long next;
2222
2223 pfn -= addr >> PAGE_SHIFT;
2224 pmd = pmd_alloc(mm, pud, addr);
2225 if (!pmd)
2226 return -ENOMEM;
2227 VM_BUG_ON(pmd_trans_huge(*pmd));
2228 do {
2229 next = pmd_addr_end(addr, end);
2230 if (remap_pte_range(mm, pmd, addr, next,
2231 pfn + (addr >> PAGE_SHIFT), prot))
2232 return -ENOMEM;
2233 } while (pmd++, addr = next, addr != end);
2234 return 0;
2235}
2236
2237static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
2238 unsigned long addr, unsigned long end,
2239 unsigned long pfn, pgprot_t prot)
2240{
2241 pud_t *pud;
2242 unsigned long next;
2243
2244 pfn -= addr >> PAGE_SHIFT;
2245 pud = pud_alloc(mm, pgd, addr);
2246 if (!pud)
2247 return -ENOMEM;
2248 do {
2249 next = pud_addr_end(addr, end);
2250 if (remap_pmd_range(mm, pud, addr, next,
2251 pfn + (addr >> PAGE_SHIFT), prot))
2252 return -ENOMEM;
2253 } while (pud++, addr = next, addr != end);
2254 return 0;
2255}
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
2268 unsigned long pfn, unsigned long size, pgprot_t prot)
2269{
2270 pgd_t *pgd;
2271 unsigned long next;
2272 unsigned long end = addr + PAGE_ALIGN(size);
2273 struct mm_struct *mm = vma->vm_mm;
2274 int err;
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294 if (addr == vma->vm_start && end == vma->vm_end) {
2295 vma->vm_pgoff = pfn;
2296 vma->vm_flags |= VM_PFN_AT_MMAP;
2297 } else if (is_cow_mapping(vma->vm_flags))
2298 return -EINVAL;
2299
2300 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
2301
2302 err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size));
2303 if (err) {
2304
2305
2306
2307
2308 vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP);
2309 vma->vm_flags &= ~VM_PFN_AT_MMAP;
2310 return -EINVAL;
2311 }
2312
2313 BUG_ON(addr >= end);
2314 pfn -= addr >> PAGE_SHIFT;
2315 pgd = pgd_offset(mm, addr);
2316 flush_cache_range(vma, addr, end);
2317 do {
2318 next = pgd_addr_end(addr, end);
2319 err = remap_pud_range(mm, pgd, addr, next,
2320 pfn + (addr >> PAGE_SHIFT), prot);
2321 if (err)
2322 break;
2323 } while (pgd++, addr = next, addr != end);
2324
2325 if (err)
2326 untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size));
2327
2328 return err;
2329}
2330EXPORT_SYMBOL(remap_pfn_range);
2331
2332static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
2333 unsigned long addr, unsigned long end,
2334 pte_fn_t fn, void *data)
2335{
2336 pte_t *pte;
2337 int err;
2338 pgtable_t token;
2339 spinlock_t *uninitialized_var(ptl);
2340
2341 pte = (mm == &init_mm) ?
2342 pte_alloc_kernel(pmd, addr) :
2343 pte_alloc_map_lock(mm, pmd, addr, &ptl);
2344 if (!pte)
2345 return -ENOMEM;
2346
2347 BUG_ON(pmd_huge(*pmd));
2348
2349 arch_enter_lazy_mmu_mode();
2350
2351 token = pmd_pgtable(*pmd);
2352
2353 do {
2354 err = fn(pte++, token, addr, data);
2355 if (err)
2356 break;
2357 } while (addr += PAGE_SIZE, addr != end);
2358
2359 arch_leave_lazy_mmu_mode();
2360
2361 if (mm != &init_mm)
2362 pte_unmap_unlock(pte-1, ptl);
2363 return err;
2364}
2365
2366static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
2367 unsigned long addr, unsigned long end,
2368 pte_fn_t fn, void *data)
2369{
2370 pmd_t *pmd;
2371 unsigned long next;
2372 int err;
2373
2374 BUG_ON(pud_huge(*pud));
2375
2376 pmd = pmd_alloc(mm, pud, addr);
2377 if (!pmd)
2378 return -ENOMEM;
2379 do {
2380 next = pmd_addr_end(addr, end);
2381 err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
2382 if (err)
2383 break;
2384 } while (pmd++, addr = next, addr != end);
2385 return err;
2386}
2387
2388static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
2389 unsigned long addr, unsigned long end,
2390 pte_fn_t fn, void *data)
2391{
2392 pud_t *pud;
2393 unsigned long next;
2394 int err;
2395
2396 pud = pud_alloc(mm, pgd, addr);
2397 if (!pud)
2398 return -ENOMEM;
2399 do {
2400 next = pud_addr_end(addr, end);
2401 err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
2402 if (err)
2403 break;
2404 } while (pud++, addr = next, addr != end);
2405 return err;
2406}
2407
2408
2409
2410
2411
2412int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2413 unsigned long size, pte_fn_t fn, void *data)
2414{
2415 pgd_t *pgd;
2416 unsigned long next;
2417 unsigned long end = addr + size;
2418 int err;
2419
2420 BUG_ON(addr >= end);
2421 pgd = pgd_offset(mm, addr);
2422 do {
2423 next = pgd_addr_end(addr, end);
2424 err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
2425 if (err)
2426 break;
2427 } while (pgd++, addr = next, addr != end);
2428
2429 return err;
2430}
2431EXPORT_SYMBOL_GPL(apply_to_page_range);
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
2443 pte_t *page_table, pte_t orig_pte)
2444{
2445 int same = 1;
2446#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
2447 if (sizeof(pte_t) > sizeof(unsigned long)) {
2448 spinlock_t *ptl = pte_lockptr(mm, pmd);
2449 spin_lock(ptl);
2450 same = pte_same(*page_table, orig_pte);
2451 spin_unlock(ptl);
2452 }
2453#endif
2454 pte_unmap(page_table);
2455 return same;
2456}
2457
2458static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
2459{
2460
2461
2462
2463
2464
2465
2466 if (unlikely(!src)) {
2467 void *kaddr = kmap_atomic(dst);
2468 void __user *uaddr = (void __user *)(va & PAGE_MASK);
2469
2470
2471
2472
2473
2474
2475
2476 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
2477 clear_page(kaddr);
2478 kunmap_atomic(kaddr);
2479 flush_dcache_page(dst);
2480 } else
2481 copy_user_highpage(dst, src, va, vma);
2482}
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
/*
 * Handle a write fault on a present PTE that does not allow the write.
 *
 * Entered with @page_table mapped and @ptl held; every exit path has
 * dropped that lock (see the __releases annotation).  Depending on the
 * page behind @orig_pte the function either
 *   - makes the existing PTE writable/dirty in place ("reuse"), or
 *   - allocates a new page, copies the old contents and installs the new
 *     page in place of the old one ("gotten").
 *
 * Returns a mask of VM_FAULT_* bits: VM_FAULT_WRITE is set once the write
 * has been made possible, VM_FAULT_OOM is returned on allocation/charge
 * failure, and the value from ->page_mkwrite() is returned when that
 * callback reports an error or VM_FAULT_NOPAGE.
 */
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		spinlock_t *ptl, pte_t orig_pte)
	__releases(ptl)
{
	struct page *old_page, *new_page;
	pte_t entry;
	int ret = 0;
	int page_mkwrite = 0;
	struct page *dirty_page = NULL;

	old_page = vm_normal_page(vma, address, orig_pte);
	if (!old_page) {
		/*
		 * No normal page behind the PTE.  A shared writable mapping
		 * just gets its PTE upgraded; anything else takes the copy
		 * path (with old_page == NULL).
		 */
		if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
				     (VM_WRITE|VM_SHARED))
			goto reuse;
		goto gotten;
	}

	/*
	 * Anonymous, non-KSM page: try to reuse it in place if
	 * reuse_swap_page() allows it.
	 */
	if (PageAnon(old_page) && !PageKsm(old_page)) {
		if (!trylock_page(old_page)) {
			/*
			 * Could not lock without sleeping: pin the page,
			 * drop the PTE lock, sleep on the page lock, then
			 * retake the PTE lock and recheck the PTE.
			 */
			page_cache_get(old_page);
			pte_unmap_unlock(page_table, ptl);
			lock_page(old_page);
			page_table = pte_offset_map_lock(mm, pmd, address,
							 &ptl);
			if (!pte_same(*page_table, orig_pte)) {
				/* PTE changed meanwhile; "unlock" drops the pin. */
				unlock_page(old_page);
				goto unlock;
			}
			page_cache_release(old_page);
		}
		if (reuse_swap_page(old_page)) {
			page_move_anon_rmap(old_page, vma, address);
			unlock_page(old_page);
			goto reuse;
		}
		unlock_page(old_page);
	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
					(VM_WRITE|VM_SHARED))) {
		/*
		 * Shared writable mapping: give the backing object a chance
		 * to react through ->page_mkwrite() before the PTE becomes
		 * writable.
		 */
		if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
			struct vm_fault vmf;
			int tmp;

			vmf.virtual_address = (void __user *)(address &
								PAGE_MASK);
			vmf.pgoff = old_page->index;
			vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
			vmf.page = old_page;

			/*
			 * Pin the page and drop the PTE lock before calling
			 * out; the PTE is revalidated further down.
			 */
			page_cache_get(old_page);
			pte_unmap_unlock(page_table, ptl);

			tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
			if (unlikely(tmp &
					(VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
				ret = tmp;
				goto unwritable_page;
			}
			if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
				/* Callback did not lock the page: do it here. */
				lock_page(old_page);
				if (!old_page->mapping) {
					/* Page lost its mapping: return 0. */
					ret = 0;
					unlock_page(old_page);
					goto unwritable_page;
				}
			} else
				VM_BUG_ON(!PageLocked(old_page));

			/*
			 * The PTE lock was dropped above, so take it again
			 * and make sure the PTE is still the one we faulted
			 * on.
			 */
			page_table = pte_offset_map_lock(mm, pmd, address,
							 &ptl);
			if (!pte_same(*page_table, orig_pte)) {
				unlock_page(old_page);
				goto unlock;
			}

			page_mkwrite = 1;
		}
		dirty_page = old_page;
		get_page(dirty_page);

reuse:
		/* Upgrade the existing PTE: young, dirty and maybe writable. */
		flush_cache_page(vma, address, pte_pfn(orig_pte));
		entry = pte_mkyoung(orig_pte);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		if (ptep_set_access_flags(vma, address, page_table, entry,1))
			update_mmu_cache(vma, address, page_table);
		pte_unmap_unlock(page_table, ptl);
		ret |= VM_FAULT_WRITE;

		/* Only the shared-file case has dirty accounting left to do. */
		if (!dirty_page)
			return ret;

		/*
		 * Without ->page_mkwrite() the page was not locked here, so
		 * wait for any current holder and then dirty/balance it.
		 */
		if (!page_mkwrite) {
			wait_on_page_locked(dirty_page);
			set_page_dirty_balance(dirty_page, page_mkwrite);
		}
		/* Drops the reference taken by get_page() above. */
		put_page(dirty_page);
		if (page_mkwrite) {
			struct address_space *mapping = dirty_page->mapping;

			/*
			 * Page is still locked and still pinned by the
			 * page_cache_get() done before ->page_mkwrite().
			 */
			set_page_dirty(dirty_page);
			unlock_page(dirty_page);
			page_cache_release(dirty_page);
			if (mapping)	{
				/* Throttle the dirtier, outside the page lock. */
				balance_dirty_pages_ratelimited(mapping);
			}
		}

		/* Update the file time after the page lock was released. */
		if (vma->vm_file)
			file_update_time(vma->vm_file);

		return ret;
	}

	/* Copy path: pin the old page while the PTE lock is dropped. */
	page_cache_get(old_page);
gotten:
	pte_unmap_unlock(page_table, ptl);

	if (unlikely(anon_vma_prepare(vma)))
		goto oom;

	if (is_zero_pfn(pte_pfn(orig_pte))) {
		/* Source is the zero page: a zeroed page needs no copy. */
		new_page = alloc_zeroed_user_highpage_movable(vma, address);
		if (!new_page)
			goto oom;
	} else {
		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
		if (!new_page)
			goto oom;
		cow_user_page(new_page, old_page, address, vma);
	}
	__SetPageUptodate(new_page);

	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
		goto oom_free_new;

	/* Retake the PTE lock and check that nothing changed meanwhile. */
	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (likely(pte_same(*page_table, orig_pte))) {
		/* Fix up the rss counters for the page type change. */
		if (old_page) {
			if (!PageAnon(old_page)) {
				dec_mm_counter_fast(mm, MM_FILEPAGES);
				inc_mm_counter_fast(mm, MM_ANONPAGES);
			}
		} else
			inc_mm_counter_fast(mm, MM_ANONPAGES);
		flush_cache_page(vma, address, pte_pfn(orig_pte));
		entry = mk_pte(new_page, vma->vm_page_prot);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		/*
		 * Clear and flush the old PTE first, then add the rmap for
		 * the new page, and only then install the new PTE.
		 */
		ptep_clear_flush(vma, address, page_table);
		page_add_new_anon_rmap(new_page, vma, address);
		/* The _notify variant is used to install the new entry. */
		set_pte_at_notify(mm, address, page_table, entry);
		update_mmu_cache(vma, address, page_table);
		if (old_page) {
			/* The old page is no longer mapped by this PTE. */
			page_remove_rmap(old_page);
		}

		/*
		 * The new page is now owned by the page table; point
		 * new_page at the old page so the release below drops one
		 * reference on the old page instead.
		 */
		new_page = old_page;
		ret |= VM_FAULT_WRITE;
	} else
		mem_cgroup_uncharge_page(new_page);

	if (new_page)
		page_cache_release(new_page);
unlock:
	pte_unmap_unlock(page_table, ptl);
	if (old_page) {
		/*
		 * After a successful copy in a VM_LOCKED VMA the old page is
		 * munlocked (under its page lock).
		 */
		if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
			lock_page(old_page);
			munlock_vma_page(old_page);
			unlock_page(old_page);
		}
		page_cache_release(old_page);
	}
	return ret;
oom_free_new:
	page_cache_release(new_page);
oom:
	if (old_page) {
		/*
		 * page_mkwrite set means the page would still be locked and
		 * carry one extra reference; undo both.
		 */
		if (page_mkwrite) {
			unlock_page(old_page);
			page_cache_release(old_page);
		}
		page_cache_release(old_page);
	}
	return VM_FAULT_OOM;

unwritable_page:
	/* Drop the pin taken before calling ->page_mkwrite(). */
	page_cache_release(old_page);
	return ret;
}
2785
/*
 * Zap the address range [@start_addr, @end_addr) of @vma, passing @details
 * through to zap_page_range_single().
 */
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
		unsigned long start_addr, unsigned long end_addr,
		struct zap_details *details)
{
	unsigned long len = end_addr - start_addr;

	zap_page_range_single(vma, start_addr, len, details);
}
2792
2793static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
2794 struct zap_details *details)
2795{
2796 struct vm_area_struct *vma;
2797 struct prio_tree_iter iter;
2798 pgoff_t vba, vea, zba, zea;
2799
2800 vma_prio_tree_foreach(vma, &iter, root,
2801 details->first_index, details->last_index) {
2802
2803 vba = vma->vm_pgoff;
2804 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
2805
2806 zba = details->first_index;
2807 if (zba < vba)
2808 zba = vba;
2809 zea = details->last_index;
2810 if (zea > vea)
2811 zea = vea;
2812
2813 unmap_mapping_range_vma(vma,
2814 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
2815 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
2816 details);
2817 }
2818}
2819
2820static inline void unmap_mapping_range_list(struct list_head *head,
2821 struct zap_details *details)
2822{
2823 struct vm_area_struct *vma;
2824
2825
2826
2827
2828
2829
2830
2831 list_for_each_entry(vma, head, shared.vm_set.list) {
2832 details->nonlinear_vma = vma;
2833 unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
2834 }
2835}
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851void unmap_mapping_range(struct address_space *mapping,
2852 loff_t const holebegin, loff_t const holelen, int even_cows)
2853{
2854 struct zap_details details;
2855 pgoff_t hba = holebegin >> PAGE_SHIFT;
2856 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2857
2858
2859 if (sizeof(holelen) > sizeof(hlen)) {
2860 long long holeend =
2861 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2862 if (holeend & ~(long long)ULONG_MAX)
2863 hlen = ULONG_MAX - hba + 1;
2864 }
2865
2866 details.check_mapping = even_cows? NULL: mapping;
2867 details.nonlinear_vma = NULL;
2868 details.first_index = hba;
2869 details.last_index = hba + hlen - 1;
2870 if (details.last_index < details.first_index)
2871 details.last_index = ULONG_MAX;
2872
2873
2874 mutex_lock(&mapping->i_mmap_mutex);
2875 if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
2876 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2877 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
2878 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
2879 mutex_unlock(&mapping->i_mmap_mutex);
2880}
2881EXPORT_SYMBOL(unmap_mapping_range);
2882
2883
2884
2885
2886
2887
/*
 * Handle a fault on a PTE that holds a swap (or other non-present,
 * non-none) entry.
 *
 * Entered with @page_table mapped but, judging by pte_unmap_same() taking
 * the lock itself, without the PTE lock held.  The page is looked up in
 * (or read into) the swap cache, locked, charged, and then mapped under
 * the PTE lock if the PTE still equals @orig_pte.
 *
 * Returns a mask of VM_FAULT_* bits (0 when the PTE changed under us and
 * there is nothing to do).
 */
static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		unsigned int flags, pte_t orig_pte)
{
	spinlock_t *ptl;
	struct page *page, *swapcache = NULL;
	swp_entry_t entry;
	pte_t pte;
	int locked;
	struct mem_cgroup *ptr;
	int exclusive = 0;
	int ret = 0;

	/* Unmaps page_table; bail out if the PTE already changed. */
	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
		goto out;

	entry = pte_to_swp_entry(orig_pte);
	if (unlikely(non_swap_entry(entry))) {
		/* Not a real swap entry: migration, hwpoison or garbage. */
		if (is_migration_entry(entry)) {
			migration_entry_wait(mm, pmd, address);
		} else if (is_hwpoison_entry(entry)) {
			ret = VM_FAULT_HWPOISON;
		} else {
			print_bad_pte(vma, address, orig_pte, NULL);
			ret = VM_FAULT_SIGBUS;
		}
		goto out;
	}
	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
	page = lookup_swap_cache(entry);
	if (!page) {
		/* Not in the swap cache: read it (and neighbours) in. */
		page = swapin_readahead(entry,
					GFP_HIGHUSER_MOVABLE, vma, address);
		if (!page) {
			/*
			 * Read-in failed.  Report OOM only if the PTE is
			 * still the same; otherwise return 0.
			 */
			page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
			if (likely(pte_same(*page_table, orig_pte)))
				ret = VM_FAULT_OOM;
			delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
			goto unlock;
		}

		/* The page had to be read in: count it as a major fault. */
		ret = VM_FAULT_MAJOR;
		count_vm_event(PGMAJFAULT);
		mem_cgroup_count_vm_event(mm, PGMAJFAULT);
	} else if (PageHWPoison(page)) {
		/* Swap-cache page is hardware-poisoned: do not map it. */
		ret = VM_FAULT_HWPOISON;
		delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
		goto out_release;
	}

	locked = lock_page_or_retry(page, mm, flags);

	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
	if (!locked) {
		ret |= VM_FAULT_RETRY;
		goto out_release;
	}

	/*
	 * With the page locked, make sure it is still the swap-cache page
	 * for this very entry; if not, back out with the current ret.
	 */
	if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
		goto out_page;

	if (ksm_might_need_to_copy(page, vma, address)) {
		/*
		 * KSM may hand back a different page; remember the original
		 * swap-cache page so it can be unlocked and released later.
		 */
		swapcache = page;
		page = ksm_does_need_to_copy(page, vma, address);

		if (unlikely(!page)) {
			ret = VM_FAULT_OOM;
			page = swapcache;
			swapcache = NULL;
			goto out_page;
		}
	}

	if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
		ret = VM_FAULT_OOM;
		goto out_page;
	}

	/* Take the PTE lock and verify the PTE did not change meanwhile. */
	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (unlikely(!pte_same(*page_table, orig_pte)))
		goto out_nomap;

	if (unlikely(!PageUptodate(page))) {
		ret = VM_FAULT_SIGBUS;
		goto out_nomap;
	}

	/*
	 * Point of no return: account the page, build the PTE and install
	 * it.  The rmap and the memcg charge are set up only after
	 * set_pte_at().
	 */
	inc_mm_counter_fast(mm, MM_ANONPAGES);
	dec_mm_counter_fast(mm, MM_SWAPENTS);
	pte = mk_pte(page, vma->vm_page_prot);
	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
		/*
		 * Write fault on a page we may reuse: map it writable right
		 * away and clear the flag so do_wp_page() is skipped below.
		 */
		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
		flags &= ~FAULT_FLAG_WRITE;
		ret |= VM_FAULT_WRITE;
		exclusive = 1;
	}
	flush_icache_page(vma, page);
	set_pte_at(mm, address, page_table, pte);
	do_page_add_anon_rmap(page, vma, address, exclusive);
	/* Make the charge obtained by try_charge_swapin above final. */
	mem_cgroup_commit_charge_swapin(page, ptr);

	swap_free(entry);
	if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
		try_to_free_swap(page);
	unlock_page(page);
	if (swapcache) {
		/*
		 * A KSM copy was mapped instead; the original swap-cache
		 * page was kept locked and referenced until now.
		 */
		unlock_page(swapcache);
		page_cache_release(swapcache);
	}

	if (flags & FAULT_FLAG_WRITE) {
		/*
		 * Still a write fault (page could not be reused above):
		 * let do_wp_page() break COW.  It releases ptl itself, hence
		 * the jump past the unlock.
		 */
		ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
		if (ret & VM_FAULT_ERROR)
			ret &= VM_FAULT_ERROR;
		goto out;
	}

	/* Only reached on the non-write path. */
	update_mmu_cache(vma, address, page_table);
unlock:
	pte_unmap_unlock(page_table, ptl);
out:
	return ret;
out_nomap:
	/* Undo the charge taken by try_charge_swapin, then drop the lock. */
	mem_cgroup_cancel_charge_swapin(ptr);
	pte_unmap_unlock(page_table, ptl);
out_page:
	unlock_page(page);
out_release:
	page_cache_release(page);
	if (swapcache) {
		unlock_page(swapcache);
		page_cache_release(swapcache);
	}
	return ret;
}
3065
3066
3067
3068
3069
3070
3071static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
3072{
3073 address &= PAGE_MASK;
3074 if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) {
3075 struct vm_area_struct *prev = vma->vm_prev;
3076
3077
3078
3079
3080
3081
3082
3083 if (prev && prev->vm_end == address)
3084 return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
3085
3086 expand_downwards(vma, address - PAGE_SIZE);
3087 }
3088 if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
3089 struct vm_area_struct *next = vma->vm_next;
3090
3091
3092 if (next && next->vm_start == address + PAGE_SIZE)
3093 return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
3094
3095 expand_upwards(vma, address + PAGE_SIZE);
3096 }
3097 return 0;
3098}
3099
3100
3101
3102
3103
3104
3105static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
3106 unsigned long address, pte_t *page_table, pmd_t *pmd,
3107 unsigned int flags)
3108{
3109 struct page *page;
3110 spinlock_t *ptl;
3111 pte_t entry;
3112
3113 pte_unmap(page_table);
3114
3115
3116 if (check_stack_guard_page(vma, address) < 0)
3117 return VM_FAULT_SIGBUS;
3118
3119
3120 if (!(flags & FAULT_FLAG_WRITE)) {
3121 entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
3122 vma->vm_page_prot));
3123 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
3124 if (!pte_none(*page_table))
3125 goto unlock;
3126 goto setpte;
3127 }
3128
3129
3130 if (unlikely(anon_vma_prepare(vma)))
3131 goto oom;
3132 page = alloc_zeroed_user_highpage_movable(vma, address);
3133 if (!page)
3134 goto oom;
3135 __SetPageUptodate(page);
3136
3137 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
3138 goto oom_free_page;
3139
3140 entry = mk_pte(page, vma->vm_page_prot);
3141 if (vma->vm_flags & VM_WRITE)
3142 entry = pte_mkwrite(pte_mkdirty(entry));
3143
3144 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
3145 if (!pte_none(*page_table))
3146 goto release;
3147
3148 inc_mm_counter_fast(mm, MM_ANONPAGES);
3149 page_add_new_anon_rmap(page, vma, address);
3150setpte:
3151 set_pte_at(mm, address, page_table, entry);
3152
3153
3154 update_mmu_cache(vma, address, page_table);
3155unlock:
3156 pte_unmap_unlock(page_table, ptl);
3157 return 0;
3158release:
3159 mem_cgroup_uncharge_page(page);
3160 page_cache_release(page);
3161 goto unlock;
3162oom_free_page:
3163 page_cache_release(page);
3164oom:
3165 return VM_FAULT_OOM;
3166}
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3182 unsigned long address, pmd_t *pmd,
3183 pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
3184{
3185 pte_t *page_table;
3186 spinlock_t *ptl;
3187 struct page *page;
3188 struct page *cow_page;
3189 pte_t entry;
3190 int anon = 0;
3191 struct page *dirty_page = NULL;
3192 struct vm_fault vmf;
3193 int ret;
3194 int page_mkwrite = 0;
3195
3196
3197
3198
3199
3200 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
3201
3202 if (unlikely(anon_vma_prepare(vma)))
3203 return VM_FAULT_OOM;
3204
3205 cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
3206 if (!cow_page)
3207 return VM_FAULT_OOM;
3208
3209 if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) {
3210 page_cache_release(cow_page);
3211 return VM_FAULT_OOM;
3212 }
3213 } else
3214 cow_page = NULL;
3215
3216 vmf.virtual_address = (void __user *)(address & PAGE_MASK);
3217 vmf.pgoff = pgoff;
3218 vmf.flags = flags;
3219 vmf.page = NULL;
3220
3221 ret = vma->vm_ops->fault(vma, &vmf);
3222 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
3223 VM_FAULT_RETRY)))
3224 goto uncharge_out;
3225
3226 if (unlikely(PageHWPoison(vmf.page))) {
3227 if (ret & VM_FAULT_LOCKED)
3228 unlock_page(vmf.page);
3229 ret = VM_FAULT_HWPOISON;
3230 goto uncharge_out;
3231 }
3232
3233
3234
3235
3236
3237 if (unlikely(!(ret & VM_FAULT_LOCKED)))
3238 lock_page(vmf.page);
3239 else
3240 VM_BUG_ON(!PageLocked(vmf.page));
3241
3242
3243
3244
3245 page = vmf.page;
3246 if (flags & FAULT_FLAG_WRITE) {
3247 if (!(vma->vm_flags & VM_SHARED)) {
3248 page = cow_page;
3249 anon = 1;
3250 copy_user_highpage(page, vmf.page, address, vma);
3251 __SetPageUptodate(page);
3252 } else {
3253
3254
3255
3256
3257
3258 if (vma->vm_ops->page_mkwrite) {
3259 int tmp;
3260
3261 unlock_page(page);
3262 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
3263 tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
3264 if (unlikely(tmp &
3265 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
3266 ret = tmp;
3267 goto unwritable_page;
3268 }
3269 if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
3270 lock_page(page);
3271 if (!page->mapping) {
3272 ret = 0;
3273 unlock_page(page);
3274 goto unwritable_page;
3275 }
3276 } else
3277 VM_BUG_ON(!PageLocked(page));
3278 page_mkwrite = 1;
3279 }
3280 }
3281
3282 }
3283
3284 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297 if (likely(pte_same(*page_table, orig_pte))) {
3298 flush_icache_page(vma, page);
3299 entry = mk_pte(page, vma->vm_page_prot);
3300 if (flags & FAULT_FLAG_WRITE)
3301 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
3302 if (anon) {
3303 inc_mm_counter_fast(mm, MM_ANONPAGES);
3304 page_add_new_anon_rmap(page, vma, address);
3305 } else {
3306 inc_mm_counter_fast(mm, MM_FILEPAGES);
3307 page_add_file_rmap(page);
3308 if (flags & FAULT_FLAG_WRITE) {
3309 dirty_page = page;
3310 get_page(dirty_page);
3311 }
3312 }
3313 set_pte_at(mm, address, page_table, entry);
3314
3315
3316 update_mmu_cache(vma, address, page_table);
3317 } else {
3318 if (cow_page)
3319 mem_cgroup_uncharge_page(cow_page);
3320 if (anon)
3321 page_cache_release(page);
3322 else
3323 anon = 1;
3324 }
3325
3326 pte_unmap_unlock(page_table, ptl);
3327
3328 if (dirty_page) {
3329 struct address_space *mapping = page->mapping;
3330
3331 if (set_page_dirty(dirty_page))
3332 page_mkwrite = 1;
3333 unlock_page(dirty_page);
3334 put_page(dirty_page);
3335 if (page_mkwrite && mapping) {
3336
3337
3338
3339
3340 balance_dirty_pages_ratelimited(mapping);
3341 }
3342
3343
3344 if (vma->vm_file)
3345 file_update_time(vma->vm_file);
3346 } else {
3347 unlock_page(vmf.page);
3348 if (anon)
3349 page_cache_release(vmf.page);
3350 }
3351
3352 return ret;
3353
3354unwritable_page:
3355 page_cache_release(page);
3356 return ret;
3357uncharge_out:
3358
3359 if (cow_page) {
3360 mem_cgroup_uncharge_page(cow_page);
3361 page_cache_release(cow_page);
3362 }
3363 return ret;
3364}
3365
3366static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3367 unsigned long address, pte_t *page_table, pmd_t *pmd,
3368 unsigned int flags, pte_t orig_pte)
3369{
3370 pgoff_t pgoff = (((address & PAGE_MASK)
3371 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
3372
3373 pte_unmap(page_table);
3374 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3375}
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3387 unsigned long address, pte_t *page_table, pmd_t *pmd,
3388 unsigned int flags, pte_t orig_pte)
3389{
3390 pgoff_t pgoff;
3391
3392 flags |= FAULT_FLAG_NONLINEAR;
3393
3394 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
3395 return 0;
3396
3397 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
3398
3399
3400
3401 print_bad_pte(vma, address, orig_pte, NULL);
3402 return VM_FAULT_SIGBUS;
3403 }
3404
3405 pgoff = pte_to_pgoff(orig_pte);
3406 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3407}
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422int handle_pte_fault(struct mm_struct *mm,
3423 struct vm_area_struct *vma, unsigned long address,
3424 pte_t *pte, pmd_t *pmd, unsigned int flags)
3425{
3426 pte_t entry;
3427 spinlock_t *ptl;
3428
3429 entry = *pte;
3430 if (!pte_present(entry)) {
3431 if (pte_none(entry)) {
3432 if (vma->vm_ops) {
3433 if (likely(vma->vm_ops->fault))
3434 return do_linear_fault(mm, vma, address,
3435 pte, pmd, flags, entry);
3436 }
3437 return do_anonymous_page(mm, vma, address,
3438 pte, pmd, flags);
3439 }
3440 if (pte_file(entry))
3441 return do_nonlinear_fault(mm, vma, address,
3442 pte, pmd, flags, entry);
3443 return do_swap_page(mm, vma, address,
3444 pte, pmd, flags, entry);
3445 }
3446
3447 ptl = pte_lockptr(mm, pmd);
3448 spin_lock(ptl);
3449 if (unlikely(!pte_same(*pte, entry)))
3450 goto unlock;
3451 if (flags & FAULT_FLAG_WRITE) {
3452 if (!pte_write(entry))
3453 return do_wp_page(mm, vma, address,
3454 pte, pmd, ptl, entry);
3455 entry = pte_mkdirty(entry);
3456 }
3457 entry = pte_mkyoung(entry);
3458 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
3459 update_mmu_cache(vma, address, pte);
3460 } else {
3461
3462
3463
3464
3465
3466
3467 if (flags & FAULT_FLAG_WRITE)
3468 flush_tlb_fix_spurious_fault(vma, address);
3469 }
3470unlock:
3471 pte_unmap_unlock(pte, ptl);
3472 return 0;
3473}
3474
3475
3476
3477
3478int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3479 unsigned long address, unsigned int flags)
3480{
3481 pgd_t *pgd;
3482 pud_t *pud;
3483 pmd_t *pmd;
3484 pte_t *pte;
3485
3486 __set_current_state(TASK_RUNNING);
3487
3488 count_vm_event(PGFAULT);
3489 mem_cgroup_count_vm_event(mm, PGFAULT);
3490
3491
3492 check_sync_rss_stat(current);
3493
3494 if (unlikely(is_vm_hugetlb_page(vma)))
3495 return hugetlb_fault(mm, vma, address, flags);
3496
3497retry:
3498 pgd = pgd_offset(mm, address);
3499 pud = pud_alloc(mm, pgd, address);
3500 if (!pud)
3501 return VM_FAULT_OOM;
3502 pmd = pmd_alloc(mm, pud, address);
3503 if (!pmd)
3504 return VM_FAULT_OOM;
3505 if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
3506 if (!vma->vm_ops)
3507 return do_huge_pmd_anonymous_page(mm, vma, address,
3508 pmd, flags);
3509 } else {
3510 pmd_t orig_pmd = *pmd;
3511 int ret;
3512
3513 barrier();
3514 if (pmd_trans_huge(orig_pmd)) {
3515 if (flags & FAULT_FLAG_WRITE &&
3516 !pmd_write(orig_pmd) &&
3517 !pmd_trans_splitting(orig_pmd)) {
3518 ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
3519 orig_pmd);
3520
3521
3522
3523
3524
3525 if (unlikely(ret & VM_FAULT_OOM))
3526 goto retry;
3527 return ret;
3528 }
3529 return 0;
3530 }
3531 }
3532
3533
3534
3535
3536
3537
3538 if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address))
3539 return VM_FAULT_OOM;
3540
3541 if (unlikely(pmd_trans_huge(*pmd)))
3542 return 0;
3543
3544
3545
3546
3547
3548
3549 pte = pte_offset_map(pmd, address);
3550
3551 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
3552}
3553
3554#ifndef __PAGETABLE_PUD_FOLDED
3555
3556
3557
3558
3559int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
3560{
3561 pud_t *new = pud_alloc_one(mm, address);
3562 if (!new)
3563 return -ENOMEM;
3564
3565 smp_wmb();
3566
3567 spin_lock(&mm->page_table_lock);
3568 if (pgd_present(*pgd))
3569 pud_free(mm, new);
3570 else
3571 pgd_populate(mm, pgd, new);
3572 spin_unlock(&mm->page_table_lock);
3573 return 0;
3574}
3575#endif
3576
3577#ifndef __PAGETABLE_PMD_FOLDED
3578
3579
3580
3581
3582int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
3583{
3584 pmd_t *new = pmd_alloc_one(mm, address);
3585 if (!new)
3586 return -ENOMEM;
3587
3588 smp_wmb();
3589
3590 spin_lock(&mm->page_table_lock);
3591#ifndef __ARCH_HAS_4LEVEL_HACK
3592 if (pud_present(*pud))
3593 pmd_free(mm, new);
3594 else
3595 pud_populate(mm, pud, new);
3596#else
3597 if (pgd_present(*pud))
3598 pmd_free(mm, new);
3599 else
3600 pgd_populate(mm, pud, new);
3601#endif
3602 spin_unlock(&mm->page_table_lock);
3603 return 0;
3604}
3605#endif
3606
3607int make_pages_present(unsigned long addr, unsigned long end)
3608{
3609 int ret, len, write;
3610 struct vm_area_struct * vma;
3611
3612 vma = find_vma(current->mm, addr);
3613 if (!vma)
3614 return -ENOMEM;
3615
3616
3617
3618
3619
3620 write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
3621 BUG_ON(addr >= end);
3622 BUG_ON(end > vma->vm_end);
3623 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
3624 ret = get_user_pages(current, current->mm, addr,
3625 len, write, 0, NULL, NULL);
3626 if (ret < 0)
3627 return ret;
3628 return ret == len ? 0 : -EFAULT;
3629}
3630
3631#if !defined(__HAVE_ARCH_GATE_AREA)
3632
3633#if defined(AT_SYSINFO_EHDR)
3634static struct vm_area_struct gate_vma;
3635
3636static int __init gate_vma_init(void)
3637{
3638 gate_vma.vm_mm = NULL;
3639 gate_vma.vm_start = FIXADDR_USER_START;
3640 gate_vma.vm_end = FIXADDR_USER_END;
3641 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
3642 gate_vma.vm_page_prot = __P101;
3643
3644 return 0;
3645}
3646__initcall(gate_vma_init);
3647#endif
3648
/*
 * Return the gate VMA, or NULL when AT_SYSINFO_EHDR is not defined for
 * this build.  @mm is not used by this generic implementation.
 */
struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
#ifndef AT_SYSINFO_EHDR
	return NULL;
#else
	return &gate_vma;
#endif
}
3657
/*
 * Return 1 if @addr lies in [FIXADDR_USER_START, FIXADDR_USER_END),
 * 0 otherwise.  Always 0 when AT_SYSINFO_EHDR is not defined.
 */
int in_gate_area_no_mm(unsigned long addr)
{
#ifdef AT_SYSINFO_EHDR
	return addr >= FIXADDR_USER_START && addr < FIXADDR_USER_END;
#else
	return 0;
#endif
}
3666
3667#endif
3668
3669static int __follow_pte(struct mm_struct *mm, unsigned long address,
3670 pte_t **ptepp, spinlock_t **ptlp)
3671{
3672 pgd_t *pgd;
3673 pud_t *pud;
3674 pmd_t *pmd;
3675 pte_t *ptep;
3676
3677 pgd = pgd_offset(mm, address);
3678 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
3679 goto out;
3680
3681 pud = pud_offset(pgd, address);
3682 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
3683 goto out;
3684
3685 pmd = pmd_offset(pud, address);
3686 VM_BUG_ON(pmd_trans_huge(*pmd));
3687 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
3688 goto out;
3689
3690
3691 if (pmd_huge(*pmd))
3692 goto out;
3693
3694 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
3695 if (!ptep)
3696 goto out;
3697 if (!pte_present(*ptep))
3698 goto unlock;
3699 *ptepp = ptep;
3700 return 0;
3701unlock:
3702 pte_unmap_unlock(ptep, *ptlp);
3703out:
3704 return -EINVAL;
3705}
3706
3707static inline int follow_pte(struct mm_struct *mm, unsigned long address,
3708 pte_t **ptepp, spinlock_t **ptlp)
3709{
3710 int res;
3711
3712
3713 (void) __cond_lock(*ptlp,
3714 !(res = __follow_pte(mm, address, ptepp, ptlp)));
3715 return res;
3716}
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728int follow_pfn(struct vm_area_struct *vma, unsigned long address,
3729 unsigned long *pfn)
3730{
3731 int ret = -EINVAL;
3732 spinlock_t *ptl;
3733 pte_t *ptep;
3734
3735 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3736 return ret;
3737
3738 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
3739 if (ret)
3740 return ret;
3741 *pfn = pte_pfn(*ptep);
3742 pte_unmap_unlock(ptep, ptl);
3743 return 0;
3744}
3745EXPORT_SYMBOL(follow_pfn);
3746
3747#ifdef CONFIG_HAVE_IOREMAP_PROT
3748int follow_phys(struct vm_area_struct *vma,
3749 unsigned long address, unsigned int flags,
3750 unsigned long *prot, resource_size_t *phys)
3751{
3752 int ret = -EINVAL;
3753 pte_t *ptep, pte;
3754 spinlock_t *ptl;
3755
3756 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3757 goto out;
3758
3759 if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
3760 goto out;
3761 pte = *ptep;
3762
3763 if ((flags & FOLL_WRITE) && !pte_write(pte))
3764 goto unlock;
3765
3766 *prot = pgprot_val(pte_pgprot(pte));
3767 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
3768
3769 ret = 0;
3770unlock:
3771 pte_unmap_unlock(ptep, ptl);
3772out:
3773 return ret;
3774}
3775
3776int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
3777 void *buf, int len, int write)
3778{
3779 resource_size_t phys_addr;
3780 unsigned long prot = 0;
3781 void __iomem *maddr;
3782 int offset = addr & (PAGE_SIZE-1);
3783
3784 if (follow_phys(vma, addr, write, &prot, &phys_addr))
3785 return -EINVAL;
3786
3787 maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
3788 if (write)
3789 memcpy_toio(maddr + offset, buf, len);
3790 else
3791 memcpy_fromio(buf, maddr + offset, len);
3792 iounmap(maddr);
3793
3794 return len;
3795}
3796#endif
3797
3798
3799
3800
3801
3802static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
3803 unsigned long addr, void *buf, int len, int write)
3804{
3805 struct vm_area_struct *vma;
3806 void *old_buf = buf;
3807
3808 down_read(&mm->mmap_sem);
3809
3810 while (len) {
3811 int bytes, ret, offset;
3812 void *maddr;
3813 struct page *page = NULL;
3814
3815 ret = get_user_pages(tsk, mm, addr, 1,
3816 write, 1, &page, &vma);
3817 if (ret <= 0) {
3818
3819
3820
3821
3822#ifdef CONFIG_HAVE_IOREMAP_PROT
3823 vma = find_vma(mm, addr);
3824 if (!vma || vma->vm_start > addr)
3825 break;
3826 if (vma->vm_ops && vma->vm_ops->access)
3827 ret = vma->vm_ops->access(vma, addr, buf,
3828 len, write);
3829 if (ret <= 0)
3830#endif
3831 break;
3832 bytes = ret;
3833 } else {
3834 bytes = len;
3835 offset = addr & (PAGE_SIZE-1);
3836 if (bytes > PAGE_SIZE-offset)
3837 bytes = PAGE_SIZE-offset;
3838
3839 maddr = kmap(page);
3840 if (write) {
3841 copy_to_user_page(vma, page, addr,
3842 maddr + offset, buf, bytes);
3843 set_page_dirty_lock(page);
3844 } else {
3845 copy_from_user_page(vma, page, addr,
3846 buf, maddr + offset, bytes);
3847 }
3848 kunmap(page);
3849 page_cache_release(page);
3850 }
3851 len -= bytes;
3852 buf += bytes;
3853 addr += bytes;
3854 }
3855 up_read(&mm->mmap_sem);
3856
3857 return buf - old_buf;
3858}
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870int access_remote_vm(struct mm_struct *mm, unsigned long addr,
3871 void *buf, int len, int write)
3872{
3873 return __access_remote_vm(NULL, mm, addr, buf, len, write);
3874}
3875
3876
3877
3878
3879
3880
/*
 * Access the address space of task @tsk.  A reference on the task's mm is
 * taken for the duration of the transfer.
 *
 * Returns the number of bytes transferred, 0 if the task has no mm.
 */
int access_process_vm(struct task_struct *tsk, unsigned long addr,
		void *buf, int len, int write)
{
	struct mm_struct *mm = get_task_mm(tsk);
	int copied = 0;

	if (mm) {
		copied = __access_remote_vm(tsk, mm, addr, buf, len, write);
		mmput(mm);
	}

	return copied;
}
3896
3897
3898
3899
3900void print_vma_addr(char *prefix, unsigned long ip)
3901{
3902 struct mm_struct *mm = current->mm;
3903 struct vm_area_struct *vma;
3904
3905
3906
3907
3908
3909 if (preempt_count())
3910 return;
3911
3912 down_read(&mm->mmap_sem);
3913 vma = find_vma(mm, ip);
3914 if (vma && vma->vm_file) {
3915 struct file *f = vma->vm_file;
3916 char *buf = (char *)__get_free_page(GFP_KERNEL);
3917 if (buf) {
3918 char *p, *s;
3919
3920 p = d_path(&f->f_path, buf, PAGE_SIZE);
3921 if (IS_ERR(p))
3922 p = "?";
3923 s = strrchr(p, '/');
3924 if (s)
3925 p = s+1;
3926 printk("%s%s[%lx+%lx]", prefix, p,
3927 vma->vm_start,
3928 vma->vm_end - vma->vm_start);
3929 free_page((unsigned long)buf);
3930 }
3931 }
3932 up_read(¤t->mm->mmap_sem);
3933}
3934
3935#ifdef CONFIG_PROVE_LOCKING
3936void might_fault(void)
3937{
3938
3939
3940
3941
3942
3943
3944 if (segment_eq(get_fs(), KERNEL_DS))
3945 return;
3946
3947 might_sleep();
3948
3949
3950
3951
3952
3953 if (!in_atomic() && current->mm)
3954 might_lock_read(¤t->mm->mmap_sem);
3955}
3956EXPORT_SYMBOL(might_fault);
3957#endif
3958
3959#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
3960static void clear_gigantic_page(struct page *page,
3961 unsigned long addr,
3962 unsigned int pages_per_huge_page)
3963{
3964 int i;
3965 struct page *p = page;
3966
3967 might_sleep();
3968 for (i = 0; i < pages_per_huge_page;
3969 i++, p = mem_map_next(p, page, i)) {
3970 cond_resched();
3971 clear_user_highpage(p, addr + i * PAGE_SIZE);
3972 }
3973}
3974void clear_huge_page(struct page *page,
3975 unsigned long addr, unsigned int pages_per_huge_page)
3976{
3977 int i;
3978
3979 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
3980 clear_gigantic_page(page, addr, pages_per_huge_page);
3981 return;
3982 }
3983
3984 might_sleep();
3985 for (i = 0; i < pages_per_huge_page; i++) {
3986 cond_resched();
3987 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
3988 }
3989}
3990
3991static void copy_user_gigantic_page(struct page *dst, struct page *src,
3992 unsigned long addr,
3993 struct vm_area_struct *vma,
3994 unsigned int pages_per_huge_page)
3995{
3996 int i;
3997 struct page *dst_base = dst;
3998 struct page *src_base = src;
3999
4000 for (i = 0; i < pages_per_huge_page; ) {
4001 cond_resched();
4002 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
4003
4004 i++;
4005 dst = mem_map_next(dst, dst_base, i);
4006 src = mem_map_next(src, src_base, i);
4007 }
4008}
4009
4010void copy_user_huge_page(struct page *dst, struct page *src,
4011 unsigned long addr, struct vm_area_struct *vma,
4012 unsigned int pages_per_huge_page)
4013{
4014 int i;
4015
4016 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
4017 copy_user_gigantic_page(dst, src, addr, vma,
4018 pages_per_huge_page);
4019 return;
4020 }
4021
4022 might_sleep();
4023 for (i = 0; i < pages_per_huge_page; i++) {
4024 cond_resched();
4025 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
4026 }
4027}
4028#endif
4029