1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41#include <linux/kernel_stat.h>
42#include <linux/mm.h>
43#include <linux/hugetlb.h>
44#include <linux/mman.h>
45#include <linux/swap.h>
46#include <linux/highmem.h>
47#include <linux/pagemap.h>
48#include <linux/ksm.h>
49#include <linux/rmap.h>
50#include <linux/export.h>
51#include <linux/delayacct.h>
52#include <linux/init.h>
53#include <linux/writeback.h>
54#include <linux/memcontrol.h>
55#include <linux/mmu_notifier.h>
56#include <linux/kallsyms.h>
57#include <linux/swapops.h>
58#include <linux/elf.h>
59#include <linux/gfp.h>
60
61#include <asm/io.h>
62#include <asm/pgalloc.h>
63#include <asm/uaccess.h>
64#include <asm/tlb.h>
65#include <asm/tlbflush.h>
66#include <asm/pgtable.h>
67
68#include "internal.h"
69
70#ifndef CONFIG_NEED_MULTIPLE_NODES
71
/*
 * Globals describing the page array.  This section is compiled only when
 * CONFIG_NEED_MULTIPLE_NODES is not set; nothing in this part of the file
 * assigns them.
 */
struct page *mem_map;
EXPORT_SYMBOL(mem_map);

unsigned long max_mapnr;
EXPORT_SYMBOL(max_mapnr);
77#endif
78
/* Exported page count; presumably filled in by platform setup code (not set here). */
unsigned long num_physpages;
EXPORT_SYMBOL(num_physpages);

/*
 * Exported address marker.
 * NOTE(review): looks like the end of directly mapped memory -- confirm
 * against the architecture code that assigns it.
 */
void *high_memory;
EXPORT_SYMBOL(high_memory);
91
92
93
94
95
96
97
98int randomize_va_space __read_mostly =
99#ifdef CONFIG_COMPAT_BRK
100 1;
101#else
102 2;
103#endif
104
105static int __init disable_randmaps(char *s)
106{
107 randomize_va_space = 0;
108 return 1;
109}
110__setup("norandmaps", disable_randmaps);
111
112unsigned long zero_pfn __read_mostly;
113unsigned long highest_memmap_pfn __read_mostly;
114
115
116
117
118static int __init init_zero_pfn(void)
119{
120 zero_pfn = page_to_pfn(ZERO_PAGE(0));
121 return 0;
122}
123core_initcall(init_zero_pfn);
124
125
126#if defined(SPLIT_RSS_COUNTING)
127
128void sync_mm_rss(struct mm_struct *mm)
129{
130 int i;
131
132 for (i = 0; i < NR_MM_COUNTERS; i++) {
133 if (current->rss_stat.count[i]) {
134 add_mm_counter(mm, i, current->rss_stat.count[i]);
135 current->rss_stat.count[i] = 0;
136 }
137 }
138 current->rss_stat.events = 0;
139}
140
141static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
142{
143 struct task_struct *task = current;
144
145 if (likely(task->mm == mm))
146 task->rss_stat.count[member] += val;
147 else
148 add_mm_counter(mm, member, val);
149}
150#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
151#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
152
153
154#define TASK_RSS_EVENTS_THRESH (64)
155static void check_sync_rss_stat(struct task_struct *task)
156{
157 if (unlikely(task != current))
158 return;
159 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
160 sync_mm_rss(task->mm);
161}
162#else
163
/*
 * Without SPLIT_RSS_COUNTING there is no task-local cache: the "fast"
 * helpers map straight onto the plain mm counter helpers.
 */
#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)

/* Nothing to synchronise in this configuration. */
static void check_sync_rss_stat(struct task_struct *task)
{
	/* intentionally empty */
}
170
171#endif
172
173#ifdef HAVE_GENERIC_MMU_GATHER
174
175static int tlb_next_batch(struct mmu_gather *tlb)
176{
177 struct mmu_gather_batch *batch;
178
179 batch = tlb->active;
180 if (batch->next) {
181 tlb->active = batch->next;
182 return 1;
183 }
184
185 batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
186 if (!batch)
187 return 0;
188
189 batch->next = NULL;
190 batch->nr = 0;
191 batch->max = MAX_GATHER_BATCH;
192
193 tlb->active->next = batch;
194 tlb->active = batch;
195
196 return 1;
197}
198
199
200
201
202
203
204void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
205{
206 tlb->mm = mm;
207
208 tlb->fullmm = fullmm;
209 tlb->start = -1UL;
210 tlb->end = 0;
211 tlb->need_flush = 0;
212 tlb->fast_mode = (num_possible_cpus() == 1);
213 tlb->local.next = NULL;
214 tlb->local.nr = 0;
215 tlb->local.max = ARRAY_SIZE(tlb->__pages);
216 tlb->active = &tlb->local;
217
218#ifdef CONFIG_HAVE_RCU_TABLE_FREE
219 tlb->batch = NULL;
220#endif
221}
222
223void tlb_flush_mmu(struct mmu_gather *tlb)
224{
225 struct mmu_gather_batch *batch;
226
227 if (!tlb->need_flush)
228 return;
229 tlb->need_flush = 0;
230 tlb_flush(tlb);
231#ifdef CONFIG_HAVE_RCU_TABLE_FREE
232 tlb_table_flush(tlb);
233#endif
234
235 if (tlb_fast_mode(tlb))
236 return;
237
238 for (batch = &tlb->local; batch; batch = batch->next) {
239 free_pages_and_swap_cache(batch->pages, batch->nr);
240 batch->nr = 0;
241 }
242 tlb->active = &tlb->local;
243}
244
245
246
247
248
249void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
250{
251 struct mmu_gather_batch *batch, *next;
252
253 tlb->start = start;
254 tlb->end = end;
255 tlb_flush_mmu(tlb);
256
257
258 check_pgt_cache();
259
260 for (batch = tlb->local.next; batch; batch = next) {
261 next = batch->next;
262 free_pages((unsigned long)batch, 0);
263 }
264 tlb->local.next = NULL;
265}
266
267
268
269
270
271
272
273int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
274{
275 struct mmu_gather_batch *batch;
276
277 VM_BUG_ON(!tlb->need_flush);
278
279 if (tlb_fast_mode(tlb)) {
280 free_page_and_swap_cache(page);
281 return 1;
282 }
283
284 batch = tlb->active;
285 batch->pages[batch->nr++] = page;
286 if (batch->nr == batch->max) {
287 if (!tlb_next_batch(tlb))
288 return 0;
289 batch = tlb->active;
290 }
291 VM_BUG_ON(batch->nr > batch->max);
292
293 return batch->max - batch->nr;
294}
295
296#endif
297
298#ifdef CONFIG_HAVE_RCU_TABLE_FREE
299
300
301
302
303
/*
 * Cross-CPU callback with an intentionally empty body; @arg is unused.
 * It is only invoked through smp_call_function() in tlb_remove_table_one().
 */
static void tlb_remove_table_smp_sync(void *arg)
{
	/* nothing to do: the call itself is the point */
}
308
309static void tlb_remove_table_one(void *table)
310{
311
312
313
314
315
316
317
318 smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
319 __tlb_remove_table(table);
320}
321
322static void tlb_remove_table_rcu(struct rcu_head *head)
323{
324 struct mmu_table_batch *batch;
325 int i;
326
327 batch = container_of(head, struct mmu_table_batch, rcu);
328
329 for (i = 0; i < batch->nr; i++)
330 __tlb_remove_table(batch->tables[i]);
331
332 free_page((unsigned long)batch);
333}
334
335void tlb_table_flush(struct mmu_gather *tlb)
336{
337 struct mmu_table_batch **batch = &tlb->batch;
338
339 if (*batch) {
340 call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
341 *batch = NULL;
342 }
343}
344
345void tlb_remove_table(struct mmu_gather *tlb, void *table)
346{
347 struct mmu_table_batch **batch = &tlb->batch;
348
349 tlb->need_flush = 1;
350
351
352
353
354
355 if (atomic_read(&tlb->mm->mm_users) < 2) {
356 __tlb_remove_table(table);
357 return;
358 }
359
360 if (*batch == NULL) {
361 *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
362 if (*batch == NULL) {
363 tlb_remove_table_one(table);
364 return;
365 }
366 (*batch)->nr = 0;
367 }
368 (*batch)->tables[(*batch)->nr++] = table;
369 if ((*batch)->nr == MAX_TABLE_BATCH)
370 tlb_table_flush(tlb);
371}
372
373#endif
374
375
376
377
378
379
380
381void pgd_clear_bad(pgd_t *pgd)
382{
383 pgd_ERROR(*pgd);
384 pgd_clear(pgd);
385}
386
387void pud_clear_bad(pud_t *pud)
388{
389 pud_ERROR(*pud);
390 pud_clear(pud);
391}
392
393void pmd_clear_bad(pmd_t *pmd)
394{
395 pmd_ERROR(*pmd);
396 pmd_clear(pmd);
397}
398
399
400
401
402
403static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
404 unsigned long addr)
405{
406 pgtable_t token = pmd_pgtable(*pmd);
407 pmd_clear(pmd);
408 pte_free_tlb(tlb, token, addr);
409 tlb->mm->nr_ptes--;
410}
411
412static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
413 unsigned long addr, unsigned long end,
414 unsigned long floor, unsigned long ceiling)
415{
416 pmd_t *pmd;
417 unsigned long next;
418 unsigned long start;
419
420 start = addr;
421 pmd = pmd_offset(pud, addr);
422 do {
423 next = pmd_addr_end(addr, end);
424 if (pmd_none_or_clear_bad(pmd))
425 continue;
426 free_pte_range(tlb, pmd, addr);
427 } while (pmd++, addr = next, addr != end);
428
429 start &= PUD_MASK;
430 if (start < floor)
431 return;
432 if (ceiling) {
433 ceiling &= PUD_MASK;
434 if (!ceiling)
435 return;
436 }
437 if (end - 1 > ceiling - 1)
438 return;
439
440 pmd = pmd_offset(pud, start);
441 pud_clear(pud);
442 pmd_free_tlb(tlb, pmd, start);
443}
444
445static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
446 unsigned long addr, unsigned long end,
447 unsigned long floor, unsigned long ceiling)
448{
449 pud_t *pud;
450 unsigned long next;
451 unsigned long start;
452
453 start = addr;
454 pud = pud_offset(pgd, addr);
455 do {
456 next = pud_addr_end(addr, end);
457 if (pud_none_or_clear_bad(pud))
458 continue;
459 free_pmd_range(tlb, pud, addr, next, floor, ceiling);
460 } while (pud++, addr = next, addr != end);
461
462 start &= PGDIR_MASK;
463 if (start < floor)
464 return;
465 if (ceiling) {
466 ceiling &= PGDIR_MASK;
467 if (!ceiling)
468 return;
469 }
470 if (end - 1 > ceiling - 1)
471 return;
472
473 pud = pud_offset(pgd, start);
474 pgd_clear(pgd);
475 pud_free_tlb(tlb, pud, start);
476}
477
478
479
480
481
482
483void free_pgd_range(struct mmu_gather *tlb,
484 unsigned long addr, unsigned long end,
485 unsigned long floor, unsigned long ceiling)
486{
487 pgd_t *pgd;
488 unsigned long next;
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516 addr &= PMD_MASK;
517 if (addr < floor) {
518 addr += PMD_SIZE;
519 if (!addr)
520 return;
521 }
522 if (ceiling) {
523 ceiling &= PMD_MASK;
524 if (!ceiling)
525 return;
526 }
527 if (end - 1 > ceiling - 1)
528 end -= PMD_SIZE;
529 if (addr > end - 1)
530 return;
531
532 pgd = pgd_offset(tlb->mm, addr);
533 do {
534 next = pgd_addr_end(addr, end);
535 if (pgd_none_or_clear_bad(pgd))
536 continue;
537 free_pud_range(tlb, pgd, addr, next, floor, ceiling);
538 } while (pgd++, addr = next, addr != end);
539}
540
541void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
542 unsigned long floor, unsigned long ceiling)
543{
544 while (vma) {
545 struct vm_area_struct *next = vma->vm_next;
546 unsigned long addr = vma->vm_start;
547
548
549
550
551
552 unlink_anon_vmas(vma);
553 unlink_file_vma(vma);
554
555 if (is_vm_hugetlb_page(vma)) {
556 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
557 floor, next? next->vm_start: ceiling);
558 } else {
559
560
561
562 while (next && next->vm_start <= vma->vm_end + PMD_SIZE
563 && !is_vm_hugetlb_page(next)) {
564 vma = next;
565 next = vma->vm_next;
566 unlink_anon_vmas(vma);
567 unlink_file_vma(vma);
568 }
569 free_pgd_range(tlb, addr, vma->vm_end,
570 floor, next? next->vm_start: ceiling);
571 }
572 vma = next;
573 }
574}
575
576int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
577 pmd_t *pmd, unsigned long address)
578{
579 pgtable_t new = pte_alloc_one(mm, address);
580 int wait_split_huge_page;
581 if (!new)
582 return -ENOMEM;
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597 smp_wmb();
598
599 spin_lock(&mm->page_table_lock);
600 wait_split_huge_page = 0;
601 if (likely(pmd_none(*pmd))) {
602 mm->nr_ptes++;
603 pmd_populate(mm, pmd, new);
604 new = NULL;
605 } else if (unlikely(pmd_trans_splitting(*pmd)))
606 wait_split_huge_page = 1;
607 spin_unlock(&mm->page_table_lock);
608 if (new)
609 pte_free(mm, new);
610 if (wait_split_huge_page)
611 wait_split_huge_page(vma->anon_vma, pmd);
612 return 0;
613}
614
615int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
616{
617 pte_t *new = pte_alloc_one_kernel(&init_mm, address);
618 if (!new)
619 return -ENOMEM;
620
621 smp_wmb();
622
623 spin_lock(&init_mm.page_table_lock);
624 if (likely(pmd_none(*pmd))) {
625 pmd_populate_kernel(&init_mm, pmd, new);
626 new = NULL;
627 } else
628 VM_BUG_ON(pmd_trans_splitting(*pmd));
629 spin_unlock(&init_mm.page_table_lock);
630 if (new)
631 pte_free_kernel(&init_mm, new);
632 return 0;
633}
634
635static inline void init_rss_vec(int *rss)
636{
637 memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
638}
639
640static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
641{
642 int i;
643
644 if (current->mm == mm)
645 sync_mm_rss(mm);
646 for (i = 0; i < NR_MM_COUNTERS; i++)
647 if (rss[i])
648 add_mm_counter(mm, i, rss[i]);
649}
650
651
652
653
654
655
656
657
658static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
659 pte_t pte, struct page *page)
660{
661 pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
662 pud_t *pud = pud_offset(pgd, addr);
663 pmd_t *pmd = pmd_offset(pud, addr);
664 struct address_space *mapping;
665 pgoff_t index;
666 static unsigned long resume;
667 static unsigned long nr_shown;
668 static unsigned long nr_unshown;
669
670
671
672
673
674 if (nr_shown == 60) {
675 if (time_before(jiffies, resume)) {
676 nr_unshown++;
677 return;
678 }
679 if (nr_unshown) {
680 printk(KERN_ALERT
681 "BUG: Bad page map: %lu messages suppressed\n",
682 nr_unshown);
683 nr_unshown = 0;
684 }
685 nr_shown = 0;
686 }
687 if (nr_shown++ == 0)
688 resume = jiffies + 60 * HZ;
689
690 mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
691 index = linear_page_index(vma, addr);
692
693 printk(KERN_ALERT
694 "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
695 current->comm,
696 (long long)pte_val(pte), (long long)pmd_val(*pmd));
697 if (page)
698 dump_page(page);
699 printk(KERN_ALERT
700 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
701 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
702
703
704
705 if (vma->vm_ops)
706 print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n",
707 (unsigned long)vma->vm_ops->fault);
708 if (vma->vm_file && vma->vm_file->f_op)
709 print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n",
710 (unsigned long)vma->vm_file->f_op->mmap);
711 dump_stack();
712 add_taint(TAINT_BAD_PAGE);
713}
714
715static inline bool is_cow_mapping(vm_flags_t flags)
716{
717 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
718}
719
720#ifndef is_zero_pfn
721static inline int is_zero_pfn(unsigned long pfn)
722{
723 return pfn == zero_pfn;
724}
725#endif
726
727#ifndef my_zero_pfn
728static inline unsigned long my_zero_pfn(unsigned long addr)
729{
730 return zero_pfn;
731}
732#endif
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776#ifdef __HAVE_ARCH_PTE_SPECIAL
777# define HAVE_PTE_SPECIAL 1
778#else
779# define HAVE_PTE_SPECIAL 0
780#endif
781struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
782 pte_t pte)
783{
784 unsigned long pfn = pte_pfn(pte);
785
786 if (HAVE_PTE_SPECIAL) {
787 if (likely(!pte_special(pte)))
788 goto check_pfn;
789 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
790 return NULL;
791 if (!is_zero_pfn(pfn))
792 print_bad_pte(vma, addr, pte, NULL);
793 return NULL;
794 }
795
796
797
798 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
799 if (vma->vm_flags & VM_MIXEDMAP) {
800 if (!pfn_valid(pfn))
801 return NULL;
802 goto out;
803 } else {
804 unsigned long off;
805 off = (addr - vma->vm_start) >> PAGE_SHIFT;
806 if (pfn == vma->vm_pgoff + off)
807 return NULL;
808 if (!is_cow_mapping(vma->vm_flags))
809 return NULL;
810 }
811 }
812
813 if (is_zero_pfn(pfn))
814 return NULL;
815check_pfn:
816 if (unlikely(pfn > highest_memmap_pfn)) {
817 print_bad_pte(vma, addr, pte, NULL);
818 return NULL;
819 }
820
821
822
823
824
825out:
826 return pfn_to_page(pfn);
827}
828
829
830
831
832
833
834
835static inline unsigned long
836copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
837 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
838 unsigned long addr, int *rss)
839{
840 unsigned long vm_flags = vma->vm_flags;
841 pte_t pte = *src_pte;
842 struct page *page;
843
844
845 if (unlikely(!pte_present(pte))) {
846 if (!pte_file(pte)) {
847 swp_entry_t entry = pte_to_swp_entry(pte);
848
849 if (swap_duplicate(entry) < 0)
850 return entry.val;
851
852
853 if (unlikely(list_empty(&dst_mm->mmlist))) {
854 spin_lock(&mmlist_lock);
855 if (list_empty(&dst_mm->mmlist))
856 list_add(&dst_mm->mmlist,
857 &src_mm->mmlist);
858 spin_unlock(&mmlist_lock);
859 }
860 if (likely(!non_swap_entry(entry)))
861 rss[MM_SWAPENTS]++;
862 else if (is_migration_entry(entry)) {
863 page = migration_entry_to_page(entry);
864
865 if (PageAnon(page))
866 rss[MM_ANONPAGES]++;
867 else
868 rss[MM_FILEPAGES]++;
869
870 if (is_write_migration_entry(entry) &&
871 is_cow_mapping(vm_flags)) {
872
873
874
875
876 make_migration_entry_read(&entry);
877 pte = swp_entry_to_pte(entry);
878 set_pte_at(src_mm, addr, src_pte, pte);
879 }
880 }
881 }
882 goto out_set_pte;
883 }
884
885
886
887
888
889 if (is_cow_mapping(vm_flags)) {
890 ptep_set_wrprotect(src_mm, addr, src_pte);
891 pte = pte_wrprotect(pte);
892 }
893
894
895
896
897
898 if (vm_flags & VM_SHARED)
899 pte = pte_mkclean(pte);
900 pte = pte_mkold(pte);
901
902 page = vm_normal_page(vma, addr, pte);
903 if (page) {
904 get_page(page);
905 page_dup_rmap(page);
906 if (PageAnon(page))
907 rss[MM_ANONPAGES]++;
908 else
909 rss[MM_FILEPAGES]++;
910 }
911
912out_set_pte:
913 set_pte_at(dst_mm, addr, dst_pte, pte);
914 return 0;
915}
916
917int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
918 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
919 unsigned long addr, unsigned long end)
920{
921 pte_t *orig_src_pte, *orig_dst_pte;
922 pte_t *src_pte, *dst_pte;
923 spinlock_t *src_ptl, *dst_ptl;
924 int progress = 0;
925 int rss[NR_MM_COUNTERS];
926 swp_entry_t entry = (swp_entry_t){0};
927
928again:
929 init_rss_vec(rss);
930
931 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
932 if (!dst_pte)
933 return -ENOMEM;
934 src_pte = pte_offset_map(src_pmd, addr);
935 src_ptl = pte_lockptr(src_mm, src_pmd);
936 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
937 orig_src_pte = src_pte;
938 orig_dst_pte = dst_pte;
939 arch_enter_lazy_mmu_mode();
940
941 do {
942
943
944
945
946 if (progress >= 32) {
947 progress = 0;
948 if (need_resched() ||
949 spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
950 break;
951 }
952 if (pte_none(*src_pte)) {
953 progress++;
954 continue;
955 }
956 entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
957 vma, addr, rss);
958 if (entry.val)
959 break;
960 progress += 8;
961 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
962
963 arch_leave_lazy_mmu_mode();
964 spin_unlock(src_ptl);
965 pte_unmap(orig_src_pte);
966 add_mm_rss_vec(dst_mm, rss);
967 pte_unmap_unlock(orig_dst_pte, dst_ptl);
968 cond_resched();
969
970 if (entry.val) {
971 if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
972 return -ENOMEM;
973 progress = 0;
974 }
975 if (addr != end)
976 goto again;
977 return 0;
978}
979
980static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
981 pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
982 unsigned long addr, unsigned long end)
983{
984 pmd_t *src_pmd, *dst_pmd;
985 unsigned long next;
986
987 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
988 if (!dst_pmd)
989 return -ENOMEM;
990 src_pmd = pmd_offset(src_pud, addr);
991 do {
992 next = pmd_addr_end(addr, end);
993 if (pmd_trans_huge(*src_pmd)) {
994 int err;
995 VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
996 err = copy_huge_pmd(dst_mm, src_mm,
997 dst_pmd, src_pmd, addr, vma);
998 if (err == -ENOMEM)
999 return -ENOMEM;
1000 if (!err)
1001 continue;
1002
1003 }
1004 if (pmd_none_or_clear_bad(src_pmd))
1005 continue;
1006 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
1007 vma, addr, next))
1008 return -ENOMEM;
1009 } while (dst_pmd++, src_pmd++, addr = next, addr != end);
1010 return 0;
1011}
1012
1013static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1014 pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
1015 unsigned long addr, unsigned long end)
1016{
1017 pud_t *src_pud, *dst_pud;
1018 unsigned long next;
1019
1020 dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
1021 if (!dst_pud)
1022 return -ENOMEM;
1023 src_pud = pud_offset(src_pgd, addr);
1024 do {
1025 next = pud_addr_end(addr, end);
1026 if (pud_none_or_clear_bad(src_pud))
1027 continue;
1028 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
1029 vma, addr, next))
1030 return -ENOMEM;
1031 } while (dst_pud++, src_pud++, addr = next, addr != end);
1032 return 0;
1033}
1034
1035int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1036 struct vm_area_struct *vma)
1037{
1038 pgd_t *src_pgd, *dst_pgd;
1039 unsigned long next;
1040 unsigned long addr = vma->vm_start;
1041 unsigned long end = vma->vm_end;
1042 unsigned long mmun_start;
1043 unsigned long mmun_end;
1044 bool is_cow;
1045 int ret;
1046
1047
1048
1049
1050
1051
1052
1053 if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR |
1054 VM_PFNMAP | VM_MIXEDMAP))) {
1055 if (!vma->anon_vma)
1056 return 0;
1057 }
1058
1059 if (is_vm_hugetlb_page(vma))
1060 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
1061
1062 if (unlikely(vma->vm_flags & VM_PFNMAP)) {
1063
1064
1065
1066
1067 ret = track_pfn_copy(vma);
1068 if (ret)
1069 return ret;
1070 }
1071
1072
1073
1074
1075
1076
1077
1078 is_cow = is_cow_mapping(vma->vm_flags);
1079 mmun_start = addr;
1080 mmun_end = end;
1081 if (is_cow)
1082 mmu_notifier_invalidate_range_start(src_mm, mmun_start,
1083 mmun_end);
1084
1085 ret = 0;
1086 dst_pgd = pgd_offset(dst_mm, addr);
1087 src_pgd = pgd_offset(src_mm, addr);
1088 do {
1089 next = pgd_addr_end(addr, end);
1090 if (pgd_none_or_clear_bad(src_pgd))
1091 continue;
1092 if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
1093 vma, addr, next))) {
1094 ret = -ENOMEM;
1095 break;
1096 }
1097 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
1098
1099 if (is_cow)
1100 mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
1101 return ret;
1102}
1103
1104static unsigned long zap_pte_range(struct mmu_gather *tlb,
1105 struct vm_area_struct *vma, pmd_t *pmd,
1106 unsigned long addr, unsigned long end,
1107 struct zap_details *details)
1108{
1109 struct mm_struct *mm = tlb->mm;
1110 int force_flush = 0;
1111 int rss[NR_MM_COUNTERS];
1112 spinlock_t *ptl;
1113 pte_t *start_pte;
1114 pte_t *pte;
1115
1116again:
1117 init_rss_vec(rss);
1118 start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
1119 pte = start_pte;
1120 arch_enter_lazy_mmu_mode();
1121 do {
1122 pte_t ptent = *pte;
1123 if (pte_none(ptent)) {
1124 continue;
1125 }
1126
1127 if (pte_present(ptent)) {
1128 struct page *page;
1129
1130 page = vm_normal_page(vma, addr, ptent);
1131 if (unlikely(details) && page) {
1132
1133
1134
1135
1136
1137 if (details->check_mapping &&
1138 details->check_mapping != page->mapping)
1139 continue;
1140
1141
1142
1143
1144 if (details->nonlinear_vma &&
1145 (page->index < details->first_index ||
1146 page->index > details->last_index))
1147 continue;
1148 }
1149 ptent = ptep_get_and_clear_full(mm, addr, pte,
1150 tlb->fullmm);
1151 tlb_remove_tlb_entry(tlb, pte, addr);
1152 if (unlikely(!page))
1153 continue;
1154 if (unlikely(details) && details->nonlinear_vma
1155 && linear_page_index(details->nonlinear_vma,
1156 addr) != page->index)
1157 set_pte_at(mm, addr, pte,
1158 pgoff_to_pte(page->index));
1159 if (PageAnon(page))
1160 rss[MM_ANONPAGES]--;
1161 else {
1162 if (pte_dirty(ptent))
1163 set_page_dirty(page);
1164 if (pte_young(ptent) &&
1165 likely(!VM_SequentialReadHint(vma)))
1166 mark_page_accessed(page);
1167 rss[MM_FILEPAGES]--;
1168 }
1169 page_remove_rmap(page);
1170 if (unlikely(page_mapcount(page) < 0))
1171 print_bad_pte(vma, addr, ptent, page);
1172 force_flush = !__tlb_remove_page(tlb, page);
1173 if (force_flush)
1174 break;
1175 continue;
1176 }
1177
1178
1179
1180
1181 if (unlikely(details))
1182 continue;
1183 if (pte_file(ptent)) {
1184 if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
1185 print_bad_pte(vma, addr, ptent, NULL);
1186 } else {
1187 swp_entry_t entry = pte_to_swp_entry(ptent);
1188
1189 if (!non_swap_entry(entry))
1190 rss[MM_SWAPENTS]--;
1191 else if (is_migration_entry(entry)) {
1192 struct page *page;
1193
1194 page = migration_entry_to_page(entry);
1195
1196 if (PageAnon(page))
1197 rss[MM_ANONPAGES]--;
1198 else
1199 rss[MM_FILEPAGES]--;
1200 }
1201 if (unlikely(!free_swap_and_cache(entry)))
1202 print_bad_pte(vma, addr, ptent, NULL);
1203 }
1204 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1205 } while (pte++, addr += PAGE_SIZE, addr != end);
1206
1207 add_mm_rss_vec(mm, rss);
1208 arch_leave_lazy_mmu_mode();
1209 pte_unmap_unlock(start_pte, ptl);
1210
1211
1212
1213
1214
1215
1216 if (force_flush) {
1217 force_flush = 0;
1218
1219#ifdef HAVE_GENERIC_MMU_GATHER
1220 tlb->start = addr;
1221 tlb->end = end;
1222#endif
1223 tlb_flush_mmu(tlb);
1224 if (addr != end)
1225 goto again;
1226 }
1227
1228 return addr;
1229}
1230
1231static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1232 struct vm_area_struct *vma, pud_t *pud,
1233 unsigned long addr, unsigned long end,
1234 struct zap_details *details)
1235{
1236 pmd_t *pmd;
1237 unsigned long next;
1238
1239 pmd = pmd_offset(pud, addr);
1240 do {
1241 next = pmd_addr_end(addr, end);
1242 if (pmd_trans_huge(*pmd)) {
1243 if (next - addr != HPAGE_PMD_SIZE) {
1244#ifdef CONFIG_DEBUG_VM
1245 if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
1246 pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
1247 __func__, addr, end,
1248 vma->vm_start,
1249 vma->vm_end);
1250 BUG();
1251 }
1252#endif
1253 split_huge_page_pmd(vma->vm_mm, pmd);
1254 } else if (zap_huge_pmd(tlb, vma, pmd, addr))
1255 goto next;
1256
1257 }
1258
1259
1260
1261
1262
1263
1264
1265 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1266 goto next;
1267 next = zap_pte_range(tlb, vma, pmd, addr, next, details);
1268next:
1269 cond_resched();
1270 } while (pmd++, addr = next, addr != end);
1271
1272 return addr;
1273}
1274
1275static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1276 struct vm_area_struct *vma, pgd_t *pgd,
1277 unsigned long addr, unsigned long end,
1278 struct zap_details *details)
1279{
1280 pud_t *pud;
1281 unsigned long next;
1282
1283 pud = pud_offset(pgd, addr);
1284 do {
1285 next = pud_addr_end(addr, end);
1286 if (pud_none_or_clear_bad(pud))
1287 continue;
1288 next = zap_pmd_range(tlb, vma, pud, addr, next, details);
1289 } while (pud++, addr = next, addr != end);
1290
1291 return addr;
1292}
1293
1294static void unmap_page_range(struct mmu_gather *tlb,
1295 struct vm_area_struct *vma,
1296 unsigned long addr, unsigned long end,
1297 struct zap_details *details)
1298{
1299 pgd_t *pgd;
1300 unsigned long next;
1301
1302 if (details && !details->check_mapping && !details->nonlinear_vma)
1303 details = NULL;
1304
1305 BUG_ON(addr >= end);
1306 mem_cgroup_uncharge_start();
1307 tlb_start_vma(tlb, vma);
1308 pgd = pgd_offset(vma->vm_mm, addr);
1309 do {
1310 next = pgd_addr_end(addr, end);
1311 if (pgd_none_or_clear_bad(pgd))
1312 continue;
1313 next = zap_pud_range(tlb, vma, pgd, addr, next, details);
1314 } while (pgd++, addr = next, addr != end);
1315 tlb_end_vma(tlb, vma);
1316 mem_cgroup_uncharge_end();
1317}
1318
1319
1320static void unmap_single_vma(struct mmu_gather *tlb,
1321 struct vm_area_struct *vma, unsigned long start_addr,
1322 unsigned long end_addr,
1323 struct zap_details *details)
1324{
1325 unsigned long start = max(vma->vm_start, start_addr);
1326 unsigned long end;
1327
1328 if (start >= vma->vm_end)
1329 return;
1330 end = min(vma->vm_end, end_addr);
1331 if (end <= vma->vm_start)
1332 return;
1333
1334 if (vma->vm_file)
1335 uprobe_munmap(vma, start, end);
1336
1337 if (unlikely(vma->vm_flags & VM_PFNMAP))
1338 untrack_pfn(vma, 0, 0);
1339
1340 if (start != end) {
1341 if (unlikely(is_vm_hugetlb_page(vma))) {
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353 if (vma->vm_file) {
1354 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
1355 __unmap_hugepage_range_final(tlb, vma, start, end, NULL);
1356 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
1357 }
1358 } else
1359 unmap_page_range(tlb, vma, start, end, details);
1360 }
1361}
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381void unmap_vmas(struct mmu_gather *tlb,
1382 struct vm_area_struct *vma, unsigned long start_addr,
1383 unsigned long end_addr)
1384{
1385 struct mm_struct *mm = vma->vm_mm;
1386
1387 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
1388 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
1389 unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
1390 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
1391}
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402void zap_page_range(struct vm_area_struct *vma, unsigned long start,
1403 unsigned long size, struct zap_details *details)
1404{
1405 struct mm_struct *mm = vma->vm_mm;
1406 struct mmu_gather tlb;
1407 unsigned long end = start + size;
1408
1409 lru_add_drain();
1410 tlb_gather_mmu(&tlb, mm, 0);
1411 update_hiwater_rss(mm);
1412 mmu_notifier_invalidate_range_start(mm, start, end);
1413 for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
1414 unmap_single_vma(&tlb, vma, start, end, details);
1415 mmu_notifier_invalidate_range_end(mm, start, end);
1416 tlb_finish_mmu(&tlb, start, end);
1417}
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
1429 unsigned long size, struct zap_details *details)
1430{
1431 struct mm_struct *mm = vma->vm_mm;
1432 struct mmu_gather tlb;
1433 unsigned long end = address + size;
1434
1435 lru_add_drain();
1436 tlb_gather_mmu(&tlb, mm, 0);
1437 update_hiwater_rss(mm);
1438 mmu_notifier_invalidate_range_start(mm, address, end);
1439 unmap_single_vma(&tlb, vma, address, end, details);
1440 mmu_notifier_invalidate_range_end(mm, address, end);
1441 tlb_finish_mmu(&tlb, address, end);
1442}
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1457 unsigned long size)
1458{
1459 if (address < vma->vm_start || address + size > vma->vm_end ||
1460 !(vma->vm_flags & VM_PFNMAP))
1461 return -1;
1462 zap_page_range_single(vma, address, size, NULL);
1463 return 0;
1464}
1465EXPORT_SYMBOL_GPL(zap_vma_ptes);
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
/**
 * follow_page - look up the page mapped at a user virtual address
 * @vma: vm_area_struct that maps @address
 * @address: virtual address to look up
 * @flags: FOLL_* flags modifying the lookup
 *
 * Walks the page tables of @vma->vm_mm from the pgd down to the pte and
 * hands back the struct page found at @address.  The flags tested here are
 * FOLL_GET (take a reference), FOLL_TOUCH (mark accessed, and dirty on a
 * write lookup), FOLL_WRITE (require a writable pte), FOLL_MLOCK,
 * FOLL_SPLIT (split a transparent huge pmd first) and FOLL_DUMP.
 *
 * Returns the page, NULL when no usable page is mapped (including a
 * read-only pte when FOLL_WRITE is set), or ERR_PTR(-EFAULT).
 */
struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
			unsigned int flags)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep, pte;
	spinlock_t *ptl;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	/* Let the hugetlb address lookup try first; an ERR_PTR means "not handled". */
	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
	if (!IS_ERR(page)) {
		/* The huge lookups below and here never take a reference. */
		BUG_ON(flags & FOLL_GET);
		goto out;
	}

	/* From here on a NULL page means "nothing mapped". */
	page = NULL;
	pgd = pgd_offset(mm, address);
	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
		goto no_page_table;

	pud = pud_offset(pgd, address);
	if (pud_none(*pud))
		goto no_page_table;
	/* hugetlb page mapped at pud level */
	if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
		BUG_ON(flags & FOLL_GET);
		page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
		goto out;
	}
	if (unlikely(pud_bad(*pud)))
		goto no_page_table;

	pmd = pmd_offset(pud, address);
	if (pmd_none(*pmd))
		goto no_page_table;
	/* hugetlb page mapped at pmd level */
	if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
		BUG_ON(flags & FOLL_GET);
		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
		goto out;
	}
	/* transparent huge pmd */
	if (pmd_trans_huge(*pmd)) {
		if (flags & FOLL_SPLIT) {
			/* Caller wants small pages: split, then do the pte walk. */
			split_huge_page_pmd(mm, pmd);
			goto split_fallthrough;
		}
		/* Re-test under page_table_lock: the pmd may have changed meanwhile. */
		spin_lock(&mm->page_table_lock);
		if (likely(pmd_trans_huge(*pmd))) {
			if (unlikely(pmd_trans_splitting(*pmd))) {
				/* A split is in progress: wait, then use the pte walk. */
				spin_unlock(&mm->page_table_lock);
				wait_split_huge_page(vma->anon_vma, pmd);
			} else {
				page = follow_trans_huge_pmd(vma, address,
							     pmd, flags);
				spin_unlock(&mm->page_table_lock);
				goto out;
			}
		} else
			spin_unlock(&mm->page_table_lock);
		/* fall through to the regular pte lookup */
	}
split_fallthrough:
	if (unlikely(pmd_bad(*pmd)))
		goto no_page_table;

	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);

	/* Work on a snapshot of the pte taken under the pte lock. */
	pte = *ptep;
	if (!pte_present(pte))
		goto no_page;
	/* Write lookup on a read-only pte: return NULL (page is still NULL). */
	if ((flags & FOLL_WRITE) && !pte_write(pte))
		goto unlock;

	page = vm_normal_page(vma, address, pte);
	if (unlikely(!page)) {
		/*
		 * No normal page: only the zero page is acceptable, and not
		 * at all for FOLL_DUMP.
		 */
		if ((flags & FOLL_DUMP) ||
		    !is_zero_pfn(pte_pfn(pte)))
			goto bad_page;
		page = pte_page(pte);
	}

	if (flags & FOLL_GET)
		get_page_foll(page);
	if (flags & FOLL_TOUCH) {
		/* Write lookup: dirty the page unless pte or page is already dirty. */
		if ((flags & FOLL_WRITE) &&
		    !pte_dirty(pte) && !PageDirty(page))
			set_page_dirty(page);
		mark_page_accessed(page);
	}
	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
		/*
		 * Only pages with a mapping are mlocked, and only when the
		 * page lock can be taken without waiting; an already locked
		 * page is skipped here.
		 */
		if (page->mapping && trylock_page(page)) {
			lru_add_drain();
			mlock_vma_page(page);
			unlock_page(page);
		}
	}
unlock:
	pte_unmap_unlock(ptep, ptl);
out:
	return page;

bad_page:
	pte_unmap_unlock(ptep, ptl);
	return ERR_PTR(-EFAULT);

no_page:
	pte_unmap_unlock(ptep, ptl);
	/* Non-present but non-empty pte: report "no page" (page is NULL). */
	if (!pte_none(pte))
		return page;

no_page_table:
	/*
	 * Nothing is mapped at all.  For FOLL_DUMP on a vma without a
	 * ->fault handler return an error instead of NULL, so that the
	 * caller does not go on to fault a page in.
	 */
	if ((flags & FOLL_DUMP) &&
	    (!vma->vm_ops || !vma->vm_ops->fault))
		return ERR_PTR(-EFAULT);
	return page;
}
1623
1624static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
1625{
1626 return stack_guard_page_start(vma, addr) ||
1627 stack_guard_page_end(vma, addr+PAGE_SIZE);
1628}
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
/**
 * __get_user_pages() - look up, and fault in where needed, a range of user pages
 * @tsk:	task whose maj_flt/min_flt counters are bumped on faults; may be NULL
 * @mm:		mm_struct whose address space is walked
 * @start:	first user virtual address
 * @nr_pages:	number of pages to process starting at @start
 * @gup_flags:	FOLL_* flags controlling lookup and faulting
 * @pages:	array receiving the page pointers; must be non-NULL exactly
 *		when FOLL_GET is set (enforced by VM_BUG_ON below)
 * @vmas:	optional array receiving the vma that covers each page
 * @nonblocking: optional; when non-NULL faults are allowed to ask for a
 *		retry, and *@nonblocking is cleared when one does
 *
 * Returns the number of pages processed, which can be fewer than
 * @nr_pages when an error or retry stops the walk part-way, or a negative
 * errno when the very first page fails.  Returns 0 for @nr_pages <= 0.
 */
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
		     unsigned long start, int nr_pages, unsigned int gup_flags,
		     struct page **pages, struct vm_area_struct **vmas,
		     int *nonblocking)
{
	int i;
	unsigned long vm_flags;

	if (nr_pages <= 0)
		return 0;

	VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));

	/*
	 * Build the set of vma flag bits of which at least one must be set:
	 * the first step picks read or write bits, the second keeps only the
	 * MAY* bits under FOLL_FORCE and only the plain bits otherwise.
	 */
	vm_flags  = (gup_flags & FOLL_WRITE) ?
			(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
	vm_flags &= (gup_flags & FOLL_FORCE) ?
			(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
	i = 0;

	do {
		struct vm_area_struct *vma;

		vma = find_extend_vma(mm, start);
		if (!vma && in_gate_area(mm, start)) {
			/* No vma, but the address is in the gate area: walk by hand. */
			unsigned long pg = start & PAGE_MASK;
			pgd_t *pgd;
			pud_t *pud;
			pmd_t *pmd;
			pte_t *pte;

			/* Writes to the gate area are refused. */
			if (gup_flags & FOLL_WRITE)
				return i ? : -EFAULT;
			if (pg > TASK_SIZE)
				pgd = pgd_offset_k(pg);
			else
				pgd = pgd_offset_gate(mm, pg);
			BUG_ON(pgd_none(*pgd));
			pud = pud_offset(pgd, pg);
			BUG_ON(pud_none(*pud));
			pmd = pmd_offset(pud, pg);
			if (pmd_none(*pmd))
				return i ? : -EFAULT;
			VM_BUG_ON(pmd_trans_huge(*pmd));
			pte = pte_offset_map(pmd, pg);
			if (pte_none(*pte)) {
				pte_unmap(pte);
				return i ? : -EFAULT;
			}
			vma = get_gate_vma(mm);
			if (pages) {
				struct page *page;

				page = vm_normal_page(vma, start, *pte);
				if (!page) {
					/* Accept the zero page, except for FOLL_DUMP. */
					if (!(gup_flags & FOLL_DUMP) &&
					     is_zero_pfn(pte_pfn(*pte)))
						page = pte_page(*pte);
					else {
						pte_unmap(pte);
						return i ? : -EFAULT;
					}
				}
				pages[i] = page;
				get_page(page);
			}
			pte_unmap(pte);
			goto next_page;
		}

		/* Reject missing vmas, IO/PFN mappings and insufficient permissions. */
		if (!vma ||
		    (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
		    !(vm_flags & vma->vm_flags))
			return i ? : -EFAULT;

		if (is_vm_hugetlb_page(vma)) {
			/* hugetlb helper advances start/nr_pages and returns the new count. */
			i = follow_hugetlb_page(mm, vma, pages, vmas,
					&start, &nr_pages, i, gup_flags);
			continue;
		}

		do {
			struct page *page;
			/* Per-page copy: FOLL_WRITE may be dropped below for this page only. */
			unsigned int foll_flags = gup_flags;

			/* Give up early when the calling task has a fatal signal pending. */
			if (unlikely(fatal_signal_pending(current)))
				return i ? i : -ERESTARTSYS;

			cond_resched();
			/* Fault the page in until follow_page() finds it or reports an error. */
			while (!(page = follow_page(vma, start, foll_flags))) {
				int ret;
				unsigned int fault_flags = 0;

				/* For mlock, do not fault in stack guard pages; skip them. */
				if (foll_flags & FOLL_MLOCK) {
					if (stack_guard_page(vma, start))
						goto next_page;
				}
				if (foll_flags & FOLL_WRITE)
					fault_flags |= FAULT_FLAG_WRITE;
				if (nonblocking)
					fault_flags |= FAULT_FLAG_ALLOW_RETRY;
				if (foll_flags & FOLL_NOWAIT)
					fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT);

				ret = handle_mm_fault(mm, vma, start,
							fault_flags);

				if (ret & VM_FAULT_ERROR) {
					if (ret & VM_FAULT_OOM)
						return i ? i : -ENOMEM;
					if (ret & (VM_FAULT_HWPOISON |
						   VM_FAULT_HWPOISON_LARGE)) {
						if (i)
							return i;
						else if (gup_flags & FOLL_HWPOISON)
							return -EHWPOISON;
						else
							return -EFAULT;
					}
					if (ret & VM_FAULT_SIGBUS)
						return i ? i : -EFAULT;
					/* Any other error bit is unexpected here. */
					BUG();
				}

				if (tsk) {
					if (ret & VM_FAULT_MAJOR)
						tsk->maj_flt++;
					else
						tsk->min_flt++;
				}

				if (ret & VM_FAULT_RETRY) {
					/* Tell the caller a retry was requested and stop here. */
					if (nonblocking)
						*nonblocking = 0;
					return i;
				}

				/*
				 * The fault reported a completed write
				 * (VM_FAULT_WRITE) but the vma itself is not
				 * VM_WRITE, so the pte can stay read-only.
				 * Drop FOLL_WRITE so the follow_page() retry
				 * no longer insists on a writable pte.
				 * NOTE(review): after this the retry does not
				 * verify that the COW really happened;
				 * upstream later hardened this pattern --
				 * confirm whether this tree needs that fix.
				 */
				if ((ret & VM_FAULT_WRITE) &&
				    !(vma->vm_flags & VM_WRITE))
					foll_flags &= ~FOLL_WRITE;

				cond_resched();
			}
			if (IS_ERR(page))
				return i ? i : PTR_ERR(page);
			if (pages) {
				pages[i] = page;

				flush_anon_page(vma, page, start);
				flush_dcache_page(page);
			}
next_page:
			if (vmas)
				vmas[i] = vma;
			i++;
			start += PAGE_SIZE;
			nr_pages--;
		} while (nr_pages && start < vma->vm_end);
	} while (nr_pages);
	return i;
}
EXPORT_SYMBOL(__get_user_pages);
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
1891 unsigned long address, unsigned int fault_flags)
1892{
1893 struct vm_area_struct *vma;
1894 int ret;
1895
1896 vma = find_extend_vma(mm, address);
1897 if (!vma || address < vma->vm_start)
1898 return -EFAULT;
1899
1900 ret = handle_mm_fault(mm, vma, address, fault_flags);
1901 if (ret & VM_FAULT_ERROR) {
1902 if (ret & VM_FAULT_OOM)
1903 return -ENOMEM;
1904 if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
1905 return -EHWPOISON;
1906 if (ret & VM_FAULT_SIGBUS)
1907 return -EFAULT;
1908 BUG();
1909 }
1910 if (tsk) {
1911 if (ret & VM_FAULT_MAJOR)
1912 tsk->maj_flt++;
1913 else
1914 tsk->min_flt++;
1915 }
1916 return 0;
1917}
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1971 unsigned long start, int nr_pages, int write, int force,
1972 struct page **pages, struct vm_area_struct **vmas)
1973{
1974 int flags = FOLL_TOUCH;
1975
1976 if (pages)
1977 flags |= FOLL_GET;
1978 if (write)
1979 flags |= FOLL_WRITE;
1980 if (force)
1981 flags |= FOLL_FORCE;
1982
1983 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
1984 NULL);
1985}
1986EXPORT_SYMBOL(get_user_pages);
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
#ifdef CONFIG_ELF_CORE
/*
 * get_dump_page() - fetch one page of the current task for dumping
 * @addr: user address of the page
 *
 * Looks the page up in current->mm with FOLL_FORCE | FOLL_DUMP | FOLL_GET.
 * Returns the page with a reference held after flushing its cache alias,
 * or NULL when __get_user_pages() did not deliver a page.
 */
struct page *get_dump_page(unsigned long addr)
{
	struct vm_area_struct *vma;
	struct page *page;
	int got;

	got = __get_user_pages(current, current->mm, addr, 1,
			       FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
			       NULL);
	if (got < 1)
		return NULL;

	flush_cache_page(vma, addr, page_to_pfn(page));
	return page;
}
#endif /* CONFIG_ELF_CORE */
2016
2017pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
2018 spinlock_t **ptl)
2019{
2020 pgd_t * pgd = pgd_offset(mm, addr);
2021 pud_t * pud = pud_alloc(mm, pgd, addr);
2022 if (pud) {
2023 pmd_t * pmd = pmd_alloc(mm, pud, addr);
2024 if (pmd) {
2025 VM_BUG_ON(pmd_trans_huge(*pmd));
2026 return pte_alloc_map_lock(mm, pmd, addr, ptl);
2027 }
2028 }
2029 return NULL;
2030}
2031
2032
2033
2034
2035
2036
2037
2038
2039static int insert_page(struct vm_area_struct *vma, unsigned long addr,
2040 struct page *page, pgprot_t prot)
2041{
2042 struct mm_struct *mm = vma->vm_mm;
2043 int retval;
2044 pte_t *pte;
2045 spinlock_t *ptl;
2046
2047 retval = -EINVAL;
2048 if (PageAnon(page))
2049 goto out;
2050 retval = -ENOMEM;
2051 flush_dcache_page(page);
2052 pte = get_locked_pte(mm, addr, &ptl);
2053 if (!pte)
2054 goto out;
2055 retval = -EBUSY;
2056 if (!pte_none(*pte))
2057 goto out_unlock;
2058
2059
2060 get_page(page);
2061 inc_mm_counter_fast(mm, MM_FILEPAGES);
2062 page_add_file_rmap(page);
2063 set_pte_at(mm, addr, pte, mk_pte(page, prot));
2064
2065 retval = 0;
2066 pte_unmap_unlock(pte, ptl);
2067 return retval;
2068out_unlock:
2069 pte_unmap_unlock(pte, ptl);
2070out:
2071 return retval;
2072}
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
2102 struct page *page)
2103{
2104 if (addr < vma->vm_start || addr >= vma->vm_end)
2105 return -EFAULT;
2106 if (!page_count(page))
2107 return -EINVAL;
2108 if (!(vma->vm_flags & VM_MIXEDMAP)) {
2109 BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
2110 BUG_ON(vma->vm_flags & VM_PFNMAP);
2111 vma->vm_flags |= VM_MIXEDMAP;
2112 }
2113 return insert_page(vma, addr, page, vma->vm_page_prot);
2114}
2115EXPORT_SYMBOL(vm_insert_page);
2116
2117static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2118 unsigned long pfn, pgprot_t prot)
2119{
2120 struct mm_struct *mm = vma->vm_mm;
2121 int retval;
2122 pte_t *pte, entry;
2123 spinlock_t *ptl;
2124
2125 retval = -ENOMEM;
2126 pte = get_locked_pte(mm, addr, &ptl);
2127 if (!pte)
2128 goto out;
2129 retval = -EBUSY;
2130 if (!pte_none(*pte))
2131 goto out_unlock;
2132
2133
2134 entry = pte_mkspecial(pfn_pte(pfn, prot));
2135 set_pte_at(mm, addr, pte, entry);
2136 update_mmu_cache(vma, addr, pte);
2137
2138 retval = 0;
2139out_unlock:
2140 pte_unmap_unlock(pte, ptl);
2141out:
2142 return retval;
2143}
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2163 unsigned long pfn)
2164{
2165 int ret;
2166 pgprot_t pgprot = vma->vm_page_prot;
2167
2168
2169
2170
2171
2172
2173 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
2174 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
2175 (VM_PFNMAP|VM_MIXEDMAP));
2176 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
2177 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
2178
2179 if (addr < vma->vm_start || addr >= vma->vm_end)
2180 return -EFAULT;
2181 if (track_pfn_insert(vma, &pgprot, pfn))
2182 return -EINVAL;
2183
2184 ret = insert_pfn(vma, addr, pfn, pgprot);
2185
2186 return ret;
2187}
2188EXPORT_SYMBOL(vm_insert_pfn);
2189
2190int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
2191 unsigned long pfn)
2192{
2193 BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
2194
2195 if (addr < vma->vm_start || addr >= vma->vm_end)
2196 return -EFAULT;
2197
2198
2199
2200
2201
2202
2203
2204
2205 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
2206 struct page *page;
2207
2208 page = pfn_to_page(pfn);
2209 return insert_page(vma, addr, page, vma->vm_page_prot);
2210 }
2211 return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
2212}
2213EXPORT_SYMBOL(vm_insert_mixed);
2214
2215
2216
2217
2218
2219
2220static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
2221 unsigned long addr, unsigned long end,
2222 unsigned long pfn, pgprot_t prot)
2223{
2224 pte_t *pte;
2225 spinlock_t *ptl;
2226
2227 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
2228 if (!pte)
2229 return -ENOMEM;
2230 arch_enter_lazy_mmu_mode();
2231 do {
2232 BUG_ON(!pte_none(*pte));
2233 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
2234 pfn++;
2235 } while (pte++, addr += PAGE_SIZE, addr != end);
2236 arch_leave_lazy_mmu_mode();
2237 pte_unmap_unlock(pte - 1, ptl);
2238 return 0;
2239}
2240
2241static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
2242 unsigned long addr, unsigned long end,
2243 unsigned long pfn, pgprot_t prot)
2244{
2245 pmd_t *pmd;
2246 unsigned long next;
2247
2248 pfn -= addr >> PAGE_SHIFT;
2249 pmd = pmd_alloc(mm, pud, addr);
2250 if (!pmd)
2251 return -ENOMEM;
2252 VM_BUG_ON(pmd_trans_huge(*pmd));
2253 do {
2254 next = pmd_addr_end(addr, end);
2255 if (remap_pte_range(mm, pmd, addr, next,
2256 pfn + (addr >> PAGE_SHIFT), prot))
2257 return -ENOMEM;
2258 } while (pmd++, addr = next, addr != end);
2259 return 0;
2260}
2261
2262static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
2263 unsigned long addr, unsigned long end,
2264 unsigned long pfn, pgprot_t prot)
2265{
2266 pud_t *pud;
2267 unsigned long next;
2268
2269 pfn -= addr >> PAGE_SHIFT;
2270 pud = pud_alloc(mm, pgd, addr);
2271 if (!pud)
2272 return -ENOMEM;
2273 do {
2274 next = pud_addr_end(addr, end);
2275 if (remap_pmd_range(mm, pud, addr, next,
2276 pfn + (addr >> PAGE_SHIFT), prot))
2277 return -ENOMEM;
2278 } while (pud++, addr = next, addr != end);
2279 return 0;
2280}
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
2293 unsigned long pfn, unsigned long size, pgprot_t prot)
2294{
2295 pgd_t *pgd;
2296 unsigned long next;
2297 unsigned long end = addr + PAGE_ALIGN(size);
2298 struct mm_struct *mm = vma->vm_mm;
2299 int err;
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319 if (is_cow_mapping(vma->vm_flags)) {
2320 if (addr != vma->vm_start || end != vma->vm_end)
2321 return -EINVAL;
2322 vma->vm_pgoff = pfn;
2323 }
2324
2325 err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
2326 if (err)
2327 return -EINVAL;
2328
2329 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
2330
2331 BUG_ON(addr >= end);
2332 pfn -= addr >> PAGE_SHIFT;
2333 pgd = pgd_offset(mm, addr);
2334 flush_cache_range(vma, addr, end);
2335 do {
2336 next = pgd_addr_end(addr, end);
2337 err = remap_pud_range(mm, pgd, addr, next,
2338 pfn + (addr >> PAGE_SHIFT), prot);
2339 if (err)
2340 break;
2341 } while (pgd++, addr = next, addr != end);
2342
2343 if (err)
2344 untrack_pfn(vma, pfn, PAGE_ALIGN(size));
2345
2346 return err;
2347}
2348EXPORT_SYMBOL(remap_pfn_range);
2349
2350static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
2351 unsigned long addr, unsigned long end,
2352 pte_fn_t fn, void *data)
2353{
2354 pte_t *pte;
2355 int err;
2356 pgtable_t token;
2357 spinlock_t *uninitialized_var(ptl);
2358
2359 pte = (mm == &init_mm) ?
2360 pte_alloc_kernel(pmd, addr) :
2361 pte_alloc_map_lock(mm, pmd, addr, &ptl);
2362 if (!pte)
2363 return -ENOMEM;
2364
2365 BUG_ON(pmd_huge(*pmd));
2366
2367 arch_enter_lazy_mmu_mode();
2368
2369 token = pmd_pgtable(*pmd);
2370
2371 do {
2372 err = fn(pte++, token, addr, data);
2373 if (err)
2374 break;
2375 } while (addr += PAGE_SIZE, addr != end);
2376
2377 arch_leave_lazy_mmu_mode();
2378
2379 if (mm != &init_mm)
2380 pte_unmap_unlock(pte-1, ptl);
2381 return err;
2382}
2383
2384static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
2385 unsigned long addr, unsigned long end,
2386 pte_fn_t fn, void *data)
2387{
2388 pmd_t *pmd;
2389 unsigned long next;
2390 int err;
2391
2392 BUG_ON(pud_huge(*pud));
2393
2394 pmd = pmd_alloc(mm, pud, addr);
2395 if (!pmd)
2396 return -ENOMEM;
2397 do {
2398 next = pmd_addr_end(addr, end);
2399 err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
2400 if (err)
2401 break;
2402 } while (pmd++, addr = next, addr != end);
2403 return err;
2404}
2405
2406static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
2407 unsigned long addr, unsigned long end,
2408 pte_fn_t fn, void *data)
2409{
2410 pud_t *pud;
2411 unsigned long next;
2412 int err;
2413
2414 pud = pud_alloc(mm, pgd, addr);
2415 if (!pud)
2416 return -ENOMEM;
2417 do {
2418 next = pud_addr_end(addr, end);
2419 err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
2420 if (err)
2421 break;
2422 } while (pud++, addr = next, addr != end);
2423 return err;
2424}
2425
2426
2427
2428
2429
2430int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2431 unsigned long size, pte_fn_t fn, void *data)
2432{
2433 pgd_t *pgd;
2434 unsigned long next;
2435 unsigned long end = addr + size;
2436 int err;
2437
2438 BUG_ON(addr >= end);
2439 pgd = pgd_offset(mm, addr);
2440 do {
2441 next = pgd_addr_end(addr, end);
2442 err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
2443 if (err)
2444 break;
2445 } while (pgd++, addr = next, addr != end);
2446
2447 return err;
2448}
2449EXPORT_SYMBOL_GPL(apply_to_page_range);
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
2461 pte_t *page_table, pte_t orig_pte)
2462{
2463 int same = 1;
2464#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
2465 if (sizeof(pte_t) > sizeof(unsigned long)) {
2466 spinlock_t *ptl = pte_lockptr(mm, pmd);
2467 spin_lock(ptl);
2468 same = pte_same(*page_table, orig_pte);
2469 spin_unlock(ptl);
2470 }
2471#endif
2472 pte_unmap(page_table);
2473 return same;
2474}
2475
2476static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
2477{
2478
2479
2480
2481
2482
2483
2484 if (unlikely(!src)) {
2485 void *kaddr = kmap_atomic(dst);
2486 void __user *uaddr = (void __user *)(va & PAGE_MASK);
2487
2488
2489
2490
2491
2492
2493
2494 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
2495 clear_page(kaddr);
2496 kunmap_atomic(kaddr);
2497 flush_dcache_page(dst);
2498 } else
2499 copy_user_highpage(dst, src, va, vma);
2500}
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
/*
 * do_wp_page - handle a write fault on a present pte
 * @mm: mm_struct of the faulting address space
 * @vma: vma covering @address
 * @address: faulting user address
 * @page_table: mapped pte for @address
 * @pmd: pmd containing @page_table
 * @ptl: pte lock, held on entry
 * @orig_pte: pte value the caller observed
 *
 * Either makes the existing page writable in place ("reuse") or allocates
 * a new page, copies the old contents into it and installs it ("gotten").
 *
 * Entered with the pte mapped and @ptl held; every path drops the lock
 * before returning.  Returns VM_FAULT_* bits: VM_FAULT_WRITE when the
 * write was satisfied, VM_FAULT_OOM on allocation failure, the value from
 * ->page_mkwrite() when that refuses, or 0 when the pte changed under us.
 */
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		spinlock_t *ptl, pte_t orig_pte)
	__releases(ptl)
{
	struct page *old_page, *new_page = NULL;
	pte_t entry;
	int ret = 0;
	int page_mkwrite = 0;
	struct page *dirty_page = NULL;
	unsigned long mmun_start = 0;	/* For mmu_notifiers */
	unsigned long mmun_end = 0;	/* For mmu_notifiers */

	old_page = vm_normal_page(vma, address, orig_pte);
	if (!old_page) {
		/*
		 * No struct page behind the pte.  A shared writable mapping
		 * just has the pte made writable; anything else gets a
		 * private copy.
		 */
		if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
				     (VM_WRITE|VM_SHARED))
			goto reuse;
		goto gotten;
	}

	/*
	 * Anonymous, non-KSM page: it can be reused in place if
	 * reuse_swap_page() agrees, which needs the page lock.
	 */
	if (PageAnon(old_page) && !PageKsm(old_page)) {
		if (!trylock_page(old_page)) {
			/*
			 * Cannot sleep on the page lock with the pte lock
			 * held: pin the page, drop the pte lock, lock the
			 * page, then retake the pte lock and revalidate.
			 */
			page_cache_get(old_page);
			pte_unmap_unlock(page_table, ptl);
			lock_page(old_page);
			page_table = pte_offset_map_lock(mm, pmd, address,
							 &ptl);
			if (!pte_same(*page_table, orig_pte)) {
				unlock_page(old_page);
				goto unlock;
			}
			page_cache_release(old_page);
		}
		if (reuse_swap_page(old_page)) {
			/* Reusing in place: move the page's anon rmap to this vma. */
			page_move_anon_rmap(old_page, vma, address);
			unlock_page(old_page);
			goto reuse;
		}
		unlock_page(old_page);
	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
					(VM_WRITE|VM_SHARED))) {
		/*
		 * Shared writable mapping of a normal page: no copy is made,
		 * but the owner gets a chance to veto or prepare the write
		 * through ->page_mkwrite().
		 */
		if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
			struct vm_fault vmf;
			int tmp;

			vmf.virtual_address = (void __user *)(address &
								PAGE_MASK);
			vmf.pgoff = old_page->index;
			vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
			vmf.page = old_page;

			/*
			 * The callback runs without the pte lock, so pin the
			 * page first and revalidate the pte afterwards.
			 */
			page_cache_get(old_page);
			pte_unmap_unlock(page_table, ptl);

			tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
			if (unlikely(tmp &
					(VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
				ret = tmp;
				goto unwritable_page;
			}
			if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
				/* Callback did not lock the page: do it here. */
				lock_page(old_page);
				if (!old_page->mapping) {
					/* Page lost its mapping meanwhile: return 0 without mapping it. */
					ret = 0; /* retry the fault */
					unlock_page(old_page);
					goto unwritable_page;
				}
			} else
				VM_BUG_ON(!PageLocked(old_page));

			/*
			 * Retake the pte lock and make sure the pte is still
			 * the one that faulted.
			 */
			page_table = pte_offset_map_lock(mm, pmd, address,
							 &ptl);
			if (!pte_same(*page_table, orig_pte)) {
				unlock_page(old_page);
				goto unlock;
			}

			page_mkwrite = 1;
		}
		dirty_page = old_page;
		get_page(dirty_page);

reuse:
		/* Make the existing pte young, dirty and (if the vma allows) writable. */
		flush_cache_page(vma, address, pte_pfn(orig_pte));
		entry = pte_mkyoung(orig_pte);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		if (ptep_set_access_flags(vma, address, page_table, entry,1))
			update_mmu_cache(vma, address, page_table);
		pte_unmap_unlock(page_table, ptl);
		ret |= VM_FAULT_WRITE;

		if (!dirty_page)
			return ret;

		/*
		 * Shared file page: do the dirty accounting now that the pte
		 * lock is dropped.  Without ->page_mkwrite() the page was
		 * never locked here, so wait for any current holder and
		 * update the file time; with it the page is still locked
		 * and pinned from above.
		 */
		if (!page_mkwrite) {
			wait_on_page_locked(dirty_page);
			set_page_dirty_balance(dirty_page, page_mkwrite);
			/* file_update_time outside page_lock */
			if (vma->vm_file)
				file_update_time(vma->vm_file);
		}
		/* Drops the reference taken by get_page(dirty_page) above. */
		put_page(dirty_page);
		if (page_mkwrite) {
			/* Read the mapping while the page is still locked. */
			struct address_space *mapping = dirty_page->mapping;

			set_page_dirty(dirty_page);
			unlock_page(dirty_page);
			/* Drops the reference taken before ->page_mkwrite(). */
			page_cache_release(dirty_page);
			if (mapping)	{
				/* Throttle this writer against the dirty limits. */
				balance_dirty_pages_ratelimited(mapping);
			}
		}

		return ret;
	}

	/*
	 * Ok, we need to copy.  Pin the old page so it stays around while
	 * the pte lock is dropped.
	 */
	page_cache_get(old_page);
gotten:
	pte_unmap_unlock(page_table, ptl);

	if (unlikely(anon_vma_prepare(vma)))
		goto oom;

	if (is_zero_pfn(pte_pfn(orig_pte))) {
		/* Source is the zero page: a fresh zeroed page needs no copy. */
		new_page = alloc_zeroed_user_highpage_movable(vma, address);
		if (!new_page)
			goto oom;
	} else {
		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
		if (!new_page)
			goto oom;
		cow_user_page(new_page, old_page, address, vma);
	}
	__SetPageUptodate(new_page);

	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
		goto oom_free_new;

	mmun_start  = address & PAGE_MASK;
	mmun_end    = mmun_start + PAGE_SIZE;
	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	/*
	 * Re-check the pte - we dropped the lock
	 */
	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (likely(pte_same(*page_table, orig_pte))) {
		if (old_page) {
			/* A file page is being replaced by an anonymous one. */
			if (!PageAnon(old_page)) {
				dec_mm_counter_fast(mm, MM_FILEPAGES);
				inc_mm_counter_fast(mm, MM_ANONPAGES);
			}
		} else
			inc_mm_counter_fast(mm, MM_ANONPAGES);
		flush_cache_page(vma, address, pte_pfn(orig_pte));
		entry = mk_pte(new_page, vma->vm_page_prot);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		/*
		 * Clear and flush the old pte before the new one is
		 * installed.
		 */
		ptep_clear_flush(vma, address, page_table);
		page_add_new_anon_rmap(new_page, vma, address);
		/*
		 * Install the new pte through the notifying variant so that
		 * mmu notifier users see the change.
		 */
		set_pte_at_notify(mm, address, page_table, entry);
		update_mmu_cache(vma, address, page_table);
		if (old_page) {
			/*
			 * The old page is no longer mapped here; its rmap is
			 * dropped only after the new pte is in place.
			 */
			page_remove_rmap(old_page);
		}

		/*
		 * Free the old page: swapping the pointers makes the release
		 * below drop one reference on old_page and keep new_page.
		 */
		new_page = old_page;
		ret |= VM_FAULT_WRITE;
	} else
		mem_cgroup_uncharge_page(new_page);

	if (new_page)
		page_cache_release(new_page);
unlock:
	pte_unmap_unlock(page_table, ptl);
	/* Only set when the notifier range was actually started above. */
	if (mmun_end > mmun_start)
		mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
	if (old_page) {
		/*
		 * After a successful copy in a VM_LOCKED vma, munlock the
		 * old page now that this mapping no longer uses it.
		 */
		if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
			lock_page(old_page);	/* LRU manipulation */
			munlock_vma_page(old_page);
			unlock_page(old_page);
		}
		page_cache_release(old_page);
	}
	return ret;
oom_free_new:
	page_cache_release(new_page);
oom:
	if (old_page) {
		/* Undo the page lock and pin left over from ->page_mkwrite(). */
		if (page_mkwrite) {
			unlock_page(old_page);
			page_cache_release(old_page);
		}
		page_cache_release(old_page);
	}
	return VM_FAULT_OOM;

unwritable_page:
	/* Drop the pin taken before calling ->page_mkwrite(). */
	page_cache_release(old_page);
	return ret;
}
2810
/*
 * Zap the user mappings of @vma in [@start_addr, @end_addr), converting
 * the end address into the length that zap_page_range_single() expects.
 */
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
		unsigned long start_addr, unsigned long end_addr,
		struct zap_details *details)
{
	unsigned long len = end_addr - start_addr;

	zap_page_range_single(vma, start_addr, len, details);
}
2817
2818static inline void unmap_mapping_range_tree(struct rb_root *root,
2819 struct zap_details *details)
2820{
2821 struct vm_area_struct *vma;
2822 pgoff_t vba, vea, zba, zea;
2823
2824 vma_interval_tree_foreach(vma, root,
2825 details->first_index, details->last_index) {
2826
2827 vba = vma->vm_pgoff;
2828 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
2829
2830 zba = details->first_index;
2831 if (zba < vba)
2832 zba = vba;
2833 zea = details->last_index;
2834 if (zea > vea)
2835 zea = vea;
2836
2837 unmap_mapping_range_vma(vma,
2838 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
2839 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
2840 details);
2841 }
2842}
2843
2844static inline void unmap_mapping_range_list(struct list_head *head,
2845 struct zap_details *details)
2846{
2847 struct vm_area_struct *vma;
2848
2849
2850
2851
2852
2853
2854
2855 list_for_each_entry(vma, head, shared.nonlinear) {
2856 details->nonlinear_vma = vma;
2857 unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
2858 }
2859}
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875void unmap_mapping_range(struct address_space *mapping,
2876 loff_t const holebegin, loff_t const holelen, int even_cows)
2877{
2878 struct zap_details details;
2879 pgoff_t hba = holebegin >> PAGE_SHIFT;
2880 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2881
2882
2883 if (sizeof(holelen) > sizeof(hlen)) {
2884 long long holeend =
2885 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2886 if (holeend & ~(long long)ULONG_MAX)
2887 hlen = ULONG_MAX - hba + 1;
2888 }
2889
2890 details.check_mapping = even_cows? NULL: mapping;
2891 details.nonlinear_vma = NULL;
2892 details.first_index = hba;
2893 details.last_index = hba + hlen - 1;
2894 if (details.last_index < details.first_index)
2895 details.last_index = ULONG_MAX;
2896
2897
2898 mutex_lock(&mapping->i_mmap_mutex);
2899 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
2900 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2901 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
2902 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
2903 mutex_unlock(&mapping->i_mmap_mutex);
2904}
2905EXPORT_SYMBOL(unmap_mapping_range);
2906
2907
2908
2909
2910
2911
/*
 * do_swap_page - handle a fault on a non-present pte holding a swap entry
 * @mm: mm_struct of the faulting address space
 * @vma: vma covering @address
 * @address: faulting user address
 * @page_table: mapped (but not locked) pte for @address
 * @pmd: pmd containing @page_table
 * @flags: FAULT_FLAG_* flags of the fault
 * @orig_pte: pte value the caller observed
 *
 * Brings the page in from the swap cache or from swap, maps it at
 * @address and, for a write fault that could not reuse the page, chains
 * to do_wp_page().  @page_table is unmapped on entry by pte_unmap_same();
 * the pte lock is taken and released inside.
 *
 * Returns a mask of VM_FAULT_* bits (0 when there is nothing to report).
 */
static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		unsigned int flags, pte_t orig_pte)
{
	spinlock_t *ptl;
	struct page *page, *swapcache = NULL;
	swp_entry_t entry;
	pte_t pte;
	int locked;
	struct mem_cgroup *ptr;
	int exclusive = 0;
	int ret = 0;

	/* The pte changed since the caller read it: nothing to do here. */
	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
		goto out;

	entry = pte_to_swp_entry(orig_pte);
	if (unlikely(non_swap_entry(entry))) {
		/* Not a real swap entry: migration, hwpoison or a corrupt pte. */
		if (is_migration_entry(entry)) {
			migration_entry_wait(mm, pmd, address);
		} else if (is_hwpoison_entry(entry)) {
			ret = VM_FAULT_HWPOISON;
		} else {
			print_bad_pte(vma, address, orig_pte, NULL);
			ret = VM_FAULT_SIGBUS;
		}
		goto out;
	}
	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
	page = lookup_swap_cache(entry);
	if (!page) {
		/* Not in the swap cache: read it in. */
		page = swapin_readahead(entry,
					GFP_HIGHUSER_MOVABLE, vma, address);
		if (!page) {
			/*
			 * Read-in failed.  Report OOM only if the pte is
			 * still the swap entry; otherwise someone else has
			 * already handled the fault.
			 */
			page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
			if (likely(pte_same(*page_table, orig_pte)))
				ret = VM_FAULT_OOM;
			delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
			goto unlock;
		}

		/* Had to read the page from swap area: Major fault */
		ret = VM_FAULT_MAJOR;
		count_vm_event(PGMAJFAULT);
		mem_cgroup_count_vm_event(mm, PGMAJFAULT);
	} else if (PageHWPoison(page)) {
		/*
		 * The page found in the swap cache is hardware-poisoned:
		 * fail the fault instead of mapping it.
		 */
		ret = VM_FAULT_HWPOISON;
		delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
		goto out_release;
	}

	locked = lock_page_or_retry(page, mm, flags);

	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
	if (!locked) {
		ret |= VM_FAULT_RETRY;
		goto out_release;
	}

	/*
	 * With the page locked, make sure it is still the swap cache page
	 * for this entry; if not, back out without mapping it.
	 */
	if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
		goto out_page;

	if (ksm_might_need_to_copy(page, vma, address)) {
		/*
		 * KSM may need a private copy.  The original swap cache page
		 * is remembered in swapcache so it can be unlocked and
		 * released once the copy is mapped.
		 */
		swapcache = page;
		page = ksm_does_need_to_copy(page, vma, address);

		if (unlikely(!page)) {
			/* Copy failed: restore page so the unwind frees it once. */
			ret = VM_FAULT_OOM;
			page = swapcache;
			swapcache = NULL;
			goto out_page;
		}
	}

	if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
		ret = VM_FAULT_OOM;
		goto out_page;
	}

	/*
	 * Back out if somebody else already faulted in this pte.
	 */
	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (unlikely(!pte_same(*page_table, orig_pte)))
		goto out_nomap;

	if (unlikely(!PageUptodate(page))) {
		ret = VM_FAULT_SIGBUS;
		goto out_nomap;
	}

	/*
	 * Point of no return: swap the accounting from a swap entry to an
	 * anonymous page and build the new pte.  For a write fault where the
	 * page can be reused, map it writable right away and clear
	 * FAULT_FLAG_WRITE so do_wp_page() is not called below.
	 */
	inc_mm_counter_fast(mm, MM_ANONPAGES);
	dec_mm_counter_fast(mm, MM_SWAPENTS);
	pte = mk_pte(page, vma->vm_page_prot);
	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
		flags &= ~FAULT_FLAG_WRITE;
		ret |= VM_FAULT_WRITE;
		exclusive = 1;
	}
	flush_icache_page(vma, page);
	set_pte_at(mm, address, page_table, pte);
	do_page_add_anon_rmap(page, vma, address, exclusive);
	/* It's better to call commit-charge after rmap is established */
	mem_cgroup_commit_charge_swapin(page, ptr);

	swap_free(entry);
	/* Try to free the swap slot when swap is full or the page is mlocked. */
	if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
		try_to_free_swap(page);
	unlock_page(page);
	if (swapcache) {
		/*
		 * A KSM copy was mapped: the original swap cache page was
		 * kept locked until now and can be let go.
		 */
		unlock_page(swapcache);
		page_cache_release(swapcache);
	}

	if (flags & FAULT_FLAG_WRITE) {
		/* do_wp_page() drops the pte lock itself, hence "goto out". */
		ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
		if (ret & VM_FAULT_ERROR)
			ret &= VM_FAULT_ERROR;
		goto out;
	}

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, address, page_table);
unlock:
	pte_unmap_unlock(page_table, ptl);
out:
	return ret;
out_nomap:
	mem_cgroup_cancel_charge_swapin(ptr);
	pte_unmap_unlock(page_table, ptl);
out_page:
	unlock_page(page);
out_release:
	page_cache_release(page);
	if (swapcache) {
		unlock_page(swapcache);
		page_cache_release(swapcache);
	}
	return ret;
}
3089
3090
3091
3092
3093
3094
3095static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
3096{
3097 address &= PAGE_MASK;
3098 if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) {
3099 struct vm_area_struct *prev = vma->vm_prev;
3100
3101
3102
3103
3104
3105
3106
3107 if (prev && prev->vm_end == address)
3108 return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
3109
3110 expand_downwards(vma, address - PAGE_SIZE);
3111 }
3112 if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
3113 struct vm_area_struct *next = vma->vm_next;
3114
3115
3116 if (next && next->vm_start == address + PAGE_SIZE)
3117 return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
3118
3119 expand_upwards(vma, address + PAGE_SIZE);
3120 }
3121 return 0;
3122}
3123
3124
3125
3126
3127
3128
/*
 * do_anonymous_page - handle a fault on an empty pte of an anonymous mapping
 * @mm: mm_struct of the faulting address space
 * @vma: vma covering @address
 * @address: faulting user address
 * @page_table: mapped (but not locked) pte for @address
 * @pmd: pmd containing @page_table
 * @flags: FAULT_FLAG_* flags of the fault
 *
 * A read fault maps the zero page as a special pte; a write fault
 * allocates, charges and maps a fresh zeroed page.  @page_table is
 * unmapped on entry; the pte lock is taken and released inside.
 *
 * Returns 0 on success or when the pte was filled in by someone else
 * meanwhile, VM_FAULT_SIGBUS when the stack guard check fails, and
 * VM_FAULT_OOM on allocation or charge failure.
 */
static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		unsigned int flags)
{
	struct page *page;
	spinlock_t *ptl;
	pte_t entry;

	pte_unmap(page_table);

	/* Check if we need to add a guard page to the stack */
	if (check_stack_guard_page(vma, address) < 0)
		return VM_FAULT_SIGBUS;

	/* Use the zero-page for reads */
	if (!(flags & FAULT_FLAG_WRITE)) {
		entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
						vma->vm_page_prot));
		page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
		/* Someone else populated the pte meanwhile: leave it alone. */
		if (!pte_none(*page_table))
			goto unlock;
		goto setpte;
	}

	/* Allocate our own private page. */
	if (unlikely(anon_vma_prepare(vma)))
		goto oom;
	page = alloc_zeroed_user_highpage_movable(vma, address);
	if (!page)
		goto oom;
	__SetPageUptodate(page);

	if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
		goto oom_free_page;

	entry = mk_pte(page, vma->vm_page_prot);
	if (vma->vm_flags & VM_WRITE)
		entry = pte_mkwrite(pte_mkdirty(entry));

	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
	/* Lost the race: drop our page and keep what is already mapped. */
	if (!pte_none(*page_table))
		goto release;

	inc_mm_counter_fast(mm, MM_ANONPAGES);
	page_add_new_anon_rmap(page, vma, address);
setpte:
	set_pte_at(mm, address, page_table, entry);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, address, page_table);
unlock:
	pte_unmap_unlock(page_table, ptl);
	return 0;
release:
	/* Undo the charge and the allocation, then unlock and return 0. */
	mem_cgroup_uncharge_page(page);
	page_cache_release(page);
	goto unlock;
oom_free_page:
	page_cache_release(page);
oom:
	return VM_FAULT_OOM;
}
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3206 unsigned long address, pmd_t *pmd,
3207 pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
3208{
3209 pte_t *page_table;
3210 spinlock_t *ptl;
3211 struct page *page;
3212 struct page *cow_page;
3213 pte_t entry;
3214 int anon = 0;
3215 struct page *dirty_page = NULL;
3216 struct vm_fault vmf;
3217 int ret;
3218 int page_mkwrite = 0;
3219
3220
3221
3222
3223
3224 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
3225
3226 if (unlikely(anon_vma_prepare(vma)))
3227 return VM_FAULT_OOM;
3228
3229 cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
3230 if (!cow_page)
3231 return VM_FAULT_OOM;
3232
3233 if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) {
3234 page_cache_release(cow_page);
3235 return VM_FAULT_OOM;
3236 }
3237 } else
3238 cow_page = NULL;
3239
3240 vmf.virtual_address = (void __user *)(address & PAGE_MASK);
3241 vmf.pgoff = pgoff;
3242 vmf.flags = flags;
3243 vmf.page = NULL;
3244
3245 ret = vma->vm_ops->fault(vma, &vmf);
3246 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
3247 VM_FAULT_RETRY)))
3248 goto uncharge_out;
3249
3250 if (unlikely(PageHWPoison(vmf.page))) {
3251 if (ret & VM_FAULT_LOCKED)
3252 unlock_page(vmf.page);
3253 ret = VM_FAULT_HWPOISON;
3254 goto uncharge_out;
3255 }
3256
3257
3258
3259
3260
3261 if (unlikely(!(ret & VM_FAULT_LOCKED)))
3262 lock_page(vmf.page);
3263 else
3264 VM_BUG_ON(!PageLocked(vmf.page));
3265
3266
3267
3268
3269 page = vmf.page;
3270 if (flags & FAULT_FLAG_WRITE) {
3271 if (!(vma->vm_flags & VM_SHARED)) {
3272 page = cow_page;
3273 anon = 1;
3274 copy_user_highpage(page, vmf.page, address, vma);
3275 __SetPageUptodate(page);
3276 } else {
3277
3278
3279
3280
3281
3282 if (vma->vm_ops->page_mkwrite) {
3283 int tmp;
3284
3285 unlock_page(page);
3286 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
3287 tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
3288 if (unlikely(tmp &
3289 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
3290 ret = tmp;
3291 goto unwritable_page;
3292 }
3293 if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
3294 lock_page(page);
3295 if (!page->mapping) {
3296 ret = 0;
3297 unlock_page(page);
3298 goto unwritable_page;
3299 }
3300 } else
3301 VM_BUG_ON(!PageLocked(page));
3302 page_mkwrite = 1;
3303 }
3304 }
3305
3306 }
3307
3308 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321 if (likely(pte_same(*page_table, orig_pte))) {
3322 flush_icache_page(vma, page);
3323 entry = mk_pte(page, vma->vm_page_prot);
3324 if (flags & FAULT_FLAG_WRITE)
3325 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
3326 if (anon) {
3327 inc_mm_counter_fast(mm, MM_ANONPAGES);
3328 page_add_new_anon_rmap(page, vma, address);
3329 } else {
3330 inc_mm_counter_fast(mm, MM_FILEPAGES);
3331 page_add_file_rmap(page);
3332 if (flags & FAULT_FLAG_WRITE) {
3333 dirty_page = page;
3334 get_page(dirty_page);
3335 }
3336 }
3337 set_pte_at(mm, address, page_table, entry);
3338
3339
3340 update_mmu_cache(vma, address, page_table);
3341 } else {
3342 if (cow_page)
3343 mem_cgroup_uncharge_page(cow_page);
3344 if (anon)
3345 page_cache_release(page);
3346 else
3347 anon = 1;
3348 }
3349
3350 pte_unmap_unlock(page_table, ptl);
3351
3352 if (dirty_page) {
3353 struct address_space *mapping = page->mapping;
3354 int dirtied = 0;
3355
3356 if (set_page_dirty(dirty_page))
3357 dirtied = 1;
3358 unlock_page(dirty_page);
3359 put_page(dirty_page);
3360 if ((dirtied || page_mkwrite) && mapping) {
3361
3362
3363
3364
3365 balance_dirty_pages_ratelimited(mapping);
3366 }
3367
3368
3369 if (vma->vm_file && !page_mkwrite)
3370 file_update_time(vma->vm_file);
3371 } else {
3372 unlock_page(vmf.page);
3373 if (anon)
3374 page_cache_release(vmf.page);
3375 }
3376
3377 return ret;
3378
3379unwritable_page:
3380 page_cache_release(page);
3381 return ret;
3382uncharge_out:
3383
3384 if (cow_page) {
3385 mem_cgroup_uncharge_page(cow_page);
3386 page_cache_release(cow_page);
3387 }
3388 return ret;
3389}
3390
3391static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3392 unsigned long address, pte_t *page_table, pmd_t *pmd,
3393 unsigned int flags, pte_t orig_pte)
3394{
3395 pgoff_t pgoff = (((address & PAGE_MASK)
3396 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
3397
3398 pte_unmap(page_table);
3399 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3400}
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3412 unsigned long address, pte_t *page_table, pmd_t *pmd,
3413 unsigned int flags, pte_t orig_pte)
3414{
3415 pgoff_t pgoff;
3416
3417 flags |= FAULT_FLAG_NONLINEAR;
3418
3419 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
3420 return 0;
3421
3422 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
3423
3424
3425
3426 print_bad_pte(vma, address, orig_pte, NULL);
3427 return VM_FAULT_SIGBUS;
3428 }
3429
3430 pgoff = pte_to_pgoff(orig_pte);
3431 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3432}
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447int handle_pte_fault(struct mm_struct *mm,
3448 struct vm_area_struct *vma, unsigned long address,
3449 pte_t *pte, pmd_t *pmd, unsigned int flags)
3450{
3451 pte_t entry;
3452 spinlock_t *ptl;
3453
3454 entry = *pte;
3455 if (!pte_present(entry)) {
3456 if (pte_none(entry)) {
3457 if (vma->vm_ops) {
3458 if (likely(vma->vm_ops->fault))
3459 return do_linear_fault(mm, vma, address,
3460 pte, pmd, flags, entry);
3461 }
3462 return do_anonymous_page(mm, vma, address,
3463 pte, pmd, flags);
3464 }
3465 if (pte_file(entry))
3466 return do_nonlinear_fault(mm, vma, address,
3467 pte, pmd, flags, entry);
3468 return do_swap_page(mm, vma, address,
3469 pte, pmd, flags, entry);
3470 }
3471
3472 ptl = pte_lockptr(mm, pmd);
3473 spin_lock(ptl);
3474 if (unlikely(!pte_same(*pte, entry)))
3475 goto unlock;
3476 if (flags & FAULT_FLAG_WRITE) {
3477 if (!pte_write(entry))
3478 return do_wp_page(mm, vma, address,
3479 pte, pmd, ptl, entry);
3480 entry = pte_mkdirty(entry);
3481 }
3482 entry = pte_mkyoung(entry);
3483 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
3484 update_mmu_cache(vma, address, pte);
3485 } else {
3486
3487
3488
3489
3490
3491
3492 if (flags & FAULT_FLAG_WRITE)
3493 flush_tlb_fix_spurious_fault(vma, address);
3494 }
3495unlock:
3496 pte_unmap_unlock(pte, ptl);
3497 return 0;
3498}
3499
3500
3501
3502
3503int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3504 unsigned long address, unsigned int flags)
3505{
3506 pgd_t *pgd;
3507 pud_t *pud;
3508 pmd_t *pmd;
3509 pte_t *pte;
3510
3511 __set_current_state(TASK_RUNNING);
3512
3513 count_vm_event(PGFAULT);
3514 mem_cgroup_count_vm_event(mm, PGFAULT);
3515
3516
3517 check_sync_rss_stat(current);
3518
3519 if (unlikely(is_vm_hugetlb_page(vma)))
3520 return hugetlb_fault(mm, vma, address, flags);
3521
3522retry:
3523 pgd = pgd_offset(mm, address);
3524 pud = pud_alloc(mm, pgd, address);
3525 if (!pud)
3526 return VM_FAULT_OOM;
3527 pmd = pmd_alloc(mm, pud, address);
3528 if (!pmd)
3529 return VM_FAULT_OOM;
3530 if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
3531 if (!vma->vm_ops)
3532 return do_huge_pmd_anonymous_page(mm, vma, address,
3533 pmd, flags);
3534 } else {
3535 pmd_t orig_pmd = *pmd;
3536 int ret;
3537
3538 barrier();
3539 if (pmd_trans_huge(orig_pmd)) {
3540 if (flags & FAULT_FLAG_WRITE &&
3541 !pmd_write(orig_pmd) &&
3542 !pmd_trans_splitting(orig_pmd)) {
3543 ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
3544 orig_pmd);
3545
3546
3547
3548
3549
3550 if (unlikely(ret & VM_FAULT_OOM))
3551 goto retry;
3552 return ret;
3553 }
3554 return 0;
3555 }
3556 }
3557
3558
3559
3560
3561
3562
3563 if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address))
3564 return VM_FAULT_OOM;
3565
3566 if (unlikely(pmd_trans_huge(*pmd)))
3567 return 0;
3568
3569
3570
3571
3572
3573
3574 pte = pte_offset_map(pmd, address);
3575
3576 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
3577}
3578
3579#ifndef __PAGETABLE_PUD_FOLDED
3580
3581
3582
3583
3584int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
3585{
3586 pud_t *new = pud_alloc_one(mm, address);
3587 if (!new)
3588 return -ENOMEM;
3589
3590 smp_wmb();
3591
3592 spin_lock(&mm->page_table_lock);
3593 if (pgd_present(*pgd))
3594 pud_free(mm, new);
3595 else
3596 pgd_populate(mm, pgd, new);
3597 spin_unlock(&mm->page_table_lock);
3598 return 0;
3599}
3600#endif
3601
3602#ifndef __PAGETABLE_PMD_FOLDED
3603
3604
3605
3606
3607int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
3608{
3609 pmd_t *new = pmd_alloc_one(mm, address);
3610 if (!new)
3611 return -ENOMEM;
3612
3613 smp_wmb();
3614
3615 spin_lock(&mm->page_table_lock);
3616#ifndef __ARCH_HAS_4LEVEL_HACK
3617 if (pud_present(*pud))
3618 pmd_free(mm, new);
3619 else
3620 pud_populate(mm, pud, new);
3621#else
3622 if (pgd_present(*pud))
3623 pmd_free(mm, new);
3624 else
3625 pgd_populate(mm, pud, new);
3626#endif
3627 spin_unlock(&mm->page_table_lock);
3628 return 0;
3629}
3630#endif
3631
3632int make_pages_present(unsigned long addr, unsigned long end)
3633{
3634 int ret, len, write;
3635 struct vm_area_struct * vma;
3636
3637 vma = find_vma(current->mm, addr);
3638 if (!vma)
3639 return -ENOMEM;
3640
3641
3642
3643
3644
3645 write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
3646 BUG_ON(addr >= end);
3647 BUG_ON(end > vma->vm_end);
3648 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
3649 ret = get_user_pages(current, current->mm, addr,
3650 len, write, 0, NULL, NULL);
3651 if (ret < 0)
3652 return ret;
3653 return ret == len ? 0 : -EFAULT;
3654}
3655
3656#if !defined(__HAVE_ARCH_GATE_AREA)
3657
3658#if defined(AT_SYSINFO_EHDR)
3659static struct vm_area_struct gate_vma;
3660
3661static int __init gate_vma_init(void)
3662{
3663 gate_vma.vm_mm = NULL;
3664 gate_vma.vm_start = FIXADDR_USER_START;
3665 gate_vma.vm_end = FIXADDR_USER_END;
3666 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
3667 gate_vma.vm_page_prot = __P101;
3668
3669 return 0;
3670}
3671__initcall(gate_vma_init);
3672#endif
3673
/*
 * Return the gate vma, or NULL when the build has no AT_SYSINFO_EHDR and
 * therefore no gate area.  @mm is not used by this generic version.
 */
struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
#ifndef AT_SYSINFO_EHDR
	return NULL;
#else
	return &gate_vma;
#endif
}
3682
/*
 * Tell whether @addr lies inside the gate area
 * [FIXADDR_USER_START, FIXADDR_USER_END).
 *
 * Returns 1 if it does, 0 otherwise; always 0 when the build has no
 * AT_SYSINFO_EHDR.
 */
int in_gate_area_no_mm(unsigned long addr)
{
	int inside = 0;

#ifdef AT_SYSINFO_EHDR
	if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END))
		inside = 1;
#endif
	return inside;
}
3691
3692#endif
3693
3694static int __follow_pte(struct mm_struct *mm, unsigned long address,
3695 pte_t **ptepp, spinlock_t **ptlp)
3696{
3697 pgd_t *pgd;
3698 pud_t *pud;
3699 pmd_t *pmd;
3700 pte_t *ptep;
3701
3702 pgd = pgd_offset(mm, address);
3703 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
3704 goto out;
3705
3706 pud = pud_offset(pgd, address);
3707 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
3708 goto out;
3709
3710 pmd = pmd_offset(pud, address);
3711 VM_BUG_ON(pmd_trans_huge(*pmd));
3712 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
3713 goto out;
3714
3715
3716 if (pmd_huge(*pmd))
3717 goto out;
3718
3719 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
3720 if (!ptep)
3721 goto out;
3722 if (!pte_present(*ptep))
3723 goto unlock;
3724 *ptepp = ptep;
3725 return 0;
3726unlock:
3727 pte_unmap_unlock(ptep, *ptlp);
3728out:
3729 return -EINVAL;
3730}
3731
3732static inline int follow_pte(struct mm_struct *mm, unsigned long address,
3733 pte_t **ptepp, spinlock_t **ptlp)
3734{
3735 int res;
3736
3737
3738 (void) __cond_lock(*ptlp,
3739 !(res = __follow_pte(mm, address, ptepp, ptlp)));
3740 return res;
3741}
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753int follow_pfn(struct vm_area_struct *vma, unsigned long address,
3754 unsigned long *pfn)
3755{
3756 int ret = -EINVAL;
3757 spinlock_t *ptl;
3758 pte_t *ptep;
3759
3760 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3761 return ret;
3762
3763 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
3764 if (ret)
3765 return ret;
3766 *pfn = pte_pfn(*ptep);
3767 pte_unmap_unlock(ptep, ptl);
3768 return 0;
3769}
3770EXPORT_SYMBOL(follow_pfn);
3771
3772#ifdef CONFIG_HAVE_IOREMAP_PROT
3773int follow_phys(struct vm_area_struct *vma,
3774 unsigned long address, unsigned int flags,
3775 unsigned long *prot, resource_size_t *phys)
3776{
3777 int ret = -EINVAL;
3778 pte_t *ptep, pte;
3779 spinlock_t *ptl;
3780
3781 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3782 goto out;
3783
3784 if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
3785 goto out;
3786 pte = *ptep;
3787
3788 if ((flags & FOLL_WRITE) && !pte_write(pte))
3789 goto unlock;
3790
3791 *prot = pgprot_val(pte_pgprot(pte));
3792 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
3793
3794 ret = 0;
3795unlock:
3796 pte_unmap_unlock(ptep, ptl);
3797out:
3798 return ret;
3799}
3800
3801int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
3802 void *buf, int len, int write)
3803{
3804 resource_size_t phys_addr;
3805 unsigned long prot = 0;
3806 void __iomem *maddr;
3807 int offset = addr & (PAGE_SIZE-1);
3808
3809 if (follow_phys(vma, addr, write, &prot, &phys_addr))
3810 return -EINVAL;
3811
3812 maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
3813 if (write)
3814 memcpy_toio(maddr + offset, buf, len);
3815 else
3816 memcpy_fromio(buf, maddr + offset, len);
3817 iounmap(maddr);
3818
3819 return len;
3820}
3821#endif
3822
3823
3824
3825
3826
3827static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
3828 unsigned long addr, void *buf, int len, int write)
3829{
3830 struct vm_area_struct *vma;
3831 void *old_buf = buf;
3832
3833 down_read(&mm->mmap_sem);
3834
3835 while (len) {
3836 int bytes, ret, offset;
3837 void *maddr;
3838 struct page *page = NULL;
3839
3840 ret = get_user_pages(tsk, mm, addr, 1,
3841 write, 1, &page, &vma);
3842 if (ret <= 0) {
3843
3844
3845
3846
3847#ifdef CONFIG_HAVE_IOREMAP_PROT
3848 vma = find_vma(mm, addr);
3849 if (!vma || vma->vm_start > addr)
3850 break;
3851 if (vma->vm_ops && vma->vm_ops->access)
3852 ret = vma->vm_ops->access(vma, addr, buf,
3853 len, write);
3854 if (ret <= 0)
3855#endif
3856 break;
3857 bytes = ret;
3858 } else {
3859 bytes = len;
3860 offset = addr & (PAGE_SIZE-1);
3861 if (bytes > PAGE_SIZE-offset)
3862 bytes = PAGE_SIZE-offset;
3863
3864 maddr = kmap(page);
3865 if (write) {
3866 copy_to_user_page(vma, page, addr,
3867 maddr + offset, buf, bytes);
3868 set_page_dirty_lock(page);
3869 } else {
3870 copy_from_user_page(vma, page, addr,
3871 buf, maddr + offset, bytes);
3872 }
3873 kunmap(page);
3874 page_cache_release(page);
3875 }
3876 len -= bytes;
3877 buf += bytes;
3878 addr += bytes;
3879 }
3880 up_read(&mm->mmap_sem);
3881
3882 return buf - old_buf;
3883}
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895int access_remote_vm(struct mm_struct *mm, unsigned long addr,
3896 void *buf, int len, int write)
3897{
3898 return __access_remote_vm(NULL, mm, addr, buf, len, write);
3899}
3900
3901
3902
3903
3904
3905
/*
 * Access the address space of task @tsk.
 *
 * Takes a reference on the task's mm for the duration of the transfer and
 * drops it afterwards.  Returns the number of bytes transferred, or 0 if
 * get_task_mm() yields no mm for the task.
 */
int access_process_vm(struct task_struct *tsk, unsigned long addr,
		void *buf, int len, int write)
{
	struct mm_struct *mm = get_task_mm(tsk);
	int copied = 0;

	if (mm) {
		copied = __access_remote_vm(tsk, mm, addr, buf, len, write);
		mmput(mm);
	}

	return copied;
}
3921
3922
3923
3924
3925void print_vma_addr(char *prefix, unsigned long ip)
3926{
3927 struct mm_struct *mm = current->mm;
3928 struct vm_area_struct *vma;
3929
3930
3931
3932
3933
3934 if (preempt_count())
3935 return;
3936
3937 down_read(&mm->mmap_sem);
3938 vma = find_vma(mm, ip);
3939 if (vma && vma->vm_file) {
3940 struct file *f = vma->vm_file;
3941 char *buf = (char *)__get_free_page(GFP_KERNEL);
3942 if (buf) {
3943 char *p, *s;
3944
3945 p = d_path(&f->f_path, buf, PAGE_SIZE);
3946 if (IS_ERR(p))
3947 p = "?";
3948 s = strrchr(p, '/');
3949 if (s)
3950 p = s+1;
3951 printk("%s%s[%lx+%lx]", prefix, p,
3952 vma->vm_start,
3953 vma->vm_end - vma->vm_start);
3954 free_page((unsigned long)buf);
3955 }
3956 }
3957 up_read(&mm->mmap_sem);
3958}
3959
3960#ifdef CONFIG_PROVE_LOCKING
3961void might_fault(void)
3962{
3963
3964
3965
3966
3967
3968
3969 if (segment_eq(get_fs(), KERNEL_DS))
3970 return;
3971
3972 might_sleep();
3973
3974
3975
3976
3977
3978 if (!in_atomic() && current->mm)
3979 might_lock_read(¤t->mm->mmap_sem);
3980}
3981EXPORT_SYMBOL(might_fault);
3982#endif
3983
3984#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
3985static void clear_gigantic_page(struct page *page,
3986 unsigned long addr,
3987 unsigned int pages_per_huge_page)
3988{
3989 int i;
3990 struct page *p = page;
3991
3992 might_sleep();
3993 for (i = 0; i < pages_per_huge_page;
3994 i++, p = mem_map_next(p, page, i)) {
3995 cond_resched();
3996 clear_user_highpage(p, addr + i * PAGE_SIZE);
3997 }
3998}
3999void clear_huge_page(struct page *page,
4000 unsigned long addr, unsigned int pages_per_huge_page)
4001{
4002 int i;
4003
4004 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
4005 clear_gigantic_page(page, addr, pages_per_huge_page);
4006 return;
4007 }
4008
4009 might_sleep();
4010 for (i = 0; i < pages_per_huge_page; i++) {
4011 cond_resched();
4012 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
4013 }
4014}
4015
4016static void copy_user_gigantic_page(struct page *dst, struct page *src,
4017 unsigned long addr,
4018 struct vm_area_struct *vma,
4019 unsigned int pages_per_huge_page)
4020{
4021 int i;
4022 struct page *dst_base = dst;
4023 struct page *src_base = src;
4024
4025 for (i = 0; i < pages_per_huge_page; ) {
4026 cond_resched();
4027 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
4028
4029 i++;
4030 dst = mem_map_next(dst, dst_base, i);
4031 src = mem_map_next(src, src_base, i);
4032 }
4033}
4034
4035void copy_user_huge_page(struct page *dst, struct page *src,
4036 unsigned long addr, struct vm_area_struct *vma,
4037 unsigned int pages_per_huge_page)
4038{
4039 int i;
4040
4041 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
4042 copy_user_gigantic_page(dst, src, addr, vma,
4043 pages_per_huge_page);
4044 return;
4045 }
4046
4047 might_sleep();
4048 for (i = 0; i < pages_per_huge_page; i++) {
4049 cond_resched();
4050 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
4051 }
4052}
4053#endif
4054