1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41#include <linux/kernel_stat.h>
42#include <linux/mm.h>
43#include <linux/hugetlb.h>
44#include <linux/mman.h>
45#include <linux/swap.h>
46#include <linux/highmem.h>
47#include <linux/pagemap.h>
48#include <linux/ksm.h>
49#include <linux/rmap.h>
50#include <linux/export.h>
51#include <linux/delayacct.h>
52#include <linux/init.h>
53#include <linux/writeback.h>
54#include <linux/memcontrol.h>
55#include <linux/mmu_notifier.h>
56#include <linux/kallsyms.h>
57#include <linux/swapops.h>
58#include <linux/elf.h>
59#include <linux/gfp.h>
60
61#include <asm/io.h>
62#include <asm/pgalloc.h>
63#include <asm/uaccess.h>
64#include <asm/tlb.h>
65#include <asm/tlbflush.h>
66#include <asm/pgtable.h>
67
68#include "internal.h"
69
#ifndef CONFIG_NEED_MULTIPLE_NODES

/*
 * Global page array pointer and its index limit.  Both are defined and
 * exported from this file only when CONFIG_NEED_MULTIPLE_NODES is unset.
 */
struct page *mem_map;
unsigned long max_mapnr;

EXPORT_SYMBOL(mem_map);
EXPORT_SYMBOL(max_mapnr);
#endif
78
/* Exported counter; presumably the number of physical pages - TODO confirm. */
unsigned long num_physpages;
EXPORT_SYMBOL(num_physpages);

/*
 * Exported address marker.
 * NOTE(review): presumably the upper bound of directly mapped memory -
 * confirm against the architecture setup code that assigns it.
 */
void *high_memory;
EXPORT_SYMBOL(high_memory);
91
92
93
94
95
96
97
98int randomize_va_space __read_mostly =
99#ifdef CONFIG_COMPAT_BRK
100 1;
101#else
102 2;
103#endif
104
105static int __init disable_randmaps(char *s)
106{
107 randomize_va_space = 0;
108 return 1;
109}
110__setup("norandmaps", disable_randmaps);
111
112unsigned long zero_pfn __read_mostly;
113unsigned long highest_memmap_pfn __read_mostly;
114
115
116
117
118static int __init init_zero_pfn(void)
119{
120 zero_pfn = page_to_pfn(ZERO_PAGE(0));
121 return 0;
122}
123core_initcall(init_zero_pfn);
124
125
126#if defined(SPLIT_RSS_COUNTING)
127
128void sync_mm_rss(struct mm_struct *mm)
129{
130 int i;
131
132 for (i = 0; i < NR_MM_COUNTERS; i++) {
133 if (current->rss_stat.count[i]) {
134 add_mm_counter(mm, i, current->rss_stat.count[i]);
135 current->rss_stat.count[i] = 0;
136 }
137 }
138 current->rss_stat.events = 0;
139}
140
141static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
142{
143 struct task_struct *task = current;
144
145 if (likely(task->mm == mm))
146 task->rss_stat.count[member] += val;
147 else
148 add_mm_counter(mm, member, val);
149}
150#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
151#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
152
153
154#define TASK_RSS_EVENTS_THRESH (64)
155static void check_sync_rss_stat(struct task_struct *task)
156{
157 if (unlikely(task != current))
158 return;
159 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
160 sync_mm_rss(task->mm);
161}
162#else
163
/*
 * Variant used when SPLIT_RSS_COUNTING is not defined: the "fast" counter
 * helpers map straight onto inc_mm_counter()/dec_mm_counter().
 */
#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)

/* No task-local cache in this configuration, so there is nothing to sync. */
static void check_sync_rss_stat(struct task_struct *task)
{
}
170
171#endif
172
173#ifdef HAVE_GENERIC_MMU_GATHER
174
175static int tlb_next_batch(struct mmu_gather *tlb)
176{
177 struct mmu_gather_batch *batch;
178
179 batch = tlb->active;
180 if (batch->next) {
181 tlb->active = batch->next;
182 return 1;
183 }
184
185 if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
186 return 0;
187
188 batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
189 if (!batch)
190 return 0;
191
192 tlb->batch_count++;
193 batch->next = NULL;
194 batch->nr = 0;
195 batch->max = MAX_GATHER_BATCH;
196
197 tlb->active->next = batch;
198 tlb->active = batch;
199
200 return 1;
201}
202
203
204
205
206
207
208void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
209{
210 tlb->mm = mm;
211
212 tlb->fullmm = fullmm;
213 tlb->start = -1UL;
214 tlb->end = 0;
215 tlb->need_flush = 0;
216 tlb->fast_mode = (num_possible_cpus() == 1);
217 tlb->local.next = NULL;
218 tlb->local.nr = 0;
219 tlb->local.max = ARRAY_SIZE(tlb->__pages);
220 tlb->active = &tlb->local;
221 tlb->batch_count = 0;
222
223#ifdef CONFIG_HAVE_RCU_TABLE_FREE
224 tlb->batch = NULL;
225#endif
226}
227
228void tlb_flush_mmu(struct mmu_gather *tlb)
229{
230 struct mmu_gather_batch *batch;
231
232 if (!tlb->need_flush)
233 return;
234 tlb->need_flush = 0;
235 tlb_flush(tlb);
236#ifdef CONFIG_HAVE_RCU_TABLE_FREE
237 tlb_table_flush(tlb);
238#endif
239
240 if (tlb_fast_mode(tlb))
241 return;
242
243 for (batch = &tlb->local; batch; batch = batch->next) {
244 free_pages_and_swap_cache(batch->pages, batch->nr);
245 batch->nr = 0;
246 }
247 tlb->active = &tlb->local;
248}
249
250
251
252
253
254void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
255{
256 struct mmu_gather_batch *batch, *next;
257
258 tlb->start = start;
259 tlb->end = end;
260 tlb_flush_mmu(tlb);
261
262
263 check_pgt_cache();
264
265 for (batch = tlb->local.next; batch; batch = next) {
266 next = batch->next;
267 free_pages((unsigned long)batch, 0);
268 }
269 tlb->local.next = NULL;
270}
271
272
273
274
275
276
277
278int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
279{
280 struct mmu_gather_batch *batch;
281
282 VM_BUG_ON(!tlb->need_flush);
283
284 if (tlb_fast_mode(tlb)) {
285 free_page_and_swap_cache(page);
286 return 1;
287 }
288
289 batch = tlb->active;
290 batch->pages[batch->nr++] = page;
291 if (batch->nr == batch->max) {
292 if (!tlb_next_batch(tlb))
293 return 0;
294 batch = tlb->active;
295 }
296 VM_BUG_ON(batch->nr > batch->max);
297
298 return batch->max - batch->nr;
299}
300
301#endif
302
303#ifdef CONFIG_HAVE_RCU_TABLE_FREE
304
305
306
307
308
/*
 * Intentionally empty cross-CPU callback; tlb_remove_table_one() passes it
 * to smp_call_function() and relies only on the call having been made.
 */
static void tlb_remove_table_smp_sync(void *arg)
{
}
313
314static void tlb_remove_table_one(void *table)
315{
316
317
318
319
320
321
322
323 smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
324 __tlb_remove_table(table);
325}
326
327static void tlb_remove_table_rcu(struct rcu_head *head)
328{
329 struct mmu_table_batch *batch;
330 int i;
331
332 batch = container_of(head, struct mmu_table_batch, rcu);
333
334 for (i = 0; i < batch->nr; i++)
335 __tlb_remove_table(batch->tables[i]);
336
337 free_page((unsigned long)batch);
338}
339
340void tlb_table_flush(struct mmu_gather *tlb)
341{
342 struct mmu_table_batch **batch = &tlb->batch;
343
344 if (*batch) {
345 call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
346 *batch = NULL;
347 }
348}
349
350void tlb_remove_table(struct mmu_gather *tlb, void *table)
351{
352 struct mmu_table_batch **batch = &tlb->batch;
353
354 tlb->need_flush = 1;
355
356
357
358
359
360 if (atomic_read(&tlb->mm->mm_users) < 2) {
361 __tlb_remove_table(table);
362 return;
363 }
364
365 if (*batch == NULL) {
366 *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
367 if (*batch == NULL) {
368 tlb_remove_table_one(table);
369 return;
370 }
371 (*batch)->nr = 0;
372 }
373 (*batch)->tables[(*batch)->nr++] = table;
374 if ((*batch)->nr == MAX_TABLE_BATCH)
375 tlb_table_flush(tlb);
376}
377
378#endif
379
380
381
382
383
384
385
386void pgd_clear_bad(pgd_t *pgd)
387{
388 pgd_ERROR(*pgd);
389 pgd_clear(pgd);
390}
391
392void pud_clear_bad(pud_t *pud)
393{
394 pud_ERROR(*pud);
395 pud_clear(pud);
396}
397
398void pmd_clear_bad(pmd_t *pmd)
399{
400 pmd_ERROR(*pmd);
401 pmd_clear(pmd);
402}
403
404
405
406
407
408static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
409 unsigned long addr)
410{
411 pgtable_t token = pmd_pgtable(*pmd);
412 pmd_clear(pmd);
413 pte_free_tlb(tlb, token, addr);
414 tlb->mm->nr_ptes--;
415}
416
417static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
418 unsigned long addr, unsigned long end,
419 unsigned long floor, unsigned long ceiling)
420{
421 pmd_t *pmd;
422 unsigned long next;
423 unsigned long start;
424
425 start = addr;
426 pmd = pmd_offset(pud, addr);
427 do {
428 next = pmd_addr_end(addr, end);
429 if (pmd_none_or_clear_bad(pmd))
430 continue;
431 free_pte_range(tlb, pmd, addr);
432 } while (pmd++, addr = next, addr != end);
433
434 start &= PUD_MASK;
435 if (start < floor)
436 return;
437 if (ceiling) {
438 ceiling &= PUD_MASK;
439 if (!ceiling)
440 return;
441 }
442 if (end - 1 > ceiling - 1)
443 return;
444
445 pmd = pmd_offset(pud, start);
446 pud_clear(pud);
447 pmd_free_tlb(tlb, pmd, start);
448}
449
450static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
451 unsigned long addr, unsigned long end,
452 unsigned long floor, unsigned long ceiling)
453{
454 pud_t *pud;
455 unsigned long next;
456 unsigned long start;
457
458 start = addr;
459 pud = pud_offset(pgd, addr);
460 do {
461 next = pud_addr_end(addr, end);
462 if (pud_none_or_clear_bad(pud))
463 continue;
464 free_pmd_range(tlb, pud, addr, next, floor, ceiling);
465 } while (pud++, addr = next, addr != end);
466
467 start &= PGDIR_MASK;
468 if (start < floor)
469 return;
470 if (ceiling) {
471 ceiling &= PGDIR_MASK;
472 if (!ceiling)
473 return;
474 }
475 if (end - 1 > ceiling - 1)
476 return;
477
478 pud = pud_offset(pgd, start);
479 pgd_clear(pgd);
480 pud_free_tlb(tlb, pud, start);
481}
482
483
484
485
486
487
488void free_pgd_range(struct mmu_gather *tlb,
489 unsigned long addr, unsigned long end,
490 unsigned long floor, unsigned long ceiling)
491{
492 pgd_t *pgd;
493 unsigned long next;
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521 addr &= PMD_MASK;
522 if (addr < floor) {
523 addr += PMD_SIZE;
524 if (!addr)
525 return;
526 }
527 if (ceiling) {
528 ceiling &= PMD_MASK;
529 if (!ceiling)
530 return;
531 }
532 if (end - 1 > ceiling - 1)
533 end -= PMD_SIZE;
534 if (addr > end - 1)
535 return;
536
537 pgd = pgd_offset(tlb->mm, addr);
538 do {
539 next = pgd_addr_end(addr, end);
540 if (pgd_none_or_clear_bad(pgd))
541 continue;
542 free_pud_range(tlb, pgd, addr, next, floor, ceiling);
543 } while (pgd++, addr = next, addr != end);
544}
545
546void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
547 unsigned long floor, unsigned long ceiling)
548{
549 while (vma) {
550 struct vm_area_struct *next = vma->vm_next;
551 unsigned long addr = vma->vm_start;
552
553
554
555
556
557 unlink_anon_vmas(vma);
558 unlink_file_vma(vma);
559
560 if (is_vm_hugetlb_page(vma)) {
561 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
562 floor, next? next->vm_start: ceiling);
563 } else {
564
565
566
567 while (next && next->vm_start <= vma->vm_end + PMD_SIZE
568 && !is_vm_hugetlb_page(next)) {
569 vma = next;
570 next = vma->vm_next;
571 unlink_anon_vmas(vma);
572 unlink_file_vma(vma);
573 }
574 free_pgd_range(tlb, addr, vma->vm_end,
575 floor, next? next->vm_start: ceiling);
576 }
577 vma = next;
578 }
579}
580
581int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
582 pmd_t *pmd, unsigned long address)
583{
584 pgtable_t new = pte_alloc_one(mm, address);
585 int wait_split_huge_page;
586 if (!new)
587 return -ENOMEM;
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602 smp_wmb();
603
604 spin_lock(&mm->page_table_lock);
605 wait_split_huge_page = 0;
606 if (likely(pmd_none(*pmd))) {
607 mm->nr_ptes++;
608 pmd_populate(mm, pmd, new);
609 new = NULL;
610 } else if (unlikely(pmd_trans_splitting(*pmd)))
611 wait_split_huge_page = 1;
612 spin_unlock(&mm->page_table_lock);
613 if (new)
614 pte_free(mm, new);
615 if (wait_split_huge_page)
616 wait_split_huge_page(vma->anon_vma, pmd);
617 return 0;
618}
619
620int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
621{
622 pte_t *new = pte_alloc_one_kernel(&init_mm, address);
623 if (!new)
624 return -ENOMEM;
625
626 smp_wmb();
627
628 spin_lock(&init_mm.page_table_lock);
629 if (likely(pmd_none(*pmd))) {
630 pmd_populate_kernel(&init_mm, pmd, new);
631 new = NULL;
632 } else
633 VM_BUG_ON(pmd_trans_splitting(*pmd));
634 spin_unlock(&init_mm.page_table_lock);
635 if (new)
636 pte_free_kernel(&init_mm, new);
637 return 0;
638}
639
640static inline void init_rss_vec(int *rss)
641{
642 memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
643}
644
645static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
646{
647 int i;
648
649 if (current->mm == mm)
650 sync_mm_rss(mm);
651 for (i = 0; i < NR_MM_COUNTERS; i++)
652 if (rss[i])
653 add_mm_counter(mm, i, rss[i]);
654}
655
656
657
658
659
660
661
662
663static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
664 pte_t pte, struct page *page)
665{
666 pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
667 pud_t *pud = pud_offset(pgd, addr);
668 pmd_t *pmd = pmd_offset(pud, addr);
669 struct address_space *mapping;
670 pgoff_t index;
671 static unsigned long resume;
672 static unsigned long nr_shown;
673 static unsigned long nr_unshown;
674
675
676
677
678
679 if (nr_shown == 60) {
680 if (time_before(jiffies, resume)) {
681 nr_unshown++;
682 return;
683 }
684 if (nr_unshown) {
685 printk(KERN_ALERT
686 "BUG: Bad page map: %lu messages suppressed\n",
687 nr_unshown);
688 nr_unshown = 0;
689 }
690 nr_shown = 0;
691 }
692 if (nr_shown++ == 0)
693 resume = jiffies + 60 * HZ;
694
695 mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
696 index = linear_page_index(vma, addr);
697
698 printk(KERN_ALERT
699 "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
700 current->comm,
701 (long long)pte_val(pte), (long long)pmd_val(*pmd));
702 if (page)
703 dump_page(page);
704 printk(KERN_ALERT
705 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
706 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
707
708
709
710 if (vma->vm_ops)
711 print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n",
712 (unsigned long)vma->vm_ops->fault);
713 if (vma->vm_file && vma->vm_file->f_op)
714 print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n",
715 (unsigned long)vma->vm_file->f_op->mmap);
716 dump_stack();
717 add_taint(TAINT_BAD_PAGE);
718}
719
720static inline bool is_cow_mapping(vm_flags_t flags)
721{
722 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
723}
724
725#ifndef is_zero_pfn
726static inline int is_zero_pfn(unsigned long pfn)
727{
728 return pfn == zero_pfn;
729}
730#endif
731
732#ifndef my_zero_pfn
733static inline unsigned long my_zero_pfn(unsigned long addr)
734{
735 return zero_pfn;
736}
737#endif
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781#ifdef __HAVE_ARCH_PTE_SPECIAL
782# define HAVE_PTE_SPECIAL 1
783#else
784# define HAVE_PTE_SPECIAL 0
785#endif
786struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
787 pte_t pte)
788{
789 unsigned long pfn = pte_pfn(pte);
790
791 if (HAVE_PTE_SPECIAL) {
792 if (likely(!pte_special(pte)))
793 goto check_pfn;
794 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
795 return NULL;
796 if (!is_zero_pfn(pfn))
797 print_bad_pte(vma, addr, pte, NULL);
798 return NULL;
799 }
800
801
802
803 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
804 if (vma->vm_flags & VM_MIXEDMAP) {
805 if (!pfn_valid(pfn))
806 return NULL;
807 goto out;
808 } else {
809 unsigned long off;
810 off = (addr - vma->vm_start) >> PAGE_SHIFT;
811 if (pfn == vma->vm_pgoff + off)
812 return NULL;
813 if (!is_cow_mapping(vma->vm_flags))
814 return NULL;
815 }
816 }
817
818 if (is_zero_pfn(pfn))
819 return NULL;
820check_pfn:
821 if (unlikely(pfn > highest_memmap_pfn)) {
822 print_bad_pte(vma, addr, pte, NULL);
823 return NULL;
824 }
825
826
827
828
829
830out:
831 return pfn_to_page(pfn);
832}
833
834
835
836
837
838
839
840static inline unsigned long
841copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
842 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
843 unsigned long addr, int *rss)
844{
845 unsigned long vm_flags = vma->vm_flags;
846 pte_t pte = *src_pte;
847 struct page *page;
848
849
850 if (unlikely(!pte_present(pte))) {
851 if (!pte_file(pte)) {
852 swp_entry_t entry = pte_to_swp_entry(pte);
853
854 if (swap_duplicate(entry) < 0)
855 return entry.val;
856
857
858 if (unlikely(list_empty(&dst_mm->mmlist))) {
859 spin_lock(&mmlist_lock);
860 if (list_empty(&dst_mm->mmlist))
861 list_add(&dst_mm->mmlist,
862 &src_mm->mmlist);
863 spin_unlock(&mmlist_lock);
864 }
865 if (likely(!non_swap_entry(entry)))
866 rss[MM_SWAPENTS]++;
867 else if (is_migration_entry(entry)) {
868 page = migration_entry_to_page(entry);
869
870 if (PageAnon(page))
871 rss[MM_ANONPAGES]++;
872 else
873 rss[MM_FILEPAGES]++;
874
875 if (is_write_migration_entry(entry) &&
876 is_cow_mapping(vm_flags)) {
877
878
879
880
881 make_migration_entry_read(&entry);
882 pte = swp_entry_to_pte(entry);
883 set_pte_at(src_mm, addr, src_pte, pte);
884 }
885 }
886 }
887 goto out_set_pte;
888 }
889
890
891
892
893
894 if (is_cow_mapping(vm_flags)) {
895 ptep_set_wrprotect(src_mm, addr, src_pte);
896 pte = pte_wrprotect(pte);
897 }
898
899
900
901
902
903 if (vm_flags & VM_SHARED)
904 pte = pte_mkclean(pte);
905 pte = pte_mkold(pte);
906
907 page = vm_normal_page(vma, addr, pte);
908 if (page) {
909 get_page(page);
910 page_dup_rmap(page);
911 if (PageAnon(page))
912 rss[MM_ANONPAGES]++;
913 else
914 rss[MM_FILEPAGES]++;
915 }
916
917out_set_pte:
918 set_pte_at(dst_mm, addr, dst_pte, pte);
919 return 0;
920}
921
922int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
923 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
924 unsigned long addr, unsigned long end)
925{
926 pte_t *orig_src_pte, *orig_dst_pte;
927 pte_t *src_pte, *dst_pte;
928 spinlock_t *src_ptl, *dst_ptl;
929 int progress = 0;
930 int rss[NR_MM_COUNTERS];
931 swp_entry_t entry = (swp_entry_t){0};
932
933again:
934 init_rss_vec(rss);
935
936 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
937 if (!dst_pte)
938 return -ENOMEM;
939 src_pte = pte_offset_map(src_pmd, addr);
940 src_ptl = pte_lockptr(src_mm, src_pmd);
941 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
942 orig_src_pte = src_pte;
943 orig_dst_pte = dst_pte;
944 arch_enter_lazy_mmu_mode();
945
946 do {
947
948
949
950
951 if (progress >= 32) {
952 progress = 0;
953 if (need_resched() ||
954 spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
955 break;
956 }
957 if (pte_none(*src_pte)) {
958 progress++;
959 continue;
960 }
961 entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
962 vma, addr, rss);
963 if (entry.val)
964 break;
965 progress += 8;
966 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
967
968 arch_leave_lazy_mmu_mode();
969 spin_unlock(src_ptl);
970 pte_unmap(orig_src_pte);
971 add_mm_rss_vec(dst_mm, rss);
972 pte_unmap_unlock(orig_dst_pte, dst_ptl);
973 cond_resched();
974
975 if (entry.val) {
976 if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
977 return -ENOMEM;
978 progress = 0;
979 }
980 if (addr != end)
981 goto again;
982 return 0;
983}
984
985static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
986 pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
987 unsigned long addr, unsigned long end)
988{
989 pmd_t *src_pmd, *dst_pmd;
990 unsigned long next;
991
992 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
993 if (!dst_pmd)
994 return -ENOMEM;
995 src_pmd = pmd_offset(src_pud, addr);
996 do {
997 next = pmd_addr_end(addr, end);
998 if (pmd_trans_huge(*src_pmd)) {
999 int err;
1000 VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
1001 err = copy_huge_pmd(dst_mm, src_mm,
1002 dst_pmd, src_pmd, addr, vma);
1003 if (err == -ENOMEM)
1004 return -ENOMEM;
1005 if (!err)
1006 continue;
1007
1008 }
1009 if (pmd_none_or_clear_bad(src_pmd))
1010 continue;
1011 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
1012 vma, addr, next))
1013 return -ENOMEM;
1014 } while (dst_pmd++, src_pmd++, addr = next, addr != end);
1015 return 0;
1016}
1017
1018static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1019 pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
1020 unsigned long addr, unsigned long end)
1021{
1022 pud_t *src_pud, *dst_pud;
1023 unsigned long next;
1024
1025 dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
1026 if (!dst_pud)
1027 return -ENOMEM;
1028 src_pud = pud_offset(src_pgd, addr);
1029 do {
1030 next = pud_addr_end(addr, end);
1031 if (pud_none_or_clear_bad(src_pud))
1032 continue;
1033 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
1034 vma, addr, next))
1035 return -ENOMEM;
1036 } while (dst_pud++, src_pud++, addr = next, addr != end);
1037 return 0;
1038}
1039
1040int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1041 struct vm_area_struct *vma)
1042{
1043 pgd_t *src_pgd, *dst_pgd;
1044 unsigned long next;
1045 unsigned long addr = vma->vm_start;
1046 unsigned long end = vma->vm_end;
1047 unsigned long mmun_start;
1048 unsigned long mmun_end;
1049 bool is_cow;
1050 int ret;
1051
1052
1053
1054
1055
1056
1057
1058 if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR |
1059 VM_PFNMAP | VM_MIXEDMAP))) {
1060 if (!vma->anon_vma)
1061 return 0;
1062 }
1063
1064 if (is_vm_hugetlb_page(vma))
1065 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
1066
1067 if (unlikely(vma->vm_flags & VM_PFNMAP)) {
1068
1069
1070
1071
1072 ret = track_pfn_copy(vma);
1073 if (ret)
1074 return ret;
1075 }
1076
1077
1078
1079
1080
1081
1082
1083 is_cow = is_cow_mapping(vma->vm_flags);
1084 mmun_start = addr;
1085 mmun_end = end;
1086 if (is_cow)
1087 mmu_notifier_invalidate_range_start(src_mm, mmun_start,
1088 mmun_end);
1089
1090 ret = 0;
1091 dst_pgd = pgd_offset(dst_mm, addr);
1092 src_pgd = pgd_offset(src_mm, addr);
1093 do {
1094 next = pgd_addr_end(addr, end);
1095 if (pgd_none_or_clear_bad(src_pgd))
1096 continue;
1097 if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
1098 vma, addr, next))) {
1099 ret = -ENOMEM;
1100 break;
1101 }
1102 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
1103
1104 if (is_cow)
1105 mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
1106 return ret;
1107}
1108
1109static unsigned long zap_pte_range(struct mmu_gather *tlb,
1110 struct vm_area_struct *vma, pmd_t *pmd,
1111 unsigned long addr, unsigned long end,
1112 struct zap_details *details)
1113{
1114 struct mm_struct *mm = tlb->mm;
1115 int force_flush = 0;
1116 int rss[NR_MM_COUNTERS];
1117 spinlock_t *ptl;
1118 pte_t *start_pte;
1119 pte_t *pte;
1120
1121again:
1122 init_rss_vec(rss);
1123 start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
1124 pte = start_pte;
1125 arch_enter_lazy_mmu_mode();
1126 do {
1127 pte_t ptent = *pte;
1128 if (pte_none(ptent)) {
1129 continue;
1130 }
1131
1132 if (pte_present(ptent)) {
1133 struct page *page;
1134
1135 page = vm_normal_page(vma, addr, ptent);
1136 if (unlikely(details) && page) {
1137
1138
1139
1140
1141
1142 if (details->check_mapping &&
1143 details->check_mapping != page->mapping)
1144 continue;
1145
1146
1147
1148
1149 if (details->nonlinear_vma &&
1150 (page->index < details->first_index ||
1151 page->index > details->last_index))
1152 continue;
1153 }
1154 ptent = ptep_get_and_clear_full(mm, addr, pte,
1155 tlb->fullmm);
1156 tlb_remove_tlb_entry(tlb, pte, addr);
1157 if (unlikely(!page))
1158 continue;
1159 if (unlikely(details) && details->nonlinear_vma
1160 && linear_page_index(details->nonlinear_vma,
1161 addr) != page->index)
1162 set_pte_at(mm, addr, pte,
1163 pgoff_to_pte(page->index));
1164 if (PageAnon(page))
1165 rss[MM_ANONPAGES]--;
1166 else {
1167 if (pte_dirty(ptent))
1168 set_page_dirty(page);
1169 if (pte_young(ptent) &&
1170 likely(!VM_SequentialReadHint(vma)))
1171 mark_page_accessed(page);
1172 rss[MM_FILEPAGES]--;
1173 }
1174 page_remove_rmap(page);
1175 if (unlikely(page_mapcount(page) < 0))
1176 print_bad_pte(vma, addr, ptent, page);
1177 force_flush = !__tlb_remove_page(tlb, page);
1178 if (force_flush)
1179 break;
1180 continue;
1181 }
1182
1183
1184
1185
1186 if (unlikely(details))
1187 continue;
1188 if (pte_file(ptent)) {
1189 if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
1190 print_bad_pte(vma, addr, ptent, NULL);
1191 } else {
1192 swp_entry_t entry = pte_to_swp_entry(ptent);
1193
1194 if (!non_swap_entry(entry))
1195 rss[MM_SWAPENTS]--;
1196 else if (is_migration_entry(entry)) {
1197 struct page *page;
1198
1199 page = migration_entry_to_page(entry);
1200
1201 if (PageAnon(page))
1202 rss[MM_ANONPAGES]--;
1203 else
1204 rss[MM_FILEPAGES]--;
1205 }
1206 if (unlikely(!free_swap_and_cache(entry)))
1207 print_bad_pte(vma, addr, ptent, NULL);
1208 }
1209 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1210 } while (pte++, addr += PAGE_SIZE, addr != end);
1211
1212 add_mm_rss_vec(mm, rss);
1213 arch_leave_lazy_mmu_mode();
1214 pte_unmap_unlock(start_pte, ptl);
1215
1216
1217
1218
1219
1220
1221 if (force_flush) {
1222 force_flush = 0;
1223
1224#ifdef HAVE_GENERIC_MMU_GATHER
1225 tlb->start = addr;
1226 tlb->end = end;
1227#endif
1228 tlb_flush_mmu(tlb);
1229 if (addr != end)
1230 goto again;
1231 }
1232
1233 return addr;
1234}
1235
1236static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1237 struct vm_area_struct *vma, pud_t *pud,
1238 unsigned long addr, unsigned long end,
1239 struct zap_details *details)
1240{
1241 pmd_t *pmd;
1242 unsigned long next;
1243
1244 pmd = pmd_offset(pud, addr);
1245 do {
1246 next = pmd_addr_end(addr, end);
1247 if (pmd_trans_huge(*pmd)) {
1248 if (next - addr != HPAGE_PMD_SIZE) {
1249#ifdef CONFIG_DEBUG_VM
1250 if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
1251 pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
1252 __func__, addr, end,
1253 vma->vm_start,
1254 vma->vm_end);
1255 BUG();
1256 }
1257#endif
1258 split_huge_page_pmd(vma->vm_mm, pmd);
1259 } else if (zap_huge_pmd(tlb, vma, pmd, addr))
1260 goto next;
1261
1262 }
1263
1264
1265
1266
1267
1268
1269
1270 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1271 goto next;
1272 next = zap_pte_range(tlb, vma, pmd, addr, next, details);
1273next:
1274 cond_resched();
1275 } while (pmd++, addr = next, addr != end);
1276
1277 return addr;
1278}
1279
1280static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1281 struct vm_area_struct *vma, pgd_t *pgd,
1282 unsigned long addr, unsigned long end,
1283 struct zap_details *details)
1284{
1285 pud_t *pud;
1286 unsigned long next;
1287
1288 pud = pud_offset(pgd, addr);
1289 do {
1290 next = pud_addr_end(addr, end);
1291 if (pud_none_or_clear_bad(pud))
1292 continue;
1293 next = zap_pmd_range(tlb, vma, pud, addr, next, details);
1294 } while (pud++, addr = next, addr != end);
1295
1296 return addr;
1297}
1298
1299static void unmap_page_range(struct mmu_gather *tlb,
1300 struct vm_area_struct *vma,
1301 unsigned long addr, unsigned long end,
1302 struct zap_details *details)
1303{
1304 pgd_t *pgd;
1305 unsigned long next;
1306
1307 if (details && !details->check_mapping && !details->nonlinear_vma)
1308 details = NULL;
1309
1310 BUG_ON(addr >= end);
1311 mem_cgroup_uncharge_start();
1312 tlb_start_vma(tlb, vma);
1313 pgd = pgd_offset(vma->vm_mm, addr);
1314 do {
1315 next = pgd_addr_end(addr, end);
1316 if (pgd_none_or_clear_bad(pgd))
1317 continue;
1318 next = zap_pud_range(tlb, vma, pgd, addr, next, details);
1319 } while (pgd++, addr = next, addr != end);
1320 tlb_end_vma(tlb, vma);
1321 mem_cgroup_uncharge_end();
1322}
1323
1324
1325static void unmap_single_vma(struct mmu_gather *tlb,
1326 struct vm_area_struct *vma, unsigned long start_addr,
1327 unsigned long end_addr,
1328 struct zap_details *details)
1329{
1330 unsigned long start = max(vma->vm_start, start_addr);
1331 unsigned long end;
1332
1333 if (start >= vma->vm_end)
1334 return;
1335 end = min(vma->vm_end, end_addr);
1336 if (end <= vma->vm_start)
1337 return;
1338
1339 if (vma->vm_file)
1340 uprobe_munmap(vma, start, end);
1341
1342 if (unlikely(vma->vm_flags & VM_PFNMAP))
1343 untrack_pfn(vma, 0, 0);
1344
1345 if (start != end) {
1346 if (unlikely(is_vm_hugetlb_page(vma))) {
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358 if (vma->vm_file) {
1359 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
1360 __unmap_hugepage_range_final(tlb, vma, start, end, NULL);
1361 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
1362 }
1363 } else
1364 unmap_page_range(tlb, vma, start, end, details);
1365 }
1366}
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386void unmap_vmas(struct mmu_gather *tlb,
1387 struct vm_area_struct *vma, unsigned long start_addr,
1388 unsigned long end_addr)
1389{
1390 struct mm_struct *mm = vma->vm_mm;
1391
1392 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
1393 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
1394 unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
1395 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
1396}
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407void zap_page_range(struct vm_area_struct *vma, unsigned long start,
1408 unsigned long size, struct zap_details *details)
1409{
1410 struct mm_struct *mm = vma->vm_mm;
1411 struct mmu_gather tlb;
1412 unsigned long end = start + size;
1413
1414 lru_add_drain();
1415 tlb_gather_mmu(&tlb, mm, 0);
1416 update_hiwater_rss(mm);
1417 mmu_notifier_invalidate_range_start(mm, start, end);
1418 for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
1419 unmap_single_vma(&tlb, vma, start, end, details);
1420 mmu_notifier_invalidate_range_end(mm, start, end);
1421 tlb_finish_mmu(&tlb, start, end);
1422}
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
1434 unsigned long size, struct zap_details *details)
1435{
1436 struct mm_struct *mm = vma->vm_mm;
1437 struct mmu_gather tlb;
1438 unsigned long end = address + size;
1439
1440 lru_add_drain();
1441 tlb_gather_mmu(&tlb, mm, 0);
1442 update_hiwater_rss(mm);
1443 mmu_notifier_invalidate_range_start(mm, address, end);
1444 unmap_single_vma(&tlb, vma, address, end, details);
1445 mmu_notifier_invalidate_range_end(mm, address, end);
1446 tlb_finish_mmu(&tlb, address, end);
1447}
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1462 unsigned long size)
1463{
1464 if (address < vma->vm_start || address + size > vma->vm_end ||
1465 !(vma->vm_flags & VM_PFNMAP))
1466 return -1;
1467 zap_page_range_single(vma, address, size, NULL);
1468 return 0;
1469}
1470EXPORT_SYMBOL_GPL(zap_vma_ptes);
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
/*
 * follow_page - look up the page mapped at a user virtual address
 * @vma: area whose mm's page tables are walked
 * @address: virtual address to look up
 * @flags: FOLL_* flags modifying the lookup
 *
 * Walks pgd -> pud -> pmd -> pte for @address in @vma->vm_mm, handing
 * huge mappings to the follow_huge_*() / follow_trans_huge_pmd() helpers.
 *
 * Returns the page found, NULL when no suitable page is mapped (this
 * includes a FOLL_WRITE lookup that hits a non-writable pte), or
 * ERR_PTR(-EFAULT) for a pte with no normal page that cannot be treated
 * as the zero page, and for a FOLL_DUMP lookup of an unpopulated address
 * in a vma that has no ->fault handler.
 */
struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
			unsigned int flags)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep, pte;
	spinlock_t *ptl;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	/* Give the hugetlb address-based lookup the first chance. */
	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
	if (!IS_ERR(page)) {
		/* This path takes no reference, so FOLL_GET is not allowed. */
		BUG_ON(flags & FOLL_GET);
		goto out;
	}

	/* From here on, "nothing found" is reported as NULL. */
	page = NULL;
	pgd = pgd_offset(mm, address);
	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
		goto no_page_table;

	pud = pud_offset(pgd, address);
	if (pud_none(*pud))
		goto no_page_table;
	/* pud-level huge page in a hugetlb vma. */
	if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
		BUG_ON(flags & FOLL_GET);
		page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
		goto out;
	}
	if (unlikely(pud_bad(*pud)))
		goto no_page_table;

	pmd = pmd_offset(pud, address);
	if (pmd_none(*pmd))
		goto no_page_table;
	/* pmd-level huge page in a hugetlb vma. */
	if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
		BUG_ON(flags & FOLL_GET);
		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
		goto out;
	}
	if (pmd_trans_huge(*pmd)) {
		/* FOLL_SPLIT: split the huge pmd and continue at pte level. */
		if (flags & FOLL_SPLIT) {
			split_huge_page_pmd(mm, pmd);
			goto split_fallthrough;
		}
		/* Re-check the pmd under page_table_lock before using it. */
		spin_lock(&mm->page_table_lock);
		if (likely(pmd_trans_huge(*pmd))) {
			if (unlikely(pmd_trans_splitting(*pmd))) {
				/*
				 * A split is in progress: drop the lock, wait
				 * for it, then fall through to the pte walk.
				 */
				spin_unlock(&mm->page_table_lock);
				wait_split_huge_page(vma->anon_vma, pmd);
			} else {
				page = follow_trans_huge_pmd(vma, address,
							     pmd, flags);
				spin_unlock(&mm->page_table_lock);
				goto out;
			}
		} else
			spin_unlock(&mm->page_table_lock);
		/* No longer a huge pmd: continue with the pte walk below. */
	}
split_fallthrough:
	if (unlikely(pmd_bad(*pmd)))
		goto no_page_table;

	/* Map the pte and take its lock; released at unlock/bad_page/no_page. */
	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);

	pte = *ptep;
	if (!pte_present(pte))
		goto no_page;
	/* A write lookup needs a writable pte; otherwise return NULL. */
	if ((flags & FOLL_WRITE) && !pte_write(pte))
		goto unlock;

	page = vm_normal_page(vma, address, pte);
	if (unlikely(!page)) {
		/*
		 * No normal page: only the zero page is acceptable here, and
		 * not for FOLL_DUMP lookups.
		 */
		if ((flags & FOLL_DUMP) ||
		    !is_zero_pfn(pte_pfn(pte)))
			goto bad_page;
		page = pte_page(pte);
	}

	/* Take a reference for the caller when asked to. */
	if (flags & FOLL_GET)
		get_page_foll(page);
	if (flags & FOLL_TOUCH) {
		/* Dirty the page if neither the pte nor the page is dirty yet. */
		if ((flags & FOLL_WRITE) &&
		    !pte_dirty(pte) && !PageDirty(page))
			set_page_dirty(page);
		/* Record the access on the page itself. */
		mark_page_accessed(page);
	}
	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
		/*
		 * Best effort: the page is mlocked only if it has a mapping
		 * and its lock can be taken without blocking (trylock), since
		 * the pte lock is held here.
		 */
		if (page->mapping && trylock_page(page)) {
			lru_add_drain();  /* push pages in pagevec to LRU */
			mlock_vma_page(page);
			unlock_page(page);
		}
	}
unlock:
	pte_unmap_unlock(ptep, ptl);
out:
	return page;

bad_page:
	pte_unmap_unlock(ptep, ptl);
	return ERR_PTR(-EFAULT);

no_page:
	pte_unmap_unlock(ptep, ptl);
	/* A non-present but non-empty pte: report NULL (page is still NULL). */
	if (!pte_none(pte))
		return page;

no_page_table:
	/*
	 * Nothing is populated at this address.  For FOLL_DUMP, a vma with
	 * no ->fault handler yields an error instead of NULL.
	 */
	if ((flags & FOLL_DUMP) &&
	    (!vma->vm_ops || !vma->vm_ops->fault))
		return ERR_PTR(-EFAULT);
	return page;
}
1628
1629static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
1630{
1631 return stack_guard_page_start(vma, addr) ||
1632 stack_guard_page_end(vma, addr+PAGE_SIZE);
1633}
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
/*
 * __get_user_pages - look up (and fault in, if needed) a range of user pages
 * @tsk: task whose maj_flt/min_flt counters are charged for faults; may be
 *       NULL, in which case no fault accounting is done
 * @mm: address space to operate on
 * @start: first user address
 * @nr_pages: number of pages to process from @start
 * @gup_flags: FOLL_* flags
 * @pages: array receiving the page pointers; must be non-NULL exactly when
 *         FOLL_GET is set (enforced by VM_BUG_ON)
 * @vmas: optional array receiving the vma for each page
 * @nonblocking: optional; when non-NULL, faults are issued with
 *               FAULT_FLAG_ALLOW_RETRY and *@nonblocking is cleared if the
 *               fault handler asked for a retry
 *
 * Returns the number of pages processed, which may be fewer than requested.
 * When the failure happens before any page was processed, a negative errno
 * is returned instead (-EFAULT, -ENOMEM, -EHWPOISON, -ERESTARTSYS or the
 * error encoded by follow_page()); the VM_FAULT_RETRY case returns the
 * count even when it is 0.
 */
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
		     unsigned long start, int nr_pages, unsigned int gup_flags,
		     struct page **pages, struct vm_area_struct **vmas,
		     int *nonblocking)
{
	int i;
	unsigned long vm_flags;

	if (nr_pages <= 0)
		return 0;

	VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));

	/*
	 * Permission bits the vma must have.  With FOLL_FORCE only the
	 * VM_MAY* bits are required; otherwise the plain VM_READ/VM_WRITE
	 * bits are.
	 */
	vm_flags  = (gup_flags & FOLL_WRITE) ?
			(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
	vm_flags &= (gup_flags & FOLL_FORCE) ?
			(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
	i = 0;

	do {
		struct vm_area_struct *vma;

		vma = find_extend_vma(mm, start);
		if (!vma && in_gate_area(mm, start)) {
			/* No vma, but the address is in the gate area. */
			unsigned long pg = start & PAGE_MASK;
			pgd_t *pgd;
			pud_t *pud;
			pmd_t *pmd;
			pte_t *pte;

			/* The gate area is never handed out for writing. */
			if (gup_flags & FOLL_WRITE)
				return i ? : -EFAULT;
			/* Pick kernel or gate page tables based on the address. */
			if (pg > TASK_SIZE)
				pgd = pgd_offset_k(pg);
			else
				pgd = pgd_offset_gate(mm, pg);
			BUG_ON(pgd_none(*pgd));
			pud = pud_offset(pgd, pg);
			BUG_ON(pud_none(*pud));
			pmd = pmd_offset(pud, pg);
			if (pmd_none(*pmd))
				return i ? : -EFAULT;
			VM_BUG_ON(pmd_trans_huge(*pmd));
			pte = pte_offset_map(pmd, pg);
			if (pte_none(*pte)) {
				pte_unmap(pte);
				return i ? : -EFAULT;
			}
			vma = get_gate_vma(mm);
			if (pages) {
				struct page *page;

				page = vm_normal_page(vma, start, *pte);
				if (!page) {
					/*
					 * Accept the zero page unless this is
					 * a FOLL_DUMP lookup; anything else
					 * without a normal page is an error.
					 */
					if (!(gup_flags & FOLL_DUMP) &&
					     is_zero_pfn(pte_pfn(*pte)))
						page = pte_page(*pte);
					else {
						pte_unmap(pte);
						return i ? : -EFAULT;
					}
				}
				pages[i] = page;
				get_page(page);
			}
			pte_unmap(pte);
			/* Jumps into the per-page loop below to do the bookkeeping. */
			goto next_page;
		}

		/* Reject missing vmas, IO/PFN mappings and insufficient rights. */
		if (!vma ||
		    (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
		    !(vm_flags & vma->vm_flags))
			return i ? : -EFAULT;

		/* hugetlb vmas are handled wholesale; it updates start/nr_pages. */
		if (is_vm_hugetlb_page(vma)) {
			i = follow_hugetlb_page(mm, vma, pages, vmas,
					&start, &nr_pages, i, gup_flags);
			continue;
		}

		do {
			struct page *page;
			unsigned int foll_flags = gup_flags;

			/* Give up early if the caller is being killed. */
			if (unlikely(fatal_signal_pending(current)))
				return i ? i : -ERESTARTSYS;

			cond_resched();
			/* Fault the page in until follow_page() finds it or errors. */
			while (!(page = follow_page(vma, start, foll_flags))) {
				int ret;
				unsigned int fault_flags = 0;

				/* For mlock, skip stack guard pages instead of faulting them. */
				if (foll_flags & FOLL_MLOCK) {
					if (stack_guard_page(vma, start))
						goto next_page;
				}
				if (foll_flags & FOLL_WRITE)
					fault_flags |= FAULT_FLAG_WRITE;
				if (nonblocking)
					fault_flags |= FAULT_FLAG_ALLOW_RETRY;
				if (foll_flags & FOLL_NOWAIT)
					fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT);

				ret = handle_mm_fault(mm, vma, start,
							fault_flags);

				if (ret & VM_FAULT_ERROR) {
					if (ret & VM_FAULT_OOM)
						return i ? i : -ENOMEM;
					if (ret & (VM_FAULT_HWPOISON |
						   VM_FAULT_HWPOISON_LARGE)) {
						if (i)
							return i;
						/* -EHWPOISON only for callers that opted in. */
						else if (gup_flags & FOLL_HWPOISON)
							return -EHWPOISON;
						else
							return -EFAULT;
					}
					if (ret & VM_FAULT_SIGBUS)
						return i ? i : -EFAULT;
					/* Any other VM_FAULT_ERROR bit is unexpected here. */
					BUG();
				}

				/* Fault accounting, only when a task was supplied. */
				if (tsk) {
					if (ret & VM_FAULT_MAJOR)
						tsk->maj_flt++;
					else
						tsk->min_flt++;
				}

				if (ret & VM_FAULT_RETRY) {
					if (nonblocking)
						*nonblocking = 0;
					return i;
				}

				/*
				 * The fault handler reported that it handled
				 * the write (VM_FAULT_WRITE) but the vma is not
				 * VM_WRITE: stop requiring a writable pte on
				 * the follow_page() retry so the loop can
				 * terminate.
				 *
				 * NOTE(review): after FOLL_WRITE is dropped the
				 * retry accepts any present pte; presumably this
				 * relies on the pte not changing between the
				 * fault and the retry - confirm against later
				 * upstream hardening of this path.
				 */
				if ((ret & VM_FAULT_WRITE) &&
				    !(vma->vm_flags & VM_WRITE))
					foll_flags &= ~FOLL_WRITE;

				cond_resched();
			}
			if (IS_ERR(page))
				return i ? i : PTR_ERR(page);
			if (pages) {
				pages[i] = page;

				flush_anon_page(vma, page, start);
				flush_dcache_page(page);
			}
next_page:
			/* Per-page bookkeeping, also reached via the gotos above. */
			if (vmas)
				vmas[i] = vma;
			i++;
			start += PAGE_SIZE;
			nr_pages--;
		} while (nr_pages && start < vma->vm_end);
	} while (nr_pages);
	return i;
}
EXPORT_SYMBOL(__get_user_pages);
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
1896 unsigned long address, unsigned int fault_flags)
1897{
1898 struct vm_area_struct *vma;
1899 int ret;
1900
1901 vma = find_extend_vma(mm, address);
1902 if (!vma || address < vma->vm_start)
1903 return -EFAULT;
1904
1905 ret = handle_mm_fault(mm, vma, address, fault_flags);
1906 if (ret & VM_FAULT_ERROR) {
1907 if (ret & VM_FAULT_OOM)
1908 return -ENOMEM;
1909 if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
1910 return -EHWPOISON;
1911 if (ret & VM_FAULT_SIGBUS)
1912 return -EFAULT;
1913 BUG();
1914 }
1915 if (tsk) {
1916 if (ret & VM_FAULT_MAJOR)
1917 tsk->maj_flt++;
1918 else
1919 tsk->min_flt++;
1920 }
1921 return 0;
1922}
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1976 unsigned long start, int nr_pages, int write, int force,
1977 struct page **pages, struct vm_area_struct **vmas)
1978{
1979 int flags = FOLL_TOUCH;
1980
1981 if (pages)
1982 flags |= FOLL_GET;
1983 if (write)
1984 flags |= FOLL_WRITE;
1985 if (force)
1986 flags |= FOLL_FORCE;
1987
1988 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
1989 NULL);
1990}
1991EXPORT_SYMBOL(get_user_pages);
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
#ifdef CONFIG_ELF_CORE
/*
 * get_dump_page - fetch one page of the current task for a core dump
 * @addr: user address inside current->mm
 *
 * Uses __get_user_pages() with FOLL_FORCE | FOLL_DUMP | FOLL_GET on a
 * single page.  Returns the page with a reference held, after
 * flush_cache_page() has been called for it, or NULL when the lookup did
 * not yield a page.
 */
struct page *get_dump_page(unsigned long addr)
{
	struct vm_area_struct *vma;
	struct page *page;
	int got;

	got = __get_user_pages(current, current->mm, addr, 1,
			       FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
			       NULL);
	if (got < 1)
		return NULL;

	flush_cache_page(vma, addr, page_to_pfn(page));
	return page;
}
#endif /* CONFIG_ELF_CORE */
2021
2022pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
2023 spinlock_t **ptl)
2024{
2025 pgd_t * pgd = pgd_offset(mm, addr);
2026 pud_t * pud = pud_alloc(mm, pgd, addr);
2027 if (pud) {
2028 pmd_t * pmd = pmd_alloc(mm, pud, addr);
2029 if (pmd) {
2030 VM_BUG_ON(pmd_trans_huge(*pmd));
2031 return pte_alloc_map_lock(mm, pmd, addr, ptl);
2032 }
2033 }
2034 return NULL;
2035}
2036
2037
2038
2039
2040
2041
2042
2043
2044static int insert_page(struct vm_area_struct *vma, unsigned long addr,
2045 struct page *page, pgprot_t prot)
2046{
2047 struct mm_struct *mm = vma->vm_mm;
2048 int retval;
2049 pte_t *pte;
2050 spinlock_t *ptl;
2051
2052 retval = -EINVAL;
2053 if (PageAnon(page))
2054 goto out;
2055 retval = -ENOMEM;
2056 flush_dcache_page(page);
2057 pte = get_locked_pte(mm, addr, &ptl);
2058 if (!pte)
2059 goto out;
2060 retval = -EBUSY;
2061 if (!pte_none(*pte))
2062 goto out_unlock;
2063
2064
2065 get_page(page);
2066 inc_mm_counter_fast(mm, MM_FILEPAGES);
2067 page_add_file_rmap(page);
2068 set_pte_at(mm, addr, pte, mk_pte(page, prot));
2069
2070 retval = 0;
2071 pte_unmap_unlock(pte, ptl);
2072 return retval;
2073out_unlock:
2074 pte_unmap_unlock(pte, ptl);
2075out:
2076 return retval;
2077}
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
2107 struct page *page)
2108{
2109 if (addr < vma->vm_start || addr >= vma->vm_end)
2110 return -EFAULT;
2111 if (!page_count(page))
2112 return -EINVAL;
2113 if (!(vma->vm_flags & VM_MIXEDMAP)) {
2114 BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
2115 BUG_ON(vma->vm_flags & VM_PFNMAP);
2116 vma->vm_flags |= VM_MIXEDMAP;
2117 }
2118 return insert_page(vma, addr, page, vma->vm_page_prot);
2119}
2120EXPORT_SYMBOL(vm_insert_page);
2121
2122static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2123 unsigned long pfn, pgprot_t prot)
2124{
2125 struct mm_struct *mm = vma->vm_mm;
2126 int retval;
2127 pte_t *pte, entry;
2128 spinlock_t *ptl;
2129
2130 retval = -ENOMEM;
2131 pte = get_locked_pte(mm, addr, &ptl);
2132 if (!pte)
2133 goto out;
2134 retval = -EBUSY;
2135 if (!pte_none(*pte))
2136 goto out_unlock;
2137
2138
2139 entry = pte_mkspecial(pfn_pte(pfn, prot));
2140 set_pte_at(mm, addr, pte, entry);
2141 update_mmu_cache(vma, addr, pte);
2142
2143 retval = 0;
2144out_unlock:
2145 pte_unmap_unlock(pte, ptl);
2146out:
2147 return retval;
2148}
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2168 unsigned long pfn)
2169{
2170 int ret;
2171 pgprot_t pgprot = vma->vm_page_prot;
2172
2173
2174
2175
2176
2177
2178 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
2179 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
2180 (VM_PFNMAP|VM_MIXEDMAP));
2181 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
2182 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
2183
2184 if (addr < vma->vm_start || addr >= vma->vm_end)
2185 return -EFAULT;
2186 if (track_pfn_insert(vma, &pgprot, pfn))
2187 return -EINVAL;
2188
2189 ret = insert_pfn(vma, addr, pfn, pgprot);
2190
2191 return ret;
2192}
2193EXPORT_SYMBOL(vm_insert_pfn);
2194
2195int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
2196 unsigned long pfn)
2197{
2198 BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
2199
2200 if (addr < vma->vm_start || addr >= vma->vm_end)
2201 return -EFAULT;
2202
2203
2204
2205
2206
2207
2208
2209
2210 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
2211 struct page *page;
2212
2213 page = pfn_to_page(pfn);
2214 return insert_page(vma, addr, page, vma->vm_page_prot);
2215 }
2216 return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
2217}
2218EXPORT_SYMBOL(vm_insert_mixed);
2219
2220
2221
2222
2223
2224
2225static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
2226 unsigned long addr, unsigned long end,
2227 unsigned long pfn, pgprot_t prot)
2228{
2229 pte_t *pte;
2230 spinlock_t *ptl;
2231
2232 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
2233 if (!pte)
2234 return -ENOMEM;
2235 arch_enter_lazy_mmu_mode();
2236 do {
2237 BUG_ON(!pte_none(*pte));
2238 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
2239 pfn++;
2240 } while (pte++, addr += PAGE_SIZE, addr != end);
2241 arch_leave_lazy_mmu_mode();
2242 pte_unmap_unlock(pte - 1, ptl);
2243 return 0;
2244}
2245
2246static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
2247 unsigned long addr, unsigned long end,
2248 unsigned long pfn, pgprot_t prot)
2249{
2250 pmd_t *pmd;
2251 unsigned long next;
2252
2253 pfn -= addr >> PAGE_SHIFT;
2254 pmd = pmd_alloc(mm, pud, addr);
2255 if (!pmd)
2256 return -ENOMEM;
2257 VM_BUG_ON(pmd_trans_huge(*pmd));
2258 do {
2259 next = pmd_addr_end(addr, end);
2260 if (remap_pte_range(mm, pmd, addr, next,
2261 pfn + (addr >> PAGE_SHIFT), prot))
2262 return -ENOMEM;
2263 } while (pmd++, addr = next, addr != end);
2264 return 0;
2265}
2266
2267static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
2268 unsigned long addr, unsigned long end,
2269 unsigned long pfn, pgprot_t prot)
2270{
2271 pud_t *pud;
2272 unsigned long next;
2273
2274 pfn -= addr >> PAGE_SHIFT;
2275 pud = pud_alloc(mm, pgd, addr);
2276 if (!pud)
2277 return -ENOMEM;
2278 do {
2279 next = pud_addr_end(addr, end);
2280 if (remap_pmd_range(mm, pud, addr, next,
2281 pfn + (addr >> PAGE_SHIFT), prot))
2282 return -ENOMEM;
2283 } while (pud++, addr = next, addr != end);
2284 return 0;
2285}
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
2298 unsigned long pfn, unsigned long size, pgprot_t prot)
2299{
2300 pgd_t *pgd;
2301 unsigned long next;
2302 unsigned long end = addr + PAGE_ALIGN(size);
2303 struct mm_struct *mm = vma->vm_mm;
2304 int err;
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324 if (is_cow_mapping(vma->vm_flags)) {
2325 if (addr != vma->vm_start || end != vma->vm_end)
2326 return -EINVAL;
2327 vma->vm_pgoff = pfn;
2328 }
2329
2330 err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
2331 if (err)
2332 return -EINVAL;
2333
2334 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
2335
2336 BUG_ON(addr >= end);
2337 pfn -= addr >> PAGE_SHIFT;
2338 pgd = pgd_offset(mm, addr);
2339 flush_cache_range(vma, addr, end);
2340 do {
2341 next = pgd_addr_end(addr, end);
2342 err = remap_pud_range(mm, pgd, addr, next,
2343 pfn + (addr >> PAGE_SHIFT), prot);
2344 if (err)
2345 break;
2346 } while (pgd++, addr = next, addr != end);
2347
2348 if (err)
2349 untrack_pfn(vma, pfn, PAGE_ALIGN(size));
2350
2351 return err;
2352}
2353EXPORT_SYMBOL(remap_pfn_range);
2354
2355static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
2356 unsigned long addr, unsigned long end,
2357 pte_fn_t fn, void *data)
2358{
2359 pte_t *pte;
2360 int err;
2361 pgtable_t token;
2362 spinlock_t *uninitialized_var(ptl);
2363
2364 pte = (mm == &init_mm) ?
2365 pte_alloc_kernel(pmd, addr) :
2366 pte_alloc_map_lock(mm, pmd, addr, &ptl);
2367 if (!pte)
2368 return -ENOMEM;
2369
2370 BUG_ON(pmd_huge(*pmd));
2371
2372 arch_enter_lazy_mmu_mode();
2373
2374 token = pmd_pgtable(*pmd);
2375
2376 do {
2377 err = fn(pte++, token, addr, data);
2378 if (err)
2379 break;
2380 } while (addr += PAGE_SIZE, addr != end);
2381
2382 arch_leave_lazy_mmu_mode();
2383
2384 if (mm != &init_mm)
2385 pte_unmap_unlock(pte-1, ptl);
2386 return err;
2387}
2388
2389static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
2390 unsigned long addr, unsigned long end,
2391 pte_fn_t fn, void *data)
2392{
2393 pmd_t *pmd;
2394 unsigned long next;
2395 int err;
2396
2397 BUG_ON(pud_huge(*pud));
2398
2399 pmd = pmd_alloc(mm, pud, addr);
2400 if (!pmd)
2401 return -ENOMEM;
2402 do {
2403 next = pmd_addr_end(addr, end);
2404 err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
2405 if (err)
2406 break;
2407 } while (pmd++, addr = next, addr != end);
2408 return err;
2409}
2410
2411static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
2412 unsigned long addr, unsigned long end,
2413 pte_fn_t fn, void *data)
2414{
2415 pud_t *pud;
2416 unsigned long next;
2417 int err;
2418
2419 pud = pud_alloc(mm, pgd, addr);
2420 if (!pud)
2421 return -ENOMEM;
2422 do {
2423 next = pud_addr_end(addr, end);
2424 err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
2425 if (err)
2426 break;
2427 } while (pud++, addr = next, addr != end);
2428 return err;
2429}
2430
2431
2432
2433
2434
2435int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2436 unsigned long size, pte_fn_t fn, void *data)
2437{
2438 pgd_t *pgd;
2439 unsigned long next;
2440 unsigned long end = addr + size;
2441 int err;
2442
2443 BUG_ON(addr >= end);
2444 pgd = pgd_offset(mm, addr);
2445 do {
2446 next = pgd_addr_end(addr, end);
2447 err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
2448 if (err)
2449 break;
2450 } while (pgd++, addr = next, addr != end);
2451
2452 return err;
2453}
2454EXPORT_SYMBOL_GPL(apply_to_page_range);
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
2466 pte_t *page_table, pte_t orig_pte)
2467{
2468 int same = 1;
2469#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
2470 if (sizeof(pte_t) > sizeof(unsigned long)) {
2471 spinlock_t *ptl = pte_lockptr(mm, pmd);
2472 spin_lock(ptl);
2473 same = pte_same(*page_table, orig_pte);
2474 spin_unlock(ptl);
2475 }
2476#endif
2477 pte_unmap(page_table);
2478 return same;
2479}
2480
2481static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
2482{
2483
2484
2485
2486
2487
2488
2489 if (unlikely(!src)) {
2490 void *kaddr = kmap_atomic(dst);
2491 void __user *uaddr = (void __user *)(va & PAGE_MASK);
2492
2493
2494
2495
2496
2497
2498
2499 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
2500 clear_page(kaddr);
2501 kunmap_atomic(kaddr);
2502 flush_dcache_page(dst);
2503 } else
2504 copy_user_highpage(dst, src, va, vma);
2505}
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
/*
 * do_wp_page - handle a write fault on a present, write-protected pte
 * @mm: address space of the fault
 * @vma: area containing @address
 * @address: faulting user address
 * @page_table: mapped pte for @address
 * @pmd: pmd covering @address
 * @ptl: pte lock, held on entry
 * @orig_pte: pte value the caller observed
 *
 * Either makes the existing pte writable in place (the "reuse" path) or
 * allocates a new page, copies the old contents and switches the pte to
 * the copy (the "gotten" path).
 *
 * Entered with the pte mapped and @ptl held; every exit path has unmapped
 * the pte and dropped the lock (see the __releases annotation).
 *
 * Returns a VM_FAULT_* mask: VM_FAULT_WRITE is set when the pte was made
 * writable or replaced by a private copy, VM_FAULT_OOM on allocation or
 * charge failure, or the error bits reported by ->page_mkwrite().
 */
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		spinlock_t *ptl, pte_t orig_pte)
	__releases(ptl)
{
	struct page *old_page, *new_page = NULL;
	pte_t entry;
	int ret = 0;
	int page_mkwrite = 0;
	struct page *dirty_page = NULL;
	unsigned long mmun_start = 0;	/* For mmu_notifiers */
	unsigned long mmun_end = 0;	/* For mmu_notifiers */

	old_page = vm_normal_page(vma, address, orig_pte);
	if (!old_page) {
		/*
		 * No struct page behind this pte.  A shared writable mapping
		 * just gets write access; anything else is copied, with
		 * old_page == NULL telling the copy path to read the data
		 * through the user address.
		 */
		if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
				     (VM_WRITE|VM_SHARED))
			goto reuse;
		goto gotten;
	}

	/*
	 * Anonymous, non-KSM page: try to reuse it in place instead of
	 * copying.  That decision needs the page lock.
	 */
	if (PageAnon(old_page) && !PageKsm(old_page)) {
		if (!trylock_page(old_page)) {
			/*
			 * Cannot sleep for the page lock under the pte lock:
			 * pin the page, drop the pte lock, sleep, then retake
			 * the pte lock and make sure the pte did not change.
			 */
			page_cache_get(old_page);
			pte_unmap_unlock(page_table, ptl);
			lock_page(old_page);
			page_table = pte_offset_map_lock(mm, pmd, address,
							 &ptl);
			if (!pte_same(*page_table, orig_pte)) {
				/* The pin taken above is dropped at "unlock". */
				unlock_page(old_page);
				goto unlock;
			}
			page_cache_release(old_page);
		}
		if (reuse_swap_page(old_page)) {
			/* Page can be reused: re-home its rmap to this vma. */
			page_move_anon_rmap(old_page, vma, address);
			unlock_page(old_page);
			goto reuse;
		}
		unlock_page(old_page);
	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
					(VM_WRITE|VM_SHARED))) {
		/*
		 * Shared writable mapping of a normal page: write in place,
		 * after giving the owner a chance to object or prepare via
		 * ->page_mkwrite().
		 */
		if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
			struct vm_fault vmf;
			int tmp;

			vmf.virtual_address = (void __user *)(address &
								PAGE_MASK);
			vmf.pgoff = old_page->index;
			vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
			vmf.page = old_page;

			/*
			 * The callback runs without the pte lock, so pin the
			 * page first and re-validate the pte afterwards.
			 */
			page_cache_get(old_page);
			pte_unmap_unlock(page_table, ptl);

			tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
			if (unlikely(tmp &
					(VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
				ret = tmp;
				goto unwritable_page;
			}
			if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
				/* Callback did not lock the page: do it here. */
				lock_page(old_page);
				if (!old_page->mapping) {
					/* Page lost its mapping meanwhile. */
					ret = 0; /* retry the fault */
					unlock_page(old_page);
					goto unwritable_page;
				}
			} else
				VM_BUG_ON(!PageLocked(old_page));

			/*
			 * Retake the pte lock and verify the pte is unchanged
			 * since the lock was dropped for the callback.
			 */
			page_table = pte_offset_map_lock(mm, pmd, address,
							 &ptl);
			if (!pte_same(*page_table, orig_pte)) {
				unlock_page(old_page);
				goto unlock;
			}

			page_mkwrite = 1;
		}
		dirty_page = old_page;
		get_page(dirty_page);

reuse:
		/* Make the existing pte young, dirty and (if allowed) writable. */
		flush_cache_page(vma, address, pte_pfn(orig_pte));
		entry = pte_mkyoung(orig_pte);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		if (ptep_set_access_flags(vma, address, page_table, entry,1))
			update_mmu_cache(vma, address, page_table);
		pte_unmap_unlock(page_table, ptl);
		ret |= VM_FAULT_WRITE;

		/* Nothing more to do unless a shared file page was dirtied. */
		if (!dirty_page)
			return ret;

		/*
		 * Dirty-page bookkeeping.  Without ->page_mkwrite the page is
		 * not locked here, so wait for any current holder of the page
		 * lock before dirtying, and update the file time directly.
		 */
		if (!page_mkwrite) {
			wait_on_page_locked(dirty_page);
			set_page_dirty_balance(dirty_page, page_mkwrite);
			/* file_update_time outside page_lock */
			if (vma->vm_file)
				file_update_time(vma->vm_file);
		}
		/* Drops the reference taken by get_page(dirty_page) above. */
		put_page(dirty_page);
		if (page_mkwrite) {
			/*
			 * The page is still locked and still pinned by the
			 * page_cache_get() done before ->page_mkwrite().
			 */
			struct address_space *mapping = dirty_page->mapping;

			set_page_dirty(dirty_page);
			unlock_page(dirty_page);
			page_cache_release(dirty_page);
			if (mapping)	{
				/* Throttle only if the page still had a mapping. */
				balance_dirty_pages_ratelimited(mapping);
			}
		}

		return ret;
	}

	/*
	 * Copy path for a real old page: pin it across the window where the
	 * pte lock is dropped.
	 */
	page_cache_get(old_page);
gotten:
	pte_unmap_unlock(page_table, ptl);

	if (unlikely(anon_vma_prepare(vma)))
		goto oom;

	if (is_zero_pfn(pte_pfn(orig_pte))) {
		/* Source is the zero page: a zeroed page needs no copy. */
		new_page = alloc_zeroed_user_highpage_movable(vma, address);
		if (!new_page)
			goto oom;
	} else {
		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
		if (!new_page)
			goto oom;
		cow_user_page(new_page, old_page, address, vma);
	}
	__SetPageUptodate(new_page);

	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
		goto oom_free_new;

	mmun_start  = address & PAGE_MASK;
	mmun_end    = mmun_start + PAGE_SIZE;
	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	/*
	 * Retake the pte lock and only switch to the copy if the pte is
	 * still what the caller saw.
	 */
	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (likely(pte_same(*page_table, orig_pte))) {
		if (old_page) {
			/* A file page becomes anonymous: move the rss accounting. */
			if (!PageAnon(old_page)) {
				dec_mm_counter_fast(mm, MM_FILEPAGES);
				inc_mm_counter_fast(mm, MM_ANONPAGES);
			}
		} else
			inc_mm_counter_fast(mm, MM_ANONPAGES);
		flush_cache_page(vma, address, pte_pfn(orig_pte));
		entry = mk_pte(new_page, vma->vm_page_prot);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		/*
		 * Clear and flush the old pte before installing the new one,
		 * so the old translation is gone before the new page is
		 * visible.
		 */
		ptep_clear_flush(vma, address, page_table);
		page_add_new_anon_rmap(new_page, vma, address);
		/* Install the new pte; the _notify variant informs mmu notifiers. */
		set_pte_at_notify(mm, address, page_table, entry);
		update_mmu_cache(vma, address, page_table);
		if (old_page) {
			/* The old page is no longer mapped at this address. */
			page_remove_rmap(old_page);
		}

		/*
		 * The copy is now in use.  Point new_page at old_page so the
		 * release just below drops one reference on the old page
		 * instead of freeing the copy.
		 */
		new_page = old_page;
		ret |= VM_FAULT_WRITE;
	} else
		mem_cgroup_uncharge_page(new_page);

	if (new_page)
		page_cache_release(new_page);
unlock:
	pte_unmap_unlock(page_table, ptl);
	/* Only true when invalidate_range_start was issued above. */
	if (mmun_end > mmun_start)
		mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
	if (old_page) {
		/*
		 * If the page was replaced in a VM_LOCKED vma, munlock the
		 * old page now that this address no longer maps it.
		 */
		if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
			lock_page(old_page);	/* LRU manipulation */
			munlock_vma_page(old_page);
			unlock_page(old_page);
		}
		page_cache_release(old_page);
	}
	return ret;
oom_free_new:
	page_cache_release(new_page);
oom:
	if (old_page) {
		if (page_mkwrite) {
			unlock_page(old_page);
			page_cache_release(old_page);
		}
		page_cache_release(old_page);
	}
	return VM_FAULT_OOM;

unwritable_page:
	/* Drop the pin taken before calling ->page_mkwrite(). */
	page_cache_release(old_page);
	return ret;
}
2815
/*
 * Zap the ptes of @vma in [start_addr, end_addr), converting the end
 * address into the length zap_page_range_single() expects.
 */
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
		unsigned long start_addr, unsigned long end_addr,
		struct zap_details *details)
{
	unsigned long len = end_addr - start_addr;

	zap_page_range_single(vma, start_addr, len, details);
}
2822
2823static inline void unmap_mapping_range_tree(struct rb_root *root,
2824 struct zap_details *details)
2825{
2826 struct vm_area_struct *vma;
2827 pgoff_t vba, vea, zba, zea;
2828
2829 vma_interval_tree_foreach(vma, root,
2830 details->first_index, details->last_index) {
2831
2832 vba = vma->vm_pgoff;
2833 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
2834
2835 zba = details->first_index;
2836 if (zba < vba)
2837 zba = vba;
2838 zea = details->last_index;
2839 if (zea > vea)
2840 zea = vea;
2841
2842 unmap_mapping_range_vma(vma,
2843 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
2844 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
2845 details);
2846 }
2847}
2848
2849static inline void unmap_mapping_range_list(struct list_head *head,
2850 struct zap_details *details)
2851{
2852 struct vm_area_struct *vma;
2853
2854
2855
2856
2857
2858
2859
2860 list_for_each_entry(vma, head, shared.nonlinear) {
2861 details->nonlinear_vma = vma;
2862 unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
2863 }
2864}
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880void unmap_mapping_range(struct address_space *mapping,
2881 loff_t const holebegin, loff_t const holelen, int even_cows)
2882{
2883 struct zap_details details;
2884 pgoff_t hba = holebegin >> PAGE_SHIFT;
2885 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2886
2887
2888 if (sizeof(holelen) > sizeof(hlen)) {
2889 long long holeend =
2890 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2891 if (holeend & ~(long long)ULONG_MAX)
2892 hlen = ULONG_MAX - hba + 1;
2893 }
2894
2895 details.check_mapping = even_cows? NULL: mapping;
2896 details.nonlinear_vma = NULL;
2897 details.first_index = hba;
2898 details.last_index = hba + hlen - 1;
2899 if (details.last_index < details.first_index)
2900 details.last_index = ULONG_MAX;
2901
2902
2903 mutex_lock(&mapping->i_mmap_mutex);
2904 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
2905 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2906 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
2907 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
2908 mutex_unlock(&mapping->i_mmap_mutex);
2909}
2910EXPORT_SYMBOL(unmap_mapping_range);
2911
2912
2913
2914
2915
2916
/*
 * do_swap_page - handle a fault on a pte that holds a swap entry
 * @mm: address space of the fault
 * @vma: area containing @address
 * @address: faulting user address
 * @page_table: mapped (but not locked) pte for @address
 * @pmd: pmd covering @address
 * @flags: FAULT_FLAG_* flags of the fault
 * @orig_pte: pte value the caller observed
 *
 * Brings the page for the swap entry in @orig_pte back in (from the swap
 * cache or via swapin_readahead()), maps it at @address and frees the swap
 * entry reference.  Migration and hwpoison entries are handled separately
 * at the top.
 *
 * The pte is unmapped on entry by pte_unmap_same(); the pte lock is taken
 * and released inside this function.
 *
 * Returns a VM_FAULT_* mask (0 for a plain minor fault).
 */
static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		unsigned int flags, pte_t orig_pte)
{
	spinlock_t *ptl;
	struct page *page, *swapcache = NULL;
	swp_entry_t entry;
	pte_t pte;
	int locked;
	struct mem_cgroup *ptr;
	int exclusive = 0;
	int ret = 0;

	/* Unmaps page_table; bail out if the pte changed under us. */
	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
		goto out;

	entry = pte_to_swp_entry(orig_pte);
	if (unlikely(non_swap_entry(entry))) {
		if (is_migration_entry(entry)) {
			/* Wait for the migration; the fault is then retried. */
			migration_entry_wait(mm, pmd, address);
		} else if (is_hwpoison_entry(entry)) {
			ret = VM_FAULT_HWPOISON;
		} else {
			/* Unknown non-swap entry: report it and fail. */
			print_bad_pte(vma, address, orig_pte, NULL);
			ret = VM_FAULT_SIGBUS;
		}
		goto out;
	}
	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
	page = lookup_swap_cache(entry);
	if (!page) {
		/* Not in the swap cache: read it in. */
		page = swapin_readahead(entry,
					GFP_HIGHUSER_MOVABLE, vma, address);
		if (!page) {
			/*
			 * No page.  Only report OOM if the pte is still the
			 * same; if it changed, someone else handled it.
			 */
			page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
			if (likely(pte_same(*page_table, orig_pte)))
				ret = VM_FAULT_OOM;
			delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
			goto unlock;
		}

		/* Had to read the page from swap area: Major fault */
		ret = VM_FAULT_MAJOR;
		count_vm_event(PGMAJFAULT);
		mem_cgroup_count_vm_event(mm, PGMAJFAULT);
	} else if (PageHWPoison(page)) {
		/*
		 * The cached page is hardware-poisoned.  It is not locked
		 * yet, so leave through out_release rather than out_page.
		 */
		ret = VM_FAULT_HWPOISON;
		delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
		goto out_release;
	}

	/* May fail to lock, in which case the fault has to be retried. */
	locked = lock_page_or_retry(page, mm, flags);

	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
	if (!locked) {
		ret |= VM_FAULT_RETRY;
		goto out_release;
	}

	/*
	 * With the page locked, make sure it is still the swap cache page
	 * for this very entry; otherwise give up on it.
	 */
	if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
		goto out_page;

	if (ksm_might_need_to_copy(page, vma, address)) {
		/*
		 * KSM may hand back a copy.  Remember the original (still
		 * locked) page in swapcache so it can be unlocked and
		 * released once the copy is mapped.
		 */
		swapcache = page;
		page = ksm_does_need_to_copy(page, vma, address);

		if (unlikely(!page)) {
			/* Copy failed: restore page so the unwind unlocks it once. */
			ret = VM_FAULT_OOM;
			page = swapcache;
			swapcache = NULL;
			goto out_page;
		}
	}

	if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
		ret = VM_FAULT_OOM;
		goto out_page;
	}

	/* Take the pte lock and re-validate the pte. */
	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (unlikely(!pte_same(*page_table, orig_pte)))
		goto out_nomap;

	if (unlikely(!PageUptodate(page))) {
		/* The page contents are not valid (e.g. the read failed). */
		ret = VM_FAULT_SIGBUS;
		goto out_nomap;
	}

	/*
	 * Point of no return: account the page as anonymous instead of
	 * swapped, build the pte and install it.
	 */
	inc_mm_counter_fast(mm, MM_ANONPAGES);
	dec_mm_counter_fast(mm, MM_SWAPENTS);
	pte = mk_pte(page, vma->vm_page_prot);
	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
		/*
		 * Write fault and the page can be reused: map it writable
		 * right away and clear FAULT_FLAG_WRITE so do_wp_page() is
		 * skipped below.
		 */
		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
		flags &= ~FAULT_FLAG_WRITE;
		ret |= VM_FAULT_WRITE;
		exclusive = 1;
	}
	flush_icache_page(vma, page);
	set_pte_at(mm, address, page_table, pte);
	do_page_add_anon_rmap(page, vma, address, exclusive);
	/* It's better to call commit-charge after rmap is established */
	mem_cgroup_commit_charge_swapin(page, ptr);

	/* Drop this pte's reference on the swap entry. */
	swap_free(entry);
	/* Try to free the swap slot when swap is tight or the page is mlocked. */
	if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
		try_to_free_swap(page);
	unlock_page(page);
	if (swapcache) {
		/*
		 * A KSM copy was mapped; the original swap cache page was
		 * kept locked until now and can be let go.
		 */
		unlock_page(swapcache);
		page_cache_release(swapcache);
	}

	if (flags & FAULT_FLAG_WRITE) {
		/*
		 * Still a write fault (the page could not be reused above):
		 * let do_wp_page() break COW.  It releases the pte lock.
		 */
		ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
		if (ret & VM_FAULT_ERROR)
			ret &= VM_FAULT_ERROR;
		goto out;
	}

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, address, page_table);
unlock:
	pte_unmap_unlock(page_table, ptl);
out:
	return ret;
out_nomap:
	/* Undo the charge attempt, then unwind lock, page lock and refs. */
	mem_cgroup_cancel_charge_swapin(ptr);
	pte_unmap_unlock(page_table, ptl);
out_page:
	unlock_page(page);
out_release:
	page_cache_release(page);
	if (swapcache) {
		unlock_page(swapcache);
		page_cache_release(swapcache);
	}
	return ret;
}
3094
3095
3096
3097
3098
3099
3100static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
3101{
3102 address &= PAGE_MASK;
3103 if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) {
3104 struct vm_area_struct *prev = vma->vm_prev;
3105
3106
3107
3108
3109
3110
3111
3112 if (prev && prev->vm_end == address)
3113 return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
3114
3115 expand_downwards(vma, address - PAGE_SIZE);
3116 }
3117 if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
3118 struct vm_area_struct *next = vma->vm_next;
3119
3120
3121 if (next && next->vm_start == address + PAGE_SIZE)
3122 return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
3123
3124 expand_upwards(vma, address + PAGE_SIZE);
3125 }
3126 return 0;
3127}
3128
3129
3130
3131
3132
3133
/*
 * Handle a fault on an address whose pte is empty and that is served as
 * anonymous memory.
 *
 * The caller passes in a mapped but unlocked @page_table; it is unmapped
 * first and re-mapped under the pte lock once the new entry is ready.
 *
 * Returns 0 on success (including the case where another pte was installed
 * in the meantime), VM_FAULT_SIGBUS when check_stack_guard_page() reports a
 * failure, and VM_FAULT_OOM when preparing the anon_vma, allocating the
 * page or charging it to the memory cgroup fails.
 */
static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		unsigned int flags)
{
	struct page *page;
	spinlock_t *ptl;
	pte_t entry;

	pte_unmap(page_table);

	/* Stack boundary handling; a negative result aborts the fault. */
	if (check_stack_guard_page(vma, address) < 0)
		return VM_FAULT_SIGBUS;

	/* Read fault: map the zero pfn as a special pte, no page allocated. */
	if (!(flags & FAULT_FLAG_WRITE)) {
		entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
						vma->vm_page_prot));
		page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
		/* Somebody else populated the pte meanwhile: nothing to do. */
		if (!pte_none(*page_table))
			goto unlock;
		goto setpte;
	}

	/* Write fault: allocate a zeroed page and charge it before locking. */
	if (unlikely(anon_vma_prepare(vma)))
		goto oom;
	page = alloc_zeroed_user_highpage_movable(vma, address);
	if (!page)
		goto oom;
	__SetPageUptodate(page);

	if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
		goto oom_free_page;

	entry = mk_pte(page, vma->vm_page_prot);
	if (vma->vm_flags & VM_WRITE)
		entry = pte_mkwrite(pte_mkdirty(entry));

	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
	/* Lost the race: undo the charge and drop the fresh page. */
	if (!pte_none(*page_table))
		goto release;

	inc_mm_counter_fast(mm, MM_ANONPAGES);
	page_add_new_anon_rmap(page, vma, address);
setpte:
	set_pte_at(mm, address, page_table, entry);

	update_mmu_cache(vma, address, page_table);
unlock:
	pte_unmap_unlock(page_table, ptl);
	return 0;
release:
	mem_cgroup_uncharge_page(page);
	page_cache_release(page);
	goto unlock;
oom_free_page:
	page_cache_release(page);
oom:
	return VM_FAULT_OOM;
}
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
/*
 * Common worker for faults that are resolved through vma->vm_ops->fault().
 *
 * @pgoff is the file offset (in pages) to hand to the ->fault handler and
 * @orig_pte is the pte value the caller observed; the new pte is only
 * installed if the pte still equals @orig_pte once the pte lock is held.
 *
 * For a write fault on a non-shared vma a private copy (cow_page) is
 * allocated and charged up front, and the page returned by ->fault is
 * copied into it.  For a write fault on a shared vma the optional
 * ->page_mkwrite hook is called before the page is mapped writable.
 *
 * Returns the VM_FAULT_* bits from ->fault (or from ->page_mkwrite when it
 * fails), VM_FAULT_OOM when the up-front allocation/charge fails, or
 * VM_FAULT_HWPOISON when the returned page is hardware-poisoned.
 */
static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmd,
		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
	pte_t *page_table;
	spinlock_t *ptl;
	struct page *page;
	struct page *cow_page;
	pte_t entry;
	int anon = 0;
	struct page *dirty_page = NULL;
	struct vm_fault vmf;
	int ret;
	int page_mkwrite = 0;

	/*
	 * Private write fault: get the destination page for the copy ready
	 * before calling into ->fault.
	 */
	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {

		if (unlikely(anon_vma_prepare(vma)))
			return VM_FAULT_OOM;

		cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
		if (!cow_page)
			return VM_FAULT_OOM;

		if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) {
			page_cache_release(cow_page);
			return VM_FAULT_OOM;
		}
	} else
		cow_page = NULL;

	vmf.virtual_address = (void __user *)(address & PAGE_MASK);
	vmf.pgoff = pgoff;
	vmf.flags = flags;
	vmf.page = NULL;

	ret = vma->vm_ops->fault(vma, &vmf);
	/* Error, no page to map, or retry requested: just undo cow_page. */
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
			    VM_FAULT_RETRY)))
		goto uncharge_out;

	if (unlikely(PageHWPoison(vmf.page))) {
		if (ret & VM_FAULT_LOCKED)
			unlock_page(vmf.page);
		ret = VM_FAULT_HWPOISON;
		goto uncharge_out;
	}

	/*
	 * From here on vmf.page must be locked: take the lock ourselves if
	 * ->fault did not return it locked.
	 */
	if (unlikely(!(ret & VM_FAULT_LOCKED)))
		lock_page(vmf.page);
	else
		VM_BUG_ON(!PageLocked(vmf.page));

	/* 'page' is the page that will actually be mapped. */
	page = vmf.page;
	if (flags & FAULT_FLAG_WRITE) {
		if (!(vma->vm_flags & VM_SHARED)) {
			/* Private mapping: map the copy, not the file page. */
			page = cow_page;
			anon = 1;
			copy_user_highpage(page, vmf.page, address, vma);
			__SetPageUptodate(page);
		} else {
			/*
			 * Shared mapping: give the owner of the page a chance
			 * to react before it becomes writable.
			 */
			if (vma->vm_ops->page_mkwrite) {
				int tmp;

				/* The hook is called with the page unlocked. */
				unlock_page(page);
				vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
				tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
				if (unlikely(tmp &
					  (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
					ret = tmp;
					goto unwritable_page;
				}
				if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
					lock_page(page);
					/*
					 * The page lost its mapping while it
					 * was unlocked: give up with ret 0.
					 */
					if (!page->mapping) {
						ret = 0;
						unlock_page(page);
						goto unwritable_page;
					}
				} else
					VM_BUG_ON(!PageLocked(page));
				page_mkwrite = 1;
			}
		}

	}

	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);

	/* Only install the pte if nobody changed it since the caller looked. */
	if (likely(pte_same(*page_table, orig_pte))) {
		flush_icache_page(vma, page);
		entry = mk_pte(page, vma->vm_page_prot);
		if (flags & FAULT_FLAG_WRITE)
			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		if (anon) {
			inc_mm_counter_fast(mm, MM_ANONPAGES);
			page_add_new_anon_rmap(page, vma, address);
		} else {
			inc_mm_counter_fast(mm, MM_FILEPAGES);
			page_add_file_rmap(page);
			if (flags & FAULT_FLAG_WRITE) {
				/* Extra reference kept until it is dirtied below. */
				dirty_page = page;
				get_page(dirty_page);
			}
		}
		set_pte_at(mm, address, page_table, entry);

		update_mmu_cache(vma, address, page_table);
	} else {
		/* The pte changed under us: drop everything we prepared. */
		if (cow_page)
			mem_cgroup_uncharge_page(cow_page);
		if (anon)
			page_cache_release(page);
		else
			anon = 1; /* makes the code below release vmf.page */
	}

	pte_unmap_unlock(page_table, ptl);

	if (dirty_page) {
		struct address_space *mapping = page->mapping;
		int dirtied = 0;

		if (set_page_dirty(dirty_page))
			dirtied = 1;
		unlock_page(dirty_page);
		put_page(dirty_page);
		if ((dirtied || page_mkwrite) && mapping) {
			balance_dirty_pages_ratelimited(mapping);
		}

		/* Only update the file time when no ->page_mkwrite hook ran. */
		if (vma->vm_file && !page_mkwrite)
			file_update_time(vma->vm_file);
	} else {
		unlock_page(vmf.page);
		if (anon)
			page_cache_release(vmf.page);
	}

	return ret;

unwritable_page:
	page_cache_release(page);
	return ret;
uncharge_out:
	/* Undo the up-front allocation and charge of the private copy. */
	if (cow_page) {
		mem_cgroup_uncharge_page(cow_page);
		page_cache_release(cow_page);
	}
	return ret;
}
3395
3396static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3397 unsigned long address, pte_t *page_table, pmd_t *pmd,
3398 unsigned int flags, pte_t orig_pte)
3399{
3400 pgoff_t pgoff = (((address & PAGE_MASK)
3401 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
3402
3403 pte_unmap(page_table);
3404 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3405}
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3417 unsigned long address, pte_t *page_table, pmd_t *pmd,
3418 unsigned int flags, pte_t orig_pte)
3419{
3420 pgoff_t pgoff;
3421
3422 flags |= FAULT_FLAG_NONLINEAR;
3423
3424 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
3425 return 0;
3426
3427 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
3428
3429
3430
3431 print_bad_pte(vma, address, orig_pte, NULL);
3432 return VM_FAULT_SIGBUS;
3433 }
3434
3435 pgoff = pte_to_pgoff(orig_pte);
3436 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3437}
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452int handle_pte_fault(struct mm_struct *mm,
3453 struct vm_area_struct *vma, unsigned long address,
3454 pte_t *pte, pmd_t *pmd, unsigned int flags)
3455{
3456 pte_t entry;
3457 spinlock_t *ptl;
3458
3459 entry = *pte;
3460 if (!pte_present(entry)) {
3461 if (pte_none(entry)) {
3462 if (vma->vm_ops) {
3463 if (likely(vma->vm_ops->fault))
3464 return do_linear_fault(mm, vma, address,
3465 pte, pmd, flags, entry);
3466 }
3467 return do_anonymous_page(mm, vma, address,
3468 pte, pmd, flags);
3469 }
3470 if (pte_file(entry))
3471 return do_nonlinear_fault(mm, vma, address,
3472 pte, pmd, flags, entry);
3473 return do_swap_page(mm, vma, address,
3474 pte, pmd, flags, entry);
3475 }
3476
3477 ptl = pte_lockptr(mm, pmd);
3478 spin_lock(ptl);
3479 if (unlikely(!pte_same(*pte, entry)))
3480 goto unlock;
3481 if (flags & FAULT_FLAG_WRITE) {
3482 if (!pte_write(entry))
3483 return do_wp_page(mm, vma, address,
3484 pte, pmd, ptl, entry);
3485 entry = pte_mkdirty(entry);
3486 }
3487 entry = pte_mkyoung(entry);
3488 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
3489 update_mmu_cache(vma, address, pte);
3490 } else {
3491
3492
3493
3494
3495
3496
3497 if (flags & FAULT_FLAG_WRITE)
3498 flush_tlb_fix_spurious_fault(vma, address);
3499 }
3500unlock:
3501 pte_unmap_unlock(pte, ptl);
3502 return 0;
3503}
3504
3505
3506
3507
/*
 * Entry point for resolving a fault at @address in @vma of @mm.
 *
 * Accounts the fault, hands hugetlb vmas to hugetlb_fault(), allocates the
 * upper page-table levels as needed, deals with huge pmds, and finally
 * passes the mapped pte to handle_pte_fault().
 *
 * Returns a VM_FAULT_* value; VM_FAULT_OOM when a page-table level cannot
 * be allocated.
 */
int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, unsigned int flags)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	__set_current_state(TASK_RUNNING);

	/* Fault accounting, globally and per memory cgroup. */
	count_vm_event(PGFAULT);
	mem_cgroup_count_vm_event(mm, PGFAULT);

	check_sync_rss_stat(current);

	if (unlikely(is_vm_hugetlb_page(vma)))
		return hugetlb_fault(mm, vma, address, flags);

retry:
	pgd = pgd_offset(mm, address);
	pud = pud_alloc(mm, pgd, address);
	if (!pud)
		return VM_FAULT_OOM;
	pmd = pmd_alloc(mm, pud, address);
	if (!pmd)
		return VM_FAULT_OOM;
	if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
		/* Empty pmd in a vma without vm_ops: try a huge anonymous page. */
		if (!vma->vm_ops)
			return do_huge_pmd_anonymous_page(mm, vma, address,
							  pmd, flags);
	} else {
		/* Work on a snapshot of the pmd taken before the checks below. */
		pmd_t orig_pmd = *pmd;
		int ret;

		barrier();
		if (pmd_trans_huge(orig_pmd)) {
			if (flags & FAULT_FLAG_WRITE &&
			    !pmd_write(orig_pmd) &&
			    !pmd_trans_splitting(orig_pmd)) {
				ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
							  orig_pmd);
				/*
				 * On VM_FAULT_OOM the whole walk is redone from
				 * the pgd instead of failing the fault.
				 */
				if (unlikely(ret & VM_FAULT_OOM))
					goto retry;
				return ret;
			}
			return 0;
		}
	}

	/* Make sure a pte page exists below this pmd. */
	if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address))
		return VM_FAULT_OOM;
	/* The pmd became huge in the meantime: nothing left to do here. */
	if (unlikely(pmd_trans_huge(*pmd)))
		return 0;
	/* The pte is mapped but not locked; handle_pte_fault() takes it over. */
	pte = pte_offset_map(pmd, address);

	return handle_pte_fault(mm, vma, address, pte, pmd, flags);
}
3583
3584#ifndef __PAGETABLE_PUD_FOLDED
3585
3586
3587
3588
3589int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
3590{
3591 pud_t *new = pud_alloc_one(mm, address);
3592 if (!new)
3593 return -ENOMEM;
3594
3595 smp_wmb();
3596
3597 spin_lock(&mm->page_table_lock);
3598 if (pgd_present(*pgd))
3599 pud_free(mm, new);
3600 else
3601 pgd_populate(mm, pgd, new);
3602 spin_unlock(&mm->page_table_lock);
3603 return 0;
3604}
3605#endif
3606
3607#ifndef __PAGETABLE_PMD_FOLDED
3608
3609
3610
3611
3612int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
3613{
3614 pmd_t *new = pmd_alloc_one(mm, address);
3615 if (!new)
3616 return -ENOMEM;
3617
3618 smp_wmb();
3619
3620 spin_lock(&mm->page_table_lock);
3621#ifndef __ARCH_HAS_4LEVEL_HACK
3622 if (pud_present(*pud))
3623 pmd_free(mm, new);
3624 else
3625 pud_populate(mm, pud, new);
3626#else
3627 if (pgd_present(*pud))
3628 pmd_free(mm, new);
3629 else
3630 pgd_populate(mm, pud, new);
3631#endif
3632 spin_unlock(&mm->page_table_lock);
3633 return 0;
3634}
3635#endif
3636
3637int make_pages_present(unsigned long addr, unsigned long end)
3638{
3639 int ret, len, write;
3640 struct vm_area_struct * vma;
3641
3642 vma = find_vma(current->mm, addr);
3643 if (!vma)
3644 return -ENOMEM;
3645
3646
3647
3648
3649
3650 write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
3651 BUG_ON(addr >= end);
3652 BUG_ON(end > vma->vm_end);
3653 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
3654 ret = get_user_pages(current, current->mm, addr,
3655 len, write, 0, NULL, NULL);
3656 if (ret < 0)
3657 return ret;
3658 return ret == len ? 0 : -EFAULT;
3659}
3660
3661#if !defined(__HAVE_ARCH_GATE_AREA)
3662
3663#if defined(AT_SYSINFO_EHDR)
3664static struct vm_area_struct gate_vma;
3665
3666static int __init gate_vma_init(void)
3667{
3668 gate_vma.vm_mm = NULL;
3669 gate_vma.vm_start = FIXADDR_USER_START;
3670 gate_vma.vm_end = FIXADDR_USER_END;
3671 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
3672 gate_vma.vm_page_prot = __P101;
3673
3674 return 0;
3675}
3676__initcall(gate_vma_init);
3677#endif
3678
/*
 * Return the gate vma when AT_SYSINFO_EHDR is defined, NULL otherwise.
 * @mm is not used by this generic implementation.
 */
struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
	struct vm_area_struct *gate = NULL;

#ifdef AT_SYSINFO_EHDR
	gate = &gate_vma;
#endif
	return gate;
}
3687
/*
 * Return 1 if @addr lies in [FIXADDR_USER_START, FIXADDR_USER_END) and
 * AT_SYSINFO_EHDR is defined; return 0 in every other case.
 */
int in_gate_area_no_mm(unsigned long addr)
{
	int inside = 0;

#ifdef AT_SYSINFO_EHDR
	if (addr >= FIXADDR_USER_START && addr < FIXADDR_USER_END)
		inside = 1;
#endif
	return inside;
}
3696
3697#endif
3698
3699static int __follow_pte(struct mm_struct *mm, unsigned long address,
3700 pte_t **ptepp, spinlock_t **ptlp)
3701{
3702 pgd_t *pgd;
3703 pud_t *pud;
3704 pmd_t *pmd;
3705 pte_t *ptep;
3706
3707 pgd = pgd_offset(mm, address);
3708 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
3709 goto out;
3710
3711 pud = pud_offset(pgd, address);
3712 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
3713 goto out;
3714
3715 pmd = pmd_offset(pud, address);
3716 VM_BUG_ON(pmd_trans_huge(*pmd));
3717 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
3718 goto out;
3719
3720
3721 if (pmd_huge(*pmd))
3722 goto out;
3723
3724 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
3725 if (!ptep)
3726 goto out;
3727 if (!pte_present(*ptep))
3728 goto unlock;
3729 *ptepp = ptep;
3730 return 0;
3731unlock:
3732 pte_unmap_unlock(ptep, *ptlp);
3733out:
3734 return -EINVAL;
3735}
3736
3737static inline int follow_pte(struct mm_struct *mm, unsigned long address,
3738 pte_t **ptepp, spinlock_t **ptlp)
3739{
3740 int res;
3741
3742
3743 (void) __cond_lock(*ptlp,
3744 !(res = __follow_pte(mm, address, ptepp, ptlp)));
3745 return res;
3746}
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758int follow_pfn(struct vm_area_struct *vma, unsigned long address,
3759 unsigned long *pfn)
3760{
3761 int ret = -EINVAL;
3762 spinlock_t *ptl;
3763 pte_t *ptep;
3764
3765 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3766 return ret;
3767
3768 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
3769 if (ret)
3770 return ret;
3771 *pfn = pte_pfn(*ptep);
3772 pte_unmap_unlock(ptep, ptl);
3773 return 0;
3774}
3775EXPORT_SYMBOL(follow_pfn);
3776
3777#ifdef CONFIG_HAVE_IOREMAP_PROT
3778int follow_phys(struct vm_area_struct *vma,
3779 unsigned long address, unsigned int flags,
3780 unsigned long *prot, resource_size_t *phys)
3781{
3782 int ret = -EINVAL;
3783 pte_t *ptep, pte;
3784 spinlock_t *ptl;
3785
3786 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3787 goto out;
3788
3789 if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
3790 goto out;
3791 pte = *ptep;
3792
3793 if ((flags & FOLL_WRITE) && !pte_write(pte))
3794 goto unlock;
3795
3796 *prot = pgprot_val(pte_pgprot(pte));
3797 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
3798
3799 ret = 0;
3800unlock:
3801 pte_unmap_unlock(ptep, ptl);
3802out:
3803 return ret;
3804}
3805
3806int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
3807 void *buf, int len, int write)
3808{
3809 resource_size_t phys_addr;
3810 unsigned long prot = 0;
3811 void __iomem *maddr;
3812 int offset = addr & (PAGE_SIZE-1);
3813
3814 if (follow_phys(vma, addr, write, &prot, &phys_addr))
3815 return -EINVAL;
3816
3817 maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
3818 if (write)
3819 memcpy_toio(maddr + offset, buf, len);
3820 else
3821 memcpy_fromio(buf, maddr + offset, len);
3822 iounmap(maddr);
3823
3824 return len;
3825}
3826#endif
3827
3828
3829
3830
3831
/*
 * Copy up to @len bytes between @buf and the address space @mm, starting
 * at user address @addr.  @write non-zero copies from @buf into @mm, zero
 * copies from @mm into @buf.  @tsk is passed through to get_user_pages().
 *
 * The copy proceeds page by page with mmap_sem held for reading and stops
 * at the first address that cannot be accessed.
 *
 * Returns the number of bytes actually copied (possibly 0).
 */
static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
		unsigned long addr, void *buf, int len, int write)
{
	struct vm_area_struct *vma;
	void *old_buf = buf;

	down_read(&mm->mmap_sem);
	/* One iteration per page (or per ->access chunk). */
	while (len) {
		int bytes, ret, offset;
		void *maddr;
		struct page *page = NULL;

		ret = get_user_pages(tsk, mm, addr, 1,
				write, 1, &page, &vma);
		if (ret <= 0) {
			/*
			 * No page could be obtained.  With
			 * CONFIG_HAVE_IOREMAP_PROT fall back to the vma's
			 * ->access method if it has one; without it the loop
			 * simply ends here.
			 */
#ifdef CONFIG_HAVE_IOREMAP_PROT
			vma = find_vma(mm, addr);
			if (!vma || vma->vm_start > addr)
				break;
			if (vma->vm_ops && vma->vm_ops->access)
				ret = vma->vm_ops->access(vma, addr, buf,
							  len, write);
			if (ret <= 0)
#endif
				break;
			/* ->access reported how many bytes it transferred. */
			bytes = ret;
		} else {
			/* Clamp this chunk to the end of the current page. */
			bytes = len;
			offset = addr & (PAGE_SIZE-1);
			if (bytes > PAGE_SIZE-offset)
				bytes = PAGE_SIZE-offset;

			maddr = kmap(page);
			if (write) {
				copy_to_user_page(vma, page, addr,
						  maddr + offset, buf, bytes);
				set_page_dirty_lock(page);
			} else {
				copy_from_user_page(vma, page, addr,
						    buf, maddr + offset, bytes);
			}
			kunmap(page);
			/* Drop the reference taken by get_user_pages(). */
			page_cache_release(page);
		}
		len -= bytes;
		buf += bytes;
		addr += bytes;
	}
	up_read(&mm->mmap_sem);

	/* buf has advanced by exactly the number of bytes copied. */
	return buf - old_buf;
}
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900int access_remote_vm(struct mm_struct *mm, unsigned long addr,
3901 void *buf, int len, int write)
3902{
3903 return __access_remote_vm(NULL, mm, addr, buf, len, write);
3904}
3905
3906
3907
3908
3909
3910
/*
 * Access the address space of task @tsk.  Takes a reference on the task's
 * mm for the duration of the copy.  Returns the number of bytes copied,
 * or 0 if the task has no mm.
 */
int access_process_vm(struct task_struct *tsk, unsigned long addr,
		void *buf, int len, int write)
{
	struct mm_struct *mm = get_task_mm(tsk);
	int copied = 0;

	if (mm) {
		copied = __access_remote_vm(tsk, mm, addr, buf, len, write);
		mmput(mm);
	}

	return copied;
}
3926
3927
3928
3929
3930void print_vma_addr(char *prefix, unsigned long ip)
3931{
3932 struct mm_struct *mm = current->mm;
3933 struct vm_area_struct *vma;
3934
3935
3936
3937
3938
3939 if (preempt_count())
3940 return;
3941
3942 down_read(&mm->mmap_sem);
3943 vma = find_vma(mm, ip);
3944 if (vma && vma->vm_file) {
3945 struct file *f = vma->vm_file;
3946 char *buf = (char *)__get_free_page(GFP_KERNEL);
3947 if (buf) {
3948 char *p, *s;
3949
3950 p = d_path(&f->f_path, buf, PAGE_SIZE);
3951 if (IS_ERR(p))
3952 p = "?";
3953 s = strrchr(p, '/');
3954 if (s)
3955 p = s+1;
3956 printk("%s%s[%lx+%lx]", prefix, p,
3957 vma->vm_start,
3958 vma->vm_end - vma->vm_start);
3959 free_page((unsigned long)buf);
3960 }
3961 }
3962 up_read(&mm->mmap_sem);
3963}
3964
3965#ifdef CONFIG_PROVE_LOCKING
3966void might_fault(void)
3967{
3968
3969
3970
3971
3972
3973
3974 if (segment_eq(get_fs(), KERNEL_DS))
3975 return;
3976
3977 might_sleep();
3978
3979
3980
3981
3982
3983 if (!in_atomic() && current->mm)
3984 might_lock_read(¤t->mm->mmap_sem);
3985}
3986EXPORT_SYMBOL(might_fault);
3987#endif
3988
3989#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
3990static void clear_gigantic_page(struct page *page,
3991 unsigned long addr,
3992 unsigned int pages_per_huge_page)
3993{
3994 int i;
3995 struct page *p = page;
3996
3997 might_sleep();
3998 for (i = 0; i < pages_per_huge_page;
3999 i++, p = mem_map_next(p, page, i)) {
4000 cond_resched();
4001 clear_user_highpage(p, addr + i * PAGE_SIZE);
4002 }
4003}
4004void clear_huge_page(struct page *page,
4005 unsigned long addr, unsigned int pages_per_huge_page)
4006{
4007 int i;
4008
4009 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
4010 clear_gigantic_page(page, addr, pages_per_huge_page);
4011 return;
4012 }
4013
4014 might_sleep();
4015 for (i = 0; i < pages_per_huge_page; i++) {
4016 cond_resched();
4017 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
4018 }
4019}
4020
4021static void copy_user_gigantic_page(struct page *dst, struct page *src,
4022 unsigned long addr,
4023 struct vm_area_struct *vma,
4024 unsigned int pages_per_huge_page)
4025{
4026 int i;
4027 struct page *dst_base = dst;
4028 struct page *src_base = src;
4029
4030 for (i = 0; i < pages_per_huge_page; ) {
4031 cond_resched();
4032 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
4033
4034 i++;
4035 dst = mem_map_next(dst, dst_base, i);
4036 src = mem_map_next(src, src_base, i);
4037 }
4038}
4039
4040void copy_user_huge_page(struct page *dst, struct page *src,
4041 unsigned long addr, struct vm_area_struct *vma,
4042 unsigned int pages_per_huge_page)
4043{
4044 int i;
4045
4046 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
4047 copy_user_gigantic_page(dst, src, addr, vma,
4048 pages_per_huge_page);
4049 return;
4050 }
4051
4052 might_sleep();
4053 for (i = 0; i < pages_per_huge_page; i++) {
4054 cond_resched();
4055 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
4056 }
4057}
4058#endif
4059