1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41#include <linux/kernel_stat.h>
42#include <linux/mm.h>
43#include <linux/hugetlb.h>
44#include <linux/mman.h>
45#include <linux/swap.h>
46#include <linux/highmem.h>
47#include <linux/pagemap.h>
48#include <linux/ksm.h>
49#include <linux/rmap.h>
50#include <linux/export.h>
51#include <linux/delayacct.h>
52#include <linux/init.h>
53#include <linux/writeback.h>
54#include <linux/memcontrol.h>
55#include <linux/mmu_notifier.h>
56#include <linux/kallsyms.h>
57#include <linux/swapops.h>
58#include <linux/elf.h>
59#include <linux/gfp.h>
60
61#include <asm/io.h>
62#include <asm/pgalloc.h>
63#include <asm/uaccess.h>
64#include <asm/tlb.h>
65#include <asm/tlbflush.h>
66#include <asm/pgtable.h>
67
68#include "internal.h"
69
70#ifndef CONFIG_NEED_MULTIPLE_NODES
71
/*
 * Flat page-map globals. They are only defined here when
 * CONFIG_NEED_MULTIPLE_NODES is not set, and both are exported to modules.
 */
unsigned long max_mapnr;
struct page *mem_map;

EXPORT_SYMBOL(max_mapnr);
EXPORT_SYMBOL(mem_map);
77#endif
78
/*
 * num_physpages and high_memory are defined here and exported to modules.
 * NOTE(review): nothing in this part of the file assigns them; presumably
 * architecture setup code does -- confirm before relying on their values.
 */
unsigned long num_physpages;
void * high_memory;

EXPORT_SYMBOL(num_physpages);
EXPORT_SYMBOL(high_memory);
91
92
93
94
95
96
97
/*
 * Address-space randomization setting. Defaults to 1 when CONFIG_COMPAT_BRK
 * is enabled and to 2 otherwise; the "norandmaps" boot parameter handler in
 * this file forces it to 0.
 * NOTE(review): the meaning of each level is not visible here -- confirm
 * against the code that consumes this variable.
 */
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
					1;
#else
					2;
#endif
104
105static int __init disable_randmaps(char *s)
106{
107 randomize_va_space = 0;
108 return 1;
109}
110__setup("norandmaps", disable_randmaps);
111
/* Page frame number of ZERO_PAGE(0); filled in by init_zero_pfn(). */
unsigned long zero_pfn __read_mostly;
/*
 * vm_normal_page() reports any pfn above this value as a bad pte.
 * NOTE(review): it is not assigned anywhere in this part of the file.
 */
unsigned long highest_memmap_pfn __read_mostly;
114
115
116
117
118static int __init init_zero_pfn(void)
119{
120 zero_pfn = page_to_pfn(ZERO_PAGE(0));
121 return 0;
122}
123core_initcall(init_zero_pfn);
124
125
126#if defined(SPLIT_RSS_COUNTING)
127
128void sync_mm_rss(struct mm_struct *mm)
129{
130 int i;
131
132 for (i = 0; i < NR_MM_COUNTERS; i++) {
133 if (current->rss_stat.count[i]) {
134 add_mm_counter(mm, i, current->rss_stat.count[i]);
135 current->rss_stat.count[i] = 0;
136 }
137 }
138 current->rss_stat.events = 0;
139}
140
141static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
142{
143 struct task_struct *task = current;
144
145 if (likely(task->mm == mm))
146 task->rss_stat.count[member] += val;
147 else
148 add_mm_counter(mm, member, val);
149}
150#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
151#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
152
153
154#define TASK_RSS_EVENTS_THRESH (64)
155static void check_sync_rss_stat(struct task_struct *task)
156{
157 if (unlikely(task != current))
158 return;
159 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
160 sync_mm_rss(task->mm);
161}
162#else
163
/* Without SPLIT_RSS_COUNTING the "fast" helpers are the plain counter operations. */
#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)

/* Intentionally empty in this configuration: the function does no work. */
static void check_sync_rss_stat(struct task_struct *task)
{
}
170
171#endif
172
173#ifdef HAVE_GENERIC_MMU_GATHER
174
175static int tlb_next_batch(struct mmu_gather *tlb)
176{
177 struct mmu_gather_batch *batch;
178
179 batch = tlb->active;
180 if (batch->next) {
181 tlb->active = batch->next;
182 return 1;
183 }
184
185 batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
186 if (!batch)
187 return 0;
188
189 batch->next = NULL;
190 batch->nr = 0;
191 batch->max = MAX_GATHER_BATCH;
192
193 tlb->active->next = batch;
194 tlb->active = batch;
195
196 return 1;
197}
198
199
200
201
202
203
204void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
205{
206 tlb->mm = mm;
207
208 tlb->fullmm = fullmm;
209 tlb->start = -1UL;
210 tlb->end = 0;
211 tlb->need_flush = 0;
212 tlb->fast_mode = (num_possible_cpus() == 1);
213 tlb->local.next = NULL;
214 tlb->local.nr = 0;
215 tlb->local.max = ARRAY_SIZE(tlb->__pages);
216 tlb->active = &tlb->local;
217
218#ifdef CONFIG_HAVE_RCU_TABLE_FREE
219 tlb->batch = NULL;
220#endif
221}
222
/*
 * Flush whatever the gather has accumulated: perform the TLB flush (plus the
 * page-table batch flush when CONFIG_HAVE_RCU_TABLE_FREE is set) and then
 * release every page queued in the batches. Does nothing when need_flush is
 * clear.
 */
void tlb_flush_mmu(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch;

	if (!tlb->need_flush)
		return;
	tlb->need_flush = 0;
	tlb_flush(tlb);
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
	tlb_table_flush(tlb);
#endif

	/* In fast mode __tlb_remove_page() frees pages directly; nothing is queued. */
	if (tlb_fast_mode(tlb))
		return;

	/* Queued pages are only released after the flush above has been issued. */
	for (batch = &tlb->local; batch; batch = batch->next) {
		free_pages_and_swap_cache(batch->pages, batch->nr);
		batch->nr = 0;
	}
	/* Emptied batches stay linked for reuse; filling restarts at the embedded one. */
	tlb->active = &tlb->local;
}
244
245
246
247
248
249void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
250{
251 struct mmu_gather_batch *batch, *next;
252
253 tlb->start = start;
254 tlb->end = end;
255 tlb_flush_mmu(tlb);
256
257
258 check_pgt_cache();
259
260 for (batch = tlb->local.next; batch; batch = next) {
261 next = batch->next;
262 free_pages((unsigned long)batch, 0);
263 }
264 tlb->local.next = NULL;
265}
266
267
268
269
270
271
272
/*
 * Hand @page to the gather so it is released by a later tlb_flush_mmu().
 *
 * The caller must already have set tlb->need_flush. In fast mode the page is
 * freed immediately and 1 is returned. Otherwise the page is stored in the
 * active batch and the return value is the number of free slots left in the
 * (possibly new) active batch; 0 means the batch is full and no further
 * batch could be obtained, so the caller has to flush before queueing more.
 */
int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
{
	struct mmu_gather_batch *batch;

	VM_BUG_ON(!tlb->need_flush);

	if (tlb_fast_mode(tlb)) {
		free_page_and_swap_cache(page);
		return 1; /* non-zero, so callers do not force a flush */
	}

	batch = tlb->active;
	batch->pages[batch->nr++] = page;
	if (batch->nr == batch->max) {
		/* Active batch just filled up: try to move on to the next one. */
		if (!tlb_next_batch(tlb))
			return 0;
		batch = tlb->active;
	}
	VM_BUG_ON(batch->nr > batch->max);

	return batch->max - batch->nr;
}
295
296#endif
297
298#ifdef CONFIG_HAVE_RCU_TABLE_FREE
299
300
301
302
303
/*
 * Deliberately empty callback used with smp_call_function() by
 * tlb_remove_table_one().
 * NOTE(review): presumably the cross-CPU call itself is the synchronisation
 * that is wanted -- confirm against the lockless page-table walkers.
 */
static void tlb_remove_table_smp_sync(void *arg)
{
	/* nothing to do */
}
308
/*
 * Free a single page table without batching. Used by tlb_remove_table()
 * when no batch page could be allocated.
 */
static void tlb_remove_table_one(void *table)
{
	/*
	 * Run the empty callback on the other CPUs and wait for it to complete
	 * (last argument 1) before the table is actually freed.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}
321
322static void tlb_remove_table_rcu(struct rcu_head *head)
323{
324 struct mmu_table_batch *batch;
325 int i;
326
327 batch = container_of(head, struct mmu_table_batch, rcu);
328
329 for (i = 0; i < batch->nr; i++)
330 __tlb_remove_table(batch->tables[i]);
331
332 free_page((unsigned long)batch);
333}
334
335void tlb_table_flush(struct mmu_gather *tlb)
336{
337 struct mmu_table_batch **batch = &tlb->batch;
338
339 if (*batch) {
340 call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
341 *batch = NULL;
342 }
343}
344
345void tlb_remove_table(struct mmu_gather *tlb, void *table)
346{
347 struct mmu_table_batch **batch = &tlb->batch;
348
349 tlb->need_flush = 1;
350
351
352
353
354
355 if (atomic_read(&tlb->mm->mm_users) < 2) {
356 __tlb_remove_table(table);
357 return;
358 }
359
360 if (*batch == NULL) {
361 *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
362 if (*batch == NULL) {
363 tlb_remove_table_one(table);
364 return;
365 }
366 (*batch)->nr = 0;
367 }
368 (*batch)->tables[(*batch)->nr++] = table;
369 if ((*batch)->nr == MAX_TABLE_BATCH)
370 tlb_table_flush(tlb);
371}
372
373#endif
374
375
376
377
378
379
380
381void pgd_clear_bad(pgd_t *pgd)
382{
383 pgd_ERROR(*pgd);
384 pgd_clear(pgd);
385}
386
387void pud_clear_bad(pud_t *pud)
388{
389 pud_ERROR(*pud);
390 pud_clear(pud);
391}
392
393void pmd_clear_bad(pmd_t *pmd)
394{
395 pmd_ERROR(*pmd);
396 pmd_clear(pmd);
397}
398
399
400
401
402
403static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
404 unsigned long addr)
405{
406 pgtable_t token = pmd_pgtable(*pmd);
407 pmd_clear(pmd);
408 pte_free_tlb(tlb, token, addr);
409 tlb->mm->nr_ptes--;
410}
411
/*
 * Free the pte pages mapped by the pmd entries covering [addr, end) under
 * @pud, and then the pmd page itself if the whole PUD-sized region around
 * it lies within [floor, ceiling).
 *
 * A @ceiling of 0 acts as "no upper limit": "ceiling - 1" then wraps to the
 * largest unsigned long, so the "end - 1 > ceiling - 1" test cannot trigger.
 */
static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		free_pte_range(tlb, pmd, addr);
	} while (pmd++, addr = next, addr != end);

	/* Keep the pmd page if its PUD-aligned region starts below the floor... */
	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		/* ...or if a non-zero ceiling rounds down to zero... */
		if (!ceiling)
			return;
	}
	/* ...or if the range ends beyond the (rounded) ceiling. */
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
}
444
/*
 * Free the lower-level tables reachable from the pud entries covering
 * [addr, end) under @pgd, and then the pud page itself if the whole
 * PGDIR-sized region around it lies within [floor, ceiling).
 *
 * A @ceiling of 0 acts as "no upper limit" (see the wrap-around of
 * "ceiling - 1" in the final range check).
 */
static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
	} while (pud++, addr = next, addr != end);

	/* Same keep-or-free decision as for pmd pages, at PGDIR granularity. */
	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud, start);
}
477
478
479
480
481
482
/*
 * Free the page tables covering [addr, end) in tlb->mm, without touching
 * tables that may also serve addresses below @floor or at/above @ceiling.
 *
 * The range is first shrunk to PMD granularity: @addr is rounded down and,
 * if that drops below @floor, moved up by one PMD; @end is pulled back by
 * one PMD when it lies beyond the rounded-down @ceiling. A @ceiling of 0
 * acts as "no upper limit". If nothing is left, the function returns
 * without freeing anything.
 */
void free_pgd_range(struct mmu_gather *tlb,
			unsigned long addr, unsigned long end,
			unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	addr &= PMD_MASK;
	if (addr < floor) {
		addr += PMD_SIZE;
		/* Wrapped around to 0: no usable range remains. */
		if (!addr)
			return;
	}
	if (ceiling) {
		ceiling &= PMD_MASK;
		/* A non-zero ceiling that rounds down to 0 leaves no usable range. */
		if (!ceiling)
			return;
	}
	/* With ceiling == 0, "ceiling - 1" wraps and this test never triggers. */
	if (end - 1 > ceiling - 1)
		end -= PMD_SIZE;
	if (addr > end - 1)
		return;

	pgd = pgd_offset(tlb->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		free_pud_range(tlb, pgd, addr, next, floor, ceiling);
	} while (pgd++, addr = next, addr != end);
}
540
/*
 * Walk the vma list starting at @vma, unlink each vma from its anon_vma and
 * file mappings, and free the page tables that covered it.
 *
 * @floor is passed through unchanged; the ceiling handed down for each
 * region is the start of the following vma, or @ceiling for the last one.
 */
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
		unsigned long floor, unsigned long ceiling)
{
	while (vma) {
		struct vm_area_struct *next = vma->vm_next;
		unsigned long addr = vma->vm_start;

		/* The vma is unlinked before its page tables are freed. */
		unlink_anon_vmas(vma);
		unlink_file_vma(vma);

		if (is_vm_hugetlb_page(vma)) {
			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
				floor, next? next->vm_start: ceiling);
		} else {
			/*
			 * Merge following non-hugetlb vmas that start within
			 * PMD_SIZE of the current end, so that one
			 * free_pgd_range() call covers the whole run.
			 */
			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
			       && !is_vm_hugetlb_page(next)) {
				vma = next;
				next = vma->vm_next;
				unlink_anon_vmas(vma);
				unlink_file_vma(vma);
			}
			free_pgd_range(tlb, addr, vma->vm_end,
				floor, next? next->vm_start: ceiling);
		}
		vma = next;
	}
}
575
/*
 * Allocate a pte page for @address and install it in @pmd if the entry is
 * still empty once mm->page_table_lock is held.
 *
 * If the entry was populated in the meantime the new page is freed again;
 * if it turned out to be a huge pmd in the middle of being split, the
 * function additionally waits for the split to finish.
 *
 * Returns 0 on success (including the "somebody else populated it" case)
 * or -ENOMEM if the pte page could not be allocated.
 */
int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
		pmd_t *pmd, unsigned long address)
{
	pgtable_t new = pte_alloc_one(mm, address);
	int wait_split_huge_page;
	if (!new)
		return -ENOMEM;

	/*
	 * Write barrier issued before the page is made reachable below.
	 * NOTE(review): presumably this orders initialisation of the new pte
	 * page ahead of its publication for lockless walkers -- confirm.
	 */
	smp_wmb();

	spin_lock(&mm->page_table_lock);
	wait_split_huge_page = 0;
	if (likely(pmd_none(*pmd))) {
		mm->nr_ptes++;
		pmd_populate(mm, pmd, new);
		new = NULL;	/* ownership moved to the page table */
	} else if (unlikely(pmd_trans_splitting(*pmd)))
		wait_split_huge_page = 1;
	spin_unlock(&mm->page_table_lock);
	if (new)
		pte_free(mm, new);
	/* The wait is done only after the lock has been dropped. */
	if (wait_split_huge_page)
		wait_split_huge_page(vma->anon_vma, pmd);
	return 0;
}
614
615int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
616{
617 pte_t *new = pte_alloc_one_kernel(&init_mm, address);
618 if (!new)
619 return -ENOMEM;
620
621 smp_wmb();
622
623 spin_lock(&init_mm.page_table_lock);
624 if (likely(pmd_none(*pmd))) {
625 pmd_populate_kernel(&init_mm, pmd, new);
626 new = NULL;
627 } else
628 VM_BUG_ON(pmd_trans_splitting(*pmd));
629 spin_unlock(&init_mm.page_table_lock);
630 if (new)
631 pte_free_kernel(&init_mm, new);
632 return 0;
633}
634
635static inline void init_rss_vec(int *rss)
636{
637 memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
638}
639
640static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
641{
642 int i;
643
644 if (current->mm == mm)
645 sync_mm_rss(mm);
646 for (i = 0; i < NR_MM_COUNTERS; i++)
647 if (rss[i])
648 add_mm_counter(mm, i, rss[i]);
649}
650
651
652
653
654
655
656
657
/*
 * Report a pte that should not exist: log the pte and pmd values, the page
 * (if @page is given), details of the vma, the names of the vma's fault and
 * mmap handlers, and a stack trace; then taint the kernel with
 * TAINT_BAD_PAGE.
 *
 * Output is rate limited: after 60 reports further ones are only counted
 * until 60 seconds have passed since the first report of the burst, at which
 * point the number of suppressed messages is printed and reporting resumes.
 */
static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
			  pte_t pte, struct page *page)
{
	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
	pud_t *pud = pud_offset(pgd, addr);
	pmd_t *pmd = pmd_offset(pud, addr);
	struct address_space *mapping;
	pgoff_t index;
	static unsigned long resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;

	/* Rate limiting; the static counters are updated without any locking here. */
	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;
			return;
		}
		if (nr_unshown) {
			printk(KERN_ALERT
				"BUG: Bad page map: %lu messages suppressed\n",
				nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	/* First report of a burst starts the 60 second window. */
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;

	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
	index = linear_page_index(vma, addr);

	printk(KERN_ALERT
		"BUG: Bad page map in process %s  pte:%08llx pmd:%08llx\n",
		current->comm,
		(long long)pte_val(pte), (long long)pmd_val(*pmd));
	if (page)
		dump_page(page);
	printk(KERN_ALERT
		"addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
		(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);

	/* Name the handlers involved to help identify the code that set up the mapping. */
	if (vma->vm_ops)
		print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n",
				(unsigned long)vma->vm_ops->fault);
	if (vma->vm_file && vma->vm_file->f_op)
		print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n",
				(unsigned long)vma->vm_file->f_op->mmap);
	dump_stack();
	add_taint(TAINT_BAD_PAGE);
}
714
715static inline int is_cow_mapping(vm_flags_t flags)
716{
717 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
718}
719
720#ifndef is_zero_pfn
721static inline int is_zero_pfn(unsigned long pfn)
722{
723 return pfn == zero_pfn;
724}
725#endif
726
727#ifndef my_zero_pfn
728static inline unsigned long my_zero_pfn(unsigned long addr)
729{
730 return zero_pfn;
731}
732#endif
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
/* Compile-time flag: does the architecture provide pte_special()? */
#ifdef __HAVE_ARCH_PTE_SPECIAL
# define HAVE_PTE_SPECIAL 1
#else
# define HAVE_PTE_SPECIAL 0
#endif
/*
 * Return the struct page behind @pte when the mapping at @addr in @vma is
 * an ordinary, struct-page-backed one, or NULL when it is "special"
 * (raw pfn mapping, zero page, or a pte that fails the sanity checks).
 *
 * With HAVE_PTE_SPECIAL the decision is taken from the pte itself; without
 * it, it is derived from the vma flags (VM_PFNMAP / VM_MIXEDMAP) and the
 * pfn. Inconsistent ptes are reported through print_bad_pte().
 */
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
				pte_t pte)
{
	unsigned long pfn = pte_pfn(pte);

	if (HAVE_PTE_SPECIAL) {
		if (likely(!pte_special(pte)))
			goto check_pfn;
		if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
			return NULL;
		/* A special pte outside such a vma is only expected for the zero page. */
		if (!is_zero_pfn(pfn))
			print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/* !HAVE_PTE_SPECIAL case follows: */

	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
		if (vma->vm_flags & VM_MIXEDMAP) {
			/* Mixed map: normal exactly when the pfn is valid. */
			if (!pfn_valid(pfn))
				return NULL;
			goto out;
		} else {
			unsigned long off;
			off = (addr - vma->vm_start) >> PAGE_SHIFT;
			/* pfn sits at the linear vm_pgoff-based position: raw pfn mapping. */
			if (pfn == vma->vm_pgoff + off)
				return NULL;
			/* Any other pfn is only treated as normal in a COW mapping. */
			if (!is_cow_mapping(vma->vm_flags))
				return NULL;
		}
	}

	if (is_zero_pfn(pfn))
		return NULL;
check_pfn:
	if (unlikely(pfn > highest_memmap_pfn)) {
		print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

out:
	return pfn_to_page(pfn);
}
828
829
830
831
832
833
834
/*
 * Copy one pte from @src_mm to @dst_mm at @addr, updating the local RSS
 * delta vector @rss.
 *
 * Returns 0 on success. If the pte is a swap entry whose reference count
 * could not be duplicated, the entry value is returned instead and nothing
 * is installed in @dst_pte; copy_pte_range() then adds a swap count
 * continuation and retries.
 */
static inline unsigned long
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
		unsigned long addr, int *rss)
{
	unsigned long vm_flags = vma->vm_flags;
	pte_t pte = *src_pte;
	struct page *page;

	/* pte contains position in swap or file, so copy. */
	if (unlikely(!pte_present(pte))) {
		if (!pte_file(pte)) {
			swp_entry_t entry = pte_to_swp_entry(pte);

			if (swap_duplicate(entry) < 0)
				return entry.val;

			/* Put dst_mm on the mmlist (next to src_mm) if it is not on it yet. */
			if (unlikely(list_empty(&dst_mm->mmlist))) {
				spin_lock(&mmlist_lock);
				/* Re-check under the lock. */
				if (list_empty(&dst_mm->mmlist))
					list_add(&dst_mm->mmlist,
						 &src_mm->mmlist);
				spin_unlock(&mmlist_lock);
			}
			if (likely(!non_swap_entry(entry)))
				rss[MM_SWAPENTS]++;
			else if (is_migration_entry(entry)) {
				page = migration_entry_to_page(entry);

				if (PageAnon(page))
					rss[MM_ANONPAGES]++;
				else
					rss[MM_FILEPAGES]++;

				if (is_write_migration_entry(entry) &&
				    is_cow_mapping(vm_flags)) {
					/*
					 * COW mapping: downgrade the migration
					 * entry to read-only in the source as
					 * well as in the copy made below.
					 */
					make_migration_entry_read(&entry);
					pte = swp_entry_to_pte(entry);
					set_pte_at(src_mm, addr, src_pte, pte);
				}
			}
		}
		goto out_set_pte;
	}

	/*
	 * COW mapping: write protect the pte in both the source and the
	 * copy.
	 */
	if (is_cow_mapping(vm_flags)) {
		ptep_set_wrprotect(src_mm, addr, src_pte);
		pte = pte_wrprotect(pte);
	}

	/*
	 * The copy is marked clean for shared mappings and always marked old.
	 */
	if (vm_flags & VM_SHARED)
		pte = pte_mkclean(pte);
	pte = pte_mkold(pte);

	page = vm_normal_page(vma, addr, pte);
	if (page) {
		/* The copy gets its own page reference and rmap count. */
		get_page(page);
		page_dup_rmap(page);
		if (PageAnon(page))
			rss[MM_ANONPAGES]++;
		else
			rss[MM_FILEPAGES]++;
	}

out_set_pte:
	set_pte_at(dst_mm, addr, dst_pte, pte);
	return 0;
}
916
/*
 * Copy the ptes for [addr, end) under @src_pmd into @dst_pmd, allocating
 * the destination pte page if needed.
 *
 * Both pte locks are held while copying. The loop periodically drops them
 * (when a reschedule is needed or a lock is contended) and resumes where it
 * stopped; it also stops when a swap entry needs a count continuation,
 * which is allocated with GFP_KERNEL outside the locks before retrying.
 *
 * Returns 0 on success or -ENOMEM.
 */
int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		   pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
		   unsigned long addr, unsigned long end)
{
	pte_t *orig_src_pte, *orig_dst_pte;
	pte_t *src_pte, *dst_pte;
	spinlock_t *src_ptl, *dst_ptl;
	int progress = 0;
	int rss[NR_MM_COUNTERS];
	swp_entry_t entry = (swp_entry_t){0};

again:
	init_rss_vec(rss);

	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
	if (!dst_pte)
		return -ENOMEM;
	src_pte = pte_offset_map(src_pmd, addr);
	src_ptl = pte_lockptr(src_mm, src_pmd);
	/* Second pte lock taken while dst_ptl is held, hence the nested annotation. */
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
	orig_src_pte = src_pte;
	orig_dst_pte = dst_pte;
	arch_enter_lazy_mmu_mode();

	do {
		/*
		 * "progress" weighs work done: 1 per empty pte, 8 per copied
		 * one. Once it reaches 32, check whether the locks should be
		 * dropped for a while.
		 */
		if (progress >= 32) {
			progress = 0;
			if (need_resched() ||
			    spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
				break;
		}
		if (pte_none(*src_pte)) {
			progress++;
			continue;
		}
		entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
							vma, addr, rss);
		/* Non-zero: swap entry that needs a count continuation. */
		if (entry.val)
			break;
		progress += 8;
	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);

	arch_leave_lazy_mmu_mode();
	spin_unlock(src_ptl);
	pte_unmap(orig_src_pte);
	add_mm_rss_vec(dst_mm, rss);
	pte_unmap_unlock(orig_dst_pte, dst_ptl);
	cond_resched();

	if (entry.val) {
		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
			return -ENOMEM;
		progress = 0;
	}
	/* Stopped early: retake the locks and continue from addr. */
	if (addr != end)
		goto again;
	return 0;
}
979
/*
 * Copy the pmd-level entries for [addr, end) under @src_pud into @dst_pud,
 * allocating the destination pmd page if needed. Transparent huge pmds are
 * handed to copy_huge_pmd(); everything else goes through copy_pte_range().
 *
 * Returns 0 on success or -ENOMEM.
 */
static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pmd_t *src_pmd, *dst_pmd;
	unsigned long next;

	dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
	if (!dst_pmd)
		return -ENOMEM;
	src_pmd = pmd_offset(src_pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_trans_huge(*src_pmd)) {
			int err;
			VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
			err = copy_huge_pmd(dst_mm, src_mm,
					    dst_pmd, src_pmd, addr, vma);
			if (err == -ENOMEM)
				return -ENOMEM;
			if (!err)
				continue;
			/* any other result: fall through to the pte-level copy */
		}
		if (pmd_none_or_clear_bad(src_pmd))
			continue;
		if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
	return 0;
}
1012
1013static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1014 pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
1015 unsigned long addr, unsigned long end)
1016{
1017 pud_t *src_pud, *dst_pud;
1018 unsigned long next;
1019
1020 dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
1021 if (!dst_pud)
1022 return -ENOMEM;
1023 src_pud = pud_offset(src_pgd, addr);
1024 do {
1025 next = pud_addr_end(addr, end);
1026 if (pud_none_or_clear_bad(src_pud))
1027 continue;
1028 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
1029 vma, addr, next))
1030 return -ENOMEM;
1031 } while (dst_pud++, src_pud++, addr = next, addr != end);
1032 return 0;
1033}
1034
/*
 * Copy the page tables covering @vma from @src_mm to @dst_mm.
 *
 * Returns 0 on success, -ENOMEM if a page-table level could not be copied,
 * or the error returned by track_pfn_vma_copy() for pfn mappings.
 */
int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		struct vm_area_struct *vma)
{
	pgd_t *src_pgd, *dst_pgd;
	unsigned long next;
	unsigned long addr = vma->vm_start;
	unsigned long end = vma->vm_end;
	int ret;

	/*
	 * Nothing is copied for a vma that has none of these flags and no
	 * anon_vma.
	 * NOTE(review): presumably such mappings can simply be refilled by
	 * page faults in the new mm -- confirm.
	 */
	if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
		if (!vma->anon_vma)
			return 0;
	}

	if (is_vm_hugetlb_page(vma))
		return copy_hugetlb_page_range(dst_mm, src_mm, vma);

	if (unlikely(is_pfn_mapping(vma))) {
		/* pfn mappings are registered with the pfn tracking code first. */
		ret = track_pfn_vma_copy(vma);
		if (ret)
			return ret;
	}

	/*
	 * The source ptes are only modified (write protected) for COW
	 * mappings, so only those are bracketed by mmu notifier calls.
	 */
	if (is_cow_mapping(vma->vm_flags))
		mmu_notifier_invalidate_range_start(src_mm, addr, end);

	ret = 0;
	dst_pgd = pgd_offset(dst_mm, addr);
	src_pgd = pgd_offset(src_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(src_pgd))
			continue;
		if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
					    vma, addr, next))) {
			ret = -ENOMEM;
			break;
		}
	} while (dst_pgd++, src_pgd++, addr = next, addr != end);

	/* addr has been advanced by the loop, so the start comes from the vma. */
	if (is_cow_mapping(vma->vm_flags))
		mmu_notifier_invalidate_range_end(src_mm,
						  vma->vm_start, end);
	return ret;
}
1096
1097static unsigned long zap_pte_range(struct mmu_gather *tlb,
1098 struct vm_area_struct *vma, pmd_t *pmd,
1099 unsigned long addr, unsigned long end,
1100 struct zap_details *details)
1101{
1102 struct mm_struct *mm = tlb->mm;
1103 int force_flush = 0;
1104 int rss[NR_MM_COUNTERS];
1105 spinlock_t *ptl;
1106 pte_t *start_pte;
1107 pte_t *pte;
1108
1109again:
1110 init_rss_vec(rss);
1111 start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
1112 pte = start_pte;
1113 arch_enter_lazy_mmu_mode();
1114 do {
1115 pte_t ptent = *pte;
1116 if (pte_none(ptent)) {
1117 continue;
1118 }
1119
1120 if (pte_present(ptent)) {
1121 struct page *page;
1122
1123 page = vm_normal_page(vma, addr, ptent);
1124 if (unlikely(details) && page) {
1125
1126
1127
1128
1129
1130 if (details->check_mapping &&
1131 details->check_mapping != page->mapping)
1132 continue;
1133
1134
1135
1136
1137 if (details->nonlinear_vma &&
1138 (page->index < details->first_index ||
1139 page->index > details->last_index))
1140 continue;
1141 }
1142 ptent = ptep_get_and_clear_full(mm, addr, pte,
1143 tlb->fullmm);
1144 tlb_remove_tlb_entry(tlb, pte, addr);
1145 if (unlikely(!page))
1146 continue;
1147 if (unlikely(details) && details->nonlinear_vma
1148 && linear_page_index(details->nonlinear_vma,
1149 addr) != page->index)
1150 set_pte_at(mm, addr, pte,
1151 pgoff_to_pte(page->index));
1152 if (PageAnon(page))
1153 rss[MM_ANONPAGES]--;
1154 else {
1155 if (pte_dirty(ptent))
1156 set_page_dirty(page);
1157 if (pte_young(ptent) &&
1158 likely(!VM_SequentialReadHint(vma)))
1159 mark_page_accessed(page);
1160 rss[MM_FILEPAGES]--;
1161 }
1162 page_remove_rmap(page);
1163 if (unlikely(page_mapcount(page) < 0))
1164 print_bad_pte(vma, addr, ptent, page);
1165 force_flush = !__tlb_remove_page(tlb, page);
1166 if (force_flush)
1167 break;
1168 continue;
1169 }
1170
1171
1172
1173
1174 if (unlikely(details))
1175 continue;
1176 if (pte_file(ptent)) {
1177 if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
1178 print_bad_pte(vma, addr, ptent, NULL);
1179 } else {
1180 swp_entry_t entry = pte_to_swp_entry(ptent);
1181
1182 if (!non_swap_entry(entry))
1183 rss[MM_SWAPENTS]--;
1184 else if (is_migration_entry(entry)) {
1185 struct page *page;
1186
1187 page = migration_entry_to_page(entry);
1188
1189 if (PageAnon(page))
1190 rss[MM_ANONPAGES]--;
1191 else
1192 rss[MM_FILEPAGES]--;
1193 }
1194 if (unlikely(!free_swap_and_cache(entry)))
1195 print_bad_pte(vma, addr, ptent, NULL);
1196 }
1197 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1198 } while (pte++, addr += PAGE_SIZE, addr != end);
1199
1200 add_mm_rss_vec(mm, rss);
1201 arch_leave_lazy_mmu_mode();
1202 pte_unmap_unlock(start_pte, ptl);
1203
1204
1205
1206
1207
1208
1209 if (force_flush) {
1210 force_flush = 0;
1211
1212#ifdef HAVE_GENERIC_MMU_GATHER
1213 tlb->start = addr;
1214 tlb->end = end;
1215#endif
1216 tlb_flush_mmu(tlb);
1217 if (addr != end)
1218 goto again;
1219 }
1220
1221 return addr;
1222}
1223
/*
 * Tear down the mappings for [addr, end) below @pud, one pmd entry at a
 * time. A transparent huge pmd that is covered completely is zapped as a
 * whole; one that is only partially covered is split first and then handled
 * at pte level.
 *
 * Returns the address at which the walk stopped (@end).
 */
static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pud_t *pud,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_trans_huge(*pmd)) {
			if (next - addr != HPAGE_PMD_SIZE) {
#ifdef CONFIG_DEBUG_VM
				/* Debug builds insist that mmap_sem is held when splitting here. */
				if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
					pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
						__func__, addr, end,
						vma->vm_start,
						vma->vm_end);
					BUG();
				}
#endif
				split_huge_page_pmd(vma->vm_mm, pmd);
			} else if (zap_huge_pmd(tlb, vma, pmd, addr))
				goto next;
			/* fall through to the pte-level path */
		}
		/*
		 * The pmd is re-examined here: if it is none, (again) a huge
		 * pmd, or bad, this entry is skipped.
		 */
		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
			goto next;
		next = zap_pte_range(tlb, vma, pmd, addr, next, details);
next:
		cond_resched();
	} while (pmd++, addr = next, addr != end);

	return addr;
}
1267
1268static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1269 struct vm_area_struct *vma, pgd_t *pgd,
1270 unsigned long addr, unsigned long end,
1271 struct zap_details *details)
1272{
1273 pud_t *pud;
1274 unsigned long next;
1275
1276 pud = pud_offset(pgd, addr);
1277 do {
1278 next = pud_addr_end(addr, end);
1279 if (pud_none_or_clear_bad(pud))
1280 continue;
1281 next = zap_pmd_range(tlb, vma, pud, addr, next, details);
1282 } while (pud++, addr = next, addr != end);
1283
1284 return addr;
1285}
1286
1287static void unmap_page_range(struct mmu_gather *tlb,
1288 struct vm_area_struct *vma,
1289 unsigned long addr, unsigned long end,
1290 struct zap_details *details)
1291{
1292 pgd_t *pgd;
1293 unsigned long next;
1294
1295 if (details && !details->check_mapping && !details->nonlinear_vma)
1296 details = NULL;
1297
1298 BUG_ON(addr >= end);
1299 mem_cgroup_uncharge_start();
1300 tlb_start_vma(tlb, vma);
1301 pgd = pgd_offset(vma->vm_mm, addr);
1302 do {
1303 next = pgd_addr_end(addr, end);
1304 if (pgd_none_or_clear_bad(pgd))
1305 continue;
1306 next = zap_pud_range(tlb, vma, pgd, addr, next, details);
1307 } while (pgd++, addr = next, addr != end);
1308 tlb_end_vma(tlb, vma);
1309 mem_cgroup_uncharge_end();
1310}
1311
1312
1313static void unmap_single_vma(struct mmu_gather *tlb,
1314 struct vm_area_struct *vma, unsigned long start_addr,
1315 unsigned long end_addr,
1316 struct zap_details *details)
1317{
1318 unsigned long start = max(vma->vm_start, start_addr);
1319 unsigned long end;
1320
1321 if (start >= vma->vm_end)
1322 return;
1323 end = min(vma->vm_end, end_addr);
1324 if (end <= vma->vm_start)
1325 return;
1326
1327 if (vma->vm_file)
1328 uprobe_munmap(vma, start, end);
1329
1330 if (unlikely(is_pfn_mapping(vma)))
1331 untrack_pfn_vma(vma, 0, 0);
1332
1333 if (start != end) {
1334 if (unlikely(is_vm_hugetlb_page(vma))) {
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346 if (vma->vm_file) {
1347 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
1348 __unmap_hugepage_range_final(tlb, vma, start, end, NULL);
1349 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
1350 }
1351 } else
1352 unmap_page_range(tlb, vma, start, end, details);
1353 }
1354}
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374void unmap_vmas(struct mmu_gather *tlb,
1375 struct vm_area_struct *vma, unsigned long start_addr,
1376 unsigned long end_addr)
1377{
1378 struct mm_struct *mm = vma->vm_mm;
1379
1380 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
1381 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
1382 unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
1383 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
1384}
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395void zap_page_range(struct vm_area_struct *vma, unsigned long start,
1396 unsigned long size, struct zap_details *details)
1397{
1398 struct mm_struct *mm = vma->vm_mm;
1399 struct mmu_gather tlb;
1400 unsigned long end = start + size;
1401
1402 lru_add_drain();
1403 tlb_gather_mmu(&tlb, mm, 0);
1404 update_hiwater_rss(mm);
1405 mmu_notifier_invalidate_range_start(mm, start, end);
1406 for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
1407 unmap_single_vma(&tlb, vma, start, end, details);
1408 mmu_notifier_invalidate_range_end(mm, start, end);
1409 tlb_finish_mmu(&tlb, start, end);
1410}
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
1422 unsigned long size, struct zap_details *details)
1423{
1424 struct mm_struct *mm = vma->vm_mm;
1425 struct mmu_gather tlb;
1426 unsigned long end = address + size;
1427
1428 lru_add_drain();
1429 tlb_gather_mmu(&tlb, mm, 0);
1430 update_hiwater_rss(mm);
1431 mmu_notifier_invalidate_range_start(mm, address, end);
1432 unmap_single_vma(&tlb, vma, address, end, details);
1433 mmu_notifier_invalidate_range_end(mm, address, end);
1434 tlb_finish_mmu(&tlb, address, end);
1435}
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1450 unsigned long size)
1451{
1452 if (address < vma->vm_start || address + size > vma->vm_end ||
1453 !(vma->vm_flags & VM_PFNMAP))
1454 return -1;
1455 zap_page_range_single(vma, address, size, NULL);
1456 return 0;
1457}
1458EXPORT_SYMBOL_GPL(zap_vma_ptes);
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
/*
 * Look up the page mapped at user address @address in @vma.
 *
 * @flags is a set of FOLL_* bits: FOLL_WRITE requires a writable pte,
 * FOLL_GET takes a reference on the returned page, FOLL_TOUCH marks the page
 * accessed (and dirty for writes), FOLL_SPLIT splits a transparent huge page
 * first, FOLL_MLOCK mlocks the page when the vma is VM_LOCKED, and FOLL_DUMP
 * turns certain "no page" results into errors.
 *
 * Returns the page, NULL when there is nothing suitable mapped, or
 * ERR_PTR(-EFAULT) when the pte maps something without a normal struct page
 * (other than the zero page), or, with FOLL_DUMP, when the zero page is
 * mapped or when there is no page table and the vma has no fault handler.
 */
struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
			unsigned int flags)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep, pte;
	spinlock_t *ptl;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	/* Any non-error result from follow_huge_addr() is returned as is. */
	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
	if (!IS_ERR(page)) {
		BUG_ON(flags & FOLL_GET);
		goto out;
	}

	page = NULL;
	pgd = pgd_offset(mm, address);
	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
		goto no_page_table;

	pud = pud_offset(pgd, address);
	if (pud_none(*pud))
		goto no_page_table;
	if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
		BUG_ON(flags & FOLL_GET);
		page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
		goto out;
	}
	if (unlikely(pud_bad(*pud)))
		goto no_page_table;

	pmd = pmd_offset(pud, address);
	if (pmd_none(*pmd))
		goto no_page_table;
	if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
		BUG_ON(flags & FOLL_GET);
		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
		goto out;
	}
	if (pmd_trans_huge(*pmd)) {
		if (flags & FOLL_SPLIT) {
			split_huge_page_pmd(mm, pmd);
			goto split_fallthrough;
		}
		/* Re-check under page_table_lock before following the huge pmd. */
		spin_lock(&mm->page_table_lock);
		if (likely(pmd_trans_huge(*pmd))) {
			if (unlikely(pmd_trans_splitting(*pmd))) {
				spin_unlock(&mm->page_table_lock);
				wait_split_huge_page(vma->anon_vma, pmd);
			} else {
				page = follow_trans_huge_pmd(mm, address,
							     pmd, flags);
				spin_unlock(&mm->page_table_lock);
				goto out;
			}
		} else
			spin_unlock(&mm->page_table_lock);
		/* no longer huge (or split finished): continue at pte level */
	}
split_fallthrough:
	if (unlikely(pmd_bad(*pmd)))
		goto no_page_table;

	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);

	pte = *ptep;
	if (!pte_present(pte))
		goto no_page;
	/* Write access wanted but pte is read-only: return NULL (page is still NULL). */
	if ((flags & FOLL_WRITE) && !pte_write(pte))
		goto unlock;

	page = vm_normal_page(vma, address, pte);
	if (unlikely(!page)) {
		/* Only the zero page is acceptable here, and not for FOLL_DUMP. */
		if ((flags & FOLL_DUMP) ||
		    !is_zero_pfn(pte_pfn(pte)))
			goto bad_page;
		page = pte_page(pte);
	}

	if (flags & FOLL_GET)
		get_page_foll(page);
	if (flags & FOLL_TOUCH) {
		if ((flags & FOLL_WRITE) &&
		    !pte_dirty(pte) && !PageDirty(page))
			set_page_dirty(page);
		/*
		 * The pte itself is not updated; the access is recorded on
		 * the page instead.
		 */
		mark_page_accessed(page);
	}
	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
		/*
		 * Best effort only: the page is mlocked if it has a mapping
		 * and its lock can be taken without waiting (the pte lock is
		 * still held here).
		 */
		if (page->mapping && trylock_page(page)) {
			lru_add_drain();  /* push cached pages to LRU */
			/*
			 * page->mapping is re-checked now that the page lock
			 * is held.
			 */
			if (page->mapping)
				mlock_vma_page(page);
			unlock_page(page);
		}
	}
unlock:
	pte_unmap_unlock(ptep, ptl);
out:
	return page;

bad_page:
	pte_unmap_unlock(ptep, ptl);
	return ERR_PTR(-EFAULT);

no_page:
	pte_unmap_unlock(ptep, ptl);
	/* Non-present but not empty pte: return NULL (page is still NULL). */
	if (!pte_none(pte))
		return page;

no_page_table:
	/*
	 * Nothing mapped at all. With FOLL_DUMP this is reported as an error
	 * when the vma has no fault handler; otherwise NULL is returned.
	 */
	if ((flags & FOLL_DUMP) &&
	    (!vma->vm_ops || !vma->vm_ops->fault))
		return ERR_PTR(-EFAULT);
	return page;
}
1616
1617static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
1618{
1619 return stack_guard_page_start(vma, addr) ||
1620 stack_guard_page_end(vma, addr+PAGE_SIZE);
1621}
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
/*
 * __get_user_pages - look up (and fault in if needed) a range of user pages
 * @tsk: task whose maj_flt/min_flt counters are bumped on faults; may be NULL
 * @mm: mm_struct the addresses belong to
 * @start: first user address
 * @nr_pages: number of pages to process from @start
 * @gup_flags: FOLL_* flags
 * @pages: if non-NULL, filled with the pages found; must be non-NULL
 *         exactly when FOLL_GET is set (checked with VM_BUG_ON)
 * @vmas: if non-NULL, filled with the vma of each page
 * @nonblocking: if non-NULL, faults are issued with FAULT_FLAG_ALLOW_RETRY
 *               and *@nonblocking is cleared when a fault asks for a retry
 *
 * Returns the number of pages processed, which may be fewer than requested.
 * If nothing was processed, returns 0 for @nr_pages <= 0 or after a
 * VM_FAULT_RETRY, otherwise a negative errno (-EFAULT, -ENOMEM,
 * -EHWPOISON, -ERESTARTSYS, or the error encoded by follow_page()).
 */
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
		     unsigned long start, int nr_pages, unsigned int gup_flags,
		     struct page **pages, struct vm_area_struct **vmas,
		     int *nonblocking)
{
	int i;
	unsigned long vm_flags;

	if (nr_pages <= 0)
		return 0;

	/* A page array is supplied if and only if references are taken. */
	VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));

	/*
	 * Build the mask a vma's vm_flags must intersect.  Writes need
	 * VM_WRITE or VM_MAYWRITE, reads VM_READ or VM_MAYREAD; FOLL_FORCE
	 * then keeps only the MAY* bits, otherwise only the actual
	 * READ/WRITE bits are kept.
	 */
	vm_flags  = (gup_flags & FOLL_WRITE) ?
			(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
	vm_flags &= (gup_flags & FOLL_FORCE) ?
			(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
	i = 0;

	do {
		struct vm_area_struct *vma;

		vma = find_extend_vma(mm, start);
		if (!vma && in_gate_area(mm, start)) {
			/*
			 * No regular vma, but the address is in the gate
			 * area: walk the page tables by hand.
			 */
			unsigned long pg = start & PAGE_MASK;
			pgd_t *pgd;
			pud_t *pud;
			pmd_t *pmd;
			pte_t *pte;

			/* The gate area is never handed out for writing. */
			if (gup_flags & FOLL_WRITE)
				return i ? : -EFAULT;
			if (pg > TASK_SIZE)
				pgd = pgd_offset_k(pg);
			else
				pgd = pgd_offset_gate(mm, pg);
			BUG_ON(pgd_none(*pgd));
			pud = pud_offset(pgd, pg);
			BUG_ON(pud_none(*pud));
			pmd = pmd_offset(pud, pg);
			if (pmd_none(*pmd))
				return i ? : -EFAULT;
			VM_BUG_ON(pmd_trans_huge(*pmd));
			pte = pte_offset_map(pmd, pg);
			if (pte_none(*pte)) {
				pte_unmap(pte);
				return i ? : -EFAULT;
			}
			vma = get_gate_vma(mm);
			if (pages) {
				struct page *page;

				page = vm_normal_page(vma, start, *pte);
				if (!page) {
					/*
					 * Same policy as follow_page(): only
					 * the zero pfn is acceptable, and not
					 * under FOLL_DUMP.
					 */
					if (!(gup_flags & FOLL_DUMP) &&
					     is_zero_pfn(pte_pfn(*pte)))
						page = pte_page(*pte);
					else {
						pte_unmap(pte);
						return i ? : -EFAULT;
					}
				}
				pages[i] = page;
				get_page(page);
			}
			pte_unmap(pte);
			goto next_page;
		}

		/*
		 * Reject missing vmas, VM_IO / VM_PFNMAP mappings, and vmas
		 * whose permissions do not intersect the mask built above.
		 */
		if (!vma ||
		    (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
		    !(vm_flags & vma->vm_flags))
			return i ? : -EFAULT;

		/*
		 * hugetlb vmas are handled in bulk; the helper advances
		 * start and nr_pages itself and returns the new count.
		 */
		if (is_vm_hugetlb_page(vma)) {
			i = follow_hugetlb_page(mm, vma, pages, vmas,
					&start, &nr_pages, i, gup_flags);
			continue;
		}

		do {
			struct page *page;
			unsigned int foll_flags = gup_flags;

			/* Bail out early if the caller is being killed. */
			if (unlikely(fatal_signal_pending(current)))
				return i ? i : -ERESTARTSYS;

			cond_resched();
			/* NULL from follow_page(): fault the page in, retry. */
			while (!(page = follow_page(vma, start, foll_flags))) {
				int ret;
				unsigned int fault_flags = 0;

				/* When mlocking, do not fault in stack guard pages. */
				if (foll_flags & FOLL_MLOCK) {
					if (stack_guard_page(vma, start))
						goto next_page;
				}
				if (foll_flags & FOLL_WRITE)
					fault_flags |= FAULT_FLAG_WRITE;
				if (nonblocking)
					fault_flags |= FAULT_FLAG_ALLOW_RETRY;
				if (foll_flags & FOLL_NOWAIT)
					fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT);

				ret = handle_mm_fault(mm, vma, start,
							fault_flags);

				if (ret & VM_FAULT_ERROR) {
					if (ret & VM_FAULT_OOM)
						return i ? i : -ENOMEM;
					if (ret & (VM_FAULT_HWPOISON |
						   VM_FAULT_HWPOISON_LARGE)) {
						if (i)
							return i;
						else if (gup_flags & FOLL_HWPOISON)
							return -EHWPOISON;
						else
							return -EFAULT;
					}
					if (ret & VM_FAULT_SIGBUS)
						return i ? i : -EFAULT;
					/* Any other error bit is unexpected. */
					BUG();
				}

				/* Account the fault to @tsk, if one was given. */
				if (tsk) {
					if (ret & VM_FAULT_MAJOR)
						tsk->maj_flt++;
					else
						tsk->min_flt++;
				}

				if (ret & VM_FAULT_RETRY) {
					if (nonblocking)
						*nonblocking = 0;
					return i;
				}

				/*
				 * The fault reported a completed write
				 * (VM_FAULT_WRITE) but the vma itself is not
				 * writable, so the pte will stay read-only.
				 * Drop FOLL_WRITE so the next follow_page()
				 * does not reject the pte for being
				 * non-writable and loop forever.
				 */
				if ((ret & VM_FAULT_WRITE) &&
				    !(vma->vm_flags & VM_WRITE))
					foll_flags &= ~FOLL_WRITE;

				cond_resched();
			}
			if (IS_ERR(page))
				return i ? i : PTR_ERR(page);
			if (pages) {
				pages[i] = page;

				flush_anon_page(vma, page, start);
				flush_dcache_page(page);
			}
next_page:
			if (vmas)
				vmas[i] = vma;
			i++;
			start += PAGE_SIZE;
			nr_pages--;
		} while (nr_pages && start < vma->vm_end);
	} while (nr_pages);
	return i;
}
EXPORT_SYMBOL(__get_user_pages);
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
1884 unsigned long address, unsigned int fault_flags)
1885{
1886 struct vm_area_struct *vma;
1887 int ret;
1888
1889 vma = find_extend_vma(mm, address);
1890 if (!vma || address < vma->vm_start)
1891 return -EFAULT;
1892
1893 ret = handle_mm_fault(mm, vma, address, fault_flags);
1894 if (ret & VM_FAULT_ERROR) {
1895 if (ret & VM_FAULT_OOM)
1896 return -ENOMEM;
1897 if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
1898 return -EHWPOISON;
1899 if (ret & VM_FAULT_SIGBUS)
1900 return -EFAULT;
1901 BUG();
1902 }
1903 if (tsk) {
1904 if (ret & VM_FAULT_MAJOR)
1905 tsk->maj_flt++;
1906 else
1907 tsk->min_flt++;
1908 }
1909 return 0;
1910}
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1964 unsigned long start, int nr_pages, int write, int force,
1965 struct page **pages, struct vm_area_struct **vmas)
1966{
1967 int flags = FOLL_TOUCH;
1968
1969 if (pages)
1970 flags |= FOLL_GET;
1971 if (write)
1972 flags |= FOLL_WRITE;
1973 if (force)
1974 flags |= FOLL_FORCE;
1975
1976 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
1977 NULL);
1978}
1979EXPORT_SYMBOL(get_user_pages);
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
#ifdef CONFIG_ELF_CORE
/*
 * get_dump_page - fetch one page of the current task at @addr
 *
 * Looks the page up in current->mm with FOLL_FORCE | FOLL_DUMP | FOLL_GET.
 * Returns the page after flushing its cache alias for @addr, or NULL when
 * __get_user_pages() did not deliver exactly the one page asked for.
 * FOLL_GET was requested, so the returned page carries a reference.
 */
struct page *get_dump_page(unsigned long addr)
{
	struct vm_area_struct *dump_vma;
	struct page *dump_page;
	int got;

	got = __get_user_pages(current, current->mm, addr, 1,
			       FOLL_FORCE | FOLL_DUMP | FOLL_GET,
			       &dump_page, &dump_vma, NULL);
	if (got < 1)
		return NULL;

	flush_cache_page(dump_vma, addr, page_to_pfn(dump_page));
	return dump_page;
}
#endif /* CONFIG_ELF_CORE */
2009
2010pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
2011 spinlock_t **ptl)
2012{
2013 pgd_t * pgd = pgd_offset(mm, addr);
2014 pud_t * pud = pud_alloc(mm, pgd, addr);
2015 if (pud) {
2016 pmd_t * pmd = pmd_alloc(mm, pud, addr);
2017 if (pmd) {
2018 VM_BUG_ON(pmd_trans_huge(*pmd));
2019 return pte_alloc_map_lock(mm, pmd, addr, ptl);
2020 }
2021 }
2022 return NULL;
2023}
2024
2025
2026
2027
2028
2029
2030
2031
2032static int insert_page(struct vm_area_struct *vma, unsigned long addr,
2033 struct page *page, pgprot_t prot)
2034{
2035 struct mm_struct *mm = vma->vm_mm;
2036 int retval;
2037 pte_t *pte;
2038 spinlock_t *ptl;
2039
2040 retval = -EINVAL;
2041 if (PageAnon(page))
2042 goto out;
2043 retval = -ENOMEM;
2044 flush_dcache_page(page);
2045 pte = get_locked_pte(mm, addr, &ptl);
2046 if (!pte)
2047 goto out;
2048 retval = -EBUSY;
2049 if (!pte_none(*pte))
2050 goto out_unlock;
2051
2052
2053 get_page(page);
2054 inc_mm_counter_fast(mm, MM_FILEPAGES);
2055 page_add_file_rmap(page);
2056 set_pte_at(mm, addr, pte, mk_pte(page, prot));
2057
2058 retval = 0;
2059 pte_unmap_unlock(pte, ptl);
2060 return retval;
2061out_unlock:
2062 pte_unmap_unlock(pte, ptl);
2063out:
2064 return retval;
2065}
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
2090 struct page *page)
2091{
2092 if (addr < vma->vm_start || addr >= vma->vm_end)
2093 return -EFAULT;
2094 if (!page_count(page))
2095 return -EINVAL;
2096 vma->vm_flags |= VM_INSERTPAGE;
2097 return insert_page(vma, addr, page, vma->vm_page_prot);
2098}
2099EXPORT_SYMBOL(vm_insert_page);
2100
2101static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2102 unsigned long pfn, pgprot_t prot)
2103{
2104 struct mm_struct *mm = vma->vm_mm;
2105 int retval;
2106 pte_t *pte, entry;
2107 spinlock_t *ptl;
2108
2109 retval = -ENOMEM;
2110 pte = get_locked_pte(mm, addr, &ptl);
2111 if (!pte)
2112 goto out;
2113 retval = -EBUSY;
2114 if (!pte_none(*pte))
2115 goto out_unlock;
2116
2117
2118 entry = pte_mkspecial(pfn_pte(pfn, prot));
2119 set_pte_at(mm, addr, pte, entry);
2120 update_mmu_cache(vma, addr, pte);
2121
2122 retval = 0;
2123out_unlock:
2124 pte_unmap_unlock(pte, ptl);
2125out:
2126 return retval;
2127}
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2147 unsigned long pfn)
2148{
2149 int ret;
2150 pgprot_t pgprot = vma->vm_page_prot;
2151
2152
2153
2154
2155
2156
2157 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
2158 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
2159 (VM_PFNMAP|VM_MIXEDMAP));
2160 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
2161 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
2162
2163 if (addr < vma->vm_start || addr >= vma->vm_end)
2164 return -EFAULT;
2165 if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE))
2166 return -EINVAL;
2167
2168 ret = insert_pfn(vma, addr, pfn, pgprot);
2169
2170 if (ret)
2171 untrack_pfn_vma(vma, pfn, PAGE_SIZE);
2172
2173 return ret;
2174}
2175EXPORT_SYMBOL(vm_insert_pfn);
2176
2177int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
2178 unsigned long pfn)
2179{
2180 BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
2181
2182 if (addr < vma->vm_start || addr >= vma->vm_end)
2183 return -EFAULT;
2184
2185
2186
2187
2188
2189
2190
2191
2192 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
2193 struct page *page;
2194
2195 page = pfn_to_page(pfn);
2196 return insert_page(vma, addr, page, vma->vm_page_prot);
2197 }
2198 return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
2199}
2200EXPORT_SYMBOL(vm_insert_mixed);
2201
2202
2203
2204
2205
2206
2207static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
2208 unsigned long addr, unsigned long end,
2209 unsigned long pfn, pgprot_t prot)
2210{
2211 pte_t *pte;
2212 spinlock_t *ptl;
2213
2214 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
2215 if (!pte)
2216 return -ENOMEM;
2217 arch_enter_lazy_mmu_mode();
2218 do {
2219 BUG_ON(!pte_none(*pte));
2220 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
2221 pfn++;
2222 } while (pte++, addr += PAGE_SIZE, addr != end);
2223 arch_leave_lazy_mmu_mode();
2224 pte_unmap_unlock(pte - 1, ptl);
2225 return 0;
2226}
2227
2228static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
2229 unsigned long addr, unsigned long end,
2230 unsigned long pfn, pgprot_t prot)
2231{
2232 pmd_t *pmd;
2233 unsigned long next;
2234
2235 pfn -= addr >> PAGE_SHIFT;
2236 pmd = pmd_alloc(mm, pud, addr);
2237 if (!pmd)
2238 return -ENOMEM;
2239 VM_BUG_ON(pmd_trans_huge(*pmd));
2240 do {
2241 next = pmd_addr_end(addr, end);
2242 if (remap_pte_range(mm, pmd, addr, next,
2243 pfn + (addr >> PAGE_SHIFT), prot))
2244 return -ENOMEM;
2245 } while (pmd++, addr = next, addr != end);
2246 return 0;
2247}
2248
2249static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
2250 unsigned long addr, unsigned long end,
2251 unsigned long pfn, pgprot_t prot)
2252{
2253 pud_t *pud;
2254 unsigned long next;
2255
2256 pfn -= addr >> PAGE_SHIFT;
2257 pud = pud_alloc(mm, pgd, addr);
2258 if (!pud)
2259 return -ENOMEM;
2260 do {
2261 next = pud_addr_end(addr, end);
2262 if (remap_pmd_range(mm, pud, addr, next,
2263 pfn + (addr >> PAGE_SHIFT), prot))
2264 return -ENOMEM;
2265 } while (pud++, addr = next, addr != end);
2266 return 0;
2267}
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
2280 unsigned long pfn, unsigned long size, pgprot_t prot)
2281{
2282 pgd_t *pgd;
2283 unsigned long next;
2284 unsigned long end = addr + PAGE_ALIGN(size);
2285 struct mm_struct *mm = vma->vm_mm;
2286 int err;
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306 if (addr == vma->vm_start && end == vma->vm_end) {
2307 vma->vm_pgoff = pfn;
2308 vma->vm_flags |= VM_PFN_AT_MMAP;
2309 } else if (is_cow_mapping(vma->vm_flags))
2310 return -EINVAL;
2311
2312 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
2313
2314 err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size));
2315 if (err) {
2316
2317
2318
2319
2320 vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP);
2321 vma->vm_flags &= ~VM_PFN_AT_MMAP;
2322 return -EINVAL;
2323 }
2324
2325 BUG_ON(addr >= end);
2326 pfn -= addr >> PAGE_SHIFT;
2327 pgd = pgd_offset(mm, addr);
2328 flush_cache_range(vma, addr, end);
2329 do {
2330 next = pgd_addr_end(addr, end);
2331 err = remap_pud_range(mm, pgd, addr, next,
2332 pfn + (addr >> PAGE_SHIFT), prot);
2333 if (err)
2334 break;
2335 } while (pgd++, addr = next, addr != end);
2336
2337 if (err)
2338 untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size));
2339
2340 return err;
2341}
2342EXPORT_SYMBOL(remap_pfn_range);
2343
/*
 * apply_to_pte_range - call @fn on every pte in [addr, end) under one pmd
 *
 * For init_mm the pte table is obtained with pte_alloc_kernel() and no
 * pte lock is taken; for any other mm it is mapped and locked with
 * pte_alloc_map_lock().  Iteration stops at the first nonzero return from
 * @fn, which is then returned.  Returns -ENOMEM when the pte table cannot
 * be allocated.  The range must be non-empty.
 */
static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data)
{
	pte_t *pte;
	int err;
	pgtable_t token;
	/* Only assigned, and only used, when mm != &init_mm. */
	spinlock_t *uninitialized_var(ptl);

	pte = (mm == &init_mm) ?
		pte_alloc_kernel(pmd, addr) :
		pte_alloc_map_lock(mm, pmd, addr, &ptl);
	if (!pte)
		return -ENOMEM;

	BUG_ON(pmd_huge(*pmd));

	arch_enter_lazy_mmu_mode();

	token = pmd_pgtable(*pmd);

	do {
		/* pte is advanced even when fn fails; see pte-1 below. */
		err = fn(pte++, token, addr, data);
		if (err)
			break;
	} while (addr += PAGE_SIZE, addr != end);

	arch_leave_lazy_mmu_mode();

	/* pte is one past the last entry visited on both exit paths. */
	if (mm != &init_mm)
		pte_unmap_unlock(pte-1, ptl);
	return err;
}
2377
2378static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
2379 unsigned long addr, unsigned long end,
2380 pte_fn_t fn, void *data)
2381{
2382 pmd_t *pmd;
2383 unsigned long next;
2384 int err;
2385
2386 BUG_ON(pud_huge(*pud));
2387
2388 pmd = pmd_alloc(mm, pud, addr);
2389 if (!pmd)
2390 return -ENOMEM;
2391 do {
2392 next = pmd_addr_end(addr, end);
2393 err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
2394 if (err)
2395 break;
2396 } while (pmd++, addr = next, addr != end);
2397 return err;
2398}
2399
2400static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
2401 unsigned long addr, unsigned long end,
2402 pte_fn_t fn, void *data)
2403{
2404 pud_t *pud;
2405 unsigned long next;
2406 int err;
2407
2408 pud = pud_alloc(mm, pgd, addr);
2409 if (!pud)
2410 return -ENOMEM;
2411 do {
2412 next = pud_addr_end(addr, end);
2413 err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
2414 if (err)
2415 break;
2416 } while (pud++, addr = next, addr != end);
2417 return err;
2418}
2419
2420
2421
2422
2423
2424int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2425 unsigned long size, pte_fn_t fn, void *data)
2426{
2427 pgd_t *pgd;
2428 unsigned long next;
2429 unsigned long end = addr + size;
2430 int err;
2431
2432 BUG_ON(addr >= end);
2433 pgd = pgd_offset(mm, addr);
2434 do {
2435 next = pgd_addr_end(addr, end);
2436 err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
2437 if (err)
2438 break;
2439 } while (pgd++, addr = next, addr != end);
2440
2441 return err;
2442}
2443EXPORT_SYMBOL_GPL(apply_to_page_range);
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
2455 pte_t *page_table, pte_t orig_pte)
2456{
2457 int same = 1;
2458#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
2459 if (sizeof(pte_t) > sizeof(unsigned long)) {
2460 spinlock_t *ptl = pte_lockptr(mm, pmd);
2461 spin_lock(ptl);
2462 same = pte_same(*page_table, orig_pte);
2463 spin_unlock(ptl);
2464 }
2465#endif
2466 pte_unmap(page_table);
2467 return same;
2468}
2469
2470static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
2471{
2472
2473
2474
2475
2476
2477
2478 if (unlikely(!src)) {
2479 void *kaddr = kmap_atomic(dst);
2480 void __user *uaddr = (void __user *)(va & PAGE_MASK);
2481
2482
2483
2484
2485
2486
2487
2488 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
2489 clear_page(kaddr);
2490 kunmap_atomic(kaddr);
2491 flush_dcache_page(dst);
2492 } else
2493 copy_user_highpage(dst, src, va, vma);
2494}
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
/*
 * do_wp_page - handle a write fault on a present, write-protected pte
 * @mm: address space of the fault
 * @vma: vma covering @address
 * @address: faulting user address
 * @page_table: mapped pte for @address
 * @pmd: pmd entry covering @address
 * @ptl: pte lock; per the __releases() annotation it is held on entry and
 *       has been dropped by the time the function returns
 * @orig_pte: pte value observed by the caller
 *
 * Either upgrades the existing pte in place (label "reuse") or allocates a
 * new page, copies the old contents into it and installs it in place of
 * the old one (label "gotten").
 *
 * Returns a VM_FAULT_* mask: VM_FAULT_WRITE once the reuse or copy path
 * has updated the pte, VM_FAULT_OOM when an allocation or memcg charge
 * fails, the ->page_mkwrite() result on its error/NOPAGE paths, and 0
 * when the pte was found changed after the lock had been dropped.
 */
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		spinlock_t *ptl, pte_t orig_pte)
	__releases(ptl)
{
	struct page *old_page, *new_page;
	pte_t entry;
	int ret = 0;
	int page_mkwrite = 0;
	struct page *dirty_page = NULL;

	old_page = vm_normal_page(vma, address, orig_pte);
	if (!old_page) {
		/*
		 * No struct page behind the pte.  A shared writable mapping
		 * just gets the pte upgraded in place; anything else is
		 * copied (cow_user_page() copes with a NULL source).
		 */
		if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
				     (VM_WRITE|VM_SHARED))
			goto reuse;
		goto gotten;
	}

	/*
	 * Anonymous, non-KSM page: see whether it can simply be reused
	 * instead of copied.  That decision needs the page lock.
	 */
	if (PageAnon(old_page) && !PageKsm(old_page)) {
		if (!trylock_page(old_page)) {
			/*
			 * Cannot sleep for the page lock under the pte lock:
			 * pin the page, drop the pte lock, sleep, then retake
			 * the pte lock and make sure the pte is unchanged.
			 */
			page_cache_get(old_page);
			pte_unmap_unlock(page_table, ptl);
			lock_page(old_page);
			page_table = pte_offset_map_lock(mm, pmd, address,
							 &ptl);
			if (!pte_same(*page_table, orig_pte)) {
				/* "unlock" drops the pin taken above. */
				unlock_page(old_page);
				goto unlock;
			}
			page_cache_release(old_page);
		}
		if (reuse_swap_page(old_page)) {
			/*
			 * NOTE(review): reuse_swap_page() presumably reports
			 * that this mapping is the page's only user --
			 * confirm.  The page is re-homed to this vma and the
			 * pte upgraded in place.
			 */
			page_move_anon_rmap(old_page, vma, address);
			unlock_page(old_page);
			goto reuse;
		}
		unlock_page(old_page);
	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
					(VM_WRITE|VM_SHARED))) {
		/*
		 * Shared writable mapping of a normal page: no copy is made.
		 * Give the backing object a chance to veto or prepare for
		 * the write via ->page_mkwrite().
		 */
		if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
			struct vm_fault vmf;
			int tmp;

			vmf.virtual_address = (void __user *)(address &
								PAGE_MASK);
			vmf.pgoff = old_page->index;
			vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
			vmf.page = old_page;

			/*
			 * The callback is invoked without the pte lock, so
			 * pin the page first and drop the lock.
			 */
			page_cache_get(old_page);
			pte_unmap_unlock(page_table, ptl);

			tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
			if (unlikely(tmp &
					(VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
				ret = tmp;
				goto unwritable_page;
			}
			if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
				/*
				 * Callback did not return the page locked:
				 * lock it here and bail out (ret 0) if it has
				 * lost its mapping in the meantime.
				 */
				lock_page(old_page);
				if (!old_page->mapping) {
					ret = 0; /* retry the fault */
					unlock_page(old_page);
					goto unwritable_page;
				}
			} else
				VM_BUG_ON(!PageLocked(old_page));

			/*
			 * The pte lock was dropped above; retake it and
			 * verify the pte is still the one that faulted.
			 */
			page_table = pte_offset_map_lock(mm, pmd, address,
							 &ptl);
			if (!pte_same(*page_table, orig_pte)) {
				unlock_page(old_page);
				goto unlock;
			}

			page_mkwrite = 1;
		}
		dirty_page = old_page;
		get_page(dirty_page);

reuse:
		/* Upgrade the existing pte: young, dirty, maybe writable. */
		flush_cache_page(vma, address, pte_pfn(orig_pte));
		entry = pte_mkyoung(orig_pte);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		if (ptep_set_access_flags(vma, address, page_table, entry,1))
			update_mmu_cache(vma, address, page_table);
		pte_unmap_unlock(page_table, ptl);
		ret |= VM_FAULT_WRITE;

		/* Reached via "goto reuse": nothing to dirty. */
		if (!dirty_page)
			return ret;

		/*
		 * Shared page without ->page_mkwrite(): wait for any holder
		 * of the page lock, then dirty the page and update the file
		 * time here.
		 */
		if (!page_mkwrite) {
			wait_on_page_locked(dirty_page);
			set_page_dirty_balance(dirty_page, page_mkwrite);
			/* file_update_time outside page_lock */
			if (vma->vm_file)
				file_update_time(vma->vm_file);
		}
		/* Drops the get_page(dirty_page) above. */
		put_page(dirty_page);
		if (page_mkwrite) {
			/* Sample ->mapping while the page is still locked. */
			struct address_space *mapping = dirty_page->mapping;

			set_page_dirty(dirty_page);
			unlock_page(dirty_page);
			/* Drops the pin taken before ->page_mkwrite(). */
			page_cache_release(dirty_page);
			if (mapping)	{
				/*
				 * Throttle dirtying only if the page still
				 * had a mapping when sampled above.
				 */
				balance_dirty_pages_ratelimited(mapping);
			}
		}

		return ret;
	}

	/*
	 * Copy path: pin the old page so it stays around while the pte lock
	 * is dropped for the allocation and copy.
	 */
	page_cache_get(old_page);
gotten:
	pte_unmap_unlock(page_table, ptl);

	if (unlikely(anon_vma_prepare(vma)))
		goto oom;

	if (is_zero_pfn(pte_pfn(orig_pte))) {
		/* Source is the zero page: a zeroed page needs no copy. */
		new_page = alloc_zeroed_user_highpage_movable(vma, address);
		if (!new_page)
			goto oom;
	} else {
		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
		if (!new_page)
			goto oom;
		cow_user_page(new_page, old_page, address, vma);
	}
	__SetPageUptodate(new_page);

	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
		goto oom_free_new;

	/*
	 * Retake the pte lock and only install the copy if the pte is still
	 * the one that faulted.
	 */
	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (likely(pte_same(*page_table, orig_pte))) {
		if (old_page) {
			/* A file page is being replaced by an anon page. */
			if (!PageAnon(old_page)) {
				dec_mm_counter_fast(mm, MM_FILEPAGES);
				inc_mm_counter_fast(mm, MM_ANONPAGES);
			}
		} else
			inc_mm_counter_fast(mm, MM_ANONPAGES);
		flush_cache_page(vma, address, pte_pfn(orig_pte));
		entry = mk_pte(new_page, vma->vm_page_prot);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		/*
		 * Clear and flush the old pte before the new one is set, so
		 * the old translation is gone before the new page becomes
		 * visible.
		 */
		ptep_clear_flush(vma, address, page_table);
		page_add_new_anon_rmap(new_page, vma, address);
		/*
		 * set_pte_at_notify() rather than set_pte_at().
		 * NOTE(review): presumably so mmu notifier users see the
		 * pte change -- confirm.
		 */
		set_pte_at_notify(mm, address, page_table, entry);
		update_mmu_cache(vma, address, page_table);
		if (old_page) {
			/*
			 * The old page is no longer mapped by this pte.  The
			 * rmap is removed only after the new pte has been
			 * installed above.
			 */
			page_remove_rmap(old_page);
		}

		/*
		 * From here on new_page names the OLD page, so that the
		 * release below drops a reference on it instead of on the
		 * page that was just installed.
		 */
		new_page = old_page;
		ret |= VM_FAULT_WRITE;
	} else
		/* pte changed under us: the copy is unused, uncharge it. */
		mem_cgroup_uncharge_page(new_page);

	if (new_page)
		page_cache_release(new_page);
unlock:
	pte_unmap_unlock(page_table, ptl);
	if (old_page) {
		/*
		 * The old page was replaced in a VM_LOCKED vma: munlock it.
		 * Done after dropping the pte lock because lock_page() may
		 * sleep.
		 */
		if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
			lock_page(old_page);	/* LRU manipulation */
			munlock_vma_page(old_page);
			unlock_page(old_page);
		}
		/* Drops the pin taken before the pte lock was released. */
		page_cache_release(old_page);
	}
	return ret;
oom_free_new:
	page_cache_release(new_page);
oom:
	if (old_page) {
		if (page_mkwrite) {
			unlock_page(old_page);
			page_cache_release(old_page);
		}
		page_cache_release(old_page);
	}
	return VM_FAULT_OOM;

unwritable_page:
	/* Drops the pin taken before ->page_mkwrite() was called. */
	page_cache_release(old_page);
	return ret;
}
2796
/*
 * Zap the ptes of @vma in [start_addr, end_addr) by converting the
 * start/end pair into the start/length form zap_page_range_single() takes.
 */
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
		unsigned long start_addr, unsigned long end_addr,
		struct zap_details *details)
{
	unsigned long len = end_addr - start_addr;

	zap_page_range_single(vma, start_addr, len, details);
}
2803
2804static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
2805 struct zap_details *details)
2806{
2807 struct vm_area_struct *vma;
2808 struct prio_tree_iter iter;
2809 pgoff_t vba, vea, zba, zea;
2810
2811 vma_prio_tree_foreach(vma, &iter, root,
2812 details->first_index, details->last_index) {
2813
2814 vba = vma->vm_pgoff;
2815 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
2816
2817 zba = details->first_index;
2818 if (zba < vba)
2819 zba = vba;
2820 zea = details->last_index;
2821 if (zea > vea)
2822 zea = vea;
2823
2824 unmap_mapping_range_vma(vma,
2825 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
2826 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
2827 details);
2828 }
2829}
2830
2831static inline void unmap_mapping_range_list(struct list_head *head,
2832 struct zap_details *details)
2833{
2834 struct vm_area_struct *vma;
2835
2836
2837
2838
2839
2840
2841
2842 list_for_each_entry(vma, head, shared.vm_set.list) {
2843 details->nonlinear_vma = vma;
2844 unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
2845 }
2846}
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862void unmap_mapping_range(struct address_space *mapping,
2863 loff_t const holebegin, loff_t const holelen, int even_cows)
2864{
2865 struct zap_details details;
2866 pgoff_t hba = holebegin >> PAGE_SHIFT;
2867 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2868
2869
2870 if (sizeof(holelen) > sizeof(hlen)) {
2871 long long holeend =
2872 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2873 if (holeend & ~(long long)ULONG_MAX)
2874 hlen = ULONG_MAX - hba + 1;
2875 }
2876
2877 details.check_mapping = even_cows? NULL: mapping;
2878 details.nonlinear_vma = NULL;
2879 details.first_index = hba;
2880 details.last_index = hba + hlen - 1;
2881 if (details.last_index < details.first_index)
2882 details.last_index = ULONG_MAX;
2883
2884
2885 mutex_lock(&mapping->i_mmap_mutex);
2886 if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
2887 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2888 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
2889 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
2890 mutex_unlock(&mapping->i_mmap_mutex);
2891}
2892EXPORT_SYMBOL(unmap_mapping_range);
2893
2894
2895
2896
2897
2898
/*
 * do_swap_page - handle a fault on a pte that holds a swap-style entry
 * @mm: address space of the fault
 * @vma: vma covering @address
 * @address: faulting user address
 * @page_table: mapped (but not locked) pte for @address; unmapped here
 * @pmd: pmd entry covering @address
 * @flags: FAULT_FLAG_* flags of the fault
 * @orig_pte: pte value observed by the caller
 *
 * Brings the page back from swap (swap cache lookup, else readahead),
 * maps it at @address and, for a write fault that could not be satisfied
 * directly, chains into do_wp_page().
 *
 * Returns a VM_FAULT_* mask; 0 when the pte changed under us and there is
 * nothing left to do.
 */
static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		unsigned int flags, pte_t orig_pte)
{
	spinlock_t *ptl;
	struct page *page, *swapcache = NULL;
	swp_entry_t entry;
	pte_t pte;
	int locked;
	struct mem_cgroup *ptr;
	int exclusive = 0;
	int ret = 0;

	/* Unmaps page_table; bail out if the pte is no longer orig_pte. */
	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
		goto out;

	entry = pte_to_swp_entry(orig_pte);
	if (unlikely(non_swap_entry(entry))) {
		/* Not a real swap entry: migration, hwpoison, or garbage. */
		if (is_migration_entry(entry)) {
			migration_entry_wait(mm, pmd, address);
		} else if (is_hwpoison_entry(entry)) {
			ret = VM_FAULT_HWPOISON;
		} else {
			print_bad_pte(vma, address, orig_pte, NULL);
			ret = VM_FAULT_SIGBUS;
		}
		goto out;
	}
	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
	page = lookup_swap_cache(entry);
	if (!page) {
		page = swapin_readahead(entry,
					GFP_HIGHUSER_MOVABLE, vma, address);
		if (!page) {
			/*
			 * No page could be read in.  Only report OOM if the
			 * pte is still the swap entry; if it changed, someone
			 * else handled the fault and ret stays 0.
			 */
			page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
			if (likely(pte_same(*page_table, orig_pte)))
				ret = VM_FAULT_OOM;
			delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
			goto unlock;
		}

		/* Had to read the page from swap area: Major fault */
		ret = VM_FAULT_MAJOR;
		count_vm_event(PGMAJFAULT);
		mem_cgroup_count_vm_event(mm, PGMAJFAULT);
	} else if (PageHWPoison(page)) {
		/*
		 * The page found in the swap cache is hardware-poisoned:
		 * report that instead of mapping it.
		 */
		ret = VM_FAULT_HWPOISON;
		delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
		goto out_release;
	}

	/* May return 0 instead of sleeping, depending on flags. */
	locked = lock_page_or_retry(page, mm, flags);

	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
	if (!locked) {
		ret |= VM_FAULT_RETRY;
		goto out_release;
	}

	/*
	 * With the page locked, make sure it is still a swap cache page for
	 * this very entry.  If not, back out with ret unchanged.
	 */
	if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
		goto out_page;

	if (ksm_might_need_to_copy(page, vma, address)) {
		/*
		 * Work on the copy from here on, but remember the original
		 * swap cache page so it can be unlocked and released later.
		 */
		swapcache = page;
		page = ksm_does_need_to_copy(page, vma, address);

		if (unlikely(!page)) {
			/* Copy failed: restore page for the error path. */
			ret = VM_FAULT_OOM;
			page = swapcache;
			swapcache = NULL;
			goto out_page;
		}
	}

	if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
		ret = VM_FAULT_OOM;
		goto out_page;
	}

	/*
	 * Take the pte lock and re-validate: the pte must still be the swap
	 * entry and the page must have been read successfully.
	 */
	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (unlikely(!pte_same(*page_table, orig_pte)))
		goto out_nomap;

	if (unlikely(!PageUptodate(page))) {
		ret = VM_FAULT_SIGBUS;
		goto out_nomap;
	}

	/*
	 * Point of no return: account the page as anonymous, build the pte
	 * and install it.  The memcg charge is committed only after the
	 * anon rmap has been added.
	 */
	inc_mm_counter_fast(mm, MM_ANONPAGES);
	dec_mm_counter_fast(mm, MM_SWAPENTS);
	pte = mk_pte(page, vma->vm_page_prot);
	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
		/*
		 * Write fault and the page may be reused: map it dirty (and
		 * writable if the vma allows), and clear the write flag so
		 * do_wp_page() is not called below.
		 */
		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
		flags &= ~FAULT_FLAG_WRITE;
		ret |= VM_FAULT_WRITE;
		exclusive = 1;
	}
	flush_icache_page(vma, page);
	set_pte_at(mm, address, page_table, pte);
	do_page_add_anon_rmap(page, vma, address, exclusive);
	/* It's better to call commit-charge after rmap is established */
	mem_cgroup_commit_charge_swapin(page, ptr);

	swap_free(entry);
	if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
		try_to_free_swap(page);
	unlock_page(page);
	if (swapcache) {
		/*
		 * A KSM copy was mapped instead of the swap cache page; the
		 * original has been kept locked until now and is unlocked
		 * and released here.
		 */
		unlock_page(swapcache);
		page_cache_release(swapcache);
	}

	if (flags & FAULT_FLAG_WRITE) {
		/*
		 * Still a write fault: break COW now.  do_wp_page() drops
		 * the pte lock itself (see its __releases annotation), hence
		 * "goto out" rather than "unlock".
		 */
		ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
		if (ret & VM_FAULT_ERROR)
			ret &= VM_FAULT_ERROR;
		goto out;
	}

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, address, page_table);
unlock:
	pte_unmap_unlock(page_table, ptl);
out:
	return ret;
out_nomap:
	mem_cgroup_cancel_charge_swapin(ptr);
	pte_unmap_unlock(page_table, ptl);
out_page:
	unlock_page(page);
out_release:
	page_cache_release(page);
	if (swapcache) {
		unlock_page(swapcache);
		page_cache_release(swapcache);
	}
	return ret;
}
3076
3077
3078
3079
3080
3081
3082static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
3083{
3084 address &= PAGE_MASK;
3085 if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) {
3086 struct vm_area_struct *prev = vma->vm_prev;
3087
3088
3089
3090
3091
3092
3093
3094 if (prev && prev->vm_end == address)
3095 return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
3096
3097 expand_downwards(vma, address - PAGE_SIZE);
3098 }
3099 if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
3100 struct vm_area_struct *next = vma->vm_next;
3101
3102
3103 if (next && next->vm_start == address + PAGE_SIZE)
3104 return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
3105
3106 expand_upwards(vma, address + PAGE_SIZE);
3107 }
3108 return 0;
3109}
3110
3111
3112
3113
3114
3115
/*
 * do_anonymous_page - handle a fault on an empty pte of an anonymous mapping
 * @mm: address space of the fault
 * @vma: vma covering @address
 * @address: faulting user address
 * @page_table: mapped (but not locked) pte for @address; unmapped here
 * @pmd: pmd entry covering @address
 * @flags: FAULT_FLAG_* flags of the fault
 *
 * A read fault maps the zero pfn with a special pte; a write fault
 * allocates a fresh zeroed page.  If the pte has been filled in by the
 * time the lock is taken, nothing is installed and 0 is returned.
 *
 * Returns 0 on success, VM_FAULT_SIGBUS when the stack guard page check
 * fails, VM_FAULT_OOM when an allocation or memcg charge fails.
 */
static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		unsigned int flags)
{
	struct page *page;
	spinlock_t *ptl;
	pte_t entry;

	/* The pte is re-mapped, with its lock held, further down. */
	pte_unmap(page_table);

	/* Check if we need to add a guard page to the stack */
	if (check_stack_guard_page(vma, address) < 0)
		return VM_FAULT_SIGBUS;

	/* Use the zero-page for reads */
	if (!(flags & FAULT_FLAG_WRITE)) {
		entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
						vma->vm_page_prot));
		page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
		/* Somebody else filled the pte in the meantime: done. */
		if (!pte_none(*page_table))
			goto unlock;
		goto setpte;
	}

	/* Allocate our own private page. */
	if (unlikely(anon_vma_prepare(vma)))
		goto oom;
	page = alloc_zeroed_user_highpage_movable(vma, address);
	if (!page)
		goto oom;
	__SetPageUptodate(page);

	if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
		goto oom_free_page;

	entry = mk_pte(page, vma->vm_page_prot);
	if (vma->vm_flags & VM_WRITE)
		entry = pte_mkwrite(pte_mkdirty(entry));

	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
	/* Lost the race: throw the new page away again. */
	if (!pte_none(*page_table))
		goto release;

	inc_mm_counter_fast(mm, MM_ANONPAGES);
	page_add_new_anon_rmap(page, vma, address);
setpte:
	set_pte_at(mm, address, page_table, entry);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, address, page_table);
unlock:
	pte_unmap_unlock(page_table, ptl);
	return 0;
release:
	/* Undo the memcg charge and drop the unused page. */
	mem_cgroup_uncharge_page(page);
	page_cache_release(page);
	goto unlock;
oom_free_page:
	page_cache_release(page);
oom:
	return VM_FAULT_OOM;
}
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3193 unsigned long address, pmd_t *pmd,
3194 pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
3195{
3196 pte_t *page_table;
3197 spinlock_t *ptl;
3198 struct page *page;
3199 struct page *cow_page;
3200 pte_t entry;
3201 int anon = 0;
3202 struct page *dirty_page = NULL;
3203 struct vm_fault vmf;
3204 int ret;
3205 int page_mkwrite = 0;
3206
3207
3208
3209
3210
3211 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
3212
3213 if (unlikely(anon_vma_prepare(vma)))
3214 return VM_FAULT_OOM;
3215
3216 cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
3217 if (!cow_page)
3218 return VM_FAULT_OOM;
3219
3220 if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) {
3221 page_cache_release(cow_page);
3222 return VM_FAULT_OOM;
3223 }
3224 } else
3225 cow_page = NULL;
3226
3227 vmf.virtual_address = (void __user *)(address & PAGE_MASK);
3228 vmf.pgoff = pgoff;
3229 vmf.flags = flags;
3230 vmf.page = NULL;
3231
3232 ret = vma->vm_ops->fault(vma, &vmf);
3233 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
3234 VM_FAULT_RETRY)))
3235 goto uncharge_out;
3236
3237 if (unlikely(PageHWPoison(vmf.page))) {
3238 if (ret & VM_FAULT_LOCKED)
3239 unlock_page(vmf.page);
3240 ret = VM_FAULT_HWPOISON;
3241 goto uncharge_out;
3242 }
3243
3244
3245
3246
3247
3248 if (unlikely(!(ret & VM_FAULT_LOCKED)))
3249 lock_page(vmf.page);
3250 else
3251 VM_BUG_ON(!PageLocked(vmf.page));
3252
3253
3254
3255
3256 page = vmf.page;
3257 if (flags & FAULT_FLAG_WRITE) {
3258 if (!(vma->vm_flags & VM_SHARED)) {
3259 page = cow_page;
3260 anon = 1;
3261 copy_user_highpage(page, vmf.page, address, vma);
3262 __SetPageUptodate(page);
3263 } else {
3264
3265
3266
3267
3268
3269 if (vma->vm_ops->page_mkwrite) {
3270 int tmp;
3271
3272 unlock_page(page);
3273 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
3274 tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
3275 if (unlikely(tmp &
3276 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
3277 ret = tmp;
3278 goto unwritable_page;
3279 }
3280 if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
3281 lock_page(page);
3282 if (!page->mapping) {
3283 ret = 0;
3284 unlock_page(page);
3285 goto unwritable_page;
3286 }
3287 } else
3288 VM_BUG_ON(!PageLocked(page));
3289 page_mkwrite = 1;
3290 }
3291 }
3292
3293 }
3294
3295 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308 if (likely(pte_same(*page_table, orig_pte))) {
3309 flush_icache_page(vma, page);
3310 entry = mk_pte(page, vma->vm_page_prot);
3311 if (flags & FAULT_FLAG_WRITE)
3312 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
3313 if (anon) {
3314 inc_mm_counter_fast(mm, MM_ANONPAGES);
3315 page_add_new_anon_rmap(page, vma, address);
3316 } else {
3317 inc_mm_counter_fast(mm, MM_FILEPAGES);
3318 page_add_file_rmap(page);
3319 if (flags & FAULT_FLAG_WRITE) {
3320 dirty_page = page;
3321 get_page(dirty_page);
3322 }
3323 }
3324 set_pte_at(mm, address, page_table, entry);
3325
3326
3327 update_mmu_cache(vma, address, page_table);
3328 } else {
3329 if (cow_page)
3330 mem_cgroup_uncharge_page(cow_page);
3331 if (anon)
3332 page_cache_release(page);
3333 else
3334 anon = 1;
3335 }
3336
3337 pte_unmap_unlock(page_table, ptl);
3338
3339 if (dirty_page) {
3340 struct address_space *mapping = page->mapping;
3341 int dirtied = 0;
3342
3343 if (set_page_dirty(dirty_page))
3344 dirtied = 1;
3345 unlock_page(dirty_page);
3346 put_page(dirty_page);
3347 if ((dirtied || page_mkwrite) && mapping) {
3348
3349
3350
3351
3352 balance_dirty_pages_ratelimited(mapping);
3353 }
3354
3355
3356 if (vma->vm_file && !page_mkwrite)
3357 file_update_time(vma->vm_file);
3358 } else {
3359 unlock_page(vmf.page);
3360 if (anon)
3361 page_cache_release(vmf.page);
3362 }
3363
3364 return ret;
3365
3366unwritable_page:
3367 page_cache_release(page);
3368 return ret;
3369uncharge_out:
3370
3371 if (cow_page) {
3372 mem_cgroup_uncharge_page(cow_page);
3373 page_cache_release(cow_page);
3374 }
3375 return ret;
3376}
3377
3378static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3379 unsigned long address, pte_t *page_table, pmd_t *pmd,
3380 unsigned int flags, pte_t orig_pte)
3381{
3382 pgoff_t pgoff = (((address & PAGE_MASK)
3383 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
3384
3385 pte_unmap(page_table);
3386 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3387}
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3399 unsigned long address, pte_t *page_table, pmd_t *pmd,
3400 unsigned int flags, pte_t orig_pte)
3401{
3402 pgoff_t pgoff;
3403
3404 flags |= FAULT_FLAG_NONLINEAR;
3405
3406 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
3407 return 0;
3408
3409 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
3410
3411
3412
3413 print_bad_pte(vma, address, orig_pte, NULL);
3414 return VM_FAULT_SIGBUS;
3415 }
3416
3417 pgoff = pte_to_pgoff(orig_pte);
3418 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3419}
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434int handle_pte_fault(struct mm_struct *mm,
3435 struct vm_area_struct *vma, unsigned long address,
3436 pte_t *pte, pmd_t *pmd, unsigned int flags)
3437{
3438 pte_t entry;
3439 spinlock_t *ptl;
3440
3441 entry = *pte;
3442 if (!pte_present(entry)) {
3443 if (pte_none(entry)) {
3444 if (vma->vm_ops) {
3445 if (likely(vma->vm_ops->fault))
3446 return do_linear_fault(mm, vma, address,
3447 pte, pmd, flags, entry);
3448 }
3449 return do_anonymous_page(mm, vma, address,
3450 pte, pmd, flags);
3451 }
3452 if (pte_file(entry))
3453 return do_nonlinear_fault(mm, vma, address,
3454 pte, pmd, flags, entry);
3455 return do_swap_page(mm, vma, address,
3456 pte, pmd, flags, entry);
3457 }
3458
3459 ptl = pte_lockptr(mm, pmd);
3460 spin_lock(ptl);
3461 if (unlikely(!pte_same(*pte, entry)))
3462 goto unlock;
3463 if (flags & FAULT_FLAG_WRITE) {
3464 if (!pte_write(entry))
3465 return do_wp_page(mm, vma, address,
3466 pte, pmd, ptl, entry);
3467 entry = pte_mkdirty(entry);
3468 }
3469 entry = pte_mkyoung(entry);
3470 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
3471 update_mmu_cache(vma, address, pte);
3472 } else {
3473
3474
3475
3476
3477
3478
3479 if (flags & FAULT_FLAG_WRITE)
3480 flush_tlb_fix_spurious_fault(vma, address);
3481 }
3482unlock:
3483 pte_unmap_unlock(pte, ptl);
3484 return 0;
3485}
3486
3487
3488
3489
3490int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3491 unsigned long address, unsigned int flags)
3492{
3493 pgd_t *pgd;
3494 pud_t *pud;
3495 pmd_t *pmd;
3496 pte_t *pte;
3497
3498 __set_current_state(TASK_RUNNING);
3499
3500 count_vm_event(PGFAULT);
3501 mem_cgroup_count_vm_event(mm, PGFAULT);
3502
3503
3504 check_sync_rss_stat(current);
3505
3506 if (unlikely(is_vm_hugetlb_page(vma)))
3507 return hugetlb_fault(mm, vma, address, flags);
3508
3509retry:
3510 pgd = pgd_offset(mm, address);
3511 pud = pud_alloc(mm, pgd, address);
3512 if (!pud)
3513 return VM_FAULT_OOM;
3514 pmd = pmd_alloc(mm, pud, address);
3515 if (!pmd)
3516 return VM_FAULT_OOM;
3517 if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
3518 if (!vma->vm_ops)
3519 return do_huge_pmd_anonymous_page(mm, vma, address,
3520 pmd, flags);
3521 } else {
3522 pmd_t orig_pmd = *pmd;
3523 int ret;
3524
3525 barrier();
3526 if (pmd_trans_huge(orig_pmd)) {
3527 if (flags & FAULT_FLAG_WRITE &&
3528 !pmd_write(orig_pmd) &&
3529 !pmd_trans_splitting(orig_pmd)) {
3530 ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
3531 orig_pmd);
3532
3533
3534
3535
3536
3537 if (unlikely(ret & VM_FAULT_OOM))
3538 goto retry;
3539 return ret;
3540 }
3541 return 0;
3542 }
3543 }
3544
3545
3546
3547
3548
3549
3550 if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address))
3551 return VM_FAULT_OOM;
3552
3553 if (unlikely(pmd_trans_huge(*pmd)))
3554 return 0;
3555
3556
3557
3558
3559
3560
3561 pte = pte_offset_map(pmd, address);
3562
3563 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
3564}
3565
3566#ifndef __PAGETABLE_PUD_FOLDED
3567
3568
3569
3570
3571int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
3572{
3573 pud_t *new = pud_alloc_one(mm, address);
3574 if (!new)
3575 return -ENOMEM;
3576
3577 smp_wmb();
3578
3579 spin_lock(&mm->page_table_lock);
3580 if (pgd_present(*pgd))
3581 pud_free(mm, new);
3582 else
3583 pgd_populate(mm, pgd, new);
3584 spin_unlock(&mm->page_table_lock);
3585 return 0;
3586}
3587#endif
3588
3589#ifndef __PAGETABLE_PMD_FOLDED
3590
3591
3592
3593
3594int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
3595{
3596 pmd_t *new = pmd_alloc_one(mm, address);
3597 if (!new)
3598 return -ENOMEM;
3599
3600 smp_wmb();
3601
3602 spin_lock(&mm->page_table_lock);
3603#ifndef __ARCH_HAS_4LEVEL_HACK
3604 if (pud_present(*pud))
3605 pmd_free(mm, new);
3606 else
3607 pud_populate(mm, pud, new);
3608#else
3609 if (pgd_present(*pud))
3610 pmd_free(mm, new);
3611 else
3612 pgd_populate(mm, pud, new);
3613#endif
3614 spin_unlock(&mm->page_table_lock);
3615 return 0;
3616}
3617#endif
3618
3619int make_pages_present(unsigned long addr, unsigned long end)
3620{
3621 int ret, len, write;
3622 struct vm_area_struct * vma;
3623
3624 vma = find_vma(current->mm, addr);
3625 if (!vma)
3626 return -ENOMEM;
3627
3628
3629
3630
3631
3632 write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
3633 BUG_ON(addr >= end);
3634 BUG_ON(end > vma->vm_end);
3635 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
3636 ret = get_user_pages(current, current->mm, addr,
3637 len, write, 0, NULL, NULL);
3638 if (ret < 0)
3639 return ret;
3640 return ret == len ? 0 : -EFAULT;
3641}
3642
3643#if !defined(__HAVE_ARCH_GATE_AREA)
3644
3645#if defined(AT_SYSINFO_EHDR)
3646static struct vm_area_struct gate_vma;
3647
3648static int __init gate_vma_init(void)
3649{
3650 gate_vma.vm_mm = NULL;
3651 gate_vma.vm_start = FIXADDR_USER_START;
3652 gate_vma.vm_end = FIXADDR_USER_END;
3653 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
3654 gate_vma.vm_page_prot = __P101;
3655
3656 return 0;
3657}
3658__initcall(gate_vma_init);
3659#endif
3660
/*
 * get_gate_vma - return the gate vma, or NULL when AT_SYSINFO_EHDR is not
 * defined.  @mm is not used by this generic version.
 */
struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
	struct vm_area_struct *gate = NULL;

#ifdef AT_SYSINFO_EHDR
	gate = &gate_vma;
#endif
	return gate;
}
3669
/*
 * in_gate_area_no_mm - return 1 if @addr lies inside
 * [FIXADDR_USER_START, FIXADDR_USER_END), 0 otherwise.  Always 0 when
 * AT_SYSINFO_EHDR is not defined.
 */
int in_gate_area_no_mm(unsigned long addr)
{
	int inside = 0;

#ifdef AT_SYSINFO_EHDR
	inside = (addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END);
#endif
	return inside;
}
3678
3679#endif
3680
3681static int __follow_pte(struct mm_struct *mm, unsigned long address,
3682 pte_t **ptepp, spinlock_t **ptlp)
3683{
3684 pgd_t *pgd;
3685 pud_t *pud;
3686 pmd_t *pmd;
3687 pte_t *ptep;
3688
3689 pgd = pgd_offset(mm, address);
3690 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
3691 goto out;
3692
3693 pud = pud_offset(pgd, address);
3694 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
3695 goto out;
3696
3697 pmd = pmd_offset(pud, address);
3698 VM_BUG_ON(pmd_trans_huge(*pmd));
3699 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
3700 goto out;
3701
3702
3703 if (pmd_huge(*pmd))
3704 goto out;
3705
3706 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
3707 if (!ptep)
3708 goto out;
3709 if (!pte_present(*ptep))
3710 goto unlock;
3711 *ptepp = ptep;
3712 return 0;
3713unlock:
3714 pte_unmap_unlock(ptep, *ptlp);
3715out:
3716 return -EINVAL;
3717}
3718
3719static inline int follow_pte(struct mm_struct *mm, unsigned long address,
3720 pte_t **ptepp, spinlock_t **ptlp)
3721{
3722 int res;
3723
3724
3725 (void) __cond_lock(*ptlp,
3726 !(res = __follow_pte(mm, address, ptepp, ptlp)));
3727 return res;
3728}
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740int follow_pfn(struct vm_area_struct *vma, unsigned long address,
3741 unsigned long *pfn)
3742{
3743 int ret = -EINVAL;
3744 spinlock_t *ptl;
3745 pte_t *ptep;
3746
3747 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3748 return ret;
3749
3750 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
3751 if (ret)
3752 return ret;
3753 *pfn = pte_pfn(*ptep);
3754 pte_unmap_unlock(ptep, ptl);
3755 return 0;
3756}
3757EXPORT_SYMBOL(follow_pfn);
3758
3759#ifdef CONFIG_HAVE_IOREMAP_PROT
3760int follow_phys(struct vm_area_struct *vma,
3761 unsigned long address, unsigned int flags,
3762 unsigned long *prot, resource_size_t *phys)
3763{
3764 int ret = -EINVAL;
3765 pte_t *ptep, pte;
3766 spinlock_t *ptl;
3767
3768 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3769 goto out;
3770
3771 if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
3772 goto out;
3773 pte = *ptep;
3774
3775 if ((flags & FOLL_WRITE) && !pte_write(pte))
3776 goto unlock;
3777
3778 *prot = pgprot_val(pte_pgprot(pte));
3779 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
3780
3781 ret = 0;
3782unlock:
3783 pte_unmap_unlock(ptep, ptl);
3784out:
3785 return ret;
3786}
3787
3788int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
3789 void *buf, int len, int write)
3790{
3791 resource_size_t phys_addr;
3792 unsigned long prot = 0;
3793 void __iomem *maddr;
3794 int offset = addr & (PAGE_SIZE-1);
3795
3796 if (follow_phys(vma, addr, write, &prot, &phys_addr))
3797 return -EINVAL;
3798
3799 maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
3800 if (write)
3801 memcpy_toio(maddr + offset, buf, len);
3802 else
3803 memcpy_fromio(buf, maddr + offset, len);
3804 iounmap(maddr);
3805
3806 return len;
3807}
3808#endif
3809
3810
3811
3812
3813
3814static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
3815 unsigned long addr, void *buf, int len, int write)
3816{
3817 struct vm_area_struct *vma;
3818 void *old_buf = buf;
3819
3820 down_read(&mm->mmap_sem);
3821
3822 while (len) {
3823 int bytes, ret, offset;
3824 void *maddr;
3825 struct page *page = NULL;
3826
3827 ret = get_user_pages(tsk, mm, addr, 1,
3828 write, 1, &page, &vma);
3829 if (ret <= 0) {
3830
3831
3832
3833
3834#ifdef CONFIG_HAVE_IOREMAP_PROT
3835 vma = find_vma(mm, addr);
3836 if (!vma || vma->vm_start > addr)
3837 break;
3838 if (vma->vm_ops && vma->vm_ops->access)
3839 ret = vma->vm_ops->access(vma, addr, buf,
3840 len, write);
3841 if (ret <= 0)
3842#endif
3843 break;
3844 bytes = ret;
3845 } else {
3846 bytes = len;
3847 offset = addr & (PAGE_SIZE-1);
3848 if (bytes > PAGE_SIZE-offset)
3849 bytes = PAGE_SIZE-offset;
3850
3851 maddr = kmap(page);
3852 if (write) {
3853 copy_to_user_page(vma, page, addr,
3854 maddr + offset, buf, bytes);
3855 set_page_dirty_lock(page);
3856 } else {
3857 copy_from_user_page(vma, page, addr,
3858 buf, maddr + offset, bytes);
3859 }
3860 kunmap(page);
3861 page_cache_release(page);
3862 }
3863 len -= bytes;
3864 buf += bytes;
3865 addr += bytes;
3866 }
3867 up_read(&mm->mmap_sem);
3868
3869 return buf - old_buf;
3870}
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
/*
 * access_remote_vm - access the address space @mm without naming a task.
 *
 * Thin wrapper that calls __access_remote_vm() with a NULL task.
 * Returns the number of bytes transferred.
 */
int access_remote_vm(struct mm_struct *mm, unsigned long addr,
		void *buf, int len, int write)
{
	int copied;

	copied = __access_remote_vm(NULL, mm, addr, buf, len, write);
	return copied;
}
3887
3888
3889
3890
3891
3892
/*
 * access_process_vm - access the address space of task @tsk.
 *
 * Takes a reference on the task's mm for the duration of the access.
 * Returns the number of bytes transferred, or 0 if the task has no mm.
 */
int access_process_vm(struct task_struct *tsk, unsigned long addr,
		void *buf, int len, int write)
{
	struct mm_struct *mm = get_task_mm(tsk);
	int copied = 0;

	if (mm) {
		copied = __access_remote_vm(tsk, mm, addr, buf, len, write);
		mmput(mm);
	}
	return copied;
}
3908
3909
3910
3911
3912void print_vma_addr(char *prefix, unsigned long ip)
3913{
3914 struct mm_struct *mm = current->mm;
3915 struct vm_area_struct *vma;
3916
3917
3918
3919
3920
3921 if (preempt_count())
3922 return;
3923
3924 down_read(&mm->mmap_sem);
3925 vma = find_vma(mm, ip);
3926 if (vma && vma->vm_file) {
3927 struct file *f = vma->vm_file;
3928 char *buf = (char *)__get_free_page(GFP_KERNEL);
3929 if (buf) {
3930 char *p, *s;
3931
3932 p = d_path(&f->f_path, buf, PAGE_SIZE);
3933 if (IS_ERR(p))
3934 p = "?";
3935 s = strrchr(p, '/');
3936 if (s)
3937 p = s+1;
3938 printk("%s%s[%lx+%lx]", prefix, p,
3939 vma->vm_start,
3940 vma->vm_end - vma->vm_start);
3941 free_page((unsigned long)buf);
3942 }
3943 }
3944 up_read(&mm->mmap_sem);
3945}
3946
3947#ifdef CONFIG_PROVE_LOCKING
3948void might_fault(void)
3949{
3950
3951
3952
3953
3954
3955
3956 if (segment_eq(get_fs(), KERNEL_DS))
3957 return;
3958
3959 might_sleep();
3960
3961
3962
3963
3964
3965 if (!in_atomic() && current->mm)
3966 might_lock_read(¤t->mm->mmap_sem);
3967}
3968EXPORT_SYMBOL(might_fault);
3969#endif
3970
3971#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
3972static void clear_gigantic_page(struct page *page,
3973 unsigned long addr,
3974 unsigned int pages_per_huge_page)
3975{
3976 int i;
3977 struct page *p = page;
3978
3979 might_sleep();
3980 for (i = 0; i < pages_per_huge_page;
3981 i++, p = mem_map_next(p, page, i)) {
3982 cond_resched();
3983 clear_user_highpage(p, addr + i * PAGE_SIZE);
3984 }
3985}
3986void clear_huge_page(struct page *page,
3987 unsigned long addr, unsigned int pages_per_huge_page)
3988{
3989 int i;
3990
3991 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
3992 clear_gigantic_page(page, addr, pages_per_huge_page);
3993 return;
3994 }
3995
3996 might_sleep();
3997 for (i = 0; i < pages_per_huge_page; i++) {
3998 cond_resched();
3999 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
4000 }
4001}
4002
4003static void copy_user_gigantic_page(struct page *dst, struct page *src,
4004 unsigned long addr,
4005 struct vm_area_struct *vma,
4006 unsigned int pages_per_huge_page)
4007{
4008 int i;
4009 struct page *dst_base = dst;
4010 struct page *src_base = src;
4011
4012 for (i = 0; i < pages_per_huge_page; ) {
4013 cond_resched();
4014 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
4015
4016 i++;
4017 dst = mem_map_next(dst, dst_base, i);
4018 src = mem_map_next(src, src_base, i);
4019 }
4020}
4021
4022void copy_user_huge_page(struct page *dst, struct page *src,
4023 unsigned long addr, struct vm_area_struct *vma,
4024 unsigned int pages_per_huge_page)
4025{
4026 int i;
4027
4028 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
4029 copy_user_gigantic_page(dst, src, addr, vma,
4030 pages_per_huge_page);
4031 return;
4032 }
4033
4034 might_sleep();
4035 for (i = 0; i < pages_per_huge_page; i++) {
4036 cond_resched();
4037 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
4038 }
4039}
4040#endif
4041