/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

41#include <linux/kernel_stat.h>
42#include <linux/mm.h>
43#include <linux/hugetlb.h>
44#include <linux/mman.h>
45#include <linux/swap.h>
46#include <linux/highmem.h>
47#include <linux/pagemap.h>
48#include <linux/ksm.h>
49#include <linux/rmap.h>
50#include <linux/module.h>
51#include <linux/delayacct.h>
52#include <linux/init.h>
53#include <linux/writeback.h>
54#include <linux/memcontrol.h>
55#include <linux/mmu_notifier.h>
56#include <linux/kallsyms.h>
57#include <linux/swapops.h>
58#include <linux/elf.h>
59#include <linux/gfp.h>
60
61#include <asm/io.h>
62#include <asm/pgalloc.h>
63#include <asm/uaccess.h>
64#include <asm/tlb.h>
65#include <asm/tlbflush.h>
66#include <asm/pgtable.h>
67
68#include "internal.h"
69
70#ifndef CONFIG_NEED_MULTIPLE_NODES
71
72unsigned long max_mapnr;
73struct page *mem_map;
74
75EXPORT_SYMBOL(max_mapnr);
76EXPORT_SYMBOL(mem_map);
77#endif
78
79unsigned long num_physpages;
80
81
82
83
84
85
86
87void * high_memory;
88
89EXPORT_SYMBOL(num_physpages);
90EXPORT_SYMBOL(high_memory);
91
/*
 * Randomize the address space (stacks, mmaps, brk, etc.).
 *
 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
 *   as ancient (libc5 based) binaries can segfault. )
 */
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
					1;
#else
					2;
#endif
104
105static int __init disable_randmaps(char *s)
106{
107 randomize_va_space = 0;
108 return 1;
109}
110__setup("norandmaps", disable_randmaps);
111
112unsigned long zero_pfn __read_mostly;
113unsigned long highest_memmap_pfn __read_mostly;
114
115
116
117
118static int __init init_zero_pfn(void)
119{
120 zero_pfn = page_to_pfn(ZERO_PAGE(0));
121 return 0;
122}
123core_initcall(init_zero_pfn);
124
125
126#if defined(SPLIT_RSS_COUNTING)
127
128static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm)
129{
130 int i;
131
132 for (i = 0; i < NR_MM_COUNTERS; i++) {
133 if (task->rss_stat.count[i]) {
134 add_mm_counter(mm, i, task->rss_stat.count[i]);
135 task->rss_stat.count[i] = 0;
136 }
137 }
138 task->rss_stat.events = 0;
139}
140
141static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
142{
143 struct task_struct *task = current;
144
145 if (likely(task->mm == mm))
146 task->rss_stat.count[member] += val;
147 else
148 add_mm_counter(mm, member, val);
149}
150#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
151#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)

/* sync the per-task counters back to the mm once per 64 page faults */
#define TASK_RSS_EVENTS_THRESH	(64)
155static void check_sync_rss_stat(struct task_struct *task)
156{
157 if (unlikely(task != current))
158 return;
159 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
160 __sync_task_rss_stat(task, task->mm);
161}
162
unsigned long get_mm_counter(struct mm_struct *mm, int member)
{
	long val = 0;

	/*
	 * Per-task RSS deltas may not yet have been folded back into the
	 * mm-wide counter, so the value read here can transiently go
	 * negative; report 0 to callers in that case.
	 */
	val = atomic_long_read(&mm->rss_stat.count[member]);
	if (val < 0)
		return 0;
	return (unsigned long)val;
}
180
181void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
182{
183 __sync_task_rss_stat(task, mm);
184}
185#else
186
187#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
188#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
189
190static void check_sync_rss_stat(struct task_struct *task)
191{
192}
193
194#endif
195
196#ifdef HAVE_GENERIC_MMU_GATHER
197
198static int tlb_next_batch(struct mmu_gather *tlb)
199{
200 struct mmu_gather_batch *batch;
201
202 batch = tlb->active;
203 if (batch->next) {
204 tlb->active = batch->next;
205 return 1;
206 }
207
208 batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
209 if (!batch)
210 return 0;
211
212 batch->next = NULL;
213 batch->nr = 0;
214 batch->max = MAX_GATHER_BATCH;
215
216 tlb->active->next = batch;
217 tlb->active = batch;
218
219 return 1;
220}

/*
 * Called to initialize an (on-stack) mmu_gather structure for page-table
 * tear-down from @mm.  The @fullmm argument is used when @mm is without
 * users and we're going to destroy the full address space (exit/execve).
 */
227void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
228{
229 tlb->mm = mm;
230
231 tlb->fullmm = fullmm;
232 tlb->need_flush = 0;
233 tlb->fast_mode = (num_possible_cpus() == 1);
234 tlb->local.next = NULL;
235 tlb->local.nr = 0;
236 tlb->local.max = ARRAY_SIZE(tlb->__pages);
237 tlb->active = &tlb->local;
238
239#ifdef CONFIG_HAVE_RCU_TABLE_FREE
240 tlb->batch = NULL;
241#endif
242}
243
244void tlb_flush_mmu(struct mmu_gather *tlb)
245{
246 struct mmu_gather_batch *batch;
247
248 if (!tlb->need_flush)
249 return;
250 tlb->need_flush = 0;
251 tlb_flush(tlb);
252#ifdef CONFIG_HAVE_RCU_TABLE_FREE
253 tlb_table_flush(tlb);
254#endif
255
256 if (tlb_fast_mode(tlb))
257 return;
258
259 for (batch = &tlb->local; batch; batch = batch->next) {
260 free_pages_and_swap_cache(batch->pages, batch->nr);
261 batch->nr = 0;
262 }
263 tlb->active = &tlb->local;
264}

/*
 * Called at the end of the shootdown operation to free up any resources
 * that were required.
 */
270void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
271{
272 struct mmu_gather_batch *batch, *next;
273
274 tlb_flush_mmu(tlb);
275
276
277 check_pgt_cache();
278
279 for (batch = tlb->local.next; batch; batch = next) {
280 next = batch->next;
281 free_pages((unsigned long)batch, 0);
282 }
283 tlb->local.next = NULL;
284}
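
/*
 * Illustrative sketch (not part of the original file): the usual calling
 * pattern for the generic mmu_gather API above, as used by zap_page_range()
 * later in this file and by unmap_region()/exit_mmap() in mm/mmap.c.
 * "vma", "start", "end" and "nr_accounted" are assumed to be set up by
 * the caller; the free_pgtables() step only applies when the page tables
 * themselves are being torn down (munmap/exit):
 *
 *	struct mmu_gather tlb;
 *
 *	lru_add_drain();
 *	tlb_gather_mmu(&tlb, mm, 0);
 *	update_hiwater_rss(mm);
 *	unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
 *	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
 *	tlb_finish_mmu(&tlb, start, end);
 */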

/*
 * __tlb_remove_page
 *	Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)),
 *	while handling the additional races in SMP caused by other CPUs
 *	caching valid mappings in their TLBs.  Returns the number of free
 *	batch slots left, or 0 if the batch filled up and the caller must
 *	flush (via tlb_flush_mmu()) before queueing further pages.
 */
292int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
293{
294 struct mmu_gather_batch *batch;
295
296 tlb->need_flush = 1;
297
298 if (tlb_fast_mode(tlb)) {
299 free_page_and_swap_cache(page);
300 return 1;
301 }
302
303 batch = tlb->active;
304 batch->pages[batch->nr++] = page;
305 if (batch->nr == batch->max) {
306 if (!tlb_next_batch(tlb))
307 return 0;
308 batch = tlb->active;
309 }
310 VM_BUG_ON(batch->nr > batch->max);
311
312 return batch->max - batch->nr;
313}
314
315#endif
316
317#ifdef CONFIG_HAVE_RCU_TABLE_FREE
318
319
320
321
322
323static void tlb_remove_table_smp_sync(void *arg)
324{
325
326}
327
328static void tlb_remove_table_one(void *table)
329{
330
331
332
333
334
335
336
337 smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
338 __tlb_remove_table(table);
339}
340
341static void tlb_remove_table_rcu(struct rcu_head *head)
342{
343 struct mmu_table_batch *batch;
344 int i;
345
346 batch = container_of(head, struct mmu_table_batch, rcu);
347
348 for (i = 0; i < batch->nr; i++)
349 __tlb_remove_table(batch->tables[i]);
350
351 free_page((unsigned long)batch);
352}
353
354void tlb_table_flush(struct mmu_gather *tlb)
355{
356 struct mmu_table_batch **batch = &tlb->batch;
357
358 if (*batch) {
359 call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
360 *batch = NULL;
361 }
362}
363
364void tlb_remove_table(struct mmu_gather *tlb, void *table)
365{
366 struct mmu_table_batch **batch = &tlb->batch;
367
368 tlb->need_flush = 1;

	/*
	 * When there are fewer than two users of this mm there cannot be
	 * a concurrent page-table walk, so the table can be freed
	 * immediately instead of being batched through RCU.
	 */
374 if (atomic_read(&tlb->mm->mm_users) < 2) {
375 __tlb_remove_table(table);
376 return;
377 }
378
379 if (*batch == NULL) {
380 *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
381 if (*batch == NULL) {
382 tlb_remove_table_one(table);
383 return;
384 }
385 (*batch)->nr = 0;
386 }
387 (*batch)->tables[(*batch)->nr++] = table;
388 if ((*batch)->nr == MAX_TABLE_BATCH)
389 tlb_table_flush(tlb);
390}
391
392#endif
393

/*
 * If a p?d_bad entry is found while walking page tables, report
 * the error, before resetting the entry to p?d_none.  Usually (but
 * very seldom) called out from the p?d_none_or_clear_bad macros.
 */
400void pgd_clear_bad(pgd_t *pgd)
401{
402 pgd_ERROR(*pgd);
403 pgd_clear(pgd);
404}
405
406void pud_clear_bad(pud_t *pud)
407{
408 pud_ERROR(*pud);
409 pud_clear(pud);
410}
411
412void pmd_clear_bad(pmd_t *pmd)
413{
414 pmd_ERROR(*pmd);
415 pmd_clear(pmd);
416}
417
/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
422static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
423 unsigned long addr)
424{
425 pgtable_t token = pmd_pgtable(*pmd);
426 pmd_clear(pmd);
427 pte_free_tlb(tlb, token, addr);
428 tlb->mm->nr_ptes--;
429}
430
431static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
432 unsigned long addr, unsigned long end,
433 unsigned long floor, unsigned long ceiling)
434{
435 pmd_t *pmd;
436 unsigned long next;
437 unsigned long start;
438
439 start = addr;
440 pmd = pmd_offset(pud, addr);
441 do {
442 next = pmd_addr_end(addr, end);
443 if (pmd_none_or_clear_bad(pmd))
444 continue;
445 free_pte_range(tlb, pmd, addr);
446 } while (pmd++, addr = next, addr != end);
447
448 start &= PUD_MASK;
449 if (start < floor)
450 return;
451 if (ceiling) {
452 ceiling &= PUD_MASK;
453 if (!ceiling)
454 return;
455 }
456 if (end - 1 > ceiling - 1)
457 return;
458
459 pmd = pmd_offset(pud, start);
460 pud_clear(pud);
461 pmd_free_tlb(tlb, pmd, start);
462}
463
464static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
465 unsigned long addr, unsigned long end,
466 unsigned long floor, unsigned long ceiling)
467{
468 pud_t *pud;
469 unsigned long next;
470 unsigned long start;
471
472 start = addr;
473 pud = pud_offset(pgd, addr);
474 do {
475 next = pud_addr_end(addr, end);
476 if (pud_none_or_clear_bad(pud))
477 continue;
478 free_pmd_range(tlb, pud, addr, next, floor, ceiling);
479 } while (pud++, addr = next, addr != end);
480
481 start &= PGDIR_MASK;
482 if (start < floor)
483 return;
484 if (ceiling) {
485 ceiling &= PGDIR_MASK;
486 if (!ceiling)
487 return;
488 }
489 if (end - 1 > ceiling - 1)
490 return;
491
492 pud = pud_offset(pgd, start);
493 pgd_clear(pgd);
494 pud_free_tlb(tlb, pud, start);
495}
496
/*
 * This function frees user-level page tables of a process.
 *
 * Must be called with pagetable lock held.
 */
502void free_pgd_range(struct mmu_gather *tlb,
503 unsigned long addr, unsigned long end,
504 unsigned long floor, unsigned long ceiling)
505{
506 pgd_t *pgd;
507 unsigned long next;
508
	/*
	 * The floor/ceiling handling below is subtle.
	 *
	 * Why test PMD* at this top level?  Because often there will be
	 * no work to do at all, and we'd prefer not to go all the way
	 * down to the bottom just to discover that.
	 *
	 * Why all the "- 1"s?  Because 0 represents both the bottom of
	 * the address space and the top of it (using -1 for the top
	 * wouldn't help much: the masks would do the wrong thing).  The
	 * rule is that addr 0 and floor 0 refer to the bottom of the
	 * address space, but end 0 and ceiling 0 refer to the top, so
	 * comparisons must use "end - 1" and "ceiling - 1".
	 *
	 * The adjustments keep us from freeing page-table pages that a
	 * neighbouring vma outside the [floor, ceiling) window may still
	 * need; bail out early if nothing in this range can be freed.
	 */
535 addr &= PMD_MASK;
536 if (addr < floor) {
537 addr += PMD_SIZE;
538 if (!addr)
539 return;
540 }
541 if (ceiling) {
542 ceiling &= PMD_MASK;
543 if (!ceiling)
544 return;
545 }
546 if (end - 1 > ceiling - 1)
547 end -= PMD_SIZE;
548 if (addr > end - 1)
549 return;
550
551 pgd = pgd_offset(tlb->mm, addr);
552 do {
553 next = pgd_addr_end(addr, end);
554 if (pgd_none_or_clear_bad(pgd))
555 continue;
556 free_pud_range(tlb, pgd, addr, next, floor, ceiling);
557 } while (pgd++, addr = next, addr != end);
558}
559
560void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
561 unsigned long floor, unsigned long ceiling)
562{
563 while (vma) {
564 struct vm_area_struct *next = vma->vm_next;
565 unsigned long addr = vma->vm_start;
566
567
568
569
570
571 unlink_anon_vmas(vma);
572 unlink_file_vma(vma);
573
574 if (is_vm_hugetlb_page(vma)) {
575 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
576 floor, next? next->vm_start: ceiling);
577 } else {
578
579
580
581 while (next && next->vm_start <= vma->vm_end + PMD_SIZE
582 && !is_vm_hugetlb_page(next)) {
583 vma = next;
584 next = vma->vm_next;
585 unlink_anon_vmas(vma);
586 unlink_file_vma(vma);
587 }
588 free_pgd_range(tlb, addr, vma->vm_end,
589 floor, next? next->vm_start: ceiling);
590 }
591 vma = next;
592 }
593}
594
595int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
596 pmd_t *pmd, unsigned long address)
597{
598 pgtable_t new = pte_alloc_one(mm, address);
599 int wait_split_huge_page;
600 if (!new)
601 return -ENOMEM;

	/*
	 * Ensure all pte setup (eg. pte page lock and page clearing) is
	 * visible before the pte page is made visible to other CPUs by
	 * being put into the page tables: the lockless page-table
	 * walkers (gup_fast() and friends) must never observe a pmd
	 * pointing at a partially initialised pte page.
	 */
	smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */

618 spin_lock(&mm->page_table_lock);
619 wait_split_huge_page = 0;
620 if (likely(pmd_none(*pmd))) {
621 mm->nr_ptes++;
622 pmd_populate(mm, pmd, new);
623 new = NULL;
624 } else if (unlikely(pmd_trans_splitting(*pmd)))
625 wait_split_huge_page = 1;
626 spin_unlock(&mm->page_table_lock);
627 if (new)
628 pte_free(mm, new);
629 if (wait_split_huge_page)
630 wait_split_huge_page(vma->anon_vma, pmd);
631 return 0;
632}
633
634int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
635{
636 pte_t *new = pte_alloc_one_kernel(&init_mm, address);
637 if (!new)
638 return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

642 spin_lock(&init_mm.page_table_lock);
643 if (likely(pmd_none(*pmd))) {
644 pmd_populate_kernel(&init_mm, pmd, new);
645 new = NULL;
646 } else
647 VM_BUG_ON(pmd_trans_splitting(*pmd));
648 spin_unlock(&init_mm.page_table_lock);
649 if (new)
650 pte_free_kernel(&init_mm, new);
651 return 0;
652}
653
654static inline void init_rss_vec(int *rss)
655{
656 memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
657}
658
659static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
660{
661 int i;
662
663 if (current->mm == mm)
664 sync_mm_rss(current, mm);
665 for (i = 0; i < NR_MM_COUNTERS; i++)
666 if (rss[i])
667 add_mm_counter(mm, i, rss[i]);
668}
669

/*
 * This function is called to print an error when a bad pte
 * is found. For example, we might have a PFN-mapped pte in
 * a region that doesn't allow it.
 *
 * The calling function must still handle the error.
 */
677static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
678 pte_t pte, struct page *page)
679{
680 pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
681 pud_t *pud = pud_offset(pgd, addr);
682 pmd_t *pmd = pmd_offset(pud, addr);
683 struct address_space *mapping;
684 pgoff_t index;
685 static unsigned long resume;
686 static unsigned long nr_shown;
687 static unsigned long nr_unshown;
688
	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
693 if (nr_shown == 60) {
694 if (time_before(jiffies, resume)) {
695 nr_unshown++;
696 return;
697 }
698 if (nr_unshown) {
699 printk(KERN_ALERT
700 "BUG: Bad page map: %lu messages suppressed\n",
701 nr_unshown);
702 nr_unshown = 0;
703 }
704 nr_shown = 0;
705 }
706 if (nr_shown++ == 0)
707 resume = jiffies + 60 * HZ;
708
709 mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
710 index = linear_page_index(vma, addr);
711
712 printk(KERN_ALERT
713 "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
714 current->comm,
715 (long long)pte_val(pte), (long long)pmd_val(*pmd));
716 if (page)
717 dump_page(page);
718 printk(KERN_ALERT
719 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
720 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
721
722
723
724 if (vma->vm_ops)
725 print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n",
726 (unsigned long)vma->vm_ops->fault);
727 if (vma->vm_file && vma->vm_file->f_op)
728 print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n",
729 (unsigned long)vma->vm_file->f_op->mmap);
730 dump_stack();
731 add_taint(TAINT_BAD_PAGE);
732}
733
734static inline int is_cow_mapping(vm_flags_t flags)
735{
736 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
737}
738
739#ifndef is_zero_pfn
740static inline int is_zero_pfn(unsigned long pfn)
741{
742 return pfn == zero_pfn;
743}
744#endif
745
746#ifndef my_zero_pfn
747static inline unsigned long my_zero_pfn(unsigned long addr)
748{
749 return zero_pfn;
750}
751#endif
752
/*
 * vm_normal_page -- this function gets the "struct page" associated
 * with a pte.
 *
 * "Special" mappings do not wish to be associated with a "struct page"
 * (either it doesn't exist, or it exists but they don't want to touch
 * it).  In this case, NULL is returned here.  "Normal" mappings do have
 * a struct page.
 *
 * There are 2 broad cases.  Firstly, an architecture may define a
 * pte_special() pte bit (__HAVE_ARCH_PTE_SPECIAL), in which case special
 * mappings are identified directly from the pte and this function is
 * trivial.
 *
 * Secondly, an architecture may not have a spare pte bit, in which case
 * special mappings are identified from the vma: pages in VM_PFNMAP and
 * VM_MIXEDMAP vmas (set up by remap_pfn_range() and friends) may have no
 * struct page at all.  A raw (non-COWed) VM_PFNMAP mapping maps pfns
 * linearly from vma->vm_pgoff, so a pfn matching that layout is treated
 * as special, while a pfn that does not match it must be a COWed
 * anonymous page and is therefore normal.
 *
 * The zero page is considered special and is never returned from here.
 */
795#ifdef __HAVE_ARCH_PTE_SPECIAL
796# define HAVE_PTE_SPECIAL 1
797#else
798# define HAVE_PTE_SPECIAL 0
799#endif
800struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
801 pte_t pte)
802{
803 unsigned long pfn = pte_pfn(pte);
804
805 if (HAVE_PTE_SPECIAL) {
806 if (likely(!pte_special(pte)))
807 goto check_pfn;
808 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
809 return NULL;
810 if (!is_zero_pfn(pfn))
811 print_bad_pte(vma, addr, pte, NULL);
812 return NULL;
813 }
814
815
816
817 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
818 if (vma->vm_flags & VM_MIXEDMAP) {
819 if (!pfn_valid(pfn))
820 return NULL;
821 goto out;
822 } else {
823 unsigned long off;
824 off = (addr - vma->vm_start) >> PAGE_SHIFT;
825 if (pfn == vma->vm_pgoff + off)
826 return NULL;
827 if (!is_cow_mapping(vma->vm_flags))
828 return NULL;
829 }
830 }
831
832 if (is_zero_pfn(pfn))
833 return NULL;
834check_pfn:
835 if (unlikely(pfn > highest_memmap_pfn)) {
836 print_bad_pte(vma, addr, pte, NULL);
837 return NULL;
838 }
839
840
841
842
843
844out:
845 return pfn_to_page(pfn);
846}
847
/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 */
854static inline unsigned long
855copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
856 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
857 unsigned long addr, int *rss)
858{
859 unsigned long vm_flags = vma->vm_flags;
860 pte_t pte = *src_pte;
861 struct page *page;
862
863
864 if (unlikely(!pte_present(pte))) {
865 if (!pte_file(pte)) {
866 swp_entry_t entry = pte_to_swp_entry(pte);
867
868 if (swap_duplicate(entry) < 0)
869 return entry.val;
870
871
872 if (unlikely(list_empty(&dst_mm->mmlist))) {
873 spin_lock(&mmlist_lock);
874 if (list_empty(&dst_mm->mmlist))
875 list_add(&dst_mm->mmlist,
876 &src_mm->mmlist);
877 spin_unlock(&mmlist_lock);
878 }
879 if (likely(!non_swap_entry(entry)))
880 rss[MM_SWAPENTS]++;
881 else if (is_write_migration_entry(entry) &&
882 is_cow_mapping(vm_flags)) {
883
884
885
886
887 make_migration_entry_read(&entry);
888 pte = swp_entry_to_pte(entry);
889 set_pte_at(src_mm, addr, src_pte, pte);
890 }
891 }
892 goto out_set_pte;
893 }

	/*
	 * If it's a COW mapping, write protect it both
	 * in the parent and the child.
	 */
899 if (is_cow_mapping(vm_flags)) {
900 ptep_set_wrprotect(src_mm, addr, src_pte);
901 pte = pte_wrprotect(pte);
902 }

	/*
	 * If it's a shared mapping, mark it clean in
	 * the child.
	 */
908 if (vm_flags & VM_SHARED)
909 pte = pte_mkclean(pte);
910 pte = pte_mkold(pte);
911
912 page = vm_normal_page(vma, addr, pte);
913 if (page) {
914 get_page(page);
915 page_dup_rmap(page);
916 if (PageAnon(page))
917 rss[MM_ANONPAGES]++;
918 else
919 rss[MM_FILEPAGES]++;
920 }
921
922out_set_pte:
923 set_pte_at(dst_mm, addr, dst_pte, pte);
924 return 0;
925}
926
927int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
928 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
929 unsigned long addr, unsigned long end)
930{
931 pte_t *orig_src_pte, *orig_dst_pte;
932 pte_t *src_pte, *dst_pte;
933 spinlock_t *src_ptl, *dst_ptl;
934 int progress = 0;
935 int rss[NR_MM_COUNTERS];
936 swp_entry_t entry = (swp_entry_t){0};
937
938again:
939 init_rss_vec(rss);
940
941 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
942 if (!dst_pte)
943 return -ENOMEM;
944 src_pte = pte_offset_map(src_pmd, addr);
945 src_ptl = pte_lockptr(src_mm, src_pmd);
946 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
947 orig_src_pte = src_pte;
948 orig_dst_pte = dst_pte;
949 arch_enter_lazy_mmu_mode();
950
951 do {
952
953
954
955
956 if (progress >= 32) {
957 progress = 0;
958 if (need_resched() ||
959 spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
960 break;
961 }
962 if (pte_none(*src_pte)) {
963 progress++;
964 continue;
965 }
966 entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
967 vma, addr, rss);
968 if (entry.val)
969 break;
970 progress += 8;
971 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
972
973 arch_leave_lazy_mmu_mode();
974 spin_unlock(src_ptl);
975 pte_unmap(orig_src_pte);
976 add_mm_rss_vec(dst_mm, rss);
977 pte_unmap_unlock(orig_dst_pte, dst_ptl);
978 cond_resched();
979
980 if (entry.val) {
981 if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
982 return -ENOMEM;
983 progress = 0;
984 }
985 if (addr != end)
986 goto again;
987 return 0;
988}
989
990static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
991 pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
992 unsigned long addr, unsigned long end)
993{
994 pmd_t *src_pmd, *dst_pmd;
995 unsigned long next;
996
997 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
998 if (!dst_pmd)
999 return -ENOMEM;
1000 src_pmd = pmd_offset(src_pud, addr);
1001 do {
1002 next = pmd_addr_end(addr, end);
1003 if (pmd_trans_huge(*src_pmd)) {
1004 int err;
1005 VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
1006 err = copy_huge_pmd(dst_mm, src_mm,
1007 dst_pmd, src_pmd, addr, vma);
1008 if (err == -ENOMEM)
1009 return -ENOMEM;
1010 if (!err)
1011 continue;
1012
1013 }
1014 if (pmd_none_or_clear_bad(src_pmd))
1015 continue;
1016 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
1017 vma, addr, next))
1018 return -ENOMEM;
1019 } while (dst_pmd++, src_pmd++, addr = next, addr != end);
1020 return 0;
1021}
1022
1023static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1024 pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
1025 unsigned long addr, unsigned long end)
1026{
1027 pud_t *src_pud, *dst_pud;
1028 unsigned long next;
1029
1030 dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
1031 if (!dst_pud)
1032 return -ENOMEM;
1033 src_pud = pud_offset(src_pgd, addr);
1034 do {
1035 next = pud_addr_end(addr, end);
1036 if (pud_none_or_clear_bad(src_pud))
1037 continue;
1038 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
1039 vma, addr, next))
1040 return -ENOMEM;
1041 } while (dst_pud++, src_pud++, addr = next, addr != end);
1042 return 0;
1043}
1044
1045int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1046 struct vm_area_struct *vma)
1047{
1048 pgd_t *src_pgd, *dst_pgd;
1049 unsigned long next;
1050 unsigned long addr = vma->vm_start;
1051 unsigned long end = vma->vm_end;
1052 int ret;

	/*
	 * Don't copy ptes where a page fault will fill them correctly.
	 * Fork becomes much lighter when there are big shared or private
	 * readonly mappings. The tradeoff is that copy_page_range is more
	 * efficient than faulting.
	 */
1060 if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
1061 if (!vma->anon_vma)
1062 return 0;
1063 }
1064
1065 if (is_vm_hugetlb_page(vma))
1066 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
1067
1068 if (unlikely(is_pfn_mapping(vma))) {
1069
1070
1071
1072
1073 ret = track_pfn_vma_copy(vma);
1074 if (ret)
1075 return ret;
1076 }

	/*
	 * We need to invalidate the secondary MMU mappings only when
	 * there could be a permission downgrade on the ptes of the
	 * parent mm. And a permission downgrade will only happen if
	 * is_cow_mapping() returns true.
	 */
1084 if (is_cow_mapping(vma->vm_flags))
1085 mmu_notifier_invalidate_range_start(src_mm, addr, end);
1086
1087 ret = 0;
1088 dst_pgd = pgd_offset(dst_mm, addr);
1089 src_pgd = pgd_offset(src_mm, addr);
1090 do {
1091 next = pgd_addr_end(addr, end);
1092 if (pgd_none_or_clear_bad(src_pgd))
1093 continue;
1094 if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
1095 vma, addr, next))) {
1096 ret = -ENOMEM;
1097 break;
1098 }
1099 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
1100
1101 if (is_cow_mapping(vma->vm_flags))
1102 mmu_notifier_invalidate_range_end(src_mm,
1103 vma->vm_start, end);
1104 return ret;
1105}
1106
1107static unsigned long zap_pte_range(struct mmu_gather *tlb,
1108 struct vm_area_struct *vma, pmd_t *pmd,
1109 unsigned long addr, unsigned long end,
1110 struct zap_details *details)
1111{
1112 struct mm_struct *mm = tlb->mm;
1113 int force_flush = 0;
1114 int rss[NR_MM_COUNTERS];
1115 spinlock_t *ptl;
1116 pte_t *start_pte;
1117 pte_t *pte;
1118
1119again:
1120 init_rss_vec(rss);
1121 start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
1122 pte = start_pte;
1123 arch_enter_lazy_mmu_mode();
1124 do {
1125 pte_t ptent = *pte;
1126 if (pte_none(ptent)) {
1127 continue;
1128 }
1129
1130 if (pte_present(ptent)) {
1131 struct page *page;
1132
1133 page = vm_normal_page(vma, addr, ptent);
1134 if (unlikely(details) && page) {
1135
1136
1137
1138
1139
1140 if (details->check_mapping &&
1141 details->check_mapping != page->mapping)
1142 continue;
1143
1144
1145
1146
1147 if (details->nonlinear_vma &&
1148 (page->index < details->first_index ||
1149 page->index > details->last_index))
1150 continue;
1151 }
1152 ptent = ptep_get_and_clear_full(mm, addr, pte,
1153 tlb->fullmm);
1154 tlb_remove_tlb_entry(tlb, pte, addr);
1155 if (unlikely(!page))
1156 continue;
1157 if (unlikely(details) && details->nonlinear_vma
1158 && linear_page_index(details->nonlinear_vma,
1159 addr) != page->index)
1160 set_pte_at(mm, addr, pte,
1161 pgoff_to_pte(page->index));
1162 if (PageAnon(page))
1163 rss[MM_ANONPAGES]--;
1164 else {
1165 if (pte_dirty(ptent))
1166 set_page_dirty(page);
1167 if (pte_young(ptent) &&
1168 likely(!VM_SequentialReadHint(vma)))
1169 mark_page_accessed(page);
1170 rss[MM_FILEPAGES]--;
1171 }
1172 page_remove_rmap(page);
1173 if (unlikely(page_mapcount(page) < 0))
1174 print_bad_pte(vma, addr, ptent, page);
1175 force_flush = !__tlb_remove_page(tlb, page);
1176 if (force_flush)
1177 break;
1178 continue;
1179 }
1180
1181
1182
1183
1184 if (unlikely(details))
1185 continue;
1186 if (pte_file(ptent)) {
1187 if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
1188 print_bad_pte(vma, addr, ptent, NULL);
1189 } else {
1190 swp_entry_t entry = pte_to_swp_entry(ptent);
1191
1192 if (!non_swap_entry(entry))
1193 rss[MM_SWAPENTS]--;
1194 if (unlikely(!free_swap_and_cache(entry)))
1195 print_bad_pte(vma, addr, ptent, NULL);
1196 }
1197 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1198 } while (pte++, addr += PAGE_SIZE, addr != end);
1199
1200 add_mm_rss_vec(mm, rss);
1201 arch_leave_lazy_mmu_mode();
1202 pte_unmap_unlock(start_pte, ptl);

	/*
	 * mmu_gather ran out of room to batch pages, we break out of
	 * the PTE lock to avoid doing the potentially expensive TLB
	 * invalidate and page-free while holding it.
	 */
1209 if (force_flush) {
1210 force_flush = 0;
1211 tlb_flush_mmu(tlb);
1212 if (addr != end)
1213 goto again;
1214 }
1215
1216 return addr;
1217}
1218
1219static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1220 struct vm_area_struct *vma, pud_t *pud,
1221 unsigned long addr, unsigned long end,
1222 struct zap_details *details)
1223{
1224 pmd_t *pmd;
1225 unsigned long next;
1226
1227 pmd = pmd_offset(pud, addr);
1228 do {
1229 next = pmd_addr_end(addr, end);
1230 if (pmd_trans_huge(*pmd)) {
1231 if (next-addr != HPAGE_PMD_SIZE) {
1232 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
1233 split_huge_page_pmd(vma->vm_mm, pmd);
1234 } else if (zap_huge_pmd(tlb, vma, pmd))
1235 continue;
1236
1237 }
1238 if (pmd_none_or_clear_bad(pmd))
1239 continue;
1240 next = zap_pte_range(tlb, vma, pmd, addr, next, details);
1241 cond_resched();
1242 } while (pmd++, addr = next, addr != end);
1243
1244 return addr;
1245}
1246
1247static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1248 struct vm_area_struct *vma, pgd_t *pgd,
1249 unsigned long addr, unsigned long end,
1250 struct zap_details *details)
1251{
1252 pud_t *pud;
1253 unsigned long next;
1254
1255 pud = pud_offset(pgd, addr);
1256 do {
1257 next = pud_addr_end(addr, end);
1258 if (pud_none_or_clear_bad(pud))
1259 continue;
1260 next = zap_pmd_range(tlb, vma, pud, addr, next, details);
1261 } while (pud++, addr = next, addr != end);
1262
1263 return addr;
1264}
1265
1266static unsigned long unmap_page_range(struct mmu_gather *tlb,
1267 struct vm_area_struct *vma,
1268 unsigned long addr, unsigned long end,
1269 struct zap_details *details)
1270{
1271 pgd_t *pgd;
1272 unsigned long next;
1273
1274 if (details && !details->check_mapping && !details->nonlinear_vma)
1275 details = NULL;
1276
1277 BUG_ON(addr >= end);
1278 mem_cgroup_uncharge_start();
1279 tlb_start_vma(tlb, vma);
1280 pgd = pgd_offset(vma->vm_mm, addr);
1281 do {
1282 next = pgd_addr_end(addr, end);
1283 if (pgd_none_or_clear_bad(pgd))
1284 continue;
1285 next = zap_pud_range(tlb, vma, pgd, addr, next, details);
1286 } while (pgd++, addr = next, addr != end);
1287 tlb_end_vma(tlb, vma);
1288 mem_cgroup_uncharge_end();
1289
1290 return addr;
1291}
1292
1293#ifdef CONFIG_PREEMPT
1294# define ZAP_BLOCK_SIZE (8 * PAGE_SIZE)
1295#else
1296
1297# define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE)
1298#endif
1299
/**
 * unmap_vmas - unmap a range of memory covered by a list of vma's
 * @tlb: address of the caller's struct mmu_gather
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
 * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
 * @details: details of nonlinear truncation or shared cache invalidation
 *
 * Returns the end address of the unmapping (restart addr if interrupted).
 *
 * Unmap all pages in the vma list.
 *
 * Only addresses between `start' and `end' will be unmapped.
 *
 * The VMA list must be sorted in ascending virtual address order.
 *
 * unmap_vmas() assumes that the caller will flush the whole unmapped
 * address range after unmap_vmas() returns.  So the only responsibility
 * here is to ensure that any thus-far unmapped pages are flushed before
 * unmap_vmas() drops the lock and schedules.
 */
1326unsigned long unmap_vmas(struct mmu_gather *tlb,
1327 struct vm_area_struct *vma, unsigned long start_addr,
1328 unsigned long end_addr, unsigned long *nr_accounted,
1329 struct zap_details *details)
1330{
1331 unsigned long start = start_addr;
1332 struct mm_struct *mm = vma->vm_mm;
1333
1334 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
1335 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
1336 unsigned long end;
1337
1338 start = max(vma->vm_start, start_addr);
1339 if (start >= vma->vm_end)
1340 continue;
1341 end = min(vma->vm_end, end_addr);
1342 if (end <= vma->vm_start)
1343 continue;
1344
1345 if (vma->vm_flags & VM_ACCOUNT)
1346 *nr_accounted += (end - start) >> PAGE_SHIFT;
1347
1348 if (unlikely(is_pfn_mapping(vma)))
1349 untrack_pfn_vma(vma, 0, 0);
1350
1351 while (start != end) {
1352 if (unlikely(is_vm_hugetlb_page(vma))) {
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364 if (vma->vm_file)
1365 unmap_hugepage_range(vma, start, end, NULL);
1366
1367 start = end;
1368 } else
1369 start = unmap_page_range(tlb, vma, start, end, details);
1370 }
1371 }
1372
1373 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
1374 return start;
1375}
1376

/**
 * zap_page_range - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of nonlinear truncation or shared cache invalidation
 */
1384unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
1385 unsigned long size, struct zap_details *details)
1386{
1387 struct mm_struct *mm = vma->vm_mm;
1388 struct mmu_gather tlb;
1389 unsigned long end = address + size;
1390 unsigned long nr_accounted = 0;
1391
1392 lru_add_drain();
1393 tlb_gather_mmu(&tlb, mm, 0);
1394 update_hiwater_rss(mm);
1395 end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
1396 tlb_finish_mmu(&tlb, address, end);
1397 return end;
1398}
1399

/**
 * zap_vma_ptes - remove ptes mapping the vma
 * @vma: vm_area_struct holding ptes to be zapped
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
 *
 * The entire address range must be fully contained within the vma.
 *
 * Returns 0 if successful.
 */
1412int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1413 unsigned long size)
1414{
1415 if (address < vma->vm_start || address + size > vma->vm_end ||
1416 !(vma->vm_flags & VM_PFNMAP))
1417 return -1;
1418 zap_page_range(vma, address, size, NULL);
1419 return 0;
1420}
1421EXPORT_SYMBOL_GPL(zap_vma_ptes);
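
/*
 * Illustrative sketch (not part of the original file): a device driver
 * that earlier populated a VM_PFNMAP vma (e.g. via remap_pfn_range() or
 * vm_insert_pfn()) can drop those ptes again before reusing the mapping.
 * "foo_vma" is a hypothetical vma remembered by the driver:
 *
 *	if (zap_vma_ptes(foo_vma, foo_vma->vm_start,
 *			 foo_vma->vm_end - foo_vma->vm_start))
 *		pr_warn("foo: failed to zap ptes\n");
 *
 * Subsequent user accesses then go through the driver's fault handler
 * (or fail) rather than hitting stale translations.
 */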
1422

/**
 * follow_page - look up a page descriptor from a user-virtual address
 * @vma: vm_area_struct mapping @address
 * @address: virtual address to look up
 * @flags: flags modifying lookup behaviour
 *
 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
 *
 * Returns the mapped (struct page *), %NULL if no mapping exists, or
 * an error pointer if there is a mapping to something not represented
 * by a page descriptor (see also vm_normal_page()).
 */
1435struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1436 unsigned int flags)
1437{
1438 pgd_t *pgd;
1439 pud_t *pud;
1440 pmd_t *pmd;
1441 pte_t *ptep, pte;
1442 spinlock_t *ptl;
1443 struct page *page;
1444 struct mm_struct *mm = vma->vm_mm;
1445
1446 page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
1447 if (!IS_ERR(page)) {
1448 BUG_ON(flags & FOLL_GET);
1449 goto out;
1450 }
1451
1452 page = NULL;
1453 pgd = pgd_offset(mm, address);
1454 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
1455 goto no_page_table;
1456
1457 pud = pud_offset(pgd, address);
1458 if (pud_none(*pud))
1459 goto no_page_table;
1460 if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
1461 BUG_ON(flags & FOLL_GET);
1462 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
1463 goto out;
1464 }
1465 if (unlikely(pud_bad(*pud)))
1466 goto no_page_table;
1467
1468 pmd = pmd_offset(pud, address);
1469 if (pmd_none(*pmd))
1470 goto no_page_table;
1471 if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
1472 BUG_ON(flags & FOLL_GET);
1473 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
1474 goto out;
1475 }
1476 if (pmd_trans_huge(*pmd)) {
1477 if (flags & FOLL_SPLIT) {
1478 split_huge_page_pmd(mm, pmd);
1479 goto split_fallthrough;
1480 }
1481 spin_lock(&mm->page_table_lock);
1482 if (likely(pmd_trans_huge(*pmd))) {
1483 if (unlikely(pmd_trans_splitting(*pmd))) {
1484 spin_unlock(&mm->page_table_lock);
1485 wait_split_huge_page(vma->anon_vma, pmd);
1486 } else {
1487 page = follow_trans_huge_pmd(mm, address,
1488 pmd, flags);
1489 spin_unlock(&mm->page_table_lock);
1490 goto out;
1491 }
1492 } else
1493 spin_unlock(&mm->page_table_lock);
1494
1495 }
1496split_fallthrough:
1497 if (unlikely(pmd_bad(*pmd)))
1498 goto no_page_table;
1499
1500 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
1501
1502 pte = *ptep;
1503 if (!pte_present(pte))
1504 goto no_page;
1505 if ((flags & FOLL_WRITE) && !pte_write(pte))
1506 goto unlock;
1507
1508 page = vm_normal_page(vma, address, pte);
1509 if (unlikely(!page)) {
1510 if ((flags & FOLL_DUMP) ||
1511 !is_zero_pfn(pte_pfn(pte)))
1512 goto bad_page;
1513 page = pte_page(pte);
1514 }
1515
1516 if (flags & FOLL_GET)
1517 get_page(page);
1518 if (flags & FOLL_TOUCH) {
1519 if ((flags & FOLL_WRITE) &&
1520 !pte_dirty(pte) && !PageDirty(page))
1521 set_page_dirty(page);
1522
1523
1524
1525
1526
1527 mark_page_accessed(page);
1528 }
1529 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539 if (page->mapping && trylock_page(page)) {
1540 lru_add_drain();
1541
1542
1543
1544
1545
1546 if (page->mapping)
1547 mlock_vma_page(page);
1548 unlock_page(page);
1549 }
1550 }
1551unlock:
1552 pte_unmap_unlock(ptep, ptl);
1553out:
1554 return page;
1555
1556bad_page:
1557 pte_unmap_unlock(ptep, ptl);
1558 return ERR_PTR(-EFAULT);
1559
1560no_page:
1561 pte_unmap_unlock(ptep, ptl);
1562 if (!pte_none(pte))
1563 return page;
1564
1565no_page_table:
1566
1567
1568
1569
1570
1571
1572
1573
1574 if ((flags & FOLL_DUMP) &&
1575 (!vma->vm_ops || !vma->vm_ops->fault))
1576 return ERR_PTR(-EFAULT);
1577 return page;
1578}
1579
1580static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
1581{
1582 return stack_guard_page_start(vma, addr) ||
1583 stack_guard_page_end(vma, addr+PAGE_SIZE);
1584}
1585

/**
 * __get_user_pages() - pin user pages in memory
 * @tsk:	task_struct of target task
 * @mm:		mm_struct of target mm
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @gup_flags:	flags modifying pin behaviour
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long.  Or NULL, if caller
 *		only intends to ensure the pages are faulted in.
 * @vmas:	array of pointers to vmas corresponding to each page.
 *		Or NULL if the caller does not require them.
 * @nonblocking: whether waiting for disk IO or mmap_sem contention
 *
 * Returns number of pages pinned.  This may be fewer than the number
 * requested.  If nr_pages is 0 or negative, returns 0.  If no pages
 * were pinned, returns -errno.  With FOLL_GET, each page returned must
 * be released with a put_page() call when it is finished with.
 *
 * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
 * or mmap_sem contention, and if waiting is needed to pin all pages,
 * *@nonblocking will be set to 0 after mmap_sem has been released.
 *
 * In most cases, get_user_pages or get_user_pages_fast should be used
 * instead; __get_user_pages is only for callers that need special
 * @gup_flags.
 *
 * Must be called with mmap_sem held for read or write.
 */
1635int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1636 unsigned long start, int nr_pages, unsigned int gup_flags,
1637 struct page **pages, struct vm_area_struct **vmas,
1638 int *nonblocking)
1639{
1640 int i;
1641 unsigned long vm_flags;
1642
1643 if (nr_pages <= 0)
1644 return 0;
1645
1646 VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
1647
1648
1649
1650
1651
1652 vm_flags = (gup_flags & FOLL_WRITE) ?
1653 (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
1654 vm_flags &= (gup_flags & FOLL_FORCE) ?
1655 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
1656 i = 0;
1657
1658 do {
1659 struct vm_area_struct *vma;
1660
1661 vma = find_extend_vma(mm, start);
1662 if (!vma && in_gate_area(mm, start)) {
1663 unsigned long pg = start & PAGE_MASK;
1664 pgd_t *pgd;
1665 pud_t *pud;
1666 pmd_t *pmd;
1667 pte_t *pte;
1668
1669
1670 if (gup_flags & FOLL_WRITE)
1671 return i ? : -EFAULT;
1672 if (pg > TASK_SIZE)
1673 pgd = pgd_offset_k(pg);
1674 else
1675 pgd = pgd_offset_gate(mm, pg);
1676 BUG_ON(pgd_none(*pgd));
1677 pud = pud_offset(pgd, pg);
1678 BUG_ON(pud_none(*pud));
1679 pmd = pmd_offset(pud, pg);
1680 if (pmd_none(*pmd))
1681 return i ? : -EFAULT;
1682 VM_BUG_ON(pmd_trans_huge(*pmd));
1683 pte = pte_offset_map(pmd, pg);
1684 if (pte_none(*pte)) {
1685 pte_unmap(pte);
1686 return i ? : -EFAULT;
1687 }
1688 vma = get_gate_vma(mm);
1689 if (pages) {
1690 struct page *page;
1691
1692 page = vm_normal_page(vma, start, *pte);
1693 if (!page) {
1694 if (!(gup_flags & FOLL_DUMP) &&
1695 is_zero_pfn(pte_pfn(*pte)))
1696 page = pte_page(*pte);
1697 else {
1698 pte_unmap(pte);
1699 return i ? : -EFAULT;
1700 }
1701 }
1702 pages[i] = page;
1703 get_page(page);
1704 }
1705 pte_unmap(pte);
1706 goto next_page;
1707 }
1708
1709 if (!vma ||
1710 (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
1711 !(vm_flags & vma->vm_flags))
1712 return i ? : -EFAULT;
1713
1714 if (is_vm_hugetlb_page(vma)) {
1715 i = follow_hugetlb_page(mm, vma, pages, vmas,
1716 &start, &nr_pages, i, gup_flags);
1717 continue;
1718 }
1719
1720 do {
1721 struct page *page;
1722 unsigned int foll_flags = gup_flags;
1723
1724
1725
1726
1727
1728 if (unlikely(fatal_signal_pending(current)))
1729 return i ? i : -ERESTARTSYS;
1730
1731 cond_resched();
1732 while (!(page = follow_page(vma, start, foll_flags))) {
1733 int ret;
1734 unsigned int fault_flags = 0;
1735
1736
1737 if (foll_flags & FOLL_MLOCK) {
1738 if (stack_guard_page(vma, start))
1739 goto next_page;
1740 }
1741 if (foll_flags & FOLL_WRITE)
1742 fault_flags |= FAULT_FLAG_WRITE;
1743 if (nonblocking)
1744 fault_flags |= FAULT_FLAG_ALLOW_RETRY;
1745 if (foll_flags & FOLL_NOWAIT)
1746 fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT);
1747
1748 ret = handle_mm_fault(mm, vma, start,
1749 fault_flags);
1750
1751 if (ret & VM_FAULT_ERROR) {
1752 if (ret & VM_FAULT_OOM)
1753 return i ? i : -ENOMEM;
1754 if (ret & (VM_FAULT_HWPOISON |
1755 VM_FAULT_HWPOISON_LARGE)) {
1756 if (i)
1757 return i;
1758 else if (gup_flags & FOLL_HWPOISON)
1759 return -EHWPOISON;
1760 else
1761 return -EFAULT;
1762 }
1763 if (ret & VM_FAULT_SIGBUS)
1764 return i ? i : -EFAULT;
1765 BUG();
1766 }
1767
1768 if (tsk) {
1769 if (ret & VM_FAULT_MAJOR)
1770 tsk->maj_flt++;
1771 else
1772 tsk->min_flt++;
1773 }
1774
1775 if (ret & VM_FAULT_RETRY) {
1776 if (nonblocking)
1777 *nonblocking = 0;
1778 return i;
1779 }
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793 if ((ret & VM_FAULT_WRITE) &&
1794 !(vma->vm_flags & VM_WRITE))
1795 foll_flags &= ~FOLL_WRITE;
1796
1797 cond_resched();
1798 }
1799 if (IS_ERR(page))
1800 return i ? i : PTR_ERR(page);
1801 if (pages) {
1802 pages[i] = page;
1803
1804 flush_anon_page(vma, page, start);
1805 flush_dcache_page(page);
1806 }
1807next_page:
1808 if (vmas)
1809 vmas[i] = vma;
1810 i++;
1811 start += PAGE_SIZE;
1812 nr_pages--;
1813 } while (nr_pages && start < vma->vm_end);
1814 } while (nr_pages);
1815 return i;
1816}
1817EXPORT_SYMBOL(__get_user_pages);
1818

/*
 * fixup_user_fault() - manually resolve a user page fault
 * @tsk:	the task_struct to use for page fault accounting, or
 *		NULL if faults are not to be recorded.
 * @mm:		mm_struct of target mm
 * @address:	user address
 * @fault_flags:flags to pass down to handle_mm_fault()
 *
 * This is meant to be called in the specific scenario where for locking
 * reasons we try to access user memory in atomic context (within a
 * pagefault_disable() section), this returns -EFAULT, and we want to
 * resolve the user fault before trying again.
 *
 * Typically this is meant to be used by the futex code.  Unlike
 * __get_user_pages(), this function never returns with an unlocked
 * mmap_sem, so its semantics with respect to @mm->mmap_sem differ from
 * those of filemap_fault().
 */
1846int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
1847 unsigned long address, unsigned int fault_flags)
1848{
1849 struct vm_area_struct *vma;
1850 int ret;
1851
1852 vma = find_extend_vma(mm, address);
1853 if (!vma || address < vma->vm_start)
1854 return -EFAULT;
1855
1856 ret = handle_mm_fault(mm, vma, address, fault_flags);
1857 if (ret & VM_FAULT_ERROR) {
1858 if (ret & VM_FAULT_OOM)
1859 return -ENOMEM;
1860 if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
1861 return -EHWPOISON;
1862 if (ret & VM_FAULT_SIGBUS)
1863 return -EFAULT;
1864 BUG();
1865 }
1866 if (tsk) {
1867 if (ret & VM_FAULT_MAJOR)
1868 tsk->maj_flt++;
1869 else
1870 tsk->min_flt++;
1871 }
1872 return 0;
1873}
1874

/*
 * get_user_pages() - pin user pages in memory
 * @tsk:	the task_struct to use for page fault accounting, or
 *		NULL if faults are not to be recorded.
 * @mm:		mm_struct of target mm
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @write:	whether pages will be written to by the caller
 * @force:	whether to force write access even if user mapping is
 *		readonly.  This will result in the page being COWed even
 *		in MAP_SHARED mappings.  You do not want this.
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long.  Or NULL, if caller
 *		only intends to ensure the pages are faulted in.
 * @vmas:	array of pointers to vmas corresponding to each page.
 *		Or NULL if the caller does not require them.
 *
 * Returns number of pages pinned.  This may be fewer than the number
 * requested.  If nr_pages is 0 or negative, returns 0.  If no pages
 * were pinned, returns -errno.  Each page returned must be released
 * with a put_page() call when it is finished with.  vmas will only
 * remain valid while mmap_sem is held.
 *
 * Must be called with mmap_sem held for read or write.
 *
 * get_user_pages walks a process's page tables and takes a reference
 * to each struct page that each user address corresponds to at a given
 * instant.  That is, it takes the page that would be accessed if a user
 * thread accesses the given user virtual address at that instant.
 *
 * This does not guarantee that the page exists in the user mappings
 * when get_user_pages returns, and there may even be a completely
 * different page there in some cases (eg. if mmapped pagecache has been
 * invalidated and subsequently re-faulted).  However it does guarantee
 * that the page won't be freed completely, and is typically used for
 * IO: the pages may be submitted for DMA to devices or accessed via
 * their kernel linear mapping (via the kmap APIs).  Care should be
 * taken to use the correct cache flushing APIs.
 */
1926int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1927 unsigned long start, int nr_pages, int write, int force,
1928 struct page **pages, struct vm_area_struct **vmas)
1929{
1930 int flags = FOLL_TOUCH;
1931
1932 if (pages)
1933 flags |= FOLL_GET;
1934 if (write)
1935 flags |= FOLL_WRITE;
1936 if (force)
1937 flags |= FOLL_FORCE;
1938
1939 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
1940 NULL);
1941}
1942EXPORT_SYMBOL(get_user_pages);
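
/*
 * Illustrative sketch (not part of the original file): the common driver
 * pattern for pinning a user buffer for I/O with get_user_pages() and
 * releasing it afterwards.  "uaddr", "nr_pages", "pages" and "dirtied"
 * are hypothetical caller-owned variables; write=1, force=0 below:
 *
 *	down_read(&current->mm->mmap_sem);
 *	ret = get_user_pages(current, current->mm,
 *			     uaddr & PAGE_MASK, nr_pages,
 *			     1, 0, pages, NULL);
 *	up_read(&current->mm->mmap_sem);
 *	if (ret <= 0)
 *		return ret ? ret : -EFAULT;
 *
 *	... perform the I/O on the pinned pages ...
 *
 *	for (i = 0; i < ret; i++) {
 *		if (dirtied)
 *			set_page_dirty_lock(pages[i]);
 *		page_cache_release(pages[i]);
 *	}
 */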
1943

/**
 * get_dump_page() - pin user page in memory while writing it to core dump
 * @addr: user address
 *
 * Returns struct page pointer of user page pinned for dump,
 * to be freed afterwards by page_cache_release() or put_page().
 *
 * Returns NULL on any kind of failure - a hole must then be inserted or
 * the corefile truncated accordingly: the caller is expected to check
 * for that.  Called without mmap_sem, but after all other threads have
 * been killed, except possibly other core-dumping threads.
 */
1958#ifdef CONFIG_ELF_CORE
1959struct page *get_dump_page(unsigned long addr)
1960{
1961 struct vm_area_struct *vma;
1962 struct page *page;
1963
1964 if (__get_user_pages(current, current->mm, addr, 1,
1965 FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
1966 NULL) < 1)
1967 return NULL;
1968 flush_cache_page(vma, addr, page_to_pfn(page));
1969 return page;
1970}
1971#endif
1972
1973pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
1974 spinlock_t **ptl)
1975{
1976 pgd_t * pgd = pgd_offset(mm, addr);
1977 pud_t * pud = pud_alloc(mm, pgd, addr);
1978 if (pud) {
1979 pmd_t * pmd = pmd_alloc(mm, pud, addr);
1980 if (pmd) {
1981 VM_BUG_ON(pmd_trans_huge(*pmd));
1982 return pte_alloc_map_lock(mm, pmd, addr, ptl);
1983 }
1984 }
1985 return NULL;
1986}
1987

/*
 * This is the old fallback for page remapping.
 *
 * For historical reasons, it only allows reserved pages. Only
 * old drivers should use this, and they needed to mark their
 * pages reserved for the swap to not touch them.
 */
1995static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1996 struct page *page, pgprot_t prot)
1997{
1998 struct mm_struct *mm = vma->vm_mm;
1999 int retval;
2000 pte_t *pte;
2001 spinlock_t *ptl;
2002
2003 retval = -EINVAL;
2004 if (PageAnon(page))
2005 goto out;
2006 retval = -ENOMEM;
2007 flush_dcache_page(page);
2008 pte = get_locked_pte(mm, addr, &ptl);
2009 if (!pte)
2010 goto out;
2011 retval = -EBUSY;
2012 if (!pte_none(*pte))
2013 goto out_unlock;
2014
2015
2016 get_page(page);
2017 inc_mm_counter_fast(mm, MM_FILEPAGES);
2018 page_add_file_rmap(page);
2019 set_pte_at(mm, addr, pte, mk_pte(page, prot));
2020
2021 retval = 0;
2022 pte_unmap_unlock(pte, ptl);
2023 return retval;
2024out_unlock:
2025 pte_unmap_unlock(pte, ptl);
2026out:
2027 return retval;
2028}
2029

/**
 * vm_insert_page - insert single page into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @page: source kernel page
 *
 * This allows drivers to insert individual pages they've allocated
 * into a user vma.
 *
 * The page has to be a nice clean _individual_ kernel allocation.
 * If you allocate a compound page, you need to have marked it as
 * such (__GFP_COMP), or manually just split the page up yourself
 * (see split_page()).
 *
 * NOTE! Traditionally this was done with "remap_pfn_range()" which
 * took an arbitrary page protection parameter. This doesn't allow
 * that. Your vma protection will have to be set up correctly, which
 * means that if you want a shared writable mapping, you'd better
 * ask for a shared writable mapping!
 *
 * The page does not need to be reserved.
 */
2052int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
2053 struct page *page)
2054{
2055 if (addr < vma->vm_start || addr >= vma->vm_end)
2056 return -EFAULT;
2057 if (!page_count(page))
2058 return -EINVAL;
2059 vma->vm_flags |= VM_INSERTPAGE;
2060 return insert_page(vma, addr, page, vma->vm_page_prot);
2061}
2062EXPORT_SYMBOL(vm_insert_page);
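
/*
 * Illustrative sketch (not part of the original file): a driver sharing
 * a single order-0 kernel page with userspace from its mmap handler.
 * "foo_shared_page" is a hypothetical page the driver allocated with
 * alloc_page(GFP_KERNEL):
 *
 *	static int foo_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		if (vma->vm_end - vma->vm_start != PAGE_SIZE)
 *			return -EINVAL;
 *		return vm_insert_page(vma, vma->vm_start, foo_shared_page);
 *	}
 *
 * Unlike remap_pfn_range(), the inserted page is a normal refcounted
 * page and does not need to be marked reserved.
 */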
2063
2064static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2065 unsigned long pfn, pgprot_t prot)
2066{
2067 struct mm_struct *mm = vma->vm_mm;
2068 int retval;
2069 pte_t *pte, entry;
2070 spinlock_t *ptl;
2071
2072 retval = -ENOMEM;
2073 pte = get_locked_pte(mm, addr, &ptl);
2074 if (!pte)
2075 goto out;
2076 retval = -EBUSY;
2077 if (!pte_none(*pte))
2078 goto out_unlock;
2079
2080
2081 entry = pte_mkspecial(pfn_pte(pfn, prot));
2082 set_pte_at(mm, addr, pte, entry);
2083 update_mmu_cache(vma, addr, pte);
2084
2085 retval = 0;
2086out_unlock:
2087 pte_unmap_unlock(pte, ptl);
2088out:
2089 return retval;
2090}
2091

/**
 * vm_insert_pfn - insert single pfn into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 *
 * Similar to vm_insert_page, this allows drivers to insert individual
 * pages they've allocated into a user vma. Same comments apply.
 *
 * This function should only be called from a vm_ops->fault handler, and
 * in that case the handler should return NULL.
 *
 * vma cannot be a COW mapping.
 *
 * As this is called only for pages that do not currently exist, we
 * do not need to flush old virtual caches or the TLB.
 */
2109int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2110 unsigned long pfn)
2111{
2112 int ret;
2113 pgprot_t pgprot = vma->vm_page_prot;
2114
2115
2116
2117
2118
2119
2120 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
2121 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
2122 (VM_PFNMAP|VM_MIXEDMAP));
2123 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
2124 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
2125
2126 if (addr < vma->vm_start || addr >= vma->vm_end)
2127 return -EFAULT;
2128 if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE))
2129 return -EINVAL;
2130
2131 ret = insert_pfn(vma, addr, pfn, pgprot);
2132
2133 if (ret)
2134 untrack_pfn_vma(vma, pfn, PAGE_SIZE);
2135
2136 return ret;
2137}
2138EXPORT_SYMBOL(vm_insert_pfn);
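
/*
 * Illustrative sketch (not part of the original file): vm_insert_pfn()
 * is typically called from a driver's ->fault handler for a vma that was
 * marked VM_PFNMAP at mmap time.  "foo_base_pfn" is a hypothetical pfn
 * of device memory:
 *
 *	static int foo_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 *	{
 *		unsigned long pfn = foo_base_pfn + vmf->pgoff;
 *		int err;
 *
 *		err = vm_insert_pfn(vma,
 *				    (unsigned long)vmf->virtual_address, pfn);
 *		if (err && err != -EBUSY)
 *			return VM_FAULT_SIGBUS;
 *		return VM_FAULT_NOPAGE;
 *	}
 */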
2139
2140int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
2141 unsigned long pfn)
2142{
2143 BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
2144
2145 if (addr < vma->vm_start || addr >= vma->vm_end)
2146 return -EFAULT;
2147
2148
2149
2150
2151
2152
2153
2154
2155 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
2156 struct page *page;
2157
2158 page = pfn_to_page(pfn);
2159 return insert_page(vma, addr, page, vma->vm_page_prot);
2160 }
2161 return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
2162}
2163EXPORT_SYMBOL(vm_insert_mixed);
2164
2165
2166
2167
2168
2169
2170static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
2171 unsigned long addr, unsigned long end,
2172 unsigned long pfn, pgprot_t prot)
2173{
2174 pte_t *pte;
2175 spinlock_t *ptl;
2176
2177 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
2178 if (!pte)
2179 return -ENOMEM;
2180 arch_enter_lazy_mmu_mode();
2181 do {
2182 BUG_ON(!pte_none(*pte));
2183 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
2184 pfn++;
2185 } while (pte++, addr += PAGE_SIZE, addr != end);
2186 arch_leave_lazy_mmu_mode();
2187 pte_unmap_unlock(pte - 1, ptl);
2188 return 0;
2189}
2190
2191static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
2192 unsigned long addr, unsigned long end,
2193 unsigned long pfn, pgprot_t prot)
2194{
2195 pmd_t *pmd;
2196 unsigned long next;
2197
2198 pfn -= addr >> PAGE_SHIFT;
2199 pmd = pmd_alloc(mm, pud, addr);
2200 if (!pmd)
2201 return -ENOMEM;
2202 VM_BUG_ON(pmd_trans_huge(*pmd));
2203 do {
2204 next = pmd_addr_end(addr, end);
2205 if (remap_pte_range(mm, pmd, addr, next,
2206 pfn + (addr >> PAGE_SHIFT), prot))
2207 return -ENOMEM;
2208 } while (pmd++, addr = next, addr != end);
2209 return 0;
2210}
2211
2212static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
2213 unsigned long addr, unsigned long end,
2214 unsigned long pfn, pgprot_t prot)
2215{
2216 pud_t *pud;
2217 unsigned long next;
2218
2219 pfn -= addr >> PAGE_SHIFT;
2220 pud = pud_alloc(mm, pgd, addr);
2221 if (!pud)
2222 return -ENOMEM;
2223 do {
2224 next = pud_addr_end(addr, end);
2225 if (remap_pmd_range(mm, pud, addr, next,
2226 pfn + (addr >> PAGE_SHIFT), prot))
2227 return -ENOMEM;
2228 } while (pud++, addr = next, addr != end);
2229 return 0;
2230}
2231

/**
 * remap_pfn_range - remap kernel memory to userspace
 * @vma: user vma to map to
 * @addr: target user address to start at
 * @pfn: physical address of kernel memory
 * @size: size of map area
 * @prot: page protection flags for this mapping
 *
 *  Note: this is only safe if the mm semaphore is held when called.
 */
2242int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
2243 unsigned long pfn, unsigned long size, pgprot_t prot)
2244{
2245 pgd_t *pgd;
2246 unsigned long next;
2247 unsigned long end = addr + PAGE_ALIGN(size);
2248 struct mm_struct *mm = vma->vm_mm;
2249 int err;
2250
	/*
	 * Physically remapped pages are special. Tell the
	 * rest of the world about it:
	 *   VM_IO tells people not to look at these pages
	 *	(accesses can have side effects).
	 *   VM_RESERVED keeps the pages out of core dumps (even when
	 *	VM_IO is turned off) and counts them in reserved_vm;
	 *	the LRU scan never sees them anyway.
	 *   VM_PFNMAP tells the core MM that the base pages are just
	 *	raw PFN mappings, and do not have a "struct page"
	 *	associated with them.
	 *
	 * There's a horrible special case to handle copy-on-write
	 * behaviour that some programs depend on. We mark the "original"
	 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
	 */
2269 if (addr == vma->vm_start && end == vma->vm_end) {
2270 vma->vm_pgoff = pfn;
2271 vma->vm_flags |= VM_PFN_AT_MMAP;
2272 } else if (is_cow_mapping(vma->vm_flags))
2273 return -EINVAL;
2274
2275 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
2276
2277 err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size));
2278 if (err) {
2279
2280
2281
2282
2283 vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP);
2284 vma->vm_flags &= ~VM_PFN_AT_MMAP;
2285 return -EINVAL;
2286 }
2287
2288 BUG_ON(addr >= end);
2289 pfn -= addr >> PAGE_SHIFT;
2290 pgd = pgd_offset(mm, addr);
2291 flush_cache_range(vma, addr, end);
2292 do {
2293 next = pgd_addr_end(addr, end);
2294 err = remap_pud_range(mm, pgd, addr, next,
2295 pfn + (addr >> PAGE_SHIFT), prot);
2296 if (err)
2297 break;
2298 } while (pgd++, addr = next, addr != end);
2299
2300 if (err)
2301 untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size));
2302
2303 return err;
2304}
2305EXPORT_SYMBOL(remap_pfn_range);
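
/*
 * Illustrative sketch (not part of the original file): the classic driver
 * mmap implementation that maps a physically contiguous region into
 * userspace in one go.  "foo_phys" and "FOO_MEM_SIZE" are hypothetical:
 *
 *	static int foo_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		unsigned long size = vma->vm_end - vma->vm_start;
 *
 *		if (size > FOO_MEM_SIZE)
 *			return -EINVAL;
 *		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 *		return remap_pfn_range(vma, vma->vm_start,
 *				       foo_phys >> PAGE_SHIFT,
 *				       size, vma->vm_page_prot);
 *	}
 *
 * (pgprot_noncached() is only appropriate for device memory; RAM-backed
 * regions would keep the default protection.)
 */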
2306
2307static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
2308 unsigned long addr, unsigned long end,
2309 pte_fn_t fn, void *data)
2310{
2311 pte_t *pte;
2312 int err;
2313 pgtable_t token;
2314 spinlock_t *uninitialized_var(ptl);
2315
2316 pte = (mm == &init_mm) ?
2317 pte_alloc_kernel(pmd, addr) :
2318 pte_alloc_map_lock(mm, pmd, addr, &ptl);
2319 if (!pte)
2320 return -ENOMEM;
2321
2322 BUG_ON(pmd_huge(*pmd));
2323
2324 arch_enter_lazy_mmu_mode();
2325
2326 token = pmd_pgtable(*pmd);
2327
2328 do {
2329 err = fn(pte++, token, addr, data);
2330 if (err)
2331 break;
2332 } while (addr += PAGE_SIZE, addr != end);
2333
2334 arch_leave_lazy_mmu_mode();
2335
2336 if (mm != &init_mm)
2337 pte_unmap_unlock(pte-1, ptl);
2338 return err;
2339}
2340
2341static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
2342 unsigned long addr, unsigned long end,
2343 pte_fn_t fn, void *data)
2344{
2345 pmd_t *pmd;
2346 unsigned long next;
2347 int err;
2348
2349 BUG_ON(pud_huge(*pud));
2350
2351 pmd = pmd_alloc(mm, pud, addr);
2352 if (!pmd)
2353 return -ENOMEM;
2354 do {
2355 next = pmd_addr_end(addr, end);
2356 err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
2357 if (err)
2358 break;
2359 } while (pmd++, addr = next, addr != end);
2360 return err;
2361}
2362
2363static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
2364 unsigned long addr, unsigned long end,
2365 pte_fn_t fn, void *data)
2366{
2367 pud_t *pud;
2368 unsigned long next;
2369 int err;
2370
2371 pud = pud_alloc(mm, pgd, addr);
2372 if (!pud)
2373 return -ENOMEM;
2374 do {
2375 next = pud_addr_end(addr, end);
2376 err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
2377 if (err)
2378 break;
2379 } while (pud++, addr = next, addr != end);
2380 return err;
2381}
2382

/*
 * Scan a region of virtual memory, filling in page tables as necessary
 * and calling a provided function on each leaf page table.
 */
2387int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2388 unsigned long size, pte_fn_t fn, void *data)
2389{
2390 pgd_t *pgd;
2391 unsigned long next;
2392 unsigned long end = addr + size;
2393 int err;
2394
2395 BUG_ON(addr >= end);
2396 pgd = pgd_offset(mm, addr);
2397 do {
2398 next = pgd_addr_end(addr, end);
2399 err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
2400 if (err)
2401 break;
2402 } while (pgd++, addr = next, addr != end);
2403
2404 return err;
2405}
2406EXPORT_SYMBOL_GPL(apply_to_page_range);
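
/*
 * Illustrative sketch (not part of the original file): a callback that
 * counts present ptes over a range.  Note that because page tables are
 * allocated as needed, this also populates intermediate tables for any
 * holes in the range.  "foo_count_pte", "count", "start" and "size" are
 * hypothetical:
 *
 *	static int foo_count_pte(pte_t *pte, pgtable_t token,
 *				 unsigned long addr, void *data)
 *	{
 *		unsigned long *count = data;
 *
 *		if (pte_present(*pte))
 *			(*count)++;
 *		return 0;
 *	}
 *
 *	err = apply_to_page_range(mm, start, size, foo_count_pte, &count);
 */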
2407

/*
 * handle_pte_fault chooses page fault handler according to an entry
 * which was read non-atomically.  Before making any commitment, on
 * those architectures or configurations (e.g. i386 with PAE) which
 * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault
 * must check under lock before unmapping the pte and proceeding
 * (but do_wp_page is only called after already making such a check;
 * and do_anonymous_page can safely check later on).
 */
2417static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
2418 pte_t *page_table, pte_t orig_pte)
2419{
2420 int same = 1;
2421#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
2422 if (sizeof(pte_t) > sizeof(unsigned long)) {
2423 spinlock_t *ptl = pte_lockptr(mm, pmd);
2424 spin_lock(ptl);
2425 same = pte_same(*page_table, orig_pte);
2426 spin_unlock(ptl);
2427 }
2428#endif
2429 pte_unmap(page_table);
2430 return same;
2431}
2432
2433static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
2434{
2435
2436
2437
2438
2439
2440
2441 if (unlikely(!src)) {
2442 void *kaddr = kmap_atomic(dst, KM_USER0);
2443 void __user *uaddr = (void __user *)(va & PAGE_MASK);
2444
2445
2446
2447
2448
2449
2450
2451 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
2452 clear_page(kaddr);
2453 kunmap_atomic(kaddr, KM_USER0);
2454 flush_dcache_page(dst);
2455 } else
2456 copy_user_highpage(dst, src, va, vma);
2457}
2458

/*
 * This routine handles present pages, when users try to write
 * to a shared page. It is done by copying the page to a new address
 * and decrementing the shared-page counter for the old page.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
 * Thus we can safely just mark it writable once we've done any
 * necessary COW.
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), with pte both mapped and locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */
2477static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2478 unsigned long address, pte_t *page_table, pmd_t *pmd,
2479 spinlock_t *ptl, pte_t orig_pte)
2480 __releases(ptl)
2481{
2482 struct page *old_page, *new_page;
2483 pte_t entry;
2484 int ret = 0;
2485 int page_mkwrite = 0;
2486 struct page *dirty_page = NULL;
2487
2488 old_page = vm_normal_page(vma, address, orig_pte);
2489 if (!old_page) {
2490
2491
2492
2493
2494
2495
2496
2497 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2498 (VM_WRITE|VM_SHARED))
2499 goto reuse;
2500 goto gotten;
2501 }
2502
2503
2504
2505
2506
2507 if (PageAnon(old_page) && !PageKsm(old_page)) {
2508 if (!trylock_page(old_page)) {
2509 page_cache_get(old_page);
2510 pte_unmap_unlock(page_table, ptl);
2511 lock_page(old_page);
2512 page_table = pte_offset_map_lock(mm, pmd, address,
2513 &ptl);
2514 if (!pte_same(*page_table, orig_pte)) {
2515 unlock_page(old_page);
2516 goto unlock;
2517 }
2518 page_cache_release(old_page);
2519 }
2520 if (reuse_swap_page(old_page)) {
2521
2522
2523
2524
2525
2526 page_move_anon_rmap(old_page, vma, address);
2527 unlock_page(old_page);
2528 goto reuse;
2529 }
2530 unlock_page(old_page);
2531 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2532 (VM_WRITE|VM_SHARED))) {
2533
2534
2535
2536
2537
2538 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
2539 struct vm_fault vmf;
2540 int tmp;
2541
2542 vmf.virtual_address = (void __user *)(address &
2543 PAGE_MASK);
2544 vmf.pgoff = old_page->index;
2545 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
2546 vmf.page = old_page;
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556 page_cache_get(old_page);
2557 pte_unmap_unlock(page_table, ptl);
2558
2559 tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
2560 if (unlikely(tmp &
2561 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
2562 ret = tmp;
2563 goto unwritable_page;
2564 }
2565 if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
2566 lock_page(old_page);
2567 if (!old_page->mapping) {
2568 ret = 0;
2569 unlock_page(old_page);
2570 goto unwritable_page;
2571 }
2572 } else
2573 VM_BUG_ON(!PageLocked(old_page));
2574
2575
2576
2577
2578
2579
2580
2581 page_table = pte_offset_map_lock(mm, pmd, address,
2582 &ptl);
2583 if (!pte_same(*page_table, orig_pte)) {
2584 unlock_page(old_page);
2585 goto unlock;
2586 }
2587
2588 page_mkwrite = 1;
2589 }
2590 dirty_page = old_page;
2591 get_page(dirty_page);
2592
2593reuse:
2594 flush_cache_page(vma, address, pte_pfn(orig_pte));
2595 entry = pte_mkyoung(orig_pte);
2596 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2597 if (ptep_set_access_flags(vma, address, page_table, entry,1))
2598 update_mmu_cache(vma, address, page_table);
2599 pte_unmap_unlock(page_table, ptl);
2600 ret |= VM_FAULT_WRITE;
2601
2602 if (!dirty_page)
2603 return ret;
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613 if (!page_mkwrite) {
2614 wait_on_page_locked(dirty_page);
2615 set_page_dirty_balance(dirty_page, page_mkwrite);
2616 }
2617 put_page(dirty_page);
2618 if (page_mkwrite) {
2619 struct address_space *mapping = dirty_page->mapping;
2620
2621 set_page_dirty(dirty_page);
2622 unlock_page(dirty_page);
2623 page_cache_release(dirty_page);
2624 if (mapping) {
2625
2626
2627
2628
2629 balance_dirty_pages_ratelimited(mapping);
2630 }
2631 }
2632
2633
2634 if (vma->vm_file)
2635 file_update_time(vma->vm_file);
2636
2637 return ret;
2638 }
2639
2640
2641
2642
2643 page_cache_get(old_page);
2644gotten:
2645 pte_unmap_unlock(page_table, ptl);
2646
2647 if (unlikely(anon_vma_prepare(vma)))
2648 goto oom;
2649
2650 if (is_zero_pfn(pte_pfn(orig_pte))) {
2651 new_page = alloc_zeroed_user_highpage_movable(vma, address);
2652 if (!new_page)
2653 goto oom;
2654 } else {
2655 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
2656 if (!new_page)
2657 goto oom;
2658 cow_user_page(new_page, old_page, address, vma);
2659 }
2660 __SetPageUptodate(new_page);
2661
2662 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
2663 goto oom_free_new;
2664
2665
2666
2667
2668 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2669 if (likely(pte_same(*page_table, orig_pte))) {
2670 if (old_page) {
2671 if (!PageAnon(old_page)) {
2672 dec_mm_counter_fast(mm, MM_FILEPAGES);
2673 inc_mm_counter_fast(mm, MM_ANONPAGES);
2674 }
2675 } else
2676 inc_mm_counter_fast(mm, MM_ANONPAGES);
2677 flush_cache_page(vma, address, pte_pfn(orig_pte));
2678 entry = mk_pte(new_page, vma->vm_page_prot);
2679 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2680
2681
2682
2683
2684
2685
2686 ptep_clear_flush(vma, address, page_table);
2687 page_add_new_anon_rmap(new_page, vma, address);
2688
2689
2690
2691
2692
2693 set_pte_at_notify(mm, address, page_table, entry);
2694 update_mmu_cache(vma, address, page_table);
2695 if (old_page) {
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718 page_remove_rmap(old_page);
2719 }
2720
2721
2722 new_page = old_page;
2723 ret |= VM_FAULT_WRITE;
2724 } else
2725 mem_cgroup_uncharge_page(new_page);
2726
2727 if (new_page)
2728 page_cache_release(new_page);
2729unlock:
2730 pte_unmap_unlock(page_table, ptl);
2731 if (old_page) {
2732
2733
2734
2735
2736 if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
2737 lock_page(old_page);
2738 munlock_vma_page(old_page);
2739 unlock_page(old_page);
2740 }
2741 page_cache_release(old_page);
2742 }
2743 return ret;
2744oom_free_new:
2745 page_cache_release(new_page);
2746oom:
2747 if (old_page) {
2748 if (page_mkwrite) {
2749 unlock_page(old_page);
2750 page_cache_release(old_page);
2751 }
2752 page_cache_release(old_page);
2753 }
2754 return VM_FAULT_OOM;
2755
2756unwritable_page:
2757 page_cache_release(old_page);
2758 return ret;
2759}
2760
2761static void unmap_mapping_range_vma(struct vm_area_struct *vma,
2762 unsigned long start_addr, unsigned long end_addr,
2763 struct zap_details *details)
2764{
2765 zap_page_range(vma, start_addr, end_addr - start_addr, details);
2766}
2767
2768static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
2769 struct zap_details *details)
2770{
2771 struct vm_area_struct *vma;
2772 struct prio_tree_iter iter;
2773 pgoff_t vba, vea, zba, zea;
2774
2775 vma_prio_tree_foreach(vma, &iter, root,
2776 details->first_index, details->last_index) {
2777
2778 vba = vma->vm_pgoff;
2779 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
2780
2781 zba = details->first_index;
2782 if (zba < vba)
2783 zba = vba;
2784 zea = details->last_index;
2785 if (zea > vea)
2786 zea = vea;
2787
2788 unmap_mapping_range_vma(vma,
2789 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
2790 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
2791 details);
2792 }
2793}
2794
2795static inline void unmap_mapping_range_list(struct list_head *head,
2796 struct zap_details *details)
2797{
2798 struct vm_area_struct *vma;
2799
2800
2801
2802
2803
2804
2805
2806 list_for_each_entry(vma, head, shared.vm_set.list) {
2807 details->nonlinear_vma = vma;
2808 unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
2809 }
2810}
2811

/**
 * unmap_mapping_range - unmap the portion of all mmaps in the specified
 * address_space corresponding to the specified page range in the
 * underlying file.
 * @mapping: the address space containing mmaps to be unmapped.
 * @holebegin: byte in first page to unmap, relative to the start of
 * the underlying file.  This will be rounded down to a PAGE_SIZE
 * boundary.  Note that this is different from truncate_pagecache(),
 * which must keep the partial page.  In contrast, we must get rid of
 * partial pages.
 * @holelen: size of prospective hole in bytes.  This will be rounded
 * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
 * end of the file.
 * @even_cows: 1 when truncating a file, unmap even private COWed pages;
 * but 0 when invalidating pagecache, don't throw away private data.
 */
2826void unmap_mapping_range(struct address_space *mapping,
2827 loff_t const holebegin, loff_t const holelen, int even_cows)
2828{
2829 struct zap_details details;
2830 pgoff_t hba = holebegin >> PAGE_SHIFT;
2831 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2832
2833
2834 if (sizeof(holelen) > sizeof(hlen)) {
2835 long long holeend =
2836 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2837 if (holeend & ~(long long)ULONG_MAX)
2838 hlen = ULONG_MAX - hba + 1;
2839 }
2840
2841 details.check_mapping = even_cows? NULL: mapping;
2842 details.nonlinear_vma = NULL;
2843 details.first_index = hba;
2844 details.last_index = hba + hlen - 1;
2845 if (details.last_index < details.first_index)
2846 details.last_index = ULONG_MAX;
2847
2848
2849 mutex_lock(&mapping->i_mmap_mutex);
2850 if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
2851 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2852 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
2853 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
2854 mutex_unlock(&mapping->i_mmap_mutex);
2855}
2856EXPORT_SYMBOL(unmap_mapping_range);
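/*
 * Illustrative use (a sketch of the classic truncate-time pattern, roughly
 * what truncate_pagecache() does; error handling and the second unmap for
 * racing COWs are omitted):
 *
 *	unmap_mapping_range(inode->i_mapping, newsize + PAGE_SIZE - 1, 0, 1);
 *	truncate_inode_pages(inode->i_mapping, newsize);
 *
 * Passing holebegin just short of the next page boundary keeps the partial
 * page at the new EOF mapped, holelen == 0 means "to the end of the file",
 * and even_cows == 1 throws away private COWed copies as truncation must.
 */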
2857
/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes but allow
 * concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */
2863static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2864 unsigned long address, pte_t *page_table, pmd_t *pmd,
2865 unsigned int flags, pte_t orig_pte)
2866{
2867 spinlock_t *ptl;
2868 struct page *page, *swapcache = NULL;
2869 swp_entry_t entry;
2870 pte_t pte;
2871 int locked;
2872 struct mem_cgroup *ptr;
2873 int exclusive = 0;
2874 int ret = 0;
2875
2876 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
2877 goto out;
2878
2879 entry = pte_to_swp_entry(orig_pte);
2880 if (unlikely(non_swap_entry(entry))) {
2881 if (is_migration_entry(entry)) {
2882 migration_entry_wait(mm, pmd, address);
2883 } else if (is_hwpoison_entry(entry)) {
2884 ret = VM_FAULT_HWPOISON;
2885 } else {
2886 print_bad_pte(vma, address, orig_pte, NULL);
2887 ret = VM_FAULT_SIGBUS;
2888 }
2889 goto out;
2890 }
2891 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
2892 page = lookup_swap_cache(entry);
2893 if (!page) {
2894 grab_swap_token(mm);
2895 page = swapin_readahead(entry,
2896 GFP_HIGHUSER_MOVABLE, vma, address);
2897 if (!page) {
			/*
			 * Back out if somebody else faulted in this pte
			 * while we released the pte lock.
			 */
2902 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2903 if (likely(pte_same(*page_table, orig_pte)))
2904 ret = VM_FAULT_OOM;
2905 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2906 goto unlock;
2907 }
2908
		/* Had to read the page in from the swap area: major fault. */
2910 ret = VM_FAULT_MAJOR;
2911 count_vm_event(PGMAJFAULT);
2912 mem_cgroup_count_vm_event(mm, PGMAJFAULT);
2913 } else if (PageHWPoison(page)) {
		/*
		 * hwpoisoned dirty swapcache pages are kept for killing
		 * their own processes; back out and report the poison.
		 */
2918 ret = VM_FAULT_HWPOISON;
2919 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2920 goto out_release;
2921 }
2922
2923 locked = lock_page_or_retry(page, mm, flags);
2924 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2925 if (!locked) {
2926 ret |= VM_FAULT_RETRY;
2927 goto out_release;
2928 }
2929
	/*
	 * Make sure try_to_free_swap or reuse_swap_page or swapoff did
	 * not release the swapcache from under us.  The page pin and the
	 * pte_same test below are not enough to exclude that: even if
	 * the page is still swapcache, its swap entry could have changed.
	 */
2936 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
2937 goto out_page;
2938
2939 if (ksm_might_need_to_copy(page, vma, address)) {
2940 swapcache = page;
2941 page = ksm_does_need_to_copy(page, vma, address);
2942
2943 if (unlikely(!page)) {
2944 ret = VM_FAULT_OOM;
2945 page = swapcache;
2946 swapcache = NULL;
2947 goto out_page;
2948 }
2949 }
2950
2951 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
2952 ret = VM_FAULT_OOM;
2953 goto out_page;
2954 }
2955
	/*
	 * Back out if somebody else already faulted in this pte.
	 */
2959 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2960 if (unlikely(!pte_same(*page_table, orig_pte)))
2961 goto out_nomap;
2962
2963 if (unlikely(!PageUptodate(page))) {
2964 ret = VM_FAULT_SIGBUS;
2965 goto out_nomap;
2966 }
2967
	/*
	 * The page isn't present yet, go ahead with the fault.
	 *
	 * Be careful about the ordering below: reuse_swap_page() must be
	 * called while the page is counted on swap but not yet in the
	 * mapcount, i.e. before do_page_add_anon_rmap() and swap_free();
	 * try_to_free_swap() only works after swap_free(); and the memcg
	 * swapin charge is committed only once the rmap is in place.
	 */
2982 inc_mm_counter_fast(mm, MM_ANONPAGES);
2983 dec_mm_counter_fast(mm, MM_SWAPENTS);
2984 pte = mk_pte(page, vma->vm_page_prot);
2985 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
2986 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
2987 flags &= ~FAULT_FLAG_WRITE;
2988 ret |= VM_FAULT_WRITE;
2989 exclusive = 1;
2990 }
2991 flush_icache_page(vma, page);
2992 set_pte_at(mm, address, page_table, pte);
2993 do_page_add_anon_rmap(page, vma, address, exclusive);
	/* It's better to commit the memcg charge after the rmap is established. */
2995 mem_cgroup_commit_charge_swapin(page, ptr);
2996
2997 swap_free(entry);
2998 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
2999 try_to_free_swap(page);
3000 unlock_page(page);
3001 if (swapcache) {
		/*
		 * The original swapcache page was kept locked to stop
		 * its swap entry from being reused before the pte_same()
		 * check above; for extra safety only drop the lock after
		 * swap_free(), so the swap count cannot change under a
		 * parallel locked swapcache.
		 */
3010 unlock_page(swapcache);
3011 page_cache_release(swapcache);
3012 }
3013
3014 if (flags & FAULT_FLAG_WRITE) {
3015 ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
3016 if (ret & VM_FAULT_ERROR)
3017 ret &= VM_FAULT_ERROR;
3018 goto out;
3019 }
3020
	/* No need to invalidate: the pte was not present before. */
3022 update_mmu_cache(vma, address, page_table);
3023unlock:
3024 pte_unmap_unlock(page_table, ptl);
3025out:
3026 return ret;
3027out_nomap:
3028 mem_cgroup_cancel_charge_swapin(ptr);
3029 pte_unmap_unlock(page_table, ptl);
3030out_page:
3031 unlock_page(page);
3032out_release:
3033 page_cache_release(page);
3034 if (swapcache) {
3035 unlock_page(swapcache);
3036 page_cache_release(swapcache);
3037 }
3038 return ret;
3039}
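/*
 * Note on the VM_FAULT_MAJOR accounting above: the major/minor split is
 * visible to userspace (illustrative sketch):
 *
 *	struct rusage ru;
 *	getrusage(RUSAGE_SELF, &ru);
 *	// ru.ru_majflt counts faults that had to do I/O, such as the
 *	// swapin_readahead() path above; ru.ru_minflt counts the rest.
 *
 * The same event also shows up as pgmajfault in /proc/vmstat.
 */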
3040
/*
 * This is like a special single-page "expand_{down|up}wards()": before
 * using the page at the very edge of the vma we must make sure that
 * address +/- PAGE_SIZE does not run into a foreign vma.
 */
3046static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
3047{
3048 address &= PAGE_MASK;
3049 if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) {
3050 struct vm_area_struct *prev = vma->vm_prev;

		/*
		 * Is there a mapping abutting this one below?
		 *
		 * That is only ok if it is the same stack mapping that
		 * has simply been split; anything else must keep its
		 * distance from the stack.
		 */
3058 if (prev && prev->vm_end == address)
3059 return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
3060
3061 expand_downwards(vma, address - PAGE_SIZE);
3062 }
3063 if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
3064 struct vm_area_struct *next = vma->vm_next;
3065
		/* As the VM_GROWSDOWN case above, but for an upwards-growing stack. */
3067 if (next && next->vm_start == address + PAGE_SIZE)
3068 return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
3069
3070 expand_upwards(vma, address + PAGE_SIZE);
3071 }
3072 return 0;
3073}
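/*
 * Worked example for the VM_GROWSDOWN case above (sketch, addresses made
 * up): with a stack vma covering [0x7fff0000, 0x80000000), the first touch
 * of an address inside the lowest page (0x7fff0xxx) has
 * address == vma->vm_start, so expand_downwards() is called for
 * 0x7ffef000.  The vma then starts one page lower and that untouched
 * bottom page acts as the guard page separating the stack from whatever
 * is mapped below it.
 */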
3074
/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes but allow
 * concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */
3080static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
3081 unsigned long address, pte_t *page_table, pmd_t *pmd,
3082 unsigned int flags)
3083{
3084 struct page *page;
3085 spinlock_t *ptl;
3086 pte_t entry;
3087
3088 pte_unmap(page_table);
3089
	/* Check whether the stack needs an extra guard page first. */
3091 if (check_stack_guard_page(vma, address) < 0)
3092 return VM_FAULT_SIGBUS;
3093
	/* Use the shared zero page for read faults. */
3095 if (!(flags & FAULT_FLAG_WRITE)) {
3096 entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
3097 vma->vm_page_prot));
3098 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
3099 if (!pte_none(*page_table))
3100 goto unlock;
3101 goto setpte;
3102 }
3103
	/* Allocate our own private, zeroed page. */
3105 if (unlikely(anon_vma_prepare(vma)))
3106 goto oom;
3107 page = alloc_zeroed_user_highpage_movable(vma, address);
3108 if (!page)
3109 goto oom;
3110 __SetPageUptodate(page);
3111
3112 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
3113 goto oom_free_page;
3114
3115 entry = mk_pte(page, vma->vm_page_prot);
3116 if (vma->vm_flags & VM_WRITE)
3117 entry = pte_mkwrite(pte_mkdirty(entry));
3118
3119 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
3120 if (!pte_none(*page_table))
3121 goto release;
3122
3123 inc_mm_counter_fast(mm, MM_ANONPAGES);
3124 page_add_new_anon_rmap(page, vma, address);
3125setpte:
3126 set_pte_at(mm, address, page_table, entry);
3127
	/* No need to invalidate: the pte was not present before. */
3129 update_mmu_cache(vma, address, page_table);
3130unlock:
3131 pte_unmap_unlock(page_table, ptl);
3132 return 0;
3133release:
3134 mem_cgroup_uncharge_page(page);
3135 page_cache_release(page);
3136 goto unlock;
3137oom_free_page:
3138 page_cache_release(page);
3139oom:
3140 return VM_FAULT_OOM;
3141}
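/*
 * Illustrative userspace view of the paths above (sketch):
 *
 *	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	char c = p[0];	// read fault: maps the shared zero page, no allocation
 *	p[0] = 1;	// write fault: do_wp_page() copies off the zero page
 *
 * Only the second access allocates real memory and bumps MM_ANONPAGES;
 * a first-touch write skips the zero page and allocates directly above.
 */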
3142
/*
 * __do_fault() tries to create a new page mapping.  It aggressively tries
 * to share with existing pages, but makes a separate copy when
 * FAULT_FLAG_WRITE is set on a private mapping, to avoid another fault on
 * the next write access.
 *
 * As this is called only for pages that do not currently exist, we do not
 * need to flush old virtual caches or the TLB.
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes but allow
 * concurrent faults), and pte neither mapped nor locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */
3156static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3157 unsigned long address, pmd_t *pmd,
3158 pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
3159{
3160 pte_t *page_table;
3161 spinlock_t *ptl;
3162 struct page *page;
3163 pte_t entry;
3164 int anon = 0;
3165 int charged = 0;
3166 struct page *dirty_page = NULL;
3167 struct vm_fault vmf;
3168 int ret;
3169 int page_mkwrite = 0;
3170
3171 vmf.virtual_address = (void __user *)(address & PAGE_MASK);
3172 vmf.pgoff = pgoff;
3173 vmf.flags = flags;
3174 vmf.page = NULL;
3175
3176 ret = vma->vm_ops->fault(vma, &vmf);
3177 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
3178 VM_FAULT_RETRY)))
3179 return ret;
3180
3181 if (unlikely(PageHWPoison(vmf.page))) {
3182 if (ret & VM_FAULT_LOCKED)
3183 unlock_page(vmf.page);
3184 return VM_FAULT_HWPOISON;
3185 }
3186
	/*
	 * For consistency in subsequent calls, make the faulted page
	 * always locked.
	 */
3191 if (unlikely(!(ret & VM_FAULT_LOCKED)))
3192 lock_page(vmf.page);
3193 else
3194 VM_BUG_ON(!PageLocked(vmf.page));
3195
	/*
	 * Should we do an early COW break?
	 */
3199 page = vmf.page;
3200 if (flags & FAULT_FLAG_WRITE) {
3201 if (!(vma->vm_flags & VM_SHARED)) {
3202 anon = 1;
3203 if (unlikely(anon_vma_prepare(vma))) {
3204 ret = VM_FAULT_OOM;
3205 goto out;
3206 }
3207 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
3208 vma, address);
3209 if (!page) {
3210 ret = VM_FAULT_OOM;
3211 goto out;
3212 }
3213 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
3214 ret = VM_FAULT_OOM;
3215 page_cache_release(page);
3216 goto out;
3217 }
3218 charged = 1;
3219 copy_user_highpage(page, vmf.page, address, vma);
3220 __SetPageUptodate(page);
3221 } else {
			/*
			 * If the page will be shareable, see if the
			 * backing address space wants to know that the
			 * page is about to become writable.
			 */
3227 if (vma->vm_ops->page_mkwrite) {
3228 int tmp;
3229
3230 unlock_page(page);
3231 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
3232 tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
3233 if (unlikely(tmp &
3234 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
3235 ret = tmp;
3236 goto unwritable_page;
3237 }
3238 if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
3239 lock_page(page);
3240 if (!page->mapping) {
3241 ret = 0;
3242 unlock_page(page);
3243 goto unwritable_page;
3244 }
3245 } else
3246 VM_BUG_ON(!PageLocked(page));
3247 page_mkwrite = 1;
3248 }
3249 }
3250
3251 }
3252
3253 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
3254
	/*
	 * This silly early PAGE_DIRTY setting removes a race due to the
	 * bad i386 page protection, but it is valid for other
	 * architectures too.
	 *
	 * Note that if FAULT_FLAG_WRITE is set, we either now have an
	 * exclusive copy of the page, or this is a shared mapping, so we
	 * can make it writable and dirty to avoid having to handle that
	 * later.
	 */
3266 if (likely(pte_same(*page_table, orig_pte))) {
3267 flush_icache_page(vma, page);
3268 entry = mk_pte(page, vma->vm_page_prot);
3269 if (flags & FAULT_FLAG_WRITE)
3270 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
3271 if (anon) {
3272 inc_mm_counter_fast(mm, MM_ANONPAGES);
3273 page_add_new_anon_rmap(page, vma, address);
3274 } else {
3275 inc_mm_counter_fast(mm, MM_FILEPAGES);
3276 page_add_file_rmap(page);
3277 if (flags & FAULT_FLAG_WRITE) {
3278 dirty_page = page;
3279 get_page(dirty_page);
3280 }
3281 }
3282 set_pte_at(mm, address, page_table, entry);
3283
		/* No need to invalidate: a not-present page won't be cached. */
3285 update_mmu_cache(vma, address, page_table);
3286 } else {
3287 if (charged)
3288 mem_cgroup_uncharge_page(page);
3289 if (anon)
3290 page_cache_release(page);
3291 else
3292 anon = 1;
3293 }
3294
3295 pte_unmap_unlock(page_table, ptl);
3296
3297out:
3298 if (dirty_page) {
3299 struct address_space *mapping = page->mapping;
3300
3301 if (set_page_dirty(dirty_page))
3302 page_mkwrite = 1;
3303 unlock_page(dirty_page);
3304 put_page(dirty_page);
3305 if (page_mkwrite && mapping) {
			/*
			 * Some device drivers do not set page->mapping
			 * but still dirty their pages.
			 */
3310 balance_dirty_pages_ratelimited(mapping);
3311 }
3312
		/* Update the file time outside the page lock. */
3314 if (vma->vm_file)
3315 file_update_time(vma->vm_file);
3316 } else {
3317 unlock_page(vmf.page);
3318 if (anon)
3319 page_cache_release(vmf.page);
3320 }
3321
3322 return ret;
3323
3324unwritable_page:
3325 page_cache_release(page);
3326 return ret;
3327}
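/*
 * Sketch of the contract __do_fault() relies on, seen from a hypothetical
 * driver's ->fault handler (illustrative only, not a real driver):
 *
 *	static int example_fault(struct vm_area_struct *vma,
 *				 struct vm_fault *vmf)
 *	{
 *		struct page *page = example_lookup(vmf->pgoff); // hypothetical
 *		if (!page)
 *			return VM_FAULT_SIGBUS;
 *		get_page(page);		// __do_fault() drops this reference
 *		vmf->page = page;
 *		return 0;		// or VM_FAULT_LOCKED if already locked
 *	}
 *
 * __do_fault() then locks the page if needed, does the early COW copy for
 * private writes, calls ->page_mkwrite for shared writes, and installs the
 * pte under the page table lock.
 */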
3328
3329static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3330 unsigned long address, pte_t *page_table, pmd_t *pmd,
3331 unsigned int flags, pte_t orig_pte)
3332{
3333 pgoff_t pgoff = (((address & PAGE_MASK)
3334 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
3335
3336 pte_unmap(page_table);
3337 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3338}
3339
/*
 * Fault of a previously existing named mapping.  Repopulate the pte from
 * the encoded file pte if possible; this is what makes nonlinear vmas
 * swappable.
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes but allow
 * concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */
3349static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3350 unsigned long address, pte_t *page_table, pmd_t *pmd,
3351 unsigned int flags, pte_t orig_pte)
3352{
3353 pgoff_t pgoff;
3354
3355 flags |= FAULT_FLAG_NONLINEAR;
3356
3357 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
3358 return 0;
3359
3360 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
		/*
		 * Page table corrupted: show the pte and kill the process.
		 */
3364 print_bad_pte(vma, address, orig_pte, NULL);
3365 return VM_FAULT_SIGBUS;
3366 }
3367
3368 pgoff = pte_to_pgoff(orig_pte);
3369 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3370}
3371
/*
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures).  The early dirtying is also good on i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes but allow
 * concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */
3385int handle_pte_fault(struct mm_struct *mm,
3386 struct vm_area_struct *vma, unsigned long address,
3387 pte_t *pte, pmd_t *pmd, unsigned int flags)
3388{
3389 pte_t entry;
3390 spinlock_t *ptl;
3391
3392 entry = *pte;
3393 if (!pte_present(entry)) {
3394 if (pte_none(entry)) {
3395 if (vma->vm_ops) {
3396 if (likely(vma->vm_ops->fault))
3397 return do_linear_fault(mm, vma, address,
3398 pte, pmd, flags, entry);
3399 }
3400 return do_anonymous_page(mm, vma, address,
3401 pte, pmd, flags);
3402 }
3403 if (pte_file(entry))
3404 return do_nonlinear_fault(mm, vma, address,
3405 pte, pmd, flags, entry);
3406 return do_swap_page(mm, vma, address,
3407 pte, pmd, flags, entry);
3408 }
3409
3410 ptl = pte_lockptr(mm, pmd);
3411 spin_lock(ptl);
3412 if (unlikely(!pte_same(*pte, entry)))
3413 goto unlock;
3414 if (flags & FAULT_FLAG_WRITE) {
3415 if (!pte_write(entry))
3416 return do_wp_page(mm, vma, address,
3417 pte, pmd, ptl, entry);
3418 entry = pte_mkdirty(entry);
3419 }
3420 entry = pte_mkyoung(entry);
3421 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
3422 update_mmu_cache(vma, address, pte);
3423 } else {
		/*
		 * This is needed only for protection faults, but the
		 * arch code is not yet telling us whether this is one.
		 * It still avoids useless TLB flushes for .text page
		 * faults with threads.
		 */
3430 if (flags & FAULT_FLAG_WRITE)
3431 flush_tlb_fix_spurious_fault(vma, address);
3432 }
3433unlock:
3434 pte_unmap_unlock(pte, ptl);
3435 return 0;
3436}
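/*
 * Quick reference for the dispatch above (descriptive, mirrors the code):
 *
 *	pte_none, vma->vm_ops->fault set	do_linear_fault()
 *	pte_none otherwise			do_anonymous_page()
 *	!pte_present, pte_file			do_nonlinear_fault()
 *	!pte_present, swap entry		do_swap_page()
 *	present, write to read-only pte		do_wp_page()
 *	present otherwise			access/dirty bit fixup above
 */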
3437
/*
 * By the time we get here, we already hold the mm semaphore.
 */
3441int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3442 unsigned long address, unsigned int flags)
3443{
3444 pgd_t *pgd;
3445 pud_t *pud;
3446 pmd_t *pmd;
3447 pte_t *pte;
3448
3449 __set_current_state(TASK_RUNNING);
3450
3451 count_vm_event(PGFAULT);
3452 mem_cgroup_count_vm_event(mm, PGFAULT);
3453
	/* Do the per-task RSS counter sync before the really critical section. */
3455 check_sync_rss_stat(current);
3456
3457 if (unlikely(is_vm_hugetlb_page(vma)))
3458 return hugetlb_fault(mm, vma, address, flags);
3459
3460 pgd = pgd_offset(mm, address);
3461 pud = pud_alloc(mm, pgd, address);
3462 if (!pud)
3463 return VM_FAULT_OOM;
3464 pmd = pmd_alloc(mm, pud, address);
3465 if (!pmd)
3466 return VM_FAULT_OOM;
3467 if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
3468 if (!vma->vm_ops)
3469 return do_huge_pmd_anonymous_page(mm, vma, address,
3470 pmd, flags);
3471 } else {
3472 pmd_t orig_pmd = *pmd;
3473 barrier();
3474 if (pmd_trans_huge(orig_pmd)) {
3475 if (flags & FAULT_FLAG_WRITE &&
3476 !pmd_write(orig_pmd) &&
3477 !pmd_trans_splitting(orig_pmd))
3478 return do_huge_pmd_wp_page(mm, vma, address,
3479 pmd, orig_pmd);
3480 return 0;
3481 }
3482 }
	/*
	 * Use __pte_alloc instead of pte_alloc_map, because we can't run
	 * pte_offset_map on the pmd if a huge pmd could materialize from
	 * under us from a different thread.
	 */
3489 if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address))
3490 return VM_FAULT_OOM;
	/* If a huge pmd materialized from under us, just retry later. */
3492 if (unlikely(pmd_trans_huge(*pmd)))
3493 return 0;
3494
	/*
	 * A regular pmd is established and it can't morph into a huge
	 * pmd from under us anymore at this point, because we hold
	 * mmap_sem for read and khugepaged takes it for write.  So it is
	 * now safe to run pte_offset_map().
	 */
3500 pte = pte_offset_map(pmd, address);
3501
3502 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
3503}
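/*
 * Illustrative caller (a simplified sketch of what an architecture's page
 * fault handler does; names and error handling are not from any real arch):
 *
 *	down_read(&mm->mmap_sem);
 *	vma = find_vma(mm, address);
 *	if (vma && vma->vm_start <= address) {
 *		fault = handle_mm_fault(mm, vma, address,
 *					write ? FAULT_FLAG_WRITE : 0);
 *		if (fault & VM_FAULT_OOM) {
 *			// invoke the OOM path
 *		} else if (fault & (VM_FAULT_SIGBUS | VM_FAULT_HWPOISON)) {
 *			// deliver SIGBUS to the task
 *		} else if (fault & VM_FAULT_MAJOR) {
 *			tsk->maj_flt++;
 *		}
 *	}
 *	up_read(&mm->mmap_sem);
 */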
3504
3505#ifndef __PAGETABLE_PUD_FOLDED
/*
 * Allocate a page upper directory entry.
 * The fast path has already been handled inline by the caller.
 */
3510int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
3511{
3512 pud_t *new = pud_alloc_one(mm, address);
3513 if (!new)
3514 return -ENOMEM;
3515
3516 smp_wmb(); /* See comment in __pte_alloc */
3517
3518 spin_lock(&mm->page_table_lock);
3519 if (pgd_present(*pgd))
3520 pud_free(mm, new);
3521 else
3522 pgd_populate(mm, pgd, new);
3523 spin_unlock(&mm->page_table_lock);
3524 return 0;
3525}
3526#endif
3527
3528#ifndef __PAGETABLE_PMD_FOLDED
/*
 * Allocate a page middle directory entry.
 * The fast path has already been handled inline by the caller.
 */
3533int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
3534{
3535 pmd_t *new = pmd_alloc_one(mm, address);
3536 if (!new)
3537 return -ENOMEM;
3538
3539 smp_wmb(); /* See comment in __pte_alloc */
3540
3541 spin_lock(&mm->page_table_lock);
3542#ifndef __ARCH_HAS_4LEVEL_HACK
3543 if (pud_present(*pud))
3544 pmd_free(mm, new);
3545 else
3546 pud_populate(mm, pud, new);
3547#else
3548 if (pgd_present(*pud))
3549 pmd_free(mm, new);
3550 else
3551 pgd_populate(mm, pud, new);
3552#endif
3553 spin_unlock(&mm->page_table_lock);
3554 return 0;
3555}
3556#endif
3557
3558int make_pages_present(unsigned long addr, unsigned long end)
3559{
3560 int ret, len, write;
3561 struct vm_area_struct * vma;
3562
3563 vma = find_vma(current->mm, addr);
3564 if (!vma)
3565 return -ENOMEM;
	/*
	 * We want to touch writable mappings with a write fault in order
	 * to break COW, except for shared mappings because these don't
	 * COW and we would not want to dirty them for nothing.
	 */
3571 write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
3572 BUG_ON(addr >= end);
3573 BUG_ON(end > vma->vm_end);
3574 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
3575 ret = get_user_pages(current, current->mm, addr,
3576 len, write, 0, NULL, NULL);
3577 if (ret < 0)
3578 return ret;
3579 return ret == len ? 0 : -EFAULT;
3580}
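/*
 * Illustrative use (sketch): a caller that needs a user range resident,
 * for example an mlock-style or MAP_POPULATE-style path, would simply do
 *
 *	error = make_pages_present(start, start + len);
 *
 * which fakes one fault per page via get_user_pages() above, write-faulting
 * only where that does not needlessly COW or dirty a shared mapping.
 */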
3581
3582#if !defined(__HAVE_ARCH_GATE_AREA)
3583
3584#if defined(AT_SYSINFO_EHDR)
3585static struct vm_area_struct gate_vma;
3586
3587static int __init gate_vma_init(void)
3588{
3589 gate_vma.vm_mm = NULL;
3590 gate_vma.vm_start = FIXADDR_USER_START;
3591 gate_vma.vm_end = FIXADDR_USER_END;
3592 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
3593 gate_vma.vm_page_prot = __P101;
	/*
	 * Make sure the vDSO gets into every core dump: dumping its
	 * contents makes the dump fully interpretable post-mortem
	 * without having to match up the same kernel and hardware
	 * config to see what the PC values meant.
	 */
3600 gate_vma.vm_flags |= VM_ALWAYSDUMP;
3601 return 0;
3602}
3603__initcall(gate_vma_init);
3604#endif
3605
3606struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
3607{
3608#ifdef AT_SYSINFO_EHDR
3609 return &gate_vma;
3610#else
3611 return NULL;
3612#endif
3613}
3614
3615int in_gate_area_no_mm(unsigned long addr)
3616{
3617#ifdef AT_SYSINFO_EHDR
3618 if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END))
3619 return 1;
3620#endif
3621 return 0;
3622}
3623
3624#endif
3625
3626static int __follow_pte(struct mm_struct *mm, unsigned long address,
3627 pte_t **ptepp, spinlock_t **ptlp)
3628{
3629 pgd_t *pgd;
3630 pud_t *pud;
3631 pmd_t *pmd;
3632 pte_t *ptep;
3633
3634 pgd = pgd_offset(mm, address);
3635 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
3636 goto out;
3637
3638 pud = pud_offset(pgd, address);
3639 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
3640 goto out;
3641
3642 pmd = pmd_offset(pud, address);
3643 VM_BUG_ON(pmd_trans_huge(*pmd));
3644 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
3645 goto out;
3646
	/* We cannot handle huge page PFN maps; luckily they don't exist. */
3648 if (pmd_huge(*pmd))
3649 goto out;
3650
3651 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
3652 if (!ptep)
3653 goto out;
3654 if (!pte_present(*ptep))
3655 goto unlock;
3656 *ptepp = ptep;
3657 return 0;
3658unlock:
3659 pte_unmap_unlock(ptep, *ptlp);
3660out:
3661 return -EINVAL;
3662}
3663
3664static inline int follow_pte(struct mm_struct *mm, unsigned long address,
3665 pte_t **ptepp, spinlock_t **ptlp)
3666{
3667 int res;
3668
	/* The (void) cast is needed to keep gcc happy about __cond_lock(). */
3670 (void) __cond_lock(*ptlp,
3671 !(res = __follow_pte(mm, address, ptepp, ptlp)));
3672 return res;
3673}
3674
/**
 * follow_pfn - look up the PFN at a user virtual address
 * @vma: the memory mapping
 * @address: user virtual address
 * @pfn: location in which to store the found PFN
 *
 * Only IO mappings and raw PFN mappings are allowed.
 *
 * Returns zero and the pfn at @pfn on success, -ve otherwise.
 */
3685int follow_pfn(struct vm_area_struct *vma, unsigned long address,
3686 unsigned long *pfn)
3687{
3688 int ret = -EINVAL;
3689 spinlock_t *ptl;
3690 pte_t *ptep;
3691
3692 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3693 return ret;
3694
3695 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
3696 if (ret)
3697 return ret;
3698 *pfn = pte_pfn(*ptep);
3699 pte_unmap_unlock(ptep, ptl);
3700 return 0;
3701}
3702EXPORT_SYMBOL(follow_pfn);
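/*
 * Illustrative use (sketch): a driver handed a user address that lies in
 * one of its own VM_PFNMAP mappings can recover the physical frame like
 * this (locking shown, error handling trimmed):
 *
 *	unsigned long pfn;
 *	down_read(&current->mm->mmap_sem);
 *	vma = find_vma(current->mm, uaddr);
 *	if (vma && vma->vm_start <= uaddr && !follow_pfn(vma, uaddr, &pfn))
 *		paddr = (resource_size_t)pfn << PAGE_SHIFT;
 *	up_read(&current->mm->mmap_sem);
 */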
3703
3704#ifdef CONFIG_HAVE_IOREMAP_PROT
3705int follow_phys(struct vm_area_struct *vma,
3706 unsigned long address, unsigned int flags,
3707 unsigned long *prot, resource_size_t *phys)
3708{
3709 int ret = -EINVAL;
3710 pte_t *ptep, pte;
3711 spinlock_t *ptl;
3712
3713 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3714 goto out;
3715
3716 if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
3717 goto out;
3718 pte = *ptep;
3719
3720 if ((flags & FOLL_WRITE) && !pte_write(pte))
3721 goto unlock;
3722
3723 *prot = pgprot_val(pte_pgprot(pte));
3724 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
3725
3726 ret = 0;
3727unlock:
3728 pte_unmap_unlock(ptep, ptl);
3729out:
3730 return ret;
3731}
3732
3733int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
3734 void *buf, int len, int write)
3735{
3736 resource_size_t phys_addr;
3737 unsigned long prot = 0;
3738 void __iomem *maddr;
3739 int offset = addr & (PAGE_SIZE-1);
3740
3741 if (follow_phys(vma, addr, write, &prot, &phys_addr))
3742 return -EINVAL;
3743
3744 maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
3745 if (write)
3746 memcpy_toio(maddr + offset, buf, len);
3747 else
3748 memcpy_fromio(buf, maddr + offset, len);
3749 iounmap(maddr);
3750
3751 return len;
3752}
3753#endif
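/*
 * Illustrative use of generic_access_phys() (sketch): a driver that maps
 * MMIO to userspace with remap_pfn_range() can let ptrace/gdb peek into
 * those mappings by wiring up
 *
 *	static const struct vm_operations_struct example_vm_ops = {
 *		.access = generic_access_phys,	// hypothetical driver
 *	};
 *
 * __access_remote_vm() below falls back to vma->vm_ops->access() whenever
 * get_user_pages() cannot pin a page, which is exactly the VM_IO case.
 */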
3754
/*
 * Access another process' address space as given in mm.  If non-NULL,
 * use the given task for page fault accounting.
 */
3759static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
3760 unsigned long addr, void *buf, int len, int write)
3761{
3762 struct vm_area_struct *vma;
3763 void *old_buf = buf;
3764
3765 down_read(&mm->mmap_sem);
	/* Ignore errors, just check how much was successfully transferred. */
3767 while (len) {
3768 int bytes, ret, offset;
3769 void *maddr;
3770 struct page *page = NULL;
3771
3772 ret = get_user_pages(tsk, mm, addr, 1,
3773 write, 1, &page, &vma);
3774 if (ret <= 0) {
			/*
			 * Check if this is a VM_IO | VM_PFNMAP VMA, which
			 * we can access using slightly different code.
			 */
3779#ifdef CONFIG_HAVE_IOREMAP_PROT
3780 vma = find_vma(mm, addr);
3781 if (!vma || vma->vm_start > addr)
3782 break;
3783 if (vma->vm_ops && vma->vm_ops->access)
3784 ret = vma->vm_ops->access(vma, addr, buf,
3785 len, write);
3786 if (ret <= 0)
3787#endif
3788 break;
3789 bytes = ret;
3790 } else {
3791 bytes = len;
3792 offset = addr & (PAGE_SIZE-1);
3793 if (bytes > PAGE_SIZE-offset)
3794 bytes = PAGE_SIZE-offset;
3795
3796 maddr = kmap(page);
3797 if (write) {
3798 copy_to_user_page(vma, page, addr,
3799 maddr + offset, buf, bytes);
3800 set_page_dirty_lock(page);
3801 } else {
3802 copy_from_user_page(vma, page, addr,
3803 buf, maddr + offset, bytes);
3804 }
3805 kunmap(page);
3806 page_cache_release(page);
3807 }
3808 len -= bytes;
3809 buf += bytes;
3810 addr += bytes;
3811 }
3812 up_read(&mm->mmap_sem);
3813
3814 return buf - old_buf;
3815}
3816
/**
 * access_remote_vm - access another process' address space
 * @mm:		the mm_struct of the target address space
 * @addr:	start address to access
 * @buf:	source or destination buffer
 * @len:	number of bytes to transfer
 * @write:	whether the access is a write
 *
 * The caller must hold a reference on @mm.
 */
3827int access_remote_vm(struct mm_struct *mm, unsigned long addr,
3828 void *buf, int len, int write)
3829{
3830 return __access_remote_vm(NULL, mm, addr, buf, len, write);
3831}
3832
/*
 * Access another process' address space.
 * The source/target buffer must be kernel space; do not walk the page
 * tables directly, use get_user_pages().
 */
3838int access_process_vm(struct task_struct *tsk, unsigned long addr,
3839 void *buf, int len, int write)
3840{
3841 struct mm_struct *mm;
3842 int ret;
3843
3844 mm = get_task_mm(tsk);
3845 if (!mm)
3846 return 0;
3847
3848 ret = __access_remote_vm(tsk, mm, addr, buf, len, write);
3849 mmput(mm);
3850
3851 return ret;
3852}
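/*
 * access_process_vm() is what backs ptrace(PTRACE_PEEKDATA/POKEDATA).
 * Illustrative kernel-side use, modelled on generic_ptrace_peekdata()
 * (sketch):
 *
 *	unsigned long word;
 *	int copied = access_process_vm(child, addr, &word, sizeof(word), 0);
 *	if (copied != sizeof(word))
 *		return -EIO;
 */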
3853
/*
 * Print the name of a VMA.
 */
3857void print_vma_addr(char *prefix, unsigned long ip)
3858{
3859 struct mm_struct *mm = current->mm;
3860 struct vm_area_struct *vma;
3861
	/*
	 * Do not print if we are in atomic contexts (in exception
	 * stacks, etc.):
	 */
3866 if (preempt_count())
3867 return;
3868
3869 down_read(&mm->mmap_sem);
3870 vma = find_vma(mm, ip);
3871 if (vma && vma->vm_file) {
3872 struct file *f = vma->vm_file;
3873 char *buf = (char *)__get_free_page(GFP_KERNEL);
3874 if (buf) {
3875 char *p, *s;
3876
3877 p = d_path(&f->f_path, buf, PAGE_SIZE);
3878 if (IS_ERR(p))
3879 p = "?";
3880 s = strrchr(p, '/');
3881 if (s)
3882 p = s+1;
3883 printk("%s%s[%lx+%lx]", prefix, p,
3884 vma->vm_start,
3885 vma->vm_end - vma->vm_start);
3886 free_page((unsigned long)buf);
3887 }
3888 }
3889 up_read(&current->mm->mmap_sem);
3890}
3891
3892#ifdef CONFIG_PROVE_LOCKING
3893void might_fault(void)
3894{
	/*
	 * Some code (nfs/sunrpc) uses socket ops on kernel memory while
	 * holding the mmap_sem; this is safe because kernel memory does
	 * not get paged out, so we will never actually fault, and the
	 * annotations below would only produce false positives.
	 */
3901 if (segment_eq(get_fs(), KERNEL_DS))
3902 return;
3903
3904 might_sleep();
3905
	/*
	 * It would be nicer to annotate only the paths that are not
	 * under pagefault_disable(), but that requires a larger audit
	 * and helpers like a get_user_atomic().
	 */
3910 if (!in_atomic() && current->mm)
3911 might_lock_read(&current->mm->mmap_sem);
3912}
3913EXPORT_SYMBOL(might_fault);
3914#endif
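/*
 * Illustrative reason for the annotations above (sketch of a buggy driver):
 *
 *	spin_lock(&lock);
 *	err = copy_from_user(dst, ubuf, len);	// may sleep on a page fault
 *	spin_unlock(&lock);
 *
 * Even if the copy never happens to fault during testing, might_sleep()
 * and might_lock_read() in might_fault() let the sleep-in-atomic checks
 * and lockdep flag the pattern.
 */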
3915
3916#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
3917static void clear_gigantic_page(struct page *page,
3918 unsigned long addr,
3919 unsigned int pages_per_huge_page)
3920{
3921 int i;
3922 struct page *p = page;
3923
3924 might_sleep();
3925 for (i = 0; i < pages_per_huge_page;
3926 i++, p = mem_map_next(p, page, i)) {
3927 cond_resched();
3928 clear_user_highpage(p, addr + i * PAGE_SIZE);
3929 }
3930}
3931void clear_huge_page(struct page *page,
3932 unsigned long addr, unsigned int pages_per_huge_page)
3933{
3934 int i;
3935
3936 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
3937 clear_gigantic_page(page, addr, pages_per_huge_page);
3938 return;
3939 }
3940
3941 might_sleep();
3942 for (i = 0; i < pages_per_huge_page; i++) {
3943 cond_resched();
3944 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
3945 }
3946}
3947
3948static void copy_user_gigantic_page(struct page *dst, struct page *src,
3949 unsigned long addr,
3950 struct vm_area_struct *vma,
3951 unsigned int pages_per_huge_page)
3952{
3953 int i;
3954 struct page *dst_base = dst;
3955 struct page *src_base = src;
3956
3957 for (i = 0; i < pages_per_huge_page; ) {
3958 cond_resched();
3959 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
3960
3961 i++;
3962 dst = mem_map_next(dst, dst_base, i);
3963 src = mem_map_next(src, src_base, i);
3964 }
3965}
3966
3967void copy_user_huge_page(struct page *dst, struct page *src,
3968 unsigned long addr, struct vm_area_struct *vma,
3969 unsigned int pages_per_huge_page)
3970{
3971 int i;
3972
3973 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
3974 copy_user_gigantic_page(dst, src, addr, vma,
3975 pages_per_huge_page);
3976 return;
3977 }
3978
3979 might_sleep();
3980 for (i = 0; i < pages_per_huge_page; i++) {
3981 cond_resched();
3982 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
3983 }
3984}
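/*
 * The "gigantic" variants above walk the pages with mem_map_next() rather
 * than plain pointer arithmetic because, for compound pages larger than
 * MAX_ORDER_NR_PAGES, the struct page array is not guaranteed to be
 * virtually contiguous across MAX_ORDER-aligned boundaries (e.g. with
 * SPARSEMEM without VMEMMAP).
 */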
3985#endif
3986