1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41#include <linux/kernel_stat.h>
42#include <linux/mm.h>
43#include <linux/hugetlb.h>
44#include <linux/mman.h>
45#include <linux/swap.h>
46#include <linux/highmem.h>
47#include <linux/pagemap.h>
48#include <linux/ksm.h>
49#include <linux/rmap.h>
50#include <linux/export.h>
51#include <linux/delayacct.h>
52#include <linux/init.h>
53#include <linux/writeback.h>
54#include <linux/memcontrol.h>
55#include <linux/mmu_notifier.h>
56#include <linux/kallsyms.h>
57#include <linux/swapops.h>
58#include <linux/elf.h>
59#include <linux/gfp.h>
60
61#include <asm/io.h>
62#include <asm/pgalloc.h>
63#include <asm/uaccess.h>
64#include <asm/tlb.h>
65#include <asm/tlbflush.h>
66#include <asm/pgtable.h>
67
68#include "internal.h"
69
#ifndef CONFIG_NEED_MULTIPLE_NODES
/*
 * These two globals exist (and are exported to modules) only when
 * CONFIG_NEED_MULTIPLE_NODES is not set.  Nothing in this part of the
 * file assigns them.
 */
unsigned long max_mapnr;
EXPORT_SYMBOL(max_mapnr);

struct page *mem_map;
EXPORT_SYMBOL(mem_map);
#endif
78
/*
 * Exported globals describing physical memory.  Neither is assigned in
 * this part of the file.
 * NOTE(review): high_memory presumably marks the end of directly mapped
 * low memory -- confirm against the architecture setup code.
 */
unsigned long num_physpages;
EXPORT_SYMBOL(num_physpages);

void *high_memory;
EXPORT_SYMBOL(high_memory);
91
92
93
94
95
96
97
/*
 * Address-space randomization setting.  The build-time default is 1 when
 * CONFIG_COMPAT_BRK is set and 2 otherwise; the "norandmaps" boot
 * parameter (disable_randmaps()) forces it to 0.
 */
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
					1;
#else
					2;
#endif
104
/*
 * Handler for the "norandmaps" boot parameter: turns randomize_va_space
 * off.  The parameter string @s is not looked at.  Always returns 1.
 */
static int __init disable_randmaps(char *s)
{
	randomize_va_space = 0;
	return 1;
}
__setup("norandmaps", disable_randmaps);
111
/* PFN of ZERO_PAGE(0); assigned once by init_zero_pfn(). */
unsigned long zero_pfn __read_mostly;
/*
 * Largest PFN that vm_normal_page() accepts before reporting a bad pte.
 * It is not assigned anywhere in this part of the file.
 */
unsigned long highest_memmap_pfn __read_mostly;
114
115
116
117
/*
 * Cache the PFN of ZERO_PAGE(0) in zero_pfn.  Registered with
 * core_initcall(); always returns 0.
 */
static int __init init_zero_pfn(void)
{
	zero_pfn = page_to_pfn(ZERO_PAGE(0));
	return 0;
}
core_initcall(init_zero_pfn);
124
125
126#if defined(SPLIT_RSS_COUNTING)
127
128static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm)
129{
130 int i;
131
132 for (i = 0; i < NR_MM_COUNTERS; i++) {
133 if (task->rss_stat.count[i]) {
134 add_mm_counter(mm, i, task->rss_stat.count[i]);
135 task->rss_stat.count[i] = 0;
136 }
137 }
138 task->rss_stat.events = 0;
139}
140
141static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
142{
143 struct task_struct *task = current;
144
145 if (likely(task->mm == mm))
146 task->rss_stat.count[member] += val;
147 else
148 add_mm_counter(mm, member, val);
149}
150#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
151#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
152
153
154#define TASK_RSS_EVENTS_THRESH (64)
155static void check_sync_rss_stat(struct task_struct *task)
156{
157 if (unlikely(task != current))
158 return;
159 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
160 __sync_task_rss_stat(task, task->mm);
161}
162
163unsigned long get_mm_counter(struct mm_struct *mm, int member)
164{
165 long val = 0;
166
167
168
169
170
171 val = atomic_long_read(&mm->rss_stat.count[member]);
172
173
174
175
176 if (val < 0)
177 return 0;
178 return (unsigned long)val;
179}
180
/*
 * Public entry point that flushes @task's cached RSS deltas into @mm;
 * a thin wrapper around __sync_task_rss_stat().
 */
void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
{
	__sync_task_rss_stat(task, mm);
}
185#else
186
/*
 * Without SPLIT_RSS_COUNTING there is no task-local cache: the "fast"
 * helpers map straight onto inc_mm_counter()/dec_mm_counter() and the
 * periodic sync check has nothing to do.
 */
#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)

static void check_sync_rss_stat(struct task_struct *task)
{
}
193
194#endif
195
196#ifdef HAVE_GENERIC_MMU_GATHER
197
198static int tlb_next_batch(struct mmu_gather *tlb)
199{
200 struct mmu_gather_batch *batch;
201
202 batch = tlb->active;
203 if (batch->next) {
204 tlb->active = batch->next;
205 return 1;
206 }
207
208 batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
209 if (!batch)
210 return 0;
211
212 batch->next = NULL;
213 batch->nr = 0;
214 batch->max = MAX_GATHER_BATCH;
215
216 tlb->active->next = batch;
217 tlb->active = batch;
218
219 return 1;
220}
221
222
223
224
225
226
227void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
228{
229 tlb->mm = mm;
230
231 tlb->fullmm = fullmm;
232 tlb->need_flush = 0;
233 tlb->fast_mode = (num_possible_cpus() == 1);
234 tlb->local.next = NULL;
235 tlb->local.nr = 0;
236 tlb->local.max = ARRAY_SIZE(tlb->__pages);
237 tlb->active = &tlb->local;
238
239#ifdef CONFIG_HAVE_RCU_TABLE_FREE
240 tlb->batch = NULL;
241#endif
242}
243
244void tlb_flush_mmu(struct mmu_gather *tlb)
245{
246 struct mmu_gather_batch *batch;
247
248 if (!tlb->need_flush)
249 return;
250 tlb->need_flush = 0;
251 tlb_flush(tlb);
252#ifdef CONFIG_HAVE_RCU_TABLE_FREE
253 tlb_table_flush(tlb);
254#endif
255
256 if (tlb_fast_mode(tlb))
257 return;
258
259 for (batch = &tlb->local; batch; batch = batch->next) {
260 free_pages_and_swap_cache(batch->pages, batch->nr);
261 batch->nr = 0;
262 }
263 tlb->active = &tlb->local;
264}
265
266
267
268
269
270void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
271{
272 struct mmu_gather_batch *batch, *next;
273
274 tlb_flush_mmu(tlb);
275
276
277 check_pgt_cache();
278
279 for (batch = tlb->local.next; batch; batch = next) {
280 next = batch->next;
281 free_pages((unsigned long)batch, 0);
282 }
283 tlb->local.next = NULL;
284}
285
286
287
288
289
290
291
292int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
293{
294 struct mmu_gather_batch *batch;
295
296 tlb->need_flush = 1;
297
298 if (tlb_fast_mode(tlb)) {
299 free_page_and_swap_cache(page);
300 return 1;
301 }
302
303 batch = tlb->active;
304 batch->pages[batch->nr++] = page;
305 if (batch->nr == batch->max) {
306 if (!tlb_next_batch(tlb))
307 return 0;
308 batch = tlb->active;
309 }
310 VM_BUG_ON(batch->nr > batch->max);
311
312 return batch->max - batch->nr;
313}
314
315#endif
316
317#ifdef CONFIG_HAVE_RCU_TABLE_FREE
318
319
320
321
322
/*
 * Deliberately empty cross-CPU callback.  tlb_remove_table_one() passes it
 * to smp_call_function(); only the fact that the other CPUs ran it matters.
 */
static void tlb_remove_table_smp_sync(void *arg)
{
	/* nothing to do */
}
327
/*
 * Free a single page-table page without batching.  Used by
 * tlb_remove_table() when no batch page could be allocated.
 */
static void tlb_remove_table_one(void *table)
{
	/*
	 * Run an empty function on the other CPUs (last argument 1 asks
	 * smp_call_function() to wait) before the table is released.
	 * NOTE(review): presumably this synchronises against lockless
	 * page-table walkers running with interrupts disabled -- confirm.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}
340
341static void tlb_remove_table_rcu(struct rcu_head *head)
342{
343 struct mmu_table_batch *batch;
344 int i;
345
346 batch = container_of(head, struct mmu_table_batch, rcu);
347
348 for (i = 0; i < batch->nr; i++)
349 __tlb_remove_table(batch->tables[i]);
350
351 free_page((unsigned long)batch);
352}
353
354void tlb_table_flush(struct mmu_gather *tlb)
355{
356 struct mmu_table_batch **batch = &tlb->batch;
357
358 if (*batch) {
359 call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
360 *batch = NULL;
361 }
362}
363
364void tlb_remove_table(struct mmu_gather *tlb, void *table)
365{
366 struct mmu_table_batch **batch = &tlb->batch;
367
368 tlb->need_flush = 1;
369
370
371
372
373
374 if (atomic_read(&tlb->mm->mm_users) < 2) {
375 __tlb_remove_table(table);
376 return;
377 }
378
379 if (*batch == NULL) {
380 *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
381 if (*batch == NULL) {
382 tlb_remove_table_one(table);
383 return;
384 }
385 (*batch)->nr = 0;
386 }
387 (*batch)->tables[(*batch)->nr++] = table;
388 if ((*batch)->nr == MAX_TABLE_BATCH)
389 tlb_table_flush(tlb);
390}
391
392#endif
393
394
395
396
397
398
399
/*
 * Report the pgd entry at @pgd through pgd_ERROR() and then clear it.
 * NOTE(review): presumably reached from pgd_none_or_clear_bad() when the
 * entry is neither empty nor valid -- that helper is defined elsewhere.
 */
void pgd_clear_bad(pgd_t *pgd)
{
	pgd_ERROR(*pgd);
	pgd_clear(pgd);
}
405
/* Report the pud entry at @pud through pud_ERROR() and then clear it. */
void pud_clear_bad(pud_t *pud)
{
	pud_ERROR(*pud);
	pud_clear(pud);
}
411
/* Report the pmd entry at @pmd through pmd_ERROR() and then clear it. */
void pmd_clear_bad(pmd_t *pmd)
{
	pmd_ERROR(*pmd);
	pmd_clear(pmd);
}
417
418
419
420
421
422static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
423 unsigned long addr)
424{
425 pgtable_t token = pmd_pgtable(*pmd);
426 pmd_clear(pmd);
427 pte_free_tlb(tlb, token, addr);
428 tlb->mm->nr_ptes--;
429}
430
431static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
432 unsigned long addr, unsigned long end,
433 unsigned long floor, unsigned long ceiling)
434{
435 pmd_t *pmd;
436 unsigned long next;
437 unsigned long start;
438
439 start = addr;
440 pmd = pmd_offset(pud, addr);
441 do {
442 next = pmd_addr_end(addr, end);
443 if (pmd_none_or_clear_bad(pmd))
444 continue;
445 free_pte_range(tlb, pmd, addr);
446 } while (pmd++, addr = next, addr != end);
447
448 start &= PUD_MASK;
449 if (start < floor)
450 return;
451 if (ceiling) {
452 ceiling &= PUD_MASK;
453 if (!ceiling)
454 return;
455 }
456 if (end - 1 > ceiling - 1)
457 return;
458
459 pmd = pmd_offset(pud, start);
460 pud_clear(pud);
461 pmd_free_tlb(tlb, pmd, start);
462}
463
464static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
465 unsigned long addr, unsigned long end,
466 unsigned long floor, unsigned long ceiling)
467{
468 pud_t *pud;
469 unsigned long next;
470 unsigned long start;
471
472 start = addr;
473 pud = pud_offset(pgd, addr);
474 do {
475 next = pud_addr_end(addr, end);
476 if (pud_none_or_clear_bad(pud))
477 continue;
478 free_pmd_range(tlb, pud, addr, next, floor, ceiling);
479 } while (pud++, addr = next, addr != end);
480
481 start &= PGDIR_MASK;
482 if (start < floor)
483 return;
484 if (ceiling) {
485 ceiling &= PGDIR_MASK;
486 if (!ceiling)
487 return;
488 }
489 if (end - 1 > ceiling - 1)
490 return;
491
492 pud = pud_offset(pgd, start);
493 pgd_clear(pgd);
494 pud_free_tlb(tlb, pud, start);
495}
496
497
498
499
500
501
/*
 * Free the page tables that map [@addr, @end) in tlb->mm.
 *
 * @floor and @ceiling bound the region whose page tables may be released:
 * a table page is only freed when the aligned region it covers lies
 * entirely within [@floor, @ceiling).  @ceiling == 0 means there is no
 * upper limit.
 *
 * The range is first trimmed to PMD granularity so that the lower levels
 * never have to consider freeing a pte page that is only partly covered.
 */
void free_pgd_range(struct mmu_gather *tlb,
			unsigned long addr, unsigned long end,
			unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * Round addr down to a PMD boundary.  If that lands below floor,
	 * the first pte page is shared with something below the range, so
	 * skip it; a wrap to 0 means nothing is left to free.
	 */
	addr &= PMD_MASK;
	if (addr < floor) {
		addr += PMD_SIZE;
		if (!addr)
			return;
	}
	/*
	 * Round a non-zero ceiling down to a PMD boundary.  If it becomes 0
	 * the ceiling was inside the very first PMD and nothing can be freed.
	 */
	if (ceiling) {
		ceiling &= PMD_MASK;
		if (!ceiling)
			return;
	}
	/*
	 * "x - 1" comparisons treat 0 as the top of the address space via
	 * unsigned wrap-around.  If end lies beyond ceiling, back off by one
	 * PMD so the last, shared pte page is left alone.
	 */
	if (end - 1 > ceiling - 1)
		end -= PMD_SIZE;
	/* nothing left after trimming */
	if (addr > end - 1)
		return;

	pgd = pgd_offset(tlb->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		free_pud_range(tlb, pgd, addr, next, floor, ceiling);
	} while (pgd++, addr = next, addr != end);
}
559
560void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
561 unsigned long floor, unsigned long ceiling)
562{
563 while (vma) {
564 struct vm_area_struct *next = vma->vm_next;
565 unsigned long addr = vma->vm_start;
566
567
568
569
570
571 unlink_anon_vmas(vma);
572 unlink_file_vma(vma);
573
574 if (is_vm_hugetlb_page(vma)) {
575 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
576 floor, next? next->vm_start: ceiling);
577 } else {
578
579
580
581 while (next && next->vm_start <= vma->vm_end + PMD_SIZE
582 && !is_vm_hugetlb_page(next)) {
583 vma = next;
584 next = vma->vm_next;
585 unlink_anon_vmas(vma);
586 unlink_file_vma(vma);
587 }
588 free_pgd_range(tlb, addr, vma->vm_end,
589 floor, next? next->vm_start: ceiling);
590 }
591 vma = next;
592 }
593}
594
/*
 * Allocate a pte page for @address and install it in @pmd unless the pmd
 * has been populated in the meantime.
 *
 * Returns -ENOMEM if the pte page cannot be allocated, 0 otherwise (also
 * when another path populated the pmd first; the new page is then freed).
 */
int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
		pmd_t *pmd, unsigned long address)
{
	pgtable_t new = pte_alloc_one(mm, address);
	int wait_split_huge_page;
	if (!new)
		return -ENOMEM;

	/*
	 * Write barrier between the allocation of the pte page and the
	 * pmd_populate() below.
	 * NOTE(review): presumably this makes the page's initialisation
	 * visible before the page itself becomes reachable through the
	 * pmd -- confirm against the memory-barrier documentation.
	 */
	smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */

	spin_lock(&mm->page_table_lock);
	wait_split_huge_page = 0;
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		mm->nr_ptes++;
		pmd_populate(mm, pmd, new);
		/* ownership moved into the page table; do not free below */
		new = NULL;
	} else if (unlikely(pmd_trans_splitting(*pmd)))
		wait_split_huge_page = 1;
	spin_unlock(&mm->page_table_lock);
	if (new)
		pte_free(mm, new);
	/* the wait happens only after the page_table_lock has been dropped */
	if (wait_split_huge_page)
		wait_split_huge_page(vma->anon_vma, pmd);
	return 0;
}
633
634int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
635{
636 pte_t *new = pte_alloc_one_kernel(&init_mm, address);
637 if (!new)
638 return -ENOMEM;
639
640 smp_wmb();
641
642 spin_lock(&init_mm.page_table_lock);
643 if (likely(pmd_none(*pmd))) {
644 pmd_populate_kernel(&init_mm, pmd, new);
645 new = NULL;
646 } else
647 VM_BUG_ON(pmd_trans_splitting(*pmd));
648 spin_unlock(&init_mm.page_table_lock);
649 if (new)
650 pte_free_kernel(&init_mm, new);
651 return 0;
652}
653
654static inline void init_rss_vec(int *rss)
655{
656 memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
657}
658
659static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
660{
661 int i;
662
663 if (current->mm == mm)
664 sync_mm_rss(current, mm);
665 for (i = 0; i < NR_MM_COUNTERS; i++)
666 if (rss[i])
667 add_mm_counter(mm, i, rss[i]);
668}
669
670
671
672
673
674
675
676
/*
 * Report an inconsistent page-table entry: log the pte/pmd values, the
 * page (if one is given), details of the vma and a stack trace, then
 * taint the kernel with TAINT_BAD_PAGE.
 *
 * Output is rate limited: after 60 reports further ones are counted but
 * not printed until 60 seconds have passed since the first report of the
 * current burst.
 */
static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
			  pte_t pte, struct page *page)
{
	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
	pud_t *pud = pud_offset(pgd, addr);
	pmd_t *pmd = pmd_offset(pud, addr);
	struct address_space *mapping;
	pgoff_t index;
	/* rate-limit state shared by all callers (no locking is done here) */
	static unsigned long resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;

	/*
	 * Burst limit reached: stay silent until the resume time, then
	 * report how many messages were dropped and start a new burst.
	 */
	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;
			return;
		}
		if (nr_unshown) {
			printk(KERN_ALERT
				"BUG: Bad page map: %lu messages suppressed\n",
				nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	/* first message of a burst fixes the time at which output may resume */
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;

	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
	index = linear_page_index(vma, addr);

	printk(KERN_ALERT
		"BUG: Bad page map in process %s  pte:%08llx pmd:%08llx\n",
		current->comm,
		(long long)pte_val(pte), (long long)pmd_val(*pmd));
	if (page)
		dump_page(page);
	printk(KERN_ALERT
		"addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
		(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
	/*
	 * Name the fault and mmap handlers to help identify the driver or
	 * filesystem behind the mapping.
	 */
	if (vma->vm_ops)
		print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n",
				(unsigned long)vma->vm_ops->fault);
	if (vma->vm_file && vma->vm_file->f_op)
		print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n",
				(unsigned long)vma->vm_file->f_op->mmap);
	dump_stack();
	add_taint(TAINT_BAD_PAGE);
}
733
734static inline int is_cow_mapping(vm_flags_t flags)
735{
736 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
737}
738
/*
 * Default test for "is this the zero page's PFN"; only compiled when no
 * is_zero_pfn macro has been defined beforehand.
 */
#ifndef is_zero_pfn
static inline int is_zero_pfn(unsigned long pfn)
{
	return pfn == zero_pfn;
}
#endif
745
/*
 * Default zero-page PFN lookup; only compiled when no my_zero_pfn macro
 * has been defined beforehand.  @addr is ignored: there is a single
 * zero_pfn.
 */
#ifndef my_zero_pfn
static inline unsigned long my_zero_pfn(unsigned long addr)
{
	return zero_pfn;
}
#endif
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
/*
 * HAVE_PTE_SPECIAL is a compile-time 0/1 so that vm_normal_page() can
 * select its strategy with an ordinary if() instead of #ifdef blocks.
 */
#ifdef __HAVE_ARCH_PTE_SPECIAL
# define HAVE_PTE_SPECIAL 1
#else
# define HAVE_PTE_SPECIAL 0
#endif
/*
 * vm_normal_page - return the struct page for @pte, or NULL if the pte
 * does not map a "normal" page.
 *
 * NULL is returned for the zero page, for special ptes, and for entries
 * of VM_PFNMAP / VM_MIXEDMAP mappings that have no usable struct page.
 * A pte that looks corrupt (unexpected special bit, or a PFN above
 * highest_memmap_pfn) is reported through print_bad_pte() and also
 * yields NULL.
 */
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
				pte_t pte)
{
	unsigned long pfn = pte_pfn(pte);

	if (HAVE_PTE_SPECIAL) {
		/* the special bit alone decides; non-special only needs a range check */
		if (likely(!pte_special(pte)))
			goto check_pfn;
		if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
			return NULL;
		/* special pte outside a pfn mapping: only the zero page is legitimate */
		if (!is_zero_pfn(pfn))
			print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/* !HAVE_PTE_SPECIAL case follows: */

	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
		if (vma->vm_flags & VM_MIXEDMAP) {
			/* mixed map: normal exactly when the PFN has a struct page */
			if (!pfn_valid(pfn))
				return NULL;
			goto out;
		} else {
			unsigned long off;
			off = (addr - vma->vm_start) >> PAGE_SHIFT;
			/* PFN matches the linear vm_pgoff layout: raw pfn mapping */
			if (pfn == vma->vm_pgoff + off)
				return NULL;
			/* off-layout PFNs can only be normal pages in a COW mapping */
			if (!is_cow_mapping(vma->vm_flags))
				return NULL;
		}
	}

	if (is_zero_pfn(pfn))
		return NULL;
check_pfn:
	if (unlikely(pfn > highest_memmap_pfn)) {
		print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/*
	 * Reached for every pte that passed the checks above; the
	 * VM_MIXEDMAP path jumps straight to "out" and skips the range check.
	 */
out:
	return pfn_to_page(pfn);
}
847
848
849
850
851
852
853
/*
 * Copy one pte from @src_mm to @dst_mm at @addr, updating the local RSS
 * delta vector @rss.
 *
 * Returns 0 on success.  If the pte is a swap entry and swap_duplicate()
 * fails, nothing is copied and the raw swap entry value is returned so
 * that the caller can deal with it and retry.
 */
static inline unsigned long
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
		unsigned long addr, int *rss)
{
	unsigned long vm_flags = vma->vm_flags;
	pte_t pte = *src_pte;
	struct page *page;

	/* pte contains position in swap or file, so copy. */
	if (unlikely(!pte_present(pte))) {
		if (!pte_file(pte)) {
			swp_entry_t entry = pte_to_swp_entry(pte);

			if (swap_duplicate(entry) < 0)
				return entry.val;

			/*
			 * Put dst_mm on the mmlist next to src_mm if it is
			 * not on it yet; the emptiness test is repeated
			 * under mmlist_lock.
			 */
			if (unlikely(list_empty(&dst_mm->mmlist))) {
				spin_lock(&mmlist_lock);
				if (list_empty(&dst_mm->mmlist))
					list_add(&dst_mm->mmlist,
						 &src_mm->mmlist);
				spin_unlock(&mmlist_lock);
			}
			if (likely(!non_swap_entry(entry)))
				rss[MM_SWAPENTS]++;
			else if (is_write_migration_entry(entry) &&
					is_cow_mapping(vm_flags)) {
				/*
				 * COW mapping with a writable migration
				 * entry: downgrade it to a read entry in
				 * the source as well as in the copy.
				 */
				make_migration_entry_read(&entry);
				pte = swp_entry_to_pte(entry);
				set_pte_at(src_mm, addr, src_pte, pte);
			}
		}
		goto out_set_pte;
	}

	/*
	 * COW mapping: write-protect the entry in the source and in the
	 * value that will be installed in the destination.
	 */
	if (is_cow_mapping(vm_flags)) {
		ptep_set_wrprotect(src_mm, addr, src_pte);
		pte = pte_wrprotect(pte);
	}

	/*
	 * The copy starts out clean for shared mappings, and always
	 * starts out old.
	 */
	if (vm_flags & VM_SHARED)
		pte = pte_mkclean(pte);
	pte = pte_mkold(pte);

	/* normal pages gain a reference, an rmap duplicate and an RSS count */
	page = vm_normal_page(vma, addr, pte);
	if (page) {
		get_page(page);
		page_dup_rmap(page);
		if (PageAnon(page))
			rss[MM_ANONPAGES]++;
		else
			rss[MM_FILEPAGES]++;
	}

out_set_pte:
	set_pte_at(dst_mm, addr, dst_pte, pte);
	return 0;
}
926
/*
 * Copy the ptes covering [@addr, @end) under @src_pmd into @dst_pmd.
 *
 * Both pte locks are held while entries are copied.  The locks are
 * dropped and the loop restarted when enough work has been done and a
 * reschedule or lock break is wanted, or when copy_one_pte() needs a swap
 * count continuation to be allocated.
 *
 * Returns 0 on success or -ENOMEM.
 */
int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		   pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
		   unsigned long addr, unsigned long end)
{
	pte_t *orig_src_pte, *orig_dst_pte;
	pte_t *src_pte, *dst_pte;
	spinlock_t *src_ptl, *dst_ptl;
	int progress = 0;
	int rss[NR_MM_COUNTERS];
	swp_entry_t entry = (swp_entry_t){0};

again:
	init_rss_vec(rss);

	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
	if (!dst_pte)
		return -ENOMEM;
	src_pte = pte_offset_map(src_pmd, addr);
	src_ptl = pte_lockptr(src_mm, src_pmd);
	/* second pte lock taken while dst_ptl is held, hence the nesting class */
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
	/* remember the mapped bases; the cursors below advance */
	orig_src_pte = src_pte;
	orig_dst_pte = dst_pte;
	arch_enter_lazy_mmu_mode();

	do {
		/*
		 * Empty ptes count 1 and copied ptes 8 towards "progress";
		 * once 32 is reached, check whether the locks should be
		 * released for a while.
		 */
		if (progress >= 32) {
			progress = 0;
			if (need_resched() ||
			    spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
				break;
		}
		if (pte_none(*src_pte)) {
			progress++;
			continue;
		}
		entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
							vma, addr, rss);
		/* non-zero: swap_duplicate() failed for this entry */
		if (entry.val)
			break;
		progress += 8;
	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);

	arch_leave_lazy_mmu_mode();
	spin_unlock(src_ptl);
	pte_unmap(orig_src_pte);
	add_mm_rss_vec(dst_mm, rss);
	pte_unmap_unlock(orig_dst_pte, dst_ptl);
	cond_resched();

	if (entry.val) {
		/* allocate the continuation with the locks dropped, then retry */
		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
			return -ENOMEM;
		progress = 0;
	}
	if (addr != end)
		goto again;
	return 0;
}
989
990static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
991 pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
992 unsigned long addr, unsigned long end)
993{
994 pmd_t *src_pmd, *dst_pmd;
995 unsigned long next;
996
997 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
998 if (!dst_pmd)
999 return -ENOMEM;
1000 src_pmd = pmd_offset(src_pud, addr);
1001 do {
1002 next = pmd_addr_end(addr, end);
1003 if (pmd_trans_huge(*src_pmd)) {
1004 int err;
1005 VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
1006 err = copy_huge_pmd(dst_mm, src_mm,
1007 dst_pmd, src_pmd, addr, vma);
1008 if (err == -ENOMEM)
1009 return -ENOMEM;
1010 if (!err)
1011 continue;
1012
1013 }
1014 if (pmd_none_or_clear_bad(src_pmd))
1015 continue;
1016 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
1017 vma, addr, next))
1018 return -ENOMEM;
1019 } while (dst_pmd++, src_pmd++, addr = next, addr != end);
1020 return 0;
1021}
1022
1023static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1024 pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
1025 unsigned long addr, unsigned long end)
1026{
1027 pud_t *src_pud, *dst_pud;
1028 unsigned long next;
1029
1030 dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
1031 if (!dst_pud)
1032 return -ENOMEM;
1033 src_pud = pud_offset(src_pgd, addr);
1034 do {
1035 next = pud_addr_end(addr, end);
1036 if (pud_none_or_clear_bad(src_pud))
1037 continue;
1038 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
1039 vma, addr, next))
1040 return -ENOMEM;
1041 } while (dst_pud++, src_pud++, addr = next, addr != end);
1042 return 0;
1043}
1044
/*
 * Copy the page-table entries that map @vma from @src_mm into @dst_mm.
 *
 * Returns 0 on success, -ENOMEM if a page table could not be allocated,
 * or the error returned by track_pfn_vma_copy() or
 * copy_hugetlb_page_range().
 */
int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		struct vm_area_struct *vma)
{
	pgd_t *src_pgd, *dst_pgd;
	unsigned long next;
	unsigned long addr = vma->vm_start;
	unsigned long end = vma->vm_end;
	int ret;

	/*
	 * Nothing is copied for a vma that has none of the special flags
	 * below and no anon_vma.
	 * NOTE(review): presumably such a mapping can be repopulated by
	 * page faults in the destination -- confirm.
	 */
	if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
		if (!vma->anon_vma)
			return 0;
	}

	if (is_vm_hugetlb_page(vma))
		return copy_hugetlb_page_range(dst_mm, src_mm, vma);

	if (unlikely(is_pfn_mapping(vma))) {
		/*
		 * Let the pfn tracking code know about the copy before any
		 * pte is duplicated; bail out if it refuses.
		 */
		ret = track_pfn_vma_copy(vma);
		if (ret)
			return ret;
	}

	/*
	 * Only COW mappings get the mmu notifier calls: those are the
	 * mappings whose source ptes copy_one_pte() write-protects.
	 */
	if (is_cow_mapping(vma->vm_flags))
		mmu_notifier_invalidate_range_start(src_mm, addr, end);

	ret = 0;
	dst_pgd = pgd_offset(dst_mm, addr);
	src_pgd = pgd_offset(src_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(src_pgd))
			continue;
		if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
					    vma, addr, next))) {
			ret = -ENOMEM;
			break;
		}
	} while (dst_pgd++, src_pgd++, addr = next, addr != end);

	/* addr has advanced by now, so the range start is re-read from the vma */
	if (is_cow_mapping(vma->vm_flags))
		mmu_notifier_invalidate_range_end(src_mm,
						  vma->vm_start, end);
	return ret;
}
1106
/*
 * Tear down the ptes covering [@addr, @end) under @pmd.
 *
 * Present pages are unmapped, accounted and queued on @tlb for freeing;
 * swap and file entries are released as well unless @details is set.
 * With @details, only pages matching details->check_mapping and/or the
 * details->first_index..last_index window are zapped.
 *
 * Returns the address at which processing stopped (equal to @end unless
 * the loop had to stop early and could not continue).
 */
static unsigned long zap_pte_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	struct mm_struct *mm = tlb->mm;
	int force_flush = 0;
	int rss[NR_MM_COUNTERS];
	spinlock_t *ptl;
	pte_t *start_pte;
	pte_t *pte;

again:
	init_rss_vec(rss);
	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	pte = start_pte;
	arch_enter_lazy_mmu_mode();
	do {
		pte_t ptent = *pte;
		if (pte_none(ptent)) {
			continue;
		}

		if (pte_present(ptent)) {
			struct page *page;

			page = vm_normal_page(vma, addr, ptent);
			if (unlikely(details) && page) {
				/*
				 * A mapping filter is set: leave pages that
				 * belong to a different mapping untouched.
				 */
				if (details->check_mapping &&
				    details->check_mapping != page->mapping)
					continue;
				/*
				 * Nonlinear vma: only pages whose index is
				 * inside the requested window are zapped.
				 */
				if (details->nonlinear_vma &&
				    (page->index < details->first_index ||
				     page->index > details->last_index))
					continue;
			}
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			tlb_remove_tlb_entry(tlb, pte, addr);
			/* no struct page to account for or free */
			if (unlikely(!page))
				continue;
			/*
			 * Nonlinear vma and the page sits at a non-linear
			 * position: leave a file pte behind that records
			 * the page's index.
			 */
			if (unlikely(details) && details->nonlinear_vma
			    && linear_page_index(details->nonlinear_vma,
						addr) != page->index)
				set_pte_at(mm, addr, pte,
					   pgoff_to_pte(page->index));
			if (PageAnon(page))
				rss[MM_ANONPAGES]--;
			else {
				/* carry the pte's dirty/young state over to the page */
				if (pte_dirty(ptent))
					set_page_dirty(page);
				if (pte_young(ptent) &&
				    likely(!VM_SequentialReadHint(vma)))
					mark_page_accessed(page);
				rss[MM_FILEPAGES]--;
			}
			page_remove_rmap(page);
			if (unlikely(page_mapcount(page) < 0))
				print_bad_pte(vma, addr, ptent, page);
			/* 0 from __tlb_remove_page() means: flush before queueing more */
			force_flush = !__tlb_remove_page(tlb, page);
			if (force_flush)
				break;
			continue;
		}
		/*
		 * Non-present entries (swap or file ptes) are left in place
		 * whenever details are set.
		 */
		if (unlikely(details))
			continue;
		if (pte_file(ptent)) {
			/* file ptes are only expected in nonlinear vmas */
			if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
				print_bad_pte(vma, addr, ptent, NULL);
		} else {
			swp_entry_t entry = pte_to_swp_entry(ptent);

			if (!non_swap_entry(entry))
				rss[MM_SWAPENTS]--;
			if (unlikely(!free_swap_and_cache(entry)))
				print_bad_pte(vma, addr, ptent, NULL);
		}
		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
	} while (pte++, addr += PAGE_SIZE, addr != end);

	add_mm_rss_vec(mm, rss);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(start_pte, ptl);

	/*
	 * The gather ran out of room: flush (which also frees the queued
	 * pages) with the pte lock dropped.  Because the loop was left via
	 * "break", addr still names the pte that was just zapped, so that
	 * pte is visited again (and found empty) after the restart.
	 */
	if (force_flush) {
		force_flush = 0;
		tlb_flush_mmu(tlb);
		if (addr != end)
			goto again;
	}

	return addr;
}
1218
/*
 * Zap the pmd-level entries for [@addr, @end) under @pud.
 *
 * A transparent huge pmd is split when only part of it is covered by the
 * range and handed to zap_huge_pmd() when it is covered completely.
 * Returns the address at which processing stopped.
 */
static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pud_t *pud,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_trans_huge(*pmd)) {
			if (next-addr != HPAGE_PMD_SIZE) {
				/* partial coverage: split, then zap the ptes below */
				VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
				split_huge_page_pmd(vma->vm_mm, pmd);
			} else if (zap_huge_pmd(tlb, vma, pmd))
				continue;
			/* fall through */
		}
		if (pmd_none_or_clear_bad(pmd))
			continue;
		/* continue from wherever zap_pte_range() stopped */
		next = zap_pte_range(tlb, vma, pmd, addr, next, details);
		cond_resched();
	} while (pmd++, addr = next, addr != end);

	return addr;
}
1246
1247static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1248 struct vm_area_struct *vma, pgd_t *pgd,
1249 unsigned long addr, unsigned long end,
1250 struct zap_details *details)
1251{
1252 pud_t *pud;
1253 unsigned long next;
1254
1255 pud = pud_offset(pgd, addr);
1256 do {
1257 next = pud_addr_end(addr, end);
1258 if (pud_none_or_clear_bad(pud))
1259 continue;
1260 next = zap_pmd_range(tlb, vma, pud, addr, next, details);
1261 } while (pud++, addr = next, addr != end);
1262
1263 return addr;
1264}
1265
1266static unsigned long unmap_page_range(struct mmu_gather *tlb,
1267 struct vm_area_struct *vma,
1268 unsigned long addr, unsigned long end,
1269 struct zap_details *details)
1270{
1271 pgd_t *pgd;
1272 unsigned long next;
1273
1274 if (details && !details->check_mapping && !details->nonlinear_vma)
1275 details = NULL;
1276
1277 BUG_ON(addr >= end);
1278 mem_cgroup_uncharge_start();
1279 tlb_start_vma(tlb, vma);
1280 pgd = pgd_offset(vma->vm_mm, addr);
1281 do {
1282 next = pgd_addr_end(addr, end);
1283 if (pgd_none_or_clear_bad(pgd))
1284 continue;
1285 next = zap_pud_range(tlb, vma, pgd, addr, next, details);
1286 } while (pgd++, addr = next, addr != end);
1287 tlb_end_vma(tlb, vma);
1288 mem_cgroup_uncharge_end();
1289
1290 return addr;
1291}
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
/*
 * unmap_vmas - unmap [@start_addr, @end_addr) across a list of vmas
 * @tlb:          mmu_gather collecting the pages to free
 * @vma:          first vma to process; must not be NULL (it is
 *                dereferenced before the loop)
 * @start_addr:   start of the range
 * @end_addr:     end of the range
 * @nr_accounted: incremented by the number of pages unmapped from vmas
 *                that have VM_ACCOUNT set
 * @details:      optional filter passed down to the pte level
 *
 * Returns the address at which unmapping stopped.  The whole operation is
 * bracketed by mmu notifier invalidate_range_start/end calls.
 */
unsigned long unmap_vmas(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long start_addr,
		unsigned long end_addr, unsigned long *nr_accounted,
		struct zap_details *details)
{
	unsigned long start = start_addr;
	struct mm_struct *mm = vma->vm_mm;

	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
		unsigned long end;

		/* clamp the requested range to this vma; skip it if empty */
		start = max(vma->vm_start, start_addr);
		if (start >= vma->vm_end)
			continue;
		end = min(vma->vm_end, end_addr);
		if (end <= vma->vm_start)
			continue;

		if (vma->vm_flags & VM_ACCOUNT)
			*nr_accounted += (end - start) >> PAGE_SHIFT;

		if (unlikely(is_pfn_mapping(vma)))
			untrack_pfn_vma(vma, 0, 0);

		while (start != end) {
			if (unlikely(is_vm_hugetlb_page(vma))) {
				/*
				 * A hugetlb vma is only unmapped here while
				 * it still has a vm_file; otherwise it is
				 * skipped.
				 * NOTE(review): presumably a cleared vm_file
				 * marks a vma whose teardown is already in
				 * progress elsewhere -- confirm.
				 */
				if (vma->vm_file)
					unmap_hugepage_range(vma, start, end, NULL);

				start = end;
			} else
				start = unmap_page_range(tlb, vma, start, end, details);
		}
	}

	mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
	return start;	/* which is now the end (or restart) address */
}
1365
1366
1367
1368
1369
1370
1371
1372
1373unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
1374 unsigned long size, struct zap_details *details)
1375{
1376 struct mm_struct *mm = vma->vm_mm;
1377 struct mmu_gather tlb;
1378 unsigned long end = address + size;
1379 unsigned long nr_accounted = 0;
1380
1381 lru_add_drain();
1382 tlb_gather_mmu(&tlb, mm, 0);
1383 update_hiwater_rss(mm);
1384 end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
1385 tlb_finish_mmu(&tlb, address, end);
1386 return end;
1387}
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1402 unsigned long size)
1403{
1404 if (address < vma->vm_start || address + size > vma->vm_end ||
1405 !(vma->vm_flags & VM_PFNMAP))
1406 return -1;
1407 zap_page_range(vma, address, size, NULL);
1408 return 0;
1409}
1410EXPORT_SYMBOL_GPL(zap_vma_ptes);
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
/*
 * follow_page - look up the page mapped at a user virtual address
 * @vma:     vma that covers @address
 * @address: virtual address to look up
 * @flags:   FOLL_* flags modifying the lookup
 *
 * Walks the page tables of vma->vm_mm.  Returns the mapped page, NULL if
 * nothing suitable is mapped (including a FOLL_WRITE lookup that hits a
 * non-writable pte), or ERR_PTR(-EFAULT) for entries that must not be
 * followed.  With FOLL_GET a reference is taken on the returned page.
 */
struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
			unsigned int flags)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep, pte;
	spinlock_t *ptl;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	/* hugetlb lookup by address first; FOLL_GET is not supported on it */
	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
	if (!IS_ERR(page)) {
		BUG_ON(flags & FOLL_GET);
		goto out;
	}

	page = NULL;
	pgd = pgd_offset(mm, address);
	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
		goto no_page_table;

	pud = pud_offset(pgd, address);
	if (pud_none(*pud))
		goto no_page_table;
	if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
		BUG_ON(flags & FOLL_GET);
		page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
		goto out;
	}
	if (unlikely(pud_bad(*pud)))
		goto no_page_table;

	pmd = pmd_offset(pud, address);
	if (pmd_none(*pmd))
		goto no_page_table;
	if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
		BUG_ON(flags & FOLL_GET);
		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
		goto out;
	}
	if (pmd_trans_huge(*pmd)) {
		if (flags & FOLL_SPLIT) {
			split_huge_page_pmd(mm, pmd);
			goto split_fallthrough;
		}
		/* re-check under page_table_lock: the pmd may have changed */
		spin_lock(&mm->page_table_lock);
		if (likely(pmd_trans_huge(*pmd))) {
			if (unlikely(pmd_trans_splitting(*pmd))) {
				/* a split is in flight: wait, then walk the ptes */
				spin_unlock(&mm->page_table_lock);
				wait_split_huge_page(vma->anon_vma, pmd);
			} else {
				page = follow_trans_huge_pmd(mm, address,
							     pmd, flags);
				spin_unlock(&mm->page_table_lock);
				goto out;
			}
		} else
			spin_unlock(&mm->page_table_lock);
		/* fall through */
	}
split_fallthrough:
	if (unlikely(pmd_bad(*pmd)))
		goto no_page_table;

	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);

	pte = *ptep;
	if (!pte_present(pte))
		goto no_page;
	/* write access wanted but the pte is read-only: return NULL */
	if ((flags & FOLL_WRITE) && !pte_write(pte))
		goto unlock;

	page = vm_normal_page(vma, address, pte);
	if (unlikely(!page)) {
		/*
		 * No normal page: only the zero page may be returned, and
		 * not even that for FOLL_DUMP.
		 */
		if ((flags & FOLL_DUMP) ||
		    !is_zero_pfn(pte_pfn(pte)))
			goto bad_page;
		page = pte_page(pte);
	}

	if (flags & FOLL_GET)
		get_page_foll(page);
	if (flags & FOLL_TOUCH) {
		if ((flags & FOLL_WRITE) &&
		    !pte_dirty(pte) && !PageDirty(page))
			set_page_dirty(page);
		/*
		 * Record the access on the struct page only; the pte's own
		 * young bit is left untouched here.
		 */
		mark_page_accessed(page);
	}
	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
		/*
		 * Mlock the page only if the page lock can be taken without
		 * sleeping (the pte lock is held); otherwise it is skipped.
		 */
		if (page->mapping && trylock_page(page)) {
			lru_add_drain();  /* push cached pages to LRU */
			/*
			 * page->mapping is tested again now that the page
			 * lock is held.
			 */
			if (page->mapping)
				mlock_vma_page(page);
			unlock_page(page);
		}
	}
unlock:
	pte_unmap_unlock(ptep, ptl);
out:
	return page;

bad_page:
	pte_unmap_unlock(ptep, ptl);
	return ERR_PTR(-EFAULT);

no_page:
	pte_unmap_unlock(ptep, ptl);
	/* non-present but non-empty pte: return NULL (page is still NULL) */
	if (!pte_none(pte))
		return page;

no_page_table:
	/*
	 * Nothing is mapped here.  A FOLL_DUMP lookup gets -EFAULT when
	 * the vma has no fault handler; in every other case NULL is
	 * returned (page is still NULL on this path).
	 */
	if ((flags & FOLL_DUMP) &&
	    (!vma->vm_ops || !vma->vm_ops->fault))
		return ERR_PTR(-EFAULT);
	return page;
}
1568
1569static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
1570{
1571 return stack_guard_page_start(vma, addr) ||
1572 stack_guard_page_end(vma, addr+PAGE_SIZE);
1573}
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
/*
 * __get_user_pages() - look up, faulting in where necessary, a run of pages
 * @tsk:	if non-NULL, its maj_flt/min_flt counters are incremented for
 *		every fault handled here
 * @mm:		address space that is walked
 * @start:	first virtual address to process
 * @nr_pages:	number of pages to process; a value <= 0 returns 0 at once
 * @gup_flags:	FOLL_* flags controlling lookup and faulting
 * @pages:	array receiving the page pointers, or NULL.  It must be
 *		non-NULL exactly when FOLL_GET is set (VM_BUG_ON below).
 * @vmas:	optional array receiving the vma used for each page
 * @nonblocking: optional.  When non-NULL, faults are issued with
 *		FAULT_FLAG_ALLOW_RETRY, and *nonblocking is set to 0 if the
 *		fault handler answers VM_FAULT_RETRY.
 *
 * Returns the number of pages processed so far.  When a failure happens
 * before any page was processed, a negative errno is returned instead
 * (-EFAULT, -ENOMEM, -EHWPOISON, -ERESTARTSYS or the error encoded by
 * follow_page()).  A VM_FAULT_RETRY from the fault handler returns the
 * current count, which may be 0.
 */
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
		     unsigned long start, int nr_pages, unsigned int gup_flags,
		     struct page **pages, struct vm_area_struct **vmas,
		     int *nonblocking)
{
	int i;
	unsigned long vm_flags;

	if (nr_pages <= 0)
		return 0;

	VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));

	/*
	 * Build the mask of vma permission bits that allow this access:
	 * start from the write or read pair, then keep only the MAY* bits
	 * when FOLL_FORCE is set and only the plain bits otherwise.
	 */
	vm_flags  = (gup_flags & FOLL_WRITE) ?
			(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
	vm_flags &= (gup_flags & FOLL_FORCE) ?
			(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
	i = 0;

	do {
		struct vm_area_struct *vma;

		vma = find_extend_vma(mm, start);
		if (!vma && in_gate_area(mm, start)) {
			/*
			 * No vma, but the address is in the gate area: walk
			 * the page tables by hand and use the gate vma.
			 */
			unsigned long pg = start & PAGE_MASK;
			pgd_t *pgd;
			pud_t *pud;
			pmd_t *pmd;
			pte_t *pte;

			/* writes to the gate area are refused */
			if (gup_flags & FOLL_WRITE)
				return i ? : -EFAULT;
			if (pg > TASK_SIZE)
				pgd = pgd_offset_k(pg);
			else
				pgd = pgd_offset_gate(mm, pg);
			BUG_ON(pgd_none(*pgd));
			pud = pud_offset(pgd, pg);
			BUG_ON(pud_none(*pud));
			pmd = pmd_offset(pud, pg);
			if (pmd_none(*pmd))
				return i ? : -EFAULT;
			VM_BUG_ON(pmd_trans_huge(*pmd));
			pte = pte_offset_map(pmd, pg);
			if (pte_none(*pte)) {
				pte_unmap(pte);
				return i ? : -EFAULT;
			}
			vma = get_gate_vma(mm);
			if (pages) {
				struct page *page;

				page = vm_normal_page(vma, start, *pte);
				if (!page) {
					/*
					 * No normal page: accept the zero
					 * pfn unless FOLL_DUMP is set,
					 * otherwise fail.
					 */
					if (!(gup_flags & FOLL_DUMP) &&
					     is_zero_pfn(pte_pfn(*pte)))
						page = pte_page(*pte);
					else {
						pte_unmap(pte);
						return i ? : -EFAULT;
					}
				}
				pages[i] = page;
				get_page(page);
			}
			pte_unmap(pte);
			/* jumps into the per-page loop below to do the bookkeeping */
			goto next_page;
		}

		/*
		 * Fail if there is no vma, if it is an IO or PFN mapping, or
		 * if it grants none of the permission bits computed above.
		 */
		if (!vma ||
		    (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
		    !(vm_flags & vma->vm_flags))
			return i ? : -EFAULT;

		if (is_vm_hugetlb_page(vma)) {
			/* hugetlb vmas are delegated; start, nr_pages and i are updated by the callee */
			i = follow_hugetlb_page(mm, vma, pages, vmas,
					&start, &nr_pages, i, gup_flags);
			continue;
		}

		do {
			struct page *page;
			unsigned int foll_flags = gup_flags;

			/* give up early if the current task has a fatal signal pending */
			if (unlikely(fatal_signal_pending(current)))
				return i ? i : -ERESTARTSYS;

			cond_resched();
			/* fault the page in until follow_page() returns something non-NULL */
			while (!(page = follow_page(vma, start, foll_flags))) {
				int ret;
				unsigned int fault_flags = 0;

				/* with FOLL_MLOCK, guard pages are skipped rather than faulted in */
				if (foll_flags & FOLL_MLOCK) {
					if (stack_guard_page(vma, start))
						goto next_page;
				}
				if (foll_flags & FOLL_WRITE)
					fault_flags |= FAULT_FLAG_WRITE;
				if (nonblocking)
					fault_flags |= FAULT_FLAG_ALLOW_RETRY;
				if (foll_flags & FOLL_NOWAIT)
					fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT);

				ret = handle_mm_fault(mm, vma, start,
							fault_flags);

				/* translate VM_FAULT_* errors to errnos; an unknown error bit is a BUG */
				if (ret & VM_FAULT_ERROR) {
					if (ret & VM_FAULT_OOM)
						return i ? i : -ENOMEM;
					if (ret & (VM_FAULT_HWPOISON |
						   VM_FAULT_HWPOISON_LARGE)) {
						if (i)
							return i;
						else if (gup_flags & FOLL_HWPOISON)
							return -EHWPOISON;
						else
							return -EFAULT;
					}
					if (ret & VM_FAULT_SIGBUS)
						return i ? i : -EFAULT;
					BUG();
				}

				/* fault accounting on behalf of @tsk */
				if (tsk) {
					if (ret & VM_FAULT_MAJOR)
						tsk->maj_flt++;
					else
						tsk->min_flt++;
				}

				if (ret & VM_FAULT_RETRY) {
					if (nonblocking)
						*nonblocking = 0;
					return i;
				}

				/*
				 * The handler reported VM_FAULT_WRITE although
				 * the vma lacks VM_WRITE: stop asking
				 * follow_page() for a writable pte on the retry.
				 */
				if ((ret & VM_FAULT_WRITE) &&
				    !(vma->vm_flags & VM_WRITE))
					foll_flags &= ~FOLL_WRITE;

				cond_resched();
			}
			if (IS_ERR(page))
				return i ? i : PTR_ERR(page);
			if (pages) {
				pages[i] = page;

				flush_anon_page(vma, page, start);
				flush_dcache_page(page);
			}
next_page:
			/* per-page bookkeeping; also the target of the gate-area and guard-page jumps */
			if (vmas)
				vmas[i] = vma;
			i++;
			start += PAGE_SIZE;
			nr_pages--;
		} while (nr_pages && start < vma->vm_end);
	} while (nr_pages);
	return i;
}
EXPORT_SYMBOL(__get_user_pages);
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
1836 unsigned long address, unsigned int fault_flags)
1837{
1838 struct vm_area_struct *vma;
1839 int ret;
1840
1841 vma = find_extend_vma(mm, address);
1842 if (!vma || address < vma->vm_start)
1843 return -EFAULT;
1844
1845 ret = handle_mm_fault(mm, vma, address, fault_flags);
1846 if (ret & VM_FAULT_ERROR) {
1847 if (ret & VM_FAULT_OOM)
1848 return -ENOMEM;
1849 if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
1850 return -EHWPOISON;
1851 if (ret & VM_FAULT_SIGBUS)
1852 return -EFAULT;
1853 BUG();
1854 }
1855 if (tsk) {
1856 if (ret & VM_FAULT_MAJOR)
1857 tsk->maj_flt++;
1858 else
1859 tsk->min_flt++;
1860 }
1861 return 0;
1862}
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1916 unsigned long start, int nr_pages, int write, int force,
1917 struct page **pages, struct vm_area_struct **vmas)
1918{
1919 int flags = FOLL_TOUCH;
1920
1921 if (pages)
1922 flags |= FOLL_GET;
1923 if (write)
1924 flags |= FOLL_WRITE;
1925 if (force)
1926 flags |= FOLL_FORCE;
1927
1928 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
1929 NULL);
1930}
1931EXPORT_SYMBOL(get_user_pages);
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
#ifdef CONFIG_ELF_CORE
/*
 * get_dump_page() - fetch one page of the current task for dumping
 * @addr: user address in current->mm
 *
 * Asks __get_user_pages() for exactly one page with
 * FOLL_FORCE | FOLL_DUMP | FOLL_GET.  Returns NULL when fewer than one page
 * came back; otherwise flushes the cache for that page and returns it.  The
 * reference obtained through FOLL_GET is left for the caller to drop.
 */
struct page *get_dump_page(unsigned long addr)
{
	struct vm_area_struct *vma;
	struct page *page;
	int got;

	got = __get_user_pages(current, current->mm, addr, 1,
			       FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
			       NULL);
	if (got < 1)
		return NULL;

	flush_cache_page(vma, addr, page_to_pfn(page));
	return page;
}
#endif /* CONFIG_ELF_CORE */
1961
1962pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
1963 spinlock_t **ptl)
1964{
1965 pgd_t * pgd = pgd_offset(mm, addr);
1966 pud_t * pud = pud_alloc(mm, pgd, addr);
1967 if (pud) {
1968 pmd_t * pmd = pmd_alloc(mm, pud, addr);
1969 if (pmd) {
1970 VM_BUG_ON(pmd_trans_huge(*pmd));
1971 return pte_alloc_map_lock(mm, pmd, addr, ptl);
1972 }
1973 }
1974 return NULL;
1975}
1976
1977
1978
1979
1980
1981
1982
1983
1984static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1985 struct page *page, pgprot_t prot)
1986{
1987 struct mm_struct *mm = vma->vm_mm;
1988 int retval;
1989 pte_t *pte;
1990 spinlock_t *ptl;
1991
1992 retval = -EINVAL;
1993 if (PageAnon(page))
1994 goto out;
1995 retval = -ENOMEM;
1996 flush_dcache_page(page);
1997 pte = get_locked_pte(mm, addr, &ptl);
1998 if (!pte)
1999 goto out;
2000 retval = -EBUSY;
2001 if (!pte_none(*pte))
2002 goto out_unlock;
2003
2004
2005 get_page(page);
2006 inc_mm_counter_fast(mm, MM_FILEPAGES);
2007 page_add_file_rmap(page);
2008 set_pte_at(mm, addr, pte, mk_pte(page, prot));
2009
2010 retval = 0;
2011 pte_unmap_unlock(pte, ptl);
2012 return retval;
2013out_unlock:
2014 pte_unmap_unlock(pte, ptl);
2015out:
2016 return retval;
2017}
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
2042 struct page *page)
2043{
2044 if (addr < vma->vm_start || addr >= vma->vm_end)
2045 return -EFAULT;
2046 if (!page_count(page))
2047 return -EINVAL;
2048 vma->vm_flags |= VM_INSERTPAGE;
2049 return insert_page(vma, addr, page, vma->vm_page_prot);
2050}
2051EXPORT_SYMBOL(vm_insert_page);
2052
2053static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2054 unsigned long pfn, pgprot_t prot)
2055{
2056 struct mm_struct *mm = vma->vm_mm;
2057 int retval;
2058 pte_t *pte, entry;
2059 spinlock_t *ptl;
2060
2061 retval = -ENOMEM;
2062 pte = get_locked_pte(mm, addr, &ptl);
2063 if (!pte)
2064 goto out;
2065 retval = -EBUSY;
2066 if (!pte_none(*pte))
2067 goto out_unlock;
2068
2069
2070 entry = pte_mkspecial(pfn_pte(pfn, prot));
2071 set_pte_at(mm, addr, pte, entry);
2072 update_mmu_cache(vma, addr, pte);
2073
2074 retval = 0;
2075out_unlock:
2076 pte_unmap_unlock(pte, ptl);
2077out:
2078 return retval;
2079}
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2099 unsigned long pfn)
2100{
2101 int ret;
2102 pgprot_t pgprot = vma->vm_page_prot;
2103
2104
2105
2106
2107
2108
2109 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
2110 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
2111 (VM_PFNMAP|VM_MIXEDMAP));
2112 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
2113 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
2114
2115 if (addr < vma->vm_start || addr >= vma->vm_end)
2116 return -EFAULT;
2117 if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE))
2118 return -EINVAL;
2119
2120 ret = insert_pfn(vma, addr, pfn, pgprot);
2121
2122 if (ret)
2123 untrack_pfn_vma(vma, pfn, PAGE_SIZE);
2124
2125 return ret;
2126}
2127EXPORT_SYMBOL(vm_insert_pfn);
2128
2129int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
2130 unsigned long pfn)
2131{
2132 BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
2133
2134 if (addr < vma->vm_start || addr >= vma->vm_end)
2135 return -EFAULT;
2136
2137
2138
2139
2140
2141
2142
2143
2144 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
2145 struct page *page;
2146
2147 page = pfn_to_page(pfn);
2148 return insert_page(vma, addr, page, vma->vm_page_prot);
2149 }
2150 return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
2151}
2152EXPORT_SYMBOL(vm_insert_mixed);
2153
2154
2155
2156
2157
2158
2159static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
2160 unsigned long addr, unsigned long end,
2161 unsigned long pfn, pgprot_t prot)
2162{
2163 pte_t *pte;
2164 spinlock_t *ptl;
2165
2166 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
2167 if (!pte)
2168 return -ENOMEM;
2169 arch_enter_lazy_mmu_mode();
2170 do {
2171 BUG_ON(!pte_none(*pte));
2172 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
2173 pfn++;
2174 } while (pte++, addr += PAGE_SIZE, addr != end);
2175 arch_leave_lazy_mmu_mode();
2176 pte_unmap_unlock(pte - 1, ptl);
2177 return 0;
2178}
2179
2180static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
2181 unsigned long addr, unsigned long end,
2182 unsigned long pfn, pgprot_t prot)
2183{
2184 pmd_t *pmd;
2185 unsigned long next;
2186
2187 pfn -= addr >> PAGE_SHIFT;
2188 pmd = pmd_alloc(mm, pud, addr);
2189 if (!pmd)
2190 return -ENOMEM;
2191 VM_BUG_ON(pmd_trans_huge(*pmd));
2192 do {
2193 next = pmd_addr_end(addr, end);
2194 if (remap_pte_range(mm, pmd, addr, next,
2195 pfn + (addr >> PAGE_SHIFT), prot))
2196 return -ENOMEM;
2197 } while (pmd++, addr = next, addr != end);
2198 return 0;
2199}
2200
2201static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
2202 unsigned long addr, unsigned long end,
2203 unsigned long pfn, pgprot_t prot)
2204{
2205 pud_t *pud;
2206 unsigned long next;
2207
2208 pfn -= addr >> PAGE_SHIFT;
2209 pud = pud_alloc(mm, pgd, addr);
2210 if (!pud)
2211 return -ENOMEM;
2212 do {
2213 next = pud_addr_end(addr, end);
2214 if (remap_pmd_range(mm, pud, addr, next,
2215 pfn + (addr >> PAGE_SHIFT), prot))
2216 return -ENOMEM;
2217 } while (pud++, addr = next, addr != end);
2218 return 0;
2219}
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
2232 unsigned long pfn, unsigned long size, pgprot_t prot)
2233{
2234 pgd_t *pgd;
2235 unsigned long next;
2236 unsigned long end = addr + PAGE_ALIGN(size);
2237 struct mm_struct *mm = vma->vm_mm;
2238 int err;
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258 if (addr == vma->vm_start && end == vma->vm_end) {
2259 vma->vm_pgoff = pfn;
2260 vma->vm_flags |= VM_PFN_AT_MMAP;
2261 } else if (is_cow_mapping(vma->vm_flags))
2262 return -EINVAL;
2263
2264 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
2265
2266 err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size));
2267 if (err) {
2268
2269
2270
2271
2272 vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP);
2273 vma->vm_flags &= ~VM_PFN_AT_MMAP;
2274 return -EINVAL;
2275 }
2276
2277 BUG_ON(addr >= end);
2278 pfn -= addr >> PAGE_SHIFT;
2279 pgd = pgd_offset(mm, addr);
2280 flush_cache_range(vma, addr, end);
2281 do {
2282 next = pgd_addr_end(addr, end);
2283 err = remap_pud_range(mm, pgd, addr, next,
2284 pfn + (addr >> PAGE_SHIFT), prot);
2285 if (err)
2286 break;
2287 } while (pgd++, addr = next, addr != end);
2288
2289 if (err)
2290 untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size));
2291
2292 return err;
2293}
2294EXPORT_SYMBOL(remap_pfn_range);
2295
2296static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
2297 unsigned long addr, unsigned long end,
2298 pte_fn_t fn, void *data)
2299{
2300 pte_t *pte;
2301 int err;
2302 pgtable_t token;
2303 spinlock_t *uninitialized_var(ptl);
2304
2305 pte = (mm == &init_mm) ?
2306 pte_alloc_kernel(pmd, addr) :
2307 pte_alloc_map_lock(mm, pmd, addr, &ptl);
2308 if (!pte)
2309 return -ENOMEM;
2310
2311 BUG_ON(pmd_huge(*pmd));
2312
2313 arch_enter_lazy_mmu_mode();
2314
2315 token = pmd_pgtable(*pmd);
2316
2317 do {
2318 err = fn(pte++, token, addr, data);
2319 if (err)
2320 break;
2321 } while (addr += PAGE_SIZE, addr != end);
2322
2323 arch_leave_lazy_mmu_mode();
2324
2325 if (mm != &init_mm)
2326 pte_unmap_unlock(pte-1, ptl);
2327 return err;
2328}
2329
2330static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
2331 unsigned long addr, unsigned long end,
2332 pte_fn_t fn, void *data)
2333{
2334 pmd_t *pmd;
2335 unsigned long next;
2336 int err;
2337
2338 BUG_ON(pud_huge(*pud));
2339
2340 pmd = pmd_alloc(mm, pud, addr);
2341 if (!pmd)
2342 return -ENOMEM;
2343 do {
2344 next = pmd_addr_end(addr, end);
2345 err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
2346 if (err)
2347 break;
2348 } while (pmd++, addr = next, addr != end);
2349 return err;
2350}
2351
2352static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
2353 unsigned long addr, unsigned long end,
2354 pte_fn_t fn, void *data)
2355{
2356 pud_t *pud;
2357 unsigned long next;
2358 int err;
2359
2360 pud = pud_alloc(mm, pgd, addr);
2361 if (!pud)
2362 return -ENOMEM;
2363 do {
2364 next = pud_addr_end(addr, end);
2365 err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
2366 if (err)
2367 break;
2368 } while (pud++, addr = next, addr != end);
2369 return err;
2370}
2371
2372
2373
2374
2375
2376int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2377 unsigned long size, pte_fn_t fn, void *data)
2378{
2379 pgd_t *pgd;
2380 unsigned long next;
2381 unsigned long end = addr + size;
2382 int err;
2383
2384 BUG_ON(addr >= end);
2385 pgd = pgd_offset(mm, addr);
2386 do {
2387 next = pgd_addr_end(addr, end);
2388 err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
2389 if (err)
2390 break;
2391 } while (pgd++, addr = next, addr != end);
2392
2393 return err;
2394}
2395EXPORT_SYMBOL_GPL(apply_to_page_range);
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
2407 pte_t *page_table, pte_t orig_pte)
2408{
2409 int same = 1;
2410#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
2411 if (sizeof(pte_t) > sizeof(unsigned long)) {
2412 spinlock_t *ptl = pte_lockptr(mm, pmd);
2413 spin_lock(ptl);
2414 same = pte_same(*page_table, orig_pte);
2415 spin_unlock(ptl);
2416 }
2417#endif
2418 pte_unmap(page_table);
2419 return same;
2420}
2421
2422static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
2423{
2424
2425
2426
2427
2428
2429
2430 if (unlikely(!src)) {
2431 void *kaddr = kmap_atomic(dst, KM_USER0);
2432 void __user *uaddr = (void __user *)(va & PAGE_MASK);
2433
2434
2435
2436
2437
2438
2439
2440 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
2441 clear_page(kaddr);
2442 kunmap_atomic(kaddr, KM_USER0);
2443 flush_dcache_page(dst);
2444 } else
2445 copy_user_highpage(dst, src, va, vma);
2446}
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
/*
 * do_wp_page() - handle a write fault on a pte that is present
 * @mm:         address space of the fault
 * @vma:        vma containing @address
 * @address:    faulting user address
 * @page_table: mapped pte for @address
 * @pmd:        pmd above @page_table, used to re-map and re-lock the pte
 * @ptl:        lock protecting @page_table; held on entry
 * @orig_pte:   pte value the caller observed
 *
 * Either makes the existing mapping writable in place (the "reuse" path) or
 * allocates a new page, fills it via cow_user_page() or with zeroes, and
 * installs it (the "gotten" path).  Every return path has unmapped the pte
 * and dropped @ptl, as the __releases() annotation states.
 *
 * Returns a mask of VM_FAULT_* bits: VM_FAULT_WRITE when a writable pte was
 * installed, VM_FAULT_OOM on allocation or charge failure, the
 * ->page_mkwrite() result when that callback fails, or 0 when the pte
 * changed underneath us and nothing was done.
 */
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		spinlock_t *ptl, pte_t orig_pte)
	__releases(ptl)
{
	struct page *old_page, *new_page;
	pte_t entry;
	int ret = 0;
	int page_mkwrite = 0;
	struct page *dirty_page = NULL;

	old_page = vm_normal_page(vma, address, orig_pte);
	if (!old_page) {
		/*
		 * No struct page behind this pte.  A shared writable vma
		 * just gets the pte upgraded in place; anything else goes
		 * through the copy path with old_page == NULL.
		 */
		if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
				     (VM_WRITE|VM_SHARED))
			goto reuse;
		goto gotten;
	}

	/*
	 * Anonymous, non-KSM page: it may be reused in place if
	 * reuse_swap_page() agrees, which requires the page lock.
	 */
	if (PageAnon(old_page) && !PageKsm(old_page)) {
		if (!trylock_page(old_page)) {
			/*
			 * Could not get the page lock without sleeping: pin
			 * the page, drop the pte lock, sleep on the page
			 * lock, then retake the pte lock and revalidate.
			 */
			page_cache_get(old_page);
			pte_unmap_unlock(page_table, ptl);
			lock_page(old_page);
			page_table = pte_offset_map_lock(mm, pmd, address,
							 &ptl);
			if (!pte_same(*page_table, orig_pte)) {
				/* pte changed meanwhile; the pin is dropped at "unlock" */
				unlock_page(old_page);
				goto unlock;
			}
			page_cache_release(old_page);
		}
		if (reuse_swap_page(old_page)) {
			/* page can be reused: rebind its anon rmap to this vma and reuse it */
			page_move_anon_rmap(old_page, vma, address);
			unlock_page(old_page);
			goto reuse;
		}
		unlock_page(old_page);
	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
					(VM_WRITE|VM_SHARED))) {
		/*
		 * Shared writable mapping of a normal page: no copy is made.
		 * Give the owner a chance to object or prepare through
		 * ->page_mkwrite() before the pte becomes writable.
		 */
		if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
			struct vm_fault vmf;
			int tmp;

			vmf.virtual_address = (void __user *)(address &
								PAGE_MASK);
			vmf.pgoff = old_page->index;
			vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
			vmf.page = old_page;

			/*
			 * Pin the page and drop the pte lock around the
			 * callback.  The pin is released at "unlock",
			 * "unwritable_page", or after the page is dirtied
			 * below.
			 */
			page_cache_get(old_page);
			pte_unmap_unlock(page_table, ptl);

			tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
			if (unlikely(tmp &
					(VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
				ret = tmp;
				goto unwritable_page;
			}
			if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
				/* callback left the page unlocked: lock it ourselves */
				lock_page(old_page);
				if (!old_page->mapping) {
					/* page lost its mapping: return 0 without touching the pte */
					ret = 0;
					unlock_page(old_page);
					goto unwritable_page;
				}
			} else
				VM_BUG_ON(!PageLocked(old_page));

			/*
			 * The pte lock was dropped above, so retake it and
			 * make sure the pte is still the one we started with.
			 */
			page_table = pte_offset_map_lock(mm, pmd, address,
							 &ptl);
			if (!pte_same(*page_table, orig_pte)) {
				unlock_page(old_page);
				goto unlock;
			}

			page_mkwrite = 1;
		}
		dirty_page = old_page;
		get_page(dirty_page);

reuse:
		/*
		 * Reuse in place: mark the pte young and dirty, make it
		 * writable if the vma allows (maybe_mkwrite), and drop the
		 * pte lock.  Also reached by goto from the two cases above,
		 * with dirty_page still NULL.
		 */
		flush_cache_page(vma, address, pte_pfn(orig_pte));
		entry = pte_mkyoung(orig_pte);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		if (ptep_set_access_flags(vma, address, page_table, entry,1))
			update_mmu_cache(vma, address, page_table);
		pte_unmap_unlock(page_table, ptl);
		ret |= VM_FAULT_WRITE;

		if (!dirty_page)
			return ret;

		/*
		 * Shared page: dirty it now that the pte lock is gone.
		 * Without ->page_mkwrite() wait for any page-lock holder and
		 * dirty with balancing; with it, the page is still locked
		 * from above, so dirty, unlock, drop the pin, then balance.
		 */
		if (!page_mkwrite) {
			wait_on_page_locked(dirty_page);
			set_page_dirty_balance(dirty_page, page_mkwrite);
		}
		/* drops the reference taken by get_page(dirty_page) above */
		put_page(dirty_page);
		if (page_mkwrite) {
			/* read ->mapping while the page is still locked */
			struct address_space *mapping = dirty_page->mapping;

			set_page_dirty(dirty_page);
			unlock_page(dirty_page);
			page_cache_release(dirty_page);
			if (mapping)	{
				balance_dirty_pages_ratelimited(mapping);
			}
		}

		/* update the file time after the page lock has been released */
		if (vma->vm_file)
			file_update_time(vma->vm_file);

		return ret;
	}

	/*
	 * The page has to be copied.  Pin the old page so it stays around
	 * while the pte lock is dropped for the allocation.
	 */
	page_cache_get(old_page);
gotten:
	pte_unmap_unlock(page_table, ptl);

	if (unlikely(anon_vma_prepare(vma)))
		goto oom;

	if (is_zero_pfn(pte_pfn(orig_pte))) {
		/* source is the zero page: a freshly zeroed page is enough */
		new_page = alloc_zeroed_user_highpage_movable(vma, address);
		if (!new_page)
			goto oom;
	} else {
		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
		if (!new_page)
			goto oom;
		cow_user_page(new_page, old_page, address, vma);
	}
	__SetPageUptodate(new_page);

	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
		goto oom_free_new;

	/* retake the pte lock and only install the copy if the pte is unchanged */
	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (likely(pte_same(*page_table, orig_pte))) {
		/*
		 * rss accounting: a file page being replaced moves one count
		 * from MM_FILEPAGES to MM_ANONPAGES; with no old page only
		 * MM_ANONPAGES grows.
		 */
		if (old_page) {
			if (!PageAnon(old_page)) {
				dec_mm_counter_fast(mm, MM_FILEPAGES);
				inc_mm_counter_fast(mm, MM_ANONPAGES);
			}
		} else
			inc_mm_counter_fast(mm, MM_ANONPAGES);
		flush_cache_page(vma, address, pte_pfn(orig_pte));
		entry = mk_pte(new_page, vma->vm_page_prot);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		/* clear and flush the old pte before the new one is written */
		ptep_clear_flush(vma, address, page_table);
		page_add_new_anon_rmap(new_page, vma, address);
		set_pte_at_notify(mm, address, page_table, entry);
		update_mmu_cache(vma, address, page_table);
		if (old_page) {
			/* the old page is no longer mapped here */
			page_remove_rmap(old_page);
		}

		/*
		 * new_page is now owned by the page table.  Point new_page
		 * at old_page so the release below drops one reference on
		 * the old page instead.
		 */
		new_page = old_page;
		ret |= VM_FAULT_WRITE;
	} else
		mem_cgroup_uncharge_page(new_page);

	if (new_page)
		page_cache_release(new_page);
unlock:
	pte_unmap_unlock(page_table, ptl);
	if (old_page) {
		/*
		 * If the copy succeeded in a VM_LOCKED vma, munlock the page
		 * that was just replaced.
		 */
		if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
			lock_page(old_page);	/* munlock_vma_page() is called under the page lock */
			munlock_vma_page(old_page);
			unlock_page(old_page);
		}
		/* drop the pin taken before the pte lock was released */
		page_cache_release(old_page);
	}
	return ret;
oom_free_new:
	page_cache_release(new_page);
oom:
	if (old_page) {
		if (page_mkwrite) {
			unlock_page(old_page);
			page_cache_release(old_page);
		}
		page_cache_release(old_page);
	}
	return VM_FAULT_OOM;

unwritable_page:
	/* ->page_mkwrite() path bailed out: drop the pin taken before the callback */
	page_cache_release(old_page);
	return ret;
}
2749
/*
 * Zap the pages of @vma in [@start_addr, @end_addr): converts the end
 * address to a length and hands the range to zap_page_range().
 */
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
		unsigned long start_addr, unsigned long end_addr,
		struct zap_details *details)
{
	const unsigned long length = end_addr - start_addr;

	zap_page_range(vma, start_addr, length, details);
}
2756
2757static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
2758 struct zap_details *details)
2759{
2760 struct vm_area_struct *vma;
2761 struct prio_tree_iter iter;
2762 pgoff_t vba, vea, zba, zea;
2763
2764 vma_prio_tree_foreach(vma, &iter, root,
2765 details->first_index, details->last_index) {
2766
2767 vba = vma->vm_pgoff;
2768 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
2769
2770 zba = details->first_index;
2771 if (zba < vba)
2772 zba = vba;
2773 zea = details->last_index;
2774 if (zea > vea)
2775 zea = vea;
2776
2777 unmap_mapping_range_vma(vma,
2778 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
2779 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
2780 details);
2781 }
2782}
2783
2784static inline void unmap_mapping_range_list(struct list_head *head,
2785 struct zap_details *details)
2786{
2787 struct vm_area_struct *vma;
2788
2789
2790
2791
2792
2793
2794
2795 list_for_each_entry(vma, head, shared.vm_set.list) {
2796 details->nonlinear_vma = vma;
2797 unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
2798 }
2799}
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815void unmap_mapping_range(struct address_space *mapping,
2816 loff_t const holebegin, loff_t const holelen, int even_cows)
2817{
2818 struct zap_details details;
2819 pgoff_t hba = holebegin >> PAGE_SHIFT;
2820 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2821
2822
2823 if (sizeof(holelen) > sizeof(hlen)) {
2824 long long holeend =
2825 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2826 if (holeend & ~(long long)ULONG_MAX)
2827 hlen = ULONG_MAX - hba + 1;
2828 }
2829
2830 details.check_mapping = even_cows? NULL: mapping;
2831 details.nonlinear_vma = NULL;
2832 details.first_index = hba;
2833 details.last_index = hba + hlen - 1;
2834 if (details.last_index < details.first_index)
2835 details.last_index = ULONG_MAX;
2836
2837
2838 mutex_lock(&mapping->i_mmap_mutex);
2839 if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
2840 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2841 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
2842 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
2843 mutex_unlock(&mapping->i_mmap_mutex);
2844}
2845EXPORT_SYMBOL(unmap_mapping_range);
2846
2847
2848
2849
2850
2851
/*
 * do_swap_page() - handle a fault on a pte that holds a swap-type entry
 * @mm:         address space of the fault
 * @vma:        vma containing @address
 * @address:    faulting user address
 * @page_table: mapped (but not locked) pte for @address
 * @pmd:        pmd above @page_table
 * @flags:      FAULT_FLAG_* bits of the fault
 * @orig_pte:   pte value the caller observed
 *
 * Brings the page back from the swap cache or from swap, installs a present
 * pte for it and frees the swap entry.  Migration, hwpoison and unknown
 * non-swap entries are handled up front.  A write fault that could not be
 * satisfied by reusing the page is passed on to do_wp_page().
 *
 * Returns a mask of VM_FAULT_* bits (0 if nothing had to be reported).
 */
static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		unsigned int flags, pte_t orig_pte)
{
	spinlock_t *ptl;
	struct page *page, *swapcache = NULL;
	swp_entry_t entry;
	pte_t pte;
	int locked;
	struct mem_cgroup *ptr;
	int exclusive = 0;
	int ret = 0;

	/* unmaps the pte; bail out if it no longer matches what the caller saw */
	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
		goto out;

	entry = pte_to_swp_entry(orig_pte);
	if (unlikely(non_swap_entry(entry))) {
		/* not a real swap entry: migration, hwpoison, or garbage */
		if (is_migration_entry(entry)) {
			migration_entry_wait(mm, pmd, address);
		} else if (is_hwpoison_entry(entry)) {
			ret = VM_FAULT_HWPOISON;
		} else {
			print_bad_pte(vma, address, orig_pte, NULL);
			ret = VM_FAULT_SIGBUS;
		}
		goto out;
	}
	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
	page = lookup_swap_cache(entry);
	if (!page) {
		/* not in the swap cache: read it in */
		grab_swap_token(mm);
		page = swapin_readahead(entry,
					GFP_HIGHUSER_MOVABLE, vma, address);
		if (!page) {
			/*
			 * Read-in produced no page.  Report OOM only if the
			 * pte is still the one we started with; otherwise
			 * return 0.
			 */
			page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
			if (likely(pte_same(*page_table, orig_pte)))
				ret = VM_FAULT_OOM;
			delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
			goto unlock;
		}

		/* the page had to be read in: account a major fault */
		ret = VM_FAULT_MAJOR;
		count_vm_event(PGMAJFAULT);
		mem_cgroup_count_vm_event(mm, PGMAJFAULT);
	} else if (PageHWPoison(page)) {
		/* swap cache page is hardware-poisoned: fail without mapping it */
		ret = VM_FAULT_HWPOISON;
		delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
		goto out_release;
	}

	locked = lock_page_or_retry(page, mm, flags);
	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
	if (!locked) {
		ret |= VM_FAULT_RETRY;
		goto out_release;
	}

	/*
	 * Now that the page is locked, make sure it is still the swap cache
	 * page for this entry; if not, back out and return without mapping.
	 */
	if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
		goto out_page;

	if (ksm_might_need_to_copy(page, vma, address)) {
		/*
		 * KSM may require a private copy.  Keep the original in
		 * swapcache (still locked and referenced) and continue with
		 * the copy as page.
		 */
		swapcache = page;
		page = ksm_does_need_to_copy(page, vma, address);

		if (unlikely(!page)) {
			/* copy failed: restore page so the unwind below releases the original once */
			ret = VM_FAULT_OOM;
			page = swapcache;
			swapcache = NULL;
			goto out_page;
		}
	}

	if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
		ret = VM_FAULT_OOM;
		goto out_page;
	}

	/* retake the pte lock and recheck the pte before installing anything */
	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (unlikely(!pte_same(*page_table, orig_pte)))
		goto out_nomap;

	if (unlikely(!PageUptodate(page))) {
		ret = VM_FAULT_SIGBUS;
		goto out_nomap;
	}

	/* one swap entry becomes one anonymous page in the rss counters */
	inc_mm_counter_fast(mm, MM_ANONPAGES);
	dec_mm_counter_fast(mm, MM_SWAPENTS);
	pte = mk_pte(page, vma->vm_page_prot);
	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
		/*
		 * Write fault and the page can be reused: map it dirty (and
		 * writable if the vma allows) right away and clear
		 * FAULT_FLAG_WRITE so do_wp_page() is not called below.
		 */
		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
		flags &= ~FAULT_FLAG_WRITE;
		ret |= VM_FAULT_WRITE;
		exclusive = 1;
	}
	flush_icache_page(vma, page);
	set_pte_at(mm, address, page_table, pte);
	do_page_add_anon_rmap(page, vma, address, exclusive);
	/* the pte is in place: make the memcg charge permanent */
	mem_cgroup_commit_charge_swapin(page, ptr);

	swap_free(entry);
	if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
		try_to_free_swap(page);
	unlock_page(page);
	if (swapcache) {
		/* a KSM copy was mapped: release the original swap cache page */
		unlock_page(swapcache);
		page_cache_release(swapcache);
	}

	if (flags & FAULT_FLAG_WRITE) {
		/*
		 * Still a write fault: let do_wp_page() break COW.  It drops
		 * the pte lock itself, hence the jump past "unlock".  On
		 * error only the error bits are reported.
		 */
		ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
		if (ret & VM_FAULT_ERROR)
			ret &= VM_FAULT_ERROR;
		goto out;
	}

	/* no further fault handling needed: refresh the mmu cache for the new pte */
	update_mmu_cache(vma, address, page_table);
unlock:
	pte_unmap_unlock(page_table, ptl);
out:
	return ret;
out_nomap:
	/* pte lock held, charge reserved but not committed */
	mem_cgroup_cancel_charge_swapin(ptr);
	pte_unmap_unlock(page_table, ptl);
out_page:
	/* page locked */
	unlock_page(page);
out_release:
	/* page referenced; swapcache, if set, is locked and referenced too */
	page_cache_release(page);
	if (swapcache) {
		unlock_page(swapcache);
		page_cache_release(swapcache);
	}
	return ret;
}
3029
3030
3031
3032
3033
3034
3035static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
3036{
3037 address &= PAGE_MASK;
3038 if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) {
3039 struct vm_area_struct *prev = vma->vm_prev;
3040
3041
3042
3043
3044
3045
3046
3047 if (prev && prev->vm_end == address)
3048 return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
3049
3050 expand_downwards(vma, address - PAGE_SIZE);
3051 }
3052 if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
3053 struct vm_area_struct *next = vma->vm_next;
3054
3055
3056 if (next && next->vm_start == address + PAGE_SIZE)
3057 return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
3058
3059 expand_upwards(vma, address + PAGE_SIZE);
3060 }
3061 return 0;
3062}
3063
3064
3065
3066
3067
3068
3069static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
3070 unsigned long address, pte_t *page_table, pmd_t *pmd,
3071 unsigned int flags)
3072{
3073 struct page *page;
3074 spinlock_t *ptl;
3075 pte_t entry;
3076
3077 pte_unmap(page_table);
3078
3079
3080 if (check_stack_guard_page(vma, address) < 0)
3081 return VM_FAULT_SIGBUS;
3082
3083
3084 if (!(flags & FAULT_FLAG_WRITE)) {
3085 entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
3086 vma->vm_page_prot));
3087 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
3088 if (!pte_none(*page_table))
3089 goto unlock;
3090 goto setpte;
3091 }
3092
3093
3094 if (unlikely(anon_vma_prepare(vma)))
3095 goto oom;
3096 page = alloc_zeroed_user_highpage_movable(vma, address);
3097 if (!page)
3098 goto oom;
3099 __SetPageUptodate(page);
3100
3101 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
3102 goto oom_free_page;
3103
3104 entry = mk_pte(page, vma->vm_page_prot);
3105 if (vma->vm_flags & VM_WRITE)
3106 entry = pte_mkwrite(pte_mkdirty(entry));
3107
3108 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
3109 if (!pte_none(*page_table))
3110 goto release;
3111
3112 inc_mm_counter_fast(mm, MM_ANONPAGES);
3113 page_add_new_anon_rmap(page, vma, address);
3114setpte:
3115 set_pte_at(mm, address, page_table, entry);
3116
3117
3118 update_mmu_cache(vma, address, page_table);
3119unlock:
3120 pte_unmap_unlock(page_table, ptl);
3121 return 0;
3122release:
3123 mem_cgroup_uncharge_page(page);
3124 page_cache_release(page);
3125 goto unlock;
3126oom_free_page:
3127 page_cache_release(page);
3128oom:
3129 return VM_FAULT_OOM;
3130}
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3146 unsigned long address, pmd_t *pmd,
3147 pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
3148{
3149 pte_t *page_table;
3150 spinlock_t *ptl;
3151 struct page *page;
3152 struct page *cow_page;
3153 pte_t entry;
3154 int anon = 0;
3155 struct page *dirty_page = NULL;
3156 struct vm_fault vmf;
3157 int ret;
3158 int page_mkwrite = 0;
3159
3160
3161
3162
3163
3164 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
3165
3166 if (unlikely(anon_vma_prepare(vma)))
3167 return VM_FAULT_OOM;
3168
3169 cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
3170 if (!cow_page)
3171 return VM_FAULT_OOM;
3172
3173 if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) {
3174 page_cache_release(cow_page);
3175 return VM_FAULT_OOM;
3176 }
3177 } else
3178 cow_page = NULL;
3179
3180 vmf.virtual_address = (void __user *)(address & PAGE_MASK);
3181 vmf.pgoff = pgoff;
3182 vmf.flags = flags;
3183 vmf.page = NULL;
3184
3185 ret = vma->vm_ops->fault(vma, &vmf);
3186 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
3187 VM_FAULT_RETRY)))
3188 goto uncharge_out;
3189
3190 if (unlikely(PageHWPoison(vmf.page))) {
3191 if (ret & VM_FAULT_LOCKED)
3192 unlock_page(vmf.page);
3193 ret = VM_FAULT_HWPOISON;
3194 goto uncharge_out;
3195 }
3196
3197
3198
3199
3200
3201 if (unlikely(!(ret & VM_FAULT_LOCKED)))
3202 lock_page(vmf.page);
3203 else
3204 VM_BUG_ON(!PageLocked(vmf.page));
3205
3206
3207
3208
3209 page = vmf.page;
3210 if (flags & FAULT_FLAG_WRITE) {
3211 if (!(vma->vm_flags & VM_SHARED)) {
3212 page = cow_page;
3213 anon = 1;
3214 copy_user_highpage(page, vmf.page, address, vma);
3215 __SetPageUptodate(page);
3216 } else {
3217
3218
3219
3220
3221
3222 if (vma->vm_ops->page_mkwrite) {
3223 int tmp;
3224
3225 unlock_page(page);
3226 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
3227 tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
3228 if (unlikely(tmp &
3229 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
3230 ret = tmp;
3231 goto unwritable_page;
3232 }
3233 if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
3234 lock_page(page);
3235 if (!page->mapping) {
3236 ret = 0;
3237 unlock_page(page);
3238 goto unwritable_page;
3239 }
3240 } else
3241 VM_BUG_ON(!PageLocked(page));
3242 page_mkwrite = 1;
3243 }
3244 }
3245
3246 }
3247
3248 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261 if (likely(pte_same(*page_table, orig_pte))) {
3262 flush_icache_page(vma, page);
3263 entry = mk_pte(page, vma->vm_page_prot);
3264 if (flags & FAULT_FLAG_WRITE)
3265 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
3266 if (anon) {
3267 inc_mm_counter_fast(mm, MM_ANONPAGES);
3268 page_add_new_anon_rmap(page, vma, address);
3269 } else {
3270 inc_mm_counter_fast(mm, MM_FILEPAGES);
3271 page_add_file_rmap(page);
3272 if (flags & FAULT_FLAG_WRITE) {
3273 dirty_page = page;
3274 get_page(dirty_page);
3275 }
3276 }
3277 set_pte_at(mm, address, page_table, entry);
3278
3279
3280 update_mmu_cache(vma, address, page_table);
3281 } else {
3282 if (cow_page)
3283 mem_cgroup_uncharge_page(cow_page);
3284 if (anon)
3285 page_cache_release(page);
3286 else
3287 anon = 1;
3288 }
3289
3290 pte_unmap_unlock(page_table, ptl);
3291
3292 if (dirty_page) {
3293 struct address_space *mapping = page->mapping;
3294
3295 if (set_page_dirty(dirty_page))
3296 page_mkwrite = 1;
3297 unlock_page(dirty_page);
3298 put_page(dirty_page);
3299 if (page_mkwrite && mapping) {
3300
3301
3302
3303
3304 balance_dirty_pages_ratelimited(mapping);
3305 }
3306
3307
3308 if (vma->vm_file)
3309 file_update_time(vma->vm_file);
3310 } else {
3311 unlock_page(vmf.page);
3312 if (anon)
3313 page_cache_release(vmf.page);
3314 }
3315
3316 return ret;
3317
3318unwritable_page:
3319 page_cache_release(page);
3320 return ret;
3321uncharge_out:
3322
3323 if (cow_page) {
3324 mem_cgroup_uncharge_page(cow_page);
3325 page_cache_release(cow_page);
3326 }
3327 return ret;
3328}
3329
3330static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3331 unsigned long address, pte_t *page_table, pmd_t *pmd,
3332 unsigned int flags, pte_t orig_pte)
3333{
3334 pgoff_t pgoff = (((address & PAGE_MASK)
3335 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
3336
3337 pte_unmap(page_table);
3338 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3339}
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3351 unsigned long address, pte_t *page_table, pmd_t *pmd,
3352 unsigned int flags, pte_t orig_pte)
3353{
3354 pgoff_t pgoff;
3355
3356 flags |= FAULT_FLAG_NONLINEAR;
3357
3358 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
3359 return 0;
3360
3361 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
3362
3363
3364
3365 print_bad_pte(vma, address, orig_pte, NULL);
3366 return VM_FAULT_SIGBUS;
3367 }
3368
3369 pgoff = pte_to_pgoff(orig_pte);
3370 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3371}
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
/*
 * Resolve a fault on one pte according to the state of the entry.
 *
 * A non-present entry is dispatched without taking the pte lock here:
 *   - empty pte, vma has ->fault:  do_linear_fault()
 *   - empty pte otherwise:         do_anonymous_page()
 *   - file pte:                    do_nonlinear_fault()
 *   - anything else:               do_swap_page()
 *
 * A present entry is handled under the pte lock.  A write to a
 * non-writable pte is passed to do_wp_page() together with the held lock;
 * otherwise the entry is marked young (and dirty for a write) and written
 * back with ptep_set_access_flags().
 *
 * Returns 0, or the return value of the handler dispatched to.
 */
int handle_pte_fault(struct mm_struct *mm,
		     struct vm_area_struct *vma, unsigned long address,
		     pte_t *pte, pmd_t *pmd, unsigned int flags)
{
	pte_t entry;
	spinlock_t *ptl;

	/* Unlocked snapshot; it is revalidated under the lock where needed */
	entry = *pte;
	if (!pte_present(entry)) {
		if (pte_none(entry)) {
			if (vma->vm_ops) {
				if (likely(vma->vm_ops->fault))
					return do_linear_fault(mm, vma, address,
						pte, pmd, flags, entry);
			}
			return do_anonymous_page(mm, vma, address,
						 pte, pmd, flags);
		}
		if (pte_file(entry))
			return do_nonlinear_fault(mm, vma, address,
					pte, pmd, flags, entry);
		return do_swap_page(mm, vma, address,
					pte, pmd, flags, entry);
	}

	ptl = pte_lockptr(mm, pmd);
	spin_lock(ptl);
	/* The entry changed since the snapshot: nothing left to do */
	if (unlikely(!pte_same(*pte, entry)))
		goto unlock;
	if (flags & FAULT_FLAG_WRITE) {
		/* do_wp_page() receives ptl; this function does not unlock it */
		if (!pte_write(entry))
			return do_wp_page(mm, vma, address,
					pte, pmd, ptl, entry);
		entry = pte_mkdirty(entry);
	}
	entry = pte_mkyoung(entry);
	if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
		update_mmu_cache(vma, address, pte);
	} else {
		/*
		 * The pte was not modified.  For a write fault, run the
		 * spurious-fault TLB fixup for this address.
		 */
		if (flags & FAULT_FLAG_WRITE)
			flush_tlb_fix_spurious_fault(vma, address);
	}
unlock:
	pte_unmap_unlock(pte, ptl);
	return 0;
}
3438
3439
3440
3441
/*
 * Top-level page fault entry point for @address in @vma.
 *
 * Accounts the fault, hands hugetlb vmas to hugetlb_fault(), allocates the
 * pud/pmd/pte levels as needed, deals with transparent huge pmds and
 * finally calls handle_pte_fault() on the mapped pte.
 *
 * Returns 0 or a VM_FAULT_* code; VM_FAULT_OOM if a page-table level
 * cannot be allocated.
 */
int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, unsigned int flags)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	__set_current_state(TASK_RUNNING);

	count_vm_event(PGFAULT);
	mem_cgroup_count_vm_event(mm, PGFAULT);

	/* run the per-task RSS counter sync check for the current task */
	check_sync_rss_stat(current);

	if (unlikely(is_vm_hugetlb_page(vma)))
		return hugetlb_fault(mm, vma, address, flags);

	pgd = pgd_offset(mm, address);
	pud = pud_alloc(mm, pgd, address);
	if (!pud)
		return VM_FAULT_OOM;
	pmd = pmd_alloc(mm, pud, address);
	if (!pmd)
		return VM_FAULT_OOM;
	if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
		/* Empty pmd in a vma without vm_ops: try a huge anonymous page */
		if (!vma->vm_ops)
			return do_huge_pmd_anonymous_page(mm, vma, address,
							  pmd, flags);
	} else {
		/* Work on a snapshot; barrier() keeps the read from being redone */
		pmd_t orig_pmd = *pmd;
		barrier();
		if (pmd_trans_huge(orig_pmd)) {
			if (flags & FAULT_FLAG_WRITE &&
			    !pmd_write(orig_pmd) &&
			    !pmd_trans_splitting(orig_pmd))
				return do_huge_pmd_wp_page(mm, vma, address,
							   pmd, orig_pmd);
			return 0;
		}
	}

	/*
	 * Allocate a pte page only if the pmd is still empty; a failed
	 * allocation is reported as OOM.
	 */
	if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address))
		return VM_FAULT_OOM;
	/* The pmd has become a huge pmd: return 0 without touching it */
	if (unlikely(pmd_trans_huge(*pmd)))
		return 0;
	/*
	 * The pmd is neither empty nor huge here, so map the pte for
	 * @address and handle the fault at pte level.
	 */
	pte = pte_offset_map(pmd, address);

	return handle_pte_fault(mm, vma, address, pte, pmd, flags);
}
3505
3506#ifndef __PAGETABLE_PUD_FOLDED
3507
3508
3509
3510
3511int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
3512{
3513 pud_t *new = pud_alloc_one(mm, address);
3514 if (!new)
3515 return -ENOMEM;
3516
3517 smp_wmb();
3518
3519 spin_lock(&mm->page_table_lock);
3520 if (pgd_present(*pgd))
3521 pud_free(mm, new);
3522 else
3523 pgd_populate(mm, pgd, new);
3524 spin_unlock(&mm->page_table_lock);
3525 return 0;
3526}
3527#endif
3528
3529#ifndef __PAGETABLE_PMD_FOLDED
3530
3531
3532
3533
3534int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
3535{
3536 pmd_t *new = pmd_alloc_one(mm, address);
3537 if (!new)
3538 return -ENOMEM;
3539
3540 smp_wmb();
3541
3542 spin_lock(&mm->page_table_lock);
3543#ifndef __ARCH_HAS_4LEVEL_HACK
3544 if (pud_present(*pud))
3545 pmd_free(mm, new);
3546 else
3547 pud_populate(mm, pud, new);
3548#else
3549 if (pgd_present(*pud))
3550 pmd_free(mm, new);
3551 else
3552 pgd_populate(mm, pud, new);
3553#endif
3554 spin_unlock(&mm->page_table_lock);
3555 return 0;
3556}
3557#endif
3558
3559int make_pages_present(unsigned long addr, unsigned long end)
3560{
3561 int ret, len, write;
3562 struct vm_area_struct * vma;
3563
3564 vma = find_vma(current->mm, addr);
3565 if (!vma)
3566 return -ENOMEM;
3567
3568
3569
3570
3571
3572 write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
3573 BUG_ON(addr >= end);
3574 BUG_ON(end > vma->vm_end);
3575 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
3576 ret = get_user_pages(current, current->mm, addr,
3577 len, write, 0, NULL, NULL);
3578 if (ret < 0)
3579 return ret;
3580 return ret == len ? 0 : -EFAULT;
3581}
3582
3583#if !defined(__HAVE_ARCH_GATE_AREA)
3584
3585#if defined(AT_SYSINFO_EHDR)
3586static struct vm_area_struct gate_vma;
3587
3588static int __init gate_vma_init(void)
3589{
3590 gate_vma.vm_mm = NULL;
3591 gate_vma.vm_start = FIXADDR_USER_START;
3592 gate_vma.vm_end = FIXADDR_USER_END;
3593 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
3594 gate_vma.vm_page_prot = __P101;
3595
3596
3597
3598
3599
3600
3601 gate_vma.vm_flags |= VM_ALWAYSDUMP;
3602 return 0;
3603}
3604__initcall(gate_vma_init);
3605#endif
3606
3607struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
3608{
3609#ifdef AT_SYSINFO_EHDR
3610 return &gate_vma;
3611#else
3612 return NULL;
3613#endif
3614}
3615
/*
 * Return 1 if @addr lies in [FIXADDR_USER_START, FIXADDR_USER_END), else 0.
 * Always 0 when AT_SYSINFO_EHDR is not defined.
 */
int in_gate_area_no_mm(unsigned long addr)
{
	int inside = 0;

#ifdef AT_SYSINFO_EHDR
	inside = (addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END);
#endif
	return inside;
}
3624
3625#endif
3626
/*
 * Walk the page tables of @mm down to the pte that maps @address.
 *
 * On success returns 0 with *ptepp pointing at the mapped pte and *ptlp
 * holding its lock; the caller must release both with pte_unmap_unlock().
 * Returns -EINVAL, with nothing held, if any level is missing or bad, if
 * the pmd is a huge mapping, or if the pte is not present.
 */
static int __follow_pte(struct mm_struct *mm, unsigned long address,
		pte_t **ptepp, spinlock_t **ptlp)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep;

	pgd = pgd_offset(mm, address);
	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
		goto out;

	pud = pud_offset(pgd, address);
	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
		goto out;

	pmd = pmd_offset(pud, address);
	/* transparent huge pmds are not expected on this path */
	VM_BUG_ON(pmd_trans_huge(*pmd));
	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
		goto out;

	/* a huge pmd has no pte level to return */
	if (pmd_huge(*pmd))
		goto out;

	ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
	if (!ptep)
		goto out;
	if (!pte_present(*ptep))
		goto unlock;
	*ptepp = ptep;
	return 0;
unlock:
	pte_unmap_unlock(ptep, *ptlp);
out:
	return -EINVAL;
}
3664
/*
 * Wrapper around __follow_pte() that returns the same result.  The call is
 * wrapped in __cond_lock() with the condition "__follow_pte() returned 0",
 * i.e. the case in which *ptlp is handed back to the caller.
 */
static inline int follow_pte(struct mm_struct *mm, unsigned long address,
			     pte_t **ptepp, spinlock_t **ptlp)
{
	int res;

	/* (void) discards the value of the __cond_lock() expression */
	(void) __cond_lock(*ptlp,
			   !(res = __follow_pte(mm, address, ptepp, ptlp)));
	return res;
}
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686int follow_pfn(struct vm_area_struct *vma, unsigned long address,
3687 unsigned long *pfn)
3688{
3689 int ret = -EINVAL;
3690 spinlock_t *ptl;
3691 pte_t *ptep;
3692
3693 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3694 return ret;
3695
3696 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
3697 if (ret)
3698 return ret;
3699 *pfn = pte_pfn(*ptep);
3700 pte_unmap_unlock(ptep, ptl);
3701 return 0;
3702}
3703EXPORT_SYMBOL(follow_pfn);
3704
3705#ifdef CONFIG_HAVE_IOREMAP_PROT
3706int follow_phys(struct vm_area_struct *vma,
3707 unsigned long address, unsigned int flags,
3708 unsigned long *prot, resource_size_t *phys)
3709{
3710 int ret = -EINVAL;
3711 pte_t *ptep, pte;
3712 spinlock_t *ptl;
3713
3714 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3715 goto out;
3716
3717 if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
3718 goto out;
3719 pte = *ptep;
3720
3721 if ((flags & FOLL_WRITE) && !pte_write(pte))
3722 goto unlock;
3723
3724 *prot = pgprot_val(pte_pgprot(pte));
3725 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
3726
3727 ret = 0;
3728unlock:
3729 pte_unmap_unlock(ptep, ptl);
3730out:
3731 return ret;
3732}
3733
3734int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
3735 void *buf, int len, int write)
3736{
3737 resource_size_t phys_addr;
3738 unsigned long prot = 0;
3739 void __iomem *maddr;
3740 int offset = addr & (PAGE_SIZE-1);
3741
3742 if (follow_phys(vma, addr, write, &prot, &phys_addr))
3743 return -EINVAL;
3744
3745 maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
3746 if (write)
3747 memcpy_toio(maddr + offset, buf, len);
3748 else
3749 memcpy_fromio(buf, maddr + offset, len);
3750 iounmap(maddr);
3751
3752 return len;
3753}
3754#endif
3755
3756
3757
3758
3759
/*
 * Copy @len bytes between @buf and the address space @mm starting at
 * @addr, one page at a time, with mm->mmap_sem held for reading.
 * @write selects the direction (non-zero: @buf -> @mm).
 *
 * Returns the number of bytes transferred, which is less than @len if a
 * page could not be accessed.
 */
static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
		unsigned long addr, void *buf, int len, int write)
{
	struct vm_area_struct *vma;
	void *old_buf = buf;

	down_read(&mm->mmap_sem);
	/* transfer page by page; stop at the first page that cannot be accessed */
	while (len) {
		int bytes, ret, offset;
		void *maddr;
		struct page *page = NULL;

		ret = get_user_pages(tsk, mm, addr, 1,
				write, 1, &page, &vma);
		if (ret <= 0) {
			/*
			 * No page was obtained.  With CONFIG_HAVE_IOREMAP_PROT
			 * fall back to the vma's ->access method, if it has
			 * one.  Without that option the "break" below is
			 * unconditional.
			 */
#ifdef CONFIG_HAVE_IOREMAP_PROT
			vma = find_vma(mm, addr);
			if (!vma || vma->vm_start > addr)
				break;
			if (vma->vm_ops && vma->vm_ops->access)
				ret = vma->vm_ops->access(vma, addr, buf,
							  len, write);
			if (ret <= 0)
#endif
				break;
			/* ->access reported how many bytes it handled */
			bytes = ret;
		} else {
			/* limit this step to the remainder of the current page */
			bytes = len;
			offset = addr & (PAGE_SIZE-1);
			if (bytes > PAGE_SIZE-offset)
				bytes = PAGE_SIZE-offset;

			maddr = kmap(page);
			if (write) {
				copy_to_user_page(vma, page, addr,
						  maddr + offset, buf, bytes);
				set_page_dirty_lock(page);
			} else {
				copy_from_user_page(vma, page, addr,
						    buf, maddr + offset, bytes);
			}
			kunmap(page);
			page_cache_release(page);
		}
		len -= bytes;
		buf += bytes;
		addr += bytes;
	}
	up_read(&mm->mmap_sem);

	return buf - old_buf;
}
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828int access_remote_vm(struct mm_struct *mm, unsigned long addr,
3829 void *buf, int len, int write)
3830{
3831 return __access_remote_vm(NULL, mm, addr, buf, len, write);
3832}
3833
3834
3835
3836
3837
3838
/*
 * Access the address space of task @tsk.  A reference on the task's mm is
 * held for the duration of the access.
 *
 * Returns the number of bytes transferred, or 0 if get_task_mm() returns
 * no mm for the task.
 */
int access_process_vm(struct task_struct *tsk, unsigned long addr,
		void *buf, int len, int write)
{
	struct mm_struct *mm = get_task_mm(tsk);
	int copied = 0;

	if (mm) {
		copied = __access_remote_vm(tsk, mm, addr, buf, len, write);
		mmput(mm);
	}

	return copied;
}
3854
3855
3856
3857
3858void print_vma_addr(char *prefix, unsigned long ip)
3859{
3860 struct mm_struct *mm = current->mm;
3861 struct vm_area_struct *vma;
3862
3863
3864
3865
3866
3867 if (preempt_count())
3868 return;
3869
3870 down_read(&mm->mmap_sem);
3871 vma = find_vma(mm, ip);
3872 if (vma && vma->vm_file) {
3873 struct file *f = vma->vm_file;
3874 char *buf = (char *)__get_free_page(GFP_KERNEL);
3875 if (buf) {
3876 char *p, *s;
3877
3878 p = d_path(&f->f_path, buf, PAGE_SIZE);
3879 if (IS_ERR(p))
3880 p = "?";
3881 s = strrchr(p, '/');
3882 if (s)
3883 p = s+1;
3884 printk("%s%s[%lx+%lx]", prefix, p,
3885 vma->vm_start,
3886 vma->vm_end - vma->vm_start);
3887 free_page((unsigned long)buf);
3888 }
3889 }
3890 up_read(¤t->mm->mmap_sem);
3891}
3892
3893#ifdef CONFIG_PROVE_LOCKING
3894void might_fault(void)
3895{
3896
3897
3898
3899
3900
3901
3902 if (segment_eq(get_fs(), KERNEL_DS))
3903 return;
3904
3905 might_sleep();
3906
3907
3908
3909
3910
3911 if (!in_atomic() && current->mm)
3912 might_lock_read(¤t->mm->mmap_sem);
3913}
3914EXPORT_SYMBOL(might_fault);
3915#endif
3916
3917#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
3918static void clear_gigantic_page(struct page *page,
3919 unsigned long addr,
3920 unsigned int pages_per_huge_page)
3921{
3922 int i;
3923 struct page *p = page;
3924
3925 might_sleep();
3926 for (i = 0; i < pages_per_huge_page;
3927 i++, p = mem_map_next(p, page, i)) {
3928 cond_resched();
3929 clear_user_highpage(p, addr + i * PAGE_SIZE);
3930 }
3931}
3932void clear_huge_page(struct page *page,
3933 unsigned long addr, unsigned int pages_per_huge_page)
3934{
3935 int i;
3936
3937 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
3938 clear_gigantic_page(page, addr, pages_per_huge_page);
3939 return;
3940 }
3941
3942 might_sleep();
3943 for (i = 0; i < pages_per_huge_page; i++) {
3944 cond_resched();
3945 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
3946 }
3947}
3948
3949static void copy_user_gigantic_page(struct page *dst, struct page *src,
3950 unsigned long addr,
3951 struct vm_area_struct *vma,
3952 unsigned int pages_per_huge_page)
3953{
3954 int i;
3955 struct page *dst_base = dst;
3956 struct page *src_base = src;
3957
3958 for (i = 0; i < pages_per_huge_page; ) {
3959 cond_resched();
3960 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
3961
3962 i++;
3963 dst = mem_map_next(dst, dst_base, i);
3964 src = mem_map_next(src, src_base, i);
3965 }
3966}
3967
3968void copy_user_huge_page(struct page *dst, struct page *src,
3969 unsigned long addr, struct vm_area_struct *vma,
3970 unsigned int pages_per_huge_page)
3971{
3972 int i;
3973
3974 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
3975 copy_user_gigantic_page(dst, src, addr, vma,
3976 pages_per_huge_page);
3977 return;
3978 }
3979
3980 might_sleep();
3981 for (i = 0; i < pages_per_huge_page; i++) {
3982 cond_resched();
3983 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
3984 }
3985}
3986#endif
3987