1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38#include <linux/kernel.h>
39#include <linux/mm.h>
40#include <linux/page-flags.h>
41#include <linux/kernel-page-flags.h>
42#include <linux/sched.h>
43#include <linux/ksm.h>
44#include <linux/rmap.h>
45#include <linux/export.h>
46#include <linux/pagemap.h>
47#include <linux/swap.h>
48#include <linux/backing-dev.h>
49#include <linux/migrate.h>
50#include <linux/page-isolation.h>
51#include <linux/suspend.h>
52#include <linux/slab.h>
53#include <linux/swapops.h>
54#include <linux/hugetlb.h>
55#include <linux/memory_hotplug.h>
56#include <linux/mm_inline.h>
57#include <linux/kfifo.h>
58#include "internal.h"
59
/*
 * Default early-kill policy: task_early_kill() returns this value for
 * tasks that have not set PF_MCE_PROCESS.
 */
int sysctl_memory_failure_early_kill __read_mostly = 0;

/* When zero, __memory_failure() panics instead of attempting recovery. */
int sysctl_memory_failure_recovery __read_mostly = 1;

/* Count of pages accounted as hardware-poisoned by the code in this file. */
atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
65
66#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
67
/*
 * Filter knobs read by hwpoison_filter() and its helpers.
 *
 * hwpoison_filter_enable:     master switch; zero disables all filtering.
 * hwpoison_filter_dev_major/
 * hwpoison_filter_dev_minor:  device number to match; ~0U acts as a wildcard.
 * hwpoison_filter_flags_mask/
 * hwpoison_filter_flags_value: a page passes when
 *                             (stable_page_flags(p) & mask) == value;
 *                             a zero mask disables the flag check.
 */
u32 hwpoison_filter_enable = 0;
u32 hwpoison_filter_dev_major = ~0U;
u32 hwpoison_filter_dev_minor = ~0U;
u64 hwpoison_filter_flags_mask;
u64 hwpoison_filter_flags_value;
EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
78
79static int hwpoison_filter_dev(struct page *p)
80{
81 struct address_space *mapping;
82 dev_t dev;
83
84 if (hwpoison_filter_dev_major == ~0U &&
85 hwpoison_filter_dev_minor == ~0U)
86 return 0;
87
88
89
90
91 if (PageSlab(p))
92 return -EINVAL;
93
94 mapping = page_mapping(p);
95 if (mapping == NULL || mapping->host == NULL)
96 return -EINVAL;
97
98 dev = mapping->host->i_sb->s_dev;
99 if (hwpoison_filter_dev_major != ~0U &&
100 hwpoison_filter_dev_major != MAJOR(dev))
101 return -EINVAL;
102 if (hwpoison_filter_dev_minor != ~0U &&
103 hwpoison_filter_dev_minor != MINOR(dev))
104 return -EINVAL;
105
106 return 0;
107}
108
109static int hwpoison_filter_flags(struct page *p)
110{
111 if (!hwpoison_filter_flags_mask)
112 return 0;
113
114 if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
115 hwpoison_filter_flags_value)
116 return 0;
117 else
118 return -EINVAL;
119}
120
121
122
123
124
125
126
127
128
129
130
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/* Inode number of the memory cgroup to match; zero disables this filter. */
u64 hwpoison_filter_memcg;
EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);

/*
 * Memory-cgroup filter: accept the page (return 0) only when it is charged
 * to the cgroup whose dentry inode number equals hwpoison_filter_memcg.
 * Returns -EINVAL when the page has no cgroup, the cgroup has no dentry, or
 * the inode number differs.
 */
static int hwpoison_filter_task(struct page *p)
{
	struct mem_cgroup *mem;
	struct cgroup_subsys_state *css;
	unsigned long ino;

	if (!hwpoison_filter_memcg)
		return 0;

	mem = try_get_mem_cgroup_from_page(p);
	if (!mem)
		return -EINVAL;

	css = mem_cgroup_css(mem);

	if (!css->cgroup->dentry) {
		/*
		 * Drop the css reference on this exit as well; the success
		 * path below does so, and returning without it leaks the
		 * reference.
		 */
		css_put(css);
		return -EINVAL;
	}

	ino = css->cgroup->dentry->d_inode->i_ino;
	css_put(css);

	if (ino != hwpoison_filter_memcg)
		return -EINVAL;

	return 0;
}
#else
/* Without the memcg swap controller there is nothing to filter on. */
static int hwpoison_filter_task(struct page *p) { return 0; }
#endif
163
164int hwpoison_filter(struct page *p)
165{
166 if (!hwpoison_filter_enable)
167 return 0;
168
169 if (hwpoison_filter_dev(p))
170 return -EINVAL;
171
172 if (hwpoison_filter_flags(p))
173 return -EINVAL;
174
175 if (hwpoison_filter_task(p))
176 return -EINVAL;
177
178 return 0;
179}
180#else
/*
 * Stub used when the hwpoison injector is not configured: no page is ever
 * filtered out.
 */
int hwpoison_filter(struct page *p)
{
	return 0;
}
185#endif
186
/* Exported for whichever hwpoison_filter() variant was compiled above. */
EXPORT_SYMBOL_GPL(hwpoison_filter);
188
189
190
191
192
193static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
194 unsigned long pfn, struct page *page)
195{
196 struct siginfo si;
197 int ret;
198
199 printk(KERN_ERR
200 "MCE %#lx: Killing %s:%d early due to hardware memory corruption\n",
201 pfn, t->comm, t->pid);
202 si.si_signo = SIGBUS;
203 si.si_errno = 0;
204 si.si_code = BUS_MCEERR_AO;
205 si.si_addr = (void *)addr;
206#ifdef __ARCH_SI_TRAPNO
207 si.si_trapno = trapno;
208#endif
209 si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT;
210
211
212
213
214
215
216 ret = send_sig_info(SIGBUS, &si, t);
217 if (ret < 0)
218 printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
219 t->comm, t->pid, ret);
220 return ret;
221}
222
223
224
225
226
227void shake_page(struct page *p, int access)
228{
229 if (!PageSlab(p)) {
230 lru_add_drain_all();
231 if (PageLRU(p))
232 return;
233 drain_all_pages();
234 if (PageLRU(p) || is_free_buddy_page(p))
235 return;
236 }
237
238
239
240
241
242 if (access) {
243 int nr;
244 do {
245 struct shrink_control shrink = {
246 .gfp_mask = GFP_KERNEL,
247 };
248
249 nr = shrink_slab(&shrink, 1000, 1000);
250 if (page_count(p) == 1)
251 break;
252 } while (nr > 10);
253 }
254}
255EXPORT_SYMBOL_GPL(shake_page);
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
/*
 * One entry per task that maps the corrupted page; built by add_to_kill()
 * and consumed (and freed) by kill_procs_ao().
 */
struct to_kill {
	struct list_head nd;		/* link in the caller's kill list */
	struct task_struct *tsk;	/* task to signal; a reference is held */
	unsigned long addr;		/* result of page_address_in_vma() */
	char addr_valid;		/* 0 when addr was -EFAULT, else 1 */
};
285
286
287
288
289
290
291
292
293
294
295
296static void add_to_kill(struct task_struct *tsk, struct page *p,
297 struct vm_area_struct *vma,
298 struct list_head *to_kill,
299 struct to_kill **tkc)
300{
301 struct to_kill *tk;
302
303 if (*tkc) {
304 tk = *tkc;
305 *tkc = NULL;
306 } else {
307 tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
308 if (!tk) {
309 printk(KERN_ERR
310 "MCE: Out of memory while machine check handling\n");
311 return;
312 }
313 }
314 tk->addr = page_address_in_vma(p, vma);
315 tk->addr_valid = 1;
316
317
318
319
320
321
322
323 if (tk->addr == -EFAULT) {
324 pr_info("MCE: Unable to find user space address %lx in %s\n",
325 page_to_pfn(p), tsk->comm);
326 tk->addr_valid = 0;
327 }
328 get_task_struct(tsk);
329 tk->tsk = tsk;
330 list_add_tail(&tk->nd, to_kill);
331}
332
333
334
335
336
337
338
339
340
341static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
342 int fail, struct page *page, unsigned long pfn)
343{
344 struct to_kill *tk, *next;
345
346 list_for_each_entry_safe (tk, next, to_kill, nd) {
347 if (doit) {
348
349
350
351
352
353 if (fail || tk->addr_valid == 0) {
354 printk(KERN_ERR
355 "MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
356 pfn, tk->tsk->comm, tk->tsk->pid);
357 force_sig(SIGKILL, tk->tsk);
358 }
359
360
361
362
363
364
365
366 else if (kill_proc_ao(tk->tsk, tk->addr, trapno,
367 pfn, page) < 0)
368 printk(KERN_ERR
369 "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
370 pfn, tk->tsk->comm, tk->tsk->pid);
371 }
372 put_task_struct(tk->tsk);
373 kfree(tk);
374 }
375}
376
377static int task_early_kill(struct task_struct *tsk)
378{
379 if (!tsk->mm)
380 return 0;
381 if (tsk->flags & PF_MCE_PROCESS)
382 return !!(tsk->flags & PF_MCE_EARLY);
383 return sysctl_memory_failure_early_kill;
384}
385
386
387
388
389static void collect_procs_anon(struct page *page, struct list_head *to_kill,
390 struct to_kill **tkc)
391{
392 struct vm_area_struct *vma;
393 struct task_struct *tsk;
394 struct anon_vma *av;
395
396 av = page_lock_anon_vma(page);
397 if (av == NULL)
398 return;
399
400 read_lock(&tasklist_lock);
401 for_each_process (tsk) {
402 struct anon_vma_chain *vmac;
403
404 if (!task_early_kill(tsk))
405 continue;
406 list_for_each_entry(vmac, &av->head, same_anon_vma) {
407 vma = vmac->vma;
408 if (!page_mapped_in_vma(page, vma))
409 continue;
410 if (vma->vm_mm == tsk->mm)
411 add_to_kill(tsk, page, vma, to_kill, tkc);
412 }
413 }
414 read_unlock(&tasklist_lock);
415 page_unlock_anon_vma(av);
416}
417
418
419
420
421static void collect_procs_file(struct page *page, struct list_head *to_kill,
422 struct to_kill **tkc)
423{
424 struct vm_area_struct *vma;
425 struct task_struct *tsk;
426 struct prio_tree_iter iter;
427 struct address_space *mapping = page->mapping;
428
429 mutex_lock(&mapping->i_mmap_mutex);
430 read_lock(&tasklist_lock);
431 for_each_process(tsk) {
432 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
433
434 if (!task_early_kill(tsk))
435 continue;
436
437 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff,
438 pgoff) {
439
440
441
442
443
444
445
446 if (vma->vm_mm == tsk->mm)
447 add_to_kill(tsk, page, vma, to_kill, tkc);
448 }
449 }
450 read_unlock(&tasklist_lock);
451 mutex_unlock(&mapping->i_mmap_mutex);
452}
453
454
455
456
457
458
459
460static void collect_procs(struct page *page, struct list_head *tokill)
461{
462 struct to_kill *tk;
463
464 if (!page->mapping)
465 return;
466
467 tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
468 if (!tk)
469 return;
470 if (PageAnon(page))
471 collect_procs_anon(page, tokill, &tk);
472 else
473 collect_procs_file(page, tokill, &tk);
474 kfree(tk);
475}
476
477
478
479
480
/* Result codes returned by the me_*() page-state handlers. */
enum outcome {
	IGNORED,	/* nothing was done for this page */
	FAILED,		/* handling was attempted and did not succeed */
	DELAYED,	/* reported as success by page_action(), like RECOVERED */
	RECOVERED,	/* handler completed successfully */
};

/* Human-readable names for enum outcome, indexed by the enum value. */
static const char *action_name[] = {
	[IGNORED] = "Ignored",
	[FAILED] = "Failed",
	[DELAYED] = "Delayed",
	[RECOVERED] = "Recovered",
};
494
495
496
497
498
499
500
501static int delete_from_lru_cache(struct page *p)
502{
503 if (!isolate_lru_page(p)) {
504
505
506
507
508 ClearPageActive(p);
509 ClearPageUnevictable(p);
510
511
512
513 page_cache_release(p);
514 return 0;
515 }
516 return -EIO;
517}
518
519
520
521
522
523
/*
 * Handler for page states mapped to it in error_states[] ("reserved kernel"
 * and "kernel slab"): no action is taken and IGNORED is returned.
 */
static int me_kernel(struct page *p, unsigned long pfn)
{
	return IGNORED;
}
528
529
530
531
/*
 * Catch-all handler for the final error_states[] entry: log that the page
 * state is unknown and report FAILED.
 */
static int me_unknown(struct page *p, unsigned long pfn)
{
	printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
	return FAILED;
}
537
538
539
540
541static int me_pagecache_clean(struct page *p, unsigned long pfn)
542{
543 int err;
544 int ret = FAILED;
545 struct address_space *mapping;
546
547 delete_from_lru_cache(p);
548
549
550
551
552
553 if (PageAnon(p))
554 return RECOVERED;
555
556
557
558
559
560
561
562
563 mapping = page_mapping(p);
564 if (!mapping) {
565
566
567
568 return FAILED;
569 }
570
571
572
573
574
575
576 if (mapping->a_ops->error_remove_page) {
577 err = mapping->a_ops->error_remove_page(mapping, p);
578 if (err != 0) {
579 printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n",
580 pfn, err);
581 } else if (page_has_private(p) &&
582 !try_to_release_page(p, GFP_NOIO)) {
583 pr_info("MCE %#lx: failed to release buffers\n", pfn);
584 } else {
585 ret = RECOVERED;
586 }
587 } else {
588
589
590
591
592 if (invalidate_inode_page(p))
593 ret = RECOVERED;
594 else
595 printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
596 pfn);
597 }
598 return ret;
599}
600
601
602
603
604
605
606static int me_pagecache_dirty(struct page *p, unsigned long pfn)
607{
608 struct address_space *mapping = page_mapping(p);
609
610 SetPageError(p);
611
612 if (mapping) {
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647 mapping_set_error(mapping, EIO);
648 }
649
650 return me_pagecache_clean(p, pfn);
651}
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672static int me_swapcache_dirty(struct page *p, unsigned long pfn)
673{
674 ClearPageDirty(p);
675
676 ClearPageUptodate(p);
677
678 if (!delete_from_lru_cache(p))
679 return DELAYED;
680 else
681 return FAILED;
682}
683
684static int me_swapcache_clean(struct page *p, unsigned long pfn)
685{
686 delete_from_swap_cache(p);
687
688 if (!delete_from_lru_cache(p))
689 return RECOVERED;
690 else
691 return FAILED;
692}
693
694
695
696
697
698
699
700static int me_huge_page(struct page *p, unsigned long pfn)
701{
702 int res = 0;
703 struct page *hpage = compound_head(p);
704
705
706
707
708
709
710
711
712
713
714 if (!(page_mapping(hpage) || PageAnon(hpage))) {
715 res = dequeue_hwpoisoned_huge_page(hpage);
716 if (!res)
717 return RECOVERED;
718 }
719 return DELAYED;
720}
721
722
723
724
725
726
727
728
729
730
731
732
733
734
/*
 * Short-hand bit masks for page flags, used only to keep the table below
 * readable; all of them are #undef'd again right after it.
 */
#define dirty		(1UL << PG_dirty)
#define sc		(1UL << PG_swapcache)
#define unevict		(1UL << PG_unevictable)
#define mlock		(1UL << PG_mlocked)
#define writeback	(1UL << PG_writeback)
#define lru		(1UL << PG_lru)
#define swapbacked	(1UL << PG_swapbacked)
#define head		(1UL << PG_head)
#define tail		(1UL << PG_tail)
#define compound	(1UL << PG_compound)
#define slab		(1UL << PG_slab)
#define reserved	(1UL << PG_reserved)

/*
 * Page-state dispatch table.  __memory_failure() scans it from the top and
 * runs the action of the first entry for which
 *	(page->flags & mask) == res
 * so the ORDER OF ENTRIES MATTERS.  The final { 0, 0 } entry matches every
 * page and terminates the scan.
 */
static struct page_state {
	unsigned long mask;	/* flag bits to examine */
	unsigned long res;	/* required value of those bits */
	char *msg;		/* state name used in log messages */
	int (*action)(struct page *p, unsigned long pfn); /* returns enum outcome */
} error_states[] = {
	{ reserved,	reserved,	"reserved kernel",	me_kernel },

	{ slab,		slab,		"kernel slab",	me_kernel },

#ifdef CONFIG_PAGEFLAGS_EXTENDED
	{ head,		head,		"huge",		me_huge_page },
	{ tail,		tail,		"huge",		me_huge_page },
#else
	{ compound,	compound,	"huge",		me_huge_page },
#endif

	/* Dirty variants come first so the clean entry only sees clean pages. */
	{ sc|dirty,	sc|dirty,	"swapcache",	me_swapcache_dirty },
	{ sc|dirty,	sc,		"swapcache",	me_swapcache_clean },

	{ unevict|dirty, unevict|dirty,	"unevictable LRU", me_pagecache_dirty},
	{ unevict,	unevict,	"unevictable LRU", me_pagecache_clean},

	{ mlock|dirty,	mlock|dirty,	"mlocked LRU",	me_pagecache_dirty },
	{ mlock,	mlock,		"mlocked LRU",	me_pagecache_clean },

	{ lru|dirty,	lru|dirty,	"LRU",		me_pagecache_dirty },
	{ lru|dirty,	lru,		"clean LRU",	me_pagecache_clean },

	/* Catch-all: must stay last. */
	{ 0,		0,		"unknown page state",	me_unknown },
};

#undef dirty
#undef sc
#undef unevict
#undef mlock
#undef writeback
#undef lru
#undef swapbacked
#undef head
#undef tail
#undef compound
#undef slab
#undef reserved
804
805static void action_result(unsigned long pfn, char *msg, int result)
806{
807 struct page *page = pfn_to_page(pfn);
808
809 printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
810 pfn,
811 PageDirty(page) ? "dirty " : "",
812 msg, action_name[result]);
813}
814
815static int page_action(struct page_state *ps, struct page *p,
816 unsigned long pfn)
817{
818 int result;
819 int count;
820
821 result = ps->action(p, pfn);
822 action_result(pfn, ps->msg, result);
823
824 count = page_count(p) - 1;
825 if (ps->action == me_swapcache_dirty && result == DELAYED)
826 count--;
827 if (count != 0) {
828 printk(KERN_ERR
829 "MCE %#lx: %s page still referenced by %d users\n",
830 pfn, ps->msg, count);
831 result = FAILED;
832 }
833
834
835
836
837
838
839 return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
840}
841
842
843
844
845
/*
 * Unmap the corrupted page @p from all user address spaces and signal the
 * tasks that were using it.
 *
 * Returns SWAP_SUCCESS when nothing needed unmapping or the unmap worked,
 * SWAP_FAIL for KSM pages and for THPs that could not be split, otherwise
 * the result of try_to_unmap().
 */
static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
				  int trapno)
{
	enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
	struct address_space *mapping;
	LIST_HEAD(tokill);
	int ret;
	int kill = 1;	/* cleared below when the page turns out to be clean */
	struct page *hpage = compound_head(p);
	struct page *ppage;

	/* Reserved and slab pages are reported as success without unmapping. */
	if (PageReserved(p) || PageSlab(p))
		return SWAP_SUCCESS;

	/* Not mapped anywhere: nothing to unmap. */
	if (!page_mapped(hpage))
		return SWAP_SUCCESS;

	/* KSM pages are not handled here. */
	if (PageKsm(p))
		return SWAP_FAIL;

	if (PageSwapCache(p)) {
		printk(KERN_ERR
		       "MCE %#lx: keeping poisoned page in swap cache\n", pfn);
		ttu |= TTU_IGNORE_HWPOISON;
	}

	/*
	 * For a page not marked dirty whose mapping supports dirty writeback,
	 * ask page_mkclean(): a non-zero result marks the page dirty after
	 * all; a zero result means the page is clean, so nobody is killed
	 * and the unmap is done with TTU_IGNORE_HWPOISON.
	 */
	mapping = page_mapping(hpage);
	if (!PageDirty(hpage) && mapping &&
	    mapping_cap_writeback_dirty(mapping)) {
		if (page_mkclean(hpage)) {
			SetPageDirty(hpage);
		} else {
			kill = 0;
			ttu |= TTU_IGNORE_HWPOISON;
			printk(KERN_INFO
	"MCE %#lx: corrupted page was clean: dropped without side effects\n",
				pfn);
		}
	}

	/*
	 * ppage is the page that actually gets unmapped: the head page by
	 * default, or @p itself once an anonymous THP has been split.
	 */
	ppage = hpage;

	if (PageTransHuge(hpage)) {
		if (!PageHuge(hpage) && PageAnon(hpage)) {
			if (unlikely(split_huge_page(hpage))) {
				printk(KERN_INFO
					"MCE %#lx: failed to split THP\n", pfn);

				BUG_ON(!PageHWPoison(p));
				return SWAP_FAIL;
			}

			ppage = p;
		}
	}

	/* Gather the victims before the mappings disappear. */
	if (kill)
		collect_procs(ppage, &tokill);

	/* ppage differs from hpage only after a split; lock it around the unmap. */
	if (hpage != ppage)
		lock_page(ppage);

	ret = try_to_unmap(ppage, ttu);
	if (ret != SWAP_SUCCESS)
		printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
				pfn, page_mapcount(ppage));

	if (hpage != ppage)
		unlock_page(ppage);

	/*
	 * Signals are only sent when the page is dirty (doit); a failed
	 * unmap makes kill_procs_ao() use SIGKILL.  The list entries are
	 * freed either way.
	 */
	kill_procs_ao(&tokill, !!PageDirty(ppage), trapno,
		      ret != SWAP_SUCCESS, p, pfn);

	return ret;
}
970
971static void set_page_hwpoison_huge_page(struct page *hpage)
972{
973 int i;
974 int nr_pages = 1 << compound_trans_order(hpage);
975 for (i = 0; i < nr_pages; i++)
976 SetPageHWPoison(hpage + i);
977}
978
979static void clear_page_hwpoison_huge_page(struct page *hpage)
980{
981 int i;
982 int nr_pages = 1 << compound_trans_order(hpage);
983 for (i = 0; i < nr_pages; i++)
984 ClearPageHWPoison(hpage + i);
985}
986
987int __memory_failure(unsigned long pfn, int trapno, int flags)
988{
989 struct page_state *ps;
990 struct page *p;
991 struct page *hpage;
992 int res;
993 unsigned int nr_pages;
994
995 if (!sysctl_memory_failure_recovery)
996 panic("Memory failure from trap %d on page %lx", trapno, pfn);
997
998 if (!pfn_valid(pfn)) {
999 printk(KERN_ERR
1000 "MCE %#lx: memory outside kernel control\n",
1001 pfn);
1002 return -ENXIO;
1003 }
1004
1005 p = pfn_to_page(pfn);
1006 hpage = compound_head(p);
1007 if (TestSetPageHWPoison(p)) {
1008 printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
1009 return 0;
1010 }
1011
1012 nr_pages = 1 << compound_trans_order(hpage);
1013 atomic_long_add(nr_pages, &mce_bad_pages);
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029 if (!(flags & MF_COUNT_INCREASED) &&
1030 !get_page_unless_zero(hpage)) {
1031 if (is_free_buddy_page(p)) {
1032 action_result(pfn, "free buddy", DELAYED);
1033 return 0;
1034 } else if (PageHuge(hpage)) {
1035
1036
1037
1038
1039 lock_page(hpage);
1040 if (!PageHWPoison(hpage)
1041 || (hwpoison_filter(p) && TestClearPageHWPoison(p))
1042 || (p != hpage && TestSetPageHWPoison(hpage))) {
1043 atomic_long_sub(nr_pages, &mce_bad_pages);
1044 return 0;
1045 }
1046 set_page_hwpoison_huge_page(hpage);
1047 res = dequeue_hwpoisoned_huge_page(hpage);
1048 action_result(pfn, "free huge",
1049 res ? IGNORED : DELAYED);
1050 unlock_page(hpage);
1051 return res;
1052 } else {
1053 action_result(pfn, "high order kernel", IGNORED);
1054 return -EBUSY;
1055 }
1056 }
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066 if (!PageHuge(p) && !PageTransCompound(p)) {
1067 if (!PageLRU(p))
1068 shake_page(p, 0);
1069 if (!PageLRU(p)) {
1070
1071
1072
1073 if (is_free_buddy_page(p)) {
1074 action_result(pfn, "free buddy, 2nd try",
1075 DELAYED);
1076 return 0;
1077 }
1078 action_result(pfn, "non LRU", IGNORED);
1079 put_page(p);
1080 return -EBUSY;
1081 }
1082 }
1083
1084
1085
1086
1087
1088
1089 lock_page(hpage);
1090
1091
1092
1093
1094 if (!PageHWPoison(p)) {
1095 printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
1096 res = 0;
1097 goto out;
1098 }
1099 if (hwpoison_filter(p)) {
1100 if (TestClearPageHWPoison(p))
1101 atomic_long_sub(nr_pages, &mce_bad_pages);
1102 unlock_page(hpage);
1103 put_page(hpage);
1104 return 0;
1105 }
1106
1107
1108
1109
1110
1111 if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
1112 action_result(pfn, "hugepage already hardware poisoned",
1113 IGNORED);
1114 unlock_page(hpage);
1115 put_page(hpage);
1116 return 0;
1117 }
1118
1119
1120
1121
1122
1123
1124 if (PageHuge(p))
1125 set_page_hwpoison_huge_page(hpage);
1126
1127 wait_on_page_writeback(p);
1128
1129
1130
1131
1132
1133 if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) {
1134 printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
1135 res = -EBUSY;
1136 goto out;
1137 }
1138
1139
1140
1141
1142 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
1143 action_result(pfn, "already truncated LRU", IGNORED);
1144 res = -EBUSY;
1145 goto out;
1146 }
1147
1148 res = -EBUSY;
1149 for (ps = error_states;; ps++) {
1150 if ((p->flags & ps->mask) == ps->res) {
1151 res = page_action(ps, p, pfn);
1152 break;
1153 }
1154 }
1155out:
1156 unlock_page(hpage);
1157 return res;
1158}
1159EXPORT_SYMBOL_GPL(__memory_failure);
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
/*
 * Convenience wrapper: handle a memory error on @pfn with no flags set.
 * The return value of __memory_failure() is discarded.
 */
void memory_failure(unsigned long pfn, int trapno)
{
	__memory_failure(pfn, trapno, 0);
}
1182
/* Each per-CPU queue holds 1 << 4 = 16 pending memory failure events. */
#define MEMORY_FAILURE_FIFO_ORDER	4
#define MEMORY_FAILURE_FIFO_SIZE	(1 << MEMORY_FAILURE_FIFO_ORDER)

/* One queued event: the arguments of a deferred __memory_failure() call. */
struct memory_failure_entry {
	unsigned long pfn;
	int trapno;
	int flags;
};

/* Per-CPU queue of pending events plus the work item that drains it. */
struct memory_failure_cpu {
	DECLARE_KFIFO(fifo, struct memory_failure_entry,
		      MEMORY_FAILURE_FIFO_SIZE);
	spinlock_t lock;		/* protects fifo */
	struct work_struct work;	/* runs memory_failure_work_func() */
};

static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218void memory_failure_queue(unsigned long pfn, int trapno, int flags)
1219{
1220 struct memory_failure_cpu *mf_cpu;
1221 unsigned long proc_flags;
1222 struct memory_failure_entry entry = {
1223 .pfn = pfn,
1224 .trapno = trapno,
1225 .flags = flags,
1226 };
1227
1228 mf_cpu = &get_cpu_var(memory_failure_cpu);
1229 spin_lock_irqsave(&mf_cpu->lock, proc_flags);
1230 if (kfifo_put(&mf_cpu->fifo, &entry))
1231 schedule_work_on(smp_processor_id(), &mf_cpu->work);
1232 else
1233 pr_err("Memory failure: buffer overflow when queuing memory failure at 0x%#lx\n",
1234 pfn);
1235 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
1236 put_cpu_var(memory_failure_cpu);
1237}
1238EXPORT_SYMBOL_GPL(memory_failure_queue);
1239
1240static void memory_failure_work_func(struct work_struct *work)
1241{
1242 struct memory_failure_cpu *mf_cpu;
1243 struct memory_failure_entry entry = { 0, };
1244 unsigned long proc_flags;
1245 int gotten;
1246
1247 mf_cpu = &__get_cpu_var(memory_failure_cpu);
1248 for (;;) {
1249 spin_lock_irqsave(&mf_cpu->lock, proc_flags);
1250 gotten = kfifo_get(&mf_cpu->fifo, &entry);
1251 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
1252 if (!gotten)
1253 break;
1254 __memory_failure(entry.pfn, entry.trapno, entry.flags);
1255 }
1256}
1257
1258static int __init memory_failure_init(void)
1259{
1260 struct memory_failure_cpu *mf_cpu;
1261 int cpu;
1262
1263 for_each_possible_cpu(cpu) {
1264 mf_cpu = &per_cpu(memory_failure_cpu, cpu);
1265 spin_lock_init(&mf_cpu->lock);
1266 INIT_KFIFO(mf_cpu->fifo);
1267 INIT_WORK(&mf_cpu->work, memory_failure_work_func);
1268 }
1269
1270 return 0;
1271}
1272core_initcall(memory_failure_init);
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
/*
 * Software-unpoison the page at @pfn.
 *
 * Clears the hardware-poison flag (on all base pages for a hugetlb page),
 * adjusts mce_bad_pages and drops page references as described inline.
 *
 * Returns -ENXIO for an invalid pfn, 0 in every other case, including
 * when the page was not poisoned to begin with.
 */
int unpoison_memory(unsigned long pfn)
{
	struct page *page;	/* compound head of p */
	struct page *p;		/* the page for @pfn itself */
	int freeit = 0;		/* set when the poison flag was cleared here */
	unsigned int nr_pages;

	if (!pfn_valid(pfn))
		return -ENXIO;

	p = pfn_to_page(pfn);
	page = compound_head(p);

	if (!PageHWPoison(p)) {
		pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
		return 0;
	}

	nr_pages = 1 << compound_trans_order(page);

	if (!get_page_unless_zero(page)) {
		/*
		 * Refcount is zero.  A hugetlb page in this state is left
		 * alone; for any other page the flag is cleared directly.
		 */
		if (PageHuge(page)) {
			pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
			return 0;
		}
		if (TestClearPageHWPoison(p))
			atomic_long_sub(nr_pages, &mce_bad_pages);
		pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
		return 0;
	}

	lock_page(page);
	/* Note: the flag is tested and cleared on the head page here. */
	if (TestClearPageHWPoison(page)) {
		pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
		atomic_long_sub(nr_pages, &mce_bad_pages);
		freeit = 1;
		if (PageHuge(page))
			clear_page_hwpoison_huge_page(page);
	}
	unlock_page(page);

	/* Drop the reference taken by get_page_unless_zero() above ... */
	put_page(page);
	/*
	 * ... and one more when the flag was cleared here -- presumably the
	 * reference kept when the page was poisoned; confirm against
	 * __memory_failure().
	 */
	if (freeit)
		put_page(page);

	return 0;
}
EXPORT_SYMBOL(unpoison_memory);
1346
1347static struct page *new_page(struct page *p, unsigned long private, int **x)
1348{
1349 int nid = page_to_nid(p);
1350 if (PageHuge(p))
1351 return alloc_huge_page_node(page_hstate(compound_head(p)),
1352 nid);
1353 else
1354 return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
1355}
1356
1357
1358
1359
1360
1361
1362
/*
 * Try to get a reference on page @p (via its compound head) for soft
 * offlining.
 *
 * Returns:
 *   1     a reference is held (taken here, or by the caller when
 *         MF_COUNT_INCREASED is set in @flags)
 *   0     the page was free: a free buddy page has been marked poisoned,
 *         or a free huge page was dequeued successfully
 *   < 0   -EIO for an unknown zero-refcount page, or a non-zero result of
 *         dequeue_hwpoisoned_huge_page()
 */
static int get_any_page(struct page *p, unsigned long pfn, int flags)
{
	int ret;

	if (flags & MF_COUNT_INCREASED)
		return 1;

	/* Held for the whole isolate/inspect/unisolate sequence below. */
	lock_memory_hotplug();

	/*
	 * Isolate the page's migratetype while the page is examined.
	 * NOTE(review): the return value is ignored and the isolation is
	 * undone unconditionally below -- confirm this is intended.
	 */
	set_migratetype_isolate(p);

	if (!get_page_unless_zero(compound_head(p))) {
		if (PageHuge(p)) {
			pr_info("get_any_page: %#lx free huge page\n", pfn);
			ret = dequeue_hwpoisoned_huge_page(compound_head(p));
		} else if (is_free_buddy_page(p)) {
			pr_info("get_any_page: %#lx free buddy page\n", pfn);
			/* Mark the free page poisoned right away. */
			SetPageHWPoison(p);
			ret = 0;
		} else {
			pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n",
				pfn, p->flags);
			ret = -EIO;
		}
	} else {
		/* Reference taken successfully. */
		ret = 1;
	}
	unset_migratetype_isolate(p);
	unlock_memory_hotplug();
	return ret;
}
1407
/*
 * Soft-offline a hugetlb page: migrate its contents away, then mark every
 * base page poisoned and dequeue the huge page.
 *
 * Returns 0 on success, -EBUSY if the huge page is already poisoned, a
 * negative value from get_any_page(), or a negative value when migration
 * fails (positive migration results are mapped to -EIO).
 */
static int soft_offline_huge_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);
	struct page *hpage = compound_head(page);
	LIST_HEAD(pagelist);

	ret = get_any_page(page, pfn, flags);
	if (ret < 0)
		return ret;
	/* ret == 0: the page was free, no migration needed. */
	if (ret == 0)
		goto done;

	if (PageHWPoison(hpage)) {
		put_page(hpage);
		pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
		return -EBUSY;
	}

	/* Migrate the huge page away; new_page() allocates the target. */
	list_add(&hpage->lru, &pagelist);
	ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0,
				true);
	if (ret) {
		struct page *page1, *page2;
		/* Drop the reference of every page still on the list. */
		list_for_each_entry_safe(page1, page2, &pagelist, lru)
			put_page(page1);

		pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
			pfn, ret, page->flags);
		if (ret > 0)
			ret = -EIO;
		return ret;
	}
done:
	/* Account the pages only if the head was not already poisoned. */
	if (!PageHWPoison(hpage))
		atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages);
	set_page_hwpoison_huge_page(hpage);
	dequeue_hwpoisoned_huge_page(hpage);

	return ret;
}
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474int soft_offline_page(struct page *page, int flags)
1475{
1476 int ret;
1477 unsigned long pfn = page_to_pfn(page);
1478
1479 if (PageHuge(page))
1480 return soft_offline_huge_page(page, flags);
1481
1482 ret = get_any_page(page, pfn, flags);
1483 if (ret < 0)
1484 return ret;
1485 if (ret == 0)
1486 goto done;
1487
1488
1489
1490
1491 if (!PageLRU(page)) {
1492
1493
1494
1495 put_page(page);
1496 shake_page(page, 1);
1497
1498
1499
1500
1501 ret = get_any_page(page, pfn, 0);
1502 if (ret < 0)
1503 return ret;
1504 if (ret == 0)
1505 goto done;
1506 }
1507 if (!PageLRU(page)) {
1508 pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
1509 pfn, page->flags);
1510 return -EIO;
1511 }
1512
1513 lock_page(page);
1514 wait_on_page_writeback(page);
1515
1516
1517
1518
1519 if (PageHWPoison(page)) {
1520 unlock_page(page);
1521 put_page(page);
1522 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1523 return -EBUSY;
1524 }
1525
1526
1527
1528
1529
1530 ret = invalidate_inode_page(page);
1531 unlock_page(page);
1532
1533
1534
1535
1536 if (ret == 1) {
1537 put_page(page);
1538 ret = 0;
1539 pr_info("soft_offline: %#lx: invalidated\n", pfn);
1540 goto done;
1541 }
1542
1543
1544
1545
1546
1547
1548 ret = isolate_lru_page(page);
1549
1550
1551
1552
1553 put_page(page);
1554 if (!ret) {
1555 LIST_HEAD(pagelist);
1556 inc_zone_page_state(page, NR_ISOLATED_ANON +
1557 page_is_file_cache(page));
1558 list_add(&page->lru, &pagelist);
1559 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1560 0, MIGRATE_SYNC);
1561 if (ret) {
1562 putback_lru_pages(&pagelist);
1563 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1564 pfn, ret, page->flags);
1565 if (ret > 0)
1566 ret = -EIO;
1567 }
1568 } else {
1569 pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
1570 pfn, ret, page_count(page), page->flags);
1571 }
1572 if (ret)
1573 return ret;
1574
1575done:
1576 atomic_long_add(1, &mce_bad_pages);
1577 SetPageHWPoison(page);
1578
1579 return ret;
1580}
1581