1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38#include <linux/kernel.h>
39#include <linux/mm.h>
40#include <linux/page-flags.h>
41#include <linux/kernel-page-flags.h>
42#include <linux/sched.h>
43#include <linux/ksm.h>
44#include <linux/rmap.h>
45#include <linux/export.h>
46#include <linux/pagemap.h>
47#include <linux/swap.h>
48#include <linux/backing-dev.h>
49#include <linux/migrate.h>
50#include <linux/page-isolation.h>
51#include <linux/suspend.h>
52#include <linux/slab.h>
53#include <linux/swapops.h>
54#include <linux/hugetlb.h>
55#include <linux/memory_hotplug.h>
56#include <linux/mm_inline.h>
57#include <linux/kfifo.h>
58#include "internal.h"
59
/*
 * Default early-kill policy: task_early_kill() returns this value for tasks
 * that have not set PF_MCE_PROCESS.
 */
int sysctl_memory_failure_early_kill __read_mostly = 0;

/* When zero, memory_failure() panics instead of attempting recovery. */
int sysctl_memory_failure_recovery __read_mostly = 1;

/*
 * Running count of pages marked hardware-poisoned; adjusted by
 * memory_failure(), unpoison_memory() and the soft-offline paths.
 */
atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
65
66#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
67
/*
 * Knobs consulted by hwpoison_filter().  They are exported (GPL) so that
 * code outside this file can set them.
 *
 * - hwpoison_filter_enable: master switch; when zero no filtering is done.
 * - hwpoison_filter_dev_major/minor: ~0U acts as a wildcard (no constraint).
 * - hwpoison_filter_flags_mask/value: a page passes when
 *   (stable_page_flags(p) & mask) == value; a zero mask disables the check.
 */
u32 hwpoison_filter_enable = 0;
u32 hwpoison_filter_dev_major = ~0U;
u32 hwpoison_filter_dev_minor = ~0U;
u64 hwpoison_filter_flags_mask;
u64 hwpoison_filter_flags_value;
EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
78
79static int hwpoison_filter_dev(struct page *p)
80{
81 struct address_space *mapping;
82 dev_t dev;
83
84 if (hwpoison_filter_dev_major == ~0U &&
85 hwpoison_filter_dev_minor == ~0U)
86 return 0;
87
88
89
90
91 if (PageSlab(p))
92 return -EINVAL;
93
94 mapping = page_mapping(p);
95 if (mapping == NULL || mapping->host == NULL)
96 return -EINVAL;
97
98 dev = mapping->host->i_sb->s_dev;
99 if (hwpoison_filter_dev_major != ~0U &&
100 hwpoison_filter_dev_major != MAJOR(dev))
101 return -EINVAL;
102 if (hwpoison_filter_dev_minor != ~0U &&
103 hwpoison_filter_dev_minor != MINOR(dev))
104 return -EINVAL;
105
106 return 0;
107}
108
109static int hwpoison_filter_flags(struct page *p)
110{
111 if (!hwpoison_filter_flags_mask)
112 return 0;
113
114 if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
115 hwpoison_filter_flags_value)
116 return 0;
117 else
118 return -EINVAL;
119}
120
121
122
123
124
125
126
127
128
129
130
#ifdef CONFIG_MEMCG_SWAP
/* Inode number of the memory cgroup to filter on; zero disables the check. */
u64 hwpoison_filter_memcg;
EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);

/*
 * Memory-cgroup filter: accept a page only if the cgroup it is charged to
 * has the inode number stored in hwpoison_filter_memcg.
 *
 * Returns 0 when the page passes (or the filter is disabled), -EINVAL when
 * the page is not charged, its cgroup has no dentry, or the inode differs.
 */
static int hwpoison_filter_task(struct page *p)
{
	struct mem_cgroup *mem;
	struct cgroup_subsys_state *css;
	unsigned long ino;

	if (!hwpoison_filter_memcg)
		return 0;

	mem = try_get_mem_cgroup_from_page(p);
	if (!mem)
		return -EINVAL;

	css = mem_cgroup_css(mem);

	/*
	 * A cgroup without a dentry cannot match any inode number.  Drop the
	 * css exactly like the success path below does, so this exit does not
	 * leave the reference unbalanced.
	 */
	if (!css->cgroup->dentry) {
		css_put(css);
		return -EINVAL;
	}

	/* Read the inode number before releasing the css. */
	ino = css->cgroup->dentry->d_inode->i_ino;
	css_put(css);

	if (ino != hwpoison_filter_memcg)
		return -EINVAL;

	return 0;
}
#else
/* Without CONFIG_MEMCG_SWAP there is no cgroup filtering: always pass. */
static int hwpoison_filter_task(struct page *p) { return 0; }
#endif
163
164int hwpoison_filter(struct page *p)
165{
166 if (!hwpoison_filter_enable)
167 return 0;
168
169 if (hwpoison_filter_dev(p))
170 return -EINVAL;
171
172 if (hwpoison_filter_flags(p))
173 return -EINVAL;
174
175 if (hwpoison_filter_task(p))
176 return -EINVAL;
177
178 return 0;
179}
180#else
/*
 * Stub used when neither CONFIG_HWPOISON_INJECT nor
 * CONFIG_HWPOISON_INJECT_MODULE is set: no page is ever filtered out.
 */
int hwpoison_filter(struct page *p)
{
	return 0;
}
185#endif
186
187EXPORT_SYMBOL_GPL(hwpoison_filter);
188
189
190
191
192
193
194static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
195 unsigned long pfn, struct page *page, int flags)
196{
197 struct siginfo si;
198 int ret;
199
200 printk(KERN_ERR
201 "MCE %#lx: Killing %s:%d due to hardware memory corruption\n",
202 pfn, t->comm, t->pid);
203 si.si_signo = SIGBUS;
204 si.si_errno = 0;
205 si.si_addr = (void *)addr;
206#ifdef __ARCH_SI_TRAPNO
207 si.si_trapno = trapno;
208#endif
209 si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT;
210
211 if ((flags & MF_ACTION_REQUIRED) && t == current) {
212 si.si_code = BUS_MCEERR_AR;
213 ret = force_sig_info(SIGBUS, &si, t);
214 } else {
215
216
217
218
219
220
221 si.si_code = BUS_MCEERR_AO;
222 ret = send_sig_info(SIGBUS, &si, t);
223 }
224 if (ret < 0)
225 printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
226 t->comm, t->pid, ret);
227 return ret;
228}
229
230
231
232
233
234void shake_page(struct page *p, int access)
235{
236 if (!PageSlab(p)) {
237 lru_add_drain_all();
238 if (PageLRU(p))
239 return;
240 drain_all_pages();
241 if (PageLRU(p) || is_free_buddy_page(p))
242 return;
243 }
244
245
246
247
248
249 if (access) {
250 int nr;
251 do {
252 struct shrink_control shrink = {
253 .gfp_mask = GFP_KERNEL,
254 };
255
256 nr = shrink_slab(&shrink, 1000, 1000);
257 if (page_count(p) == 1)
258 break;
259 } while (nr > 10);
260 }
261}
262EXPORT_SYMBOL_GPL(shake_page);
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
/*
 * One entry per task that maps a corrupted page; built by add_to_kill()
 * and consumed (and freed) by kill_procs().
 */
struct to_kill {
	struct list_head nd;		/* link in the caller's kill list */
	struct task_struct *tsk;	/* task, pinned with get_task_struct() */
	unsigned long addr;		/* result of page_address_in_vma() */
	char addr_valid;		/* 0 when addr is -EFAULT */
};
292
293
294
295
296
297
298
299
300
301
302
303static void add_to_kill(struct task_struct *tsk, struct page *p,
304 struct vm_area_struct *vma,
305 struct list_head *to_kill,
306 struct to_kill **tkc)
307{
308 struct to_kill *tk;
309
310 if (*tkc) {
311 tk = *tkc;
312 *tkc = NULL;
313 } else {
314 tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
315 if (!tk) {
316 printk(KERN_ERR
317 "MCE: Out of memory while machine check handling\n");
318 return;
319 }
320 }
321 tk->addr = page_address_in_vma(p, vma);
322 tk->addr_valid = 1;
323
324
325
326
327
328
329
330 if (tk->addr == -EFAULT) {
331 pr_info("MCE: Unable to find user space address %lx in %s\n",
332 page_to_pfn(p), tsk->comm);
333 tk->addr_valid = 0;
334 }
335 get_task_struct(tsk);
336 tk->tsk = tsk;
337 list_add_tail(&tk->nd, to_kill);
338}
339
340
341
342
343
344
345
346
347
348static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
349 int fail, struct page *page, unsigned long pfn,
350 int flags)
351{
352 struct to_kill *tk, *next;
353
354 list_for_each_entry_safe (tk, next, to_kill, nd) {
355 if (forcekill) {
356
357
358
359
360
361 if (fail || tk->addr_valid == 0) {
362 printk(KERN_ERR
363 "MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
364 pfn, tk->tsk->comm, tk->tsk->pid);
365 force_sig(SIGKILL, tk->tsk);
366 }
367
368
369
370
371
372
373
374 else if (kill_proc(tk->tsk, tk->addr, trapno,
375 pfn, page, flags) < 0)
376 printk(KERN_ERR
377 "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
378 pfn, tk->tsk->comm, tk->tsk->pid);
379 }
380 put_task_struct(tk->tsk);
381 kfree(tk);
382 }
383}
384
385static int task_early_kill(struct task_struct *tsk)
386{
387 if (!tsk->mm)
388 return 0;
389 if (tsk->flags & PF_MCE_PROCESS)
390 return !!(tsk->flags & PF_MCE_EARLY);
391 return sysctl_memory_failure_early_kill;
392}
393
394
395
396
397static void collect_procs_anon(struct page *page, struct list_head *to_kill,
398 struct to_kill **tkc)
399{
400 struct vm_area_struct *vma;
401 struct task_struct *tsk;
402 struct anon_vma *av;
403 pgoff_t pgoff;
404
405 av = page_lock_anon_vma_read(page);
406 if (av == NULL)
407 return;
408
409 pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
410 read_lock(&tasklist_lock);
411 for_each_process (tsk) {
412 struct anon_vma_chain *vmac;
413
414 if (!task_early_kill(tsk))
415 continue;
416 anon_vma_interval_tree_foreach(vmac, &av->rb_root,
417 pgoff, pgoff) {
418 vma = vmac->vma;
419 if (!page_mapped_in_vma(page, vma))
420 continue;
421 if (vma->vm_mm == tsk->mm)
422 add_to_kill(tsk, page, vma, to_kill, tkc);
423 }
424 }
425 read_unlock(&tasklist_lock);
426 page_unlock_anon_vma_read(av);
427}
428
429
430
431
432static void collect_procs_file(struct page *page, struct list_head *to_kill,
433 struct to_kill **tkc)
434{
435 struct vm_area_struct *vma;
436 struct task_struct *tsk;
437 struct address_space *mapping = page->mapping;
438
439 mutex_lock(&mapping->i_mmap_mutex);
440 read_lock(&tasklist_lock);
441 for_each_process(tsk) {
442 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
443
444 if (!task_early_kill(tsk))
445 continue;
446
447 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
448 pgoff) {
449
450
451
452
453
454
455
456 if (vma->vm_mm == tsk->mm)
457 add_to_kill(tsk, page, vma, to_kill, tkc);
458 }
459 }
460 read_unlock(&tasklist_lock);
461 mutex_unlock(&mapping->i_mmap_mutex);
462}
463
464
465
466
467
468
469
470static void collect_procs(struct page *page, struct list_head *tokill)
471{
472 struct to_kill *tk;
473
474 if (!page->mapping)
475 return;
476
477 tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
478 if (!tk)
479 return;
480 if (PageAnon(page))
481 collect_procs_anon(page, tokill, &tk);
482 else
483 collect_procs_file(page, tokill, &tk);
484 kfree(tk);
485}
486
487
488
489
490
/*
 * Result codes returned by the me_*() page-state handlers.  page_action()
 * treats RECOVERED and DELAYED as success and everything else as failure.
 */
enum outcome {
	IGNORED,
	FAILED,
	DELAYED,
	RECOVERED,
};

/* Human-readable names for enum outcome, indexed by the enum value. */
static const char *action_name[] = {
	[IGNORED] = "Ignored",
	[FAILED] = "Failed",
	[DELAYED] = "Delayed",
	[RECOVERED] = "Recovered",
};
504
505
506
507
508
509
510
511static int delete_from_lru_cache(struct page *p)
512{
513 if (!isolate_lru_page(p)) {
514
515
516
517
518 ClearPageActive(p);
519 ClearPageUnevictable(p);
520
521
522
523 page_cache_release(p);
524 return 0;
525 }
526 return -EIO;
527}
528
529
530
531
532
533
/*
 * Handler for the "reserved kernel" and "kernel slab" entries of
 * error_states[]: nothing is attempted, the error is reported as IGNORED.
 */
static int me_kernel(struct page *p, unsigned long pfn)
{
	return IGNORED;
}
538
539
540
541
/*
 * Catch-all handler for pages matching no other error_states[] entry:
 * log the pfn and report FAILED.
 */
static int me_unknown(struct page *p, unsigned long pfn)
{
	printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
	return FAILED;
}
547
548
549
550
551static int me_pagecache_clean(struct page *p, unsigned long pfn)
552{
553 int err;
554 int ret = FAILED;
555 struct address_space *mapping;
556
557 delete_from_lru_cache(p);
558
559
560
561
562
563 if (PageAnon(p))
564 return RECOVERED;
565
566
567
568
569
570
571
572
573 mapping = page_mapping(p);
574 if (!mapping) {
575
576
577
578 return FAILED;
579 }
580
581
582
583
584
585
586 if (mapping->a_ops->error_remove_page) {
587 err = mapping->a_ops->error_remove_page(mapping, p);
588 if (err != 0) {
589 printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n",
590 pfn, err);
591 } else if (page_has_private(p) &&
592 !try_to_release_page(p, GFP_NOIO)) {
593 pr_info("MCE %#lx: failed to release buffers\n", pfn);
594 } else {
595 ret = RECOVERED;
596 }
597 } else {
598
599
600
601
602 if (invalidate_inode_page(p))
603 ret = RECOVERED;
604 else
605 printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
606 pfn);
607 }
608 return ret;
609}
610
611
612
613
614
615
616static int me_pagecache_dirty(struct page *p, unsigned long pfn)
617{
618 struct address_space *mapping = page_mapping(p);
619
620 SetPageError(p);
621
622 if (mapping) {
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657 mapping_set_error(mapping, EIO);
658 }
659
660 return me_pagecache_clean(p, pfn);
661}
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682static int me_swapcache_dirty(struct page *p, unsigned long pfn)
683{
684 ClearPageDirty(p);
685
686 ClearPageUptodate(p);
687
688 if (!delete_from_lru_cache(p))
689 return DELAYED;
690 else
691 return FAILED;
692}
693
694static int me_swapcache_clean(struct page *p, unsigned long pfn)
695{
696 delete_from_swap_cache(p);
697
698 if (!delete_from_lru_cache(p))
699 return RECOVERED;
700 else
701 return FAILED;
702}
703
704
705
706
707
708
709
710static int me_huge_page(struct page *p, unsigned long pfn)
711{
712 int res = 0;
713 struct page *hpage = compound_head(p);
714
715
716
717
718
719
720
721
722
723
724 if (!(page_mapping(hpage) || PageAnon(hpage))) {
725 res = dequeue_hwpoisoned_huge_page(hpage);
726 if (!res)
727 return RECOVERED;
728 }
729 return DELAYED;
730}
731
732
733
734
735
736
737
738
739
740
741
742
743
744
/*
 * Short-hand bit masks for the page flags used in error_states[] below.
 * They exist only for the table and are #undef'ed right after it.
 * (writeback and swapbacked are defined but not used by any entry.)
 */
#define dirty		(1UL << PG_dirty)
#define sc		(1UL << PG_swapcache)
#define unevict		(1UL << PG_unevictable)
#define mlock		(1UL << PG_mlocked)
#define writeback	(1UL << PG_writeback)
#define lru		(1UL << PG_lru)
#define swapbacked	(1UL << PG_swapbacked)
#define head		(1UL << PG_head)
#define tail		(1UL << PG_tail)
#define compound	(1UL << PG_compound)
#define slab		(1UL << PG_slab)
#define reserved	(1UL << PG_reserved)

/*
 * Page-state dispatch table.  memory_failure() scans it from the top and
 * runs the action of the first entry for which
 * (page->flags & mask) == res, so the order of the entries matters: more
 * specific states must come before more general ones.
 */
static struct page_state {
	unsigned long mask;	/* flag bits that are examined */
	unsigned long res;	/* required value of those bits */
	char *msg;		/* state name used in log messages */
	int (*action)(struct page *p, unsigned long pfn);	/* handler */
} error_states[] = {
	{ reserved,	reserved,	"reserved kernel",	me_kernel },

	{ slab,		slab,		"kernel slab",	me_kernel },

#ifdef CONFIG_PAGEFLAGS_EXTENDED
	{ head,		head,		"huge",		me_huge_page },
	{ tail,		tail,		"huge",		me_huge_page },
#else
	{ compound,	compound,	"huge",		me_huge_page },
#endif

	{ sc|dirty,	sc|dirty,	"dirty swapcache",	me_swapcache_dirty },
	{ sc|dirty,	sc,		"clean swapcache",	me_swapcache_clean },

	{ unevict|dirty, unevict|dirty,	"dirty unevictable LRU", me_pagecache_dirty },
	{ unevict,	unevict,	"clean unevictable LRU", me_pagecache_clean },

	{ mlock|dirty,	mlock|dirty,	"dirty mlocked LRU",	me_pagecache_dirty },
	{ mlock,	mlock,		"clean mlocked LRU",	me_pagecache_clean },

	{ lru|dirty,	lru|dirty,	"dirty LRU",	me_pagecache_dirty },
	{ lru|dirty,	lru,		"clean LRU",	me_pagecache_clean },

	/*
	 * Catch-all: a zero mask matches every page, which also guarantees
	 * that the unbounded scan in memory_failure() terminates.
	 */
	{ 0,		0,		"unknown page state",	me_unknown },
};

#undef dirty
#undef sc
#undef unevict
#undef mlock
#undef writeback
#undef lru
#undef swapbacked
#undef head
#undef tail
#undef compound
#undef slab
#undef reserved
814
815
816
817
818
819static void action_result(unsigned long pfn, char *msg, int result)
820{
821 pr_err("MCE %#lx: %s page recovery: %s\n",
822 pfn, msg, action_name[result]);
823}
824
825static int page_action(struct page_state *ps, struct page *p,
826 unsigned long pfn)
827{
828 int result;
829 int count;
830
831 result = ps->action(p, pfn);
832 action_result(pfn, ps->msg, result);
833
834 count = page_count(p) - 1;
835 if (ps->action == me_swapcache_dirty && result == DELAYED)
836 count--;
837 if (count != 0) {
838 printk(KERN_ERR
839 "MCE %#lx: %s page still referenced by %d users\n",
840 pfn, ps->msg, count);
841 result = FAILED;
842 }
843
844
845
846
847
848
849 return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
850}
851
852
853
854
855
/*
 * Unmap a corrupted page from all processes and signal the ones using it.
 *
 * @p:      the corrupted page (may be a tail page of a compound page)
 * @pfn:    its page frame number, used for log messages
 * @trapno: trap number forwarded to the signal
 * @flags:  MF_* flags from the caller
 *
 * Returns the try_to_unmap() result (SWAP_SUCCESS when the page is no
 * longer mapped), SWAP_SUCCESS for pages that need no unmapping, or
 * SWAP_FAIL for KSM pages and for transparent huge pages that cannot be
 * split.
 */
static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
				  int trapno, int flags)
{
	enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
	struct address_space *mapping;
	LIST_HEAD(tokill);
	int ret;
	int kill = 1, forcekill;
	struct page *hpage = compound_head(p);
	struct page *ppage;

	/* Reserved and slab pages are not unmapped here. */
	if (PageReserved(p) || PageSlab(p))
		return SWAP_SUCCESS;

	/* Nothing to do if no process maps the (head) page. */
	if (!page_mapped(hpage))
		return SWAP_SUCCESS;

	/* KSM pages are not handled. */
	if (PageKsm(p))
		return SWAP_FAIL;

	if (PageSwapCache(p)) {
		printk(KERN_ERR
		       "MCE %#lx: keeping poisoned page in swap cache\n", pfn);
		ttu |= TTU_IGNORE_HWPOISON;
	}

	/*
	 * The page looks clean and has a writeback-capable mapping: let
	 * page_mkclean() check the page tables.  If it found dirty PTEs the
	 * page is marked dirty; otherwise the page really was clean, so
	 * nobody needs to be killed.  Skipped when the caller insists on
	 * killing (MF_MUST_KILL).
	 */
	mapping = page_mapping(hpage);
	if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
	    mapping_cap_writeback_dirty(mapping)) {
		if (page_mkclean(hpage)) {
			SetPageDirty(hpage);
		} else {
			kill = 0;
			ttu |= TTU_IGNORE_HWPOISON;
			printk(KERN_INFO
	"MCE %#lx: corrupted page was clean: dropped without side effects\n",
				pfn);
		}
	}

	/*
	 * ppage is the page that actually gets unmapped: the head page by
	 * default, or the raw error page once a THP has been split.
	 */
	ppage = hpage;

	if (PageTransHuge(hpage)) {
		/* Only anonymous, non-hugetlb huge pages are split. */
		if (!PageHuge(hpage) && PageAnon(hpage)) {
			if (unlikely(split_huge_page(hpage))) {
				printk(KERN_INFO
					"MCE %#lx: failed to split THP\n", pfn);

				BUG_ON(!PageHWPoison(p));
				return SWAP_FAIL;
			}
			/* After the split, work on the error page itself. */
			ppage = p;
		}
	}

	/*
	 * Collect the processes to signal before unmapping, while the
	 * mappings that identify them still exist.
	 */
	if (kill)
		collect_procs(ppage, &tokill);

	/*
	 * ppage is locked around the unmap when it differs from hpage;
	 * memory_failure() already holds the lock on hpage when calling here.
	 */
	if (hpage != ppage)
		lock_page(ppage);

	ret = try_to_unmap(ppage, ttu);
	if (ret != SWAP_SUCCESS)
		printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
				pfn, page_mapcount(ppage));

	if (hpage != ppage)
		unlock_page(ppage);

	/*
	 * Signal the collected processes.  They are only signalled when the
	 * page is dirty or MF_MUST_KILL is set; kill_procs() escalates to
	 * SIGKILL when unmapping failed.  The list entries are freed either
	 * way.
	 */
	forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL);
	kill_procs(&tokill, forcekill, trapno,
		      ret != SWAP_SUCCESS, p, pfn, flags);

	return ret;
}
982
983static void set_page_hwpoison_huge_page(struct page *hpage)
984{
985 int i;
986 int nr_pages = 1 << compound_trans_order(hpage);
987 for (i = 0; i < nr_pages; i++)
988 SetPageHWPoison(hpage + i);
989}
990
991static void clear_page_hwpoison_huge_page(struct page *hpage)
992{
993 int i;
994 int nr_pages = 1 << compound_trans_order(hpage);
995 for (i = 0; i < nr_pages; i++)
996 ClearPageHWPoison(hpage + i);
997}
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017int memory_failure(unsigned long pfn, int trapno, int flags)
1018{
1019 struct page_state *ps;
1020 struct page *p;
1021 struct page *hpage;
1022 int res;
1023 unsigned int nr_pages;
1024
1025 if (!sysctl_memory_failure_recovery)
1026 panic("Memory failure from trap %d on page %lx", trapno, pfn);
1027
1028 if (!pfn_valid(pfn)) {
1029 printk(KERN_ERR
1030 "MCE %#lx: memory outside kernel control\n",
1031 pfn);
1032 return -ENXIO;
1033 }
1034
1035 p = pfn_to_page(pfn);
1036 hpage = compound_head(p);
1037 if (TestSetPageHWPoison(p)) {
1038 printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
1039 return 0;
1040 }
1041
1042 nr_pages = 1 << compound_trans_order(hpage);
1043 atomic_long_add(nr_pages, &mce_bad_pages);
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059 if (!(flags & MF_COUNT_INCREASED) &&
1060 !get_page_unless_zero(hpage)) {
1061 if (is_free_buddy_page(p)) {
1062 action_result(pfn, "free buddy", DELAYED);
1063 return 0;
1064 } else if (PageHuge(hpage)) {
1065
1066
1067
1068
1069 lock_page(hpage);
1070 if (!PageHWPoison(hpage)
1071 || (hwpoison_filter(p) && TestClearPageHWPoison(p))
1072 || (p != hpage && TestSetPageHWPoison(hpage))) {
1073 atomic_long_sub(nr_pages, &mce_bad_pages);
1074 return 0;
1075 }
1076 set_page_hwpoison_huge_page(hpage);
1077 res = dequeue_hwpoisoned_huge_page(hpage);
1078 action_result(pfn, "free huge",
1079 res ? IGNORED : DELAYED);
1080 unlock_page(hpage);
1081 return res;
1082 } else {
1083 action_result(pfn, "high order kernel", IGNORED);
1084 return -EBUSY;
1085 }
1086 }
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096 if (!PageHuge(p) && !PageTransTail(p)) {
1097 if (!PageLRU(p))
1098 shake_page(p, 0);
1099 if (!PageLRU(p)) {
1100
1101
1102
1103 if (is_free_buddy_page(p)) {
1104 action_result(pfn, "free buddy, 2nd try",
1105 DELAYED);
1106 return 0;
1107 }
1108 action_result(pfn, "non LRU", IGNORED);
1109 put_page(p);
1110 return -EBUSY;
1111 }
1112 }
1113
1114
1115
1116
1117
1118
1119 lock_page(hpage);
1120
1121
1122
1123
1124 if (!PageHWPoison(p)) {
1125 printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
1126 res = 0;
1127 goto out;
1128 }
1129 if (hwpoison_filter(p)) {
1130 if (TestClearPageHWPoison(p))
1131 atomic_long_sub(nr_pages, &mce_bad_pages);
1132 unlock_page(hpage);
1133 put_page(hpage);
1134 return 0;
1135 }
1136
1137
1138
1139
1140
1141 if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
1142 action_result(pfn, "hugepage already hardware poisoned",
1143 IGNORED);
1144 unlock_page(hpage);
1145 put_page(hpage);
1146 return 0;
1147 }
1148
1149
1150
1151
1152
1153
1154 if (PageHuge(p))
1155 set_page_hwpoison_huge_page(hpage);
1156
1157 wait_on_page_writeback(p);
1158
1159
1160
1161
1162
1163 if (hwpoison_user_mappings(p, pfn, trapno, flags) != SWAP_SUCCESS) {
1164 printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
1165 res = -EBUSY;
1166 goto out;
1167 }
1168
1169
1170
1171
1172 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
1173 action_result(pfn, "already truncated LRU", IGNORED);
1174 res = -EBUSY;
1175 goto out;
1176 }
1177
1178 res = -EBUSY;
1179 for (ps = error_states;; ps++) {
1180 if ((p->flags & ps->mask) == ps->res) {
1181 res = page_action(ps, p, pfn);
1182 break;
1183 }
1184 }
1185out:
1186 unlock_page(hpage);
1187 return res;
1188}
1189EXPORT_SYMBOL_GPL(memory_failure);
1190
/* Each per-CPU queue holds 1 << MEMORY_FAILURE_FIFO_ORDER entries. */
#define MEMORY_FAILURE_FIFO_ORDER	4
#define MEMORY_FAILURE_FIFO_SIZE	(1 << MEMORY_FAILURE_FIFO_ORDER)

/* One queued memory_failure() call: the arguments to pass on. */
struct memory_failure_entry {
	unsigned long pfn;
	int trapno;
	int flags;
};

/*
 * Per-CPU deferred-work state: a FIFO of pending entries, the spinlock
 * guarding it, and the work item that drains it
 * (memory_failure_work_func()).
 */
struct memory_failure_cpu {
	DECLARE_KFIFO(fifo, struct memory_failure_entry,
		      MEMORY_FAILURE_FIFO_SIZE);
	spinlock_t lock;
	struct work_struct work;
};

static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226void memory_failure_queue(unsigned long pfn, int trapno, int flags)
1227{
1228 struct memory_failure_cpu *mf_cpu;
1229 unsigned long proc_flags;
1230 struct memory_failure_entry entry = {
1231 .pfn = pfn,
1232 .trapno = trapno,
1233 .flags = flags,
1234 };
1235
1236 mf_cpu = &get_cpu_var(memory_failure_cpu);
1237 spin_lock_irqsave(&mf_cpu->lock, proc_flags);
1238 if (kfifo_put(&mf_cpu->fifo, &entry))
1239 schedule_work_on(smp_processor_id(), &mf_cpu->work);
1240 else
1241 pr_err("Memory failure: buffer overflow when queuing memory failure at 0x%#lx\n",
1242 pfn);
1243 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
1244 put_cpu_var(memory_failure_cpu);
1245}
1246EXPORT_SYMBOL_GPL(memory_failure_queue);
1247
1248static void memory_failure_work_func(struct work_struct *work)
1249{
1250 struct memory_failure_cpu *mf_cpu;
1251 struct memory_failure_entry entry = { 0, };
1252 unsigned long proc_flags;
1253 int gotten;
1254
1255 mf_cpu = &__get_cpu_var(memory_failure_cpu);
1256 for (;;) {
1257 spin_lock_irqsave(&mf_cpu->lock, proc_flags);
1258 gotten = kfifo_get(&mf_cpu->fifo, &entry);
1259 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
1260 if (!gotten)
1261 break;
1262 memory_failure(entry.pfn, entry.trapno, entry.flags);
1263 }
1264}
1265
1266static int __init memory_failure_init(void)
1267{
1268 struct memory_failure_cpu *mf_cpu;
1269 int cpu;
1270
1271 for_each_possible_cpu(cpu) {
1272 mf_cpu = &per_cpu(memory_failure_cpu, cpu);
1273 spin_lock_init(&mf_cpu->lock);
1274 INIT_KFIFO(mf_cpu->fifo);
1275 INIT_WORK(&mf_cpu->work, memory_failure_work_func);
1276 }
1277
1278 return 0;
1279}
1280core_initcall(memory_failure_init);
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294int unpoison_memory(unsigned long pfn)
1295{
1296 struct page *page;
1297 struct page *p;
1298 int freeit = 0;
1299 unsigned int nr_pages;
1300
1301 if (!pfn_valid(pfn))
1302 return -ENXIO;
1303
1304 p = pfn_to_page(pfn);
1305 page = compound_head(p);
1306
1307 if (!PageHWPoison(p)) {
1308 pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
1309 return 0;
1310 }
1311
1312 nr_pages = 1 << compound_trans_order(page);
1313
1314 if (!get_page_unless_zero(page)) {
1315
1316
1317
1318
1319
1320
1321 if (PageHuge(page)) {
1322 pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
1323 return 0;
1324 }
1325 if (TestClearPageHWPoison(p))
1326 atomic_long_sub(nr_pages, &mce_bad_pages);
1327 pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
1328 return 0;
1329 }
1330
1331 lock_page(page);
1332
1333
1334
1335
1336
1337
1338 if (TestClearPageHWPoison(page)) {
1339 pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
1340 atomic_long_sub(nr_pages, &mce_bad_pages);
1341 freeit = 1;
1342 if (PageHuge(page))
1343 clear_page_hwpoison_huge_page(page);
1344 }
1345 unlock_page(page);
1346
1347 put_page(page);
1348 if (freeit)
1349 put_page(page);
1350
1351 return 0;
1352}
1353EXPORT_SYMBOL(unpoison_memory);
1354
1355static struct page *new_page(struct page *p, unsigned long private, int **x)
1356{
1357 int nid = page_to_nid(p);
1358 if (PageHuge(p))
1359 return alloc_huge_page_node(page_hstate(compound_head(p)),
1360 nid);
1361 else
1362 return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
1363}
1364
1365
1366
1367
1368
1369
1370
1371static int get_any_page(struct page *p, unsigned long pfn, int flags)
1372{
1373 int ret;
1374
1375 if (flags & MF_COUNT_INCREASED)
1376 return 1;
1377
1378
1379
1380
1381
1382 lock_memory_hotplug();
1383
1384
1385
1386
1387
1388 set_migratetype_isolate(p, true);
1389
1390
1391
1392
1393 if (!get_page_unless_zero(compound_head(p))) {
1394 if (PageHuge(p)) {
1395 pr_info("%s: %#lx free huge page\n", __func__, pfn);
1396 ret = dequeue_hwpoisoned_huge_page(compound_head(p));
1397 } else if (is_free_buddy_page(p)) {
1398 pr_info("%s: %#lx free buddy page\n", __func__, pfn);
1399
1400 SetPageHWPoison(p);
1401 ret = 0;
1402 } else {
1403 pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
1404 __func__, pfn, p->flags);
1405 ret = -EIO;
1406 }
1407 } else {
1408
1409 ret = 1;
1410 }
1411 unset_migratetype_isolate(p, MIGRATE_MOVABLE);
1412 unlock_memory_hotplug();
1413 return ret;
1414}
1415
1416static int soft_offline_huge_page(struct page *page, int flags)
1417{
1418 int ret;
1419 unsigned long pfn = page_to_pfn(page);
1420 struct page *hpage = compound_head(page);
1421
1422 ret = get_any_page(page, pfn, flags);
1423 if (ret < 0)
1424 return ret;
1425 if (ret == 0)
1426 goto done;
1427
1428 if (PageHWPoison(hpage)) {
1429 put_page(hpage);
1430 pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
1431 return -EBUSY;
1432 }
1433
1434
1435 ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, false,
1436 MIGRATE_SYNC);
1437 put_page(hpage);
1438 if (ret) {
1439 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1440 pfn, ret, page->flags);
1441 return ret;
1442 }
1443done:
1444 if (!PageHWPoison(hpage))
1445 atomic_long_add(1 << compound_trans_order(hpage),
1446 &mce_bad_pages);
1447 set_page_hwpoison_huge_page(hpage);
1448 dequeue_hwpoisoned_huge_page(hpage);
1449
1450 return ret;
1451}
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475int soft_offline_page(struct page *page, int flags)
1476{
1477 int ret;
1478 unsigned long pfn = page_to_pfn(page);
1479 struct page *hpage = compound_trans_head(page);
1480
1481 if (PageHuge(page))
1482 return soft_offline_huge_page(page, flags);
1483 if (PageTransHuge(hpage)) {
1484 if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
1485 pr_info("soft offline: %#lx: failed to split THP\n",
1486 pfn);
1487 return -EBUSY;
1488 }
1489 }
1490
1491 ret = get_any_page(page, pfn, flags);
1492 if (ret < 0)
1493 return ret;
1494 if (ret == 0)
1495 goto done;
1496
1497
1498
1499
1500 if (!PageLRU(page)) {
1501
1502
1503
1504 put_page(page);
1505 shake_page(page, 1);
1506
1507
1508
1509
1510 ret = get_any_page(page, pfn, 0);
1511 if (ret < 0)
1512 return ret;
1513 if (ret == 0)
1514 goto done;
1515 }
1516 if (!PageLRU(page)) {
1517 pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
1518 pfn, page->flags);
1519 return -EIO;
1520 }
1521
1522 lock_page(page);
1523 wait_on_page_writeback(page);
1524
1525
1526
1527
1528 if (PageHWPoison(page)) {
1529 unlock_page(page);
1530 put_page(page);
1531 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1532 return -EBUSY;
1533 }
1534
1535
1536
1537
1538
1539 ret = invalidate_inode_page(page);
1540 unlock_page(page);
1541
1542
1543
1544
1545 if (ret == 1) {
1546 put_page(page);
1547 ret = 0;
1548 pr_info("soft_offline: %#lx: invalidated\n", pfn);
1549 goto done;
1550 }
1551
1552
1553
1554
1555
1556
1557 ret = isolate_lru_page(page);
1558
1559
1560
1561
1562 put_page(page);
1563 if (!ret) {
1564 LIST_HEAD(pagelist);
1565 inc_zone_page_state(page, NR_ISOLATED_ANON +
1566 page_is_file_cache(page));
1567 list_add(&page->lru, &pagelist);
1568 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1569 false, MIGRATE_SYNC,
1570 MR_MEMORY_FAILURE);
1571 if (ret) {
1572 putback_lru_pages(&pagelist);
1573 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1574 pfn, ret, page->flags);
1575 if (ret > 0)
1576 ret = -EIO;
1577 }
1578 } else {
1579 pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
1580 pfn, ret, page_count(page), page->flags);
1581 }
1582 if (ret)
1583 return ret;
1584
1585done:
1586 atomic_long_add(1, &mce_bad_pages);
1587 SetPageHWPoison(page);
1588
1589 return ret;
1590}
1591