1
2
3
4
5
6
7
8#include <linux/mm.h>
9#include <linux/hugetlb.h>
10#include <linux/mman.h>
11#include <linux/slab.h>
12#include <linux/kernel_stat.h>
13#include <linux/swap.h>
14#include <linux/vmalloc.h>
15#include <linux/pagemap.h>
16#include <linux/namei.h>
17#include <linux/shmem_fs.h>
18#include <linux/blkdev.h>
19#include <linux/random.h>
20#include <linux/writeback.h>
21#include <linux/proc_fs.h>
22#include <linux/seq_file.h>
23#include <linux/init.h>
24#include <linux/ksm.h>
25#include <linux/rmap.h>
26#include <linux/security.h>
27#include <linux/backing-dev.h>
28#include <linux/mutex.h>
29#include <linux/capability.h>
30#include <linux/syscalls.h>
31#include <linux/memcontrol.h>
32#include <linux/poll.h>
33#include <linux/oom.h>
34
35#include <asm/pgtable.h>
36#include <asm/tlbflush.h>
37#include <linux/swapops.h>
38#include <linux/page_cgroup.h>
39
/* Forward declarations for helpers that are used before their definition. */
static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
				 unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);
static sector_t map_swap_entry(swp_entry_t, struct block_device**);

/* Spinlock taken around updates to the swap bookkeeping in this file. */
static DEFINE_SPINLOCK(swap_lock);
/* Swap type indexes are validated against this bound before use. */
static unsigned int nr_swapfiles;
/* Free swap slots: decremented on allocation, incremented on free. */
long nr_swap_pages;
/* Sum of ->pages over the currently enabled swap areas. */
long total_swap_pages;
/* Last automatically assigned (negative) priority; see enable_swap_info(). */
static int least_priority;

/* Message fragments printed by swap_info_get() for rejected entries. */
static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";

/*
 * Index-linked list of swap areas; list walks stop at a negative index,
 * so both members start out at -1 (empty).
 */
static struct swap_list_t swap_list = {-1, -1};

/* Swap area descriptors, indexed by swap type. */
static struct swap_info_struct *swap_info[MAX_SWAPFILES];

/*
 * Held by swapoff while it tears an area down.
 * NOTE(review): presumably also serializes against swapon - confirm.
 */
static DEFINE_MUTEX(swapon_mutex);

/*
 * NOTE(review): neither object is referenced in the visible part of the
 * file; presumably they back poll() support for /proc/swaps - confirm.
 */
static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);

static atomic_t proc_poll_event = ATOMIC_INIT(0);
65
66static inline unsigned char swap_count(unsigned char ent)
67{
68 return ent & ~SWAP_HAS_CACHE;
69}
70
71
72static int
73__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
74{
75 swp_entry_t entry = swp_entry(si->type, offset);
76 struct page *page;
77 int ret = 0;
78
79 page = find_get_page(&swapper_space, entry.val);
80 if (!page)
81 return 0;
82
83
84
85
86
87
88
89 if (trylock_page(page)) {
90 ret = try_to_free_swap(page);
91 unlock_page(page);
92 }
93 page_cache_release(page);
94 return ret;
95}
96
97
98
99
100
101static int discard_swap(struct swap_info_struct *si)
102{
103 struct swap_extent *se;
104 sector_t start_block;
105 sector_t nr_blocks;
106 int err = 0;
107
108
109 se = &si->first_swap_extent;
110 start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
111 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
112 if (nr_blocks) {
113 err = blkdev_issue_discard(si->bdev, start_block,
114 nr_blocks, GFP_KERNEL, 0);
115 if (err)
116 return err;
117 cond_resched();
118 }
119
120 list_for_each_entry(se, &si->first_swap_extent.list, list) {
121 start_block = se->start_block << (PAGE_SHIFT - 9);
122 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
123
124 err = blkdev_issue_discard(si->bdev, start_block,
125 nr_blocks, GFP_KERNEL, 0);
126 if (err)
127 break;
128
129 cond_resched();
130 }
131 return err;
132}
133
134
135
136
137
138static void discard_swap_cluster(struct swap_info_struct *si,
139 pgoff_t start_page, pgoff_t nr_pages)
140{
141 struct swap_extent *se = si->curr_swap_extent;
142 int found_extent = 0;
143
144 while (nr_pages) {
145 struct list_head *lh;
146
147 if (se->start_page <= start_page &&
148 start_page < se->start_page + se->nr_pages) {
149 pgoff_t offset = start_page - se->start_page;
150 sector_t start_block = se->start_block + offset;
151 sector_t nr_blocks = se->nr_pages - offset;
152
153 if (nr_blocks > nr_pages)
154 nr_blocks = nr_pages;
155 start_page += nr_blocks;
156 nr_pages -= nr_blocks;
157
158 if (!found_extent++)
159 si->curr_swap_extent = se;
160
161 start_block <<= PAGE_SHIFT - 9;
162 nr_blocks <<= PAGE_SHIFT - 9;
163 if (blkdev_issue_discard(si->bdev, start_block,
164 nr_blocks, GFP_NOIO, 0))
165 break;
166 }
167
168 lh = se->list.next;
169 se = list_entry(lh, struct swap_extent, list);
170 }
171}
172
/*
 * Action callback handed to wait_on_bit() by scan_swap_map(): yield the
 * CPU and report success.  @word is not used.
 */
static int wait_for_discard(void *word)
{
	(void)word;

	schedule();
	return 0;
}
178
/* Number of consecutive swap slots scan_swap_map() treats as one cluster. */
#define SWAPFILE_CLUSTER	256
/* Loop iterations scan_swap_map() runs between calls to cond_resched(). */
#define LATENCY_LIMIT		256
181
/*
 * scan_swap_map - allocate one free slot from swap area @si.
 * @si:    swap area to allocate from
 * @usage: value stored in swap_map[] for the new slot
 *
 * Returns the offset of the allocated slot, or 0 if none could be found.
 *
 * Entered and left with swap_lock held, but the lock is dropped while
 * swap_map is scanned and while discards are issued or waited for, so
 * fields of @si may change across those windows.  SWP_SCANNING is added
 * to si->flags for the duration of the call; swapoff waits for the flags
 * to drop below SWP_SCANNING before tearing the area down.
 */
static unsigned long scan_swap_map(struct swap_info_struct *si,
				   unsigned char usage)
{
	unsigned long offset;
	unsigned long scan_base;
	unsigned long last_in_cluster = 0;
	int latency_ration = LATENCY_LIMIT;
	int found_free_cluster = 0;

	/* Count this scanner in si->flags; undone on every exit path. */
	si->flags += SWP_SCANNING;
	scan_base = offset = si->cluster_next;

	/* Current cluster exhausted: look for a fresh run of free slots. */
	if (unlikely(!si->cluster_nr--)) {
		/* Fewer free slots than a cluster: no point searching. */
		if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
			si->cluster_nr = SWAPFILE_CLUSTER - 1;
			goto checks;
		}
		if (si->flags & SWP_DISCARDABLE) {
			/*
			 * A non-zero lowest_alloc means a cluster search with
			 * discard tracking is already in progress; do not
			 * start a second one.  Otherwise begin tracking the
			 * range of slots allocated while we scan unlocked.
			 */
			if (si->lowest_alloc)
				goto checks;
			si->lowest_alloc = si->max;
			si->highest_alloc = 0;
		}
		spin_unlock(&swap_lock);

		/*
		 * Unless the area is flagged SWP_SOLIDSTATE, restart the
		 * search from the lowest possibly-free slot instead of
		 * continuing from cluster_next.
		 */
		if (!(si->flags & SWP_SOLIDSTATE))
			scan_base = offset = si->lowest_bit;
		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;

		/* Look for SWAPFILE_CLUSTER consecutive free slots. */
		for (; last_in_cluster <= si->highest_bit; offset++) {
			if (si->swap_map[offset])
				last_in_cluster = offset + SWAPFILE_CLUSTER;
			else if (offset == last_in_cluster) {
				spin_lock(&swap_lock);
				offset -= SWAPFILE_CLUSTER - 1;
				si->cluster_next = offset;
				si->cluster_nr = SWAPFILE_CLUSTER - 1;
				found_free_cluster = 1;
				goto checks;
			}
			if (unlikely(--latency_ration < 0)) {
				cond_resched();
				latency_ration = LATENCY_LIMIT;
			}
		}

		/* Wrap around: repeat the search below scan_base. */
		offset = si->lowest_bit;
		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;

		for (; last_in_cluster < scan_base; offset++) {
			if (si->swap_map[offset])
				last_in_cluster = offset + SWAPFILE_CLUSTER;
			else if (offset == last_in_cluster) {
				spin_lock(&swap_lock);
				offset -= SWAPFILE_CLUSTER - 1;
				si->cluster_next = offset;
				si->cluster_nr = SWAPFILE_CLUSTER - 1;
				found_free_cluster = 1;
				goto checks;
			}
			if (unlikely(--latency_ration < 0)) {
				cond_resched();
				latency_ration = LATENCY_LIMIT;
			}
		}

		/* No free cluster: fall back to a slot-by-slot search. */
		offset = scan_base;
		spin_lock(&swap_lock);
		si->cluster_nr = SWAPFILE_CLUSTER - 1;
		si->lowest_alloc = 0;
	}

checks:
	/* swap_lock is held here; revalidate the area and the candidate. */
	if (!(si->flags & SWP_WRITEOK))
		goto no_page;
	if (!si->highest_bit)
		goto no_page;
	if (offset > si->highest_bit)
		scan_base = offset = si->lowest_bit;

	/*
	 * When swap is nearly full, try to take over a slot that is only
	 * held by the swap cache.  The lock is dropped around the attempt.
	 */
	if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
		int swap_was_freed;
		spin_unlock(&swap_lock);
		swap_was_freed = __try_to_reclaim_swap(si, offset);
		spin_lock(&swap_lock);
		/* Recheck from the top: state may have changed unlocked. */
		if (swap_was_freed)
			goto checks;
		goto scan;
	}

	if (si->swap_map[offset])
		goto scan;

	/* Slot is free: claim it and update the area's bookkeeping. */
	if (offset == si->lowest_bit)
		si->lowest_bit++;
	if (offset == si->highest_bit)
		si->highest_bit--;
	si->inuse_pages++;
	if (si->inuse_pages == si->pages) {
		/* Area is full: make lowest_bit > highest_bit. */
		si->lowest_bit = si->max;
		si->highest_bit = 0;
	}
	si->swap_map[offset] = usage;
	si->cluster_next = offset + 1;
	si->flags -= SWP_SCANNING;

	if (si->lowest_alloc) {
		/* Discard tracking is active for this area. */
		if (found_free_cluster) {
			/*
			 * We found the new cluster: discard it, trimmed so
			 * that slots handed out to racing allocators (as
			 * recorded in lowest_alloc/highest_alloc) are not
			 * discarded.
			 */
			if (offset < si->highest_alloc &&
			    si->lowest_alloc <= last_in_cluster)
				last_in_cluster = si->lowest_alloc - 1;
			si->flags |= SWP_DISCARDING;
			spin_unlock(&swap_lock);

			if (offset < last_in_cluster)
				discard_swap_cluster(si, offset,
					last_in_cluster - offset + 1);

			spin_lock(&swap_lock);
			si->lowest_alloc = 0;
			si->flags &= ~SWP_DISCARDING;

			/* Order the flag clear before waking waiters. */
			smp_mb();
			wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));

		} else if (si->flags & SWP_DISCARDING) {
			/*
			 * Another task is discarding; wait for it to finish
			 * before handing this slot to the caller.
			 */
			spin_unlock(&swap_lock);
			wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
				wait_for_discard, TASK_UNINTERRUPTIBLE);
			spin_lock(&swap_lock);
		} else {
			/*
			 * Another task is still searching for a cluster:
			 * record this allocation in the tracked range.
			 */
			if (offset < si->lowest_alloc)
				si->lowest_alloc = offset;
			if (offset > si->highest_alloc)
				si->highest_alloc = offset;
		}
	}
	return offset;

scan:
	/* Linear search for a candidate, done without swap_lock. */
	spin_unlock(&swap_lock);
	while (++offset <= si->highest_bit) {
		if (!si->swap_map[offset]) {
			spin_lock(&swap_lock);
			goto checks;
		}
		if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
			spin_lock(&swap_lock);
			goto checks;
		}
		if (unlikely(--latency_ration < 0)) {
			cond_resched();
			latency_ration = LATENCY_LIMIT;
		}
	}
	/* Wrap around and search the part below scan_base. */
	offset = si->lowest_bit;
	while (++offset < scan_base) {
		if (!si->swap_map[offset]) {
			spin_lock(&swap_lock);
			goto checks;
		}
		if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
			spin_lock(&swap_lock);
			goto checks;
		}
		if (unlikely(--latency_ration < 0)) {
			cond_resched();
			latency_ration = LATENCY_LIMIT;
		}
	}
	spin_lock(&swap_lock);

no_page:
	si->flags -= SWP_SCANNING;
	return 0;
}
410
/*
 * get_swap_page - allocate a swap slot from the enabled swap areas.
 *
 * Walks the swap_list starting at swap_list.next and asks scan_swap_map()
 * for a slot marked SWAP_HAS_CACHE.  Returns the new swap entry, or an
 * entry with val == 0 when no slot is available.
 */
swp_entry_t get_swap_page(void)
{
	struct swap_info_struct *si;
	pgoff_t offset;
	int type, next;
	int wrapped = 0;

	spin_lock(&swap_lock);
	if (nr_swap_pages <= 0)
		goto noswap;
	/* Reserve one page up front; given back below on failure. */
	nr_swap_pages--;

	for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
		si = swap_info[type];
		next = si->next;
		/*
		 * At the end of the list, or (on the first pass) at the
		 * end of a run of equal-priority areas, continue from the
		 * list head.  Two wraps end the search.
		 */
		if (next < 0 ||
		    (!wrapped && si->prio != swap_info[next]->prio)) {
			next = swap_list.head;
			wrapped++;
		}

		/* Skip areas with no free slot or not open for writing. */
		if (!si->highest_bit)
			continue;
		if (!(si->flags & SWP_WRITEOK))
			continue;

		/* The next allocation starts from the following area. */
		swap_list.next = next;
		/* scan_swap_map() may drop and retake swap_lock. */
		offset = scan_swap_map(si, SWAP_HAS_CACHE);
		if (offset) {
			spin_unlock(&swap_lock);
			return swp_entry(type, offset);
		}
		/* Reload: the list may have changed while unlocked. */
		next = swap_list.next;
	}

	nr_swap_pages++;
noswap:
	spin_unlock(&swap_lock);
	return (swp_entry_t) {0};
}
452
453
454swp_entry_t get_swap_page_of_type(int type)
455{
456 struct swap_info_struct *si;
457 pgoff_t offset;
458
459 spin_lock(&swap_lock);
460 si = swap_info[type];
461 if (si && (si->flags & SWP_WRITEOK)) {
462 nr_swap_pages--;
463
464 offset = scan_swap_map(si, 1);
465 if (offset) {
466 spin_unlock(&swap_lock);
467 return swp_entry(type, offset);
468 }
469 nr_swap_pages++;
470 }
471 spin_unlock(&swap_lock);
472 return (swp_entry_t) {0};
473}
474
475static struct swap_info_struct *swap_info_get(swp_entry_t entry)
476{
477 struct swap_info_struct *p;
478 unsigned long offset, type;
479
480 if (!entry.val)
481 goto out;
482 type = swp_type(entry);
483 if (type >= nr_swapfiles)
484 goto bad_nofile;
485 p = swap_info[type];
486 if (!(p->flags & SWP_USED))
487 goto bad_device;
488 offset = swp_offset(entry);
489 if (offset >= p->max)
490 goto bad_offset;
491 if (!p->swap_map[offset])
492 goto bad_free;
493 spin_lock(&swap_lock);
494 return p;
495
496bad_free:
497 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
498 goto out;
499bad_offset:
500 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val);
501 goto out;
502bad_device:
503 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
504 goto out;
505bad_nofile:
506 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
507out:
508 return NULL;
509}
510
/*
 * swap_entry_free - drop one reference of kind @usage from a swap slot.
 * @p:     swap area owning the slot
 * @entry: swap entry identifying the slot
 * @usage: SWAP_HAS_CACHE to drop the swap-cache reference, anything else
 *         to drop one map-count reference
 *
 * Returns the new swap_map byte (count bits | SWAP_HAS_CACHE flag).  When
 * it reaches zero the slot is returned to the free pool.  All callers in
 * this file hold swap_lock (obtained through swap_info_get()).
 */
static unsigned char swap_entry_free(struct swap_info_struct *p,
				     swp_entry_t entry, unsigned char usage)
{
	unsigned long offset = swp_offset(entry);
	unsigned char count;
	unsigned char has_cache;

	/* Split the map byte into its cache flag and its count bits. */
	count = p->swap_map[offset];
	has_cache = count & SWAP_HAS_CACHE;
	count &= ~SWAP_HAS_CACHE;

	if (usage == SWAP_HAS_CACHE) {
		VM_BUG_ON(!has_cache);
		has_cache = 0;
	} else if (count == SWAP_MAP_SHMEM) {
		/* A SWAP_MAP_SHMEM count is released in a single step. */
		count = 0;
	} else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
		if (count == COUNT_CONTINUED) {
			/*
			 * Low bits are zero but a continuation exists: let
			 * swap_count_continued() update the continuation and
			 * refill the low bits to SWAP_MAP_MAX, keeping the
			 * flag only if it reports more continuation remains.
			 */
			if (swap_count_continued(p, offset, count))
				count = SWAP_MAP_MAX | COUNT_CONTINUED;
			else
				count = SWAP_MAP_MAX;
		} else
			count--;
	}
	/* Counts above SWAP_MAP_MAX are left untouched. */

	if (!count)
		mem_cgroup_uncharge_swap(entry);

	usage = count | has_cache;
	p->swap_map[offset] = usage;

	/* Neither count nor cache reference left: the slot is free. */
	if (!usage) {
		struct gendisk *disk = p->bdev->bd_disk;
		if (offset < p->lowest_bit)
			p->lowest_bit = offset;
		if (offset > p->highest_bit)
			p->highest_bit = offset;
		/* Prefer this area next if it outranks the current one. */
		if (swap_list.next >= 0 &&
		    p->prio > swap_info[swap_list.next]->prio)
			swap_list.next = p->type;
		nr_swap_pages++;
		p->inuse_pages--;
		/* Tell the block driver, if it asked to be notified. */
		if ((p->flags & SWP_BLKDEV) &&
				disk->fops->swap_slot_free_notify)
			disk->fops->swap_slot_free_notify(p->bdev, offset);
	}

	return usage;
}
566
567
568
569
570
571void swap_free(swp_entry_t entry)
572{
573 struct swap_info_struct *p;
574
575 p = swap_info_get(entry);
576 if (p) {
577 swap_entry_free(p, entry, 1);
578 spin_unlock(&swap_lock);
579 }
580}
581
582
583
584
585void swapcache_free(swp_entry_t entry, struct page *page)
586{
587 struct swap_info_struct *p;
588 unsigned char count;
589
590 p = swap_info_get(entry);
591 if (p) {
592 count = swap_entry_free(p, entry, SWAP_HAS_CACHE);
593 if (page)
594 mem_cgroup_uncharge_swapcache(page, entry, count != 0);
595 spin_unlock(&swap_lock);
596 }
597}
598
599
600
601
602
603
604static inline int page_swapcount(struct page *page)
605{
606 int count = 0;
607 struct swap_info_struct *p;
608 swp_entry_t entry;
609
610 entry.val = page_private(page);
611 p = swap_info_get(entry);
612 if (p) {
613 count = swap_count(p->swap_map[swp_offset(entry)]);
614 spin_unlock(&swap_lock);
615 }
616 return count;
617}
618
619
620
621
622
623
624
/*
 * Decide whether the locked @page has at most one user, counting both its
 * page-table mappings and, for a swap cache page, its swap references.
 *
 * When the total is exactly one and the page is not under writeback, the
 * page is also removed from the swap cache and marked dirty.
 *
 * Returns non-zero if the total is <= 1, 0 otherwise (always 0 for KSM
 * pages).
 */
int reuse_swap_page(struct page *page)
{
	int users;

	VM_BUG_ON(!PageLocked(page));
	if (unlikely(PageKsm(page)))
		return 0;

	users = page_mapcount(page);
	if (users > 1 || !PageSwapCache(page))
		return users <= 1;

	users += page_swapcount(page);
	if (users == 1 && !PageWriteback(page)) {
		delete_from_swap_cache(page);
		SetPageDirty(page);
	}
	return users <= 1;
}
642
643
644
645
646
647int try_to_free_swap(struct page *page)
648{
649 VM_BUG_ON(!PageLocked(page));
650
651 if (!PageSwapCache(page))
652 return 0;
653 if (PageWriteback(page))
654 return 0;
655 if (page_swapcount(page))
656 return 0;
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673 if (!(gfp_allowed_mask & __GFP_IO))
674 return 0;
675
676 delete_from_swap_cache(page);
677 SetPageDirty(page);
678 return 1;
679}
680
681
682
683
684
685int free_swap_and_cache(swp_entry_t entry)
686{
687 struct swap_info_struct *p;
688 struct page *page = NULL;
689
690 if (non_swap_entry(entry))
691 return 1;
692
693 p = swap_info_get(entry);
694 if (p) {
695 if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
696 page = find_get_page(&swapper_space, entry.val);
697 if (page && !trylock_page(page)) {
698 page_cache_release(page);
699 page = NULL;
700 }
701 }
702 spin_unlock(&swap_lock);
703 }
704 if (page) {
705
706
707
708
709 if (PageSwapCache(page) && !PageWriteback(page) &&
710 (!page_mapped(page) || vm_swap_full())) {
711 delete_from_swap_cache(page);
712 SetPageDirty(page);
713 }
714 unlock_page(page);
715 page_cache_release(page);
716 }
717 return p != NULL;
718}
719
720#ifdef CONFIG_CGROUP_MEM_RES_CTLR
721
722
723
724
725
726
727
728
729
730
731int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
732{
733 struct page *page;
734 struct swap_info_struct *p;
735 int count = 0;
736
737 page = find_get_page(&swapper_space, ent.val);
738 if (page)
739 count += page_mapcount(page);
740 p = swap_info_get(ent);
741 if (p) {
742 count += swap_count(p->swap_map[swp_offset(ent)]);
743 spin_unlock(&swap_lock);
744 }
745
746 *pagep = page;
747 return count;
748}
749#endif
750
751#ifdef CONFIG_HIBERNATION
752
753
754
755
756
757
758
759
760int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
761{
762 struct block_device *bdev = NULL;
763 int type;
764
765 if (device)
766 bdev = bdget(device);
767
768 spin_lock(&swap_lock);
769 for (type = 0; type < nr_swapfiles; type++) {
770 struct swap_info_struct *sis = swap_info[type];
771
772 if (!(sis->flags & SWP_WRITEOK))
773 continue;
774
775 if (!bdev) {
776 if (bdev_p)
777 *bdev_p = bdgrab(sis->bdev);
778
779 spin_unlock(&swap_lock);
780 return type;
781 }
782 if (bdev == sis->bdev) {
783 struct swap_extent *se = &sis->first_swap_extent;
784
785 if (se->start_block == offset) {
786 if (bdev_p)
787 *bdev_p = bdgrab(sis->bdev);
788
789 spin_unlock(&swap_lock);
790 bdput(bdev);
791 return type;
792 }
793 }
794 }
795 spin_unlock(&swap_lock);
796 if (bdev)
797 bdput(bdev);
798
799 return -ENODEV;
800}
801
802
803
804
805
806sector_t swapdev_block(int type, pgoff_t offset)
807{
808 struct block_device *bdev;
809
810 if ((unsigned int)type >= nr_swapfiles)
811 return 0;
812 if (!(swap_info[type]->flags & SWP_WRITEOK))
813 return 0;
814 return map_swap_entry(swp_entry(type, offset), &bdev);
815}
816
817
818
819
820
821
822
823unsigned int count_swap_pages(int type, int free)
824{
825 unsigned int n = 0;
826
827 spin_lock(&swap_lock);
828 if ((unsigned int)type < nr_swapfiles) {
829 struct swap_info_struct *sis = swap_info[type];
830
831 if (sis->flags & SWP_WRITEOK) {
832 n = sis->pages;
833 if (free)
834 n -= sis->inuse_pages;
835 }
836 }
837 spin_unlock(&swap_lock);
838 return n;
839}
840#endif
841
842
843
844
845
846
847static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
848 unsigned long addr, swp_entry_t entry, struct page *page)
849{
850 struct mem_cgroup *ptr;
851 spinlock_t *ptl;
852 pte_t *pte;
853 int ret = 1;
854
855 if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) {
856 ret = -ENOMEM;
857 goto out_nolock;
858 }
859
860 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
861 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
862 if (ret > 0)
863 mem_cgroup_cancel_charge_swapin(ptr);
864 ret = 0;
865 goto out;
866 }
867
868 dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
869 inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
870 get_page(page);
871 set_pte_at(vma->vm_mm, addr, pte,
872 pte_mkold(mk_pte(page, vma->vm_page_prot)));
873 page_add_anon_rmap(page, vma, addr);
874 mem_cgroup_commit_charge_swapin(page, ptr);
875 swap_free(entry);
876
877
878
879
880 activate_page(page);
881out:
882 pte_unmap_unlock(pte, ptl);
883out_nolock:
884 return ret;
885}
886
/*
 * unuse_pte_range - scan the ptes of one pmd for swap entry @entry.
 *
 * Walks [@addr, @end) and calls unuse_pte() for the first pte that equals
 * the swap pte of @entry.  Returns unuse_pte()'s non-zero result (1 on
 * replacement, negative on error) as soon as there is one, otherwise 0.
 */
static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end,
				swp_entry_t entry, struct page *page)
{
	pte_t swp_pte = swp_entry_to_pte(entry);
	pte_t *pte;
	int ret = 0;

	/*
	 * The ptes are compared without the page table lock; unuse_pte()
	 * takes the lock and rechecks before changing anything.
	 */
	pte = pte_offset_map(pmd, addr);
	do {
		if (unlikely(pte_same(*pte, swp_pte))) {
			/* unuse_pte() maps the pte itself: unmap first. */
			pte_unmap(pte);
			ret = unuse_pte(vma, pmd, addr, entry, page);
			if (ret)
				goto out;
			/* Not replaced after all: remap and keep scanning. */
			pte = pte_offset_map(pmd, addr);
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);
	/* pte was advanced one past the last entry examined. */
	pte_unmap(pte - 1);
out:
	return ret;
}
922
923static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
924 unsigned long addr, unsigned long end,
925 swp_entry_t entry, struct page *page)
926{
927 pmd_t *pmd;
928 unsigned long next;
929 int ret;
930
931 pmd = pmd_offset(pud, addr);
932 do {
933 next = pmd_addr_end(addr, end);
934 if (unlikely(pmd_trans_huge(*pmd)))
935 continue;
936 if (pmd_none_or_clear_bad(pmd))
937 continue;
938 ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
939 if (ret)
940 return ret;
941 } while (pmd++, addr = next, addr != end);
942 return 0;
943}
944
945static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
946 unsigned long addr, unsigned long end,
947 swp_entry_t entry, struct page *page)
948{
949 pud_t *pud;
950 unsigned long next;
951 int ret;
952
953 pud = pud_offset(pgd, addr);
954 do {
955 next = pud_addr_end(addr, end);
956 if (pud_none_or_clear_bad(pud))
957 continue;
958 ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
959 if (ret)
960 return ret;
961 } while (pud++, addr = next, addr != end);
962 return 0;
963}
964
965static int unuse_vma(struct vm_area_struct *vma,
966 swp_entry_t entry, struct page *page)
967{
968 pgd_t *pgd;
969 unsigned long addr, end, next;
970 int ret;
971
972 if (page_anon_vma(page)) {
973 addr = page_address_in_vma(page, vma);
974 if (addr == -EFAULT)
975 return 0;
976 else
977 end = addr + PAGE_SIZE;
978 } else {
979 addr = vma->vm_start;
980 end = vma->vm_end;
981 }
982
983 pgd = pgd_offset(vma->vm_mm, addr);
984 do {
985 next = pgd_addr_end(addr, end);
986 if (pgd_none_or_clear_bad(pgd))
987 continue;
988 ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
989 if (ret)
990 return ret;
991 } while (pgd++, addr = next, addr != end);
992 return 0;
993}
994
995static int unuse_mm(struct mm_struct *mm,
996 swp_entry_t entry, struct page *page)
997{
998 struct vm_area_struct *vma;
999 int ret = 0;
1000
1001 if (!down_read_trylock(&mm->mmap_sem)) {
1002
1003
1004
1005
1006 activate_page(page);
1007 unlock_page(page);
1008 down_read(&mm->mmap_sem);
1009 lock_page(page);
1010 }
1011 for (vma = mm->mmap; vma; vma = vma->vm_next) {
1012 if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
1013 break;
1014 }
1015 up_read(&mm->mmap_sem);
1016 return (ret < 0)? ret: 0;
1017}
1018
1019
1020
1021
1022
1023static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1024 unsigned int prev)
1025{
1026 unsigned int max = si->max;
1027 unsigned int i = prev;
1028 unsigned char count;
1029
1030
1031
1032
1033
1034
1035
1036 for (;;) {
1037 if (++i >= max) {
1038 if (!prev) {
1039 i = 0;
1040 break;
1041 }
1042
1043
1044
1045
1046 max = prev + 1;
1047 prev = 0;
1048 i = 1;
1049 }
1050 count = si->swap_map[i];
1051 if (count && swap_count(count) != SWAP_MAP_BAD)
1052 break;
1053 }
1054 return i;
1055}
1056
1057
1058
1059
1060
1061
/*
 * try_to_unuse - bring every page of swap area @type back into memory.
 *
 * For each in-use slot, the page is read into the swap cache and every
 * reference to the slot found in the address spaces on the mmlist is
 * replaced with the page, after which the page is removed from the swap
 * cache.
 *
 * Returns 0 on success, -EINTR if a signal is pending, -ENOMEM if a page
 * could not be read in, or a negative error from shmem_unuse()/unuse_mm().
 */
static int try_to_unuse(unsigned int type)
{
	struct swap_info_struct *si = swap_info[type];
	struct mm_struct *start_mm;
	unsigned char *swap_map;
	unsigned char swcount;
	struct page *page;
	swp_entry_t entry;
	unsigned int i = 0;
	int retval = 0;

	/*
	 * start_mm is the mm where the mmlist search begins for each slot.
	 * It starts as init_mm and is moved to an mm in which a reference
	 * was recently found.  A reference on mm_users is held on whatever
	 * start_mm points to.
	 */
	start_mm = &init_mm;
	atomic_inc(&init_mm.mm_users);

	/* find_next_to_unuse() returns 0 once no slot is in use. */
	while ((i = find_next_to_unuse(si, i)) != 0) {
		if (signal_pending(current)) {
			retval = -EINTR;
			break;
		}

		/* Get the slot's page into the swap cache. */
		swap_map = &si->swap_map[i];
		entry = swp_entry(type, i);
		page = read_swap_cache_async(entry,
					GFP_HIGHUSER_MOVABLE, NULL, 0);
		if (!page) {
			/*
			 * No page: either the slot was freed in the
			 * meantime (map byte now zero), which is fine, or
			 * the read-in failed.
			 */
			if (!*swap_map)
				continue;
			retval = -ENOMEM;
			break;
		}

		/* We hold the last reference to start_mm: reset it. */
		if (atomic_read(&start_mm->mm_users) == 1) {
			mmput(start_mm);
			start_mm = &init_mm;
			atomic_inc(&init_mm.mm_users);
		}

		/*
		 * Wait for any read or write on the page to finish, then
		 * lock it and wait once more for writeback started before
		 * the lock was obtained.
		 */
		wait_on_page_locked(page);
		wait_on_page_writeback(page);
		lock_page(page);
		wait_on_page_writeback(page);

		/* Remove all references to the entry. */
		swcount = *swap_map;
		if (swap_count(swcount) == SWAP_MAP_SHMEM) {
			retval = shmem_unuse(entry, page);
			/*
			 * NOTE(review): this path does not unlock or release
			 * the page here; presumably shmem_unuse() has already
			 * done so - confirm.
			 */
			if (retval < 0)
				break;
			continue;
		}
		if (swap_count(swcount) && start_mm != &init_mm)
			retval = unuse_mm(start_mm, entry, page);

		if (swap_count(*swap_map)) {
			/*
			 * References remain: walk the mmlist from start_mm.
			 * prev_mm keeps the current list position pinned
			 * while mmlist_lock is dropped; new_start_mm is the
			 * candidate start for the next slot.
			 */
			int set_start_mm = (*swap_map >= swcount);
			struct list_head *p = &start_mm->mmlist;
			struct mm_struct *new_start_mm = start_mm;
			struct mm_struct *prev_mm = start_mm;
			struct mm_struct *mm;

			atomic_inc(&new_start_mm->mm_users);
			atomic_inc(&prev_mm->mm_users);
			spin_lock(&mmlist_lock);
			while (swap_count(*swap_map) && !retval &&
					(p = p->next) != &start_mm->mmlist) {
				mm = list_entry(p, struct mm_struct, mmlist);
				/* Skip an mm whose users are already gone. */
				if (!atomic_inc_not_zero(&mm->mm_users))
					continue;
				spin_unlock(&mmlist_lock);
				mmput(prev_mm);
				prev_mm = mm;

				cond_resched();

				swcount = *swap_map;
				if (!swap_count(swcount)) /* any usage ? */
					;
				else if (mm == &init_mm)
					set_start_mm = 1;
				else
					retval = unuse_mm(mm, entry, page);

				/* The count dropped in this mm: start here next. */
				if (set_start_mm && *swap_map < swcount) {
					mmput(new_start_mm);
					atomic_inc(&mm->mm_users);
					new_start_mm = mm;
					set_start_mm = 0;
				}
				spin_lock(&mmlist_lock);
			}
			spin_unlock(&mmlist_lock);
			mmput(prev_mm);
			mmput(start_mm);
			start_mm = new_start_mm;
		}
		if (retval) {
			unlock_page(page);
			page_cache_release(page);
			break;
		}

		/*
		 * The slot still has a map count although the walk is done.
		 * If the page is dirty and in the swap cache, write it back
		 * to swap first.  The page is relocked afterwards, so
		 * swap_writepage() evidently returns with it unlocked.
		 */
		if (swap_count(*swap_map) &&
		     PageDirty(page) && PageSwapCache(page)) {
			struct writeback_control wbc = {
				.sync_mode = WB_SYNC_NONE,
			};

			swap_writepage(page, &wbc);
			lock_page(page);
			wait_on_page_writeback(page);
		}

		/*
		 * Drop the page from the swap cache, but only if it is
		 * still the cache page for this entry.
		 */
		if (PageSwapCache(page) &&
		    likely(page_private(page) == entry.val))
			delete_from_swap_cache(page);

		/*
		 * The page no longer has a copy in swap, so it must be
		 * treated as dirty.
		 */
		SetPageDirty(page);
		unlock_page(page);
		page_cache_release(page);

		/* Give other tasks a chance to run between slots. */
		cond_resched();
	}

	mmput(start_mm);
	return retval;
}
1267
1268
1269
1270
1271
1272
1273
1274static void drain_mmlist(void)
1275{
1276 struct list_head *p, *next;
1277 unsigned int type;
1278
1279 for (type = 0; type < nr_swapfiles; type++)
1280 if (swap_info[type]->inuse_pages)
1281 return;
1282 spin_lock(&mmlist_lock);
1283 list_for_each_safe(p, next, &init_mm.mmlist)
1284 list_del_init(p);
1285 spin_unlock(&mmlist_lock);
1286}
1287
1288
1289
1290
1291
1292
1293
1294static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
1295{
1296 struct swap_info_struct *sis;
1297 struct swap_extent *start_se;
1298 struct swap_extent *se;
1299 pgoff_t offset;
1300
1301 sis = swap_info[swp_type(entry)];
1302 *bdev = sis->bdev;
1303
1304 offset = swp_offset(entry);
1305 start_se = sis->curr_swap_extent;
1306 se = start_se;
1307
1308 for ( ; ; ) {
1309 struct list_head *lh;
1310
1311 if (se->start_page <= offset &&
1312 offset < (se->start_page + se->nr_pages)) {
1313 return se->start_block + (offset - se->start_page);
1314 }
1315 lh = se->list.next;
1316 se = list_entry(lh, struct swap_extent, list);
1317 sis->curr_swap_extent = se;
1318 BUG_ON(se == start_se);
1319 }
1320}
1321
1322
1323
1324
1325sector_t map_swap_page(struct page *page, struct block_device **bdev)
1326{
1327 swp_entry_t entry;
1328 entry.val = page_private(page);
1329 return map_swap_entry(entry, bdev);
1330}
1331
1332
1333
1334
1335static void destroy_swap_extents(struct swap_info_struct *sis)
1336{
1337 while (!list_empty(&sis->first_swap_extent.list)) {
1338 struct swap_extent *se;
1339
1340 se = list_entry(sis->first_swap_extent.list.next,
1341 struct swap_extent, list);
1342 list_del(&se->list);
1343 kfree(se);
1344 }
1345}
1346
1347
1348
1349
1350
1351
1352
1353static int
1354add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1355 unsigned long nr_pages, sector_t start_block)
1356{
1357 struct swap_extent *se;
1358 struct swap_extent *new_se;
1359 struct list_head *lh;
1360
1361 if (start_page == 0) {
1362 se = &sis->first_swap_extent;
1363 sis->curr_swap_extent = se;
1364 se->start_page = 0;
1365 se->nr_pages = nr_pages;
1366 se->start_block = start_block;
1367 return 1;
1368 } else {
1369 lh = sis->first_swap_extent.list.prev;
1370 se = list_entry(lh, struct swap_extent, list);
1371 BUG_ON(se->start_page + se->nr_pages != start_page);
1372 if (se->start_block + se->nr_pages == start_block) {
1373
1374 se->nr_pages += nr_pages;
1375 return 0;
1376 }
1377 }
1378
1379
1380
1381
1382 new_se = kmalloc(sizeof(*se), GFP_KERNEL);
1383 if (new_se == NULL)
1384 return -ENOMEM;
1385 new_se->start_page = start_page;
1386 new_se->nr_pages = nr_pages;
1387 new_se->start_block = start_block;
1388
1389 list_add_tail(&new_se->list, &sis->first_swap_extent.list);
1390 return 1;
1391}
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
/*
 * setup_swap_extents - build the page -> disk block extent list of @sis.
 * @sis:  swap area being set up
 * @span: receives the extent of disk covered, in pages
 *
 * A block device gets a single extent covering the whole area.  For a
 * regular file, bmap() is used to find runs of file blocks that are
 * page-aligned and contiguous on disk; each such page is added through
 * add_swap_extent(), and sis->max, ->pages and ->highest_bit are trimmed
 * to the number of usable pages found.
 *
 * Returns the number of extents added, -EINVAL if the file has holes, or
 * a negative error from add_swap_extent().
 */
static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
{
	struct inode *inode;
	unsigned blocks_per_page;
	unsigned long page_no;
	unsigned blkbits;
	sector_t probe_block;
	sector_t last_block;
	sector_t lowest_block = -1;
	sector_t highest_block = 0;
	int nr_extents = 0;
	int ret;

	inode = sis->swap_file->f_mapping->host;
	if (S_ISBLK(inode->i_mode)) {
		/* Block device: one identity-mapped extent. */
		ret = add_swap_extent(sis, 0, sis->max, 0);
		*span = sis->pages;
		goto out;
	}

	blkbits = inode->i_blkbits;
	blocks_per_page = PAGE_SIZE >> blkbits;

	/*
	 * Probe the file page by page; page_no counts the usable pages
	 * found so far and probe_block is the file block being examined.
	 */
	probe_block = 0;
	page_no = 0;
	last_block = i_size_read(inode) >> blkbits;
	while ((probe_block + blocks_per_page) <= last_block &&
			page_no < sis->max) {
		unsigned block_in_page;
		sector_t first_block;

		/* A zero result from bmap() is treated as a hole. */
		first_block = bmap(inode, probe_block);
		if (first_block == 0)
			goto bad_bmap;

		/* The disk block must be page-aligned; else try the next. */
		if (first_block & (blocks_per_page - 1)) {
			probe_block++;
			goto reprobe;
		}

		/* The rest of the page must follow contiguously on disk. */
		for (block_in_page = 1; block_in_page < blocks_per_page;
					block_in_page++) {
			sector_t block;

			block = bmap(inode, probe_block + block_in_page);
			if (block == 0)
				goto bad_bmap;
			if (block != first_block + block_in_page) {
				/* Discontiguity */
				probe_block++;
				goto reprobe;
			}
		}

		/* Convert from filesystem blocks to page-sized units. */
		first_block >>= (PAGE_SHIFT - blkbits);
		if (page_no) {	/* page 0 is left out of the span */
			if (first_block < lowest_block)
				lowest_block = first_block;
			if (first_block > highest_block)
				highest_block = first_block;
		}

		/* add_swap_extent() merges with the previous extent if it can. */
		ret = add_swap_extent(sis, page_no, 1, first_block);
		if (ret < 0)
			goto out;
		nr_extents += ret;
		page_no++;
		probe_block += blocks_per_page;
reprobe:
		continue;
	}
	ret = nr_extents;
	*span = 1 + highest_block - lowest_block;
	if (page_no == 0)
		page_no = 1;	/* keeps sis->pages from underflowing below */
	sis->max = page_no;
	sis->pages = page_no - 1;
	sis->highest_bit = page_no - 1;
out:
	return ret;
bad_bmap:
	printk(KERN_ERR "swapon: swapfile has holes\n");
	ret = -EINVAL;
	goto out;
}
1519
1520static void enable_swap_info(struct swap_info_struct *p, int prio,
1521 unsigned char *swap_map)
1522{
1523 int i, prev;
1524
1525 spin_lock(&swap_lock);
1526 if (prio >= 0)
1527 p->prio = prio;
1528 else
1529 p->prio = --least_priority;
1530 p->swap_map = swap_map;
1531 p->flags |= SWP_WRITEOK;
1532 nr_swap_pages += p->pages;
1533 total_swap_pages += p->pages;
1534
1535
1536 prev = -1;
1537 for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
1538 if (p->prio >= swap_info[i]->prio)
1539 break;
1540 prev = i;
1541 }
1542 p->next = i;
1543 if (prev < 0)
1544 swap_list.head = swap_list.next = p->type;
1545 else
1546 swap_info[prev]->next = p->type;
1547 spin_unlock(&swap_lock);
1548}
1549
/*
 * swapoff(2): take the swap area backed by @specialfile out of service.
 *
 * The area is first unlinked from swap_list and stripped of SWP_WRITEOK,
 * then try_to_unuse() is asked to empty it.  If that fails the area is
 * re-enabled; otherwise its extents, count continuations, swap_map and
 * backing file are released.
 *
 * Returns 0 on success, -EPERM without CAP_SYS_ADMIN, -EINVAL if the file
 * is not an active swap area, -ENOMEM if security_vm_enough_memory()
 * refuses, or whatever getname()/filp_open()/try_to_unuse() reported.
 */
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
	struct swap_info_struct *p = NULL;
	unsigned char *swap_map;
	struct file *swap_file, *victim;
	struct address_space *mapping;
	struct inode *inode;
	char *pathname;
	int oom_score_adj;
	int i, type, prev;
	int err;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	pathname = getname(specialfile);
	err = PTR_ERR(pathname);
	if (IS_ERR(pathname))
		goto out;

	/* Open the path only to learn which address_space it refers to. */
	victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0);
	putname(pathname);
	err = PTR_ERR(victim);
	if (IS_ERR(victim))
		goto out;

	mapping = victim->f_mapping;
	prev = -1;
	spin_lock(&swap_lock);
	/* Look for a writable swap area that shares the victim's mapping. */
	for (type = swap_list.head; type >= 0; type = swap_info[type]->next) {
		p = swap_info[type];
		if (p->flags & SWP_WRITEOK) {
			if (p->swap_file->f_mapping == mapping)
				break;
		}
		prev = type;
	}
	if (type < 0) {
		err = -EINVAL;
		spin_unlock(&swap_lock);
		goto out_dput;
	}
	/*
	 * A zero return means the check passed; the charge it made is
	 * dropped again straight away, so this is purely a feasibility test.
	 * NOTE(review): presumably "can p->pages be accommodated without
	 * this area" - confirm against security_vm_enough_memory().
	 */
	if (!security_vm_enough_memory(p->pages))
		vm_unacct_memory(p->pages);
	else {
		err = -ENOMEM;
		spin_unlock(&swap_lock);
		goto out_dput;
	}
	/* Unlink the area from the priority-ordered list. */
	if (prev < 0)
		swap_list.head = p->next;
	else
		swap_info[prev]->next = p->next;
	if (type == swap_list.next) {
		/* The allocation cursor pointed at us: restart from the head. */
		swap_list.next = swap_list.head;
	}
	if (p->prio < 0) {
		/*
		 * Automatic (negative) priority: hand our priority to the
		 * next area and shift every later one up by one, so the
		 * automatic priorities stay contiguous.  p->prio itself is
		 * decremented along the way.
		 */
		for (i = p->next; i >= 0; i = swap_info[i]->next)
			swap_info[i]->prio = p->prio--;
		least_priority++;
	}
	nr_swap_pages -= p->pages;
	total_swap_pages -= p->pages;
	p->flags &= ~SWP_WRITEOK;
	spin_unlock(&swap_lock);

	/*
	 * Raise this task's OOM score to the maximum while it pulls pages
	 * back in, then hand the previous value to
	 * compare_swap_oom_score_adj() afterwards.
	 */
	oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
	err = try_to_unuse(type);
	compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj);

	if (err) {
		/*
		 * Could not empty the area: put it back into service.  If
		 * p->prio is negative, enable_swap_info() assigns a fresh
		 * automatic priority from least_priority.
		 */
		enable_swap_info(p, p->prio, p->swap_map);
		goto out_dput;
	}

	destroy_swap_extents(p);
	if (p->flags & SWP_CONTINUED)
		free_swap_count_continuations(p);

	mutex_lock(&swapon_mutex);
	spin_lock(&swap_lock);
	drain_mmlist();

	/*
	 * Wait, dropping swap_lock while sleeping, until no flag at or above
	 * SWP_SCANNING remains set on the area.
	 * NOTE(review): highest_bit = 0 presumably makes any scanner still in
	 * progress give up - confirm against scan_swap_map().
	 */
	p->highest_bit = 0;
	while (p->flags >= SWP_SCANNING) {
		spin_unlock(&swap_lock);
		schedule_timeout_uninterruptible(1);
		spin_lock(&swap_lock);
	}

	/* Detach file and map under the locks; release them after unlocking. */
	swap_file = p->swap_file;
	p->swap_file = NULL;
	p->max = 0;
	swap_map = p->swap_map;
	p->swap_map = NULL;
	p->flags = 0;
	spin_unlock(&swap_lock);
	mutex_unlock(&swapon_mutex);
	vfree(swap_map);

	swap_cgroup_swapoff(type);

	inode = mapping->host;
	if (S_ISBLK(inode->i_mode)) {
		/* Undo claim_swapfile(): restore block size, drop exclusive open. */
		struct block_device *bdev = I_BDEV(inode);
		set_blocksize(bdev, p->old_block_size);
		blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
	} else {
		mutex_lock(&inode->i_mutex);
		inode->i_flags &= ~S_SWAPFILE;
		mutex_unlock(&inode->i_mutex);
	}
	filp_close(swap_file, NULL);
	err = 0;
	/* Tell pollers of /proc/swaps that the list changed. */
	atomic_inc(&proc_poll_event);
	wake_up_interruptible(&proc_poll_wait);

out_dput:
	filp_close(victim, NULL);
out:
	return err;
}
1681
1682#ifdef CONFIG_PROC_FS
1683static unsigned swaps_poll(struct file *file, poll_table *wait)
1684{
1685 struct seq_file *seq = file->private_data;
1686
1687 poll_wait(file, &proc_poll_wait, wait);
1688
1689 if (seq->poll_event != atomic_read(&proc_poll_event)) {
1690 seq->poll_event = atomic_read(&proc_poll_event);
1691 return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
1692 }
1693
1694 return POLLIN | POLLRDNORM;
1695}
1696
1697
1698static void *swap_start(struct seq_file *swap, loff_t *pos)
1699{
1700 struct swap_info_struct *si;
1701 int type;
1702 loff_t l = *pos;
1703
1704 mutex_lock(&swapon_mutex);
1705
1706 if (!l)
1707 return SEQ_START_TOKEN;
1708
1709 for (type = 0; type < nr_swapfiles; type++) {
1710 smp_rmb();
1711 si = swap_info[type];
1712 if (!(si->flags & SWP_USED) || !si->swap_map)
1713 continue;
1714 if (!--l)
1715 return si;
1716 }
1717
1718 return NULL;
1719}
1720
1721static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
1722{
1723 struct swap_info_struct *si = v;
1724 int type;
1725
1726 if (v == SEQ_START_TOKEN)
1727 type = 0;
1728 else
1729 type = si->type + 1;
1730
1731 for (; type < nr_swapfiles; type++) {
1732 smp_rmb();
1733 si = swap_info[type];
1734 if (!(si->flags & SWP_USED) || !si->swap_map)
1735 continue;
1736 ++*pos;
1737 return si;
1738 }
1739
1740 return NULL;
1741}
1742
/*
 * seq_file ->stop for /proc/swaps: drops the swapon_mutex that
 * swap_start() took.  Both arguments are unused.
 */
static void swap_stop(struct seq_file *swap, void *v)
{
	mutex_unlock(&swapon_mutex);
}
1747
1748static int swap_show(struct seq_file *swap, void *v)
1749{
1750 struct swap_info_struct *si = v;
1751 struct file *file;
1752 int len;
1753
1754 if (si == SEQ_START_TOKEN) {
1755 seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
1756 return 0;
1757 }
1758
1759 file = si->swap_file;
1760 len = seq_path(swap, &file->f_path, " \t\n\\");
1761 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
1762 len < 40 ? 40 - len : 1, " ",
1763 S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
1764 "partition" : "file\t",
1765 si->pages << (PAGE_SHIFT - 10),
1766 si->inuse_pages << (PAGE_SHIFT - 10),
1767 si->prio);
1768 return 0;
1769}
1770
1771static const struct seq_operations swaps_op = {
1772 .start = swap_start,
1773 .next = swap_next,
1774 .stop = swap_stop,
1775 .show = swap_show
1776};
1777
1778static int swaps_open(struct inode *inode, struct file *file)
1779{
1780 struct seq_file *seq;
1781 int ret;
1782
1783 ret = seq_open(file, &swaps_op);
1784 if (ret)
1785 return ret;
1786
1787 seq = file->private_data;
1788 seq->poll_event = atomic_read(&proc_poll_event);
1789 return 0;
1790}
1791
1792static const struct file_operations proc_swaps_operations = {
1793 .open = swaps_open,
1794 .read = seq_read,
1795 .llseek = seq_lseek,
1796 .release = seq_release,
1797 .poll = swaps_poll,
1798};
1799
/*
 * Register the "swaps" entry in procfs at init time.  The result of
 * proc_create() is not checked; the initcall always reports success.
 */
static int __init procswaps_init(void)
{
	proc_create("swaps", 0, NULL, &proc_swaps_operations);
	return 0;
}
__initcall(procswaps_init);
1806#endif
1807
1808#ifdef MAX_SWAPFILES_CHECK
/*
 * Late initcall that expands MAX_SWAPFILES_CHECK(), when that macro is
 * defined; always returns 0.
 * NOTE(review): presumably a build-time sanity check on MAX_SWAPFILES -
 * confirm against the macro's definition.
 */
static int __init max_swapfiles_check(void)
{
	MAX_SWAPFILES_CHECK();
	return 0;
}
late_initcall(max_swapfiles_check);
1815#endif
1816
1817static struct swap_info_struct *alloc_swap_info(void)
1818{
1819 struct swap_info_struct *p;
1820 unsigned int type;
1821
1822 p = kzalloc(sizeof(*p), GFP_KERNEL);
1823 if (!p)
1824 return ERR_PTR(-ENOMEM);
1825
1826 spin_lock(&swap_lock);
1827 for (type = 0; type < nr_swapfiles; type++) {
1828 if (!(swap_info[type]->flags & SWP_USED))
1829 break;
1830 }
1831 if (type >= MAX_SWAPFILES) {
1832 spin_unlock(&swap_lock);
1833 kfree(p);
1834 return ERR_PTR(-EPERM);
1835 }
1836 if (type >= nr_swapfiles) {
1837 p->type = type;
1838 swap_info[type] = p;
1839
1840
1841
1842
1843
1844 smp_wmb();
1845 nr_swapfiles++;
1846 } else {
1847 kfree(p);
1848 p = swap_info[type];
1849
1850
1851
1852
1853 }
1854 INIT_LIST_HEAD(&p->first_swap_extent.list);
1855 p->flags = SWP_USED;
1856 p->next = -1;
1857 spin_unlock(&swap_lock);
1858
1859 return p;
1860}
1861
1862static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
1863{
1864 int error;
1865
1866 if (S_ISBLK(inode->i_mode)) {
1867 p->bdev = bdgrab(I_BDEV(inode));
1868 error = blkdev_get(p->bdev,
1869 FMODE_READ | FMODE_WRITE | FMODE_EXCL,
1870 sys_swapon);
1871 if (error < 0) {
1872 p->bdev = NULL;
1873 return -EINVAL;
1874 }
1875 p->old_block_size = block_size(p->bdev);
1876 error = set_blocksize(p->bdev, PAGE_SIZE);
1877 if (error < 0)
1878 return error;
1879 p->flags |= SWP_BLKDEV;
1880 } else if (S_ISREG(inode->i_mode)) {
1881 p->bdev = inode->i_sb->s_bdev;
1882 mutex_lock(&inode->i_mutex);
1883 if (IS_SWAPFILE(inode))
1884 return -EBUSY;
1885 } else
1886 return -EINVAL;
1887
1888 return 0;
1889}
1890
1891static unsigned long read_swap_header(struct swap_info_struct *p,
1892 union swap_header *swap_header,
1893 struct inode *inode)
1894{
1895 int i;
1896 unsigned long maxpages;
1897 unsigned long swapfilepages;
1898
1899 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
1900 printk(KERN_ERR "Unable to find swap-space signature\n");
1901 return 0;
1902 }
1903
1904
1905 if (swab32(swap_header->info.version) == 1) {
1906 swab32s(&swap_header->info.version);
1907 swab32s(&swap_header->info.last_page);
1908 swab32s(&swap_header->info.nr_badpages);
1909 for (i = 0; i < swap_header->info.nr_badpages; i++)
1910 swab32s(&swap_header->info.badpages[i]);
1911 }
1912
1913 if (swap_header->info.version != 1) {
1914 printk(KERN_WARNING
1915 "Unable to handle swap header version %d\n",
1916 swap_header->info.version);
1917 return 0;
1918 }
1919
1920 p->lowest_bit = 1;
1921 p->cluster_next = 1;
1922 p->cluster_nr = 0;
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939 maxpages = swp_offset(pte_to_swp_entry(
1940 swp_entry_to_pte(swp_entry(0, ~0UL))));
1941 maxpages = swp_offset(radix_to_swp_entry(
1942 swp_to_radix_entry(swp_entry(0, maxpages)))) + 1;
1943
1944 if (maxpages > swap_header->info.last_page) {
1945 maxpages = swap_header->info.last_page + 1;
1946
1947 if ((unsigned int)maxpages == 0)
1948 maxpages = UINT_MAX;
1949 }
1950 p->highest_bit = maxpages - 1;
1951
1952 if (!maxpages)
1953 return 0;
1954 swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
1955 if (swapfilepages && maxpages > swapfilepages) {
1956 printk(KERN_WARNING
1957 "Swap area shorter than signature indicates\n");
1958 return 0;
1959 }
1960 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
1961 return 0;
1962 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
1963 return 0;
1964
1965 return maxpages;
1966}
1967
1968static int setup_swap_map_and_extents(struct swap_info_struct *p,
1969 union swap_header *swap_header,
1970 unsigned char *swap_map,
1971 unsigned long maxpages,
1972 sector_t *span)
1973{
1974 int i;
1975 unsigned int nr_good_pages;
1976 int nr_extents;
1977
1978 nr_good_pages = maxpages - 1;
1979
1980 for (i = 0; i < swap_header->info.nr_badpages; i++) {
1981 unsigned int page_nr = swap_header->info.badpages[i];
1982 if (page_nr == 0 || page_nr > swap_header->info.last_page)
1983 return -EINVAL;
1984 if (page_nr < maxpages) {
1985 swap_map[page_nr] = SWAP_MAP_BAD;
1986 nr_good_pages--;
1987 }
1988 }
1989
1990 if (nr_good_pages) {
1991 swap_map[0] = SWAP_MAP_BAD;
1992 p->max = maxpages;
1993 p->pages = nr_good_pages;
1994 nr_extents = setup_swap_extents(p, span);
1995 if (nr_extents < 0)
1996 return nr_extents;
1997 nr_good_pages = p->pages;
1998 }
1999 if (!nr_good_pages) {
2000 printk(KERN_WARNING "Empty swap-file\n");
2001 return -EINVAL;
2002 }
2003
2004 return nr_extents;
2005}
2006
/*
 * swapon(2): start swapping to the file or block device @specialfile.
 *
 * @swap_flags may carry SWAP_FLAG_PREFER together with a priority in
 * SWAP_FLAG_PRIO_MASK, and SWAP_FLAG_DISCARD.
 *
 * Reserves a swap_info slot, opens and claims the backing object, reads
 * and validates the header in page 0, builds the swap map and extents,
 * and finally enables the area.  Any failure goes through bad_swap, which
 * undoes whatever was set up so far.
 *
 * Returns 0 on success or a negative errno.
 */
SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
{
	struct swap_info_struct *p;
	char *name;
	struct file *swap_file = NULL;
	struct address_space *mapping;
	int i;
	int prio;
	int error;
	union swap_header *swap_header;
	int nr_extents;
	sector_t span;
	unsigned long maxpages;
	unsigned char *swap_map = NULL;
	struct page *page = NULL;
	struct inode *inode = NULL;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	p = alloc_swap_info();
	if (IS_ERR(p))
		return PTR_ERR(p);

	name = getname(specialfile);
	if (IS_ERR(name)) {
		error = PTR_ERR(name);
		/* Keep the shared exit path from calling putname() on it. */
		name = NULL;
		goto bad_swap;
	}
	swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
	if (IS_ERR(swap_file)) {
		error = PTR_ERR(swap_file);
		swap_file = NULL;
		goto bad_swap;
	}

	p->swap_file = swap_file;
	mapping = swap_file->f_mapping;

	/* Refuse a file whose mapping already backs another swap area. */
	for (i = 0; i < nr_swapfiles; i++) {
		struct swap_info_struct *q = swap_info[i];

		if (q == p || !q->swap_file)
			continue;
		if (mapping == q->swap_file->f_mapping) {
			error = -EBUSY;
			goto bad_swap;
		}
	}

	inode = mapping->host;
	/*
	 * For a regular file claim_swapfile() returns with inode->i_mutex
	 * held, also when it fails with -EBUSY; it is released at bad_swap
	 * or out below.
	 */
	error = claim_swapfile(p, inode);
	if (unlikely(error))
		goto bad_swap;

	/* Read the swap header from page 0 of the mapping. */
	if (!mapping->a_ops->readpage) {
		error = -EINVAL;
		goto bad_swap;
	}
	page = read_mapping_page(mapping, 0, swap_file);
	if (IS_ERR(page)) {
		error = PTR_ERR(page);
		goto bad_swap;
	}
	swap_header = kmap(page);

	maxpages = read_swap_header(p, swap_header, inode);
	if (unlikely(!maxpages)) {
		error = -EINVAL;
		goto bad_swap;
	}

	/* One zero-initialised byte of usage count per page slot. */
	swap_map = vzalloc(maxpages);
	if (!swap_map) {
		error = -ENOMEM;
		goto bad_swap;
	}

	error = swap_cgroup_swapon(p->type, maxpages);
	if (error)
		goto bad_swap;

	nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
		maxpages, &span);
	if (unlikely(nr_extents < 0)) {
		error = nr_extents;
		goto bad_swap;
	}

	if (p->bdev) {
		if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
			/* Non-rotational device: start at a random slot. */
			p->flags |= SWP_SOLIDSTATE;
			p->cluster_next = 1 + (random32() % p->highest_bit);
		}
		/*
		 * discard_swap() is attempted regardless of the flag;
		 * SWP_DISCARDABLE is set only if it succeeded and the caller
		 * asked for SWAP_FLAG_DISCARD.
		 */
		if (discard_swap(p) == 0 && (swap_flags & SWAP_FLAG_DISCARD))
			p->flags |= SWP_DISCARDABLE;
	}

	mutex_lock(&swapon_mutex);
	/* A negative prio makes enable_swap_info() pick an automatic one. */
	prio = -1;
	if (swap_flags & SWAP_FLAG_PREFER)
		prio =
		  (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
	enable_swap_info(p, prio, swap_map);

	printk(KERN_INFO "Adding %uk swap on %s.  "
			"Priority:%d extents:%d across:%lluk %s%s\n",
		p->pages<<(PAGE_SHIFT-10), name, p->prio,
		nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
		(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
		(p->flags & SWP_DISCARDABLE) ? "D" : "");

	mutex_unlock(&swapon_mutex);
	/* Tell pollers of /proc/swaps that the list changed. */
	atomic_inc(&proc_poll_event);
	wake_up_interruptible(&proc_poll_wait);

	if (S_ISREG(inode->i_mode))
		inode->i_flags |= S_SWAPFILE;
	error = 0;
	goto out;
bad_swap:
	/* Undo the block-device half of claim_swapfile(), if it got that far. */
	if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
		set_blocksize(p->bdev, p->old_block_size);
		blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
	}
	destroy_swap_extents(p);
	swap_cgroup_swapoff(p->type);
	/* Clearing flags drops SWP_USED, so the slot can be reused. */
	spin_lock(&swap_lock);
	p->swap_file = NULL;
	p->flags = 0;
	spin_unlock(&swap_lock);
	vfree(swap_map);
	if (swap_file) {
		if (inode && S_ISREG(inode->i_mode)) {
			mutex_unlock(&inode->i_mutex);
			/* Stops the unlock at out: from running a second time. */
			inode = NULL;
		}
		filp_close(swap_file, NULL);
	}
out:
	if (page && !IS_ERR(page)) {
		kunmap(page);
		page_cache_release(page);
	}
	if (name)
		putname(name);
	/* Success path for a regular file: drop the claim_swapfile() mutex. */
	if (inode && S_ISREG(inode->i_mode))
		mutex_unlock(&inode->i_mutex);
	return error;
}
2163
2164void si_swapinfo(struct sysinfo *val)
2165{
2166 unsigned int type;
2167 unsigned long nr_to_be_unused = 0;
2168
2169 spin_lock(&swap_lock);
2170 for (type = 0; type < nr_swapfiles; type++) {
2171 struct swap_info_struct *si = swap_info[type];
2172
2173 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
2174 nr_to_be_unused += si->inuse_pages;
2175 }
2176 val->freeswap = nr_swap_pages + nr_to_be_unused;
2177 val->totalswap = total_swap_pages + nr_to_be_unused;
2178 spin_unlock(&swap_lock);
2179}
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
2193{
2194 struct swap_info_struct *p;
2195 unsigned long offset, type;
2196 unsigned char count;
2197 unsigned char has_cache;
2198 int err = -EINVAL;
2199
2200 if (non_swap_entry(entry))
2201 goto out;
2202
2203 type = swp_type(entry);
2204 if (type >= nr_swapfiles)
2205 goto bad_file;
2206 p = swap_info[type];
2207 offset = swp_offset(entry);
2208
2209 spin_lock(&swap_lock);
2210 if (unlikely(offset >= p->max))
2211 goto unlock_out;
2212
2213 count = p->swap_map[offset];
2214 has_cache = count & SWAP_HAS_CACHE;
2215 count &= ~SWAP_HAS_CACHE;
2216 err = 0;
2217
2218 if (usage == SWAP_HAS_CACHE) {
2219
2220
2221 if (!has_cache && count)
2222 has_cache = SWAP_HAS_CACHE;
2223 else if (has_cache)
2224 err = -EEXIST;
2225 else
2226 err = -ENOENT;
2227
2228 } else if (count || has_cache) {
2229
2230 if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
2231 count += usage;
2232 else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
2233 err = -EINVAL;
2234 else if (swap_count_continued(p, offset, count))
2235 count = COUNT_CONTINUED;
2236 else
2237 err = -ENOMEM;
2238 } else
2239 err = -ENOENT;
2240
2241 p->swap_map[offset] = count | has_cache;
2242
2243unlock_out:
2244 spin_unlock(&swap_lock);
2245out:
2246 return err;
2247
2248bad_file:
2249 printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
2250 goto out;
2251}
2252
2253
2254
2255
2256
/*
 * Add SWAP_MAP_SHMEM to the map count of @entry via __swap_duplicate().
 * The result is discarded, so failures are not reported to the caller.
 */
void swap_shmem_alloc(swp_entry_t entry)
{
	__swap_duplicate(entry, SWAP_MAP_SHMEM);
}
2261
2262
2263
2264
2265
2266
2267
2268
2269int swap_duplicate(swp_entry_t entry)
2270{
2271 int err = 0;
2272
2273 while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
2274 err = add_swap_count_continuation(entry, GFP_ATOMIC);
2275 return err;
2276}
2277
2278
2279
2280
2281
2282
2283
2284
2285
/*
 * Try to set SWAP_HAS_CACHE on @entry.
 *
 * Returns the __swap_duplicate() result: 0 on success, -EEXIST if the
 * flag was already set, -ENOENT if the entry is unused, -EINVAL for an
 * invalid entry.
 */
int swapcache_prepare(swp_entry_t entry)
{
	return __swap_duplicate(entry, SWAP_HAS_CACHE);
}
2290
2291
2292
2293
2294
2295int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
2296{
2297 struct swap_info_struct *si;
2298 int our_page_cluster = page_cluster;
2299 pgoff_t target, toff;
2300 pgoff_t base, end;
2301 int nr_pages = 0;
2302
2303 if (!our_page_cluster)
2304 return 0;
2305
2306 si = swap_info[swp_type(entry)];
2307 target = swp_offset(entry);
2308 base = (target >> our_page_cluster) << our_page_cluster;
2309 end = base + (1 << our_page_cluster);
2310 if (!base)
2311 base++;
2312
2313 spin_lock(&swap_lock);
2314 if (end > si->max)
2315 end = si->max;
2316
2317
2318 for (toff = target; ++toff < end; nr_pages++) {
2319
2320 if (!si->swap_map[toff])
2321 break;
2322 if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2323 break;
2324 }
2325
2326 for (toff = target; --toff >= base; nr_pages++) {
2327
2328 if (!si->swap_map[toff])
2329 break;
2330 if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2331 break;
2332 }
2333 spin_unlock(&swap_lock);
2334
2335
2336
2337
2338
2339 *offset = ++toff;
2340 return nr_pages? ++nr_pages: 0;
2341}
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
/*
 * Make sure @entry has room for its map count to grow past SWAP_MAP_MAX
 * by chaining an extra "continuation" page onto the swap_map page that
 * holds the entry, if one is needed.
 *
 * Returns 0 when a page was added or none was needed (including when
 * swap_info_get() fails), and -ENOMEM only when a continuation was
 * required but the page allocation failed.
 */
int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
{
	struct swap_info_struct *si;
	struct page *head;
	struct page *page;
	struct page *list_page;
	pgoff_t offset;
	unsigned char count;

	/*
	 * Allocate first, before swap_info_get(); failure is only acted on
	 * further down, once it is known that the page is really needed.
	 */
	page = alloc_page(gfp_mask | __GFP_HIGHMEM);

	/*
	 * NOTE(review): swap_info_get() is not visible in this chunk; the
	 * unlock at out: and the absence of one on the !si path imply it
	 * returns with swap_lock held on success - confirm.
	 */
	si = swap_info_get(entry);
	if (!si) {
		/* Nothing to extend; free the spare page and report success. */
		goto outer;
	}

	offset = swp_offset(entry);
	count = si->swap_map[offset] & ~SWAP_HAS_CACHE;

	if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
		/* The count is not at SWAP_MAP_MAX: no continuation needed. */
		goto out;
	}

	if (!page) {
		spin_unlock(&swap_lock);
		return -ENOMEM;
	}

	/*
	 * Continuation pages hang off the lru list of the page backing this
	 * part of the vmalloc'ed swap_map; from here on offset is the byte
	 * position within that page.
	 */
	head = vmalloc_to_page(si->swap_map + offset);
	offset &= ~PAGE_MASK;

	/* First continuation for this map page: initialise its list head. */
	if (!page_private(head)) {
		BUG_ON(count & COUNT_CONTINUED);
		INIT_LIST_HEAD(&head->lru);
		set_page_private(head, SWP_CONTINUED);
		si->flags |= SWP_CONTINUED;
	}

	list_for_each_entry(list_page, &head->lru, lru) {
		unsigned char *map;

		/*
		 * The previous level does not continue into this page, so
		 * this page is still free for the entry to use.
		 */
		if (!(count & COUNT_CONTINUED))
			goto out;

		map = kmap_atomic(list_page, KM_USER0) + offset;
		count = *map;
		kunmap_atomic(map, KM_USER0);

		/* This level is not yet at SWAP_CONT_MAX: room remains. */
		if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
			goto out;
	}

	/* Every existing level is full: append the new page. */
	list_add_tail(&page->lru, &head->lru);
	page = NULL;			/* now owned by the list */
out:
	spin_unlock(&swap_lock);
outer:
	if (page)
		__free_page(page);
	return 0;
}
2450
2451
2452
2453
2454
2455
2456
2457
2458
/*
 * Carry a map count into, or borrow it from, the continuation pages
 * chained on the swap_map page that holds @offset.
 *
 * @count is the swap_map count without SWAP_HAS_CACHE.  SWAP_MAP_MAX,
 * with or without COUNT_CONTINUED, selects the increment path; the only
 * other value accepted is COUNT_CONTINUED, which selects the decrement
 * path (anything else hits BUG_ON).
 *
 * Increment: returns true when the carry was stored, false when there is
 * no continuation page with room.  The visible caller, __swap_duplicate(),
 * turns false into -ENOMEM.
 * Decrement: returns true while the continuation still holds a non-zero
 * count, false once it has been used up.
 */
static bool swap_count_continued(struct swap_info_struct *si,
				 pgoff_t offset, unsigned char count)
{
	struct page *head;
	struct page *page;
	unsigned char *map;

	head = vmalloc_to_page(si->swap_map + offset);
	if (page_private(head) != SWP_CONTINUED) {
		/* No continuation list on this map page. */
		BUG_ON(count & COUNT_CONTINUED);
		return false;
	}

	/* Byte position of the entry within each page of the chain. */
	offset &= ~PAGE_MASK;
	page = list_entry(head->lru.next, struct page, lru);
	map = kmap_atomic(page, KM_USER0) + offset;

	if (count == SWAP_MAP_MAX)	/* first carry out of swap_map */
		goto init_map;		/* skip the SWAP_CONT_MAX checks */

	if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
		/*
		 * Skip levels that are full and themselves continued.
		 */
		while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
			kunmap_atomic(map, KM_USER0);
			page = list_entry(page->lru.next, struct page, lru);
			BUG_ON(page == head);
			map = kmap_atomic(page, KM_USER0) + offset;
		}
		if (*map == SWAP_CONT_MAX) {
			/* Full and last in use: a further page is required. */
			kunmap_atomic(map, KM_USER0);
			page = list_entry(page->lru.next, struct page, lru);
			if (page == head)
				return false;	/* no further page available */
			map = kmap_atomic(page, KM_USER0) + offset;
init_map:		*map = 0;		/* start a fresh level */
		}
		*map += 1;
		kunmap_atomic(map, KM_USER0);
		/* Walk back, resetting each lower level to COUNT_CONTINUED. */
		page = list_entry(page->lru.prev, struct page, lru);
		while (page != head) {
			map = kmap_atomic(page, KM_USER0) + offset;
			*map = COUNT_CONTINUED;
			kunmap_atomic(map, KM_USER0);
			page = list_entry(page->lru.prev, struct page, lru);
		}
		return true;			/* incremented */

	} else {				/* decrementing */
		/*
		 * Skip levels holding exactly COUNT_CONTINUED, i.e. nothing
		 * left to borrow from at that level.
		 */
		BUG_ON(count != COUNT_CONTINUED);
		while (*map == COUNT_CONTINUED) {
			kunmap_atomic(map, KM_USER0);
			page = list_entry(page->lru.next, struct page, lru);
			BUG_ON(page == head);
			map = kmap_atomic(page, KM_USER0) + offset;
		}
		BUG_ON(*map == 0);
		*map -= 1;
		/* Level emptied: the level below must no longer flag continuation. */
		if (*map == 0)
			count = 0;
		kunmap_atomic(map, KM_USER0);
		/*
		 * Walk back, refilling each lower level to SWAP_CONT_MAX; only
		 * the level directly below the decremented one can lose its
		 * COUNT_CONTINUED flag.
		 */
		page = list_entry(page->lru.prev, struct page, lru);
		while (page != head) {
			map = kmap_atomic(page, KM_USER0) + offset;
			*map = SWAP_CONT_MAX | count;
			count = COUNT_CONTINUED;
			kunmap_atomic(map, KM_USER0);
			page = list_entry(page->lru.prev, struct page, lru);
		}
		return count == COUNT_CONTINUED;
	}
}
2535
2536
2537
2538
2539
2540static void free_swap_count_continuations(struct swap_info_struct *si)
2541{
2542 pgoff_t offset;
2543
2544 for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
2545 struct page *head;
2546 head = vmalloc_to_page(si->swap_map + offset);
2547 if (page_private(head)) {
2548 struct list_head *this, *next;
2549 list_for_each_safe(this, next, &head->lru) {
2550 struct page *page;
2551 page = list_entry(this, struct page, lru);
2552 list_del(this);
2553 __free_page(page);
2554 }
2555 }
2556 }
2557}
2558