1
2
3
4
5
6
7
8#include <linux/mm.h>
9#include <linux/hugetlb.h>
10#include <linux/mman.h>
11#include <linux/slab.h>
12#include <linux/kernel_stat.h>
13#include <linux/swap.h>
14#include <linux/vmalloc.h>
15#include <linux/pagemap.h>
16#include <linux/namei.h>
17#include <linux/shmem_fs.h>
18#include <linux/blkdev.h>
19#include <linux/random.h>
20#include <linux/writeback.h>
21#include <linux/proc_fs.h>
22#include <linux/seq_file.h>
23#include <linux/init.h>
24#include <linux/module.h>
25#include <linux/ksm.h>
26#include <linux/rmap.h>
27#include <linux/security.h>
28#include <linux/backing-dev.h>
29#include <linux/mutex.h>
30#include <linux/capability.h>
31#include <linux/syscalls.h>
32#include <linux/memcontrol.h>
33#include <linux/poll.h>
34#include <linux/oom.h>
35
36#include <asm/pgtable.h>
37#include <asm/tlbflush.h>
38#include <linux/swapops.h>
39#include <linux/page_cgroup.h>
40
41static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
42 unsigned char);
43static void free_swap_count_continuations(struct swap_info_struct *);
44static sector_t map_swap_entry(swp_entry_t, struct block_device**);
45
46static DEFINE_SPINLOCK(swap_lock);
47static unsigned int nr_swapfiles;
48long nr_swap_pages;
49long total_swap_pages;
50static int least_priority;
51
52static const char Bad_file[] = "Bad swap file entry ";
53static const char Unused_file[] = "Unused swap file entry ";
54static const char Bad_offset[] = "Bad swap offset entry ";
55static const char Unused_offset[] = "Unused swap offset entry ";
56
57static struct swap_list_t swap_list = {-1, -1};
58
59static struct swap_info_struct *swap_info[MAX_SWAPFILES];
60
61static DEFINE_MUTEX(swapon_mutex);
62
63static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
64
65static atomic_t proc_poll_event = ATOMIC_INIT(0);
66
67static inline unsigned char swap_count(unsigned char ent)
68{
69 return ent & ~SWAP_HAS_CACHE;
70}
71
72
73static int
74__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
75{
76 swp_entry_t entry = swp_entry(si->type, offset);
77 struct page *page;
78 int ret = 0;
79
80 page = find_get_page(&swapper_space, entry.val);
81 if (!page)
82 return 0;
83
84
85
86
87
88
89
90 if (trylock_page(page)) {
91 ret = try_to_free_swap(page);
92 unlock_page(page);
93 }
94 page_cache_release(page);
95 return ret;
96}
97
98
99
100
101
102static int discard_swap(struct swap_info_struct *si)
103{
104 struct swap_extent *se;
105 sector_t start_block;
106 sector_t nr_blocks;
107 int err = 0;
108
109
110 se = &si->first_swap_extent;
111 start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
112 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
113 if (nr_blocks) {
114 err = blkdev_issue_discard(si->bdev, start_block,
115 nr_blocks, GFP_KERNEL, 0);
116 if (err)
117 return err;
118 cond_resched();
119 }
120
121 list_for_each_entry(se, &si->first_swap_extent.list, list) {
122 start_block = se->start_block << (PAGE_SHIFT - 9);
123 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
124
125 err = blkdev_issue_discard(si->bdev, start_block,
126 nr_blocks, GFP_KERNEL, 0);
127 if (err)
128 break;
129
130 cond_resched();
131 }
132 return err;
133}
134
135
136
137
138
139static void discard_swap_cluster(struct swap_info_struct *si,
140 pgoff_t start_page, pgoff_t nr_pages)
141{
142 struct swap_extent *se = si->curr_swap_extent;
143 int found_extent = 0;
144
145 while (nr_pages) {
146 struct list_head *lh;
147
148 if (se->start_page <= start_page &&
149 start_page < se->start_page + se->nr_pages) {
150 pgoff_t offset = start_page - se->start_page;
151 sector_t start_block = se->start_block + offset;
152 sector_t nr_blocks = se->nr_pages - offset;
153
154 if (nr_blocks > nr_pages)
155 nr_blocks = nr_pages;
156 start_page += nr_blocks;
157 nr_pages -= nr_blocks;
158
159 if (!found_extent++)
160 si->curr_swap_extent = se;
161
162 start_block <<= PAGE_SHIFT - 9;
163 nr_blocks <<= PAGE_SHIFT - 9;
164 if (blkdev_issue_discard(si->bdev, start_block,
165 nr_blocks, GFP_NOIO, 0))
166 break;
167 }
168
169 lh = se->list.next;
170 se = list_entry(lh, struct swap_extent, list);
171 }
172}
173
/*
 * Action callback handed to wait_on_bit() by scan_swap_map(): yield the
 * CPU once and report success.  @word is not used.
 */
static int wait_for_discard(void *word)
{
	schedule();

	return 0;
}
179
180#define SWAPFILE_CLUSTER 256
181#define LATENCY_LIMIT 256
182
183static unsigned long scan_swap_map(struct swap_info_struct *si,
184 unsigned char usage)
185{
186 unsigned long offset;
187 unsigned long scan_base;
188 unsigned long last_in_cluster = 0;
189 int latency_ration = LATENCY_LIMIT;
190 int found_free_cluster = 0;
191
192
193
194
195
196
197
198
199
200
201
202
203 si->flags += SWP_SCANNING;
204 scan_base = offset = si->cluster_next;
205
206 if (unlikely(!si->cluster_nr--)) {
207 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
208 si->cluster_nr = SWAPFILE_CLUSTER - 1;
209 goto checks;
210 }
211 if (si->flags & SWP_DISCARDABLE) {
212
213
214
215
216
217
218
219 if (si->lowest_alloc)
220 goto checks;
221 si->lowest_alloc = si->max;
222 si->highest_alloc = 0;
223 }
224 spin_unlock(&swap_lock);
225
226
227
228
229
230
231
232
233
234 if (!(si->flags & SWP_SOLIDSTATE))
235 scan_base = offset = si->lowest_bit;
236 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
237
238
239 for (; last_in_cluster <= si->highest_bit; offset++) {
240 if (si->swap_map[offset])
241 last_in_cluster = offset + SWAPFILE_CLUSTER;
242 else if (offset == last_in_cluster) {
243 spin_lock(&swap_lock);
244 offset -= SWAPFILE_CLUSTER - 1;
245 si->cluster_next = offset;
246 si->cluster_nr = SWAPFILE_CLUSTER - 1;
247 found_free_cluster = 1;
248 goto checks;
249 }
250 if (unlikely(--latency_ration < 0)) {
251 cond_resched();
252 latency_ration = LATENCY_LIMIT;
253 }
254 }
255
256 offset = si->lowest_bit;
257 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
258
259
260 for (; last_in_cluster < scan_base; offset++) {
261 if (si->swap_map[offset])
262 last_in_cluster = offset + SWAPFILE_CLUSTER;
263 else if (offset == last_in_cluster) {
264 spin_lock(&swap_lock);
265 offset -= SWAPFILE_CLUSTER - 1;
266 si->cluster_next = offset;
267 si->cluster_nr = SWAPFILE_CLUSTER - 1;
268 found_free_cluster = 1;
269 goto checks;
270 }
271 if (unlikely(--latency_ration < 0)) {
272 cond_resched();
273 latency_ration = LATENCY_LIMIT;
274 }
275 }
276
277 offset = scan_base;
278 spin_lock(&swap_lock);
279 si->cluster_nr = SWAPFILE_CLUSTER - 1;
280 si->lowest_alloc = 0;
281 }
282
283checks:
284 if (!(si->flags & SWP_WRITEOK))
285 goto no_page;
286 if (!si->highest_bit)
287 goto no_page;
288 if (offset > si->highest_bit)
289 scan_base = offset = si->lowest_bit;
290
291
292 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
293 int swap_was_freed;
294 spin_unlock(&swap_lock);
295 swap_was_freed = __try_to_reclaim_swap(si, offset);
296 spin_lock(&swap_lock);
297
298 if (swap_was_freed)
299 goto checks;
300 goto scan;
301 }
302
303 if (si->swap_map[offset])
304 goto scan;
305
306 if (offset == si->lowest_bit)
307 si->lowest_bit++;
308 if (offset == si->highest_bit)
309 si->highest_bit--;
310 si->inuse_pages++;
311 if (si->inuse_pages == si->pages) {
312 si->lowest_bit = si->max;
313 si->highest_bit = 0;
314 }
315 si->swap_map[offset] = usage;
316 si->cluster_next = offset + 1;
317 si->flags -= SWP_SCANNING;
318
319 if (si->lowest_alloc) {
320
321
322
323
324 if (found_free_cluster) {
325
326
327
328
329
330
331
332 if (offset < si->highest_alloc &&
333 si->lowest_alloc <= last_in_cluster)
334 last_in_cluster = si->lowest_alloc - 1;
335 si->flags |= SWP_DISCARDING;
336 spin_unlock(&swap_lock);
337
338 if (offset < last_in_cluster)
339 discard_swap_cluster(si, offset,
340 last_in_cluster - offset + 1);
341
342 spin_lock(&swap_lock);
343 si->lowest_alloc = 0;
344 si->flags &= ~SWP_DISCARDING;
345
346 smp_mb();
347 wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));
348
349 } else if (si->flags & SWP_DISCARDING) {
350
351
352
353
354
355
356 spin_unlock(&swap_lock);
357 wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
358 wait_for_discard, TASK_UNINTERRUPTIBLE);
359 spin_lock(&swap_lock);
360 } else {
361
362
363
364
365
366 if (offset < si->lowest_alloc)
367 si->lowest_alloc = offset;
368 if (offset > si->highest_alloc)
369 si->highest_alloc = offset;
370 }
371 }
372 return offset;
373
374scan:
375 spin_unlock(&swap_lock);
376 while (++offset <= si->highest_bit) {
377 if (!si->swap_map[offset]) {
378 spin_lock(&swap_lock);
379 goto checks;
380 }
381 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
382 spin_lock(&swap_lock);
383 goto checks;
384 }
385 if (unlikely(--latency_ration < 0)) {
386 cond_resched();
387 latency_ration = LATENCY_LIMIT;
388 }
389 }
390 offset = si->lowest_bit;
391 while (++offset < scan_base) {
392 if (!si->swap_map[offset]) {
393 spin_lock(&swap_lock);
394 goto checks;
395 }
396 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
397 spin_lock(&swap_lock);
398 goto checks;
399 }
400 if (unlikely(--latency_ration < 0)) {
401 cond_resched();
402 latency_ration = LATENCY_LIMIT;
403 }
404 }
405 spin_lock(&swap_lock);
406
407no_page:
408 si->flags -= SWP_SCANNING;
409 return 0;
410}
411
412swp_entry_t get_swap_page(void)
413{
414 struct swap_info_struct *si;
415 pgoff_t offset;
416 int type, next;
417 int wrapped = 0;
418
419 spin_lock(&swap_lock);
420 if (nr_swap_pages <= 0)
421 goto noswap;
422 nr_swap_pages--;
423
424 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
425 si = swap_info[type];
426 next = si->next;
427 if (next < 0 ||
428 (!wrapped && si->prio != swap_info[next]->prio)) {
429 next = swap_list.head;
430 wrapped++;
431 }
432
433 if (!si->highest_bit)
434 continue;
435 if (!(si->flags & SWP_WRITEOK))
436 continue;
437
438 swap_list.next = next;
439
440 offset = scan_swap_map(si, SWAP_HAS_CACHE);
441 if (offset) {
442 spin_unlock(&swap_lock);
443 return swp_entry(type, offset);
444 }
445 next = swap_list.next;
446 }
447
448 nr_swap_pages++;
449noswap:
450 spin_unlock(&swap_lock);
451 return (swp_entry_t) {0};
452}
453
454
455swp_entry_t get_swap_page_of_type(int type)
456{
457 struct swap_info_struct *si;
458 pgoff_t offset;
459
460 spin_lock(&swap_lock);
461 si = swap_info[type];
462 if (si && (si->flags & SWP_WRITEOK)) {
463 nr_swap_pages--;
464
465 offset = scan_swap_map(si, 1);
466 if (offset) {
467 spin_unlock(&swap_lock);
468 return swp_entry(type, offset);
469 }
470 nr_swap_pages++;
471 }
472 spin_unlock(&swap_lock);
473 return (swp_entry_t) {0};
474}
475
476static struct swap_info_struct *swap_info_get(swp_entry_t entry)
477{
478 struct swap_info_struct *p;
479 unsigned long offset, type;
480
481 if (!entry.val)
482 goto out;
483 type = swp_type(entry);
484 if (type >= nr_swapfiles)
485 goto bad_nofile;
486 p = swap_info[type];
487 if (!(p->flags & SWP_USED))
488 goto bad_device;
489 offset = swp_offset(entry);
490 if (offset >= p->max)
491 goto bad_offset;
492 if (!p->swap_map[offset])
493 goto bad_free;
494 spin_lock(&swap_lock);
495 return p;
496
497bad_free:
498 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
499 goto out;
500bad_offset:
501 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val);
502 goto out;
503bad_device:
504 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
505 goto out;
506bad_nofile:
507 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
508out:
509 return NULL;
510}
511
512static unsigned char swap_entry_free(struct swap_info_struct *p,
513 swp_entry_t entry, unsigned char usage)
514{
515 unsigned long offset = swp_offset(entry);
516 unsigned char count;
517 unsigned char has_cache;
518
519 count = p->swap_map[offset];
520 has_cache = count & SWAP_HAS_CACHE;
521 count &= ~SWAP_HAS_CACHE;
522
523 if (usage == SWAP_HAS_CACHE) {
524 VM_BUG_ON(!has_cache);
525 has_cache = 0;
526 } else if (count == SWAP_MAP_SHMEM) {
527
528
529
530
531 count = 0;
532 } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
533 if (count == COUNT_CONTINUED) {
534 if (swap_count_continued(p, offset, count))
535 count = SWAP_MAP_MAX | COUNT_CONTINUED;
536 else
537 count = SWAP_MAP_MAX;
538 } else
539 count--;
540 }
541
542 if (!count)
543 mem_cgroup_uncharge_swap(entry);
544
545 usage = count | has_cache;
546 p->swap_map[offset] = usage;
547
548
549 if (!usage) {
550 struct gendisk *disk = p->bdev->bd_disk;
551 if (offset < p->lowest_bit)
552 p->lowest_bit = offset;
553 if (offset > p->highest_bit)
554 p->highest_bit = offset;
555 if (swap_list.next >= 0 &&
556 p->prio > swap_info[swap_list.next]->prio)
557 swap_list.next = p->type;
558 nr_swap_pages++;
559 p->inuse_pages--;
560 if ((p->flags & SWP_BLKDEV) &&
561 disk->fops->swap_slot_free_notify)
562 disk->fops->swap_slot_free_notify(p->bdev, offset);
563 }
564
565 return usage;
566}
567
568
569
570
571
572void swap_free(swp_entry_t entry)
573{
574 struct swap_info_struct *p;
575
576 p = swap_info_get(entry);
577 if (p) {
578 swap_entry_free(p, entry, 1);
579 spin_unlock(&swap_lock);
580 }
581}
582
583
584
585
586void swapcache_free(swp_entry_t entry, struct page *page)
587{
588 struct swap_info_struct *p;
589 unsigned char count;
590
591 p = swap_info_get(entry);
592 if (p) {
593 count = swap_entry_free(p, entry, SWAP_HAS_CACHE);
594 if (page)
595 mem_cgroup_uncharge_swapcache(page, entry, count != 0);
596 spin_unlock(&swap_lock);
597 }
598}
599
600
601
602
603
604
605static inline int page_swapcount(struct page *page)
606{
607 int count = 0;
608 struct swap_info_struct *p;
609 swp_entry_t entry;
610
611 entry.val = page_private(page);
612 p = swap_info_get(entry);
613 if (p) {
614 count = swap_count(p->swap_map[swp_offset(entry)]);
615 spin_unlock(&swap_lock);
616 }
617 return count;
618}
619
620
621
622
623
624
625
/*
 * Decide whether a locked page has at most one user, counting both page
 * mappings and (for swap-cache pages) swap references.  When exactly one
 * user remains and the page is not under writeback, it is also removed
 * from the swap cache and marked dirty.
 *
 * Returns 1 when the combined count is <= 1, otherwise 0; KSM pages always
 * yield 0.
 */
int reuse_swap_page(struct page *page)
{
	int users;

	VM_BUG_ON(!PageLocked(page));
	if (unlikely(PageKsm(page)))
		return 0;

	users = page_mapcount(page);
	if (users > 1 || !PageSwapCache(page))
		return users <= 1;

	users += page_swapcount(page);
	if (users == 1 && !PageWriteback(page)) {
		delete_from_swap_cache(page);
		SetPageDirty(page);
	}
	return users <= 1;
}
643
644
645
646
647
648int try_to_free_swap(struct page *page)
649{
650 VM_BUG_ON(!PageLocked(page));
651
652 if (!PageSwapCache(page))
653 return 0;
654 if (PageWriteback(page))
655 return 0;
656 if (page_swapcount(page))
657 return 0;
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674 if (!(gfp_allowed_mask & __GFP_IO))
675 return 0;
676
677 delete_from_swap_cache(page);
678 SetPageDirty(page);
679 return 1;
680}
681
682
683
684
685
686int free_swap_and_cache(swp_entry_t entry)
687{
688 struct swap_info_struct *p;
689 struct page *page = NULL;
690
691 if (non_swap_entry(entry))
692 return 1;
693
694 p = swap_info_get(entry);
695 if (p) {
696 if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
697 page = find_get_page(&swapper_space, entry.val);
698 if (page && !trylock_page(page)) {
699 page_cache_release(page);
700 page = NULL;
701 }
702 }
703 spin_unlock(&swap_lock);
704 }
705 if (page) {
706
707
708
709
710 if (PageSwapCache(page) && !PageWriteback(page) &&
711 (!page_mapped(page) || vm_swap_full())) {
712 delete_from_swap_cache(page);
713 SetPageDirty(page);
714 }
715 unlock_page(page);
716 page_cache_release(page);
717 }
718 return p != NULL;
719}
720
721#ifdef CONFIG_CGROUP_MEM_RES_CTLR
722
723
724
725
726
727
728
729
730
731
732int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
733{
734 struct page *page;
735 struct swap_info_struct *p;
736 int count = 0;
737
738 page = find_get_page(&swapper_space, ent.val);
739 if (page)
740 count += page_mapcount(page);
741 p = swap_info_get(ent);
742 if (p) {
743 count += swap_count(p->swap_map[swp_offset(ent)]);
744 spin_unlock(&swap_lock);
745 }
746
747 *pagep = page;
748 return count;
749}
750#endif
751
752#ifdef CONFIG_HIBERNATION
753
754
755
756
757
758
759
760
761int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
762{
763 struct block_device *bdev = NULL;
764 int type;
765
766 if (device)
767 bdev = bdget(device);
768
769 spin_lock(&swap_lock);
770 for (type = 0; type < nr_swapfiles; type++) {
771 struct swap_info_struct *sis = swap_info[type];
772
773 if (!(sis->flags & SWP_WRITEOK))
774 continue;
775
776 if (!bdev) {
777 if (bdev_p)
778 *bdev_p = bdgrab(sis->bdev);
779
780 spin_unlock(&swap_lock);
781 return type;
782 }
783 if (bdev == sis->bdev) {
784 struct swap_extent *se = &sis->first_swap_extent;
785
786 if (se->start_block == offset) {
787 if (bdev_p)
788 *bdev_p = bdgrab(sis->bdev);
789
790 spin_unlock(&swap_lock);
791 bdput(bdev);
792 return type;
793 }
794 }
795 }
796 spin_unlock(&swap_lock);
797 if (bdev)
798 bdput(bdev);
799
800 return -ENODEV;
801}
802
803
804
805
806
807sector_t swapdev_block(int type, pgoff_t offset)
808{
809 struct block_device *bdev;
810
811 if ((unsigned int)type >= nr_swapfiles)
812 return 0;
813 if (!(swap_info[type]->flags & SWP_WRITEOK))
814 return 0;
815 return map_swap_entry(swp_entry(type, offset), &bdev);
816}
817
818
819
820
821
822
823
824unsigned int count_swap_pages(int type, int free)
825{
826 unsigned int n = 0;
827
828 spin_lock(&swap_lock);
829 if ((unsigned int)type < nr_swapfiles) {
830 struct swap_info_struct *sis = swap_info[type];
831
832 if (sis->flags & SWP_WRITEOK) {
833 n = sis->pages;
834 if (free)
835 n -= sis->inuse_pages;
836 }
837 }
838 spin_unlock(&swap_lock);
839 return n;
840}
841#endif
842
843
844
845
846
847
848static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
849 unsigned long addr, swp_entry_t entry, struct page *page)
850{
851 struct mem_cgroup *ptr;
852 spinlock_t *ptl;
853 pte_t *pte;
854 int ret = 1;
855
856 if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) {
857 ret = -ENOMEM;
858 goto out_nolock;
859 }
860
861 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
862 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
863 if (ret > 0)
864 mem_cgroup_cancel_charge_swapin(ptr);
865 ret = 0;
866 goto out;
867 }
868
869 dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
870 inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
871 get_page(page);
872 set_pte_at(vma->vm_mm, addr, pte,
873 pte_mkold(mk_pte(page, vma->vm_page_prot)));
874 page_add_anon_rmap(page, vma, addr);
875 mem_cgroup_commit_charge_swapin(page, ptr);
876 swap_free(entry);
877
878
879
880
881 activate_page(page);
882out:
883 pte_unmap_unlock(pte, ptl);
884out_nolock:
885 return ret;
886}
887
888static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
889 unsigned long addr, unsigned long end,
890 swp_entry_t entry, struct page *page)
891{
892 pte_t swp_pte = swp_entry_to_pte(entry);
893 pte_t *pte;
894 int ret = 0;
895
896
897
898
899
900
901
902
903
904
905 pte = pte_offset_map(pmd, addr);
906 do {
907
908
909
910
911 if (unlikely(pte_same(*pte, swp_pte))) {
912 pte_unmap(pte);
913 ret = unuse_pte(vma, pmd, addr, entry, page);
914 if (ret)
915 goto out;
916 pte = pte_offset_map(pmd, addr);
917 }
918 } while (pte++, addr += PAGE_SIZE, addr != end);
919 pte_unmap(pte - 1);
920out:
921 return ret;
922}
923
924static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
925 unsigned long addr, unsigned long end,
926 swp_entry_t entry, struct page *page)
927{
928 pmd_t *pmd;
929 unsigned long next;
930 int ret;
931
932 pmd = pmd_offset(pud, addr);
933 do {
934 next = pmd_addr_end(addr, end);
935 if (unlikely(pmd_trans_huge(*pmd)))
936 continue;
937 if (pmd_none_or_clear_bad(pmd))
938 continue;
939 ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
940 if (ret)
941 return ret;
942 } while (pmd++, addr = next, addr != end);
943 return 0;
944}
945
946static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
947 unsigned long addr, unsigned long end,
948 swp_entry_t entry, struct page *page)
949{
950 pud_t *pud;
951 unsigned long next;
952 int ret;
953
954 pud = pud_offset(pgd, addr);
955 do {
956 next = pud_addr_end(addr, end);
957 if (pud_none_or_clear_bad(pud))
958 continue;
959 ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
960 if (ret)
961 return ret;
962 } while (pud++, addr = next, addr != end);
963 return 0;
964}
965
966static int unuse_vma(struct vm_area_struct *vma,
967 swp_entry_t entry, struct page *page)
968{
969 pgd_t *pgd;
970 unsigned long addr, end, next;
971 int ret;
972
973 if (page_anon_vma(page)) {
974 addr = page_address_in_vma(page, vma);
975 if (addr == -EFAULT)
976 return 0;
977 else
978 end = addr + PAGE_SIZE;
979 } else {
980 addr = vma->vm_start;
981 end = vma->vm_end;
982 }
983
984 pgd = pgd_offset(vma->vm_mm, addr);
985 do {
986 next = pgd_addr_end(addr, end);
987 if (pgd_none_or_clear_bad(pgd))
988 continue;
989 ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
990 if (ret)
991 return ret;
992 } while (pgd++, addr = next, addr != end);
993 return 0;
994}
995
996static int unuse_mm(struct mm_struct *mm,
997 swp_entry_t entry, struct page *page)
998{
999 struct vm_area_struct *vma;
1000 int ret = 0;
1001
1002 if (!down_read_trylock(&mm->mmap_sem)) {
1003
1004
1005
1006
1007 activate_page(page);
1008 unlock_page(page);
1009 down_read(&mm->mmap_sem);
1010 lock_page(page);
1011 }
1012 for (vma = mm->mmap; vma; vma = vma->vm_next) {
1013 if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
1014 break;
1015 }
1016 up_read(&mm->mmap_sem);
1017 return (ret < 0)? ret: 0;
1018}
1019
1020
1021
1022
1023
1024static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1025 unsigned int prev)
1026{
1027 unsigned int max = si->max;
1028 unsigned int i = prev;
1029 unsigned char count;
1030
1031
1032
1033
1034
1035
1036
1037 for (;;) {
1038 if (++i >= max) {
1039 if (!prev) {
1040 i = 0;
1041 break;
1042 }
1043
1044
1045
1046
1047 max = prev + 1;
1048 prev = 0;
1049 i = 1;
1050 }
1051 count = si->swap_map[i];
1052 if (count && swap_count(count) != SWAP_MAP_BAD)
1053 break;
1054 }
1055 return i;
1056}
1057
1058
1059
1060
1061
1062
1063static int try_to_unuse(unsigned int type)
1064{
1065 struct swap_info_struct *si = swap_info[type];
1066 struct mm_struct *start_mm;
1067 unsigned char *swap_map;
1068 unsigned char swcount;
1069 struct page *page;
1070 swp_entry_t entry;
1071 unsigned int i = 0;
1072 int retval = 0;
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088 start_mm = &init_mm;
1089 atomic_inc(&init_mm.mm_users);
1090
1091
1092
1093
1094
1095
1096 while ((i = find_next_to_unuse(si, i)) != 0) {
1097 if (signal_pending(current)) {
1098 retval = -EINTR;
1099 break;
1100 }
1101
1102
1103
1104
1105
1106
1107 swap_map = &si->swap_map[i];
1108 entry = swp_entry(type, i);
1109 page = read_swap_cache_async(entry,
1110 GFP_HIGHUSER_MOVABLE, NULL, 0);
1111 if (!page) {
1112
1113
1114
1115
1116
1117
1118 if (!*swap_map)
1119 continue;
1120 retval = -ENOMEM;
1121 break;
1122 }
1123
1124
1125
1126
1127 if (atomic_read(&start_mm->mm_users) == 1) {
1128 mmput(start_mm);
1129 start_mm = &init_mm;
1130 atomic_inc(&init_mm.mm_users);
1131 }
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141 wait_on_page_locked(page);
1142 wait_on_page_writeback(page);
1143 lock_page(page);
1144 wait_on_page_writeback(page);
1145
1146
1147
1148
1149 swcount = *swap_map;
1150 if (swap_count(swcount) == SWAP_MAP_SHMEM) {
1151 retval = shmem_unuse(entry, page);
1152
1153 if (retval < 0)
1154 break;
1155 continue;
1156 }
1157 if (swap_count(swcount) && start_mm != &init_mm)
1158 retval = unuse_mm(start_mm, entry, page);
1159
1160 if (swap_count(*swap_map)) {
1161 int set_start_mm = (*swap_map >= swcount);
1162 struct list_head *p = &start_mm->mmlist;
1163 struct mm_struct *new_start_mm = start_mm;
1164 struct mm_struct *prev_mm = start_mm;
1165 struct mm_struct *mm;
1166
1167 atomic_inc(&new_start_mm->mm_users);
1168 atomic_inc(&prev_mm->mm_users);
1169 spin_lock(&mmlist_lock);
1170 while (swap_count(*swap_map) && !retval &&
1171 (p = p->next) != &start_mm->mmlist) {
1172 mm = list_entry(p, struct mm_struct, mmlist);
1173 if (!atomic_inc_not_zero(&mm->mm_users))
1174 continue;
1175 spin_unlock(&mmlist_lock);
1176 mmput(prev_mm);
1177 prev_mm = mm;
1178
1179 cond_resched();
1180
1181 swcount = *swap_map;
1182 if (!swap_count(swcount))
1183 ;
1184 else if (mm == &init_mm)
1185 set_start_mm = 1;
1186 else
1187 retval = unuse_mm(mm, entry, page);
1188
1189 if (set_start_mm && *swap_map < swcount) {
1190 mmput(new_start_mm);
1191 atomic_inc(&mm->mm_users);
1192 new_start_mm = mm;
1193 set_start_mm = 0;
1194 }
1195 spin_lock(&mmlist_lock);
1196 }
1197 spin_unlock(&mmlist_lock);
1198 mmput(prev_mm);
1199 mmput(start_mm);
1200 start_mm = new_start_mm;
1201 }
1202 if (retval) {
1203 unlock_page(page);
1204 page_cache_release(page);
1205 break;
1206 }
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227 if (swap_count(*swap_map) &&
1228 PageDirty(page) && PageSwapCache(page)) {
1229 struct writeback_control wbc = {
1230 .sync_mode = WB_SYNC_NONE,
1231 };
1232
1233 swap_writepage(page, &wbc);
1234 lock_page(page);
1235 wait_on_page_writeback(page);
1236 }
1237
1238
1239
1240
1241
1242
1243
1244
1245 if (PageSwapCache(page) &&
1246 likely(page_private(page) == entry.val))
1247 delete_from_swap_cache(page);
1248
1249
1250
1251
1252
1253
1254 SetPageDirty(page);
1255 unlock_page(page);
1256 page_cache_release(page);
1257
1258
1259
1260
1261
1262 cond_resched();
1263 }
1264
1265 mmput(start_mm);
1266 return retval;
1267}
1268
1269
1270
1271
1272
1273
1274
1275static void drain_mmlist(void)
1276{
1277 struct list_head *p, *next;
1278 unsigned int type;
1279
1280 for (type = 0; type < nr_swapfiles; type++)
1281 if (swap_info[type]->inuse_pages)
1282 return;
1283 spin_lock(&mmlist_lock);
1284 list_for_each_safe(p, next, &init_mm.mmlist)
1285 list_del_init(p);
1286 spin_unlock(&mmlist_lock);
1287}
1288
1289
1290
1291
1292
1293
1294
1295static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
1296{
1297 struct swap_info_struct *sis;
1298 struct swap_extent *start_se;
1299 struct swap_extent *se;
1300 pgoff_t offset;
1301
1302 sis = swap_info[swp_type(entry)];
1303 *bdev = sis->bdev;
1304
1305 offset = swp_offset(entry);
1306 start_se = sis->curr_swap_extent;
1307 se = start_se;
1308
1309 for ( ; ; ) {
1310 struct list_head *lh;
1311
1312 if (se->start_page <= offset &&
1313 offset < (se->start_page + se->nr_pages)) {
1314 return se->start_block + (offset - se->start_page);
1315 }
1316 lh = se->list.next;
1317 se = list_entry(lh, struct swap_extent, list);
1318 sis->curr_swap_extent = se;
1319 BUG_ON(se == start_se);
1320 }
1321}
1322
1323
1324
1325
1326sector_t map_swap_page(struct page *page, struct block_device **bdev)
1327{
1328 swp_entry_t entry;
1329 entry.val = page_private(page);
1330 return map_swap_entry(entry, bdev);
1331}
1332
1333
1334
1335
1336static void destroy_swap_extents(struct swap_info_struct *sis)
1337{
1338 while (!list_empty(&sis->first_swap_extent.list)) {
1339 struct swap_extent *se;
1340
1341 se = list_entry(sis->first_swap_extent.list.next,
1342 struct swap_extent, list);
1343 list_del(&se->list);
1344 kfree(se);
1345 }
1346}
1347
1348
1349
1350
1351
1352
1353
1354static int
1355add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1356 unsigned long nr_pages, sector_t start_block)
1357{
1358 struct swap_extent *se;
1359 struct swap_extent *new_se;
1360 struct list_head *lh;
1361
1362 if (start_page == 0) {
1363 se = &sis->first_swap_extent;
1364 sis->curr_swap_extent = se;
1365 se->start_page = 0;
1366 se->nr_pages = nr_pages;
1367 se->start_block = start_block;
1368 return 1;
1369 } else {
1370 lh = sis->first_swap_extent.list.prev;
1371 se = list_entry(lh, struct swap_extent, list);
1372 BUG_ON(se->start_page + se->nr_pages != start_page);
1373 if (se->start_block + se->nr_pages == start_block) {
1374
1375 se->nr_pages += nr_pages;
1376 return 0;
1377 }
1378 }
1379
1380
1381
1382
1383 new_se = kmalloc(sizeof(*se), GFP_KERNEL);
1384 if (new_se == NULL)
1385 return -ENOMEM;
1386 new_se->start_page = start_page;
1387 new_se->nr_pages = nr_pages;
1388 new_se->start_block = start_block;
1389
1390 list_add_tail(&new_se->list, &sis->first_swap_extent.list);
1391 return 1;
1392}
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
1426{
1427 struct inode *inode;
1428 unsigned blocks_per_page;
1429 unsigned long page_no;
1430 unsigned blkbits;
1431 sector_t probe_block;
1432 sector_t last_block;
1433 sector_t lowest_block = -1;
1434 sector_t highest_block = 0;
1435 int nr_extents = 0;
1436 int ret;
1437
1438 inode = sis->swap_file->f_mapping->host;
1439 if (S_ISBLK(inode->i_mode)) {
1440 ret = add_swap_extent(sis, 0, sis->max, 0);
1441 *span = sis->pages;
1442 goto out;
1443 }
1444
1445 blkbits = inode->i_blkbits;
1446 blocks_per_page = PAGE_SIZE >> blkbits;
1447
1448
1449
1450
1451
1452 probe_block = 0;
1453 page_no = 0;
1454 last_block = i_size_read(inode) >> blkbits;
1455 while ((probe_block + blocks_per_page) <= last_block &&
1456 page_no < sis->max) {
1457 unsigned block_in_page;
1458 sector_t first_block;
1459
1460 first_block = bmap(inode, probe_block);
1461 if (first_block == 0)
1462 goto bad_bmap;
1463
1464
1465
1466
1467 if (first_block & (blocks_per_page - 1)) {
1468 probe_block++;
1469 goto reprobe;
1470 }
1471
1472 for (block_in_page = 1; block_in_page < blocks_per_page;
1473 block_in_page++) {
1474 sector_t block;
1475
1476 block = bmap(inode, probe_block + block_in_page);
1477 if (block == 0)
1478 goto bad_bmap;
1479 if (block != first_block + block_in_page) {
1480
1481 probe_block++;
1482 goto reprobe;
1483 }
1484 }
1485
1486 first_block >>= (PAGE_SHIFT - blkbits);
1487 if (page_no) {
1488 if (first_block < lowest_block)
1489 lowest_block = first_block;
1490 if (first_block > highest_block)
1491 highest_block = first_block;
1492 }
1493
1494
1495
1496
1497 ret = add_swap_extent(sis, page_no, 1, first_block);
1498 if (ret < 0)
1499 goto out;
1500 nr_extents += ret;
1501 page_no++;
1502 probe_block += blocks_per_page;
1503reprobe:
1504 continue;
1505 }
1506 ret = nr_extents;
1507 *span = 1 + highest_block - lowest_block;
1508 if (page_no == 0)
1509 page_no = 1;
1510 sis->max = page_no;
1511 sis->pages = page_no - 1;
1512 sis->highest_bit = page_no - 1;
1513out:
1514 return ret;
1515bad_bmap:
1516 printk(KERN_ERR "swapon: swapfile has holes\n");
1517 ret = -EINVAL;
1518 goto out;
1519}
1520
1521static void enable_swap_info(struct swap_info_struct *p, int prio,
1522 unsigned char *swap_map)
1523{
1524 int i, prev;
1525
1526 spin_lock(&swap_lock);
1527 if (prio >= 0)
1528 p->prio = prio;
1529 else
1530 p->prio = --least_priority;
1531 p->swap_map = swap_map;
1532 p->flags |= SWP_WRITEOK;
1533 nr_swap_pages += p->pages;
1534 total_swap_pages += p->pages;
1535
1536
1537 prev = -1;
1538 for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
1539 if (p->prio >= swap_info[i]->prio)
1540 break;
1541 prev = i;
1542 }
1543 p->next = i;
1544 if (prev < 0)
1545 swap_list.head = swap_list.next = p->type;
1546 else
1547 swap_info[prev]->next = p->type;
1548 spin_unlock(&swap_lock);
1549}
1550
1551SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1552{
1553 struct swap_info_struct *p = NULL;
1554 unsigned char *swap_map;
1555 struct file *swap_file, *victim;
1556 struct address_space *mapping;
1557 struct inode *inode;
1558 char *pathname;
1559 int oom_score_adj;
1560 int i, type, prev;
1561 int err;
1562
1563 if (!capable(CAP_SYS_ADMIN))
1564 return -EPERM;
1565
1566 pathname = getname(specialfile);
1567 err = PTR_ERR(pathname);
1568 if (IS_ERR(pathname))
1569 goto out;
1570
1571 victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0);
1572 putname(pathname);
1573 err = PTR_ERR(victim);
1574 if (IS_ERR(victim))
1575 goto out;
1576
1577 mapping = victim->f_mapping;
1578 prev = -1;
1579 spin_lock(&swap_lock);
1580 for (type = swap_list.head; type >= 0; type = swap_info[type]->next) {
1581 p = swap_info[type];
1582 if (p->flags & SWP_WRITEOK) {
1583 if (p->swap_file->f_mapping == mapping)
1584 break;
1585 }
1586 prev = type;
1587 }
1588 if (type < 0) {
1589 err = -EINVAL;
1590 spin_unlock(&swap_lock);
1591 goto out_dput;
1592 }
1593 if (!security_vm_enough_memory(p->pages))
1594 vm_unacct_memory(p->pages);
1595 else {
1596 err = -ENOMEM;
1597 spin_unlock(&swap_lock);
1598 goto out_dput;
1599 }
1600 if (prev < 0)
1601 swap_list.head = p->next;
1602 else
1603 swap_info[prev]->next = p->next;
1604 if (type == swap_list.next) {
1605
1606 swap_list.next = swap_list.head;
1607 }
1608 if (p->prio < 0) {
1609 for (i = p->next; i >= 0; i = swap_info[i]->next)
1610 swap_info[i]->prio = p->prio--;
1611 least_priority++;
1612 }
1613 nr_swap_pages -= p->pages;
1614 total_swap_pages -= p->pages;
1615 p->flags &= ~SWP_WRITEOK;
1616 spin_unlock(&swap_lock);
1617
1618 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
1619 err = try_to_unuse(type);
1620 test_set_oom_score_adj(oom_score_adj);
1621
1622 if (err) {
1623
1624
1625
1626
1627
1628
1629
1630 enable_swap_info(p, p->prio, p->swap_map);
1631 goto out_dput;
1632 }
1633
1634 destroy_swap_extents(p);
1635 if (p->flags & SWP_CONTINUED)
1636 free_swap_count_continuations(p);
1637
1638 mutex_lock(&swapon_mutex);
1639 spin_lock(&swap_lock);
1640 drain_mmlist();
1641
1642
1643 p->highest_bit = 0;
1644 while (p->flags >= SWP_SCANNING) {
1645 spin_unlock(&swap_lock);
1646 schedule_timeout_uninterruptible(1);
1647 spin_lock(&swap_lock);
1648 }
1649
1650 swap_file = p->swap_file;
1651 p->swap_file = NULL;
1652 p->max = 0;
1653 swap_map = p->swap_map;
1654 p->swap_map = NULL;
1655 p->flags = 0;
1656 spin_unlock(&swap_lock);
1657 mutex_unlock(&swapon_mutex);
1658 vfree(swap_map);
1659
1660 swap_cgroup_swapoff(type);
1661
1662 inode = mapping->host;
1663 if (S_ISBLK(inode->i_mode)) {
1664 struct block_device *bdev = I_BDEV(inode);
1665 set_blocksize(bdev, p->old_block_size);
1666 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1667 } else {
1668 mutex_lock(&inode->i_mutex);
1669 inode->i_flags &= ~S_SWAPFILE;
1670 mutex_unlock(&inode->i_mutex);
1671 }
1672 filp_close(swap_file, NULL);
1673 err = 0;
1674 atomic_inc(&proc_poll_event);
1675 wake_up_interruptible(&proc_poll_wait);
1676
1677out_dput:
1678 filp_close(victim, NULL);
1679out:
1680 return err;
1681}
1682
1683#ifdef CONFIG_PROC_FS
1684static unsigned swaps_poll(struct file *file, poll_table *wait)
1685{
1686 struct seq_file *seq = file->private_data;
1687
1688 poll_wait(file, &proc_poll_wait, wait);
1689
1690 if (seq->poll_event != atomic_read(&proc_poll_event)) {
1691 seq->poll_event = atomic_read(&proc_poll_event);
1692 return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
1693 }
1694
1695 return POLLIN | POLLRDNORM;
1696}
1697
1698
1699static void *swap_start(struct seq_file *swap, loff_t *pos)
1700{
1701 struct swap_info_struct *si;
1702 int type;
1703 loff_t l = *pos;
1704
1705 mutex_lock(&swapon_mutex);
1706
1707 if (!l)
1708 return SEQ_START_TOKEN;
1709
1710 for (type = 0; type < nr_swapfiles; type++) {
1711 smp_rmb();
1712 si = swap_info[type];
1713 if (!(si->flags & SWP_USED) || !si->swap_map)
1714 continue;
1715 if (!--l)
1716 return si;
1717 }
1718
1719 return NULL;
1720}
1721
1722static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
1723{
1724 struct swap_info_struct *si = v;
1725 int type;
1726
1727 if (v == SEQ_START_TOKEN)
1728 type = 0;
1729 else
1730 type = si->type + 1;
1731
1732 for (; type < nr_swapfiles; type++) {
1733 smp_rmb();
1734 si = swap_info[type];
1735 if (!(si->flags & SWP_USED) || !si->swap_map)
1736 continue;
1737 ++*pos;
1738 return si;
1739 }
1740
1741 return NULL;
1742}
1743
/*
 * seq_file ->stop for /proc/swaps: release the swapon_mutex that
 * swap_start() acquired.  Neither argument is used.
 */
static void swap_stop(struct seq_file *swap, void *v)
{
	mutex_unlock(&swapon_mutex);
}
1748
1749static int swap_show(struct seq_file *swap, void *v)
1750{
1751 struct swap_info_struct *si = v;
1752 struct file *file;
1753 int len;
1754
1755 if (si == SEQ_START_TOKEN) {
1756 seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
1757 return 0;
1758 }
1759
1760 file = si->swap_file;
1761 len = seq_path(swap, &file->f_path, " \t\n\\");
1762 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
1763 len < 40 ? 40 - len : 1, " ",
1764 S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
1765 "partition" : "file\t",
1766 si->pages << (PAGE_SHIFT - 10),
1767 si->inuse_pages << (PAGE_SHIFT - 10),
1768 si->prio);
1769 return 0;
1770}
1771
1772static const struct seq_operations swaps_op = {
1773 .start = swap_start,
1774 .next = swap_next,
1775 .stop = swap_stop,
1776 .show = swap_show
1777};
1778
1779static int swaps_open(struct inode *inode, struct file *file)
1780{
1781 struct seq_file *seq;
1782 int ret;
1783
1784 ret = seq_open(file, &swaps_op);
1785 if (ret)
1786 return ret;
1787
1788 seq = file->private_data;
1789 seq->poll_event = atomic_read(&proc_poll_event);
1790 return 0;
1791}
1792
1793static const struct file_operations proc_swaps_operations = {
1794 .open = swaps_open,
1795 .read = seq_read,
1796 .llseek = seq_lseek,
1797 .release = seq_release,
1798 .poll = swaps_poll,
1799};
1800
/*
 * Register the "swaps" entry at the top level of procfs (NULL parent,
 * mode 0) using proc_swaps_operations.  The result of proc_create() is
 * not checked; the initcall always reports success.
 */
static int __init procswaps_init(void)
{
	proc_create("swaps", 0, NULL, &proc_swaps_operations);
	return 0;
}
__initcall(procswaps_init);
1807#endif
1808
1809#ifdef MAX_SWAPFILES_CHECK
/*
 * Expand MAX_SWAPFILES_CHECK() inside a late initcall.  This is only
 * built when the macro is defined.
 * NOTE(review): presumably the macro is a sanity check on MAX_SWAPFILES;
 * confirm against its definition.
 */
static int __init max_swapfiles_check(void)
{
	MAX_SWAPFILES_CHECK();
	return 0;
}
late_initcall(max_swapfiles_check);
1816#endif
1817
1818static struct swap_info_struct *alloc_swap_info(void)
1819{
1820 struct swap_info_struct *p;
1821 unsigned int type;
1822
1823 p = kzalloc(sizeof(*p), GFP_KERNEL);
1824 if (!p)
1825 return ERR_PTR(-ENOMEM);
1826
1827 spin_lock(&swap_lock);
1828 for (type = 0; type < nr_swapfiles; type++) {
1829 if (!(swap_info[type]->flags & SWP_USED))
1830 break;
1831 }
1832 if (type >= MAX_SWAPFILES) {
1833 spin_unlock(&swap_lock);
1834 kfree(p);
1835 return ERR_PTR(-EPERM);
1836 }
1837 if (type >= nr_swapfiles) {
1838 p->type = type;
1839 swap_info[type] = p;
1840
1841
1842
1843
1844
1845 smp_wmb();
1846 nr_swapfiles++;
1847 } else {
1848 kfree(p);
1849 p = swap_info[type];
1850
1851
1852
1853
1854 }
1855 INIT_LIST_HEAD(&p->first_swap_extent.list);
1856 p->flags = SWP_USED;
1857 p->next = -1;
1858 spin_unlock(&swap_lock);
1859
1860 return p;
1861}
1862
1863static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
1864{
1865 int error;
1866
1867 if (S_ISBLK(inode->i_mode)) {
1868 p->bdev = bdgrab(I_BDEV(inode));
1869 error = blkdev_get(p->bdev,
1870 FMODE_READ | FMODE_WRITE | FMODE_EXCL,
1871 sys_swapon);
1872 if (error < 0) {
1873 p->bdev = NULL;
1874 return -EINVAL;
1875 }
1876 p->old_block_size = block_size(p->bdev);
1877 error = set_blocksize(p->bdev, PAGE_SIZE);
1878 if (error < 0)
1879 return error;
1880 p->flags |= SWP_BLKDEV;
1881 } else if (S_ISREG(inode->i_mode)) {
1882 p->bdev = inode->i_sb->s_bdev;
1883 mutex_lock(&inode->i_mutex);
1884 if (IS_SWAPFILE(inode))
1885 return -EBUSY;
1886 } else
1887 return -EINVAL;
1888
1889 return 0;
1890}
1891
1892static unsigned long read_swap_header(struct swap_info_struct *p,
1893 union swap_header *swap_header,
1894 struct inode *inode)
1895{
1896 int i;
1897 unsigned long maxpages;
1898 unsigned long swapfilepages;
1899
1900 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
1901 printk(KERN_ERR "Unable to find swap-space signature\n");
1902 return 0;
1903 }
1904
1905
1906 if (swab32(swap_header->info.version) == 1) {
1907 swab32s(&swap_header->info.version);
1908 swab32s(&swap_header->info.last_page);
1909 swab32s(&swap_header->info.nr_badpages);
1910 for (i = 0; i < swap_header->info.nr_badpages; i++)
1911 swab32s(&swap_header->info.badpages[i]);
1912 }
1913
1914 if (swap_header->info.version != 1) {
1915 printk(KERN_WARNING
1916 "Unable to handle swap header version %d\n",
1917 swap_header->info.version);
1918 return 0;
1919 }
1920
1921 p->lowest_bit = 1;
1922 p->cluster_next = 1;
1923 p->cluster_nr = 0;
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940 maxpages = swp_offset(pte_to_swp_entry(
1941 swp_entry_to_pte(swp_entry(0, ~0UL))));
1942 maxpages = swp_offset(radix_to_swp_entry(
1943 swp_to_radix_entry(swp_entry(0, maxpages)))) + 1;
1944
1945 if (maxpages > swap_header->info.last_page) {
1946 maxpages = swap_header->info.last_page + 1;
1947
1948 if ((unsigned int)maxpages == 0)
1949 maxpages = UINT_MAX;
1950 }
1951 p->highest_bit = maxpages - 1;
1952
1953 if (!maxpages)
1954 return 0;
1955 swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
1956 if (swapfilepages && maxpages > swapfilepages) {
1957 printk(KERN_WARNING
1958 "Swap area shorter than signature indicates\n");
1959 return 0;
1960 }
1961 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
1962 return 0;
1963 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
1964 return 0;
1965
1966 return maxpages;
1967}
1968
1969static int setup_swap_map_and_extents(struct swap_info_struct *p,
1970 union swap_header *swap_header,
1971 unsigned char *swap_map,
1972 unsigned long maxpages,
1973 sector_t *span)
1974{
1975 int i;
1976 unsigned int nr_good_pages;
1977 int nr_extents;
1978
1979 nr_good_pages = maxpages - 1;
1980
1981 for (i = 0; i < swap_header->info.nr_badpages; i++) {
1982 unsigned int page_nr = swap_header->info.badpages[i];
1983 if (page_nr == 0 || page_nr > swap_header->info.last_page)
1984 return -EINVAL;
1985 if (page_nr < maxpages) {
1986 swap_map[page_nr] = SWAP_MAP_BAD;
1987 nr_good_pages--;
1988 }
1989 }
1990
1991 if (nr_good_pages) {
1992 swap_map[0] = SWAP_MAP_BAD;
1993 p->max = maxpages;
1994 p->pages = nr_good_pages;
1995 nr_extents = setup_swap_extents(p, span);
1996 if (nr_extents < 0)
1997 return nr_extents;
1998 nr_good_pages = p->pages;
1999 }
2000 if (!nr_good_pages) {
2001 printk(KERN_WARNING "Empty swap-file\n");
2002 return -EINVAL;
2003 }
2004
2005 return nr_extents;
2006}
2007
2008SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2009{
2010 struct swap_info_struct *p;
2011 char *name;
2012 struct file *swap_file = NULL;
2013 struct address_space *mapping;
2014 int i;
2015 int prio;
2016 int error;
2017 union swap_header *swap_header;
2018 int nr_extents;
2019 sector_t span;
2020 unsigned long maxpages;
2021 unsigned char *swap_map = NULL;
2022 struct page *page = NULL;
2023 struct inode *inode = NULL;
2024
2025 if (!capable(CAP_SYS_ADMIN))
2026 return -EPERM;
2027
2028 p = alloc_swap_info();
2029 if (IS_ERR(p))
2030 return PTR_ERR(p);
2031
2032 name = getname(specialfile);
2033 if (IS_ERR(name)) {
2034 error = PTR_ERR(name);
2035 name = NULL;
2036 goto bad_swap;
2037 }
2038 swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
2039 if (IS_ERR(swap_file)) {
2040 error = PTR_ERR(swap_file);
2041 swap_file = NULL;
2042 goto bad_swap;
2043 }
2044
2045 p->swap_file = swap_file;
2046 mapping = swap_file->f_mapping;
2047
2048 for (i = 0; i < nr_swapfiles; i++) {
2049 struct swap_info_struct *q = swap_info[i];
2050
2051 if (q == p || !q->swap_file)
2052 continue;
2053 if (mapping == q->swap_file->f_mapping) {
2054 error = -EBUSY;
2055 goto bad_swap;
2056 }
2057 }
2058
2059 inode = mapping->host;
2060
2061 error = claim_swapfile(p, inode);
2062 if (unlikely(error))
2063 goto bad_swap;
2064
2065
2066
2067
2068 if (!mapping->a_ops->readpage) {
2069 error = -EINVAL;
2070 goto bad_swap;
2071 }
2072 page = read_mapping_page(mapping, 0, swap_file);
2073 if (IS_ERR(page)) {
2074 error = PTR_ERR(page);
2075 goto bad_swap;
2076 }
2077 swap_header = kmap(page);
2078
2079 maxpages = read_swap_header(p, swap_header, inode);
2080 if (unlikely(!maxpages)) {
2081 error = -EINVAL;
2082 goto bad_swap;
2083 }
2084
2085
2086 swap_map = vzalloc(maxpages);
2087 if (!swap_map) {
2088 error = -ENOMEM;
2089 goto bad_swap;
2090 }
2091
2092 error = swap_cgroup_swapon(p->type, maxpages);
2093 if (error)
2094 goto bad_swap;
2095
2096 nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
2097 maxpages, &span);
2098 if (unlikely(nr_extents < 0)) {
2099 error = nr_extents;
2100 goto bad_swap;
2101 }
2102
2103 if (p->bdev) {
2104 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
2105 p->flags |= SWP_SOLIDSTATE;
2106 p->cluster_next = 1 + (random32() % p->highest_bit);
2107 }
2108 if (discard_swap(p) == 0 && (swap_flags & SWAP_FLAG_DISCARD))
2109 p->flags |= SWP_DISCARDABLE;
2110 }
2111
2112 mutex_lock(&swapon_mutex);
2113 prio = -1;
2114 if (swap_flags & SWAP_FLAG_PREFER)
2115 prio =
2116 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
2117 enable_swap_info(p, prio, swap_map);
2118
2119 printk(KERN_INFO "Adding %uk swap on %s. "
2120 "Priority:%d extents:%d across:%lluk %s%s\n",
2121 p->pages<<(PAGE_SHIFT-10), name, p->prio,
2122 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
2123 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
2124 (p->flags & SWP_DISCARDABLE) ? "D" : "");
2125
2126 mutex_unlock(&swapon_mutex);
2127 atomic_inc(&proc_poll_event);
2128 wake_up_interruptible(&proc_poll_wait);
2129
2130 if (S_ISREG(inode->i_mode))
2131 inode->i_flags |= S_SWAPFILE;
2132 error = 0;
2133 goto out;
2134bad_swap:
2135 if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
2136 set_blocksize(p->bdev, p->old_block_size);
2137 blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2138 }
2139 destroy_swap_extents(p);
2140 swap_cgroup_swapoff(p->type);
2141 spin_lock(&swap_lock);
2142 p->swap_file = NULL;
2143 p->flags = 0;
2144 spin_unlock(&swap_lock);
2145 vfree(swap_map);
2146 if (swap_file) {
2147 if (inode && S_ISREG(inode->i_mode)) {
2148 mutex_unlock(&inode->i_mutex);
2149 inode = NULL;
2150 }
2151 filp_close(swap_file, NULL);
2152 }
2153out:
2154 if (page && !IS_ERR(page)) {
2155 kunmap(page);
2156 page_cache_release(page);
2157 }
2158 if (name)
2159 putname(name);
2160 if (inode && S_ISREG(inode->i_mode))
2161 mutex_unlock(&inode->i_mutex);
2162 return error;
2163}
2164
2165void si_swapinfo(struct sysinfo *val)
2166{
2167 unsigned int type;
2168 unsigned long nr_to_be_unused = 0;
2169
2170 spin_lock(&swap_lock);
2171 for (type = 0; type < nr_swapfiles; type++) {
2172 struct swap_info_struct *si = swap_info[type];
2173
2174 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
2175 nr_to_be_unused += si->inuse_pages;
2176 }
2177 val->freeswap = nr_swap_pages + nr_to_be_unused;
2178 val->totalswap = total_swap_pages + nr_to_be_unused;
2179 spin_unlock(&swap_lock);
2180}
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
2194{
2195 struct swap_info_struct *p;
2196 unsigned long offset, type;
2197 unsigned char count;
2198 unsigned char has_cache;
2199 int err = -EINVAL;
2200
2201 if (non_swap_entry(entry))
2202 goto out;
2203
2204 type = swp_type(entry);
2205 if (type >= nr_swapfiles)
2206 goto bad_file;
2207 p = swap_info[type];
2208 offset = swp_offset(entry);
2209
2210 spin_lock(&swap_lock);
2211 if (unlikely(offset >= p->max))
2212 goto unlock_out;
2213
2214 count = p->swap_map[offset];
2215 has_cache = count & SWAP_HAS_CACHE;
2216 count &= ~SWAP_HAS_CACHE;
2217 err = 0;
2218
2219 if (usage == SWAP_HAS_CACHE) {
2220
2221
2222 if (!has_cache && count)
2223 has_cache = SWAP_HAS_CACHE;
2224 else if (has_cache)
2225 err = -EEXIST;
2226 else
2227 err = -ENOENT;
2228
2229 } else if (count || has_cache) {
2230
2231 if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
2232 count += usage;
2233 else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
2234 err = -EINVAL;
2235 else if (swap_count_continued(p, offset, count))
2236 count = COUNT_CONTINUED;
2237 else
2238 err = -ENOMEM;
2239 } else
2240 err = -ENOENT;
2241
2242 p->swap_map[offset] = count | has_cache;
2243
2244unlock_out:
2245 spin_unlock(&swap_lock);
2246out:
2247 return err;
2248
2249bad_file:
2250 printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
2251 goto out;
2252}
2253
2254
2255
2256
2257
2258void swap_shmem_alloc(swp_entry_t entry)
2259{
2260 __swap_duplicate(entry, SWAP_MAP_SHMEM);
2261}
2262
2263
2264
2265
2266
2267
2268
2269
2270int swap_duplicate(swp_entry_t entry)
2271{
2272 int err = 0;
2273
2274 while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
2275 err = add_swap_count_continuation(entry, GFP_ATOMIC);
2276 return err;
2277}
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287int swapcache_prepare(swp_entry_t entry)
2288{
2289 return __swap_duplicate(entry, SWAP_HAS_CACHE);
2290}
2291
2292
2293
2294
2295
2296int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
2297{
2298 struct swap_info_struct *si;
2299 int our_page_cluster = page_cluster;
2300 pgoff_t target, toff;
2301 pgoff_t base, end;
2302 int nr_pages = 0;
2303
2304 if (!our_page_cluster)
2305 return 0;
2306
2307 si = swap_info[swp_type(entry)];
2308 target = swp_offset(entry);
2309 base = (target >> our_page_cluster) << our_page_cluster;
2310 end = base + (1 << our_page_cluster);
2311 if (!base)
2312 base++;
2313
2314 spin_lock(&swap_lock);
2315 if (end > si->max)
2316 end = si->max;
2317
2318
2319 for (toff = target; ++toff < end; nr_pages++) {
2320
2321 if (!si->swap_map[toff])
2322 break;
2323 if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2324 break;
2325 }
2326
2327 for (toff = target; --toff >= base; nr_pages++) {
2328
2329 if (!si->swap_map[toff])
2330 break;
2331 if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2332 break;
2333 }
2334 spin_unlock(&swap_lock);
2335
2336
2337
2338
2339
2340 *offset = ++toff;
2341 return nr_pages? ++nr_pages: 0;
2342}
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
2360{
2361 struct swap_info_struct *si;
2362 struct page *head;
2363 struct page *page;
2364 struct page *list_page;
2365 pgoff_t offset;
2366 unsigned char count;
2367
2368
2369
2370
2371
2372 page = alloc_page(gfp_mask | __GFP_HIGHMEM);
2373
2374 si = swap_info_get(entry);
2375 if (!si) {
2376
2377
2378
2379
2380
2381 goto outer;
2382 }
2383
2384 offset = swp_offset(entry);
2385 count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
2386
2387 if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
2388
2389
2390
2391
2392
2393 goto out;
2394 }
2395
2396 if (!page) {
2397 spin_unlock(&swap_lock);
2398 return -ENOMEM;
2399 }
2400
2401
2402
2403
2404
2405
2406 head = vmalloc_to_page(si->swap_map + offset);
2407 offset &= ~PAGE_MASK;
2408
2409
2410
2411
2412
2413 if (!page_private(head)) {
2414 BUG_ON(count & COUNT_CONTINUED);
2415 INIT_LIST_HEAD(&head->lru);
2416 set_page_private(head, SWP_CONTINUED);
2417 si->flags |= SWP_CONTINUED;
2418 }
2419
2420 list_for_each_entry(list_page, &head->lru, lru) {
2421 unsigned char *map;
2422
2423
2424
2425
2426
2427 if (!(count & COUNT_CONTINUED))
2428 goto out;
2429
2430 map = kmap_atomic(list_page, KM_USER0) + offset;
2431 count = *map;
2432 kunmap_atomic(map, KM_USER0);
2433
2434
2435
2436
2437
2438 if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
2439 goto out;
2440 }
2441
2442 list_add_tail(&page->lru, &head->lru);
2443 page = NULL;
2444out:
2445 spin_unlock(&swap_lock);
2446outer:
2447 if (page)
2448 __free_page(page);
2449 return 0;
2450}
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460static bool swap_count_continued(struct swap_info_struct *si,
2461 pgoff_t offset, unsigned char count)
2462{
2463 struct page *head;
2464 struct page *page;
2465 unsigned char *map;
2466
2467 head = vmalloc_to_page(si->swap_map + offset);
2468 if (page_private(head) != SWP_CONTINUED) {
2469 BUG_ON(count & COUNT_CONTINUED);
2470 return false;
2471 }
2472
2473 offset &= ~PAGE_MASK;
2474 page = list_entry(head->lru.next, struct page, lru);
2475 map = kmap_atomic(page, KM_USER0) + offset;
2476
2477 if (count == SWAP_MAP_MAX)
2478 goto init_map;
2479
2480 if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) {
2481
2482
2483
2484 while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
2485 kunmap_atomic(map, KM_USER0);
2486 page = list_entry(page->lru.next, struct page, lru);
2487 BUG_ON(page == head);
2488 map = kmap_atomic(page, KM_USER0) + offset;
2489 }
2490 if (*map == SWAP_CONT_MAX) {
2491 kunmap_atomic(map, KM_USER0);
2492 page = list_entry(page->lru.next, struct page, lru);
2493 if (page == head)
2494 return false;
2495 map = kmap_atomic(page, KM_USER0) + offset;
2496init_map: *map = 0;
2497 }
2498 *map += 1;
2499 kunmap_atomic(map, KM_USER0);
2500 page = list_entry(page->lru.prev, struct page, lru);
2501 while (page != head) {
2502 map = kmap_atomic(page, KM_USER0) + offset;
2503 *map = COUNT_CONTINUED;
2504 kunmap_atomic(map, KM_USER0);
2505 page = list_entry(page->lru.prev, struct page, lru);
2506 }
2507 return true;
2508
2509 } else {
2510
2511
2512
2513 BUG_ON(count != COUNT_CONTINUED);
2514 while (*map == COUNT_CONTINUED) {
2515 kunmap_atomic(map, KM_USER0);
2516 page = list_entry(page->lru.next, struct page, lru);
2517 BUG_ON(page == head);
2518 map = kmap_atomic(page, KM_USER0) + offset;
2519 }
2520 BUG_ON(*map == 0);
2521 *map -= 1;
2522 if (*map == 0)
2523 count = 0;
2524 kunmap_atomic(map, KM_USER0);
2525 page = list_entry(page->lru.prev, struct page, lru);
2526 while (page != head) {
2527 map = kmap_atomic(page, KM_USER0) + offset;
2528 *map = SWAP_CONT_MAX | count;
2529 count = COUNT_CONTINUED;
2530 kunmap_atomic(map, KM_USER0);
2531 page = list_entry(page->lru.prev, struct page, lru);
2532 }
2533 return count == COUNT_CONTINUED;
2534 }
2535}
2536
2537
2538
2539
2540
2541static void free_swap_count_continuations(struct swap_info_struct *si)
2542{
2543 pgoff_t offset;
2544
2545 for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
2546 struct page *head;
2547 head = vmalloc_to_page(si->swap_map + offset);
2548 if (page_private(head)) {
2549 struct list_head *this, *next;
2550 list_for_each_safe(this, next, &head->lru) {
2551 struct page *page;
2552 page = list_entry(this, struct page, lru);
2553 list_del(this);
2554 __free_page(page);
2555 }
2556 }
2557 }
2558}
2559