1
2
3
4
5#include <linux/list.h>
6#include <linux/init.h>
7#include <linux/module.h>
8#include <linux/mm.h>
9#include <linux/seq_file.h>
10#include <linux/sysctl.h>
11#include <linux/highmem.h>
12#include <linux/mmu_notifier.h>
13#include <linux/nodemask.h>
14#include <linux/pagemap.h>
15#include <linux/mempolicy.h>
16#include <linux/cpuset.h>
17#include <linux/mutex.h>
18#include <linux/bootmem.h>
19#include <linux/sysfs.h>
20#include <linux/slab.h>
21#include <linux/rmap.h>
22#include <linux/swap.h>
23#include <linux/swapops.h>
24
25#include <asm/page.h>
26#include <asm/pgtable.h>
27#include <asm/tlb.h>
28
29#include <linux/io.h>
30#include <linux/hugetlb.h>
31#include <linux/hugetlb_cgroup.h>
32#include <linux/node.h>
33#include <linux/hugetlb_cgroup.h>
34#include "internal.h"
35
/* NOTE(review): presumably used as lower/upper bounds by sysctl handlers defined elsewhere — confirm. */
const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
/* GFP mask OR-ed into every huge page allocation request made in this file. */
static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
/* NOTE(review): not referenced in the visible part of the file; presumably a sysctl knob — confirm. */
unsigned long hugepages_treat_as_movable;

/* Table of huge page size states and related indices. */
int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];

/* Boot-time huge pages queued by alloc_bootmem_huge_page() until gather_bootmem_prealloc() runs. */
__initdata LIST_HEAD(huge_boot_pages);

/* NOTE(review): presumably filled in by command-line parsing code outside this view — confirm. */
static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
static unsigned long __initdata default_hstate_size;

/*
 * Taken in this file around updates to the hstate counters and the
 * free/active huge page lists.
 */
DEFINE_SPINLOCK(hugetlb_lock);
55
56static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
57{
58 bool free = (spool->count == 0) && (spool->used_hpages == 0);
59
60 spin_unlock(&spool->lock);
61
62
63
64 if (free)
65 kfree(spool);
66}
67
68struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
69{
70 struct hugepage_subpool *spool;
71
72 spool = kmalloc(sizeof(*spool), GFP_KERNEL);
73 if (!spool)
74 return NULL;
75
76 spin_lock_init(&spool->lock);
77 spool->count = 1;
78 spool->max_hpages = nr_blocks;
79 spool->used_hpages = 0;
80
81 return spool;
82}
83
84void hugepage_put_subpool(struct hugepage_subpool *spool)
85{
86 spin_lock(&spool->lock);
87 BUG_ON(!spool->count);
88 spool->count--;
89 unlock_or_release_subpool(spool);
90}
91
92static int hugepage_subpool_get_pages(struct hugepage_subpool *spool,
93 long delta)
94{
95 int ret = 0;
96
97 if (!spool)
98 return 0;
99
100 spin_lock(&spool->lock);
101 if ((spool->used_hpages + delta) <= spool->max_hpages) {
102 spool->used_hpages += delta;
103 } else {
104 ret = -ENOMEM;
105 }
106 spin_unlock(&spool->lock);
107
108 return ret;
109}
110
111static void hugepage_subpool_put_pages(struct hugepage_subpool *spool,
112 long delta)
113{
114 if (!spool)
115 return;
116
117 spin_lock(&spool->lock);
118 spool->used_hpages -= delta;
119
120
121 unlock_or_release_subpool(spool);
122}
123
124static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
125{
126 return HUGETLBFS_SB(inode->i_sb)->spool;
127}
128
129static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
130{
131 return subpool_inode(vma->vm_file->f_dentry->d_inode);
132}
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
/*
 * One entry in a region list: the half-open range [from, to).
 * The region_*() helpers walk these lists in ascending order and break out
 * at the first match, so the lists are expected to be kept sorted.
 */
struct file_region {
 struct list_head link;
 long from;
 long to;
};
153
/*
 * Record the range [f, t) in the region list at head, merging it with every
 * existing region it overlaps or abuts. Always returns 0.
 *
 * NOTE(review): the entry found by the first walk is dereferenced without
 * checking whether the walk ran off the end of the list, so this presumably
 * relies on an earlier region_chg() for the same range having guaranteed
 * that a region at or after f exists — confirm against callers.
 */
static long region_add(struct list_head *head, long f, long t)
{
 struct file_region *rg, *nrg, *trg;

 /* Locate the first region that ends at or after f. */
 list_for_each_entry(rg, head, link)
 if (f <= rg->to)
 break;

 /* If that region starts before f, pull our left edge back to it. */
 if (f > rg->from)
 f = rg->from;

 /* nrg is reused to hold the merged range; absorb what follows it. */
 nrg = rg;
 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
 if (&rg->link == head)
 break;
 if (rg->from > t)
 break;

 /*
  * This region touches [f, t): extend t over it and, unless it is
  * the region being reused, unlink and free it.
  */
 if (rg->to > t)
 t = rg->to;
 if (rg != nrg) {
 list_del(&rg->link);
 kfree(rg);
 }
 }
 nrg->from = f;
 nrg->to = t;
 return 0;
}
189
/*
 * Compute how many units of [f, t) are not yet covered by the region list.
 *
 * Returns that count, or -ENOMEM if a placeholder entry could not be
 * allocated. When nothing in the list touches the range, a zero-length
 * placeholder [f, f) is inserted at the right position; otherwise the list
 * is left unmodified.
 */
static long region_chg(struct list_head *head, long f, long t)
{
 struct file_region *rg, *nrg;
 long chg = 0;

 /* Locate the first region that ends at or after f. */
 list_for_each_entry(rg, head, link)
 if (f <= rg->to)
 break;

 /*
  * Either the walk ran off the end of the list or the region found lies
  * entirely beyond t: nothing overlaps. Insert an empty placeholder just
  * before rg and report the whole range as new.
  */
 if (&rg->link == head || t < rg->from) {
 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
 if (!nrg)
 return -ENOMEM;
 nrg->from = f;
 nrg->to = f;
 INIT_LIST_HEAD(&nrg->link);
 list_add(&nrg->link, rg->link.prev);

 return t - f;
 }

 /* Round our left edge down to the start of the region we fall in. */
 if (f > rg->from)
 f = rg->from;
 chg = t - f;

 /* Subtract everything already covered by this and later regions. */
 list_for_each_entry(rg, rg->link.prev, link) {
 if (&rg->link == head)
 break;
 if (rg->from > t)
 return chg;

 /*
  * A region reaching past t widens the merged range, so account for
  * the extension before subtracting the region's own length.
  */
 if (rg->to > t) {
 chg += rg->to - t;
 t = rg->to;
 }
 chg -= rg->to - rg->from;
 }
 return chg;
}
238
/*
 * Remove everything at or beyond offset end from the region list.
 * Returns the total length that was removed.
 */
static long region_truncate(struct list_head *head, long end)
{
 struct file_region *rg, *trg;
 long chg = 0;

 /* Locate the first region that ends at or after end. */
 list_for_each_entry(rg, head, link)
 if (end <= rg->to)
 break;
 if (&rg->link == head)
 return 0;

 /* end falls inside this region: trim its tail and move to the next one. */
 if (end > rg->from) {
 chg = rg->to - end;
 rg->to = end;
 rg = list_entry(rg->link.next, typeof(*rg), link);
 }

 /* Unlink and free every remaining region, summing the lengths. */
 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
 if (&rg->link == head)
 break;
 chg += rg->to - rg->from;
 list_del(&rg->link);
 kfree(rg);
 }
 return chg;
}
268
269static long region_count(struct list_head *head, long f, long t)
270{
271 struct file_region *rg;
272 long chg = 0;
273
274
275 list_for_each_entry(rg, head, link) {
276 long seg_from;
277 long seg_to;
278
279 if (rg->to <= f)
280 continue;
281 if (rg->from >= t)
282 break;
283
284 seg_from = max(rg->from, f);
285 seg_to = min(rg->to, t);
286
287 chg += seg_to - seg_from;
288 }
289
290 return chg;
291}
292
293
294
295
296
297static pgoff_t vma_hugecache_offset(struct hstate *h,
298 struct vm_area_struct *vma, unsigned long address)
299{
300 return ((address - vma->vm_start) >> huge_page_shift(h)) +
301 (vma->vm_pgoff >> huge_page_order(h));
302}
303
304pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
305 unsigned long address)
306{
307 return vma_hugecache_offset(hstate_vma(vma), vma, address);
308}
309
310
311
312
313
314unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
315{
316 struct hstate *hstate;
317
318 if (!is_vm_hugetlb_page(vma))
319 return PAGE_SIZE;
320
321 hstate = hstate_vma(vma);
322
323 return 1UL << (hstate->order + PAGE_SHIFT);
324}
325EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
326
327
328
329
330
331
332
/*
 * Default vma_mmu_pagesize(): same value as vma_kernel_pagesize().
 * Only compiled when vma_mmu_pagesize is not already defined as a macro.
 */
#ifndef vma_mmu_pagesize
unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
	unsigned long size = vma_kernel_pagesize(vma);

	return size;
}
#endif
339
340
341
342
343
344
/*
 * Flag bits kept in the low bits of vma->vm_private_data for private
 * (non-VM_MAYSHARE) hugetlb mappings; the remaining bits hold the
 * struct resv_map pointer (see vma_resv_map() / set_vma_resv_map()).
 */
#define HPAGE_RESV_OWNER (1UL << 0)
#define HPAGE_RESV_UNMAPPED (1UL << 1)
#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368static unsigned long get_vma_private_data(struct vm_area_struct *vma)
369{
370 return (unsigned long)vma->vm_private_data;
371}
372
373static void set_vma_private_data(struct vm_area_struct *vma,
374 unsigned long value)
375{
376 vma->vm_private_data = (void *)value;
377}
378
/*
 * Reference-counted reservation map: a list of file_region entries.
 * Released through resv_map_release() when the kref drops to zero.
 */
struct resv_map {
 struct kref refs;
 struct list_head regions;
};
383
384static struct resv_map *resv_map_alloc(void)
385{
386 struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
387 if (!resv_map)
388 return NULL;
389
390 kref_init(&resv_map->refs);
391 INIT_LIST_HEAD(&resv_map->regions);
392
393 return resv_map;
394}
395
396static void resv_map_release(struct kref *ref)
397{
398 struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
399
400
401 region_truncate(&resv_map->regions, 0);
402 kfree(resv_map);
403}
404
405static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
406{
407 VM_BUG_ON(!is_vm_hugetlb_page(vma));
408 if (!(vma->vm_flags & VM_MAYSHARE))
409 return (struct resv_map *)(get_vma_private_data(vma) &
410 ~HPAGE_RESV_MASK);
411 return NULL;
412}
413
414static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
415{
416 VM_BUG_ON(!is_vm_hugetlb_page(vma));
417 VM_BUG_ON(vma->vm_flags & VM_MAYSHARE);
418
419 set_vma_private_data(vma, (get_vma_private_data(vma) &
420 HPAGE_RESV_MASK) | (unsigned long)map);
421}
422
423static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
424{
425 VM_BUG_ON(!is_vm_hugetlb_page(vma));
426 VM_BUG_ON(vma->vm_flags & VM_MAYSHARE);
427
428 set_vma_private_data(vma, get_vma_private_data(vma) | flags);
429}
430
/* Return 1 if any bit of flag is set in the VMA's private data, else 0. */
static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
{
	VM_BUG_ON(!is_vm_hugetlb_page(vma));

	if (get_vma_private_data(vma) & flag)
		return 1;
	return 0;
}
437
438
439static void decrement_hugepage_resv_vma(struct hstate *h,
440 struct vm_area_struct *vma)
441{
442 if (vma->vm_flags & VM_NORESERVE)
443 return;
444
445 if (vma->vm_flags & VM_MAYSHARE) {
446
447 h->resv_huge_pages--;
448 } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
449
450
451
452
453 h->resv_huge_pages--;
454 }
455}
456
457
458void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
459{
460 VM_BUG_ON(!is_vm_hugetlb_page(vma));
461 if (!(vma->vm_flags & VM_MAYSHARE))
462 vma->vm_private_data = (void *)0;
463}
464
465
466static int vma_has_reserves(struct vm_area_struct *vma)
467{
468 if (vma->vm_flags & VM_MAYSHARE)
469 return 1;
470 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
471 return 1;
472 return 0;
473}
474
/*
 * Copy a huge page one base page at a time, stepping through both pages
 * with mem_map_next() rather than plain pointer arithmetic.
 * Calls cond_resched() before every base-page copy.
 */
static void copy_gigantic_page(struct page *dst, struct page *src)
{
	struct hstate *h = page_hstate(src);
	struct page *dst_head = dst;
	struct page *src_head = src;
	int idx = 0;

	while (idx < pages_per_huge_page(h)) {
		cond_resched();
		copy_highpage(dst, src);

		idx++;
		dst = mem_map_next(dst, dst_head, idx);
		src = mem_map_next(src, src_head, idx);
	}
}
491
492void copy_huge_page(struct page *dst, struct page *src)
493{
494 int i;
495 struct hstate *h = page_hstate(src);
496
497 if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
498 copy_gigantic_page(dst, src);
499 return;
500 }
501
502 might_sleep();
503 for (i = 0; i < pages_per_huge_page(h); i++) {
504 cond_resched();
505 copy_highpage(dst + i, src + i);
506 }
507}
508
509static void enqueue_huge_page(struct hstate *h, struct page *page)
510{
511 int nid = page_to_nid(page);
512 list_move(&page->lru, &h->hugepage_freelists[nid]);
513 h->free_huge_pages++;
514 h->free_huge_pages_node[nid]++;
515}
516
517static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
518{
519 struct page *page;
520
521 if (list_empty(&h->hugepage_freelists[nid]))
522 return NULL;
523 page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
524 list_move(&page->lru, &h->hugepage_activelist);
525 set_page_refcounted(page);
526 h->free_huge_pages--;
527 h->free_huge_pages_node[nid]--;
528 return page;
529}
530
/*
 * Take a free huge page for a fault at address in vma, honouring the
 * zonelist and nodemask returned by huge_zonelist() and the cpuset check.
 *
 * Returns the page, or NULL when no page may be taken. Unless avoid_reserve
 * is set, a successful dequeue also consumes a reservation through
 * decrement_hugepage_resv_vma(). The caller in this file holds hugetlb_lock.
 */
static struct page *dequeue_huge_page_vma(struct hstate *h,
 struct vm_area_struct *vma,
 unsigned long address, int avoid_reserve)
{
 struct page *page = NULL;
 struct mempolicy *mpol;
 nodemask_t *nodemask;
 struct zonelist *zonelist;
 struct zone *zone;
 struct zoneref *z;
 unsigned int cpuset_mems_cookie;

retry_cpuset:
 cpuset_mems_cookie = get_mems_allowed();
 zonelist = huge_zonelist(vma, address,
 htlb_alloc_mask, &mpol, &nodemask);

 /*
  * A mapping without reserves must not take a page when every free
  * page is already spoken for by a reservation.
  */
 if (!vma_has_reserves(vma) &&
 h->free_huge_pages - h->resv_huge_pages == 0)
 goto err;

 /* Same check when the caller asked us not to touch the reserves. */
 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
 goto err;

 /* First allowed zone whose node has a free huge page wins. */
 for_each_zone_zonelist_nodemask(zone, z, zonelist,
 MAX_NR_ZONES - 1, nodemask) {
 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
 page = dequeue_huge_page_node(h, zone_to_nid(zone));
 if (page) {
 if (!avoid_reserve)
 decrement_hugepage_resv_vma(h, vma);
 break;
 }
 }
 }

 mpol_cond_put(mpol);
 /*
  * Retry only when put_mems_allowed() returns false and no page was
  * obtained. NOTE(review): a false return presumably means the allowed
  * memory nodes changed during the walk — confirm.
  */
 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
 goto retry_cpuset;
 return page;

err:
 mpol_cond_put(mpol);
 return NULL;
}
581
/*
 * Remove a huge page from the pool accounting and hand it back to the page
 * allocator with __free_pages(). Must not be used for hstates whose order
 * is MAX_ORDER or above. The caller is responsible for having unlinked the
 * page from any list and for the free/surplus counters.
 */
static void update_and_free_page(struct hstate *h, struct page *page)
{
 int i;

 VM_BUG_ON(h->order >= MAX_ORDER);

 h->nr_huge_pages--;
 h->nr_huge_pages_node[page_to_nid(page)]--;
 /* Clear the listed state flags on every base page of the huge page. */
 for (i = 0; i < pages_per_huge_page(h); i++) {
 page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
 1 << PG_referenced | 1 << PG_dirty |
 1 << PG_active | 1 << PG_reserved |
 1 << PG_private | 1 << PG_writeback);
 }
 /* The page must no longer be charged to a hugetlb cgroup. */
 VM_BUG_ON(hugetlb_cgroup_from_page(page));
 /* Drop the hugetlb destructor and restore a reference before freeing. */
 set_compound_page_dtor(page, NULL);
 set_page_refcounted(page);
 arch_release_hugepage(page);
 __free_pages(page, huge_page_order(h));
}
602
603struct hstate *size_to_hstate(unsigned long size)
604{
605 struct hstate *h;
606
607 for_each_hstate(h) {
608 if (huge_page_size(h) == size)
609 return h;
610 }
611 return NULL;
612}
613
/*
 * Compound page destructor installed on huge pages by prep_new_huge_page()
 * and alloc_buddy_huge_page(). The page must have zero reference and map
 * counts when this runs.
 */
static void free_huge_page(struct page *page)
{
 /* page_private carries the subpool the page was charged to, if any. */
 struct hstate *h = page_hstate(page);
 int nid = page_to_nid(page);
 struct hugepage_subpool *spool =
 (struct hugepage_subpool *)page_private(page);

 set_page_private(page, 0);
 page->mapping = NULL;
 BUG_ON(page_count(page));
 BUG_ON(page_mapcount(page));

 spin_lock(&hugetlb_lock);
 hugetlb_cgroup_uncharge_page(hstate_index(h),
 pages_per_huge_page(h), page);
 if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
 /*
  * The node holds surplus pages: unlink this page from the list it
  * is on and return it to the page allocator instead of the pool.
  */
 list_del(&page->lru);
 update_and_free_page(h, page);
 h->surplus_huge_pages--;
 h->surplus_huge_pages_node[nid]--;
 } else {
 /* Otherwise put it back on the node's free list. */
 enqueue_huge_page(h, page);
 }
 spin_unlock(&hugetlb_lock);
 /* Release the subpool charge last; a NULL spool is a no-op. */
 hugepage_subpool_put_pages(spool, 1);
}
645
/*
 * Turn a freshly allocated compound page into a pool huge page: install
 * the hugetlb destructor, clear its cgroup pointer and count it in the
 * hstate totals.
 */
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
{
 INIT_LIST_HEAD(&page->lru);
 set_compound_page_dtor(page, free_huge_page);
 spin_lock(&hugetlb_lock);
 set_hugetlb_cgroup(page, NULL);
 h->nr_huge_pages++;
 h->nr_huge_pages_node[nid]++;
 spin_unlock(&hugetlb_lock);
 /*
  * Drop our reference. NOTE(review): presumably this is the last
  * reference, so the destructor runs and files the page on a free list —
  * confirm.
  */
 put_page(page);
}
657
658static void prep_compound_gigantic_page(struct page *page, unsigned long order)
659{
660 int i;
661 int nr_pages = 1 << order;
662 struct page *p = page + 1;
663
664
665 set_compound_order(page, order);
666 __SetPageHead(page);
667 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
668 __SetPageTail(p);
669 set_page_count(p, 0);
670 p->first_page = page;
671 }
672}
673
674int PageHuge(struct page *page)
675{
676 compound_page_dtor *dtor;
677
678 if (!PageCompound(page))
679 return 0;
680
681 page = compound_head(page);
682 dtor = get_compound_page_dtor(page);
683
684 return dtor == free_huge_page;
685}
686EXPORT_SYMBOL_GPL(PageHuge);
687
688static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
689{
690 struct page *page;
691
692 if (h->order >= MAX_ORDER)
693 return NULL;
694
695 page = alloc_pages_exact_node(nid,
696 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
697 __GFP_REPEAT|__GFP_NOWARN,
698 huge_page_order(h));
699 if (page) {
700 if (arch_prepare_hugepage(page)) {
701 __free_pages(page, huge_page_order(h));
702 return NULL;
703 }
704 prep_new_huge_page(h, page, nid);
705 }
706
707 return page;
708}
709
710
711
712
713
714
715
716
717static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
718{
719 nid = next_node(nid, *nodes_allowed);
720 if (nid == MAX_NUMNODES)
721 nid = first_node(*nodes_allowed);
722 VM_BUG_ON(nid >= MAX_NUMNODES);
723
724 return nid;
725}
726
727static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
728{
729 if (!node_isset(nid, *nodes_allowed))
730 nid = next_node_allowed(nid, nodes_allowed);
731 return nid;
732}
733
734
735
736
737
738
739
740static int hstate_next_node_to_alloc(struct hstate *h,
741 nodemask_t *nodes_allowed)
742{
743 int nid;
744
745 VM_BUG_ON(!nodes_allowed);
746
747 nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
748 h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
749
750 return nid;
751}
752
753static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
754{
755 struct page *page;
756 int start_nid;
757 int next_nid;
758 int ret = 0;
759
760 start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
761 next_nid = start_nid;
762
763 do {
764 page = alloc_fresh_huge_page_node(h, next_nid);
765 if (page) {
766 ret = 1;
767 break;
768 }
769 next_nid = hstate_next_node_to_alloc(h, nodes_allowed);
770 } while (next_nid != start_nid);
771
772 if (ret)
773 count_vm_event(HTLB_BUDDY_PGALLOC);
774 else
775 count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
776
777 return ret;
778}
779
780
781
782
783
784
785
786static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
787{
788 int nid;
789
790 VM_BUG_ON(!nodes_allowed);
791
792 nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
793 h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
794
795 return nid;
796}
797
798
799
800
801
802
803
804static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
805 bool acct_surplus)
806{
807 int start_nid;
808 int next_nid;
809 int ret = 0;
810
811 start_nid = hstate_next_node_to_free(h, nodes_allowed);
812 next_nid = start_nid;
813
814 do {
815
816
817
818
819 if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) &&
820 !list_empty(&h->hugepage_freelists[next_nid])) {
821 struct page *page =
822 list_entry(h->hugepage_freelists[next_nid].next,
823 struct page, lru);
824 list_del(&page->lru);
825 h->free_huge_pages--;
826 h->free_huge_pages_node[next_nid]--;
827 if (acct_surplus) {
828 h->surplus_huge_pages--;
829 h->surplus_huge_pages_node[next_nid]--;
830 }
831 update_and_free_page(h, page);
832 ret = 1;
833 break;
834 }
835 next_nid = hstate_next_node_to_free(h, nodes_allowed);
836 } while (next_nid != start_nid);
837
838 return ret;
839}
840
/*
 * Allocate a surplus huge page straight from the page allocator, on node
 * nid or on any node when nid is NUMA_NO_NODE.
 *
 * Returns the page, or NULL if the hstate order is MAX_ORDER or above, the
 * overcommit limit is already reached, the allocation fails, or
 * arch_prepare_hugepage() rejects the page. The returned page has the
 * hugetlb destructor installed but is not placed on any pool list here.
 */
static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
{
 struct page *page;
 unsigned int r_nid;

 if (h->order >= MAX_ORDER)
 return NULL;

 /*
  * The global counters are raised under the lock before the lock is
  * dropped for the allocation, so the overcommit check above accounts
  * for allocations that are still in flight. They are rolled back below
  * if the allocation fails. The per-node counters can only be updated
  * once the page, and therefore its node, is known.
  */
 spin_lock(&hugetlb_lock);
 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
 spin_unlock(&hugetlb_lock);
 return NULL;
 } else {
 h->nr_huge_pages++;
 h->surplus_huge_pages++;
 }
 spin_unlock(&hugetlb_lock);

 if (nid == NUMA_NO_NODE)
 page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
 __GFP_REPEAT|__GFP_NOWARN,
 huge_page_order(h));
 else
 page = alloc_pages_exact_node(nid,
 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
 __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));

 if (page && arch_prepare_hugepage(page)) {
 __free_pages(page, huge_page_order(h));
 page = NULL;
 }

 spin_lock(&hugetlb_lock);
 if (page) {
 INIT_LIST_HEAD(&page->lru);
 r_nid = page_to_nid(page);
 set_compound_page_dtor(page, free_huge_page);
 set_hugetlb_cgroup(page, NULL);
 /* Global counters were already raised above; add the per-node ones. */
 h->nr_huge_pages_node[r_nid]++;
 h->surplus_huge_pages_node[r_nid]++;
 __count_vm_event(HTLB_BUDDY_PGALLOC);
 } else {
 /* Undo the speculative accounting. */
 h->nr_huge_pages--;
 h->surplus_huge_pages--;
 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
 }
 spin_unlock(&hugetlb_lock);

 return page;
}
917
918
919
920
921
922
923struct page *alloc_huge_page_node(struct hstate *h, int nid)
924{
925 struct page *page;
926
927 spin_lock(&hugetlb_lock);
928 page = dequeue_huge_page_node(h, nid);
929 spin_unlock(&hugetlb_lock);
930
931 if (!page)
932 page = alloc_buddy_huge_page(h, nid);
933
934 return page;
935}
936
937
938
939
940
/*
 * Reserve delta more huge pages, allocating surplus pages when the free
 * pool cannot back the enlarged reservation.
 *
 * Must be called with hugetlb_lock held: the lock is dropped around the
 * allocations and is held again on return. Returns 0 on success (with
 * resv_huge_pages raised by delta) or -ENOMEM if not enough pages could be
 * allocated.
 */
static int gather_surplus_pages(struct hstate *h, int delta)
{
 struct list_head surplus_list;
 struct page *page, *tmp;
 int ret, i;
 int needed, allocated;
 bool alloc_ok = true;

 /* Enough free pages already: just take the reservation. */
 needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
 if (needed <= 0) {
 h->resv_huge_pages += delta;
 return 0;
 }

 allocated = 0;
 INIT_LIST_HEAD(&surplus_list);

 ret = -ENOMEM;
retry:
 spin_unlock(&hugetlb_lock);
 for (i = 0; i < needed; i++) {
 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
 if (!page) {
 alloc_ok = false;
 break;
 }
 list_add(&page->lru, &surplus_list);
 }
 allocated += i;

 /*
  * The pool may have changed while the lock was dropped, so recompute
  * how many pages are still missing.
  */
 spin_lock(&hugetlb_lock);
 needed = (h->resv_huge_pages + delta) -
 (h->free_huge_pages + allocated);
 if (needed > 0) {
 if (alloc_ok)
 goto retry;
 /*
  * The last allocation attempt failed and we are still short:
  * give up and release what was gathered.
  */
 goto free;
 }

 /*
  * needed is now <= 0; adding allocated turns it into the number of
  * gathered pages that must actually go into the pool. Any excess stays
  * on surplus_list and is released below.
  */
 needed += allocated;
 h->resv_huge_pages += delta;
 ret = 0;

 /* Move the required pages onto the free lists. */
 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
 if ((--needed) < 0)
 break;
 /*
  * Drop the allocation reference by hand so the page can be
  * enqueued directly; it must reach zero here.
  */
 put_page_testzero(page);
 VM_BUG_ON(page_count(page));
 enqueue_huge_page(h, page);
 }
free:
 spin_unlock(&hugetlb_lock);

 /* Release the pages that were not needed, with the lock dropped. */
 if (!list_empty(&surplus_list)) {
 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
 put_page(page);
 }
 }
 spin_lock(&hugetlb_lock);

 return ret;
}
1025
1026
1027
1028
1029
1030
1031
1032static void return_unused_surplus_pages(struct hstate *h,
1033 unsigned long unused_resv_pages)
1034{
1035 unsigned long nr_pages;
1036
1037
1038 h->resv_huge_pages -= unused_resv_pages;
1039
1040
1041 if (h->order >= MAX_ORDER)
1042 return;
1043
1044 nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054 while (nr_pages--) {
1055 if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1))
1056 break;
1057 }
1058}
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070static long vma_needs_reservation(struct hstate *h,
1071 struct vm_area_struct *vma, unsigned long addr)
1072{
1073 struct address_space *mapping = vma->vm_file->f_mapping;
1074 struct inode *inode = mapping->host;
1075
1076 if (vma->vm_flags & VM_MAYSHARE) {
1077 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
1078 return region_chg(&inode->i_mapping->private_list,
1079 idx, idx + 1);
1080
1081 } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
1082 return 1;
1083
1084 } else {
1085 long err;
1086 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
1087 struct resv_map *reservations = vma_resv_map(vma);
1088
1089 err = region_chg(&reservations->regions, idx, idx + 1);
1090 if (err < 0)
1091 return err;
1092 return 0;
1093 }
1094}
1095static void vma_commit_reservation(struct hstate *h,
1096 struct vm_area_struct *vma, unsigned long addr)
1097{
1098 struct address_space *mapping = vma->vm_file->f_mapping;
1099 struct inode *inode = mapping->host;
1100
1101 if (vma->vm_flags & VM_MAYSHARE) {
1102 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
1103 region_add(&inode->i_mapping->private_list, idx, idx + 1);
1104
1105 } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
1106 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
1107 struct resv_map *reservations = vma_resv_map(vma);
1108
1109
1110 region_add(&reservations->regions, idx, idx + 1);
1111 }
1112}
1113
/*
 * Allocate a huge page for a fault at addr in vma.
 *
 * Charges the subpool (when a new reservation is needed) and the hugetlb
 * cgroup, then takes a page from the pool or, failing that, a surplus page
 * from the page allocator. On success the page's private field records the
 * subpool and the reservation is committed.
 *
 * Returns the page, ERR_PTR(-ENOMEM) if the reservation check fails, or
 * ERR_PTR(-ENOSPC) if the subpool, cgroup or page allocation fails.
 */
static struct page *alloc_huge_page(struct vm_area_struct *vma,
 unsigned long addr, int avoid_reserve)
{
 struct hugepage_subpool *spool = subpool_vma(vma);
 struct hstate *h = hstate_vma(vma);
 struct page *page;
 long chg;
 int ret, idx;
 struct hugetlb_cgroup *h_cg;

 idx = hstate_index(h);

 /*
  * chg > 0 means this page is not covered by an existing reservation,
  * so it has to be charged against the subpool limit first.
  */
 chg = vma_needs_reservation(h, vma, addr);
 if (chg < 0)
 return ERR_PTR(-ENOMEM);
 if (chg)
 if (hugepage_subpool_get_pages(spool, chg))
 return ERR_PTR(-ENOSPC);

 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
 if (ret) {
 hugepage_subpool_put_pages(spool, chg);
 return ERR_PTR(-ENOSPC);
 }
 spin_lock(&hugetlb_lock);
 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
 if (page) {
 /* Attach the cgroup charge to the page while still locked. */
 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
 h_cg, page);
 spin_unlock(&hugetlb_lock);
 } else {
 /* Pool is empty for us: try a surplus page with the lock dropped. */
 spin_unlock(&hugetlb_lock);
 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
 if (!page) {
 /* Unwind both charges taken above. */
 hugetlb_cgroup_uncharge_cgroup(idx,
 pages_per_huge_page(h),
 h_cg);
 hugepage_subpool_put_pages(spool, chg);
 return ERR_PTR(-ENOSPC);
 }
 spin_lock(&hugetlb_lock);
 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
 h_cg, page);
 /* A surplus page is on no pool list yet; put it on the active list. */
 list_move(&page->lru, &h->hugepage_activelist);
 spin_unlock(&hugetlb_lock);
 }

 /* free_huge_page() reads this back to release the subpool charge. */
 set_page_private(page, (unsigned long)spool);

 vma_commit_reservation(h, vma, addr);
 return page;
}
1174
/*
 * Default (weak) boot-time allocator for one huge page of hstate h.
 *
 * Tries each node in node_states[N_HIGH_MEMORY] once, in round-robin
 * order, asking the bootmem allocator for a block that is huge-page sized
 * and huge-page aligned. On success the block is queued on huge_boot_pages
 * and 1 is returned; 0 is returned if every node fails.
 */
int __weak alloc_bootmem_huge_page(struct hstate *h)
{
 struct huge_bootmem_page *m;
 int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);

 while (nr_nodes) {
 void *addr;

 addr = __alloc_bootmem_node_nopanic(
 NODE_DATA(hstate_next_node_to_alloc(h,
 &node_states[N_HIGH_MEMORY])),
 huge_page_size(h), huge_page_size(h), 0);

 if (addr) {
 /*
  * The start of the allocated block itself is used to hold the
  * huge_bootmem_page bookkeeping structure.
  */
 m = addr;
 goto found;
 }
 nr_nodes--;
 }
 return 0;

found:
 /* The block must be aligned to the huge page size. */
 BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1));

 list_add(&m->list, &huge_boot_pages);
 m->hstate = h;
 return 1;
}
1208
1209static void prep_compound_huge_page(struct page *page, int order)
1210{
1211 if (unlikely(order > (MAX_ORDER - 1)))
1212 prep_compound_gigantic_page(page, order);
1213 else
1214 prep_compound_page(page, order);
1215}
1216
1217
/*
 * Convert every block queued on huge_boot_pages by
 * alloc_bootmem_huge_page() into a pool huge page of its hstate.
 */
static void __init gather_bootmem_prealloc(void)
{
 struct huge_bootmem_page *m;

 list_for_each_entry(m, &huge_boot_pages, list) {
 struct hstate *h = m->hstate;
 struct page *page;

#ifdef CONFIG_HIGHMEM
 /*
  * NOTE(review): m is released here while list_for_each_entry() will
  * still read m->list.next to advance — confirm this is safe this
  * early in boot.
  */
 page = pfn_to_page(m->phys >> PAGE_SHIFT);
 free_bootmem_late((unsigned long)m,
 sizeof(struct huge_bootmem_page));
#else
 page = virt_to_page(m);
#endif
 __ClearPageReserved(page);
 WARN_ON(page_count(page) != 1);
 prep_compound_huge_page(page, h->order);
 prep_new_huge_page(h, page, page_to_nid(page));

 /*
  * Pages of order MAX_ORDER and above are added to totalram_pages
  * here. NOTE(review): presumably because they never pass through
  * the normal page-freeing path that would account for them — confirm.
  */
 if (h->order > (MAX_ORDER - 1))
 totalram_pages += 1 << h->order;
 }
}
1247
1248static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
1249{
1250 unsigned long i;
1251
1252 for (i = 0; i < h->max_huge_pages; ++i) {
1253 if (h->order >= MAX_ORDER) {
1254 if (!alloc_bootmem_huge_page(h))
1255 break;
1256 } else if (!alloc_fresh_huge_page(h,
1257 &node_states[N_HIGH_MEMORY]))
1258 break;
1259 }
1260 h->max_huge_pages = i;
1261}
1262
1263static void __init hugetlb_init_hstates(void)
1264{
1265 struct hstate *h;
1266
1267 for_each_hstate(h) {
1268
1269 if (h->order < MAX_ORDER)
1270 hugetlb_hstate_alloc_pages(h);
1271 }
1272}
1273
/*
 * Format the byte count n into buf as a whole number of GB, MB or KB,
 * choosing the largest unit that n reaches; the value is truncated, not
 * rounded. Returns buf. The caller must supply a buffer large enough for
 * the result.
 */
static char * __init memfmt(char *buf, unsigned long n)
{
	if (n < (1UL << 20)) {
		sprintf(buf, "%lu KB", n >> 10);
		return buf;
	}

	if (n < (1UL << 30)) {
		sprintf(buf, "%lu MB", n >> 20);
		return buf;
	}

	sprintf(buf, "%lu GB", n >> 30);
	return buf;
}
1284
1285static void __init report_hugepages(void)
1286{
1287 struct hstate *h;
1288
1289 for_each_hstate(h) {
1290 char buf[32];
1291 printk(KERN_INFO "HugeTLB registered %s page size, "
1292 "pre-allocated %ld pages\n",
1293 memfmt(buf, huge_page_size(h)),
1294 h->free_huge_pages);
1295 }
1296}
1297
1298#ifdef CONFIG_HIGHMEM
1299static void try_to_free_low(struct hstate *h, unsigned long count,
1300 nodemask_t *nodes_allowed)
1301{
1302 int i;
1303
1304 if (h->order >= MAX_ORDER)
1305 return;
1306
1307 for_each_node_mask(i, *nodes_allowed) {
1308 struct page *page, *next;
1309 struct list_head *freel = &h->hugepage_freelists[i];
1310 list_for_each_entry_safe(page, next, freel, lru) {
1311 if (count >= h->nr_huge_pages)
1312 return;
1313 if (PageHighMem(page))
1314 continue;
1315 list_del(&page->lru);
1316 update_and_free_page(h, page);
1317 h->free_huge_pages--;
1318 h->free_huge_pages_node[page_to_nid(page)]--;
1319 }
1320 }
1321}
1322#else
1323static inline void try_to_free_low(struct hstate *h, unsigned long count,
1324 nodemask_t *nodes_allowed)
1325{
1326}
1327#endif
1328
1329
1330
1331
1332
1333
/*
 * Move one page between the persistent and surplus accounting without
 * allocating or freeing anything: delta == -1 turns a surplus page into a
 * persistent one, delta == +1 does the opposite.
 *
 * Nodes are visited round-robin within nodes_allowed. Returns 1 if a node
 * was adjusted, 0 if no node qualified.
 */
static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
 int delta)
{
 int start_nid, next_nid;
 int ret = 0;

 VM_BUG_ON(delta != -1 && delta != 1);

 if (delta < 0)
 start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
 else
 start_nid = hstate_next_node_to_free(h, nodes_allowed);
 next_nid = start_nid;

 do {
 int nid = next_nid;
 if (delta < 0) {
 /* Lowering the surplus needs a node that currently has some. */
 if (!h->surplus_huge_pages_node[nid]) {
 next_nid = hstate_next_node_to_alloc(h,
 nodes_allowed);
 continue;
 }
 }
 if (delta > 0) {
 /* Raising the surplus needs a node with a non-surplus page left. */
 if (h->surplus_huge_pages_node[nid] >=
 h->nr_huge_pages_node[nid]) {
 next_nid = hstate_next_node_to_free(h,
 nodes_allowed);
 continue;
 }
 }

 h->surplus_huge_pages += delta;
 h->surplus_huge_pages_node[nid] += delta;
 ret = 1;
 break;
 } while (next_nid != start_nid);

 return ret;
}
1380
/* Pool pages that are not surplus. */
#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
/*
 * Resize the persistent pool of hstate h towards count pages, restricted
 * to nodes_allowed, and return the resulting persistent page count.
 * Hstates whose order is MAX_ORDER or above are not resized; their current
 * max_huge_pages is returned unchanged.
 */
static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
 nodemask_t *nodes_allowed)
{
 unsigned long min_count, ret;

 if (h->order >= MAX_ORDER)
 return h->max_huge_pages;

 /*
  * Growing, step 1: convert existing surplus pages into persistent
  * ones. This is pure accounting and allocates nothing.
  */
 spin_lock(&hugetlb_lock);
 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
 if (!adjust_pool_surplus(h, nodes_allowed, -1))
 break;
 }

 /* Growing, step 2: allocate new pages until the target is reached. */
 while (count > persistent_huge_pages(h)) {
 /* The lock is dropped around the allocation itself. */
 spin_unlock(&hugetlb_lock);
 ret = alloc_fresh_huge_page(h, nodes_allowed);
 spin_lock(&hugetlb_lock);
 if (!ret)
 goto out;

 /* Stop early if the calling task has a signal pending. */
 if (signal_pending(current))
 goto out;
 }

 /*
  * Shrinking: min_count is the number of pages that are in use or
  * reserved; the pool is never freed below max(count, min_count).
  * If count is lower than that, the remaining excess is only
  * re-labelled as surplus in the final loop rather than freed here.
  */
 min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
 min_count = max(count, min_count);
 try_to_free_low(h, min_count, nodes_allowed);
 while (min_count < persistent_huge_pages(h)) {
 if (!free_pool_huge_page(h, nodes_allowed, 0))
 break;
 }
 while (count < persistent_huge_pages(h)) {
 if (!adjust_pool_surplus(h, nodes_allowed, 1))
 break;
 }
out:
 ret = persistent_huge_pages(h);
 spin_unlock(&hugetlb_lock);
 return ret;
}
1455
/* Declare a read-only kobject attribute named <_name>_attr. */
#define HSTATE_ATTR_RO(_name) \
 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)

/*
 * Declare a mode-0644 kobject attribute named <_name>_attr, wired to
 * <_name>_show and <_name>_store.
 */
#define HSTATE_ATTR(_name) \
 static struct kobj_attribute _name##_attr = \
 __ATTR(_name, 0644, _name##_show, _name##_store)

/* hstate_kobjs[i] is the kobject paired with hstates[i] (see kobj_to_hstate()). */
static struct kobject *hugepages_kobj;
static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];

/* Defined outside this view; fallback lookup used by kobj_to_hstate(). */
static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
1467
1468static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
1469{
1470 int i;
1471
1472 for (i = 0; i < HUGE_MAX_HSTATE; i++)
1473 if (hstate_kobjs[i] == kobj) {
1474 if (nidp)
1475 *nidp = NUMA_NO_NODE;
1476 return &hstates[i];
1477 }
1478
1479 return kobj_to_node_hstate(kobj, nidp);
1480}
1481
1482static ssize_t nr_hugepages_show_common(struct kobject *kobj,
1483 struct kobj_attribute *attr, char *buf)
1484{
1485 struct hstate *h;
1486 unsigned long nr_huge_pages;
1487 int nid;
1488
1489 h = kobj_to_hstate(kobj, &nid);
1490 if (nid == NUMA_NO_NODE)
1491 nr_huge_pages = h->nr_huge_pages;
1492 else
1493 nr_huge_pages = h->nr_huge_pages_node[nid];
1494
1495 return sprintf(buf, "%lu\n", nr_huge_pages);
1496}
1497
/*
 * Shared sysfs "store" handler: parse a decimal page count from buf and
 * resize the pool of the hstate behind kobj.
 *
 * obey_mempolicy selects whether a global request is restricted to the
 * nodes of the calling task's memory policy. Returns len on success, the
 * parse error from strict_strtoul(), or -EINVAL for hstates whose order is
 * MAX_ORDER or above.
 */
static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
 struct kobject *kobj, struct kobj_attribute *attr,
 const char *buf, size_t len)
{
 int err;
 int nid;
 unsigned long count;
 struct hstate *h;
 /* May leave nodes_allowed NULL; the node-specific branch below checks for that. */
 NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);

 err = strict_strtoul(buf, 10, &count);
 if (err)
 goto out;

 h = kobj_to_hstate(kobj, &nid);
 if (h->order >= MAX_ORDER) {
 err = -EINVAL;
 goto out;
 }

 if (nid == NUMA_NO_NODE) {
 /*
  * Global request: use the mempolicy nodemask only when asked to and
  * when one could be built; otherwise fall back to every node in
  * node_states[N_HIGH_MEMORY].
  */
 if (!(obey_mempolicy &&
 init_nodemask_of_mempolicy(nodes_allowed))) {
 NODEMASK_FREE(nodes_allowed);
 nodes_allowed = &node_states[N_HIGH_MEMORY];
 }
 } else if (nodes_allowed) {
 /*
  * Per-node request: set_max_huge_pages() takes a global target, so
  * add the pages held by all other nodes to count and restrict the
  * operation to this node.
  */
 count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
 init_nodemask_of_node(nodes_allowed, nid);
 } else
 nodes_allowed = &node_states[N_HIGH_MEMORY];

 h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);

 /* Only free the mask if it is our own allocation, not the shared one. */
 if (nodes_allowed != &node_states[N_HIGH_MEMORY])
 NODEMASK_FREE(nodes_allowed);

 return len;
out:
 NODEMASK_FREE(nodes_allowed);
 return err;
}
1547
/* sysfs show handler for "nr_hugepages"; defers to the common helper. */
static ssize_t nr_hugepages_show(struct kobject *kobj,
 struct kobj_attribute *attr, char *buf)
{
 return nr_hugepages_show_common(kobj, attr, buf);
}
1553
/*
 * sysfs store handler for "nr_hugepages"; calls the common helper with
 * obey_mempolicy == false.
 */
static ssize_t nr_hugepages_store(struct kobject *kobj,
 struct kobj_attribute *attr, const char *buf, size_t len)
{
 return nr_hugepages_store_common(false, kobj, attr, buf, len);
}
HSTATE_ATTR(nr_hugepages);
1560
1561#ifdef CONFIG_NUMA
1562
1563
1564
1565
1566
/*
 * sysfs show handler for "nr_hugepages_mempolicy"; prints the same value as
 * nr_hugepages_show() because both use the common helper.
 */
static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
 struct kobj_attribute *attr, char *buf)
{
 return nr_hugepages_show_common(kobj, attr, buf);
}
1572
/*
 * sysfs store handler for "nr_hugepages_mempolicy"; calls the common helper
 * with obey_mempolicy == true.
 */
static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
 struct kobj_attribute *attr, const char *buf, size_t len)
{
 return nr_hugepages_store_common(true, kobj, attr, buf, len);
}
HSTATE_ATTR(nr_hugepages_mempolicy);
1579#endif
1580
1581
/* sysfs show handler: prints h->nr_overcommit_huge_pages for the hstate. */
static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
 struct kobj_attribute *attr, char *buf)
{
 struct hstate *h = kobj_to_hstate(kobj, NULL);
 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
}
1588
1589static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1590 struct kobj_attribute *attr, const char *buf, size_t count)
1591{
1592 int err;
1593 unsigned long input;
1594 struct hstate *h = kobj_to_hstate(kobj, NULL);
1595
1596 if (h->order >= MAX_ORDER)
1597 return -EINVAL;
1598
1599 err = strict_strtoul(buf, 10, &input);
1600 if (err)
1601 return err;
1602
1603 spin_lock(&hugetlb_lock);
1604 h->nr_overcommit_huge_pages = input;
1605 spin_unlock(&hugetlb_lock);
1606
1607 return count;
1608}
1609HSTATE_ATTR(nr_overcommit_hugepages);
1610
1611static ssize_t free_hugepages_show(struct kobject *kobj,
1612 struct kobj_attribute *attr, char *buf)
1613{
1614 struct hstate *h;
1615 unsigned long free_huge_pages;
1616 int nid;
1617
1618 h = kobj_to_hstate(kobj, &nid);
1619 if (nid == NUMA_NO_NODE)
1620 free_huge_pages = h->free_huge_pages;
1621 else
1622 free_huge_pages = h->free_huge_pages_node[nid];
1623
1624 return sprintf(buf, "%lu\n", free_huge_pages);
1625}
1626HSTATE_ATTR_RO(free_hugepages);
1627
/* sysfs show handler: prints h->resv_huge_pages for the hstate. */
static ssize_t resv_hugepages_show(struct kobject *kobj,
 struct kobj_attribute *attr, char *buf)
{
 struct hstate *h = kobj_to_hstate(kobj, NULL);
 return sprintf(buf, "%lu\n", h->resv_huge_pages);
}
HSTATE_ATTR_RO(resv_hugepages);
1635
1636static ssize_t surplus_hugepages_show(struct kobject *kobj,
1637 struct kobj_attribute *attr, char *buf)
1638{
1639 struct hstate *h;
1640 unsigned long surplus_huge_pages;
1641 int nid;
1642
1643 h = kobj_to_hstate(kobj, &nid);
1644 if (nid == NUMA_NO_NODE)
1645 surplus_huge_pages = h->surplus_huge_pages;
1646 else
1647 surplus_huge_pages = h->surplus_huge_pages_node[nid];
1648
1649 return sprintf(buf, "%lu\n", surplus_huge_pages);
1650}
1651HSTATE_ATTR_RO(surplus_hugepages);
1652
/*
 * Attributes attached to each global hstate directory; the mempolicy
 * variant is only present when CONFIG_NUMA is set.
 */
static struct attribute *hstate_attrs[] = {
 &nr_hugepages_attr.attr,
 &nr_overcommit_hugepages_attr.attr,
 &free_hugepages_attr.attr,
 &resv_hugepages_attr.attr,
 &surplus_hugepages_attr.attr,
#ifdef CONFIG_NUMA
 &nr_hugepages_mempolicy_attr.attr,
#endif
 NULL,
};

/* Attribute group passed to hugetlb_sysfs_add_hstate() for global hstates. */
static struct attribute_group hstate_attr_group = {
 .attrs = hstate_attrs,
};
1668
1669static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
1670 struct kobject **hstate_kobjs,
1671 struct attribute_group *hstate_attr_group)
1672{
1673 int retval;
1674 int hi = hstate_index(h);
1675
1676 hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
1677 if (!hstate_kobjs[hi])
1678 return -ENOMEM;
1679
1680 retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
1681 if (retval)
1682 kobject_put(hstate_kobjs[hi]);
1683
1684 return retval;
1685}
1686
1687static void __init hugetlb_sysfs_init(void)
1688{
1689 struct hstate *h;
1690 int err;
1691
1692 hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
1693 if (!hugepages_kobj)
1694 return;
1695
1696 for_each_hstate(h) {
1697 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
1698 hstate_kobjs, &hstate_attr_group);
1699 if (err)
1700 printk(KERN_ERR "Hugetlb: Unable to add hstate %s",
1701 h->name);
1702 }
1703}
1704
1705#ifdef CONFIG_NUMA
1706
1707
1708
1709
1710
1711
1712
1713
/*
 * Per-node sysfs bookkeeping: the node's "hugepages" kobject plus one
 * kobject slot per hstate.  node_hstates[] is indexed by node id
 * (node->dev.id in the register/unregister functions).
 */
struct node_hstate {
 struct kobject *hugepages_kobj;
 struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
};
struct node_hstate node_hstates[MAX_NUMNODES];
1719
1720
1721
1722
/*
 * Attributes exposed in the per-node hstate directories: a subset of the
 * global list (nr, free and surplus only).
 */
static struct attribute *per_node_hstate_attrs[] = {
 &nr_hugepages_attr.attr,
 &free_hugepages_attr.attr,
 &surplus_hugepages_attr.attr,
 NULL,
};

/* Attribute group passed to hugetlb_sysfs_add_hstate() for per-node hstates. */
static struct attribute_group per_node_hstate_attr_group = {
 .attrs = per_node_hstate_attrs,
};
1733
1734
1735
1736
1737
1738static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
1739{
1740 int nid;
1741
1742 for (nid = 0; nid < nr_node_ids; nid++) {
1743 struct node_hstate *nhs = &node_hstates[nid];
1744 int i;
1745 for (i = 0; i < HUGE_MAX_HSTATE; i++)
1746 if (nhs->hstate_kobjs[i] == kobj) {
1747 if (nidp)
1748 *nidp = nid;
1749 return &hstates[i];
1750 }
1751 }
1752
1753 BUG();
1754 return NULL;
1755}
1756
1757
1758
1759
1760
1761void hugetlb_unregister_node(struct node *node)
1762{
1763 struct hstate *h;
1764 struct node_hstate *nhs = &node_hstates[node->dev.id];
1765
1766 if (!nhs->hugepages_kobj)
1767 return;
1768
1769 for_each_hstate(h) {
1770 int idx = hstate_index(h);
1771 if (nhs->hstate_kobjs[idx]) {
1772 kobject_put(nhs->hstate_kobjs[idx]);
1773 nhs->hstate_kobjs[idx] = NULL;
1774 }
1775 }
1776
1777 kobject_put(nhs->hugepages_kobj);
1778 nhs->hugepages_kobj = NULL;
1779}
1780
1781
1782
1783
1784
/*
 * Tear down the per-node hugepages sysfs objects of every node.
 */
static void hugetlb_unregister_all_nodes(void)
{
 int nid;

 /*
  * Clear the register/unregister callbacks first by passing NULL for
  * both.
  */
 register_hugetlbfs_with_node(NULL, NULL);

 /*
  * Walk every node id below nr_node_ids; hugetlb_unregister_node() returns
  * early for nodes that have no hugepages kobject.
  */
 for (nid = 0; nid < nr_node_ids; nid++)
 hugetlb_unregister_node(&node_devices[nid]);
}
1800
1801
1802
1803
1804
1805void hugetlb_register_node(struct node *node)
1806{
1807 struct hstate *h;
1808 struct node_hstate *nhs = &node_hstates[node->dev.id];
1809 int err;
1810
1811 if (nhs->hugepages_kobj)
1812 return;
1813
1814 nhs->hugepages_kobj = kobject_create_and_add("hugepages",
1815 &node->dev.kobj);
1816 if (!nhs->hugepages_kobj)
1817 return;
1818
1819 for_each_hstate(h) {
1820 err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
1821 nhs->hstate_kobjs,
1822 &per_node_hstate_attr_group);
1823 if (err) {
1824 printk(KERN_ERR "Hugetlb: Unable to add hstate %s"
1825 " for node %d\n",
1826 h->name, node->dev.id);
1827 hugetlb_unregister_node(node);
1828 break;
1829 }
1830 }
1831}
1832
1833
1834
1835
1836
1837
/*
 * Register the per-node hugepages sysfs objects for every node in the
 * N_HIGH_MEMORY state, then install the register/unregister callbacks.
 */
static void hugetlb_register_all_nodes(void)
{
 int nid;

 for_each_node_state(nid, N_HIGH_MEMORY) {
 struct node *node = &node_devices[nid];
 /* Skip entries whose device id does not match the node id. */
 if (node->dev.id == nid)
 hugetlb_register_node(node);
 }

 /*
  * Hand both callbacks to register_hugetlbfs_with_node().
  * NOTE(review): presumably so nodes registered or removed later get the
  * same treatment -- confirm against the node driver.
  */
 register_hugetlbfs_with_node(hugetlb_register_node,
 hugetlb_unregister_node);
}
1855#else
1856
/*
 * !CONFIG_NUMA stub: there are no per-node kobjects, so reaching this
 * function is treated as a bug.
 */
static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
{
 BUG();
 if (nidp)
 *nidp = -1;
 return NULL;
}

/* !CONFIG_NUMA stub: nothing to unregister. */
static void hugetlb_unregister_all_nodes(void) { }

/* !CONFIG_NUMA stub: nothing to register. */
static void hugetlb_register_all_nodes(void) { }
1868
1869#endif
1870
1871static void __exit hugetlb_exit(void)
1872{
1873 struct hstate *h;
1874
1875 hugetlb_unregister_all_nodes();
1876
1877 for_each_hstate(h) {
1878 kobject_put(hstate_kobjs[hstate_index(h)]);
1879 }
1880
1881 kobject_put(hugepages_kobj);
1882}
1883module_exit(hugetlb_exit);
1884
/*
 * Module init: settle the default hstate, initialise all hstates and set up
 * the sysfs interfaces.  Always returns 0.
 */
static int __init hugetlb_init(void)
{
 /*
  * Nothing to do when HPAGE_SHIFT is 0.
  * NOTE(review): presumably this means the platform has no huge page
  * support -- confirm.
  */
 if (HPAGE_SHIFT == 0)
 return 0;

 /*
  * If no hstate matches the requested default size, fall back to
  * HPAGE_SIZE and create that hstate when it does not exist yet.
  */
 if (!size_to_hstate(default_hstate_size)) {
 default_hstate_size = HPAGE_SIZE;
 if (!size_to_hstate(default_hstate_size))
 hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
 }
 default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
 /* Apply a page count that was parsed before any hstate existed. */
 if (default_hstate_max_huge_pages)
 default_hstate.max_huge_pages = default_hstate_max_huge_pages;

 hugetlb_init_hstates();

 gather_bootmem_prealloc();

 report_hugepages();

 hugetlb_sysfs_init();

 hugetlb_register_all_nodes();

 return 0;
}
module_init(hugetlb_init);
1916
1917
/*
 * Register a new hstate for huge pages of the given @order.
 *
 * A size that is already registered is ignored with a warning.  Running out
 * of hstates[] slots or passing order 0 triggers BUG_ON().  The new hstate
 * becomes parsed_hstate, which hugetlb_nrpages_setup() uses as the target of
 * a following "hugepages=" option.
 */
void __init hugetlb_add_hstate(unsigned order)
{
 struct hstate *h;
 unsigned long i;

 if (size_to_hstate(PAGE_SIZE << order)) {
 printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
 return;
 }
 BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
 BUG_ON(order == 0);
 h = &hstates[hugetlb_max_hstate++];
 h->order = order;
 h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
 h->nr_huge_pages = 0;
 h->free_huge_pages = 0;
 for (i = 0; i < MAX_NUMNODES; ++i)
 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
 INIT_LIST_HEAD(&h->hugepage_activelist);
 h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]);
 h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]);
 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
 huge_page_size(h)/1024);

 /*
  * Only hstates of at least HUGETLB_CGROUP_MIN_ORDER get cgroup control
  * files; the index passed is that of the hstate just added.
  */
 if (order >= HUGETLB_CGROUP_MIN_ORDER)
 hugetlb_cgroup_file_init(hugetlb_max_hstate - 1);

 parsed_hstate = h;
}
1951
/*
 * Handler for the "hugepages=" boot option.  Always returns 1.
 */
static int __init hugetlb_nrpages_setup(char *s)
{
 unsigned long *mhp;
 static unsigned long *last_mhp;

 /*
  * With no hstate registered yet the count is stored for the default
  * hstate; otherwise it applies to the most recently added hstate.
  */
 if (!hugetlb_max_hstate)
 mhp = &default_hstate_max_huge_pages;
 else
 mhp = &parsed_hstate->max_huge_pages;

 /* The same target twice in a row means no hugepagesz= came in between. */
 if (mhp == last_mhp) {
 printk(KERN_WARNING "hugepages= specified twice without "
 "interleaving hugepagesz=, ignoring\n");
 return 1;
 }

 /* An unparsable value is treated as 0. */
 if (sscanf(s, "%lu", mhp) <= 0)
 *mhp = 0;

 /*
  * hstates whose order is >= MAX_ORDER have their pages allocated right
  * here rather than later during init.
  */
 if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
 hugetlb_hstate_alloc_pages(parsed_hstate);

 last_mhp = mhp;

 return 1;
}
__setup("hugepages=", hugetlb_nrpages_setup);
1988
/*
 * Handler for the "default_hugepagesz=" boot option: stores the size parsed
 * by memparse() in default_hstate_size.  Always returns 1.
 */
static int __init hugetlb_default_setup(char *s)
{
 default_hstate_size = memparse(s, &s);
 return 1;
}
__setup("default_hugepagesz=", hugetlb_default_setup);
1995
1996static unsigned int cpuset_mems_nr(unsigned int *array)
1997{
1998 int node;
1999 unsigned int nr = 0;
2000
2001 for_each_node_mask(node, cpuset_current_mems_allowed)
2002 nr += array[node];
2003
2004 return nr;
2005}
2006
2007#ifdef CONFIG_SYSCTL
2008static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
2009 struct ctl_table *table, int write,
2010 void __user *buffer, size_t *length, loff_t *ppos)
2011{
2012 struct hstate *h = &default_hstate;
2013 unsigned long tmp;
2014 int ret;
2015
2016 tmp = h->max_huge_pages;
2017
2018 if (write && h->order >= MAX_ORDER)
2019 return -EINVAL;
2020
2021 table->data = &tmp;
2022 table->maxlen = sizeof(unsigned long);
2023 ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
2024 if (ret)
2025 goto out;
2026
2027 if (write) {
2028 NODEMASK_ALLOC(nodemask_t, nodes_allowed,
2029 GFP_KERNEL | __GFP_NORETRY);
2030 if (!(obey_mempolicy &&
2031 init_nodemask_of_mempolicy(nodes_allowed))) {
2032 NODEMASK_FREE(nodes_allowed);
2033 nodes_allowed = &node_states[N_HIGH_MEMORY];
2034 }
2035 h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed);
2036
2037 if (nodes_allowed != &node_states[N_HIGH_MEMORY])
2038 NODEMASK_FREE(nodes_allowed);
2039 }
2040out:
2041 return ret;
2042}
2043
/* sysctl handler that calls the common helper with obey_mempolicy == false. */
int hugetlb_sysctl_handler(struct ctl_table *table, int write,
 void __user *buffer, size_t *length, loff_t *ppos)
{

 return hugetlb_sysctl_handler_common(false, table, write,
 buffer, length, ppos);
}
2051
2052#ifdef CONFIG_NUMA
/* sysctl handler that calls the common helper with obey_mempolicy == true. */
int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
 void __user *buffer, size_t *length, loff_t *ppos)
{
 return hugetlb_sysctl_handler_common(true, table, write,
 buffer, length, ppos);
}
2059#endif
2060
2061int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
2062 void __user *buffer,
2063 size_t *length, loff_t *ppos)
2064{
2065 proc_dointvec(table, write, buffer, length, ppos);
2066 if (hugepages_treat_as_movable)
2067 htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
2068 else
2069 htlb_alloc_mask = GFP_HIGHUSER;
2070 return 0;
2071}
2072
2073int hugetlb_overcommit_handler(struct ctl_table *table, int write,
2074 void __user *buffer,
2075 size_t *length, loff_t *ppos)
2076{
2077 struct hstate *h = &default_hstate;
2078 unsigned long tmp;
2079 int ret;
2080
2081 tmp = h->nr_overcommit_huge_pages;
2082
2083 if (write && h->order >= MAX_ORDER)
2084 return -EINVAL;
2085
2086 table->data = &tmp;
2087 table->maxlen = sizeof(unsigned long);
2088 ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
2089 if (ret)
2090 goto out;
2091
2092 if (write) {
2093 spin_lock(&hugetlb_lock);
2094 h->nr_overcommit_huge_pages = tmp;
2095 spin_unlock(&hugetlb_lock);
2096 }
2097out:
2098 return ret;
2099}
2100
2101#endif
2102
/*
 * Print the huge page counters of the default hstate to @m: total, free,
 * reserved and surplus pages, plus the huge page size in kB (computed as
 * 1 << (order + PAGE_SHIFT - 10)).
 */
void hugetlb_report_meminfo(struct seq_file *m)
{
 struct hstate *h = &default_hstate;
 seq_printf(m,
 "HugePages_Total: %5lu\n"
 "HugePages_Free: %5lu\n"
 "HugePages_Rsvd: %5lu\n"
 "HugePages_Surp: %5lu\n"
 "Hugepagesize: %8lu kB\n",
 h->nr_huge_pages,
 h->free_huge_pages,
 h->resv_huge_pages,
 h->surplus_huge_pages,
 1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
}
2118
/*
 * Format the per-node total, free and surplus huge page counters of the
 * default hstate for node @nid into @buf.  Returns the sprintf() result.
 */
int hugetlb_report_node_meminfo(int nid, char *buf)
{
 struct hstate *h = &default_hstate;
 return sprintf(buf,
 "Node %d HugePages_Total: %5u\n"
 "Node %d HugePages_Free: %5u\n"
 "Node %d HugePages_Surp: %5u\n",
 nid, h->nr_huge_pages_node[nid],
 nid, h->free_huge_pages_node[nid],
 nid, h->surplus_huge_pages_node[nid]);
}
2130
2131
2132unsigned long hugetlb_total_pages(void)
2133{
2134 struct hstate *h = &default_hstate;
2135 return h->nr_huge_pages * pages_per_huge_page(h);
2136}
2137
/*
 * Adjust the huge page accounting of @h by @delta pages, all under
 * hugetlb_lock.
 *
 * delta > 0: gather_surplus_pages() must succeed and @delta must not exceed
 * the free huge pages on the nodes counted by cpuset_mems_nr(); otherwise
 * -ENOMEM is returned (in the second case after handing the pages back via
 * return_unused_surplus_pages()).
 * delta < 0: -delta pages are passed to return_unused_surplus_pages().
 *
 * Returns 0 on success or -ENOMEM.
 */
static int hugetlb_acct_memory(struct hstate *h, long delta)
{
 int ret = -ENOMEM;

 spin_lock(&hugetlb_lock);

 if (delta > 0) {
 if (gather_surplus_pages(h, delta) < 0)
 goto out;

 /* Not enough free pages on the allowed nodes: undo and fail. */
 if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
 return_unused_surplus_pages(h, delta);
 goto out;
 }
 }

 ret = 0;
 if (delta < 0)
 return_unused_surplus_pages(h, (unsigned long) -delta);

out:
 spin_unlock(&hugetlb_lock);
 return ret;
}
2178
2179static void hugetlb_vm_op_open(struct vm_area_struct *vma)
2180{
2181 struct resv_map *reservations = vma_resv_map(vma);
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191 if (reservations)
2192 kref_get(&reservations->refs);
2193}
2194
2195static void resv_map_put(struct vm_area_struct *vma)
2196{
2197 struct resv_map *reservations = vma_resv_map(vma);
2198
2199 if (!reservations)
2200 return;
2201 kref_put(&reservations->refs, resv_map_release);
2202}
2203
2204static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2205{
2206 struct hstate *h = hstate_vma(vma);
2207 struct resv_map *reservations = vma_resv_map(vma);
2208 struct hugepage_subpool *spool = subpool_vma(vma);
2209 unsigned long reserve;
2210 unsigned long start;
2211 unsigned long end;
2212
2213 if (reservations) {
2214 start = vma_hugecache_offset(h, vma, vma->vm_start);
2215 end = vma_hugecache_offset(h, vma, vma->vm_end);
2216
2217 reserve = (end - start) -
2218 region_count(&reservations->regions, start, end);
2219
2220 resv_map_put(vma);
2221
2222 if (reserve) {
2223 hugetlb_acct_memory(h, -reserve);
2224 hugepage_subpool_put_pages(spool, reserve);
2225 }
2226 }
2227}
2228
2229
2230
2231
2232
2233
2234
/*
 * vm_ops->fault placeholder: this handler is not expected to run, so it
 * calls BUG().
 * NOTE(review): presumably hugetlb faults are routed to hugetlb_fault()
 * before the generic fault path gets here -- confirm against the caller.
 */
static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
 BUG();
 return 0;
}
2240
/* VMA operations table wiring up the fault, open and close handlers above. */
const struct vm_operations_struct hugetlb_vm_ops = {
 .fault = hugetlb_vm_op_fault,
 .open = hugetlb_vm_op_open,
 .close = hugetlb_vm_op_close,
};
2246
2247static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
2248 int writable)
2249{
2250 pte_t entry;
2251
2252 if (writable) {
2253 entry =
2254 pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
2255 } else {
2256 entry = huge_pte_wrprotect(mk_pte(page, vma->vm_page_prot));
2257 }
2258 entry = pte_mkyoung(entry);
2259 entry = pte_mkhuge(entry);
2260 entry = arch_make_huge_pte(entry, vma, page, writable);
2261
2262 return entry;
2263}
2264
2265static void set_huge_ptep_writable(struct vm_area_struct *vma,
2266 unsigned long address, pte_t *ptep)
2267{
2268 pte_t entry;
2269
2270 entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep)));
2271 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
2272 update_mmu_cache(vma, address, ptep);
2273}
2274
2275
/*
 * Copy the huge page mappings of @vma from mm @src to mm @dst, one huge page
 * at a time.
 *
 * For a private writable mapping (VM_MAYWRITE without VM_SHARED) the source
 * entry is write-protected before it is copied.  Each copied page gains a
 * reference and an rmap duplicate.
 *
 * Returns 0 on success or -ENOMEM if a destination PTE cannot be allocated.
 */
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 struct vm_area_struct *vma)
{
 pte_t *src_pte, *dst_pte, entry;
 struct page *ptepage;
 unsigned long addr;
 int cow;
 struct hstate *h = hstate_vma(vma);
 unsigned long sz = huge_page_size(h);

 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
 src_pte = huge_pte_offset(src, addr);
 if (!src_pte)
 continue;
 dst_pte = huge_pte_alloc(dst, addr, sz);
 if (!dst_pte)
 goto nomem;

 /* Source and destination resolve to the same PTE: nothing to copy. */
 if (dst_pte == src_pte)
 continue;

 /* Destination lock first, then the source lock as a nested lock. */
 spin_lock(&dst->page_table_lock);
 spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING);
 if (!huge_pte_none(huge_ptep_get(src_pte))) {
 if (cow)
 huge_ptep_set_wrprotect(src, addr, src_pte);
 entry = huge_ptep_get(src_pte);
 ptepage = pte_page(entry);
 get_page(ptepage);
 page_dup_rmap(ptepage);
 set_huge_pte_at(dst, addr, dst_pte, entry);
 }
 spin_unlock(&src->page_table_lock);
 spin_unlock(&dst->page_table_lock);
 }
 return 0;

nomem:
 return -ENOMEM;
}
2319
2320static int is_hugetlb_entry_migration(pte_t pte)
2321{
2322 swp_entry_t swp;
2323
2324 if (huge_pte_none(pte) || pte_present(pte))
2325 return 0;
2326 swp = pte_to_swp_entry(pte);
2327 if (non_swap_entry(swp) && is_migration_entry(swp))
2328 return 1;
2329 else
2330 return 0;
2331}
2332
2333static int is_hugetlb_entry_hwpoisoned(pte_t pte)
2334{
2335 swp_entry_t swp;
2336
2337 if (huge_pte_none(pte) || pte_present(pte))
2338 return 0;
2339 swp = pte_to_swp_entry(pte);
2340 if (non_swap_entry(swp) && is_hwpoison_entry(swp))
2341 return 1;
2342 else
2343 return 0;
2344}
2345
/*
 * Unmap the huge pages of @vma in [@start, @end), gathering them in @tlb.
 *
 * @start and @end must be huge page aligned (BUG_ON otherwise).  When
 * @ref_page is non-NULL only the PTE mapping that page is removed and the
 * VMA is flagged HPAGE_RESV_UNMAPPED.
 *
 * The walk runs under mm->page_table_lock.  When __tlb_remove_page() reports
 * that the gather cannot take more pages, the lock is dropped, the TLB is
 * flushed and the walk restarts at the "again" label.
 */
void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 unsigned long start, unsigned long end,
 struct page *ref_page)
{
 int force_flush = 0;
 struct mm_struct *mm = vma->vm_mm;
 unsigned long address;
 pte_t *ptep;
 pte_t pte;
 struct page *page;
 struct hstate *h = hstate_vma(vma);
 unsigned long sz = huge_page_size(h);

 WARN_ON(!is_vm_hugetlb_page(vma));
 BUG_ON(start & ~huge_page_mask(h));
 BUG_ON(end & ~huge_page_mask(h));

 tlb_start_vma(tlb, vma);
 mmu_notifier_invalidate_range_start(mm, start, end);
again:
 spin_lock(&mm->page_table_lock);
 for (address = start; address < end; address += sz) {
 ptep = huge_pte_offset(mm, address);
 if (!ptep)
 continue;

 /* huge_pmd_unshare() receives &address and may modify it. */
 if (huge_pmd_unshare(mm, &address, ptep))
 continue;

 pte = huge_ptep_get(ptep);
 if (huge_pte_none(pte))
 continue;

 /* hwpoison entries have no page to unmap here; skip them. */
 if (unlikely(is_hugetlb_entry_hwpoisoned(pte)))
 continue;

 page = pte_page(pte);

 /*
  * With a reference page given, leave every other page mapped and
  * record on the VMA that a page was unmapped from it.
  */
 if (ref_page) {
 if (page != ref_page)
 continue;

 set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
 }

 pte = huge_ptep_get_and_clear(mm, address, ptep);
 tlb_remove_tlb_entry(tlb, ptep, address);
 if (pte_dirty(pte))
 set_page_dirty(page);

 page_remove_rmap(page);
 force_flush = !__tlb_remove_page(tlb, page);
 if (force_flush)
 break;

 /* The single reference page has been handled; stop the walk. */
 if (ref_page)
 break;
 }
 spin_unlock(&mm->page_table_lock);

 /*
  * A forced flush happens outside the page table lock; afterwards the
  * walk restarts unless the range is finished or only the reference
  * page was wanted.
  */
 if (force_flush) {
 force_flush = 0;
 tlb_flush_mmu(tlb);
 if (address < end && !ref_page)
 goto again;
 }
 mmu_notifier_invalidate_range_end(mm, start, end);
 tlb_end_vma(tlb, vma);
}
2431
/*
 * Unmap [@start, @end) of @vma like __unmap_hugepage_range() and afterwards
 * clear VM_MAYSHARE on the VMA.
 */
void __unmap_hugepage_range_final(struct mmu_gather *tlb,
 struct vm_area_struct *vma, unsigned long start,
 unsigned long end, struct page *ref_page)
{
 __unmap_hugepage_range(tlb, vma, start, end, ref_page);

 /*
  * NOTE(review): the reason for clearing VM_MAYSHARE is not visible in
  * this block; presumably it stops later code from treating this VMA as
  * a candidate for shared page tables -- confirm before relying on it.
  */
 vma->vm_flags &= ~VM_MAYSHARE;
}
2450
2451void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2452 unsigned long end, struct page *ref_page)
2453{
2454 struct mm_struct *mm;
2455 struct mmu_gather tlb;
2456
2457 mm = vma->vm_mm;
2458
2459 tlb_gather_mmu(&tlb, mm, 0);
2460 __unmap_hugepage_range(&tlb, vma, start, end, ref_page);
2461 tlb_finish_mmu(&tlb, start, end);
2462}
2463
2464
2465
2466
2467
2468
2469
/*
 * Unmap @page from every other VMA that maps the same file offset and does
 * not have HPAGE_RESV_OWNER set.
 *
 * @vma itself is skipped.  The walk over the mapping's prio tree runs under
 * mapping->i_mmap_mutex.  Always returns 1.
 */
static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 struct page *page, unsigned long address)
{
 struct hstate *h = hstate_vma(vma);
 struct vm_area_struct *iter_vma;
 struct address_space *mapping;
 struct prio_tree_iter iter;
 pgoff_t pgoff;

 /*
  * Round the address down to a huge page boundary and convert it to a
  * file offset in base-page units for the prio tree lookup.
  */
 address = address & huge_page_mask(h);
 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
 vma->vm_pgoff;
 mapping = vma->vm_file->f_dentry->d_inode->i_mapping;

 mutex_lock(&mapping->i_mmap_mutex);
 vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
 /* Do not unmap the current VMA. */
 if (iter_vma == vma)
 continue;

 /*
  * Only VMAs without HPAGE_RESV_OWNER lose their mapping of the page;
  * passing @page as ref_page limits the unmap to that page.
  */
 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
 unmap_hugepage_range(iter_vma, address,
 address + huge_page_size(h), page);
 }
 mutex_unlock(&mapping->i_mmap_mutex);

 return 1;
}
2514
2515
2516
2517
2518
2519
2520
/*
 * Resolve a write fault on a huge page that is mapped read-only.
 *
 * If the old page has a single mapping the existing PTE is simply made
 * writable.  Otherwise a new huge page is allocated, the old contents are
 * copied and the PTE is replaced, provided it still equals @pte once the
 * lock has been re-taken.
 *
 * Both callers in this file (hugetlb_no_page() and hugetlb_fault()) hold
 * mm->page_table_lock when calling.  The lock is dropped around the
 * allocation and the copy, and is held again on every return path.
 *
 * Returns 0, VM_FAULT_OOM or VM_FAULT_SIGBUS.
 */
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 unsigned long address, pte_t *ptep, pte_t pte,
 struct page *pagecache_page)
{
 struct hstate *h = hstate_vma(vma);
 struct page *old_page, *new_page;
 int avoidcopy;
 int outside_reserve = 0;

 old_page = pte_page(pte);

retry_avoidcopy:
 /* A page with exactly one mapping can be reused without copying. */
 avoidcopy = (page_mapcount(old_page) == 1);
 if (avoidcopy) {
 if (PageAnon(old_page))
 page_move_anon_rmap(old_page, vma, address);
 set_huge_ptep_writable(vma, address, ptep);
 return 0;
 }

 /*
  * For a private mapping whose VMA has HPAGE_RESV_OWNER set, and where the
  * old page is not the page cache page, the allocation below is made with
  * outside_reserve set.
  */
 if (!(vma->vm_flags & VM_MAYSHARE) &&
 is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
 old_page != pagecache_page)
 outside_reserve = 1;

 page_cache_get(old_page);

 /* Drop the page table lock for the allocation. */
 spin_unlock(&mm->page_table_lock);
 new_page = alloc_huge_page(vma, address, outside_reserve);

 if (IS_ERR(new_page)) {
 long err = PTR_ERR(new_page);
 page_cache_release(old_page);

 /*
  * For an outside_reserve allocation, unmap the page from the other
  * VMAs via unmap_ref_private() and retry; the retry then goes through
  * the mapcount check again.
  */
 if (outside_reserve) {
 BUG_ON(huge_pte_none(pte));
 if (unmap_ref_private(mm, vma, old_page, address)) {
 BUG_ON(huge_pte_none(pte));
 spin_lock(&mm->page_table_lock);
 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
 if (likely(pte_same(huge_ptep_get(ptep), pte)))
 goto retry_avoidcopy;
 /*
  * The PTE changed while the lock was dropped; return 0 with the
  * lock held and without touching the PTE.
  */
 return 0;
 }
 WARN_ON_ONCE(1);
 }

 /* Re-take the lock before returning the fault code. */
 spin_lock(&mm->page_table_lock);
 if (err == -ENOMEM)
 return VM_FAULT_OOM;
 else
 return VM_FAULT_SIGBUS;
 }

 /*
  * The new page is added as an anonymous mapping below, so the VMA needs
  * an anon_vma first.
  */
 if (unlikely(anon_vma_prepare(vma))) {
 page_cache_release(new_page);
 page_cache_release(old_page);
 /* Re-take the lock before returning. */
 spin_lock(&mm->page_table_lock);
 return VM_FAULT_OOM;
 }

 copy_user_huge_page(new_page, old_page, address, vma,
 pages_per_huge_page(h));
 __SetPageUptodate(new_page);

 /*
  * Re-take the lock and look the PTE up again; it is only replaced if it
  * still equals the value this function was called with.
  */
 spin_lock(&mm->page_table_lock);
 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
 if (likely(pte_same(huge_ptep_get(ptep), pte))) {
 /* Break COW: swap the old mapping for the new page. */
 mmu_notifier_invalidate_range_start(mm,
 address & huge_page_mask(h),
 (address & huge_page_mask(h)) + huge_page_size(h));
 huge_ptep_clear_flush(vma, address, ptep);
 set_huge_pte_at(mm, address, ptep,
 make_huge_pte(vma, new_page, 1));
 page_remove_rmap(old_page);
 hugepage_add_new_anon_rmap(new_page, vma, address);
 /* The release below must now drop the old page, not the new one. */
 new_page = old_page;
 mmu_notifier_invalidate_range_end(mm,
 address & huge_page_mask(h),
 (address & huge_page_mask(h)) + huge_page_size(h));
 }
 page_cache_release(new_page);
 page_cache_release(old_page);
 return 0;
}
2641
2642
2643static struct page *hugetlbfs_pagecache_page(struct hstate *h,
2644 struct vm_area_struct *vma, unsigned long address)
2645{
2646 struct address_space *mapping;
2647 pgoff_t idx;
2648
2649 mapping = vma->vm_file->f_mapping;
2650 idx = vma_hugecache_offset(h, vma, address);
2651
2652 return find_lock_page(mapping, idx);
2653}
2654
2655
2656
2657
2658
2659static bool hugetlbfs_pagecache_present(struct hstate *h,
2660 struct vm_area_struct *vma, unsigned long address)
2661{
2662 struct address_space *mapping;
2663 pgoff_t idx;
2664 struct page *page;
2665
2666 mapping = vma->vm_file->f_mapping;
2667 idx = vma_hugecache_offset(h, vma, address);
2668
2669 page = find_get_page(mapping, idx);
2670 if (page)
2671 put_page(page);
2672 return page != NULL;
2673}
2674
/*
 * Handle a fault on a huge PTE that is currently empty.
 *
 * Looks the page up in the page cache; if it is missing, a new huge page is
 * allocated and zeroed.  For VM_MAYSHARE mappings it is added to the page
 * cache, otherwise it is set up as an anonymous page.  The page is then
 * mapped, and for a write fault on a non-VM_SHARED mapping hugetlb_cow() is
 * called straight away.
 *
 * Returns 0 or a VM_FAULT_* code.
 */
static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 unsigned long address, pte_t *ptep, unsigned int flags)
{
 struct hstate *h = hstate_vma(vma);
 int ret = VM_FAULT_SIGBUS;
 int anon_rmap = 0;
 pgoff_t idx;
 unsigned long size;
 struct page *page;
 struct address_space *mapping;
 pte_t new_pte;

 /*
  * A VMA flagged HPAGE_RESV_UNMAPPED (set in __unmap_hugepage_range() when
  * a reference page is unmapped) is not refaulted: the fault fails with
  * SIGBUS and a warning is logged.
  */
 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
 printk(KERN_WARNING
 "PID %d killed due to inadequate hugepage pool\n",
 current->pid);
 return ret;
 }

 mapping = vma->vm_file->f_mapping;
 idx = vma_hugecache_offset(h, vma, address);

 /*
  * The file size is checked here before allocating and again below under
  * the page table lock.
  */
retry:
 page = find_lock_page(mapping, idx);
 if (!page) {
 size = i_size_read(mapping->host) >> huge_page_shift(h);
 if (idx >= size)
 goto out;
 page = alloc_huge_page(vma, address, 0);
 if (IS_ERR(page)) {
 ret = PTR_ERR(page);
 if (ret == -ENOMEM)
 ret = VM_FAULT_OOM;
 else
 ret = VM_FAULT_SIGBUS;
 goto out;
 }
 clear_huge_page(page, address, pages_per_huge_page(h));
 __SetPageUptodate(page);

 if (vma->vm_flags & VM_MAYSHARE) {
 int err;
 struct inode *inode = mapping->host;

 err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
 if (err) {
 put_page(page);
 /* -EEXIST: the slot was filled in the meantime; look it up again. */
 if (err == -EEXIST)
 goto retry;
 goto out;
 }

 spin_lock(&inode->i_lock);
 inode->i_blocks += blocks_per_huge_page(h);
 spin_unlock(&inode->i_lock);
 } else {
 lock_page(page);
 if (unlikely(anon_vma_prepare(vma))) {
 ret = VM_FAULT_OOM;
 goto backout_unlocked;
 }
 anon_rmap = 1;
 }
 } else {
 /*
  * A page found in the page cache that is marked hwpoisoned must not be
  * mapped; fail the fault with the hwpoison code for this hstate.
  */
 if (unlikely(PageHWPoison(page))) {
 ret = VM_FAULT_HWPOISON |
 VM_FAULT_SET_HINDEX(hstate_index(h));
 goto backout_unlocked;
 }
 }

 /*
  * A write fault on a non-VM_SHARED mapping ends in hugetlb_cow() below,
  * which is called with the page table lock held, so the reservation
  * check (which can fail) is done here before taking that lock.
  */
 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
 if (vma_needs_reservation(h, vma, address) < 0) {
 ret = VM_FAULT_OOM;
 goto backout_unlocked;
 }

 spin_lock(&mm->page_table_lock);
 size = i_size_read(mapping->host) >> huge_page_shift(h);
 if (idx >= size)
 goto backout;

 ret = 0;
 /* Someone else filled the PTE in the meantime: back out with ret == 0. */
 if (!huge_pte_none(huge_ptep_get(ptep)))
 goto backout;

 if (anon_rmap)
 hugepage_add_new_anon_rmap(page, vma, address);
 else
 page_dup_rmap(page);
 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
 && (vma->vm_flags & VM_SHARED)));
 set_huge_pte_at(mm, address, ptep, new_pte);

 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
 /* The PTE was installed read-only above; break COW right away. */
 ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
 }

 spin_unlock(&mm->page_table_lock);
 unlock_page(page);
out:
 return ret;

backout:
 spin_unlock(&mm->page_table_lock);
backout_unlocked:
 unlock_page(page);
 put_page(page);
 goto out;
}
2806
/*
 * Top-level fault handler for hugetlb mappings.
 *
 * Migration entries are waited on and hwpoison entries fail the fault.  An
 * empty PTE is passed to hugetlb_no_page().  A write fault on a read-only
 * PTE is passed to hugetlb_cow().  Otherwise the PTE's young/dirty bits are
 * updated.
 *
 * Returns 0 or a VM_FAULT_* code.
 */
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 unsigned long address, unsigned int flags)
{
 pte_t *ptep;
 pte_t entry;
 int ret;
 struct page *page = NULL;
 struct page *pagecache_page = NULL;
 static DEFINE_MUTEX(hugetlb_instantiation_mutex);
 struct hstate *h = hstate_vma(vma);

 address &= huge_page_mask(h);

 ptep = huge_pte_offset(mm, address);
 if (ptep) {
 entry = huge_ptep_get(ptep);
 if (unlikely(is_hugetlb_entry_migration(entry))) {
 migration_entry_wait(mm, (pmd_t *)ptep, address);
 return 0;
 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
 return VM_FAULT_HWPOISON_LARGE |
 VM_FAULT_SET_HINDEX(hstate_index(h));
 }

 ptep = huge_pte_alloc(mm, address, huge_page_size(h));
 if (!ptep)
 return VM_FAULT_OOM;

 /*
  * Everything from here on runs under one function-static mutex, so
  * hugetlb faults are handled one at a time.
  */
 mutex_lock(&hugetlb_instantiation_mutex);
 entry = huge_ptep_get(ptep);
 if (huge_pte_none(entry)) {
 ret = hugetlb_no_page(mm, vma, address, ptep, flags);
 goto out_mutex;
 }

 ret = 0;

 /*
  * For a write fault on a read-only PTE, do the reservation check (which
  * can fail) and the page cache lookup before taking the page table lock.
  * The page cache page is only looked up for mappings without VM_MAYSHARE.
  */
 if ((flags & FAULT_FLAG_WRITE) && !pte_write(entry)) {
 if (vma_needs_reservation(h, vma, address) < 0) {
 ret = VM_FAULT_OOM;
 goto out_mutex;
 }

 if (!(vma->vm_flags & VM_MAYSHARE))
 pagecache_page = hugetlbfs_pagecache_page(h,
 vma, address);
 }

 /*
  * Pin and lock the mapped page.  hugetlbfs_pagecache_page() returns its
  * page from find_lock_page(), so when both are the same page it is not
  * locked a second time.
  */
 page = pte_page(entry);
 get_page(page);
 if (page != pagecache_page)
 lock_page(page);

 spin_lock(&mm->page_table_lock);
 /* Bail out if the PTE changed since it was read under the mutex. */
 if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
 goto out_page_table_lock;


 if (flags & FAULT_FLAG_WRITE) {
 if (!pte_write(entry)) {
 ret = hugetlb_cow(mm, vma, address, ptep, entry,
 pagecache_page);
 goto out_page_table_lock;
 }
 entry = pte_mkdirty(entry);
 }
 entry = pte_mkyoung(entry);
 if (huge_ptep_set_access_flags(vma, address, ptep, entry,
 flags & FAULT_FLAG_WRITE))
 update_mmu_cache(vma, address, ptep);

out_page_table_lock:
 spin_unlock(&mm->page_table_lock);

 if (pagecache_page) {
 unlock_page(pagecache_page);
 put_page(pagecache_page);
 }
 /* Mirror of the conditional lock_page() above. */
 if (page != pagecache_page)
 unlock_page(page);
 put_page(page);

out_mutex:
 mutex_unlock(&hugetlb_instantiation_mutex);

 return ret;
}
2915
2916
2917__attribute__((weak)) struct page *
2918follow_huge_pud(struct mm_struct *mm, unsigned long address,
2919 pud_t *pud, int write)
2920{
2921 BUG();
2922 return NULL;
2923}
2924
2925int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
2926 struct page **pages, struct vm_area_struct **vmas,
2927 unsigned long *position, int *length, int i,
2928 unsigned int flags)
2929{
2930 unsigned long pfn_offset;
2931 unsigned long vaddr = *position;
2932 int remainder = *length;
2933 struct hstate *h = hstate_vma(vma);
2934
2935 spin_lock(&mm->page_table_lock);
2936 while (vaddr < vma->vm_end && remainder) {
2937 pte_t *pte;
2938 int absent;
2939 struct page *page;
2940
2941
2942
2943
2944
2945
2946 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
2947 absent = !pte || huge_pte_none(huge_ptep_get(pte));
2948
2949
2950
2951
2952
2953
2954
2955
2956 if (absent && (flags & FOLL_DUMP) &&
2957 !hugetlbfs_pagecache_present(h, vma, vaddr)) {
2958 remainder = 0;
2959 break;
2960 }
2961
2962 if (absent ||
2963 ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) {
2964 int ret;
2965
2966 spin_unlock(&mm->page_table_lock);
2967 ret = hugetlb_fault(mm, vma, vaddr,
2968 (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
2969 spin_lock(&mm->page_table_lock);
2970 if (!(ret & VM_FAULT_ERROR))
2971 continue;
2972
2973 remainder = 0;
2974 break;
2975 }
2976
2977 pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
2978 page = pte_page(huge_ptep_get(pte));
2979same_page:
2980 if (pages) {
2981 pages[i] = mem_map_offset(page, pfn_offset);
2982 get_page(pages[i]);
2983 }
2984
2985 if (vmas)
2986 vmas[i] = vma;
2987
2988 vaddr += PAGE_SIZE;
2989 ++pfn_offset;
2990 --remainder;
2991 ++i;
2992 if (vaddr < vma->vm_end && remainder &&
2993 pfn_offset < pages_per_huge_page(h)) {
2994
2995
2996
2997
2998 goto same_page;
2999 }
3000 }
3001 spin_unlock(&mm->page_table_lock);
3002 *length = remainder;
3003 *position = vaddr;
3004
3005 return i ? i : -EFAULT;
3006}
3007
3008void hugetlb_change_protection(struct vm_area_struct *vma,
3009 unsigned long address, unsigned long end, pgprot_t newprot)
3010{
3011 struct mm_struct *mm = vma->vm_mm;
3012 unsigned long start = address;
3013 pte_t *ptep;
3014 pte_t pte;
3015 struct hstate *h = hstate_vma(vma);
3016
3017 BUG_ON(address >= end);
3018 flush_cache_range(vma, address, end);
3019
3020 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
3021 spin_lock(&mm->page_table_lock);
3022 for (; address < end; address += huge_page_size(h)) {
3023 ptep = huge_pte_offset(mm, address);
3024 if (!ptep)
3025 continue;
3026 if (huge_pmd_unshare(mm, &address, ptep))
3027 continue;
3028 if (!huge_pte_none(huge_ptep_get(ptep))) {
3029 pte = huge_ptep_get_and_clear(mm, address, ptep);
3030 pte = pte_mkhuge(pte_modify(pte, newprot));
3031 set_huge_pte_at(mm, address, ptep, pte);
3032 }
3033 }
3034 spin_unlock(&mm->page_table_lock);
3035
3036
3037
3038
3039
3040
3041 flush_tlb_range(vma, start, end);
3042 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
3043}
3044
3045int hugetlb_reserve_pages(struct inode *inode,
3046 long from, long to,
3047 struct vm_area_struct *vma,
3048 vm_flags_t vm_flags)
3049{
3050 long ret, chg;
3051 struct hstate *h = hstate_inode(inode);
3052 struct hugepage_subpool *spool = subpool_inode(inode);
3053
3054
3055
3056
3057
3058
3059 if (vm_flags & VM_NORESERVE)
3060 return 0;
3061
3062
3063
3064
3065
3066
3067
3068 if (!vma || vma->vm_flags & VM_MAYSHARE)
3069 chg = region_chg(&inode->i_mapping->private_list, from, to);
3070 else {
3071 struct resv_map *resv_map = resv_map_alloc();
3072 if (!resv_map)
3073 return -ENOMEM;
3074
3075 chg = to - from;
3076
3077 set_vma_resv_map(vma, resv_map);
3078 set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
3079 }
3080
3081 if (chg < 0) {
3082 ret = chg;
3083 goto out_err;
3084 }
3085
3086
3087 if (hugepage_subpool_get_pages(spool, chg)) {
3088 ret = -ENOSPC;
3089 goto out_err;
3090 }
3091
3092
3093
3094
3095
3096 ret = hugetlb_acct_memory(h, chg);
3097 if (ret < 0) {
3098 hugepage_subpool_put_pages(spool, chg);
3099 goto out_err;
3100 }
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113 if (!vma || vma->vm_flags & VM_MAYSHARE)
3114 region_add(&inode->i_mapping->private_list, from, to);
3115 return 0;
3116out_err:
3117 if (vma)
3118 resv_map_put(vma);
3119 return ret;
3120}
3121
3122void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
3123{
3124 struct hstate *h = hstate_inode(inode);
3125 long chg = region_truncate(&inode->i_mapping->private_list, offset);
3126 struct hugepage_subpool *spool = subpool_inode(inode);
3127
3128 spin_lock(&inode->i_lock);
3129 inode->i_blocks -= (blocks_per_huge_page(h) * freed);
3130 spin_unlock(&inode->i_lock);
3131
3132 hugepage_subpool_put_pages(spool, (chg - freed));
3133 hugetlb_acct_memory(h, -(chg - freed));
3134}
3135
3136#ifdef CONFIG_MEMORY_FAILURE
3137
3138
3139static int is_hugepage_on_freelist(struct page *hpage)
3140{
3141 struct page *page;
3142 struct page *tmp;
3143 struct hstate *h = page_hstate(hpage);
3144 int nid = page_to_nid(hpage);
3145
3146 list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
3147 if (page == hpage)
3148 return 1;
3149 return 0;
3150}
3151
3152
3153
3154
3155
3156int dequeue_hwpoisoned_huge_page(struct page *hpage)
3157{
3158 struct hstate *h = page_hstate(hpage);
3159 int nid = page_to_nid(hpage);
3160 int ret = -EBUSY;
3161
3162 spin_lock(&hugetlb_lock);
3163 if (is_hugepage_on_freelist(hpage)) {
3164 list_del(&hpage->lru);
3165 set_page_refcounted(hpage);
3166 h->free_huge_pages--;
3167 h->free_huge_pages_node[nid]--;
3168 ret = 0;
3169 }
3170 spin_unlock(&hugetlb_lock);
3171 return ret;
3172}
3173#endif
3174