// SPDX-License-Identifier: GPL-2.0
/*
 * Scheduler topology setup/handling methods
 */
#include "sched.h"

DEFINE_MUTEX(sched_domains_mutex);

/* Protected by sched_domains_mutex: */
static cpumask_var_t sched_domains_tmpmask;
static cpumask_var_t sched_domains_tmpmask2;
12
13#ifdef CONFIG_SCHED_DEBUG
14
15static int __init sched_debug_setup(char *str)
16{
17 sched_debug_verbose = true;
18
19 return 0;
20}
21early_param("sched_verbose", sched_debug_setup);
22
23static inline bool sched_debug(void)
24{
25 return sched_debug_verbose;
26}
27
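/*
 * Build the sd_flag_debug[] table from <linux/sched/sd_flags.h>: each
 * SD_FLAG(name, meta) entry in that header expands to one array element,
 * roughly (illustrative expansion, actual metadata lives in sd_flags.h):
 *
 *	[__SD_BALANCE_NEWIDLE] = {
 *		.meta_flags = SDF_SHARED_CHILD | SDF_NEEDS_GROUPS,
 *		.name	    = "SD_BALANCE_NEWIDLE",
 *	},
 *
 * so the debug code below can print the name and metadata of every
 * sched_domain flag.
 */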
28#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
29const struct sd_flag_debug sd_flag_debug[] = {
30#include <linux/sched/sd_flags.h>
31};
32#undef SD_FLAG
33
34static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
35 struct cpumask *groupmask)
36{
37 struct sched_group *group = sd->groups;
38 unsigned long flags = sd->flags;
39 unsigned int idx;
40
41 cpumask_clear(groupmask);
42
	printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
	printk(KERN_CONT "span=%*pbl level=%s\n",
	       cpumask_pr_args(sched_domain_span(sd)), sd->name);
46
47 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
48 printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
49 }
50 if (group && !cpumask_test_cpu(cpu, sched_group_span(group))) {
51 printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
52 }
53
54 for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
55 unsigned int flag = BIT(idx);
56 unsigned int meta_flags = sd_flag_debug[idx].meta_flags;
57
58 if ((meta_flags & SDF_SHARED_CHILD) && sd->child &&
59 !(sd->child->flags & flag))
60 printk(KERN_ERR "ERROR: flag %s set here but not in child\n",
61 sd_flag_debug[idx].name);
62
63 if ((meta_flags & SDF_SHARED_PARENT) && sd->parent &&
64 !(sd->parent->flags & flag))
65 printk(KERN_ERR "ERROR: flag %s set here but not in parent\n",
66 sd_flag_debug[idx].name);
67 }
68
69 printk(KERN_DEBUG "%*s groups:", level + 1, "");
70 do {
71 if (!group) {
72 printk("\n");
73 printk(KERN_ERR "ERROR: group is NULL\n");
74 break;
75 }
76
77 if (!cpumask_weight(sched_group_span(group))) {
78 printk(KERN_CONT "\n");
79 printk(KERN_ERR "ERROR: empty group\n");
80 break;
81 }
82
83 if (!(sd->flags & SD_OVERLAP) &&
84 cpumask_intersects(groupmask, sched_group_span(group))) {
85 printk(KERN_CONT "\n");
86 printk(KERN_ERR "ERROR: repeated CPUs\n");
87 break;
88 }
89
90 cpumask_or(groupmask, groupmask, sched_group_span(group));
91
92 printk(KERN_CONT " %d:{ span=%*pbl",
93 group->sgc->id,
94 cpumask_pr_args(sched_group_span(group)));
95
96 if ((sd->flags & SD_OVERLAP) &&
97 !cpumask_equal(group_balance_mask(group), sched_group_span(group))) {
98 printk(KERN_CONT " mask=%*pbl",
99 cpumask_pr_args(group_balance_mask(group)));
100 }
101
102 if (group->sgc->capacity != SCHED_CAPACITY_SCALE)
103 printk(KERN_CONT " cap=%lu", group->sgc->capacity);
104
105 if (group == sd->groups && sd->child &&
106 !cpumask_equal(sched_domain_span(sd->child),
107 sched_group_span(group))) {
108 printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n");
109 }
110
111 printk(KERN_CONT " }");
112
113 group = group->next;
114
115 if (group != sd->groups)
116 printk(KERN_CONT ",");
117
118 } while (group != sd->groups);
119 printk(KERN_CONT "\n");
120
121 if (!cpumask_equal(sched_domain_span(sd), groupmask))
122 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
123
124 if (sd->parent &&
125 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
126 printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
127 return 0;
128}
129
130static void sched_domain_debug(struct sched_domain *sd, int cpu)
131{
132 int level = 0;
133
134 if (!sched_debug_verbose)
135 return;
136
137 if (!sd) {
138 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
139 return;
140 }
141
142 printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu);
143
144 for (;;) {
145 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
146 break;
147 level++;
148 sd = sd->parent;
149 if (!sd)
150 break;
151 }
152}
153#else
154
155# define sched_debug_verbose 0
156# define sched_domain_debug(sd, cpu) do { } while (0)
157static inline bool sched_debug(void)
158{
159 return false;
160}
161#endif
162

/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */
#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) |
static const unsigned int SD_DEGENERATE_GROUPS_MASK =
#include <linux/sched/sd_flags.h>
0;
#undef SD_FLAG
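
/*
 * Illustration of the expansion above (hypothetical subset of flags, only
 * the first of which carries SDF_NEEDS_GROUPS in sd_flags.h):
 *
 *	static const unsigned int SD_DEGENERATE_GROUPS_MASK =
 *		(SD_BALANCE_NEWIDLE * 1) |
 *		(SD_WAKE_AFFINE * 0) |
 *		0;
 *
 * i.e. a mask of every SD flag that is only meaningful when the domain has
 * more than one group.
 */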
169
170static int sd_degenerate(struct sched_domain *sd)
171{
172 if (cpumask_weight(sched_domain_span(sd)) == 1)
173 return 1;
174
175
176 if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) &&
177 (sd->groups != sd->groups->next))
178 return 0;
179
180
181 if (sd->flags & (SD_WAKE_AFFINE))
182 return 0;
183
184 return 1;
185}
186
187static int
188sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
189{
190 unsigned long cflags = sd->flags, pflags = parent->flags;
191
192 if (sd_degenerate(parent))
193 return 1;
194
195 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
196 return 0;
197
198
199 if (parent->groups == parent->groups->next)
200 pflags &= ~SD_DEGENERATE_GROUPS_MASK;
201
202 if (~cflags & pflags)
203 return 0;
204
205 return 1;
206}
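
/*
 * Example (illustrative): on a machine without SMT siblings, the SMT-level
 * domain of each CPU spans a single CPU, so sd_degenerate() returns 1 and
 * cpu_attach_domain() snips it out of the hierarchy. Likewise, a parent
 * domain with the same span as its child and no extra behaviour (single
 * group, no additional flags) is folded away via sd_parent_degenerate().
 */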
207
208#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
209DEFINE_STATIC_KEY_FALSE(sched_energy_present);
210unsigned int sysctl_sched_energy_aware = 1;
211DEFINE_MUTEX(sched_energy_mutex);
212bool sched_energy_update;
213
214void rebuild_sched_domains_energy(void)
215{
216 mutex_lock(&sched_energy_mutex);
217 sched_energy_update = true;
218 rebuild_sched_domains();
219 sched_energy_update = false;
220 mutex_unlock(&sched_energy_mutex);
221}
222
223#ifdef CONFIG_PROC_SYSCTL
224int sched_energy_aware_handler(struct ctl_table *table, int write,
225 void *buffer, size_t *lenp, loff_t *ppos)
226{
227 int ret, state;
228
229 if (write && !capable(CAP_SYS_ADMIN))
230 return -EPERM;
231
232 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
233 if (!ret && write) {
234 state = static_branch_unlikely(&sched_energy_present);
235 if (state != sysctl_sched_energy_aware)
236 rebuild_sched_domains_energy();
237 }
238
239 return ret;
240}
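
/*
 * Example: EAS can be toggled at runtime through the sysctl handled above,
 * e.g.
 *
 *	echo 0 > /proc/sys/kernel/sched_energy_aware
 *
 * which, if the state actually changes, calls rebuild_sched_domains_energy()
 * and tears down / rebuilds the perf domains accordingly.
 */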
241#endif
242
243static void free_pd(struct perf_domain *pd)
244{
245 struct perf_domain *tmp;
246
247 while (pd) {
248 tmp = pd->next;
249 kfree(pd);
250 pd = tmp;
251 }
252}
253
254static struct perf_domain *find_pd(struct perf_domain *pd, int cpu)
255{
256 while (pd) {
257 if (cpumask_test_cpu(cpu, perf_domain_span(pd)))
258 return pd;
259 pd = pd->next;
260 }
261
262 return NULL;
263}
264
265static struct perf_domain *pd_init(int cpu)
266{
267 struct em_perf_domain *obj = em_cpu_get(cpu);
268 struct perf_domain *pd;
269
270 if (!obj) {
271 if (sched_debug())
272 pr_info("%s: no EM found for CPU%d\n", __func__, cpu);
273 return NULL;
274 }
275
276 pd = kzalloc(sizeof(*pd), GFP_KERNEL);
277 if (!pd)
278 return NULL;
279 pd->em_pd = obj;
280
281 return pd;
282}
283
284static void perf_domain_debug(const struct cpumask *cpu_map,
285 struct perf_domain *pd)
286{
287 if (!sched_debug() || !pd)
288 return;
289
290 printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map));
291
292 while (pd) {
293 printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_pstate=%d }",
294 cpumask_first(perf_domain_span(pd)),
295 cpumask_pr_args(perf_domain_span(pd)),
296 em_pd_nr_perf_states(pd->em_pd));
297 pd = pd->next;
298 }
299
300 printk(KERN_CONT "\n");
301}
302
303static void destroy_perf_domain_rcu(struct rcu_head *rp)
304{
305 struct perf_domain *pd;
306
307 pd = container_of(rp, struct perf_domain, rcu);
308 free_pd(pd);
309}
310
311static void sched_energy_set(bool has_eas)
312{
313 if (!has_eas && static_branch_unlikely(&sched_energy_present)) {
314 if (sched_debug())
315 pr_info("%s: stopping EAS\n", __func__);
316 static_branch_disable_cpuslocked(&sched_energy_present);
317 } else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {
318 if (sched_debug())
319 pr_info("%s: starting EAS\n", __func__);
320 static_branch_enable_cpuslocked(&sched_energy_present);
321 }
322}
323

/*
 * EAS can be used on a root domain if it meets all the following conditions:
 *    1. an Energy Model (EM) is available;
 *    2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy;
 *    3. no SMT is detected;
 *    4. schedutil is driving the frequency of all CPUs of the rd;
 *    5. frequency invariance support is present;
 *    6. the EM complexity is low enough to keep scheduling overheads low.
 *
 * The complexity of the Energy Model is defined as:
 *
 *              C = nr_pd * (nr_cpus + nr_ps)
 *
 * with parameters defined as:
 *  - nr_pd:    the number of performance domains
 *  - nr_cpus:  the number of CPUs
 *  - nr_ps:    the sum of the number of performance states of all performance
 *              domains (for example, on a system with 2 performance domains,
 *              with 10 performance states each, nr_ps = 2 * 10 = 20).
 *
 * It is generally not a good idea to use such a model in the wake-up path on
 * systems which have a high number of CPUs or performance states, so EAS is
 * not started when C exceeds EM_MAX_COMPLEXITY.
 */
#define EM_MAX_COMPLEXITY 2048
350
351extern struct cpufreq_governor schedutil_gov;
352static bool build_perf_domains(const struct cpumask *cpu_map)
353{
354 int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map);
355 struct perf_domain *pd = NULL, *tmp;
356 int cpu = cpumask_first(cpu_map);
357 struct root_domain *rd = cpu_rq(cpu)->rd;
358 struct cpufreq_policy *policy;
359 struct cpufreq_governor *gov;
360
361 if (!sysctl_sched_energy_aware)
362 goto free;
363
364
365 if (!per_cpu(sd_asym_cpucapacity, cpu)) {
366 if (sched_debug()) {
367 pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n",
368 cpumask_pr_args(cpu_map));
369 }
370 goto free;
371 }
372
373
374 if (sched_smt_active()) {
375 pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n",
376 cpumask_pr_args(cpu_map));
377 goto free;
378 }
379
380 if (!arch_scale_freq_invariant()) {
381 if (sched_debug()) {
382 pr_warn("rd %*pbl: Disabling EAS: frequency-invariant load tracking not yet supported",
383 cpumask_pr_args(cpu_map));
384 }
385 goto free;
386 }
387
388 for_each_cpu(i, cpu_map) {
389
390 if (find_pd(pd, i))
391 continue;
392
393
394 policy = cpufreq_cpu_get(i);
395 if (!policy)
396 goto free;
397 gov = policy->governor;
398 cpufreq_cpu_put(policy);
399 if (gov != &schedutil_gov) {
400 if (rd->pd)
401 pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n",
402 cpumask_pr_args(cpu_map));
403 goto free;
404 }
405
406
407 tmp = pd_init(i);
408 if (!tmp)
409 goto free;
410 tmp->next = pd;
411 pd = tmp;
412
413
414
415
416
417 nr_pd++;
418 nr_ps += em_pd_nr_perf_states(pd->em_pd);
419 }
420
421
422 if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) {
423 WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n",
424 cpumask_pr_args(cpu_map));
425 goto free;
426 }
427
428 perf_domain_debug(cpu_map, pd);
429
430
431 tmp = rd->pd;
432 rcu_assign_pointer(rd->pd, pd);
433 if (tmp)
434 call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
435
436 return !!pd;
437
438free:
439 free_pd(pd);
440 tmp = rd->pd;
441 rcu_assign_pointer(rd->pd, NULL);
442 if (tmp)
443 call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
444
445 return false;
446}
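
/*
 * Worked example for the EM_MAX_COMPLEXITY check above (illustrative
 * numbers): a root domain with 2 performance domains, 8 CPUs and 10
 * performance states per domain gives
 *
 *	C = nr_pd * (nr_ps + nr_cpus) = 2 * (20 + 8) = 56
 *
 * which is far below EM_MAX_COMPLEXITY (2048), so EAS is allowed to start.
 */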
447#else
448static void free_pd(struct perf_domain *pd) { }
449#endif
450
451static void free_rootdomain(struct rcu_head *rcu)
452{
453 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
454
455 cpupri_cleanup(&rd->cpupri);
456 cpudl_cleanup(&rd->cpudl);
457 free_cpumask_var(rd->dlo_mask);
458 free_cpumask_var(rd->rto_mask);
459 free_cpumask_var(rd->online);
460 free_cpumask_var(rd->span);
461 free_pd(rd->pd);
462 kfree(rd);
463}
464
465void rq_attach_root(struct rq *rq, struct root_domain *rd)
466{
467 struct root_domain *old_rd = NULL;
468 unsigned long flags;
469
470 raw_spin_rq_lock_irqsave(rq, flags);
471
472 if (rq->rd) {
473 old_rd = rq->rd;
474
475 if (cpumask_test_cpu(rq->cpu, old_rd->online))
476 set_rq_offline(rq);
477
478 cpumask_clear_cpu(rq->cpu, old_rd->span);
479
480
481
482
483
484
485 if (!atomic_dec_and_test(&old_rd->refcount))
486 old_rd = NULL;
487 }
488
489 atomic_inc(&rd->refcount);
490 rq->rd = rd;
491
492 cpumask_set_cpu(rq->cpu, rd->span);
493 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
494 set_rq_online(rq);
495
496 raw_spin_rq_unlock_irqrestore(rq, flags);
497
498 if (old_rd)
499 call_rcu(&old_rd->rcu, free_rootdomain);
500}
501
502void sched_get_rd(struct root_domain *rd)
503{
504 atomic_inc(&rd->refcount);
505}
506
507void sched_put_rd(struct root_domain *rd)
508{
509 if (!atomic_dec_and_test(&rd->refcount))
510 return;
511
512 call_rcu(&rd->rcu, free_rootdomain);
513}
514
515static int init_rootdomain(struct root_domain *rd)
516{
517 if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
518 goto out;
519 if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
520 goto free_span;
521 if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
522 goto free_online;
523 if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
524 goto free_dlo_mask;
525
526#ifdef HAVE_RT_PUSH_IPI
527 rd->rto_cpu = -1;
528 raw_spin_lock_init(&rd->rto_lock);
529 init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
530#endif
531
532 rd->visit_gen = 0;
533 init_dl_bw(&rd->dl_bw);
534 if (cpudl_init(&rd->cpudl) != 0)
535 goto free_rto_mask;
536
537 if (cpupri_init(&rd->cpupri) != 0)
538 goto free_cpudl;
539 return 0;
540
541free_cpudl:
542 cpudl_cleanup(&rd->cpudl);
543free_rto_mask:
544 free_cpumask_var(rd->rto_mask);
545free_dlo_mask:
546 free_cpumask_var(rd->dlo_mask);
547free_online:
548 free_cpumask_var(rd->online);
549free_span:
550 free_cpumask_var(rd->span);
551out:
552 return -ENOMEM;
553}
554

/*
 * By default the system creates a single root-domain with all CPUs as
 * members (mimicking the global state we have today).
 */
struct root_domain def_root_domain;
560
561void init_defrootdomain(void)
562{
563 init_rootdomain(&def_root_domain);
564
565 atomic_set(&def_root_domain.refcount, 1);
566}
567
568static struct root_domain *alloc_rootdomain(void)
569{
570 struct root_domain *rd;
571
572 rd = kzalloc(sizeof(*rd), GFP_KERNEL);
573 if (!rd)
574 return NULL;
575
576 if (init_rootdomain(rd) != 0) {
577 kfree(rd);
578 return NULL;
579 }
580
581 return rd;
582}
583
584static void free_sched_groups(struct sched_group *sg, int free_sgc)
585{
586 struct sched_group *tmp, *first;
587
588 if (!sg)
589 return;
590
591 first = sg;
592 do {
593 tmp = sg->next;
594
595 if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
596 kfree(sg->sgc);
597
598 if (atomic_dec_and_test(&sg->ref))
599 kfree(sg);
600 sg = tmp;
601 } while (sg != first);
602}
603
static void destroy_sched_domain(struct sched_domain *sd)
{
	/*
	 * A normal sched domain may have multiple group references, an
	 * overlapping domain, having private groups, only one.  Iterate,
	 * dropping group/capacity references, freeing where none remain.
	 */
	free_sched_groups(sd->groups, 1);

	if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
		kfree(sd->shared);
	kfree(sd);
}
617
618static void destroy_sched_domains_rcu(struct rcu_head *rcu)
619{
620 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
621
622 while (sd) {
623 struct sched_domain *parent = sd->parent;
624 destroy_sched_domain(sd);
625 sd = parent;
626 }
627}
628
629static void destroy_sched_domains(struct sched_domain *sd)
630{
631 if (sd)
632 call_rcu(&sd->rcu, destroy_sched_domains_rcu);
633}
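
/*
 * Note: the old domain tree is torn down via call_rcu() because lockless
 * readers (e.g. the wakeup path walking rq->sd under rcu_read_lock()) may
 * still be traversing it; destroy_sched_domains_rcu() only runs once those
 * readers have finished.
 */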
634

/*
 * Keep a special pointer to the highest sched_domain that has
 * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this allows us to
 * avoid some pointer chasing in select_idle_sibling().
 *
 * Also keep a unique ID per domain (we use the first CPU number in the
 * cpumask of the domain), this allows us to quickly tell if two CPUs are
 * in the same cache domain, see cpus_share_cache().
 */
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_size);
DEFINE_PER_CPU(int, sd_llc_id);
DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
652
653static void update_top_cache_domain(int cpu)
654{
655 struct sched_domain_shared *sds = NULL;
656 struct sched_domain *sd;
657 int id = cpu;
658 int size = 1;
659
660 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
661 if (sd) {
662 id = cpumask_first(sched_domain_span(sd));
663 size = cpumask_weight(sched_domain_span(sd));
664 sds = sd->shared;
665 }
666
667 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
668 per_cpu(sd_llc_size, cpu) = size;
669 per_cpu(sd_llc_id, cpu) = id;
670 rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
671
672 sd = lowest_flag_domain(cpu, SD_NUMA);
673 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
674
675 sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
676 rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
677
678 sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL);
679 rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
680}
681

/*
 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
 * hold the hotplug lock.
 */
static void
cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
688{
689 struct rq *rq = cpu_rq(cpu);
690 struct sched_domain *tmp;
691 int numa_distance = 0;
692
693
694 for (tmp = sd; tmp; ) {
695 struct sched_domain *parent = tmp->parent;
696 if (!parent)
697 break;
698
699 if (sd_parent_degenerate(tmp, parent)) {
700 tmp->parent = parent->parent;
701 if (parent->parent)
702 parent->parent->child = tmp;
703
704
705
706
707
708 if (parent->flags & SD_PREFER_SIBLING)
709 tmp->flags |= SD_PREFER_SIBLING;
710 destroy_sched_domain(parent);
711 } else
712 tmp = tmp->parent;
713 }
714
715 if (sd && sd_degenerate(sd)) {
716 tmp = sd;
717 sd = sd->parent;
718 destroy_sched_domain(tmp);
719 if (sd)
720 sd->child = NULL;
721 }
722
723 for (tmp = sd; tmp; tmp = tmp->parent)
724 numa_distance += !!(tmp->flags & SD_NUMA);
725
726 sched_domain_debug(sd, cpu);
727
728 rq_attach_root(rq, rd);
729 tmp = rq->sd;
730 rcu_assign_pointer(rq->sd, sd);
731 dirty_sched_domain_sysctl(cpu);
732 destroy_sched_domains(tmp);
733
734 update_top_cache_domain(cpu);
735}
736
737struct s_data {
738 struct sched_domain * __percpu *sd;
739 struct root_domain *rd;
740};
741
742enum s_alloc {
743 sa_rootdomain,
744 sa_sd,
745 sa_sd_storage,
746 sa_none,
747};
748

/*
 * Return the canonical balance CPU for this group, this is the first CPU
 * of this group that's also in the balance mask.
 *
 * The balance mask are all those CPUs that could actually end up at this
 * group. See build_balance_mask().
 *
 * Also see should_we_balance().
 */
int group_balance_cpu(struct sched_group *sg)
{
	return cpumask_first(group_balance_mask(sg));
}
762

/*
 * NUMA topology (first read the regular topology blurb below)
 *
 * Given a node-distance table such as:
 *
 *   node   0   1   2   3
 *     0:  10  20  30  20
 *     1:  20  10  20  30
 *     2:  30  20  10  20
 *     3:  20  30  20  10
 *
 * which describes a 4 node ring topology:
 *
 *   0 ----- 1
 *   |       |
 *   |       |
 *   |       |
 *   3 ----- 2
 *
 * we build the NUMA domains on 'hops': for each level we construct, per
 * CPU, the mask of all nodes reachable within that distance. Because these
 * higher domains overlap between CPUs (the span seen from node 0 differs
 * from the span seen from node 1), the regular group construction below
 * cannot be used; build_overlap_sched_groups() instead builds each group
 * from a sibling's child domain, and every group additionally carries a
 * balance mask, the set of CPUs that can actually load balance on behalf
 * of that group, see build_balance_mask() and group_balance_cpu().
 */
869static void
870build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask)
871{
872 const struct cpumask *sg_span = sched_group_span(sg);
873 struct sd_data *sdd = sd->private;
874 struct sched_domain *sibling;
875 int i;
876
877 cpumask_clear(mask);
878
879 for_each_cpu(i, sg_span) {
880 sibling = *per_cpu_ptr(sdd->sd, i);

		/*
		 * Can happen in the asymmetric case, where these siblings are
		 * unused. The mask will not be empty because those CPUs that
		 * do have the top domain _should_ span the domain.
		 */
		if (!sibling->child)
			continue;

		/* If we would not end up here, we can't continue from here */
		if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
			continue;

		cpumask_set_cpu(i, mask);
	}

	/* We must not have empty masks here */
	WARN_ON_ONCE(cpumask_empty(mask));
899}
900
901
902
903
904
905
906static struct sched_group *
907build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
908{
909 struct sched_group *sg;
910 struct cpumask *sg_span;
911
912 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
913 GFP_KERNEL, cpu_to_node(cpu));
914
915 if (!sg)
916 return NULL;
917
918 sg_span = sched_group_span(sg);
919 if (sd->child)
920 cpumask_copy(sg_span, sched_domain_span(sd->child));
921 else
922 cpumask_copy(sg_span, sched_domain_span(sd));
923
924 atomic_inc(&sg->ref);
925 return sg;
926}
927
928static void init_overlap_sched_group(struct sched_domain *sd,
929 struct sched_group *sg)
930{
931 struct cpumask *mask = sched_domains_tmpmask2;
932 struct sd_data *sdd = sd->private;
933 struct cpumask *sg_span;
934 int cpu;
935
936 build_balance_mask(sd, sg, mask);
937 cpu = cpumask_first(mask);
938
939 sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
940 if (atomic_inc_return(&sg->sgc->ref) == 1)
941 cpumask_copy(group_balance_mask(sg), mask);
942 else
943 WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask));
944
945
946
947
948
949
950 sg_span = sched_group_span(sg);
951 sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
952 sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
953 sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
954}
955
956static struct sched_domain *
957find_descended_sibling(struct sched_domain *sd, struct sched_domain *sibling)
958{
959
960
961
962
963 while (sibling->child &&
964 !cpumask_subset(sched_domain_span(sibling->child),
965 sched_domain_span(sd)))
966 sibling = sibling->child;
967
968
969
970
971
972
973 while (sibling->child &&
974 cpumask_equal(sched_domain_span(sibling->child),
975 sched_domain_span(sibling)))
976 sibling = sibling->child;
977
978 return sibling;
979}
980
981static int
982build_overlap_sched_groups(struct sched_domain *sd, int cpu)
983{
984 struct sched_group *first = NULL, *last = NULL, *sg;
985 const struct cpumask *span = sched_domain_span(sd);
986 struct cpumask *covered = sched_domains_tmpmask;
987 struct sd_data *sdd = sd->private;
988 struct sched_domain *sibling;
989 int i;
990
991 cpumask_clear(covered);
992
993 for_each_cpu_wrap(i, span, cpu) {
994 struct cpumask *sg_span;
995
996 if (cpumask_test_cpu(i, covered))
997 continue;
998
999 sibling = *per_cpu_ptr(sdd->sd, i);

		/*
		 * Asymmetric node setups can result in situations where the
		 * domain tree is of unequal depth, make sure to skip domains
		 * that already cover the entire range.
		 *
		 * In that case build_sched_domains() will have terminated the
		 * iteration early and our sibling sd spans will be empty.
		 * Domains should always include the CPU they're built on, so
		 * check that.
		 */
		if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
			continue;
1013

		/*
		 * Usually we build sched_group by sibling's child sched_domain
		 * But for machines whose NUMA diameter are 3 or above, we move
		 * to build sched_group by sibling's proper descendant's child
		 * domain because sibling's child sched_domain will span out of
		 * the sched_domain being built as below.
		 *
		 * Smallest diameter=3 topology is:
		 *
		 *   node   0   1   2   3
		 *     0:  10  20  30  40
		 *     1:  20  10  20  30
		 *     2:  30  20  10  20
		 *     3:  40  30  20  10
		 *
		 *   0 --- 1 --- 2 --- 3
		 *
		 * In that case the child domain of a sibling may span CPUs
		 * outside the domain being built, which would make the group
		 * span a non-subset of the domain span; find_descended_sibling()
		 * walks down to a descendant whose child span does fit.
		 */
1045 if (sibling->child &&
1046 !cpumask_subset(sched_domain_span(sibling->child), span))
1047 sibling = find_descended_sibling(sd, sibling);
1048
1049 sg = build_group_from_child_sched_domain(sibling, cpu);
1050 if (!sg)
1051 goto fail;
1052
1053 sg_span = sched_group_span(sg);
1054 cpumask_or(covered, covered, sg_span);
1055
1056 init_overlap_sched_group(sibling, sg);
1057
1058 if (!first)
1059 first = sg;
1060 if (last)
1061 last->next = sg;
1062 last = sg;
1063 last->next = first;
1064 }
1065 sd->groups = first;
1066
1067 return 0;
1068
1069fail:
1070 free_sched_groups(first, 0);
1071
1072 return -ENOMEM;
1073}
1074

/*
 * Regular (non-NUMA) topology
 *
 * The scheduler builds a tree of sched_domains bottom-up from the topology
 * levels below (SMT, MC, DIE by default). Each domain has a circular list
 * of sched_groups; the groups of a domain are the spans of that domain's
 * child domains (or single CPUs at the lowest level), and every CPU in a
 * domain's span belongs to exactly one of its groups.
 *
 * Load balancing happens between the groups of a domain, and each group
 * carries a sched_group_capacity structure describing the compute capacity
 * of its span. Because the spans at a given level are identical for all
 * CPUs they contain, the group and capacity structures are shared: they are
 * allocated per CPU in __sdt_alloc() and reference counted in get_group().
 *
 * get_group() and build_sched_groups() below implement this construction
 * for the non-overlapping case.
 */
1147static struct sched_group *get_group(int cpu, struct sd_data *sdd)
1148{
1149 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
1150 struct sched_domain *child = sd->child;
1151 struct sched_group *sg;
1152 bool already_visited;
1153
1154 if (child)
1155 cpu = cpumask_first(sched_domain_span(child));
1156
1157 sg = *per_cpu_ptr(sdd->sg, cpu);
1158 sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
1159
1160
1161 already_visited = atomic_inc_return(&sg->ref) > 1;
1162
1163 WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1));
1164
1165
1166 if (already_visited)
1167 return sg;
1168
1169 if (child) {
1170 cpumask_copy(sched_group_span(sg), sched_domain_span(child));
1171 cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
1172 } else {
1173 cpumask_set_cpu(cpu, sched_group_span(sg));
1174 cpumask_set_cpu(cpu, group_balance_mask(sg));
1175 }
1176
1177 sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
1178 sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
1179 sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
1180
1181 return sg;
1182}
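
/*
 * A group is shared by every CPU in its span, so get_group() is typically
 * reached several times for the same group while one domain level is being
 * built; the ->ref/->sgc->ref counts above ensure the span, balance mask
 * and capacity are only initialized on the first visit.
 */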
1183

/*
 * build_sched_groups will build a circular linked list of the groups
 * covered by the given span, will set each group's ->cpumask correctly,
 * and will initialize their ->sgc.
 *
 * Assumes the sched_domain tree is fully constructed
 */
static int
build_sched_groups(struct sched_domain *sd, int cpu)
1193{
1194 struct sched_group *first = NULL, *last = NULL;
1195 struct sd_data *sdd = sd->private;
1196 const struct cpumask *span = sched_domain_span(sd);
1197 struct cpumask *covered;
1198 int i;
1199
1200 lockdep_assert_held(&sched_domains_mutex);
1201 covered = sched_domains_tmpmask;
1202
1203 cpumask_clear(covered);
1204
1205 for_each_cpu_wrap(i, span, cpu) {
1206 struct sched_group *sg;
1207
1208 if (cpumask_test_cpu(i, covered))
1209 continue;
1210
1211 sg = get_group(i, sdd);
1212
1213 cpumask_or(covered, covered, sched_group_span(sg));
1214
1215 if (!first)
1216 first = sg;
1217 if (last)
1218 last->next = sg;
1219 last = sg;
1220 }
1221 last->next = first;
1222 sd->groups = first;
1223
1224 return 0;
1225}
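
/*
 * Illustration: for a domain spanning CPUs 0-3 whose groups are single
 * CPUs (lowest topology level), the loop above links four groups into a
 * circular list:
 *
 *	{0} -> {1} -> {2} -> {3} -> {0} -> ...
 *
 * with sd->groups pointing at the group containing @cpu, so each CPU sees
 * the list starting at its own local group.
 */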
1226

/*
 * Initialize sched groups cpu_capacity.
 *
 * cpu_capacity indicates the capacity of sched group, which is used while
 * distributing the load between different sched groups in a sched domain.
 * Typically cpu_capacity for all the groups in a sched domain will be same
 * unless there are asymmetries in the topology. If there are asymmetries,
 * group having more cpu_capacity will pickup more load compared to the
 * group having less cpu_capacity.
 */
static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
{
1239 struct sched_group *sg = sd->groups;
1240
1241 WARN_ON(!sg);
1242
1243 do {
1244 int cpu, max_cpu = -1;
1245
1246 sg->group_weight = cpumask_weight(sched_group_span(sg));
1247
1248 if (!(sd->flags & SD_ASYM_PACKING))
1249 goto next;
1250
1251 for_each_cpu(cpu, sched_group_span(sg)) {
1252 if (max_cpu < 0)
1253 max_cpu = cpu;
1254 else if (sched_asym_prefer(cpu, max_cpu))
1255 max_cpu = cpu;
1256 }
1257 sg->asym_prefer_cpu = max_cpu;
1258
1259next:
1260 sg = sg->next;
1261 } while (sg != sd->groups);
1262
1263 if (cpu != group_balance_cpu(sg))
1264 return;
1265
1266 update_group_capacity(sd, cpu);
1267}
1268

/*
 * Asymmetric CPU capacity bits
 */
struct asym_cap_data {
	struct list_head link;
	unsigned long capacity;
	unsigned long cpus[];
};

/*
 * Set of available CPUs grouped by their corresponding capacities
 * Each list entry contains a CPU mask reflecting CPUs that share the same
 * capacity.
 * The lifespan of data is unlimited.
 */
static LIST_HEAD(asym_cap_list);

#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus)
1287

/*
 * Verify whether there is any CPU capacity asymmetry in a given sched domain.
 * Provides sd_flags reflecting the asymmetry scope.
 */
static inline int
asym_cpu_capacity_classify(const struct cpumask *sd_span,
			   const struct cpumask *cpu_map)
{
1296 struct asym_cap_data *entry;
1297 int count = 0, miss = 0;
1298
1299
1300
1301
1302
1303
1304
1305 list_for_each_entry(entry, &asym_cap_list, link) {
1306 if (cpumask_intersects(sd_span, cpu_capacity_span(entry)))
1307 ++count;
1308 else if (cpumask_intersects(cpu_map, cpu_capacity_span(entry)))
1309 ++miss;
1310 }
1311
1312 WARN_ON_ONCE(!count && !list_empty(&asym_cap_list));
1313
1314
1315 if (count < 2)
1316 return 0;
1317
1318 if (miss)
1319 return SD_ASYM_CPUCAPACITY;
1320
1321
1322 return SD_ASYM_CPUCAPACITY | SD_ASYM_CPUCAPACITY_FULL;
1323
1324}
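
/*
 * Example (illustrative): on a two-cluster big.LITTLE system with CPU
 * capacities {1024, 512}, a top-level span covering both clusters
 * intersects both asym_cap_list entries (count == 2, miss == 0) and is
 * tagged SD_ASYM_CPUCAPACITY | SD_ASYM_CPUCAPACITY_FULL, whereas an
 * MC-level span covering only the little cluster matches a single entry
 * (count == 1) and gets no asymmetry flags.
 */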
1325
1326static inline void asym_cpu_capacity_update_data(int cpu)
1327{
1328 unsigned long capacity = arch_scale_cpu_capacity(cpu);
1329 struct asym_cap_data *entry = NULL;
1330
1331 list_for_each_entry(entry, &asym_cap_list, link) {
1332 if (capacity == entry->capacity)
1333 goto done;
1334 }
1335
1336 entry = kzalloc(sizeof(*entry) + cpumask_size(), GFP_KERNEL);
1337 if (WARN_ONCE(!entry, "Failed to allocate memory for asymmetry data\n"))
1338 return;
1339 entry->capacity = capacity;
1340 list_add(&entry->link, &asym_cap_list);
1341done:
1342 __cpumask_set_cpu(cpu, cpu_capacity_span(entry));
1343}
1344

/*
 * Build-up/update list of CPUs grouped by their capacities
 * An update requires explicit request to rebuild sched domains
 * with state indicating CPU topology changes.
 */
static void asym_cpu_capacity_scan(void)
{
1352 struct asym_cap_data *entry, *next;
1353 int cpu;
1354
1355 list_for_each_entry(entry, &asym_cap_list, link)
1356 cpumask_clear(cpu_capacity_span(entry));
1357
1358 for_each_cpu_and(cpu, cpu_possible_mask, housekeeping_cpumask(HK_FLAG_DOMAIN))
1359 asym_cpu_capacity_update_data(cpu);
1360
1361 list_for_each_entry_safe(entry, next, &asym_cap_list, link) {
1362 if (cpumask_empty(cpu_capacity_span(entry))) {
1363 list_del(&entry->link);
1364 kfree(entry);
1365 }
1366 }
1367
1368
1369
1370
1371
1372 if (list_is_singular(&asym_cap_list)) {
1373 entry = list_first_entry(&asym_cap_list, typeof(*entry), link);
1374 list_del(&entry->link);
1375 kfree(entry);
1376 }
1377}
1378

/*
 * Initializers for schedule domains
 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
 */
static int default_relax_domain_level = -1;
int sched_domain_level_max;
1386
1387static int __init setup_relax_domain_level(char *str)
1388{
1389 if (kstrtoint(str, 0, &default_relax_domain_level))
1390 pr_warn("Unable to set relax_domain_level\n");
1391
1392 return 1;
1393}
1394__setup("relax_domain_level=", setup_relax_domain_level);
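
/*
 * Example: booting with "relax_domain_level=2" requests that domains above
 * level 2 drop SD_BALANCE_WAKE and SD_BALANCE_NEWIDLE (see
 * set_domain_attribute() below), which limits how far newidle/wakeup
 * balancing may search.
 */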
1395
1396static void set_domain_attribute(struct sched_domain *sd,
1397 struct sched_domain_attr *attr)
1398{
1399 int request;
1400
1401 if (!attr || attr->relax_domain_level < 0) {
1402 if (default_relax_domain_level < 0)
1403 return;
1404 request = default_relax_domain_level;
1405 } else
1406 request = attr->relax_domain_level;
1407
1408 if (sd->level > request) {
1409
1410 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
1411 }
1412}
1413
1414static void __sdt_free(const struct cpumask *cpu_map);
1415static int __sdt_alloc(const struct cpumask *cpu_map);
1416
1417static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
1418 const struct cpumask *cpu_map)
1419{
1420 switch (what) {
1421 case sa_rootdomain:
1422 if (!atomic_read(&d->rd->refcount))
1423 free_rootdomain(&d->rd->rcu);
1424 fallthrough;
1425 case sa_sd:
1426 free_percpu(d->sd);
1427 fallthrough;
1428 case sa_sd_storage:
1429 __sdt_free(cpu_map);
1430 fallthrough;
1431 case sa_none:
1432 break;
1433 }
1434}
1435
1436static enum s_alloc
1437__visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
1438{
1439 memset(d, 0, sizeof(*d));
1440
1441 if (__sdt_alloc(cpu_map))
1442 return sa_sd_storage;
1443 d->sd = alloc_percpu(struct sched_domain *);
1444 if (!d->sd)
1445 return sa_sd_storage;
1446 d->rd = alloc_rootdomain();
1447 if (!d->rd)
1448 return sa_sd;
1449
1450 return sa_rootdomain;
1451}
1452

/*
 * NULL the sd_data elements we've used to build the sched_domain and
 * sched_group structure so they are not freed during __free_domain_allocs().
 */
static void claim_allocations(int cpu, struct sched_domain *sd)
1459{
1460 struct sd_data *sdd = sd->private;
1461
1462 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
1463 *per_cpu_ptr(sdd->sd, cpu) = NULL;
1464
1465 if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
1466 *per_cpu_ptr(sdd->sds, cpu) = NULL;
1467
1468 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
1469 *per_cpu_ptr(sdd->sg, cpu) = NULL;
1470
1471 if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
1472 *per_cpu_ptr(sdd->sgc, cpu) = NULL;
1473}
1474
1475#ifdef CONFIG_NUMA
1476enum numa_topology_type sched_numa_topology_type;
1477
1478static int sched_domains_numa_levels;
1479static int sched_domains_curr_level;
1480
1481int sched_max_numa_distance;
1482static int *sched_domains_numa_distance;
1483static struct cpumask ***sched_domains_numa_masks;
1484int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
1485#endif
1486

/*
 * SD_flags allowed in topology descriptions.
 *
 * These flags are purely descriptive of the topology and do not prescribe
 * behaviour. Behaviour is artificial and mapped in the below sd_init()
 * function:
 *
 *   SD_SHARE_CPUCAPACITY   - describes SMT topologies
 *   SD_SHARE_PKG_RESOURCES - describes shared caches
 *   SD_NUMA                - describes NUMA topologies
 *
 * Odd one out, which besides describing the topology has a quirk also
 * prescribes the desired behaviour that goes along with it:
 *
 *   SD_ASYM_PACKING        - describes SMT quirks
 */
#define TOPOLOGY_SD_FLAGS		\
	(SD_SHARE_CPUCAPACITY	|	\
	 SD_SHARE_PKG_RESOURCES	|	\
	 SD_NUMA		|	\
	 SD_ASYM_PACKING)
1508
1509static struct sched_domain *
1510sd_init(struct sched_domain_topology_level *tl,
1511 const struct cpumask *cpu_map,
1512 struct sched_domain *child, int cpu)
1513{
1514 struct sd_data *sdd = &tl->data;
1515 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
1516 int sd_id, sd_weight, sd_flags = 0;
1517 struct cpumask *sd_span;
1518
1519#ifdef CONFIG_NUMA
1520
1521
1522
1523 sched_domains_curr_level = tl->numa_level;
1524#endif
1525
1526 sd_weight = cpumask_weight(tl->mask(cpu));
1527
1528 if (tl->sd_flags)
1529 sd_flags = (*tl->sd_flags)();
1530 if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
1531 "wrong sd_flags in topology description\n"))
1532 sd_flags &= TOPOLOGY_SD_FLAGS;
1533
1534 *sd = (struct sched_domain){
1535 .min_interval = sd_weight,
1536 .max_interval = 2*sd_weight,
1537 .busy_factor = 16,
1538 .imbalance_pct = 117,
1539
1540 .cache_nice_tries = 0,
1541
1542 .flags = 1*SD_BALANCE_NEWIDLE
1543 | 1*SD_BALANCE_EXEC
1544 | 1*SD_BALANCE_FORK
1545 | 0*SD_BALANCE_WAKE
1546 | 1*SD_WAKE_AFFINE
1547 | 0*SD_SHARE_CPUCAPACITY
1548 | 0*SD_SHARE_PKG_RESOURCES
1549 | 0*SD_SERIALIZE
1550 | 1*SD_PREFER_SIBLING
1551 | 0*SD_NUMA
1552 | sd_flags
1553 ,
1554
1555 .last_balance = jiffies,
1556 .balance_interval = sd_weight,
1557 .max_newidle_lb_cost = 0,
1558 .next_decay_max_lb_cost = jiffies,
1559 .child = child,
1560#ifdef CONFIG_SCHED_DEBUG
1561 .name = tl->name,
1562#endif
1563 };
1564
1565 sd_span = sched_domain_span(sd);
1566 cpumask_and(sd_span, cpu_map, tl->mask(cpu));
1567 sd_id = cpumask_first(sd_span);
1568
1569 sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map);
1570
1571 WARN_ONCE((sd->flags & (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY)) ==
1572 (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY),
1573 "CPU capacity asymmetry not supported on SMT\n");
1574
1575
1576
1577
1578
1579 if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child)
1580 sd->child->flags &= ~SD_PREFER_SIBLING;
1581
1582 if (sd->flags & SD_SHARE_CPUCAPACITY) {
1583 sd->imbalance_pct = 110;
1584
1585 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
1586 sd->imbalance_pct = 117;
1587 sd->cache_nice_tries = 1;
1588
1589#ifdef CONFIG_NUMA
1590 } else if (sd->flags & SD_NUMA) {
1591 sd->cache_nice_tries = 2;
1592
1593 sd->flags &= ~SD_PREFER_SIBLING;
1594 sd->flags |= SD_SERIALIZE;
1595 if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {
1596 sd->flags &= ~(SD_BALANCE_EXEC |
1597 SD_BALANCE_FORK |
1598 SD_WAKE_AFFINE);
1599 }
1600
1601#endif
1602 } else {
1603 sd->cache_nice_tries = 1;
1604 }
1605
1606
1607
1608
1609
1610 if (sd->flags & SD_SHARE_PKG_RESOURCES) {
1611 sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
1612 atomic_inc(&sd->shared->ref);
1613 atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
1614 }
1615
1616 sd->private = sdd;
1617
1618 return sd;
1619}
1620

/*
 * Topology list, bottom-up.
 */
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
	{ NULL, },
};
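
/*
 * Architectures can replace the table above with set_sched_topology().
 * Minimal sketch (hypothetical table; must run before SMP bring-up, since
 * set_sched_topology() refuses changes once sched_smp_initialized is set):
 *
 *	static struct sched_domain_topology_level my_topology[] = {
 *		{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
 *		{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
 *		{ NULL, },
 *	};
 *
 *	set_sched_topology(my_topology);
 */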
1634
1635static struct sched_domain_topology_level *sched_domain_topology =
1636 default_topology;
1637
1638#define for_each_sd_topology(tl) \
1639 for (tl = sched_domain_topology; tl->mask; tl++)
1640
1641void set_sched_topology(struct sched_domain_topology_level *tl)
1642{
1643 if (WARN_ON_ONCE(sched_smp_initialized))
1644 return;
1645
1646 sched_domain_topology = tl;
1647}
1648
1649#ifdef CONFIG_NUMA
1650
1651static const struct cpumask *sd_numa_mask(int cpu)
1652{
1653 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
1654}
1655
1656static void sched_numa_warn(const char *str)
1657{
1658 static int done = false;
	int i, j;
1660
1661 if (done)
1662 return;
1663
1664 done = true;
1665
1666 printk(KERN_WARNING "ERROR: %s\n\n", str);
1667
1668 for (i = 0; i < nr_node_ids; i++) {
1669 printk(KERN_WARNING " ");
1670 for (j = 0; j < nr_node_ids; j++)
1671 printk(KERN_CONT "%02d ", node_distance(i,j));
1672 printk(KERN_CONT "\n");
1673 }
1674 printk(KERN_WARNING "\n");
1675}
1676
1677bool find_numa_distance(int distance)
1678{
1679 int i;
1680
1681 if (distance == node_distance(0, 0))
1682 return true;
1683
1684 for (i = 0; i < sched_domains_numa_levels; i++) {
1685 if (sched_domains_numa_distance[i] == distance)
1686 return true;
1687 }
1688
1689 return false;
1690}
1691

/*
 * A system can have three types of NUMA topology:
 * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
 * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
 * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
 *
 * The difference between a glueless mesh topology and a backplane
 * topology lies in whether communication between not directly
 * connected nodes goes through intermediary nodes (where programs
 * could run), or through backplane controllers. This affects
 * placement of programs.
 *
 * The type of topology can be discerned with the following tests:
 * - If the maximum distance between any nodes is 1 hop, the system
 *   is directly connected.
 * - If for two nodes A and B, located N > 1 hops away from each other,
 *   there is an intermediary node C, which is < N hops away from both
 *   nodes A and B, the system is a glueless mesh.
 */
static void init_numa_topology_type(void)
1712{
1713 int a, b, c, n;
1714
1715 n = sched_max_numa_distance;
1716
1717 if (sched_domains_numa_levels <= 2) {
1718 sched_numa_topology_type = NUMA_DIRECT;
1719 return;
1720 }
1721
1722 for_each_online_node(a) {
1723 for_each_online_node(b) {
1724
1725 if (node_distance(a, b) < n)
1726 continue;
1727
1728
1729 for_each_online_node(c) {
1730 if (node_distance(a, c) < n &&
1731 node_distance(b, c) < n) {
1732 sched_numa_topology_type =
1733 NUMA_GLUELESS_MESH;
1734 return;
1735 }
1736 }
1737
1738 sched_numa_topology_type = NUMA_BACKPLANE;
1739 return;
1740 }
1741 }
1742}
1743
1744
1745#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
1746
1747void sched_init_numa(void)
1748{
1749 struct sched_domain_topology_level *tl;
1750 unsigned long *distance_map;
1751 int nr_levels = 0;
1752 int i, j;
1753
1754
1755
1756
1757
1758 distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
1759 if (!distance_map)
1760 return;
1761
1762 bitmap_zero(distance_map, NR_DISTANCE_VALUES);
1763 for (i = 0; i < nr_node_ids; i++) {
1764 for (j = 0; j < nr_node_ids; j++) {
1765 int distance = node_distance(i, j);
1766
1767 if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
1768 sched_numa_warn("Invalid distance value range");
1769 return;
1770 }
1771
1772 bitmap_set(distance_map, distance, 1);
1773 }
1774 }
1775
1776
1777
1778
1779 nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);
1780
1781 sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
1782 if (!sched_domains_numa_distance) {
1783 bitmap_free(distance_map);
1784 return;
1785 }
1786
1787 for (i = 0, j = 0; i < nr_levels; i++, j++) {
1788 j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
1789 sched_domains_numa_distance[i] = j;
1790 }
1791
1792 bitmap_free(distance_map);
1793

	/*
	 * 'nr_levels' contains the number of unique distances
	 *
	 * The sched_domains_numa_distance[] array includes the actual distance
	 * numbers.
	 *
	 * Reset sched_domains_numa_levels to 0 while the masks below are
	 * being (re)built: if any allocation fails we bail out early, and a
	 * zero level count keeps the partially built arrays from being used.
	 */
	sched_domains_numa_levels = 0;
1811
1812 sched_domains_numa_masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
1813 if (!sched_domains_numa_masks)
1814 return;
1815
1816
1817
1818
1819
1820 for (i = 0; i < nr_levels; i++) {
1821 sched_domains_numa_masks[i] =
1822 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
1823 if (!sched_domains_numa_masks[i])
1824 return;
1825
1826 for (j = 0; j < nr_node_ids; j++) {
1827 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
1828 int k;
1829
1830 if (!mask)
1831 return;
1832
1833 sched_domains_numa_masks[i][j] = mask;
1834
1835 for_each_node(k) {
1836 if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
1837 sched_numa_warn("Node-distance not symmetric");
1838
1839 if (node_distance(j, k) > sched_domains_numa_distance[i])
1840 continue;
1841
1842 cpumask_or(mask, mask, cpumask_of_node(k));
1843 }
1844 }
1845 }
1846
1847
1848 for (i = 0; sched_domain_topology[i].mask; i++);
1849
1850 tl = kzalloc((i + nr_levels + 1) *
1851 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
1852 if (!tl)
1853 return;
1854
1855
1856
1857
1858 for (i = 0; sched_domain_topology[i].mask; i++)
1859 tl[i] = sched_domain_topology[i];
1860
1861
1862
1863
1864 tl[i++] = (struct sched_domain_topology_level){
1865 .mask = sd_numa_mask,
1866 .numa_level = 0,
1867 SD_INIT_NAME(NODE)
1868 };
1869
1870
1871
1872
1873 for (j = 1; j < nr_levels; i++, j++) {
1874 tl[i] = (struct sched_domain_topology_level){
1875 .mask = sd_numa_mask,
1876 .sd_flags = cpu_numa_flags,
1877 .flags = SDTL_OVERLAP,
1878 .numa_level = j,
1879 SD_INIT_NAME(NUMA)
1880 };
1881 }
1882
1883 sched_domain_topology = tl;
1884
1885 sched_domains_numa_levels = nr_levels;
1886 sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];
1887
1888 init_numa_topology_type();
1889}
1890
1891void sched_domains_numa_masks_set(unsigned int cpu)
1892{
1893 int node = cpu_to_node(cpu);
1894 int i, j;
1895
1896 for (i = 0; i < sched_domains_numa_levels; i++) {
1897 for (j = 0; j < nr_node_ids; j++) {
1898 if (node_distance(j, node) <= sched_domains_numa_distance[i])
1899 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
1900 }
1901 }
1902}
1903
1904void sched_domains_numa_masks_clear(unsigned int cpu)
1905{
1906 int i, j;
1907
1908 for (i = 0; i < sched_domains_numa_levels; i++) {
1909 for (j = 0; j < nr_node_ids; j++)
1910 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
1911 }
1912}
1913

/*
 * sched_numa_find_closest() - given the NUMA topology, find the cpu
 *                             closest to @cpu from @cpumask.
 * cpumask: cpumask to find a cpu from
 * cpu: cpu to be close to
 *
 * returns: cpu, or nr_cpu_ids when nothing found.
 */
int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
1923{
1924 int i, j = cpu_to_node(cpu);
1925
1926 for (i = 0; i < sched_domains_numa_levels; i++) {
1927 cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]);
1928 if (cpu < nr_cpu_ids)
1929 return cpu;
1930 }
1931 return nr_cpu_ids;
1932}
1933
1934#endif
1935
1936static int __sdt_alloc(const struct cpumask *cpu_map)
1937{
1938 struct sched_domain_topology_level *tl;
1939 int j;
1940
1941 for_each_sd_topology(tl) {
1942 struct sd_data *sdd = &tl->data;
1943
1944 sdd->sd = alloc_percpu(struct sched_domain *);
1945 if (!sdd->sd)
1946 return -ENOMEM;
1947
1948 sdd->sds = alloc_percpu(struct sched_domain_shared *);
1949 if (!sdd->sds)
1950 return -ENOMEM;
1951
1952 sdd->sg = alloc_percpu(struct sched_group *);
1953 if (!sdd->sg)
1954 return -ENOMEM;
1955
1956 sdd->sgc = alloc_percpu(struct sched_group_capacity *);
1957 if (!sdd->sgc)
1958 return -ENOMEM;
1959
1960 for_each_cpu(j, cpu_map) {
1961 struct sched_domain *sd;
1962 struct sched_domain_shared *sds;
1963 struct sched_group *sg;
1964 struct sched_group_capacity *sgc;
1965
1966 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
1967 GFP_KERNEL, cpu_to_node(j));
1968 if (!sd)
1969 return -ENOMEM;
1970
1971 *per_cpu_ptr(sdd->sd, j) = sd;
1972
1973 sds = kzalloc_node(sizeof(struct sched_domain_shared),
1974 GFP_KERNEL, cpu_to_node(j));
1975 if (!sds)
1976 return -ENOMEM;
1977
1978 *per_cpu_ptr(sdd->sds, j) = sds;
1979
1980 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
1981 GFP_KERNEL, cpu_to_node(j));
1982 if (!sg)
1983 return -ENOMEM;
1984
1985 sg->next = sg;
1986
1987 *per_cpu_ptr(sdd->sg, j) = sg;
1988
1989 sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
1990 GFP_KERNEL, cpu_to_node(j));
1991 if (!sgc)
1992 return -ENOMEM;
1993
1994#ifdef CONFIG_SCHED_DEBUG
1995 sgc->id = j;
1996#endif
1997
1998 *per_cpu_ptr(sdd->sgc, j) = sgc;
1999 }
2000 }
2001
2002 return 0;
2003}
2004
2005static void __sdt_free(const struct cpumask *cpu_map)
2006{
2007 struct sched_domain_topology_level *tl;
2008 int j;
2009
2010 for_each_sd_topology(tl) {
2011 struct sd_data *sdd = &tl->data;
2012
2013 for_each_cpu(j, cpu_map) {
2014 struct sched_domain *sd;
2015
2016 if (sdd->sd) {
2017 sd = *per_cpu_ptr(sdd->sd, j);
2018 if (sd && (sd->flags & SD_OVERLAP))
2019 free_sched_groups(sd->groups, 0);
2020 kfree(*per_cpu_ptr(sdd->sd, j));
2021 }
2022
2023 if (sdd->sds)
2024 kfree(*per_cpu_ptr(sdd->sds, j));
2025 if (sdd->sg)
2026 kfree(*per_cpu_ptr(sdd->sg, j));
2027 if (sdd->sgc)
2028 kfree(*per_cpu_ptr(sdd->sgc, j));
2029 }
2030 free_percpu(sdd->sd);
2031 sdd->sd = NULL;
2032 free_percpu(sdd->sds);
2033 sdd->sds = NULL;
2034 free_percpu(sdd->sg);
2035 sdd->sg = NULL;
2036 free_percpu(sdd->sgc);
2037 sdd->sgc = NULL;
2038 }
2039}
2040
2041static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
2042 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
2043 struct sched_domain *child, int cpu)
2044{
2045 struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
2046
2047 if (child) {
2048 sd->level = child->level + 1;
2049 sched_domain_level_max = max(sched_domain_level_max, sd->level);
2050 child->parent = sd;
2051
2052 if (!cpumask_subset(sched_domain_span(child),
2053 sched_domain_span(sd))) {
2054 pr_err("BUG: arch topology borken\n");
2055#ifdef CONFIG_SCHED_DEBUG
2056 pr_err(" the %s domain not a subset of the %s domain\n",
2057 child->name, sd->name);
2058#endif
2059
2060 cpumask_or(sched_domain_span(sd),
2061 sched_domain_span(sd),
2062 sched_domain_span(child));
2063 }
2064
2065 }
2066 set_domain_attribute(sd, attr);
2067
2068 return sd;
2069}
2070

/*
 * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
 * any two given CPUs at this topology level.
 */
static bool topology_span_sane(struct sched_domain_topology_level *tl,
			      const struct cpumask *cpu_map, int cpu)
2077{
2078 int i;
2079
2080
2081 if (tl->flags & SDTL_OVERLAP)
2082 return true;
2083
2084
2085
2086
2087
2088
2089
2090 for_each_cpu(i, cpu_map) {
2091 if (i == cpu)
2092 continue;
2093
2094
2095
2096
2097
2098
2099 if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
2100 cpumask_intersects(tl->mask(cpu), tl->mask(i)))
2101 return false;
2102 }
2103
2104 return true;
2105}
2106

/*
 * Build sched domains for a given set of CPUs and attach the sched domains
 * to the individual CPUs
 */
static int
build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
2113{
2114 enum s_alloc alloc_state = sa_none;
2115 struct sched_domain *sd;
2116 struct s_data d;
2117 struct rq *rq = NULL;
2118 int i, ret = -ENOMEM;
2119 bool has_asym = false;
2120
2121 if (WARN_ON(cpumask_empty(cpu_map)))
2122 goto error;
2123
2124 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
2125 if (alloc_state != sa_rootdomain)
2126 goto error;
2127
2128
2129 for_each_cpu(i, cpu_map) {
2130 struct sched_domain_topology_level *tl;
2131
2132 sd = NULL;
2133 for_each_sd_topology(tl) {
2134
2135 if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
2136 goto error;
2137
2138 sd = build_sched_domain(tl, cpu_map, attr, sd, i);
2139
2140 has_asym |= sd->flags & SD_ASYM_CPUCAPACITY;
2141
2142 if (tl == sched_domain_topology)
2143 *per_cpu_ptr(d.sd, i) = sd;
2144 if (tl->flags & SDTL_OVERLAP)
2145 sd->flags |= SD_OVERLAP;
2146 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
2147 break;
2148 }
2149 }
2150
2151
2152 for_each_cpu(i, cpu_map) {
2153 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
2154 sd->span_weight = cpumask_weight(sched_domain_span(sd));
2155 if (sd->flags & SD_OVERLAP) {
2156 if (build_overlap_sched_groups(sd, i))
2157 goto error;
2158 } else {
2159 if (build_sched_groups(sd, i))
2160 goto error;
2161 }
2162 }
2163 }
2164
2165
2166 for (i = nr_cpumask_bits-1; i >= 0; i--) {
2167 if (!cpumask_test_cpu(i, cpu_map))
2168 continue;
2169
2170 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
2171 claim_allocations(i, sd);
2172 init_sched_groups_capacity(i, sd);
2173 }
2174 }
2175
2176
2177 rcu_read_lock();
2178 for_each_cpu(i, cpu_map) {
2179 rq = cpu_rq(i);
2180 sd = *per_cpu_ptr(d.sd, i);
2181
2182
2183 if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
2184 WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
2185
2186 cpu_attach_domain(sd, d.rd, i);
2187 }
2188 rcu_read_unlock();
2189
2190 if (has_asym)
2191 static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
2192
2193 if (rq && sched_debug_verbose) {
2194 pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
2195 cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
2196 }
2197
2198 ret = 0;
2199error:
2200 __free_domain_allocs(&d, alloc_state, cpu_map);
2201
2202 return ret;
2203}
2204

/* Current sched domains: */
static cpumask_var_t			*doms_cur;

/* Number of sched domains in 'doms_cur': */
static int				ndoms_cur;

/* Attributes of custom domains in 'doms_cur': */
static struct sched_domain_attr		*dattr_cur;

/*
 * Special case: If a kmalloc() of a doms_cur partition (array of
 * cpumask) fails, then fallback to a single sched domain,
 * as determined by the single cpumask fallback_doms.
 */
static cpumask_var_t			fallback_doms;
2220

/*
 * arch_update_cpu_topology lets virtualized architectures update the
 * CPU core maps. It is supposed to return 1 if the topology changed
 * or 0 if it stayed the same.
 */
int __weak arch_update_cpu_topology(void)
{
	return 0;
}
2230
2231cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
2232{
2233 int i;
2234 cpumask_var_t *doms;
2235
2236 doms = kmalloc_array(ndoms, sizeof(*doms), GFP_KERNEL);
2237 if (!doms)
2238 return NULL;
2239 for (i = 0; i < ndoms; i++) {
2240 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
2241 free_sched_domains(doms, i);
2242 return NULL;
2243 }
2244 }
2245 return doms;
2246}
2247
2248void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
2249{
2250 unsigned int i;
2251 for (i = 0; i < ndoms; i++)
2252 free_cpumask_var(doms[i]);
2253 kfree(doms);
2254}
2255

/*
 * Set up scheduler domains and groups.  For now this excludes isolated
 * CPUs, but could be used to exclude other special cases in the future.
 */
int sched_init_domains(const struct cpumask *cpu_map)
2261{
2262 int err;
2263
2264 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
2265 zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
2266 zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
2267
2268 arch_update_cpu_topology();
2269 asym_cpu_capacity_scan();
2270 ndoms_cur = 1;
2271 doms_cur = alloc_sched_domains(ndoms_cur);
2272 if (!doms_cur)
2273 doms_cur = &fallback_doms;
2274 cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN));
2275 err = build_sched_domains(doms_cur[0], NULL);
2276
2277 return err;
2278}
2279

/*
 * Detach sched domains from a group of CPUs specified in cpu_map
 * These CPUs will now be attached to the NULL domain
 */
static void detach_destroy_domains(const struct cpumask *cpu_map)
2285{
2286 unsigned int cpu = cpumask_any(cpu_map);
2287 int i;
2288
2289 if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu)))
2290 static_branch_dec_cpuslocked(&sched_asym_cpucapacity);
2291
2292 rcu_read_lock();
2293 for_each_cpu(i, cpu_map)
2294 cpu_attach_domain(NULL, &def_root_domain, i);
2295 rcu_read_unlock();
2296}
2297

/* Attribute arrays may be NULL, which compares as the default attributes: */
static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
2300 struct sched_domain_attr *new, int idx_new)
2301{
2302 struct sched_domain_attr tmp;
2303
2304
2305 if (!new && !cur)
2306 return 1;
2307
2308 tmp = SD_ATTR_INIT;
2309
2310 return !memcmp(cur ? (cur + idx_cur) : &tmp,
2311 new ? (new + idx_new) : &tmp,
2312 sizeof(struct sched_domain_attr));
2313}
2314

/*
 * Partition sched domains as specified by the 'ndoms_new'
 * cpumasks in the array doms_new[] of cpumasks. This compares
 * doms_new[] to the current sched domain partitioning, doms_cur[].
 * It destroys each deleted domain and builds each new domain.
 *
 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
 * The masks don't intersect (don't overlap.) We should setup one
 * sched domain for each mask. CPUs not in any of the cpumasks will
 * not be load balanced. If the same cpumask appears both in the
 * current 'doms_cur' domain and in the new 'doms_new', we can leave
 * it as it is.
 *
 * The passed in 'doms_new' should be allocated using
 * alloc_sched_domains.  This routine takes ownership of it and will
 * free_sched_domains it when done with it. If the caller failed the
 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
 * and partition_sched_domains_locked() will fallback to the single
 * partition 'fallback_doms', it also forces the domains to be rebuilt.
 *
 * If doms_new == NULL it will be replaced with cpu_online_mask.
 * ndoms_new == 0 is a special case for destroying existing domains,
 * and it will not create the default domain.
 *
 * Call with hotplug lock and sched_domains_mutex held
 */
void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
				    struct sched_domain_attr *dattr_new)
2343{
2344 bool __maybe_unused has_eas = false;
2345 int i, j, n;
2346 int new_topology;
2347
2348 lockdep_assert_held(&sched_domains_mutex);
2349
2350
2351 new_topology = arch_update_cpu_topology();
2352
2353 if (new_topology)
2354 asym_cpu_capacity_scan();
2355
2356 if (!doms_new) {
2357 WARN_ON_ONCE(dattr_new);
2358 n = 0;
2359 doms_new = alloc_sched_domains(1);
2360 if (doms_new) {
2361 n = 1;
2362 cpumask_and(doms_new[0], cpu_active_mask,
2363 housekeeping_cpumask(HK_FLAG_DOMAIN));
2364 }
2365 } else {
2366 n = ndoms_new;
2367 }
2368
2369
2370 for (i = 0; i < ndoms_cur; i++) {
2371 for (j = 0; j < n && !new_topology; j++) {
2372 if (cpumask_equal(doms_cur[i], doms_new[j]) &&
2373 dattrs_equal(dattr_cur, i, dattr_new, j)) {
2374 struct root_domain *rd;
2375
2376
2377
2378
2379
2380
2381
2382 rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
2383 dl_clear_root_domain(rd);
2384 goto match1;
2385 }
2386 }
2387
2388 detach_destroy_domains(doms_cur[i]);
2389match1:
2390 ;
2391 }
2392
2393 n = ndoms_cur;
2394 if (!doms_new) {
2395 n = 0;
2396 doms_new = &fallback_doms;
2397 cpumask_and(doms_new[0], cpu_active_mask,
2398 housekeeping_cpumask(HK_FLAG_DOMAIN));
2399 }
2400
2401
2402 for (i = 0; i < ndoms_new; i++) {
2403 for (j = 0; j < n && !new_topology; j++) {
2404 if (cpumask_equal(doms_new[i], doms_cur[j]) &&
2405 dattrs_equal(dattr_new, i, dattr_cur, j))
2406 goto match2;
2407 }
2408
2409 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
2410match2:
2411 ;
2412 }
2413
2414#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
2415
2416 for (i = 0; i < ndoms_new; i++) {
2417 for (j = 0; j < n && !sched_energy_update; j++) {
2418 if (cpumask_equal(doms_new[i], doms_cur[j]) &&
2419 cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) {
2420 has_eas = true;
2421 goto match3;
2422 }
2423 }
2424
2425 has_eas |= build_perf_domains(doms_new[i]);
2426match3:
2427 ;
2428 }
2429 sched_energy_set(has_eas);
2430#endif
2431
2432
2433 if (doms_cur != &fallback_doms)
2434 free_sched_domains(doms_cur, ndoms_cur);
2435
2436 kfree(dattr_cur);
2437 doms_cur = doms_new;
2438 dattr_cur = dattr_new;
2439 ndoms_cur = ndoms_new;
2440
2441 update_sched_domain_debugfs();
2442}
2443

/*
 * Call with hotplug lock held
 */
void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
			     struct sched_domain_attr *dattr_new)
2449{
2450 mutex_lock(&sched_domains_mutex);
2451 partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
2452 mutex_unlock(&sched_domains_mutex);
2453}
2454