/*
* kernel/sched_bfs.c, was sched.c
*
* Kernel scheduler and related syscalls
*
* Copyright (C) 1991-2002 Linus Torvalds
*
* 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
* make semaphores SMP safe
* 1998-11-19 Implemented schedule_timeout() and related stuff
* by Andrea Arcangeli
* 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
* hybrid priority-list and round-robin design with
* an array-switch method of distributing timeslices
* and per-CPU runqueues. Cleanups and useful suggestions
* by Davide Libenzi, preemptible kernel bits by Robert Love.
* 2003-09-03 Interactivity tuning by Con Kolivas.
* 2004-04-02 Scheduler domains code by Nick Piggin
* 2007-04-15 Work begun on replacing all interactivity tuning with a
* fair scheduling design by Con Kolivas.
* 2007-05-05 Load balancing (smp-nice) and other improvements
* by Peter Williams
* 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
* 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
* 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
* Thomas Gleixner, Mike Kravetz
* now Brainfuck deadline scheduling policy by Con Kolivas deletes
* a whole lot of those previous things.
*/
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <asm/uaccess.h>
#include <linux/highmem.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
#include <linux/perf_event.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
#include <linux/freezer.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/timer.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <linux/log2.h>
#include <linux/bootmem.h>
#include <linux/ftrace.h>
#include <linux/slab.h>
#include <linux/init_task.h>
#include <asm/tlb.h>
#include <asm/unistd.h>
#include <asm/mutex.h>
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif
#include "sched_cpupri.h"
#include "workqueue_sched.h"
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO)
#define rt_task(p) rt_prio((p)->prio)
#define rt_queue(rq) rt_prio((rq)->rq_prio)
#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH))
#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \
(policy) == SCHED_RR)
#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy))
#define idleprio_task(p) unlikely((p)->policy == SCHED_IDLEPRIO)
#define iso_task(p) unlikely((p)->policy == SCHED_ISO)
#define iso_queue(rq) unlikely((rq)->rq_policy == SCHED_ISO)
#define ISO_PERIOD ((5 * HZ * grq.noc) + 1)
/*
* Convert user-nice values [ -20 ... 0 ... 19 ]
* to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
* and back.
*/
#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
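/*
 * Worked example (assuming the usual MAX_RT_PRIO of 100 and MAX_PRIO of 140):
 * NICE_TO_PRIO(-20) == 100, NICE_TO_PRIO(0) == 120 and NICE_TO_PRIO(19) == 139,
 * so the full nice range maps onto static priorities 100..139, and
 * PRIO_TO_NICE() inverts that mapping exactly.
 */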
/*
* 'User priority' is the nice value converted to something we
 * can work with better when scaling various scheduler parameters;
 * it's a [ 0 ... 39 ] range.
*/
#define USER_PRIO(p) ((p) - MAX_RT_PRIO)
#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
#define SCHED_PRIO(p) ((p) + MAX_RT_PRIO)
#define STOP_PRIO (MAX_RT_PRIO - 1)
/*
* Some helpers for converting to/from various scales. Use shifts to get
 * approximate powers of ten for less overhead.
*/
#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
#define JIFFY_NS (1000000000 / HZ)
#define HALF_JIFFY_NS (1000000000 / HZ / 2)
#define HALF_JIFFY_US (1000000 / HZ / 2)
#define MS_TO_NS(TIME) ((TIME) << 20)
#define MS_TO_US(TIME) ((TIME) << 10)
#define NS_TO_MS(TIME) ((TIME) >> 20)
#define NS_TO_US(TIME) ((TIME) >> 10)
#define RESCHED_US (100) /* Reschedule if less than this many μs left */
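/*
 * Worked example of the shift approximations above: MS_TO_US(6) == 6 << 10 ==
 * 6144 rather than an exact 6000, and MS_TO_NS(6) == 6 << 20 == 6291456 rather
 * than 6000000; the shifts overshoot the decimal factors by roughly 2.4% and
 * 4.9% respectively in exchange for avoiding multiplies and divides.
 */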
/*
 * These values control idle cycle injection: one idle cycle is inserted
 * roughly every idle_cycles_offset cycles, where a cycle corresponds to one
 * call of the schedule() function.
*/
#define INJECTION_IDLE_CYCLE_EACH_TIME_INIT 2000
#define INJECTION_IDLE_CYCLE_EACH_TIME_STANDARD 15
#define INJECTION_IDLE_CYCLE_EACH_TIME_STANDARD_MAX 20
int is_init = 1;
int idle_cycles_offset = 0;
void sched_idle_next(struct rq *rq, int this_cpu, struct task_struct *idle);
static void set_rq_online(struct rq *rq);
static void set_rq_offline(struct rq *rq);
/*
* This is the time all tasks within the same priority round robin.
* Value is in ms and set to a minimum of 6ms. Scales with number of cpus.
* Tunable via /proc interface.
*/
int rr_interval __read_mostly = 6;
/*
* sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks
 * are allowed to run, over a rolling five second period, as real time
 * tasks. This is the total over all online cpus.
*/
int sched_iso_cpu __read_mostly = 70;
/*
 * The relative length of deadline for each priority (nice) level.
*/
static int prio_ratios[PRIO_RANGE] __read_mostly;
/*
* The quota handed out to tasks of all priority levels when refilling their
* time_slice.
*/
static inline int timeslice(void)
{
return MS_TO_US(rr_interval);
}
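/*
 * With the default rr_interval of 6 (ms), timeslice() therefore refills each
 * task's time_slice with MS_TO_US(6) == 6144 us, and per the RESCHED_US
 * comment above a running task is treated as due for rescheduling once fewer
 * than 100 us of that remain.
 */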
/*
* The global runqueue data that all CPUs work off. Data is protected either
* by the global grq lock, or the discrete lock that precedes the data in this
* struct.
*/
struct global_rq {
raw_spinlock_t lock;
unsigned long nr_running;
unsigned long nr_uninterruptible;
unsigned long long nr_switches;
struct list_head queue[PRIO_LIMIT];
DECLARE_BITMAP(prio_bitmap, PRIO_LIMIT + 1);
#ifdef CONFIG_SMP
unsigned long qnr; /* queued not running */
cpumask_t cpu_idle_map;
bool idle_cpus;
#endif
int noc; /* num_online_cpus stored and updated when it changes */
u64 niffies; /* Nanosecond jiffies */
unsigned long last_jiffy; /* Last jiffy we updated niffies */
raw_spinlock_t iso_lock;
int iso_ticks;
int iso_refractory;
};
#ifdef CONFIG_SMP
/*
* We add the notion of a root-domain which will be used to define per-domain
* variables. Each exclusive cpuset essentially defines an island domain by
* fully partitioning the member cpus from any other cpuset. Whenever a new
* exclusive cpuset is created, we also create and attach a new root-domain
* object.
*
*/
struct root_domain {
atomic_t refcount;
atomic_t rto_count;
struct rcu_head rcu;
cpumask_var_t span;
cpumask_var_t online;
/*
* The "RT overload" flag: it gets set if a CPU has more than
* one runnable RT task.
*/
cpumask_var_t rto_mask;
struct cpupri cpupri;
};
/*
* By default the system creates a single root-domain with all cpus as
* members (mimicking the global state we have today).
*/
static struct root_domain def_root_domain;
#endif /* CONFIG_SMP */
/* There can be only one */
static struct global_rq grq;
/*
* This is the main, per-CPU runqueue data structure.
* This data should only be modified by the local cpu.
*/
struct rq {
#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ
u64 nohz_stamp;
unsigned char in_nohz_recently;
#endif
#endif
struct task_struct *curr, *idle, *stop;
struct mm_struct *prev_mm;
/* Stored data about rq->curr to work outside grq lock */
u64 rq_deadline;
unsigned int rq_policy;
int rq_time_slice;
u64 rq_last_ran;
int rq_prio;
bool rq_running; /* There is a task running */
/* Accurate timekeeping data */
u64 timekeep_clock;
unsigned long user_pc, nice_pc, irq_pc, softirq_pc, system_pc,
iowait_pc, idle_pc;
long account_pc;
atomic_t nr_iowait;
#ifdef CONFIG_SMP
int cpu; /* cpu of this runqueue */
bool online;
bool scaling; /* This CPU is managed by a scaling CPU freq governor */
struct task_struct *sticky_task;
struct root_domain *rd;
struct sched_domain *sd;
int *cpu_locality; /* CPU relative cache distance */
#ifdef CONFIG_SCHED_SMT
bool (*siblings_idle)(int cpu);
/* See if all smt siblings are idle */
cpumask_t smt_siblings;
#endif
#ifdef CONFIG_SCHED_MC
bool (*cache_idle)(int cpu);
/* See if all cache siblings are idle */
cpumask_t cache_siblings;
#endif
u64 last_niffy; /* Last time this RQ updated grq.niffies */
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
#endif
#ifdef CONFIG_PARAVIRT
u64 prev_steal_time;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
u64 prev_steal_time_rq;
#endif
u64 clock, old_clock, last_tick;
u64 clock_task;
bool dither;
#ifdef CONFIG_SCHEDSTATS
/* latency stats */
struct sched_info rq_sched_info;
unsigned long long rq_cpu_time;
/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
/* sys_sched_yield() stats */
unsigned int yld_count;
/* schedule() stats */
unsigned int sched_switch;
unsigned int sched_count;
unsigned int sched_goidle;
/* try_to_wake_up() stats */
unsigned int ttwu_count;
unsigned int ttwu_local;
#endif
};
static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
static DEFINE_MUTEX(sched_hotcpu_mutex);
#ifdef CONFIG_SMP
/*
* sched_domains_mutex serialises calls to init_sched_domains,
* detach_destroy_domains and partition_sched_domains.
*/
static DEFINE_MUTEX(sched_domains_mutex);
/*
* By default the system creates a single root-domain with all cpus as
* members (mimicking the global state we have today).
*/
static struct root_domain def_root_domain;
int __weak arch_sd_sibling_asym_packing(void)
{
return 0*SD_ASYM_PACKING;
}
#endif
#define rcu_dereference_check_sched_domain(p) \
rcu_dereference_check((p), \
lockdep_is_held(&sched_domains_mutex))
/*
* The domain tree (rq->sd) is protected by RCU's quiescent state transition.
* See detach_destroy_domains: synchronize_sched for details.
*
* The domain tree of any CPU may only be accessed from within
* preempt-disabled sections.
*/
#define for_each_domain(cpu, __sd) \
for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
static inline void update_rq_clock(struct rq *rq);
/*
* Sanity check should sched_clock return bogus values. We make sure it does
* not appear to go backwards, and use jiffies to determine the maximum and
* minimum it could possibly have increased, and round down to the nearest
* jiffy when it falls outside this.
*/
static inline void niffy_diff(s64 *niff_diff, int jiff_diff)
{
unsigned long min_diff, max_diff;
if (jiff_diff > 1)
min_diff = JIFFIES_TO_NS(jiff_diff - 1);
else
min_diff = 1;
/* Round up to the nearest tick for maximum */
max_diff = JIFFIES_TO_NS(jiff_diff + 1);
if (unlikely(*niff_diff < min_diff || *niff_diff > max_diff))
*niff_diff = min_diff;
}
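/*
 * Example (assuming HZ == 250, so JIFFIES_TO_NS(1) == 4000000): for a
 * jiff_diff of 2 the accepted window is min_diff == 4 ms to max_diff == 12 ms,
 * so a measured niff_diff of, say, 20 ms is treated as bogus sched_clock
 * output and clamped down to the 4 ms minimum.
 */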
#ifdef CONFIG_SMP
#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
#define this_rq() (&__get_cpu_var(runqueues))
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
static inline int cpu_of(struct rq *rq)
{
return rq->cpu;
}
/*
* Niffies are a globally increasing nanosecond counter. Whenever a runqueue
* clock is updated with the grq.lock held, it is an opportunity to update the
* niffies value. Any CPU can update it by adding how much its clock has
* increased since it last updated niffies, minus any added niffies by other
* CPUs.
*/
static inline void update_clocks(struct rq *rq)
{
s64 ndiff;
long jdiff;
update_rq_clock(rq);
ndiff = rq->clock - rq->old_clock;
/* old_clock is only updated when we are updating niffies */
rq->old_clock = rq->clock;
ndiff -= grq.niffies - rq->last_niffy;
jdiff = jiffies - grq.last_jiffy;
niffy_diff(&ndiff, jdiff);
grq.last_jiffy += jdiff;
grq.niffies += ndiff;
rq->last_niffy = grq.niffies;
}
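/*
 * Concrete illustration of the accounting above: if this CPU's clock advanced
 * 5 ms since old_clock but other CPUs have already pushed grq.niffies forward
 * by 3 ms since rq->last_niffy, only the remaining 2 ms is added (subject to
 * the niffy_diff() sanity clamp), so niffies advances as one shared timeline
 * rather than the sum of every CPU's clock.
 */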
#else /* CONFIG_SMP */
static struct rq *uprq;
#define cpu_rq(cpu) (uprq)
#define this_rq() (uprq)
#define task_rq(p) (uprq)
#define cpu_curr(cpu) ((uprq)->curr)
static inline int cpu_of(struct rq *rq)
{
return 0;
}
static inline void update_clocks(struct rq *rq)
{
s64 ndiff;
long jdiff;
update_rq_clock(rq);
ndiff = rq->clock - rq->old_clock;
rq->old_clock = rq->clock;
jdiff = jiffies - grq.last_jiffy;
niffy_diff(&ndiff, jdiff);
grq.last_jiffy += jdiff;
grq.niffies += ndiff;
}
#endif
#define raw_rq() (&__raw_get_cpu_var(runqueues))
#include "sched_stats.h"
#ifndef prepare_arch_switch
# define prepare_arch_switch(next) do { } while (0)
#endif
#ifndef finish_arch_switch
# define finish_arch_switch(prev) do { } while (0)
#endif
/*
* All common locking functions performed on grq.lock. rq->clock is local to
* the CPU accessing it so it can be modified just with interrupts disabled
* when we're not updating niffies.
* Looking up task_rq must be done under grq.lock to be safe.
*/
static void update_rq_clock_task(struct rq *rq, s64 delta);
static inline void update_rq_clock(struct rq *rq)
{
s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
rq->clock += delta;
update_rq_clock_task(rq, delta);
}
static inline bool task_running(struct task_struct *p)
{
return p->on_cpu;
}
static inline void grq_lock(void)
__acquires(grq.lock)
{
raw_spin_lock(&grq.lock);
}
static inline void grq_unlock(void)
__releases(grq.lock)
{
raw_spin_unlock(&grq.lock);
}
static inline void grq_lock_irq(void)
__acquires(grq.lock)
{
raw_spin_lock_irq(&grq.lock);
}
static inline void time_lock_grq(struct rq *rq)
__acquires(grq.lock)
{
grq_lock();
update_clocks(rq);
}
static inline void grq_unlock_irq(void)
__releases(grq.lock)
{
raw_spin_unlock_irq(&grq.lock);
}
static inline void grq_lock_irqsave(unsigned long *flags)
__acquires(grq.lock)
{
raw_spin_lock_irqsave(&grq.lock, *flags);
}
static inline void grq_unlock_irqrestore(unsigned long *flags)
__releases(grq.lock)
{
raw_spin_unlock_irqrestore(&grq.lock, *flags);
}
static inline struct rq
*task_grq_lock(struct task_struct *p, unsigned long *flags)
__acquires(grq.lock)
{
grq_lock_irqsave(flags);
return task_rq(p);
}
static inline struct rq
*time_task_grq_lock(struct task_struct *p, unsigned long *flags)
__acquires(grq.lock)
{
struct rq *rq = task_grq_lock(p, flags);
update_clocks(rq);
return rq;
}
static inline struct rq *task_grq_lock_irq(struct task_struct *p)
__acquires(grq.lock)
{
grq_lock_irq();
return task_rq(p);
}
static inline void time_task_grq_lock_irq(struct task_struct *p)
__acquires(grq.lock)
{
struct rq *rq = task_grq_lock_irq(p);
update_clocks(rq);
}
static inline void task_grq_unlock_irq(void)
__releases(grq.lock)
{
grq_unlock_irq();
}
static inline void task_grq_unlock(unsigned long *flags)
__releases(grq.lock)
{
grq_unlock_irqrestore(flags);
}
/**
* grunqueue_is_locked
*
* Returns true if the global runqueue is locked.
* This interface allows printk to be called with the runqueue lock
* held and know whether or not it is OK to wake up the klogd.
*/
bool grunqueue_is_locked(void)
{
return raw_spin_is_locked(&grq.lock);
}
void grq_unlock_wait(void)
__releases(grq.lock)
{
smp_mb(); /* spin-unlock-wait is not a full memory barrier */
raw_spin_unlock_wait(&grq.lock);
}
static inline void time_grq_lock(struct rq *rq, unsigned long *flags)
__acquires(grq.lock)
{
local_irq_save(*flags);
time_lock_grq(rq);
}
static inline struct rq *__task_grq_lock(struct task_struct *p)
__acquires(grq.lock)
{
grq_lock();
return task_rq(p);
}
static inline void __task_grq_unlock(void)
__releases(grq.lock)
{
grq_unlock();
}
/*
* Look for any tasks *anywhere* that are running nice 0 or better. We do
* this lockless for overhead reasons since the occasional wrong result
* is harmless.
*/
int above_background_load(void)
{
struct task_struct *cpu_curr;
unsigned long cpu;
for_each_online_cpu(cpu) {
cpu_curr = cpu_rq(cpu)->curr;
if (unlikely(!cpu_curr))
continue;
if (PRIO_TO_NICE(cpu_curr->static_prio) < 1)
return 1;
}
return 0;
}
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
}
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_DEBUG_SPINLOCK
/* this is a valid case when another task releases the spinlock */
grq.lock.owner = current;
#endif
/*
* If we are tracking spinlock dependencies then we have to
* fix up the runqueue lock - which gets 'carried over' from
* prev into current:
*/
spin_acquire(&grq.lock.dep_map, 0, 0, _THIS_IP_);
grq_unlock_irq();
}
#else /* __ARCH_WANT_UNLOCKED_CTXSW */
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
grq_unlock_irq();
#else
grq_unlock();
#endif
}
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
smp_wmb();
#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
local_irq_enable();
#endif
}
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
static inline bool deadline_before(u64 deadline, u64 time)
{
return (deadline < time);
}
static inline bool deadline_after(u64 deadline, u64 time)
{
return (deadline > time);
}
/*
* A task that is queued but not running will be on the grq run list.
* A task that is not running or queued will not be on the grq run list.
* A task that is currently running will have ->on_cpu set but not on the
* grq run list.
*/
static inline bool task_queued(struct task_struct *p)
{
return (!list_empty(&p->run_list));
}
/*
* Removing from the global runqueue. Enter with grq locked.
*/
static void dequeue_task(struct task_struct *p)
{
list_del_init(&p->run_list);
if (list_empty(grq.queue + p->prio))
__clear_bit(p->prio, grq.prio_bitmap);
}
/*
* To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as
* an idle task, we ensure none of the following conditions are met.
*/
static bool idleprio_suitable(struct task_struct *p)
{
return (!freezing(p) && !signal_pending(p) &&
!(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING)));
}
/*
* To determine if a task of SCHED_ISO can run in pseudo-realtime, we check
* that the iso_refractory flag is not set.
*/
static bool isoprio_suitable(void)
{
return !grq.iso_refractory;
}
/*
* Adding to the global runqueue. Enter with grq locked.
*/
static void enqueue_task(struct task_struct *p)
{
if (!rt_task(p)) {
/* Check it hasn't gotten rt from PI */
if ((idleprio_task(p) && idleprio_suitable(p)) ||
(iso_task(p) && isoprio_suitable()))
p->prio = p->normal_prio;
else
p->prio = NORMAL_PRIO;
}
__set_bit(p->prio, grq.prio_bitmap);
list_add_tail(&p->run_list, grq.queue + p->prio);
sched_info_queued(p);
}
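/*
 * Example of the priority check above: a SCHED_ISO task enqueued while
 * grq.iso_refractory is set, or a SCHED_IDLEPRIO task that currently fails
 * idleprio_suitable() (e.g. it has a signal pending), is queued at NORMAL_PRIO
 * and so temporarily competes as an ordinary SCHED_NORMAL task rather than at
 * its special priority.
 */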
/* Only the idle task does this, as a real time task */
static inline void enqueue_task_head(struct task_struct *p)
{
__set_bit(p->prio, grq.prio_bitmap);
list_add(&p->run_list, grq.queue + p->prio);
sched_info_queued(p);
}
static inline void requeue_task(struct task_struct *p)
{
sched_info_queued(p);
}
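/*
 * Illustrative sketch only, not used by the scheduler itself: how the
 * prio_bitmap and queue[] pair in struct global_rq are meant to be used
 * together - find the lowest set priority bit, then take the first task on
 * that priority's list. The name grq_peek_first_queued is made up for this
 * sketch; a real caller would need to hold grq.lock, as the lookup paths
 * later in this file do.
 */
static inline struct task_struct *grq_peek_first_queued(void)
{
	unsigned long idx = find_first_bit(grq.prio_bitmap, PRIO_LIMIT);

	if (idx >= PRIO_LIMIT)
		return NULL;	/* nothing queued at any priority */
	return list_first_entry(grq.queue + idx, struct task_struct, run_list);
}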
/*
 * Returns the relative length of the deadline compared to the shortest
 * deadline, which is that of nice -20.
*/
static inline int task_prio_ratio(struct task_struct *p)
{
return prio_ratios[TASK_USER_PRIO(p)];
}
/*
* task_timeslice - all tasks of all priorities get the exact same timeslice
* length. CPU distribution is handled by giving different deadlines to
* tasks of different priorities. Use 128 as the base value for fast shifts.
*/
static inline int task_timeslice(struct task_struct *p)
{
return (rr_interval * task_prio_ratio(p) / 128);
}
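/*
 * Worked numbers, assuming the nice -20 ratio is the 128 base mentioned above
 * (prio_ratios[] itself is initialised later in this file): with the default
 * rr_interval of 6, a nice -20 task gets task_timeslice() == 6 * 128 / 128 ==
 * 6, and the larger ratios of higher nice levels scale that value up
 * proportionally.
 */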
#ifdef CONFIG_SMP
/*
* qnr is the "queued but not running" count which is the total number of
* tasks on the global runqueue list waiting for cpu time but not actually
* currently running on a cpu.
*/
static inline void inc_qnr(void)
{
grq.qnr++;
}
static inline void dec_qnr(void)
{
grq.qnr--;
}
static inline int queued_notrunning(void)
{
return grq.qnr;
}
/*
* The cpu_idle_map stores a bitmap of all the CPUs currently idle to
* allow easy lookup of whether any suitable idle CPUs are available.
 * It's cheaper to maintain, in the idle_cpus variable, a binary yes/no of
 * whether there are any idle CPUs than to do a full bitmask check when we
 * are busy.
*/
static inline void set_cpuidle_map(int cpu)
{
if (likely(cpu_online(cpu))) {
cpu_set(cpu, grq.cpu_idle_map);
grq.idle_cpus = true;
}
}
static inline void clear_cpuidle_map(int cpu)
{
cpu_clear(cpu, grq.cpu_idle_map);
if (cpus_empty(grq.cpu_idle_map))
grq.idle_cpus = false;
}
static bool suitable_idle_cpus(struct task_struct *p)
{
if (!grq.idle_cpus)
return false;
return (cpus_intersects(p->cpus_allowed, grq.cpu_idle_map));
}
#define CPUIDLE_DIFF_THREAD (1)
#define CPUIDLE_DIFF_CORE (2)
#define CPUIDLE_CACHE_BUSY (4)
#define CPUIDLE_DIFF_CPU (8)
#define CPUIDLE_THREAD_BUSY (16)
#define CPUIDLE_DIFF_NODE (32)
static void resched_task(struct task_struct *p);
/*
* The best idle CPU is chosen according to the CPUIDLE ranking above where the
* lowest value would give the most suitable CPU to schedule p onto next. The
* order works out to be the following:
*
* Same core, idle or busy cache, idle threads
* Other core, same cache, idle or busy cache, idle threads.
* Same node, other CPU, idle cache, idle threads.
* Same node, other CPU, busy cache, idle threads.
* Same core, busy threads.
* Other core, same cache, busy threads.
* Same node, other CPU, busy threads.
* Other node, other CPU, idle cache, idle threads.
* Other node, other CPU, busy cache, idle threads.
* Other node, other CPU, busy threads.
*/
static void
resched_best_mask(int best_cpu, struct rq *rq, cpumask_t *tmpmask)
{
unsigned int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THREAD_BUSY |
CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY | CPUIDLE_DIFF_CORE |
CPUIDLE_DIFF_THREAD;
int cpu_tmp;
if (cpu_isset(best_cpu, *tmpmask))
goto out;
for_each_cpu_mask(cpu_tmp, *tmpmask) {
unsigned int ranking;
struct rq *tmp_rq;
ranking = 0;
tmp_rq = cpu_rq(cpu_tmp);
#ifdef CONFIG_NUMA
if (rq->cpu_locality[cpu_tmp] > 3)
ranking |= CPUIDLE_DIFF_NODE;
else
#endif
if (rq->cpu_locality[cpu_tmp] > 2)
ranking |= CPUIDLE_DIFF_CPU;
#ifdef CONFIG_SCHED_MC
if (rq->cpu_locality[cpu_tmp] == 2)
ranking |= CPUIDLE_DIFF_CORE;
if (!(tmp_rq->cache_idle(cpu_tmp)))
ranking |= CPUIDLE_CACHE_BUSY;
#endif
#ifdef CONFIG_SCHED_SMT
if (rq->cpu_locality[cpu_tmp] == 1)
ranking |= CPUIDLE_DIFF_THREAD;
if (!(tmp_rq->siblings_idle(cpu_tmp)))
ranking |= CPUIDLE_THREAD_BUSY;
#endif
if (ranking < best_ranking) {
best_cpu = cpu_tmp;
best_ranking = ranking;
}
}
out:
resched_task(cpu_rq(best_cpu)->curr);
}
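/*
 * Example of the ranking above: an idle thread on another core sharing this
 * CPU's cache, with that cache busy, scores CPUIDLE_DIFF_CORE |
 * CPUIDLE_CACHE_BUSY == 6, while an otherwise idle CPU on a different NUMA
 * node scores at least CPUIDLE_DIFF_NODE == 32, so the nearby core is
 * preferred despite its busy cache.
 */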
static void resched_best_idle(struct task_struct *p)
{
cpumask_t tmpmask;
cpus_and(tmpmask, p->cpus_allowed, grq.cpu_idle_map);
resched_best_mask(task_cpu(p), task_rq(p), &tmpmask);
}
static inline void resched_suitable_idle(struct task_struct *p)
{
if (suitable_idle_cpus(p))
resched_best_idle(p);
}
/*
* Flags to tell us whether this CPU is running a CPU frequency governor that
* has slowed its speed or not. No locking required as the very rare wrongly
* read value would be harmless.
*/
void cpu_scaling(int cpu)
{
cpu_rq(cpu)->scaling = true;
}
void cpu_nonscaling(int cpu)
{
cpu_rq(cpu)->scaling = false;
}
static inline bool scaling_rq(struct rq *rq)
{
return rq->scaling;
}
#else /* CONFIG_SMP */
static inline void inc_qnr(void)
{
}
static inline void dec_qnr(void)
{
}
static inline int queued_notrunning(void)
{
return grq.nr_running;
}
static inline void set_cpuidle_map(int cpu)
{
}
static inline void clear_cpuidle_map(int cpu)
{
}
static inline bool suitable_idle_cpus(struct task_struct *p)
{
return uprq->curr == uprq->idle;
}
static inline void resched_suitable_idle(struct task_struct *p)
{
}
void cpu_scaling(int __unused)
{
}
void cpu_nonscaling(int __unused)
{
}
/*
* Although CPUs can scale in UP, there is nowhere else for tasks to go so this
 * always returns false.
*/
static inline bool scaling_rq(struct rq *rq)
{
return false;
}
#endif /* CONFIG_SMP */
EXPORT_SYMBOL_GPL(cpu_scaling);
EXPORT_SYMBOL_GPL(cpu_nonscaling);
/*
* activate_idle_task - move idle task to the _front_ of runqueue.
*/
static inline void activate_idle_task(struct task_struct *p)
{
enqueue_task_head(p);
grq.nr_running++;
inc_qnr();
}
static inline int normal_prio(struct task_struct *p)
{
if (has_rt_policy(p))
return MAX_RT_PRIO - 1 - p->rt_priority;
if (idleprio_task(p))
return IDLE_PRIO;
if (iso_task(p))
return ISO_PRIO;
return NORMAL_PRIO;
}
/*
* Calculate the current priority, i.e. the priority
* taken into account by the scheduler. This value might
* be boosted by RT tasks as it will be RT if the task got
* RT-boosted. If not then it returns p->normal_prio.
*/
static int effective_prio(struct task_struct *p)
{
p->normal_prio = normal_prio(p);
/*
* If we are RT tasks or we were boosted to RT priority,
* keep the priority unchanged. Otherwise, update priority
* to the normal priority:
*/
if (!rt_prio(p->prio))
return p->normal_prio;
return p->prio;
}
/*
* activate_task - move a task to the runqueue. Enter with grq locked.