1 /* SPDX-License-Identifier: GPL-2.0 */
2 /*
3  * A simple scheduler.
4  *
5  * By default, it operates as a simple global weighted vtime scheduler and can
6  * be switched to FIFO scheduling. It also demonstrates the following niceties.
7  *
8  * - Statistics tracking how many tasks are queued to local and global dsq's.
9  * - Termination notification for userspace.
10  *
11  * While very simple, this scheduler should work reasonably well on CPUs with a
12  * uniform L3 cache topology. While preemption is not implemented, the fact that
13  * the scheduling queue is shared across all CPUs means that whatever is at the
14  * front of the queue is likely to be executed fairly quickly given enough
15  * number of CPUs. The FIFO scheduling mode may be beneficial to some workloads
16  * but comes with the usual problems with FIFO scheduling where saturating
17  * threads can easily drown out interactive ones.
18  *
19  * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
20  * Copyright (c) 2022 Tejun Heo <[email protected]>
21  * Copyright (c) 2022 David Vernet <[email protected]>
22  */
23 #include <scx/common.bpf.h>
24 
25 char _license[] SEC("license") = "GPL";
26 
27 const volatile bool fifo_sched;
28 
29 static u64 vtime_now;
30 UEI_DEFINE(uei);
31 
/*
 * ID of the custom DSQ that every task is dispatched to and consumed from.
 *
 * Built-in DSQs such as SCX_DSQ_GLOBAL cannot be used as priority queues, i.e.
 * scx_bpf_dsq_insert_vtime() cannot target them, so a separate DSQ with ID 0
 * is created instead. Had scx_simple only supported global FIFO scheduling,
 * SCX_DSQ_GLOBAL alone would have been sufficient.
 */
#define SHARED_DSQ 0
40 
41 struct {
42 	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
43 	__uint(key_size, sizeof(u32));
44 	__uint(value_size, sizeof(u64));
45 	__uint(max_entries, 2);			/* [local, global] */
46 } stats SEC(".maps");
47 
/*
 * Increment the counter stored in slot @idx of the stats map.
 *
 * A failed lookup (e.g. @idx out of range) is silently ignored.
 */
static void stat_inc(u32 idx)
{
	u64 *slot;

	slot = bpf_map_lookup_elem(&stats, &idx);
	if (!slot)
		return;

	*slot += 1;
}
54 
BPF_STRUCT_OPS(simple_select_cpu,struct task_struct * p,s32 prev_cpu,u64 wake_flags)55 s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
56 {
57 	bool is_idle = false;
58 	s32 cpu;
59 
60 	cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
61 	if (is_idle) {
62 		stat_inc(0);	/* count local queueing */
63 		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
64 	}
65 
66 	return cpu;
67 }
68 
/*
 * Queue @p on the shared DSQ and bump the "global" statistic.
 *
 * In FIFO mode the task is simply appended. Otherwise it is inserted ordered
 * by its vtime, after clamping that vtime so it lags the global vtime by at
 * most one default slice.
 */
void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags)
{
	u64 vtime;

	/* Slot 1 counts global queueing. */
	stat_inc(1);

	if (fifo_sched) {
		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
		return;
	}

	vtime = p->scx.dsq_vtime;

	/*
	 * A task that has been idle must not bank more than one slice worth
	 * of budget, so pull an overly old vtime forward.
	 */
	if (time_before(vtime, vtime_now - SCX_SLICE_DFL))
		vtime = vtime_now - SCX_SLICE_DFL;

	scx_bpf_dsq_insert_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, vtime, enq_flags);
}
89 
/*
 * Dispatch callback: move work from the shared DSQ to the local DSQ.
 *
 * Neither @cpu nor @prev is consulted; the helper's result is not checked.
 */
void BPF_STRUCT_OPS(simple_dispatch, s32 cpu, struct task_struct *prev)
{
	scx_bpf_dsq_move_to_local(SHARED_DSQ);
}
94 
/*
 * Called as @p starts running. In vtime mode, advance the global vtime to the
 * task's vtime if the latter is ahead. Nothing to do in FIFO mode.
 */
void BPF_STRUCT_OPS(simple_running, struct task_struct *p)
{
	u64 task_vtime;

	if (fifo_sched)
		return;

	task_vtime = p->scx.dsq_vtime;

	/*
	 * The global vtime only ever moves forward. Several CPUs may perform
	 * this check-and-store at the same time, so the update is racy; any
	 * resulting error is expected to be contained and temporary, which is
	 * acceptable here.
	 */
	if (time_before(vtime_now, task_vtime))
		vtime_now = task_vtime;
}
109 
/*
 * Called as @p stops running. In vtime mode, charge the task for the part of
 * its slice it used, scaled by the inverse of its weight. Nothing to do in
 * FIFO mode. @runnable is not consulted.
 */
void BPF_STRUCT_OPS(simple_stopping, struct task_struct *p, bool runnable)
{
	u64 used;

	if (fifo_sched)
		return;

	/*
	 * Consumption is derived from what is left of the slice. The default
	 * yield implementation zeroes @p->scx.slice, so a yielding task is
	 * charged as though it burned the whole slice. Should that prove too
	 * harsh on yielders, measure execution time with explicit timestamps
	 * rather than relying on @p->scx.slice.
	 */
	used = SCX_SLICE_DFL - p->scx.slice;

	/* Weight 100 is charged 1:1; heavier tasks accrue vtime more slowly. */
	p->scx.dsq_vtime += used * 100 / p->scx.weight;
}
126 
/*
 * Enable callback: start @p off at the current global vtime.
 */
void BPF_STRUCT_OPS(simple_enable, struct task_struct *p)
{
	p->scx.dsq_vtime = vtime_now;
}
131 
BPF_STRUCT_OPS_SLEEPABLE(simple_init)132 s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init)
133 {
134 	return scx_bpf_create_dsq(SHARED_DSQ, -1);
135 }
136 
/*
 * Scheduler exit: record the exit info @ei into uei via UEI_RECORD(),
 * presumably so that userspace can inspect why the scheduler terminated.
 */
void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}
141 
142 SCX_OPS_DEFINE(simple_ops,
143 	       .select_cpu		= (void *)simple_select_cpu,
144 	       .enqueue			= (void *)simple_enqueue,
145 	       .dispatch		= (void *)simple_dispatch,
146 	       .running			= (void *)simple_running,
147 	       .stopping		= (void *)simple_stopping,
148 	       .enable			= (void *)simple_enable,
149 	       .init			= (void *)simple_init,
150 	       .exit			= (void *)simple_exit,
151 	       .name			= "simple");
152