From d4a54e18ca707d4bf2ab7732fce591d598a7c15e Mon Sep 17 00:00:00 2001 From: Jan200101 Date: Wed, 9 Oct 2024 20:08:15 +0200 Subject: kernel 6.11.2 --- SOURCES/scx-kernel.patch | 4761 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 3768 insertions(+), 993 deletions(-) (limited to 'SOURCES/scx-kernel.patch') diff --git a/SOURCES/scx-kernel.patch b/SOURCES/scx-kernel.patch index 29e1f22..196bac1 100644 --- a/SOURCES/scx-kernel.patch +++ b/SOURCES/scx-kernel.patch @@ -1,3 +1,184 @@ +From 11276ed2c72c57624c1214e980efd24648be015c Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 4 Oct 2024 17:12:13 +0200 +Subject: [PATCH] sched-ext + +Signed-off-by: Peter Jung +--- + Documentation/scheduler/index.rst | 1 + + Documentation/scheduler/sched-ext.rst | 326 + + MAINTAINERS | 13 + + drivers/tty/sysrq.c | 1 + + include/asm-generic/vmlinux.lds.h | 1 + + include/linux/cgroup.h | 4 +- + include/linux/sched.h | 5 + + include/linux/sched/ext.h | 216 + + include/linux/sched/task.h | 8 +- + include/trace/events/sched_ext.h | 32 + + include/uapi/linux/sched.h | 1 + + init/Kconfig | 10 + + init/init_task.c | 12 + + kernel/Kconfig.preempt | 27 +- + kernel/fork.c | 17 +- + kernel/sched/build_policy.c | 11 + + kernel/sched/core.c | 288 +- + kernel/sched/cpufreq_schedutil.c | 50 +- + kernel/sched/debug.c | 3 + + kernel/sched/ext.c | 7262 +++++++++++++++++ + kernel/sched/ext.h | 91 + + kernel/sched/fair.c | 21 +- + kernel/sched/idle.c | 2 + + kernel/sched/sched.h | 203 +- + kernel/sched/syscalls.c | 26 + + lib/dump_stack.c | 1 + + tools/Makefile | 10 +- + tools/sched_ext/.gitignore | 2 + + tools/sched_ext/Makefile | 246 + + tools/sched_ext/README.md | 270 + + .../sched_ext/include/bpf-compat/gnu/stubs.h | 11 + + tools/sched_ext/include/scx/common.bpf.h | 427 + + tools/sched_ext/include/scx/common.h | 75 + + tools/sched_ext/include/scx/compat.bpf.h | 47 + + tools/sched_ext/include/scx/compat.h | 186 + + tools/sched_ext/include/scx/user_exit_info.h | 115 + + 
tools/sched_ext/scx_central.bpf.c | 361 + + tools/sched_ext/scx_central.c | 135 + + tools/sched_ext/scx_flatcg.bpf.c | 957 +++ + tools/sched_ext/scx_flatcg.c | 233 + + tools/sched_ext/scx_flatcg.h | 51 + + tools/sched_ext/scx_qmap.bpf.c | 813 ++ + tools/sched_ext/scx_qmap.c | 153 + + tools/sched_ext/scx_show_state.py | 40 + + tools/sched_ext/scx_simple.bpf.c | 156 + + tools/sched_ext/scx_simple.c | 107 + + tools/testing/selftests/sched_ext/.gitignore | 6 + + tools/testing/selftests/sched_ext/Makefile | 218 + + tools/testing/selftests/sched_ext/config | 9 + + .../selftests/sched_ext/create_dsq.bpf.c | 58 + + .../testing/selftests/sched_ext/create_dsq.c | 57 + + .../sched_ext/ddsp_bogus_dsq_fail.bpf.c | 42 + + .../selftests/sched_ext/ddsp_bogus_dsq_fail.c | 57 + + .../sched_ext/ddsp_vtimelocal_fail.bpf.c | 39 + + .../sched_ext/ddsp_vtimelocal_fail.c | 56 + + .../selftests/sched_ext/dsp_local_on.bpf.c | 65 + + .../selftests/sched_ext/dsp_local_on.c | 58 + + .../sched_ext/enq_last_no_enq_fails.bpf.c | 21 + + .../sched_ext/enq_last_no_enq_fails.c | 60 + + .../sched_ext/enq_select_cpu_fails.bpf.c | 43 + + .../sched_ext/enq_select_cpu_fails.c | 61 + + tools/testing/selftests/sched_ext/exit.bpf.c | 84 + + tools/testing/selftests/sched_ext/exit.c | 55 + + tools/testing/selftests/sched_ext/exit_test.h | 20 + + .../testing/selftests/sched_ext/hotplug.bpf.c | 61 + + tools/testing/selftests/sched_ext/hotplug.c | 168 + + .../selftests/sched_ext/hotplug_test.h | 15 + + .../sched_ext/init_enable_count.bpf.c | 53 + + .../selftests/sched_ext/init_enable_count.c | 166 + + .../testing/selftests/sched_ext/maximal.bpf.c | 164 + + tools/testing/selftests/sched_ext/maximal.c | 51 + + .../selftests/sched_ext/maybe_null.bpf.c | 36 + + .../testing/selftests/sched_ext/maybe_null.c | 49 + + .../sched_ext/maybe_null_fail_dsp.bpf.c | 25 + + .../sched_ext/maybe_null_fail_yld.bpf.c | 28 + + .../testing/selftests/sched_ext/minimal.bpf.c | 21 + + tools/testing/selftests/sched_ext/minimal.c | 58 + + 
.../selftests/sched_ext/prog_run.bpf.c | 33 + + tools/testing/selftests/sched_ext/prog_run.c | 78 + + .../testing/selftests/sched_ext/reload_loop.c | 75 + + tools/testing/selftests/sched_ext/runner.c | 201 + + tools/testing/selftests/sched_ext/scx_test.h | 131 + + .../selftests/sched_ext/select_cpu_dfl.bpf.c | 40 + + .../selftests/sched_ext/select_cpu_dfl.c | 72 + + .../sched_ext/select_cpu_dfl_nodispatch.bpf.c | 89 + + .../sched_ext/select_cpu_dfl_nodispatch.c | 72 + + .../sched_ext/select_cpu_dispatch.bpf.c | 41 + + .../selftests/sched_ext/select_cpu_dispatch.c | 70 + + .../select_cpu_dispatch_bad_dsq.bpf.c | 37 + + .../sched_ext/select_cpu_dispatch_bad_dsq.c | 56 + + .../select_cpu_dispatch_dbl_dsp.bpf.c | 38 + + .../sched_ext/select_cpu_dispatch_dbl_dsp.c | 56 + + .../sched_ext/select_cpu_vtime.bpf.c | 92 + + .../selftests/sched_ext/select_cpu_vtime.c | 59 + + .../selftests/sched_ext/test_example.c | 49 + + tools/testing/selftests/sched_ext/util.c | 71 + + tools/testing/selftests/sched_ext/util.h | 13 + + 97 files changed, 16174 insertions(+), 130 deletions(-) + create mode 100644 Documentation/scheduler/sched-ext.rst + create mode 100644 include/linux/sched/ext.h + create mode 100644 include/trace/events/sched_ext.h + create mode 100644 kernel/sched/ext.c + create mode 100644 kernel/sched/ext.h + create mode 100644 tools/sched_ext/.gitignore + create mode 100644 tools/sched_ext/Makefile + create mode 100644 tools/sched_ext/README.md + create mode 100644 tools/sched_ext/include/bpf-compat/gnu/stubs.h + create mode 100644 tools/sched_ext/include/scx/common.bpf.h + create mode 100644 tools/sched_ext/include/scx/common.h + create mode 100644 tools/sched_ext/include/scx/compat.bpf.h + create mode 100644 tools/sched_ext/include/scx/compat.h + create mode 100644 tools/sched_ext/include/scx/user_exit_info.h + create mode 100644 tools/sched_ext/scx_central.bpf.c + create mode 100644 tools/sched_ext/scx_central.c + create mode 100644 tools/sched_ext/scx_flatcg.bpf.c + 
create mode 100644 tools/sched_ext/scx_flatcg.c + create mode 100644 tools/sched_ext/scx_flatcg.h + create mode 100644 tools/sched_ext/scx_qmap.bpf.c + create mode 100644 tools/sched_ext/scx_qmap.c + create mode 100644 tools/sched_ext/scx_show_state.py + create mode 100644 tools/sched_ext/scx_simple.bpf.c + create mode 100644 tools/sched_ext/scx_simple.c + create mode 100644 tools/testing/selftests/sched_ext/.gitignore + create mode 100644 tools/testing/selftests/sched_ext/Makefile + create mode 100644 tools/testing/selftests/sched_ext/config + create mode 100644 tools/testing/selftests/sched_ext/create_dsq.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/create_dsq.c + create mode 100644 tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c + create mode 100644 tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c + create mode 100644 tools/testing/selftests/sched_ext/dsp_local_on.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/dsp_local_on.c + create mode 100644 tools/testing/selftests/sched_ext/enq_last_no_enq_fails.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c + create mode 100644 tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/enq_select_cpu_fails.c + create mode 100644 tools/testing/selftests/sched_ext/exit.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/exit.c + create mode 100644 tools/testing/selftests/sched_ext/exit_test.h + create mode 100644 tools/testing/selftests/sched_ext/hotplug.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/hotplug.c + create mode 100644 tools/testing/selftests/sched_ext/hotplug_test.h + create mode 100644 tools/testing/selftests/sched_ext/init_enable_count.bpf.c + create mode 100644 
tools/testing/selftests/sched_ext/init_enable_count.c + create mode 100644 tools/testing/selftests/sched_ext/maximal.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/maximal.c + create mode 100644 tools/testing/selftests/sched_ext/maybe_null.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/maybe_null.c + create mode 100644 tools/testing/selftests/sched_ext/maybe_null_fail_dsp.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/maybe_null_fail_yld.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/minimal.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/minimal.c + create mode 100644 tools/testing/selftests/sched_ext/prog_run.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/prog_run.c + create mode 100644 tools/testing/selftests/sched_ext/reload_loop.c + create mode 100644 tools/testing/selftests/sched_ext/runner.c + create mode 100644 tools/testing/selftests/sched_ext/scx_test.h + create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl.c + create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c + create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch.c + create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c + create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c + create mode 100644 tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/select_cpu_vtime.c + create mode 100644 
tools/testing/selftests/sched_ext/test_example.c + create mode 100644 tools/testing/selftests/sched_ext/util.c + create mode 100644 tools/testing/selftests/sched_ext/util.h + diff --git a/Documentation/scheduler/index.rst b/Documentation/scheduler/index.rst index 43bd8a145b7a..0611dc3dda8e 100644 --- a/Documentation/scheduler/index.rst @@ -12,10 +193,10 @@ index 43bd8a145b7a..0611dc3dda8e 100644 text_files diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst new file mode 100644 -index 000000000000..a707d2181a77 +index 000000000000..6c0d70e2e27d --- /dev/null +++ b/Documentation/scheduler/sched-ext.rst -@@ -0,0 +1,316 @@ +@@ -0,0 +1,326 @@ +========================== +Extensible Scheduler Class +========================== @@ -101,6 +282,15 @@ index 000000000000..a707d2181a77 + # cat /sys/kernel/sched_ext/root/ops + simple + ++You can check if any BPF scheduler has ever been loaded since boot by examining ++this monotonically incrementing counter (a value of zero indicates that no BPF ++scheduler has been loaded): ++ ++.. code-block:: none ++ ++ # cat /sys/kernel/sched_ext/enable_seq ++ 1 ++ +``tools/sched_ext/scx_show_state.py`` is a drgn script which shows more +detailed information: + @@ -114,6 +304,7 @@ index 000000000000..a707d2181a77 + enable_state : enabled (2) + bypass_depth : 0 + nr_rejected : 0 ++ enable_seq : 1 + +If ``CONFIG_SCHED_DEBUG`` is set, whether a given task is on sched_ext can +be determined as follows: @@ -333,10 +524,10 @@ index 000000000000..a707d2181a77 +possible, they are subject to change without warning between kernel +versions. 
diff --git a/MAINTAINERS b/MAINTAINERS -index 958e935449e5..17d2679d291a 100644 +index c2a7363e86fe..bcfe36daf67a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS -@@ -19917,6 +19917,19 @@ F: include/linux/wait.h +@@ -20364,6 +20364,19 @@ F: include/linux/wait.h F: include/uapi/linux/sched.h F: kernel/sched/ @@ -353,11 +544,11 @@ index 958e935449e5..17d2679d291a 100644 +F: tools/sched_ext/ +F: tools/testing/selftests/sched_ext + - SCSI LIBSAS SUBSYSTEM - R: John Garry - R: Jason Yan + SCIOSENSE ENS160 MULTI-GAS SENSOR DRIVER + M: Gustavo Silva + S: Maintained diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c -index e5974b8239c9..167e877b8bef 100644 +index 14f8f00fdcf9..930b04e3d148 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -531,6 +531,7 @@ static const struct sysrq_key_op *sysrq_key_table[62] = { @@ -369,7 +560,7 @@ index e5974b8239c9..167e877b8bef 100644 NULL, /* T */ NULL, /* U */ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h -index 70bf1004076b..a8417d31e348 100644 +index 1ae44793132a..19ec49a9179b 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -133,6 +133,7 @@ @@ -381,10 +572,10 @@ index 70bf1004076b..a8417d31e348 100644 __sched_class_lowest = .; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h -index 2150ca60394b..3cdaec701600 100644 +index c60ba0ab1462..7139b33cb104 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h -@@ -29,8 +29,6 @@ +@@ -28,8 +28,6 @@ struct kernel_clone_args; @@ -393,7 +584,7 @@ index 2150ca60394b..3cdaec701600 100644 /* * All weight knobs on the default hierarchy should use the following min, * default and max values. 
The default value is the logarithmic center of -@@ -40,6 +38,8 @@ struct kernel_clone_args; +@@ -39,6 +37,8 @@ struct kernel_clone_args; #define CGROUP_WEIGHT_DFL 100 #define CGROUP_WEIGHT_MAX 10000 @@ -403,10 +594,10 @@ index 2150ca60394b..3cdaec701600 100644 CSS_TASK_ITER_PROCS = (1U << 0), /* walk only threadgroup leaders */ CSS_TASK_ITER_THREADED = (1U << 1), /* walk all threaded css_sets in the domain */ diff --git a/include/linux/sched.h b/include/linux/sched.h -index 76214d7c819d..0f3a107bcd02 100644 +index f8d150343d42..5b4f78fe379d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h -@@ -80,6 +80,8 @@ struct task_group; +@@ -82,6 +82,8 @@ struct task_group; struct task_struct; struct user_event_mm; @@ -415,7 +606,7 @@ index 76214d7c819d..0f3a107bcd02 100644 /* * Task state bitmask. NOTE! These bits are also * encoded in fs/proc/array.c: get_task_state(). -@@ -802,6 +804,9 @@ struct task_struct { +@@ -810,6 +812,9 @@ struct task_struct { struct sched_rt_entity rt; struct sched_dl_entity dl; struct sched_dl_entity *dl_server; @@ -427,10 +618,10 @@ index 76214d7c819d..0f3a107bcd02 100644 #ifdef CONFIG_SCHED_CORE diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h new file mode 100644 -index 000000000000..26e1c33bc844 +index 000000000000..76166d3b14fc --- /dev/null +++ b/include/linux/sched/ext.h -@@ -0,0 +1,204 @@ +@@ -0,0 +1,216 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst @@ -553,9 +744,17 @@ index 000000000000..26e1c33bc844 + __SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, +}; + ++enum scx_dsq_lnode_flags { ++ SCX_DSQ_LNODE_ITER_CURSOR = 1 << 0, ++ ++ /* high 16 bits can be for iter cursor flags */ ++ __SCX_DSQ_LNODE_PRIV_SHIFT = 16, ++}; ++ +struct scx_dsq_list_node { + struct list_head node; -+ bool is_bpf_iter_cursor; ++ u32 flags; ++ u32 priv; /* can be used by iter cursor */ +}; + +/* @@ -612,15 +811,19 @@ index 
000000000000..26e1c33bc844 + * If set, reject future sched_setscheduler(2) calls updating the policy + * to %SCHED_EXT with -%EACCES. + * -+ * If set from ops.init_task() and the task's policy is already -+ * %SCHED_EXT, which can happen while the BPF scheduler is being loaded -+ * or by inhering the parent's policy during fork, the task's policy is -+ * rejected and forcefully reverted to %SCHED_NORMAL. The number of -+ * such events are reported through /sys/kernel/debug/sched_ext::nr_rejected. ++ * Can be set from ops.init_task() while the BPF scheduler is being ++ * loaded (!scx_init_task_args->fork). If set and the task's policy is ++ * already %SCHED_EXT, the task's policy is rejected and forcefully ++ * reverted to %SCHED_NORMAL. The number of such events are reported ++ * through /sys/kernel/debug/sched_ext::nr_rejected. Setting this flag ++ * during fork is not allowed. + */ + bool disallow; /* reject switching into SCX */ + + /* cold fields */ ++#ifdef CONFIG_EXT_GROUP_SCHED ++ struct cgroup *cgrp_moving_from; ++#endif + /* must be the last field, see init_scx_entity() */ + struct list_head tasks_node; +}; @@ -636,7 +839,7 @@ index 000000000000..26e1c33bc844 +#endif /* CONFIG_SCHED_CLASS_EXT */ +#endif /* _LINUX_SCHED_EXT_H */ diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h -index d362aacf9f89..4df2f9055587 100644 +index d362aacf9f89..0f2aeb37bbb0 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -63,7 +63,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); @@ -649,6 +852,18 @@ index d362aacf9f89..4df2f9055587 100644 extern void sched_post_fork(struct task_struct *p); extern void sched_dead(struct task_struct *p); +@@ -119,6 +120,11 @@ static inline struct task_struct *get_task_struct(struct task_struct *t) + return t; + } + ++static inline struct task_struct *tryget_task_struct(struct task_struct *t) ++{ ++ return refcount_inc_not_zero(&t->usage) ? 
t : NULL; ++} ++ + extern void __put_task_struct(struct task_struct *t); + extern void __put_task_struct_rcu_cb(struct rcu_head *rhp); + diff --git a/include/trace/events/sched_ext.h b/include/trace/events/sched_ext.h new file mode 100644 index 000000000000..fe19da7315a9 @@ -699,6 +914,37 @@ index 3bac0a8ceab2..359a14cc76a4 100644 /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ #define SCHED_RESET_ON_FORK 0x40000000 +diff --git a/init/Kconfig b/init/Kconfig +index 08a0d51afaae..e1a88d48d652 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1028,9 +1028,13 @@ menuconfig CGROUP_SCHED + tasks. + + if CGROUP_SCHED ++config GROUP_SCHED_WEIGHT ++ def_bool n ++ + config FAIR_GROUP_SCHED + bool "Group scheduling for SCHED_OTHER" + depends on CGROUP_SCHED ++ select GROUP_SCHED_WEIGHT + default CGROUP_SCHED + + config CFS_BANDWIDTH +@@ -1055,6 +1059,12 @@ config RT_GROUP_SCHED + realtime bandwidth for them. + See Documentation/scheduler/sched-rt-group.rst for more information. + ++config EXT_GROUP_SCHED ++ bool ++ depends on SCHED_CLASS_EXT && CGROUP_SCHED ++ select GROUP_SCHED_WEIGHT ++ default y ++ + endif #CGROUP_SCHED + + config SCHED_MM_CID diff --git a/init/init_task.c b/init/init_task.c index eeb110c65fe2..e222722e790b 100644 --- a/init/init_task.c @@ -730,10 +976,10 @@ index eeb110c65fe2..e222722e790b 100644 .ptraced = LIST_HEAD_INIT(init_task.ptraced), .ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry), diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt -index c2f1fd95a821..f3d140c3acc1 100644 +index c2f1fd95a821..fe782cd77388 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt -@@ -133,4 +133,28 @@ config SCHED_CORE +@@ -133,4 +133,29 @@ config SCHED_CORE which is the likely usage by Linux distributions, there should be no measurable impact on performance. 
@@ -741,6 +987,7 @@ index c2f1fd95a821..f3d140c3acc1 100644 +config SCHED_CLASS_EXT + bool "Extensible Scheduling Class" + depends on BPF_SYSCALL && BPF_JIT && DEBUG_INFO_BTF ++ select STACKTRACE if STACKTRACE_SUPPORT + help + This option enables a new scheduler class sched_ext (SCX), which + allows scheduling policies to be implemented as BPF programs to @@ -764,7 +1011,7 @@ index c2f1fd95a821..f3d140c3acc1 100644 + Documentation/scheduler/sched-ext.rst + https://github.com/sched-ext/scx diff --git a/kernel/fork.c b/kernel/fork.c -index 99076dbe27d8..741d962db0d9 100644 +index 238695afc630..69a0a7210060 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -23,6 +23,7 @@ @@ -775,7 +1022,7 @@ index 99076dbe27d8..741d962db0d9 100644 #include #include #include -@@ -971,6 +972,7 @@ void __put_task_struct(struct task_struct *tsk) +@@ -973,6 +974,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(refcount_read(&tsk->usage)); WARN_ON(tsk == current); @@ -783,7 +1030,7 @@ index 99076dbe27d8..741d962db0d9 100644 io_uring_free(tsk); cgroup_free(tsk); task_numa_free(tsk, true); -@@ -2363,7 +2365,7 @@ __latent_entropy struct task_struct *copy_process( +@@ -2355,7 +2357,7 @@ __latent_entropy struct task_struct *copy_process( retval = perf_event_init_task(p, clone_flags); if (retval) @@ -792,7 +1039,7 @@ index 99076dbe27d8..741d962db0d9 100644 retval = audit_alloc(p); if (retval) goto bad_fork_cleanup_perf; -@@ -2496,7 +2498,9 @@ __latent_entropy struct task_struct *copy_process( +@@ -2488,7 +2490,9 @@ __latent_entropy struct task_struct *copy_process( * cgroup specific, it unconditionally needs to place the task on a * runqueue. 
*/ @@ -803,7 +1050,7 @@ index 99076dbe27d8..741d962db0d9 100644 /* * From this point on we must avoid any synchronous user-space -@@ -2542,13 +2546,13 @@ __latent_entropy struct task_struct *copy_process( +@@ -2534,13 +2538,13 @@ __latent_entropy struct task_struct *copy_process( /* Don't start children in a dying pid namespace */ if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) { retval = -ENOMEM; @@ -819,7 +1066,7 @@ index 99076dbe27d8..741d962db0d9 100644 } /* No more failure paths after this point. */ -@@ -2622,10 +2626,11 @@ __latent_entropy struct task_struct *copy_process( +@@ -2614,10 +2618,11 @@ __latent_entropy struct task_struct *copy_process( return p; @@ -832,7 +1079,7 @@ index 99076dbe27d8..741d962db0d9 100644 cgroup_cancel_fork(p, args); bad_fork_put_pidfd: if (clone_flags & CLONE_PIDFD) { -@@ -2664,6 +2669,8 @@ __latent_entropy struct task_struct *copy_process( +@@ -2656,6 +2661,8 @@ __latent_entropy struct task_struct *copy_process( audit_free(p); bad_fork_cleanup_perf: perf_event_free_task(p); @@ -842,7 +1089,7 @@ index 99076dbe27d8..741d962db0d9 100644 lockdep_free_task(p); #ifdef CONFIG_NUMA diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c -index d9dc9ab3773f..e7d539bb721e 100644 +index 39c315182b35..fae1f5c921eb 100644 --- a/kernel/sched/build_policy.c +++ b/kernel/sched/build_policy.c @@ -16,18 +16,25 @@ @@ -871,18 +1118,20 @@ index d9dc9ab3773f..e7d539bb721e 100644 #include -@@ -52,3 +59,6 @@ +@@ -52,4 +59,8 @@ #include "cputime.c" #include "deadline.c" +#ifdef CONFIG_SCHED_CLASS_EXT +# include "ext.c" +#endif ++ + #include "syscalls.c" diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index ebf21373f663..fb6276f74ee6 100644 +index f3951e4a55e5..c792a6feb7a9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c -@@ -168,7 +168,10 @@ static inline int __task_prio(const struct task_struct *p) +@@ -169,7 +169,10 @@ static inline int __task_prio(const struct task_struct *p) if (p->sched_class == 
&idle_sched_class) return MAX_RT_PRIO + NICE_WIDTH; /* 140 */ @@ -894,7 +1143,7 @@ index ebf21373f663..fb6276f74ee6 100644 } /* -@@ -197,6 +200,11 @@ static inline bool prio_less(const struct task_struct *a, +@@ -198,6 +201,11 @@ static inline bool prio_less(const struct task_struct *a, if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */ return cfs_prio_less(a, b, in_fi); @@ -906,7 +1155,7 @@ index ebf21373f663..fb6276f74ee6 100644 return false; } -@@ -1254,11 +1262,14 @@ bool sched_can_stop_tick(struct rq *rq) +@@ -1255,11 +1263,14 @@ bool sched_can_stop_tick(struct rq *rq) return true; /* @@ -918,14 +1167,14 @@ index ebf21373f663..fb6276f74ee6 100644 + * involuntary preemption. For SCX, ask. */ - if (rq->nr_running > 1) -+ if (!scx_switched_all() && rq->nr_running > 1) ++ if (scx_enabled() && !scx_can_stop_tick(rq)) + return false; + -+ if (scx_enabled() && !scx_can_stop_tick(rq)) ++ if (rq->cfs.nr_running > 1) return false; /* -@@ -1340,8 +1351,8 @@ static void set_load_weight(struct task_struct *p, bool update_load) +@@ -1341,8 +1352,8 @@ void set_load_weight(struct task_struct *p, bool update_load) * SCHED_OTHER tasks have to update their load when changing their * weight */ @@ -936,7 +1185,7 @@ index ebf21373f663..fb6276f74ee6 100644 else p->se.load = lw; } -@@ -2210,6 +2221,17 @@ inline int task_curr(const struct task_struct *p) +@@ -2031,6 +2042,17 @@ inline int task_curr(const struct task_struct *p) return cpu_curr(task_cpu(p)) == p; } @@ -954,20 +1203,25 @@ index ebf21373f663..fb6276f74ee6 100644 /* * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock, * use the balance_callback list if you want balancing. -@@ -2217,9 +2239,9 @@ inline int task_curr(const struct task_struct *p) - * this means any call to check_class_changed() must be followed by a call to - * balance_callback(). 
- */ --static inline void check_class_changed(struct rq *rq, struct task_struct *p, -- const struct sched_class *prev_class, -- int oldprio) -+void check_class_changed(struct rq *rq, struct task_struct *p, -+ const struct sched_class *prev_class, -+ int oldprio) +@@ -2289,7 +2311,7 @@ static inline bool rq_has_pinned_tasks(struct rq *rq) + static inline bool is_cpu_allowed(struct task_struct *p, int cpu) { - if (prev_class != p->sched_class) { - if (prev_class->switched_from) -@@ -3982,6 +4004,15 @@ bool cpus_share_resources(int this_cpu, int that_cpu) + /* When not in the task's cpumask, no point in looking further. */ +- if (!cpumask_test_cpu(cpu, p->cpus_ptr)) ++ if (!task_allowed_on_cpu(p, cpu)) + return false; + + /* migrate_disabled() must be allowed to finish. */ +@@ -2298,7 +2320,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) + + /* Non kernel threads are not allowed during either online or offline. */ + if (!(p->flags & PF_KTHREAD)) +- return cpu_active(cpu) && task_cpu_possible(cpu, p); ++ return cpu_active(cpu); + + /* KTHREAD_IS_PER_CPU is always allowed. */ + if (kthread_is_per_cpu(p)) +@@ -3775,6 +3797,15 @@ bool cpus_share_resources(int this_cpu, int that_cpu) static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) { @@ -983,7 +1237,7 @@ index ebf21373f663..fb6276f74ee6 100644 /* * Do not complicate things with the async wake_list while the CPU is * in hotplug state. 
-@@ -4549,6 +4580,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) +@@ -4342,6 +4373,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->rt.on_rq = 0; p->rt.on_list = 0; @@ -994,7 +1248,7 @@ index ebf21373f663..fb6276f74ee6 100644 #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); #endif -@@ -4789,10 +4824,18 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) +@@ -4582,10 +4617,18 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) if (dl_prio(p->prio)) return -EAGAIN; @@ -1015,7 +1269,7 @@ index ebf21373f663..fb6276f74ee6 100644 init_entity_runnable_average(&p->se); -@@ -4812,7 +4855,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) +@@ -4605,7 +4648,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) return 0; } @@ -1024,7 +1278,7 @@ index ebf21373f663..fb6276f74ee6 100644 { unsigned long flags; -@@ -4974,6 +4974,13 @@ +@@ -4632,11 +4675,19 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) if (p->sched_class->task_fork) p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); @@ -1038,15 +1292,13 @@ index ebf21373f663..fb6276f74ee6 100644 } void sched_post_fork(struct task_struct *p) -@@ -4982,6 +4989,7 @@ - sched_post_fork_bore(p); - #endif // CONFIG_SCHED_BORE + { uclamp_post_fork(p); + scx_post_fork(p); } unsigned long to_ratio(u64 period, u64 runtime) -@@ -5685,6 +5736,7 @@ void sched_tick(void) +@@ -5469,6 +5520,7 @@ void sched_tick(void) calc_global_load_tick(rq); sched_core_tick(rq); task_tick_mm_cid(rq, curr); @@ -1054,7 +1306,7 @@ index ebf21373f663..fb6276f74ee6 100644 rq_unlock(rq, &rf); -@@ -5697,8 +5749,10 @@ void sched_tick(void) +@@ -5481,8 +5533,10 @@ void sched_tick(void) wq_worker_tick(curr); #ifdef CONFIG_SMP @@ -1067,10 +1319,11 @@ index ebf21373f663..fb6276f74ee6 100644 #endif } -@@ -5989,7 +6043,19 @@ static void 
put_prev_task_balance(struct rq *rq, struct task_struct *prev, +@@ -5772,8 +5826,19 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) + static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { - #ifdef CONFIG_SMP +-#ifdef CONFIG_SMP + const struct sched_class *start_class = prev->sched_class; const struct sched_class *class; + @@ -1080,23 +1333,28 @@ index ebf21373f663..fb6276f74ee6 100644 + * when waking up from SCHED_IDLE. If @start_class is below SCX, start + * from SCX instead. + */ -+ if (sched_class_above(&ext_sched_class, start_class)) ++ if (scx_enabled() && sched_class_above(&ext_sched_class, start_class)) + start_class = &ext_sched_class; +#endif + /* * We must do the balancing pass before put_prev_task(), such * that when we release the rq->lock the task is in the same -@@ -5998,7 +6064,7 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, +@@ -5782,11 +5847,10 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, * We can terminate the balance pass as soon as we know there is * a runnable task of @class priority or higher. 
*/ - for_class_range(class, prev->sched_class, &idle_sched_class) { +- if (class->balance(rq, prev, rf)) + for_active_class_range(class, start_class, &idle_sched_class) { - if (class->balance(rq, prev, rf)) ++ if (class->balance && class->balance(rq, prev, rf)) break; } -@@ -6016,6 +6082,9 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +-#endif + + put_prev_task(rq, prev); + } +@@ -5800,6 +5864,9 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) const struct sched_class *class; struct task_struct *p; @@ -1106,7 +1364,7 @@ index ebf21373f663..fb6276f74ee6 100644 /* * Optimization: we know that if all tasks are in the fair class we can * call that function directly, but only if the @prev task wasn't of a -@@ -6056,10 +6125,15 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +@@ -5840,10 +5907,15 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (prev->dl_server) prev->dl_server = NULL; @@ -1124,7 +1382,7 @@ index ebf21373f663..fb6276f74ee6 100644 } BUG(); /* The idle class should always have a runnable task. 
*/ -@@ -6089,7 +6163,7 @@ static inline struct task_struct *pick_task(struct rq *rq) +@@ -5873,7 +5945,7 @@ static inline struct task_struct *pick_task(struct rq *rq) const struct sched_class *class; struct task_struct *p; @@ -1133,14 +1391,7 @@ index ebf21373f663..fb6276f74ee6 100644 p = class->pick_task(rq); if (p) return p; -@@ -7080,12 +7154,16 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag - } - EXPORT_SYMBOL(default_wake_function); - --static void __setscheduler_prio(struct task_struct *p, int prio) -+void __setscheduler_prio(struct task_struct *p, int prio) - { - if (dl_prio(prio)) +@@ -6870,6 +6942,10 @@ void __setscheduler_prio(struct task_struct *p, int prio) p->sched_class = &dl_sched_class; else if (rt_prio(prio)) p->sched_class = &rt_sched_class; @@ -1151,7 +1402,7 @@ index ebf21373f663..fb6276f74ee6 100644 else p->sched_class = &fair_sched_class; -@@ -7246,6 +7324,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) +@@ -7015,6 +7091,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) } __setscheduler_prio(p, prio); @@ -1159,68 +1410,7 @@ index ebf21373f663..fb6276f74ee6 100644 if (queued) enqueue_task(rq, p, queue_flag); -@@ -7467,6 +7546,25 @@ int sched_core_idle_cpu(int cpu) - #endif - - #ifdef CONFIG_SMP -+/* -+ * Load avg and utiliztion metrics need to be updated periodically and before -+ * consumption. This function updates the metrics for all subsystems except for -+ * the fair class. @rq must be locked and have its clock updated. 
-+ */ -+bool update_other_load_avgs(struct rq *rq) -+{ -+ u64 now = rq_clock_pelt(rq); -+ const struct sched_class *curr_class = rq->curr->sched_class; -+ unsigned long hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); -+ -+ lockdep_assert_rq_held(rq); -+ -+ return update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | -+ update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | -+ update_hw_load_avg(now, rq, hw_pressure) | -+ update_irq_load_avg(rq, 0); -+} -+ - /* - * This function computes an effective utilization for the given CPU, to be - * used for frequency selection given the linear relation: f = u * f_max. -@@ -7789,6 +7887,10 @@ static int __sched_setscheduler(struct task_struct *p, - goto unlock; - } - -+ retval = scx_check_setscheduler(p, policy); -+ if (retval) -+ goto unlock; -+ - /* - * If not changing anything there's no need to proceed further, - * but store a possible modification of reset_on_fork. -@@ -7891,6 +7993,7 @@ static int __sched_setscheduler(struct task_struct *p, - __setscheduler_prio(p, newprio); - } - __setscheduler_uclamp(p, attr); -+ check_class_changing(rq, p, prev_class); - - if (queued) { - /* -@@ -9066,6 +9169,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy) - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_IDLE: -+ case SCHED_EXT: - ret = 0; - break; - } -@@ -9093,6 +9197,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_IDLE: -+ case SCHED_EXT: - ret = 0; - } - return ret; -@@ -9188,6 +9293,7 @@ void sched_show_task(struct task_struct *p) +@@ -7429,6 +7506,7 @@ void sched_show_task(struct task_struct *p) print_worker_info(KERN_INFO, p); print_stop_info(KERN_INFO, p); @@ -1228,7 +1418,7 @@ index ebf21373f663..fb6276f74ee6 100644 show_stack(p, NULL, KERN_INFO); put_task_stack(p); } -@@ -9680,6 +9786,8 @@ int sched_cpu_activate(unsigned int cpu) +@@ -7957,6 +8035,8 @@ int sched_cpu_activate(unsigned int cpu) cpuset_cpu_active(); } @@ 
-1237,7 +1427,7 @@ index ebf21373f663..fb6276f74ee6 100644 /* * Put the rq online, if not already. This happens: * -@@ -9903,6 +9903,8 @@ +@@ -8006,6 +8086,8 @@ int sched_cpu_deactivate(unsigned int cpu) sched_set_rq_offline(rq, cpu); @@ -1246,7 +1436,7 @@ index ebf21373f663..fb6276f74ee6 100644 /* * When going down, decrement the number of cores with SMT present. */ -@@ -10061,11 +10061,15 @@ +@@ -8192,11 +8192,15 @@ int i; /* Make sure the linker didn't screw up */ @@ -1266,7 +1456,17 @@ index ebf21373f663..fb6276f74ee6 100644 #endif #ifdef CONFIG_SCHED_BORE -@@ -10096,6 +10210,7 @@ void __init sched_init(void) +@@ -8218,6 +8304,9 @@ void __init sched_init(void) + root_task_group.shares = ROOT_TASK_GROUP_LOAD; + init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL); + #endif /* CONFIG_FAIR_GROUP_SCHED */ ++#ifdef CONFIG_EXT_GROUP_SCHED ++ root_task_group.scx_weight = CGROUP_WEIGHT_DFL; ++#endif /* CONFIG_EXT_GROUP_SCHED */ + #ifdef CONFIG_RT_GROUP_SCHED + root_task_group.rt_se = (struct sched_rt_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); +@@ -8363,6 +8452,7 @@ void __init sched_init(void) balance_push_set(smp_processor_id(), false); #endif init_sched_fair_class(); @@ -1274,7 +1474,23 @@ index ebf21373f663..fb6276f74ee6 100644 psi_init(); -@@ -10522,11 +10637,6 @@ void sched_move_task(struct task_struct *tsk) +@@ -8648,6 +8738,7 @@ struct task_group *sched_create_group(struct task_group *parent) + if (!alloc_rt_sched_group(tg, parent)) + goto err; + ++ scx_group_set_weight(tg, CGROUP_WEIGHT_DFL); + alloc_uclamp_sched_group(tg, parent); + + return tg; +@@ -8775,6 +8866,7 @@ void sched_move_task(struct task_struct *tsk) + put_prev_task(rq, tsk); + + sched_change_group(tsk, group); ++ scx_move_task(tsk); + + if (queued) + enqueue_task(rq, tsk, queue_flags); +@@ -8789,11 +8881,6 @@ void sched_move_task(struct task_struct *tsk) } } @@ -1286,15 +1502,153 @@ index ebf21373f663..fb6276f74ee6 100644 static struct cgroup_subsys_state * 
cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { -@@ -11293,29 +11403,27 @@ static int cpu_local_stat_show(struct seq_file *sf, +@@ -8817,6 +8904,11 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) + { + struct task_group *tg = css_tg(css); + struct task_group *parent = css_tg(css->parent); ++ int ret; ++ ++ ret = scx_tg_online(tg); ++ if (ret) ++ return ret; + + if (parent) + sched_online_group(tg, parent); +@@ -8831,6 +8923,13 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) + return 0; } - #ifdef CONFIG_FAIR_GROUP_SCHED ++static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ scx_tg_offline(tg); ++} ++ + static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) + { + struct task_group *tg = css_tg(css); +@@ -8848,9 +8947,9 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) + sched_unregister_group(tg); + } + +-#ifdef CONFIG_RT_GROUP_SCHED + static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) + { ++#ifdef CONFIG_RT_GROUP_SCHED + struct task_struct *task; + struct cgroup_subsys_state *css; + +@@ -8858,9 +8957,9 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) + if (!sched_rt_can_attach(css_tg(css), task)) + return -EINVAL; + } +- return 0; +-} + #endif ++ return scx_cgroup_can_attach(tset); ++} + + static void cpu_cgroup_attach(struct cgroup_taskset *tset) + { +@@ -8869,6 +8968,13 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) + + cgroup_taskset_for_each(task, css, tset) + sched_move_task(task); + ++ scx_cgroup_finish_attach(); ++} ++ ++static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset) ++{ ++ scx_cgroup_cancel_attach(tset); + } + + #ifdef CONFIG_UCLAMP_TASK_GROUP +@@ -9045,22 +9151,36 @@ static int cpu_uclamp_max_show(struct seq_file *sf, void *v) + } + #endif /* CONFIG_UCLAMP_TASK_GROUP */ + ++#ifdef CONFIG_GROUP_SCHED_WEIGHT +static unsigned long tg_weight(struct 
task_group *tg) +{ + #ifdef CONFIG_FAIR_GROUP_SCHED + return scale_load_down(tg->shares); ++#else ++ return sched_weight_from_cgroup(tg->scx_weight); ++#endif +} ++ + static int cpu_shares_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 shareval) + { ++ int ret; ++ + if (shareval > scale_load_down(ULONG_MAX)) + shareval = MAX_SHARES; +- return sched_group_set_shares(css_tg(css), scale_load(shareval)); ++ ret = sched_group_set_shares(css_tg(css), scale_load(shareval)); ++ if (!ret) ++ scx_group_set_weight(css_tg(css), ++ sched_weight_to_cgroup(shareval)); ++ return ret; + } + + static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) + { +- struct task_group *tg = css_tg(css); +- +- return (u64) scale_load_down(tg->shares); ++ return tg_weight(css_tg(css)); + } ++#endif /* CONFIG_GROUP_SCHED_WEIGHT */ + + #ifdef CONFIG_CFS_BANDWIDTH + static DEFINE_MUTEX(cfs_constraints_mutex); +@@ -9406,7 +9526,6 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v) + return 0; + } + #endif /* CONFIG_CFS_BANDWIDTH */ +-#endif /* CONFIG_FAIR_GROUP_SCHED */ + + #ifdef CONFIG_RT_GROUP_SCHED + static int cpu_rt_runtime_write(struct cgroup_subsys_state *css, +@@ -9434,7 +9553,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, + } + #endif /* CONFIG_RT_GROUP_SCHED */ + +-#ifdef CONFIG_FAIR_GROUP_SCHED ++#ifdef CONFIG_GROUP_SCHED_WEIGHT + static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft) + { +@@ -9444,12 +9563,17 @@ static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css, + static int cpu_idle_write_s64(struct cgroup_subsys_state *css, + struct cftype *cft, s64 idle) + { +- return sched_group_set_idle(css_tg(css), idle); ++ int ret; ++ ++ ret = sched_group_set_idle(css_tg(css), idle); ++ if (!ret) ++ scx_group_set_idle(css_tg(css), idle); ++ return ret; + } + #endif + + static struct cftype cpu_legacy_files[] = { +-#ifdef CONFIG_FAIR_GROUP_SCHED ++#ifdef 
CONFIG_GROUP_SCHED_WEIGHT + { + .name = "shares", + .read_u64 = cpu_shares_read_u64, +@@ -9559,38 +9683,35 @@ static int cpu_local_stat_show(struct seq_file *sf, + return 0; + } + +-#ifdef CONFIG_FAIR_GROUP_SCHED ++#ifdef CONFIG_GROUP_SCHED_WEIGHT + static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) @@ -1319,6 +1673,7 @@ index ebf21373f663..fb6276f74ee6 100644 - */ - if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX) + unsigned long weight; ++ int ret; + + if (cgrp_weight < CGROUP_WEIGHT_MIN || cgrp_weight > CGROUP_WEIGHT_MAX) return -ERANGE; @@ -1326,9 +1681,13 @@ index ebf21373f663..fb6276f74ee6 100644 - weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL); + weight = sched_weight_from_cgroup(cgrp_weight); - return sched_group_set_shares(css_tg(css), scale_load(weight)); +- return sched_group_set_shares(css_tg(css), scale_load(weight)); ++ ret = sched_group_set_shares(css_tg(css), scale_load(weight)); ++ if (!ret) ++ scx_group_set_weight(css_tg(css), cgrp_weight); ++ return ret; } -@@ -11323,7 +11431,7 @@ static int cpu_weight_write_u64(struct cgroup_subsys_state *css, + static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -1337,7 +1696,58 @@ index ebf21373f663..fb6276f74ee6 100644 int last_delta = INT_MAX; int prio, delta; -@@ -12064,3 +12172,38 @@ void sched_mm_cid_fork(struct task_struct *t) +@@ -9609,7 +9730,7 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, + struct cftype *cft, s64 nice) + { + unsigned long weight; +- int idx; ++ int idx, ret; + + if (nice < MIN_NICE || nice > MAX_NICE) + return -ERANGE; +@@ -9618,9 +9739,13 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, + idx = array_index_nospec(idx, 40); + weight = sched_prio_to_weight[idx]; + +- return sched_group_set_shares(css_tg(css), scale_load(weight)); ++ ret = sched_group_set_shares(css_tg(css), scale_load(weight)); ++ if (!ret) ++ 
scx_group_set_weight(css_tg(css), ++ sched_weight_to_cgroup(weight)); ++ return ret; + } +-#endif ++#endif /* CONFIG_GROUP_SCHED_WEIGHT */ + + static void __maybe_unused cpu_period_quota_print(struct seq_file *sf, + long period, long quota) +@@ -9680,7 +9805,7 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of, + #endif + + static struct cftype cpu_files[] = { +-#ifdef CONFIG_FAIR_GROUP_SCHED ++#ifdef CONFIG_GROUP_SCHED_WEIGHT + { + .name = "weight", + .flags = CFTYPE_NOT_ON_ROOT, +@@ -9734,14 +9859,14 @@ static struct cftype cpu_files[] = { + struct cgroup_subsys cpu_cgrp_subsys = { + .css_alloc = cpu_cgroup_css_alloc, + .css_online = cpu_cgroup_css_online, ++ .css_offline = cpu_cgroup_css_offline, + .css_released = cpu_cgroup_css_released, + .css_free = cpu_cgroup_css_free, + .css_extra_stat_show = cpu_extra_stat_show, + .css_local_stat_show = cpu_local_stat_show, +-#ifdef CONFIG_RT_GROUP_SCHED + .can_attach = cpu_cgroup_can_attach, +-#endif + .attach = cpu_cgroup_attach, ++ .cancel_attach = cpu_cgroup_cancel_attach, + .legacy_cftypes = cpu_legacy_files, + .dfl_cftypes = cpu_files, + .early_init = true, +@@ -10331,3 +10456,38 @@ void sched_mm_cid_fork(struct task_struct *t) t->mm_cid_active = 1; } #endif @@ -1481,10 +1891,10 @@ index c1eb9a1afd13..c057ef46c5f8 100644 diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c new file mode 100644 -index 000000000000..0dac88d0e578 +index 000000000000..25fadfaace33 --- /dev/null +++ b/kernel/sched/ext.c -@@ -0,0 +1,6532 @@ +@@ -0,0 +1,7262 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst @@ -1603,10 +2013,16 @@ index 000000000000..0dac88d0e578 + */ + SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, + ++ /* ++ * CPU cgroup support flags ++ */ ++ SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* cpu.weight */ ++ + SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | + SCX_OPS_ENQ_LAST | + SCX_OPS_ENQ_EXITING | -+ SCX_OPS_SWITCH_PARTIAL, ++ SCX_OPS_SWITCH_PARTIAL | ++ 
SCX_OPS_HAS_CGROUP_WEIGHT, +}; + +/* argument container for ops.init_task() */ @@ -1616,6 +2032,10 @@ index 000000000000..0dac88d0e578 + * to the scheduler transition path. + */ + bool fork; ++#ifdef CONFIG_EXT_GROUP_SCHED ++ /* the cgroup the task is joining */ ++ struct cgroup *cgroup; ++#endif +}; + +/* argument container for ops.exit_task() */ @@ -1624,6 +2044,12 @@ index 000000000000..0dac88d0e578 + bool cancelled; +}; + ++/* argument container for ops->cgroup_init() */ ++struct scx_cgroup_init_args { ++ /* the weight of the cgroup [1..10000] */ ++ u32 weight; ++}; ++ +enum scx_cpu_preempt_reason { + /* next task is being scheduled by &sched_class_rt */ + SCX_CPU_PREEMPT_RT, @@ -1988,6 +2414,79 @@ index 000000000000..0dac88d0e578 + */ + void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p); + ++#ifdef CONFIG_EXT_GROUP_SCHED ++ /** ++ * cgroup_init - Initialize a cgroup ++ * @cgrp: cgroup being initialized ++ * @args: init arguments, see the struct definition ++ * ++ * Either the BPF scheduler is being loaded or @cgrp created, initialize ++ * @cgrp for sched_ext. This operation may block. ++ * ++ * Return 0 for success, -errno for failure. An error return while ++ * loading will abort loading of the BPF scheduler. During cgroup ++ * creation, it will abort the specific cgroup creation. ++ */ ++ s32 (*cgroup_init)(struct cgroup *cgrp, ++ struct scx_cgroup_init_args *args); ++ ++ /** ++ * cgroup_exit - Exit a cgroup ++ * @cgrp: cgroup being exited ++ * ++ * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit ++ * @cgrp for sched_ext. This operation my block. ++ */ ++ void (*cgroup_exit)(struct cgroup *cgrp); ++ ++ /** ++ * cgroup_prep_move - Prepare a task to be moved to a different cgroup ++ * @p: task being moved ++ * @from: cgroup @p is being moved from ++ * @to: cgroup @p is being moved to ++ * ++ * Prepare @p for move from cgroup @from to @to. This operation may ++ * block and can be used for allocations. 
++ * ++ * Return 0 for success, -errno for failure. An error return aborts the ++ * migration. ++ */ ++ s32 (*cgroup_prep_move)(struct task_struct *p, ++ struct cgroup *from, struct cgroup *to); ++ ++ /** ++ * cgroup_move - Commit cgroup move ++ * @p: task being moved ++ * @from: cgroup @p is being moved from ++ * @to: cgroup @p is being moved to ++ * ++ * Commit the move. @p is dequeued during this operation. ++ */ ++ void (*cgroup_move)(struct task_struct *p, ++ struct cgroup *from, struct cgroup *to); ++ ++ /** ++ * cgroup_cancel_move - Cancel cgroup move ++ * @p: task whose cgroup move is being canceled ++ * @from: cgroup @p was being moved from ++ * @to: cgroup @p was being moved to ++ * ++ * @p was cgroup_prep_move()'d but failed before reaching cgroup_move(). ++ * Undo the preparation. ++ */ ++ void (*cgroup_cancel_move)(struct task_struct *p, ++ struct cgroup *from, struct cgroup *to); ++ ++ /** ++ * cgroup_set_weight - A cgroup's weight is being changed ++ * @cgrp: cgroup whose weight is being updated ++ * @weight: new weight [1..10000] ++ * ++ * Update @tg's weight to @weight. ++ */ ++ void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); ++#endif /* CONFIG_CGROUPS */ ++ + /* + * All online ops must come before ops.cpu_online(). 
+ */ @@ -2173,8 +2672,12 @@ index 000000000000..0dac88d0e578 + SCX_KICK_WAIT = 1LLU << 2, +}; + ++enum scx_tg_flags { ++ SCX_TG_ONLINE = 1U << 0, ++ SCX_TG_INITED = 1U << 1, ++}; ++ +enum scx_ops_enable_state { -+ SCX_OPS_PREPPING, + SCX_OPS_ENABLING, + SCX_OPS_ENABLED, + SCX_OPS_DISABLING, @@ -2182,7 +2685,6 @@ index 000000000000..0dac88d0e578 +}; + +static const char *scx_ops_enable_state_str[] = { -+ [SCX_OPS_PREPPING] = "prepping", + [SCX_OPS_ENABLING] = "enabling", + [SCX_OPS_ENABLED] = "enabled", + [SCX_OPS_DISABLING] = "disabling", @@ -2250,6 +2752,7 @@ index 000000000000..0dac88d0e578 +DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); +static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); +static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0); ++static bool scx_ops_init_task_enabled; +static bool scx_switching_all; +DEFINE_STATIC_KEY_FALSE(__scx_switched_all); + @@ -2261,7 +2764,7 @@ index 000000000000..0dac88d0e578 +static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); +static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); + -+struct static_key_false scx_has_op[SCX_OPI_END] = ++static struct static_key_false scx_has_op[SCX_OPI_END] = + { [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT }; + +static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE); @@ -2271,6 +2774,13 @@ index 000000000000..0dac88d0e578 +static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0); + +/* ++ * A monotically increasing sequence number that is incremented every time a ++ * scheduler is enabled. This can be used by to check if any custom sched_ext ++ * scheduler has ever been used in the system. ++ */ ++static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0); ++ ++/* + * The maximum amount of time in jiffies that a task may be runnable without + * being scheduled on a CPU. If this timeout is exceeded, it will trigger + * scx_ops_error(). 
@@ -2314,8 +2824,15 @@ index 000000000000..0dac88d0e578 + */ +static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task); + -+/* dispatch queues */ -+static struct scx_dispatch_q __cacheline_aligned_in_smp scx_dsq_global; ++/* ++ * Dispatch queues. ++ * ++ * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. This is ++ * to avoid live-locking in bypass mode where all tasks are dispatched to ++ * %SCX_DSQ_GLOBAL and all CPUs consume from it. If per-node split isn't ++ * sufficient, it can be further split. ++ */ ++static struct scx_dispatch_q **global_dsqs; + +static const struct rhashtable_params dsq_hash_params = { + .key_len = 8, @@ -2364,7 +2881,7 @@ index 000000000000..0dac88d0e578 + struct scx_bstr_buf buf; +}; + -+struct scx_dump_data scx_dump_data = { ++static struct scx_dump_data scx_dump_data = { + .cpu = -1, +}; + @@ -2418,6 +2935,16 @@ index 000000000000..0dac88d0e578 + return (s32)(a - b) < 0; +} + ++static struct scx_dispatch_q *find_global_dsq(struct task_struct *p) ++{ ++ return global_dsqs[cpu_to_node(task_cpu(p))]; ++} ++ ++static struct scx_dispatch_q *find_user_dsq(u64 dsq_id) ++{ ++ return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); ++} ++ +/* + * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX + * ops. 
When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate @@ -2554,6 +3081,11 @@ index 000000000000..0dac88d0e578 + return true; +} + ++static bool scx_kf_allowed_if_unlocked(void) ++{ ++ return !current->scx.kf_mask; ++} ++ +/** + * nldsq_next_task - Iterate to the next task in a non-local DSQ + * @dsq: user dsq being interated @@ -2587,7 +3119,7 @@ index 000000000000..0dac88d0e578 + + dsq_lnode = container_of(list_node, struct scx_dsq_list_node, + node); -+ } while (dsq_lnode->is_bpf_iter_cursor); ++ } while (dsq_lnode->flags & SCX_DSQ_LNODE_ITER_CURSOR); + + return container_of(dsq_lnode, struct task_struct, scx.dsq_list); +} @@ -2605,16 +3137,22 @@ index 000000000000..0dac88d0e578 + */ +enum scx_dsq_iter_flags { + /* iterate in the reverse dispatch order */ -+ SCX_DSQ_ITER_REV = 1U << 0, ++ SCX_DSQ_ITER_REV = 1U << 16, ++ ++ __SCX_DSQ_ITER_HAS_SLICE = 1U << 30, ++ __SCX_DSQ_ITER_HAS_VTIME = 1U << 31, + -+ __SCX_DSQ_ITER_ALL_FLAGS = SCX_DSQ_ITER_REV, ++ __SCX_DSQ_ITER_USER_FLAGS = SCX_DSQ_ITER_REV, ++ __SCX_DSQ_ITER_ALL_FLAGS = __SCX_DSQ_ITER_USER_FLAGS | ++ __SCX_DSQ_ITER_HAS_SLICE | ++ __SCX_DSQ_ITER_HAS_VTIME, +}; + +struct bpf_iter_scx_dsq_kern { + struct scx_dsq_list_node cursor; + struct scx_dispatch_q *dsq; -+ u32 dsq_seq; -+ u32 flags; ++ u64 slice; ++ u64 vtime; +} __attribute__((aligned(8))); + +struct bpf_iter_scx_dsq { @@ -2652,6 +3190,9 @@ index 000000000000..0dac88d0e578 +{ + lockdep_assert_held(&scx_tasks_lock); + ++ BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & ++ ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); ++ + iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; + list_add(&iter->cursor.tasks_node, &scx_tasks); + iter->locked = NULL; @@ -2730,17 +3271,37 @@ index 000000000000..0dac88d0e578 + * whether they would like to filter out dead tasks. See scx_task_iter_init() + * for details. 
+ */ -+static struct task_struct * -+scx_task_iter_next_locked(struct scx_task_iter *iter, bool include_dead) ++static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) +{ + struct task_struct *p; -+retry: ++ + scx_task_iter_rq_unlock(iter); + + while ((p = scx_task_iter_next(iter))) { + /* -+ * is_idle_task() tests %PF_IDLE which may not be set for CPUs -+ * which haven't yet been onlined. Test sched_class directly. ++ * scx_task_iter is used to prepare and move tasks into SCX ++ * while loading the BPF scheduler and vice-versa while ++ * unloading. The init_tasks ("swappers") should be excluded ++ * from the iteration because: ++ * ++ * - It's unsafe to use __setschduler_prio() on an init_task to ++ * determine the sched_class to use as it won't preserve its ++ * idle_sched_class. ++ * ++ * - ops.init/exit_task() can easily be confused if called with ++ * init_tasks as they, e.g., share PID 0. ++ * ++ * As init_tasks are never scheduled through SCX, they can be ++ * skipped safely. Note that is_idle_task() which tests %PF_IDLE ++ * doesn't work here: ++ * ++ * - %PF_IDLE may not be set for an init_task whose CPU hasn't ++ * yet been onlined. ++ * ++ * - %PF_IDLE can be set on tasks that are not init_tasks. See ++ * play_idle_precise() used by CONFIG_IDLE_INJECT. ++ * ++ * Test for idle_sched_class as only init_tasks are on it. + */ + if (p->sched_class != &idle_sched_class) + break; @@ -2751,16 +3312,6 @@ index 000000000000..0dac88d0e578 + iter->rq = task_rq_lock(p, &iter->rf); + iter->locked = p; + -+ /* -+ * If we see %TASK_DEAD, @p already disabled preemption, is about to do -+ * the final __schedule(), won't ever need to be scheduled again and can -+ * thus be safely ignored. If we don't see %TASK_DEAD, @p can't enter -+ * the final __schedle() while we're locking its rq and thus will stay -+ * alive until the rq is unlocked. 
-+ */ -+ if (!include_dead && READ_ONCE(p->__state) == TASK_DEAD) -+ goto retry; -+ + return p; +} + @@ -2783,9 +3334,9 @@ index 000000000000..0dac88d0e578 + return atomic_try_cmpxchg(&scx_ops_enable_state_var, &from_v, to); +} + -+static bool scx_ops_bypassing(void) ++static bool scx_rq_bypassing(struct rq *rq) +{ -+ return unlikely(atomic_read(&scx_ops_bypass_depth)); ++ return unlikely(rq->scx.flags & SCX_RQ_BYPASSING); +} + +/** @@ -2919,13 +3470,18 @@ index 000000000000..0dac88d0e578 + */ +static void touch_core_sched(struct rq *rq, struct task_struct *p) +{ ++ lockdep_assert_rq_held(rq); ++ +#ifdef CONFIG_SCHED_CORE + /* + * It's okay to update the timestamp spuriously. Use + * sched_core_disabled() which is cheaper than enabled(). ++ * ++ * As this is used to determine ordering between tasks of sibling CPUs, ++ * it may be better to use per-core dispatch sequence instead. + */ + if (!sched_core_disabled()) -+ p->scx.core_sched_at = rq_clock_task(rq); ++ p->scx.core_sched_at = sched_clock_cpu(cpu_of(rq)); +#endif +} + @@ -2942,7 +3498,6 @@ index 000000000000..0dac88d0e578 +static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p) +{ + lockdep_assert_rq_held(rq); -+ assert_clock_updated(rq); + +#ifdef CONFIG_SCHED_CORE + if (SCX_HAS_OP(core_sched_before)) @@ -2953,20 +3508,14 @@ index 000000000000..0dac88d0e578 +static void update_curr_scx(struct rq *rq) +{ + struct task_struct *curr = rq->curr; -+ u64 now = rq_clock_task(rq); -+ u64 delta_exec; ++ s64 delta_exec; + -+ if (time_before_eq64(now, curr->se.exec_start)) ++ delta_exec = update_curr_common(rq); ++ if (unlikely(delta_exec <= 0)) + return; + -+ delta_exec = now - curr->se.exec_start; -+ curr->se.exec_start = now; -+ curr->se.sum_exec_runtime += delta_exec; -+ account_group_exec_runtime(curr, delta_exec); -+ cgroup_account_cputime(curr, delta_exec); -+ + if (curr->scx.slice != SCX_SLICE_INF) { -+ curr->scx.slice -= min(curr->scx.slice, delta_exec); ++ curr->scx.slice -= min_t(u64, 
curr->scx.slice, delta_exec); + if (!curr->scx.slice) + touch_core_sched(rq, curr); + } @@ -3004,7 +3553,7 @@ index 000000000000..0dac88d0e578 + scx_ops_error("attempting to dispatch to a destroyed dsq"); + /* fall back to the global dsq */ + raw_spin_unlock(&dsq->lock); -+ dsq = &scx_dsq_global; ++ dsq = find_global_dsq(p); + raw_spin_lock(&dsq->lock); + } + } @@ -3107,6 +3656,8 @@ index 000000000000..0dac88d0e578 +static void task_unlink_from_dsq(struct task_struct *p, + struct scx_dispatch_q *dsq) +{ ++ WARN_ON_ONCE(list_empty(&p->scx.dsq_list.node)); ++ + if (p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) { + rb_erase(&p->scx.dsq_priq, &dsq->priq); + RB_CLEAR_NODE(&p->scx.dsq_priq); @@ -3114,6 +3665,7 @@ index 000000000000..0dac88d0e578 + } + + list_del_init(&p->scx.dsq_list.node); ++ dsq_mod_nr(dsq, -1); +} + +static void dispatch_dequeue(struct rq *rq, struct task_struct *p) @@ -3150,9 +3702,7 @@ index 000000000000..0dac88d0e578 + */ + if (p->scx.holding_cpu < 0) { + /* @p must still be on @dsq, dequeue */ -+ WARN_ON_ONCE(list_empty(&p->scx.dsq_list.node)); + task_unlink_from_dsq(p, dsq); -+ dsq_mod_nr(dsq, -1); + } else { + /* + * We're racing against dispatch_to_local_dsq() which already @@ -3169,34 +3719,32 @@ index 000000000000..0dac88d0e578 + raw_spin_unlock(&dsq->lock); +} + -+static struct scx_dispatch_q *find_user_dsq(u64 dsq_id) ++static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, ++ struct task_struct *p) +{ -+ return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); -+} ++ struct scx_dispatch_q *dsq; + -+static struct scx_dispatch_q *find_non_local_dsq(u64 dsq_id) -+{ -+ lockdep_assert(rcu_read_lock_any_held()); ++ if (dsq_id == SCX_DSQ_LOCAL) ++ return &rq->scx.local_dsq; + -+ if (dsq_id == SCX_DSQ_GLOBAL) -+ return &scx_dsq_global; -+ else -+ return find_user_dsq(dsq_id); -+} ++ if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { ++ s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; + -+static struct scx_dispatch_q 
*find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, -+ struct task_struct *p) -+{ -+ struct scx_dispatch_q *dsq; ++ if (!ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) ++ return find_global_dsq(p); + -+ if (dsq_id == SCX_DSQ_LOCAL) -+ return &rq->scx.local_dsq; ++ return &cpu_rq(cpu)->scx.local_dsq; ++ } ++ ++ if (dsq_id == SCX_DSQ_GLOBAL) ++ dsq = find_global_dsq(p); ++ else ++ dsq = find_user_dsq(dsq_id); + -+ dsq = find_non_local_dsq(dsq_id); + if (unlikely(!dsq)) { + scx_ops_error("non-existent DSQ 0x%llx for %s[%d]", + dsq_id, p->comm, p->pid); -+ return &scx_dsq_global; ++ return find_global_dsq(p); + } + + return dsq; @@ -3235,8 +3783,8 @@ index 000000000000..0dac88d0e578 +static void direct_dispatch(struct task_struct *p, u64 enq_flags) +{ + struct rq *rq = task_rq(p); -+ struct scx_dispatch_q *dsq; -+ u64 dsq_id = p->scx.ddsp_dsq_id; ++ struct scx_dispatch_q *dsq = ++ find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p); + + touch_core_sched_dispatch(rq, p); + @@ -3248,15 +3796,9 @@ index 000000000000..0dac88d0e578 + * DSQ_LOCAL_ON verdicts targeting the local DSQ of a remote CPU, defer + * the enqueue so that it's executed when @rq can be unlocked. + */ -+ if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { -+ s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; ++ if (dsq->id == SCX_DSQ_LOCAL && dsq != &rq->scx.local_dsq) { + unsigned long opss; + -+ if (cpu == cpu_of(rq)) { -+ dsq_id = SCX_DSQ_LOCAL; -+ goto dispatch; -+ } -+ + opss = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_STATE_MASK; + + switch (opss & SCX_OPSS_STATE_MASK) { @@ -3283,14 +3825,19 @@ index 000000000000..0dac88d0e578 + return; + } + -+dispatch: -+ dsq = find_dsq_for_dispatch(rq, dsq_id, p); + dispatch_enqueue(dsq, p, p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); +} + +static bool scx_rq_online(struct rq *rq) +{ -+ return likely(rq->scx.flags & SCX_RQ_ONLINE); ++ /* ++ * Test both cpu_active() and %SCX_RQ_ONLINE. 
%SCX_RQ_ONLINE indicates ++ * the online state as seen from the BPF scheduler. cpu_active() test ++ * guarantees that, if this function returns %true, %SCX_RQ_ONLINE will ++ * stay set until the current scheduling operation is complete even if ++ * we aren't locking @rq. ++ */ ++ return likely((rq->scx.flags & SCX_RQ_ONLINE) && cpu_active(cpu_of(rq))); +} + +static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, @@ -3313,7 +3860,7 @@ index 000000000000..0dac88d0e578 + if (!scx_rq_online(rq)) + goto local; + -+ if (scx_ops_bypassing()) { ++ if (scx_rq_bypassing(rq)) { + if (enq_flags & SCX_ENQ_LAST) + goto local; + else @@ -3378,7 +3925,7 @@ index 000000000000..0dac88d0e578 +global: + touch_core_sched(rq, p); /* see the comment in local: */ + p->scx.slice = SCX_SLICE_DFL; -+ dispatch_enqueue(&scx_dsq_global, p, enq_flags); ++ dispatch_enqueue(find_global_dsq(p), p, enq_flags); +} + +static bool task_runnable(const struct task_struct *p) @@ -3440,7 +3987,7 @@ index 000000000000..0dac88d0e578 + rq->scx.nr_running++; + add_nr_running(rq, 1); + -+ if (SCX_HAS_OP(runnable)) ++ if (SCX_HAS_OP(runnable) && !task_on_rq_migrating(p)) + SCX_CALL_OP_TASK(SCX_KF_REST, runnable, p, enq_flags); + + if (enq_flags & SCX_ENQ_WAKEUP) @@ -3524,7 +4071,7 @@ index 000000000000..0dac88d0e578 + SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, false); + } + -+ if (SCX_HAS_OP(quiescent)) ++ if (SCX_HAS_OP(quiescent) && !task_on_rq_migrating(p)) + SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, p, deq_flags); + + if (deq_flags & SCX_DEQ_SLEEP) @@ -3559,193 +4106,173 @@ index 000000000000..0dac88d0e578 + return false; +} + ++static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags, ++ struct scx_dispatch_q *src_dsq, ++ struct rq *dst_rq) ++{ ++ struct scx_dispatch_q *dst_dsq = &dst_rq->scx.local_dsq; ++ ++ /* @dsq is locked and @p is on @dst_rq */ ++ lockdep_assert_held(&src_dsq->lock); ++ lockdep_assert_rq_held(dst_rq); ++ ++ WARN_ON_ONCE(p->scx.holding_cpu 
>= 0); ++ ++ if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) ++ list_add(&p->scx.dsq_list.node, &dst_dsq->list); ++ else ++ list_add_tail(&p->scx.dsq_list.node, &dst_dsq->list); ++ ++ dsq_mod_nr(dst_dsq, 1); ++ p->scx.dsq = dst_dsq; ++} ++ +#ifdef CONFIG_SMP +/** -+ * move_task_to_local_dsq - Move a task from a different rq to a local DSQ -+ * @rq: rq to move the task into, currently locked ++ * move_remote_task_to_local_dsq - Move a task from a foreign rq to a local DSQ + * @p: task to move + * @enq_flags: %SCX_ENQ_* ++ * @src_rq: rq to move the task from, locked on entry, released on return ++ * @dst_rq: rq to move the task into, locked on return + * -+ * Move @p which is currently on a different rq to @rq's local DSQ. The caller -+ * must: -+ * -+ * 1. Start with exclusive access to @p either through its DSQ lock or -+ * %SCX_OPSS_DISPATCHING flag. -+ * -+ * 2. Set @p->scx.holding_cpu to raw_smp_processor_id(). -+ * -+ * 3. Remember task_rq(@p). Release the exclusive access so that we don't -+ * deadlock with dequeue. -+ * -+ * 4. Lock @rq and the task_rq from #3. -+ * -+ * 5. Call this function. -+ * -+ * Returns %true if @p was successfully moved. %false after racing dequeue and -+ * losing. ++ * Move @p which is currently on @src_rq to @dst_rq's local DSQ. + */ -+static bool move_task_to_local_dsq(struct rq *rq, struct task_struct *p, -+ u64 enq_flags) ++static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, ++ struct rq *src_rq, struct rq *dst_rq) +{ -+ struct rq *task_rq; ++ lockdep_assert_rq_held(src_rq); + -+ lockdep_assert_rq_held(rq); -+ -+ /* -+ * If dequeue got to @p while we were trying to lock both rq's, it'd -+ * have cleared @p->scx.holding_cpu to -1. While other cpus may have -+ * updated it to different values afterwards, as this operation can't be -+ * preempted or recurse, @p->scx.holding_cpu can never become -+ * raw_smp_processor_id() again before we're done. 
Thus, we can tell -+ * whether we lost to dequeue by testing whether @p->scx.holding_cpu is -+ * still raw_smp_processor_id(). -+ * -+ * See dispatch_dequeue() for the counterpart. -+ */ -+ if (unlikely(p->scx.holding_cpu != raw_smp_processor_id())) -+ return false; ++ /* the following marks @p MIGRATING which excludes dequeue */ ++ deactivate_task(src_rq, p, 0); ++ set_task_cpu(p, cpu_of(dst_rq)); ++ p->scx.sticky_cpu = cpu_of(dst_rq); + -+ /* @p->rq couldn't have changed if we're still the holding cpu */ -+ task_rq = task_rq(p); -+ lockdep_assert_rq_held(task_rq); -+ -+ WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(rq), p->cpus_ptr)); -+ deactivate_task(task_rq, p, 0); -+ set_task_cpu(p, cpu_of(rq)); -+ p->scx.sticky_cpu = cpu_of(rq); ++ raw_spin_rq_unlock(src_rq); ++ raw_spin_rq_lock(dst_rq); + + /* + * We want to pass scx-specific enq_flags but activate_task() will + * truncate the upper 32 bit. As we own @rq, we can pass them through + * @rq->scx.extra_enq_flags instead. + */ -+ WARN_ON_ONCE(rq->scx.extra_enq_flags); -+ rq->scx.extra_enq_flags = enq_flags; -+ activate_task(rq, p, 0); -+ rq->scx.extra_enq_flags = 0; -+ -+ return true; ++ WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)); ++ WARN_ON_ONCE(dst_rq->scx.extra_enq_flags); ++ dst_rq->scx.extra_enq_flags = enq_flags; ++ activate_task(dst_rq, p, 0); ++ dst_rq->scx.extra_enq_flags = 0; +} + -+/** -+ * dispatch_to_local_dsq_lock - Ensure source and destination rq's are locked -+ * @rq: current rq which is locked -+ * @src_rq: rq to move task from -+ * @dst_rq: rq to move task to ++/* ++ * Similar to kernel/sched/core.c::is_cpu_allowed(). However, there are two ++ * differences: + * -+ * We're holding @rq lock and trying to dispatch a task from @src_rq to -+ * @dst_rq's local DSQ and thus need to lock both @src_rq and @dst_rq. Whether -+ * @rq stays locked isn't important as long as the state is restored after -+ * dispatch_to_local_dsq_unlock(). 
-+ */ -+static void dispatch_to_local_dsq_lock(struct rq *rq, struct rq *src_rq, -+ struct rq *dst_rq) -+{ -+ if (src_rq == dst_rq) { -+ raw_spin_rq_unlock(rq); -+ raw_spin_rq_lock(dst_rq); -+ } else if (rq == src_rq) { -+ double_lock_balance(rq, dst_rq); -+ } else if (rq == dst_rq) { -+ double_lock_balance(rq, src_rq); -+ } else { -+ raw_spin_rq_unlock(rq); -+ double_rq_lock(src_rq, dst_rq); -+ } -+} -+ -+/** -+ * dispatch_to_local_dsq_unlock - Undo dispatch_to_local_dsq_lock() -+ * @rq: current rq which is locked -+ * @src_rq: rq to move task from -+ * @dst_rq: rq to move task to ++ * - is_cpu_allowed() asks "Can this task run on this CPU?" while ++ * task_can_run_on_remote_rq() asks "Can the BPF scheduler migrate the task to ++ * this CPU?". + * -+ * Unlock @src_rq and @dst_rq and ensure that @rq is locked on return. -+ */ -+static void dispatch_to_local_dsq_unlock(struct rq *rq, struct rq *src_rq, -+ struct rq *dst_rq) -+{ -+ if (src_rq == dst_rq) { -+ raw_spin_rq_unlock(dst_rq); -+ raw_spin_rq_lock(rq); -+ } else if (rq == src_rq) { -+ double_unlock_balance(rq, dst_rq); -+ } else if (rq == dst_rq) { -+ double_unlock_balance(rq, src_rq); -+ } else { -+ double_rq_unlock(src_rq, dst_rq); -+ raw_spin_rq_lock(rq); -+ } -+} -+#endif /* CONFIG_SMP */ -+ -+static void consume_local_task(struct rq *rq, struct scx_dispatch_q *dsq, -+ struct task_struct *p) -+{ -+ lockdep_assert_held(&dsq->lock); /* released on return */ -+ -+ /* @dsq is locked and @p is on this rq */ -+ WARN_ON_ONCE(p->scx.holding_cpu >= 0); -+ task_unlink_from_dsq(p, dsq); -+ list_add_tail(&p->scx.dsq_list.node, &rq->scx.local_dsq.list); -+ dsq_mod_nr(dsq, -1); -+ dsq_mod_nr(&rq->scx.local_dsq, 1); -+ p->scx.dsq = &rq->scx.local_dsq; -+ raw_spin_unlock(&dsq->lock); -+} -+ -+#ifdef CONFIG_SMP -+/* -+ * Similar to kernel/sched/core.c::is_cpu_allowed() but we're testing whether @p -+ * can be pulled to @rq. 
++ * While migration is disabled, is_cpu_allowed() has to say "yes" as the task ++ * must be allowed to finish on the CPU that it's currently on regardless of ++ * the CPU state. However, task_can_run_on_remote_rq() must say "no" as the ++ * BPF scheduler shouldn't attempt to migrate a task which has migration ++ * disabled. ++ * ++ * - The BPF scheduler is bypassed while the rq is offline and we can always say ++ * no to the BPF scheduler initiated migrations while offline. + */ -+static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq) ++static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, ++ bool trigger_error) +{ + int cpu = cpu_of(rq); + -+ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) ++ /* ++ * We don't require the BPF scheduler to avoid dispatching to offline ++ * CPUs mostly for convenience but also because CPUs can go offline ++ * between scx_bpf_dispatch() calls and here. Trigger error iff the ++ * picked CPU is outside the allowed mask. ++ */ ++ if (!task_allowed_on_cpu(p, cpu)) { ++ if (trigger_error) ++ scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]", ++ cpu_of(rq), p->comm, p->pid); + return false; ++ } ++ + if (unlikely(is_migration_disabled(p))) + return false; -+ if (!(p->flags & PF_KTHREAD) && unlikely(!task_cpu_possible(cpu, p))) -+ return false; ++ + if (!scx_rq_online(rq)) + return false; ++ + return true; +} + -+static bool consume_remote_task(struct rq *rq, struct scx_dispatch_q *dsq, -+ struct task_struct *p, struct rq *task_rq) ++/** ++ * unlink_dsq_and_lock_src_rq() - Unlink task from its DSQ and lock its task_rq ++ * @p: target task ++ * @dsq: locked DSQ @p is currently on ++ * @src_rq: rq @p is currently on, stable with @dsq locked ++ * ++ * Called with @dsq locked but no rq's locked. We want to move @p to a different ++ * DSQ, including any local DSQ, but are not locking @src_rq. Locking @src_rq is ++ * required when transferring into a local DSQ. 
Even when transferring into a ++ * non-local DSQ, it's better to use the same mechanism to protect against ++ * dequeues and maintain the invariant that @p->scx.dsq can only change while ++ * @src_rq is locked, which e.g. scx_dump_task() depends on. ++ * ++ * We want to grab @src_rq but that can deadlock if we try while locking @dsq, ++ * so we want to unlink @p from @dsq, drop its lock and then lock @src_rq. As ++ * this may race with dequeue, which can't drop the rq lock or fail, do a little ++ * dancing from our side. ++ * ++ * @p->scx.holding_cpu is set to this CPU before @dsq is unlocked. If @p gets ++ * dequeued after we unlock @dsq but before locking @src_rq, the holding_cpu ++ * would be cleared to -1. While other cpus may have updated it to different ++ * values afterwards, as this operation can't be preempted or recurse, the ++ * holding_cpu can never become this CPU again before we're done. Thus, we can ++ * tell whether we lost to dequeue by testing whether the holding_cpu still ++ * points to this CPU. See dispatch_dequeue() for the counterpart. ++ * ++ * On return, @dsq is unlocked and @src_rq is locked. Returns %true if @p is ++ * still valid. %false if lost to dequeue. ++ */ ++static bool unlink_dsq_and_lock_src_rq(struct task_struct *p, ++ struct scx_dispatch_q *dsq, ++ struct rq *src_rq) +{ -+ bool moved = false; ++ s32 cpu = raw_smp_processor_id(); + -+ lockdep_assert_held(&dsq->lock); /* released on return */ ++ lockdep_assert_held(&dsq->lock); + -+ /* -+ * @dsq is locked and @p is on a remote rq. @p is currently protected by -+ * @dsq->lock. We want to pull @p to @rq but may deadlock if we grab -+ * @task_rq while holding @dsq and @rq locks. As dequeue can't drop the -+ * rq lock or fail, do a little dancing from our side. See -+ * move_task_to_local_dsq(). 
-+ */ + WARN_ON_ONCE(p->scx.holding_cpu >= 0); + task_unlink_from_dsq(p, dsq); -+ dsq_mod_nr(dsq, -1); -+ p->scx.holding_cpu = raw_smp_processor_id(); -+ raw_spin_unlock(&dsq->lock); ++ p->scx.holding_cpu = cpu; + -+ double_lock_balance(rq, task_rq); ++ raw_spin_unlock(&dsq->lock); ++ raw_spin_rq_lock(src_rq); + -+ moved = move_task_to_local_dsq(rq, p, 0); ++ /* task_rq couldn't have changed if we're still the holding cpu */ ++ return likely(p->scx.holding_cpu == cpu) && ++ !WARN_ON_ONCE(src_rq != task_rq(p)); ++} + -+ double_unlock_balance(rq, task_rq); ++static bool consume_remote_task(struct rq *this_rq, struct task_struct *p, ++ struct scx_dispatch_q *dsq, struct rq *src_rq) ++{ ++ raw_spin_rq_unlock(this_rq); + -+ return moved; ++ if (unlink_dsq_and_lock_src_rq(p, dsq, src_rq)) { ++ move_remote_task_to_local_dsq(p, 0, src_rq, this_rq); ++ return true; ++ } else { ++ raw_spin_rq_unlock(src_rq); ++ raw_spin_rq_lock(this_rq); ++ return false; ++ } +} +#else /* CONFIG_SMP */ -+static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq) { return false; } -+static bool consume_remote_task(struct rq *rq, struct scx_dispatch_q *dsq, -+ struct task_struct *p, struct rq *task_rq) { return false; } ++static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); } ++static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool trigger_error) { return false; } ++static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; } +#endif /* CONFIG_SMP */ + +static bool consume_dispatch_q(struct rq *rq, struct scx_dispatch_q *dsq) @@ -3766,12 +4293,14 @@ index 000000000000..0dac88d0e578 + struct rq *task_rq = task_rq(p); + + if (rq == task_rq) { -+ consume_local_task(rq, dsq, p); ++ task_unlink_from_dsq(p, dsq); ++ move_local_task_to_local_dsq(p, 0, dsq, rq); ++ 
raw_spin_unlock(&dsq->lock); + return true; + } + -+ if (task_can_run_on_remote_rq(p, rq)) { -+ if (likely(consume_remote_task(rq, dsq, p, task_rq))) ++ if (task_can_run_on_remote_rq(p, rq, false)) { ++ if (likely(consume_remote_task(rq, p, dsq, task_rq))) + return true; + goto retry; + } @@ -3781,122 +4310,102 @@ index 000000000000..0dac88d0e578 + return false; +} + -+enum dispatch_to_local_dsq_ret { -+ DTL_DISPATCHED, /* successfully dispatched */ -+ DTL_LOST, /* lost race to dequeue */ -+ DTL_NOT_LOCAL, /* destination is not a local DSQ */ -+ DTL_INVALID, /* invalid local dsq_id */ -+}; ++static bool consume_global_dsq(struct rq *rq) ++{ ++ int node = cpu_to_node(cpu_of(rq)); ++ ++ return consume_dispatch_q(rq, global_dsqs[node]); ++} + +/** + * dispatch_to_local_dsq - Dispatch a task to a local dsq + * @rq: current rq which is locked -+ * @dsq_id: destination dsq ID ++ * @dst_dsq: destination DSQ + * @p: task to dispatch + * @enq_flags: %SCX_ENQ_* + * -+ * We're holding @rq lock and want to dispatch @p to the local DSQ identified by -+ * @dsq_id. This function performs all the synchronization dancing needed -+ * because local DSQs are protected with rq locks. ++ * We're holding @rq lock and want to dispatch @p to @dst_dsq which is a local ++ * DSQ. This function performs all the synchronization dancing needed because ++ * local DSQs are protected with rq locks. + * + * The caller must have exclusive ownership of @p (e.g. through + * %SCX_OPSS_DISPATCHING). + */ -+static enum dispatch_to_local_dsq_ret -+dispatch_to_local_dsq(struct rq *rq, u64 dsq_id, struct task_struct *p, -+ u64 enq_flags) ++static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, ++ struct task_struct *p, u64 enq_flags) +{ + struct rq *src_rq = task_rq(p); -+ struct rq *dst_rq; ++ struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); + + /* + * We're synchronized against dequeue through DISPATCHING. 
As @p can't + * be dequeued, its task_rq and cpus_allowed are stable too. ++ * ++ * If dispatching to @rq that @p is already on, no lock dancing needed. + */ -+ if (dsq_id == SCX_DSQ_LOCAL) { -+ dst_rq = rq; -+ } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { -+ s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; -+ -+ if (!ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) -+ return DTL_INVALID; -+ dst_rq = cpu_rq(cpu); -+ } else { -+ return DTL_NOT_LOCAL; -+ } -+ -+ /* if dispatching to @rq that @p is already on, no lock dancing needed */ + if (rq == src_rq && rq == dst_rq) { -+ dispatch_enqueue(&dst_rq->scx.local_dsq, p, -+ enq_flags | SCX_ENQ_CLEAR_OPSS); -+ return DTL_DISPATCHED; ++ dispatch_enqueue(dst_dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); ++ return; + } + +#ifdef CONFIG_SMP -+ if (cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)) { -+ struct rq *locked_dst_rq = dst_rq; -+ bool dsp; ++ if (unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { ++ dispatch_enqueue(find_global_dsq(p), p, ++ enq_flags | SCX_ENQ_CLEAR_OPSS); ++ return; ++ } + -+ /* -+ * @p is on a possibly remote @src_rq which we need to lock to -+ * move the task. If dequeue is in progress, it'd be locking -+ * @src_rq and waiting on DISPATCHING, so we can't grab @src_rq -+ * lock while holding DISPATCHING. -+ * -+ * As DISPATCHING guarantees that @p is wholly ours, we can -+ * pretend that we're moving from a DSQ and use the same -+ * mechanism - mark the task under transfer with holding_cpu, -+ * release DISPATCHING and then follow the same protocol. -+ */ -+ p->scx.holding_cpu = raw_smp_processor_id(); ++ /* ++ * @p is on a possibly remote @src_rq which we need to lock to move the ++ * task. If dequeue is in progress, it'd be locking @src_rq and waiting ++ * on DISPATCHING, so we can't grab @src_rq lock while holding ++ * DISPATCHING. 
++ * ++ * As DISPATCHING guarantees that @p is wholly ours, we can pretend that ++ * we're moving from a DSQ and use the same mechanism - mark the task ++ * under transfer with holding_cpu, release DISPATCHING and then follow ++ * the same protocol. See unlink_dsq_and_lock_src_rq(). ++ */ ++ p->scx.holding_cpu = raw_smp_processor_id(); + -+ /* store_release ensures that dequeue sees the above */ -+ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); ++ /* store_release ensures that dequeue sees the above */ ++ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); + -+ dispatch_to_local_dsq_lock(rq, src_rq, locked_dst_rq); ++ /* switch to @src_rq lock */ ++ if (rq != src_rq) { ++ raw_spin_rq_unlock(rq); ++ raw_spin_rq_lock(src_rq); ++ } + ++ /* task_rq couldn't have changed if we're still the holding cpu */ ++ if (likely(p->scx.holding_cpu == raw_smp_processor_id()) && ++ !WARN_ON_ONCE(src_rq != task_rq(p))) { + /* -+ * We don't require the BPF scheduler to avoid dispatching to -+ * offline CPUs mostly for convenience but also because CPUs can -+ * go offline between scx_bpf_dispatch() calls and here. If @p -+ * is destined to an offline CPU, queue it on its current CPU -+ * instead, which should always be safe. As this is an allowed -+ * behavior, don't trigger an ops error. ++ * If @p is staying on the same rq, there's no need to go ++ * through the full deactivate/activate cycle. Optimize by ++ * abbreviating move_remote_task_to_local_dsq(). + */ -+ if (!scx_rq_online(dst_rq)) -+ dst_rq = src_rq; -+ + if (src_rq == dst_rq) { -+ /* -+ * As @p is staying on the same rq, there's no need to -+ * go through the full deactivate/activate cycle. -+ * Optimize by abbreviating the operations in -+ * move_task_to_local_dsq(). 
-+ */ -+ dsp = p->scx.holding_cpu == raw_smp_processor_id(); -+ if (likely(dsp)) { -+ p->scx.holding_cpu = -1; -+ dispatch_enqueue(&dst_rq->scx.local_dsq, p, -+ enq_flags); -+ } ++ p->scx.holding_cpu = -1; ++ dispatch_enqueue(&dst_rq->scx.local_dsq, p, enq_flags); + } else { -+ dsp = move_task_to_local_dsq(dst_rq, p, enq_flags); ++ move_remote_task_to_local_dsq(p, enq_flags, ++ src_rq, dst_rq); + } + + /* if the destination CPU is idle, wake it up */ -+ if (dsp && sched_class_above(p->sched_class, -+ dst_rq->curr->sched_class)) ++ if (sched_class_above(p->sched_class, dst_rq->curr->sched_class)) + resched_curr(dst_rq); ++ } + -+ dispatch_to_local_dsq_unlock(rq, src_rq, locked_dst_rq); -+ -+ return dsp ? DTL_DISPATCHED : DTL_LOST; ++ /* switch back to @rq lock */ ++ if (rq != dst_rq) { ++ raw_spin_rq_unlock(dst_rq); ++ raw_spin_rq_lock(rq); + } ++#else /* CONFIG_SMP */ ++ BUG(); /* control can not reach here on UP */ +#endif /* CONFIG_SMP */ -+ -+ scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]", -+ cpu_of(dst_rq), p->comm, p->pid); -+ return DTL_INVALID; +} + +/** @@ -3971,20 +4480,12 @@ index 000000000000..0dac88d0e578 + + BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED)); + -+ switch (dispatch_to_local_dsq(rq, dsq_id, p, enq_flags)) { -+ case DTL_DISPATCHED: -+ break; -+ case DTL_LOST: -+ break; -+ case DTL_INVALID: -+ dsq_id = SCX_DSQ_GLOBAL; -+ fallthrough; -+ case DTL_NOT_LOCAL: -+ dsq = find_dsq_for_dispatch(cpu_rq(raw_smp_processor_id()), -+ dsq_id, p); ++ dsq = find_dsq_for_dispatch(this_rq(), dsq_id, p); ++ ++ if (dsq->id == SCX_DSQ_LOCAL) ++ dispatch_to_local_dsq(rq, dsq, p, enq_flags); ++ else + dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); -+ break; -+ } +} + +static void flush_dispatch_buf(struct rq *rq) @@ -4046,7 +4547,7 @@ index 000000000000..0dac88d0e578 + * same conditions later and pick @rq->curr accordingly. 
+ */ + if ((prev->scx.flags & SCX_TASK_QUEUED) && -+ prev->scx.slice && !scx_ops_bypassing()) { ++ prev->scx.slice && !scx_rq_bypassing(rq)) { + if (local) + prev->scx.flags |= SCX_TASK_BAL_KEEP; + goto has_tasks; @@ -4057,10 +4558,10 @@ index 000000000000..0dac88d0e578 + if (rq->scx.local_dsq.nr) + goto has_tasks; + -+ if (consume_dispatch_q(rq, &scx_dsq_global)) ++ if (consume_global_dsq(rq)) + goto has_tasks; + -+ if (!SCX_HAS_OP(dispatch) || scx_ops_bypassing() || !scx_rq_online(rq)) ++ if (!SCX_HAS_OP(dispatch) || scx_rq_bypassing(rq) || !scx_rq_online(rq)) + goto out; + + dspc->rq = rq; @@ -4082,7 +4583,7 @@ index 000000000000..0dac88d0e578 + + if (rq->scx.local_dsq.nr) + goto has_tasks; -+ if (consume_dispatch_q(rq, &scx_dsq_global)) ++ if (consume_global_dsq(rq)) + goto has_tasks; + + /* @@ -4109,7 +4610,6 @@ index 000000000000..0dac88d0e578 + return has_tasks; +} + -+#ifdef CONFIG_SMP +static int balance_scx(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf) +{ @@ -4143,7 +4643,31 @@ index 000000000000..0dac88d0e578 + + return ret; +} -+#endif ++ ++static void process_ddsp_deferred_locals(struct rq *rq) ++{ ++ struct task_struct *p; ++ ++ lockdep_assert_rq_held(rq); ++ ++ /* ++ * Now that @rq can be unlocked, execute the deferred enqueueing of ++ * tasks directly dispatched to the local DSQs of other CPUs. See ++ * direct_dispatch(). Keep popping from the head instead of using ++ * list_for_each_entry_safe() as dispatch_to_local_dsq() may unlock @rq ++ * temporarily.
++ */ ++ while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals, ++ struct task_struct, scx.dsq_list.node))) { ++ struct scx_dispatch_q *dsq; ++ ++ list_del_init(&p->scx.dsq_list.node); ++ ++ dsq = find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p); ++ if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL)) ++ dispatch_to_local_dsq(rq, dsq, p, p->scx.ddsp_enq_flags); ++ } ++} + +static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) +{ @@ -4187,62 +4711,71 @@ index 000000000000..0dac88d0e578 + } +} + -+static void process_ddsp_deferred_locals(struct rq *rq) ++static enum scx_cpu_preempt_reason ++preempt_reason_from_class(const struct sched_class *class) +{ -+ struct task_struct *p, *tmp; ++#ifdef CONFIG_SMP ++ if (class == &stop_sched_class) ++ return SCX_CPU_PREEMPT_STOP; ++#endif ++ if (class == &dl_sched_class) ++ return SCX_CPU_PREEMPT_DL; ++ if (class == &rt_sched_class) ++ return SCX_CPU_PREEMPT_RT; ++ return SCX_CPU_PREEMPT_UNKNOWN; ++} + -+ lockdep_assert_rq_held(rq); ++static void switch_class_scx(struct rq *rq, struct task_struct *next) ++{ ++ const struct sched_class *next_class = next->sched_class; + ++ if (!scx_enabled()) ++ return; ++#ifdef CONFIG_SMP + /* -+ * Now that @rq can be unlocked, execute the deferred enqueueing of -+ * tasks directly dispatched to the local DSQs of other CPUs. See -+ * direct_dispatch(). ++ * Pairs with the smp_load_acquire() issued by a CPU in ++ * kick_cpus_irq_workfn() who is waiting for this CPU to perform a ++ * resched. + */ -+ list_for_each_entry_safe(p, tmp, &rq->scx.ddsp_deferred_locals, -+ scx.dsq_list.node) { -+ s32 ret; ++ smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1); ++#endif ++ if (!static_branch_unlikely(&scx_ops_cpu_preempt)) ++ return; + -+ list_del_init(&p->scx.dsq_list.node); ++ /* ++ * The callback is conceptually meant to convey that the CPU is no ++ * longer under the control of SCX. 
Therefore, don't invoke the callback ++ * if the next class is below SCX (in which case the BPF scheduler has ++ * actively decided not to schedule any tasks on the CPU). ++ */ ++ if (sched_class_above(&ext_sched_class, next_class)) ++ return; ++ ++ /* ++ * At this point we know that SCX was preempted by a higher priority ++ * sched_class, so invoke the ->cpu_release() callback if we have not ++ * done so already. We only send the callback once between SCX being ++ * preempted, and it regaining control of the CPU. ++ * ++ * ->cpu_release() complements ->cpu_acquire(), which is emitted the ++ * next time that balance_scx() is invoked. ++ */ ++ if (!rq->scx.cpu_released) { ++ if (SCX_HAS_OP(cpu_release)) { ++ struct scx_cpu_release_args args = { ++ .reason = preempt_reason_from_class(next_class), ++ .task = next, ++ }; + -+ ret = dispatch_to_local_dsq(rq, p->scx.ddsp_dsq_id, p, -+ p->scx.ddsp_enq_flags); -+ WARN_ON_ONCE(ret == DTL_NOT_LOCAL); ++ SCX_CALL_OP(SCX_KF_CPU_RELEASE, ++ cpu_release, cpu_of(rq), &args); ++ } ++ rq->scx.cpu_released = true; + } +} + +static void put_prev_task_scx(struct rq *rq, struct task_struct *p) +{ -+#ifndef CONFIG_SMP -+ /* -+ * UP workaround. -+ * -+ * Because SCX may transfer tasks across CPUs during dispatch, dispatch -+ * is performed from its balance operation which isn't called in UP. -+ * Let's work around by calling it from the operations which come right -+ * after. -+ * -+ * 1. If the prev task is on SCX, pick_next_task() calls -+ * .put_prev_task() right after. As .put_prev_task() is also called -+ * from other places, we need to distinguish the calls which can be -+ * done by looking at the previous task's state - if still queued or -+ * dequeued with %SCX_DEQ_SLEEP, the caller must be pick_next_task(). -+ * This case is handled here. -+ * -+ * 2. If the prev task is not on SCX, the first following call into SCX -+ * will be .pick_next_task(), which is covered by calling -+ * balance_scx() from pick_next_task_scx(). 
-+ * -+ * Note that we can't merge the first case into the second as -+ * balance_scx() must be called before the previous SCX task goes -+ * through put_prev_task_scx(). -+ * -+ * @rq is pinned and can't be unlocked. As UP doesn't transfer tasks -+ * around, balance_one() doesn't need to. -+ */ -+ if (p->scx.flags & (SCX_TASK_QUEUED | SCX_TASK_DEQD_FOR_SLEEP)) -+ balance_one(rq, p, true); -+#endif -+ + update_curr_scx(rq); + + /* see dequeue_task_scx() on why we skip when !QUEUED */ @@ -4269,7 +4802,7 @@ index 000000000000..0dac88d0e578 + * scheduler class or core-sched forcing a different task. Leave + * it at the head of the local DSQ. + */ -+ if (p->scx.slice && !scx_ops_bypassing()) { ++ if (p->scx.slice && !scx_rq_bypassing(rq)) { + dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); + return; + } @@ -4300,12 +4833,6 @@ index 000000000000..0dac88d0e578 +{ + struct task_struct *p; + -+#ifndef CONFIG_SMP -+ /* UP workaround - see the comment at the head of put_prev_task_scx() */ -+ if (unlikely(rq->curr->sched_class != &ext_sched_class)) -+ balance_one(rq, rq->curr, true); -+#endif -+ + p = first_local_task(rq); + if (!p) + return NULL; @@ -4313,7 +4840,7 @@ index 000000000000..0dac88d0e578 + set_next_task_scx(rq, p, true); + + if (unlikely(!p->scx.slice)) { -+ if (!scx_ops_bypassing() && !scx_warned_zero_slice) { ++ if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) { + printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n", + p->comm, p->pid); + scx_warned_zero_slice = true; @@ -4350,7 +4877,7 @@ index 000000000000..0dac88d0e578 + * calling ops.core_sched_before(). Accesses are controlled by the + * verifier. 
+ */ -+ if (SCX_HAS_OP(core_sched_before) && !scx_ops_bypassing()) ++ if (SCX_HAS_OP(core_sched_before) && !scx_rq_bypassing(task_rq(a))) + return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before, + (struct task_struct *)a, + (struct task_struct *)b); @@ -4402,69 +4929,6 @@ index 000000000000..0dac88d0e578 +} +#endif /* CONFIG_SCHED_CORE */ + -+static enum scx_cpu_preempt_reason -+preempt_reason_from_class(const struct sched_class *class) -+{ -+#ifdef CONFIG_SMP -+ if (class == &stop_sched_class) -+ return SCX_CPU_PREEMPT_STOP; -+#endif -+ if (class == &dl_sched_class) -+ return SCX_CPU_PREEMPT_DL; -+ if (class == &rt_sched_class) -+ return SCX_CPU_PREEMPT_RT; -+ return SCX_CPU_PREEMPT_UNKNOWN; -+} -+ -+static void switch_class_scx(struct rq *rq, struct task_struct *next) -+{ -+ const struct sched_class *next_class = next->sched_class; -+ -+ if (!scx_enabled()) -+ return; -+#ifdef CONFIG_SMP -+ /* -+ * Pairs with the smp_load_acquire() issued by a CPU in -+ * kick_cpus_irq_workfn() who is waiting for this CPU to perform a -+ * resched. -+ */ -+ smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1); -+#endif -+ if (!static_branch_unlikely(&scx_ops_cpu_preempt)) -+ return; -+ -+ /* -+ * The callback is conceptually meant to convey that the CPU is no -+ * longer under the control of SCX. Therefore, don't invoke the callback -+ * if the next class is below SCX (in which case the BPF scheduler has -+ * actively decided not to schedule any tasks on the CPU). -+ */ -+ if (sched_class_above(&ext_sched_class, next_class)) -+ return; -+ -+ /* -+ * At this point we know that SCX was preempted by a higher priority -+ * sched_class, so invoke the ->cpu_release() callback if we have not -+ * done so already. We only send the callback once between SCX being -+ * preempted, and it regaining control of the CPU. -+ * -+ * ->cpu_release() complements ->cpu_acquire(), which is emitted the -+ * next time that balance_scx() is invoked. 
-+ */ -+ if (!rq->scx.cpu_released) { -+ if (SCX_HAS_OP(cpu_release)) { -+ struct scx_cpu_release_args args = { -+ .reason = preempt_reason_from_class(next_class), -+ .task = next, -+ }; -+ -+ SCX_CALL_OP(SCX_KF_CPU_RELEASE, -+ cpu_release, cpu_of(rq), &args); -+ } -+ rq->scx.cpu_released = true; -+ } -+} -+ +#ifdef CONFIG_SMP + +static bool test_and_clear_cpu_idle(int cpu) @@ -4815,7 +5279,7 @@ index 000000000000..0dac88d0e578 + * While disabling, always resched and refresh core-sched timestamp as + * we can't trust the slice management or ops.core_sched_before(). + */ -+ if (scx_ops_bypassing()) { ++ if (scx_rq_bypassing(rq)) { + curr->scx.slice = 0; + touch_core_sched(rq, curr); + } else if (SCX_HAS_OP(tick)) { @@ -4826,6 +5290,28 @@ index 000000000000..0dac88d0e578 + resched_curr(rq); +} + ++#ifdef CONFIG_EXT_GROUP_SCHED ++static struct cgroup *tg_cgrp(struct task_group *tg) ++{ ++ /* ++ * If CGROUP_SCHED is disabled, @tg is NULL. If @tg is an autogroup, ++ * @tg->css.cgroup is NULL. In both cases, @tg can be treated as the ++ * root cgroup. 
++ */ ++ if (tg && tg->css.cgroup) ++ return tg->css.cgroup; ++ else ++ return &cgrp_dfl_root.cgrp; ++} ++ ++#define SCX_INIT_TASK_ARGS_CGROUP(tg) .cgroup = tg_cgrp(tg), ++ ++#else /* CONFIG_EXT_GROUP_SCHED */ ++ ++#define SCX_INIT_TASK_ARGS_CGROUP(tg) ++ ++#endif /* CONFIG_EXT_GROUP_SCHED */ ++ +static enum scx_task_state scx_get_task_state(const struct task_struct *p) +{ + return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT; @@ -4870,6 +5356,7 @@ index 000000000000..0dac88d0e578 + + if (SCX_HAS_OP(init_task)) { + struct scx_init_task_args args = { ++ SCX_INIT_TASK_ARGS_CGROUP(tg) + .fork = fork, + }; + @@ -4883,24 +5370,29 @@ index 000000000000..0dac88d0e578 + scx_set_task_state(p, SCX_TASK_INIT); + + if (p->scx.disallow) { -+ struct rq *rq; -+ struct rq_flags rf; ++ if (!fork) { ++ struct rq *rq; ++ struct rq_flags rf; + -+ rq = task_rq_lock(p, &rf); ++ rq = task_rq_lock(p, &rf); + -+ /* -+ * We're either in fork or load path and @p->policy will be -+ * applied right after. Reverting @p->policy here and rejecting -+ * %SCHED_EXT transitions from scx_check_setscheduler() -+ * guarantees that if ops.init_task() sets @p->disallow, @p can -+ * never be in SCX. -+ */ -+ if (p->policy == SCHED_EXT) { -+ p->policy = SCHED_NORMAL; -+ atomic_long_inc(&scx_nr_rejected); -+ } ++ /* ++ * We're in the load path and @p->policy will be applied ++ * right after. Reverting @p->policy here and rejecting ++ * %SCHED_EXT transitions from scx_check_setscheduler() ++ * guarantees that if ops.init_task() sets @p->disallow, ++ * @p can never be in SCX. 
++ */ ++ if (p->policy == SCHED_EXT) { ++ p->policy = SCHED_NORMAL; ++ atomic_long_inc(&scx_nr_rejected); ++ } + -+ task_rq_unlock(rq, p, &rf); ++ task_rq_unlock(rq, p, &rf); ++ } else if (p->policy == SCHED_EXT) { ++ scx_ops_error("ops.init_task() set task->scx.disallow for %s[%d] during fork", ++ p->comm, p->pid); ++ } + } + + p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; @@ -4929,7 +5421,7 @@ index 000000000000..0dac88d0e578 + scx_set_task_state(p, SCX_TASK_ENABLED); + + if (SCX_HAS_OP(set_weight)) -+ SCX_CALL_OP(SCX_KF_REST, set_weight, p, p->scx.weight); ++ SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); +} + +static void scx_ops_disable_task(struct task_struct *p) @@ -5004,7 +5496,7 @@ index 000000000000..0dac88d0e578 +{ + percpu_rwsem_assert_held(&scx_fork_rwsem); + -+ if (scx_enabled()) ++ if (scx_ops_init_task_enabled) + return scx_ops_init_task(p, task_group(p), true); + else + return 0; @@ -5012,7 +5504,7 @@ index 000000000000..0dac88d0e578 + +void scx_post_fork(struct task_struct *p) +{ -+ if (scx_enabled()) { ++ if (scx_ops_init_task_enabled) { + scx_set_task_state(p, SCX_TASK_READY); + + /* @@ -5126,7 +5618,7 @@ index 000000000000..0dac88d0e578 +{ + struct task_struct *p = rq->curr; + -+ if (scx_ops_bypassing()) ++ if (scx_rq_bypassing(rq)) + return false; + + if (p->sched_class != &ext_sched_class) @@ -5141,6 +5633,222 @@ index 000000000000..0dac88d0e578 +} +#endif + ++#ifdef CONFIG_EXT_GROUP_SCHED ++ ++DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); ++static bool scx_cgroup_enabled; ++static bool cgroup_warned_missing_weight; ++static bool cgroup_warned_missing_idle; ++ ++static void scx_cgroup_warn_missing_weight(struct task_group *tg) ++{ ++ if (scx_ops_enable_state() == SCX_OPS_DISABLED || ++ cgroup_warned_missing_weight) ++ return; ++ ++ if ((scx_ops.flags & SCX_OPS_HAS_CGROUP_WEIGHT) || !tg->css.parent) ++ return; ++ ++ pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.weight\n", ++ scx_ops.name); ++ cgroup_warned_missing_weight 
= true; ++} ++ ++static void scx_cgroup_warn_missing_idle(struct task_group *tg) ++{ ++ if (!scx_cgroup_enabled || cgroup_warned_missing_idle) ++ return; ++ ++ if (!tg->idle) ++ return; ++ ++ pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.idle\n", ++ scx_ops.name); ++ cgroup_warned_missing_idle = true; ++} ++ ++int scx_tg_online(struct task_group *tg) ++{ ++ int ret = 0; ++ ++ WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED)); ++ ++ percpu_down_read(&scx_cgroup_rwsem); ++ ++ scx_cgroup_warn_missing_weight(tg); ++ ++ if (scx_cgroup_enabled) { ++ if (SCX_HAS_OP(cgroup_init)) { ++ struct scx_cgroup_init_args args = ++ { .weight = tg->scx_weight }; ++ ++ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, ++ tg->css.cgroup, &args); ++ if (ret) ++ ret = ops_sanitize_err("cgroup_init", ret); ++ } ++ if (ret == 0) ++ tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED; ++ } else { ++ tg->scx_flags |= SCX_TG_ONLINE; ++ } ++ ++ percpu_up_read(&scx_cgroup_rwsem); ++ return ret; ++} ++ ++void scx_tg_offline(struct task_group *tg) ++{ ++ WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE)); ++ ++ percpu_down_read(&scx_cgroup_rwsem); ++ ++ if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED)) ++ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, tg->css.cgroup); ++ tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); ++ ++ percpu_up_read(&scx_cgroup_rwsem); ++} ++ ++int scx_cgroup_can_attach(struct cgroup_taskset *tset) ++{ ++ struct cgroup_subsys_state *css; ++ struct task_struct *p; ++ int ret; ++ ++ /* released in scx_finish/cancel_attach() */ ++ percpu_down_read(&scx_cgroup_rwsem); ++ ++ if (!scx_cgroup_enabled) ++ return 0; ++ ++ cgroup_taskset_for_each(p, css, tset) { ++ struct cgroup *from = tg_cgrp(task_group(p)); ++ struct cgroup *to = tg_cgrp(css_tg(css)); ++ ++ WARN_ON_ONCE(p->scx.cgrp_moving_from); ++ ++ /* ++ * sched_move_task() omits identity migrations. 
Let's match the ++ * behavior so that ops.cgroup_prep_move() and ops.cgroup_move() ++ * always match one-to-one. ++ */ ++ if (from == to) ++ continue; ++ ++ if (SCX_HAS_OP(cgroup_prep_move)) { ++ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_prep_move, ++ p, from, css->cgroup); ++ if (ret) ++ goto err; ++ } ++ ++ p->scx.cgrp_moving_from = from; ++ } ++ ++ return 0; ++ ++err: ++ cgroup_taskset_for_each(p, css, tset) { ++ if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from) ++ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p, ++ p->scx.cgrp_moving_from, css->cgroup); ++ p->scx.cgrp_moving_from = NULL; ++ } ++ ++ percpu_up_read(&scx_cgroup_rwsem); ++ return ops_sanitize_err("cgroup_prep_move", ret); ++} ++ ++void scx_move_task(struct task_struct *p) ++{ ++ if (!scx_cgroup_enabled) ++ return; ++ ++ /* ++ * We're called from sched_move_task() which handles both cgroup and ++ * autogroup moves. Ignore the latter. ++ * ++ * Also ignore exiting tasks, because in the exit path tasks transition ++ * from the autogroup to the root group, so task_group_is_autogroup() ++ * alone isn't able to catch exiting autogroup tasks. This is safe for ++ * cgroup_move(), because cgroup migrations never happen for PF_EXITING ++ * tasks. ++ */ ++ if (task_group_is_autogroup(task_group(p)) || (p->flags & PF_EXITING)) ++ return; ++ ++ /* ++ * @p must have ops.cgroup_prep_move() called on it and thus ++ * cgrp_moving_from set. 
++ */ ++ if (SCX_HAS_OP(cgroup_move) && !WARN_ON_ONCE(!p->scx.cgrp_moving_from)) ++ SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, p, ++ p->scx.cgrp_moving_from, tg_cgrp(task_group(p))); ++ p->scx.cgrp_moving_from = NULL; ++} ++ ++void scx_cgroup_finish_attach(void) ++{ ++ percpu_up_read(&scx_cgroup_rwsem); ++} ++ ++void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) ++{ ++ struct cgroup_subsys_state *css; ++ struct task_struct *p; ++ ++ if (!scx_cgroup_enabled) ++ goto out_unlock; ++ ++ cgroup_taskset_for_each(p, css, tset) { ++ if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from) ++ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p, ++ p->scx.cgrp_moving_from, css->cgroup); ++ p->scx.cgrp_moving_from = NULL; ++ } ++out_unlock: ++ percpu_up_read(&scx_cgroup_rwsem); ++} ++ ++void scx_group_set_weight(struct task_group *tg, unsigned long weight) ++{ ++ percpu_down_read(&scx_cgroup_rwsem); ++ ++ if (scx_cgroup_enabled && tg->scx_weight != weight) { ++ if (SCX_HAS_OP(cgroup_set_weight)) ++ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight, ++ tg_cgrp(tg), weight); ++ tg->scx_weight = weight; ++ } ++ ++ percpu_up_read(&scx_cgroup_rwsem); ++} ++ ++void scx_group_set_idle(struct task_group *tg, bool idle) ++{ ++ percpu_down_read(&scx_cgroup_rwsem); ++ scx_cgroup_warn_missing_idle(tg); ++ percpu_up_read(&scx_cgroup_rwsem); ++} ++ ++static void scx_cgroup_lock(void) ++{ ++ percpu_down_write(&scx_cgroup_rwsem); ++} ++ ++static void scx_cgroup_unlock(void) ++{ ++ percpu_up_write(&scx_cgroup_rwsem); ++} ++ ++#else /* CONFIG_EXT_GROUP_SCHED */ ++ ++static inline void scx_cgroup_lock(void) {} ++static inline void scx_cgroup_unlock(void) {} ++ ++#endif /* CONFIG_EXT_GROUP_SCHED */ ++ +/* + * Omitted operations: + * @@ -5161,6 +5869,7 @@ index 000000000000..0dac88d0e578 + + .wakeup_preempt = wakeup_preempt_scx, + ++ .balance = balance_scx, + .pick_next_task = pick_next_task_scx, + + .put_prev_task = put_prev_task_scx, @@ -5169,7 +5878,6 @@ index 
000000000000..0dac88d0e578 + .switch_class = switch_class_scx, + +#ifdef CONFIG_SMP -+ .balance = balance_scx, + .select_task_rq = select_task_rq_scx, + .task_woken = task_woken_scx, + .set_cpus_allowed = set_cpus_allowed_scx, @@ -5278,6 +5986,102 @@ index 000000000000..0dac88d0e578 + rcu_read_unlock(); +} + ++#ifdef CONFIG_EXT_GROUP_SCHED ++static void scx_cgroup_exit(void) ++{ ++ struct cgroup_subsys_state *css; ++ ++ percpu_rwsem_assert_held(&scx_cgroup_rwsem); ++ ++ WARN_ON_ONCE(!scx_cgroup_enabled); ++ scx_cgroup_enabled = false; ++ ++ /* ++ * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk ++ * cgroups and exit all the inited ones, all online cgroups are exited. ++ */ ++ rcu_read_lock(); ++ css_for_each_descendant_post(css, &root_task_group.css) { ++ struct task_group *tg = css_tg(css); ++ ++ if (!(tg->scx_flags & SCX_TG_INITED)) ++ continue; ++ tg->scx_flags &= ~SCX_TG_INITED; ++ ++ if (!scx_ops.cgroup_exit) ++ continue; ++ ++ if (WARN_ON_ONCE(!css_tryget(css))) ++ continue; ++ rcu_read_unlock(); ++ ++ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, css->cgroup); ++ ++ rcu_read_lock(); ++ css_put(css); ++ } ++ rcu_read_unlock(); ++} ++ ++static int scx_cgroup_init(void) ++{ ++ struct cgroup_subsys_state *css; ++ int ret; ++ ++ percpu_rwsem_assert_held(&scx_cgroup_rwsem); ++ ++ cgroup_warned_missing_weight = false; ++ cgroup_warned_missing_idle = false; ++ ++ /* ++ * scx_tg_on/offline() are excluded thorugh scx_cgroup_rwsem. If we walk ++ * cgroups and init, all online cgroups are initialized. 
++ */ ++ rcu_read_lock(); ++ css_for_each_descendant_pre(css, &root_task_group.css) { ++ struct task_group *tg = css_tg(css); ++ struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; ++ ++ scx_cgroup_warn_missing_weight(tg); ++ scx_cgroup_warn_missing_idle(tg); ++ ++ if ((tg->scx_flags & ++ (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE) ++ continue; ++ ++ if (!scx_ops.cgroup_init) { ++ tg->scx_flags |= SCX_TG_INITED; ++ continue; ++ } ++ ++ if (WARN_ON_ONCE(!css_tryget(css))) ++ continue; ++ rcu_read_unlock(); ++ ++ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, ++ css->cgroup, &args); ++ if (ret) { ++ css_put(css); ++ return ret; ++ } ++ tg->scx_flags |= SCX_TG_INITED; ++ ++ rcu_read_lock(); ++ css_put(css); ++ } ++ rcu_read_unlock(); ++ ++ WARN_ON_ONCE(scx_cgroup_enabled); ++ scx_cgroup_enabled = true; ++ ++ return 0; ++} ++ ++#else ++static void scx_cgroup_exit(void) {} ++static int scx_cgroup_init(void) { return 0; } ++#endif ++ + +/******************************************************************************** + * Sysfs interface and ops enable/disable. @@ -5318,11 +6122,19 @@ index 000000000000..0dac88d0e578 +} +SCX_ATTR(hotplug_seq); + ++static ssize_t scx_attr_enable_seq_show(struct kobject *kobj, ++ struct kobj_attribute *ka, char *buf) ++{ ++ return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_enable_seq)); ++} ++SCX_ATTR(enable_seq); ++ +static struct attribute *scx_global_attrs[] = { + &scx_attr_state.attr, + &scx_attr_switch_all.attr, + &scx_attr_nr_rejected.attr, + &scx_attr_hotplug_seq.attr, ++ &scx_attr_enable_seq.attr, + NULL, +}; + @@ -5421,16 +6233,8 @@ index 000000000000..0dac88d0e578 + } + + /* -+ * We need to guarantee that no tasks are on the BPF scheduler while -+ * bypassing. Either we see enabled or the enable path sees the -+ * increased bypass_depth before moving tasks to SCX. -+ */ -+ if (!scx_enabled()) -+ return; -+ -+ /* + * No task property is changing. 
We just need to make sure all currently -+ * queued tasks are re-queued according to the new scx_ops_bypassing() ++ * queued tasks are re-queued according to the new scx_rq_bypassing() + * state. As an optimization, walk each rq's runnable_list instead of + * the scx_tasks list. + * @@ -5444,6 +6248,24 @@ index 000000000000..0dac88d0e578 + + rq_lock_irqsave(rq, &rf); + ++ if (bypass) { ++ WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING); ++ rq->scx.flags |= SCX_RQ_BYPASSING; ++ } else { ++ WARN_ON_ONCE(!(rq->scx.flags & SCX_RQ_BYPASSING)); ++ rq->scx.flags &= ~SCX_RQ_BYPASSING; ++ } ++ ++ /* ++ * We need to guarantee that no tasks are on the BPF scheduler ++ * while bypassing. Either we see enabled or the enable path ++ * sees scx_rq_bypassing() before moving tasks to SCX. ++ */ ++ if (!scx_enabled()) { ++ rq_unlock_irqrestore(rq, &rf); ++ continue; ++ } ++ + /* + * The use of list_for_each_entry_safe_reverse() is required + * because each task is going to be removed from and added back @@ -5499,11 +6321,11 @@ index 000000000000..0dac88d0e578 +{ + switch (kind) { + case SCX_EXIT_UNREG: -+ return "Scheduler unregistered from user space"; ++ return "unregistered from user space"; + case SCX_EXIT_UNREG_BPF: -+ return "Scheduler unregistered from BPF"; ++ return "unregistered from BPF"; + case SCX_EXIT_UNREG_KERN: -+ return "Scheduler unregistered from the main kernel"; ++ return "unregistered from the main kernel"; + case SCX_EXIT_SYSRQ: + return "disabled by sysrq-S"; + case SCX_EXIT_ERROR: @@ -5569,66 +6391,64 @@ index 000000000000..0dac88d0e578 + WRITE_ONCE(scx_switching_all, false); + + /* -+ * Avoid racing against fork. See scx_ops_enable() for explanation on -+ * the locking order. ++ * Shut down cgroup support before tasks so that the cgroup attach path ++ * doesn't race against scx_ops_exit_task(). 
+ */ -+ percpu_down_write(&scx_fork_rwsem); -+ cpus_read_lock(); ++ scx_cgroup_lock(); ++ scx_cgroup_exit(); ++ scx_cgroup_unlock(); + -+ spin_lock_irq(&scx_tasks_lock); -+ scx_task_iter_init(&sti); + /* -+ * Invoke scx_ops_exit_task() on all non-idle tasks, including -+ * TASK_DEAD tasks. Because dead tasks may have a nonzero refcount, -+ * we may not have invoked sched_ext_free() on them by the time a -+ * scheduler is disabled. We must therefore exit the task here, or we'd -+ * fail to invoke ops.exit_task(), as the scheduler will have been -+ * unloaded by the time the task is subsequently exited on the -+ * sched_ext_free() path. ++ * The BPF scheduler is going away. All tasks including %TASK_DEAD ones ++ * must be switched out and exited synchronously. + */ -+ while ((p = scx_task_iter_next_locked(&sti, true))) { ++ percpu_down_write(&scx_fork_rwsem); ++ ++ scx_ops_init_task_enabled = false; ++ ++ spin_lock_irq(&scx_tasks_lock); ++ scx_task_iter_init(&sti); ++ while ((p = scx_task_iter_next_locked(&sti))) { + const struct sched_class *old_class = p->sched_class; + struct sched_enq_and_set_ctx ctx; + -+ if (READ_ONCE(p->__state) != TASK_DEAD) { -+ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, -+ &ctx); ++ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); + -+ p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL); -+ __setscheduler_prio(p, p->prio); -+ check_class_changing(task_rq(p), p, old_class); ++ p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL); ++ __setscheduler_prio(p, p->prio); ++ check_class_changing(task_rq(p), p, old_class); + -+ sched_enq_and_set_task(&ctx); ++ sched_enq_and_set_task(&ctx); + -+ check_class_changed(task_rq(p), p, old_class, p->prio); -+ } ++ check_class_changed(task_rq(p), p, old_class, p->prio); + scx_ops_exit_task(p); + } + scx_task_iter_exit(&sti); + spin_unlock_irq(&scx_tasks_lock); ++ percpu_up_write(&scx_fork_rwsem); + + /* no task is on scx, turn off all the switches and flush in-progress calls */ 
-+ static_branch_disable_cpuslocked(&__scx_ops_enabled); ++ static_branch_disable(&__scx_ops_enabled); + for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) -+ static_branch_disable_cpuslocked(&scx_has_op[i]); -+ static_branch_disable_cpuslocked(&scx_ops_enq_last); -+ static_branch_disable_cpuslocked(&scx_ops_enq_exiting); -+ static_branch_disable_cpuslocked(&scx_ops_cpu_preempt); -+ static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); ++ static_branch_disable(&scx_has_op[i]); ++ static_branch_disable(&scx_ops_enq_last); ++ static_branch_disable(&scx_ops_enq_exiting); ++ static_branch_disable(&scx_ops_cpu_preempt); ++ static_branch_disable(&scx_builtin_idle_enabled); + synchronize_rcu(); + -+ cpus_read_unlock(); -+ percpu_up_write(&scx_fork_rwsem); -+ + if (ei->kind >= SCX_EXIT_ERROR) { -+ printk(KERN_ERR "sched_ext: BPF scheduler \"%s\" errored, disabling\n", scx_ops.name); -+ -+ if (ei->msg[0] == '\0') -+ printk(KERN_ERR "sched_ext: %s\n", ei->reason); -+ else -+ printk(KERN_ERR "sched_ext: %s (%s)\n", ei->reason, ei->msg); ++ pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n", ++ scx_ops.name, ei->reason); + ++ if (ei->msg[0] != '\0') ++ pr_err("sched_ext: %s: %s\n", scx_ops.name, ei->msg); ++#ifdef CONFIG_STACKTRACE + stack_trace_print(ei->bt, ei->bt_len, 2); ++#endif ++ } else { ++ pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n", ++ scx_ops.name, ei->reason); + } + + if (scx_ops.exit) @@ -5817,7 +6637,7 @@ index 000000000000..0dac88d0e578 + static unsigned long bt[SCX_EXIT_BT_LEN]; + char dsq_id_buf[19] = "(n/a)"; + unsigned long ops_state = atomic_long_read(&p->scx.ops_state); -+ unsigned int bt_len; ++ unsigned int bt_len = 0; + + if (p->scx.dsq) + scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx", @@ -5842,7 +6662,9 @@ index 000000000000..0dac88d0e578 + ops_dump_exit(); + } + ++#ifdef CONFIG_STACKTRACE + bt_len = stack_trace_save_tsk(p, bt, SCX_EXIT_BT_LEN, 1); ++#endif + if (bt_len) { + dump_newline(s); + dump_stack_trace(s, " ", bt, 
bt_len); @@ -6000,10 +6822,10 @@ index 000000000000..0dac88d0e578 + return; + + ei->exit_code = exit_code; -+ ++#ifdef CONFIG_STACKTRACE + if (kind >= SCX_EXIT_ERROR) + ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1); -+ ++#endif + va_start(args, fmt); + vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args); + va_end(args); @@ -6061,12 +6883,12 @@ index 000000000000..0dac88d0e578 + return 0; +} + -+static int scx_ops_enable(struct sched_ext_ops *ops) ++static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) +{ + struct scx_task_iter sti; + struct task_struct *p; + unsigned long timeout; -+ int i, cpu, ret; ++ int i, cpu, node, ret; + + if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN), + cpu_possible_mask)) { @@ -6085,6 +6907,34 @@ index 000000000000..0dac88d0e578 + } + } + ++ if (!global_dsqs) { ++ struct scx_dispatch_q **dsqs; ++ ++ dsqs = kcalloc(nr_node_ids, sizeof(dsqs[0]), GFP_KERNEL); ++ if (!dsqs) { ++ ret = -ENOMEM; ++ goto err_unlock; ++ } ++ ++ for_each_node_state(node, N_POSSIBLE) { ++ struct scx_dispatch_q *dsq; ++ ++ dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node); ++ if (!dsq) { ++ for_each_node_state(node, N_POSSIBLE) ++ kfree(dsqs[node]); ++ kfree(dsqs); ++ ret = -ENOMEM; ++ goto err_unlock; ++ } ++ ++ init_dsq(dsq, SCX_DSQ_GLOBAL); ++ dsqs[node] = dsq; ++ } ++ ++ global_dsqs = dsqs; ++ } ++ + if (scx_ops_enable_state() != SCX_OPS_DISABLED) { + ret = -EBUSY; + goto err_unlock; @@ -6108,12 +6958,12 @@ index 000000000000..0dac88d0e578 + } + + /* -+ * Set scx_ops, transition to PREPPING and clear exit info to arm the ++ * Set scx_ops, transition to ENABLING and clear exit info to arm the + * disable path. Failure triggers full disabling from here on. 
+ */ + scx_ops = *ops; + -+ WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_PREPPING) != ++ WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_ENABLING) != + SCX_OPS_DISABLED); + + atomic_set(&scx_exit_kind, SCX_EXIT_NONE); @@ -6134,7 +6984,8 @@ index 000000000000..0dac88d0e578 + ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init); + if (ret) { + ret = ops_sanitize_err("init", ret); -+ goto err_disable_unlock_cpus; ++ cpus_read_unlock(); ++ goto err_disable; + } + } + @@ -6142,6 +6993,7 @@ index 000000000000..0dac88d0e578 + if (((void (**)(void))ops)[i]) + static_branch_enable_cpuslocked(&scx_has_op[i]); + ++ check_hotplug_seq(ops); + cpus_read_unlock(); + + ret = validate_ops(ops); @@ -6169,42 +7021,40 @@ index 000000000000..0dac88d0e578 + scx_watchdog_timeout / 2); + + /* -+ * Lock out forks before opening the floodgate so that they don't wander -+ * into the operations prematurely. -+ * -+ * We don't need to keep the CPUs stable but grab cpus_read_lock() to -+ * ease future locking changes for cgroup suport. -+ * -+ * Note that cpu_hotplug_lock must nest inside scx_fork_rwsem due to the -+ * following dependency chain: -+ * -+ * scx_fork_rwsem --> pernet_ops_rwsem --> cpu_hotplug_lock ++ * Once __scx_ops_enabled is set, %current can be switched to SCX ++ * anytime. This can lead to stalls as some BPF schedulers (e.g. ++ * userspace scheduling) may not function correctly before all tasks are ++ * switched. Init in bypass mode to guarantee forward progress. 
+ */ -+ percpu_down_write(&scx_fork_rwsem); -+ cpus_read_lock(); -+ -+ check_hotplug_seq(ops); ++ scx_ops_bypass(true); + + for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) + if (((void (**)(void))ops)[i]) -+ static_branch_enable_cpuslocked(&scx_has_op[i]); ++ static_branch_enable(&scx_has_op[i]); + + if (ops->flags & SCX_OPS_ENQ_LAST) -+ static_branch_enable_cpuslocked(&scx_ops_enq_last); ++ static_branch_enable(&scx_ops_enq_last); + + if (ops->flags & SCX_OPS_ENQ_EXITING) -+ static_branch_enable_cpuslocked(&scx_ops_enq_exiting); ++ static_branch_enable(&scx_ops_enq_exiting); + if (scx_ops.cpu_acquire || scx_ops.cpu_release) -+ static_branch_enable_cpuslocked(&scx_ops_cpu_preempt); ++ static_branch_enable(&scx_ops_cpu_preempt); + + if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { + reset_idle_masks(); -+ static_branch_enable_cpuslocked(&scx_builtin_idle_enabled); ++ static_branch_enable(&scx_builtin_idle_enabled); + } else { -+ static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); ++ static_branch_disable(&scx_builtin_idle_enabled); + } + -+ static_branch_enable_cpuslocked(&__scx_ops_enabled); ++ /* ++ * Lock out forks, cgroup on/offlining and moves before opening the ++ * floodgate so that they don't wander into the operations prematurely. ++ */ ++ percpu_down_write(&scx_fork_rwsem); ++ ++ WARN_ON_ONCE(scx_ops_init_task_enabled); ++ scx_ops_init_task_enabled = true; + + /* + * Enable ops for every task. Fork is excluded by scx_fork_rwsem @@ -6212,12 +7062,29 @@ index 000000000000..0dac88d0e578 + * leaving as sched_ext_free() can handle both prepped and enabled + * tasks. Prep all tasks first and then enable them with preemption + * disabled. ++ * ++ * All cgroups should be initialized before scx_ops_init_task() so that ++ * the BPF scheduler can reliably track each task's cgroup membership ++ * from scx_ops_init_task(). 
Lock out cgroup on/offlining and task ++ * migrations while tasks are being initialized so that ++ * scx_cgroup_can_attach() never sees uninitialized tasks. + */ -+ spin_lock_irq(&scx_tasks_lock); ++ scx_cgroup_lock(); ++ ret = scx_cgroup_init(); ++ if (ret) ++ goto err_disable_unlock_all; + ++ spin_lock_irq(&scx_tasks_lock); + scx_task_iter_init(&sti); -+ while ((p = scx_task_iter_next_locked(&sti, false))) { -+ get_task_struct(p); ++ while ((p = scx_task_iter_next_locked(&sti))) { ++ /* ++ * @p may already be dead, have lost all its usages counts and ++ * be waiting for RCU grace period before being freed. @p can't ++ * be initialized for SCX in such cases and should be ignored. ++ */ ++ if (!tryget_task_struct(p)) ++ continue; ++ + scx_task_iter_rq_unlock(&sti); + spin_unlock_irq(&scx_tasks_lock); + @@ -6232,51 +7099,37 @@ index 000000000000..0dac88d0e578 + goto err_disable_unlock_all; + } + ++ scx_set_task_state(p, SCX_TASK_READY); ++ + put_task_struct(p); + spin_lock_irq(&scx_tasks_lock); + } + scx_task_iter_exit(&sti); ++ spin_unlock_irq(&scx_tasks_lock); ++ scx_cgroup_unlock(); ++ percpu_up_write(&scx_fork_rwsem); + + /* -+ * All tasks are prepped but are still ops-disabled. Ensure that -+ * %current can't be scheduled out and switch everyone. -+ * preempt_disable() is necessary because we can't guarantee that -+ * %current won't be starved if scheduled out while switching. -+ */ -+ preempt_disable(); -+ -+ /* -+ * From here on, the disable path must assume that tasks have ops -+ * enabled and need to be recovered. -+ * -+ * Transition to ENABLING fails iff the BPF scheduler has already -+ * triggered scx_bpf_error(). Returning an error code here would lose -+ * the recorded error information. Exit indicating success so that the -+ * error is notified through ops.exit() with all the details. ++ * All tasks are READY. It's safe to turn on scx_enabled() and switch ++ * all eligible tasks. 
+ */ -+ if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLING, SCX_OPS_PREPPING)) { -+ preempt_enable(); -+ spin_unlock_irq(&scx_tasks_lock); -+ WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); -+ ret = 0; -+ goto err_disable_unlock_all; -+ } ++ WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); ++ static_branch_enable(&__scx_ops_enabled); + + /* -+ * We're fully committed and can't fail. The PREPPED -> ENABLED ++ * We're fully committed and can't fail. The task READY -> ENABLED + * transitions here are synchronized against sched_ext_free() through + * scx_tasks_lock. + */ -+ WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); -+ ++ percpu_down_write(&scx_fork_rwsem); ++ spin_lock_irq(&scx_tasks_lock); + scx_task_iter_init(&sti); -+ while ((p = scx_task_iter_next_locked(&sti, false))) { ++ while ((p = scx_task_iter_next_locked(&sti))) { + const struct sched_class *old_class = p->sched_class; + struct sched_enq_and_set_ctx ctx; + + sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); + -+ scx_set_task_state(p, SCX_TASK_READY); + __setscheduler_prio(p, p->prio); + check_class_changing(task_rq(p), p, old_class); + @@ -6285,13 +7138,16 @@ index 000000000000..0dac88d0e578 + check_class_changed(task_rq(p), p, old_class, p->prio); + } + scx_task_iter_exit(&sti); -+ + spin_unlock_irq(&scx_tasks_lock); -+ preempt_enable(); -+ cpus_read_unlock(); + percpu_up_write(&scx_fork_rwsem); + -+ /* see above ENABLING transition for the explanation on exiting with 0 */ ++ scx_ops_bypass(false); ++ ++ /* ++ * Returning an error code here would lose the recorded error ++ * information. Exit indicating success so that the error is notified ++ * through ops.exit() with all the details. 
++ */ + if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) { + WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); + ret = 0; @@ -6301,9 +7157,13 @@ index 000000000000..0dac88d0e578 + if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL)) + static_branch_enable(&__scx_switched_all); + ++ pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n", ++ scx_ops.name, scx_switched_all() ? "" : " (partial)"); + kobject_uevent(scx_root_kobj, KOBJ_ADD); + mutex_unlock(&scx_ops_enable_mutex); + ++ atomic_long_inc(&scx_enable_seq); ++ + return 0; + +err_del: @@ -6320,9 +7180,9 @@ index 000000000000..0dac88d0e578 + return ret; + +err_disable_unlock_all: ++ scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); -+err_disable_unlock_cpus: -+ cpus_read_unlock(); ++ scx_ops_bypass(false); +err_disable: + mutex_unlock(&scx_ops_enable_mutex); + /* must be fully disabled before returning */ @@ -6514,6 +7374,11 @@ index 000000000000..0dac88d0e578 + + switch (moff) { + case offsetof(struct sched_ext_ops, init_task): ++#ifdef CONFIG_EXT_GROUP_SCHED ++ case offsetof(struct sched_ext_ops, cgroup_init): ++ case offsetof(struct sched_ext_ops, cgroup_exit): ++ case offsetof(struct sched_ext_ops, cgroup_prep_move): ++#endif + case offsetof(struct sched_ext_ops, cpu_online): + case offsetof(struct sched_ext_ops, cpu_offline): + case offsetof(struct sched_ext_ops, init): @@ -6527,12 +7392,12 @@ index 000000000000..0dac88d0e578 + return 0; +} + -+static int bpf_scx_reg(void *kdata) ++static int bpf_scx_reg(void *kdata, struct bpf_link *link) +{ -+ return scx_ops_enable(kdata); ++ return scx_ops_enable(kdata, link); +} + -+static void bpf_scx_unreg(void *kdata) ++static void bpf_scx_unreg(void *kdata, struct bpf_link *link) +{ + scx_ops_disable(SCX_EXIT_UNREG); + kthread_flush_work(&scx_ops_disable_work); @@ -6551,7 +7416,7 @@ index 000000000000..0dac88d0e578 + return 0; +} + -+static int bpf_scx_update(void *kdata, void *old_kdata) ++static int bpf_scx_update(void *kdata, void 
*old_kdata, struct bpf_link *link) +{ + /* + * sched_ext does not support updating the actively-loaded BPF @@ -6572,6 +7437,7 @@ index 000000000000..0dac88d0e578 +static void enqueue_stub(struct task_struct *p, u64 enq_flags) {} +static void dequeue_stub(struct task_struct *p, u64 enq_flags) {} +static void dispatch_stub(s32 prev_cpu, struct task_struct *p) {} ++static void tick_stub(struct task_struct *p) {} +static void runnable_stub(struct task_struct *p, u64 enq_flags) {} +static void running_stub(struct task_struct *p) {} +static void stopping_stub(struct task_struct *p, bool runnable) {} @@ -6587,16 +7453,28 @@ index 000000000000..0dac88d0e578 +static void exit_task_stub(struct task_struct *p, struct scx_exit_task_args *args) {} +static void enable_stub(struct task_struct *p) {} +static void disable_stub(struct task_struct *p) {} ++#ifdef CONFIG_EXT_GROUP_SCHED ++static s32 cgroup_init_stub(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; } ++static void cgroup_exit_stub(struct cgroup *cgrp) {} ++static s32 cgroup_prep_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; } ++static void cgroup_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} ++static void cgroup_cancel_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} ++static void cgroup_set_weight_stub(struct cgroup *cgrp, u32 weight) {} ++#endif +static void cpu_online_stub(s32 cpu) {} +static void cpu_offline_stub(s32 cpu) {} +static s32 init_stub(void) { return -EINVAL; } +static void exit_stub(struct scx_exit_info *info) {} ++static void dump_stub(struct scx_dump_ctx *ctx) {} ++static void dump_cpu_stub(struct scx_dump_ctx *ctx, s32 cpu, bool idle) {} ++static void dump_task_stub(struct scx_dump_ctx *ctx, struct task_struct *p) {} + +static struct sched_ext_ops __bpf_ops_sched_ext_ops = { + .select_cpu = select_cpu_stub, + .enqueue = enqueue_stub, + .dequeue = dequeue_stub, + .dispatch = 
dispatch_stub, ++ .tick = tick_stub, + .runnable = runnable_stub, + .running = running_stub, + .stopping = stopping_stub, @@ -6612,10 +7490,21 @@ index 000000000000..0dac88d0e578 + .exit_task = exit_task_stub, + .enable = enable_stub, + .disable = disable_stub, ++#ifdef CONFIG_EXT_GROUP_SCHED ++ .cgroup_init = cgroup_init_stub, ++ .cgroup_exit = cgroup_exit_stub, ++ .cgroup_prep_move = cgroup_prep_move_stub, ++ .cgroup_move = cgroup_move_stub, ++ .cgroup_cancel_move = cgroup_cancel_move_stub, ++ .cgroup_set_weight = cgroup_set_weight_stub, ++#endif + .cpu_online = cpu_online_stub, + .cpu_offline = cpu_offline_stub, + .init = init_stub, + .exit = exit_stub, ++ .dump = dump_stub, ++ .dump_cpu = dump_cpu_stub, ++ .dump_task = dump_task_stub, +}; + +static struct bpf_struct_ops bpf_sched_ext_ops = { @@ -6858,10 +7747,10 @@ index 000000000000..0dac88d0e578 + * definitions so that BPF scheduler implementations can use them + * through the generated vmlinux.h. + */ -+ WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT); ++ WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT | ++ SCX_TG_ONLINE); + + BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); -+ init_dsq(&scx_dsq_global, SCX_DSQ_GLOBAL); +#ifdef CONFIG_SMP + BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL)); @@ -6903,35 +7792,6 @@ index 000000000000..0dac88d0e578 +__bpf_kfunc_start_defs(); + +/** -+ * scx_bpf_create_dsq - Create a custom DSQ -+ * @dsq_id: DSQ to create -+ * @node: NUMA node to allocate from -+ * -+ * Create a custom DSQ identified by @dsq_id. Can be called from any sleepable -+ * scx callback, and any BPF_PROG_TYPE_SYSCALL prog. 
-+ */ -+__bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) -+{ -+ if (unlikely(node >= (int)nr_node_ids || -+ (node < 0 && node != NUMA_NO_NODE))) -+ return -EINVAL; -+ return PTR_ERR_OR_ZERO(create_dsq(dsq_id, node)); -+} -+ -+__bpf_kfunc_end_defs(); -+ -+BTF_KFUNCS_START(scx_kfunc_ids_sleepable) -+BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE) -+BTF_KFUNCS_END(scx_kfunc_ids_sleepable) -+ -+static const struct btf_kfunc_id_set scx_kfunc_set_sleepable = { -+ .owner = THIS_MODULE, -+ .set = &scx_kfunc_ids_sleepable, -+}; -+ -+__bpf_kfunc_start_defs(); -+ -+/** + * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu() + * @p: task_struct to select a CPU for + * @prev_cpu: CPU @p was on previously @@ -7021,7 +7881,7 @@ index 000000000000..0dac88d0e578 + * scx_bpf_dispatch - Dispatch a task into the FIFO queue of a DSQ + * @p: task_struct to dispatch + * @dsq_id: DSQ to dispatch to -+ * @slice: duration @p can run for in nsecs ++ * @slice: duration @p can run for in nsecs, 0 to keep the current value + * @enq_flags: SCX_ENQ_* + * + * Dispatch @p into the FIFO queue of the DSQ identified by @dsq_id. 
It is safe @@ -7071,7 +7931,7 @@ index 000000000000..0dac88d0e578 + * scx_bpf_dispatch_vtime - Dispatch a task into the vtime priority queue of a DSQ + * @p: task_struct to dispatch + * @dsq_id: DSQ to dispatch to -+ * @slice: duration @p can run for in nsecs ++ * @slice: duration @p can run for in nsecs, 0 to keep the current value + * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ + * @enq_flags: SCX_ENQ_* + * @@ -7112,6 +7972,118 @@ index 000000000000..0dac88d0e578 + .set = &scx_kfunc_ids_enqueue_dispatch, +}; + ++static bool scx_dispatch_from_dsq(struct bpf_iter_scx_dsq_kern *kit, ++ struct task_struct *p, u64 dsq_id, ++ u64 enq_flags) ++{ ++ struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq; ++ struct rq *this_rq, *src_rq, *dst_rq, *locked_rq; ++ bool dispatched = false; ++ bool in_balance; ++ unsigned long flags; ++ ++ if (!scx_kf_allowed_if_unlocked() && !scx_kf_allowed(SCX_KF_DISPATCH)) ++ return false; ++ ++ /* ++ * Can be called from either ops.dispatch() locking this_rq() or any ++ * context where no rq lock is held. If latter, lock @p's task_rq which ++ * we'll likely need anyway. ++ */ ++ src_rq = task_rq(p); ++ ++ local_irq_save(flags); ++ this_rq = this_rq(); ++ in_balance = this_rq->scx.flags & SCX_RQ_IN_BALANCE; ++ ++ if (in_balance) { ++ if (this_rq != src_rq) { ++ raw_spin_rq_unlock(this_rq); ++ raw_spin_rq_lock(src_rq); ++ } ++ } else { ++ raw_spin_rq_lock(src_rq); ++ } ++ ++ locked_rq = src_rq; ++ raw_spin_lock(&src_dsq->lock); ++ ++ /* ++ * Did someone else get to it? @p could have already left $src_dsq, got ++ * re-enqueud, or be in the process of being consumed by someone else. 
++ */ ++ if (unlikely(p->scx.dsq != src_dsq || ++ u32_before(kit->cursor.priv, p->scx.dsq_seq) || ++ p->scx.holding_cpu >= 0) || ++ WARN_ON_ONCE(src_rq != task_rq(p))) { ++ raw_spin_unlock(&src_dsq->lock); ++ goto out; ++ } ++ ++ /* @p is still on $src_dsq and stable, determine the destination */ ++ dst_dsq = find_dsq_for_dispatch(this_rq, dsq_id, p); ++ ++ if (dst_dsq->id == SCX_DSQ_LOCAL) { ++ dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); ++ if (!task_can_run_on_remote_rq(p, dst_rq, true)) { ++ dst_dsq = find_global_dsq(p); ++ dst_rq = src_rq; ++ } ++ } else { ++ /* no need to migrate if destination is a non-local DSQ */ ++ dst_rq = src_rq; ++ } ++ ++ /* ++ * Move @p into $dst_dsq. If $dst_dsq is the local DSQ of a different ++ * CPU, @p will be migrated. ++ */ ++ if (dst_dsq->id == SCX_DSQ_LOCAL) { ++ /* @p is going from a non-local DSQ to a local DSQ */ ++ if (src_rq == dst_rq) { ++ task_unlink_from_dsq(p, src_dsq); ++ move_local_task_to_local_dsq(p, enq_flags, ++ src_dsq, dst_rq); ++ raw_spin_unlock(&src_dsq->lock); ++ } else { ++ raw_spin_unlock(&src_dsq->lock); ++ move_remote_task_to_local_dsq(p, enq_flags, ++ src_rq, dst_rq); ++ locked_rq = dst_rq; ++ } ++ } else { ++ /* ++ * @p is going from a non-local DSQ to a non-local DSQ. As ++ * $src_dsq is already locked, do an abbreviated dequeue. 
++ */ ++ task_unlink_from_dsq(p, src_dsq); ++ p->scx.dsq = NULL; ++ raw_spin_unlock(&src_dsq->lock); ++ ++ if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME) ++ p->scx.dsq_vtime = kit->vtime; ++ dispatch_enqueue(dst_dsq, p, enq_flags); ++ } ++ ++ if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_SLICE) ++ p->scx.slice = kit->slice; ++ ++ dispatched = true; ++out: ++ if (in_balance) { ++ if (this_rq != locked_rq) { ++ raw_spin_rq_unlock(locked_rq); ++ raw_spin_rq_lock(this_rq); ++ } ++ } else { ++ raw_spin_rq_unlock_irqrestore(locked_rq, flags); ++ } ++ ++ kit->cursor.flags &= ~(__SCX_DSQ_ITER_HAS_SLICE | ++ __SCX_DSQ_ITER_HAS_VTIME); ++ return dispatched; ++} ++ +__bpf_kfunc_start_defs(); + +/** @@ -7171,7 +8143,7 @@ index 000000000000..0dac88d0e578 + + flush_dispatch_buf(dspc->rq); + -+ dsq = find_non_local_dsq(dsq_id); ++ dsq = find_user_dsq(dsq_id); + if (unlikely(!dsq)) { + scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id); + return false; @@ -7191,12 +8163,112 @@ index 000000000000..0dac88d0e578 + } +} + ++/** ++ * scx_bpf_dispatch_from_dsq_set_slice - Override slice when dispatching from DSQ ++ * @it__iter: DSQ iterator in progress ++ * @slice: duration the dispatched task can run for in nsecs ++ * ++ * Override the slice of the next task that will be dispatched from @it__iter ++ * using scx_bpf_dispatch_from_dsq[_vtime](). If this function is not called, ++ * the previous slice duration is kept. 
++ */ ++__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_slice( ++ struct bpf_iter_scx_dsq *it__iter, u64 slice) ++{ ++ struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter; ++ ++ kit->slice = slice; ++ kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE; ++} ++ ++/** ++ * scx_bpf_dispatch_from_dsq_set_vtime - Override vtime when dispatching from DSQ ++ * @it__iter: DSQ iterator in progress ++ * @vtime: task's ordering inside the vtime-sorted queue of the target DSQ ++ * ++ * Override the vtime of the next task that will be dispatched from @it__iter ++ * using scx_bpf_dispatch_from_dsq_vtime(). If this function is not called, the ++ * previous slice vtime is kept. If scx_bpf_dispatch_from_dsq() is used to ++ * dispatch the next task, the override is ignored and cleared. ++ */ ++__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime( ++ struct bpf_iter_scx_dsq *it__iter, u64 vtime) ++{ ++ struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter; ++ ++ kit->vtime = vtime; ++ kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME; ++} ++ ++/** ++ * scx_bpf_dispatch_from_dsq - Move a task from DSQ iteration to a DSQ ++ * @it__iter: DSQ iterator in progress ++ * @p: task to transfer ++ * @dsq_id: DSQ to move @p to ++ * @enq_flags: SCX_ENQ_* ++ * ++ * Transfer @p which is on the DSQ currently iterated by @it__iter to the DSQ ++ * specified by @dsq_id. All DSQs - local DSQs, global DSQ and user DSQs - can ++ * be the destination. ++ * ++ * For the transfer to be successful, @p must still be on the DSQ and have been ++ * queued before the DSQ iteration started. This function doesn't care whether ++ * @p was obtained from the DSQ iteration. @p just has to be on the DSQ and have ++ * been queued before the iteration started. ++ * ++ * @p's slice is kept by default. Use scx_bpf_dispatch_from_dsq_set_slice() to ++ * update. ++ * ++ * Can be called from ops.dispatch() or any BPF context which doesn't hold a rq ++ * lock (e.g. BPF timers or SYSCALL programs). 
++ * ++ * Returns %true if @p has been consumed, %false if @p had already been consumed ++ * or dequeued. ++ */ ++__bpf_kfunc bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, ++ struct task_struct *p, u64 dsq_id, ++ u64 enq_flags) ++{ ++ return scx_dispatch_from_dsq((struct bpf_iter_scx_dsq_kern *)it__iter, ++ p, dsq_id, enq_flags); ++} ++ ++/** ++ * scx_bpf_dispatch_vtime_from_dsq - Move a task from DSQ iteration to a PRIQ DSQ ++ * @it__iter: DSQ iterator in progress ++ * @p: task to transfer ++ * @dsq_id: DSQ to move @p to ++ * @enq_flags: SCX_ENQ_* ++ * ++ * Transfer @p which is on the DSQ currently iterated by @it__iter to the ++ * priority queue of the DSQ specified by @dsq_id. The destination must be a ++ * user DSQ as only user DSQs support priority queue. ++ * ++ * @p's slice and vtime are kept by default. Use ++ * scx_bpf_dispatch_from_dsq_set_slice() and ++ * scx_bpf_dispatch_from_dsq_set_vtime() to update. ++ * ++ * All other aspects are identical to scx_bpf_dispatch_from_dsq(). See ++ * scx_bpf_dispatch_vtime() for more information on @vtime. 
++ */ ++__bpf_kfunc bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter, ++ struct task_struct *p, u64 dsq_id, ++ u64 enq_flags) ++{ ++ return scx_dispatch_from_dsq((struct bpf_iter_scx_dsq_kern *)it__iter, ++ p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); ++} ++ +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(scx_kfunc_ids_dispatch) +BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots) +BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel) +BTF_ID_FLAGS(func, scx_bpf_consume) ++BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice) ++BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime) ++BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU) ++BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU) +BTF_KFUNCS_END(scx_kfunc_ids_dispatch) + +static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { @@ -7274,6 +8346,37 @@ index 000000000000..0dac88d0e578 +__bpf_kfunc_start_defs(); + +/** ++ * scx_bpf_create_dsq - Create a custom DSQ ++ * @dsq_id: DSQ to create ++ * @node: NUMA node to allocate from ++ * ++ * Create a custom DSQ identified by @dsq_id. Can be called from any sleepable ++ * scx callback, and any BPF_PROG_TYPE_SYSCALL prog. 
++ */ ++__bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) ++{ ++ if (unlikely(node >= (int)nr_node_ids || ++ (node < 0 && node != NUMA_NO_NODE))) ++ return -EINVAL; ++ return PTR_ERR_OR_ZERO(create_dsq(dsq_id, node)); ++} ++ ++__bpf_kfunc_end_defs(); ++ ++BTF_KFUNCS_START(scx_kfunc_ids_unlocked) ++BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE) ++BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU) ++BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU) ++BTF_KFUNCS_END(scx_kfunc_ids_unlocked) ++ ++static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = { ++ .owner = THIS_MODULE, ++ .set = &scx_kfunc_ids_unlocked, ++}; ++ ++__bpf_kfunc_start_defs(); ++ ++/** + * scx_bpf_kick_cpu - Trigger reschedule on a CPU + * @cpu: cpu to kick + * @flags: %SCX_KICK_* flags @@ -7291,17 +8394,17 @@ index 000000000000..0dac88d0e578 + if (!ops_cpu_valid(cpu, NULL)) + return; + ++ local_irq_save(irq_flags); ++ ++ this_rq = this_rq(); ++ + /* + * While bypassing for PM ops, IRQ handling may not be online which can + * lead to irq_work_queue() malfunction such as infinite busy wait for + * IRQ status update. Suppress kicking. 
+ */ -+ if (scx_ops_bypassing()) -+ return; -+ -+ local_irq_save(irq_flags); -+ -+ this_rq = this_rq(); ++ if (scx_rq_bypassing(this_rq)) ++ goto out; + + /* + * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting @@ -7361,7 +8464,7 @@ index 000000000000..0dac88d0e578 + goto out; + } + } else { -+ dsq = find_non_local_dsq(dsq_id); ++ dsq = find_user_dsq(dsq_id); + if (dsq) { + ret = READ_ONCE(dsq->nr); + goto out; @@ -7407,17 +8510,16 @@ index 000000000000..0dac88d0e578 + BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) != + __alignof__(struct bpf_iter_scx_dsq)); + -+ if (flags & ~__SCX_DSQ_ITER_ALL_FLAGS) ++ if (flags & ~__SCX_DSQ_ITER_USER_FLAGS) + return -EINVAL; + -+ kit->dsq = find_non_local_dsq(dsq_id); ++ kit->dsq = find_user_dsq(dsq_id); + if (!kit->dsq) + return -ENOENT; + + INIT_LIST_HEAD(&kit->cursor.node); -+ kit->cursor.is_bpf_iter_cursor = true; -+ kit->dsq_seq = READ_ONCE(kit->dsq->seq); -+ kit->flags = flags; ++ kit->cursor.flags |= SCX_DSQ_LNODE_ITER_CURSOR | flags; ++ kit->cursor.priv = READ_ONCE(kit->dsq->seq); + + return 0; +} @@ -7431,7 +8533,7 @@ index 000000000000..0dac88d0e578 +__bpf_kfunc struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) +{ + struct bpf_iter_scx_dsq_kern *kit = (void *)it; -+ bool rev = kit->flags & SCX_DSQ_ITER_REV; ++ bool rev = kit->cursor.flags & SCX_DSQ_ITER_REV; + struct task_struct *p; + unsigned long flags; + @@ -7452,7 +8554,7 @@ index 000000000000..0dac88d0e578 + */ + do { + p = nldsq_next_task(kit->dsq, p, rev); -+ } while (p && unlikely(u32_before(kit->dsq_seq, p->scx.dsq_seq))); ++ } while (p && unlikely(u32_before(kit->cursor.priv, p->scx.dsq_seq))); + + if (p) { + if (rev) @@ -7918,6 +9020,41 @@ index 000000000000..0dac88d0e578 + return cpu_rq(cpu); +} + ++/** ++ * scx_bpf_task_cgroup - Return the sched cgroup of a task ++ * @p: task of interest ++ * ++ * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with ++ * from the scheduler's POV. 
SCX operations should use this function to ++ * determine @p's current cgroup as, unlike following @p->cgroups, ++ * @p->sched_task_group is protected by @p's rq lock and thus atomic w.r.t. all ++ * rq-locked operations. Can be called on the parameter tasks of rq-locked ++ * operations. The restriction guarantees that @p's rq is locked by the caller. ++ */ ++#ifdef CONFIG_CGROUP_SCHED ++__bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) ++{ ++ struct task_group *tg = p->sched_task_group; ++ struct cgroup *cgrp = &cgrp_dfl_root.cgrp; ++ ++ if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p)) ++ goto out; ++ ++ /* ++ * A task_group may either be a cgroup or an autogroup. In the latter ++ * case, @tg->css.cgroup is %NULL. A task_group can't become the other ++ * kind once created. ++ */ ++ if (tg && tg->css.cgroup) ++ cgrp = tg->css.cgroup; ++ else ++ cgrp = &cgrp_dfl_root.cgrp; ++out: ++ cgroup_get(cgrp); ++ return cgrp; ++} ++#endif ++ +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(scx_kfunc_ids_any) @@ -7946,6 +9083,9 @@ index 000000000000..0dac88d0e578 +BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_cpu_rq) ++#ifdef CONFIG_CGROUP_SCHED ++BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) ++#endif +BTF_KFUNCS_END(scx_kfunc_ids_any) + +static const struct btf_kfunc_id_set scx_kfunc_set_any = { @@ -7969,10 +9109,6 @@ index 000000000000..0dac88d0e578 + * check using scx_kf_allowed(). 
+ */ + if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, -+ &scx_kfunc_set_sleepable)) || -+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, -+ &scx_kfunc_set_sleepable)) || -+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_select_cpu)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_enqueue_dispatch)) || @@ -7981,6 +9117,10 @@ index 000000000000..0dac88d0e578 + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_cpu_release)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, ++ &scx_kfunc_set_unlocked)) || ++ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, ++ &scx_kfunc_set_unlocked)) || ++ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_any)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, + &scx_kfunc_set_any)) || @@ -8019,10 +9159,10 @@ index 000000000000..0dac88d0e578 +__initcall(scx_init); diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h new file mode 100644 -index 000000000000..32d3a51f591a +index 000000000000..246019519231 --- /dev/null +++ b/kernel/sched/ext.h -@@ -0,0 +1,69 @@ +@@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst @@ -8092,11 +9232,33 @@ index 000000000000..32d3a51f591a +#else +static inline void scx_update_idle(struct rq *rq, bool idle) {} +#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++#ifdef CONFIG_EXT_GROUP_SCHED ++int scx_tg_online(struct task_group *tg); ++void scx_tg_offline(struct task_group *tg); ++int scx_cgroup_can_attach(struct cgroup_taskset *tset); ++void scx_move_task(struct task_struct *p); ++void scx_cgroup_finish_attach(void); ++void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); ++void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); ++void scx_group_set_idle(struct task_group *tg, bool idle); ++#else /* CONFIG_EXT_GROUP_SCHED */ ++static inline 
int scx_tg_online(struct task_group *tg) { return 0; } ++static inline void scx_tg_offline(struct task_group *tg) {} ++static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } ++static inline void scx_move_task(struct task_struct *p) {} ++static inline void scx_cgroup_finish_attach(void) {} ++static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {} ++static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {} ++static inline void scx_group_set_idle(struct task_group *tg, bool idle) {} ++#endif /* CONFIG_EXT_GROUP_SCHED */ ++#endif /* CONFIG_CGROUP_SCHED */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 483c137b9d3d..ab17954001ae 100644 +index 91b242e47db7..a36e37a674e8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c -@@ -3835,7 +3835,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, +@@ -3857,7 +3857,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, } } @@ -8106,16 +9268,7 @@ index 483c137b9d3d..ab17954001ae 100644 { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); -@@ -8697,7 +8697,7 @@ - /* - * BATCH and IDLE tasks do not preempt others. 
- */ -- if (unlikely(p->policy != SCHED_NORMAL)) -+ if (unlikely(!normal_policy(p->policy))) - return; - - cfs_rq = cfs_rq_of(se); -@@ -9647,29 +9647,18 @@ +@@ -9365,29 +9366,18 @@ static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) { static bool __update_blocked_others(struct rq *rq, bool *done) { @@ -8148,7 +9301,7 @@ index 483c137b9d3d..ab17954001ae 100644 } #ifdef CONFIG_FAIR_GROUP_SCHED -@@ -13207,6 +13198,7 @@ DEFINE_SCHED_CLASS(fair) = { +@@ -13233,6 +13223,7 @@ DEFINE_SCHED_CLASS(fair) = { .task_tick = task_tick_fair, .task_fork = task_fork_fair, @@ -8157,10 +9310,10 @@ index 483c137b9d3d..ab17954001ae 100644 .switched_from = switched_from_fair, .switched_to = switched_to_fair, diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c -index 6135fbe83d68..3b6540cc436a 100644 +index 6e78d071beb5..c7a218123b7a 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c -@@ -458,11 +458,13 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags) +@@ -452,11 +452,13 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags) static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) { @@ -8175,14 +9328,13 @@ index 6135fbe83d68..3b6540cc436a 100644 } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 38aeedd8a6cc..f952a4b99ead 100644 +index 432b43aa091c..48d893de632b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h -@@ -187,9 +187,19 @@ static inline int idle_policy(int policy) - { +@@ -192,9 +192,18 @@ static inline int idle_policy(int policy) return policy == SCHED_IDLE; } -+ + +static inline int normal_policy(int policy) +{ +#ifdef CONFIG_SCHED_CLASS_EXT @@ -8199,7 +9351,7 @@ index 38aeedd8a6cc..f952a4b99ead 100644 } static inline int rt_policy(int policy) -@@ -237,6 +247,24 @@ static inline void update_avg(u64 *avg, u64 sample) +@@ -244,6 +253,24 @@ static inline void update_avg(u64 *avg, u64 sample) #define shr_bound(val, shift) \ (val >> 
min_t(typeof(shift), shift, BITS_PER_TYPE(typeof(val)) - 1)) @@ -8224,7 +9376,50 @@ index 38aeedd8a6cc..f952a4b99ead 100644 /* * !! For sched_setattr_nocheck() (kernel) only !! * -@@ -475,6 +503,11 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) +@@ -397,16 +424,17 @@ struct cfs_bandwidth { + struct task_group { + struct cgroup_subsys_state css; + ++#ifdef CONFIG_GROUP_SCHED_WEIGHT ++ /* A positive value indicates that this is a SCHED_IDLE group. */ ++ int idle; ++#endif ++ + #ifdef CONFIG_FAIR_GROUP_SCHED + /* schedulable entities of this group on each CPU */ + struct sched_entity **se; + /* runqueue "owned" by this group on each CPU */ + struct cfs_rq **cfs_rq; + unsigned long shares; +- +- /* A positive value indicates that this is a SCHED_IDLE group. */ +- int idle; +- + #ifdef CONFIG_SMP + /* + * load_avg can be heavily contended at clock tick time, so put +@@ -424,6 +452,11 @@ struct task_group { + struct rt_bandwidth rt_bandwidth; + #endif + ++#ifdef CONFIG_EXT_GROUP_SCHED ++ u32 scx_flags; /* SCX_TG_* */ ++ u32 scx_weight; ++#endif ++ + struct rcu_head rcu; + struct list_head list; + +@@ -448,7 +481,7 @@ struct task_group { + + }; + +-#ifdef CONFIG_FAIR_GROUP_SCHED ++#ifdef CONFIG_GROUP_SCHED_WEIGHT + #define ROOT_TASK_GROUP_LOAD NICE_0_LOAD + + /* +@@ -479,6 +512,11 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) return walk_tg_tree_from(&root_task_group, down, up, data); } @@ -8236,11 +9431,20 @@ index 38aeedd8a6cc..f952a4b99ead 100644 extern int tg_nop(struct task_group *tg, void *data); #ifdef CONFIG_FAIR_GROUP_SCHED -@@ -583,6 +616,12 @@ do { \ - # define u64_u32_load(var) u64_u32_load_copy(var, var##_copy) - # define u64_u32_store(var, val) u64_u32_store_copy(var, var##_copy, val) +@@ -535,6 +573,9 @@ extern void set_task_rq_fair(struct sched_entity *se, + static inline void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next) { } + #endif /* CONFIG_SMP */ 
++#else /* !CONFIG_FAIR_GROUP_SCHED */ ++static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) { return 0; } ++static inline int sched_group_set_idle(struct task_group *tg, long idle) { return 0; } + #endif /* CONFIG_FAIR_GROUP_SCHED */ + + #else /* CONFIG_CGROUP_SCHED */ +@@ -588,6 +629,11 @@ do { \ + # define u64_u32_load(var) u64_u32_load_copy(var, var##_copy) + # define u64_u32_store(var, val) u64_u32_store_copy(var, var##_copy, val) -+struct rq; +struct balance_callback { + struct balance_callback *next; + void (*func)(struct rq *rq); @@ -8249,7 +9453,7 @@ index 38aeedd8a6cc..f952a4b99ead 100644 /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; -@@ -691,6 +730,42 @@ struct cfs_rq { +@@ -696,6 +742,43 @@ struct cfs_rq { #endif /* CONFIG_FAIR_GROUP_SCHED */ }; @@ -8263,6 +9467,7 @@ index 38aeedd8a6cc..f952a4b99ead 100644 + */ + SCX_RQ_ONLINE = 1 << 0, + SCX_RQ_CAN_STOP_TICK = 1 << 1, ++ SCX_RQ_BYPASSING = 1 << 3, + + SCX_RQ_IN_WAKEUP = 1 << 16, + SCX_RQ_IN_BALANCE = 1 << 17, @@ -8292,11 +9497,10 @@ index 38aeedd8a6cc..f952a4b99ead 100644 static inline int rt_bandwidth_enabled(void) { return sysctl_sched_rt_runtime >= 0; -@@ -988,12 +1063,6 @@ struct uclamp_rq { +@@ -996,11 +1079,6 @@ struct uclamp_rq { DECLARE_STATIC_KEY_FALSE(sched_uclamp_used); #endif /* CONFIG_UCLAMP_TASK */ --struct rq; -struct balance_callback { - struct balance_callback *next; - void (*func)(struct rq *rq); @@ -8305,7 +9509,7 @@ index 38aeedd8a6cc..f952a4b99ead 100644 /* * This is the main, per-CPU runqueue data structure. 
* -@@ -1036,6 +1105,9 @@ struct rq { +@@ -1043,6 +1121,9 @@ struct rq { struct cfs_rq cfs; struct rt_rq rt; struct dl_rq dl; @@ -8315,16 +9519,24 @@ index 38aeedd8a6cc..f952a4b99ead 100644 #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this CPU: */ -@@ -2278,6 +2350,8 @@ struct sched_class { +@@ -2291,13 +2372,15 @@ struct sched_class { + + void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags); + ++ int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); + struct task_struct *(*pick_next_task)(struct rq *rq); + void (*put_prev_task)(struct rq *rq, struct task_struct *p); void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first); + void (*switch_class)(struct rq *rq, struct task_struct *next); + #ifdef CONFIG_SMP - int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); +- int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags); -@@ -2305,8 +2379,11 @@ struct sched_class { + + struct task_struct * (*pick_task)(struct rq *rq); +@@ -2323,8 +2406,11 @@ struct sched_class { * cannot assume the switched_from/switched_to pair is serialized by * rq->lock. They are however serialized by p->pi_lock. 
*/ @@ -8336,7 +9548,7 @@ index 38aeedd8a6cc..f952a4b99ead 100644 void (*prio_changed) (struct rq *this_rq, struct task_struct *task, int oldprio); -@@ -2355,19 +2432,54 @@ const struct sched_class name##_sched_class \ +@@ -2373,19 +2459,54 @@ const struct sched_class name##_sched_class \ extern struct sched_class __sched_class_highest[]; extern struct sched_class __sched_class_lowest[]; @@ -8397,50 +9609,77 @@ index 38aeedd8a6cc..f952a4b99ead 100644 static inline bool sched_stop_runnable(struct rq *rq) { -@@ -2464,7 +2576,7 @@ extern void init_sched_dl_class(void); +@@ -2424,6 +2545,19 @@ extern void sched_balance_trigger(struct rq *rq); + extern int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx); + extern void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx); + ++static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu) ++{ ++ /* When not in the task's cpumask, no point in looking further. */ ++ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) ++ return false; ++ ++ /* Can @cpu run a user thread? 
*/ ++ if (!(p->flags & PF_KTHREAD) && !task_cpu_possible(cpu, p)) ++ return false; ++ ++ return true; ++} ++ + static inline cpumask_t *alloc_user_cpus_ptr(int node) + { + /* +@@ -2457,6 +2591,11 @@ extern int push_cpu_stop(void *arg); + + #else /* !CONFIG_SMP: */ + ++static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu) ++{ ++ return true; ++} ++ + static inline int __set_cpus_allowed_ptr(struct task_struct *p, + struct affinity_context *ctx) + { +@@ -2510,8 +2649,6 @@ extern void init_sched_dl_class(void); extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); -extern void reweight_task(struct task_struct *p, const struct load_weight *lw); -+extern void __setscheduler_prio(struct task_struct *p, int prio); - +- extern void resched_curr(struct rq *rq); extern void resched_cpu(int cpu); -@@ -2542,6 +2654,12 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) - extern void activate_task(struct rq *rq, struct task_struct *p, int flags); - extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); - -+extern void check_class_changing(struct rq *rq, struct task_struct *p, -+ const struct sched_class *prev_class); -+extern void check_class_changed(struct rq *rq, struct task_struct *p, -+ const struct sched_class *prev_class, -+ int oldprio); -+ - extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); - #ifdef CONFIG_PREEMPT_RT -@@ -3007,6 +3125,9 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} - #endif +@@ -3056,6 +3193,8 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { } #ifdef CONFIG_SMP -+ + +bool update_other_load_avgs(struct rq *rq); + unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, unsigned long *min, unsigned long *max); -@@ -3049,6 +3170,8 @@ static inline unsigned long cpu_util_rt(struct rq *rq) - { +@@ -3099,6 +3238,8 @@ static inline unsigned long cpu_util_rt(struct rq *rq) return 
READ_ONCE(rq->avg_rt.util_avg); } + +#else /* !CONFIG_SMP */ +static inline bool update_other_load_avgs(struct rq *rq) { return false; } - #endif + #endif /* CONFIG_SMP */ #ifdef CONFIG_UCLAMP_TASK -@@ -3481,4 +3604,24 @@ static inline void init_sched_mm_cid(struct task_struct *t) { } - extern u64 avg_vruntime(struct cfs_rq *cfs_rq); - extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); +@@ -3609,6 +3750,8 @@ extern void set_load_weight(struct task_struct *p, bool update_load); + extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags); + extern void dequeue_task(struct rq *rq, struct task_struct *p, int flags); + ++extern void check_class_changing(struct rq *rq, struct task_struct *p, ++ const struct sched_class *prev_class); + extern void check_class_changed(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class, + int oldprio); +@@ -3629,4 +3772,24 @@ static inline void balance_callbacks(struct rq *rq, struct balance_callback *hea + + #endif +#ifdef CONFIG_SCHED_CLASS_EXT +/* @@ -8463,11 +9702,76 @@ index 38aeedd8a6cc..f952a4b99ead 100644 +#include "ext.h" + #endif /* _KERNEL_SCHED_SCHED_H */ +diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c +index ae1b42775ef9..4fa59c9f69ac 100644 +--- a/kernel/sched/syscalls.c ++++ b/kernel/sched/syscalls.c +@@ -259,6 +259,25 @@ int sched_core_idle_cpu(int cpu) + #endif + + #ifdef CONFIG_SMP ++/* ++ * Load avg and utiliztion metrics need to be updated periodically and before ++ * consumption. This function updates the metrics for all subsystems except for ++ * the fair class. @rq must be locked and have its clock updated. 
++ */ ++bool update_other_load_avgs(struct rq *rq) ++{ ++ u64 now = rq_clock_pelt(rq); ++ const struct sched_class *curr_class = rq->curr->sched_class; ++ unsigned long hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); ++ ++ lockdep_assert_rq_held(rq); ++ ++ return update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | ++ update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | ++ update_hw_load_avg(now, rq, hw_pressure) | ++ update_irq_load_avg(rq, 0); ++} ++ + /* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. +@@ -695,6 +714,10 @@ int __sched_setscheduler(struct task_struct *p, + goto unlock; + } + ++ retval = scx_check_setscheduler(p, policy); ++ if (retval) ++ goto unlock; ++ + /* + * If not changing anything there's no need to proceed further, + * but store a possible modification of reset_on_fork. +@@ -797,6 +820,7 @@ int __sched_setscheduler(struct task_struct *p, + __setscheduler_prio(p, newprio); + } + __setscheduler_uclamp(p, attr); ++ check_class_changing(rq, p, prev_class); + + if (queued) { + /* +@@ -1602,6 +1626,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy) + case SCHED_NORMAL: + case SCHED_BATCH: + case SCHED_IDLE: ++ case SCHED_EXT: + ret = 0; + break; + } +@@ -1629,6 +1654,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) + case SCHED_NORMAL: + case SCHED_BATCH: + case SCHED_IDLE: ++ case SCHED_EXT: + ret = 0; + } + return ret; diff --git a/lib/dump_stack.c b/lib/dump_stack.c -index 222c6d6c8281..9581ef4efec5 100644 +index 1a996fbbf50a..388da1aea14a 100644 --- a/lib/dump_stack.c +++ b/lib/dump_stack.c -@@ -68,6 +68,7 @@ void dump_stack_print_info(const char *log_lvl) +@@ -73,6 +73,7 @@ void dump_stack_print_info(const char *log_lvl) print_worker_info(log_lvl, current); print_stop_info(log_lvl, current); @@ -8479,7 +9783,7 @@ diff --git a/tools/Makefile b/tools/Makefile index 276f5d0d53a4..278d24723b74 100644 
--- a/tools/Makefile +++ b/tools/Makefile -@@ -28,6 +28,7 @@ include scripts/Makefile.include +@@ -28,6 +28,7 @@ help: @echo ' pci - PCI tools' @echo ' perf - Linux performance measurement and analysis tool' @echo ' selftests - various kernel selftests' @@ -8497,7 +9801,7 @@ index 276f5d0d53a4..278d24723b74 100644 selftests: FORCE $(call descend,testing/$@) -@@ -184,6 +188,9 @@ install: acpi_install counter_install cpupower_install gpio_install \ +@@ -184,6 +188,9 @@ perf_clean: $(Q)mkdir -p $(PERF_O) . $(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir= clean @@ -8526,7 +9830,7 @@ index 000000000000..d6264fe1c8cd +build/ diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile new file mode 100644 -index 000000000000..bf7e108f5ae1 +index 000000000000..ca3815e572d8 --- /dev/null +++ b/tools/sched_ext/Makefile @@ -0,0 +1,246 @@ @@ -8708,7 +10012,7 @@ index 000000000000..bf7e108f5ae1 + +SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR) + -+c-sched-targets = scx_simple scx_qmap scx_central ++c-sched-targets = scx_simple scx_qmap scx_central scx_flatcg + +$(addprefix $(BINDIR)/,$(c-sched-targets)): \ + $(BINDIR)/%: \ @@ -8778,10 +10082,10 @@ index 000000000000..bf7e108f5ae1 +.SECONDARY: diff --git a/tools/sched_ext/README.md b/tools/sched_ext/README.md new file mode 100644 -index 000000000000..8efe70cc4363 +index 000000000000..16a42e4060f6 --- /dev/null +++ b/tools/sched_ext/README.md -@@ -0,0 +1,258 @@ +@@ -0,0 +1,270 @@ +SCHED_EXT EXAMPLE SCHEDULERS +============================ + @@ -8976,6 +10280,18 @@ index 000000000000..8efe70cc4363 +infinite slices and no timer ticks allows the VM to avoid unnecessary expensive +vmexits. + ++## scx_flatcg ++ ++A flattened cgroup hierarchy scheduler. This scheduler implements hierarchical ++weight-based cgroup CPU control by flattening the cgroup hierarchy into a single ++layer, by compounding the active weight share at each level. 
The effect of this ++is a much more performant CPU controller, which does not need to descend down ++cgroup trees in order to properly compute a cgroup's share. ++ ++Similar to scx_simple, in limited scenarios, this scheduler can perform ++reasonably well on single socket-socket systems with a unified L3 cache and show ++significantly lowered hierarchical scheduling overhead. ++ + +# Troubleshooting + @@ -9059,10 +10375,10 @@ index 000000000000..ad7d139ce907 + */ diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h new file mode 100644 -index 000000000000..20280df62857 +index 000000000000..225f61f9bfca --- /dev/null +++ b/tools/sched_ext/include/scx/common.bpf.h -@@ -0,0 +1,401 @@ +@@ -0,0 +1,427 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. @@ -9072,7 +10388,13 @@ index 000000000000..20280df62857 +#ifndef __SCX_COMMON_BPF_H +#define __SCX_COMMON_BPF_H + ++#ifdef LSP ++#define __bpf__ ++#include "../vmlinux/vmlinux.h" ++#else +#include "vmlinux.h" ++#endif ++ +#include +#include +#include @@ -9100,6 +10422,10 @@ index 000000000000..20280df62857 +u32 scx_bpf_dispatch_nr_slots(void) __ksym; +void scx_bpf_dispatch_cancel(void) __ksym; +bool scx_bpf_consume(u64 dsq_id) __ksym; ++void scx_bpf_dispatch_from_dsq_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym; ++void scx_bpf_dispatch_from_dsq_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym; ++bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; ++bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; +u32 scx_bpf_reenqueue_local(void) __ksym; +void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; +s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; @@ -9126,6 +10452,13 @@ index 000000000000..20280df62857 +bool scx_bpf_task_running(const 
struct task_struct *p) __ksym; +s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; +struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; ++struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym; ++ ++/* ++ * Use the following as @it__iter when calling ++ * scx_bpf_dispatch[_vtime]_from_dsq() from within bpf_for_each() loops. ++ */ ++#define BPF_FOR_EACH_ITER (&___it) + +static inline __attribute__((format(printf, 1, 2))) +void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} @@ -9363,6 +10696,15 @@ index 000000000000..20280df62857 +u32 bpf_cpumask_any_distribute(const struct cpumask *cpumask) __ksym; +u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, + const struct cpumask *src2) __ksym; ++u32 bpf_cpumask_weight(const struct cpumask *cpumask) __ksym; ++ ++/* ++ * Access a cpumask in read-only mode (typically to check bits). ++ */ ++const struct cpumask *cast_mask(struct bpf_cpumask *mask) ++{ ++ return (const struct cpumask *)mask; ++} + +/* rcu */ +void bpf_rcu_read_lock(void) __ksym; @@ -9547,10 +10889,10 @@ index 000000000000..5b0f90152152 +#endif /* __SCHED_EXT_COMMON_H */ diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h new file mode 100644 -index 000000000000..3d2fe1208900 +index 000000000000..e5afe9efd3f3 --- /dev/null +++ b/tools/sched_ext/include/scx/compat.bpf.h -@@ -0,0 +1,28 @@ +@@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. @@ -9568,6 +10910,25 @@ index 000000000000..3d2fe1208900 + __ret; \ +}) + ++/* v6.12: 819513666966 ("sched_ext: Add cgroup support") */ ++#define __COMPAT_scx_bpf_task_cgroup(p) \ ++ (bpf_ksym_exists(scx_bpf_task_cgroup) ? \ ++ scx_bpf_task_cgroup((p)) : NULL) ++ ++/* v6.12: 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()") */ ++#define __COMPAT_scx_bpf_dispatch_from_dsq_set_slice(it, slice) \ ++ (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_slice) ? 
\ ++ scx_bpf_dispatch_from_dsq_set_slice((it), (slice)) : (void)0) ++#define __COMPAT_scx_bpf_dispatch_from_dsq_set_vtime(it, vtime) \ ++ (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_vtime) ? \ ++ scx_bpf_dispatch_from_dsq_set_vtime((it), (vtime)) : (void)0) ++#define __COMPAT_scx_bpf_dispatch_from_dsq(it, p, dsq_id, enq_flags) \ ++ (bpf_ksym_exists(scx_bpf_dispatch_from_dsq) ? \ ++ scx_bpf_dispatch_from_dsq((it), (p), (dsq_id), (enq_flags)) : false) ++#define __COMPAT_scx_bpf_dispatch_vtime_from_dsq(it, p, dsq_id, enq_flags) \ ++ (bpf_ksym_exists(scx_bpf_dispatch_vtime_from_dsq) ? \ ++ scx_bpf_dispatch_vtime_from_dsq((it), (p), (dsq_id), (enq_flags)) : false) ++ +/* + * Define sched_ext_ops. This may be expanded to define multiple variants for + * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH(). @@ -9581,10 +10942,10 @@ index 000000000000..3d2fe1208900 +#endif /* __SCX_COMPAT_BPF_H */ diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h new file mode 100644 -index 000000000000..1bf8eddf20c2 +index 000000000000..cc56ff9aa252 --- /dev/null +++ b/tools/sched_ext/include/scx/compat.h -@@ -0,0 +1,187 @@ +@@ -0,0 +1,186 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. @@ -9753,14 +11114,13 @@ index 000000000000..1bf8eddf20c2 + * To maintain compatibility with older libbpf while avoiding trying to attach + * twice, disable the autoattach feature on newer libbpf. 
+ */ -+/* BACKPORT - bpf_mpa__set_autoattach() not available yet, commented out */ -+/*#if LIBBPF_MAJOR_VERSION > 1 || \ ++#if LIBBPF_MAJOR_VERSION > 1 || \ + (LIBBPF_MAJOR_VERSION == 1 && LIBBPF_MINOR_VERSION >= 5) +#define __SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name) \ + bpf_map__set_autoattach((__skel)->maps.__ops_name, false) -+#else*/ ++#else +#define __SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name) do {} while (0) -+/*#endif*/ ++#endif + +#define SCX_OPS_ATTACH(__skel, __ops_name, __scx_name) ({ \ + struct bpf_link *__link; \ @@ -9774,10 +11134,10 @@ index 000000000000..1bf8eddf20c2 +#endif /* __SCX_COMPAT_H */ diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h new file mode 100644 -index 000000000000..891693ee604e +index 000000000000..8ce2734402e1 --- /dev/null +++ b/tools/sched_ext/include/scx/user_exit_info.h -@@ -0,0 +1,111 @@ +@@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Define struct user_exit_info which is shared between BPF and userspace parts @@ -9805,7 +11165,11 @@ index 000000000000..891693ee604e + +#ifdef __bpf__ + ++#ifdef LSP ++#include "../vmlinux/vmlinux.h" ++#else +#include "vmlinux.h" ++#endif +#include + +#define UEI_DEFINE(__name) \ @@ -9891,7 +11255,7 @@ index 000000000000..891693ee604e +#endif /* __USER_EXIT_INFO_H */ diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c new file mode 100644 -index 000000000000..1d8fd570eaa7 +index 000000000000..8dd8eb73b6b8 --- /dev/null +++ b/tools/sched_ext/scx_central.bpf.c @@ -0,0 +1,361 @@ @@ -10095,7 +11459,7 @@ index 000000000000..1d8fd570eaa7 + + /* central's gimme is never set */ + gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids); -+ if (gimme && !*gimme) ++ if (!gimme || !*gimme) + continue; + + if (dispatch_to_cpu(cpu)) @@ -10215,79 +11579,1190 @@ index 000000000000..1d8fd570eaa7 + return -EINVAL; + } + -+ bpf_timer_init(timer, ¢ral_timer, CLOCK_MONOTONIC); -+ 
bpf_timer_set_callback(timer, central_timerfn); ++ bpf_timer_init(timer, &central_timer, CLOCK_MONOTONIC); ++ bpf_timer_set_callback(timer, central_timerfn); ++ ++ ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, BPF_F_TIMER_CPU_PIN); ++ /* ++ * BPF_F_TIMER_CPU_PIN is pretty new (>=6.7). If we're running in a ++ * kernel which doesn't have it, bpf_timer_start() will return -EINVAL. ++ * Retry without the PIN. This would be the perfect use case for ++ * bpf_core_enum_value_exists() but the enum type doesn't have a name ++ * and can't be used with bpf_core_enum_value_exists(). Oh well... ++ */ ++ if (ret == -EINVAL) { ++ timer_pinned = false; ++ ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, 0); ++ } ++ if (ret) ++ scx_bpf_error("bpf_timer_start failed (%d)", ret); ++ return ret; ++} ++ ++void BPF_STRUCT_OPS(central_exit, struct scx_exit_info *ei) ++{ ++ UEI_RECORD(uei, ei); ++} ++ ++SCX_OPS_DEFINE(central_ops, ++ /* ++ * We are offloading all scheduling decisions to the central CPU ++ * and thus being the last task on a given CPU doesn't mean ++ * anything special. Enqueue the last tasks like any other tasks. ++ */ ++ .flags = SCX_OPS_ENQ_LAST, ++ ++ .select_cpu = (void *)central_select_cpu, ++ .enqueue = (void *)central_enqueue, ++ .dispatch = (void *)central_dispatch, ++ .running = (void *)central_running, ++ .stopping = (void *)central_stopping, ++ .init = (void *)central_init, ++ .exit = (void *)central_exit, ++ .name = "central"); +diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c +new file mode 100644 +index 000000000000..21deea320bd7 +--- /dev/null ++++ b/tools/sched_ext/scx_central.c +@@ -0,0 +1,135 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
++ * Copyright (c) 2022 Tejun Heo ++ * Copyright (c) 2022 David Vernet ++ */ ++#define _GNU_SOURCE ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "scx_central.bpf.skel.h" ++ ++const char help_fmt[] = ++"A central FIFO sched_ext scheduler.\n" ++"\n" ++"See the top-level comment in .bpf.c for more details.\n" ++"\n" ++"Usage: %s [-s SLICE_US] [-c CPU]\n" ++"\n" ++" -s SLICE_US Override slice duration\n" ++" -c CPU Override the central CPU (default: 0)\n" ++" -v Print libbpf debug messages\n" ++" -h Display this help and exit\n"; ++ ++static bool verbose; ++static volatile int exit_req; ++ ++static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) ++{ ++ if (level == LIBBPF_DEBUG && !verbose) ++ return 0; ++ return vfprintf(stderr, format, args); ++} ++ ++static void sigint_handler(int dummy) ++{ ++ exit_req = 1; ++} ++ ++int main(int argc, char **argv) ++{ ++ struct scx_central *skel; ++ struct bpf_link *link; ++ __u64 seq = 0, ecode; ++ __s32 opt; ++ cpu_set_t *cpuset; ++ ++ libbpf_set_print(libbpf_print_fn); ++ signal(SIGINT, sigint_handler); ++ signal(SIGTERM, sigint_handler); ++restart: ++ skel = SCX_OPS_OPEN(central_ops, scx_central); ++ ++ skel->rodata->central_cpu = 0; ++ skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); ++ ++ while ((opt = getopt(argc, argv, "s:c:vh")) != -1) { ++ switch (opt) { ++ case 's': ++ skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; ++ break; ++ case 'c': ++ skel->rodata->central_cpu = strtoul(optarg, NULL, 0); ++ break; ++ case 'v': ++ verbose = true; ++ break; ++ default: ++ fprintf(stderr, help_fmt, basename(argv[0])); ++ return opt != 'h'; ++ } ++ } ++ ++ /* Resize arrays so their element count is equal to cpu count.
*/ ++ RESIZE_ARRAY(skel, data, cpu_gimme_task, skel->rodata->nr_cpu_ids); ++ RESIZE_ARRAY(skel, data, cpu_started_at, skel->rodata->nr_cpu_ids); ++ ++ SCX_OPS_LOAD(skel, central_ops, scx_central, uei); ++ ++ /* ++ * Affinitize the loading thread to the central CPU, as: ++ * - That's where the BPF timer is first invoked in the BPF program. ++ * - We probably don't want this user space component to take up a core ++ * from a task that would benefit from avoiding preemption on one of ++ * the tickless cores. ++ * ++ * Until BPF supports pinning the timer, it's not guaranteed that it ++ * will always be invoked on the central CPU. In practice, this ++ * suffices the majority of the time. The set is CPU_ALLOC()'d, so use the *_S() macros with its real size. ++ */ ++ cpuset = CPU_ALLOC(skel->rodata->nr_cpu_ids); ++ SCX_BUG_ON(!cpuset, "Failed to allocate cpuset"); ++ CPU_ZERO_S(CPU_ALLOC_SIZE(skel->rodata->nr_cpu_ids), cpuset); ++ CPU_SET_S(skel->rodata->central_cpu, CPU_ALLOC_SIZE(skel->rodata->nr_cpu_ids), cpuset); ++ SCX_BUG_ON(sched_setaffinity(0, CPU_ALLOC_SIZE(skel->rodata->nr_cpu_ids), cpuset), ++ "Failed to affinitize to central CPU %d (max %d)", ++ skel->rodata->central_cpu, skel->rodata->nr_cpu_ids - 1); ++ CPU_FREE(cpuset); ++ ++ link = SCX_OPS_ATTACH(skel, central_ops, scx_central); ++ ++ if (!skel->data->timer_pinned) ++ printf("WARNING : BPF_F_TIMER_CPU_PIN not available, timer not pinned to central\n"); ++ ++ while (!exit_req && !UEI_EXITED(skel, uei)) { ++ printf("[SEQ %llu]\n", seq++); ++ printf("total :%10" PRIu64 " local:%10" PRIu64 " queued:%10" PRIu64 " lost:%10" PRIu64 "\n", ++ skel->bss->nr_total, ++ skel->bss->nr_locals, ++ skel->bss->nr_queued, ++ skel->bss->nr_lost_pids); ++ printf("timer :%10" PRIu64 " dispatch:%10" PRIu64 " mismatch:%10" PRIu64 " retry:%10" PRIu64 "\n", ++ skel->bss->nr_timers, ++ skel->bss->nr_dispatches, ++ skel->bss->nr_mismatches, ++ skel->bss->nr_retries); ++ printf("overflow:%10" PRIu64 "\n", ++ skel->bss->nr_overflows); ++ fflush(stdout); ++ sleep(1); ++ } ++ ++ bpf_link__destroy(link); ++ ecode = UEI_REPORT(skel, uei); ++ scx_central__destroy(skel); ++ ++ if (UEI_ECODE_RESTART(ecode)) ++ goto
restart; ++ return 0; ++} +diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c +new file mode 100644 +index 000000000000..b722baf6da4b +--- /dev/null ++++ b/tools/sched_ext/scx_flatcg.bpf.c +@@ -0,0 +1,957 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * A demo sched_ext flattened cgroup hierarchy scheduler. It implements ++ * hierarchical weight-based cgroup CPU control by flattening the cgroup ++ * hierarchy into a single layer by compounding the active weight share at each ++ * level. Consider the following hierarchy with weights in parentheses: ++ * ++ * R + A (100) + B (100) ++ * | \ C (100) ++ * \ D (200) ++ * ++ * Ignoring the root and threaded cgroups, only B, C and D can contain tasks. ++ * Let's say all three have runnable tasks. The total share that each of these ++ * three cgroups is entitled to can be calculated by compounding its share at ++ * each level. ++ * ++ * For example, B is competing against C and in that competition its share is ++ * 100/(100+100) == 1/2. At its parent level, A is competing against D and A's ++ * share in that competition is 100/(200+100) == 1/3. B's eventual share in the ++ * system can be calculated by multiplying the two shares, 1/2 * 1/3 == 1/6. C's ++ * eventual share is the same at 1/6. D is only competing at the top level and ++ * its share is 200/(100+200) == 2/3. ++ * ++ * So, instead of hierarchically scheduling level-by-level, we can consider it ++ * as B, C and D competing each other with respective share of 1/6, 1/6 and 2/3 ++ * and keep updating the eventual shares as the cgroups' runnable states change. ++ * ++ * This flattening of hierarchy can bring a substantial performance gain when ++ * the cgroup hierarchy is nested multiple levels.
In a simple benchmark using ++ * wrk[8] on apache serving a CGI script calculating sha1sum of a small file, it ++ * outperforms CFS by ~3% with CPU controller disabled and by ~10% with two ++ * apache instances competing with 2:1 weight ratio nested four levels deep. ++ * ++ * However, the gain comes at the cost of not being able to properly handle ++ * thundering herd of cgroups. For example, if many cgroups which are nested ++ * behind a low priority parent cgroup wake up around the same time, they may be ++ * able to consume more CPU cycles than they are entitled to. In many use cases, ++ * this isn't a real concern especially given the performance gain. Also, there ++ * are ways to mitigate the problem further by e.g. introducing an extra ++ * scheduling layer on cgroup delegation boundaries. ++ * ++ * The scheduler first picks the cgroup to run and then schedules the tasks ++ * within by using nested weighted vtime scheduling by default. The ++ * cgroup-internal scheduling can be switched to FIFO with the -f option. ++ */ ++#include ++#include "scx_flatcg.h" ++ ++/* ++ * Maximum amount of retries to find a valid cgroup.
++ */ ++enum { ++ FALLBACK_DSQ = 0, ++ CGROUP_MAX_RETRIES = 1024, ++}; ++ ++char _license[] SEC("license") = "GPL"; ++ ++const volatile u32 nr_cpus = 32; /* !0 for veristat, set during init */ ++const volatile u64 cgrp_slice_ns = SCX_SLICE_DFL; ++const volatile bool fifo_sched; ++ ++u64 cvtime_now; ++UEI_DEFINE(uei); ++ ++struct { ++ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); ++ __type(key, u32); ++ __type(value, u64); ++ __uint(max_entries, FCG_NR_STATS); ++} stats SEC(".maps"); ++ ++static void stat_inc(enum fcg_stat_idx idx) ++{ ++ u32 idx_v = idx; ++ ++ u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx_v); ++ if (cnt_p) ++ (*cnt_p)++; ++} ++ ++struct fcg_cpu_ctx { ++ u64 cur_cgid; ++ u64 cur_at; ++}; ++ ++struct { ++ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); ++ __type(key, u32); ++ __type(value, struct fcg_cpu_ctx); ++ __uint(max_entries, 1); ++} cpu_ctx SEC(".maps"); ++ ++struct { ++ __uint(type, BPF_MAP_TYPE_CGRP_STORAGE); ++ __uint(map_flags, BPF_F_NO_PREALLOC); ++ __type(key, int); ++ __type(value, struct fcg_cgrp_ctx); ++} cgrp_ctx SEC(".maps"); ++ ++struct cgv_node { ++ struct bpf_rb_node rb_node; ++ __u64 cvtime; ++ __u64 cgid; ++}; ++ ++private(CGV_TREE) struct bpf_spin_lock cgv_tree_lock; ++private(CGV_TREE) struct bpf_rb_root cgv_tree __contains(cgv_node, rb_node); ++ ++struct cgv_node_stash { ++ struct cgv_node __kptr *node; ++}; ++ ++struct { ++ __uint(type, BPF_MAP_TYPE_HASH); ++ __uint(max_entries, 16384); ++ __type(key, __u64); ++ __type(value, struct cgv_node_stash); ++} cgv_node_stash SEC(".maps"); ++ ++struct fcg_task_ctx { ++ u64 bypassed_at; ++}; ++ ++struct { ++ __uint(type, BPF_MAP_TYPE_TASK_STORAGE); ++ __uint(map_flags, BPF_F_NO_PREALLOC); ++ __type(key, int); ++ __type(value, struct fcg_task_ctx); ++} task_ctx SEC(".maps"); ++ ++/* gets inc'd on weight tree changes to expire the cached hweights */ ++u64 hweight_gen = 1; ++ ++static u64 div_round_up(u64 dividend, u64 divisor) ++{ ++ return (dividend + divisor - 1) / divisor; ++} ++ ++static 
bool vtime_before(u64 a, u64 b) ++{ ++ return (s64)(a - b) < 0; ++} ++ ++static bool cgv_node_less(struct bpf_rb_node *a, const struct bpf_rb_node *b) ++{ ++ struct cgv_node *cgc_a, *cgc_b; ++ ++ cgc_a = container_of(a, struct cgv_node, rb_node); ++ cgc_b = container_of(b, struct cgv_node, rb_node); ++ ++ return cgc_a->cvtime < cgc_b->cvtime; ++} ++ ++static struct fcg_cpu_ctx *find_cpu_ctx(void) ++{ ++ struct fcg_cpu_ctx *cpuc; ++ u32 idx = 0; ++ ++ cpuc = bpf_map_lookup_elem(&cpu_ctx, &idx); ++ if (!cpuc) { ++ scx_bpf_error("cpu_ctx lookup failed"); ++ return NULL; ++ } ++ return cpuc; ++} ++ ++static struct fcg_cgrp_ctx *find_cgrp_ctx(struct cgroup *cgrp) ++{ ++ struct fcg_cgrp_ctx *cgc; ++ ++ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0); ++ if (!cgc) { ++ scx_bpf_error("cgrp_ctx lookup failed for cgid %llu", cgrp->kn->id); ++ return NULL; ++ } ++ return cgc; ++} ++ ++static struct fcg_cgrp_ctx *find_ancestor_cgrp_ctx(struct cgroup *cgrp, int level) ++{ ++ struct fcg_cgrp_ctx *cgc; ++ ++ cgrp = bpf_cgroup_ancestor(cgrp, level); ++ if (!cgrp) { ++ scx_bpf_error("ancestor cgroup lookup failed"); ++ return NULL; ++ } ++ ++ cgc = find_cgrp_ctx(cgrp); ++ if (!cgc) ++ scx_bpf_error("ancestor cgrp_ctx lookup failed"); ++ bpf_cgroup_release(cgrp); ++ return cgc; ++} ++ ++static void cgrp_refresh_hweight(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc) ++{ ++ int level; ++ ++ if (!cgc->nr_active) { ++ stat_inc(FCG_STAT_HWT_SKIP); ++ return; ++ } ++ ++ if (cgc->hweight_gen == hweight_gen) { ++ stat_inc(FCG_STAT_HWT_CACHE); ++ return; ++ } ++ ++ stat_inc(FCG_STAT_HWT_UPDATES); ++ bpf_for(level, 0, cgrp->level + 1) { ++ struct fcg_cgrp_ctx *cgc; ++ bool is_active; ++ ++ cgc = find_ancestor_cgrp_ctx(cgrp, level); ++ if (!cgc) ++ break; ++ ++ if (!level) { ++ cgc->hweight = FCG_HWEIGHT_ONE; ++ cgc->hweight_gen = hweight_gen; ++ } else { ++ struct fcg_cgrp_ctx *pcgc; ++ ++ pcgc = find_ancestor_cgrp_ctx(cgrp, level - 1); ++ if (!pcgc) ++ break; ++ ++ /* ++ * We can be 
opportunistic here and not grab the ++ * cgv_tree_lock and deal with the occasional races. ++ * However, hweight updates are already cached and ++ * relatively low-frequency. Let's just do the ++ * straightforward thing. ++ */ ++ bpf_spin_lock(&cgv_tree_lock); ++ is_active = cgc->nr_active; ++ if (is_active) { ++ cgc->hweight_gen = pcgc->hweight_gen; ++ cgc->hweight = ++ div_round_up(pcgc->hweight * cgc->weight, ++ pcgc->child_weight_sum); ++ } ++ bpf_spin_unlock(&cgv_tree_lock); ++ ++ if (!is_active) { ++ stat_inc(FCG_STAT_HWT_RACE); ++ break; ++ } ++ } ++ } ++} ++ ++static void cgrp_cap_budget(struct cgv_node *cgv_node, struct fcg_cgrp_ctx *cgc) ++{ ++ u64 delta, cvtime, max_budget; ++ ++ /* ++ * A node which is on the rbtree can't be pointed to from elsewhere yet ++ * and thus can't be updated and repositioned. Instead, we collect the ++ * vtime deltas separately and apply it asynchronously here. ++ */ ++ delta = __sync_fetch_and_sub(&cgc->cvtime_delta, cgc->cvtime_delta); ++ cvtime = cgv_node->cvtime + delta; ++ ++ /* ++ * Allow a cgroup to carry the maximum budget proportional to its ++ * hweight such that a full-hweight cgroup can immediately take up half ++ * of the CPUs at the most while staying at the front of the rbtree. 
++ */ ++ max_budget = (cgrp_slice_ns * nr_cpus * cgc->hweight) / ++ (2 * FCG_HWEIGHT_ONE); ++ if (vtime_before(cvtime, cvtime_now - max_budget)) ++ cvtime = cvtime_now - max_budget; ++ ++ cgv_node->cvtime = cvtime; ++} ++ ++static void cgrp_enqueued(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc) ++{ ++ struct cgv_node_stash *stash; ++ struct cgv_node *cgv_node; ++ u64 cgid = cgrp->kn->id; ++ ++ /* paired with cmpxchg in try_pick_next_cgroup() */ ++ if (__sync_val_compare_and_swap(&cgc->queued, 0, 1)) { ++ stat_inc(FCG_STAT_ENQ_SKIP); ++ return; ++ } ++ ++ stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid); ++ if (!stash) { ++ scx_bpf_error("cgv_node lookup failed for cgid %llu", cgid); ++ return; ++ } ++ ++ /* NULL if the node is already on the rbtree */ ++ cgv_node = bpf_kptr_xchg(&stash->node, NULL); ++ if (!cgv_node) { ++ stat_inc(FCG_STAT_ENQ_RACE); ++ return; ++ } ++ ++ bpf_spin_lock(&cgv_tree_lock); ++ cgrp_cap_budget(cgv_node, cgc); ++ bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less); ++ bpf_spin_unlock(&cgv_tree_lock); ++} ++ ++static void set_bypassed_at(struct task_struct *p, struct fcg_task_ctx *taskc) ++{ ++ /* ++ * Tell fcg_stopping() that this bypassed the regular scheduling path ++ * and should be force charged to the cgroup. 0 is used to indicate that ++ * the task isn't bypassing, so if the current runtime is 0, go back by ++ * one nanosecond. ++ */ ++ taskc->bypassed_at = p->se.sum_exec_runtime ?: (u64)-1; ++} ++ ++s32 BPF_STRUCT_OPS(fcg_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) ++{ ++ struct fcg_task_ctx *taskc; ++ bool is_idle = false; ++ s32 cpu; ++ ++ cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle); ++ ++ taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); ++ if (!taskc) { ++ scx_bpf_error("task_ctx lookup failed"); ++ return cpu; ++ } ++ ++ /* ++ * If select_cpu_dfl() is recommending local enqueue, the target CPU is ++ * idle. 
Follow it and charge the cgroup later in fcg_stopping() after ++ * the fact. ++ */ ++ if (is_idle) { ++ set_bypassed_at(p, taskc); ++ stat_inc(FCG_STAT_LOCAL); ++ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); ++ } ++ ++ return cpu; ++} ++ ++void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags) ++{ ++ struct fcg_task_ctx *taskc; ++ struct cgroup *cgrp; ++ struct fcg_cgrp_ctx *cgc; ++ ++ taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); ++ if (!taskc) { ++ scx_bpf_error("task_ctx lookup failed"); ++ return; ++ } ++ ++ /* ++ * Use the direct dispatching and force charging to deal with tasks with ++ * custom affinities so that we don't have to worry about per-cgroup ++ * dq's containing tasks that can't be executed from some CPUs. ++ */ ++ if (p->nr_cpus_allowed != nr_cpus) { ++ set_bypassed_at(p, taskc); ++ ++ /* ++ * The global dq is deprioritized as we don't want to let tasks ++ * to boost themselves by constraining its cpumask. The ++ * deprioritization is rather severe, so let's not apply that to ++ * per-cpu kernel threads. This is ham-fisted. We probably wanna ++ * implement per-cgroup fallback dq's instead so that we have ++ * more control over when tasks with custom cpumask get issued. ++ */ ++ if (p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD)) { ++ stat_inc(FCG_STAT_LOCAL); ++ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); ++ } else { ++ stat_inc(FCG_STAT_GLOBAL); ++ scx_bpf_dispatch(p, FALLBACK_DSQ, SCX_SLICE_DFL, enq_flags); ++ } ++ return; ++ } ++ ++ cgrp = __COMPAT_scx_bpf_task_cgroup(p); ++ cgc = find_cgrp_ctx(cgrp); ++ if (!cgc) ++ goto out_release; ++ ++ if (fifo_sched) { ++ scx_bpf_dispatch(p, cgrp->kn->id, SCX_SLICE_DFL, enq_flags); ++ } else { ++ u64 tvtime = p->scx.dsq_vtime; ++ ++ /* ++ * Limit the amount of budget that an idling task can accumulate ++ * to one slice. 
++ */ ++ if (vtime_before(tvtime, cgc->tvtime_now - SCX_SLICE_DFL)) ++ tvtime = cgc->tvtime_now - SCX_SLICE_DFL; ++ ++ scx_bpf_dispatch_vtime(p, cgrp->kn->id, SCX_SLICE_DFL, ++ tvtime, enq_flags); ++ } ++ ++ cgrp_enqueued(cgrp, cgc); ++out_release: ++ bpf_cgroup_release(cgrp); ++} ++ ++/* ++ * Walk the cgroup tree to update the active weight sums as tasks wake up and ++ * sleep. The weight sums are used as the base when calculating the proportion a ++ * given cgroup or task is entitled to at each level. ++ */ ++static void update_active_weight_sums(struct cgroup *cgrp, bool runnable) ++{ ++ struct fcg_cgrp_ctx *cgc; ++ bool updated = false; ++ int idx; ++ ++ cgc = find_cgrp_ctx(cgrp); ++ if (!cgc) ++ return; ++ ++ /* ++ * In most cases, a hot cgroup would have multiple threads going to ++ * sleep and waking up while the whole cgroup stays active. In leaf ++ * cgroups, ->nr_runnable which is updated with __sync operations gates ++ * ->nr_active updates, so that we don't have to grab the cgv_tree_lock ++ * repeatedly for a busy cgroup which is staying active. ++ */ ++ if (runnable) { ++ if (__sync_fetch_and_add(&cgc->nr_runnable, 1)) ++ return; ++ stat_inc(FCG_STAT_ACT); ++ } else { ++ if (__sync_sub_and_fetch(&cgc->nr_runnable, 1)) ++ return; ++ stat_inc(FCG_STAT_DEACT); ++ } ++ ++ /* ++ * If @cgrp is becoming runnable, its hweight should be refreshed after ++ * it's added to the weight tree so that enqueue has the up-to-date ++ * value. If @cgrp is becoming quiescent, the hweight should be ++ * refreshed before it's removed from the weight tree so that the usage ++ * charging which happens afterwards has access to the latest value. 
++ */ ++ if (!runnable) ++ cgrp_refresh_hweight(cgrp, cgc); ++ ++ /* propagate upwards */ ++ bpf_for(idx, 0, cgrp->level) { ++ int level = cgrp->level - idx; ++ struct fcg_cgrp_ctx *cgc, *pcgc = NULL; ++ bool propagate = false; ++ ++ cgc = find_ancestor_cgrp_ctx(cgrp, level); ++ if (!cgc) ++ break; ++ if (level) { ++ pcgc = find_ancestor_cgrp_ctx(cgrp, level - 1); ++ if (!pcgc) ++ break; ++ } ++ ++ /* ++ * We need the propagation protected by a lock to synchronize ++ * against weight changes. There's no reason to drop the lock at ++ * each level but bpf_spin_lock() doesn't want any function ++ * calls while locked. ++ */ ++ bpf_spin_lock(&cgv_tree_lock); ++ ++ if (runnable) { ++ if (!cgc->nr_active++) { ++ updated = true; ++ if (pcgc) { ++ propagate = true; ++ pcgc->child_weight_sum += cgc->weight; ++ } ++ } ++ } else { ++ if (!--cgc->nr_active) { ++ updated = true; ++ if (pcgc) { ++ propagate = true; ++ pcgc->child_weight_sum -= cgc->weight; ++ } ++ } ++ } ++ ++ bpf_spin_unlock(&cgv_tree_lock); ++ ++ if (!propagate) ++ break; ++ } ++ ++ if (updated) ++ __sync_fetch_and_add(&hweight_gen, 1); ++ ++ if (runnable) ++ cgrp_refresh_hweight(cgrp, cgc); ++} ++ ++void BPF_STRUCT_OPS(fcg_runnable, struct task_struct *p, u64 enq_flags) ++{ ++ struct cgroup *cgrp; ++ ++ cgrp = __COMPAT_scx_bpf_task_cgroup(p); ++ update_active_weight_sums(cgrp, true); ++ bpf_cgroup_release(cgrp); ++} ++ ++void BPF_STRUCT_OPS(fcg_running, struct task_struct *p) ++{ ++ struct cgroup *cgrp; ++ struct fcg_cgrp_ctx *cgc; ++ ++ if (fifo_sched) ++ return; ++ ++ cgrp = __COMPAT_scx_bpf_task_cgroup(p); ++ cgc = find_cgrp_ctx(cgrp); ++ if (cgc) { ++ /* ++ * @cgc->tvtime_now always progresses forward as tasks start ++ * executing. The test and update can be performed concurrently ++ * from multiple CPUs and thus racy. Any error should be ++ * contained and temporary. Let's just live with it. 
++ */ ++ if (vtime_before(cgc->tvtime_now, p->scx.dsq_vtime)) ++ cgc->tvtime_now = p->scx.dsq_vtime; ++ } ++ bpf_cgroup_release(cgrp); ++} ++ ++void BPF_STRUCT_OPS(fcg_stopping, struct task_struct *p, bool runnable) ++{ ++ struct fcg_task_ctx *taskc; ++ struct cgroup *cgrp; ++ struct fcg_cgrp_ctx *cgc; ++ ++ /* ++ * Scale the execution time by the inverse of the weight and charge. ++ * ++ * Note that the default yield implementation yields by setting ++ * @p->scx.slice to zero and the following would treat the yielding task ++ * as if it has consumed all its slice. If this penalizes yielding tasks ++ * too much, determine the execution time by taking explicit timestamps ++ * instead of depending on @p->scx.slice. ++ */ ++ if (!fifo_sched) ++ p->scx.dsq_vtime += ++ (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; ++ ++ taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); ++ if (!taskc) { ++ scx_bpf_error("task_ctx lookup failed"); ++ return; ++ } ++ ++ if (!taskc->bypassed_at) ++ return; ++ ++ cgrp = __COMPAT_scx_bpf_task_cgroup(p); ++ cgc = find_cgrp_ctx(cgrp); ++ if (cgc) { ++ __sync_fetch_and_add(&cgc->cvtime_delta, ++ p->se.sum_exec_runtime - taskc->bypassed_at); ++ taskc->bypassed_at = 0; ++ } ++ bpf_cgroup_release(cgrp); ++} ++ ++void BPF_STRUCT_OPS(fcg_quiescent, struct task_struct *p, u64 deq_flags) ++{ ++ struct cgroup *cgrp; ++ ++ cgrp = __COMPAT_scx_bpf_task_cgroup(p); ++ update_active_weight_sums(cgrp, false); ++ bpf_cgroup_release(cgrp); ++} ++ ++void BPF_STRUCT_OPS(fcg_cgroup_set_weight, struct cgroup *cgrp, u32 weight) ++{ ++ struct fcg_cgrp_ctx *cgc, *pcgc = NULL; ++ ++ cgc = find_cgrp_ctx(cgrp); ++ if (!cgc) ++ return; ++ ++ if (cgrp->level) { ++ pcgc = find_ancestor_cgrp_ctx(cgrp, cgrp->level - 1); ++ if (!pcgc) ++ return; ++ } ++ ++ bpf_spin_lock(&cgv_tree_lock); ++ if (pcgc && cgc->nr_active) ++ pcgc->child_weight_sum += (s64)weight - cgc->weight; ++ cgc->weight = weight; ++ bpf_spin_unlock(&cgv_tree_lock); ++} ++ ++static bool 
try_pick_next_cgroup(u64 *cgidp) ++{ ++ struct bpf_rb_node *rb_node; ++ struct cgv_node_stash *stash; ++ struct cgv_node *cgv_node; ++ struct fcg_cgrp_ctx *cgc; ++ struct cgroup *cgrp; ++ u64 cgid; ++ ++ /* pop the front cgroup and wind cvtime_now accordingly */ ++ bpf_spin_lock(&cgv_tree_lock); ++ ++ rb_node = bpf_rbtree_first(&cgv_tree); ++ if (!rb_node) { ++ bpf_spin_unlock(&cgv_tree_lock); ++ stat_inc(FCG_STAT_PNC_NO_CGRP); ++ *cgidp = 0; ++ return true; ++ } ++ ++ rb_node = bpf_rbtree_remove(&cgv_tree, rb_node); ++ bpf_spin_unlock(&cgv_tree_lock); ++ ++ if (!rb_node) { ++ /* ++ * This should never happen. bpf_rbtree_first() was called ++ * above while the tree lock was held, so the node should ++ * always be present. ++ */ ++ scx_bpf_error("node could not be removed"); ++ return true; ++ } ++ ++ cgv_node = container_of(rb_node, struct cgv_node, rb_node); ++ cgid = cgv_node->cgid; ++ ++ if (vtime_before(cvtime_now, cgv_node->cvtime)) ++ cvtime_now = cgv_node->cvtime; ++ ++ /* ++ * If lookup fails, the cgroup's gone. Free and move on. See ++ * fcg_cgroup_exit(). ++ */ ++ cgrp = bpf_cgroup_from_id(cgid); ++ if (!cgrp) { ++ stat_inc(FCG_STAT_PNC_GONE); ++ goto out_free; ++ } ++ ++ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0); ++ if (!cgc) { ++ bpf_cgroup_release(cgrp); ++ stat_inc(FCG_STAT_PNC_GONE); ++ goto out_free; ++ } ++ ++ if (!scx_bpf_consume(cgid)) { ++ bpf_cgroup_release(cgrp); ++ stat_inc(FCG_STAT_PNC_EMPTY); ++ goto out_stash; ++ } ++ ++ /* ++ * Successfully consumed from the cgroup. This will be our current ++ * cgroup for the new slice. Refresh its hweight. ++ */ ++ cgrp_refresh_hweight(cgrp, cgc); ++ ++ bpf_cgroup_release(cgrp); ++ ++ /* ++ * As the cgroup may have more tasks, add it back to the rbtree. Note ++ * that here we charge the full slice upfront and then exact later ++ * according to the actual consumption. This prevents lowpri thundering ++ * herd from saturating the machine. 
++ */ ++ bpf_spin_lock(&cgv_tree_lock); ++ cgv_node->cvtime += cgrp_slice_ns * FCG_HWEIGHT_ONE / (cgc->hweight ?: 1); ++ cgrp_cap_budget(cgv_node, cgc); ++ bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less); ++ bpf_spin_unlock(&cgv_tree_lock); ++ ++ *cgidp = cgid; ++ stat_inc(FCG_STAT_PNC_NEXT); ++ return true; ++ ++out_stash: ++ stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid); ++ if (!stash) { ++ stat_inc(FCG_STAT_PNC_GONE); ++ goto out_free; ++ } ++ ++ /* ++ * Paired with cmpxchg in cgrp_enqueued(). If they see the following ++ * transition, they'll enqueue the cgroup. If they are earlier, we'll ++ * see their task in the dq below and requeue the cgroup. ++ */ ++ __sync_val_compare_and_swap(&cgc->queued, 1, 0); ++ ++ if (scx_bpf_dsq_nr_queued(cgid)) { ++ bpf_spin_lock(&cgv_tree_lock); ++ bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less); ++ bpf_spin_unlock(&cgv_tree_lock); ++ stat_inc(FCG_STAT_PNC_RACE); ++ } else { ++ cgv_node = bpf_kptr_xchg(&stash->node, cgv_node); ++ if (cgv_node) { ++ scx_bpf_error("unexpected !NULL cgv_node stash"); ++ goto out_free; ++ } ++ } ++ ++ return false; ++ ++out_free: ++ bpf_obj_drop(cgv_node); ++ return false; ++} ++ ++void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev) ++{ ++ struct fcg_cpu_ctx *cpuc; ++ struct fcg_cgrp_ctx *cgc; ++ struct cgroup *cgrp; ++ u64 now = bpf_ktime_get_ns(); ++ bool picked_next = false; ++ ++ cpuc = find_cpu_ctx(); ++ if (!cpuc) ++ return; ++ ++ if (!cpuc->cur_cgid) ++ goto pick_next_cgroup; ++ ++ if (vtime_before(now, cpuc->cur_at + cgrp_slice_ns)) { ++ if (scx_bpf_consume(cpuc->cur_cgid)) { ++ stat_inc(FCG_STAT_CNS_KEEP); ++ return; ++ } ++ stat_inc(FCG_STAT_CNS_EMPTY); ++ } else { ++ stat_inc(FCG_STAT_CNS_EXPIRE); ++ } ++ ++ /* ++ * The current cgroup is expiring. It was already charged a full slice. ++ * Calculate the actual usage and accumulate the delta. 
++ */ ++ cgrp = bpf_cgroup_from_id(cpuc->cur_cgid); ++ if (!cgrp) { ++ stat_inc(FCG_STAT_CNS_GONE); ++ goto pick_next_cgroup; ++ } ++ ++ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0); ++ if (cgc) { ++ /* ++ * We want to update the vtime delta and then look for the next ++ * cgroup to execute but the latter needs to be done in a loop ++ * and we can't keep the lock held. Oh well... ++ */ ++ bpf_spin_lock(&cgv_tree_lock); ++ __sync_fetch_and_add(&cgc->cvtime_delta, ++ (cpuc->cur_at + cgrp_slice_ns - now) * ++ FCG_HWEIGHT_ONE / (cgc->hweight ?: 1)); ++ bpf_spin_unlock(&cgv_tree_lock); ++ } else { ++ stat_inc(FCG_STAT_CNS_GONE); ++ } ++ ++ bpf_cgroup_release(cgrp); ++ ++pick_next_cgroup: ++ cpuc->cur_at = now; ++ ++ if (scx_bpf_consume(FALLBACK_DSQ)) { ++ cpuc->cur_cgid = 0; ++ return; ++ } ++ ++ bpf_repeat(CGROUP_MAX_RETRIES) { ++ if (try_pick_next_cgroup(&cpuc->cur_cgid)) { ++ picked_next = true; ++ break; ++ } ++ } ++ ++ /* ++ * This only happens if try_pick_next_cgroup() races against enqueue ++ * path for more than CGROUP_MAX_RETRIES times, which is extremely ++ * unlikely and likely indicates an underlying bug. There shouldn't be ++ * any stall risk as the race is against enqueue. ++ */ ++ if (!picked_next) ++ stat_inc(FCG_STAT_PNC_FAIL); ++} ++ ++s32 BPF_STRUCT_OPS(fcg_init_task, struct task_struct *p, ++ struct scx_init_task_args *args) ++{ ++ struct fcg_task_ctx *taskc; ++ struct fcg_cgrp_ctx *cgc; ++ ++ /* ++ * @p is new. Let's ensure that its task_ctx is available. We can sleep ++ * in this function and the following will automatically use GFP_KERNEL. 
++ */ ++ taskc = bpf_task_storage_get(&task_ctx, p, 0, ++ BPF_LOCAL_STORAGE_GET_F_CREATE); ++ if (!taskc) ++ return -ENOMEM; ++ ++ taskc->bypassed_at = 0; ++ ++ if (!(cgc = find_cgrp_ctx(args->cgroup))) ++ return -ENOENT; ++ ++ p->scx.dsq_vtime = cgc->tvtime_now; ++ ++ return 0; ++} ++ ++int BPF_STRUCT_OPS_SLEEPABLE(fcg_cgroup_init, struct cgroup *cgrp, ++ struct scx_cgroup_init_args *args) ++{ ++ struct fcg_cgrp_ctx *cgc; ++ struct cgv_node *cgv_node; ++ struct cgv_node_stash empty_stash = {}, *stash; ++ u64 cgid = cgrp->kn->id; ++ int ret; ++ ++ /* ++ * Technically incorrect as cgroup ID is full 64bit while dsq ID is ++ * 63bit. Should not be a problem in practice and easy to spot in the ++ * unlikely case that it breaks. ++ */ ++ ret = scx_bpf_create_dsq(cgid, -1); ++ if (ret) ++ return ret; ++ ++ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, ++ BPF_LOCAL_STORAGE_GET_F_CREATE); ++ if (!cgc) { ++ ret = -ENOMEM; ++ goto err_destroy_dsq; ++ } ++ ++ cgc->weight = args->weight; ++ cgc->hweight = FCG_HWEIGHT_ONE; ++ ++ ret = bpf_map_update_elem(&cgv_node_stash, &cgid, &empty_stash, ++ BPF_NOEXIST); ++ if (ret) { ++ if (ret != -ENOMEM) ++ scx_bpf_error("unexpected stash creation error (%d)", ++ ret); ++ goto err_destroy_dsq; ++ } ++ ++ stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid); ++ if (!stash) { ++ scx_bpf_error("unexpected cgv_node stash lookup failure"); ++ ret = -ENOENT; ++ goto err_destroy_dsq; ++ } ++ ++ cgv_node = bpf_obj_new(struct cgv_node); ++ if (!cgv_node) { ++ ret = -ENOMEM; ++ goto err_del_cgv_node; ++ } ++ ++ cgv_node->cgid = cgid; ++ cgv_node->cvtime = cvtime_now; ++ ++ cgv_node = bpf_kptr_xchg(&stash->node, cgv_node); ++ if (cgv_node) { ++ scx_bpf_error("unexpected !NULL cgv_node stash"); ++ ret = -EBUSY; ++ goto err_drop; ++ } ++ ++ return 0; ++ ++err_drop: ++ bpf_obj_drop(cgv_node); ++err_del_cgv_node: ++ bpf_map_delete_elem(&cgv_node_stash, &cgid); ++err_destroy_dsq: ++ scx_bpf_destroy_dsq(cgid); ++ return ret; ++} ++ ++void 
BPF_STRUCT_OPS(fcg_cgroup_exit, struct cgroup *cgrp) ++{ ++ u64 cgid = cgrp->kn->id; + -+ ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, BPF_F_TIMER_CPU_PIN); + /* -+ * BPF_F_TIMER_CPU_PIN is pretty new (>=6.7). If we're running in a -+ * kernel which doesn't have it, bpf_timer_start() will return -EINVAL. -+ * Retry without the PIN. This would be the perfect use case for -+ * bpf_core_enum_value_exists() but the enum type doesn't have a name -+ * and can't be used with bpf_core_enum_value_exists(). Oh well... ++ * For now, there's no way find and remove the cgv_node if it's on the ++ * cgv_tree. Let's drain them in the dispatch path as they get popped ++ * off the front of the tree. + */ -+ if (ret == -EINVAL) { -+ timer_pinned = false; -+ ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, 0); -+ } -+ if (ret) -+ scx_bpf_error("bpf_timer_start failed (%d)", ret); -+ return ret; ++ bpf_map_delete_elem(&cgv_node_stash, &cgid); ++ scx_bpf_destroy_dsq(cgid); +} + -+void BPF_STRUCT_OPS(central_exit, struct scx_exit_info *ei) ++void BPF_STRUCT_OPS(fcg_cgroup_move, struct task_struct *p, ++ struct cgroup *from, struct cgroup *to) +{ -+ UEI_RECORD(uei, ei); ++ struct fcg_cgrp_ctx *from_cgc, *to_cgc; ++ s64 vtime_delta; ++ ++ /* find_cgrp_ctx() triggers scx_ops_error() on lookup failures */ ++ if (!(from_cgc = find_cgrp_ctx(from)) || !(to_cgc = find_cgrp_ctx(to))) ++ return; ++ ++ vtime_delta = p->scx.dsq_vtime - from_cgc->tvtime_now; ++ p->scx.dsq_vtime = to_cgc->tvtime_now + vtime_delta; +} + -+SCX_OPS_DEFINE(central_ops, -+ /* -+ * We are offloading all scheduling decisions to the central CPU -+ * and thus being the last task on a given CPU doesn't mean -+ * anything special. Enqueue the last tasks like any other tasks. 
-+ */ -+ .flags = SCX_OPS_ENQ_LAST, ++s32 BPF_STRUCT_OPS_SLEEPABLE(fcg_init) ++{ ++ return scx_bpf_create_dsq(FALLBACK_DSQ, -1); ++} + -+ .select_cpu = (void *)central_select_cpu, -+ .enqueue = (void *)central_enqueue, -+ .dispatch = (void *)central_dispatch, -+ .running = (void *)central_running, -+ .stopping = (void *)central_stopping, -+ .init = (void *)central_init, -+ .exit = (void *)central_exit, -+ .name = "central"); -diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c ++void BPF_STRUCT_OPS(fcg_exit, struct scx_exit_info *ei) ++{ ++ UEI_RECORD(uei, ei); ++} ++ ++SCX_OPS_DEFINE(flatcg_ops, ++ .select_cpu = (void *)fcg_select_cpu, ++ .enqueue = (void *)fcg_enqueue, ++ .dispatch = (void *)fcg_dispatch, ++ .runnable = (void *)fcg_runnable, ++ .running = (void *)fcg_running, ++ .stopping = (void *)fcg_stopping, ++ .quiescent = (void *)fcg_quiescent, ++ .init_task = (void *)fcg_init_task, ++ .cgroup_set_weight = (void *)fcg_cgroup_set_weight, ++ .cgroup_init = (void *)fcg_cgroup_init, ++ .cgroup_exit = (void *)fcg_cgroup_exit, ++ .cgroup_move = (void *)fcg_cgroup_move, ++ .init = (void *)fcg_init, ++ .exit = (void *)fcg_exit, ++ .flags = SCX_OPS_HAS_CGROUP_WEIGHT | SCX_OPS_ENQ_EXITING, ++ .name = "flatcg"); +diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c new file mode 100644 -index 000000000000..21deea320bd7 +index 000000000000..5d24ca9c29d9 --- /dev/null -+++ b/tools/sched_ext/scx_central.c -@@ -0,0 +1,135 @@ ++++ b/tools/sched_ext/scx_flatcg.c +@@ -0,0 +1,233 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet ++ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
++ * Copyright (c) 2023 Tejun Heo ++ * Copyright (c) 2023 David Vernet + */ -+#define _GNU_SOURCE -+#include +#include -+#include -+#include +#include ++#include +#include ++#include ++#include ++#include ++#include +#include +#include -+#include "scx_central.bpf.skel.h" ++#include "scx_flatcg.h" ++#include "scx_flatcg.bpf.skel.h" ++ ++#ifndef FILEID_KERNFS ++#define FILEID_KERNFS 0xfe ++#endif + +const char help_fmt[] = -+"A central FIFO sched_ext scheduler.\n" ++"A flattened cgroup hierarchy sched_ext scheduler.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" -+"Usage: %s [-s SLICE_US] [-c CPU]\n" ++"Usage: %s [-s SLICE_US] [-i INTERVAL] [-f] [-v]\n" +"\n" +" -s SLICE_US Override slice duration\n" -+" -c CPU Override the central CPU (default: 0)\n" ++" -i INTERVAL Report interval\n" ++" -f Use FIFO scheduling instead of weighted vtime scheduling\n" +" -v Print libbpf debug messages\n" +" -h Display this help and exit\n"; + @@ -10306,103 +12781,251 @@ index 000000000000..21deea320bd7 + exit_req = 1; +} + ++static float read_cpu_util(__u64 *last_sum, __u64 *last_idle) ++{ ++ FILE *fp; ++ char buf[4096]; ++ char *line, *cur = NULL, *tok; ++ __u64 sum = 0, idle = 0; ++ __u64 delta_sum, delta_idle; ++ int idx; ++ ++ fp = fopen("/proc/stat", "r"); ++ if (!fp) { ++ perror("fopen(\"/proc/stat\")"); ++ return 0.0; ++ } ++ ++ if (!fgets(buf, sizeof(buf), fp)) { ++ perror("fgets(\"/proc/stat\")"); ++ fclose(fp); ++ return 0.0; ++ } ++ fclose(fp); ++ ++ line = buf; ++ for (idx = 0; (tok = strtok_r(line, " \n", &cur)); idx++) { ++ char *endp = NULL; ++ __u64 v; ++ ++ if (idx == 0) { ++ line = NULL; ++ continue; ++ } ++ v = strtoull(tok, &endp, 0); ++ if (!endp || *endp != '\0') { ++ fprintf(stderr, "failed to parse %dth field of /proc/stat (\"%s\")\n", ++ idx, tok); ++ continue; ++ } ++ sum += v; ++ if (idx == 4) ++ idle = v; ++ } ++ ++ delta_sum = sum - *last_sum; ++ delta_idle = idle - *last_idle; ++ *last_sum = sum; ++ *last_idle = idle; ++ ++ 
return delta_sum ? (float)(delta_sum - delta_idle) / delta_sum : 0.0; ++} ++ ++static void fcg_read_stats(struct scx_flatcg *skel, __u64 *stats) ++{ ++ __u64 cnts[FCG_NR_STATS][skel->rodata->nr_cpus]; ++ __u32 idx; ++ ++ memset(stats, 0, sizeof(stats[0]) * FCG_NR_STATS); ++ ++ for (idx = 0; idx < FCG_NR_STATS; idx++) { ++ int ret, cpu; ++ ++ ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), ++ &idx, cnts[idx]); ++ if (ret < 0) ++ continue; ++ for (cpu = 0; cpu < skel->rodata->nr_cpus; cpu++) ++ stats[idx] += cnts[idx][cpu]; ++ } ++} ++ +int main(int argc, char **argv) +{ -+ struct scx_central *skel; ++ struct scx_flatcg *skel; + struct bpf_link *link; -+ __u64 seq = 0, ecode; ++ struct timespec intv_ts = { .tv_sec = 2, .tv_nsec = 0 }; ++ bool dump_cgrps = false; ++ __u64 last_cpu_sum = 0, last_cpu_idle = 0; ++ __u64 last_stats[FCG_NR_STATS] = {}; ++ unsigned long seq = 0; + __s32 opt; -+ cpu_set_t *cpuset; ++ __u64 ecode; + + libbpf_set_print(libbpf_print_fn); + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); +restart: -+ skel = SCX_OPS_OPEN(central_ops, scx_central); ++ skel = SCX_OPS_OPEN(flatcg_ops, scx_flatcg); + -+ skel->rodata->central_cpu = 0; -+ skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); ++ skel->rodata->nr_cpus = libbpf_num_possible_cpus(); ++ ++ while ((opt = getopt(argc, argv, "s:i:dfvh")) != -1) { ++ double v; + -+ while ((opt = getopt(argc, argv, "s:c:pvh")) != -1) { + switch (opt) { + case 's': -+ skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; ++ v = strtod(optarg, NULL); ++ skel->rodata->cgrp_slice_ns = v * 1000; + break; -+ case 'c': -+ skel->rodata->central_cpu = strtoul(optarg, NULL, 0); ++ case 'i': ++ v = strtod(optarg, NULL); ++ intv_ts.tv_sec = v; ++ intv_ts.tv_nsec = (v - (float)intv_ts.tv_sec) * 1000000000; ++ break; ++ case 'd': ++ dump_cgrps = true; ++ break; ++ case 'f': ++ skel->rodata->fifo_sched = true; + break; + case 'v': + verbose = true; + break; ++ case 'h': + default: + 
fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + -+ /* Resize arrays so their element count is equal to cpu count. */ -+ RESIZE_ARRAY(skel, data, cpu_gimme_task, skel->rodata->nr_cpu_ids); -+ RESIZE_ARRAY(skel, data, cpu_started_at, skel->rodata->nr_cpu_ids); -+ -+ SCX_OPS_LOAD(skel, central_ops, scx_central, uei); -+ -+ /* -+ * Affinitize the loading thread to the central CPU, as: -+ * - That's where the BPF timer is first invoked in the BPF program. -+ * - We probably don't want this user space component to take up a core -+ * from a task that would benefit from avoiding preemption on one of -+ * the tickless cores. -+ * -+ * Until BPF supports pinning the timer, it's not guaranteed that it -+ * will always be invoked on the central CPU. In practice, this -+ * suffices the majority of the time. -+ */ -+ cpuset = CPU_ALLOC(skel->rodata->nr_cpu_ids); -+ SCX_BUG_ON(!cpuset, "Failed to allocate cpuset"); -+ CPU_ZERO(cpuset); -+ CPU_SET(skel->rodata->central_cpu, cpuset); -+ SCX_BUG_ON(sched_setaffinity(0, sizeof(cpuset), cpuset), -+ "Failed to affinitize to central CPU %d (max %d)", -+ skel->rodata->central_cpu, skel->rodata->nr_cpu_ids - 1); -+ CPU_FREE(cpuset); -+ -+ link = SCX_OPS_ATTACH(skel, central_ops, scx_central); ++ printf("slice=%.1lfms intv=%.1lfs dump_cgrps=%d", ++ (double)skel->rodata->cgrp_slice_ns / 1000000.0, ++ (double)intv_ts.tv_sec + (double)intv_ts.tv_nsec / 1000000000.0, ++ dump_cgrps); + -+ if (!skel->data->timer_pinned) -+ printf("WARNING : BPF_F_TIMER_CPU_PIN not available, timer not pinned to central\n"); ++ SCX_OPS_LOAD(skel, flatcg_ops, scx_flatcg, uei); ++ link = SCX_OPS_ATTACH(skel, flatcg_ops, scx_flatcg); + + while (!exit_req && !UEI_EXITED(skel, uei)) { -+ printf("[SEQ %llu]\n", seq++); -+ printf("total :%10" PRIu64 " local:%10" PRIu64 " queued:%10" PRIu64 " lost:%10" PRIu64 "\n", -+ skel->bss->nr_total, -+ skel->bss->nr_locals, -+ skel->bss->nr_queued, -+ skel->bss->nr_lost_pids); -+ printf("timer :%10" 
PRIu64 " dispatch:%10" PRIu64 " mismatch:%10" PRIu64 " retry:%10" PRIu64 "\n", -+ skel->bss->nr_timers, -+ skel->bss->nr_dispatches, -+ skel->bss->nr_mismatches, -+ skel->bss->nr_retries); -+ printf("overflow:%10" PRIu64 "\n", -+ skel->bss->nr_overflows); ++ __u64 acc_stats[FCG_NR_STATS]; ++ __u64 stats[FCG_NR_STATS]; ++ float cpu_util; ++ int i; ++ ++ cpu_util = read_cpu_util(&last_cpu_sum, &last_cpu_idle); ++ ++ fcg_read_stats(skel, acc_stats); ++ for (i = 0; i < FCG_NR_STATS; i++) ++ stats[i] = acc_stats[i] - last_stats[i]; ++ ++ memcpy(last_stats, acc_stats, sizeof(acc_stats)); ++ ++ printf("\n[SEQ %6lu cpu=%5.1lf hweight_gen=%" PRIu64 "]\n", ++ seq++, cpu_util * 100.0, skel->data->hweight_gen); ++ printf(" act:%6llu deact:%6llu global:%6llu local:%6llu\n", ++ stats[FCG_STAT_ACT], ++ stats[FCG_STAT_DEACT], ++ stats[FCG_STAT_GLOBAL], ++ stats[FCG_STAT_LOCAL]); ++ printf("HWT cache:%6llu update:%6llu skip:%6llu race:%6llu\n", ++ stats[FCG_STAT_HWT_CACHE], ++ stats[FCG_STAT_HWT_UPDATES], ++ stats[FCG_STAT_HWT_SKIP], ++ stats[FCG_STAT_HWT_RACE]); ++ printf("ENQ skip:%6llu race:%6llu\n", ++ stats[FCG_STAT_ENQ_SKIP], ++ stats[FCG_STAT_ENQ_RACE]); ++ printf("CNS keep:%6llu expire:%6llu empty:%6llu gone:%6llu\n", ++ stats[FCG_STAT_CNS_KEEP], ++ stats[FCG_STAT_CNS_EXPIRE], ++ stats[FCG_STAT_CNS_EMPTY], ++ stats[FCG_STAT_CNS_GONE]); ++ printf("PNC next:%6llu empty:%6llu nocgrp:%6llu gone:%6llu race:%6llu fail:%6llu\n", ++ stats[FCG_STAT_PNC_NEXT], ++ stats[FCG_STAT_PNC_EMPTY], ++ stats[FCG_STAT_PNC_NO_CGRP], ++ stats[FCG_STAT_PNC_GONE], ++ stats[FCG_STAT_PNC_RACE], ++ stats[FCG_STAT_PNC_FAIL]); ++ printf("BAD remove:%6llu\n", ++ acc_stats[FCG_STAT_BAD_REMOVAL]); + fflush(stdout); -+ sleep(1); ++ ++ nanosleep(&intv_ts, NULL); + } + + bpf_link__destroy(link); + ecode = UEI_REPORT(skel, uei); -+ scx_central__destroy(skel); ++ scx_flatcg__destroy(skel); + + if (UEI_ECODE_RESTART(ecode)) + goto restart; + return 0; +} +diff --git a/tools/sched_ext/scx_flatcg.h 
b/tools/sched_ext/scx_flatcg.h +new file mode 100644 +index 000000000000..6f2ea50acb1c +--- /dev/null ++++ b/tools/sched_ext/scx_flatcg.h +@@ -0,0 +1,51 @@ ++#ifndef __SCX_EXAMPLE_FLATCG_H ++#define __SCX_EXAMPLE_FLATCG_H ++ ++enum { ++ FCG_HWEIGHT_ONE = 1LLU << 16, ++}; ++ ++enum fcg_stat_idx { ++ FCG_STAT_ACT, ++ FCG_STAT_DEACT, ++ FCG_STAT_LOCAL, ++ FCG_STAT_GLOBAL, ++ ++ FCG_STAT_HWT_UPDATES, ++ FCG_STAT_HWT_CACHE, ++ FCG_STAT_HWT_SKIP, ++ FCG_STAT_HWT_RACE, ++ ++ FCG_STAT_ENQ_SKIP, ++ FCG_STAT_ENQ_RACE, ++ ++ FCG_STAT_CNS_KEEP, ++ FCG_STAT_CNS_EXPIRE, ++ FCG_STAT_CNS_EMPTY, ++ FCG_STAT_CNS_GONE, ++ ++ FCG_STAT_PNC_NO_CGRP, ++ FCG_STAT_PNC_NEXT, ++ FCG_STAT_PNC_EMPTY, ++ FCG_STAT_PNC_GONE, ++ FCG_STAT_PNC_RACE, ++ FCG_STAT_PNC_FAIL, ++ ++ FCG_STAT_BAD_REMOVAL, ++ ++ FCG_NR_STATS, ++}; ++ ++struct fcg_cgrp_ctx { ++ u32 nr_active; ++ u32 nr_runnable; ++ u32 queued; ++ u32 weight; ++ u32 hweight; ++ u64 child_weight_sum; ++ u64 hweight_gen; ++ s64 cvtime_delta; ++ u64 tvtime_now; ++}; ++ ++#endif /* __SCX_EXAMPLE_FLATCG_H */ diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c new file mode 100644 -index 000000000000..892278f12dce +index 000000000000..5b39bee9eb23 --- /dev/null +++ b/tools/sched_ext/scx_qmap.bpf.c -@@ -0,0 +1,706 @@ +@@ -0,0 +1,813 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A simple five-level FIFO queue scheduler. 
@@ -10432,6 +13055,8 @@ index 000000000000..892278f12dce +enum consts { + ONE_SEC_IN_NS = 1000000000, + SHARED_DSQ = 0, ++ HIGHPRI_DSQ = 1, ++ HIGHPRI_WEIGHT = 8668, /* this is what -20 maps to */ +}; + +char _license[] SEC("license") = "GPL"; @@ -10441,10 +13066,12 @@ index 000000000000..892278f12dce +const volatile u32 stall_kernel_nth; +const volatile u32 dsp_inf_loop_after; +const volatile u32 dsp_batch; ++const volatile bool highpri_boosting; +const volatile bool print_shared_dsq; +const volatile s32 disallow_tgid; +const volatile bool suppress_dump; + ++u64 nr_highpri_queued; +u32 test_error_cnt; + +UEI_DEFINE(uei); @@ -10500,6 +13127,7 @@ index 000000000000..892278f12dce +/* Per-task scheduling context */ +struct task_ctx { + bool force_local; /* Dispatch directly to local_dsq */ ++ bool highpri; + u64 core_sched_seq; +}; + @@ -10527,6 +13155,7 @@ index 000000000000..892278f12dce +/* Statistics */ +u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued, nr_ddsp_from_enq; +u64 nr_core_sched_execed; ++u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer; +u32 cpuperf_min, cpuperf_avg, cpuperf_max; +u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max; + @@ -10545,17 +13174,25 @@ index 000000000000..892278f12dce + return -1; +} + ++static struct task_ctx *lookup_task_ctx(struct task_struct *p) ++{ ++ struct task_ctx *tctx; ++ ++ if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) { ++ scx_bpf_error("task_ctx lookup failed"); ++ return NULL; ++ } ++ return tctx; ++} ++ +s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + struct task_ctx *tctx; + s32 cpu; + -+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); -+ if (!tctx) { -+ scx_bpf_error("task_ctx lookup failed"); ++ if (!(tctx = lookup_task_ctx(p))) + return -ESRCH; -+ } + + cpu = pick_direct_dispatch_cpu(p, prev_cpu); + @@ -10602,11 +13239,8 @@ index 000000000000..892278f12dce + if (test_error_cnt && 
!--test_error_cnt) + scx_bpf_error("test triggering error"); + -+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); -+ if (!tctx) { -+ scx_bpf_error("task_ctx lookup failed"); ++ if (!(tctx = lookup_task_ctx(p))) + return; -+ } + + /* + * All enqueued tasks must have their core_sched_seq updated for correct @@ -10661,6 +13295,10 @@ index 000000000000..892278f12dce + return; + } + ++ if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) { ++ tctx->highpri = true; ++ __sync_fetch_and_add(&nr_highpri_queued, 1); ++ } + __sync_fetch_and_add(&nr_enqueued, 1); +} + @@ -10677,13 +13315,80 @@ index 000000000000..892278f12dce + +static void update_core_sched_head_seq(struct task_struct *p) +{ -+ struct task_ctx *tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + int idx = weight_to_idx(p->scx.weight); ++ struct task_ctx *tctx; + -+ if (tctx) ++ if ((tctx = lookup_task_ctx(p))) + core_sched_head_seqs[idx] = tctx->core_sched_seq; -+ else -+ scx_bpf_error("task_ctx lookup failed"); ++} ++ ++/* ++ * To demonstrate the use of scx_bpf_dispatch_from_dsq(), implement silly ++ * selective priority boosting mechanism by scanning SHARED_DSQ looking for ++ * highpri tasks, moving them to HIGHPRI_DSQ and then consuming them first. This ++ * makes minor difference only when dsp_batch is larger than 1. ++ * ++ * scx_bpf_dispatch[_vtime]_from_dsq() are allowed both from ops.dispatch() and ++ * non-rq-lock holding BPF programs. As demonstration, this function is called ++ * from qmap_dispatch() and monitor_timerfn(). 
++ */ ++static bool dispatch_highpri(bool from_timer) ++{ ++ struct task_struct *p; ++ s32 this_cpu = bpf_get_smp_processor_id(); ++ ++ /* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */ ++ bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) { ++ static u64 highpri_seq; ++ struct task_ctx *tctx; ++ ++ if (!(tctx = lookup_task_ctx(p))) ++ return false; ++ ++ if (tctx->highpri) { ++ /* exercise the set_*() and vtime interface too */ ++ __COMPAT_scx_bpf_dispatch_from_dsq_set_slice( ++ BPF_FOR_EACH_ITER, slice_ns * 2); ++ __COMPAT_scx_bpf_dispatch_from_dsq_set_vtime( ++ BPF_FOR_EACH_ITER, highpri_seq++); ++ __COMPAT_scx_bpf_dispatch_vtime_from_dsq( ++ BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0); ++ } ++ } ++ ++ /* ++ * Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU ++ * is found. ++ */ ++ bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) { ++ bool dispatched = false; ++ s32 cpu; ++ ++ if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr)) ++ cpu = this_cpu; ++ else ++ cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0); ++ ++ if (__COMPAT_scx_bpf_dispatch_from_dsq(BPF_FOR_EACH_ITER, p, ++ SCX_DSQ_LOCAL_ON | cpu, ++ SCX_ENQ_PREEMPT)) { ++ if (cpu == this_cpu) { ++ dispatched = true; ++ __sync_fetch_and_add(&nr_expedited_local, 1); ++ } else { ++ __sync_fetch_and_add(&nr_expedited_remote, 1); ++ } ++ if (from_timer) ++ __sync_fetch_and_add(&nr_expedited_from_timer, 1); ++ } else { ++ __sync_fetch_and_add(&nr_expedited_lost, 1); ++ } ++ ++ if (dispatched) ++ return true; ++ } ++ ++ return false; +} + +void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) @@ -10694,7 +13399,10 @@ index 000000000000..892278f12dce + void *fifo; + s32 i, pid; + -+ if (scx_bpf_consume(SHARED_DSQ)) ++ if (dispatch_highpri(false)) ++ return; ++ ++ if (!nr_highpri_queued && scx_bpf_consume(SHARED_DSQ)) + return; + + if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) { @@ -10731,6 +13439,8 @@ index 000000000000..892278f12dce + + /* Dispatch or advance. 
*/ + bpf_repeat(BPF_MAX_LOOPS) { ++ struct task_ctx *tctx; ++ + if (bpf_map_pop_elem(fifo, &pid)) + break; + @@ -10738,13 +13448,25 @@ index 000000000000..892278f12dce + if (!p) + continue; + ++ if (!(tctx = lookup_task_ctx(p))) { ++ bpf_task_release(p); ++ return; ++ } ++ ++ if (tctx->highpri) ++ __sync_fetch_and_sub(&nr_highpri_queued, 1); ++ + update_core_sched_head_seq(p); + __sync_fetch_and_add(&nr_dispatched, 1); ++ + scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0); + bpf_task_release(p); ++ + batch--; + cpuc->dsp_cnt--; + if (!batch || !scx_bpf_dispatch_nr_slots()) { ++ if (dispatch_highpri(false)) ++ return; + scx_bpf_consume(SHARED_DSQ); + return; + } @@ -11054,6 +13776,10 @@ index 000000000000..892278f12dce + +static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer) +{ ++ bpf_rcu_read_lock(); ++ dispatch_highpri(true); ++ bpf_rcu_read_unlock(); ++ + monitor_cpuperf(); + + if (print_shared_dsq) @@ -11075,6 +13801,10 @@ index 000000000000..892278f12dce + if (ret) + return ret; + ++ ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1); ++ if (ret) ++ return ret; ++ + timer = bpf_map_lookup_elem(&monitor_timer, &key); + if (!timer) + return -ESRCH; @@ -11111,10 +13841,10 @@ index 000000000000..892278f12dce + .name = "qmap"); diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c new file mode 100644 -index 000000000000..c9ca30d62b2b +index 000000000000..ac45a02b4055 --- /dev/null +++ b/tools/sched_ext/scx_qmap.c -@@ -0,0 +1,144 @@ +@@ -0,0 +1,153 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
@@ -11146,6 +13876,7 @@ index 000000000000..c9ca30d62b2b +" -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n" +" -b COUNT Dispatch upto COUNT tasks together\n" +" -P Print out DSQ content to trace_pipe every second, use with -b\n" ++" -H Boost nice -20 tasks in SHARED_DSQ, use with -b\n" +" -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" +" -D LEN Set scx_exit_info.dump buffer length\n" +" -S Suppress qmap-specific debug dump\n" @@ -11180,7 +13911,7 @@ index 000000000000..c9ca30d62b2b + + skel = SCX_OPS_OPEN(qmap_ops, scx_qmap); + -+ while ((opt = getopt(argc, argv, "s:e:t:T:l:b:Pd:D:Spvh")) != -1) { ++ while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PHd:D:Spvh")) != -1) { + switch (opt) { + case 's': + skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; @@ -11203,6 +13934,9 @@ index 000000000000..c9ca30d62b2b + case 'P': + skel->rodata->print_shared_dsq = true; + break; ++ case 'H': ++ skel->rodata->highpri_boosting = true; ++ break; + case 'd': + skel->rodata->disallow_tgid = strtol(optarg, NULL, 0); + if (skel->rodata->disallow_tgid < 0) @@ -11238,6 +13972,11 @@ index 000000000000..c9ca30d62b2b + skel->bss->nr_reenqueued, skel->bss->nr_dequeued, + skel->bss->nr_core_sched_execed, + skel->bss->nr_ddsp_from_enq); ++ printf(" exp_local=%"PRIu64" exp_remote=%"PRIu64" exp_timer=%"PRIu64" exp_lost=%"PRIu64"\n", ++ skel->bss->nr_expedited_local, ++ skel->bss->nr_expedited_remote, ++ skel->bss->nr_expedited_from_timer, ++ skel->bss->nr_expedited_lost); + if (__COMPAT_has_ksym("scx_bpf_cpuperf_cur")) + printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n", + skel->bss->cpuperf_min, @@ -11261,10 +14000,10 @@ index 000000000000..c9ca30d62b2b +} diff --git a/tools/sched_ext/scx_show_state.py b/tools/sched_ext/scx_show_state.py new file mode 100644 -index 000000000000..d457d2a74e1e +index 000000000000..8bc626ede1c4 --- /dev/null +++ b/tools/sched_ext/scx_show_state.py -@@ -0,0 +1,39 @@ +@@ -0,0 +1,40 @@ 
+#!/usr/bin/env drgn +# +# Copyright (C) 2024 Tejun Heo @@ -11304,6 +14043,7 @@ index 000000000000..d457d2a74e1e +print(f'enable_state : {ops_state_str(enable_state)} ({enable_state})') +print(f'bypass_depth : {read_atomic("scx_ops_bypass_depth")}') +print(f'nr_rejected : {read_atomic("scx_nr_rejected")}') ++print(f'enable_seq : {read_atomic("scx_enable_seq")}') diff --git a/tools/sched_ext/scx_simple.bpf.c b/tools/sched_ext/scx_simple.bpf.c new file mode 100644 index 000000000000..ed7e8d535fc5 @@ -13191,10 +15931,10 @@ index 000000000000..97d45f1e5597 +REGISTER_SCX_TEST(&init_enable_count) diff --git a/tools/testing/selftests/sched_ext/maximal.bpf.c b/tools/testing/selftests/sched_ext/maximal.bpf.c new file mode 100644 -index 000000000000..44612fdaf399 +index 000000000000..00bfa9cb95d3 --- /dev/null +++ b/tools/testing/selftests/sched_ext/maximal.bpf.c -@@ -0,0 +1,132 @@ +@@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler with every callback defined. @@ -13292,6 +16032,32 @@ index 000000000000..44612fdaf399 +void BPF_STRUCT_OPS(maximal_disable, struct task_struct *p) +{} + ++s32 BPF_STRUCT_OPS(maximal_cgroup_init, struct cgroup *cgrp, ++ struct scx_cgroup_init_args *args) ++{ ++ return 0; ++} ++ ++void BPF_STRUCT_OPS(maximal_cgroup_exit, struct cgroup *cgrp) ++{} ++ ++s32 BPF_STRUCT_OPS(maximal_cgroup_prep_move, struct task_struct *p, ++ struct cgroup *from, struct cgroup *to) ++{ ++ return 0; ++} ++ ++void BPF_STRUCT_OPS(maximal_cgroup_move, struct task_struct *p, ++ struct cgroup *from, struct cgroup *to) ++{} ++ ++void BPF_STRUCT_OPS(maximal_cgroup_cancel_move, struct task_struct *p, ++ struct cgroup *from, struct cgroup *to) ++{} ++ ++void BPF_STRUCT_OPS(maximal_cgroup_set_weight, struct cgroup *cgrp, u32 weight) ++{} ++ +s32 BPF_STRUCT_OPS_SLEEPABLE(maximal_init) +{ + return 0; @@ -13323,6 +16089,12 @@ index 000000000000..44612fdaf399 + .enable = maximal_enable, + .exit_task = maximal_exit_task, + .disable = maximal_disable, ++ 
.cgroup_init = maximal_cgroup_init, ++ .cgroup_exit = maximal_cgroup_exit, ++ .cgroup_prep_move = maximal_cgroup_prep_move, ++ .cgroup_move = maximal_cgroup_move, ++ .cgroup_cancel_move = maximal_cgroup_cancel_move, ++ .cgroup_set_weight = maximal_cgroup_set_weight, + .init = maximal_init, + .exit = maximal_exit, + .name = "maximal", @@ -15130,3 +17902,6 @@ index 000000000000..bc13dfec1267 +int file_write_long(const char *path, long val); + +#endif // __SCX_TEST_H__ +-- +2.47.0.rc0 + -- cgit v1.2.3