path: root/SOURCES/scx-kernel.patch
Diffstat (limited to 'SOURCES/scx-kernel.patch')
-rw-r--r--   SOURCES/scx-kernel.patch   4553
1 file changed, 3664 insertions, 889 deletions
diff --git a/SOURCES/scx-kernel.patch b/SOURCES/scx-kernel.patch
index 29e1f22..196bac1 100644
--- a/SOURCES/scx-kernel.patch
+++ b/SOURCES/scx-kernel.patch
@@ -1,3 +1,184 @@
+From 11276ed2c72c57624c1214e980efd24648be015c Mon Sep 17 00:00:00 2001
+From: Peter Jung <admin@ptr1337.dev>
+Date: Fri, 4 Oct 2024 17:12:13 +0200
+Subject: [PATCH] sched-ext
+
+Signed-off-by: Peter Jung <admin@ptr1337.dev>
+---
+ Documentation/scheduler/index.rst | 1 +
+ Documentation/scheduler/sched-ext.rst | 326 +
+ MAINTAINERS | 13 +
+ drivers/tty/sysrq.c | 1 +
+ include/asm-generic/vmlinux.lds.h | 1 +
+ include/linux/cgroup.h | 4 +-
+ include/linux/sched.h | 5 +
+ include/linux/sched/ext.h | 216 +
+ include/linux/sched/task.h | 8 +-
+ include/trace/events/sched_ext.h | 32 +
+ include/uapi/linux/sched.h | 1 +
+ init/Kconfig | 10 +
+ init/init_task.c | 12 +
+ kernel/Kconfig.preempt | 27 +-
+ kernel/fork.c | 17 +-
+ kernel/sched/build_policy.c | 11 +
+ kernel/sched/core.c | 288 +-
+ kernel/sched/cpufreq_schedutil.c | 50 +-
+ kernel/sched/debug.c | 3 +
+ kernel/sched/ext.c | 7262 +++++++++++++++++
+ kernel/sched/ext.h | 91 +
+ kernel/sched/fair.c | 21 +-
+ kernel/sched/idle.c | 2 +
+ kernel/sched/sched.h | 203 +-
+ kernel/sched/syscalls.c | 26 +
+ lib/dump_stack.c | 1 +
+ tools/Makefile | 10 +-
+ tools/sched_ext/.gitignore | 2 +
+ tools/sched_ext/Makefile | 246 +
+ tools/sched_ext/README.md | 270 +
+ .../sched_ext/include/bpf-compat/gnu/stubs.h | 11 +
+ tools/sched_ext/include/scx/common.bpf.h | 427 +
+ tools/sched_ext/include/scx/common.h | 75 +
+ tools/sched_ext/include/scx/compat.bpf.h | 47 +
+ tools/sched_ext/include/scx/compat.h | 186 +
+ tools/sched_ext/include/scx/user_exit_info.h | 115 +
+ tools/sched_ext/scx_central.bpf.c | 361 +
+ tools/sched_ext/scx_central.c | 135 +
+ tools/sched_ext/scx_flatcg.bpf.c | 957 +++
+ tools/sched_ext/scx_flatcg.c | 233 +
+ tools/sched_ext/scx_flatcg.h | 51 +
+ tools/sched_ext/scx_qmap.bpf.c | 813 ++
+ tools/sched_ext/scx_qmap.c | 153 +
+ tools/sched_ext/scx_show_state.py | 40 +
+ tools/sched_ext/scx_simple.bpf.c | 156 +
+ tools/sched_ext/scx_simple.c | 107 +
+ tools/testing/selftests/sched_ext/.gitignore | 6 +
+ tools/testing/selftests/sched_ext/Makefile | 218 +
+ tools/testing/selftests/sched_ext/config | 9 +
+ .../selftests/sched_ext/create_dsq.bpf.c | 58 +
+ .../testing/selftests/sched_ext/create_dsq.c | 57 +
+ .../sched_ext/ddsp_bogus_dsq_fail.bpf.c | 42 +
+ .../selftests/sched_ext/ddsp_bogus_dsq_fail.c | 57 +
+ .../sched_ext/ddsp_vtimelocal_fail.bpf.c | 39 +
+ .../sched_ext/ddsp_vtimelocal_fail.c | 56 +
+ .../selftests/sched_ext/dsp_local_on.bpf.c | 65 +
+ .../selftests/sched_ext/dsp_local_on.c | 58 +
+ .../sched_ext/enq_last_no_enq_fails.bpf.c | 21 +
+ .../sched_ext/enq_last_no_enq_fails.c | 60 +
+ .../sched_ext/enq_select_cpu_fails.bpf.c | 43 +
+ .../sched_ext/enq_select_cpu_fails.c | 61 +
+ tools/testing/selftests/sched_ext/exit.bpf.c | 84 +
+ tools/testing/selftests/sched_ext/exit.c | 55 +
+ tools/testing/selftests/sched_ext/exit_test.h | 20 +
+ .../testing/selftests/sched_ext/hotplug.bpf.c | 61 +
+ tools/testing/selftests/sched_ext/hotplug.c | 168 +
+ .../selftests/sched_ext/hotplug_test.h | 15 +
+ .../sched_ext/init_enable_count.bpf.c | 53 +
+ .../selftests/sched_ext/init_enable_count.c | 166 +
+ .../testing/selftests/sched_ext/maximal.bpf.c | 164 +
+ tools/testing/selftests/sched_ext/maximal.c | 51 +
+ .../selftests/sched_ext/maybe_null.bpf.c | 36 +
+ .../testing/selftests/sched_ext/maybe_null.c | 49 +
+ .../sched_ext/maybe_null_fail_dsp.bpf.c | 25 +
+ .../sched_ext/maybe_null_fail_yld.bpf.c | 28 +
+ .../testing/selftests/sched_ext/minimal.bpf.c | 21 +
+ tools/testing/selftests/sched_ext/minimal.c | 58 +
+ .../selftests/sched_ext/prog_run.bpf.c | 33 +
+ tools/testing/selftests/sched_ext/prog_run.c | 78 +
+ .../testing/selftests/sched_ext/reload_loop.c | 75 +
+ tools/testing/selftests/sched_ext/runner.c | 201 +
+ tools/testing/selftests/sched_ext/scx_test.h | 131 +
+ .../selftests/sched_ext/select_cpu_dfl.bpf.c | 40 +
+ .../selftests/sched_ext/select_cpu_dfl.c | 72 +
+ .../sched_ext/select_cpu_dfl_nodispatch.bpf.c | 89 +
+ .../sched_ext/select_cpu_dfl_nodispatch.c | 72 +
+ .../sched_ext/select_cpu_dispatch.bpf.c | 41 +
+ .../selftests/sched_ext/select_cpu_dispatch.c | 70 +
+ .../select_cpu_dispatch_bad_dsq.bpf.c | 37 +
+ .../sched_ext/select_cpu_dispatch_bad_dsq.c | 56 +
+ .../select_cpu_dispatch_dbl_dsp.bpf.c | 38 +
+ .../sched_ext/select_cpu_dispatch_dbl_dsp.c | 56 +
+ .../sched_ext/select_cpu_vtime.bpf.c | 92 +
+ .../selftests/sched_ext/select_cpu_vtime.c | 59 +
+ .../selftests/sched_ext/test_example.c | 49 +
+ tools/testing/selftests/sched_ext/util.c | 71 +
+ tools/testing/selftests/sched_ext/util.h | 13 +
+ 97 files changed, 16174 insertions(+), 130 deletions(-)
+ create mode 100644 Documentation/scheduler/sched-ext.rst
+ create mode 100644 include/linux/sched/ext.h
+ create mode 100644 include/trace/events/sched_ext.h
+ create mode 100644 kernel/sched/ext.c
+ create mode 100644 kernel/sched/ext.h
+ create mode 100644 tools/sched_ext/.gitignore
+ create mode 100644 tools/sched_ext/Makefile
+ create mode 100644 tools/sched_ext/README.md
+ create mode 100644 tools/sched_ext/include/bpf-compat/gnu/stubs.h
+ create mode 100644 tools/sched_ext/include/scx/common.bpf.h
+ create mode 100644 tools/sched_ext/include/scx/common.h
+ create mode 100644 tools/sched_ext/include/scx/compat.bpf.h
+ create mode 100644 tools/sched_ext/include/scx/compat.h
+ create mode 100644 tools/sched_ext/include/scx/user_exit_info.h
+ create mode 100644 tools/sched_ext/scx_central.bpf.c
+ create mode 100644 tools/sched_ext/scx_central.c
+ create mode 100644 tools/sched_ext/scx_flatcg.bpf.c
+ create mode 100644 tools/sched_ext/scx_flatcg.c
+ create mode 100644 tools/sched_ext/scx_flatcg.h
+ create mode 100644 tools/sched_ext/scx_qmap.bpf.c
+ create mode 100644 tools/sched_ext/scx_qmap.c
+ create mode 100644 tools/sched_ext/scx_show_state.py
+ create mode 100644 tools/sched_ext/scx_simple.bpf.c
+ create mode 100644 tools/sched_ext/scx_simple.c
+ create mode 100644 tools/testing/selftests/sched_ext/.gitignore
+ create mode 100644 tools/testing/selftests/sched_ext/Makefile
+ create mode 100644 tools/testing/selftests/sched_ext/config
+ create mode 100644 tools/testing/selftests/sched_ext/create_dsq.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/create_dsq.c
+ create mode 100644 tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c
+ create mode 100644 tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c
+ create mode 100644 tools/testing/selftests/sched_ext/dsp_local_on.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/dsp_local_on.c
+ create mode 100644 tools/testing/selftests/sched_ext/enq_last_no_enq_fails.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c
+ create mode 100644 tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/enq_select_cpu_fails.c
+ create mode 100644 tools/testing/selftests/sched_ext/exit.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/exit.c
+ create mode 100644 tools/testing/selftests/sched_ext/exit_test.h
+ create mode 100644 tools/testing/selftests/sched_ext/hotplug.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/hotplug.c
+ create mode 100644 tools/testing/selftests/sched_ext/hotplug_test.h
+ create mode 100644 tools/testing/selftests/sched_ext/init_enable_count.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/init_enable_count.c
+ create mode 100644 tools/testing/selftests/sched_ext/maximal.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/maximal.c
+ create mode 100644 tools/testing/selftests/sched_ext/maybe_null.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/maybe_null.c
+ create mode 100644 tools/testing/selftests/sched_ext/maybe_null_fail_dsp.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/maybe_null_fail_yld.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/minimal.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/minimal.c
+ create mode 100644 tools/testing/selftests/sched_ext/prog_run.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/prog_run.c
+ create mode 100644 tools/testing/selftests/sched_ext/reload_loop.c
+ create mode 100644 tools/testing/selftests/sched_ext/runner.c
+ create mode 100644 tools/testing/selftests/sched_ext/scx_test.h
+ create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl.c
+ create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c
+ create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch.c
+ create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c
+ create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c
+ create mode 100644 tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/select_cpu_vtime.c
+ create mode 100644 tools/testing/selftests/sched_ext/test_example.c
+ create mode 100644 tools/testing/selftests/sched_ext/util.c
+ create mode 100644 tools/testing/selftests/sched_ext/util.h
+
diff --git a/Documentation/scheduler/index.rst b/Documentation/scheduler/index.rst
index 43bd8a145b7a..0611dc3dda8e 100644
--- a/Documentation/scheduler/index.rst
@@ -12,10 +193,10 @@ index 43bd8a145b7a..0611dc3dda8e 100644
text_files
diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst
new file mode 100644
-index 000000000000..a707d2181a77
+index 000000000000..6c0d70e2e27d
--- /dev/null
+++ b/Documentation/scheduler/sched-ext.rst
-@@ -0,0 +1,316 @@
+@@ -0,0 +1,326 @@
+==========================
+Extensible Scheduler Class
+==========================
@@ -101,6 +282,15 @@ index 000000000000..a707d2181a77
+ # cat /sys/kernel/sched_ext/root/ops
+ simple
+
++You can check if any BPF scheduler has ever been loaded since boot by examining
++this monotonically incrementing counter (a value of zero indicates that no BPF
++scheduler has been loaded):
++
++.. code-block:: none
++
++ # cat /sys/kernel/sched_ext/enable_seq
++ 1
++
+``tools/sched_ext/scx_show_state.py`` is a drgn script which shows more
+detailed information:
+
@@ -114,6 +304,7 @@ index 000000000000..a707d2181a77
+ enable_state : enabled (2)
+ bypass_depth : 0
+ nr_rejected : 0
++ enable_seq : 1
+
+If ``CONFIG_SCHED_DEBUG`` is set, whether a given task is on sched_ext can
+be determined as follows:
@@ -333,10 +524,10 @@ index 000000000000..a707d2181a77
+possible, they are subject to change without warning between kernel
+versions.
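
To complement the documentation diff above, here is a rough sketch (not part of the patch) of a minimal BPF scheduler written against the ``tools/sched_ext/include/scx`` headers that this patch adds; the ``minimal_*`` names are illustrative, and ``tools/sched_ext/scx_simple.bpf.c`` is the real, complete example shipped by the patch:

.. code-block:: c

   /* Illustrative sketch, not part of the patch: queue every runnable task
    * on the shared global DSQ; CPUs fall back to consuming it when their
    * local DSQs are empty. */
   #include <scx/common.bpf.h>

   char _license[] SEC("license") = "GPL";

   void BPF_STRUCT_OPS(minimal_enqueue, struct task_struct *p, u64 enq_flags)
   {
           scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
   }

   SEC(".struct_ops.link")
   struct sched_ext_ops minimal_ops = {
           .enqueue = (void *)minimal_enqueue,
           .name    = "minimal",
   };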
diff --git a/MAINTAINERS b/MAINTAINERS
-index 958e935449e5..17d2679d291a 100644
+index c2a7363e86fe..bcfe36daf67a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
-@@ -19917,6 +19917,19 @@ F: include/linux/wait.h
+@@ -20364,6 +20364,19 @@ F: include/linux/wait.h
F: include/uapi/linux/sched.h
F: kernel/sched/
@@ -353,11 +544,11 @@ index 958e935449e5..17d2679d291a 100644
+F: tools/sched_ext/
+F: tools/testing/selftests/sched_ext
+
- SCSI LIBSAS SUBSYSTEM
- R: John Garry <john.g.garry@oracle.com>
- R: Jason Yan <yanaijie@huawei.com>
+ SCIOSENSE ENS160 MULTI-GAS SENSOR DRIVER
+ M: Gustavo Silva <gustavograzs@gmail.com>
+ S: Maintained
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
-index e5974b8239c9..167e877b8bef 100644
+index 14f8f00fdcf9..930b04e3d148 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -531,6 +531,7 @@ static const struct sysrq_key_op *sysrq_key_table[62] = {
@@ -369,7 +560,7 @@ index e5974b8239c9..167e877b8bef 100644
NULL, /* T */
NULL, /* U */
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
-index 70bf1004076b..a8417d31e348 100644
+index 1ae44793132a..19ec49a9179b 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -133,6 +133,7 @@
@@ -381,10 +572,10 @@ index 70bf1004076b..a8417d31e348 100644
__sched_class_lowest = .;
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
-index 2150ca60394b..3cdaec701600 100644
+index c60ba0ab1462..7139b33cb104 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
-@@ -29,8 +29,6 @@
+@@ -28,8 +28,6 @@
struct kernel_clone_args;
@@ -393,7 +584,7 @@ index 2150ca60394b..3cdaec701600 100644
/*
* All weight knobs on the default hierarchy should use the following min,
* default and max values. The default value is the logarithmic center of
-@@ -40,6 +38,8 @@ struct kernel_clone_args;
+@@ -39,6 +37,8 @@ struct kernel_clone_args;
#define CGROUP_WEIGHT_DFL 100
#define CGROUP_WEIGHT_MAX 10000
@@ -403,10 +594,10 @@ index 2150ca60394b..3cdaec701600 100644
CSS_TASK_ITER_PROCS = (1U << 0), /* walk only threadgroup leaders */
CSS_TASK_ITER_THREADED = (1U << 1), /* walk all threaded css_sets in the domain */
diff --git a/include/linux/sched.h b/include/linux/sched.h
-index 76214d7c819d..0f3a107bcd02 100644
+index f8d150343d42..5b4f78fe379d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
-@@ -80,6 +80,8 @@ struct task_group;
+@@ -82,6 +82,8 @@ struct task_group;
struct task_struct;
struct user_event_mm;
@@ -415,7 +606,7 @@ index 76214d7c819d..0f3a107bcd02 100644
/*
* Task state bitmask. NOTE! These bits are also
* encoded in fs/proc/array.c: get_task_state().
-@@ -802,6 +804,9 @@ struct task_struct {
+@@ -810,6 +812,9 @@ struct task_struct {
struct sched_rt_entity rt;
struct sched_dl_entity dl;
struct sched_dl_entity *dl_server;
@@ -427,10 +618,10 @@ index 76214d7c819d..0f3a107bcd02 100644
#ifdef CONFIG_SCHED_CORE
diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
new file mode 100644
-index 000000000000..26e1c33bc844
+index 000000000000..76166d3b14fc
--- /dev/null
+++ b/include/linux/sched/ext.h
-@@ -0,0 +1,204 @@
+@@ -0,0 +1,216 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
@@ -553,9 +744,17 @@ index 000000000000..26e1c33bc844
+ __SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
+};
+
++enum scx_dsq_lnode_flags {
++ SCX_DSQ_LNODE_ITER_CURSOR = 1 << 0,
++
++ /* high 16 bits can be for iter cursor flags */
++ __SCX_DSQ_LNODE_PRIV_SHIFT = 16,
++};
++
+struct scx_dsq_list_node {
+ struct list_head node;
-+ bool is_bpf_iter_cursor;
++ u32 flags;
++ u32 priv; /* can be used by iter cursor */
+};
+
+/*
@@ -612,15 +811,19 @@ index 000000000000..26e1c33bc844
+ * If set, reject future sched_setscheduler(2) calls updating the policy
+ * to %SCHED_EXT with -%EACCES.
+ *
-+ * If set from ops.init_task() and the task's policy is already
-+ * %SCHED_EXT, which can happen while the BPF scheduler is being loaded
-+ * or by inhering the parent's policy during fork, the task's policy is
-+ * rejected and forcefully reverted to %SCHED_NORMAL. The number of
-+ * such events are reported through /sys/kernel/debug/sched_ext::nr_rejected.
++ * Can be set from ops.init_task() while the BPF scheduler is being
++ * loaded (!scx_init_task_args->fork). If set and the task's policy is
++ * already %SCHED_EXT, the task's policy is rejected and forcefully
++ * reverted to %SCHED_NORMAL. The number of such events is reported
++ * through /sys/kernel/debug/sched_ext::nr_rejected. Setting this flag
++ * during fork is not allowed.
+ */
+ bool disallow; /* reject switching into SCX */
+
+ /* cold fields */
++#ifdef CONFIG_EXT_GROUP_SCHED
++ struct cgroup *cgrp_moving_from;
++#endif
+ /* must be the last field, see init_scx_entity() */
+ struct list_head tasks_node;
+};
@@ -636,7 +839,7 @@ index 000000000000..26e1c33bc844
+#endif /* CONFIG_SCHED_CLASS_EXT */
+#endif /* _LINUX_SCHED_EXT_H */
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
-index d362aacf9f89..4df2f9055587 100644
+index d362aacf9f89..0f2aeb37bbb0 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -63,7 +63,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev);
@@ -649,6 +852,18 @@ index d362aacf9f89..4df2f9055587 100644
extern void sched_post_fork(struct task_struct *p);
extern void sched_dead(struct task_struct *p);
+@@ -119,6 +120,11 @@ static inline struct task_struct *get_task_struct(struct task_struct *t)
+ return t;
+ }
+
++static inline struct task_struct *tryget_task_struct(struct task_struct *t)
++{
++ return refcount_inc_not_zero(&t->usage) ? t : NULL;
++}
++
+ extern void __put_task_struct(struct task_struct *t);
+ extern void __put_task_struct_rcu_cb(struct rcu_head *rhp);
+
diff --git a/include/trace/events/sched_ext.h b/include/trace/events/sched_ext.h
new file mode 100644
index 000000000000..fe19da7315a9
@@ -699,6 +914,37 @@ index 3bac0a8ceab2..359a14cc76a4 100644
/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
#define SCHED_RESET_ON_FORK 0x40000000
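
The ``include/uapi/linux/sched.h`` hunk above (one added line per the diffstat) is where the new ``SCHED_EXT`` policy number comes from. For orientation only (not part of the patch), a task can opt into that policy from userspace with an ordinary ``sched_setscheduler()`` call once a BPF scheduler is loaded, e.g. one registered with ``SCX_OPS_SWITCH_PARTIAL``; a minimal sketch, assuming the mainline policy value of 7:

.. code-block:: c

   /* Illustrative sketch, not part of the patch. */
   #include <sched.h>
   #include <stdio.h>

   #ifndef SCHED_EXT
   #define SCHED_EXT 7   /* assumed to match include/uapi/linux/sched.h */
   #endif

   int main(void)
   {
           struct sched_param param = { .sched_priority = 0 };

           if (sched_setscheduler(0, SCHED_EXT, &param)) {
                   perror("sched_setscheduler(SCHED_EXT)");
                   return 1;
           }
           printf("task is now handled by the loaded sched_ext scheduler\n");
           return 0;
   }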
+diff --git a/init/Kconfig b/init/Kconfig
+index 08a0d51afaae..e1a88d48d652 100644
+--- a/init/Kconfig
++++ b/init/Kconfig
+@@ -1028,9 +1028,13 @@ menuconfig CGROUP_SCHED
+ tasks.
+
+ if CGROUP_SCHED
++config GROUP_SCHED_WEIGHT
++ def_bool n
++
+ config FAIR_GROUP_SCHED
+ bool "Group scheduling for SCHED_OTHER"
+ depends on CGROUP_SCHED
++ select GROUP_SCHED_WEIGHT
+ default CGROUP_SCHED
+
+ config CFS_BANDWIDTH
+@@ -1055,6 +1059,12 @@ config RT_GROUP_SCHED
+ realtime bandwidth for them.
+ See Documentation/scheduler/sched-rt-group.rst for more information.
+
++config EXT_GROUP_SCHED
++ bool
++ depends on SCHED_CLASS_EXT && CGROUP_SCHED
++ select GROUP_SCHED_WEIGHT
++ default y
++
+ endif #CGROUP_SCHED
+
+ config SCHED_MM_CID
diff --git a/init/init_task.c b/init/init_task.c
index eeb110c65fe2..e222722e790b 100644
--- a/init/init_task.c
@@ -730,10 +976,10 @@ index eeb110c65fe2..e222722e790b 100644
.ptraced = LIST_HEAD_INIT(init_task.ptraced),
.ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry),
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
-index c2f1fd95a821..f3d140c3acc1 100644
+index c2f1fd95a821..fe782cd77388 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
-@@ -133,4 +133,28 @@ config SCHED_CORE
+@@ -133,4 +133,29 @@ config SCHED_CORE
which is the likely usage by Linux distributions, there should
be no measurable impact on performance.
@@ -741,6 +987,7 @@ index c2f1fd95a821..f3d140c3acc1 100644
+config SCHED_CLASS_EXT
+ bool "Extensible Scheduling Class"
+ depends on BPF_SYSCALL && BPF_JIT && DEBUG_INFO_BTF
++ select STACKTRACE if STACKTRACE_SUPPORT
+ help
+ This option enables a new scheduler class sched_ext (SCX), which
+ allows scheduling policies to be implemented as BPF programs to
@@ -764,7 +1011,7 @@ index c2f1fd95a821..f3d140c3acc1 100644
+ Documentation/scheduler/sched-ext.rst
+ https://github.com/sched-ext/scx
diff --git a/kernel/fork.c b/kernel/fork.c
-index 99076dbe27d8..741d962db0d9 100644
+index 238695afc630..69a0a7210060 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -23,6 +23,7 @@
@@ -775,7 +1022,7 @@ index 99076dbe27d8..741d962db0d9 100644
#include <linux/seq_file.h>
#include <linux/rtmutex.h>
#include <linux/init.h>
-@@ -971,6 +972,7 @@ void __put_task_struct(struct task_struct *tsk)
+@@ -973,6 +974,7 @@ void __put_task_struct(struct task_struct *tsk)
WARN_ON(refcount_read(&tsk->usage));
WARN_ON(tsk == current);
@@ -783,7 +1030,7 @@ index 99076dbe27d8..741d962db0d9 100644
io_uring_free(tsk);
cgroup_free(tsk);
task_numa_free(tsk, true);
-@@ -2363,7 +2365,7 @@ __latent_entropy struct task_struct *copy_process(
+@@ -2355,7 +2357,7 @@ __latent_entropy struct task_struct *copy_process(
retval = perf_event_init_task(p, clone_flags);
if (retval)
@@ -792,7 +1039,7 @@ index 99076dbe27d8..741d962db0d9 100644
retval = audit_alloc(p);
if (retval)
goto bad_fork_cleanup_perf;
-@@ -2496,7 +2498,9 @@ __latent_entropy struct task_struct *copy_process(
+@@ -2488,7 +2490,9 @@ __latent_entropy struct task_struct *copy_process(
* cgroup specific, it unconditionally needs to place the task on a
* runqueue.
*/
@@ -803,7 +1050,7 @@ index 99076dbe27d8..741d962db0d9 100644
/*
* From this point on we must avoid any synchronous user-space
-@@ -2542,13 +2546,13 @@ __latent_entropy struct task_struct *copy_process(
+@@ -2534,13 +2538,13 @@ __latent_entropy struct task_struct *copy_process(
/* Don't start children in a dying pid namespace */
if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
retval = -ENOMEM;
@@ -819,7 +1066,7 @@ index 99076dbe27d8..741d962db0d9 100644
}
/* No more failure paths after this point. */
-@@ -2622,10 +2626,11 @@ __latent_entropy struct task_struct *copy_process(
+@@ -2614,10 +2618,11 @@ __latent_entropy struct task_struct *copy_process(
return p;
@@ -832,7 +1079,7 @@ index 99076dbe27d8..741d962db0d9 100644
cgroup_cancel_fork(p, args);
bad_fork_put_pidfd:
if (clone_flags & CLONE_PIDFD) {
-@@ -2664,6 +2669,8 @@ __latent_entropy struct task_struct *copy_process(
+@@ -2656,6 +2661,8 @@ __latent_entropy struct task_struct *copy_process(
audit_free(p);
bad_fork_cleanup_perf:
perf_event_free_task(p);
@@ -842,7 +1089,7 @@ index 99076dbe27d8..741d962db0d9 100644
lockdep_free_task(p);
#ifdef CONFIG_NUMA
diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
-index d9dc9ab3773f..e7d539bb721e 100644
+index 39c315182b35..fae1f5c921eb 100644
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@@ -16,18 +16,25 @@
@@ -871,18 +1118,20 @@ index d9dc9ab3773f..e7d539bb721e 100644
#include <uapi/linux/sched/types.h>
-@@ -52,3 +59,6 @@
+@@ -52,4 +59,8 @@
#include "cputime.c"
#include "deadline.c"
+#ifdef CONFIG_SCHED_CLASS_EXT
+# include "ext.c"
+#endif
++
+ #include "syscalls.c"
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
-index ebf21373f663..fb6276f74ee6 100644
+index f3951e4a55e5..c792a6feb7a9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
-@@ -168,7 +168,10 @@ static inline int __task_prio(const struct task_struct *p)
+@@ -169,7 +169,10 @@ static inline int __task_prio(const struct task_struct *p)
if (p->sched_class == &idle_sched_class)
return MAX_RT_PRIO + NICE_WIDTH; /* 140 */
@@ -894,7 +1143,7 @@ index ebf21373f663..fb6276f74ee6 100644
}
/*
-@@ -197,6 +200,11 @@ static inline bool prio_less(const struct task_struct *a,
+@@ -198,6 +201,11 @@ static inline bool prio_less(const struct task_struct *a,
if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
return cfs_prio_less(a, b, in_fi);
@@ -906,7 +1155,7 @@ index ebf21373f663..fb6276f74ee6 100644
return false;
}
-@@ -1254,11 +1262,14 @@ bool sched_can_stop_tick(struct rq *rq)
+@@ -1255,11 +1263,14 @@ bool sched_can_stop_tick(struct rq *rq)
return true;
/*
@@ -918,14 +1167,14 @@ index ebf21373f663..fb6276f74ee6 100644
+ * involuntary preemption. For SCX, ask.
*/
- if (rq->nr_running > 1)
-+ if (!scx_switched_all() && rq->nr_running > 1)
++ if (scx_enabled() && !scx_can_stop_tick(rq))
+ return false;
+
-+ if (scx_enabled() && !scx_can_stop_tick(rq))
++ if (rq->cfs.nr_running > 1)
return false;
/*
-@@ -1340,8 +1351,8 @@ static void set_load_weight(struct task_struct *p, bool update_load)
+@@ -1341,8 +1352,8 @@ void set_load_weight(struct task_struct *p, bool update_load)
* SCHED_OTHER tasks have to update their load when changing their
* weight
*/
@@ -936,7 +1185,7 @@ index ebf21373f663..fb6276f74ee6 100644
else
p->se.load = lw;
}
-@@ -2210,6 +2221,17 @@ inline int task_curr(const struct task_struct *p)
+@@ -2031,6 +2042,17 @@ inline int task_curr(const struct task_struct *p)
return cpu_curr(task_cpu(p)) == p;
}
@@ -954,20 +1203,25 @@ index ebf21373f663..fb6276f74ee6 100644
/*
* switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
* use the balance_callback list if you want balancing.
-@@ -2217,9 +2239,9 @@ inline int task_curr(const struct task_struct *p)
- * this means any call to check_class_changed() must be followed by a call to
- * balance_callback().
- */
--static inline void check_class_changed(struct rq *rq, struct task_struct *p,
-- const struct sched_class *prev_class,
-- int oldprio)
-+void check_class_changed(struct rq *rq, struct task_struct *p,
-+ const struct sched_class *prev_class,
-+ int oldprio)
+@@ -2289,7 +2311,7 @@ static inline bool rq_has_pinned_tasks(struct rq *rq)
+ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
{
- if (prev_class != p->sched_class) {
- if (prev_class->switched_from)
-@@ -3982,6 +4004,15 @@ bool cpus_share_resources(int this_cpu, int that_cpu)
+ /* When not in the task's cpumask, no point in looking further. */
+- if (!cpumask_test_cpu(cpu, p->cpus_ptr))
++ if (!task_allowed_on_cpu(p, cpu))
+ return false;
+
+ /* migrate_disabled() must be allowed to finish. */
+@@ -2298,7 +2320,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
+
+ /* Non kernel threads are not allowed during either online or offline. */
+ if (!(p->flags & PF_KTHREAD))
+- return cpu_active(cpu) && task_cpu_possible(cpu, p);
++ return cpu_active(cpu);
+
+ /* KTHREAD_IS_PER_CPU is always allowed. */
+ if (kthread_is_per_cpu(p))
+@@ -3775,6 +3797,15 @@ bool cpus_share_resources(int this_cpu, int that_cpu)
static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
{
@@ -983,7 +1237,7 @@ index ebf21373f663..fb6276f74ee6 100644
/*
* Do not complicate things with the async wake_list while the CPU is
* in hotplug state.
-@@ -4549,6 +4580,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
+@@ -4342,6 +4373,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->rt.on_rq = 0;
p->rt.on_list = 0;
@@ -994,7 +1248,7 @@ index ebf21373f663..fb6276f74ee6 100644
#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif
-@@ -4789,10 +4824,18 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
+@@ -4582,10 +4617,18 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
if (dl_prio(p->prio))
return -EAGAIN;
@@ -1015,7 +1269,7 @@ index ebf21373f663..fb6276f74ee6 100644
init_entity_runnable_average(&p->se);
-@@ -4812,7 +4855,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
+@@ -4605,7 +4648,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
return 0;
}
@@ -1024,7 +1278,7 @@ index ebf21373f663..fb6276f74ee6 100644
{
unsigned long flags;
-@@ -4974,6 +4974,13 @@
+@@ -4632,11 +4675,19 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
if (p->sched_class->task_fork)
p->sched_class->task_fork(p);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -1038,15 +1292,13 @@ index ebf21373f663..fb6276f74ee6 100644
}
void sched_post_fork(struct task_struct *p)
-@@ -4982,6 +4989,7 @@
- sched_post_fork_bore(p);
- #endif // CONFIG_SCHED_BORE
+ {
uclamp_post_fork(p);
+ scx_post_fork(p);
}
unsigned long to_ratio(u64 period, u64 runtime)
-@@ -5685,6 +5736,7 @@ void sched_tick(void)
+@@ -5469,6 +5520,7 @@ void sched_tick(void)
calc_global_load_tick(rq);
sched_core_tick(rq);
task_tick_mm_cid(rq, curr);
@@ -1054,7 +1306,7 @@ index ebf21373f663..fb6276f74ee6 100644
rq_unlock(rq, &rf);
-@@ -5697,8 +5749,10 @@ void sched_tick(void)
+@@ -5481,8 +5533,10 @@ void sched_tick(void)
wq_worker_tick(curr);
#ifdef CONFIG_SMP
@@ -1067,10 +1319,11 @@ index ebf21373f663..fb6276f74ee6 100644
#endif
}
-@@ -5989,7 +6043,19 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
+@@ -5772,8 +5826,19 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
+ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
struct rq_flags *rf)
{
- #ifdef CONFIG_SMP
+-#ifdef CONFIG_SMP
+ const struct sched_class *start_class = prev->sched_class;
const struct sched_class *class;
+
@@ -1080,23 +1333,28 @@ index ebf21373f663..fb6276f74ee6 100644
+ * when waking up from SCHED_IDLE. If @start_class is below SCX, start
+ * from SCX instead.
+ */
-+ if (sched_class_above(&ext_sched_class, start_class))
++ if (scx_enabled() && sched_class_above(&ext_sched_class, start_class))
+ start_class = &ext_sched_class;
+#endif
+
/*
* We must do the balancing pass before put_prev_task(), such
* that when we release the rq->lock the task is in the same
-@@ -5998,7 +6064,7 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
+@@ -5782,11 +5847,10 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
* We can terminate the balance pass as soon as we know there is
* a runnable task of @class priority or higher.
*/
- for_class_range(class, prev->sched_class, &idle_sched_class) {
+- if (class->balance(rq, prev, rf))
+ for_active_class_range(class, start_class, &idle_sched_class) {
- if (class->balance(rq, prev, rf))
++ if (class->balance && class->balance(rq, prev, rf))
break;
}
-@@ -6016,6 +6082,9 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+-#endif
+
+ put_prev_task(rq, prev);
+ }
+@@ -5800,6 +5864,9 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
const struct sched_class *class;
struct task_struct *p;
@@ -1106,7 +1364,7 @@ index ebf21373f663..fb6276f74ee6 100644
/*
* Optimization: we know that if all tasks are in the fair class we can
* call that function directly, but only if the @prev task wasn't of a
-@@ -6056,10 +6125,15 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+@@ -5840,10 +5907,15 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
if (prev->dl_server)
prev->dl_server = NULL;
@@ -1124,7 +1382,7 @@ index ebf21373f663..fb6276f74ee6 100644
}
BUG(); /* The idle class should always have a runnable task. */
-@@ -6089,7 +6163,7 @@ static inline struct task_struct *pick_task(struct rq *rq)
+@@ -5873,7 +5945,7 @@ static inline struct task_struct *pick_task(struct rq *rq)
const struct sched_class *class;
struct task_struct *p;
@@ -1133,14 +1391,7 @@ index ebf21373f663..fb6276f74ee6 100644
p = class->pick_task(rq);
if (p)
return p;
-@@ -7080,12 +7154,16 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag
- }
- EXPORT_SYMBOL(default_wake_function);
-
--static void __setscheduler_prio(struct task_struct *p, int prio)
-+void __setscheduler_prio(struct task_struct *p, int prio)
- {
- if (dl_prio(prio))
+@@ -6870,6 +6942,10 @@ void __setscheduler_prio(struct task_struct *p, int prio)
p->sched_class = &dl_sched_class;
else if (rt_prio(prio))
p->sched_class = &rt_sched_class;
@@ -1151,7 +1402,7 @@ index ebf21373f663..fb6276f74ee6 100644
else
p->sched_class = &fair_sched_class;
-@@ -7246,6 +7324,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
+@@ -7015,6 +7091,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
}
__setscheduler_prio(p, prio);
@@ -1159,68 +1410,7 @@ index ebf21373f663..fb6276f74ee6 100644
if (queued)
enqueue_task(rq, p, queue_flag);
-@@ -7467,6 +7546,25 @@ int sched_core_idle_cpu(int cpu)
- #endif
-
- #ifdef CONFIG_SMP
-+/*
-+ * Load avg and utiliztion metrics need to be updated periodically and before
-+ * consumption. This function updates the metrics for all subsystems except for
-+ * the fair class. @rq must be locked and have its clock updated.
-+ */
-+bool update_other_load_avgs(struct rq *rq)
-+{
-+ u64 now = rq_clock_pelt(rq);
-+ const struct sched_class *curr_class = rq->curr->sched_class;
-+ unsigned long hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
-+
-+ lockdep_assert_rq_held(rq);
-+
-+ return update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
-+ update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
-+ update_hw_load_avg(now, rq, hw_pressure) |
-+ update_irq_load_avg(rq, 0);
-+}
-+
- /*
- * This function computes an effective utilization for the given CPU, to be
- * used for frequency selection given the linear relation: f = u * f_max.
-@@ -7789,6 +7887,10 @@ static int __sched_setscheduler(struct task_struct *p,
- goto unlock;
- }
-
-+ retval = scx_check_setscheduler(p, policy);
-+ if (retval)
-+ goto unlock;
-+
- /*
- * If not changing anything there's no need to proceed further,
- * but store a possible modification of reset_on_fork.
-@@ -7891,6 +7993,7 @@ static int __sched_setscheduler(struct task_struct *p,
- __setscheduler_prio(p, newprio);
- }
- __setscheduler_uclamp(p, attr);
-+ check_class_changing(rq, p, prev_class);
-
- if (queued) {
- /*
-@@ -9066,6 +9169,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
- case SCHED_NORMAL:
- case SCHED_BATCH:
- case SCHED_IDLE:
-+ case SCHED_EXT:
- ret = 0;
- break;
- }
-@@ -9093,6 +9197,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
- case SCHED_NORMAL:
- case SCHED_BATCH:
- case SCHED_IDLE:
-+ case SCHED_EXT:
- ret = 0;
- }
- return ret;
-@@ -9188,6 +9293,7 @@ void sched_show_task(struct task_struct *p)
+@@ -7429,6 +7506,7 @@ void sched_show_task(struct task_struct *p)
print_worker_info(KERN_INFO, p);
print_stop_info(KERN_INFO, p);
@@ -1228,7 +1418,7 @@ index ebf21373f663..fb6276f74ee6 100644
show_stack(p, NULL, KERN_INFO);
put_task_stack(p);
}
-@@ -9680,6 +9786,8 @@ int sched_cpu_activate(unsigned int cpu)
+@@ -7957,6 +8035,8 @@ int sched_cpu_activate(unsigned int cpu)
cpuset_cpu_active();
}
@@ -1237,7 +1427,7 @@ index ebf21373f663..fb6276f74ee6 100644
/*
* Put the rq online, if not already. This happens:
*
-@@ -9903,6 +9903,8 @@
+@@ -8006,6 +8086,8 @@ int sched_cpu_deactivate(unsigned int cpu)
sched_set_rq_offline(rq, cpu);
@@ -1246,7 +1436,7 @@ index ebf21373f663..fb6276f74ee6 100644
/*
* When going down, decrement the number of cores with SMT present.
*/
-@@ -10061,11 +10061,15 @@
+@@ -8192,11 +8192,15 @@
int i;
/* Make sure the linker didn't screw up */
@@ -1266,7 +1456,17 @@ index ebf21373f663..fb6276f74ee6 100644
#endif
#ifdef CONFIG_SCHED_BORE
-@@ -10096,6 +10210,7 @@ void __init sched_init(void)
+@@ -8218,6 +8304,9 @@ void __init sched_init(void)
+ root_task_group.shares = ROOT_TASK_GROUP_LOAD;
+ init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL);
+ #endif /* CONFIG_FAIR_GROUP_SCHED */
++#ifdef CONFIG_EXT_GROUP_SCHED
++ root_task_group.scx_weight = CGROUP_WEIGHT_DFL;
++#endif /* CONFIG_EXT_GROUP_SCHED */
+ #ifdef CONFIG_RT_GROUP_SCHED
+ root_task_group.rt_se = (struct sched_rt_entity **)ptr;
+ ptr += nr_cpu_ids * sizeof(void **);
+@@ -8363,6 +8452,7 @@ void __init sched_init(void)
balance_push_set(smp_processor_id(), false);
#endif
init_sched_fair_class();
@@ -1274,7 +1474,23 @@ index ebf21373f663..fb6276f74ee6 100644
psi_init();
-@@ -10522,11 +10637,6 @@ void sched_move_task(struct task_struct *tsk)
+@@ -8648,6 +8738,7 @@ struct task_group *sched_create_group(struct task_group *parent)
+ if (!alloc_rt_sched_group(tg, parent))
+ goto err;
+
++ scx_group_set_weight(tg, CGROUP_WEIGHT_DFL);
+ alloc_uclamp_sched_group(tg, parent);
+
+ return tg;
+@@ -8775,6 +8866,7 @@ void sched_move_task(struct task_struct *tsk)
+ put_prev_task(rq, tsk);
+
+ sched_change_group(tsk, group);
++ scx_move_task(tsk);
+
+ if (queued)
+ enqueue_task(rq, tsk, queue_flags);
+@@ -8789,11 +8881,6 @@ void sched_move_task(struct task_struct *tsk)
}
}
@@ -1286,16 +1502,154 @@ index ebf21373f663..fb6276f74ee6 100644
static struct cgroup_subsys_state *
cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
-@@ -11293,29 +11403,27 @@ static int cpu_local_stat_show(struct seq_file *sf,
+@@ -8817,6 +8904,11 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
+ {
+ struct task_group *tg = css_tg(css);
+ struct task_group *parent = css_tg(css->parent);
++ int ret;
++
++ ret = scx_tg_online(tg);
++ if (ret)
++ return ret;
+
+ if (parent)
+ sched_online_group(tg, parent);
+@@ -8831,6 +8923,13 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
+ return 0;
}
- #ifdef CONFIG_FAIR_GROUP_SCHED
++static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
++{
++ struct task_group *tg = css_tg(css);
++
++ scx_tg_offline(tg);
++}
+
+ static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
+ {
+ struct task_group *tg = css_tg(css);
+@@ -8848,9 +8947,9 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
+ sched_unregister_group(tg);
+ }
+
+-#ifdef CONFIG_RT_GROUP_SCHED
+ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
+ {
++#ifdef CONFIG_RT_GROUP_SCHED
+ struct task_struct *task;
+ struct cgroup_subsys_state *css;
+
+@@ -8858,9 +8957,9 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
+ if (!sched_rt_can_attach(css_tg(css), task))
+ return -EINVAL;
+ }
+- return 0;
+-}
+ #endif
++ return scx_cgroup_can_attach(tset);
++}
+
+ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
+ {
+@@ -8869,6 +8968,13 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
+
+ cgroup_taskset_for_each(task, css, tset)
+ sched_move_task(task);
++
++ scx_cgroup_finish_attach();
++}
++
++static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset)
++{
++ scx_cgroup_cancel_attach(tset);
+ }
+
+ #ifdef CONFIG_UCLAMP_TASK_GROUP
+@@ -9045,22 +9151,36 @@ static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
+ }
+ #endif /* CONFIG_UCLAMP_TASK_GROUP */
+
++#ifdef CONFIG_GROUP_SCHED_WEIGHT
+static unsigned long tg_weight(struct task_group *tg)
+{
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+ return scale_load_down(tg->shares);
++#else
++ return sched_weight_from_cgroup(tg->scx_weight);
++#endif
+}
+
+ static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 shareval)
+ {
++ int ret;
++
+ if (shareval > scale_load_down(ULONG_MAX))
+ shareval = MAX_SHARES;
+- return sched_group_set_shares(css_tg(css), scale_load(shareval));
++ ret = sched_group_set_shares(css_tg(css), scale_load(shareval));
++ if (!ret)
++ scx_group_set_weight(css_tg(css),
++ sched_weight_to_cgroup(shareval));
++ return ret;
+ }
+
+ static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+ {
+- struct task_group *tg = css_tg(css);
+-
+- return (u64) scale_load_down(tg->shares);
++ return tg_weight(css_tg(css));
+ }
++#endif /* CONFIG_GROUP_SCHED_WEIGHT */
+
+ #ifdef CONFIG_CFS_BANDWIDTH
+ static DEFINE_MUTEX(cfs_constraints_mutex);
+@@ -9406,7 +9526,6 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v)
+ return 0;
+ }
+ #endif /* CONFIG_CFS_BANDWIDTH */
+-#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+ #ifdef CONFIG_RT_GROUP_SCHED
+ static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
+@@ -9434,7 +9553,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
+ }
+ #endif /* CONFIG_RT_GROUP_SCHED */
+
+-#ifdef CONFIG_FAIR_GROUP_SCHED
++#ifdef CONFIG_GROUP_SCHED_WEIGHT
+ static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+ {
+@@ -9444,12 +9563,17 @@ static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
+ static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft, s64 idle)
+ {
+- return sched_group_set_idle(css_tg(css), idle);
++ int ret;
++
++ ret = sched_group_set_idle(css_tg(css), idle);
++ if (!ret)
++ scx_group_set_idle(css_tg(css), idle);
++ return ret;
+ }
+ #endif
+
+ static struct cftype cpu_legacy_files[] = {
+-#ifdef CONFIG_FAIR_GROUP_SCHED
++#ifdef CONFIG_GROUP_SCHED_WEIGHT
+ {
+ .name = "shares",
+ .read_u64 = cpu_shares_read_u64,
+@@ -9559,38 +9683,35 @@ static int cpu_local_stat_show(struct seq_file *sf,
+ return 0;
+ }
+
+-#ifdef CONFIG_FAIR_GROUP_SCHED
++#ifdef CONFIG_GROUP_SCHED_WEIGHT
++
static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
@@ -1319,6 +1673,7 @@ index ebf21373f663..fb6276f74ee6 100644
- */
- if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
+ unsigned long weight;
++ int ret;
+
+ if (cgrp_weight < CGROUP_WEIGHT_MIN || cgrp_weight > CGROUP_WEIGHT_MAX)
return -ERANGE;
@@ -1326,9 +1681,13 @@ index ebf21373f663..fb6276f74ee6 100644
- weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
+ weight = sched_weight_from_cgroup(cgrp_weight);
- return sched_group_set_shares(css_tg(css), scale_load(weight));
+- return sched_group_set_shares(css_tg(css), scale_load(weight));
++ ret = sched_group_set_shares(css_tg(css), scale_load(weight));
++ if (!ret)
++ scx_group_set_weight(css_tg(css), cgrp_weight);
++ return ret;
}
-@@ -11323,7 +11431,7 @@ static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
+
static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
@@ -1337,7 +1696,58 @@ index ebf21373f663..fb6276f74ee6 100644
int last_delta = INT_MAX;
int prio, delta;
-@@ -12064,3 +12172,38 @@ void sched_mm_cid_fork(struct task_struct *t)
+@@ -9609,7 +9730,7 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft, s64 nice)
+ {
+ unsigned long weight;
+- int idx;
++ int idx, ret;
+
+ if (nice < MIN_NICE || nice > MAX_NICE)
+ return -ERANGE;
+@@ -9618,9 +9739,13 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
+ idx = array_index_nospec(idx, 40);
+ weight = sched_prio_to_weight[idx];
+
+- return sched_group_set_shares(css_tg(css), scale_load(weight));
++ ret = sched_group_set_shares(css_tg(css), scale_load(weight));
++ if (!ret)
++ scx_group_set_weight(css_tg(css),
++ sched_weight_to_cgroup(weight));
++ return ret;
+ }
+-#endif
++#endif /* CONFIG_GROUP_SCHED_WEIGHT */
+
+ static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
+ long period, long quota)
+@@ -9680,7 +9805,7 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of,
+ #endif
+
+ static struct cftype cpu_files[] = {
+-#ifdef CONFIG_FAIR_GROUP_SCHED
++#ifdef CONFIG_GROUP_SCHED_WEIGHT
+ {
+ .name = "weight",
+ .flags = CFTYPE_NOT_ON_ROOT,
+@@ -9734,14 +9859,14 @@ static struct cftype cpu_files[] = {
+ struct cgroup_subsys cpu_cgrp_subsys = {
+ .css_alloc = cpu_cgroup_css_alloc,
+ .css_online = cpu_cgroup_css_online,
++ .css_offline = cpu_cgroup_css_offline,
+ .css_released = cpu_cgroup_css_released,
+ .css_free = cpu_cgroup_css_free,
+ .css_extra_stat_show = cpu_extra_stat_show,
+ .css_local_stat_show = cpu_local_stat_show,
+-#ifdef CONFIG_RT_GROUP_SCHED
+ .can_attach = cpu_cgroup_can_attach,
+-#endif
+ .attach = cpu_cgroup_attach,
++ .cancel_attach = cpu_cgroup_cancel_attach,
+ .legacy_cftypes = cpu_legacy_files,
+ .dfl_cftypes = cpu_files,
+ .early_init = true,
+@@ -10331,3 +10456,38 @@ void sched_mm_cid_fork(struct task_struct *t)
t->mm_cid_active = 1;
}
#endif
@@ -1481,10 +1891,10 @@ index c1eb9a1afd13..c057ef46c5f8 100644
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
new file mode 100644
-index 000000000000..0dac88d0e578
+index 000000000000..25fadfaace33
--- /dev/null
+++ b/kernel/sched/ext.c
-@@ -0,0 +1,6532 @@
+@@ -0,0 +1,7262 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
@@ -1603,10 +2013,16 @@ index 000000000000..0dac88d0e578
+ */
+ SCX_OPS_SWITCH_PARTIAL = 1LLU << 3,
+
++ /*
++ * CPU cgroup support flags
++ */
++ SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* cpu.weight */
++
+ SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE |
+ SCX_OPS_ENQ_LAST |
+ SCX_OPS_ENQ_EXITING |
-+ SCX_OPS_SWITCH_PARTIAL,
++ SCX_OPS_SWITCH_PARTIAL |
++ SCX_OPS_HAS_CGROUP_WEIGHT,
+};
+
+/* argument container for ops.init_task() */
@@ -1616,6 +2032,10 @@ index 000000000000..0dac88d0e578
+ * to the scheduler transition path.
+ */
+ bool fork;
++#ifdef CONFIG_EXT_GROUP_SCHED
++ /* the cgroup the task is joining */
++ struct cgroup *cgroup;
++#endif
+};
+
+/* argument container for ops.exit_task() */
@@ -1624,6 +2044,12 @@ index 000000000000..0dac88d0e578
+ bool cancelled;
+};
+
++/* argument container for ops->cgroup_init() */
++struct scx_cgroup_init_args {
++ /* the weight of the cgroup [1..10000] */
++ u32 weight;
++};
++
+enum scx_cpu_preempt_reason {
+ /* next task is being scheduled by &sched_class_rt */
+ SCX_CPU_PREEMPT_RT,
@@ -1988,6 +2414,79 @@ index 000000000000..0dac88d0e578
+ */
+ void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p);
+
++#ifdef CONFIG_EXT_GROUP_SCHED
++ /**
++ * cgroup_init - Initialize a cgroup
++ * @cgrp: cgroup being initialized
++ * @args: init arguments, see the struct definition
++ *
++ * Either the BPF scheduler is being loaded or @cgrp created, initialize
++ * @cgrp for sched_ext. This operation may block.
++ *
++ * Return 0 for success, -errno for failure. An error return while
++ * loading will abort loading of the BPF scheduler. During cgroup
++ * creation, it will abort the specific cgroup creation.
++ */
++ s32 (*cgroup_init)(struct cgroup *cgrp,
++ struct scx_cgroup_init_args *args);
++
++ /**
++ * cgroup_exit - Exit a cgroup
++ * @cgrp: cgroup being exited
++ *
++ * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit
++ * @cgrp for sched_ext. This operation may block.
++ */
++ void (*cgroup_exit)(struct cgroup *cgrp);
++
++ /**
++ * cgroup_prep_move - Prepare a task to be moved to a different cgroup
++ * @p: task being moved
++ * @from: cgroup @p is being moved from
++ * @to: cgroup @p is being moved to
++ *
++ * Prepare @p for move from cgroup @from to @to. This operation may
++ * block and can be used for allocations.
++ *
++ * Return 0 for success, -errno for failure. An error return aborts the
++ * migration.
++ */
++ s32 (*cgroup_prep_move)(struct task_struct *p,
++ struct cgroup *from, struct cgroup *to);
++
++ /**
++ * cgroup_move - Commit cgroup move
++ * @p: task being moved
++ * @from: cgroup @p is being moved from
++ * @to: cgroup @p is being moved to
++ *
++ * Commit the move. @p is dequeued during this operation.
++ */
++ void (*cgroup_move)(struct task_struct *p,
++ struct cgroup *from, struct cgroup *to);
++
++ /**
++ * cgroup_cancel_move - Cancel cgroup move
++ * @p: task whose cgroup move is being canceled
++ * @from: cgroup @p was being moved from
++ * @to: cgroup @p was being moved to
++ *
++ * @p was cgroup_prep_move()'d but failed before reaching cgroup_move().
++ * Undo the preparation.
++ */
++ void (*cgroup_cancel_move)(struct task_struct *p,
++ struct cgroup *from, struct cgroup *to);
++
++ /**
++ * cgroup_set_weight - A cgroup's weight is being changed
++ * @cgrp: cgroup whose weight is being updated
++ * @weight: new weight [1..10000]
++ *
++ * Update @tg's weight to @weight.
++ */
++ void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight);
++#endif /* CONFIG_EXT_GROUP_SCHED */
++
+ /*
+ * All online ops must come before ops.cpu_online().
+ */
@@ -2173,8 +2672,12 @@ index 000000000000..0dac88d0e578
+ SCX_KICK_WAIT = 1LLU << 2,
+};
+
++enum scx_tg_flags {
++ SCX_TG_ONLINE = 1U << 0,
++ SCX_TG_INITED = 1U << 1,
++};
++
+enum scx_ops_enable_state {
-+ SCX_OPS_PREPPING,
+ SCX_OPS_ENABLING,
+ SCX_OPS_ENABLED,
+ SCX_OPS_DISABLING,
@@ -2182,7 +2685,6 @@ index 000000000000..0dac88d0e578
+};
+
+static const char *scx_ops_enable_state_str[] = {
-+ [SCX_OPS_PREPPING] = "prepping",
+ [SCX_OPS_ENABLING] = "enabling",
+ [SCX_OPS_ENABLED] = "enabled",
+ [SCX_OPS_DISABLING] = "disabling",
@@ -2250,6 +2752,7 @@ index 000000000000..0dac88d0e578
+DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
+static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED);
+static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0);
++static bool scx_ops_init_task_enabled;
+static bool scx_switching_all;
+DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
+
@@ -2261,7 +2764,7 @@ index 000000000000..0dac88d0e578
+static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
+static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
+
-+struct static_key_false scx_has_op[SCX_OPI_END] =
++static struct static_key_false scx_has_op[SCX_OPI_END] =
+ { [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT };
+
+static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE);
@@ -2271,6 +2774,13 @@ index 000000000000..0dac88d0e578
+static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
+
+/*
++ * A monotonically increasing sequence number that is incremented every time a
++ * scheduler is enabled. This can be used to check if any custom sched_ext
++ * scheduler has ever been used in the system.
++ */
++static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0);
++
++/*
+ * The maximum amount of time in jiffies that a task may be runnable without
+ * being scheduled on a CPU. If this timeout is exceeded, it will trigger
+ * scx_ops_error().
@@ -2314,8 +2824,15 @@ index 000000000000..0dac88d0e578
+ */
+static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task);
+
-+/* dispatch queues */
-+static struct scx_dispatch_q __cacheline_aligned_in_smp scx_dsq_global;
++/*
++ * Dispatch queues.
++ *
++ * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. This is
++ * to avoid live-locking in bypass mode where all tasks are dispatched to
++ * %SCX_DSQ_GLOBAL and all CPUs consume from it. If per-node split isn't
++ * sufficient, it can be further split.
++ */
++static struct scx_dispatch_q **global_dsqs;
+
+static const struct rhashtable_params dsq_hash_params = {
+ .key_len = 8,
@@ -2364,7 +2881,7 @@ index 000000000000..0dac88d0e578
+ struct scx_bstr_buf buf;
+};
+
-+struct scx_dump_data scx_dump_data = {
++static struct scx_dump_data scx_dump_data = {
+ .cpu = -1,
+};
+
@@ -2418,6 +2935,16 @@ index 000000000000..0dac88d0e578
+ return (s32)(a - b) < 0;
+}
+
++static struct scx_dispatch_q *find_global_dsq(struct task_struct *p)
++{
++ return global_dsqs[cpu_to_node(task_cpu(p))];
++}
++
++static struct scx_dispatch_q *find_user_dsq(u64 dsq_id)
++{
++ return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params);
++}
++
+/*
+ * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX
+ * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate
@@ -2554,6 +3081,11 @@ index 000000000000..0dac88d0e578
+ return true;
+}
+
++static bool scx_kf_allowed_if_unlocked(void)
++{
++ return !current->scx.kf_mask;
++}
++
+/**
+ * nldsq_next_task - Iterate to the next task in a non-local DSQ
+ * @dsq: user dsq being iterated
@@ -2587,7 +3119,7 @@ index 000000000000..0dac88d0e578
+
+ dsq_lnode = container_of(list_node, struct scx_dsq_list_node,
+ node);
-+ } while (dsq_lnode->is_bpf_iter_cursor);
++ } while (dsq_lnode->flags & SCX_DSQ_LNODE_ITER_CURSOR);
+
+ return container_of(dsq_lnode, struct task_struct, scx.dsq_list);
+}
@@ -2605,16 +3137,22 @@ index 000000000000..0dac88d0e578
+ */
+enum scx_dsq_iter_flags {
+ /* iterate in the reverse dispatch order */
-+ SCX_DSQ_ITER_REV = 1U << 0,
++ SCX_DSQ_ITER_REV = 1U << 16,
+
-+ __SCX_DSQ_ITER_ALL_FLAGS = SCX_DSQ_ITER_REV,
++ __SCX_DSQ_ITER_HAS_SLICE = 1U << 30,
++ __SCX_DSQ_ITER_HAS_VTIME = 1U << 31,
++
++ __SCX_DSQ_ITER_USER_FLAGS = SCX_DSQ_ITER_REV,
++ __SCX_DSQ_ITER_ALL_FLAGS = __SCX_DSQ_ITER_USER_FLAGS |
++ __SCX_DSQ_ITER_HAS_SLICE |
++ __SCX_DSQ_ITER_HAS_VTIME,
+};
+
+struct bpf_iter_scx_dsq_kern {
+ struct scx_dsq_list_node cursor;
+ struct scx_dispatch_q *dsq;
-+ u32 dsq_seq;
-+ u32 flags;
++ u64 slice;
++ u64 vtime;
+} __attribute__((aligned(8)));
+
+struct bpf_iter_scx_dsq {
@@ -2652,6 +3190,9 @@ index 000000000000..0dac88d0e578
+{
+ lockdep_assert_held(&scx_tasks_lock);
+
++ BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS &
++ ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1));
++
+ iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR };
+ list_add(&iter->cursor.tasks_node, &scx_tasks);
+ iter->locked = NULL;
@@ -2730,17 +3271,37 @@ index 000000000000..0dac88d0e578
+ * whether they would like to filter out dead tasks. See scx_task_iter_init()
+ * for details.
+ */
-+static struct task_struct *
-+scx_task_iter_next_locked(struct scx_task_iter *iter, bool include_dead)
++static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
+{
+ struct task_struct *p;
-+retry:
++
+ scx_task_iter_rq_unlock(iter);
+
+ while ((p = scx_task_iter_next(iter))) {
+ /*
-+ * is_idle_task() tests %PF_IDLE which may not be set for CPUs
-+ * which haven't yet been onlined. Test sched_class directly.
++ * scx_task_iter is used to prepare and move tasks into SCX
++ * while loading the BPF scheduler and vice-versa while
++ * unloading. The init_tasks ("swappers") should be excluded
++ * from the iteration because:
++ *
++ * - It's unsafe to use __setscheduler_prio() on an init_task to
++ * determine the sched_class to use as it won't preserve its
++ * idle_sched_class.
++ *
++ * - ops.init/exit_task() can easily be confused if called with
++ * init_tasks as they, e.g., share PID 0.
++ *
++ * As init_tasks are never scheduled through SCX, they can be
++ * skipped safely. Note that is_idle_task() which tests %PF_IDLE
++ * doesn't work here:
++ *
++ * - %PF_IDLE may not be set for an init_task whose CPU hasn't
++ * yet been onlined.
++ *
++ * - %PF_IDLE can be set on tasks that are not init_tasks. See
++ * play_idle_precise() used by CONFIG_IDLE_INJECT.
++ *
++ * Test for idle_sched_class as only init_tasks are on it.
+ */
+ if (p->sched_class != &idle_sched_class)
+ break;
@@ -2751,16 +3312,6 @@ index 000000000000..0dac88d0e578
+ iter->rq = task_rq_lock(p, &iter->rf);
+ iter->locked = p;
+
-+ /*
-+ * If we see %TASK_DEAD, @p already disabled preemption, is about to do
-+ * the final __schedule(), won't ever need to be scheduled again and can
-+ * thus be safely ignored. If we don't see %TASK_DEAD, @p can't enter
-+ * the final __schedle() while we're locking its rq and thus will stay
-+ * alive until the rq is unlocked.
-+ */
-+ if (!include_dead && READ_ONCE(p->__state) == TASK_DEAD)
-+ goto retry;
-+
+ return p;
+}
+
@@ -2783,9 +3334,9 @@ index 000000000000..0dac88d0e578
+ return atomic_try_cmpxchg(&scx_ops_enable_state_var, &from_v, to);
+}
+
-+static bool scx_ops_bypassing(void)
++static bool scx_rq_bypassing(struct rq *rq)
+{
-+ return unlikely(atomic_read(&scx_ops_bypass_depth));
++ return unlikely(rq->scx.flags & SCX_RQ_BYPASSING);
+}
+
+/**
@@ -2919,13 +3470,18 @@ index 000000000000..0dac88d0e578
+ */
+static void touch_core_sched(struct rq *rq, struct task_struct *p)
+{
++ lockdep_assert_rq_held(rq);
++
+#ifdef CONFIG_SCHED_CORE
+ /*
+ * It's okay to update the timestamp spuriously. Use
+ * sched_core_disabled() which is cheaper than enabled().
++ *
++ * As this is used to determine ordering between tasks of sibling CPUs,
++ * it may be better to use per-core dispatch sequence instead.
+ */
+ if (!sched_core_disabled())
-+ p->scx.core_sched_at = rq_clock_task(rq);
++ p->scx.core_sched_at = sched_clock_cpu(cpu_of(rq));
+#endif
+}
+
@@ -2942,7 +3498,6 @@ index 000000000000..0dac88d0e578
+static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p)
+{
+ lockdep_assert_rq_held(rq);
-+ assert_clock_updated(rq);
+
+#ifdef CONFIG_SCHED_CORE
+ if (SCX_HAS_OP(core_sched_before))
@@ -2953,20 +3508,14 @@ index 000000000000..0dac88d0e578
+static void update_curr_scx(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
-+ u64 now = rq_clock_task(rq);
-+ u64 delta_exec;
++ s64 delta_exec;
+
-+ if (time_before_eq64(now, curr->se.exec_start))
++ delta_exec = update_curr_common(rq);
++ if (unlikely(delta_exec <= 0))
+ return;
+
-+ delta_exec = now - curr->se.exec_start;
-+ curr->se.exec_start = now;
-+ curr->se.sum_exec_runtime += delta_exec;
-+ account_group_exec_runtime(curr, delta_exec);
-+ cgroup_account_cputime(curr, delta_exec);
-+
+ if (curr->scx.slice != SCX_SLICE_INF) {
-+ curr->scx.slice -= min(curr->scx.slice, delta_exec);
++ curr->scx.slice -= min_t(u64, curr->scx.slice, delta_exec);
+ if (!curr->scx.slice)
+ touch_core_sched(rq, curr);
+ }
@@ -3004,7 +3553,7 @@ index 000000000000..0dac88d0e578
+ scx_ops_error("attempting to dispatch to a destroyed dsq");
+ /* fall back to the global dsq */
+ raw_spin_unlock(&dsq->lock);
-+ dsq = &scx_dsq_global;
++ dsq = find_global_dsq(p);
+ raw_spin_lock(&dsq->lock);
+ }
+ }
@@ -3107,6 +3656,8 @@ index 000000000000..0dac88d0e578
+static void task_unlink_from_dsq(struct task_struct *p,
+ struct scx_dispatch_q *dsq)
+{
++ WARN_ON_ONCE(list_empty(&p->scx.dsq_list.node));
++
+ if (p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) {
+ rb_erase(&p->scx.dsq_priq, &dsq->priq);
+ RB_CLEAR_NODE(&p->scx.dsq_priq);
@@ -3114,6 +3665,7 @@ index 000000000000..0dac88d0e578
+ }
+
+ list_del_init(&p->scx.dsq_list.node);
++ dsq_mod_nr(dsq, -1);
+}
+
+static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
@@ -3150,9 +3702,7 @@ index 000000000000..0dac88d0e578
+ */
+ if (p->scx.holding_cpu < 0) {
+ /* @p must still be on @dsq, dequeue */
-+ WARN_ON_ONCE(list_empty(&p->scx.dsq_list.node));
+ task_unlink_from_dsq(p, dsq);
-+ dsq_mod_nr(dsq, -1);
+ } else {
+ /*
+ * We're racing against dispatch_to_local_dsq() which already
@@ -3169,21 +3719,6 @@ index 000000000000..0dac88d0e578
+ raw_spin_unlock(&dsq->lock);
+}
+
-+static struct scx_dispatch_q *find_user_dsq(u64 dsq_id)
-+{
-+ return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params);
-+}
-+
-+static struct scx_dispatch_q *find_non_local_dsq(u64 dsq_id)
-+{
-+ lockdep_assert(rcu_read_lock_any_held());
-+
-+ if (dsq_id == SCX_DSQ_GLOBAL)
-+ return &scx_dsq_global;
-+ else
-+ return find_user_dsq(dsq_id);
-+}
-+
+static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id,
+ struct task_struct *p)
+{
@@ -3192,11 +3727,24 @@ index 000000000000..0dac88d0e578
+ if (dsq_id == SCX_DSQ_LOCAL)
+ return &rq->scx.local_dsq;
+
-+ dsq = find_non_local_dsq(dsq_id);
++ if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
++ s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
++
++ if (!ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict"))
++ return find_global_dsq(p);
++
++ return &cpu_rq(cpu)->scx.local_dsq;
++ }
++
++ if (dsq_id == SCX_DSQ_GLOBAL)
++ dsq = find_global_dsq(p);
++ else
++ dsq = find_user_dsq(dsq_id);
++
+ if (unlikely(!dsq)) {
+ scx_ops_error("non-existent DSQ 0x%llx for %s[%d]",
+ dsq_id, p->comm, p->pid);
-+ return &scx_dsq_global;
++ return find_global_dsq(p);
+ }
+
+ return dsq;
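
A minimal BPF-side sketch of the dsq_id values that find_dsq_for_dispatch() decodes above, assuming the helpers from tools/sched_ext/include/scx/common.bpf.h; the example_* name and the pinned-task policy are illustrative, and the license/struct_ops registration boilerplate is omitted:

#include <scx/common.bpf.h>

void BPF_STRUCT_OPS(example_enqueue, struct task_struct *p, u64 enq_flags)
{
	s32 cpu = scx_bpf_task_cpu(p);

	if (p->nr_cpus_allowed == 1)
		/* pinned task: target that CPU's local DSQ directly */
		scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL,
				 enq_flags);
	else
		/* otherwise fall back to the (per-node) global DSQ */
		scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
}
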
@@ -3235,8 +3783,8 @@ index 000000000000..0dac88d0e578
+static void direct_dispatch(struct task_struct *p, u64 enq_flags)
+{
+ struct rq *rq = task_rq(p);
-+ struct scx_dispatch_q *dsq;
-+ u64 dsq_id = p->scx.ddsp_dsq_id;
++ struct scx_dispatch_q *dsq =
++ find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p);
+
+ touch_core_sched_dispatch(rq, p);
+
@@ -3248,15 +3796,9 @@ index 000000000000..0dac88d0e578
+ * DSQ_LOCAL_ON verdicts targeting the local DSQ of a remote CPU, defer
+ * the enqueue so that it's executed when @rq can be unlocked.
+ */
-+ if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
-+ s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
++ if (dsq->id == SCX_DSQ_LOCAL && dsq != &rq->scx.local_dsq) {
+ unsigned long opss;
+
-+ if (cpu == cpu_of(rq)) {
-+ dsq_id = SCX_DSQ_LOCAL;
-+ goto dispatch;
-+ }
-+
+ opss = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_STATE_MASK;
+
+ switch (opss & SCX_OPSS_STATE_MASK) {
@@ -3283,14 +3825,19 @@ index 000000000000..0dac88d0e578
+ return;
+ }
+
-+dispatch:
-+ dsq = find_dsq_for_dispatch(rq, dsq_id, p);
+ dispatch_enqueue(dsq, p, p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
+}
+
+static bool scx_rq_online(struct rq *rq)
+{
-+ return likely(rq->scx.flags & SCX_RQ_ONLINE);
++ /*
++ * Test both cpu_active() and %SCX_RQ_ONLINE. %SCX_RQ_ONLINE indicates
++ * the online state as seen from the BPF scheduler. cpu_active() test
++ * guarantees that, if this function returns %true, %SCX_RQ_ONLINE will
++ * stay set until the current scheduling operation is complete even if
++ * we aren't locking @rq.
++ */
++ return likely((rq->scx.flags & SCX_RQ_ONLINE) && cpu_active(cpu_of(rq)));
+}
+
+static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
@@ -3313,7 +3860,7 @@ index 000000000000..0dac88d0e578
+ if (!scx_rq_online(rq))
+ goto local;
+
-+ if (scx_ops_bypassing()) {
++ if (scx_rq_bypassing(rq)) {
+ if (enq_flags & SCX_ENQ_LAST)
+ goto local;
+ else
@@ -3378,7 +3925,7 @@ index 000000000000..0dac88d0e578
+global:
+ touch_core_sched(rq, p); /* see the comment in local: */
+ p->scx.slice = SCX_SLICE_DFL;
-+ dispatch_enqueue(&scx_dsq_global, p, enq_flags);
++ dispatch_enqueue(find_global_dsq(p), p, enq_flags);
+}
+
+static bool task_runnable(const struct task_struct *p)
@@ -3440,7 +3987,7 @@ index 000000000000..0dac88d0e578
+ rq->scx.nr_running++;
+ add_nr_running(rq, 1);
+
-+ if (SCX_HAS_OP(runnable))
++ if (SCX_HAS_OP(runnable) && !task_on_rq_migrating(p))
+ SCX_CALL_OP_TASK(SCX_KF_REST, runnable, p, enq_flags);
+
+ if (enq_flags & SCX_ENQ_WAKEUP)
@@ -3524,7 +4071,7 @@ index 000000000000..0dac88d0e578
+ SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, false);
+ }
+
-+ if (SCX_HAS_OP(quiescent))
++ if (SCX_HAS_OP(quiescent) && !task_on_rq_migrating(p))
+ SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, p, deq_flags);
+
+ if (deq_flags & SCX_DEQ_SLEEP)
@@ -3559,193 +4106,173 @@ index 000000000000..0dac88d0e578
+ return false;
+}
+
++static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
++ struct scx_dispatch_q *src_dsq,
++ struct rq *dst_rq)
++{
++ struct scx_dispatch_q *dst_dsq = &dst_rq->scx.local_dsq;
++
++ /* @dsq is locked and @p is on @dst_rq */
++ lockdep_assert_held(&src_dsq->lock);
++ lockdep_assert_rq_held(dst_rq);
++
++ WARN_ON_ONCE(p->scx.holding_cpu >= 0);
++
++ if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT))
++ list_add(&p->scx.dsq_list.node, &dst_dsq->list);
++ else
++ list_add_tail(&p->scx.dsq_list.node, &dst_dsq->list);
++
++ dsq_mod_nr(dst_dsq, 1);
++ p->scx.dsq = dst_dsq;
++}
++
+#ifdef CONFIG_SMP
+/**
-+ * move_task_to_local_dsq - Move a task from a different rq to a local DSQ
-+ * @rq: rq to move the task into, currently locked
++ * move_remote_task_to_local_dsq - Move a task from a foreign rq to a local DSQ
+ * @p: task to move
+ * @enq_flags: %SCX_ENQ_*
++ * @src_rq: rq to move the task from, locked on entry, released on return
++ * @dst_rq: rq to move the task into, locked on return
+ *
-+ * Move @p which is currently on a different rq to @rq's local DSQ. The caller
-+ * must:
-+ *
-+ * 1. Start with exclusive access to @p either through its DSQ lock or
-+ * %SCX_OPSS_DISPATCHING flag.
-+ *
-+ * 2. Set @p->scx.holding_cpu to raw_smp_processor_id().
-+ *
-+ * 3. Remember task_rq(@p). Release the exclusive access so that we don't
-+ * deadlock with dequeue.
-+ *
-+ * 4. Lock @rq and the task_rq from #3.
-+ *
-+ * 5. Call this function.
-+ *
-+ * Returns %true if @p was successfully moved. %false after racing dequeue and
-+ * losing.
++ * Move @p which is currently on @src_rq to @dst_rq's local DSQ.
+ */
-+static bool move_task_to_local_dsq(struct rq *rq, struct task_struct *p,
-+ u64 enq_flags)
++static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
++ struct rq *src_rq, struct rq *dst_rq)
+{
-+ struct rq *task_rq;
-+
-+ lockdep_assert_rq_held(rq);
-+
-+ /*
-+ * If dequeue got to @p while we were trying to lock both rq's, it'd
-+ * have cleared @p->scx.holding_cpu to -1. While other cpus may have
-+ * updated it to different values afterwards, as this operation can't be
-+ * preempted or recurse, @p->scx.holding_cpu can never become
-+ * raw_smp_processor_id() again before we're done. Thus, we can tell
-+ * whether we lost to dequeue by testing whether @p->scx.holding_cpu is
-+ * still raw_smp_processor_id().
-+ *
-+ * See dispatch_dequeue() for the counterpart.
-+ */
-+ if (unlikely(p->scx.holding_cpu != raw_smp_processor_id()))
-+ return false;
++ lockdep_assert_rq_held(src_rq);
+
-+ /* @p->rq couldn't have changed if we're still the holding cpu */
-+ task_rq = task_rq(p);
-+ lockdep_assert_rq_held(task_rq);
++ /* the following marks @p MIGRATING which excludes dequeue */
++ deactivate_task(src_rq, p, 0);
++ set_task_cpu(p, cpu_of(dst_rq));
++ p->scx.sticky_cpu = cpu_of(dst_rq);
+
-+ WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(rq), p->cpus_ptr));
-+ deactivate_task(task_rq, p, 0);
-+ set_task_cpu(p, cpu_of(rq));
-+ p->scx.sticky_cpu = cpu_of(rq);
++ raw_spin_rq_unlock(src_rq);
++ raw_spin_rq_lock(dst_rq);
+
+ /*
+ * We want to pass scx-specific enq_flags but activate_task() will
+ * truncate the upper 32 bit. As we own @rq, we can pass them through
+ * @rq->scx.extra_enq_flags instead.
+ */
-+ WARN_ON_ONCE(rq->scx.extra_enq_flags);
-+ rq->scx.extra_enq_flags = enq_flags;
-+ activate_task(rq, p, 0);
-+ rq->scx.extra_enq_flags = 0;
-+
-+ return true;
++ WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr));
++ WARN_ON_ONCE(dst_rq->scx.extra_enq_flags);
++ dst_rq->scx.extra_enq_flags = enq_flags;
++ activate_task(dst_rq, p, 0);
++ dst_rq->scx.extra_enq_flags = 0;
+}
+
-+/**
-+ * dispatch_to_local_dsq_lock - Ensure source and destination rq's are locked
-+ * @rq: current rq which is locked
-+ * @src_rq: rq to move task from
-+ * @dst_rq: rq to move task to
++/*
++ * Similar to kernel/sched/core.c::is_cpu_allowed(). However, there are two
++ * differences:
+ *
-+ * We're holding @rq lock and trying to dispatch a task from @src_rq to
-+ * @dst_rq's local DSQ and thus need to lock both @src_rq and @dst_rq. Whether
-+ * @rq stays locked isn't important as long as the state is restored after
-+ * dispatch_to_local_dsq_unlock().
-+ */
-+static void dispatch_to_local_dsq_lock(struct rq *rq, struct rq *src_rq,
-+ struct rq *dst_rq)
-+{
-+ if (src_rq == dst_rq) {
-+ raw_spin_rq_unlock(rq);
-+ raw_spin_rq_lock(dst_rq);
-+ } else if (rq == src_rq) {
-+ double_lock_balance(rq, dst_rq);
-+ } else if (rq == dst_rq) {
-+ double_lock_balance(rq, src_rq);
-+ } else {
-+ raw_spin_rq_unlock(rq);
-+ double_rq_lock(src_rq, dst_rq);
-+ }
-+}
-+
-+/**
-+ * dispatch_to_local_dsq_unlock - Undo dispatch_to_local_dsq_lock()
-+ * @rq: current rq which is locked
-+ * @src_rq: rq to move task from
-+ * @dst_rq: rq to move task to
++ * - is_cpu_allowed() asks "Can this task run on this CPU?" while
++ * task_can_run_on_remote_rq() asks "Can the BPF scheduler migrate the task to
++ * this CPU?".
+ *
-+ * Unlock @src_rq and @dst_rq and ensure that @rq is locked on return.
-+ */
-+static void dispatch_to_local_dsq_unlock(struct rq *rq, struct rq *src_rq,
-+ struct rq *dst_rq)
-+{
-+ if (src_rq == dst_rq) {
-+ raw_spin_rq_unlock(dst_rq);
-+ raw_spin_rq_lock(rq);
-+ } else if (rq == src_rq) {
-+ double_unlock_balance(rq, dst_rq);
-+ } else if (rq == dst_rq) {
-+ double_unlock_balance(rq, src_rq);
-+ } else {
-+ double_rq_unlock(src_rq, dst_rq);
-+ raw_spin_rq_lock(rq);
-+ }
-+}
-+#endif /* CONFIG_SMP */
-+
-+static void consume_local_task(struct rq *rq, struct scx_dispatch_q *dsq,
-+ struct task_struct *p)
-+{
-+ lockdep_assert_held(&dsq->lock); /* released on return */
-+
-+ /* @dsq is locked and @p is on this rq */
-+ WARN_ON_ONCE(p->scx.holding_cpu >= 0);
-+ task_unlink_from_dsq(p, dsq);
-+ list_add_tail(&p->scx.dsq_list.node, &rq->scx.local_dsq.list);
-+ dsq_mod_nr(dsq, -1);
-+ dsq_mod_nr(&rq->scx.local_dsq, 1);
-+ p->scx.dsq = &rq->scx.local_dsq;
-+ raw_spin_unlock(&dsq->lock);
-+}
-+
-+#ifdef CONFIG_SMP
-+/*
-+ * Similar to kernel/sched/core.c::is_cpu_allowed() but we're testing whether @p
-+ * can be pulled to @rq.
++ * While migration is disabled, is_cpu_allowed() has to say "yes" as the task
++ * must be allowed to finish on the CPU that it's currently on regardless of
++ * the CPU state. However, task_can_run_on_remote_rq() must say "no" as the
++ * BPF scheduler shouldn't attempt to migrate a task which has migration
++ * disabled.
++ *
++ * - The BPF scheduler is bypassed while the rq is offline and we can always say
++ * no to the BPF scheduler initiated migrations while offline.
+ */
-+static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq)
++static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
++ bool trigger_error)
+{
+ int cpu = cpu_of(rq);
+
-+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
++ /*
++ * We don't require the BPF scheduler to avoid dispatching to offline
++ * CPUs mostly for convenience but also because CPUs can go offline
++ * between scx_bpf_dispatch() calls and here. Trigger error iff the
++ * picked CPU is outside the allowed mask.
++ */
++ if (!task_allowed_on_cpu(p, cpu)) {
++ if (trigger_error)
++ scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]",
++ cpu_of(rq), p->comm, p->pid);
+ return false;
++ }
++
+ if (unlikely(is_migration_disabled(p)))
+ return false;
-+ if (!(p->flags & PF_KTHREAD) && unlikely(!task_cpu_possible(cpu, p)))
-+ return false;
++
+ if (!scx_rq_online(rq))
+ return false;
++
+ return true;
+}
+
-+static bool consume_remote_task(struct rq *rq, struct scx_dispatch_q *dsq,
-+ struct task_struct *p, struct rq *task_rq)
++/**
++ * unlink_dsq_and_lock_src_rq() - Unlink task from its DSQ and lock its task_rq
++ * @p: target task
++ * @dsq: locked DSQ @p is currently on
++ * @src_rq: rq @p is currently on, stable with @dsq locked
++ *
++ * Called with @dsq locked but no rq's locked. We want to move @p to a different
++ * DSQ, including any local DSQ, but are not locking @src_rq. Locking @src_rq is
++ * required when transferring into a local DSQ. Even when transferring into a
++ * non-local DSQ, it's better to use the same mechanism to protect against
++ * dequeues and maintain the invariant that @p->scx.dsq can only change while
++ * @src_rq is locked, which e.g. scx_dump_task() depends on.
++ *
++ * We want to grab @src_rq but that can deadlock if we try while locking @dsq,
++ * so we want to unlink @p from @dsq, drop its lock and then lock @src_rq. As
++ * this may race with dequeue, which can't drop the rq lock or fail, do a little
++ * dancing from our side.
++ *
++ * @p->scx.holding_cpu is set to this CPU before @dsq is unlocked. If @p gets
++ * dequeued after we unlock @dsq but before locking @src_rq, the holding_cpu
++ * would be cleared to -1. While other cpus may have updated it to different
++ * values afterwards, as this operation can't be preempted or recurse, the
++ * holding_cpu can never become this CPU again before we're done. Thus, we can
++ * tell whether we lost to dequeue by testing whether the holding_cpu still
++ * points to this CPU. See dispatch_dequeue() for the counterpart.
++ *
++ * On return, @dsq is unlocked and @src_rq is locked. Returns %true if @p is
++ * still valid. %false if lost to dequeue.
++ */
++static bool unlink_dsq_and_lock_src_rq(struct task_struct *p,
++ struct scx_dispatch_q *dsq,
++ struct rq *src_rq)
+{
-+ bool moved = false;
++ s32 cpu = raw_smp_processor_id();
+
-+ lockdep_assert_held(&dsq->lock); /* released on return */
++ lockdep_assert_held(&dsq->lock);
+
-+ /*
-+ * @dsq is locked and @p is on a remote rq. @p is currently protected by
-+ * @dsq->lock. We want to pull @p to @rq but may deadlock if we grab
-+ * @task_rq while holding @dsq and @rq locks. As dequeue can't drop the
-+ * rq lock or fail, do a little dancing from our side. See
-+ * move_task_to_local_dsq().
-+ */
+ WARN_ON_ONCE(p->scx.holding_cpu >= 0);
+ task_unlink_from_dsq(p, dsq);
-+ dsq_mod_nr(dsq, -1);
-+ p->scx.holding_cpu = raw_smp_processor_id();
-+ raw_spin_unlock(&dsq->lock);
++ p->scx.holding_cpu = cpu;
+
-+ double_lock_balance(rq, task_rq);
++ raw_spin_unlock(&dsq->lock);
++ raw_spin_rq_lock(src_rq);
+
-+ moved = move_task_to_local_dsq(rq, p, 0);
++ /* task_rq couldn't have changed if we're still the holding cpu */
++ return likely(p->scx.holding_cpu == cpu) &&
++ !WARN_ON_ONCE(src_rq != task_rq(p));
++}
+
-+ double_unlock_balance(rq, task_rq);
++static bool consume_remote_task(struct rq *this_rq, struct task_struct *p,
++ struct scx_dispatch_q *dsq, struct rq *src_rq)
++{
++ raw_spin_rq_unlock(this_rq);
+
-+ return moved;
++ if (unlink_dsq_and_lock_src_rq(p, dsq, src_rq)) {
++ move_remote_task_to_local_dsq(p, 0, src_rq, this_rq);
++ return true;
++ } else {
++ raw_spin_rq_unlock(src_rq);
++ raw_spin_rq_lock(this_rq);
++ return false;
++ }
+}
+#else /* CONFIG_SMP */
-+static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq) { return false; }
-+static bool consume_remote_task(struct rq *rq, struct scx_dispatch_q *dsq,
-+ struct task_struct *p, struct rq *task_rq) { return false; }
++static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); }
++static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool trigger_error) { return false; }
++static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; }
+#endif /* CONFIG_SMP */
+
+static bool consume_dispatch_q(struct rq *rq, struct scx_dispatch_q *dsq)
@@ -3766,12 +4293,14 @@ index 000000000000..0dac88d0e578
+ struct rq *task_rq = task_rq(p);
+
+ if (rq == task_rq) {
-+ consume_local_task(rq, dsq, p);
++ task_unlink_from_dsq(p, dsq);
++ move_local_task_to_local_dsq(p, 0, dsq, rq);
++ raw_spin_unlock(&dsq->lock);
+ return true;
+ }
+
-+ if (task_can_run_on_remote_rq(p, rq)) {
-+ if (likely(consume_remote_task(rq, dsq, p, task_rq)))
++ if (task_can_run_on_remote_rq(p, rq, false)) {
++ if (likely(consume_remote_task(rq, p, dsq, task_rq)))
+ return true;
+ goto retry;
+ }
@@ -3781,122 +4310,102 @@ index 000000000000..0dac88d0e578
+ return false;
+}
+
-+enum dispatch_to_local_dsq_ret {
-+ DTL_DISPATCHED, /* successfully dispatched */
-+ DTL_LOST, /* lost race to dequeue */
-+ DTL_NOT_LOCAL, /* destination is not a local DSQ */
-+ DTL_INVALID, /* invalid local dsq_id */
-+};
++static bool consume_global_dsq(struct rq *rq)
++{
++ int node = cpu_to_node(cpu_of(rq));
++
++ return consume_dispatch_q(rq, global_dsqs[node]);
++}
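
consume_global_dsq() above is the kernel-side fallback; user DSQs are pulled into the local DSQ by the BPF scheduler itself, typically from ops.dispatch(). A minimal sketch in the style of tools/sched_ext/scx_simple.bpf.c (SHARED_DSQ and the example_* names are illustrative; registration boilerplate omitted):

#include <scx/common.bpf.h>

#define SHARED_DSQ	0	/* illustrative user DSQ id */

s32 BPF_STRUCT_OPS_SLEEPABLE(example_init)
{
	/* create the shared DSQ on any NUMA node */
	return scx_bpf_create_dsq(SHARED_DSQ, -1);
}

void BPF_STRUCT_OPS(example_dispatch, s32 cpu, struct task_struct *prev)
{
	/* move the first task from SHARED_DSQ onto this CPU's local DSQ */
	scx_bpf_consume(SHARED_DSQ);
}
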
+
+/**
+ * dispatch_to_local_dsq - Dispatch a task to a local dsq
+ * @rq: current rq which is locked
-+ * @dsq_id: destination dsq ID
++ * @dst_dsq: destination DSQ
+ * @p: task to dispatch
+ * @enq_flags: %SCX_ENQ_*
+ *
-+ * We're holding @rq lock and want to dispatch @p to the local DSQ identified by
-+ * @dsq_id. This function performs all the synchronization dancing needed
-+ * because local DSQs are protected with rq locks.
++ * We're holding @rq lock and want to dispatch @p to @dst_dsq which is a local
++ * DSQ. This function performs all the synchronization dancing needed because
++ * local DSQs are protected with rq locks.
+ *
+ * The caller must have exclusive ownership of @p (e.g. through
+ * %SCX_OPSS_DISPATCHING).
+ */
-+static enum dispatch_to_local_dsq_ret
-+dispatch_to_local_dsq(struct rq *rq, u64 dsq_id, struct task_struct *p,
-+ u64 enq_flags)
++static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
++ struct task_struct *p, u64 enq_flags)
+{
+ struct rq *src_rq = task_rq(p);
-+ struct rq *dst_rq;
++ struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
+
+ /*
+ * We're synchronized against dequeue through DISPATCHING. As @p can't
+ * be dequeued, its task_rq and cpus_allowed are stable too.
++ *
++ * If dispatching to @rq that @p is already on, no lock dancing needed.
+ */
-+ if (dsq_id == SCX_DSQ_LOCAL) {
-+ dst_rq = rq;
-+ } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
-+ s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
-+
-+ if (!ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict"))
-+ return DTL_INVALID;
-+ dst_rq = cpu_rq(cpu);
-+ } else {
-+ return DTL_NOT_LOCAL;
-+ }
-+
-+ /* if dispatching to @rq that @p is already on, no lock dancing needed */
+ if (rq == src_rq && rq == dst_rq) {
-+ dispatch_enqueue(&dst_rq->scx.local_dsq, p,
-+ enq_flags | SCX_ENQ_CLEAR_OPSS);
-+ return DTL_DISPATCHED;
++ dispatch_enqueue(dst_dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
++ return;
+ }
+
+#ifdef CONFIG_SMP
-+ if (cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)) {
-+ struct rq *locked_dst_rq = dst_rq;
-+ bool dsp;
++ if (unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
++ dispatch_enqueue(find_global_dsq(p), p,
++ enq_flags | SCX_ENQ_CLEAR_OPSS);
++ return;
++ }
+
-+ /*
-+ * @p is on a possibly remote @src_rq which we need to lock to
-+ * move the task. If dequeue is in progress, it'd be locking
-+ * @src_rq and waiting on DISPATCHING, so we can't grab @src_rq
-+ * lock while holding DISPATCHING.
-+ *
-+ * As DISPATCHING guarantees that @p is wholly ours, we can
-+ * pretend that we're moving from a DSQ and use the same
-+ * mechanism - mark the task under transfer with holding_cpu,
-+ * release DISPATCHING and then follow the same protocol.
-+ */
-+ p->scx.holding_cpu = raw_smp_processor_id();
++ /*
++ * @p is on a possibly remote @src_rq which we need to lock to move the
++ * task. If dequeue is in progress, it'd be locking @src_rq and waiting
++ * on DISPATCHING, so we can't grab @src_rq lock while holding
++ * DISPATCHING.
++ *
++ * As DISPATCHING guarantees that @p is wholly ours, we can pretend that
++ * we're moving from a DSQ and use the same mechanism - mark the task
++ * under transfer with holding_cpu, release DISPATCHING and then follow
++ * the same protocol. See unlink_dsq_and_lock_src_rq().
++ */
++ p->scx.holding_cpu = raw_smp_processor_id();
+
-+ /* store_release ensures that dequeue sees the above */
-+ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
++ /* store_release ensures that dequeue sees the above */
++ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
+
-+ dispatch_to_local_dsq_lock(rq, src_rq, locked_dst_rq);
++ /* switch to @src_rq lock */
++ if (rq != src_rq) {
++ raw_spin_rq_unlock(rq);
++ raw_spin_rq_lock(src_rq);
++ }
+
++ /* task_rq couldn't have changed if we're still the holding cpu */
++ if (likely(p->scx.holding_cpu == raw_smp_processor_id()) &&
++ !WARN_ON_ONCE(src_rq != task_rq(p))) {
+ /*
-+ * We don't require the BPF scheduler to avoid dispatching to
-+ * offline CPUs mostly for convenience but also because CPUs can
-+ * go offline between scx_bpf_dispatch() calls and here. If @p
-+ * is destined to an offline CPU, queue it on its current CPU
-+ * instead, which should always be safe. As this is an allowed
-+ * behavior, don't trigger an ops error.
++ * If @p is staying on the same rq, there's no need to go
++ * through the full deactivate/activate cycle. Optimize by
++ * abbreviating move_remote_task_to_local_dsq().
+ */
-+ if (!scx_rq_online(dst_rq))
-+ dst_rq = src_rq;
-+
+ if (src_rq == dst_rq) {
-+ /*
-+ * As @p is staying on the same rq, there's no need to
-+ * go through the full deactivate/activate cycle.
-+ * Optimize by abbreviating the operations in
-+ * move_task_to_local_dsq().
-+ */
-+ dsp = p->scx.holding_cpu == raw_smp_processor_id();
-+ if (likely(dsp)) {
-+ p->scx.holding_cpu = -1;
-+ dispatch_enqueue(&dst_rq->scx.local_dsq, p,
-+ enq_flags);
-+ }
++ p->scx.holding_cpu = -1;
++ dispatch_enqueue(&dst_rq->scx.local_dsq, p, enq_flags);
+ } else {
-+ dsp = move_task_to_local_dsq(dst_rq, p, enq_flags);
++ move_remote_task_to_local_dsq(p, enq_flags,
++ src_rq, dst_rq);
+ }
+
+ /* if the destination CPU is idle, wake it up */
-+ if (dsp && sched_class_above(p->sched_class,
-+ dst_rq->curr->sched_class))
++ if (sched_class_above(p->sched_class, dst_rq->curr->sched_class))
+ resched_curr(dst_rq);
++ }
+
-+ dispatch_to_local_dsq_unlock(rq, src_rq, locked_dst_rq);
-+
-+ return dsp ? DTL_DISPATCHED : DTL_LOST;
++ /* switch back to @rq lock */
++ if (rq != dst_rq) {
++ raw_spin_rq_unlock(dst_rq);
++ raw_spin_rq_lock(rq);
+ }
++#else /* CONFIG_SMP */
++ BUG(); /* control can not reach here on UP */
+#endif /* CONFIG_SMP */
-+
-+ scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]",
-+ cpu_of(dst_rq), p->comm, p->pid);
-+ return DTL_INVALID;
+}
+
+/**
@@ -3971,20 +4480,12 @@ index 000000000000..0dac88d0e578
+
+ BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED));
+
-+ switch (dispatch_to_local_dsq(rq, dsq_id, p, enq_flags)) {
-+ case DTL_DISPATCHED:
-+ break;
-+ case DTL_LOST:
-+ break;
-+ case DTL_INVALID:
-+ dsq_id = SCX_DSQ_GLOBAL;
-+ fallthrough;
-+ case DTL_NOT_LOCAL:
-+ dsq = find_dsq_for_dispatch(cpu_rq(raw_smp_processor_id()),
-+ dsq_id, p);
++ dsq = find_dsq_for_dispatch(this_rq(), dsq_id, p);
++
++ if (dsq->id == SCX_DSQ_LOCAL)
++ dispatch_to_local_dsq(rq, dsq, p, enq_flags);
++ else
+ dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
-+ break;
-+ }
+}
+
+static void flush_dispatch_buf(struct rq *rq)
@@ -4046,7 +4547,7 @@ index 000000000000..0dac88d0e578
+ * same conditions later and pick @rq->curr accordingly.
+ */
+ if ((prev->scx.flags & SCX_TASK_QUEUED) &&
-+ prev->scx.slice && !scx_ops_bypassing()) {
++ prev->scx.slice && !scx_rq_bypassing(rq)) {
+ if (local)
+ prev->scx.flags |= SCX_TASK_BAL_KEEP;
+ goto has_tasks;
@@ -4057,10 +4558,10 @@ index 000000000000..0dac88d0e578
+ if (rq->scx.local_dsq.nr)
+ goto has_tasks;
+
-+ if (consume_dispatch_q(rq, &scx_dsq_global))
++ if (consume_global_dsq(rq))
+ goto has_tasks;
+
-+ if (!SCX_HAS_OP(dispatch) || scx_ops_bypassing() || !scx_rq_online(rq))
++ if (!SCX_HAS_OP(dispatch) || scx_rq_bypassing(rq) || !scx_rq_online(rq))
+ goto out;
+
+ dspc->rq = rq;
@@ -4082,7 +4583,7 @@ index 000000000000..0dac88d0e578
+
+ if (rq->scx.local_dsq.nr)
+ goto has_tasks;
-+ if (consume_dispatch_q(rq, &scx_dsq_global))
++ if (consume_global_dsq(rq))
+ goto has_tasks;
+
+ /*
@@ -4109,7 +4610,6 @@ index 000000000000..0dac88d0e578
+ return has_tasks;
+}
+
-+#ifdef CONFIG_SMP
+static int balance_scx(struct rq *rq, struct task_struct *prev,
+ struct rq_flags *rf)
+{
@@ -4143,7 +4643,31 @@ index 000000000000..0dac88d0e578
+
+ return ret;
+}
-+#endif
++
++static void process_ddsp_deferred_locals(struct rq *rq)
++{
++ struct task_struct *p;
++
++ lockdep_assert_rq_held(rq);
++
++ /*
++ * Now that @rq can be unlocked, execute the deferred enqueueing of
++ * tasks directly dispatched to the local DSQs of other CPUs. See
++ * direct_dispatch(). Keep popping from the head instead of using
++ * list_for_each_entry_safe() as dispatch_to_local_dsq() may unlock @rq
++ * temporarily.
++ */
++ while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals,
++ struct task_struct, scx.dsq_list.node))) {
++ struct scx_dispatch_q *dsq;
++
++ list_del_init(&p->scx.dsq_list.node);
++
++ dsq = find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p);
++ if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL))
++ dispatch_to_local_dsq(rq, dsq, p, p->scx.ddsp_enq_flags);
++ }
++}
+
+static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
+{
@@ -4187,62 +4711,71 @@ index 000000000000..0dac88d0e578
+ }
+}
+
-+static void process_ddsp_deferred_locals(struct rq *rq)
++static enum scx_cpu_preempt_reason
++preempt_reason_from_class(const struct sched_class *class)
++{
++#ifdef CONFIG_SMP
++ if (class == &stop_sched_class)
++ return SCX_CPU_PREEMPT_STOP;
++#endif
++ if (class == &dl_sched_class)
++ return SCX_CPU_PREEMPT_DL;
++ if (class == &rt_sched_class)
++ return SCX_CPU_PREEMPT_RT;
++ return SCX_CPU_PREEMPT_UNKNOWN;
++}
++
++static void switch_class_scx(struct rq *rq, struct task_struct *next)
+{
-+ struct task_struct *p, *tmp;
++ const struct sched_class *next_class = next->sched_class;
+
-+ lockdep_assert_rq_held(rq);
++ if (!scx_enabled())
++ return;
++#ifdef CONFIG_SMP
++ /*
++ * Pairs with the smp_load_acquire() issued by a CPU in
++ * kick_cpus_irq_workfn() which is waiting for this CPU to perform a
++ * resched.
++ */
++ smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1);
++#endif
++ if (!static_branch_unlikely(&scx_ops_cpu_preempt))
++ return;
+
+ /*
-+ * Now that @rq can be unlocked, execute the deferred enqueueing of
-+ * tasks directly dispatched to the local DSQs of other CPUs. See
-+ * direct_dispatch().
++ * The callback is conceptually meant to convey that the CPU is no
++ * longer under the control of SCX. Therefore, don't invoke the callback
++ * if the next class is below SCX (in which case the BPF scheduler has
++ * actively decided not to schedule any tasks on the CPU).
+ */
-+ list_for_each_entry_safe(p, tmp, &rq->scx.ddsp_deferred_locals,
-+ scx.dsq_list.node) {
-+ s32 ret;
++ if (sched_class_above(&ext_sched_class, next_class))
++ return;
+
-+ list_del_init(&p->scx.dsq_list.node);
++ /*
++ * At this point we know that SCX was preempted by a higher priority
++ * sched_class, so invoke the ->cpu_release() callback if we have not
++ * done so already. We only send the callback once between SCX being
++ * preempted, and it regaining control of the CPU.
++ *
++ * ->cpu_release() complements ->cpu_acquire(), which is emitted the
++ * next time that balance_scx() is invoked.
++ */
++ if (!rq->scx.cpu_released) {
++ if (SCX_HAS_OP(cpu_release)) {
++ struct scx_cpu_release_args args = {
++ .reason = preempt_reason_from_class(next_class),
++ .task = next,
++ };
+
-+ ret = dispatch_to_local_dsq(rq, p->scx.ddsp_dsq_id, p,
-+ p->scx.ddsp_enq_flags);
-+ WARN_ON_ONCE(ret == DTL_NOT_LOCAL);
++ SCX_CALL_OP(SCX_KF_CPU_RELEASE,
++ cpu_release, cpu_of(rq), &args);
++ }
++ rq->scx.cpu_released = true;
+ }
+}
+
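
A minimal sketch of the BPF-side ops.cpu_release() counterpart of the ->cpu_released handling above, modeled on tools/sched_ext/scx_qmap.bpf.c (the example_* name is illustrative; registration boilerplate omitted):

#include <scx/common.bpf.h>

void BPF_STRUCT_OPS(example_cpu_release, s32 cpu,
		    struct scx_cpu_release_args *args)
{
	/*
	 * A higher-priority sched class preempted this CPU. Tasks already
	 * sitting in its local DSQ would otherwise wait for the CPU to come
	 * back; push them back through ops.enqueue() so they can be placed
	 * elsewhere.
	 */
	scx_bpf_reenqueue_local();
}
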
+static void put_prev_task_scx(struct rq *rq, struct task_struct *p)
+{
-+#ifndef CONFIG_SMP
-+ /*
-+ * UP workaround.
-+ *
-+ * Because SCX may transfer tasks across CPUs during dispatch, dispatch
-+ * is performed from its balance operation which isn't called in UP.
-+ * Let's work around by calling it from the operations which come right
-+ * after.
-+ *
-+ * 1. If the prev task is on SCX, pick_next_task() calls
-+ * .put_prev_task() right after. As .put_prev_task() is also called
-+ * from other places, we need to distinguish the calls which can be
-+ * done by looking at the previous task's state - if still queued or
-+ * dequeued with %SCX_DEQ_SLEEP, the caller must be pick_next_task().
-+ * This case is handled here.
-+ *
-+ * 2. If the prev task is not on SCX, the first following call into SCX
-+ * will be .pick_next_task(), which is covered by calling
-+ * balance_scx() from pick_next_task_scx().
-+ *
-+ * Note that we can't merge the first case into the second as
-+ * balance_scx() must be called before the previous SCX task goes
-+ * through put_prev_task_scx().
-+ *
-+ * @rq is pinned and can't be unlocked. As UP doesn't transfer tasks
-+ * around, balance_one() doesn't need to.
-+ */
-+ if (p->scx.flags & (SCX_TASK_QUEUED | SCX_TASK_DEQD_FOR_SLEEP))
-+ balance_one(rq, p, true);
-+#endif
-+
+ update_curr_scx(rq);
+
+ /* see dequeue_task_scx() on why we skip when !QUEUED */
@@ -4269,7 +4802,7 @@ index 000000000000..0dac88d0e578
+ * scheduler class or core-sched forcing a different task. Leave
+ * it at the head of the local DSQ.
+ */
-+ if (p->scx.slice && !scx_ops_bypassing()) {
++ if (p->scx.slice && !scx_rq_bypassing(rq)) {
+ dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD);
+ return;
+ }
@@ -4300,12 +4833,6 @@ index 000000000000..0dac88d0e578
+{
+ struct task_struct *p;
+
-+#ifndef CONFIG_SMP
-+ /* UP workaround - see the comment at the head of put_prev_task_scx() */
-+ if (unlikely(rq->curr->sched_class != &ext_sched_class))
-+ balance_one(rq, rq->curr, true);
-+#endif
-+
+ p = first_local_task(rq);
+ if (!p)
+ return NULL;
@@ -4313,7 +4840,7 @@ index 000000000000..0dac88d0e578
+ set_next_task_scx(rq, p, true);
+
+ if (unlikely(!p->scx.slice)) {
-+ if (!scx_ops_bypassing() && !scx_warned_zero_slice) {
++ if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) {
+ printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n",
+ p->comm, p->pid);
+ scx_warned_zero_slice = true;
@@ -4350,7 +4877,7 @@ index 000000000000..0dac88d0e578
+ * calling ops.core_sched_before(). Accesses are controlled by the
+ * verifier.
+ */
-+ if (SCX_HAS_OP(core_sched_before) && !scx_ops_bypassing())
++ if (SCX_HAS_OP(core_sched_before) && !scx_rq_bypassing(task_rq(a)))
+ return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before,
+ (struct task_struct *)a,
+ (struct task_struct *)b);
@@ -4402,69 +4929,6 @@ index 000000000000..0dac88d0e578
+}
+#endif /* CONFIG_SCHED_CORE */
+
-+static enum scx_cpu_preempt_reason
-+preempt_reason_from_class(const struct sched_class *class)
-+{
-+#ifdef CONFIG_SMP
-+ if (class == &stop_sched_class)
-+ return SCX_CPU_PREEMPT_STOP;
-+#endif
-+ if (class == &dl_sched_class)
-+ return SCX_CPU_PREEMPT_DL;
-+ if (class == &rt_sched_class)
-+ return SCX_CPU_PREEMPT_RT;
-+ return SCX_CPU_PREEMPT_UNKNOWN;
-+}
-+
-+static void switch_class_scx(struct rq *rq, struct task_struct *next)
-+{
-+ const struct sched_class *next_class = next->sched_class;
-+
-+ if (!scx_enabled())
-+ return;
-+#ifdef CONFIG_SMP
-+ /*
-+ * Pairs with the smp_load_acquire() issued by a CPU in
-+ * kick_cpus_irq_workfn() who is waiting for this CPU to perform a
-+ * resched.
-+ */
-+ smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1);
-+#endif
-+ if (!static_branch_unlikely(&scx_ops_cpu_preempt))
-+ return;
-+
-+ /*
-+ * The callback is conceptually meant to convey that the CPU is no
-+ * longer under the control of SCX. Therefore, don't invoke the callback
-+ * if the next class is below SCX (in which case the BPF scheduler has
-+ * actively decided not to schedule any tasks on the CPU).
-+ */
-+ if (sched_class_above(&ext_sched_class, next_class))
-+ return;
-+
-+ /*
-+ * At this point we know that SCX was preempted by a higher priority
-+ * sched_class, so invoke the ->cpu_release() callback if we have not
-+ * done so already. We only send the callback once between SCX being
-+ * preempted, and it regaining control of the CPU.
-+ *
-+ * ->cpu_release() complements ->cpu_acquire(), which is emitted the
-+ * next time that balance_scx() is invoked.
-+ */
-+ if (!rq->scx.cpu_released) {
-+ if (SCX_HAS_OP(cpu_release)) {
-+ struct scx_cpu_release_args args = {
-+ .reason = preempt_reason_from_class(next_class),
-+ .task = next,
-+ };
-+
-+ SCX_CALL_OP(SCX_KF_CPU_RELEASE,
-+ cpu_release, cpu_of(rq), &args);
-+ }
-+ rq->scx.cpu_released = true;
-+ }
-+}
-+
+#ifdef CONFIG_SMP
+
+static bool test_and_clear_cpu_idle(int cpu)
@@ -4815,7 +5279,7 @@ index 000000000000..0dac88d0e578
+ * While disabling, always resched and refresh core-sched timestamp as
+ * we can't trust the slice management or ops.core_sched_before().
+ */
-+ if (scx_ops_bypassing()) {
++ if (scx_rq_bypassing(rq)) {
+ curr->scx.slice = 0;
+ touch_core_sched(rq, curr);
+ } else if (SCX_HAS_OP(tick)) {
@@ -4826,6 +5290,28 @@ index 000000000000..0dac88d0e578
+ resched_curr(rq);
+}
+
++#ifdef CONFIG_EXT_GROUP_SCHED
++static struct cgroup *tg_cgrp(struct task_group *tg)
++{
++ /*
++ * If CGROUP_SCHED is disabled, @tg is NULL. If @tg is an autogroup,
++ * @tg->css.cgroup is NULL. In both cases, @tg can be treated as the
++ * root cgroup.
++ */
++ if (tg && tg->css.cgroup)
++ return tg->css.cgroup;
++ else
++ return &cgrp_dfl_root.cgrp;
++}
++
++#define SCX_INIT_TASK_ARGS_CGROUP(tg) .cgroup = tg_cgrp(tg),
++
++#else /* CONFIG_EXT_GROUP_SCHED */
++
++#define SCX_INIT_TASK_ARGS_CGROUP(tg)
++
++#endif /* CONFIG_EXT_GROUP_SCHED */
++
+static enum scx_task_state scx_get_task_state(const struct task_struct *p)
+{
+ return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT;
@@ -4870,6 +5356,7 @@ index 000000000000..0dac88d0e578
+
+ if (SCX_HAS_OP(init_task)) {
+ struct scx_init_task_args args = {
++ SCX_INIT_TASK_ARGS_CGROUP(tg)
+ .fork = fork,
+ };
+
@@ -4883,24 +5370,29 @@ index 000000000000..0dac88d0e578
+ scx_set_task_state(p, SCX_TASK_INIT);
+
+ if (p->scx.disallow) {
-+ struct rq *rq;
-+ struct rq_flags rf;
++ if (!fork) {
++ struct rq *rq;
++ struct rq_flags rf;
+
-+ rq = task_rq_lock(p, &rf);
++ rq = task_rq_lock(p, &rf);
+
-+ /*
-+ * We're either in fork or load path and @p->policy will be
-+ * applied right after. Reverting @p->policy here and rejecting
-+ * %SCHED_EXT transitions from scx_check_setscheduler()
-+ * guarantees that if ops.init_task() sets @p->disallow, @p can
-+ * never be in SCX.
-+ */
-+ if (p->policy == SCHED_EXT) {
-+ p->policy = SCHED_NORMAL;
-+ atomic_long_inc(&scx_nr_rejected);
-+ }
++ /*
++ * We're in the load path and @p->policy will be applied
++ * right after. Reverting @p->policy here and rejecting
++ * %SCHED_EXT transitions from scx_check_setscheduler()
++ * guarantees that if ops.init_task() sets @p->disallow,
++ * @p can never be in SCX.
++ */
++ if (p->policy == SCHED_EXT) {
++ p->policy = SCHED_NORMAL;
++ atomic_long_inc(&scx_nr_rejected);
++ }
+
-+ task_rq_unlock(rq, p, &rf);
++ task_rq_unlock(rq, p, &rf);
++ } else if (p->policy == SCHED_EXT) {
++ scx_ops_error("ops.init_task() set task->scx.disallow for %s[%d] during fork",
++ p->comm, p->pid);
++ }
+ }
+
+ p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
@@ -4929,7 +5421,7 @@ index 000000000000..0dac88d0e578
+ scx_set_task_state(p, SCX_TASK_ENABLED);
+
+ if (SCX_HAS_OP(set_weight))
-+ SCX_CALL_OP(SCX_KF_REST, set_weight, p, p->scx.weight);
++ SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight);
+}
+
+static void scx_ops_disable_task(struct task_struct *p)
@@ -5004,7 +5496,7 @@ index 000000000000..0dac88d0e578
+{
+ percpu_rwsem_assert_held(&scx_fork_rwsem);
+
-+ if (scx_enabled())
++ if (scx_ops_init_task_enabled)
+ return scx_ops_init_task(p, task_group(p), true);
+ else
+ return 0;
@@ -5012,7 +5504,7 @@ index 000000000000..0dac88d0e578
+
+void scx_post_fork(struct task_struct *p)
+{
-+ if (scx_enabled()) {
++ if (scx_ops_init_task_enabled) {
+ scx_set_task_state(p, SCX_TASK_READY);
+
+ /*
@@ -5126,7 +5618,7 @@ index 000000000000..0dac88d0e578
+{
+ struct task_struct *p = rq->curr;
+
-+ if (scx_ops_bypassing())
++ if (scx_rq_bypassing(rq))
+ return false;
+
+ if (p->sched_class != &ext_sched_class)
@@ -5141,6 +5633,222 @@ index 000000000000..0dac88d0e578
+}
+#endif
+
++#ifdef CONFIG_EXT_GROUP_SCHED
++
++DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem);
++static bool scx_cgroup_enabled;
++static bool cgroup_warned_missing_weight;
++static bool cgroup_warned_missing_idle;
++
++static void scx_cgroup_warn_missing_weight(struct task_group *tg)
++{
++ if (scx_ops_enable_state() == SCX_OPS_DISABLED ||
++ cgroup_warned_missing_weight)
++ return;
++
++ if ((scx_ops.flags & SCX_OPS_HAS_CGROUP_WEIGHT) || !tg->css.parent)
++ return;
++
++ pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.weight\n",
++ scx_ops.name);
++ cgroup_warned_missing_weight = true;
++}
++
++static void scx_cgroup_warn_missing_idle(struct task_group *tg)
++{
++ if (!scx_cgroup_enabled || cgroup_warned_missing_idle)
++ return;
++
++ if (!tg->idle)
++ return;
++
++ pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.idle\n",
++ scx_ops.name);
++ cgroup_warned_missing_idle = true;
++}
++
++int scx_tg_online(struct task_group *tg)
++{
++ int ret = 0;
++
++ WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED));
++
++ percpu_down_read(&scx_cgroup_rwsem);
++
++ scx_cgroup_warn_missing_weight(tg);
++
++ if (scx_cgroup_enabled) {
++ if (SCX_HAS_OP(cgroup_init)) {
++ struct scx_cgroup_init_args args =
++ { .weight = tg->scx_weight };
++
++ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init,
++ tg->css.cgroup, &args);
++ if (ret)
++ ret = ops_sanitize_err("cgroup_init", ret);
++ }
++ if (ret == 0)
++ tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED;
++ } else {
++ tg->scx_flags |= SCX_TG_ONLINE;
++ }
++
++ percpu_up_read(&scx_cgroup_rwsem);
++ return ret;
++}
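
A minimal BPF-side sketch of the ops.cgroup_init() callback invoked above, loosely following tools/sched_ext/scx_flatcg.bpf.c (the example_* name is illustrative; registration boilerplate omitted):

#include <scx/common.bpf.h>

s32 BPF_STRUCT_OPS_SLEEPABLE(example_cgroup_init, struct cgroup *cgrp,
			     struct scx_cgroup_init_args *args)
{
	/* args->weight carries the cgroup's cpu.weight at init time */
	bpf_printk("cgroup %llu came online with weight %llu",
		   cgrp->kn->id, args->weight);
	return 0;
}
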
++
++void scx_tg_offline(struct task_group *tg)
++{
++ WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE));
++
++ percpu_down_read(&scx_cgroup_rwsem);
++
++ if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED))
++ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, tg->css.cgroup);
++ tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
++
++ percpu_up_read(&scx_cgroup_rwsem);
++}
++
++int scx_cgroup_can_attach(struct cgroup_taskset *tset)
++{
++ struct cgroup_subsys_state *css;
++ struct task_struct *p;
++ int ret;
++
++ /* released in scx_finish/cancel_attach() */
++ percpu_down_read(&scx_cgroup_rwsem);
++
++ if (!scx_cgroup_enabled)
++ return 0;
++
++ cgroup_taskset_for_each(p, css, tset) {
++ struct cgroup *from = tg_cgrp(task_group(p));
++ struct cgroup *to = tg_cgrp(css_tg(css));
++
++ WARN_ON_ONCE(p->scx.cgrp_moving_from);
++
++ /*
++ * sched_move_task() omits identity migrations. Let's match the
++ * behavior so that ops.cgroup_prep_move() and ops.cgroup_move()
++ * always match one-to-one.
++ */
++ if (from == to)
++ continue;
++
++ if (SCX_HAS_OP(cgroup_prep_move)) {
++ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_prep_move,
++ p, from, css->cgroup);
++ if (ret)
++ goto err;
++ }
++
++ p->scx.cgrp_moving_from = from;
++ }
++
++ return 0;
++
++err:
++ cgroup_taskset_for_each(p, css, tset) {
++ if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from)
++ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p,
++ p->scx.cgrp_moving_from, css->cgroup);
++ p->scx.cgrp_moving_from = NULL;
++ }
++
++ percpu_up_read(&scx_cgroup_rwsem);
++ return ops_sanitize_err("cgroup_prep_move", ret);
++}
++
++void scx_move_task(struct task_struct *p)
++{
++ if (!scx_cgroup_enabled)
++ return;
++
++ /*
++ * We're called from sched_move_task() which handles both cgroup and
++ * autogroup moves. Ignore the latter.
++ *
++ * Also ignore exiting tasks, because in the exit path tasks transition
++ * from the autogroup to the root group, so task_group_is_autogroup()
++ * alone isn't able to catch exiting autogroup tasks. This is safe for
++ * cgroup_move(), because cgroup migrations never happen for PF_EXITING
++ * tasks.
++ */
++ if (task_group_is_autogroup(task_group(p)) || (p->flags & PF_EXITING))
++ return;
++
++ /*
++ * @p must have ops.cgroup_prep_move() called on it and thus
++ * cgrp_moving_from set.
++ */
++ if (SCX_HAS_OP(cgroup_move) && !WARN_ON_ONCE(!p->scx.cgrp_moving_from))
++ SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, p,
++ p->scx.cgrp_moving_from, tg_cgrp(task_group(p)));
++ p->scx.cgrp_moving_from = NULL;
++}
++
++void scx_cgroup_finish_attach(void)
++{
++ percpu_up_read(&scx_cgroup_rwsem);
++}
++
++void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
++{
++ struct cgroup_subsys_state *css;
++ struct task_struct *p;
++
++ if (!scx_cgroup_enabled)
++ goto out_unlock;
++
++ cgroup_taskset_for_each(p, css, tset) {
++ if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from)
++ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p,
++ p->scx.cgrp_moving_from, css->cgroup);
++ p->scx.cgrp_moving_from = NULL;
++ }
++out_unlock:
++ percpu_up_read(&scx_cgroup_rwsem);
++}
++
++void scx_group_set_weight(struct task_group *tg, unsigned long weight)
++{
++ percpu_down_read(&scx_cgroup_rwsem);
++
++ if (scx_cgroup_enabled && tg->scx_weight != weight) {
++ if (SCX_HAS_OP(cgroup_set_weight))
++ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight,
++ tg_cgrp(tg), weight);
++ tg->scx_weight = weight;
++ }
++
++ percpu_up_read(&scx_cgroup_rwsem);
++}
++
++void scx_group_set_idle(struct task_group *tg, bool idle)
++{
++ percpu_down_read(&scx_cgroup_rwsem);
++ scx_cgroup_warn_missing_idle(tg);
++ percpu_up_read(&scx_cgroup_rwsem);
++}
++
++static void scx_cgroup_lock(void)
++{
++ percpu_down_write(&scx_cgroup_rwsem);
++}
++
++static void scx_cgroup_unlock(void)
++{
++ percpu_up_write(&scx_cgroup_rwsem);
++}
++
++#else /* CONFIG_EXT_GROUP_SCHED */
++
++static inline void scx_cgroup_lock(void) {}
++static inline void scx_cgroup_unlock(void) {}
++
++#endif /* CONFIG_EXT_GROUP_SCHED */
++
+/*
+ * Omitted operations:
+ *
@@ -5161,6 +5869,7 @@ index 000000000000..0dac88d0e578
+
+ .wakeup_preempt = wakeup_preempt_scx,
+
++ .balance = balance_scx,
+ .pick_next_task = pick_next_task_scx,
+
+ .put_prev_task = put_prev_task_scx,
@@ -5169,7 +5878,6 @@ index 000000000000..0dac88d0e578
+ .switch_class = switch_class_scx,
+
+#ifdef CONFIG_SMP
-+ .balance = balance_scx,
+ .select_task_rq = select_task_rq_scx,
+ .task_woken = task_woken_scx,
+ .set_cpus_allowed = set_cpus_allowed_scx,
@@ -5278,6 +5986,102 @@ index 000000000000..0dac88d0e578
+ rcu_read_unlock();
+}
+
++#ifdef CONFIG_EXT_GROUP_SCHED
++static void scx_cgroup_exit(void)
++{
++ struct cgroup_subsys_state *css;
++
++ percpu_rwsem_assert_held(&scx_cgroup_rwsem);
++
++ WARN_ON_ONCE(!scx_cgroup_enabled);
++ scx_cgroup_enabled = false;
++
++ /*
++ * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk
++ * cgroups and exit all the inited ones, all online cgroups are exited.
++ */
++ rcu_read_lock();
++ css_for_each_descendant_post(css, &root_task_group.css) {
++ struct task_group *tg = css_tg(css);
++
++ if (!(tg->scx_flags & SCX_TG_INITED))
++ continue;
++ tg->scx_flags &= ~SCX_TG_INITED;
++
++ if (!scx_ops.cgroup_exit)
++ continue;
++
++ if (WARN_ON_ONCE(!css_tryget(css)))
++ continue;
++ rcu_read_unlock();
++
++ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, css->cgroup);
++
++ rcu_read_lock();
++ css_put(css);
++ }
++ rcu_read_unlock();
++}
++
++static int scx_cgroup_init(void)
++{
++ struct cgroup_subsys_state *css;
++ int ret;
++
++ percpu_rwsem_assert_held(&scx_cgroup_rwsem);
++
++ cgroup_warned_missing_weight = false;
++ cgroup_warned_missing_idle = false;
++
++ /*
++ * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk
++ * cgroups and init, all online cgroups are initialized.
++ */
++ rcu_read_lock();
++ css_for_each_descendant_pre(css, &root_task_group.css) {
++ struct task_group *tg = css_tg(css);
++ struct scx_cgroup_init_args args = { .weight = tg->scx_weight };
++
++ scx_cgroup_warn_missing_weight(tg);
++ scx_cgroup_warn_missing_idle(tg);
++
++ if ((tg->scx_flags &
++ (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE)
++ continue;
++
++ if (!scx_ops.cgroup_init) {
++ tg->scx_flags |= SCX_TG_INITED;
++ continue;
++ }
++
++ if (WARN_ON_ONCE(!css_tryget(css)))
++ continue;
++ rcu_read_unlock();
++
++ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init,
++ css->cgroup, &args);
++ if (ret) {
++ css_put(css);
++ return ret;
++ }
++ tg->scx_flags |= SCX_TG_INITED;
++
++ rcu_read_lock();
++ css_put(css);
++ }
++ rcu_read_unlock();
++
++ WARN_ON_ONCE(scx_cgroup_enabled);
++ scx_cgroup_enabled = true;
++
++ return 0;
++}
++
++#else
++static void scx_cgroup_exit(void) {}
++static int scx_cgroup_init(void) { return 0; }
++#endif
++
+
+/********************************************************************************
+ * Sysfs interface and ops enable/disable.
@@ -5318,11 +6122,19 @@ index 000000000000..0dac88d0e578
+}
+SCX_ATTR(hotplug_seq);
+
++static ssize_t scx_attr_enable_seq_show(struct kobject *kobj,
++ struct kobj_attribute *ka, char *buf)
++{
++ return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_enable_seq));
++}
++SCX_ATTR(enable_seq);
++
+static struct attribute *scx_global_attrs[] = {
+ &scx_attr_state.attr,
+ &scx_attr_switch_all.attr,
+ &scx_attr_nr_rejected.attr,
+ &scx_attr_hotplug_seq.attr,
++ &scx_attr_enable_seq.attr,
+ NULL,
+};
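
A small userspace sketch for reading the new attribute, assuming it is exposed at /sys/kernel/sched_ext/enable_seq alongside the other global attributes above (the exact path is an assumption based on the sched-ext sysfs conventions):

#include <stdio.h>

int main(void)
{
	unsigned long seq;
	FILE *f = fopen("/sys/kernel/sched_ext/enable_seq", "r");

	if (!f)
		return 1;
	if (fscanf(f, "%lu", &seq) == 1)
		printf("sched_ext schedulers enabled so far: %lu\n", seq);
	fclose(f);
	return 0;
}
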
+
@@ -5421,16 +6233,8 @@ index 000000000000..0dac88d0e578
+ }
+
+ /*
-+ * We need to guarantee that no tasks are on the BPF scheduler while
-+ * bypassing. Either we see enabled or the enable path sees the
-+ * increased bypass_depth before moving tasks to SCX.
-+ */
-+ if (!scx_enabled())
-+ return;
-+
-+ /*
+ * No task property is changing. We just need to make sure all currently
-+ * queued tasks are re-queued according to the new scx_ops_bypassing()
++ * queued tasks are re-queued according to the new scx_rq_bypassing()
+ * state. As an optimization, walk each rq's runnable_list instead of
+ * the scx_tasks list.
+ *
@@ -5444,6 +6248,24 @@ index 000000000000..0dac88d0e578
+
+ rq_lock_irqsave(rq, &rf);
+
++ if (bypass) {
++ WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING);
++ rq->scx.flags |= SCX_RQ_BYPASSING;
++ } else {
++ WARN_ON_ONCE(!(rq->scx.flags & SCX_RQ_BYPASSING));
++ rq->scx.flags &= ~SCX_RQ_BYPASSING;
++ }
++
++ /*
++ * We need to guarantee that no tasks are on the BPF scheduler
++ * while bypassing. Either we see enabled or the enable path
++ * sees scx_rq_bypassing() before moving tasks to SCX.
++ */
++ if (!scx_enabled()) {
++ rq_unlock_irqrestore(rq, &rf);
++ continue;
++ }
++
+ /*
+ * The use of list_for_each_entry_safe_reverse() is required
+ * because each task is going to be removed from and added back
@@ -5499,11 +6321,11 @@ index 000000000000..0dac88d0e578
+{
+ switch (kind) {
+ case SCX_EXIT_UNREG:
-+ return "Scheduler unregistered from user space";
++ return "unregistered from user space";
+ case SCX_EXIT_UNREG_BPF:
-+ return "Scheduler unregistered from BPF";
++ return "unregistered from BPF";
+ case SCX_EXIT_UNREG_KERN:
-+ return "Scheduler unregistered from the main kernel";
++ return "unregistered from the main kernel";
+ case SCX_EXIT_SYSRQ:
+ return "disabled by sysrq-S";
+ case SCX_EXIT_ERROR:
@@ -5569,66 +6391,64 @@ index 000000000000..0dac88d0e578
+ WRITE_ONCE(scx_switching_all, false);
+
+ /*
-+ * Avoid racing against fork. See scx_ops_enable() for explanation on
-+ * the locking order.
++ * Shut down cgroup support before tasks so that the cgroup attach path
++ * doesn't race against scx_ops_exit_task().
++ */
++ scx_cgroup_lock();
++ scx_cgroup_exit();
++ scx_cgroup_unlock();
++
++ /*
++ * The BPF scheduler is going away. All tasks including %TASK_DEAD ones
++ * must be switched out and exited synchronously.
+ */
+ percpu_down_write(&scx_fork_rwsem);
-+ cpus_read_lock();
++
++ scx_ops_init_task_enabled = false;
+
+ spin_lock_irq(&scx_tasks_lock);
+ scx_task_iter_init(&sti);
-+ /*
-+ * Invoke scx_ops_exit_task() on all non-idle tasks, including
-+ * TASK_DEAD tasks. Because dead tasks may have a nonzero refcount,
-+ * we may not have invoked sched_ext_free() on them by the time a
-+ * scheduler is disabled. We must therefore exit the task here, or we'd
-+ * fail to invoke ops.exit_task(), as the scheduler will have been
-+ * unloaded by the time the task is subsequently exited on the
-+ * sched_ext_free() path.
-+ */
-+ while ((p = scx_task_iter_next_locked(&sti, true))) {
++ while ((p = scx_task_iter_next_locked(&sti))) {
+ const struct sched_class *old_class = p->sched_class;
+ struct sched_enq_and_set_ctx ctx;
+
-+ if (READ_ONCE(p->__state) != TASK_DEAD) {
-+ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE,
-+ &ctx);
++ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
+
-+ p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL);
-+ __setscheduler_prio(p, p->prio);
-+ check_class_changing(task_rq(p), p, old_class);
++ p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL);
++ __setscheduler_prio(p, p->prio);
++ check_class_changing(task_rq(p), p, old_class);
+
-+ sched_enq_and_set_task(&ctx);
++ sched_enq_and_set_task(&ctx);
+
-+ check_class_changed(task_rq(p), p, old_class, p->prio);
-+ }
++ check_class_changed(task_rq(p), p, old_class, p->prio);
+ scx_ops_exit_task(p);
+ }
+ scx_task_iter_exit(&sti);
+ spin_unlock_irq(&scx_tasks_lock);
++ percpu_up_write(&scx_fork_rwsem);
+
+ /* no task is on scx, turn off all the switches and flush in-progress calls */
-+ static_branch_disable_cpuslocked(&__scx_ops_enabled);
++ static_branch_disable(&__scx_ops_enabled);
+ for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
-+ static_branch_disable_cpuslocked(&scx_has_op[i]);
-+ static_branch_disable_cpuslocked(&scx_ops_enq_last);
-+ static_branch_disable_cpuslocked(&scx_ops_enq_exiting);
-+ static_branch_disable_cpuslocked(&scx_ops_cpu_preempt);
-+ static_branch_disable_cpuslocked(&scx_builtin_idle_enabled);
++ static_branch_disable(&scx_has_op[i]);
++ static_branch_disable(&scx_ops_enq_last);
++ static_branch_disable(&scx_ops_enq_exiting);
++ static_branch_disable(&scx_ops_cpu_preempt);
++ static_branch_disable(&scx_builtin_idle_enabled);
+ synchronize_rcu();
+
-+ cpus_read_unlock();
-+ percpu_up_write(&scx_fork_rwsem);
-+
+ if (ei->kind >= SCX_EXIT_ERROR) {
-+ printk(KERN_ERR "sched_ext: BPF scheduler \"%s\" errored, disabling\n", scx_ops.name);
-+
-+ if (ei->msg[0] == '\0')
-+ printk(KERN_ERR "sched_ext: %s\n", ei->reason);
-+ else
-+ printk(KERN_ERR "sched_ext: %s (%s)\n", ei->reason, ei->msg);
++ pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
++ scx_ops.name, ei->reason);
+
++ if (ei->msg[0] != '\0')
++ pr_err("sched_ext: %s: %s\n", scx_ops.name, ei->msg);
++#ifdef CONFIG_STACKTRACE
+ stack_trace_print(ei->bt, ei->bt_len, 2);
++#endif
++ } else {
++ pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
++ scx_ops.name, ei->reason);
+ }
+
+ if (scx_ops.exit)
@@ -5817,7 +6637,7 @@ index 000000000000..0dac88d0e578
+ static unsigned long bt[SCX_EXIT_BT_LEN];
+ char dsq_id_buf[19] = "(n/a)";
+ unsigned long ops_state = atomic_long_read(&p->scx.ops_state);
-+ unsigned int bt_len;
++ unsigned int bt_len = 0;
+
+ if (p->scx.dsq)
+ scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx",
@@ -5842,7 +6662,9 @@ index 000000000000..0dac88d0e578
+ ops_dump_exit();
+ }
+
++#ifdef CONFIG_STACKTRACE
+ bt_len = stack_trace_save_tsk(p, bt, SCX_EXIT_BT_LEN, 1);
++#endif
+ if (bt_len) {
+ dump_newline(s);
+ dump_stack_trace(s, " ", bt, bt_len);
@@ -6000,10 +6822,10 @@ index 000000000000..0dac88d0e578
+ return;
+
+ ei->exit_code = exit_code;
-+
++#ifdef CONFIG_STACKTRACE
+ if (kind >= SCX_EXIT_ERROR)
+ ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1);
-+
++#endif
+ va_start(args, fmt);
+ vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args);
+ va_end(args);
@@ -6061,12 +6883,12 @@ index 000000000000..0dac88d0e578
+ return 0;
+}
+
-+static int scx_ops_enable(struct sched_ext_ops *ops)
++static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
+{
+ struct scx_task_iter sti;
+ struct task_struct *p;
+ unsigned long timeout;
-+ int i, cpu, ret;
++ int i, cpu, node, ret;
+
+ if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
+ cpu_possible_mask)) {
@@ -6085,6 +6907,34 @@ index 000000000000..0dac88d0e578
+ }
+ }
+
++ if (!global_dsqs) {
++ struct scx_dispatch_q **dsqs;
++
++ dsqs = kcalloc(nr_node_ids, sizeof(dsqs[0]), GFP_KERNEL);
++ if (!dsqs) {
++ ret = -ENOMEM;
++ goto err_unlock;
++ }
++
++ for_each_node_state(node, N_POSSIBLE) {
++ struct scx_dispatch_q *dsq;
++
++ dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node);
++ if (!dsq) {
++ for_each_node_state(node, N_POSSIBLE)
++ kfree(dsqs[node]);
++ kfree(dsqs);
++ ret = -ENOMEM;
++ goto err_unlock;
++ }
++
++ init_dsq(dsq, SCX_DSQ_GLOBAL);
++ dsqs[node] = dsq;
++ }
++
++ global_dsqs = dsqs;
++ }
++
+ if (scx_ops_enable_state() != SCX_OPS_DISABLED) {
+ ret = -EBUSY;
+ goto err_unlock;
@@ -6108,12 +6958,12 @@ index 000000000000..0dac88d0e578
+ }
+
+ /*
-+ * Set scx_ops, transition to PREPPING and clear exit info to arm the
++ * Set scx_ops, transition to ENABLING and clear exit info to arm the
+ * disable path. Failure triggers full disabling from here on.
+ */
+ scx_ops = *ops;
+
-+ WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_PREPPING) !=
++ WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_ENABLING) !=
+ SCX_OPS_DISABLED);
+
+ atomic_set(&scx_exit_kind, SCX_EXIT_NONE);
@@ -6134,7 +6984,8 @@ index 000000000000..0dac88d0e578
+ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init);
+ if (ret) {
+ ret = ops_sanitize_err("init", ret);
-+ goto err_disable_unlock_cpus;
++ cpus_read_unlock();
++ goto err_disable;
+ }
+ }
+
@@ -6142,6 +6993,7 @@ index 000000000000..0dac88d0e578
+ if (((void (**)(void))ops)[i])
+ static_branch_enable_cpuslocked(&scx_has_op[i]);
+
++ check_hotplug_seq(ops);
+ cpus_read_unlock();
+
+ ret = validate_ops(ops);
@@ -6169,42 +7021,40 @@ index 000000000000..0dac88d0e578
+ scx_watchdog_timeout / 2);
+
+ /*
-+ * Lock out forks before opening the floodgate so that they don't wander
-+ * into the operations prematurely.
-+ *
-+ * We don't need to keep the CPUs stable but grab cpus_read_lock() to
-+ * ease future locking changes for cgroup suport.
-+ *
-+ * Note that cpu_hotplug_lock must nest inside scx_fork_rwsem due to the
-+ * following dependency chain:
-+ *
-+ * scx_fork_rwsem --> pernet_ops_rwsem --> cpu_hotplug_lock
++ * Once __scx_ops_enabled is set, %current can be switched to SCX
++ * anytime. This can lead to stalls as some BPF schedulers (e.g.
++ * userspace scheduling) may not function correctly before all tasks are
++ * switched. Init in bypass mode to guarantee forward progress.
+ */
-+ percpu_down_write(&scx_fork_rwsem);
-+ cpus_read_lock();
-+
-+ check_hotplug_seq(ops);
++ scx_ops_bypass(true);
+
+ for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
+ if (((void (**)(void))ops)[i])
-+ static_branch_enable_cpuslocked(&scx_has_op[i]);
++ static_branch_enable(&scx_has_op[i]);
+
+ if (ops->flags & SCX_OPS_ENQ_LAST)
-+ static_branch_enable_cpuslocked(&scx_ops_enq_last);
++ static_branch_enable(&scx_ops_enq_last);
+
+ if (ops->flags & SCX_OPS_ENQ_EXITING)
-+ static_branch_enable_cpuslocked(&scx_ops_enq_exiting);
++ static_branch_enable(&scx_ops_enq_exiting);
+ if (scx_ops.cpu_acquire || scx_ops.cpu_release)
-+ static_branch_enable_cpuslocked(&scx_ops_cpu_preempt);
++ static_branch_enable(&scx_ops_cpu_preempt);
+
+ if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) {
+ reset_idle_masks();
-+ static_branch_enable_cpuslocked(&scx_builtin_idle_enabled);
++ static_branch_enable(&scx_builtin_idle_enabled);
+ } else {
-+ static_branch_disable_cpuslocked(&scx_builtin_idle_enabled);
++ static_branch_disable(&scx_builtin_idle_enabled);
+ }
+
-+ static_branch_enable_cpuslocked(&__scx_ops_enabled);
++ /*
++ * Lock out forks, cgroup on/offlining and moves before opening the
++ * floodgate so that they don't wander into the operations prematurely.
++ */
++ percpu_down_write(&scx_fork_rwsem);
++
++ WARN_ON_ONCE(scx_ops_init_task_enabled);
++ scx_ops_init_task_enabled = true;
+
+ /*
+ * Enable ops for every task. Fork is excluded by scx_fork_rwsem
@@ -6212,12 +7062,29 @@ index 000000000000..0dac88d0e578
+ * leaving as sched_ext_free() can handle both prepped and enabled
+ * tasks. Prep all tasks first and then enable them with preemption
+ * disabled.
++ *
++ * All cgroups should be initialized before scx_ops_init_task() so that
++ * the BPF scheduler can reliably track each task's cgroup membership
++ * from scx_ops_init_task(). Lock out cgroup on/offlining and task
++ * migrations while tasks are being initialized so that
++ * scx_cgroup_can_attach() never sees uninitialized tasks.
+ */
-+ spin_lock_irq(&scx_tasks_lock);
++ scx_cgroup_lock();
++ ret = scx_cgroup_init();
++ if (ret)
++ goto err_disable_unlock_all;
+
++ spin_lock_irq(&scx_tasks_lock);
+ scx_task_iter_init(&sti);
-+ while ((p = scx_task_iter_next_locked(&sti, false))) {
-+ get_task_struct(p);
++ while ((p = scx_task_iter_next_locked(&sti))) {
++ /*
++		 * @p may already be dead, have lost all its usage counts and
++ * be waiting for RCU grace period before being freed. @p can't
++ * be initialized for SCX in such cases and should be ignored.
++ */
++ if (!tryget_task_struct(p))
++ continue;
++
+ scx_task_iter_rq_unlock(&sti);
+ spin_unlock_irq(&scx_tasks_lock);
+
@@ -6232,51 +7099,37 @@ index 000000000000..0dac88d0e578
+ goto err_disable_unlock_all;
+ }
+
++ scx_set_task_state(p, SCX_TASK_READY);
++
+ put_task_struct(p);
+ spin_lock_irq(&scx_tasks_lock);
+ }
+ scx_task_iter_exit(&sti);
++ spin_unlock_irq(&scx_tasks_lock);
++ scx_cgroup_unlock();
++ percpu_up_write(&scx_fork_rwsem);
+
+ /*
-+ * All tasks are prepped but are still ops-disabled. Ensure that
-+ * %current can't be scheduled out and switch everyone.
-+ * preempt_disable() is necessary because we can't guarantee that
-+ * %current won't be starved if scheduled out while switching.
-+ */
-+ preempt_disable();
-+
-+ /*
-+ * From here on, the disable path must assume that tasks have ops
-+ * enabled and need to be recovered.
-+ *
-+ * Transition to ENABLING fails iff the BPF scheduler has already
-+ * triggered scx_bpf_error(). Returning an error code here would lose
-+ * the recorded error information. Exit indicating success so that the
-+ * error is notified through ops.exit() with all the details.
++ * All tasks are READY. It's safe to turn on scx_enabled() and switch
++ * all eligible tasks.
+ */
-+ if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLING, SCX_OPS_PREPPING)) {
-+ preempt_enable();
-+ spin_unlock_irq(&scx_tasks_lock);
-+ WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE);
-+ ret = 0;
-+ goto err_disable_unlock_all;
-+ }
++ WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL));
++ static_branch_enable(&__scx_ops_enabled);
+
+ /*
-+ * We're fully committed and can't fail. The PREPPED -> ENABLED
++ * We're fully committed and can't fail. The task READY -> ENABLED
+ * transitions here are synchronized against sched_ext_free() through
+ * scx_tasks_lock.
+ */
-+ WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL));
-+
++ percpu_down_write(&scx_fork_rwsem);
++ spin_lock_irq(&scx_tasks_lock);
+ scx_task_iter_init(&sti);
-+ while ((p = scx_task_iter_next_locked(&sti, false))) {
++ while ((p = scx_task_iter_next_locked(&sti))) {
+ const struct sched_class *old_class = p->sched_class;
+ struct sched_enq_and_set_ctx ctx;
+
+ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
+
-+ scx_set_task_state(p, SCX_TASK_READY);
+ __setscheduler_prio(p, p->prio);
+ check_class_changing(task_rq(p), p, old_class);
+
@@ -6285,13 +7138,16 @@ index 000000000000..0dac88d0e578
+ check_class_changed(task_rq(p), p, old_class, p->prio);
+ }
+ scx_task_iter_exit(&sti);
-+
+ spin_unlock_irq(&scx_tasks_lock);
-+ preempt_enable();
-+ cpus_read_unlock();
+ percpu_up_write(&scx_fork_rwsem);
+
-+ /* see above ENABLING transition for the explanation on exiting with 0 */
++ scx_ops_bypass(false);
++
++ /*
++ * Returning an error code here would lose the recorded error
++ * information. Exit indicating success so that the error is notified
++ * through ops.exit() with all the details.
++ */
+ if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) {
+ WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE);
+ ret = 0;
@@ -6301,9 +7157,13 @@ index 000000000000..0dac88d0e578
+ if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL))
+ static_branch_enable(&__scx_switched_all);
+
++ pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n",
++ scx_ops.name, scx_switched_all() ? "" : " (partial)");
+ kobject_uevent(scx_root_kobj, KOBJ_ADD);
+ mutex_unlock(&scx_ops_enable_mutex);
+
++ atomic_long_inc(&scx_enable_seq);
++
+ return 0;
+
+err_del:
@@ -6320,9 +7180,9 @@ index 000000000000..0dac88d0e578
+ return ret;
+
+err_disable_unlock_all:
++ scx_cgroup_unlock();
+ percpu_up_write(&scx_fork_rwsem);
-+err_disable_unlock_cpus:
-+ cpus_read_unlock();
++ scx_ops_bypass(false);
+err_disable:
+ mutex_unlock(&scx_ops_enable_mutex);
+ /* must be fully disabled before returning */
@@ -6514,6 +7374,11 @@ index 000000000000..0dac88d0e578
+
+ switch (moff) {
+ case offsetof(struct sched_ext_ops, init_task):
++#ifdef CONFIG_EXT_GROUP_SCHED
++ case offsetof(struct sched_ext_ops, cgroup_init):
++ case offsetof(struct sched_ext_ops, cgroup_exit):
++ case offsetof(struct sched_ext_ops, cgroup_prep_move):
++#endif
+ case offsetof(struct sched_ext_ops, cpu_online):
+ case offsetof(struct sched_ext_ops, cpu_offline):
+ case offsetof(struct sched_ext_ops, init):
@@ -6527,12 +7392,12 @@ index 000000000000..0dac88d0e578
+ return 0;
+}
+
-+static int bpf_scx_reg(void *kdata)
++static int bpf_scx_reg(void *kdata, struct bpf_link *link)
+{
-+ return scx_ops_enable(kdata);
++ return scx_ops_enable(kdata, link);
+}
+
-+static void bpf_scx_unreg(void *kdata)
++static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
+{
+ scx_ops_disable(SCX_EXIT_UNREG);
+ kthread_flush_work(&scx_ops_disable_work);
@@ -6551,7 +7416,7 @@ index 000000000000..0dac88d0e578
+ return 0;
+}
+
-+static int bpf_scx_update(void *kdata, void *old_kdata)
++static int bpf_scx_update(void *kdata, void *old_kdata, struct bpf_link *link)
+{
+ /*
+ * sched_ext does not support updating the actively-loaded BPF
@@ -6572,6 +7437,7 @@ index 000000000000..0dac88d0e578
+static void enqueue_stub(struct task_struct *p, u64 enq_flags) {}
+static void dequeue_stub(struct task_struct *p, u64 enq_flags) {}
+static void dispatch_stub(s32 prev_cpu, struct task_struct *p) {}
++static void tick_stub(struct task_struct *p) {}
+static void runnable_stub(struct task_struct *p, u64 enq_flags) {}
+static void running_stub(struct task_struct *p) {}
+static void stopping_stub(struct task_struct *p, bool runnable) {}
@@ -6587,16 +7453,28 @@ index 000000000000..0dac88d0e578
+static void exit_task_stub(struct task_struct *p, struct scx_exit_task_args *args) {}
+static void enable_stub(struct task_struct *p) {}
+static void disable_stub(struct task_struct *p) {}
++#ifdef CONFIG_EXT_GROUP_SCHED
++static s32 cgroup_init_stub(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; }
++static void cgroup_exit_stub(struct cgroup *cgrp) {}
++static s32 cgroup_prep_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; }
++static void cgroup_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
++static void cgroup_cancel_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
++static void cgroup_set_weight_stub(struct cgroup *cgrp, u32 weight) {}
++#endif
+static void cpu_online_stub(s32 cpu) {}
+static void cpu_offline_stub(s32 cpu) {}
+static s32 init_stub(void) { return -EINVAL; }
+static void exit_stub(struct scx_exit_info *info) {}
++static void dump_stub(struct scx_dump_ctx *ctx) {}
++static void dump_cpu_stub(struct scx_dump_ctx *ctx, s32 cpu, bool idle) {}
++static void dump_task_stub(struct scx_dump_ctx *ctx, struct task_struct *p) {}
+
+static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
+ .select_cpu = select_cpu_stub,
+ .enqueue = enqueue_stub,
+ .dequeue = dequeue_stub,
+ .dispatch = dispatch_stub,
++ .tick = tick_stub,
+ .runnable = runnable_stub,
+ .running = running_stub,
+ .stopping = stopping_stub,
@@ -6612,10 +7490,21 @@ index 000000000000..0dac88d0e578
+ .exit_task = exit_task_stub,
+ .enable = enable_stub,
+ .disable = disable_stub,
++#ifdef CONFIG_EXT_GROUP_SCHED
++ .cgroup_init = cgroup_init_stub,
++ .cgroup_exit = cgroup_exit_stub,
++ .cgroup_prep_move = cgroup_prep_move_stub,
++ .cgroup_move = cgroup_move_stub,
++ .cgroup_cancel_move = cgroup_cancel_move_stub,
++ .cgroup_set_weight = cgroup_set_weight_stub,
++#endif
+ .cpu_online = cpu_online_stub,
+ .cpu_offline = cpu_offline_stub,
+ .init = init_stub,
+ .exit = exit_stub,
++ .dump = dump_stub,
++ .dump_cpu = dump_cpu_stub,
++ .dump_task = dump_task_stub,
+};
+
+static struct bpf_struct_ops bpf_sched_ext_ops = {
@@ -6858,10 +7747,10 @@ index 000000000000..0dac88d0e578
+ * definitions so that BPF scheduler implementations can use them
+ * through the generated vmlinux.h.
+ */
-+ WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT);
++ WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT |
++ SCX_TG_ONLINE);
+
+ BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params));
-+ init_dsq(&scx_dsq_global, SCX_DSQ_GLOBAL);
+#ifdef CONFIG_SMP
+ BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL));
+ BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL));
@@ -6903,35 +7792,6 @@ index 000000000000..0dac88d0e578
+__bpf_kfunc_start_defs();
+
+/**
-+ * scx_bpf_create_dsq - Create a custom DSQ
-+ * @dsq_id: DSQ to create
-+ * @node: NUMA node to allocate from
-+ *
-+ * Create a custom DSQ identified by @dsq_id. Can be called from any sleepable
-+ * scx callback, and any BPF_PROG_TYPE_SYSCALL prog.
-+ */
-+__bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node)
-+{
-+ if (unlikely(node >= (int)nr_node_ids ||
-+ (node < 0 && node != NUMA_NO_NODE)))
-+ return -EINVAL;
-+ return PTR_ERR_OR_ZERO(create_dsq(dsq_id, node));
-+}
-+
-+__bpf_kfunc_end_defs();
-+
-+BTF_KFUNCS_START(scx_kfunc_ids_sleepable)
-+BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE)
-+BTF_KFUNCS_END(scx_kfunc_ids_sleepable)
-+
-+static const struct btf_kfunc_id_set scx_kfunc_set_sleepable = {
-+ .owner = THIS_MODULE,
-+ .set = &scx_kfunc_ids_sleepable,
-+};
-+
-+__bpf_kfunc_start_defs();
-+
-+/**
+ * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu()
+ * @p: task_struct to select a CPU for
+ * @prev_cpu: CPU @p was on previously
@@ -7021,7 +7881,7 @@ index 000000000000..0dac88d0e578
+ * scx_bpf_dispatch - Dispatch a task into the FIFO queue of a DSQ
+ * @p: task_struct to dispatch
+ * @dsq_id: DSQ to dispatch to
-+ * @slice: duration @p can run for in nsecs
++ * @slice: duration @p can run for in nsecs, 0 to keep the current value
+ * @enq_flags: SCX_ENQ_*
+ *
+ * Dispatch @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe
@@ -7071,7 +7931,7 @@ index 000000000000..0dac88d0e578
+ * scx_bpf_dispatch_vtime - Dispatch a task into the vtime priority queue of a DSQ
+ * @p: task_struct to dispatch
+ * @dsq_id: DSQ to dispatch to
-+ * @slice: duration @p can run for in nsecs
++ * @slice: duration @p can run for in nsecs, 0 to keep the current value
+ * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ
+ * @enq_flags: SCX_ENQ_*
+ *
@@ -7112,6 +7972,118 @@ index 000000000000..0dac88d0e578
+ .set = &scx_kfunc_ids_enqueue_dispatch,
+};
+
++static bool scx_dispatch_from_dsq(struct bpf_iter_scx_dsq_kern *kit,
++ struct task_struct *p, u64 dsq_id,
++ u64 enq_flags)
++{
++ struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq;
++ struct rq *this_rq, *src_rq, *dst_rq, *locked_rq;
++ bool dispatched = false;
++ bool in_balance;
++ unsigned long flags;
++
++ if (!scx_kf_allowed_if_unlocked() && !scx_kf_allowed(SCX_KF_DISPATCH))
++ return false;
++
++ /*
++ * Can be called from either ops.dispatch() locking this_rq() or any
++	 * context where no rq lock is held. If the latter, lock @p's task_rq which
++ * we'll likely need anyway.
++ */
++ src_rq = task_rq(p);
++
++ local_irq_save(flags);
++ this_rq = this_rq();
++ in_balance = this_rq->scx.flags & SCX_RQ_IN_BALANCE;
++
++ if (in_balance) {
++ if (this_rq != src_rq) {
++ raw_spin_rq_unlock(this_rq);
++ raw_spin_rq_lock(src_rq);
++ }
++ } else {
++ raw_spin_rq_lock(src_rq);
++ }
++
++ locked_rq = src_rq;
++ raw_spin_lock(&src_dsq->lock);
++
++ /*
++ * Did someone else get to it? @p could have already left $src_dsq, got
++	 * re-enqueued, or be in the process of being consumed by someone else.
++ */
++ if (unlikely(p->scx.dsq != src_dsq ||
++ u32_before(kit->cursor.priv, p->scx.dsq_seq) ||
++ p->scx.holding_cpu >= 0) ||
++ WARN_ON_ONCE(src_rq != task_rq(p))) {
++ raw_spin_unlock(&src_dsq->lock);
++ goto out;
++ }
++
++ /* @p is still on $src_dsq and stable, determine the destination */
++ dst_dsq = find_dsq_for_dispatch(this_rq, dsq_id, p);
++
++ if (dst_dsq->id == SCX_DSQ_LOCAL) {
++ dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
++ if (!task_can_run_on_remote_rq(p, dst_rq, true)) {
++ dst_dsq = find_global_dsq(p);
++ dst_rq = src_rq;
++ }
++ } else {
++ /* no need to migrate if destination is a non-local DSQ */
++ dst_rq = src_rq;
++ }
++
++ /*
++ * Move @p into $dst_dsq. If $dst_dsq is the local DSQ of a different
++ * CPU, @p will be migrated.
++ */
++ if (dst_dsq->id == SCX_DSQ_LOCAL) {
++ /* @p is going from a non-local DSQ to a local DSQ */
++ if (src_rq == dst_rq) {
++ task_unlink_from_dsq(p, src_dsq);
++ move_local_task_to_local_dsq(p, enq_flags,
++ src_dsq, dst_rq);
++ raw_spin_unlock(&src_dsq->lock);
++ } else {
++ raw_spin_unlock(&src_dsq->lock);
++ move_remote_task_to_local_dsq(p, enq_flags,
++ src_rq, dst_rq);
++ locked_rq = dst_rq;
++ }
++ } else {
++ /*
++ * @p is going from a non-local DSQ to a non-local DSQ. As
++ * $src_dsq is already locked, do an abbreviated dequeue.
++ */
++ task_unlink_from_dsq(p, src_dsq);
++ p->scx.dsq = NULL;
++ raw_spin_unlock(&src_dsq->lock);
++
++ if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME)
++ p->scx.dsq_vtime = kit->vtime;
++ dispatch_enqueue(dst_dsq, p, enq_flags);
++ }
++
++ if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_SLICE)
++ p->scx.slice = kit->slice;
++
++ dispatched = true;
++out:
++ if (in_balance) {
++ if (this_rq != locked_rq) {
++ raw_spin_rq_unlock(locked_rq);
++ raw_spin_rq_lock(this_rq);
++ }
++ } else {
++ raw_spin_rq_unlock_irqrestore(locked_rq, flags);
++ }
++
++ kit->cursor.flags &= ~(__SCX_DSQ_ITER_HAS_SLICE |
++ __SCX_DSQ_ITER_HAS_VTIME);
++ return dispatched;
++}
++
+__bpf_kfunc_start_defs();
+
+/**
@@ -7171,7 +8143,7 @@ index 000000000000..0dac88d0e578
+
+ flush_dispatch_buf(dspc->rq);
+
-+ dsq = find_non_local_dsq(dsq_id);
++ dsq = find_user_dsq(dsq_id);
+ if (unlikely(!dsq)) {
+ scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id);
+ return false;
@@ -7191,12 +8163,112 @@ index 000000000000..0dac88d0e578
+ }
+}
+
++/**
++ * scx_bpf_dispatch_from_dsq_set_slice - Override slice when dispatching from DSQ
++ * @it__iter: DSQ iterator in progress
++ * @slice: duration the dispatched task can run for in nsecs
++ *
++ * Override the slice of the next task that will be dispatched from @it__iter
++ * using scx_bpf_dispatch_from_dsq[_vtime](). If this function is not called,
++ * the previous slice duration is kept.
++ */
++__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_slice(
++ struct bpf_iter_scx_dsq *it__iter, u64 slice)
++{
++ struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter;
++
++ kit->slice = slice;
++ kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE;
++}
++
++/**
++ * scx_bpf_dispatch_from_dsq_set_vtime - Override vtime when dispatching from DSQ
++ * @it__iter: DSQ iterator in progress
++ * @vtime: task's ordering inside the vtime-sorted queue of the target DSQ
++ *
++ * Override the vtime of the next task that will be dispatched from @it__iter
++ * using scx_bpf_dispatch_vtime_from_dsq(). If this function is not called, the
++ * previous vtime is kept. If scx_bpf_dispatch_from_dsq() is used to
++ * dispatch the next task, the override is ignored and cleared.
++ */
++__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime(
++ struct bpf_iter_scx_dsq *it__iter, u64 vtime)
++{
++ struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter;
++
++ kit->vtime = vtime;
++ kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME;
++}
++
++/**
++ * scx_bpf_dispatch_from_dsq - Move a task from DSQ iteration to a DSQ
++ * @it__iter: DSQ iterator in progress
++ * @p: task to transfer
++ * @dsq_id: DSQ to move @p to
++ * @enq_flags: SCX_ENQ_*
++ *
++ * Transfer @p which is on the DSQ currently iterated by @it__iter to the DSQ
++ * specified by @dsq_id. All DSQs - local DSQs, global DSQ and user DSQs - can
++ * be the destination.
++ *
++ * For the transfer to be successful, @p must still be on the DSQ and have been
++ * queued before the DSQ iteration started. This function doesn't care whether
++ * @p was obtained from the DSQ iteration. @p just has to be on the DSQ and have
++ * been queued before the iteration started.
++ *
++ * @p's slice is kept by default. Use scx_bpf_dispatch_from_dsq_set_slice() to
++ * update.
++ *
++ * Can be called from ops.dispatch() or any BPF context which doesn't hold a rq
++ * lock (e.g. BPF timers or SYSCALL programs).
++ *
++ * Returns %true if @p has been consumed, %false if @p had already been consumed
++ * or dequeued.
++ */
++__bpf_kfunc bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter,
++ struct task_struct *p, u64 dsq_id,
++ u64 enq_flags)
++{
++ return scx_dispatch_from_dsq((struct bpf_iter_scx_dsq_kern *)it__iter,
++ p, dsq_id, enq_flags);
++}
++
++/**
++ * scx_bpf_dispatch_vtime_from_dsq - Move a task from DSQ iteration to a PRIQ DSQ
++ * @it__iter: DSQ iterator in progress
++ * @p: task to transfer
++ * @dsq_id: DSQ to move @p to
++ * @enq_flags: SCX_ENQ_*
++ *
++ * Transfer @p which is on the DSQ currently iterated by @it__iter to the
++ * priority queue of the DSQ specified by @dsq_id. The destination must be a
++ * user DSQ as only user DSQs support priority queue.
++ *
++ * @p's slice and vtime are kept by default. Use
++ * scx_bpf_dispatch_from_dsq_set_slice() and
++ * scx_bpf_dispatch_from_dsq_set_vtime() to update.
++ *
++ * All other aspects are identical to scx_bpf_dispatch_from_dsq(). See
++ * scx_bpf_dispatch_vtime() for more information on @vtime.
++ */
++__bpf_kfunc bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter,
++ struct task_struct *p, u64 dsq_id,
++ u64 enq_flags)
++{
++ return scx_dispatch_from_dsq((struct bpf_iter_scx_dsq_kern *)it__iter,
++ p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
++}
++
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(scx_kfunc_ids_dispatch)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel)
+BTF_ID_FLAGS(func, scx_bpf_consume)
++BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice)
++BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime)
++BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU)
++BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU)
+BTF_KFUNCS_END(scx_kfunc_ids_dispatch)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = {
@@ -7274,6 +8346,37 @@ index 000000000000..0dac88d0e578
+__bpf_kfunc_start_defs();
+
+/**
++ * scx_bpf_create_dsq - Create a custom DSQ
++ * @dsq_id: DSQ to create
++ * @node: NUMA node to allocate from
++ *
++ * Create a custom DSQ identified by @dsq_id. Can be called from any sleepable
++ * scx callback, and any BPF_PROG_TYPE_SYSCALL prog.
++ */
++__bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node)
++{
++ if (unlikely(node >= (int)nr_node_ids ||
++ (node < 0 && node != NUMA_NO_NODE)))
++ return -EINVAL;
++ return PTR_ERR_OR_ZERO(create_dsq(dsq_id, node));
++}
++
++__bpf_kfunc_end_defs();
++
++BTF_KFUNCS_START(scx_kfunc_ids_unlocked)
++BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE)
++BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU)
++BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU)
++BTF_KFUNCS_END(scx_kfunc_ids_unlocked)
++
++static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = {
++ .owner = THIS_MODULE,
++ .set = &scx_kfunc_ids_unlocked,
++};
++
++__bpf_kfunc_start_defs();
++
++/**
+ * scx_bpf_kick_cpu - Trigger reschedule on a CPU
+ * @cpu: cpu to kick
+ * @flags: %SCX_KICK_* flags
@@ -7291,17 +8394,17 @@ index 000000000000..0dac88d0e578
+ if (!ops_cpu_valid(cpu, NULL))
+ return;
+
++ local_irq_save(irq_flags);
++
++ this_rq = this_rq();
++
+ /*
+ * While bypassing for PM ops, IRQ handling may not be online which can
+ * lead to irq_work_queue() malfunction such as infinite busy wait for
+ * IRQ status update. Suppress kicking.
+ */
-+ if (scx_ops_bypassing())
-+ return;
-+
-+ local_irq_save(irq_flags);
-+
-+ this_rq = this_rq();
++ if (scx_rq_bypassing(this_rq))
++ goto out;
+
+ /*
+ * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting
@@ -7361,7 +8464,7 @@ index 000000000000..0dac88d0e578
+ goto out;
+ }
+ } else {
-+ dsq = find_non_local_dsq(dsq_id);
++ dsq = find_user_dsq(dsq_id);
+ if (dsq) {
+ ret = READ_ONCE(dsq->nr);
+ goto out;
@@ -7407,17 +8510,16 @@ index 000000000000..0dac88d0e578
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) !=
+ __alignof__(struct bpf_iter_scx_dsq));
+
-+ if (flags & ~__SCX_DSQ_ITER_ALL_FLAGS)
++ if (flags & ~__SCX_DSQ_ITER_USER_FLAGS)
+ return -EINVAL;
+
-+ kit->dsq = find_non_local_dsq(dsq_id);
++ kit->dsq = find_user_dsq(dsq_id);
+ if (!kit->dsq)
+ return -ENOENT;
+
+ INIT_LIST_HEAD(&kit->cursor.node);
-+ kit->cursor.is_bpf_iter_cursor = true;
-+ kit->dsq_seq = READ_ONCE(kit->dsq->seq);
-+ kit->flags = flags;
++ kit->cursor.flags |= SCX_DSQ_LNODE_ITER_CURSOR | flags;
++ kit->cursor.priv = READ_ONCE(kit->dsq->seq);
+
+ return 0;
+}
@@ -7431,7 +8533,7 @@ index 000000000000..0dac88d0e578
+__bpf_kfunc struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it)
+{
+ struct bpf_iter_scx_dsq_kern *kit = (void *)it;
-+ bool rev = kit->flags & SCX_DSQ_ITER_REV;
++ bool rev = kit->cursor.flags & SCX_DSQ_ITER_REV;
+ struct task_struct *p;
+ unsigned long flags;
+
@@ -7452,7 +8554,7 @@ index 000000000000..0dac88d0e578
+ */
+ do {
+ p = nldsq_next_task(kit->dsq, p, rev);
-+ } while (p && unlikely(u32_before(kit->dsq_seq, p->scx.dsq_seq)));
++ } while (p && unlikely(u32_before(kit->cursor.priv, p->scx.dsq_seq)));
+
+ if (p) {
+ if (rev)
@@ -7918,6 +9020,41 @@ index 000000000000..0dac88d0e578
+ return cpu_rq(cpu);
+}
+
++/**
++ * scx_bpf_task_cgroup - Return the sched cgroup of a task
++ * @p: task of interest
++ *
++ * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with
++ * from the scheduler's POV. SCX operations should use this function to
++ * determine @p's current cgroup as, unlike following @p->cgroups,
++ * @p->sched_task_group is protected by @p's rq lock and thus atomic w.r.t. all
++ * rq-locked operations. Can be called on the parameter tasks of rq-locked
++ * operations. The restriction guarantees that @p's rq is locked by the caller.
++ */
++#ifdef CONFIG_CGROUP_SCHED
++__bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p)
++{
++ struct task_group *tg = p->sched_task_group;
++ struct cgroup *cgrp = &cgrp_dfl_root.cgrp;
++
++ if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p))
++ goto out;
++
++ /*
++ * A task_group may either be a cgroup or an autogroup. In the latter
++ * case, @tg->css.cgroup is %NULL. A task_group can't become the other
++ * kind once created.
++ */
++ if (tg && tg->css.cgroup)
++ cgrp = tg->css.cgroup;
++ else
++ cgrp = &cgrp_dfl_root.cgrp;
++out:
++ cgroup_get(cgrp);
++ return cgrp;
++}
++#endif
++
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(scx_kfunc_ids_any)
@@ -7946,6 +9083,9 @@ index 000000000000..0dac88d0e578
+BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_cpu_rq)
++#ifdef CONFIG_CGROUP_SCHED
++BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
++#endif
+BTF_KFUNCS_END(scx_kfunc_ids_any)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_any = {
@@ -7969,10 +9109,6 @@ index 000000000000..0dac88d0e578
+ * check using scx_kf_allowed().
+ */
+ if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
-+ &scx_kfunc_set_sleepable)) ||
-+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL,
-+ &scx_kfunc_set_sleepable)) ||
-+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+ &scx_kfunc_set_select_cpu)) ||
+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+ &scx_kfunc_set_enqueue_dispatch)) ||
@@ -7981,6 +9117,10 @@ index 000000000000..0dac88d0e578
+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+ &scx_kfunc_set_cpu_release)) ||
+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
++ &scx_kfunc_set_unlocked)) ||
++ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL,
++ &scx_kfunc_set_unlocked)) ||
++ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+ &scx_kfunc_set_any)) ||
+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
+ &scx_kfunc_set_any)) ||
@@ -8019,10 +9159,10 @@ index 000000000000..0dac88d0e578
+__initcall(scx_init);
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
new file mode 100644
-index 000000000000..32d3a51f591a
+index 000000000000..246019519231
--- /dev/null
+++ b/kernel/sched/ext.h
-@@ -0,0 +1,69 @@
+@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
@@ -8092,11 +9232,33 @@ index 000000000000..32d3a51f591a
+#else
+static inline void scx_update_idle(struct rq *rq, bool idle) {}
+#endif
++
++#ifdef CONFIG_CGROUP_SCHED
++#ifdef CONFIG_EXT_GROUP_SCHED
++int scx_tg_online(struct task_group *tg);
++void scx_tg_offline(struct task_group *tg);
++int scx_cgroup_can_attach(struct cgroup_taskset *tset);
++void scx_move_task(struct task_struct *p);
++void scx_cgroup_finish_attach(void);
++void scx_cgroup_cancel_attach(struct cgroup_taskset *tset);
++void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight);
++void scx_group_set_idle(struct task_group *tg, bool idle);
++#else /* CONFIG_EXT_GROUP_SCHED */
++static inline int scx_tg_online(struct task_group *tg) { return 0; }
++static inline void scx_tg_offline(struct task_group *tg) {}
++static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; }
++static inline void scx_move_task(struct task_struct *p) {}
++static inline void scx_cgroup_finish_attach(void) {}
++static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {}
++static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {}
++static inline void scx_group_set_idle(struct task_group *tg, bool idle) {}
++#endif /* CONFIG_EXT_GROUP_SCHED */
++#endif /* CONFIG_CGROUP_SCHED */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
-index 483c137b9d3d..ab17954001ae 100644
+index 91b242e47db7..a36e37a674e8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
-@@ -3835,7 +3835,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+@@ -3857,7 +3857,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
}
}
@@ -8106,16 +9268,7 @@ index 483c137b9d3d..ab17954001ae 100644
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
-@@ -8697,7 +8697,7 @@
- /*
- * BATCH and IDLE tasks do not preempt others.
- */
-- if (unlikely(p->policy != SCHED_NORMAL))
-+ if (unlikely(!normal_policy(p->policy)))
- return;
-
- cfs_rq = cfs_rq_of(se);
-@@ -9647,29 +9647,18 @@
+@@ -9365,29 +9366,18 @@ static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {
static bool __update_blocked_others(struct rq *rq, bool *done)
{
@@ -8148,7 +9301,7 @@ index 483c137b9d3d..ab17954001ae 100644
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-@@ -13207,6 +13198,7 @@ DEFINE_SCHED_CLASS(fair) = {
+@@ -13233,6 +13223,7 @@ DEFINE_SCHED_CLASS(fair) = {
.task_tick = task_tick_fair,
.task_fork = task_fork_fair,
@@ -8157,10 +9310,10 @@ index 483c137b9d3d..ab17954001ae 100644
.switched_from = switched_from_fair,
.switched_to = switched_to_fair,
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
-index 6135fbe83d68..3b6540cc436a 100644
+index 6e78d071beb5..c7a218123b7a 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
-@@ -458,11 +458,13 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags)
+@@ -452,11 +452,13 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags)
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
@@ -8175,14 +9328,13 @@ index 6135fbe83d68..3b6540cc436a 100644
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
-index 38aeedd8a6cc..f952a4b99ead 100644
+index 432b43aa091c..48d893de632b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
-@@ -187,9 +187,19 @@ static inline int idle_policy(int policy)
- {
+@@ -192,9 +192,18 @@ static inline int idle_policy(int policy)
return policy == SCHED_IDLE;
}
-+
+
+static inline int normal_policy(int policy)
+{
+#ifdef CONFIG_SCHED_CLASS_EXT
@@ -8199,7 +9351,7 @@ index 38aeedd8a6cc..f952a4b99ead 100644
}
static inline int rt_policy(int policy)
-@@ -237,6 +247,24 @@ static inline void update_avg(u64 *avg, u64 sample)
+@@ -244,6 +253,24 @@ static inline void update_avg(u64 *avg, u64 sample)
#define shr_bound(val, shift) \
(val >> min_t(typeof(shift), shift, BITS_PER_TYPE(typeof(val)) - 1))
@@ -8224,7 +9376,50 @@ index 38aeedd8a6cc..f952a4b99ead 100644
/*
* !! For sched_setattr_nocheck() (kernel) only !!
*
-@@ -475,6 +503,11 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
+@@ -397,16 +424,17 @@ struct cfs_bandwidth {
+ struct task_group {
+ struct cgroup_subsys_state css;
+
++#ifdef CONFIG_GROUP_SCHED_WEIGHT
++ /* A positive value indicates that this is a SCHED_IDLE group. */
++ int idle;
++#endif
++
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+ /* schedulable entities of this group on each CPU */
+ struct sched_entity **se;
+ /* runqueue "owned" by this group on each CPU */
+ struct cfs_rq **cfs_rq;
+ unsigned long shares;
+-
+- /* A positive value indicates that this is a SCHED_IDLE group. */
+- int idle;
+-
+ #ifdef CONFIG_SMP
+ /*
+ * load_avg can be heavily contended at clock tick time, so put
+@@ -424,6 +452,11 @@ struct task_group {
+ struct rt_bandwidth rt_bandwidth;
+ #endif
+
++#ifdef CONFIG_EXT_GROUP_SCHED
++ u32 scx_flags; /* SCX_TG_* */
++ u32 scx_weight;
++#endif
++
+ struct rcu_head rcu;
+ struct list_head list;
+
+@@ -448,7 +481,7 @@ struct task_group {
+
+ };
+
+-#ifdef CONFIG_FAIR_GROUP_SCHED
++#ifdef CONFIG_GROUP_SCHED_WEIGHT
+ #define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
+
+ /*
+@@ -479,6 +512,11 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
return walk_tg_tree_from(&root_task_group, down, up, data);
}
@@ -8236,11 +9431,20 @@ index 38aeedd8a6cc..f952a4b99ead 100644
extern int tg_nop(struct task_group *tg, void *data);
#ifdef CONFIG_FAIR_GROUP_SCHED
-@@ -583,6 +616,12 @@ do { \
- # define u64_u32_load(var) u64_u32_load_copy(var, var##_copy)
- # define u64_u32_store(var, val) u64_u32_store_copy(var, var##_copy, val)
+@@ -535,6 +573,9 @@ extern void set_task_rq_fair(struct sched_entity *se,
+ static inline void set_task_rq_fair(struct sched_entity *se,
+ struct cfs_rq *prev, struct cfs_rq *next) { }
+ #endif /* CONFIG_SMP */
++#else /* !CONFIG_FAIR_GROUP_SCHED */
++static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) { return 0; }
++static inline int sched_group_set_idle(struct task_group *tg, long idle) { return 0; }
+ #endif /* CONFIG_FAIR_GROUP_SCHED */
+
+ #else /* CONFIG_CGROUP_SCHED */
+@@ -588,6 +629,11 @@ do { \
+ # define u64_u32_load(var) u64_u32_load_copy(var, var##_copy)
+ # define u64_u32_store(var, val) u64_u32_store_copy(var, var##_copy, val)
-+struct rq;
+struct balance_callback {
+ struct balance_callback *next;
+ void (*func)(struct rq *rq);
@@ -8249,7 +9453,7 @@ index 38aeedd8a6cc..f952a4b99ead 100644
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
-@@ -691,6 +730,42 @@ struct cfs_rq {
+@@ -696,6 +742,43 @@ struct cfs_rq {
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
@@ -8263,6 +9467,7 @@ index 38aeedd8a6cc..f952a4b99ead 100644
+ */
+ SCX_RQ_ONLINE = 1 << 0,
+ SCX_RQ_CAN_STOP_TICK = 1 << 1,
++ SCX_RQ_BYPASSING = 1 << 3,
+
+ SCX_RQ_IN_WAKEUP = 1 << 16,
+ SCX_RQ_IN_BALANCE = 1 << 17,
@@ -8292,11 +9497,10 @@ index 38aeedd8a6cc..f952a4b99ead 100644
static inline int rt_bandwidth_enabled(void)
{
return sysctl_sched_rt_runtime >= 0;
-@@ -988,12 +1063,6 @@ struct uclamp_rq {
+@@ -996,11 +1079,6 @@ struct uclamp_rq {
DECLARE_STATIC_KEY_FALSE(sched_uclamp_used);
#endif /* CONFIG_UCLAMP_TASK */
--struct rq;
-struct balance_callback {
- struct balance_callback *next;
- void (*func)(struct rq *rq);
@@ -8305,7 +9509,7 @@ index 38aeedd8a6cc..f952a4b99ead 100644
/*
* This is the main, per-CPU runqueue data structure.
*
-@@ -1036,6 +1105,9 @@ struct rq {
+@@ -1043,6 +1121,9 @@ struct rq {
struct cfs_rq cfs;
struct rt_rq rt;
struct dl_rq dl;
@@ -8315,16 +9519,24 @@ index 38aeedd8a6cc..f952a4b99ead 100644
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this CPU: */
-@@ -2278,6 +2350,8 @@ struct sched_class {
+@@ -2291,13 +2372,15 @@ struct sched_class {
+
+ void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags);
+
++ int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
+ struct task_struct *(*pick_next_task)(struct rq *rq);
+
void (*put_prev_task)(struct rq *rq, struct task_struct *p);
void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first);
+ void (*switch_class)(struct rq *rq, struct task_struct *next);
+
#ifdef CONFIG_SMP
- int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
+- int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags);
-@@ -2305,8 +2379,11 @@ struct sched_class {
+
+ struct task_struct * (*pick_task)(struct rq *rq);
+@@ -2323,8 +2406,11 @@ struct sched_class {
* cannot assume the switched_from/switched_to pair is serialized by
* rq->lock. They are however serialized by p->pi_lock.
*/
@@ -8336,7 +9548,7 @@ index 38aeedd8a6cc..f952a4b99ead 100644
void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
int oldprio);
-@@ -2355,19 +2432,54 @@ const struct sched_class name##_sched_class \
+@@ -2373,19 +2459,54 @@ const struct sched_class name##_sched_class \
extern struct sched_class __sched_class_highest[];
extern struct sched_class __sched_class_lowest[];
@@ -8397,50 +9609,77 @@ index 38aeedd8a6cc..f952a4b99ead 100644
static inline bool sched_stop_runnable(struct rq *rq)
{
-@@ -2464,7 +2576,7 @@ extern void init_sched_dl_class(void);
+@@ -2424,6 +2545,19 @@ extern void sched_balance_trigger(struct rq *rq);
+ extern int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx);
+ extern void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx);
+
++static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu)
++{
++ /* When not in the task's cpumask, no point in looking further. */
++ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
++ return false;
++
++ /* Can @cpu run a user thread? */
++ if (!(p->flags & PF_KTHREAD) && !task_cpu_possible(cpu, p))
++ return false;
++
++ return true;
++}
++
+ static inline cpumask_t *alloc_user_cpus_ptr(int node)
+ {
+ /*
+@@ -2457,6 +2591,11 @@ extern int push_cpu_stop(void *arg);
+
+ #else /* !CONFIG_SMP: */
+
++static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu)
++{
++ return true;
++}
++
+ static inline int __set_cpus_allowed_ptr(struct task_struct *p,
+ struct affinity_context *ctx)
+ {
+@@ -2510,8 +2649,6 @@ extern void init_sched_dl_class(void);
extern void init_sched_rt_class(void);
extern void init_sched_fair_class(void);
-extern void reweight_task(struct task_struct *p, const struct load_weight *lw);
-+extern void __setscheduler_prio(struct task_struct *p, int prio);
-
+-
extern void resched_curr(struct rq *rq);
extern void resched_cpu(int cpu);
-@@ -2542,6 +2654,12 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)
- extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
- extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
-+extern void check_class_changing(struct rq *rq, struct task_struct *p,
-+ const struct sched_class *prev_class);
-+extern void check_class_changed(struct rq *rq, struct task_struct *p,
-+ const struct sched_class *prev_class,
-+ int oldprio);
-+
- extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags);
-
- #ifdef CONFIG_PREEMPT_RT
-@@ -3007,6 +3125,9 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
- #endif
+@@ -3056,6 +3193,8 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { }
#ifdef CONFIG_SMP
-+
+
+bool update_other_load_avgs(struct rq *rq);
+
unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
unsigned long *min,
unsigned long *max);
-@@ -3049,6 +3170,8 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
- {
+@@ -3099,6 +3238,8 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
return READ_ONCE(rq->avg_rt.util_avg);
}
+
+#else /* !CONFIG_SMP */
+static inline bool update_other_load_avgs(struct rq *rq) { return false; }
- #endif
+ #endif /* CONFIG_SMP */
#ifdef CONFIG_UCLAMP_TASK
-@@ -3481,4 +3604,24 @@ static inline void init_sched_mm_cid(struct task_struct *t) { }
- extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
- extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
+@@ -3609,6 +3750,8 @@ extern void set_load_weight(struct task_struct *p, bool update_load);
+ extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
+ extern void dequeue_task(struct rq *rq, struct task_struct *p, int flags);
+
++extern void check_class_changing(struct rq *rq, struct task_struct *p,
++ const struct sched_class *prev_class);
+ extern void check_class_changed(struct rq *rq, struct task_struct *p,
+ const struct sched_class *prev_class,
+ int oldprio);
+@@ -3629,4 +3772,24 @@ static inline void balance_callbacks(struct rq *rq, struct balance_callback *hea
+
+ #endif
+#ifdef CONFIG_SCHED_CLASS_EXT
+/*
@@ -8463,11 +9702,76 @@ index 38aeedd8a6cc..f952a4b99ead 100644
+#include "ext.h"
+
#endif /* _KERNEL_SCHED_SCHED_H */
+diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
+index ae1b42775ef9..4fa59c9f69ac 100644
+--- a/kernel/sched/syscalls.c
++++ b/kernel/sched/syscalls.c
+@@ -259,6 +259,25 @@ int sched_core_idle_cpu(int cpu)
+ #endif
+
+ #ifdef CONFIG_SMP
++/*
++ * Load avg and utilization metrics need to be updated periodically and before
++ * consumption. This function updates the metrics for all subsystems except for
++ * the fair class. @rq must be locked and have its clock updated.
++ */
++bool update_other_load_avgs(struct rq *rq)
++{
++ u64 now = rq_clock_pelt(rq);
++ const struct sched_class *curr_class = rq->curr->sched_class;
++ unsigned long hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
++
++ lockdep_assert_rq_held(rq);
++
++ return update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
++ update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
++ update_hw_load_avg(now, rq, hw_pressure) |
++ update_irq_load_avg(rq, 0);
++}
++
+ /*
+ * This function computes an effective utilization for the given CPU, to be
+ * used for frequency selection given the linear relation: f = u * f_max.
+@@ -695,6 +714,10 @@ int __sched_setscheduler(struct task_struct *p,
+ goto unlock;
+ }
+
++ retval = scx_check_setscheduler(p, policy);
++ if (retval)
++ goto unlock;
++
+ /*
+ * If not changing anything there's no need to proceed further,
+ * but store a possible modification of reset_on_fork.
+@@ -797,6 +820,7 @@ int __sched_setscheduler(struct task_struct *p,
+ __setscheduler_prio(p, newprio);
+ }
+ __setscheduler_uclamp(p, attr);
++ check_class_changing(rq, p, prev_class);
+
+ if (queued) {
+ /*
+@@ -1602,6 +1626,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
+ case SCHED_NORMAL:
+ case SCHED_BATCH:
+ case SCHED_IDLE:
++ case SCHED_EXT:
+ ret = 0;
+ break;
+ }
+@@ -1629,6 +1654,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
+ case SCHED_NORMAL:
+ case SCHED_BATCH:
+ case SCHED_IDLE:
++ case SCHED_EXT:
+ ret = 0;
+ }
+ return ret;
diff --git a/lib/dump_stack.c b/lib/dump_stack.c
-index 222c6d6c8281..9581ef4efec5 100644
+index 1a996fbbf50a..388da1aea14a 100644
--- a/lib/dump_stack.c
+++ b/lib/dump_stack.c
-@@ -68,6 +68,7 @@ void dump_stack_print_info(const char *log_lvl)
+@@ -73,6 +73,7 @@ void dump_stack_print_info(const char *log_lvl)
print_worker_info(log_lvl, current);
print_stop_info(log_lvl, current);
@@ -8479,7 +9783,7 @@ diff --git a/tools/Makefile b/tools/Makefile
index 276f5d0d53a4..278d24723b74 100644
--- a/tools/Makefile
+++ b/tools/Makefile
-@@ -28,6 +28,7 @@ include scripts/Makefile.include
+@@ -28,6 +28,7 @@ help:
@echo ' pci - PCI tools'
@echo ' perf - Linux performance measurement and analysis tool'
@echo ' selftests - various kernel selftests'
@@ -8497,7 +9801,7 @@ index 276f5d0d53a4..278d24723b74 100644
selftests: FORCE
$(call descend,testing/$@)
-@@ -184,6 +188,9 @@ install: acpi_install counter_install cpupower_install gpio_install \
+@@ -184,6 +188,9 @@ perf_clean:
$(Q)mkdir -p $(PERF_O) .
$(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir= clean
@@ -8526,7 +9830,7 @@ index 000000000000..d6264fe1c8cd
+build/
diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile
new file mode 100644
-index 000000000000..bf7e108f5ae1
+index 000000000000..ca3815e572d8
--- /dev/null
+++ b/tools/sched_ext/Makefile
@@ -0,0 +1,246 @@
@@ -8708,7 +10012,7 @@ index 000000000000..bf7e108f5ae1
+
+SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR)
+
-+c-sched-targets = scx_simple scx_qmap scx_central
++c-sched-targets = scx_simple scx_qmap scx_central scx_flatcg
+
+$(addprefix $(BINDIR)/,$(c-sched-targets)): \
+ $(BINDIR)/%: \
@@ -8778,10 +10082,10 @@ index 000000000000..bf7e108f5ae1
+.SECONDARY:
diff --git a/tools/sched_ext/README.md b/tools/sched_ext/README.md
new file mode 100644
-index 000000000000..8efe70cc4363
+index 000000000000..16a42e4060f6
--- /dev/null
+++ b/tools/sched_ext/README.md
-@@ -0,0 +1,258 @@
+@@ -0,0 +1,270 @@
+SCHED_EXT EXAMPLE SCHEDULERS
+============================
+
@@ -8976,6 +10280,18 @@ index 000000000000..8efe70cc4363
+infinite slices and no timer ticks allows the VM to avoid unnecessary expensive
+vmexits.
+
++## scx_flatcg
++
++A flattened cgroup hierarchy scheduler. This scheduler implements hierarchical
++weight-based cgroup CPU control by flattening the cgroup hierarchy into a single
++layer by compounding the active weight share at each level. The effect of this
++is a much more performant CPU controller, which does not need to descend down
++cgroup trees in order to properly compute a cgroup's share.
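++
++For example (a hypothetical two-level hierarchy), a child whose weight gives it
++half of its own level, under a parent holding a third of the level above, ends
++up with a flattened share of 1/2 * 1/3 = 1/6 of the whole machine.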
++
++Similar to scx_simple, in limited scenarios, this scheduler can perform
++reasonably well on single-socket systems with a unified L3 cache and show
++significantly lowered hierarchical scheduling overhead.
++
+
+# Troubleshooting
+
@@ -9059,10 +10375,10 @@ index 000000000000..ad7d139ce907
+ */
diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
new file mode 100644
-index 000000000000..20280df62857
+index 000000000000..225f61f9bfca
--- /dev/null
+++ b/tools/sched_ext/include/scx/common.bpf.h
-@@ -0,0 +1,401 @@
+@@ -0,0 +1,427 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
@@ -9072,7 +10388,13 @@ index 000000000000..20280df62857
+#ifndef __SCX_COMMON_BPF_H
+#define __SCX_COMMON_BPF_H
+
++#ifdef LSP
++#define __bpf__
++#include "../vmlinux/vmlinux.h"
++#else
+#include "vmlinux.h"
++#endif
++
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <asm-generic/errno.h>
@@ -9100,6 +10422,10 @@ index 000000000000..20280df62857
+u32 scx_bpf_dispatch_nr_slots(void) __ksym;
+void scx_bpf_dispatch_cancel(void) __ksym;
+bool scx_bpf_consume(u64 dsq_id) __ksym;
++void scx_bpf_dispatch_from_dsq_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym;
++void scx_bpf_dispatch_from_dsq_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym;
++bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
++bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
+u32 scx_bpf_reenqueue_local(void) __ksym;
+void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
+s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
@@ -9126,6 +10452,13 @@ index 000000000000..20280df62857
+bool scx_bpf_task_running(const struct task_struct *p) __ksym;
+s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
+struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym;
++struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym;
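++
++/*
++ * scx_bpf_task_cgroup() returns an acquired cgroup reference which must be
++ * dropped with bpf_cgroup_release() once the caller is done with it. A usage
++ * sketch (@p here stands for any task the calling operation may access):
++ *
++ *	struct cgroup *cgrp = scx_bpf_task_cgroup(p);
++ *	u64 cgid = cgrp->kn->id;
++ *
++ *	bpf_cgroup_release(cgrp);
++ */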
++
++/*
++ * Use the following as @it__iter when calling
++ * scx_bpf_dispatch[_vtime]_from_dsq() from within bpf_for_each() loops.
++ */
++#define BPF_FOR_EACH_ITER (&___it)
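++
++/*
++ * Minimal sketch of walking a user DSQ and moving the first task to the local
++ * DSQ; MY_DSQ_ID is a placeholder for a DSQ the scheduler created earlier:
++ *
++ *	struct task_struct *p;
++ *
++ *	bpf_for_each(scx_dsq, p, MY_DSQ_ID, 0) {
++ *		if (scx_bpf_dispatch_from_dsq(BPF_FOR_EACH_ITER, p,
++ *					      SCX_DSQ_LOCAL, 0))
++ *			break;
++ *	}
++ */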
+
+static inline __attribute__((format(printf, 1, 2)))
+void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {}
@@ -9363,6 +10696,15 @@ index 000000000000..20280df62857
+u32 bpf_cpumask_any_distribute(const struct cpumask *cpumask) __ksym;
+u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1,
+ const struct cpumask *src2) __ksym;
++u32 bpf_cpumask_weight(const struct cpumask *cpumask) __ksym;
++
++/*
++ * Access a cpumask in read-only mode (typically to check bits).
++ */
++const struct cpumask *cast_mask(struct bpf_cpumask *mask)
++{
++ return (const struct cpumask *)mask;
++}
+
+/* rcu */
+void bpf_rcu_read_lock(void) __ksym;
@@ -9547,10 +10889,10 @@ index 000000000000..5b0f90152152
+#endif /* __SCHED_EXT_COMMON_H */
diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h
new file mode 100644
-index 000000000000..3d2fe1208900
+index 000000000000..e5afe9efd3f3
--- /dev/null
+++ b/tools/sched_ext/include/scx/compat.bpf.h
-@@ -0,0 +1,28 @@
+@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
@@ -9568,6 +10910,25 @@ index 000000000000..3d2fe1208900
+ __ret; \
+})
+
++/* v6.12: 819513666966 ("sched_ext: Add cgroup support") */
++#define __COMPAT_scx_bpf_task_cgroup(p) \
++ (bpf_ksym_exists(scx_bpf_task_cgroup) ? \
++ scx_bpf_task_cgroup((p)) : NULL)
++
++/* v6.12: 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()") */
++#define __COMPAT_scx_bpf_dispatch_from_dsq_set_slice(it, slice) \
++ (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_slice) ? \
++ scx_bpf_dispatch_from_dsq_set_slice((it), (slice)) : (void)0)
++#define __COMPAT_scx_bpf_dispatch_from_dsq_set_vtime(it, vtime) \
++ (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_vtime) ? \
++ scx_bpf_dispatch_from_dsq_set_vtime((it), (vtime)) : (void)0)
++#define __COMPAT_scx_bpf_dispatch_from_dsq(it, p, dsq_id, enq_flags) \
++ (bpf_ksym_exists(scx_bpf_dispatch_from_dsq) ? \
++ scx_bpf_dispatch_from_dsq((it), (p), (dsq_id), (enq_flags)) : false)
++#define __COMPAT_scx_bpf_dispatch_vtime_from_dsq(it, p, dsq_id, enq_flags) \
++ (bpf_ksym_exists(scx_bpf_dispatch_vtime_from_dsq) ? \
++ scx_bpf_dispatch_vtime_from_dsq((it), (p), (dsq_id), (enq_flags)) : false)
++
+/*
+ * Define sched_ext_ops. This may be expanded to define multiple variants for
+ * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH().
@@ -9581,10 +10942,10 @@ index 000000000000..3d2fe1208900
+#endif /* __SCX_COMPAT_BPF_H */
diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h
new file mode 100644
-index 000000000000..1bf8eddf20c2
+index 000000000000..cc56ff9aa252
--- /dev/null
+++ b/tools/sched_ext/include/scx/compat.h
-@@ -0,0 +1,187 @@
+@@ -0,0 +1,186 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
@@ -9753,14 +11114,13 @@ index 000000000000..1bf8eddf20c2
+ * To maintain compatibility with older libbpf while avoiding trying to attach
+ * twice, disable the autoattach feature on newer libbpf.
+ */
-+/* BACKPORT - bpf_mpa__set_autoattach() not available yet, commented out */
-+/*#if LIBBPF_MAJOR_VERSION > 1 || \
++#if LIBBPF_MAJOR_VERSION > 1 || \
+ (LIBBPF_MAJOR_VERSION == 1 && LIBBPF_MINOR_VERSION >= 5)
+#define __SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name) \
+ bpf_map__set_autoattach((__skel)->maps.__ops_name, false)
-+#else*/
++#else
+#define __SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name) do {} while (0)
-+/*#endif*/
++#endif
+
+#define SCX_OPS_ATTACH(__skel, __ops_name, __scx_name) ({ \
+ struct bpf_link *__link; \
@@ -9774,10 +11134,10 @@ index 000000000000..1bf8eddf20c2
+#endif /* __SCX_COMPAT_H */
diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h
new file mode 100644
-index 000000000000..891693ee604e
+index 000000000000..8ce2734402e1
--- /dev/null
+++ b/tools/sched_ext/include/scx/user_exit_info.h
-@@ -0,0 +1,111 @@
+@@ -0,0 +1,115 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Define struct user_exit_info which is shared between BPF and userspace parts
@@ -9805,7 +11165,11 @@ index 000000000000..891693ee604e
+
+#ifdef __bpf__
+
++#ifdef LSP
++#include "../vmlinux/vmlinux.h"
++#else
+#include "vmlinux.h"
++#endif
+#include <bpf/bpf_core_read.h>
+
+#define UEI_DEFINE(__name) \
@@ -9891,7 +11255,7 @@ index 000000000000..891693ee604e
+#endif /* __USER_EXIT_INFO_H */
diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c
new file mode 100644
-index 000000000000..1d8fd570eaa7
+index 000000000000..8dd8eb73b6b8
--- /dev/null
+++ b/tools/sched_ext/scx_central.bpf.c
@@ -0,0 +1,361 @@
@@ -10095,7 +11459,7 @@ index 000000000000..1d8fd570eaa7
+
+ /* central's gimme is never set */
+ gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids);
-+ if (gimme && !*gimme)
++ if (!gimme || !*gimme)
+ continue;
+
+ if (dispatch_to_cpu(cpu))
@@ -10397,12 +11761,1271 @@ index 000000000000..21deea320bd7
+ goto restart;
+ return 0;
+}
+diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c
+new file mode 100644
+index 000000000000..b722baf6da4b
+--- /dev/null
++++ b/tools/sched_ext/scx_flatcg.bpf.c
+@@ -0,0 +1,957 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * A demo sched_ext flattened cgroup hierarchy scheduler. It implements
++ * hierarchical weight-based cgroup CPU control by flattening the cgroup
++ * hierarchy into a single layer by compounding the active weight share at each
++ * level. Consider the following hierarchy with weights in parentheses:
++ *
++ * R + A (100) + B (100)
++ * | \ C (100)
++ * \ D (200)
++ *
++ * Ignoring the root and threaded cgroups, only B, C and D can contain tasks.
++ * Let's say all three have runnable tasks. The total share that each of these
++ * three cgroups is entitled to can be calculated by compounding its share at
++ * each level.
++ *
++ * For example, B is competing against C and in that competition its share is
++ * 100/(100+100) == 1/2. At its parent level, A is competing against D and A's
++ * share in that competition is 100/(200+100) == 1/3. B's eventual share in the
++ * system can be calculated by multiplying the two shares, 1/2 * 1/3 == 1/6. C's
++ * eventual share is the same at 1/6. D is only competing at the top level and
++ * its share is 200/(100+200) == 2/3.
++ *
++ * So, instead of hierarchically scheduling level-by-level, we can consider it
++ * as B, C and D competing each other with respective share of 1/6, 1/6 and 2/3
++ * and keep updating the eventual shares as the cgroups' runnable states change.
++ *
++ * This flattening of hierarchy can bring a substantial performance gain when
++ * the cgroup hierarchy is nested multiple levels. In a simple benchmark using
++ * wrk[8] on apache serving a CGI script calculating sha1sum of a small file, it
++ * outperforms CFS by ~3% with CPU controller disabled and by ~10% with two
++ * apache instances competing with 2:1 weight ratio nested four level deep.
++ *
++ * However, the gain comes at the cost of not being able to properly handle
++ * thundering herd of cgroups. For example, if many cgroups which are nested
++ * behind a low priority parent cgroup wake up around the same time, they may be
++ * able to consume more CPU cycles than they are entitled to. In many use cases,
++ * this isn't a real concern especially given the performance gain. Also, there
++ * are ways to mitigate the problem further by e.g. introducing an extra
++ * scheduling layer on cgroup delegation boundaries.
++ *
++ * The scheduler first picks the cgroup to run and then schedules the tasks
++ * within it using nested weighted vtime scheduling by default. The
++ * cgroup-internal scheduling can be switched to FIFO with the -f option.
++ */
++#include <scx/common.bpf.h>
++#include "scx_flatcg.h"
++
++/*
++ * Maximum amount of retries to find a valid cgroup.
++ */
++enum {
++ FALLBACK_DSQ = 0,
++ CGROUP_MAX_RETRIES = 1024,
++};
++
++char _license[] SEC("license") = "GPL";
++
++const volatile u32 nr_cpus = 32; /* !0 for veristat, set during init */
++const volatile u64 cgrp_slice_ns = SCX_SLICE_DFL;
++const volatile bool fifo_sched;
++
++u64 cvtime_now;
++UEI_DEFINE(uei);
++
++struct {
++ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
++ __type(key, u32);
++ __type(value, u64);
++ __uint(max_entries, FCG_NR_STATS);
++} stats SEC(".maps");
++
++static void stat_inc(enum fcg_stat_idx idx)
++{
++ u32 idx_v = idx;
++
++ u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx_v);
++ if (cnt_p)
++ (*cnt_p)++;
++}
++
++struct fcg_cpu_ctx {
++ u64 cur_cgid;
++ u64 cur_at;
++};
++
++struct {
++ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
++ __type(key, u32);
++ __type(value, struct fcg_cpu_ctx);
++ __uint(max_entries, 1);
++} cpu_ctx SEC(".maps");
++
++struct {
++ __uint(type, BPF_MAP_TYPE_CGRP_STORAGE);
++ __uint(map_flags, BPF_F_NO_PREALLOC);
++ __type(key, int);
++ __type(value, struct fcg_cgrp_ctx);
++} cgrp_ctx SEC(".maps");
++
++struct cgv_node {
++ struct bpf_rb_node rb_node;
++ __u64 cvtime;
++ __u64 cgid;
++};
++
++private(CGV_TREE) struct bpf_spin_lock cgv_tree_lock;
++private(CGV_TREE) struct bpf_rb_root cgv_tree __contains(cgv_node, rb_node);
++
++struct cgv_node_stash {
++ struct cgv_node __kptr *node;
++};
++
++struct {
++ __uint(type, BPF_MAP_TYPE_HASH);
++ __uint(max_entries, 16384);
++ __type(key, __u64);
++ __type(value, struct cgv_node_stash);
++} cgv_node_stash SEC(".maps");
++
++struct fcg_task_ctx {
++ u64 bypassed_at;
++};
++
++struct {
++ __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
++ __uint(map_flags, BPF_F_NO_PREALLOC);
++ __type(key, int);
++ __type(value, struct fcg_task_ctx);
++} task_ctx SEC(".maps");
++
++/* gets inc'd on weight tree changes to expire the cached hweights */
++u64 hweight_gen = 1;
++
++static u64 div_round_up(u64 dividend, u64 divisor)
++{
++ return (dividend + divisor - 1) / divisor;
++}
++
++static bool vtime_before(u64 a, u64 b)
++{
++ return (s64)(a - b) < 0;
++}
++
++static bool cgv_node_less(struct bpf_rb_node *a, const struct bpf_rb_node *b)
++{
++ struct cgv_node *cgc_a, *cgc_b;
++
++ cgc_a = container_of(a, struct cgv_node, rb_node);
++ cgc_b = container_of(b, struct cgv_node, rb_node);
++
++ return cgc_a->cvtime < cgc_b->cvtime;
++}
++
++static struct fcg_cpu_ctx *find_cpu_ctx(void)
++{
++ struct fcg_cpu_ctx *cpuc;
++ u32 idx = 0;
++
++ cpuc = bpf_map_lookup_elem(&cpu_ctx, &idx);
++ if (!cpuc) {
++ scx_bpf_error("cpu_ctx lookup failed");
++ return NULL;
++ }
++ return cpuc;
++}
++
++static struct fcg_cgrp_ctx *find_cgrp_ctx(struct cgroup *cgrp)
++{
++ struct fcg_cgrp_ctx *cgc;
++
++ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0);
++ if (!cgc) {
++ scx_bpf_error("cgrp_ctx lookup failed for cgid %llu", cgrp->kn->id);
++ return NULL;
++ }
++ return cgc;
++}
++
++static struct fcg_cgrp_ctx *find_ancestor_cgrp_ctx(struct cgroup *cgrp, int level)
++{
++ struct fcg_cgrp_ctx *cgc;
++
++ cgrp = bpf_cgroup_ancestor(cgrp, level);
++ if (!cgrp) {
++ scx_bpf_error("ancestor cgroup lookup failed");
++ return NULL;
++ }
++
++ cgc = find_cgrp_ctx(cgrp);
++ if (!cgc)
++ scx_bpf_error("ancestor cgrp_ctx lookup failed");
++ bpf_cgroup_release(cgrp);
++ return cgc;
++}
++
++static void cgrp_refresh_hweight(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc)
++{
++ int level;
++
++ if (!cgc->nr_active) {
++ stat_inc(FCG_STAT_HWT_SKIP);
++ return;
++ }
++
++ if (cgc->hweight_gen == hweight_gen) {
++ stat_inc(FCG_STAT_HWT_CACHE);
++ return;
++ }
++
++ stat_inc(FCG_STAT_HWT_UPDATES);
++ bpf_for(level, 0, cgrp->level + 1) {
++ struct fcg_cgrp_ctx *cgc;
++ bool is_active;
++
++ cgc = find_ancestor_cgrp_ctx(cgrp, level);
++ if (!cgc)
++ break;
++
++ if (!level) {
++ cgc->hweight = FCG_HWEIGHT_ONE;
++ cgc->hweight_gen = hweight_gen;
++ } else {
++ struct fcg_cgrp_ctx *pcgc;
++
++ pcgc = find_ancestor_cgrp_ctx(cgrp, level - 1);
++ if (!pcgc)
++ break;
++
++ /*
++ * We can be opportunistic here and not grab the
++ * cgv_tree_lock and deal with the occasional races.
++ * However, hweight updates are already cached and
++ * relatively low-frequency. Let's just do the
++ * straightforward thing.
++ */
++ bpf_spin_lock(&cgv_tree_lock);
++ is_active = cgc->nr_active;
++ if (is_active) {
++ cgc->hweight_gen = pcgc->hweight_gen;
++ cgc->hweight =
++ div_round_up(pcgc->hweight * cgc->weight,
++ pcgc->child_weight_sum);
++ }
++ bpf_spin_unlock(&cgv_tree_lock);
++
++ if (!is_active) {
++ stat_inc(FCG_STAT_HWT_RACE);
++ break;
++ }
++ }
++ }
++}
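++
++/*
++ * Rough worked example of the propagation above: with FCG_HWEIGHT_ONE being
++ * 1 << 16, a cgroup with weight 100 whose parent has hweight FCG_HWEIGHT_ONE
++ * and child_weight_sum 300 gets hweight = div_round_up(65536 * 100, 300) =
++ * 21846, i.e. roughly a third of the parent's share.
++ */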
++
++static void cgrp_cap_budget(struct cgv_node *cgv_node, struct fcg_cgrp_ctx *cgc)
++{
++ u64 delta, cvtime, max_budget;
++
++ /*
++ * A node which is on the rbtree can't be pointed to from elsewhere yet
++ * and thus can't be updated and repositioned. Instead, we collect the
++ * vtime deltas separately and apply them asynchronously here.
++ */
++ delta = __sync_fetch_and_sub(&cgc->cvtime_delta, cgc->cvtime_delta);
++ cvtime = cgv_node->cvtime + delta;
++
++ /*
++ * Allow a cgroup to carry the maximum budget proportional to its
++ * hweight such that a full-hweight cgroup can immediately take up half
++ * of the CPUs at the most while staying at the front of the rbtree.
++ */
++ max_budget = (cgrp_slice_ns * nr_cpus * cgc->hweight) /
++ (2 * FCG_HWEIGHT_ONE);
++ if (vtime_before(cvtime, cvtime_now - max_budget))
++ cvtime = cvtime_now - max_budget;
++
++ cgv_node->cvtime = cvtime;
++}
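++
++/*
++ * Budget cap illustration, assuming cgrp_slice_ns is left at SCX_SLICE_DFL
++ * (nominally 20ms) and nr_cpus is 8: a full-hweight cgroup may lag cvtime_now
++ * by at most 20ms * 8 / 2 = 80ms worth of vtime, a half-hweight cgroup by at
++ * most 40ms, and so on.
++ */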
++
++static void cgrp_enqueued(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc)
++{
++ struct cgv_node_stash *stash;
++ struct cgv_node *cgv_node;
++ u64 cgid = cgrp->kn->id;
++
++ /* paired with cmpxchg in try_pick_next_cgroup() */
++ if (__sync_val_compare_and_swap(&cgc->queued, 0, 1)) {
++ stat_inc(FCG_STAT_ENQ_SKIP);
++ return;
++ }
++
++ stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid);
++ if (!stash) {
++ scx_bpf_error("cgv_node lookup failed for cgid %llu", cgid);
++ return;
++ }
++
++ /* NULL if the node is already on the rbtree */
++ cgv_node = bpf_kptr_xchg(&stash->node, NULL);
++ if (!cgv_node) {
++ stat_inc(FCG_STAT_ENQ_RACE);
++ return;
++ }
++
++ bpf_spin_lock(&cgv_tree_lock);
++ cgrp_cap_budget(cgv_node, cgc);
++ bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less);
++ bpf_spin_unlock(&cgv_tree_lock);
++}
++
++static void set_bypassed_at(struct task_struct *p, struct fcg_task_ctx *taskc)
++{
++ /*
++ * Tell fcg_stopping() that this bypassed the regular scheduling path
++ * and should be force charged to the cgroup. 0 is used to indicate that
++ * the task isn't bypassing, so if the current runtime is 0, go back by
++ * one nanosecond.
++ */
++ taskc->bypassed_at = p->se.sum_exec_runtime ?: (u64)-1;
++}
++
++s32 BPF_STRUCT_OPS(fcg_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
++{
++ struct fcg_task_ctx *taskc;
++ bool is_idle = false;
++ s32 cpu;
++
++ cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
++
++ taskc = bpf_task_storage_get(&task_ctx, p, 0, 0);
++ if (!taskc) {
++ scx_bpf_error("task_ctx lookup failed");
++ return cpu;
++ }
++
++ /*
++ * If select_cpu_dfl() is recommending local enqueue, the target CPU is
++ * idle. Follow it and charge the cgroup later in fcg_stopping() after
++ * the fact.
++ */
++ if (is_idle) {
++ set_bypassed_at(p, taskc);
++ stat_inc(FCG_STAT_LOCAL);
++ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
++ }
++
++ return cpu;
++}
++
++void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags)
++{
++ struct fcg_task_ctx *taskc;
++ struct cgroup *cgrp;
++ struct fcg_cgrp_ctx *cgc;
++
++ taskc = bpf_task_storage_get(&task_ctx, p, 0, 0);
++ if (!taskc) {
++ scx_bpf_error("task_ctx lookup failed");
++ return;
++ }
++
++ /*
++ * Use direct dispatching and force charging to deal with tasks with
++ * custom affinities so that we don't have to worry about per-cgroup
++ * dq's containing tasks that can't be executed from some CPUs.
++ */
++ if (p->nr_cpus_allowed != nr_cpus) {
++ set_bypassed_at(p, taskc);
++
++ /*
++ * The global dq is deprioritized as we don't want to let tasks
++ * boost themselves by constraining their cpumasks. The
++ * deprioritization is rather severe, so let's not apply it to
++ * per-cpu kernel threads. This is ham-fisted. We probably want to
++ * implement per-cgroup fallback dq's instead so that we have
++ * more control over when tasks with custom cpumasks get issued.
++ */
++ if (p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD)) {
++ stat_inc(FCG_STAT_LOCAL);
++ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
++ } else {
++ stat_inc(FCG_STAT_GLOBAL);
++ scx_bpf_dispatch(p, FALLBACK_DSQ, SCX_SLICE_DFL, enq_flags);
++ }
++ return;
++ }
++
++ cgrp = __COMPAT_scx_bpf_task_cgroup(p);
++ cgc = find_cgrp_ctx(cgrp);
++ if (!cgc)
++ goto out_release;
++
++ if (fifo_sched) {
++ scx_bpf_dispatch(p, cgrp->kn->id, SCX_SLICE_DFL, enq_flags);
++ } else {
++ u64 tvtime = p->scx.dsq_vtime;
++
++ /*
++ * Limit the amount of budget that an idling task can accumulate
++ * to one slice.
++ */
++ if (vtime_before(tvtime, cgc->tvtime_now - SCX_SLICE_DFL))
++ tvtime = cgc->tvtime_now - SCX_SLICE_DFL;
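++ /*
++ * E.g. a task that slept for a long time wakes up with a dsq_vtime far
++ * behind cgc->tvtime_now; the clamp above caps its head start to a
++ * single SCX_SLICE_DFL worth of vtime.
++ */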
++
++ scx_bpf_dispatch_vtime(p, cgrp->kn->id, SCX_SLICE_DFL,
++ tvtime, enq_flags);
++ }
++
++ cgrp_enqueued(cgrp, cgc);
++out_release:
++ bpf_cgroup_release(cgrp);
++}
++
++/*
++ * Walk the cgroup tree to update the active weight sums as tasks wake up and
++ * sleep. The weight sums are used as the base when calculating the proportion a
++ * given cgroup or task is entitled to at each level.
++ */
++static void update_active_weight_sums(struct cgroup *cgrp, bool runnable)
++{
++ struct fcg_cgrp_ctx *cgc;
++ bool updated = false;
++ int idx;
++
++ cgc = find_cgrp_ctx(cgrp);
++ if (!cgc)
++ return;
++
++ /*
++ * In most cases, a hot cgroup would have multiple threads going to
++ * sleep and waking up while the whole cgroup stays active. In leaf
++ * cgroups, ->nr_runnable, which is updated with __sync operations, gates
++ * ->nr_active updates, so that we don't have to grab the cgv_tree_lock
++ * repeatedly for a busy cgroup that stays active.
++ */
++ if (runnable) {
++ if (__sync_fetch_and_add(&cgc->nr_runnable, 1))
++ return;
++ stat_inc(FCG_STAT_ACT);
++ } else {
++ if (__sync_sub_and_fetch(&cgc->nr_runnable, 1))
++ return;
++ stat_inc(FCG_STAT_DEACT);
++ }
++
++ /*
++ * If @cgrp is becoming runnable, its hweight should be refreshed after
++ * it's added to the weight tree so that enqueue has the up-to-date
++ * value. If @cgrp is becoming quiescent, the hweight should be
++ * refreshed before it's removed from the weight tree so that the usage
++ * charging which happens afterwards has access to the latest value.
++ */
++ if (!runnable)
++ cgrp_refresh_hweight(cgrp, cgc);
++
++ /* propagate upwards */
++ bpf_for(idx, 0, cgrp->level) {
++ int level = cgrp->level - idx;
++ struct fcg_cgrp_ctx *cgc, *pcgc = NULL;
++ bool propagate = false;
++
++ cgc = find_ancestor_cgrp_ctx(cgrp, level);
++ if (!cgc)
++ break;
++ if (level) {
++ pcgc = find_ancestor_cgrp_ctx(cgrp, level - 1);
++ if (!pcgc)
++ break;
++ }
++
++ /*
++ * We need the propagation protected by a lock to synchronize
++ * against weight changes. There's no reason to drop the lock at
++ * each level, but the verifier doesn't allow function calls while a
++ * bpf_spin_lock is held.
++ */
++ bpf_spin_lock(&cgv_tree_lock);
++
++ if (runnable) {
++ if (!cgc->nr_active++) {
++ updated = true;
++ if (pcgc) {
++ propagate = true;
++ pcgc->child_weight_sum += cgc->weight;
++ }
++ }
++ } else {
++ if (!--cgc->nr_active) {
++ updated = true;
++ if (pcgc) {
++ propagate = true;
++ pcgc->child_weight_sum -= cgc->weight;
++ }
++ }
++ }
++
++ bpf_spin_unlock(&cgv_tree_lock);
++
++ if (!propagate)
++ break;
++ }
++
++ if (updated)
++ __sync_fetch_and_add(&hweight_gen, 1);
++
++ if (runnable)
++ cgrp_refresh_hweight(cgrp, cgc);
++}
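++
++/*
++ * For example, when the first task in an idle leaf cgroup wakes up, nr_active
++ * flips 0->1 at the leaf and at each ancestor that wasn't already active, with
++ * every such transition adding the child's weight to the parent's
++ * child_weight_sum; the upward walk stops at the first already-active
++ * ancestor.
++ */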
++
++void BPF_STRUCT_OPS(fcg_runnable, struct task_struct *p, u64 enq_flags)
++{
++ struct cgroup *cgrp;
++
++ cgrp = __COMPAT_scx_bpf_task_cgroup(p);
++ update_active_weight_sums(cgrp, true);
++ bpf_cgroup_release(cgrp);
++}
++
++void BPF_STRUCT_OPS(fcg_running, struct task_struct *p)
++{
++ struct cgroup *cgrp;
++ struct fcg_cgrp_ctx *cgc;
++
++ if (fifo_sched)
++ return;
++
++ cgrp = __COMPAT_scx_bpf_task_cgroup(p);
++ cgc = find_cgrp_ctx(cgrp);
++ if (cgc) {
++ /*
++ * @cgc->tvtime_now always progresses forward as tasks start
++ * executing. The test and update can be performed concurrently
++ * from multiple CPUs and are thus racy. Any error should be
++ * contained and temporary. Let's just live with it.
++ */
++ if (vtime_before(cgc->tvtime_now, p->scx.dsq_vtime))
++ cgc->tvtime_now = p->scx.dsq_vtime;
++ }
++ bpf_cgroup_release(cgrp);
++}
++
++void BPF_STRUCT_OPS(fcg_stopping, struct task_struct *p, bool runnable)
++{
++ struct fcg_task_ctx *taskc;
++ struct cgroup *cgrp;
++ struct fcg_cgrp_ctx *cgc;
++
++ /*
++ * Scale the execution time by the inverse of the weight and charge.
++ *
++ * Note that the default yield implementation yields by setting
++ * @p->scx.slice to zero and the following would treat the yielding task
++ * as if it has consumed all its slice. If this penalizes yielding tasks
++ * too much, determine the execution time by taking explicit timestamps
++ * instead of depending on @p->scx.slice.
++ */
++ if (!fifo_sched)
++ p->scx.dsq_vtime +=
++ (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight;
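++ /*
++ * With the default weight of 100, the charge above applies the consumed
++ * slice at face value; a weight-200 task is charged half as much vtime
++ * and thus receives roughly twice the share of its cgroup's CPU time.
++ */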
++
++ taskc = bpf_task_storage_get(&task_ctx, p, 0, 0);
++ if (!taskc) {
++ scx_bpf_error("task_ctx lookup failed");
++ return;
++ }
++
++ if (!taskc->bypassed_at)
++ return;
++
++ cgrp = __COMPAT_scx_bpf_task_cgroup(p);
++ cgc = find_cgrp_ctx(cgrp);
++ if (cgc) {
++ __sync_fetch_and_add(&cgc->cvtime_delta,
++ p->se.sum_exec_runtime - taskc->bypassed_at);
++ taskc->bypassed_at = 0;
++ }
++ bpf_cgroup_release(cgrp);
++}
++
++void BPF_STRUCT_OPS(fcg_quiescent, struct task_struct *p, u64 deq_flags)
++{
++ struct cgroup *cgrp;
++
++ cgrp = __COMPAT_scx_bpf_task_cgroup(p);
++ update_active_weight_sums(cgrp, false);
++ bpf_cgroup_release(cgrp);
++}
++
++void BPF_STRUCT_OPS(fcg_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
++{
++ struct fcg_cgrp_ctx *cgc, *pcgc = NULL;
++
++ cgc = find_cgrp_ctx(cgrp);
++ if (!cgc)
++ return;
++
++ if (cgrp->level) {
++ pcgc = find_ancestor_cgrp_ctx(cgrp, cgrp->level - 1);
++ if (!pcgc)
++ return;
++ }
++
++ bpf_spin_lock(&cgv_tree_lock);
++ if (pcgc && cgc->nr_active)
++ pcgc->child_weight_sum += (s64)weight - cgc->weight;
++ cgc->weight = weight;
++ bpf_spin_unlock(&cgv_tree_lock);
++}
++
++static bool try_pick_next_cgroup(u64 *cgidp)
++{
++ struct bpf_rb_node *rb_node;
++ struct cgv_node_stash *stash;
++ struct cgv_node *cgv_node;
++ struct fcg_cgrp_ctx *cgc;
++ struct cgroup *cgrp;
++ u64 cgid;
++
++ /* pop the front cgroup and wind cvtime_now accordingly */
++ bpf_spin_lock(&cgv_tree_lock);
++
++ rb_node = bpf_rbtree_first(&cgv_tree);
++ if (!rb_node) {
++ bpf_spin_unlock(&cgv_tree_lock);
++ stat_inc(FCG_STAT_PNC_NO_CGRP);
++ *cgidp = 0;
++ return true;
++ }
++
++ rb_node = bpf_rbtree_remove(&cgv_tree, rb_node);
++ bpf_spin_unlock(&cgv_tree_lock);
++
++ if (!rb_node) {
++ /*
++ * This should never happen. bpf_rbtree_first() was called
++ * above while the tree lock was held, so the node should
++ * always be present.
++ */
++ scx_bpf_error("node could not be removed");
++ return true;
++ }
++
++ cgv_node = container_of(rb_node, struct cgv_node, rb_node);
++ cgid = cgv_node->cgid;
++
++ if (vtime_before(cvtime_now, cgv_node->cvtime))
++ cvtime_now = cgv_node->cvtime;
++
++ /*
++ * If lookup fails, the cgroup's gone. Free and move on. See
++ * fcg_cgroup_exit().
++ */
++ cgrp = bpf_cgroup_from_id(cgid);
++ if (!cgrp) {
++ stat_inc(FCG_STAT_PNC_GONE);
++ goto out_free;
++ }
++
++ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0);
++ if (!cgc) {
++ bpf_cgroup_release(cgrp);
++ stat_inc(FCG_STAT_PNC_GONE);
++ goto out_free;
++ }
++
++ if (!scx_bpf_consume(cgid)) {
++ bpf_cgroup_release(cgrp);
++ stat_inc(FCG_STAT_PNC_EMPTY);
++ goto out_stash;
++ }
++
++ /*
++ * Successfully consumed from the cgroup. This will be our current
++ * cgroup for the new slice. Refresh its hweight.
++ */
++ cgrp_refresh_hweight(cgrp, cgc);
++
++ bpf_cgroup_release(cgrp);
++
++ /*
++ * As the cgroup may have more tasks, add it back to the rbtree. Note
++ * that here we charge the full slice upfront and then settle the
++ * difference later according to the actual consumption. This prevents a
++ * lowpri thundering herd from saturating the machine.
++ */
++ bpf_spin_lock(&cgv_tree_lock);
++ cgv_node->cvtime += cgrp_slice_ns * FCG_HWEIGHT_ONE / (cgc->hweight ?: 1);
++ cgrp_cap_budget(cgv_node, cgc);
++ bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less);
++ bpf_spin_unlock(&cgv_tree_lock);
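++ /*
++ * E.g. a cgroup running at half hweight is charged two full slices of
++ * cvtime above and is therefore picked roughly half as often as a
++ * full-hweight cgroup.
++ */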
++
++ *cgidp = cgid;
++ stat_inc(FCG_STAT_PNC_NEXT);
++ return true;
++
++out_stash:
++ stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid);
++ if (!stash) {
++ stat_inc(FCG_STAT_PNC_GONE);
++ goto out_free;
++ }
++
++ /*
++ * Paired with cmpxchg in cgrp_enqueued(). If the enqueue path sees the
++ * following 1->0 transition, it will queue the cgroup again. If it ran
++ * earlier, we'll see its task in the dq below and requeue the cgroup.
++ */
++ __sync_val_compare_and_swap(&cgc->queued, 1, 0);
++
++ if (scx_bpf_dsq_nr_queued(cgid)) {
++ bpf_spin_lock(&cgv_tree_lock);
++ bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less);
++ bpf_spin_unlock(&cgv_tree_lock);
++ stat_inc(FCG_STAT_PNC_RACE);
++ } else {
++ cgv_node = bpf_kptr_xchg(&stash->node, cgv_node);
++ if (cgv_node) {
++ scx_bpf_error("unexpected !NULL cgv_node stash");
++ goto out_free;
++ }
++ }
++
++ return false;
++
++out_free:
++ bpf_obj_drop(cgv_node);
++ return false;
++}
++
++void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev)
++{
++ struct fcg_cpu_ctx *cpuc;
++ struct fcg_cgrp_ctx *cgc;
++ struct cgroup *cgrp;
++ u64 now = bpf_ktime_get_ns();
++ bool picked_next = false;
++
++ cpuc = find_cpu_ctx();
++ if (!cpuc)
++ return;
++
++ if (!cpuc->cur_cgid)
++ goto pick_next_cgroup;
++
++ if (vtime_before(now, cpuc->cur_at + cgrp_slice_ns)) {
++ if (scx_bpf_consume(cpuc->cur_cgid)) {
++ stat_inc(FCG_STAT_CNS_KEEP);
++ return;
++ }
++ stat_inc(FCG_STAT_CNS_EMPTY);
++ } else {
++ stat_inc(FCG_STAT_CNS_EXPIRE);
++ }
++
++ /*
++ * The current cgroup is expiring. It was already charged a full slice.
++ * Calculate the actual usage and accumulate the delta.
++ */
++ cgrp = bpf_cgroup_from_id(cpuc->cur_cgid);
++ if (!cgrp) {
++ stat_inc(FCG_STAT_CNS_GONE);
++ goto pick_next_cgroup;
++ }
++
++ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0);
++ if (cgc) {
++ /*
++ * We want to update the vtime delta and then look for the next
++ * cgroup to execute but the latter needs to be done in a loop
++ * and we can't keep the lock held. Oh well...
++ */
++ bpf_spin_lock(&cgv_tree_lock);
++ __sync_fetch_and_add(&cgc->cvtime_delta,
++ (cpuc->cur_at + cgrp_slice_ns - now) *
++ FCG_HWEIGHT_ONE / (cgc->hweight ?: 1));
++ bpf_spin_unlock(&cgv_tree_lock);
++ } else {
++ stat_inc(FCG_STAT_CNS_GONE);
++ }
++
++ bpf_cgroup_release(cgrp);
++
++pick_next_cgroup:
++ cpuc->cur_at = now;
++
++ if (scx_bpf_consume(FALLBACK_DSQ)) {
++ cpuc->cur_cgid = 0;
++ return;
++ }
++
++ bpf_repeat(CGROUP_MAX_RETRIES) {
++ if (try_pick_next_cgroup(&cpuc->cur_cgid)) {
++ picked_next = true;
++ break;
++ }
++ }
++
++ /*
++ * This only happens if try_pick_next_cgroup() races against the enqueue
++ * path more than CGROUP_MAX_RETRIES times, which is extremely unlikely
++ * and most likely indicates an underlying bug. There shouldn't be
++ * any stall risk as the race is against enqueue.
++ */
++ if (!picked_next)
++ stat_inc(FCG_STAT_PNC_FAIL);
++}
++
++s32 BPF_STRUCT_OPS(fcg_init_task, struct task_struct *p,
++ struct scx_init_task_args *args)
++{
++ struct fcg_task_ctx *taskc;
++ struct fcg_cgrp_ctx *cgc;
++
++ /*
++ * @p is new. Let's ensure that its task_ctx is available. We can sleep
++ * in this function and the following will automatically use GFP_KERNEL.
++ */
++ taskc = bpf_task_storage_get(&task_ctx, p, 0,
++ BPF_LOCAL_STORAGE_GET_F_CREATE);
++ if (!taskc)
++ return -ENOMEM;
++
++ taskc->bypassed_at = 0;
++
++ if (!(cgc = find_cgrp_ctx(args->cgroup)))
++ return -ENOENT;
++
++ p->scx.dsq_vtime = cgc->tvtime_now;
++
++ return 0;
++}
++
++int BPF_STRUCT_OPS_SLEEPABLE(fcg_cgroup_init, struct cgroup *cgrp,
++ struct scx_cgroup_init_args *args)
++{
++ struct fcg_cgrp_ctx *cgc;
++ struct cgv_node *cgv_node;
++ struct cgv_node_stash empty_stash = {}, *stash;
++ u64 cgid = cgrp->kn->id;
++ int ret;
++
++ /*
++ * Technically incorrect as the cgroup ID is a full 64 bits while the dsq
++ * ID is 63 bits. Should not be a problem in practice and easy to spot in the
++ * unlikely case that it breaks.
++ */
++ ret = scx_bpf_create_dsq(cgid, -1);
++ if (ret)
++ return ret;
++
++ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0,
++ BPF_LOCAL_STORAGE_GET_F_CREATE);
++ if (!cgc) {
++ ret = -ENOMEM;
++ goto err_destroy_dsq;
++ }
++
++ cgc->weight = args->weight;
++ cgc->hweight = FCG_HWEIGHT_ONE;
++
++ ret = bpf_map_update_elem(&cgv_node_stash, &cgid, &empty_stash,
++ BPF_NOEXIST);
++ if (ret) {
++ if (ret != -ENOMEM)
++ scx_bpf_error("unexpected stash creation error (%d)",
++ ret);
++ goto err_destroy_dsq;
++ }
++
++ stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid);
++ if (!stash) {
++ scx_bpf_error("unexpected cgv_node stash lookup failure");
++ ret = -ENOENT;
++ goto err_destroy_dsq;
++ }
++
++ cgv_node = bpf_obj_new(struct cgv_node);
++ if (!cgv_node) {
++ ret = -ENOMEM;
++ goto err_del_cgv_node;
++ }
++
++ cgv_node->cgid = cgid;
++ cgv_node->cvtime = cvtime_now;
++
++ cgv_node = bpf_kptr_xchg(&stash->node, cgv_node);
++ if (cgv_node) {
++ scx_bpf_error("unexpected !NULL cgv_node stash");
++ ret = -EBUSY;
++ goto err_drop;
++ }
++
++ return 0;
++
++err_drop:
++ bpf_obj_drop(cgv_node);
++err_del_cgv_node:
++ bpf_map_delete_elem(&cgv_node_stash, &cgid);
++err_destroy_dsq:
++ scx_bpf_destroy_dsq(cgid);
++ return ret;
++}
++
++void BPF_STRUCT_OPS(fcg_cgroup_exit, struct cgroup *cgrp)
++{
++ u64 cgid = cgrp->kn->id;
++
++ /*
++ * For now, there's no way to find and remove the cgv_node if it's on the
++ * cgv_tree. Let's drain such nodes in the dispatch path as they get popped
++ * off the front of the tree.
++ */
++ bpf_map_delete_elem(&cgv_node_stash, &cgid);
++ scx_bpf_destroy_dsq(cgid);
++}
++
++void BPF_STRUCT_OPS(fcg_cgroup_move, struct task_struct *p,
++ struct cgroup *from, struct cgroup *to)
++{
++ struct fcg_cgrp_ctx *from_cgc, *to_cgc;
++ s64 vtime_delta;
++
++ /* find_cgrp_ctx() triggers scx_ops_error() on lookup failures */
++ if (!(from_cgc = find_cgrp_ctx(from)) || !(to_cgc = find_cgrp_ctx(to)))
++ return;
++
++ vtime_delta = p->scx.dsq_vtime - from_cgc->tvtime_now;
++ p->scx.dsq_vtime = to_cgc->tvtime_now + vtime_delta;
++}
++
++s32 BPF_STRUCT_OPS_SLEEPABLE(fcg_init)
++{
++ return scx_bpf_create_dsq(FALLBACK_DSQ, -1);
++}
++
++void BPF_STRUCT_OPS(fcg_exit, struct scx_exit_info *ei)
++{
++ UEI_RECORD(uei, ei);
++}
++
++SCX_OPS_DEFINE(flatcg_ops,
++ .select_cpu = (void *)fcg_select_cpu,
++ .enqueue = (void *)fcg_enqueue,
++ .dispatch = (void *)fcg_dispatch,
++ .runnable = (void *)fcg_runnable,
++ .running = (void *)fcg_running,
++ .stopping = (void *)fcg_stopping,
++ .quiescent = (void *)fcg_quiescent,
++ .init_task = (void *)fcg_init_task,
++ .cgroup_set_weight = (void *)fcg_cgroup_set_weight,
++ .cgroup_init = (void *)fcg_cgroup_init,
++ .cgroup_exit = (void *)fcg_cgroup_exit,
++ .cgroup_move = (void *)fcg_cgroup_move,
++ .init = (void *)fcg_init,
++ .exit = (void *)fcg_exit,
++ .flags = SCX_OPS_HAS_CGROUP_WEIGHT | SCX_OPS_ENQ_EXITING,
++ .name = "flatcg");
+diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c
+new file mode 100644
+index 000000000000..5d24ca9c29d9
+--- /dev/null
++++ b/tools/sched_ext/scx_flatcg.c
+@@ -0,0 +1,233 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates.
++ * Copyright (c) 2023 Tejun Heo <tj@kernel.org>
++ * Copyright (c) 2023 David Vernet <dvernet@meta.com>
++ */
++#include <stdio.h>
++#include <signal.h>
++#include <unistd.h>
++#include <libgen.h>
++#include <limits.h>
++#include <inttypes.h>
++#include <fcntl.h>
++#include <time.h>
++#include <bpf/bpf.h>
++#include <scx/common.h>
++#include "scx_flatcg.h"
++#include "scx_flatcg.bpf.skel.h"
++
++#ifndef FILEID_KERNFS
++#define FILEID_KERNFS 0xfe
++#endif
++
++const char help_fmt[] =
++"A flattened cgroup hierarchy sched_ext scheduler.\n"
++"\n"
++"See the top-level comment in .bpf.c for more details.\n"
++"\n"
++"Usage: %s [-s SLICE_US] [-i INTERVAL] [-f] [-v]\n"
++"\n"
++" -s SLICE_US Override slice duration\n"
++" -i INTERVAL Report interval\n"
++" -f Use FIFO scheduling instead of weighted vtime scheduling\n"
++" -v Print libbpf debug messages\n"
++" -h Display this help and exit\n";
++
++static bool verbose;
++static volatile int exit_req;
++
++static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
++{
++ if (level == LIBBPF_DEBUG && !verbose)
++ return 0;
++ return vfprintf(stderr, format, args);
++}
++
++static void sigint_handler(int dummy)
++{
++ exit_req = 1;
++}
++
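++/*
++ * Returns the busy fraction of all CPUs since the previous call, computed from
++ * the first line of /proc/stat as (total - idle) / total over the sampling
++ * interval, e.g. 0.25 when the machine was 25% busy.
++ */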
++static float read_cpu_util(__u64 *last_sum, __u64 *last_idle)
++{
++ FILE *fp;
++ char buf[4096];
++ char *line, *cur = NULL, *tok;
++ __u64 sum = 0, idle = 0;
++ __u64 delta_sum, delta_idle;
++ int idx;
++
++ fp = fopen("/proc/stat", "r");
++ if (!fp) {
++ perror("fopen(\"/proc/stat\")");
++ return 0.0;
++ }
++
++ if (!fgets(buf, sizeof(buf), fp)) {
++ perror("fgets(\"/proc/stat\")");
++ fclose(fp);
++ return 0.0;
++ }
++ fclose(fp);
++
++ line = buf;
++ for (idx = 0; (tok = strtok_r(line, " \n", &cur)); idx++) {
++ char *endp = NULL;
++ __u64 v;
++
++ if (idx == 0) {
++ line = NULL;
++ continue;
++ }
++ v = strtoull(tok, &endp, 0);
++ if (!endp || *endp != '\0') {
++ fprintf(stderr, "failed to parse %dth field of /proc/stat (\"%s\")\n",
++ idx, tok);
++ continue;
++ }
++ sum += v;
++ if (idx == 4)
++ idle = v;
++ }
++
++ delta_sum = sum - *last_sum;
++ delta_idle = idle - *last_idle;
++ *last_sum = sum;
++ *last_idle = idle;
++
++ return delta_sum ? (float)(delta_sum - delta_idle) / delta_sum : 0.0;
++}
++
++static void fcg_read_stats(struct scx_flatcg *skel, __u64 *stats)
++{
++ __u64 cnts[FCG_NR_STATS][skel->rodata->nr_cpus];
++ __u32 idx;
++
++ memset(stats, 0, sizeof(stats[0]) * FCG_NR_STATS);
++
++ for (idx = 0; idx < FCG_NR_STATS; idx++) {
++ int ret, cpu;
++
++ ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats),
++ &idx, cnts[idx]);
++ if (ret < 0)
++ continue;
++ for (cpu = 0; cpu < skel->rodata->nr_cpus; cpu++)
++ stats[idx] += cnts[idx][cpu];
++ }
++}
++
++int main(int argc, char **argv)
++{
++ struct scx_flatcg *skel;
++ struct bpf_link *link;
++ struct timespec intv_ts = { .tv_sec = 2, .tv_nsec = 0 };
++ bool dump_cgrps = false;
++ __u64 last_cpu_sum = 0, last_cpu_idle = 0;
++ __u64 last_stats[FCG_NR_STATS] = {};
++ unsigned long seq = 0;
++ __s32 opt;
++ __u64 ecode;
++
++ libbpf_set_print(libbpf_print_fn);
++ signal(SIGINT, sigint_handler);
++ signal(SIGTERM, sigint_handler);
++restart:
++ skel = SCX_OPS_OPEN(flatcg_ops, scx_flatcg);
++
++ skel->rodata->nr_cpus = libbpf_num_possible_cpus();
++
++ while ((opt = getopt(argc, argv, "s:i:dfvh")) != -1) {
++ double v;
++
++ switch (opt) {
++ case 's':
++ v = strtod(optarg, NULL);
++ skel->rodata->cgrp_slice_ns = v * 1000;
++ break;
++ case 'i':
++ v = strtod(optarg, NULL);
++ intv_ts.tv_sec = v;
++ intv_ts.tv_nsec = (v - (float)intv_ts.tv_sec) * 1000000000;
++ break;
++ case 'd':
++ dump_cgrps = true;
++ break;
++ case 'f':
++ skel->rodata->fifo_sched = true;
++ break;
++ case 'v':
++ verbose = true;
++ break;
++ case 'h':
++ default:
++ fprintf(stderr, help_fmt, basename(argv[0]));
++ return opt != 'h';
++ }
++ }
++
++ printf("slice=%.1lfms intv=%.1lfs dump_cgrps=%d",
++ (double)skel->rodata->cgrp_slice_ns / 1000000.0,
++ (double)intv_ts.tv_sec + (double)intv_ts.tv_nsec / 1000000000.0,
++ dump_cgrps);
++
++ SCX_OPS_LOAD(skel, flatcg_ops, scx_flatcg, uei);
++ link = SCX_OPS_ATTACH(skel, flatcg_ops, scx_flatcg);
++
++ while (!exit_req && !UEI_EXITED(skel, uei)) {
++ __u64 acc_stats[FCG_NR_STATS];
++ __u64 stats[FCG_NR_STATS];
++ float cpu_util;
++ int i;
++
++ cpu_util = read_cpu_util(&last_cpu_sum, &last_cpu_idle);
++
++ fcg_read_stats(skel, acc_stats);
++ for (i = 0; i < FCG_NR_STATS; i++)
++ stats[i] = acc_stats[i] - last_stats[i];
++
++ memcpy(last_stats, acc_stats, sizeof(acc_stats));
++
++ printf("\n[SEQ %6lu cpu=%5.1lf hweight_gen=%" PRIu64 "]\n",
++ seq++, cpu_util * 100.0, skel->data->hweight_gen);
++ printf(" act:%6llu deact:%6llu global:%6llu local:%6llu\n",
++ stats[FCG_STAT_ACT],
++ stats[FCG_STAT_DEACT],
++ stats[FCG_STAT_GLOBAL],
++ stats[FCG_STAT_LOCAL]);
++ printf("HWT cache:%6llu update:%6llu skip:%6llu race:%6llu\n",
++ stats[FCG_STAT_HWT_CACHE],
++ stats[FCG_STAT_HWT_UPDATES],
++ stats[FCG_STAT_HWT_SKIP],
++ stats[FCG_STAT_HWT_RACE]);
++ printf("ENQ skip:%6llu race:%6llu\n",
++ stats[FCG_STAT_ENQ_SKIP],
++ stats[FCG_STAT_ENQ_RACE]);
++ printf("CNS keep:%6llu expire:%6llu empty:%6llu gone:%6llu\n",
++ stats[FCG_STAT_CNS_KEEP],
++ stats[FCG_STAT_CNS_EXPIRE],
++ stats[FCG_STAT_CNS_EMPTY],
++ stats[FCG_STAT_CNS_GONE]);
++ printf("PNC next:%6llu empty:%6llu nocgrp:%6llu gone:%6llu race:%6llu fail:%6llu\n",
++ stats[FCG_STAT_PNC_NEXT],
++ stats[FCG_STAT_PNC_EMPTY],
++ stats[FCG_STAT_PNC_NO_CGRP],
++ stats[FCG_STAT_PNC_GONE],
++ stats[FCG_STAT_PNC_RACE],
++ stats[FCG_STAT_PNC_FAIL]);
++ printf("BAD remove:%6llu\n",
++ acc_stats[FCG_STAT_BAD_REMOVAL]);
++ fflush(stdout);
++
++ nanosleep(&intv_ts, NULL);
++ }
++
++ bpf_link__destroy(link);
++ ecode = UEI_REPORT(skel, uei);
++ scx_flatcg__destroy(skel);
++
++ if (UEI_ECODE_RESTART(ecode))
++ goto restart;
++ return 0;
++}
+diff --git a/tools/sched_ext/scx_flatcg.h b/tools/sched_ext/scx_flatcg.h
+new file mode 100644
+index 000000000000..6f2ea50acb1c
+--- /dev/null
++++ b/tools/sched_ext/scx_flatcg.h
+@@ -0,0 +1,51 @@
++#ifndef __SCX_EXAMPLE_FLATCG_H
++#define __SCX_EXAMPLE_FLATCG_H
++
++enum {
++ FCG_HWEIGHT_ONE = 1LLU << 16,
++};
++
++enum fcg_stat_idx {
++ FCG_STAT_ACT,
++ FCG_STAT_DEACT,
++ FCG_STAT_LOCAL,
++ FCG_STAT_GLOBAL,
++
++ FCG_STAT_HWT_UPDATES,
++ FCG_STAT_HWT_CACHE,
++ FCG_STAT_HWT_SKIP,
++ FCG_STAT_HWT_RACE,
++
++ FCG_STAT_ENQ_SKIP,
++ FCG_STAT_ENQ_RACE,
++
++ FCG_STAT_CNS_KEEP,
++ FCG_STAT_CNS_EXPIRE,
++ FCG_STAT_CNS_EMPTY,
++ FCG_STAT_CNS_GONE,
++
++ FCG_STAT_PNC_NO_CGRP,
++ FCG_STAT_PNC_NEXT,
++ FCG_STAT_PNC_EMPTY,
++ FCG_STAT_PNC_GONE,
++ FCG_STAT_PNC_RACE,
++ FCG_STAT_PNC_FAIL,
++
++ FCG_STAT_BAD_REMOVAL,
++
++ FCG_NR_STATS,
++};
++
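++/*
++ * Per-cgroup scheduling state used by scx_flatcg.bpf.c. Roughly: nr_runnable
++ * counts runnable tasks in the cgroup, nr_active tracks whether the cgroup
++ * and how many of its children are active, queued flags whether its cgv_node
++ * is on the rbtree, weight and hweight are the raw and hierarchical weights,
++ * child_weight_sum is the summed weight of active children, hweight_gen is
++ * the generation hweight was last computed for, cvtime_delta accumulates
++ * cgroup vtime to apply when its cgv_node is next repositioned, and
++ * tvtime_now is the cgroup's task vtime clock.
++ */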
++struct fcg_cgrp_ctx {
++ u32 nr_active;
++ u32 nr_runnable;
++ u32 queued;
++ u32 weight;
++ u32 hweight;
++ u64 child_weight_sum;
++ u64 hweight_gen;
++ s64 cvtime_delta;
++ u64 tvtime_now;
++};
++
++#endif /* __SCX_EXAMPLE_FLATCG_H */
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
new file mode 100644
-index 000000000000..892278f12dce
+index 000000000000..5b39bee9eb23
--- /dev/null
+++ b/tools/sched_ext/scx_qmap.bpf.c
-@@ -0,0 +1,706 @@
+@@ -0,0 +1,813 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * A simple five-level FIFO queue scheduler.
@@ -10432,6 +13055,8 @@ index 000000000000..892278f12dce
+enum consts {
+ ONE_SEC_IN_NS = 1000000000,
+ SHARED_DSQ = 0,
++ HIGHPRI_DSQ = 1,
++ HIGHPRI_WEIGHT = 8668, /* this is what -20 maps to */
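++ /* nice 0 is weight 100; -20 scales it by 88761/1024, i.e. ~86.7x */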
+};
+
+char _license[] SEC("license") = "GPL";
@@ -10441,10 +13066,12 @@ index 000000000000..892278f12dce
+const volatile u32 stall_kernel_nth;
+const volatile u32 dsp_inf_loop_after;
+const volatile u32 dsp_batch;
++const volatile bool highpri_boosting;
+const volatile bool print_shared_dsq;
+const volatile s32 disallow_tgid;
+const volatile bool suppress_dump;
+
++u64 nr_highpri_queued;
+u32 test_error_cnt;
+
+UEI_DEFINE(uei);
@@ -10500,6 +13127,7 @@ index 000000000000..892278f12dce
+/* Per-task scheduling context */
+struct task_ctx {
+ bool force_local; /* Dispatch directly to local_dsq */
++ bool highpri;
+ u64 core_sched_seq;
+};
+
@@ -10527,6 +13155,7 @@ index 000000000000..892278f12dce
+/* Statistics */
+u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued, nr_ddsp_from_enq;
+u64 nr_core_sched_execed;
++u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
+u32 cpuperf_min, cpuperf_avg, cpuperf_max;
+u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;
+
@@ -10545,17 +13174,25 @@ index 000000000000..892278f12dce
+ return -1;
+}
+
++static struct task_ctx *lookup_task_ctx(struct task_struct *p)
++{
++ struct task_ctx *tctx;
++
++ if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
++ scx_bpf_error("task_ctx lookup failed");
++ return NULL;
++ }
++ return tctx;
++}
++
+s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
+ s32 prev_cpu, u64 wake_flags)
+{
+ struct task_ctx *tctx;
+ s32 cpu;
+
-+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
-+ if (!tctx) {
-+ scx_bpf_error("task_ctx lookup failed");
++ if (!(tctx = lookup_task_ctx(p)))
+ return -ESRCH;
-+ }
+
+ cpu = pick_direct_dispatch_cpu(p, prev_cpu);
+
@@ -10602,11 +13239,8 @@ index 000000000000..892278f12dce
+ if (test_error_cnt && !--test_error_cnt)
+ scx_bpf_error("test triggering error");
+
-+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
-+ if (!tctx) {
-+ scx_bpf_error("task_ctx lookup failed");
++ if (!(tctx = lookup_task_ctx(p)))
+ return;
-+ }
+
+ /*
+ * All enqueued tasks must have their core_sched_seq updated for correct
@@ -10661,6 +13295,10 @@ index 000000000000..892278f12dce
+ return;
+ }
+
++ if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
++ tctx->highpri = true;
++ __sync_fetch_and_add(&nr_highpri_queued, 1);
++ }
+ __sync_fetch_and_add(&nr_enqueued, 1);
+}
+
@@ -10677,13 +13315,80 @@ index 000000000000..892278f12dce
+
+static void update_core_sched_head_seq(struct task_struct *p)
+{
-+ struct task_ctx *tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
+ int idx = weight_to_idx(p->scx.weight);
++ struct task_ctx *tctx;
+
-+ if (tctx)
++ if ((tctx = lookup_task_ctx(p)))
+ core_sched_head_seqs[idx] = tctx->core_sched_seq;
-+ else
-+ scx_bpf_error("task_ctx lookup failed");
++}
++
++/*
++ * To demonstrate the use of scx_bpf_dispatch_from_dsq(), implement a silly
++ * selective priority boosting mechanism by scanning SHARED_DSQ looking for
++ * highpri tasks, moving them to HIGHPRI_DSQ and then consuming them first. This
++ * makes a minor difference only when dsp_batch is larger than 1.
++ *
++ * scx_bpf_dispatch[_vtime]_from_dsq() are allowed both from ops.dispatch() and
++ * from non-rq-lock-holding BPF programs. As a demonstration, this function is
++ * called from qmap_dispatch() and monitor_timerfn().
++ */
++static bool dispatch_highpri(bool from_timer)
++{
++ struct task_struct *p;
++ s32 this_cpu = bpf_get_smp_processor_id();
++
++ /* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
++ bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
++ static u64 highpri_seq;
++ struct task_ctx *tctx;
++
++ if (!(tctx = lookup_task_ctx(p)))
++ return false;
++
++ if (tctx->highpri) {
++ /* exercise the set_*() and vtime interface too */
++ __COMPAT_scx_bpf_dispatch_from_dsq_set_slice(
++ BPF_FOR_EACH_ITER, slice_ns * 2);
++ __COMPAT_scx_bpf_dispatch_from_dsq_set_vtime(
++ BPF_FOR_EACH_ITER, highpri_seq++);
++ __COMPAT_scx_bpf_dispatch_vtime_from_dsq(
++ BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
++ }
++ }
++
++ /*
++ * Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU
++ * is found.
++ */
++ bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) {
++ bool dispatched = false;
++ s32 cpu;
++
++ if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr))
++ cpu = this_cpu;
++ else
++ cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);
++
++ if (__COMPAT_scx_bpf_dispatch_from_dsq(BPF_FOR_EACH_ITER, p,
++ SCX_DSQ_LOCAL_ON | cpu,
++ SCX_ENQ_PREEMPT)) {
++ if (cpu == this_cpu) {
++ dispatched = true;
++ __sync_fetch_and_add(&nr_expedited_local, 1);
++ } else {
++ __sync_fetch_and_add(&nr_expedited_remote, 1);
++ }
++ if (from_timer)
++ __sync_fetch_and_add(&nr_expedited_from_timer, 1);
++ } else {
++ __sync_fetch_and_add(&nr_expedited_lost, 1);
++ }
++
++ if (dispatched)
++ return true;
++ }
++
++ return false;
+}
+
+void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
@@ -10694,7 +13399,10 @@ index 000000000000..892278f12dce
+ void *fifo;
+ s32 i, pid;
+
-+ if (scx_bpf_consume(SHARED_DSQ))
++ if (dispatch_highpri(false))
++ return;
++
++ if (!nr_highpri_queued && scx_bpf_consume(SHARED_DSQ))
+ return;
+
+ if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
@@ -10731,6 +13439,8 @@ index 000000000000..892278f12dce
+
+ /* Dispatch or advance. */
+ bpf_repeat(BPF_MAX_LOOPS) {
++ struct task_ctx *tctx;
++
+ if (bpf_map_pop_elem(fifo, &pid))
+ break;
+
@@ -10738,13 +13448,25 @@ index 000000000000..892278f12dce
+ if (!p)
+ continue;
+
++ if (!(tctx = lookup_task_ctx(p))) {
++ bpf_task_release(p);
++ return;
++ }
++
++ if (tctx->highpri)
++ __sync_fetch_and_sub(&nr_highpri_queued, 1);
++
+ update_core_sched_head_seq(p);
+ __sync_fetch_and_add(&nr_dispatched, 1);
++
+ scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0);
+ bpf_task_release(p);
++
+ batch--;
+ cpuc->dsp_cnt--;
+ if (!batch || !scx_bpf_dispatch_nr_slots()) {
++ if (dispatch_highpri(false))
++ return;
+ scx_bpf_consume(SHARED_DSQ);
+ return;
+ }
@@ -11054,6 +13776,10 @@ index 000000000000..892278f12dce
+
+static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
+{
++ bpf_rcu_read_lock();
++ dispatch_highpri(true);
++ bpf_rcu_read_unlock();
++
+ monitor_cpuperf();
+
+ if (print_shared_dsq)
@@ -11075,6 +13801,10 @@ index 000000000000..892278f12dce
+ if (ret)
+ return ret;
+
++ ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1);
++ if (ret)
++ return ret;
++
+ timer = bpf_map_lookup_elem(&monitor_timer, &key);
+ if (!timer)
+ return -ESRCH;
@@ -11111,10 +13841,10 @@ index 000000000000..892278f12dce
+ .name = "qmap");
diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c
new file mode 100644
-index 000000000000..c9ca30d62b2b
+index 000000000000..ac45a02b4055
--- /dev/null
+++ b/tools/sched_ext/scx_qmap.c
-@@ -0,0 +1,144 @@
+@@ -0,0 +1,153 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
@@ -11146,6 +13876,7 @@ index 000000000000..c9ca30d62b2b
+" -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n"
+" -b COUNT Dispatch upto COUNT tasks together\n"
+" -P Print out DSQ content to trace_pipe every second, use with -b\n"
++" -H Boost nice -20 tasks in SHARED_DSQ, use with -b\n"
+" -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n"
+" -D LEN Set scx_exit_info.dump buffer length\n"
+" -S Suppress qmap-specific debug dump\n"
@@ -11180,7 +13911,7 @@ index 000000000000..c9ca30d62b2b
+
+ skel = SCX_OPS_OPEN(qmap_ops, scx_qmap);
+
-+ while ((opt = getopt(argc, argv, "s:e:t:T:l:b:Pd:D:Spvh")) != -1) {
++ while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PHd:D:Spvh")) != -1) {
+ switch (opt) {
+ case 's':
+ skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
@@ -11203,6 +13934,9 @@ index 000000000000..c9ca30d62b2b
+ case 'P':
+ skel->rodata->print_shared_dsq = true;
+ break;
++ case 'H':
++ skel->rodata->highpri_boosting = true;
++ break;
+ case 'd':
+ skel->rodata->disallow_tgid = strtol(optarg, NULL, 0);
+ if (skel->rodata->disallow_tgid < 0)
@@ -11238,6 +13972,11 @@ index 000000000000..c9ca30d62b2b
+ skel->bss->nr_reenqueued, skel->bss->nr_dequeued,
+ skel->bss->nr_core_sched_execed,
+ skel->bss->nr_ddsp_from_enq);
++ printf(" exp_local=%"PRIu64" exp_remote=%"PRIu64" exp_timer=%"PRIu64" exp_lost=%"PRIu64"\n",
++ skel->bss->nr_expedited_local,
++ skel->bss->nr_expedited_remote,
++ skel->bss->nr_expedited_from_timer,
++ skel->bss->nr_expedited_lost);
+ if (__COMPAT_has_ksym("scx_bpf_cpuperf_cur"))
+ printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n",
+ skel->bss->cpuperf_min,
@@ -11261,10 +14000,10 @@ index 000000000000..c9ca30d62b2b
+}
diff --git a/tools/sched_ext/scx_show_state.py b/tools/sched_ext/scx_show_state.py
new file mode 100644
-index 000000000000..d457d2a74e1e
+index 000000000000..8bc626ede1c4
--- /dev/null
+++ b/tools/sched_ext/scx_show_state.py
-@@ -0,0 +1,39 @@
+@@ -0,0 +1,40 @@
+#!/usr/bin/env drgn
+#
+# Copyright (C) 2024 Tejun Heo <tj@kernel.org>
@@ -11304,6 +14043,7 @@ index 000000000000..d457d2a74e1e
+print(f'enable_state : {ops_state_str(enable_state)} ({enable_state})')
+print(f'bypass_depth : {read_atomic("scx_ops_bypass_depth")}')
+print(f'nr_rejected : {read_atomic("scx_nr_rejected")}')
++print(f'enable_seq : {read_atomic("scx_enable_seq")}')
diff --git a/tools/sched_ext/scx_simple.bpf.c b/tools/sched_ext/scx_simple.bpf.c
new file mode 100644
index 000000000000..ed7e8d535fc5
@@ -13191,10 +15931,10 @@ index 000000000000..97d45f1e5597
+REGISTER_SCX_TEST(&init_enable_count)
diff --git a/tools/testing/selftests/sched_ext/maximal.bpf.c b/tools/testing/selftests/sched_ext/maximal.bpf.c
new file mode 100644
-index 000000000000..44612fdaf399
+index 000000000000..00bfa9cb95d3
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/maximal.bpf.c
-@@ -0,0 +1,132 @@
+@@ -0,0 +1,164 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * A scheduler with every callback defined.
@@ -13292,6 +16032,32 @@ index 000000000000..44612fdaf399
+void BPF_STRUCT_OPS(maximal_disable, struct task_struct *p)
+{}
+
++s32 BPF_STRUCT_OPS(maximal_cgroup_init, struct cgroup *cgrp,
++ struct scx_cgroup_init_args *args)
++{
++ return 0;
++}
++
++void BPF_STRUCT_OPS(maximal_cgroup_exit, struct cgroup *cgrp)
++{}
++
++s32 BPF_STRUCT_OPS(maximal_cgroup_prep_move, struct task_struct *p,
++ struct cgroup *from, struct cgroup *to)
++{
++ return 0;
++}
++
++void BPF_STRUCT_OPS(maximal_cgroup_move, struct task_struct *p,
++ struct cgroup *from, struct cgroup *to)
++{}
++
++void BPF_STRUCT_OPS(maximal_cgroup_cancel_move, struct task_struct *p,
++ struct cgroup *from, struct cgroup *to)
++{}
++
++void BPF_STRUCT_OPS(maximal_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
++{}
++
+s32 BPF_STRUCT_OPS_SLEEPABLE(maximal_init)
+{
+ return 0;
@@ -13323,6 +16089,12 @@ index 000000000000..44612fdaf399
+ .enable = maximal_enable,
+ .exit_task = maximal_exit_task,
+ .disable = maximal_disable,
++ .cgroup_init = maximal_cgroup_init,
++ .cgroup_exit = maximal_cgroup_exit,
++ .cgroup_prep_move = maximal_cgroup_prep_move,
++ .cgroup_move = maximal_cgroup_move,
++ .cgroup_cancel_move = maximal_cgroup_cancel_move,
++ .cgroup_set_weight = maximal_cgroup_set_weight,
+ .init = maximal_init,
+ .exit = maximal_exit,
+ .name = "maximal",
@@ -15130,3 +17902,6 @@ index 000000000000..bc13dfec1267
+int file_write_long(const char *path, long val);
+
+#endif // __SCX_TEST_H__
+--
+2.47.0.rc0
+