path: root/SOURCES/scx-kernel.patch
Diffstat (limited to 'SOURCES/scx-kernel.patch')
-rw-r--r--   SOURCES/scx-kernel.patch   4553
1 file changed, 3664 insertions, 889 deletions
diff --git a/SOURCES/scx-kernel.patch b/SOURCES/scx-kernel.patch
index 29e1f22..196bac1 100644
--- a/SOURCES/scx-kernel.patch
+++ b/SOURCES/scx-kernel.patch
@@ -1,3 +1,184 @@
+From 11276ed2c72c57624c1214e980efd24648be015c Mon Sep 17 00:00:00 2001
+From: Peter Jung <admin@ptr1337.dev>
+Date: Fri, 4 Oct 2024 17:12:13 +0200
+Subject: [PATCH] sched-ext
+
+Signed-off-by: Peter Jung <admin@ptr1337.dev>
+---
+ Documentation/scheduler/index.rst | 1 +
+ Documentation/scheduler/sched-ext.rst | 326 +
+ MAINTAINERS | 13 +
+ drivers/tty/sysrq.c | 1 +
+ include/asm-generic/vmlinux.lds.h | 1 +
+ include/linux/cgroup.h | 4 +-
+ include/linux/sched.h | 5 +
+ include/linux/sched/ext.h | 216 +
+ include/linux/sched/task.h | 8 +-
+ include/trace/events/sched_ext.h | 32 +
+ include/uapi/linux/sched.h | 1 +
+ init/Kconfig | 10 +
+ init/init_task.c | 12 +
+ kernel/Kconfig.preempt | 27 +-
+ kernel/fork.c | 17 +-
+ kernel/sched/build_policy.c | 11 +
+ kernel/sched/core.c | 288 +-
+ kernel/sched/cpufreq_schedutil.c | 50 +-
+ kernel/sched/debug.c | 3 +
+ kernel/sched/ext.c | 7262 +++++++++++++++++
+ kernel/sched/ext.h | 91 +
+ kernel/sched/fair.c | 21 +-
+ kernel/sched/idle.c | 2 +
+ kernel/sched/sched.h | 203 +-
+ kernel/sched/syscalls.c | 26 +
+ lib/dump_stack.c | 1 +
+ tools/Makefile | 10 +-
+ tools/sched_ext/.gitignore | 2 +
+ tools/sched_ext/Makefile | 246 +
+ tools/sched_ext/README.md | 270 +
+ .../sched_ext/include/bpf-compat/gnu/stubs.h | 11 +
+ tools/sched_ext/include/scx/common.bpf.h | 427 +
+ tools/sched_ext/include/scx/common.h | 75 +
+ tools/sched_ext/include/scx/compat.bpf.h | 47 +
+ tools/sched_ext/include/scx/compat.h | 186 +
+ tools/sched_ext/include/scx/user_exit_info.h | 115 +
+ tools/sched_ext/scx_central.bpf.c | 361 +
+ tools/sched_ext/scx_central.c | 135 +
+ tools/sched_ext/scx_flatcg.bpf.c | 957 +++
+ tools/sched_ext/scx_flatcg.c | 233 +
+ tools/sched_ext/scx_flatcg.h | 51 +
+ tools/sched_ext/scx_qmap.bpf.c | 813 ++
+ tools/sched_ext/scx_qmap.c | 153 +
+ tools/sched_ext/scx_show_state.py | 40 +
+ tools/sched_ext/scx_simple.bpf.c | 156 +
+ tools/sched_ext/scx_simple.c | 107 +
+ tools/testing/selftests/sched_ext/.gitignore | 6 +
+ tools/testing/selftests/sched_ext/Makefile | 218 +
+ tools/testing/selftests/sched_ext/config | 9 +
+ .../selftests/sched_ext/create_dsq.bpf.c | 58 +
+ .../testing/selftests/sched_ext/create_dsq.c | 57 +
+ .../sched_ext/ddsp_bogus_dsq_fail.bpf.c | 42 +
+ .../selftests/sched_ext/ddsp_bogus_dsq_fail.c | 57 +
+ .../sched_ext/ddsp_vtimelocal_fail.bpf.c | 39 +
+ .../sched_ext/ddsp_vtimelocal_fail.c | 56 +
+ .../selftests/sched_ext/dsp_local_on.bpf.c | 65 +
+ .../selftests/sched_ext/dsp_local_on.c | 58 +
+ .../sched_ext/enq_last_no_enq_fails.bpf.c | 21 +
+ .../sched_ext/enq_last_no_enq_fails.c | 60 +
+ .../sched_ext/enq_select_cpu_fails.bpf.c | 43 +
+ .../sched_ext/enq_select_cpu_fails.c | 61 +
+ tools/testing/selftests/sched_ext/exit.bpf.c | 84 +
+ tools/testing/selftests/sched_ext/exit.c | 55 +
+ tools/testing/selftests/sched_ext/exit_test.h | 20 +
+ .../testing/selftests/sched_ext/hotplug.bpf.c | 61 +
+ tools/testing/selftests/sched_ext/hotplug.c | 168 +
+ .../selftests/sched_ext/hotplug_test.h | 15 +
+ .../sched_ext/init_enable_count.bpf.c | 53 +
+ .../selftests/sched_ext/init_enable_count.c | 166 +
+ .../testing/selftests/sched_ext/maximal.bpf.c | 164 +
+ tools/testing/selftests/sched_ext/maximal.c | 51 +
+ .../selftests/sched_ext/maybe_null.bpf.c | 36 +
+ .../testing/selftests/sched_ext/maybe_null.c | 49 +
+ .../sched_ext/maybe_null_fail_dsp.bpf.c | 25 +
+ .../sched_ext/maybe_null_fail_yld.bpf.c | 28 +
+ .../testing/selftests/sched_ext/minimal.bpf.c | 21 +
+ tools/testing/selftests/sched_ext/minimal.c | 58 +
+ .../selftests/sched_ext/prog_run.bpf.c | 33 +
+ tools/testing/selftests/sched_ext/prog_run.c | 78 +
+ .../testing/selftests/sched_ext/reload_loop.c | 75 +
+ tools/testing/selftests/sched_ext/runner.c | 201 +
+ tools/testing/selftests/sched_ext/scx_test.h | 131 +
+ .../selftests/sched_ext/select_cpu_dfl.bpf.c | 40 +
+ .../selftests/sched_ext/select_cpu_dfl.c | 72 +
+ .../sched_ext/select_cpu_dfl_nodispatch.bpf.c | 89 +
+ .../sched_ext/select_cpu_dfl_nodispatch.c | 72 +
+ .../sched_ext/select_cpu_dispatch.bpf.c | 41 +
+ .../selftests/sched_ext/select_cpu_dispatch.c | 70 +
+ .../select_cpu_dispatch_bad_dsq.bpf.c | 37 +
+ .../sched_ext/select_cpu_dispatch_bad_dsq.c | 56 +
+ .../select_cpu_dispatch_dbl_dsp.bpf.c | 38 +
+ .../sched_ext/select_cpu_dispatch_dbl_dsp.c | 56 +
+ .../sched_ext/select_cpu_vtime.bpf.c | 92 +
+ .../selftests/sched_ext/select_cpu_vtime.c | 59 +
+ .../selftests/sched_ext/test_example.c | 49 +
+ tools/testing/selftests/sched_ext/util.c | 71 +
+ tools/testing/selftests/sched_ext/util.h | 13 +
+ 97 files changed, 16174 insertions(+), 130 deletions(-)
+ create mode 100644 Documentation/scheduler/sched-ext.rst
+ create mode 100644 include/linux/sched/ext.h
+ create mode 100644 include/trace/events/sched_ext.h
+ create mode 100644 kernel/sched/ext.c
+ create mode 100644 kernel/sched/ext.h
+ create mode 100644 tools/sched_ext/.gitignore
+ create mode 100644 tools/sched_ext/Makefile
+ create mode 100644 tools/sched_ext/README.md
+ create mode 100644 tools/sched_ext/include/bpf-compat/gnu/stubs.h
+ create mode 100644 tools/sched_ext/include/scx/common.bpf.h
+ create mode 100644 tools/sched_ext/include/scx/common.h
+ create mode 100644 tools/sched_ext/include/scx/compat.bpf.h
+ create mode 100644 tools/sched_ext/include/scx/compat.h
+ create mode 100644 tools/sched_ext/include/scx/user_exit_info.h
+ create mode 100644 tools/sched_ext/scx_central.bpf.c
+ create mode 100644 tools/sched_ext/scx_central.c
+ create mode 100644 tools/sched_ext/scx_flatcg.bpf.c
+ create mode 100644 tools/sched_ext/scx_flatcg.c
+ create mode 100644 tools/sched_ext/scx_flatcg.h
+ create mode 100644 tools/sched_ext/scx_qmap.bpf.c
+ create mode 100644 tools/sched_ext/scx_qmap.c
+ create mode 100644 tools/sched_ext/scx_show_state.py
+ create mode 100644 tools/sched_ext/scx_simple.bpf.c
+ create mode 100644 tools/sched_ext/scx_simple.c
+ create mode 100644 tools/testing/selftests/sched_ext/.gitignore
+ create mode 100644 tools/testing/selftests/sched_ext/Makefile
+ create mode 100644 tools/testing/selftests/sched_ext/config
+ create mode 100644 tools/testing/selftests/sched_ext/create_dsq.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/create_dsq.c
+ create mode 100644 tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c
+ create mode 100644 tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c
+ create mode 100644 tools/testing/selftests/sched_ext/dsp_local_on.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/dsp_local_on.c
+ create mode 100644 tools/testing/selftests/sched_ext/enq_last_no_enq_fails.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c
+ create mode 100644 tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/enq_select_cpu_fails.c
+ create mode 100644 tools/testing/selftests/sched_ext/exit.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/exit.c
+ create mode 100644 tools/testing/selftests/sched_ext/exit_test.h
+ create mode 100644 tools/testing/selftests/sched_ext/hotplug.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/hotplug.c
+ create mode 100644 tools/testing/selftests/sched_ext/hotplug_test.h
+ create mode 100644 tools/testing/selftests/sched_ext/init_enable_count.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/init_enable_count.c
+ create mode 100644 tools/testing/selftests/sched_ext/maximal.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/maximal.c
+ create mode 100644 tools/testing/selftests/sched_ext/maybe_null.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/maybe_null.c
+ create mode 100644 tools/testing/selftests/sched_ext/maybe_null_fail_dsp.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/maybe_null_fail_yld.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/minimal.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/minimal.c
+ create mode 100644 tools/testing/selftests/sched_ext/prog_run.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/prog_run.c
+ create mode 100644 tools/testing/selftests/sched_ext/reload_loop.c
+ create mode 100644 tools/testing/selftests/sched_ext/runner.c
+ create mode 100644 tools/testing/selftests/sched_ext/scx_test.h
+ create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl.c
+ create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c
+ create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch.c
+ create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c
+ create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c
+ create mode 100644 tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c
+ create mode 100644 tools/testing/selftests/sched_ext/select_cpu_vtime.c
+ create mode 100644 tools/testing/selftests/sched_ext/test_example.c
+ create mode 100644 tools/testing/selftests/sched_ext/util.c
+ create mode 100644 tools/testing/selftests/sched_ext/util.h
+
diff --git a/Documentation/scheduler/index.rst b/Documentation/scheduler/index.rst
index 43bd8a145b7a..0611dc3dda8e 100644
--- a/Documentation/scheduler/index.rst
@@ -12,10 +193,10 @@ index 43bd8a145b7a..0611dc3dda8e 100644
text_files
diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst
new file mode 100644
-index 000000000000..a707d2181a77
+index 000000000000..6c0d70e2e27d
--- /dev/null
+++ b/Documentation/scheduler/sched-ext.rst
-@@ -0,0 +1,316 @@
+@@ -0,0 +1,326 @@
+==========================
+Extensible Scheduler Class
+==========================
@@ -101,6 +282,15 @@ index 000000000000..a707d2181a77
+ # cat /sys/kernel/sched_ext/root/ops
+ simple
+
++You can check if any BPF scheduler has ever been loaded since boot by examining
++this monotonically incrementing counter (a value of zero indicates that no BPF
++scheduler has been loaded):
++
++.. code-block:: none
++
++ # cat /sys/kernel/sched_ext/enable_seq
++ 1
++
+``tools/sched_ext/scx_show_state.py`` is a drgn script which shows more
+detailed information:
+
@@ -114,6 +304,7 @@ index 000000000000..a707d2181a77
+ enable_state : enabled (2)
+ bypass_depth : 0
+ nr_rejected : 0
++ enable_seq : 1
+
+If ``CONFIG_SCHED_DEBUG`` is set, whether a given task is on sched_ext can
+be determined as follows:
@@ -333,10 +524,10 @@ index 000000000000..a707d2181a77
+possible, they are subject to change without warning between kernel
+versions.
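
To complement the documentation diff above, here is a rough sketch (not part of the patch) of a minimal BPF scheduler written against the ``tools/sched_ext/include/scx`` headers that this patch adds; the ``minimal_*`` names are illustrative, and ``tools/sched_ext/scx_simple.bpf.c`` is the real, complete example shipped by the patch:

.. code-block:: c

   /* Illustrative sketch, not part of the patch: queue every runnable task
    * on the shared global DSQ; CPUs fall back to consuming it when their
    * local DSQs are empty. */
   #include <scx/common.bpf.h>

   char _license[] SEC("license") = "GPL";

   void BPF_STRUCT_OPS(minimal_enqueue, struct task_struct *p, u64 enq_flags)
   {
           scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
   }

   SEC(".struct_ops.link")
   struct sched_ext_ops minimal_ops = {
           .enqueue = (void *)minimal_enqueue,
           .name    = "minimal",
   };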
diff --git a/MAINTAINERS b/MAINTAINERS
-index 958e935449e5..17d2679d291a 100644
+index c2a7363e86fe..bcfe36daf67a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
-@@ -19917,6 +19917,19 @@ F: include/linux/wait.h
+@@ -20364,6 +20364,19 @@ F: include/linux/wait.h
F: include/uapi/linux/sched.h
F: kernel/sched/
@@ -353,11 +544,11 @@ index 958e935449e5..17d2679d291a 100644
+F: tools/sched_ext/
+F: tools/testing/selftests/sched_ext
+
- SCSI LIBSAS SUBSYSTEM
- R: John Garry <john.g.garry@oracle.com>
- R: Jason Yan <yanaijie@huawei.com>
+ SCIOSENSE ENS160 MULTI-GAS SENSOR DRIVER
+ M: Gustavo Silva <gustavograzs@gmail.com>
+ S: Maintained
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
-index e5974b8239c9..167e877b8bef 100644
+index 14f8f00fdcf9..930b04e3d148 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -531,6 +531,7 @@ static const struct sysrq_key_op *sysrq_key_table[62] = {
@@ -369,7 +560,7 @@ index e5974b8239c9..167e877b8bef 100644
NULL, /* T */
NULL, /* U */
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
-index 70bf1004076b..a8417d31e348 100644
+index 1ae44793132a..19ec49a9179b 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -133,6 +133,7 @@
@@ -381,10 +572,10 @@ index 70bf1004076b..a8417d31e348 100644
__sched_class_lowest = .;
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
-index 2150ca60394b..3cdaec701600 100644
+index c60ba0ab1462..7139b33cb104 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
-@@ -29,8 +29,6 @@
+@@ -28,8 +28,6 @@
struct kernel_clone_args;
@@ -393,7 +584,7 @@ index 2150ca60394b..3cdaec701600 100644
/*
* All weight knobs on the default hierarchy should use the following min,
* default and max values. The default value is the logarithmic center of
-@@ -40,6 +38,8 @@ struct kernel_clone_args;
+@@ -39,6 +37,8 @@ struct kernel_clone_args;
#define CGROUP_WEIGHT_DFL 100
#define CGROUP_WEIGHT_MAX 10000
@@ -403,10 +594,10 @@ index 2150ca60394b..3cdaec701600 100644
CSS_TASK_ITER_PROCS = (1U << 0), /* walk only threadgroup leaders */
CSS_TASK_ITER_THREADED = (1U << 1), /* walk all threaded css_sets in the domain */
diff --git a/include/linux/sched.h b/include/linux/sched.h
-index 76214d7c819d..0f3a107bcd02 100644
+index f8d150343d42..5b4f78fe379d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
-@@ -80,6 +80,8 @@ struct task_group;
+@@ -82,6 +82,8 @@ struct task_group;
struct task_struct;
struct user_event_mm;
@@ -415,7 +606,7 @@ index 76214d7c819d..0f3a107bcd02 100644
/*
* Task state bitmask. NOTE! These bits are also
* encoded in fs/proc/array.c: get_task_state().
-@@ -802,6 +804,9 @@ struct task_struct {
+@@ -810,6 +812,9 @@ struct task_struct {
struct sched_rt_entity rt;
struct sched_dl_entity dl;
struct sched_dl_entity *dl_server;
@@ -427,10 +618,10 @@ index 76214d7c819d..0f3a107bcd02 100644
#ifdef CONFIG_SCHED_CORE
diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
new file mode 100644
-index 000000000000..26e1c33bc844
+index 000000000000..76166d3b14fc
--- /dev/null
+++ b/include/linux/sched/ext.h
-@@ -0,0 +1,204 @@
+@@ -0,0 +1,216 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
@@ -553,9 +744,17 @@ index 000000000000..26e1c33bc844
+ __SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
+};
+
++enum scx_dsq_lnode_flags {
++ SCX_DSQ_LNODE_ITER_CURSOR = 1 << 0,
++
++ /* high 16 bits can be for iter cursor flags */
++ __SCX_DSQ_LNODE_PRIV_SHIFT = 16,
++};
++
+struct scx_dsq_list_node {
+ struct list_head node;
-+ bool is_bpf_iter_cursor;
++ u32 flags;
++ u32 priv; /* can be used by iter cursor */
+};
+
+/*
@@ -612,15 +811,19 @@ index 000000000000..26e1c33bc844
+ * If set, reject future sched_setscheduler(2) calls updating the policy
+ * to %SCHED_EXT with -%EACCES.
+ *
-+ * If set from ops.init_task() and the task's policy is already
-+ * %SCHED_EXT, which can happen while the BPF scheduler is being loaded
-+ * or by inhering the parent's policy during fork, the task's policy is
-+ * rejected and forcefully reverted to %SCHED_NORMAL. The number of
-+ * such events are reported through /sys/kernel/debug/sched_ext::nr_rejected.
++ * Can be set from ops.init_task() while the BPF scheduler is being
++ * loaded (!scx_init_task_args->fork). If set and the task's policy is
++ * already %SCHED_EXT, the task's policy is rejected and forcefully
++ * reverted to %SCHED_NORMAL. The number of such events is reported
++ * through /sys/kernel/debug/sched_ext::nr_rejected. Setting this flag
++ * during fork is not allowed.
+ */
+ bool disallow; /* reject switching into SCX */
+
+ /* cold fields */
++#ifdef CONFIG_EXT_GROUP_SCHED
++ struct cgroup *cgrp_moving_from;
++#endif
+ /* must be the last field, see init_scx_entity() */
+ struct list_head tasks_node;
+};
@@ -636,7 +839,7 @@ index 000000000000..26e1c33bc844
+#endif /* CONFIG_SCHED_CLASS_EXT */
+#endif /* _LINUX_SCHED_EXT_H */
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
-index d362aacf9f89..4df2f9055587 100644
+index d362aacf9f89..0f2aeb37bbb0 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -63,7 +63,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev);
@@ -649,6 +852,18 @@ index d362aacf9f89..4df2f9055587 100644
extern void sched_post_fork(struct task_struct *p);
extern void sched_dead(struct task_struct *p);
+@@ -119,6 +120,11 @@ static inline struct task_struct *get_task_struct(struct task_struct *t)
+ return t;
+ }
+
++static inline struct task_struct *tryget_task_struct(struct task_struct *t)
++{
++ return refcount_inc_not_zero(&t->usage) ? t : NULL;
++}
++
+ extern void __put_task_struct(struct task_struct *t);
+ extern void __put_task_struct_rcu_cb(struct rcu_head *rhp);
+
diff --git a/include/trace/events/sched_ext.h b/include/trace/events/sched_ext.h
new file mode 100644
index 000000000000..fe19da7315a9
@@ -699,6 +914,37 @@ index 3bac0a8ceab2..359a14cc76a4 100644
/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
#define SCHED_RESET_ON_FORK 0x40000000
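
The ``include/uapi/linux/sched.h`` hunk above (one added line per the diffstat) is where the new ``SCHED_EXT`` policy number comes from. For orientation only (not part of the patch), a task can opt into that policy from userspace with an ordinary ``sched_setscheduler()`` call once a BPF scheduler is loaded, e.g. one registered with ``SCX_OPS_SWITCH_PARTIAL``; a minimal sketch, assuming the mainline policy value of 7:

.. code-block:: c

   /* Illustrative sketch, not part of the patch. */
   #include <sched.h>
   #include <stdio.h>

   #ifndef SCHED_EXT
   #define SCHED_EXT 7   /* assumed to match include/uapi/linux/sched.h */
   #endif

   int main(void)
   {
           struct sched_param param = { .sched_priority = 0 };

           if (sched_setscheduler(0, SCHED_EXT, &param)) {
                   perror("sched_setscheduler(SCHED_EXT)");
                   return 1;
           }
           printf("task is now handled by the loaded sched_ext scheduler\n");
           return 0;
   }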
+diff --git a/init/Kconfig b/init/Kconfig
+index 08a0d51afaae..e1a88d48d652 100644
+--- a/init/Kconfig
++++ b/init/Kconfig
+@@ -1028,9 +1028,13 @@ menuconfig CGROUP_SCHED
+ tasks.
+
+ if CGROUP_SCHED
++config GROUP_SCHED_WEIGHT
++ def_bool n
++
+ config FAIR_GROUP_SCHED
+ bool "Group scheduling for SCHED_OTHER"
+ depends on CGROUP_SCHED
++ select GROUP_SCHED_WEIGHT
+ default CGROUP_SCHED
+
+ config CFS_BANDWIDTH
+@@ -1055,6 +1059,12 @@ config RT_GROUP_SCHED
+ realtime bandwidth for them.
+ See Documentation/scheduler/sched-rt-group.rst for more information.
+
++config EXT_GROUP_SCHED
++ bool
++ depends on SCHED_CLASS_EXT && CGROUP_SCHED
++ select GROUP_SCHED_WEIGHT
++ default y
++
+ endif #CGROUP_SCHED
+
+ config SCHED_MM_CID
diff --git a/init/init_task.c b/init/init_task.c
index eeb110c65fe2..e222722e790b 100644
--- a/init/init_task.c
@@ -730,10 +976,10 @@ index eeb110c65fe2..e222722e790b 100644
.ptraced = LIST_HEAD_INIT(init_task.ptraced),
.ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry),
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
-index c2f1fd95a821..f3d140c3acc1 100644
+index c2f1fd95a821..fe782cd77388 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
-@@ -133,4 +133,28 @@ config SCHED_CORE
+@@ -133,4 +133,29 @@ config SCHED_CORE
which is the likely usage by Linux distributions, there should
be no measurable impact on performance.
@@ -741,6 +987,7 @@ index c2f1fd95a821..f3d140c3acc1 100644
+config SCHED_CLASS_EXT
+ bool "Extensible Scheduling Class"
+ depends on BPF_SYSCALL && BPF_JIT && DEBUG_INFO_BTF
++ select STACKTRACE if STACKTRACE_SUPPORT
+ help
+ This option enables a new scheduler class sched_ext (SCX), which
+ allows scheduling policies to be implemented as BPF programs to
@@ -764,7 +1011,7 @@ index c2f1fd95a821..f3d140c3acc1 100644
+ Documentation/scheduler/sched-ext.rst
+ https://github.com/sched-ext/scx
diff --git a/kernel/fork.c b/kernel/fork.c
-index 99076dbe27d8..741d962db0d9 100644
+index 238695afc630..69a0a7210060 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -23,6 +23,7 @@
@@ -775,7 +1022,7 @@ index 99076dbe27d8..741d962db0d9 100644
#include <linux/seq_file.h>
#include <linux/rtmutex.h>
#include <linux/init.h>
-@@ -971,6 +972,7 @@ void __put_task_struct(struct task_struct *tsk)
+@@ -973,6 +974,7 @@ void __put_task_struct(struct task_struct *tsk)
WARN_ON(refcount_read(&tsk->usage));
WARN_ON(tsk == current);
@@ -783,7 +1030,7 @@ index 99076dbe27d8..741d962db0d9 100644
io_uring_free(tsk);
cgroup_free(tsk);
task_numa_free(tsk, true);
-@@ -2363,7 +2365,7 @@ __latent_entropy struct task_struct *copy_process(
+@@ -2355,7 +2357,7 @@ __latent_entropy struct task_struct *copy_process(
retval = perf_event_init_task(p, clone_flags);
if (retval)
@@ -792,7 +1039,7 @@ index 99076dbe27d8..741d962db0d9 100644
retval = audit_alloc(p);
if (retval)
goto bad_fork_cleanup_perf;
-@@ -2496,7 +2498,9 @@ __latent_entropy struct task_struct *copy_process(
+@@ -2488,7 +2490,9 @@ __latent_entropy struct task_struct *copy_process(
* cgroup specific, it unconditionally needs to place the task on a
* runqueue.
*/
@@ -803,7 +1050,7 @@ index 99076dbe27d8..741d962db0d9 100644
/*
* From this point on we must avoid any synchronous user-space
-@@ -2542,13 +2546,13 @@ __latent_entropy struct task_struct *copy_process(
+@@ -2534,13 +2538,13 @@ __latent_entropy struct task_struct *copy_process(
/* Don't start children in a dying pid namespace */
if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
retval = -ENOMEM;
@@ -819,7 +1066,7 @@ index 99076dbe27d8..741d962db0d9 100644
}
/* No more failure paths after this point. */
-@@ -2622,10 +2626,11 @@ __latent_entropy struct task_struct *copy_process(
+@@ -2614,10 +2618,11 @@ __latent_entropy struct task_struct *copy_process(
return p;
@@ -832,7 +1079,7 @@ index 99076dbe27d8..741d962db0d9 100644
cgroup_cancel_fork(p, args);
bad_fork_put_pidfd:
if (clone_flags & CLONE_PIDFD) {
-@@ -2664,6 +2669,8 @@ __latent_entropy struct task_struct *copy_process(
+@@ -2656,6 +2661,8 @@ __latent_entropy struct task_struct *copy_process(
audit_free(p);
bad_fork_cleanup_perf:
perf_event_free_task(p);
@@ -842,7 +1089,7 @@ index 99076dbe27d8..741d962db0d9 100644
lockdep_free_task(p);
#ifdef CONFIG_NUMA
diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
-index d9dc9ab3773f..e7d539bb721e 100644
+index 39c315182b35..fae1f5c921eb 100644
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@@ -16,18 +16,25 @@
@@ -871,18 +1118,20 @@ index d9dc9ab3773f..e7d539bb721e 100644
#include <uapi/linux/sched/types.h>
-@@ -52,3 +59,6 @@
+@@ -52,4 +59,8 @@
#include "cputime.c"
#include "deadline.c"
+#ifdef CONFIG_SCHED_CLASS_EXT
+# include "ext.c"
+#endif
++
+ #include "syscalls.c"
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
-index ebf21373f663..fb6276f74ee6 100644
+index f3951e4a55e5..c792a6feb7a9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
-@@ -168,7 +168,10 @@ static inline int __task_prio(const struct task_struct *p)
+@@ -169,7 +169,10 @@ static inline int __task_prio(const struct task_struct *p)
if (p->sched_class == &idle_sched_class)
return MAX_RT_PRIO + NICE_WIDTH; /* 140 */
@@ -894,7 +1143,7 @@ index ebf21373f663..fb6276f74ee6 100644
}
/*
-@@ -197,6 +200,11 @@ static inline bool prio_less(const struct task_struct *a,
+@@ -198,6 +201,11 @@ static inline bool prio_less(const struct task_struct *a,
if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
return cfs_prio_less(a, b, in_fi);
@@ -906,7 +1155,7 @@ index ebf21373f663..fb6276f74ee6 100644
return false;
}
-@@ -1254,11 +1262,14 @@ bool sched_can_stop_tick(struct rq *rq)
+@@ -1255,11 +1263,14 @@ bool sched_can_stop_tick(struct rq *rq)
return true;
/*
@@ -918,14 +1167,14 @@ index ebf21373f663..fb6276f74ee6 100644
+ * involuntary preemption. For SCX, ask.
*/
- if (rq->nr_running > 1)
-+ if (!scx_switched_all() && rq->nr_running > 1)
++ if (scx_enabled() && !scx_can_stop_tick(rq))
+ return false;
+
-+ if (scx_enabled() && !scx_can_stop_tick(rq))
++ if (rq->cfs.nr_running > 1)
return false;
/*
-@@ -1340,8 +1351,8 @@ static void set_load_weight(struct task_struct *p, bool update_load)
+@@ -1341,8 +1352,8 @@ void set_load_weight(struct task_struct *p, bool update_load)
* SCHED_OTHER tasks have to update their load when changing their
* weight
*/
@@ -936,7 +1185,7 @@ index ebf21373f663..fb6276f74ee6 100644
else
p->se.load = lw;
}
-@@ -2210,6 +2221,17 @@ inline int task_curr(const struct task_struct *p)
+@@ -2031,6 +2042,17 @@ inline int task_curr(const struct task_struct *p)
return cpu_curr(task_cpu(p)) == p;
}
@@ -954,20 +1203,25 @@ index ebf21373f663..fb6276f74ee6 100644
/*
* switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
* use the balance_callback list if you want balancing.
-@@ -2217,9 +2239,9 @@ inline int task_curr(const struct task_struct *p)
- * this means any call to check_class_changed() must be followed by a call to
- * balance_callback().
- */
--static inline void check_class_changed(struct rq *rq, struct task_struct *p,
-- const struct sched_class *prev_class,
-- int oldprio)
-+void check_class_changed(struct rq *rq, struct task_struct *p,
-+ const struct sched_class *prev_class,
-+ int oldprio)
+@@ -2289,7 +2311,7 @@ static inline bool rq_has_pinned_tasks(struct rq *rq)
+ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
{
- if (prev_class != p->sched_class) {
- if (prev_class->switched_from)
-@@ -3982,6 +4004,15 @@ bool cpus_share_resources(int this_cpu, int that_cpu)
+ /* When not in the task's cpumask, no point in looking further. */
+- if (!cpumask_test_cpu(cpu, p->cpus_ptr))
++ if (!task_allowed_on_cpu(p, cpu))
+ return false;
+
+ /* migrate_disabled() must be allowed to finish. */
+@@ -2298,7 +2320,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
+
+ /* Non kernel threads are not allowed during either online or offline. */
+ if (!(p->flags & PF_KTHREAD))
+- return cpu_active(cpu) && task_cpu_possible(cpu, p);
++ return cpu_active(cpu);
+
+ /* KTHREAD_IS_PER_CPU is always allowed. */
+ if (kthread_is_per_cpu(p))
+@@ -3775,6 +3797,15 @@ bool cpus_share_resources(int this_cpu, int that_cpu)
static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
{
@@ -983,7 +1237,7 @@ index ebf21373f663..fb6276f74ee6 100644
/*
* Do not complicate things with the async wake_list while the CPU is
* in hotplug state.
-@@ -4549,6 +4580,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
+@@ -4342,6 +4373,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->rt.on_rq = 0;
p->rt.on_list = 0;
@@ -994,7 +1248,7 @@ index ebf21373f663..fb6276f74ee6 100644
#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif
-@@ -4789,10 +4824,18 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
+@@ -4582,10 +4617,18 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
if (dl_prio(p->prio))
return -EAGAIN;
@@ -1015,7 +1269,7 @@ index ebf21373f663..fb6276f74ee6 100644
init_entity_runnable_average(&p->se);
-@@ -4812,7 +4855,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
+@@ -4605,7 +4648,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
return 0;
}
@@ -1024,7 +1278,7 @@ index ebf21373f663..fb6276f74ee6 100644
{
unsigned long flags;
-@@ -4974,6 +4974,13 @@
+@@ -4632,11 +4675,19 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
if (p->sched_class->task_fork)
p->sched_class->task_fork(p);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -1038,15 +1292,13 @@ index ebf21373f663..fb6276f74ee6 100644
}
void sched_post_fork(struct task_struct *p)
-@@ -4982,6 +4989,7 @@
- sched_post_fork_bore(p);
- #endif // CONFIG_SCHED_BORE
+ {
uclamp_post_fork(p);
+ scx_post_fork(p);
}
unsigned long to_ratio(u64 period, u64 runtime)
-@@ -5685,6 +5736,7 @@ void sched_tick(void)
+@@ -5469,6 +5520,7 @@ void sched_tick(void)
calc_global_load_tick(rq);
sched_core_tick(rq);
task_tick_mm_cid(rq, curr);
@@ -1054,7 +1306,7 @@ index ebf21373f663..fb6276f74ee6 100644
rq_unlock(rq, &rf);
-@@ -5697,8 +5749,10 @@ void sched_tick(void)
+@@ -5481,8 +5533,10 @@ void sched_tick(void)
wq_worker_tick(curr);
#ifdef CONFIG_SMP
@@ -1067,10 +1319,11 @@ index ebf21373f663..fb6276f74ee6 100644
#endif
}
-@@ -5989,7 +6043,19 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
+@@ -5772,8 +5826,19 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
+ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
struct rq_flags *rf)
{
- #ifdef CONFIG_SMP
+-#ifdef CONFIG_SMP
+ const struct sched_class *start_class = prev->sched_class;
const struct sched_class *class;
+
@@ -1080,23 +1333,28 @@ index ebf21373f663..fb6276f74ee6 100644
+ * when waking up from SCHED_IDLE. If @start_class is below SCX, start
+ * from SCX instead.
+ */
-+ if (sched_class_above(&ext_sched_class, start_class))
++ if (scx_enabled() && sched_class_above(&ext_sched_class, start_class))
+ start_class = &ext_sched_class;
+#endif
+
/*
* We must do the balancing pass before put_prev_task(), such
* that when we release the rq->lock the task is in the same
-@@ -5998,7 +6064,7 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
+@@ -5782,11 +5847,10 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
* We can terminate the balance pass as soon as we know there is
* a runnable task of @class priority or higher.
*/
- for_class_range(class, prev->sched_class, &idle_sched_class) {
+- if (class->balance(rq, prev, rf))
+ for_active_class_range(class, start_class, &idle_sched_class) {
- if (class->balance(rq, prev, rf))
++ if (class->balance && class->balance(rq, prev, rf))
break;
}
-@@ -6016,6 +6082,9 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+-#endif
+
+ put_prev_task(rq, prev);
+ }
+@@ -5800,6 +5864,9 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
const struct sched_class *class;
struct task_struct *p;
@@ -1106,7 +1364,7 @@ index ebf21373f663..fb6276f74ee6 100644
/*
* Optimization: we know that if all tasks are in the fair class we can
* call that function directly, but only if the @prev task wasn't of a
-@@ -6056,10 +6125,15 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+@@ -5840,10 +5907,15 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
if (prev->dl_server)
prev->dl_server = NULL;
@@ -1124,7 +1382,7 @@ index ebf21373f663..fb6276f74ee6 100644
}
BUG(); /* The idle class should always have a runnable task. */
-@@ -6089,7 +6163,7 @@ static inline struct task_struct *pick_task(struct rq *rq)
+@@ -5873,7 +5945,7 @@ static inline struct task_struct *pick_task(struct rq *rq)
const struct sched_class *class;
struct task_struct *p;
@@ -1133,14 +1391,7 @@ index ebf21373f663..fb6276f74ee6 100644
p = class->pick_task(rq);
if (p)
return p;
-@@ -7080,12 +7154,16 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag
- }
- EXPORT_SYMBOL(default_wake_function);
-
--static void __setscheduler_prio(struct task_struct *p, int prio)
-+void __setscheduler_prio(struct task_struct *p, int prio)
- {
- if (dl_prio(prio))
+@@ -6870,6 +6942,10 @@ void __setscheduler_prio(struct task_struct *p, int prio)
p->sched_class = &dl_sched_class;
else if (rt_prio(prio))
p->sched_class = &rt_sched_class;
@@ -1151,7 +1402,7 @@ index ebf21373f663..fb6276f74ee6 100644
else
p->sched_class = &fair_sched_class;
-@@ -7246,6 +7324,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
+@@ -7015,6 +7091,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
}
__setscheduler_prio(p, prio);
@@ -1159,68 +1410,7 @@ index ebf21373f663..fb6276f74ee6 100644
if (queued)
enqueue_task(rq, p, queue_flag);
-@@ -7467,6 +7546,25 @@ int sched_core_idle_cpu(int cpu)
- #endif
-
- #ifdef CONFIG_SMP
-+/*
-+ * Load avg and utiliztion metrics need to be updated periodically and before
-+ * consumption. This function updates the metrics for all subsystems except for
-+ * the fair class. @rq must be locked and have its clock updated.
-+ */
-+bool update_other_load_avgs(struct rq *rq)
-+{
-+ u64 now = rq_clock_pelt(rq);
-+ const struct sched_class *curr_class = rq->curr->sched_class;
-+ unsigned long hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
-+
-+ lockdep_assert_rq_held(rq);
-+
-+ return update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
-+ update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
-+ update_hw_load_avg(now, rq, hw_pressure) |
-+ update_irq_load_avg(rq, 0);
-+}
-+
- /*
- * This function computes an effective utilization for the given CPU, to be
- * used for frequency selection given the linear relation: f = u * f_max.
-@@ -7789,6 +7887,10 @@ static int __sched_setscheduler(struct task_struct *p,
- goto unlock;
- }
-
-+ retval = scx_check_setscheduler(p, policy);
-+ if (retval)
-+ goto unlock;
-+
- /*
- * If not changing anything there's no need to proceed further,
- * but store a possible modification of reset_on_fork.
-@@ -7891,6 +7993,7 @@ static int __sched_setscheduler(struct task_struct *p,
- __setscheduler_prio(p, newprio);
- }
- __setscheduler_uclamp(p, attr);
-+ check_class_changing(rq, p, prev_class);
-
- if (queued) {
- /*
-@@ -9066,6 +9169,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
- case SCHED_NORMAL:
- case SCHED_BATCH:
- case SCHED_IDLE:
-+ case SCHED_EXT:
- ret = 0;
- break;
- }
-@@ -9093,6 +9197,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
- case SCHED_NORMAL:
- case SCHED_BATCH:
- case SCHED_IDLE:
-+ case SCHED_EXT:
- ret = 0;
- }
- return ret;
-@@ -9188,6 +9293,7 @@ void sched_show_task(struct task_struct *p)
+@@ -7429,6 +7506,7 @@ void sched_show_task(struct task_struct *p)
print_worker_info(KERN_INFO, p);
print_stop_info(KERN_INFO, p);
@@ -1228,7 +1418,7 @@ index ebf21373f663..fb6276f74ee6 100644
show_stack(p, NULL, KERN_INFO);
put_task_stack(p);
}
-@@ -9680,6 +9786,8 @@ int sched_cpu_activate(unsigned int cpu)
+@@ -7957,6 +8035,8 @@ int sched_cpu_activate(unsigned int cpu)
cpuset_cpu_active();
}
@@ -1237,7 +1427,7 @@ index ebf21373f663..fb6276f74ee6 100644
/*
* Put the rq online, if not already. This happens:
*
-@@ -9903,6 +9903,8 @@
+@@ -8006,6 +8086,8 @@ int sched_cpu_deactivate(unsigned int cpu)
sched_set_rq_offline(rq, cpu);
@@ -1246,7 +1436,7 @@ index ebf21373f663..fb6276f74ee6 100644
/*
* When going down, decrement the number of cores with SMT present.
*/
-@@ -10061,11 +10061,15 @@
+@@ -8192,11 +8192,15 @@
int i;
/* Make sure the linker didn't screw up */
@@ -1266,7 +1456,17 @@ index ebf21373f663..fb6276f74ee6 100644
#endif
#ifdef CONFIG_SCHED_BORE
-@@ -10096,6 +10210,7 @@ void __init sched_init(void)
+@@ -8218,6 +8304,9 @@ void __init sched_init(void)
+ root_task_group.shares = ROOT_TASK_GROUP_LOAD;
+ init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL);
+ #endif /* CONFIG_FAIR_GROUP_SCHED */
++#ifdef CONFIG_EXT_GROUP_SCHED
++ root_task_group.scx_weight = CGROUP_WEIGHT_DFL;
++#endif /* CONFIG_EXT_GROUP_SCHED */
+ #ifdef CONFIG_RT_GROUP_SCHED
+ root_task_group.rt_se = (struct sched_rt_entity **)ptr;
+ ptr += nr_cpu_ids * sizeof(void **);
+@@ -8363,6 +8452,7 @@ void __init sched_init(void)
balance_push_set(smp_processor_id(), false);
#endif
init_sched_fair_class();
@@ -1274,7 +1474,23 @@ index ebf21373f663..fb6276f74ee6 100644
psi_init();
-@@ -10522,11 +10637,6 @@ void sched_move_task(struct task_struct *tsk)
+@@ -8648,6 +8738,7 @@ struct task_group *sched_create_group(struct task_group *parent)
+ if (!alloc_rt_sched_group(tg, parent))
+ goto err;
+
++ scx_group_set_weight(tg, CGROUP_WEIGHT_DFL);
+ alloc_uclamp_sched_group(tg, parent);
+
+ return tg;
+@@ -8775,6 +8866,7 @@ void sched_move_task(struct task_struct *tsk)
+ put_prev_task(rq, tsk);
+
+ sched_change_group(tsk, group);
++ scx_move_task(tsk);
+
+ if (queued)
+ enqueue_task(rq, tsk, queue_flags);
+@@ -8789,11 +8881,6 @@ void sched_move_task(struct task_struct *tsk)
}
}
@@ -1286,16 +1502,154 @@ index ebf21373f663..fb6276f74ee6 100644
static struct cgroup_subsys_state *
cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
-@@ -11293,29 +11403,27 @@ static int cpu_local_stat_show(struct seq_file *sf,
+@@ -8817,6 +8904,11 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
+ {
+ struct task_group *tg = css_tg(css);
+ struct task_group *parent = css_tg(css->parent);
++ int ret;
++
++ ret = scx_tg_online(tg);
++ if (ret)
++ return ret;
+
+ if (parent)
+ sched_online_group(tg, parent);
+@@ -8831,6 +8923,13 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
+ return 0;
}
- #ifdef CONFIG_FAIR_GROUP_SCHED
++static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
++{
++ struct task_group *tg = css_tg(css);
++
++ scx_tg_offline(tg);
++}
+
+ static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
+ {
+ struct task_group *tg = css_tg(css);
+@@ -8848,9 +8947,9 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
+ sched_unregister_group(tg);
+ }
+
+-#ifdef CONFIG_RT_GROUP_SCHED
+ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
+ {
++#ifdef CONFIG_RT_GROUP_SCHED
+ struct task_struct *task;
+ struct cgroup_subsys_state *css;
+
+@@ -8858,9 +8957,9 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
+ if (!sched_rt_can_attach(css_tg(css), task))
+ return -EINVAL;
+ }
+- return 0;
+-}
+ #endif
++ return scx_cgroup_can_attach(tset);
++}
+
+ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
+ {
+@@ -8869,6 +8968,13 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
+
+ cgroup_taskset_for_each(task, css, tset)
+ sched_move_task(task);
++
++ scx_cgroup_finish_attach();
++}
++
++static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset)
++{
++ scx_cgroup_cancel_attach(tset);
+ }
+
+ #ifdef CONFIG_UCLAMP_TASK_GROUP
+@@ -9045,22 +9151,36 @@ static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
+ }
+ #endif /* CONFIG_UCLAMP_TASK_GROUP */
+
++#ifdef CONFIG_GROUP_SCHED_WEIGHT
+static unsigned long tg_weight(struct task_group *tg)
+{
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+ return scale_load_down(tg->shares);
++#else
++ return sched_weight_from_cgroup(tg->scx_weight);
++#endif
+}
+
+ static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 shareval)
+ {
++ int ret;
++
+ if (shareval > scale_load_down(ULONG_MAX))
+ shareval = MAX_SHARES;
+- return sched_group_set_shares(css_tg(css), scale_load(shareval));
++ ret = sched_group_set_shares(css_tg(css), scale_load(shareval));
++ if (!ret)
++ scx_group_set_weight(css_tg(css),
++ sched_weight_to_cgroup(shareval));
++ return ret;
+ }
+
+ static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+ {
+- struct task_group *tg = css_tg(css);
+-
+- return (u64) scale_load_down(tg->shares);
++ return tg_weight(css_tg(css));
+ }
++#endif /* CONFIG_GROUP_SCHED_WEIGHT */
+
+ #ifdef CONFIG_CFS_BANDWIDTH
+ static DEFINE_MUTEX(cfs_constraints_mutex);
+@@ -9406,7 +9526,6 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v)
+ return 0;
+ }
+ #endif /* CONFIG_CFS_BANDWIDTH */
+-#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+ #ifdef CONFIG_RT_GROUP_SCHED
+ static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
+@@ -9434,7 +9553,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
+ }
+ #endif /* CONFIG_RT_GROUP_SCHED */
+
+-#ifdef CONFIG_FAIR_GROUP_SCHED
++#ifdef CONFIG_GROUP_SCHED_WEIGHT
+ static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+ {
+@@ -9444,12 +9563,17 @@ static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
+ static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft, s64 idle)
+ {
+- return sched_group_set_idle(css_tg(css), idle);
++ int ret;
++
++ ret = sched_group_set_idle(css_tg(css), idle);
++ if (!ret)
++ scx_group_set_idle(css_tg(css), idle);
++ return ret;
+ }
+ #endif
+
+ static struct cftype cpu_legacy_files[] = {
+-#ifdef CONFIG_FAIR_GROUP_SCHED
++#ifdef CONFIG_GROUP_SCHED_WEIGHT
+ {
+ .name = "shares",
+ .read_u64 = cpu_shares_read_u64,
+@@ -9559,38 +9683,35 @@ static int cpu_local_stat_show(struct seq_file *sf,
+ return 0;
+ }
+
+-#ifdef CONFIG_FAIR_GROUP_SCHED
++#ifdef CONFIG_GROUP_SCHED_WEIGHT
++
static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
@@ -1319,6 +1673,7 @@ index ebf21373f663..fb6276f74ee6 100644
- */
- if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
+ unsigned long weight;
++ int ret;
+
+ if (cgrp_weight < CGROUP_WEIGHT_MIN || cgrp_weight > CGROUP_WEIGHT_MAX)
return -ERANGE;
@@ -1326,9 +1681,13 @@ index ebf21373f663..fb6276f74ee6 100644
- weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
+ weight = sched_weight_from_cgroup(cgrp_weight);
- return sched_group_set_shares(css_tg(css), scale_load(weight));
+- return sched_group_set_shares(css_tg(css), scale_load(weight));
++ ret = sched_group_set_shares(css_tg(css), scale_load(weight));
++ if (!ret)
++ scx_group_set_weight(css_tg(css), cgrp_weight);
++ return ret;
}
-@@ -11323,7 +11431,7 @@ static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
+
static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
@@ -1337,7 +1696,58 @@ index ebf21373f663..fb6276f74ee6 100644
int last_delta = INT_MAX;
int prio, delta;
-@@ -12064,3 +12172,38 @@ void sched_mm_cid_fork(struct task_struct *t)
+@@ -9609,7 +9730,7 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft, s64 nice)
+ {
+ unsigned long weight;
+- int idx;
++ int idx, ret;
+
+ if (nice < MIN_NICE || nice > MAX_NICE)
+ return -ERANGE;
+@@ -9618,9 +9739,13 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
+ idx = array_index_nospec(idx, 40);
+ weight = sched_prio_to_weight[idx];
+
+- return sched_group_set_shares(css_tg(css), scale_load(weight));
++ ret = sched_group_set_shares(css_tg(css), scale_load(weight));
++ if (!ret)
++ scx_group_set_weight(css_tg(css),
++ sched_weight_to_cgroup(weight));
++ return ret;
+ }
+-#endif
++#endif /* CONFIG_GROUP_SCHED_WEIGHT */
+
+ static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
+ long period, long quota)
+@@ -9680,7 +9805,7 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of,
+ #endif
+
+ static struct cftype cpu_files[] = {
+-#ifdef CONFIG_FAIR_GROUP_SCHED
++#ifdef CONFIG_GROUP_SCHED_WEIGHT
+ {
+ .name = "weight",
+ .flags = CFTYPE_NOT_ON_ROOT,
+@@ -9734,14 +9859,14 @@ static struct cftype cpu_files[] = {
+ struct cgroup_subsys cpu_cgrp_subsys = {
+ .css_alloc = cpu_cgroup_css_alloc,
+ .css_online = cpu_cgroup_css_online,
++ .css_offline = cpu_cgroup_css_offline,
+ .css_released = cpu_cgroup_css_released,
+ .css_free = cpu_cgroup_css_free,
+ .css_extra_stat_show = cpu_extra_stat_show,
+ .css_local_stat_show = cpu_local_stat_show,
+-#ifdef CONFIG_RT_GROUP_SCHED
+ .can_attach = cpu_cgroup_can_attach,
+-#endif
+ .attach = cpu_cgroup_attach,
++ .cancel_attach = cpu_cgroup_cancel_attach,
+ .legacy_cftypes = cpu_legacy_files,
+ .dfl_cftypes = cpu_files,
+ .early_init = true,
+@@ -10331,3 +10456,38 @@ void sched_mm_cid_fork(struct task_struct *t)
t->mm_cid_active = 1;
}
#endif
@@ -1481,10 +1891,10 @@ index c1eb9a1afd13..c057ef46c5f8 100644
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
new file mode 100644
-index 000000000000..0dac88d0e578
+index 000000000000..25fadfaace33
--- /dev/null
+++ b/kernel/sched/ext.c
-@@ -0,0 +1,6532 @@
+@@ -0,0 +1,7262 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
@@ -1603,10 +2013,16 @@ index 000000000000..0dac88d0e578
+ */
+ SCX_OPS_SWITCH_PARTIAL = 1LLU << 3,
+
++ /*
++ * CPU cgroup support flags
++ */
++ SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* cpu.weight */
++
+ SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE |
+ SCX_OPS_ENQ_LAST |
+ SCX_OPS_ENQ_EXITING |
-+ SCX_OPS_SWITCH_PARTIAL,
++ SCX_OPS_SWITCH_PARTIAL |
++ SCX_OPS_HAS_CGROUP_WEIGHT,
+};
+
+/* argument container for ops.init_task() */
@@ -1616,6 +2032,10 @@ index 000000000000..0dac88d0e578
+ * to the scheduler transition path.
+ */
+ bool fork;
++#ifdef CONFIG_EXT_GROUP_SCHED
++ /* the cgroup the task is joining */
++ struct cgroup *cgroup;
++#endif
+};
+
+/* argument container for ops.exit_task() */
@@ -1624,6 +2044,12 @@ index 000000000000..0dac88d0e578
+ bool cancelled;
+};
+
++/* argument container for ops->cgroup_init() */
++struct scx_cgroup_init_args {
++ /* the weight of the cgroup [1..10000] */
++ u32 weight;
++};
++
+enum scx_cpu_preempt_reason {
+ /* next task is being scheduled by &sched_class_rt */
+ SCX_CPU_PREEMPT_RT,
@@ -1988,6 +2414,79 @@ index 000000000000..0dac88d0e578
+ */
+ void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p);
+
++#ifdef CONFIG_EXT_GROUP_SCHED
++ /**
++ * cgroup_init - Initialize a cgroup
++ * @cgrp: cgroup being initialized
++ * @args: init arguments, see the struct definition
++ *
++ * Either the BPF scheduler is being loaded or @cgrp created, initialize
++ * @cgrp for sched_ext. This operation may block.
++ *
++ * Return 0 for success, -errno for failure. An error return while
++ * loading will abort loading of the BPF scheduler. During cgroup
++ * creation, it will abort the specific cgroup creation.
++ */
++ s32 (*cgroup_init)(struct cgroup *cgrp,
++ struct scx_cgroup_init_args *args);
++
++ /**
++ * cgroup_exit - Exit a cgroup
++ * @cgrp: cgroup being exited
++ *
++ * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit
++ * @cgrp for sched_ext. This operation may block.
++ */
++ void (*cgroup_exit)(struct cgroup *cgrp);
++
++ /**
++ * cgroup_prep_move - Prepare a task to be moved to a different cgroup
++ * @p: task being moved
++ * @from: cgroup @p is being moved from
++ * @to: cgroup @p is being moved to
++ *
++ * Prepare @p for move from cgroup @from to @to. This operation may
++ * block and can be used for allocations.
++ *
++ * Return 0 for success, -errno for failure. An error return aborts the
++ * migration.
++ */
++ s32 (*cgroup_prep_move)(struct task_struct *p,
++ struct cgroup *from, struct cgroup *to);
++
++ /**
++ * cgroup_move - Commit cgroup move
++ * @p: task being moved
++ * @from: cgroup @p is being moved from
++ * @to: cgroup @p is being moved to
++ *
++ * Commit the move. @p is dequeued during this operation.
++ */
++ void (*cgroup_move)(struct task_struct *p,
++ struct cgroup *from, struct cgroup *to);
++
++ /**
++ * cgroup_cancel_move - Cancel cgroup move
++ * @p: task whose cgroup move is being canceled
++ * @from: cgroup @p was being moved from
++ * @to: cgroup @p was being moved to
++ *
++ * @p was cgroup_prep_move()'d but failed before reaching cgroup_move().
++ * Undo the preparation.
++ */
++ void (*cgroup_cancel_move)(struct task_struct *p,
++ struct cgroup *from, struct cgroup *to);
++
++ /**
++ * cgroup_set_weight - A cgroup's weight is being changed
++ * @cgrp: cgroup whose weight is being updated
++ * @weight: new weight [1..10000]
++ *
++ * Update @tg's weight to @weight.
++ */
++ void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight);
++#endif /* CONFIG_EXT_GROUP_SCHED */
++
+ /*
+ * All online ops must come before ops.cpu_online().
+ */
@@ -2173,8 +2672,12 @@ index 000000000000..0dac88d0e578
+ SCX_KICK_WAIT = 1LLU << 2,
+};
+
++enum scx_tg_flags {
++ SCX_TG_ONLINE = 1U << 0,
++ SCX_TG_INITED = 1U << 1,
++};
++
+enum scx_ops_enable_state {
-+ SCX_OPS_PREPPING,
+ SCX_OPS_ENABLING,
+ SCX_OPS_ENABLED,
+ SCX_OPS_DISABLING,
@@ -2182,7 +2685,6 @@ index 000000000000..0dac88d0e578
+};
+
+static const char *scx_ops_enable_state_str[] = {
-+ [SCX_OPS_PREPPING] = "prepping",
+ [SCX_OPS_ENABLING] = "enabling",
+ [SCX_OPS_ENABLED] = "enabled",
+ [SCX_OPS_DISABLING] = "disabling",
@@ -2250,6 +2752,7 @@ index 000000000000..0dac88d0e578
+DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
+static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED);
+static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0);
++static bool scx_ops_init_task_enabled;
+static bool scx_switching_all;
+DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
+
@@ -2261,7 +2764,7 @@ index 000000000000..0dac88d0e578
+static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
+static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
+
-+struct static_key_false scx_has_op[SCX_OPI_END] =
++static struct static_key_false scx_has_op[SCX_OPI_END] =
+ { [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT };
+
+static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE);
@@ -2271,6 +2774,13 @@ index 000000000000..0dac88d0e578
+static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
+
+/*
++ * A monotonically increasing sequence number that is incremented every time a
++ * scheduler is enabled. This can be used to check if any custom sched_ext
++ * scheduler has ever been used in the system.
++ */
++static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0);
++
++/*
+ * The maximum amount of time in jiffies that a task may be runnable without
+ * being scheduled on a CPU. If this timeout is exceeded, it will trigger
+ * scx_ops_error().
@@ -2314,8 +2824,15 @@ index 000000000000..0dac88d0e578
+ */
+static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task);
+
-+/* dispatch queues */
-+static struct scx_dispatch_q __cacheline_aligned_in_smp scx_dsq_global;
++/*
++ * Dispatch queues.
++ *
++ * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. This is
++ * to avoid live-locking in bypass mode where all tasks are dispatched to
++ * %SCX_DSQ_GLOBAL and all CPUs consume from it. If per-node split isn't
++ * sufficient, it can be further split.
++ */
++static struct scx_dispatch_q **global_dsqs;
+
+static const struct rhashtable_params dsq_hash_params = {
+ .key_len = 8,
@@ -2364,7 +2881,7 @@ index 000000000000..0dac88d0e578
+ struct scx_bstr_buf buf;
+};
+
-+struct scx_dump_data scx_dump_data = {
++static struct scx_dump_data scx_dump_data = {
+ .cpu = -1,
+};
+
@@ -2418,6 +2935,16 @@ index 000000000000..0dac88d0e578
+ return (s32)(a - b) < 0;
+}
+
++static struct scx_dispatch_q *find_global_dsq(struct task_struct *p)
++{
++ return global_dsqs[cpu_to_node(task_cpu(p))];
++}
++
++static struct scx_dispatch_q *find_user_dsq(u64 dsq_id)
++{
++ return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params);
++}
++
+/*
+ * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX
+ * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate
@@ -2554,6 +3081,11 @@ index 000000000000..0dac88d0e578
+ return true;
+}
+
++static bool scx_kf_allowed_if_unlocked(void)
++{
++ return !current->scx.kf_mask;
++}
++
+/**
+ * nldsq_next_task - Iterate to the next task in a non-local DSQ
+ * @dsq: user dsq being iterated
@@ -2587,7 +3119,7 @@ index 000000000000..0dac88d0e578
+
+ dsq_lnode = container_of(list_node, struct scx_dsq_list_node,
+ node);
-+ } while (dsq_lnode->is_bpf_iter_cursor);
++ } while (dsq_lnode->flags & SCX_DSQ_LNODE_ITER_CURSOR);
+
+ return container_of(dsq_lnode, struct task_struct, scx.dsq_list);
+}
@@ -2605,16 +3137,22 @@ index 000000000000..0dac88d0e578
+ */
+enum scx_dsq_iter_flags {
+ /* iterate in the reverse dispatch order */
-+ SCX_DSQ_ITER_REV = 1U << 0,
++ SCX_DSQ_ITER_REV = 1U << 16,
+
-+ __SCX_DSQ_ITER_ALL_FLAGS = SCX_DSQ_ITER_REV,
++ __SCX_DSQ_ITER_HAS_SLICE = 1U << 30,
++ __SCX_DSQ_ITER_HAS_VTIME = 1U << 31,
++
++ __SCX_DSQ_ITER_USER_FLAGS = SCX_DSQ_ITER_REV,
++ __SCX_DSQ_ITER_ALL_FLAGS = __SCX_DSQ_ITER_USER_FLAGS |
++ __SCX_DSQ_ITER_HAS_SLICE |
++ __SCX_DSQ_ITER_HAS_VTIME,
+};
+
+struct bpf_iter_scx_dsq_kern {
+ struct scx_dsq_list_node cursor;
+ struct scx_dispatch_q *dsq;
-+ u32 dsq_seq;
-+ u32 flags;
++ u64 slice;
++ u64 vtime;
+} __attribute__((aligned(8)));
+
+struct bpf_iter_scx_dsq {
@@ -2652,6 +3190,9 @@ index 000000000000..0dac88d0e578
+{
+ lockdep_assert_held(&scx_tasks_lock);
+
++ BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS &
++ ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1));
++
+ iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR };
+ list_add(&iter->cursor.tasks_node, &scx_tasks);
+ iter->locked = NULL;
@@ -2730,17 +3271,37 @@ index 000000000000..0dac88d0e578
+ * whether they would like to filter out dead tasks. See scx_task_iter_init()
+ * for details.
+ */
-+static struct task_struct *
-+scx_task_iter_next_locked(struct scx_task_iter *iter, bool include_dead)
++static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
+{
+ struct task_struct *p;
-+retry:
++
+ scx_task_iter_rq_unlock(iter);
+
+ while ((p = scx_task_iter_next(iter))) {
+ /*
-+ * is_idle_task() tests %PF_IDLE which may not be set for CPUs
-+ * which haven't yet been onlined. Test sched_class directly.
++ * scx_task_iter is used to prepare and move tasks into SCX
++ * while loading the BPF scheduler and vice-versa while
++ * unloading. The init_tasks ("swappers") should be excluded
++ * from the iteration because:
++ *
++ * - It's unsafe to use __setscheduler_prio() on an init_task to
++ * determine the sched_class to use as it won't preserve its
++ * idle_sched_class.
++ *
++ * - ops.init/exit_task() can easily be confused if called with
++ * init_tasks as they, e.g., share PID 0.
++ *
++ * As init_tasks are never scheduled through SCX, they can be
++ * skipped safely. Note that is_idle_task() which tests %PF_IDLE
++ * doesn't work here:
++ *
++ * - %PF_IDLE may not be set for an init_task whose CPU hasn't
++ * yet been onlined.
++ *
++ * - %PF_IDLE can be set on tasks that are not init_tasks. See
++ * play_idle_precise() used by CONFIG_IDLE_INJECT.
++ *
++ * Test for idle_sched_class as only init_tasks are on it.
+ */
+ if (p->sched_class != &idle_sched_class)
+ break;
@@ -2751,16 +3312,6 @@ index 000000000000..0dac88d0e578
+ iter->rq = task_rq_lock(p, &iter->rf);
+ iter->locked = p;
+
-+ /*
-+ * If we see %TASK_DEAD, @p already disabled preemption, is about to do
-+ * the final __schedule(), won't ever need to be scheduled again and can
-+ * thus be safely ignored. If we don't see %TASK_DEAD, @p can't enter
-+ * the final __schedle() while we're locking its rq and thus will stay
-+ * alive until the rq is unlocked.
-+ */
-+ if (!include_dead && READ_ONCE(p->__state) == TASK_DEAD)
-+ goto retry;
-+
+ return p;
+}
+
@@ -2783,9 +3334,9 @@ index 000000000000..0dac88d0e578
+ return atomic_try_cmpxchg(&scx_ops_enable_state_var, &from_v, to);
+}
+
-+static bool scx_ops_bypassing(void)
++static bool scx_rq_bypassing(struct rq *rq)
+{
-+ return unlikely(atomic_read(&scx_ops_bypass_depth));
++ return unlikely(rq->scx.flags & SCX_RQ_BYPASSING);
+}
+
+/**
@@ -2919,13 +3470,18 @@ index 000000000000..0dac88d0e578
+ */
+static void touch_core_sched(struct rq *rq, struct task_struct *p)
+{
++ lockdep_assert_rq_held(rq);
++
+#ifdef CONFIG_SCHED_CORE
+ /*
+ * It's okay to update the timestamp spuriously. Use
+ * sched_core_disabled() which is cheaper than enabled().
++ *
++ * As this is used to determine ordering between tasks of sibling CPUs,
++ * it may be better to use per-core dispatch sequence instead.
+ */
+ if (!sched_core_disabled())
-+ p->scx.core_sched_at = rq_clock_task(rq);
++ p->scx.core_sched_at = sched_clock_cpu(cpu_of(rq));
+#endif
+}
+
@@ -2942,7 +3498,6 @@ index 000000000000..0dac88d0e578
+static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p)
+{
+ lockdep_assert_rq_held(rq);
-+ assert_clock_updated(rq);
+
+#ifdef CONFIG_SCHED_CORE
+ if (SCX_HAS_OP(core_sched_before))
@@ -2953,20 +3508,14 @@ index 000000000000..0dac88d0e578
+static void update_curr_scx(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
-+ u64 now = rq_clock_task(rq);
-+ u64 delta_exec;
++ s64 delta_exec;
+
-+ if (time_before_eq64(now, curr->se.exec_start))
++ delta_exec = update_curr_common(rq);
++ if (unlikely(delta_exec <= 0))
+ return;
+
-+ delta_exec = now - curr->se.exec_start;
-+ curr->se.exec_start = now;
-+ curr->se.sum_exec_runtime += delta_exec;
-+ account_group_exec_runtime(curr, delta_exec);
-+ cgroup_account_cputime(curr, delta_exec);
-+
+ if (curr->scx.slice != SCX_SLICE_INF) {
-+ curr->scx.slice -= min(curr->scx.slice, delta_exec);
++ curr->scx.slice -= min_t(u64, curr->scx.slice, delta_exec);
+ if (!curr->scx.slice)
+ touch_core_sched(rq, curr);
+ }
@@ -3004,7 +3553,7 @@ index 000000000000..0dac88d0e578
+ scx_ops_error("attempting to dispatch to a destroyed dsq");
+ /* fall back to the global dsq */
+ raw_spin_unlock(&dsq->lock);
-+ dsq = &scx_dsq_global;
++ dsq = find_global_dsq(p);
+ raw_spin_lock(&dsq->lock);
+ }
+ }
@@ -3107,6 +3656,8 @@ index 000000000000..0dac88d0e578
+static void task_unlink_from_dsq(struct task_struct *p,
+ struct scx_dispatch_q *dsq)
+{
++ WARN_ON_ONCE(list_empty(&p->scx.dsq_list.node));
++
+ if (p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) {
+ rb_erase(&p->scx.dsq_priq, &dsq->priq);
+ RB_CLEAR_NODE(&p->scx.dsq_priq);
@@ -3114,6 +3665,7 @@ index 000000000000..0dac88d0e578
+ }
+
+ list_del_init(&p->scx.dsq_list.node);
++ dsq_mod_nr(dsq, -1);
+}
+
+static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
@@ -3150,9 +3702,7 @@ index 000000000000..0dac88d0e578
+ */
+ if (p->scx.holding_cpu < 0) {
+ /* @p must still be on @dsq, dequeue */
-+ WARN_ON_ONCE(list_empty(&p->scx.dsq_list.node));
+ task_unlink_from_dsq(p, dsq);
-+ dsq_mod_nr(dsq, -1);
+ } else {
+ /*
+ * We're racing against dispatch_to_local_dsq() which already
@@ -3169,21 +3719,6 @@ index 000000000000..0dac88d0e578
+ raw_spin_unlock(&dsq->lock);
+}
+
-+static struct scx_dispatch_q *find_user_dsq(u64 dsq_id)
-+{
-+ return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params);
-+}
-+
-+static struct scx_dispatch_q *find_non_local_dsq(u64 dsq_id)
-+{
-+ lockdep_assert(rcu_read_lock_any_held());
-+
-+ if (dsq_id == SCX_DSQ_GLOBAL)
-+ return &scx_dsq_global;
-+ else
-+ return find_user_dsq(dsq_id);
-+}
-+
+static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id,
+ struct task_struct *p)
+{
@@ -3192,11 +3727,24 @@ index 000000000000..0dac88d0e578
+ if (dsq_id == SCX_DSQ_LOCAL)
+ return &rq->scx.local_dsq;
+
-+ dsq = find_non_local_dsq(dsq_id);
++ if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
++ s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
++
++ if (!ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict"))
++ return find_global_dsq(p);
++
++ return &cpu_rq(cpu)->scx.local_dsq;
++ }
++
++ if (dsq_id == SCX_DSQ_GLOBAL)
++ dsq = find_global_dsq(p);
++ else
++ dsq = find_user_dsq(dsq_id);
++
+ if (unlikely(!dsq)) {
+ scx_ops_error("non-existent DSQ 0x%llx for %s[%d]",
+ dsq_id, p->comm, p->pid);
-+ return &scx_dsq_global;
++ return find_global_dsq(p);
+ }
+
+ return dsq;
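
A minimal BPF-side sketch of the dsq_id values that find_dsq_for_dispatch() decodes above, assuming the helpers from tools/sched_ext/include/scx/common.bpf.h; the example_* name and the pinned-task policy are illustrative, and the license/struct_ops registration boilerplate is omitted:

#include <scx/common.bpf.h>

void BPF_STRUCT_OPS(example_enqueue, struct task_struct *p, u64 enq_flags)
{
	s32 cpu = scx_bpf_task_cpu(p);

	if (p->nr_cpus_allowed == 1)
		/* pinned task: target that CPU's local DSQ directly */
		scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL,
				 enq_flags);
	else
		/* otherwise fall back to the (per-node) global DSQ */
		scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
}
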
@@ -3235,8 +3783,8 @@ index 000000000000..0dac88d0e578
+static void direct_dispatch(struct task_struct *p, u64 enq_flags)
+{
+ struct rq *rq = task_rq(p);
-+ struct scx_dispatch_q *dsq;
-+ u64 dsq_id = p->scx.ddsp_dsq_id;
++ struct scx_dispatch_q *dsq =
++ find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p);
+
+ touch_core_sched_dispatch(rq, p);
+
@@ -3248,15 +3796,9 @@ index 000000000000..0dac88d0e578
+ * DSQ_LOCAL_ON verdicts targeting the local DSQ of a remote CPU, defer
+ * the enqueue so that it's executed when @rq can be unlocked.
+ */
-+ if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
-+ s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
++ if (dsq->id == SCX_DSQ_LOCAL && dsq != &rq->scx.local_dsq) {
+ unsigned long opss;
+
-+ if (cpu == cpu_of(rq)) {
-+ dsq_id = SCX_DSQ_LOCAL;
-+ goto dispatch;
-+ }
-+
+ opss = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_STATE_MASK;
+
+ switch (opss & SCX_OPSS_STATE_MASK) {
@@ -3283,14 +3825,19 @@ index 000000000000..0dac88d0e578
+ return;
+ }
+
-+dispatch:
-+ dsq = find_dsq_for_dispatch(rq, dsq_id, p);
+ dispatch_enqueue(dsq, p, p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
+}
+
+static bool scx_rq_online(struct rq *rq)
+{
-+ return likely(rq->scx.flags & SCX_RQ_ONLINE);
++ /*
++ * Test both cpu_active() and %SCX_RQ_ONLINE. %SCX_RQ_ONLINE indicates
++ * the online state as seen from the BPF scheduler. cpu_active() test
++ * guarantees that, if this function returns %true, %SCX_RQ_ONLINE will
++ * stay set until the current scheduling operation is complete even if
++ * we aren't locking @rq.
++ */
++ return likely((rq->scx.flags & SCX_RQ_ONLINE) && cpu_active(cpu_of(rq)));
+}
+
+static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
@@ -3313,7 +3860,7 @@ index 000000000000..0dac88d0e578
+ if (!scx_rq_online(rq))
+ goto local;
+
-+ if (scx_ops_bypassing()) {
++ if (scx_rq_bypassing(rq)) {
+ if (enq_flags & SCX_ENQ_LAST)
+ goto local;
+ else
@@ -3378,7 +3925,7 @@ index 000000000000..0dac88d0e578
+global:
+ touch_core_sched(rq, p); /* see the comment in local: */
+ p->scx.slice = SCX_SLICE_DFL;
-+ dispatch_enqueue(&scx_dsq_global, p, enq_flags);
++ dispatch_enqueue(find_global_dsq(p), p, enq_flags);
+}
+
+static bool task_runnable(const struct task_struct *p)
@@ -3440,7 +3987,7 @@ index 000000000000..0dac88d0e578
+ rq->scx.nr_running++;
+ add_nr_running(rq, 1);
+
-+ if (SCX_HAS_OP(runnable))
++ if (SCX_HAS_OP(runnable) && !task_on_rq_migrating(p))
+ SCX_CALL_OP_TASK(SCX_KF_REST, runnable, p, enq_flags);
+
+ if (enq_flags & SCX_ENQ_WAKEUP)
@@ -3524,7 +4071,7 @@ index 000000000000..0dac88d0e578
+ SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, false);
+ }
+
-+ if (SCX_HAS_OP(quiescent))
++ if (SCX_HAS_OP(quiescent) && !task_on_rq_migrating(p))
+ SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, p, deq_flags);
+
+ if (deq_flags & SCX_DEQ_SLEEP)
@@ -3559,193 +4106,173 @@ index 000000000000..0dac88d0e578
+ return false;
+}
+
++static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
++ struct scx_dispatch_q *src_dsq,
++ struct rq *dst_rq)
++{
++ struct scx_dispatch_q *dst_dsq = &dst_rq->scx.local_dsq;
++
++ /* @dsq is locked and @p is on @dst_rq */
++ lockdep_assert_held(&src_dsq->lock);
++ lockdep_assert_rq_held(dst_rq);
++
++ WARN_ON_ONCE(p->scx.holding_cpu >= 0);
++
++ if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT))
++ list_add(&p->scx.dsq_list.node, &dst_dsq->list);
++ else
++ list_add_tail(&p->scx.dsq_list.node, &dst_dsq->list);
++
++ dsq_mod_nr(dst_dsq, 1);
++ p->scx.dsq = dst_dsq;
++}
++
+#ifdef CONFIG_SMP
+/**
-+ * move_task_to_local_dsq - Move a task from a different rq to a local DSQ
-+ * @rq: rq to move the task into, currently locked
++ * move_remote_task_to_local_dsq - Move a task from a foreign rq to a local DSQ
+ * @p: task to move
+ * @enq_flags: %SCX_ENQ_*
++ * @src_rq: rq to move the task from, locked on entry, released on return
++ * @dst_rq: rq to move the task into, locked on return
+ *
-+ * Move @p which is currently on a different rq to @rq's local DSQ. The caller
-+ * must:
-+ *
-+ * 1. Start with exclusive access to @p either through its DSQ lock or
-+ * %SCX_OPSS_DISPATCHING flag.
-+ *
-+ * 2. Set @p->scx.holding_cpu to raw_smp_processor_id().
-+ *
-+ * 3. Remember task_rq(@p). Release the exclusive access so that we don't
-+ * deadlock with dequeue.
-+ *
-+ * 4. Lock @rq and the task_rq from #3.
-+ *
-+ * 5. Call this function.
-+ *
-+ * Returns %true if @p was successfully moved. %false after racing dequeue and
-+ * losing.
++ * Move @p which is currently on @src_rq to @dst_rq's local DSQ.
+ */
-+static bool move_task_to_local_dsq(struct rq *rq, struct task_struct *p,
-+ u64 enq_flags)
++static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
++ struct rq *src_rq, struct rq *dst_rq)
+{
-+ struct rq *task_rq;
-+
-+ lockdep_assert_rq_held(rq);
-+
-+ /*
-+ * If dequeue got to @p while we were trying to lock both rq's, it'd
-+ * have cleared @p->scx.holding_cpu to -1. While other cpus may have
-+ * updated it to different values afterwards, as this operation can't be
-+ * preempted or recurse, @p->scx.holding_cpu can never become
-+ * raw_smp_processor_id() again before we're done. Thus, we can tell
-+ * whether we lost to dequeue by testing whether @p->scx.holding_cpu is
-+ * still raw_smp_processor_id().
-+ *
-+ * See dispatch_dequeue() for the counterpart.
-+ */
-+ if (unlikely(p->scx.holding_cpu != raw_smp_processor_id()))
-+ return false;
++ lockdep_assert_rq_held(src_rq);
+
-+ /* @p->rq couldn't have changed if we're still the holding cpu */
-+ task_rq = task_rq(p);
-+ lockdep_assert_rq_held(task_rq);
++ /* the following marks @p MIGRATING which excludes dequeue */
++ deactivate_task(src_rq, p, 0);
++ set_task_cpu(p, cpu_of(dst_rq));
++ p->scx.sticky_cpu = cpu_of(dst_rq);
+
-+ WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(rq), p->cpus_ptr));
-+ deactivate_task(task_rq, p, 0);
-+ set_task_cpu(p, cpu_of(rq));
-+ p->scx.sticky_cpu = cpu_of(rq);
++ raw_spin_rq_unlock(src_rq);
++ raw_spin_rq_lock(dst_rq);
+
+ /*
+ * We want to pass scx-specific enq_flags but activate_task() will
+ * truncate the upper 32 bit. As we own @rq, we can pass them through
+ * @rq->scx.extra_enq_flags instead.
+ */
-+ WARN_ON_ONCE(rq->scx.extra_enq_flags);
-+ rq->scx.extra_enq_flags = enq_flags;
-+ activate_task(rq, p, 0);
-+ rq->scx.extra_enq_flags = 0;
-+
-+ return true;
++ WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr));
++ WARN_ON_ONCE(dst_rq->scx.extra_enq_flags);
++ dst_rq->scx.extra_enq_flags = enq_flags;
++ activate_task(dst_rq, p, 0);
++ dst_rq->scx.extra_enq_flags = 0;
+}
+
-+/**
-+ * dispatch_to_local_dsq_lock - Ensure source and destination rq's are locked
-+ * @rq: current rq which is locked
-+ * @src_rq: rq to move task from
-+ * @dst_rq: rq to move task to
++/*
++ * Similar to kernel/sched/core.c::is_cpu_allowed(). However, there are two
++ * differences:
+ *
-+ * We're holding @rq lock and trying to dispatch a task from @src_rq to
-+ * @dst_rq's local DSQ and thus need to lock both @src_rq and @dst_rq. Whether
-+ * @rq stays locked isn't important as long as the state is restored after
-+ * dispatch_to_local_dsq_unlock().
-+ */
-+static void dispatch_to_local_dsq_lock(struct rq *rq, struct rq *src_rq,
-+ struct rq *dst_rq)
-+{
-+ if (src_rq == dst_rq) {
-+ raw_spin_rq_unlock(rq);
-+ raw_spin_rq_lock(dst_rq);
-+ } else if (rq == src_rq) {
-+ double_lock_balance(rq, dst_rq);
-+ } else if (rq == dst_rq) {
-+ double_lock_balance(rq, src_rq);
-+ } else {
-+ raw_spin_rq_unlock(rq);
-+ double_rq_lock(src_rq, dst_rq);
-+ }
-+}
-+
-+/**
-+ * dispatch_to_local_dsq_unlock - Undo dispatch_to_local_dsq_lock()
-+ * @rq: current rq which is locked
-+ * @src_rq: rq to move task from
-+ * @dst_rq: rq to move task to
++ * - is_cpu_allowed() asks "Can this task run on this CPU?" while
++ * task_can_run_on_remote_rq() asks "Can the BPF scheduler migrate the task to
++ * this CPU?".
+ *
-+ * Unlock @src_rq and @dst_rq and ensure that @rq is locked on return.
-+ */
-+static void dispatch_to_local_dsq_unlock(struct rq *rq, struct rq *src_rq,
-+ struct rq *dst_rq)
-+{
-+ if (src_rq == dst_rq) {
-+ raw_spin_rq_unlock(dst_rq);
-+ raw_spin_rq_lock(rq);
-+ } else if (rq == src_rq) {
-+ double_unlock_balance(rq, dst_rq);
-+ } else if (rq == dst_rq) {
-+ double_unlock_balance(rq, src_rq);
-+ } else {
-+ double_rq_unlock(src_rq, dst_rq);
-+ raw_spin_rq_lock(rq);
-+ }
-+}
-+#endif /* CONFIG_SMP */
-+
-+static void consume_local_task(struct rq *rq, struct scx_dispatch_q *dsq,
-+ struct task_struct *p)
-+{
-+ lockdep_assert_held(&dsq->lock); /* released on return */
-+
-+ /* @dsq is locked and @p is on this rq */
-+ WARN_ON_ONCE(p->scx.holding_cpu >= 0);
-+ task_unlink_from_dsq(p, dsq);
-+ list_add_tail(&p->scx.dsq_list.node, &rq->scx.local_dsq.list);
-+ dsq_mod_nr(dsq, -1);
-+ dsq_mod_nr(&rq->scx.local_dsq, 1);
-+ p->scx.dsq = &rq->scx.local_dsq;
-+ raw_spin_unlock(&dsq->lock);
-+}
-+
-+#ifdef CONFIG_SMP
-+/*
-+ * Similar to kernel/sched/core.c::is_cpu_allowed() but we're testing whether @p
-+ * can be pulled to @rq.
++ * While migration is disabled, is_cpu_allowed() has to say "yes" as the task
++ * must be allowed to finish on the CPU that it's currently on regardless of
++ * the CPU state. However, task_can_run_on_remote_rq() must say "no" as the
++ * BPF scheduler shouldn't attempt to migrate a task which has migration
++ * disabled.
++ *
++ * - The BPF scheduler is bypassed while the rq is offline and we can always say
++ * no to the BPF scheduler initiated migrations while offline.
+ */
-+static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq)
++static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
++ bool trigger_error)
+{
+ int cpu = cpu_of(rq);
+
-+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
++ /*
++ * We don't require the BPF scheduler to avoid dispatching to offline
++ * CPUs mostly for convenience but also because CPUs can go offline
++ * between scx_bpf_dispatch() calls and here. Trigger error iff the
++ * picked CPU is outside the allowed mask.
++ */
++ if (!task_allowed_on_cpu(p, cpu)) {
++ if (trigger_error)
++ scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]",
++ cpu_of(rq), p->comm, p->pid);
+ return false;
++ }
++
+ if (unlikely(is_migration_disabled(p)))
+ return false;
-+ if (!(p->flags & PF_KTHREAD) && unlikely(!task_cpu_possible(cpu, p)))
-+ return false;
++
+ if (!scx_rq_online(rq))
+ return false;
++
+ return true;
+}
+
-+static bool consume_remote_task(struct rq *rq, struct scx_dispatch_q *dsq,
-+ struct task_struct *p, struct rq *task_rq)
++/**
++ * unlink_dsq_and_lock_src_rq() - Unlink task from its DSQ and lock its task_rq
++ * @p: target task
++ * @dsq: locked DSQ @p is currently on
++ * @src_rq: rq @p is currently on, stable with @dsq locked
++ *
++ * Called with @dsq locked but no rq's locked. We want to move @p to a different
++ * DSQ, including any local DSQ, but are not locking @src_rq. Locking @src_rq is
++ * required when transferring into a local DSQ. Even when transferring into a
++ * non-local DSQ, it's better to use the same mechanism to protect against
++ * dequeues and maintain the invariant that @p->scx.dsq can only change while
++ * @src_rq is locked, which e.g. scx_dump_task() depends on.
++ *
++ * We want to grab @src_rq but that can deadlock if we try while locking @dsq,
++ * so we want to unlink @p from @dsq, drop its lock and then lock @src_rq. As
++ * this may race with dequeue, which can't drop the rq lock or fail, do a little
++ * dancing from our side.
++ *
++ * @p->scx.holding_cpu is set to this CPU before @dsq is unlocked. If @p gets
++ * dequeued after we unlock @dsq but before locking @src_rq, the holding_cpu
++ * would be cleared to -1. While other cpus may have updated it to different
++ * values afterwards, as this operation can't be preempted or recurse, the
++ * holding_cpu can never become this CPU again before we're done. Thus, we can
++ * tell whether we lost to dequeue by testing whether the holding_cpu still
++ * points to this CPU. See dispatch_dequeue() for the counterpart.
++ *
++ * On return, @dsq is unlocked and @src_rq is locked. Returns %true if @p is
++ * still valid. %false if lost to dequeue.
++ */
++static bool unlink_dsq_and_lock_src_rq(struct task_struct *p,
++ struct scx_dispatch_q *dsq,
++ struct rq *src_rq)
+{
-+ bool moved = false;
++ s32 cpu = raw_smp_processor_id();
+
-+ lockdep_assert_held(&dsq->lock); /* released on return */
++ lockdep_assert_held(&dsq->lock);
+
-+ /*
-+ * @dsq is locked and @p is on a remote rq. @p is currently protected by
-+ * @dsq->lock. We want to pull @p to @rq but may deadlock if we grab
-+ * @task_rq while holding @dsq and @rq locks. As dequeue can't drop the
-+ * rq lock or fail, do a little dancing from our side. See
-+ * move_task_to_local_dsq().
-+ */
+ WARN_ON_ONCE(p->scx.holding_cpu >= 0);
+ task_unlink_from_dsq(p, dsq);
-+ dsq_mod_nr(dsq, -1);
-+ p->scx.holding_cpu = raw_smp_processor_id();
-+ raw_spin_unlock(&dsq->lock);
++ p->scx.holding_cpu = cpu;
+
-+ double_lock_balance(rq, task_rq);
++ raw_spin_unlock(&dsq->lock);
++ raw_spin_rq_lock(src_rq);
+
-+ moved = move_task_to_local_dsq(rq, p, 0);
++ /* task_rq couldn't have changed if we're still the holding cpu */
++ return likely(p->scx.holding_cpu == cpu) &&
++ !WARN_ON_ONCE(src_rq != task_rq(p));
++}
+
-+ double_unlock_balance(rq, task_rq);
++static bool consume_remote_task(struct rq *this_rq, struct task_struct *p,
++ struct scx_dispatch_q *dsq, struct rq *src_rq)
++{
++ raw_spin_rq_unlock(this_rq);
+
-+ return moved;
++ if (unlink_dsq_and_lock_src_rq(p, dsq, src_rq)) {
++ move_remote_task_to_local_dsq(p, 0, src_rq, this_rq);
++ return true;
++ } else {
++ raw_spin_rq_unlock(src_rq);
++ raw_spin_rq_lock(this_rq);
++ return false;
++ }
+}
+#else /* CONFIG_SMP */
-+static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq) { return false; }
-+static bool consume_remote_task(struct rq *rq, struct scx_dispatch_q *dsq,
-+ struct task_struct *p, struct rq *task_rq) { return false; }
++static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); }
++static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool trigger_error) { return false; }
++static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; }
+#endif /* CONFIG_SMP */
+
+static bool consume_dispatch_q(struct rq *rq, struct scx_dispatch_q *dsq)
@@ -3766,12 +4293,14 @@ index 000000000000..0dac88d0e578
+ struct rq *task_rq = task_rq(p);
+
+ if (rq == task_rq) {
-+ consume_local_task(rq, dsq, p);
++ task_unlink_from_dsq(p, dsq);
++ move_local_task_to_local_dsq(p, 0, dsq, rq);
++ raw_spin_unlock(&dsq->lock);
+ return true;
+ }
+
-+ if (task_can_run_on_remote_rq(p, rq)) {
-+ if (likely(consume_remote_task(rq, dsq, p, task_rq)))
++ if (task_can_run_on_remote_rq(p, rq, false)) {
++ if (likely(consume_remote_task(rq, p, dsq, task_rq)))
+ return true;
+ goto retry;
+ }
@@ -3781,122 +4310,102 @@ index 000000000000..0dac88d0e578
+ return false;
+}
+
-+enum dispatch_to_local_dsq_ret {
-+ DTL_DISPATCHED, /* successfully dispatched */
-+ DTL_LOST, /* lost race to dequeue */
-+ DTL_NOT_LOCAL, /* destination is not a local DSQ */
-+ DTL_INVALID, /* invalid local dsq_id */
-+};
++static bool consume_global_dsq(struct rq *rq)
++{
++ int node = cpu_to_node(cpu_of(rq));
++
++ return consume_dispatch_q(rq, global_dsqs[node]);
++}
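
consume_global_dsq() above is the kernel-side fallback; user DSQs are pulled into the local DSQ by the BPF scheduler itself, typically from ops.dispatch(). A minimal sketch in the style of tools/sched_ext/scx_simple.bpf.c (SHARED_DSQ and the example_* names are illustrative; registration boilerplate omitted):

#include <scx/common.bpf.h>

#define SHARED_DSQ	0	/* illustrative user DSQ id */

s32 BPF_STRUCT_OPS_SLEEPABLE(example_init)
{
	/* create the shared DSQ on any NUMA node */
	return scx_bpf_create_dsq(SHARED_DSQ, -1);
}

void BPF_STRUCT_OPS(example_dispatch, s32 cpu, struct task_struct *prev)
{
	/* move the first task from SHARED_DSQ onto this CPU's local DSQ */
	scx_bpf_consume(SHARED_DSQ);
}
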
+
+/**
+ * dispatch_to_local_dsq - Dispatch a task to a local dsq
+ * @rq: current rq which is locked
-+ * @dsq_id: destination dsq ID
++ * @dst_dsq: destination DSQ
+ * @p: task to dispatch
+ * @enq_flags: %SCX_ENQ_*
+ *
-+ * We're holding @rq lock and want to dispatch @p to the local DSQ identified by
-+ * @dsq_id. This function performs all the synchronization dancing needed
-+ * because local DSQs are protected with rq locks.
++ * We're holding @rq lock and want to dispatch @p to @dst_dsq which is a local
++ * DSQ. This function performs all the synchronization dancing needed because
++ * local DSQs are protected with rq locks.
+ *
+ * The caller must have exclusive ownership of @p (e.g. through
+ * %SCX_OPSS_DISPATCHING).
+ */
-+static enum dispatch_to_local_dsq_ret
-+dispatch_to_local_dsq(struct rq *rq, u64 dsq_id, struct task_struct *p,
-+ u64 enq_flags)
++static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
++ struct task_struct *p, u64 enq_flags)
+{
+ struct rq *src_rq = task_rq(p);
-+ struct rq *dst_rq;
++ struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
+
+ /*
+ * We're synchronized against dequeue through DISPATCHING. As @p can't
+ * be dequeued, its task_rq and cpus_allowed are stable too.
++ *
++ * If dispatching to @rq that @p is already on, no lock dancing needed.
+ */
-+ if (dsq_id == SCX_DSQ_LOCAL) {
-+ dst_rq = rq;
-+ } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
-+ s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
-+
-+ if (!ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict"))
-+ return DTL_INVALID;
-+ dst_rq = cpu_rq(cpu);
-+ } else {
-+ return DTL_NOT_LOCAL;
-+ }
-+
-+ /* if dispatching to @rq that @p is already on, no lock dancing needed */
+ if (rq == src_rq && rq == dst_rq) {
-+ dispatch_enqueue(&dst_rq->scx.local_dsq, p,
-+ enq_flags | SCX_ENQ_CLEAR_OPSS);
-+ return DTL_DISPATCHED;
++ dispatch_enqueue(dst_dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
++ return;
+ }
+
+#ifdef CONFIG_SMP
-+ if (cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)) {
-+ struct rq *locked_dst_rq = dst_rq;
-+ bool dsp;
++ if (unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
++ dispatch_enqueue(find_global_dsq(p), p,
++ enq_flags | SCX_ENQ_CLEAR_OPSS);
++ return;
++ }
+
-+ /*
-+ * @p is on a possibly remote @src_rq which we need to lock to
-+ * move the task. If dequeue is in progress, it'd be locking
-+ * @src_rq and waiting on DISPATCHING, so we can't grab @src_rq
-+ * lock while holding DISPATCHING.
-+ *
-+ * As DISPATCHING guarantees that @p is wholly ours, we can
-+ * pretend that we're moving from a DSQ and use the same
-+ * mechanism - mark the task under transfer with holding_cpu,
-+ * release DISPATCHING and then follow the same protocol.
-+ */
-+ p->scx.holding_cpu = raw_smp_processor_id();
++ /*
++ * @p is on a possibly remote @src_rq which we need to lock to move the
++ * task. If dequeue is in progress, it'd be locking @src_rq and waiting
++ * on DISPATCHING, so we can't grab @src_rq lock while holding
++ * DISPATCHING.
++ *
++ * As DISPATCHING guarantees that @p is wholly ours, we can pretend that
++ * we're moving from a DSQ and use the same mechanism - mark the task
++ * under transfer with holding_cpu, release DISPATCHING and then follow
++ * the same protocol. See unlink_dsq_and_lock_src_rq().
++ */
++ p->scx.holding_cpu = raw_smp_processor_id();
+
-+ /* store_release ensures that dequeue sees the above */
-+ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
++ /* store_release ensures that dequeue sees the above */
++ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
+
-+ dispatch_to_local_dsq_lock(rq, src_rq, locked_dst_rq);
++ /* switch to @src_rq lock */
++ if (rq != src_rq) {
++ raw_spin_rq_unlock(rq);
++ raw_spin_rq_lock(src_rq);
++ }
+
++ /* task_rq couldn't have changed if we're still the holding cpu */
++ if (likely(p->scx.holding_cpu == raw_smp_processor_id()) &&
++ !WARN_ON_ONCE(src_rq != task_rq(p))) {
+ /*
-+ * We don't require the BPF scheduler to avoid dispatching to
-+ * offline CPUs mostly for convenience but also because CPUs can
-+ * go offline between scx_bpf_dispatch() calls and here. If @p
-+ * is destined to an offline CPU, queue it on its current CPU
-+ * instead, which should always be safe. As this is an allowed
-+ * behavior, don't trigger an ops error.
++ * If @p is staying on the same rq, there's no need to go
++ * through the full deactivate/activate cycle. Optimize by
++ * abbreviating move_remote_task_to_local_dsq().
+ */
-+ if (!scx_rq_online(dst_rq))
-+ dst_rq = src_rq;
-+
+ if (src_rq == dst_rq) {
-+ /*
-+ * As @p is staying on the same rq, there's no need to
-+ * go through the full deactivate/activate cycle.
-+ * Optimize by abbreviating the operations in
-+ * move_task_to_local_dsq().
-+ */
-+ dsp = p->scx.holding_cpu == raw_smp_processor_id();
-+ if (likely(dsp)) {
-+ p->scx.holding_cpu = -1;
-+ dispatch_enqueue(&dst_rq->scx.local_dsq, p,
-+ enq_flags);
-+ }
++ p->scx.holding_cpu = -1;
++ dispatch_enqueue(&dst_rq->scx.local_dsq, p, enq_flags);
+ } else {
-+ dsp = move_task_to_local_dsq(dst_rq, p, enq_flags);
++ move_remote_task_to_local_dsq(p, enq_flags,
++ src_rq, dst_rq);
+ }
+
+ /* if the destination CPU is idle, wake it up */
-+ if (dsp && sched_class_above(p->sched_class,
-+ dst_rq->curr->sched_class))
++ if (sched_class_above(p->sched_class, dst_rq->curr->sched_class))
+ resched_curr(dst_rq);
++ }
+
-+ dispatch_to_local_dsq_unlock(rq, src_rq, locked_dst_rq);
-+
-+ return dsp ? DTL_DISPATCHED : DTL_LOST;
++ /* switch back to @rq lock */
++ if (rq != dst_rq) {
++ raw_spin_rq_unlock(dst_rq);
++ raw_spin_rq_lock(rq);
+ }
++#else /* CONFIG_SMP */
++ BUG(); /* control can not reach here on UP */
+#endif /* CONFIG_SMP */
-+
-+ scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]",
-+ cpu_of(dst_rq), p->comm, p->pid);
-+ return DTL_INVALID;
+}
+
+/**
@@ -3971,20 +4480,12 @@ index 000000000000..0dac88d0e578
+
+ BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED));
+
-+ switch (dispatch_to_local_dsq(rq, dsq_id, p, enq_flags)) {
-+ case DTL_DISPATCHED:
-+ break;
-+ case DTL_LOST:
-+ break;
-+ case DTL_INVALID:
-+ dsq_id = SCX_DSQ_GLOBAL;
-+ fallthrough;
-+ case DTL_NOT_LOCAL:
-+ dsq = find_dsq_for_dispatch(cpu_rq(raw_smp_processor_id()),
-+ dsq_id, p);
++ dsq = find_dsq_for_dispatch(this_rq(), dsq_id, p);
++
++ if (dsq->id == SCX_DSQ_LOCAL)
++ dispatch_to_local_dsq(rq, dsq, p, enq_flags);
++ else
+ dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
-+ break;
-+ }
+}
+
+static void flush_dispatch_buf(struct rq *rq)
@@ -4046,7 +4547,7 @@ index 000000000000..0dac88d0e578
+ * same conditions later and pick @rq->curr accordingly.
+ */
+ if ((prev->scx.flags & SCX_TASK_QUEUED) &&
-+ prev->scx.slice && !scx_ops_bypassing()) {
++ prev->scx.slice && !scx_rq_bypassing(rq)) {
+ if (local)
+ prev->scx.flags |= SCX_TASK_BAL_KEEP;
+ goto has_tasks;
@@ -4057,10 +4558,10 @@ index 000000000000..0dac88d0e578
+ if (rq->scx.local_dsq.nr)
+ goto has_tasks;
+
-+ if (consume_dispatch_q(rq, &scx_dsq_global))
++ if (consume_global_dsq(rq))
+ goto has_tasks;
+
-+ if (!SCX_HAS_OP(dispatch) || scx_ops_bypassing() || !scx_rq_online(rq))
++ if (!SCX_HAS_OP(dispatch) || scx_rq_bypassing(rq) || !scx_rq_online(rq))
+ goto out;
+
+ dspc->rq = rq;
@@ -4082,7 +4583,7 @@ index 000000000000..0dac88d0e578
+
+ if (rq->scx.local_dsq.nr)
+ goto has_tasks;
-+ if (consume_dispatch_q(rq, &scx_dsq_global))
++ if (consume_global_dsq(rq))
+ goto has_tasks;
+
+ /*
@@ -4109,7 +4610,6 @@ index 000000000000..0dac88d0e578
+ return has_tasks;
+}
+
-+#ifdef CONFIG_SMP
+static int balance_scx(struct rq *rq, struct task_struct *prev,
+ struct rq_flags *rf)
+{
@@ -4143,7 +4643,31 @@ index 000000000000..0dac88d0e578
+
+ return ret;
+}
-+#endif
++
++static void process_ddsp_deferred_locals(struct rq *rq)
++{
++ struct task_struct *p;
++
++ lockdep_assert_rq_held(rq);
++
++ /*
++ * Now that @rq can be unlocked, execute the deferred enqueueing of
++ * tasks directly dispatched to the local DSQs of other CPUs. See
++ * direct_dispatch(). Keep popping from the head instead of using
++ * list_for_each_entry_safe() as dispatch_to_local_dsq() may unlock @rq
++ * temporarily.
++ */
++ while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals,
++ struct task_struct, scx.dsq_list.node))) {
++ struct scx_dispatch_q *dsq;
++
++ list_del_init(&p->scx.dsq_list.node);
++
++ dsq = find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p);
++ if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL))
++ dispatch_to_local_dsq(rq, dsq, p, p->scx.ddsp_enq_flags);
++ }
++}
+
+static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
+{
@@ -4187,62 +4711,71 @@ index 000000000000..0dac88d0e578
+ }
+}
+
-+static void process_ddsp_deferred_locals(struct rq *rq)
++static enum scx_cpu_preempt_reason
++preempt_reason_from_class(const struct sched_class *class)
++{
++#ifdef CONFIG_SMP
++ if (class == &stop_sched_class)
++ return SCX_CPU_PREEMPT_STOP;
++#endif
++ if (class == &dl_sched_class)
++ return SCX_CPU_PREEMPT_DL;
++ if (class == &rt_sched_class)
++ return SCX_CPU_PREEMPT_RT;
++ return SCX_CPU_PREEMPT_UNKNOWN;
++}
++
++static void switch_class_scx(struct rq *rq, struct task_struct *next)
+{
-+ struct task_struct *p, *tmp;
++ const struct sched_class *next_class = next->sched_class;
+
-+ lockdep_assert_rq_held(rq);
++ if (!scx_enabled())
++ return;
++#ifdef CONFIG_SMP
++ /*
++ * Pairs with the smp_load_acquire() issued by a CPU in
++ * kick_cpus_irq_workfn() which is waiting for this CPU to perform a
++ * resched.
++ */
++ smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1);
++#endif
++ if (!static_branch_unlikely(&scx_ops_cpu_preempt))
++ return;
+
+ /*
-+ * Now that @rq can be unlocked, execute the deferred enqueueing of
-+ * tasks directly dispatched to the local DSQs of other CPUs. See
-+ * direct_dispatch().
++ * The callback is conceptually meant to convey that the CPU is no
++ * longer under the control of SCX. Therefore, don't invoke the callback
++ * if the next class is below SCX (in which case the BPF scheduler has
++ * actively decided not to schedule any tasks on the CPU).
+ */
-+ list_for_each_entry_safe(p, tmp, &rq->scx.ddsp_deferred_locals,
-+ scx.dsq_list.node) {
-+ s32 ret;
++ if (sched_class_above(&ext_sched_class, next_class))
++ return;
+
-+ list_del_init(&p->scx.dsq_list.node);
++ /*
++ * At this point we know that SCX was preempted by a higher priority
++ * sched_class, so invoke the ->cpu_release() callback if we have not
++ * done so already. We only send the callback once between SCX being
++ * preempted, and it regaining control of the CPU.
++ *
++ * ->cpu_release() complements ->cpu_acquire(), which is emitted the
++ * next time that balance_scx() is invoked.
++ */
++ if (!rq->scx.cpu_released) {
++ if (SCX_HAS_OP(cpu_release)) {
++ struct scx_cpu_release_args args = {
++ .reason = preempt_reason_from_class(next_class),
++ .task = next,
++ };
+
-+ ret = dispatch_to_local_dsq(rq, p->scx.ddsp_dsq_id, p,
-+ p->scx.ddsp_enq_flags);
-+ WARN_ON_ONCE(ret == DTL_NOT_LOCAL);
++ SCX_CALL_OP(SCX_KF_CPU_RELEASE,
++ cpu_release, cpu_of(rq), &args);
++ }
++ rq->scx.cpu_released = true;
+ }
+}
+
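
A minimal sketch of the BPF-side ops.cpu_release() counterpart of the ->cpu_released handling above, modeled on tools/sched_ext/scx_qmap.bpf.c (the example_* name is illustrative; registration boilerplate omitted):

#include <scx/common.bpf.h>

void BPF_STRUCT_OPS(example_cpu_release, s32 cpu,
		    struct scx_cpu_release_args *args)
{
	/*
	 * A higher-priority sched class preempted this CPU. Tasks already
	 * sitting in its local DSQ would otherwise wait for the CPU to come
	 * back; push them back through ops.enqueue() so they can be placed
	 * elsewhere.
	 */
	scx_bpf_reenqueue_local();
}
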
+static void put_prev_task_scx(struct rq *rq, struct task_struct *p)
+{
-+#ifndef CONFIG_SMP
-+ /*
-+ * UP workaround.
-+ *
-+ * Because SCX may transfer tasks across CPUs during dispatch, dispatch
-+ * is performed from its balance operation which isn't called in UP.
-+ * Let's work around by calling it from the operations which come right
-+ * after.
-+ *
-+ * 1. If the prev task is on SCX, pick_next_task() calls
-+ * .put_prev_task() right after. As .put_prev_task() is also called
-+ * from other places, we need to distinguish the calls which can be
-+ * done by looking at the previous task's state - if still queued or
-+ * dequeued with %SCX_DEQ_SLEEP, the caller must be pick_next_task().
-+ * This case is handled here.
-+ *
-+ * 2. If the prev task is not on SCX, the first following call into SCX
-+ * will be .pick_next_task(), which is covered by calling
-+ * balance_scx() from pick_next_task_scx().
-+ *
-+ * Note that we can't merge the first case into the second as
-+ * balance_scx() must be called before the previous SCX task goes
-+ * through put_prev_task_scx().
-+ *
-+ * @rq is pinned and can't be unlocked. As UP doesn't transfer tasks
-+ * around, balance_one() doesn't need to.
-+ */
-+ if (p->scx.flags & (SCX_TASK_QUEUED | SCX_TASK_DEQD_FOR_SLEEP))
-+ balance_one(rq, p, true);
-+#endif
-+
+ update_curr_scx(rq);
+
+ /* see dequeue_task_scx() on why we skip when !QUEUED */
@@ -4269,7 +4802,7 @@ index 000000000000..0dac88d0e578
+ * scheduler class or core-sched forcing a different task. Leave
+ * it at the head of the local DSQ.
+ */
-+ if (p->scx.slice && !scx_ops_bypassing()) {
++ if (p->scx.slice && !scx_rq_bypassing(rq)) {
+ dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD);
+ return;
+ }
@@ -4300,12 +4833,6 @@ index 000000000000..0dac88d0e578
+{
+ struct task_struct *p;
+
-+#ifndef CONFIG_SMP
-+ /* UP workaround - see the comment at the head of put_prev_task_scx() */
-+ if (unlikely(rq->curr->sched_class != &ext_sched_class))
-+ balance_one(rq, rq->curr, true);
-+#endif
-+
+ p = first_local_task(rq);
+ if (!p)
+ return NULL;
@@ -4313,7 +4840,7 @@ index 000000000000..0dac88d0e578
+ set_next_task_scx(rq, p, true);
+
+ if (unlikely(!p->scx.slice)) {
-+ if (!scx_ops_bypassing() && !scx_warned_zero_slice) {
++ if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) {
+ printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n",
+ p->comm, p->pid);
+ scx_warned_zero_slice = true;
@@ -4350,7 +4877,7 @@ index 000000000000..0dac88d0e578
+ * calling ops.core_sched_before(). Accesses are controlled by the
+ * verifier.
+ */
-+ if (SCX_HAS_OP(core_sched_before) && !scx_ops_bypassing())
++ if (SCX_HAS_OP(core_sched_before) && !scx_rq_bypassing(task_rq(a)))
+ return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before,
+ (struct task_struct *)a,
+ (struct task_struct *)b);
@@ -4402,69 +4929,6 @@ index 000000000000..0dac88d0e578
+}
+#endif /* CONFIG_SCHED_CORE */
+
-+static enum scx_cpu_preempt_reason
-+preempt_reason_from_class(const struct sched_class *class)
-+{
-+#ifdef CONFIG_SMP
-+ if (class == &stop_sched_class)
-+ return SCX_CPU_PREEMPT_STOP;
-+#endif
-+ if (class == &dl_sched_class)
-+ return SCX_CPU_PREEMPT_DL;
-+ if (class == &rt_sched_class)
-+ return SCX_CPU_PREEMPT_RT;
-+ return SCX_CPU_PREEMPT_UNKNOWN;
-+}
-+
-+static void switch_class_scx(struct rq *rq, struct task_struct *next)
-+{
-+ const struct sched_class *next_class = next->sched_class;
-+
-+ if (!scx_enabled())
-+ return;
-+#ifdef CONFIG_SMP
-+ /*
-+ * Pairs with the smp_load_acquire() issued by a CPU in
-+ * kick_cpus_irq_workfn() who is waiting for this CPU to perform a
-+ * resched.
-+ */
-+ smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1);
-+#endif
-+ if (!static_branch_unlikely(&scx_ops_cpu_preempt))
-+ return;
-+
-+ /*
-+ * The callback is conceptually meant to convey that the CPU is no
-+ * longer under the control of SCX. Therefore, don't invoke the callback
-+ * if the next class is below SCX (in which case the BPF scheduler has
-+ * actively decided not to schedule any tasks on the CPU).
-+ */
-+ if (sched_class_above(&ext_sched_class, next_class))
-+ return;
-+
-+ /*
-+ * At this point we know that SCX was preempted by a higher priority
-+ * sched_class, so invoke the ->cpu_release() callback if we have not
-+ * done so already. We only send the callback once between SCX being
-+ * preempted, and it regaining control of the CPU.
-+ *
-+ * ->cpu_release() complements ->cpu_acquire(), which is emitted the
-+ * next time that balance_scx() is invoked.
-+ */
-+ if (!rq->scx.cpu_released) {
-+ if (SCX_HAS_OP(cpu_release)) {
-+ struct scx_cpu_release_args args = {
-+ .reason = preempt_reason_from_class(next_class),
-+ .task = next,
-+ };
-+
-+ SCX_CALL_OP(SCX_KF_CPU_RELEASE,
-+ cpu_release, cpu_of(rq), &args);
-+ }
-+ rq->scx.cpu_released = true;
-+ }
-+}
-+
+#ifdef CONFIG_SMP
+
+static bool test_and_clear_cpu_idle(int cpu)
@@ -4815,7 +5279,7 @@ index 000000000000..0dac88d0e578
+ * While disabling, always resched and refresh core-sched timestamp as
+ * we can't trust the slice management or ops.core_sched_before().
+ */
-+ if (scx_ops_bypassing()) {
++ if (scx_rq_bypassing(rq)) {
+ curr->scx.slice = 0;
+ touch_core_sched(rq, curr);
+ } else if (SCX_HAS_OP(tick)) {
@@ -4826,6 +5290,28 @@ index 000000000000..0dac88d0e578
+ resched_curr(rq);
+}
+
++#ifdef CONFIG_EXT_GROUP_SCHED
++static struct cgroup *tg_cgrp(struct task_group *tg)
++{
++ /*
++ * If CGROUP_SCHED is disabled, @tg is NULL. If @tg is an autogroup,
++ * @tg->css.cgroup is NULL. In both cases, @tg can be treated as the
++ * root cgroup.
++ */
++ if (tg && tg->css.cgroup)
++ return tg->css.cgroup;
++ else
++ return &cgrp_dfl_root.cgrp;
++}
++
++#define SCX_INIT_TASK_ARGS_CGROUP(tg) .cgroup = tg_cgrp(tg),
++
++#else /* CONFIG_EXT_GROUP_SCHED */
++
++#define SCX_INIT_TASK_ARGS_CGROUP(tg)
++
++#endif /* CONFIG_EXT_GROUP_SCHED */
++
+static enum scx_task_state scx_get_task_state(const struct task_struct *p)
+{
+ return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT;
@@ -4870,6 +5356,7 @@ index 000000000000..0dac88d0e578
+
+ if (SCX_HAS_OP(init_task)) {
+ struct scx_init_task_args args = {
++ SCX_INIT_TASK_ARGS_CGROUP(tg)
+ .fork = fork,
+ };
+
@@ -4883,24 +5370,29 @@ index 000000000000..0dac88d0e578
+ scx_set_task_state(p, SCX_TASK_INIT);
+
+ if (p->scx.disallow) {
-+ struct rq *rq;
-+ struct rq_flags rf;
++ if (!fork) {
++ struct rq *rq;
++ struct rq_flags rf;
+
-+ rq = task_rq_lock(p, &rf);
++ rq = task_rq_lock(p, &rf);
+
-+ /*
-+ * We're either in fork or load path and @p->policy will be
-+ * applied right after. Reverting @p->policy here and rejecting
-+ * %SCHED_EXT transitions from scx_check_setscheduler()
-+ * guarantees that if ops.init_task() sets @p->disallow, @p can
-+ * never be in SCX.
-+ */
-+ if (p->policy == SCHED_EXT) {
-+ p->policy = SCHED_NORMAL;
-+ atomic_long_inc(&scx_nr_rejected);
-+ }
++ /*
++ * We're in the load path and @p->policy will be applied
++ * right after. Reverting @p->policy here and rejecting
++ * %SCHED_EXT transitions from scx_check_setscheduler()
++ * guarantees that if ops.init_task() sets @p->disallow,
++ * @p can never be in SCX.
++ */
++ if (p->policy == SCHED_EXT) {
++ p->policy = SCHED_NORMAL;
++ atomic_long_inc(&scx_nr_rejected);
++ }
+
-+ task_rq_unlock(rq, p, &rf);
++ task_rq_unlock(rq, p, &rf);
++ } else if (p->policy == SCHED_EXT) {
++ scx_ops_error("ops.init_task() set task->scx.disallow for %s[%d] during fork",
++ p->comm, p->pid);
++ }
+ }
+
+ p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
@@ -4929,7 +5421,7 @@ index 000000000000..0dac88d0e578
+ scx_set_task_state(p, SCX_TASK_ENABLED);
+
+ if (SCX_HAS_OP(set_weight))
-+ SCX_CALL_OP(SCX_KF_REST, set_weight, p, p->scx.weight);
++ SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight);
+}
+
+static void scx_ops_disable_task(struct task_struct *p)
@@ -5004,7 +5496,7 @@ index 000000000000..0dac88d0e578
+{
+ percpu_rwsem_assert_held(&scx_fork_rwsem);
+
-+ if (scx_enabled())
++ if (scx_ops_init_task_enabled)
+ return scx_ops_init_task(p, task_group(p), true);
+ else
+ return 0;
@@ -5012,7 +5504,7 @@ index 000000000000..0dac88d0e578
+
+void scx_post_fork(struct task_struct *p)
+{
-+ if (scx_enabled()) {
++ if (scx_ops_init_task_enabled) {
+ scx_set_task_state(p, SCX_TASK_READY);
+
+ /*
@@ -5126,7 +5618,7 @@ index 000000000000..0dac88d0e578
+{
+ struct task_struct *p = rq->curr;
+
-+ if (scx_ops_bypassing())
++ if (scx_rq_bypassing(rq))
+ return false;
+
+ if (p->sched_class != &ext_sched_class)
@@ -5141,6 +5633,222 @@ index 000000000000..0dac88d0e578
+}
+#endif
+
++#ifdef CONFIG_EXT_GROUP_SCHED
++
++DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem);
++static bool scx_cgroup_enabled;
++static bool cgroup_warned_missing_weight;
++static bool cgroup_warned_missing_idle;
++
++static void scx_cgroup_warn_missing_weight(struct task_group *tg)
++{
++ if (scx_ops_enable_state() == SCX_OPS_DISABLED ||
++ cgroup_warned_missing_weight)
++ return;
++
++ if ((scx_ops.flags & SCX_OPS_HAS_CGROUP_WEIGHT) || !tg->css.parent)
++ return;
++
++ pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.weight\n",
++ scx_ops.name);
++ cgroup_warned_missing_weight = true;
++}
++
++static void scx_cgroup_warn_missing_idle(struct task_group *tg)
++{
++ if (!scx_cgroup_enabled || cgroup_warned_missing_idle)
++ return;
++
++ if (!tg->idle)
++ return;
++
++ pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.idle\n",
++ scx_ops.name);
++ cgroup_warned_missing_idle = true;
++}
++
++int scx_tg_online(struct task_group *tg)
++{
++ int ret = 0;
++
++ WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED));
++
++ percpu_down_read(&scx_cgroup_rwsem);
++
++ scx_cgroup_warn_missing_weight(tg);
++
++ if (scx_cgroup_enabled) {
++ if (SCX_HAS_OP(cgroup_init)) {
++ struct scx_cgroup_init_args args =
++ { .weight = tg->scx_weight };
++
++ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init,
++ tg->css.cgroup, &args);
++ if (ret)
++ ret = ops_sanitize_err("cgroup_init", ret);
++ }
++ if (ret == 0)
++ tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED;
++ } else {
++ tg->scx_flags |= SCX_TG_ONLINE;
++ }
++
++ percpu_up_read(&scx_cgroup_rwsem);
++ return ret;
++}
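
A minimal BPF-side sketch of the ops.cgroup_init() callback invoked above, loosely following tools/sched_ext/scx_flatcg.bpf.c (the example_* name is illustrative; registration boilerplate omitted):

#include <scx/common.bpf.h>

s32 BPF_STRUCT_OPS_SLEEPABLE(example_cgroup_init, struct cgroup *cgrp,
			     struct scx_cgroup_init_args *args)
{
	/* args->weight carries the cgroup's cpu.weight at init time */
	bpf_printk("cgroup %llu came online with weight %llu",
		   cgrp->kn->id, args->weight);
	return 0;
}
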
++
++void scx_tg_offline(struct task_group *tg)
++{
++ WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE));
++
++ percpu_down_read(&scx_cgroup_rwsem);
++
++ if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED))
++ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, tg->css.cgroup);
++ tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
++
++ percpu_up_read(&scx_cgroup_rwsem);
++}
++
++int scx_cgroup_can_attach(struct cgroup_taskset *tset)
++{
++ struct cgroup_subsys_state *css;
++ struct task_struct *p;
++ int ret;
++
++ /* released in scx_finish/cancel_attach() */
++ percpu_down_read(&scx_cgroup_rwsem);
++
++ if (!scx_cgroup_enabled)
++ return 0;
++
++ cgroup_taskset_for_each(p, css, tset) {
++ struct cgroup *from = tg_cgrp(task_group(p));
++ struct cgroup *to = tg_cgrp(css_tg(css));
++
++ WARN_ON_ONCE(p->scx.cgrp_moving_from);
++
++ /*
++ * sched_move_task() omits identity migrations. Let's match the
++ * behavior so that ops.cgroup_prep_move() and ops.cgroup_move()
++ * always match one-to-one.
++ */
++ if (from == to)
++ continue;
++
++ if (SCX_HAS_OP(cgroup_prep_move)) {
++ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_prep_move,
++ p, from, css->cgroup);
++ if (ret)
++ goto err;
++ }
++
++ p->scx.cgrp_moving_from = from;
++ }
++
++ return 0;
++
++err:
++ cgroup_taskset_for_each(p, css, tset) {
++ if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from)
++ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p,
++ p->scx.cgrp_moving_from, css->cgroup);
++ p->scx.cgrp_moving_from = NULL;
++ }
++
++ percpu_up_read(&scx_cgroup_rwsem);
++ return ops_sanitize_err("cgroup_prep_move", ret);
++}
++
++void scx_move_task(struct task_struct *p)
++{
++ if (!scx_cgroup_enabled)
++ return;
++
++ /*
++ * We're called from sched_move_task() which handles both cgroup and
++ * autogroup moves. Ignore the latter.
++ *
++ * Also ignore exiting tasks, because in the exit path tasks transition
++ * from the autogroup to the root group, so task_group_is_autogroup()
++ * alone isn't able to catch exiting autogroup tasks. This is safe for
++ * cgroup_move(), because cgroup migrations never happen for PF_EXITING
++ * tasks.
++ */
++ if (task_group_is_autogroup(task_group(p)) || (p->flags & PF_EXITING))
++ return;
++
++ /*
++ * @p must have ops.cgroup_prep_move() called on it and thus
++ * cgrp_moving_from set.
++ */
++ if (SCX_HAS_OP(cgroup_move) && !WARN_ON_ONCE(!p->scx.cgrp_moving_from))
++ SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, p,
++ p->scx.cgrp_moving_from, tg_cgrp(task_group(p)));
++ p->scx.cgrp_moving_from = NULL;
++}
++
++void scx_cgroup_finish_attach(void)
++{
++ percpu_up_read(&scx_cgroup_rwsem);
++}
++
++void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
++{
++ struct cgroup_subsys_state *css;
++ struct task_struct *p;
++
++ if (!scx_cgroup_enabled)
++ goto out_unlock;
++
++ cgroup_taskset_for_each(p, css, tset) {
++ if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from)
++ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p,
++ p->scx.cgrp_moving_from, css->cgroup);
++ p->scx.cgrp_moving_from = NULL;
++ }
++out_unlock:
++ percpu_up_read(&scx_cgroup_rwsem);
++}
++
++void scx_group_set_weight(struct task_group *tg, unsigned long weight)
++{
++ percpu_down_read(&scx_cgroup_rwsem);
++
++ if (scx_cgroup_enabled && tg->scx_weight != weight) {
++ if (SCX_HAS_OP(cgroup_set_weight))
++ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight,
++ tg_cgrp(tg), weight);
++ tg->scx_weight = weight;
++ }
++
++ percpu_up_read(&scx_cgroup_rwsem);
++}
++
++void scx_group_set_idle(struct task_group *tg, bool idle)
++{
++ percpu_down_read(&scx_cgroup_rwsem);
++ scx_cgroup_warn_missing_idle(tg);
++ percpu_up_read(&scx_cgroup_rwsem);
++}
++
++static void scx_cgroup_lock(void)
++{
++ percpu_down_write(&scx_cgroup_rwsem);
++}
++
++static void scx_cgroup_unlock(void)
++{
++ percpu_up_write(&scx_cgroup_rwsem);
++}
++
++#else /* CONFIG_EXT_GROUP_SCHED */
++
++static inline void scx_cgroup_lock(void) {}
++static inline void scx_cgroup_unlock(void) {}
++
++#endif /* CONFIG_EXT_GROUP_SCHED */
++
+/*
+ * Omitted operations:
+ *
@@ -5161,6 +5869,7 @@ index 000000000000..0dac88d0e578
+
+ .wakeup_preempt = wakeup_preempt_scx,
+
++ .balance = balance_scx,
+ .pick_next_task = pick_next_task_scx,
+
+ .put_prev_task = put_prev_task_scx,
@@ -5169,7 +5878,6 @@ index 000000000000..0dac88d0e578
+ .switch_class = switch_class_scx,
+
+#ifdef CONFIG_SMP
-+ .balance = balance_scx,
+ .select_task_rq = select_task_rq_scx,
+ .task_woken = task_woken_scx,
+ .set_cpus_allowed = set_cpus_allowed_scx,
@@ -5278,6 +5986,102 @@ index 000000000000..0dac88d0e578
+ rcu_read_unlock();
+}
+
++#ifdef CONFIG_EXT_GROUP_SCHED
++static void scx_cgroup_exit(void)
++{
++ struct cgroup_subsys_state *css;
++
++ percpu_rwsem_assert_held(&scx_cgroup_rwsem);
++
++ WARN_ON_ONCE(!scx_cgroup_enabled);
++ scx_cgroup_enabled = false;
++
++ /*
++ * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk
++ * cgroups and exit all the inited ones, all online cgroups are exited.
++ */
++ rcu_read_lock();
++ css_for_each_descendant_post(css, &root_task_group.css) {
++ struct task_group *tg = css_tg(css);
++
++ if (!(tg->scx_flags & SCX_TG_INITED))
++ continue;
++ tg->scx_flags &= ~SCX_TG_INITED;
++
++ if (!scx_ops.cgroup_exit)
++ continue;
++
++ if (WARN_ON_ONCE(!css_tryget(css)))
++ continue;
++ rcu_read_unlock();
++
++ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, css->cgroup);
++
++ rcu_read_lock();
++ css_put(css);
++ }
++ rcu_read_unlock();
++}
++
++static int scx_cgroup_init(void)
++{
++ struct cgroup_subsys_state *css;
++ int ret;
++
++ percpu_rwsem_assert_held(&scx_cgroup_rwsem);
++
++ cgroup_warned_missing_weight = false;
++ cgroup_warned_missing_idle = false;
++
++ /*
++ * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk
++ * cgroups and init, all online cgroups are initialized.
++ */
++ rcu_read_lock();
++ css_for_each_descendant_pre(css, &root_task_group.css) {
++ struct task_group *tg = css_tg(css);
++ struct scx_cgroup_init_args args = { .weight = tg->scx_weight };
++
++ scx_cgroup_warn_missing_weight(tg);
++ scx_cgroup_warn_missing_idle(tg);
++
++ if ((tg->scx_flags &
++ (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE)
++ continue;
++
++ if (!scx_ops.cgroup_init) {
++ tg->scx_flags |= SCX_TG_INITED;
++ continue;
++ }
++
++ if (WARN_ON_ONCE(!css_tryget(css)))
++ continue;
++ rcu_read_unlock();
++
++ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init,
++ css->cgroup, &args);
++ if (ret) {
++ css_put(css);
++ return ret;
++ }
++ tg->scx_flags |= SCX_TG_INITED;
++
++ rcu_read_lock();
++ css_put(css);
++ }
++ rcu_read_unlock();
++
++ WARN_ON_ONCE(scx_cgroup_enabled);
++ scx_cgroup_enabled = true;
++
++ return 0;
++}
++
++#else
++static void scx_cgroup_exit(void) {}
++static int scx_cgroup_init(void) { return 0; }
++#endif
++
+
+/********************************************************************************
+ * Sysfs interface and ops enable/disable.
@@ -5318,11 +6122,19 @@ index 000000000000..0dac88d0e578
+}
+SCX_ATTR(hotplug_seq);
+
++static ssize_t scx_attr_enable_seq_show(struct kobject *kobj,
++ struct kobj_attribute *ka, char *buf)
++{
++ return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_enable_seq));
++}
++SCX_ATTR(enable_seq);
++
+static struct attribute *scx_global_attrs[] = {
+ &scx_attr_state.attr,
+ &scx_attr_switch_all.attr,
+ &scx_attr_nr_rejected.attr,
+ &scx_attr_hotplug_seq.attr,
++ &scx_attr_enable_seq.attr,
+ NULL,
+};
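
A small userspace sketch for reading the new attribute, assuming it is exposed at /sys/kernel/sched_ext/enable_seq alongside the other global attributes above (the exact path is an assumption based on the sched-ext sysfs conventions):

#include <stdio.h>

int main(void)
{
	unsigned long seq;
	FILE *f = fopen("/sys/kernel/sched_ext/enable_seq", "r");

	if (!f)
		return 1;
	if (fscanf(f, "%lu", &seq) == 1)
		printf("sched_ext schedulers enabled so far: %lu\n", seq);
	fclose(f);
	return 0;
}
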
+
@@ -5421,16 +6233,8 @@ index 000000000000..0dac88d0e578
+ }
+
+ /*
-+ * We need to guarantee that no tasks are on the BPF scheduler while
-+ * bypassing. Either we see enabled or the enable path sees the
-+ * increased bypass_depth before moving tasks to SCX.
-+ */
-+ if (!scx_enabled())
-+ return;
-+
-+ /*
+ * No task property is changing. We just need to make sure all currently
-+ * queued tasks are re-queued according to the new scx_ops_bypassing()
++ * queued tasks are re-queued according to the new scx_rq_bypassing()
+ * state. As an optimization, walk each rq's runnable_list instead of
+ * the scx_tasks list.
+ *
@@ -5444,6 +6248,24 @@ index 000000000000..0dac88d0e578
+
+ rq_lock_irqsave(rq, &rf);
+
++ if (bypass) {
++ WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING);
++ rq->scx.flags |= SCX_RQ_BYPASSING;
++ } else {
++ WARN_ON_ONCE(!(rq->scx.flags & SCX_RQ_BYPASSING));
++ rq->scx.flags &= ~SCX_RQ_BYPASSING;
++ }
++
++ /*
++ * We need to guarantee that no tasks are on the BPF scheduler
++ * while bypassing. Either we see enabled or the enable path
++ * sees scx_rq_bypassing() before moving tasks to SCX.
++ */
++ if (!scx_enabled()) {
++ rq_unlock_irqrestore(rq, &rf);
++ continue;
++ }
++
+ /*
+ * The use of list_for_each_entry_safe_reverse() is required
+ * because each task is going to be removed from and added back
@@ -5499,11 +6321,11 @@ index 000000000000..0dac88d0e578
+{
+ switch (kind) {
+ case SCX_EXIT_UNREG:
-+ return "Scheduler unregistered from user space";
++ return "unregistered from user space";
+ case SCX_EXIT_UNREG_BPF:
-+ return "Scheduler unregistered from BPF";
++ return "unregistered from BPF";
+ case SCX_EXIT_UNREG_KERN:
-+ return "Scheduler unregistered from the main kernel";
++ return "unregistered from the main kernel";
+ case SCX_EXIT_SYSRQ:
+ return "disabled by sysrq-S";
+ case SCX_EXIT_ERROR:
@@ -5569,66 +6391,64 @@ index 000000000000..0dac88d0e578
+ WRITE_ONCE(scx_switching_all, false);
+
+ /*
-+ * Avoid racing against fork. See scx_ops_enable() for explanation on
-+ * the locking order.
++ * Shut down cgroup support before tasks so that the cgroup attach path
++ * doesn't race against scx_ops_exit_task().
++ */
++ scx_cgroup_lock();
++ scx_cgroup_exit();
++ scx_cgroup_unlock();
++
++ /*
++ * The BPF scheduler is going away. All tasks including %TASK_DEAD ones
++ * must be switched out and exited synchronously.
+ */
+ percpu_down_write(&scx_fork_rwsem);
-+ cpus_read_lock();
++
++ scx_ops_init_task_enabled = false;
+
+ spin_lock_irq(&scx_tasks_lock);
+ scx_task_iter_init(&sti);
-+ /*
-+ * Invoke scx_ops_exit_task() on all non-idle tasks, including
-+ * TASK_DEAD tasks. Because dead tasks may have a nonzero refcount,
-+ * we may not have invoked sched_ext_free() on them by the time a
-+ * scheduler is disabled. We must therefore exit the task here, or we'd
-+ * fail to invoke ops.exit_task(), as the scheduler will have been
-+ * unloaded by the time the task is subsequently exited on the
-+ * sched_ext_free() path.
-+ */
-+ while ((p = scx_task_iter_next_locked(&sti, true))) {
++ while ((p = scx_task_iter_next_locked(&sti))) {
+ const struct sched_class *old_class = p->sched_class;
+ struct sched_enq_and_set_ctx ctx;
+
-+ if (READ_ONCE(p->__state) != TASK_DEAD) {
-+ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE,
-+ &ctx);
++ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
+
-+ p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL);
-+ __setscheduler_prio(p, p->prio);
-+ check_class_changing(task_rq(p), p, old_class);
++ p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL);
++ __setscheduler_prio(p, p->prio);
++ check_class_changing(task_rq(p), p, old_class);
+
-+ sched_enq_and_set_task(&ctx);
++ sched_enq_and_set_task(&ctx);
+
-+ check_class_changed(task_rq(p), p, old_class, p->prio);
-+ }
++ check_class_changed(task_rq(p), p, old_class, p->prio);
+ scx_ops_exit_task(p);
+ }
+ scx_task_iter_exit(&sti);
+ spin_unlock_irq(&scx_tasks_lock);
++ percpu_up_write(&scx_fork_rwsem);
+
+ /* no task is on scx, turn off all the switches and flush in-progress calls */
-+ static_branch_disable_cpuslocked(&__scx_ops_enabled);
++ static_branch_disable(&__scx_ops_enabled);
+ for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
-+ static_branch_disable_cpuslocked(&scx_has_op[i]);
-+ static_branch_disable_cpuslocked(&scx_ops_enq_last);
-+ static_branch_disable_cpuslocked(&scx_ops_enq_exiting);
-+ static_branch_disable_cpuslocked(&scx_ops_cpu_preempt);
-+ static_branch_disable_cpuslocked(&scx_builtin_idle_enabled);
++ static_branch_disable(&scx_has_op[i]);
++ static_branch_disable(&scx_ops_enq_last);
++ static_branch_disable(&scx_ops_enq_exiting);
++ static_branch_disable(&scx_ops_cpu_preempt);
++ static_branch_disable(&scx_builtin_idle_enabled);
+ synchronize_rcu();
+
-+ cpus_read_unlock();
-+ percpu_up_write(&scx_fork_rwsem);
-+
+ if (ei->kind >= SCX_EXIT_ERROR) {
-+ printk(KERN_ERR "sched_ext: BPF scheduler \"%s\" errored, disabling\n", scx_ops.name);
-+
-+ if (ei->msg[0] == '\0')
-+ printk(KERN_ERR "sched_ext: %s\n", ei->reason);
-+ else
-+ printk(KERN_ERR "sched_ext: %s (%s)\n", ei->reason, ei->msg);
++ pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
++ scx_ops.name, ei->reason);
+
++ if (ei->msg[0] != '\0')
++ pr_err("sched_ext: %s: %s\n", scx_ops.name, ei->msg);
++#ifdef CONFIG_STACKTRACE
+ stack_trace_print(ei->bt, ei->bt_len, 2);
++#endif
++ } else {
++ pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
++ scx_ops.name, ei->reason);
+ }
+
+ if (scx_ops.exit)
@@ -5817,7 +6637,7 @@ index 000000000000..0dac88d0e578
+ static unsigned long bt[SCX_EXIT_BT_LEN];
+ char dsq_id_buf[19] = "(n/a)";
+ unsigned long ops_state = atomic_long_read(&p->scx.ops_state);
-+ unsigned int bt_len;
++ unsigned int bt_len = 0;
+
+ if (p->scx.dsq)
+ scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx",
@@ -5842,7 +6662,9 @@ index 000000000000..0dac88d0e578
+ ops_dump_exit();
+ }
+
++#ifdef CONFIG_STACKTRACE
+ bt_len = stack_trace_save_tsk(p, bt, SCX_EXIT_BT_LEN, 1);
++#endif
+ if (bt_len) {
+ dump_newline(s);
+ dump_stack_trace(s, " ", bt, bt_len);
@@ -6000,10 +6822,10 @@ index 000000000000..0dac88d0e578
+ return;
+
+ ei->exit_code = exit_code;
-+
++#ifdef CONFIG_STACKTRACE
+ if (kind >= SCX_EXIT_ERROR)
+ ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1);
-+
++#endif
+ va_start(args, fmt);
+ vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args);
+ va_end(args);
@@ -6061,12 +6883,12 @@ index 000000000000..0dac88d0e578
+ return 0;
+}
+
-+static int scx_ops_enable(struct sched_ext_ops *ops)
++static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
+{
+ struct scx_task_iter sti;
+ struct task_struct *p;
+ unsigned long timeout;
-+ int i, cpu, ret;
++ int i, cpu, node, ret;
+
+ if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
+ cpu_possible_mask)) {
@@ -6085,6 +6907,34 @@ index 000000000000..0dac88d0e578
+ }
+ }
+
++ if (!global_dsqs) {
++ struct scx_dispatch_q **dsqs;
++
++ dsqs = kcalloc(nr_node_ids, sizeof(dsqs[0]), GFP_KERNEL);
++ if (!dsqs) {
++ ret = -ENOMEM;
++ goto err_unlock;
++ }
++
++ for_each_node_state(node, N_POSSIBLE) {
++ struct scx_dispatch_q *dsq;
++
++ dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node);
++ if (!dsq) {
++ for_each_node_state(node, N_POSSIBLE)
++ kfree(dsqs[node]);
++ kfree(dsqs);
++ ret = -ENOMEM;
++ goto err_unlock;
++ }
++
++ init_dsq(dsq, SCX_DSQ_GLOBAL);
++ dsqs[node] = dsq;
++ }
++
++ global_dsqs = dsqs;
++ }
++
+ if (scx_ops_enable_state() != SCX_OPS_DISABLED) {
+ ret = -EBUSY;
+ goto err_unlock;
@@ -6108,12 +6958,12 @@ index 000000000000..0dac88d0e578
+ }
+
+ /*
-+ * Set scx_ops, transition to PREPPING and clear exit info to arm the
++ * Set scx_ops, transition to ENABLING and clear exit info to arm the
+ * disable path. Failure triggers full disabling from here on.
+ */
+ scx_ops = *ops;
+
-+ WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_PREPPING) !=
++ WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_ENABLING) !=
+ SCX_OPS_DISABLED);
+
+ atomic_set(&scx_exit_kind, SCX_EXIT_NONE);
@@ -6134,7 +6984,8 @@ index 000000000000..0dac88d0e578
+ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init);
+ if (ret) {
+ ret = ops_sanitize_err("init", ret);
-+ goto err_disable_unlock_cpus;
++ cpus_read_unlock();
++ goto err_disable;
+ }
+ }
+
@@ -6142,6 +6993,7 @@ index 000000000000..0dac88d0e578
+ if (((void (**)(void))ops)[i])
+ static_branch_enable_cpuslocked(&scx_has_op[i]);
+
++ check_hotplug_seq(ops);
+ cpus_read_unlock();
+
+ ret = validate_ops(ops);
@@ -6169,42 +7021,40 @@ index 000000000000..0dac88d0e578
+ scx_watchdog_timeout / 2);
+
+ /*
-+ * Lock out forks before opening the floodgate so that they don't wander
-+ * into the operations prematurely.
-+ *
-+ * We don't need to keep the CPUs stable but grab cpus_read_lock() to
-+ * ease future locking changes for cgroup suport.
-+ *
-+ * Note that cpu_hotplug_lock must nest inside scx_fork_rwsem due to the
-+ * following dependency chain:
-+ *
-+ * scx_fork_rwsem --> pernet_ops_rwsem --> cpu_hotplug_lock
++ * Once __scx_ops_enabled is set, %current can be switched to SCX
++ * anytime. This can lead to stalls as some BPF schedulers (e.g.
++ * userspace scheduling) may not function correctly before all tasks are
++ * switched. Init in bypass mode to guarantee forward progress.
+ */
-+ percpu_down_write(&scx_fork_rwsem);
-+ cpus_read_lock();
-+
-+ check_hotplug_seq(ops);
++ scx_ops_bypass(true);
+
+ for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
+ if (((void (**)(void))ops)[i])
-+ static_branch_enable_cpuslocked(&scx_has_op[i]);
++ static_branch_enable(&scx_has_op[i]);
+
+ if (ops->flags & SCX_OPS_ENQ_LAST)
-+ static_branch_enable_cpuslocked(&scx_ops_enq_last);
++ static_branch_enable(&scx_ops_enq_last);
+
+ if (ops->flags & SCX_OPS_ENQ_EXITING)
-+ static_branch_enable_cpuslocked(&scx_ops_enq_exiting);
++ static_branch_enable(&scx_ops_enq_exiting);
+ if (scx_ops.cpu_acquire || scx_ops.cpu_release)
-+ static_branch_enable_cpuslocked(&scx_ops_cpu_preempt);
++ static_branch_enable(&scx_ops_cpu_preempt);
+
+ if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) {
+ reset_idle_masks();
-+ static_branch_enable_cpuslocked(&scx_builtin_idle_enabled);
++ static_branch_enable(&scx_builtin_idle_enabled);
+ } else {
-+ static_branch_disable_cpuslocked(&scx_builtin_idle_enabled);
++ static_branch_disable(&scx_builtin_idle_enabled);
+ }
+
-+ static_branch_enable_cpuslocked(&__scx_ops_enabled);
++ /*
++ * Lock out forks, cgroup on/offlining and moves before opening the
++ * floodgate so that they don't wander into the operations prematurely.
++ */
++ percpu_down_write(&scx_fork_rwsem);
++
++ WARN_ON_ONCE(scx_ops_init_task_enabled);
++ scx_ops_init_task_enabled = true;
+
+ /*
+ * Enable ops for every task. Fork is excluded by scx_fork_rwsem
@@ -6212,12 +7062,29 @@ index 000000000000..0dac88d0e578
+ * leaving as sched_ext_free() can handle both prepped and enabled
+ * tasks. Prep all tasks first and then enable them with preemption
+ * disabled.
++ *
++ * All cgroups should be initialized before scx_ops_init_task() so that
++ * the BPF scheduler can reliably track each task's cgroup membership
++ * from scx_ops_init_task(). Lock out cgroup on/offlining and task
++ * migrations while tasks are being initialized so that
++ * scx_cgroup_can_attach() never sees uninitialized tasks.
+ */
-+ spin_lock_irq(&scx_tasks_lock);
++ scx_cgroup_lock();
++ ret = scx_cgroup_init();
++ if (ret)
++ goto err_disable_unlock_all;
+
++ spin_lock_irq(&scx_tasks_lock);
+ scx_task_iter_init(&sti);
-+ while ((p = scx_task_iter_next_locked(&sti, false))) {
-+ get_task_struct(p);
++ while ((p = scx_task_iter_next_locked(&sti))) {
++ /*
++		 * @p may already be dead, have lost all its usage counts and
++ * be waiting for RCU grace period before being freed. @p can't
++ * be initialized for SCX in such cases and should be ignored.
++ */
++ if (!tryget_task_struct(p))
++ continue;
++
+ scx_task_iter_rq_unlock(&sti);
+ spin_unlock_irq(&scx_tasks_lock);
+
@@ -6232,51 +7099,37 @@ index 000000000000..0dac88d0e578
+ goto err_disable_unlock_all;
+ }
+
++ scx_set_task_state(p, SCX_TASK_READY);
++
+ put_task_struct(p);
+ spin_lock_irq(&scx_tasks_lock);
+ }
+ scx_task_iter_exit(&sti);
++ spin_unlock_irq(&scx_tasks_lock);
++ scx_cgroup_unlock();
++ percpu_up_write(&scx_fork_rwsem);
+
+ /*
-+ * All tasks are prepped but are still ops-disabled. Ensure that
-+ * %current can't be scheduled out and switch everyone.
-+ * preempt_disable() is necessary because we can't guarantee that
-+ * %current won't be starved if scheduled out while switching.
-+ */
-+ preempt_disable();
-+
-+ /*
-+ * From here on, the disable path must assume that tasks have ops
-+ * enabled and need to be recovered.
-+ *
-+ * Transition to ENABLING fails iff the BPF scheduler has already
-+ * triggered scx_bpf_error(). Returning an error code here would lose
-+ * the recorded error information. Exit indicating success so that the
-+ * error is notified through ops.exit() with all the details.
++ * All tasks are READY. It's safe to turn on scx_enabled() and switch
++ * all eligible tasks.
+ */
-+ if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLING, SCX_OPS_PREPPING)) {
-+ preempt_enable();
-+ spin_unlock_irq(&scx_tasks_lock);
-+ WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE);
-+ ret = 0;
-+ goto err_disable_unlock_all;
-+ }
++ WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL));
++ static_branch_enable(&__scx_ops_enabled);
+
+ /*
-+ * We're fully committed and can't fail. The PREPPED -> ENABLED
++ * We're fully committed and can't fail. The task READY -> ENABLED
+ * transitions here are synchronized against sched_ext_free() through
+ * scx_tasks_lock.
+ */
-+ WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL));
-+
++ percpu_down_write(&scx_fork_rwsem);
++ spin_lock_irq(&scx_tasks_lock);
+ scx_task_iter_init(&sti);
-+ while ((p = scx_task_iter_next_locked(&sti, false))) {
++ while ((p = scx_task_iter_next_locked(&sti))) {
+ const struct sched_class *old_class = p->sched_class;
+ struct sched_enq_and_set_ctx ctx;
+
+ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
+
-+ scx_set_task_state(p, SCX_TASK_READY);
+ __setscheduler_prio(p, p->prio);
+ check_class_changing(task_rq(p), p, old_class);
+
@@ -6285,13 +7138,16 @@ index 000000000000..0dac88d0e578
+ check_class_changed(task_rq(p), p, old_class, p->prio);
+ }
+ scx_task_iter_exit(&sti);
-+
+ spin_unlock_irq(&scx_tasks_lock);
-+ preempt_enable();
-+ cpus_read_unlock();
+ percpu_up_write(&scx_fork_rwsem);
+
-+ /* see above ENABLING transition for the explanation on exiting with 0 */
++ scx_ops_bypass(false);
++
++ /*
++ * Returning an error code here would lose the recorded error
++ * information. Exit indicating success so that the error is notified
++ * through ops.exit() with all the details.
++ */
+ if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) {
+ WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE);
+ ret = 0;
@@ -6301,9 +7157,13 @@ index 000000000000..0dac88d0e578
+ if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL))
+ static_branch_enable(&__scx_switched_all);
+
++ pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n",
++ scx_ops.name, scx_switched_all() ? "" : " (partial)");
+ kobject_uevent(scx_root_kobj, KOBJ_ADD);
+ mutex_unlock(&scx_ops_enable_mutex);
+
++ atomic_long_inc(&scx_enable_seq);
++
+ return 0;
+
+err_del:
@@ -6320,9 +7180,9 @@ index 000000000000..0dac88d0e578
+ return ret;
+
+err_disable_unlock_all:
++ scx_cgroup_unlock();
+ percpu_up_write(&scx_fork_rwsem);
-+err_disable_unlock_cpus:
-+ cpus_read_unlock();
++ scx_ops_bypass(false);
+err_disable:
+ mutex_unlock(&scx_ops_enable_mutex);
+ /* must be fully disabled before returning */
@@ -6514,6 +7374,11 @@ index 000000000000..0dac88d0e578
+
+ switch (moff) {
+ case offsetof(struct sched_ext_ops, init_task):
++#ifdef CONFIG_EXT_GROUP_SCHED
++ case offsetof(struct sched_ext_ops, cgroup_init):
++ case offsetof(struct sched_ext_ops, cgroup_exit):
++ case offsetof(struct sched_ext_ops, cgroup_prep_move):
++#endif
+ case offsetof(struct sched_ext_ops, cpu_online):
+ case offsetof(struct sched_ext_ops, cpu_offline):
+ case offsetof(struct sched_ext_ops, init):
@@ -6527,12 +7392,12 @@ index 000000000000..0dac88d0e578
+ return 0;
+}
+
-+static int bpf_scx_reg(void *kdata)
++static int bpf_scx_reg(void *kdata, struct bpf_link *link)
+{
-+ return scx_ops_enable(kdata);
++ return scx_ops_enable(kdata, link);
+}
+
-+static void bpf_scx_unreg(void *kdata)
++static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
+{
+ scx_ops_disable(SCX_EXIT_UNREG);
+ kthread_flush_work(&scx_ops_disable_work);
@@ -6551,7 +7416,7 @@ index 000000000000..0dac88d0e578
+ return 0;
+}
+
-+static int bpf_scx_update(void *kdata, void *old_kdata)
++static int bpf_scx_update(void *kdata, void *old_kdata, struct bpf_link *link)
+{
+ /*
+ * sched_ext does not support updating the actively-loaded BPF
@@ -6572,6 +7437,7 @@ index 000000000000..0dac88d0e578
+static void enqueue_stub(struct task_struct *p, u64 enq_flags) {}
+static void dequeue_stub(struct task_struct *p, u64 enq_flags) {}
+static void dispatch_stub(s32 prev_cpu, struct task_struct *p) {}
++static void tick_stub(struct task_struct *p) {}
+static void runnable_stub(struct task_struct *p, u64 enq_flags) {}
+static void running_stub(struct task_struct *p) {}
+static void stopping_stub(struct task_struct *p, bool runnable) {}
@@ -6587,16 +7453,28 @@ index 000000000000..0dac88d0e578
+static void exit_task_stub(struct task_struct *p, struct scx_exit_task_args *args) {}
+static void enable_stub(struct task_struct *p) {}
+static void disable_stub(struct task_struct *p) {}
++#ifdef CONFIG_EXT_GROUP_SCHED
++static s32 cgroup_init_stub(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; }
++static void cgroup_exit_stub(struct cgroup *cgrp) {}
++static s32 cgroup_prep_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; }
++static void cgroup_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
++static void cgroup_cancel_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
++static void cgroup_set_weight_stub(struct cgroup *cgrp, u32 weight) {}
++#endif
+static void cpu_online_stub(s32 cpu) {}
+static void cpu_offline_stub(s32 cpu) {}
+static s32 init_stub(void) { return -EINVAL; }
+static void exit_stub(struct scx_exit_info *info) {}
++static void dump_stub(struct scx_dump_ctx *ctx) {}
++static void dump_cpu_stub(struct scx_dump_ctx *ctx, s32 cpu, bool idle) {}
++static void dump_task_stub(struct scx_dump_ctx *ctx, struct task_struct *p) {}
+
+static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
+ .select_cpu = select_cpu_stub,
+ .enqueue = enqueue_stub,
+ .dequeue = dequeue_stub,
+ .dispatch = dispatch_stub,
++ .tick = tick_stub,
+ .runnable = runnable_stub,
+ .running = running_stub,
+ .stopping = stopping_stub,
@@ -6612,10 +7490,21 @@ index 000000000000..0dac88d0e578
+ .exit_task = exit_task_stub,
+ .enable = enable_stub,
+ .disable = disable_stub,
++#ifdef CONFIG_EXT_GROUP_SCHED
++ .cgroup_init = cgroup_init_stub,
++ .cgroup_exit = cgroup_exit_stub,
++ .cgroup_prep_move = cgroup_prep_move_stub,
++ .cgroup_move = cgroup_move_stub,
++ .cgroup_cancel_move = cgroup_cancel_move_stub,
++ .cgroup_set_weight = cgroup_set_weight_stub,
++#endif
+ .cpu_online = cpu_online_stub,
+ .cpu_offline = cpu_offline_stub,
+ .init = init_stub,
+ .exit = exit_stub,
++ .dump = dump_stub,
++ .dump_cpu = dump_cpu_stub,
++ .dump_task = dump_task_stub,
+};
+
+static struct bpf_struct_ops bpf_sched_ext_ops = {
@@ -6858,10 +7747,10 @@ index 000000000000..0dac88d0e578
+ * definitions so that BPF scheduler implementations can use them
+ * through the generated vmlinux.h.
+ */
-+ WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT);
++ WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT |
++ SCX_TG_ONLINE);
+
+ BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params));
-+ init_dsq(&scx_dsq_global, SCX_DSQ_GLOBAL);
+#ifdef CONFIG_SMP
+ BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL));
+ BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL));
@@ -6903,35 +7792,6 @@ index 000000000000..0dac88d0e578
+__bpf_kfunc_start_defs();
+
+/**
-+ * scx_bpf_create_dsq - Create a custom DSQ
-+ * @dsq_id: DSQ to create
-+ * @node: NUMA node to allocate from
-+ *
-+ * Create a custom DSQ identified by @dsq_id. Can be called from any sleepable
-+ * scx callback, and any BPF_PROG_TYPE_SYSCALL prog.
-+ */
-+__bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node)
-+{
-+ if (unlikely(node >= (int)nr_node_ids ||
-+ (node < 0 && node != NUMA_NO_NODE)))
-+ return -EINVAL;
-+ return PTR_ERR_OR_ZERO(create_dsq(dsq_id, node));
-+}
-+
-+__bpf_kfunc_end_defs();
-+
-+BTF_KFUNCS_START(scx_kfunc_ids_sleepable)
-+BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE)
-+BTF_KFUNCS_END(scx_kfunc_ids_sleepable)
-+
-+static const struct btf_kfunc_id_set scx_kfunc_set_sleepable = {
-+ .owner = THIS_MODULE,
-+ .set = &scx_kfunc_ids_sleepable,
-+};
-+
-+__bpf_kfunc_start_defs();
-+
-+/**
+ * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu()
+ * @p: task_struct to select a CPU for
+ * @prev_cpu: CPU @p was on previously
@@ -7021,7 +7881,7 @@ index 000000000000..0dac88d0e578
+ * scx_bpf_dispatch - Dispatch a task into the FIFO queue of a DSQ
+ * @p: task_struct to dispatch
+ * @dsq_id: DSQ to dispatch to
-+ * @slice: duration @p can run for in nsecs
++ * @slice: duration @p can run for in nsecs, 0 to keep the current value
+ * @enq_flags: SCX_ENQ_*
+ *
+ * Dispatch @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe
@@ -7071,7 +7931,7 @@ index 000000000000..0dac88d0e578
+ * scx_bpf_dispatch_vtime - Dispatch a task into the vtime priority queue of a DSQ
+ * @p: task_struct to dispatch
+ * @dsq_id: DSQ to dispatch to
-+ * @slice: duration @p can run for in nsecs
++ * @slice: duration @p can run for in nsecs, 0 to keep the current value
+ * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ
+ * @enq_flags: SCX_ENQ_*
+ *
@@ -7112,6 +7972,118 @@ index 000000000000..0dac88d0e578
+ .set = &scx_kfunc_ids_enqueue_dispatch,
+};
+
++static bool scx_dispatch_from_dsq(struct bpf_iter_scx_dsq_kern *kit,
++ struct task_struct *p, u64 dsq_id,
++ u64 enq_flags)
++{
++ struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq;
++ struct rq *this_rq, *src_rq, *dst_rq, *locked_rq;
++ bool dispatched = false;
++ bool in_balance;
++ unsigned long flags;
++
++ if (!scx_kf_allowed_if_unlocked() && !scx_kf_allowed(SCX_KF_DISPATCH))
++ return false;
++
++ /*
++ * Can be called from either ops.dispatch() locking this_rq() or any
++	 * context where no rq lock is held. If the latter, lock @p's task_rq which
++ * we'll likely need anyway.
++ */
++ src_rq = task_rq(p);
++
++ local_irq_save(flags);
++ this_rq = this_rq();
++ in_balance = this_rq->scx.flags & SCX_RQ_IN_BALANCE;
++
++ if (in_balance) {
++ if (this_rq != src_rq) {
++ raw_spin_rq_unlock(this_rq);
++ raw_spin_rq_lock(src_rq);
++ }
++ } else {
++ raw_spin_rq_lock(src_rq);
++ }
++
++ locked_rq = src_rq;
++ raw_spin_lock(&src_dsq->lock);
++
++ /*
++ * Did someone else get to it? @p could have already left $src_dsq, got
++	 * re-enqueued, or be in the process of being consumed by someone else.
++ */
++ if (unlikely(p->scx.dsq != src_dsq ||
++ u32_before(kit->cursor.priv, p->scx.dsq_seq) ||
++ p->scx.holding_cpu >= 0) ||
++ WARN_ON_ONCE(src_rq != task_rq(p))) {
++ raw_spin_unlock(&src_dsq->lock);
++ goto out;
++ }
++
++ /* @p is still on $src_dsq and stable, determine the destination */
++ dst_dsq = find_dsq_for_dispatch(this_rq, dsq_id, p);
++
++ if (dst_dsq->id == SCX_DSQ_LOCAL) {
++ dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
++ if (!task_can_run_on_remote_rq(p, dst_rq, true)) {
++ dst_dsq = find_global_dsq(p);
++ dst_rq = src_rq;
++ }
++ } else {
++ /* no need to migrate if destination is a non-local DSQ */
++ dst_rq = src_rq;
++ }
++
++ /*
++ * Move @p into $dst_dsq. If $dst_dsq is the local DSQ of a different
++ * CPU, @p will be migrated.
++ */
++ if (dst_dsq->id == SCX_DSQ_LOCAL) {
++ /* @p is going from a non-local DSQ to a local DSQ */
++ if (src_rq == dst_rq) {
++ task_unlink_from_dsq(p, src_dsq);
++ move_local_task_to_local_dsq(p, enq_flags,
++ src_dsq, dst_rq);
++ raw_spin_unlock(&src_dsq->lock);
++ } else {
++ raw_spin_unlock(&src_dsq->lock);
++ move_remote_task_to_local_dsq(p, enq_flags,
++ src_rq, dst_rq);
++ locked_rq = dst_rq;
++ }
++ } else {
++ /*
++ * @p is going from a non-local DSQ to a non-local DSQ. As
++ * $src_dsq is already locked, do an abbreviated dequeue.
++ */
++ task_unlink_from_dsq(p, src_dsq);
++ p->scx.dsq = NULL;
++ raw_spin_unlock(&src_dsq->lock);
++
++ if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME)
++ p->scx.dsq_vtime = kit->vtime;
++ dispatch_enqueue(dst_dsq, p, enq_flags);
++ }
++
++ if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_SLICE)
++ p->scx.slice = kit->slice;
++
++ dispatched = true;
++out:
++ if (in_balance) {
++ if (this_rq != locked_rq) {
++ raw_spin_rq_unlock(locked_rq);
++ raw_spin_rq_lock(this_rq);
++ }
++ } else {
++ raw_spin_rq_unlock_irqrestore(locked_rq, flags);
++ }
++
++ kit->cursor.flags &= ~(__SCX_DSQ_ITER_HAS_SLICE |
++ __SCX_DSQ_ITER_HAS_VTIME);
++ return dispatched;
++}
++
+__bpf_kfunc_start_defs();
+
+/**
@@ -7171,7 +8143,7 @@ index 000000000000..0dac88d0e578
+
+ flush_dispatch_buf(dspc->rq);
+
-+ dsq = find_non_local_dsq(dsq_id);
++ dsq = find_user_dsq(dsq_id);
+ if (unlikely(!dsq)) {
+ scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id);
+ return false;
@@ -7191,12 +8163,112 @@ index 000000000000..0dac88d0e578
+ }
+}
+
++/**
++ * scx_bpf_dispatch_from_dsq_set_slice - Override slice when dispatching from DSQ
++ * @it__iter: DSQ iterator in progress
++ * @slice: duration the dispatched task can run for in nsecs
++ *
++ * Override the slice of the next task that will be dispatched from @it__iter
++ * using scx_bpf_dispatch_from_dsq[_vtime](). If this function is not called,
++ * the previous slice duration is kept.
++ */
++__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_slice(
++ struct bpf_iter_scx_dsq *it__iter, u64 slice)
++{
++ struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter;
++
++ kit->slice = slice;
++ kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE;
++}
++
++/**
++ * scx_bpf_dispatch_from_dsq_set_vtime - Override vtime when dispatching from DSQ
++ * @it__iter: DSQ iterator in progress
++ * @vtime: task's ordering inside the vtime-sorted queue of the target DSQ
++ *
++ * Override the vtime of the next task that will be dispatched from @it__iter
++ * using scx_bpf_dispatch_vtime_from_dsq(). If this function is not called, the
++ * previous vtime is kept. If scx_bpf_dispatch_from_dsq() is used to
++ * dispatch the next task, the override is ignored and cleared.
++ */
++__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime(
++ struct bpf_iter_scx_dsq *it__iter, u64 vtime)
++{
++ struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter;
++
++ kit->vtime = vtime;
++ kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME;
++}
++
++/**
++ * scx_bpf_dispatch_from_dsq - Move a task from DSQ iteration to a DSQ
++ * @it__iter: DSQ iterator in progress
++ * @p: task to transfer
++ * @dsq_id: DSQ to move @p to
++ * @enq_flags: SCX_ENQ_*
++ *
++ * Transfer @p which is on the DSQ currently iterated by @it__iter to the DSQ
++ * specified by @dsq_id. All DSQs - local DSQs, global DSQ and user DSQs - can
++ * be the destination.
++ *
++ * For the transfer to be successful, @p must still be on the DSQ and have been
++ * queued before the DSQ iteration started. This function doesn't care whether
++ * @p was obtained from the DSQ iteration. @p just has to be on the DSQ and have
++ * been queued before the iteration started.
++ *
++ * @p's slice is kept by default. Use scx_bpf_dispatch_from_dsq_set_slice() to
++ * update.
++ *
++ * Can be called from ops.dispatch() or any BPF context which doesn't hold a rq
++ * lock (e.g. BPF timers or SYSCALL programs).
++ *
++ * Returns %true if @p has been consumed, %false if @p had already been consumed
++ * or dequeued.
++ */
++__bpf_kfunc bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter,
++ struct task_struct *p, u64 dsq_id,
++ u64 enq_flags)
++{
++ return scx_dispatch_from_dsq((struct bpf_iter_scx_dsq_kern *)it__iter,
++ p, dsq_id, enq_flags);
++}
++
++/**
++ * scx_bpf_dispatch_vtime_from_dsq - Move a task from DSQ iteration to a PRIQ DSQ
++ * @it__iter: DSQ iterator in progress
++ * @p: task to transfer
++ * @dsq_id: DSQ to move @p to
++ * @enq_flags: SCX_ENQ_*
++ *
++ * Transfer @p which is on the DSQ currently iterated by @it__iter to the
++ * priority queue of the DSQ specified by @dsq_id. The destination must be a
++ * user DSQ as only user DSQs support priority queue.
++ *
++ * @p's slice and vtime are kept by default. Use
++ * scx_bpf_dispatch_from_dsq_set_slice() and
++ * scx_bpf_dispatch_from_dsq_set_vtime() to update.
++ *
++ * All other aspects are identical to scx_bpf_dispatch_from_dsq(). See
++ * scx_bpf_dispatch_vtime() for more information on @vtime.
++ */
++__bpf_kfunc bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter,
++ struct task_struct *p, u64 dsq_id,
++ u64 enq_flags)
++{
++ return scx_dispatch_from_dsq((struct bpf_iter_scx_dsq_kern *)it__iter,
++ p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
++}
++
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(scx_kfunc_ids_dispatch)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel)
+BTF_ID_FLAGS(func, scx_bpf_consume)
++BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice)
++BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime)
++BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU)
++BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU)
+BTF_KFUNCS_END(scx_kfunc_ids_dispatch)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = {
@@ -7274,6 +8346,37 @@ index 000000000000..0dac88d0e578
+__bpf_kfunc_start_defs();
+
+/**
++ * scx_bpf_create_dsq - Create a custom DSQ
++ * @dsq_id: DSQ to create
++ * @node: NUMA node to allocate from
++ *
++ * Create a custom DSQ identified by @dsq_id. Can be called from any sleepable
++ * scx callback, and any BPF_PROG_TYPE_SYSCALL prog.
++ */
++__bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node)
++{
++ if (unlikely(node >= (int)nr_node_ids ||
++ (node < 0 && node != NUMA_NO_NODE)))
++ return -EINVAL;
++ return PTR_ERR_OR_ZERO(create_dsq(dsq_id, node));
++}
++
++__bpf_kfunc_end_defs();
++
++BTF_KFUNCS_START(scx_kfunc_ids_unlocked)
++BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE)
++BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU)
++BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU)
++BTF_KFUNCS_END(scx_kfunc_ids_unlocked)
++
++static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = {
++ .owner = THIS_MODULE,
++ .set = &scx_kfunc_ids_unlocked,
++};
++
++__bpf_kfunc_start_defs();
++
++/**
+ * scx_bpf_kick_cpu - Trigger reschedule on a CPU
+ * @cpu: cpu to kick
+ * @flags: %SCX_KICK_* flags
@@ -7291,17 +8394,17 @@ index 000000000000..0dac88d0e578
+ if (!ops_cpu_valid(cpu, NULL))
+ return;
+
++ local_irq_save(irq_flags);
++
++ this_rq = this_rq();
++
+ /*
+ * While bypassing for PM ops, IRQ handling may not be online which can
+ * lead to irq_work_queue() malfunction such as infinite busy wait for
+ * IRQ status update. Suppress kicking.
+ */
-+ if (scx_ops_bypassing())
-+ return;
-+
-+ local_irq_save(irq_flags);
-+
-+ this_rq = this_rq();
++ if (scx_rq_bypassing(this_rq))
++ goto out;
+
+ /*
+ * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting
@@ -7361,7 +8464,7 @@ index 000000000000..0dac88d0e578
+ goto out;
+ }
+ } else {
-+ dsq = find_non_local_dsq(dsq_id);
++ dsq = find_user_dsq(dsq_id);
+ if (dsq) {
+ ret = READ_ONCE(dsq->nr);
+ goto out;
@@ -7407,17 +8510,16 @@ index 000000000000..0dac88d0e578
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) !=
+ __alignof__(struct bpf_iter_scx_dsq));
+
-+ if (flags & ~__SCX_DSQ_ITER_ALL_FLAGS)
++ if (flags & ~__SCX_DSQ_ITER_USER_FLAGS)
+ return -EINVAL;
+
-+ kit->dsq = find_non_local_dsq(dsq_id);
++ kit->dsq = find_user_dsq(dsq_id);
+ if (!kit->dsq)
+ return -ENOENT;
+
+ INIT_LIST_HEAD(&kit->cursor.node);
-+ kit->cursor.is_bpf_iter_cursor = true;
-+ kit->dsq_seq = READ_ONCE(kit->dsq->seq);
-+ kit->flags = flags;
++ kit->cursor.flags |= SCX_DSQ_LNODE_ITER_CURSOR | flags;
++ kit->cursor.priv = READ_ONCE(kit->dsq->seq);
+
+ return 0;
+}
@@ -7431,7 +8533,7 @@ index 000000000000..0dac88d0e578
+__bpf_kfunc struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it)
+{
+ struct bpf_iter_scx_dsq_kern *kit = (void *)it;
-+ bool rev = kit->flags & SCX_DSQ_ITER_REV;
++ bool rev = kit->cursor.flags & SCX_DSQ_ITER_REV;
+ struct task_struct *p;
+ unsigned long flags;
+
@@ -7452,7 +8554,7 @@ index 000000000000..0dac88d0e578
+ */
+ do {
+ p = nldsq_next_task(kit->dsq, p, rev);
-+ } while (p && unlikely(u32_before(kit->dsq_seq, p->scx.dsq_seq)));
++ } while (p && unlikely(u32_before(kit->cursor.priv, p->scx.dsq_seq)));
+
+ if (p) {
+ if (rev)
@@ -7918,6 +9020,41 @@ index 000000000000..0dac88d0e578
+ return cpu_rq(cpu);
+}
+
++/**
++ * scx_bpf_task_cgroup - Return the sched cgroup of a task
++ * @p: task of interest
++ *
++ * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with
++ * from the scheduler's POV. SCX operations should use this function to
++ * determine @p's current cgroup as, unlike following @p->cgroups,
++ * @p->sched_task_group is protected by @p's rq lock and thus atomic w.r.t. all
++ * rq-locked operations. Can be called on the parameter tasks of rq-locked
++ * operations. The restriction guarantees that @p's rq is locked by the caller.
++ */
++#ifdef CONFIG_CGROUP_SCHED
++__bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p)
++{
++ struct task_group *tg = p->sched_task_group;
++ struct cgroup *cgrp = &cgrp_dfl_root.cgrp;
++
++ if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p))
++ goto out;
++
++ /*
++ * A task_group may either be a cgroup or an autogroup. In the latter
++ * case, @tg->css.cgroup is %NULL. A task_group can't become the other
++ * kind once created.
++ */
++ if (tg && tg->css.cgroup)
++ cgrp = tg->css.cgroup;
++ else
++ cgrp = &cgrp_dfl_root.cgrp;
++out:
++ cgroup_get(cgrp);
++ return cgrp;
++}
++#endif
++
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(scx_kfunc_ids_any)
@@ -7946,6 +9083,9 @@ index 000000000000..0dac88d0e578
+BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_cpu_rq)
++#ifdef CONFIG_CGROUP_SCHED
++BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
++#endif
+BTF_KFUNCS_END(scx_kfunc_ids_any)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_any = {
@@ -7969,10 +9109,6 @@ index 000000000000..0dac88d0e578
+ * check using scx_kf_allowed().
+ */
+ if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
-+ &scx_kfunc_set_sleepable)) ||
-+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL,
-+ &scx_kfunc_set_sleepable)) ||
-+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+ &scx_kfunc_set_select_cpu)) ||
+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+ &scx_kfunc_set_enqueue_dispatch)) ||
@@ -7981,6 +9117,10 @@ index 000000000000..0dac88d0e578
+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+ &scx_kfunc_set_cpu_release)) ||
+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
++ &scx_kfunc_set_unlocked)) ||
++ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL,
++ &scx_kfunc_set_unlocked)) ||
++ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+ &scx_kfunc_set_any)) ||
+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
+ &scx_kfunc_set_any)) ||
@@ -8019,10 +9159,10 @@ index 000000000000..0dac88d0e578
+__initcall(scx_init);
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
new file mode 100644
-index 000000000000..32d3a51f591a
+index 000000000000..246019519231
--- /dev/null
+++ b/kernel/sched/ext.h
-@@ -0,0 +1,69 @@
+@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
@@ -8092,11 +9232,33 @@ index 000000000000..32d3a51f591a
+#else
+static inline void scx_update_idle(struct rq *rq, bool idle) {}
+#endif
++
++#ifdef CONFIG_CGROUP_SCHED
++#ifdef CONFIG_EXT_GROUP_SCHED
++int scx_tg_online(struct task_group *tg);
++void scx_tg_offline(struct task_group *tg);
++int scx_cgroup_can_attach(struct cgroup_taskset *tset);
++void scx_move_task(struct task_struct *p);
++void scx_cgroup_finish_attach(void);
++void scx_cgroup_cancel_attach(struct cgroup_taskset *tset);
++void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight);
++void scx_group_set_idle(struct task_group *tg, bool idle);
++#else /* CONFIG_EXT_GROUP_SCHED */
++static inline int scx_tg_online(struct task_group *tg) { return 0; }
++static inline void scx_tg_offline(struct task_group *tg) {}
++static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; }
++static inline void scx_move_task(struct task_struct *p) {}
++static inline void scx_cgroup_finish_attach(void) {}
++static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {}
++static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {}
++static inline void scx_group_set_idle(struct task_group *tg, bool idle) {}
++#endif /* CONFIG_EXT_GROUP_SCHED */
++#endif /* CONFIG_CGROUP_SCHED */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
-index 483c137b9d3d..ab17954001ae 100644
+index 91b242e47db7..a36e37a674e8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
-@@ -3835,7 +3835,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+@@ -3857,7 +3857,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
}
}
@@ -8106,16 +9268,7 @@ index 483c137b9d3d..ab17954001ae 100644
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
-@@ -8697,7 +8697,7 @@
- /*
- * BATCH and IDLE tasks do not preempt others.
- */
-- if (unlikely(p->policy != SCHED_NORMAL))
-+ if (unlikely(!normal_policy(p->policy)))
- return;
-
- cfs_rq = cfs_rq_of(se);
-@@ -9647,29 +9647,18 @@
+@@ -9365,29 +9366,18 @@ static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {
static bool __update_blocked_others(struct rq *rq, bool *done)
{
@@ -8148,7 +9301,7 @@ index 483c137b9d3d..ab17954001ae 100644
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-@@ -13207,6 +13198,7 @@ DEFINE_SCHED_CLASS(fair) = {
+@@ -13233,6 +13223,7 @@ DEFINE_SCHED_CLASS(fair) = {
.task_tick = task_tick_fair,
.task_fork = task_fork_fair,
@@ -8157,10 +9310,10 @@ index 483c137b9d3d..ab17954001ae 100644
.switched_from = switched_from_fair,
.switched_to = switched_to_fair,
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
-index 6135fbe83d68..3b6540cc436a 100644
+index 6e78d071beb5..c7a218123b7a 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
-@@ -458,11 +458,13 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags)
+@@ -452,11 +452,13 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags)
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
@@ -8175,14 +9328,13 @@ index 6135fbe83d68..3b6540cc436a 100644
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
-index 38aeedd8a6cc..f952a4b99ead 100644
+index 432b43aa091c..48d893de632b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
-@@ -187,9 +187,19 @@ static inline int idle_policy(int policy)
- {
+@@ -192,9 +192,18 @@ static inline int idle_policy(int policy)
return policy == SCHED_IDLE;
}
-+
+
+static inline int normal_policy(int policy)
+{
+#ifdef CONFIG_SCHED_CLASS_EXT
@@ -8199,7 +9351,7 @@ index 38aeedd8a6cc..f952a4b99ead 100644
}
static inline int rt_policy(int policy)
-@@ -237,6 +247,24 @@ static inline void update_avg(u64 *avg, u64 sample)
+@@ -244,6 +253,24 @@ static inline void update_avg(u64 *avg, u64 sample)
#define shr_bound(val, shift) \
(val >> min_t(typeof(shift), shift, BITS_PER_TYPE(typeof(val)) - 1))
@@ -8224,7 +9376,50 @@ index 38aeedd8a6cc..f952a4b99ead 100644
/*
* !! For sched_setattr_nocheck() (kernel) only !!
*
-@@ -475,6 +503,11 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
+@@ -397,16 +424,17 @@ struct cfs_bandwidth {
+ struct task_group {
+ struct cgroup_subsys_state css;
+
++#ifdef CONFIG_GROUP_SCHED_WEIGHT
++ /* A positive value indicates that this is a SCHED_IDLE group. */
++ int idle;
++#endif
++
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+ /* schedulable entities of this group on each CPU */
+ struct sched_entity **se;
+ /* runqueue "owned" by this group on each CPU */
+ struct cfs_rq **cfs_rq;
+ unsigned long shares;
+-
+- /* A positive value indicates that this is a SCHED_IDLE group. */
+- int idle;
+-
+ #ifdef CONFIG_SMP
+ /*
+ * load_avg can be heavily contended at clock tick time, so put
+@@ -424,6 +452,11 @@ struct task_group {
+ struct rt_bandwidth rt_bandwidth;
+ #endif
+
++#ifdef CONFIG_EXT_GROUP_SCHED
++ u32 scx_flags; /* SCX_TG_* */
++ u32 scx_weight;
++#endif
++
+ struct rcu_head rcu;
+ struct list_head list;
+
+@@ -448,7 +481,7 @@ struct task_group {
+
+ };
+
+-#ifdef CONFIG_FAIR_GROUP_SCHED
++#ifdef CONFIG_GROUP_SCHED_WEIGHT
+ #define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
+
+ /*
+@@ -479,6 +512,11 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
return walk_tg_tree_from(&root_task_group, down, up, data);
}
@@ -8236,11 +9431,20 @@ index 38aeedd8a6cc..f952a4b99ead 100644
extern int tg_nop(struct task_group *tg, void *data);
#ifdef CONFIG_FAIR_GROUP_SCHED
-@@ -583,6 +616,12 @@ do { \
- # define u64_u32_load(var) u64_u32_load_copy(var, var##_copy)
- # define u64_u32_store(var, val) u64_u32_store_copy(var, var##_copy, val)
+@@ -535,6 +573,9 @@ extern void set_task_rq_fair(struct sched_entity *se,
+ static inline void set_task_rq_fair(struct sched_entity *se,
+ struct cfs_rq *prev, struct cfs_rq *next) { }
+ #endif /* CONFIG_SMP */
++#else /* !CONFIG_FAIR_GROUP_SCHED */
++static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) { return 0; }
++static inline int sched_group_set_idle(struct task_group *tg, long idle) { return 0; }
+ #endif /* CONFIG_FAIR_GROUP_SCHED */
+
+ #else /* CONFIG_CGROUP_SCHED */
+@@ -588,6 +629,11 @@ do { \
+ # define u64_u32_load(var) u64_u32_load_copy(var, var##_copy)
+ # define u64_u32_store(var, val) u64_u32_store_copy(var, var##_copy, val)
-+struct rq;
+struct balance_callback {
+ struct balance_callback *next;
+ void (*func)(struct rq *rq);
@@ -8249,7 +9453,7 @@ index 38aeedd8a6cc..f952a4b99ead 100644
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
-@@ -691,6 +730,42 @@ struct cfs_rq {
+@@ -696,6 +742,43 @@ struct cfs_rq {
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
@@ -8263,6 +9467,7 @@ index 38aeedd8a6cc..f952a4b99ead 100644
+ */
+ SCX_RQ_ONLINE = 1 << 0,
+ SCX_RQ_CAN_STOP_TICK = 1 << 1,
++ SCX_RQ_BYPASSING = 1 << 3,
+
+ SCX_RQ_IN_WAKEUP = 1 << 16,
+ SCX_RQ_IN_BALANCE = 1 << 17,
@@ -8292,11 +9497,10 @@ index 38aeedd8a6cc..f952a4b99ead 100644
static inline int rt_bandwidth_enabled(void)
{
return sysctl_sched_rt_runtime >= 0;
-@@ -988,12 +1063,6 @@ struct uclamp_rq {
+@@ -996,11 +1079,6 @@ struct uclamp_rq {
DECLARE_STATIC_KEY_FALSE(sched_uclamp_used);
#endif /* CONFIG_UCLAMP_TASK */
--struct rq;
-struct balance_callback {
- struct balance_callback *next;
- void (*func)(struct rq *rq);
@@ -8305,7 +9509,7 @@ index 38aeedd8a6cc..f952a4b99ead 100644
/*
* This is the main, per-CPU runqueue data structure.
*
-@@ -1036,6 +1105,9 @@ struct rq {
+@@ -1043,6 +1121,9 @@ struct rq {
struct cfs_rq cfs;
struct rt_rq rt;
struct dl_rq dl;
@@ -8315,16 +9519,24 @@ index 38aeedd8a6cc..f952a4b99ead 100644
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this CPU: */
-@@ -2278,6 +2350,8 @@ struct sched_class {
+@@ -2291,13 +2372,15 @@ struct sched_class {
+
+ void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags);
+
++ int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
+ struct task_struct *(*pick_next_task)(struct rq *rq);
+
void (*put_prev_task)(struct rq *rq, struct task_struct *p);
void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first);
+ void (*switch_class)(struct rq *rq, struct task_struct *next);
+
#ifdef CONFIG_SMP
- int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
+- int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags);
-@@ -2305,8 +2379,11 @@ struct sched_class {
+
+ struct task_struct * (*pick_task)(struct rq *rq);
+@@ -2323,8 +2406,11 @@ struct sched_class {
* cannot assume the switched_from/switched_to pair is serialized by
* rq->lock. They are however serialized by p->pi_lock.
*/
@@ -8336,7 +9548,7 @@ index 38aeedd8a6cc..f952a4b99ead 100644
void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
int oldprio);
-@@ -2355,19 +2432,54 @@ const struct sched_class name##_sched_class \
+@@ -2373,19 +2459,54 @@ const struct sched_class name##_sched_class \
extern struct sched_class __sched_class_highest[];
extern struct sched_class __sched_class_lowest[];
@@ -8397,50 +9609,77 @@ index 38aeedd8a6cc..f952a4b99ead 100644
static inline bool sched_stop_runnable(struct rq *rq)
{
-@@ -2464,7 +2576,7 @@ extern void init_sched_dl_class(void);
+@@ -2424,6 +2545,19 @@ extern void sched_balance_trigger(struct rq *rq);
+ extern int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx);
+ extern void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx);
+
++static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu)
++{
++ /* When not in the task's cpumask, no point in looking further. */
++ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
++ return false;
++
++ /* Can @cpu run a user thread? */
++ if (!(p->flags & PF_KTHREAD) && !task_cpu_possible(cpu, p))
++ return false;
++
++ return true;
++}
++
+ static inline cpumask_t *alloc_user_cpus_ptr(int node)
+ {
+ /*
+@@ -2457,6 +2591,11 @@ extern int push_cpu_stop(void *arg);
+
+ #else /* !CONFIG_SMP: */
+
++static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu)
++{
++ return true;
++}
++
+ static inline int __set_cpus_allowed_ptr(struct task_struct *p,
+ struct affinity_context *ctx)
+ {
+@@ -2510,8 +2649,6 @@ extern void init_sched_dl_class(void);
extern void init_sched_rt_class(void);
extern void init_sched_fair_class(void);
-extern void reweight_task(struct task_struct *p, const struct load_weight *lw);
-+extern void __setscheduler_prio(struct task_struct *p, int prio);
-
+-
extern void resched_curr(struct rq *rq);
extern void resched_cpu(int cpu);
-@@ -2542,6 +2654,12 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)
- extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
- extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
-+extern void check_class_changing(struct rq *rq, struct task_struct *p,
-+ const struct sched_class *prev_class);
-+extern void check_class_changed(struct rq *rq, struct task_struct *p,
-+ const struct sched_class *prev_class,
-+ int oldprio);
-+
- extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags);
-
- #ifdef CONFIG_PREEMPT_RT
-@@ -3007,6 +3125,9 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
- #endif
+@@ -3056,6 +3193,8 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { }
#ifdef CONFIG_SMP
-+
+
+bool update_other_load_avgs(struct rq *rq);
+
unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
unsigned long *min,
unsigned long *max);
-@@ -3049,6 +3170,8 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
- {
+@@ -3099,6 +3238,8 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
return READ_ONCE(rq->avg_rt.util_avg);
}
+
+#else /* !CONFIG_SMP */
+static inline bool update_other_load_avgs(struct rq *rq) { return false; }
- #endif
+ #endif /* CONFIG_SMP */
#ifdef CONFIG_UCLAMP_TASK
-@@ -3481,4 +3604,24 @@ static inline void init_sched_mm_cid(struct task_struct *t) { }
- extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
- extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
+@@ -3609,6 +3750,8 @@ extern void set_load_weight(struct task_struct *p, bool update_load);
+ extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
+ extern void dequeue_task(struct rq *rq, struct task_struct *p, int flags);
+
++extern void check_class_changing(struct rq *rq, struct task_struct *p,
++ const struct sched_class *prev_class);
+ extern void check_class_changed(struct rq *rq, struct task_struct *p,
+ const struct sched_class *prev_class,
+ int oldprio);
+@@ -3629,4 +3772,24 @@ static inline void balance_callbacks(struct rq *rq, struct balance_callback *hea
+
+ #endif
+#ifdef CONFIG_SCHED_CLASS_EXT
+/*
@@ -8463,11 +9702,76 @@ index 38aeedd8a6cc..f952a4b99ead 100644
+#include "ext.h"
+
#endif /* _KERNEL_SCHED_SCHED_H */
+diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
+index ae1b42775ef9..4fa59c9f69ac 100644
+--- a/kernel/sched/syscalls.c
++++ b/kernel/sched/syscalls.c
+@@ -259,6 +259,25 @@ int sched_core_idle_cpu(int cpu)
+ #endif
+
+ #ifdef CONFIG_SMP
++/*
++ * Load avg and utilization metrics need to be updated periodically and before
++ * consumption. This function updates the metrics for all subsystems except for
++ * the fair class. @rq must be locked and have its clock updated.
++ */
++bool update_other_load_avgs(struct rq *rq)
++{
++ u64 now = rq_clock_pelt(rq);
++ const struct sched_class *curr_class = rq->curr->sched_class;
++ unsigned long hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
++
++ lockdep_assert_rq_held(rq);
++
++ return update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
++ update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
++ update_hw_load_avg(now, rq, hw_pressure) |
++ update_irq_load_avg(rq, 0);
++}
++
+ /*
+ * This function computes an effective utilization for the given CPU, to be
+ * used for frequency selection given the linear relation: f = u * f_max.
+@@ -695,6 +714,10 @@ int __sched_setscheduler(struct task_struct *p,
+ goto unlock;
+ }
+
++ retval = scx_check_setscheduler(p, policy);
++ if (retval)
++ goto unlock;
++
+ /*
+ * If not changing anything there's no need to proceed further,
+ * but store a possible modification of reset_on_fork.
+@@ -797,6 +820,7 @@ int __sched_setscheduler(struct task_struct *p,
+ __setscheduler_prio(p, newprio);
+ }
+ __setscheduler_uclamp(p, attr);
++ check_class_changing(rq, p, prev_class);
+
+ if (queued) {
+ /*
+@@ -1602,6 +1626,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
+ case SCHED_NORMAL:
+ case SCHED_BATCH:
+ case SCHED_IDLE:
++ case SCHED_EXT:
+ ret = 0;
+ break;
+ }
+@@ -1629,6 +1654,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
+ case SCHED_NORMAL:
+ case SCHED_BATCH:
+ case SCHED_IDLE:
++ case SCHED_EXT:
+ ret = 0;
+ }
+ return ret;
diff --git a/lib/dump_stack.c b/lib/dump_stack.c
-index 222c6d6c8281..9581ef4efec5 100644
+index 1a996fbbf50a..388da1aea14a 100644
--- a/lib/dump_stack.c
+++ b/lib/dump_stack.c
-@@ -68,6 +68,7 @@ void dump_stack_print_info(const char *log_lvl)
+@@ -73,6 +73,7 @@ void dump_stack_print_info(const char *log_lvl)
print_worker_info(log_lvl, current);
print_stop_info(log_lvl, current);
@@ -8479,7 +9783,7 @@ diff --git a/tools/Makefile b/tools/Makefile
index 276f5d0d53a4..278d24723b74 100644
--- a/tools/Makefile
+++ b/tools/Makefile
-@@ -28,6 +28,7 @@ include scripts/Makefile.include
+@@ -28,6 +28,7 @@ help:
@echo ' pci - PCI tools'
@echo ' perf - Linux performance measurement and analysis tool'
@echo ' selftests - various kernel selftests'
@@ -8497,7 +9801,7 @@ index 276f5d0d53a4..278d24723b74 100644
selftests: FORCE
$(call descend,testing/$@)
-@@ -184,6 +188,9 @@ install: acpi_install counter_install cpupower_install gpio_install \
+@@ -184,6 +188,9 @@ perf_clean:
$(Q)mkdir -p $(PERF_O) .
$(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir= clean
@@ -8526,7 +9830,7 @@ index 000000000000..d6264fe1c8cd
+build/
diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile
new file mode 100644
-index 000000000000..bf7e108f5ae1
+index 000000000000..ca3815e572d8
--- /dev/null
+++ b/tools/sched_ext/Makefile
@@ -0,0 +1,246 @@
@@ -8708,7 +10012,7 @@ index 000000000000..bf7e108f5ae1
+
+SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR)
+
-+c-sched-targets = scx_simple scx_qmap scx_central
++c-sched-targets = scx_simple scx_qmap scx_central scx_flatcg
+
+$(addprefix $(BINDIR)/,$(c-sched-targets)): \
+ $(BINDIR)/%: \
@@ -8778,10 +10082,10 @@ index 000000000000..bf7e108f5ae1
+.SECONDARY:
diff --git a/tools/sched_ext/README.md b/tools/sched_ext/README.md
new file mode 100644
-index 000000000000..8efe70cc4363
+index 000000000000..16a42e4060f6
--- /dev/null
+++ b/tools/sched_ext/README.md
-@@ -0,0 +1,258 @@
+@@ -0,0 +1,270 @@
+SCHED_EXT EXAMPLE SCHEDULERS
+============================
+
@@ -8976,6 +10280,18 @@ index 000000000000..8efe70cc4363
+infinite slices and no timer ticks allows the VM to avoid unnecessary expensive
+vmexits.
+
++## scx_flatcg
++
++A flattened cgroup hierarchy scheduler. This scheduler implements hierarchical
++weight-based cgroup CPU control by flattening the cgroup hierarchy into a single
++layer by compounding the active weight share at each level. The effect of this
++is a much more performant CPU controller, which does not need to descend down
++cgroup trees in order to properly compute a cgroup's share.
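++
++For example (a hypothetical two-level hierarchy), a child whose weight gives it
++half of its own level, under a parent holding a third of the level above, ends
++up with a flattened share of 1/2 * 1/3 = 1/6 of the whole machine.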
++
++Similar to scx_simple, in limited scenarios, this scheduler can perform
++reasonably well on single-socket systems with a unified L3 cache and show
++significantly lowered hierarchical scheduling overhead.
++
+
+# Troubleshooting
+
@@ -9059,10 +10375,10 @@ index 000000000000..ad7d139ce907
+ */
diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
new file mode 100644
-index 000000000000..20280df62857
+index 000000000000..225f61f9bfca
--- /dev/null
+++ b/tools/sched_ext/include/scx/common.bpf.h
-@@ -0,0 +1,401 @@
+@@ -0,0 +1,427 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
@@ -9072,7 +10388,13 @@ index 000000000000..20280df62857
+#ifndef __SCX_COMMON_BPF_H
+#define __SCX_COMMON_BPF_H
+
++#ifdef LSP
++#define __bpf__
++#include "../vmlinux/vmlinux.h"
++#else
+#include "vmlinux.h"
++#endif
++
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <asm-generic/errno.h>
@@ -9100,6 +10422,10 @@ index 000000000000..20280df62857
+u32 scx_bpf_dispatch_nr_slots(void) __ksym;
+void scx_bpf_dispatch_cancel(void) __ksym;
+bool scx_bpf_consume(u64 dsq_id) __ksym;
++void scx_bpf_dispatch_from_dsq_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym;
++void scx_bpf_dispatch_from_dsq_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym;
++bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
++bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
+u32 scx_bpf_reenqueue_local(void) __ksym;
+void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
+s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
@@ -9126,6 +10452,13 @@ index 000000000000..20280df62857
+bool scx_bpf_task_running(const struct task_struct *p) __ksym;
+s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
+struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym;
++struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym;
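++
++/*
++ * scx_bpf_task_cgroup() returns an acquired cgroup reference which must be
++ * dropped with bpf_cgroup_release() once the caller is done with it. A usage
++ * sketch (@p here stands for any task the calling operation may access):
++ *
++ *	struct cgroup *cgrp = scx_bpf_task_cgroup(p);
++ *	u64 cgid = cgrp->kn->id;
++ *
++ *	bpf_cgroup_release(cgrp);
++ */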
++
++/*
++ * Use the following as @it__iter when calling
++ * scx_bpf_dispatch[_vtime]_from_dsq() from within bpf_for_each() loops.
++ */
++#define BPF_FOR_EACH_ITER (&___it)
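++
++/*
++ * Minimal sketch of walking a user DSQ and moving the first task to the local
++ * DSQ; MY_DSQ_ID is a placeholder for a DSQ the scheduler created earlier:
++ *
++ *	struct task_struct *p;
++ *
++ *	bpf_for_each(scx_dsq, p, MY_DSQ_ID, 0) {
++ *		if (scx_bpf_dispatch_from_dsq(BPF_FOR_EACH_ITER, p,
++ *					      SCX_DSQ_LOCAL, 0))
++ *			break;
++ *	}
++ */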
+
+static inline __attribute__((format(printf, 1, 2)))
+void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {}
@@ -9363,6 +10696,15 @@ index 000000000000..20280df62857
+u32 bpf_cpumask_any_distribute(const struct cpumask *cpumask) __ksym;
+u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1,
+ const struct cpumask *src2) __ksym;
++u32 bpf_cpumask_weight(const struct cpumask *cpumask) __ksym;
++
++/*
++ * Access a cpumask in read-only mode (typically to check bits).
++ */
++const struct cpumask *cast_mask(struct bpf_cpumask *mask)
++{
++ return (const struct cpumask *)mask;
++}
+
+/* rcu */
+void bpf_rcu_read_lock(void) __ksym;
@@ -9547,10 +10889,10 @@ index 000000000000..5b0f90152152
+#endif /* __SCHED_EXT_COMMON_H */
diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h
new file mode 100644
-index 000000000000..3d2fe1208900
+index 000000000000..e5afe9efd3f3
--- /dev/null
+++ b/tools/sched_ext/include/scx/compat.bpf.h
-@@ -0,0 +1,28 @@
+@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
@@ -9568,6 +10910,25 @@ index 000000000000..3d2fe1208900
+ __ret; \
+})
+
++/* v6.12: 819513666966 ("sched_ext: Add cgroup support") */
++#define __COMPAT_scx_bpf_task_cgroup(p) \
++ (bpf_ksym_exists(scx_bpf_task_cgroup) ? \
++ scx_bpf_task_cgroup((p)) : NULL)
++
++/* v6.12: 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()") */
++#define __COMPAT_scx_bpf_dispatch_from_dsq_set_slice(it, slice) \
++ (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_slice) ? \
++ scx_bpf_dispatch_from_dsq_set_slice((it), (slice)) : (void)0)
++#define __COMPAT_scx_bpf_dispatch_from_dsq_set_vtime(it, vtime) \
++ (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_vtime) ? \
++ scx_bpf_dispatch_from_dsq_set_vtime((it), (vtime)) : (void)0)
++#define __COMPAT_scx_bpf_dispatch_from_dsq(it, p, dsq_id, enq_flags) \
++ (bpf_ksym_exists(scx_bpf_dispatch_from_dsq) ? \
++ scx_bpf_dispatch_from_dsq((it), (p), (dsq_id), (enq_flags)) : false)
++#define __COMPAT_scx_bpf_dispatch_vtime_from_dsq(it, p, dsq_id, enq_flags) \
++ (bpf_ksym_exists(scx_bpf_dispatch_vtime_from_dsq) ? \
++ scx_bpf_dispatch_vtime_from_dsq((it), (p), (dsq_id), (enq_flags)) : false)
++
+/*
+ * Define sched_ext_ops. This may be expanded to define multiple variants for
+ * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH().
@@ -9581,10 +10942,10 @@ index 000000000000..3d2fe1208900
+#endif /* __SCX_COMPAT_BPF_H */
diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h
new file mode 100644
-index 000000000000..1bf8eddf20c2
+index 000000000000..cc56ff9aa252
--- /dev/null
+++ b/tools/sched_ext/include/scx/compat.h
-@@ -0,0 +1,187 @@
+@@ -0,0 +1,186 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
@@ -9753,14 +11114,13 @@ index 000000000000..1bf8eddf20c2
+ * To maintain compatibility with older libbpf while avoiding trying to attach
+ * twice, disable the autoattach feature on newer libbpf.
+ */
-+/* BACKPORT - bpf_mpa__set_autoattach() not available yet, commented out */
-+/*#if LIBBPF_MAJOR_VERSION > 1 || \
++#if LIBBPF_MAJOR_VERSION > 1 || \
+ (LIBBPF_MAJOR_VERSION == 1 && LIBBPF_MINOR_VERSION >= 5)
+#define __SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name) \
+ bpf_map__set_autoattach((__skel)->maps.__ops_name, false)
-+#else*/
++#else
+#define __SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name) do {} while (0)
-+/*#endif*/
++#endif
+
+#define SCX_OPS_ATTACH(__skel, __ops_name, __scx_name) ({ \
+ struct bpf_link *__link; \
@@ -9774,10 +11134,10 @@ index 000000000000..1bf8eddf20c2
+#endif /* __SCX_COMPAT_H */
diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h
new file mode 100644
-index 000000000000..891693ee604e
+index 000000000000..8ce2734402e1
--- /dev/null
+++ b/tools/sched_ext/include/scx/user_exit_info.h
-@@ -0,0 +1,111 @@
+@@ -0,0 +1,115 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Define struct user_exit_info which is shared between BPF and userspace parts
@@ -9805,7 +11165,11 @@ index 000000000000..891693ee604e
+
+#ifdef __bpf__
+
++#ifdef LSP
++#include "../vmlinux/vmlinux.h"
++#else
+#include "vmlinux.h"
++#endif
+#include <bpf/bpf_core_read.h>
+
+#define UEI_DEFINE(__name) \
@@ -9891,7 +11255,7 @@ index 000000000000..891693ee604e
+#endif /* __USER_EXIT_INFO_H */
diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c
new file mode 100644
-index 000000000000..1d8fd570eaa7
+index 000000000000..8dd8eb73b6b8
--- /dev/null
+++ b/tools/sched_ext/scx_central.bpf.c
@@ -0,0 +1,361 @@
@@ -10095,7 +11459,7 @@ index 000000000000..1d8fd570eaa7
+
+ /* central's gimme is never set */
+ gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids);
-+ if (gimme && !*gimme)
++ if (!gimme || !*gimme)
+ continue;
+
+ if (dispatch_to_cpu(cpu))
@@ -10397,12 +11761,1271 @@ index 000000000000..21deea320bd7
+ goto restart;
+ return 0;
+}
+diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c
+new file mode 100644
+index 000000000000..b722baf6da4b
+--- /dev/null
++++ b/tools/sched_ext/scx_flatcg.bpf.c
+@@ -0,0 +1,957 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * A demo sched_ext flattened cgroup hierarchy scheduler. It implements
++ * hierarchical weight-based cgroup CPU control by flattening the cgroup
++ * hierarchy into a single layer by compounding the active weight share at each
++ * level. Consider the following hierarchy with weights in parentheses:
++ *
++ * R + A (100) + B (100)
++ * | \ C (100)
++ * \ D (200)
++ *
++ * Ignoring the root and threaded cgroups, only B, C and D can contain tasks.
++ * Let's say all three have runnable tasks. The total share that each of these
++ * three cgroups is entitled to can be calculated by compounding its share at
++ * each level.
++ *
++ * For example, B is competing against C and in that competition its share is
++ * 100/(100+100) == 1/2. At its parent level, A is competing against D and A's
++ * share in that competition is 100/(200+100) == 1/3. B's eventual share in the
++ * system can be calculated by multiplying the two shares, 1/2 * 1/3 == 1/6. C's
++ * eventual share is the same at 1/6. D is only competing at the top level and
++ * its share is 200/(100+200) == 2/3.
++ *
++ * So, instead of hierarchically scheduling level-by-level, we can consider it
++ * as B, C and D competing each other with respective share of 1/6, 1/6 and 2/3
++ * and keep updating the eventual shares as the cgroups' runnable states change.
++ *
++ * This flattening of hierarchy can bring a substantial performance gain when
++ * the cgroup hierarchy is nested multiple levels. In a simple benchmark using
++ * wrk[8] on apache serving a CGI script calculating sha1sum of a small file, it
++ * outperforms CFS by ~3% with CPU controller disabled and by ~10% with two
++ * apache instances competing with 2:1 weight ratio nested four level deep.
++ *
++ * However, the gain comes at the cost of not being able to properly handle
++ * thundering herd of cgroups. For example, if many cgroups which are nested
++ * behind a low priority parent cgroup wake up around the same time, they may be
++ * able to consume more CPU cycles than they are entitled to. In many use cases,
++ * this isn't a real concern especially given the performance gain. Also, there
++ * are ways to mitigate the problem further by e.g. introducing an extra
++ * scheduling layer on cgroup delegation boundaries.
++ *
++ * The scheduler first picks the cgroup to run and then schedules the tasks
++ * within it using nested weighted vtime scheduling by default. The
++ * cgroup-internal scheduling can be switched to FIFO with the -f option.
++ */
++#include <scx/common.bpf.h>
++#include "scx_flatcg.h"
++
++/*
++ * Maximum amount of retries to find a valid cgroup.
++ */
++enum {
++ FALLBACK_DSQ = 0,
++ CGROUP_MAX_RETRIES = 1024,
++};
++
++char _license[] SEC("license") = "GPL";
++
++const volatile u32 nr_cpus = 32; /* !0 for veristat, set during init */
++const volatile u64 cgrp_slice_ns = SCX_SLICE_DFL;
++const volatile bool fifo_sched;
++
++u64 cvtime_now;
++UEI_DEFINE(uei);
++
++struct {
++ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
++ __type(key, u32);
++ __type(value, u64);
++ __uint(max_entries, FCG_NR_STATS);
++} stats SEC(".maps");
++
++static void stat_inc(enum fcg_stat_idx idx)
++{
++ u32 idx_v = idx;
++
++ u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx_v);
++ if (cnt_p)
++ (*cnt_p)++;
++}
++
++struct fcg_cpu_ctx {
++ u64 cur_cgid;
++ u64 cur_at;
++};
++
++struct {
++ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
++ __type(key, u32);
++ __type(value, struct fcg_cpu_ctx);
++ __uint(max_entries, 1);
++} cpu_ctx SEC(".maps");
++
++struct {
++ __uint(type, BPF_MAP_TYPE_CGRP_STORAGE);
++ __uint(map_flags, BPF_F_NO_PREALLOC);
++ __type(key, int);
++ __type(value, struct fcg_cgrp_ctx);
++} cgrp_ctx SEC(".maps");
++
++struct cgv_node {
++ struct bpf_rb_node rb_node;
++ __u64 cvtime;
++ __u64 cgid;
++};
++
++private(CGV_TREE) struct bpf_spin_lock cgv_tree_lock;
++private(CGV_TREE) struct bpf_rb_root cgv_tree __contains(cgv_node, rb_node);
++
++struct cgv_node_stash {
++ struct cgv_node __kptr *node;
++};
++
++struct {
++ __uint(type, BPF_MAP_TYPE_HASH);
++ __uint(max_entries, 16384);
++ __type(key, __u64);
++ __type(value, struct cgv_node_stash);
++} cgv_node_stash SEC(".maps");
++
++struct fcg_task_ctx {
++ u64 bypassed_at;
++};
++
++struct {
++ __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
++ __uint(map_flags, BPF_F_NO_PREALLOC);
++ __type(key, int);
++ __type(value, struct fcg_task_ctx);
++} task_ctx SEC(".maps");
++
++/* gets inc'd on weight tree changes to expire the cached hweights */
++u64 hweight_gen = 1;
++
++static u64 div_round_up(u64 dividend, u64 divisor)
++{
++ return (dividend + divisor - 1) / divisor;
++}
++
++static bool vtime_before(u64 a, u64 b)
++{
++ return (s64)(a - b) < 0;
++}
++
++static bool cgv_node_less(struct bpf_rb_node *a, const struct bpf_rb_node *b)
++{
++ struct cgv_node *cgc_a, *cgc_b;
++
++ cgc_a = container_of(a, struct cgv_node, rb_node);
++ cgc_b = container_of(b, struct cgv_node, rb_node);
++
++ return cgc_a->cvtime < cgc_b->cvtime;
++}
++
++static struct fcg_cpu_ctx *find_cpu_ctx(void)
++{
++ struct fcg_cpu_ctx *cpuc;
++ u32 idx = 0;
++
++ cpuc = bpf_map_lookup_elem(&cpu_ctx, &idx);
++ if (!cpuc) {
++ scx_bpf_error("cpu_ctx lookup failed");
++ return NULL;
++ }
++ return cpuc;
++}
++
++static struct fcg_cgrp_ctx *find_cgrp_ctx(struct cgroup *cgrp)
++{
++ struct fcg_cgrp_ctx *cgc;
++
++ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0);
++ if (!cgc) {
++ scx_bpf_error("cgrp_ctx lookup failed for cgid %llu", cgrp->kn->id);
++ return NULL;
++ }
++ return cgc;
++}
++
++static struct fcg_cgrp_ctx *find_ancestor_cgrp_ctx(struct cgroup *cgrp, int level)
++{
++ struct fcg_cgrp_ctx *cgc;
++
++ cgrp = bpf_cgroup_ancestor(cgrp, level);
++ if (!cgrp) {
++ scx_bpf_error("ancestor cgroup lookup failed");
++ return NULL;
++ }
++
++ cgc = find_cgrp_ctx(cgrp);
++ if (!cgc)
++ scx_bpf_error("ancestor cgrp_ctx lookup failed");
++ bpf_cgroup_release(cgrp);
++ return cgc;
++}
++
++static void cgrp_refresh_hweight(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc)
++{
++ int level;
++
++ if (!cgc->nr_active) {
++ stat_inc(FCG_STAT_HWT_SKIP);
++ return;
++ }
++
++ if (cgc->hweight_gen == hweight_gen) {
++ stat_inc(FCG_STAT_HWT_CACHE);
++ return;
++ }
++
++ stat_inc(FCG_STAT_HWT_UPDATES);
++ bpf_for(level, 0, cgrp->level + 1) {
++ struct fcg_cgrp_ctx *cgc;
++ bool is_active;
++
++ cgc = find_ancestor_cgrp_ctx(cgrp, level);
++ if (!cgc)
++ break;
++
++ if (!level) {
++ cgc->hweight = FCG_HWEIGHT_ONE;
++ cgc->hweight_gen = hweight_gen;
++ } else {
++ struct fcg_cgrp_ctx *pcgc;
++
++ pcgc = find_ancestor_cgrp_ctx(cgrp, level - 1);
++ if (!pcgc)
++ break;
++
++ /*
++ * We can be opportunistic here and not grab the
++ * cgv_tree_lock and deal with the occasional races.
++ * However, hweight updates are already cached and
++ * relatively low-frequency. Let's just do the
++ * straightforward thing.
++ */
++ bpf_spin_lock(&cgv_tree_lock);
++ is_active = cgc->nr_active;
++ if (is_active) {
++ cgc->hweight_gen = pcgc->hweight_gen;
++ cgc->hweight =
++ div_round_up(pcgc->hweight * cgc->weight,
++ pcgc->child_weight_sum);
++ }
++ bpf_spin_unlock(&cgv_tree_lock);
++
++ if (!is_active) {
++ stat_inc(FCG_STAT_HWT_RACE);
++ break;
++ }
++ }
++ }
++}
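++
++/*
++ * Rough worked example of the propagation above: with FCG_HWEIGHT_ONE being
++ * 1 << 16, a cgroup with weight 100 whose parent has hweight FCG_HWEIGHT_ONE
++ * and child_weight_sum 300 gets hweight = div_round_up(65536 * 100, 300) =
++ * 21846, i.e. roughly a third of the parent's share.
++ */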
++
++static void cgrp_cap_budget(struct cgv_node *cgv_node, struct fcg_cgrp_ctx *cgc)
++{
++ u64 delta, cvtime, max_budget;
++
++ /*
++ * A node which is on the rbtree can't be pointed to from elsewhere yet
++ * and thus can't be updated and repositioned. Instead, we collect the
++ * vtime deltas separately and apply them asynchronously here.
++ */
++ delta = __sync_fetch_and_sub(&cgc->cvtime_delta, cgc->cvtime_delta);
++ cvtime = cgv_node->cvtime + delta;
++
++ /*
++ * Allow a cgroup to carry the maximum budget proportional to its
++ * hweight such that a full-hweight cgroup can immediately take up half
++ * of the CPUs at the most while staying at the front of the rbtree.
++ */
++ max_budget = (cgrp_slice_ns * nr_cpus * cgc->hweight) /
++ (2 * FCG_HWEIGHT_ONE);
++ if (vtime_before(cvtime, cvtime_now - max_budget))
++ cvtime = cvtime_now - max_budget;
++
++ cgv_node->cvtime = cvtime;
++}
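++
++/*
++ * Budget cap illustration, assuming cgrp_slice_ns is left at SCX_SLICE_DFL
++ * (nominally 20ms) and nr_cpus is 8: a full-hweight cgroup may lag cvtime_now
++ * by at most 20ms * 8 / 2 = 80ms worth of vtime, a half-hweight cgroup by at
++ * most 40ms, and so on.
++ */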
++
++static void cgrp_enqueued(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc)
++{
++ struct cgv_node_stash *stash;
++ struct cgv_node *cgv_node;
++ u64 cgid = cgrp->kn->id;
++
++ /* paired with cmpxchg in try_pick_next_cgroup() */
++ if (__sync_val_compare_and_swap(&cgc->queued, 0, 1)) {
++ stat_inc(FCG_STAT_ENQ_SKIP);
++ return;
++ }
++
++ stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid);
++ if (!stash) {
++ scx_bpf_error("cgv_node lookup failed for cgid %llu", cgid);
++ return;
++ }
++
++ /* NULL if the node is already on the rbtree */
++ cgv_node = bpf_kptr_xchg(&stash->node, NULL);
++ if (!cgv_node) {
++ stat_inc(FCG_STAT_ENQ_RACE);
++ return;
++ }
++
++ bpf_spin_lock(&cgv_tree_lock);
++ cgrp_cap_budget(cgv_node, cgc);
++ bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less);
++ bpf_spin_unlock(&cgv_tree_lock);
++}
++
++static void set_bypassed_at(struct task_struct *p, struct fcg_task_ctx *taskc)
++{
++ /*
++ * Tell fcg_stopping() that this bypassed the regular scheduling path
++ * and should be force charged to the cgroup. 0 is used to indicate that
++ * the task isn't bypassing, so if the current runtime is 0, go back by
++ * one nanosecond.
++ */
++ taskc->bypassed_at = p->se.sum_exec_runtime ?: (u64)-1;
++}
++
++s32 BPF_STRUCT_OPS(fcg_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
++{
++ struct fcg_task_ctx *taskc;
++ bool is_idle = false;
++ s32 cpu;
++
++ cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
++
++ taskc = bpf_task_storage_get(&task_ctx, p, 0, 0);
++ if (!taskc) {
++ scx_bpf_error("task_ctx lookup failed");
++ return cpu;
++ }
++
++ /*
++ * If select_cpu_dfl() is recommending local enqueue, the target CPU is
++ * idle. Follow it and charge the cgroup later in fcg_stopping() after
++ * the fact.
++ */
++ if (is_idle) {
++ set_bypassed_at(p, taskc);
++ stat_inc(FCG_STAT_LOCAL);
++ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
++ }
++
++ return cpu;
++}
++
++void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags)
++{
++ struct fcg_task_ctx *taskc;
++ struct cgroup *cgrp;
++ struct fcg_cgrp_ctx *cgc;
++
++ taskc = bpf_task_storage_get(&task_ctx, p, 0, 0);
++ if (!taskc) {
++ scx_bpf_error("task_ctx lookup failed");
++ return;
++ }
++
++ /*
++ * Use direct dispatching and force charging to deal with tasks with
++ * custom affinities so that we don't have to worry about per-cgroup
++ * dq's containing tasks that can't be executed from some CPUs.
++ */
++ if (p->nr_cpus_allowed != nr_cpus) {
++ set_bypassed_at(p, taskc);
++
++ /*
++ * The global dq is deprioritized as we don't want to let tasks
++ * boost themselves by constraining their cpumasks. The
++ * deprioritization is rather severe, so let's not apply it to
++ * per-cpu kernel threads. This is ham-fisted. We probably want to
++ * implement per-cgroup fallback dq's instead so that we have
++ * more control over when tasks with custom cpumasks get issued.
++ */
++ if (p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD)) {
++ stat_inc(FCG_STAT_LOCAL);
++ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
++ } else {
++ stat_inc(FCG_STAT_GLOBAL);
++ scx_bpf_dispatch(p, FALLBACK_DSQ, SCX_SLICE_DFL, enq_flags);
++ }
++ return;
++ }
++
++ cgrp = __COMPAT_scx_bpf_task_cgroup(p);
++ cgc = find_cgrp_ctx(cgrp);
++ if (!cgc)
++ goto out_release;
++
++ if (fifo_sched) {
++ scx_bpf_dispatch(p, cgrp->kn->id, SCX_SLICE_DFL, enq_flags);
++ } else {
++ u64 tvtime = p->scx.dsq_vtime;
++
++ /*
++ * Limit the amount of budget that an idling task can accumulate
++ * to one slice.
++ */
++ if (vtime_before(tvtime, cgc->tvtime_now - SCX_SLICE_DFL))
++ tvtime = cgc->tvtime_now - SCX_SLICE_DFL;
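++ /*
++ * E.g. a task that slept for a long time wakes up with a dsq_vtime far
++ * behind cgc->tvtime_now; the clamp above caps its head start to a
++ * single SCX_SLICE_DFL worth of vtime.
++ */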
++
++ scx_bpf_dispatch_vtime(p, cgrp->kn->id, SCX_SLICE_DFL,
++ tvtime, enq_flags);
++ }
++
++ cgrp_enqueued(cgrp, cgc);
++out_release:
++ bpf_cgroup_release(cgrp);
++}
++
++/*
++ * Walk the cgroup tree to update the active weight sums as tasks wake up and
++ * sleep. The weight sums are used as the base when calculating the proportion a
++ * given cgroup or task is entitled to at each level.
++ */
++static void update_active_weight_sums(struct cgroup *cgrp, bool runnable)
++{
++ struct fcg_cgrp_ctx *cgc;
++ bool updated = false;
++ int idx;
++
++ cgc = find_cgrp_ctx(cgrp);
++ if (!cgc)
++ return;
++
++ /*
++ * In most cases, a hot cgroup would have multiple threads going to
++ * sleep and waking up while the whole cgroup stays active. In leaf
++ * cgroups, ->nr_runnable, which is updated with __sync operations, gates
++ * ->nr_active updates, so that we don't have to grab the cgv_tree_lock
++ * repeatedly for a busy cgroup that stays active.
++ */
++ if (runnable) {
++ if (__sync_fetch_and_add(&cgc->nr_runnable, 1))
++ return;
++ stat_inc(FCG_STAT_ACT);
++ } else {
++ if (__sync_sub_and_fetch(&cgc->nr_runnable, 1))
++ return;
++ stat_inc(FCG_STAT_DEACT);
++ }
++
++ /*
++ * If @cgrp is becoming runnable, its hweight should be refreshed after
++ * it's added to the weight tree so that enqueue has the up-to-date
++ * value. If @cgrp is becoming quiescent, the hweight should be
++ * refreshed before it's removed from the weight tree so that the usage
++ * charging which happens afterwards has access to the latest value.
++ */
++ if (!runnable)
++ cgrp_refresh_hweight(cgrp, cgc);
++
++ /* propagate upwards */
++ bpf_for(idx, 0, cgrp->level) {
++ int level = cgrp->level - idx;
++ struct fcg_cgrp_ctx *cgc, *pcgc = NULL;
++ bool propagate = false;
++
++ cgc = find_ancestor_cgrp_ctx(cgrp, level);
++ if (!cgc)
++ break;
++ if (level) {
++ pcgc = find_ancestor_cgrp_ctx(cgrp, level - 1);
++ if (!pcgc)
++ break;
++ }
++
++ /*
++ * We need the propagation protected by a lock to synchronize
++ * against weight changes. There's no reason to drop the lock at
++ * each level, but the verifier doesn't allow function calls while a
++ * bpf_spin_lock is held.
++ */
++ bpf_spin_lock(&cgv_tree_lock);
++
++ if (runnable) {
++ if (!cgc->nr_active++) {
++ updated = true;
++ if (pcgc) {
++ propagate = true;
++ pcgc->child_weight_sum += cgc->weight;
++ }
++ }
++ } else {
++ if (!--cgc->nr_active) {
++ updated = true;
++ if (pcgc) {
++ propagate = true;
++ pcgc->child_weight_sum -= cgc->weight;
++ }
++ }
++ }
++
++ bpf_spin_unlock(&cgv_tree_lock);
++
++ if (!propagate)
++ break;
++ }
++
++ if (updated)
++ __sync_fetch_and_add(&hweight_gen, 1);
++
++ if (runnable)
++ cgrp_refresh_hweight(cgrp, cgc);
++}
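++
++/*
++ * For example, when the first task in an idle leaf cgroup wakes up, nr_active
++ * flips 0->1 at the leaf and at each ancestor that wasn't already active, with
++ * every such transition adding the child's weight to the parent's
++ * child_weight_sum; the upward walk stops at the first already-active
++ * ancestor.
++ */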
++
++void BPF_STRUCT_OPS(fcg_runnable, struct task_struct *p, u64 enq_flags)
++{
++ struct cgroup *cgrp;
++
++ cgrp = __COMPAT_scx_bpf_task_cgroup(p);
++ update_active_weight_sums(cgrp, true);
++ bpf_cgroup_release(cgrp);
++}
++
++void BPF_STRUCT_OPS(fcg_running, struct task_struct *p)
++{
++ struct cgroup *cgrp;
++ struct fcg_cgrp_ctx *cgc;
++
++ if (fifo_sched)
++ return;
++
++ cgrp = __COMPAT_scx_bpf_task_cgroup(p);
++ cgc = find_cgrp_ctx(cgrp);
++ if (cgc) {
++ /*
++ * @cgc->tvtime_now always progresses forward as tasks start
++ * executing. The test and update can be performed concurrently
++ * from multiple CPUs and are thus racy. Any error should be
++ * contained and temporary. Let's just live with it.
++ */
++ if (vtime_before(cgc->tvtime_now, p->scx.dsq_vtime))
++ cgc->tvtime_now = p->scx.dsq_vtime;
++ }
++ bpf_cgroup_release(cgrp);
++}
++
++void BPF_STRUCT_OPS(fcg_stopping, struct task_struct *p, bool runnable)
++{
++ struct fcg_task_ctx *taskc;
++ struct cgroup *cgrp;
++ struct fcg_cgrp_ctx *cgc;
++
++ /*
++ * Scale the execution time by the inverse of the weight and charge.
++ *
++ * Note that the default yield implementation yields by setting
++ * @p->scx.slice to zero and the following would treat the yielding task
++ * as if it has consumed all its slice. If this penalizes yielding tasks
++ * too much, determine the execution time by taking explicit timestamps
++ * instead of depending on @p->scx.slice.
++ */
++ if (!fifo_sched)
++ p->scx.dsq_vtime +=
++ (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight;
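++ /*
++ * With the default weight of 100, the charge above applies the consumed
++ * slice at face value; a weight-200 task is charged half as much vtime
++ * and thus receives roughly twice the share of its cgroup's CPU time.
++ */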
++
++ taskc = bpf_task_storage_get(&task_ctx, p, 0, 0);
++ if (!taskc) {
++ scx_bpf_error("task_ctx lookup failed");
++ return;
++ }
++
++ if (!taskc->bypassed_at)
++ return;
++
++ cgrp = __COMPAT_scx_bpf_task_cgroup(p);
++ cgc = find_cgrp_ctx(cgrp);
++ if (cgc) {
++ __sync_fetch_and_add(&cgc->cvtime_delta,
++ p->se.sum_exec_runtime - taskc->bypassed_at);
++ taskc->bypassed_at = 0;
++ }
++ bpf_cgroup_release(cgrp);
++}
++
++void BPF_STRUCT_OPS(fcg_quiescent, struct task_struct *p, u64 deq_flags)
++{
++ struct cgroup *cgrp;
++
++ cgrp = __COMPAT_scx_bpf_task_cgroup(p);
++ update_active_weight_sums(cgrp, false);
++ bpf_cgroup_release(cgrp);
++}
++
++void BPF_STRUCT_OPS(fcg_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
++{
++ struct fcg_cgrp_ctx *cgc, *pcgc = NULL;
++
++ cgc = find_cgrp_ctx(cgrp);
++ if (!cgc)
++ return;
++
++ if (cgrp->level) {
++ pcgc = find_ancestor_cgrp_ctx(cgrp, cgrp->level - 1);
++ if (!pcgc)
++ return;
++ }
++
++ bpf_spin_lock(&cgv_tree_lock);
++ if (pcgc && cgc->nr_active)
++ pcgc->child_weight_sum += (s64)weight - cgc->weight;
++ cgc->weight = weight;
++ bpf_spin_unlock(&cgv_tree_lock);
++}
++
++static bool try_pick_next_cgroup(u64 *cgidp)
++{
++ struct bpf_rb_node *rb_node;
++ struct cgv_node_stash *stash;
++ struct cgv_node *cgv_node;
++ struct fcg_cgrp_ctx *cgc;
++ struct cgroup *cgrp;
++ u64 cgid;
++
++ /* pop the front cgroup and wind cvtime_now accordingly */
++ bpf_spin_lock(&cgv_tree_lock);
++
++ rb_node = bpf_rbtree_first(&cgv_tree);
++ if (!rb_node) {
++ bpf_spin_unlock(&cgv_tree_lock);
++ stat_inc(FCG_STAT_PNC_NO_CGRP);
++ *cgidp = 0;
++ return true;
++ }
++
++ rb_node = bpf_rbtree_remove(&cgv_tree, rb_node);
++ bpf_spin_unlock(&cgv_tree_lock);
++
++ if (!rb_node) {
++ /*
++ * This should never happen. bpf_rbtree_first() was called
++ * above while the tree lock was held, so the node should
++ * always be present.
++ */
++ scx_bpf_error("node could not be removed");
++ return true;
++ }
++
++ cgv_node = container_of(rb_node, struct cgv_node, rb_node);
++ cgid = cgv_node->cgid;
++
++ if (vtime_before(cvtime_now, cgv_node->cvtime))
++ cvtime_now = cgv_node->cvtime;
++
++ /*
++ * If lookup fails, the cgroup's gone. Free and move on. See
++ * fcg_cgroup_exit().
++ */
++ cgrp = bpf_cgroup_from_id(cgid);
++ if (!cgrp) {
++ stat_inc(FCG_STAT_PNC_GONE);
++ goto out_free;
++ }
++
++ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0);
++ if (!cgc) {
++ bpf_cgroup_release(cgrp);
++ stat_inc(FCG_STAT_PNC_GONE);
++ goto out_free;
++ }
++
++ if (!scx_bpf_consume(cgid)) {
++ bpf_cgroup_release(cgrp);
++ stat_inc(FCG_STAT_PNC_EMPTY);
++ goto out_stash;
++ }
++
++ /*
++ * Successfully consumed from the cgroup. This will be our current
++ * cgroup for the new slice. Refresh its hweight.
++ */
++ cgrp_refresh_hweight(cgrp, cgc);
++
++ bpf_cgroup_release(cgrp);
++
++ /*
++ * As the cgroup may have more tasks, add it back to the rbtree. Note
++ * that here we charge the full slice upfront and then settle the
++ * difference later according to the actual consumption. This prevents a
++ * lowpri thundering herd from saturating the machine.
++ */
++ bpf_spin_lock(&cgv_tree_lock);
++ cgv_node->cvtime += cgrp_slice_ns * FCG_HWEIGHT_ONE / (cgc->hweight ?: 1);
++ cgrp_cap_budget(cgv_node, cgc);
++ bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less);
++ bpf_spin_unlock(&cgv_tree_lock);
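++ /*
++ * E.g. a cgroup running at half hweight is charged two full slices of
++ * cvtime above and is therefore picked roughly half as often as a
++ * full-hweight cgroup.
++ */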
++
++ *cgidp = cgid;
++ stat_inc(FCG_STAT_PNC_NEXT);
++ return true;
++
++out_stash:
++ stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid);
++ if (!stash) {
++ stat_inc(FCG_STAT_PNC_GONE);
++ goto out_free;
++ }
++
++ /*
++ * Paired with cmpxchg in cgrp_enqueued(). If the enqueue path sees the
++ * following 1->0 transition, it will queue the cgroup again. If it ran
++ * earlier, we'll see its task in the dq below and requeue the cgroup.
++ */
++ __sync_val_compare_and_swap(&cgc->queued, 1, 0);
++
++ if (scx_bpf_dsq_nr_queued(cgid)) {
++ bpf_spin_lock(&cgv_tree_lock);
++ bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less);
++ bpf_spin_unlock(&cgv_tree_lock);
++ stat_inc(FCG_STAT_PNC_RACE);
++ } else {
++ cgv_node = bpf_kptr_xchg(&stash->node, cgv_node);
++ if (cgv_node) {
++ scx_bpf_error("unexpected !NULL cgv_node stash");
++ goto out_free;
++ }
++ }
++
++ return false;
++
++out_free:
++ bpf_obj_drop(cgv_node);
++ return false;
++}
++
++void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev)
++{
++ struct fcg_cpu_ctx *cpuc;
++ struct fcg_cgrp_ctx *cgc;
++ struct cgroup *cgrp;
++ u64 now = bpf_ktime_get_ns();
++ bool picked_next = false;
++
++ cpuc = find_cpu_ctx();
++ if (!cpuc)
++ return;
++
++ if (!cpuc->cur_cgid)
++ goto pick_next_cgroup;
++
++ if (vtime_before(now, cpuc->cur_at + cgrp_slice_ns)) {
++ if (scx_bpf_consume(cpuc->cur_cgid)) {
++ stat_inc(FCG_STAT_CNS_KEEP);
++ return;
++ }
++ stat_inc(FCG_STAT_CNS_EMPTY);
++ } else {
++ stat_inc(FCG_STAT_CNS_EXPIRE);
++ }
++
++ /*
++ * The current cgroup is expiring. It was already charged a full slice.
++ * Calculate the actual usage and accumulate the delta.
++ */
++ cgrp = bpf_cgroup_from_id(cpuc->cur_cgid);
++ if (!cgrp) {
++ stat_inc(FCG_STAT_CNS_GONE);
++ goto pick_next_cgroup;
++ }
++
++ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0);
++ if (cgc) {
++ /*
++ * We want to update the vtime delta and then look for the next
++ * cgroup to execute but the latter needs to be done in a loop
++ * and we can't keep the lock held. Oh well...
++ */
++ bpf_spin_lock(&cgv_tree_lock);
++ __sync_fetch_and_add(&cgc->cvtime_delta,
++ (cpuc->cur_at + cgrp_slice_ns - now) *
++ FCG_HWEIGHT_ONE / (cgc->hweight ?: 1));
++ bpf_spin_unlock(&cgv_tree_lock);
++ } else {
++ stat_inc(FCG_STAT_CNS_GONE);
++ }
++
++ bpf_cgroup_release(cgrp);
++
++pick_next_cgroup:
++ cpuc->cur_at = now;
++
++ if (scx_bpf_consume(FALLBACK_DSQ)) {
++ cpuc->cur_cgid = 0;
++ return;
++ }
++
++ bpf_repeat(CGROUP_MAX_RETRIES) {
++ if (try_pick_next_cgroup(&cpuc->cur_cgid)) {
++ picked_next = true;
++ break;
++ }
++ }
++
++ /*
++ * This only happens if try_pick_next_cgroup() races against the enqueue
++ * path more than CGROUP_MAX_RETRIES times, which is extremely unlikely
++ * and most likely indicates an underlying bug. There shouldn't be
++ * any stall risk as the race is against enqueue.
++ */
++ if (!picked_next)
++ stat_inc(FCG_STAT_PNC_FAIL);
++}
++
++s32 BPF_STRUCT_OPS(fcg_init_task, struct task_struct *p,
++ struct scx_init_task_args *args)
++{
++ struct fcg_task_ctx *taskc;
++ struct fcg_cgrp_ctx *cgc;
++
++ /*
++ * @p is new. Let's ensure that its task_ctx is available. We can sleep
++ * in this function and the following will automatically use GFP_KERNEL.
++ */
++ taskc = bpf_task_storage_get(&task_ctx, p, 0,
++ BPF_LOCAL_STORAGE_GET_F_CREATE);
++ if (!taskc)
++ return -ENOMEM;
++
++ taskc->bypassed_at = 0;
++
++ if (!(cgc = find_cgrp_ctx(args->cgroup)))
++ return -ENOENT;
++
++ p->scx.dsq_vtime = cgc->tvtime_now;
++
++ return 0;
++}
++
++int BPF_STRUCT_OPS_SLEEPABLE(fcg_cgroup_init, struct cgroup *cgrp,
++ struct scx_cgroup_init_args *args)
++{
++ struct fcg_cgrp_ctx *cgc;
++ struct cgv_node *cgv_node;
++ struct cgv_node_stash empty_stash = {}, *stash;
++ u64 cgid = cgrp->kn->id;
++ int ret;
++
++ /*
++ * Technically incorrect as the cgroup ID is a full 64 bits while the dsq
++ * ID is 63 bits. Should not be a problem in practice and easy to spot in the
++ * unlikely case that it breaks.
++ */
++ ret = scx_bpf_create_dsq(cgid, -1);
++ if (ret)
++ return ret;
++
++ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0,
++ BPF_LOCAL_STORAGE_GET_F_CREATE);
++ if (!cgc) {
++ ret = -ENOMEM;
++ goto err_destroy_dsq;
++ }
++
++ cgc->weight = args->weight;
++ cgc->hweight = FCG_HWEIGHT_ONE;
++
++ ret = bpf_map_update_elem(&cgv_node_stash, &cgid, &empty_stash,
++ BPF_NOEXIST);
++ if (ret) {
++ if (ret != -ENOMEM)
++ scx_bpf_error("unexpected stash creation error (%d)",
++ ret);
++ goto err_destroy_dsq;
++ }
++
++ stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid);
++ if (!stash) {
++ scx_bpf_error("unexpected cgv_node stash lookup failure");
++ ret = -ENOENT;
++ goto err_destroy_dsq;
++ }
++
++ cgv_node = bpf_obj_new(struct cgv_node);
++ if (!cgv_node) {
++ ret = -ENOMEM;
++ goto err_del_cgv_node;
++ }
++
++ cgv_node->cgid = cgid;
++ cgv_node->cvtime = cvtime_now;
++
++ cgv_node = bpf_kptr_xchg(&stash->node, cgv_node);
++ if (cgv_node) {
++ scx_bpf_error("unexpected !NULL cgv_node stash");
++ ret = -EBUSY;
++ goto err_drop;
++ }
++
++ return 0;
++
++err_drop:
++ bpf_obj_drop(cgv_node);
++err_del_cgv_node:
++ bpf_map_delete_elem(&cgv_node_stash, &cgid);
++err_destroy_dsq:
++ scx_bpf_destroy_dsq(cgid);
++ return ret;
++}
++
++void BPF_STRUCT_OPS(fcg_cgroup_exit, struct cgroup *cgrp)
++{
++ u64 cgid = cgrp->kn->id;
++
++ /*
++ * For now, there's no way to find and remove the cgv_node if it's on the
++ * cgv_tree. Let's drain such nodes in the dispatch path as they get popped
++ * off the front of the tree.
++ */
++ bpf_map_delete_elem(&cgv_node_stash, &cgid);
++ scx_bpf_destroy_dsq(cgid);
++}
++
++void BPF_STRUCT_OPS(fcg_cgroup_move, struct task_struct *p,
++ struct cgroup *from, struct cgroup *to)
++{
++ struct fcg_cgrp_ctx *from_cgc, *to_cgc;
++ s64 vtime_delta;
++
++ /* find_cgrp_ctx() triggers scx_ops_error() on lookup failures */
++ if (!(from_cgc = find_cgrp_ctx(from)) || !(to_cgc = find_cgrp_ctx(to)))
++ return;
++
++ vtime_delta = p->scx.dsq_vtime - from_cgc->tvtime_now;
++ p->scx.dsq_vtime = to_cgc->tvtime_now + vtime_delta;
++}
++
++s32 BPF_STRUCT_OPS_SLEEPABLE(fcg_init)
++{
++ return scx_bpf_create_dsq(FALLBACK_DSQ, -1);
++}
++
++void BPF_STRUCT_OPS(fcg_exit, struct scx_exit_info *ei)
++{
++ UEI_RECORD(uei, ei);
++}
++
++SCX_OPS_DEFINE(flatcg_ops,
++ .select_cpu = (void *)fcg_select_cpu,
++ .enqueue = (void *)fcg_enqueue,
++ .dispatch = (void *)fcg_dispatch,
++ .runnable = (void *)fcg_runnable,
++ .running = (void *)fcg_running,
++ .stopping = (void *)fcg_stopping,
++ .quiescent = (void *)fcg_quiescent,
++ .init_task = (void *)fcg_init_task,
++ .cgroup_set_weight = (void *)fcg_cgroup_set_weight,
++ .cgroup_init = (void *)fcg_cgroup_init,
++ .cgroup_exit = (void *)fcg_cgroup_exit,
++ .cgroup_move = (void *)fcg_cgroup_move,
++ .init = (void *)fcg_init,
++ .exit = (void *)fcg_exit,
++ .flags = SCX_OPS_HAS_CGROUP_WEIGHT | SCX_OPS_ENQ_EXITING,
++ .name = "flatcg");
+diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c
+new file mode 100644
+index 000000000000..5d24ca9c29d9
+--- /dev/null
++++ b/tools/sched_ext/scx_flatcg.c
+@@ -0,0 +1,233 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates.
++ * Copyright (c) 2023 Tejun Heo <tj@kernel.org>
++ * Copyright (c) 2023 David Vernet <dvernet@meta.com>
++ */
++#include <stdio.h>
++#include <signal.h>
++#include <unistd.h>
++#include <libgen.h>
++#include <limits.h>
++#include <inttypes.h>
++#include <fcntl.h>
++#include <time.h>
++#include <bpf/bpf.h>
++#include <scx/common.h>
++#include "scx_flatcg.h"
++#include "scx_flatcg.bpf.skel.h"
++
++#ifndef FILEID_KERNFS
++#define FILEID_KERNFS 0xfe
++#endif
++
++const char help_fmt[] =
++"A flattened cgroup hierarchy sched_ext scheduler.\n"
++"\n"
++"See the top-level comment in .bpf.c for more details.\n"
++"\n"
++"Usage: %s [-s SLICE_US] [-i INTERVAL] [-f] [-v]\n"
++"\n"
++" -s SLICE_US Override slice duration\n"
++" -i INTERVAL Report interval\n"
++" -f Use FIFO scheduling instead of weighted vtime scheduling\n"
++" -v Print libbpf debug messages\n"
++" -h Display this help and exit\n";
++
++static bool verbose;
++static volatile int exit_req;
++
++static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
++{
++ if (level == LIBBPF_DEBUG && !verbose)
++ return 0;
++ return vfprintf(stderr, format, args);
++}
++
++static void sigint_handler(int dummy)
++{
++ exit_req = 1;
++}
++
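++/*
++ * Returns the busy fraction of all CPUs since the previous call, computed from
++ * the first line of /proc/stat as (total - idle) / total over the sampling
++ * interval, e.g. 0.25 when the machine was 25% busy.
++ */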
++static float read_cpu_util(__u64 *last_sum, __u64 *last_idle)
++{
++ FILE *fp;
++ char buf[4096];
++ char *line, *cur = NULL, *tok;
++ __u64 sum = 0, idle = 0;
++ __u64 delta_sum, delta_idle;
++ int idx;
++
++ fp = fopen("/proc/stat", "r");
++ if (!fp) {
++ perror("fopen(\"/proc/stat\")");
++ return 0.0;
++ }
++
++ if (!fgets(buf, sizeof(buf), fp)) {
++ perror("fgets(\"/proc/stat\")");
++ fclose(fp);
++ return 0.0;
++ }
++ fclose(fp);
++
++ line = buf;
++ for (idx = 0; (tok = strtok_r(line, " \n", &cur)); idx++) {
++ char *endp = NULL;
++ __u64 v;
++
++ if (idx == 0) {
++ line = NULL;
++ continue;
++ }
++ v = strtoull(tok, &endp, 0);
++ if (!endp || *endp != '\0') {
++ fprintf(stderr, "failed to parse %dth field of /proc/stat (\"%s\")\n",
++ idx, tok);
++ continue;
++ }
++ sum += v;
++ if (idx == 4)
++ idle = v;
++ }
++
++ delta_sum = sum - *last_sum;
++ delta_idle = idle - *last_idle;
++ *last_sum = sum;
++ *last_idle = idle;
++
++ return delta_sum ? (float)(delta_sum - delta_idle) / delta_sum : 0.0;
++}
++
++static void fcg_read_stats(struct scx_flatcg *skel, __u64 *stats)
++{
++ __u64 cnts[FCG_NR_STATS][skel->rodata->nr_cpus];
++ __u32 idx;
++
++ memset(stats, 0, sizeof(stats[0]) * FCG_NR_STATS);
++
++ for (idx = 0; idx < FCG_NR_STATS; idx++) {
++ int ret, cpu;
++
++ ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats),
++ &idx, cnts[idx]);
++ if (ret < 0)
++ continue;
++ for (cpu = 0; cpu < skel->rodata->nr_cpus; cpu++)
++ stats[idx] += cnts[idx][cpu];
++ }
++}
++
++int main(int argc, char **argv)
++{
++ struct scx_flatcg *skel;
++ struct bpf_link *link;
++ struct timespec intv_ts = { .tv_sec = 2, .tv_nsec = 0 };
++ bool dump_cgrps = false;
++ __u64 last_cpu_sum = 0, last_cpu_idle = 0;
++ __u64 last_stats[FCG_NR_STATS] = {};
++ unsigned long seq = 0;
++ __s32 opt;
++ __u64 ecode;
++
++ libbpf_set_print(libbpf_print_fn);
++ signal(SIGINT, sigint_handler);
++ signal(SIGTERM, sigint_handler);
++restart:
++ skel = SCX_OPS_OPEN(flatcg_ops, scx_flatcg);
++
++ skel->rodata->nr_cpus = libbpf_num_possible_cpus();
++
++ while ((opt = getopt(argc, argv, "s:i:dfvh")) != -1) {
++ double v;
++
++ switch (opt) {
++ case 's':
++ v = strtod(optarg, NULL);
++ skel->rodata->cgrp_slice_ns = v * 1000;
++ break;
++ case 'i':
++ v = strtod(optarg, NULL);
++ intv_ts.tv_sec = v;
++ intv_ts.tv_nsec = (v - (float)intv_ts.tv_sec) * 1000000000;
++ break;
++ case 'd':
++ dump_cgrps = true;
++ break;
++ case 'f':
++ skel->rodata->fifo_sched = true;
++ break;
++ case 'v':
++ verbose = true;
++ break;
++ case 'h':
++ default:
++ fprintf(stderr, help_fmt, basename(argv[0]));
++ return opt != 'h';
++ }
++ }
++
++ printf("slice=%.1lfms intv=%.1lfs dump_cgrps=%d",
++ (double)skel->rodata->cgrp_slice_ns / 1000000.0,
++ (double)intv_ts.tv_sec + (double)intv_ts.tv_nsec / 1000000000.0,
++ dump_cgrps);
++
++ SCX_OPS_LOAD(skel, flatcg_ops, scx_flatcg, uei);
++ link = SCX_OPS_ATTACH(skel, flatcg_ops, scx_flatcg);
++
++ while (!exit_req && !UEI_EXITED(skel, uei)) {
++ __u64 acc_stats[FCG_NR_STATS];
++ __u64 stats[FCG_NR_STATS];
++ float cpu_util;
++ int i;
++
++ cpu_util = read_cpu_util(&last_cpu_sum, &last_cpu_idle);
++
++ fcg_read_stats(skel, acc_stats);
++ for (i = 0; i < FCG_NR_STATS; i++)
++ stats[i] = acc_stats[i] - last_stats[i];
++
++ memcpy(last_stats, acc_stats, sizeof(acc_stats));
++
++ printf("\n[SEQ %6lu cpu=%5.1lf hweight_gen=%" PRIu64 "]\n",
++ seq++, cpu_util * 100.0, skel->data->hweight_gen);
++ printf(" act:%6llu deact:%6llu global:%6llu local:%6llu\n",
++ stats[FCG_STAT_ACT],
++ stats[FCG_STAT_DEACT],
++ stats[FCG_STAT_GLOBAL],
++ stats[FCG_STAT_LOCAL]);
++ printf("HWT cache:%6llu update:%6llu skip:%6llu race:%6llu\n",
++ stats[FCG_STAT_HWT_CACHE],
++ stats[FCG_STAT_HWT_UPDATES],
++ stats[FCG_STAT_HWT_SKIP],
++ stats[FCG_STAT_HWT_RACE]);
++ printf("ENQ skip:%6llu race:%6llu\n",
++ stats[FCG_STAT_ENQ_SKIP],
++ stats[FCG_STAT_ENQ_RACE]);
++ printf("CNS keep:%6llu expire:%6llu empty:%6llu gone:%6llu\n",
++ stats[FCG_STAT_CNS_KEEP],
++ stats[FCG_STAT_CNS_EXPIRE],
++ stats[FCG_STAT_CNS_EMPTY],
++ stats[FCG_STAT_CNS_GONE]);
++ printf("PNC next:%6llu empty:%6llu nocgrp:%6llu gone:%6llu race:%6llu fail:%6llu\n",
++ stats[FCG_STAT_PNC_NEXT],
++ stats[FCG_STAT_PNC_EMPTY],
++ stats[FCG_STAT_PNC_NO_CGRP],
++ stats[FCG_STAT_PNC_GONE],
++ stats[FCG_STAT_PNC_RACE],
++ stats[FCG_STAT_PNC_FAIL]);
++ printf("BAD remove:%6llu\n",
++ acc_stats[FCG_STAT_BAD_REMOVAL]);
++ fflush(stdout);
++
++ nanosleep(&intv_ts, NULL);
++ }
++
++ bpf_link__destroy(link);
++ ecode = UEI_REPORT(skel, uei);
++ scx_flatcg__destroy(skel);
++
++ if (UEI_ECODE_RESTART(ecode))
++ goto restart;
++ return 0;
++}
+diff --git a/tools/sched_ext/scx_flatcg.h b/tools/sched_ext/scx_flatcg.h
+new file mode 100644
+index 000000000000..6f2ea50acb1c
+--- /dev/null
++++ b/tools/sched_ext/scx_flatcg.h
+@@ -0,0 +1,51 @@
++#ifndef __SCX_EXAMPLE_FLATCG_H
++#define __SCX_EXAMPLE_FLATCG_H
++
++enum {
++ FCG_HWEIGHT_ONE = 1LLU << 16,
++};
++
++enum fcg_stat_idx {
++ FCG_STAT_ACT,
++ FCG_STAT_DEACT,
++ FCG_STAT_LOCAL,
++ FCG_STAT_GLOBAL,
++
++ FCG_STAT_HWT_UPDATES,
++ FCG_STAT_HWT_CACHE,
++ FCG_STAT_HWT_SKIP,
++ FCG_STAT_HWT_RACE,
++
++ FCG_STAT_ENQ_SKIP,
++ FCG_STAT_ENQ_RACE,
++
++ FCG_STAT_CNS_KEEP,
++ FCG_STAT_CNS_EXPIRE,
++ FCG_STAT_CNS_EMPTY,
++ FCG_STAT_CNS_GONE,
++
++ FCG_STAT_PNC_NO_CGRP,
++ FCG_STAT_PNC_NEXT,
++ FCG_STAT_PNC_EMPTY,
++ FCG_STAT_PNC_GONE,
++ FCG_STAT_PNC_RACE,
++ FCG_STAT_PNC_FAIL,
++
++ FCG_STAT_BAD_REMOVAL,
++
++ FCG_NR_STATS,
++};
++
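++/*
++ * Per-cgroup scheduling state used by scx_flatcg.bpf.c. Roughly: nr_runnable
++ * counts runnable tasks in the cgroup, nr_active tracks whether the cgroup
++ * and how many of its children are active, queued flags whether its cgv_node
++ * is on the rbtree, weight and hweight are the raw and hierarchical weights,
++ * child_weight_sum is the summed weight of active children, hweight_gen is
++ * the generation hweight was last computed for, cvtime_delta accumulates
++ * cgroup vtime to apply when its cgv_node is next repositioned, and
++ * tvtime_now is the cgroup's task vtime clock.
++ */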
++struct fcg_cgrp_ctx {
++ u32 nr_active;
++ u32 nr_runnable;
++ u32 queued;
++ u32 weight;
++ u32 hweight;
++ u64 child_weight_sum;
++ u64 hweight_gen;
++ s64 cvtime_delta;
++ u64 tvtime_now;
++};
++
++#endif /* __SCX_EXAMPLE_FLATCG_H */
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
new file mode 100644
-index 000000000000..892278f12dce
+index 000000000000..5b39bee9eb23
--- /dev/null
+++ b/tools/sched_ext/scx_qmap.bpf.c
-@@ -0,0 +1,706 @@
+@@ -0,0 +1,813 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * A simple five-level FIFO queue scheduler.
@@ -10432,6 +13055,8 @@ index 000000000000..892278f12dce
+enum consts {
+ ONE_SEC_IN_NS = 1000000000,
+ SHARED_DSQ = 0,
++ HIGHPRI_DSQ = 1,
++ HIGHPRI_WEIGHT = 8668, /* this is what -20 maps to */
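++ /* nice 0 is weight 100; -20 scales it by 88761/1024, i.e. ~86.7x */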
+};
+
+char _license[] SEC("license") = "GPL";
@@ -10441,10 +13066,12 @@ index 000000000000..892278f12dce
+const volatile u32 stall_kernel_nth;
+const volatile u32 dsp_inf_loop_after;
+const volatile u32 dsp_batch;
++const volatile bool highpri_boosting;
+const volatile bool print_shared_dsq;
+const volatile s32 disallow_tgid;
+const volatile bool suppress_dump;
+
++u64 nr_highpri_queued;
+u32 test_error_cnt;
+
+UEI_DEFINE(uei);
@@ -10500,6 +13127,7 @@ index 000000000000..892278f12dce
+/* Per-task scheduling context */
+struct task_ctx {
+ bool force_local; /* Dispatch directly to local_dsq */
++ bool highpri;
+ u64 core_sched_seq;
+};
+
@@ -10527,6 +13155,7 @@ index 000000000000..892278f12dce
+/* Statistics */
+u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued, nr_ddsp_from_enq;
+u64 nr_core_sched_execed;
++u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
+u32 cpuperf_min, cpuperf_avg, cpuperf_max;
+u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;
+
@@ -10545,17 +13174,25 @@ index 000000000000..892278f12dce
+ return -1;
+}
+
++static struct task_ctx *lookup_task_ctx(struct task_struct *p)
++{
++ struct task_ctx *tctx;
++
++ if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
++ scx_bpf_error("task_ctx lookup failed");
++ return NULL;
++ }
++ return tctx;
++}
++
+s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
+ s32 prev_cpu, u64 wake_flags)
+{
+ struct task_ctx *tctx;
+ s32 cpu;
+
-+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
-+ if (!tctx) {
-+ scx_bpf_error("task_ctx lookup failed");
++ if (!(tctx = lookup_task_ctx(p)))
+ return -ESRCH;
-+ }
+
+ cpu = pick_direct_dispatch_cpu(p, prev_cpu);
+
@@ -10602,11 +13239,8 @@ index 000000000000..892278f12dce
+ if (test_error_cnt && !--test_error_cnt)
+ scx_bpf_error("test triggering error");
+
-+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
-+ if (!tctx) {
-+ scx_bpf_error("task_ctx lookup failed");
++ if (!(tctx = lookup_task_ctx(p)))
+ return;
-+ }
+
+ /*
+ * All enqueued tasks must have their core_sched_seq updated for correct
@@ -10661,6 +13295,10 @@ index 000000000000..892278f12dce
+ return;
+ }
+
++ if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
++ tctx->highpri = true;
++ __sync_fetch_and_add(&nr_highpri_queued, 1);
++ }
+ __sync_fetch_and_add(&nr_enqueued, 1);
+}
+
@@ -10677,13 +13315,80 @@ index 000000000000..892278f12dce
+
+static void update_core_sched_head_seq(struct task_struct *p)
+{
-+ struct task_ctx *tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
+ int idx = weight_to_idx(p->scx.weight);
++ struct task_ctx *tctx;
+
-+ if (tctx)
++ if ((tctx = lookup_task_ctx(p)))
+ core_sched_head_seqs[idx] = tctx->core_sched_seq;
-+ else
-+ scx_bpf_error("task_ctx lookup failed");
++}
++
++/*
++ * To demonstrate the use of scx_bpf_dispatch_from_dsq(), implement a silly
++ * selective priority boosting mechanism by scanning SHARED_DSQ looking for
++ * highpri tasks, moving them to HIGHPRI_DSQ and then consuming them first. This
++ * makes a minor difference only when dsp_batch is larger than 1.
++ *
++ * scx_bpf_dispatch[_vtime]_from_dsq() are allowed both from ops.dispatch() and
++ * from non-rq-lock-holding BPF programs. As a demonstration, this function is
++ * called from qmap_dispatch() and monitor_timerfn().
++ */
++static bool dispatch_highpri(bool from_timer)
++{
++ struct task_struct *p;
++ s32 this_cpu = bpf_get_smp_processor_id();
++
++ /* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
++ bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
++ static u64 highpri_seq;
++ struct task_ctx *tctx;
++
++ if (!(tctx = lookup_task_ctx(p)))
++ return false;
++
++ if (tctx->highpri) {
++ /* exercise the set_*() and vtime interface too */
++ __COMPAT_scx_bpf_dispatch_from_dsq_set_slice(
++ BPF_FOR_EACH_ITER, slice_ns * 2);
++ __COMPAT_scx_bpf_dispatch_from_dsq_set_vtime(
++ BPF_FOR_EACH_ITER, highpri_seq++);
++ __COMPAT_scx_bpf_dispatch_vtime_from_dsq(
++ BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
++ }
++ }
++
++ /*
++ * Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU
++ * is found.
++ */
++ bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) {
++ bool dispatched = false;
++ s32 cpu;
++
++ if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr))
++ cpu = this_cpu;
++ else
++ cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);
++
++ if (__COMPAT_scx_bpf_dispatch_from_dsq(BPF_FOR_EACH_ITER, p,
++ SCX_DSQ_LOCAL_ON | cpu,
++ SCX_ENQ_PREEMPT)) {
++ if (cpu == this_cpu) {
++ dispatched = true;
++ __sync_fetch_and_add(&nr_expedited_local, 1);
++ } else {
++ __sync_fetch_and_add(&nr_expedited_remote, 1);
++ }
++ if (from_timer)
++ __sync_fetch_and_add(&nr_expedited_from_timer, 1);
++ } else {
++ __sync_fetch_and_add(&nr_expedited_lost, 1);
++ }
++
++ if (dispatched)
++ return true;
++ }
++
++ return false;
+}
+
+void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
@@ -10694,7 +13399,10 @@ index 000000000000..892278f12dce
+ void *fifo;
+ s32 i, pid;
+
-+ if (scx_bpf_consume(SHARED_DSQ))
++ if (dispatch_highpri(false))
++ return;
++
++ if (!nr_highpri_queued && scx_bpf_consume(SHARED_DSQ))
+ return;
+
+ if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
@@ -10731,6 +13439,8 @@ index 000000000000..892278f12dce
+
+ /* Dispatch or advance. */
+ bpf_repeat(BPF_MAX_LOOPS) {
++ struct task_ctx *tctx;
++
+ if (bpf_map_pop_elem(fifo, &pid))
+ break;
+
@@ -10738,13 +13448,25 @@ index 000000000000..892278f12dce
+ if (!p)
+ continue;
+
++ if (!(tctx = lookup_task_ctx(p))) {
++ bpf_task_release(p);
++ return;
++ }
++
++ if (tctx->highpri)
++ __sync_fetch_and_sub(&nr_highpri_queued, 1);
++
+ update_core_sched_head_seq(p);
+ __sync_fetch_and_add(&nr_dispatched, 1);
++
+ scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0);
+ bpf_task_release(p);
++
+ batch--;
+ cpuc->dsp_cnt--;
+ if (!batch || !scx_bpf_dispatch_nr_slots()) {
++ if (dispatch_highpri(false))
++ return;
+ scx_bpf_consume(SHARED_DSQ);
+ return;
+ }
@@ -11054,6 +13776,10 @@ index 000000000000..892278f12dce
+
+static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
+{
++ bpf_rcu_read_lock();
++ dispatch_highpri(true);
++ bpf_rcu_read_unlock();
++
+ monitor_cpuperf();
+
+ if (print_shared_dsq)
@@ -11075,6 +13801,10 @@ index 000000000000..892278f12dce
+ if (ret)
+ return ret;
+
++ ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1);
++ if (ret)
++ return ret;
++
+ timer = bpf_map_lookup_elem(&monitor_timer, &key);
+ if (!timer)
+ return -ESRCH;
@@ -11111,10 +13841,10 @@ index 000000000000..892278f12dce
+ .name = "qmap");
diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c
new file mode 100644
-index 000000000000..c9ca30d62b2b
+index 000000000000..ac45a02b4055
--- /dev/null
+++ b/tools/sched_ext/scx_qmap.c
-@@ -0,0 +1,144 @@
+@@ -0,0 +1,153 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
@@ -11146,6 +13876,7 @@ index 000000000000..c9ca30d62b2b
+" -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n"
+" -b COUNT Dispatch upto COUNT tasks together\n"
+" -P Print out DSQ content to trace_pipe every second, use with -b\n"
++" -H Boost nice -20 tasks in SHARED_DSQ, use with -b\n"
+" -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n"
+" -D LEN Set scx_exit_info.dump buffer length\n"
+" -S Suppress qmap-specific debug dump\n"
@@ -11180,7 +13911,7 @@ index 000000000000..c9ca30d62b2b
+
+ skel = SCX_OPS_OPEN(qmap_ops, scx_qmap);
+
-+ while ((opt = getopt(argc, argv, "s:e:t:T:l:b:Pd:D:Spvh")) != -1) {
++ while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PHd:D:Spvh")) != -1) {
+ switch (opt) {
+ case 's':
+ skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
@@ -11203,6 +13934,9 @@ index 000000000000..c9ca30d62b2b
+ case 'P':
+ skel->rodata->print_shared_dsq = true;
+ break;
++ case 'H':
++ skel->rodata->highpri_boosting = true;
++ break;
+ case 'd':
+ skel->rodata->disallow_tgid = strtol(optarg, NULL, 0);
+ if (skel->rodata->disallow_tgid < 0)
@@ -11238,6 +13972,11 @@ index 000000000000..c9ca30d62b2b
+ skel->bss->nr_reenqueued, skel->bss->nr_dequeued,
+ skel->bss->nr_core_sched_execed,
+ skel->bss->nr_ddsp_from_enq);
++ printf(" exp_local=%"PRIu64" exp_remote=%"PRIu64" exp_timer=%"PRIu64" exp_lost=%"PRIu64"\n",
++ skel->bss->nr_expedited_local,
++ skel->bss->nr_expedited_remote,
++ skel->bss->nr_expedited_from_timer,
++ skel->bss->nr_expedited_lost);
+ if (__COMPAT_has_ksym("scx_bpf_cpuperf_cur"))
+ printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n",
+ skel->bss->cpuperf_min,
@@ -11261,10 +14000,10 @@ index 000000000000..c9ca30d62b2b
+}
diff --git a/tools/sched_ext/scx_show_state.py b/tools/sched_ext/scx_show_state.py
new file mode 100644
-index 000000000000..d457d2a74e1e
+index 000000000000..8bc626ede1c4
--- /dev/null
+++ b/tools/sched_ext/scx_show_state.py
-@@ -0,0 +1,39 @@
+@@ -0,0 +1,40 @@
+#!/usr/bin/env drgn
+#
+# Copyright (C) 2024 Tejun Heo <tj@kernel.org>
@@ -11304,6 +14043,7 @@ index 000000000000..d457d2a74e1e
+print(f'enable_state : {ops_state_str(enable_state)} ({enable_state})')
+print(f'bypass_depth : {read_atomic("scx_ops_bypass_depth")}')
+print(f'nr_rejected : {read_atomic("scx_nr_rejected")}')
++print(f'enable_seq : {read_atomic("scx_enable_seq")}')
diff --git a/tools/sched_ext/scx_simple.bpf.c b/tools/sched_ext/scx_simple.bpf.c
new file mode 100644
index 000000000000..ed7e8d535fc5
@@ -13191,10 +15931,10 @@ index 000000000000..97d45f1e5597
+REGISTER_SCX_TEST(&init_enable_count)
diff --git a/tools/testing/selftests/sched_ext/maximal.bpf.c b/tools/testing/selftests/sched_ext/maximal.bpf.c
new file mode 100644
-index 000000000000..44612fdaf399
+index 000000000000..00bfa9cb95d3
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/maximal.bpf.c
-@@ -0,0 +1,132 @@
+@@ -0,0 +1,164 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * A scheduler with every callback defined.
@@ -13292,6 +16032,32 @@ index 000000000000..44612fdaf399
+void BPF_STRUCT_OPS(maximal_disable, struct task_struct *p)
+{}
+
++s32 BPF_STRUCT_OPS(maximal_cgroup_init, struct cgroup *cgrp,
++ struct scx_cgroup_init_args *args)
++{
++ return 0;
++}
++
++void BPF_STRUCT_OPS(maximal_cgroup_exit, struct cgroup *cgrp)
++{}
++
++s32 BPF_STRUCT_OPS(maximal_cgroup_prep_move, struct task_struct *p,
++ struct cgroup *from, struct cgroup *to)
++{
++ return 0;
++}
++
++void BPF_STRUCT_OPS(maximal_cgroup_move, struct task_struct *p,
++ struct cgroup *from, struct cgroup *to)
++{}
++
++void BPF_STRUCT_OPS(maximal_cgroup_cancel_move, struct task_struct *p,
++ struct cgroup *from, struct cgroup *to)
++{}
++
++void BPF_STRUCT_OPS(maximal_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
++{}
++
+s32 BPF_STRUCT_OPS_SLEEPABLE(maximal_init)
+{
+ return 0;
@@ -13323,6 +16089,12 @@ index 000000000000..44612fdaf399
+ .enable = maximal_enable,
+ .exit_task = maximal_exit_task,
+ .disable = maximal_disable,
++ .cgroup_init = maximal_cgroup_init,
++ .cgroup_exit = maximal_cgroup_exit,
++ .cgroup_prep_move = maximal_cgroup_prep_move,
++ .cgroup_move = maximal_cgroup_move,
++ .cgroup_cancel_move = maximal_cgroup_cancel_move,
++ .cgroup_set_weight = maximal_cgroup_set_weight,
+ .init = maximal_init,
+ .exit = maximal_exit,
+ .name = "maximal",
@@ -15130,3 +17902,6 @@ index 000000000000..bc13dfec1267
+int file_write_long(const char *path, long val);
+
+#endif // __SCX_TEST_H__
+--
+2.47.0.rc0
+