diff options
author | Jan200101 <sentrycraft123@gmail.com> | 2024-07-17 19:41:57 +0200 |
---|---|---|
committer | Jan200101 <sentrycraft123@gmail.com> | 2024-07-17 19:41:57 +0200 |
commit | 82bbf281341ef6fdc89bf3cd4b8f9e49884deccd (patch) | |
tree | 7d0f2ea8b4cf6bbab4f9413f4625176115069842 /SOURCES | |
parent | 805152f39f74846f5b07f681b55e3356907bd428 (diff) | |
download | kernel-fsync-82bbf281341ef6fdc89bf3cd4b8f9e49884deccd.tar.gz kernel-fsync-82bbf281341ef6fdc89bf3cd4b8f9e49884deccd.zip |
kernel 6.9.9
Diffstat (limited to 'SOURCES')
33 files changed, 15714 insertions, 55 deletions
diff --git a/SOURCES/0001-add-ally-x-dmi-quirk-for-controller-suspend.patch b/SOURCES/0001-add-ally-x-dmi-quirk-for-controller-suspend.patch new file mode 100644 index 0000000..e0753ae --- /dev/null +++ b/SOURCES/0001-add-ally-x-dmi-quirk-for-controller-suspend.patch @@ -0,0 +1,48 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Jan200101 <sentrycraft123@gmail.com> +Date: Mon, 15 Jul 2024 19:31:40 +0200 +Subject: [PATCH] add ally x dmi quirk for controller suspend + +Signed-off-by: Jan200101 <sentrycraft123@gmail.com> +--- + drivers/platform/x86/asus-wmi.c | 3 +-- + include/linux/platform_data/x86/asus-wmi.h | 14 ++++++++++++++ + 2 files changed, 15 insertions(+), 2 deletions(-) + +diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c +index 4b78c4f4e..a8060e7fb 100644 +--- a/drivers/platform/x86/asus-wmi.c ++++ b/drivers/platform/x86/asus-wmi.c +@@ -4646,8 +4646,7 @@ static int asus_wmi_add(struct platform_device *pdev) + asus->egpu_enable_available = asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_EGPU); + asus->dgpu_disable_available = asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_DGPU); + asus->kbd_rgb_state_available = asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_TUF_RGB_STATE); +- asus->ally_mcu_usb_switch = acpi_has_method(NULL, ASUS_USB0_PWR_EC0_CSEE) +- && dmi_match(DMI_BOARD_NAME, "RC71L"); ++ asus->ally_mcu_usb_switch = acpi_has_method(NULL, ASUS_USB0_PWR_EC0_CSEE) && dmi_check_system(ally_usb_switch_dmi_ids); + + if (asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_MINI_LED_MODE)) + asus->mini_led_dev_id = ASUS_WMI_DEVID_MINI_LED_MODE; +diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h +index 870f4bb57..3e8edc07e 100644 +--- a/include/linux/platform_data/x86/asus-wmi.h ++++ b/include/linux/platform_data/x86/asus-wmi.h +@@ -225,4 +225,18 @@ static const struct dmi_system_id asus_use_hid_led_dmi_ids[] = { + NULL, + }; + ++static const struct dmi_system_id 
ally_usb_switch_dmi_ids[] = { ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "RC71L"), ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "RC72LA"), ++ }, ++ }, ++ NULL, ++}; ++ + #endif /* __PLATFORM_DATA_X86_ASUS_WMI_H */ diff --git a/SOURCES/0001-add-revoke_all-ioctl-to-release-event-and-joy-nodes-.patch b/SOURCES/0001-add-revoke_all-ioctl-to-release-event-and-joy-nodes-.patch new file mode 100644 index 0000000..71d9c59 --- /dev/null +++ b/SOURCES/0001-add-revoke_all-ioctl-to-release-event-and-joy-nodes-.patch @@ -0,0 +1,208 @@ +From e24eba6f9ffd2338028116ddc1e14ba5b68b997a Mon Sep 17 00:00:00 2001 +From: antheas <antheas@users.noreply.github.com> +Date: Wed, 17 Jul 2024 17:14:06 +0300 +Subject: [PATCH] add revoke_all ioctl to release event and joy nodes after + hiding + +--- + drivers/input/evdev.c | 24 ++++++++++++++++ + drivers/input/joydev.c | 54 ++++++++++++++++++++++++++++++----- + include/uapi/linux/input.h | 1 + + include/uapi/linux/joystick.h | 4 +++ + 4 files changed, 76 insertions(+), 7 deletions(-) + +diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c +index 51e0c4954600..87226069d076 100644 +--- a/drivers/input/evdev.c ++++ b/drivers/input/evdev.c +@@ -951,6 +951,21 @@ static int evdev_revoke(struct evdev *evdev, struct evdev_client *client, + return 0; + } + ++static int evdev_revoke_all(struct evdev *evdev, struct file *file) ++{ ++ struct evdev_client *client; ++ input_flush_device(&evdev->handle, file); ++ ++ spin_lock(&evdev->client_lock); ++ list_for_each_entry(client, &evdev->client_list, node) { ++ client->revoked = true; ++ evdev_ungrab(evdev, client); ++ wake_up_interruptible_poll(&client->wait, EPOLLHUP | EPOLLERR); ++ } ++ spin_unlock(&evdev->client_lock); ++ return 0; ++} ++ + /* must be called with evdev-mutex held */ + static int evdev_set_mask(struct evdev_client *client, + unsigned int type, +@@ -1094,6 +1109,15 @@ static long evdev_do_ioctl(struct file *file, unsigned int cmd, + return -EINVAL; + else + return 
evdev_revoke(evdev, client, file); ++ ++ case EVIOCREVOKEALL: ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EACCES; ++ ++ if (p) ++ return -EINVAL; ++ else ++ return evdev_revoke_all(evdev, file); + + case EVIOCGMASK: { + void __user *codes_ptr; +diff --git a/drivers/input/joydev.c b/drivers/input/joydev.c +index 5824bca02e5a..3bdf3a1971f7 100644 +--- a/drivers/input/joydev.c ++++ b/drivers/input/joydev.c +@@ -63,8 +63,29 @@ struct joydev_client { + struct fasync_struct *fasync; + struct joydev *joydev; + struct list_head node; ++ bool revoked; + }; + ++static int joydev_revoke(struct joydev *joydev, struct joydev_client *client) ++{ ++ client->revoked = true; ++ wake_up_interruptible(&joydev->wait); ++ return 0; ++} ++ ++static int joydev_revoke_all(struct joydev *joydev) ++{ ++ struct joydev_client *client; ++ ++ spin_lock(&joydev->client_lock); ++ list_for_each_entry(client, &joydev->client_list, node) { ++ client->revoked = true; ++ } ++ spin_unlock(&joydev->client_lock); ++ wake_up_interruptible(&joydev->wait); ++ return 0; ++} ++ + static int joydev_correct(int value, struct js_corr *corr) + { + switch (corr->type) { +@@ -89,6 +110,9 @@ static void joydev_pass_event(struct joydev_client *client, + struct js_event *event) + { + struct joydev *joydev = client->joydev; ++ ++ if (client->revoked) ++ return; + + /* + * IRQs already disabled, just acquire the lock +@@ -345,6 +369,9 @@ static ssize_t joydev_0x_read(struct joydev_client *client, + struct JS_DATA_TYPE data; + int i; + ++ if (client->revoked) ++ return -ENODEV; ++ + spin_lock_irq(&input->event_lock); + + /* +@@ -402,7 +429,7 @@ static ssize_t joydev_read(struct file *file, char __user *buf, + return -EAGAIN; + + retval = wait_event_interruptible(joydev->wait, +- !joydev->exist || joydev_data_pending(client)); ++ !joydev->exist || client->revoked || joydev_data_pending(client)); + if (retval) + return retval; + +@@ -438,7 +465,7 @@ static __poll_t joydev_poll(struct file *file, poll_table *wait) + + 
poll_wait(file, &joydev->wait, wait); + return (joydev_data_pending(client) ? (EPOLLIN | EPOLLRDNORM) : 0) | +- (joydev->exist ? 0 : (EPOLLHUP | EPOLLERR)); ++ (joydev->exist && !client->revoked ? 0 : (EPOLLHUP | EPOLLERR)); + } + + static int joydev_handle_JSIOCSAXMAP(struct joydev *joydev, +@@ -506,9 +533,8 @@ static int joydev_handle_JSIOCSBTNMAP(struct joydev *joydev, + return retval; + } + +- +-static int joydev_ioctl_common(struct joydev *joydev, +- unsigned int cmd, void __user *argp) ++static int joydev_ioctl_common(struct joydev *joydev, struct joydev_client *client, ++ unsigned int cmd, void __user *argp) + { + struct input_dev *dev = joydev->handle.dev; + size_t len; +@@ -556,6 +582,20 @@ static int joydev_ioctl_common(struct joydev *joydev, + return copy_to_user(argp, joydev->corr, + sizeof(joydev->corr[0]) * joydev->nabs) ? -EFAULT : 0; + ++ case JSIOCREVOKE: ++ if (argp) ++ return -EINVAL; ++ else ++ return joydev_revoke(joydev, client); ++ ++ case JSIOCREVOKEALL: ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EACCES; ++ ++ if (argp) ++ return -EINVAL; ++ else ++ return joydev_revoke_all(joydev); + } + + /* +@@ -649,7 +689,7 @@ static long joydev_compat_ioctl(struct file *file, + break; + + default: +- retval = joydev_ioctl_common(joydev, cmd, argp); ++ retval = joydev_ioctl_common(joydev, client, cmd, argp); + break; + } + +@@ -699,7 +739,7 @@ static long joydev_ioctl(struct file *file, + break; + + default: +- retval = joydev_ioctl_common(joydev, cmd, argp); ++ retval = joydev_ioctl_common(joydev, client, cmd, argp); + break; + } + out: +diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h +index 2557eb7b0561..38bfac937add 100644 +--- a/include/uapi/linux/input.h ++++ b/include/uapi/linux/input.h +@@ -185,6 +185,7 @@ struct input_mask { + + #define EVIOCGRAB _IOW('E', 0x90, int) /* Grab/Release device */ + #define EVIOCREVOKE _IOW('E', 0x91, int) /* Revoke device access */ ++#define EVIOCREVOKEALL _IOW('E', 0x94, int) /* Revoke device 
access from all clients. Requires CAP_SYS_ADMIN. */ + + /** + * EVIOCGMASK - Retrieve current event mask +diff --git a/include/uapi/linux/joystick.h b/include/uapi/linux/joystick.h +index 192bf2cf182d..543b004802f3 100644 +--- a/include/uapi/linux/joystick.h ++++ b/include/uapi/linux/joystick.h +@@ -66,6 +66,10 @@ struct js_event { + #define JSIOCSBTNMAP _IOW('j', 0x33, __u16[KEY_MAX - BTN_MISC + 1]) /* set button mapping */ + #define JSIOCGBTNMAP _IOR('j', 0x34, __u16[KEY_MAX - BTN_MISC + 1]) /* get button mapping */ + ++#define JSIOCREVOKE _IOW('j', 0x91, int) /* Revoke device access */ ++#define JSIOCREVOKEALL _IOW('j', 0x94, int) /* Revoke device access from all clients. Requires CAP_SYS_ADMIN. */ ++ ++ + /* + * Types and constants for get/set correction + */ +-- +2.45.2 + diff --git a/SOURCES/0001-add-support-for-ally-x-mcu-switch.patch b/SOURCES/0001-add-support-for-ally-x-mcu-switch.patch deleted file mode 100644 index bffa8c5..0000000 --- a/SOURCES/0001-add-support-for-ally-x-mcu-switch.patch +++ /dev/null @@ -1,28 +0,0 @@ -From 79d958eccfa4a1cfbb552032e9542f03333005e7 Mon Sep 17 00:00:00 2001 -From: antheas <antheas@users.noreply.github.com> -Date: Mon, 15 Jul 2024 00:00:45 +0300 -Subject: [PATCH] add ally x dmi quirk for controller suspend - ---- - drivers/platform/x86/asus-wmi.c | 6 ++++-- - 1 file changed, 4 insertions(+), 2 deletions(-) - -diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c -index 3f9b6285c9a6..8e362726b703 100644 ---- a/drivers/platform/x86/asus-wmi.c -+++ b/drivers/platform/x86/asus-wmi.c -@@ -4645,8 +4645,10 @@ static int asus_wmi_add(struct platform_device *pdev) - asus->egpu_enable_available = asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_EGPU); - asus->dgpu_disable_available = asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_DGPU); - asus->kbd_rgb_state_available = asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_TUF_RGB_STATE); -- asus->ally_mcu_usb_switch = acpi_has_method(NULL, ASUS_USB0_PWR_EC0_CSEE) -- && 
dmi_match(DMI_BOARD_NAME, "RC71L"); -+ asus->ally_mcu_usb_switch = -+ acpi_has_method(NULL, ASUS_USB0_PWR_EC0_CSEE) && -+ (dmi_match(DMI_BOARD_NAME, "RC71L") || -+ dmi_match(DMI_BOARD_NAME, "RC72LA")); - - if (asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_MINI_LED_MODE)) - asus->mini_led_dev_id = ASUS_WMI_DEVID_MINI_LED_MODE; --- -2.45.2 diff --git a/SOURCES/0001-fix-audio.patch b/SOURCES/0001-ally-x-audio-quirk.patch index 3f2c588..0fd97b1 100644 --- a/SOURCES/0001-fix-audio.patch +++ b/SOURCES/0001-ally-x-audio-quirk.patch @@ -1,7 +1,7 @@ From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From: Jan200101 <sentrycraft123@gmail.com> Date: Sat, 13 Jul 2024 07:34:05 +0200 -Subject: [PATCH] fix audio +Subject: [PATCH] Add Rog Ally X tas2781 audio quirk Signed-off-by: Jan200101 <sentrycraft123@gmail.com> --- diff --git a/SOURCES/cachy-bore.patch b/SOURCES/cachy-bore.patch index 9f0fcc4..c321203 100644 --- a/SOURCES/cachy-bore.patch +++ b/SOURCES/cachy-bore.patch @@ -1,21 +1,22 @@ -From 61efc6f62710e42ca623e209f901394d4f356cc9 Mon Sep 17 00:00:00 2001 -From: Peter Jung <admin@ptr1337.dev> -Date: Thu, 30 May 2024 10:45:04 +0200 +From 699662da34346e7dfea9523fb4ae2b18287f527c Mon Sep 17 00:00:00 2001 +From: Piotr Gorski <lucjan.lucjanov@gmail.com> +Date: Thu, 4 Jul 2024 21:28:26 +0200 Subject: [PATCH] bore -Signed-off-by: Peter Jung <admin@ptr1337.dev> +Signed-off-by: Piotr Gorski <lucjan.lucjanov@gmail.com> --- include/linux/sched.h | 10 ++ init/Kconfig | 17 +++ + kernel/Kconfig.hz | 16 +++ kernel/sched/core.c | 143 +++++++++++++++++++++ kernel/sched/debug.c | 60 ++++++++- - kernel/sched/fair.c | 272 +++++++++++++++++++++++++++++++++++++--- - kernel/sched/features.h | 20 ++- - kernel/sched/sched.h | 7 ++ - 7 files changed, 512 insertions(+), 17 deletions(-) + kernel/sched/fair.c | 275 +++++++++++++++++++++++++++++++++++++--- + kernel/sched/features.h | 28 +++- + kernel/sched/sched.h | 7 + + 8 files changed, 538 insertions(+), 18 deletions(-) diff --git 
a/include/linux/sched.h b/include/linux/sched.h -index 3c2abbc587b49..e7bf3a034aa20 100644 +index 3c2abbc58..e7bf3a034 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -547,6 +547,16 @@ struct sched_entity { @@ -36,7 +37,7 @@ index 3c2abbc587b49..e7bf3a034aa20 100644 u64 slice; diff --git a/init/Kconfig b/init/Kconfig -index 664bedb9a71fb..6f9c7fc90707a 100644 +index 459f44ef7..17385c859 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1279,6 +1279,23 @@ config CHECKPOINT_RESTORE @@ -63,8 +64,34 @@ index 664bedb9a71fb..6f9c7fc90707a 100644 config SCHED_AUTOGROUP bool "Automatic process group scheduling" select CGROUPS +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 38ef6d068..5f6eecd1e 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -55,5 +55,21 @@ config HZ + default 300 if HZ_300 + default 1000 if HZ_1000 + ++config MIN_BASE_SLICE_NS ++ int "Default value for min_base_slice_ns" ++ default 2000000 ++ help ++ The BORE Scheduler automatically calculates the optimal base ++ slice for the configured HZ using the following equation: ++ ++ base_slice_ns = max(min_base_slice_ns, 1000000000/HZ) ++ ++ This option sets the default lower bound limit of the base slice ++ to prevent the loss of task throughput due to overscheduling. ++ ++ Setting this value too high can cause the system to boot with ++ an unnecessarily large base slice, resulting in high scheduling ++ latency and poor system responsiveness. 
++ + config SCHED_HRTICK + def_bool HIGH_RES_TIMERS diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index d211d40a2edc9..362df741dc85e 100644 +index d211d40a2..b6b2aa707 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4516,6 +4516,138 @@ int wake_up_state(struct task_struct *p, unsigned int state) @@ -84,7 +111,7 @@ index d211d40a2edc9..362df741dc85e 100644 + init_task.se.child_burst_last_cached = 0; +} + -+void inline sched_fork_bore(struct task_struct *p) { ++inline void sched_fork_bore(struct task_struct *p) { + p->se.burst_time = 0; + p->se.curr_burst_penalty = 0; + p->se.burst_score = 0; @@ -232,14 +259,14 @@ index d211d40a2edc9..362df741dc85e 100644 +#ifdef CONFIG_SCHED_BORE + sched_init_bore(); -+ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 5.2.0 by Masahito Suzuki"); ++ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 5.2.4 by Masahito Suzuki"); +#endif // CONFIG_SCHED_BORE + wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index 8d5d98a5834df..b1786126171e9 100644 +index 8d5d98a58..b17861261 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -167,7 +167,52 @@ static const struct file_operations sched_feat_fops = { @@ -346,7 +373,7 @@ index 8d5d98a5834df..b1786126171e9 100644 P(se.avg.runnable_sum); P(se.avg.util_sum); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 213c94d027a4c..3c2d149d0235d 100644 +index 213c94d02..6dffa3419 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -19,6 +19,9 @@ @@ -383,7 +410,7 @@ index 213c94d027a4c..3c2d149d0235d 100644 +#ifdef CONFIG_SCHED_BORE +unsigned int sysctl_sched_base_slice = 1000000000ULL / HZ; +static unsigned int configured_sched_base_slice = 1000000000ULL / HZ; -+unsigned int sysctl_sched_min_base_slice = 2000000ULL; ++unsigned int sysctl_sched_min_base_slice = CONFIG_MIN_BASE_SLICE_NS; +#else // !CONFIG_SCHED_BORE 
unsigned int sysctl_sched_base_slice = 750000ULL; static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; @@ -705,7 +732,17 @@ index 213c94d027a4c..3c2d149d0235d 100644 if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) { struct sched_entity *curr = cfs_rq->curr; unsigned long load; -@@ -5472,7 +5697,7 @@ pick_next_entity(struct cfs_rq *cfs_rq) +@@ -5258,7 +5483,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + * on average, halfway through their slice, as such start tasks + * off with half a slice to ease into the competition. + */ +- if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL)) ++ if ((sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL)) || ++ (sched_feat(PLACE_DEADLINE_WAKEUP) && (flags & ENQUEUE_WAKEUP))) + vslice /= 2; + + /* +@@ -5472,7 +5698,7 @@ pick_next_entity(struct cfs_rq *cfs_rq) cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) return cfs_rq->next; @@ -714,7 +751,7 @@ index 213c94d027a4c..3c2d149d0235d 100644 } static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); -@@ -6835,6 +7060,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) +@@ -6835,6 +7061,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) bool was_sched_idle = sched_idle_rq(rq); util_est_dequeue(&rq->cfs, p); @@ -729,7 +766,7 @@ index 213c94d027a4c..3c2d149d0235d 100644 for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); -@@ -8369,10 +8602,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int +@@ -8369,10 +8603,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int cfs_rq = cfs_rq_of(se); update_curr(cfs_rq); @@ -741,7 +778,7 @@ index 213c94d027a4c..3c2d149d0235d 100644 goto preempt; return; -@@ -8590,16 +8820,25 @@ static void yield_task_fair(struct rq *rq) +@@ -8590,16 +8821,25 @@ static void yield_task_fair(struct rq *rq) /* * Are we the only task in the tree? 
*/ @@ -767,7 +804,7 @@ index 213c94d027a4c..3c2d149d0235d 100644 /* * Tell update_rq_clock() that we've just updated, * so we don't do microscopic update in schedule() -@@ -12660,6 +12899,9 @@ static void task_fork_fair(struct task_struct *p) +@@ -12660,6 +12900,9 @@ static void task_fork_fair(struct task_struct *p) curr = cfs_rq->curr; if (curr) update_curr(cfs_rq); @@ -778,10 +815,10 @@ index 213c94d027a4c..3c2d149d0235d 100644 rq_unlock(rq, &rf); } diff --git a/kernel/sched/features.h b/kernel/sched/features.h -index 143f55df890b1..bfeb9f65383d9 100644 +index 143f55df8..9ad25e4e7 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h -@@ -5,8 +5,26 @@ +@@ -5,8 +5,34 @@ * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled. */ SCHED_FEAT(PLACE_LAG, true) @@ -791,6 +828,14 @@ index 143f55df890b1..bfeb9f65383d9 100644 SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) -SCHED_FEAT(RUN_TO_PARITY, true) +/* ++ * Give waken tasks half a slice to ease into the competition. ++ */ ++#ifdef CONFIG_SCHED_BORE ++SCHED_FEAT(PLACE_DEADLINE_WAKEUP, true) ++#else // !CONFIG_SCHED_BORE ++SCHED_FEAT(PLACE_DEADLINE_WAKEUP, false) ++#endif // CONFIG_SCHED_BORE ++/* + * Inhibit (wakeup) preemption until the current task has exhausted its slice. 
+ */ +#ifdef CONFIG_SCHED_BORE @@ -810,7 +855,7 @@ index 143f55df890b1..bfeb9f65383d9 100644 /* * Prefer to schedule the task we woke last (assuming it failed diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index ae50f212775e5..8c976d27f6e9c 100644 +index ae50f2127..8c976d27f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1967,7 +1967,11 @@ static inline void dirty_sched_domain_sysctl(int cpu) @@ -836,5 +881,5 @@ index ae50f212775e5..8c976d27f6e9c 100644 #ifdef CONFIG_SCHED_DEBUG extern int sysctl_resched_latency_warn_ms; -- -2.45.1 +2.45.2.606.g9005149a4a diff --git a/SOURCES/kernel-aarch64-16k-debug-fedora.config b/SOURCES/kernel-aarch64-16k-debug-fedora.config index 0604ee6..4c44c22 100644 --- a/SOURCES/kernel-aarch64-16k-debug-fedora.config +++ b/SOURCES/kernel-aarch64-16k-debug-fedora.config @@ -10090,6 +10090,8 @@ CONFIG_NTSYNC=y CONFIG_USER_NS_UNPRIVILEGED=y CONFIG_TCP_CONG_BBR2=m CONFIG_SCHED_BORE=y +CONFIG_MIN_BASE_SLICE_NS=1000000 +CONFIG_SCHED_CLASS_EXT=y CONFIG_HID_IPTS=m CONFIG_HID_ITHC=m CONFIG_SURFACE_BOOK1_DGPU_SWITCH=m diff --git a/SOURCES/kernel-aarch64-16k-fedora.config b/SOURCES/kernel-aarch64-16k-fedora.config index 42fd197..6ef0ca5 100644 --- a/SOURCES/kernel-aarch64-16k-fedora.config +++ b/SOURCES/kernel-aarch64-16k-fedora.config @@ -10061,6 +10061,8 @@ CONFIG_NTSYNC=y CONFIG_USER_NS_UNPRIVILEGED=y CONFIG_TCP_CONG_BBR2=m CONFIG_SCHED_BORE=y +CONFIG_MIN_BASE_SLICE_NS=1000000 +CONFIG_SCHED_CLASS_EXT=y CONFIG_HID_IPTS=m CONFIG_HID_ITHC=m CONFIG_SURFACE_BOOK1_DGPU_SWITCH=m diff --git a/SOURCES/kernel-aarch64-64k-debug-rhel.config b/SOURCES/kernel-aarch64-64k-debug-rhel.config index c0f798c..6b426c3 100644 --- a/SOURCES/kernel-aarch64-64k-debug-rhel.config +++ b/SOURCES/kernel-aarch64-64k-debug-rhel.config @@ -8125,6 +8125,8 @@ CONFIG_NTSYNC=y CONFIG_USER_NS_UNPRIVILEGED=y CONFIG_TCP_CONG_BBR2=m CONFIG_SCHED_BORE=y +CONFIG_MIN_BASE_SLICE_NS=1000000 +CONFIG_SCHED_CLASS_EXT=y CONFIG_HID_IPTS=m CONFIG_HID_ITHC=m 
CONFIG_SURFACE_BOOK1_DGPU_SWITCH=m diff --git a/SOURCES/kernel-aarch64-64k-rhel.config b/SOURCES/kernel-aarch64-64k-rhel.config index 16cdc94..a0b10ae 100644 --- a/SOURCES/kernel-aarch64-64k-rhel.config +++ b/SOURCES/kernel-aarch64-64k-rhel.config @@ -8100,6 +8100,8 @@ CONFIG_NTSYNC=y CONFIG_USER_NS_UNPRIVILEGED=y CONFIG_TCP_CONG_BBR2=m CONFIG_SCHED_BORE=y +CONFIG_MIN_BASE_SLICE_NS=1000000 +CONFIG_SCHED_CLASS_EXT=y CONFIG_HID_IPTS=m CONFIG_HID_ITHC=m CONFIG_SURFACE_BOOK1_DGPU_SWITCH=m diff --git a/SOURCES/kernel-aarch64-debug-fedora.config b/SOURCES/kernel-aarch64-debug-fedora.config index 39ea91b..6411934 100644 --- a/SOURCES/kernel-aarch64-debug-fedora.config +++ b/SOURCES/kernel-aarch64-debug-fedora.config @@ -10089,6 +10089,8 @@ CONFIG_NTSYNC=y CONFIG_USER_NS_UNPRIVILEGED=y CONFIG_TCP_CONG_BBR2=m CONFIG_SCHED_BORE=y +CONFIG_MIN_BASE_SLICE_NS=1000000 +CONFIG_SCHED_CLASS_EXT=y CONFIG_HID_IPTS=m CONFIG_HID_ITHC=m CONFIG_SURFACE_BOOK1_DGPU_SWITCH=m diff --git a/SOURCES/kernel-aarch64-debug-rhel.config b/SOURCES/kernel-aarch64-debug-rhel.config index f1cde0d..a2357fb 100644 --- a/SOURCES/kernel-aarch64-debug-rhel.config +++ b/SOURCES/kernel-aarch64-debug-rhel.config @@ -8121,6 +8121,8 @@ CONFIG_NTSYNC=y CONFIG_USER_NS_UNPRIVILEGED=y CONFIG_TCP_CONG_BBR2=m CONFIG_SCHED_BORE=y +CONFIG_MIN_BASE_SLICE_NS=1000000 +CONFIG_SCHED_CLASS_EXT=y CONFIG_HID_IPTS=m CONFIG_HID_ITHC=m CONFIG_SURFACE_BOOK1_DGPU_SWITCH=m diff --git a/SOURCES/kernel-aarch64-fedora.config b/SOURCES/kernel-aarch64-fedora.config index 4db5583..912cde2 100644 --- a/SOURCES/kernel-aarch64-fedora.config +++ b/SOURCES/kernel-aarch64-fedora.config @@ -10060,6 +10060,8 @@ CONFIG_NTSYNC=y CONFIG_USER_NS_UNPRIVILEGED=y CONFIG_TCP_CONG_BBR2=m CONFIG_SCHED_BORE=y +CONFIG_MIN_BASE_SLICE_NS=1000000 +CONFIG_SCHED_CLASS_EXT=y CONFIG_HID_IPTS=m CONFIG_HID_ITHC=m CONFIG_SURFACE_BOOK1_DGPU_SWITCH=m diff --git a/SOURCES/kernel-aarch64-rhel.config b/SOURCES/kernel-aarch64-rhel.config index a15f3ad..17340d8 100644 --- 
a/SOURCES/kernel-aarch64-rhel.config +++ b/SOURCES/kernel-aarch64-rhel.config @@ -8096,6 +8096,8 @@ CONFIG_NTSYNC=y CONFIG_USER_NS_UNPRIVILEGED=y CONFIG_TCP_CONG_BBR2=m CONFIG_SCHED_BORE=y +CONFIG_MIN_BASE_SLICE_NS=1000000 +CONFIG_SCHED_CLASS_EXT=y CONFIG_HID_IPTS=m CONFIG_HID_ITHC=m CONFIG_SURFACE_BOOK1_DGPU_SWITCH=m diff --git a/SOURCES/kernel-aarch64-rt-debug-rhel.config b/SOURCES/kernel-aarch64-rt-debug-rhel.config index e31e543..2230ac1 100644 --- a/SOURCES/kernel-aarch64-rt-debug-rhel.config +++ b/SOURCES/kernel-aarch64-rt-debug-rhel.config @@ -8181,6 +8181,8 @@ CONFIG_NTSYNC=y CONFIG_USER_NS_UNPRIVILEGED=y CONFIG_TCP_CONG_BBR2=m CONFIG_SCHED_BORE=y +CONFIG_MIN_BASE_SLICE_NS=1000000 +CONFIG_SCHED_CLASS_EXT=y CONFIG_HID_IPTS=m CONFIG_HID_ITHC=m CONFIG_SURFACE_BOOK1_DGPU_SWITCH=m diff --git a/SOURCES/kernel-aarch64-rt-rhel.config b/SOURCES/kernel-aarch64-rt-rhel.config index acb67fc..d1b854a 100644 --- a/SOURCES/kernel-aarch64-rt-rhel.config +++ b/SOURCES/kernel-aarch64-rt-rhel.config @@ -8156,6 +8156,8 @@ CONFIG_NTSYNC=y CONFIG_USER_NS_UNPRIVILEGED=y CONFIG_TCP_CONG_BBR2=m CONFIG_SCHED_BORE=y +CONFIG_MIN_BASE_SLICE_NS=1000000 +CONFIG_SCHED_CLASS_EXT=y CONFIG_HID_IPTS=m CONFIG_HID_ITHC=m CONFIG_SURFACE_BOOK1_DGPU_SWITCH=m diff --git a/SOURCES/kernel-ppc64le-debug-fedora.config b/SOURCES/kernel-ppc64le-debug-fedora.config index 71e353f..da78efe 100644 --- a/SOURCES/kernel-ppc64le-debug-fedora.config +++ b/SOURCES/kernel-ppc64le-debug-fedora.config @@ -8341,6 +8341,8 @@ CONFIG_NTSYNC=y CONFIG_USER_NS_UNPRIVILEGED=y CONFIG_TCP_CONG_BBR2=m CONFIG_SCHED_BORE=y +CONFIG_MIN_BASE_SLICE_NS=1000000 +CONFIG_SCHED_CLASS_EXT=y CONFIG_HID_IPTS=m CONFIG_HID_ITHC=m CONFIG_SURFACE_BOOK1_DGPU_SWITCH=m diff --git a/SOURCES/kernel-ppc64le-debug-rhel.config b/SOURCES/kernel-ppc64le-debug-rhel.config index a0f2d00..8d613cf 100644 --- a/SOURCES/kernel-ppc64le-debug-rhel.config +++ b/SOURCES/kernel-ppc64le-debug-rhel.config @@ -7589,6 +7589,8 @@ CONFIG_NTSYNC=y 
CONFIG_USER_NS_UNPRIVILEGED=y CONFIG_TCP_CONG_BBR2=m CONFIG_SCHED_BORE=y +CONFIG_MIN_BASE_SLICE_NS=1000000 +CONFIG_SCHED_CLASS_EXT=y CONFIG_HID_IPTS=m CONFIG_HID_ITHC=m CONFIG_SURFACE_BOOK1_DGPU_SWITCH=m diff --git a/SOURCES/kernel-ppc64le-fedora.config b/SOURCES/kernel-ppc64le-fedora.config index c4d4b39..ffda948 100644 --- a/SOURCES/kernel-ppc64le-fedora.config +++ b/SOURCES/kernel-ppc64le-fedora.config @@ -8310,6 +8310,8 @@ CONFIG_NTSYNC=y CONFIG_USER_NS_UNPRIVILEGED=y CONFIG_TCP_CONG_BBR2=m CONFIG_SCHED_BORE=y +CONFIG_MIN_BASE_SLICE_NS=1000000 +CONFIG_SCHED_CLASS_EXT=y CONFIG_HID_IPTS=m CONFIG_HID_ITHC=m CONFIG_SURFACE_BOOK1_DGPU_SWITCH=m diff --git a/SOURCES/kernel-ppc64le-rhel.config b/SOURCES/kernel-ppc64le-rhel.config index d629774..a809663 100644 --- a/SOURCES/kernel-ppc64le-rhel.config +++ b/SOURCES/kernel-ppc64le-rhel.config @@ -7566,6 +7566,8 @@ CONFIG_NTSYNC=y CONFIG_USER_NS_UNPRIVILEGED=y CONFIG_TCP_CONG_BBR2=m CONFIG_SCHED_BORE=y +CONFIG_MIN_BASE_SLICE_NS=1000000 +CONFIG_SCHED_CLASS_EXT=y CONFIG_HID_IPTS=m CONFIG_HID_ITHC=m CONFIG_SURFACE_BOOK1_DGPU_SWITCH=m diff --git a/SOURCES/kernel-s390x-debug-fedora.config b/SOURCES/kernel-s390x-debug-fedora.config index 2b0e9c9..d1b1dce 100644 --- a/SOURCES/kernel-s390x-debug-fedora.config +++ b/SOURCES/kernel-s390x-debug-fedora.config @@ -8276,6 +8276,8 @@ CONFIG_NTSYNC=y CONFIG_USER_NS_UNPRIVILEGED=y CONFIG_TCP_CONG_BBR2=m CONFIG_SCHED_BORE=y +CONFIG_MIN_BASE_SLICE_NS=1000000 +CONFIG_SCHED_CLASS_EXT=y CONFIG_HID_IPTS=m CONFIG_HID_ITHC=m CONFIG_SURFACE_BOOK1_DGPU_SWITCH=m diff --git a/SOURCES/kernel-s390x-debug-rhel.config b/SOURCES/kernel-s390x-debug-rhel.config index 0775682..0330690 100644 --- a/SOURCES/kernel-s390x-debug-rhel.config +++ b/SOURCES/kernel-s390x-debug-rhel.config @@ -7571,6 +7571,8 @@ CONFIG_NTSYNC=y CONFIG_USER_NS_UNPRIVILEGED=y CONFIG_TCP_CONG_BBR2=m CONFIG_SCHED_BORE=y +CONFIG_MIN_BASE_SLICE_NS=1000000 +CONFIG_SCHED_CLASS_EXT=y CONFIG_HID_IPTS=m CONFIG_HID_ITHC=m 
CONFIG_SURFACE_BOOK1_DGPU_SWITCH=m diff --git a/SOURCES/kernel-s390x-fedora.config b/SOURCES/kernel-s390x-fedora.config index 62d88b8..8153e5e 100644 --- a/SOURCES/kernel-s390x-fedora.config +++ b/SOURCES/kernel-s390x-fedora.config @@ -8245,6 +8245,8 @@ CONFIG_NTSYNC=y CONFIG_USER_NS_UNPRIVILEGED=y CONFIG_TCP_CONG_BBR2=m CONFIG_SCHED_BORE=y +CONFIG_MIN_BASE_SLICE_NS=1000000 +CONFIG_SCHED_CLASS_EXT=y CONFIG_HID_IPTS=m CONFIG_HID_ITHC=m CONFIG_SURFACE_BOOK1_DGPU_SWITCH=m diff --git a/SOURCES/kernel-s390x-rhel.config b/SOURCES/kernel-s390x-rhel.config index d3d7844..9caba6f 100644 --- a/SOURCES/kernel-s390x-rhel.config +++ b/SOURCES/kernel-s390x-rhel.config @@ -7548,6 +7548,8 @@ CONFIG_NTSYNC=y CONFIG_USER_NS_UNPRIVILEGED=y CONFIG_TCP_CONG_BBR2=m CONFIG_SCHED_BORE=y +CONFIG_MIN_BASE_SLICE_NS=1000000 +CONFIG_SCHED_CLASS_EXT=y CONFIG_HID_IPTS=m CONFIG_HID_ITHC=m CONFIG_SURFACE_BOOK1_DGPU_SWITCH=m diff --git a/SOURCES/kernel-s390x-zfcpdump-rhel.config b/SOURCES/kernel-s390x-zfcpdump-rhel.config index ca19950..f39c27f 100644 --- a/SOURCES/kernel-s390x-zfcpdump-rhel.config +++ b/SOURCES/kernel-s390x-zfcpdump-rhel.config @@ -7571,6 +7571,8 @@ CONFIG_NTSYNC=y CONFIG_USER_NS_UNPRIVILEGED=y CONFIG_TCP_CONG_BBR2=m CONFIG_SCHED_BORE=y +CONFIG_MIN_BASE_SLICE_NS=1000000 +CONFIG_SCHED_CLASS_EXT=y CONFIG_HID_IPTS=m CONFIG_HID_ITHC=m CONFIG_SURFACE_BOOK1_DGPU_SWITCH=m diff --git a/SOURCES/kernel-x86_64-debug-fedora.config b/SOURCES/kernel-x86_64-debug-fedora.config index cc2952e..d4e1c16 100644 --- a/SOURCES/kernel-x86_64-debug-fedora.config +++ b/SOURCES/kernel-x86_64-debug-fedora.config @@ -8992,6 +8992,8 @@ CONFIG_NTSYNC=y CONFIG_USER_NS_UNPRIVILEGED=y CONFIG_TCP_CONG_BBR2=m CONFIG_SCHED_BORE=y +CONFIG_MIN_BASE_SLICE_NS=1000000 +CONFIG_SCHED_CLASS_EXT=y CONFIG_HID_IPTS=m CONFIG_HID_ITHC=m CONFIG_SURFACE_BOOK1_DGPU_SWITCH=m diff --git a/SOURCES/kernel-x86_64-debug-rhel.config b/SOURCES/kernel-x86_64-debug-rhel.config index 6c1983f..d814503 100644 --- 
a/SOURCES/kernel-x86_64-debug-rhel.config +++ b/SOURCES/kernel-x86_64-debug-rhel.config @@ -7966,6 +7966,8 @@ CONFIG_NTSYNC=y CONFIG_USER_NS_UNPRIVILEGED=y CONFIG_TCP_CONG_BBR2=m CONFIG_SCHED_BORE=y +CONFIG_MIN_BASE_SLICE_NS=1000000 +CONFIG_SCHED_CLASS_EXT=y CONFIG_HID_IPTS=m CONFIG_HID_ITHC=m CONFIG_SURFACE_BOOK1_DGPU_SWITCH=m diff --git a/SOURCES/kernel-x86_64-fedora.config b/SOURCES/kernel-x86_64-fedora.config index 91bf4f5..0c9353f 100644 --- a/SOURCES/kernel-x86_64-fedora.config +++ b/SOURCES/kernel-x86_64-fedora.config @@ -8962,6 +8962,8 @@ CONFIG_NTSYNC=y CONFIG_USER_NS_UNPRIVILEGED=y CONFIG_TCP_CONG_BBR2=m CONFIG_SCHED_BORE=y +CONFIG_MIN_BASE_SLICE_NS=1000000 +CONFIG_SCHED_CLASS_EXT=y CONFIG_HID_IPTS=m CONFIG_HID_ITHC=m CONFIG_SURFACE_BOOK1_DGPU_SWITCH=m diff --git a/SOURCES/kernel-x86_64-rhel.config b/SOURCES/kernel-x86_64-rhel.config index 9bf137e..35f7ad5 100644 --- a/SOURCES/kernel-x86_64-rhel.config +++ b/SOURCES/kernel-x86_64-rhel.config @@ -7942,6 +7942,8 @@ CONFIG_NTSYNC=y CONFIG_USER_NS_UNPRIVILEGED=y CONFIG_TCP_CONG_BBR2=m CONFIG_SCHED_BORE=y +CONFIG_MIN_BASE_SLICE_NS=1000000 +CONFIG_SCHED_CLASS_EXT=y CONFIG_HID_IPTS=m CONFIG_HID_ITHC=m CONFIG_SURFACE_BOOK1_DGPU_SWITCH=m diff --git a/SOURCES/kernel-x86_64-rt-debug-rhel.config b/SOURCES/kernel-x86_64-rt-debug-rhel.config index aa1a7f4..bae0b0e 100644 --- a/SOURCES/kernel-x86_64-rt-debug-rhel.config +++ b/SOURCES/kernel-x86_64-rt-debug-rhel.config @@ -8027,6 +8027,8 @@ CONFIG_NTSYNC=y CONFIG_USER_NS_UNPRIVILEGED=y CONFIG_TCP_CONG_BBR2=m CONFIG_SCHED_BORE=y +CONFIG_MIN_BASE_SLICE_NS=1000000 +CONFIG_SCHED_CLASS_EXT=y CONFIG_HID_IPTS=m CONFIG_HID_ITHC=m CONFIG_SURFACE_BOOK1_DGPU_SWITCH=m diff --git a/SOURCES/kernel-x86_64-rt-rhel.config b/SOURCES/kernel-x86_64-rt-rhel.config index 2c8ec13..37a5069 100644 --- a/SOURCES/kernel-x86_64-rt-rhel.config +++ b/SOURCES/kernel-x86_64-rt-rhel.config @@ -8003,6 +8003,8 @@ CONFIG_NTSYNC=y CONFIG_USER_NS_UNPRIVILEGED=y CONFIG_TCP_CONG_BBR2=m CONFIG_SCHED_BORE=y 
+CONFIG_MIN_BASE_SLICE_NS=1000000 +CONFIG_SCHED_CLASS_EXT=y CONFIG_HID_IPTS=m CONFIG_HID_ITHC=m CONFIG_SURFACE_BOOK1_DGPU_SWITCH=m diff --git a/SOURCES/kernel.changelog b/SOURCES/kernel.changelog index d8e3274..ef2af95 100644 --- a/SOURCES/kernel.changelog +++ b/SOURCES/kernel.changelog @@ -1,3 +1,7 @@ +* Thu Jul 11 2024 Augusto Caringi <acaringi@redhat.com> [6.9.9-0] +- Linux v6.9.9 +Resolves: + * Fri Jul 05 2024 Augusto Caringi <acaringi@redhat.com> [6.9.8-0] - Add BugsFixed for 6.9 (Justin M. Forbes) - Turn on USB_SERIAL_F81232 for Fedora (Justin M. Forbes) diff --git a/SOURCES/patch-6.9-redhat.patch b/SOURCES/patch-6.9-redhat.patch index 6503803..f9444f1 100644 --- a/SOURCES/patch-6.9-redhat.patch +++ b/SOURCES/patch-6.9-redhat.patch @@ -72,7 +72,7 @@ index 000000000000..733a26bd887a + +endmenu diff --git a/Makefile b/Makefile -index 060e20dba35e..58478aa0cc73 100644 +index cbe3a580ff48..45ff8af1947c 100644 --- a/Makefile +++ b/Makefile @@ -22,6 +22,18 @@ $(if $(filter __%, $(MAKECMDGOALS)), \ diff --git a/SOURCES/scx-kernel.patch b/SOURCES/scx-kernel.patch new file mode 100644 index 0000000..7ff03ff --- /dev/null +++ b/SOURCES/scx-kernel.patch @@ -0,0 +1,15332 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Jan200101 <sentrycraft123@gmail.com> +Date: Wed, 17 Jul 2024 14:30:16 +0200 +Subject: [PATCH] scx + +Signed-off-by: Jan200101 <sentrycraft123@gmail.com> +--- + Documentation/scheduler/index.rst | 1 + + Documentation/scheduler/sched-ext.rst | 316 + + MAINTAINERS | 13 + + include/asm-generic/vmlinux.lds.h | 1 + + include/linux/cgroup.h | 4 +- + include/linux/sched.h | 5 + + include/linux/sched/ext.h | 206 + + include/linux/sched/task.h | 3 +- + include/trace/events/sched_ext.h | 32 + + include/uapi/linux/sched.h | 1 + + init/init_task.c | 12 + + kernel/Kconfig.preempt | 26 +- + kernel/fork.c | 17 +- + kernel/sched/build_policy.c | 10 + + kernel/sched/core.c | 250 +- + kernel/sched/cpufreq_schedutil.c | 50 +- + 
kernel/sched/debug.c | 3 + + kernel/sched/ext.c | 6537 +++++++++++++++++ + kernel/sched/ext.h | 69 + + kernel/sched/fair.c | 27 +- + kernel/sched/idle.c | 2 + + kernel/sched/sched.h | 171 +- + lib/dump_stack.c | 1 + + tools/Makefile | 10 +- + tools/sched_ext/.gitignore | 2 + + tools/sched_ext/Makefile | 246 + + tools/sched_ext/README.md | 258 + + .../sched_ext/include/bpf-compat/gnu/stubs.h | 11 + + tools/sched_ext/include/scx/common.bpf.h | 401 + + tools/sched_ext/include/scx/common.h | 75 + + tools/sched_ext/include/scx/compat.bpf.h | 28 + + tools/sched_ext/include/scx/compat.h | 187 + + tools/sched_ext/include/scx/user_exit_info.h | 111 + + tools/sched_ext/scx_central.bpf.c | 361 + + tools/sched_ext/scx_central.c | 135 + + tools/sched_ext/scx_qmap.bpf.c | 706 ++ + tools/sched_ext/scx_qmap.c | 144 + + tools/sched_ext/scx_show_state.py | 39 + + tools/sched_ext/scx_simple.bpf.c | 156 + + tools/sched_ext/scx_simple.c | 107 + + tools/testing/selftests/sched_ext/.gitignore | 6 + + tools/testing/selftests/sched_ext/Makefile | 218 + + tools/testing/selftests/sched_ext/config | 9 + + .../selftests/sched_ext/create_dsq.bpf.c | 58 + + .../testing/selftests/sched_ext/create_dsq.c | 57 + + .../sched_ext/ddsp_bogus_dsq_fail.bpf.c | 42 + + .../selftests/sched_ext/ddsp_bogus_dsq_fail.c | 57 + + .../sched_ext/ddsp_vtimelocal_fail.bpf.c | 39 + + .../sched_ext/ddsp_vtimelocal_fail.c | 56 + + .../selftests/sched_ext/dsp_local_on.bpf.c | 65 + + .../selftests/sched_ext/dsp_local_on.c | 58 + + .../sched_ext/enq_last_no_enq_fails.bpf.c | 21 + + .../sched_ext/enq_last_no_enq_fails.c | 60 + + .../sched_ext/enq_select_cpu_fails.bpf.c | 43 + + .../sched_ext/enq_select_cpu_fails.c | 61 + + tools/testing/selftests/sched_ext/exit.bpf.c | 84 + + tools/testing/selftests/sched_ext/exit.c | 55 + + tools/testing/selftests/sched_ext/exit_test.h | 20 + + .../testing/selftests/sched_ext/hotplug.bpf.c | 61 + + tools/testing/selftests/sched_ext/hotplug.c | 168 + + .../selftests/sched_ext/hotplug_test.h 
| 15 + + .../sched_ext/init_enable_count.bpf.c | 53 + + .../selftests/sched_ext/init_enable_count.c | 166 + + .../testing/selftests/sched_ext/maximal.bpf.c | 132 + + tools/testing/selftests/sched_ext/maximal.c | 51 + + .../selftests/sched_ext/maybe_null.bpf.c | 36 + + .../testing/selftests/sched_ext/maybe_null.c | 49 + + .../sched_ext/maybe_null_fail_dsp.bpf.c | 25 + + .../sched_ext/maybe_null_fail_yld.bpf.c | 28 + + .../testing/selftests/sched_ext/minimal.bpf.c | 21 + + tools/testing/selftests/sched_ext/minimal.c | 58 + + .../selftests/sched_ext/prog_run.bpf.c | 32 + + tools/testing/selftests/sched_ext/prog_run.c | 78 + + .../testing/selftests/sched_ext/reload_loop.c | 75 + + tools/testing/selftests/sched_ext/runner.c | 201 + + tools/testing/selftests/sched_ext/scx_test.h | 131 + + .../selftests/sched_ext/select_cpu_dfl.bpf.c | 40 + + .../selftests/sched_ext/select_cpu_dfl.c | 72 + + .../sched_ext/select_cpu_dfl_nodispatch.bpf.c | 89 + + .../sched_ext/select_cpu_dfl_nodispatch.c | 72 + + .../sched_ext/select_cpu_dispatch.bpf.c | 41 + + .../selftests/sched_ext/select_cpu_dispatch.c | 70 + + .../select_cpu_dispatch_bad_dsq.bpf.c | 37 + + .../sched_ext/select_cpu_dispatch_bad_dsq.c | 56 + + .../select_cpu_dispatch_dbl_dsp.bpf.c | 38 + + .../sched_ext/select_cpu_dispatch_dbl_dsp.c | 56 + + .../sched_ext/select_cpu_vtime.bpf.c | 92 + + .../selftests/sched_ext/select_cpu_vtime.c | 59 + + .../selftests/sched_ext/test_example.c | 49 + + tools/testing/selftests/sched_ext/util.c | 71 + + tools/testing/selftests/sched_ext/util.h | 13 + + 91 files changed, 13860 insertions(+), 118 deletions(-) + create mode 100644 Documentation/scheduler/sched-ext.rst + create mode 100644 include/linux/sched/ext.h + create mode 100644 include/trace/events/sched_ext.h + create mode 100644 kernel/sched/ext.c + create mode 100644 kernel/sched/ext.h + create mode 100644 tools/sched_ext/.gitignore + create mode 100644 tools/sched_ext/Makefile + create mode 100644 tools/sched_ext/README.md + create 
mode 100644 tools/sched_ext/include/bpf-compat/gnu/stubs.h + create mode 100644 tools/sched_ext/include/scx/common.bpf.h + create mode 100644 tools/sched_ext/include/scx/common.h + create mode 100644 tools/sched_ext/include/scx/compat.bpf.h + create mode 100644 tools/sched_ext/include/scx/compat.h + create mode 100644 tools/sched_ext/include/scx/user_exit_info.h + create mode 100644 tools/sched_ext/scx_central.bpf.c + create mode 100644 tools/sched_ext/scx_central.c + create mode 100644 tools/sched_ext/scx_qmap.bpf.c + create mode 100644 tools/sched_ext/scx_qmap.c + create mode 100644 tools/sched_ext/scx_show_state.py + create mode 100644 tools/sched_ext/scx_simple.bpf.c + create mode 100644 tools/sched_ext/scx_simple.c + create mode 100644 tools/testing/selftests/sched_ext/.gitignore + create mode 100644 tools/testing/selftests/sched_ext/Makefile + create mode 100644 tools/testing/selftests/sched_ext/config + create mode 100644 tools/testing/selftests/sched_ext/create_dsq.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/create_dsq.c + create mode 100644 tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c + create mode 100644 tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c + create mode 100644 tools/testing/selftests/sched_ext/dsp_local_on.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/dsp_local_on.c + create mode 100644 tools/testing/selftests/sched_ext/enq_last_no_enq_fails.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c + create mode 100644 tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/enq_select_cpu_fails.c + create mode 100644 tools/testing/selftests/sched_ext/exit.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/exit.c + create mode 100644 
tools/testing/selftests/sched_ext/exit_test.h + create mode 100644 tools/testing/selftests/sched_ext/hotplug.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/hotplug.c + create mode 100644 tools/testing/selftests/sched_ext/hotplug_test.h + create mode 100644 tools/testing/selftests/sched_ext/init_enable_count.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/init_enable_count.c + create mode 100644 tools/testing/selftests/sched_ext/maximal.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/maximal.c + create mode 100644 tools/testing/selftests/sched_ext/maybe_null.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/maybe_null.c + create mode 100644 tools/testing/selftests/sched_ext/maybe_null_fail_dsp.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/maybe_null_fail_yld.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/minimal.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/minimal.c + create mode 100644 tools/testing/selftests/sched_ext/prog_run.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/prog_run.c + create mode 100644 tools/testing/selftests/sched_ext/reload_loop.c + create mode 100644 tools/testing/selftests/sched_ext/runner.c + create mode 100644 tools/testing/selftests/sched_ext/scx_test.h + create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl.c + create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c + create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch.c + create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c + create mode 100644 
tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c + create mode 100644 tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/select_cpu_vtime.c + create mode 100644 tools/testing/selftests/sched_ext/test_example.c + create mode 100644 tools/testing/selftests/sched_ext/util.c + create mode 100644 tools/testing/selftests/sched_ext/util.h + +diff --git a/Documentation/scheduler/index.rst b/Documentation/scheduler/index.rst +index 43bd8a145b7a..0611dc3dda8e 100644 +--- a/Documentation/scheduler/index.rst ++++ b/Documentation/scheduler/index.rst +@@ -20,6 +20,7 @@ Scheduler + sched-nice-design + sched-rt-group + sched-stats ++ sched-ext + sched-debug + + text_files +diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst +new file mode 100644 +index 000000000000..a707d2181a77 +--- /dev/null ++++ b/Documentation/scheduler/sched-ext.rst +@@ -0,0 +1,316 @@ ++========================== ++Extensible Scheduler Class ++========================== ++ ++sched_ext is a scheduler class whose behavior can be defined by a set of BPF ++programs - the BPF scheduler. ++ ++* sched_ext exports a full scheduling interface so that any scheduling ++ algorithm can be implemented on top. ++ ++* The BPF scheduler can group CPUs however it sees fit and schedule them ++ together, as tasks aren't tied to specific CPUs at the time of wakeup. ++ ++* The BPF scheduler can be turned on and off dynamically anytime. ++ ++* The system integrity is maintained no matter what the BPF scheduler does. ++ The default scheduling behavior is restored anytime an error is detected, ++ a runnable task stalls, or on invoking the SysRq key sequence ++ :kbd:`SysRq-S`. ++ ++* When the BPF scheduler triggers an error, debug information is dumped to ++ aid debugging. 
The debug dump is passed to and printed out by the ++ scheduler binary. The debug dump can also be accessed through the ++ `sched_ext_dump` tracepoint. The SysRq key sequence :kbd:`SysRq-D` ++ triggers a debug dump. This doesn't terminate the BPF scheduler and can ++ only be read through the tracepoint. ++ ++Switching to and from sched_ext ++=============================== ++ ++``CONFIG_SCHED_CLASS_EXT`` is the config option to enable sched_ext and ++``tools/sched_ext`` contains the example schedulers. The following config ++options should be enabled to use sched_ext: ++ ++.. code-block:: none ++ ++ CONFIG_BPF=y ++ CONFIG_SCHED_CLASS_EXT=y ++ CONFIG_BPF_SYSCALL=y ++ CONFIG_BPF_JIT=y ++ CONFIG_DEBUG_INFO_BTF=y ++ CONFIG_BPF_JIT_ALWAYS_ON=y ++ CONFIG_BPF_JIT_DEFAULT_ON=y ++ CONFIG_PAHOLE_HAS_SPLIT_BTF=y ++ CONFIG_PAHOLE_HAS_BTF_TAG=y ++ ++sched_ext is used only when the BPF scheduler is loaded and running. ++ ++If a task explicitly sets its scheduling policy to ``SCHED_EXT``, it will be ++treated as ``SCHED_NORMAL`` and scheduled by CFS until the BPF scheduler is ++loaded. ++ ++When the BPF scheduler is loaded and ``SCX_OPS_SWITCH_PARTIAL`` is not set ++in ``ops->flags``, all ``SCHED_NORMAL``, ``SCHED_BATCH``, ``SCHED_IDLE``, and ++``SCHED_EXT`` tasks are scheduled by sched_ext. ++ ++However, when the BPF scheduler is loaded and ``SCX_OPS_SWITCH_PARTIAL`` is ++set in ``ops->flags``, only tasks with the ``SCHED_EXT`` policy are scheduled ++by sched_ext, while tasks with ``SCHED_NORMAL``, ``SCHED_BATCH`` and ++``SCHED_IDLE`` policies are scheduled by CFS. ++ ++Terminating the sched_ext scheduler program, triggering :kbd:`SysRq-S`, or ++detection of any internal error including stalled runnable tasks aborts the ++BPF scheduler and reverts all tasks back to CFS. ++ ++.. 
code-block:: none ++ ++ # make -j16 -C tools/sched_ext ++ # tools/sched_ext/scx_simple ++ local=0 global=3 ++ local=5 global=24 ++ local=9 global=44 ++ local=13 global=56 ++ local=17 global=72 ++ ^CEXIT: BPF scheduler unregistered ++ ++The current status of the BPF scheduler can be determined as follows: ++ ++.. code-block:: none ++ ++ # cat /sys/kernel/sched_ext/state ++ enabled ++ # cat /sys/kernel/sched_ext/root/ops ++ simple ++ ++``tools/sched_ext/scx_show_state.py`` is a drgn script which shows more ++detailed information: ++ ++.. code-block:: none ++ ++ # tools/sched_ext/scx_show_state.py ++ ops : simple ++ enabled : 1 ++ switching_all : 1 ++ switched_all : 1 ++ enable_state : enabled (2) ++ bypass_depth : 0 ++ nr_rejected : 0 ++ ++If ``CONFIG_SCHED_DEBUG`` is set, whether a given task is on sched_ext can ++be determined as follows: ++ ++.. code-block:: none ++ ++ # grep ext /proc/self/sched ++ ext.enabled : 1 ++ ++The Basics ++========== ++ ++Userspace can implement an arbitrary BPF scheduler by loading a set of BPF ++programs that implement ``struct sched_ext_ops``. The only mandatory field ++is ``ops.name`` which must be a valid BPF object name. All operations are ++optional. The following modified excerpt is from ++``tools/sched_ext/scx_simple.bpf.c`` showing a minimal global FIFO scheduler. ++ ++.. code-block:: c ++ ++ /* ++ * Decide which CPU a task should be migrated to before being ++ * enqueued (either at wakeup, fork time, or exec time). If an ++ * idle core is found by the default ops.select_cpu() implementation, ++ * then dispatch the task directly to SCX_DSQ_LOCAL and skip the ++ * ops.enqueue() callback. ++ * ++ * Note that this implementation has exactly the same behavior as the ++ * default ops.select_cpu implementation. The behavior of the scheduler ++ * would be exactly same if the implementation just didn't define the ++ * simple_select_cpu() struct_ops prog. 
++ */ ++ s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, ++ s32 prev_cpu, u64 wake_flags) ++ { ++ s32 cpu; ++ /* Need to initialize or the BPF verifier will reject the program */ ++ bool direct = false; ++ ++ cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &direct); ++ ++ if (direct) ++ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); ++ ++ return cpu; ++ } ++ ++ /* ++ * Do a direct dispatch of a task to the global DSQ. This ops.enqueue() ++ * callback will only be invoked if we failed to find a core to dispatch ++ * to in ops.select_cpu() above. ++ * ++ * Note that this implementation has exactly the same behavior as the ++ * default ops.enqueue implementation, which just dispatches the task ++ * to SCX_DSQ_GLOBAL. The behavior of the scheduler would be exactly same ++ * if the implementation just didn't define the simple_enqueue struct_ops ++ * prog. ++ */ ++ void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) ++ { ++ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); ++ } ++ ++ s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init) ++ { ++ /* ++ * By default, all SCHED_EXT, SCHED_OTHER, SCHED_IDLE, and ++ * SCHED_BATCH tasks should use sched_ext. ++ */ ++ return 0; ++ } ++ ++ void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) ++ { ++ exit_type = ei->type; ++ } ++ ++ SEC(".struct_ops") ++ struct sched_ext_ops simple_ops = { ++ .select_cpu = (void *)simple_select_cpu, ++ .enqueue = (void *)simple_enqueue, ++ .init = (void *)simple_init, ++ .exit = (void *)simple_exit, ++ .name = "simple", ++ }; ++ ++Dispatch Queues ++--------------- ++ ++To match the impedance between the scheduler core and the BPF scheduler, ++sched_ext uses DSQs (dispatch queues) which can operate as both a FIFO and a ++priority queue. By default, there is one global FIFO (``SCX_DSQ_GLOBAL``), ++and one local dsq per CPU (``SCX_DSQ_LOCAL``). 
The BPF scheduler can manage ++an arbitrary number of dsq's using ``scx_bpf_create_dsq()`` and ++``scx_bpf_destroy_dsq()``. ++ ++A CPU always executes a task from its local DSQ. A task is "dispatched" to a ++DSQ. A non-local DSQ is "consumed" to transfer a task to the consuming CPU's ++local DSQ. ++ ++When a CPU is looking for the next task to run, if the local DSQ is not ++empty, the first task is picked. Otherwise, the CPU tries to consume the ++global DSQ. If that doesn't yield a runnable task either, ``ops.dispatch()`` ++is invoked. ++ ++Scheduling Cycle ++---------------- ++ ++The following briefly shows how a waking task is scheduled and executed. ++ ++1. When a task is waking up, ``ops.select_cpu()`` is the first operation ++ invoked. This serves two purposes. First, CPU selection optimization ++ hint. Second, waking up the selected CPU if idle. ++ ++ The CPU selected by ``ops.select_cpu()`` is an optimization hint and not ++ binding. The actual decision is made at the last step of scheduling. ++ However, there is a small performance gain if the CPU ++ ``ops.select_cpu()`` returns matches the CPU the task eventually runs on. ++ ++ A side-effect of selecting a CPU is waking it up from idle. While a BPF ++ scheduler can wake up any cpu using the ``scx_bpf_kick_cpu()`` helper, ++ using ``ops.select_cpu()`` judiciously can be simpler and more efficient. ++ ++ A task can be immediately dispatched to a DSQ from ``ops.select_cpu()`` by ++ calling ``scx_bpf_dispatch()``. If the task is dispatched to ++ ``SCX_DSQ_LOCAL`` from ``ops.select_cpu()``, it will be dispatched to the ++ local DSQ of whichever CPU is returned from ``ops.select_cpu()``. ++ Additionally, dispatching directly from ``ops.select_cpu()`` will cause the ++ ``ops.enqueue()`` callback to be skipped. ++ ++ Note that the scheduler core will ignore an invalid CPU selection, for ++ example, if it's outside the allowed cpumask of the task. ++ ++2. 
Once the target CPU is selected, ``ops.enqueue()`` is invoked (unless the ++ task was dispatched directly from ``ops.select_cpu()``). ``ops.enqueue()`` ++ can make one of the following decisions: ++ ++ * Immediately dispatch the task to either the global or local DSQ by ++ calling ``scx_bpf_dispatch()`` with ``SCX_DSQ_GLOBAL`` or ++ ``SCX_DSQ_LOCAL``, respectively. ++ ++ * Immediately dispatch the task to a custom DSQ by calling ++ ``scx_bpf_dispatch()`` with a DSQ ID which is smaller than 2^63. ++ ++ * Queue the task on the BPF side. ++ ++3. When a CPU is ready to schedule, it first looks at its local DSQ. If ++ empty, it then looks at the global DSQ. If there still isn't a task to ++ run, ``ops.dispatch()`` is invoked which can use the following two ++ functions to populate the local DSQ. ++ ++ * ``scx_bpf_dispatch()`` dispatches a task to a DSQ. Any target DSQ can ++ be used - ``SCX_DSQ_LOCAL``, ``SCX_DSQ_LOCAL_ON | cpu``, ++ ``SCX_DSQ_GLOBAL`` or a custom DSQ. While ``scx_bpf_dispatch()`` ++ currently can't be called with BPF locks held, this is being worked on ++ and will be supported. ``scx_bpf_dispatch()`` schedules dispatching ++ rather than performing them immediately. There can be up to ++ ``ops.dispatch_max_batch`` pending tasks. ++ ++ * ``scx_bpf_consume()`` transfers a task from the specified non-local DSQ ++ to the dispatching DSQ. This function cannot be called with any BPF ++ locks held. ``scx_bpf_consume()`` flushes the pending dispatched tasks ++ before trying to consume the specified DSQ. ++ ++4. After ``ops.dispatch()`` returns, if there are tasks in the local DSQ, ++ the CPU runs the first one. If empty, the following steps are taken: ++ ++ * Try to consume the global DSQ. If successful, run the task. ++ ++ * If ``ops.dispatch()`` has dispatched any tasks, retry #3. ++ ++ * If the previous task is an SCX task and still runnable, keep executing ++ it (see ``SCX_OPS_ENQ_LAST``). ++ ++ * Go idle. 
++ ++Note that the BPF scheduler can always choose to dispatch tasks immediately ++in ``ops.enqueue()`` as illustrated in the above simple example. If only the ++built-in DSQs are used, there is no need to implement ``ops.dispatch()`` as ++a task is never queued on the BPF scheduler and both the local and global ++DSQs are consumed automatically. ++ ++``scx_bpf_dispatch()`` queues the task on the FIFO of the target DSQ. Use ++``scx_bpf_dispatch_vtime()`` for the priority queue. Internal DSQs such as ++``SCX_DSQ_LOCAL`` and ``SCX_DSQ_GLOBAL`` do not support priority-queue ++dispatching, and must be dispatched to with ``scx_bpf_dispatch()``. See the ++function documentation and usage in ``tools/sched_ext/scx_simple.bpf.c`` for ++more information. ++ ++Where to Look ++============= ++ ++* ``include/linux/sched/ext.h`` defines the core data structures, ops table ++ and constants. ++ ++* ``kernel/sched/ext.c`` contains sched_ext core implementation and helpers. ++ The functions prefixed with ``scx_bpf_`` can be called from the BPF ++ scheduler. ++ ++* ``tools/sched_ext/`` hosts example BPF scheduler implementations. ++ ++ * ``scx_simple[.bpf].c``: Minimal global FIFO scheduler example using a ++ custom DSQ. ++ ++ * ``scx_qmap[.bpf].c``: A multi-level FIFO scheduler supporting five ++ levels of priority implemented with ``BPF_MAP_TYPE_QUEUE``. ++ ++ABI Instability ++=============== ++ ++The APIs provided by sched_ext to BPF scheduler programs have no stability ++guarantees. This includes the ops table callbacks and constants defined in ++``include/linux/sched/ext.h``, as well as the ``scx_bpf_`` kfuncs defined in ++``kernel/sched/ext.c``. ++ ++While we will attempt to provide a relatively stable API surface when ++possible, they are subject to change without warning between kernel ++versions. 
+diff --git a/MAINTAINERS b/MAINTAINERS +index 3121709d99e3..bf3b77e96dc4 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -19623,6 +19623,19 @@ F: include/linux/wait.h + F: include/uapi/linux/sched.h + F: kernel/sched/ + ++SCHEDULER - SCHED_EXT ++R: Tejun Heo <tj@kernel.org> ++R: David Vernet <void@manifault.com> ++L: linux-kernel@vger.kernel.org ++S: Maintained ++W: https://github.com/sched-ext/scx ++T: git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git ++F: include/linux/sched/ext.h ++F: kernel/sched/ext.h ++F: kernel/sched/ext.c ++F: tools/sched_ext/ ++F: tools/testing/selftests/sched_ext ++ + SCSI LIBSAS SUBSYSTEM + R: John Garry <john.g.garry@oracle.com> + R: Jason Yan <yanaijie@huawei.com> +diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h +index f7749d0f2562..05bfe4acba1d 100644 +--- a/include/asm-generic/vmlinux.lds.h ++++ b/include/asm-generic/vmlinux.lds.h +@@ -131,6 +131,7 @@ + *(__dl_sched_class) \ + *(__rt_sched_class) \ + *(__fair_sched_class) \ ++ *(__ext_sched_class) \ + *(__idle_sched_class) \ + __sched_class_lowest = .; + +diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h +index 34aaf0e87def..bcebf8096e91 100644 +--- a/include/linux/cgroup.h ++++ b/include/linux/cgroup.h +@@ -29,8 +29,6 @@ + + struct kernel_clone_args; + +-#ifdef CONFIG_CGROUPS +- + /* + * All weight knobs on the default hierarchy should use the following min, + * default and max values. 
The default value is the logarithmic center of +@@ -40,6 +38,8 @@ struct kernel_clone_args; + #define CGROUP_WEIGHT_DFL 100 + #define CGROUP_WEIGHT_MAX 10000 + ++#ifdef CONFIG_CGROUPS ++ + enum { + CSS_TASK_ITER_PROCS = (1U << 0), /* walk only threadgroup leaders */ + CSS_TASK_ITER_THREADED = (1U << 1), /* walk all threaded css_sets in the domain */ +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 3c2abbc587b4..dc07eb0d3290 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -80,6 +80,8 @@ struct task_group; + struct task_struct; + struct user_event_mm; + ++#include <linux/sched/ext.h> ++ + /* + * Task state bitmask. NOTE! These bits are also + * encoded in fs/proc/array.c: get_task_state(). +@@ -798,6 +800,9 @@ struct task_struct { + struct sched_rt_entity rt; + struct sched_dl_entity dl; + struct sched_dl_entity *dl_server; ++#ifdef CONFIG_SCHED_CLASS_EXT ++ struct sched_ext_entity scx; ++#endif + const struct sched_class *sched_class; + + #ifdef CONFIG_SCHED_CORE +diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h +new file mode 100644 +index 000000000000..593d2f4909dd +--- /dev/null ++++ b/include/linux/sched/ext.h +@@ -0,0 +1,206 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst ++ * ++ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. ++ * Copyright (c) 2022 Tejun Heo <tj@kernel.org> ++ * Copyright (c) 2022 David Vernet <dvernet@meta.com> ++ */ ++#ifndef _LINUX_SCHED_EXT_H ++#define _LINUX_SCHED_EXT_H ++ ++#ifdef CONFIG_SCHED_CLASS_EXT ++ ++#include <linux/llist.h> ++#include <linux/rhashtable-types.h> ++ ++enum scx_public_consts { ++ SCX_OPS_NAME_LEN = 128, ++ ++ SCX_SLICE_DFL = 20 * 1000000, /* 20ms */ ++ SCX_SLICE_INF = U64_MAX, /* infinite, implies nohz */ ++}; ++ ++/* ++ * DSQ (dispatch queue) IDs are 64bit of the format: ++ * ++ * Bits: [63] [62 .. 
0] ++ * [ B] [ ID ] ++ * ++ * B: 1 for IDs for built-in DSQs, 0 for ops-created user DSQs ++ * ID: 63 bit ID ++ * ++ * Built-in IDs: ++ * ++ * Bits: [63] [62] [61..32] [31 .. 0] ++ * [ 1] [ L] [ R ] [ V ] ++ * ++ * 1: 1 for built-in DSQs. ++ * L: 1 for LOCAL_ON DSQ IDs, 0 for others ++ * V: For LOCAL_ON DSQ IDs, a CPU number. For others, a pre-defined value. ++ */ ++enum scx_dsq_id_flags { ++ SCX_DSQ_FLAG_BUILTIN = 1LLU << 63, ++ SCX_DSQ_FLAG_LOCAL_ON = 1LLU << 62, ++ ++ SCX_DSQ_INVALID = SCX_DSQ_FLAG_BUILTIN | 0, ++ SCX_DSQ_GLOBAL = SCX_DSQ_FLAG_BUILTIN | 1, ++ SCX_DSQ_LOCAL = SCX_DSQ_FLAG_BUILTIN | 2, ++ SCX_DSQ_LOCAL_ON = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON, ++ SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU, ++}; ++ ++/* ++ * A dispatch queue (DSQ) can be either a FIFO or p->scx.dsq_vtime ordered ++ * queue. A built-in DSQ is always a FIFO. The built-in local DSQs are used to ++ * buffer between the scheduler core and the BPF scheduler. See the ++ * documentation for more details. ++ */ ++struct scx_dispatch_q { ++ raw_spinlock_t lock; ++ struct list_head list; /* tasks in dispatch order */ ++ struct rb_root priq; /* used to order by p->scx.dsq_vtime */ ++ u32 nr; ++ u32 seq; /* used by BPF iter */ ++ u64 id; ++ struct rhash_head hash_node; ++ struct llist_node free_node; ++ struct rcu_head rcu; ++}; ++ ++/* scx_entity.flags */ ++enum scx_ent_flags { ++ SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */ ++ SCX_TASK_BAL_KEEP = 1 << 1, /* balance decided to keep current */ ++ SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */ ++ SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ ++ ++ SCX_TASK_STATE_SHIFT = 8, /* bit 8 and 9 are used to carry scx_task_state */ ++ SCX_TASK_STATE_BITS = 2, ++ SCX_TASK_STATE_MASK = ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT, ++ ++ SCX_TASK_CURSOR = 1 << 31, /* iteration cursor, not a task */ ++}; ++ ++/* scx_entity.flags & SCX_TASK_STATE_MASK */ ++enum scx_task_state { ++ SCX_TASK_NONE, 
/* ops.init_task() not called yet */ ++ SCX_TASK_INIT, /* ops.init_task() succeeded, but task can be cancelled */ ++ SCX_TASK_READY, /* fully initialized, but not in sched_ext */ ++ SCX_TASK_ENABLED, /* fully initialized and in sched_ext */ ++ ++ SCX_TASK_NR_STATES, ++}; ++ ++/* scx_entity.dsq_flags */ ++enum scx_ent_dsq_flags { ++ SCX_TASK_DSQ_ON_PRIQ = 1 << 0, /* task is queued on the priority queue of a dsq */ ++}; ++ ++/* ++ * Mask bits for scx_entity.kf_mask. Not all kfuncs can be called from ++ * everywhere and the following bits track which kfunc sets are currently ++ * allowed for %current. This simple per-task tracking works because SCX ops ++ * nest in a limited way. BPF will likely implement a way to allow and disallow ++ * kfuncs depending on the calling context which will replace this manual ++ * mechanism. See scx_kf_allow(). ++ */ ++enum scx_kf_mask { ++ SCX_KF_UNLOCKED = 0, /* not sleepable, not rq locked */ ++ /* all non-sleepables may be nested inside SLEEPABLE */ ++ SCX_KF_SLEEPABLE = 1 << 0, /* sleepable init operations */ ++ /* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */ ++ SCX_KF_CPU_RELEASE = 1 << 1, /* ops.cpu_release() */ ++ /* ops.dequeue (in REST) may be nested inside DISPATCH */ ++ SCX_KF_DISPATCH = 1 << 2, /* ops.dispatch() */ ++ SCX_KF_ENQUEUE = 1 << 3, /* ops.enqueue() and ops.select_cpu() */ ++ SCX_KF_SELECT_CPU = 1 << 4, /* ops.select_cpu() */ ++ SCX_KF_REST = 1 << 5, /* other rq-locked operations */ ++ ++ __SCX_KF_RQ_LOCKED = SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH | ++ SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, ++ __SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, ++}; ++ ++struct scx_dsq_list_node { ++ struct list_head node; ++ bool is_bpf_iter_cursor; ++}; ++ ++/* ++ * The following is embedded in task_struct and contains all fields necessary ++ * for a task to be scheduled by SCX. 
++ */ ++struct sched_ext_entity { ++ struct scx_dispatch_q *dsq; ++ struct scx_dsq_list_node dsq_list; /* dispatch order */ ++ struct rb_node dsq_priq; /* p->scx.dsq_vtime order */ ++ u32 dsq_seq; ++ u32 dsq_flags; /* protected by DSQ lock */ ++ u32 flags; /* protected by rq lock */ ++ u32 weight; ++ s32 sticky_cpu; ++ s32 holding_cpu; ++ u32 kf_mask; /* see scx_kf_mask above */ ++ struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */ ++ atomic_long_t ops_state; ++ ++ struct list_head runnable_node; /* rq->scx.runnable_list */ ++ unsigned long runnable_at; ++ ++#ifdef CONFIG_SCHED_CORE ++ u64 core_sched_at; /* see scx_prio_less() */ ++#endif ++ u64 ddsp_dsq_id; ++ u64 ddsp_enq_flags; ++ ++ /* BPF scheduler modifiable fields */ ++ ++ /* ++ * Runtime budget in nsecs. This is usually set through ++ * scx_bpf_dispatch() but can also be modified directly by the BPF ++ * scheduler. Automatically decreased by SCX as the task executes. On ++ * depletion, a scheduling event is triggered. ++ * ++ * This value is cleared to zero if the task is preempted by ++ * %SCX_KICK_PREEMPT and shouldn't be used to determine how long the ++ * task ran. Use p->se.sum_exec_runtime instead. ++ */ ++ u64 slice; ++ ++ /* ++ * Used to order tasks when dispatching to the vtime-ordered priority ++ * queue of a dsq. This is usually set through scx_bpf_dispatch_vtime() ++ * but can also be modified directly by the BPF scheduler. Modifying it ++ * while a task is queued on a dsq may mangle the ordering and is not ++ * recommended. ++ */ ++ u64 dsq_vtime; ++ ++ /* ++ * If set, reject future sched_setscheduler(2) calls updating the policy ++ * to %SCHED_EXT with -%EACCES. ++ * ++ * If set from ops.init_task() and the task's policy is already ++ * %SCHED_EXT, which can happen while the BPF scheduler is being loaded ++ * or by inhering the parent's policy during fork, the task's policy is ++ * rejected and forcefully reverted to %SCHED_NORMAL. 
The number of ++ * such events are reported through /sys/kernel/debug/sched_ext::nr_rejected. ++ */ ++ bool disallow; /* reject switching into SCX */ ++ ++ /* cold fields */ ++ /* must be the last field, see init_scx_entity() */ ++ struct list_head tasks_node; ++}; ++ ++void sched_ext_free(struct task_struct *p); ++void print_scx_info(const char *log_lvl, struct task_struct *p); ++ ++#else /* !CONFIG_SCHED_CLASS_EXT */ ++ ++static inline void sched_ext_free(struct task_struct *p) {} ++static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {} ++ ++#endif /* CONFIG_SCHED_CLASS_EXT */ ++#endif /* _LINUX_SCHED_EXT_H */ +diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h +index d362aacf9f89..4df2f9055587 100644 +--- a/include/linux/sched/task.h ++++ b/include/linux/sched/task.h +@@ -63,7 +63,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); + extern void init_idle(struct task_struct *idle, int cpu); + + extern int sched_fork(unsigned long clone_flags, struct task_struct *p); +-extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); ++extern int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); ++extern void sched_cancel_fork(struct task_struct *p); + extern void sched_post_fork(struct task_struct *p); + extern void sched_dead(struct task_struct *p); + +diff --git a/include/trace/events/sched_ext.h b/include/trace/events/sched_ext.h +new file mode 100644 +index 000000000000..fe19da7315a9 +--- /dev/null ++++ b/include/trace/events/sched_ext.h +@@ -0,0 +1,32 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#undef TRACE_SYSTEM ++#define TRACE_SYSTEM sched_ext ++ ++#if !defined(_TRACE_SCHED_EXT_H) || defined(TRACE_HEADER_MULTI_READ) ++#define _TRACE_SCHED_EXT_H ++ ++#include <linux/tracepoint.h> ++ ++TRACE_EVENT(sched_ext_dump, ++ ++ TP_PROTO(const char *line), ++ ++ TP_ARGS(line), ++ ++ TP_STRUCT__entry( ++ __string(line, line) ++ ), ++ ++ TP_fast_assign( ++ 
__assign_str(line, line); ++ ), ++ ++ TP_printk("%s", ++ __get_str(line) ++ ) ++); ++ ++#endif /* _TRACE_SCHED_EXT_H */ ++ ++/* This part must be outside protection */ ++#include <trace/define_trace.h> +diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h +index 3bac0a8ceab2..359a14cc76a4 100644 +--- a/include/uapi/linux/sched.h ++++ b/include/uapi/linux/sched.h +@@ -118,6 +118,7 @@ struct clone_args { + /* SCHED_ISO: reserved but not implemented yet */ + #define SCHED_IDLE 5 + #define SCHED_DEADLINE 6 ++#define SCHED_EXT 7 + + /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ + #define SCHED_RESET_ON_FORK 0x40000000 +diff --git a/init/init_task.c b/init/init_task.c +index 4daee6d761c8..ce882dbd2635 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -6,6 +6,7 @@ + #include <linux/sched/sysctl.h> + #include <linux/sched/rt.h> + #include <linux/sched/task.h> ++#include <linux/sched/ext.h> + #include <linux/init.h> + #include <linux/fs.h> + #include <linux/mm.h> +@@ -97,6 +98,17 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { + #endif + #ifdef CONFIG_CGROUP_SCHED + .sched_task_group = &root_task_group, ++#endif ++#ifdef CONFIG_SCHED_CLASS_EXT ++ .scx = { ++ .dsq_list.node = LIST_HEAD_INIT(init_task.scx.dsq_list.node), ++ .sticky_cpu = -1, ++ .holding_cpu = -1, ++ .runnable_node = LIST_HEAD_INIT(init_task.scx.runnable_node), ++ .runnable_at = INITIAL_JIFFIES, ++ .ddsp_dsq_id = SCX_DSQ_INVALID, ++ .slice = SCX_SLICE_DFL, ++ }, + #endif + .ptraced = LIST_HEAD_INIT(init_task.ptraced), + .ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry), +diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt +index c2f1fd95a821..f3d140c3acc1 100644 +--- a/kernel/Kconfig.preempt ++++ b/kernel/Kconfig.preempt +@@ -133,4 +133,28 @@ config SCHED_CORE + which is the likely usage by Linux distributions, there should + be no measurable impact on performance. 
+ +- ++config SCHED_CLASS_EXT ++ bool "Extensible Scheduling Class" ++ depends on BPF_SYSCALL && BPF_JIT && DEBUG_INFO_BTF ++ help ++ This option enables a new scheduler class sched_ext (SCX), which ++ allows scheduling policies to be implemented as BPF programs to ++ achieve the following: ++ ++ - Ease of experimentation and exploration: Enabling rapid ++ iteration of new scheduling policies. ++ - Customization: Building application-specific schedulers which ++ implement policies that are not applicable to general-purpose ++ schedulers. ++ - Rapid scheduler deployments: Non-disruptive swap outs of ++ scheduling policies in production environments. ++ ++ sched_ext leverages BPF struct_ops feature to define a structure ++ which exports function callbacks and flags to BPF programs that ++ wish to implement scheduling policies. The struct_ops structure ++ exported by sched_ext is struct sched_ext_ops, and is conceptually ++ similar to struct sched_class. ++ ++ For more information: ++ Documentation/scheduler/sched-ext.rst ++ https://github.com/sched-ext/scx +diff --git a/kernel/fork.c b/kernel/fork.c +index aebb3e6c96dc..5d1f9de254d6 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -23,6 +23,7 @@ + #include <linux/sched/task.h> + #include <linux/sched/task_stack.h> + #include <linux/sched/cputime.h> ++#include <linux/sched/ext.h> + #include <linux/seq_file.h> + #include <linux/rtmutex.h> + #include <linux/init.h> +@@ -971,6 +972,7 @@ void __put_task_struct(struct task_struct *tsk) + WARN_ON(refcount_read(&tsk->usage)); + WARN_ON(tsk == current); + ++ sched_ext_free(tsk); + io_uring_free(tsk); + cgroup_free(tsk); + task_numa_free(tsk, true); +@@ -2363,7 +2365,7 @@ __latent_entropy struct task_struct *copy_process( + + retval = perf_event_init_task(p, clone_flags); + if (retval) +- goto bad_fork_cleanup_policy; ++ goto bad_fork_sched_cancel_fork; + retval = audit_alloc(p); + if (retval) + goto bad_fork_cleanup_perf; +@@ -2496,7 +2498,9 @@ __latent_entropy struct 
task_struct *copy_process( + * cgroup specific, it unconditionally needs to place the task on a + * runqueue. + */ +- sched_cgroup_fork(p, args); ++ retval = sched_cgroup_fork(p, args); ++ if (retval) ++ goto bad_fork_cancel_cgroup; + + /* + * From this point on we must avoid any synchronous user-space +@@ -2542,13 +2546,13 @@ __latent_entropy struct task_struct *copy_process( + /* Don't start children in a dying pid namespace */ + if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) { + retval = -ENOMEM; +- goto bad_fork_cancel_cgroup; ++ goto bad_fork_core_free; + } + + /* Let kill terminate clone/fork in the middle */ + if (fatal_signal_pending(current)) { + retval = -EINTR; +- goto bad_fork_cancel_cgroup; ++ goto bad_fork_core_free; + } + + /* No more failure paths after this point. */ +@@ -2622,10 +2626,11 @@ __latent_entropy struct task_struct *copy_process( + + return p; + +-bad_fork_cancel_cgroup: ++bad_fork_core_free: + sched_core_free(p); + spin_unlock(&current->sighand->siglock); + write_unlock_irq(&tasklist_lock); ++bad_fork_cancel_cgroup: + cgroup_cancel_fork(p, args); + bad_fork_put_pidfd: + if (clone_flags & CLONE_PIDFD) { +@@ -2664,6 +2669,8 @@ __latent_entropy struct task_struct *copy_process( + audit_free(p); + bad_fork_cleanup_perf: + perf_event_free_task(p); ++bad_fork_sched_cancel_fork: ++ sched_cancel_fork(p); + bad_fork_cleanup_policy: + lockdep_free_task(p); + #ifdef CONFIG_NUMA +diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c +index d9dc9ab3773f..e7d539bb721e 100644 +--- a/kernel/sched/build_policy.c ++++ b/kernel/sched/build_policy.c +@@ -16,18 +16,25 @@ + #include <linux/sched/clock.h> + #include <linux/sched/cputime.h> + #include <linux/sched/hotplug.h> ++#include <linux/sched/isolation.h> + #include <linux/sched/posix-timers.h> + #include <linux/sched/rt.h> + + #include <linux/cpuidle.h> + #include <linux/jiffies.h> ++#include <linux/kobject.h> + #include <linux/livepatch.h> ++#include <linux/pm.h> + #include 
<linux/psi.h> ++#include <linux/rhashtable.h> ++#include <linux/seq_buf.h> + #include <linux/seqlock_api.h> + #include <linux/slab.h> + #include <linux/suspend.h> + #include <linux/tsacct_kern.h> + #include <linux/vtime.h> ++#include <linux/sysrq.h> ++#include <linux/percpu-rwsem.h> + + #include <uapi/linux/sched/types.h> + +@@ -52,3 +59,6 @@ + #include "cputime.c" + #include "deadline.c" + ++#ifdef CONFIG_SCHED_CLASS_EXT ++# include "ext.c" ++#endif +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index d211d40a2edc..e5a6766b3a45 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -168,7 +168,10 @@ static inline int __task_prio(const struct task_struct *p) + if (p->sched_class == &idle_sched_class) + return MAX_RT_PRIO + NICE_WIDTH; /* 140 */ + +- return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */ ++ if (task_on_scx(p)) ++ return MAX_RT_PRIO + MAX_NICE + 1; /* 120, squash ext */ ++ ++ return MAX_RT_PRIO + MAX_NICE; /* 119, squash fair */ + } + + /* +@@ -197,6 +200,11 @@ static inline bool prio_less(const struct task_struct *a, + if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */ + return cfs_prio_less(a, b, in_fi); + ++#ifdef CONFIG_SCHED_CLASS_EXT ++ if (pa == MAX_RT_PRIO + MAX_NICE + 1) /* ext */ ++ return scx_prio_less(a, b, in_fi); ++#endif ++ + return false; + } + +@@ -1255,11 +1263,14 @@ bool sched_can_stop_tick(struct rq *rq) + return true; + + /* +- * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left; +- * if there's more than one we need the tick for involuntary +- * preemption. ++ * If there are no DL,RR/FIFO tasks, there must only be CFS or SCX tasks ++ * left. For CFS, if there's more than one we need the tick for ++ * involuntary preemption. For SCX, ask. 
+ */ +- if (rq->nr_running > 1) ++ if (!scx_switched_all() && rq->nr_running > 1) ++ return false; ++ ++ if (scx_enabled() && !scx_can_stop_tick(rq)) + return false; + + /* +@@ -1327,27 +1338,24 @@ int tg_nop(struct task_group *tg, void *data) + static void set_load_weight(struct task_struct *p, bool update_load) + { + int prio = p->static_prio - MAX_RT_PRIO; +- struct load_weight *load = &p->se.load; ++ struct load_weight lw; + +- /* +- * SCHED_IDLE tasks get minimal weight: +- */ + if (task_has_idle_policy(p)) { +- load->weight = scale_load(WEIGHT_IDLEPRIO); +- load->inv_weight = WMULT_IDLEPRIO; +- return; ++ lw.weight = scale_load(WEIGHT_IDLEPRIO); ++ lw.inv_weight = WMULT_IDLEPRIO; ++ } else { ++ lw.weight = scale_load(sched_prio_to_weight[prio]); ++ lw.inv_weight = sched_prio_to_wmult[prio]; + } + + /* + * SCHED_OTHER tasks have to update their load when changing their + * weight + */ +- if (update_load && p->sched_class == &fair_sched_class) { +- reweight_task(p, prio); +- } else { +- load->weight = scale_load(sched_prio_to_weight[prio]); +- load->inv_weight = sched_prio_to_wmult[prio]; +- } ++ if (update_load && p->sched_class->reweight_task) ++ p->sched_class->reweight_task(task_rq(p), p, &lw); ++ else ++ p->se.load = lw; + } + + #ifdef CONFIG_UCLAMP_TASK +@@ -2214,6 +2222,17 @@ inline int task_curr(const struct task_struct *p) + return cpu_curr(task_cpu(p)) == p; + } + ++/* ++ * ->switching_to() is called with the pi_lock and rq_lock held and must not ++ * mess with locking. ++ */ ++void check_class_changing(struct rq *rq, struct task_struct *p, ++ const struct sched_class *prev_class) ++{ ++ if (prev_class != p->sched_class && p->sched_class->switching_to) ++ p->sched_class->switching_to(rq, p); ++} ++ + /* + * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock, + * use the balance_callback list if you want balancing. 
+@@ -2221,9 +2240,9 @@ inline int task_curr(const struct task_struct *p) + * this means any call to check_class_changed() must be followed by a call to + * balance_callback(). + */ +-static inline void check_class_changed(struct rq *rq, struct task_struct *p, +- const struct sched_class *prev_class, +- int oldprio) ++void check_class_changed(struct rq *rq, struct task_struct *p, ++ const struct sched_class *prev_class, ++ int oldprio) + { + if (prev_class != p->sched_class) { + if (prev_class->switched_from) +@@ -3986,6 +4005,15 @@ bool cpus_share_resources(int this_cpu, int that_cpu) + + static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) + { ++ /* ++ * The BPF scheduler may depend on select_task_rq() being invoked during ++ * wakeups. In addition, @p may end up executing on a different CPU ++ * regardless of what happens in the wakeup path making the ttwu_queue ++ * optimization less meaningful. Skip if on SCX. ++ */ ++ if (task_on_scx(p)) ++ return false; ++ + /* + * Do not complicate things with the async wake_list while the CPU is + * in hotplug state. 
+@@ -4553,6 +4581,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + p->rt.on_rq = 0; + p->rt.on_list = 0; + ++#ifdef CONFIG_SCHED_CLASS_EXT ++ init_scx_entity(&p->scx); ++#endif ++ + #ifdef CONFIG_PREEMPT_NOTIFIERS + INIT_HLIST_HEAD(&p->preempt_notifiers); + #endif +@@ -4794,10 +4826,18 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) + + if (dl_prio(p->prio)) + return -EAGAIN; +- else if (rt_prio(p->prio)) ++ ++ scx_pre_fork(p); ++ ++ if (rt_prio(p->prio)) { + p->sched_class = &rt_sched_class; +- else ++#ifdef CONFIG_SCHED_CLASS_EXT ++ } else if (task_should_scx(p)) { ++ p->sched_class = &ext_sched_class; ++#endif ++ } else { + p->sched_class = &fair_sched_class; ++ } + + init_entity_runnable_average(&p->se); + +@@ -4817,7 +4857,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) + return 0; + } + +-void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) ++int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) + { + unsigned long flags; + +@@ -4979,6 +4979,13 @@ + if (p->sched_class->task_fork) + p->sched_class->task_fork(p); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ return scx_fork(p); ++} ++ ++void sched_cancel_fork(struct task_struct *p) ++{ ++ scx_cancel_fork(p); + } + + void sched_post_fork(struct task_struct *p) +@@ -4987,6 +4994,7 @@ + sched_post_fork_bore(p); + #endif // CONFIG_SCHED_BORE + uclamp_post_fork(p); ++ scx_post_fork(p); + } + + unsigned long to_ratio(u64 period, u64 runtime) +@@ -5687,6 +5735,7 @@ void scheduler_tick(void) + calc_global_load_tick(rq); + sched_core_tick(rq); + task_tick_mm_cid(rq, curr); ++ scx_tick(rq); + + rq_unlock(rq, &rf); + +@@ -5699,8 +5748,10 @@ void scheduler_tick(void) + wq_worker_tick(curr); + + #ifdef CONFIG_SMP +- rq->idle_balance = idle_cpu(cpu); +- trigger_load_balance(rq); ++ if (!scx_switched_all()) { ++ rq->idle_balance = idle_cpu(cpu); ++ trigger_load_balance(rq); ++ } + #endif + } 
+ +@@ -5991,7 +6042,19 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf) + { + #ifdef CONFIG_SMP ++ const struct sched_class *start_class = prev->sched_class; + const struct sched_class *class; ++ ++#ifdef CONFIG_SCHED_CLASS_EXT ++ /* ++ * SCX requires a balance() call before every pick_next_task() including ++ * when waking up from SCHED_IDLE. If @start_class is below SCX, start ++ * from SCX instead. ++ */ ++ if (sched_class_above(&ext_sched_class, start_class)) ++ start_class = &ext_sched_class; ++#endif ++ + /* + * We must do the balancing pass before put_prev_task(), such + * that when we release the rq->lock the task is in the same +@@ -6000,7 +6063,7 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, + * We can terminate the balance pass as soon as we know there is + * a runnable task of @class priority or higher. + */ +- for_class_range(class, prev->sched_class, &idle_sched_class) { ++ for_active_class_range(class, start_class, &idle_sched_class) { + if (class->balance(rq, prev, rf)) + break; + } +@@ -6018,6 +6081,9 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + const struct sched_class *class; + struct task_struct *p; + ++ if (scx_enabled()) ++ goto restart; ++ + /* + * Optimization: we know that if all tasks are in the fair class we can + * call that function directly, but only if the @prev task wasn't of a +@@ -6058,10 +6124,15 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + if (prev->dl_server) + prev->dl_server = NULL; + +- for_each_class(class) { ++ for_each_active_class(class) { + p = class->pick_next_task(rq); +- if (p) ++ if (p) { ++ const struct sched_class *prev_class = prev->sched_class; ++ ++ if (class != prev_class && prev_class->switch_class) ++ prev_class->switch_class(rq, p); + return p; ++ } + } + + BUG(); /* The idle class should always have a runnable task. 
*/ +@@ -6091,7 +6162,7 @@ static inline struct task_struct *pick_task(struct rq *rq) + const struct sched_class *class; + struct task_struct *p; + +- for_each_class(class) { ++ for_each_active_class(class) { + p = class->pick_task(rq); + if (p) + return p; +@@ -7081,12 +7152,16 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag + } + EXPORT_SYMBOL(default_wake_function); + +-static void __setscheduler_prio(struct task_struct *p, int prio) ++void __setscheduler_prio(struct task_struct *p, int prio) + { + if (dl_prio(prio)) + p->sched_class = &dl_sched_class; + else if (rt_prio(prio)) + p->sched_class = &rt_sched_class; ++#ifdef CONFIG_SCHED_CLASS_EXT ++ else if (task_should_scx(p)) ++ p->sched_class = &ext_sched_class; ++#endif + else + p->sched_class = &fair_sched_class; + +@@ -7247,6 +7322,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) + } + + __setscheduler_prio(p, prio); ++ check_class_changing(rq, p, prev_class); + + if (queued) + enqueue_task(rq, p, queue_flag); +@@ -7468,6 +7544,25 @@ int sched_core_idle_cpu(int cpu) + #endif + + #ifdef CONFIG_SMP ++/* ++ * Load avg and utilization metrics need to be updated periodically and before ++ * consumption. This function updates the metrics for all subsystems except for ++ * the fair class. @rq must be locked and have its clock updated.
++ */ ++bool update_other_load_avgs(struct rq *rq) ++{ ++ u64 now = rq_clock_pelt(rq); ++ const struct sched_class *curr_class = rq->curr->sched_class; ++ unsigned long hw_pressure = arch_scale_thermal_pressure(cpu_of(rq)); ++ ++ lockdep_assert_rq_held(rq); ++ ++ return update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | ++ update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | ++ update_thermal_load_avg(now, rq, hw_pressure) | ++ update_irq_load_avg(rq, 0); ++} ++ + /* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. +@@ -7790,6 +7885,10 @@ static int __sched_setscheduler(struct task_struct *p, + goto unlock; + } + ++ retval = scx_check_setscheduler(p, policy); ++ if (retval) ++ goto unlock; ++ + /* + * If not changing anything there's no need to proceed further, + * but store a possible modification of reset_on_fork. +@@ -7892,6 +7991,7 @@ static int __sched_setscheduler(struct task_struct *p, + __setscheduler_prio(p, newprio); + } + __setscheduler_uclamp(p, attr); ++ check_class_changing(rq, p, prev_class); + + if (queued) { + /* +@@ -9067,6 +9167,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy) + case SCHED_NORMAL: + case SCHED_BATCH: + case SCHED_IDLE: ++ case SCHED_EXT: + ret = 0; + break; + } +@@ -9094,6 +9195,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) + case SCHED_NORMAL: + case SCHED_BATCH: + case SCHED_IDLE: ++ case SCHED_EXT: + ret = 0; + } + return ret; +@@ -9189,6 +9291,7 @@ void sched_show_task(struct task_struct *p) + + print_worker_info(KERN_INFO, p); + print_stop_info(KERN_INFO, p); ++ print_scx_info(KERN_INFO, p); + show_stack(p, NULL, KERN_INFO); + put_task_stack(p); + } +@@ -9681,6 +9784,8 @@ int sched_cpu_activate(unsigned int cpu) + cpuset_cpu_active(); + } + ++ scx_rq_activate(rq); ++ + /* + * Put the rq online, if not already. 
This happens: + * +@@ -9741,6 +9846,8 @@ int sched_cpu_deactivate(unsigned int cpu) + } + rq_unlock_irqrestore(rq, &rf); + ++ scx_rq_deactivate(rq); ++ + #ifdef CONFIG_SCHED_SMT + /* + * When going down, decrement the number of cores with SMT present. +@@ -10062,11 +10062,15 @@ + int i; + + /* Make sure the linker didn't screw up */ +- BUG_ON(&idle_sched_class != &fair_sched_class + 1 || +- &fair_sched_class != &rt_sched_class + 1 || +- &rt_sched_class != &dl_sched_class + 1); + #ifdef CONFIG_SMP +- BUG_ON(&dl_sched_class != &stop_sched_class + 1); ++ BUG_ON(!sched_class_above(&stop_sched_class, &dl_sched_class)); ++#endif ++ BUG_ON(!sched_class_above(&dl_sched_class, &rt_sched_class)); ++ BUG_ON(!sched_class_above(&rt_sched_class, &fair_sched_class)); ++ BUG_ON(!sched_class_above(&fair_sched_class, &idle_sched_class)); ++#ifdef CONFIG_SCHED_CLASS_EXT ++ BUG_ON(!sched_class_above(&fair_sched_class, &ext_sched_class)); ++ BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class)); + #endif + + #ifdef CONFIG_SCHED_BORE +@@ -10097,6 +10208,7 @@ void __init sched_init(void) + balance_push_set(smp_processor_id(), false); + #endif + init_sched_fair_class(); ++ init_sched_ext_class(); + + psi_init(); + +@@ -10523,11 +10635,6 @@ void sched_move_task(struct task_struct *tsk) + } + } + +-static inline struct task_group *css_tg(struct cgroup_subsys_state *css) +-{ +- return css ? 
container_of(css, struct task_group, css) : NULL; +-} +- + static struct cgroup_subsys_state * + cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) + { +@@ -11294,29 +11401,27 @@ static int cpu_local_stat_show(struct seq_file *sf, + } + + #ifdef CONFIG_FAIR_GROUP_SCHED ++ ++static unsigned long tg_weight(struct task_group *tg) ++{ ++ return scale_load_down(tg->shares); ++} ++ + static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) + { +- struct task_group *tg = css_tg(css); +- u64 weight = scale_load_down(tg->shares); +- +- return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024); ++ return sched_weight_to_cgroup(tg_weight(css_tg(css))); + } + + static int cpu_weight_write_u64(struct cgroup_subsys_state *css, +- struct cftype *cft, u64 weight) ++ struct cftype *cft, u64 cgrp_weight) + { +- /* +- * cgroup weight knobs should use the common MIN, DFL and MAX +- * values which are 1, 100 and 10000 respectively. While it loses +- * a bit of range on both ends, it maps pretty well onto the shares +- * value used by scheduler and the round-trip conversions preserve +- * the original value over the entire range. 
+- */ +- if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX) ++ unsigned long weight; ++ ++ if (cgrp_weight < CGROUP_WEIGHT_MIN || cgrp_weight > CGROUP_WEIGHT_MAX) + return -ERANGE; + +- weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL); ++ weight = sched_weight_from_cgroup(cgrp_weight); + + return sched_group_set_shares(css_tg(css), scale_load(weight)); + } +@@ -11324,7 +11429,7 @@ static int cpu_weight_write_u64(struct cgroup_subsys_state *css, + static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft) + { +- unsigned long weight = scale_load_down(css_tg(css)->shares); ++ unsigned long weight = tg_weight(css_tg(css)); + int last_delta = INT_MAX; + int prio, delta; + +@@ -12065,3 +12170,38 @@ void sched_mm_cid_fork(struct task_struct *t) + t->mm_cid_active = 1; + } + #endif ++ ++#ifdef CONFIG_SCHED_CLASS_EXT ++void sched_deq_and_put_task(struct task_struct *p, int queue_flags, ++ struct sched_enq_and_set_ctx *ctx) ++{ ++ struct rq *rq = task_rq(p); ++ ++ lockdep_assert_rq_held(rq); ++ ++ *ctx = (struct sched_enq_and_set_ctx){ ++ .p = p, ++ .queue_flags = queue_flags, ++ .queued = task_on_rq_queued(p), ++ .running = task_current(rq, p), ++ }; ++ ++ update_rq_clock(rq); ++ if (ctx->queued) ++ dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK); ++ if (ctx->running) ++ put_prev_task(rq, p); ++} ++ ++void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx) ++{ ++ struct rq *rq = task_rq(ctx->p); ++ ++ lockdep_assert_rq_held(rq); ++ ++ if (ctx->queued) ++ enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK); ++ if (ctx->running) ++ set_next_task(rq, ctx->p); ++} ++#endif /* CONFIG_SCHED_CLASS_EXT */ +diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c +index eece6244f9d2..e683e5d08daa 100644 +--- a/kernel/sched/cpufreq_schedutil.c ++++ b/kernel/sched/cpufreq_schedutil.c +@@ -197,8 +197,10 @@ unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual, + + 
static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost) + { +- unsigned long min, max, util = cpu_util_cfs_boost(sg_cpu->cpu); ++ unsigned long min, max, util = scx_cpuperf_target(sg_cpu->cpu); + ++ if (!scx_switched_all()) ++ util += cpu_util_cfs_boost(sg_cpu->cpu); + util = effective_cpu_util(sg_cpu->cpu, util, &min, &max); + util = max(util, boost); + sg_cpu->bw_min = min; +@@ -325,16 +327,35 @@ static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, + } + + #ifdef CONFIG_NO_HZ_COMMON +-static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) ++static bool sugov_hold_freq(struct sugov_cpu *sg_cpu) + { +- unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu); +- bool ret = idle_calls == sg_cpu->saved_idle_calls; ++ unsigned long idle_calls; ++ bool ret; ++ ++ /* ++ * The heuristics in this function is for the fair class. For SCX, the ++ * performance target comes directly from the BPF scheduler. Let's just ++ * follow it. ++ */ ++ if (scx_switched_all()) ++ return false; ++ ++ /* if capped by uclamp_max, always update to be in compliance */ ++ if (uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu))) ++ return false; ++ ++ /* ++ * Maintain the frequency if the CPU has not been idle recently, as ++ * reduction is likely to be premature. ++ */ ++ idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu); ++ ret = idle_calls == sg_cpu->saved_idle_calls; + + sg_cpu->saved_idle_calls = idle_calls; + return ret; + } + #else +-static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } ++static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; } + #endif /* CONFIG_NO_HZ_COMMON */ + + /* +@@ -382,14 +403,8 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time, + return; + + next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap); +- /* +- * Do not reduce the frequency if the CPU has not been idle +- * recently, as the reduction is likely to be premature then. 
+- * +- * Except when the rq is capped by uclamp_max. +- */ +- if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) && +- sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq && ++ ++ if (sugov_hold_freq(sg_cpu) && next_f < sg_policy->next_freq && + !sg_policy->need_freq_update) { + next_f = sg_policy->next_freq; + +@@ -436,14 +451,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, + if (!sugov_update_single_common(sg_cpu, time, max_cap, flags)) + return; + +- /* +- * Do not reduce the target performance level if the CPU has not been +- * idle recently, as the reduction is likely to be premature then. +- * +- * Except when the rq is capped by uclamp_max. +- */ +- if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) && +- sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util) ++ if (sugov_hold_freq(sg_cpu) && sg_cpu->util < prev_util) + sg_cpu->util = prev_util; + + cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min, +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 8d5d98a5834d..6f306e1c9c3e 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -1089,6 +1089,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, + P(dl.runtime); + P(dl.deadline); + } ++#ifdef CONFIG_SCHED_CLASS_EXT ++ __PS("ext.enabled", task_on_scx(p)); ++#endif + #undef PN_SCHEDSTAT + #undef P_SCHEDSTAT + +diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c +new file mode 100644 +index 000000000000..b9fd7b7d4a86 +--- /dev/null ++++ b/kernel/sched/ext.c +@@ -0,0 +1,6537 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst ++ * ++ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
++ * Copyright (c) 2022 Tejun Heo <tj@kernel.org> ++ * Copyright (c) 2022 David Vernet <dvernet@meta.com> ++ */ ++#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) ++ ++enum scx_consts { ++ SCX_DSP_DFL_MAX_BATCH = 32, ++ SCX_DSP_MAX_LOOPS = 32, ++ SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, ++ ++ SCX_EXIT_BT_LEN = 64, ++ SCX_EXIT_MSG_LEN = 1024, ++ SCX_EXIT_DUMP_DFL_LEN = 32768, ++ ++ SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, ++}; ++ ++enum scx_exit_kind { ++ SCX_EXIT_NONE, ++ SCX_EXIT_DONE, ++ ++ SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */ ++ SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */ ++ SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */ ++ SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */ ++ ++ SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ ++ SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ ++ SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */ ++}; ++ ++/* ++ * An exit code can be specified when exiting with scx_bpf_exit() or ++ * scx_ops_exit(), corresponding to exit_kind UNREG_BPF and UNREG_KERN ++ * respectively. The codes are 64bit of the format: ++ * ++ * Bits: [63 .. 48 47 .. 32 31 .. 0] ++ * [ SYS ACT ] [ SYS RSN ] [ USR ] ++ * ++ * SYS ACT: System-defined exit actions ++ * SYS RSN: System-defined exit reasons ++ * USR : User-defined exit codes and reasons ++ * ++ * Using the above, users may communicate intention and context by ORing system ++ * actions and/or system reasons with a user-defined exit code. ++ */ ++enum scx_exit_code { ++ /* Reasons */ ++ SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, ++ ++ /* Actions */ ++ SCX_ECODE_ACT_RESTART = 1LLU << 48, ++}; ++ ++/* ++ * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is ++ * being disabled. 
++ */ ++struct scx_exit_info { ++ /* %SCX_EXIT_* - broad category of the exit reason */ ++ enum scx_exit_kind kind; ++ ++ /* exit code if gracefully exiting */ ++ s64 exit_code; ++ ++ /* textual representation of the above */ ++ const char *reason; ++ ++ /* backtrace if exiting due to an error */ ++ unsigned long *bt; ++ u32 bt_len; ++ ++ /* informational message */ ++ char *msg; ++ ++ /* debug dump */ ++ char *dump; ++}; ++ ++/* sched_ext_ops.flags */ ++enum scx_ops_flags { ++ /* ++ * Keep built-in idle tracking even if ops.update_idle() is implemented. ++ */ ++ SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, ++ ++ /* ++ * By default, if there are no other task to run on the CPU, ext core ++ * keeps running the current task even after its slice expires. If this ++ * flag is specified, such tasks are passed to ops.enqueue() with ++ * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info. ++ */ ++ SCX_OPS_ENQ_LAST = 1LLU << 1, ++ ++ /* ++ * An exiting task may schedule after PF_EXITING is set. In such cases, ++ * bpf_task_from_pid() may not be able to find the task and if the BPF ++ * scheduler depends on pid lookup for dispatching, the task will be ++ * lost leading to various issues including RCU grace period stalls. ++ * ++ * To mask this problem, by default, unhashed tasks are automatically ++ * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't ++ * depend on pid lookups and wants to handle these tasks directly, the ++ * following flag can be used. ++ */ ++ SCX_OPS_ENQ_EXITING = 1LLU << 2, ++ ++ /* ++ * If set, only tasks with policy set to SCHED_EXT are attached to ++ * sched_ext. If clear, SCHED_NORMAL tasks are also included. 
++ */ ++ SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, ++ ++ SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | ++ SCX_OPS_ENQ_LAST | ++ SCX_OPS_ENQ_EXITING | ++ SCX_OPS_SWITCH_PARTIAL, ++}; ++ ++/* argument container for ops.init_task() */ ++struct scx_init_task_args { ++ /* ++ * Set if ops.init_task() is being invoked on the fork path, as opposed ++ * to the scheduler transition path. ++ */ ++ bool fork; ++}; ++ ++/* argument container for ops.exit_task() */ ++struct scx_exit_task_args { ++ /* Whether the task exited before running on sched_ext. */ ++ bool cancelled; ++}; ++ ++enum scx_cpu_preempt_reason { ++ /* next task is being scheduled by &sched_class_rt */ ++ SCX_CPU_PREEMPT_RT, ++ /* next task is being scheduled by &sched_class_dl */ ++ SCX_CPU_PREEMPT_DL, ++ /* next task is being scheduled by &sched_class_stop */ ++ SCX_CPU_PREEMPT_STOP, ++ /* unknown reason for SCX being preempted */ ++ SCX_CPU_PREEMPT_UNKNOWN, ++}; ++ ++/* ++ * Argument container for ops->cpu_acquire(). Currently empty, but may be ++ * expanded in the future. ++ */ ++struct scx_cpu_acquire_args {}; ++ ++/* argument container for ops->cpu_release() */ ++struct scx_cpu_release_args { ++ /* the reason the CPU was preempted */ ++ enum scx_cpu_preempt_reason reason; ++ ++ /* the task that's going to be scheduled on the CPU */ ++ struct task_struct *task; ++}; ++ ++/* ++ * Informational context provided to dump operations. ++ */ ++struct scx_dump_ctx { ++ enum scx_exit_kind kind; ++ s64 exit_code; ++ const char *reason; ++ u64 at_ns; ++ u64 at_jiffies; ++}; ++ ++/** ++ * struct sched_ext_ops - Operation table for BPF scheduler implementation ++ * ++ * Userland can implement an arbitrary scheduling policy by implementing and ++ * loading operations in this table. 
++ */ ++struct sched_ext_ops { ++ /** ++ * select_cpu - Pick the target CPU for a task which is being woken up ++ * @p: task being woken up ++ * @prev_cpu: the cpu @p was on before sleeping ++ * @wake_flags: SCX_WAKE_* ++ * ++ * Decision made here isn't final. @p may be moved to any CPU while it ++ * is getting dispatched for execution later. However, as @p is not on ++ * the rq at this point, getting the eventual execution CPU right here ++ * saves a small bit of overhead down the line. ++ * ++ * If an idle CPU is returned, the CPU is kicked and will try to ++ * dispatch. While an explicit custom mechanism can be added, ++ * select_cpu() serves as the default way to wake up idle CPUs. ++ * ++ * @p may be dispatched directly by calling scx_bpf_dispatch(). If @p ++ * is dispatched, the ops.enqueue() callback will be skipped. Finally, ++ * if @p is dispatched to SCX_DSQ_LOCAL, it will be dispatched to the ++ * local DSQ of whatever CPU is returned by this callback. ++ */ ++ s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags); ++ ++ /** ++ * enqueue - Enqueue a task on the BPF scheduler ++ * @p: task being enqueued ++ * @enq_flags: %SCX_ENQ_* ++ * ++ * @p is ready to run. Dispatch directly by calling scx_bpf_dispatch() ++ * or enqueue on the BPF scheduler. If not directly dispatched, the bpf ++ * scheduler owns @p and if it fails to dispatch @p, the task will ++ * stall. ++ * ++ * If @p was dispatched from ops.select_cpu(), this callback is ++ * skipped. ++ */ ++ void (*enqueue)(struct task_struct *p, u64 enq_flags); ++ ++ /** ++ * dequeue - Remove a task from the BPF scheduler ++ * @p: task being dequeued ++ * @deq_flags: %SCX_DEQ_* ++ * ++ * Remove @p from the BPF scheduler. This is usually called to isolate ++ * the task while updating its scheduling properties (e.g. priority). 
++ * ++ * The ext core keeps track of whether the BPF side owns a given task or ++ * not and can gracefully ignore spurious dispatches from BPF side, ++ * which makes it safe to not implement this method. However, depending ++ * on the scheduling logic, this can lead to confusing behaviors - e.g. ++ * scheduling position not being updated across a priority change. ++ */ ++ void (*dequeue)(struct task_struct *p, u64 deq_flags); ++ ++ /** ++ * dispatch - Dispatch tasks from the BPF scheduler and/or consume DSQs ++ * @cpu: CPU to dispatch tasks for ++ * @prev: previous task being switched out ++ * ++ * Called when a CPU's local dsq is empty. The operation should dispatch ++ * one or more tasks from the BPF scheduler into the DSQs using ++ * scx_bpf_dispatch() and/or consume user DSQs into the local DSQ using ++ * scx_bpf_consume(). ++ * ++ * The maximum number of times scx_bpf_dispatch() can be called without ++ * an intervening scx_bpf_consume() is specified by ++ * ops.dispatch_max_batch. See the comments on top of the two functions ++ * for more details. ++ * ++ * When not %NULL, @prev is an SCX task with its slice depleted. If ++ * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in ++ * @prev->scx.flags, it is not enqueued yet and will be enqueued after ++ * ops.dispatch() returns. To keep executing @prev, return without ++ * dispatching or consuming any tasks. Also see %SCX_OPS_ENQ_LAST. ++ */ ++ void (*dispatch)(s32 cpu, struct task_struct *prev); ++ ++ /** ++ * tick - Periodic tick ++ * @p: task running currently ++ * ++ * This operation is called every 1/HZ seconds on CPUs which are ++ * executing an SCX task. Setting @p->scx.slice to 0 will trigger an ++ * immediate dispatch cycle on the CPU. 
++ */ ++ void (*tick)(struct task_struct *p); ++ ++ /** ++ * runnable - A task is becoming runnable on its associated CPU ++ * @p: task becoming runnable ++ * @enq_flags: %SCX_ENQ_* ++ * ++ * This and the following three functions can be used to track a task's ++ * execution state transitions. A task becomes ->runnable() on a CPU, ++ * and then goes through one or more ->running() and ->stopping() pairs ++ * as it runs on the CPU, and eventually becomes ->quiescent() when it's ++ * done running on the CPU. ++ * ++ * @p is becoming runnable on the CPU because it's ++ * ++ * - waking up (%SCX_ENQ_WAKEUP) ++ * - being moved from another CPU ++ * - being restored after temporarily taken off the queue for an ++ * attribute change. ++ * ++ * This and ->enqueue() are related but not coupled. This operation ++ * notifies @p's state transition and may not be followed by ->enqueue() ++ * e.g. when @p is being dispatched to a remote CPU, or when @p is ++ * being enqueued on a CPU experiencing a hotplug event. Likewise, a ++ * task may be ->enqueue()'d without being preceded by this operation ++ * e.g. after exhausting its slice. ++ */ ++ void (*runnable)(struct task_struct *p, u64 enq_flags); ++ ++ /** ++ * running - A task is starting to run on its associated CPU ++ * @p: task starting to run ++ * ++ * See ->runnable() for explanation on the task state notifiers. ++ */ ++ void (*running)(struct task_struct *p); ++ ++ /** ++ * stopping - A task is stopping execution ++ * @p: task stopping to run ++ * @runnable: is task @p still runnable? ++ * ++ * See ->runnable() for explanation on the task state notifiers. If ++ * !@runnable, ->quiescent() will be invoked after this operation ++ * returns. ++ */ ++ void (*stopping)(struct task_struct *p, bool runnable); ++ ++ /** ++ * quiescent - A task is becoming not runnable on its associated CPU ++ * @p: task becoming not runnable ++ * @deq_flags: %SCX_DEQ_* ++ * ++ * See ->runnable() for explanation on the task state notifiers. 
++ * ++ * @p is becoming quiescent on the CPU because it's ++ * ++ * - sleeping (%SCX_DEQ_SLEEP) ++ * - being moved to another CPU ++ * - being temporarily taken off the queue for an attribute change ++ * (%SCX_DEQ_SAVE) ++ * ++ * This and ->dequeue() are related but not coupled. This operation ++ * notifies @p's state transition and may not be preceded by ->dequeue() ++ * e.g. when @p is being dispatched to a remote CPU. ++ */ ++ void (*quiescent)(struct task_struct *p, u64 deq_flags); ++ ++ /** ++ * yield - Yield CPU ++ * @from: yielding task ++ * @to: optional yield target task ++ * ++ * If @to is NULL, @from is yielding the CPU to other runnable tasks. ++ * The BPF scheduler should ensure that other available tasks are ++ * dispatched before the yielding task. Return value is ignored in this ++ * case. ++ * ++ * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf ++ * scheduler can implement the request, return %true; otherwise, %false. ++ */ ++ bool (*yield)(struct task_struct *from, struct task_struct *to); ++ ++ /** ++ * core_sched_before - Task ordering for core-sched ++ * @a: task A ++ * @b: task B ++ * ++ * Used by core-sched to determine the ordering between two tasks. See ++ * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on ++ * core-sched. ++ * ++ * Both @a and @b are runnable and may or may not currently be queued on ++ * the BPF scheduler. Should return %true if @a should run before @b. ++ * %false if there's no required ordering or @b should run before @a. ++ * ++ * If not specified, the default is ordering them according to when they ++ * became runnable. ++ */ ++ bool (*core_sched_before)(struct task_struct *a, struct task_struct *b); ++ ++ /** ++ * set_weight - Set task weight ++ * @p: task to set weight for ++ * @weight: new weight [1..10000] ++ * ++ * Update @p's weight to @weight. 
++ */ ++ void (*set_weight)(struct task_struct *p, u32 weight); ++ ++ /** ++ * set_cpumask - Set CPU affinity ++ * @p: task to set CPU affinity for ++ * @cpumask: cpumask of cpus that @p can run on ++ * ++ * Update @p's CPU affinity to @cpumask. ++ */ ++ void (*set_cpumask)(struct task_struct *p, ++ const struct cpumask *cpumask); ++ ++ /** ++ * update_idle - Update the idle state of a CPU ++ * @cpu: CPU to update the idle state for ++ * @idle: whether entering or exiting the idle state ++ * ++ * This operation is called when @cpu enters or leaves the idle ++ * state. By default, implementing this operation disables the built-in ++ * idle CPU tracking and the following helpers become unavailable: ++ * ++ * - scx_bpf_select_cpu_dfl() ++ * - scx_bpf_test_and_clear_cpu_idle() ++ * - scx_bpf_pick_idle_cpu() ++ * ++ * The user also must implement ops.select_cpu() as the default ++ * implementation relies on scx_bpf_select_cpu_dfl(). ++ * ++ * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle ++ * tracking. ++ */ ++ void (*update_idle)(s32 cpu, bool idle); ++ ++ /** ++ * cpu_acquire - A CPU is becoming available to the BPF scheduler ++ * @cpu: The CPU being acquired by the BPF scheduler. ++ * @args: Acquire arguments, see the struct definition. ++ * ++ * A CPU that was previously released from the BPF scheduler is now once ++ * again under its control. ++ */ ++ void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args); ++ ++ /** ++ * cpu_release - A CPU is taken away from the BPF scheduler ++ * @cpu: The CPU being released by the BPF scheduler. ++ * @args: Release arguments, see the struct definition. ++ * ++ * The specified CPU is no longer under the control of the BPF ++ * scheduler. This could be because it was preempted by a higher ++ * priority sched_class, though there may be other reasons as well. The ++ * caller should consult @args->reason to determine the cause.
++ */ ++ void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); ++ ++ /** ++ * init_task - Initialize a task to run in a BPF scheduler ++ * @p: task to initialize for BPF scheduling ++ * @args: init arguments, see the struct definition ++ * ++ * Either we're loading a BPF scheduler or a new task is being forked. ++ * Initialize @p for BPF scheduling. This operation may block and can ++ * be used for allocations, and is called exactly once for a task. ++ * ++ * Return 0 for success, -errno for failure. An error return while ++ * loading will abort loading of the BPF scheduler. During a fork, it ++ * will abort that specific fork. ++ */ ++ s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args); ++ ++ /** ++ * exit_task - Exit a previously-running task from the system ++ * @p: task to exit ++ * @args: exit arguments, see the struct definition ++ * ++ * @p is exiting or the BPF scheduler is being unloaded. Perform any ++ * necessary cleanup for @p. ++ */ ++ void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args); ++ ++ /** ++ * enable - Enable BPF scheduling for a task ++ * @p: task to enable BPF scheduling for ++ * ++ * Enable @p for BPF scheduling. enable() is called on @p any time it ++ * enters SCX, and is always paired with a matching disable(). ++ */ ++ void (*enable)(struct task_struct *p); ++ ++ /** ++ * disable - Disable BPF scheduling for a task ++ * @p: task to disable BPF scheduling for ++ * ++ * @p is exiting, leaving SCX or the BPF scheduler is being unloaded. ++ * Disable BPF scheduling for @p. A disable() call is always matched ++ * with a prior enable() call. ++ */ ++ void (*disable)(struct task_struct *p); ++ ++ /** ++ * dump - Dump BPF scheduler state on error ++ * @ctx: debug dump context ++ * ++ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump.
++ */ ++ void (*dump)(struct scx_dump_ctx *ctx); ++ ++ /** ++ * dump_cpu - Dump BPF scheduler state for a CPU on error ++ * @ctx: debug dump context ++ * @cpu: CPU to generate debug dump for ++ * @idle: @cpu is currently idle without any runnable tasks ++ * ++ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for ++ * @cpu. If @idle is %true and this operation doesn't produce any ++ * output, @cpu is skipped for dump. ++ */ ++ void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle); ++ ++ /** ++ * dump_task - Dump BPF scheduler state for a runnable task on error ++ * @ctx: debug dump context ++ * @p: runnable task to generate debug dump for ++ * ++ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for ++ * @p. ++ */ ++ void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p); ++ ++ /* ++ * All online ops must come before ops.cpu_online(). ++ */ ++ ++ /** ++ * cpu_online - A CPU became online ++ * @cpu: CPU which just came up ++ * ++ * @cpu just came online. @cpu will not call ops.enqueue() or ++ * ops.dispatch(), nor run tasks associated with other CPUs beforehand. ++ */ ++ void (*cpu_online)(s32 cpu); ++ ++ /** ++ * cpu_offline - A CPU is going offline ++ * @cpu: CPU which is going offline ++ * ++ * @cpu is going offline. @cpu will not call ops.enqueue() or ++ * ops.dispatch(), nor run tasks associated with other CPUs afterwards. ++ */ ++ void (*cpu_offline)(s32 cpu); ++ ++ /* ++ * All CPU hotplug ops must come before ops.init(). 
++ */ ++ ++ /** ++ * init - Initialize the BPF scheduler ++ */ ++ s32 (*init)(void); ++ ++ /** ++ * exit - Clean up after the BPF scheduler ++ * @info: Exit info ++ */ ++ void (*exit)(struct scx_exit_info *info); ++ ++ /** ++ * dispatch_max_batch - Max nr of tasks that dispatch() can dispatch ++ */ ++ u32 dispatch_max_batch; ++ ++ /** ++ * flags - %SCX_OPS_* flags ++ */ ++ u64 flags; ++ ++ /** ++ * timeout_ms - The maximum amount of time, in milliseconds, that a ++ * runnable task should be able to wait before being scheduled. The ++ * maximum timeout may not exceed the default timeout of 30 seconds. ++ * ++ * Defaults to the maximum allowed timeout value of 30 seconds. ++ */ ++ u32 timeout_ms; ++ ++ /** ++ * exit_dump_len - scx_exit_info.dump buffer length. If 0, the default ++ * value of 32768 is used. ++ */ ++ u32 exit_dump_len; ++ ++ /** ++ * hotplug_seq - A sequence number that may be set by the scheduler to ++ * detect when a hotplug event has occurred during the loading process. ++ * If 0, no detection occurs. Otherwise, the scheduler will fail to ++ * load if the sequence number does not match @scx_hotplug_seq on the ++ * enable path. ++ */ ++ u64 hotplug_seq; ++ ++ /** ++ * name - BPF scheduler's name ++ * ++ * Must be a non-zero valid BPF object name including only isalnum(), ++ * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the ++ * BPF scheduler is enabled. 
++ */ ++ char name[SCX_OPS_NAME_LEN]; ++}; ++ ++enum scx_opi { ++ SCX_OPI_BEGIN = 0, ++ SCX_OPI_NORMAL_BEGIN = 0, ++ SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online), ++ SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online), ++ SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init), ++ SCX_OPI_END = SCX_OP_IDX(init), ++}; ++ ++enum scx_wake_flags { ++ /* expose select WF_* flags as enums */ ++ SCX_WAKE_FORK = WF_FORK, ++ SCX_WAKE_TTWU = WF_TTWU, ++ SCX_WAKE_SYNC = WF_SYNC, ++}; ++ ++enum scx_enq_flags { ++ /* expose select ENQUEUE_* flags as enums */ ++ SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, ++ SCX_ENQ_HEAD = ENQUEUE_HEAD, ++ ++ /* high 32bits are SCX specific */ ++ ++ /* ++ * Set the following to trigger preemption when calling ++ * scx_bpf_dispatch() with a local dsq as the target. The slice of the ++ * current task is cleared to zero and the CPU is kicked into the ++ * scheduling path. Implies %SCX_ENQ_HEAD. ++ */ ++ SCX_ENQ_PREEMPT = 1LLU << 32, ++ ++ /* ++ * The task being enqueued was previously enqueued on the current CPU's ++ * %SCX_DSQ_LOCAL, but was removed from it in a call to the ++ * bpf_scx_reenqueue_local() kfunc. If bpf_scx_reenqueue_local() was ++ * invoked in a ->cpu_release() callback, and the task is again ++ * dispatched back to %SCX_DSQ_LOCAL by this current ->enqueue(), the ++ * task will not be scheduled on the CPU until at least the next invocation ++ * of the ->cpu_acquire() callback. ++ */ ++ SCX_ENQ_REENQ = 1LLU << 40, ++ ++ /* ++ * The task being enqueued is the only task available for the cpu. By ++ * default, ext core keeps executing such tasks but when ++ * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the ++ * %SCX_ENQ_LAST flag set. ++ * ++ * If the BPF scheduler wants to continue executing the task, ++ * ops.enqueue() should dispatch the task to %SCX_DSQ_LOCAL immediately. ++ * If the task gets queued on a different dsq or the BPF side, the BPF ++ * scheduler is responsible for triggering a follow-up scheduling event.
++ * Otherwise, Execution may stall. ++ */ ++ SCX_ENQ_LAST = 1LLU << 41, ++ ++ /* high 8 bits are internal */ ++ __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56, ++ ++ SCX_ENQ_CLEAR_OPSS = 1LLU << 56, ++ SCX_ENQ_DSQ_PRIQ = 1LLU << 57, ++}; ++ ++enum scx_deq_flags { ++ /* expose select DEQUEUE_* flags as enums */ ++ SCX_DEQ_SLEEP = DEQUEUE_SLEEP, ++ ++ /* high 32bits are SCX specific */ ++ ++ /* ++ * The generic core-sched layer decided to execute the task even though ++ * it hasn't been dispatched yet. Dequeue from the BPF side. ++ */ ++ SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32, ++}; ++ ++enum scx_pick_idle_cpu_flags { ++ SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ ++}; ++ ++enum scx_kick_flags { ++ /* ++ * Kick the target CPU if idle. Guarantees that the target CPU goes ++ * through at least one full scheduling cycle before going idle. If the ++ * target CPU can be determined to be currently not idle and going to go ++ * through a scheduling cycle before going idle, noop. ++ */ ++ SCX_KICK_IDLE = 1LLU << 0, ++ ++ /* ++ * Preempt the current task and execute the dispatch path. If the ++ * current task of the target CPU is an SCX task, its ->scx.slice is ++ * cleared to zero before the scheduling path is invoked so that the ++ * task expires and the dispatch path is invoked. ++ */ ++ SCX_KICK_PREEMPT = 1LLU << 1, ++ ++ /* ++ * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will ++ * return after the target CPU finishes picking the next task. 
++ */ ++ SCX_KICK_WAIT = 1LLU << 2, ++}; ++ ++enum scx_ops_enable_state { ++ SCX_OPS_PREPPING, ++ SCX_OPS_ENABLING, ++ SCX_OPS_ENABLED, ++ SCX_OPS_DISABLING, ++ SCX_OPS_DISABLED, ++}; ++ ++static const char *scx_ops_enable_state_str[] = { ++ [SCX_OPS_PREPPING] = "prepping", ++ [SCX_OPS_ENABLING] = "enabling", ++ [SCX_OPS_ENABLED] = "enabled", ++ [SCX_OPS_DISABLING] = "disabling", ++ [SCX_OPS_DISABLED] = "disabled", ++}; ++ ++/* ++ * sched_ext_entity->ops_state ++ * ++ * Used to track the task ownership between the SCX core and the BPF scheduler. ++ * State transitions look as follows: ++ * ++ * NONE -> QUEUEING -> QUEUED -> DISPATCHING ++ * ^ | | ++ * | v v ++ * \-------------------------------/ ++ * ++ * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call ++ * sites for explanations on the conditions being waited upon and why they are ++ * safe. Transitions out of them into NONE or QUEUED must store_release and the ++ * waiters should load_acquire. ++ * ++ * Tracking scx_ops_state enables sched_ext core to reliably determine whether ++ * any given task can be dispatched by the BPF scheduler at all times and thus ++ * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler ++ * to try to dispatch any task anytime regardless of its state as the SCX core ++ * can safely reject invalid dispatches. ++ */ ++enum scx_ops_state { ++ SCX_OPSS_NONE, /* owned by the SCX core */ ++ SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */ ++ SCX_OPSS_QUEUED, /* owned by the BPF scheduler */ ++ SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */ ++ ++ /* ++ * QSEQ brands each QUEUED instance so that, when dispatch races ++ * dequeue/requeue, the dispatcher can tell whether it still has a claim ++ * on the task being dispatched. ++ * ++ * As some 32bit archs can't do 64bit store_release/load_acquire, ++ * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on ++ * 32bit machines. 
The dispatch race window QSEQ protects is very narrow ++ * and runs with IRQ disabled. 30 bits should be sufficient. ++ */ ++ SCX_OPSS_QSEQ_SHIFT = 2, ++}; ++ ++/* Use macros to ensure that the type is unsigned long for the masks */ ++#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) ++#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) ++ ++/* ++ * During exit, a task may schedule after losing its PIDs. When disabling the ++ * BPF scheduler, we need to be able to iterate tasks in every state to ++ * guarantee system safety. Maintain a dedicated task list which contains every ++ * task between its fork and eventual free. ++ */ ++static DEFINE_SPINLOCK(scx_tasks_lock); ++static LIST_HEAD(scx_tasks); ++ ++/* ops enable/disable */ ++static struct kthread_worker *scx_ops_helper; ++static DEFINE_MUTEX(scx_ops_enable_mutex); ++DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled); ++DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); ++static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); ++static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0); ++static bool scx_switching_all; ++DEFINE_STATIC_KEY_FALSE(__scx_switched_all); ++ ++static struct sched_ext_ops scx_ops; ++static bool scx_warned_zero_slice; ++ ++static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last); ++static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting); ++static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); ++static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); ++ ++struct static_key_false scx_has_op[SCX_OPI_END] = ++ { [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT }; ++ ++static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE); ++static struct scx_exit_info *scx_exit_info; ++ ++static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); ++static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0); ++ ++/* ++ * The maximum amount of time in jiffies that a task may be runnable without ++ * being scheduled on a CPU. If this timeout is exceeded, it will trigger ++ * scx_ops_error(). 
++ */ ++static unsigned long scx_watchdog_timeout; ++ ++/* ++ * The last time the delayed work was run. This delayed work relies on ++ * ksoftirqd being able to run to service timer interrupts, so it's possible ++ * that this work itself could get wedged. To account for this, we check that ++ * it's not stalled in the timer tick, and trigger an error if it is. ++ */ ++static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; ++ ++static struct delayed_work scx_watchdog_work; ++ ++/* idle tracking */ ++#ifdef CONFIG_SMP ++#ifdef CONFIG_CPUMASK_OFFSTACK ++#define CL_ALIGNED_IF_ONSTACK ++#else ++#define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp ++#endif ++ ++static struct { ++ cpumask_var_t cpu; ++ cpumask_var_t smt; ++} idle_masks CL_ALIGNED_IF_ONSTACK; ++ ++#endif /* CONFIG_SMP */ ++ ++/* for %SCX_KICK_WAIT */ ++static unsigned long __percpu *scx_kick_cpus_pnt_seqs; ++ ++/* ++ * Direct dispatch marker. ++ * ++ * Non-NULL values are used for direct dispatch from enqueue path. A valid ++ * pointer points to the task currently being enqueued. An ERR_PTR value is used ++ * to indicate that direct dispatch has already happened. 
++ */ ++static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task); ++ ++/* dispatch queues */ ++static struct scx_dispatch_q __cacheline_aligned_in_smp scx_dsq_global; ++ ++static const struct rhashtable_params dsq_hash_params = { ++ .key_len = 8, ++ .key_offset = offsetof(struct scx_dispatch_q, id), ++ .head_offset = offsetof(struct scx_dispatch_q, hash_node), ++}; ++ ++static struct rhashtable dsq_hash; ++static LLIST_HEAD(dsqs_to_free); ++ ++/* dispatch buf */ ++struct scx_dsp_buf_ent { ++ struct task_struct *task; ++ unsigned long qseq; ++ u64 dsq_id; ++ u64 enq_flags; ++}; ++ ++static u32 scx_dsp_max_batch; ++ ++struct scx_dsp_ctx { ++ struct rq *rq; ++ u32 cursor; ++ u32 nr_tasks; ++ struct scx_dsp_buf_ent buf[]; ++}; ++ ++static struct scx_dsp_ctx __percpu *scx_dsp_ctx; ++ ++/* string formatting from BPF */ ++struct scx_bstr_buf { ++ u64 data[MAX_BPRINTF_VARARGS]; ++ char line[SCX_EXIT_MSG_LEN]; ++}; ++ ++static DEFINE_RAW_SPINLOCK(scx_exit_bstr_buf_lock); ++static struct scx_bstr_buf scx_exit_bstr_buf; ++ ++/* ops debug dump */ ++struct scx_dump_data { ++ s32 cpu; ++ bool first; ++ s32 cursor; ++ struct seq_buf *s; ++ const char *prefix; ++ struct scx_bstr_buf buf; ++}; ++ ++struct scx_dump_data scx_dump_data = { ++ .cpu = -1, ++}; ++ ++/* /sys/kernel/sched_ext interface */ ++static struct kset *scx_kset; ++static struct kobject *scx_root_kobj; ++ ++#define CREATE_TRACE_POINTS ++#include <trace/events/sched_ext.h> ++ ++static void process_ddsp_deferred_locals(struct rq *rq); ++static void scx_bpf_kick_cpu(s32 cpu, u64 flags); ++static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind, ++ s64 exit_code, ++ const char *fmt, ...); ++ ++#define scx_ops_error_kind(err, fmt, args...) \ ++ scx_ops_exit_kind((err), 0, fmt, ##args) ++ ++#define scx_ops_exit(code, fmt, args...) \ ++ scx_ops_exit_kind(SCX_EXIT_UNREG_KERN, (code), fmt, ##args) ++ ++#define scx_ops_error(fmt, args...) 
\ ++ scx_ops_error_kind(SCX_EXIT_ERROR, fmt, ##args) ++ ++#define SCX_HAS_OP(op) static_branch_likely(&scx_has_op[SCX_OP_IDX(op)]) ++ ++static long jiffies_delta_msecs(unsigned long at, unsigned long now) ++{ ++ if (time_after(at, now)) ++ return jiffies_to_msecs(at - now); ++ else ++ return -(long)jiffies_to_msecs(now - at); ++} ++ ++/* ++ * If the highest set bit of @flags is N, return a mask with bits [N+1, 31] ++ * set. Returns all ones when @flags is 0 and 0 when bit 31 is set. The shift ++ * is done in 64 bits, like highest_bit(), so that a shift count of 32 (bit 31 ++ * set) is well defined instead of overflowing a 32-bit int. ++ */ ++static u32 higher_bits(u32 flags) ++{ ++ return ~(u32)(((u64)1 << fls(flags)) - 1); ++} ++ ++/* return the mask with only the highest bit set */ ++static u32 highest_bit(u32 flags) ++{ ++ int bit = fls(flags); ++ return ((u64)1 << bit) >> 1; ++} ++ ++static bool u32_before(u32 a, u32 b) ++{ ++ return (s32)(a - b) < 0; ++} ++ ++/* ++ * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX ++ * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate ++ * the allowed kfuncs and those kfuncs should use scx_kf_allowed() to check ++ * whether it's running from an allowed context. ++ * ++ * @mask is constant, always inline to cull the mask calculations. ++ */ ++static __always_inline void scx_kf_allow(u32 mask) ++{ ++ /* nesting is allowed only in increasing scx_kf_mask order */ ++ WARN_ONCE((mask | higher_bits(mask)) & current->scx.kf_mask, ++ "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n", ++ current->scx.kf_mask, mask); ++ current->scx.kf_mask |= mask; ++ barrier(); ++} ++ ++static void scx_kf_disallow(u32 mask) ++{ ++ barrier(); ++ current->scx.kf_mask &= ~mask; ++} ++ ++#define SCX_CALL_OP(mask, op, args...) \ ++do { \ ++ if (mask) { \ ++ scx_kf_allow(mask); \ ++ scx_ops.op(args); \ ++ scx_kf_disallow(mask); \ ++ } else { \ ++ scx_ops.op(args); \ ++ } \ ++} while (0) ++ ++#define SCX_CALL_OP_RET(mask, op, args...)
\ ++({ \ ++ __typeof__(scx_ops.op(args)) __ret; \ ++ if (mask) { \ ++ scx_kf_allow(mask); \ ++ __ret = scx_ops.op(args); \ ++ scx_kf_disallow(mask); \ ++ } else { \ ++ __ret = scx_ops.op(args); \ ++ } \ ++ __ret; \ ++}) ++ ++/* ++ * Some kfuncs are allowed only on the tasks that are subjects of the ++ * in-progress scx_ops operation for, e.g., locking guarantees. To enforce such ++ * restrictions, the following SCX_CALL_OP_*() variants should be used when ++ * invoking scx_ops operations that take task arguments. These can only be used ++ * for non-nesting operations due to the way the tasks are tracked. ++ * ++ * kfuncs which can only operate on such tasks can in turn use ++ * scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on ++ * the specific task. ++ */ ++#define SCX_CALL_OP_TASK(mask, op, task, args...) \ ++do { \ ++ BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ ++ current->scx.kf_tasks[0] = task; \ ++ SCX_CALL_OP(mask, op, task, ##args); \ ++ current->scx.kf_tasks[0] = NULL; \ ++} while (0) ++ ++#define SCX_CALL_OP_TASK_RET(mask, op, task, args...) \ ++({ \ ++ __typeof__(scx_ops.op(task, ##args)) __ret; \ ++ BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ ++ current->scx.kf_tasks[0] = task; \ ++ __ret = SCX_CALL_OP_RET(mask, op, task, ##args); \ ++ current->scx.kf_tasks[0] = NULL; \ ++ __ret; \ ++}) ++ ++#define SCX_CALL_OP_2TASKS_RET(mask, op, task0, task1, args...) 
\ ++({ \ ++ __typeof__(scx_ops.op(task0, task1, ##args)) __ret; \ ++ BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ ++ current->scx.kf_tasks[0] = task0; \ ++ current->scx.kf_tasks[1] = task1; \ ++ __ret = SCX_CALL_OP_RET(mask, op, task0, task1, ##args); \ ++ current->scx.kf_tasks[0] = NULL; \ ++ current->scx.kf_tasks[1] = NULL; \ ++ __ret; \ ++}) ++ ++/* @mask is constant, always inline to cull unnecessary branches */ ++static __always_inline bool scx_kf_allowed(u32 mask) ++{ ++ if (unlikely(!(current->scx.kf_mask & mask))) { ++ scx_ops_error("kfunc with mask 0x%x called from an operation only allowing 0x%x", ++ mask, current->scx.kf_mask); ++ return false; ++ } ++ ++ if (unlikely((mask & SCX_KF_SLEEPABLE) && in_interrupt())) { ++ scx_ops_error("sleepable kfunc called from non-sleepable context"); ++ return false; ++ } ++ ++ /* ++ * Enforce nesting boundaries. e.g. A kfunc which can be called from ++ * DISPATCH must not be called if we're running DEQUEUE which is nested ++ * inside ops.dispatch(). We don't need to check the SCX_KF_SLEEPABLE ++ * boundary thanks to the above in_interrupt() check. 
++ */ ++ if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE && ++ (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) { ++ scx_ops_error("cpu_release kfunc called from a nested operation"); ++ return false; ++ } ++ ++ if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH && ++ (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) { ++ scx_ops_error("dispatch kfunc called from a nested operation"); ++ return false; ++ } ++ ++ return true; ++} ++ ++/* see SCX_CALL_OP_TASK() */ ++static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask, ++ struct task_struct *p) ++{ ++ if (!scx_kf_allowed(mask)) ++ return false; ++ ++ if (unlikely((p != current->scx.kf_tasks[0] && ++ p != current->scx.kf_tasks[1]))) { ++ scx_ops_error("called on a task not being operated on"); ++ return false; ++ } ++ ++ return true; ++} ++ ++/** ++ * nldsq_next_task - Iterate to the next task in a non-local DSQ ++ * @dsq: user dsq being iterated ++ * @cur: current position, %NULL to start iteration ++ * @rev: walk backwards ++ * ++ * Returns %NULL when iteration is finished. ++ */ ++static struct task_struct *nldsq_next_task(struct scx_dispatch_q *dsq, ++ struct task_struct *cur, bool rev) ++{ ++ struct list_head *list_node; ++ struct scx_dsq_list_node *dsq_lnode; ++ ++ lockdep_assert_held(&dsq->lock); ++ ++ if (cur) ++ list_node = &cur->scx.dsq_list.node; ++ else ++ list_node = &dsq->list; ++ ++ /* find the next task, need to skip BPF iteration cursors */ ++ do { ++ if (rev) ++ list_node = list_node->prev; ++ else ++ list_node = list_node->next; ++ ++ if (list_node == &dsq->list) ++ return NULL; ++ ++ dsq_lnode = container_of(list_node, struct scx_dsq_list_node, ++ node); ++ } while (dsq_lnode->is_bpf_iter_cursor); ++ ++ return container_of(dsq_lnode, struct task_struct, scx.dsq_list); ++} ++ ++#define nldsq_for_each_task(p, dsq) \ ++ for ((p) = nldsq_next_task((dsq), NULL, false); (p); \ ++ (p) = nldsq_next_task((dsq), (p), false)) ++ ++ ++/* ++ * BPF DSQ iterator.
Tasks in a non-local DSQ can be iterated in [reverse] ++ * dispatch order. BPF-visible iterator is opaque and larger to allow future ++ * changes without breaking backward compatibility. Can be used with ++ * bpf_for_each(). See bpf_iter_scx_dsq_*(). ++ */ ++enum scx_dsq_iter_flags { ++ /* iterate in the reverse dispatch order */ ++ SCX_DSQ_ITER_REV = 1U << 0, ++ ++ __SCX_DSQ_ITER_ALL_FLAGS = SCX_DSQ_ITER_REV, ++}; ++ ++struct bpf_iter_scx_dsq_kern { ++ struct scx_dsq_list_node cursor; ++ struct scx_dispatch_q *dsq; ++ u32 dsq_seq; ++ u32 flags; ++} __attribute__((aligned(8))); ++ ++struct bpf_iter_scx_dsq { ++ u64 __opaque[6]; ++} __attribute__((aligned(8))); ++ ++ ++/* ++ * SCX task iterator. ++ */ ++struct scx_task_iter { ++ struct sched_ext_entity cursor; ++ struct task_struct *locked; ++ struct rq *rq; ++ struct rq_flags rf; ++}; ++ ++/** ++ * scx_task_iter_init - Initialize a task iterator ++ * @iter: iterator to init ++ * ++ * Initialize @iter. Must be called with scx_tasks_lock held. Once initialized, ++ * @iter must eventually be exited with scx_task_iter_exit(). ++ * ++ * scx_tasks_lock may be released between this and the first next() call or ++ * between any two next() calls. If scx_tasks_lock is released between two ++ * next() calls, the caller is responsible for ensuring that the task being ++ * iterated remains accessible either through RCU read lock or obtaining a ++ * reference count. ++ * ++ * All tasks which existed when the iteration started are guaranteed to be ++ * visited as long as they still exist. 
++ */ ++static void scx_task_iter_init(struct scx_task_iter *iter) ++{ ++ lockdep_assert_held(&scx_tasks_lock); ++ ++ iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; ++ list_add(&iter->cursor.tasks_node, &scx_tasks); ++ iter->locked = NULL; ++} ++ ++/** ++ * scx_task_iter_rq_unlock - Unlock rq locked by a task iterator ++ * @iter: iterator to unlock rq for ++ * ++ * If @iter is in the middle of a locked iteration, it may be locking the rq of ++ * the task currently being visited. Unlock the rq if so. This function can be ++ * safely called anytime during an iteration. ++ * ++ * Returns %true if the rq @iter was locking is unlocked. %false if @iter was ++ * not locking an rq. ++ */ ++static bool scx_task_iter_rq_unlock(struct scx_task_iter *iter) ++{ ++ if (iter->locked) { ++ task_rq_unlock(iter->rq, iter->locked, &iter->rf); ++ iter->locked = NULL; ++ return true; ++ } else { ++ return false; ++ } ++} ++ ++/** ++ * scx_task_iter_exit - Exit a task iterator ++ * @iter: iterator to exit ++ * ++ * Exit a previously initialized @iter. Must be called with scx_tasks_lock held. ++ * If the iterator holds a task's rq lock, that rq lock is released. See ++ * scx_task_iter_init() for details. ++ */ ++static void scx_task_iter_exit(struct scx_task_iter *iter) ++{ ++ lockdep_assert_held(&scx_tasks_lock); ++ ++ scx_task_iter_rq_unlock(iter); ++ list_del_init(&iter->cursor.tasks_node); ++} ++ ++/** ++ * scx_task_iter_next - Next task ++ * @iter: iterator to walk ++ * ++ * Visit the next task. See scx_task_iter_init() for details. 
++ */ ++static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) ++{ ++ struct list_head *cursor = &iter->cursor.tasks_node; ++ struct sched_ext_entity *pos; ++ ++ lockdep_assert_held(&scx_tasks_lock); ++ ++ list_for_each_entry(pos, cursor, tasks_node) { ++ if (&pos->tasks_node == &scx_tasks) ++ return NULL; ++ if (!(pos->flags & SCX_TASK_CURSOR)) { ++ list_move(cursor, &pos->tasks_node); ++ return container_of(pos, struct task_struct, scx); ++ } ++ } ++ ++ /* can't happen, should always terminate at scx_tasks above */ ++ BUG(); ++} ++ ++/** ++ * scx_task_iter_next_locked - Next non-idle task with its rq locked ++ * @iter: iterator to walk ++ * @include_dead: Whether we should include dead tasks in the iteration ++ * ++ * Visit the next non-idle task with its rq lock held. Allows callers to ++ * specify whether they would like to filter out dead tasks. See ++ * scx_task_iter_init() for details. ++ */ ++static struct task_struct * ++scx_task_iter_next_locked(struct scx_task_iter *iter, bool include_dead) ++{ ++ struct task_struct *p; ++retry: ++ scx_task_iter_rq_unlock(iter); ++ ++ while ((p = scx_task_iter_next(iter))) { ++ /* ++ * is_idle_task() tests %PF_IDLE which may not be set for CPUs ++ * which haven't yet been onlined. Test sched_class directly. ++ */ ++ if (p->sched_class != &idle_sched_class) ++ break; ++ } ++ if (!p) ++ return NULL; ++ ++ iter->rq = task_rq_lock(p, &iter->rf); ++ iter->locked = p; ++ ++ /* ++ * If we see %TASK_DEAD, @p already disabled preemption, is about to do ++ * the final __schedule(), won't ever need to be scheduled again and can ++ * thus be safely ignored. If we don't see %TASK_DEAD, @p can't enter ++ * the final __schedule() while we're locking its rq and thus will stay ++ * alive until the rq is unlocked. 
++ */ ++ if (!include_dead && READ_ONCE(p->__state) == TASK_DEAD) ++ goto retry; ++ ++ return p; ++} ++ ++static enum scx_ops_enable_state scx_ops_enable_state(void) ++{ ++ return atomic_read(&scx_ops_enable_state_var); ++} ++ ++static enum scx_ops_enable_state ++scx_ops_set_enable_state(enum scx_ops_enable_state to) ++{ ++ return atomic_xchg(&scx_ops_enable_state_var, to); ++} ++ ++static bool scx_ops_tryset_enable_state(enum scx_ops_enable_state to, ++ enum scx_ops_enable_state from) ++{ ++ int from_v = from; ++ ++ return atomic_try_cmpxchg(&scx_ops_enable_state_var, &from_v, to); ++} ++ ++static bool scx_ops_bypassing(void) ++{ ++ return unlikely(atomic_read(&scx_ops_bypass_depth)); ++} ++ ++/** ++ * wait_ops_state - Busy-wait for the specified ops state to end ++ * @p: target task ++ * @opss: state to wait the end of ++ * ++ * Busy-wait for @p to transition out of @opss. This can only be used when the ++ * state part of @opss is %SCX_OPSS_QUEUEING or %SCX_OPSS_DISPATCHING. This ++ * function also has load_acquire semantics to ensure that the caller can see ++ * the updates made in the enqueueing and dispatching paths. ++ */ ++static void wait_ops_state(struct task_struct *p, unsigned long opss) ++{ ++ do { ++ cpu_relax(); ++ } while (atomic_long_read_acquire(&p->scx.ops_state) == opss); ++} ++ ++/** ++ * ops_cpu_valid - Verify a cpu number ++ * @cpu: cpu number which came from a BPF ops ++ * @where: extra information reported on error ++ * ++ * @cpu is a cpu number which came from the BPF scheduler and can be any value. ++ * Verify that it is in range and one of the possible cpus. If invalid, trigger ++ * an ops error. ++ */ ++static bool ops_cpu_valid(s32 cpu, const char *where) ++{ ++ if (likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu))) { ++ return true; ++ } else { ++ scx_ops_error("invalid CPU %d%s%s", cpu, ++ where ? 
" " : "", where ?: ""); ++ return false; ++ } ++} ++ ++/** ++ * ops_sanitize_err - Sanitize a -errno value ++ * @ops_name: operation to blame on failure ++ * @err: -errno value to sanitize ++ * ++ * Verify @err is a valid -errno. If not, trigger scx_ops_error() and return ++ * -%EPROTO. This is necessary because returning a rogue -errno up the chain can ++ * cause misbehaviors. For an example, a large negative return from ++ * ops.init_task() triggers an oops when passed up the call chain because the ++ * value fails IS_ERR() test after being encoded with ERR_PTR() and then is ++ * handled as a pointer. ++ */ ++static int ops_sanitize_err(const char *ops_name, s32 err) ++{ ++ if (err < 0 && err >= -MAX_ERRNO) ++ return err; ++ ++ scx_ops_error("ops.%s() returned an invalid errno %d", ops_name, err); ++ return -EPROTO; ++} ++ ++static void run_deferred(struct rq *rq) ++{ ++ process_ddsp_deferred_locals(rq); ++} ++ ++#ifdef CONFIG_SMP ++static void deferred_bal_cb_workfn(struct rq *rq) ++{ ++ run_deferred(rq); ++} ++#endif ++ ++static void deferred_irq_workfn(struct irq_work *irq_work) ++{ ++ struct rq *rq = container_of(irq_work, struct rq, scx.deferred_irq_work); ++ ++ raw_spin_rq_lock(rq); ++ run_deferred(rq); ++ raw_spin_rq_unlock(rq); ++} ++ ++/** ++ * schedule_deferred - Schedule execution of deferred actions on an rq ++ * @rq: target rq ++ * ++ * Schedule execution of deferred actions on @rq. Must be called with @rq ++ * locked. Deferred actions are executed with @rq locked but unpinned, and thus ++ * can unlock @rq to e.g. migrate tasks to other rqs. ++ */ ++static void schedule_deferred(struct rq *rq) ++{ ++ lockdep_assert_rq_held(rq); ++ ++#ifdef CONFIG_SMP ++ /* ++ * If in the middle of waking up a task, task_woken_scx() will be called ++ * afterwards which will then run the deferred actions, no need to ++ * schedule anything. 
++ */ ++ if (rq->scx.flags & SCX_RQ_IN_WAKEUP) ++ return; ++ ++ /* ++ * If in balance, the balance callbacks will be called before rq lock is ++ * released. Schedule one. ++ */ ++ if (rq->scx.flags & SCX_RQ_IN_BALANCE) { ++ queue_balance_callback(rq, &rq->scx.deferred_bal_cb, ++ deferred_bal_cb_workfn); ++ return; ++ } ++#endif ++ /* ++ * No scheduler hooks available. Queue an irq work. They are executed on ++ * IRQ re-enable which may take a bit longer than the scheduler hooks. ++ * The above WAKEUP and BALANCE paths should cover most of the cases and ++ * the time to IRQ re-enable shouldn't be long. ++ */ ++ irq_work_queue(&rq->scx.deferred_irq_work); ++} ++ ++/** ++ * touch_core_sched - Update timestamp used for core-sched task ordering ++ * @rq: rq to read clock from, must be locked ++ * @p: task to update the timestamp for ++ * ++ * Update @p->scx.core_sched_at timestamp. This is used by scx_prio_less() to ++ * implement global or local-DSQ FIFO ordering for core-sched. Should be called ++ * when a task becomes runnable and its turn on the CPU ends (e.g. slice ++ * exhaustion). ++ */ ++static void touch_core_sched(struct rq *rq, struct task_struct *p) ++{ ++#ifdef CONFIG_SCHED_CORE ++ /* ++ * It's okay to update the timestamp spuriously. Use ++ * sched_core_disabled() which is cheaper than enabled(). ++ */ ++ if (!sched_core_disabled()) ++ p->scx.core_sched_at = rq_clock_task(rq); ++#endif ++} ++ ++/** ++ * touch_core_sched_dispatch - Update core-sched timestamp on dispatch ++ * @rq: rq to read clock from, must be locked ++ * @p: task being dispatched ++ * ++ * If the BPF scheduler implements custom core-sched ordering via ++ * ops.core_sched_before(), @p->scx.core_sched_at is used to implement FIFO ++ * ordering within each local DSQ. This function is called from dispatch paths ++ * and updates @p->scx.core_sched_at if custom core-sched ordering is in effect. 
++ */ ++static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p) ++{ ++ lockdep_assert_rq_held(rq); ++ assert_clock_updated(rq); ++ ++#ifdef CONFIG_SCHED_CORE ++ if (SCX_HAS_OP(core_sched_before)) ++ touch_core_sched(rq, p); ++#endif ++} ++ ++static void update_curr_scx(struct rq *rq) ++{ ++ struct task_struct *curr = rq->curr; ++ u64 now = rq_clock_task(rq); ++ u64 delta_exec; ++ ++ if (time_before_eq64(now, curr->se.exec_start)) ++ return; ++ ++ delta_exec = now - curr->se.exec_start; ++ curr->se.exec_start = now; ++ curr->se.sum_exec_runtime += delta_exec; ++ account_group_exec_runtime(curr, delta_exec); ++ cgroup_account_cputime(curr, delta_exec); ++ ++ if (curr->scx.slice != SCX_SLICE_INF) { ++ curr->scx.slice -= min(curr->scx.slice, delta_exec); ++ if (!curr->scx.slice) ++ touch_core_sched(rq, curr); ++ } ++} ++ ++static bool scx_dsq_priq_less(struct rb_node *node_a, ++ const struct rb_node *node_b) ++{ ++ const struct task_struct *a = ++ container_of(node_a, struct task_struct, scx.dsq_priq); ++ const struct task_struct *b = ++ container_of(node_b, struct task_struct, scx.dsq_priq); ++ ++ return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime); ++} ++ ++static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta) ++{ ++ /* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */ ++ WRITE_ONCE(dsq->nr, dsq->nr + delta); ++} ++ ++static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, ++ u64 enq_flags) ++{ ++ bool is_local = dsq->id == SCX_DSQ_LOCAL; ++ ++ WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node)); ++ WARN_ON_ONCE((p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) || ++ !RB_EMPTY_NODE(&p->scx.dsq_priq)); ++ ++ if (!is_local) { ++ raw_spin_lock(&dsq->lock); ++ if (unlikely(dsq->id == SCX_DSQ_INVALID)) { ++ scx_ops_error("attempting to dispatch to a destroyed dsq"); ++ /* fall back to the global dsq */ ++ raw_spin_unlock(&dsq->lock); ++ dsq = &scx_dsq_global; ++ raw_spin_lock(&dsq->lock); ++ 
} ++ } ++ ++ if (unlikely((dsq->id & SCX_DSQ_FLAG_BUILTIN) && ++ (enq_flags & SCX_ENQ_DSQ_PRIQ))) { ++ /* ++ * SCX_DSQ_LOCAL and SCX_DSQ_GLOBAL DSQs always consume from ++ * their FIFO queues. To avoid confusion and accidentally ++ * starving vtime-dispatched tasks by FIFO-dispatched tasks, we ++ * disallow any internal DSQ from doing vtime ordering of ++ * tasks. ++ */ ++ scx_ops_error("cannot use vtime ordering for built-in DSQs"); ++ enq_flags &= ~SCX_ENQ_DSQ_PRIQ; ++ } ++ ++ if (enq_flags & SCX_ENQ_DSQ_PRIQ) { ++ struct rb_node *rbp; ++ ++ /* ++ * A PRIQ DSQ shouldn't be using FIFO enqueueing. As tasks are ++ * linked to both the rbtree and list on PRIQs, this can only be ++ * tested easily when adding the first task. ++ */ ++ if (unlikely(RB_EMPTY_ROOT(&dsq->priq) && ++ nldsq_next_task(dsq, NULL, false))) ++ scx_ops_error("DSQ ID 0x%016llx already had FIFO-enqueued tasks", ++ dsq->id); ++ ++ p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ; ++ rb_add(&p->scx.dsq_priq, &dsq->priq, scx_dsq_priq_less); ++ ++ /* ++ * Find the previous task and insert after it on the list so ++ * that @dsq->list is vtime ordered. 
++ */ ++ rbp = rb_prev(&p->scx.dsq_priq); ++ if (rbp) { ++ struct task_struct *prev = ++ container_of(rbp, struct task_struct, ++ scx.dsq_priq); ++ list_add(&p->scx.dsq_list.node, &prev->scx.dsq_list.node); ++ } else { ++ list_add(&p->scx.dsq_list.node, &dsq->list); ++ } ++ } else { ++ /* a FIFO DSQ shouldn't be using PRIQ enqueuing */ ++ if (unlikely(!RB_EMPTY_ROOT(&dsq->priq))) ++ scx_ops_error("DSQ ID 0x%016llx already had PRIQ-enqueued tasks", ++ dsq->id); ++ ++ if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) ++ list_add(&p->scx.dsq_list.node, &dsq->list); ++ else ++ list_add_tail(&p->scx.dsq_list.node, &dsq->list); ++ } ++ ++ /* seq records the order tasks are queued, used by BPF DSQ iterator */ ++ dsq->seq++; ++ p->scx.dsq_seq = dsq->seq; ++ ++ dsq_mod_nr(dsq, 1); ++ p->scx.dsq = dsq; ++ ++ /* ++ * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the ++ * direct dispatch path, but we clear them here because the direct ++ * dispatch verdict may be overridden on the enqueue path during e.g. ++ * bypass. ++ */ ++ p->scx.ddsp_dsq_id = SCX_DSQ_INVALID; ++ p->scx.ddsp_enq_flags = 0; ++ ++ /* ++ * We're transitioning out of QUEUEING or DISPATCHING. store_release to ++ * match waiters' load_acquire. 
++ */ ++ if (enq_flags & SCX_ENQ_CLEAR_OPSS) ++ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); ++ ++ if (is_local) { ++ struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); ++ bool preempt = false; ++ ++ if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr && ++ rq->curr->sched_class == &ext_sched_class) { ++ rq->curr->scx.slice = 0; ++ preempt = true; ++ } ++ ++ if (preempt || sched_class_above(&ext_sched_class, ++ rq->curr->sched_class)) ++ resched_curr(rq); ++ } else { ++ raw_spin_unlock(&dsq->lock); ++ } ++} ++ ++static void task_unlink_from_dsq(struct task_struct *p, ++ struct scx_dispatch_q *dsq) ++{ ++ if (p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) { ++ rb_erase(&p->scx.dsq_priq, &dsq->priq); ++ RB_CLEAR_NODE(&p->scx.dsq_priq); ++ p->scx.dsq_flags &= ~SCX_TASK_DSQ_ON_PRIQ; ++ } ++ ++ list_del_init(&p->scx.dsq_list.node); ++} ++ ++static void dispatch_dequeue(struct rq *rq, struct task_struct *p) ++{ ++ struct scx_dispatch_q *dsq = p->scx.dsq; ++ bool is_local = dsq == &rq->scx.local_dsq; ++ ++ if (!dsq) { ++ /* ++ * If !dsq && on-list, @p is on @rq's ddsp_deferred_locals. ++ * Unlinking is all that's needed to cancel. ++ */ ++ if (unlikely(!list_empty(&p->scx.dsq_list.node))) ++ list_del_init(&p->scx.dsq_list.node); ++ ++ /* ++ * When dispatching directly from the BPF scheduler to a local ++ * DSQ, the task isn't associated with any DSQ but ++ * @p->scx.holding_cpu may be set under the protection of ++ * %SCX_OPSS_DISPATCHING. ++ */ ++ if (p->scx.holding_cpu >= 0) ++ p->scx.holding_cpu = -1; ++ ++ return; ++ } ++ ++ if (!is_local) ++ raw_spin_lock(&dsq->lock); ++ ++ /* ++ * Now that we hold @dsq->lock, @p->scx.holding_cpu and @p->scx.dsq_* ++ * can't change underneath us. 
++ */ ++ if (p->scx.holding_cpu < 0) { ++ /* @p must still be on @dsq, dequeue */ ++ WARN_ON_ONCE(list_empty(&p->scx.dsq_list.node)); ++ task_unlink_from_dsq(p, dsq); ++ dsq_mod_nr(dsq, -1); ++ } else { ++ /* ++ * We're racing against dispatch_to_local_dsq() which already ++ * removed @p from @dsq and set @p->scx.holding_cpu. Clear the ++ * holding_cpu which tells dispatch_to_local_dsq() that it lost ++ * the race. ++ */ ++ WARN_ON_ONCE(!list_empty(&p->scx.dsq_list.node)); ++ p->scx.holding_cpu = -1; ++ } ++ p->scx.dsq = NULL; ++ ++ if (!is_local) ++ raw_spin_unlock(&dsq->lock); ++} ++ ++static struct scx_dispatch_q *find_user_dsq(u64 dsq_id) ++{ ++ return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); ++} ++ ++static struct scx_dispatch_q *find_non_local_dsq(u64 dsq_id) ++{ ++ lockdep_assert(rcu_read_lock_any_held()); ++ ++ if (dsq_id == SCX_DSQ_GLOBAL) ++ return &scx_dsq_global; ++ else ++ return find_user_dsq(dsq_id); ++} ++ ++static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, ++ struct task_struct *p) ++{ ++ struct scx_dispatch_q *dsq; ++ ++ if (dsq_id == SCX_DSQ_LOCAL) ++ return &rq->scx.local_dsq; ++ ++ dsq = find_non_local_dsq(dsq_id); ++ if (unlikely(!dsq)) { ++ scx_ops_error("non-existent DSQ 0x%llx for %s[%d]", ++ dsq_id, p->comm, p->pid); ++ return &scx_dsq_global; ++ } ++ ++ return dsq; ++} ++ ++static void mark_direct_dispatch(struct task_struct *ddsp_task, ++ struct task_struct *p, u64 dsq_id, ++ u64 enq_flags) ++{ ++ /* ++ * Mark that dispatch already happened from ops.select_cpu() or ++ * ops.enqueue() by spoiling direct_dispatch_task with a non-NULL value ++ * which can never match a valid task pointer. 
++ */ ++ __this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH)); ++ ++ /* @p must match the task on the enqueue path */ ++ if (unlikely(p != ddsp_task)) { ++ if (IS_ERR(ddsp_task)) ++ scx_ops_error("%s[%d] already direct-dispatched", ++ p->comm, p->pid); ++ else ++ scx_ops_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]", ++ ddsp_task->comm, ddsp_task->pid, ++ p->comm, p->pid); ++ return; ++ } ++ ++ WARN_ON_ONCE(p->scx.ddsp_dsq_id != SCX_DSQ_INVALID); ++ WARN_ON_ONCE(p->scx.ddsp_enq_flags); ++ ++ p->scx.ddsp_dsq_id = dsq_id; ++ p->scx.ddsp_enq_flags = enq_flags; ++} ++ ++static void direct_dispatch(struct task_struct *p, u64 enq_flags) ++{ ++ struct rq *rq = task_rq(p); ++ struct scx_dispatch_q *dsq; ++ u64 dsq_id = p->scx.ddsp_dsq_id; ++ ++ touch_core_sched_dispatch(rq, p); ++ ++ p->scx.ddsp_enq_flags |= enq_flags; ++ ++ /* ++ * We are in the enqueue path with @rq locked and pinned, and thus can't ++ * double lock a remote rq and enqueue to its local DSQ. For ++ * DSQ_LOCAL_ON verdicts targeting the local DSQ of a remote CPU, defer ++ * the enqueue so that it's executed when @rq can be unlocked. ++ */ ++ if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { ++ s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; ++ unsigned long opss; ++ ++ if (cpu == cpu_of(rq)) { ++ dsq_id = SCX_DSQ_LOCAL; ++ goto dispatch; ++ } ++ ++ opss = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_STATE_MASK; ++ ++ switch (opss & SCX_OPSS_STATE_MASK) { ++ case SCX_OPSS_NONE: ++ break; ++ case SCX_OPSS_QUEUEING: ++ /* ++ * As @p was never passed to the BPF side, _release is ++ * not strictly necessary. Still do it for consistency. 
++ */ ++ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); ++ break; ++ default: ++ WARN_ONCE(true, "sched_ext: %s[%d] has invalid ops state 0x%lx in direct_dispatch()", ++ p->comm, p->pid, opss); ++ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); ++ break; ++ } ++ ++ WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node)); ++ list_add_tail(&p->scx.dsq_list.node, ++ &rq->scx.ddsp_deferred_locals); ++ schedule_deferred(rq); ++ return; ++ } ++ ++dispatch: ++ dsq = find_dsq_for_dispatch(rq, dsq_id, p); ++ dispatch_enqueue(dsq, p, p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); ++} ++ ++static bool scx_rq_online(struct rq *rq) ++{ ++ return likely(rq->scx.flags & SCX_RQ_ONLINE); ++} ++ ++static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, ++ int sticky_cpu) ++{ ++ struct task_struct **ddsp_taskp; ++ unsigned long qseq; ++ ++ WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); ++ ++ /* rq migration */ ++ if (sticky_cpu == cpu_of(rq)) ++ goto local_norefill; ++ ++ /* ++ * If !scx_rq_online(), we already told the BPF scheduler that the CPU ++ * is offline and are just running the hotplug path. Don't bother the ++ * BPF scheduler. 
++ */ ++ if (!scx_rq_online(rq)) ++ goto local; ++ ++ if (scx_ops_bypassing()) { ++ if (enq_flags & SCX_ENQ_LAST) ++ goto local; ++ else ++ goto global; ++ } ++ ++ if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) ++ goto direct; ++ ++ /* see %SCX_OPS_ENQ_EXITING */ ++ if (!static_branch_unlikely(&scx_ops_enq_exiting) && ++ unlikely(p->flags & PF_EXITING)) ++ goto local; ++ ++ /* see %SCX_OPS_ENQ_LAST */ ++ if (!static_branch_unlikely(&scx_ops_enq_last) && ++ (enq_flags & SCX_ENQ_LAST)) ++ goto local; ++ ++ if (!SCX_HAS_OP(enqueue)) ++ goto global; ++ ++ /* DSQ bypass didn't trigger, enqueue on the BPF scheduler */ ++ qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT; ++ ++ WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); ++ atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq); ++ ++ ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); ++ WARN_ON_ONCE(*ddsp_taskp); ++ *ddsp_taskp = p; ++ ++ SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, p, enq_flags); ++ ++ *ddsp_taskp = NULL; ++ if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) ++ goto direct; ++ ++ /* ++ * If not directly dispatched, QUEUEING isn't clear yet and dispatch or ++ * dequeue may be waiting. The store_release matches their load_acquire. ++ */ ++ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq); ++ return; ++ ++direct: ++ direct_dispatch(p, enq_flags); ++ return; ++ ++local: ++ /* ++ * For task-ordering, slice refill must be treated as implying the end ++ * of the current slice. Otherwise, the longer @p stays on the CPU, the ++ * higher priority it becomes from scx_prio_less()'s POV. 
++ */ ++ touch_core_sched(rq, p); ++ p->scx.slice = SCX_SLICE_DFL; ++local_norefill: ++ dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags); ++ return; ++ ++global: ++ touch_core_sched(rq, p); /* see the comment in local: */ ++ p->scx.slice = SCX_SLICE_DFL; ++ dispatch_enqueue(&scx_dsq_global, p, enq_flags); ++} ++ ++static bool task_runnable(const struct task_struct *p) ++{ ++ return !list_empty(&p->scx.runnable_node); ++} ++ ++static void set_task_runnable(struct rq *rq, struct task_struct *p) ++{ ++ lockdep_assert_rq_held(rq); ++ ++ if (p->scx.flags & SCX_TASK_RESET_RUNNABLE_AT) { ++ p->scx.runnable_at = jiffies; ++ p->scx.flags &= ~SCX_TASK_RESET_RUNNABLE_AT; ++ } ++ ++ /* ++ * list_add_tail() must be used. scx_ops_bypass() depends on tasks being ++ * appended to the runnable_list. ++ */ ++ list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list); ++} ++ ++static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at) ++{ ++ list_del_init(&p->scx.runnable_node); ++ if (reset_runnable_at) ++ p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; ++} ++ ++static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int core_enq_flags) ++{ ++ int sticky_cpu = p->scx.sticky_cpu; ++ /* u64: SCX_ENQ_* flags relayed via extra_enq_flags may sit above bit 31 */ ++ u64 enq_flags = core_enq_flags | rq->scx.extra_enq_flags; ++ ++ if (core_enq_flags & ENQUEUE_WAKEUP) ++ rq->scx.flags |= SCX_RQ_IN_WAKEUP; ++ ++ if (sticky_cpu >= 0) ++ p->scx.sticky_cpu = -1; ++ ++ /* ++ * Restoring a running task will be immediately followed by ++ * set_next_task_scx() which expects the task to not be on the BPF ++ * scheduler as tasks can only start running through local DSQs. Force ++ * direct-dispatch into the local DSQ by setting the sticky_cpu. 
++ */ ++ if (unlikely(enq_flags & ENQUEUE_RESTORE) && task_current(rq, p)) ++ sticky_cpu = cpu_of(rq); ++ ++ if (p->scx.flags & SCX_TASK_QUEUED) { ++ WARN_ON_ONCE(!task_runnable(p)); ++ goto out; ++ } ++ ++ set_task_runnable(rq, p); ++ p->scx.flags |= SCX_TASK_QUEUED; ++ rq->scx.nr_running++; ++ add_nr_running(rq, 1); ++ ++ if (SCX_HAS_OP(runnable)) ++ SCX_CALL_OP_TASK(SCX_KF_REST, runnable, p, enq_flags); ++ ++ if (enq_flags & SCX_ENQ_WAKEUP) ++ touch_core_sched(rq, p); ++ ++ do_enqueue_task(rq, p, enq_flags, sticky_cpu); ++out: ++ rq->scx.flags &= ~SCX_RQ_IN_WAKEUP; ++} ++ ++static void ops_dequeue(struct task_struct *p, u64 deq_flags) ++{ ++ unsigned long opss; ++ ++ /* dequeue is always temporary, don't reset runnable_at */ ++ clr_task_runnable(p, false); ++ ++ /* acquire ensures that we see the preceding updates on QUEUED */ ++ opss = atomic_long_read_acquire(&p->scx.ops_state); ++ ++ switch (opss & SCX_OPSS_STATE_MASK) { ++ case SCX_OPSS_NONE: ++ break; ++ case SCX_OPSS_QUEUEING: ++ /* ++ * QUEUEING is started and finished while holding @p's rq lock. ++ * As we're holding the rq lock now, we shouldn't see QUEUEING. ++ */ ++ BUG(); ++ case SCX_OPSS_QUEUED: ++ if (SCX_HAS_OP(dequeue)) ++ SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, p, deq_flags); ++ ++ if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, ++ SCX_OPSS_NONE)) ++ break; ++ fallthrough; ++ case SCX_OPSS_DISPATCHING: ++ /* ++ * If @p is being dispatched from the BPF scheduler to a DSQ, ++ * wait for the transfer to complete so that @p doesn't get ++ * added to its DSQ after dequeueing is complete. ++ * ++ * As we're waiting on DISPATCHING with the rq locked, the ++ * dispatching side shouldn't try to lock the rq while ++ * DISPATCHING is set. See dispatch_to_local_dsq(). ++ * ++ * DISPATCHING shouldn't have qseq set and control can reach ++ * here with NONE @opss from the above QUEUED case block. ++ * Explicitly wait on %SCX_OPSS_DISPATCHING instead of @opss. 
++ */ ++ wait_ops_state(p, SCX_OPSS_DISPATCHING); ++ BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); ++ break; ++ } ++} ++ ++static void dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags) ++{ ++ if (!(p->scx.flags & SCX_TASK_QUEUED)) { ++ WARN_ON_ONCE(task_runnable(p)); ++ return; ++ } ++ ++ ops_dequeue(p, deq_flags); ++ ++ /* ++ * A currently running task which is going off @rq first gets dequeued ++ * and then stops running. As we want running <-> stopping transitions ++ * to be contained within runnable <-> quiescent transitions, trigger ++ * ->stopping() early here instead of in put_prev_task_scx(). ++ * ++ * @p may go through multiple stopping <-> running transitions between ++ * here and put_prev_task_scx() if task attribute changes occur while ++ * balance_scx() leaves @rq unlocked. However, they don't contain any ++ * information meaningful to the BPF scheduler and can be suppressed by ++ * skipping the callbacks if the task is !QUEUED. ++ */ ++ if (SCX_HAS_OP(stopping) && task_current(rq, p)) { ++ update_curr_scx(rq); ++ SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, false); ++ } ++ ++ if (SCX_HAS_OP(quiescent)) ++ SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, p, deq_flags); ++ ++ if (deq_flags & SCX_DEQ_SLEEP) ++ p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP; ++ else ++ p->scx.flags &= ~SCX_TASK_DEQD_FOR_SLEEP; ++ ++ p->scx.flags &= ~SCX_TASK_QUEUED; ++ rq->scx.nr_running--; ++ sub_nr_running(rq, 1); ++ ++ dispatch_dequeue(rq, p); ++} ++ ++static void yield_task_scx(struct rq *rq) ++{ ++ struct task_struct *p = rq->curr; ++ ++ if (SCX_HAS_OP(yield)) ++ SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, p, NULL); ++ else ++ p->scx.slice = 0; ++} ++ ++static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) ++{ ++ struct task_struct *from = rq->curr; ++ ++ if (SCX_HAS_OP(yield)) ++ return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, from, to); ++ else ++ return false; ++} ++ ++#ifdef CONFIG_SMP ++/** ++ * move_task_to_local_dsq - Move a 
task from a different rq to a local DSQ ++ * @rq: rq to move the task into, currently locked ++ * @p: task to move ++ * @enq_flags: %SCX_ENQ_* ++ * ++ * Move @p which is currently on a different rq to @rq's local DSQ. The caller ++ * must: ++ * ++ * 1. Start with exclusive access to @p either through its DSQ lock or ++ * %SCX_OPSS_DISPATCHING flag. ++ * ++ * 2. Set @p->scx.holding_cpu to raw_smp_processor_id(). ++ * ++ * 3. Remember task_rq(@p). Release the exclusive access so that we don't ++ * deadlock with dequeue. ++ * ++ * 4. Lock @rq and the task_rq from #3. ++ * ++ * 5. Call this function. ++ * ++ * Returns %true if @p was successfully moved. %false after racing dequeue and ++ * losing. ++ */ ++static bool move_task_to_local_dsq(struct rq *rq, struct task_struct *p, ++ u64 enq_flags) ++{ ++ struct rq *task_rq; ++ ++ lockdep_assert_rq_held(rq); ++ ++ /* ++ * If dequeue got to @p while we were trying to lock both rq's, it'd ++ * have cleared @p->scx.holding_cpu to -1. While other cpus may have ++ * updated it to different values afterwards, as this operation can't be ++ * preempted or recurse, @p->scx.holding_cpu can never become ++ * raw_smp_processor_id() again before we're done. Thus, we can tell ++ * whether we lost to dequeue by testing whether @p->scx.holding_cpu is ++ * still raw_smp_processor_id(). ++ * ++ * See dispatch_dequeue() for the counterpart. ++ */ ++ if (unlikely(p->scx.holding_cpu != raw_smp_processor_id())) ++ return false; ++ ++ /* @p->rq couldn't have changed if we're still the holding cpu */ ++ task_rq = task_rq(p); ++ lockdep_assert_rq_held(task_rq); ++ ++ WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(rq), p->cpus_ptr)); ++ deactivate_task(task_rq, p, 0); ++ set_task_cpu(p, cpu_of(rq)); ++ p->scx.sticky_cpu = cpu_of(rq); ++ ++ /* ++ * We want to pass scx-specific enq_flags but activate_task() will ++ * truncate the upper 32 bit. As we own @rq, we can pass them through ++ * @rq->scx.extra_enq_flags instead. 
++ */ ++ WARN_ON_ONCE(rq->scx.extra_enq_flags); ++ rq->scx.extra_enq_flags = enq_flags; ++ activate_task(rq, p, 0); ++ rq->scx.extra_enq_flags = 0; ++ ++ return true; ++} ++ ++/** ++ * dispatch_to_local_dsq_lock - Ensure source and destination rq's are locked ++ * @rq: current rq which is locked ++ * @src_rq: rq to move task from ++ * @dst_rq: rq to move task to ++ * ++ * We're holding @rq lock and trying to dispatch a task from @src_rq to ++ * @dst_rq's local DSQ and thus need to lock both @src_rq and @dst_rq. Whether ++ * @rq stays locked isn't important as long as the state is restored after ++ * dispatch_to_local_dsq_unlock(). ++ */ ++static void dispatch_to_local_dsq_lock(struct rq *rq, struct rq *src_rq, ++ struct rq *dst_rq) ++{ ++ if (src_rq == dst_rq) { ++ raw_spin_rq_unlock(rq); ++ raw_spin_rq_lock(dst_rq); ++ } else if (rq == src_rq) { ++ double_lock_balance(rq, dst_rq); ++ } else if (rq == dst_rq) { ++ double_lock_balance(rq, src_rq); ++ } else { ++ raw_spin_rq_unlock(rq); ++ double_rq_lock(src_rq, dst_rq); ++ } ++} ++ ++/** ++ * dispatch_to_local_dsq_unlock - Undo dispatch_to_local_dsq_lock() ++ * @rq: current rq which is locked ++ * @src_rq: rq to move task from ++ * @dst_rq: rq to move task to ++ * ++ * Unlock @src_rq and @dst_rq and ensure that @rq is locked on return. 
++ */ ++static void dispatch_to_local_dsq_unlock(struct rq *rq, struct rq *src_rq, ++ struct rq *dst_rq) ++{ ++ if (src_rq == dst_rq) { ++ raw_spin_rq_unlock(dst_rq); ++ raw_spin_rq_lock(rq); ++ } else if (rq == src_rq) { ++ double_unlock_balance(rq, dst_rq); ++ } else if (rq == dst_rq) { ++ double_unlock_balance(rq, src_rq); ++ } else { ++ double_rq_unlock(src_rq, dst_rq); ++ raw_spin_rq_lock(rq); ++ } ++} ++#endif /* CONFIG_SMP */ ++ ++static void consume_local_task(struct rq *rq, struct scx_dispatch_q *dsq, ++ struct task_struct *p) ++{ ++ lockdep_assert_held(&dsq->lock); /* released on return */ ++ ++ /* @dsq is locked and @p is on this rq */ ++ WARN_ON_ONCE(p->scx.holding_cpu >= 0); ++ task_unlink_from_dsq(p, dsq); ++ list_add_tail(&p->scx.dsq_list.node, &rq->scx.local_dsq.list); ++ dsq_mod_nr(dsq, -1); ++ dsq_mod_nr(&rq->scx.local_dsq, 1); ++ p->scx.dsq = &rq->scx.local_dsq; ++ raw_spin_unlock(&dsq->lock); ++} ++ ++#ifdef CONFIG_SMP ++/* ++ * Similar to kernel/sched/core.c::is_cpu_allowed() but we're testing whether @p ++ * can be pulled to @rq. ++ */ ++static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq) ++{ ++ int cpu = cpu_of(rq); ++ ++ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) ++ return false; ++ if (unlikely(is_migration_disabled(p))) ++ return false; ++ if (!(p->flags & PF_KTHREAD) && unlikely(!task_cpu_possible(cpu, p))) ++ return false; ++ if (!scx_rq_online(rq)) ++ return false; ++ return true; ++} ++ ++static bool consume_remote_task(struct rq *rq, struct scx_dispatch_q *dsq, ++ struct task_struct *p, struct rq *task_rq) ++{ ++ bool moved = false; ++ ++ lockdep_assert_held(&dsq->lock); /* released on return */ ++ ++ /* ++ * @dsq is locked and @p is on a remote rq. @p is currently protected by ++ * @dsq->lock. We want to pull @p to @rq but may deadlock if we grab ++ * @task_rq while holding @dsq and @rq locks. As dequeue can't drop the ++ * rq lock or fail, do a little dancing from our side. See ++ * move_task_to_local_dsq(). 
++ */ ++ WARN_ON_ONCE(p->scx.holding_cpu >= 0); ++ task_unlink_from_dsq(p, dsq); ++ dsq_mod_nr(dsq, -1); ++ p->scx.holding_cpu = raw_smp_processor_id(); ++ raw_spin_unlock(&dsq->lock); ++ ++ double_lock_balance(rq, task_rq); ++ ++ moved = move_task_to_local_dsq(rq, p, 0); ++ ++ double_unlock_balance(rq, task_rq); ++ ++ return moved; ++} ++#else /* CONFIG_SMP */ ++static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq) { return false; } ++static bool consume_remote_task(struct rq *rq, struct scx_dispatch_q *dsq, ++ struct task_struct *p, struct rq *task_rq) { return false; } ++#endif /* CONFIG_SMP */ ++ ++static bool consume_dispatch_q(struct rq *rq, struct scx_dispatch_q *dsq) ++{ ++ struct task_struct *p; ++retry: ++ /* ++ * The caller can't expect to successfully consume a task if the task's ++ * addition to @dsq isn't guaranteed to be visible somehow. Test ++ * @dsq->list without locking and skip if it seems empty. ++ */ ++ if (list_empty(&dsq->list)) ++ return false; ++ ++ raw_spin_lock(&dsq->lock); ++ ++ nldsq_for_each_task(p, dsq) { ++ struct rq *task_rq = task_rq(p); ++ ++ if (rq == task_rq) { ++ consume_local_task(rq, dsq, p); ++ return true; ++ } ++ ++ if (task_can_run_on_remote_rq(p, rq)) { ++ if (likely(consume_remote_task(rq, dsq, p, task_rq))) ++ return true; ++ goto retry; ++ } ++ } ++ ++ raw_spin_unlock(&dsq->lock); ++ return false; ++} ++ ++enum dispatch_to_local_dsq_ret { ++ DTL_DISPATCHED, /* successfully dispatched */ ++ DTL_LOST, /* lost race to dequeue */ ++ DTL_NOT_LOCAL, /* destination is not a local DSQ */ ++ DTL_INVALID, /* invalid local dsq_id */ ++}; ++ ++/** ++ * dispatch_to_local_dsq - Dispatch a task to a local dsq ++ * @rq: current rq which is locked ++ * @dsq_id: destination dsq ID ++ * @p: task to dispatch ++ * @enq_flags: %SCX_ENQ_* ++ * ++ * We're holding @rq lock and want to dispatch @p to the local DSQ identified by ++ * @dsq_id. 
This function performs all the synchronization dancing needed ++ * because local DSQs are protected with rq locks. ++ * ++ * The caller must have exclusive ownership of @p (e.g. through ++ * %SCX_OPSS_DISPATCHING). ++ */ ++static enum dispatch_to_local_dsq_ret ++dispatch_to_local_dsq(struct rq *rq, u64 dsq_id, struct task_struct *p, ++ u64 enq_flags) ++{ ++ struct rq *src_rq = task_rq(p); ++ struct rq *dst_rq; ++ ++ /* ++ * We're synchronized against dequeue through DISPATCHING. As @p can't ++ * be dequeued, its task_rq and cpus_allowed are stable too. ++ */ ++ if (dsq_id == SCX_DSQ_LOCAL) { ++ dst_rq = rq; ++ } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { ++ s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; ++ ++ if (!ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) ++ return DTL_INVALID; ++ dst_rq = cpu_rq(cpu); ++ } else { ++ return DTL_NOT_LOCAL; ++ } ++ ++ /* if dispatching to @rq that @p is already on, no lock dancing needed */ ++ if (rq == src_rq && rq == dst_rq) { ++ dispatch_enqueue(&dst_rq->scx.local_dsq, p, ++ enq_flags | SCX_ENQ_CLEAR_OPSS); ++ return DTL_DISPATCHED; ++ } ++ ++#ifdef CONFIG_SMP ++ if (cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)) { ++ struct rq *locked_dst_rq = dst_rq; ++ bool dsp; ++ ++ /* ++ * @p is on a possibly remote @src_rq which we need to lock to ++ * move the task. If dequeue is in progress, it'd be locking ++ * @src_rq and waiting on DISPATCHING, so we can't grab @src_rq ++ * lock while holding DISPATCHING. ++ * ++ * As DISPATCHING guarantees that @p is wholly ours, we can ++ * pretend that we're moving from a DSQ and use the same ++ * mechanism - mark the task under transfer with holding_cpu, ++ * release DISPATCHING and then follow the same protocol. 
++ */ ++ p->scx.holding_cpu = raw_smp_processor_id(); ++ ++ /* store_release ensures that dequeue sees the above */ ++ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); ++ ++ dispatch_to_local_dsq_lock(rq, src_rq, locked_dst_rq); ++ ++ /* ++ * We don't require the BPF scheduler to avoid dispatching to ++ * offline CPUs mostly for convenience but also because CPUs can ++ * go offline between scx_bpf_dispatch() calls and here. If @p ++ * is destined to an offline CPU, queue it on its current CPU ++ * instead, which should always be safe. As this is an allowed ++ * behavior, don't trigger an ops error. ++ */ ++ if (!scx_rq_online(dst_rq)) ++ dst_rq = src_rq; ++ ++ if (src_rq == dst_rq) { ++ /* ++ * As @p is staying on the same rq, there's no need to ++ * go through the full deactivate/activate cycle. ++ * Optimize by abbreviating the operations in ++ * move_task_to_local_dsq(). ++ */ ++ dsp = p->scx.holding_cpu == raw_smp_processor_id(); ++ if (likely(dsp)) { ++ p->scx.holding_cpu = -1; ++ dispatch_enqueue(&dst_rq->scx.local_dsq, p, ++ enq_flags); ++ } ++ } else { ++ dsp = move_task_to_local_dsq(dst_rq, p, enq_flags); ++ } ++ ++ /* if the destination CPU is idle, wake it up */ ++ if (dsp && sched_class_above(p->sched_class, ++ dst_rq->curr->sched_class)) ++ resched_curr(dst_rq); ++ ++ dispatch_to_local_dsq_unlock(rq, src_rq, locked_dst_rq); ++ ++ return dsp ? DTL_DISPATCHED : DTL_LOST; ++ } ++#endif /* CONFIG_SMP */ ++ ++ scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]", ++ cpu_of(dst_rq), p->comm, p->pid); ++ return DTL_INVALID; ++} ++ ++/** ++ * finish_dispatch - Asynchronously finish dispatching a task ++ * @rq: current rq which is locked ++ * @p: task to finish dispatching ++ * @qseq_at_dispatch: qseq when @p started getting dispatched ++ * @dsq_id: destination DSQ ID ++ * @enq_flags: %SCX_ENQ_* ++ * ++ * Dispatching to local DSQs may need to wait for queueing to complete or ++ * require rq lock dancing. 
As we don't wanna do either while inside ++ * ops.dispatch() to avoid locking order inversion, we split dispatching into ++ * two parts. scx_bpf_dispatch() which is called by ops.dispatch() records the ++ * task and its qseq. Once ops.dispatch() returns, this function is called to ++ * finish up. ++ * ++ * There is no guarantee that @p is still valid for dispatching or even that it ++ * was valid in the first place. Make sure that the task is still owned by the ++ * BPF scheduler and claim the ownership before dispatching. ++ */ ++static void finish_dispatch(struct rq *rq, struct task_struct *p, ++ unsigned long qseq_at_dispatch, ++ u64 dsq_id, u64 enq_flags) ++{ ++ struct scx_dispatch_q *dsq; ++ unsigned long opss; ++ ++ touch_core_sched_dispatch(rq, p); ++retry: ++ /* ++ * No need for _acquire here. @p is accessed only after a successful ++ * try_cmpxchg to DISPATCHING. ++ */ ++ opss = atomic_long_read(&p->scx.ops_state); ++ ++ switch (opss & SCX_OPSS_STATE_MASK) { ++ case SCX_OPSS_DISPATCHING: ++ case SCX_OPSS_NONE: ++ /* someone else already got to it */ ++ return; ++ case SCX_OPSS_QUEUED: ++ /* ++ * If qseq doesn't match, @p has gone through at least one ++ * dispatch/dequeue and re-enqueue cycle between ++ * scx_bpf_dispatch() and here and we have no claim on it. ++ */ ++ if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch) ++ return; ++ ++ /* ++ * While we know @p is accessible, we don't yet have a claim on ++ * it - the BPF scheduler is allowed to dispatch tasks ++ * spuriously and there can be a racing dequeue attempt. Let's ++ * claim @p by atomically transitioning it from QUEUED to ++ * DISPATCHING. ++ */ ++ if (likely(atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, ++ SCX_OPSS_DISPATCHING))) ++ break; ++ goto retry; ++ case SCX_OPSS_QUEUEING: ++ /* ++ * do_enqueue_task() is in the process of transferring the task ++ * to the BPF scheduler while holding @p's rq lock. 
As we aren't ++ * holding any kernel or BPF resource that the enqueue path may ++ * depend upon, it's safe to wait. ++ */ ++ wait_ops_state(p, opss); ++ goto retry; ++ } ++ ++ BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED)); ++ ++ switch (dispatch_to_local_dsq(rq, dsq_id, p, enq_flags)) { ++ case DTL_DISPATCHED: ++ break; ++ case DTL_LOST: ++ break; ++ case DTL_INVALID: ++ dsq_id = SCX_DSQ_GLOBAL; ++ fallthrough; ++ case DTL_NOT_LOCAL: ++ dsq = find_dsq_for_dispatch(cpu_rq(raw_smp_processor_id()), ++ dsq_id, p); ++ dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); ++ break; ++ } ++} ++ ++static void flush_dispatch_buf(struct rq *rq) ++{ ++ struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); ++ u32 u; ++ ++ for (u = 0; u < dspc->cursor; u++) { ++ struct scx_dsp_buf_ent *ent = &dspc->buf[u]; ++ ++ finish_dispatch(rq, ent->task, ent->qseq, ent->dsq_id, ++ ent->enq_flags); ++ } ++ ++ dspc->nr_tasks += dspc->cursor; ++ dspc->cursor = 0; ++} ++ ++static int balance_one(struct rq *rq, struct task_struct *prev, bool local) ++{ ++ struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); ++ bool prev_on_scx = prev->sched_class == &ext_sched_class; ++ int nr_loops = SCX_DSP_MAX_LOOPS; ++ bool has_tasks = false; ++ ++ lockdep_assert_rq_held(rq); ++ rq->scx.flags |= SCX_RQ_IN_BALANCE; ++ ++ if (static_branch_unlikely(&scx_ops_cpu_preempt) && ++ unlikely(rq->scx.cpu_released)) { ++ /* ++ * If the previous sched_class for the current CPU was not SCX, ++ * notify the BPF scheduler that it again has control of the ++ * core. This callback complements ->cpu_release(), which is ++ * emitted in switch_class_scx(). 
++ */ ++ if (SCX_HAS_OP(cpu_acquire)) ++ SCX_CALL_OP(0, cpu_acquire, cpu_of(rq), NULL); ++ rq->scx.cpu_released = false; ++ } ++ ++ if (prev_on_scx) { ++ WARN_ON_ONCE(local && (prev->scx.flags & SCX_TASK_BAL_KEEP)); ++ update_curr_scx(rq); ++ ++ /* ++ * If @prev is runnable & has slice left, it has priority and ++ * fetching more just increases latency for the fetched tasks. ++ * Tell put_prev_task_scx() to put @prev on local_dsq. If the ++ * BPF scheduler wants to handle this explicitly, it should ++ * implement ->cpu_release(). ++ * ++ * See scx_ops_disable_workfn() for the explanation on the ++ * bypassing test. ++ * ++ * When balancing a remote CPU for core-sched, there won't be a ++ * following put_prev_task_scx() call and we don't own ++ * %SCX_TASK_BAL_KEEP. Instead, pick_task_scx() will test the ++ * same conditions later and pick @rq->curr accordingly. ++ */ ++ if ((prev->scx.flags & SCX_TASK_QUEUED) && ++ prev->scx.slice && !scx_ops_bypassing()) { ++ if (local) ++ prev->scx.flags |= SCX_TASK_BAL_KEEP; ++ goto has_tasks; ++ } ++ } ++ ++ /* if there already are tasks to run, nothing to do */ ++ if (rq->scx.local_dsq.nr) ++ goto has_tasks; ++ ++ if (consume_dispatch_q(rq, &scx_dsq_global)) ++ goto has_tasks; ++ ++ if (!SCX_HAS_OP(dispatch) || scx_ops_bypassing() || !scx_rq_online(rq)) ++ goto out; ++ ++ dspc->rq = rq; ++ ++ /* ++ * The dispatch loop. Because flush_dispatch_buf() may drop the rq lock, ++ * the local DSQ might still end up empty after a successful ++ * ops.dispatch(). If the local DSQ is empty even after ops.dispatch() ++ * produced some tasks, retry. The BPF scheduler may depend on this ++ * looping behavior to simplify its implementation. ++ */ ++ do { ++ dspc->nr_tasks = 0; ++ ++ SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, cpu_of(rq), ++ prev_on_scx ? 
prev : NULL); ++ ++ flush_dispatch_buf(rq); ++ ++ if (rq->scx.local_dsq.nr) ++ goto has_tasks; ++ if (consume_dispatch_q(rq, &scx_dsq_global)) ++ goto has_tasks; ++ ++ /* ++ * ops.dispatch() can trap us in this loop by repeatedly ++ * dispatching ineligible tasks. Break out once in a while to ++ * allow the watchdog to run. As IRQ can't be enabled in ++ * balance(), we want to complete this scheduling cycle and then ++ * start a new one. IOW, we want to call resched_curr() on the ++ * next, most likely idle, task, not the current one. Use ++ * scx_bpf_kick_cpu() for deferred kicking. ++ */ ++ if (unlikely(!--nr_loops)) { ++ scx_bpf_kick_cpu(cpu_of(rq), 0); ++ break; ++ } ++ } while (dspc->nr_tasks); ++ ++ goto out; ++ ++has_tasks: ++ has_tasks = true; ++out: ++ rq->scx.flags &= ~SCX_RQ_IN_BALANCE; ++ return has_tasks; ++} ++ ++#ifdef CONFIG_SMP ++static int balance_scx(struct rq *rq, struct task_struct *prev, ++ struct rq_flags *rf) ++{ ++ int ret; ++ ++ rq_unpin_lock(rq, rf); ++ ++ ret = balance_one(rq, prev, true); ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When core-sched is enabled, this ops.balance() call will be followed ++ * by put_prev_task_scx() and pick_task_scx() on this CPU and pick_task_scx() ++ * on the SMT siblings. Balance the siblings too. ++ */ ++ if (sched_core_enabled(rq)) { ++ const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq)); ++ int scpu; ++ ++ for_each_cpu_andnot(scpu, smt_mask, cpumask_of(cpu_of(rq))) { ++ struct rq *srq = cpu_rq(scpu); ++ struct task_struct *sprev = srq->curr; ++ ++ WARN_ON_ONCE(__rq_lockp(rq) != __rq_lockp(srq)); ++ update_rq_clock(srq); ++ balance_one(srq, sprev, false); ++ } ++ } ++#endif ++ rq_repin_lock(rq, rf); ++ ++ return ret; ++} ++#endif ++ ++static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) ++{ ++ if (p->scx.flags & SCX_TASK_QUEUED) { ++ /* ++ * Core-sched might decide to execute @p before it is ++ * dispatched. Call ops_dequeue() to notify the BPF scheduler. 
++ */ ++ ops_dequeue(p, SCX_DEQ_CORE_SCHED_EXEC); ++ dispatch_dequeue(rq, p); ++ } ++ ++ p->se.exec_start = rq_clock_task(rq); ++ ++ /* see dequeue_task_scx() on why we skip when !QUEUED */ ++ if (SCX_HAS_OP(running) && (p->scx.flags & SCX_TASK_QUEUED)) ++ SCX_CALL_OP_TASK(SCX_KF_REST, running, p); ++ ++ clr_task_runnable(p, true); ++ ++ /* ++ * @p is getting newly scheduled or got kicked after someone updated its ++ * slice. Refresh whether tick can be stopped. See scx_can_stop_tick(). ++ */ ++ if ((p->scx.slice == SCX_SLICE_INF) != ++ (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) { ++ if (p->scx.slice == SCX_SLICE_INF) ++ rq->scx.flags |= SCX_RQ_CAN_STOP_TICK; ++ else ++ rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK; ++ ++ sched_update_tick_dependency(rq); ++ ++ /* ++ * For now, let's refresh the load_avgs just when transitioning ++ * in and out of nohz. In the future, we might want to add a ++ * mechanism which calls the following periodically on ++ * tick-stopped CPUs. ++ */ ++ update_other_load_avgs(rq); ++ } ++} ++ ++static void process_ddsp_deferred_locals(struct rq *rq) ++{ ++ struct task_struct *p, *tmp; ++ ++ lockdep_assert_rq_held(rq); ++ ++ /* ++ * Now that @rq can be unlocked, execute the deferred enqueueing of ++ * tasks directly dispatched to the local DSQs of other CPUs. See ++ * direct_dispatch(). ++ */ ++ list_for_each_entry_safe(p, tmp, &rq->scx.ddsp_deferred_locals, ++ scx.dsq_list.node) { ++ s32 ret; ++ ++ list_del_init(&p->scx.dsq_list.node); ++ ++ ret = dispatch_to_local_dsq(rq, p->scx.ddsp_dsq_id, p, ++ p->scx.ddsp_enq_flags); ++ WARN_ON_ONCE(ret == DTL_NOT_LOCAL); ++ } ++} ++ ++static void put_prev_task_scx(struct rq *rq, struct task_struct *p) ++{ ++#ifndef CONFIG_SMP ++ /* ++ * UP workaround. ++ * ++ * Because SCX may transfer tasks across CPUs during dispatch, dispatch ++ * is performed from its balance operation which isn't called in UP. ++ * Let's work around by calling it from the operations which come right ++ * after. ++ * ++ * 1. 
If the prev task is on SCX, pick_next_task() calls ++ * .put_prev_task() right after. As .put_prev_task() is also called ++ * from other places, we need to distinguish the calls which can be ++ * done by looking at the previous task's state - if still queued or ++ * dequeued with %SCX_DEQ_SLEEP, the caller must be pick_next_task(). ++ * This case is handled here. ++ * ++ * 2. If the prev task is not on SCX, the first following call into SCX ++ * will be .pick_next_task(), which is covered by calling ++ * balance_scx() from pick_next_task_scx(). ++ * ++ * Note that we can't merge the first case into the second as ++ * balance_scx() must be called before the previous SCX task goes ++ * through put_prev_task_scx(). ++ * ++ * @rq is pinned and can't be unlocked. As UP doesn't transfer tasks ++ * around, balance_one() doesn't need to. ++ */ ++ if (p->scx.flags & (SCX_TASK_QUEUED | SCX_TASK_DEQD_FOR_SLEEP)) ++ balance_one(rq, p, true); ++#endif ++ ++ update_curr_scx(rq); ++ ++ /* see dequeue_task_scx() on why we skip when !QUEUED */ ++ if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED)) ++ SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, true); ++ ++ /* ++ * If we're being called from put_prev_task_balance(), balance_scx() may ++ * have decided that @p should keep running. ++ */ ++ if (p->scx.flags & SCX_TASK_BAL_KEEP) { ++ p->scx.flags &= ~SCX_TASK_BAL_KEEP; ++ set_task_runnable(rq, p); ++ dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); ++ return; ++ } ++ ++ if (p->scx.flags & SCX_TASK_QUEUED) { ++ set_task_runnable(rq, p); ++ ++ /* ++ * If @p has slice left and balance_scx() didn't tag it for ++ * keeping, @p is getting preempted by a higher priority ++ * scheduler class or core-sched forcing a different task. Leave ++ * it at the head of the local DSQ. 
++ */ ++ if (p->scx.slice && !scx_ops_bypassing()) { ++ dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); ++ return; ++ } ++ ++ /* ++ * If we're in the pick_next_task path, balance_scx() should ++ * have already populated the local DSQ if there are any other ++ * available tasks. If empty, tell ops.enqueue() that @p is the ++ * only one available for this cpu. ops.enqueue() should put it ++ * on the local DSQ so that the subsequent pick_next_task_scx() ++ * can find the task unless it wants to trigger a separate ++ * follow-up scheduling event. ++ */ ++ if (list_empty(&rq->scx.local_dsq.list)) ++ do_enqueue_task(rq, p, SCX_ENQ_LAST, -1); ++ else ++ do_enqueue_task(rq, p, 0, -1); ++ } ++} ++ ++static struct task_struct *first_local_task(struct rq *rq) ++{ ++ return list_first_entry_or_null(&rq->scx.local_dsq.list, ++ struct task_struct, scx.dsq_list.node); ++} ++ ++static struct task_struct *pick_next_task_scx(struct rq *rq) ++{ ++ struct task_struct *p; ++ ++#ifndef CONFIG_SMP ++ /* UP workaround - see the comment at the head of put_prev_task_scx() */ ++ if (unlikely(rq->curr->sched_class != &ext_sched_class)) ++ balance_one(rq, rq->curr, true); ++#endif ++ ++ p = first_local_task(rq); ++ if (!p) ++ return NULL; ++ ++ set_next_task_scx(rq, p, true); ++ ++ if (unlikely(!p->scx.slice)) { ++ if (!scx_ops_bypassing() && !scx_warned_zero_slice) { ++ printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n", ++ p->comm, p->pid); ++ scx_warned_zero_slice = true; ++ } ++ p->scx.slice = SCX_SLICE_DFL; ++ } ++ ++ return p; ++} ++ ++#ifdef CONFIG_SCHED_CORE ++/** ++ * scx_prio_less - Task ordering for core-sched ++ * @a: task A ++ * @b: task B ++ * ++ * Core-sched is implemented as an additional scheduling layer on top of the ++ * usual sched_class'es and needs to find out the expected task ordering. For ++ * SCX, core-sched calls this function to interrogate the task ordering. 
++ * ++ * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used ++ * to implement the default task ordering. The older the timestamp, the higher ++ * priority the task - the global FIFO ordering matching the default scheduling ++ * behavior. ++ * ++ * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to ++ * implement FIFO ordering within each local DSQ. See pick_task_scx(). ++ */ ++bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, ++ bool in_fi) ++{ ++ /* ++ * The const qualifiers are dropped from task_struct pointers when ++ * calling ops.core_sched_before(). Accesses are controlled by the ++ * verifier. ++ */ ++ if (SCX_HAS_OP(core_sched_before) && !scx_ops_bypassing()) ++ return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before, ++ (struct task_struct *)a, ++ (struct task_struct *)b); ++ else ++ return time_after64(a->scx.core_sched_at, b->scx.core_sched_at); ++} ++ ++/** ++ * pick_task_scx - Pick a candidate task for core-sched ++ * @rq: rq to pick the candidate task from ++ * ++ * Core-sched calls this function on each SMT sibling to determine the next ++ * tasks to run on the SMT siblings. balance_one() has been called on all ++ * siblings and put_prev_task_scx() has been called only for the current CPU. ++ * ++ * As put_prev_task_scx() hasn't been called on remote CPUs, we can't just look ++ * at the first task in the local dsq. @rq->curr has to be considered explicitly ++ * to mimic %SCX_TASK_BAL_KEEP. ++ */ ++static struct task_struct *pick_task_scx(struct rq *rq) ++{ ++ struct task_struct *curr = rq->curr; ++ struct task_struct *first = first_local_task(rq); ++ ++ if (curr->scx.flags & SCX_TASK_QUEUED) { ++ /* is curr the only runnable task? */ ++ if (!first) ++ return curr; ++ ++ /* ++ * Does curr trump first? 
We can always go by core_sched_at for ++ * this comparison as it represents global FIFO ordering when ++ * the default core-sched ordering is used and local-DSQ FIFO ++ * ordering otherwise. ++ * ++ * We can have a task with an earlier timestamp on the DSQ. For ++ * example, when a current task is preempted by a sibling ++ * picking a different cookie, the task would be requeued at the ++ * head of the local DSQ with an earlier timestamp than the ++ * core-sched picked next task. Besides, the BPF scheduler may ++ * dispatch any tasks to the local DSQ anytime. ++ */ ++ if (curr->scx.slice && time_before64(curr->scx.core_sched_at, ++ first->scx.core_sched_at)) ++ return curr; ++ } ++ ++ return first; /* this may be %NULL */ ++} ++#endif /* CONFIG_SCHED_CORE */ ++ ++static enum scx_cpu_preempt_reason ++preempt_reason_from_class(const struct sched_class *class) ++{ ++#ifdef CONFIG_SMP ++ if (class == &stop_sched_class) ++ return SCX_CPU_PREEMPT_STOP; ++#endif ++ if (class == &dl_sched_class) ++ return SCX_CPU_PREEMPT_DL; ++ if (class == &rt_sched_class) ++ return SCX_CPU_PREEMPT_RT; ++ return SCX_CPU_PREEMPT_UNKNOWN; ++} ++ ++static void switch_class_scx(struct rq *rq, struct task_struct *next) ++{ ++ const struct sched_class *next_class = next->sched_class; ++ ++ if (!scx_enabled()) ++ return; ++#ifdef CONFIG_SMP ++ /* ++ * Pairs with the smp_load_acquire() issued by a CPU in ++ * kick_cpus_irq_workfn() who is waiting for this CPU to perform a ++ * resched. ++ */ ++ smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1); ++#endif ++ if (!static_branch_unlikely(&scx_ops_cpu_preempt)) ++ return; ++ ++ /* ++ * The callback is conceptually meant to convey that the CPU is no ++ * longer under the control of SCX. Therefore, don't invoke the callback ++ * if the next class is below SCX (in which case the BPF scheduler has ++ * actively decided not to schedule any tasks on the CPU). 
++ */ ++ if (sched_class_above(&ext_sched_class, next_class)) ++ return; ++ ++ /* ++ * At this point we know that SCX was preempted by a higher priority ++ * sched_class, so invoke the ->cpu_release() callback if we have not ++ * done so already. We only send the callback once between SCX being ++ * preempted, and it regaining control of the CPU. ++ * ++ * ->cpu_release() complements ->cpu_acquire(), which is emitted the ++ * next time that balance_scx() is invoked. ++ */ ++ if (!rq->scx.cpu_released) { ++ if (SCX_HAS_OP(cpu_release)) { ++ struct scx_cpu_release_args args = { ++ .reason = preempt_reason_from_class(next_class), ++ .task = next, ++ }; ++ ++ SCX_CALL_OP(SCX_KF_CPU_RELEASE, ++ cpu_release, cpu_of(rq), &args); ++ } ++ rq->scx.cpu_released = true; ++ } ++} ++ ++#ifdef CONFIG_SMP ++ ++static bool test_and_clear_cpu_idle(int cpu) ++{ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * SMT mask should be cleared whether we can claim @cpu or not. The SMT ++ * cluster is not wholly idle either way. This also prevents ++ * scx_pick_idle_cpu() from getting caught in an infinite loop. ++ */ ++ if (sched_smt_active()) { ++ const struct cpumask *smt = cpu_smt_mask(cpu); ++ ++ /* ++ * If offline, @cpu is not its own sibling and ++ * scx_pick_idle_cpu() can get caught in an infinite loop as ++ * @cpu is never cleared from idle_masks.smt. Ensure that @cpu ++ * is eventually cleared. 
++ */ ++ if (cpumask_intersects(smt, idle_masks.smt)) ++ cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); ++ else if (cpumask_test_cpu(cpu, idle_masks.smt)) ++ __cpumask_clear_cpu(cpu, idle_masks.smt); ++ } ++#endif ++ return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu); ++} ++ ++static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) ++{ ++ int cpu; ++ ++retry: ++ if (sched_smt_active()) { ++ cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed); ++ if (cpu < nr_cpu_ids) ++ goto found; ++ ++ if (flags & SCX_PICK_IDLE_CORE) ++ return -EBUSY; ++ } ++ ++ cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed); ++ if (cpu >= nr_cpu_ids) ++ return -EBUSY; ++ ++found: ++ if (test_and_clear_cpu_idle(cpu)) ++ return cpu; ++ else ++ goto retry; ++} ++ ++static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, ++ u64 wake_flags, bool *found) ++{ ++ s32 cpu; ++ ++ *found = false; ++ ++ if (!static_branch_likely(&scx_builtin_idle_enabled)) { ++ scx_ops_error("built-in idle tracking is disabled"); ++ return prev_cpu; ++ } ++ ++ /* ++ * If WAKE_SYNC, the waker's local DSQ is empty, and the system is ++ * under utilized, wake up @p to the local DSQ of the waker. Checking ++ * only for an empty local DSQ is insufficient as it could give the ++ * wakee an unfair advantage when the system is oversaturated. ++ * Checking only for the presence of idle CPUs is also insufficient as ++ * the local DSQ of the waker could have tasks piled up on it even if ++ * there is an idle core elsewhere on the system. 
++ */ ++ cpu = smp_processor_id(); ++ if ((wake_flags & SCX_WAKE_SYNC) && p->nr_cpus_allowed > 1 && ++ !cpumask_empty(idle_masks.cpu) && !(current->flags & PF_EXITING) && ++ cpu_rq(cpu)->scx.local_dsq.nr == 0) { ++ if (cpumask_test_cpu(cpu, p->cpus_ptr)) ++ goto cpu_found; ++ } ++ ++ if (p->nr_cpus_allowed == 1) { ++ if (test_and_clear_cpu_idle(prev_cpu)) { ++ cpu = prev_cpu; ++ goto cpu_found; ++ } else { ++ return prev_cpu; ++ } ++ } ++ ++ /* ++ * If CPU has SMT, any wholly idle CPU is likely a better pick than ++ * partially idle @prev_cpu. ++ */ ++ if (sched_smt_active()) { ++ if (cpumask_test_cpu(prev_cpu, idle_masks.smt) && ++ test_and_clear_cpu_idle(prev_cpu)) { ++ cpu = prev_cpu; ++ goto cpu_found; ++ } ++ ++ cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE); ++ if (cpu >= 0) ++ goto cpu_found; ++ } ++ ++ if (test_and_clear_cpu_idle(prev_cpu)) { ++ cpu = prev_cpu; ++ goto cpu_found; ++ } ++ ++ cpu = scx_pick_idle_cpu(p->cpus_ptr, 0); ++ if (cpu >= 0) ++ goto cpu_found; ++ ++ return prev_cpu; ++ ++cpu_found: ++ *found = true; ++ return cpu; ++} ++ ++static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) ++{ ++ /* ++ * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it ++ * can be a good migration opportunity with low cache and memory ++ * footprint. Returning a CPU different than @prev_cpu triggers ++ * immediate rq migration. However, for SCX, as the current rq ++ * association doesn't dictate where the task is going to run, this ++ * doesn't fit well. If necessary, we can later add a dedicated method ++ * which can decide to preempt self to force it through the regular ++ * scheduling path. 
++ */ ++ if (unlikely(wake_flags & WF_EXEC)) ++ return prev_cpu; ++ ++ if (SCX_HAS_OP(select_cpu)) { ++ s32 cpu; ++ struct task_struct **ddsp_taskp; ++ ++ ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); ++ WARN_ON_ONCE(*ddsp_taskp); ++ *ddsp_taskp = p; ++ ++ cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU, ++ select_cpu, p, prev_cpu, wake_flags); ++ *ddsp_taskp = NULL; ++ if (ops_cpu_valid(cpu, "from ops.select_cpu()")) ++ return cpu; ++ else ++ return prev_cpu; ++ } else { ++ bool found; ++ s32 cpu; ++ ++ cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, &found); ++ if (found) { ++ p->scx.slice = SCX_SLICE_DFL; ++ p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; ++ } ++ return cpu; ++ } ++} ++ ++static void task_woken_scx(struct rq *rq, struct task_struct *p) ++{ ++ run_deferred(rq); ++} ++ ++static void set_cpus_allowed_scx(struct task_struct *p, ++ struct affinity_context *ac) ++{ ++ set_cpus_allowed_common(p, ac); ++ ++ /* ++ * The effective cpumask is stored in @p->cpus_ptr which may temporarily ++ * differ from the configured one in @p->cpus_mask. Always tell the bpf ++ * scheduler the effective one. ++ * ++ * Fine-grained memory write control is enforced by BPF making the const ++ * designation pointless. Cast it away when calling the operation. ++ */ ++ if (SCX_HAS_OP(set_cpumask)) ++ SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p, ++ (struct cpumask *)p->cpus_ptr); ++} ++ ++static void reset_idle_masks(void) ++{ ++ /* ++ * Consider all online cpus idle. Should converge to the actual state ++ * quickly. 
++ */ ++ cpumask_copy(idle_masks.cpu, cpu_online_mask); ++ cpumask_copy(idle_masks.smt, cpu_online_mask); ++} ++ ++void __scx_update_idle(struct rq *rq, bool idle) ++{ ++ int cpu = cpu_of(rq); ++ ++ if (SCX_HAS_OP(update_idle)) { ++ SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); ++ if (!static_branch_unlikely(&scx_builtin_idle_enabled)) ++ return; ++ } ++ ++ if (idle) ++ cpumask_set_cpu(cpu, idle_masks.cpu); ++ else ++ cpumask_clear_cpu(cpu, idle_masks.cpu); ++ ++#ifdef CONFIG_SCHED_SMT ++ if (sched_smt_active()) { ++ const struct cpumask *smt = cpu_smt_mask(cpu); ++ ++ if (idle) { ++ /* ++ * idle_masks.smt handling is racy but that's fine as ++ * it's only for optimization and self-correcting. ++ */ ++ for_each_cpu(cpu, smt) { ++ if (!cpumask_test_cpu(cpu, idle_masks.cpu)) ++ return; ++ } ++ cpumask_or(idle_masks.smt, idle_masks.smt, smt); ++ } else { ++ cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); ++ } ++ } ++#endif ++} ++ ++static void handle_hotplug(struct rq *rq, bool online) ++{ ++ int cpu = cpu_of(rq); ++ ++ atomic_long_inc(&scx_hotplug_seq); ++ ++ if (online && SCX_HAS_OP(cpu_online)) ++ SCX_CALL_OP(SCX_KF_SLEEPABLE, cpu_online, cpu); ++ else if (!online && SCX_HAS_OP(cpu_offline)) ++ SCX_CALL_OP(SCX_KF_SLEEPABLE, cpu_offline, cpu); ++ else ++ scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, ++ "cpu %d going %s, exiting scheduler", cpu, ++ online ? 
"online" : "offline"); ++} ++ ++void scx_rq_activate(struct rq *rq) ++{ ++ handle_hotplug(rq, true); ++} ++ ++void scx_rq_deactivate(struct rq *rq) ++{ ++ handle_hotplug(rq, false); ++} ++ ++static void rq_online_scx(struct rq *rq) ++{ ++ rq->scx.flags |= SCX_RQ_ONLINE; ++} ++ ++static void rq_offline_scx(struct rq *rq) ++{ ++ rq->scx.flags &= ~SCX_RQ_ONLINE; ++} ++ ++#else /* CONFIG_SMP */ ++ ++static bool test_and_clear_cpu_idle(int cpu) { return false; } ++static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; } ++static void reset_idle_masks(void) {} ++ ++#endif /* CONFIG_SMP */ ++ ++static bool check_rq_for_timeouts(struct rq *rq) ++{ ++ struct task_struct *p; ++ struct rq_flags rf; ++ bool timed_out = false; ++ ++ rq_lock_irqsave(rq, &rf); ++ list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) { ++ unsigned long last_runnable = p->scx.runnable_at; ++ ++ if (unlikely(time_after(jiffies, ++ last_runnable + scx_watchdog_timeout))) { ++ u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); ++ ++ scx_ops_error_kind(SCX_EXIT_ERROR_STALL, ++ "%s[%d] failed to run for %u.%03us", ++ p->comm, p->pid, ++ dur_ms / 1000, dur_ms % 1000); ++ timed_out = true; ++ break; ++ } ++ } ++ rq_unlock_irqrestore(rq, &rf); ++ ++ return timed_out; ++} ++ ++static void scx_watchdog_workfn(struct work_struct *work) ++{ ++ int cpu; ++ ++ WRITE_ONCE(scx_watchdog_timestamp, jiffies); ++ ++ for_each_online_cpu(cpu) { ++ if (unlikely(check_rq_for_timeouts(cpu_rq(cpu)))) ++ break; ++ ++ cond_resched(); ++ } ++ queue_delayed_work(system_unbound_wq, to_delayed_work(work), ++ scx_watchdog_timeout / 2); ++} ++ ++void scx_tick(struct rq *rq) ++{ ++ unsigned long last_check; ++ ++ if (!scx_enabled()) ++ return; ++ ++ last_check = READ_ONCE(scx_watchdog_timestamp); ++ if (unlikely(time_after(jiffies, ++ last_check + READ_ONCE(scx_watchdog_timeout)))) { ++ u32 dur_ms = jiffies_to_msecs(jiffies - last_check); ++ ++ 
scx_ops_error_kind(SCX_EXIT_ERROR_STALL, ++ "watchdog failed to check in for %u.%03us", ++ dur_ms / 1000, dur_ms % 1000); ++ } ++ ++ update_other_load_avgs(rq); ++} ++ ++static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) ++{ ++ update_curr_scx(rq); ++ ++ /* ++ * While disabling, always resched and refresh core-sched timestamp as ++ * we can't trust the slice management or ops.core_sched_before(). ++ */ ++ if (scx_ops_bypassing()) { ++ curr->scx.slice = 0; ++ touch_core_sched(rq, curr); ++ } else if (SCX_HAS_OP(tick)) { ++ SCX_CALL_OP(SCX_KF_REST, tick, curr); ++ } ++ ++ if (!curr->scx.slice) ++ resched_curr(rq); ++} ++ ++static enum scx_task_state scx_get_task_state(const struct task_struct *p) ++{ ++ return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT; ++} ++ ++static void scx_set_task_state(struct task_struct *p, enum scx_task_state state) ++{ ++ enum scx_task_state prev_state = scx_get_task_state(p); ++ bool warn = false; ++ ++ BUILD_BUG_ON(SCX_TASK_NR_STATES > (1 << SCX_TASK_STATE_BITS)); ++ ++ switch (state) { ++ case SCX_TASK_NONE: ++ break; ++ case SCX_TASK_INIT: ++ warn = prev_state != SCX_TASK_NONE; ++ break; ++ case SCX_TASK_READY: ++ warn = prev_state == SCX_TASK_NONE; ++ break; ++ case SCX_TASK_ENABLED: ++ warn = prev_state != SCX_TASK_READY; ++ break; ++ default: ++ warn = true; ++ return; ++ } ++ ++ WARN_ONCE(warn, "sched_ext: Invalid task state transition %d -> %d for %s[%d]", ++ prev_state, state, p->comm, p->pid); ++ ++ p->scx.flags &= ~SCX_TASK_STATE_MASK; ++ p->scx.flags |= state << SCX_TASK_STATE_SHIFT; ++} ++ ++static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool fork) ++{ ++ int ret; ++ ++ p->scx.disallow = false; ++ ++ if (SCX_HAS_OP(init_task)) { ++ struct scx_init_task_args args = { ++ .fork = fork, ++ }; ++ ++ ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, init_task, p, &args); ++ if (unlikely(ret)) { ++ ret = ops_sanitize_err("init_task", ret); ++ return ret; ++ } ++ } ++ ++ 
scx_set_task_state(p, SCX_TASK_INIT); ++ ++ if (p->scx.disallow) { ++ struct rq *rq; ++ struct rq_flags rf; ++ ++ rq = task_rq_lock(p, &rf); ++ ++ /* ++ * We're either in fork or load path and @p->policy will be ++ * applied right after. Reverting @p->policy here and rejecting ++ * %SCHED_EXT transitions from scx_check_setscheduler() ++ * guarantees that if ops.init_task() sets @p->disallow, @p can ++ * never be in SCX. ++ */ ++ if (p->policy == SCHED_EXT) { ++ p->policy = SCHED_NORMAL; ++ atomic_long_inc(&scx_nr_rejected); ++ } ++ ++ task_rq_unlock(rq, p, &rf); ++ } ++ ++ p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; ++ return 0; ++} ++ ++static void scx_ops_enable_task(struct task_struct *p) ++{ ++ u32 weight; ++ ++ lockdep_assert_rq_held(task_rq(p)); ++ ++ /* ++ * Set the weight before calling ops.enable() so that the scheduler ++ * doesn't see a stale value if they inspect the task struct. ++ */ ++ if (task_has_idle_policy(p)) ++ weight = WEIGHT_IDLEPRIO; ++ else ++ weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO]; ++ ++ p->scx.weight = sched_weight_to_cgroup(weight); ++ ++ if (SCX_HAS_OP(enable)) ++ SCX_CALL_OP_TASK(SCX_KF_REST, enable, p); ++ scx_set_task_state(p, SCX_TASK_ENABLED); ++ ++ if (SCX_HAS_OP(set_weight)) ++ SCX_CALL_OP(SCX_KF_REST, set_weight, p, p->scx.weight); ++} ++ ++static void scx_ops_disable_task(struct task_struct *p) ++{ ++ lockdep_assert_rq_held(task_rq(p)); ++ WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED); ++ ++ if (SCX_HAS_OP(disable)) ++ SCX_CALL_OP(SCX_KF_REST, disable, p); ++ scx_set_task_state(p, SCX_TASK_READY); ++} ++ ++static void scx_ops_exit_task(struct task_struct *p) ++{ ++ struct scx_exit_task_args args = { ++ .cancelled = false, ++ }; ++ ++ lockdep_assert_rq_held(task_rq(p)); ++ ++ switch (scx_get_task_state(p)) { ++ case SCX_TASK_NONE: ++ return; ++ case SCX_TASK_INIT: ++ args.cancelled = true; ++ break; ++ case SCX_TASK_READY: ++ break; ++ case SCX_TASK_ENABLED: ++ scx_ops_disable_task(p); ++ break; ++ 
default: ++ WARN_ON_ONCE(true); ++ return; ++ } ++ ++ if (SCX_HAS_OP(exit_task)) ++ SCX_CALL_OP(SCX_KF_REST, exit_task, p, &args); ++ scx_set_task_state(p, SCX_TASK_NONE); ++} ++ ++void init_scx_entity(struct sched_ext_entity *scx) ++{ ++ /* ++ * init_idle() calls this function again after fork sequence is ++ * complete. Don't touch ->tasks_node as it's already linked. ++ */ ++ memset(scx, 0, offsetof(struct sched_ext_entity, tasks_node)); ++ ++ INIT_LIST_HEAD(&scx->dsq_list.node); ++ RB_CLEAR_NODE(&scx->dsq_priq); ++ scx->sticky_cpu = -1; ++ scx->holding_cpu = -1; ++ INIT_LIST_HEAD(&scx->runnable_node); ++ scx->runnable_at = jiffies; ++ scx->ddsp_dsq_id = SCX_DSQ_INVALID; ++ scx->slice = SCX_SLICE_DFL; ++} ++ ++void scx_pre_fork(struct task_struct *p) ++{ ++ /* ++ * BPF scheduler enable/disable paths want to be able to iterate and ++ * update all tasks which can become complex when racing forks. As ++ * enable/disable are very cold paths, let's use a percpu_rwsem to ++ * exclude forks. ++ */ ++ percpu_down_read(&scx_fork_rwsem); ++} ++ ++int scx_fork(struct task_struct *p) ++{ ++ percpu_rwsem_assert_held(&scx_fork_rwsem); ++ ++ if (scx_enabled()) ++ return scx_ops_init_task(p, task_group(p), true); ++ else ++ return 0; ++} ++ ++void scx_post_fork(struct task_struct *p) ++{ ++ if (scx_enabled()) { ++ scx_set_task_state(p, SCX_TASK_READY); ++ ++ /* ++ * Enable the task immediately if it's running on sched_ext. ++ * Otherwise, it'll be enabled in switching_to_scx() if and ++ * when it's ever configured to run with a SCHED_EXT policy. 
++ */ ++ if (p->sched_class == &ext_sched_class) { ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ rq = task_rq_lock(p, &rf); ++ scx_ops_enable_task(p); ++ task_rq_unlock(rq, p, &rf); ++ } ++ } ++ ++ spin_lock_irq(&scx_tasks_lock); ++ list_add_tail(&p->scx.tasks_node, &scx_tasks); ++ spin_unlock_irq(&scx_tasks_lock); ++ ++ percpu_up_read(&scx_fork_rwsem); ++} ++ ++void scx_cancel_fork(struct task_struct *p) ++{ ++ if (scx_enabled()) { ++ struct rq *rq; ++ struct rq_flags rf; ++ ++ rq = task_rq_lock(p, &rf); ++ WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY); ++ scx_ops_exit_task(p); ++ task_rq_unlock(rq, p, &rf); ++ } ++ ++ percpu_up_read(&scx_fork_rwsem); ++} ++ ++void sched_ext_free(struct task_struct *p) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&scx_tasks_lock, flags); ++ list_del_init(&p->scx.tasks_node); ++ spin_unlock_irqrestore(&scx_tasks_lock, flags); ++ ++ /* ++ * @p is off scx_tasks and wholly ours. scx_ops_enable()'s READY -> ++ * ENABLED transitions can't race us. Disable ops for @p. ++ */ ++ if (scx_get_task_state(p) != SCX_TASK_NONE) { ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ rq = task_rq_lock(p, &rf); ++ scx_ops_exit_task(p); ++ task_rq_unlock(rq, p, &rf); ++ } ++} ++ ++static void reweight_task_scx(struct rq *rq, struct task_struct *p, ++ const struct load_weight *lw) ++{ ++ lockdep_assert_rq_held(task_rq(p)); ++ ++ p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight)); ++ if (SCX_HAS_OP(set_weight)) ++ SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); ++} ++ ++static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio) ++{ ++} ++ ++static void switching_to_scx(struct rq *rq, struct task_struct *p) ++{ ++ scx_ops_enable_task(p); ++ ++ /* ++ * set_cpus_allowed_scx() is not called while @p is associated with a ++ * different scheduler class. Keep the BPF scheduler up-to-date. 
++ */ ++ if (SCX_HAS_OP(set_cpumask)) ++ SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p, ++ (struct cpumask *)p->cpus_ptr); ++} ++ ++static void switched_from_scx(struct rq *rq, struct task_struct *p) ++{ ++ scx_ops_disable_task(p); ++} ++ ++static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {} ++static void switched_to_scx(struct rq *rq, struct task_struct *p) {} ++ ++int scx_check_setscheduler(struct task_struct *p, int policy) ++{ ++ lockdep_assert_rq_held(task_rq(p)); ++ ++ /* if disallow, reject transitioning into SCX */ ++ if (scx_enabled() && READ_ONCE(p->scx.disallow) && ++ p->policy != policy && policy == SCHED_EXT) ++ return -EACCES; ++ ++ return 0; ++} ++ ++#ifdef CONFIG_NO_HZ_FULL ++bool scx_can_stop_tick(struct rq *rq) ++{ ++ struct task_struct *p = rq->curr; ++ ++ if (scx_ops_bypassing()) ++ return false; ++ ++ if (p->sched_class != &ext_sched_class) ++ return true; ++ ++ /* ++ * @rq can dispatch from different DSQs, so we can't tell whether it ++ * needs the tick or not by looking at nr_running. Allow stopping ticks ++ * iff the BPF scheduler indicated so. See set_next_task_scx(). ++ */ ++ return rq->scx.flags & SCX_RQ_CAN_STOP_TICK; ++} ++#endif ++ ++/* ++ * Omitted operations: ++ * ++ * - wakeup_preempt: NOOP as it isn't useful in the wakeup path because the task ++ * isn't tied to the CPU at that point. Preemption is implemented by resetting ++ * the victim task's slice to 0 and triggering reschedule on the target CPU. ++ * ++ * - migrate_task_rq: Unnecessary as task to cpu mapping is transient. ++ * ++ * - task_fork/dead: We need fork/dead notifications for all tasks regardless of ++ * their current sched_class. Call them directly from sched core instead. 
++ */ ++DEFINE_SCHED_CLASS(ext) = { ++ .enqueue_task = enqueue_task_scx, ++ .dequeue_task = dequeue_task_scx, ++ .yield_task = yield_task_scx, ++ .yield_to_task = yield_to_task_scx, ++ ++ .wakeup_preempt = wakeup_preempt_scx, ++ ++ .pick_next_task = pick_next_task_scx, ++ ++ .put_prev_task = put_prev_task_scx, ++ .set_next_task = set_next_task_scx, ++ ++ .switch_class = switch_class_scx, ++ ++#ifdef CONFIG_SMP ++ .balance = balance_scx, ++ .select_task_rq = select_task_rq_scx, ++ .task_woken = task_woken_scx, ++ .set_cpus_allowed = set_cpus_allowed_scx, ++ ++ .rq_online = rq_online_scx, ++ .rq_offline = rq_offline_scx, ++#endif ++ ++#ifdef CONFIG_SCHED_CORE ++ .pick_task = pick_task_scx, ++#endif ++ ++ .task_tick = task_tick_scx, ++ ++ .switching_to = switching_to_scx, ++ .switched_from = switched_from_scx, ++ .switched_to = switched_to_scx, ++ .reweight_task = reweight_task_scx, ++ .prio_changed = prio_changed_scx, ++ ++ .update_curr = update_curr_scx, ++ ++#ifdef CONFIG_UCLAMP_TASK ++ .uclamp_enabled = 1, ++#endif ++}; ++ ++static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id) ++{ ++ memset(dsq, 0, sizeof(*dsq)); ++ ++ raw_spin_lock_init(&dsq->lock); ++ INIT_LIST_HEAD(&dsq->list); ++ dsq->id = dsq_id; ++} ++ ++static struct scx_dispatch_q *create_dsq(u64 dsq_id, int node) ++{ ++ struct scx_dispatch_q *dsq; ++ int ret; ++ ++ if (dsq_id & SCX_DSQ_FLAG_BUILTIN) ++ return ERR_PTR(-EINVAL); ++ ++ dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node); ++ if (!dsq) ++ return ERR_PTR(-ENOMEM); ++ ++ init_dsq(dsq, dsq_id); ++ ++ ret = rhashtable_insert_fast(&dsq_hash, &dsq->hash_node, ++ dsq_hash_params); ++ if (ret) { ++ kfree(dsq); ++ return ERR_PTR(ret); ++ } ++ return dsq; ++} ++ ++static void free_dsq_irq_workfn(struct irq_work *irq_work) ++{ ++ struct llist_node *to_free = llist_del_all(&dsqs_to_free); ++ struct scx_dispatch_q *dsq, *tmp_dsq; ++ ++ llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node) ++ kfree_rcu(dsq, rcu); ++} ++ ++static 
DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn); ++ ++static void destroy_dsq(u64 dsq_id) ++{ ++ struct scx_dispatch_q *dsq; ++ unsigned long flags; ++ ++ rcu_read_lock(); ++ ++ dsq = find_user_dsq(dsq_id); ++ if (!dsq) ++ goto out_unlock_rcu; ++ ++ raw_spin_lock_irqsave(&dsq->lock, flags); ++ ++ if (dsq->nr) { ++ scx_ops_error("attempting to destroy in-use dsq 0x%016llx (nr=%u)", ++ dsq->id, dsq->nr); ++ goto out_unlock_dsq; ++ } ++ ++ if (rhashtable_remove_fast(&dsq_hash, &dsq->hash_node, dsq_hash_params)) ++ goto out_unlock_dsq; ++ ++ /* ++ * Mark dead by invalidating ->id to prevent dispatch_enqueue() from ++ * queueing more tasks. As this function can be called from anywhere, ++ * freeing is bounced through an irq work to avoid nesting RCU ++ * operations inside scheduler locks. ++ */ ++ dsq->id = SCX_DSQ_INVALID; ++ llist_add(&dsq->free_node, &dsqs_to_free); ++ irq_work_queue(&free_dsq_irq_work); ++ ++out_unlock_dsq: ++ raw_spin_unlock_irqrestore(&dsq->lock, flags); ++out_unlock_rcu: ++ rcu_read_unlock(); ++} ++ ++ ++/******************************************************************************** ++ * Sysfs interface and ops enable/disable. 
++ */ ++ ++#define SCX_ATTR(_name) \ ++ static struct kobj_attribute scx_attr_##_name = { \ ++ .attr = { .name = __stringify(_name), .mode = 0444 }, \ ++ .show = scx_attr_##_name##_show, \ ++ } ++ ++static ssize_t scx_attr_state_show(struct kobject *kobj, ++ struct kobj_attribute *ka, char *buf) ++{ ++ return sysfs_emit(buf, "%s\n", ++ scx_ops_enable_state_str[scx_ops_enable_state()]); ++} ++SCX_ATTR(state); ++ ++static ssize_t scx_attr_switch_all_show(struct kobject *kobj, ++ struct kobj_attribute *ka, char *buf) ++{ ++ return sysfs_emit(buf, "%d\n", READ_ONCE(scx_switching_all)); ++} ++SCX_ATTR(switch_all); ++ ++static ssize_t scx_attr_nr_rejected_show(struct kobject *kobj, ++ struct kobj_attribute *ka, char *buf) ++{ ++ return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_nr_rejected)); ++} ++SCX_ATTR(nr_rejected); ++ ++static ssize_t scx_attr_hotplug_seq_show(struct kobject *kobj, ++ struct kobj_attribute *ka, char *buf) ++{ ++ return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_hotplug_seq)); ++} ++SCX_ATTR(hotplug_seq); ++ ++static struct attribute *scx_global_attrs[] = { ++ &scx_attr_state.attr, ++ &scx_attr_switch_all.attr, ++ &scx_attr_nr_rejected.attr, ++ &scx_attr_hotplug_seq.attr, ++ NULL, ++}; ++ ++static const struct attribute_group scx_global_attr_group = { ++ .attrs = scx_global_attrs, ++}; ++ ++static void scx_kobj_release(struct kobject *kobj) ++{ ++ kfree(kobj); ++} ++ ++static ssize_t scx_attr_ops_show(struct kobject *kobj, ++ struct kobj_attribute *ka, char *buf) ++{ ++ return sysfs_emit(buf, "%s\n", scx_ops.name); ++} ++SCX_ATTR(ops); ++ ++static struct attribute *scx_sched_attrs[] = { ++ &scx_attr_ops.attr, ++ NULL, ++}; ++ATTRIBUTE_GROUPS(scx_sched); ++ ++static const struct kobj_type scx_ktype = { ++ .release = scx_kobj_release, ++ .sysfs_ops = &kobj_sysfs_ops, ++ .default_groups = scx_sched_groups, ++}; ++ ++static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) ++{ ++ return add_uevent_var(env, "SCXOPS=%s", 
scx_ops.name); ++} ++ ++static const struct kset_uevent_ops scx_uevent_ops = { ++ .uevent = scx_uevent, ++}; ++ ++/* ++ * Used by sched_fork() and __setscheduler_prio() to pick the matching ++ * sched_class. dl/rt are already handled. ++ */ ++bool task_should_scx(struct task_struct *p) ++{ ++ if (!scx_enabled() || ++ unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING)) ++ return false; ++ if (READ_ONCE(scx_switching_all)) ++ return true; ++ return p->policy == SCHED_EXT; ++} ++ ++/** ++ * scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress ++ * ++ * Bypassing guarantees that all runnable tasks make forward progress without ++ * trusting the BPF scheduler. We can't grab any mutexes or rwsems as they might ++ * be held by tasks that the BPF scheduler is forgetting to run, which ++ * unfortunately also excludes toggling the static branches. ++ * ++ * Let's work around by overriding a couple ops and modifying behaviors based on ++ * the DISABLING state and then cycling the queued tasks through dequeue/enqueue ++ * to force global FIFO scheduling. ++ * ++ * a. ops.enqueue() is ignored and tasks are queued in simple global FIFO order. ++ * ++ * b. ops.dispatch() is ignored. ++ * ++ * c. balance_scx() never sets %SCX_TASK_BAL_KEEP as the slice value can't be ++ * trusted. Whenever a tick triggers, the running task is rotated to the tail ++ * of the queue with core_sched_at touched. ++ * ++ * d. pick_next_task() suppresses zero slice warning. ++ * ++ * e. scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM ++ * operations. ++ * ++ * f. scx_prio_less() reverts to the default core_sched_at order. 
++ */ ++static void scx_ops_bypass(bool bypass) ++{ ++ int depth, cpu; ++ ++ if (bypass) { ++ depth = atomic_inc_return(&scx_ops_bypass_depth); ++ WARN_ON_ONCE(depth <= 0); ++ if (depth != 1) ++ return; ++ } else { ++ depth = atomic_dec_return(&scx_ops_bypass_depth); ++ WARN_ON_ONCE(depth < 0); ++ if (depth != 0) ++ return; ++ } ++ ++ /* ++ * We need to guarantee that no tasks are on the BPF scheduler while ++ * bypassing. Either we see enabled or the enable path sees the ++ * increased bypass_depth before moving tasks to SCX. ++ */ ++ if (!scx_enabled()) ++ return; ++ ++ /* ++ * No task property is changing. We just need to make sure all currently ++ * queued tasks are re-queued according to the new scx_ops_bypassing() ++ * state. As an optimization, walk each rq's runnable_list instead of ++ * the scx_tasks list. ++ * ++ * This function can't trust the scheduler and thus can't use ++ * cpus_read_lock(). Walk all possible CPUs instead of online. ++ */ ++ for_each_possible_cpu(cpu) { ++ struct rq *rq = cpu_rq(cpu); ++ struct rq_flags rf; ++ struct task_struct *p, *n; ++ ++ rq_lock_irqsave(rq, &rf); ++ ++ /* ++ * The use of list_for_each_entry_safe_reverse() is required ++ * because each task is going to be removed from and added back ++ * to the runnable_list during iteration. Because they're added ++ * to the tail of the list, safe reverse iteration can still ++ * visit all nodes. 
++ */ ++ list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list, ++ scx.runnable_node) { ++ struct sched_enq_and_set_ctx ctx; ++ ++ /* cycling deq/enq is enough, see the function comment */ ++ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); ++ sched_enq_and_set_task(&ctx); ++ } ++ ++ rq_unlock_irqrestore(rq, &rf); ++ ++ /* kick to restore ticks */ ++ resched_cpu(cpu); ++ } ++} ++ ++static void free_exit_info(struct scx_exit_info *ei) ++{ ++ kfree(ei->dump); ++ kfree(ei->msg); ++ kfree(ei->bt); ++ kfree(ei); ++} ++ ++static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len) ++{ ++ struct scx_exit_info *ei; ++ ++ ei = kzalloc(sizeof(*ei), GFP_KERNEL); ++ if (!ei) ++ return NULL; ++ ++ ei->bt = kcalloc(SCX_EXIT_BT_LEN, sizeof(ei->bt[0]), GFP_KERNEL); ++ ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL); ++ ei->dump = kzalloc(exit_dump_len, GFP_KERNEL); ++ ++ if (!ei->bt || !ei->msg || !ei->dump) { ++ free_exit_info(ei); ++ return NULL; ++ } ++ ++ return ei; ++} ++ ++static const char *scx_exit_reason(enum scx_exit_kind kind) ++{ ++ switch (kind) { ++ case SCX_EXIT_UNREG: ++ return "Scheduler unregistered from user space"; ++ case SCX_EXIT_UNREG_BPF: ++ return "Scheduler unregistered from BPF"; ++ case SCX_EXIT_UNREG_KERN: ++ return "Scheduler unregistered from the main kernel"; ++ case SCX_EXIT_SYSRQ: ++ return "disabled by sysrq-S"; ++ case SCX_EXIT_ERROR: ++ return "runtime error"; ++ case SCX_EXIT_ERROR_BPF: ++ return "scx_bpf_error"; ++ case SCX_EXIT_ERROR_STALL: ++ return "runnable task stall"; ++ default: ++ return "<UNKNOWN>"; ++ } ++} ++ ++static void scx_ops_disable_workfn(struct kthread_work *work) ++{ ++ struct scx_exit_info *ei = scx_exit_info; ++ struct scx_task_iter sti; ++ struct task_struct *p; ++ struct rhashtable_iter rht_iter; ++ struct scx_dispatch_q *dsq; ++ int i, kind; ++ ++ kind = atomic_read(&scx_exit_kind); ++ while (true) { ++ /* ++ * NONE indicates that a new scx_ops has been registered since ++ * disable was 
scheduled - don't kill the new ops. DONE ++ * indicates that the ops has already been disabled. ++ */ ++ if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE) ++ return; ++ if (atomic_try_cmpxchg(&scx_exit_kind, &kind, SCX_EXIT_DONE)) ++ break; ++ } ++ ei->kind = kind; ++ ei->reason = scx_exit_reason(ei->kind); ++ ++ /* guarantee forward progress by bypassing scx_ops */ ++ scx_ops_bypass(true); ++ ++ switch (scx_ops_set_enable_state(SCX_OPS_DISABLING)) { ++ case SCX_OPS_DISABLING: ++ WARN_ONCE(true, "sched_ext: duplicate disabling instance?"); ++ break; ++ case SCX_OPS_DISABLED: ++ pr_warn("sched_ext: ops error detected without ops (%s)\n", ++ scx_exit_info->msg); ++ WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != ++ SCX_OPS_DISABLING); ++ goto done; ++ default: ++ break; ++ } ++ ++ /* ++ * Here, every runnable task is guaranteed to make forward progress and ++ * we can safely use blocking synchronization constructs. Actually ++ * disable ops. ++ */ ++ mutex_lock(&scx_ops_enable_mutex); ++ ++ static_branch_disable(&__scx_switched_all); ++ WRITE_ONCE(scx_switching_all, false); ++ ++ /* ++ * Avoid racing against fork. See scx_ops_enable() for explanation on ++ * the locking order. ++ */ ++ percpu_down_write(&scx_fork_rwsem); ++ cpus_read_lock(); ++ ++ spin_lock_irq(&scx_tasks_lock); ++ scx_task_iter_init(&sti); ++ /* ++ * Invoke scx_ops_exit_task() on all non-idle tasks, including ++ * TASK_DEAD tasks. Because dead tasks may have a nonzero refcount, ++ * we may not have invoked sched_ext_free() on them by the time a ++ * scheduler is disabled. We must therefore exit the task here, or we'd ++ * fail to invoke ops.exit_task(), as the scheduler will have been ++ * unloaded by the time the task is subsequently exited on the ++ * sched_ext_free() path. 
++ */ ++ while ((p = scx_task_iter_next_locked(&sti, true))) { ++ const struct sched_class *old_class = p->sched_class; ++ struct sched_enq_and_set_ctx ctx; ++ ++ if (READ_ONCE(p->__state) != TASK_DEAD) { ++ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, ++ &ctx); ++ ++ p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL); ++ __setscheduler_prio(p, p->prio); ++ check_class_changing(task_rq(p), p, old_class); ++ ++ sched_enq_and_set_task(&ctx); ++ ++ check_class_changed(task_rq(p), p, old_class, p->prio); ++ } ++ scx_ops_exit_task(p); ++ } ++ scx_task_iter_exit(&sti); ++ spin_unlock_irq(&scx_tasks_lock); ++ ++ /* no task is on scx, turn off all the switches and flush in-progress calls */ ++ static_branch_disable_cpuslocked(&__scx_ops_enabled); ++ for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) ++ static_branch_disable_cpuslocked(&scx_has_op[i]); ++ static_branch_disable_cpuslocked(&scx_ops_enq_last); ++ static_branch_disable_cpuslocked(&scx_ops_enq_exiting); ++ static_branch_disable_cpuslocked(&scx_ops_cpu_preempt); ++ static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); ++ synchronize_rcu(); ++ ++ cpus_read_unlock(); ++ percpu_up_write(&scx_fork_rwsem); ++ ++ if (ei->kind >= SCX_EXIT_ERROR) { ++ printk(KERN_ERR "sched_ext: BPF scheduler \"%s\" errored, disabling\n", scx_ops.name); ++ ++ if (ei->msg[0] == '\0') ++ printk(KERN_ERR "sched_ext: %s\n", ei->reason); ++ else ++ printk(KERN_ERR "sched_ext: %s (%s)\n", ei->reason, ei->msg); ++ ++ stack_trace_print(ei->bt, ei->bt_len, 2); ++ } ++ ++ if (scx_ops.exit) ++ SCX_CALL_OP(SCX_KF_UNLOCKED, exit, ei); ++ ++ cancel_delayed_work_sync(&scx_watchdog_work); ++ ++ /* ++ * Delete the kobject from the hierarchy eagerly in addition to just ++ * dropping a reference. Otherwise, if the object is deleted ++ * asynchronously, sysfs could observe an object of the same name still ++ * in the hierarchy when another scheduler is loaded. 
++ */ ++ kobject_del(scx_root_kobj); ++ kobject_put(scx_root_kobj); ++ scx_root_kobj = NULL; ++ ++ memset(&scx_ops, 0, sizeof(scx_ops)); ++ ++ rhashtable_walk_enter(&dsq_hash, &rht_iter); ++ do { ++ rhashtable_walk_start(&rht_iter); ++ ++ while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq)) ++ destroy_dsq(dsq->id); ++ ++ rhashtable_walk_stop(&rht_iter); ++ } while (dsq == ERR_PTR(-EAGAIN)); ++ rhashtable_walk_exit(&rht_iter); ++ ++ free_percpu(scx_dsp_ctx); ++ scx_dsp_ctx = NULL; ++ scx_dsp_max_batch = 0; ++ ++ free_exit_info(scx_exit_info); ++ scx_exit_info = NULL; ++ ++ mutex_unlock(&scx_ops_enable_mutex); ++ ++ WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != ++ SCX_OPS_DISABLING); ++done: ++ scx_ops_bypass(false); ++} ++ ++static DEFINE_KTHREAD_WORK(scx_ops_disable_work, scx_ops_disable_workfn); ++ ++static void schedule_scx_ops_disable_work(void) ++{ ++ struct kthread_worker *helper = READ_ONCE(scx_ops_helper); ++ ++ /* ++ * We may be called spuriously before the first bpf_sched_ext_reg(). If ++ * scx_ops_helper isn't set up yet, there's nothing to do. ++ */ ++ if (helper) ++ kthread_queue_work(helper, &scx_ops_disable_work); ++} ++ ++static void scx_ops_disable(enum scx_exit_kind kind) ++{ ++ int none = SCX_EXIT_NONE; ++ ++ if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)) ++ kind = SCX_EXIT_ERROR; ++ ++ atomic_try_cmpxchg(&scx_exit_kind, &none, kind); ++ ++ schedule_scx_ops_disable_work(); ++} ++ ++static void dump_newline(struct seq_buf *s) ++{ ++ trace_sched_ext_dump(""); ++ ++ /* @s may be zero sized and seq_buf triggers WARN if so */ ++ if (s->size) ++ seq_buf_putc(s, '\n'); ++} ++ ++static __printf(2, 3) void dump_line(struct seq_buf *s, const char *fmt, ...) 
++{ ++ va_list args; ++ ++#ifdef CONFIG_TRACEPOINTS ++ if (trace_sched_ext_dump_enabled()) { ++ /* protected by scx_dump_state()::dump_lock */ ++ static char line_buf[SCX_EXIT_MSG_LEN]; ++ ++ va_start(args, fmt); ++ vscnprintf(line_buf, sizeof(line_buf), fmt, args); ++ va_end(args); ++ ++ trace_sched_ext_dump(line_buf); ++ } ++#endif ++ /* @s may be zero sized and seq_buf triggers WARN if so */ ++ if (s->size) { ++ va_start(args, fmt); ++ seq_buf_vprintf(s, fmt, args); ++ va_end(args); ++ ++ seq_buf_putc(s, '\n'); ++ } ++} ++ ++static void dump_stack_trace(struct seq_buf *s, const char *prefix, ++ const unsigned long *bt, unsigned int len) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < len; i++) ++ dump_line(s, "%s%pS", prefix, (void *)bt[i]); ++} ++ ++static void ops_dump_init(struct seq_buf *s, const char *prefix) ++{ ++ struct scx_dump_data *dd = &scx_dump_data; ++ ++ lockdep_assert_irqs_disabled(); ++ ++ dd->cpu = smp_processor_id(); /* allow scx_bpf_dump() */ ++ dd->first = true; ++ dd->cursor = 0; ++ dd->s = s; ++ dd->prefix = prefix; ++} ++ ++static void ops_dump_flush(void) ++{ ++ struct scx_dump_data *dd = &scx_dump_data; ++ char *line = dd->buf.line; ++ ++ if (!dd->cursor) ++ return; ++ ++ /* ++ * There's something to flush and this is the first line. Insert a blank ++ * line to distinguish ops dump. ++ */ ++ if (dd->first) { ++ dump_newline(dd->s); ++ dd->first = false; ++ } ++ ++ /* ++ * There may be multiple lines in $line. Scan and emit each line ++ * separately. ++ */ ++ while (true) { ++ char *end = line; ++ char c; ++ ++ while (*end != '\n' && *end != '\0') ++ end++; ++ ++ /* ++ * If $line overflowed, it may not have newline at the end. ++ * Always emit with a newline. 
++ */ ++ c = *end; ++ *end = '\0'; ++ dump_line(dd->s, "%s%s", dd->prefix, line); ++ if (c == '\0') ++ break; ++ ++ /* move to the next line */ ++ end++; ++ if (*end == '\0') ++ break; ++ line = end; ++ } ++ ++ dd->cursor = 0; ++} ++ ++static void ops_dump_exit(void) ++{ ++ ops_dump_flush(); ++ scx_dump_data.cpu = -1; ++} ++ ++static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, ++ struct task_struct *p, char marker) ++{ ++ static unsigned long bt[SCX_EXIT_BT_LEN]; ++ char dsq_id_buf[19] = "(n/a)"; ++ unsigned long ops_state = atomic_long_read(&p->scx.ops_state); ++ unsigned int bt_len; ++ ++ if (p->scx.dsq) ++ scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx", ++ (unsigned long long)p->scx.dsq->id); ++ ++ dump_newline(s); ++ dump_line(s, " %c%c %s[%d] %+ldms", ++ marker, task_state_to_char(p), p->comm, p->pid, ++ jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies)); ++ dump_line(s, " scx_state/flags=%u/0x%x dsq_flags=0x%x ops_state/qseq=%lu/%lu", ++ scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK, ++ p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK, ++ ops_state >> SCX_OPSS_QSEQ_SHIFT); ++ dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s dsq_vtime=%llu", ++ p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf, ++ p->scx.dsq_vtime); ++ dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr)); ++ ++ if (SCX_HAS_OP(dump_task)) { ++ ops_dump_init(s, " "); ++ SCX_CALL_OP(SCX_KF_REST, dump_task, dctx, p); ++ ops_dump_exit(); ++ } ++ ++ bt_len = stack_trace_save_tsk(p, bt, SCX_EXIT_BT_LEN, 1); ++ if (bt_len) { ++ dump_newline(s); ++ dump_stack_trace(s, " ", bt, bt_len); ++ } ++} ++ ++static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) ++{ ++ static DEFINE_SPINLOCK(dump_lock); ++ static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n"; ++ struct scx_dump_ctx dctx = { ++ .kind = ei->kind, ++ .exit_code = ei->exit_code, ++ .reason = ei->reason, ++ .at_ns = ktime_get_ns(), ++ .at_jiffies = jiffies, ++ }; ++ struct 
seq_buf s; ++ unsigned long flags; ++ char *buf; ++ int cpu; ++ ++ spin_lock_irqsave(&dump_lock, flags); ++ ++ seq_buf_init(&s, ei->dump, dump_len); ++ ++ if (ei->kind == SCX_EXIT_NONE) { ++ dump_line(&s, "Debug dump triggered by %s", ei->reason); ++ } else { ++ dump_line(&s, "%s[%d] triggered exit kind %d:", ++ current->comm, current->pid, ei->kind); ++ dump_line(&s, " %s (%s)", ei->reason, ei->msg); ++ dump_newline(&s); ++ dump_line(&s, "Backtrace:"); ++ dump_stack_trace(&s, " ", ei->bt, ei->bt_len); ++ } ++ ++ if (SCX_HAS_OP(dump)) { ++ ops_dump_init(&s, ""); ++ SCX_CALL_OP(SCX_KF_UNLOCKED, dump, &dctx); ++ ops_dump_exit(); ++ } ++ ++ dump_newline(&s); ++ dump_line(&s, "CPU states"); ++ dump_line(&s, "----------"); ++ ++ for_each_possible_cpu(cpu) { ++ struct rq *rq = cpu_rq(cpu); ++ struct rq_flags rf; ++ struct task_struct *p; ++ struct seq_buf ns; ++ size_t avail, used; ++ bool idle; ++ ++ rq_lock(rq, &rf); ++ ++ idle = list_empty(&rq->scx.runnable_list) && ++ rq->curr->sched_class == &idle_sched_class; ++ ++ if (idle && !SCX_HAS_OP(dump_cpu)) ++ goto next; ++ ++ /* ++ * We don't yet know whether ops.dump_cpu() will produce output ++ * and we may want to skip the default CPU dump if it doesn't. ++ * Use a nested seq_buf to generate the standard dump so that we ++ * can decide whether to commit later. 
++ */ ++ avail = seq_buf_get_buf(&s, &buf); ++ seq_buf_init(&ns, buf, avail); ++ ++ dump_newline(&ns); ++ dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu pnt_seq=%lu", ++ cpu, rq->scx.nr_running, rq->scx.flags, ++ rq->scx.cpu_released, rq->scx.ops_qseq, ++ rq->scx.pnt_seq); ++ dump_line(&ns, " curr=%s[%d] class=%ps", ++ rq->curr->comm, rq->curr->pid, ++ rq->curr->sched_class); ++ if (!cpumask_empty(rq->scx.cpus_to_kick)) ++ dump_line(&ns, " cpus_to_kick : %*pb", ++ cpumask_pr_args(rq->scx.cpus_to_kick)); ++ if (!cpumask_empty(rq->scx.cpus_to_kick_if_idle)) ++ dump_line(&ns, " idle_to_kick : %*pb", ++ cpumask_pr_args(rq->scx.cpus_to_kick_if_idle)); ++ if (!cpumask_empty(rq->scx.cpus_to_preempt)) ++ dump_line(&ns, " cpus_to_preempt: %*pb", ++ cpumask_pr_args(rq->scx.cpus_to_preempt)); ++ if (!cpumask_empty(rq->scx.cpus_to_wait)) ++ dump_line(&ns, " cpus_to_wait : %*pb", ++ cpumask_pr_args(rq->scx.cpus_to_wait)); ++ ++ used = seq_buf_used(&ns); ++ if (SCX_HAS_OP(dump_cpu)) { ++ ops_dump_init(&ns, " "); ++ SCX_CALL_OP(SCX_KF_REST, dump_cpu, &dctx, cpu, idle); ++ ops_dump_exit(); ++ } ++ ++ /* ++ * If idle && nothing generated by ops.dump_cpu(), there's ++ * nothing interesting. Skip. ++ */ ++ if (idle && used == seq_buf_used(&ns)) ++ goto next; ++ ++ /* ++ * $s may already have overflowed when $ns was created. If so, ++ * calling commit on it will trigger BUG. 
++ */ ++ if (avail) { ++ seq_buf_commit(&s, seq_buf_used(&ns)); ++ if (seq_buf_has_overflowed(&ns)) ++ seq_buf_set_overflow(&s); ++ } ++ ++ if (rq->curr->sched_class == &ext_sched_class) ++ scx_dump_task(&s, &dctx, rq->curr, '*'); ++ ++ list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) ++ scx_dump_task(&s, &dctx, p, ' '); ++ next: ++ rq_unlock(rq, &rf); ++ } ++ ++ if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker)) ++ memcpy(ei->dump + dump_len - sizeof(trunc_marker), ++ trunc_marker, sizeof(trunc_marker)); ++ ++ spin_unlock_irqrestore(&dump_lock, flags); ++} ++ ++static void scx_ops_error_irq_workfn(struct irq_work *irq_work) ++{ ++ struct scx_exit_info *ei = scx_exit_info; ++ ++ if (ei->kind >= SCX_EXIT_ERROR) ++ scx_dump_state(ei, scx_ops.exit_dump_len); ++ ++ schedule_scx_ops_disable_work(); ++} ++ ++static DEFINE_IRQ_WORK(scx_ops_error_irq_work, scx_ops_error_irq_workfn); ++ ++static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind, ++ s64 exit_code, ++ const char *fmt, ...) ++{ ++ struct scx_exit_info *ei = scx_exit_info; ++ int none = SCX_EXIT_NONE; ++ va_list args; ++ ++ if (!atomic_try_cmpxchg(&scx_exit_kind, &none, kind)) ++ return; ++ ++ ei->exit_code = exit_code; ++ ++ if (kind >= SCX_EXIT_ERROR) ++ ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1); ++ ++ va_start(args, fmt); ++ vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args); ++ va_end(args); ++ ++ /* ++ * Set ei->kind and ->reason for scx_dump_state(). They'll be set again ++ * in scx_ops_disable_workfn(). 
++ */ ++ ei->kind = kind; ++ ei->reason = scx_exit_reason(ei->kind); ++ ++ irq_work_queue(&scx_ops_error_irq_work); ++} ++ ++static struct kthread_worker *scx_create_rt_helper(const char *name) ++{ ++ struct kthread_worker *helper; ++ ++ helper = kthread_create_worker(0, name); ++ if (helper) ++ sched_set_fifo(helper->task); ++ return helper; ++} ++ ++static void check_hotplug_seq(const struct sched_ext_ops *ops) ++{ ++ unsigned long long global_hotplug_seq; ++ ++ /* ++ * If a hotplug event has occurred between when a scheduler was ++ * initialized, and when we were able to attach, exit and notify user ++ * space about it. ++ */ ++ if (ops->hotplug_seq) { ++ global_hotplug_seq = atomic_long_read(&scx_hotplug_seq); ++ if (ops->hotplug_seq != global_hotplug_seq) { ++ scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, ++ "expected hotplug seq %llu did not match actual %llu", ++ ops->hotplug_seq, global_hotplug_seq); ++ } ++ } ++} ++ ++static int validate_ops(const struct sched_ext_ops *ops) ++{ ++ /* ++ * It doesn't make sense to specify the SCX_OPS_ENQ_LAST flag if the ++ * ops.enqueue() callback isn't implemented. 
++ */ ++ if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) { ++ scx_ops_error("SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int scx_ops_enable(struct sched_ext_ops *ops) ++{ ++ struct scx_task_iter sti; ++ struct task_struct *p; ++ unsigned long timeout; ++ int i, cpu, ret; ++ ++ if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN), ++ cpu_possible_mask)) { ++ pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation"); ++ return -EINVAL; ++ } ++ ++ mutex_lock(&scx_ops_enable_mutex); ++ ++ if (!scx_ops_helper) { ++ WRITE_ONCE(scx_ops_helper, ++ scx_create_rt_helper("sched_ext_ops_helper")); ++ if (!scx_ops_helper) { ++ ret = -ENOMEM; ++ goto err_unlock; ++ } ++ } ++ ++ if (scx_ops_enable_state() != SCX_OPS_DISABLED) { ++ ret = -EBUSY; ++ goto err_unlock; ++ } ++ ++ scx_root_kobj = kzalloc(sizeof(*scx_root_kobj), GFP_KERNEL); ++ if (!scx_root_kobj) { ++ ret = -ENOMEM; ++ goto err_unlock; ++ } ++ ++ scx_root_kobj->kset = scx_kset; ++ ret = kobject_init_and_add(scx_root_kobj, &scx_ktype, NULL, "root"); ++ if (ret < 0) ++ goto err; ++ ++ scx_exit_info = alloc_exit_info(ops->exit_dump_len); ++ if (!scx_exit_info) { ++ ret = -ENOMEM; ++ goto err_del; ++ } ++ ++ /* ++ * Set scx_ops, transition to PREPPING and clear exit info to arm the ++ * disable path. Failure triggers full disabling from here on. ++ */ ++ scx_ops = *ops; ++ ++ WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_PREPPING) != ++ SCX_OPS_DISABLED); ++ ++ atomic_set(&scx_exit_kind, SCX_EXIT_NONE); ++ scx_warned_zero_slice = false; ++ ++ atomic_long_set(&scx_nr_rejected, 0); ++ ++ for_each_possible_cpu(cpu) ++ cpu_rq(cpu)->scx.cpuperf_target = SCX_CPUPERF_ONE; ++ ++ /* ++ * Keep CPUs stable during enable so that the BPF scheduler can track ++ * online CPUs by watching ->on/offline_cpu() after ->init(). 
++ */ ++ cpus_read_lock(); ++ ++ if (scx_ops.init) { ++ ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, init); ++ if (ret) { ++ ret = ops_sanitize_err("init", ret); ++ goto err_disable_unlock_cpus; ++ } ++ } ++ ++ for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++) ++ if (((void (**)(void))ops)[i]) ++ static_branch_enable_cpuslocked(&scx_has_op[i]); ++ ++ cpus_read_unlock(); ++ ++ ret = validate_ops(ops); ++ if (ret) ++ goto err_disable; ++ ++ WARN_ON_ONCE(scx_dsp_ctx); ++ scx_dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH; ++ scx_dsp_ctx = __alloc_percpu(struct_size_t(struct scx_dsp_ctx, buf, ++ scx_dsp_max_batch), ++ __alignof__(struct scx_dsp_ctx)); ++ if (!scx_dsp_ctx) { ++ ret = -ENOMEM; ++ goto err_disable; ++ } ++ ++ if (ops->timeout_ms) ++ timeout = msecs_to_jiffies(ops->timeout_ms); ++ else ++ timeout = SCX_WATCHDOG_MAX_TIMEOUT; ++ ++ WRITE_ONCE(scx_watchdog_timeout, timeout); ++ WRITE_ONCE(scx_watchdog_timestamp, jiffies); ++ queue_delayed_work(system_unbound_wq, &scx_watchdog_work, ++ scx_watchdog_timeout / 2); ++ ++ /* ++ * Lock out forks before opening the floodgate so that they don't wander ++ * into the operations prematurely. ++ * ++ * We don't need to keep the CPUs stable but grab cpus_read_lock() to ++ * ease future locking changes for cgroup suport. 
++ * ++ * Note that cpu_hotplug_lock must nest inside scx_fork_rwsem due to the ++ * following dependency chain: ++ * ++ * scx_fork_rwsem --> pernet_ops_rwsem --> cpu_hotplug_lock ++ */ ++ percpu_down_write(&scx_fork_rwsem); ++ cpus_read_lock(); ++ ++ check_hotplug_seq(ops); ++ ++ for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) ++ if (((void (**)(void))ops)[i]) ++ static_branch_enable_cpuslocked(&scx_has_op[i]); ++ ++ if (ops->flags & SCX_OPS_ENQ_LAST) ++ static_branch_enable_cpuslocked(&scx_ops_enq_last); ++ ++ if (ops->flags & SCX_OPS_ENQ_EXITING) ++ static_branch_enable_cpuslocked(&scx_ops_enq_exiting); ++ if (scx_ops.cpu_acquire || scx_ops.cpu_release) ++ static_branch_enable_cpuslocked(&scx_ops_cpu_preempt); ++ ++ if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { ++ reset_idle_masks(); ++ static_branch_enable_cpuslocked(&scx_builtin_idle_enabled); ++ } else { ++ static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); ++ } ++ ++ static_branch_enable_cpuslocked(&__scx_ops_enabled); ++ ++ /* ++ * Enable ops for every task. Fork is excluded by scx_fork_rwsem ++ * preventing new tasks from being added. No need to exclude tasks ++ * leaving as sched_ext_free() can handle both prepped and enabled ++ * tasks. Prep all tasks first and then enable them with preemption ++ * disabled. 
++ */ ++ spin_lock_irq(&scx_tasks_lock); ++ ++ scx_task_iter_init(&sti); ++ while ((p = scx_task_iter_next_locked(&sti, false))) { ++ get_task_struct(p); ++ scx_task_iter_rq_unlock(&sti); ++ spin_unlock_irq(&scx_tasks_lock); ++ ++ ret = scx_ops_init_task(p, task_group(p), false); ++ if (ret) { ++ put_task_struct(p); ++ spin_lock_irq(&scx_tasks_lock); ++ scx_task_iter_exit(&sti); ++ spin_unlock_irq(&scx_tasks_lock); ++ pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n", ++ ret, p->comm, p->pid); ++ goto err_disable_unlock_all; ++ } ++ ++ put_task_struct(p); ++ spin_lock_irq(&scx_tasks_lock); ++ } ++ scx_task_iter_exit(&sti); ++ ++ /* ++ * All tasks are prepped but are still ops-disabled. Ensure that ++ * %current can't be scheduled out and switch everyone. ++ * preempt_disable() is necessary because we can't guarantee that ++ * %current won't be starved if scheduled out while switching. ++ */ ++ preempt_disable(); ++ ++ /* ++ * From here on, the disable path must assume that tasks have ops ++ * enabled and need to be recovered. ++ * ++ * Transition to ENABLING fails iff the BPF scheduler has already ++ * triggered scx_bpf_error(). Returning an error code here would lose ++ * the recorded error information. Exit indicating success so that the ++ * error is notified through ops.exit() with all the details. ++ */ ++ if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLING, SCX_OPS_PREPPING)) { ++ preempt_enable(); ++ spin_unlock_irq(&scx_tasks_lock); ++ WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); ++ ret = 0; ++ goto err_disable_unlock_all; ++ } ++ ++ /* ++ * We're fully committed and can't fail. The PREPPED -> ENABLED ++ * transitions here are synchronized against sched_ext_free() through ++ * scx_tasks_lock. 
++ */ ++ WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); ++ ++ scx_task_iter_init(&sti); ++ while ((p = scx_task_iter_next_locked(&sti, false))) { ++ const struct sched_class *old_class = p->sched_class; ++ struct sched_enq_and_set_ctx ctx; ++ ++ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); ++ ++ scx_set_task_state(p, SCX_TASK_READY); ++ __setscheduler_prio(p, p->prio); ++ check_class_changing(task_rq(p), p, old_class); ++ ++ sched_enq_and_set_task(&ctx); ++ ++ check_class_changed(task_rq(p), p, old_class, p->prio); ++ } ++ scx_task_iter_exit(&sti); ++ ++ spin_unlock_irq(&scx_tasks_lock); ++ preempt_enable(); ++ cpus_read_unlock(); ++ percpu_up_write(&scx_fork_rwsem); ++ ++ /* see above ENABLING transition for the explanation on exiting with 0 */ ++ if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) { ++ WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); ++ ret = 0; ++ goto err_disable; ++ } ++ ++ if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL)) ++ static_branch_enable(&__scx_switched_all); ++ ++ kobject_uevent(scx_root_kobj, KOBJ_ADD); ++ mutex_unlock(&scx_ops_enable_mutex); ++ ++ return 0; ++ ++err_del: ++ kobject_del(scx_root_kobj); ++err: ++ kobject_put(scx_root_kobj); ++ scx_root_kobj = NULL; ++ if (scx_exit_info) { ++ free_exit_info(scx_exit_info); ++ scx_exit_info = NULL; ++ } ++err_unlock: ++ mutex_unlock(&scx_ops_enable_mutex); ++ return ret; ++ ++err_disable_unlock_all: ++ percpu_up_write(&scx_fork_rwsem); ++err_disable_unlock_cpus: ++ cpus_read_unlock(); ++err_disable: ++ mutex_unlock(&scx_ops_enable_mutex); ++ /* must be fully disabled before returning */ ++ scx_ops_disable(SCX_EXIT_ERROR); ++ kthread_flush_work(&scx_ops_disable_work); ++ return ret; ++} ++ ++ ++/******************************************************************************** ++ * bpf_struct_ops plumbing. 
++ */ ++#include <linux/bpf_verifier.h> ++#include <linux/bpf.h> ++#include <linux/btf.h> ++ ++extern struct btf *btf_vmlinux; ++static const struct btf_type *task_struct_type; ++static u32 task_struct_type_id; ++ ++static bool set_arg_maybe_null(const char *op, int arg_n, int off, int size, ++ enum bpf_access_type type, ++ const struct bpf_prog *prog, ++ struct bpf_insn_access_aux *info) ++{ ++ struct btf *btf = bpf_get_btf_vmlinux(); ++ const struct bpf_struct_ops_desc *st_ops_desc; ++ const struct btf_member *member; ++ const struct btf_type *t; ++ u32 btf_id, member_idx; ++ const char *mname; ++ ++ /* struct_ops op args are all sequential, 64-bit numbers */ ++ if (off != arg_n * sizeof(__u64)) ++ return false; ++ ++ /* btf_id should be the type id of struct sched_ext_ops */ ++ btf_id = prog->aux->attach_btf_id; ++ st_ops_desc = bpf_struct_ops_find(btf, btf_id); ++ if (!st_ops_desc) ++ return false; ++ ++ /* BTF type of struct sched_ext_ops */ ++ t = st_ops_desc->type; ++ ++ member_idx = prog->expected_attach_type; ++ if (member_idx >= btf_type_vlen(t)) ++ return false; ++ ++ /* ++ * Get the member name of this struct_ops program, which corresponds to ++ * a field in struct sched_ext_ops. For example, the member name of the ++ * dispatch struct_ops program (callback) is "dispatch". ++ */ ++ member = &btf_type_member(t)[member_idx]; ++ mname = btf_name_by_offset(btf_vmlinux, member->name_off); ++ ++ if (!strcmp(mname, op)) { ++ /* ++ * The value is a pointer to a type (struct task_struct) given ++ * by a BTF ID (PTR_TO_BTF_ID). It is trusted (PTR_TRUSTED), ++ * however, can be a NULL (PTR_MAYBE_NULL). The BPF program ++ * should check the pointer to make sure it is not NULL before ++ * using it, or the verifier will reject the program. ++ * ++ * Longer term, this is something that should be addressed by ++ * BTF, and be fully contained within the verifier. 
++ */ ++ info->reg_type = PTR_MAYBE_NULL | PTR_TO_BTF_ID | PTR_TRUSTED; ++ info->btf = btf_vmlinux; ++ info->btf_id = task_struct_type_id; ++ ++ return true; ++ } ++ ++ return false; ++} ++ ++static bool bpf_scx_is_valid_access(int off, int size, ++ enum bpf_access_type type, ++ const struct bpf_prog *prog, ++ struct bpf_insn_access_aux *info) ++{ ++ if (type != BPF_READ) ++ return false; ++ if (set_arg_maybe_null("dispatch", 1, off, size, type, prog, info) || ++ set_arg_maybe_null("yield", 1, off, size, type, prog, info)) ++ return true; ++ if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) ++ return false; ++ if (off % size != 0) ++ return false; ++ ++ return btf_ctx_access(off, size, type, prog, info); ++} ++ ++static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, ++ const struct bpf_reg_state *reg, int off, ++ int size) ++{ ++ const struct btf_type *t; ++ ++ t = btf_type_by_id(reg->btf, reg->btf_id); ++ if (t == task_struct_type) { ++ if (off >= offsetof(struct task_struct, scx.slice) && ++ off + size <= offsetofend(struct task_struct, scx.slice)) ++ return SCALAR_VALUE; ++ if (off >= offsetof(struct task_struct, scx.dsq_vtime) && ++ off + size <= offsetofend(struct task_struct, scx.dsq_vtime)) ++ return SCALAR_VALUE; ++ if (off >= offsetof(struct task_struct, scx.disallow) && ++ off + size <= offsetofend(struct task_struct, scx.disallow)) ++ return SCALAR_VALUE; ++ } ++ ++ return -EACCES; ++} ++ ++static const struct bpf_func_proto * ++bpf_scx_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) ++{ ++ switch (func_id) { ++ case BPF_FUNC_task_storage_get: ++ return &bpf_task_storage_get_proto; ++ case BPF_FUNC_task_storage_delete: ++ return &bpf_task_storage_delete_proto; ++ default: ++ return bpf_base_func_proto(func_id, prog); ++ } ++} ++ ++static const struct bpf_verifier_ops bpf_scx_verifier_ops = { ++ .get_func_proto = bpf_scx_get_func_proto, ++ .is_valid_access = bpf_scx_is_valid_access, ++ .btf_struct_access = 
bpf_scx_btf_struct_access, ++}; ++ ++static int bpf_scx_init_member(const struct btf_type *t, ++ const struct btf_member *member, ++ void *kdata, const void *udata) ++{ ++ const struct sched_ext_ops *uops = udata; ++ struct sched_ext_ops *ops = kdata; ++ u32 moff = __btf_member_bit_offset(t, member) / 8; ++ int ret; ++ ++ switch (moff) { ++ case offsetof(struct sched_ext_ops, dispatch_max_batch): ++ if (*(u32 *)(udata + moff) > INT_MAX) ++ return -E2BIG; ++ ops->dispatch_max_batch = *(u32 *)(udata + moff); ++ return 1; ++ case offsetof(struct sched_ext_ops, flags): ++ if (*(u64 *)(udata + moff) & ~SCX_OPS_ALL_FLAGS) ++ return -EINVAL; ++ ops->flags = *(u64 *)(udata + moff); ++ return 1; ++ case offsetof(struct sched_ext_ops, name): ++ ret = bpf_obj_name_cpy(ops->name, uops->name, ++ sizeof(ops->name)); ++ if (ret < 0) ++ return ret; ++ if (ret == 0) ++ return -EINVAL; ++ return 1; ++ case offsetof(struct sched_ext_ops, timeout_ms): ++ if (msecs_to_jiffies(*(u32 *)(udata + moff)) > ++ SCX_WATCHDOG_MAX_TIMEOUT) ++ return -E2BIG; ++ ops->timeout_ms = *(u32 *)(udata + moff); ++ return 1; ++ case offsetof(struct sched_ext_ops, exit_dump_len): ++ ops->exit_dump_len = ++ *(u32 *)(udata + moff) ?: SCX_EXIT_DUMP_DFL_LEN; ++ return 1; ++ case offsetof(struct sched_ext_ops, hotplug_seq): ++ ops->hotplug_seq = *(u64 *)(udata + moff); ++ return 1; ++ } ++ ++ return 0; ++} ++ ++static int bpf_scx_check_member(const struct btf_type *t, ++ const struct btf_member *member, ++ const struct bpf_prog *prog) ++{ ++ u32 moff = __btf_member_bit_offset(t, member) / 8; ++ ++ switch (moff) { ++ case offsetof(struct sched_ext_ops, init_task): ++ case offsetof(struct sched_ext_ops, cpu_online): ++ case offsetof(struct sched_ext_ops, cpu_offline): ++ case offsetof(struct sched_ext_ops, init): ++ case offsetof(struct sched_ext_ops, exit): ++ break; ++ default: ++ if (prog->sleepable) ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int bpf_scx_reg(void *kdata) ++{ ++ return 
scx_ops_enable(kdata); ++} ++ ++static void bpf_scx_unreg(void *kdata) ++{ ++ scx_ops_disable(SCX_EXIT_UNREG); ++ kthread_flush_work(&scx_ops_disable_work); ++} ++ ++static int bpf_scx_init(struct btf *btf) ++{ ++ u32 type_id; ++ ++ type_id = btf_find_by_name_kind(btf, "task_struct", BTF_KIND_STRUCT); ++ if (type_id < 0) ++ return -EINVAL; ++ task_struct_type = btf_type_by_id(btf, type_id); ++ task_struct_type_id = type_id; ++ ++ return 0; ++} ++ ++static int bpf_scx_update(void *kdata, void *old_kdata) ++{ ++ /* ++ * sched_ext does not support updating the actively-loaded BPF ++ * scheduler, as registering a BPF scheduler can always fail if the ++ * scheduler returns an error code for e.g. ops.init(), ops.init_task(), ++ * etc. Similarly, we can always race with unregistration happening ++ * elsewhere, such as with sysrq. ++ */ ++ return -EOPNOTSUPP; ++} ++ ++static int bpf_scx_validate(void *kdata) ++{ ++ return 0; ++} ++ ++static s32 select_cpu_stub(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; } ++static void enqueue_stub(struct task_struct *p, u64 enq_flags) {} ++static void dequeue_stub(struct task_struct *p, u64 enq_flags) {} ++static void dispatch_stub(s32 prev_cpu, struct task_struct *p) {} ++static void runnable_stub(struct task_struct *p, u64 enq_flags) {} ++static void running_stub(struct task_struct *p) {} ++static void stopping_stub(struct task_struct *p, bool runnable) {} ++static void quiescent_stub(struct task_struct *p, u64 deq_flags) {} ++static bool yield_stub(struct task_struct *from, struct task_struct *to) { return false; } ++static bool core_sched_before_stub(struct task_struct *a, struct task_struct *b) { return false; } ++static void set_weight_stub(struct task_struct *p, u32 weight) {} ++static void set_cpumask_stub(struct task_struct *p, const struct cpumask *mask) {} ++static void update_idle_stub(s32 cpu, bool idle) {} ++static void cpu_acquire_stub(s32 cpu, struct scx_cpu_acquire_args *args) {} ++static void 
cpu_release_stub(s32 cpu, struct scx_cpu_release_args *args) {} ++static s32 init_task_stub(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; } ++static void exit_task_stub(struct task_struct *p, struct scx_exit_task_args *args) {} ++static void enable_stub(struct task_struct *p) {} ++static void disable_stub(struct task_struct *p) {} ++static void cpu_online_stub(s32 cpu) {} ++static void cpu_offline_stub(s32 cpu) {} ++static s32 init_stub(void) { return -EINVAL; } ++static void exit_stub(struct scx_exit_info *info) {} ++ ++static struct sched_ext_ops __bpf_ops_sched_ext_ops = { ++ .select_cpu = select_cpu_stub, ++ .enqueue = enqueue_stub, ++ .dequeue = dequeue_stub, ++ .dispatch = dispatch_stub, ++ .runnable = runnable_stub, ++ .running = running_stub, ++ .stopping = stopping_stub, ++ .quiescent = quiescent_stub, ++ .yield = yield_stub, ++ .core_sched_before = core_sched_before_stub, ++ .set_weight = set_weight_stub, ++ .set_cpumask = set_cpumask_stub, ++ .update_idle = update_idle_stub, ++ .cpu_acquire = cpu_acquire_stub, ++ .cpu_release = cpu_release_stub, ++ .init_task = init_task_stub, ++ .exit_task = exit_task_stub, ++ .enable = enable_stub, ++ .disable = disable_stub, ++ .cpu_online = cpu_online_stub, ++ .cpu_offline = cpu_offline_stub, ++ .init = init_stub, ++ .exit = exit_stub, ++}; ++ ++static struct bpf_struct_ops bpf_sched_ext_ops = { ++ .verifier_ops = &bpf_scx_verifier_ops, ++ .reg = bpf_scx_reg, ++ .unreg = bpf_scx_unreg, ++ .check_member = bpf_scx_check_member, ++ .init_member = bpf_scx_init_member, ++ .init = bpf_scx_init, ++ .update = bpf_scx_update, ++ .validate = bpf_scx_validate, ++ .name = "sched_ext_ops", ++ .owner = THIS_MODULE, ++ .cfi_stubs = &__bpf_ops_sched_ext_ops ++}; ++ ++ ++/******************************************************************************** ++ * System integration and init. 
++ */ ++ ++static void sysrq_handle_sched_ext_reset(u8 key) ++{ ++ if (scx_ops_helper) ++ scx_ops_disable(SCX_EXIT_SYSRQ); ++ else ++ pr_info("sched_ext: BPF scheduler not yet used\n"); ++} ++ ++static const struct sysrq_key_op sysrq_sched_ext_reset_op = { ++ .handler = sysrq_handle_sched_ext_reset, ++ .help_msg = "reset-sched-ext(S)", ++ .action_msg = "Disable sched_ext and revert all tasks to CFS", ++ .enable_mask = SYSRQ_ENABLE_RTNICE, ++}; ++ ++static void sysrq_handle_sched_ext_dump(u8 key) ++{ ++ struct scx_exit_info ei = { .kind = SCX_EXIT_NONE, .reason = "SysRq-D" }; ++ ++ if (scx_enabled()) ++ scx_dump_state(&ei, 0); ++} ++ ++static const struct sysrq_key_op sysrq_sched_ext_dump_op = { ++ .handler = sysrq_handle_sched_ext_dump, ++ .help_msg = "dump-sched-ext(D)", ++ .action_msg = "Trigger sched_ext debug dump", ++ .enable_mask = SYSRQ_ENABLE_RTNICE, ++}; ++ ++static bool can_skip_idle_kick(struct rq *rq) ++{ ++ lockdep_assert_rq_held(rq); ++ ++ /* ++ * We can skip idle kicking if @rq is going to go through at least one ++ * full SCX scheduling cycle before going idle. Just checking whether ++ * curr is not idle is insufficient because we could be racing ++ * balance_one() trying to pull the next task from a remote rq, which ++ * may fail, and @rq may become idle afterwards. ++ * ++ * The race window is small and we don't and can't guarantee that @rq is ++ * only kicked while idle anyway. Skip only when sure. ++ */ ++ return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_IN_BALANCE); ++} ++ ++static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *pseqs) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ struct scx_rq *this_scx = &this_rq->scx; ++ bool should_wait = false; ++ unsigned long flags; ++ ++ raw_spin_rq_lock_irqsave(rq, flags); ++ ++ /* ++ * During CPU hotplug, a CPU may depend on kicking itself to make ++ * forward progress. Allow kicking self regardless of online state. 
++ */ ++ if (cpu_online(cpu) || cpu == cpu_of(this_rq)) { ++ if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) { ++ if (rq->curr->sched_class == &ext_sched_class) ++ rq->curr->scx.slice = 0; ++ cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); ++ } ++ ++ if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) { ++ pseqs[cpu] = rq->scx.pnt_seq; ++ should_wait = true; ++ } ++ ++ resched_curr(rq); ++ } else { ++ cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); ++ cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); ++ } ++ ++ raw_spin_rq_unlock_irqrestore(rq, flags); ++ ++ return should_wait; ++} ++ ++static void kick_one_cpu_if_idle(s32 cpu, struct rq *this_rq) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ raw_spin_rq_lock_irqsave(rq, flags); ++ ++ if (!can_skip_idle_kick(rq) && ++ (cpu_online(cpu) || cpu == cpu_of(this_rq))) ++ resched_curr(rq); ++ ++ raw_spin_rq_unlock_irqrestore(rq, flags); ++} ++ ++static void kick_cpus_irq_workfn(struct irq_work *irq_work) ++{ ++ struct rq *this_rq = this_rq(); ++ struct scx_rq *this_scx = &this_rq->scx; ++ unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs); ++ bool should_wait = false; ++ s32 cpu; ++ ++ for_each_cpu(cpu, this_scx->cpus_to_kick) { ++ should_wait |= kick_one_cpu(cpu, this_rq, pseqs); ++ cpumask_clear_cpu(cpu, this_scx->cpus_to_kick); ++ cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); ++ } ++ ++ for_each_cpu(cpu, this_scx->cpus_to_kick_if_idle) { ++ kick_one_cpu_if_idle(cpu, this_rq); ++ cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); ++ } ++ ++ if (!should_wait) ++ return; ++ ++ for_each_cpu(cpu, this_scx->cpus_to_wait) { ++ unsigned long *wait_pnt_seq = &cpu_rq(cpu)->scx.pnt_seq; ++ ++ if (cpu != cpu_of(this_rq)) { ++ /* ++ * Pairs with smp_store_release() issued by this CPU in ++ * scx_next_task_picked() on the resched path. 
++ * ++ * We busy-wait here to guarantee that no other task can ++ * be scheduled on our core before the target CPU has ++ * entered the resched path. ++ */ ++ while (smp_load_acquire(wait_pnt_seq) == pseqs[cpu]) ++ cpu_relax(); ++ } ++ ++ cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); ++ } ++} ++ ++/** ++ * print_scx_info - print out sched_ext scheduler state ++ * @log_lvl: the log level to use when printing ++ * @p: target task ++ * ++ * If a sched_ext scheduler is enabled, print the name and state of the ++ * scheduler. If @p is on sched_ext, print further information about the task. ++ * ++ * This function can be safely called on any task as long as the task_struct ++ * itself is accessible. While safe, this function isn't synchronized and may ++ * print out mixups or garbages of limited length. ++ */ ++void print_scx_info(const char *log_lvl, struct task_struct *p) ++{ ++ enum scx_ops_enable_state state = scx_ops_enable_state(); ++ const char *all = READ_ONCE(scx_switching_all) ? "+all" : ""; ++ char runnable_at_buf[22] = "?"; ++ struct sched_class *class; ++ unsigned long runnable_at; ++ ++ if (state == SCX_OPS_DISABLED) ++ return; ++ ++ /* ++ * Carefully check if the task was running on sched_ext, and then ++ * carefully copy the time it's been runnable, and its state. 
++ */ ++ if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) || ++ class != &ext_sched_class) { ++ printk("%sSched_ext: %s (%s%s)", log_lvl, scx_ops.name, ++ scx_ops_enable_state_str[state], all); ++ return; ++ } ++ ++ if (!copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at, ++ sizeof(runnable_at))) ++ scnprintf(runnable_at_buf, sizeof(runnable_at_buf), "%+ldms", ++ jiffies_delta_msecs(runnable_at, jiffies)); ++ ++ /* print everything onto one line to conserve console space */ ++ printk("%sSched_ext: %s (%s%s), task: runnable_at=%s", ++ log_lvl, scx_ops.name, scx_ops_enable_state_str[state], all, ++ runnable_at_buf); ++} ++ ++static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void *ptr) ++{ ++ /* ++ * SCX schedulers often have userspace components which are sometimes ++ * involved in critial scheduling paths. PM operations involve freezing ++ * userspace which can lead to scheduling misbehaviors including stalls. ++ * Let's bypass while PM operations are in progress. ++ */ ++ switch (event) { ++ case PM_HIBERNATION_PREPARE: ++ case PM_SUSPEND_PREPARE: ++ case PM_RESTORE_PREPARE: ++ scx_ops_bypass(true); ++ break; ++ case PM_POST_HIBERNATION: ++ case PM_POST_SUSPEND: ++ case PM_POST_RESTORE: ++ scx_ops_bypass(false); ++ break; ++ } ++ ++ return NOTIFY_OK; ++} ++ ++static struct notifier_block scx_pm_notifier = { ++ .notifier_call = scx_pm_handler, ++}; ++ ++void __init init_sched_ext_class(void) ++{ ++ s32 cpu, v; ++ ++ /* ++ * The following is to prevent the compiler from optimizing out the enum ++ * definitions so that BPF scheduler implementations can use them ++ * through the generated vmlinux.h. 
++ */ ++ WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT); ++ ++ BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); ++ init_dsq(&scx_dsq_global, SCX_DSQ_GLOBAL); ++#ifdef CONFIG_SMP ++ BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL)); ++ BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL)); ++#endif ++ scx_kick_cpus_pnt_seqs = ++ __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids, ++ __alignof__(scx_kick_cpus_pnt_seqs[0])); ++ BUG_ON(!scx_kick_cpus_pnt_seqs); ++ ++ for_each_possible_cpu(cpu) { ++ struct rq *rq = cpu_rq(cpu); ++ ++ init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); ++ INIT_LIST_HEAD(&rq->scx.runnable_list); ++ INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals); ++ ++ BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL)); ++ BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL)); ++ BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL)); ++ BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL)); ++ init_irq_work(&rq->scx.deferred_irq_work, deferred_irq_workfn); ++ init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); ++ ++ if (cpu_online(cpu)) ++ cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE; ++ } ++ ++ register_sysrq_key('S', &sysrq_sched_ext_reset_op); ++ register_sysrq_key('D', &sysrq_sched_ext_dump_op); ++ INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn); ++} ++ ++ ++/******************************************************************************** ++ * Helpers that can be called from the BPF scheduler. ++ */ ++#include <linux/btf_ids.h> ++ ++__bpf_kfunc_start_defs(); ++ ++/** ++ * scx_bpf_create_dsq - Create a custom DSQ ++ * @dsq_id: DSQ to create ++ * @node: NUMA node to allocate from ++ * ++ * Create a custom DSQ identified by @dsq_id. Can be called from ops.init() and ++ * ops.init_task(). 
++ */ ++__bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) ++{ ++ if (!scx_kf_allowed(SCX_KF_SLEEPABLE)) ++ return -EINVAL; ++ ++ if (unlikely(node >= (int)nr_node_ids || ++ (node < 0 && node != NUMA_NO_NODE))) ++ return -EINVAL; ++ return PTR_ERR_OR_ZERO(create_dsq(dsq_id, node)); ++} ++ ++__bpf_kfunc_end_defs(); ++ ++BTF_KFUNCS_START(scx_kfunc_ids_sleepable) ++BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE) ++BTF_KFUNCS_END(scx_kfunc_ids_sleepable) ++ ++static const struct btf_kfunc_id_set scx_kfunc_set_sleepable = { ++ .owner = THIS_MODULE, ++ .set = &scx_kfunc_ids_sleepable, ++}; ++ ++__bpf_kfunc_start_defs(); ++ ++/** ++ * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu() ++ * @p: task_struct to select a CPU for ++ * @prev_cpu: CPU @p was on previously ++ * @wake_flags: %SCX_WAKE_* flags ++ * @is_idle: out parameter indicating whether the returned CPU is idle ++ * ++ * Can only be called from ops.select_cpu() if the built-in CPU selection is ++ * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set. ++ * @p, @prev_cpu and @wake_flags match ops.select_cpu(). ++ * ++ * Returns the picked CPU with *@is_idle indicating whether the picked CPU is ++ * currently idle and thus a good candidate for direct dispatching. 
++ */ ++__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, ++ u64 wake_flags, bool *is_idle) ++{ ++ if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) { ++ *is_idle = false; ++ return prev_cpu; ++ } ++#ifdef CONFIG_SMP ++ return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle); ++#else ++ *is_idle = false; ++ return prev_cpu; ++#endif ++} ++ ++__bpf_kfunc_end_defs(); ++ ++BTF_KFUNCS_START(scx_kfunc_ids_select_cpu) ++BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) ++BTF_KFUNCS_END(scx_kfunc_ids_select_cpu) ++ ++static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = { ++ .owner = THIS_MODULE, ++ .set = &scx_kfunc_ids_select_cpu, ++}; ++ ++static bool scx_dispatch_preamble(struct task_struct *p, u64 enq_flags) ++{ ++ if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) ++ return false; ++ ++ lockdep_assert_irqs_disabled(); ++ ++ if (unlikely(!p)) { ++ scx_ops_error("called with NULL task"); ++ return false; ++ } ++ ++ if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) { ++ scx_ops_error("invalid enq_flags 0x%llx", enq_flags); ++ return false; ++ } ++ ++ return true; ++} ++ ++static void scx_dispatch_commit(struct task_struct *p, u64 dsq_id, u64 enq_flags) ++{ ++ struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); ++ struct task_struct *ddsp_task; ++ ++ ddsp_task = __this_cpu_read(direct_dispatch_task); ++ if (ddsp_task) { ++ mark_direct_dispatch(ddsp_task, p, dsq_id, enq_flags); ++ return; ++ } ++ ++ if (unlikely(dspc->cursor >= scx_dsp_max_batch)) { ++ scx_ops_error("dispatch buffer overflow"); ++ return; ++ } ++ ++ dspc->buf[dspc->cursor++] = (struct scx_dsp_buf_ent){ ++ .task = p, ++ .qseq = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK, ++ .dsq_id = dsq_id, ++ .enq_flags = enq_flags, ++ }; ++} ++ ++__bpf_kfunc_start_defs(); ++ ++/** ++ * scx_bpf_dispatch - Dispatch a task into the FIFO queue of a DSQ ++ * @p: task_struct to dispatch ++ * @dsq_id: DSQ to dispatch to ++ * @slice: duration @p can run for in nsecs ++ * 
@enq_flags: SCX_ENQ_* ++ * ++ * Dispatch @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe ++ * to call this function spuriously. Can be called from ops.enqueue(), ++ * ops.select_cpu(), and ops.dispatch(). ++ * ++ * When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch ++ * and @p must match the task being enqueued. Also, %SCX_DSQ_LOCAL_ON can't be ++ * used to target the local DSQ of a CPU other than the enqueueing one. Use ++ * ops.select_cpu() to be on the target CPU in the first place. ++ * ++ * When called from ops.select_cpu(), @enq_flags and @dsq_id are stored, and @p ++ * will be directly dispatched to the corresponding dispatch queue after ++ * ops.select_cpu() returns. If @p is dispatched to SCX_DSQ_LOCAL, it will be ++ * dispatched to the local DSQ of the CPU returned by ops.select_cpu(). ++ * @enq_flags are OR'd with the enqueue flags on the enqueue path before the ++ * task is dispatched. ++ * ++ * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id ++ * and this function can be called up to ops.dispatch_max_batch times to dispatch ++ * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the ++ * remaining slots. scx_bpf_consume() flushes the batch and resets the counter. ++ * ++ * This function doesn't have any locking restrictions and may be called under ++ * BPF locks (in the future when BPF introduces more flexible locking). ++ * ++ * @p is allowed to run for @slice. The scheduling path is triggered on slice ++ * exhaustion. If zero, the current residual slice is maintained. If ++ * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with ++ * scx_bpf_kick_cpu() to trigger scheduling. 
++ */ ++__bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, ++ u64 enq_flags) ++{ ++ if (!scx_dispatch_preamble(p, enq_flags)) ++ return; ++ ++ if (slice) ++ p->scx.slice = slice; ++ else ++ p->scx.slice = p->scx.slice ?: 1; ++ ++ scx_dispatch_commit(p, dsq_id, enq_flags); ++} ++ ++/** ++ * scx_bpf_dispatch_vtime - Dispatch a task into the vtime priority queue of a DSQ ++ * @p: task_struct to dispatch ++ * @dsq_id: DSQ to dispatch to ++ * @slice: duration @p can run for in nsecs ++ * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ ++ * @enq_flags: SCX_ENQ_* ++ * ++ * Dispatch @p into the vtime priority queue of the DSQ identified by @dsq_id. ++ * Tasks queued into the priority queue are ordered by @vtime and always ++ * consumed after the tasks in the FIFO queue. All other aspects are identical ++ * to scx_bpf_dispatch(). ++ * ++ * @vtime ordering is according to time_before64() which considers wrapping. A ++ * numerically larger vtime may indicate an earlier position in the ordering and ++ * vice-versa. ++ */ ++__bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, ++ u64 slice, u64 vtime, u64 enq_flags) ++{ ++ if (!scx_dispatch_preamble(p, enq_flags)) ++ return; ++ ++ if (slice) ++ p->scx.slice = slice; ++ else ++ p->scx.slice = p->scx.slice ?: 1; ++ ++ p->scx.dsq_vtime = vtime; ++ ++ scx_dispatch_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); ++} ++ ++__bpf_kfunc_end_defs(); ++ ++BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch) ++BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU) ++BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime, KF_RCU) ++BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch) ++ ++static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { ++ .owner = THIS_MODULE, ++ .set = &scx_kfunc_ids_enqueue_dispatch, ++}; ++ ++__bpf_kfunc_start_defs(); ++ ++/** ++ * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots ++ * ++ * Can only be called from ops.dispatch(). 
++ */ ++__bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void) ++{ ++ if (!scx_kf_allowed(SCX_KF_DISPATCH)) ++ return 0; ++ ++ return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx->cursor); ++} ++ ++/** ++ * scx_bpf_dispatch_cancel - Cancel the latest dispatch ++ * ++ * Cancel the latest dispatch. Can be called multiple times to cancel further ++ * dispatches. Can only be called from ops.dispatch(). ++ */ ++__bpf_kfunc void scx_bpf_dispatch_cancel(void) ++{ ++ struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); ++ ++ if (!scx_kf_allowed(SCX_KF_DISPATCH)) ++ return; ++ ++ if (dspc->cursor > 0) ++ dspc->cursor--; ++ else ++ scx_ops_error("dispatch buffer underflow"); ++} ++ ++/** ++ * scx_bpf_consume - Transfer a task from a DSQ to the current CPU's local DSQ ++ * @dsq_id: DSQ to consume ++ * ++ * Consume a task from the non-local DSQ identified by @dsq_id and transfer it ++ * to the current CPU's local DSQ for execution. Can only be called from ++ * ops.dispatch(). ++ * ++ * This function flushes the in-flight dispatches from scx_bpf_dispatch() before ++ * trying to consume the specified DSQ. It may also grab rq locks and thus can't ++ * be called under any BPF locks. ++ * ++ * Returns %true if a task has been consumed, %false if there isn't any task to ++ * consume. ++ */ ++__bpf_kfunc bool scx_bpf_consume(u64 dsq_id) ++{ ++ struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); ++ struct scx_dispatch_q *dsq; ++ ++ if (!scx_kf_allowed(SCX_KF_DISPATCH)) ++ return false; ++ ++ flush_dispatch_buf(dspc->rq); ++ ++ dsq = find_non_local_dsq(dsq_id); ++ if (unlikely(!dsq)) { ++ scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id); ++ return false; ++ } ++ ++ if (consume_dispatch_q(dspc->rq, dsq)) { ++ /* ++ * A successfully consumed task can be dequeued before it starts ++ * running while the CPU is trying to migrate other dispatched ++ * tasks. Bump nr_tasks to tell balance_scx() to retry on empty ++ * local DSQ. 
++ */ ++ dspc->nr_tasks++; ++ return true; ++ } else { ++ return false; ++ } ++} ++ ++__bpf_kfunc_end_defs(); ++ ++BTF_KFUNCS_START(scx_kfunc_ids_dispatch) ++BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots) ++BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel) ++BTF_ID_FLAGS(func, scx_bpf_consume) ++BTF_KFUNCS_END(scx_kfunc_ids_dispatch) ++ ++static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { ++ .owner = THIS_MODULE, ++ .set = &scx_kfunc_ids_dispatch, ++}; ++ ++__bpf_kfunc_start_defs(); ++ ++/** ++ * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ ++ * ++ * Iterate over all of the tasks currently enqueued on the local DSQ of the ++ * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of ++ * processed tasks. Can only be called from ops.cpu_release(). ++ */ ++__bpf_kfunc u32 scx_bpf_reenqueue_local(void) ++{ ++ LIST_HEAD(tasks); ++ u32 nr_enqueued = 0; ++ struct rq *rq; ++ struct task_struct *p, *n; ++ ++ if (!scx_kf_allowed(SCX_KF_CPU_RELEASE)) ++ return 0; ++ ++ rq = cpu_rq(smp_processor_id()); ++ lockdep_assert_rq_held(rq); ++ ++ /* ++ * The BPF scheduler may choose to dispatch tasks back to ++ * @rq->scx.local_dsq. Move all candidate tasks off to a private list ++ * first to avoid processing the same tasks repeatedly. ++ */ ++ list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list, ++ scx.dsq_list.node) { ++ /* ++ * If @p is being migrated, @p's current CPU may not agree with ++ * its allowed CPUs and the migration_cpu_stop is about to ++ * deactivate and re-activate @p anyway. Skip re-enqueueing. ++ * ++ * While racing sched property changes may also dequeue and ++ * re-enqueue a migrating task while its current CPU and allowed ++ * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to ++ * the current local DSQ for running tasks and thus are not ++ * visible to the BPF scheduler. 
++ */ ++ if (p->migration_pending) ++ continue; ++ ++ dispatch_dequeue(rq, p); ++ list_add_tail(&p->scx.dsq_list.node, &tasks); ++ } ++ ++ list_for_each_entry_safe(p, n, &tasks, scx.dsq_list.node) { ++ list_del_init(&p->scx.dsq_list.node); ++ do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); ++ nr_enqueued++; ++ } ++ ++ return nr_enqueued; ++} ++ ++__bpf_kfunc_end_defs(); ++ ++BTF_KFUNCS_START(scx_kfunc_ids_cpu_release) ++BTF_ID_FLAGS(func, scx_bpf_reenqueue_local) ++BTF_KFUNCS_END(scx_kfunc_ids_cpu_release) ++ ++static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = { ++ .owner = THIS_MODULE, ++ .set = &scx_kfunc_ids_cpu_release, ++}; ++ ++__bpf_kfunc_start_defs(); ++ ++/** ++ * scx_bpf_kick_cpu - Trigger reschedule on a CPU ++ * @cpu: cpu to kick ++ * @flags: %SCX_KICK_* flags ++ * ++ * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or ++ * trigger rescheduling on a busy CPU. This can be called from any online ++ * scx_ops operation and the actual kicking is performed asynchronously through ++ * an irq work. ++ */ ++__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) ++{ ++ struct rq *this_rq; ++ unsigned long irq_flags; ++ ++ if (!ops_cpu_valid(cpu, NULL)) ++ return; ++ ++ /* ++ * While bypassing for PM ops, IRQ handling may not be online which can ++ * lead to irq_work_queue() malfunction such as infinite busy wait for ++ * IRQ status update. Suppress kicking. ++ */ ++ if (scx_ops_bypassing()) ++ return; ++ ++ local_irq_save(irq_flags); ++ ++ this_rq = this_rq(); ++ ++ /* ++ * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting ++ * rq locks. We can probably be smarter and avoid bouncing if called ++ * from ops which don't hold a rq lock. 
++ */ ++ if (flags & SCX_KICK_IDLE) { ++ struct rq *target_rq = cpu_rq(cpu); ++ ++ if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT))) ++ scx_ops_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE"); ++ ++ if (raw_spin_rq_trylock(target_rq)) { ++ if (can_skip_idle_kick(target_rq)) { ++ raw_spin_rq_unlock(target_rq); ++ goto out; ++ } ++ raw_spin_rq_unlock(target_rq); ++ } ++ cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick_if_idle); ++ } else { ++ cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick); ++ ++ if (flags & SCX_KICK_PREEMPT) ++ cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt); ++ if (flags & SCX_KICK_WAIT) ++ cpumask_set_cpu(cpu, this_rq->scx.cpus_to_wait); ++ } ++ ++ irq_work_queue(&this_rq->scx.kick_cpus_irq_work); ++out: ++ local_irq_restore(irq_flags); ++} ++ ++/** ++ * scx_bpf_dsq_nr_queued - Return the number of queued tasks ++ * @dsq_id: id of the DSQ ++ * ++ * Return the number of tasks in the DSQ matching @dsq_id. If not found, ++ * -%ENOENT is returned. ++ */ ++__bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id) ++{ ++ struct scx_dispatch_q *dsq; ++ s32 ret; ++ ++ preempt_disable(); ++ ++ if (dsq_id == SCX_DSQ_LOCAL) { ++ ret = READ_ONCE(this_rq()->scx.local_dsq.nr); ++ goto out; ++ } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { ++ s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; ++ ++ if (ops_cpu_valid(cpu, NULL)) { ++ ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr); ++ goto out; ++ } ++ } else { ++ dsq = find_non_local_dsq(dsq_id); ++ if (dsq) { ++ ret = READ_ONCE(dsq->nr); ++ goto out; ++ } ++ } ++ ret = -ENOENT; ++out: ++ preempt_enable(); ++ return ret; ++} ++ ++/** ++ * scx_bpf_destroy_dsq - Destroy a custom DSQ ++ * @dsq_id: DSQ to destroy ++ * ++ * Destroy the custom DSQ identified by @dsq_id. Only DSQs created with ++ * scx_bpf_create_dsq() can be destroyed. The caller must ensure that the DSQ is ++ * empty and no further tasks are dispatched to it. Ignored if called on a DSQ ++ * which doesn't exist. 
Can be called from any online scx_ops operations. ++ */ ++__bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id) ++{ ++ destroy_dsq(dsq_id); ++} ++ ++/** ++ * bpf_iter_scx_dsq_new - Create a DSQ iterator ++ * @it: iterator to initialize ++ * @dsq_id: DSQ to iterate ++ * @flags: %SCX_DSQ_ITER_* ++ * ++ * Initialize BPF iterator @it which can be used with bpf_for_each() to walk ++ * tasks in the DSQ specified by @dsq_id. Iteration using @it only includes ++ * tasks which are already queued when this function is invoked. ++ */ ++__bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, ++ u64 flags) ++{ ++ struct bpf_iter_scx_dsq_kern *kit = (void *)it; ++ ++ BUILD_BUG_ON(sizeof(struct bpf_iter_scx_dsq_kern) > ++ sizeof(struct bpf_iter_scx_dsq)); ++ BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) != ++ __alignof__(struct bpf_iter_scx_dsq)); ++ ++ if (flags & ~__SCX_DSQ_ITER_ALL_FLAGS) ++ return -EINVAL; ++ ++ kit->dsq = find_non_local_dsq(dsq_id); ++ if (!kit->dsq) ++ return -ENOENT; ++ ++ INIT_LIST_HEAD(&kit->cursor.node); ++ kit->cursor.is_bpf_iter_cursor = true; ++ kit->dsq_seq = READ_ONCE(kit->dsq->seq); ++ kit->flags = flags; ++ ++ return 0; ++} ++ ++/** ++ * bpf_iter_scx_dsq_next - Progress a DSQ iterator ++ * @it: iterator to progress ++ * ++ * Return the next task. See bpf_iter_scx_dsq_new(). ++ */ ++__bpf_kfunc struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) ++{ ++ struct bpf_iter_scx_dsq_kern *kit = (void *)it; ++ bool rev = kit->flags & SCX_DSQ_ITER_REV; ++ struct task_struct *p; ++ unsigned long flags; ++ ++ if (!kit->dsq) ++ return NULL; ++ ++ raw_spin_lock_irqsave(&kit->dsq->lock, flags); ++ ++ if (list_empty(&kit->cursor.node)) ++ p = NULL; ++ else ++ p = container_of(&kit->cursor, struct task_struct, scx.dsq_list); ++ ++ /* ++ * Only tasks which were queued before the iteration started are ++ * visible. 
This bounds BPF iterations and guarantees that vtime never ++ * jumps in the other direction while iterating. ++ */ ++ do { ++ p = nldsq_next_task(kit->dsq, p, rev); ++ } while (p && unlikely(u32_before(kit->dsq_seq, p->scx.dsq_seq))); ++ ++ if (p) { ++ if (rev) ++ list_move_tail(&kit->cursor.node, &p->scx.dsq_list.node); ++ else ++ list_move(&kit->cursor.node, &p->scx.dsq_list.node); ++ } else { ++ list_del_init(&kit->cursor.node); ++ } ++ ++ raw_spin_unlock_irqrestore(&kit->dsq->lock, flags); ++ ++ return p; ++} ++ ++/** ++ * bpf_iter_scx_dsq_destroy - Destroy a DSQ iterator ++ * @it: iterator to destroy ++ * ++ * Undo bpf_iter_scx_dsq_new(). ++ */ ++__bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) ++{ ++ struct bpf_iter_scx_dsq_kern *kit = (void *)it; ++ ++ if (!kit->dsq) ++ return; ++ ++ if (!list_empty(&kit->cursor.node)) { ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&kit->dsq->lock, flags); ++ list_del_init(&kit->cursor.node); ++ raw_spin_unlock_irqrestore(&kit->dsq->lock, flags); ++ } ++ kit->dsq = NULL; ++} ++ ++__bpf_kfunc_end_defs(); ++ ++static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size, ++ char *fmt, unsigned long long *data, u32 data__sz) ++{ ++ struct bpf_bprintf_data bprintf_data = { .get_bin_args = true }; ++ s32 ret; ++ ++ if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 || ++ (data__sz && !data)) { ++ scx_ops_error("invalid data=%p and data__sz=%u", ++ (void *)data, data__sz); ++ return -EINVAL; ++ } ++ ++ ret = copy_from_kernel_nofault(data_buf, data, data__sz); ++ if (ret < 0) { ++ scx_ops_error("failed to read data fields (%d)", ret); ++ return ret; ++ } ++ ++ ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8, ++ &bprintf_data); ++ if (ret < 0) { ++ scx_ops_error("format preparation failed (%d)", ret); ++ return ret; ++ } ++ ++ ret = bstr_printf(line_buf, line_size, fmt, ++ bprintf_data.bin_args); ++ bpf_bprintf_cleanup(&bprintf_data); ++ if (ret < 0) { ++ 
scx_ops_error("(\"%s\", %p, %u) failed to format", ++ fmt, data, data__sz); ++ return ret; ++ } ++ ++ return ret; ++} ++ ++static s32 bstr_format(struct scx_bstr_buf *buf, ++ char *fmt, unsigned long long *data, u32 data__sz) ++{ ++ return __bstr_format(buf->data, buf->line, sizeof(buf->line), ++ fmt, data, data__sz); ++} ++ ++__bpf_kfunc_start_defs(); ++ ++/** ++ * scx_bpf_exit_bstr - Gracefully exit the BPF scheduler. ++ * @exit_code: Exit value to pass to user space via struct scx_exit_info. ++ * @fmt: error message format string ++ * @data: format string parameters packaged using ___bpf_fill() macro ++ * @data__sz: @data len, must end in '__sz' for the verifier ++ * ++ * Indicate that the BPF scheduler wants to exit gracefully, and initiate ops ++ * disabling. ++ */ ++__bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, ++ unsigned long long *data, u32 data__sz) ++{ ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); ++ if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0) ++ scx_ops_exit_kind(SCX_EXIT_UNREG_BPF, exit_code, "%s", ++ scx_exit_bstr_buf.line); ++ raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); ++} ++ ++/** ++ * scx_bpf_error_bstr - Indicate fatal error ++ * @fmt: error message format string ++ * @data: format string parameters packaged using ___bpf_fill() macro ++ * @data__sz: @data len, must end in '__sz' for the verifier ++ * ++ * Indicate that the BPF scheduler encountered a fatal error and initiate ops ++ * disabling. 
++ */ ++__bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, ++ u32 data__sz) ++{ ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); ++ if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0) ++ scx_ops_exit_kind(SCX_EXIT_ERROR_BPF, 0, "%s", ++ scx_exit_bstr_buf.line); ++ raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); ++} ++ ++/** ++ * scx_bpf_dump_bstr - Generate extra debug dump specific to the BPF scheduler ++ * @fmt: format string ++ * @data: format string parameters packaged using ___bpf_fill() macro ++ * @data__sz: @data len, must end in '__sz' for the verifier ++ * ++ * To be called through scx_bpf_dump() helper from ops.dump(), dump_cpu() and ++ * dump_task() to generate extra debug dump specific to the BPF scheduler. ++ * ++ * The extra dump may be multiple lines. A single line may be split over ++ * multiple calls. The last line is automatically terminated. ++ */ ++__bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, ++ u32 data__sz) ++{ ++ struct scx_dump_data *dd = &scx_dump_data; ++ struct scx_bstr_buf *buf = &dd->buf; ++ s32 ret; ++ ++ if (raw_smp_processor_id() != dd->cpu) { ++ scx_ops_error("scx_bpf_dump() must only be called from ops.dump() and friends"); ++ return; ++ } ++ ++ /* append the formatted string to the line buf */ ++ ret = __bstr_format(buf->data, buf->line + dd->cursor, ++ sizeof(buf->line) - dd->cursor, fmt, data, data__sz); ++ if (ret < 0) { ++ dump_line(dd->s, "%s[!] (\"%s\", %p, %u) failed to format (%d)", ++ dd->prefix, fmt, data, data__sz, ret); ++ return; ++ } ++ ++ dd->cursor += ret; ++ dd->cursor = min_t(s32, dd->cursor, sizeof(buf->line)); ++ ++ if (!dd->cursor) ++ return; ++ ++ /* ++ * If the line buf overflowed or ends in a newline, flush it into the ++ * dump. This is to allow the caller to generate a single line over ++ * multiple calls. 
As ops_dump_flush() can also handle multiple lines in ++ * the line buf, the only case which can lead to an unexpected ++ * truncation is when the caller keeps generating newlines in the middle ++ * instead of the end consecutively. Don't do that. ++ */ ++ if (dd->cursor >= sizeof(buf->line) || buf->line[dd->cursor - 1] == '\n') ++ ops_dump_flush(); ++} ++ ++/** ++ * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU ++ * @cpu: CPU of interest ++ * ++ * Return the maximum relative capacity of @cpu in relation to the most ++ * performant CPU in the system. The return value is in the range [1, ++ * %SCX_CPUPERF_ONE]. See scx_bpf_cpuperf_cur(). ++ */ ++__bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu) ++{ ++ if (ops_cpu_valid(cpu, NULL)) ++ return arch_scale_cpu_capacity(cpu); ++ else ++ return SCX_CPUPERF_ONE; ++} ++ ++/** ++ * scx_bpf_cpuperf_cur - Query the current relative performance of a CPU ++ * @cpu: CPU of interest ++ * ++ * Return the current relative performance of @cpu in relation to its maximum. ++ * The return value is in the range [1, %SCX_CPUPERF_ONE]. ++ * ++ * The current performance level of a CPU in relation to the maximum performance ++ * available in the system can be calculated as follows: ++ * ++ * scx_bpf_cpuperf_cap() * scx_bpf_cpuperf_cur() / %SCX_CPUPERF_ONE ++ * ++ * The result is in the range [1, %SCX_CPUPERF_ONE]. ++ */ ++__bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu) ++{ ++ if (ops_cpu_valid(cpu, NULL)) ++ return arch_scale_freq_capacity(cpu); ++ else ++ return SCX_CPUPERF_ONE; ++} ++ ++/** ++ * scx_bpf_cpuperf_set - Set the relative performance target of a CPU ++ * @cpu: CPU of interest ++ * @perf: target performance level [0, %SCX_CPUPERF_ONE] ++ * @flags: %SCX_CPUPERF_* flags ++ * ++ * Set the target performance level of @cpu to @perf. @perf is in linear ++ * relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the ++ * schedutil cpufreq governor chooses the target frequency. 
++ * ++ * The actual performance level chosen, CPU grouping, and the overhead and ++ * latency of the operations are dependent on the hardware and cpufreq driver in ++ * use. Consult hardware and cpufreq documentation for more information. The ++ * current performance level can be monitored using scx_bpf_cpuperf_cur(). ++ */ ++__bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf) ++{ ++ if (unlikely(perf > SCX_CPUPERF_ONE)) { ++ scx_ops_error("Invalid cpuperf target %u for CPU %d", perf, cpu); ++ return; ++ } ++ ++ if (ops_cpu_valid(cpu, NULL)) { ++ struct rq *rq = cpu_rq(cpu); ++ ++ rq->scx.cpuperf_target = perf; ++ ++ rcu_read_lock_sched_notrace(); ++ cpufreq_update_util(cpu_rq(cpu), 0); ++ rcu_read_unlock_sched_notrace(); ++ } ++} ++ ++/** ++ * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs ++ * ++ * All valid CPU IDs in the system are smaller than the returned value. ++ */ ++__bpf_kfunc u32 scx_bpf_nr_cpu_ids(void) ++{ ++ return nr_cpu_ids; ++} ++ ++/** ++ * scx_bpf_get_possible_cpumask - Get a referenced kptr to cpu_possible_mask ++ */ ++__bpf_kfunc const struct cpumask *scx_bpf_get_possible_cpumask(void) ++{ ++ return cpu_possible_mask; ++} ++ ++/** ++ * scx_bpf_get_online_cpumask - Get a referenced kptr to cpu_online_mask ++ */ ++__bpf_kfunc const struct cpumask *scx_bpf_get_online_cpumask(void) ++{ ++ return cpu_online_mask; ++} ++ ++/** ++ * scx_bpf_put_cpumask - Release a possible/online cpumask ++ * @cpumask: cpumask to release ++ */ ++__bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask) ++{ ++ /* ++ * Empty function body because we aren't actually acquiring or releasing ++ * a reference to a global cpumask, which is read-only in the caller and ++ * is never released. The acquire / release semantics here are just used ++ * to make the cpumask a trusted pointer in the caller. ++ */ ++} ++ ++/** ++ * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking ++ * per-CPU cpumask. 
++ * ++ * Returns an empty mask if idle tracking is not enabled, or running on a UP kernel. ++ */ ++__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) ++{ ++ if (!static_branch_likely(&scx_builtin_idle_enabled)) { ++ scx_ops_error("built-in idle tracking is disabled"); ++ return cpu_none_mask; ++ } ++ ++#ifdef CONFIG_SMP ++ return idle_masks.cpu; ++#else ++ return cpu_none_mask; ++#endif ++} ++ ++/** ++ * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking, ++ * per-physical-core cpumask. Can be used to determine if an entire physical ++ * core is free. ++ * ++ * Returns an empty mask if idle tracking is not enabled, or running on a UP kernel. ++ */ ++__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) ++{ ++ if (!static_branch_likely(&scx_builtin_idle_enabled)) { ++ scx_ops_error("built-in idle tracking is disabled"); ++ return cpu_none_mask; ++ } ++ ++#ifdef CONFIG_SMP ++ if (sched_smt_active()) ++ return idle_masks.smt; ++ else ++ return idle_masks.cpu; ++#else ++ return cpu_none_mask; ++#endif ++} ++ ++/** ++ * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to ++ * either the percpu, or SMT idle-tracking cpumask. ++ */ ++__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) ++{ ++ /* ++ * Empty function body because we aren't actually acquiring or releasing ++ * a reference to a global idle cpumask, which is read-only in the ++ * caller and is never released. The acquire / release semantics here ++ * are just used to make the cpumask a trusted pointer in the caller. ++ */ ++} ++ ++/** ++ * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state ++ * @cpu: cpu to test and clear idle for ++ * ++ * Returns %true if @cpu was idle and its idle state was successfully cleared. ++ * %false otherwise. ++ * ++ * Unavailable if ops.update_idle() is implemented and ++ * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. 
++ */ ++__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) ++{ ++ if (!static_branch_likely(&scx_builtin_idle_enabled)) { ++ scx_ops_error("built-in idle tracking is disabled"); ++ return false; ++ } ++ ++ if (ops_cpu_valid(cpu, NULL)) ++ return test_and_clear_cpu_idle(cpu); ++ else ++ return false; ++} ++ ++/** ++ * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu ++ * @cpus_allowed: Allowed cpumask ++ * @flags: %SCX_PICK_IDLE_CPU_* flags ++ * ++ * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu ++ * number on success. -%EBUSY if no matching cpu was found. ++ * ++ * Idle CPU tracking may race against CPU scheduling state transitions. For ++ * example, this function may return -%EBUSY as CPUs are transitioning into the ++ * idle state. If the caller then assumes that there will be dispatch events on ++ * the CPUs as they were all busy, the scheduler may end up stalling with CPUs ++ * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and ++ * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch ++ * event in the near future. ++ * ++ * Unavailable if ops.update_idle() is implemented and ++ * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. ++ */ ++__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, ++ u64 flags) ++{ ++ if (!static_branch_likely(&scx_builtin_idle_enabled)) { ++ scx_ops_error("built-in idle tracking is disabled"); ++ return -EBUSY; ++ } ++ ++ return scx_pick_idle_cpu(cpus_allowed, flags); ++} ++ ++/** ++ * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU ++ * @cpus_allowed: Allowed cpumask ++ * @flags: %SCX_PICK_IDLE_CPU_* flags ++ * ++ * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any ++ * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu ++ * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is ++ * empty. 
++ * ++ * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not ++ * set, this function can't tell which CPUs are idle and will always pick any ++ * CPU. ++ */ ++__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, ++ u64 flags) ++{ ++ s32 cpu; ++ ++ if (static_branch_likely(&scx_builtin_idle_enabled)) { ++ cpu = scx_pick_idle_cpu(cpus_allowed, flags); ++ if (cpu >= 0) ++ return cpu; ++ } ++ ++ cpu = cpumask_any_distribute(cpus_allowed); ++ if (cpu < nr_cpu_ids) ++ return cpu; ++ else ++ return -EBUSY; ++} ++ ++/** ++ * scx_bpf_task_running - Is task currently running? ++ * @p: task of interest ++ */ ++__bpf_kfunc bool scx_bpf_task_running(const struct task_struct *p) ++{ ++ return task_rq(p)->curr == p; ++} ++ ++/** ++ * scx_bpf_task_cpu - CPU a task is currently associated with ++ * @p: task of interest ++ */ ++__bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p) ++{ ++ return task_cpu(p); ++} ++ ++/** ++ * scx_bpf_cpu_rq - Fetch the rq of a CPU ++ * @cpu: CPU of the rq ++ */ ++__bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu) ++{ ++ if (!ops_cpu_valid(cpu, NULL)) ++ return NULL; ++ ++ return cpu_rq(cpu); ++} ++ ++__bpf_kfunc_end_defs(); ++ ++BTF_KFUNCS_START(scx_kfunc_ids_any) ++BTF_ID_FLAGS(func, scx_bpf_kick_cpu) ++BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) ++BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) ++BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED) ++BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL) ++BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY) ++BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS) ++BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS) ++BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS) ++BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap) ++BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur) ++BTF_ID_FLAGS(func, scx_bpf_cpuperf_set) ++BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids) ++BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE) 
++BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE) ++BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE) ++BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE) ++BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE) ++BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE) ++BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle) ++BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) ++BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) ++BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) ++BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) ++BTF_ID_FLAGS(func, scx_bpf_cpu_rq) ++BTF_KFUNCS_END(scx_kfunc_ids_any) ++ ++static const struct btf_kfunc_id_set scx_kfunc_set_any = { ++ .owner = THIS_MODULE, ++ .set = &scx_kfunc_ids_any, ++}; ++ ++static int __init scx_init(void) ++{ ++ int ret; ++ ++ /* ++ * kfunc registration can't be done from init_sched_ext_class() as ++ * register_btf_kfunc_id_set() needs most of the system to be up. ++ * ++ * Some kfuncs are context-sensitive and can only be called from ++ * specific SCX ops. They are grouped into BTF sets accordingly. ++ * Unfortunately, BPF currently doesn't have a way of enforcing such ++ * restrictions. Eventually, the verifier should be able to enforce ++ * them. For now, register them the same and make each kfunc explicitly ++ * check using scx_kf_allowed(). 
++ */ ++ if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, ++ &scx_kfunc_set_sleepable)) || ++ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, ++ &scx_kfunc_set_select_cpu)) || ++ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, ++ &scx_kfunc_set_enqueue_dispatch)) || ++ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, ++ &scx_kfunc_set_dispatch)) || ++ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, ++ &scx_kfunc_set_cpu_release)) || ++ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, ++ &scx_kfunc_set_any)) || ++ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, ++ &scx_kfunc_set_any)) || ++ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, ++ &scx_kfunc_set_any))) { ++ pr_err("sched_ext: Failed to register kfunc sets (%d)\n", ret); ++ return ret; ++ } ++ ++ ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops); ++ if (ret) { ++ pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret); ++ return ret; ++ } ++ ++ ret = register_pm_notifier(&scx_pm_notifier); ++ if (ret) { ++ pr_err("sched_ext: Failed to register PM notifier (%d)\n", ret); ++ return ret; ++ } ++ ++ scx_kset = kset_create_and_add("sched_ext", &scx_uevent_ops, kernel_kobj); ++ if (!scx_kset) { ++ pr_err("sched_ext: Failed to create /sys/kernel/sched_ext\n"); ++ return -ENOMEM; ++ } ++ ++ ret = sysfs_create_group(&scx_kset->kobj, &scx_global_attr_group); ++ if (ret < 0) { ++ pr_err("sched_ext: Failed to add global attributes\n"); ++ return ret; ++ } ++ ++ return 0; ++} ++__initcall(scx_init); +diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h +new file mode 100644 +index 000000000000..32d3a51f591a +--- /dev/null ++++ b/kernel/sched/ext.h +@@ -0,0 +1,69 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst ++ * ++ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
++ * Copyright (c) 2022 Tejun Heo <tj@kernel.org> ++ * Copyright (c) 2022 David Vernet <dvernet@meta.com> ++ */ ++#ifdef CONFIG_SCHED_CLASS_EXT ++ ++void scx_tick(struct rq *rq); ++void init_scx_entity(struct sched_ext_entity *scx); ++void scx_pre_fork(struct task_struct *p); ++int scx_fork(struct task_struct *p); ++void scx_post_fork(struct task_struct *p); ++void scx_cancel_fork(struct task_struct *p); ++bool scx_can_stop_tick(struct rq *rq); ++void scx_rq_activate(struct rq *rq); ++void scx_rq_deactivate(struct rq *rq); ++int scx_check_setscheduler(struct task_struct *p, int policy); ++bool task_should_scx(struct task_struct *p); ++void init_sched_ext_class(void); ++ ++static inline u32 scx_cpuperf_target(s32 cpu) ++{ ++ if (scx_enabled()) ++ return cpu_rq(cpu)->scx.cpuperf_target; ++ else ++ return 0; ++} ++ ++static inline bool task_on_scx(const struct task_struct *p) ++{ ++ return scx_enabled() && p->sched_class == &ext_sched_class; ++} ++ ++#ifdef CONFIG_SCHED_CORE ++bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, ++ bool in_fi); ++#endif ++ ++#else /* CONFIG_SCHED_CLASS_EXT */ ++ ++static inline void scx_tick(struct rq *rq) {} ++static inline void scx_pre_fork(struct task_struct *p) {} ++static inline int scx_fork(struct task_struct *p) { return 0; } ++static inline void scx_post_fork(struct task_struct *p) {} ++static inline void scx_cancel_fork(struct task_struct *p) {} ++static inline u32 scx_cpuperf_target(s32 cpu) { return 0; } ++static inline bool scx_can_stop_tick(struct rq *rq) { return true; } ++static inline void scx_rq_activate(struct rq *rq) {} ++static inline void scx_rq_deactivate(struct rq *rq) {} ++static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; } ++static inline bool task_on_scx(const struct task_struct *p) { return false; } ++static inline void init_sched_ext_class(void) {} ++ ++#endif /* CONFIG_SCHED_CLASS_EXT */ ++ ++#if defined(CONFIG_SCHED_CLASS_EXT) && 
defined(CONFIG_SMP) ++void __scx_update_idle(struct rq *rq, bool idle); ++ ++static inline void scx_update_idle(struct rq *rq, bool idle) ++{ ++ if (scx_enabled()) ++ __scx_update_idle(rq, idle); ++} ++#else ++static inline void scx_update_idle(struct rq *rq, bool idle) {} ++#endif +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 213c94d027a4..ee4fe81ba456 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -4074,6 +4074,17 @@ + load->inv_weight = sched_prio_to_wmult[prio]; + } + ++static void reweight_task_fair(struct rq *rq, struct task_struct *p, ++ const struct load_weight *lw) ++{ ++ struct sched_entity *se = &p->se; ++ struct cfs_rq *cfs_rq = cfs_rq_of(se); ++ struct load_weight *load = &se->load; ++ ++ reweight_entity(cfs_rq, se, lw->weight); ++ load->inv_weight = lw->inv_weight; ++} ++ + static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); + + #ifdef CONFIG_FAIR_GROUP_SCHED +@@ -8348,7 +8348,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int + * Batch and idle tasks do not preempt non-idle tasks (their preemption + * is driven by the tick): + */ +- if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION)) ++ if (unlikely(!normal_policy(p->policy)) || !sched_feat(WAKEUP_PREEMPTION)) + return; + + find_matching_se(&se, &pse); +@@ -9309,28 +9309,18 @@ static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) { + + static bool __update_blocked_others(struct rq *rq, bool *done) + { +- const struct sched_class *curr_class; +- u64 now = rq_clock_pelt(rq); +- unsigned long thermal_pressure; +- bool decayed; ++ bool updated; + + /* + * update_load_avg() can call cpufreq_update_util(). Make sure that RT, + * DL and IRQ signals have been updated before updating CFS. 
+ */ +- curr_class = rq->curr->sched_class; +- +- thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); +- +- decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | +- update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | +- update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) | +- update_irq_load_avg(rq, 0); ++ updated = update_other_load_avgs(rq); + + if (others_have_blocked(rq)) + *done = false; + +- return decayed; ++ return updated; + } + + #ifdef CONFIG_FAIR_GROUP_SCHED +@@ -13156,6 +13146,7 @@ DEFINE_SCHED_CLASS(fair) = { + .task_tick = task_tick_fair, + .task_fork = task_fork_fair, + ++ .reweight_task = reweight_task_fair, + .prio_changed = prio_changed_fair, + .switched_from = switched_from_fair, + .switched_to = switched_to_fair, +diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c +index 6135fbe83d68..3b6540cc436a 100644 +--- a/kernel/sched/idle.c ++++ b/kernel/sched/idle.c +@@ -458,11 +458,13 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags) + + static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) + { ++ scx_update_idle(rq, false); + } + + static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first) + { + update_idle_core(rq); ++ scx_update_idle(rq, true); + schedstat_inc(rq->sched_goidle); + } + +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index ae50f212775e..9ce5074e8a8d 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -173,9 +173,19 @@ static inline int idle_policy(int policy) + { + return policy == SCHED_IDLE; + } ++ ++static inline int normal_policy(int policy) ++{ ++#ifdef CONFIG_SCHED_CLASS_EXT ++ if (policy == SCHED_EXT) ++ return true; ++#endif ++ return policy == SCHED_NORMAL; ++} ++ + static inline int fair_policy(int policy) + { +- return policy == SCHED_NORMAL || policy == SCHED_BATCH; ++ return normal_policy(policy) || policy == SCHED_BATCH; + } + + static inline int rt_policy(int 
policy) +@@ -223,6 +233,24 @@ static inline void update_avg(u64 *avg, u64 sample) + #define shr_bound(val, shift) \ + (val >> min_t(typeof(shift), shift, BITS_PER_TYPE(typeof(val)) - 1)) + ++/* ++ * cgroup weight knobs should use the common MIN, DFL and MAX values which are ++ * 1, 100 and 10000 respectively. While it loses a bit of range on both ends, it ++ * maps pretty well onto the shares value used by scheduler and the round-trip ++ * conversions preserve the original value over the entire range. ++ */ ++static inline unsigned long sched_weight_from_cgroup(unsigned long cgrp_weight) ++{ ++ return DIV_ROUND_CLOSEST_ULL(cgrp_weight * 1024, CGROUP_WEIGHT_DFL); ++} ++ ++static inline unsigned long sched_weight_to_cgroup(unsigned long weight) ++{ ++ return clamp_t(unsigned long, ++ DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024), ++ CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX); ++} ++ + /* + * !! For sched_setattr_nocheck() (kernel) only !! + * +@@ -461,6 +489,11 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) + return walk_tg_tree_from(&root_task_group, down, up, data); + } + ++static inline struct task_group *css_tg(struct cgroup_subsys_state *css) ++{ ++ return css ? container_of(css, struct task_group, css) : NULL; ++} ++ + extern int tg_nop(struct task_group *tg, void *data); + + #ifdef CONFIG_FAIR_GROUP_SCHED +@@ -569,6 +602,12 @@ do { \ + # define u64_u32_load(var) u64_u32_load_copy(var, var##_copy) + # define u64_u32_store(var, val) u64_u32_store_copy(var, var##_copy, val) + ++struct rq; ++struct balance_callback { ++ struct balance_callback *next; ++ void (*func)(struct rq *rq); ++}; ++ + /* CFS-related fields in a runqueue */ + struct cfs_rq { + struct load_weight load; +@@ -677,6 +716,42 @@ struct cfs_rq { + #endif /* CONFIG_FAIR_GROUP_SCHED */ + }; + ++#ifdef CONFIG_SCHED_CLASS_EXT ++/* scx_rq->flags, protected by the rq lock */ ++enum scx_rq_flags { ++ /* ++ * A hotplugged CPU starts scheduling before rq_online_scx(). 
Track ++ * ops.cpu_on/offline() state so that ops.enqueue/dispatch() are called ++ * only while the BPF scheduler considers the CPU to be online. ++ */ ++ SCX_RQ_ONLINE = 1 << 0, ++ SCX_RQ_CAN_STOP_TICK = 1 << 1, ++ ++ SCX_RQ_IN_WAKEUP = 1 << 16, ++ SCX_RQ_IN_BALANCE = 1 << 17, ++}; ++ ++struct scx_rq { ++ struct scx_dispatch_q local_dsq; ++ struct list_head runnable_list; /* runnable tasks on this rq */ ++ struct list_head ddsp_deferred_locals; /* deferred ddsps from enq */ ++ unsigned long ops_qseq; ++ u64 extra_enq_flags; /* see move_task_to_local_dsq() */ ++ u32 nr_running; ++ u32 flags; ++ u32 cpuperf_target; /* [0, SCHED_CAPACITY_SCALE] */ ++ bool cpu_released; ++ cpumask_var_t cpus_to_kick; ++ cpumask_var_t cpus_to_kick_if_idle; ++ cpumask_var_t cpus_to_preempt; ++ cpumask_var_t cpus_to_wait; ++ unsigned long pnt_seq; ++ struct balance_callback deferred_bal_cb; ++ struct irq_work deferred_irq_work; ++ struct irq_work kick_cpus_irq_work; ++}; ++#endif /* CONFIG_SCHED_CLASS_EXT */ ++ + static inline int rt_bandwidth_enabled(void) + { + return sysctl_sched_rt_runtime >= 0; +@@ -969,12 +1044,6 @@ struct uclamp_rq { + DECLARE_STATIC_KEY_FALSE(sched_uclamp_used); + #endif /* CONFIG_UCLAMP_TASK */ + +-struct rq; +-struct balance_callback { +- struct balance_callback *next; +- void (*func)(struct rq *rq); +-}; +- + /* + * This is the main, per-CPU runqueue data structure. 
+ * +@@ -1017,6 +1086,9 @@ struct rq { + struct cfs_rq cfs; + struct rt_rq rt; + struct dl_rq dl; ++#ifdef CONFIG_SCHED_CLASS_EXT ++ struct scx_rq scx; ++#endif + + #ifdef CONFIG_FAIR_GROUP_SCHED + /* list of leaf cfs_rq on this CPU: */ +@@ -2276,6 +2348,8 @@ struct sched_class { + void (*put_prev_task)(struct rq *rq, struct task_struct *p); + void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first); + ++ void (*switch_class)(struct rq *rq, struct task_struct *next); ++ + #ifdef CONFIG_SMP + int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); + int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags); +@@ -2303,8 +2377,11 @@ struct sched_class { + * cannot assume the switched_from/switched_to pair is serialized by + * rq->lock. They are however serialized by p->pi_lock. + */ ++ void (*switching_to) (struct rq *this_rq, struct task_struct *task); + void (*switched_from)(struct rq *this_rq, struct task_struct *task); + void (*switched_to) (struct rq *this_rq, struct task_struct *task); ++ void (*reweight_task)(struct rq *this_rq, struct task_struct *task, ++ const struct load_weight *lw); + void (*prio_changed) (struct rq *this_rq, struct task_struct *task, + int oldprio); + +@@ -2353,19 +2430,54 @@ const struct sched_class name##_sched_class \ + extern struct sched_class __sched_class_highest[]; + extern struct sched_class __sched_class_lowest[]; + ++extern const struct sched_class stop_sched_class; ++extern const struct sched_class dl_sched_class; ++extern const struct sched_class rt_sched_class; ++extern const struct sched_class fair_sched_class; ++extern const struct sched_class idle_sched_class; ++ ++#ifdef CONFIG_SCHED_CLASS_EXT ++extern const struct sched_class ext_sched_class; ++ ++DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled); /* SCX BPF scheduler loaded */ ++DECLARE_STATIC_KEY_FALSE(__scx_switched_all); /* all fair class tasks on SCX */ ++ ++#define scx_enabled() static_branch_unlikely(&__scx_ops_enabled) 
++#define scx_switched_all() static_branch_unlikely(&__scx_switched_all) ++#else /* !CONFIG_SCHED_CLASS_EXT */ ++#define scx_enabled() false ++#define scx_switched_all() false ++#endif /* !CONFIG_SCHED_CLASS_EXT */ ++ ++/* ++ * Iterate only active classes. SCX can take over all fair tasks or be ++ * completely disabled. If the former, skip fair. If the latter, skip SCX. ++ */ ++static inline const struct sched_class *next_active_class(const struct sched_class *class) ++{ ++ class++; ++#ifdef CONFIG_SCHED_CLASS_EXT ++ if (scx_switched_all() && class == &fair_sched_class) ++ class++; ++ if (!scx_enabled() && class == &ext_sched_class) ++ class++; ++#endif ++ return class; ++} ++ + #define for_class_range(class, _from, _to) \ + for (class = (_from); class < (_to); class++) + + #define for_each_class(class) \ + for_class_range(class, __sched_class_highest, __sched_class_lowest) + +-#define sched_class_above(_a, _b) ((_a) < (_b)) ++#define for_active_class_range(class, _from, _to) \ ++ for (class = (_from); class != (_to); class = next_active_class(class)) + +-extern const struct sched_class stop_sched_class; +-extern const struct sched_class dl_sched_class; +-extern const struct sched_class rt_sched_class; +-extern const struct sched_class fair_sched_class; +-extern const struct sched_class idle_sched_class; ++#define for_each_active_class(class) \ ++ for_active_class_range(class, __sched_class_highest, __sched_class_lowest) ++ ++#define sched_class_above(_a, _b) ((_a) < (_b)) + + static inline bool sched_stop_runnable(struct rq *rq) + { +@@ -2462,7 +2574,7 @@ extern void init_sched_dl_class(void); + extern void init_sched_rt_class(void); + extern void init_sched_fair_class(void); + +-extern void reweight_task(struct task_struct *p, int prio); ++extern void __setscheduler_prio(struct task_struct *p, int prio); + + extern void resched_curr(struct rq *rq); + extern void resched_cpu(int cpu); +@@ -2542,6 +2654,12 @@ static inline void sub_nr_running(struct rq *rq, 
unsigned count) + extern void activate_task(struct rq *rq, struct task_struct *p, int flags); + extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); + ++extern void check_class_changing(struct rq *rq, struct task_struct *p, ++ const struct sched_class *prev_class); ++extern void check_class_changed(struct rq *rq, struct task_struct *p, ++ const struct sched_class *prev_class, ++ int oldprio); ++ + extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); + + #ifdef CONFIG_PREEMPT_RT +@@ -3007,6 +3125,9 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} + #endif + + #ifdef CONFIG_SMP ++ ++bool update_other_load_avgs(struct rq *rq); ++ + unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, + unsigned long *min, + unsigned long *max); +@@ -3049,6 +3170,8 @@ static inline unsigned long cpu_util_rt(struct rq *rq) + { + return READ_ONCE(rq->avg_rt.util_avg); + } ++#else /* !CONFIG_SMP */ ++static inline bool update_other_load_avgs(struct rq *rq) { return false; } + #endif + + #ifdef CONFIG_UCLAMP_TASK +@@ -3481,4 +3604,24 @@ static inline void init_sched_mm_cid(struct task_struct *t) { } + extern u64 avg_vruntime(struct cfs_rq *cfs_rq); + extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); + ++#ifdef CONFIG_SCHED_CLASS_EXT ++/* ++ * Used by SCX in the enable/disable paths to move tasks between sched_classes ++ * and establish invariants. 
++ */ ++struct sched_enq_and_set_ctx { ++ struct task_struct *p; ++ int queue_flags; ++ bool queued; ++ bool running; ++}; ++ ++void sched_deq_and_put_task(struct task_struct *p, int queue_flags, ++ struct sched_enq_and_set_ctx *ctx); ++void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx); ++ ++#endif /* CONFIG_SCHED_CLASS_EXT */ ++ ++#include "ext.h" ++ + #endif /* _KERNEL_SCHED_SCHED_H */ +diff --git a/lib/dump_stack.c b/lib/dump_stack.c +index 222c6d6c8281..9581ef4efec5 100644 +--- a/lib/dump_stack.c ++++ b/lib/dump_stack.c +@@ -68,6 +68,7 @@ void dump_stack_print_info(const char *log_lvl) + + print_worker_info(log_lvl, current); + print_stop_info(log_lvl, current); ++ print_scx_info(log_lvl, current); + } + + /** +diff --git a/tools/Makefile b/tools/Makefile +index 276f5d0d53a4..278d24723b74 100644 +--- a/tools/Makefile ++++ b/tools/Makefile +@@ -28,6 +28,7 @@ help: + @echo ' pci - PCI tools' + @echo ' perf - Linux performance measurement and analysis tool' + @echo ' selftests - various kernel selftests' ++ @echo ' sched_ext - sched_ext example schedulers' + @echo ' bootconfig - boot config tool' + @echo ' spi - spi tools' + @echo ' tmon - thermal monitoring and tuning tool' +@@ -91,6 +92,9 @@ perf: FORCE + $(Q)mkdir -p $(PERF_O) . + $(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir= + ++sched_ext: FORCE ++ $(call descend,sched_ext) ++ + selftests: FORCE + $(call descend,testing/$@) + +@@ -184,6 +188,9 @@ perf_clean: + $(Q)mkdir -p $(PERF_O) . 
+ $(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir= clean + ++sched_ext_clean: ++ $(call descend,sched_ext,clean) ++ + selftests_clean: + $(call descend,testing/$(@:_clean=),clean) + +@@ -213,6 +220,7 @@ clean: acpi_clean counter_clean cpupower_clean hv_clean firewire_clean \ + mm_clean bpf_clean iio_clean x86_energy_perf_policy_clean tmon_clean \ + freefall_clean build_clean libbpf_clean libsubcmd_clean \ + gpio_clean objtool_clean leds_clean wmi_clean pci_clean firmware_clean debugging_clean \ +- intel-speed-select_clean tracing_clean thermal_clean thermometer_clean thermal-engine_clean ++ intel-speed-select_clean tracing_clean thermal_clean thermometer_clean thermal-engine_clean \ ++ sched_ext_clean + + .PHONY: FORCE +diff --git a/tools/sched_ext/.gitignore b/tools/sched_ext/.gitignore +new file mode 100644 +index 000000000000..d6264fe1c8cd +--- /dev/null ++++ b/tools/sched_ext/.gitignore +@@ -0,0 +1,2 @@ ++tools/ ++build/ +diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile +new file mode 100644 +index 000000000000..bf7e108f5ae1 +--- /dev/null ++++ b/tools/sched_ext/Makefile +@@ -0,0 +1,246 @@ ++# SPDX-License-Identifier: GPL-2.0 ++# Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
++include ../build/Build.include ++include ../scripts/Makefile.arch ++include ../scripts/Makefile.include ++ ++all: all_targets ++ ++ifneq ($(LLVM),) ++ifneq ($(filter %/,$(LLVM)),) ++LLVM_PREFIX := $(LLVM) ++else ifneq ($(filter -%,$(LLVM)),) ++LLVM_SUFFIX := $(LLVM) ++endif ++ ++CLANG_TARGET_FLAGS_arm := arm-linux-gnueabi ++CLANG_TARGET_FLAGS_arm64 := aarch64-linux-gnu ++CLANG_TARGET_FLAGS_hexagon := hexagon-linux-musl ++CLANG_TARGET_FLAGS_m68k := m68k-linux-gnu ++CLANG_TARGET_FLAGS_mips := mipsel-linux-gnu ++CLANG_TARGET_FLAGS_powerpc := powerpc64le-linux-gnu ++CLANG_TARGET_FLAGS_riscv := riscv64-linux-gnu ++CLANG_TARGET_FLAGS_s390 := s390x-linux-gnu ++CLANG_TARGET_FLAGS_x86 := x86_64-linux-gnu ++CLANG_TARGET_FLAGS := $(CLANG_TARGET_FLAGS_$(ARCH)) ++ ++ifeq ($(CROSS_COMPILE),) ++ifeq ($(CLANG_TARGET_FLAGS),) ++$(error Specify CROSS_COMPILE or add '--target=' option to lib.mk) ++else ++CLANG_FLAGS += --target=$(CLANG_TARGET_FLAGS) ++endif # CLANG_TARGET_FLAGS ++else ++CLANG_FLAGS += --target=$(notdir $(CROSS_COMPILE:%-=%)) ++endif # CROSS_COMPILE ++ ++CC := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) $(CLANG_FLAGS) -fintegrated-as ++else ++CC := $(CROSS_COMPILE)gcc ++endif # LLVM ++ ++CURDIR := $(abspath .) ++TOOLSDIR := $(abspath ..) 
++LIBDIR := $(TOOLSDIR)/lib ++BPFDIR := $(LIBDIR)/bpf ++TOOLSINCDIR := $(TOOLSDIR)/include ++BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool ++APIDIR := $(TOOLSINCDIR)/uapi ++GENDIR := $(abspath ../../include/generated) ++GENHDR := $(GENDIR)/autoconf.h ++ ++ifeq ($(O),) ++OUTPUT_DIR := $(CURDIR)/build ++else ++OUTPUT_DIR := $(O)/build ++endif # O ++OBJ_DIR := $(OUTPUT_DIR)/obj ++INCLUDE_DIR := $(OUTPUT_DIR)/include ++BPFOBJ_DIR := $(OBJ_DIR)/libbpf ++SCXOBJ_DIR := $(OBJ_DIR)/sched_ext ++BINDIR := $(OUTPUT_DIR)/bin ++BPFOBJ := $(BPFOBJ_DIR)/libbpf.a ++ifneq ($(CROSS_COMPILE),) ++HOST_BUILD_DIR := $(OBJ_DIR)/host ++HOST_OUTPUT_DIR := host-tools ++HOST_INCLUDE_DIR := $(HOST_OUTPUT_DIR)/include ++else ++HOST_BUILD_DIR := $(OBJ_DIR) ++HOST_OUTPUT_DIR := $(OUTPUT_DIR) ++HOST_INCLUDE_DIR := $(INCLUDE_DIR) ++endif ++HOST_BPFOBJ := $(HOST_BUILD_DIR)/libbpf/libbpf.a ++RESOLVE_BTFIDS := $(HOST_BUILD_DIR)/resolve_btfids/resolve_btfids ++DEFAULT_BPFTOOL := $(HOST_OUTPUT_DIR)/sbin/bpftool ++ ++VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \ ++ $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \ ++ ../../vmlinux \ ++ /sys/kernel/btf/vmlinux \ ++ /boot/vmlinux-$(shell uname -r) ++VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) ++ifeq ($(VMLINUX_BTF),) ++$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)") ++endif ++ ++BPFTOOL ?= $(DEFAULT_BPFTOOL) ++ ++ifneq ($(wildcard $(GENHDR)),) ++ GENFLAGS := -DHAVE_GENHDR ++endif ++ ++CFLAGS += -g -O2 -rdynamic -pthread -Wall -Werror $(GENFLAGS) \ ++ -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ ++ -I$(TOOLSINCDIR) -I$(APIDIR) -I$(CURDIR)/include ++ ++# Silence some warnings when compiled with clang ++ifneq ($(LLVM),) ++CFLAGS += -Wno-unused-command-line-argument ++endif ++ ++LDFLAGS = -lelf -lz -lpthread ++ ++IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - </dev/null | \ ++ grep 'define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__') ++ ++# Get Clang's default includes on this system, as opposed to those seen by ++# 
'-target bpf'. This fixes "missing" files on some architectures/distros, ++# such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc. ++# ++# Use '-idirafter': Don't interfere with include mechanics except where the ++# build would have failed anyways. ++define get_sys_includes ++$(shell $(1) -v -E - </dev/null 2>&1 \ ++ | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ ++$(shell $(1) -dM -E - </dev/null | grep '__riscv_xlen ' | awk '{printf("-D__riscv_xlen=%d -D__BITS_PER_LONG=%d", $$3, $$3)}') ++endef ++ ++BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) \ ++ $(if $(IS_LITTLE_ENDIAN),-mlittle-endian,-mbig-endian) \ ++ -I$(CURDIR)/include -I$(CURDIR)/include/bpf-compat \ ++ -I$(INCLUDE_DIR) -I$(APIDIR) \ ++ -I../../include \ ++ $(call get_sys_includes,$(CLANG)) \ ++ -Wall -Wno-compare-distinct-pointer-types \ ++ -O2 -mcpu=v3 ++ ++# sort removes libbpf duplicates when not cross-building ++MAKE_DIRS := $(sort $(OBJ_DIR)/libbpf $(HOST_BUILD_DIR)/libbpf \ ++ $(HOST_BUILD_DIR)/bpftool $(HOST_BUILD_DIR)/resolve_btfids \ ++ $(INCLUDE_DIR) $(SCXOBJ_DIR) $(BINDIR)) ++ ++$(MAKE_DIRS): ++ $(call msg,MKDIR,,$@) ++ $(Q)mkdir -p $@ ++ ++$(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ ++ $(APIDIR)/linux/bpf.h \ ++ | $(OBJ_DIR)/libbpf ++ $(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) OUTPUT=$(OBJ_DIR)/libbpf/ \ ++ EXTRA_CFLAGS='-g -O0 -fPIC' \ ++ DESTDIR=$(OUTPUT_DIR) prefix= all install_headers ++ ++$(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ ++ $(HOST_BPFOBJ) | $(HOST_BUILD_DIR)/bpftool ++ $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOLDIR) \ ++ ARCH= CROSS_COMPILE= CC=$(HOSTCC) LD=$(HOSTLD) \ ++ EXTRA_CFLAGS='-g -O0' \ ++ OUTPUT=$(HOST_BUILD_DIR)/bpftool/ \ ++ LIBBPF_OUTPUT=$(HOST_BUILD_DIR)/libbpf/ \ ++ LIBBPF_DESTDIR=$(HOST_OUTPUT_DIR)/ \ ++ prefix= DESTDIR=$(HOST_OUTPUT_DIR)/ install-bin ++ ++$(INCLUDE_DIR)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) | $(INCLUDE_DIR) ++ifeq ($(VMLINUX_H),) ++ 
$(call msg,GEN,,$@) ++ $(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $@ ++else ++ $(call msg,CP,,$@) ++ $(Q)cp "$(VMLINUX_H)" $@ ++endif ++ ++$(SCXOBJ_DIR)/%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h include/scx/*.h \ ++ | $(BPFOBJ) $(SCXOBJ_DIR) ++ $(call msg,CLNG-BPF,,$(notdir $@)) ++ $(Q)$(CLANG) $(BPF_CFLAGS) -target bpf -c $< -o $@ ++ ++$(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BPFTOOL) ++ $(eval sched=$(notdir $@)) ++ $(call msg,GEN-SKEL,,$(sched)) ++ $(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $< ++ $(Q)$(BPFTOOL) gen object $(<:.o=.linked2.o) $(<:.o=.linked1.o) ++ $(Q)$(BPFTOOL) gen object $(<:.o=.linked3.o) $(<:.o=.linked2.o) ++ $(Q)diff $(<:.o=.linked2.o) $(<:.o=.linked3.o) ++ $(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $@ ++ $(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $(@:.skel.h=.subskel.h) ++ ++SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR) ++ ++c-sched-targets = scx_simple scx_qmap scx_central ++ ++$(addprefix $(BINDIR)/,$(c-sched-targets)): \ ++ $(BINDIR)/%: \ ++ $(filter-out %.bpf.c,%.c) \ ++ $(INCLUDE_DIR)/%.bpf.skel.h \ ++ $(SCX_COMMON_DEPS) ++ $(eval sched=$(notdir $@)) ++ $(CC) $(CFLAGS) -c $(sched).c -o $(SCXOBJ_DIR)/$(sched).o ++ $(CC) -o $@ $(SCXOBJ_DIR)/$(sched).o $(HOST_BPFOBJ) $(LDFLAGS) ++ ++$(c-sched-targets): %: $(BINDIR)/% ++ ++install: all ++ $(Q)mkdir -p $(DESTDIR)/usr/local/bin/ ++ $(Q)cp $(BINDIR)/* $(DESTDIR)/usr/local/bin/ ++ ++clean: ++ rm -rf $(OUTPUT_DIR) $(HOST_OUTPUT_DIR) ++ rm -f *.o *.bpf.o *.bpf.skel.h *.bpf.subskel.h ++ rm -f $(c-sched-targets) ++ ++help: ++ @echo 'Building targets' ++ @echo '================' ++ @echo '' ++ @echo ' all - Compile all schedulers' ++ @echo '' ++ @echo 'Alternatively, you may compile individual schedulers:' ++ @echo '' ++ @printf ' %s\n' $(c-sched-targets) ++ @echo '' ++ @echo 'For any scheduler build target, you may specify an 
alternative' ++ @echo 'build output path with the O= environment variable. For example:' ++ @echo '' ++ @echo ' O=/tmp/sched_ext make all' ++ @echo '' ++ @echo 'will compile all schedulers, and emit the build artifacts to' ++ @echo '/tmp/sched_ext/build.' ++ @echo '' ++ @echo '' ++ @echo 'Installing targets' ++ @echo '==================' ++ @echo '' ++ @echo ' install - Compile and install all schedulers to /usr/bin.' ++ @echo ' You may specify the DESTDIR= environment variable' ++ @echo ' to indicate a prefix for /usr/bin. For example:' ++ @echo '' ++ @echo ' DESTDIR=/tmp/sched_ext make install' ++ @echo '' ++ @echo ' will build the schedulers in CWD/build, and' ++ @echo ' install the schedulers to /tmp/sched_ext/usr/bin.' ++ @echo '' ++ @echo '' ++ @echo 'Cleaning targets' ++ @echo '================' ++ @echo '' ++ @echo ' clean - Remove all generated files' ++ ++all_targets: $(c-sched-targets) ++ ++.PHONY: all all_targets $(c-sched-targets) clean help ++ ++# delete failed targets ++.DELETE_ON_ERROR: ++ ++# keep intermediate (.bpf.skel.h, .bpf.o, etc) targets ++.SECONDARY: +diff --git a/tools/sched_ext/README.md b/tools/sched_ext/README.md +new file mode 100644 +index 000000000000..8efe70cc4363 +--- /dev/null ++++ b/tools/sched_ext/README.md +@@ -0,0 +1,258 @@ ++SCHED_EXT EXAMPLE SCHEDULERS ++============================ ++ ++# Introduction ++ ++This directory contains a number of example sched_ext schedulers. These ++schedulers are meant to provide examples of different types of schedulers ++that can be built using sched_ext, and illustrate how various features of ++sched_ext can be used. ++ ++Some of the examples are performant, production-ready schedulers. That is, for ++the correct workload and with the correct tuning, they may be deployed in a ++production environment with acceptable or possibly even improved performance. ++Others are just examples that in practice, would not provide acceptable ++performance (though they could be improved to get there). 
++ ++This README will describe these example schedulers, including describing the ++types of workloads or scenarios they're designed to accommodate, and whether or ++not they're production ready. For more details on any of these schedulers, ++please see the header comment in their .bpf.c file. ++ ++ ++# Compiling the examples ++ ++There are a few toolchain dependencies for compiling the example schedulers. ++ ++## Toolchain dependencies ++ ++1. clang >= 16.0.0 ++ ++The schedulers are BPF programs, and therefore must be compiled with clang. gcc ++is actively working on adding a BPF backend compiler as well, but is still ++missing some features such as BTF type tags which are necessary for using ++kptrs. ++ ++2. pahole >= 1.25 ++ ++You may need pahole in order to generate BTF from DWARF. ++ ++3. rust >= 1.70.0 ++ ++Rust schedulers use features present in the rust toolchain >= 1.70.0. You ++should be able to use the stable build from rustup, but if that doesn't ++work, try using the rustup nightly build. ++ ++There are other requirements as well, such as make, but these are the main / ++non-trivial ones. ++ ++## Compiling the kernel ++ ++In order to run a sched_ext scheduler, you'll have to run a kernel compiled ++with the patches in this repository, and with a minimum set of necessary ++Kconfig options: ++ ++``` ++CONFIG_BPF=y ++CONFIG_SCHED_CLASS_EXT=y ++CONFIG_BPF_SYSCALL=y ++CONFIG_BPF_JIT=y ++CONFIG_DEBUG_INFO_BTF=y ++``` ++ ++It's also recommended that you include the following Kconfig options: ++ ++``` ++CONFIG_BPF_JIT_ALWAYS_ON=y ++CONFIG_BPF_JIT_DEFAULT_ON=y ++CONFIG_PAHOLE_HAS_SPLIT_BTF=y ++CONFIG_PAHOLE_HAS_BTF_TAG=y ++``` ++ ++There is a `Kconfig` file in this directory whose contents you can append to ++your local `.config` file, as long as there are no conflicts with any existing ++options in the file. ++ ++## Getting a vmlinux.h file ++ ++You may notice that most of the example schedulers include a "vmlinux.h" file. 
++This is a large, auto-generated header file that contains all of the types ++defined in some vmlinux binary that was compiled with ++[BTF](https://docs.kernel.org/bpf/btf.html) (i.e. with the BTF-related Kconfig ++options specified above). ++ ++The header file is created using `bpftool`, by passing it a vmlinux binary ++compiled with BTF as follows: ++ ++```bash ++$ bpftool btf dump file /path/to/vmlinux format c > vmlinux.h ++``` ++ ++`bpftool` analyzes all of the BTF encodings in the binary, and produces a ++header file that can be included by BPF programs to access those types. For ++example, using vmlinux.h allows a scheduler to access fields defined directly ++in vmlinux as follows: ++ ++```c ++#include "vmlinux.h" ++// vmlinux.h is also implicitly included by scx/common.bpf.h. ++#include <scx/common.bpf.h> ++ ++/* ++ * vmlinux.h provides definitions for struct task_struct and ++ * struct scx_enable_args. ++ */ ++void BPF_STRUCT_OPS(example_enable, struct task_struct *p, ++ struct scx_enable_args *args) ++{ ++ bpf_printk("Task %s enabled in example scheduler", p->comm); ++} ++ ++// vmlinux.h provides the definition for struct sched_ext_ops. ++SEC(".struct_ops.link") ++struct sched_ext_ops example_ops = { ++ .enable = (void *)example_enable, ++ .name = "example", ++}; ++``` ++ ++The scheduler build system will generate this vmlinux.h file as part of the ++scheduler build pipeline. It looks for a vmlinux file in the following ++dependency order: ++ ++1. If the O= environment variable is defined, at `$O/vmlinux` ++2. If the KBUILD_OUTPUT= environment variable is defined, at ++ `$KBUILD_OUTPUT/vmlinux` ++3. At `../../vmlinux` (i.e. at the root of the kernel tree where you're ++ compiling the schedulers) ++4. `/sys/kernel/btf/vmlinux` ++5. `/boot/vmlinux-$(uname -r)` ++ ++In other words, if you have compiled a kernel in your local repo, its vmlinux ++file will be used to generate vmlinux.h. 
Otherwise, it will be the vmlinux of ++the kernel you're currently running on. This means that if you're running on a ++kernel with sched_ext support, you may not need to compile a local kernel at ++all. ++ ++### Aside on CO-RE ++ ++One of the cooler features of BPF is that it supports ++[CO-RE](https://nakryiko.com/posts/bpf-core-reference-guide/) (Compile Once Run ++Everywhere). This feature allows you to reference fields inside of structs with ++types defined internal to the kernel, and not have to recompile if you load the ++BPF program on a different kernel with the field at a different offset. In our ++example above, we print out a task name with `p->comm`. CO-RE would perform ++relocations for that access when the program is loaded to ensure that it's ++referencing the correct offset for the currently running kernel. ++ ++## Compiling the schedulers ++ ++Once you have your toolchain set up, and a vmlinux that can be used to generate ++a full vmlinux.h file, you can compile the schedulers using `make`: ++ ++```bash ++$ make -j$(nproc) ++``` ++ ++# Example schedulers ++ ++This directory contains the following example schedulers. These schedulers are ++for testing and demonstrating different aspects of sched_ext. While some may be ++useful in limited scenarios, they are not intended to be practical. ++ ++For more scheduler implementations, tools and documentation, visit ++https://github.com/sched-ext/scx. ++ ++## scx_simple ++ ++A simple scheduler that provides an example of a minimal sched_ext scheduler. ++scx_simple can be run in either global weighted vtime mode, or FIFO mode. ++ ++Though very simple, in limited scenarios, this scheduler can perform reasonably ++well on single-socket systems with a unified L3 cache. ++ ++## scx_qmap ++ ++Another simple, yet slightly more complex scheduler that provides an example of ++a basic weighted FIFO queuing policy. 
It also provides examples of some common ++useful BPF features, such as sleepable per-task storage allocation in the ++`ops.prep_enable()` callback, and using the `BPF_MAP_TYPE_QUEUE` map type to ++enqueue tasks. It also illustrates how core-sched support could be implemented. ++ ++## scx_central ++ ++A "central" scheduler where scheduling decisions are made from a single CPU. ++This scheduler illustrates how scheduling decisions can be dispatched from a ++single CPU, allowing other cores to run with infinite slices, without timer ++ticks, and without having to incur the overhead of making scheduling decisions. ++ ++The approach demonstrated by this scheduler may be useful for any workload that ++benefits from minimizing scheduling overhead and timer ticks. An example of ++where this could be particularly useful is running VMs, where running with ++infinite slices and no timer ticks allows the VM to avoid unnecessary expensive ++vmexits. ++ ++ ++# Troubleshooting ++ ++There are a number of common issues that you may run into when building the ++schedulers. We'll go over some of the common ones here. ++ ++## Build Failures ++ ++### Old version of clang ++ ++``` ++error: static assertion failed due to requirement 'SCX_DSQ_FLAG_BUILTIN': bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole ++ _Static_assert(SCX_DSQ_FLAG_BUILTIN, ++ ^~~~~~~~~~~~~~~~~~~~ ++1 error generated. ++``` ++ ++This means you built the kernel or the schedulers with an older version of ++clang than what's supported (i.e. older than 16.0.0). To remediate this: ++ ++1. `which clang` to make sure you're using a sufficiently new version of clang. ++ ++2. `make fullclean` in the root path of the repository, and rebuild the kernel ++ and schedulers. ++ ++3. Rebuild the kernel, and then your example schedulers. ++ ++The schedulers are also cleaned if you invoke `make mrproper` in the root ++directory of the tree. 
++ ++### Stale kernel build / incomplete vmlinux.h file ++ ++As described above, you'll need a `vmlinux.h` file that was generated from a ++vmlinux built with BTF, and with sched_ext support enabled. If you don't, ++you'll see errors such as the following which indicate that a type being ++referenced in a scheduler is unknown: ++ ++``` ++/path/to/sched_ext/tools/sched_ext/user_exit_info.h:25:23: note: forward declaration of 'struct scx_exit_info' ++ ++const struct scx_exit_info *ei) ++ ++^ ++``` ++ ++In order to resolve this, please follow the steps above in ++[Getting a vmlinux.h file](#getting-a-vmlinuxh-file) in order to ensure your ++schedulers are using a vmlinux.h file that includes the requisite types. ++ ++## Misc ++ ++### llvm: [OFF] ++ ++You may see the following output when building the schedulers: ++ ++``` ++Auto-detecting system features: ++... clang-bpf-co-re: [ on ] ++... llvm: [ OFF ] ++... libcap: [ on ] ++... libbfd: [ on ] ++``` ++ ++Seeing `llvm: [ OFF ]` here is not an issue. You can safely ignore it. +diff --git a/tools/sched_ext/include/bpf-compat/gnu/stubs.h b/tools/sched_ext/include/bpf-compat/gnu/stubs.h +new file mode 100644 +index 000000000000..ad7d139ce907 +--- /dev/null ++++ b/tools/sched_ext/include/bpf-compat/gnu/stubs.h +@@ -0,0 +1,11 @@ ++/* ++ * Dummy gnu/stubs.h. clang can end up including /usr/include/gnu/stubs.h when ++ * compiling BPF files although its content doesn't play any role. The file in ++ * turn includes stubs-64.h or stubs-32.h depending on whether __x86_64__ is ++ * defined. When compiling a BPF source, __x86_64__ isn't set and thus ++ * stubs-32.h is selected. However, the file is not there if the system doesn't ++ * have 32bit glibc devel package installed leading to a build failure. ++ * ++ * The problem is worked around by making this file available in the include ++ * search paths before the system one when building BPF. 
++ */ +diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h +new file mode 100644 +index 000000000000..20280df62857 +--- /dev/null ++++ b/tools/sched_ext/include/scx/common.bpf.h +@@ -0,0 +1,401 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. ++ * Copyright (c) 2022 Tejun Heo <tj@kernel.org> ++ * Copyright (c) 2022 David Vernet <dvernet@meta.com> ++ */ ++#ifndef __SCX_COMMON_BPF_H ++#define __SCX_COMMON_BPF_H ++ ++#include "vmlinux.h" ++#include <bpf/bpf_helpers.h> ++#include <bpf/bpf_tracing.h> ++#include <asm-generic/errno.h> ++#include "user_exit_info.h" ++ ++#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ ++#define PF_KTHREAD 0x00200000 /* I am a kernel thread */ ++#define PF_EXITING 0x00000004 ++#define CLOCK_MONOTONIC 1 ++ ++/* ++ * Earlier versions of clang/pahole lost upper 32bits in 64bit enums which can ++ * lead to really confusing misbehaviors. Let's trigger a build failure. 
++ */ ++static inline void ___vmlinux_h_sanity_check___(void) ++{ ++ _Static_assert(SCX_DSQ_FLAG_BUILTIN, ++ "bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole"); ++} ++ ++s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym; ++s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym; ++void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym; ++void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym; ++u32 scx_bpf_dispatch_nr_slots(void) __ksym; ++void scx_bpf_dispatch_cancel(void) __ksym; ++bool scx_bpf_consume(u64 dsq_id) __ksym; ++u32 scx_bpf_reenqueue_local(void) __ksym; ++void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; ++s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; ++void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; ++int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, u64 flags) __ksym __weak; ++struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak; ++void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak; ++void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) __ksym __weak; ++void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym; ++void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym __weak; ++u32 scx_bpf_cpuperf_cap(s32 cpu) __ksym __weak; ++u32 scx_bpf_cpuperf_cur(s32 cpu) __ksym __weak; ++void scx_bpf_cpuperf_set(s32 cpu, u32 perf) __ksym __weak; ++u32 scx_bpf_nr_cpu_ids(void) __ksym __weak; ++const struct cpumask *scx_bpf_get_possible_cpumask(void) __ksym __weak; ++const struct cpumask *scx_bpf_get_online_cpumask(void) __ksym __weak; ++void scx_bpf_put_cpumask(const struct cpumask *cpumask) __ksym __weak; ++const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym; ++const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym; ++void 
scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym; ++bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym; ++s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; ++s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; ++bool scx_bpf_task_running(const struct task_struct *p) __ksym; ++s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; ++struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; ++ ++static inline __attribute__((format(printf, 1, 2))) ++void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} ++ ++/* ++ * Helper macro for initializing the fmt and variadic argument inputs to both ++ * bstr exit kfuncs. Callers of this macro should use ___fmt and ___param to ++ * refer to the initialized list of inputs to the bstr kfunc. ++ */ ++#define scx_bpf_bstr_preamble(fmt, args...) \ ++ static char ___fmt[] = fmt; \ ++ /* \ ++ * Note that ___param[] must have at least one \ ++ * element to keep the verifier happy. \ ++ */ \ ++ unsigned long long ___param[___bpf_narg(args) ?: 1] = {}; \ ++ \ ++ _Pragma("GCC diagnostic push") \ ++ _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ ++ ___bpf_fill(___param, args); \ ++ _Pragma("GCC diagnostic pop") \ ++ ++/* ++ * scx_bpf_exit() wraps the scx_bpf_exit_bstr() kfunc with variadic arguments ++ * instead of an array of u64. Using this macro will cause the scheduler to ++ * exit cleanly with the specified exit code being passed to user space. ++ */ ++#define scx_bpf_exit(code, fmt, args...) \ ++({ \ ++ scx_bpf_bstr_preamble(fmt, args) \ ++ scx_bpf_exit_bstr(code, ___fmt, ___param, sizeof(___param)); \ ++ ___scx_bpf_bstr_format_checker(fmt, ##args); \ ++}) ++ ++/* ++ * scx_bpf_error() wraps the scx_bpf_error_bstr() kfunc with variadic arguments ++ * instead of an array of u64. Invoking this macro will cause the scheduler to ++ * exit in an erroneous state, with diagnostic information being passed to the ++ * user. ++ */ ++#define scx_bpf_error(fmt, args...) 
\ ++({ \ ++ scx_bpf_bstr_preamble(fmt, args) \ ++ scx_bpf_error_bstr(___fmt, ___param, sizeof(___param)); \ ++ ___scx_bpf_bstr_format_checker(fmt, ##args); \ ++}) ++ ++/* ++ * scx_bpf_dump() wraps the scx_bpf_dump_bstr() kfunc with variadic arguments ++ * instead of an array of u64. To be used from ops.dump() and friends. ++ */ ++#define scx_bpf_dump(fmt, args...) \ ++({ \ ++ scx_bpf_bstr_preamble(fmt, args) \ ++ scx_bpf_dump_bstr(___fmt, ___param, sizeof(___param)); \ ++ ___scx_bpf_bstr_format_checker(fmt, ##args); \ ++}) ++ ++#define BPF_STRUCT_OPS(name, args...) \ ++SEC("struct_ops/"#name) \ ++BPF_PROG(name, ##args) ++ ++#define BPF_STRUCT_OPS_SLEEPABLE(name, args...) \ ++SEC("struct_ops.s/"#name) \ ++BPF_PROG(name, ##args) ++ ++/** ++ * RESIZABLE_ARRAY - Generates annotations for an array that may be resized ++ * @elfsec: the data section of the BPF program in which to place the array ++ * @arr: the name of the array ++ * ++ * libbpf has an API for setting map value sizes. Since data sections (i.e. ++ * bss, data, rodata) themselves are maps, a data section can be resized. If ++ * a data section has an array as its last element, the BTF info for that ++ * array will be adjusted so that length of the array is extended to meet the ++ * new length of the data section. This macro annotates an array to have an ++ * element count of one with the assumption that this array can be resized ++ * within the userspace program. It also annotates the section specifier so ++ * this array exists in a custom sub data section which can be resized ++ * independently. ++ * ++ * See RESIZE_ARRAY() for the userspace convenience macro for resizing an ++ * array declared with RESIZABLE_ARRAY(). ++ */ ++#define RESIZABLE_ARRAY(elfsec, arr) arr[1] SEC("."#elfsec"."#arr) ++ ++/** ++ * MEMBER_VPTR - Obtain the verified pointer to a struct or array member ++ * @base: struct or array to index ++ * @member: dereferenced member (e.g. .field, [idx0][idx1], .field[idx0] ...) 
++ * ++ * The verifier often gets confused by the instruction sequence the compiler ++ * generates for indexing struct fields or arrays. This macro forces the ++ * compiler to generate a code sequence which first calculates the byte offset, ++ * checks it against the struct or array size and adds that byte offset to ++ * generate the pointer to the member to help the verifier. ++ * ++ * Ideally, we want to abort if the calculated offset is out-of-bounds. However, ++ * BPF currently doesn't support abort, so evaluate to %NULL instead. The caller ++ * must check for %NULL and take appropriate action to appease the verifier. To ++ * avoid confusing the verifier, it's best to check for %NULL and dereference ++ * immediately. ++ * ++ * vptr = MEMBER_VPTR(my_array, [i][j]); ++ * if (!vptr) ++ * return error; ++ * *vptr = new_value; ++ * ++ * sizeof(@base) should encompass the memory area to be accessed and thus can't ++ * be a pointer to the area. Use `MEMBER_VPTR(*ptr, .member)` instead of ++ * `MEMBER_VPTR(ptr, ->member)`. ++ */ ++#define MEMBER_VPTR(base, member) (typeof((base) member) *) \ ++({ \ ++ u64 __base = (u64)&(base); \ ++ u64 __addr = (u64)&((base) member) - __base; \ ++ _Static_assert(sizeof(base) >= sizeof((base) member), \ ++ "@base is smaller than @member, is @base a pointer?"); \ ++ asm volatile ( \ ++ "if %0 <= %[max] goto +2\n" \ ++ "%0 = 0\n" \ ++ "goto +1\n" \ ++ "%0 += %1\n" \ ++ : "+r"(__addr) \ ++ : "r"(__base), \ ++ [max]"i"(sizeof(base) - sizeof((base) member))); \ ++ __addr; \ ++}) ++ ++/** ++ * ARRAY_ELEM_PTR - Obtain the verified pointer to an array element ++ * @arr: array to index into ++ * @i: array index ++ * @n: number of elements in array ++ * ++ * Similar to MEMBER_VPTR() but is intended for use with arrays where the ++ * element count needs to be explicit. ++ * It can be used in cases where a global array is defined with an initial ++ * size but is intended to be resized before loading the BPF program. 
++ * Without this version of the macro, MEMBER_VPTR() will use the compile time ++ * size of the array to compute the max, which will result in rejection by ++ * the verifier. ++ */ ++#define ARRAY_ELEM_PTR(arr, i, n) (typeof(arr[i]) *) \ ++({ \ ++ u64 __base = (u64)arr; \ ++ u64 __addr = (u64)&(arr[i]) - __base; \ ++ asm volatile ( \ ++ "if %0 <= %[max] goto +2\n" \ ++ "%0 = 0\n" \ ++ "goto +1\n" \ ++ "%0 += %1\n" \ ++ : "+r"(__addr) \ ++ : "r"(__base), \ ++ [max]"r"(sizeof(arr[0]) * ((n) - 1))); \ ++ __addr; \ ++}) ++ ++ ++/* ++ * BPF declarations and helpers ++ */ ++ ++/* list and rbtree */ ++#define __contains(name, node) __attribute__((btf_decl_tag("contains:" #name ":" #node))) ++#define private(name) SEC(".data." #name) __hidden __attribute__((aligned(8))) ++ ++void *bpf_obj_new_impl(__u64 local_type_id, void *meta) __ksym; ++void bpf_obj_drop_impl(void *kptr, void *meta) __ksym; ++ ++#define bpf_obj_new(type) ((type *)bpf_obj_new_impl(bpf_core_type_id_local(type), NULL)) ++#define bpf_obj_drop(kptr) bpf_obj_drop_impl(kptr, NULL) ++ ++void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; ++void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; ++struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) __ksym; ++struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) __ksym; ++struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, ++ struct bpf_rb_node *node) __ksym; ++int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node, ++ bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b), ++ void *meta, __u64 off) __ksym; ++#define bpf_rbtree_add(head, node, less) bpf_rbtree_add_impl(head, node, less, NULL, 0) ++ ++struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) __ksym; ++ ++void *bpf_refcount_acquire_impl(void *kptr, void *meta) __ksym; ++#define bpf_refcount_acquire(kptr) bpf_refcount_acquire_impl(kptr, NULL) ++ ++/* task 
*/ ++struct task_struct *bpf_task_from_pid(s32 pid) __ksym; ++struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym; ++void bpf_task_release(struct task_struct *p) __ksym; ++ ++/* cgroup */ ++struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) __ksym; ++void bpf_cgroup_release(struct cgroup *cgrp) __ksym; ++struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym; ++ ++/* css iteration */ ++struct bpf_iter_css; ++struct cgroup_subsys_state; ++extern int bpf_iter_css_new(struct bpf_iter_css *it, ++ struct cgroup_subsys_state *start, ++ unsigned int flags) __weak __ksym; ++extern struct cgroup_subsys_state * ++bpf_iter_css_next(struct bpf_iter_css *it) __weak __ksym; ++extern void bpf_iter_css_destroy(struct bpf_iter_css *it) __weak __ksym; ++ ++/* cpumask */ ++struct bpf_cpumask *bpf_cpumask_create(void) __ksym; ++struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) __ksym; ++void bpf_cpumask_release(struct bpf_cpumask *cpumask) __ksym; ++u32 bpf_cpumask_first(const struct cpumask *cpumask) __ksym; ++u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) __ksym; ++void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; ++void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; ++bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask) __ksym; ++bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; ++bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; ++void bpf_cpumask_setall(struct bpf_cpumask *cpumask) __ksym; ++void bpf_cpumask_clear(struct bpf_cpumask *cpumask) __ksym; ++bool bpf_cpumask_and(struct bpf_cpumask *dst, const struct cpumask *src1, ++ const struct cpumask *src2) __ksym; ++void bpf_cpumask_or(struct bpf_cpumask *dst, const struct cpumask *src1, ++ const struct cpumask *src2) __ksym; ++void bpf_cpumask_xor(struct bpf_cpumask *dst, const struct cpumask *src1, ++ const struct cpumask *src2) __ksym; ++bool 
bpf_cpumask_equal(const struct cpumask *src1, const struct cpumask *src2) __ksym; ++bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *src2) __ksym; ++bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2) __ksym; ++bool bpf_cpumask_empty(const struct cpumask *cpumask) __ksym; ++bool bpf_cpumask_full(const struct cpumask *cpumask) __ksym; ++void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src) __ksym; ++u32 bpf_cpumask_any_distribute(const struct cpumask *cpumask) __ksym; ++u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, ++ const struct cpumask *src2) __ksym; ++ ++/* rcu */ ++void bpf_rcu_read_lock(void) __ksym; ++void bpf_rcu_read_unlock(void) __ksym; ++ ++ ++/* ++ * Other helpers ++ */ ++ ++/* useful compiler attributes */ ++#define likely(x) __builtin_expect(!!(x), 1) ++#define unlikely(x) __builtin_expect(!!(x), 0) ++#define __maybe_unused __attribute__((__unused__)) ++ ++/* ++ * READ/WRITE_ONCE() are from kernel (include/asm-generic/rwonce.h). They ++ * prevent compiler from caching, redoing or reordering reads or writes. 
++ */ ++typedef __u8 __attribute__((__may_alias__)) __u8_alias_t; ++typedef __u16 __attribute__((__may_alias__)) __u16_alias_t; ++typedef __u32 __attribute__((__may_alias__)) __u32_alias_t; ++typedef __u64 __attribute__((__may_alias__)) __u64_alias_t; ++ ++static __always_inline void __read_once_size(const volatile void *p, void *res, int size) ++{ ++ switch (size) { ++ case 1: *(__u8_alias_t *) res = *(volatile __u8_alias_t *) p; break; ++ case 2: *(__u16_alias_t *) res = *(volatile __u16_alias_t *) p; break; ++ case 4: *(__u32_alias_t *) res = *(volatile __u32_alias_t *) p; break; ++ case 8: *(__u64_alias_t *) res = *(volatile __u64_alias_t *) p; break; ++ default: ++ barrier(); ++ __builtin_memcpy((void *)res, (const void *)p, size); ++ barrier(); ++ } ++} ++ ++static __always_inline void __write_once_size(volatile void *p, void *res, int size) ++{ ++ switch (size) { ++ case 1: *(volatile __u8_alias_t *) p = *(__u8_alias_t *) res; break; ++ case 2: *(volatile __u16_alias_t *) p = *(__u16_alias_t *) res; break; ++ case 4: *(volatile __u32_alias_t *) p = *(__u32_alias_t *) res; break; ++ case 8: *(volatile __u64_alias_t *) p = *(__u64_alias_t *) res; break; ++ default: ++ barrier(); ++ __builtin_memcpy((void *)p, (const void *)res, size); ++ barrier(); ++ } ++} ++ ++#define READ_ONCE(x) \ ++({ \ ++ union { typeof(x) __val; char __c[1]; } __u = \ ++ { .__c = { 0 } }; \ ++ __read_once_size(&(x), __u.__c, sizeof(x)); \ ++ __u.__val; \ ++}) ++ ++#define WRITE_ONCE(x, val) \ ++({ \ ++ union { typeof(x) __val; char __c[1]; } __u = \ ++ { .__val = (val) }; \ ++ __write_once_size(&(x), __u.__c, sizeof(x)); \ ++ __u.__val; \ ++}) ++ ++/* ++ * log2_u32 - Compute the base 2 logarithm of a 32-bit exponential value. ++ * @v: The value for which we're computing the base 2 logarithm. 
++ */ ++static inline u32 log2_u32(u32 v) ++{ ++ u32 r; ++ u32 shift; ++ ++ r = (v > 0xFFFF) << 4; v >>= r; ++ shift = (v > 0xFF) << 3; v >>= shift; r |= shift; ++ shift = (v > 0xF) << 2; v >>= shift; r |= shift; ++ shift = (v > 0x3) << 1; v >>= shift; r |= shift; ++ r |= (v >> 1); ++ return r; ++} ++ ++/* ++ * log2_u64 - Compute the base 2 logarithm of a 64-bit exponential value. ++ * @v: The value for which we're computing the base 2 logarithm. ++ */ ++static inline u32 log2_u64(u64 v) ++{ ++ u32 hi = v >> 32; ++ if (hi) ++ return log2_u32(hi) + 32 + 1; ++ else ++ return log2_u32(v) + 1; ++} ++ ++#include "compat.bpf.h" ++ ++#endif /* __SCX_COMMON_BPF_H */ +diff --git a/tools/sched_ext/include/scx/common.h b/tools/sched_ext/include/scx/common.h +new file mode 100644 +index 000000000000..5b0f90152152 +--- /dev/null ++++ b/tools/sched_ext/include/scx/common.h +@@ -0,0 +1,75 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. ++ * Copyright (c) 2023 Tejun Heo <tj@kernel.org> ++ * Copyright (c) 2023 David Vernet <dvernet@meta.com> ++ */ ++#ifndef __SCHED_EXT_COMMON_H ++#define __SCHED_EXT_COMMON_H ++ ++#ifdef __KERNEL__ ++#error "Should not be included by BPF programs" ++#endif ++ ++#include <stdarg.h> ++#include <stdio.h> ++#include <stdlib.h> ++#include <stdint.h> ++#include <errno.h> ++ ++typedef uint8_t u8; ++typedef uint16_t u16; ++typedef uint32_t u32; ++typedef uint64_t u64; ++typedef int8_t s8; ++typedef int16_t s16; ++typedef int32_t s32; ++typedef int64_t s64; ++ ++#define SCX_BUG(__fmt, ...) \ ++ do { \ ++ fprintf(stderr, "[SCX_BUG] %s:%d", __FILE__, __LINE__); \ ++ if (errno) \ ++ fprintf(stderr, " (%s)\n", strerror(errno)); \ ++ else \ ++ fprintf(stderr, "\n"); \ ++ fprintf(stderr, __fmt __VA_OPT__(,) __VA_ARGS__); \ ++ fprintf(stderr, "\n"); \ ++ \ ++ exit(EXIT_FAILURE); \ ++ } while (0) ++ ++#define SCX_BUG_ON(__cond, __fmt, ...) 
\ ++ do { \ ++ if (__cond) \ ++ SCX_BUG((__fmt) __VA_OPT__(,) __VA_ARGS__); \ ++ } while (0) ++ ++/** ++ * RESIZE_ARRAY - Convenience macro for resizing a BPF array ++ * @__skel: the skeleton containing the array ++ * @elfsec: the data section of the BPF program in which the array exists ++ * @arr: the name of the array ++ * @n: the desired array element count ++ * ++ * For BPF arrays declared with RESIZABLE_ARRAY(), this macro performs two ++ * operations. It resizes the map which corresponds to the custom data ++ * section that contains the target array. As a side effect, the BTF info for ++ * the array is adjusted so that the array length is sized to cover the new ++ * data section size. The second operation is reassigning the skeleton pointer ++ * for that custom data section so that it points to the newly memory mapped ++ * region. ++ */ ++#define RESIZE_ARRAY(__skel, elfsec, arr, n) \ ++ do { \ ++ size_t __sz; \ ++ bpf_map__set_value_size((__skel)->maps.elfsec##_##arr, \ ++ sizeof((__skel)->elfsec##_##arr->arr[0]) * (n)); \ ++ (__skel)->elfsec##_##arr = \ ++ bpf_map__initial_value((__skel)->maps.elfsec##_##arr, &__sz); \ ++ } while (0) ++ ++#include "user_exit_info.h" ++#include "compat.h" ++ ++#endif /* __SCHED_EXT_COMMON_H */ +diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h +new file mode 100644 +index 000000000000..3d2fe1208900 +--- /dev/null ++++ b/tools/sched_ext/include/scx/compat.bpf.h +@@ -0,0 +1,28 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. ++ * Copyright (c) 2024 Tejun Heo <tj@kernel.org> ++ * Copyright (c) 2024 David Vernet <dvernet@meta.com> ++ */ ++#ifndef __SCX_COMPAT_BPF_H ++#define __SCX_COMPAT_BPF_H ++ ++#define __COMPAT_ENUM_OR_ZERO(__type, __ent) \ ++({ \ ++ __type __ret = 0; \ ++ if (bpf_core_enum_value_exists(__type, __ent)) \ ++ __ret = __ent; \ ++ __ret; \ ++}) ++ ++/* ++ * Define sched_ext_ops. 
This may be expanded to define multiple variants for ++ * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH(). ++ */ ++#define SCX_OPS_DEFINE(__name, ...) \ ++ SEC(".struct_ops.link") \ ++ struct sched_ext_ops __name = { \ ++ __VA_ARGS__, \ ++ }; ++ ++#endif /* __SCX_COMPAT_BPF_H */ +diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h +new file mode 100644 +index 000000000000..1bf8eddf20c2 +--- /dev/null ++++ b/tools/sched_ext/include/scx/compat.h +@@ -0,0 +1,187 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. ++ * Copyright (c) 2024 Tejun Heo <tj@kernel.org> ++ * Copyright (c) 2024 David Vernet <dvernet@meta.com> ++ */ ++#ifndef __SCX_COMPAT_H ++#define __SCX_COMPAT_H ++ ++#include <bpf/btf.h> ++#include <fcntl.h> ++#include <stdlib.h> ++#include <unistd.h> ++ ++struct btf *__COMPAT_vmlinux_btf __attribute__((weak)); ++ ++static inline void __COMPAT_load_vmlinux_btf(void) ++{ ++ if (!__COMPAT_vmlinux_btf) { ++ __COMPAT_vmlinux_btf = btf__load_vmlinux_btf(); ++ SCX_BUG_ON(!__COMPAT_vmlinux_btf, "btf__load_vmlinux_btf()"); ++ } ++} ++ ++static inline bool __COMPAT_read_enum(const char *type, const char *name, u64 *v) ++{ ++ const struct btf_type *t; ++ const char *n; ++ s32 tid; ++ int i; ++ ++ __COMPAT_load_vmlinux_btf(); ++ ++ tid = btf__find_by_name(__COMPAT_vmlinux_btf, type); ++ if (tid < 0) ++ return false; ++ ++ t = btf__type_by_id(__COMPAT_vmlinux_btf, tid); ++ SCX_BUG_ON(!t, "btf__type_by_id(%d)", tid); ++ ++ if (btf_is_enum(t)) { ++ struct btf_enum *e = btf_enum(t); ++ ++ for (i = 0; i < BTF_INFO_VLEN(t->info); i++) { ++ n = btf__name_by_offset(__COMPAT_vmlinux_btf, e[i].name_off); ++ SCX_BUG_ON(!n, "btf__name_by_offset()"); ++ if (!strcmp(n, name)) { ++ *v = e[i].val; ++ return true; ++ } ++ } ++ } else if (btf_is_enum64(t)) { ++ struct btf_enum64 *e = btf_enum64(t); ++ ++ for (i = 0; i < BTF_INFO_VLEN(t->info); i++) { ++ n = 
btf__name_by_offset(__COMPAT_vmlinux_btf, e[i].name_off); ++ SCX_BUG_ON(!n, "btf__name_by_offset()"); ++ if (!strcmp(n, name)) { ++ *v = btf_enum64_value(&e[i]); ++ return true; ++ } ++ } ++ } ++ ++ return false; ++} ++ ++#define __COMPAT_ENUM_OR_ZERO(__type, __ent) \ ++({ \ ++ u64 __val = 0; \ ++ __COMPAT_read_enum(__type, __ent, &__val); \ ++ __val; \ ++}) ++ ++static inline bool __COMPAT_has_ksym(const char *ksym) ++{ ++ __COMPAT_load_vmlinux_btf(); ++ return btf__find_by_name(__COMPAT_vmlinux_btf, ksym) >= 0; ++} ++ ++static inline bool __COMPAT_struct_has_field(const char *type, const char *field) ++{ ++ const struct btf_type *t; ++ const struct btf_member *m; ++ const char *n; ++ s32 tid; ++ int i; ++ ++ __COMPAT_load_vmlinux_btf(); ++ tid = btf__find_by_name_kind(__COMPAT_vmlinux_btf, type, BTF_KIND_STRUCT); ++ if (tid < 0) ++ return false; ++ ++ t = btf__type_by_id(__COMPAT_vmlinux_btf, tid); ++ SCX_BUG_ON(!t, "btf__type_by_id(%d)", tid); ++ ++ m = btf_members(t); ++ ++ for (i = 0; i < BTF_INFO_VLEN(t->info); i++) { ++ n = btf__name_by_offset(__COMPAT_vmlinux_btf, m[i].name_off); ++ SCX_BUG_ON(!n, "btf__name_by_offset()"); ++ if (!strcmp(n, field)) ++ return true; ++ } ++ ++ return false; ++} ++ ++#define SCX_OPS_SWITCH_PARTIAL \ ++ __COMPAT_ENUM_OR_ZERO("scx_ops_flags", "SCX_OPS_SWITCH_PARTIAL") ++ ++static inline long scx_hotplug_seq(void) ++{ ++ int fd; ++ char buf[32]; ++ ssize_t len; ++ long val; ++ ++ fd = open("/sys/kernel/sched_ext/hotplug_seq", O_RDONLY); ++ if (fd < 0) ++ return -ENOENT; ++ ++ len = read(fd, buf, sizeof(buf) - 1); ++ SCX_BUG_ON(len <= 0, "read failed (%ld)", len); ++ buf[len] = 0; ++ close(fd); ++ ++ val = strtoul(buf, NULL, 10); ++ SCX_BUG_ON(val < 0, "invalid num hotplug events: %lu", val); ++ ++ return val; ++} ++ ++/* ++ * struct sched_ext_ops can change over time. 
If compat.bpf.h::SCX_OPS_DEFINE() ++ * is used to define ops and compat.h::SCX_OPS_LOAD/ATTACH() are used to load ++ * and attach it, backward compatibility is automatically maintained where ++ * reasonable. ++ * ++ * ec7e3b0463e1 ("implement-ops") in https://github.com/sched-ext/sched_ext is ++ * the current minimum required kernel version. ++ */ ++#define SCX_OPS_OPEN(__ops_name, __scx_name) ({ \ ++ struct __scx_name *__skel; \ ++ \ ++ SCX_BUG_ON(!__COMPAT_struct_has_field("sched_ext_ops", "dump"), \ ++ "sched_ext_ops.dump() missing, kernel too old?"); \ ++ \ ++ __skel = __scx_name##__open(); \ ++ SCX_BUG_ON(!__skel, "Could not open " #__scx_name); \ ++ __skel->struct_ops.__ops_name->hotplug_seq = scx_hotplug_seq(); \ ++ __skel; \ ++}) ++ ++#define SCX_OPS_LOAD(__skel, __ops_name, __scx_name, __uei_name) ({ \ ++ UEI_SET_SIZE(__skel, __ops_name, __uei_name); \ ++ SCX_BUG_ON(__scx_name##__load((__skel)), "Failed to load skel"); \ ++}) ++ ++/* ++ * New versions of bpftool now emit additional link placeholders for BPF maps, ++ * and set up BPF skeleton in such a way that libbpf will auto-attach BPF maps ++ * automatically, assuming libbpf is recent enough (v1.5+). Old libbpf will do ++ * nothing with those links and won't attempt to auto-attach maps. ++ * ++ * To maintain compatibility with older libbpf while avoiding trying to attach ++ * twice, disable the autoattach feature on newer libbpf. 
++ */ ++/* BACKPORT - bpf_map__set_autoattach() not available yet, commented out */ ++/*#if LIBBPF_MAJOR_VERSION > 1 || \ ++ (LIBBPF_MAJOR_VERSION == 1 && LIBBPF_MINOR_VERSION >= 5) ++#define __SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name) \ ++ bpf_map__set_autoattach((__skel)->maps.__ops_name, false) ++#else*/ ++#define __SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name) do {} while (0) ++/*#endif*/ ++ ++#define SCX_OPS_ATTACH(__skel, __ops_name, __scx_name) ({ \ ++ struct bpf_link *__link; \ ++ __SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name); \ ++ SCX_BUG_ON(__scx_name##__attach((__skel)), "Failed to attach skel"); \ ++ __link = bpf_map__attach_struct_ops((__skel)->maps.__ops_name); \ ++ SCX_BUG_ON(!__link, "Failed to attach struct_ops"); \ ++ __link; \ ++}) ++ ++#endif /* __SCX_COMPAT_H */ +diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h +new file mode 100644 +index 000000000000..891693ee604e +--- /dev/null ++++ b/tools/sched_ext/include/scx/user_exit_info.h +@@ -0,0 +1,111 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Define struct user_exit_info which is shared between BPF and userspace parts ++ * to communicate exit status and other information. ++ * ++ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
++ * Copyright (c) 2022 Tejun Heo <tj@kernel.org> ++ * Copyright (c) 2022 David Vernet <dvernet@meta.com> ++ */ ++#ifndef __USER_EXIT_INFO_H ++#define __USER_EXIT_INFO_H ++ ++enum uei_sizes { ++ UEI_REASON_LEN = 128, ++ UEI_MSG_LEN = 1024, ++ UEI_DUMP_DFL_LEN = 32768, ++}; ++ ++struct user_exit_info { ++ int kind; ++ s64 exit_code; ++ char reason[UEI_REASON_LEN]; ++ char msg[UEI_MSG_LEN]; ++}; ++ ++#ifdef __bpf__ ++ ++#include "vmlinux.h" ++#include <bpf/bpf_core_read.h> ++ ++#define UEI_DEFINE(__name) \ ++ char RESIZABLE_ARRAY(data, __name##_dump); \ ++ const volatile u32 __name##_dump_len; \ ++ struct user_exit_info __name SEC(".data") ++ ++#define UEI_RECORD(__uei_name, __ei) ({ \ ++ bpf_probe_read_kernel_str(__uei_name.reason, \ ++ sizeof(__uei_name.reason), (__ei)->reason); \ ++ bpf_probe_read_kernel_str(__uei_name.msg, \ ++ sizeof(__uei_name.msg), (__ei)->msg); \ ++ bpf_probe_read_kernel_str(__uei_name##_dump, \ ++ __uei_name##_dump_len, (__ei)->dump); \ ++ if (bpf_core_field_exists((__ei)->exit_code)) \ ++ __uei_name.exit_code = (__ei)->exit_code; \ ++ /* use __sync to force memory barrier */ \ ++ __sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind, \ ++ (__ei)->kind); \ ++}) ++ ++#else /* !__bpf__ */ ++ ++#include <stdio.h> ++#include <stdbool.h> ++ ++/* no need to call the following explicitly if SCX_OPS_LOAD() is used */ ++#define UEI_SET_SIZE(__skel, __ops_name, __uei_name) ({ \ ++ u32 __len = (__skel)->struct_ops.__ops_name->exit_dump_len ?: UEI_DUMP_DFL_LEN; \ ++ (__skel)->rodata->__uei_name##_dump_len = __len; \ ++ RESIZE_ARRAY((__skel), data, __uei_name##_dump, __len); \ ++}) ++ ++#define UEI_EXITED(__skel, __uei_name) ({ \ ++ /* use __sync to force memory barrier */ \ ++ __sync_val_compare_and_swap(&(__skel)->data->__uei_name.kind, -1, -1); \ ++}) ++ ++#define UEI_REPORT(__skel, __uei_name) ({ \ ++ struct user_exit_info *__uei = &(__skel)->data->__uei_name; \ ++ char *__uei_dump = (__skel)->data_##__uei_name##_dump->__uei_name##_dump; \ ++ 
if (__uei_dump[0] != '\0') { \ ++ fputs("\nDEBUG DUMP\n", stderr); \ ++ fputs("================================================================================\n\n", stderr); \ ++ fputs(__uei_dump, stderr); \ ++ fputs("\n================================================================================\n\n", stderr); \ ++ } \ ++ fprintf(stderr, "EXIT: %s", __uei->reason); \ ++ if (__uei->msg[0] != '\0') \ ++ fprintf(stderr, " (%s)", __uei->msg); \ ++ fputs("\n", stderr); \ ++ __uei->exit_code; \ ++}) ++ ++/* ++ * We can't import vmlinux.h while compiling user C code. Let's duplicate ++ * scx_exit_code definition. ++ */ ++enum scx_exit_code { ++ /* Reasons */ ++ SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, ++ ++ /* Actions */ ++ SCX_ECODE_ACT_RESTART = 1LLU << 48, ++}; ++ ++enum uei_ecode_mask { ++ UEI_ECODE_USER_MASK = ((1LLU << 32) - 1), ++ UEI_ECODE_SYS_RSN_MASK = ((1LLU << 16) - 1) << 32, ++ UEI_ECODE_SYS_ACT_MASK = ((1LLU << 16) - 1) << 48, ++}; ++ ++/* ++ * These macro interpret the ecode returned from UEI_REPORT(). ++ */ ++#define UEI_ECODE_USER(__ecode) ((__ecode) & UEI_ECODE_USER_MASK) ++#define UEI_ECODE_SYS_RSN(__ecode) ((__ecode) & UEI_ECODE_SYS_RSN_MASK) ++#define UEI_ECODE_SYS_ACT(__ecode) ((__ecode) & UEI_ECODE_SYS_ACT_MASK) ++ ++#define UEI_ECODE_RESTART(__ecode) (UEI_ECODE_SYS_ACT((__ecode)) == SCX_ECODE_ACT_RESTART) ++ ++#endif /* __bpf__ */ ++#endif /* __USER_EXIT_INFO_H */ +diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c +new file mode 100644 +index 000000000000..1d8fd570eaa7 +--- /dev/null ++++ b/tools/sched_ext/scx_central.bpf.c +@@ -0,0 +1,361 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * A central FIFO sched_ext scheduler which demonstrates the followings: ++ * ++ * a. Making all scheduling decisions from one CPU: ++ * ++ * The central CPU is the only one making scheduling decisions. All other ++ * CPUs kick the central CPU when they run out of tasks to run. 
++ * ++ * There is one global BPF queue and the central CPU schedules all CPUs by ++ * dispatching from the global queue to each CPU's local dsq from dispatch(). ++ * This isn't the most straightforward. e.g. It'd be easier to bounce ++ * through per-CPU BPF queues. The current design is chosen to maximally ++ * utilize and verify various SCX mechanisms such as LOCAL_ON dispatching. ++ * ++ * b. Tickless operation ++ * ++ * All tasks are dispatched with the infinite slice which allows stopping the ++ * ticks on CONFIG_NO_HZ_FULL kernels running with the proper nohz_full ++ * parameter. The tickless operation can be observed through ++ * /proc/interrupts. ++ * ++ * Periodic switching is enforced by a periodic timer checking all CPUs and ++ * preempting them as necessary. Unfortunately, BPF timer currently doesn't ++ * have a way to pin to a specific CPU, so the periodic timer isn't pinned to ++ * the central CPU. ++ * ++ * c. Preemption ++ * ++ * Kthreads are unconditionally queued to the head of a matching local dsq ++ * and dispatched with SCX_DSQ_PREEMPT. This ensures that a kthread is always ++ * prioritized over user threads, which is required for ensuring forward ++ * progress as e.g. the periodic timer may run on a ksoftirqd and if the ++ * ksoftirqd gets starved by a user thread, there may not be anything else to ++ * vacate that user thread. ++ * ++ * SCX_KICK_PREEMPT is used to trigger scheduling and CPUs to move to the ++ * next tasks. ++ * ++ * This scheduler is designed to maximize usage of various SCX mechanisms. A ++ * more practical implementation would likely put the scheduling loop outside ++ * the central CPU's dispatch() path and add some form of priority mechanism. ++ * ++ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
++ * Copyright (c) 2022 Tejun Heo <tj@kernel.org> ++ * Copyright (c) 2022 David Vernet <dvernet@meta.com> ++ */ ++#include <scx/common.bpf.h> ++ ++char _license[] SEC("license") = "GPL"; ++ ++enum { ++ FALLBACK_DSQ_ID = 0, ++ MS_TO_NS = 1000LLU * 1000, ++ TIMER_INTERVAL_NS = 1 * MS_TO_NS, ++}; ++ ++const volatile s32 central_cpu; ++const volatile u32 nr_cpu_ids = 1; /* !0 for veristat, set during init */ ++const volatile u64 slice_ns = SCX_SLICE_DFL; ++ ++bool timer_pinned = true; ++u64 nr_total, nr_locals, nr_queued, nr_lost_pids; ++u64 nr_timers, nr_dispatches, nr_mismatches, nr_retries; ++u64 nr_overflows; ++ ++UEI_DEFINE(uei); ++ ++struct { ++ __uint(type, BPF_MAP_TYPE_QUEUE); ++ __uint(max_entries, 4096); ++ __type(value, s32); ++} central_q SEC(".maps"); ++ ++/* can't use percpu map due to bad lookups */ ++bool RESIZABLE_ARRAY(data, cpu_gimme_task); ++u64 RESIZABLE_ARRAY(data, cpu_started_at); ++ ++struct central_timer { ++ struct bpf_timer timer; ++}; ++ ++struct { ++ __uint(type, BPF_MAP_TYPE_ARRAY); ++ __uint(max_entries, 1); ++ __type(key, u32); ++ __type(value, struct central_timer); ++} central_timer SEC(".maps"); ++ ++static bool vtime_before(u64 a, u64 b) ++{ ++ return (s64)(a - b) < 0; ++} ++ ++s32 BPF_STRUCT_OPS(central_select_cpu, struct task_struct *p, ++ s32 prev_cpu, u64 wake_flags) ++{ ++ /* ++ * Steer wakeups to the central CPU as much as possible to avoid ++ * disturbing other CPUs. It's safe to blindly return the central cpu as ++ * select_cpu() is a hint and if @p can't be on it, the kernel will ++ * automatically pick a fallback CPU. ++ */ ++ return central_cpu; ++} ++ ++void BPF_STRUCT_OPS(central_enqueue, struct task_struct *p, u64 enq_flags) ++{ ++ s32 pid = p->pid; ++ ++ __sync_fetch_and_add(&nr_total, 1); ++ ++ /* ++ * Push per-cpu kthreads at the head of local dsq's and preempt the ++ * corresponding CPU. This ensures that e.g. 
ksoftirqd isn't blocked ++ * behind other threads which is necessary for forward progress ++ * guarantee as we depend on the BPF timer which may run from ksoftirqd. ++ */ ++ if ((p->flags & PF_KTHREAD) && p->nr_cpus_allowed == 1) { ++ __sync_fetch_and_add(&nr_locals, 1); ++ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_INF, ++ enq_flags | SCX_ENQ_PREEMPT); ++ return; ++ } ++ ++ if (bpf_map_push_elem(&central_q, &pid, 0)) { ++ __sync_fetch_and_add(&nr_overflows, 1); ++ scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, enq_flags); ++ return; ++ } ++ ++ __sync_fetch_and_add(&nr_queued, 1); ++ ++ if (!scx_bpf_task_running(p)) ++ scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); ++} ++ ++static bool dispatch_to_cpu(s32 cpu) ++{ ++ struct task_struct *p; ++ s32 pid; ++ ++ bpf_repeat(BPF_MAX_LOOPS) { ++ if (bpf_map_pop_elem(&central_q, &pid)) ++ break; ++ ++ __sync_fetch_and_sub(&nr_queued, 1); ++ ++ p = bpf_task_from_pid(pid); ++ if (!p) { ++ __sync_fetch_and_add(&nr_lost_pids, 1); ++ continue; ++ } ++ ++ /* ++ * If we can't run the task at the top, do the dumb thing and ++ * bounce it to the fallback dsq. ++ */ ++ if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { ++ __sync_fetch_and_add(&nr_mismatches, 1); ++ scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, 0); ++ bpf_task_release(p); ++ /* ++ * We might run out of dispatch buffer slots if we continue dispatching ++ * to the fallback DSQ, without dispatching to the local DSQ of the ++ * target CPU. In such a case, break the loop now as the ++ * next dispatch operation would fail.
++ */ ++ if (!scx_bpf_dispatch_nr_slots()) ++ break; ++ continue; ++ } ++ ++ /* dispatch to local and mark that @cpu doesn't need more */ ++ scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_INF, 0); ++ ++ if (cpu != central_cpu) ++ scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE); ++ ++ bpf_task_release(p); ++ return true; ++ } ++ ++ return false; ++} ++ ++void BPF_STRUCT_OPS(central_dispatch, s32 cpu, struct task_struct *prev) ++{ ++ if (cpu == central_cpu) { ++ /* dispatch for all other CPUs first */ ++ __sync_fetch_and_add(&nr_dispatches, 1); ++ ++ bpf_for(cpu, 0, nr_cpu_ids) { ++ bool *gimme; ++ ++ if (!scx_bpf_dispatch_nr_slots()) ++ break; ++ ++ /* skip CPUs whose gimme lookup failed or isn't set; central's gimme is never set */ ++ gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids); ++ if (!gimme || !*gimme) ++ continue; ++ ++ if (dispatch_to_cpu(cpu)) ++ *gimme = false; ++ } ++ ++ /* ++ * Retry if we ran out of dispatch buffer slots as we might have ++ * skipped some CPUs and also need to dispatch for self. The ext ++ * core automatically retries if the local dsq is empty but we ++ * can't rely on that as we're dispatching for other CPUs too. ++ * Kick self explicitly to retry. ++ */ ++ if (!scx_bpf_dispatch_nr_slots()) { ++ __sync_fetch_and_add(&nr_retries, 1); ++ scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); ++ return; ++ } ++ ++ /* look for a task to run on the central CPU */ ++ if (scx_bpf_consume(FALLBACK_DSQ_ID)) ++ return; ++ dispatch_to_cpu(central_cpu); ++ } else { ++ bool *gimme; ++ ++ if (scx_bpf_consume(FALLBACK_DSQ_ID)) ++ return; ++ ++ gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids); ++ if (gimme) ++ *gimme = true; ++ ++ /* ++ * Force dispatch on the scheduling CPU so that it finds a task ++ * to run for us.
++ */ ++ scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); ++ } ++} ++ ++void BPF_STRUCT_OPS(central_running, struct task_struct *p) ++{ ++ s32 cpu = scx_bpf_task_cpu(p); ++ u64 *started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids); ++ if (started_at) ++ *started_at = bpf_ktime_get_ns() ?: 1; /* 0 indicates idle */ ++} ++ ++void BPF_STRUCT_OPS(central_stopping, struct task_struct *p, bool runnable) ++{ ++ s32 cpu = scx_bpf_task_cpu(p); ++ u64 *started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids); ++ if (started_at) ++ *started_at = 0; ++} ++ ++static int central_timerfn(void *map, int *key, struct bpf_timer *timer) ++{ ++ u64 now = bpf_ktime_get_ns(); ++ u64 nr_to_kick = nr_queued; ++ s32 i, curr_cpu; ++ ++ curr_cpu = bpf_get_smp_processor_id(); ++ if (timer_pinned && (curr_cpu != central_cpu)) { ++ scx_bpf_error("Central timer ran on CPU %d, not central CPU %d", ++ curr_cpu, central_cpu); ++ return 0; ++ } ++ ++ bpf_for(i, 0, nr_cpu_ids) { ++ s32 cpu = (nr_timers + i) % nr_cpu_ids; ++ u64 *started_at; ++ ++ if (cpu == central_cpu) ++ continue; ++ ++ /* kick iff the current one exhausted its slice */ ++ started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids); ++ if (started_at && *started_at && ++ vtime_before(now, *started_at + slice_ns)) ++ continue; ++ ++ /* and there's something pending */ ++ if (scx_bpf_dsq_nr_queued(FALLBACK_DSQ_ID) || ++ scx_bpf_dsq_nr_queued(SCX_DSQ_LOCAL_ON | cpu)) ++ ; ++ else if (nr_to_kick) ++ nr_to_kick--; ++ else ++ continue; ++ ++ scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT); ++ } ++ ++ bpf_timer_start(timer, TIMER_INTERVAL_NS, timer_pinned ? BPF_F_TIMER_CPU_PIN : 0); /* only pin when central_init() found the flag usable */ ++ __sync_fetch_and_add(&nr_timers, 1); ++ return 0; ++} ++ ++int BPF_STRUCT_OPS_SLEEPABLE(central_init) ++{ ++ u32 key = 0; ++ struct bpf_timer *timer; ++ int ret; ++ ++ ret = scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1); ++ if (ret) ++ return ret; ++ ++ timer = bpf_map_lookup_elem(&central_timer, &key); ++ if (!timer) ++ return -ESRCH; ++ ++ if (bpf_get_smp_processor_id() != 
central_cpu) { ++ scx_bpf_error("init from non-central CPU"); ++ return -EINVAL; ++ } ++ ++ bpf_timer_init(timer, &central_timer, CLOCK_MONOTONIC); ++ bpf_timer_set_callback(timer, central_timerfn); ++ ++ ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, BPF_F_TIMER_CPU_PIN); ++ /* ++ * BPF_F_TIMER_CPU_PIN is pretty new (>=6.7). If we're running in a ++ * kernel which doesn't have it, bpf_timer_start() will return -EINVAL. ++ * Retry without the PIN. This would be the perfect use case for ++ * bpf_core_enum_value_exists() but the enum type doesn't have a name ++ * and can't be used with bpf_core_enum_value_exists(). Oh well... ++ */ ++ if (ret == -EINVAL) { ++ timer_pinned = false; ++ ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, 0); ++ } ++ if (ret) ++ scx_bpf_error("bpf_timer_start failed (%d)", ret); ++ return ret; ++} ++ ++void BPF_STRUCT_OPS(central_exit, struct scx_exit_info *ei) ++{ ++ UEI_RECORD(uei, ei); ++} ++ ++SCX_OPS_DEFINE(central_ops, ++ /* ++ * We are offloading all scheduling decisions to the central CPU ++ * and thus being the last task on a given CPU doesn't mean ++ * anything special. Enqueue the last tasks like any other tasks. ++ */ ++ .flags = SCX_OPS_ENQ_LAST, ++ ++ .select_cpu = (void *)central_select_cpu, ++ .enqueue = (void *)central_enqueue, ++ .dispatch = (void *)central_dispatch, ++ .running = (void *)central_running, ++ .stopping = (void *)central_stopping, ++ .init = (void *)central_init, ++ .exit = (void *)central_exit, ++ .name = "central"); +diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c +new file mode 100644 +index 000000000000..21deea320bd7 +--- /dev/null ++++ b/tools/sched_ext/scx_central.c +@@ -0,0 +1,135 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
++ * Copyright (c) 2022 Tejun Heo <tj@kernel.org> ++ * Copyright (c) 2022 David Vernet <dvernet@meta.com> ++ */ ++#define _GNU_SOURCE ++#include <sched.h> ++#include <stdio.h> ++#include <unistd.h> ++#include <inttypes.h> ++#include <signal.h> ++#include <libgen.h> ++#include <bpf/bpf.h> ++#include <scx/common.h> ++#include "scx_central.bpf.skel.h" ++ ++const char help_fmt[] = ++"A central FIFO sched_ext scheduler.\n" ++"\n" ++"See the top-level comment in .bpf.c for more details.\n" ++"\n" ++"Usage: %s [-s SLICE_US] [-c CPU]\n" ++"\n" ++" -s SLICE_US Override slice duration\n" ++" -c CPU Override the central CPU (default: 0)\n" ++" -v Print libbpf debug messages\n" ++" -h Display this help and exit\n"; ++ ++static bool verbose; ++static volatile int exit_req; ++ ++static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) ++{ ++ if (level == LIBBPF_DEBUG && !verbose) ++ return 0; ++ return vfprintf(stderr, format, args); ++} ++ ++static void sigint_handler(int dummy) ++{ ++ exit_req = 1; ++} ++ ++int main(int argc, char **argv) ++{ ++ struct scx_central *skel; ++ struct bpf_link *link; ++ __u64 seq = 0, ecode; ++ __s32 opt; ++ cpu_set_t *cpuset; ++ ++ libbpf_set_print(libbpf_print_fn); ++ signal(SIGINT, sigint_handler); ++ signal(SIGTERM, sigint_handler); ++restart: ++ skel = SCX_OPS_OPEN(central_ops, scx_central); ++ ++ skel->rodata->central_cpu = 0; ++ skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); ++ ++ while ((opt = getopt(argc, argv, "s:c:pvh")) != -1) { ++ switch (opt) { ++ case 's': ++ skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; ++ break; ++ case 'c': ++ skel->rodata->central_cpu = strtoul(optarg, NULL, 0); ++ break; ++ case 'v': ++ verbose = true; ++ break; ++ default: ++ fprintf(stderr, help_fmt, basename(argv[0])); ++ return opt != 'h'; ++ } ++ } ++ ++ /* Resize arrays so their element count is equal to cpu count. 
*/ ++ RESIZE_ARRAY(skel, data, cpu_gimme_task, skel->rodata->nr_cpu_ids); ++ RESIZE_ARRAY(skel, data, cpu_started_at, skel->rodata->nr_cpu_ids); ++ ++ SCX_OPS_LOAD(skel, central_ops, scx_central, uei); ++ ++ /* ++ * Affinitize the loading thread to the central CPU, as: ++ * - That's where the BPF timer is first invoked in the BPF program. ++ * - We probably don't want this user space component to take up a core ++ * from a task that would benefit from avoiding preemption on one of ++ * the tickless cores. ++ * ++ * Until BPF supports pinning the timer, it's not guaranteed that it ++ * will always be invoked on the central CPU. In practice, this ++ * suffices the majority of the time. ++ */ ++ cpuset = CPU_ALLOC(skel->rodata->nr_cpu_ids); /* dynamically sized set: use the _S accessors and CPU_ALLOC_SIZE() below */ ++ SCX_BUG_ON(!cpuset, "Failed to allocate cpuset"); ++ CPU_ZERO_S(CPU_ALLOC_SIZE(skel->rodata->nr_cpu_ids), cpuset); ++ CPU_SET_S(skel->rodata->central_cpu, CPU_ALLOC_SIZE(skel->rodata->nr_cpu_ids), cpuset); ++ SCX_BUG_ON(sched_setaffinity(0, CPU_ALLOC_SIZE(skel->rodata->nr_cpu_ids), cpuset), ++ "Failed to affinitize to central CPU %d (max %d)", ++ skel->rodata->central_cpu, skel->rodata->nr_cpu_ids - 1); ++ CPU_FREE(cpuset); ++ ++ link = SCX_OPS_ATTACH(skel, central_ops, scx_central); ++ ++ if (!skel->data->timer_pinned) ++ printf("WARNING : BPF_F_TIMER_CPU_PIN not available, timer not pinned to central\n"); ++ ++ while (!exit_req && !UEI_EXITED(skel, uei)) { ++ printf("[SEQ %llu]\n", seq++); ++ printf("total :%10" PRIu64 " local:%10" PRIu64 " queued:%10" PRIu64 " lost:%10" PRIu64 "\n", ++ skel->bss->nr_total, ++ skel->bss->nr_locals, ++ skel->bss->nr_queued, ++ skel->bss->nr_lost_pids); ++ printf("timer :%10" PRIu64 " dispatch:%10" PRIu64 " mismatch:%10" PRIu64 " retry:%10" PRIu64 "\n", ++ skel->bss->nr_timers, ++ skel->bss->nr_dispatches, ++ skel->bss->nr_mismatches, ++ skel->bss->nr_retries); ++ printf("overflow:%10" PRIu64 "\n", ++ skel->bss->nr_overflows); ++ fflush(stdout); ++ sleep(1); ++ } ++ ++ bpf_link__destroy(link); ++ ecode = UEI_REPORT(skel, uei); ++ scx_central__destroy(skel); ++ ++ if (UEI_ECODE_RESTART(ecode)) ++ goto 
restart; ++ return 0; ++} +diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c +new file mode 100644 +index 000000000000..892278f12dce +--- /dev/null ++++ b/tools/sched_ext/scx_qmap.bpf.c +@@ -0,0 +1,706 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * A simple five-level FIFO queue scheduler. ++ * ++ * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets ++ * assigned to one depending on its compound weight. Each CPU round robins ++ * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from ++ * queue0, 2 from queue1, 4 from queue2 and so on. ++ * ++ * This scheduler demonstrates: ++ * ++ * - BPF-side queueing using PIDs. ++ * - Sleepable per-task storage allocation using ops.prep_enable(). ++ * - Using ops.cpu_release() to handle a higher priority scheduling class taking ++ * the CPU away. ++ * - Core-sched support. ++ * ++ * This scheduler is primarily for demonstration and testing of sched_ext ++ * features and unlikely to be useful for actual workloads. ++ * ++ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
++ * Copyright (c) 2022 Tejun Heo <tj@kernel.org> ++ * Copyright (c) 2022 David Vernet <dvernet@meta.com> ++ */ ++#include <scx/common.bpf.h> ++ ++enum consts { ++ ONE_SEC_IN_NS = 1000000000, ++ SHARED_DSQ = 0, ++}; ++ ++char _license[] SEC("license") = "GPL"; ++ ++const volatile u64 slice_ns = SCX_SLICE_DFL; ++const volatile u32 stall_user_nth; ++const volatile u32 stall_kernel_nth; ++const volatile u32 dsp_inf_loop_after; ++const volatile u32 dsp_batch; ++const volatile bool print_shared_dsq; ++const volatile s32 disallow_tgid; ++const volatile bool suppress_dump; ++ ++u32 test_error_cnt; ++ ++UEI_DEFINE(uei); ++ ++struct qmap { ++ __uint(type, BPF_MAP_TYPE_QUEUE); ++ __uint(max_entries, 4096); ++ __type(value, u32); ++} queue0 SEC(".maps"), ++ queue1 SEC(".maps"), ++ queue2 SEC(".maps"), ++ queue3 SEC(".maps"), ++ queue4 SEC(".maps"); ++ ++struct { ++ __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); ++ __uint(max_entries, 5); ++ __type(key, int); ++ __array(values, struct qmap); ++} queue_arr SEC(".maps") = { ++ .values = { ++ [0] = &queue0, ++ [1] = &queue1, ++ [2] = &queue2, ++ [3] = &queue3, ++ [4] = &queue4, ++ }, ++}; ++ ++/* ++ * If enabled, CPU performance target is set according to the queue index ++ * according to the following table. ++ */ ++static const u32 qidx_to_cpuperf_target[] = { ++ [0] = SCX_CPUPERF_ONE * 0 / 4, ++ [1] = SCX_CPUPERF_ONE * 1 / 4, ++ [2] = SCX_CPUPERF_ONE * 2 / 4, ++ [3] = SCX_CPUPERF_ONE * 3 / 4, ++ [4] = SCX_CPUPERF_ONE * 4 / 4, ++}; ++ ++/* ++ * Per-queue sequence numbers to implement core-sched ordering. ++ * ++ * Tail seq is assigned to each queued task and incremented. Head seq tracks the ++ * sequence number of the latest dispatched task. The distance between the a ++ * task's seq and the associated queue's head seq is called the queue distance ++ * and used when comparing two tasks for ordering. See qmap_core_sched_before(). 
++ */ ++static u64 core_sched_head_seqs[5]; ++static u64 core_sched_tail_seqs[5]; ++ ++/* Per-task scheduling context */ ++struct task_ctx { ++ bool force_local; /* Dispatch directly to local_dsq */ ++ u64 core_sched_seq; ++}; ++ ++struct { ++ __uint(type, BPF_MAP_TYPE_TASK_STORAGE); ++ __uint(map_flags, BPF_F_NO_PREALLOC); ++ __type(key, int); ++ __type(value, struct task_ctx); ++} task_ctx_stor SEC(".maps"); ++ ++struct cpu_ctx { ++ u64 dsp_idx; /* dispatch index */ ++ u64 dsp_cnt; /* remaining count */ ++ u32 avg_weight; ++ u32 cpuperf_target; ++}; ++ ++struct { ++ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); ++ __uint(max_entries, 1); ++ __type(key, u32); ++ __type(value, struct cpu_ctx); ++} cpu_ctx_stor SEC(".maps"); ++ ++/* Statistics */ ++u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued, nr_ddsp_from_enq; ++u64 nr_core_sched_execed; ++u32 cpuperf_min, cpuperf_avg, cpuperf_max; ++u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max; ++ ++static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu) ++{ ++ s32 cpu; ++ ++ if (p->nr_cpus_allowed == 1 || ++ scx_bpf_test_and_clear_cpu_idle(prev_cpu)) ++ return prev_cpu; ++ ++ cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); ++ if (cpu >= 0) ++ return cpu; ++ ++ return -1; ++} ++ ++s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, ++ s32 prev_cpu, u64 wake_flags) ++{ ++ struct task_ctx *tctx; ++ s32 cpu; ++ ++ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); ++ if (!tctx) { ++ scx_bpf_error("task_ctx lookup failed"); ++ return -ESRCH; ++ } ++ ++ cpu = pick_direct_dispatch_cpu(p, prev_cpu); ++ ++ if (cpu >= 0) { ++ tctx->force_local = true; ++ return cpu; ++ } else { ++ return prev_cpu; ++ } ++} ++ ++static int weight_to_idx(u32 weight) ++{ ++ /* Coarsely map the compound weight to a FIFO. 
*/ ++ if (weight <= 25) ++ return 0; ++ else if (weight <= 50) ++ return 1; ++ else if (weight < 200) ++ return 2; ++ else if (weight < 400) ++ return 3; ++ else ++ return 4; ++} ++ ++void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) ++{ ++ static u32 user_cnt, kernel_cnt; ++ struct task_ctx *tctx; ++ u32 pid = p->pid; ++ int idx = weight_to_idx(p->scx.weight); ++ void *ring; ++ s32 cpu; ++ ++ if (p->flags & PF_KTHREAD) { ++ if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth)) ++ return; ++ } else { ++ if (stall_user_nth && !(++user_cnt % stall_user_nth)) ++ return; ++ } ++ ++ if (test_error_cnt && !--test_error_cnt) ++ scx_bpf_error("test triggering error"); ++ ++ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); ++ if (!tctx) { ++ scx_bpf_error("task_ctx lookup failed"); ++ return; ++ } ++ ++ /* ++ * All enqueued tasks must have their core_sched_seq updated for correct ++ * core-sched ordering, which is why %SCX_OPS_ENQ_LAST is specified in ++ * qmap_ops.flags. ++ */ ++ tctx->core_sched_seq = core_sched_tail_seqs[idx]++; ++ ++ /* ++ * If qmap_select_cpu() is telling us to or this is the last runnable ++ * task on the CPU, enqueue locally. ++ */ ++ if (tctx->force_local || (enq_flags & SCX_ENQ_LAST)) { ++ tctx->force_local = false; ++ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags); ++ return; ++ } ++ ++ /* if !WAKEUP, select_cpu() wasn't called, try direct dispatch */ ++ if (!(enq_flags & SCX_ENQ_WAKEUP) && ++ (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) { ++ __sync_fetch_and_add(&nr_ddsp_from_enq, 1); ++ scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags); ++ return; ++ } ++ ++ /* ++ * If the task was re-enqueued due to the CPU being preempted by a ++ * higher priority scheduling class, just re-enqueue the task directly ++ * on the global DSQ. As we want another CPU to pick it up, find and ++ * kick an idle CPU. 
++ */ ++ if (enq_flags & SCX_ENQ_REENQ) { ++ s32 cpu; ++ ++ scx_bpf_dispatch(p, SHARED_DSQ, 0, enq_flags); ++ cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); ++ if (cpu >= 0) ++ scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE); ++ return; ++ } ++ ++ ring = bpf_map_lookup_elem(&queue_arr, &idx); ++ if (!ring) { ++ scx_bpf_error("failed to find ring %d", idx); ++ return; ++ } ++ ++ /* Queue on the selected FIFO. If the FIFO overflows, punt to global. */ ++ if (bpf_map_push_elem(ring, &pid, 0)) { ++ scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, enq_flags); ++ return; ++ } ++ ++ __sync_fetch_and_add(&nr_enqueued, 1); ++} ++ ++/* ++ * The BPF queue map doesn't support removal and sched_ext can handle spurious ++ * dispatches. qmap_dequeue() is only used to collect statistics. ++ */ ++void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags) ++{ ++ __sync_fetch_and_add(&nr_dequeued, 1); ++ if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC) ++ __sync_fetch_and_add(&nr_core_sched_execed, 1); ++} ++ ++static void update_core_sched_head_seq(struct task_struct *p) ++{ ++ struct task_ctx *tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); ++ int idx = weight_to_idx(p->scx.weight); ++ ++ if (tctx) ++ core_sched_head_seqs[idx] = tctx->core_sched_seq; ++ else ++ scx_bpf_error("task_ctx lookup failed"); ++} ++ ++void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) ++{ ++ struct task_struct *p; ++ struct cpu_ctx *cpuc; ++ u32 zero = 0, batch = dsp_batch ?: 1; ++ void *fifo; ++ s32 i, pid; ++ ++ if (scx_bpf_consume(SHARED_DSQ)) ++ return; ++ ++ if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) { ++ /* ++ * PID 2 should be kthreadd which should mostly be idle and off ++ * the scheduler. Let's keep dispatching it to force the kernel ++ * to call this function over and over again. 
++ */ ++ p = bpf_task_from_pid(2); ++ if (p) { ++ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, 0); ++ bpf_task_release(p); ++ return; ++ } ++ } ++ ++ if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) { ++ scx_bpf_error("failed to look up cpu_ctx"); ++ return; ++ } ++ ++ for (i = 0; i < 5; i++) { ++ /* Advance the dispatch cursor and pick the fifo. */ ++ if (!cpuc->dsp_cnt) { ++ cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5; ++ cpuc->dsp_cnt = 1 << cpuc->dsp_idx; ++ } ++ ++ fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx); ++ if (!fifo) { ++ scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx); ++ return; ++ } ++ ++ /* Dispatch or advance. */ ++ bpf_repeat(BPF_MAX_LOOPS) { ++ if (bpf_map_pop_elem(fifo, &pid)) ++ break; ++ ++ p = bpf_task_from_pid(pid); ++ if (!p) ++ continue; ++ ++ update_core_sched_head_seq(p); ++ __sync_fetch_and_add(&nr_dispatched, 1); ++ scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0); ++ bpf_task_release(p); ++ batch--; ++ cpuc->dsp_cnt--; ++ if (!batch || !scx_bpf_dispatch_nr_slots()) { ++ scx_bpf_consume(SHARED_DSQ); ++ return; ++ } ++ if (!cpuc->dsp_cnt) ++ break; ++ } ++ ++ cpuc->dsp_cnt = 0; ++ } ++} ++ ++void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p) ++{ ++ struct cpu_ctx *cpuc; ++ u32 zero = 0; ++ int idx; ++ ++ if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) { ++ scx_bpf_error("failed to look up cpu_ctx"); ++ return; ++ } ++ ++ /* ++ * Use the running avg of weights to select the target cpuperf level. ++ * This is a demonstration of the cpuperf feature rather than a ++ * practical strategy to regulate CPU frequency. ++ */ ++ cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4; ++ idx = weight_to_idx(cpuc->avg_weight); ++ cpuc->cpuperf_target = qidx_to_cpuperf_target[idx]; ++ ++ scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target); ++} ++ ++/* ++ * The distance from the head of the queue scaled by the weight of the queue. 
++ * The lower the number, the older the task and the higher the priority. ++ */ ++static s64 task_qdist(struct task_struct *p) ++{ ++ int idx = weight_to_idx(p->scx.weight); ++ struct task_ctx *tctx; ++ s64 qdist; ++ ++ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); ++ if (!tctx) { ++ scx_bpf_error("task_ctx lookup failed"); ++ return 0; ++ } ++ ++ qdist = tctx->core_sched_seq - core_sched_head_seqs[idx]; ++ ++ /* ++ * As queue index increments, the priority doubles. The queue w/ index 3 ++ * is dispatched twice more frequently than 2. Reflect the difference by ++ * scaling qdists accordingly. Note that the shift amount needs to be ++ * flipped depending on the sign to avoid flipping priority direction. ++ */ ++ if (qdist >= 0) ++ return qdist << (4 - idx); ++ else ++ return qdist << idx; ++} ++ ++/* ++ * This is called to determine the task ordering when core-sched is picking ++ * tasks to execute on SMT siblings and should encode about the same ordering as ++ * the regular scheduling path. Use the priority-scaled distances from the head ++ * of the queues to compare the two tasks which should be consistent with the ++ * dispatch path behavior. ++ */ ++bool BPF_STRUCT_OPS(qmap_core_sched_before, ++ struct task_struct *a, struct task_struct *b) ++{ ++ return task_qdist(a) > task_qdist(b); ++} ++ ++void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args) ++{ ++ u32 cnt; ++ ++ /* ++ * Called when @cpu is taken by a higher priority scheduling class. This ++ * makes @cpu no longer available for executing sched_ext tasks. As we ++ * don't want the tasks in @cpu's local dsq to sit there until @cpu ++ * becomes available again, re-enqueue them into the global dsq. See ++ * %SCX_ENQ_REENQ handling in qmap_enqueue(). 
++ */ ++ cnt = scx_bpf_reenqueue_local(); ++ if (cnt) ++ __sync_fetch_and_add(&nr_reenqueued, cnt); ++} ++ ++s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p, ++ struct scx_init_task_args *args) ++{ ++ if (p->tgid == disallow_tgid) ++ p->scx.disallow = true; ++ ++ /* ++ * @p is new. Let's ensure that its task_ctx is available. We can sleep ++ * in this function and the following will automatically use GFP_KERNEL. ++ */ ++ if (bpf_task_storage_get(&task_ctx_stor, p, 0, ++ BPF_LOCAL_STORAGE_GET_F_CREATE)) ++ return 0; ++ else ++ return -ENOMEM; ++} ++ ++void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx) ++{ ++ s32 i, pid; ++ ++ if (suppress_dump) ++ return; ++ ++ bpf_for(i, 0, 5) { ++ void *fifo; ++ ++ if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i))) ++ return; ++ ++ scx_bpf_dump("QMAP FIFO[%d]:", i); ++ bpf_repeat(4096) { ++ if (bpf_map_pop_elem(fifo, &pid)) ++ break; ++ scx_bpf_dump(" %d", pid); ++ } ++ scx_bpf_dump("\n"); ++ } ++} ++ ++void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle) ++{ ++ u32 zero = 0; ++ struct cpu_ctx *cpuc; ++ ++ if (suppress_dump || idle) ++ return; ++ if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu))) ++ return; ++ ++ scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u", ++ cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight, ++ cpuc->cpuperf_target); ++} ++ ++void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p) ++{ ++ struct task_ctx *taskc; ++ ++ if (suppress_dump) ++ return; ++ if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) ++ return; ++ ++ scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu", ++ taskc->force_local, taskc->core_sched_seq); ++} ++ ++/* ++ * Print out the online and possible CPU map using bpf_printk() as a ++ * demonstration of using the cpumask kfuncs and ops.cpu_on/offline(). 
++ */ ++static void print_cpus(void) ++{ ++ const struct cpumask *possible, *online; ++ s32 cpu; ++ char buf[128] = "", *p; ++ int idx; ++ ++ possible = scx_bpf_get_possible_cpumask(); ++ online = scx_bpf_get_online_cpumask(); ++ ++ idx = 0; ++ bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) { ++ if (!(p = MEMBER_VPTR(buf, [idx++]))) ++ break; ++ if (bpf_cpumask_test_cpu(cpu, online)) ++ *p++ = 'O'; ++ else if (bpf_cpumask_test_cpu(cpu, possible)) ++ *p++ = 'X'; ++ else ++ *p++ = ' '; ++ ++ if ((cpu & 7) == 7) { ++ if (!(p = MEMBER_VPTR(buf, [idx++]))) ++ break; ++ *p++ = '|'; ++ } ++ } ++ buf[sizeof(buf) - 1] = '\0'; ++ ++ scx_bpf_put_cpumask(online); ++ scx_bpf_put_cpumask(possible); ++ ++ bpf_printk("CPUS: |%s", buf); ++} ++ ++void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu) ++{ ++ bpf_printk("CPU %d coming online", cpu); ++ /* @cpu is already online at this point */ ++ print_cpus(); ++} ++ ++void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu) ++{ ++ bpf_printk("CPU %d going offline", cpu); ++ /* @cpu is still online at this point */ ++ print_cpus(); ++} ++ ++struct monitor_timer { ++ struct bpf_timer timer; ++}; ++ ++struct { ++ __uint(type, BPF_MAP_TYPE_ARRAY); ++ __uint(max_entries, 1); ++ __type(key, u32); ++ __type(value, struct monitor_timer); ++} monitor_timer SEC(".maps"); ++ ++/* ++ * Print out the min, avg and max performance levels of CPUs every second to ++ * demonstrate the cpuperf interface. 
++ */ ++static void monitor_cpuperf(void) ++{ ++ u32 zero = 0, nr_cpu_ids; ++ u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0; ++ u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0; ++ const struct cpumask *online; ++ int i, nr_online_cpus = 0; ++ ++ nr_cpu_ids = scx_bpf_nr_cpu_ids(); ++ online = scx_bpf_get_online_cpumask(); ++ ++ bpf_for(i, 0, nr_cpu_ids) { ++ struct cpu_ctx *cpuc; ++ u32 cap, cur; ++ ++ if (!bpf_cpumask_test_cpu(i, online)) ++ continue; ++ nr_online_cpus++; ++ ++ /* collect the capacity and current cpuperf */ ++ cap = scx_bpf_cpuperf_cap(i); ++ cur = scx_bpf_cpuperf_cur(i); ++ ++ cur_min = cur < cur_min ? cur : cur_min; ++ cur_max = cur > cur_max ? cur : cur_max; ++ ++ /* ++ * $cur is relative to $cap. Scale it down accordingly so that ++ * it's in the same scale as other CPUs and $cur_sum/$cap_sum ++ * makes sense. ++ */ ++ cur_sum += cur * cap / SCX_CPUPERF_ONE; ++ cap_sum += cap; ++ ++ if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) { ++ scx_bpf_error("failed to look up cpu_ctx"); ++ goto out; ++ } ++ ++ /* collect target */ ++ cur = cpuc->cpuperf_target; ++ target_sum += cur; ++ target_min = cur < target_min ? cur : target_min; ++ target_max = cur > target_max ? cur : target_max; ++ } ++ ++ cpuperf_min = cur_min; ++ cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum; ++ cpuperf_max = cur_max; ++ ++ cpuperf_target_min = target_min; ++ cpuperf_target_avg = target_sum / nr_online_cpus; ++ cpuperf_target_max = target_max; ++out: ++ scx_bpf_put_cpumask(online); ++} ++ ++/* ++ * Dump the currently queued tasks in the shared DSQ to demonstrate the usage of ++ * scx_bpf_dsq_nr_queued() and DSQ iterator. Raise the dispatch batch count to ++ * see meaningful dumps in the trace pipe. 
++ */ ++static void dump_shared_dsq(void) ++{ ++ struct task_struct *p; ++ s32 nr; ++ ++ if (!(nr = scx_bpf_dsq_nr_queued(SHARED_DSQ))) ++ return; ++ ++ bpf_printk("Dumping %d tasks in SHARED_DSQ in reverse order", nr); ++ ++ bpf_rcu_read_lock(); ++ bpf_for_each(scx_dsq, p, SHARED_DSQ, SCX_DSQ_ITER_REV) ++ bpf_printk("%s[%d]", p->comm, p->pid); ++ bpf_rcu_read_unlock(); ++} ++ ++static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer) ++{ ++ monitor_cpuperf(); ++ ++ if (print_shared_dsq) ++ dump_shared_dsq(); ++ ++ bpf_timer_start(timer, ONE_SEC_IN_NS, 0); ++ return 0; ++} ++ ++s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init) ++{ ++ u32 key = 0; ++ struct bpf_timer *timer; ++ s32 ret; ++ ++ print_cpus(); ++ ++ ret = scx_bpf_create_dsq(SHARED_DSQ, -1); ++ if (ret) ++ return ret; ++ ++ timer = bpf_map_lookup_elem(&monitor_timer, &key); ++ if (!timer) ++ return -ESRCH; ++ ++ bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC); ++ bpf_timer_set_callback(timer, monitor_timerfn); ++ ++ return bpf_timer_start(timer, ONE_SEC_IN_NS, 0); ++} ++ ++void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei) ++{ ++ UEI_RECORD(uei, ei); ++} ++ ++SCX_OPS_DEFINE(qmap_ops, ++ .select_cpu = (void *)qmap_select_cpu, ++ .enqueue = (void *)qmap_enqueue, ++ .dequeue = (void *)qmap_dequeue, ++ .dispatch = (void *)qmap_dispatch, ++ .tick = (void *)qmap_tick, ++ .core_sched_before = (void *)qmap_core_sched_before, ++ .cpu_release = (void *)qmap_cpu_release, ++ .init_task = (void *)qmap_init_task, ++ .dump = (void *)qmap_dump, ++ .dump_cpu = (void *)qmap_dump_cpu, ++ .dump_task = (void *)qmap_dump_task, ++ .cpu_online = (void *)qmap_cpu_online, ++ .cpu_offline = (void *)qmap_cpu_offline, ++ .init = (void *)qmap_init, ++ .exit = (void *)qmap_exit, ++ .flags = SCX_OPS_ENQ_LAST, ++ .timeout_ms = 5000U, ++ .name = "qmap"); +diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c +new file mode 100644 +index 000000000000..c9ca30d62b2b +--- /dev/null ++++ 
b/tools/sched_ext/scx_qmap.c +@@ -0,0 +1,144 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. ++ * Copyright (c) 2022 Tejun Heo <tj@kernel.org> ++ * Copyright (c) 2022 David Vernet <dvernet@meta.com> ++ */ ++#include <stdio.h> ++#include <stdlib.h> ++#include <unistd.h> ++#include <inttypes.h> ++#include <signal.h> ++#include <libgen.h> ++#include <bpf/bpf.h> ++#include <scx/common.h> ++#include "scx_qmap.bpf.skel.h" ++ ++const char help_fmt[] = ++"A simple five-level FIFO queue sched_ext scheduler.\n" ++"\n" ++"See the top-level comment in .bpf.c for more details.\n" ++"\n" ++"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n" ++" [-P] [-d PID] [-D LEN] [-p] [-v]\n" ++"\n" ++" -s SLICE_US Override slice duration\n" ++" -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" ++" -t COUNT Stall every COUNT'th user thread\n" ++" -T COUNT Stall every COUNT'th kernel thread\n" ++" -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n" ++" -b COUNT Dispatch upto COUNT tasks together\n" ++" -P Print out DSQ content to trace_pipe every second, use with -b\n" ++" -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" ++" -D LEN Set scx_exit_info.dump buffer length\n" ++" -S Suppress qmap-specific debug dump\n" ++" -p Switch only tasks on SCHED_EXT policy instead of all\n" ++" -v Print libbpf debug messages\n" ++" -h Display this help and exit\n"; ++ ++static bool verbose; ++static volatile int exit_req; ++ ++static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) ++{ ++ if (level == LIBBPF_DEBUG && !verbose) ++ return 0; ++ return vfprintf(stderr, format, args); ++} ++ ++static void sigint_handler(int dummy) ++{ ++ exit_req = 1; ++} ++ ++int main(int argc, char **argv) ++{ ++ struct scx_qmap *skel; ++ struct bpf_link *link; ++ int opt; ++ ++ libbpf_set_print(libbpf_print_fn); ++ signal(SIGINT, sigint_handler); ++ 
signal(SIGTERM, sigint_handler); ++ ++ skel = SCX_OPS_OPEN(qmap_ops, scx_qmap); ++ ++ while ((opt = getopt(argc, argv, "s:e:t:T:l:b:Pd:D:Spvh")) != -1) { ++ switch (opt) { ++ case 's': ++ skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; ++ break; ++ case 'e': ++ skel->bss->test_error_cnt = strtoul(optarg, NULL, 0); ++ break; ++ case 't': ++ skel->rodata->stall_user_nth = strtoul(optarg, NULL, 0); ++ break; ++ case 'T': ++ skel->rodata->stall_kernel_nth = strtoul(optarg, NULL, 0); ++ break; ++ case 'l': ++ skel->rodata->dsp_inf_loop_after = strtoul(optarg, NULL, 0); ++ break; ++ case 'b': ++ skel->rodata->dsp_batch = strtoul(optarg, NULL, 0); ++ break; ++ case 'P': ++ skel->rodata->print_shared_dsq = true; ++ break; ++ case 'd': ++ skel->rodata->disallow_tgid = strtol(optarg, NULL, 0); ++ if (skel->rodata->disallow_tgid < 0) ++ skel->rodata->disallow_tgid = getpid(); ++ break; ++ case 'D': ++ skel->struct_ops.qmap_ops->exit_dump_len = strtoul(optarg, NULL, 0); ++ break; ++ case 'S': ++ skel->rodata->suppress_dump = true; ++ break; ++ case 'p': ++ skel->struct_ops.qmap_ops->flags |= SCX_OPS_SWITCH_PARTIAL; ++ break; ++ case 'v': ++ verbose = true; ++ break; ++ default: ++ fprintf(stderr, help_fmt, basename(argv[0])); ++ return opt != 'h'; ++ } ++ } ++ ++ SCX_OPS_LOAD(skel, qmap_ops, scx_qmap, uei); ++ link = SCX_OPS_ATTACH(skel, qmap_ops, scx_qmap); ++ ++ while (!exit_req && !UEI_EXITED(skel, uei)) { ++ long nr_enqueued = skel->bss->nr_enqueued; ++ long nr_dispatched = skel->bss->nr_dispatched; ++ ++ printf("stats : enq=%lu dsp=%lu delta=%ld reenq=%"PRIu64" deq=%"PRIu64" core=%"PRIu64" enq_ddsp=%"PRIu64"\n", ++ nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched, ++ skel->bss->nr_reenqueued, skel->bss->nr_dequeued, ++ skel->bss->nr_core_sched_execed, ++ skel->bss->nr_ddsp_from_enq); ++ if (__COMPAT_has_ksym("scx_bpf_cpuperf_cur")) ++ printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n", ++ skel->bss->cpuperf_min, ++ 
skel->bss->cpuperf_avg, ++ skel->bss->cpuperf_max, ++ skel->bss->cpuperf_target_min, ++ skel->bss->cpuperf_target_avg, ++ skel->bss->cpuperf_target_max); ++ fflush(stdout); ++ sleep(1); ++ } ++ ++ bpf_link__destroy(link); ++ UEI_REPORT(skel, uei); ++ scx_qmap__destroy(skel); ++ /* ++ * scx_qmap implements ops.cpu_on/offline() and doesn't need to restart ++ * on CPU hotplug events. ++ */ ++ return 0; ++} +diff --git a/tools/sched_ext/scx_show_state.py b/tools/sched_ext/scx_show_state.py +new file mode 100644 +index 000000000000..d457d2a74e1e +--- /dev/null ++++ b/tools/sched_ext/scx_show_state.py +@@ -0,0 +1,39 @@ ++#!/usr/bin/env drgn ++# ++# Copyright (C) 2024 Tejun Heo <tj@kernel.org> ++# Copyright (C) 2024 Meta Platforms, Inc. and affiliates. ++ ++desc = """ ++This is a drgn script to show the current sched_ext state. ++For more info on drgn, visit https://github.com/osandov/drgn. ++""" ++ ++import drgn ++import sys ++ ++def err(s): ++ print(s, file=sys.stderr, flush=True) ++ sys.exit(1) ++ ++def read_int(name): ++ return int(prog[name].value_()) ++ ++def read_atomic(name): ++ return prog[name].counter.value_() ++ ++def read_static_key(name): ++ return prog[name].key.enabled.counter.value_() ++ ++def ops_state_str(state): ++ return prog['scx_ops_enable_state_str'][state].string_().decode() ++ ++ops = prog['scx_ops'] ++enable_state = read_atomic("scx_ops_enable_state_var") ++ ++print(f'ops : {ops.name.string_().decode()}') ++print(f'enabled : {read_static_key("__scx_ops_enabled")}') ++print(f'switching_all : {read_int("scx_switching_all")}') ++print(f'switched_all : {read_static_key("__scx_switched_all")}') ++print(f'enable_state : {ops_state_str(enable_state)} ({enable_state})') ++print(f'bypass_depth : {read_atomic("scx_ops_bypass_depth")}') ++print(f'nr_rejected : {read_atomic("scx_nr_rejected")}') +diff --git a/tools/sched_ext/scx_simple.bpf.c b/tools/sched_ext/scx_simple.bpf.c +new file mode 100644 +index 000000000000..ed7e8d535fc5 +--- /dev/null ++++ 
b/tools/sched_ext/scx_simple.bpf.c +@@ -0,0 +1,156 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * A simple scheduler. ++ * ++ * By default, it operates as a simple global weighted vtime scheduler and can ++ * be switched to FIFO scheduling. It also demonstrates the following niceties. ++ * ++ * - Statistics tracking how many tasks are queued to local and global dsq's. ++ * - Termination notification for userspace. ++ * ++ * While very simple, this scheduler should work reasonably well on CPUs with a ++ * uniform L3 cache topology. While preemption is not implemented, the fact that ++ * the scheduling queue is shared across all CPUs means that whatever is at the ++ * front of the queue is likely to be executed fairly quickly given enough ++ * number of CPUs. The FIFO scheduling mode may be beneficial to some workloads ++ * but comes with the usual problems with FIFO scheduling where saturating ++ * threads can easily drown out interactive ones. ++ * ++ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. ++ * Copyright (c) 2022 Tejun Heo <tj@kernel.org> ++ * Copyright (c) 2022 David Vernet <dvernet@meta.com> ++ */ ++#include <scx/common.bpf.h> ++ ++char _license[] SEC("license") = "GPL"; ++ ++const volatile bool fifo_sched; ++ ++static u64 vtime_now; ++UEI_DEFINE(uei); ++ ++/* ++ * Built-in DSQs such as SCX_DSQ_GLOBAL cannot be used as priority queues ++ * (meaning, cannot be dispatched to with scx_bpf_dispatch_vtime()). We ++ * therefore create a separate DSQ with ID 0 that we dispatch to and consume ++ * from. If scx_simple only supported global FIFO scheduling, then we could ++ * just use SCX_DSQ_GLOBAL. 
++ */ ++#define SHARED_DSQ 0 ++ ++struct { ++ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); ++ __uint(key_size, sizeof(u32)); ++ __uint(value_size, sizeof(u64)); ++ __uint(max_entries, 2); /* [local, global] */ ++} stats SEC(".maps"); ++ ++static void stat_inc(u32 idx) ++{ ++ u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx); ++ if (cnt_p) ++ (*cnt_p)++; ++} ++ ++static inline bool vtime_before(u64 a, u64 b) ++{ ++ return (s64)(a - b) < 0; ++} ++ ++s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) ++{ ++ bool is_idle = false; ++ s32 cpu; ++ ++ cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle); ++ if (is_idle) { ++ stat_inc(0); /* count local queueing */ ++ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); ++ } ++ ++ return cpu; ++} ++ ++void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) ++{ ++ stat_inc(1); /* count global queueing */ ++ ++ if (fifo_sched) { ++ scx_bpf_dispatch(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags); ++ } else { ++ u64 vtime = p->scx.dsq_vtime; ++ ++ /* ++ * Limit the amount of budget that an idling task can accumulate ++ * to one slice. ++ */ ++ if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL)) ++ vtime = vtime_now - SCX_SLICE_DFL; ++ ++ scx_bpf_dispatch_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, vtime, ++ enq_flags); ++ } ++} ++ ++void BPF_STRUCT_OPS(simple_dispatch, s32 cpu, struct task_struct *prev) ++{ ++ scx_bpf_consume(SHARED_DSQ); ++} ++ ++void BPF_STRUCT_OPS(simple_running, struct task_struct *p) ++{ ++ if (fifo_sched) ++ return; ++ ++ /* ++ * Global vtime always progresses forward as tasks start executing. The ++ * test and update can be performed concurrently from multiple CPUs and ++ * thus racy. Any error should be contained and temporary. Let's just ++ * live with it. 
++ */ ++ if (vtime_before(vtime_now, p->scx.dsq_vtime)) ++ vtime_now = p->scx.dsq_vtime; ++} ++ ++void BPF_STRUCT_OPS(simple_stopping, struct task_struct *p, bool runnable) ++{ ++ if (fifo_sched) ++ return; ++ ++ /* ++ * Scale the execution time by the inverse of the weight and charge. ++ * ++ * Note that the default yield implementation yields by setting ++ * @p->scx.slice to zero and the following would treat the yielding task ++ * as if it has consumed all its slice. If this penalizes yielding tasks ++ * too much, determine the execution time by taking explicit timestamps ++ * instead of depending on @p->scx.slice. ++ */ ++ p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; ++} ++ ++void BPF_STRUCT_OPS(simple_enable, struct task_struct *p) ++{ ++ p->scx.dsq_vtime = vtime_now; ++} ++ ++s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init) ++{ ++ return scx_bpf_create_dsq(SHARED_DSQ, -1); ++} ++ ++void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) ++{ ++ UEI_RECORD(uei, ei); ++} ++ ++SCX_OPS_DEFINE(simple_ops, ++ .select_cpu = (void *)simple_select_cpu, ++ .enqueue = (void *)simple_enqueue, ++ .dispatch = (void *)simple_dispatch, ++ .running = (void *)simple_running, ++ .stopping = (void *)simple_stopping, ++ .enable = (void *)simple_enable, ++ .init = (void *)simple_init, ++ .exit = (void *)simple_exit, ++ .name = "simple"); +diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c +new file mode 100644 +index 000000000000..76d83199545c +--- /dev/null ++++ b/tools/sched_ext/scx_simple.c +@@ -0,0 +1,107 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
++ * Copyright (c) 2022 Tejun Heo <tj@kernel.org> ++ * Copyright (c) 2022 David Vernet <dvernet@meta.com> ++ */ ++#include <stdio.h> ++#include <unistd.h> ++#include <signal.h> ++#include <libgen.h> ++#include <bpf/bpf.h> ++#include <scx/common.h> ++#include "scx_simple.bpf.skel.h" ++ ++const char help_fmt[] = ++"A simple sched_ext scheduler.\n" ++"\n" ++"See the top-level comment in .bpf.c for more details.\n" ++"\n" ++"Usage: %s [-f] [-v]\n" ++"\n" ++" -f Use FIFO scheduling instead of weighted vtime scheduling\n" ++" -v Print libbpf debug messages\n" ++" -h Display this help and exit\n"; ++ ++static bool verbose; ++static volatile int exit_req; ++ ++static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) ++{ ++ if (level == LIBBPF_DEBUG && !verbose) ++ return 0; ++ return vfprintf(stderr, format, args); ++} ++ ++static void sigint_handler(int simple) ++{ ++ exit_req = 1; ++} ++ ++static void read_stats(struct scx_simple *skel, __u64 *stats) ++{ ++ int nr_cpus = libbpf_num_possible_cpus(); ++ __u64 cnts[2][nr_cpus]; ++ __u32 idx; ++ ++ memset(stats, 0, sizeof(stats[0]) * 2); ++ ++ for (idx = 0; idx < 2; idx++) { ++ int ret, cpu; ++ ++ ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), ++ &idx, cnts[idx]); ++ if (ret < 0) ++ continue; ++ for (cpu = 0; cpu < nr_cpus; cpu++) ++ stats[idx] += cnts[idx][cpu]; ++ } ++} ++ ++int main(int argc, char **argv) ++{ ++ struct scx_simple *skel; ++ struct bpf_link *link; ++ __u32 opt; ++ __u64 ecode; ++ ++ libbpf_set_print(libbpf_print_fn); ++ signal(SIGINT, sigint_handler); ++ signal(SIGTERM, sigint_handler); ++restart: ++ skel = SCX_OPS_OPEN(simple_ops, scx_simple); ++ ++ while ((opt = getopt(argc, argv, "fvh")) != -1) { ++ switch (opt) { ++ case 'f': ++ skel->rodata->fifo_sched = true; ++ break; ++ case 'v': ++ verbose = true; ++ break; ++ default: ++ fprintf(stderr, help_fmt, basename(argv[0])); ++ return opt != 'h'; ++ } ++ } ++ ++ SCX_OPS_LOAD(skel, simple_ops, scx_simple, uei); 
++ link = SCX_OPS_ATTACH(skel, simple_ops, scx_simple); ++ ++ while (!exit_req && !UEI_EXITED(skel, uei)) { ++ __u64 stats[2]; ++ ++ read_stats(skel, stats); ++ printf("local=%llu global=%llu\n", stats[0], stats[1]); ++ fflush(stdout); ++ sleep(1); ++ } ++ ++ bpf_link__destroy(link); ++ ecode = UEI_REPORT(skel, uei); ++ scx_simple__destroy(skel); ++ ++ if (UEI_ECODE_RESTART(ecode)) ++ goto restart; ++ return 0; ++} +diff --git a/tools/testing/selftests/sched_ext/.gitignore b/tools/testing/selftests/sched_ext/.gitignore +new file mode 100644 +index 000000000000..ae5491a114c0 +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/.gitignore +@@ -0,0 +1,6 @@ ++* ++!*.c ++!*.h ++!Makefile ++!.gitignore ++!config +diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile +new file mode 100644 +index 000000000000..0754a2c110a1 +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/Makefile +@@ -0,0 +1,218 @@ ++# SPDX-License-Identifier: GPL-2.0 ++# Copyright (c) 2022 Meta Platforms, Inc. and affiliates. ++include ../../../build/Build.include ++include ../../../scripts/Makefile.arch ++include ../../../scripts/Makefile.include ++include ../lib.mk ++ ++ifneq ($(LLVM),) ++ifneq ($(filter %/,$(LLVM)),) ++LLVM_PREFIX := $(LLVM) ++else ifneq ($(filter -%,$(LLVM)),) ++LLVM_SUFFIX := $(LLVM) ++endif ++ ++CC := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) $(CLANG_FLAGS) -fintegrated-as ++else ++CC := gcc ++endif # LLVM ++ ++ifneq ($(CROSS_COMPILE),) ++$(error CROSS_COMPILE not supported for scx selftests) ++endif # CROSS_COMPILE ++ ++CURDIR := $(abspath .) ++REPOROOT := $(abspath ../../../..) 
++TOOLSDIR := $(REPOROOT)/tools ++LIBDIR := $(TOOLSDIR)/lib ++BPFDIR := $(LIBDIR)/bpf ++TOOLSINCDIR := $(TOOLSDIR)/include ++BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool ++APIDIR := $(TOOLSINCDIR)/uapi ++GENDIR := $(REPOROOT)/include/generated ++GENHDR := $(GENDIR)/autoconf.h ++SCXTOOLSDIR := $(TOOLSDIR)/sched_ext ++SCXTOOLSINCDIR := $(TOOLSDIR)/sched_ext/include ++ ++OUTPUT_DIR := $(CURDIR)/build ++OBJ_DIR := $(OUTPUT_DIR)/obj ++INCLUDE_DIR := $(OUTPUT_DIR)/include ++BPFOBJ_DIR := $(OBJ_DIR)/libbpf ++SCXOBJ_DIR := $(OBJ_DIR)/sched_ext ++BPFOBJ := $(BPFOBJ_DIR)/libbpf.a ++LIBBPF_OUTPUT := $(OBJ_DIR)/libbpf/libbpf.a ++DEFAULT_BPFTOOL := $(OUTPUT_DIR)/sbin/bpftool ++HOST_BUILD_DIR := $(OBJ_DIR) ++HOST_OUTPUT_DIR := $(OUTPUT_DIR) ++ ++VMLINUX_BTF_PATHS ?= ../../../../vmlinux \ ++ /sys/kernel/btf/vmlinux \ ++ /boot/vmlinux-$(shell uname -r) ++VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) ++ifeq ($(VMLINUX_BTF),) ++$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)") ++endif ++ ++BPFTOOL ?= $(DEFAULT_BPFTOOL) ++ ++ifneq ($(wildcard $(GENHDR)),) ++ GENFLAGS := -DHAVE_GENHDR ++endif ++ ++CFLAGS += -g -O2 -rdynamic -pthread -Wall -Werror $(GENFLAGS) \ ++ -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ ++ -I$(TOOLSINCDIR) -I$(APIDIR) -I$(CURDIR)/include -I$(SCXTOOLSINCDIR) ++ ++# Silence some warnings when compiled with clang ++ifneq ($(LLVM),) ++CFLAGS += -Wno-unused-command-line-argument ++endif ++ ++LDFLAGS = -lelf -lz -lpthread -lzstd ++ ++IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - </dev/null | \ ++ grep 'define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__') ++ ++# Get Clang's default includes on this system, as opposed to those seen by ++# '-target bpf'. This fixes "missing" files on some architectures/distros, ++# such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc. ++# ++# Use '-idirafter': Don't interfere with include mechanics except where the ++# build would have failed anyways. 
++define get_sys_includes ++$(shell $(1) -v -E - </dev/null 2>&1 \ ++ | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ ++$(shell $(1) -dM -E - </dev/null | grep '__riscv_xlen ' | awk '{printf("-D__riscv_xlen=%d -D__BITS_PER_LONG=%d", $$3, $$3)}') ++endef ++ ++BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) \ ++ $(if $(IS_LITTLE_ENDIAN),-mlittle-endian,-mbig-endian) \ ++ -I$(CURDIR)/include -I$(CURDIR)/include/bpf-compat \ ++ -I$(INCLUDE_DIR) -I$(APIDIR) -I$(SCXTOOLSINCDIR) \ ++ -I$(REPOROOT)/include \ ++ $(call get_sys_includes,$(CLANG)) \ ++ -Wall -Wno-compare-distinct-pointer-types \ ++ -Wno-incompatible-function-pointer-types \ ++ -O2 -mcpu=v3 ++ ++# sort removes libbpf duplicates when not cross-building ++MAKE_DIRS := $(sort $(OBJ_DIR)/libbpf $(OBJ_DIR)/libbpf \ ++ $(OBJ_DIR)/bpftool $(OBJ_DIR)/resolve_btfids \ ++ $(INCLUDE_DIR) $(SCXOBJ_DIR)) ++ ++$(MAKE_DIRS): ++ $(call msg,MKDIR,,$@) ++ $(Q)mkdir -p $@ ++ ++$(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ ++ $(APIDIR)/linux/bpf.h \ ++ | $(OBJ_DIR)/libbpf ++ $(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) OUTPUT=$(OBJ_DIR)/libbpf/ \ ++ EXTRA_CFLAGS='-g -O0 -fPIC' \ ++ DESTDIR=$(OUTPUT_DIR) prefix= all install_headers ++ ++$(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ ++ $(LIBBPF_OUTPUT) | $(OBJ_DIR)/bpftool ++ $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOLDIR) \ ++ ARCH= CROSS_COMPILE= CC=$(HOSTCC) LD=$(HOSTLD) \ ++ EXTRA_CFLAGS='-g -O0' \ ++ OUTPUT=$(OBJ_DIR)/bpftool/ \ ++ LIBBPF_OUTPUT=$(OBJ_DIR)/libbpf/ \ ++ LIBBPF_DESTDIR=$(OUTPUT_DIR)/ \ ++ prefix= DESTDIR=$(OUTPUT_DIR)/ install-bin ++ ++$(INCLUDE_DIR)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) | $(INCLUDE_DIR) ++ifeq ($(VMLINUX_H),) ++ $(call msg,GEN,,$@) ++ $(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $@ ++else ++ $(call msg,CP,,$@) ++ $(Q)cp "$(VMLINUX_H)" $@ ++endif ++ ++$(SCXOBJ_DIR)/%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h | $(BPFOBJ) $(SCXOBJ_DIR) ++ $(call 
msg,CLNG-BPF,,$(notdir $@)) ++ $(Q)$(CLANG) $(BPF_CFLAGS) -target bpf -c $< -o $@ ++ ++$(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BPFTOOL) | $(INCLUDE_DIR) ++ $(eval sched=$(notdir $@)) ++ $(call msg,GEN-SKEL,,$(sched)) ++ $(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $< ++ $(Q)$(BPFTOOL) gen object $(<:.o=.linked2.o) $(<:.o=.linked1.o) ++ $(Q)$(BPFTOOL) gen object $(<:.o=.linked3.o) $(<:.o=.linked2.o) ++ $(Q)diff $(<:.o=.linked2.o) $(<:.o=.linked3.o) ++ $(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $@ ++ $(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $(@:.skel.h=.subskel.h) ++ ++################ ++# C schedulers # ++################ ++ ++override define CLEAN ++ rm -rf $(OUTPUT_DIR) ++ rm -f *.o *.bpf.o *.bpf.skel.h *.bpf.subskel.h ++ rm -f $(TEST_GEN_PROGS) ++ rm -f runner ++endef ++ ++# Every testcase takes all of the BPF progs as dependencies by default. This ++# allows testcases to load any BPF scheduler, which is useful for testcases ++# that don't need their own prog to run their test. 
++all_test_bpfprogs := $(foreach prog,$(wildcard *.bpf.c),$(INCLUDE_DIR)/$(patsubst %.c,%.skel.h,$(prog))) ++ ++auto-test-targets := \ ++ create_dsq \ ++ enq_last_no_enq_fails \ ++ enq_select_cpu_fails \ ++ ddsp_bogus_dsq_fail \ ++ ddsp_vtimelocal_fail \ ++ dsp_local_on \ ++ exit \ ++ hotplug \ ++ init_enable_count \ ++ maximal \ ++ maybe_null \ ++ minimal \ ++ prog_run \ ++ reload_loop \ ++ select_cpu_dfl \ ++ select_cpu_dfl_nodispatch \ ++ select_cpu_dispatch \ ++ select_cpu_dispatch_bad_dsq \ ++ select_cpu_dispatch_dbl_dsp \ ++ select_cpu_vtime \ ++ test_example \ ++ ++testcase-targets := $(addsuffix .o,$(addprefix $(SCXOBJ_DIR)/,$(auto-test-targets))) ++ ++$(SCXOBJ_DIR)/runner.o: runner.c | $(SCXOBJ_DIR) ++ $(CC) $(CFLAGS) -c $< -o $@ ++ ++# Create all of the test targets object files, whose testcase objects will be ++# registered into the runner in ELF constructors. ++# ++# Note that we must do double expansion here in order to support conditionally ++# compiling BPF object files only if one is present, as the wildcard Make ++# function doesn't support using implicit rules otherwise. 
++$(testcase-targets): $(SCXOBJ_DIR)/%.o: %.c $(SCXOBJ_DIR)/runner.o $(all_test_bpfprogs) | $(SCXOBJ_DIR) ++ $(eval test=$(patsubst %.o,%.c,$(notdir $@))) ++ $(CC) $(CFLAGS) -c $< -o $@ $(SCXOBJ_DIR)/runner.o ++ ++$(SCXOBJ_DIR)/util.o: util.c | $(SCXOBJ_DIR) ++ $(CC) $(CFLAGS) -c $< -o $@ ++ ++runner: $(SCXOBJ_DIR)/runner.o $(SCXOBJ_DIR)/util.o $(BPFOBJ) $(testcase-targets) ++ @echo "$(testcase-targets)" ++ $(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS) ++ ++TEST_GEN_PROGS := runner ++ ++all: runner ++ ++.PHONY: all clean help ++ ++.DEFAULT_GOAL := all ++ ++.DELETE_ON_ERROR: ++ ++.SECONDARY: +diff --git a/tools/testing/selftests/sched_ext/config b/tools/testing/selftests/sched_ext/config +new file mode 100644 +index 000000000000..0de9b4ee249d +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/config +@@ -0,0 +1,9 @@ ++CONFIG_SCHED_DEBUG=y ++CONFIG_SCHED_CLASS_EXT=y ++CONFIG_CGROUPS=y ++CONFIG_CGROUP_SCHED=y ++CONFIG_EXT_GROUP_SCHED=y ++CONFIG_BPF=y ++CONFIG_BPF_SYSCALL=y ++CONFIG_DEBUG_INFO=y ++CONFIG_DEBUG_INFO_BTF=y +diff --git a/tools/testing/selftests/sched_ext/create_dsq.bpf.c b/tools/testing/selftests/sched_ext/create_dsq.bpf.c +new file mode 100644 +index 000000000000..23f79ed343f0 +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/create_dsq.bpf.c +@@ -0,0 +1,58 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Create and destroy DSQs in a loop. ++ * ++ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
++ * Copyright (c) 2024 David Vernet <dvernet@meta.com> ++ */ ++ ++#include <scx/common.bpf.h> ++ ++char _license[] SEC("license") = "GPL"; ++ ++void BPF_STRUCT_OPS(create_dsq_exit_task, struct task_struct *p, ++ struct scx_exit_task_args *args) ++{ ++ scx_bpf_destroy_dsq(p->pid); ++} ++ ++s32 BPF_STRUCT_OPS_SLEEPABLE(create_dsq_init_task, struct task_struct *p, ++ struct scx_init_task_args *args) ++{ ++ s32 err; ++ ++ err = scx_bpf_create_dsq(p->pid, -1); ++ if (err) ++ scx_bpf_error("Failed to create DSQ for %s[%d]", ++ p->comm, p->pid); ++ ++ return err; ++} ++ ++s32 BPF_STRUCT_OPS_SLEEPABLE(create_dsq_init) ++{ ++ u32 i; ++ s32 err; ++ ++ bpf_for(i, 0, 1024) { ++ err = scx_bpf_create_dsq(i, -1); ++ if (err) { ++ scx_bpf_error("Failed to create DSQ %d", i); ++ return 0; ++ } ++ } ++ ++ bpf_for(i, 0, 1024) { ++ scx_bpf_destroy_dsq(i); ++ } ++ ++ return 0; ++} ++ ++SEC(".struct_ops.link") ++struct sched_ext_ops create_dsq_ops = { ++ .init_task = create_dsq_init_task, ++ .exit_task = create_dsq_exit_task, ++ .init = create_dsq_init, ++ .name = "create_dsq", ++}; +diff --git a/tools/testing/selftests/sched_ext/create_dsq.c b/tools/testing/selftests/sched_ext/create_dsq.c +new file mode 100644 +index 000000000000..fa946d9146d4 +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/create_dsq.c +@@ -0,0 +1,57 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
++ * Copyright (c) 2024 David Vernet <dvernet@meta.com> ++ */ ++#include <bpf/bpf.h> ++#include <scx/common.h> ++#include <sys/wait.h> ++#include <unistd.h> ++#include "create_dsq.bpf.skel.h" ++#include "scx_test.h" ++ ++static enum scx_test_status setup(void **ctx) ++{ ++ struct create_dsq *skel; ++ ++ skel = create_dsq__open_and_load(); ++ if (!skel) { ++ SCX_ERR("Failed to open and load skel"); ++ return SCX_TEST_FAIL; ++ } ++ *ctx = skel; ++ ++ return SCX_TEST_PASS; ++} ++ ++static enum scx_test_status run(void *ctx) ++{ ++ struct create_dsq *skel = ctx; ++ struct bpf_link *link; ++ ++ link = bpf_map__attach_struct_ops(skel->maps.create_dsq_ops); ++ if (!link) { ++ SCX_ERR("Failed to attach scheduler"); ++ return SCX_TEST_FAIL; ++ } ++ ++ bpf_link__destroy(link); ++ ++ return SCX_TEST_PASS; ++} ++ ++static void cleanup(void *ctx) ++{ ++ struct create_dsq *skel = ctx; ++ ++ create_dsq__destroy(skel); ++} ++ ++struct scx_test create_dsq = { ++ .name = "create_dsq", ++ .description = "Create and destroy a dsq in a loop", ++ .setup = setup, ++ .run = run, ++ .cleanup = cleanup, ++}; ++REGISTER_SCX_TEST(&create_dsq) +diff --git a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c +new file mode 100644 +index 000000000000..e97ad41d354a +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c +@@ -0,0 +1,42 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
++ * Copyright (c) 2024 David Vernet <dvernet@meta.com> ++ * Copyright (c) 2024 Tejun Heo <tj@kernel.org> ++ */ ++#include <scx/common.bpf.h> ++ ++char _license[] SEC("license") = "GPL"; ++ ++UEI_DEFINE(uei); ++ ++s32 BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_select_cpu, struct task_struct *p, ++ s32 prev_cpu, u64 wake_flags) ++{ ++ s32 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); ++ ++ if (cpu >= 0) { ++ /* ++ * If we dispatch to a bogus DSQ that will fall back to the ++ * builtin global DSQ, we fail gracefully. ++ */ ++ scx_bpf_dispatch_vtime(p, 0xcafef00d, SCX_SLICE_DFL, ++ p->scx.dsq_vtime, 0); ++ return cpu; ++ } ++ ++ return prev_cpu; ++} ++ ++void BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_exit, struct scx_exit_info *ei) ++{ ++ UEI_RECORD(uei, ei); ++} ++ ++SEC(".struct_ops.link") ++struct sched_ext_ops ddsp_bogus_dsq_fail_ops = { ++ .select_cpu = ddsp_bogus_dsq_fail_select_cpu, ++ .exit = ddsp_bogus_dsq_fail_exit, ++ .name = "ddsp_bogus_dsq_fail", ++ .timeout_ms = 1000U, ++}; +diff --git a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c +new file mode 100644 +index 000000000000..e65d22f23f3b +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c +@@ -0,0 +1,57 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
++ * Copyright (c) 2024 David Vernet <dvernet@meta.com> ++ * Copyright (c) 2024 Tejun Heo <tj@kernel.org> ++ */ ++#include <bpf/bpf.h> ++#include <scx/common.h> ++#include <sys/wait.h> ++#include <unistd.h> ++#include "ddsp_bogus_dsq_fail.bpf.skel.h" ++#include "scx_test.h" ++ ++static enum scx_test_status setup(void **ctx) ++{ ++ struct ddsp_bogus_dsq_fail *skel; ++ ++ skel = ddsp_bogus_dsq_fail__open_and_load(); ++ SCX_FAIL_IF(!skel, "Failed to open and load skel"); ++ *ctx = skel; ++ ++ return SCX_TEST_PASS; ++} ++ ++static enum scx_test_status run(void *ctx) ++{ ++ struct ddsp_bogus_dsq_fail *skel = ctx; ++ struct bpf_link *link; ++ ++ link = bpf_map__attach_struct_ops(skel->maps.ddsp_bogus_dsq_fail_ops); ++ SCX_FAIL_IF(!link, "Failed to attach struct_ops"); ++ ++ sleep(1); ++ ++ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); ++ bpf_link__destroy(link); ++ ++ return SCX_TEST_PASS; ++} ++ ++static void cleanup(void *ctx) ++{ ++ struct ddsp_bogus_dsq_fail *skel = ctx; ++ ++ ddsp_bogus_dsq_fail__destroy(skel); ++} ++ ++struct scx_test ddsp_bogus_dsq_fail = { ++ .name = "ddsp_bogus_dsq_fail", ++ .description = "Verify we gracefully fail, and fall back to using a " ++ "built-in DSQ, if we do a direct dispatch to an invalid" ++ " DSQ in ops.select_cpu()", ++ .setup = setup, ++ .run = run, ++ .cleanup = cleanup, ++}; ++REGISTER_SCX_TEST(&ddsp_bogus_dsq_fail) +diff --git a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c +new file mode 100644 +index 000000000000..dde7e7dafbfb +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c +@@ -0,0 +1,39 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
++ * Copyright (c) 2024 David Vernet <dvernet@meta.com> ++ * Copyright (c) 2024 Tejun Heo <tj@kernel.org> ++ */ ++#include <scx/common.bpf.h> ++ ++char _license[] SEC("license") = "GPL"; ++ ++UEI_DEFINE(uei); ++ ++s32 BPF_STRUCT_OPS(ddsp_vtimelocal_fail_select_cpu, struct task_struct *p, ++ s32 prev_cpu, u64 wake_flags) ++{ ++ s32 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); ++ ++ if (cpu >= 0) { ++ /* Shouldn't be allowed to vtime dispatch to a builtin DSQ. */ ++ scx_bpf_dispatch_vtime(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, ++ p->scx.dsq_vtime, 0); ++ return cpu; ++ } ++ ++ return prev_cpu; ++} ++ ++void BPF_STRUCT_OPS(ddsp_vtimelocal_fail_exit, struct scx_exit_info *ei) ++{ ++ UEI_RECORD(uei, ei); ++} ++ ++SEC(".struct_ops.link") ++struct sched_ext_ops ddsp_vtimelocal_fail_ops = { ++ .select_cpu = ddsp_vtimelocal_fail_select_cpu, ++ .exit = ddsp_vtimelocal_fail_exit, ++ .name = "ddsp_vtimelocal_fail", ++ .timeout_ms = 1000U, ++}; +diff --git a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c +new file mode 100644 +index 000000000000..abafee587cd6 +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c +@@ -0,0 +1,56 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
++ * Copyright (c) 2024 David Vernet <dvernet@meta.com> ++ * Copyright (c) 2024 Tejun Heo <tj@kernel.org> ++ */ ++#include <bpf/bpf.h> ++#include <scx/common.h> ++#include <unistd.h> ++#include "ddsp_vtimelocal_fail.bpf.skel.h" ++#include "scx_test.h" ++ ++static enum scx_test_status setup(void **ctx) ++{ ++ struct ddsp_vtimelocal_fail *skel; ++ ++ skel = ddsp_vtimelocal_fail__open_and_load(); ++ SCX_FAIL_IF(!skel, "Failed to open and load skel"); ++ *ctx = skel; ++ ++ return SCX_TEST_PASS; ++} ++ ++static enum scx_test_status run(void *ctx) ++{ ++ struct ddsp_vtimelocal_fail *skel = ctx; ++ struct bpf_link *link; ++ ++ link = bpf_map__attach_struct_ops(skel->maps.ddsp_vtimelocal_fail_ops); ++ SCX_FAIL_IF(!link, "Failed to attach struct_ops"); ++ ++ sleep(1); ++ ++ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); ++ bpf_link__destroy(link); ++ ++ return SCX_TEST_PASS; ++} ++ ++static void cleanup(void *ctx) ++{ ++ struct ddsp_vtimelocal_fail *skel = ctx; ++ ++ ddsp_vtimelocal_fail__destroy(skel); ++} ++ ++struct scx_test ddsp_vtimelocal_fail = { ++ .name = "ddsp_vtimelocal_fail", ++ .description = "Verify we gracefully fail, and fall back to using a " ++ "built-in DSQ, if we do a direct vtime dispatch to a " ++ "built-in DSQ from DSQ in ops.select_cpu()", ++ .setup = setup, ++ .run = run, ++ .cleanup = cleanup, ++}; ++REGISTER_SCX_TEST(&ddsp_vtimelocal_fail) +diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c +new file mode 100644 +index 000000000000..efb4672decb4 +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c +@@ -0,0 +1,65 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
++ * Copyright (c) 2024 David Vernet <dvernet@meta.com>
++ */
++#include <scx/common.bpf.h>
++
++char _license[] SEC("license") = "GPL";
++const volatile s32 nr_cpus;
++
++UEI_DEFINE(uei);
++
++struct {
++	__uint(type, BPF_MAP_TYPE_QUEUE);
++	__uint(max_entries, 8192);
++	__type(value, s32);
++} queue SEC(".maps");
++
++s32 BPF_STRUCT_OPS(dsp_local_on_select_cpu, struct task_struct *p,
++		   s32 prev_cpu, u64 wake_flags)
++{
++	return prev_cpu;
++}
++
++void BPF_STRUCT_OPS(dsp_local_on_enqueue, struct task_struct *p,
++		    u64 enq_flags)
++{
++	s32 pid = p->pid;
++
++	if (bpf_map_push_elem(&queue, &pid, 0))
++		scx_bpf_error("Failed to enqueue %s[%d]", p->comm, p->pid);
++}
++
++void BPF_STRUCT_OPS(dsp_local_on_dispatch, s32 cpu, struct task_struct *prev)
++{
++	s32 pid, target;
++	struct task_struct *p;
++
++	if (bpf_map_pop_elem(&queue, &pid))
++		return;
++
++	p = bpf_task_from_pid(pid);
++	if (!p)
++		return;
++
++	target = bpf_get_prandom_u32() % nr_cpus;
++
++	scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | target, SCX_SLICE_DFL, 0);
++	bpf_task_release(p);
++}
++
++void BPF_STRUCT_OPS(dsp_local_on_exit, struct scx_exit_info *ei)
++{
++	UEI_RECORD(uei, ei);
++}
++
++SEC(".struct_ops.link")
++struct sched_ext_ops dsp_local_on_ops = {
++	.select_cpu		= dsp_local_on_select_cpu,
++	.enqueue		= dsp_local_on_enqueue,
++	.dispatch		= dsp_local_on_dispatch,
++	.exit			= dsp_local_on_exit,
++	.name			= "dsp_local_on",
++	.timeout_ms		= 1000U,
++};
+diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.c b/tools/testing/selftests/sched_ext/dsp_local_on.c
+new file mode 100644
+index 000000000000..472851b56854
+--- /dev/null
++++ b/tools/testing/selftests/sched_ext/dsp_local_on.c
+@@ -0,0 +1,81 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
++ * Copyright (c) 2024 David Vernet <dvernet@meta.com>
++ */
++#include <bpf/bpf.h>
++#include <scx/common.h>
++#include <unistd.h>
++#include "dsp_local_on.bpf.skel.h"
++#include "scx_test.h"
++
++/*
++ * Open the skeleton, publish the possible-CPU count to the BPF side through
++ * rodata, then load it. The skeleton is handed to the caller via *ctx only on
++ * success, so every failure path after open must destroy it here.
++ */
++static enum scx_test_status setup(void **ctx)
++{
++	struct dsp_local_on *skel;
++	int nr_cpus;
++
++	skel = dsp_local_on__open();
++	SCX_FAIL_IF(!skel, "Failed to open");
++
++	/* Used as a modulus in ops.dispatch(), so it must be positive. */
++	nr_cpus = libbpf_num_possible_cpus();
++	if (nr_cpus <= 0) {
++		SCX_ERR("Failed to get number of possible CPUs");
++		dsp_local_on__destroy(skel);
++		return SCX_TEST_FAIL;
++	}
++	skel->rodata->nr_cpus = nr_cpus;
++
++	if (dsp_local_on__load(skel)) {
++		SCX_ERR("Failed to load skel");
++		dsp_local_on__destroy(skel);
++		return SCX_TEST_FAIL;
++	}
++	*ctx = skel;
++
++	return SCX_TEST_PASS;
++}
++
++/*
++ * Attach the scheduler, let it run for a second, then check the exit kind
++ * recorded by ops.exit().
++ */
++static enum scx_test_status run(void *ctx)
++{
++	struct dsp_local_on *skel = ctx;
++	struct bpf_link *link;
++
++	link = bpf_map__attach_struct_ops(skel->maps.dsp_local_on_ops);
++	SCX_FAIL_IF(!link, "Failed to attach struct_ops");
++
++	/* Just sleeping is fine, plenty of scheduling events happening */
++	sleep(1);
++
++	SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR));
++	bpf_link__destroy(link);
++
++	return SCX_TEST_PASS;
++}
++
++/* Free the skeleton created by setup(). */
++static void cleanup(void *ctx)
++{
++	struct dsp_local_on *skel = ctx;
++
++	dsp_local_on__destroy(skel);
++}
++
++struct scx_test dsp_local_on = {
++	.name = "dsp_local_on",
++	.description = "Verify we can directly dispatch tasks to a local DSQs "
++		       "from ops.dispatch()",
++	.setup = setup,
++	.run = run,
++	.cleanup = cleanup,
++};
++REGISTER_SCX_TEST(&dsp_local_on)
+diff --git a/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.bpf.c b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.bpf.c
+new file mode 100644
+index 000000000000..b0b99531d5d5
+--- /dev/null
++++ b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.bpf.c
+@@ -0,0 +1,21 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * A scheduler that sets SCX_OPS_ENQ_LAST without defining ops.enqueue(). The
++ * accompanying test expects attaching it to fail.
++ *
++ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates.
++ * Copyright (c) 2023 David Vernet <dvernet@meta.com>
++ * Copyright (c) 2023 Tejun Heo <tj@kernel.org>
++ */
++
++#include <scx/common.bpf.h>
++
++char _license[] SEC("license") = "GPL";
++
++SEC(".struct_ops.link")
++struct sched_ext_ops enq_last_no_enq_fails_ops = {
++	.name			= "enq_last_no_enq_fails",
++	/* Need to define ops.enqueue() with SCX_OPS_ENQ_LAST */
++	.flags			= SCX_OPS_ENQ_LAST,
++	.timeout_ms		= 1000U,
++};
+diff --git a/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c
+new file mode 100644
+index 000000000000..2a3eda5e2c0b
+--- /dev/null
++++ b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c
+@@ -0,0 +1,66 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates.
++ * Copyright (c) 2023 David Vernet <dvernet@meta.com>
++ * Copyright (c) 2023 Tejun Heo <tj@kernel.org>
++ */
++#include <bpf/bpf.h>
++#include <scx/common.h>
++#include <sys/wait.h>
++#include <unistd.h>
++#include "enq_last_no_enq_fails.bpf.skel.h"
++#include "scx_test.h"
++
++/* Open and load the skeleton; it is handed to run()/cleanup() via *ctx. */
++static enum scx_test_status setup(void **ctx)
++{
++	struct enq_last_no_enq_fails *skel;
++
++	skel = enq_last_no_enq_fails__open_and_load();
++	if (!skel) {
++		SCX_ERR("Failed to open and load skel");
++		return SCX_TEST_FAIL;
++	}
++	*ctx = skel;
++
++	return SCX_TEST_PASS;
++}
++
++/*
++ * Attaching must fail: the ops set SCX_OPS_ENQ_LAST but define no
++ * ops.enqueue().
++ */
++static enum scx_test_status run(void *ctx)
++{
++	struct enq_last_no_enq_fails *skel = ctx;
++	struct bpf_link *link;
++
++	link = bpf_map__attach_struct_ops(skel->maps.enq_last_no_enq_fails_ops);
++	if (link) {
++		SCX_ERR("Incorrectly succeeded in to attaching scheduler");
++		/* Detach the unexpectedly attached scheduler instead of leaking it. */
++		bpf_link__destroy(link);
++		return SCX_TEST_FAIL;
++	}
++
++	return SCX_TEST_PASS;
++}
++
++/* Free the skeleton created by setup(). */
++static void cleanup(void *ctx)
++{
++	struct enq_last_no_enq_fails *skel = ctx;
++
++	enq_last_no_enq_fails__destroy(skel);
++}
++
++struct scx_test enq_last_no_enq_fails = {
++	.name = "enq_last_no_enq_fails",
++	.description = "Verify we
fail to load a scheduler if we specify " ++ "the SCX_OPS_ENQ_LAST flag without defining " ++ "ops.enqueue()", ++ .setup = setup, ++ .run = run, ++ .cleanup = cleanup, ++}; ++REGISTER_SCX_TEST(&enq_last_no_enq_fails) +diff --git a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c +new file mode 100644 +index 000000000000..b3dfc1033cd6 +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c +@@ -0,0 +1,43 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. ++ * Copyright (c) 2023 David Vernet <dvernet@meta.com> ++ * Copyright (c) 2023 Tejun Heo <tj@kernel.org> ++ */ ++ ++#include <scx/common.bpf.h> ++ ++char _license[] SEC("license") = "GPL"; ++ ++/* Manually specify the signature until the kfunc is added to the scx repo. */ ++s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, ++ bool *found) __ksym; ++ ++s32 BPF_STRUCT_OPS(enq_select_cpu_fails_select_cpu, struct task_struct *p, ++ s32 prev_cpu, u64 wake_flags) ++{ ++ return prev_cpu; ++} ++ ++void BPF_STRUCT_OPS(enq_select_cpu_fails_enqueue, struct task_struct *p, ++ u64 enq_flags) ++{ ++ /* ++ * Need to initialize the variable or the verifier will fail to load. ++ * Improving these semantics is actively being worked on. 
++ */ ++ bool found = false; ++ ++ /* Can only call from ops.select_cpu() */ ++ scx_bpf_select_cpu_dfl(p, 0, 0, &found); ++ ++ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); ++} ++ ++SEC(".struct_ops.link") ++struct sched_ext_ops enq_select_cpu_fails_ops = { ++ .select_cpu = enq_select_cpu_fails_select_cpu, ++ .enqueue = enq_select_cpu_fails_enqueue, ++ .name = "enq_select_cpu_fails", ++ .timeout_ms = 1000U, ++}; +diff --git a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c +new file mode 100644 +index 000000000000..dd1350e5f002 +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c +@@ -0,0 +1,61 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. ++ * Copyright (c) 2023 David Vernet <dvernet@meta.com> ++ * Copyright (c) 2023 Tejun Heo <tj@kernel.org> ++ */ ++#include <bpf/bpf.h> ++#include <scx/common.h> ++#include <sys/wait.h> ++#include <unistd.h> ++#include "enq_select_cpu_fails.bpf.skel.h" ++#include "scx_test.h" ++ ++static enum scx_test_status setup(void **ctx) ++{ ++ struct enq_select_cpu_fails *skel; ++ ++ skel = enq_select_cpu_fails__open_and_load(); ++ if (!skel) { ++ SCX_ERR("Failed to open and load skel"); ++ return SCX_TEST_FAIL; ++ } ++ *ctx = skel; ++ ++ return SCX_TEST_PASS; ++} ++ ++static enum scx_test_status run(void *ctx) ++{ ++ struct enq_select_cpu_fails *skel = ctx; ++ struct bpf_link *link; ++ ++ link = bpf_map__attach_struct_ops(skel->maps.enq_select_cpu_fails_ops); ++ if (!link) { ++ SCX_ERR("Failed to attach scheduler"); ++ return SCX_TEST_FAIL; ++ } ++ ++ sleep(1); ++ ++ bpf_link__destroy(link); ++ ++ return SCX_TEST_PASS; ++} ++ ++static void cleanup(void *ctx) ++{ ++ struct enq_select_cpu_fails *skel = ctx; ++ ++ enq_select_cpu_fails__destroy(skel); ++} ++ ++struct scx_test enq_select_cpu_fails = { ++ .name = "enq_select_cpu_fails", ++ .description = "Verify we fail to 
call scx_bpf_select_cpu_dfl() " ++ "from ops.enqueue()", ++ .setup = setup, ++ .run = run, ++ .cleanup = cleanup, ++}; ++REGISTER_SCX_TEST(&enq_select_cpu_fails) +diff --git a/tools/testing/selftests/sched_ext/exit.bpf.c b/tools/testing/selftests/sched_ext/exit.bpf.c +new file mode 100644 +index 000000000000..ae12ddaac921 +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/exit.bpf.c +@@ -0,0 +1,84 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. ++ * Copyright (c) 2024 David Vernet <dvernet@meta.com> ++ */ ++ ++#include <scx/common.bpf.h> ++ ++char _license[] SEC("license") = "GPL"; ++ ++#include "exit_test.h" ++ ++const volatile int exit_point; ++UEI_DEFINE(uei); ++ ++#define EXIT_CLEANLY() scx_bpf_exit(exit_point, "%d", exit_point) ++ ++s32 BPF_STRUCT_OPS(exit_select_cpu, struct task_struct *p, ++ s32 prev_cpu, u64 wake_flags) ++{ ++ bool found; ++ ++ if (exit_point == EXIT_SELECT_CPU) ++ EXIT_CLEANLY(); ++ ++ return scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &found); ++} ++ ++void BPF_STRUCT_OPS(exit_enqueue, struct task_struct *p, u64 enq_flags) ++{ ++ if (exit_point == EXIT_ENQUEUE) ++ EXIT_CLEANLY(); ++ ++ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); ++} ++ ++void BPF_STRUCT_OPS(exit_dispatch, s32 cpu, struct task_struct *p) ++{ ++ if (exit_point == EXIT_DISPATCH) ++ EXIT_CLEANLY(); ++ ++ scx_bpf_consume(SCX_DSQ_GLOBAL); ++} ++ ++void BPF_STRUCT_OPS(exit_enable, struct task_struct *p) ++{ ++ if (exit_point == EXIT_ENABLE) ++ EXIT_CLEANLY(); ++} ++ ++s32 BPF_STRUCT_OPS(exit_init_task, struct task_struct *p, ++ struct scx_init_task_args *args) ++{ ++ if (exit_point == EXIT_INIT_TASK) ++ EXIT_CLEANLY(); ++ ++ return 0; ++} ++ ++void BPF_STRUCT_OPS(exit_exit, struct scx_exit_info *ei) ++{ ++ UEI_RECORD(uei, ei); ++} ++ ++s32 BPF_STRUCT_OPS_SLEEPABLE(exit_init) ++{ ++ if (exit_point == EXIT_INIT) ++ EXIT_CLEANLY(); ++ ++ return 0; ++} ++ ++SEC(".struct_ops.link") ++struct 
sched_ext_ops exit_ops = {
++	.select_cpu		= exit_select_cpu,
++	.enqueue		= exit_enqueue,
++	.dispatch		= exit_dispatch,
++	.init_task		= exit_init_task,
++	.enable			= exit_enable,
++	.exit			= exit_exit,
++	.init			= exit_init,
++	.name			= "exit",
++	.timeout_ms		= 1000U,
++};
+diff --git a/tools/testing/selftests/sched_ext/exit.c b/tools/testing/selftests/sched_ext/exit.c
+new file mode 100644
+index 000000000000..31bcd06e21cd
+--- /dev/null
++++ b/tools/testing/selftests/sched_ext/exit.c
+@@ -0,0 +1,70 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
++ * Copyright (c) 2024 David Vernet <dvernet@meta.com>
++ */
++#include <bpf/bpf.h>
++#include <sched.h>
++#include <scx/common.h>
++#include <sys/wait.h>
++#include <unistd.h>
++#include "exit.bpf.skel.h"
++#include "scx_test.h"
++
++#include "exit_test.h"
++
++/*
++ * For every exit_test_case value, load a fresh scheduler instance with
++ * exit_point set to that value, attach it, wait for the exit info to be
++ * recorded, and check the reported kind, exit code and message.
++ */
++static enum scx_test_status run(void *ctx)
++{
++	enum exit_test_case tc;
++
++	for (tc = 0; tc < NUM_EXITS; tc++) {
++		struct exit *skel;
++		struct bpf_link *link;
++		char buf[16];
++
++		skel = exit__open();
++		if (!skel) {
++			SCX_ERR("Failed to open skel");
++			return SCX_TEST_FAIL;
++		}
++
++		skel->rodata->exit_point = tc;
++		if (exit__load(skel)) {
++			SCX_ERR("Failed to load skel");
++			exit__destroy(skel);
++			return SCX_TEST_FAIL;
++		}
++
++		link = bpf_map__attach_struct_ops(skel->maps.exit_ops);
++		if (!link) {
++			SCX_ERR("Failed to attach scheduler");
++			exit__destroy(skel);
++			return SCX_TEST_FAIL;
++		}
++
++		/* Assumes uei.kind is written last */
++		while (skel->data->uei.kind == EXIT_KIND(SCX_EXIT_NONE))
++			sched_yield();
++
++		SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG_BPF));
++		SCX_EQ(skel->data->uei.exit_code, tc);
++		snprintf(buf, sizeof(buf), "%d", tc);
++		SCX_ASSERT(!strcmp(skel->data->uei.msg, buf));
++		bpf_link__destroy(link);
++		exit__destroy(skel);
++	}
++
++	return SCX_TEST_PASS;
++}
++
++struct scx_test exit_test = {
++	.name = "exit",
++	.description = "Verify we can cleanly exit a scheduler in multiple places",
++	.run = run,
++};
++REGISTER_SCX_TEST(&exit_test)
+diff --git a/tools/testing/selftests/sched_ext/exit_test.h
b/tools/testing/selftests/sched_ext/exit_test.h +new file mode 100644 +index 000000000000..94f0268b9cb8 +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/exit_test.h +@@ -0,0 +1,20 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. ++ * Copyright (c) 2024 David Vernet <dvernet@meta.com> ++ */ ++ ++#ifndef __EXIT_TEST_H__ ++#define __EXIT_TEST_H__ ++ ++enum exit_test_case { ++ EXIT_SELECT_CPU, ++ EXIT_ENQUEUE, ++ EXIT_DISPATCH, ++ EXIT_ENABLE, ++ EXIT_INIT_TASK, ++ EXIT_INIT, ++ NUM_EXITS, ++}; ++ ++#endif // # __EXIT_TEST_H__ +diff --git a/tools/testing/selftests/sched_ext/hotplug.bpf.c b/tools/testing/selftests/sched_ext/hotplug.bpf.c +new file mode 100644 +index 000000000000..8f2601db39f3 +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/hotplug.bpf.c +@@ -0,0 +1,61 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. ++ * Copyright (c) 2024 David Vernet <dvernet@meta.com> ++ */ ++ ++#include <scx/common.bpf.h> ++ ++char _license[] SEC("license") = "GPL"; ++ ++#include "hotplug_test.h" ++ ++UEI_DEFINE(uei); ++ ++void BPF_STRUCT_OPS(hotplug_exit, struct scx_exit_info *ei) ++{ ++ UEI_RECORD(uei, ei); ++} ++ ++static void exit_from_hotplug(s32 cpu, bool onlining) ++{ ++ /* ++ * Ignored, just used to verify that we can invoke blocking kfuncs ++ * from the hotplug path. ++ */ ++ scx_bpf_create_dsq(0, -1); ++ ++ s64 code = SCX_ECODE_ACT_RESTART | HOTPLUG_EXIT_RSN; ++ ++ if (onlining) ++ code |= HOTPLUG_ONLINING; ++ ++ scx_bpf_exit(code, "hotplug event detected (%d going %s)", cpu, ++ onlining ? 
"online" : "offline"); ++} ++ ++void BPF_STRUCT_OPS_SLEEPABLE(hotplug_cpu_online, s32 cpu) ++{ ++ exit_from_hotplug(cpu, true); ++} ++ ++void BPF_STRUCT_OPS_SLEEPABLE(hotplug_cpu_offline, s32 cpu) ++{ ++ exit_from_hotplug(cpu, false); ++} ++ ++SEC(".struct_ops.link") ++struct sched_ext_ops hotplug_cb_ops = { ++ .cpu_online = hotplug_cpu_online, ++ .cpu_offline = hotplug_cpu_offline, ++ .exit = hotplug_exit, ++ .name = "hotplug_cbs", ++ .timeout_ms = 1000U, ++}; ++ ++SEC(".struct_ops.link") ++struct sched_ext_ops hotplug_nocb_ops = { ++ .exit = hotplug_exit, ++ .name = "hotplug_nocbs", ++ .timeout_ms = 1000U, ++}; +diff --git a/tools/testing/selftests/sched_ext/hotplug.c b/tools/testing/selftests/sched_ext/hotplug.c +new file mode 100644 +index 000000000000..87bf220b1bce +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/hotplug.c +@@ -0,0 +1,168 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. ++ * Copyright (c) 2024 David Vernet <dvernet@meta.com> ++ */ ++#include <bpf/bpf.h> ++#include <sched.h> ++#include <scx/common.h> ++#include <sched.h> ++#include <sys/wait.h> ++#include <unistd.h> ++ ++#include "hotplug_test.h" ++#include "hotplug.bpf.skel.h" ++#include "scx_test.h" ++#include "util.h" ++ ++const char *online_path = "/sys/devices/system/cpu/cpu1/online"; ++ ++static bool is_cpu_online(void) ++{ ++ return file_read_long(online_path) > 0; ++} ++ ++static void toggle_online_status(bool online) ++{ ++ long val = online ? 1 : 0; ++ int ret; ++ ++ ret = file_write_long(online_path, val); ++ if (ret != 0) ++ fprintf(stderr, "Failed to bring CPU %s (%s)", ++ online ? 
"online" : "offline", strerror(errno)); ++} ++ ++static enum scx_test_status setup(void **ctx) ++{ ++ if (!is_cpu_online()) ++ return SCX_TEST_SKIP; ++ ++ return SCX_TEST_PASS; ++} ++ ++static enum scx_test_status test_hotplug(bool onlining, bool cbs_defined) ++{ ++ struct hotplug *skel; ++ struct bpf_link *link; ++ long kind, code; ++ ++ SCX_ASSERT(is_cpu_online()); ++ ++ skel = hotplug__open_and_load(); ++ SCX_ASSERT(skel); ++ ++ /* Testing the offline -> online path, so go offline before starting */ ++ if (onlining) ++ toggle_online_status(0); ++ ++ if (cbs_defined) { ++ kind = SCX_KIND_VAL(SCX_EXIT_UNREG_BPF); ++ code = SCX_ECODE_VAL(SCX_ECODE_ACT_RESTART) | HOTPLUG_EXIT_RSN; ++ if (onlining) ++ code |= HOTPLUG_ONLINING; ++ } else { ++ kind = SCX_KIND_VAL(SCX_EXIT_UNREG_KERN); ++ code = SCX_ECODE_VAL(SCX_ECODE_ACT_RESTART) | ++ SCX_ECODE_VAL(SCX_ECODE_RSN_HOTPLUG); ++ } ++ ++ if (cbs_defined) ++ link = bpf_map__attach_struct_ops(skel->maps.hotplug_cb_ops); ++ else ++ link = bpf_map__attach_struct_ops(skel->maps.hotplug_nocb_ops); ++ ++ if (!link) { ++ SCX_ERR("Failed to attach scheduler"); ++ hotplug__destroy(skel); ++ return SCX_TEST_FAIL; ++ } ++ ++ toggle_online_status(onlining ? 
1 : 0); ++ ++ while (!UEI_EXITED(skel, uei)) ++ sched_yield(); ++ ++ SCX_EQ(skel->data->uei.kind, kind); ++ SCX_EQ(UEI_REPORT(skel, uei), code); ++ ++ if (!onlining) ++ toggle_online_status(1); ++ ++ bpf_link__destroy(link); ++ hotplug__destroy(skel); ++ ++ return SCX_TEST_PASS; ++} ++ ++static enum scx_test_status test_hotplug_attach(void) ++{ ++ struct hotplug *skel; ++ struct bpf_link *link; ++ enum scx_test_status status = SCX_TEST_PASS; ++ long kind, code; ++ ++ SCX_ASSERT(is_cpu_online()); ++ SCX_ASSERT(scx_hotplug_seq() > 0); ++ ++ skel = SCX_OPS_OPEN(hotplug_nocb_ops, hotplug); ++ SCX_ASSERT(skel); ++ ++ SCX_OPS_LOAD(skel, hotplug_nocb_ops, hotplug, uei); ++ ++ /* ++ * Take the CPU offline to increment the global hotplug seq, which ++ * should cause attach to fail due to us setting the hotplug seq above ++ */ ++ toggle_online_status(0); ++ link = bpf_map__attach_struct_ops(skel->maps.hotplug_nocb_ops); ++ ++ toggle_online_status(1); ++ ++ SCX_ASSERT(link); ++ while (!UEI_EXITED(skel, uei)) ++ sched_yield(); ++ ++ kind = SCX_KIND_VAL(SCX_EXIT_UNREG_KERN); ++ code = SCX_ECODE_VAL(SCX_ECODE_ACT_RESTART) | ++ SCX_ECODE_VAL(SCX_ECODE_RSN_HOTPLUG); ++ SCX_EQ(skel->data->uei.kind, kind); ++ SCX_EQ(UEI_REPORT(skel, uei), code); ++ ++ bpf_link__destroy(link); ++ hotplug__destroy(skel); ++ ++ return status; ++} ++ ++static enum scx_test_status run(void *ctx) ++{ ++ ++#define HP_TEST(__onlining, __cbs_defined) ({ \ ++ if (test_hotplug(__onlining, __cbs_defined) != SCX_TEST_PASS) \ ++ return SCX_TEST_FAIL; \ ++}) ++ ++ HP_TEST(true, true); ++ HP_TEST(false, true); ++ HP_TEST(true, false); ++ HP_TEST(false, false); ++ ++#undef HP_TEST ++ ++ return test_hotplug_attach(); ++} ++ ++static void cleanup(void *ctx) ++{ ++ toggle_online_status(1); ++} ++ ++struct scx_test hotplug_test = { ++ .name = "hotplug", ++ .description = "Verify hotplug behavior", ++ .setup = setup, ++ .run = run, ++ .cleanup = cleanup, ++}; ++REGISTER_SCX_TEST(&hotplug_test) +diff --git 
a/tools/testing/selftests/sched_ext/hotplug_test.h b/tools/testing/selftests/sched_ext/hotplug_test.h +new file mode 100644 +index 000000000000..73d236f90787 +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/hotplug_test.h +@@ -0,0 +1,15 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. ++ * Copyright (c) 2024 David Vernet <dvernet@meta.com> ++ */ ++ ++#ifndef __HOTPLUG_TEST_H__ ++#define __HOTPLUG_TEST_H__ ++ ++enum hotplug_test_flags { ++ HOTPLUG_EXIT_RSN = 1LLU << 0, ++ HOTPLUG_ONLINING = 1LLU << 1, ++}; ++ ++#endif // # __HOTPLUG_TEST_H__ +diff --git a/tools/testing/selftests/sched_ext/init_enable_count.bpf.c b/tools/testing/selftests/sched_ext/init_enable_count.bpf.c +new file mode 100644 +index 000000000000..47ea89a626c3 +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/init_enable_count.bpf.c +@@ -0,0 +1,53 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * A scheduler that verifies that we do proper counting of init, enable, etc ++ * callbacks. ++ * ++ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
++ * Copyright (c) 2023 David Vernet <dvernet@meta.com>
++ * Copyright (c) 2023 Tejun Heo <tj@kernel.org>
++ */
++
++#include <scx/common.bpf.h>
++
++char _license[] SEC("license") = "GPL";
++
++u64 init_task_cnt, exit_task_cnt, enable_cnt, disable_cnt;
++u64 init_fork_cnt, init_transition_cnt;
++
++s32 BPF_STRUCT_OPS_SLEEPABLE(cnt_init_task, struct task_struct *p,
++			     struct scx_init_task_args *args)
++{
++	__sync_fetch_and_add(&init_task_cnt, 1);
++
++	if (args->fork)
++		__sync_fetch_and_add(&init_fork_cnt, 1);
++	else
++		__sync_fetch_and_add(&init_transition_cnt, 1);
++
++	return 0;
++}
++
++void BPF_STRUCT_OPS(cnt_exit_task, struct task_struct *p)
++{
++	__sync_fetch_and_add(&exit_task_cnt, 1);
++}
++
++void BPF_STRUCT_OPS(cnt_enable, struct task_struct *p)
++{
++	__sync_fetch_and_add(&enable_cnt, 1);
++}
++
++void BPF_STRUCT_OPS(cnt_disable, struct task_struct *p)
++{
++	__sync_fetch_and_add(&disable_cnt, 1);
++}
++
++SEC(".struct_ops.link")
++struct sched_ext_ops init_enable_count_ops = {
++	.init_task	= cnt_init_task,
++	.exit_task	= cnt_exit_task,
++	.enable		= cnt_enable,
++	.disable	= cnt_disable,
++	.name		= "init_enable_count",
++};
+diff --git a/tools/testing/selftests/sched_ext/init_enable_count.c b/tools/testing/selftests/sched_ext/init_enable_count.c
+new file mode 100644
+index 000000000000..97d45f1e5597
+--- /dev/null
++++ b/tools/testing/selftests/sched_ext/init_enable_count.c
+@@ -0,0 +1,176 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates.
++ * Copyright (c) 2023 David Vernet <dvernet@meta.com>
++ * Copyright (c) 2023 Tejun Heo <tj@kernel.org>
++ */
++#include <stdio.h>
++#include <unistd.h>
++#include <sched.h>
++#include <bpf/bpf.h>
++#include <scx/common.h>
++#include <sys/wait.h>
++#include "scx_test.h"
++#include "init_enable_count.bpf.skel.h"
++
++#define SCHED_EXT 7
++
++/*
++ * Open and load the skeleton. When @global is false, SCX_OPS_SWITCH_PARTIAL is
++ * set in the ops flags before loading.
++ */
++static struct init_enable_count *
++open_load_prog(bool global)
++{
++	struct init_enable_count *skel;
++
++	skel = init_enable_count__open();
++	SCX_BUG_ON(!skel, "Failed to open skel");
++
++	if (!global)
++		skel->struct_ops.init_enable_count_ops->flags |= SCX_OPS_SWITCH_PARTIAL;
++
++	SCX_BUG_ON(init_enable_count__load(skel), "Failed to load skel");
++
++	return skel;
++}
++
++/*
++ * Attach the counting scheduler, churn through pre-forked, SCHED_EXT and
++ * SCHED_OTHER children, and check the callback counters the BPF side kept.
++ */
++static enum scx_test_status run_test(bool global)
++{
++	struct init_enable_count *skel;
++	struct bpf_link *link;
++	const u32 num_children = 5, num_pre_forks = 1024;
++	int ret, i, status;
++	struct sched_param param = {};
++	pid_t pids[num_pre_forks];
++
++	skel = open_load_prog(global);
++
++	/*
++	 * Fork a bunch of children before we attach the scheduler so that we
++	 * ensure (at least in practical terms) that there are more tasks that
++	 * transition from SCHED_OTHER -> SCHED_EXT than there are tasks that
++	 * take the fork() path either below or in other processes.
++	 */
++	for (i = 0; i < num_pre_forks; i++) {
++		pids[i] = fork();
++		SCX_FAIL_IF(pids[i] < 0, "Failed to fork child");
++		if (pids[i] == 0) {
++			sleep(1);
++			exit(0);
++		}
++	}
++
++	link = bpf_map__attach_struct_ops(skel->maps.init_enable_count_ops);
++	SCX_FAIL_IF(!link, "Failed to attach struct_ops");
++
++	for (i = 0; i < num_pre_forks; i++) {
++		SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i],
++			    "Failed to wait for pre-forked child\n");
++
++		SCX_FAIL_IF(status != 0, "Pre-forked child %d exited with status %d\n", i,
++			    status);
++	}
++
++	bpf_link__destroy(link);
++	SCX_GE(skel->bss->init_task_cnt, num_pre_forks);
++	SCX_GE(skel->bss->exit_task_cnt, num_pre_forks);
++
++	link = bpf_map__attach_struct_ops(skel->maps.init_enable_count_ops);
++	SCX_FAIL_IF(!link, "Failed to attach struct_ops");
++
++	/* SCHED_EXT children */
++	for (i = 0; i < num_children; i++) {
++		pids[i] = fork();
++		SCX_FAIL_IF(pids[i] < 0, "Failed to fork child");
++
++		if (pids[i] == 0) {
++			ret = sched_setscheduler(0, SCHED_EXT, &param);
++			SCX_BUG_ON(ret, "Failed to set sched to sched_ext");
++
++			/*
++			 * Reset to SCHED_OTHER for half of them. Counts for
++			 * everything should still be the same regardless, as
++			 * ops.disable() is invoked even if a task is still on
++			 * SCHED_EXT before it exits.
++			 */
++			if (i % 2 == 0) {
++				ret = sched_setscheduler(0, SCHED_OTHER, &param);
++				SCX_BUG_ON(ret, "Failed to reset sched to normal");
++			}
++			exit(0);
++		}
++	}
++	for (i = 0; i < num_children; i++) {
++		SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i],
++			    "Failed to wait for SCX child\n");
++
++		SCX_FAIL_IF(status != 0, "SCX child %d exited with status %d\n", i,
++			    status);
++	}
++
++	/* SCHED_OTHER children */
++	for (i = 0; i < num_children; i++) {
++		pids[i] = fork();
++		SCX_FAIL_IF(pids[i] < 0, "Failed to fork child");
++		if (pids[i] == 0)
++			exit(0);
++	}
++
++	for (i = 0; i < num_children; i++) {
++		SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i],
++			    "Failed to wait for normal child\n");
++
++		SCX_FAIL_IF(status != 0, "Normal child %d exited with status %d\n", i,
++			    status);
++	}
++
++	bpf_link__destroy(link);
++
++	SCX_GE(skel->bss->init_task_cnt, 2 * num_children);
++	SCX_GE(skel->bss->exit_task_cnt, 2 * num_children);
++
++	if (global) {
++		SCX_GE(skel->bss->enable_cnt, 2 * num_children);
++		SCX_GE(skel->bss->disable_cnt, 2 * num_children);
++	} else {
++		SCX_EQ(skel->bss->enable_cnt, num_children);
++		SCX_EQ(skel->bss->disable_cnt, num_children);
++	}
++	/*
++	 * We forked a ton of tasks before we attached the scheduler above, so
++	 * this should be fine. Technically it could be flaky if a ton of forks
++	 * are happening at the same time in other processes, but that should
++	 * be exceedingly unlikely.
++	 */
++	SCX_GT(skel->bss->init_transition_cnt, skel->bss->init_fork_cnt);
++	SCX_GE(skel->bss->init_fork_cnt, 2 * num_children);
++
++	init_enable_count__destroy(skel);
++
++	return SCX_TEST_PASS;
++}
++
++/* Run the test once with global == true, then once with global == false. */
++static enum scx_test_status run(void *ctx)
++{
++	enum scx_test_status status;
++
++	status = run_test(true);
++	if (status != SCX_TEST_PASS)
++		return status;
++
++	return run_test(false);
++}
++
++struct scx_test init_enable_count = {
++	.name = "init_enable_count",
++	.description = "Verify we do the correct amount of counting of init, "
++		       "enable, etc callbacks.",
++	.run = run,
++};
++REGISTER_SCX_TEST(&init_enable_count)
+diff --git a/tools/testing/selftests/sched_ext/maximal.bpf.c b/tools/testing/selftests/sched_ext/maximal.bpf.c
+new file mode 100644
+index 000000000000..44612fdaf399
+--- /dev/null
++++ b/tools/testing/selftests/sched_ext/maximal.bpf.c
+@@ -0,0 +1,132 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * A scheduler with every callback defined.
++ *
++ * This scheduler defines every callback.
++ *
++ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
++ * Copyright (c) 2024 David Vernet <dvernet@meta.com> ++ */ ++ ++#include <scx/common.bpf.h> ++ ++char _license[] SEC("license") = "GPL"; ++ ++s32 BPF_STRUCT_OPS(maximal_select_cpu, struct task_struct *p, s32 prev_cpu, ++ u64 wake_flags) ++{ ++ return prev_cpu; ++} ++ ++void BPF_STRUCT_OPS(maximal_enqueue, struct task_struct *p, u64 enq_flags) ++{ ++ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); ++} ++ ++void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags) ++{} ++ ++void BPF_STRUCT_OPS(maximal_dispatch, s32 cpu, struct task_struct *prev) ++{ ++ scx_bpf_consume(SCX_DSQ_GLOBAL); ++} ++ ++void BPF_STRUCT_OPS(maximal_runnable, struct task_struct *p, u64 enq_flags) ++{} ++ ++void BPF_STRUCT_OPS(maximal_running, struct task_struct *p) ++{} ++ ++void BPF_STRUCT_OPS(maximal_stopping, struct task_struct *p, bool runnable) ++{} ++ ++void BPF_STRUCT_OPS(maximal_quiescent, struct task_struct *p, u64 deq_flags) ++{} ++ ++bool BPF_STRUCT_OPS(maximal_yield, struct task_struct *from, ++ struct task_struct *to) ++{ ++ return false; ++} ++ ++bool BPF_STRUCT_OPS(maximal_core_sched_before, struct task_struct *a, ++ struct task_struct *b) ++{ ++ return false; ++} ++ ++void BPF_STRUCT_OPS(maximal_set_weight, struct task_struct *p, u32 weight) ++{} ++ ++void BPF_STRUCT_OPS(maximal_set_cpumask, struct task_struct *p, ++ const struct cpumask *cpumask) ++{} ++ ++void BPF_STRUCT_OPS(maximal_update_idle, s32 cpu, bool idle) ++{} ++ ++void BPF_STRUCT_OPS(maximal_cpu_acquire, s32 cpu, ++ struct scx_cpu_acquire_args *args) ++{} ++ ++void BPF_STRUCT_OPS(maximal_cpu_release, s32 cpu, ++ struct scx_cpu_release_args *args) ++{} ++ ++void BPF_STRUCT_OPS(maximal_cpu_online, s32 cpu) ++{} ++ ++void BPF_STRUCT_OPS(maximal_cpu_offline, s32 cpu) ++{} ++ ++s32 BPF_STRUCT_OPS(maximal_init_task, struct task_struct *p, ++ struct scx_init_task_args *args) ++{ ++ return 0; ++} ++ ++void BPF_STRUCT_OPS(maximal_enable, struct task_struct *p) ++{} ++ ++void 
BPF_STRUCT_OPS(maximal_exit_task, struct task_struct *p, ++ struct scx_exit_task_args *args) ++{} ++ ++void BPF_STRUCT_OPS(maximal_disable, struct task_struct *p) ++{} ++ ++s32 BPF_STRUCT_OPS_SLEEPABLE(maximal_init) ++{ ++ return 0; ++} ++ ++void BPF_STRUCT_OPS(maximal_exit, struct scx_exit_info *info) ++{} ++ ++SEC(".struct_ops.link") ++struct sched_ext_ops maximal_ops = { ++ .select_cpu = maximal_select_cpu, ++ .enqueue = maximal_enqueue, ++ .dequeue = maximal_dequeue, ++ .dispatch = maximal_dispatch, ++ .runnable = maximal_runnable, ++ .running = maximal_running, ++ .stopping = maximal_stopping, ++ .quiescent = maximal_quiescent, ++ .yield = maximal_yield, ++ .core_sched_before = maximal_core_sched_before, ++ .set_weight = maximal_set_weight, ++ .set_cpumask = maximal_set_cpumask, ++ .update_idle = maximal_update_idle, ++ .cpu_acquire = maximal_cpu_acquire, ++ .cpu_release = maximal_cpu_release, ++ .cpu_online = maximal_cpu_online, ++ .cpu_offline = maximal_cpu_offline, ++ .init_task = maximal_init_task, ++ .enable = maximal_enable, ++ .exit_task = maximal_exit_task, ++ .disable = maximal_disable, ++ .init = maximal_init, ++ .exit = maximal_exit, ++ .name = "maximal", ++}; +diff --git a/tools/testing/selftests/sched_ext/maximal.c b/tools/testing/selftests/sched_ext/maximal.c +new file mode 100644 +index 000000000000..f38fc973c380 +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/maximal.c +@@ -0,0 +1,51 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
++ * Copyright (c) 2024 David Vernet <dvernet@meta.com> ++ */ ++#include <bpf/bpf.h> ++#include <scx/common.h> ++#include <sys/wait.h> ++#include <unistd.h> ++#include "maximal.bpf.skel.h" ++#include "scx_test.h" ++ ++static enum scx_test_status setup(void **ctx) ++{ ++ struct maximal *skel; ++ ++ skel = maximal__open_and_load(); ++ SCX_FAIL_IF(!skel, "Failed to open and load skel"); ++ *ctx = skel; ++ ++ return SCX_TEST_PASS; ++} ++ ++static enum scx_test_status run(void *ctx) ++{ ++ struct maximal *skel = ctx; ++ struct bpf_link *link; ++ ++ link = bpf_map__attach_struct_ops(skel->maps.maximal_ops); ++ SCX_FAIL_IF(!link, "Failed to attach scheduler"); ++ ++ bpf_link__destroy(link); ++ ++ return SCX_TEST_PASS; ++} ++ ++static void cleanup(void *ctx) ++{ ++ struct maximal *skel = ctx; ++ ++ maximal__destroy(skel); ++} ++ ++struct scx_test maximal = { ++ .name = "maximal", ++ .description = "Verify we can load a scheduler with every callback defined", ++ .setup = setup, ++ .run = run, ++ .cleanup = cleanup, ++}; ++REGISTER_SCX_TEST(&maximal) +diff --git a/tools/testing/selftests/sched_ext/maybe_null.bpf.c b/tools/testing/selftests/sched_ext/maybe_null.bpf.c +new file mode 100644 +index 000000000000..27d0f386acfb +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/maybe_null.bpf.c +@@ -0,0 +1,36 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
++ */ ++ ++#include <scx/common.bpf.h> ++ ++char _license[] SEC("license") = "GPL"; ++ ++u64 vtime_test; ++ ++void BPF_STRUCT_OPS(maybe_null_running, struct task_struct *p) ++{} ++ ++void BPF_STRUCT_OPS(maybe_null_success_dispatch, s32 cpu, struct task_struct *p) ++{ ++ if (p != NULL) ++ vtime_test = p->scx.dsq_vtime; ++} ++ ++bool BPF_STRUCT_OPS(maybe_null_success_yield, struct task_struct *from, ++ struct task_struct *to) ++{ ++ if (to) ++ bpf_printk("Yielding to %s[%d]", to->comm, to->pid); ++ ++ return false; ++} ++ ++SEC(".struct_ops.link") ++struct sched_ext_ops maybe_null_success = { ++ .dispatch = maybe_null_success_dispatch, ++ .yield = maybe_null_success_yield, ++ .enable = maybe_null_running, ++ .name = "minimal", ++}; +diff --git a/tools/testing/selftests/sched_ext/maybe_null.c b/tools/testing/selftests/sched_ext/maybe_null.c +new file mode 100644 +index 000000000000..31cfafb0cf65 +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/maybe_null.c +@@ -0,0 +1,49 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
++ */ ++#include <bpf/bpf.h> ++#include <scx/common.h> ++#include <sys/wait.h> ++#include <unistd.h> ++#include "maybe_null.bpf.skel.h" ++#include "maybe_null_fail_dsp.bpf.skel.h" ++#include "maybe_null_fail_yld.bpf.skel.h" ++#include "scx_test.h" ++ ++static enum scx_test_status run(void *ctx) ++{ ++ struct maybe_null *skel; ++ struct maybe_null_fail_dsp *fail_dsp; ++ struct maybe_null_fail_yld *fail_yld; ++ ++ skel = maybe_null__open_and_load(); ++ if (!skel) { ++ SCX_ERR("Failed to open and load maybe_null skel"); ++ return SCX_TEST_FAIL; ++ } ++ maybe_null__destroy(skel); ++ ++ fail_dsp = maybe_null_fail_dsp__open_and_load(); ++ if (fail_dsp) { ++ maybe_null_fail_dsp__destroy(fail_dsp); ++ SCX_ERR("Should failed to open and load maybe_null_fail_dsp skel"); ++ return SCX_TEST_FAIL; ++ } ++ ++ fail_yld = maybe_null_fail_yld__open_and_load(); ++ if (fail_yld) { ++ maybe_null_fail_yld__destroy(fail_yld); ++ SCX_ERR("Should failed to open and load maybe_null_fail_yld skel"); ++ return SCX_TEST_FAIL; ++ } ++ ++ return SCX_TEST_PASS; ++} ++ ++struct scx_test maybe_null = { ++ .name = "maybe_null", ++ .description = "Verify if PTR_MAYBE_NULL work for .dispatch", ++ .run = run, ++}; ++REGISTER_SCX_TEST(&maybe_null) +diff --git a/tools/testing/selftests/sched_ext/maybe_null_fail_dsp.bpf.c b/tools/testing/selftests/sched_ext/maybe_null_fail_dsp.bpf.c +new file mode 100644 +index 000000000000..c0641050271d +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/maybe_null_fail_dsp.bpf.c +@@ -0,0 +1,25 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
++ */ ++ ++#include <scx/common.bpf.h> ++ ++char _license[] SEC("license") = "GPL"; ++ ++u64 vtime_test; ++ ++void BPF_STRUCT_OPS(maybe_null_running, struct task_struct *p) ++{} ++ ++void BPF_STRUCT_OPS(maybe_null_fail_dispatch, s32 cpu, struct task_struct *p) ++{ ++ vtime_test = p->scx.dsq_vtime; ++} ++ ++SEC(".struct_ops.link") ++struct sched_ext_ops maybe_null_fail = { ++ .dispatch = maybe_null_fail_dispatch, ++ .enable = maybe_null_running, ++ .name = "maybe_null_fail_dispatch", ++}; +diff --git a/tools/testing/selftests/sched_ext/maybe_null_fail_yld.bpf.c b/tools/testing/selftests/sched_ext/maybe_null_fail_yld.bpf.c +new file mode 100644 +index 000000000000..3c1740028e3b +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/maybe_null_fail_yld.bpf.c +@@ -0,0 +1,28 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. ++ */ ++ ++#include <scx/common.bpf.h> ++ ++char _license[] SEC("license") = "GPL"; ++ ++u64 vtime_test; ++ ++void BPF_STRUCT_OPS(maybe_null_running, struct task_struct *p) ++{} ++ ++bool BPF_STRUCT_OPS(maybe_null_fail_yield, struct task_struct *from, ++ struct task_struct *to) ++{ ++ bpf_printk("Yielding to %s[%d]", to->comm, to->pid); ++ ++ return false; ++} ++ ++SEC(".struct_ops.link") ++struct sched_ext_ops maybe_null_fail = { ++ .yield = maybe_null_fail_yield, ++ .enable = maybe_null_running, ++ .name = "maybe_null_fail_yield", ++}; +diff --git a/tools/testing/selftests/sched_ext/minimal.bpf.c b/tools/testing/selftests/sched_ext/minimal.bpf.c +new file mode 100644 +index 000000000000..6a7eccef0104 +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/minimal.bpf.c +@@ -0,0 +1,21 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * A completely minimal scheduler. ++ * ++ * This scheduler defines the absolute minimal set of struct sched_ext_ops ++ * fields: its name. It should _not_ fail to be loaded, and can be used to ++ * exercise the default scheduling paths in ext.c. 
++ * ++ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. ++ * Copyright (c) 2023 David Vernet <dvernet@meta.com> ++ * Copyright (c) 2023 Tejun Heo <tj@kernel.org> ++ */ ++ ++#include <scx/common.bpf.h> ++ ++char _license[] SEC("license") = "GPL"; ++ ++SEC(".struct_ops.link") ++struct sched_ext_ops minimal_ops = { ++ .name = "minimal", ++}; +diff --git a/tools/testing/selftests/sched_ext/minimal.c b/tools/testing/selftests/sched_ext/minimal.c +new file mode 100644 +index 000000000000..6c5db8ebbf8a +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/minimal.c +@@ -0,0 +1,58 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. ++ * Copyright (c) 2023 David Vernet <dvernet@meta.com> ++ * Copyright (c) 2023 Tejun Heo <tj@kernel.org> ++ */ ++#include <bpf/bpf.h> ++#include <scx/common.h> ++#include <sys/wait.h> ++#include <unistd.h> ++#include "minimal.bpf.skel.h" ++#include "scx_test.h" ++ ++static enum scx_test_status setup(void **ctx) ++{ ++ struct minimal *skel; ++ ++ skel = minimal__open_and_load(); ++ if (!skel) { ++ SCX_ERR("Failed to open and load skel"); ++ return SCX_TEST_FAIL; ++ } ++ *ctx = skel; ++ ++ return SCX_TEST_PASS; ++} ++ ++static enum scx_test_status run(void *ctx) ++{ ++ struct minimal *skel = ctx; ++ struct bpf_link *link; ++ ++ link = bpf_map__attach_struct_ops(skel->maps.minimal_ops); ++ if (!link) { ++ SCX_ERR("Failed to attach scheduler"); ++ return SCX_TEST_FAIL; ++ } ++ ++ bpf_link__destroy(link); ++ ++ return SCX_TEST_PASS; ++} ++ ++static void cleanup(void *ctx) ++{ ++ struct minimal *skel = ctx; ++ ++ minimal__destroy(skel); ++} ++ ++struct scx_test minimal = { ++ .name = "minimal", ++ .description = "Verify we can load a fully minimal scheduler", ++ .setup = setup, ++ .run = run, ++ .cleanup = cleanup, ++}; ++REGISTER_SCX_TEST(&minimal) +diff --git a/tools/testing/selftests/sched_ext/prog_run.bpf.c b/tools/testing/selftests/sched_ext/prog_run.bpf.c +new file mode 
100644
+index 000000000000..fd2c8f12af16
+--- /dev/null
++++ b/tools/testing/selftests/sched_ext/prog_run.bpf.c
+@@ -0,0 +1,32 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * A scheduler that validates that we can invoke sched_ext kfuncs in
++ * BPF_PROG_TYPE_SYSCALL programs.
++ *
++ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
++ * Copyright (c) 2024 David Vernet <dvernet@meta.com>
++ */
++
++#include <scx/common.bpf.h>
++
++UEI_DEFINE(uei);
++
++char _license[] SEC("license") = "GPL";
++
++SEC("syscall")
++int BPF_PROG(prog_run_syscall)
++{
++	scx_bpf_exit(0xdeadbeef, "Exited from PROG_RUN");
++	return 0;
++}
++
++void BPF_STRUCT_OPS(prog_run_exit, struct scx_exit_info *ei)
++{
++	UEI_RECORD(uei, ei);
++}
++
++SEC(".struct_ops.link")
++struct sched_ext_ops prog_run_ops = {
++	.exit			= prog_run_exit,
++	.name			= "prog_run",
++};
+diff --git a/tools/testing/selftests/sched_ext/prog_run.c b/tools/testing/selftests/sched_ext/prog_run.c
+new file mode 100644
+index 000000000000..3cd57ef8daaa
+--- /dev/null
++++ b/tools/testing/selftests/sched_ext/prog_run.c
+@@ -0,0 +1,100 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
++ * Copyright (c) 2024 David Vernet <dvernet@meta.com>
++ */
++#include <bpf/bpf.h>
++#include <sched.h>
++#include <scx/common.h>
++#include <sys/wait.h>
++#include <unistd.h>
++#include "prog_run.bpf.skel.h"
++#include "scx_test.h"
++
++/* Open and load the prog_run skeleton and hand it to run()/cleanup() via @ctx. */
++static enum scx_test_status setup(void **ctx)
++{
++	struct prog_run *skel;
++
++	skel = prog_run__open_and_load();
++	if (!skel) {
++		SCX_ERR("Failed to open and load skel");
++		return SCX_TEST_FAIL;
++	}
++	*ctx = skel;
++
++	return SCX_TEST_PASS;
++}
++
++/*
++ * Run the syscall program, wait for the scheduler's exit info to be recorded
++ * and check its kind and exit code. Returns early on a failed check; the
++ * caller owns the struct_ops link and releases it.
++ */
++static enum scx_test_status trigger_and_verify(struct prog_run *skel, int prog_fd)
++{
++	LIBBPF_OPTS(bpf_test_run_opts, topts);
++	int err;
++
++	err = bpf_prog_test_run_opts(prog_fd, &topts);
++	SCX_EQ(err, 0);
++
++	/* Assumes uei.kind is written last */
++	while (skel->data->uei.kind == EXIT_KIND(SCX_EXIT_NONE))
++		sched_yield();
++
++	SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG_BPF));
++	SCX_EQ(skel->data->uei.exit_code, 0xdeadbeef);
++
++	return SCX_TEST_PASS;
++}
++
++/*
++ * Attach the scheduler, exercise it through BPF_PROG_RUN and detach again.
++ * The link is destroyed on every path once it has been created.
++ */
++static enum scx_test_status run(void *ctx)
++{
++	struct prog_run *skel = ctx;
++	enum scx_test_status status;
++	struct bpf_link *link;
++	int prog_fd;
++
++	/*
++	 * The fd is owned by the skeleton's program object (libbpf closes it
++	 * when the skeleton is destroyed in cleanup()), so it must not be
++	 * close()d here.
++	 */
++	prog_fd = bpf_program__fd(skel->progs.prog_run_syscall);
++	if (prog_fd < 0) {
++		SCX_ERR("Failed to get BPF_PROG_RUN prog");
++		return SCX_TEST_FAIL;
++	}
++
++	link = bpf_map__attach_struct_ops(skel->maps.prog_run_ops);
++	if (!link) {
++		SCX_ERR("Failed to attach scheduler");
++		return SCX_TEST_FAIL;
++	}
++
++	status = trigger_and_verify(skel, prog_fd);
++	bpf_link__destroy(link);
++
++	return status;
++}
++
++/* Release the skeleton created in setup(). */
++static void cleanup(void *ctx)
++{
++	struct prog_run *skel = ctx;
++
++	prog_run__destroy(skel);
++}
++
++struct scx_test prog_run = {
++	.name = "prog_run",
++	.description = "Verify we can call into a scheduler with BPF_PROG_RUN, and invoke kfuncs",
++	.setup = setup,
++	.run = run,
++	.cleanup = cleanup,
++};
++REGISTER_SCX_TEST(&prog_run)
+diff --git a/tools/testing/selftests/sched_ext/reload_loop.c b/tools/testing/selftests/sched_ext/reload_loop.c
+new file mode 100644
+index 000000000000..5cfba2d6e056
+--- /dev/null
++++ b/tools/testing/selftests/sched_ext/reload_loop.c
+@@ -0,0
+1,100 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
++ * Copyright (c) 2024 David Vernet <dvernet@meta.com>
++ */
++#include <bpf/bpf.h>
++#include <pthread.h>
++#include <scx/common.h>
++#include <sys/wait.h>
++#include <unistd.h>
++#include "maximal.bpf.skel.h"
++#include "scx_test.h"
++
++static struct maximal *skel;
++static pthread_t threads[2];
++
++/*
++ * Asks the reload threads to stop early. Written by the test thread while the
++ * workers poll it, hence the atomic builtins instead of plain accesses.
++ */
++bool force_exit = false;
++
++/* Load the "maximal" skeleton shared by both reload threads. */
++static enum scx_test_status setup(void **ctx)
++{
++	skel = maximal__open_and_load();
++	if (!skel) {
++		SCX_ERR("Failed to open and load skel");
++		return SCX_TEST_FAIL;
++	}
++
++	return SCX_TEST_PASS;
++}
++
++/*
++ * Thread body: attach and immediately detach the scheduler, up to 1024 times
++ * or until force_exit is set. A failed attach is simply retried on the next
++ * iteration.
++ */
++static void *do_reload_loop(void *arg)
++{
++	u32 i;
++
++	for (i = 0; i < 1024; i++) {
++		struct bpf_link *link;
++
++		if (__atomic_load_n(&force_exit, __ATOMIC_RELAXED))
++			break;
++
++		link = bpf_map__attach_struct_ops(skel->maps.maximal_ops);
++		if (link)
++			bpf_link__destroy(link);
++	}
++
++	return NULL;
++}
++
++/*
++ * Run two reload threads concurrently and wait for both. No thread is left
++ * running on return, so cleanup() can safely destroy the skeleton.
++ */
++static enum scx_test_status run(void *ctx)
++{
++	int err0, err1;
++
++	err0 = pthread_create(&threads[0], NULL, do_reload_loop, NULL);
++	SCX_FAIL_IF(err0, "Failed to create thread 0");
++
++	err1 = pthread_create(&threads[1], NULL, do_reload_loop, NULL);
++	if (err1) {
++		/* Thread 0 is already using skel: stop and reap it first. */
++		__atomic_store_n(&force_exit, true, __ATOMIC_RELAXED);
++		pthread_join(threads[0], NULL);
++		SCX_FAIL("Failed to create thread 1");
++	}
++
++	/* Join both before reporting so a join error cannot strand a thread. */
++	err0 = pthread_join(threads[0], NULL);
++	err1 = pthread_join(threads[1], NULL);
++	SCX_FAIL_IF(err0, "thread 0 failed");
++	SCX_FAIL_IF(err1, "thread 1 failed");
++
++	return SCX_TEST_PASS;
++}
++
++/* Flag any straggler to stop, then free the skeleton. */
++static void cleanup(void *ctx)
++{
++	__atomic_store_n(&force_exit, true, __ATOMIC_RELAXED);
++	maximal__destroy(skel);
++}
++
++struct scx_test reload_loop = {
++	.name = "reload_loop",
++	.description = "Stress test loading and unloading schedulers repeatedly in a tight loop",
++	.setup = setup,
++	.run = run,
++	.cleanup = cleanup,
++};
++REGISTER_SCX_TEST(&reload_loop)
+diff --git a/tools/testing/selftests/sched_ext/runner.c b/tools/testing/selftests/sched_ext/runner.c
+new file mode 100644
+index 000000000000..eab48c7ff309
+--- /dev/null
++++ b/tools/testing/selftests/sched_ext/runner.c
+@@ -0,0 +1,224 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
++ * Copyright (c) 2024 David Vernet <dvernet@meta.com>
++ * Copyright (c) 2024 Tejun Heo <tj@kernel.org>
++ */
++#include <stdio.h>
++#include <unistd.h>
++#include <signal.h>
++#include <libgen.h>
++#include <bpf/bpf.h>
++#include "scx_test.h"
++
++const char help_fmt[] =
++"The runner for sched_ext tests.\n"
++"\n"
++"The runner is statically linked against all testcases, and runs them all serially.\n"
++"It's required for the testcases to be serial, as only a single host-wide sched_ext\n"
++"scheduler may be loaded at any given time.\n"
++"\n"
++"Usage: %s [-t TEST] [-s] [-q] [-h]\n"
++"\n"
++"  -t TEST       Only run tests whose name includes this string\n"
++"  -s            Include print output for skipped tests\n"
++"  -q            Don't print the test descriptions during run\n"
++"  -h            Display this help and exit\n";
++
++/* Set from the SIGINT/SIGTERM handler; polled between tests by main(). */
++static volatile sig_atomic_t exit_req;
++static bool quiet, print_skipped;
++
++#define MAX_SCX_TESTS 2048
++
++/* Registry filled in by scx_test_register(), which constructors call before main(). */
++static struct scx_test __scx_tests[MAX_SCX_TESTS];
++static unsigned __scx_num_tests = 0;
++
++/* Signal handler: only records that an exit was requested. */
++static void sigint_handler(int simple)
++{
++	exit_req = 1;
++}
++
++/* Print the banner emitted before a test runs (description unless @quiet). */
++static void print_test_preamble(const struct scx_test *test, bool quiet)
++{
++	printf("===== START =====\n");
++	printf("TEST: %s\n", test->name);
++	if (!quiet)
++		printf("DESCRIPTION: %s\n", test->description);
++	printf("OUTPUT:\n");
++}
++
++/* Map a status to the word that starts the result line; skips report "ok". */
++static const char *status_to_result(enum scx_test_status status)
++{
++	switch (status) {
++	case SCX_TEST_PASS:
++	case SCX_TEST_SKIP:
++		return "ok";
++	case SCX_TEST_FAIL:
++		return "not ok";
++	default:
++		return "<UNKNOWN>";
++	}
++}
++
++/* Print the result line for @test, tagged "SKIP" when skipped, and the end banner. */
++static void print_test_result(const struct scx_test *test,
++			      enum scx_test_status status,
++			      unsigned int testnum)
++{
++	const char *result = status_to_result(status);
++	const char *directive = status == SCX_TEST_SKIP ? "SKIP " : "";
++
++	printf("%s %u %s # %s\n", result, testnum, test->name, directive);
++	printf("===== END =====\n");
++}
++
++/* True when @test's name does not contain @filter as a substring. */
++static bool should_skip_test(const struct scx_test *test, const char * filter)
++{
++	return !strstr(test->name, filter);
++}
++
++/*
++ * Run setup(), run() and cleanup() for @test. A non-PASS setup() status is
++ * returned as is, without calling run() or cleanup().
++ */
++static enum scx_test_status run_test(const struct scx_test *test)
++{
++	enum scx_test_status status;
++	void *context = NULL;
++
++	if (test->setup) {
++		status = test->setup(&context);
++		if (status != SCX_TEST_PASS)
++			return status;
++	}
++
++	status = test->run(context);
++
++	if (test->cleanup)
++		test->cleanup(context);
++
++	return status;
++}
++
++/* Check that @test has the mandatory name, description and run() callback. */
++static bool test_valid(const struct scx_test *test)
++{
++	if (!test) {
++		fprintf(stderr, "NULL test detected\n");
++		return false;
++	}
++
++	if (!test->name) {
++		fprintf(stderr,
++			"Test with no name found. Must specify test name.\n");
++		return false;
++	}
++
++	if (!test->description) {
++		fprintf(stderr, "Test %s requires description.\n", test->name);
++		return false;
++	}
++
++	if (!test->run) {
++		fprintf(stderr, "Test %s has no run() callback\n", test->name);
++		return false;
++	}
++
++	return true;
++}
++
++/*
++ * Parse options, run every registered test that matches the filter and print a
++ * summary. Exits non-zero when a test failed or the options were invalid.
++ */
++int main(int argc, char **argv)
++{
++	const char *filter = NULL;
++	unsigned testnum = 0, i;
++	unsigned passed = 0, skipped = 0, failed = 0;
++	int opt;
++
++	signal(SIGINT, sigint_handler);
++	signal(SIGTERM, sigint_handler);
++
++	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
++
++	while ((opt = getopt(argc, argv, "qst:h")) != -1) {
++		switch (opt) {
++		case 'q':
++			quiet = true;
++			break;
++		case 's':
++			print_skipped = true;
++			break;
++		case 't':
++			filter = optarg;
++			break;
++		default:
++			fprintf(stderr, help_fmt, basename(argv[0]));
++			return opt != 'h';
++		}
++	}
++
++	for (i = 0; i < __scx_num_tests; i++) {
++		enum scx_test_status status;
++		struct scx_test *test = &__scx_tests[i];
++
++		/* Stop before starting another test once a signal asked us to. */
++		if (exit_req)
++			break;
++
++		if (filter && should_skip_test(test, filter)) {
++			/*
++			 * Printing the skipped tests and their preambles can
++			 * add a lot of noise to the runner output. Printing
++			 * this is only really useful for CI, so let's skip it
++			 * by default.
++			 */
++			if (print_skipped) {
++				print_test_preamble(test, quiet);
++				print_test_result(test, SCX_TEST_SKIP, ++testnum);
++			}
++			continue;
++		}
++
++		print_test_preamble(test, quiet);
++		status = run_test(test);
++		print_test_result(test, status, ++testnum);
++		switch (status) {
++		case SCX_TEST_PASS:
++			passed++;
++			break;
++		case SCX_TEST_SKIP:
++			skipped++;
++			break;
++		case SCX_TEST_FAIL:
++		default:
++			/* Count unknown statuses as failures rather than dropping them. */
++			failed++;
++			break;
++		}
++	}
++	printf("\n\n=============================\n\n");
++	printf("RESULTS:\n\n");
++	printf("PASSED: %u\n", passed);
++	printf("SKIPPED: %u\n", skipped);
++	printf("FAILED: %u\n", failed);
++
++	return failed ? 1 : 0;
++}
++
++/* Copy @test into the registry; SCX_BUG_ON() guards against an invalid test or a full registry. */
++void scx_test_register(struct scx_test *test)
++{
++	SCX_BUG_ON(!test_valid(test), "Invalid test found");
++	SCX_BUG_ON(__scx_num_tests >= MAX_SCX_TESTS, "Maximum tests exceeded");
++
++	__scx_tests[__scx_num_tests++] = *test;
++}
+diff --git a/tools/testing/selftests/sched_ext/scx_test.h b/tools/testing/selftests/sched_ext/scx_test.h
+new file mode 100644
+index 000000000000..90b8d6915bb7
+--- /dev/null
++++ b/tools/testing/selftests/sched_ext/scx_test.h
+@@ -0,0 +1,156 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates.
++ * Copyright (c) 2023 Tejun Heo <tj@kernel.org>
++ * Copyright (c) 2023 David Vernet <dvernet@meta.com>
++ */
++
++#ifndef __SCX_TEST_H__
++#define __SCX_TEST_H__
++
++#include <errno.h>
++#include <scx/common.h>
++#include <scx/compat.h>
++
++enum scx_test_status {
++	SCX_TEST_PASS = 0,
++	SCX_TEST_SKIP,
++	SCX_TEST_FAIL,
++};
++
++#define EXIT_KIND(__ent) __COMPAT_ENUM_OR_ZERO("scx_exit_kind", #__ent)
++
++struct scx_test {
++	/**
++	 * name - The name of the testcase.
++	 */
++	const char *name;
++
++	/**
++	 * description - A description of your testcase: what it tests and is
++	 * meant to validate.
++	 */
++	const char *description;
++
++	/*
++	 * setup - Setup the test.
++	 * @ctx: A pointer to a context object that will be passed to run and
++	 * cleanup.
++	 *
++	 * An optional callback that allows a testcase to perform setup for its
++	 * run. A test may return SCX_TEST_SKIP to skip the run.
++	 */
++	enum scx_test_status (*setup)(void **ctx);
++
++	/*
++	 * run - Run the test.
++	 * @ctx: Context set in the setup() callback. If @ctx was not set in
++	 * setup(), it is NULL.
++	 *
++	 * The main test. The callback should return one of:
++	 *
++	 * - SCX_TEST_PASS: Test passed
++	 * - SCX_TEST_SKIP: Test should be skipped
++	 * - SCX_TEST_FAIL: Test failed
++	 *
++	 * This callback must be defined.
++	 */
++	enum scx_test_status (*run)(void *ctx);
++
++	/*
++	 * cleanup - Perform cleanup following the test
++	 * @ctx: Context set in the setup() callback. If @ctx was not set in
++	 * setup(), it is NULL.
++	 *
++	 * An optional callback that allows a test to perform cleanup after
++	 * being run. This callback is run even if the run() callback returns
++	 * SCX_TEST_SKIP or SCX_TEST_FAIL. It is not run if setup() returns
++	 * SCX_TEST_SKIP or SCX_TEST_FAIL.
++	 */
++	void (*cleanup)(void *ctx);
++};
++
++void scx_test_register(struct scx_test *test);
++
++/* Two-level paste so that __LINE__ is expanded before being concatenated. */
++#define __SCX_TEST_PASTE(__a, __b) __a##__b
++#define SCX_TEST_PASTE(__a, __b) __SCX_TEST_PASTE(__a, __b)
++
++/* Register @__test with the runner from a constructor, i.e. before main(). */
++#define REGISTER_SCX_TEST(__test)					\
++	__attribute__((constructor))					\
++	static void SCX_TEST_PASTE(___scxregister, __LINE__)(void)	\
++	{								\
++		scx_test_register(__test);				\
++	}
++
++/* Print the source location followed by a formatted message on stderr. */
++#define SCX_ERR(__fmt, ...)						\
++	do {								\
++		fprintf(stderr, "ERR: %s:%d\n", __FILE__, __LINE__);	\
++		fprintf(stderr, __fmt"\n", ##__VA_ARGS__);		\
++	} while (0)
++
++/* Report an error and return SCX_TEST_FAIL from the enclosing function. */
++#define SCX_FAIL(__fmt, ...)						\
++	do {								\
++		SCX_ERR(__fmt, ##__VA_ARGS__);				\
++		return SCX_TEST_FAIL;					\
++	} while (0)
++
++#define SCX_FAIL_IF(__cond, __fmt, ...)					\
++	do {								\
++		if (__cond)						\
++			SCX_FAIL(__fmt, ##__VA_ARGS__);			\
++	} while (0)
++
++/*
++ * Compare two integer expressions and fail the test when "_x _op _y" does not
++ * hold. Each operand is evaluated exactly once, so an operand with side
++ * effects, such as a waitpid() call, is not re-run to build the error message.
++ * @_xs and @_ys are the operands' source text, stringized by the caller so
++ * that macros used in them are reported unexpanded.
++ */
++#define __SCX_CMP(_x, _op, _y, _xs, _ys)				\
++	do {								\
++		__typeof__(_x) __scx_x = (_x);				\
++		__typeof__(_y) __scx_y = (_y);				\
++		SCX_FAIL_IF(!(__scx_x _op __scx_y),			\
++			    "Expected %s " #_op " %s (%llu " #_op " %llu)", \
++			    _xs, _ys, (unsigned long long)__scx_x,	\
++			    (unsigned long long)__scx_y);		\
++	} while (0)
++
++#define SCX_GT(_x, _y) __SCX_CMP(_x, >, _y, #_x, #_y)
++#define SCX_GE(_x, _y) __SCX_CMP(_x, >=, _y, #_x, #_y)
++#define SCX_LT(_x, _y) __SCX_CMP(_x, <, _y, #_x, #_y)
++#define SCX_LE(_x, _y) __SCX_CMP(_x, <=, _y, #_x, #_y)
++#define SCX_EQ(_x, _y) __SCX_CMP(_x, ==, _y, #_x, #_y)
++
++/* Fail the test unless @_x is true; @_x is evaluated exactly once. */
++#define SCX_ASSERT(_x)							\
++	do {								\
++		__typeof__(_x) __scx_v = (_x);				\
++		SCX_FAIL_IF(!__scx_v, "Expected %s to be true (%llu)",	\
++			    #_x, (unsigned long long)__scx_v);		\
++	} while (0)
++
++#define SCX_ECODE_VAL(__ecode) ({					\
++	u64 __val = 0;							\
++	bool __found = false;						\
++									\
++	__found = __COMPAT_read_enum("scx_exit_code", #__ecode, &__val);	\
++	SCX_ASSERT(__found);						\
++	(s64)__val;							\
++})
++
++#define SCX_KIND_VAL(__kind) ({						\
++	u64 __val = 0;							\
++	bool __found = false;						\
++									\
++	__found = __COMPAT_read_enum("scx_exit_kind", #__kind, &__val);	\
++	SCX_ASSERT(__found);						\
++	__val;								\
++})
++
++#endif /* __SCX_TEST_H__ */
+diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c
+new file mode 100644
+index 000000000000..2ed2991afafe
+--- /dev/null
++++ b/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c
+@@ -0,0 +1,40 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * A scheduler that validates the behavior of direct dispatching with a default
++ * select_cpu implementation.
++ *
++ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates.
++ * Copyright (c) 2023 David Vernet <dvernet@meta.com> ++ * Copyright (c) 2023 Tejun Heo <tj@kernel.org> ++ */ ++ ++#include <scx/common.bpf.h> ++ ++char _license[] SEC("license") = "GPL"; ++ ++bool saw_local = false; ++ ++static bool task_is_test(const struct task_struct *p) ++{ ++ return !bpf_strncmp(p->comm, 9, "select_cpu"); ++} ++ ++void BPF_STRUCT_OPS(select_cpu_dfl_enqueue, struct task_struct *p, ++ u64 enq_flags) ++{ ++ const struct cpumask *idle_mask = scx_bpf_get_idle_cpumask(); ++ ++ if (task_is_test(p) && ++ bpf_cpumask_test_cpu(scx_bpf_task_cpu(p), idle_mask)) { ++ saw_local = true; ++ } ++ scx_bpf_put_idle_cpumask(idle_mask); ++ ++ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); ++} ++ ++SEC(".struct_ops.link") ++struct sched_ext_ops select_cpu_dfl_ops = { ++ .enqueue = select_cpu_dfl_enqueue, ++ .name = "select_cpu_dfl", ++}; +diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl.c b/tools/testing/selftests/sched_ext/select_cpu_dfl.c +new file mode 100644 +index 000000000000..a53a40c2d2f0 +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/select_cpu_dfl.c +@@ -0,0 +1,72 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
++ * Copyright (c) 2023 David Vernet <dvernet@meta.com> ++ * Copyright (c) 2023 Tejun Heo <tj@kernel.org> ++ */ ++#include <bpf/bpf.h> ++#include <scx/common.h> ++#include <sys/wait.h> ++#include <unistd.h> ++#include "select_cpu_dfl.bpf.skel.h" ++#include "scx_test.h" ++ ++#define NUM_CHILDREN 1028 ++ ++static enum scx_test_status setup(void **ctx) ++{ ++ struct select_cpu_dfl *skel; ++ ++ skel = select_cpu_dfl__open_and_load(); ++ SCX_FAIL_IF(!skel, "Failed to open and load skel"); ++ *ctx = skel; ++ ++ return SCX_TEST_PASS; ++} ++ ++static enum scx_test_status run(void *ctx) ++{ ++ struct select_cpu_dfl *skel = ctx; ++ struct bpf_link *link; ++ pid_t pids[NUM_CHILDREN]; ++ int i, status; ++ ++ link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dfl_ops); ++ SCX_FAIL_IF(!link, "Failed to attach scheduler"); ++ ++ for (i = 0; i < NUM_CHILDREN; i++) { ++ pids[i] = fork(); ++ if (pids[i] == 0) { ++ sleep(1); ++ exit(0); ++ } ++ } ++ ++ for (i = 0; i < NUM_CHILDREN; i++) { ++ SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); ++ SCX_EQ(status, 0); ++ } ++ ++ SCX_ASSERT(!skel->bss->saw_local); ++ ++ bpf_link__destroy(link); ++ ++ return SCX_TEST_PASS; ++} ++ ++static void cleanup(void *ctx) ++{ ++ struct select_cpu_dfl *skel = ctx; ++ ++ select_cpu_dfl__destroy(skel); ++} ++ ++struct scx_test select_cpu_dfl = { ++ .name = "select_cpu_dfl", ++ .description = "Verify the default ops.select_cpu() dispatches tasks " ++ "when idles cores are found, and skips ops.enqueue()", ++ .setup = setup, ++ .run = run, ++ .cleanup = cleanup, ++}; ++REGISTER_SCX_TEST(&select_cpu_dfl) +diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c +new file mode 100644 +index 000000000000..4bb5abb2d369 +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c +@@ -0,0 +1,89 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * A scheduler that validates the behavior of 
direct dispatching with a default ++ * select_cpu implementation, and with the SCX_OPS_ENQ_DFL_NO_DISPATCH ops flag ++ * specified. ++ * ++ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. ++ * Copyright (c) 2023 David Vernet <dvernet@meta.com> ++ * Copyright (c) 2023 Tejun Heo <tj@kernel.org> ++ */ ++ ++#include <scx/common.bpf.h> ++ ++char _license[] SEC("license") = "GPL"; ++ ++bool saw_local = false; ++ ++/* Per-task scheduling context */ ++struct task_ctx { ++ bool force_local; /* CPU changed by ops.select_cpu() */ ++}; ++ ++struct { ++ __uint(type, BPF_MAP_TYPE_TASK_STORAGE); ++ __uint(map_flags, BPF_F_NO_PREALLOC); ++ __type(key, int); ++ __type(value, struct task_ctx); ++} task_ctx_stor SEC(".maps"); ++ ++/* Manually specify the signature until the kfunc is added to the scx repo. */ ++s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, ++ bool *found) __ksym; ++ ++s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_select_cpu, struct task_struct *p, ++ s32 prev_cpu, u64 wake_flags) ++{ ++ struct task_ctx *tctx; ++ s32 cpu; ++ ++ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); ++ if (!tctx) { ++ scx_bpf_error("task_ctx lookup failed"); ++ return -ESRCH; ++ } ++ ++ cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, ++ &tctx->force_local); ++ ++ return cpu; ++} ++ ++void BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_enqueue, struct task_struct *p, ++ u64 enq_flags) ++{ ++ u64 dsq_id = SCX_DSQ_GLOBAL; ++ struct task_ctx *tctx; ++ ++ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); ++ if (!tctx) { ++ scx_bpf_error("task_ctx lookup failed"); ++ return; ++ } ++ ++ if (tctx->force_local) { ++ dsq_id = SCX_DSQ_LOCAL; ++ tctx->force_local = false; ++ saw_local = true; ++ } ++ ++ scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, enq_flags); ++} ++ ++s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_init_task, ++ struct task_struct *p, struct scx_init_task_args *args) ++{ ++ if (bpf_task_storage_get(&task_ctx_stor, p, 0, ++ 
BPF_LOCAL_STORAGE_GET_F_CREATE)) ++ return 0; ++ else ++ return -ENOMEM; ++} ++ ++SEC(".struct_ops.link") ++struct sched_ext_ops select_cpu_dfl_nodispatch_ops = { ++ .select_cpu = select_cpu_dfl_nodispatch_select_cpu, ++ .enqueue = select_cpu_dfl_nodispatch_enqueue, ++ .init_task = select_cpu_dfl_nodispatch_init_task, ++ .name = "select_cpu_dfl_nodispatch", ++}; +diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c +new file mode 100644 +index 000000000000..1d85bf4bf3a3 +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c +@@ -0,0 +1,72 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. ++ * Copyright (c) 2023 David Vernet <dvernet@meta.com> ++ * Copyright (c) 2023 Tejun Heo <tj@kernel.org> ++ */ ++#include <bpf/bpf.h> ++#include <scx/common.h> ++#include <sys/wait.h> ++#include <unistd.h> ++#include "select_cpu_dfl_nodispatch.bpf.skel.h" ++#include "scx_test.h" ++ ++#define NUM_CHILDREN 1028 ++ ++static enum scx_test_status setup(void **ctx) ++{ ++ struct select_cpu_dfl_nodispatch *skel; ++ ++ skel = select_cpu_dfl_nodispatch__open_and_load(); ++ SCX_FAIL_IF(!skel, "Failed to open and load skel"); ++ *ctx = skel; ++ ++ return SCX_TEST_PASS; ++} ++ ++static enum scx_test_status run(void *ctx) ++{ ++ struct select_cpu_dfl_nodispatch *skel = ctx; ++ struct bpf_link *link; ++ pid_t pids[NUM_CHILDREN]; ++ int i, status; ++ ++ link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dfl_nodispatch_ops); ++ SCX_FAIL_IF(!link, "Failed to attach scheduler"); ++ ++ for (i = 0; i < NUM_CHILDREN; i++) { ++ pids[i] = fork(); ++ if (pids[i] == 0) { ++ sleep(1); ++ exit(0); ++ } ++ } ++ ++ for (i = 0; i < NUM_CHILDREN; i++) { ++ SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); ++ SCX_EQ(status, 0); ++ } ++ ++ SCX_ASSERT(skel->bss->saw_local); ++ ++ bpf_link__destroy(link); ++ ++ return SCX_TEST_PASS; ++} ++ 
++static void cleanup(void *ctx) ++{ ++ struct select_cpu_dfl_nodispatch *skel = ctx; ++ ++ select_cpu_dfl_nodispatch__destroy(skel); ++} ++ ++struct scx_test select_cpu_dfl_nodispatch = { ++ .name = "select_cpu_dfl_nodispatch", ++ .description = "Verify behavior of scx_bpf_select_cpu_dfl() in " ++ "ops.select_cpu()", ++ .setup = setup, ++ .run = run, ++ .cleanup = cleanup, ++}; ++REGISTER_SCX_TEST(&select_cpu_dfl_nodispatch) +diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c +new file mode 100644 +index 000000000000..f0b96a4a04b2 +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c +@@ -0,0 +1,41 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * A scheduler that validates the behavior of direct dispatching with a default ++ * select_cpu implementation. ++ * ++ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. ++ * Copyright (c) 2023 David Vernet <dvernet@meta.com> ++ * Copyright (c) 2023 Tejun Heo <tj@kernel.org> ++ */ ++ ++#include <scx/common.bpf.h> ++ ++char _license[] SEC("license") = "GPL"; ++ ++s32 BPF_STRUCT_OPS(select_cpu_dispatch_select_cpu, struct task_struct *p, ++ s32 prev_cpu, u64 wake_flags) ++{ ++ u64 dsq_id = SCX_DSQ_LOCAL; ++ s32 cpu = prev_cpu; ++ ++ if (scx_bpf_test_and_clear_cpu_idle(cpu)) ++ goto dispatch; ++ ++ cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); ++ if (cpu >= 0) ++ goto dispatch; ++ ++ dsq_id = SCX_DSQ_GLOBAL; ++ cpu = prev_cpu; ++ ++dispatch: ++ scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, 0); ++ return cpu; ++} ++ ++SEC(".struct_ops.link") ++struct sched_ext_ops select_cpu_dispatch_ops = { ++ .select_cpu = select_cpu_dispatch_select_cpu, ++ .name = "select_cpu_dispatch", ++ .timeout_ms = 1000U, ++}; +diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch.c +new file mode 100644 +index 000000000000..0309ca8785b3 +--- /dev/null ++++ 
b/tools/testing/selftests/sched_ext/select_cpu_dispatch.c +@@ -0,0 +1,70 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. ++ * Copyright (c) 2023 David Vernet <dvernet@meta.com> ++ * Copyright (c) 2023 Tejun Heo <tj@kernel.org> ++ */ ++#include <bpf/bpf.h> ++#include <scx/common.h> ++#include <sys/wait.h> ++#include <unistd.h> ++#include "select_cpu_dispatch.bpf.skel.h" ++#include "scx_test.h" ++ ++#define NUM_CHILDREN 1028 ++ ++static enum scx_test_status setup(void **ctx) ++{ ++ struct select_cpu_dispatch *skel; ++ ++ skel = select_cpu_dispatch__open_and_load(); ++ SCX_FAIL_IF(!skel, "Failed to open and load skel"); ++ *ctx = skel; ++ ++ return SCX_TEST_PASS; ++} ++ ++static enum scx_test_status run(void *ctx) ++{ ++ struct select_cpu_dispatch *skel = ctx; ++ struct bpf_link *link; ++ pid_t pids[NUM_CHILDREN]; ++ int i, status; ++ ++ link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_ops); ++ SCX_FAIL_IF(!link, "Failed to attach scheduler"); ++ ++ for (i = 0; i < NUM_CHILDREN; i++) { ++ pids[i] = fork(); ++ if (pids[i] == 0) { ++ sleep(1); ++ exit(0); ++ } ++ } ++ ++ for (i = 0; i < NUM_CHILDREN; i++) { ++ SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); ++ SCX_EQ(status, 0); ++ } ++ ++ bpf_link__destroy(link); ++ ++ return SCX_TEST_PASS; ++} ++ ++static void cleanup(void *ctx) ++{ ++ struct select_cpu_dispatch *skel = ctx; ++ ++ select_cpu_dispatch__destroy(skel); ++} ++ ++struct scx_test select_cpu_dispatch = { ++ .name = "select_cpu_dispatch", ++ .description = "Test direct dispatching to built-in DSQs from " ++ "ops.select_cpu()", ++ .setup = setup, ++ .run = run, ++ .cleanup = cleanup, ++}; ++REGISTER_SCX_TEST(&select_cpu_dispatch) +diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c +new file mode 100644 +index 000000000000..7b42ddce0f56 +--- /dev/null ++++ 
b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c +@@ -0,0 +1,37 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * A scheduler that validates the behavior of direct dispatching with a default ++ * select_cpu implementation. ++ * ++ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. ++ * Copyright (c) 2023 David Vernet <dvernet@meta.com> ++ * Copyright (c) 2023 Tejun Heo <tj@kernel.org> ++ */ ++ ++#include <scx/common.bpf.h> ++ ++char _license[] SEC("license") = "GPL"; ++ ++UEI_DEFINE(uei); ++ ++s32 BPF_STRUCT_OPS(select_cpu_dispatch_bad_dsq_select_cpu, struct task_struct *p, ++ s32 prev_cpu, u64 wake_flags) ++{ ++ /* Dispatching to a random DSQ should fail. */ ++ scx_bpf_dispatch(p, 0xcafef00d, SCX_SLICE_DFL, 0); ++ ++ return prev_cpu; ++} ++ ++void BPF_STRUCT_OPS(select_cpu_dispatch_bad_dsq_exit, struct scx_exit_info *ei) ++{ ++ UEI_RECORD(uei, ei); ++} ++ ++SEC(".struct_ops.link") ++struct sched_ext_ops select_cpu_dispatch_bad_dsq_ops = { ++ .select_cpu = select_cpu_dispatch_bad_dsq_select_cpu, ++ .exit = select_cpu_dispatch_bad_dsq_exit, ++ .name = "select_cpu_dispatch_bad_dsq", ++ .timeout_ms = 1000U, ++}; +diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c +new file mode 100644 +index 000000000000..47eb6ed7627d +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c +@@ -0,0 +1,56 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
++ * Copyright (c) 2023 David Vernet <dvernet@meta.com> ++ * Copyright (c) 2023 Tejun Heo <tj@kernel.org> ++ */ ++#include <bpf/bpf.h> ++#include <scx/common.h> ++#include <sys/wait.h> ++#include <unistd.h> ++#include "select_cpu_dispatch_bad_dsq.bpf.skel.h" ++#include "scx_test.h" ++ ++static enum scx_test_status setup(void **ctx) ++{ ++ struct select_cpu_dispatch_bad_dsq *skel; ++ ++ skel = select_cpu_dispatch_bad_dsq__open_and_load(); ++ SCX_FAIL_IF(!skel, "Failed to open and load skel"); ++ *ctx = skel; ++ ++ return SCX_TEST_PASS; ++} ++ ++static enum scx_test_status run(void *ctx) ++{ ++ struct select_cpu_dispatch_bad_dsq *skel = ctx; ++ struct bpf_link *link; ++ ++ link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_bad_dsq_ops); ++ SCX_FAIL_IF(!link, "Failed to attach scheduler"); ++ ++ sleep(1); ++ ++ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); ++ bpf_link__destroy(link); ++ ++ return SCX_TEST_PASS; ++} ++ ++static void cleanup(void *ctx) ++{ ++ struct select_cpu_dispatch_bad_dsq *skel = ctx; ++ ++ select_cpu_dispatch_bad_dsq__destroy(skel); ++} ++ ++struct scx_test select_cpu_dispatch_bad_dsq = { ++ .name = "select_cpu_dispatch_bad_dsq", ++ .description = "Verify graceful failure if we direct-dispatch to a " ++ "bogus DSQ in ops.select_cpu()", ++ .setup = setup, ++ .run = run, ++ .cleanup = cleanup, ++}; ++REGISTER_SCX_TEST(&select_cpu_dispatch_bad_dsq) +diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c +new file mode 100644 +index 000000000000..653e3dc0b4dc +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c +@@ -0,0 +1,38 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * A scheduler that validates the behavior of direct dispatching with a default ++ * select_cpu implementation. ++ * ++ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
++ * Copyright (c) 2023 David Vernet <dvernet@meta.com> ++ * Copyright (c) 2023 Tejun Heo <tj@kernel.org> ++ */ ++ ++#include <scx/common.bpf.h> ++ ++char _license[] SEC("license") = "GPL"; ++ ++UEI_DEFINE(uei); ++ ++s32 BPF_STRUCT_OPS(select_cpu_dispatch_dbl_dsp_select_cpu, struct task_struct *p, ++ s32 prev_cpu, u64 wake_flags) ++{ ++ /* Dispatching twice in a row is disallowed. */ ++ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); ++ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); ++ ++ return prev_cpu; ++} ++ ++void BPF_STRUCT_OPS(select_cpu_dispatch_dbl_dsp_exit, struct scx_exit_info *ei) ++{ ++ UEI_RECORD(uei, ei); ++} ++ ++SEC(".struct_ops.link") ++struct sched_ext_ops select_cpu_dispatch_dbl_dsp_ops = { ++ .select_cpu = select_cpu_dispatch_dbl_dsp_select_cpu, ++ .exit = select_cpu_dispatch_dbl_dsp_exit, ++ .name = "select_cpu_dispatch_dbl_dsp", ++ .timeout_ms = 1000U, ++}; +diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c +new file mode 100644 +index 000000000000..48ff028a3c46 +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c +@@ -0,0 +1,56 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
++ * Copyright (c) 2023 David Vernet <dvernet@meta.com> ++ * Copyright (c) 2023 Tejun Heo <tj@kernel.org> ++ */ ++#include <bpf/bpf.h> ++#include <scx/common.h> ++#include <sys/wait.h> ++#include <unistd.h> ++#include "select_cpu_dispatch_dbl_dsp.bpf.skel.h" ++#include "scx_test.h" ++ ++static enum scx_test_status setup(void **ctx) ++{ ++ struct select_cpu_dispatch_dbl_dsp *skel; ++ ++ skel = select_cpu_dispatch_dbl_dsp__open_and_load(); ++ SCX_FAIL_IF(!skel, "Failed to open and load skel"); ++ *ctx = skel; ++ ++ return SCX_TEST_PASS; ++} ++ ++static enum scx_test_status run(void *ctx) ++{ ++ struct select_cpu_dispatch_dbl_dsp *skel = ctx; ++ struct bpf_link *link; ++ ++ link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_dbl_dsp_ops); ++ SCX_FAIL_IF(!link, "Failed to attach scheduler"); ++ ++ sleep(1); ++ ++ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); ++ bpf_link__destroy(link); ++ ++ return SCX_TEST_PASS; ++} ++ ++static void cleanup(void *ctx) ++{ ++ struct select_cpu_dispatch_dbl_dsp *skel = ctx; ++ ++ select_cpu_dispatch_dbl_dsp__destroy(skel); ++} ++ ++struct scx_test select_cpu_dispatch_dbl_dsp = { ++ .name = "select_cpu_dispatch_dbl_dsp", ++ .description = "Verify graceful failure if we dispatch twice to a " ++ "DSQ in ops.select_cpu()", ++ .setup = setup, ++ .run = run, ++ .cleanup = cleanup, ++}; ++REGISTER_SCX_TEST(&select_cpu_dispatch_dbl_dsp) +diff --git a/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c +new file mode 100644 +index 000000000000..7f3ebf4fc2ea +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c +@@ -0,0 +1,92 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * A scheduler that validates that enqueue flags are properly stored and ++ * applied at dispatch time when a task is directly dispatched from ++ * ops.select_cpu(). 
We validate this by using scx_bpf_dispatch_vtime(), and ++ * making the test a very basic vtime scheduler. ++ * ++ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. ++ * Copyright (c) 2024 David Vernet <dvernet@meta.com> ++ * Copyright (c) 2024 Tejun Heo <tj@kernel.org> ++ */ ++ ++#include <scx/common.bpf.h> ++ ++char _license[] SEC("license") = "GPL"; ++ ++volatile bool consumed; ++ ++static u64 vtime_now; ++ ++#define VTIME_DSQ 0 ++ ++static inline bool vtime_before(u64 a, u64 b) ++{ ++ return (s64)(a - b) < 0; ++} ++ ++static inline u64 task_vtime(const struct task_struct *p) ++{ ++ u64 vtime = p->scx.dsq_vtime; ++ ++ if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL)) ++ return vtime_now - SCX_SLICE_DFL; ++ else ++ return vtime; ++} ++ ++s32 BPF_STRUCT_OPS(select_cpu_vtime_select_cpu, struct task_struct *p, ++ s32 prev_cpu, u64 wake_flags) ++{ ++ s32 cpu; ++ ++ cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); ++ if (cpu >= 0) ++ goto ddsp; ++ ++ cpu = prev_cpu; ++ scx_bpf_test_and_clear_cpu_idle(cpu); ++ddsp: ++ scx_bpf_dispatch_vtime(p, VTIME_DSQ, SCX_SLICE_DFL, task_vtime(p), 0); ++ return cpu; ++} ++ ++void BPF_STRUCT_OPS(select_cpu_vtime_dispatch, s32 cpu, struct task_struct *p) ++{ ++ if (scx_bpf_consume(VTIME_DSQ)) ++ consumed = true; ++} ++ ++void BPF_STRUCT_OPS(select_cpu_vtime_running, struct task_struct *p) ++{ ++ if (vtime_before(vtime_now, p->scx.dsq_vtime)) ++ vtime_now = p->scx.dsq_vtime; ++} ++ ++void BPF_STRUCT_OPS(select_cpu_vtime_stopping, struct task_struct *p, ++ bool runnable) ++{ ++ p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; ++} ++ ++void BPF_STRUCT_OPS(select_cpu_vtime_enable, struct task_struct *p) ++{ ++ p->scx.dsq_vtime = vtime_now; ++} ++ ++s32 BPF_STRUCT_OPS_SLEEPABLE(select_cpu_vtime_init) ++{ ++ return scx_bpf_create_dsq(VTIME_DSQ, -1); ++} ++ ++SEC(".struct_ops.link") ++struct sched_ext_ops select_cpu_vtime_ops = { ++ .select_cpu = select_cpu_vtime_select_cpu, ++ .dispatch = 
select_cpu_vtime_dispatch, ++ .running = select_cpu_vtime_running, ++ .stopping = select_cpu_vtime_stopping, ++ .enable = select_cpu_vtime_enable, ++ .init = select_cpu_vtime_init, ++ .name = "select_cpu_vtime", ++ .timeout_ms = 1000U, ++}; +diff --git a/tools/testing/selftests/sched_ext/select_cpu_vtime.c b/tools/testing/selftests/sched_ext/select_cpu_vtime.c +new file mode 100644 +index 000000000000..b4629c2364f5 +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/select_cpu_vtime.c +@@ -0,0 +1,59 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. ++ * Copyright (c) 2024 David Vernet <dvernet@meta.com> ++ * Copyright (c) 2024 Tejun Heo <tj@kernel.org> ++ */ ++#include <bpf/bpf.h> ++#include <scx/common.h> ++#include <sys/wait.h> ++#include <unistd.h> ++#include "select_cpu_vtime.bpf.skel.h" ++#include "scx_test.h" ++ ++static enum scx_test_status setup(void **ctx) ++{ ++ struct select_cpu_vtime *skel; ++ ++ skel = select_cpu_vtime__open_and_load(); ++ SCX_FAIL_IF(!skel, "Failed to open and load skel"); ++ *ctx = skel; ++ ++ return SCX_TEST_PASS; ++} ++ ++static enum scx_test_status run(void *ctx) ++{ ++ struct select_cpu_vtime *skel = ctx; ++ struct bpf_link *link; ++ ++ SCX_ASSERT(!skel->bss->consumed); ++ ++ link = bpf_map__attach_struct_ops(skel->maps.select_cpu_vtime_ops); ++ SCX_FAIL_IF(!link, "Failed to attach scheduler"); ++ ++ sleep(1); ++ ++ SCX_ASSERT(skel->bss->consumed); ++ ++ bpf_link__destroy(link); ++ ++ return SCX_TEST_PASS; ++} ++ ++static void cleanup(void *ctx) ++{ ++ struct select_cpu_vtime *skel = ctx; ++ ++ select_cpu_vtime__destroy(skel); ++} ++ ++struct scx_test select_cpu_vtime = { ++ .name = "select_cpu_vtime", ++ .description = "Test doing direct vtime-dispatching from " ++ "ops.select_cpu(), to a non-built-in DSQ", ++ .setup = setup, ++ .run = run, ++ .cleanup = cleanup, ++}; ++REGISTER_SCX_TEST(&select_cpu_vtime) +diff --git 
a/tools/testing/selftests/sched_ext/test_example.c b/tools/testing/selftests/sched_ext/test_example.c +new file mode 100644 +index 000000000000..ce36cdf03cdc +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/test_example.c +@@ -0,0 +1,49 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. ++ * Copyright (c) 2024 Tejun Heo <tj@kernel.org> ++ * Copyright (c) 2024 David Vernet <dvernet@meta.com> ++ */ ++#include <bpf/bpf.h> ++#include <scx/common.h> ++#include "scx_test.h" ++ ++static bool setup_called = false; ++static bool run_called = false; ++static bool cleanup_called = false; ++ ++static int context = 10; ++ ++static enum scx_test_status setup(void **ctx) ++{ ++ setup_called = true; ++ *ctx = &context; ++ ++ return SCX_TEST_PASS; ++} ++ ++static enum scx_test_status run(void *ctx) ++{ ++ int *arg = ctx; ++ ++ SCX_ASSERT(setup_called); ++ SCX_ASSERT(!run_called && !cleanup_called); ++ SCX_EQ(*arg, context); ++ ++ run_called = true; ++ return SCX_TEST_PASS; ++} ++ ++static void cleanup (void *ctx) ++{ ++ SCX_BUG_ON(!run_called || cleanup_called, "Wrong callbacks invoked"); ++} ++ ++struct scx_test example = { ++ .name = "example", ++ .description = "Validate the basic function of the test suite itself", ++ .setup = setup, ++ .run = run, ++ .cleanup = cleanup, ++}; ++REGISTER_SCX_TEST(&example) +diff --git a/tools/testing/selftests/sched_ext/util.c b/tools/testing/selftests/sched_ext/util.c +new file mode 100644 +index 000000000000..e47769c91918 +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/util.c +@@ -0,0 +1,71 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. ++ * Copyright (c) 2024 David Vernet <dvernet@meta.com> ++ */ ++#include <errno.h> ++#include <fcntl.h> ++#include <stdio.h> ++#include <stdlib.h> ++#include <string.h> ++#include <unistd.h> ++ ++/* Returns read len on success, or -errno on failure. 
*/ ++static ssize_t read_text(const char *path, char *buf, size_t max_len) ++{ ++ ssize_t len; ++ int fd; ++ ++ fd = open(path, O_RDONLY); ++ if (fd < 0) ++ return -errno; ++ ++ len = read(fd, buf, max_len - 1); ++ ++ if (len >= 0) ++ buf[len] = 0; ++ ++ close(fd); ++ return len < 0 ? -errno : len; ++} ++ ++/* Returns written len on success, or -errno on failure. */ ++static ssize_t write_text(const char *path, char *buf, ssize_t len) ++{ ++ int fd; ++ ssize_t written; ++ ++ fd = open(path, O_WRONLY | O_APPEND); ++ if (fd < 0) ++ return -errno; ++ ++ written = write(fd, buf, len); ++ close(fd); ++ return written < 0 ? -errno : written; ++} ++ ++long file_read_long(const char *path) ++{ ++ char buf[128]; ++ ++ ++ if (read_text(path, buf, sizeof(buf)) <= 0) ++ return -1; ++ ++ return atol(buf); ++} ++ ++int file_write_long(const char *path, long val) ++{ ++ char buf[64]; ++ int ret; ++ ++ ret = snprintf(buf, sizeof(buf), "%ld", val); /* val is signed */ ++ if (ret < 0) ++ return ret; ++ ++ if (write_text(path, buf, ret) <= 0) /* only the formatted digits, not the whole buffer */ ++ return -1; ++ ++ return 0; ++} +diff --git a/tools/testing/selftests/sched_ext/util.h b/tools/testing/selftests/sched_ext/util.h +new file mode 100644 +index 000000000000..bc13dfec1267 +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/util.h +@@ -0,0 +1,13 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. ++ * Copyright (c) 2024 David Vernet <void@manifault.com> ++ */ ++ ++#ifndef __SCX_TEST_UTIL_H__ ++#define __SCX_TEST_UTIL_H__ ++ ++long file_read_long(const char *path); ++int file_write_long(const char *path, long val); ++ ++#endif // __SCX_TEST_UTIL_H__ |