From 9c1b62aff214e27adddf3d401bdb4dc993aa691d Mon Sep 17 00:00:00 2001 From: Sentry Date: Sat, 28 Nov 2020 20:10:19 +0100 Subject: kernel 5.9.11 --- .../0001-update-phy-on-pine64-a64-devices.patch | 16 +- SOURCES/fsync.patch | 597 -- SOURCES/futex2.patch | 6697 ++++++++++++++++++++ SOURCES/kernel-aarch64-debug-fedora.config | 2 +- SOURCES/kernel-aarch64-debug-rhel.config | 2 +- SOURCES/kernel-aarch64-fedora.config | 2 +- SOURCES/kernel-aarch64-rhel.config | 2 +- SOURCES/kernel-armv7hl-debug-fedora.config | 2 +- SOURCES/kernel-armv7hl-fedora.config | 2 +- SOURCES/kernel-armv7hl-lpae-debug-fedora.config | 2 +- SOURCES/kernel-armv7hl-lpae-fedora.config | 2 +- SOURCES/kernel-i686-debug-fedora.config | 2 +- SOURCES/kernel-i686-fedora.config | 2 +- SOURCES/kernel-ppc64le-debug-fedora.config | 2 +- SOURCES/kernel-ppc64le-debug-rhel.config | 2 +- SOURCES/kernel-ppc64le-fedora.config | 2 +- SOURCES/kernel-ppc64le-rhel.config | 2 +- SOURCES/kernel-s390x-debug-fedora.config | 2 +- SOURCES/kernel-s390x-debug-rhel.config | 3 +- SOURCES/kernel-s390x-fedora.config | 2 +- SOURCES/kernel-s390x-rhel.config | 2 +- SOURCES/kernel-s390x-zfcpdump-rhel.config | 2 +- SOURCES/kernel-x86_64-debug-fedora.config | 2 +- SOURCES/kernel-x86_64-debug-rhel.config | 2 +- SOURCES/kernel-x86_64-fedora.config | 2 +- SOURCES/kernel-x86_64-rhel.config | 2 +- SPECS/kernel.spec | 26 +- 27 files changed, 6729 insertions(+), 654 deletions(-) delete mode 100644 SOURCES/fsync.patch create mode 100644 SOURCES/futex2.patch diff --git a/SOURCES/0001-update-phy-on-pine64-a64-devices.patch b/SOURCES/0001-update-phy-on-pine64-a64-devices.patch index ee94f53..0a1f4dc 100644 --- a/SOURCES/0001-update-phy-on-pine64-a64-devices.patch +++ b/SOURCES/0001-update-phy-on-pine64-a64-devices.patch @@ -4,24 +4,10 @@ Date: Mon, 26 Oct 2020 17:01:57 +0000 Subject: [PATCH 1/2] update phy on pine64 a64 devices --- - arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-plus.dts | 2 +- arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts | 2 +- arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts | 2 +- - 3 files changed, 3 insertions(+), 3 deletions(-) + 2 files changed, 2 insertions(+), 2 deletions(-) -diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-plus.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-plus.dts -index b26181cf9095..01728a4c5309 100644 ---- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-plus.dts -+++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-plus.dts -@@ -13,7 +13,7 @@ / { - &emac { - pinctrl-names = "default"; - pinctrl-0 = <&rgmii_pins>; -- phy-mode = "rgmii"; -+ phy-mode = "rgmii-id"; - phy-handle = <&ext_rgmii_phy>; - status = "okay"; - }; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts index 2165f238af13..9741fb5caa6f 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts diff --git a/SOURCES/fsync.patch b/SOURCES/fsync.patch deleted file mode 100644 index ef1446c..0000000 --- a/SOURCES/fsync.patch +++ /dev/null @@ -1,597 +0,0 @@ -From 7b5df0248ce255ef5b7204d65a7b3783ebb76a3d Mon Sep 17 00:00:00 2001 -From: Gabriel Krisman Bertazi -Date: Fri, 13 Dec 2019 11:08:02 -0300 -Subject: [PATCH 1/2] futex: Implement mechanism to wait on any of several - futexes -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -This is a new futex operation, called FUTEX_WAIT_MULTIPLE, which allows -a thread to wait on several futexes at the same time, and be awoken by -any of them. 
In a sense, it implements one of the features that was
-supported by polling on the old FUTEX_FD interface.
-
-The use case lies in the Wine implementation of the Windows NT interface
-WaitForMultipleObjects. This Windows API function allows a thread to sleep
-waiting on the first of a set of event sources (mutexes, timers, signals,
-console input, etc.) to signal. Considering this is a primitive
-synchronization operation for Windows applications, being able to quickly
-signal events on the producer side, and quickly go to sleep on the
-consumer side is essential for good performance of those running over Wine.
-
-Wine developers have an implementation that uses eventfd, but it suffers
-from FD exhaustion (there are applications that reach the order of
-multi-million FDs), and higher CPU utilization than this new operation.
-
-The futex list is passed as an array of `struct futex_wait_block`
-(pointer, value, bitset) to the kernel, which will enqueue all of them
-and sleep if none was already triggered. It returns a hint to userspace
-of which futex caused the wake-up event, but the hint doesn't
-guarantee that it is the only futex triggered. Before calling the syscall
-again, userspace should traverse the list, trying to re-acquire any of
-the other futexes, to prevent an immediate -EWOULDBLOCK return code from
-the kernel.
-
-This was tested using three mechanisms:
-
-1) By reimplementing FUTEX_WAIT in terms of FUTEX_WAIT_MULTIPLE and
-running the unmodified tools/testing/selftests/futex and a full Linux
-distro on top of this kernel.
-
-2) By example code that exercises the FUTEX_WAIT_MULTIPLE path on a
-multi-threaded, event-handling setup.
-
-3) By running the Wine fsync implementation and executing multi-threaded
-applications, in particular modern games, on top of this implementation.
-
-Changes were tested for the following ABIs: x86_64, i386 and x32.
-Support for x32 applications is not implemented since it would
-take a major rework adding a new entry point and splitting the current
-futex 64 entry point in two, and we can't change the current x32 syscall
-number without breaking user space compatibility.
-
-CC: Steven Rostedt
-Cc: Richard Yao
-Cc: Thomas Gleixner
-Cc: Peter Zijlstra
-Co-developed-by: Zebediah Figura
-Signed-off-by: Zebediah Figura
-Co-developed-by: Steven Noonan
-Signed-off-by: Steven Noonan
-Co-developed-by: Pierre-Loup A. Griffais
-Signed-off-by: Pierre-Loup A. Griffais
-Signed-off-by: Gabriel Krisman Bertazi
-[Added compatibility code]
-Co-developed-by: André Almeida
-Signed-off-by: André Almeida
-
-Adjusted for v5.9: Removed `put_futex_key` calls.
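To make the ABI concrete, a minimal userspace sketch (not part of the patch) that waits on two private futexes at once. It assumes a kernel carrying this patch and mirrors the uapi additions locally, since stock headers do not ship them; note the opcode is 13 here and becomes 31 after the Proton compatibility patch below. Build with -pthread.

/* Sketch: wait on two private futexes at once via FUTEX_WAIT_MULTIPLE. */
#include <linux/futex.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#define FUTEX_WAIT_MULTIPLE 13	/* as defined by this patch */

struct futex_wait_block {	/* mirrors the uapi struct above */
	uint32_t *uaddr;
	uint32_t val;
	uint32_t bitset;
};

static uint32_t f1, f2;		/* two futex words, both initially 0 */

static void *waker(void *arg)
{
	usleep(10000);
	__atomic_store_n(&f2, 1, __ATOMIC_SEQ_CST);
	syscall(SYS_futex, &f2, FUTEX_WAKE | FUTEX_PRIVATE_FLAG, 1,
		NULL, NULL, 0);
	return NULL;
}

int main(void)
{
	struct futex_wait_block blocks[2] = {
		{ &f1, 0, ~0u },	/* sleep while f1 == 0 */
		{ &f2, 0, ~0u },	/* sleep while f2 == 0 */
	};
	pthread_t t;
	long hint;

	pthread_create(&t, NULL, waker, NULL);

	/* uaddr points at the array and val is the element count; the
	 * return value is a hint of which futex woke us (1 expected). */
	hint = syscall(SYS_futex, blocks,
		       FUTEX_WAIT_MULTIPLE | FUTEX_PRIVATE_FLAG, 2,
		       NULL, NULL, 0);
	if (hint < 0)
		perror("FUTEX_WAIT_MULTIPLE");
	else
		printf("woken by futex index %ld\n", hint);

	pthread_join(t, NULL);
	return 0;
}

As the commit message notes, a real consumer would then traverse the whole list and try to re-acquire every entry, since the hint is not exclusive.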
---- - include/uapi/linux/futex.h | 20 +++ - kernel/futex.c | 352 ++++++++++++++++++++++++++++++++++++- - 2 files changed, 370 insertions(+), 2 deletions(-) - -diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h -index a89eb0accd5e2..580001e89c6ca 100644 ---- a/include/uapi/linux/futex.h -+++ b/include/uapi/linux/futex.h -@@ -21,6 +21,7 @@ - #define FUTEX_WAKE_BITSET 10 - #define FUTEX_WAIT_REQUEUE_PI 11 - #define FUTEX_CMP_REQUEUE_PI 12 -+#define FUTEX_WAIT_MULTIPLE 13 - - #define FUTEX_PRIVATE_FLAG 128 - #define FUTEX_CLOCK_REALTIME 256 -@@ -40,6 +41,8 @@ - FUTEX_PRIVATE_FLAG) - #define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \ - FUTEX_PRIVATE_FLAG) -+#define FUTEX_WAIT_MULTIPLE_PRIVATE (FUTEX_WAIT_MULTIPLE | \ -+ FUTEX_PRIVATE_FLAG) - - /* - * Support for robust futexes: the kernel cleans up held futexes at -@@ -150,4 +153,21 @@ struct robust_list_head { - (((op & 0xf) << 28) | ((cmp & 0xf) << 24) \ - | ((oparg & 0xfff) << 12) | (cmparg & 0xfff)) - -+/* -+ * Maximum number of multiple futexes to wait for -+ */ -+#define FUTEX_MULTIPLE_MAX_COUNT 128 -+ -+/** -+ * struct futex_wait_block - Block of futexes to be waited for -+ * @uaddr: User address of the futex -+ * @val: Futex value expected by userspace -+ * @bitset: Bitset for the optional bitmasked wakeup -+ */ -+struct futex_wait_block { -+ __u32 __user *uaddr; -+ __u32 val; -+ __u32 bitset; -+}; -+ - #endif /* _UAPI_LINUX_FUTEX_H */ -diff --git a/kernel/futex.c b/kernel/futex.c -index a5876694a60eb..6f4bea76df460 100644 ---- a/kernel/futex.c -+++ b/kernel/futex.c -@@ -197,6 +197,8 @@ struct futex_pi_state { - * @rt_waiter: rt_waiter storage for use with requeue_pi - * @requeue_pi_key: the requeue_pi target futex key - * @bitset: bitset for the optional bitmasked wakeup -+ * @uaddr: userspace address of futex -+ * @uval: expected futex's value - * - * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so - * we can wake only the relevant ones (hashed queues may be shared). -@@ -219,6 +221,8 @@ struct futex_q { - struct rt_mutex_waiter *rt_waiter; - union futex_key *requeue_pi_key; - u32 bitset; -+ u32 __user *uaddr; -+ u32 uval; - } __randomize_layout; - - static const struct futex_q futex_q_init = { -@@ -2304,6 +2308,29 @@ static int unqueue_me(struct futex_q *q) - return ret; - } - -+/** -+ * unqueue_multiple() - Remove several futexes from their futex_hash_bucket -+ * @q: The list of futexes to unqueue -+ * @count: Number of futexes in the list -+ * -+ * Helper to unqueue a list of futexes. This can't fail. -+ * -+ * Return: -+ * - >=0 - Index of the last futex that was awoken; -+ * - -1 - If no futex was awoken -+ */ -+static int unqueue_multiple(struct futex_q *q, int count) -+{ -+ int ret = -1; -+ int i; -+ -+ for (i = 0; i < count; i++) { -+ if (!unqueue_me(&q[i])) -+ ret = i; -+ } -+ return ret; -+} -+ - /* - * PI futexes can not be requeued and must remove themself from the - * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry -@@ -2662,6 +2689,205 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, - return ret; - } - -+/** -+ * futex_wait_multiple_setup() - Prepare to wait and enqueue multiple futexes -+ * @qs: The corresponding futex list -+ * @count: The size of the lists -+ * @flags: Futex flags (FLAGS_SHARED, etc.) -+ * @awaken: Index of the last awoken futex -+ * -+ * Prepare multiple futexes in a single step and enqueue them. This may fail if -+ * the futex list is invalid or if any futex was already awoken. 
On success the -+ * task is ready to interruptible sleep. -+ * -+ * Return: -+ * - 1 - One of the futexes was awaken by another thread -+ * - 0 - Success -+ * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL -+ */ -+static int futex_wait_multiple_setup(struct futex_q *qs, int count, -+ unsigned int flags, int *awaken) -+{ -+ struct futex_hash_bucket *hb; -+ int ret, i; -+ u32 uval; -+ -+ /* -+ * Enqueuing multiple futexes is tricky, because we need to -+ * enqueue each futex in the list before dealing with the next -+ * one to avoid deadlocking on the hash bucket. But, before -+ * enqueuing, we need to make sure that current->state is -+ * TASK_INTERRUPTIBLE, so we don't absorb any awake events, which -+ * cannot be done before the get_futex_key of the next key, -+ * because it calls get_user_pages, which can sleep. Thus, we -+ * fetch the list of futexes keys in two steps, by first pinning -+ * all the memory keys in the futex key, and only then we read -+ * each key and queue the corresponding futex. -+ */ -+retry: -+ for (i = 0; i < count; i++) { -+ qs[i].key = FUTEX_KEY_INIT; -+ ret = get_futex_key(qs[i].uaddr, flags & FLAGS_SHARED, -+ &qs[i].key, FUTEX_READ); -+ if (unlikely(ret)) { -+ return ret; -+ } -+ } -+ -+ set_current_state(TASK_INTERRUPTIBLE); -+ -+ for (i = 0; i < count; i++) { -+ struct futex_q *q = &qs[i]; -+ -+ hb = queue_lock(q); -+ -+ ret = get_futex_value_locked(&uval, q->uaddr); -+ if (ret) { -+ /* -+ * We need to try to handle the fault, which -+ * cannot be done without sleep, so we need to -+ * undo all the work already done, to make sure -+ * we don't miss any wake ups. Therefore, clean -+ * up, handle the fault and retry from the -+ * beginning. -+ */ -+ queue_unlock(hb); -+ -+ /* -+ * Keys 0..(i-1) are implicitly put -+ * on unqueue_multiple. -+ */ -+ *awaken = unqueue_multiple(qs, i); -+ -+ __set_current_state(TASK_RUNNING); -+ -+ /* -+ * On a real fault, prioritize the error even if -+ * some other futex was awoken. Userspace gave -+ * us a bad address, -EFAULT them. -+ */ -+ ret = get_user(uval, q->uaddr); -+ if (ret) -+ return ret; -+ -+ /* -+ * Even if the page fault was handled, If -+ * something was already awaken, we can safely -+ * give up and succeed to give a hint for userspace to -+ * acquire the right futex faster. -+ */ -+ if (*awaken >= 0) -+ return 1; -+ -+ goto retry; -+ } -+ -+ if (uval != q->uval) { -+ queue_unlock(hb); -+ -+ /* -+ * If something was already awaken, we can -+ * safely ignore the error and succeed. -+ */ -+ *awaken = unqueue_multiple(qs, i); -+ __set_current_state(TASK_RUNNING); -+ if (*awaken >= 0) -+ return 1; -+ -+ return -EWOULDBLOCK; -+ } -+ -+ /* -+ * The bucket lock can't be held while dealing with the -+ * next futex. Queue each futex at this moment so hb can -+ * be unlocked. -+ */ -+ queue_me(&qs[i], hb); -+ } -+ return 0; -+} -+ -+/** -+ * futex_wait_multiple() - Prepare to wait on and enqueue several futexes -+ * @qs: The list of futexes to wait on -+ * @op: Operation code from futex's syscall -+ * @count: The number of objects -+ * @abs_time: Timeout before giving up and returning to userspace -+ * -+ * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function -+ * sleeps on a group of futexes and returns on the first futex that -+ * triggered, or after the timeout has elapsed. 
-+ * -+ * Return: -+ * - >=0 - Hint to the futex that was awoken -+ * - <0 - On error -+ */ -+static int futex_wait_multiple(struct futex_q *qs, int op, -+ u32 count, ktime_t *abs_time) -+{ -+ struct hrtimer_sleeper timeout, *to; -+ int ret, flags = 0, hint = 0; -+ unsigned int i; -+ -+ if (!(op & FUTEX_PRIVATE_FLAG)) -+ flags |= FLAGS_SHARED; -+ -+ if (op & FUTEX_CLOCK_REALTIME) -+ flags |= FLAGS_CLOCKRT; -+ -+ to = futex_setup_timer(abs_time, &timeout, flags, 0); -+ while (1) { -+ ret = futex_wait_multiple_setup(qs, count, flags, &hint); -+ if (ret) { -+ if (ret > 0) { -+ /* A futex was awaken during setup */ -+ ret = hint; -+ } -+ break; -+ } -+ -+ if (to) -+ hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); -+ -+ /* -+ * Avoid sleeping if another thread already tried to -+ * wake us. -+ */ -+ for (i = 0; i < count; i++) { -+ if (plist_node_empty(&qs[i].list)) -+ break; -+ } -+ -+ if (i == count && (!to || to->task)) -+ freezable_schedule(); -+ -+ ret = unqueue_multiple(qs, count); -+ -+ __set_current_state(TASK_RUNNING); -+ -+ if (ret >= 0) -+ break; -+ if (to && !to->task) { -+ ret = -ETIMEDOUT; -+ break; -+ } else if (signal_pending(current)) { -+ ret = -ERESTARTSYS; -+ break; -+ } -+ /* -+ * The final case is a spurious wakeup, for -+ * which just retry. -+ */ -+ } -+ -+ if (to) { -+ hrtimer_cancel(&to->timer); -+ destroy_hrtimer_on_stack(&to->timer); -+ } -+ -+ return ret; -+} -+ - static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, - ktime_t *abs_time, u32 bitset) - { -@@ -3774,6 +4000,43 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, - return -ENOSYS; - } - -+/** -+ * futex_read_wait_block - Read an array of futex_wait_block from userspace -+ * @uaddr: Userspace address of the block -+ * @count: Number of blocks to be read -+ * -+ * This function creates and allocate an array of futex_q (we zero it to -+ * initialize the fields) and then, for each futex_wait_block element from -+ * userspace, fill a futex_q element with proper values. 
-+ */ -+inline struct futex_q *futex_read_wait_block(u32 __user *uaddr, u32 count) -+{ -+ unsigned int i; -+ struct futex_q *qs; -+ struct futex_wait_block fwb; -+ struct futex_wait_block __user *entry = -+ (struct futex_wait_block __user *)uaddr; -+ -+ if (!count || count > FUTEX_MULTIPLE_MAX_COUNT) -+ return ERR_PTR(-EINVAL); -+ -+ qs = kcalloc(count, sizeof(*qs), GFP_KERNEL); -+ if (!qs) -+ return ERR_PTR(-ENOMEM); -+ -+ for (i = 0; i < count; i++) { -+ if (copy_from_user(&fwb, &entry[i], sizeof(fwb))) { -+ kfree(qs); -+ return ERR_PTR(-EFAULT); -+ } -+ -+ qs[i].uaddr = fwb.uaddr; -+ qs[i].uval = fwb.val; -+ qs[i].bitset = fwb.bitset; -+ } -+ -+ return qs; -+} - - SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, - struct __kernel_timespec __user *, utime, u32 __user *, uaddr2, -@@ -3786,7 +4049,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, - - if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET || -- cmd == FUTEX_WAIT_REQUEUE_PI)) { -+ cmd == FUTEX_WAIT_REQUEUE_PI || -+ cmd == FUTEX_WAIT_MULTIPLE)) { - if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) - return -EFAULT; - if (get_timespec64(&ts, utime)) -@@ -3807,6 +4071,25 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, - cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) - val2 = (u32) (unsigned long) utime; - -+ if (cmd == FUTEX_WAIT_MULTIPLE) { -+ int ret; -+ struct futex_q *qs; -+ -+#ifdef CONFIG_X86_X32 -+ if (unlikely(in_x32_syscall())) -+ return -ENOSYS; -+#endif -+ qs = futex_read_wait_block(uaddr, val); -+ -+ if (IS_ERR(qs)) -+ return PTR_ERR(qs); -+ -+ ret = futex_wait_multiple(qs, op, val, tp); -+ kfree(qs); -+ -+ return ret; -+ } -+ - return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); - } - -@@ -3969,6 +4252,57 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, - #endif /* CONFIG_COMPAT */ - - #ifdef CONFIG_COMPAT_32BIT_TIME -+/** -+ * struct compat_futex_wait_block - Block of futexes to be waited for -+ * @uaddr: User address of the futex (compatible pointer) -+ * @val: Futex value expected by userspace -+ * @bitset: Bitset for the optional bitmasked wakeup -+ */ -+struct compat_futex_wait_block { -+ compat_uptr_t uaddr; -+ __u32 val; -+ __u32 bitset; -+}; -+ -+/** -+ * compat_futex_read_wait_block - Read an array of futex_wait_block from -+ * userspace -+ * @uaddr: Userspace address of the block -+ * @count: Number of blocks to be read -+ * -+ * This function does the same as futex_read_wait_block(), except that it -+ * converts the pointer to the futex from the compat version to the regular one. 
-+ */ -+inline struct futex_q *compat_futex_read_wait_block(u32 __user *uaddr, -+ u32 count) -+{ -+ unsigned int i; -+ struct futex_q *qs; -+ struct compat_futex_wait_block fwb; -+ struct compat_futex_wait_block __user *entry = -+ (struct compat_futex_wait_block __user *)uaddr; -+ -+ if (!count || count > FUTEX_MULTIPLE_MAX_COUNT) -+ return ERR_PTR(-EINVAL); -+ -+ qs = kcalloc(count, sizeof(*qs), GFP_KERNEL); -+ if (!qs) -+ return ERR_PTR(-ENOMEM); -+ -+ for (i = 0; i < count; i++) { -+ if (copy_from_user(&fwb, &entry[i], sizeof(fwb))) { -+ kfree(qs); -+ return ERR_PTR(-EFAULT); -+ } -+ -+ qs[i].uaddr = compat_ptr(fwb.uaddr); -+ qs[i].uval = fwb.val; -+ qs[i].bitset = fwb.bitset; -+ } -+ -+ return qs; -+} -+ - SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, - struct old_timespec32 __user *, utime, u32 __user *, uaddr2, - u32, val3) -@@ -3980,7 +4314,8 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, - - if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET || -- cmd == FUTEX_WAIT_REQUEUE_PI)) { -+ cmd == FUTEX_WAIT_REQUEUE_PI || -+ cmd == FUTEX_WAIT_MULTIPLE)) { - if (get_old_timespec32(&ts, utime)) - return -EFAULT; - if (!timespec64_valid(&ts)) -@@ -3995,6 +4330,19 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, - cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) - val2 = (int) (unsigned long) utime; - -+ if (cmd == FUTEX_WAIT_MULTIPLE) { -+ int ret; -+ struct futex_q *qs = compat_futex_read_wait_block(uaddr, val); -+ -+ if (IS_ERR(qs)) -+ return PTR_ERR(qs); -+ -+ ret = futex_wait_multiple(qs, op, val, tp); -+ kfree(qs); -+ -+ return ret; -+ } -+ - return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); - } - #endif /* CONFIG_COMPAT_32BIT_TIME */ - -From ccdddb50d330d2ee1a4d2cbfdd27bdd7fb10eec3 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Andr=C3=A9=20Almeida?= -Date: Fri, 7 Feb 2020 23:28:02 -0300 -Subject: [PATCH 2/2] futex: Add Proton compatibility code - ---- - include/uapi/linux/futex.h | 2 +- - kernel/futex.c | 5 +++-- - 2 files changed, 4 insertions(+), 3 deletions(-) - -diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h -index 580001e..a3e7608 100644 ---- a/include/uapi/linux/futex.h -+++ b/include/uapi/linux/futex.h -@@ -21,7 +21,7 @@ - #define FUTEX_WAKE_BITSET 10 - #define FUTEX_WAIT_REQUEUE_PI 11 - #define FUTEX_CMP_REQUEUE_PI 12 --#define FUTEX_WAIT_MULTIPLE 13 -+#define FUTEX_WAIT_MULTIPLE 31 - - #define FUTEX_PRIVATE_FLAG 128 - #define FUTEX_CLOCK_REALTIME 256 -diff --git a/kernel/futex.c b/kernel/futex.c -index caba751..84c520c 100644 ---- a/kernel/futex.c -+++ b/kernel/futex.c -@@ -4074,7 +4074,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, - return -EINVAL; - - t = timespec64_to_ktime(ts); -- if (cmd == FUTEX_WAIT) -+ if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) - t = ktime_add_safe(ktime_get(), t); - else if (!(op & FUTEX_CLOCK_REALTIME)) - t = timens_ktime_to_host(CLOCK_MONOTONIC, t); -@@ -4277,6 +4277,7 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, - */ - struct compat_futex_wait_block { - compat_uptr_t uaddr; -+ __u32 pad; - __u32 val; - __u32 bitset; - }; -@@ -4339,7 +4340,7 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, - return -EINVAL; - - t = timespec64_to_ktime(ts); -- if (cmd == FUTEX_WAIT) -+ if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) - t = ktime_add_safe(ktime_get(), t); - else if (!(op & FUTEX_CLOCK_REALTIME)) - t = timens_ktime_to_host(CLOCK_MONOTONIC, t); diff 
--git a/SOURCES/futex2.patch b/SOURCES/futex2.patch new file mode 100644 index 0000000..bfd12ba --- /dev/null +++ b/SOURCES/futex2.patch @@ -0,0 +1,6697 @@ +From ada1f13b98e86cb7ac4140c4976c3d165006d995 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Andr=C3=A9=20Almeida?= +Date: Wed, 5 Aug 2020 12:40:26 -0300 +Subject: [PATCH 01/13] futex2: Add new futex interface +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Initial implementation for futex2. Support only private u32 wait/wake, with +timeout (monotonic and realtime clocks). + +Signed-off-by: André Almeida +--- + MAINTAINERS | 2 +- + arch/x86/entry/syscalls/syscall_32.tbl | 2 + + arch/x86/entry/syscalls/syscall_64.tbl | 2 + + include/linux/syscalls.h | 7 + + include/uapi/asm-generic/unistd.h | 8 +- + include/uapi/linux/futex.h | 40 ++ + init/Kconfig | 7 + + kernel/Makefile | 1 + + kernel/futex2.c | 484 +++++++++++++++++++++++++ + kernel/sys_ni.c | 4 + + 10 files changed, 555 insertions(+), 2 deletions(-) + create mode 100644 kernel/futex2.c + +diff --git a/MAINTAINERS b/MAINTAINERS +index 867157311dc8..0c425f74ed88 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -7214,7 +7214,7 @@ F: Documentation/locking/*futex* + F: include/asm-generic/futex.h + F: include/linux/futex.h + F: include/uapi/linux/futex.h +-F: kernel/futex.c ++F: kernel/futex* + F: tools/perf/bench/futex* + F: tools/testing/selftests/futex/ + +diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl +index 9d1102873666..955322962964 100644 +--- a/arch/x86/entry/syscalls/syscall_32.tbl ++++ b/arch/x86/entry/syscalls/syscall_32.tbl +@@ -444,3 +444,5 @@ + 437 i386 openat2 sys_openat2 + 438 i386 pidfd_getfd sys_pidfd_getfd + 439 i386 faccessat2 sys_faccessat2 ++440 i386 futex_wait sys_futex_wait ++441 i386 futex_wake sys_futex_wake +diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl +index f30d6ae9a688..4133bfe96891 100644 +--- a/arch/x86/entry/syscalls/syscall_64.tbl ++++ b/arch/x86/entry/syscalls/syscall_64.tbl +@@ -361,6 +361,8 @@ + 437 common openat2 sys_openat2 + 438 common pidfd_getfd sys_pidfd_getfd + 439 common faccessat2 sys_faccessat2 ++440 common futex_wait sys_futex_wait ++441 common futex_wake sys_futex_wake + + # + # x32-specific system call numbers start at 512 to avoid cache impact +diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h +index 75ac7f8ae93c..38c3a87dbfc2 100644 +--- a/include/linux/syscalls.h ++++ b/include/linux/syscalls.h +@@ -589,6 +589,13 @@ asmlinkage long sys_get_robust_list(int pid, + asmlinkage long sys_set_robust_list(struct robust_list_head __user *head, + size_t len); + ++/* kernel/futex2.c */ ++asmlinkage long sys_futex_wait(void __user *uaddr, unsigned long val, ++ unsigned long flags, ++ struct __kernel_timespec __user __user *timo); ++asmlinkage long sys_futex_wake(void __user *uaddr, unsigned long nr_wake, ++ unsigned long flags); ++ + /* kernel/hrtimer.c */ + asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp, + struct __kernel_timespec __user *rmtp); +diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h +index 995b36c2ea7d..80567ade774a 100644 +--- a/include/uapi/asm-generic/unistd.h ++++ b/include/uapi/asm-generic/unistd.h +@@ -860,8 +860,14 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) + #define __NR_faccessat2 439 + __SYSCALL(__NR_faccessat2, sys_faccessat2) + ++#define __NR_futex_wait 440 ++__SYSCALL(__NR_futex_wait, sys_futex_wait) ++ ++#define 
__NR_futex_wake 441 ++__SYSCALL(__NR_futex_wake, sys_futex_wake) ++ + #undef __NR_syscalls +-#define __NR_syscalls 440 ++#define __NR_syscalls 442 + + /* + * 32 bit systems traditionally used different +diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h +index a89eb0accd5e..35a5bf1cd41b 100644 +--- a/include/uapi/linux/futex.h ++++ b/include/uapi/linux/futex.h +@@ -41,6 +41,46 @@ + #define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \ + FUTEX_PRIVATE_FLAG) + ++/* Size argument to futex2 syscall */ ++#define FUTEX_8 0 ++#define FUTEX_16 1 ++#define FUTEX_32 2 ++ ++#define FUTEX_SIZE_MASK 0x3 ++ ++#define FUTEX_SHARED_FLAG 8 ++ ++#define FUTEX_NUMA_FLAG 16 ++ ++/* ++ * struct futexXX_numa - struct for NUMA-aware futex operation ++ * @value: futex value ++ * @hint: node id to operate ++ */ ++ ++struct futex8_numa { ++ __u8 value; ++ __u8 hint; ++}; ++ ++struct futex16_numa { ++ __u16 value; ++ __u16 hint; ++}; ++ ++struct futex32_numa { ++ __u32 value; ++ __u32 hint; ++}; ++ ++#define FUTEX_WAITV_MAX 128 ++ ++struct futex_waitv { ++ void *uaddr; ++ unsigned int val; ++ unsigned int flags; ++}; ++ + /* + * Support for robust futexes: the kernel cleans up held futexes at + * thread exit time. +diff --git a/init/Kconfig b/init/Kconfig +index 2a5df1cf838c..440f21f5c3d8 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1522,6 +1522,13 @@ config FUTEX + support for "fast userspace mutexes". The resulting kernel may not + run glibc-based applications correctly. + ++config FUTEX2 ++ bool "Enable futex2 support" if EXPERT ++ depends on FUTEX ++ default n ++ help ++ Experimental support for futex2 interface. ++ + config FUTEX_PI + bool + depends on FUTEX && RT_MUTEXES +diff --git a/kernel/Makefile b/kernel/Makefile +index 9a20016d4900..51ea9bc647bf 100644 +--- a/kernel/Makefile ++++ b/kernel/Makefile +@@ -57,6 +57,7 @@ obj-$(CONFIG_PROFILING) += profile.o + obj-$(CONFIG_STACKTRACE) += stacktrace.o + obj-y += time/ + obj-$(CONFIG_FUTEX) += futex.o ++obj-$(CONFIG_FUTEX2) += futex2.o + obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o + obj-$(CONFIG_SMP) += smp.o + ifneq ($(CONFIG_SMP),y) +diff --git a/kernel/futex2.c b/kernel/futex2.c +new file mode 100644 +index 000000000000..107b80a466d0 +--- /dev/null ++++ b/kernel/futex2.c +@@ -0,0 +1,484 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/* ++ * futex2 system call interface by André Almeida ++ * ++ * Copyright 2020 Collabora Ltd. 
++ */
++
++#include <linux/freezer.h>
++#include <linux/jhash.h>
++#include <linux/memblock.h>
++#include <linux/sched/wake_q.h>
++#include <linux/spinlock.h>
++#include <linux/syscalls.h>
++#include <uapi/linux/futex.h>
++
++/**
++ * struct futex_waiter - List entry for a waiter
++ * @key.address: Memory address of userspace futex
++ * @key.mm: Pointer to memory management struct of this process
++ * @key: Stores information that uniquely identify a futex
++ * @list: List node struct
++ * @val: Expected value for this waiter
++ * @flags: Flags
++ * @bucket: Pointer to the bucket for this waiter
++ * @index: Index of waiter in futexv list
++ */
++struct futex_waiter {
++	struct futex_key {
++		uintptr_t address;
++		struct mm_struct *mm;
++	} key;
++	struct list_head list;
++	unsigned int val;
++	unsigned int flags;
++	struct futex_bucket *bucket;
++	unsigned int index;
++};
++
++/**
++ * struct futex_bucket - A bucket of futex's hash table
++ * @waiters: Number of waiters in the bucket
++ * @lock: Bucket lock
++ * @list: List of waiters on this bucket
++ */
++struct futex_bucket {
++	atomic_t waiters;
++	spinlock_t lock;
++	struct list_head list;
++};
++
++struct futexv {
++	struct task_struct *task;
++	int hint;
++	struct futex_waiter objects[0];
++};
++
++struct futex_single_waiter {
++	struct futexv parent;
++	struct futex_waiter waiter;
++} __packed;
++
++struct futex_bucket *futex_table;
++
++/* mask for futex2 flag operations */
++#define FUTEX2_MASK (FUTEX_SIZE_MASK | FUTEX_SHARED_FLAG | \
++		     FUTEX_CLOCK_REALTIME)
++
++// mask for sys_futex_waitv
++#define FUTEXV_MASK (FUTEX_CLOCK_REALTIME)
++
++// mask for each futex in futex_waitv list
++#define FUTEXV_WAITER_MASK (FUTEX_SIZE_MASK | FUTEX_SHARED_FLAG)
++
++int futex2_hashsize;
++
++/*
++ * Reflects a new waiter being added to the waitqueue.
++ */
++static inline void bucket_inc_waiters(struct futex_bucket *bucket)
++{
++#ifdef CONFIG_SMP
++	atomic_inc(&bucket->waiters);
++	/*
++	 * Full barrier (A), see the ordering comment above.
++	 */
++	smp_mb__after_atomic();
++#endif
++}
++
++/*
++ * Reflects a waiter being removed from the waitqueue by wakeup
++ * paths.
++ */
++static inline void bucket_dec_waiters(struct futex_bucket *bucket)
++{
++#ifdef CONFIG_SMP
++	atomic_dec(&bucket->waiters);
++#endif
++}
++
++/*
++ * Get the number of waiters in a bucket
++ */
++static inline int bucket_get_waiters(struct futex_bucket *bucket)
++{
++#ifdef CONFIG_SMP
++	/*
++	 * Full barrier (B), see the ordering comment above.
++ */ ++ smp_mb(); ++ return atomic_read(&bucket->waiters); ++#else ++ return 1; ++#endif ++} ++ ++/** ++ * futex_get_bucket - Check if the user address is valid, prepare internal ++ * data and calculate the hash ++ * @uaddr: futex user address ++ * @key: data that uniquely identifies a futex ++ * ++ * Return: address of bucket on success, error code otherwise ++ */ ++static struct futex_bucket *futex_get_bucket(void __user *uaddr, ++ struct futex_key *key) ++{ ++ uintptr_t address = (uintptr_t) uaddr; ++ u32 hash_key; ++ ++ /* Checking if uaddr is valid and accessible */ ++ if (unlikely(!IS_ALIGNED(address, sizeof(u32)))) ++ return ERR_PTR(-EINVAL); ++ if (unlikely(!access_ok(address, sizeof(u32)))) ++ return ERR_PTR(-EFAULT); ++ ++ key->address = address; ++ key->mm = current->mm; ++ ++ /* Generate hash key for this futex using uaddr and current->mm */ ++ hash_key = jhash2((u32 *) key, sizeof(*key) / sizeof(u32), 0); ++ ++ /* Since HASH_SIZE is 2^n, subtracting 1 makes a perfect bit mask */ ++ return &futex_table[hash_key & (futex2_hashsize - 1)]; ++} ++ ++/** ++ * futex_get_user - Get the userspace value on this address ++ * @uval: variable to store the value ++ * @uaddr: userspace address ++ * ++ * Check the comment at futex_get_user_val for more information. ++ */ ++static int futex_get_user(u32 *uval, u32 *uaddr) ++{ ++ int ret; ++ ++ pagefault_disable(); ++ ret = __get_user(*uval, uaddr); ++ pagefault_enable(); ++ ++ return ret; ++} ++ ++/** ++ * futex_setup_time - Prepare the timeout mechanism, without starting it. ++ * @timo: Timeout value from userspace ++ * @timeout: Pointer to hrtimer handler ++ * @flags: Flags from userspace, to decide which clockid to use ++ * ++ * Return: 0 on success, error code otherwise ++ */ ++static int futex_setup_time(struct __kernel_timespec __user *timo, ++ struct hrtimer_sleeper *timeout, ++ unsigned int flags) ++{ ++ ktime_t time; ++ struct timespec64 ts; ++ clockid_t clockid = (flags & FUTEX_CLOCK_REALTIME) ? ++ CLOCK_REALTIME : CLOCK_MONOTONIC; ++ ++ if (get_timespec64(&ts, timo)) ++ return -EFAULT; ++ ++ if (!timespec64_valid(&ts)) ++ return -EINVAL; ++ ++ time = timespec64_to_ktime(ts); ++ ++ hrtimer_init_sleeper(timeout, clockid, HRTIMER_MODE_ABS); ++ ++ hrtimer_set_expires(&timeout->timer, time); ++ ++ return 0; ++} ++ ++ ++/** ++ * futex_get_user_value - Get the value from the userspace address and compares ++ * with the expected one. In success, leaves the function ++ * holding the bucket lock. Else, hold no lock. ++ * @bucket: hash bucket of this address ++ * @uaddr: futex's userspace address ++ * @val: expected value ++ * @multiple: is this call in the wait on multiple path ++ * ++ * Return: 0 on success, error code otherwise ++ */ ++static int futex_get_user_value(struct futex_bucket *bucket, u32 __user *uaddr, ++ unsigned int val, bool multiple) ++{ ++ u32 uval; ++ int ret; ++ ++ /* ++ * Get the value from user futex address. ++ * ++ * Since we are in a hurry, we use a spin lock and we can't sleep. ++ * Try to get the value with page fault disabled (when enable, we might ++ * sleep). ++ * ++ * If we fail, we aren't sure if the address is invalid or is just a ++ * page fault. Then, release the lock (so we can sleep) and try to get ++ * the value with page fault enabled. In order to trigger a page fault ++ * handling, we just call __get_user() again. ++ * ++ * If get_user succeeds, this mean that the address is valid and we do ++ * the loop again. 
Since we just handled the page fault, the page is ++ * likely pinned in memory and we should be luckier this time and be ++ * able to get the value. If we fail anyway, we will try again. ++ * ++ * If even with page faults enabled we get and error, this means that ++ * the address is not valid and we return from the syscall. ++ */ ++ do { ++ spin_lock(&bucket->lock); ++ ++ ret = futex_get_user(&uval, uaddr); ++ ++ if (ret) { ++ spin_unlock(&bucket->lock); ++ if (multiple || __get_user(uval, uaddr)) ++ return -EFAULT; ++ ++ } ++ } while (ret); ++ ++ if (uval != val) { ++ spin_unlock(&bucket->lock); ++ return -EWOULDBLOCK; ++ } ++ ++ return 0; ++} ++ ++/** ++ * futex_dequeue - Remove a futex from a queue ++ * @bucket: current bucket holding the futex ++ * @waiter: futex to be removed ++ * ++ * Return: True if futex was removed by this function, false if another wake ++ * thread removed this futex. ++ * ++ * This function should be used after we found that this futex was in a queue. ++ * Thus, it needs to be removed before the next step. However, someone could ++ * wake it between the time of the first check and the time to get the lock for ++ * the bucket. Check one more time if the futex is there with the bucket locked. ++ * If it's there, just remove it and return true. Else, mark the removal as ++ * false and do nothing. ++ */ ++static bool futex_dequeue(struct futex_bucket *bucket, struct futex_waiter *waiter) ++{ ++ bool removed = true; ++ ++ spin_lock(&bucket->lock); ++ if (list_empty(&waiter->list)) ++ removed = false; ++ else ++ list_del(&waiter->list); ++ spin_unlock(&bucket->lock); ++ ++ if (removed) ++ bucket_dec_waiters(bucket); ++ ++ return removed; ++} ++ ++/** ++ * sys_futex_wait - Wait on a futex address if (*uaddr) == val ++ * @uaddr: User address of futex ++ * @val: Expected value of futex ++ * @flags: Specify the size of futex and the clockid ++ * @timo: Optional absolute timeout. Supports only 64bit time. 
++ */ ++SYSCALL_DEFINE4(futex_wait, void __user *, uaddr, unsigned int, val, ++ unsigned int, flags, struct __kernel_timespec __user *, timo) ++{ ++ unsigned int size = flags & FUTEX_SIZE_MASK; ++ struct hrtimer_sleeper timeout; ++ struct futex_bucket *bucket; ++ struct futex_single_waiter wait_single; ++ struct futex_waiter *waiter; ++ int ret; ++ ++ wait_single.parent.task = current; ++ wait_single.parent.hint = 0; ++ waiter = &wait_single.waiter; ++ waiter->index = 0; ++ ++ if (flags & ~FUTEX2_MASK) ++ return -EINVAL; ++ ++ if (size != FUTEX_32) ++ return -EINVAL; ++ ++ if (timo) { ++ ret = futex_setup_time(timo, &timeout, flags); ++ if (ret) ++ return ret; ++ } ++ ++ /* Get an unlocked hash bucket */ ++ bucket = futex_get_bucket(uaddr, &waiter->key); ++ if (IS_ERR(bucket)) ++ return PTR_ERR(bucket); ++ ++ if (timo) ++ hrtimer_sleeper_start_expires(&timeout, HRTIMER_MODE_ABS); ++ ++retry: ++ bucket_inc_waiters(bucket); ++ ++ /* Compare the expected and current value, get the bucket lock */ ++ ret = futex_get_user_value(bucket, uaddr, val, false); ++ if (ret) { ++ bucket_dec_waiters(bucket); ++ goto out; ++ } ++ ++ /* Add the waiter to the hash table and sleep */ ++ set_current_state(TASK_INTERRUPTIBLE); ++ list_add_tail(&waiter->list, &bucket->list); ++ spin_unlock(&bucket->lock); ++ ++ /* Do not sleep if someone woke this futex or if it was timeouted */ ++ if (!list_empty_careful(&waiter->list) && (!timo || timeout.task)) ++ freezable_schedule(); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ /* ++ * One of those things triggered this wake: ++ * ++ * * We have been removed from the bucket. futex_wake() woke us. We just ++ * need to return 0 to userspace. ++ * ++ * However, if we find ourselves in the bucket we must remove ourselves ++ * from the bucket and ... ++ * ++ * * If the there's a timeout and it has expired, return -ETIMEDOUT. ++ * ++ * * If there is a signal pending, something wants to kill our thread. ++ * Return -ERESTARTSYS. ++ * ++ * * If there's no signal pending, it was a spurious wake (scheduler ++ * gave us a change to do some work, even if we don't want to). We ++ * need to remove ourselves from the bucket and add again, to prevent ++ * losing wakeups in the meantime. 
++ */ ++ ++ /* Normal wake */ ++ if (list_empty_careful(&waiter->list)) ++ goto out; ++ ++ if (!futex_dequeue(bucket, waiter)) ++ goto out; ++ ++ /* Timeout */ ++ if (timo && !timeout.task) ++ return -ETIMEDOUT; ++ ++ /* Spurious wakeup */ ++ if (!signal_pending(current)) ++ goto retry; ++ ++ /* Some signal is pending */ ++ ret = -ERESTARTSYS; ++out: ++ if (timo) ++ hrtimer_cancel(&timeout.timer); ++ ++ return ret; ++} ++ ++static struct futexv *futex_get_parent(uintptr_t waiter, u8 index) ++{ ++ uintptr_t parent = waiter - sizeof(struct futexv) ++ - (uintptr_t) (index * sizeof(struct futex_waiter)); ++ ++ return (struct futexv *) parent; ++} ++ ++/** ++ * sys_futex_wake - Wake a number of futexes waiting on an address ++ * @uaddr: Address of futex to be woken up ++ * @nr_wake: Number of futexes to be woken up ++ * @flags: TODO ++ */ ++SYSCALL_DEFINE3(futex_wake, void __user *, uaddr, unsigned int, nr_wake, ++ unsigned int, flags) ++{ ++ unsigned int size = flags & FUTEX_SIZE_MASK; ++ struct futex_waiter waiter, *aux, *tmp; ++ struct futex_bucket *bucket; ++ struct task_struct *task; ++ DEFINE_WAKE_Q(wake_q); ++ int ret = 0; ++ ++ if (flags & ~FUTEX2_MASK) ++ return -EINVAL; ++ ++ if (size != FUTEX_32) ++ return -EINVAL; ++ ++ bucket = futex_get_bucket(uaddr, &waiter.key); ++ if (IS_ERR(bucket)) ++ return PTR_ERR(bucket); ++ ++ if (!bucket_get_waiters(bucket)) ++ return 0; ++ ++ spin_lock(&bucket->lock); ++ list_for_each_entry_safe(aux, tmp, &bucket->list, list) { ++ if (ret >= nr_wake) ++ break; ++ ++ if (waiter.key.address == aux->key.address && ++ waiter.key.mm == aux->key.mm) { ++ struct futexv *parent = ++ futex_get_parent((uintptr_t) aux, aux->index); ++ ++ parent->hint = 1; ++ task = parent->task; ++ get_task_struct(task); ++ list_del_init_careful(&aux->list); ++ wake_q_add_safe(&wake_q, task); ++ ret++; ++ bucket_dec_waiters(bucket); ++ } ++ } ++ spin_unlock(&bucket->lock); ++ ++ wake_up_q(&wake_q); ++ ++ return ret; ++} ++ ++static int __init futex2_init(void) ++{ ++ int i; ++ unsigned int futex_shift; ++ ++#if CONFIG_BASE_SMALL ++ futex2_hashsize = 16; ++#else ++ futex2_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); ++#endif ++ ++ futex_table = alloc_large_system_hash("futex2", sizeof(struct futex_bucket), ++ futex2_hashsize, 0, ++ futex2_hashsize < 256 ? 
HASH_SMALL : 0,
++					      &futex_shift, NULL,
++					      futex2_hashsize, futex2_hashsize);
++	futex2_hashsize = 1UL << futex_shift;
++
++	for (i = 0; i < futex2_hashsize; i++) {
++		INIT_LIST_HEAD(&futex_table[i].list);
++		spin_lock_init(&futex_table[i].lock);
++		atomic_set(&futex_table[i].waiters, 0);
++	}
++
++	return 0;
++}
++core_initcall(futex2_init);
+diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
+index 4d59775ea79c..10049bc56c24 100644
+--- a/kernel/sys_ni.c
++++ b/kernel/sys_ni.c
+@@ -148,6 +148,10 @@ COND_SYSCALL_COMPAT(set_robust_list);
+ COND_SYSCALL(get_robust_list);
+ COND_SYSCALL_COMPAT(get_robust_list);
+ 
++/* kernel/futex2.c */
++COND_SYSCALL(futex_wait);
++COND_SYSCALL(futex_wake);
++
+ /* kernel/hrtimer.c */
+ 
+ /* kernel/itimer.c */
+--
+2.28.0
+
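Before the vectorized-wait patch, a minimal userspace sketch (not part of either patch) of the wait/wake pair added above. It assumes an x86-64 kernel built with CONFIG_FUTEX2=y; the syscall numbers (440/441) and the FUTEX_32 size flag come from this patch and are redefined locally since stock headers lack them. The expected-value mismatch path is used so the sketch runs to completion without a second thread.

/* Sketch of the futex2 wait/wake syscalls added above. */
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#define __NR_futex_wait	440	/* from the syscall tables above */
#define __NR_futex_wake	441
#define FUTEX_32	2	/* 32-bit sized futex, from the new uapi */

static uint32_t futex_word = 1;

int main(void)
{
	/* The expected value (0) differs from *uaddr (1), so the kernel
	 * returns -EWOULDBLOCK immediately instead of sleeping. */
	if (syscall(__NR_futex_wait, &futex_word, 0, FUTEX_32, NULL) == -1)
		perror("futex_wait");	/* expected: EAGAIN/EWOULDBLOCK */

	/* Wake up to one waiter on this word; returns the number of
	 * woken tasks (0 here, since nobody is waiting). */
	long woken = syscall(__NR_futex_wake, &futex_word, 1, FUTEX_32);
	printf("woken: %ld\n", woken);
	return 0;
}

With a second thread parked in futex_wait on a matching value, the same futex_wake call would return 1 and unblock it.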
+From 08110d54945541dd186a7dabeef58be08011dde7 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Andr=C3=A9=20Almeida?=
+Date: Thu, 15 Oct 2020 17:15:57 -0300
+Subject: [PATCH 02/13] futex2: Add support for vectorized wait
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Add support to wait on multiple futexes
+
+Signed-off-by: André Almeida
+---
+ arch/x86/entry/syscalls/syscall_32.tbl |   1 +
+ arch/x86/entry/syscalls/syscall_64.tbl |   1 +
+ include/uapi/asm-generic/unistd.h      |   5 +-
+ kernel/futex2.c                        | 430 +++++++++++++++++--------
+ kernel/sys_ni.c                        |   1 +
+ 5 files changed, 304 insertions(+), 134 deletions(-)
+
+diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
+index 955322962964..c844c0cbf0e5 100644
+--- a/arch/x86/entry/syscalls/syscall_32.tbl
++++ b/arch/x86/entry/syscalls/syscall_32.tbl
+@@ -446,3 +446,4 @@
+ 439	i386	faccessat2		sys_faccessat2
+ 440	i386	futex_wait		sys_futex_wait
+ 441	i386	futex_wake		sys_futex_wake
++442	i386	futex_waitv		sys_futex_waitv
+diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
+index 4133bfe96891..0901c26c6786 100644
+--- a/arch/x86/entry/syscalls/syscall_64.tbl
++++ b/arch/x86/entry/syscalls/syscall_64.tbl
+@@ -363,6 +363,7 @@
+ 439	common	faccessat2		sys_faccessat2
+ 440	common	futex_wait		sys_futex_wait
+ 441	common	futex_wake		sys_futex_wake
++442	common	futex_waitv		sys_futex_waitv
+ 
+ #
+ # x32-specific system call numbers start at 512 to avoid cache impact
+diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
+index 80567ade774a..d7ebbed0a18c 100644
+--- a/include/uapi/asm-generic/unistd.h
++++ b/include/uapi/asm-generic/unistd.h
+@@ -866,8 +866,11 @@ __SYSCALL(__NR_futex_wait, sys_futex_wait)
+ #define __NR_futex_wake 441
+ __SYSCALL(__NR_futex_wake, sys_futex_wake)
+ 
++#define __NR_futex_waitv 442
++__SYSCALL(__NR_futex_waitv, sys_futex_waitv)
++
+ #undef __NR_syscalls
+-#define __NR_syscalls 442
++#define __NR_syscalls 443
+ 
+ /*
+  * 32 bit systems traditionally used different
+diff --git a/kernel/futex2.c b/kernel/futex2.c
+index 107b80a466d0..4b782b5ef615 100644
+--- a/kernel/futex2.c
++++ b/kernel/futex2.c
+@@ -48,14 +48,25 @@ struct futex_bucket {
+ 	struct list_head list;
+ };
+ 
++/**
++ * struct futexv - List of futexes to be waited on
++ * @task: Task to be woken
++ * @hint: Was someone on this list awoken?
++ * @objects: List of futexes
++ */
+ struct futexv {
+ 	struct task_struct *task;
+-	int hint;
++	bool hint;
+ 	struct futex_waiter objects[0];
+ };
+ 
++/**
++ * struct futex_single_waiter - Wrapper for a futexv of one element
++ * @futexv: TODO
++ * @waiter: TODO
++ */
+ struct futex_single_waiter {
+-	struct futexv parent;
++	struct futexv futexv;
+ 	struct futex_waiter waiter;
+ } __packed;
+ 
+@@ -65,10 +76,10 @@ struct futex_bucket *futex_table;
+ #define FUTEX2_MASK (FUTEX_SIZE_MASK | FUTEX_SHARED_FLAG | \
+ 		     FUTEX_CLOCK_REALTIME)
+ 
+-// mask for sys_futex_waitv
++/* mask for sys_futex_waitv flag */
+ #define FUTEXV_MASK (FUTEX_CLOCK_REALTIME)
+ 
+-// mask for each futex in futex_waitv list
++/* mask for each futex in futex_waitv list */
+ #define FUTEXV_WAITER_MASK (FUTEX_SIZE_MASK | FUTEX_SHARED_FLAG)
+ 
+ int futex2_hashsize;
+@@ -151,7 +162,7 @@ static struct futex_bucket *futex_get_bucket(void __user *uaddr,
+  *
+  * Check the comment at futex_get_user_val for more information.
+  */
+-static int futex_get_user(u32 *uval, u32 *uaddr)
++static int futex_get_user(u32 *uval, u32 __user *uaddr)
+ {
+ 	int ret;
+ 
+@@ -194,95 +205,227 @@ static int futex_setup_time(struct __kernel_timespec __user *timo,
+ 	return 0;
+ }
+ 
++/**
++ * futex_dequeue_multiple - Remove multiple futexes from hash table
++ * @futexv: list of waiters
++ * @nr: number of futexes to be removed
++ *
++ * This function should be used after we found that this futex was in a queue.
++ * Thus, it needs to be removed before the next step. However, someone could
++ * wake it between the time of the first check and the time to get the lock for
++ * the bucket. Check one more time if the futex is there with the bucket locked.
++ * If it's there, just remove it. Else, another thread has already woken it;
++ * record its index.
++ *
++ * Return:
++ *  * -1  if no futex was woken during the removal
++ *  * >= 0 at least one futex was found woken, index of the last one
++ */
++static int futex_dequeue_multiple(struct futexv *futexv, unsigned int nr)
++{
++	int i, ret = -1;
++
++	for (i = 0; i < nr; i++) {
++		spin_lock(&futexv->objects[i].bucket->lock);
++		if (!list_empty_careful(&futexv->objects[i].list)) {
++			list_del_init_careful(&futexv->objects[i].list);
++			bucket_dec_waiters(futexv->objects[i].bucket);
++		} else {
++			ret = i;
++		}
++		spin_unlock(&futexv->objects[i].bucket->lock);
++	}
++
++	return ret;
++}
+ 
+ /**
+- * futex_get_user_value - Get the value from the userspace address and compares
+- *                        with the expected one. In success, leaves the function
+- *                        holding the bucket lock. Else, hold no lock.
+- * @bucket: hash bucket of this address
+- * @uaddr:  futex's userspace address
+- * @val:    expected value
+- * @multiple: is this call in the wait on multiple path
++ * futex_enqueue - Check the value and enqueue a futex on a wait list
++ *
++ * @futexv:     List of futexes
++ * @nr_futexes: Number of futexes in the list
++ * @awaken:	If a futex was awoken during enqueueing, store the index here
++ *
++ * Get the value from the userspace address and compare it with the expected
++ * one. On success, enqueue the futex in the correct bucket.
++ *
++ * Get the value from user futex address.
++ *
++ * Since we are in a hurry, we use a spin lock and we can't sleep.
++ * Try to get the value with page fault disabled (when enabled, we might
++ * sleep).
++ *
++ * If we fail, we aren't sure if the address is invalid or is just a
++ * page fault. Then, release the lock (so we can sleep) and try to get
++ * the value with page fault enabled.
In order to trigger a page fault ++ * handling, we just call __get_user() again. If we sleep with enqueued ++ * futexes, we might miss a wake, so dequeue everything before sleeping. ++ * ++ * If get_user succeeds, this mean that the address is valid and we do ++ * the work again. Since we just handled the page fault, the page is ++ * likely pinned in memory and we should be luckier this time and be ++ * able to get the value. If we fail anyway, we will try again. ++ * ++ * If even with page faults enabled we get and error, this means that ++ * the address is not valid and we return from the syscall. ++ * ++ * If we got an unexpected value or need to treat a page fault and realized that ++ * a futex was awaken, we can priority this and return success. + * + * Return: 0 on success, error code otherwise + */ +-static int futex_get_user_value(struct futex_bucket *bucket, u32 __user *uaddr, +- unsigned int val, bool multiple) ++static int futex_enqueue(struct futexv *futexv, unsigned int nr_futexes, ++ unsigned int *awaken) + { +- u32 uval; +- int ret; ++ int i, ret; ++ u32 uval, *uaddr, val; ++ struct futex_bucket *bucket; + +- /* +- * Get the value from user futex address. +- * +- * Since we are in a hurry, we use a spin lock and we can't sleep. +- * Try to get the value with page fault disabled (when enable, we might +- * sleep). +- * +- * If we fail, we aren't sure if the address is invalid or is just a +- * page fault. Then, release the lock (so we can sleep) and try to get +- * the value with page fault enabled. In order to trigger a page fault +- * handling, we just call __get_user() again. +- * +- * If get_user succeeds, this mean that the address is valid and we do +- * the loop again. Since we just handled the page fault, the page is +- * likely pinned in memory and we should be luckier this time and be +- * able to get the value. If we fail anyway, we will try again. +- * +- * If even with page faults enabled we get and error, this means that +- * the address is not valid and we return from the syscall. 
+- */ +- do { +- spin_lock(&bucket->lock); ++retry: ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ for (i = 0; i < nr_futexes; i++) { ++ uaddr = (u32 * __user) futexv->objects[i].key.address; ++ val = (u32) futexv->objects[i].val; ++ bucket = futexv->objects[i].bucket; ++ ++ bucket_inc_waiters(bucket); ++ spin_lock(&bucket->lock); + +- ret = futex_get_user(&uval, uaddr); ++ ret = futex_get_user(&uval, uaddr); + +- if (ret) { ++ if (unlikely(ret)) { + spin_unlock(&bucket->lock); +- if (multiple || __get_user(uval, uaddr)) ++ ++ bucket_dec_waiters(bucket); ++ __set_current_state(TASK_RUNNING); ++ *awaken = futex_dequeue_multiple(futexv, i); ++ ++ if (__get_user(uval, uaddr)) + return -EFAULT; + ++ if (*awaken >= 0) ++ return 0; ++ ++ goto retry; ++ } ++ ++ if (uval != val) { ++ spin_unlock(&bucket->lock); ++ ++ bucket_dec_waiters(bucket); ++ __set_current_state(TASK_RUNNING); ++ *awaken = futex_dequeue_multiple(futexv, i); ++ ++ if (*awaken >= 0) ++ return 0; ++ ++ return -EWOULDBLOCK; + } +- } while (ret); + +- if (uval != val) { ++ list_add_tail(&futexv->objects[i].list, &bucket->list); + spin_unlock(&bucket->lock); +- return -EWOULDBLOCK; + } + + return 0; + } + ++ ++static int __futex_wait(struct futexv *futexv, ++ unsigned int nr_futexes, ++ struct hrtimer_sleeper *timeout) ++{ ++ int ret; ++ unsigned int awaken = -1; ++ ++ while (1) { ++ ret = futex_enqueue(futexv, nr_futexes, &awaken); ++ ++ if (ret < 0) ++ break; ++ ++ if (awaken <= 0) { ++ return awaken; ++ } ++ ++ ++ /* Before sleeping, check if someone was woken */ ++ if (!futexv->hint && (!timeout || timeout->task)) ++ freezable_schedule(); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ /* ++ * One of those things triggered this wake: ++ * ++ * * We have been removed from the bucket. futex_wake() woke ++ * us. We just need to dequeue return 0 to userspace. ++ * ++ * However, if no futex was dequeued by a futex_wake(): ++ * ++ * * If the there's a timeout and it has expired, ++ * return -ETIMEDOUT. ++ * ++ * * If there is a signal pending, something wants to kill our ++ * thread, return -ERESTARTSYS. ++ * ++ * * If there's no signal pending, it was a spurious wake ++ * (scheduler gave us a change to do some work, even if we ++ * don't want to). We need to remove ourselves from the ++ * bucket and add again, to prevent losing wakeups in the ++ * meantime. ++ */ ++ ++ ret = futex_dequeue_multiple(futexv, nr_futexes); ++ ++ /* Normal wake */ ++ if (ret >= 0) ++ break; ++ ++ if (timeout && !timeout->task) ++ return -ETIMEDOUT; ++ ++ /* signal */ ++ if (signal_pending(current)) ++ return -ERESTARTSYS; ++ ++ /* spurious wake, do everything again */ ++ } ++ ++ return ret; ++} ++ + /** +- * futex_dequeue - Remove a futex from a queue +- * @bucket: current bucket holding the futex +- * @waiter: futex to be removed ++ * futex_wait - Setup the timer and wait on a list of futexes ++ * @futexv: List of waiters ++ * @nr_futexes: Number of waiters ++ * @timo: Timeout ++ * @timeout: Timeout ++ * @flags: Timeout flags + * +- * Return: True if futex was removed by this function, false if another wake +- * thread removed this futex. +- * +- * This function should be used after we found that this futex was in a queue. +- * Thus, it needs to be removed before the next step. However, someone could +- * wake it between the time of the first check and the time to get the lock for +- * the bucket. Check one more time if the futex is there with the bucket locked. +- * If it's there, just remove it and return true. 
Else, mark the removal as +- * false and do nothing. ++ * Return: error code, or a hint of one of the waiters + */ +-static bool futex_dequeue(struct futex_bucket *bucket, struct futex_waiter *waiter) ++static int futex_wait(struct futexv *futexv, unsigned int nr_futexes, ++ struct __kernel_timespec __user *timo, ++ struct hrtimer_sleeper *timeout, unsigned int flags) + { +- bool removed = true; ++ int ret; + +- spin_lock(&bucket->lock); +- if (list_empty(&waiter->list)) +- removed = false; +- else +- list_del(&waiter->list); +- spin_unlock(&bucket->lock); ++ if (timo) { ++ ret = futex_setup_time(timo, timeout, flags); ++ if (ret) ++ return ret; + +- if (removed) +- bucket_dec_waiters(bucket); ++ hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS); ++ } + +- return removed; ++ ret = __futex_wait(futexv, nr_futexes, timo ? timeout : NULL); ++ ++ ++ if (timo) ++ hrtimer_cancel(&timeout->timer); ++ ++ return ret; + } + + /** +@@ -297,15 +440,20 @@ SYSCALL_DEFINE4(futex_wait, void __user *, uaddr, unsigned int, val, + { + unsigned int size = flags & FUTEX_SIZE_MASK; + struct hrtimer_sleeper timeout; +- struct futex_bucket *bucket; + struct futex_single_waiter wait_single; + struct futex_waiter *waiter; ++ struct futexv *futexv; + int ret; + +- wait_single.parent.task = current; +- wait_single.parent.hint = 0; ++ futexv = &wait_single.futexv; ++ futexv->task = current; ++ futexv->hint = false; ++ + waiter = &wait_single.waiter; + waiter->index = 0; ++ waiter->val = val; ++ ++ INIT_LIST_HEAD(&waiter->list); + + if (flags & ~FUTEX2_MASK) + return -EINVAL; +@@ -313,85 +461,101 @@ SYSCALL_DEFINE4(futex_wait, void __user *, uaddr, unsigned int, val, + if (size != FUTEX_32) + return -EINVAL; + +- if (timo) { +- ret = futex_setup_time(timo, &timeout, flags); +- if (ret) +- return ret; +- } +- + /* Get an unlocked hash bucket */ +- bucket = futex_get_bucket(uaddr, &waiter->key); +- if (IS_ERR(bucket)) +- return PTR_ERR(bucket); ++ waiter->bucket = futex_get_bucket(uaddr, &waiter->key); ++ if (IS_ERR(waiter->bucket)) ++ return PTR_ERR(waiter->bucket); + +- if (timo) +- hrtimer_sleeper_start_expires(&timeout, HRTIMER_MODE_ABS); ++ ret = futex_wait(futexv, 1, timo, &timeout, flags); + +-retry: +- bucket_inc_waiters(bucket); ++ return ret; ++} + +- /* Compare the expected and current value, get the bucket lock */ +- ret = futex_get_user_value(bucket, uaddr, val, false); +- if (ret) { +- bucket_dec_waiters(bucket); +- goto out; +- } ++/** ++ * futex_parse_waitv - Parse a waitv array from userspace ++ * @futexv: list of waiters ++ * @uwaitv: userspace list ++ * @nr_futexes: number of waiters in the list ++ * ++ * Return: Error code on failure, pointer to a prepared futexv otherwise ++ */ ++static int futex_parse_waitv(struct futexv *futexv, ++ struct futex_waitv __user *uwaitv, ++ unsigned int nr_futexes) ++{ ++ struct futex_waitv waitv; ++ unsigned int i; ++ struct futex_bucket *bucket; + +- /* Add the waiter to the hash table and sleep */ +- set_current_state(TASK_INTERRUPTIBLE); +- list_add_tail(&waiter->list, &bucket->list); +- spin_unlock(&bucket->lock); ++ for (i = 0; i < nr_futexes; i++) { ++ if (copy_from_user(&waitv, &uwaitv[i], sizeof(waitv))) ++ return -EFAULT; + +- /* Do not sleep if someone woke this futex or if it was timeouted */ +- if (!list_empty_careful(&waiter->list) && (!timo || timeout.task)) +- freezable_schedule(); ++ if ((waitv.flags & ~FUTEXV_WAITER_MASK) || ++ (waitv.flags & FUTEX_SIZE_MASK) != FUTEX_32) ++ return -EINVAL; + +- __set_current_state(TASK_RUNNING); ++ bucket = 
futex_get_bucket(waitv.uaddr,
++					    &futexv->objects[i].key);
++		if (IS_ERR(bucket))
++			return PTR_ERR(bucket);
+ 
+-	/*
+-	 * One of those things triggered this wake:
+-	 *
+-	 * * We have been removed from the bucket. futex_wake() woke us. We just
+-	 *   need to return 0 to userspace.
+-	 *
+-	 * However, if we find ourselves in the bucket we must remove ourselves
+-	 * from the bucket and ...
+-	 *
+-	 * * If the there's a timeout and it has expired, return -ETIMEDOUT.
+-	 *
+-	 * * If there is a signal pending, something wants to kill our thread.
+-	 *   Return -ERESTARTSYS.
+-	 *
+-	 * * If there's no signal pending, it was a spurious wake (scheduler
+-	 *   gave us a change to do some work, even if we don't want to). We
+-	 *   need to remove ourselves from the bucket and add again, to prevent
+-	 *   losing wakeups in the meantime.
+-	 */
++		futexv->objects[i].bucket = bucket;
++		futexv->objects[i].val = waitv.val;
++		futexv->objects[i].flags = waitv.flags;
++		futexv->objects[i].index = i;
++		INIT_LIST_HEAD(&futexv->objects[i].list);
++	}
+ 
+-	/* Normal wake */
+-	if (list_empty_careful(&waiter->list))
+-		goto out;
++	return 0;
++}
+ 
+-	if (!futex_dequeue(bucket, waiter))
+-		goto out;
++/**
++ * sys_futex_waitv - function
++ * @waiters: TODO
++ * @nr_futexes: TODO
++ * @flags: TODO
++ * @timo: TODO
++ */
++SYSCALL_DEFINE4(futex_waitv, struct futex_waitv __user *, waiters,
++		unsigned int, nr_futexes, unsigned int, flags,
++		struct __kernel_timespec __user *, timo)
++{
++	struct hrtimer_sleeper timeout;
++	struct futexv *futexv;
++	int ret;
+ 
+-	/* Timeout */
+-	if (timo && !timeout.task)
+-		return -ETIMEDOUT;
++	if (flags & ~FUTEXV_MASK)
++		return -EINVAL;
+ 
+-	/* Spurious wakeup */
+-	if (!signal_pending(current))
+-		goto retry;
++	if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters)
++		return -EINVAL;
+ 
+-	/* Some signal is pending */
+-	ret = -ERESTARTSYS;
+-out:
+-	if (timo)
+-		hrtimer_cancel(&timeout.timer);
++	futexv = kmalloc(sizeof(struct futexv) +
++			 (sizeof(struct futex_waiter) * nr_futexes),
++			 GFP_KERNEL);
++	if (!futexv)
++		return -ENOMEM;
++
++	futexv->hint = false;
++	futexv->task = current;
++
++	ret = futex_parse_waitv(futexv, waiters, nr_futexes);
++	if (!ret)
++		ret = futex_wait(futexv, nr_futexes, timo, &timeout, flags);
++
++	kfree(futexv);
+ 
+ 	return ret;
+ }
+ 
++/**
++ * futex_get_parent - Get parent
++ * @waiter: TODO
++ * @index: TODO
++ *
++ * Return: TODO
++ */
+ static struct futexv *futex_get_parent(uintptr_t waiter, u8 index)
+ {
+ 	uintptr_t parent = waiter - sizeof(struct futexv)
+@@ -439,7 +603,7 @@ SYSCALL_DEFINE3(futex_wake, void __user *, uaddr, unsigned int, nr_wake,
+ 			struct futexv *parent =
+ 				futex_get_parent((uintptr_t) aux, aux->index);
+ 
+-			parent->hint = 1;
++			parent->hint = true;
+ 			task = parent->task;
+ 			get_task_struct(task);
+ 			list_del_init_careful(&aux->list);
+diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
+index 10049bc56c24..3e1a713d3e57 100644
+--- a/kernel/sys_ni.c
++++ b/kernel/sys_ni.c
+@@ -151,6 +151,7 @@ COND_SYSCALL_COMPAT(get_robust_list);
+ /* kernel/futex2.c */
+ COND_SYSCALL(futex_wait);
+ COND_SYSCALL(futex_wake);
++COND_SYSCALL(futex_waitv);
+ 
+ /* kernel/hrtimer.c */
+ 
+--
+2.28.0
+
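Between these two patches, a minimal userspace sketch (not part of either patch) of the vectorized wait added above: one futex_waitv call sleeps on several words at once and returns the index of the woken entry. Syscall number 442 and the struct layout are taken from this patch and mirrored locally; here the second word is pre-set so the call returns -EWOULDBLOCK immediately instead of sleeping.

/* Sketch of the futex_waitv syscall added above. */
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#define __NR_futex_waitv 442	/* from the syscall tables above */
#define FUTEX_32	 2	/* 32-bit sized futex, from patch 01 */

struct futex_waitv {		/* mirrors the uapi struct from patch 01 */
	void *uaddr;
	unsigned int val;
	unsigned int flags;
};

static uint32_t a, b = 1;	/* b != expected value below */

int main(void)
{
	struct futex_waitv waiters[2] = {
		{ &a, 0, FUTEX_32 },	/* sleep while a == 0 */
		{ &b, 0, FUTEX_32 },	/* mismatch: b is already 1 */
	};

	/* On a wake the call returns the index of the woken entry; here
	 * the mismatch on b makes it fail fast with EWOULDBLOCK/EAGAIN. */
	long hint = syscall(__NR_futex_waitv, waiters, 2, 0, NULL);
	if (hint < 0)
		perror("futex_waitv");
	else
		printf("woken by waiter %ld\n", hint);
	return 0;
}

With b left at 0, the call would block until another thread issues futex_wake on either word, and the returned index tells the caller which entry fired.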
+futex2 interface.
+Create helper files so more tests can evaluate futex2. While 32-bit ABIs
+from glibc aren't able to use 64-bit sized time variables, add a
+temporary workaround that implements the required types and calls the
+appropriate syscalls, since futex2 doesn't support 32-bit sized time.
+
+Signed-off-by: André Almeida
+---
+ tools/include/uapi/asm-generic/unistd.h       |   7 +-
+ .../selftests/futex/functional/.gitignore     |   1 +
+ .../selftests/futex/functional/Makefile       |   4 +-
+ .../selftests/futex/functional/futex2_wait.c  | 111 ++++++++++++++++++
+ .../testing/selftests/futex/functional/run.sh |   3 +
+ .../selftests/futex/include/futex2test.h      |  77 ++++++++++++
+ 6 files changed, 201 insertions(+), 2 deletions(-)
+ create mode 100644 tools/testing/selftests/futex/functional/futex2_wait.c
+ create mode 100644 tools/testing/selftests/futex/include/futex2test.h
+
+diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h
+index 995b36c2ea7d..dd457de21bad 100644
+--- a/tools/include/uapi/asm-generic/unistd.h
++++ b/tools/include/uapi/asm-generic/unistd.h
+@@ -860,8 +860,13 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
+ #define __NR_faccessat2 439
+ __SYSCALL(__NR_faccessat2, sys_faccessat2)
+
++#define __NR_futex_wait 440
++__SYSCALL(__NR_futex_wait, sys_futex_wait)
++#define __NR_futex_wake 441
++__SYSCALL(__NR_futex_wake, sys_futex_wake)
++
+ #undef __NR_syscalls
+-#define __NR_syscalls 440
++#define __NR_syscalls 442
+
+ /*
+  * 32 bit systems traditionally used different
+diff --git a/tools/testing/selftests/futex/functional/.gitignore b/tools/testing/selftests/futex/functional/.gitignore
+index 0efcd494daab..d61f1df94360 100644
+--- a/tools/testing/selftests/futex/functional/.gitignore
++++ b/tools/testing/selftests/futex/functional/.gitignore
+@@ -6,3 +6,4 @@ futex_wait_private_mapped_file
+ futex_wait_timeout
+ futex_wait_uninitialized_heap
+ futex_wait_wouldblock
++futex2_wait
+diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile
+index 23207829ec75..7142a94a7ac3 100644
+--- a/tools/testing/selftests/futex/functional/Makefile
++++ b/tools/testing/selftests/futex/functional/Makefile
+@@ -5,6 +5,7 @@ LDLIBS := -lpthread -lrt
+
+ HEADERS := \
+ 	../include/futextest.h \
++	../include/futex2test.h \
+ 	../include/atomic.h \
+ 	../include/logging.h
+ TEST_GEN_FILES := \
+@@ -14,7 +15,8 @@ TEST_GEN_FILES := \
+ 	futex_requeue_pi_signal_restart \
+ 	futex_requeue_pi_mismatched_ops \
+ 	futex_wait_uninitialized_heap \
+-	futex_wait_private_mapped_file
++	futex_wait_private_mapped_file \
++	futex2_wait
+
+ TEST_PROGS := run.sh
+
+diff --git a/tools/testing/selftests/futex/functional/futex2_wait.c b/tools/testing/selftests/futex/functional/futex2_wait.c
+new file mode 100644
+index 000000000000..752ed26803b3
+--- /dev/null
++++ b/tools/testing/selftests/futex/functional/futex2_wait.c
+@@ -0,0 +1,111 @@
++// SPDX-License-Identifier: GPL-2.0-or-later
++/******************************************************************************
++ *
++ * Copyright Collabora Ltd., 2020
++ *
++ * DESCRIPTION
++ *	Test wait/wake mechanism of futex2, using 32-bit sized futexes.
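++ *	A waiter thread blocks on the futex with an absolute
++ *	CLOCK_MONOTONIC timeout while the main thread wakes it, checking
++ *	that exactly one waiter is woken.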
++ * ++ * AUTHOR ++ * André Almeida ++ * ++ * HISTORY ++ * 2020-Jul-9: Initial version by André ++ * ++ *****************************************************************************/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "futex2test.h" ++#include "logging.h" ++ ++#define TEST_NAME "futex-wait-wouldblock" ++#define timeout_ns 30000000 ++#define WAKE_WAIT_US 10000 ++futex_t f1 = FUTEX_INITIALIZER; ++ ++void usage(char *prog) ++{ ++ printf("Usage: %s\n", prog); ++ printf(" -c Use color\n"); ++ printf(" -h Display this help message\n"); ++ printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", ++ VQUIET, VCRITICAL, VINFO); ++} ++ ++void *waiterfn(void *arg) ++{ ++ struct timespec64 to64; ++ ++ /* setting absolute timeout for futex2 */ ++ if (gettime64(CLOCK_MONOTONIC, &to64)) ++ error("gettime64 failed\n", errno); ++ ++ to64.tv_nsec += timeout_ns; ++ ++ if (to64.tv_nsec >= 1000000000) { ++ to64.tv_sec++; ++ to64.tv_nsec -= 1000000000; ++ } ++ ++ if (futex2_wait(&f1, f1, FUTEX_PRIVATE_FLAG | FUTEX_32, &to64)) ++ printf("waiter failed errno %d\n", errno); ++ ++ return NULL; ++} ++ ++int main(int argc, char *argv[]) ++{ ++ pthread_t waiter; ++ int res, ret = RET_PASS; ++ int c; ++ ++ while ((c = getopt(argc, argv, "cht:v:")) != -1) { ++ switch (c) { ++ case 'c': ++ log_color(1); ++ break; ++ case 'h': ++ usage(basename(argv[0])); ++ exit(0); ++ case 'v': ++ log_verbosity(atoi(optarg)); ++ break; ++ default: ++ usage(basename(argv[0])); ++ exit(1); ++ } ++ } ++ ++ ksft_print_header(); ++ ksft_set_plan(1); ++ ksft_print_msg("%s: Test FUTEX_WAIT\n", ++ basename(argv[0])); ++ ++ info("Calling futex_wait on f1: %u @ %p with val=%u\n", f1, &f1, f1); ++ ++ if (pthread_create(&waiter, NULL, waiterfn, NULL)) ++ error("pthread_create failed\n", errno); ++ ++ usleep(WAKE_WAIT_US); ++ ++ info("Calling futex2_wake on f1: %u @ %p with val=%u\n", f1, &f1, f1); ++ res = futex2_wake(&f1, 1, FUTEX_PRIVATE_FLAG | FUTEX_32); ++ if (res != 1) { ++ ksft_test_result_fail("futex2_wake returned: %d %s\n", ++ res ? errno : res, ++ res ? 
strerror(errno) : ""); ++ ret = RET_FAIL; ++ } else { ++ ksft_test_result_pass("futex2_wake wouldblock succeeds\n"); ++ } ++ ++ ksft_print_cnts(); ++ return ret; ++} +diff --git a/tools/testing/selftests/futex/functional/run.sh b/tools/testing/selftests/futex/functional/run.sh +index 1acb6ace1680..3730159c865a 100755 +--- a/tools/testing/selftests/futex/functional/run.sh ++++ b/tools/testing/selftests/futex/functional/run.sh +@@ -73,3 +73,6 @@ echo + echo + ./futex_wait_uninitialized_heap $COLOR + ./futex_wait_private_mapped_file $COLOR ++ ++echo ++./futex2_wait $COLOR +diff --git a/tools/testing/selftests/futex/include/futex2test.h b/tools/testing/selftests/futex/include/futex2test.h +new file mode 100644 +index 000000000000..807b8b57fe61 +--- /dev/null ++++ b/tools/testing/selftests/futex/include/futex2test.h +@@ -0,0 +1,77 @@ ++/* SPDX-License-Identifier: GPL-2.0-or-later */ ++/****************************************************************************** ++ * ++ * Copyright Collabora Ltd., 2020 ++ * ++ * DESCRIPTION ++ * Futex2 library addons for old futex library ++ * ++ * AUTHOR ++ * André Almeida ++ * ++ * HISTORY ++ * 2020-Jul-9: Initial version by André ++ * ++ *****************************************************************************/ ++#include "futextest.h" ++#include ++ ++#define NSEC_PER_SEC 1000000000L ++ ++#ifndef FUTEX_8 ++# define FUTEX_8 0 ++#endif ++#ifndef FUTEX_16 ++# define FUTEX_16 1 ++#endif ++#ifndef FUTEX_32 ++#define FUTEX_32 2 ++#endif ++#ifdef __x86_64__ ++# ifndef FUTEX_64 ++# define FUTEX_64 3 ++# endif ++#endif ++ ++/* ++ * - Y2038 section for 32-bit applications - ++ * ++ * Remove this when glibc is ready for y2038. Then, always compile with ++ * `-DTIME_BITS=64` or `-D__USE_TIME_BITS64`. glibc will provide both ++ * timespec64 and clock_gettime64 so we won't need to define here. ++ */ ++#if defined(__i386__) || __TIMESIZE == 32 ++# define NR_gettime __NR_clock_gettime64 ++#else ++# define NR_gettime __NR_clock_gettime ++#endif ++ ++struct timespec64 { ++ long long tv_sec; /* seconds */ ++ long long tv_nsec; /* nanoseconds */ ++}; ++ ++int gettime64(clock_t clockid, struct timespec64 *tv) ++{ ++ return syscall(NR_gettime, clockid, tv); ++} ++/* ++ * - End of Y2038 section - ++ */ ++ ++/* ++ * wait for uaddr if (*uaddr == val) ++ */ ++static inline int futex2_wait(volatile void *uaddr, unsigned long val, ++ unsigned long flags, struct timespec64 *timo) ++{ ++ return syscall(__NR_futex_wait, uaddr, val, flags, timo); ++} ++ ++/* ++ * wake nr futexes waiting for uaddr ++ */ ++static inline int futex2_wake(volatile void *uaddr, unsigned int nr, unsigned long flags) ++{ ++ return syscall(__NR_futex_wake, uaddr, nr, flags); ++} +-- +2.28.0 + +From d4a7ca72f276b2e337eaedcbbe58a2782e0e7d3b Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Andr=C3=A9=20Almeida?= +Date: Thu, 9 Jul 2020 11:36:14 -0300 +Subject: [PATCH 04/13] selftests: futex: Add futex2 timeout test +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Adapt existing futex wait timeout file to test the same mechanism for +futex2. 
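+
+As futex2 takes absolute timeouts, the test first turns the relative
+timeout_ns into an absolute CLOCK_MONOTONIC deadline. A minimal sketch
+of that conversion (using the timespec64/gettime64 helpers and
+NSEC_PER_SEC from futex2test.h):
+
+	struct timespec64 to64;
+
+	/* absolute deadline = now + timeout_ns */
+	if (gettime64(CLOCK_MONOTONIC, &to64))
+		error("gettime64 failed\n", errno);
+
+	to64.tv_nsec += timeout_ns;
+	if (to64.tv_nsec >= NSEC_PER_SEC) {	/* carry into seconds */
+		to64.tv_sec++;
+		to64.tv_nsec -= NSEC_PER_SEC;
+	}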
+ +Signed-off-by: André Almeida +--- + .../futex/functional/futex_wait_timeout.c | 38 ++++++++++++++----- + 1 file changed, 29 insertions(+), 9 deletions(-) + +diff --git a/tools/testing/selftests/futex/functional/futex_wait_timeout.c b/tools/testing/selftests/futex/functional/futex_wait_timeout.c +index ee55e6d389a3..d2e7ae18985b 100644 +--- a/tools/testing/selftests/futex/functional/futex_wait_timeout.c ++++ b/tools/testing/selftests/futex/functional/futex_wait_timeout.c +@@ -11,6 +11,7 @@ + * + * HISTORY + * 2009-Nov-6: Initial version by Darren Hart ++ * 2020-Jul-9: Add futex2 test by André + * + *****************************************************************************/ + +@@ -20,7 +21,7 @@ + #include + #include + #include +-#include "futextest.h" ++#include "futex2test.h" + #include "logging.h" + + #define TEST_NAME "futex-wait-timeout" +@@ -40,7 +41,8 @@ void usage(char *prog) + int main(int argc, char *argv[]) + { + futex_t f1 = FUTEX_INITIALIZER; +- struct timespec to; ++ struct timespec to = {.tv_sec = 0, .tv_nsec = timeout_ns}; ++ struct timespec64 to64; + int res, ret = RET_PASS; + int c; + +@@ -65,22 +67,40 @@ int main(int argc, char *argv[]) + } + + ksft_print_header(); +- ksft_set_plan(1); ++ ksft_set_plan(2); + ksft_print_msg("%s: Block on a futex and wait for timeout\n", + basename(argv[0])); + ksft_print_msg("\tArguments: timeout=%ldns\n", timeout_ns); + +- /* initialize timeout */ +- to.tv_sec = 0; +- to.tv_nsec = timeout_ns; +- + info("Calling futex_wait on f1: %u @ %p\n", f1, &f1); + res = futex_wait(&f1, f1, &to, FUTEX_PRIVATE_FLAG); + if (!res || errno != ETIMEDOUT) { +- fail("futex_wait returned %d\n", ret < 0 ? errno : ret); ++ ksft_test_result_fail("futex_wait returned %d\n", ret < 0 ? errno : ret); ++ ret = RET_FAIL; ++ } else { ++ ksft_test_result_pass("futex_wait timeout succeeds\n"); ++ } ++ ++ /* setting absolute timeout for futex2 */ ++ if (gettime64(CLOCK_MONOTONIC, &to64)) ++ error("gettime64 failed\n", errno); ++ ++ to64.tv_nsec += timeout_ns; ++ ++ if (to64.tv_nsec >= 1000000000) { ++ to64.tv_sec++; ++ to64.tv_nsec -= 1000000000; ++ } ++ ++ info("Calling futex2_wait on f1: %u @ %p\n", f1, &f1); ++ res = futex2_wait(&f1, f1, FUTEX_PRIVATE_FLAG | FUTEX_32, &to64); ++ if (!res || errno != ETIMEDOUT) { ++ ksft_test_result_fail("futex2_wait returned %d\n", ret < 0 ? errno : ret); + ret = RET_FAIL; ++ } else { ++ ksft_test_result_pass("futex2_wait timeout succeeds\n"); + } + +- print_result(TEST_NAME, ret); ++ ksft_print_cnts(); + return ret; + } +-- +2.28.0 + +From 6d2252d43d36a5eb2b9170351128007e27f47737 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Andr=C3=A9=20Almeida?= +Date: Thu, 9 Jul 2020 11:37:42 -0300 +Subject: [PATCH 05/13] selftests: futex: Add futex2 wouldblock test +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Adapt existing futex wait wouldblock file to test the same mechanism for +futex2. 
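+
+The futex2 variant mirrors the original check: waiting with an expected
+value that differs from the current futex word must fail immediately
+instead of blocking. A minimal sketch of the assertion, with to64 being
+an absolute timeout as in the timeout test:
+
+	/* *f1 == f1, so expecting f1+1 must not block */
+	res = futex2_wait(&f1, f1 + 1, FUTEX_PRIVATE_FLAG | FUTEX_32, &to64);
+	if (!res || errno != EWOULDBLOCK)
+		ksft_test_result_fail("futex2_wait returned: %d\n", res);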
+ +Signed-off-by: André Almeida +--- + .../futex/functional/futex_wait_wouldblock.c | 33 ++++++++++++++++--- + 1 file changed, 29 insertions(+), 4 deletions(-) + +diff --git a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c +index 0ae390ff8164..8187f0754cd2 100644 +--- a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c ++++ b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c +@@ -12,6 +12,7 @@ + * + * HISTORY + * 2009-Nov-14: Initial version by Gowrishankar ++ * 2020-Jul-9: Add futex2 test by André + * + *****************************************************************************/ + +@@ -21,7 +22,7 @@ + #include + #include + #include +-#include "futextest.h" ++#include "futex2test.h" + #include "logging.h" + + #define TEST_NAME "futex-wait-wouldblock" +@@ -39,6 +40,7 @@ void usage(char *prog) + int main(int argc, char *argv[]) + { + struct timespec to = {.tv_sec = 0, .tv_nsec = timeout_ns}; ++ struct timespec64 to64; + futex_t f1 = FUTEX_INITIALIZER; + int res, ret = RET_PASS; + int c; +@@ -61,18 +63,41 @@ int main(int argc, char *argv[]) + } + + ksft_print_header(); +- ksft_set_plan(1); ++ ksft_set_plan(2); + ksft_print_msg("%s: Test the unexpected futex value in FUTEX_WAIT\n", + basename(argv[0])); + + info("Calling futex_wait on f1: %u @ %p with val=%u\n", f1, &f1, f1+1); + res = futex_wait(&f1, f1+1, &to, FUTEX_PRIVATE_FLAG); + if (!res || errno != EWOULDBLOCK) { +- fail("futex_wait returned: %d %s\n", ++ ksft_test_result_fail("futex_wait returned: %d %s\n", + res ? errno : res, res ? strerror(errno) : ""); + ret = RET_FAIL; ++ } else { ++ ksft_test_result_pass("futex_wait wouldblock succeeds\n"); + } + +- print_result(TEST_NAME, ret); ++ /* setting absolute timeout for futex2 */ ++ if (gettime64(CLOCK_MONOTONIC, &to64)) ++ error("gettime64 failed\n", errno); ++ ++ to64.tv_nsec += timeout_ns; ++ ++ if (to64.tv_nsec >= 1000000000) { ++ to64.tv_sec++; ++ to64.tv_nsec -= 1000000000; ++ } ++ ++ info("Calling futex2_wait on f1: %u @ %p with val=%u\n", f1, &f1, f1+1); ++ res = futex2_wait(&f1, f1+1, FUTEX_PRIVATE_FLAG | FUTEX_32, &to64); ++ if (!res || errno != EWOULDBLOCK) { ++ ksft_test_result_fail("futex2_wait returned: %d %s\n", ++ res ? errno : res, res ? strerror(errno) : ""); ++ ret = RET_FAIL; ++ } else { ++ ksft_test_result_pass("futex2_wait wouldblock succeeds\n"); ++ } ++ ++ ksft_print_cnts(); + return ret; + } +-- +2.28.0 + +From 6b35a09be663f5a844e089f1ddd370137832e7a7 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Andr=C3=A9=20Almeida?= +Date: Wed, 14 Oct 2020 16:10:09 -0300 +Subject: [PATCH 06/13] DONOTMERGE: futex: Add a clone of futex implementation +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +For comparative performance tests between the original futex and the new +futex2 interface, create a clone of the current futex. In that way, we +can have a fair comparison, since the futex2 table will be empty with no +contention for the bucket locks. Since futex is widely used in the host +system, the performance tests could get misleading results by the tests +competing with the system for resources. 
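+
+For the benchmarks, the clone can be reached like the other futex2 calls,
+via syscall(2). A hypothetical wrapper, assuming the three-argument
+sys_futex1() signature and the syscall number 443 added below:
+
+	/* hypothetical helper; __NR_futex1 is 443 on x86_64 per this patch */
+	static inline long futex1(volatile void *uaddr, unsigned long nr_wake,
+				  unsigned long flags)
+	{
+		return syscall(__NR_futex1, uaddr, nr_wake, flags);
+	}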
+ +Signed-off-by: André Almeida +--- + arch/x86/entry/syscalls/syscall_32.tbl | 1 + + arch/x86/entry/syscalls/syscall_64.tbl | 1 + + include/linux/syscalls.h | 3 + + include/uapi/asm-generic/unistd.h | 5 +- + kernel/Makefile | 1 + + kernel/futex1.c | 3384 +++++++++++++++++ + kernel/sys_ni.c | 2 + + tools/arch/x86/include/asm/unistd_64.h | 12 + + tools/include/uapi/asm-generic/unistd.h | 6 +- + .../arch/x86/entry/syscalls/syscall_64.tbl | 3 + + tools/perf/bench/futex.h | 23 +- + 11 files changed, 3438 insertions(+), 3 deletions(-) + create mode 100644 kernel/futex1.c + +diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl +index c844c0cbf0e5..820fa53ccf75 100644 +--- a/arch/x86/entry/syscalls/syscall_32.tbl ++++ b/arch/x86/entry/syscalls/syscall_32.tbl +@@ -447,3 +447,4 @@ + 440 i386 futex_wait sys_futex_wait + 441 i386 futex_wake sys_futex_wake + 442 i386 futex_waitv sys_futex_waitv ++443 i386 futex1 sys_futex1 +diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl +index 0901c26c6786..99795136cb98 100644 +--- a/arch/x86/entry/syscalls/syscall_64.tbl ++++ b/arch/x86/entry/syscalls/syscall_64.tbl +@@ -364,6 +364,7 @@ + 440 common futex_wait sys_futex_wait + 441 common futex_wake sys_futex_wake + 442 common futex_waitv sys_futex_waitv ++443 common futex1 sys_futex1 + + # + # x32-specific system call numbers start at 512 to avoid cache impact +diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h +index 38c3a87dbfc2..0351f6ad09a9 100644 +--- a/include/linux/syscalls.h ++++ b/include/linux/syscalls.h +@@ -596,6 +596,9 @@ asmlinkage long sys_futex_wait(void __user *uaddr, unsigned long val, + asmlinkage long sys_futex_wake(void __user *uaddr, unsigned long nr_wake, + unsigned long flags); + ++asmlinkage long sys_futex1(void __user *uaddr, unsigned long nr_wake, ++ unsigned long flags); ++ + /* kernel/hrtimer.c */ + asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp, + struct __kernel_timespec __user *rmtp); +diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h +index d7ebbed0a18c..e3ba6cb1f76d 100644 +--- a/include/uapi/asm-generic/unistd.h ++++ b/include/uapi/asm-generic/unistd.h +@@ -869,8 +869,11 @@ __SYSCALL(__NR_futex_wake, sys_futex_wake) + #define __NR_futex_waitv 442 + __SYSCALL(__NR_futex_waitv, sys_futex_waitv) + ++#define __NR_futex1 443 ++__SYSCALL(__NR_futex1, sys_futex1) ++ + #undef __NR_syscalls +-#define __NR_syscalls 443 ++#define __NR_syscalls 444 + + /* + * 32 bit systems traditionally used different +diff --git a/kernel/Makefile b/kernel/Makefile +index 51ea9bc647bf..0fe55a8cb9e2 100644 +--- a/kernel/Makefile ++++ b/kernel/Makefile +@@ -57,6 +57,7 @@ obj-$(CONFIG_PROFILING) += profile.o + obj-$(CONFIG_STACKTRACE) += stacktrace.o + obj-y += time/ + obj-$(CONFIG_FUTEX) += futex.o ++obj-$(CONFIG_FUTEX2) += futex1.o + obj-$(CONFIG_FUTEX2) += futex2.o + obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o + obj-$(CONFIG_SMP) += smp.o +diff --git a/kernel/futex1.c b/kernel/futex1.c +new file mode 100644 +index 000000000000..4f7bf312fefd +--- /dev/null ++++ b/kernel/futex1.c +@@ -0,0 +1,3384 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/* ++ * Fast Userspace Mutexes (which I call "Futexes!"). 
++ * (C) Rusty Russell, IBM 2002 ++ * ++ * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar ++ * (C) Copyright 2003 Red Hat Inc, All Rights Reserved ++ * ++ * Removed page pinning, fix privately mapped COW pages and other cleanups ++ * (C) Copyright 2003, 2004 Jamie Lokier ++ * ++ * Robust futex support started by Ingo Molnar ++ * (C) Copyright 2006 Red Hat Inc, All Rights Reserved ++ * Thanks to Thomas Gleixner for suggestions, analysis and fixes. ++ * ++ * PI-futex support started by Ingo Molnar and Thomas Gleixner ++ * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar ++ * Copyright (C) 2006 Timesys Corp., Thomas Gleixner ++ * ++ * PRIVATE futexes by Eric Dumazet ++ * Copyright (C) 2007 Eric Dumazet ++ * ++ * Requeue-PI support by Darren Hart ++ * Copyright (C) IBM Corporation, 2009 ++ * Thanks to Thomas Gleixner for conceptual design and careful reviews. ++ * ++ * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly ++ * enough at me, Linus for the original (flawed) idea, Matthew ++ * Kirkwood for proof-of-concept implementation. ++ * ++ * "The futexes are also cursed." ++ * "But they come in a choice of three flavours!" ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include "locking/rtmutex_common.h" ++ ++/* ++ * READ this before attempting to hack on futexes! ++ * ++ * Basic futex operation and ordering guarantees ++ * ============================================= ++ * ++ * The waiter reads the futex value in user space and calls ++ * futex_wait(). This function computes the hash bucket and acquires ++ * the hash bucket lock. After that it reads the futex user space value ++ * again and verifies that the data has not changed. If it has not changed ++ * it enqueues itself into the hash bucket, releases the hash bucket lock ++ * and schedules. ++ * ++ * The waker side modifies the user space value of the futex and calls ++ * futex_wake(). This function computes the hash bucket and acquires the ++ * hash bucket lock. Then it looks for waiters on that futex in the hash ++ * bucket and wakes them. ++ * ++ * In futex wake up scenarios where no tasks are blocked on a futex, taking ++ * the hb spinlock can be avoided and simply return. In order for this ++ * optimization to work, ordering guarantees must exist so that the waiter ++ * being added to the list is acknowledged when the list is concurrently being ++ * checked by the waker, avoiding scenarios like the following: ++ * ++ * CPU 0 CPU 1 ++ * val = *futex; ++ * sys_futex(WAIT, futex, val); ++ * futex_wait(futex, val); ++ * uval = *futex; ++ * *futex = newval; ++ * sys_futex(WAKE, futex); ++ * futex_wake(futex); ++ * if (queue_empty()) ++ * return; ++ * if (uval == val) ++ * lock(hash_bucket(futex)); ++ * queue(); ++ * unlock(hash_bucket(futex)); ++ * schedule(); ++ * ++ * This would cause the waiter on CPU 0 to wait forever because it ++ * missed the transition of the user space value from val to newval ++ * and the waker did not find the waiter in the hash bucket queue. ++ * ++ * The correct serialization ensures that a waiter either observes ++ * the changed user space value before blocking or is woken by a ++ * concurrent waker: ++ * ++ * CPU 0 CPU 1 ++ * val = *futex; ++ * sys_futex(WAIT, futex, val); ++ * futex_wait(futex, val); ++ * ++ * waiters++; (a) ++ * smp_mb(); (A) <-- paired with -. 
++ * | ++ * lock(hash_bucket(futex)); | ++ * | ++ * uval = *futex; | ++ * | *futex = newval; ++ * | sys_futex(WAKE, futex); ++ * | futex_wake(futex); ++ * | ++ * `--------> smp_mb(); (B) ++ * if (uval == val) ++ * queue(); ++ * unlock(hash_bucket(futex)); ++ * schedule(); if (waiters) ++ * lock(hash_bucket(futex)); ++ * else wake_waiters(futex); ++ * waiters--; (b) unlock(hash_bucket(futex)); ++ * ++ * Where (A) orders the waiters increment and the futex value read through ++ * atomic operations (see hb_waiters_inc) and where (B) orders the write ++ * to futex and the waiters read (see hb_waiters_pending()). ++ * ++ * This yields the following case (where X:=waiters, Y:=futex): ++ * ++ * X = Y = 0 ++ * ++ * w[X]=1 w[Y]=1 ++ * MB MB ++ * r[Y]=y r[X]=x ++ * ++ * Which guarantees that x==0 && y==0 is impossible; which translates back into ++ * the guarantee that we cannot both miss the futex variable change and the ++ * enqueue. ++ * ++ * Note that a new waiter is accounted for in (a) even when it is possible that ++ * the wait call can return error, in which case we backtrack from it in (b). ++ * Refer to the comment in queue_lock(). ++ * ++ * Similarly, in order to account for waiters being requeued on another ++ * address we always increment the waiters for the destination bucket before ++ * acquiring the lock. It then decrements them again after releasing it - ++ * the code that actually moves the futex(es) between hash buckets (requeue_futex) ++ * will do the additional required waiter count housekeeping. This is done for ++ * double_lock_hb() and double_unlock_hb(), respectively. ++ */ ++ ++#ifdef CONFIG_HAVE_FUTEX_CMPXCHG ++#define futex_cmpxchg_enabled 1 ++#else ++static int __read_mostly futex_cmpxchg_enabled; ++#endif ++ ++/* ++ * Futex flags used to encode options to functions and preserve them across ++ * restarts. ++ */ ++#ifdef CONFIG_MMU ++# define FLAGS_SHARED 0x01 ++#else ++/* ++ * NOMMU does not have per process address space. Let the compiler optimize ++ * code away. ++ */ ++# define FLAGS_SHARED 0x00 ++#endif ++#define FLAGS_CLOCKRT 0x02 ++#define FLAGS_HAS_TIMEOUT 0x04 ++ ++/* ++ * Priority Inheritance state: ++ */ ++struct futex_pi_state { ++ /* ++ * list of 'owned' pi_state instances - these have to be ++ * cleaned up in do_exit() if the task exits prematurely: ++ */ ++ struct list_head list; ++ ++ /* ++ * The PI object: ++ */ ++ struct rt_mutex pi_mutex; ++ ++ struct task_struct *owner; ++ refcount_t refcount; ++ ++ union futex_key key; ++} __randomize_layout; ++ ++/** ++ * struct futex_q - The hashed futex queue entry, one per waiting task ++ * @list: priority-sorted list of tasks waiting on this futex ++ * @task: the task waiting on the futex ++ * @lock_ptr: the hash bucket lock ++ * @key: the key the futex is hashed on ++ * @pi_state: optional priority inheritance state ++ * @rt_waiter: rt_waiter storage for use with requeue_pi ++ * @requeue_pi_key: the requeue_pi target futex key ++ * @bitset: bitset for the optional bitmasked wakeup ++ * ++ * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so ++ * we can wake only the relevant ones (hashed queues may be shared). ++ * ++ * A futex_q has a woken state, just like tasks have TASK_RUNNING. ++ * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. ++ * The order of wakeup is always to make the first condition true, then ++ * the second. ++ * ++ * PI futexes are typically woken before they are removed from the hash list via ++ * the rt_mutex code. See unqueue_me_pi(). 
++ */ ++struct futex_q { ++ struct plist_node list; ++ ++ struct task_struct *task; ++ spinlock_t *lock_ptr; ++ union futex_key key; ++ struct futex_pi_state *pi_state; ++ struct rt_mutex_waiter *rt_waiter; ++ union futex_key *requeue_pi_key; ++ u32 bitset; ++} __randomize_layout; ++ ++static const struct futex_q futex_q_init = { ++ /* list gets initialized in queue_me()*/ ++ .key = FUTEX_KEY_INIT, ++ .bitset = FUTEX_BITSET_MATCH_ANY ++}; ++ ++/* ++ * Hash buckets are shared by all the futex_keys that hash to the same ++ * location. Each key may have multiple futex_q structures, one for each task ++ * waiting on a futex. ++ */ ++struct futex_hash_bucket { ++ atomic_t waiters; ++ spinlock_t lock; ++ struct plist_head chain; ++} ____cacheline_aligned_in_smp; ++ ++/* ++ * The base of the bucket array and its size are always used together ++ * (after initialization only in hash_futex()), so ensure that they ++ * reside in the same cacheline. ++ */ ++static struct { ++ struct futex_hash_bucket *queues; ++ unsigned long hashsize; ++} __futex_data __read_mostly __aligned(2*sizeof(long)); ++#define futex_queues (__futex_data.queues) ++#define futex_hashsize (__futex_data.hashsize) ++ ++ ++/* ++ * Fault injections for futexes. ++ */ ++#ifdef CONFIG_FAIL_FUTEX ++ ++static struct { ++ struct fault_attr attr; ++ ++ bool ignore_private; ++} fail_futex = { ++ .attr = FAULT_ATTR_INITIALIZER, ++ .ignore_private = false, ++}; ++ ++static int __init setup_fail_futex(char *str) ++{ ++ return setup_fault_attr(&fail_futex.attr, str); ++} ++__setup("fail_futex=", setup_fail_futex); ++ ++static bool should_fail_futex(bool fshared) ++{ ++ if (fail_futex.ignore_private && !fshared) ++ return false; ++ ++ return should_fail(&fail_futex.attr, 1); ++} ++ ++#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS ++ ++static int __init fail_futex_debugfs(void) ++{ ++ umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; ++ struct dentry *dir; ++ ++ dir = fault_create_debugfs_attr("fail_futex", NULL, ++ &fail_futex.attr); ++ if (IS_ERR(dir)) ++ return PTR_ERR(dir); ++ ++ debugfs_create_bool("ignore-private", mode, dir, ++ &fail_futex.ignore_private); ++ return 0; ++} ++ ++late_initcall(fail_futex_debugfs); ++ ++#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ ++ ++#else ++static inline bool should_fail_futex(bool fshared) ++{ ++ return false; ++} ++#endif /* CONFIG_FAIL_FUTEX */ ++ ++/* ++ * Reflects a new waiter being added to the waitqueue. ++ */ ++static inline void hb_waiters_inc(struct futex_hash_bucket *hb) ++{ ++#ifdef CONFIG_SMP ++ atomic_inc(&hb->waiters); ++ /* ++ * Full barrier (A), see the ordering comment above. ++ */ ++ smp_mb__after_atomic(); ++#endif ++} ++ ++/* ++ * Reflects a waiter being removed from the waitqueue by wakeup ++ * paths. ++ */ ++static inline void hb_waiters_dec(struct futex_hash_bucket *hb) ++{ ++#ifdef CONFIG_SMP ++ atomic_dec(&hb->waiters); ++#endif ++} ++ ++static inline int hb_waiters_pending(struct futex_hash_bucket *hb) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * Full barrier (B), see the ordering comment above. ++ */ ++ smp_mb(); ++ return atomic_read(&hb->waiters); ++#else ++ return 1; ++#endif ++} ++ ++/** ++ * hash_futex - Return the hash bucket in the global hash ++ * @key: Pointer to the futex key for which the hash is calculated ++ * ++ * We hash on the keys returned from get_futex_key (see below) and return the ++ * corresponding hash bucket in the global hash. 
++ */ ++static struct futex_hash_bucket *hash_futex(union futex_key *key) ++{ ++ u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4, ++ key->both.offset); ++ ++ return &futex_queues[hash & (futex_hashsize - 1)]; ++} ++ ++ ++/** ++ * match_futex - Check whether two futex keys are equal ++ * @key1: Pointer to key1 ++ * @key2: Pointer to key2 ++ * ++ * Return 1 if two futex_keys are equal, 0 otherwise. ++ */ ++static inline int match_futex(union futex_key *key1, union futex_key *key2) ++{ ++ return (key1 && key2 ++ && key1->both.word == key2->both.word ++ && key1->both.ptr == key2->both.ptr ++ && key1->both.offset == key2->both.offset); ++} ++ ++enum futex_access { ++ FUTEX_READ, ++ FUTEX_WRITE ++}; ++ ++/** ++ * futex_setup_timer - set up the sleeping hrtimer. ++ * @time: ptr to the given timeout value ++ * @timeout: the hrtimer_sleeper structure to be set up ++ * @flags: futex flags ++ * @range_ns: optional range in ns ++ * ++ * Return: Initialized hrtimer_sleeper structure or NULL if no timeout ++ * value given ++ */ ++static inline struct hrtimer_sleeper * ++futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, ++ int flags, u64 range_ns) ++{ ++ if (!time) ++ return NULL; ++ ++ hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ? ++ CLOCK_REALTIME : CLOCK_MONOTONIC, ++ HRTIMER_MODE_ABS); ++ /* ++ * If range_ns is 0, calling hrtimer_set_expires_range_ns() is ++ * effectively the same as calling hrtimer_set_expires(). ++ */ ++ hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns); ++ ++ return timeout; ++} ++ ++/* ++ * Generate a machine wide unique identifier for this inode. ++ * ++ * This relies on u64 not wrapping in the life-time of the machine; which with ++ * 1ns resolution means almost 585 years. ++ * ++ * This further relies on the fact that a well formed program will not unmap ++ * the file while it has a (shared) futex waiting on it. This mapping will have ++ * a file reference which pins the mount and inode. ++ * ++ * If for some reason an inode gets evicted and read back in again, it will get ++ * a new sequence number and will _NOT_ match, even though it is the exact same ++ * file. ++ * ++ * It is important that match_futex() will never have a false-positive, esp. ++ * for PI futexes that can mess up the state. The above argues that false-negatives ++ * are only possible for malformed programs. ++ */ ++static u64 get_inode_sequence_number(struct inode *inode) ++{ ++ static atomic64_t i_seq; ++ u64 old; ++ ++ /* Does the inode already have a sequence number? */ ++ old = atomic64_read(&inode->i_sequence); ++ if (likely(old)) ++ return old; ++ ++ for (;;) { ++ u64 new = atomic64_add_return(1, &i_seq); ++ if (WARN_ON_ONCE(!new)) ++ continue; ++ ++ old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new); ++ if (old) ++ return old; ++ return new; ++ } ++} ++ ++/** ++ * get_futex_key() - Get parameters which are the keys for a futex ++ * @uaddr: virtual address of the futex ++ * @fshared: false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED ++ * @key: address where result is stored. ++ * @rw: mapping needs to be read/write (values: FUTEX_READ, ++ * FUTEX_WRITE) ++ * ++ * Return: a negative error code or 0 ++ * ++ * The key words are stored in @key on success. 
++ * ++ * For shared mappings (when @fshared), the key is: ++ * ++ * ( inode->i_sequence, page->index, offset_within_page ) ++ * ++ * [ also see get_inode_sequence_number() ] ++ * ++ * For private mappings (or when !@fshared), the key is: ++ * ++ * ( current->mm, address, 0 ) ++ * ++ * This allows (cross process, where applicable) identification of the futex ++ * without keeping the page pinned for the duration of the FUTEX_WAIT. ++ * ++ * lock_page() might sleep, the caller should not hold a spinlock. ++ */ ++static int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, ++ enum futex_access rw) ++{ ++ unsigned long address = (unsigned long)uaddr; ++ struct mm_struct *mm = current->mm; ++ struct page *page, *tail; ++ struct address_space *mapping; ++ int err, ro = 0; ++ ++ /* ++ * The futex address must be "naturally" aligned. ++ */ ++ key->both.offset = address % PAGE_SIZE; ++ if (unlikely((address % sizeof(u32)) != 0)) ++ return -EINVAL; ++ address -= key->both.offset; ++ ++ if (unlikely(!access_ok(uaddr, sizeof(u32)))) ++ return -EFAULT; ++ ++ if (unlikely(should_fail_futex(fshared))) ++ return -EFAULT; ++ ++ /* ++ * PROCESS_PRIVATE futexes are fast. ++ * As the mm cannot disappear under us and the 'key' only needs ++ * virtual address, we dont even have to find the underlying vma. ++ * Note : We do have to check 'uaddr' is a valid user address, ++ * but access_ok() should be faster than find_vma() ++ */ ++ if (!fshared) { ++ key->private.mm = mm; ++ key->private.address = address; ++ return 0; ++ } ++ ++again: ++ /* Ignore any VERIFY_READ mapping (futex common case) */ ++ if (unlikely(should_fail_futex(true))) ++ return -EFAULT; ++ ++ err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); ++ /* ++ * If write access is not required (eg. FUTEX_WAIT), try ++ * and get read-only access. ++ */ ++ if (err == -EFAULT && rw == FUTEX_READ) { ++ err = get_user_pages_fast(address, 1, 0, &page); ++ ro = 1; ++ } ++ if (err < 0) ++ return err; ++ else ++ err = 0; ++ ++ /* ++ * The treatment of mapping from this point on is critical. The page ++ * lock protects many things but in this context the page lock ++ * stabilizes mapping, prevents inode freeing in the shared ++ * file-backed region case and guards against movement to swap cache. ++ * ++ * Strictly speaking the page lock is not needed in all cases being ++ * considered here and page lock forces unnecessarily serialization ++ * From this point on, mapping will be re-verified if necessary and ++ * page lock will be acquired only if it is unavoidable ++ * ++ * Mapping checks require the head page for any compound page so the ++ * head page and mapping is looked up now. For anonymous pages, it ++ * does not matter if the page splits in the future as the key is ++ * based on the address. For filesystem-backed pages, the tail is ++ * required as the index of the page determines the key. For ++ * base pages, there is no tail page and tail == page. ++ */ ++ tail = page; ++ page = compound_head(page); ++ mapping = READ_ONCE(page->mapping); ++ ++ /* ++ * If page->mapping is NULL, then it cannot be a PageAnon ++ * page; but it might be the ZERO_PAGE or in the gate area or ++ * in a special mapping (all cases which we are happy to fail); ++ * or it may have been a good file page when get_user_pages_fast ++ * found it, but truncated or holepunched or subjected to ++ * invalidate_complete_page2 before we got the page lock (also ++ * cases which we are happy to fail). 
And we hold a reference, ++ * so refcount care in invalidate_complete_page's remove_mapping ++ * prevents drop_caches from setting mapping to NULL beneath us. ++ * ++ * The case we do have to guard against is when memory pressure made ++ * shmem_writepage move it from filecache to swapcache beneath us: ++ * an unlikely race, but we do need to retry for page->mapping. ++ */ ++ if (unlikely(!mapping)) { ++ int shmem_swizzled; ++ ++ /* ++ * Page lock is required to identify which special case above ++ * applies. If this is really a shmem page then the page lock ++ * will prevent unexpected transitions. ++ */ ++ lock_page(page); ++ shmem_swizzled = PageSwapCache(page) || page->mapping; ++ unlock_page(page); ++ put_page(page); ++ ++ if (shmem_swizzled) ++ goto again; ++ ++ return -EFAULT; ++ } ++ ++ /* ++ * Private mappings are handled in a simple way. ++ * ++ * If the futex key is stored on an anonymous page, then the associated ++ * object is the mm which is implicitly pinned by the calling process. ++ * ++ * NOTE: When userspace waits on a MAP_SHARED mapping, even if ++ * it's a read-only handle, it's expected that futexes attach to ++ * the object not the particular process. ++ */ ++ if (PageAnon(page)) { ++ /* ++ * A RO anonymous page will never change and thus doesn't make ++ * sense for futex operations. ++ */ ++ if (unlikely(should_fail_futex(true)) || ro) { ++ err = -EFAULT; ++ goto out; ++ } ++ ++ key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ ++ key->private.mm = mm; ++ key->private.address = address; ++ ++ } else { ++ struct inode *inode; ++ ++ /* ++ * The associated futex object in this case is the inode and ++ * the page->mapping must be traversed. Ordinarily this should ++ * be stabilised under page lock but it's not strictly ++ * necessary in this case as we just want to pin the inode, not ++ * update the radix tree or anything like that. ++ * ++ * The RCU read lock is taken as the inode is finally freed ++ * under RCU. If the mapping still matches expectations then the ++ * mapping->host can be safely accessed as being a valid inode. ++ */ ++ rcu_read_lock(); ++ ++ if (READ_ONCE(page->mapping) != mapping) { ++ rcu_read_unlock(); ++ put_page(page); ++ ++ goto again; ++ } ++ ++ inode = READ_ONCE(mapping->host); ++ if (!inode) { ++ rcu_read_unlock(); ++ put_page(page); ++ ++ goto again; ++ } ++ ++ key->both.offset |= FUT_OFF_INODE; /* inode-based key */ ++ key->shared.i_seq = get_inode_sequence_number(inode); ++ key->shared.pgoff = basepage_index(tail); ++ rcu_read_unlock(); ++ } ++ ++out: ++ put_page(page); ++ return err; ++} ++ ++/** ++ * fault_in_user_writeable() - Fault in user address and verify RW access ++ * @uaddr: pointer to faulting user space address ++ * ++ * Slow path to fixup the fault we just took in the atomic write ++ * access to @uaddr. ++ * ++ * We have no generic implementation of a non-destructive write to the ++ * user address. We know that we faulted in the atomic pagefault ++ * disabled section so we can as well avoid the #PF overhead by ++ * calling get_user_pages() right away. ++ */ ++static int fault_in_user_writeable(u32 __user *uaddr) ++{ ++ struct mm_struct *mm = current->mm; ++ int ret; ++ ++ mmap_read_lock(mm); ++ ret = fixup_user_fault(mm, (unsigned long)uaddr, ++ FAULT_FLAG_WRITE, NULL); ++ mmap_read_unlock(mm); ++ ++ return ret < 0 ? 
ret : 0; ++} ++ ++/** ++ * futex_top_waiter() - Return the highest priority waiter on a futex ++ * @hb: the hash bucket the futex_q's reside in ++ * @key: the futex key (to distinguish it from other futex futex_q's) ++ * ++ * Must be called with the hb lock held. ++ */ ++static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, ++ union futex_key *key) ++{ ++ struct futex_q *this; ++ ++ plist_for_each_entry(this, &hb->chain, list) { ++ if (match_futex(&this->key, key)) ++ return this; ++ } ++ return NULL; ++} ++ ++static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr, ++ u32 uval, u32 newval) ++{ ++ int ret; ++ ++ pagefault_disable(); ++ ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); ++ pagefault_enable(); ++ ++ return ret; ++} ++ ++static int get_futex_value_locked(u32 *dest, u32 __user *from) ++{ ++ int ret; ++ ++ pagefault_disable(); ++ ret = __get_user(*dest, from); ++ pagefault_enable(); ++ ++ return ret ? -EFAULT : 0; ++} ++ ++ ++/* ++ * PI code: ++ */ ++static int refill_pi_state_cache(void) ++{ ++ struct futex_pi_state *pi_state; ++ ++ if (likely(current->pi_state_cache)) ++ return 0; ++ ++ pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL); ++ ++ if (!pi_state) ++ return -ENOMEM; ++ ++ INIT_LIST_HEAD(&pi_state->list); ++ /* pi_mutex gets initialized later */ ++ pi_state->owner = NULL; ++ refcount_set(&pi_state->refcount, 1); ++ pi_state->key = FUTEX_KEY_INIT; ++ ++ current->pi_state_cache = pi_state; ++ ++ return 0; ++} ++ ++static struct futex_pi_state *alloc_pi_state(void) ++{ ++ struct futex_pi_state *pi_state = current->pi_state_cache; ++ ++ WARN_ON(!pi_state); ++ current->pi_state_cache = NULL; ++ ++ return pi_state; ++} ++ ++static void get_pi_state(struct futex_pi_state *pi_state) ++{ ++ WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount)); ++} ++ ++/* ++ * Drops a reference to the pi_state object and frees or caches it ++ * when the last reference is gone. ++ */ ++static void put_pi_state(struct futex_pi_state *pi_state) ++{ ++ if (!pi_state) ++ return; ++ ++ if (!refcount_dec_and_test(&pi_state->refcount)) ++ return; ++ ++ /* ++ * If pi_state->owner is NULL, the owner is most probably dying ++ * and has cleaned up the pi_state already ++ */ ++ if (pi_state->owner) { ++ struct task_struct *owner; ++ ++ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); ++ owner = pi_state->owner; ++ if (owner) { ++ raw_spin_lock(&owner->pi_lock); ++ list_del_init(&pi_state->list); ++ raw_spin_unlock(&owner->pi_lock); ++ } ++ rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner); ++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); ++ } ++ ++ if (current->pi_state_cache) { ++ kfree(pi_state); ++ } else { ++ /* ++ * pi_state->list is already empty. ++ * clear pi_state->owner. ++ * refcount is at 0 - put it back to 1. ++ */ ++ pi_state->owner = NULL; ++ refcount_set(&pi_state->refcount, 1); ++ current->pi_state_cache = pi_state; ++ } ++} ++ ++/* ++ * We need to check the following states: ++ * ++ * Waiter | pi_state | pi->owner | uTID | uODIED | ? 
++ * ++ * [1] NULL | --- | --- | 0 | 0/1 | Valid ++ * [2] NULL | --- | --- | >0 | 0/1 | Valid ++ * ++ * [3] Found | NULL | -- | Any | 0/1 | Invalid ++ * ++ * [4] Found | Found | NULL | 0 | 1 | Valid ++ * [5] Found | Found | NULL | >0 | 1 | Invalid ++ * ++ * [6] Found | Found | task | 0 | 1 | Valid ++ * ++ * [7] Found | Found | NULL | Any | 0 | Invalid ++ * ++ * [8] Found | Found | task | ==taskTID | 0/1 | Valid ++ * [9] Found | Found | task | 0 | 0 | Invalid ++ * [10] Found | Found | task | !=taskTID | 0/1 | Invalid ++ * ++ * [1] Indicates that the kernel can acquire the futex atomically. We ++ * came came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. ++ * ++ * [2] Valid, if TID does not belong to a kernel thread. If no matching ++ * thread is found then it indicates that the owner TID has died. ++ * ++ * [3] Invalid. The waiter is queued on a non PI futex ++ * ++ * [4] Valid state after exit_robust_list(), which sets the user space ++ * value to FUTEX_WAITERS | FUTEX_OWNER_DIED. ++ * ++ * [5] The user space value got manipulated between exit_robust_list() ++ * and exit_pi_state_list() ++ * ++ * [6] Valid state after exit_pi_state_list() which sets the new owner in ++ * the pi_state but cannot access the user space value. ++ * ++ * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set. ++ * ++ * [8] Owner and user space value match ++ * ++ * [9] There is no transient state which sets the user space TID to 0 ++ * except exit_robust_list(), but this is indicated by the ++ * FUTEX_OWNER_DIED bit. See [4] ++ * ++ * [10] There is no transient state which leaves owner and user space ++ * TID out of sync. ++ * ++ * ++ * Serialization and lifetime rules: ++ * ++ * hb->lock: ++ * ++ * hb -> futex_q, relation ++ * futex_q -> pi_state, relation ++ * ++ * (cannot be raw because hb can contain arbitrary amount ++ * of futex_q's) ++ * ++ * pi_mutex->wait_lock: ++ * ++ * {uval, pi_state} ++ * ++ * (and pi_mutex 'obviously') ++ * ++ * p->pi_lock: ++ * ++ * p->pi_state_list -> pi_state->list, relation ++ * ++ * pi_state->refcount: ++ * ++ * pi_state lifetime ++ * ++ * ++ * Lock order: ++ * ++ * hb->lock ++ * pi_mutex->wait_lock ++ * p->pi_lock ++ * ++ */ ++ ++/* ++ * Validate that the existing waiter has a pi_state and sanity check ++ * the pi_state against the user space value. If correct, attach to ++ * it. ++ */ ++static int attach_to_pi_state(u32 __user *uaddr, u32 uval, ++ struct futex_pi_state *pi_state, ++ struct futex_pi_state **ps) ++{ ++ pid_t pid = uval & FUTEX_TID_MASK; ++ u32 uval2; ++ int ret; ++ ++ /* ++ * Userspace might have messed up non-PI and PI futexes [3] ++ */ ++ if (unlikely(!pi_state)) ++ return -EINVAL; ++ ++ /* ++ * We get here with hb->lock held, and having found a ++ * futex_top_waiter(). This means that futex_lock_pi() of said futex_q ++ * has dropped the hb->lock in between queue_me() and unqueue_me_pi(), ++ * which in turn means that futex_lock_pi() still has a reference on ++ * our pi_state. ++ * ++ * The waiter holding a reference on @pi_state also protects against ++ * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi() ++ * and futex_wait_requeue_pi() as it cannot go to 0 and consequently ++ * free pi_state before we can take a reference ourselves. ++ */ ++ WARN_ON(!refcount_read(&pi_state->refcount)); ++ ++ /* ++ * Now that we have a pi_state, we can acquire wait_lock ++ * and do the state validation. 
++ */ ++ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); ++ ++ /* ++ * Since {uval, pi_state} is serialized by wait_lock, and our current ++ * uval was read without holding it, it can have changed. Verify it ++ * still is what we expect it to be, otherwise retry the entire ++ * operation. ++ */ ++ if (get_futex_value_locked(&uval2, uaddr)) ++ goto out_efault; ++ ++ if (uval != uval2) ++ goto out_eagain; ++ ++ /* ++ * Handle the owner died case: ++ */ ++ if (uval & FUTEX_OWNER_DIED) { ++ /* ++ * exit_pi_state_list sets owner to NULL and wakes the ++ * topmost waiter. The task which acquires the ++ * pi_state->rt_mutex will fixup owner. ++ */ ++ if (!pi_state->owner) { ++ /* ++ * No pi state owner, but the user space TID ++ * is not 0. Inconsistent state. [5] ++ */ ++ if (pid) ++ goto out_einval; ++ /* ++ * Take a ref on the state and return success. [4] ++ */ ++ goto out_attach; ++ } ++ ++ /* ++ * If TID is 0, then either the dying owner has not ++ * yet executed exit_pi_state_list() or some waiter ++ * acquired the rtmutex in the pi state, but did not ++ * yet fixup the TID in user space. ++ * ++ * Take a ref on the state and return success. [6] ++ */ ++ if (!pid) ++ goto out_attach; ++ } else { ++ /* ++ * If the owner died bit is not set, then the pi_state ++ * must have an owner. [7] ++ */ ++ if (!pi_state->owner) ++ goto out_einval; ++ } ++ ++ /* ++ * Bail out if user space manipulated the futex value. If pi ++ * state exists then the owner TID must be the same as the ++ * user space TID. [9/10] ++ */ ++ if (pid != task_pid_vnr(pi_state->owner)) ++ goto out_einval; ++ ++out_attach: ++ get_pi_state(pi_state); ++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); ++ *ps = pi_state; ++ return 0; ++ ++out_einval: ++ ret = -EINVAL; ++ goto out_error; ++ ++out_eagain: ++ ret = -EAGAIN; ++ goto out_error; ++ ++out_efault: ++ ret = -EFAULT; ++ goto out_error; ++ ++out_error: ++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); ++ return ret; ++} ++ ++/** ++ * wait_for_owner_exiting - Block until the owner has exited ++ * @ret: owner's current futex lock status ++ * @exiting: Pointer to the exiting task ++ * ++ * Caller must hold a refcount on @exiting. ++ */ ++static void wait_for_owner_exiting(int ret, struct task_struct *exiting) ++{ ++ if (ret != -EBUSY) { ++ WARN_ON_ONCE(exiting); ++ return; ++ } ++ ++ if (WARN_ON_ONCE(ret == -EBUSY && !exiting)) ++ return; ++ ++ mutex_lock(&exiting->futex_exit_mutex); ++ /* ++ * No point in doing state checking here. If the waiter got here ++ * while the task was in exec()->exec_futex_release() then it can ++ * have any FUTEX_STATE_* value when the waiter has acquired the ++ * mutex. OK, if running, EXITING or DEAD if it reached exit() ++ * already. Highly unlikely and not a problem. Just one more round ++ * through the futex maze. ++ */ ++ mutex_unlock(&exiting->futex_exit_mutex); ++ ++ put_task_struct(exiting); ++} ++ ++static int handle_exit_race(u32 __user *uaddr, u32 uval, ++ struct task_struct *tsk) ++{ ++ u32 uval2; ++ ++ /* ++ * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the ++ * caller that the alleged owner is busy. 
++ */ ++ if (tsk && tsk->futex_state != FUTEX_STATE_DEAD) ++ return -EBUSY; ++ ++ /* ++ * Reread the user space value to handle the following situation: ++ * ++ * CPU0 CPU1 ++ * ++ * sys_exit() sys_futex() ++ * do_exit() futex_lock_pi() ++ * futex_lock_pi_atomic() ++ * exit_signals(tsk) No waiters: ++ * tsk->flags |= PF_EXITING; *uaddr == 0x00000PID ++ * mm_release(tsk) Set waiter bit ++ * exit_robust_list(tsk) { *uaddr = 0x80000PID; ++ * Set owner died attach_to_pi_owner() { ++ * *uaddr = 0xC0000000; tsk = get_task(PID); ++ * } if (!tsk->flags & PF_EXITING) { ++ * ... attach(); ++ * tsk->futex_state = } else { ++ * FUTEX_STATE_DEAD; if (tsk->futex_state != ++ * FUTEX_STATE_DEAD) ++ * return -EAGAIN; ++ * return -ESRCH; <--- FAIL ++ * } ++ * ++ * Returning ESRCH unconditionally is wrong here because the ++ * user space value has been changed by the exiting task. ++ * ++ * The same logic applies to the case where the exiting task is ++ * already gone. ++ */ ++ if (get_futex_value_locked(&uval2, uaddr)) ++ return -EFAULT; ++ ++ /* If the user space value has changed, try again. */ ++ if (uval2 != uval) ++ return -EAGAIN; ++ ++ /* ++ * The exiting task did not have a robust list, the robust list was ++ * corrupted or the user space value in *uaddr is simply bogus. ++ * Give up and tell user space. ++ */ ++ return -ESRCH; ++} ++ ++/* ++ * Lookup the task for the TID provided from user space and attach to ++ * it after doing proper sanity checks. ++ */ ++static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key, ++ struct futex_pi_state **ps, ++ struct task_struct **exiting) ++{ ++ pid_t pid = uval & FUTEX_TID_MASK; ++ struct futex_pi_state *pi_state; ++ struct task_struct *p; ++ ++ /* ++ * We are the first waiter - try to look up the real owner and attach ++ * the new pi_state to it, but bail out when TID = 0 [1] ++ * ++ * The !pid check is paranoid. None of the call sites should end up ++ * with pid == 0, but better safe than sorry. Let the caller retry ++ */ ++ if (!pid) ++ return -EAGAIN; ++ p = find_get_task_by_vpid(pid); ++ if (!p) ++ return handle_exit_race(uaddr, uval, NULL); ++ ++ if (unlikely(p->flags & PF_KTHREAD)) { ++ put_task_struct(p); ++ return -EPERM; ++ } ++ ++ /* ++ * We need to look at the task state to figure out, whether the ++ * task is exiting. To protect against the change of the task state ++ * in futex_exit_release(), we do this protected by p->pi_lock: ++ */ ++ raw_spin_lock_irq(&p->pi_lock); ++ if (unlikely(p->futex_state != FUTEX_STATE_OK)) { ++ /* ++ * The task is on the way out. When the futex state is ++ * FUTEX_STATE_DEAD, we know that the task has finished ++ * the cleanup: ++ */ ++ int ret = handle_exit_race(uaddr, uval, p); ++ ++ raw_spin_unlock_irq(&p->pi_lock); ++ /* ++ * If the owner task is between FUTEX_STATE_EXITING and ++ * FUTEX_STATE_DEAD then store the task pointer and keep ++ * the reference on the task struct. The calling code will ++ * drop all locks, wait for the task to reach ++ * FUTEX_STATE_DEAD and then drop the refcount. This is ++ * required to prevent a live lock when the current task ++ * preempted the exiting task between the two states. ++ */ ++ if (ret == -EBUSY) ++ *exiting = p; ++ else ++ put_task_struct(p); ++ return ret; ++ } ++ ++ /* ++ * No existing pi state. First waiter. [2] ++ * ++ * This creates pi_state, we have hb->lock held, this means nothing can ++ * observe this state, wait_lock is irrelevant. 
++ */ ++ pi_state = alloc_pi_state(); ++ ++ /* ++ * Initialize the pi_mutex in locked state and make @p ++ * the owner of it: ++ */ ++ rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); ++ ++ /* Store the key for possible exit cleanups: */ ++ pi_state->key = *key; ++ ++ WARN_ON(!list_empty(&pi_state->list)); ++ list_add(&pi_state->list, &p->pi_state_list); ++ /* ++ * Assignment without holding pi_state->pi_mutex.wait_lock is safe ++ * because there is no concurrency as the object is not published yet. ++ */ ++ pi_state->owner = p; ++ raw_spin_unlock_irq(&p->pi_lock); ++ ++ put_task_struct(p); ++ ++ *ps = pi_state; ++ ++ return 0; ++} ++ ++static int lookup_pi_state(u32 __user *uaddr, u32 uval, ++ struct futex_hash_bucket *hb, ++ union futex_key *key, struct futex_pi_state **ps, ++ struct task_struct **exiting) ++{ ++ struct futex_q *top_waiter = futex_top_waiter(hb, key); ++ ++ /* ++ * If there is a waiter on that futex, validate it and ++ * attach to the pi_state when the validation succeeds. ++ */ ++ if (top_waiter) ++ return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps); ++ ++ /* ++ * We are the first waiter - try to look up the owner based on ++ * @uval and attach to it. ++ */ ++ return attach_to_pi_owner(uaddr, uval, key, ps, exiting); ++} ++ ++static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) ++{ ++ int err; ++ u32 curval; ++ ++ if (unlikely(should_fail_futex(true))) ++ return -EFAULT; ++ ++ err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); ++ if (unlikely(err)) ++ return err; ++ ++ /* If user space value changed, let the caller retry */ ++ return curval != uval ? -EAGAIN : 0; ++} ++ ++/** ++ * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex ++ * @uaddr: the pi futex user address ++ * @hb: the pi futex hash bucket ++ * @key: the futex key associated with uaddr and hb ++ * @ps: the pi_state pointer where we store the result of the ++ * lookup ++ * @task: the task to perform the atomic lock work for. This will ++ * be "current" except in the case of requeue pi. ++ * @exiting: Pointer to store the task pointer of the owner task ++ * which is in the middle of exiting ++ * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) ++ * ++ * Return: ++ * - 0 - ready to wait; ++ * - 1 - acquired the lock; ++ * - <0 - error ++ * ++ * The hb->lock and futex_key refs shall be held by the caller. ++ * ++ * @exiting is only set when the return value is -EBUSY. If so, this holds ++ * a refcount on the exiting task on return and the caller needs to drop it ++ * after waiting for the exit to complete. ++ */ ++static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, ++ union futex_key *key, ++ struct futex_pi_state **ps, ++ struct task_struct *task, ++ struct task_struct **exiting, ++ int set_waiters) ++{ ++ u32 uval, newval, vpid = task_pid_vnr(task); ++ struct futex_q *top_waiter; ++ int ret; ++ ++ /* ++ * Read the user space value first so we can validate a few ++ * things before proceeding further. ++ */ ++ if (get_futex_value_locked(&uval, uaddr)) ++ return -EFAULT; ++ ++ if (unlikely(should_fail_futex(true))) ++ return -EFAULT; ++ ++ /* ++ * Detect deadlocks. ++ */ ++ if ((unlikely((uval & FUTEX_TID_MASK) == vpid))) ++ return -EDEADLK; ++ ++ if ((unlikely(should_fail_futex(true)))) ++ return -EDEADLK; ++ ++ /* ++ * Lookup existing state first. If it exists, try to attach to ++ * its pi_state. 
++ */ ++ top_waiter = futex_top_waiter(hb, key); ++ if (top_waiter) ++ return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps); ++ ++ /* ++ * No waiter and user TID is 0. We are here because the ++ * waiters or the owner died bit is set or called from ++ * requeue_cmp_pi or for whatever reason something took the ++ * syscall. ++ */ ++ if (!(uval & FUTEX_TID_MASK)) { ++ /* ++ * We take over the futex. No other waiters and the user space ++ * TID is 0. We preserve the owner died bit. ++ */ ++ newval = uval & FUTEX_OWNER_DIED; ++ newval |= vpid; ++ ++ /* The futex requeue_pi code can enforce the waiters bit */ ++ if (set_waiters) ++ newval |= FUTEX_WAITERS; ++ ++ ret = lock_pi_update_atomic(uaddr, uval, newval); ++ /* If the take over worked, return 1 */ ++ return ret < 0 ? ret : 1; ++ } ++ ++ /* ++ * First waiter. Set the waiters bit before attaching ourself to ++ * the owner. If owner tries to unlock, it will be forced into ++ * the kernel and blocked on hb->lock. ++ */ ++ newval = uval | FUTEX_WAITERS; ++ ret = lock_pi_update_atomic(uaddr, uval, newval); ++ if (ret) ++ return ret; ++ /* ++ * If the update of the user space value succeeded, we try to ++ * attach to the owner. If that fails, no harm done, we only ++ * set the FUTEX_WAITERS bit in the user space variable. ++ */ ++ return attach_to_pi_owner(uaddr, newval, key, ps, exiting); ++} ++ ++/** ++ * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket ++ * @q: The futex_q to unqueue ++ * ++ * The q->lock_ptr must not be NULL and must be held by the caller. ++ */ ++static void __unqueue_futex(struct futex_q *q) ++{ ++ struct futex_hash_bucket *hb; ++ ++ if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list))) ++ return; ++ lockdep_assert_held(q->lock_ptr); ++ ++ hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); ++ plist_del(&q->list, &hb->chain); ++ hb_waiters_dec(hb); ++} ++ ++/* ++ * The hash bucket lock must be held when this is called. ++ * Afterwards, the futex_q must not be accessed. Callers ++ * must ensure to later call wake_up_q() for the actual ++ * wakeups to occur. ++ */ ++static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) ++{ ++ struct task_struct *p = q->task; ++ ++ if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) ++ return; ++ ++ get_task_struct(p); ++ __unqueue_futex(q); ++ /* ++ * The waiting task can free the futex_q as soon as q->lock_ptr = NULL ++ * is written, without taking any locks. This is possible in the event ++ * of a spurious wakeup, for example. A memory barrier is required here ++ * to prevent the following store to lock_ptr from getting ahead of the ++ * plist_del in __unqueue_futex(). ++ */ ++ smp_store_release(&q->lock_ptr, NULL); ++ ++ /* ++ * Queue the task for later wakeup for after we've released ++ * the hb->lock. ++ */ ++ wake_q_add_safe(wake_q, p); ++} ++ ++/* ++ * Caller must hold a reference on @pi_state. ++ */ ++static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state) ++{ ++ u32 curval, newval; ++ struct task_struct *new_owner; ++ bool postunlock = false; ++ DEFINE_WAKE_Q(wake_q); ++ int ret = 0; ++ ++ new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); ++ if (WARN_ON_ONCE(!new_owner)) { ++ /* ++ * As per the comment in futex_unlock_pi() this should not happen. ++ * ++ * When this happens, give up our locks and try again, giving ++ * the futex_lock_pi() instance time to complete, either by ++ * waiting on the rtmutex or removing itself from the futex ++ * queue. 
++ */ ++ ret = -EAGAIN; ++ goto out_unlock; ++ } ++ ++ /* ++ * We pass it to the next owner. The WAITERS bit is always kept ++ * enabled while there is PI state around. We cleanup the owner ++ * died bit, because we are the owner. ++ */ ++ newval = FUTEX_WAITERS | task_pid_vnr(new_owner); ++ ++ if (unlikely(should_fail_futex(true))) ++ ret = -EFAULT; ++ ++ ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); ++ if (!ret && (curval != uval)) { ++ /* ++ * If a unconditional UNLOCK_PI operation (user space did not ++ * try the TID->0 transition) raced with a waiter setting the ++ * FUTEX_WAITERS flag between get_user() and locking the hash ++ * bucket lock, retry the operation. ++ */ ++ if ((FUTEX_TID_MASK & curval) == uval) ++ ret = -EAGAIN; ++ else ++ ret = -EINVAL; ++ } ++ ++ if (ret) ++ goto out_unlock; ++ ++ /* ++ * This is a point of no return; once we modify the uval there is no ++ * going back and subsequent operations must not fail. ++ */ ++ ++ raw_spin_lock(&pi_state->owner->pi_lock); ++ WARN_ON(list_empty(&pi_state->list)); ++ list_del_init(&pi_state->list); ++ raw_spin_unlock(&pi_state->owner->pi_lock); ++ ++ raw_spin_lock(&new_owner->pi_lock); ++ WARN_ON(!list_empty(&pi_state->list)); ++ list_add(&pi_state->list, &new_owner->pi_state_list); ++ pi_state->owner = new_owner; ++ raw_spin_unlock(&new_owner->pi_lock); ++ ++ postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); ++ ++out_unlock: ++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); ++ ++ if (postunlock) ++ rt_mutex_postunlock(&wake_q); ++ ++ return ret; ++} ++ ++/* ++ * Express the locking dependencies for lockdep: ++ */ ++static inline void ++double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) ++{ ++ if (hb1 <= hb2) { ++ spin_lock(&hb1->lock); ++ if (hb1 < hb2) ++ spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING); ++ } else { /* hb1 > hb2 */ ++ spin_lock(&hb2->lock); ++ spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING); ++ } ++} ++ ++static inline void ++double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) ++{ ++ spin_unlock(&hb1->lock); ++ if (hb1 != hb2) ++ spin_unlock(&hb2->lock); ++} ++ ++/* ++ * Wake up waiters matching bitset queued on this futex (uaddr). 
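++ *
++ * E.g. (illustrative values): a waiter queued via FUTEX_WAIT_BITSET
++ * with bitset 0x1 is woken by a wake with bitset 0x1 or with
++ * FUTEX_BITSET_MATCH_ANY, but not by 0x2, since the match below
++ * requires (waiter bitset & wake bitset) != 0.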
++ */ ++static int ++futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) ++{ ++ struct futex_hash_bucket *hb; ++ struct futex_q *this, *next; ++ union futex_key key = FUTEX_KEY_INIT; ++ int ret; ++ DEFINE_WAKE_Q(wake_q); ++ ++ if (!bitset) ++ return -EINVAL; ++ ++ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ); ++ if (unlikely(ret != 0)) ++ return ret; ++ ++ hb = hash_futex(&key); ++ ++ /* Make sure we really have tasks to wakeup */ ++ if (!hb_waiters_pending(hb)) ++ return ret; ++ ++ spin_lock(&hb->lock); ++ ++ plist_for_each_entry_safe(this, next, &hb->chain, list) { ++ if (match_futex (&this->key, &key)) { ++ if (this->pi_state || this->rt_waiter) { ++ ret = -EINVAL; ++ break; ++ } ++ ++ /* Check if one of the bits is set in both bitsets */ ++ if (!(this->bitset & bitset)) ++ continue; ++ ++ mark_wake_futex(&wake_q, this); ++ if (++ret >= nr_wake) ++ break; ++ } ++ } ++ ++ spin_unlock(&hb->lock); ++ wake_up_q(&wake_q); ++ return ret; ++} ++ ++static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) ++{ ++ unsigned int op = (encoded_op & 0x70000000) >> 28; ++ unsigned int cmp = (encoded_op & 0x0f000000) >> 24; ++ int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11); ++ int cmparg = sign_extend32(encoded_op & 0x00000fff, 11); ++ int oldval, ret; ++ ++ if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { ++ if (oparg < 0 || oparg > 31) { ++ char comm[sizeof(current->comm)]; ++ /* ++ * kill this print and return -EINVAL when userspace ++ * is sane again ++ */ ++ pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n", ++ get_task_comm(comm, current), oparg); ++ oparg &= 31; ++ } ++ oparg = 1 << oparg; ++ } ++ ++ pagefault_disable(); ++ ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr); ++ pagefault_enable(); ++ if (ret) ++ return ret; ++ ++ switch (cmp) { ++ case FUTEX_OP_CMP_EQ: ++ return oldval == cmparg; ++ case FUTEX_OP_CMP_NE: ++ return oldval != cmparg; ++ case FUTEX_OP_CMP_LT: ++ return oldval < cmparg; ++ case FUTEX_OP_CMP_GE: ++ return oldval >= cmparg; ++ case FUTEX_OP_CMP_LE: ++ return oldval <= cmparg; ++ case FUTEX_OP_CMP_GT: ++ return oldval > cmparg; ++ default: ++ return -ENOSYS; ++ } ++} ++ ++/* ++ * Wake up all waiters hashed on the physical page that is mapped ++ * to this virtual address: ++ */ ++static int ++futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, ++ int nr_wake, int nr_wake2, int op) ++{ ++ union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; ++ struct futex_hash_bucket *hb1, *hb2; ++ struct futex_q *this, *next; ++ int ret, op_ret; ++ DEFINE_WAKE_Q(wake_q); ++ ++retry: ++ ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); ++ if (unlikely(ret != 0)) ++ return ret; ++ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); ++ if (unlikely(ret != 0)) ++ return ret; ++ ++ hb1 = hash_futex(&key1); ++ hb2 = hash_futex(&key2); ++ ++retry_private: ++ double_lock_hb(hb1, hb2); ++ op_ret = futex_atomic_op_inuser(op, uaddr2); ++ if (unlikely(op_ret < 0)) { ++ double_unlock_hb(hb1, hb2); ++ ++ if (!IS_ENABLED(CONFIG_MMU) || ++ unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { ++ /* ++ * we don't get EFAULT from MMU faults if we don't have ++ * an MMU, but we might get them from range checking ++ */ ++ ret = op_ret; ++ return ret; ++ } ++ ++ if (op_ret == -EFAULT) { ++ ret = fault_in_user_writeable(uaddr2); ++ if (ret) ++ return ret; ++ } ++ ++ if (!(flags & FLAGS_SHARED)) { ++ cond_resched(); ++ 
goto retry_private; ++ } ++ ++ cond_resched(); ++ goto retry; ++ } ++ ++ plist_for_each_entry_safe(this, next, &hb1->chain, list) { ++ if (match_futex (&this->key, &key1)) { ++ if (this->pi_state || this->rt_waiter) { ++ ret = -EINVAL; ++ goto out_unlock; ++ } ++ mark_wake_futex(&wake_q, this); ++ if (++ret >= nr_wake) ++ break; ++ } ++ } ++ ++ if (op_ret > 0) { ++ op_ret = 0; ++ plist_for_each_entry_safe(this, next, &hb2->chain, list) { ++ if (match_futex (&this->key, &key2)) { ++ if (this->pi_state || this->rt_waiter) { ++ ret = -EINVAL; ++ goto out_unlock; ++ } ++ mark_wake_futex(&wake_q, this); ++ if (++op_ret >= nr_wake2) ++ break; ++ } ++ } ++ ret += op_ret; ++ } ++ ++out_unlock: ++ double_unlock_hb(hb1, hb2); ++ wake_up_q(&wake_q); ++ return ret; ++} ++ ++/** ++ * requeue_futex() - Requeue a futex_q from one hb to another ++ * @q: the futex_q to requeue ++ * @hb1: the source hash_bucket ++ * @hb2: the target hash_bucket ++ * @key2: the new key for the requeued futex_q ++ */ ++static inline ++void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, ++ struct futex_hash_bucket *hb2, union futex_key *key2) ++{ ++ ++ /* ++ * If key1 and key2 hash to the same bucket, no need to ++ * requeue. ++ */ ++ if (likely(&hb1->chain != &hb2->chain)) { ++ plist_del(&q->list, &hb1->chain); ++ hb_waiters_dec(hb1); ++ hb_waiters_inc(hb2); ++ plist_add(&q->list, &hb2->chain); ++ q->lock_ptr = &hb2->lock; ++ } ++ q->key = *key2; ++} ++ ++/** ++ * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue ++ * @q: the futex_q ++ * @key: the key of the requeue target futex ++ * @hb: the hash_bucket of the requeue target futex ++ * ++ * During futex_requeue, with requeue_pi=1, it is possible to acquire the ++ * target futex if it is uncontended or via a lock steal. Set the futex_q key ++ * to the requeue target futex so the waiter can detect the wakeup on the right ++ * futex, but remove it from the hb and NULL the rt_waiter so it can detect ++ * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock ++ * to protect access to the pi_state to fixup the owner later. Must be called ++ * with both q->lock_ptr and hb->lock held. ++ */ ++static inline ++void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, ++ struct futex_hash_bucket *hb) ++{ ++ q->key = *key; ++ ++ __unqueue_futex(q); ++ ++ WARN_ON(!q->rt_waiter); ++ q->rt_waiter = NULL; ++ ++ q->lock_ptr = &hb->lock; ++ ++ wake_up_state(q->task, TASK_NORMAL); ++} ++ ++/** ++ * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter ++ * @pifutex: the user address of the to futex ++ * @hb1: the from futex hash bucket, must be locked by the caller ++ * @hb2: the to futex hash bucket, must be locked by the caller ++ * @key1: the from futex key ++ * @key2: the to futex key ++ * @ps: address to store the pi_state pointer ++ * @exiting: Pointer to store the task pointer of the owner task ++ * which is in the middle of exiting ++ * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) ++ * ++ * Try and get the lock on behalf of the top waiter if we can do it atomically. ++ * Wake the top waiter if we succeed. If the caller specified set_waiters, ++ * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. ++ * hb1 and hb2 must be held by the caller. ++ * ++ * @exiting is only set when the return value is -EBUSY. If so, this holds ++ * a refcount on the exiting task on return and the caller needs to drop it ++ * after waiting for the exit to complete. 
++ *
++ * Return:
++ * - 0 - failed to acquire the lock atomically;
++ * - >0 - acquired the lock, return value is vpid of the top_waiter
++ * - <0 - error
++ */
++static int
++futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
++ struct futex_hash_bucket *hb2, union futex_key *key1,
++ union futex_key *key2, struct futex_pi_state **ps,
++ struct task_struct **exiting, int set_waiters)
++{
++ struct futex_q *top_waiter = NULL;
++ u32 curval;
++ int ret, vpid;
++
++ if (get_futex_value_locked(&curval, pifutex))
++ return -EFAULT;
++
++ if (unlikely(should_fail_futex(true)))
++ return -EFAULT;
++
++ /*
++ * Find the top_waiter and determine if there are additional waiters.
++ * If the caller intends to requeue more than 1 waiter to pifutex,
++ * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
++ * as we have means to handle the possible fault. If not, don't set
++ * the bit unnecessarily as it will force the subsequent unlock to enter
++ * the kernel.
++ */
++ top_waiter = futex_top_waiter(hb1, key1);
++
++ /* There are no waiters, nothing for us to do. */
++ if (!top_waiter)
++ return 0;
++
++ /* Ensure we requeue to the expected futex. */
++ if (!match_futex(top_waiter->requeue_pi_key, key2))
++ return -EINVAL;
++
++ /*
++ * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in
++ * the contended case or if set_waiters is 1. The pi_state is returned
++ * in ps in contended cases.
++ */
++ vpid = task_pid_vnr(top_waiter->task);
++ ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
++ exiting, set_waiters);
++ if (ret == 1) {
++ requeue_pi_wake_futex(top_waiter, key2, hb2);
++ return vpid;
++ }
++ return ret;
++}
++
++/**
++ * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
++ * @uaddr1: source futex user address
++ * @flags: futex flags (FLAGS_SHARED, etc.)
++ * @uaddr2: target futex user address
++ * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
++ * @nr_requeue: number of waiters to requeue (0-INT_MAX)
++ * @cmpval: @uaddr1 expected value (or %NULL)
++ * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
++ * pi futex (pi to pi requeue is not supported)
++ *
++ * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
++ * uaddr2 atomically on behalf of the top waiter.
++ *
++ * Return:
++ * - >=0 - on success, the number of tasks requeued or woken;
++ * - <0 - on error
++ */
++static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
++ u32 __user *uaddr2, int nr_wake, int nr_requeue,
++ u32 *cmpval, int requeue_pi)
++{
++ union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
++ int task_count = 0, ret;
++ struct futex_pi_state *pi_state = NULL;
++ struct futex_hash_bucket *hb1, *hb2;
++ struct futex_q *this, *next;
++ DEFINE_WAKE_Q(wake_q);
++
++ if (nr_wake < 0 || nr_requeue < 0)
++ return -EINVAL;
++
++ /*
++ * When PI not supported: return -ENOSYS if requeue_pi is true,
++ * consequently the compiler knows requeue_pi is always false past
++ * this point which will optimize away all the conditional code
++ * further down.
++ */
++ if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi)
++ return -ENOSYS;
++
++ if (requeue_pi) {
++ /*
++ * Requeue PI only works on two distinct uaddrs. This
++ * check is only valid for private futexes. See below.
++ */
++ if (uaddr1 == uaddr2)
++ return -EINVAL;
++
++ /*
++ * requeue_pi requires a pi_state, try to allocate it now
++ * without any locks in case it fails.
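++ *
++ * (Illustrative mapping, with made-up names: a condvar broadcast
++ * over a PI mutex would look roughly like
++ *   futex(&cond->word, FUTEX_CMP_REQUEUE_PI, 1, INT_MAX,
++ *         &mutex->word, cond_val);
++ * i.e. wake one waiter and requeue the rest onto the mutex word.)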
++ */ ++ if (refill_pi_state_cache()) ++ return -ENOMEM; ++ /* ++ * requeue_pi must wake as many tasks as it can, up to nr_wake ++ * + nr_requeue, since it acquires the rt_mutex prior to ++ * returning to userspace, so as to not leave the rt_mutex with ++ * waiters and no owner. However, second and third wake-ups ++ * cannot be predicted as they involve race conditions with the ++ * first wake and a fault while looking up the pi_state. Both ++ * pthread_cond_signal() and pthread_cond_broadcast() should ++ * use nr_wake=1. ++ */ ++ if (nr_wake != 1) ++ return -EINVAL; ++ } ++ ++retry: ++ ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); ++ if (unlikely(ret != 0)) ++ return ret; ++ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, ++ requeue_pi ? FUTEX_WRITE : FUTEX_READ); ++ if (unlikely(ret != 0)) ++ return ret; ++ ++ /* ++ * The check above which compares uaddrs is not sufficient for ++ * shared futexes. We need to compare the keys: ++ */ ++ if (requeue_pi && match_futex(&key1, &key2)) ++ return -EINVAL; ++ ++ hb1 = hash_futex(&key1); ++ hb2 = hash_futex(&key2); ++ ++retry_private: ++ hb_waiters_inc(hb2); ++ double_lock_hb(hb1, hb2); ++ ++ if (likely(cmpval != NULL)) { ++ u32 curval; ++ ++ ret = get_futex_value_locked(&curval, uaddr1); ++ ++ if (unlikely(ret)) { ++ double_unlock_hb(hb1, hb2); ++ hb_waiters_dec(hb2); ++ ++ ret = get_user(curval, uaddr1); ++ if (ret) ++ return ret; ++ ++ if (!(flags & FLAGS_SHARED)) ++ goto retry_private; ++ ++ goto retry; ++ } ++ if (curval != *cmpval) { ++ ret = -EAGAIN; ++ goto out_unlock; ++ } ++ } ++ ++ if (requeue_pi && (task_count - nr_wake < nr_requeue)) { ++ struct task_struct *exiting = NULL; ++ ++ /* ++ * Attempt to acquire uaddr2 and wake the top waiter. If we ++ * intend to requeue waiters, force setting the FUTEX_WAITERS ++ * bit. We force this here where we are able to easily handle ++ * faults rather in the requeue loop below. ++ */ ++ ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, ++ &key2, &pi_state, ++ &exiting, nr_requeue); ++ ++ /* ++ * At this point the top_waiter has either taken uaddr2 or is ++ * waiting on it. If the former, then the pi_state will not ++ * exist yet, look it up one more time to ensure we have a ++ * reference to it. If the lock was taken, ret contains the ++ * vpid of the top waiter task. ++ * If the lock was not taken, we have pi_state and an initial ++ * refcount on it. In case of an error we have nothing. ++ */ ++ if (ret > 0) { ++ WARN_ON(pi_state); ++ task_count++; ++ /* ++ * If we acquired the lock, then the user space value ++ * of uaddr2 should be vpid. It cannot be changed by ++ * the top waiter as it is blocked on hb2 lock if it ++ * tries to do so. If something fiddled with it behind ++ * our back the pi state lookup might unearth it. So ++ * we rather use the known value than rereading and ++ * handing potential crap to lookup_pi_state. ++ * ++ * If that call succeeds then we have pi_state and an ++ * initial refcount on it. ++ */ ++ ret = lookup_pi_state(uaddr2, ret, hb2, &key2, ++ &pi_state, &exiting); ++ } ++ ++ switch (ret) { ++ case 0: ++ /* We hold a reference on the pi state. */ ++ break; ++ ++ /* If the above failed, then pi_state is NULL */ ++ case -EFAULT: ++ double_unlock_hb(hb1, hb2); ++ hb_waiters_dec(hb2); ++ ret = fault_in_user_writeable(uaddr2); ++ if (!ret) ++ goto retry; ++ return ret; ++ case -EBUSY: ++ case -EAGAIN: ++ /* ++ * Two reasons for this: ++ * - EBUSY: Owner is exiting and we just wait for the ++ * exit to complete. 
++ * - EAGAIN: The user space value changed. ++ */ ++ double_unlock_hb(hb1, hb2); ++ hb_waiters_dec(hb2); ++ /* ++ * Handle the case where the owner is in the middle of ++ * exiting. Wait for the exit to complete otherwise ++ * this task might loop forever, aka. live lock. ++ */ ++ wait_for_owner_exiting(ret, exiting); ++ cond_resched(); ++ goto retry; ++ default: ++ goto out_unlock; ++ } ++ } ++ ++ plist_for_each_entry_safe(this, next, &hb1->chain, list) { ++ if (task_count - nr_wake >= nr_requeue) ++ break; ++ ++ if (!match_futex(&this->key, &key1)) ++ continue; ++ ++ /* ++ * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always ++ * be paired with each other and no other futex ops. ++ * ++ * We should never be requeueing a futex_q with a pi_state, ++ * which is awaiting a futex_unlock_pi(). ++ */ ++ if ((requeue_pi && !this->rt_waiter) || ++ (!requeue_pi && this->rt_waiter) || ++ this->pi_state) { ++ ret = -EINVAL; ++ break; ++ } ++ ++ /* ++ * Wake nr_wake waiters. For requeue_pi, if we acquired the ++ * lock, we already woke the top_waiter. If not, it will be ++ * woken by futex_unlock_pi(). ++ */ ++ if (++task_count <= nr_wake && !requeue_pi) { ++ mark_wake_futex(&wake_q, this); ++ continue; ++ } ++ ++ /* Ensure we requeue to the expected futex for requeue_pi. */ ++ if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) { ++ ret = -EINVAL; ++ break; ++ } ++ ++ /* ++ * Requeue nr_requeue waiters and possibly one more in the case ++ * of requeue_pi if we couldn't acquire the lock atomically. ++ */ ++ if (requeue_pi) { ++ /* ++ * Prepare the waiter to take the rt_mutex. Take a ++ * refcount on the pi_state and store the pointer in ++ * the futex_q object of the waiter. ++ */ ++ get_pi_state(pi_state); ++ this->pi_state = pi_state; ++ ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, ++ this->rt_waiter, ++ this->task); ++ if (ret == 1) { ++ /* ++ * We got the lock. We do neither drop the ++ * refcount on pi_state nor clear ++ * this->pi_state because the waiter needs the ++ * pi_state for cleaning up the user space ++ * value. It will drop the refcount after ++ * doing so. ++ */ ++ requeue_pi_wake_futex(this, &key2, hb2); ++ continue; ++ } else if (ret) { ++ /* ++ * rt_mutex_start_proxy_lock() detected a ++ * potential deadlock when we tried to queue ++ * that waiter. Drop the pi_state reference ++ * which we took above and remove the pointer ++ * to the state from the waiters futex_q ++ * object. ++ */ ++ this->pi_state = NULL; ++ put_pi_state(pi_state); ++ /* ++ * We stop queueing more waiters and let user ++ * space deal with the mess. ++ */ ++ break; ++ } ++ } ++ requeue_futex(this, hb1, hb2, &key2); ++ } ++ ++ /* ++ * We took an extra initial reference to the pi_state either ++ * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We ++ * need to drop it here again. ++ */ ++ put_pi_state(pi_state); ++ ++out_unlock: ++ double_unlock_hb(hb1, hb2); ++ wake_up_q(&wake_q); ++ hb_waiters_dec(hb2); ++ return ret ? ret : task_count; ++} ++ ++/* The key must be already stored in q->key. */ ++static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) ++ __acquires(&hb->lock) ++{ ++ struct futex_hash_bucket *hb; ++ ++ hb = hash_futex(&q->key); ++ ++ /* ++ * Increment the counter before taking the lock so that ++ * a potential waker won't miss a to-be-slept task that is ++ * waiting for the spinlock. This is safe as all queue_lock() ++ * users end up calling queue_me(). 
Similarly, for housekeeping,
++ * decrement the counter at queue_unlock() when some error has
++ * occurred and we don't end up adding the task to the list.
++ */
++ hb_waiters_inc(hb); /* implies smp_mb(); (A) */
++
++ q->lock_ptr = &hb->lock;
++
++ spin_lock(&hb->lock);
++ return hb;
++}
++
++static inline void
++queue_unlock(struct futex_hash_bucket *hb)
++ __releases(&hb->lock)
++{
++ spin_unlock(&hb->lock);
++ hb_waiters_dec(hb);
++}
++
++static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
++{
++ int prio;
++
++ /*
++ * The priority used to register this element is
++ * - either the real thread-priority for the real-time threads
++ * (i.e. threads with a priority lower than MAX_RT_PRIO)
++ * - or MAX_RT_PRIO for non-RT threads.
++ * Thus, all RT-threads are woken first in priority order, and
++ * the others are woken last, in FIFO order.
++ */
++ prio = min(current->normal_prio, MAX_RT_PRIO);
++
++ plist_node_init(&q->list, prio);
++ plist_add(&q->list, &hb->chain);
++ q->task = current;
++}
++
++/**
++ * queue_me() - Enqueue the futex_q on the futex_hash_bucket
++ * @q: The futex_q to enqueue
++ * @hb: The destination hash bucket
++ *
++ * The hb->lock must be held by the caller, and is released here. A call to
++ * queue_me() is typically paired with exactly one call to unqueue_me(). The
++ * exceptions involve the PI related operations, which may use unqueue_me_pi()
++ * or nothing if the unqueue is done as part of the wake process and the unqueue
++ * state is implicit in the state of the woken task (see futex_wait_requeue_pi()
++ * for an example).
++ */
++static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
++ __releases(&hb->lock)
++{
++ __queue_me(q, hb);
++ spin_unlock(&hb->lock);
++}
++
++/**
++ * unqueue_me() - Remove the futex_q from its futex_hash_bucket
++ * @q: The futex_q to unqueue
++ *
++ * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
++ * be paired with exactly one earlier call to queue_me().
++ *
++ * Return:
++ * - 1 - if the futex_q was still queued (and we unqueued it);
++ * - 0 - if the futex_q was already removed by the waking thread
++ */
++static int unqueue_me(struct futex_q *q)
++{
++ spinlock_t *lock_ptr;
++ int ret = 0;
++
++ /* In the common case we don't take the spinlock, which is nice. */
++retry:
++ /*
++ * q->lock_ptr can change between this read and the following spin_lock.
++ * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
++ * optimizing lock_ptr out of the logic below.
++ */
++ lock_ptr = READ_ONCE(q->lock_ptr);
++ if (lock_ptr != NULL) {
++ spin_lock(lock_ptr);
++ /*
++ * q->lock_ptr can change between reading it and
++ * spin_lock(), causing us to take the wrong lock. This
++ * corrects the race condition.
++ *
++ * Reasoning goes like this: if we have the wrong lock,
++ * q->lock_ptr must have changed (maybe several times)
++ * between reading it and the spin_lock(). It can
++ * change again after the spin_lock() but only if it was
++ * already changed before the spin_lock(). It cannot,
++ * however, change back to the original value. Therefore
++ * we can detect whether we acquired the correct lock.
++ */
++ if (unlikely(lock_ptr != q->lock_ptr)) {
++ spin_unlock(lock_ptr);
++ goto retry;
++ }
++ __unqueue_futex(q);
++
++ BUG_ON(q->pi_state);
++
++ spin_unlock(lock_ptr);
++ ret = 1;
++ }
++
++ return ret;
++}
++
++/*
++ * PI futexes cannot be requeued and must remove themselves from the
++ * hash bucket. The hash bucket lock (i.e.
lock_ptr) is held on entry ++ * and dropped here. ++ */ ++static void unqueue_me_pi(struct futex_q *q) ++ __releases(q->lock_ptr) ++{ ++ __unqueue_futex(q); ++ ++ BUG_ON(!q->pi_state); ++ put_pi_state(q->pi_state); ++ q->pi_state = NULL; ++ ++ spin_unlock(q->lock_ptr); ++} ++ ++static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, ++ struct task_struct *argowner) ++{ ++ struct futex_pi_state *pi_state = q->pi_state; ++ u32 uval, curval, newval; ++ struct task_struct *oldowner, *newowner; ++ u32 newtid; ++ int ret, err = 0; ++ ++ lockdep_assert_held(q->lock_ptr); ++ ++ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); ++ ++ oldowner = pi_state->owner; ++ ++ /* ++ * We are here because either: ++ * ++ * - we stole the lock and pi_state->owner needs updating to reflect ++ * that (@argowner == current), ++ * ++ * or: ++ * ++ * - someone stole our lock and we need to fix things to point to the ++ * new owner (@argowner == NULL). ++ * ++ * Either way, we have to replace the TID in the user space variable. ++ * This must be atomic as we have to preserve the owner died bit here. ++ * ++ * Note: We write the user space value _before_ changing the pi_state ++ * because we can fault here. Imagine swapped out pages or a fork ++ * that marked all the anonymous memory readonly for cow. ++ * ++ * Modifying pi_state _before_ the user space value would leave the ++ * pi_state in an inconsistent state when we fault here, because we ++ * need to drop the locks to handle the fault. This might be observed ++ * in the PID check in lookup_pi_state. ++ */ ++retry: ++ if (!argowner) { ++ if (oldowner != current) { ++ /* ++ * We raced against a concurrent self; things are ++ * already fixed up. Nothing to do. ++ */ ++ ret = 0; ++ goto out_unlock; ++ } ++ ++ if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) { ++ /* We got the lock after all, nothing to fix. */ ++ ret = 0; ++ goto out_unlock; ++ } ++ ++ /* ++ * Since we just failed the trylock; there must be an owner. ++ */ ++ newowner = rt_mutex_owner(&pi_state->pi_mutex); ++ BUG_ON(!newowner); ++ } else { ++ WARN_ON_ONCE(argowner != current); ++ if (oldowner == current) { ++ /* ++ * We raced against a concurrent self; things are ++ * already fixed up. Nothing to do. ++ */ ++ ret = 0; ++ goto out_unlock; ++ } ++ newowner = argowner; ++ } ++ ++ newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; ++ /* Owner died? */ ++ if (!pi_state->owner) ++ newtid |= FUTEX_OWNER_DIED; ++ ++ err = get_futex_value_locked(&uval, uaddr); ++ if (err) ++ goto handle_err; ++ ++ for (;;) { ++ newval = (uval & FUTEX_OWNER_DIED) | newtid; ++ ++ err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); ++ if (err) ++ goto handle_err; ++ ++ if (curval == uval) ++ break; ++ uval = curval; ++ } ++ ++ /* ++ * We fixed up user space. Now we need to fix the pi_state ++ * itself. ++ */ ++ if (pi_state->owner != NULL) { ++ raw_spin_lock(&pi_state->owner->pi_lock); ++ WARN_ON(list_empty(&pi_state->list)); ++ list_del_init(&pi_state->list); ++ raw_spin_unlock(&pi_state->owner->pi_lock); ++ } ++ ++ pi_state->owner = newowner; ++ ++ raw_spin_lock(&newowner->pi_lock); ++ WARN_ON(!list_empty(&pi_state->list)); ++ list_add(&pi_state->list, &newowner->pi_state_list); ++ raw_spin_unlock(&newowner->pi_lock); ++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); ++ ++ return 0; ++ ++ /* ++ * In order to reschedule or handle a page fault, we need to drop the ++ * locks here. 
In the case of a fault, this gives the other task ++ * (either the highest priority waiter itself or the task which stole ++ * the rtmutex) the chance to try the fixup of the pi_state. So once we ++ * are back from handling the fault we need to check the pi_state after ++ * reacquiring the locks and before trying to do another fixup. When ++ * the fixup has been done already we simply return. ++ * ++ * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely ++ * drop hb->lock since the caller owns the hb -> futex_q relation. ++ * Dropping the pi_mutex->wait_lock requires the state revalidate. ++ */ ++handle_err: ++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); ++ spin_unlock(q->lock_ptr); ++ ++ switch (err) { ++ case -EFAULT: ++ ret = fault_in_user_writeable(uaddr); ++ break; ++ ++ case -EAGAIN: ++ cond_resched(); ++ ret = 0; ++ break; ++ ++ default: ++ WARN_ON_ONCE(1); ++ ret = err; ++ break; ++ } ++ ++ spin_lock(q->lock_ptr); ++ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); ++ ++ /* ++ * Check if someone else fixed it for us: ++ */ ++ if (pi_state->owner != oldowner) { ++ ret = 0; ++ goto out_unlock; ++ } ++ ++ if (ret) ++ goto out_unlock; ++ ++ goto retry; ++ ++out_unlock: ++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); ++ return ret; ++} ++ ++static long futex_wait_restart(struct restart_block *restart); ++ ++/** ++ * fixup_owner() - Post lock pi_state and corner case management ++ * @uaddr: user address of the futex ++ * @q: futex_q (contains pi_state and access to the rt_mutex) ++ * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) ++ * ++ * After attempting to lock an rt_mutex, this function is called to cleanup ++ * the pi_state owner as well as handle race conditions that may allow us to ++ * acquire the lock. Must be called with the hb lock held. ++ * ++ * Return: ++ * - 1 - success, lock taken; ++ * - 0 - success, lock not taken; ++ * - <0 - on error (-EFAULT) ++ */ ++static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) ++{ ++ int ret = 0; ++ ++ if (locked) { ++ /* ++ * Got the lock. We might not be the anticipated owner if we ++ * did a lock-steal - fix up the PI-state in that case: ++ * ++ * Speculative pi_state->owner read (we don't hold wait_lock); ++ * since we own the lock pi_state->owner == current is the ++ * stable state, anything else needs more attention. ++ */ ++ if (q->pi_state->owner != current) ++ ret = fixup_pi_state_owner(uaddr, q, current); ++ return ret ? ret : locked; ++ } ++ ++ /* ++ * If we didn't get the lock; check if anybody stole it from us. In ++ * that case, we need to fix up the uval to point to them instead of ++ * us, otherwise bad things happen. [10] ++ * ++ * Another speculative read; pi_state->owner == current is unstable ++ * but needs our attention. ++ */ ++ if (q->pi_state->owner == current) { ++ ret = fixup_pi_state_owner(uaddr, q, NULL); ++ return ret; ++ } ++ ++ /* ++ * Paranoia check. If we did not take the lock, then we should not be ++ * the owner of the rt_mutex. 
++ */ ++ if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) { ++ printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " ++ "pi-state %p\n", ret, ++ q->pi_state->pi_mutex.owner, ++ q->pi_state->owner); ++ } ++ ++ return ret; ++} ++ ++/** ++ * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal ++ * @hb: the futex hash bucket, must be locked by the caller ++ * @q: the futex_q to queue up on ++ * @timeout: the prepared hrtimer_sleeper, or null for no timeout ++ */ ++static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, ++ struct hrtimer_sleeper *timeout) ++{ ++ /* ++ * The task state is guaranteed to be set before another task can ++ * wake it. set_current_state() is implemented using smp_store_mb() and ++ * queue_me() calls spin_unlock() upon completion, both serializing ++ * access to the hash list and forcing another memory barrier. ++ */ ++ set_current_state(TASK_INTERRUPTIBLE); ++ queue_me(q, hb); ++ ++ /* Arm the timer */ ++ if (timeout) ++ hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS); ++ ++ /* ++ * If we have been removed from the hash list, then another task ++ * has tried to wake us, and we can skip the call to schedule(). ++ */ ++ if (likely(!plist_node_empty(&q->list))) { ++ /* ++ * If the timer has already expired, current will already be ++ * flagged for rescheduling. Only call schedule if there ++ * is no timeout, or if it has yet to expire. ++ */ ++ if (!timeout || timeout->task) ++ freezable_schedule(); ++ } ++ __set_current_state(TASK_RUNNING); ++} ++ ++/** ++ * futex_wait_setup() - Prepare to wait on a futex ++ * @uaddr: the futex userspace address ++ * @val: the expected value ++ * @flags: futex flags (FLAGS_SHARED, etc.) ++ * @q: the associated futex_q ++ * @hb: storage for hash_bucket pointer to be returned to caller ++ * ++ * Setup the futex_q and locate the hash_bucket. Get the futex value and ++ * compare it with the expected value. Handle atomic faults internally. ++ * Return with the hb lock held and a q.key reference on success, and unlocked ++ * with no q.key reference on failure. ++ * ++ * Return: ++ * - 0 - uaddr contains val and hb has been locked; ++ * - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked ++ */ ++static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, ++ struct futex_q *q, struct futex_hash_bucket **hb) ++{ ++ u32 uval; ++ int ret; ++ ++ /* ++ * Access the page AFTER the hash-bucket is locked. ++ * Order is important: ++ * ++ * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); ++ * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); } ++ * ++ * The basic logical guarantee of a futex is that it blocks ONLY ++ * if cond(var) is known to be true at the time of blocking, for ++ * any cond. If we locked the hash-bucket after testing *uaddr, that ++ * would open a race condition where we could block indefinitely with ++ * cond(var) false, which would violate the guarantee. ++ * ++ * On the other hand, we insert q and release the hash-bucket only ++ * after testing *uaddr. This guarantees that futex_wait() will NOT ++ * absorb a wakeup if *uaddr does not match the desired values ++ * while the syscall executes. 
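++ *
++ * Concretely, had we tested *uaddr before locking the hash bucket,
++ * this interleaving (illustrative) would lose the wakeup:
++ *
++ *   waiter: reads *uaddr == val
++ *   waker:  var = new; futex_wake(&var)   <- no waiter queued yet
++ *   waiter: queues itself and sleeps with cond(var) false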
++ */ ++retry: ++ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ); ++ if (unlikely(ret != 0)) ++ return ret; ++ ++retry_private: ++ *hb = queue_lock(q); ++ ++ ret = get_futex_value_locked(&uval, uaddr); ++ ++ if (ret) { ++ queue_unlock(*hb); ++ ++ ret = get_user(uval, uaddr); ++ if (ret) ++ return ret; ++ ++ if (!(flags & FLAGS_SHARED)) ++ goto retry_private; ++ ++ goto retry; ++ } ++ ++ if (uval != val) { ++ queue_unlock(*hb); ++ ret = -EWOULDBLOCK; ++ } ++ ++ return ret; ++} ++ ++static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ++ ktime_t *abs_time, u32 bitset) ++{ ++ struct hrtimer_sleeper timeout, *to; ++ struct restart_block *restart; ++ struct futex_hash_bucket *hb; ++ struct futex_q q = futex_q_init; ++ int ret; ++ ++ if (!bitset) ++ return -EINVAL; ++ q.bitset = bitset; ++ ++ to = futex_setup_timer(abs_time, &timeout, flags, ++ current->timer_slack_ns); ++retry: ++ /* ++ * Prepare to wait on uaddr. On success, holds hb lock and increments ++ * q.key refs. ++ */ ++ ret = futex_wait_setup(uaddr, val, flags, &q, &hb); ++ if (ret) ++ goto out; ++ ++ /* queue_me and wait for wakeup, timeout, or a signal. */ ++ futex_wait_queue_me(hb, &q, to); ++ ++ /* If we were woken (and unqueued), we succeeded, whatever. */ ++ ret = 0; ++ /* unqueue_me() drops q.key ref */ ++ if (!unqueue_me(&q)) ++ goto out; ++ ret = -ETIMEDOUT; ++ if (to && !to->task) ++ goto out; ++ ++ /* ++ * We expect signal_pending(current), but we might be the ++ * victim of a spurious wakeup as well. ++ */ ++ if (!signal_pending(current)) ++ goto retry; ++ ++ ret = -ERESTARTSYS; ++ if (!abs_time) ++ goto out; ++ ++ restart = ¤t->restart_block; ++ restart->fn = futex_wait_restart; ++ restart->futex.uaddr = uaddr; ++ restart->futex.val = val; ++ restart->futex.time = *abs_time; ++ restart->futex.bitset = bitset; ++ restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; ++ ++ ret = -ERESTART_RESTARTBLOCK; ++ ++out: ++ if (to) { ++ hrtimer_cancel(&to->timer); ++ destroy_hrtimer_on_stack(&to->timer); ++ } ++ return ret; ++} ++ ++ ++static long futex_wait_restart(struct restart_block *restart) ++{ ++ u32 __user *uaddr = restart->futex.uaddr; ++ ktime_t t, *tp = NULL; ++ ++ if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { ++ t = restart->futex.time; ++ tp = &t; ++ } ++ restart->fn = do_no_restart_syscall; ++ ++ return (long)futex_wait(uaddr, restart->futex.flags, ++ restart->futex.val, tp, restart->futex.bitset); ++} ++ ++ ++/* ++ * Userspace tried a 0 -> TID atomic transition of the futex value ++ * and failed. The kernel side here does the whole locking operation: ++ * if there are waiters then it will block as a consequence of relying ++ * on rt-mutexes, it does PI, etc. (Due to races the kernel might see ++ * a 0 value of the futex too.). ++ * ++ * Also serves as futex trylock_pi()'ing, and due semantics. 
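++ *
++ * (For reference, do_futex1() below dispatches FUTEX_TRYLOCK_PI to
++ * this function with trylock=1 and a NULL timeout, so the trylock
++ * path never blocks on the rt_mutex.)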
++ */ ++static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ++ ktime_t *time, int trylock) ++{ ++ struct hrtimer_sleeper timeout, *to; ++ struct futex_pi_state *pi_state = NULL; ++ struct task_struct *exiting = NULL; ++ struct rt_mutex_waiter rt_waiter; ++ struct futex_hash_bucket *hb; ++ struct futex_q q = futex_q_init; ++ int res, ret; ++ ++ if (!IS_ENABLED(CONFIG_FUTEX_PI)) ++ return -ENOSYS; ++ ++ if (refill_pi_state_cache()) ++ return -ENOMEM; ++ ++ to = futex_setup_timer(time, &timeout, FLAGS_CLOCKRT, 0); ++ ++retry: ++ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE); ++ if (unlikely(ret != 0)) ++ goto out; ++ ++retry_private: ++ hb = queue_lock(&q); ++ ++ ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, ++ &exiting, 0); ++ if (unlikely(ret)) { ++ /* ++ * Atomic work succeeded and we got the lock, ++ * or failed. Either way, we do _not_ block. ++ */ ++ switch (ret) { ++ case 1: ++ /* We got the lock. */ ++ ret = 0; ++ goto out_unlock_put_key; ++ case -EFAULT: ++ goto uaddr_faulted; ++ case -EBUSY: ++ case -EAGAIN: ++ /* ++ * Two reasons for this: ++ * - EBUSY: Task is exiting and we just wait for the ++ * exit to complete. ++ * - EAGAIN: The user space value changed. ++ */ ++ queue_unlock(hb); ++ /* ++ * Handle the case where the owner is in the middle of ++ * exiting. Wait for the exit to complete otherwise ++ * this task might loop forever, aka. live lock. ++ */ ++ wait_for_owner_exiting(ret, exiting); ++ cond_resched(); ++ goto retry; ++ default: ++ goto out_unlock_put_key; ++ } ++ } ++ ++ WARN_ON(!q.pi_state); ++ ++ /* ++ * Only actually queue now that the atomic ops are done: ++ */ ++ __queue_me(&q, hb); ++ ++ if (trylock) { ++ ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); ++ /* Fixup the trylock return value: */ ++ ret = ret ? 0 : -EWOULDBLOCK; ++ goto no_block; ++ } ++ ++ rt_mutex_init_waiter(&rt_waiter); ++ ++ /* ++ * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not ++ * hold it while doing rt_mutex_start_proxy(), because then it will ++ * include hb->lock in the blocking chain, even through we'll not in ++ * fact hold it while blocking. This will lead it to report -EDEADLK ++ * and BUG when futex_unlock_pi() interleaves with this. ++ * ++ * Therefore acquire wait_lock while holding hb->lock, but drop the ++ * latter before calling __rt_mutex_start_proxy_lock(). This ++ * interleaves with futex_unlock_pi() -- which does a similar lock ++ * handoff -- such that the latter can observe the futex_q::pi_state ++ * before __rt_mutex_start_proxy_lock() is done. ++ */ ++ raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); ++ spin_unlock(q.lock_ptr); ++ /* ++ * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter ++ * such that futex_unlock_pi() is guaranteed to observe the waiter when ++ * it sees the futex_q::pi_state. ++ */ ++ ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); ++ raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock); ++ ++ if (ret) { ++ if (ret == 1) ++ ret = 0; ++ goto cleanup; ++ } ++ ++ if (unlikely(to)) ++ hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); ++ ++ ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); ++ ++cleanup: ++ spin_lock(q.lock_ptr); ++ /* ++ * If we failed to acquire the lock (deadlock/signal/timeout), we must ++ * first acquire the hb->lock before removing the lock from the ++ * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait ++ * lists consistent. 
++ * ++ * In particular; it is important that futex_unlock_pi() can not ++ * observe this inconsistency. ++ */ ++ if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) ++ ret = 0; ++ ++no_block: ++ /* ++ * Fixup the pi_state owner and possibly acquire the lock if we ++ * haven't already. ++ */ ++ res = fixup_owner(uaddr, &q, !ret); ++ /* ++ * If fixup_owner() returned an error, proprogate that. If it acquired ++ * the lock, clear our -ETIMEDOUT or -EINTR. ++ */ ++ if (res) ++ ret = (res < 0) ? res : 0; ++ ++ /* ++ * If fixup_owner() faulted and was unable to handle the fault, unlock ++ * it and return the fault to userspace. ++ */ ++ if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) { ++ pi_state = q.pi_state; ++ get_pi_state(pi_state); ++ } ++ ++ /* Unqueue and drop the lock */ ++ unqueue_me_pi(&q); ++ ++ if (pi_state) { ++ rt_mutex_futex_unlock(&pi_state->pi_mutex); ++ put_pi_state(pi_state); ++ } ++ ++ goto out; ++ ++out_unlock_put_key: ++ queue_unlock(hb); ++ ++out: ++ if (to) { ++ hrtimer_cancel(&to->timer); ++ destroy_hrtimer_on_stack(&to->timer); ++ } ++ return ret != -EINTR ? ret : -ERESTARTNOINTR; ++ ++uaddr_faulted: ++ queue_unlock(hb); ++ ++ ret = fault_in_user_writeable(uaddr); ++ if (ret) ++ goto out; ++ ++ if (!(flags & FLAGS_SHARED)) ++ goto retry_private; ++ ++ goto retry; ++} ++ ++/* ++ * Userspace attempted a TID -> 0 atomic transition, and failed. ++ * This is the in-kernel slowpath: we look up the PI state (if any), ++ * and do the rt-mutex unlock. ++ */ ++static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) ++{ ++ u32 curval, uval, vpid = task_pid_vnr(current); ++ union futex_key key = FUTEX_KEY_INIT; ++ struct futex_hash_bucket *hb; ++ struct futex_q *top_waiter; ++ int ret; ++ ++ if (!IS_ENABLED(CONFIG_FUTEX_PI)) ++ return -ENOSYS; ++ ++retry: ++ if (get_user(uval, uaddr)) ++ return -EFAULT; ++ /* ++ * We release only a lock we actually own: ++ */ ++ if ((uval & FUTEX_TID_MASK) != vpid) ++ return -EPERM; ++ ++ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE); ++ if (ret) ++ return ret; ++ ++ hb = hash_futex(&key); ++ spin_lock(&hb->lock); ++ ++ /* ++ * Check waiters first. We do not trust user space values at ++ * all and we at least want to know if user space fiddled ++ * with the futex value instead of blindly unlocking. ++ */ ++ top_waiter = futex_top_waiter(hb, &key); ++ if (top_waiter) { ++ struct futex_pi_state *pi_state = top_waiter->pi_state; ++ ++ ret = -EINVAL; ++ if (!pi_state) ++ goto out_unlock; ++ ++ /* ++ * If current does not own the pi_state then the futex is ++ * inconsistent and user space fiddled with the futex value. ++ */ ++ if (pi_state->owner != current) ++ goto out_unlock; ++ ++ get_pi_state(pi_state); ++ /* ++ * By taking wait_lock while still holding hb->lock, we ensure ++ * there is no point where we hold neither; and therefore ++ * wake_futex_pi() must observe a state consistent with what we ++ * observed. ++ * ++ * In particular; this forces __rt_mutex_start_proxy() to ++ * complete such that we're guaranteed to observe the ++ * rt_waiter. Also see the WARN in wake_futex_pi(). ++ */ ++ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); ++ spin_unlock(&hb->lock); ++ ++ /* drops pi_state->pi_mutex.wait_lock */ ++ ret = wake_futex_pi(uaddr, uval, pi_state); ++ ++ put_pi_state(pi_state); ++ ++ /* ++ * Success, we're done! No tricky corner cases. 
++ */
++ if (!ret)
++ goto out_putkey;
++ /*
++ * The atomic access to the futex value generated a
++ * pagefault, so retry the user-access and the wakeup:
++ */
++ if (ret == -EFAULT)
++ goto pi_faulted;
++ /*
++ * An unconditional UNLOCK_PI op raced against a waiter
++ * setting the FUTEX_WAITERS bit. Try again.
++ */
++ if (ret == -EAGAIN)
++ goto pi_retry;
++ /*
++ * wake_futex_pi has detected invalid state. Tell user
++ * space.
++ */
++ goto out_putkey;
++ }
++
++ /*
++ * We have no kernel internal state, i.e. no waiters in the
++ * kernel. Waiters which are about to queue themselves are stuck
++ * on hb->lock. So we can safely ignore them. We do neither
++ * preserve the WAITERS bit nor the OWNER_DIED one. We are the
++ * owner.
++ */
++ if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) {
++ spin_unlock(&hb->lock);
++ switch (ret) {
++ case -EFAULT:
++ goto pi_faulted;
++
++ case -EAGAIN:
++ goto pi_retry;
++
++ default:
++ WARN_ON_ONCE(1);
++ goto out_putkey;
++ }
++ }
++
++ /*
++ * If uval has changed, let user space handle it.
++ */
++ ret = (curval == uval) ? 0 : -EAGAIN;
++
++out_unlock:
++ spin_unlock(&hb->lock);
++out_putkey:
++ return ret;
++
++pi_retry:
++ cond_resched();
++ goto retry;
++
++pi_faulted:
++
++ ret = fault_in_user_writeable(uaddr);
++ if (!ret)
++ goto retry;
++
++ return ret;
++}
++
++/**
++ * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
++ * @hb: the hash_bucket futex_q was originally enqueued on
++ * @q: the futex_q woken while waiting to be requeued
++ * @key2: the futex_key of the requeue target futex
++ * @timeout: the timeout associated with the wait (NULL if none)
++ *
++ * Detect if the task was woken on the initial futex as opposed to the requeue
++ * target futex. If so, determine if it was a timeout or a signal that caused
++ * the wakeup and return the appropriate error code to the caller. Must be
++ * called with the hb lock held.
++ *
++ * Return:
++ * - 0 = no early wakeup detected;
++ * - <0 = -ETIMEDOUT or -ERESTARTNOINTR
++ */
++static inline
++int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
++ struct futex_q *q, union futex_key *key2,
++ struct hrtimer_sleeper *timeout)
++{
++ int ret = 0;
++
++ /*
++ * With the hb lock held, we avoid races while we process the wakeup.
++ * We only need to hold hb (and not hb2) to ensure atomicity as the
++ * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
++ * It can't be requeued from uaddr2 to something else since we don't
++ * support a PI aware source futex for requeue.
++ */
++ if (!match_futex(&q->key, key2)) {
++ WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
++ /*
++ * We were woken prior to requeue by a timeout or a signal.
++ * Unqueue the futex_q and determine which it was.
++ */
++ plist_del(&q->list, &hb->chain);
++ hb_waiters_dec(hb);
++
++ /* Handle spurious wakeups gracefully */
++ ret = -EWOULDBLOCK;
++ if (timeout && !timeout->task)
++ ret = -ETIMEDOUT;
++ else if (signal_pending(current))
++ ret = -ERESTARTNOINTR;
++ }
++ return ret;
++}
++
++/**
++ * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
++ * @uaddr: the futex we initially wait on (non-pi)
++ * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
++ * the same type, no requeueing from private to shared, etc.
++ * @val: the expected value of uaddr ++ * @abs_time: absolute timeout ++ * @bitset: 32 bit wakeup bitset set by userspace, defaults to all ++ * @uaddr2: the pi futex we will take prior to returning to user-space ++ * ++ * The caller will wait on uaddr and will be requeued by futex_requeue() to ++ * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake ++ * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to ++ * userspace. This ensures the rt_mutex maintains an owner when it has waiters; ++ * without one, the pi logic would not know which task to boost/deboost, if ++ * there was a need to. ++ * ++ * We call schedule in futex_wait_queue_me() when we enqueue and return there ++ * via the following-- ++ * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() ++ * 2) wakeup on uaddr2 after a requeue ++ * 3) signal ++ * 4) timeout ++ * ++ * If 3, cleanup and return -ERESTARTNOINTR. ++ * ++ * If 2, we may then block on trying to take the rt_mutex and return via: ++ * 5) successful lock ++ * 6) signal ++ * 7) timeout ++ * 8) other lock acquisition failure ++ * ++ * If 6, return -EWOULDBLOCK (restarting the syscall would do the same). ++ * ++ * If 4 or 7, we cleanup and return with -ETIMEDOUT. ++ * ++ * Return: ++ * - 0 - On success; ++ * - <0 - On error ++ */ ++static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, ++ u32 val, ktime_t *abs_time, u32 bitset, ++ u32 __user *uaddr2) ++{ ++ struct hrtimer_sleeper timeout, *to; ++ struct futex_pi_state *pi_state = NULL; ++ struct rt_mutex_waiter rt_waiter; ++ struct futex_hash_bucket *hb; ++ union futex_key key2 = FUTEX_KEY_INIT; ++ struct futex_q q = futex_q_init; ++ int res, ret; ++ ++ if (!IS_ENABLED(CONFIG_FUTEX_PI)) ++ return -ENOSYS; ++ ++ if (uaddr == uaddr2) ++ return -EINVAL; ++ ++ if (!bitset) ++ return -EINVAL; ++ ++ to = futex_setup_timer(abs_time, &timeout, flags, ++ current->timer_slack_ns); ++ ++ /* ++ * The waiter is allocated on our stack, manipulated by the requeue ++ * code while we sleep on uaddr. ++ */ ++ rt_mutex_init_waiter(&rt_waiter); ++ ++ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); ++ if (unlikely(ret != 0)) ++ goto out; ++ ++ q.bitset = bitset; ++ q.rt_waiter = &rt_waiter; ++ q.requeue_pi_key = &key2; ++ ++ /* ++ * Prepare to wait on uaddr. On success, increments q.key (key1) ref ++ * count. ++ */ ++ ret = futex_wait_setup(uaddr, val, flags, &q, &hb); ++ if (ret) ++ goto out; ++ ++ /* ++ * The check above which compares uaddrs is not sufficient for ++ * shared futexes. We need to compare the keys: ++ */ ++ if (match_futex(&q.key, &key2)) { ++ queue_unlock(hb); ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ /* Queue the futex_q, drop the hb lock, wait for wakeup. */ ++ futex_wait_queue_me(hb, &q, to); ++ ++ spin_lock(&hb->lock); ++ ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); ++ spin_unlock(&hb->lock); ++ if (ret) ++ goto out; ++ ++ /* ++ * In order for us to be here, we know our q.key == key2, and since ++ * we took the hb->lock above, we also know that futex_requeue() has ++ * completed and we no longer have to concern ourselves with a wakeup ++ * race with the atomic proxy lock acquisition by the requeue code. The ++ * futex_requeue dropped our key1 reference and incremented our key2 ++ * reference count. ++ */ ++ ++ /* Check if the requeue code acquired the second futex for us. */ ++ if (!q.rt_waiter) { ++ /* ++ * Got the lock. 
We might not be the anticipated owner if we
++ * did a lock-steal - fix up the PI-state in that case.
++ */
++ if (q.pi_state && (q.pi_state->owner != current)) {
++ spin_lock(q.lock_ptr);
++ ret = fixup_pi_state_owner(uaddr2, &q, current);
++ if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
++ pi_state = q.pi_state;
++ get_pi_state(pi_state);
++ }
++ /*
++ * Drop the reference to the pi state which
++ * the requeue_pi() code acquired for us.
++ */
++ put_pi_state(q.pi_state);
++ spin_unlock(q.lock_ptr);
++ }
++ } else {
++ struct rt_mutex *pi_mutex;
++
++ /*
++ * We have been woken up by futex_unlock_pi(), a timeout, or a
++ * signal. futex_unlock_pi() will not destroy the lock_ptr nor
++ * the pi_state.
++ */
++ WARN_ON(!q.pi_state);
++ pi_mutex = &q.pi_state->pi_mutex;
++ ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
++
++ spin_lock(q.lock_ptr);
++ if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
++ ret = 0;
++
++ debug_rt_mutex_free_waiter(&rt_waiter);
++ /*
++ * Fixup the pi_state owner and possibly acquire the lock if we
++ * haven't already.
++ */
++ res = fixup_owner(uaddr2, &q, !ret);
++ /*
++ * If fixup_owner() returned an error, propagate that. If it
++ * acquired the lock, clear -ETIMEDOUT or -EINTR.
++ */
++ if (res)
++ ret = (res < 0) ? res : 0;
++
++ /*
++ * If fixup_pi_state_owner() faulted and was unable to handle
++ * the fault, unlock the rt_mutex and return the fault to
++ * userspace.
++ */
++ if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
++ pi_state = q.pi_state;
++ get_pi_state(pi_state);
++ }
++
++ /* Unqueue and drop the lock. */
++ unqueue_me_pi(&q);
++ }
++
++ if (pi_state) {
++ rt_mutex_futex_unlock(&pi_state->pi_mutex);
++ put_pi_state(pi_state);
++ }
++
++ if (ret == -EINTR) {
++ /*
++ * We've already been requeued, but cannot restart by calling
++ * futex_lock_pi() directly. We could restart this syscall, but
++ * it would detect that the user space "val" changed and return
++ * -EWOULDBLOCK. Save the overhead of the restart and return
++ * -EWOULDBLOCK directly.
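++ *
++ * (Illustrative pairing: a pthread_cond_wait() style caller issues
++ * FUTEX_WAIT_REQUEUE_PI on the condvar word, while the signalling
++ * side issues FUTEX_CMP_REQUEUE_PI toward the PI mutex word; see
++ * the dispatch in do_futex1() below.)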
++ */ ++ ret = -EWOULDBLOCK; ++ } ++ ++out: ++ if (to) { ++ hrtimer_cancel(&to->timer); ++ destroy_hrtimer_on_stack(&to->timer); ++ } ++ return ret; ++} ++ ++static long do_futex1(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, ++ u32 __user *uaddr2, u32 val2, u32 val3) ++{ ++ int cmd = op & FUTEX_CMD_MASK; ++ unsigned int flags = 0; ++ ++ if (!(op & FUTEX_PRIVATE_FLAG)) ++ flags |= FLAGS_SHARED; ++ ++ if (op & FUTEX_CLOCK_REALTIME) { ++ flags |= FLAGS_CLOCKRT; ++ if (cmd != FUTEX_WAIT && cmd != FUTEX_WAIT_BITSET && \ ++ cmd != FUTEX_WAIT_REQUEUE_PI) ++ return -ENOSYS; ++ } ++ ++ switch (cmd) { ++ case FUTEX_LOCK_PI: ++ case FUTEX_UNLOCK_PI: ++ case FUTEX_TRYLOCK_PI: ++ case FUTEX_WAIT_REQUEUE_PI: ++ case FUTEX_CMP_REQUEUE_PI: ++ if (!futex_cmpxchg_enabled) ++ return -ENOSYS; ++ } ++ ++ switch (cmd) { ++ case FUTEX_WAIT: ++ val3 = FUTEX_BITSET_MATCH_ANY; ++ fallthrough; ++ case FUTEX_WAIT_BITSET: ++ return futex_wait(uaddr, flags, val, timeout, val3); ++ case FUTEX_WAKE: ++ val3 = FUTEX_BITSET_MATCH_ANY; ++ fallthrough; ++ case FUTEX_WAKE_BITSET: ++ return futex_wake(uaddr, flags, val, val3); ++ case FUTEX_REQUEUE: ++ return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); ++ case FUTEX_CMP_REQUEUE: ++ return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); ++ case FUTEX_WAKE_OP: ++ return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); ++ case FUTEX_LOCK_PI: ++ return futex_lock_pi(uaddr, flags, timeout, 0); ++ case FUTEX_UNLOCK_PI: ++ return futex_unlock_pi(uaddr, flags); ++ case FUTEX_TRYLOCK_PI: ++ return futex_lock_pi(uaddr, flags, NULL, 1); ++ case FUTEX_WAIT_REQUEUE_PI: ++ val3 = FUTEX_BITSET_MATCH_ANY; ++ return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, ++ uaddr2); ++ case FUTEX_CMP_REQUEUE_PI: ++ return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); ++ } ++ return -ENOSYS; ++} ++ ++ ++SYSCALL_DEFINE6(futex1, u32 __user *, uaddr, int, op, u32, val, ++ struct __kernel_timespec __user *, utime, u32 __user *, uaddr2, ++ u32, val3) ++{ ++ struct timespec64 ts; ++ ktime_t t, *tp = NULL; ++ u32 val2 = 0; ++ int cmd = op & FUTEX_CMD_MASK; ++ ++ if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || ++ cmd == FUTEX_WAIT_BITSET || ++ cmd == FUTEX_WAIT_REQUEUE_PI)) { ++ if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) ++ return -EFAULT; ++ if (get_timespec64(&ts, utime)) ++ return -EFAULT; ++ if (!timespec64_valid(&ts)) ++ return -EINVAL; ++ ++ t = timespec64_to_ktime(ts); ++ if (cmd == FUTEX_WAIT) ++ t = ktime_add_safe(ktime_get(), t); ++ tp = &t; ++ } ++ /* ++ * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*. ++ * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. ++ */ ++ if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || ++ cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) ++ val2 = (u32) (unsigned long) utime; ++ ++ return do_futex1(uaddr, op, val, tp, uaddr2, val2, val3); ++} ++ ++static void __init futex_detect_cmpxchg(void) ++{ ++#ifndef CONFIG_HAVE_FUTEX_CMPXCHG ++ u32 curval; ++ ++ /* ++ * This will fail and we want it. Some arch implementations do ++ * runtime detection of the futex_atomic_cmpxchg_inatomic() ++ * functionality. We want to know that before we call in any ++ * of the complex code paths. Also we want to prevent ++ * registration of robust lists in that case. NULL is ++ * guaranteed to fault and we get -EFAULT on functional ++ * implementation, the non-functional ones will return ++ * -ENOSYS. 
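++ *
++ * (do_futex1() above depends on this detection: the PI and
++ * requeue-PI commands return -ENOSYS while futex_cmpxchg_enabled
++ * remains 0.)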
++ */ ++ if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) ++ futex_cmpxchg_enabled = 1; ++#endif ++} ++ ++static int __init futex_init(void) ++{ ++ unsigned int futex_shift; ++ unsigned long i; ++ ++#if CONFIG_BASE_SMALL ++ futex_hashsize = 16; ++#else ++ futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); ++#endif ++ ++ futex_queues = alloc_large_system_hash("futex1", sizeof(*futex_queues), ++ futex_hashsize, 0, ++ futex_hashsize < 256 ? HASH_SMALL : 0, ++ &futex_shift, NULL, ++ futex_hashsize, futex_hashsize); ++ futex_hashsize = 1UL << futex_shift; ++ ++ futex_detect_cmpxchg(); ++ ++ for (i = 0; i < futex_hashsize; i++) { ++ atomic_set(&futex_queues[i].waiters, 0); ++ plist_head_init(&futex_queues[i].chain); ++ spin_lock_init(&futex_queues[i].lock); ++ } ++ ++ return 0; ++} ++core_initcall(futex_init); +diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c +index 3e1a713d3e57..b53a24a99a14 100644 +--- a/kernel/sys_ni.c ++++ b/kernel/sys_ni.c +@@ -153,6 +153,8 @@ COND_SYSCALL(futex_wait); + COND_SYSCALL(futex_wake); + COND_SYSCALL(futex_waitv); + ++COND_SYSCALL(futex1); ++ + /* kernel/hrtimer.c */ + + /* kernel/itimer.c */ +diff --git a/tools/arch/x86/include/asm/unistd_64.h b/tools/arch/x86/include/asm/unistd_64.h +index 4205ed4158bf..43de5a59ac1c 100644 +--- a/tools/arch/x86/include/asm/unistd_64.h ++++ b/tools/arch/x86/include/asm/unistd_64.h +@@ -17,3 +17,15 @@ + #ifndef __NR_setns + #define __NR_setns 308 + #endif ++ ++#ifndef __NR_futex_wait ++#define __NR_futex_wait 440 ++#endif ++ ++#ifndef __NR_futex_wake ++#define __NR_futex_wake 441 ++#endif ++ ++#ifndef __NR_futex1 ++#define __NR_futex1 442 ++#endif +diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h +index dd457de21bad..f737eaeecbb6 100644 +--- a/tools/include/uapi/asm-generic/unistd.h ++++ b/tools/include/uapi/asm-generic/unistd.h +@@ -862,11 +862,15 @@ __SYSCALL(__NR_faccessat2, sys_faccessat2) + + #define __NR_futex_wait 440 + __SYSCALL(__NR_futex_wait, sys_futex_wait) ++ + #define __NR_futex_wake 441 + __SYSCALL(__NR_futex_wake, sys_futex_wake) + ++#define __NR_futex1 442 ++__SYSCALL(__NR_futex1, sys_futex1) ++ + #undef __NR_syscalls +-#define __NR_syscalls 442 ++#define __NR_syscalls 443 + + /* + * 32 bit systems traditionally used different +diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +index f30d6ae9a688..1a516b081207 100644 +--- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl ++++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +@@ -361,6 +361,9 @@ + 437 common openat2 sys_openat2 + 438 common pidfd_getfd sys_pidfd_getfd + 439 common faccessat2 sys_faccessat2 ++440 common futex_wait sys_futex_wait ++441 common futex_wake sys_futex_wake ++442 common futex1 sys_futex1 + + # + # x32-specific system call numbers start at 512 to avoid cache impact +diff --git a/tools/perf/bench/futex.h b/tools/perf/bench/futex.h +index 31b53cc7d5bc..baf6a0d077ac 100644 +--- a/tools/perf/bench/futex.h ++++ b/tools/perf/bench/futex.h +@@ -8,10 +8,14 @@ + #ifndef _FUTEX_H + #define _FUTEX_H + ++//#define FUTEX1 0 ++#define UNUSED(x) (void)(x) ++ + #include + #include + #include + #include ++#include + + /** + * futex() - SYS_futex syscall wrapper +@@ -34,7 +38,13 @@ + * like-named arguments in the following wrappers except where noted below. 
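++ *
++ * (With this patch the wrapper below is pointed at the futex1
++ * syscall, and futex2_wait()/futex2_wake() wrap the new split
++ * futex_wait/futex_wake entry points; see the definitions that
++ * follow.)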
+ */ + #define futex(uaddr, op, val, timeout, uaddr2, val3, opflags) \ +- syscall(SYS_futex, uaddr, op | opflags, val, timeout, uaddr2, val3) ++ syscall(__NR_futex1, uaddr, op | opflags, val, timeout, uaddr2, val3) ++ ++#define futex2_wake(uaddr, nr, flags) \ ++ syscall(__NR_futex_wake, uaddr, nr, flags | FUTEX_32) ++ ++#define futex2_wait(uaddr, val, flags, timeout) \ ++ syscall(__NR_futex_wait, uaddr, val, flags | FUTEX_32, timeout) + + /** + * futex_wait() - block on uaddr with optional timeout +@@ -43,7 +53,13 @@ + static inline int + futex_wait(u_int32_t *uaddr, u_int32_t val, struct timespec *timeout, int opflags) + { ++#ifdef FUTEX1 + return futex(uaddr, FUTEX_WAIT, val, timeout, NULL, 0, opflags); ++#else ++ UNUSED(timeout); ++ UNUSED(opflags); ++ return futex2_wait(uaddr, val, 0, NULL); ++#endif + } + + /** +@@ -53,7 +69,12 @@ futex_wait(u_int32_t *uaddr, u_int32_t val, struct timespec *timeout, int opflag + static inline int + futex_wake(u_int32_t *uaddr, int nr_wake, int opflags) + { ++#ifdef FUTEX1 + return futex(uaddr, FUTEX_WAKE, nr_wake, NULL, NULL, 0, opflags); ++#else ++ UNUSED(opflags); ++ return futex2_wake(uaddr, nr_wake, 0); ++#endif + } + + /** +-- +2.28.0 + +From 2f5e38a4191ac6fd5040435f6a41433add3711a6 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Andr=C3=A9=20Almeida?= +Date: Thu, 15 Oct 2020 18:06:40 -0300 +Subject: [PATCH 07/13] futex2: Add support for shared futexes +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Add support for shared futexes for cross-process resources. + +Signed-off-by: André Almeida +--- + kernel/futex2.c | 169 +++++++++++++++++++++++++++++++++++++++++------- + 1 file changed, 146 insertions(+), 23 deletions(-) + +diff --git a/kernel/futex2.c b/kernel/futex2.c +index 4b782b5ef615..ae743ddf223e 100644 +--- a/kernel/futex2.c ++++ b/kernel/futex2.c +@@ -6,7 +6,9 @@ + */ + + #include ++#include + #include ++#include + #include + #include + #include +@@ -15,6 +17,7 @@ + + /** + * struct futex_waiter - List entry for a waiter ++ * @uaddr: Memory address of userspace futex + * @key.address: Memory address of userspace futex + * @key.mm: Pointer to memory management struct of this process + * @key: Stores information that uniquely identify a futex +@@ -25,6 +28,7 @@ + * @index: Index of waiter in futexv list + */ + struct futex_waiter { ++ uintptr_t uaddr; + struct futex_key { + uintptr_t address; + struct mm_struct *mm; +@@ -125,16 +129,109 @@ static inline int bucket_get_waiters(struct futex_bucket *bucket) + #endif + } + ++static u64 get_inode_sequence_number(struct inode *inode) ++{ ++ static atomic64_t i_seq; ++ u64 old; ++ ++ /* Does the inode already have a sequence number? 
*/ ++ old = atomic64_read(&inode->i_sequence); ++ if (likely(old)) ++ return old; ++ ++ for (;;) { ++ u64 new = atomic64_add_return(1, &i_seq); ++ if (WARN_ON_ONCE(!new)) ++ continue; ++ ++ old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new); ++ if (old) ++ return old; ++ return new; ++ } ++} ++ ++static int futex_get_shared_key(uintptr_t address, struct mm_struct *mm, ++ struct futex_key *key) ++{ ++ int err; ++ struct page *page, *tail; ++ struct address_space *mapping; ++ ++again: ++ err = get_user_pages_fast(address, 1, 0, &page); ++ ++ if (err < 0) ++ return err; ++ else ++ err = 0; ++ ++ ++ tail = page; ++ page = compound_head(page); ++ mapping = READ_ONCE(page->mapping); ++ ++ ++ if (unlikely(!mapping)) { ++ int shmem_swizzled; ++ ++ lock_page(page); ++ shmem_swizzled = PageSwapCache(page) || page->mapping; ++ unlock_page(page); ++ put_page(page); ++ ++ if (shmem_swizzled) ++ goto again; ++ ++ return -EFAULT; ++ } ++ ++ if (PageAnon(page)) { ++ ++ key->mm = mm; ++ key->address = address; ++ ++ } else { ++ struct inode *inode; ++ ++ rcu_read_lock(); ++ ++ if (READ_ONCE(page->mapping) != mapping) { ++ rcu_read_unlock(); ++ put_page(page); ++ ++ goto again; ++ } ++ ++ inode = READ_ONCE(mapping->host); ++ if (!inode) { ++ rcu_read_unlock(); ++ put_page(page); ++ ++ goto again; ++ } ++ ++ key->address = get_inode_sequence_number(inode); ++ key->mm = (struct mm_struct *) basepage_index(tail); ++ rcu_read_unlock(); ++ } ++ ++ put_page(page); ++ return err; ++} ++ + /** + * futex_get_bucket - Check if the user address is valid, prepare internal + * data and calculate the hash + * @uaddr: futex user address + * @key: data that uniquely identifies a futex ++ * @shared: is this a shared futex? + * + * Return: address of bucket on success, error code otherwise + */ + static struct futex_bucket *futex_get_bucket(void __user *uaddr, +- struct futex_key *key) ++ struct futex_key *key, ++ bool shared) + { + uintptr_t address = (uintptr_t) uaddr; + u32 hash_key; +@@ -145,8 +242,12 @@ static struct futex_bucket *futex_get_bucket(void __user *uaddr, + if (unlikely(!access_ok(address, sizeof(u32)))) + return ERR_PTR(-EFAULT); + +- key->address = address; +- key->mm = current->mm; ++ if (!shared) { ++ key->address = address; ++ key->mm = current->mm; ++ } else { ++ futex_get_shared_key(address, current->mm, key); ++ } + + /* Generate hash key for this futex using uaddr and current->mm */ + hash_key = jhash2((u32 *) key, sizeof(*key) / sizeof(u32), 0); +@@ -275,9 +376,10 @@ static int futex_dequeue_multiple(struct futexv *futexv, unsigned int nr) + * Return: 0 on success, error code otherwise + */ + static int futex_enqueue(struct futexv *futexv, unsigned int nr_futexes, +- unsigned int *awaken) ++ int *awaken) + { + int i, ret; ++ bool shared; + u32 uval, *uaddr, val; + struct futex_bucket *bucket; + +@@ -285,9 +387,13 @@ static int futex_enqueue(struct futexv *futexv, unsigned int nr_futexes, + set_current_state(TASK_INTERRUPTIBLE); + + for (i = 0; i < nr_futexes; i++) { +- uaddr = (u32 * __user) futexv->objects[i].key.address; ++ uaddr = (u32 * __user) futexv->objects[i].uaddr; + val = (u32) futexv->objects[i].val; +- bucket = futexv->objects[i].bucket; ++ shared = (futexv->objects[i].flags & FUTEX_SHARED_FLAG) ? 
true : false; ++ if (shared) ++ bucket = futex_get_bucket((void *) uaddr, &futexv->objects[i].key, true); ++ else ++ bucket = futexv->objects[i].bucket; + + bucket_inc_waiters(bucket); + spin_lock(&bucket->lock); +@@ -301,11 +407,14 @@ static int futex_enqueue(struct futexv *futexv, unsigned int nr_futexes, + __set_current_state(TASK_RUNNING); + *awaken = futex_dequeue_multiple(futexv, i); + ++ if (shared) ++ goto retry; ++ + if (__get_user(uval, uaddr)) + return -EFAULT; + + if (*awaken >= 0) +- return 0; ++ return 1; + + goto retry; + } +@@ -313,12 +422,14 @@ static int futex_enqueue(struct futexv *futexv, unsigned int nr_futexes, + if (uval != val) { + spin_unlock(&bucket->lock); + ++ + bucket_dec_waiters(bucket); + __set_current_state(TASK_RUNNING); + *awaken = futex_dequeue_multiple(futexv, i); + +- if (*awaken >= 0) +- return 0; ++ if (*awaken >= 0) { ++ return 1; ++ } + + return -EWOULDBLOCK; + } +@@ -336,19 +447,18 @@ static int __futex_wait(struct futexv *futexv, + struct hrtimer_sleeper *timeout) + { + int ret; +- unsigned int awaken = -1; + +- while (1) { +- ret = futex_enqueue(futexv, nr_futexes, &awaken); + +- if (ret < 0) +- break; ++ while (1) { ++ int awaken = -1; + +- if (awaken <= 0) { +- return awaken; ++ ret = futex_enqueue(futexv, nr_futexes, &awaken); ++ if (ret) { ++ if (awaken >= 0) ++ return awaken; ++ return ret; + } + +- + /* Before sleeping, check if someone was woken */ + if (!futexv->hint && (!timeout || timeout->task)) + freezable_schedule(); +@@ -419,6 +529,7 @@ static int futex_wait(struct futexv *futexv, unsigned int nr_futexes, + hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS); + } + ++ + ret = __futex_wait(futexv, nr_futexes, timo ? timeout : NULL); + + +@@ -438,9 +549,10 @@ static int futex_wait(struct futexv *futexv, unsigned int nr_futexes, + SYSCALL_DEFINE4(futex_wait, void __user *, uaddr, unsigned int, val, + unsigned int, flags, struct __kernel_timespec __user *, timo) + { ++ bool shared = (flags & FUTEX_SHARED_FLAG) ? true : false; + unsigned int size = flags & FUTEX_SIZE_MASK; +- struct hrtimer_sleeper timeout; + struct futex_single_waiter wait_single; ++ struct hrtimer_sleeper timeout; + struct futex_waiter *waiter; + struct futexv *futexv; + int ret; +@@ -452,6 +564,7 @@ SYSCALL_DEFINE4(futex_wait, void __user *, uaddr, unsigned int, val, + waiter = &wait_single.waiter; + waiter->index = 0; + waiter->val = val; ++ waiter->uaddr = (uintptr_t) uaddr; + + INIT_LIST_HEAD(&waiter->list); + +@@ -462,11 +575,14 @@ SYSCALL_DEFINE4(futex_wait, void __user *, uaddr, unsigned int, val, + return -EINVAL; + + /* Get an unlocked hash bucket */ +- waiter->bucket = futex_get_bucket(uaddr, &waiter->key); +- if (IS_ERR(waiter->bucket)) ++ waiter->bucket = futex_get_bucket(uaddr, &waiter->key, shared); ++ if (IS_ERR(waiter->bucket)) { + return PTR_ERR(waiter->bucket); ++ } + + ret = futex_wait(futexv, 1, timo, &timeout, flags); ++ if (ret > 0) ++ ret = 0; + + return ret; + } +@@ -486,8 +602,10 @@ static int futex_parse_waitv(struct futexv *futexv, + struct futex_waitv waitv; + unsigned int i; + struct futex_bucket *bucket; ++ bool shared; + + for (i = 0; i < nr_futexes; i++) { ++ + if (copy_from_user(&waitv, &uwaitv[i], sizeof(waitv))) + return -EFAULT; + +@@ -495,8 +613,10 @@ static int futex_parse_waitv(struct futexv *futexv, + (waitv.flags & FUTEX_SIZE_MASK) != FUTEX_32) + return -EINVAL; + ++ shared = (waitv.flags & FUTEX_SHARED_FLAG) ? 
true : false; ++ + bucket = futex_get_bucket(waitv.uaddr, +- &futexv->objects[i].key); ++ &futexv->objects[i].key, shared); + if (IS_ERR(bucket)) + return PTR_ERR(bucket); + +@@ -505,6 +625,7 @@ static int futex_parse_waitv(struct futexv *futexv, + futexv->objects[i].flags = waitv.flags; + futexv->objects[i].index = i; + INIT_LIST_HEAD(&futexv->objects[i].list); ++ futexv->objects[i].uaddr = (uintptr_t) waitv.uaddr; + } + + return 0; +@@ -573,6 +694,7 @@ static struct futexv *futex_get_parent(uintptr_t waiter, u8 index) + SYSCALL_DEFINE3(futex_wake, void __user *, uaddr, unsigned int, nr_wake, + unsigned int, flags) + { ++ bool shared = (flags & FUTEX_SHARED_FLAG) ? true : false; + unsigned int size = flags & FUTEX_SIZE_MASK; + struct futex_waiter waiter, *aux, *tmp; + struct futex_bucket *bucket; +@@ -586,9 +708,10 @@ SYSCALL_DEFINE3(futex_wake, void __user *, uaddr, unsigned int, nr_wake, + if (size != FUTEX_32) + return -EINVAL; + +- bucket = futex_get_bucket(uaddr, &waiter.key); +- if (IS_ERR(bucket)) ++ bucket = futex_get_bucket(uaddr, &waiter.key, shared); ++ if (IS_ERR(bucket)) { + return PTR_ERR(bucket); ++ } + + if (!bucket_get_waiters(bucket)) + return 0; +-- +2.28.0 + +From 909eb056421668b5d42f8c4dfa92339851a43dd8 Mon Sep 17 00:00:00 2001 +From: Gabriel Krisman Bertazi +Date: Mon, 2 Nov 2020 18:41:38 -0500 +Subject: [PATCH 08/13] Revert "futex: Remove needless goto's" + +This reverts commit d7c5ed73b19c4640426d9c106f70ec2cb532034d. +--- + kernel/futex.c | 40 ++++++++++++++++++++++++---------------- + 1 file changed, 24 insertions(+), 16 deletions(-) + +diff --git a/kernel/futex.c b/kernel/futex.c +index 6c00c0952313..a671d371b11f 100644 +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -1593,13 +1593,13 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) + + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ); + if (unlikely(ret != 0)) +- return ret; ++ goto out; + + hb = hash_futex(&key); + + /* Make sure we really have tasks to wakeup */ + if (!hb_waiters_pending(hb)) +- return ret; ++ goto out; + + spin_lock(&hb->lock); + +@@ -1622,6 +1622,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) + + spin_unlock(&hb->lock); + wake_up_q(&wake_q); ++out: + return ret; + } + +@@ -1688,10 +1689,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, + retry: + ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); + if (unlikely(ret != 0)) +- return ret; ++ goto out; + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); + if (unlikely(ret != 0)) +- return ret; ++ goto out; + + hb1 = hash_futex(&key1); + hb2 = hash_futex(&key2); +@@ -1709,13 +1710,13 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, + * an MMU, but we might get them from range checking + */ + ret = op_ret; +- return ret; ++ goto out; + } + + if (op_ret == -EFAULT) { + ret = fault_in_user_writeable(uaddr2); + if (ret) +- return ret; ++ goto out; + } + + if (!(flags & FLAGS_SHARED)) { +@@ -1758,6 +1759,7 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, + out_unlock: + double_unlock_hb(hb1, hb2); + wake_up_q(&wake_q); ++out: + return ret; + } + +@@ -1964,18 +1966,20 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, + retry: + ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); + if (unlikely(ret != 0)) +- return ret; ++ goto out; + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, + requeue_pi ? 
FUTEX_WRITE : FUTEX_READ); + if (unlikely(ret != 0)) +- return ret; ++ goto out; + + /* + * The check above which compares uaddrs is not sufficient for + * shared futexes. We need to compare the keys: + */ +- if (requeue_pi && match_futex(&key1, &key2)) +- return -EINVAL; ++ if (requeue_pi && match_futex(&key1, &key2)) { ++ ret = -EINVAL; ++ goto out; ++ } + + hb1 = hash_futex(&key1); + hb2 = hash_futex(&key2); +@@ -1995,7 +1999,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, + + ret = get_user(curval, uaddr1); + if (ret) +- return ret; ++ goto out; + + if (!(flags & FLAGS_SHARED)) + goto retry_private; +@@ -2061,7 +2065,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, + ret = fault_in_user_writeable(uaddr2); + if (!ret) + goto retry; +- return ret; ++ goto out; + case -EBUSY: + case -EAGAIN: + /* +@@ -2180,6 +2184,8 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, + double_unlock_hb(hb1, hb2); + wake_up_q(&wake_q); + hb_waiters_dec(hb2); ++ ++out: + return ret ? ret : task_count; + } + +@@ -2537,7 +2543,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) + */ + if (q->pi_state->owner != current) + ret = fixup_pi_state_owner(uaddr, q, current); +- return ret ? ret : locked; ++ goto out; + } + + /* +@@ -2550,7 +2556,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) + */ + if (q->pi_state->owner == current) { + ret = fixup_pi_state_owner(uaddr, q, NULL); +- return ret; ++ goto out; + } + + /* +@@ -2564,7 +2570,8 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) + q->pi_state->owner); + } + +- return ret; ++out: ++ return ret ? ret : locked; + } + + /** +@@ -2661,7 +2668,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, + + ret = get_user(uval, uaddr); + if (ret) +- return ret; ++ goto out; + + if (!(flags & FLAGS_SHARED)) + goto retry_private; +@@ -2674,6 +2681,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, + ret = -EWOULDBLOCK; + } + ++out: + return ret; + } + +-- +2.28.0 + +From fee513186b69c4a65534fd790545877974ef17d3 Mon Sep 17 00:00:00 2001 +From: Gabriel Krisman Bertazi +Date: Mon, 2 Nov 2020 18:41:54 -0500 +Subject: [PATCH 09/13] Revert "futex: Remove put_futex_key()" + +This reverts commit 9180bd467f9abdb44afde650d07e3b9dd66d837c. 
+--- + kernel/futex.c | 61 ++++++++++++++++++++++++++++++++++++++++---------- + 1 file changed, 49 insertions(+), 12 deletions(-) + +diff --git a/kernel/futex.c b/kernel/futex.c +index a671d371b11f..647de692c874 100644 +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -661,6 +661,10 @@ static int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, + return err; + } + ++static inline void put_futex_key(union futex_key *key) ++{ ++} ++ + /** + * fault_in_user_writeable() - Fault in user address and verify RW access + * @uaddr: pointer to faulting user space address +@@ -1599,7 +1603,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) + + /* Make sure we really have tasks to wakeup */ + if (!hb_waiters_pending(hb)) +- goto out; ++ goto out_put_key; + + spin_lock(&hb->lock); + +@@ -1622,6 +1626,8 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) + + spin_unlock(&hb->lock); + wake_up_q(&wake_q); ++out_put_key: ++ put_futex_key(&key); + out: + return ret; + } +@@ -1692,7 +1698,7 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, + goto out; + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); + if (unlikely(ret != 0)) +- goto out; ++ goto out_put_key1; + + hb1 = hash_futex(&key1); + hb2 = hash_futex(&key2); +@@ -1710,13 +1716,13 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, + * an MMU, but we might get them from range checking + */ + ret = op_ret; +- goto out; ++ goto out_put_keys; + } + + if (op_ret == -EFAULT) { + ret = fault_in_user_writeable(uaddr2); + if (ret) +- goto out; ++ goto out_put_keys; + } + + if (!(flags & FLAGS_SHARED)) { +@@ -1724,6 +1730,8 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, + goto retry_private; + } + ++ put_futex_key(&key2); ++ put_futex_key(&key1); + cond_resched(); + goto retry; + } +@@ -1759,6 +1767,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, + out_unlock: + double_unlock_hb(hb1, hb2); + wake_up_q(&wake_q); ++out_put_keys: ++ put_futex_key(&key2); ++out_put_key1: ++ put_futex_key(&key1); + out: + return ret; + } +@@ -1970,7 +1982,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, + requeue_pi ? 
FUTEX_WRITE : FUTEX_READ); + if (unlikely(ret != 0)) +- goto out; ++ goto out_put_key1; + + /* + * The check above which compares uaddrs is not sufficient for +@@ -1978,7 +1990,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, + */ + if (requeue_pi && match_futex(&key1, &key2)) { + ret = -EINVAL; +- goto out; ++ goto out_put_keys; + } + + hb1 = hash_futex(&key1); +@@ -1999,11 +2011,13 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, + + ret = get_user(curval, uaddr1); + if (ret) +- goto out; ++ goto out_put_keys; + + if (!(flags & FLAGS_SHARED)) + goto retry_private; + ++ put_futex_key(&key2); ++ put_futex_key(&key1); + goto retry; + } + if (curval != *cmpval) { +@@ -2062,6 +2076,8 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, + case -EFAULT: + double_unlock_hb(hb1, hb2); + hb_waiters_dec(hb2); ++ put_futex_key(&key2); ++ put_futex_key(&key1); + ret = fault_in_user_writeable(uaddr2); + if (!ret) + goto retry; +@@ -2076,6 +2092,8 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, + */ + double_unlock_hb(hb1, hb2); + hb_waiters_dec(hb2); ++ put_futex_key(&key2); ++ put_futex_key(&key1); + /* + * Handle the case where the owner is in the middle of + * exiting. Wait for the exit to complete otherwise +@@ -2185,6 +2203,10 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, + wake_up_q(&wake_q); + hb_waiters_dec(hb2); + ++out_put_keys: ++ put_futex_key(&key2); ++out_put_key1: ++ put_futex_key(&key1); + out: + return ret ? ret : task_count; + } +@@ -2673,6 +2695,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, + if (!(flags & FLAGS_SHARED)) + goto retry_private; + ++ put_futex_key(&q->key); + goto retry; + } + +@@ -2682,6 +2705,8 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, + } + + out: ++ if (ret) ++ put_futex_key(&q->key); + return ret; + } + +@@ -2826,6 +2851,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, + * - EAGAIN: The user space value changed. + */ + queue_unlock(hb); ++ put_futex_key(&q.key); + /* + * Handle the case where the owner is in the middle of + * exiting. 
Wait for the exit to complete otherwise
+@@ -2933,11 +2959,13 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
+ put_pi_state(pi_state);
+ }
+
+- goto out;
++ goto out_put_key;
+
+ out_unlock_put_key:
+ queue_unlock(hb);
+
++out_put_key:
++ put_futex_key(&q.key);
+ out:
+ if (to) {
+ hrtimer_cancel(&to->timer);
+@@ -2950,11 +2978,12 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
+
+ ret = fault_in_user_writeable(uaddr);
+ if (ret)
+- goto out;
++ goto out_put_key;
+
+ if (!(flags & FLAGS_SHARED))
+ goto retry_private;
+
++ put_futex_key(&q.key);
+ goto retry;
+ }
+
+@@ -3083,13 +3112,16 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
+ out_unlock:
+ spin_unlock(&hb->lock);
+ out_putkey:
++ put_futex_key(&key);
+ return ret;
+
+ pi_retry:
++ put_futex_key(&key);
+ cond_resched();
+ goto retry;
+
+ pi_faulted:
++ put_futex_key(&key);
+
+ ret = fault_in_user_writeable(uaddr);
+ if (!ret)
+@@ -3231,7 +3263,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
+ */
+ ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
+ if (ret)
+- goto out;
++ goto out_key2;
+
+ /*
+ * The check above which compares uaddrs is not sufficient for
+@@ -3240,7 +3272,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
+ if (match_futex(&q.key, &key2)) {
+ queue_unlock(hb);
+ ret = -EINVAL;
+- goto out;
++ goto out_put_keys;
+ }
+
+ /* Queue the futex_q, drop the hb lock, wait for wakeup. */
+@@ -3250,7 +3282,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
+ ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
+ spin_unlock(&hb->lock);
+ if (ret)
+- goto out;
++ goto out_put_keys;
+
+ /*
+ * In order for us to be here, we know our q.key == key2, and since
+@@ -3340,6 +3372,11 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
+ ret = -EWOULDBLOCK;
+ }
+
++out_put_keys:
++ put_futex_key(&q.key);
++out_key2:
++ put_futex_key(&key2);
++
+ out:
+ if (to) {
+ hrtimer_cancel(&to->timer);
+--
+2.28.0
+
+From 3b1489448a277fc1c34ca12e859193c3a7f3446c Mon Sep 17 00:00:00 2001
+From: Gabriel Krisman Bertazi
+Date: Fri, 12 Jul 2019 14:16:20 -0400
+Subject: [PATCH 10/13] futex: Split key setup from key queue locking and read
+
+Split the futex key setup from the queue locking and key reading. This
+is useful to support the setup of multiple keys at the same time, like
+what is done in futex_requeue() and what will be done for the
+FUTEX_WAIT_MULTIPLE command.
+
+Signed-off-by: Gabriel Krisman Bertazi
+---
+ kernel/futex.c | 71 +++++++++++++++++++++++++++++---------------------
+ 1 file changed, 42 insertions(+), 29 deletions(-)
+
+diff --git a/kernel/futex.c b/kernel/futex.c
+index 647de692c874..f05349def492 100644
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -2634,6 +2634,39 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
+ __set_current_state(TASK_RUNNING);
+ }
+
++static int __futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
++ struct futex_q *q, struct futex_hash_bucket **hb)
++{
++
++ u32 uval;
++ int ret;
++
++retry_private:
++ *hb = queue_lock(q);
++
++ ret = get_futex_value_locked(&uval, uaddr);
++
++ if (ret) {
++ queue_unlock(*hb);
++
++ ret = get_user(uval, uaddr);
++ if (ret)
++ return ret;
++
++ if (!(flags & FLAGS_SHARED))
++ goto retry_private;
++
++ return 1;
++ }
++
++ if (uval != val) {
++ queue_unlock(*hb);
++ ret = -EWOULDBLOCK;
++ }
++
++ return ret;
++}
++
+ /**
+ * futex_wait_setup() - Prepare to wait on a futex
+ * @uaddr: the futex userspace address
+@@ -2654,7 +2687,6 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
+ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
+ struct futex_q *q, struct futex_hash_bucket **hb)
+ {
+- u32 uval;
+ int ret;
+
+ /*
+@@ -2675,38 +2707,19 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
+ * absorb a wakeup if *uaddr does not match the desired values
+ * while the syscall executes.
+ */
+-retry:
+- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ);
+- if (unlikely(ret != 0))
+- return ret;
+-
+-retry_private:
+- *hb = queue_lock(q);
++ do {
++ ret = get_futex_key(uaddr, flags & FLAGS_SHARED,
++ &q->key, FUTEX_READ);
++ if (unlikely(ret != 0))
++ return ret;
+
+- ret = get_futex_value_locked(&uval, uaddr);
++ ret = __futex_wait_setup(uaddr, val, flags, q, hb);
+
+- if (ret) {
+- queue_unlock(*hb);
+-
+- ret = get_user(uval, uaddr);
++ /* Drop key reference if retry or error. */
+ if (ret)
+- goto out;
++ put_futex_key(&q->key);
+ } while (ret > 0);
+
+- if (!(flags & FLAGS_SHARED))
+- goto retry_private;
+-
+- put_futex_key(&q->key);
+- goto retry;
+- }
+-
+- if (uval != val) {
+- queue_unlock(*hb);
+- ret = -EWOULDBLOCK;
+- }
+-
+-out:
+- if (ret)
+- put_futex_key(&q->key)
+ return ret;
+ }
+
+--
+2.28.0
+
+From 539862895e53b9a774f3a2271d1e7db57879d0d7 Mon Sep 17 00:00:00 2001
+From: Gabriel Krisman Bertazi
+Date: Mon, 8 Jul 2019 09:44:09 -0400
+Subject: [PATCH 11/13] futex: Implement FUTEX_WAIT_MULTIPLE
+
+This is a new futex operation to allow a thread to wait on several
+futexes at the same time, and wake up on any of them. In a sense, it
+implements one of the features that was supported by polling on the old
+FUTEX_FD interface.
+
+My use case for this feature lies in Wine, where we want to implement a
+similar function available in Windows, mainly for event handling. The
+Wine folks have an implementation of the userspace side using eventfd,
+but it suffers from bad performance, as shown in the measurements below.
+
+Technically, the old FUTEX_WAIT implementation can be easily
+reimplemented using do_futex_wait_multiple, with a count of one, and I
+have a patch demonstrating how it works. I'm not proposing it, since
+futex is such tricky code that I'd be more comfortable having
+FUTEX_WAIT_MULTIPLE run upstream for a couple of development cycles,
+before considering modifying FUTEX_WAIT.
+
+This was tested using three mechanisms:
+
+1) By reimplementing FUTEX_WAIT in terms of FUTEX_WAIT_MULTIPLE and
+running tools/testing/selftests/futex and a full Linux distro on top of
+this kernel.
+
+2) By example code that exercises the FUTEX_WAIT_MULTIPLE path in a
+multi-threaded, event-handling setup.
+
+3) By running the Wine fsync implementation and executing multi-threaded
+applications, in particular modern games, on top of the implementation.
+
+Signed-off-by: Zebediah Figura
+Signed-off-by: Steven Noonan
+Signed-off-by: Pierre-Loup A. Griffais
+Signed-off-by: Gabriel Krisman Bertazi
+---
+ include/uapi/linux/futex.h | 7 ++
+ kernel/futex.c | 159 ++++++++++++++++++++++++++++++++++++++-
+ 2 files changed, 162 insertions(+), 4 deletions(-)
+
+diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
+index 35a5bf1cd41b..aefb0b83b784 100644
+--- a/include/uapi/linux/futex.h
++++ b/include/uapi/linux/futex.h
+@@ -21,6 +21,7 @@
+ #define FUTEX_WAKE_BITSET 10
+ #define FUTEX_WAIT_REQUEUE_PI 11
+ #define FUTEX_CMP_REQUEUE_PI 12
++#define FUTEX_WAIT_MULTIPLE 13
+
+ #define FUTEX_PRIVATE_FLAG 128
+ #define FUTEX_CLOCK_REALTIME 256
+@@ -190,4 +191,10 @@ struct robust_list_head {
+ (((op & 0xf) << 28) | ((cmp & 0xf) << 24) \
+ | ((oparg & 0xfff) << 12) | (cmparg & 0xfff))
+
++struct futex_wait_block {
++ __u32 __user *uaddr;
++ __u32 val;
++ __u32 bitset;
++};
++
+ #endif /* _UAPI_LINUX_FUTEX_H */
+diff --git a/kernel/futex.c b/kernel/futex.c
+index f05349def492..775f780a96c4 100644
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -166,6 +166,7 @@ static int __read_mostly futex_cmpxchg_enabled;
+ #endif
+ #define FLAGS_CLOCKRT 0x02
+ #define FLAGS_HAS_TIMEOUT 0x04
++#define FLAGS_WAKE_MULTIPLE 0x08
+
+ /*
+ * Priority Inheritance state:
+@@ -2723,6 +2724,148 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
+ return ret;
+ }
+
++static int do_futex_wait_multiple(struct futex_wait_block *wb,
++ u32 count, unsigned int flags,
++ ktime_t *abs_time)
++{
++
++ struct hrtimer_sleeper timeout, *to;
++ struct futex_hash_bucket *hb;
++ struct futex_q *qs = NULL;
++ int ret;
++ int i;
++
++ qs = kcalloc(count, sizeof(struct futex_q), GFP_KERNEL);
++ if (!qs)
++ return -ENOMEM;
++
++ to = futex_setup_timer(abs_time, &timeout, flags,
++ current->timer_slack_ns);
++ retry:
++ for (i = 0; i < count; i++) {
++ qs[i].key = FUTEX_KEY_INIT;
++ qs[i].bitset = wb[i].bitset;
++
++ ret = get_futex_key(wb[i].uaddr, flags & FLAGS_SHARED,
++ &qs[i].key, FUTEX_READ);
++ if (unlikely(ret != 0)) {
++ for (--i; i >= 0; i--)
++ put_futex_key(&qs[i].key);
++ goto out;
++ }
++ }
++
++ set_current_state(TASK_INTERRUPTIBLE);
++
++ for (i = 0; i < count; i++) {
++ ret = __futex_wait_setup(wb[i].uaddr, wb[i].val,
++ flags, &qs[i], &hb);
++ if (ret) {
++ /* Drop the failed key directly. Keys 0..(i-1)
++ * will be put by unqueue_me. */
++ put_futex_key(&qs[i].key);
++
++ /* Undo the partial work we did. */
++ for (--i; i >= 0; i--)
++ unqueue_me(&qs[i]);
++
++ __set_current_state(TASK_RUNNING);
++ if (ret > 0)
++ goto retry;
++ goto out;
++ }
++
++ /* We can't hold the bucket lock when dealing with
++ * the next futex. Queue ourselves now so we can unlock
++ * it before moving on. */
++ queue_me(&qs[i], hb);
++ }
++
++ if (to)
++ hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
++
++ /* There is no easy way to check if we were woken already on
++ * one of the multiple futexes without walking through each one
++ * of them. So just sleep and let the scheduler handle it.
++ */ ++ if (!to || to->task) ++ freezable_schedule(); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ ret = -ETIMEDOUT; ++ /* If we were woken (and unqueued), we succeeded. */ ++ for (i = 0; i < count; i++) ++ if (!unqueue_me(&qs[i])) ++ ret = i; ++ ++ /* Succeed wakeup */ ++ if (ret >= 0) ++ goto out; ++ ++ /* Woken by triggered timeout */ ++ if (to && !to->task) ++ goto out; ++ ++ /* ++ * We expect signal_pending(current), but we might be the ++ * victim of a spurious wakeup as well. ++ */ ++ if (!signal_pending(current)) ++ goto retry; ++ ++ ret = -ERESTARTSYS; ++ if (!abs_time) ++ goto out; ++ ++ ret = -ERESTART_RESTARTBLOCK; ++ out: ++ if (to) { ++ hrtimer_cancel(&to->timer); ++ destroy_hrtimer_on_stack(&to->timer); ++ } ++ ++ kfree(qs); ++ return ret; ++} ++ ++static int futex_wait_multiple(u32 __user *uaddr, unsigned int flags, ++ u32 count, ktime_t *abs_time) ++{ ++ struct futex_wait_block *wb; ++ struct restart_block *restart; ++ int ret; ++ ++ if (!count) ++ return -EINVAL; ++ ++ wb = kcalloc(count, sizeof(struct futex_wait_block), GFP_KERNEL); ++ if (!wb) ++ return -ENOMEM; ++ ++ if (copy_from_user(wb, uaddr, ++ count * sizeof(struct futex_wait_block))) { ++ ret = -EFAULT; ++ goto out; ++ } ++ ++ ret = do_futex_wait_multiple(wb, count, flags, abs_time); ++ ++ if (ret == -ERESTART_RESTARTBLOCK) { ++ restart = ¤t->restart_block; ++ restart->fn = futex_wait_restart; ++ restart->futex.uaddr = uaddr; ++ restart->futex.val = count; ++ restart->futex.time = *abs_time; ++ restart->futex.flags = (flags | FLAGS_HAS_TIMEOUT | ++ FLAGS_WAKE_MULTIPLE); ++ } ++ ++out: ++ kfree(wb); ++ return ret; ++} ++ + static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, + ktime_t *abs_time, u32 bitset) + { +@@ -2800,6 +2943,10 @@ static long futex_wait_restart(struct restart_block *restart) + } + restart->fn = do_no_restart_syscall; + ++ if (restart->futex.flags & FLAGS_WAKE_MULTIPLE) ++ return (long)futex_wait_multiple(uaddr, restart->futex.flags, ++ restart->futex.val, tp); ++ + return (long)futex_wait(uaddr, restart->futex.flags, + restart->futex.val, tp, restart->futex.bitset); + } +@@ -3843,6 +3990,8 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, + uaddr2); + case FUTEX_CMP_REQUEUE_PI: + return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); ++ case FUTEX_WAIT_MULTIPLE: ++ return futex_wait_multiple(uaddr, flags, val, timeout); + } + return -ENOSYS; + } +@@ -3859,7 +4008,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + + if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || + cmd == FUTEX_WAIT_BITSET || +- cmd == FUTEX_WAIT_REQUEUE_PI)) { ++ cmd == FUTEX_WAIT_REQUEUE_PI || ++ cmd == FUTEX_WAIT_MULTIPLE)) { + if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) + return -EFAULT; + if (get_timespec64(&ts, utime)) +@@ -3868,7 +4018,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + return -EINVAL; + + t = timespec64_to_ktime(ts); +- if (cmd == FUTEX_WAIT) ++ if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) + t = ktime_add_safe(ktime_get(), t); + else if (!(op & FUTEX_CLOCK_REALTIME)) + t = timens_ktime_to_host(CLOCK_MONOTONIC, t); +@@ -4055,14 +4205,15 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, + + if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || + cmd == FUTEX_WAIT_BITSET || +- cmd == FUTEX_WAIT_REQUEUE_PI)) { ++ cmd == FUTEX_WAIT_REQUEUE_PI || ++ cmd == FUTEX_WAIT_MULTIPLE)) { + if (get_old_timespec32(&ts, utime)) + return -EFAULT; + if 
(!timespec64_valid(&ts)) + return -EINVAL; + + t = timespec64_to_ktime(ts); +- if (cmd == FUTEX_WAIT) ++ if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) + t = ktime_add_safe(ktime_get(), t); + else if (!(op & FUTEX_CLOCK_REALTIME)) + t = timens_ktime_to_host(CLOCK_MONOTONIC, t); +-- +2.28.0 + +From f56b85af005d46e9ef920a6728e61f7c47cf561e Mon Sep 17 00:00:00 2001 +From: Gabriel Krisman Bertazi +Date: Mon, 2 Nov 2020 18:50:26 -0500 +Subject: [PATCH 12/13] futex: Change WAIT_MULTIPLE opcode to 31 + +Signed-off-by: Gabriel Krisman Bertazi +--- + include/uapi/linux/futex.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h +index aefb0b83b784..fe2b67ac0c5e 100644 +--- a/include/uapi/linux/futex.h ++++ b/include/uapi/linux/futex.h +@@ -21,7 +21,7 @@ + #define FUTEX_WAKE_BITSET 10 + #define FUTEX_WAIT_REQUEUE_PI 11 + #define FUTEX_CMP_REQUEUE_PI 12 +-#define FUTEX_WAIT_MULTIPLE 13 ++#define FUTEX_WAIT_MULTIPLE 31 + + #define FUTEX_PRIVATE_FLAG 128 + #define FUTEX_CLOCK_REALTIME 256 +-- +2.28.0 + +From 022e2f888a50fb8d062e26bc385abf02c0be84a3 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Andr=C3=A9=20Almeida?= +Date: Mon, 16 Nov 2020 21:22:21 -0300 +Subject: [PATCH 13/13] futex2: Add sysfs entry for syscall numbers +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Signed-off-by: André Almeida +--- + kernel/futex2.c | 42 ++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 42 insertions(+) + +diff --git a/kernel/futex2.c b/kernel/futex2.c +index ae743ddf223e..4bdff8bfc78d 100644 +--- a/kernel/futex2.c ++++ b/kernel/futex2.c +@@ -742,6 +742,48 @@ SYSCALL_DEFINE3(futex_wake, void __user *, uaddr, unsigned int, nr_wake, + return ret; + } + ++static ssize_t wait_show(struct kobject *kobj, struct kobj_attribute *attr, ++ char *buf) ++{ ++ return sprintf(buf, "%u\n", __NR_futex_wait); ++ ++} ++static struct kobj_attribute futex2_wait_attr = __ATTR_RO(wait); ++ ++static ssize_t wake_show(struct kobject *kobj, struct kobj_attribute *attr, ++ char *buf) ++{ ++ return sprintf(buf, "%u\n", __NR_futex_wake); ++ ++} ++static struct kobj_attribute futex2_wake_attr = __ATTR_RO(wake); ++ ++static ssize_t waitv_show(struct kobject *kobj, struct kobj_attribute *attr, ++ char *buf) ++{ ++ return sprintf(buf, "%u\n", __NR_futex_waitv); ++ ++} ++static struct kobj_attribute futex2_waitv_attr = __ATTR_RO(waitv); ++ ++static struct attribute *futex2_sysfs_attrs[] = { ++ &futex2_wait_attr.attr, ++ &futex2_wake_attr.attr, ++ &futex2_waitv_attr.attr, ++ NULL, ++}; ++ ++static const struct attribute_group futex2_sysfs_attr_group = { ++ .attrs = futex2_sysfs_attrs, ++ .name = "futex2", ++}; ++ ++static int __init futex2_sysfs_init(void) ++{ ++ return sysfs_create_group(kernel_kobj, &futex2_sysfs_attr_group); ++} ++subsys_initcall(futex2_sysfs_init); ++ + static int __init futex2_init(void) + { + int i; +-- +2.28.0 + diff --git a/SOURCES/kernel-aarch64-debug-fedora.config b/SOURCES/kernel-aarch64-debug-fedora.config index 242af30..020e8d8 100644 --- a/SOURCES/kernel-aarch64-debug-fedora.config +++ b/SOURCES/kernel-aarch64-debug-fedora.config @@ -8005,4 +8005,4 @@ CONFIG_ZYNQMP_POWER=y # This option determines the default init for the system if no init= # warnings from C=1 sparse checker or -Wextra compilations. 
It has CONFIG_ZENIFY=y -# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set \ No newline at end of file +CONFIG_FUTEX2=y diff --git a/SOURCES/kernel-aarch64-debug-rhel.config b/SOURCES/kernel-aarch64-debug-rhel.config index 44d74da..f561a09 100644 --- a/SOURCES/kernel-aarch64-debug-rhel.config +++ b/SOURCES/kernel-aarch64-debug-rhel.config @@ -6271,4 +6271,4 @@ CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD=y # CONFIG_ZSWAP_ZPOOL_DEFAULT_ZSMALLOC is not set # CONFIG_ZX_TDM is not set CONFIG_ZENIFY=y -# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set +CONFIG_FUTEX2=y diff --git a/SOURCES/kernel-aarch64-fedora.config b/SOURCES/kernel-aarch64-fedora.config index 85dd36b..0ed3d0c 100644 --- a/SOURCES/kernel-aarch64-fedora.config +++ b/SOURCES/kernel-aarch64-fedora.config @@ -7983,4 +7983,4 @@ CONFIG_ZYNQMP_POWER=y # This option determines the default init for the system if no init= # warnings from C=1 sparse checker or -Wextra compilations. It has CONFIG_ZENIFY=y -# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set \ No newline at end of file +CONFIG_FUTEX2=y diff --git a/SOURCES/kernel-aarch64-rhel.config b/SOURCES/kernel-aarch64-rhel.config index 20018cd..2467e8a 100644 --- a/SOURCES/kernel-aarch64-rhel.config +++ b/SOURCES/kernel-aarch64-rhel.config @@ -6250,4 +6250,4 @@ CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD=y # CONFIG_ZSWAP_ZPOOL_DEFAULT_ZSMALLOC is not set # CONFIG_ZX_TDM is not set CONFIG_ZENIFY=y -# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set +CONFIG_FUTEX2=y diff --git a/SOURCES/kernel-armv7hl-debug-fedora.config b/SOURCES/kernel-armv7hl-debug-fedora.config index 96fe756..51f6e9e 100644 --- a/SOURCES/kernel-armv7hl-debug-fedora.config +++ b/SOURCES/kernel-armv7hl-debug-fedora.config @@ -8303,4 +8303,4 @@ CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD=y # This option determines the default init for the system if no init= # warnings from C=1 sparse checker or -Wextra compilations. It has CONFIG_ZENIFY=y -# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set \ No newline at end of file +CONFIG_FUTEX2=y diff --git a/SOURCES/kernel-armv7hl-fedora.config b/SOURCES/kernel-armv7hl-fedora.config index f7803eb..3d4994a 100644 --- a/SOURCES/kernel-armv7hl-fedora.config +++ b/SOURCES/kernel-armv7hl-fedora.config @@ -8282,4 +8282,4 @@ CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD=y # This option determines the default init for the system if no init= # warnings from C=1 sparse checker or -Wextra compilations. It has CONFIG_ZENIFY=y -# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set \ No newline at end of file +CONFIG_FUTEX2=y diff --git a/SOURCES/kernel-armv7hl-lpae-debug-fedora.config b/SOURCES/kernel-armv7hl-lpae-debug-fedora.config index 473d426..d71a096 100644 --- a/SOURCES/kernel-armv7hl-lpae-debug-fedora.config +++ b/SOURCES/kernel-armv7hl-lpae-debug-fedora.config @@ -8028,4 +8028,4 @@ CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD=y # This option determines the default init for the system if no init= # warnings from C=1 sparse checker or -Wextra compilations. It has CONFIG_ZENIFY=y -# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set \ No newline at end of file +CONFIG_FUTEX2=y diff --git a/SOURCES/kernel-armv7hl-lpae-fedora.config b/SOURCES/kernel-armv7hl-lpae-fedora.config index fda01ed..8216e2c 100644 --- a/SOURCES/kernel-armv7hl-lpae-fedora.config +++ b/SOURCES/kernel-armv7hl-lpae-fedora.config @@ -8007,4 +8007,4 @@ CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD=y # This option determines the default init for the system if no init= # warnings from C=1 sparse checker or -Wextra compilations. 
It has CONFIG_ZENIFY=y -# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set \ No newline at end of file +CONFIG_FUTEX2=y diff --git a/SOURCES/kernel-i686-debug-fedora.config b/SOURCES/kernel-i686-debug-fedora.config index 09068c8..b22c446 100644 --- a/SOURCES/kernel-i686-debug-fedora.config +++ b/SOURCES/kernel-i686-debug-fedora.config @@ -7329,4 +7329,4 @@ CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD=y # This option determines the default init for the system if no init= # warnings from C=1 sparse checker or -Wextra compilations. It has CONFIG_ZENIFY=y -# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set \ No newline at end of file +CONFIG_FUTEX2=y diff --git a/SOURCES/kernel-i686-fedora.config b/SOURCES/kernel-i686-fedora.config index 076b5a7..0b6f96f 100644 --- a/SOURCES/kernel-i686-fedora.config +++ b/SOURCES/kernel-i686-fedora.config @@ -7307,4 +7307,4 @@ CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD=y # This option determines the default init for the system if no init= # warnings from C=1 sparse checker or -Wextra compilations. It has CONFIG_ZENIFY=y -# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set \ No newline at end of file +CONFIG_FUTEX2=y diff --git a/SOURCES/kernel-ppc64le-debug-fedora.config b/SOURCES/kernel-ppc64le-debug-fedora.config index 948474f..bd9a95c 100644 --- a/SOURCES/kernel-ppc64le-debug-fedora.config +++ b/SOURCES/kernel-ppc64le-debug-fedora.config @@ -6805,4 +6805,4 @@ CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD=y # This option determines the default init for the system if no init= # warnings from C=1 sparse checker or -Wextra compilations. It has CONFIG_ZENIFY=y -# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set \ No newline at end of file +CONFIG_FUTEX2=y diff --git a/SOURCES/kernel-ppc64le-debug-rhel.config b/SOURCES/kernel-ppc64le-debug-rhel.config index 3d3a986..3b9dab5 100644 --- a/SOURCES/kernel-ppc64le-debug-rhel.config +++ b/SOURCES/kernel-ppc64le-debug-rhel.config @@ -6118,4 +6118,4 @@ CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD=y # CONFIG_ZSWAP_ZPOOL_DEFAULT_ZSMALLOC is not set # CONFIG_ZX_TDM is not set CONFIG_ZENIFY=y -# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set +CONFIG_FUTEX2=y diff --git a/SOURCES/kernel-ppc64le-fedora.config b/SOURCES/kernel-ppc64le-fedora.config index 2362172..566913b 100644 --- a/SOURCES/kernel-ppc64le-fedora.config +++ b/SOURCES/kernel-ppc64le-fedora.config @@ -6782,4 +6782,4 @@ CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD=y # This option determines the default init for the system if no init= # warnings from C=1 sparse checker or -Wextra compilations. It has CONFIG_ZENIFY=y -# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set \ No newline at end of file +CONFIG_FUTEX2=y diff --git a/SOURCES/kernel-ppc64le-rhel.config b/SOURCES/kernel-ppc64le-rhel.config index d5ea6e9..b56545f 100644 --- a/SOURCES/kernel-ppc64le-rhel.config +++ b/SOURCES/kernel-ppc64le-rhel.config @@ -6100,4 +6100,4 @@ CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD=y # CONFIG_ZSWAP_ZPOOL_DEFAULT_ZSMALLOC is not set # CONFIG_ZX_TDM is not set CONFIG_ZENIFY=y -# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set +CONFIG_FUTEX2=y diff --git a/SOURCES/kernel-s390x-debug-fedora.config b/SOURCES/kernel-s390x-debug-fedora.config index 8ce6075..839879b 100644 --- a/SOURCES/kernel-s390x-debug-fedora.config +++ b/SOURCES/kernel-s390x-debug-fedora.config @@ -6746,4 +6746,4 @@ CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD=y # warnings from C=1 sparse checker or -Wextra compilations. 
It has
CONFIG_ZENIFY=y
-# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set
\ No newline at end of file
+CONFIG_FUTEX2=y
diff --git a/SOURCES/kernel-s390x-debug-rhel.config b/SOURCES/kernel-s390x-debug-rhel.config
index 0de42ff..30b9f47 100644
--- a/SOURCES/kernel-s390x-debug-rhel.config
+++ b/SOURCES/kernel-s390x-debug-rhel.config
@@ -6071,4 +6071,5 @@ CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD=y
 # CONFIG_ZSWAP_ZPOOL_DEFAULT_ZSMALLOC is not set
 # CONFIG_ZX_TDM is not set
 CONFIG_ZENIFY=y
-# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set
\ No newline at end of file
+# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set
+CONFIG_FUTEX2=y
diff --git a/SOURCES/kernel-s390x-fedora.config b/SOURCES/kernel-s390x-fedora.config
index 7324eb1..51c63df 100644
--- a/SOURCES/kernel-s390x-fedora.config
+++ b/SOURCES/kernel-s390x-fedora.config
@@ -6723,4 +6723,4 @@ CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD=y
 # warnings from C=1 sparse checker or -Wextra compilations. It has
 CONFIG_ZENIFY=y
-# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set
\ No newline at end of file
+CONFIG_FUTEX2=y
diff --git a/SOURCES/kernel-s390x-rhel.config b/SOURCES/kernel-s390x-rhel.config
index cf00d7c..37522ae 100644
--- a/SOURCES/kernel-s390x-rhel.config
+++ b/SOURCES/kernel-s390x-rhel.config
@@ -6053,4 +6053,4 @@ CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD=y
 # CONFIG_ZSWAP_ZPOOL_DEFAULT_ZSMALLOC is not set
 # CONFIG_ZX_TDM is not set
 CONFIG_ZENIFY=y
-# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set
+CONFIG_FUTEX2=y
diff --git a/SOURCES/kernel-s390x-zfcpdump-rhel.config b/SOURCES/kernel-s390x-zfcpdump-rhel.config
index 6eb4856..20a04e0 100644
--- a/SOURCES/kernel-s390x-zfcpdump-rhel.config
+++ b/SOURCES/kernel-s390x-zfcpdump-rhel.config
@@ -6089,4 +6089,4 @@ CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD=y
 # CONFIG_ZSWAP_ZPOOL_DEFAULT_ZSMALLOC is not set
 # CONFIG_ZX_TDM is not set
 CONFIG_ZENIFY=y
-# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set
+CONFIG_FUTEX2=y
diff --git a/SOURCES/kernel-x86_64-debug-fedora.config b/SOURCES/kernel-x86_64-debug-fedora.config
index 1a5c4fa..f654ea8 100644
--- a/SOURCES/kernel-x86_64-debug-fedora.config
+++ b/SOURCES/kernel-x86_64-debug-fedora.config
@@ -7381,4 +7381,4 @@ CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD=y
 # This option determines the default init for the system if no init=
 # warnings from C=1 sparse checker or -Wextra compilations. It has
 CONFIG_ZENIFY=y
-# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set
\ No newline at end of file
+CONFIG_FUTEX2=y
diff --git a/SOURCES/kernel-x86_64-debug-rhel.config b/SOURCES/kernel-x86_64-debug-rhel.config
index 0f2195e..bd45aae 100644
--- a/SOURCES/kernel-x86_64-debug-rhel.config
+++ b/SOURCES/kernel-x86_64-debug-rhel.config
@@ -6403,4 +6403,4 @@ CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD=y
 # CONFIG_ZSWAP_ZPOOL_DEFAULT_ZSMALLOC is not set
 # CONFIG_ZX_TDM is not set
 CONFIG_ZENIFY=y
-# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set
+CONFIG_FUTEX2=y
diff --git a/SOURCES/kernel-x86_64-fedora.config b/SOURCES/kernel-x86_64-fedora.config
index e70e902..e4797db 100644
--- a/SOURCES/kernel-x86_64-fedora.config
+++ b/SOURCES/kernel-x86_64-fedora.config
@@ -7359,4 +7359,4 @@ CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD=y
 # This option determines the default init for the system if no init=
 # warnings from C=1 sparse checker or -Wextra compilations.
It has
CONFIG_ZENIFY=y
-# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set
\ No newline at end of file
+CONFIG_FUTEX2=y
diff --git a/SOURCES/kernel-x86_64-rhel.config b/SOURCES/kernel-x86_64-rhel.config
index 6d5f980..09ede4d 100644
--- a/SOURCES/kernel-x86_64-rhel.config
+++ b/SOURCES/kernel-x86_64-rhel.config
@@ -6383,4 +6383,4 @@ CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD=y
 # CONFIG_ZSWAP_ZPOOL_DEFAULT_ZSMALLOC is not set
 # CONFIG_ZX_TDM is not set
 CONFIG_ZENIFY=y
-# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set
+CONFIG_FUTEX2=y
diff --git a/SPECS/kernel.spec b/SPECS/kernel.spec
index 832018d..8280db3 100644
--- a/SPECS/kernel.spec
+++ b/SPECS/kernel.spec
@@ -94,7 +94,7 @@ Summary: The Linux kernel
 %if 0%{?released_kernel}
 # Do we have a -stable update to apply?
-%define stable_update 10
+%define stable_update 11
 # Set rpm version accordingly
 %if 0%{?stable_update}
 %define stablerev %{stable_update}
@@ -852,30 +852,15 @@ Patch108: iommu-tegra-smmu-Fix-TLB-line-for-Tegra210.patch
 # A patch to fix some undocumented things broke a bunch of Allwinner networks due to wrong assumptions
 Patch124: 0001-update-phy-on-pine64-a64-devices.patch
-# https://patchwork.kernel.org/project/linux-arm-kernel/patch/20201024162515.30032-2-wens@kernel.org/
-Patch125: arm-sun8i-realtek-phy-fixes.patch
 # https://patchwork.kernel.org/project/linux-arm-kernel/patch/20201025140144.28693-1-ats@offog.org/
 Patch126: ARM-dts-sun7i-pcduino3-nano-enable-RGMII-RX-TX-delay-on-PHY.patch
-# https://patchwork.kernel.org/project/linux-arm-kernel/patch/20201025081949.783443-1-jernej.skrabec@siol.net/
-Patch127: ARM-dts-sun8i-r40-bananapi-m2-ultra-Fix-ethernet-node.patch
-# https://patchwork.kernel.org/project/linux-arm-kernel/patch/20201022185839.2779245-1-jernej.skrabec@siol.net/
-Patch128: arm64-dts-allwinner-a64-OrangePi-Win-Fix-ethernet-node.patch
-# https://patchwork.kernel.org/project/linux-arm-kernel/patch/20201028115817.68113-1-nperic@gmail.com/
-Patch129: arm64-dts-allwinner-h5-OrangePi-Prime-Fix-ethernet-node.patch
-# https://patchwork.kernel.org/project/linux-arm-kernel/patch/20201023184858.3272918-1-jernej.skrabec@siol.net/
-Patch130: arm64-dts-allwinner-h5-OrangePi-PC2-Fix-ethernet-node.patch
-# https://patchwork.kernel.org/project/linux-arm-kernel/patch/20201023194902.368239-1-jernej.skrabec@siol.net/
-Patch131: arm64-dts-allwinner-h6-Pine-H64-Fix-ethernet-node.patch
 # rhbz 1897038
 Patch132: bluetooth-fix-LL-privacy-BLE-device-fails-to-connect.patch
-# CVE-2020-28941 rhbz 1899985 1899986
-Patch133: speakup-do-not-let-the-line-discipline-be-used-several-times.patch
-
 # Linux-tkg patches - https://github.com/Frogging-Family/linux-tkg/tree/master/linux-tkg-patches/5.8
 Patch200: zen.patch
-Patch201: fsync.patch
+Patch201: futex2.patch
 # END OF PATCH DEFINITIONS
@@ -2990,8 +2975,11 @@ fi
 #
 %changelog
-* Wed Nov 25 19:20:00 CET 2020 Jan Drögehoff - 5.9.10-201.fsync
-- Linux v5.9.10 fsync zen
+* Sat Nov 28 20:09:49 CET 2020 Jan Drögehoff - 5.9.11-201.fsync
+- Linux v5.9.11 futex2 zen
+
+* Tue Nov 24 11:22:38 CST 2020 Justin M. Forbes - 5.9.11-200
+- Linux v5.9.11
 * Mon Nov 23 09:58:15 CST 2020 Justin M. Forbes - 5.9.10-200
 - Linux v5.9.10
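
For anyone building this kernel, the futex2 entry points carried above are easiest to see end to end from userspace. The following is a minimal editorial sketch, not part of the packaged patches: it assumes the x86_64 syscall numbers from the table hunks above (440 for futex_wait, 441 for futex_wake), the sysfs group added by patch 13/13, and a FUTEX_32 value of 2 taken from the futex2 uapi header, which this excerpt does not include.

/* futex2_demo.c: exercise sys_futex_wait/sys_futex_wake from this series.
 * Build: gcc -o futex2_demo futex2_demo.c */
#define _GNU_SOURCE
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

#define FUTEX_32 2	/* assumed value; defined in the futex2 uapi header, not shown here */

/* Discover a futex2 syscall number via the sysfs group from patch 13/13,
 * falling back to the x86_64 numbers from the syscall table hunk above. */
static long futex2_nr(const char *name, long fallback)
{
	char path[64];
	long nr = fallback;
	FILE *f;

	snprintf(path, sizeof(path), "/sys/kernel/futex2/%s", name);
	f = fopen(path, "r");
	if (f) {
		if (fscanf(f, "%ld", &nr) != 1)
			nr = fallback;
		fclose(f);
	}
	return nr;
}

static uint32_t futex_word;	/* the 32-bit futex itself, initially 0 */

int main(void)
{
	long nr_wait = futex2_nr("wait", 440);
	long nr_wake = futex2_nr("wake", 441);
	long ret;

	/* The expected value (1) differs from *uaddr (0), so this returns
	 * -1/EAGAIN at once: the -EWOULDBLOCK path in futex_enqueue(). */
	ret = syscall(nr_wait, &futex_word, 1, FUTEX_32, NULL);
	printf("futex_wait: %ld (%s)\n", ret, ret < 0 ? strerror(errno) : "ok");

	/* Wake up to one waiter queued on this word; none here, so 0. */
	ret = syscall(nr_wake, &futex_word, 1, FUTEX_32);
	printf("futex_wake: woke %ld waiter(s)\n", ret);
	return 0;
}

On a kernel built without CONFIG_FUTEX2=y both calls fail with ENOSYS, which doubles as a cheap runtime feature probe; reading the numbers from /sys/kernel/futex2/ is exactly the discovery mechanism patch 13/13 adds, so userspace need not hard-code per-arch syscall numbers while the interface is out of tree.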
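The FUTEX_WAIT_MULTIPLE operation from patches 11 and 12, by contrast, rides on the existing futex(2) multiplexer. A second hedged sketch follows; only the opcode (31) and the struct futex_wait_block layout are taken from the uapi hunks above, everything else is ordinary futex(2) plumbing, and the relative timeout behaviour follows the ktime_add_safe() hunk.

/* wait_multiple_demo.c: sleep on two futex words at once via opcode 31.
 * The fallback definitions below are only for building against headers
 * that predate this series. */
#define _GNU_SOURCE
#include <errno.h>
#include <linux/futex.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

#ifndef FUTEX_WAIT_MULTIPLE
#define FUTEX_WAIT_MULTIPLE 31	/* renumbered from 13 by patch 12/13 */
struct futex_wait_block {
	uint32_t *uaddr;
	uint32_t val;
	uint32_t bitset;
};
#endif

static uint32_t a, b;	/* two futex words, both 0 */

int main(void)
{
	struct futex_wait_block wb[] = {
		{ &a, 0, FUTEX_BITSET_MATCH_ANY },	/* sleep while a == 0 */
		{ &b, 0, FUTEX_BITSET_MATCH_ANY },	/* ...and while b == 0 */
	};
	/* Relative timeout: the syscall adds it to the current time for
	 * FUTEX_WAIT_MULTIPLE, per the ktime_add_safe() hunk above. */
	struct timespec timeout = { .tv_sec = 1 };

	long ret = syscall(SYS_futex, wb,
			   FUTEX_WAIT_MULTIPLE | FUTEX_PRIVATE_FLAG,
			   2 /* count of blocks */, &timeout, NULL, 0);

	/* >= 0 is the index of a woken futex; nobody wakes us here, so
	 * after one second this prints -1 with ETIMEDOUT (ENOSYS on an
	 * unpatched kernel). */
	printf("futex_wait_multiple: %ld (%s)\n",
	       ret, ret < 0 ? strerror(errno) : "woken");
	return 0;
}

Note that the returned index is only a hint that at least that futex fired; a real consumer, such as Wine's event loop, would re-read every word after a wakeup before sleeping again, since more than one futex may have been triggered.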