From ada1f13b98e86cb7ac4140c4976c3d165006d995 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Almeida?=
Date: Wed, 5 Aug 2020 12:40:26 -0300
Subject: [PATCH 01/13] futex2: Add new futex interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Initial implementation of futex2. Supports only private u32 wait/wake,
with timeout (monotonic and realtime clocks).

Signed-off-by: André Almeida
---
 MAINTAINERS                            |   2 +-
 arch/x86/entry/syscalls/syscall_32.tbl |   2 +
 arch/x86/entry/syscalls/syscall_64.tbl |   2 +
 include/linux/syscalls.h               |   7 +
 include/uapi/asm-generic/unistd.h      |   8 +-
 include/uapi/linux/futex.h             |  40 ++
 init/Kconfig                           |   7 +
 kernel/Makefile                        |   1 +
 kernel/futex2.c                        | 484 +++++++++++++++++++++++++
 kernel/sys_ni.c                        |   4 +
 10 files changed, 555 insertions(+), 2 deletions(-)
 create mode 100644 kernel/futex2.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 867157311dc8..0c425f74ed88 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7214,7 +7214,7 @@ F:	Documentation/locking/*futex*
 F:	include/asm-generic/futex.h
 F:	include/linux/futex.h
 F:	include/uapi/linux/futex.h
-F:	kernel/futex.c
+F:	kernel/futex*
 F:	tools/perf/bench/futex*
 F:	tools/testing/selftests/futex/

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 9d1102873666..955322962964 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -444,3 +444,5 @@
 437	i386	openat2			sys_openat2
 438	i386	pidfd_getfd		sys_pidfd_getfd
 439	i386	faccessat2		sys_faccessat2
+440	i386	futex_wait		sys_futex_wait
+441	i386	futex_wake		sys_futex_wake
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index f30d6ae9a688..4133bfe96891 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -361,6 +361,8 @@
 437	common	openat2			sys_openat2
 438	common	pidfd_getfd		sys_pidfd_getfd
 439	common	faccessat2		sys_faccessat2
+440	common	futex_wait		sys_futex_wait
+441	common	futex_wake		sys_futex_wake
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 75ac7f8ae93c..38c3a87dbfc2 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -589,6 +589,13 @@ asmlinkage long sys_get_robust_list(int pid,
 asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
 				    size_t len);
 
+/* kernel/futex2.c */
+asmlinkage long sys_futex_wait(void __user *uaddr, unsigned int val,
+			       unsigned int flags,
+			       struct __kernel_timespec __user *timo);
+asmlinkage long sys_futex_wake(void __user *uaddr, unsigned int nr_wake,
+			       unsigned int flags);
+
 /* kernel/hrtimer.c */
 asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp,
 			      struct __kernel_timespec __user *rmtp);
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 995b36c2ea7d..80567ade774a 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -860,8 +860,14 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
 #define __NR_faccessat2 439
 __SYSCALL(__NR_faccessat2, sys_faccessat2)
 
+#define __NR_futex_wait 440
+__SYSCALL(__NR_futex_wait, sys_futex_wait)
+
+#define __NR_futex_wake 441
+__SYSCALL(__NR_futex_wake, sys_futex_wake)
+
 #undef __NR_syscalls
-#define __NR_syscalls 440
+#define __NR_syscalls 442
 
 /*
  * 32 bit systems traditionally used different
diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
index
a89eb0accd5e..35a5bf1cd41b 100644 --- a/include/uapi/linux/futex.h +++ b/include/uapi/linux/futex.h @@ -41,6 +41,46 @@ #define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \ FUTEX_PRIVATE_FLAG) +/* Size argument to futex2 syscall */ +#define FUTEX_8 0 +#define FUTEX_16 1 +#define FUTEX_32 2 + +#define FUTEX_SIZE_MASK 0x3 + +#define FUTEX_SHARED_FLAG 8 + +#define FUTEX_NUMA_FLAG 16 + +/* + * struct futexXX_numa - struct for NUMA-aware futex operation + * @value: futex value + * @hint: node id to operate + */ + +struct futex8_numa { + __u8 value; + __u8 hint; +}; + +struct futex16_numa { + __u16 value; + __u16 hint; +}; + +struct futex32_numa { + __u32 value; + __u32 hint; +}; + +#define FUTEX_WAITV_MAX 128 + +struct futex_waitv { + void *uaddr; + unsigned int val; + unsigned int flags; +}; + /* * Support for robust futexes: the kernel cleans up held futexes at * thread exit time. diff --git a/init/Kconfig b/init/Kconfig index 2a5df1cf838c..440f21f5c3d8 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1522,6 +1522,13 @@ config FUTEX support for "fast userspace mutexes". The resulting kernel may not run glibc-based applications correctly. +config FUTEX2 + bool "Enable futex2 support" if EXPERT + depends on FUTEX + default n + help + Experimental support for futex2 interface. + config FUTEX_PI bool depends on FUTEX && RT_MUTEXES diff --git a/kernel/Makefile b/kernel/Makefile index 9a20016d4900..51ea9bc647bf 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -57,6 +57,7 @@ obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ obj-$(CONFIG_FUTEX) += futex.o +obj-$(CONFIG_FUTEX2) += futex2.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += smp.o ifneq ($(CONFIG_SMP),y) diff --git a/kernel/futex2.c b/kernel/futex2.c new file mode 100644 index 000000000000..107b80a466d0 --- /dev/null +++ b/kernel/futex2.c @@ -0,0 +1,484 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * futex2 system call interface by André Almeida + * + * Copyright 2020 Collabora Ltd. 
+ */
+
+#include <linux/freezer.h>
+#include <linux/jhash.h>
+#include <linux/memblock.h>
+#include <linux/sched/wake_q.h>
+#include <linux/spinlock.h>
+#include <linux/syscalls.h>
+#include <uapi/linux/futex.h>
+
+/**
+ * struct futex_waiter - List entry for a waiter
+ * @key.address: Memory address of userspace futex
+ * @key.mm:      Pointer to memory management struct of this process
+ * @key:         Stores information that uniquely identifies a futex
+ * @list:        List node struct
+ * @val:         Expected value for this waiter
+ * @flags:       Flags
+ * @bucket:      Pointer to the bucket for this waiter
+ * @index:       Index of waiter in futexv list
+ */
+struct futex_waiter {
+	struct futex_key {
+		uintptr_t address;
+		struct mm_struct *mm;
+	} key;
+	struct list_head list;
+	unsigned int val;
+	unsigned int flags;
+	struct futex_bucket *bucket;
+	unsigned int index;
+};
+
+/**
+ * struct futex_bucket - A bucket of the futex hash table
+ * @waiters: Number of waiters in the bucket
+ * @lock:    Bucket lock
+ * @list:    List of waiters on this bucket
+ */
+struct futex_bucket {
+	atomic_t waiters;
+	spinlock_t lock;
+	struct list_head list;
+};
+
+struct futexv {
+	struct task_struct *task;
+	int hint;
+	struct futex_waiter objects[0];
+};
+
+struct futex_single_waiter {
+	struct futexv parent;
+	struct futex_waiter waiter;
+} __packed;
+
+struct futex_bucket *futex_table;
+
+/* mask for futex2 flag operations */
+#define FUTEX2_MASK (FUTEX_SIZE_MASK | FUTEX_SHARED_FLAG | \
+		     FUTEX_CLOCK_REALTIME)
+
+// mask for sys_futex_waitv
+#define FUTEXV_MASK (FUTEX_CLOCK_REALTIME)
+
+// mask for each futex in futex_waitv list
+#define FUTEXV_WAITER_MASK (FUTEX_SIZE_MASK | FUTEX_SHARED_FLAG)
+
+int futex2_hashsize;
+
+/*
+ * Reflects a new waiter being added to the waitqueue.
+ */
+static inline void bucket_inc_waiters(struct futex_bucket *bucket)
+{
+#ifdef CONFIG_SMP
+	atomic_inc(&bucket->waiters);
+	/*
+	 * Full barrier (A), see the ordering comment in kernel/futex.c.
+	 */
+	smp_mb__after_atomic();
+#endif
+}
+
+/*
+ * Reflects a waiter being removed from the waitqueue by wakeup
+ * paths.
+ */
+static inline void bucket_dec_waiters(struct futex_bucket *bucket)
+{
+#ifdef CONFIG_SMP
+	atomic_dec(&bucket->waiters);
+#endif
+}
+
+/*
+ * Get the number of waiters in a bucket
+ */
+static inline int bucket_get_waiters(struct futex_bucket *bucket)
+{
+#ifdef CONFIG_SMP
+	/*
+	 * Full barrier (B), see the ordering comment in kernel/futex.c.
+	 */
+	smp_mb();
+	return atomic_read(&bucket->waiters);
+#else
+	return 1;
+#endif
+}
+
+/**
+ * futex_get_bucket - Check if the user address is valid, prepare internal
+ *                    data and calculate the hash
+ * @uaddr: futex user address
+ * @key:   data that uniquely identifies a futex
+ *
+ * Return: address of bucket on success, error code otherwise
+ */
+static struct futex_bucket *futex_get_bucket(void __user *uaddr,
+					     struct futex_key *key)
+{
+	uintptr_t address = (uintptr_t) uaddr;
+	u32 hash_key;
+
+	/* Checking if uaddr is valid and accessible */
+	if (unlikely(!IS_ALIGNED(address, sizeof(u32))))
+		return ERR_PTR(-EINVAL);
+	if (unlikely(!access_ok(address, sizeof(u32))))
+		return ERR_PTR(-EFAULT);
+
+	key->address = address;
+	key->mm = current->mm;
+
+	/* Generate hash key for this futex using uaddr and current->mm */
+	hash_key = jhash2((u32 *) key, sizeof(*key) / sizeof(u32), 0);
+
+	/* Since futex2_hashsize is 2^n, subtracting 1 makes a perfect bit mask */
+	return &futex_table[hash_key & (futex2_hashsize - 1)];
+}
+
+/**
+ * futex_get_user - Get the userspace value at this address
+ * @uval:  variable to store the value
+ * @uaddr: userspace address
+ *
+ * Check the comment at futex_get_user_value for more information.
+ */ +static int futex_get_user(u32 *uval, u32 *uaddr) +{ + int ret; + + pagefault_disable(); + ret = __get_user(*uval, uaddr); + pagefault_enable(); + + return ret; +} + +/** + * futex_setup_time - Prepare the timeout mechanism, without starting it. + * @timo: Timeout value from userspace + * @timeout: Pointer to hrtimer handler + * @flags: Flags from userspace, to decide which clockid to use + * + * Return: 0 on success, error code otherwise + */ +static int futex_setup_time(struct __kernel_timespec __user *timo, + struct hrtimer_sleeper *timeout, + unsigned int flags) +{ + ktime_t time; + struct timespec64 ts; + clockid_t clockid = (flags & FUTEX_CLOCK_REALTIME) ? + CLOCK_REALTIME : CLOCK_MONOTONIC; + + if (get_timespec64(&ts, timo)) + return -EFAULT; + + if (!timespec64_valid(&ts)) + return -EINVAL; + + time = timespec64_to_ktime(ts); + + hrtimer_init_sleeper(timeout, clockid, HRTIMER_MODE_ABS); + + hrtimer_set_expires(&timeout->timer, time); + + return 0; +} + + +/** + * futex_get_user_value - Get the value from the userspace address and compares + * with the expected one. In success, leaves the function + * holding the bucket lock. Else, hold no lock. + * @bucket: hash bucket of this address + * @uaddr: futex's userspace address + * @val: expected value + * @multiple: is this call in the wait on multiple path + * + * Return: 0 on success, error code otherwise + */ +static int futex_get_user_value(struct futex_bucket *bucket, u32 __user *uaddr, + unsigned int val, bool multiple) +{ + u32 uval; + int ret; + + /* + * Get the value from user futex address. + * + * Since we are in a hurry, we use a spin lock and we can't sleep. + * Try to get the value with page fault disabled (when enable, we might + * sleep). + * + * If we fail, we aren't sure if the address is invalid or is just a + * page fault. Then, release the lock (so we can sleep) and try to get + * the value with page fault enabled. In order to trigger a page fault + * handling, we just call __get_user() again. + * + * If get_user succeeds, this mean that the address is valid and we do + * the loop again. Since we just handled the page fault, the page is + * likely pinned in memory and we should be luckier this time and be + * able to get the value. If we fail anyway, we will try again. + * + * If even with page faults enabled we get and error, this means that + * the address is not valid and we return from the syscall. + */ + do { + spin_lock(&bucket->lock); + + ret = futex_get_user(&uval, uaddr); + + if (ret) { + spin_unlock(&bucket->lock); + if (multiple || __get_user(uval, uaddr)) + return -EFAULT; + + } + } while (ret); + + if (uval != val) { + spin_unlock(&bucket->lock); + return -EWOULDBLOCK; + } + + return 0; +} + +/** + * futex_dequeue - Remove a futex from a queue + * @bucket: current bucket holding the futex + * @waiter: futex to be removed + * + * Return: True if futex was removed by this function, false if another wake + * thread removed this futex. + * + * This function should be used after we found that this futex was in a queue. + * Thus, it needs to be removed before the next step. However, someone could + * wake it between the time of the first check and the time to get the lock for + * the bucket. Check one more time if the futex is there with the bucket locked. + * If it's there, just remove it and return true. Else, mark the removal as + * false and do nothing. 
+ */ +static bool futex_dequeue(struct futex_bucket *bucket, struct futex_waiter *waiter) +{ + bool removed = true; + + spin_lock(&bucket->lock); + if (list_empty(&waiter->list)) + removed = false; + else + list_del(&waiter->list); + spin_unlock(&bucket->lock); + + if (removed) + bucket_dec_waiters(bucket); + + return removed; +} + +/** + * sys_futex_wait - Wait on a futex address if (*uaddr) == val + * @uaddr: User address of futex + * @val: Expected value of futex + * @flags: Specify the size of futex and the clockid + * @timo: Optional absolute timeout. Supports only 64bit time. + */ +SYSCALL_DEFINE4(futex_wait, void __user *, uaddr, unsigned int, val, + unsigned int, flags, struct __kernel_timespec __user *, timo) +{ + unsigned int size = flags & FUTEX_SIZE_MASK; + struct hrtimer_sleeper timeout; + struct futex_bucket *bucket; + struct futex_single_waiter wait_single; + struct futex_waiter *waiter; + int ret; + + wait_single.parent.task = current; + wait_single.parent.hint = 0; + waiter = &wait_single.waiter; + waiter->index = 0; + + if (flags & ~FUTEX2_MASK) + return -EINVAL; + + if (size != FUTEX_32) + return -EINVAL; + + if (timo) { + ret = futex_setup_time(timo, &timeout, flags); + if (ret) + return ret; + } + + /* Get an unlocked hash bucket */ + bucket = futex_get_bucket(uaddr, &waiter->key); + if (IS_ERR(bucket)) + return PTR_ERR(bucket); + + if (timo) + hrtimer_sleeper_start_expires(&timeout, HRTIMER_MODE_ABS); + +retry: + bucket_inc_waiters(bucket); + + /* Compare the expected and current value, get the bucket lock */ + ret = futex_get_user_value(bucket, uaddr, val, false); + if (ret) { + bucket_dec_waiters(bucket); + goto out; + } + + /* Add the waiter to the hash table and sleep */ + set_current_state(TASK_INTERRUPTIBLE); + list_add_tail(&waiter->list, &bucket->list); + spin_unlock(&bucket->lock); + + /* Do not sleep if someone woke this futex or if it was timeouted */ + if (!list_empty_careful(&waiter->list) && (!timo || timeout.task)) + freezable_schedule(); + + __set_current_state(TASK_RUNNING); + + /* + * One of those things triggered this wake: + * + * * We have been removed from the bucket. futex_wake() woke us. We just + * need to return 0 to userspace. + * + * However, if we find ourselves in the bucket we must remove ourselves + * from the bucket and ... + * + * * If the there's a timeout and it has expired, return -ETIMEDOUT. + * + * * If there is a signal pending, something wants to kill our thread. + * Return -ERESTARTSYS. + * + * * If there's no signal pending, it was a spurious wake (scheduler + * gave us a change to do some work, even if we don't want to). We + * need to remove ourselves from the bucket and add again, to prevent + * losing wakeups in the meantime. 
+	 */
+
+	/* Normal wake */
+	if (list_empty_careful(&waiter->list))
+		goto out;
+
+	if (!futex_dequeue(bucket, waiter))
+		goto out;
+
+	/* Timeout */
+	if (timo && !timeout.task)
+		return -ETIMEDOUT;
+
+	/* Spurious wakeup */
+	if (!signal_pending(current))
+		goto retry;
+
+	/* Some signal is pending */
+	ret = -ERESTARTSYS;
+out:
+	if (timo)
+		hrtimer_cancel(&timeout.timer);
+
+	return ret;
+}
+
+static struct futexv *futex_get_parent(uintptr_t waiter, u8 index)
+{
+	uintptr_t parent = waiter - sizeof(struct futexv)
+			   - (uintptr_t) (index * sizeof(struct futex_waiter));
+
+	return (struct futexv *) parent;
+}
+
+/**
+ * sys_futex_wake - Wake a number of futexes waiting on an address
+ * @uaddr:   Address of futex to be woken up
+ * @nr_wake: Number of futexes to be woken up
+ * @flags:   Flags specifying the size of the futex
+ */
+SYSCALL_DEFINE3(futex_wake, void __user *, uaddr, unsigned int, nr_wake,
+		unsigned int, flags)
+{
+	unsigned int size = flags & FUTEX_SIZE_MASK;
+	struct futex_waiter waiter, *aux, *tmp;
+	struct futex_bucket *bucket;
+	struct task_struct *task;
+	DEFINE_WAKE_Q(wake_q);
+	int ret = 0;
+
+	if (flags & ~FUTEX2_MASK)
+		return -EINVAL;
+
+	if (size != FUTEX_32)
+		return -EINVAL;
+
+	bucket = futex_get_bucket(uaddr, &waiter.key);
+	if (IS_ERR(bucket))
+		return PTR_ERR(bucket);
+
+	if (!bucket_get_waiters(bucket))
+		return 0;
+
+	spin_lock(&bucket->lock);
+	list_for_each_entry_safe(aux, tmp, &bucket->list, list) {
+		if (ret >= nr_wake)
+			break;
+
+		if (waiter.key.address == aux->key.address &&
+		    waiter.key.mm == aux->key.mm) {
+			struct futexv *parent =
+				futex_get_parent((uintptr_t) aux, aux->index);
+
+			parent->hint = 1;
+			task = parent->task;
+			get_task_struct(task);
+			list_del_init_careful(&aux->list);
+			wake_q_add_safe(&wake_q, task);
+			ret++;
+			bucket_dec_waiters(bucket);
+		}
+	}
+	spin_unlock(&bucket->lock);
+
+	wake_up_q(&wake_q);
+
+	return ret;
+}
+
+static int __init futex2_init(void)
+{
+	int i;
+	unsigned int futex_shift;
+
+#if CONFIG_BASE_SMALL
+	futex2_hashsize = 16;
+#else
+	futex2_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
+#endif
+
+	futex_table = alloc_large_system_hash("futex2", sizeof(struct futex_bucket),
+					      futex2_hashsize, 0,
+					      futex2_hashsize < 256 ?
+					      HASH_SMALL : 0,
+					      &futex_shift, NULL,
+					      futex2_hashsize, futex2_hashsize);
+	futex2_hashsize = 1UL << futex_shift;
+
+	for (i = 0; i < futex2_hashsize; i++) {
+		INIT_LIST_HEAD(&futex_table[i].list);
+		spin_lock_init(&futex_table[i].lock);
+		atomic_set(&futex_table[i].waiters, 0);
+	}
+
+	return 0;
+}
+core_initcall(futex2_init);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 4d59775ea79c..10049bc56c24 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -148,6 +148,10 @@ COND_SYSCALL_COMPAT(set_robust_list);
 COND_SYSCALL(get_robust_list);
 COND_SYSCALL_COMPAT(get_robust_list);
 
+/* kernel/futex2.c */
+COND_SYSCALL(futex_wait);
+COND_SYSCALL(futex_wake);
+
 /* kernel/hrtimer.c */
 
 /* kernel/itimer.c */
--
2.28.0
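For reference, this is roughly how the interface added by this patch is
driven from userspace. The snippet below is a sketch, not part of the
series: it assumes an x86_64 libc without wrappers, the syscall numbers
wired up above (440/441), and FUTEX_32 coming from the uapi header
regenerated from this patch. Futexes are private by default in this
initial version.

  /* Sketch: waiting on and waking a 32-bit private futex2 futex
   * (error handling elided). */
  #include <linux/futex.h>
  #include <stdint.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  #ifndef __NR_futex_wait
  # define __NR_futex_wait 440	/* from the syscall tables above */
  # define __NR_futex_wake 441
  #endif

  static uint32_t futex_var;

  /* Block while futex_var is still 0; a NULL timespec means no timeout */
  static void wait_on_futex(void)
  {
          syscall(__NR_futex_wait, &futex_var, 0, FUTEX_32, NULL);
  }

  /* From another thread: wake at most one task blocked on futex_var */
  static void wake_one_waiter(void)
  {
          syscall(__NR_futex_wake, &futex_var, 1, FUTEX_32);
  }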
From 08110d54945541dd186a7dabeef58be08011dde7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Almeida?=
Date: Thu, 15 Oct 2020 17:15:57 -0300
Subject: [PATCH 02/13] futex2: Add support for vectorized wait
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add support for waiting on multiple futexes.

Signed-off-by: André Almeida
---
 arch/x86/entry/syscalls/syscall_32.tbl |   1 +
 arch/x86/entry/syscalls/syscall_64.tbl |   1 +
 include/uapi/asm-generic/unistd.h      |   5 +-
 kernel/futex2.c                        | 430 +++++++++++++++++--------
 kernel/sys_ni.c                        |   1 +
 5 files changed, 304 insertions(+), 134 deletions(-)

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 955322962964..c844c0cbf0e5 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -446,3 +446,4 @@
 439	i386	faccessat2		sys_faccessat2
 440	i386	futex_wait		sys_futex_wait
 441	i386	futex_wake		sys_futex_wake
+442	i386	futex_waitv		sys_futex_waitv
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 4133bfe96891..0901c26c6786 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -363,6 +363,7 @@
 439	common	faccessat2		sys_faccessat2
 440	common	futex_wait		sys_futex_wait
 441	common	futex_wake		sys_futex_wake
+442	common	futex_waitv		sys_futex_waitv
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 80567ade774a..d7ebbed0a18c 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -866,8 +866,11 @@ __SYSCALL(__NR_futex_wait, sys_futex_wait)
 #define __NR_futex_wake 441
 __SYSCALL(__NR_futex_wake, sys_futex_wake)
 
+#define __NR_futex_waitv 442
+__SYSCALL(__NR_futex_waitv, sys_futex_waitv)
+
 #undef __NR_syscalls
-#define __NR_syscalls 442
+#define __NR_syscalls 443
 
 /*
  * 32 bit systems traditionally used different
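From userspace, the vectorized call takes an array of struct futex_waitv
(added to the uapi header in the previous patch) and, on wakeup, returns
the index of the woken futex in that array. A sketch of a caller, under
the same assumptions as the earlier example (patched headers, no libc
wrapper):

  #include <linux/futex.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  #ifndef __NR_futex_waitv
  # define __NR_futex_waitv 442	/* from the tables above */
  #endif

  static unsigned int fa, fb;

  /* Block until fa or fb is woken (or one already differs from the
   * expected value); the return value is the index into the array. */
  static int wait_on_two(void)
  {
          struct futex_waitv waiters[] = {
                  { .uaddr = &fa, .val = 0, .flags = FUTEX_32 },
                  { .uaddr = &fb, .val = 0, .flags = FUTEX_32 },
          };

          return syscall(__NR_futex_waitv, waiters, 2, 0, NULL);
  }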
diff --git a/kernel/futex2.c b/kernel/futex2.c
index 107b80a466d0..4b782b5ef615 100644
--- a/kernel/futex2.c
+++ b/kernel/futex2.c
@@ -48,14 +48,25 @@ struct futex_bucket {
 	struct list_head list;
 };
 
+/**
+ * struct futexv - List of futexes to be waited on
+ * @task:    Task to be woken
+ * @hint:    Was any futex on this list woken?
+ * @objects: List of futexes
+ */
 struct futexv {
 	struct task_struct *task;
-	int hint;
+	bool hint;
 	struct futex_waiter objects[0];
 };
 
+/**
+ * struct futex_single_waiter - Wrapper for a futexv with a single element
+ * @futexv: futexv struct for this single waiter
+ * @waiter: The embedded waiter
+ */
 struct futex_single_waiter {
-	struct futexv parent;
+	struct futexv futexv;
 	struct futex_waiter waiter;
 } __packed;
 
@@ -65,10 +76,10 @@ struct futex_bucket *futex_table;
 #define FUTEX2_MASK (FUTEX_SIZE_MASK | FUTEX_SHARED_FLAG | \
 		     FUTEX_CLOCK_REALTIME)
 
-// mask for sys_futex_waitv
+/* mask for sys_futex_waitv flag */
 #define FUTEXV_MASK (FUTEX_CLOCK_REALTIME)
 
-// mask for each futex in futex_waitv list
+/* mask for each futex in futex_waitv list */
 #define FUTEXV_WAITER_MASK (FUTEX_SIZE_MASK | FUTEX_SHARED_FLAG)
 
 int futex2_hashsize;
@@ -151,7 +162,7 @@ static struct futex_bucket *futex_get_bucket(void __user *uaddr,
  *
  * Check the comment at futex_get_user_value for more information.
  */
-static int futex_get_user(u32 *uval, u32 *uaddr)
+static int futex_get_user(u32 *uval, u32 __user *uaddr)
 {
 	int ret;
 
@@ -194,95 +205,227 @@ static int futex_setup_time(struct __kernel_timespec __user *timo,
 	return 0;
 }
 
+/**
+ * futex_dequeue_multiple - Remove multiple futexes from hash table
+ * @futexv: list of waiters
+ * @nr:     number of futexes to be removed
+ *
+ * Before going back to userspace, every enqueued futex needs to be removed
+ * from the hash table. However, a waker could remove any of them between the
+ * first check and the moment we get the lock for the bucket. With the bucket
+ * locked, check one more time whether each futex is still enqueued: if it
+ * is, remove it; if it was already removed by a wakeup, record its index.
+ *
+ * Return:
+ *  * -1 if no futex was woken during the removal
+ *  * >= 0 if at least one futex was found woken; index of the last one
+ */
+static int futex_dequeue_multiple(struct futexv *futexv, unsigned int nr)
+{
+	int i, ret = -1;
+
+	for (i = 0; i < nr; i++) {
+		spin_lock(&futexv->objects[i].bucket->lock);
+		if (!list_empty_careful(&futexv->objects[i].list)) {
+			list_del_init_careful(&futexv->objects[i].list);
+			bucket_dec_waiters(futexv->objects[i].bucket);
+		} else {
+			ret = i;
+		}
+		spin_unlock(&futexv->objects[i].bucket->lock);
+	}
+
+	return ret;
+}
 
 /**
- * futex_get_user_value - Get the value from the userspace address and compares
- *			  with the expected one. In success, leaves the function
- *			  holding the bucket lock. Else, hold no lock.
- * @bucket:   hash bucket of this address
- * @uaddr:    futex's userspace address
- * @val:      expected value
- * @multiple: is this call in the wait on multiple path
+ * futex_enqueue - Check the value and enqueue a futex on a wait list
+ *
+ * @futexv:     List of futexes
+ * @nr_futexes: Number of futexes in the list
+ * @awaken:     If a futex was woken during enqueueing, store the index here
+ *
+ * Get the value from the userspace address and compare it with the expected
+ * one. On success, enqueue the futex in the correct bucket.
+ *
+ * Get the value from user futex address.
+ *
+ * Since we are in a hurry, we use a spin lock and we can't sleep.
+ * Try to get the value with page fault disabled (when enabled, we might
+ * sleep).
+ *
+ * If we fail, we aren't sure if the address is invalid or is just a
+ * page fault. Then, release the lock (so we can sleep) and try to get
+ * the value with page fault enabled. In order to trigger a page fault
+ * handling, we just call __get_user() again. If we sleep with enqueued
+ * futexes, we might miss a wake, so dequeue everything before sleeping.
+ *
+ * If get_user succeeds, this means that the address is valid and we do
+ * the work again. Since we just handled the page fault, the page is
+ * likely pinned in memory and we should be luckier this time and be
+ * able to get the value. If we fail anyway, we will try again.
+ *
+ * If even with page faults enabled we get an error, this means that
+ * the address is not valid and we return from the syscall.
+ *
+ * If we get an unexpected value or need to handle a page fault and realize
+ * that a futex was woken, we can prioritize that and return success.
 *
 * Return: 0 on success, error code otherwise
 */
-static int futex_get_user_value(struct futex_bucket *bucket, u32 __user *uaddr,
-				unsigned int val, bool multiple)
+static int futex_enqueue(struct futexv *futexv, unsigned int nr_futexes,
+			 int *awaken)
 {
-	u32 uval;
-	int ret;
+	int i, ret;
+	u32 uval, val;
+	u32 __user *uaddr;
+	struct futex_bucket *bucket;
 
-	/*
-	 * Get the value from user futex address.
-	 *
-	 * Since we are in a hurry, we use a spin lock and we can't sleep.
-	 * Try to get the value with page fault disabled (when enable, we might
-	 * sleep).
-	 *
-	 * If we fail, we aren't sure if the address is invalid or is just a
-	 * page fault. Then, release the lock (so we can sleep) and try to get
-	 * the value with page fault enabled. In order to trigger a page fault
-	 * handling, we just call __get_user() again.
-	 *
-	 * If get_user succeeds, this mean that the address is valid and we do
-	 * the loop again. Since we just handled the page fault, the page is
-	 * likely pinned in memory and we should be luckier this time and be
-	 * able to get the value. If we fail anyway, we will try again.
-	 *
-	 * If even with page faults enabled we get and error, this means that
-	 * the address is not valid and we return from the syscall.
-	 */
-	do {
-		spin_lock(&bucket->lock);
+retry:
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	for (i = 0; i < nr_futexes; i++) {
+		uaddr = (u32 __user *) futexv->objects[i].key.address;
+		val = (u32) futexv->objects[i].val;
+		bucket = futexv->objects[i].bucket;
+
+		bucket_inc_waiters(bucket);
+		spin_lock(&bucket->lock);
 
-		ret = futex_get_user(&uval, uaddr);
+		ret = futex_get_user(&uval, uaddr);
 
-		if (ret) {
+		if (unlikely(ret)) {
 			spin_unlock(&bucket->lock);
-			if (multiple || __get_user(uval, uaddr))
+
+			bucket_dec_waiters(bucket);
+			__set_current_state(TASK_RUNNING);
+			*awaken = futex_dequeue_multiple(futexv, i);
+
+			if (__get_user(uval, uaddr))
 				return -EFAULT;
 
+			if (*awaken >= 0)
+				return 0;
+
+			goto retry;
+		}
+
+		if (uval != val) {
+			spin_unlock(&bucket->lock);
+
+			bucket_dec_waiters(bucket);
+			__set_current_state(TASK_RUNNING);
+			*awaken = futex_dequeue_multiple(futexv, i);
+
+			if (*awaken >= 0)
+				return 0;
+
+			return -EWOULDBLOCK;
 		}
-	} while (ret);
 
-	if (uval != val) {
+		list_add_tail(&futexv->objects[i].list, &bucket->list);
 		spin_unlock(&bucket->lock);
-		return -EWOULDBLOCK;
 	}
 
 	return 0;
 }
+
+static int __futex_wait(struct futexv *futexv,
+			unsigned int nr_futexes,
+			struct hrtimer_sleeper *timeout)
+{
+	int ret;
+	int awaken = -1;
+
+	while (1) {
+		ret = futex_enqueue(futexv, nr_futexes, &awaken);
+
+		if (ret < 0)
+			break;
+
+		if (awaken >= 0)
+			return awaken;
+
+		/* Before sleeping, check if someone was woken */
+		if (!futexv->hint && (!timeout || timeout->task))
+			freezable_schedule();
+
+		__set_current_state(TASK_RUNNING);
+
+		/*
+		 * One of those things triggered this wake:
+		 *
+		 * * We have been removed from the bucket. futex_wake() woke
+		 *   us. We just need to dequeue and return 0 to userspace.
+		 *
+		 * However, if no futex was dequeued by a futex_wake():
+		 *
+		 * * If there's a timeout and it has expired,
+		 *   return -ETIMEDOUT.
+		 *
+		 * * If there is a signal pending, something wants to kill our
+		 *   thread, return -ERESTARTSYS.
+		 *
+		 * * If there's no signal pending, it was a spurious wake
+		 *   (scheduler gave us a chance to do some work, even if we
+		 *   don't want to). We need to remove ourselves from the
+		 *   bucket and add again, to prevent losing wakeups in the
+		 *   meantime.
+		 */
+
+		ret = futex_dequeue_multiple(futexv, nr_futexes);
+
+		/* Normal wake */
+		if (ret >= 0)
+			break;
+
+		if (timeout && !timeout->task)
+			return -ETIMEDOUT;
+
+		/* signal */
+		if (signal_pending(current))
+			return -ERESTARTSYS;
+
+		/* spurious wake, do everything again */
+	}
+
+	return ret;
+}
+
 /**
- * futex_dequeue - Remove a futex from a queue
- * @bucket: current bucket holding the futex
- * @waiter: futex to be removed
+ * futex_wait - Setup the timer and wait on a list of futexes
+ * @futexv:     List of waiters
+ * @nr_futexes: Number of waiters
+ * @timo:       Timeout value from userspace
+ * @timeout:    hrtimer_sleeper used to set up the timeout
+ * @flags:      Timeout flags
 *
- * Return: True if futex was removed by this function, false if another wake
- *	   thread removed this futex.
- *
- * This function should be used after we found that this futex was in a queue.
- * Thus, it needs to be removed before the next step. However, someone could
- * wake it between the time of the first check and the time to get the lock for
- * the bucket. Check one more time if the futex is there with the bucket locked.
- * If it's there, just remove it and return true. Else, mark the removal as
- * false and do nothing.
+ * Return: error code, or the index of the woken waiter
 */
-static bool futex_dequeue(struct futex_bucket *bucket, struct futex_waiter *waiter)
+static int futex_wait(struct futexv *futexv, unsigned int nr_futexes,
+		      struct __kernel_timespec __user *timo,
+		      struct hrtimer_sleeper *timeout, unsigned int flags)
 {
-	bool removed = true;
+	int ret;
 
-	spin_lock(&bucket->lock);
-	if (list_empty(&waiter->list))
-		removed = false;
-	else
-		list_del(&waiter->list);
-	spin_unlock(&bucket->lock);
+	if (timo) {
+		ret = futex_setup_time(timo, timeout, flags);
+		if (ret)
+			return ret;
 
-	if (removed)
-		bucket_dec_waiters(bucket);
+		hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS);
+	}
 
-	return removed;
+	ret = __futex_wait(futexv, nr_futexes, timo ? timeout : NULL);
+
+	if (timo)
+		hrtimer_cancel(&timeout->timer);
+
+	return ret;
 }
 
 /**
@@ -297,15 +440,20 @@ SYSCALL_DEFINE4(futex_wait, void __user *, uaddr, unsigned int, val,
 {
 	unsigned int size = flags & FUTEX_SIZE_MASK;
 	struct hrtimer_sleeper timeout;
-	struct futex_bucket *bucket;
 	struct futex_single_waiter wait_single;
 	struct futex_waiter *waiter;
+	struct futexv *futexv;
 	int ret;
 
-	wait_single.parent.task = current;
-	wait_single.parent.hint = 0;
+	futexv = &wait_single.futexv;
+	futexv->task = current;
+	futexv->hint = false;
+
 	waiter = &wait_single.waiter;
 	waiter->index = 0;
+	waiter->val = val;
+
+	INIT_LIST_HEAD(&waiter->list);
 
 	if (flags & ~FUTEX2_MASK)
 		return -EINVAL;
@@ -313,85 +461,101 @@ SYSCALL_DEFINE4(futex_wait, void __user *, uaddr, unsigned int, val,
 	if (size != FUTEX_32)
 		return -EINVAL;
 
-	if (timo) {
-		ret = futex_setup_time(timo, &timeout, flags);
-		if (ret)
-			return ret;
-	}
-
 	/* Get an unlocked hash bucket */
-	bucket = futex_get_bucket(uaddr, &waiter->key);
-	if (IS_ERR(bucket))
-		return PTR_ERR(bucket);
+	waiter->bucket = futex_get_bucket(uaddr, &waiter->key);
+	if (IS_ERR(waiter->bucket))
+		return PTR_ERR(waiter->bucket);
 
-	if (timo)
-		hrtimer_sleeper_start_expires(&timeout, HRTIMER_MODE_ABS);
+	ret = futex_wait(futexv, 1, timo, &timeout, flags);
 
-retry:
-	bucket_inc_waiters(bucket);
+	return ret;
+}
 
-	/* Compare the expected and current value, get the bucket lock */
-	ret = futex_get_user_value(bucket, uaddr, val, false);
-	if (ret) {
-		bucket_dec_waiters(bucket);
-		goto out;
-	}
+/**
+ * futex_parse_waitv - Parse a waitv array from userspace
+ * @futexv:     list of waiters
+ * @uwaitv:     userspace list
+ * @nr_futexes: number of waiters in the list
+ *
+ * Return: 0 on success, error code on failure
+ */
+static int futex_parse_waitv(struct futexv *futexv,
+			     struct futex_waitv __user *uwaitv,
+			     unsigned int nr_futexes)
+{
+	struct futex_waitv waitv;
+	unsigned int i;
+	struct futex_bucket *bucket;
 
-	/* Add the waiter to the hash table and sleep */
-	set_current_state(TASK_INTERRUPTIBLE);
-	list_add_tail(&waiter->list, &bucket->list);
-	spin_unlock(&bucket->lock);
+	for (i = 0; i < nr_futexes; i++) {
+		if (copy_from_user(&waitv, &uwaitv[i], sizeof(waitv)))
+			return -EFAULT;
 
-	/* Do not sleep if someone woke this futex or if it was timeouted */
-	if (!list_empty_careful(&waiter->list) && (!timo || timeout.task))
-		freezable_schedule();
+		if ((waitv.flags & ~FUTEXV_WAITER_MASK) ||
+		    (waitv.flags & FUTEX_SIZE_MASK) != FUTEX_32)
+			return -EINVAL;
 
-	__set_current_state(TASK_RUNNING);
+		bucket = futex_get_bucket(waitv.uaddr,
+					  &futexv->objects[i].key);
+		if (IS_ERR(bucket))
+			return PTR_ERR(bucket);
 
-	/*
-	 * One of those things triggered this wake:
-	 *
-	 * * We have been removed from the bucket. futex_wake() woke us. We just
-	 *   need to return 0 to userspace.
-	 *
-	 * However, if we find ourselves in the bucket we must remove ourselves
-	 * from the bucket and ...
-	 *
-	 * * If the there's a timeout and it has expired, return -ETIMEDOUT.
-	 *
-	 * * If there is a signal pending, something wants to kill our thread.
-	 *   Return -ERESTARTSYS.
-	 *
-	 * * If there's no signal pending, it was a spurious wake (scheduler
-	 *   gave us a change to do some work, even if we don't want to). We
-	 *   need to remove ourselves from the bucket and add again, to prevent
-	 *   losing wakeups in the meantime.
-	 */
+		futexv->objects[i].bucket = bucket;
+		futexv->objects[i].val = waitv.val;
+		futexv->objects[i].flags = waitv.flags;
+		futexv->objects[i].index = i;
+		INIT_LIST_HEAD(&futexv->objects[i].list);
+	}
 
-	/* Normal wake */
-	if (list_empty_careful(&waiter->list))
-		goto out;
+	return 0;
+}
 
-	if (!futex_dequeue(bucket, waiter))
-		goto out;
+/**
+ * sys_futex_waitv - Wait on a list of futexes
+ * @waiters:    List of futexes to wait on
+ * @nr_futexes: Length of the list
+ * @flags:      Flag for the timeout clock (monotonic/realtime)
+ * @timo:       Optional absolute timeout
+ */
+SYSCALL_DEFINE4(futex_waitv, struct futex_waitv __user *, waiters,
+		unsigned int, nr_futexes, unsigned int, flags,
+		struct __kernel_timespec __user *, timo)
+{
+	struct hrtimer_sleeper timeout;
+	struct futexv *futexv;
+	int ret;
 
-	/* Timeout */
-	if (timo && !timeout.task)
-		return -ETIMEDOUT;
+	if (flags & ~FUTEXV_MASK)
+		return -EINVAL;
 
-	/* Spurious wakeup */
-	if (!signal_pending(current))
-		goto retry;
+	if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters)
+		return -EINVAL;
 
-	/* Some signal is pending */
-	ret = -ERESTARTSYS;
-out:
-	if (timo)
-		hrtimer_cancel(&timeout.timer);
+	futexv = kmalloc(sizeof(struct futexv) +
+			 (sizeof(struct futex_waiter) * nr_futexes),
+			 GFP_KERNEL);
+	if (!futexv)
+		return -ENOMEM;
+
+	futexv->hint = false;
+	futexv->task = current;
+
+	ret = futex_parse_waitv(futexv, waiters, nr_futexes);
+	if (!ret)
+		ret = futex_wait(futexv, nr_futexes, timo, &timeout, flags);
+
+	kfree(futexv);
 
 	return ret;
 }
 
+/**
+ * futex_get_parent - Get the futexv that owns a given waiter
+ * @waiter: Memory address of the waiter
+ * @index:  Index of the waiter in futexv->objects
+ *
+ * Return: A pointer to the futexv this waiter belongs to
+ */
 static struct futexv *futex_get_parent(uintptr_t waiter, u8 index)
 {
 	uintptr_t parent = waiter - sizeof(struct futexv)
@@ -439,7 +603,7 @@ SYSCALL_DEFINE3(futex_wake, void __user *, uaddr, unsigned int, nr_wake,
 			struct futexv *parent =
 				futex_get_parent((uintptr_t) aux, aux->index);
 
-			parent->hint = 1;
+			parent->hint = true;
 			task = parent->task;
 			get_task_struct(task);
 			list_del_init_careful(&aux->list);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 10049bc56c24..3e1a713d3e57 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -151,6 +151,7 @@ COND_SYSCALL_COMPAT(get_robust_list);
 /* kernel/futex2.c */
 COND_SYSCALL(futex_wait);
 COND_SYSCALL(futex_wake);
+COND_SYSCALL(futex_waitv);
 
 /* kernel/hrtimer.c */
 
--
2.28.0
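A note on futex_get_parent() above: it recovers the enclosing futexv from
the address of one of its embedded waiters by undoing the flexible-array
layout, which is the container_of() computation written out by hand. The
standalone sketch below (illustrative names only) shows the same
arithmetic:

  #include <stddef.h>

  struct waiter { int val; };

  struct vec {
          int hint;
          struct waiter objects[];        /* like futexv->objects */
  };

  static struct vec *get_parent(struct waiter *w, unsigned int index)
  {
          /* Step back over the preceding array slots, then over the header */
          return (struct vec *)((char *)(w - index) -
                                offsetof(struct vec, objects));
  }

This matches the kernel version as long as objects[] starts exactly at
sizeof(struct futexv), which holds for this layout.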
From d8120d2ee1729a6933a606a6720f3e3116e4f699 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Almeida?=
Date: Thu, 9 Jul 2020 11:34:40 -0300
Subject: [PATCH 03/13] selftests: futex: Add futex2 wake/wait test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a simple test to exercise the wake/wait mechanism using the futex2
interface. Create helper files so more tests can evaluate futex2.

While 32-bit ABIs from glibc aren't able to use 64-bit sized time
variables, add a temporary workaround that implements the required types
and calls the appropriate syscalls, since futex2 doesn't support 32-bit
sized time.
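Once glibc exposes 64-bit time on 32-bit targets (building with
-D_TIME_BITS=64), the workaround should reduce to plain clock_gettime()
plus the usual carry handling when turning a relative timeout into the
absolute deadline futex2 expects. A hypothetical sketch of that future
form, not part of this series:

  #include <time.h>

  /* Compute now + delta_ns on CLOCK_MONOTONIC, normalizing tv_nsec */
  static int absolute_deadline(struct timespec *to, long long delta_ns)
  {
          if (clock_gettime(CLOCK_MONOTONIC, to))
                  return -1;

          to->tv_nsec += delta_ns;
          while (to->tv_nsec >= 1000000000) {
                  to->tv_sec++;
                  to->tv_nsec -= 1000000000;
          }
          return 0;
  }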
Signed-off-by: André Almeida --- tools/include/uapi/asm-generic/unistd.h | 7 +- .../selftests/futex/functional/.gitignore | 1 + .../selftests/futex/functional/Makefile | 4 +- .../selftests/futex/functional/futex2_wait.c | 111 ++++++++++++++++++ .../testing/selftests/futex/functional/run.sh | 3 + .../selftests/futex/include/futex2test.h | 77 ++++++++++++ 6 files changed, 201 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/futex/functional/futex2_wait.c create mode 100644 tools/testing/selftests/futex/include/futex2test.h diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h index 995b36c2ea7d..dd457de21bad 100644 --- a/tools/include/uapi/asm-generic/unistd.h +++ b/tools/include/uapi/asm-generic/unistd.h @@ -860,8 +860,13 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) #define __NR_faccessat2 439 __SYSCALL(__NR_faccessat2, sys_faccessat2) +#define __NR_futex_wait 440 +__SYSCALL(__NR_futex_wait, sys_futex_wait) +#define __NR_futex_wake 441 +__SYSCALL(__NR_futex_wake, sys_futex_wake) + #undef __NR_syscalls -#define __NR_syscalls 440 +#define __NR_syscalls 442 /* * 32 bit systems traditionally used different diff --git a/tools/testing/selftests/futex/functional/.gitignore b/tools/testing/selftests/futex/functional/.gitignore index 0efcd494daab..d61f1df94360 100644 --- a/tools/testing/selftests/futex/functional/.gitignore +++ b/tools/testing/selftests/futex/functional/.gitignore @@ -6,3 +6,4 @@ futex_wait_private_mapped_file futex_wait_timeout futex_wait_uninitialized_heap futex_wait_wouldblock +futex2_wait diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile index 23207829ec75..7142a94a7ac3 100644 --- a/tools/testing/selftests/futex/functional/Makefile +++ b/tools/testing/selftests/futex/functional/Makefile @@ -5,6 +5,7 @@ LDLIBS := -lpthread -lrt HEADERS := \ ../include/futextest.h \ + ../include/futex2test.h \ ../include/atomic.h \ ../include/logging.h TEST_GEN_FILES := \ @@ -14,7 +15,8 @@ TEST_GEN_FILES := \ futex_requeue_pi_signal_restart \ futex_requeue_pi_mismatched_ops \ futex_wait_uninitialized_heap \ - futex_wait_private_mapped_file + futex_wait_private_mapped_file \ + futex2_wait TEST_PROGS := run.sh diff --git a/tools/testing/selftests/futex/functional/futex2_wait.c b/tools/testing/selftests/futex/functional/futex2_wait.c new file mode 100644 index 000000000000..752ed26803b3 --- /dev/null +++ b/tools/testing/selftests/futex/functional/futex2_wait.c @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/****************************************************************************** + * + * Copyright Collabora Ltd., 2020 + * + * DESCRIPTION + * Test wait/wake mechanism of futex2, using 32bit sized futexes. 
+ *
+ * AUTHOR
+ *	André Almeida
+ *
+ * HISTORY
+ *      2020-Jul-9: Initial version by André
+ *
+ *****************************************************************************/
+
+#include <errno.h>
+#include <error.h>
+#include <getopt.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <pthread.h>
+#include "futex2test.h"
+#include "logging.h"
+
+#define TEST_NAME "futex2-wait"
+#define timeout_ns  30000000
+#define WAKE_WAIT_US 10000
+futex_t f1 = FUTEX_INITIALIZER;
+
+void usage(char *prog)
+{
+	printf("Usage: %s\n", prog);
+	printf("  -c	Use color\n");
+	printf("  -h	Display this help message\n");
+	printf("  -v L	Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n",
+	       VQUIET, VCRITICAL, VINFO);
+}
+
+void *waiterfn(void *arg)
+{
+	struct timespec64 to64;
+
+	/* setting absolute timeout for futex2 */
+	if (gettime64(CLOCK_MONOTONIC, &to64))
+		error("gettime64 failed\n", errno);
+
+	to64.tv_nsec += timeout_ns;
+
+	if (to64.tv_nsec >= 1000000000) {
+		to64.tv_sec++;
+		to64.tv_nsec -= 1000000000;
+	}
+
+	if (futex2_wait(&f1, f1, FUTEX_PRIVATE_FLAG | FUTEX_32, &to64))
+		printf("waiter failed errno %d\n", errno);
+
+	return NULL;
+}
+
+int main(int argc, char *argv[])
+{
+	pthread_t waiter;
+	int res, ret = RET_PASS;
+	int c;
+
+	while ((c = getopt(argc, argv, "cht:v:")) != -1) {
+		switch (c) {
+		case 'c':
+			log_color(1);
+			break;
+		case 'h':
+			usage(basename(argv[0]));
+			exit(0);
+		case 'v':
+			log_verbosity(atoi(optarg));
+			break;
+		default:
+			usage(basename(argv[0]));
+			exit(1);
+		}
+	}
+
+	ksft_print_header();
+	ksft_set_plan(1);
+	ksft_print_msg("%s: Test FUTEX_WAIT\n",
+		       basename(argv[0]));
+
+	info("Calling futex_wait on f1: %u @ %p with val=%u\n", f1, &f1, f1);
+
+	if (pthread_create(&waiter, NULL, waiterfn, NULL))
+		error("pthread_create failed\n", errno);
+
+	usleep(WAKE_WAIT_US);
+
+	info("Calling futex2_wake on f1: %u @ %p with val=%u\n", f1, &f1, f1);
+	res = futex2_wake(&f1, 1, FUTEX_PRIVATE_FLAG | FUTEX_32);
+	if (res != 1) {
+		ksft_test_result_fail("futex2_wake returned: %d %s\n",
+				      res ? errno : res,
+				      res ?
strerror(errno) : "");
+		ret = RET_FAIL;
+	} else {
+		ksft_test_result_pass("futex2_wake succeeds\n");
+	}
+
+	ksft_print_cnts();
+	return ret;
+}
diff --git a/tools/testing/selftests/futex/functional/run.sh b/tools/testing/selftests/futex/functional/run.sh
index 1acb6ace1680..3730159c865a 100755
--- a/tools/testing/selftests/futex/functional/run.sh
+++ b/tools/testing/selftests/futex/functional/run.sh
@@ -73,3 +73,6 @@ echo
 echo
 ./futex_wait_uninitialized_heap $COLOR
 ./futex_wait_private_mapped_file $COLOR
+
+echo
+./futex2_wait $COLOR
diff --git a/tools/testing/selftests/futex/include/futex2test.h b/tools/testing/selftests/futex/include/futex2test.h
new file mode 100644
index 000000000000..807b8b57fe61
--- /dev/null
+++ b/tools/testing/selftests/futex/include/futex2test.h
@@ -0,0 +1,77 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/******************************************************************************
+ *
+ * Copyright Collabora Ltd., 2020
+ *
+ * DESCRIPTION
+ *	Futex2 library addons for the old futex library
+ *
+ * AUTHOR
+ *	André Almeida
+ *
+ * HISTORY
+ *      2020-Jul-9: Initial version by André
+ *
+ *****************************************************************************/
+#include "futextest.h"
+#include
+
+#define NSEC_PER_SEC 1000000000L
+
+#ifndef FUTEX_8
+# define FUTEX_8	0
+#endif
+#ifndef FUTEX_16
+# define FUTEX_16	1
+#endif
+#ifndef FUTEX_32
+# define FUTEX_32	2
+#endif
+#ifdef __x86_64__
+# ifndef FUTEX_64
+#  define FUTEX_64	3
+# endif
+#endif
+
+/*
+ * - Y2038 section for 32-bit applications -
+ *
+ * Remove this when glibc is ready for y2038. Then, always compile with
+ * `-DTIME_BITS=64` or `-D__USE_TIME_BITS64`. glibc will provide both
+ * timespec64 and clock_gettime64 so we won't need to define them here.
+ */
+#if defined(__i386__) || __TIMESIZE == 32
+# define NR_gettime __NR_clock_gettime64
+#else
+# define NR_gettime __NR_clock_gettime
+#endif

+struct timespec64 {
+	long long tv_sec;	/* seconds */
+	long long tv_nsec;	/* nanoseconds */
+};
+
+static inline int gettime64(clock_t clockid, struct timespec64 *tv)
+{
+	return syscall(NR_gettime, clockid, tv);
+}
+/*
+ * - End of Y2038 section -
+ */
+
+/*
+ * wait for uaddr if (*uaddr == val)
+ */
+static inline int futex2_wait(volatile void *uaddr, unsigned long val,
+			      unsigned long flags, struct timespec64 *timo)
+{
+	return syscall(__NR_futex_wait, uaddr, val, flags, timo);
+}
+
+/*
+ * wake nr futexes waiting for uaddr
+ */
+static inline int futex2_wake(volatile void *uaddr, unsigned int nr, unsigned long flags)
+{
+	return syscall(__NR_futex_wake, uaddr, nr, flags);
+}
--
2.28.0

From d4a7ca72f276b2e337eaedcbbe58a2782e0e7d3b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Almeida?=
Date: Thu, 9 Jul 2020 11:36:14 -0300
Subject: [PATCH 04/13] selftests: futex: Add futex2 timeout test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adapt the existing futex wait timeout file to test the same mechanism for
futex2.
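The behavioural difference the adapted test has to account for: the
original futex() FUTEX_WAIT takes a relative timeout, while futex2_wait()
takes an absolute CLOCK_MONOTONIC deadline. Side by side, using the
selftest helpers (sketch only):

  futex_t f1 = FUTEX_INITIALIZER;
  struct timespec to = { .tv_sec = 0, .tv_nsec = timeout_ns };
  struct timespec64 to64;

  /* old interface: relative timeout, starts counting at the call */
  futex_wait(&f1, f1, &to, FUTEX_PRIVATE_FLAG);

  /* futex2: absolute deadline, computed from the current time */
  gettime64(CLOCK_MONOTONIC, &to64);
  to64.tv_nsec += timeout_ns;
  if (to64.tv_nsec >= 1000000000) {
          to64.tv_sec++;
          to64.tv_nsec -= 1000000000;
  }
  futex2_wait(&f1, f1, FUTEX_PRIVATE_FLAG | FUTEX_32, &to64);

Both calls give up after roughly timeout_ns if nobody wakes the futex.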
Signed-off-by: André Almeida --- .../futex/functional/futex_wait_timeout.c | 38 ++++++++++++++----- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/futex/functional/futex_wait_timeout.c b/tools/testing/selftests/futex/functional/futex_wait_timeout.c index ee55e6d389a3..d2e7ae18985b 100644 --- a/tools/testing/selftests/futex/functional/futex_wait_timeout.c +++ b/tools/testing/selftests/futex/functional/futex_wait_timeout.c @@ -11,6 +11,7 @@ * * HISTORY * 2009-Nov-6: Initial version by Darren Hart + * 2020-Jul-9: Add futex2 test by André * *****************************************************************************/ @@ -20,7 +21,7 @@ #include #include #include -#include "futextest.h" +#include "futex2test.h" #include "logging.h" #define TEST_NAME "futex-wait-timeout" @@ -40,7 +41,8 @@ void usage(char *prog) int main(int argc, char *argv[]) { futex_t f1 = FUTEX_INITIALIZER; - struct timespec to; + struct timespec to = {.tv_sec = 0, .tv_nsec = timeout_ns}; + struct timespec64 to64; int res, ret = RET_PASS; int c; @@ -65,22 +67,40 @@ int main(int argc, char *argv[]) } ksft_print_header(); - ksft_set_plan(1); + ksft_set_plan(2); ksft_print_msg("%s: Block on a futex and wait for timeout\n", basename(argv[0])); ksft_print_msg("\tArguments: timeout=%ldns\n", timeout_ns); - /* initialize timeout */ - to.tv_sec = 0; - to.tv_nsec = timeout_ns; - info("Calling futex_wait on f1: %u @ %p\n", f1, &f1); res = futex_wait(&f1, f1, &to, FUTEX_PRIVATE_FLAG); if (!res || errno != ETIMEDOUT) { - fail("futex_wait returned %d\n", ret < 0 ? errno : ret); + ksft_test_result_fail("futex_wait returned %d\n", ret < 0 ? errno : ret); + ret = RET_FAIL; + } else { + ksft_test_result_pass("futex_wait timeout succeeds\n"); + } + + /* setting absolute timeout for futex2 */ + if (gettime64(CLOCK_MONOTONIC, &to64)) + error("gettime64 failed\n", errno); + + to64.tv_nsec += timeout_ns; + + if (to64.tv_nsec >= 1000000000) { + to64.tv_sec++; + to64.tv_nsec -= 1000000000; + } + + info("Calling futex2_wait on f1: %u @ %p\n", f1, &f1); + res = futex2_wait(&f1, f1, FUTEX_PRIVATE_FLAG | FUTEX_32, &to64); + if (!res || errno != ETIMEDOUT) { + ksft_test_result_fail("futex2_wait returned %d\n", ret < 0 ? errno : ret); ret = RET_FAIL; + } else { + ksft_test_result_pass("futex2_wait timeout succeeds\n"); } - print_result(TEST_NAME, ret); + ksft_print_cnts(); return ret; } -- 2.28.0 From 6d2252d43d36a5eb2b9170351128007e27f47737 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Thu, 9 Jul 2020 11:37:42 -0300 Subject: [PATCH 05/13] selftests: futex: Add futex2 wouldblock test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adapt existing futex wait wouldblock file to test the same mechanism for futex2. 
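The property under test: when the futex word does not hold the expected
value at call time, the wait returns immediately instead of sleeping.
With the selftest helpers, the expected outcome looks like this sketch
(to64 set up as in the timeout test):

  futex_t f1 = FUTEX_INITIALIZER;
  struct timespec64 to64;   /* absolute deadline, computed beforehand */
  int res;

  /* Pass f1 + 1 as the expected value while the futex still holds f1 */
  res = futex2_wait(&f1, f1 + 1, FUTEX_PRIVATE_FLAG | FUTEX_32, &to64);
  /* Expected: res == -1 && errno == EWOULDBLOCK, without blocking */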
Signed-off-by: André Almeida --- .../futex/functional/futex_wait_wouldblock.c | 33 ++++++++++++++++--- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c index 0ae390ff8164..8187f0754cd2 100644 --- a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c +++ b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c @@ -12,6 +12,7 @@ * * HISTORY * 2009-Nov-14: Initial version by Gowrishankar + * 2020-Jul-9: Add futex2 test by André * *****************************************************************************/ @@ -21,7 +22,7 @@ #include #include #include -#include "futextest.h" +#include "futex2test.h" #include "logging.h" #define TEST_NAME "futex-wait-wouldblock" @@ -39,6 +40,7 @@ void usage(char *prog) int main(int argc, char *argv[]) { struct timespec to = {.tv_sec = 0, .tv_nsec = timeout_ns}; + struct timespec64 to64; futex_t f1 = FUTEX_INITIALIZER; int res, ret = RET_PASS; int c; @@ -61,18 +63,41 @@ int main(int argc, char *argv[]) } ksft_print_header(); - ksft_set_plan(1); + ksft_set_plan(2); ksft_print_msg("%s: Test the unexpected futex value in FUTEX_WAIT\n", basename(argv[0])); info("Calling futex_wait on f1: %u @ %p with val=%u\n", f1, &f1, f1+1); res = futex_wait(&f1, f1+1, &to, FUTEX_PRIVATE_FLAG); if (!res || errno != EWOULDBLOCK) { - fail("futex_wait returned: %d %s\n", + ksft_test_result_fail("futex_wait returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); ret = RET_FAIL; + } else { + ksft_test_result_pass("futex_wait wouldblock succeeds\n"); } - print_result(TEST_NAME, ret); + /* setting absolute timeout for futex2 */ + if (gettime64(CLOCK_MONOTONIC, &to64)) + error("gettime64 failed\n", errno); + + to64.tv_nsec += timeout_ns; + + if (to64.tv_nsec >= 1000000000) { + to64.tv_sec++; + to64.tv_nsec -= 1000000000; + } + + info("Calling futex2_wait on f1: %u @ %p with val=%u\n", f1, &f1, f1+1); + res = futex2_wait(&f1, f1+1, FUTEX_PRIVATE_FLAG | FUTEX_32, &to64); + if (!res || errno != EWOULDBLOCK) { + ksft_test_result_fail("futex2_wait returned: %d %s\n", + res ? errno : res, res ? strerror(errno) : ""); + ret = RET_FAIL; + } else { + ksft_test_result_pass("futex2_wait wouldblock succeeds\n"); + } + + ksft_print_cnts(); return ret; } -- 2.28.0 From 6b35a09be663f5a844e089f1ddd370137832e7a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Wed, 14 Oct 2020 16:10:09 -0300 Subject: [PATCH 06/13] DONOTMERGE: futex: Add a clone of futex implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For comparative performance tests between the original futex and the new futex2 interface, create a clone of the current futex. In that way, we can have a fair comparison, since the futex2 table will be empty with no contention for the bucket locks. Since futex is widely used in the host system, the performance tests could get misleading results by the tests competing with the system for resources. 
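For the comparative numbers, any harness that times identical wake/wait
cycles against both entry points will do; the series itself extends perf
bench (see the tools/perf changes below). A minimal illustrative timer,
with the syscall under test left as a callback:

  #include <time.h>

  /* Run fn() iters times and return the elapsed wall time in ns */
  static long long bench_ns(void (*fn)(void), int iters)
  {
          struct timespec a, b;
          int i;

          clock_gettime(CLOCK_MONOTONIC, &a);
          for (i = 0; i < iters; i++)
                  fn();
          clock_gettime(CLOCK_MONOTONIC, &b);

          return (b.tv_sec - a.tv_sec) * 1000000000LL +
                 (b.tv_nsec - a.tv_nsec);
  }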
Signed-off-by: André Almeida --- arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + include/linux/syscalls.h | 3 + include/uapi/asm-generic/unistd.h | 5 +- kernel/Makefile | 1 + kernel/futex1.c | 3384 +++++++++++++++++ kernel/sys_ni.c | 2 + tools/arch/x86/include/asm/unistd_64.h | 12 + tools/include/uapi/asm-generic/unistd.h | 6 +- .../arch/x86/entry/syscalls/syscall_64.tbl | 3 + tools/perf/bench/futex.h | 23 +- 11 files changed, 3438 insertions(+), 3 deletions(-) create mode 100644 kernel/futex1.c diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index c844c0cbf0e5..820fa53ccf75 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -447,3 +447,4 @@ 440 i386 futex_wait sys_futex_wait 441 i386 futex_wake sys_futex_wake 442 i386 futex_waitv sys_futex_waitv +443 i386 futex1 sys_futex1 diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 0901c26c6786..99795136cb98 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -364,6 +364,7 @@ 440 common futex_wait sys_futex_wait 441 common futex_wake sys_futex_wake 442 common futex_waitv sys_futex_waitv +443 common futex1 sys_futex1 # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 38c3a87dbfc2..0351f6ad09a9 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -596,6 +596,9 @@ asmlinkage long sys_futex_wait(void __user *uaddr, unsigned long val, asmlinkage long sys_futex_wake(void __user *uaddr, unsigned long nr_wake, unsigned long flags); +asmlinkage long sys_futex1(void __user *uaddr, unsigned long nr_wake, + unsigned long flags); + /* kernel/hrtimer.c */ asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp, struct __kernel_timespec __user *rmtp); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index d7ebbed0a18c..e3ba6cb1f76d 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -869,8 +869,11 @@ __SYSCALL(__NR_futex_wake, sys_futex_wake) #define __NR_futex_waitv 442 __SYSCALL(__NR_futex_waitv, sys_futex_waitv) +#define __NR_futex1 443 +__SYSCALL(__NR_futex1, sys_futex1) + #undef __NR_syscalls -#define __NR_syscalls 443 +#define __NR_syscalls 444 /* * 32 bit systems traditionally used different diff --git a/kernel/Makefile b/kernel/Makefile index 51ea9bc647bf..0fe55a8cb9e2 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -57,6 +57,7 @@ obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ obj-$(CONFIG_FUTEX) += futex.o +obj-$(CONFIG_FUTEX2) += futex1.o obj-$(CONFIG_FUTEX2) += futex2.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += smp.o diff --git a/kernel/futex1.c b/kernel/futex1.c new file mode 100644 index 000000000000..4f7bf312fefd --- /dev/null +++ b/kernel/futex1.c @@ -0,0 +1,3384 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Fast Userspace Mutexes (which I call "Futexes!"). 
+ * (C) Rusty Russell, IBM 2002 + * + * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar + * (C) Copyright 2003 Red Hat Inc, All Rights Reserved + * + * Removed page pinning, fix privately mapped COW pages and other cleanups + * (C) Copyright 2003, 2004 Jamie Lokier + * + * Robust futex support started by Ingo Molnar + * (C) Copyright 2006 Red Hat Inc, All Rights Reserved + * Thanks to Thomas Gleixner for suggestions, analysis and fixes. + * + * PI-futex support started by Ingo Molnar and Thomas Gleixner + * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2006 Timesys Corp., Thomas Gleixner + * + * PRIVATE futexes by Eric Dumazet + * Copyright (C) 2007 Eric Dumazet + * + * Requeue-PI support by Darren Hart + * Copyright (C) IBM Corporation, 2009 + * Thanks to Thomas Gleixner for conceptual design and careful reviews. + * + * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly + * enough at me, Linus for the original (flawed) idea, Matthew + * Kirkwood for proof-of-concept implementation. + * + * "The futexes are also cursed." + * "But they come in a choice of three flavours!" + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "locking/rtmutex_common.h" + +/* + * READ this before attempting to hack on futexes! + * + * Basic futex operation and ordering guarantees + * ============================================= + * + * The waiter reads the futex value in user space and calls + * futex_wait(). This function computes the hash bucket and acquires + * the hash bucket lock. After that it reads the futex user space value + * again and verifies that the data has not changed. If it has not changed + * it enqueues itself into the hash bucket, releases the hash bucket lock + * and schedules. + * + * The waker side modifies the user space value of the futex and calls + * futex_wake(). This function computes the hash bucket and acquires the + * hash bucket lock. Then it looks for waiters on that futex in the hash + * bucket and wakes them. + * + * In futex wake up scenarios where no tasks are blocked on a futex, taking + * the hb spinlock can be avoided and simply return. In order for this + * optimization to work, ordering guarantees must exist so that the waiter + * being added to the list is acknowledged when the list is concurrently being + * checked by the waker, avoiding scenarios like the following: + * + * CPU 0 CPU 1 + * val = *futex; + * sys_futex(WAIT, futex, val); + * futex_wait(futex, val); + * uval = *futex; + * *futex = newval; + * sys_futex(WAKE, futex); + * futex_wake(futex); + * if (queue_empty()) + * return; + * if (uval == val) + * lock(hash_bucket(futex)); + * queue(); + * unlock(hash_bucket(futex)); + * schedule(); + * + * This would cause the waiter on CPU 0 to wait forever because it + * missed the transition of the user space value from val to newval + * and the waker did not find the waiter in the hash bucket queue. + * + * The correct serialization ensures that a waiter either observes + * the changed user space value before blocking or is woken by a + * concurrent waker: + * + * CPU 0 CPU 1 + * val = *futex; + * sys_futex(WAIT, futex, val); + * futex_wait(futex, val); + * + * waiters++; (a) + * smp_mb(); (A) <-- paired with -. 
+ * | + * lock(hash_bucket(futex)); | + * | + * uval = *futex; | + * | *futex = newval; + * | sys_futex(WAKE, futex); + * | futex_wake(futex); + * | + * `--------> smp_mb(); (B) + * if (uval == val) + * queue(); + * unlock(hash_bucket(futex)); + * schedule(); if (waiters) + * lock(hash_bucket(futex)); + * else wake_waiters(futex); + * waiters--; (b) unlock(hash_bucket(futex)); + * + * Where (A) orders the waiters increment and the futex value read through + * atomic operations (see hb_waiters_inc) and where (B) orders the write + * to futex and the waiters read (see hb_waiters_pending()). + * + * This yields the following case (where X:=waiters, Y:=futex): + * + * X = Y = 0 + * + * w[X]=1 w[Y]=1 + * MB MB + * r[Y]=y r[X]=x + * + * Which guarantees that x==0 && y==0 is impossible; which translates back into + * the guarantee that we cannot both miss the futex variable change and the + * enqueue. + * + * Note that a new waiter is accounted for in (a) even when it is possible that + * the wait call can return error, in which case we backtrack from it in (b). + * Refer to the comment in queue_lock(). + * + * Similarly, in order to account for waiters being requeued on another + * address we always increment the waiters for the destination bucket before + * acquiring the lock. It then decrements them again after releasing it - + * the code that actually moves the futex(es) between hash buckets (requeue_futex) + * will do the additional required waiter count housekeeping. This is done for + * double_lock_hb() and double_unlock_hb(), respectively. + */ + +#ifdef CONFIG_HAVE_FUTEX_CMPXCHG +#define futex_cmpxchg_enabled 1 +#else +static int __read_mostly futex_cmpxchg_enabled; +#endif + +/* + * Futex flags used to encode options to functions and preserve them across + * restarts. + */ +#ifdef CONFIG_MMU +# define FLAGS_SHARED 0x01 +#else +/* + * NOMMU does not have per process address space. Let the compiler optimize + * code away. + */ +# define FLAGS_SHARED 0x00 +#endif +#define FLAGS_CLOCKRT 0x02 +#define FLAGS_HAS_TIMEOUT 0x04 + +/* + * Priority Inheritance state: + */ +struct futex_pi_state { + /* + * list of 'owned' pi_state instances - these have to be + * cleaned up in do_exit() if the task exits prematurely: + */ + struct list_head list; + + /* + * The PI object: + */ + struct rt_mutex pi_mutex; + + struct task_struct *owner; + refcount_t refcount; + + union futex_key key; +} __randomize_layout; + +/** + * struct futex_q - The hashed futex queue entry, one per waiting task + * @list: priority-sorted list of tasks waiting on this futex + * @task: the task waiting on the futex + * @lock_ptr: the hash bucket lock + * @key: the key the futex is hashed on + * @pi_state: optional priority inheritance state + * @rt_waiter: rt_waiter storage for use with requeue_pi + * @requeue_pi_key: the requeue_pi target futex key + * @bitset: bitset for the optional bitmasked wakeup + * + * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so + * we can wake only the relevant ones (hashed queues may be shared). + * + * A futex_q has a woken state, just like tasks have TASK_RUNNING. + * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. + * The order of wakeup is always to make the first condition true, then + * the second. + * + * PI futexes are typically woken before they are removed from the hash list via + * the rt_mutex code. See unqueue_me_pi(). 
+ */ +struct futex_q { + struct plist_node list; + + struct task_struct *task; + spinlock_t *lock_ptr; + union futex_key key; + struct futex_pi_state *pi_state; + struct rt_mutex_waiter *rt_waiter; + union futex_key *requeue_pi_key; + u32 bitset; +} __randomize_layout; + +static const struct futex_q futex_q_init = { + /* list gets initialized in queue_me()*/ + .key = FUTEX_KEY_INIT, + .bitset = FUTEX_BITSET_MATCH_ANY +}; + +/* + * Hash buckets are shared by all the futex_keys that hash to the same + * location. Each key may have multiple futex_q structures, one for each task + * waiting on a futex. + */ +struct futex_hash_bucket { + atomic_t waiters; + spinlock_t lock; + struct plist_head chain; +} ____cacheline_aligned_in_smp; + +/* + * The base of the bucket array and its size are always used together + * (after initialization only in hash_futex()), so ensure that they + * reside in the same cacheline. + */ +static struct { + struct futex_hash_bucket *queues; + unsigned long hashsize; +} __futex_data __read_mostly __aligned(2*sizeof(long)); +#define futex_queues (__futex_data.queues) +#define futex_hashsize (__futex_data.hashsize) + + +/* + * Fault injections for futexes. + */ +#ifdef CONFIG_FAIL_FUTEX + +static struct { + struct fault_attr attr; + + bool ignore_private; +} fail_futex = { + .attr = FAULT_ATTR_INITIALIZER, + .ignore_private = false, +}; + +static int __init setup_fail_futex(char *str) +{ + return setup_fault_attr(&fail_futex.attr, str); +} +__setup("fail_futex=", setup_fail_futex); + +static bool should_fail_futex(bool fshared) +{ + if (fail_futex.ignore_private && !fshared) + return false; + + return should_fail(&fail_futex.attr, 1); +} + +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS + +static int __init fail_futex_debugfs(void) +{ + umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; + struct dentry *dir; + + dir = fault_create_debugfs_attr("fail_futex", NULL, + &fail_futex.attr); + if (IS_ERR(dir)) + return PTR_ERR(dir); + + debugfs_create_bool("ignore-private", mode, dir, + &fail_futex.ignore_private); + return 0; +} + +late_initcall(fail_futex_debugfs); + +#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ + +#else +static inline bool should_fail_futex(bool fshared) +{ + return false; +} +#endif /* CONFIG_FAIL_FUTEX */ + +/* + * Reflects a new waiter being added to the waitqueue. + */ +static inline void hb_waiters_inc(struct futex_hash_bucket *hb) +{ +#ifdef CONFIG_SMP + atomic_inc(&hb->waiters); + /* + * Full barrier (A), see the ordering comment above. + */ + smp_mb__after_atomic(); +#endif +} + +/* + * Reflects a waiter being removed from the waitqueue by wakeup + * paths. + */ +static inline void hb_waiters_dec(struct futex_hash_bucket *hb) +{ +#ifdef CONFIG_SMP + atomic_dec(&hb->waiters); +#endif +} + +static inline int hb_waiters_pending(struct futex_hash_bucket *hb) +{ +#ifdef CONFIG_SMP + /* + * Full barrier (B), see the ordering comment above. + */ + smp_mb(); + return atomic_read(&hb->waiters); +#else + return 1; +#endif +} + +/** + * hash_futex - Return the hash bucket in the global hash + * @key: Pointer to the futex key for which the hash is calculated + * + * We hash on the keys returned from get_futex_key (see below) and return the + * corresponding hash bucket in the global hash. 
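+ *
+ * Illustration (the size is an example only): with futex_hashsize == 256
+ * the lookup reduces to
+ *
+ *	&futex_queues[hash & 255]
+ *
+ * so unrelated futexes may share a bucket; the queued futex_q's are then
+ * told apart by match_futex() on the full key.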
+ */ +static struct futex_hash_bucket *hash_futex(union futex_key *key) +{ + u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4, + key->both.offset); + + return &futex_queues[hash & (futex_hashsize - 1)]; +} + + +/** + * match_futex - Check whether two futex keys are equal + * @key1: Pointer to key1 + * @key2: Pointer to key2 + * + * Return 1 if two futex_keys are equal, 0 otherwise. + */ +static inline int match_futex(union futex_key *key1, union futex_key *key2) +{ + return (key1 && key2 + && key1->both.word == key2->both.word + && key1->both.ptr == key2->both.ptr + && key1->both.offset == key2->both.offset); +} + +enum futex_access { + FUTEX_READ, + FUTEX_WRITE +}; + +/** + * futex_setup_timer - set up the sleeping hrtimer. + * @time: ptr to the given timeout value + * @timeout: the hrtimer_sleeper structure to be set up + * @flags: futex flags + * @range_ns: optional range in ns + * + * Return: Initialized hrtimer_sleeper structure or NULL if no timeout + * value given + */ +static inline struct hrtimer_sleeper * +futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, + int flags, u64 range_ns) +{ + if (!time) + return NULL; + + hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ? + CLOCK_REALTIME : CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); + /* + * If range_ns is 0, calling hrtimer_set_expires_range_ns() is + * effectively the same as calling hrtimer_set_expires(). + */ + hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns); + + return timeout; +} + +/* + * Generate a machine wide unique identifier for this inode. + * + * This relies on u64 not wrapping in the life-time of the machine; which with + * 1ns resolution means almost 585 years. + * + * This further relies on the fact that a well formed program will not unmap + * the file while it has a (shared) futex waiting on it. This mapping will have + * a file reference which pins the mount and inode. + * + * If for some reason an inode gets evicted and read back in again, it will get + * a new sequence number and will _NOT_ match, even though it is the exact same + * file. + * + * It is important that match_futex() will never have a false-positive, esp. + * for PI futexes that can mess up the state. The above argues that false-negatives + * are only possible for malformed programs. + */ +static u64 get_inode_sequence_number(struct inode *inode) +{ + static atomic64_t i_seq; + u64 old; + + /* Does the inode already have a sequence number? */ + old = atomic64_read(&inode->i_sequence); + if (likely(old)) + return old; + + for (;;) { + u64 new = atomic64_add_return(1, &i_seq); + if (WARN_ON_ONCE(!new)) + continue; + + old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new); + if (old) + return old; + return new; + } +} + +/** + * get_futex_key() - Get parameters which are the keys for a futex + * @uaddr: virtual address of the futex + * @fshared: false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED + * @key: address where result is stored. + * @rw: mapping needs to be read/write (values: FUTEX_READ, + * FUTEX_WRITE) + * + * Return: a negative error code or 0 + * + * The key words are stored in @key on success. 
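+ *
+ * A typical call, for illustration, mirrors the one in futex_wake()
+ * below:
+ *
+ *	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ);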
+ *
+ * For shared mappings (when @fshared), the key is:
+ *
+ *   ( inode->i_sequence, page->index, offset_within_page )
+ *
+ * [ also see get_inode_sequence_number() ]
+ *
+ * For private mappings (or when !@fshared), the key is:
+ *
+ *   ( current->mm, address, 0 )
+ *
+ * This allows (cross process, where applicable) identification of the futex
+ * without keeping the page pinned for the duration of the FUTEX_WAIT.
+ *
+ * lock_page() might sleep; the caller should not hold a spinlock.
+ */
+static int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
+			 enum futex_access rw)
+{
+	unsigned long address = (unsigned long)uaddr;
+	struct mm_struct *mm = current->mm;
+	struct page *page, *tail;
+	struct address_space *mapping;
+	int err, ro = 0;
+
+	/*
+	 * The futex address must be "naturally" aligned.
+	 */
+	key->both.offset = address % PAGE_SIZE;
+	if (unlikely((address % sizeof(u32)) != 0))
+		return -EINVAL;
+	address -= key->both.offset;
+
+	if (unlikely(!access_ok(uaddr, sizeof(u32))))
+		return -EFAULT;
+
+	if (unlikely(should_fail_futex(fshared)))
+		return -EFAULT;
+
+	/*
+	 * PROCESS_PRIVATE futexes are fast.
+	 * As the mm cannot disappear under us and the 'key' only needs
+	 * the virtual address, we don't even have to find the underlying vma.
+	 * Note: We do have to check that 'uaddr' is a valid user address,
+	 * but access_ok() should be faster than find_vma().
+	 */
+	if (!fshared) {
+		key->private.mm = mm;
+		key->private.address = address;
+		return 0;
+	}
+
+again:
+	/* Ignore any VERIFY_READ mapping (futex common case) */
+	if (unlikely(should_fail_futex(true)))
+		return -EFAULT;
+
+	err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
+	/*
+	 * If write access is not required (e.g. FUTEX_WAIT), try
+	 * and get read-only access.
+	 */
+	if (err == -EFAULT && rw == FUTEX_READ) {
+		err = get_user_pages_fast(address, 1, 0, &page);
+		ro = 1;
+	}
+	if (err < 0)
+		return err;
+	else
+		err = 0;
+
+	/*
+	 * The treatment of mapping from this point on is critical. The page
+	 * lock protects many things but in this context the page lock
+	 * stabilizes mapping, prevents inode freeing in the shared
+	 * file-backed region case and guards against movement to swap cache.
+	 *
+	 * Strictly speaking the page lock is not needed in all cases being
+	 * considered here and the page lock forces unnecessary serialization.
+	 * From this point on, mapping will be re-verified if necessary and
+	 * the page lock will be acquired only if it is unavoidable.
+	 *
+	 * Mapping checks require the head page for any compound page so the
+	 * head page and mapping are looked up now. For anonymous pages, it
+	 * does not matter if the page splits in the future as the key is
+	 * based on the address. For filesystem-backed pages, the tail is
+	 * required as the index of the page determines the key. For
+	 * base pages, there is no tail page and tail == page.
+	 */
+	tail = page;
+	page = compound_head(page);
+	mapping = READ_ONCE(page->mapping);
+
+	/*
+	 * If page->mapping is NULL, then it cannot be a PageAnon
+	 * page; but it might be the ZERO_PAGE or in the gate area or
+	 * in a special mapping (all cases which we are happy to fail);
+	 * or it may have been a good file page when get_user_pages_fast
+	 * found it, but truncated or holepunched or subjected to
+	 * invalidate_complete_page2 before we got the page lock (also
+	 * cases which we are happy to fail). And we hold a reference,
+	 * so refcount care in invalidate_complete_page's remove_mapping
+	 * prevents drop_caches from setting mapping to NULL beneath us.
+ * + * The case we do have to guard against is when memory pressure made + * shmem_writepage move it from filecache to swapcache beneath us: + * an unlikely race, but we do need to retry for page->mapping. + */ + if (unlikely(!mapping)) { + int shmem_swizzled; + + /* + * Page lock is required to identify which special case above + * applies. If this is really a shmem page then the page lock + * will prevent unexpected transitions. + */ + lock_page(page); + shmem_swizzled = PageSwapCache(page) || page->mapping; + unlock_page(page); + put_page(page); + + if (shmem_swizzled) + goto again; + + return -EFAULT; + } + + /* + * Private mappings are handled in a simple way. + * + * If the futex key is stored on an anonymous page, then the associated + * object is the mm which is implicitly pinned by the calling process. + * + * NOTE: When userspace waits on a MAP_SHARED mapping, even if + * it's a read-only handle, it's expected that futexes attach to + * the object not the particular process. + */ + if (PageAnon(page)) { + /* + * A RO anonymous page will never change and thus doesn't make + * sense for futex operations. + */ + if (unlikely(should_fail_futex(true)) || ro) { + err = -EFAULT; + goto out; + } + + key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ + key->private.mm = mm; + key->private.address = address; + + } else { + struct inode *inode; + + /* + * The associated futex object in this case is the inode and + * the page->mapping must be traversed. Ordinarily this should + * be stabilised under page lock but it's not strictly + * necessary in this case as we just want to pin the inode, not + * update the radix tree or anything like that. + * + * The RCU read lock is taken as the inode is finally freed + * under RCU. If the mapping still matches expectations then the + * mapping->host can be safely accessed as being a valid inode. + */ + rcu_read_lock(); + + if (READ_ONCE(page->mapping) != mapping) { + rcu_read_unlock(); + put_page(page); + + goto again; + } + + inode = READ_ONCE(mapping->host); + if (!inode) { + rcu_read_unlock(); + put_page(page); + + goto again; + } + + key->both.offset |= FUT_OFF_INODE; /* inode-based key */ + key->shared.i_seq = get_inode_sequence_number(inode); + key->shared.pgoff = basepage_index(tail); + rcu_read_unlock(); + } + +out: + put_page(page); + return err; +} + +/** + * fault_in_user_writeable() - Fault in user address and verify RW access + * @uaddr: pointer to faulting user space address + * + * Slow path to fixup the fault we just took in the atomic write + * access to @uaddr. + * + * We have no generic implementation of a non-destructive write to the + * user address. We know that we faulted in the atomic pagefault + * disabled section so we can as well avoid the #PF overhead by + * calling get_user_pages() right away. + */ +static int fault_in_user_writeable(u32 __user *uaddr) +{ + struct mm_struct *mm = current->mm; + int ret; + + mmap_read_lock(mm); + ret = fixup_user_fault(mm, (unsigned long)uaddr, + FAULT_FLAG_WRITE, NULL); + mmap_read_unlock(mm); + + return ret < 0 ? ret : 0; +} + +/** + * futex_top_waiter() - Return the highest priority waiter on a futex + * @hb: the hash bucket the futex_q's reside in + * @key: the futex key (to distinguish it from other futex futex_q's) + * + * Must be called with the hb lock held. 
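+ *
+ * Illustrative use, as in lookup_pi_state() below (with the hb lock held):
+ *
+ *	top_waiter = futex_top_waiter(hb, &key);
+ *	if (top_waiter)
+ *		return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);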
+ */
+static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
+					union futex_key *key)
+{
+	struct futex_q *this;
+
+	plist_for_each_entry(this, &hb->chain, list) {
+		if (match_futex(&this->key, key))
+			return this;
+	}
+	return NULL;
+}
+
+static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
+				      u32 uval, u32 newval)
+{
+	int ret;
+
+	pagefault_disable();
+	ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
+	pagefault_enable();
+
+	return ret;
+}
+
+static int get_futex_value_locked(u32 *dest, u32 __user *from)
+{
+	int ret;
+
+	pagefault_disable();
+	ret = __get_user(*dest, from);
+	pagefault_enable();
+
+	return ret ? -EFAULT : 0;
+}
+
+
+/*
+ * PI code:
+ */
+static int refill_pi_state_cache(void)
+{
+	struct futex_pi_state *pi_state;
+
+	if (likely(current->pi_state_cache))
+		return 0;
+
+	pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
+
+	if (!pi_state)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&pi_state->list);
+	/* pi_mutex gets initialized later */
+	pi_state->owner = NULL;
+	refcount_set(&pi_state->refcount, 1);
+	pi_state->key = FUTEX_KEY_INIT;
+
+	current->pi_state_cache = pi_state;
+
+	return 0;
+}
+
+static struct futex_pi_state *alloc_pi_state(void)
+{
+	struct futex_pi_state *pi_state = current->pi_state_cache;
+
+	WARN_ON(!pi_state);
+	current->pi_state_cache = NULL;
+
+	return pi_state;
+}
+
+static void get_pi_state(struct futex_pi_state *pi_state)
+{
+	WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
+}
+
+/*
+ * Drops a reference to the pi_state object and frees or caches it
+ * when the last reference is gone.
+ */
+static void put_pi_state(struct futex_pi_state *pi_state)
+{
+	if (!pi_state)
+		return;
+
+	if (!refcount_dec_and_test(&pi_state->refcount))
+		return;
+
+	/*
+	 * If pi_state->owner is NULL, the owner is most probably dying
+	 * and has cleaned up the pi_state already.
+	 */
+	if (pi_state->owner) {
+		struct task_struct *owner;
+
+		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+		owner = pi_state->owner;
+		if (owner) {
+			raw_spin_lock(&owner->pi_lock);
+			list_del_init(&pi_state->list);
+			raw_spin_unlock(&owner->pi_lock);
+		}
+		rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner);
+		raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+	}
+
+	if (current->pi_state_cache) {
+		kfree(pi_state);
+	} else {
+		/*
+		 * pi_state->list is already empty.
+		 * clear pi_state->owner.
+		 * refcount is at 0 - put it back to 1.
+		 */
+		pi_state->owner = NULL;
+		refcount_set(&pi_state->refcount, 1);
+		current->pi_state_cache = pi_state;
+	}
+}
+
+/*
+ * We need to check the following states:
+ *
+ *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
+ *
+ * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
+ * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
+ *
+ * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
+ *
+ * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
+ * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
+ *
+ * [6]  Found  | Found    | task      | 0         | 1      | Valid
+ *
+ * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
+ *
+ * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
+ * [9]  Found  | Found    | task      | 0         | 0      | Invalid
+ * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
+ *
+ * [1] Indicates that the kernel can acquire the futex atomically. We
+ *     came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
+ *
+ * [2] Valid, if TID does not belong to a kernel thread. If no matching
+ *     thread is found then it indicates that the owner TID has died.
+ *
+ * [3] Invalid.
The waiter is queued on a non PI futex + * + * [4] Valid state after exit_robust_list(), which sets the user space + * value to FUTEX_WAITERS | FUTEX_OWNER_DIED. + * + * [5] The user space value got manipulated between exit_robust_list() + * and exit_pi_state_list() + * + * [6] Valid state after exit_pi_state_list() which sets the new owner in + * the pi_state but cannot access the user space value. + * + * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set. + * + * [8] Owner and user space value match + * + * [9] There is no transient state which sets the user space TID to 0 + * except exit_robust_list(), but this is indicated by the + * FUTEX_OWNER_DIED bit. See [4] + * + * [10] There is no transient state which leaves owner and user space + * TID out of sync. + * + * + * Serialization and lifetime rules: + * + * hb->lock: + * + * hb -> futex_q, relation + * futex_q -> pi_state, relation + * + * (cannot be raw because hb can contain arbitrary amount + * of futex_q's) + * + * pi_mutex->wait_lock: + * + * {uval, pi_state} + * + * (and pi_mutex 'obviously') + * + * p->pi_lock: + * + * p->pi_state_list -> pi_state->list, relation + * + * pi_state->refcount: + * + * pi_state lifetime + * + * + * Lock order: + * + * hb->lock + * pi_mutex->wait_lock + * p->pi_lock + * + */ + +/* + * Validate that the existing waiter has a pi_state and sanity check + * the pi_state against the user space value. If correct, attach to + * it. + */ +static int attach_to_pi_state(u32 __user *uaddr, u32 uval, + struct futex_pi_state *pi_state, + struct futex_pi_state **ps) +{ + pid_t pid = uval & FUTEX_TID_MASK; + u32 uval2; + int ret; + + /* + * Userspace might have messed up non-PI and PI futexes [3] + */ + if (unlikely(!pi_state)) + return -EINVAL; + + /* + * We get here with hb->lock held, and having found a + * futex_top_waiter(). This means that futex_lock_pi() of said futex_q + * has dropped the hb->lock in between queue_me() and unqueue_me_pi(), + * which in turn means that futex_lock_pi() still has a reference on + * our pi_state. + * + * The waiter holding a reference on @pi_state also protects against + * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi() + * and futex_wait_requeue_pi() as it cannot go to 0 and consequently + * free pi_state before we can take a reference ourselves. + */ + WARN_ON(!refcount_read(&pi_state->refcount)); + + /* + * Now that we have a pi_state, we can acquire wait_lock + * and do the state validation. + */ + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + + /* + * Since {uval, pi_state} is serialized by wait_lock, and our current + * uval was read without holding it, it can have changed. Verify it + * still is what we expect it to be, otherwise retry the entire + * operation. + */ + if (get_futex_value_locked(&uval2, uaddr)) + goto out_efault; + + if (uval != uval2) + goto out_eagain; + + /* + * Handle the owner died case: + */ + if (uval & FUTEX_OWNER_DIED) { + /* + * exit_pi_state_list sets owner to NULL and wakes the + * topmost waiter. The task which acquires the + * pi_state->rt_mutex will fixup owner. + */ + if (!pi_state->owner) { + /* + * No pi state owner, but the user space TID + * is not 0. Inconsistent state. [5] + */ + if (pid) + goto out_einval; + /* + * Take a ref on the state and return success. 
[4] + */ + goto out_attach; + } + + /* + * If TID is 0, then either the dying owner has not + * yet executed exit_pi_state_list() or some waiter + * acquired the rtmutex in the pi state, but did not + * yet fixup the TID in user space. + * + * Take a ref on the state and return success. [6] + */ + if (!pid) + goto out_attach; + } else { + /* + * If the owner died bit is not set, then the pi_state + * must have an owner. [7] + */ + if (!pi_state->owner) + goto out_einval; + } + + /* + * Bail out if user space manipulated the futex value. If pi + * state exists then the owner TID must be the same as the + * user space TID. [9/10] + */ + if (pid != task_pid_vnr(pi_state->owner)) + goto out_einval; + +out_attach: + get_pi_state(pi_state); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + *ps = pi_state; + return 0; + +out_einval: + ret = -EINVAL; + goto out_error; + +out_eagain: + ret = -EAGAIN; + goto out_error; + +out_efault: + ret = -EFAULT; + goto out_error; + +out_error: + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + return ret; +} + +/** + * wait_for_owner_exiting - Block until the owner has exited + * @ret: owner's current futex lock status + * @exiting: Pointer to the exiting task + * + * Caller must hold a refcount on @exiting. + */ +static void wait_for_owner_exiting(int ret, struct task_struct *exiting) +{ + if (ret != -EBUSY) { + WARN_ON_ONCE(exiting); + return; + } + + if (WARN_ON_ONCE(ret == -EBUSY && !exiting)) + return; + + mutex_lock(&exiting->futex_exit_mutex); + /* + * No point in doing state checking here. If the waiter got here + * while the task was in exec()->exec_futex_release() then it can + * have any FUTEX_STATE_* value when the waiter has acquired the + * mutex. OK, if running, EXITING or DEAD if it reached exit() + * already. Highly unlikely and not a problem. Just one more round + * through the futex maze. + */ + mutex_unlock(&exiting->futex_exit_mutex); + + put_task_struct(exiting); +} + +static int handle_exit_race(u32 __user *uaddr, u32 uval, + struct task_struct *tsk) +{ + u32 uval2; + + /* + * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the + * caller that the alleged owner is busy. + */ + if (tsk && tsk->futex_state != FUTEX_STATE_DEAD) + return -EBUSY; + + /* + * Reread the user space value to handle the following situation: + * + * CPU0 CPU1 + * + * sys_exit() sys_futex() + * do_exit() futex_lock_pi() + * futex_lock_pi_atomic() + * exit_signals(tsk) No waiters: + * tsk->flags |= PF_EXITING; *uaddr == 0x00000PID + * mm_release(tsk) Set waiter bit + * exit_robust_list(tsk) { *uaddr = 0x80000PID; + * Set owner died attach_to_pi_owner() { + * *uaddr = 0xC0000000; tsk = get_task(PID); + * } if (!tsk->flags & PF_EXITING) { + * ... attach(); + * tsk->futex_state = } else { + * FUTEX_STATE_DEAD; if (tsk->futex_state != + * FUTEX_STATE_DEAD) + * return -EAGAIN; + * return -ESRCH; <--- FAIL + * } + * + * Returning ESRCH unconditionally is wrong here because the + * user space value has been changed by the exiting task. + * + * The same logic applies to the case where the exiting task is + * already gone. + */ + if (get_futex_value_locked(&uval2, uaddr)) + return -EFAULT; + + /* If the user space value has changed, try again. */ + if (uval2 != uval) + return -EAGAIN; + + /* + * The exiting task did not have a robust list, the robust list was + * corrupted or the user space value in *uaddr is simply bogus. + * Give up and tell user space. 
+ */ + return -ESRCH; +} + +/* + * Lookup the task for the TID provided from user space and attach to + * it after doing proper sanity checks. + */ +static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key, + struct futex_pi_state **ps, + struct task_struct **exiting) +{ + pid_t pid = uval & FUTEX_TID_MASK; + struct futex_pi_state *pi_state; + struct task_struct *p; + + /* + * We are the first waiter - try to look up the real owner and attach + * the new pi_state to it, but bail out when TID = 0 [1] + * + * The !pid check is paranoid. None of the call sites should end up + * with pid == 0, but better safe than sorry. Let the caller retry + */ + if (!pid) + return -EAGAIN; + p = find_get_task_by_vpid(pid); + if (!p) + return handle_exit_race(uaddr, uval, NULL); + + if (unlikely(p->flags & PF_KTHREAD)) { + put_task_struct(p); + return -EPERM; + } + + /* + * We need to look at the task state to figure out, whether the + * task is exiting. To protect against the change of the task state + * in futex_exit_release(), we do this protected by p->pi_lock: + */ + raw_spin_lock_irq(&p->pi_lock); + if (unlikely(p->futex_state != FUTEX_STATE_OK)) { + /* + * The task is on the way out. When the futex state is + * FUTEX_STATE_DEAD, we know that the task has finished + * the cleanup: + */ + int ret = handle_exit_race(uaddr, uval, p); + + raw_spin_unlock_irq(&p->pi_lock); + /* + * If the owner task is between FUTEX_STATE_EXITING and + * FUTEX_STATE_DEAD then store the task pointer and keep + * the reference on the task struct. The calling code will + * drop all locks, wait for the task to reach + * FUTEX_STATE_DEAD and then drop the refcount. This is + * required to prevent a live lock when the current task + * preempted the exiting task between the two states. + */ + if (ret == -EBUSY) + *exiting = p; + else + put_task_struct(p); + return ret; + } + + /* + * No existing pi state. First waiter. [2] + * + * This creates pi_state, we have hb->lock held, this means nothing can + * observe this state, wait_lock is irrelevant. + */ + pi_state = alloc_pi_state(); + + /* + * Initialize the pi_mutex in locked state and make @p + * the owner of it: + */ + rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); + + /* Store the key for possible exit cleanups: */ + pi_state->key = *key; + + WARN_ON(!list_empty(&pi_state->list)); + list_add(&pi_state->list, &p->pi_state_list); + /* + * Assignment without holding pi_state->pi_mutex.wait_lock is safe + * because there is no concurrency as the object is not published yet. + */ + pi_state->owner = p; + raw_spin_unlock_irq(&p->pi_lock); + + put_task_struct(p); + + *ps = pi_state; + + return 0; +} + +static int lookup_pi_state(u32 __user *uaddr, u32 uval, + struct futex_hash_bucket *hb, + union futex_key *key, struct futex_pi_state **ps, + struct task_struct **exiting) +{ + struct futex_q *top_waiter = futex_top_waiter(hb, key); + + /* + * If there is a waiter on that futex, validate it and + * attach to the pi_state when the validation succeeds. + */ + if (top_waiter) + return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps); + + /* + * We are the first waiter - try to look up the owner based on + * @uval and attach to it. 
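+	 *
+	 * (Worked example, value illustrative: for uval == 0x80000PID the
+	 * owner TID is uval & FUTEX_TID_MASK == PID; the FUTEX_WAITERS bit
+	 * plays no part in the lookup.)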
+ */ + return attach_to_pi_owner(uaddr, uval, key, ps, exiting); +} + +static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) +{ + int err; + u32 curval; + + if (unlikely(should_fail_futex(true))) + return -EFAULT; + + err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); + if (unlikely(err)) + return err; + + /* If user space value changed, let the caller retry */ + return curval != uval ? -EAGAIN : 0; +} + +/** + * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex + * @uaddr: the pi futex user address + * @hb: the pi futex hash bucket + * @key: the futex key associated with uaddr and hb + * @ps: the pi_state pointer where we store the result of the + * lookup + * @task: the task to perform the atomic lock work for. This will + * be "current" except in the case of requeue pi. + * @exiting: Pointer to store the task pointer of the owner task + * which is in the middle of exiting + * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) + * + * Return: + * - 0 - ready to wait; + * - 1 - acquired the lock; + * - <0 - error + * + * The hb->lock and futex_key refs shall be held by the caller. + * + * @exiting is only set when the return value is -EBUSY. If so, this holds + * a refcount on the exiting task on return and the caller needs to drop it + * after waiting for the exit to complete. + */ +static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, + union futex_key *key, + struct futex_pi_state **ps, + struct task_struct *task, + struct task_struct **exiting, + int set_waiters) +{ + u32 uval, newval, vpid = task_pid_vnr(task); + struct futex_q *top_waiter; + int ret; + + /* + * Read the user space value first so we can validate a few + * things before proceeding further. + */ + if (get_futex_value_locked(&uval, uaddr)) + return -EFAULT; + + if (unlikely(should_fail_futex(true))) + return -EFAULT; + + /* + * Detect deadlocks. + */ + if ((unlikely((uval & FUTEX_TID_MASK) == vpid))) + return -EDEADLK; + + if ((unlikely(should_fail_futex(true)))) + return -EDEADLK; + + /* + * Lookup existing state first. If it exists, try to attach to + * its pi_state. + */ + top_waiter = futex_top_waiter(hb, key); + if (top_waiter) + return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps); + + /* + * No waiter and user TID is 0. We are here because the + * waiters or the owner died bit is set or called from + * requeue_cmp_pi or for whatever reason something took the + * syscall. + */ + if (!(uval & FUTEX_TID_MASK)) { + /* + * We take over the futex. No other waiters and the user space + * TID is 0. We preserve the owner died bit. + */ + newval = uval & FUTEX_OWNER_DIED; + newval |= vpid; + + /* The futex requeue_pi code can enforce the waiters bit */ + if (set_waiters) + newval |= FUTEX_WAITERS; + + ret = lock_pi_update_atomic(uaddr, uval, newval); + /* If the take over worked, return 1 */ + return ret < 0 ? ret : 1; + } + + /* + * First waiter. Set the waiters bit before attaching ourself to + * the owner. If owner tries to unlock, it will be forced into + * the kernel and blocked on hb->lock. + */ + newval = uval | FUTEX_WAITERS; + ret = lock_pi_update_atomic(uaddr, uval, newval); + if (ret) + return ret; + /* + * If the update of the user space value succeeded, we try to + * attach to the owner. If that fails, no harm done, we only + * set the FUTEX_WAITERS bit in the user space variable. 
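+	 *
+	 * (Illustrative transition: an owned, uncontended futex goes from
+	 * 0x00000PID to 0x80000PID here, which forces the eventual unlock
+	 * into the kernel.)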
+	 */
+	return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
+}
+
+/**
+ * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
+ * @q:	The futex_q to unqueue
+ *
+ * The q->lock_ptr must not be NULL and must be held by the caller.
+ */
+static void __unqueue_futex(struct futex_q *q)
+{
+	struct futex_hash_bucket *hb;
+
+	if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list)))
+		return;
+	lockdep_assert_held(q->lock_ptr);
+
+	hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
+	plist_del(&q->list, &hb->chain);
+	hb_waiters_dec(hb);
+}
+
+/*
+ * The hash bucket lock must be held when this is called.
+ * Afterwards, the futex_q must not be accessed. Callers
+ * must ensure to later call wake_up_q() for the actual
+ * wakeups to occur.
+ */
+static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
+{
+	struct task_struct *p = q->task;
+
+	if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
+		return;
+
+	get_task_struct(p);
+	__unqueue_futex(q);
+	/*
+	 * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
+	 * is written, without taking any locks. This is possible in the event
+	 * of a spurious wakeup, for example. A memory barrier is required here
+	 * to prevent the following store to lock_ptr from getting ahead of the
+	 * plist_del in __unqueue_futex().
+	 */
+	smp_store_release(&q->lock_ptr, NULL);
+
+	/*
+	 * Queue the task for later wakeup for after we've released
+	 * the hb->lock.
+	 */
+	wake_q_add_safe(wake_q, p);
+}
+
+/*
+ * Caller must hold a reference on @pi_state.
+ */
+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
+{
+	u32 curval, newval;
+	struct task_struct *new_owner;
+	bool postunlock = false;
+	DEFINE_WAKE_Q(wake_q);
+	int ret = 0;
+
+	new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
+	if (WARN_ON_ONCE(!new_owner)) {
+		/*
+		 * As per the comment in futex_unlock_pi() this should not happen.
+		 *
+		 * When this happens, give up our locks and try again, giving
+		 * the futex_lock_pi() instance time to complete, either by
+		 * waiting on the rtmutex or removing itself from the futex
+		 * queue.
+		 */
+		ret = -EAGAIN;
+		goto out_unlock;
+	}
+
+	/*
+	 * We pass it to the next owner. The WAITERS bit is always kept
+	 * enabled while there is PI state around. We cleanup the owner
+	 * died bit, because we are the owner.
+	 */
+	newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
+
+	if (unlikely(should_fail_futex(true))) {
+		ret = -EFAULT;
+		goto out_unlock;
+	}
+
+	ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
+	if (!ret && (curval != uval)) {
+		/*
+		 * If an unconditional UNLOCK_PI operation (user space did not
+		 * try the TID->0 transition) raced with a waiter setting the
+		 * FUTEX_WAITERS flag between get_user() and locking the hash
+		 * bucket lock, retry the operation.
+		 */
+		if ((FUTEX_TID_MASK & curval) == uval)
+			ret = -EAGAIN;
+		else
+			ret = -EINVAL;
+	}
+
+	if (ret)
+		goto out_unlock;
+
+	/*
+	 * This is a point of no return; once we modify the uval there is no
+	 * going back and subsequent operations must not fail.
+ */ + + raw_spin_lock(&pi_state->owner->pi_lock); + WARN_ON(list_empty(&pi_state->list)); + list_del_init(&pi_state->list); + raw_spin_unlock(&pi_state->owner->pi_lock); + + raw_spin_lock(&new_owner->pi_lock); + WARN_ON(!list_empty(&pi_state->list)); + list_add(&pi_state->list, &new_owner->pi_state_list); + pi_state->owner = new_owner; + raw_spin_unlock(&new_owner->pi_lock); + + postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); + +out_unlock: + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + + if (postunlock) + rt_mutex_postunlock(&wake_q); + + return ret; +} + +/* + * Express the locking dependencies for lockdep: + */ +static inline void +double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) +{ + if (hb1 <= hb2) { + spin_lock(&hb1->lock); + if (hb1 < hb2) + spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING); + } else { /* hb1 > hb2 */ + spin_lock(&hb2->lock); + spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING); + } +} + +static inline void +double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) +{ + spin_unlock(&hb1->lock); + if (hb1 != hb2) + spin_unlock(&hb2->lock); +} + +/* + * Wake up waiters matching bitset queued on this futex (uaddr). + */ +static int +futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) +{ + struct futex_hash_bucket *hb; + struct futex_q *this, *next; + union futex_key key = FUTEX_KEY_INIT; + int ret; + DEFINE_WAKE_Q(wake_q); + + if (!bitset) + return -EINVAL; + + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ); + if (unlikely(ret != 0)) + return ret; + + hb = hash_futex(&key); + + /* Make sure we really have tasks to wakeup */ + if (!hb_waiters_pending(hb)) + return ret; + + spin_lock(&hb->lock); + + plist_for_each_entry_safe(this, next, &hb->chain, list) { + if (match_futex (&this->key, &key)) { + if (this->pi_state || this->rt_waiter) { + ret = -EINVAL; + break; + } + + /* Check if one of the bits is set in both bitsets */ + if (!(this->bitset & bitset)) + continue; + + mark_wake_futex(&wake_q, this); + if (++ret >= nr_wake) + break; + } + } + + spin_unlock(&hb->lock); + wake_up_q(&wake_q); + return ret; +} + +static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) +{ + unsigned int op = (encoded_op & 0x70000000) >> 28; + unsigned int cmp = (encoded_op & 0x0f000000) >> 24; + int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11); + int cmparg = sign_extend32(encoded_op & 0x00000fff, 11); + int oldval, ret; + + if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { + if (oparg < 0 || oparg > 31) { + char comm[sizeof(current->comm)]; + /* + * kill this print and return -EINVAL when userspace + * is sane again + */ + pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n", + get_task_comm(comm, current), oparg); + oparg &= 31; + } + oparg = 1 << oparg; + } + + pagefault_disable(); + ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr); + pagefault_enable(); + if (ret) + return ret; + + switch (cmp) { + case FUTEX_OP_CMP_EQ: + return oldval == cmparg; + case FUTEX_OP_CMP_NE: + return oldval != cmparg; + case FUTEX_OP_CMP_LT: + return oldval < cmparg; + case FUTEX_OP_CMP_GE: + return oldval >= cmparg; + case FUTEX_OP_CMP_LE: + return oldval <= cmparg; + case FUTEX_OP_CMP_GT: + return oldval > cmparg; + default: + return -ENOSYS; + } +} + +/* + * Wake up all waiters hashed on the physical page that is mapped + * to this virtual address: + */ +static int +futex_wake_op(u32 __user *uaddr1, unsigned 
int flags, u32 __user *uaddr2, + int nr_wake, int nr_wake2, int op) +{ + union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; + struct futex_hash_bucket *hb1, *hb2; + struct futex_q *this, *next; + int ret, op_ret; + DEFINE_WAKE_Q(wake_q); + +retry: + ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); + if (unlikely(ret != 0)) + return ret; + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); + if (unlikely(ret != 0)) + return ret; + + hb1 = hash_futex(&key1); + hb2 = hash_futex(&key2); + +retry_private: + double_lock_hb(hb1, hb2); + op_ret = futex_atomic_op_inuser(op, uaddr2); + if (unlikely(op_ret < 0)) { + double_unlock_hb(hb1, hb2); + + if (!IS_ENABLED(CONFIG_MMU) || + unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { + /* + * we don't get EFAULT from MMU faults if we don't have + * an MMU, but we might get them from range checking + */ + ret = op_ret; + return ret; + } + + if (op_ret == -EFAULT) { + ret = fault_in_user_writeable(uaddr2); + if (ret) + return ret; + } + + if (!(flags & FLAGS_SHARED)) { + cond_resched(); + goto retry_private; + } + + cond_resched(); + goto retry; + } + + plist_for_each_entry_safe(this, next, &hb1->chain, list) { + if (match_futex (&this->key, &key1)) { + if (this->pi_state || this->rt_waiter) { + ret = -EINVAL; + goto out_unlock; + } + mark_wake_futex(&wake_q, this); + if (++ret >= nr_wake) + break; + } + } + + if (op_ret > 0) { + op_ret = 0; + plist_for_each_entry_safe(this, next, &hb2->chain, list) { + if (match_futex (&this->key, &key2)) { + if (this->pi_state || this->rt_waiter) { + ret = -EINVAL; + goto out_unlock; + } + mark_wake_futex(&wake_q, this); + if (++op_ret >= nr_wake2) + break; + } + } + ret += op_ret; + } + +out_unlock: + double_unlock_hb(hb1, hb2); + wake_up_q(&wake_q); + return ret; +} + +/** + * requeue_futex() - Requeue a futex_q from one hb to another + * @q: the futex_q to requeue + * @hb1: the source hash_bucket + * @hb2: the target hash_bucket + * @key2: the new key for the requeued futex_q + */ +static inline +void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, + struct futex_hash_bucket *hb2, union futex_key *key2) +{ + + /* + * If key1 and key2 hash to the same bucket, no need to + * requeue. + */ + if (likely(&hb1->chain != &hb2->chain)) { + plist_del(&q->list, &hb1->chain); + hb_waiters_dec(hb1); + hb_waiters_inc(hb2); + plist_add(&q->list, &hb2->chain); + q->lock_ptr = &hb2->lock; + } + q->key = *key2; +} + +/** + * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue + * @q: the futex_q + * @key: the key of the requeue target futex + * @hb: the hash_bucket of the requeue target futex + * + * During futex_requeue, with requeue_pi=1, it is possible to acquire the + * target futex if it is uncontended or via a lock steal. Set the futex_q key + * to the requeue target futex so the waiter can detect the wakeup on the right + * futex, but remove it from the hb and NULL the rt_waiter so it can detect + * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock + * to protect access to the pi_state to fixup the owner later. Must be called + * with both q->lock_ptr and hb->lock held. 
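+ *
+ * Illustrative caller, mirroring futex_proxy_trylock_atomic() below:
+ *
+ *	if (ret == 1) {
+ *		requeue_pi_wake_futex(top_waiter, key2, hb2);
+ *		return vpid;
+ *	}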
+ */ +static inline +void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, + struct futex_hash_bucket *hb) +{ + q->key = *key; + + __unqueue_futex(q); + + WARN_ON(!q->rt_waiter); + q->rt_waiter = NULL; + + q->lock_ptr = &hb->lock; + + wake_up_state(q->task, TASK_NORMAL); +} + +/** + * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter + * @pifutex: the user address of the to futex + * @hb1: the from futex hash bucket, must be locked by the caller + * @hb2: the to futex hash bucket, must be locked by the caller + * @key1: the from futex key + * @key2: the to futex key + * @ps: address to store the pi_state pointer + * @exiting: Pointer to store the task pointer of the owner task + * which is in the middle of exiting + * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) + * + * Try and get the lock on behalf of the top waiter if we can do it atomically. + * Wake the top waiter if we succeed. If the caller specified set_waiters, + * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. + * hb1 and hb2 must be held by the caller. + * + * @exiting is only set when the return value is -EBUSY. If so, this holds + * a refcount on the exiting task on return and the caller needs to drop it + * after waiting for the exit to complete. + * + * Return: + * - 0 - failed to acquire the lock atomically; + * - >0 - acquired the lock, return value is vpid of the top_waiter + * - <0 - error + */ +static int +futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, + struct futex_hash_bucket *hb2, union futex_key *key1, + union futex_key *key2, struct futex_pi_state **ps, + struct task_struct **exiting, int set_waiters) +{ + struct futex_q *top_waiter = NULL; + u32 curval; + int ret, vpid; + + if (get_futex_value_locked(&curval, pifutex)) + return -EFAULT; + + if (unlikely(should_fail_futex(true))) + return -EFAULT; + + /* + * Find the top_waiter and determine if there are additional waiters. + * If the caller intends to requeue more than 1 waiter to pifutex, + * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now, + * as we have means to handle the possible fault. If not, don't set + * the bit unecessarily as it will force the subsequent unlock to enter + * the kernel. + */ + top_waiter = futex_top_waiter(hb1, key1); + + /* There are no waiters, nothing for us to do. */ + if (!top_waiter) + return 0; + + /* Ensure we requeue to the expected futex. */ + if (!match_futex(top_waiter->requeue_pi_key, key2)) + return -EINVAL; + + /* + * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in + * the contended case or if set_waiters is 1. The pi_state is returned + * in ps in contended cases. + */ + vpid = task_pid_vnr(top_waiter->task); + ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, + exiting, set_waiters); + if (ret == 1) { + requeue_pi_wake_futex(top_waiter, key2, hb2); + return vpid; + } + return ret; +} + +/** + * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 + * @uaddr1: source futex user address + * @flags: futex flags (FLAGS_SHARED, etc.) + * @uaddr2: target futex user address + * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) + * @nr_requeue: number of waiters to requeue (0-INT_MAX) + * @cmpval: @uaddr1 expected value (or %NULL) + * @requeue_pi: if we are attempting to requeue from a non-pi futex to a + * pi futex (pi to pi requeue is not supported) + * + * Requeue waiters on uaddr1 to uaddr2. 
In the requeue_pi case, try to acquire + * uaddr2 atomically on behalf of the top waiter. + * + * Return: + * - >=0 - on success, the number of tasks requeued or woken; + * - <0 - on error + */ +static int futex_requeue(u32 __user *uaddr1, unsigned int flags, + u32 __user *uaddr2, int nr_wake, int nr_requeue, + u32 *cmpval, int requeue_pi) +{ + union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; + int task_count = 0, ret; + struct futex_pi_state *pi_state = NULL; + struct futex_hash_bucket *hb1, *hb2; + struct futex_q *this, *next; + DEFINE_WAKE_Q(wake_q); + + if (nr_wake < 0 || nr_requeue < 0) + return -EINVAL; + + /* + * When PI not supported: return -ENOSYS if requeue_pi is true, + * consequently the compiler knows requeue_pi is always false past + * this point which will optimize away all the conditional code + * further down. + */ + if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi) + return -ENOSYS; + + if (requeue_pi) { + /* + * Requeue PI only works on two distinct uaddrs. This + * check is only valid for private futexes. See below. + */ + if (uaddr1 == uaddr2) + return -EINVAL; + + /* + * requeue_pi requires a pi_state, try to allocate it now + * without any locks in case it fails. + */ + if (refill_pi_state_cache()) + return -ENOMEM; + /* + * requeue_pi must wake as many tasks as it can, up to nr_wake + * + nr_requeue, since it acquires the rt_mutex prior to + * returning to userspace, so as to not leave the rt_mutex with + * waiters and no owner. However, second and third wake-ups + * cannot be predicted as they involve race conditions with the + * first wake and a fault while looking up the pi_state. Both + * pthread_cond_signal() and pthread_cond_broadcast() should + * use nr_wake=1. + */ + if (nr_wake != 1) + return -EINVAL; + } + +retry: + ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); + if (unlikely(ret != 0)) + return ret; + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, + requeue_pi ? FUTEX_WRITE : FUTEX_READ); + if (unlikely(ret != 0)) + return ret; + + /* + * The check above which compares uaddrs is not sufficient for + * shared futexes. We need to compare the keys: + */ + if (requeue_pi && match_futex(&key1, &key2)) + return -EINVAL; + + hb1 = hash_futex(&key1); + hb2 = hash_futex(&key2); + +retry_private: + hb_waiters_inc(hb2); + double_lock_hb(hb1, hb2); + + if (likely(cmpval != NULL)) { + u32 curval; + + ret = get_futex_value_locked(&curval, uaddr1); + + if (unlikely(ret)) { + double_unlock_hb(hb1, hb2); + hb_waiters_dec(hb2); + + ret = get_user(curval, uaddr1); + if (ret) + return ret; + + if (!(flags & FLAGS_SHARED)) + goto retry_private; + + goto retry; + } + if (curval != *cmpval) { + ret = -EAGAIN; + goto out_unlock; + } + } + + if (requeue_pi && (task_count - nr_wake < nr_requeue)) { + struct task_struct *exiting = NULL; + + /* + * Attempt to acquire uaddr2 and wake the top waiter. If we + * intend to requeue waiters, force setting the FUTEX_WAITERS + * bit. We force this here where we are able to easily handle + * faults rather in the requeue loop below. + */ + ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, + &key2, &pi_state, + &exiting, nr_requeue); + + /* + * At this point the top_waiter has either taken uaddr2 or is + * waiting on it. If the former, then the pi_state will not + * exist yet, look it up one more time to ensure we have a + * reference to it. If the lock was taken, ret contains the + * vpid of the top waiter task. 
+ * If the lock was not taken, we have pi_state and an initial + * refcount on it. In case of an error we have nothing. + */ + if (ret > 0) { + WARN_ON(pi_state); + task_count++; + /* + * If we acquired the lock, then the user space value + * of uaddr2 should be vpid. It cannot be changed by + * the top waiter as it is blocked on hb2 lock if it + * tries to do so. If something fiddled with it behind + * our back the pi state lookup might unearth it. So + * we rather use the known value than rereading and + * handing potential crap to lookup_pi_state. + * + * If that call succeeds then we have pi_state and an + * initial refcount on it. + */ + ret = lookup_pi_state(uaddr2, ret, hb2, &key2, + &pi_state, &exiting); + } + + switch (ret) { + case 0: + /* We hold a reference on the pi state. */ + break; + + /* If the above failed, then pi_state is NULL */ + case -EFAULT: + double_unlock_hb(hb1, hb2); + hb_waiters_dec(hb2); + ret = fault_in_user_writeable(uaddr2); + if (!ret) + goto retry; + return ret; + case -EBUSY: + case -EAGAIN: + /* + * Two reasons for this: + * - EBUSY: Owner is exiting and we just wait for the + * exit to complete. + * - EAGAIN: The user space value changed. + */ + double_unlock_hb(hb1, hb2); + hb_waiters_dec(hb2); + /* + * Handle the case where the owner is in the middle of + * exiting. Wait for the exit to complete otherwise + * this task might loop forever, aka. live lock. + */ + wait_for_owner_exiting(ret, exiting); + cond_resched(); + goto retry; + default: + goto out_unlock; + } + } + + plist_for_each_entry_safe(this, next, &hb1->chain, list) { + if (task_count - nr_wake >= nr_requeue) + break; + + if (!match_futex(&this->key, &key1)) + continue; + + /* + * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always + * be paired with each other and no other futex ops. + * + * We should never be requeueing a futex_q with a pi_state, + * which is awaiting a futex_unlock_pi(). + */ + if ((requeue_pi && !this->rt_waiter) || + (!requeue_pi && this->rt_waiter) || + this->pi_state) { + ret = -EINVAL; + break; + } + + /* + * Wake nr_wake waiters. For requeue_pi, if we acquired the + * lock, we already woke the top_waiter. If not, it will be + * woken by futex_unlock_pi(). + */ + if (++task_count <= nr_wake && !requeue_pi) { + mark_wake_futex(&wake_q, this); + continue; + } + + /* Ensure we requeue to the expected futex for requeue_pi. */ + if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) { + ret = -EINVAL; + break; + } + + /* + * Requeue nr_requeue waiters and possibly one more in the case + * of requeue_pi if we couldn't acquire the lock atomically. + */ + if (requeue_pi) { + /* + * Prepare the waiter to take the rt_mutex. Take a + * refcount on the pi_state and store the pointer in + * the futex_q object of the waiter. + */ + get_pi_state(pi_state); + this->pi_state = pi_state; + ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, + this->rt_waiter, + this->task); + if (ret == 1) { + /* + * We got the lock. We do neither drop the + * refcount on pi_state nor clear + * this->pi_state because the waiter needs the + * pi_state for cleaning up the user space + * value. It will drop the refcount after + * doing so. + */ + requeue_pi_wake_futex(this, &key2, hb2); + continue; + } else if (ret) { + /* + * rt_mutex_start_proxy_lock() detected a + * potential deadlock when we tried to queue + * that waiter. Drop the pi_state reference + * which we took above and remove the pointer + * to the state from the waiters futex_q + * object. 
+				 */
+				this->pi_state = NULL;
+				put_pi_state(pi_state);
+				/*
+				 * We stop queueing more waiters and let user
+				 * space deal with the mess.
+				 */
+				break;
+			}
+		}
+		requeue_futex(this, hb1, hb2, &key2);
+	}
+
+	/*
+	 * We took an extra initial reference to the pi_state either
+	 * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We
+	 * need to drop it here again.
+	 */
+	put_pi_state(pi_state);
+
+out_unlock:
+	double_unlock_hb(hb1, hb2);
+	wake_up_q(&wake_q);
+	hb_waiters_dec(hb2);
+	return ret ? ret : task_count;
+}
+
+/* The key must be already stored in q->key. */
+static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
+	__acquires(&hb->lock)
+{
+	struct futex_hash_bucket *hb;
+
+	hb = hash_futex(&q->key);
+
+	/*
+	 * Increment the counter before taking the lock so that
+	 * a potential waker won't miss a to-be-slept task that is
+	 * waiting for the spinlock. This is safe as all queue_lock()
+	 * users end up calling queue_me(). Similarly, for housekeeping,
+	 * decrement the counter at queue_unlock() when some error has
+	 * occurred and we don't end up adding the task to the list.
+	 */
+	hb_waiters_inc(hb); /* implies smp_mb(); (A) */
+
+	q->lock_ptr = &hb->lock;
+
+	spin_lock(&hb->lock);
+	return hb;
+}
+
+static inline void
+queue_unlock(struct futex_hash_bucket *hb)
+	__releases(&hb->lock)
+{
+	spin_unlock(&hb->lock);
+	hb_waiters_dec(hb);
+}
+
+static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+{
+	int prio;
+
+	/*
+	 * The priority used to register this element is
+	 * - either the real thread-priority for the real-time threads
+	 * (i.e. threads with a priority lower than MAX_RT_PRIO)
+	 * - or MAX_RT_PRIO for non-RT threads.
+	 * Thus, all RT-threads are woken first in priority order, and
+	 * the others are woken last, in FIFO order.
+	 */
+	prio = min(current->normal_prio, MAX_RT_PRIO);
+
+	plist_node_init(&q->list, prio);
+	plist_add(&q->list, &hb->chain);
+	q->task = current;
+}
+
+/**
+ * queue_me() - Enqueue the futex_q on the futex_hash_bucket
+ * @q:	The futex_q to enqueue
+ * @hb:	The destination hash bucket
+ *
+ * The hb->lock must be held by the caller, and is released here. A call to
+ * queue_me() is typically paired with exactly one call to unqueue_me(). The
+ * exceptions involve the PI related operations, which may use unqueue_me_pi()
+ * or nothing if the unqueue is done as part of the wake process and the
+ * unqueue state is implicit in the state of the woken task (see
+ * futex_wait_requeue_pi() for an example).
+ */
+static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+	__releases(&hb->lock)
+{
+	__queue_me(q, hb);
+	spin_unlock(&hb->lock);
+}
+
+/**
+ * unqueue_me() - Remove the futex_q from its futex_hash_bucket
+ * @q:	The futex_q to unqueue
+ *
+ * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
+ * be paired with exactly one earlier call to queue_me().
+ *
+ * Return:
+ *  - 1 - if the futex_q was still queued (and we unqueued it);
+ *  - 0 - if the futex_q was already removed by the waking thread
+ */
+static int unqueue_me(struct futex_q *q)
+{
+	spinlock_t *lock_ptr;
+	int ret = 0;
+
+	/* In the common case we don't take the spinlock, which is nice. */
+retry:
+	/*
+	 * q->lock_ptr can change between this read and the following
+	 * spin_lock. Use READ_ONCE to forbid the compiler from reloading
+	 * q->lock_ptr and optimizing lock_ptr out of the logic below.
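+	 *
+	 * Sketch of the reload this forbids (illustrative only):
+	 *
+	 *	if (q->lock_ptr)		/* load #1 */
+	 *		spin_lock(q->lock_ptr);	/* load #2 may see a new value */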
+ */ + lock_ptr = READ_ONCE(q->lock_ptr); + if (lock_ptr != NULL) { + spin_lock(lock_ptr); + /* + * q->lock_ptr can change between reading it and + * spin_lock(), causing us to take the wrong lock. This + * corrects the race condition. + * + * Reasoning goes like this: if we have the wrong lock, + * q->lock_ptr must have changed (maybe several times) + * between reading it and the spin_lock(). It can + * change again after the spin_lock() but only if it was + * already changed before the spin_lock(). It cannot, + * however, change back to the original value. Therefore + * we can detect whether we acquired the correct lock. + */ + if (unlikely(lock_ptr != q->lock_ptr)) { + spin_unlock(lock_ptr); + goto retry; + } + __unqueue_futex(q); + + BUG_ON(q->pi_state); + + spin_unlock(lock_ptr); + ret = 1; + } + + return ret; +} + +/* + * PI futexes can not be requeued and must remove themself from the + * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry + * and dropped here. + */ +static void unqueue_me_pi(struct futex_q *q) + __releases(q->lock_ptr) +{ + __unqueue_futex(q); + + BUG_ON(!q->pi_state); + put_pi_state(q->pi_state); + q->pi_state = NULL; + + spin_unlock(q->lock_ptr); +} + +static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, + struct task_struct *argowner) +{ + struct futex_pi_state *pi_state = q->pi_state; + u32 uval, curval, newval; + struct task_struct *oldowner, *newowner; + u32 newtid; + int ret, err = 0; + + lockdep_assert_held(q->lock_ptr); + + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + + oldowner = pi_state->owner; + + /* + * We are here because either: + * + * - we stole the lock and pi_state->owner needs updating to reflect + * that (@argowner == current), + * + * or: + * + * - someone stole our lock and we need to fix things to point to the + * new owner (@argowner == NULL). + * + * Either way, we have to replace the TID in the user space variable. + * This must be atomic as we have to preserve the owner died bit here. + * + * Note: We write the user space value _before_ changing the pi_state + * because we can fault here. Imagine swapped out pages or a fork + * that marked all the anonymous memory readonly for cow. + * + * Modifying pi_state _before_ the user space value would leave the + * pi_state in an inconsistent state when we fault here, because we + * need to drop the locks to handle the fault. This might be observed + * in the PID check in lookup_pi_state. + */ +retry: + if (!argowner) { + if (oldowner != current) { + /* + * We raced against a concurrent self; things are + * already fixed up. Nothing to do. + */ + ret = 0; + goto out_unlock; + } + + if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) { + /* We got the lock after all, nothing to fix. */ + ret = 0; + goto out_unlock; + } + + /* + * Since we just failed the trylock; there must be an owner. + */ + newowner = rt_mutex_owner(&pi_state->pi_mutex); + BUG_ON(!newowner); + } else { + WARN_ON_ONCE(argowner != current); + if (oldowner == current) { + /* + * We raced against a concurrent self; things are + * already fixed up. Nothing to do. + */ + ret = 0; + goto out_unlock; + } + newowner = argowner; + } + + newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; + /* Owner died? 
*/ + if (!pi_state->owner) + newtid |= FUTEX_OWNER_DIED; + + err = get_futex_value_locked(&uval, uaddr); + if (err) + goto handle_err; + + for (;;) { + newval = (uval & FUTEX_OWNER_DIED) | newtid; + + err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); + if (err) + goto handle_err; + + if (curval == uval) + break; + uval = curval; + } + + /* + * We fixed up user space. Now we need to fix the pi_state + * itself. + */ + if (pi_state->owner != NULL) { + raw_spin_lock(&pi_state->owner->pi_lock); + WARN_ON(list_empty(&pi_state->list)); + list_del_init(&pi_state->list); + raw_spin_unlock(&pi_state->owner->pi_lock); + } + + pi_state->owner = newowner; + + raw_spin_lock(&newowner->pi_lock); + WARN_ON(!list_empty(&pi_state->list)); + list_add(&pi_state->list, &newowner->pi_state_list); + raw_spin_unlock(&newowner->pi_lock); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + + return 0; + + /* + * In order to reschedule or handle a page fault, we need to drop the + * locks here. In the case of a fault, this gives the other task + * (either the highest priority waiter itself or the task which stole + * the rtmutex) the chance to try the fixup of the pi_state. So once we + * are back from handling the fault we need to check the pi_state after + * reacquiring the locks and before trying to do another fixup. When + * the fixup has been done already we simply return. + * + * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely + * drop hb->lock since the caller owns the hb -> futex_q relation. + * Dropping the pi_mutex->wait_lock requires the state revalidate. + */ +handle_err: + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + spin_unlock(q->lock_ptr); + + switch (err) { + case -EFAULT: + ret = fault_in_user_writeable(uaddr); + break; + + case -EAGAIN: + cond_resched(); + ret = 0; + break; + + default: + WARN_ON_ONCE(1); + ret = err; + break; + } + + spin_lock(q->lock_ptr); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + + /* + * Check if someone else fixed it for us: + */ + if (pi_state->owner != oldowner) { + ret = 0; + goto out_unlock; + } + + if (ret) + goto out_unlock; + + goto retry; + +out_unlock: + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + return ret; +} + +static long futex_wait_restart(struct restart_block *restart); + +/** + * fixup_owner() - Post lock pi_state and corner case management + * @uaddr: user address of the futex + * @q: futex_q (contains pi_state and access to the rt_mutex) + * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) + * + * After attempting to lock an rt_mutex, this function is called to cleanup + * the pi_state owner as well as handle race conditions that may allow us to + * acquire the lock. Must be called with the hb lock held. + * + * Return: + * - 1 - success, lock taken; + * - 0 - success, lock not taken; + * - <0 - on error (-EFAULT) + */ +static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) +{ + int ret = 0; + + if (locked) { + /* + * Got the lock. We might not be the anticipated owner if we + * did a lock-steal - fix up the PI-state in that case: + * + * Speculative pi_state->owner read (we don't hold wait_lock); + * since we own the lock pi_state->owner == current is the + * stable state, anything else needs more attention. + */ + if (q->pi_state->owner != current) + ret = fixup_pi_state_owner(uaddr, q, current); + return ret ? ret : locked; + } + + /* + * If we didn't get the lock; check if anybody stole it from us. 
In + * that case, we need to fix up the uval to point to them instead of + * us, otherwise bad things happen. [10] + * + * Another speculative read; pi_state->owner == current is unstable + * but needs our attention. + */ + if (q->pi_state->owner == current) { + ret = fixup_pi_state_owner(uaddr, q, NULL); + return ret; + } + + /* + * Paranoia check. If we did not take the lock, then we should not be + * the owner of the rt_mutex. + */ + if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) { + printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " + "pi-state %p\n", ret, + q->pi_state->pi_mutex.owner, + q->pi_state->owner); + } + + return ret; +} + +/** + * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal + * @hb: the futex hash bucket, must be locked by the caller + * @q: the futex_q to queue up on + * @timeout: the prepared hrtimer_sleeper, or null for no timeout + */ +static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, + struct hrtimer_sleeper *timeout) +{ + /* + * The task state is guaranteed to be set before another task can + * wake it. set_current_state() is implemented using smp_store_mb() and + * queue_me() calls spin_unlock() upon completion, both serializing + * access to the hash list and forcing another memory barrier. + */ + set_current_state(TASK_INTERRUPTIBLE); + queue_me(q, hb); + + /* Arm the timer */ + if (timeout) + hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS); + + /* + * If we have been removed from the hash list, then another task + * has tried to wake us, and we can skip the call to schedule(). + */ + if (likely(!plist_node_empty(&q->list))) { + /* + * If the timer has already expired, current will already be + * flagged for rescheduling. Only call schedule if there + * is no timeout, or if it has yet to expire. + */ + if (!timeout || timeout->task) + freezable_schedule(); + } + __set_current_state(TASK_RUNNING); +} + +/** + * futex_wait_setup() - Prepare to wait on a futex + * @uaddr: the futex userspace address + * @val: the expected value + * @flags: futex flags (FLAGS_SHARED, etc.) + * @q: the associated futex_q + * @hb: storage for hash_bucket pointer to be returned to caller + * + * Setup the futex_q and locate the hash_bucket. Get the futex value and + * compare it with the expected value. Handle atomic faults internally. + * Return with the hb lock held and a q.key reference on success, and unlocked + * with no q.key reference on failure. + * + * Return: + * - 0 - uaddr contains val and hb has been locked; + * - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked + */ +static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, + struct futex_q *q, struct futex_hash_bucket **hb) +{ + u32 uval; + int ret; + + /* + * Access the page AFTER the hash-bucket is locked. + * Order is important: + * + * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); + * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); } + * + * The basic logical guarantee of a futex is that it blocks ONLY + * if cond(var) is known to be true at the time of blocking, for + * any cond. If we locked the hash-bucket after testing *uaddr, that + * would open a race condition where we could block indefinitely with + * cond(var) false, which would violate the guarantee. + * + * On the other hand, we insert q and release the hash-bucket only + * after testing *uaddr. 
This guarantees that futex_wait() will NOT + * absorb a wakeup if *uaddr does not match the desired values + * while the syscall executes. + */ +retry: + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ); + if (unlikely(ret != 0)) + return ret; + +retry_private: + *hb = queue_lock(q); + + ret = get_futex_value_locked(&uval, uaddr); + + if (ret) { + queue_unlock(*hb); + + ret = get_user(uval, uaddr); + if (ret) + return ret; + + if (!(flags & FLAGS_SHARED)) + goto retry_private; + + goto retry; + } + + if (uval != val) { + queue_unlock(*hb); + ret = -EWOULDBLOCK; + } + + return ret; +} + +static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, + ktime_t *abs_time, u32 bitset) +{ + struct hrtimer_sleeper timeout, *to; + struct restart_block *restart; + struct futex_hash_bucket *hb; + struct futex_q q = futex_q_init; + int ret; + + if (!bitset) + return -EINVAL; + q.bitset = bitset; + + to = futex_setup_timer(abs_time, &timeout, flags, + current->timer_slack_ns); +retry: + /* + * Prepare to wait on uaddr. On success, holds hb lock and increments + * q.key refs. + */ + ret = futex_wait_setup(uaddr, val, flags, &q, &hb); + if (ret) + goto out; + + /* queue_me and wait for wakeup, timeout, or a signal. */ + futex_wait_queue_me(hb, &q, to); + + /* If we were woken (and unqueued), we succeeded, whatever. */ + ret = 0; + /* unqueue_me() drops q.key ref */ + if (!unqueue_me(&q)) + goto out; + ret = -ETIMEDOUT; + if (to && !to->task) + goto out; + + /* + * We expect signal_pending(current), but we might be the + * victim of a spurious wakeup as well. + */ + if (!signal_pending(current)) + goto retry; + + ret = -ERESTARTSYS; + if (!abs_time) + goto out; + + restart = ¤t->restart_block; + restart->fn = futex_wait_restart; + restart->futex.uaddr = uaddr; + restart->futex.val = val; + restart->futex.time = *abs_time; + restart->futex.bitset = bitset; + restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; + + ret = -ERESTART_RESTARTBLOCK; + +out: + if (to) { + hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); + } + return ret; +} + + +static long futex_wait_restart(struct restart_block *restart) +{ + u32 __user *uaddr = restart->futex.uaddr; + ktime_t t, *tp = NULL; + + if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { + t = restart->futex.time; + tp = &t; + } + restart->fn = do_no_restart_syscall; + + return (long)futex_wait(uaddr, restart->futex.flags, + restart->futex.val, tp, restart->futex.bitset); +} + + +/* + * Userspace tried a 0 -> TID atomic transition of the futex value + * and failed. The kernel side here does the whole locking operation: + * if there are waiters then it will block as a consequence of relying + * on rt-mutexes, it does PI, etc. (Due to races the kernel might see + * a 0 value of the futex too.). + * + * Also serves as futex trylock_pi()'ing, and due semantics. 
+ */ +static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, + ktime_t *time, int trylock) +{ + struct hrtimer_sleeper timeout, *to; + struct futex_pi_state *pi_state = NULL; + struct task_struct *exiting = NULL; + struct rt_mutex_waiter rt_waiter; + struct futex_hash_bucket *hb; + struct futex_q q = futex_q_init; + int res, ret; + + if (!IS_ENABLED(CONFIG_FUTEX_PI)) + return -ENOSYS; + + if (refill_pi_state_cache()) + return -ENOMEM; + + to = futex_setup_timer(time, &timeout, FLAGS_CLOCKRT, 0); + +retry: + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE); + if (unlikely(ret != 0)) + goto out; + +retry_private: + hb = queue_lock(&q); + + ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, + &exiting, 0); + if (unlikely(ret)) { + /* + * Atomic work succeeded and we got the lock, + * or failed. Either way, we do _not_ block. + */ + switch (ret) { + case 1: + /* We got the lock. */ + ret = 0; + goto out_unlock_put_key; + case -EFAULT: + goto uaddr_faulted; + case -EBUSY: + case -EAGAIN: + /* + * Two reasons for this: + * - EBUSY: Task is exiting and we just wait for the + * exit to complete. + * - EAGAIN: The user space value changed. + */ + queue_unlock(hb); + /* + * Handle the case where the owner is in the middle of + * exiting. Wait for the exit to complete otherwise + * this task might loop forever, aka. live lock. + */ + wait_for_owner_exiting(ret, exiting); + cond_resched(); + goto retry; + default: + goto out_unlock_put_key; + } + } + + WARN_ON(!q.pi_state); + + /* + * Only actually queue now that the atomic ops are done: + */ + __queue_me(&q, hb); + + if (trylock) { + ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); + /* Fixup the trylock return value: */ + ret = ret ? 0 : -EWOULDBLOCK; + goto no_block; + } + + rt_mutex_init_waiter(&rt_waiter); + + /* + * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not + * hold it while doing rt_mutex_start_proxy(), because then it will + * include hb->lock in the blocking chain, even through we'll not in + * fact hold it while blocking. This will lead it to report -EDEADLK + * and BUG when futex_unlock_pi() interleaves with this. + * + * Therefore acquire wait_lock while holding hb->lock, but drop the + * latter before calling __rt_mutex_start_proxy_lock(). This + * interleaves with futex_unlock_pi() -- which does a similar lock + * handoff -- such that the latter can observe the futex_q::pi_state + * before __rt_mutex_start_proxy_lock() is done. + */ + raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); + spin_unlock(q.lock_ptr); + /* + * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter + * such that futex_unlock_pi() is guaranteed to observe the waiter when + * it sees the futex_q::pi_state. + */ + ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); + raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock); + + if (ret) { + if (ret == 1) + ret = 0; + goto cleanup; + } + + if (unlikely(to)) + hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); + + ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); + +cleanup: + spin_lock(q.lock_ptr); + /* + * If we failed to acquire the lock (deadlock/signal/timeout), we must + * first acquire the hb->lock before removing the lock from the + * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait + * lists consistent. + * + * In particular; it is important that futex_unlock_pi() can not + * observe this inconsistency. 
*/ + if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) + ret = 0; + +no_block: + /* + * Fixup the pi_state owner and possibly acquire the lock if we + * haven't already. + */ + res = fixup_owner(uaddr, &q, !ret); + /* + * If fixup_owner() returned an error, propagate that. If it acquired + * the lock, clear our -ETIMEDOUT or -EINTR. + */ + if (res) + ret = (res < 0) ? res : 0; + + /* + * If fixup_owner() faulted and was unable to handle the fault, unlock + * it and return the fault to userspace. + */ + if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) { + pi_state = q.pi_state; + get_pi_state(pi_state); + } + + /* Unqueue and drop the lock */ + unqueue_me_pi(&q); + + if (pi_state) { + rt_mutex_futex_unlock(&pi_state->pi_mutex); + put_pi_state(pi_state); + } + + goto out; + +out_unlock_put_key: + queue_unlock(hb); + +out: + if (to) { + hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); + } + return ret != -EINTR ? ret : -ERESTARTNOINTR; + +uaddr_faulted: + queue_unlock(hb); + + ret = fault_in_user_writeable(uaddr); + if (ret) + goto out; + + if (!(flags & FLAGS_SHARED)) + goto retry_private; + + goto retry; +} + +/* + * Userspace attempted a TID -> 0 atomic transition, and failed. + * This is the in-kernel slowpath: we look up the PI state (if any), + * and do the rt-mutex unlock. + */ +static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) +{ + u32 curval, uval, vpid = task_pid_vnr(current); + union futex_key key = FUTEX_KEY_INIT; + struct futex_hash_bucket *hb; + struct futex_q *top_waiter; + int ret; + + if (!IS_ENABLED(CONFIG_FUTEX_PI)) + return -ENOSYS; + +retry: + if (get_user(uval, uaddr)) + return -EFAULT; + /* + * We release only a lock we actually own: + */ + if ((uval & FUTEX_TID_MASK) != vpid) + return -EPERM; + + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE); + if (ret) + return ret; + + hb = hash_futex(&key); + spin_lock(&hb->lock); + + /* + * Check waiters first. We do not trust user space values at + * all and we at least want to know if user space fiddled + * with the futex value instead of blindly unlocking. + */ + top_waiter = futex_top_waiter(hb, &key); + if (top_waiter) { + struct futex_pi_state *pi_state = top_waiter->pi_state; + + ret = -EINVAL; + if (!pi_state) + goto out_unlock; + + /* + * If current does not own the pi_state then the futex is + * inconsistent and user space fiddled with the futex value. + */ + if (pi_state->owner != current) + goto out_unlock; + + get_pi_state(pi_state); + /* + * By taking wait_lock while still holding hb->lock, we ensure + * there is no point where we hold neither; and therefore + * wake_futex_pi() must observe a state consistent with what we + * observed. + * + * In particular; this forces __rt_mutex_start_proxy() to + * complete such that we're guaranteed to observe the + * rt_waiter. Also see the WARN in wake_futex_pi(). + */ + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + spin_unlock(&hb->lock); + + /* drops pi_state->pi_mutex.wait_lock */ + ret = wake_futex_pi(uaddr, uval, pi_state); + + put_pi_state(pi_state); + + /* + * Success, we're done! No tricky corner cases. + */ + if (!ret) + goto out_putkey; + /* + * The atomic access to the futex value generated a + * pagefault, so retry the user-access and the wakeup: + */ + if (ret == -EFAULT) + goto pi_faulted; + /* + * An unconditional UNLOCK_PI op raced against a waiter + * setting the FUTEX_WAITERS bit. Try again.
*/ + if (ret == -EAGAIN) + goto pi_retry; + /* + * wake_futex_pi() has detected invalid state. Tell user + * space. + */ + goto out_putkey; + } + + /* + * We have no kernel internal state, i.e. no waiters in the + * kernel. Waiters which are about to queue themselves are stuck + * on hb->lock. So we can safely ignore them. We preserve neither + * the WAITERS bit nor the OWNER_DIED one. We are the + * owner. + */ + if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) { + spin_unlock(&hb->lock); + switch (ret) { + case -EFAULT: + goto pi_faulted; + + case -EAGAIN: + goto pi_retry; + + default: + WARN_ON_ONCE(1); + goto out_putkey; + } + } + + /* + * If uval has changed, let user space handle it. + */ + ret = (curval == uval) ? 0 : -EAGAIN; + +out_unlock: + spin_unlock(&hb->lock); +out_putkey: + return ret; + +pi_retry: + cond_resched(); + goto retry; + +pi_faulted: + + ret = fault_in_user_writeable(uaddr); + if (!ret) + goto retry; + + return ret; +} + +/** + * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex + * @hb: the hash_bucket futex_q was originally enqueued on + * @q: the futex_q woken while waiting to be requeued + * @key2: the futex_key of the requeue target futex + * @timeout: the timeout associated with the wait (NULL if none) + * + * Detect if the task was woken on the initial futex as opposed to the requeue + * target futex. If so, determine if it was a timeout or a signal that caused + * the wakeup and return the appropriate error code to the caller. Must be + * called with the hb lock held. + * + * Return: + * - 0 = no early wakeup detected; + * - <0 = -ETIMEDOUT or -ERESTARTNOINTR + */ +static inline +int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, + struct futex_q *q, union futex_key *key2, + struct hrtimer_sleeper *timeout) +{ + int ret = 0; + + /* + * With the hb lock held, we avoid races while we process the wakeup. + * We only need to hold hb (and not hb2) to ensure atomicity as the + * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb. + * It can't be requeued from uaddr2 to something else since we don't + * support a PI aware source futex for requeue. + */ + if (!match_futex(&q->key, key2)) { + WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr)); + /* + * We were woken prior to requeue by a timeout or a signal. + * Unqueue the futex_q and determine which it was. + */ + plist_del(&q->list, &hb->chain); + hb_waiters_dec(hb); + + /* Handle spurious wakeups gracefully */ + ret = -EWOULDBLOCK; + if (timeout && !timeout->task) + ret = -ETIMEDOUT; + else if (signal_pending(current)) + ret = -ERESTARTNOINTR; + } + return ret; +} + +/** + * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 + * @uaddr: the futex we initially wait on (non-pi) + * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be + * the same type, no requeueing from private to shared, etc. + * @val: the expected value of uaddr + * @abs_time: absolute timeout + * @bitset: 32 bit wakeup bitset set by userspace, defaults to all + * @uaddr2: the pi futex we will take prior to returning to user-space + * + * The caller will wait on uaddr and will be requeued by futex_requeue() to + * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake + * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to + * userspace. This ensures the rt_mutex maintains an owner when it has waiters; + * without one, the pi logic would not know which task to boost/deboost, if + * there was a need to.
+ * + * We call schedule in futex_wait_queue_me() when we enqueue and return there + * via the following-- + * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() + * 2) wakeup on uaddr2 after a requeue + * 3) signal + * 4) timeout + * + * If 3, cleanup and return -ERESTARTNOINTR. + * + * If 2, we may then block on trying to take the rt_mutex and return via: + * 5) successful lock + * 6) signal + * 7) timeout + * 8) other lock acquisition failure + * + * If 6, return -EWOULDBLOCK (restarting the syscall would do the same). + * + * If 4 or 7, we cleanup and return with -ETIMEDOUT. + * + * Return: + * - 0 - On success; + * - <0 - On error + */ +static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, + u32 val, ktime_t *abs_time, u32 bitset, + u32 __user *uaddr2) +{ + struct hrtimer_sleeper timeout, *to; + struct futex_pi_state *pi_state = NULL; + struct rt_mutex_waiter rt_waiter; + struct futex_hash_bucket *hb; + union futex_key key2 = FUTEX_KEY_INIT; + struct futex_q q = futex_q_init; + int res, ret; + + if (!IS_ENABLED(CONFIG_FUTEX_PI)) + return -ENOSYS; + + if (uaddr == uaddr2) + return -EINVAL; + + if (!bitset) + return -EINVAL; + + to = futex_setup_timer(abs_time, &timeout, flags, + current->timer_slack_ns); + + /* + * The waiter is allocated on our stack, manipulated by the requeue + * code while we sleep on uaddr. + */ + rt_mutex_init_waiter(&rt_waiter); + + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); + if (unlikely(ret != 0)) + goto out; + + q.bitset = bitset; + q.rt_waiter = &rt_waiter; + q.requeue_pi_key = &key2; + + /* + * Prepare to wait on uaddr. On success, increments q.key (key1) ref + * count. + */ + ret = futex_wait_setup(uaddr, val, flags, &q, &hb); + if (ret) + goto out; + + /* + * The check above which compares uaddrs is not sufficient for + * shared futexes. We need to compare the keys: + */ + if (match_futex(&q.key, &key2)) { + queue_unlock(hb); + ret = -EINVAL; + goto out; + } + + /* Queue the futex_q, drop the hb lock, wait for wakeup. */ + futex_wait_queue_me(hb, &q, to); + + spin_lock(&hb->lock); + ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); + spin_unlock(&hb->lock); + if (ret) + goto out; + + /* + * In order for us to be here, we know our q.key == key2, and since + * we took the hb->lock above, we also know that futex_requeue() has + * completed and we no longer have to concern ourselves with a wakeup + * race with the atomic proxy lock acquisition by the requeue code. The + * futex_requeue dropped our key1 reference and incremented our key2 + * reference count. + */ + + /* Check if the requeue code acquired the second futex for us. */ + if (!q.rt_waiter) { + /* + * Got the lock. We might not be the anticipated owner if we + * did a lock-steal - fix up the PI-state in that case. + */ + if (q.pi_state && (q.pi_state->owner != current)) { + spin_lock(q.lock_ptr); + ret = fixup_pi_state_owner(uaddr2, &q, current); + if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { + pi_state = q.pi_state; + get_pi_state(pi_state); + } + /* + * Drop the reference to the pi state which + * the requeue_pi() code acquired for us. + */ + put_pi_state(q.pi_state); + spin_unlock(q.lock_ptr); + } + } else { + struct rt_mutex *pi_mutex; + + /* + * We have been woken up by futex_unlock_pi(), a timeout, or a + * signal. futex_unlock_pi() will not destroy the lock_ptr nor + * the pi_state. 
*/ + WARN_ON(!q.pi_state); + pi_mutex = &q.pi_state->pi_mutex; + ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter); + + spin_lock(q.lock_ptr); + if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) + ret = 0; + + debug_rt_mutex_free_waiter(&rt_waiter); + /* + * Fixup the pi_state owner and possibly acquire the lock if we + * haven't already. + */ + res = fixup_owner(uaddr2, &q, !ret); + /* + * If fixup_owner() returned an error, propagate that. If it + * acquired the lock, clear -ETIMEDOUT or -EINTR. + */ + if (res) + ret = (res < 0) ? res : 0; + + /* + * If fixup_pi_state_owner() faulted and was unable to handle + * the fault, unlock the rt_mutex and return the fault to + * userspace. + */ + if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { + pi_state = q.pi_state; + get_pi_state(pi_state); + } + + /* Unqueue and drop the lock. */ + unqueue_me_pi(&q); + } + + if (pi_state) { + rt_mutex_futex_unlock(&pi_state->pi_mutex); + put_pi_state(pi_state); + } + + if (ret == -EINTR) { + /* + * We've already been requeued, but cannot restart by calling + * futex_lock_pi() directly. We could restart this syscall, but + * it would detect that the user space "val" changed and return + * -EWOULDBLOCK. Save the overhead of the restart and return + * -EWOULDBLOCK directly. + */ + ret = -EWOULDBLOCK; + } + +out: + if (to) { + hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); + } + return ret; +} + +static long do_futex1(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, + u32 __user *uaddr2, u32 val2, u32 val3) +{ + int cmd = op & FUTEX_CMD_MASK; + unsigned int flags = 0; + + if (!(op & FUTEX_PRIVATE_FLAG)) + flags |= FLAGS_SHARED; + + if (op & FUTEX_CLOCK_REALTIME) { + flags |= FLAGS_CLOCKRT; + if (cmd != FUTEX_WAIT && cmd != FUTEX_WAIT_BITSET && + cmd != FUTEX_WAIT_REQUEUE_PI) + return -ENOSYS; + } + + switch (cmd) { + case FUTEX_LOCK_PI: + case FUTEX_UNLOCK_PI: + case FUTEX_TRYLOCK_PI: + case FUTEX_WAIT_REQUEUE_PI: + case FUTEX_CMP_REQUEUE_PI: + if (!futex_cmpxchg_enabled) + return -ENOSYS; + } + + switch (cmd) { + case FUTEX_WAIT: + val3 = FUTEX_BITSET_MATCH_ANY; + fallthrough; + case FUTEX_WAIT_BITSET: + return futex_wait(uaddr, flags, val, timeout, val3); + case FUTEX_WAKE: + val3 = FUTEX_BITSET_MATCH_ANY; + fallthrough; + case FUTEX_WAKE_BITSET: + return futex_wake(uaddr, flags, val, val3); + case FUTEX_REQUEUE: + return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); + case FUTEX_CMP_REQUEUE: + return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); + case FUTEX_WAKE_OP: + return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); + case FUTEX_LOCK_PI: + return futex_lock_pi(uaddr, flags, timeout, 0); + case FUTEX_UNLOCK_PI: + return futex_unlock_pi(uaddr, flags); + case FUTEX_TRYLOCK_PI: + return futex_lock_pi(uaddr, flags, NULL, 1); + case FUTEX_WAIT_REQUEUE_PI: + val3 = FUTEX_BITSET_MATCH_ANY; + return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, + uaddr2); + case FUTEX_CMP_REQUEUE_PI: + return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); + } + return -ENOSYS; +} + + +SYSCALL_DEFINE6(futex1, u32 __user *, uaddr, int, op, u32, val, + struct __kernel_timespec __user *, utime, u32 __user *, uaddr2, + u32, val3) +{ + struct timespec64 ts; + ktime_t t, *tp = NULL; + u32 val2 = 0; + int cmd = op & FUTEX_CMD_MASK; + + if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || + cmd == FUTEX_WAIT_BITSET || + cmd == FUTEX_WAIT_REQUEUE_PI)) { + if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) +
return -EFAULT; + if (get_timespec64(&ts, utime)) + return -EFAULT; + if (!timespec64_valid(&ts)) + return -EINVAL; + + t = timespec64_to_ktime(ts); + if (cmd == FUTEX_WAIT) + t = ktime_add_safe(ktime_get(), t); + tp = &t; + } + /* + * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*. + * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. + */ + if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || + cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) + val2 = (u32) (unsigned long) utime; + + return do_futex1(uaddr, op, val, tp, uaddr2, val2, val3); +} + +static void __init futex_detect_cmpxchg(void) +{ +#ifndef CONFIG_HAVE_FUTEX_CMPXCHG + u32 curval; + + /* + * This will fail and we want it. Some arch implementations do + * runtime detection of the futex_atomic_cmpxchg_inatomic() + * functionality. We want to know that before we call in any + * of the complex code paths. Also we want to prevent + * registration of robust lists in that case. NULL is + * guaranteed to fault and we get -EFAULT on functional + * implementation, the non-functional ones will return + * -ENOSYS. + */ + if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) + futex_cmpxchg_enabled = 1; +#endif +} + +static int __init futex_init(void) +{ + unsigned int futex_shift; + unsigned long i; + +#if CONFIG_BASE_SMALL + futex_hashsize = 16; +#else + futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); +#endif + + futex_queues = alloc_large_system_hash("futex1", sizeof(*futex_queues), + futex_hashsize, 0, + futex_hashsize < 256 ? HASH_SMALL : 0, + &futex_shift, NULL, + futex_hashsize, futex_hashsize); + futex_hashsize = 1UL << futex_shift; + + futex_detect_cmpxchg(); + + for (i = 0; i < futex_hashsize; i++) { + atomic_set(&futex_queues[i].waiters, 0); + plist_head_init(&futex_queues[i].chain); + spin_lock_init(&futex_queues[i].lock); + } + + return 0; +} +core_initcall(futex_init); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 3e1a713d3e57..b53a24a99a14 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -153,6 +153,8 @@ COND_SYSCALL(futex_wait); COND_SYSCALL(futex_wake); COND_SYSCALL(futex_waitv); +COND_SYSCALL(futex1); + /* kernel/hrtimer.c */ /* kernel/itimer.c */ diff --git a/tools/arch/x86/include/asm/unistd_64.h b/tools/arch/x86/include/asm/unistd_64.h index 4205ed4158bf..43de5a59ac1c 100644 --- a/tools/arch/x86/include/asm/unistd_64.h +++ b/tools/arch/x86/include/asm/unistd_64.h @@ -17,3 +17,15 @@ #ifndef __NR_setns #define __NR_setns 308 #endif + +#ifndef __NR_futex_wait +#define __NR_futex_wait 440 +#endif + +#ifndef __NR_futex_wake +#define __NR_futex_wake 441 +#endif + +#ifndef __NR_futex1 +#define __NR_futex1 442 +#endif diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h index dd457de21bad..f737eaeecbb6 100644 --- a/tools/include/uapi/asm-generic/unistd.h +++ b/tools/include/uapi/asm-generic/unistd.h @@ -862,11 +862,15 @@ __SYSCALL(__NR_faccessat2, sys_faccessat2) #define __NR_futex_wait 440 __SYSCALL(__NR_futex_wait, sys_futex_wait) + #define __NR_futex_wake 441 __SYSCALL(__NR_futex_wake, sys_futex_wake) +#define __NR_futex1 442 +__SYSCALL(__NR_futex1, sys_futex1) + #undef __NR_syscalls -#define __NR_syscalls 442 +#define __NR_syscalls 443 /* * 32 bit systems traditionally used different diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl index f30d6ae9a688..1a516b081207 100644 --- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +++ 
b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl @@ -361,6 +361,9 @@ 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 +440 common futex_wait sys_futex_wait +441 common futex_wake sys_futex_wake +442 common futex1 sys_futex1 # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/tools/perf/bench/futex.h b/tools/perf/bench/futex.h index 31b53cc7d5bc..baf6a0d077ac 100644 --- a/tools/perf/bench/futex.h +++ b/tools/perf/bench/futex.h @@ -8,10 +8,14 @@ #ifndef _FUTEX_H #define _FUTEX_H +//#define FUTEX1 0 +#define UNUSED(x) (void)(x) + #include #include #include #include +#include /** * futex() - SYS_futex syscall wrapper @@ -34,7 +38,13 @@ * like-named arguments in the following wrappers except where noted below. */ #define futex(uaddr, op, val, timeout, uaddr2, val3, opflags) \ - syscall(SYS_futex, uaddr, op | opflags, val, timeout, uaddr2, val3) + syscall(__NR_futex1, uaddr, op | opflags, val, timeout, uaddr2, val3) + +#define futex2_wake(uaddr, nr, flags) \ + syscall(__NR_futex_wake, uaddr, nr, flags | FUTEX_32) + +#define futex2_wait(uaddr, val, flags, timeout) \ + syscall(__NR_futex_wait, uaddr, val, flags | FUTEX_32, timeout) /** * futex_wait() - block on uaddr with optional timeout @@ -43,7 +53,13 @@ static inline int futex_wait(u_int32_t *uaddr, u_int32_t val, struct timespec *timeout, int opflags) { +#ifdef FUTEX1 return futex(uaddr, FUTEX_WAIT, val, timeout, NULL, 0, opflags); +#else + UNUSED(timeout); + UNUSED(opflags); + return futex2_wait(uaddr, val, 0, NULL); +#endif } /** @@ -53,7 +69,12 @@ futex_wait(u_int32_t *uaddr, u_int32_t val, struct timespec *timeout, int opflag static inline int futex_wake(u_int32_t *uaddr, int nr_wake, int opflags) { +#ifdef FUTEX1 return futex(uaddr, FUTEX_WAKE, nr_wake, NULL, NULL, 0, opflags); +#else + UNUSED(opflags); + return futex2_wake(uaddr, nr_wake, 0); +#endif } /** -- 2.28.0 From 2f5e38a4191ac6fd5040435f6a41433add3711a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Thu, 15 Oct 2020 18:06:40 -0300 Subject: [PATCH 07/13] futex2: Add support for shared futexes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for shared futexes for cross-process resources. Signed-off-by: André Almeida --- kernel/futex2.c | 169 +++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 146 insertions(+), 23 deletions(-) diff --git a/kernel/futex2.c b/kernel/futex2.c index 4b782b5ef615..ae743ddf223e 100644 --- a/kernel/futex2.c +++ b/kernel/futex2.c @@ -6,7 +6,9 @@ */ #include +#include #include +#include #include #include #include @@ -15,6 +17,7 @@ /** * struct futex_waiter - List entry for a waiter + * @uaddr: Memory address of userspace futex * @key.address: Memory address of userspace futex * @key.mm: Pointer to memory management struct of this process * @key: Stores information that uniquely identify a futex @@ -25,6 +28,7 @@ * @index: Index of waiter in futexv list */ struct futex_waiter { + uintptr_t uaddr; struct futex_key { uintptr_t address; struct mm_struct *mm; @@ -125,16 +129,109 @@ static inline int bucket_get_waiters(struct futex_bucket *bucket) #endif } +static u64 get_inode_sequence_number(struct inode *inode) +{ + static atomic64_t i_seq; + u64 old; + + /* Does the inode already have a sequence number? 
*/ + old = atomic64_read(&inode->i_sequence); + if (likely(old)) + return old; + + for (;;) { + u64 new = atomic64_add_return(1, &i_seq); + if (WARN_ON_ONCE(!new)) + continue; + + old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new); + if (old) + return old; + return new; + } +} + +static int futex_get_shared_key(uintptr_t address, struct mm_struct *mm, + struct futex_key *key) +{ + int err; + struct page *page, *tail; + struct address_space *mapping; + +again: + err = get_user_pages_fast(address, 1, 0, &page); + + if (err < 0) + return err; + else + err = 0; + + + tail = page; + page = compound_head(page); + mapping = READ_ONCE(page->mapping); + + + if (unlikely(!mapping)) { + int shmem_swizzled; + + lock_page(page); + shmem_swizzled = PageSwapCache(page) || page->mapping; + unlock_page(page); + put_page(page); + + if (shmem_swizzled) + goto again; + + return -EFAULT; + } + + if (PageAnon(page)) { + + key->mm = mm; + key->address = address; + + } else { + struct inode *inode; + + rcu_read_lock(); + + if (READ_ONCE(page->mapping) != mapping) { + rcu_read_unlock(); + put_page(page); + + goto again; + } + + inode = READ_ONCE(mapping->host); + if (!inode) { + rcu_read_unlock(); + put_page(page); + + goto again; + } + + key->address = get_inode_sequence_number(inode); + key->mm = (struct mm_struct *) basepage_index(tail); + rcu_read_unlock(); + } + + put_page(page); + return err; +} + /** * futex_get_bucket - Check if the user address is valid, prepare internal * data and calculate the hash * @uaddr: futex user address * @key: data that uniquely identifies a futex + * @shared: is this a shared futex? * * Return: address of bucket on success, error code otherwise */ static struct futex_bucket *futex_get_bucket(void __user *uaddr, - struct futex_key *key) + struct futex_key *key, + bool shared) { uintptr_t address = (uintptr_t) uaddr; u32 hash_key; @@ -145,8 +242,15 @@ static struct futex_bucket *futex_get_bucket(void __user *uaddr, if (unlikely(!access_ok(address, sizeof(u32)))) return ERR_PTR(-EFAULT); - key->address = address; - key->mm = current->mm; + if (!shared) { + key->address = address; + key->mm = current->mm; + } else { + int err = futex_get_shared_key(address, current->mm, key); + + if (err) + return ERR_PTR(err); + } /* Generate hash key for this futex using uaddr and current->mm */ hash_key = jhash2((u32 *) key, sizeof(*key) / sizeof(u32), 0); @@ -275,9 +376,10 @@ static int futex_dequeue_multiple(struct futexv *futexv, unsigned int nr) * Return: 0 on success, error code otherwise */ static int futex_enqueue(struct futexv *futexv, unsigned int nr_futexes, - unsigned int *awaken) + int *awaken) { int i, ret; + bool shared; u32 uval, *uaddr, val; struct futex_bucket *bucket; @@ -285,9 +387,13 @@ static int futex_enqueue(struct futexv *futexv, unsigned int nr_futexes, set_current_state(TASK_INTERRUPTIBLE); for (i = 0; i < nr_futexes; i++) { - uaddr = (u32 * __user) futexv->objects[i].key.address; + uaddr = (u32 * __user) futexv->objects[i].uaddr; val = (u32) futexv->objects[i].val; - bucket = futexv->objects[i].bucket; + shared = (futexv->objects[i].flags & FUTEX_SHARED_FLAG) ?
true : false; + if (shared) + bucket = futex_get_bucket((void *) uaddr, &futexv->objects[i].key, true); + else + bucket = futexv->objects[i].bucket; bucket_inc_waiters(bucket); spin_lock(&bucket->lock); @@ -301,11 +407,14 @@ static int futex_enqueue(struct futexv *futexv, unsigned int nr_futexes, __set_current_state(TASK_RUNNING); *awaken = futex_dequeue_multiple(futexv, i); + if (shared) + goto retry; + if (__get_user(uval, uaddr)) return -EFAULT; if (*awaken >= 0) - return 0; + return 1; goto retry; } @@ -313,12 +422,14 @@ static int futex_enqueue(struct futexv *futexv, unsigned int nr_futexes, if (uval != val) { spin_unlock(&bucket->lock); + bucket_dec_waiters(bucket); __set_current_state(TASK_RUNNING); *awaken = futex_dequeue_multiple(futexv, i); - if (*awaken >= 0) - return 0; + if (*awaken >= 0) { + return 1; + } return -EWOULDBLOCK; } @@ -336,19 +447,18 @@ static int __futex_wait(struct futexv *futexv, struct hrtimer_sleeper *timeout) { int ret; - unsigned int awaken = -1; - while (1) { - ret = futex_enqueue(futexv, nr_futexes, &awaken); - if (ret < 0) - break; + while (1) { + int awaken = -1; - if (awaken <= 0) { - return awaken; + ret = futex_enqueue(futexv, nr_futexes, &awaken); + if (ret) { + if (awaken >= 0) + return awaken; + return ret; } - /* Before sleeping, check if someone was woken */ if (!futexv->hint && (!timeout || timeout->task)) freezable_schedule(); @@ -419,6 +529,7 @@ static int futex_wait(struct futexv *futexv, unsigned int nr_futexes, hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS); } + ret = __futex_wait(futexv, nr_futexes, timo ? timeout : NULL); @@ -438,9 +549,10 @@ static int futex_wait(struct futexv *futexv, unsigned int nr_futexes, SYSCALL_DEFINE4(futex_wait, void __user *, uaddr, unsigned int, val, unsigned int, flags, struct __kernel_timespec __user *, timo) { + bool shared = (flags & FUTEX_SHARED_FLAG) ? true : false; unsigned int size = flags & FUTEX_SIZE_MASK; - struct hrtimer_sleeper timeout; struct futex_single_waiter wait_single; + struct hrtimer_sleeper timeout; struct futex_waiter *waiter; struct futexv *futexv; int ret; @@ -452,6 +564,7 @@ SYSCALL_DEFINE4(futex_wait, void __user *, uaddr, unsigned int, val, waiter = &wait_single.waiter; waiter->index = 0; waiter->val = val; + waiter->uaddr = (uintptr_t) uaddr; INIT_LIST_HEAD(&waiter->list); @@ -462,11 +575,14 @@ SYSCALL_DEFINE4(futex_wait, void __user *, uaddr, unsigned int, val, return -EINVAL; /* Get an unlocked hash bucket */ - waiter->bucket = futex_get_bucket(uaddr, &waiter->key); - if (IS_ERR(waiter->bucket)) + waiter->bucket = futex_get_bucket(uaddr, &waiter->key, shared); + if (IS_ERR(waiter->bucket)) { return PTR_ERR(waiter->bucket); + } ret = futex_wait(futexv, 1, timo, &timeout, flags); + if (ret > 0) + ret = 0; return ret; } @@ -486,8 +602,10 @@ static int futex_parse_waitv(struct futexv *futexv, struct futex_waitv waitv; unsigned int i; struct futex_bucket *bucket; + bool shared; for (i = 0; i < nr_futexes; i++) { + if (copy_from_user(&waitv, &uwaitv[i], sizeof(waitv))) return -EFAULT; @@ -495,8 +613,10 @@ static int futex_parse_waitv(struct futexv *futexv, (waitv.flags & FUTEX_SIZE_MASK) != FUTEX_32) return -EINVAL; + shared = (waitv.flags & FUTEX_SHARED_FLAG) ? 
true : false; + bucket = futex_get_bucket(waitv.uaddr, - &futexv->objects[i].key); + &futexv->objects[i].key, shared); if (IS_ERR(bucket)) return PTR_ERR(bucket); @@ -505,6 +625,7 @@ static int futex_parse_waitv(struct futexv *futexv, futexv->objects[i].flags = waitv.flags; futexv->objects[i].index = i; INIT_LIST_HEAD(&futexv->objects[i].list); + futexv->objects[i].uaddr = (uintptr_t) waitv.uaddr; } return 0; @@ -573,6 +694,7 @@ static struct futexv *futex_get_parent(uintptr_t waiter, u8 index) SYSCALL_DEFINE3(futex_wake, void __user *, uaddr, unsigned int, nr_wake, unsigned int, flags) { + bool shared = (flags & FUTEX_SHARED_FLAG) ? true : false; unsigned int size = flags & FUTEX_SIZE_MASK; struct futex_waiter waiter, *aux, *tmp; struct futex_bucket *bucket; @@ -586,9 +708,10 @@ SYSCALL_DEFINE3(futex_wake, void __user *, uaddr, unsigned int, nr_wake, if (size != FUTEX_32) return -EINVAL; - bucket = futex_get_bucket(uaddr, &waiter.key); - if (IS_ERR(bucket)) + bucket = futex_get_bucket(uaddr, &waiter.key, shared); + if (IS_ERR(bucket)) { return PTR_ERR(bucket); + } if (!bucket_get_waiters(bucket)) return 0; -- 2.28.0 From 909eb056421668b5d42f8c4dfa92339851a43dd8 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 2 Nov 2020 18:41:38 -0500 Subject: [PATCH 08/13] Revert "futex: Remove needless goto's" This reverts commit d7c5ed73b19c4640426d9c106f70ec2cb532034d. --- kernel/futex.c | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 6c00c0952313..a671d371b11f 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1593,13 +1593,13 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ); if (unlikely(ret != 0)) - return ret; + goto out; hb = hash_futex(&key); /* Make sure we really have tasks to wakeup */ if (!hb_waiters_pending(hb)) - return ret; + goto out; spin_lock(&hb->lock); @@ -1622,6 +1622,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) spin_unlock(&hb->lock); wake_up_q(&wake_q); +out: return ret; } @@ -1688,10 +1689,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, retry: ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); if (unlikely(ret != 0)) - return ret; + goto out; ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); if (unlikely(ret != 0)) - return ret; + goto out; hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); @@ -1709,13 +1710,13 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, * an MMU, but we might get them from range checking */ ret = op_ret; - return ret; + goto out; } if (op_ret == -EFAULT) { ret = fault_in_user_writeable(uaddr2); if (ret) - return ret; + goto out; } if (!(flags & FLAGS_SHARED)) { @@ -1758,6 +1759,7 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, out_unlock: double_unlock_hb(hb1, hb2); wake_up_q(&wake_q); +out: return ret; } @@ -1964,18 +1966,20 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, retry: ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); if (unlikely(ret != 0)) - return ret; + goto out; ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, requeue_pi ? FUTEX_WRITE : FUTEX_READ); if (unlikely(ret != 0)) - return ret; + goto out; /* * The check above which compares uaddrs is not sufficient for * shared futexes. 
We need to compare the keys: */ - if (requeue_pi && match_futex(&key1, &key2)) - return -EINVAL; + if (requeue_pi && match_futex(&key1, &key2)) { + ret = -EINVAL; + goto out; + } hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); @@ -1995,7 +1999,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, ret = get_user(curval, uaddr1); if (ret) - return ret; + goto out; if (!(flags & FLAGS_SHARED)) goto retry_private; @@ -2061,7 +2065,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, ret = fault_in_user_writeable(uaddr2); if (!ret) goto retry; - return ret; + goto out; case -EBUSY: case -EAGAIN: /* @@ -2180,6 +2184,8 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, double_unlock_hb(hb1, hb2); wake_up_q(&wake_q); hb_waiters_dec(hb2); + +out: return ret ? ret : task_count; } @@ -2537,7 +2543,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) */ if (q->pi_state->owner != current) ret = fixup_pi_state_owner(uaddr, q, current); - return ret ? ret : locked; + goto out; } /* @@ -2550,7 +2556,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) */ if (q->pi_state->owner == current) { ret = fixup_pi_state_owner(uaddr, q, NULL); - return ret; + goto out; } /* @@ -2564,7 +2570,8 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) q->pi_state->owner); } - return ret; +out: + return ret ? ret : locked; } /** @@ -2661,7 +2668,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, ret = get_user(uval, uaddr); if (ret) - return ret; + goto out; if (!(flags & FLAGS_SHARED)) goto retry_private; @@ -2674,6 +2681,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, ret = -EWOULDBLOCK; } +out: return ret; } -- 2.28.0 From fee513186b69c4a65534fd790545877974ef17d3 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 2 Nov 2020 18:41:54 -0500 Subject: [PATCH 09/13] Revert "futex: Remove put_futex_key()" This reverts commit 9180bd467f9abdb44afde650d07e3b9dd66d837c. 
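Note that put_futex_key() comes back here as an empty stub (futex keys no longer hold references), but the following patches in this series use the get/put pairing as a structural anchor for their error paths when several keys are taken in a single call. A rough sketch of the pattern this revert re-enables (not part of the revert itself; the uaddrs/keys arrays are illustrative):

	/*
	 * Acquire N keys; on failure, release the keys already taken
	 * before bailing out. The put side keeps the pairing explicit
	 * even though it currently compiles to nothing.
	 */
	for (i = 0; i < count; i++) {
		ret = get_futex_key(uaddrs[i], shared, &keys[i], FUTEX_READ);
		if (unlikely(ret != 0)) {
			for (--i; i >= 0; i--)
				put_futex_key(&keys[i]);
			return ret;
		}
	}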
--- kernel/futex.c | 61 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 49 insertions(+), 12 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index a671d371b11f..647de692c874 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -661,6 +661,10 @@ static int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, return err; } +static inline void put_futex_key(union futex_key *key) +{ +} + /** * fault_in_user_writeable() - Fault in user address and verify RW access * @uaddr: pointer to faulting user space address @@ -1599,7 +1603,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) /* Make sure we really have tasks to wakeup */ if (!hb_waiters_pending(hb)) - goto out; + goto out_put_key; spin_lock(&hb->lock); @@ -1622,6 +1626,8 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) spin_unlock(&hb->lock); wake_up_q(&wake_q); +out_put_key: + put_futex_key(&key); out: return ret; } @@ -1692,7 +1698,7 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, goto out; ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); if (unlikely(ret != 0)) - goto out; + goto out_put_key1; hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); @@ -1710,13 +1716,13 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, * an MMU, but we might get them from range checking */ ret = op_ret; - goto out; + goto out_put_keys; } if (op_ret == -EFAULT) { ret = fault_in_user_writeable(uaddr2); if (ret) - goto out; + goto out_put_keys; } if (!(flags & FLAGS_SHARED)) { @@ -1724,6 +1730,8 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, goto retry_private; } + put_futex_key(&key2); + put_futex_key(&key1); cond_resched(); goto retry; } @@ -1759,6 +1767,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, out_unlock: double_unlock_hb(hb1, hb2); wake_up_q(&wake_q); +out_put_keys: + put_futex_key(&key2); +out_put_key1: + put_futex_key(&key1); out: return ret; } @@ -1970,7 +1982,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, requeue_pi ? FUTEX_WRITE : FUTEX_READ); if (unlikely(ret != 0)) - goto out; + goto out_put_key1; /* * The check above which compares uaddrs is not sufficient for @@ -1978,7 +1990,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, */ if (requeue_pi && match_futex(&key1, &key2)) { ret = -EINVAL; - goto out; + goto out_put_keys; } hb1 = hash_futex(&key1); @@ -1999,11 +2011,13 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, ret = get_user(curval, uaddr1); if (ret) - goto out; + goto out_put_keys; if (!(flags & FLAGS_SHARED)) goto retry_private; + put_futex_key(&key2); + put_futex_key(&key1); goto retry; } if (curval != *cmpval) { @@ -2062,6 +2076,8 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, case -EFAULT: double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); + put_futex_key(&key2); + put_futex_key(&key1); ret = fault_in_user_writeable(uaddr2); if (!ret) goto retry; @@ -2076,6 +2092,8 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, */ double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); + put_futex_key(&key2); + put_futex_key(&key1); /* * Handle the case where the owner is in the middle of * exiting. 
Wait for the exit to complete otherwise @@ -2185,6 +2203,10 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, wake_up_q(&wake_q); hb_waiters_dec(hb2); +out_put_keys: + put_futex_key(&key2); +out_put_key1: + put_futex_key(&key1); out: return ret ? ret : task_count; } @@ -2673,6 +2695,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, if (!(flags & FLAGS_SHARED)) goto retry_private; + put_futex_key(&q->key); goto retry; } @@ -2682,6 +2705,8 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, } out: + if (ret) + put_futex_key(&q->key); return ret; } @@ -2826,6 +2851,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, * - EAGAIN: The user space value changed. */ queue_unlock(hb); + put_futex_key(&q.key); /* * Handle the case where the owner is in the middle of * exiting. Wait for the exit to complete otherwise @@ -2933,11 +2959,13 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, put_pi_state(pi_state); } - goto out; + goto out_put_key; out_unlock_put_key: queue_unlock(hb); +out_put_key: + put_futex_key(&q.key); out: if (to) { hrtimer_cancel(&to->timer); @@ -2950,11 +2978,12 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ret = fault_in_user_writeable(uaddr); if (ret) - goto out; + goto out_put_key; if (!(flags & FLAGS_SHARED)) goto retry_private; + put_futex_key(&q.key); goto retry; } @@ -3083,13 +3112,16 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) out_unlock: spin_unlock(&hb->lock); out_putkey: + put_futex_key(&key); return ret; pi_retry: + put_futex_key(&key); cond_resched(); goto retry; pi_faulted: + put_futex_key(&key); ret = fault_in_user_writeable(uaddr); if (!ret) @@ -3231,7 +3263,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, */ ret = futex_wait_setup(uaddr, val, flags, &q, &hb); if (ret) - goto out; + goto out_key2; /* * The check above which compares uaddrs is not sufficient for @@ -3240,7 +3272,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (match_futex(&q.key, &key2)) { queue_unlock(hb); ret = -EINVAL; - goto out; + goto out_put_keys; } /* Queue the futex_q, drop the hb lock, wait for wakeup. */ @@ -3250,7 +3282,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); spin_unlock(&hb->lock); if (ret) - goto out; + goto out_put_keys; /* * In order for us to be here, we know our q.key == key2, and since @@ -3340,6 +3372,11 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, ret = -EWOULDBLOCK; } +out_put_keys: + put_futex_key(&q.key); +out_key2: + put_futex_key(&key2); + out: if (to) { hrtimer_cancel(&to->timer); -- 2.28.0 From 3b1489448a277fc1c34ca12e859193c3a7f3446c Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Fri, 12 Jul 2019 14:16:20 -0400 Subject: [PATCH 10/13] futex: Split key setup from key queue locking and read Split the futex key setup from the queue locking and key reading. This is useful to support the setup of multiple keys at the same time, like what is done in futex_requeue() and what will be done for the FUTEX_WAIT_MULTIPLE command.
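With the split, a caller that operates on several futexes can resolve all keys up front and only then lock each hash bucket and validate the expected value. A rough sketch of the intended calling pattern (illustrative only; the wb/qs arrays anticipate the FUTEX_WAIT_MULTIPLE patch that follows):

	/* Phase 1: resolve every key before touching any bucket lock. */
	for (i = 0; i < count; i++) {
		ret = get_futex_key(wb[i].uaddr, flags & FLAGS_SHARED,
				    &qs[i].key, FUTEX_READ);
		if (unlikely(ret != 0))
			goto drop_keys;
	}

	/* Phase 2: lock each bucket, check *uaddr == val, and queue. */
	for (i = 0; i < count; i++) {
		ret = __futex_wait_setup(wb[i].uaddr, wb[i].val, flags,
					 &qs[i], &hb);
		if (ret)
			goto unwind;
		queue_me(&qs[i], hb);
	}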
Signed-off-by: Gabriel Krisman Bertazi --- kernel/futex.c | 71 +++++++++++++++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 29 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 647de692c874..f05349def492 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2634,6 +2634,39 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, __set_current_state(TASK_RUNNING); } +static int __futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, + struct futex_q *q, struct futex_hash_bucket **hb) +{ + + u32 uval; + int ret; + +retry_private: + *hb = queue_lock(q); + + ret = get_futex_value_locked(&uval, uaddr); + + if (ret) { + queue_unlock(*hb); + + ret = get_user(uval, uaddr); + if (ret) + return ret; + + if (!(flags & FLAGS_SHARED)) + goto retry_private; + + return 1; + } + + if (uval != val) { + queue_unlock(*hb); + ret = -EWOULDBLOCK; + } + + return ret; +} + /** * futex_wait_setup() - Prepare to wait on a futex * @uaddr: the futex userspace address @@ -2654,7 +2687,6 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, struct futex_q *q, struct futex_hash_bucket **hb) { - u32 uval; int ret; /* @@ -2675,38 +2707,19 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, * absorb a wakeup if *uaddr does not match the desired values * while the syscall executes. */ -retry: - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ); - if (unlikely(ret != 0)) - return ret; - -retry_private: - *hb = queue_lock(q); + do { + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, + &q->key, FUTEX_READ); + if (unlikely(ret != 0)) + return ret; - ret = get_futex_value_locked(&uval, uaddr); + ret = __futex_wait_setup(uaddr, val, flags, q, hb); - if (ret) { - queue_unlock(*hb); - - ret = get_user(uval, uaddr); + /* Drop key reference if retry or error. */ if (ret) - goto out; + put_futex_key(&q->key); - if (!(flags & FLAGS_SHARED)) - goto retry_private; - - put_futex_key(&q->key); - goto retry; - } - - if (uval != val) { - queue_unlock(*hb); - ret = -EWOULDBLOCK; - } - -out: - if (ret) - put_futex_key(&q->key); + } while (ret > 0); return ret; } -- 2.28.0 From 539862895e53b9a774f3a2271d1e7db57879d0d7 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 8 Jul 2019 09:44:09 -0400 Subject: [PATCH 11/13] futex: Implement FUTEX_WAIT_MULTIPLE This is a new futex operation to allow a thread to wait on several futexes at the same time, and wake up on any of them. In a sense, it implements one of the features that were supported by polling on the old FUTEX_FD interface. My use case for this feature lies in Wine, where we want to implement a similar function available in Windows, mainly for event handling. The Wine folks have an implementation of the userspace side using eventfd, but it suffers from bad performance, as shown in the measurements below. Technically, the old FUTEX_WAIT implementation can be easily reimplemented using do_futex_wait_multiple, with a count of one, and I have a patch demonstrating how it works. I'm not proposing it, since futex is such tricky code that I'd be more comfortable to have FUTEX_WAIT_MULTIPLE running upstream for a couple development cycles, before considering modifying FUTEX_WAIT.
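From userspace, the new operation takes a pointer to an array of struct futex_wait_block in the uaddr argument and the number of entries in val, and returns the index of the futex that was woken. A minimal sketch of a call site (futex_a/futex_b and the expected values are stand-in names):

	struct futex_wait_block wb[2] = {
		{ &futex_a, expected_a, FUTEX_BITSET_MATCH_ANY },
		{ &futex_b, expected_b, FUTEX_BITSET_MATCH_ANY },
	};
	int ret;

	/* Sleeps until either futex is woken; returns the array index. */
	ret = syscall(SYS_futex, wb, FUTEX_WAIT_MULTIPLE | FUTEX_PRIVATE_FLAG,
		      2, NULL, NULL, 0);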
This was tested using three mechanisms: 1) By reimplementing FUTEX_WAIT in terms of FUTEX_WAIT_MULTIPLE and running tools/testing/selftests/futex and a full Linux distro on top of this kernel. 2) By an example program that exercises the FUTEX_WAIT_MULTIPLE path on a multi-thread, event-handling setup. 3) By running the Wine fsync implementation and executing multi-threaded applications, in particular modern games, on top of the implementation. Signed-off-by: Zebediah Figura Signed-off-by: Steven Noonan Signed-off-by: Pierre-Loup A. Griffais Signed-off-by: Gabriel Krisman Bertazi --- include/uapi/linux/futex.h | 7 ++ kernel/futex.c | 159 ++++++++++++++++++++++++++++++++++++- 2 files changed, 162 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h index 35a5bf1cd41b..aefb0b83b784 100644 --- a/include/uapi/linux/futex.h +++ b/include/uapi/linux/futex.h @@ -21,6 +21,7 @@ #define FUTEX_WAKE_BITSET 10 #define FUTEX_WAIT_REQUEUE_PI 11 #define FUTEX_CMP_REQUEUE_PI 12 +#define FUTEX_WAIT_MULTIPLE 13 #define FUTEX_PRIVATE_FLAG 128 #define FUTEX_CLOCK_REALTIME 256 @@ -190,4 +191,10 @@ struct robust_list_head { (((op & 0xf) << 28) | ((cmp & 0xf) << 24) \ | ((oparg & 0xfff) << 12) | (cmparg & 0xfff)) +struct futex_wait_block { + __u32 __user *uaddr; + __u32 val; + __u32 bitset; +}; + #endif /* _UAPI_LINUX_FUTEX_H */ diff --git a/kernel/futex.c b/kernel/futex.c index f05349def492..775f780a96c4 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -166,6 +166,7 @@ static int __read_mostly futex_cmpxchg_enabled; #endif #define FLAGS_CLOCKRT 0x02 #define FLAGS_HAS_TIMEOUT 0x04 +#define FLAGS_WAKE_MULTIPLE 0x08 /* * Priority Inheritance state: @@ -2723,6 +2724,148 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, return ret; } +static int do_futex_wait_multiple(struct futex_wait_block *wb, + u32 count, unsigned int flags, + ktime_t *abs_time) +{ + + struct hrtimer_sleeper timeout, *to; + struct futex_hash_bucket *hb; + struct futex_q *qs = NULL; + int ret; + int i; + + qs = kcalloc(count, sizeof(struct futex_q), GFP_KERNEL); + if (!qs) + return -ENOMEM; + + to = futex_setup_timer(abs_time, &timeout, flags, + current->timer_slack_ns); + retry: + for (i = 0; i < count; i++) { + qs[i].key = FUTEX_KEY_INIT; + qs[i].bitset = wb[i].bitset; + + ret = get_futex_key(wb[i].uaddr, flags & FLAGS_SHARED, + &qs[i].key, FUTEX_READ); + if (unlikely(ret != 0)) { + for (--i; i >= 0; i--) + put_futex_key(&qs[i].key); + goto out; + } + } + + set_current_state(TASK_INTERRUPTIBLE); + + for (i = 0; i < count; i++) { + ret = __futex_wait_setup(wb[i].uaddr, wb[i].val, + flags, &qs[i], &hb); + if (ret) { + /* Drop the failed key directly. Keys 0..(i-1) + * will be put by unqueue_me. */ + put_futex_key(&qs[i].key); + + /* Undo the partial work we did. */ + for (--i; i >= 0; i--) + unqueue_me(&qs[i]); + + __set_current_state(TASK_RUNNING); + if (ret > 0) + goto retry; + goto out; + } + + /* We can't hold the bucket lock when dealing with + * the next futex. Queue ourselves now so we can unlock + * it before moving on. */ + queue_me(&qs[i], hb); + } + + if (to) + hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); + + /* There is no easy way to check if we were woken already on + * one of the multiple futexes without walking through each one + * of them. So just sleep and let the scheduler handle it. + */ + if (!to || to->task) + freezable_schedule(); + + __set_current_state(TASK_RUNNING); + + ret = -ETIMEDOUT; + /* If we were woken (and unqueued), we succeeded.
+	for (i = 0; i < count; i++)
+		if (!unqueue_me(&qs[i]))
+			ret = i;
+
+	/* Successful wakeup */
+	if (ret >= 0)
+		goto out;
+
+	/* Woken by a triggered timeout */
+	if (to && !to->task)
+		goto out;
+
+	/*
+	 * We expect signal_pending(current), but we might be the
+	 * victim of a spurious wakeup as well.
+	 */
+	if (!signal_pending(current))
+		goto retry;
+
+	ret = -ERESTARTSYS;
+	if (!abs_time)
+		goto out;
+
+	ret = -ERESTART_RESTARTBLOCK;
+ out:
+	if (to) {
+		hrtimer_cancel(&to->timer);
+		destroy_hrtimer_on_stack(&to->timer);
+	}
+
+	kfree(qs);
+	return ret;
+}
+
+static int futex_wait_multiple(u32 __user *uaddr, unsigned int flags,
+			       u32 count, ktime_t *abs_time)
+{
+	struct futex_wait_block *wb;
+	struct restart_block *restart;
+	int ret;
+
+	if (!count)
+		return -EINVAL;
+
+	wb = kcalloc(count, sizeof(struct futex_wait_block), GFP_KERNEL);
+	if (!wb)
+		return -ENOMEM;
+
+	if (copy_from_user(wb, uaddr,
+			   count * sizeof(struct futex_wait_block))) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	ret = do_futex_wait_multiple(wb, count, flags, abs_time);
+
+	if (ret == -ERESTART_RESTARTBLOCK) {
+		restart = &current->restart_block;
+		restart->fn = futex_wait_restart;
+		restart->futex.uaddr = uaddr;
+		restart->futex.val = count;
+		restart->futex.time = *abs_time;
+		restart->futex.flags = (flags | FLAGS_HAS_TIMEOUT |
+					FLAGS_WAKE_MULTIPLE);
+	}
+
+out:
+	kfree(wb);
+	return ret;
+}
+
 static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
 		      ktime_t *abs_time, u32 bitset)
 {
@@ -2800,6 +2943,10 @@ static long futex_wait_restart(struct restart_block *restart)
 	}
 	restart->fn = do_no_restart_syscall;
 
+	if (restart->futex.flags & FLAGS_WAKE_MULTIPLE)
+		return (long)futex_wait_multiple(uaddr, restart->futex.flags,
+						 restart->futex.val, tp);
+
 	return (long)futex_wait(uaddr, restart->futex.flags,
 				restart->futex.val, tp, restart->futex.bitset);
 }
@@ -3843,6 +3990,8 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 			     uaddr2);
 	case FUTEX_CMP_REQUEUE_PI:
 		return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
+	case FUTEX_WAIT_MULTIPLE:
+		return futex_wait_multiple(uaddr, flags, val, timeout);
 	}
 	return -ENOSYS;
 }
@@ -3859,7 +4008,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
 
 	if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
 		      cmd == FUTEX_WAIT_BITSET ||
-		      cmd == FUTEX_WAIT_REQUEUE_PI)) {
+		      cmd == FUTEX_WAIT_REQUEUE_PI ||
+		      cmd == FUTEX_WAIT_MULTIPLE)) {
 		if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
 			return -EFAULT;
 		if (get_timespec64(&ts, utime))
@@ -3868,7 +4018,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
 			return -EINVAL;
 
 		t = timespec64_to_ktime(ts);
-		if (cmd == FUTEX_WAIT)
+		if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE)
 			t = ktime_add_safe(ktime_get(), t);
 		else if (!(op & FUTEX_CLOCK_REALTIME))
 			t = timens_ktime_to_host(CLOCK_MONOTONIC, t);
@@ -4055,14 +4205,15 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
 
 	if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
 		      cmd == FUTEX_WAIT_BITSET ||
-		      cmd == FUTEX_WAIT_REQUEUE_PI)) {
+		      cmd == FUTEX_WAIT_REQUEUE_PI ||
+		      cmd == FUTEX_WAIT_MULTIPLE)) {
 		if (get_old_timespec32(&ts, utime))
 			return -EFAULT;
 		if (!timespec64_valid(&ts))
 			return -EINVAL;
 
 		t = timespec64_to_ktime(ts);
-		if (cmd == FUTEX_WAIT)
+		if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE)
 			t = ktime_add_safe(ktime_get(), t);
 		else if (!(op & FUTEX_CLOCK_REALTIME))
 			t = timens_ktime_to_host(CLOCK_MONOTONIC, t);
-- 
2.28.0
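[Editor's sketch, not part of the series: a minimal userspace caller of
the operation added above, waiting on two futexes with a relative
timeout (the syscall entry converts it to absolute, as with plain
FUTEX_WAIT). The local wait_block struct mirrors the uapi struct
futex_wait_block; the opcode is 13 at this point in the series and
moves to 31 in the next patch.]

#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>
#include <linux/futex.h>

#ifndef FUTEX_WAIT_MULTIPLE
#define FUTEX_WAIT_MULTIPLE 13	/* changed to 31 by the next patch */
#endif

struct wait_block {		/* mirrors struct futex_wait_block */
	uint32_t *uaddr;
	uint32_t val;
	uint32_t bitset;
};

static uint32_t futex_a, futex_b;

int main(void)
{
	struct wait_block wb[2] = {
		{ &futex_a, 0, FUTEX_BITSET_MATCH_ANY },
		{ &futex_b, 0, FUTEX_BITSET_MATCH_ANY },
	};
	struct timespec timeout = { .tv_sec = 1 };
	long ret;

	/* Returns the index of the woken entry, or -1 with errno set
	 * (e.g. ETIMEDOUT on timeout, EWOULDBLOCK if some *uaddr no
	 * longer matches its expected val). */
	ret = syscall(SYS_futex, wb, FUTEX_WAIT_MULTIPLE, 2, &timeout,
		      NULL, 0);
	if (ret >= 0)
		printf("woken on futex %ld\n", ret);
	else
		perror("FUTEX_WAIT_MULTIPLE");
	return 0;
}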
From f56b85af005d46e9ef920a6728e61f7c47cf561e Mon Sep 17 00:00:00 2001
From: Gabriel Krisman Bertazi
Date: Mon, 2 Nov 2020 18:50:26 -0500
Subject: [PATCH 12/13] futex: Change WAIT_MULTIPLE opcode to 31

Signed-off-by: Gabriel Krisman Bertazi
---
 include/uapi/linux/futex.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
index aefb0b83b784..fe2b67ac0c5e 100644
--- a/include/uapi/linux/futex.h
+++ b/include/uapi/linux/futex.h
@@ -21,7 +21,7 @@
 #define FUTEX_WAKE_BITSET	10
 #define FUTEX_WAIT_REQUEUE_PI	11
 #define FUTEX_CMP_REQUEUE_PI	12
-#define FUTEX_WAIT_MULTIPLE	13
+#define FUTEX_WAIT_MULTIPLE	31
 
 #define FUTEX_PRIVATE_FLAG	128
 #define FUTEX_CLOCK_REALTIME	256
-- 
2.28.0

From 022e2f888a50fb8d062e26bc385abf02c0be84a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Almeida?=
Date: Mon, 16 Nov 2020 21:22:21 -0300
Subject: [PATCH 13/13] futex2: Add sysfs entry for syscall numbers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: André Almeida
---
 kernel/futex2.c | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/kernel/futex2.c b/kernel/futex2.c
index ae743ddf223e..4bdff8bfc78d 100644
--- a/kernel/futex2.c
+++ b/kernel/futex2.c
@@ -742,6 +742,48 @@ SYSCALL_DEFINE3(futex_wake, void __user *, uaddr, unsigned int, nr_wake,
 	return ret;
 }
 
+static ssize_t wait_show(struct kobject *kobj, struct kobj_attribute *attr,
+			 char *buf)
+{
+	return sprintf(buf, "%u\n", __NR_futex_wait);
+
+}
+static struct kobj_attribute futex2_wait_attr = __ATTR_RO(wait);
+
+static ssize_t wake_show(struct kobject *kobj, struct kobj_attribute *attr,
+			 char *buf)
+{
+	return sprintf(buf, "%u\n", __NR_futex_wake);
+
+}
+static struct kobj_attribute futex2_wake_attr = __ATTR_RO(wake);
+
+static ssize_t waitv_show(struct kobject *kobj, struct kobj_attribute *attr,
+			  char *buf)
+{
+	return sprintf(buf, "%u\n", __NR_futex_waitv);
+
+}
+static struct kobj_attribute futex2_waitv_attr = __ATTR_RO(waitv);
+
+static struct attribute *futex2_sysfs_attrs[] = {
+	&futex2_wait_attr.attr,
+	&futex2_wake_attr.attr,
+	&futex2_waitv_attr.attr,
+	NULL,
+};
+
+static const struct attribute_group futex2_sysfs_attr_group = {
+	.attrs = futex2_sysfs_attrs,
+	.name = "futex2",
+};
+
+static int __init futex2_sysfs_init(void)
+{
+	return sysfs_create_group(kernel_kobj, &futex2_sysfs_attr_group);
+}
+subsys_initcall(futex2_sysfs_init);
+
 static int __init futex2_init(void)
 {
 	int i;
-- 
2.28.0
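[Editor's sketch, not part of the series: a hypothetical userspace
consumer of the sysfs group added above. Since the futex2 syscall
numbers may differ across architectures, a library could read
/sys/kernel/futex2/{wait,wake,waitv} at runtime to probe for futex2
support instead of hardcoding numbers.]

#include <stdio.h>

/* Read one syscall number from /sys/kernel/futex2/<name>;
 * returns -1 if the file is absent (futex2 not available). */
static long futex2_syscall_nr(const char *name)
{
	char path[64];
	long nr = -1;
	FILE *f;

	snprintf(path, sizeof(path), "/sys/kernel/futex2/%s", name);
	f = fopen(path, "r");
	if (!f)
		return -1;
	if (fscanf(f, "%ld", &nr) != 1)
		nr = -1;
	fclose(f);
	return nr;
}

int main(void)
{
	long nr_wake = futex2_syscall_nr("wake");

	if (nr_wake < 0) {
		fprintf(stderr, "futex2 not supported on this kernel\n");
		return 1;
	}
	printf("futex_wake is syscall %ld\n", nr_wake);
	return 0;
}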