path: root/SOURCES/futex2.patch
author Sentry <sentry@Sentry-Desktop-Fedora.local> 2020-11-28 20:10:19 +0100
committer Jan200101 <sentrycraft123@gmail.com> 2020-12-05 19:40:07 +0100
commit 9c1b62aff214e27adddf3d401bdb4dc993aa691d (patch)
tree e5d11bdda6631ba9852742c3d0f021e1f9b6990b /SOURCES/futex2.patch
parent 08a4be94ecc76dc69ba588fd590271335cadab38 (diff)
kernel 5.9.11
Diffstat (limited to 'SOURCES/futex2.patch')
-rw-r--r-- SOURCES/futex2.patch | 6697
1 file changed, 6697 insertions, 0 deletions
diff --git a/SOURCES/futex2.patch b/SOURCES/futex2.patch
new file mode 100644
index 0000000..bfd12ba
--- /dev/null
+++ b/SOURCES/futex2.patch
@@ -0,0 +1,6697 @@
+From ada1f13b98e86cb7ac4140c4976c3d165006d995 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Andr=C3=A9=20Almeida?= <andrealmeid@collabora.com>
+Date: Wed, 5 Aug 2020 12:40:26 -0300
+Subject: [PATCH 01/13] futex2: Add new futex interface
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Initial implementation for futex2. Support only private u32 wait/wake, with
+timeout (monotonic and realtime clocks).
+
+Signed-off-by: André Almeida <andrealmeid@collabora.com>
+---
+ MAINTAINERS | 2 +-
+ arch/x86/entry/syscalls/syscall_32.tbl | 2 +
+ arch/x86/entry/syscalls/syscall_64.tbl | 2 +
+ include/linux/syscalls.h | 7 +
+ include/uapi/asm-generic/unistd.h | 8 +-
+ include/uapi/linux/futex.h | 40 ++
+ init/Kconfig | 7 +
+ kernel/Makefile | 1 +
+ kernel/futex2.c | 484 +++++++++++++++++++++++++
+ kernel/sys_ni.c | 4 +
+ 10 files changed, 555 insertions(+), 2 deletions(-)
+ create mode 100644 kernel/futex2.c
+
+diff --git a/MAINTAINERS b/MAINTAINERS
+index 867157311dc8..0c425f74ed88 100644
+--- a/MAINTAINERS
++++ b/MAINTAINERS
+@@ -7214,7 +7214,7 @@ F: Documentation/locking/*futex*
+ F: include/asm-generic/futex.h
+ F: include/linux/futex.h
+ F: include/uapi/linux/futex.h
+-F: kernel/futex.c
++F: kernel/futex*
+ F: tools/perf/bench/futex*
+ F: tools/testing/selftests/futex/
+
+diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
+index 9d1102873666..955322962964 100644
+--- a/arch/x86/entry/syscalls/syscall_32.tbl
++++ b/arch/x86/entry/syscalls/syscall_32.tbl
+@@ -444,3 +444,5 @@
+ 437 i386 openat2 sys_openat2
+ 438 i386 pidfd_getfd sys_pidfd_getfd
+ 439 i386 faccessat2 sys_faccessat2
++440 i386 futex_wait sys_futex_wait
++441 i386 futex_wake sys_futex_wake
+diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
+index f30d6ae9a688..4133bfe96891 100644
+--- a/arch/x86/entry/syscalls/syscall_64.tbl
++++ b/arch/x86/entry/syscalls/syscall_64.tbl
+@@ -361,6 +361,8 @@
+ 437 common openat2 sys_openat2
+ 438 common pidfd_getfd sys_pidfd_getfd
+ 439 common faccessat2 sys_faccessat2
++440 common futex_wait sys_futex_wait
++441 common futex_wake sys_futex_wake
+
+ #
+ # x32-specific system call numbers start at 512 to avoid cache impact
+diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
+index 75ac7f8ae93c..38c3a87dbfc2 100644
+--- a/include/linux/syscalls.h
++++ b/include/linux/syscalls.h
+@@ -589,6 +589,13 @@ asmlinkage long sys_get_robust_list(int pid,
+ asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
+ size_t len);
+
++/* kernel/futex2.c */
++asmlinkage long sys_futex_wait(void __user *uaddr, unsigned long val,
++ unsigned long flags,
++ struct __kernel_timespec __user *timo);
++asmlinkage long sys_futex_wake(void __user *uaddr, unsigned long nr_wake,
++ unsigned long flags);
++
+ /* kernel/hrtimer.c */
+ asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp,
+ struct __kernel_timespec __user *rmtp);
+diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
+index 995b36c2ea7d..80567ade774a 100644
+--- a/include/uapi/asm-generic/unistd.h
++++ b/include/uapi/asm-generic/unistd.h
+@@ -860,8 +860,14 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
+ #define __NR_faccessat2 439
+ __SYSCALL(__NR_faccessat2, sys_faccessat2)
+
++#define __NR_futex_wait 440
++__SYSCALL(__NR_futex_wait, sys_futex_wait)
++
++#define __NR_futex_wake 441
++__SYSCALL(__NR_futex_wake, sys_futex_wake)
++
+ #undef __NR_syscalls
+-#define __NR_syscalls 440
++#define __NR_syscalls 442
+
+ /*
+ * 32 bit systems traditionally used different
+diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
+index a89eb0accd5e..35a5bf1cd41b 100644
+--- a/include/uapi/linux/futex.h
++++ b/include/uapi/linux/futex.h
+@@ -41,6 +41,46 @@
+ #define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \
+ FUTEX_PRIVATE_FLAG)
+
++/* Size argument to futex2 syscall */
++#define FUTEX_8 0
++#define FUTEX_16 1
++#define FUTEX_32 2
++
++#define FUTEX_SIZE_MASK 0x3
++
++#define FUTEX_SHARED_FLAG 8
++
++#define FUTEX_NUMA_FLAG 16
++
++/*
++ * struct futexXX_numa - struct for NUMA-aware futex operation
++ * @value: futex value
++ * @hint: node id to operate
++ */
++
++struct futex8_numa {
++ __u8 value;
++ __u8 hint;
++};
++
++struct futex16_numa {
++ __u16 value;
++ __u16 hint;
++};
++
++struct futex32_numa {
++ __u32 value;
++ __u32 hint;
++};
++
++#define FUTEX_WAITV_MAX 128
++
++struct futex_waitv {
++ void *uaddr;
++ unsigned int val;
++ unsigned int flags;
++};
++
+ /*
+ * Support for robust futexes: the kernel cleans up held futexes at
+ * thread exit time.
+diff --git a/init/Kconfig b/init/Kconfig
+index 2a5df1cf838c..440f21f5c3d8 100644
+--- a/init/Kconfig
++++ b/init/Kconfig
+@@ -1522,6 +1522,13 @@ config FUTEX
+ support for "fast userspace mutexes". The resulting kernel may not
+ run glibc-based applications correctly.
+
++config FUTEX2
++ bool "Enable futex2 support" if EXPERT
++ depends on FUTEX
++ default n
++ help
++ Experimental support for futex2 interface.
++
+ config FUTEX_PI
+ bool
+ depends on FUTEX && RT_MUTEXES
+diff --git a/kernel/Makefile b/kernel/Makefile
+index 9a20016d4900..51ea9bc647bf 100644
+--- a/kernel/Makefile
++++ b/kernel/Makefile
+@@ -57,6 +57,7 @@ obj-$(CONFIG_PROFILING) += profile.o
+ obj-$(CONFIG_STACKTRACE) += stacktrace.o
+ obj-y += time/
+ obj-$(CONFIG_FUTEX) += futex.o
++obj-$(CONFIG_FUTEX2) += futex2.o
+ obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
+ obj-$(CONFIG_SMP) += smp.o
+ ifneq ($(CONFIG_SMP),y)
+diff --git a/kernel/futex2.c b/kernel/futex2.c
+new file mode 100644
+index 000000000000..107b80a466d0
+--- /dev/null
++++ b/kernel/futex2.c
+@@ -0,0 +1,484 @@
++// SPDX-License-Identifier: GPL-2.0-or-later
++/*
++ * futex2 system call interface by André Almeida <andrealmeid@collabora.com>
++ *
++ * Copyright 2020 Collabora Ltd.
++ */
++
++#include <linux/freezer.h>
++#include <linux/jhash.h>
++#include <linux/sched/wake_q.h>
++#include <linux/spinlock.h>
++#include <linux/syscalls.h>
++#include <linux/memblock.h>
++#include <uapi/linux/futex.h>
++
++/**
++ * struct futex_waiter - List entry for a waiter
++ * @key.address: Memory address of userspace futex
++ * @key.mm: Pointer to memory management struct of this process
++ * @key: Stores information that uniquely identifies a futex
++ * @list: List node struct
++ * @val: Expected value for this waiter
++ * @flags: Flags
++ * @bucket: Pointer to the bucket for this waiter
++ * @index: Index of waiter in futexv list
++ */
++struct futex_waiter {
++ struct futex_key {
++ uintptr_t address;
++ struct mm_struct *mm;
++ } key;
++ struct list_head list;
++ unsigned int val;
++ unsigned int flags;
++ struct futex_bucket *bucket;
++ unsigned int index;
++};
++
++/**
++ * struct futex_bucket - A bucket of the futex hash table
++ * @waiters: Number of waiters in the bucket
++ * @lock: Bucket lock
++ * @list: List of waiters on this bucket
++ */
++struct futex_bucket {
++ atomic_t waiters;
++ spinlock_t lock;
++ struct list_head list;
++};
++
++struct futexv {
++ struct task_struct *task;
++ int hint;
++ struct futex_waiter objects[0];
++};
++
++struct futex_single_waiter {
++ struct futexv parent;
++ struct futex_waiter waiter;
++} __packed;
++
++struct futex_bucket *futex_table;
++
++/* mask for futex2 flag operations */
++#define FUTEX2_MASK (FUTEX_SIZE_MASK | FUTEX_SHARED_FLAG | \
++ FUTEX_CLOCK_REALTIME)
++
++// mask for sys_futex_waitv
++#define FUTEXV_MASK (FUTEX_CLOCK_REALTIME)
++
++// mask for each futex in futex_waitv list
++#define FUTEXV_WAITER_MASK (FUTEX_SIZE_MASK | FUTEX_SHARED_FLAG)
++
++int futex2_hashsize;
++
++/*
++ * Reflects a new waiter being added to the waitqueue.
++ */
++static inline void bucket_inc_waiters(struct futex_bucket *bucket)
++{
++#ifdef CONFIG_SMP
++ atomic_inc(&bucket->waiters);
++ /*
++ * Full barrier (A), see the ordering comment above.
++ */
++ smp_mb__after_atomic();
++#endif
++}
++
++/*
++ * Reflects a waiter being removed from the waitqueue by wakeup
++ * paths.
++ */
++static inline void bucket_dec_waiters(struct futex_bucket *bucket)
++{
++#ifdef CONFIG_SMP
++ atomic_dec(&bucket->waiters);
++#endif
++}
++
++/*
++ * Get the number of waiters in a bucket
++ */
++static inline int bucket_get_waiters(struct futex_bucket *bucket)
++{
++#ifdef CONFIG_SMP
++ /*
++ * Full barrier (B), see the ordering comment above.
++ */
++ smp_mb();
++ return atomic_read(&bucket->waiters);
++#else
++ return 1;
++#endif
++}
++
++/**
++ * futex_get_bucket - Check if the user address is valid, prepare internal
++ * data and calculate the hash
++ * @uaddr: futex user address
++ * @key: data that uniquely identifies a futex
++ *
++ * Return: address of bucket on success, error code otherwise
++ */
++static struct futex_bucket *futex_get_bucket(void __user *uaddr,
++ struct futex_key *key)
++{
++ uintptr_t address = (uintptr_t) uaddr;
++ u32 hash_key;
++
++ /* Checking if uaddr is valid and accessible */
++ if (unlikely(!IS_ALIGNED(address, sizeof(u32))))
++ return ERR_PTR(-EINVAL);
++ if (unlikely(!access_ok(address, sizeof(u32))))
++ return ERR_PTR(-EFAULT);
++
++ key->address = address;
++ key->mm = current->mm;
++
++ /* Generate hash key for this futex using uaddr and current->mm */
++ hash_key = jhash2((u32 *) key, sizeof(*key) / sizeof(u32), 0);
++
++ /* Since HASH_SIZE is 2^n, subtracting 1 makes a perfect bit mask */
++ return &futex_table[hash_key & (futex2_hashsize - 1)];
++}
++
++/**
++ * futex_get_user - Get the userspace value on this address
++ * @uval: variable to store the value
++ * @uaddr: userspace address
++ *
++ * Check the comment at futex_get_user_value for more information.
++ */
++static int futex_get_user(u32 *uval, u32 *uaddr)
++{
++ int ret;
++
++ pagefault_disable();
++ ret = __get_user(*uval, uaddr);
++ pagefault_enable();
++
++ return ret;
++}
++
++/**
++ * futex_setup_time - Prepare the timeout mechanism, without starting it.
++ * @timo: Timeout value from userspace
++ * @timeout: Pointer to hrtimer handler
++ * @flags: Flags from userspace, to decide which clockid to use
++ *
++ * Return: 0 on success, error code otherwise
++ */
++static int futex_setup_time(struct __kernel_timespec __user *timo,
++ struct hrtimer_sleeper *timeout,
++ unsigned int flags)
++{
++ ktime_t time;
++ struct timespec64 ts;
++ clockid_t clockid = (flags & FUTEX_CLOCK_REALTIME) ?
++ CLOCK_REALTIME : CLOCK_MONOTONIC;
++
++ if (get_timespec64(&ts, timo))
++ return -EFAULT;
++
++ if (!timespec64_valid(&ts))
++ return -EINVAL;
++
++ time = timespec64_to_ktime(ts);
++
++ hrtimer_init_sleeper(timeout, clockid, HRTIMER_MODE_ABS);
++
++ hrtimer_set_expires(&timeout->timer, time);
++
++ return 0;
++}
++
++
++/**
++ * futex_get_user_value - Get the value from the userspace address and compare
++ * it with the expected one. On success, leaves the function
++ * holding the bucket lock. Else, holds no lock.
++ * @bucket: hash bucket of this address
++ * @uaddr: futex's userspace address
++ * @val: expected value
++ * @multiple: is this call in the wait on multiple path
++ *
++ * Return: 0 on success, error code otherwise
++ */
++static int futex_get_user_value(struct futex_bucket *bucket, u32 __user *uaddr,
++ unsigned int val, bool multiple)
++{
++ u32 uval;
++ int ret;
++
++ /*
++ * Get the value from user futex address.
++ *
++ * Since we are in a hurry, we use a spin lock and we can't sleep.
++ * Try to get the value with page fault disabled (when enabled, we might
++ * sleep).
++ *
++ * If we fail, we aren't sure if the address is invalid or is just a
++ * page fault. Then, release the lock (so we can sleep) and try to get
++ * the value with page fault enabled. In order to trigger a page fault
++ * handling, we just call __get_user() again.
++ *
++ * If get_user succeeds, this means that the address is valid and we do
++ * the loop again. Since we just handled the page fault, the page is
++ * likely pinned in memory and we should be luckier this time and be
++ * able to get the value. If we fail anyway, we will try again.
++ *
++ * If even with page faults enabled we get an error, this means that
++ * the address is not valid and we return from the syscall.
++ */
++ do {
++ spin_lock(&bucket->lock);
++
++ ret = futex_get_user(&uval, uaddr);
++
++ if (ret) {
++ spin_unlock(&bucket->lock);
++ if (multiple || __get_user(uval, uaddr))
++ return -EFAULT;
++
++ }
++ } while (ret);
++
++ if (uval != val) {
++ spin_unlock(&bucket->lock);
++ return -EWOULDBLOCK;
++ }
++
++ return 0;
++}
++
++/**
++ * futex_dequeue - Remove a futex from a queue
++ * @bucket: current bucket holding the futex
++ * @waiter: futex to be removed
++ *
++ * Return: True if futex was removed by this function, false if another wake
++ * thread removed this futex.
++ *
++ * This function should be used after we found that this futex was in a queue.
++ * Thus, it needs to be removed before the next step. However, someone could
++ * wake it between the time of the first check and the time to get the lock for
++ * the bucket. Check one more time if the futex is there with the bucket locked.
++ * If it's there, just remove it and return true. Else, mark the removal as
++ * false and do nothing.
++ */
++static bool futex_dequeue(struct futex_bucket *bucket, struct futex_waiter *waiter)
++{
++ bool removed = true;
++
++ spin_lock(&bucket->lock);
++ if (list_empty(&waiter->list))
++ removed = false;
++ else
++ list_del(&waiter->list);
++ spin_unlock(&bucket->lock);
++
++ if (removed)
++ bucket_dec_waiters(bucket);
++
++ return removed;
++}
++
++/**
++ * sys_futex_wait - Wait on a futex address if (*uaddr) == val
++ * @uaddr: User address of futex
++ * @val: Expected value of futex
++ * @flags: Specify the size of futex and the clockid
++ * @timo: Optional absolute timeout. Supports only 64bit time.
++ */
++SYSCALL_DEFINE4(futex_wait, void __user *, uaddr, unsigned int, val,
++ unsigned int, flags, struct __kernel_timespec __user *, timo)
++{
++ unsigned int size = flags & FUTEX_SIZE_MASK;
++ struct hrtimer_sleeper timeout;
++ struct futex_bucket *bucket;
++ struct futex_single_waiter wait_single;
++ struct futex_waiter *waiter;
++ int ret;
++
++ wait_single.parent.task = current;
++ wait_single.parent.hint = 0;
++ waiter = &wait_single.waiter;
++ waiter->index = 0;
++
++ if (flags & ~FUTEX2_MASK)
++ return -EINVAL;
++
++ if (size != FUTEX_32)
++ return -EINVAL;
++
++ if (timo) {
++ ret = futex_setup_time(timo, &timeout, flags);
++ if (ret)
++ return ret;
++ }
++
++ /* Get an unlocked hash bucket */
++ bucket = futex_get_bucket(uaddr, &waiter->key);
++ if (IS_ERR(bucket))
++ return PTR_ERR(bucket);
++
++ if (timo)
++ hrtimer_sleeper_start_expires(&timeout, HRTIMER_MODE_ABS);
++
++retry:
++ bucket_inc_waiters(bucket);
++
++ /* Compare the expected and current value, get the bucket lock */
++ ret = futex_get_user_value(bucket, uaddr, val, false);
++ if (ret) {
++ bucket_dec_waiters(bucket);
++ goto out;
++ }
++
++ /* Add the waiter to the hash table and sleep */
++ set_current_state(TASK_INTERRUPTIBLE);
++ list_add_tail(&waiter->list, &bucket->list);
++ spin_unlock(&bucket->lock);
++
++ /* Do not sleep if someone woke this futex or if it timed out */
++ if (!list_empty_careful(&waiter->list) && (!timo || timeout.task))
++ freezable_schedule();
++
++ __set_current_state(TASK_RUNNING);
++
++ /*
++ * One of those things triggered this wake:
++ *
++ * * We have been removed from the bucket. futex_wake() woke us. We just
++ * need to return 0 to userspace.
++ *
++ * However, if we find ourselves in the bucket we must remove ourselves
++ * from the bucket and ...
++ *
++ * * If there's a timeout and it has expired, return -ETIMEDOUT.
++ *
++ * * If there is a signal pending, something wants to kill our thread.
++ * Return -ERESTARTSYS.
++ *
++ * * If there's no signal pending, it was a spurious wake (scheduler
++ * gave us a chance to do some work, even if we don't want to). We
++ * need to remove ourselves from the bucket and add again, to prevent
++ * losing wakeups in the meantime.
++ */
++
++ /* Normal wake */
++ if (list_empty_careful(&waiter->list))
++ goto out;
++
++ if (!futex_dequeue(bucket, waiter))
++ goto out;
++
++ /* Timeout */
++ if (timo && !timeout.task)
++ return -ETIMEDOUT;
++
++ /* Spurious wakeup */
++ if (!signal_pending(current))
++ goto retry;
++
++ /* Some signal is pending */
++ ret = -ERESTARTSYS;
++out:
++ if (timo)
++ hrtimer_cancel(&timeout.timer);
++
++ return ret;
++}
++
++static struct futexv *futex_get_parent(uintptr_t waiter, u8 index)
++{
++ uintptr_t parent = waiter - sizeof(struct futexv)
++ - (uintptr_t) (index * sizeof(struct futex_waiter));
++
++ return (struct futexv *) parent;
++}
++
++/**
++ * sys_futex_wake - Wake a number of futexes waiting on an address
++ * @uaddr: Address of futex to be woken up
++ * @nr_wake: Number of futexes to be woken up
++ * @flags: TODO
++ */
++SYSCALL_DEFINE3(futex_wake, void __user *, uaddr, unsigned int, nr_wake,
++ unsigned int, flags)
++{
++ unsigned int size = flags & FUTEX_SIZE_MASK;
++ struct futex_waiter waiter, *aux, *tmp;
++ struct futex_bucket *bucket;
++ struct task_struct *task;
++ DEFINE_WAKE_Q(wake_q);
++ int ret = 0;
++
++ if (flags & ~FUTEX2_MASK)
++ return -EINVAL;
++
++ if (size != FUTEX_32)
++ return -EINVAL;
++
++ bucket = futex_get_bucket(uaddr, &waiter.key);
++ if (IS_ERR(bucket))
++ return PTR_ERR(bucket);
++
++ if (!bucket_get_waiters(bucket))
++ return 0;
++
++ spin_lock(&bucket->lock);
++ list_for_each_entry_safe(aux, tmp, &bucket->list, list) {
++ if (ret >= nr_wake)
++ break;
++
++ if (waiter.key.address == aux->key.address &&
++ waiter.key.mm == aux->key.mm) {
++ struct futexv *parent =
++ futex_get_parent((uintptr_t) aux, aux->index);
++
++ parent->hint = 1;
++ task = parent->task;
++ get_task_struct(task);
++ list_del_init_careful(&aux->list);
++ wake_q_add_safe(&wake_q, task);
++ ret++;
++ bucket_dec_waiters(bucket);
++ }
++ }
++ spin_unlock(&bucket->lock);
++
++ wake_up_q(&wake_q);
++
++ return ret;
++}
++
++static int __init futex2_init(void)
++{
++ int i;
++ unsigned int futex_shift;
++
++#if CONFIG_BASE_SMALL
++ futex2_hashsize = 16;
++#else
++ futex2_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
++#endif
++
++ futex_table = alloc_large_system_hash("futex2", sizeof(struct futex_bucket),
++ futex2_hashsize, 0,
++ futex2_hashsize < 256 ? HASH_SMALL : 0,
++ &futex_shift, NULL,
++ futex2_hashsize, futex2_hashsize);
++ futex2_hashsize = 1UL << futex_shift;
++
++ for (i = 0; i < futex2_hashsize; i++) {
++ INIT_LIST_HEAD(&futex_table[i].list);
++ spin_lock_init(&futex_table[i].lock);
++ atomic_set(&futex_table[i].waiters, 0);
++ }
++
++ return 0;
++}
++core_initcall(futex2_init);
+diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
+index 4d59775ea79c..10049bc56c24 100644
+--- a/kernel/sys_ni.c
++++ b/kernel/sys_ni.c
+@@ -148,6 +148,10 @@ COND_SYSCALL_COMPAT(set_robust_list);
+ COND_SYSCALL(get_robust_list);
+ COND_SYSCALL_COMPAT(get_robust_list);
+
++/* kernel/futex2.c */
++COND_SYSCALL(futex_wait);
++COND_SYSCALL(futex_wake);
++
+ /* kernel/hrtimer.c */
+
+ /* kernel/itimer.c */
+--
+2.28.0
+
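For orientation, here is a minimal userspace sketch of the interface patch 01 adds, modeled on the futex2test.h helpers that patch 03 introduces below. The syscall numbers (440/441 on x86) and the FUTEX_32 flag exist only in kernels carrying this series, not in any released ABI, and the helper names are invented for illustration.

/*
 * Sketch only: wait on / wake a 32-bit futex via the new syscalls.
 * Numbers and flags are taken from this patch set, not a released ABI.
 */
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_futex_wait
# define __NR_futex_wait 440	/* x86-64 number added by this patch */
# define __NR_futex_wake 441
#endif
#define FUTEX_32 2		/* size flag from the uapi header above */

static uint32_t futex_word;

/* Sleep while futex_word still holds `expected`; NULL means no timeout. */
static long futex2_wait_simple(uint32_t expected)
{
	return syscall(__NR_futex_wait, &futex_word, expected, FUTEX_32, NULL);
}

/* Wake at most `nr` waiters sleeping on futex_word. */
static long futex2_wake_simple(unsigned int nr)
{
	return syscall(__NR_futex_wake, &futex_word, nr, FUTEX_32);
}

Passing NULL for the timeout waits indefinitely, matching the `if (timo)` checks in sys_futex_wait() above; a waker typically stores a new value to the futex word before calling the wake helper.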
+From 08110d54945541dd186a7dabeef58be08011dde7 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Andr=C3=A9=20Almeida?= <andrealmeid@collabora.com>
+Date: Thu, 15 Oct 2020 17:15:57 -0300
+Subject: [PATCH 02/13] futex2: Add support for vectorized wait
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Add support to wait on multiple futexes
+
+Signed-off-by: André Almeida <andrealmeid@collabora.com>
+---
+ arch/x86/entry/syscalls/syscall_32.tbl | 1 +
+ arch/x86/entry/syscalls/syscall_64.tbl | 1 +
+ include/uapi/asm-generic/unistd.h | 5 +-
+ kernel/futex2.c | 430 +++++++++++++++++--------
+ kernel/sys_ni.c | 1 +
+ 5 files changed, 304 insertions(+), 134 deletions(-)
+
+diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
+index 955322962964..c844c0cbf0e5 100644
+--- a/arch/x86/entry/syscalls/syscall_32.tbl
++++ b/arch/x86/entry/syscalls/syscall_32.tbl
+@@ -446,3 +446,4 @@
+ 439 i386 faccessat2 sys_faccessat2
+ 440 i386 futex_wait sys_futex_wait
+ 441 i386 futex_wake sys_futex_wake
++442 i386 futex_waitv sys_futex_waitv
+diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
+index 4133bfe96891..0901c26c6786 100644
+--- a/arch/x86/entry/syscalls/syscall_64.tbl
++++ b/arch/x86/entry/syscalls/syscall_64.tbl
+@@ -363,6 +363,7 @@
+ 439 common faccessat2 sys_faccessat2
+ 440 common futex_wait sys_futex_wait
+ 441 common futex_wake sys_futex_wake
++442 common futex_waitv sys_futex_waitv
+
+ #
+ # x32-specific system call numbers start at 512 to avoid cache impact
+diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
+index 80567ade774a..d7ebbed0a18c 100644
+--- a/include/uapi/asm-generic/unistd.h
++++ b/include/uapi/asm-generic/unistd.h
+@@ -866,8 +866,11 @@ __SYSCALL(__NR_futex_wait, sys_futex_wait)
+ #define __NR_futex_wake 441
+ __SYSCALL(__NR_futex_wake, sys_futex_wake)
+
++#define __NR_futex_waitv 442
++__SYSCALL(__NR_futex_waitv, sys_futex_waitv)
++
+ #undef __NR_syscalls
+-#define __NR_syscalls 442
++#define __NR_syscalls 443
+
+ /*
+ * 32 bit systems traditionally used different
+diff --git a/kernel/futex2.c b/kernel/futex2.c
+index 107b80a466d0..4b782b5ef615 100644
+--- a/kernel/futex2.c
++++ b/kernel/futex2.c
+@@ -48,14 +48,25 @@ struct futex_bucket {
+ struct list_head list;
+ };
+
++/**
++ * struct futexv - List of futexes to be waited on
++ * @task: Task to be woken
++ * @hint: Was someone on this list woken?
++ * @objects: List of futexes
++ */
+ struct futexv {
+ struct task_struct *task;
+- int hint;
++ bool hint;
+ struct futex_waiter objects[0];
+ };
+
++/**
++ * struct futex_single_waiter - Wrapper for a futexv of one element
++ * @futexv: TODO
++ * @waiter: TODO
++ */
+ struct futex_single_waiter {
+- struct futexv parent;
++ struct futexv futexv;
+ struct futex_waiter waiter;
+ } __packed;
+
+@@ -65,10 +76,10 @@ struct futex_bucket *futex_table;
+ #define FUTEX2_MASK (FUTEX_SIZE_MASK | FUTEX_SHARED_FLAG | \
+ FUTEX_CLOCK_REALTIME)
+
+-// mask for sys_futex_waitv
++/* mask for sys_futex_waitv flag */
+ #define FUTEXV_MASK (FUTEX_CLOCK_REALTIME)
+
+-// mask for each futex in futex_waitv list
++/* mask for each futex in futex_waitv list */
+ #define FUTEXV_WAITER_MASK (FUTEX_SIZE_MASK | FUTEX_SHARED_FLAG)
+
+ int futex2_hashsize;
+@@ -151,7 +162,7 @@ static struct futex_bucket *futex_get_bucket(void __user *uaddr,
+ *
+ * Check the comment at futex_get_user_value for more information.
+ */
+-static int futex_get_user(u32 *uval, u32 *uaddr)
++static int futex_get_user(u32 *uval, u32 __user *uaddr)
+ {
+ int ret;
+
+@@ -194,95 +205,227 @@ static int futex_setup_time(struct __kernel_timespec __user *timo,
+ return 0;
+ }
+
++/**
++ * futex_dequeue_multiple - Remove multiple futexes from hash table
++ * @futexv: list of waiters
++ * @nr: number of futexes to be removed
++ *
++ * This function should be used after we found that this futex was in a queue.
++ * Thus, it needs to be removed before the next step. However, someone could
++ * wake it between the time of the first check and the time to get the lock for
++ * the bucket. Check one more time if the futex is there with the bucket locked.
++ * If it's there, just remove it. Else, it was already woken; record its
++ * index so it can be reported to the caller.
++ *
++ * Return:
++ * * -1 if no futex was woken during the removal
++ * * >= 0 at least one futex was found woken; index of the last one
++ */
++static int futex_dequeue_multiple(struct futexv *futexv, unsigned int nr)
++{
++ int i, ret = -1;
++
++ for (i = 0; i < nr; i++) {
++ spin_lock(&futexv->objects[i].bucket->lock);
++ if (!list_empty_careful(&futexv->objects[i].list)) {
++ list_del_init_careful(&futexv->objects[i].list);
++ bucket_dec_waiters(futexv->objects[i].bucket);
++ } else {
++ ret = i;
++ }
++ spin_unlock(&futexv->objects[i].bucket->lock);
++ }
++
++ return ret;
++}
+
+ /**
+- * futex_get_user_value - Get the value from the userspace address and compare
+- * it with the expected one. On success, leaves the function
+- * holding the bucket lock. Else, holds no lock.
+- * @bucket: hash bucket of this address
+- * @uaddr: futex's userspace address
+- * @val: expected value
+- * @multiple: is this call in the wait on multiple path
++ * futex_enqueue - Check the value and enqueue a futex on a wait list
++ *
++ * @futexv: List of futexes
++ * @nr_futexes: Number of futexes in the list
++ * @awaken: If a futex was woken during enqueueing, store the index here
++ *
++ * Get the value from the userspace address and compare it with the expected one.
++ * On success, enqueue the futex in the correct bucket
++ *
++ * Get the value from user futex address.
++ *
++ * Since we are in a hurry, we use a spin lock and we can't sleep.
++ * Try to get the value with page fault disabled (when enabled, we might
++ * sleep).
++ *
++ * If we fail, we aren't sure if the address is invalid or is just a
++ * page fault. Then, release the lock (so we can sleep) and try to get
++ * the value with page fault enabled. In order to trigger a page fault
++ * handling, we just call __get_user() again. If we sleep with enqueued
++ * futexes, we might miss a wake, so dequeue everything before sleeping.
++ *
++ * If get_user succeeds, this means that the address is valid and we do
++ * the work again. Since we just handled the page fault, the page is
++ * likely pinned in memory and we should be luckier this time and be
++ * able to get the value. If we fail anyway, we will try again.
++ *
++ * If even with page faults enabled we get an error, this means that
++ * the address is not valid and we return from the syscall.
++ *
++ * If we got an unexpected value or need to treat a page fault and realized that
++ * a futex was woken, we can prioritize this and return success.
+ *
+ * Return: 0 on success, error code otherwise
+ */
+-static int futex_get_user_value(struct futex_bucket *bucket, u32 __user *uaddr,
+- unsigned int val, bool multiple)
++static int futex_enqueue(struct futexv *futexv, unsigned int nr_futexes,
++ int *awaken)
+ {
+- u32 uval;
+- int ret;
++ int i, ret;
++ u32 uval, *uaddr, val;
++ struct futex_bucket *bucket;
+
+- /*
+- * Get the value from user futex address.
+- *
+- * Since we are in a hurry, we use a spin lock and we can't sleep.
+- * Try to get the value with page fault disabled (when enabled, we might
+- * sleep).
+- *
+- * If we fail, we aren't sure if the address is invalid or is just a
+- * page fault. Then, release the lock (so we can sleep) and try to get
+- * the value with page fault enabled. In order to trigger a page fault
+- * handling, we just call __get_user() again.
+- *
+- * If get_user succeeds, this means that the address is valid and we do
+- * the loop again. Since we just handled the page fault, the page is
+- * likely pinned in memory and we should be luckier this time and be
+- * able to get the value. If we fail anyway, we will try again.
+- *
+- * If even with page faults enabled we get an error, this means that
+- * the address is not valid and we return from the syscall.
+- */
+- do {
+- spin_lock(&bucket->lock);
++retry:
++ set_current_state(TASK_INTERRUPTIBLE);
++
++ for (i = 0; i < nr_futexes; i++) {
++ uaddr = (u32 __user *) futexv->objects[i].key.address;
++ val = (u32) futexv->objects[i].val;
++ bucket = futexv->objects[i].bucket;
++
++ bucket_inc_waiters(bucket);
++ spin_lock(&bucket->lock);
+
+- ret = futex_get_user(&uval, uaddr);
++ ret = futex_get_user(&uval, uaddr);
+
+- if (ret) {
++ if (unlikely(ret)) {
+ spin_unlock(&bucket->lock);
+- if (multiple || __get_user(uval, uaddr))
++
++ bucket_dec_waiters(bucket);
++ __set_current_state(TASK_RUNNING);
++ *awaken = futex_dequeue_multiple(futexv, i);
++
++ if (__get_user(uval, uaddr))
+ return -EFAULT;
+
++ if (*awaken >= 0)
++ return 0;
++
++ goto retry;
++ }
++
++ if (uval != val) {
++ spin_unlock(&bucket->lock);
++
++ bucket_dec_waiters(bucket);
++ __set_current_state(TASK_RUNNING);
++ *awaken = futex_dequeue_multiple(futexv, i);
++
++ if (*awaken >= 0)
++ return 0;
++
++ return -EWOULDBLOCK;
+ }
+- } while (ret);
+
+- if (uval != val) {
++ list_add_tail(&futexv->objects[i].list, &bucket->list);
+ spin_unlock(&bucket->lock);
+- return -EWOULDBLOCK;
+ }
+
+ return 0;
+ }
+
++
++static int __futex_wait(struct futexv *futexv,
++ unsigned int nr_futexes,
++ struct hrtimer_sleeper *timeout)
++{
++ int ret;
++ int awaken = -1;
++
++ while (1) {
++ ret = futex_enqueue(futexv, nr_futexes, &awaken);
++
++ if (ret < 0)
++ break;
++
++ if (awaken >= 0) {
++ return awaken;
++ }
++
++
++ /* Before sleeping, check if someone was woken */
++ if (!futexv->hint && (!timeout || timeout->task))
++ freezable_schedule();
++
++ __set_current_state(TASK_RUNNING);
++
++ /*
++ * One of those things triggered this wake:
++ *
++ * * We have been removed from the bucket. futex_wake() woke
++ * us. We just need to return 0 to userspace.
++ *
++ * However, if no futex was dequeued by a futex_wake():
++ *
++ * * If there's a timeout and it has expired,
++ * return -ETIMEDOUT.
++ *
++ * * If there is a signal pending, something wants to kill our
++ * thread, return -ERESTARTSYS.
++ *
++ * * If there's no signal pending, it was a spurious wake
++ * (scheduler gave us a chance to do some work, even if we
++ * don't want to). We need to remove ourselves from the
++ * bucket and add again, to prevent losing wakeups in the
++ * meantime.
++ */
++
++ ret = futex_dequeue_multiple(futexv, nr_futexes);
++
++ /* Normal wake */
++ if (ret >= 0)
++ break;
++
++ if (timeout && !timeout->task)
++ return -ETIMEDOUT;
++
++ /* signal */
++ if (signal_pending(current))
++ return -ERESTARTSYS;
++
++ /* spurious wake, do everything again */
++ }
++
++ return ret;
++}
++
+ /**
+- * futex_dequeue - Remove a futex from a queue
+- * @bucket: current bucket holding the futex
+- * @waiter: futex to be removed
++ * futex_wait - Setup the timer and wait on a list of futexes
++ * @futexv: List of waiters
++ * @nr_futexes: Number of waiters
++ * @timo: Userspace timeout value
++ * @timeout: hrtimer_sleeper used to arm the timeout
++ * @flags: Timeout flags
+ *
+- * Return: True if futex was removed by this function, false if another wake
+- * thread removed this futex.
+- *
+- * This function should be used after we found that this futex was in a queue.
+- * Thus, it needs to be removed before the next step. However, someone could
+- * wake it between the time of the first check and the time to get the lock for
+- * the bucket. Check one more time if the futex is there with the bucket locked.
+- * If it's there, just remove it and return true. Else, mark the removal as
+- * false and do nothing.
++ * Return: error code, or the index of the woken waiter
+ */
+-static bool futex_dequeue(struct futex_bucket *bucket, struct futex_waiter *waiter)
++static int futex_wait(struct futexv *futexv, unsigned int nr_futexes,
++ struct __kernel_timespec __user *timo,
++ struct hrtimer_sleeper *timeout, unsigned int flags)
+ {
+- bool removed = true;
++ int ret;
+
+- spin_lock(&bucket->lock);
+- if (list_empty(&waiter->list))
+- removed = false;
+- else
+- list_del(&waiter->list);
+- spin_unlock(&bucket->lock);
++ if (timo) {
++ ret = futex_setup_time(timo, timeout, flags);
++ if (ret)
++ return ret;
+
+- if (removed)
+- bucket_dec_waiters(bucket);
++ hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS);
++ }
+
+- return removed;
++ ret = __futex_wait(futexv, nr_futexes, timo ? timeout : NULL);
++
++
++ if (timo)
++ hrtimer_cancel(&timeout->timer);
++
++ return ret;
+ }
+
+ /**
+@@ -297,15 +440,20 @@ SYSCALL_DEFINE4(futex_wait, void __user *, uaddr, unsigned int, val,
+ {
+ unsigned int size = flags & FUTEX_SIZE_MASK;
+ struct hrtimer_sleeper timeout;
+- struct futex_bucket *bucket;
+ struct futex_single_waiter wait_single;
+ struct futex_waiter *waiter;
++ struct futexv *futexv;
+ int ret;
+
+- wait_single.parent.task = current;
+- wait_single.parent.hint = 0;
++ futexv = &wait_single.futexv;
++ futexv->task = current;
++ futexv->hint = false;
++
+ waiter = &wait_single.waiter;
+ waiter->index = 0;
++ waiter->val = val;
++
++ INIT_LIST_HEAD(&waiter->list);
+
+ if (flags & ~FUTEX2_MASK)
+ return -EINVAL;
+@@ -313,85 +461,101 @@ SYSCALL_DEFINE4(futex_wait, void __user *, uaddr, unsigned int, val,
+ if (size != FUTEX_32)
+ return -EINVAL;
+
+- if (timo) {
+- ret = futex_setup_time(timo, &timeout, flags);
+- if (ret)
+- return ret;
+- }
+-
+ /* Get an unlocked hash bucket */
+- bucket = futex_get_bucket(uaddr, &waiter->key);
+- if (IS_ERR(bucket))
+- return PTR_ERR(bucket);
++ waiter->bucket = futex_get_bucket(uaddr, &waiter->key);
++ if (IS_ERR(waiter->bucket))
++ return PTR_ERR(waiter->bucket);
+
+- if (timo)
+- hrtimer_sleeper_start_expires(&timeout, HRTIMER_MODE_ABS);
++ ret = futex_wait(futexv, 1, timo, &timeout, flags);
+
+-retry:
+- bucket_inc_waiters(bucket);
++ return ret;
++}
+
+- /* Compare the expected and current value, get the bucket lock */
+- ret = futex_get_user_value(bucket, uaddr, val, false);
+- if (ret) {
+- bucket_dec_waiters(bucket);
+- goto out;
+- }
++/**
++ * futex_parse_waitv - Parse a waitv array from userspace
++ * @futexv: list of waiters
++ * @uwaitv: userspace list
++ * @nr_futexes: number of waiters in the list
++ *
++ * Return: 0 on success, error code on failure
++ */
++static int futex_parse_waitv(struct futexv *futexv,
++ struct futex_waitv __user *uwaitv,
++ unsigned int nr_futexes)
++{
++ struct futex_waitv waitv;
++ unsigned int i;
++ struct futex_bucket *bucket;
+
+- /* Add the waiter to the hash table and sleep */
+- set_current_state(TASK_INTERRUPTIBLE);
+- list_add_tail(&waiter->list, &bucket->list);
+- spin_unlock(&bucket->lock);
++ for (i = 0; i < nr_futexes; i++) {
++ if (copy_from_user(&waitv, &uwaitv[i], sizeof(waitv)))
++ return -EFAULT;
+
+- /* Do not sleep if someone woke this futex or if it timed out */
+- if (!list_empty_careful(&waiter->list) && (!timo || timeout.task))
+- freezable_schedule();
++ if ((waitv.flags & ~FUTEXV_WAITER_MASK) ||
++ (waitv.flags & FUTEX_SIZE_MASK) != FUTEX_32)
++ return -EINVAL;
+
+- __set_current_state(TASK_RUNNING);
++ bucket = futex_get_bucket(waitv.uaddr,
++ &futexv->objects[i].key);
++ if (IS_ERR(bucket))
++ return PTR_ERR(bucket);
+
+- /*
+- * One of those things triggered this wake:
+- *
+- * * We have been removed from the bucket. futex_wake() woke us. We just
+- * need to return 0 to userspace.
+- *
+- * However, if we find ourselves in the bucket we must remove ourselves
+- * from the bucket and ...
+- *
+- * * If there's a timeout and it has expired, return -ETIMEDOUT.
+- *
+- * * If there is a signal pending, something wants to kill our thread.
+- * Return -ERESTARTSYS.
+- *
+- * * If there's no signal pending, it was a spurious wake (scheduler
+- * gave us a chance to do some work, even if we don't want to). We
+- * need to remove ourselves from the bucket and add again, to prevent
+- * losing wakeups in the meantime.
+- */
++ futexv->objects[i].bucket = bucket;
++ futexv->objects[i].val = waitv.val;
++ futexv->objects[i].flags = waitv.flags;
++ futexv->objects[i].index = i;
++ INIT_LIST_HEAD(&futexv->objects[i].list);
++ }
+
+- /* Normal wake */
+- if (list_empty_careful(&waiter->list))
+- goto out;
++ return 0;
++}
+
+- if (!futex_dequeue(bucket, waiter))
+- goto out;
++/**
++ * sys_futex_waitv - Wait on a list of futexes
++ * @waiters: TODO
++ * @nr_futexes: TODO
++ * @flags: TODO
++ * @timo: TODO
++ */
++SYSCALL_DEFINE4(futex_waitv, struct futex_waitv __user *, waiters,
++ unsigned int, nr_futexes, unsigned int, flags,
++ struct __kernel_timespec __user *, timo)
++{
++ struct hrtimer_sleeper timeout;
++ struct futexv *futexv;
++ int ret;
+
+- /* Timeout */
+- if (timo && !timeout.task)
+- return -ETIMEDOUT;
++ if (flags & ~FUTEXV_MASK)
++ return -EINVAL;
+
+- /* Spurious wakeup */
+- if (!signal_pending(current))
+- goto retry;
++ if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters)
++ return -EINVAL;
+
+- /* Some signal is pending */
+- ret = -ERESTARTSYS;
+-out:
+- if (timo)
+- hrtimer_cancel(&timeout.timer);
++ futexv = kmalloc(sizeof(struct futexv) +
++ (sizeof(struct futex_waiter) * nr_futexes),
++ GFP_KERNEL);
++ if (!futexv)
++ return -ENOMEM;
++
++ futexv->hint = false;
++ futexv->task = current;
++
++ ret = futex_parse_waitv(futexv, waiters, nr_futexes);
++ if (!ret)
++ ret = futex_wait(futexv, nr_futexes, timo, &timeout, flags);
++
++ kfree(futexv);
+
+ return ret;
+ }
+
++/**
++ * futex_get_parent - Get parent
++ * @waiter: TODO
++ * @index: TODO
++ *
++ * Return: TODO
++ */
+ static struct futexv *futex_get_parent(uintptr_t waiter, u8 index)
+ {
+ uintptr_t parent = waiter - sizeof(struct futexv)
+@@ -439,7 +603,7 @@ SYSCALL_DEFINE3(futex_wake, void __user *, uaddr, unsigned int, nr_wake,
+ struct futexv *parent =
+ futex_get_parent((uintptr_t) aux, aux->index);
+
+- parent->hint = 1;
++ parent->hint = true;
+ task = parent->task;
+ get_task_struct(task);
+ list_del_init_careful(&aux->list);
+diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
+index 10049bc56c24..3e1a713d3e57 100644
+--- a/kernel/sys_ni.c
++++ b/kernel/sys_ni.c
+@@ -151,6 +151,7 @@ COND_SYSCALL_COMPAT(get_robust_list);
+ /* kernel/futex2.c */
+ COND_SYSCALL(futex_wait);
+ COND_SYSCALL(futex_wake);
++COND_SYSCALL(futex_waitv);
+
+ /* kernel/hrtimer.c */
+
+--
+2.28.0
+
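As a rough illustration of the vectorized call, here is a userspace sketch: struct futex_waitv and the syscall number 442 come straight from the diff above, while the helper and variable names are invented for this example.

#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_futex_waitv
# define __NR_futex_waitv 442	/* x86-64 number added by this patch */
#endif
#define FUTEX_32 2

/* Mirrors the uapi struct added in patch 01 */
struct futex_waitv {
	void *uaddr;
	unsigned int val;
	unsigned int flags;
};

static uint32_t fa, fb;

/*
 * Block until either fa or fb is woken (both expected to hold 0).
 * On success the return value is the index of the woken futex
 * (0 or 1), matching the "hint" semantics of __futex_wait().
 */
static long wait_on_either(void)
{
	struct futex_waitv waiters[2] = {
		{ .uaddr = &fa, .val = 0, .flags = FUTEX_32 },
		{ .uaddr = &fb, .val = 0, .flags = FUTEX_32 },
	};

	return syscall(__NR_futex_waitv, waiters, 2, 0, NULL);
}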
+From d8120d2ee1729a6933a606a6720f3e3116e4f699 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Andr=C3=A9=20Almeida?= <andrealmeid@collabora.com>
+Date: Thu, 9 Jul 2020 11:34:40 -0300
+Subject: [PATCH 03/13] selftests: futex: Add futex2 wake/wait test
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Add a simple test for the wake/wait mechanism using the futex2 interface.
+Create helper files so more tests can evaluate futex2. While 32bit ABIs
+from glibc aren't able to use 64 bit sized time variables, add a
+temporary workaround that implements the required types and calls the
+appropriate syscalls, since futex2 doesn't support 32-bit sized time.
+
+Signed-off-by: André Almeida <andrealmeid@collabora.com>
+---
+ tools/include/uapi/asm-generic/unistd.h | 7 +-
+ .../selftests/futex/functional/.gitignore | 1 +
+ .../selftests/futex/functional/Makefile | 4 +-
+ .../selftests/futex/functional/futex2_wait.c | 111 ++++++++++++++++++
+ .../testing/selftests/futex/functional/run.sh | 3 +
+ .../selftests/futex/include/futex2test.h | 77 ++++++++++++
+ 6 files changed, 201 insertions(+), 2 deletions(-)
+ create mode 100644 tools/testing/selftests/futex/functional/futex2_wait.c
+ create mode 100644 tools/testing/selftests/futex/include/futex2test.h
+
+diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h
+index 995b36c2ea7d..dd457de21bad 100644
+--- a/tools/include/uapi/asm-generic/unistd.h
++++ b/tools/include/uapi/asm-generic/unistd.h
+@@ -860,8 +860,13 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
+ #define __NR_faccessat2 439
+ __SYSCALL(__NR_faccessat2, sys_faccessat2)
+
++#define __NR_futex_wait 440
++__SYSCALL(__NR_futex_wait, sys_futex_wait)
++#define __NR_futex_wake 441
++__SYSCALL(__NR_futex_wake, sys_futex_wake)
++
+ #undef __NR_syscalls
+-#define __NR_syscalls 440
++#define __NR_syscalls 442
+
+ /*
+ * 32 bit systems traditionally used different
+diff --git a/tools/testing/selftests/futex/functional/.gitignore b/tools/testing/selftests/futex/functional/.gitignore
+index 0efcd494daab..d61f1df94360 100644
+--- a/tools/testing/selftests/futex/functional/.gitignore
++++ b/tools/testing/selftests/futex/functional/.gitignore
+@@ -6,3 +6,4 @@ futex_wait_private_mapped_file
+ futex_wait_timeout
+ futex_wait_uninitialized_heap
+ futex_wait_wouldblock
++futex2_wait
+diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile
+index 23207829ec75..7142a94a7ac3 100644
+--- a/tools/testing/selftests/futex/functional/Makefile
++++ b/tools/testing/selftests/futex/functional/Makefile
+@@ -5,6 +5,7 @@ LDLIBS := -lpthread -lrt
+
+ HEADERS := \
+ ../include/futextest.h \
++ ../include/futex2test.h \
+ ../include/atomic.h \
+ ../include/logging.h
+ TEST_GEN_FILES := \
+@@ -14,7 +15,8 @@ TEST_GEN_FILES := \
+ futex_requeue_pi_signal_restart \
+ futex_requeue_pi_mismatched_ops \
+ futex_wait_uninitialized_heap \
+- futex_wait_private_mapped_file
++ futex_wait_private_mapped_file \
++ futex2_wait
+
+ TEST_PROGS := run.sh
+
+diff --git a/tools/testing/selftests/futex/functional/futex2_wait.c b/tools/testing/selftests/futex/functional/futex2_wait.c
+new file mode 100644
+index 000000000000..752ed26803b3
+--- /dev/null
++++ b/tools/testing/selftests/futex/functional/futex2_wait.c
+@@ -0,0 +1,111 @@
++// SPDX-License-Identifier: GPL-2.0-or-later
++/******************************************************************************
++ *
++ * Copyright Collabora Ltd., 2020
++ *
++ * DESCRIPTION
++ * Test wait/wake mechanism of futex2, using 32bit sized futexes.
++ *
++ * AUTHOR
++ * André Almeida <andrealmeid@collabora.com>
++ *
++ * HISTORY
++ * 2020-Jul-9: Initial version by André <andrealmeid@collabora.com>
++ *
++ *****************************************************************************/
++
++#include <errno.h>
++#include <error.h>
++#include <getopt.h>
++#include <stdio.h>
++#include <stdlib.h>
++#include <string.h>
++#include <time.h>
++#include <pthread.h>
++#include "futex2test.h"
++#include "logging.h"
++
++#define TEST_NAME "futex2-wait"
++#define timeout_ns 30000000
++#define WAKE_WAIT_US 10000
++futex_t f1 = FUTEX_INITIALIZER;
++
++void usage(char *prog)
++{
++ printf("Usage: %s\n", prog);
++ printf(" -c Use color\n");
++ printf(" -h Display this help message\n");
++ printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n",
++ VQUIET, VCRITICAL, VINFO);
++}
++
++void *waiterfn(void *arg)
++{
++ struct timespec64 to64;
++
++ /* setting absolute timeout for futex2 */
++ if (gettime64(CLOCK_MONOTONIC, &to64))
++ error("gettime64 failed\n", errno);
++
++ to64.tv_nsec += timeout_ns;
++
++ if (to64.tv_nsec >= 1000000000) {
++ to64.tv_sec++;
++ to64.tv_nsec -= 1000000000;
++ }
++
++ if (futex2_wait(&f1, f1, FUTEX_PRIVATE_FLAG | FUTEX_32, &to64))
++ printf("waiter failed errno %d\n", errno);
++
++ return NULL;
++}
++
++int main(int argc, char *argv[])
++{
++ pthread_t waiter;
++ int res, ret = RET_PASS;
++ int c;
++
++ while ((c = getopt(argc, argv, "cht:v:")) != -1) {
++ switch (c) {
++ case 'c':
++ log_color(1);
++ break;
++ case 'h':
++ usage(basename(argv[0]));
++ exit(0);
++ case 'v':
++ log_verbosity(atoi(optarg));
++ break;
++ default:
++ usage(basename(argv[0]));
++ exit(1);
++ }
++ }
++
++ ksft_print_header();
++ ksft_set_plan(1);
++ ksft_print_msg("%s: Test FUTEX_WAIT\n",
++ basename(argv[0]));
++
++ info("Calling futex_wait on f1: %u @ %p with val=%u\n", f1, &f1, f1);
++
++ if (pthread_create(&waiter, NULL, waiterfn, NULL))
++ error("pthread_create failed\n", errno);
++
++ usleep(WAKE_WAIT_US);
++
++ info("Calling futex2_wake on f1: %u @ %p with val=%u\n", f1, &f1, f1);
++ res = futex2_wake(&f1, 1, FUTEX_PRIVATE_FLAG | FUTEX_32);
++ if (res != 1) {
++ ksft_test_result_fail("futex2_wake returned: %d %s\n",
++ res ? errno : res,
++ res ? strerror(errno) : "");
++ ret = RET_FAIL;
++ } else {
++ ksft_test_result_pass("futex2_wake wouldblock succeeds\n");
++ }
++
++ ksft_print_cnts();
++ return ret;
++}
+diff --git a/tools/testing/selftests/futex/functional/run.sh b/tools/testing/selftests/futex/functional/run.sh
+index 1acb6ace1680..3730159c865a 100755
+--- a/tools/testing/selftests/futex/functional/run.sh
++++ b/tools/testing/selftests/futex/functional/run.sh
+@@ -73,3 +73,6 @@ echo
+ echo
+ ./futex_wait_uninitialized_heap $COLOR
+ ./futex_wait_private_mapped_file $COLOR
++
++echo
++./futex2_wait $COLOR
+diff --git a/tools/testing/selftests/futex/include/futex2test.h b/tools/testing/selftests/futex/include/futex2test.h
+new file mode 100644
+index 000000000000..807b8b57fe61
+--- /dev/null
++++ b/tools/testing/selftests/futex/include/futex2test.h
+@@ -0,0 +1,77 @@
++/* SPDX-License-Identifier: GPL-2.0-or-later */
++/******************************************************************************
++ *
++ * Copyright Collabora Ltd., 2020
++ *
++ * DESCRIPTION
++ * Futex2 library addons for old futex library
++ *
++ * AUTHOR
++ * André Almeida <andrealmeid@collabora.com>
++ *
++ * HISTORY
++ * 2020-Jul-9: Initial version by André <andrealmeid@collabora.com>
++ *
++ *****************************************************************************/
++#include "futextest.h"
++#include <stdio.h>
++
++#define NSEC_PER_SEC 1000000000L
++
++#ifndef FUTEX_8
++# define FUTEX_8 0
++#endif
++#ifndef FUTEX_16
++# define FUTEX_16 1
++#endif
++#ifndef FUTEX_32
++#define FUTEX_32 2
++#endif
++#ifdef __x86_64__
++# ifndef FUTEX_64
++# define FUTEX_64 3
++# endif
++#endif
++
++/*
++ * - Y2038 section for 32-bit applications -
++ *
++ * Remove this when glibc is ready for y2038. Then, always compile with
++ * `-DTIME_BITS=64` or `-D__USE_TIME_BITS64`. glibc will provide both
++ * timespec64 and clock_gettime64 so we won't need to define here.
++ */
++#if defined(__i386__) || __TIMESIZE == 32
++# define NR_gettime __NR_clock_gettime64
++#else
++# define NR_gettime __NR_clock_gettime
++#endif
++
++struct timespec64 {
++ long long tv_sec; /* seconds */
++ long long tv_nsec; /* nanoseconds */
++};
++
++int gettime64(clock_t clockid, struct timespec64 *tv)
++{
++ return syscall(NR_gettime, clockid, tv);
++}
++/*
++ * - End of Y2038 section -
++ */
++
++/*
++ * wait for uaddr if (*uaddr == val)
++ */
++static inline int futex2_wait(volatile void *uaddr, unsigned long val,
++ unsigned long flags, struct timespec64 *timo)
++{
++ return syscall(__NR_futex_wait, uaddr, val, flags, timo);
++}
++
++/*
++ * wake nr futexes waiting for uaddr
++ */
++static inline int futex2_wake(volatile void *uaddr, unsigned int nr, unsigned long flags)
++{
++ return syscall(__NR_futex_wake, uaddr, nr, flags);
++}
+--
+2.28.0
+
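Patches 04 and 05 below repeat the same absolute-timeout computation already seen in futex2_wait.c. Factored into a helper built on the timespec64/gettime64 shims this header provides, the pattern the tests use looks like this (a sketch only; the helper name is not part of the series, and CLOCK_MONOTONIC is assumed to come from <time.h>, which the tests include):

/* Build a CLOCK_MONOTONIC absolute deadline `ns` nanoseconds from now,
 * in the 64-bit format futex2_wait() expects. */
static inline int futex2_abs_timeout(long ns, struct timespec64 *to64)
{
	if (gettime64(CLOCK_MONOTONIC, to64))
		return -1;

	to64->tv_nsec += ns;
	if (to64->tv_nsec >= NSEC_PER_SEC) {
		to64->tv_sec++;
		to64->tv_nsec -= NSEC_PER_SEC;
	}

	return 0;
}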
+From d4a7ca72f276b2e337eaedcbbe58a2782e0e7d3b Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Andr=C3=A9=20Almeida?= <andrealmeid@collabora.com>
+Date: Thu, 9 Jul 2020 11:36:14 -0300
+Subject: [PATCH 04/13] selftests: futex: Add futex2 timeout test
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Adapt existing futex wait timeout file to test the same mechanism for
+futex2.
+
+Signed-off-by: André Almeida <andrealmeid@collabora.com>
+---
+ .../futex/functional/futex_wait_timeout.c | 38 ++++++++++++++-----
+ 1 file changed, 29 insertions(+), 9 deletions(-)
+
+diff --git a/tools/testing/selftests/futex/functional/futex_wait_timeout.c b/tools/testing/selftests/futex/functional/futex_wait_timeout.c
+index ee55e6d389a3..d2e7ae18985b 100644
+--- a/tools/testing/selftests/futex/functional/futex_wait_timeout.c
++++ b/tools/testing/selftests/futex/functional/futex_wait_timeout.c
+@@ -11,6 +11,7 @@
+ *
+ * HISTORY
+ * 2009-Nov-6: Initial version by Darren Hart <dvhart@linux.intel.com>
++ * 2020-Jul-9: Add futex2 test by André <andrealmeid@collabora.com>
+ *
+ *****************************************************************************/
+
+@@ -20,7 +21,7 @@
+ #include <stdlib.h>
+ #include <string.h>
+ #include <time.h>
+-#include "futextest.h"
++#include "futex2test.h"
+ #include "logging.h"
+
+ #define TEST_NAME "futex-wait-timeout"
+@@ -40,7 +41,8 @@ void usage(char *prog)
+ int main(int argc, char *argv[])
+ {
+ futex_t f1 = FUTEX_INITIALIZER;
+- struct timespec to;
++ struct timespec to = {.tv_sec = 0, .tv_nsec = timeout_ns};
++ struct timespec64 to64;
+ int res, ret = RET_PASS;
+ int c;
+
+@@ -65,22 +67,40 @@ int main(int argc, char *argv[])
+ }
+
+ ksft_print_header();
+- ksft_set_plan(1);
++ ksft_set_plan(2);
+ ksft_print_msg("%s: Block on a futex and wait for timeout\n",
+ basename(argv[0]));
+ ksft_print_msg("\tArguments: timeout=%ldns\n", timeout_ns);
+
+- /* initialize timeout */
+- to.tv_sec = 0;
+- to.tv_nsec = timeout_ns;
+-
+ info("Calling futex_wait on f1: %u @ %p\n", f1, &f1);
+ res = futex_wait(&f1, f1, &to, FUTEX_PRIVATE_FLAG);
+ if (!res || errno != ETIMEDOUT) {
+- fail("futex_wait returned %d\n", ret < 0 ? errno : ret);
++ ksft_test_result_fail("futex_wait returned %d\n", ret < 0 ? errno : ret);
++ ret = RET_FAIL;
++ } else {
++ ksft_test_result_pass("futex_wait timeout succeeds\n");
++ }
++
++ /* setting absolute timeout for futex2 */
++ if (gettime64(CLOCK_MONOTONIC, &to64))
++ error("gettime64 failed\n", errno);
++
++ to64.tv_nsec += timeout_ns;
++
++ if (to64.tv_nsec >= 1000000000) {
++ to64.tv_sec++;
++ to64.tv_nsec -= 1000000000;
++ }
++
++ info("Calling futex2_wait on f1: %u @ %p\n", f1, &f1);
++ res = futex2_wait(&f1, f1, FUTEX_PRIVATE_FLAG | FUTEX_32, &to64);
++ if (!res || errno != ETIMEDOUT) {
++ ksft_test_result_fail("futex2_wait returned %d\n", ret < 0 ? errno : ret);
+ ret = RET_FAIL;
++ } else {
++ ksft_test_result_pass("futex2_wait timeout succeeds\n");
+ }
+
+- print_result(TEST_NAME, ret);
++ ksft_print_cnts();
+ return ret;
+ }
+--
+2.28.0
+
+From 6d2252d43d36a5eb2b9170351128007e27f47737 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Andr=C3=A9=20Almeida?= <andrealmeid@collabora.com>
+Date: Thu, 9 Jul 2020 11:37:42 -0300
+Subject: [PATCH 05/13] selftests: futex: Add futex2 wouldblock test
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Adapt existing futex wait wouldblock file to test the same mechanism for
+futex2.
+
+Signed-off-by: André Almeida <andrealmeid@collabora.com>
+---
+ .../futex/functional/futex_wait_wouldblock.c | 33 ++++++++++++++++---
+ 1 file changed, 29 insertions(+), 4 deletions(-)
+
+diff --git a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c
+index 0ae390ff8164..8187f0754cd2 100644
+--- a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c
++++ b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c
+@@ -12,6 +12,7 @@
+ *
+ * HISTORY
+ * 2009-Nov-14: Initial version by Gowrishankar <gowrishankar.m@in.ibm.com>
++ * 2020-Jul-9: Add futex2 test by André <andrealmeid@collabora.com>
+ *
+ *****************************************************************************/
+
+@@ -21,7 +22,7 @@
+ #include <stdlib.h>
+ #include <string.h>
+ #include <time.h>
+-#include "futextest.h"
++#include "futex2test.h"
+ #include "logging.h"
+
+ #define TEST_NAME "futex-wait-wouldblock"
+@@ -39,6 +40,7 @@ void usage(char *prog)
+ int main(int argc, char *argv[])
+ {
+ struct timespec to = {.tv_sec = 0, .tv_nsec = timeout_ns};
++ struct timespec64 to64;
+ futex_t f1 = FUTEX_INITIALIZER;
+ int res, ret = RET_PASS;
+ int c;
+@@ -61,18 +63,41 @@ int main(int argc, char *argv[])
+ }
+
+ ksft_print_header();
+- ksft_set_plan(1);
++ ksft_set_plan(2);
+ ksft_print_msg("%s: Test the unexpected futex value in FUTEX_WAIT\n",
+ basename(argv[0]));
+
+ info("Calling futex_wait on f1: %u @ %p with val=%u\n", f1, &f1, f1+1);
+ res = futex_wait(&f1, f1+1, &to, FUTEX_PRIVATE_FLAG);
+ if (!res || errno != EWOULDBLOCK) {
+- fail("futex_wait returned: %d %s\n",
++ ksft_test_result_fail("futex_wait returned: %d %s\n",
+ res ? errno : res, res ? strerror(errno) : "");
+ ret = RET_FAIL;
++ } else {
++ ksft_test_result_pass("futex_wait wouldblock succeeds\n");
+ }
+
+- print_result(TEST_NAME, ret);
++ /* setting absolute timeout for futex2 */
++ if (gettime64(CLOCK_MONOTONIC, &to64))
++ error("gettime64 failed\n", errno);
++
++ to64.tv_nsec += timeout_ns;
++
++ if (to64.tv_nsec >= 1000000000) {
++ to64.tv_sec++;
++ to64.tv_nsec -= 1000000000;
++ }
++
++ info("Calling futex2_wait on f1: %u @ %p with val=%u\n", f1, &f1, f1+1);
++ res = futex2_wait(&f1, f1+1, FUTEX_PRIVATE_FLAG | FUTEX_32, &to64);
++ if (!res || errno != EWOULDBLOCK) {
++ ksft_test_result_fail("futex2_wait returned: %d %s\n",
++ res ? errno : res, res ? strerror(errno) : "");
++ ret = RET_FAIL;
++ } else {
++ ksft_test_result_pass("futex2_wait wouldblock succeeds\n");
++ }
++
++ ksft_print_cnts();
+ return ret;
+ }
+--
+2.28.0
+
+From 6b35a09be663f5a844e089f1ddd370137832e7a7 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Andr=C3=A9=20Almeida?= <andrealmeid@collabora.com>
+Date: Wed, 14 Oct 2020 16:10:09 -0300
+Subject: [PATCH 06/13] DONOTMERGE: futex: Add a clone of futex implementation
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+For comparative performance tests between the original futex and the new
+futex2 interface, create a clone of the current futex. In that way, we
+can have a fair comparison, since the futex2 table will be empty with no
+contention for the bucket locks. Since futex is widely used in the host
+system, the performance tests could get misleading results by the tests
+competing with the system for resources.
+
+Signed-off-by: André Almeida <andrealmeid@collabora.com>
+---
+ arch/x86/entry/syscalls/syscall_32.tbl | 1 +
+ arch/x86/entry/syscalls/syscall_64.tbl | 1 +
+ include/linux/syscalls.h | 3 +
+ include/uapi/asm-generic/unistd.h | 5 +-
+ kernel/Makefile | 1 +
+ kernel/futex1.c | 3384 +++++++++++++++++
+ kernel/sys_ni.c | 2 +
+ tools/arch/x86/include/asm/unistd_64.h | 12 +
+ tools/include/uapi/asm-generic/unistd.h | 6 +-
+ .../arch/x86/entry/syscalls/syscall_64.tbl | 3 +
+ tools/perf/bench/futex.h | 23 +-
+ 11 files changed, 3438 insertions(+), 3 deletions(-)
+ create mode 100644 kernel/futex1.c
+
+diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
+index c844c0cbf0e5..820fa53ccf75 100644
+--- a/arch/x86/entry/syscalls/syscall_32.tbl
++++ b/arch/x86/entry/syscalls/syscall_32.tbl
+@@ -447,3 +447,4 @@
+ 440 i386 futex_wait sys_futex_wait
+ 441 i386 futex_wake sys_futex_wake
+ 442 i386 futex_waitv sys_futex_waitv
++443 i386 futex1 sys_futex1
+diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
+index 0901c26c6786..99795136cb98 100644
+--- a/arch/x86/entry/syscalls/syscall_64.tbl
++++ b/arch/x86/entry/syscalls/syscall_64.tbl
+@@ -364,6 +364,7 @@
+ 440 common futex_wait sys_futex_wait
+ 441 common futex_wake sys_futex_wake
+ 442 common futex_waitv sys_futex_waitv
++443 common futex1 sys_futex1
+
+ #
+ # x32-specific system call numbers start at 512 to avoid cache impact
+diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
+index 38c3a87dbfc2..0351f6ad09a9 100644
+--- a/include/linux/syscalls.h
++++ b/include/linux/syscalls.h
+@@ -596,6 +596,9 @@ asmlinkage long sys_futex_wait(void __user *uaddr, unsigned long val,
+ asmlinkage long sys_futex_wake(void __user *uaddr, unsigned long nr_wake,
+ unsigned long flags);
+
++asmlinkage long sys_futex1(void __user *uaddr, unsigned long nr_wake,
++ unsigned long flags);
++
+ /* kernel/hrtimer.c */
+ asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp,
+ struct __kernel_timespec __user *rmtp);
+diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
+index d7ebbed0a18c..e3ba6cb1f76d 100644
+--- a/include/uapi/asm-generic/unistd.h
++++ b/include/uapi/asm-generic/unistd.h
+@@ -869,8 +869,11 @@ __SYSCALL(__NR_futex_wake, sys_futex_wake)
+ #define __NR_futex_waitv 442
+ __SYSCALL(__NR_futex_waitv, sys_futex_waitv)
+
++#define __NR_futex1 443
++__SYSCALL(__NR_futex1, sys_futex1)
++
+ #undef __NR_syscalls
+-#define __NR_syscalls 443
++#define __NR_syscalls 444
+
+ /*
+ * 32 bit systems traditionally used different
+diff --git a/kernel/Makefile b/kernel/Makefile
+index 51ea9bc647bf..0fe55a8cb9e2 100644
+--- a/kernel/Makefile
++++ b/kernel/Makefile
+@@ -57,6 +57,7 @@ obj-$(CONFIG_PROFILING) += profile.o
+ obj-$(CONFIG_STACKTRACE) += stacktrace.o
+ obj-y += time/
+ obj-$(CONFIG_FUTEX) += futex.o
++obj-$(CONFIG_FUTEX2) += futex1.o
+ obj-$(CONFIG_FUTEX2) += futex2.o
+ obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
+ obj-$(CONFIG_SMP) += smp.o
+diff --git a/kernel/futex1.c b/kernel/futex1.c
+new file mode 100644
+index 000000000000..4f7bf312fefd
+--- /dev/null
++++ b/kernel/futex1.c
+@@ -0,0 +1,3384 @@
++// SPDX-License-Identifier: GPL-2.0-or-later
++/*
++ * Fast Userspace Mutexes (which I call "Futexes!").
++ * (C) Rusty Russell, IBM 2002
++ *
++ * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
++ * (C) Copyright 2003 Red Hat Inc, All Rights Reserved
++ *
++ * Removed page pinning, fix privately mapped COW pages and other cleanups
++ * (C) Copyright 2003, 2004 Jamie Lokier
++ *
++ * Robust futex support started by Ingo Molnar
++ * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
++ * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
++ *
++ * PI-futex support started by Ingo Molnar and Thomas Gleixner
++ * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
++ * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
++ *
++ * PRIVATE futexes by Eric Dumazet
++ * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
++ *
++ * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
++ * Copyright (C) IBM Corporation, 2009
++ * Thanks to Thomas Gleixner for conceptual design and careful reviews.
++ *
++ * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
++ * enough at me, Linus for the original (flawed) idea, Matthew
++ * Kirkwood for proof-of-concept implementation.
++ *
++ * "The futexes are also cursed."
++ * "But they come in a choice of three flavours!"
++ */
++#include <linux/compat.h>
++#include <linux/jhash.h>
++#include <linux/pagemap.h>
++#include <linux/syscalls.h>
++#include <linux/hugetlb.h>
++#include <linux/freezer.h>
++#include <linux/memblock.h>
++#include <linux/fault-inject.h>
++
++#include <asm/futex.h>
++
++#include "locking/rtmutex_common.h"
++
++/*
++ * READ this before attempting to hack on futexes!
++ *
++ * Basic futex operation and ordering guarantees
++ * =============================================
++ *
++ * The waiter reads the futex value in user space and calls
++ * futex_wait(). This function computes the hash bucket and acquires
++ * the hash bucket lock. After that it reads the futex user space value
++ * again and verifies that the data has not changed. If it has not changed
++ * it enqueues itself into the hash bucket, releases the hash bucket lock
++ * and schedules.
++ *
++ * The waker side modifies the user space value of the futex and calls
++ * futex_wake(). This function computes the hash bucket and acquires the
++ * hash bucket lock. Then it looks for waiters on that futex in the hash
++ * bucket and wakes them.
++ *
++ * In futex wake up scenarios where no tasks are blocked on a futex, taking
++ * the hb spinlock can be avoided and we can simply return. In order for this
++ * optimization to work, ordering guarantees must exist so that the waiter
++ * being added to the list is acknowledged when the list is concurrently being
++ * checked by the waker, avoiding scenarios like the following:
++ *
++ * CPU 0 CPU 1
++ * val = *futex;
++ * sys_futex(WAIT, futex, val);
++ * futex_wait(futex, val);
++ * uval = *futex;
++ * *futex = newval;
++ * sys_futex(WAKE, futex);
++ * futex_wake(futex);
++ * if (queue_empty())
++ * return;
++ * if (uval == val)
++ * lock(hash_bucket(futex));
++ * queue();
++ * unlock(hash_bucket(futex));
++ * schedule();
++ *
++ * This would cause the waiter on CPU 0 to wait forever because it
++ * missed the transition of the user space value from val to newval
++ * and the waker did not find the waiter in the hash bucket queue.
++ *
++ * The correct serialization ensures that a waiter either observes
++ * the changed user space value before blocking or is woken by a
++ * concurrent waker:
++ *
++ * CPU 0 CPU 1
++ * val = *futex;
++ * sys_futex(WAIT, futex, val);
++ * futex_wait(futex, val);
++ *
++ * waiters++; (a)
++ * smp_mb(); (A) <-- paired with -.
++ * |
++ * lock(hash_bucket(futex)); |
++ * |
++ * uval = *futex; |
++ * | *futex = newval;
++ * | sys_futex(WAKE, futex);
++ * | futex_wake(futex);
++ * |
++ * `--------> smp_mb(); (B)
++ * if (uval == val)
++ * queue();
++ * unlock(hash_bucket(futex));
++ * schedule(); if (waiters)
++ * lock(hash_bucket(futex));
++ * else wake_waiters(futex);
++ * waiters--; (b) unlock(hash_bucket(futex));
++ *
++ * Where (A) orders the waiters increment and the futex value read through
++ * atomic operations (see hb_waiters_inc) and where (B) orders the write
++ * to futex and the waiters read (see hb_waiters_pending()).
++ *
++ * This yields the following case (where X:=waiters, Y:=futex):
++ *
++ * X = Y = 0
++ *
++ * w[X]=1 w[Y]=1
++ * MB MB
++ * r[Y]=y r[X]=x
++ *
++ * Which guarantees that x==0 && y==0 is impossible; which translates back into
++ * the guarantee that we cannot both miss the futex variable change and the
++ * enqueue.
++ *
++ * Note that a new waiter is accounted for in (a) even when the wait call
++ * may later return an error, in which case we backtrack from it in (b).
++ * Refer to the comment in queue_lock().
++ *
++ * Similarly, in order to account for waiters being requeued on another
++ * address we always increment the waiters for the destination bucket before
++ * acquiring the lock. It then decrements them again after releasing it -
++ * the code that actually moves the futex(es) between hash buckets (requeue_futex)
++ * will do the additional required waiter count housekeeping. This is done for
++ * double_lock_hb() and double_unlock_hb(), respectively.
++ */
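The diagram above is the classic store-buffering litmus pattern. A minimal user-space rendering of the same shape in C11 atomics (illustrative only, not part of this patch): if either fence below is dropped, both loads may observe 0, which is precisely the missed-wakeup case the comment walks through.

    #include <stdatomic.h>

    atomic_int waiters;     /* X in the diagram */
    atomic_int futex_word;  /* Y in the diagram */

    int waiter_side(void)   /* w[X]=1; MB (A); r[Y] */
    {
        atomic_store_explicit(&waiters, 1, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);      /* barrier (A) */
        return atomic_load_explicit(&futex_word, memory_order_relaxed);
    }

    int waker_side(void)    /* w[Y]=1; MB (B); r[X] */
    {
        atomic_store_explicit(&futex_word, 1, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);      /* barrier (B) */
        return atomic_load_explicit(&waiters, memory_order_relaxed);
    }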
++
++#ifdef CONFIG_HAVE_FUTEX_CMPXCHG
++#define futex_cmpxchg_enabled 1
++#else
++static int __read_mostly futex_cmpxchg_enabled;
++#endif
++
++/*
++ * Futex flags used to encode options to functions and preserve them across
++ * restarts.
++ */
++#ifdef CONFIG_MMU
++# define FLAGS_SHARED 0x01
++#else
++/*
++ * NOMMU does not have per process address space. Let the compiler optimize
++ * code away.
++ */
++# define FLAGS_SHARED 0x00
++#endif
++#define FLAGS_CLOCKRT 0x02
++#define FLAGS_HAS_TIMEOUT 0x04
++
++/*
++ * Priority Inheritance state:
++ */
++struct futex_pi_state {
++ /*
++ * list of 'owned' pi_state instances - these have to be
++ * cleaned up in do_exit() if the task exits prematurely:
++ */
++ struct list_head list;
++
++ /*
++ * The PI object:
++ */
++ struct rt_mutex pi_mutex;
++
++ struct task_struct *owner;
++ refcount_t refcount;
++
++ union futex_key key;
++} __randomize_layout;
++
++/**
++ * struct futex_q - The hashed futex queue entry, one per waiting task
++ * @list: priority-sorted list of tasks waiting on this futex
++ * @task: the task waiting on the futex
++ * @lock_ptr: the hash bucket lock
++ * @key: the key the futex is hashed on
++ * @pi_state: optional priority inheritance state
++ * @rt_waiter: rt_waiter storage for use with requeue_pi
++ * @requeue_pi_key: the requeue_pi target futex key
++ * @bitset: bitset for the optional bitmasked wakeup
++ *
++ * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so
++ * we can wake only the relevant ones (hashed queues may be shared).
++ *
++ * A futex_q has a woken state, just like tasks have TASK_RUNNING.
++ * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
++ * The order of wakeup is always to make the first condition true, then
++ * the second.
++ *
++ * PI futexes are typically woken before they are removed from the hash list via
++ * the rt_mutex code. See unqueue_me_pi().
++ */
++struct futex_q {
++ struct plist_node list;
++
++ struct task_struct *task;
++ spinlock_t *lock_ptr;
++ union futex_key key;
++ struct futex_pi_state *pi_state;
++ struct rt_mutex_waiter *rt_waiter;
++ union futex_key *requeue_pi_key;
++ u32 bitset;
++} __randomize_layout;
++
++static const struct futex_q futex_q_init = {
++ /* list gets initialized in queue_me() */
++ .key = FUTEX_KEY_INIT,
++ .bitset = FUTEX_BITSET_MATCH_ANY
++};
++
++/*
++ * Hash buckets are shared by all the futex_keys that hash to the same
++ * location. Each key may have multiple futex_q structures, one for each task
++ * waiting on a futex.
++ */
++struct futex_hash_bucket {
++ atomic_t waiters;
++ spinlock_t lock;
++ struct plist_head chain;
++} ____cacheline_aligned_in_smp;
++
++/*
++ * The base of the bucket array and its size are always used together
++ * (after initialization only in hash_futex()), so ensure that they
++ * reside in the same cacheline.
++ */
++static struct {
++ struct futex_hash_bucket *queues;
++ unsigned long hashsize;
++} __futex_data __read_mostly __aligned(2*sizeof(long));
++#define futex_queues (__futex_data.queues)
++#define futex_hashsize (__futex_data.hashsize)
++
++
++/*
++ * Fault injections for futexes.
++ */
++#ifdef CONFIG_FAIL_FUTEX
++
++static struct {
++ struct fault_attr attr;
++
++ bool ignore_private;
++} fail_futex = {
++ .attr = FAULT_ATTR_INITIALIZER,
++ .ignore_private = false,
++};
++
++static int __init setup_fail_futex(char *str)
++{
++ return setup_fault_attr(&fail_futex.attr, str);
++}
++__setup("fail_futex=", setup_fail_futex);
++
++static bool should_fail_futex(bool fshared)
++{
++ if (fail_futex.ignore_private && !fshared)
++ return false;
++
++ return should_fail(&fail_futex.attr, 1);
++}
++
++#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
++
++static int __init fail_futex_debugfs(void)
++{
++ umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
++ struct dentry *dir;
++
++ dir = fault_create_debugfs_attr("fail_futex", NULL,
++ &fail_futex.attr);
++ if (IS_ERR(dir))
++ return PTR_ERR(dir);
++
++ debugfs_create_bool("ignore-private", mode, dir,
++ &fail_futex.ignore_private);
++ return 0;
++}
++
++late_initcall(fail_futex_debugfs);
++
++#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
++
++#else
++static inline bool should_fail_futex(bool fshared)
++{
++ return false;
++}
++#endif /* CONFIG_FAIL_FUTEX */
++
++/*
++ * Reflects a new waiter being added to the waitqueue.
++ */
++static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
++{
++#ifdef CONFIG_SMP
++ atomic_inc(&hb->waiters);
++ /*
++ * Full barrier (A), see the ordering comment above.
++ */
++ smp_mb__after_atomic();
++#endif
++}
++
++/*
++ * Reflects a waiter being removed from the waitqueue by wakeup
++ * paths.
++ */
++static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
++{
++#ifdef CONFIG_SMP
++ atomic_dec(&hb->waiters);
++#endif
++}
++
++static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
++{
++#ifdef CONFIG_SMP
++ /*
++ * Full barrier (B), see the ordering comment above.
++ */
++ smp_mb();
++ return atomic_read(&hb->waiters);
++#else
++ return 1;
++#endif
++}
++
++/**
++ * hash_futex - Return the hash bucket in the global hash
++ * @key: Pointer to the futex key for which the hash is calculated
++ *
++ * We hash on the keys returned from get_futex_key (see below) and return the
++ * corresponding hash bucket in the global hash.
++ */
++static struct futex_hash_bucket *hash_futex(union futex_key *key)
++{
++ u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
++ key->both.offset);
++
++ return &futex_queues[hash & (futex_hashsize - 1)];
++}
++
++
++/**
++ * match_futex - Check whether two futex keys are equal
++ * @key1: Pointer to key1
++ * @key2: Pointer to key2
++ *
++ * Return 1 if two futex_keys are equal, 0 otherwise.
++ */
++static inline int match_futex(union futex_key *key1, union futex_key *key2)
++{
++ return (key1 && key2
++ && key1->both.word == key2->both.word
++ && key1->both.ptr == key2->both.ptr
++ && key1->both.offset == key2->both.offset);
++}
++
++enum futex_access {
++ FUTEX_READ,
++ FUTEX_WRITE
++};
++
++/**
++ * futex_setup_timer - set up the sleeping hrtimer.
++ * @time: ptr to the given timeout value
++ * @timeout: the hrtimer_sleeper structure to be set up
++ * @flags: futex flags
++ * @range_ns: optional range in ns
++ *
++ * Return: Initialized hrtimer_sleeper structure or NULL if no timeout
++ * value given
++ */
++static inline struct hrtimer_sleeper *
++futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
++ int flags, u64 range_ns)
++{
++ if (!time)
++ return NULL;
++
++ hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ?
++ CLOCK_REALTIME : CLOCK_MONOTONIC,
++ HRTIMER_MODE_ABS);
++ /*
++ * If range_ns is 0, calling hrtimer_set_expires_range_ns() is
++ * effectively the same as calling hrtimer_set_expires().
++ */
++ hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);
++
++ return timeout;
++}
++
++/*
++ * Generate a machine wide unique identifier for this inode.
++ *
++ * This relies on u64 not wrapping in the lifetime of the machine, which with
++ * 1ns resolution means almost 585 years.
++ *
++ * This further relies on the fact that a well formed program will not unmap
++ * the file while it has a (shared) futex waiting on it. This mapping will have
++ * a file reference which pins the mount and inode.
++ *
++ * If for some reason an inode gets evicted and read back in again, it will get
++ * a new sequence number and will _NOT_ match, even though it is the exact same
++ * file.
++ *
++ * It is important that match_futex() will never have a false-positive, esp.
++ * for PI futexes that can mess up the state. The above argues that false-negatives
++ * are only possible for malformed programs.
++ */
++static u64 get_inode_sequence_number(struct inode *inode)
++{
++ static atomic64_t i_seq;
++ u64 old;
++
++ /* Does the inode already have a sequence number? */
++ old = atomic64_read(&inode->i_sequence);
++ if (likely(old))
++ return old;
++
++ for (;;) {
++ u64 new = atomic64_add_return(1, &i_seq);
++ if (WARN_ON_ONCE(!new))
++ continue;
++
++ old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new);
++ if (old)
++ return old;
++ return new;
++ }
++}
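get_inode_sequence_number() is an instance of a general lazy-publication idiom: read racily, fall back to a cmpxchg from zero, and let losers adopt the winner's value. The idiom in stripped-down form (illustrative sketch; assumes candidates are never zero):

    static u64 publish_once(atomic64_t *slot, u64 candidate)
    {
        /* Racy fast path: someone may already have published a value. */
        u64 old = atomic64_read(slot);

        if (old)
            return old;

        /* Exactly one zero->candidate transition wins. */
        old = atomic64_cmpxchg_relaxed(slot, 0, candidate);
        return old ? old : candidate;
    }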
++
++/**
++ * get_futex_key() - Get parameters which are the keys for a futex
++ * @uaddr: virtual address of the futex
++ * @fshared: false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED
++ * @key: address where result is stored.
++ * @rw: mapping needs to be read/write (values: FUTEX_READ,
++ * FUTEX_WRITE)
++ *
++ * Return: a negative error code or 0
++ *
++ * The key words are stored in @key on success.
++ *
++ * For shared mappings (when @fshared), the key is:
++ *
++ * ( inode->i_sequence, page->index, offset_within_page )
++ *
++ * [ also see get_inode_sequence_number() ]
++ *
++ * For private mappings (or when !@fshared), the key is:
++ *
++ * ( current->mm, address, 0 )
++ *
++ * This allows (cross process, where applicable) identification of the futex
++ * without keeping the page pinned for the duration of the FUTEX_WAIT.
++ *
++ * lock_page() might sleep, the caller should not hold a spinlock.
++ */
++static int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
++ enum futex_access rw)
++{
++ unsigned long address = (unsigned long)uaddr;
++ struct mm_struct *mm = current->mm;
++ struct page *page, *tail;
++ struct address_space *mapping;
++ int err, ro = 0;
++
++ /*
++ * The futex address must be "naturally" aligned.
++ */
++ key->both.offset = address % PAGE_SIZE;
++ if (unlikely((address % sizeof(u32)) != 0))
++ return -EINVAL;
++ address -= key->both.offset;
++
++ if (unlikely(!access_ok(uaddr, sizeof(u32))))
++ return -EFAULT;
++
++ if (unlikely(should_fail_futex(fshared)))
++ return -EFAULT;
++
++ /*
++ * PROCESS_PRIVATE futexes are fast.
++ * As the mm cannot disappear under us and the 'key' only needs the
++ * virtual address, we don't even have to find the underlying vma.
++ * Note: We do have to check 'uaddr' is a valid user address,
++ * but access_ok() should be faster than find_vma().
++ */
++ if (!fshared) {
++ key->private.mm = mm;
++ key->private.address = address;
++ return 0;
++ }
++
++again:
++ /* Ignore any VERIFY_READ mapping (futex common case) */
++ if (unlikely(should_fail_futex(true)))
++ return -EFAULT;
++
++ err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
++ /*
++ * If write access is not required (e.g. FUTEX_WAIT), try
++ * to get read-only access.
++ */
++ if (err == -EFAULT && rw == FUTEX_READ) {
++ err = get_user_pages_fast(address, 1, 0, &page);
++ ro = 1;
++ }
++ if (err < 0)
++ return err;
++ else
++ err = 0;
++
++ /*
++ * The treatment of mapping from this point on is critical. The page
++ * lock protects many things but in this context the page lock
++ * stabilizes mapping, prevents inode freeing in the shared
++ * file-backed region case and guards against movement to swap cache.
++ *
++ * Strictly speaking the page lock is not needed in all cases being
++ * considered here and the page lock forces unnecessary serialization.
++ * From this point on, mapping will be re-verified if necessary and
++ * the page lock will be acquired only if it is unavoidable.
++ *
++ * Mapping checks require the head page for any compound page so the
++ * head page and mapping is looked up now. For anonymous pages, it
++ * does not matter if the page splits in the future as the key is
++ * based on the address. For filesystem-backed pages, the tail is
++ * required as the index of the page determines the key. For
++ * base pages, there is no tail page and tail == page.
++ */
++ tail = page;
++ page = compound_head(page);
++ mapping = READ_ONCE(page->mapping);
++
++ /*
++ * If page->mapping is NULL, then it cannot be a PageAnon
++ * page; but it might be the ZERO_PAGE or in the gate area or
++ * in a special mapping (all cases which we are happy to fail);
++ * or it may have been a good file page when get_user_pages_fast
++ * found it, but truncated or holepunched or subjected to
++ * invalidate_complete_page2 before we got the page lock (also
++ * cases which we are happy to fail). And we hold a reference,
++ * so refcount care in invalidate_complete_page's remove_mapping
++ * prevents drop_caches from setting mapping to NULL beneath us.
++ *
++ * The case we do have to guard against is when memory pressure made
++ * shmem_writepage move it from filecache to swapcache beneath us:
++ * an unlikely race, but we do need to retry for page->mapping.
++ */
++ if (unlikely(!mapping)) {
++ int shmem_swizzled;
++
++ /*
++ * Page lock is required to identify which special case above
++ * applies. If this is really a shmem page then the page lock
++ * will prevent unexpected transitions.
++ */
++ lock_page(page);
++ shmem_swizzled = PageSwapCache(page) || page->mapping;
++ unlock_page(page);
++ put_page(page);
++
++ if (shmem_swizzled)
++ goto again;
++
++ return -EFAULT;
++ }
++
++ /*
++ * Private mappings are handled in a simple way.
++ *
++ * If the futex key is stored on an anonymous page, then the associated
++ * object is the mm which is implicitly pinned by the calling process.
++ *
++ * NOTE: When userspace waits on a MAP_SHARED mapping, even if
++ * it's a read-only handle, it's expected that futexes attach to
++ * the object not the particular process.
++ */
++ if (PageAnon(page)) {
++ /*
++ * A RO anonymous page will never change and thus doesn't make
++ * sense for futex operations.
++ */
++ if (unlikely(should_fail_futex(true)) || ro) {
++ err = -EFAULT;
++ goto out;
++ }
++
++ key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
++ key->private.mm = mm;
++ key->private.address = address;
++
++ } else {
++ struct inode *inode;
++
++ /*
++ * The associated futex object in this case is the inode and
++ * the page->mapping must be traversed. Ordinarily this should
++ * be stabilised under page lock but it's not strictly
++ * necessary in this case as we just want to pin the inode, not
++ * update the radix tree or anything like that.
++ *
++ * The RCU read lock is taken as the inode is finally freed
++ * under RCU. If the mapping still matches expectations then the
++ * mapping->host can be safely accessed as being a valid inode.
++ */
++ rcu_read_lock();
++
++ if (READ_ONCE(page->mapping) != mapping) {
++ rcu_read_unlock();
++ put_page(page);
++
++ goto again;
++ }
++
++ inode = READ_ONCE(mapping->host);
++ if (!inode) {
++ rcu_read_unlock();
++ put_page(page);
++
++ goto again;
++ }
++
++ key->both.offset |= FUT_OFF_INODE; /* inode-based key */
++ key->shared.i_seq = get_inode_sequence_number(inode);
++ key->shared.pgoff = basepage_index(tail);
++ rcu_read_unlock();
++ }
++
++out:
++ put_page(page);
++ return err;
++}
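In uapi terms the @fshared split corresponds to FUTEX_PRIVATE_FLAG on the classic futex(2) interface: with the flag, the kernel takes the fast (mm, address) key above; without it, it walks the page to build the inode-based key. A minimal user-space sketch (illustrative only; the wrapper name is made up):

    #include <stdint.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/futex.h>

    /* Block while *uaddr == val; 'private' selects the cheap key path. */
    static long futex_wait(uint32_t *uaddr, uint32_t val, int private)
    {
        int op = FUTEX_WAIT | (private ? FUTEX_PRIVATE_FLAG : 0);

        return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
    }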
++
++/**
++ * fault_in_user_writeable() - Fault in user address and verify RW access
++ * @uaddr: pointer to faulting user space address
++ *
++ * Slow path to fixup the fault we just took in the atomic write
++ * access to @uaddr.
++ *
++ * We have no generic implementation of a non-destructive write to the
++ * user address. We know that we faulted in the atomic pagefault
++ * disabled section so we might as well avoid the #PF overhead by
++ * calling fixup_user_fault() right away.
++ */
++static int fault_in_user_writeable(u32 __user *uaddr)
++{
++ struct mm_struct *mm = current->mm;
++ int ret;
++
++ mmap_read_lock(mm);
++ ret = fixup_user_fault(mm, (unsigned long)uaddr,
++ FAULT_FLAG_WRITE, NULL);
++ mmap_read_unlock(mm);
++
++ return ret < 0 ? ret : 0;
++}
++
++/**
++ * futex_top_waiter() - Return the highest priority waiter on a futex
++ * @hb: the hash bucket the futex_q's reside in
++ * @key: the futex key (to distinguish it from futex_q's of other futexes)
++ *
++ * Must be called with the hb lock held.
++ */
++static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
++ union futex_key *key)
++{
++ struct futex_q *this;
++
++ plist_for_each_entry(this, &hb->chain, list) {
++ if (match_futex(&this->key, key))
++ return this;
++ }
++ return NULL;
++}
++
++static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
++ u32 uval, u32 newval)
++{
++ int ret;
++
++ pagefault_disable();
++ ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
++ pagefault_enable();
++
++ return ret;
++}
++
++static int get_futex_value_locked(u32 *dest, u32 __user *from)
++{
++ int ret;
++
++ pagefault_disable();
++ ret = __get_user(*dest, from);
++ pagefault_enable();
++
++ return ret ? -EFAULT : 0;
++}
++
++
++/*
++ * PI code:
++ */
++static int refill_pi_state_cache(void)
++{
++ struct futex_pi_state *pi_state;
++
++ if (likely(current->pi_state_cache))
++ return 0;
++
++ pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
++
++ if (!pi_state)
++ return -ENOMEM;
++
++ INIT_LIST_HEAD(&pi_state->list);
++ /* pi_mutex gets initialized later */
++ pi_state->owner = NULL;
++ refcount_set(&pi_state->refcount, 1);
++ pi_state->key = FUTEX_KEY_INIT;
++
++ current->pi_state_cache = pi_state;
++
++ return 0;
++}
++
++static struct futex_pi_state *alloc_pi_state(void)
++{
++ struct futex_pi_state *pi_state = current->pi_state_cache;
++
++ WARN_ON(!pi_state);
++ current->pi_state_cache = NULL;
++
++ return pi_state;
++}
++
++static void get_pi_state(struct futex_pi_state *pi_state)
++{
++ WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
++}
++
++/*
++ * Drops a reference to the pi_state object and frees or caches it
++ * when the last reference is gone.
++ */
++static void put_pi_state(struct futex_pi_state *pi_state)
++{
++ if (!pi_state)
++ return;
++
++ if (!refcount_dec_and_test(&pi_state->refcount))
++ return;
++
++ /*
++ * If pi_state->owner is NULL, the owner is most probably dying
++ * and has cleaned up the pi_state already
++ */
++ if (pi_state->owner) {
++ struct task_struct *owner;
++
++ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
++ owner = pi_state->owner;
++ if (owner) {
++ raw_spin_lock(&owner->pi_lock);
++ list_del_init(&pi_state->list);
++ raw_spin_unlock(&owner->pi_lock);
++ }
++ rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner);
++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
++ }
++
++ if (current->pi_state_cache) {
++ kfree(pi_state);
++ } else {
++ /*
++ * pi_state->list is already empty.
++ * clear pi_state->owner.
++ * refcount is at 0 - put it back to 1.
++ */
++ pi_state->owner = NULL;
++ refcount_set(&pi_state->refcount, 1);
++ current->pi_state_cache = pi_state;
++ }
++}
++
++/*
++ * We need to check the following states:
++ *
++ * Waiter | pi_state | pi->owner | uTID | uODIED | ?
++ *
++ * [1] NULL | --- | --- | 0 | 0/1 | Valid
++ * [2] NULL | --- | --- | >0 | 0/1 | Valid
++ *
++ * [3] Found | NULL | -- | Any | 0/1 | Invalid
++ *
++ * [4] Found | Found | NULL | 0 | 1 | Valid
++ * [5] Found | Found | NULL | >0 | 1 | Invalid
++ *
++ * [6] Found | Found | task | 0 | 1 | Valid
++ *
++ * [7] Found | Found | NULL | Any | 0 | Invalid
++ *
++ * [8] Found | Found | task | ==taskTID | 0/1 | Valid
++ * [9] Found | Found | task | 0 | 0 | Invalid
++ * [10] Found | Found | task | !=taskTID | 0/1 | Invalid
++ *
++ * [1] Indicates that the kernel can acquire the futex atomically. We
++ * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
++ *
++ * [2] Valid, if TID does not belong to a kernel thread. If no matching
++ * thread is found then it indicates that the owner TID has died.
++ *
++ * [3] Invalid. The waiter is queued on a non PI futex
++ *
++ * [4] Valid state after exit_robust_list(), which sets the user space
++ * value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
++ *
++ * [5] The user space value got manipulated between exit_robust_list()
++ * and exit_pi_state_list()
++ *
++ * [6] Valid state after exit_pi_state_list() which sets the new owner in
++ * the pi_state but cannot access the user space value.
++ *
++ * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set.
++ *
++ * [8] Owner and user space value match
++ *
++ * [9] There is no transient state which sets the user space TID to 0
++ * except exit_robust_list(), but this is indicated by the
++ * FUTEX_OWNER_DIED bit. See [4]
++ *
++ * [10] There is no transient state which leaves owner and user space
++ * TID out of sync.
++ *
++ *
++ * Serialization and lifetime rules:
++ *
++ * hb->lock:
++ *
++ * hb -> futex_q, relation
++ * futex_q -> pi_state, relation
++ *
++ * (cannot be raw because hb can contain an arbitrary number
++ * of futex_q's)
++ *
++ * pi_mutex->wait_lock:
++ *
++ * {uval, pi_state}
++ *
++ * (and pi_mutex 'obviously')
++ *
++ * p->pi_lock:
++ *
++ * p->pi_state_list -> pi_state->list, relation
++ *
++ * pi_state->refcount:
++ *
++ * pi_state lifetime
++ *
++ *
++ * Lock order:
++ *
++ * hb->lock
++ * pi_mutex->wait_lock
++ * p->pi_lock
++ *
++ */
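The uTID and uODIED columns in the table refer to bitfields of the 32-bit user-space word. A small decoding sketch using the constants from include/uapi/linux/futex.h (illustrative only):

    #include <stdint.h>
    #include <stdio.h>
    #include <linux/futex.h>

    static void decode_pi_word(uint32_t uval)
    {
        printf("owner TID : %u\n", uval & FUTEX_TID_MASK);       /* uTID */
        printf("waiters   : %d\n", !!(uval & FUTEX_WAITERS));
        printf("owner died: %d\n", !!(uval & FUTEX_OWNER_DIED)); /* uODIED */
    }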
++
++/*
++ * Validate that the existing waiter has a pi_state and sanity check
++ * the pi_state against the user space value. If correct, attach to
++ * it.
++ */
++static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
++ struct futex_pi_state *pi_state,
++ struct futex_pi_state **ps)
++{
++ pid_t pid = uval & FUTEX_TID_MASK;
++ u32 uval2;
++ int ret;
++
++ /*
++ * Userspace might have messed up non-PI and PI futexes [3]
++ */
++ if (unlikely(!pi_state))
++ return -EINVAL;
++
++ /*
++ * We get here with hb->lock held, and having found a
++ * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
++ * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
++ * which in turn means that futex_lock_pi() still has a reference on
++ * our pi_state.
++ *
++ * The waiter holding a reference on @pi_state also protects against
++ * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
++ * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
++ * free pi_state before we can take a reference ourselves.
++ */
++ WARN_ON(!refcount_read(&pi_state->refcount));
++
++ /*
++ * Now that we have a pi_state, we can acquire wait_lock
++ * and do the state validation.
++ */
++ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
++
++ /*
++ * Since {uval, pi_state} is serialized by wait_lock, and our current
++ * uval was read without holding it, it can have changed. Verify it
++ * still is what we expect it to be, otherwise retry the entire
++ * operation.
++ */
++ if (get_futex_value_locked(&uval2, uaddr))
++ goto out_efault;
++
++ if (uval != uval2)
++ goto out_eagain;
++
++ /*
++ * Handle the owner died case:
++ */
++ if (uval & FUTEX_OWNER_DIED) {
++ /*
++ * exit_pi_state_list sets owner to NULL and wakes the
++ * topmost waiter. The task which acquires the
++ * pi_state->rt_mutex will fixup owner.
++ */
++ if (!pi_state->owner) {
++ /*
++ * No pi state owner, but the user space TID
++ * is not 0. Inconsistent state. [5]
++ */
++ if (pid)
++ goto out_einval;
++ /*
++ * Take a ref on the state and return success. [4]
++ */
++ goto out_attach;
++ }
++
++ /*
++ * If TID is 0, then either the dying owner has not
++ * yet executed exit_pi_state_list() or some waiter
++ * acquired the rtmutex in the pi state, but did not
++ * yet fixup the TID in user space.
++ *
++ * Take a ref on the state and return success. [6]
++ */
++ if (!pid)
++ goto out_attach;
++ } else {
++ /*
++ * If the owner died bit is not set, then the pi_state
++ * must have an owner. [7]
++ */
++ if (!pi_state->owner)
++ goto out_einval;
++ }
++
++ /*
++ * Bail out if user space manipulated the futex value. If pi
++ * state exists then the owner TID must be the same as the
++ * user space TID. [9/10]
++ */
++ if (pid != task_pid_vnr(pi_state->owner))
++ goto out_einval;
++
++out_attach:
++ get_pi_state(pi_state);
++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
++ *ps = pi_state;
++ return 0;
++
++out_einval:
++ ret = -EINVAL;
++ goto out_error;
++
++out_eagain:
++ ret = -EAGAIN;
++ goto out_error;
++
++out_efault:
++ ret = -EFAULT;
++ goto out_error;
++
++out_error:
++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
++ return ret;
++}
++
++/**
++ * wait_for_owner_exiting - Block until the owner has exited
++ * @ret: owner's current futex lock status
++ * @exiting: Pointer to the exiting task
++ *
++ * Caller must hold a refcount on @exiting.
++ */
++static void wait_for_owner_exiting(int ret, struct task_struct *exiting)
++{
++ if (ret != -EBUSY) {
++ WARN_ON_ONCE(exiting);
++ return;
++ }
++
++ if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
++ return;
++
++ mutex_lock(&exiting->futex_exit_mutex);
++ /*
++ * No point in doing state checking here. If the waiter got here
++ * while the task was in exec()->exec_futex_release() then it can
++ * have any FUTEX_STATE_* value when the waiter has acquired the
++ * mutex. OK, if running, EXITING or DEAD if it reached exit()
++ * already. Highly unlikely and not a problem. Just one more round
++ * through the futex maze.
++ */
++ mutex_unlock(&exiting->futex_exit_mutex);
++
++ put_task_struct(exiting);
++}
++
++static int handle_exit_race(u32 __user *uaddr, u32 uval,
++ struct task_struct *tsk)
++{
++ u32 uval2;
++
++ /*
++ * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
++ * caller that the alleged owner is busy.
++ */
++ if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
++ return -EBUSY;
++
++ /*
++ * Reread the user space value to handle the following situation:
++ *
++ * CPU0 CPU1
++ *
++ * sys_exit() sys_futex()
++ * do_exit() futex_lock_pi()
++ * futex_lock_pi_atomic()
++ * exit_signals(tsk) No waiters:
++ * tsk->flags |= PF_EXITING; *uaddr == 0x00000PID
++ * mm_release(tsk) Set waiter bit
++ * exit_robust_list(tsk) { *uaddr = 0x80000PID;
++ * Set owner died attach_to_pi_owner() {
++ * *uaddr = 0xC0000000; tsk = get_task(PID);
++ * } if (!tsk->flags & PF_EXITING) {
++ * ... attach();
++ * tsk->futex_state = } else {
++ * FUTEX_STATE_DEAD; if (tsk->futex_state !=
++ * FUTEX_STATE_DEAD)
++ * return -EAGAIN;
++ * return -ESRCH; <--- FAIL
++ * }
++ *
++ * Returning ESRCH unconditionally is wrong here because the
++ * user space value has been changed by the exiting task.
++ *
++ * The same logic applies to the case where the exiting task is
++ * already gone.
++ */
++ if (get_futex_value_locked(&uval2, uaddr))
++ return -EFAULT;
++
++ /* If the user space value has changed, try again. */
++ if (uval2 != uval)
++ return -EAGAIN;
++
++ /*
++ * The exiting task did not have a robust list, the robust list was
++ * corrupted or the user space value in *uaddr is simply bogus.
++ * Give up and tell user space.
++ */
++ return -ESRCH;
++}
++
++/*
++ * Lookup the task for the TID provided from user space and attach to
++ * it after doing proper sanity checks.
++ */
++static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
++ struct futex_pi_state **ps,
++ struct task_struct **exiting)
++{
++ pid_t pid = uval & FUTEX_TID_MASK;
++ struct futex_pi_state *pi_state;
++ struct task_struct *p;
++
++ /*
++ * We are the first waiter - try to look up the real owner and attach
++ * the new pi_state to it, but bail out when TID = 0 [1]
++ *
++ * The !pid check is paranoid. None of the call sites should end up
++ * with pid == 0, but better safe than sorry. Let the caller retry
++ */
++ if (!pid)
++ return -EAGAIN;
++ p = find_get_task_by_vpid(pid);
++ if (!p)
++ return handle_exit_race(uaddr, uval, NULL);
++
++ if (unlikely(p->flags & PF_KTHREAD)) {
++ put_task_struct(p);
++ return -EPERM;
++ }
++
++ /*
++ * We need to look at the task state to figure out whether the
++ * task is exiting. To protect against the change of the task state
++ * in futex_exit_release(), we do this protected by p->pi_lock:
++ */
++ raw_spin_lock_irq(&p->pi_lock);
++ if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
++ /*
++ * The task is on the way out. When the futex state is
++ * FUTEX_STATE_DEAD, we know that the task has finished
++ * the cleanup:
++ */
++ int ret = handle_exit_race(uaddr, uval, p);
++
++ raw_spin_unlock_irq(&p->pi_lock);
++ /*
++ * If the owner task is between FUTEX_STATE_EXITING and
++ * FUTEX_STATE_DEAD then store the task pointer and keep
++ * the reference on the task struct. The calling code will
++ * drop all locks, wait for the task to reach
++ * FUTEX_STATE_DEAD and then drop the refcount. This is
++ * required to prevent a live lock when the current task
++ * preempted the exiting task between the two states.
++ */
++ if (ret == -EBUSY)
++ *exiting = p;
++ else
++ put_task_struct(p);
++ return ret;
++ }
++
++ /*
++ * No existing pi state. First waiter. [2]
++ *
++ * This creates pi_state, we have hb->lock held, this means nothing can
++ * observe this state, wait_lock is irrelevant.
++ */
++ pi_state = alloc_pi_state();
++
++ /*
++ * Initialize the pi_mutex in locked state and make @p
++ * the owner of it:
++ */
++ rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
++
++ /* Store the key for possible exit cleanups: */
++ pi_state->key = *key;
++
++ WARN_ON(!list_empty(&pi_state->list));
++ list_add(&pi_state->list, &p->pi_state_list);
++ /*
++ * Assignment without holding pi_state->pi_mutex.wait_lock is safe
++ * because there is no concurrency as the object is not published yet.
++ */
++ pi_state->owner = p;
++ raw_spin_unlock_irq(&p->pi_lock);
++
++ put_task_struct(p);
++
++ *ps = pi_state;
++
++ return 0;
++}
++
++static int lookup_pi_state(u32 __user *uaddr, u32 uval,
++ struct futex_hash_bucket *hb,
++ union futex_key *key, struct futex_pi_state **ps,
++ struct task_struct **exiting)
++{
++ struct futex_q *top_waiter = futex_top_waiter(hb, key);
++
++ /*
++ * If there is a waiter on that futex, validate it and
++ * attach to the pi_state when the validation succeeds.
++ */
++ if (top_waiter)
++ return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
++
++ /*
++ * We are the first waiter - try to look up the owner based on
++ * @uval and attach to it.
++ */
++ return attach_to_pi_owner(uaddr, uval, key, ps, exiting);
++}
++
++static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
++{
++ int err;
++ u32 curval;
++
++ if (unlikely(should_fail_futex(true)))
++ return -EFAULT;
++
++ err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
++ if (unlikely(err))
++ return err;
++
++ /* If user space value changed, let the caller retry */
++ return curval != uval ? -EAGAIN : 0;
++}
++
++/**
++ * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
++ * @uaddr: the pi futex user address
++ * @hb: the pi futex hash bucket
++ * @key: the futex key associated with uaddr and hb
++ * @ps: the pi_state pointer where we store the result of the
++ * lookup
++ * @task: the task to perform the atomic lock work for. This will
++ * be "current" except in the case of requeue pi.
++ * @exiting: Pointer to store the task pointer of the owner task
++ * which is in the middle of exiting
++ * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
++ *
++ * Return:
++ * - 0 - ready to wait;
++ * - 1 - acquired the lock;
++ * - <0 - error
++ *
++ * The hb->lock and futex_key refs shall be held by the caller.
++ *
++ * @exiting is only set when the return value is -EBUSY. If so, this holds
++ * a refcount on the exiting task on return and the caller needs to drop it
++ * after waiting for the exit to complete.
++ */
++static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
++ union futex_key *key,
++ struct futex_pi_state **ps,
++ struct task_struct *task,
++ struct task_struct **exiting,
++ int set_waiters)
++{
++ u32 uval, newval, vpid = task_pid_vnr(task);
++ struct futex_q *top_waiter;
++ int ret;
++
++ /*
++ * Read the user space value first so we can validate a few
++ * things before proceeding further.
++ */
++ if (get_futex_value_locked(&uval, uaddr))
++ return -EFAULT;
++
++ if (unlikely(should_fail_futex(true)))
++ return -EFAULT;
++
++ /*
++ * Detect deadlocks.
++ */
++ if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
++ return -EDEADLK;
++
++ if ((unlikely(should_fail_futex(true))))
++ return -EDEADLK;
++
++ /*
++ * Lookup existing state first. If it exists, try to attach to
++ * its pi_state.
++ */
++ top_waiter = futex_top_waiter(hb, key);
++ if (top_waiter)
++ return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
++
++ /*
++ * No waiter and the user TID is 0. We are here because the
++ * waiters bit or the owner died bit is set, or we were called
++ * from requeue_cmp_pi, or for whatever other reason something
++ * entered the syscall.
++ */
++ if (!(uval & FUTEX_TID_MASK)) {
++ /*
++ * We take over the futex. No other waiters and the user space
++ * TID is 0. We preserve the owner died bit.
++ */
++ newval = uval & FUTEX_OWNER_DIED;
++ newval |= vpid;
++
++ /* The futex requeue_pi code can enforce the waiters bit */
++ if (set_waiters)
++ newval |= FUTEX_WAITERS;
++
++ ret = lock_pi_update_atomic(uaddr, uval, newval);
++ /* If the take over worked, return 1 */
++ return ret < 0 ? ret : 1;
++ }
++
++ /*
++ * First waiter. Set the waiters bit before attaching ourselves to
++ * the owner. If owner tries to unlock, it will be forced into
++ * the kernel and blocked on hb->lock.
++ */
++ newval = uval | FUTEX_WAITERS;
++ ret = lock_pi_update_atomic(uaddr, uval, newval);
++ if (ret)
++ return ret;
++ /*
++ * If the update of the user space value succeeded, we try to
++ * attach to the owner. If that fails, no harm done, we only
++ * set the FUTEX_WAITERS bit in the user space variable.
++ */
++ return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
++}
++
++/**
++ * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
++ * @q: The futex_q to unqueue
++ *
++ * The q->lock_ptr must not be NULL and must be held by the caller.
++ */
++static void __unqueue_futex(struct futex_q *q)
++{
++ struct futex_hash_bucket *hb;
++
++ if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list)))
++ return;
++ lockdep_assert_held(q->lock_ptr);
++
++ hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
++ plist_del(&q->list, &hb->chain);
++ hb_waiters_dec(hb);
++}
++
++/*
++ * The hash bucket lock must be held when this is called.
++ * Afterwards, the futex_q must not be accessed. Callers
++ * must ensure to later call wake_up_q() for the actual
++ * wakeups to occur.
++ */
++static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
++{
++ struct task_struct *p = q->task;
++
++ if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
++ return;
++
++ get_task_struct(p);
++ __unqueue_futex(q);
++ /*
++ * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
++ * is written, without taking any locks. This is possible in the event
++ * of a spurious wakeup, for example. A memory barrier is required here
++ * to prevent the following store to lock_ptr from getting ahead of the
++ * plist_del in __unqueue_futex().
++ */
++ smp_store_release(&q->lock_ptr, NULL);
++
++ /*
++ * Queue the task for later wakeup for after we've released
++ * the hb->lock.
++ */
++ wake_q_add_safe(wake_q, p);
++}
++
++/*
++ * Caller must hold a reference on @pi_state.
++ */
++static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
++{
++ u32 curval, newval;
++ struct task_struct *new_owner;
++ bool postunlock = false;
++ DEFINE_WAKE_Q(wake_q);
++ int ret = 0;
++
++ new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
++ if (WARN_ON_ONCE(!new_owner)) {
++ /*
++ * As per the comment in futex_unlock_pi() this should not happen.
++ *
++ * When this happens, give up our locks and try again, giving
++ * the futex_lock_pi() instance time to complete, either by
++ * waiting on the rtmutex or removing itself from the futex
++ * queue.
++ */
++ ret = -EAGAIN;
++ goto out_unlock;
++ }
++
++ /*
++ * We pass it to the next owner. The WAITERS bit is always kept
++ * enabled while there is PI state around. We clean up the owner
++ * died bit, because we are the owner.
++ */
++ newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
++
++ if (unlikely(should_fail_futex(true)))
++ ret = -EFAULT;
++
++ ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
++ if (!ret && (curval != uval)) {
++ /*
++ * If an unconditional UNLOCK_PI operation (user space did not
++ * try the TID->0 transition) raced with a waiter setting the
++ * FUTEX_WAITERS flag between get_user() and locking the hash
++ * bucket lock, retry the operation.
++ */
++ if ((FUTEX_TID_MASK & curval) == uval)
++ ret = -EAGAIN;
++ else
++ ret = -EINVAL;
++ }
++
++ if (ret)
++ goto out_unlock;
++
++ /*
++ * This is a point of no return; once we modify the uval there is no
++ * going back and subsequent operations must not fail.
++ */
++
++ raw_spin_lock(&pi_state->owner->pi_lock);
++ WARN_ON(list_empty(&pi_state->list));
++ list_del_init(&pi_state->list);
++ raw_spin_unlock(&pi_state->owner->pi_lock);
++
++ raw_spin_lock(&new_owner->pi_lock);
++ WARN_ON(!list_empty(&pi_state->list));
++ list_add(&pi_state->list, &new_owner->pi_state_list);
++ pi_state->owner = new_owner;
++ raw_spin_unlock(&new_owner->pi_lock);
++
++ postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
++
++out_unlock:
++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
++
++ if (postunlock)
++ rt_mutex_postunlock(&wake_q);
++
++ return ret;
++}
++
++/*
++ * Express the locking dependencies for lockdep:
++ */
++static inline void
++double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
++{
++ if (hb1 <= hb2) {
++ spin_lock(&hb1->lock);
++ if (hb1 < hb2)
++ spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
++ } else { /* hb1 > hb2 */
++ spin_lock(&hb2->lock);
++ spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
++ }
++}
++
++static inline void
++double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
++{
++ spin_unlock(&hb1->lock);
++ if (hb1 != hb2)
++ spin_unlock(&hb2->lock);
++}
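Beyond the lockdep annotation, the fixed (address) acquisition order is what rules out an ABBA deadlock when two paths lock the same pair of buckets in opposite argument order. The same pattern in generic form (illustrative sketch):

    /* Always take the lower-addressed lock first. */
    static void lock_pair(spinlock_t *a, spinlock_t *b)
    {
        if (a > b)
            swap(a, b);

        spin_lock(a);
        if (a != b)
            spin_lock_nested(b, SINGLE_DEPTH_NESTING);
    }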
++
++/*
++ * Wake up waiters matching bitset queued on this futex (uaddr).
++ */
++static int
++futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
++{
++ struct futex_hash_bucket *hb;
++ struct futex_q *this, *next;
++ union futex_key key = FUTEX_KEY_INIT;
++ int ret;
++ DEFINE_WAKE_Q(wake_q);
++
++ if (!bitset)
++ return -EINVAL;
++
++ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ);
++ if (unlikely(ret != 0))
++ return ret;
++
++ hb = hash_futex(&key);
++
++ /* Make sure we really have tasks to wakeup */
++ if (!hb_waiters_pending(hb))
++ return ret;
++
++ spin_lock(&hb->lock);
++
++ plist_for_each_entry_safe(this, next, &hb->chain, list) {
++ if (match_futex(&this->key, &key)) {
++ if (this->pi_state || this->rt_waiter) {
++ ret = -EINVAL;
++ break;
++ }
++
++ /* Check if one of the bits is set in both bitsets */
++ if (!(this->bitset & bitset))
++ continue;
++
++ mark_wake_futex(&wake_q, this);
++ if (++ret >= nr_wake)
++ break;
++ }
++ }
++
++ spin_unlock(&hb->lock);
++ wake_up_q(&wake_q);
++ return ret;
++}
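This is the handler behind both FUTEX_WAKE and FUTEX_WAKE_BITSET on the classic interface; a plain FUTEX_WAKE arrives with bitset == FUTEX_BITSET_MATCH_ANY, so every waiter's bitset matches. A minimal user-space sketch (illustrative only; wrapper names are made up):

    #include <stdint.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/futex.h>

    static long wake_one(uint32_t *uaddr)
    {
        return syscall(SYS_futex, uaddr, FUTEX_WAKE_PRIVATE, 1, NULL, NULL, 0);
    }

    /* val3 carries the bitset for the _BITSET variants. */
    static long wake_bitset(uint32_t *uaddr, int nr, uint32_t bitset)
    {
        return syscall(SYS_futex, uaddr, FUTEX_WAKE_BITSET_PRIVATE, nr,
                       NULL, NULL, bitset);
    }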
++
++static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
++{
++ unsigned int op = (encoded_op & 0x70000000) >> 28;
++ unsigned int cmp = (encoded_op & 0x0f000000) >> 24;
++ int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11);
++ int cmparg = sign_extend32(encoded_op & 0x00000fff, 11);
++ int oldval, ret;
++
++ if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
++ if (oparg < 0 || oparg > 31) {
++ char comm[sizeof(current->comm)];
++ /*
++ * kill this print and return -EINVAL when userspace
++ * is sane again
++ */
++ pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n",
++ get_task_comm(comm, current), oparg);
++ oparg &= 31;
++ }
++ oparg = 1 << oparg;
++ }
++
++ pagefault_disable();
++ ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
++ pagefault_enable();
++ if (ret)
++ return ret;
++
++ switch (cmp) {
++ case FUTEX_OP_CMP_EQ:
++ return oldval == cmparg;
++ case FUTEX_OP_CMP_NE:
++ return oldval != cmparg;
++ case FUTEX_OP_CMP_LT:
++ return oldval < cmparg;
++ case FUTEX_OP_CMP_GE:
++ return oldval >= cmparg;
++ case FUTEX_OP_CMP_LE:
++ return oldval <= cmparg;
++ case FUTEX_OP_CMP_GT:
++ return oldval > cmparg;
++ default:
++ return -ENOSYS;
++ }
++}
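The bit-slicing above is the inverse of the FUTEX_OP() macro in include/uapi/linux/futex.h, which user space uses to build the third futex word for FUTEX_WAKE_OP. A sketch of the encoding side (illustrative only; the wrapper name is made up):

    #include <stdint.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/futex.h>

    /*
     * "*uaddr2 += 1; wake nr1 waiters on uaddr1, plus nr2 waiters on
     * uaddr2 if the old value of *uaddr2 was greater than 0."
     * For FUTEX_WAKE_OP the timeout argument slot carries nr2.
     */
    static long wake_op(uint32_t *uaddr1, uint32_t *uaddr2, int nr1, long nr2)
    {
        unsigned int op = FUTEX_OP(FUTEX_OP_ADD, 1, FUTEX_OP_CMP_GT, 0);

        return syscall(SYS_futex, uaddr1, FUTEX_WAKE_OP, nr1,
                       (void *)nr2, uaddr2, op);
    }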
++
++/*
++ * Wake up all waiters hashed on the physical page that is mapped
++ * to this virtual address:
++ */
++static int
++futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
++ int nr_wake, int nr_wake2, int op)
++{
++ union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
++ struct futex_hash_bucket *hb1, *hb2;
++ struct futex_q *this, *next;
++ int ret, op_ret;
++ DEFINE_WAKE_Q(wake_q);
++
++retry:
++ ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
++ if (unlikely(ret != 0))
++ return ret;
++ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
++ if (unlikely(ret != 0))
++ return ret;
++
++ hb1 = hash_futex(&key1);
++ hb2 = hash_futex(&key2);
++
++retry_private:
++ double_lock_hb(hb1, hb2);
++ op_ret = futex_atomic_op_inuser(op, uaddr2);
++ if (unlikely(op_ret < 0)) {
++ double_unlock_hb(hb1, hb2);
++
++ if (!IS_ENABLED(CONFIG_MMU) ||
++ unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
++ /*
++ * we don't get EFAULT from MMU faults if we don't have
++ * an MMU, but we might get them from range checking
++ */
++ ret = op_ret;
++ return ret;
++ }
++
++ if (op_ret == -EFAULT) {
++ ret = fault_in_user_writeable(uaddr2);
++ if (ret)
++ return ret;
++ }
++
++ if (!(flags & FLAGS_SHARED)) {
++ cond_resched();
++ goto retry_private;
++ }
++
++ cond_resched();
++ goto retry;
++ }
++
++ plist_for_each_entry_safe(this, next, &hb1->chain, list) {
++ if (match_futex(&this->key, &key1)) {
++ if (this->pi_state || this->rt_waiter) {
++ ret = -EINVAL;
++ goto out_unlock;
++ }
++ mark_wake_futex(&wake_q, this);
++ if (++ret >= nr_wake)
++ break;
++ }
++ }
++
++ if (op_ret > 0) {
++ op_ret = 0;
++ plist_for_each_entry_safe(this, next, &hb2->chain, list) {
++ if (match_futex(&this->key, &key2)) {
++ if (this->pi_state || this->rt_waiter) {
++ ret = -EINVAL;
++ goto out_unlock;
++ }
++ mark_wake_futex(&wake_q, this);
++ if (++op_ret >= nr_wake2)
++ break;
++ }
++ }
++ ret += op_ret;
++ }
++
++out_unlock:
++ double_unlock_hb(hb1, hb2);
++ wake_up_q(&wake_q);
++ return ret;
++}
++
++/**
++ * requeue_futex() - Requeue a futex_q from one hb to another
++ * @q: the futex_q to requeue
++ * @hb1: the source hash_bucket
++ * @hb2: the target hash_bucket
++ * @key2: the new key for the requeued futex_q
++ */
++static inline
++void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
++ struct futex_hash_bucket *hb2, union futex_key *key2)
++{
++
++ /*
++ * If key1 and key2 hash to the same bucket, no need to
++ * requeue.
++ */
++ if (likely(&hb1->chain != &hb2->chain)) {
++ plist_del(&q->list, &hb1->chain);
++ hb_waiters_dec(hb1);
++ hb_waiters_inc(hb2);
++ plist_add(&q->list, &hb2->chain);
++ q->lock_ptr = &hb2->lock;
++ }
++ q->key = *key2;
++}
++
++/**
++ * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
++ * @q: the futex_q
++ * @key: the key of the requeue target futex
++ * @hb: the hash_bucket of the requeue target futex
++ *
++ * During futex_requeue, with requeue_pi=1, it is possible to acquire the
++ * target futex if it is uncontended or via a lock steal. Set the futex_q key
++ * to the requeue target futex so the waiter can detect the wakeup on the right
++ * futex, but remove it from the hb and NULL the rt_waiter so it can detect
++ * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock
++ * to protect access to the pi_state to fixup the owner later. Must be called
++ * with both q->lock_ptr and hb->lock held.
++ */
++static inline
++void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
++ struct futex_hash_bucket *hb)
++{
++ q->key = *key;
++
++ __unqueue_futex(q);
++
++ WARN_ON(!q->rt_waiter);
++ q->rt_waiter = NULL;
++
++ q->lock_ptr = &hb->lock;
++
++ wake_up_state(q->task, TASK_NORMAL);
++}
++
++/**
++ * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
++ * @pifutex: the user address of the to futex
++ * @hb1: the from futex hash bucket, must be locked by the caller
++ * @hb2: the to futex hash bucket, must be locked by the caller
++ * @key1: the from futex key
++ * @key2: the to futex key
++ * @ps: address to store the pi_state pointer
++ * @exiting: Pointer to store the task pointer of the owner task
++ * which is in the middle of exiting
++ * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
++ *
++ * Try to get the lock on behalf of the top waiter if we can do it atomically.
++ * Wake the top waiter if we succeed. If the caller specified set_waiters,
++ * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
++ * hb1 and hb2 must be held by the caller.
++ *
++ * @exiting is only set when the return value is -EBUSY. If so, this holds
++ * a refcount on the exiting task on return and the caller needs to drop it
++ * after waiting for the exit to complete.
++ *
++ * Return:
++ * - 0 - failed to acquire the lock atomically;
++ * - >0 - acquired the lock, return value is vpid of the top_waiter
++ * - <0 - error
++ */
++static int
++futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
++ struct futex_hash_bucket *hb2, union futex_key *key1,
++ union futex_key *key2, struct futex_pi_state **ps,
++ struct task_struct **exiting, int set_waiters)
++{
++ struct futex_q *top_waiter = NULL;
++ u32 curval;
++ int ret, vpid;
++
++ if (get_futex_value_locked(&curval, pifutex))
++ return -EFAULT;
++
++ if (unlikely(should_fail_futex(true)))
++ return -EFAULT;
++
++ /*
++ * Find the top_waiter and determine if there are additional waiters.
++ * If the caller intends to requeue more than 1 waiter to pifutex,
++ * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
++ * as we have means to handle the possible fault. If not, don't set
++ * the bit unnecessarily as it will force the subsequent unlock to enter
++ * the kernel.
++ */
++ top_waiter = futex_top_waiter(hb1, key1);
++
++ /* There are no waiters, nothing for us to do. */
++ if (!top_waiter)
++ return 0;
++
++ /* Ensure we requeue to the expected futex. */
++ if (!match_futex(top_waiter->requeue_pi_key, key2))
++ return -EINVAL;
++
++ /*
++ * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in
++ * the contended case or if set_waiters is 1. The pi_state is returned
++ * in ps in contended cases.
++ */
++ vpid = task_pid_vnr(top_waiter->task);
++ ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
++ exiting, set_waiters);
++ if (ret == 1) {
++ requeue_pi_wake_futex(top_waiter, key2, hb2);
++ return vpid;
++ }
++ return ret;
++}
++
++/**
++ * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
++ * @uaddr1: source futex user address
++ * @flags: futex flags (FLAGS_SHARED, etc.)
++ * @uaddr2: target futex user address
++ * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
++ * @nr_requeue: number of waiters to requeue (0-INT_MAX)
++ * @cmpval: @uaddr1 expected value (or %NULL)
++ * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
++ * pi futex (pi to pi requeue is not supported)
++ *
++ * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
++ * uaddr2 atomically on behalf of the top waiter.
++ *
++ * Return:
++ * - >=0 - on success, the number of tasks requeued or woken;
++ * - <0 - on error
++ */
++static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
++ u32 __user *uaddr2, int nr_wake, int nr_requeue,
++ u32 *cmpval, int requeue_pi)
++{
++ union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
++ int task_count = 0, ret;
++ struct futex_pi_state *pi_state = NULL;
++ struct futex_hash_bucket *hb1, *hb2;
++ struct futex_q *this, *next;
++ DEFINE_WAKE_Q(wake_q);
++
++ if (nr_wake < 0 || nr_requeue < 0)
++ return -EINVAL;
++
++ /*
++ * When PI is not supported: return -ENOSYS if requeue_pi is true;
++ * consequently the compiler knows requeue_pi is always false past
++ * this point, which will optimize away all the conditional code
++ * further down.
++ */
++ if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi)
++ return -ENOSYS;
++
++ if (requeue_pi) {
++ /*
++ * Requeue PI only works on two distinct uaddrs. This
++ * check is only valid for private futexes. See below.
++ */
++ if (uaddr1 == uaddr2)
++ return -EINVAL;
++
++ /*
++ * requeue_pi requires a pi_state, try to allocate it now
++ * without any locks in case it fails.
++ */
++ if (refill_pi_state_cache())
++ return -ENOMEM;
++ /*
++ * requeue_pi must wake as many tasks as it can, up to nr_wake
++ * + nr_requeue, since it acquires the rt_mutex prior to
++ * returning to userspace, so as to not leave the rt_mutex with
++ * waiters and no owner. However, second and third wake-ups
++ * cannot be predicted as they involve race conditions with the
++ * first wake and a fault while looking up the pi_state. Both
++ * pthread_cond_signal() and pthread_cond_broadcast() should
++ * use nr_wake=1.
++ */
++ if (nr_wake != 1)
++ return -EINVAL;
++ }
++
++retry:
++ ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
++ if (unlikely(ret != 0))
++ return ret;
++ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
++ requeue_pi ? FUTEX_WRITE : FUTEX_READ);
++ if (unlikely(ret != 0))
++ return ret;
++
++ /*
++ * The check above which compares uaddrs is not sufficient for
++ * shared futexes. We need to compare the keys:
++ */
++ if (requeue_pi && match_futex(&key1, &key2))
++ return -EINVAL;
++
++ hb1 = hash_futex(&key1);
++ hb2 = hash_futex(&key2);
++
++retry_private:
++ hb_waiters_inc(hb2);
++ double_lock_hb(hb1, hb2);
++
++ if (likely(cmpval != NULL)) {
++ u32 curval;
++
++ ret = get_futex_value_locked(&curval, uaddr1);
++
++ if (unlikely(ret)) {
++ double_unlock_hb(hb1, hb2);
++ hb_waiters_dec(hb2);
++
++ ret = get_user(curval, uaddr1);
++ if (ret)
++ return ret;
++
++ if (!(flags & FLAGS_SHARED))
++ goto retry_private;
++
++ goto retry;
++ }
++ if (curval != *cmpval) {
++ ret = -EAGAIN;
++ goto out_unlock;
++ }
++ }
++
++ if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
++ struct task_struct *exiting = NULL;
++
++ /*
++ * Attempt to acquire uaddr2 and wake the top waiter. If we
++ * intend to requeue waiters, force setting the FUTEX_WAITERS
++ * bit. We force this here where we are able to easily handle
++ * faults rather than in the requeue loop below.
++ */
++ ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
++ &key2, &pi_state,
++ &exiting, nr_requeue);
++
++ /*
++ * At this point the top_waiter has either taken uaddr2 or is
++ * waiting on it. If the former, then the pi_state will not
++ * exist yet, look it up one more time to ensure we have a
++ * reference to it. If the lock was taken, ret contains the
++ * vpid of the top waiter task.
++ * If the lock was not taken, we have pi_state and an initial
++ * refcount on it. In case of an error we have nothing.
++ */
++ if (ret > 0) {
++ WARN_ON(pi_state);
++ task_count++;
++ /*
++ * If we acquired the lock, then the user space value
++ * of uaddr2 should be vpid. It cannot be changed by
++ * the top waiter as it is blocked on hb2 lock if it
++ * tries to do so. If something fiddled with it behind
++ * our back, the pi state lookup might unearth it. So we'd rather
++ * use the known value than reread it and hand potential crap
++ * to lookup_pi_state.
++ *
++ * If that call succeeds then we have pi_state and an
++ * initial refcount on it.
++ */
++ ret = lookup_pi_state(uaddr2, ret, hb2, &key2,
++ &pi_state, &exiting);
++ }
++
++ switch (ret) {
++ case 0:
++ /* We hold a reference on the pi state. */
++ break;
++
++ /* If the above failed, then pi_state is NULL */
++ case -EFAULT:
++ double_unlock_hb(hb1, hb2);
++ hb_waiters_dec(hb2);
++ ret = fault_in_user_writeable(uaddr2);
++ if (!ret)
++ goto retry;
++ return ret;
++ case -EBUSY:
++ case -EAGAIN:
++ /*
++ * Two reasons for this:
++ * - EBUSY: Owner is exiting and we just wait for the
++ * exit to complete.
++ * - EAGAIN: The user space value changed.
++ */
++ double_unlock_hb(hb1, hb2);
++ hb_waiters_dec(hb2);
++ /*
++ * Handle the case where the owner is in the middle of
++ * exiting. Wait for the exit to complete otherwise
++ * this task might loop forever, aka. live lock.
++ */
++ wait_for_owner_exiting(ret, exiting);
++ cond_resched();
++ goto retry;
++ default:
++ goto out_unlock;
++ }
++ }
++
++ plist_for_each_entry_safe(this, next, &hb1->chain, list) {
++ if (task_count - nr_wake >= nr_requeue)
++ break;
++
++ if (!match_futex(&this->key, &key1))
++ continue;
++
++ /*
++		 * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
++ * be paired with each other and no other futex ops.
++ *
++ * We should never be requeueing a futex_q with a pi_state,
++ * which is awaiting a futex_unlock_pi().
++ */
++ if ((requeue_pi && !this->rt_waiter) ||
++ (!requeue_pi && this->rt_waiter) ||
++ this->pi_state) {
++ ret = -EINVAL;
++ break;
++ }
++
++ /*
++ * Wake nr_wake waiters. For requeue_pi, if we acquired the
++ * lock, we already woke the top_waiter. If not, it will be
++ * woken by futex_unlock_pi().
++ */
++ if (++task_count <= nr_wake && !requeue_pi) {
++ mark_wake_futex(&wake_q, this);
++ continue;
++ }
++
++ /* Ensure we requeue to the expected futex for requeue_pi. */
++ if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
++ ret = -EINVAL;
++ break;
++ }
++
++ /*
++ * Requeue nr_requeue waiters and possibly one more in the case
++ * of requeue_pi if we couldn't acquire the lock atomically.
++ */
++ if (requeue_pi) {
++ /*
++ * Prepare the waiter to take the rt_mutex. Take a
++ * refcount on the pi_state and store the pointer in
++ * the futex_q object of the waiter.
++ */
++ get_pi_state(pi_state);
++ this->pi_state = pi_state;
++ ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
++ this->rt_waiter,
++ this->task);
++ if (ret == 1) {
++ /*
++ * We got the lock. We do neither drop the
++ * refcount on pi_state nor clear
++ * this->pi_state because the waiter needs the
++ * pi_state for cleaning up the user space
++ * value. It will drop the refcount after
++ * doing so.
++ */
++ requeue_pi_wake_futex(this, &key2, hb2);
++ continue;
++ } else if (ret) {
++ /*
++ * rt_mutex_start_proxy_lock() detected a
++ * potential deadlock when we tried to queue
++ * that waiter. Drop the pi_state reference
++ * which we took above and remove the pointer
++				 * to the state from the waiter's futex_q
++ * object.
++ */
++ this->pi_state = NULL;
++ put_pi_state(pi_state);
++ /*
++ * We stop queueing more waiters and let user
++ * space deal with the mess.
++ */
++ break;
++ }
++ }
++ requeue_futex(this, hb1, hb2, &key2);
++ }
++
++ /*
++ * We took an extra initial reference to the pi_state either
++ * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We
++ * need to drop it here again.
++ */
++ put_pi_state(pi_state);
++
++out_unlock:
++ double_unlock_hb(hb1, hb2);
++ wake_up_q(&wake_q);
++ hb_waiters_dec(hb2);
++ return ret ? ret : task_count;
++}
++
++/* The key must be already stored in q->key. */
++static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
++ __acquires(&hb->lock)
++{
++ struct futex_hash_bucket *hb;
++
++ hb = hash_futex(&q->key);
++
++ /*
++ * Increment the counter before taking the lock so that
++ * a potential waker won't miss a to-be-slept task that is
++ * waiting for the spinlock. This is safe as all queue_lock()
++ * users end up calling queue_me(). Similarly, for housekeeping,
++ * decrement the counter at queue_unlock() when some error has
++ * occurred and we don't end up adding the task to the list.
++ */
++ hb_waiters_inc(hb); /* implies smp_mb(); (A) */
++
++ q->lock_ptr = &hb->lock;
++
++ spin_lock(&hb->lock);
++ return hb;
++}
++
++static inline void
++queue_unlock(struct futex_hash_bucket *hb)
++ __releases(&hb->lock)
++{
++ spin_unlock(&hb->lock);
++ hb_waiters_dec(hb);
++}
++
++static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
++{
++ int prio;
++
++ /*
++ * The priority used to register this element is
++ * - either the real thread-priority for the real-time threads
++ * (i.e. threads with a priority lower than MAX_RT_PRIO)
++ * - or MAX_RT_PRIO for non-RT threads.
++ * Thus, all RT-threads are woken first in priority order, and
++ * the others are woken last, in FIFO order.
++ */
++ prio = min(current->normal_prio, MAX_RT_PRIO);
++
++ plist_node_init(&q->list, prio);
++ plist_add(&q->list, &hb->chain);
++ q->task = current;
++}
++
++/**
++ * queue_me() - Enqueue the futex_q on the futex_hash_bucket
++ * @q: The futex_q to enqueue
++ * @hb: The destination hash bucket
++ *
++ * The hb->lock must be held by the caller, and is released here. A call to
++ * queue_me() is typically paired with exactly one call to unqueue_me(). The
++ * exceptions involve the PI related operations, which may use unqueue_me_pi()
++ * or nothing if the unqueue is done as part of the wake process and the unqueue
++ * state is implicit in the state of the woken task (see futex_wait_requeue_pi() for
++ * an example).
++ */
++static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
++ __releases(&hb->lock)
++{
++ __queue_me(q, hb);
++ spin_unlock(&hb->lock);
++}
++
++/**
++ * unqueue_me() - Remove the futex_q from its futex_hash_bucket
++ * @q: The futex_q to unqueue
++ *
++ * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
++ * be paired with exactly one earlier call to queue_me().
++ *
++ * Return:
++ * - 1 - if the futex_q was still queued (and we unqueued it);
++ * - 0 - if the futex_q was already removed by the waking thread
++ */
++static int unqueue_me(struct futex_q *q)
++{
++ spinlock_t *lock_ptr;
++ int ret = 0;
++
++ /* In the common case we don't take the spinlock, which is nice. */
++retry:
++ /*
++ * q->lock_ptr can change between this read and the following spin_lock.
++ * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
++ * optimizing lock_ptr out of the logic below.
++ */
++ lock_ptr = READ_ONCE(q->lock_ptr);
++ if (lock_ptr != NULL) {
++ spin_lock(lock_ptr);
++ /*
++ * q->lock_ptr can change between reading it and
++ * spin_lock(), causing us to take the wrong lock. This
++ * corrects the race condition.
++ *
++ * Reasoning goes like this: if we have the wrong lock,
++ * q->lock_ptr must have changed (maybe several times)
++ * between reading it and the spin_lock(). It can
++ * change again after the spin_lock() but only if it was
++ * already changed before the spin_lock(). It cannot,
++ * however, change back to the original value. Therefore
++ * we can detect whether we acquired the correct lock.
++ */
++ if (unlikely(lock_ptr != q->lock_ptr)) {
++ spin_unlock(lock_ptr);
++ goto retry;
++ }
++ __unqueue_futex(q);
++
++ BUG_ON(q->pi_state);
++
++ spin_unlock(lock_ptr);
++ ret = 1;
++ }
++
++ return ret;
++}
++
++/*
++ * PI futexes cannot be requeued and must remove themselves from the
++ * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
++ * and dropped here.
++ */
++static void unqueue_me_pi(struct futex_q *q)
++ __releases(q->lock_ptr)
++{
++ __unqueue_futex(q);
++
++ BUG_ON(!q->pi_state);
++ put_pi_state(q->pi_state);
++ q->pi_state = NULL;
++
++ spin_unlock(q->lock_ptr);
++}
++
++static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
++ struct task_struct *argowner)
++{
++ struct futex_pi_state *pi_state = q->pi_state;
++ u32 uval, curval, newval;
++ struct task_struct *oldowner, *newowner;
++ u32 newtid;
++ int ret, err = 0;
++
++ lockdep_assert_held(q->lock_ptr);
++
++ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
++
++ oldowner = pi_state->owner;
++
++ /*
++ * We are here because either:
++ *
++ * - we stole the lock and pi_state->owner needs updating to reflect
++ * that (@argowner == current),
++ *
++ * or:
++ *
++ * - someone stole our lock and we need to fix things to point to the
++ * new owner (@argowner == NULL).
++ *
++ * Either way, we have to replace the TID in the user space variable.
++ * This must be atomic as we have to preserve the owner died bit here.
++ *
++ * Note: We write the user space value _before_ changing the pi_state
++ * because we can fault here. Imagine swapped out pages or a fork
++ * that marked all the anonymous memory readonly for cow.
++ *
++ * Modifying pi_state _before_ the user space value would leave the
++ * pi_state in an inconsistent state when we fault here, because we
++ * need to drop the locks to handle the fault. This might be observed
++ * in the PID check in lookup_pi_state.
++ */
++retry:
++ if (!argowner) {
++ if (oldowner != current) {
++ /*
++ * We raced against a concurrent self; things are
++ * already fixed up. Nothing to do.
++ */
++ ret = 0;
++ goto out_unlock;
++ }
++
++ if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
++ /* We got the lock after all, nothing to fix. */
++ ret = 0;
++ goto out_unlock;
++ }
++
++ /*
++		 * Since we just failed the trylock, there must be an owner.
++ */
++ newowner = rt_mutex_owner(&pi_state->pi_mutex);
++ BUG_ON(!newowner);
++ } else {
++ WARN_ON_ONCE(argowner != current);
++ if (oldowner == current) {
++ /*
++ * We raced against a concurrent self; things are
++ * already fixed up. Nothing to do.
++ */
++ ret = 0;
++ goto out_unlock;
++ }
++ newowner = argowner;
++ }
++
++ newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
++ /* Owner died? */
++ if (!pi_state->owner)
++ newtid |= FUTEX_OWNER_DIED;
++
++ err = get_futex_value_locked(&uval, uaddr);
++ if (err)
++ goto handle_err;
++
++ for (;;) {
++ newval = (uval & FUTEX_OWNER_DIED) | newtid;
++
++ err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
++ if (err)
++ goto handle_err;
++
++ if (curval == uval)
++ break;
++ uval = curval;
++ }
++
++ /*
++ * We fixed up user space. Now we need to fix the pi_state
++ * itself.
++ */
++ if (pi_state->owner != NULL) {
++ raw_spin_lock(&pi_state->owner->pi_lock);
++ WARN_ON(list_empty(&pi_state->list));
++ list_del_init(&pi_state->list);
++ raw_spin_unlock(&pi_state->owner->pi_lock);
++ }
++
++ pi_state->owner = newowner;
++
++ raw_spin_lock(&newowner->pi_lock);
++ WARN_ON(!list_empty(&pi_state->list));
++ list_add(&pi_state->list, &newowner->pi_state_list);
++ raw_spin_unlock(&newowner->pi_lock);
++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
++
++ return 0;
++
++ /*
++ * In order to reschedule or handle a page fault, we need to drop the
++ * locks here. In the case of a fault, this gives the other task
++ * (either the highest priority waiter itself or the task which stole
++ * the rtmutex) the chance to try the fixup of the pi_state. So once we
++ * are back from handling the fault we need to check the pi_state after
++ * reacquiring the locks and before trying to do another fixup. When
++ * the fixup has been done already we simply return.
++ *
++ * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
++ * drop hb->lock since the caller owns the hb -> futex_q relation.
++ * Dropping the pi_mutex->wait_lock requires revalidating the state.
++ */
++handle_err:
++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
++ spin_unlock(q->lock_ptr);
++
++ switch (err) {
++ case -EFAULT:
++ ret = fault_in_user_writeable(uaddr);
++ break;
++
++ case -EAGAIN:
++ cond_resched();
++ ret = 0;
++ break;
++
++ default:
++ WARN_ON_ONCE(1);
++ ret = err;
++ break;
++ }
++
++ spin_lock(q->lock_ptr);
++ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
++
++ /*
++ * Check if someone else fixed it for us:
++ */
++ if (pi_state->owner != oldowner) {
++ ret = 0;
++ goto out_unlock;
++ }
++
++ if (ret)
++ goto out_unlock;
++
++ goto retry;
++
++out_unlock:
++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
++ return ret;
++}
++
++static long futex_wait_restart(struct restart_block *restart);
++
++/**
++ * fixup_owner() - Post lock pi_state and corner case management
++ * @uaddr: user address of the futex
++ * @q: futex_q (contains pi_state and access to the rt_mutex)
++ * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
++ *
++ * After attempting to lock an rt_mutex, this function is called to clean up
++ * the pi_state owner as well as handle race conditions that may allow us to
++ * acquire the lock. Must be called with the hb lock held.
++ *
++ * Return:
++ * - 1 - success, lock taken;
++ * - 0 - success, lock not taken;
++ * - <0 - on error (-EFAULT)
++ */
++static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
++{
++ int ret = 0;
++
++ if (locked) {
++ /*
++ * Got the lock. We might not be the anticipated owner if we
++ * did a lock-steal - fix up the PI-state in that case:
++ *
++ * Speculative pi_state->owner read (we don't hold wait_lock);
++ * since we own the lock pi_state->owner == current is the
++ * stable state, anything else needs more attention.
++ */
++ if (q->pi_state->owner != current)
++ ret = fixup_pi_state_owner(uaddr, q, current);
++ return ret ? ret : locked;
++ }
++
++ /*
++	 * If we didn't get the lock, check if anybody stole it from us. In
++ * that case, we need to fix up the uval to point to them instead of
++ * us, otherwise bad things happen. [10]
++ *
++ * Another speculative read; pi_state->owner == current is unstable
++ * but needs our attention.
++ */
++ if (q->pi_state->owner == current) {
++ ret = fixup_pi_state_owner(uaddr, q, NULL);
++ return ret;
++ }
++
++ /*
++ * Paranoia check. If we did not take the lock, then we should not be
++ * the owner of the rt_mutex.
++ */
++ if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) {
++ printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
++ "pi-state %p\n", ret,
++ q->pi_state->pi_mutex.owner,
++ q->pi_state->owner);
++ }
++
++ return ret;
++}
++
++/**
++ * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
++ * @hb: the futex hash bucket, must be locked by the caller
++ * @q: the futex_q to queue up on
++ * @timeout: the prepared hrtimer_sleeper, or null for no timeout
++ */
++static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
++ struct hrtimer_sleeper *timeout)
++{
++ /*
++ * The task state is guaranteed to be set before another task can
++ * wake it. set_current_state() is implemented using smp_store_mb() and
++ * queue_me() calls spin_unlock() upon completion, both serializing
++ * access to the hash list and forcing another memory barrier.
++ */
++ set_current_state(TASK_INTERRUPTIBLE);
++ queue_me(q, hb);
++
++ /* Arm the timer */
++ if (timeout)
++ hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS);
++
++ /*
++ * If we have been removed from the hash list, then another task
++ * has tried to wake us, and we can skip the call to schedule().
++ */
++ if (likely(!plist_node_empty(&q->list))) {
++ /*
++ * If the timer has already expired, current will already be
++ * flagged for rescheduling. Only call schedule if there
++ * is no timeout, or if it has yet to expire.
++ */
++ if (!timeout || timeout->task)
++ freezable_schedule();
++ }
++ __set_current_state(TASK_RUNNING);
++}
++
++/**
++ * futex_wait_setup() - Prepare to wait on a futex
++ * @uaddr: the futex userspace address
++ * @val: the expected value
++ * @flags: futex flags (FLAGS_SHARED, etc.)
++ * @q: the associated futex_q
++ * @hb: storage for hash_bucket pointer to be returned to caller
++ *
++ * Setup the futex_q and locate the hash_bucket. Get the futex value and
++ * compare it with the expected value. Handle atomic faults internally.
++ * Return with the hb lock held and a q.key reference on success, and unlocked
++ * with no q.key reference on failure.
++ *
++ * Return:
++ * - 0 - uaddr contains val and hb has been locked;
++ * - <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
++ */
++static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
++ struct futex_q *q, struct futex_hash_bucket **hb)
++{
++ u32 uval;
++ int ret;
++
++ /*
++ * Access the page AFTER the hash-bucket is locked.
++ * Order is important:
++ *
++ * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
++ * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); }
++ *
++ * The basic logical guarantee of a futex is that it blocks ONLY
++ * if cond(var) is known to be true at the time of blocking, for
++ * any cond. If we locked the hash-bucket after testing *uaddr, that
++ * would open a race condition where we could block indefinitely with
++ * cond(var) false, which would violate the guarantee.
++ *
++ * On the other hand, we insert q and release the hash-bucket only
++ * after testing *uaddr. This guarantees that futex_wait() will NOT
++ * absorb a wakeup if *uaddr does not match the desired values
++ * while the syscall executes.
++ */
++retry:
++ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ);
++ if (unlikely(ret != 0))
++ return ret;
++
++retry_private:
++ *hb = queue_lock(q);
++
++ ret = get_futex_value_locked(&uval, uaddr);
++
++ if (ret) {
++ queue_unlock(*hb);
++
++ ret = get_user(uval, uaddr);
++ if (ret)
++ return ret;
++
++ if (!(flags & FLAGS_SHARED))
++ goto retry_private;
++
++ goto retry;
++ }
++
++ if (uval != val) {
++ queue_unlock(*hb);
++ ret = -EWOULDBLOCK;
++ }
++
++ return ret;
++}
++
++static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
++ ktime_t *abs_time, u32 bitset)
++{
++ struct hrtimer_sleeper timeout, *to;
++ struct restart_block *restart;
++ struct futex_hash_bucket *hb;
++ struct futex_q q = futex_q_init;
++ int ret;
++
++ if (!bitset)
++ return -EINVAL;
++ q.bitset = bitset;
++
++ to = futex_setup_timer(abs_time, &timeout, flags,
++ current->timer_slack_ns);
++retry:
++ /*
++ * Prepare to wait on uaddr. On success, holds hb lock and increments
++ * q.key refs.
++ */
++ ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
++ if (ret)
++ goto out;
++
++ /* queue_me and wait for wakeup, timeout, or a signal. */
++ futex_wait_queue_me(hb, &q, to);
++
++ /* If we were woken (and unqueued), we succeeded, whatever. */
++ ret = 0;
++ /* unqueue_me() drops q.key ref */
++ if (!unqueue_me(&q))
++ goto out;
++ ret = -ETIMEDOUT;
++ if (to && !to->task)
++ goto out;
++
++ /*
++ * We expect signal_pending(current), but we might be the
++ * victim of a spurious wakeup as well.
++ */
++ if (!signal_pending(current))
++ goto retry;
++
++ ret = -ERESTARTSYS;
++ if (!abs_time)
++ goto out;
++
++ restart = &current->restart_block;
++ restart->fn = futex_wait_restart;
++ restart->futex.uaddr = uaddr;
++ restart->futex.val = val;
++ restart->futex.time = *abs_time;
++ restart->futex.bitset = bitset;
++ restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
++
++ ret = -ERESTART_RESTARTBLOCK;
++
++out:
++ if (to) {
++ hrtimer_cancel(&to->timer);
++ destroy_hrtimer_on_stack(&to->timer);
++ }
++ return ret;
++}
++
++
++static long futex_wait_restart(struct restart_block *restart)
++{
++ u32 __user *uaddr = restart->futex.uaddr;
++ ktime_t t, *tp = NULL;
++
++ if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
++ t = restart->futex.time;
++ tp = &t;
++ }
++ restart->fn = do_no_restart_syscall;
++
++ return (long)futex_wait(uaddr, restart->futex.flags,
++ restart->futex.val, tp, restart->futex.bitset);
++}
++
++
++/*
++ * Userspace tried a 0 -> TID atomic transition of the futex value
++ * and failed. The kernel side here does the whole locking operation:
++ * if there are waiters then it will block as a consequence of relying
++ * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
++ * a 0 value of the futex too.).
++ *
++ * Also serves as the futex trylock_pi() operation, with the corresponding
++ * semantics.
++ */
++static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
++ ktime_t *time, int trylock)
++{
++ struct hrtimer_sleeper timeout, *to;
++ struct futex_pi_state *pi_state = NULL;
++ struct task_struct *exiting = NULL;
++ struct rt_mutex_waiter rt_waiter;
++ struct futex_hash_bucket *hb;
++ struct futex_q q = futex_q_init;
++ int res, ret;
++
++ if (!IS_ENABLED(CONFIG_FUTEX_PI))
++ return -ENOSYS;
++
++ if (refill_pi_state_cache())
++ return -ENOMEM;
++
++ to = futex_setup_timer(time, &timeout, FLAGS_CLOCKRT, 0);
++
++retry:
++ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
++ if (unlikely(ret != 0))
++ goto out;
++
++retry_private:
++ hb = queue_lock(&q);
++
++ ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
++ &exiting, 0);
++ if (unlikely(ret)) {
++ /*
++ * Atomic work succeeded and we got the lock,
++ * or failed. Either way, we do _not_ block.
++ */
++ switch (ret) {
++ case 1:
++ /* We got the lock. */
++ ret = 0;
++ goto out_unlock_put_key;
++ case -EFAULT:
++ goto uaddr_faulted;
++ case -EBUSY:
++ case -EAGAIN:
++ /*
++ * Two reasons for this:
++ * - EBUSY: Task is exiting and we just wait for the
++ * exit to complete.
++ * - EAGAIN: The user space value changed.
++ */
++ queue_unlock(hb);
++ /*
++ * Handle the case where the owner is in the middle of
++ * exiting. Wait for the exit to complete otherwise
++ * this task might loop forever, aka. live lock.
++ */
++ wait_for_owner_exiting(ret, exiting);
++ cond_resched();
++ goto retry;
++ default:
++ goto out_unlock_put_key;
++ }
++ }
++
++ WARN_ON(!q.pi_state);
++
++ /*
++ * Only actually queue now that the atomic ops are done:
++ */
++ __queue_me(&q, hb);
++
++ if (trylock) {
++ ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
++ /* Fixup the trylock return value: */
++ ret = ret ? 0 : -EWOULDBLOCK;
++ goto no_block;
++ }
++
++ rt_mutex_init_waiter(&rt_waiter);
++
++ /*
++ * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
++ * hold it while doing rt_mutex_start_proxy(), because then it will
++	 * include hb->lock in the blocking chain, even though we'll not in
++ * fact hold it while blocking. This will lead it to report -EDEADLK
++ * and BUG when futex_unlock_pi() interleaves with this.
++ *
++ * Therefore acquire wait_lock while holding hb->lock, but drop the
++ * latter before calling __rt_mutex_start_proxy_lock(). This
++ * interleaves with futex_unlock_pi() -- which does a similar lock
++ * handoff -- such that the latter can observe the futex_q::pi_state
++ * before __rt_mutex_start_proxy_lock() is done.
++ */
++ raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
++ spin_unlock(q.lock_ptr);
++ /*
++ * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
++ * such that futex_unlock_pi() is guaranteed to observe the waiter when
++ * it sees the futex_q::pi_state.
++ */
++ ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
++ raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
++
++ if (ret) {
++ if (ret == 1)
++ ret = 0;
++ goto cleanup;
++ }
++
++ if (unlikely(to))
++ hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
++
++ ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
++
++cleanup:
++ spin_lock(q.lock_ptr);
++ /*
++ * If we failed to acquire the lock (deadlock/signal/timeout), we must
++ * first acquire the hb->lock before removing the lock from the
++ * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
++ * lists consistent.
++ *
++ * In particular; it is important that futex_unlock_pi() can not
++ * observe this inconsistency.
++ */
++ if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
++ ret = 0;
++
++no_block:
++ /*
++ * Fixup the pi_state owner and possibly acquire the lock if we
++ * haven't already.
++ */
++ res = fixup_owner(uaddr, &q, !ret);
++ /*
++	 * If fixup_owner() returned an error, propagate that. If it acquired
++ * the lock, clear our -ETIMEDOUT or -EINTR.
++ */
++ if (res)
++ ret = (res < 0) ? res : 0;
++
++ /*
++ * If fixup_owner() faulted and was unable to handle the fault, unlock
++ * it and return the fault to userspace.
++ */
++ if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) {
++ pi_state = q.pi_state;
++ get_pi_state(pi_state);
++ }
++
++ /* Unqueue and drop the lock */
++ unqueue_me_pi(&q);
++
++ if (pi_state) {
++ rt_mutex_futex_unlock(&pi_state->pi_mutex);
++ put_pi_state(pi_state);
++ }
++
++ goto out;
++
++out_unlock_put_key:
++ queue_unlock(hb);
++
++out:
++ if (to) {
++ hrtimer_cancel(&to->timer);
++ destroy_hrtimer_on_stack(&to->timer);
++ }
++ return ret != -EINTR ? ret : -ERESTARTNOINTR;
++
++uaddr_faulted:
++ queue_unlock(hb);
++
++ ret = fault_in_user_writeable(uaddr);
++ if (ret)
++ goto out;
++
++ if (!(flags & FLAGS_SHARED))
++ goto retry_private;
++
++ goto retry;
++}
++
++/*
++ * Userspace attempted a TID -> 0 atomic transition, and failed.
++ * This is the in-kernel slowpath: we look up the PI state (if any),
++ * and do the rt-mutex unlock.
++ */
++static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
++{
++ u32 curval, uval, vpid = task_pid_vnr(current);
++ union futex_key key = FUTEX_KEY_INIT;
++ struct futex_hash_bucket *hb;
++ struct futex_q *top_waiter;
++ int ret;
++
++ if (!IS_ENABLED(CONFIG_FUTEX_PI))
++ return -ENOSYS;
++
++retry:
++ if (get_user(uval, uaddr))
++ return -EFAULT;
++ /*
++ * We release only a lock we actually own:
++ */
++ if ((uval & FUTEX_TID_MASK) != vpid)
++ return -EPERM;
++
++ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
++ if (ret)
++ return ret;
++
++ hb = hash_futex(&key);
++ spin_lock(&hb->lock);
++
++ /*
++ * Check waiters first. We do not trust user space values at
++ * all and we at least want to know if user space fiddled
++ * with the futex value instead of blindly unlocking.
++ */
++ top_waiter = futex_top_waiter(hb, &key);
++ if (top_waiter) {
++ struct futex_pi_state *pi_state = top_waiter->pi_state;
++
++ ret = -EINVAL;
++ if (!pi_state)
++ goto out_unlock;
++
++ /*
++ * If current does not own the pi_state then the futex is
++ * inconsistent and user space fiddled with the futex value.
++ */
++ if (pi_state->owner != current)
++ goto out_unlock;
++
++ get_pi_state(pi_state);
++ /*
++ * By taking wait_lock while still holding hb->lock, we ensure
++ * there is no point where we hold neither; and therefore
++ * wake_futex_pi() must observe a state consistent with what we
++ * observed.
++ *
++ * In particular; this forces __rt_mutex_start_proxy() to
++ * complete such that we're guaranteed to observe the
++ * rt_waiter. Also see the WARN in wake_futex_pi().
++ */
++ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
++ spin_unlock(&hb->lock);
++
++ /* drops pi_state->pi_mutex.wait_lock */
++ ret = wake_futex_pi(uaddr, uval, pi_state);
++
++ put_pi_state(pi_state);
++
++ /*
++ * Success, we're done! No tricky corner cases.
++ */
++ if (!ret)
++ goto out_putkey;
++ /*
++ * The atomic access to the futex value generated a
++ * pagefault, so retry the user-access and the wakeup:
++ */
++ if (ret == -EFAULT)
++ goto pi_faulted;
++ /*
++		 * An unconditional UNLOCK_PI op raced against a waiter
++ * setting the FUTEX_WAITERS bit. Try again.
++ */
++ if (ret == -EAGAIN)
++ goto pi_retry;
++ /*
++ * wake_futex_pi has detected invalid state. Tell user
++ * space.
++ */
++ goto out_putkey;
++ }
++
++ /*
++ * We have no kernel internal state, i.e. no waiters in the
++ * kernel. Waiters which are about to queue themselves are stuck
++	 * on hb->lock. So we can safely ignore them. We neither
++	 * preserve the WAITERS bit nor the OWNER_DIED one. We are the
++ * owner.
++ */
++ if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) {
++ spin_unlock(&hb->lock);
++ switch (ret) {
++ case -EFAULT:
++ goto pi_faulted;
++
++ case -EAGAIN:
++ goto pi_retry;
++
++ default:
++ WARN_ON_ONCE(1);
++ goto out_putkey;
++ }
++ }
++
++ /*
++ * If uval has changed, let user space handle it.
++ */
++ ret = (curval == uval) ? 0 : -EAGAIN;
++
++out_unlock:
++ spin_unlock(&hb->lock);
++out_putkey:
++ return ret;
++
++pi_retry:
++ cond_resched();
++ goto retry;
++
++pi_faulted:
++
++ ret = fault_in_user_writeable(uaddr);
++ if (!ret)
++ goto retry;
++
++ return ret;
++}
++
++/**
++ * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
++ * @hb:		the hash_bucket futex_q was originally enqueued on
++ * @q: the futex_q woken while waiting to be requeued
++ * @key2: the futex_key of the requeue target futex
++ * @timeout: the timeout associated with the wait (NULL if none)
++ *
++ * Detect if the task was woken on the initial futex as opposed to the requeue
++ * target futex. If so, determine if it was a timeout or a signal that caused
++ * the wakeup and return the appropriate error code to the caller. Must be
++ * called with the hb lock held.
++ *
++ * Return:
++ * - 0 = no early wakeup detected;
++ * - <0 = -ETIMEDOUT or -ERESTARTNOINTR
++ */
++static inline
++int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
++ struct futex_q *q, union futex_key *key2,
++ struct hrtimer_sleeper *timeout)
++{
++ int ret = 0;
++
++ /*
++ * With the hb lock held, we avoid races while we process the wakeup.
++ * We only need to hold hb (and not hb2) to ensure atomicity as the
++ * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
++ * It can't be requeued from uaddr2 to something else since we don't
++ * support a PI aware source futex for requeue.
++ */
++ if (!match_futex(&q->key, key2)) {
++ WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
++ /*
++ * We were woken prior to requeue by a timeout or a signal.
++ * Unqueue the futex_q and determine which it was.
++ */
++ plist_del(&q->list, &hb->chain);
++ hb_waiters_dec(hb);
++
++ /* Handle spurious wakeups gracefully */
++ ret = -EWOULDBLOCK;
++ if (timeout && !timeout->task)
++ ret = -ETIMEDOUT;
++ else if (signal_pending(current))
++ ret = -ERESTARTNOINTR;
++ }
++ return ret;
++}
++
++/**
++ * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
++ * @uaddr: the futex we initially wait on (non-pi)
++ * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
++ * the same type, no requeueing from private to shared, etc.
++ * @val: the expected value of uaddr
++ * @abs_time: absolute timeout
++ * @bitset: 32 bit wakeup bitset set by userspace, defaults to all
++ * @uaddr2: the pi futex we will take prior to returning to user-space
++ *
++ * The caller will wait on uaddr and will be requeued by futex_requeue() to
++ * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake
++ * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
++ * userspace. This ensures the rt_mutex maintains an owner when it has waiters;
++ * without one, the pi logic would not know which task to boost/deboost, if
++ * there was a need to.
++ *
++ * We call schedule in futex_wait_queue_me() when we enqueue and return there
++ * via the following--
++ * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
++ * 2) wakeup on uaddr2 after a requeue
++ * 3) signal
++ * 4) timeout
++ *
++ * If 3, cleanup and return -ERESTARTNOINTR.
++ *
++ * If 2, we may then block on trying to take the rt_mutex and return via:
++ * 5) successful lock
++ * 6) signal
++ * 7) timeout
++ * 8) other lock acquisition failure
++ *
++ * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
++ *
++ * If 4 or 7, we cleanup and return with -ETIMEDOUT.
++ *
++ * Return:
++ * - 0 - On success;
++ * - <0 - On error
++ */
++static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
++ u32 val, ktime_t *abs_time, u32 bitset,
++ u32 __user *uaddr2)
++{
++ struct hrtimer_sleeper timeout, *to;
++ struct futex_pi_state *pi_state = NULL;
++ struct rt_mutex_waiter rt_waiter;
++ struct futex_hash_bucket *hb;
++ union futex_key key2 = FUTEX_KEY_INIT;
++ struct futex_q q = futex_q_init;
++ int res, ret;
++
++ if (!IS_ENABLED(CONFIG_FUTEX_PI))
++ return -ENOSYS;
++
++ if (uaddr == uaddr2)
++ return -EINVAL;
++
++ if (!bitset)
++ return -EINVAL;
++
++ to = futex_setup_timer(abs_time, &timeout, flags,
++ current->timer_slack_ns);
++
++ /*
++ * The waiter is allocated on our stack, manipulated by the requeue
++ * code while we sleep on uaddr.
++ */
++ rt_mutex_init_waiter(&rt_waiter);
++
++ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
++ if (unlikely(ret != 0))
++ goto out;
++
++ q.bitset = bitset;
++ q.rt_waiter = &rt_waiter;
++ q.requeue_pi_key = &key2;
++
++ /*
++ * Prepare to wait on uaddr. On success, increments q.key (key1) ref
++ * count.
++ */
++ ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
++ if (ret)
++ goto out;
++
++ /*
++ * The check above which compares uaddrs is not sufficient for
++ * shared futexes. We need to compare the keys:
++ */
++ if (match_futex(&q.key, &key2)) {
++ queue_unlock(hb);
++ ret = -EINVAL;
++ goto out;
++ }
++
++ /* Queue the futex_q, drop the hb lock, wait for wakeup. */
++ futex_wait_queue_me(hb, &q, to);
++
++ spin_lock(&hb->lock);
++ ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
++ spin_unlock(&hb->lock);
++ if (ret)
++ goto out;
++
++ /*
++ * In order for us to be here, we know our q.key == key2, and since
++ * we took the hb->lock above, we also know that futex_requeue() has
++ * completed and we no longer have to concern ourselves with a wakeup
++ * race with the atomic proxy lock acquisition by the requeue code. The
++ * futex_requeue dropped our key1 reference and incremented our key2
++ * reference count.
++ */
++
++ /* Check if the requeue code acquired the second futex for us. */
++ if (!q.rt_waiter) {
++ /*
++ * Got the lock. We might not be the anticipated owner if we
++ * did a lock-steal - fix up the PI-state in that case.
++ */
++ if (q.pi_state && (q.pi_state->owner != current)) {
++ spin_lock(q.lock_ptr);
++ ret = fixup_pi_state_owner(uaddr2, &q, current);
++ if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
++ pi_state = q.pi_state;
++ get_pi_state(pi_state);
++ }
++ /*
++ * Drop the reference to the pi state which
++ * the requeue_pi() code acquired for us.
++ */
++ put_pi_state(q.pi_state);
++ spin_unlock(q.lock_ptr);
++ }
++ } else {
++ struct rt_mutex *pi_mutex;
++
++ /*
++ * We have been woken up by futex_unlock_pi(), a timeout, or a
++ * signal. futex_unlock_pi() will not destroy the lock_ptr nor
++ * the pi_state.
++ */
++ WARN_ON(!q.pi_state);
++ pi_mutex = &q.pi_state->pi_mutex;
++ ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
++
++ spin_lock(q.lock_ptr);
++ if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
++ ret = 0;
++
++ debug_rt_mutex_free_waiter(&rt_waiter);
++ /*
++ * Fixup the pi_state owner and possibly acquire the lock if we
++ * haven't already.
++ */
++ res = fixup_owner(uaddr2, &q, !ret);
++ /*
++		 * If fixup_owner() returned an error, propagate that. If it
++ * acquired the lock, clear -ETIMEDOUT or -EINTR.
++ */
++ if (res)
++ ret = (res < 0) ? res : 0;
++
++ /*
++ * If fixup_pi_state_owner() faulted and was unable to handle
++ * the fault, unlock the rt_mutex and return the fault to
++ * userspace.
++ */
++ if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
++ pi_state = q.pi_state;
++ get_pi_state(pi_state);
++ }
++
++ /* Unqueue and drop the lock. */
++ unqueue_me_pi(&q);
++ }
++
++ if (pi_state) {
++ rt_mutex_futex_unlock(&pi_state->pi_mutex);
++ put_pi_state(pi_state);
++ }
++
++ if (ret == -EINTR) {
++ /*
++ * We've already been requeued, but cannot restart by calling
++ * futex_lock_pi() directly. We could restart this syscall, but
++ * it would detect that the user space "val" changed and return
++ * -EWOULDBLOCK. Save the overhead of the restart and return
++ * -EWOULDBLOCK directly.
++ */
++ ret = -EWOULDBLOCK;
++ }
++
++out:
++ if (to) {
++ hrtimer_cancel(&to->timer);
++ destroy_hrtimer_on_stack(&to->timer);
++ }
++ return ret;
++}
++
++static long do_futex1(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
++ u32 __user *uaddr2, u32 val2, u32 val3)
++{
++ int cmd = op & FUTEX_CMD_MASK;
++ unsigned int flags = 0;
++
++ if (!(op & FUTEX_PRIVATE_FLAG))
++ flags |= FLAGS_SHARED;
++
++ if (op & FUTEX_CLOCK_REALTIME) {
++ flags |= FLAGS_CLOCKRT;
++		if (cmd != FUTEX_WAIT && cmd != FUTEX_WAIT_BITSET &&
++ cmd != FUTEX_WAIT_REQUEUE_PI)
++ return -ENOSYS;
++ }
++
++ switch (cmd) {
++ case FUTEX_LOCK_PI:
++ case FUTEX_UNLOCK_PI:
++ case FUTEX_TRYLOCK_PI:
++ case FUTEX_WAIT_REQUEUE_PI:
++ case FUTEX_CMP_REQUEUE_PI:
++ if (!futex_cmpxchg_enabled)
++ return -ENOSYS;
++ }
++
++ switch (cmd) {
++ case FUTEX_WAIT:
++ val3 = FUTEX_BITSET_MATCH_ANY;
++ fallthrough;
++ case FUTEX_WAIT_BITSET:
++ return futex_wait(uaddr, flags, val, timeout, val3);
++ case FUTEX_WAKE:
++ val3 = FUTEX_BITSET_MATCH_ANY;
++ fallthrough;
++ case FUTEX_WAKE_BITSET:
++ return futex_wake(uaddr, flags, val, val3);
++ case FUTEX_REQUEUE:
++ return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
++ case FUTEX_CMP_REQUEUE:
++ return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
++ case FUTEX_WAKE_OP:
++ return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
++ case FUTEX_LOCK_PI:
++ return futex_lock_pi(uaddr, flags, timeout, 0);
++ case FUTEX_UNLOCK_PI:
++ return futex_unlock_pi(uaddr, flags);
++ case FUTEX_TRYLOCK_PI:
++ return futex_lock_pi(uaddr, flags, NULL, 1);
++ case FUTEX_WAIT_REQUEUE_PI:
++ val3 = FUTEX_BITSET_MATCH_ANY;
++ return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
++ uaddr2);
++ case FUTEX_CMP_REQUEUE_PI:
++ return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
++ }
++ return -ENOSYS;
++}
++
++
++SYSCALL_DEFINE6(futex1, u32 __user *, uaddr, int, op, u32, val,
++ struct __kernel_timespec __user *, utime, u32 __user *, uaddr2,
++ u32, val3)
++{
++ struct timespec64 ts;
++ ktime_t t, *tp = NULL;
++ u32 val2 = 0;
++ int cmd = op & FUTEX_CMD_MASK;
++
++ if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
++ cmd == FUTEX_WAIT_BITSET ||
++ cmd == FUTEX_WAIT_REQUEUE_PI)) {
++ if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
++ return -EFAULT;
++ if (get_timespec64(&ts, utime))
++ return -EFAULT;
++ if (!timespec64_valid(&ts))
++ return -EINVAL;
++
++ t = timespec64_to_ktime(ts);
++ if (cmd == FUTEX_WAIT)
++ t = ktime_add_safe(ktime_get(), t);
++ tp = &t;
++ }
++ /*
++ * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.
++ * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
++ */
++ if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
++ cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
++ val2 = (u32) (unsigned long) utime;
++
++ return do_futex1(uaddr, op, val, tp, uaddr2, val2, val3);
++}
++
++static void __init futex_detect_cmpxchg(void)
++{
++#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
++ u32 curval;
++
++ /*
++ * This will fail and we want it. Some arch implementations do
++ * runtime detection of the futex_atomic_cmpxchg_inatomic()
++ * functionality. We want to know that before we call in any
++ * of the complex code paths. Also we want to prevent
++ * registration of robust lists in that case. NULL is
++ * guaranteed to fault and we get -EFAULT on functional
++ * implementation, the non-functional ones will return
++ * -ENOSYS.
++ */
++ if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
++ futex_cmpxchg_enabled = 1;
++#endif
++}
++
++static int __init futex_init(void)
++{
++ unsigned int futex_shift;
++ unsigned long i;
++
++#if CONFIG_BASE_SMALL
++ futex_hashsize = 16;
++#else
++ futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
++#endif
++
++ futex_queues = alloc_large_system_hash("futex1", sizeof(*futex_queues),
++ futex_hashsize, 0,
++ futex_hashsize < 256 ? HASH_SMALL : 0,
++ &futex_shift, NULL,
++ futex_hashsize, futex_hashsize);
++ futex_hashsize = 1UL << futex_shift;
++
++ futex_detect_cmpxchg();
++
++ for (i = 0; i < futex_hashsize; i++) {
++ atomic_set(&futex_queues[i].waiters, 0);
++ plist_head_init(&futex_queues[i].chain);
++ spin_lock_init(&futex_queues[i].lock);
++ }
++
++ return 0;
++}
++core_initcall(futex_init);
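For context, a minimal userspace sketch of driving the multiplexed futex1 entry point defined above. The wrapper is illustrative, not part of this series: the fallback syscall number 442 matches the x86-64 table wired up below, struct timespec stands in for the __kernel_timespec the syscall expects (same layout on 64-bit), and the op constants come from the stock <linux/futex.h>.

    #define _GNU_SOURCE
    #include <stdint.h>
    #include <time.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/futex.h>        /* FUTEX_WAIT, FUTEX_WAKE, FUTEX_PRIVATE_FLAG */

    #ifndef __NR_futex1
    #define __NR_futex1 442         /* per the syscall tables in this series */
    #endif

    /* Illustrative wrapper mirroring the legacy futex() calling convention. */
    static long futex1(uint32_t *uaddr, int op, uint32_t val,
                       const struct timespec *timeout, uint32_t *uaddr2,
                       uint32_t val3)
    {
            return syscall(__NR_futex1, uaddr, op, val, timeout, uaddr2, val3);
    }

    /* e.g. block while *f == 0, private futex, no timeout:
     *      futex1(f, FUTEX_WAIT | FUTEX_PRIVATE_FLAG, 0, NULL, NULL, 0);
     */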
+diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
+index 3e1a713d3e57..b53a24a99a14 100644
+--- a/kernel/sys_ni.c
++++ b/kernel/sys_ni.c
+@@ -153,6 +153,8 @@ COND_SYSCALL(futex_wait);
+ COND_SYSCALL(futex_wake);
+ COND_SYSCALL(futex_waitv);
+
++COND_SYSCALL(futex1);
++
+ /* kernel/hrtimer.c */
+
+ /* kernel/itimer.c */
+diff --git a/tools/arch/x86/include/asm/unistd_64.h b/tools/arch/x86/include/asm/unistd_64.h
+index 4205ed4158bf..43de5a59ac1c 100644
+--- a/tools/arch/x86/include/asm/unistd_64.h
++++ b/tools/arch/x86/include/asm/unistd_64.h
+@@ -17,3 +17,15 @@
+ #ifndef __NR_setns
+ #define __NR_setns 308
+ #endif
++
++#ifndef __NR_futex_wait
++#define __NR_futex_wait 440
++#endif
++
++#ifndef __NR_futex_wake
++#define __NR_futex_wake 441
++#endif
++
++#ifndef __NR_futex1
++#define __NR_futex1 442
++#endif
+diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h
+index dd457de21bad..f737eaeecbb6 100644
+--- a/tools/include/uapi/asm-generic/unistd.h
++++ b/tools/include/uapi/asm-generic/unistd.h
+@@ -862,11 +862,15 @@ __SYSCALL(__NR_faccessat2, sys_faccessat2)
+
+ #define __NR_futex_wait 440
+ __SYSCALL(__NR_futex_wait, sys_futex_wait)
++
+ #define __NR_futex_wake 441
+ __SYSCALL(__NR_futex_wake, sys_futex_wake)
+
++#define __NR_futex1 442
++__SYSCALL(__NR_futex1, sys_futex1)
++
+ #undef __NR_syscalls
+-#define __NR_syscalls 442
++#define __NR_syscalls 443
+
+ /*
+ * 32 bit systems traditionally used different
+diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
+index f30d6ae9a688..1a516b081207 100644
+--- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
++++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
+@@ -361,6 +361,9 @@
+ 437 common openat2 sys_openat2
+ 438 common pidfd_getfd sys_pidfd_getfd
+ 439 common faccessat2 sys_faccessat2
++440 common futex_wait sys_futex_wait
++441 common futex_wake sys_futex_wake
++442 common futex1 sys_futex1
+
+ #
+ # x32-specific system call numbers start at 512 to avoid cache impact
+diff --git a/tools/perf/bench/futex.h b/tools/perf/bench/futex.h
+index 31b53cc7d5bc..baf6a0d077ac 100644
+--- a/tools/perf/bench/futex.h
++++ b/tools/perf/bench/futex.h
+@@ -8,10 +8,14 @@
+ #ifndef _FUTEX_H
+ #define _FUTEX_H
+
+//#define FUTEX1 0 /* define to benchmark via the multiplexed futex1 syscall */
++#define UNUSED(x) (void)(x)
++
+ #include <unistd.h>
+ #include <sys/syscall.h>
+ #include <sys/types.h>
+ #include <linux/futex.h>
++#include <linux/unistd.h>
+
+ /**
+ * futex() - SYS_futex syscall wrapper
+@@ -34,7 +38,13 @@
+ * like-named arguments in the following wrappers except where noted below.
+ */
+ #define futex(uaddr, op, val, timeout, uaddr2, val3, opflags) \
+- syscall(SYS_futex, uaddr, op | opflags, val, timeout, uaddr2, val3)
++ syscall(__NR_futex1, uaddr, op | opflags, val, timeout, uaddr2, val3)
++
++#define futex2_wake(uaddr, nr, flags) \
++ syscall(__NR_futex_wake, uaddr, nr, flags | FUTEX_32)
++
++#define futex2_wait(uaddr, val, flags, timeout) \
++ syscall(__NR_futex_wait, uaddr, val, flags | FUTEX_32, timeout)
+
+ /**
+ * futex_wait() - block on uaddr with optional timeout
+@@ -43,7 +53,13 @@
+ static inline int
+ futex_wait(u_int32_t *uaddr, u_int32_t val, struct timespec *timeout, int opflags)
+ {
++#ifdef FUTEX1
+ return futex(uaddr, FUTEX_WAIT, val, timeout, NULL, 0, opflags);
++#else
++ UNUSED(timeout);
++ UNUSED(opflags);
++ return futex2_wait(uaddr, val, 0, NULL);
++#endif
+ }
+
+ /**
+@@ -53,7 +69,12 @@ futex_wait(u_int32_t *uaddr, u_int32_t val, struct timespec *timeout, int opflag
+ static inline int
+ futex_wake(u_int32_t *uaddr, int nr_wake, int opflags)
+ {
++#ifdef FUTEX1
+ return futex(uaddr, FUTEX_WAKE, nr_wake, NULL, NULL, 0, opflags);
++#else
++ UNUSED(opflags);
++ return futex2_wake(uaddr, nr_wake, 0);
++#endif
+ }
+
+ /**
+--
+2.28.0
+
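The futex2_wait()/futex2_wake() macros in the perf bench hunk above rely on the patched <linux/futex.h> for FUTEX_32. For readers without the patched headers, here is a self-contained sketch of the same two calls; the FUTEX_32 value is an assumption and must match this series' uapi header.

    #define _GNU_SOURCE
    #include <stdint.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    #ifndef __NR_futex_wait
    #define __NR_futex_wait 440     /* per the syscall tables above */
    #endif
    #ifndef __NR_futex_wake
    #define __NR_futex_wake 441
    #endif
    #ifndef FUTEX_32
    #define FUTEX_32 2              /* assumed value; check this series' uapi header */
    #endif

    /* Block while *uaddr == val; a NULL timeout means wait indefinitely. */
    static long futex2_wait(uint32_t *uaddr, uint32_t val, unsigned int flags)
    {
            return syscall(__NR_futex_wait, uaddr, val, flags | FUTEX_32, NULL);
    }

    /* Wake up to nr waiters sleeping on uaddr. */
    static long futex2_wake(uint32_t *uaddr, unsigned int nr, unsigned int flags)
    {
            return syscall(__NR_futex_wake, uaddr, nr, flags | FUTEX_32);
    }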
+From 2f5e38a4191ac6fd5040435f6a41433add3711a6 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Andr=C3=A9=20Almeida?= <andrealmeid@collabora.com>
+Date: Thu, 15 Oct 2020 18:06:40 -0300
+Subject: [PATCH 07/13] futex2: Add support for shared futexes
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Add support for shared futexes for cross-process resources.
+
+Signed-off-by: André Almeida <andrealmeid@collabora.com>
+---
+ kernel/futex2.c | 169 +++++++++++++++++++++++++++++++++++++++++-------
+ 1 file changed, 146 insertions(+), 23 deletions(-)
+
+diff --git a/kernel/futex2.c b/kernel/futex2.c
+index 4b782b5ef615..ae743ddf223e 100644
+--- a/kernel/futex2.c
++++ b/kernel/futex2.c
+@@ -6,7 +6,9 @@
+ */
+
+ #include <linux/freezer.h>
++#include <linux/hugetlb.h>
+ #include <linux/jhash.h>
++#include <linux/pagemap.h>
+ #include <linux/sched/wake_q.h>
+ #include <linux/spinlock.h>
+ #include <linux/syscalls.h>
+@@ -15,6 +17,7 @@
+
+ /**
+ * struct futex_waiter - List entry for a waiter
++ * @uaddr: Memory address of userspace futex
+ * @key.address: Memory address of userspace futex
+ * @key.mm: Pointer to memory management struct of this process
+ * @key: Stores information that uniquely identify a futex
+@@ -25,6 +28,7 @@
+ * @index: Index of waiter in futexv list
+ */
+ struct futex_waiter {
++ uintptr_t uaddr;
+ struct futex_key {
+ uintptr_t address;
+ struct mm_struct *mm;
+@@ -125,16 +129,109 @@ static inline int bucket_get_waiters(struct futex_bucket *bucket)
+ #endif
+ }
+
++static u64 get_inode_sequence_number(struct inode *inode)
++{
++ static atomic64_t i_seq;
++ u64 old;
++
++ /* Does the inode already have a sequence number? */
++ old = atomic64_read(&inode->i_sequence);
++ if (likely(old))
++ return old;
++
++ for (;;) {
++ u64 new = atomic64_add_return(1, &i_seq);
++ if (WARN_ON_ONCE(!new))
++ continue;
++
++ old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new);
++ if (old)
++ return old;
++ return new;
++ }
++}
++
++static int futex_get_shared_key(uintptr_t address, struct mm_struct *mm,
++ struct futex_key *key)
++{
++ int err;
++ struct page *page, *tail;
++ struct address_space *mapping;
++
++again:
++ err = get_user_pages_fast(address, 1, 0, &page);
++
++ if (err < 0)
++ return err;
++ else
++ err = 0;
++
++
++ tail = page;
++ page = compound_head(page);
++ mapping = READ_ONCE(page->mapping);
++
++
++ if (unlikely(!mapping)) {
++ int shmem_swizzled;
++
++ lock_page(page);
++ shmem_swizzled = PageSwapCache(page) || page->mapping;
++ unlock_page(page);
++ put_page(page);
++
++ if (shmem_swizzled)
++ goto again;
++
++ return -EFAULT;
++ }
++
++ if (PageAnon(page)) {
++
++ key->mm = mm;
++ key->address = address;
++
++ } else {
++ struct inode *inode;
++
++ rcu_read_lock();
++
++ if (READ_ONCE(page->mapping) != mapping) {
++ rcu_read_unlock();
++ put_page(page);
++
++ goto again;
++ }
++
++ inode = READ_ONCE(mapping->host);
++ if (!inode) {
++ rcu_read_unlock();
++ put_page(page);
++
++ goto again;
++ }
++
++ key->address = get_inode_sequence_number(inode);
++ key->mm = (struct mm_struct *) basepage_index(tail);
++ rcu_read_unlock();
++ }
++
++ put_page(page);
++ return err;
++}
++
+ /**
+ * futex_get_bucket - Check if the user address is valid, prepare internal
+ * data and calculate the hash
+ * @uaddr: futex user address
+ * @key: data that uniquely identifies a futex
++ * @shared: is this a shared futex?
+ *
+ * Return: address of bucket on success, error code otherwise
+ */
+ static struct futex_bucket *futex_get_bucket(void __user *uaddr,
+- struct futex_key *key)
++ struct futex_key *key,
++ bool shared)
+ {
+ uintptr_t address = (uintptr_t) uaddr;
+ u32 hash_key;
+@@ -145,8 +242,15 @@ static struct futex_bucket *futex_get_bucket(void __user *uaddr,
+ if (unlikely(!access_ok(address, sizeof(u32))))
+ return ERR_PTR(-EFAULT);
+
+- key->address = address;
+- key->mm = current->mm;
++ if (!shared) {
++ key->address = address;
++ key->mm = current->mm;
++	} else {
++		int err = futex_get_shared_key(address, current->mm, key);
++
++		if (err < 0)
++			return ERR_PTR(err);
++	}
+
+ /* Generate hash key for this futex using uaddr and current->mm */
+ hash_key = jhash2((u32 *) key, sizeof(*key) / sizeof(u32), 0);
+@@ -275,9 +376,10 @@ static int futex_dequeue_multiple(struct futexv *futexv, unsigned int nr)
+ * Return: 0 on success, error code otherwise
+ */
+ static int futex_enqueue(struct futexv *futexv, unsigned int nr_futexes,
+- unsigned int *awaken)
++ int *awaken)
+ {
+ int i, ret;
++ bool shared;
+ u32 uval, *uaddr, val;
+ struct futex_bucket *bucket;
+
+@@ -285,9 +387,13 @@ static int futex_enqueue(struct futexv *futexv, unsigned int nr_futexes,
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ for (i = 0; i < nr_futexes; i++) {
+- uaddr = (u32 * __user) futexv->objects[i].key.address;
++ uaddr = (u32 * __user) futexv->objects[i].uaddr;
+ val = (u32) futexv->objects[i].val;
+- bucket = futexv->objects[i].bucket;
++ shared = (futexv->objects[i].flags & FUTEX_SHARED_FLAG) ? true : false;
++ if (shared)
++ bucket = futex_get_bucket((void *) uaddr, &futexv->objects[i].key, true);
++ else
++ bucket = futexv->objects[i].bucket;
+
+ bucket_inc_waiters(bucket);
+ spin_lock(&bucket->lock);
+@@ -301,11 +407,14 @@ static int futex_enqueue(struct futexv *futexv, unsigned int nr_futexes,
+ __set_current_state(TASK_RUNNING);
+ *awaken = futex_dequeue_multiple(futexv, i);
+
++ if (shared)
++ goto retry;
++
+ if (__get_user(uval, uaddr))
+ return -EFAULT;
+
+ if (*awaken >= 0)
+- return 0;
++ return 1;
+
+ goto retry;
+ }
+@@ -313,12 +422,14 @@ static int futex_enqueue(struct futexv *futexv, unsigned int nr_futexes,
+ if (uval != val) {
+ spin_unlock(&bucket->lock);
+
++
+ bucket_dec_waiters(bucket);
+ __set_current_state(TASK_RUNNING);
+ *awaken = futex_dequeue_multiple(futexv, i);
+
+- if (*awaken >= 0)
+- return 0;
++ if (*awaken >= 0) {
++ return 1;
++ }
+
+ return -EWOULDBLOCK;
+ }
+@@ -336,19 +447,18 @@ static int __futex_wait(struct futexv *futexv,
+ struct hrtimer_sleeper *timeout)
+ {
+ int ret;
+- unsigned int awaken = -1;
+
+- while (1) {
+- ret = futex_enqueue(futexv, nr_futexes, &awaken);
+
+- if (ret < 0)
+- break;
++ while (1) {
++ int awaken = -1;
+
+- if (awaken <= 0) {
+- return awaken;
++ ret = futex_enqueue(futexv, nr_futexes, &awaken);
++ if (ret) {
++ if (awaken >= 0)
++ return awaken;
++ return ret;
+ }
+
+-
+ /* Before sleeping, check if someone was woken */
+ if (!futexv->hint && (!timeout || timeout->task))
+ freezable_schedule();
+@@ -419,6 +529,7 @@ static int futex_wait(struct futexv *futexv, unsigned int nr_futexes,
+ hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS);
+ }
+
++
+ ret = __futex_wait(futexv, nr_futexes, timo ? timeout : NULL);
+
+
+@@ -438,9 +549,10 @@ static int futex_wait(struct futexv *futexv, unsigned int nr_futexes,
+ SYSCALL_DEFINE4(futex_wait, void __user *, uaddr, unsigned int, val,
+ unsigned int, flags, struct __kernel_timespec __user *, timo)
+ {
++ bool shared = (flags & FUTEX_SHARED_FLAG) ? true : false;
+ unsigned int size = flags & FUTEX_SIZE_MASK;
+- struct hrtimer_sleeper timeout;
+ struct futex_single_waiter wait_single;
++ struct hrtimer_sleeper timeout;
+ struct futex_waiter *waiter;
+ struct futexv *futexv;
+ int ret;
+@@ -452,6 +564,7 @@ SYSCALL_DEFINE4(futex_wait, void __user *, uaddr, unsigned int, val,
+ waiter = &wait_single.waiter;
+ waiter->index = 0;
+ waiter->val = val;
++ waiter->uaddr = (uintptr_t) uaddr;
+
+ INIT_LIST_HEAD(&waiter->list);
+
+@@ -462,11 +575,14 @@ SYSCALL_DEFINE4(futex_wait, void __user *, uaddr, unsigned int, val,
+ return -EINVAL;
+
+ /* Get an unlocked hash bucket */
+- waiter->bucket = futex_get_bucket(uaddr, &waiter->key);
+- if (IS_ERR(waiter->bucket))
++ waiter->bucket = futex_get_bucket(uaddr, &waiter->key, shared);
++ if (IS_ERR(waiter->bucket)) {
+ return PTR_ERR(waiter->bucket);
++ }
+
+ ret = futex_wait(futexv, 1, timo, &timeout, flags);
++ if (ret > 0)
++ ret = 0;
+
+ return ret;
+ }
+@@ -486,8 +602,10 @@ static int futex_parse_waitv(struct futexv *futexv,
+ struct futex_waitv waitv;
+ unsigned int i;
+ struct futex_bucket *bucket;
++ bool shared;
+
+ for (i = 0; i < nr_futexes; i++) {
++
+ if (copy_from_user(&waitv, &uwaitv[i], sizeof(waitv)))
+ return -EFAULT;
+
+@@ -495,8 +613,10 @@ static int futex_parse_waitv(struct futexv *futexv,
+ (waitv.flags & FUTEX_SIZE_MASK) != FUTEX_32)
+ return -EINVAL;
+
++ shared = (waitv.flags & FUTEX_SHARED_FLAG) ? true : false;
++
+ bucket = futex_get_bucket(waitv.uaddr,
+- &futexv->objects[i].key);
++ &futexv->objects[i].key, shared);
+ if (IS_ERR(bucket))
+ return PTR_ERR(bucket);
+
+@@ -505,6 +625,7 @@ static int futex_parse_waitv(struct futexv *futexv,
+ futexv->objects[i].flags = waitv.flags;
+ futexv->objects[i].index = i;
+ INIT_LIST_HEAD(&futexv->objects[i].list);
++ futexv->objects[i].uaddr = (uintptr_t) waitv.uaddr;
+ }
+
+ return 0;
+@@ -573,6 +694,7 @@ static struct futexv *futex_get_parent(uintptr_t waiter, u8 index)
+ SYSCALL_DEFINE3(futex_wake, void __user *, uaddr, unsigned int, nr_wake,
+ unsigned int, flags)
+ {
++ bool shared = (flags & FUTEX_SHARED_FLAG) ? true : false;
+ unsigned int size = flags & FUTEX_SIZE_MASK;
+ struct futex_waiter waiter, *aux, *tmp;
+ struct futex_bucket *bucket;
+@@ -586,9 +708,10 @@ SYSCALL_DEFINE3(futex_wake, void __user *, uaddr, unsigned int, nr_wake,
+ if (size != FUTEX_32)
+ return -EINVAL;
+
+- bucket = futex_get_bucket(uaddr, &waiter.key);
+- if (IS_ERR(bucket))
++ bucket = futex_get_bucket(uaddr, &waiter.key, shared);
++ if (IS_ERR(bucket)) {
+ return PTR_ERR(bucket);
++ }
+
+ if (!bucket_get_waiters(bucket))
+ return 0;
+--
+2.28.0
+
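To make the shared-futex path above concrete, a hedged sketch of two processes synchronizing through MAP_SHARED anonymous memory. The FUTEX_32 and FUTEX_SHARED_FLAG values are assumptions to be checked against this series' <linux/futex.h>, and error handling is omitted for brevity.

    #define _GNU_SOURCE
    #include <stdint.h>
    #include <unistd.h>
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <sys/wait.h>

    #ifndef __NR_futex_wait
    #define __NR_futex_wait 440
    #endif
    #ifndef __NR_futex_wake
    #define __NR_futex_wake 441
    #endif
    #ifndef FUTEX_32
    #define FUTEX_32 2              /* assumed; see the series' uapi header */
    #endif
    #ifndef FUTEX_SHARED_FLAG
    #define FUTEX_SHARED_FLAG 8     /* assumed; see the series' uapi header */
    #endif

    int main(void)
    {
            /* A 32-bit futex word visible to both parent and child. */
            uint32_t *f = mmap(NULL, sizeof(*f), PROT_READ | PROT_WRITE,
                               MAP_SHARED | MAP_ANONYMOUS, -1, 0);

            *f = 0;
            if (fork() == 0) {
                    /* Child: sleep until the word changes from 0. */
                    syscall(__NR_futex_wait, f, 0,
                            FUTEX_32 | FUTEX_SHARED_FLAG, NULL);
                    _exit(0);
            }
            *f = 1;  /* publish the new value, then wake the child */
            syscall(__NR_futex_wake, f, 1, FUTEX_32 | FUTEX_SHARED_FLAG);
            wait(NULL);
            return 0;
    }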
+From 909eb056421668b5d42f8c4dfa92339851a43dd8 Mon Sep 17 00:00:00 2001
+From: Gabriel Krisman Bertazi <krisman@collabora.com>
+Date: Mon, 2 Nov 2020 18:41:38 -0500
+Subject: [PATCH 08/13] Revert "futex: Remove needless goto's"
+
+This reverts commit d7c5ed73b19c4640426d9c106f70ec2cb532034d.
+---
+ kernel/futex.c | 40 ++++++++++++++++++++++++----------------
+ 1 file changed, 24 insertions(+), 16 deletions(-)
+
+diff --git a/kernel/futex.c b/kernel/futex.c
+index 6c00c0952313..a671d371b11f 100644
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -1593,13 +1593,13 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
+
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ);
+ if (unlikely(ret != 0))
+- return ret;
++ goto out;
+
+ hb = hash_futex(&key);
+
+ /* Make sure we really have tasks to wakeup */
+ if (!hb_waiters_pending(hb))
+- return ret;
++ goto out;
+
+ spin_lock(&hb->lock);
+
+@@ -1622,6 +1622,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
+
+ spin_unlock(&hb->lock);
+ wake_up_q(&wake_q);
++out:
+ return ret;
+ }
+
+@@ -1688,10 +1689,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
+ retry:
+ ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
+ if (unlikely(ret != 0))
+- return ret;
++ goto out;
+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
+ if (unlikely(ret != 0))
+- return ret;
++ goto out;
+
+ hb1 = hash_futex(&key1);
+ hb2 = hash_futex(&key2);
+@@ -1709,13 +1710,13 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
+ * an MMU, but we might get them from range checking
+ */
+ ret = op_ret;
+- return ret;
++ goto out;
+ }
+
+ if (op_ret == -EFAULT) {
+ ret = fault_in_user_writeable(uaddr2);
+ if (ret)
+- return ret;
++ goto out;
+ }
+
+ if (!(flags & FLAGS_SHARED)) {
+@@ -1758,6 +1759,7 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
+ out_unlock:
+ double_unlock_hb(hb1, hb2);
+ wake_up_q(&wake_q);
++out:
+ return ret;
+ }
+
+@@ -1964,18 +1966,20 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
+ retry:
+ ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
+ if (unlikely(ret != 0))
+- return ret;
++ goto out;
+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
+ requeue_pi ? FUTEX_WRITE : FUTEX_READ);
+ if (unlikely(ret != 0))
+- return ret;
++ goto out;
+
+ /*
+ * The check above which compares uaddrs is not sufficient for
+ * shared futexes. We need to compare the keys:
+ */
+- if (requeue_pi && match_futex(&key1, &key2))
+- return -EINVAL;
++ if (requeue_pi && match_futex(&key1, &key2)) {
++ ret = -EINVAL;
++ goto out;
++ }
+
+ hb1 = hash_futex(&key1);
+ hb2 = hash_futex(&key2);
+@@ -1995,7 +1999,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
+
+ ret = get_user(curval, uaddr1);
+ if (ret)
+- return ret;
++ goto out;
+
+ if (!(flags & FLAGS_SHARED))
+ goto retry_private;
+@@ -2061,7 +2065,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
+ ret = fault_in_user_writeable(uaddr2);
+ if (!ret)
+ goto retry;
+- return ret;
++ goto out;
+ case -EBUSY:
+ case -EAGAIN:
+ /*
+@@ -2180,6 +2184,8 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
+ double_unlock_hb(hb1, hb2);
+ wake_up_q(&wake_q);
+ hb_waiters_dec(hb2);
++
++out:
+ return ret ? ret : task_count;
+ }
+
+@@ -2537,7 +2543,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
+ */
+ if (q->pi_state->owner != current)
+ ret = fixup_pi_state_owner(uaddr, q, current);
+- return ret ? ret : locked;
++ goto out;
+ }
+
+ /*
+@@ -2550,7 +2556,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
+ */
+ if (q->pi_state->owner == current) {
+ ret = fixup_pi_state_owner(uaddr, q, NULL);
+- return ret;
++ goto out;
+ }
+
+ /*
+@@ -2564,7 +2570,8 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
+ q->pi_state->owner);
+ }
+
+- return ret;
++out:
++ return ret ? ret : locked;
+ }
+
+ /**
+@@ -2661,7 +2668,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
+
+ ret = get_user(uval, uaddr);
+ if (ret)
+- return ret;
++ goto out;
+
+ if (!(flags & FLAGS_SHARED))
+ goto retry_private;
+@@ -2674,6 +2681,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
+ ret = -EWOULDBLOCK;
+ }
+
++out:
+ return ret;
+ }
+
+--
+2.28.0
+
+From fee513186b69c4a65534fd790545877974ef17d3 Mon Sep 17 00:00:00 2001
+From: Gabriel Krisman Bertazi <krisman@collabora.com>
+Date: Mon, 2 Nov 2020 18:41:54 -0500
+Subject: [PATCH 09/13] Revert "futex: Remove put_futex_key()"
+
+This reverts commit 9180bd467f9abdb44afde650d07e3b9dd66d837c.
+---
+ kernel/futex.c | 61 ++++++++++++++++++++++++++++++++++++++++----------
+ 1 file changed, 49 insertions(+), 12 deletions(-)
+
+diff --git a/kernel/futex.c b/kernel/futex.c
+index a671d371b11f..647de692c874 100644
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -661,6 +661,10 @@ static int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
+ return err;
+ }
+
++static inline void put_futex_key(union futex_key *key)
++{
++}
++
+ /**
+ * fault_in_user_writeable() - Fault in user address and verify RW access
+ * @uaddr: pointer to faulting user space address
+@@ -1599,7 +1603,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
+
+ /* Make sure we really have tasks to wakeup */
+ if (!hb_waiters_pending(hb))
+- goto out;
++ goto out_put_key;
+
+ spin_lock(&hb->lock);
+
+@@ -1622,6 +1626,8 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
+
+ spin_unlock(&hb->lock);
+ wake_up_q(&wake_q);
++out_put_key:
++ put_futex_key(&key);
+ out:
+ return ret;
+ }
+@@ -1692,7 +1698,7 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
+ goto out;
+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
+ if (unlikely(ret != 0))
+- goto out;
++ goto out_put_key1;
+
+ hb1 = hash_futex(&key1);
+ hb2 = hash_futex(&key2);
+@@ -1710,13 +1716,13 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
+ * an MMU, but we might get them from range checking
+ */
+ ret = op_ret;
+- goto out;
++ goto out_put_keys;
+ }
+
+ if (op_ret == -EFAULT) {
+ ret = fault_in_user_writeable(uaddr2);
+ if (ret)
+- goto out;
++ goto out_put_keys;
+ }
+
+ if (!(flags & FLAGS_SHARED)) {
+@@ -1724,6 +1730,8 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
+ goto retry_private;
+ }
+
++ put_futex_key(&key2);
++ put_futex_key(&key1);
+ cond_resched();
+ goto retry;
+ }
+@@ -1759,6 +1767,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
+ out_unlock:
+ double_unlock_hb(hb1, hb2);
+ wake_up_q(&wake_q);
++out_put_keys:
++ put_futex_key(&key2);
++out_put_key1:
++ put_futex_key(&key1);
+ out:
+ return ret;
+ }
+@@ -1970,7 +1982,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
+ requeue_pi ? FUTEX_WRITE : FUTEX_READ);
+ if (unlikely(ret != 0))
+- goto out;
++ goto out_put_key1;
+
+ /*
+ * The check above which compares uaddrs is not sufficient for
+@@ -1978,7 +1990,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
+ */
+ if (requeue_pi && match_futex(&key1, &key2)) {
+ ret = -EINVAL;
+- goto out;
++ goto out_put_keys;
+ }
+
+ hb1 = hash_futex(&key1);
+@@ -1999,11 +2011,13 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
+
+ ret = get_user(curval, uaddr1);
+ if (ret)
+- goto out;
++ goto out_put_keys;
+
+ if (!(flags & FLAGS_SHARED))
+ goto retry_private;
+
++ put_futex_key(&key2);
++ put_futex_key(&key1);
+ goto retry;
+ }
+ if (curval != *cmpval) {
+@@ -2062,6 +2076,8 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
+ case -EFAULT:
+ double_unlock_hb(hb1, hb2);
+ hb_waiters_dec(hb2);
++ put_futex_key(&key2);
++ put_futex_key(&key1);
+ ret = fault_in_user_writeable(uaddr2);
+ if (!ret)
+ goto retry;
+@@ -2076,6 +2092,8 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
+ */
+ double_unlock_hb(hb1, hb2);
+ hb_waiters_dec(hb2);
++ put_futex_key(&key2);
++ put_futex_key(&key1);
+ /*
+ * Handle the case where the owner is in the middle of
+ * exiting. Wait for the exit to complete otherwise
+@@ -2185,6 +2203,10 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
+ wake_up_q(&wake_q);
+ hb_waiters_dec(hb2);
+
++out_put_keys:
++ put_futex_key(&key2);
++out_put_key1:
++ put_futex_key(&key1);
+ out:
+ return ret ? ret : task_count;
+ }
+@@ -2673,6 +2695,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
+ if (!(flags & FLAGS_SHARED))
+ goto retry_private;
+
++ put_futex_key(&q->key);
+ goto retry;
+ }
+
+@@ -2682,6 +2705,8 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
+ }
+
+ out:
++ if (ret)
++ put_futex_key(&q->key);
+ return ret;
+ }
+
+@@ -2826,6 +2851,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
+ * - EAGAIN: The user space value changed.
+ */
+ queue_unlock(hb);
++ put_futex_key(&q.key);
+ /*
+ * Handle the case where the owner is in the middle of
+ * exiting. Wait for the exit to complete otherwise
+@@ -2933,11 +2959,13 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
+ put_pi_state(pi_state);
+ }
+
+- goto out;
++ goto out_put_key;
+
+ out_unlock_put_key:
+ queue_unlock(hb);
+
++out_put_key:
++ put_futex_key(&q.key);
+ out:
+ if (to) {
+ hrtimer_cancel(&to->timer);
+@@ -2950,11 +2978,12 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
+
+ ret = fault_in_user_writeable(uaddr);
+ if (ret)
+- goto out;
++ goto out_put_key;
+
+ if (!(flags & FLAGS_SHARED))
+ goto retry_private;
+
++ put_futex_key(&q.key);
+ goto retry;
+ }
+
+@@ -3083,13 +3112,16 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
+ out_unlock:
+ spin_unlock(&hb->lock);
+ out_putkey:
++ put_futex_key(&key);
+ return ret;
+
+ pi_retry:
++ put_futex_key(&key);
+ cond_resched();
+ goto retry;
+
+ pi_faulted:
++ put_futex_key(&key);
+
+ ret = fault_in_user_writeable(uaddr);
+ if (!ret)
+@@ -3231,7 +3263,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
+ */
+ ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
+ if (ret)
+- goto out;
++ goto out_key2;
+
+ /*
+ * The check above which compares uaddrs is not sufficient for
+@@ -3240,7 +3272,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
+ if (match_futex(&q.key, &key2)) {
+ queue_unlock(hb);
+ ret = -EINVAL;
+- goto out;
++ goto out_put_keys;
+ }
+
+ /* Queue the futex_q, drop the hb lock, wait for wakeup. */
+@@ -3250,7 +3282,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
+ ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
+ spin_unlock(&hb->lock);
+ if (ret)
+- goto out;
++ goto out_put_keys;
+
+ /*
+ * In order for us to be here, we know our q.key == key2, and since
+@@ -3340,6 +3372,11 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
+ ret = -EWOULDBLOCK;
+ }
+
++out_put_keys:
++ put_futex_key(&q.key);
++out_key2:
++ put_futex_key(&key2);
++
+ out:
+ if (to) {
+ hrtimer_cancel(&to->timer);
+--
+2.28.0
+
+From 3b1489448a277fc1c34ca12e859193c3a7f3446c Mon Sep 17 00:00:00 2001
+From: Gabriel Krisman Bertazi <krisman@collabora.com>
+Date: Fri, 12 Jul 2019 14:16:20 -0400
+Subject: [PATCH 10/13] futex: Split key setup from key queue locking and read
+
+Split the futex key setup from the queue locking and key reading. This
+is useful for setting up multiple keys at the same time, as is done in
+futex_requeue() and as will be done for the FUTEX_WAIT_MULTIPLE
+command.
+
+Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
+---
+ kernel/futex.c | 71 +++++++++++++++++++++++++++++---------------------
+ 1 file changed, 42 insertions(+), 29 deletions(-)
+
+diff --git a/kernel/futex.c b/kernel/futex.c
+index 647de692c874..f05349def492 100644
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -2634,6 +2634,39 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
+ __set_current_state(TASK_RUNNING);
+ }
+
++static int __futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
++ struct futex_q *q, struct futex_hash_bucket **hb)
++{
++
++ u32 uval;
++ int ret;
++
++retry_private:
++ *hb = queue_lock(q);
++
++ ret = get_futex_value_locked(&uval, uaddr);
++
++ if (ret) {
++ queue_unlock(*hb);
++
++ ret = get_user(uval, uaddr);
++ if (ret)
++ return ret;
++
++ if (!(flags & FLAGS_SHARED))
++ goto retry_private;
++
++ return 1;
++ }
++
++ if (uval != val) {
++ queue_unlock(*hb);
++ ret = -EWOULDBLOCK;
++ }
++
++ return ret;
++}
++
+ /**
+ * futex_wait_setup() - Prepare to wait on a futex
+ * @uaddr: the futex userspace address
+@@ -2654,7 +2687,6 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
+ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
+ struct futex_q *q, struct futex_hash_bucket **hb)
+ {
+- u32 uval;
+ int ret;
+
+ /*
+@@ -2675,38 +2707,19 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
+ * absorb a wakeup if *uaddr does not match the desired values
+ * while the syscall executes.
+ */
+-retry:
+- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ);
+- if (unlikely(ret != 0))
+- return ret;
+-
+-retry_private:
+- *hb = queue_lock(q);
++ do {
++ ret = get_futex_key(uaddr, flags & FLAGS_SHARED,
++ &q->key, FUTEX_READ);
++ if (unlikely(ret != 0))
++ return ret;
+
+- ret = get_futex_value_locked(&uval, uaddr);
++ ret = __futex_wait_setup(uaddr, val, flags, q, hb);
+
+- if (ret) {
+- queue_unlock(*hb);
+-
+- ret = get_user(uval, uaddr);
++ /* Drop key reference if retry or error. */
+ if (ret)
+- goto out;
++ put_futex_key(&q->key);
++ } while (ret > 0);
+
+- if (!(flags & FLAGS_SHARED))
+- goto retry_private;
+-
+- put_futex_key(&q->key);
+- goto retry;
+- }
+-
+- if (uval != val) {
+- queue_unlock(*hb);
+- ret = -EWOULDBLOCK;
+- }
+-
+-out:
+- if (ret)
+- put_futex_key(&q->key);
+ return ret;
+ }
+
+--
+2.28.0
+
+From 539862895e53b9a774f3a2271d1e7db57879d0d7 Mon Sep 17 00:00:00 2001
+From: Gabriel Krisman Bertazi <krisman@collabora.com>
+Date: Mon, 8 Jul 2019 09:44:09 -0400
+Subject: [PATCH 11/13] futex: Implement FUTEX_WAIT_MULTIPLE
+
+This is a new futex operation that allows a thread to wait on several
+futexes at the same time, and to wake up on any of them. In a sense, it
+implements one of the features that polling on the old FUTEX_FD
+interface used to provide.
+
+My use case for this feature lies in Wine, where we want to implement a
+similar function that Windows provides, mainly for event handling. The
+Wine folks have an implementation of the userspace side using eventfd,
+but it suffers from poor performance.
+
+Technically, the old FUTEX_WAIT implementation could easily be
+reimplemented using do_futex_wait_multiple() with a count of one, and I
+have a patch demonstrating how that works. I'm not proposing it, since
+futex is such tricky code that I'd be more comfortable having
+FUTEX_WAIT_MULTIPLE run upstream for a couple of development cycles
+before considering modifying FUTEX_WAIT.
+
+This was tested using three mechanisms:
+
+1) By reimplementing FUTEX_WAIT in terms of FUTEX_WAIT_MULTIPLE and
+running tools/testing/selftests/futex and a full Linux distro on top of
+this kernel.
+
+2) By example code that exercises the FUTEX_WAIT_MULTIPLE path in a
+multi-threaded, event-handling setup.
+
+3) By running the Wine fsync implementation and executing multi-threaded
+applications, in particular modern games, on top of this implementation.
+
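+For reference, a userspace caller would drive the new operation roughly
+as sketched below. This is an illustrative sketch only: the wrapper
+name wait_any() is invented, the fallback definitions mirror the uapi
+additions in this patch, and opcode 13 is what this patch assigns (a
+later patch in this series moves it to 31).
+
+	#include <linux/futex.h>
+	#include <linux/types.h>
+	#include <sys/syscall.h>
+	#include <time.h>
+	#include <unistd.h>
+
+	#ifndef FUTEX_WAIT_MULTIPLE
+	#define FUTEX_WAIT_MULTIPLE 13
+	/* Mirrors the struct added to include/uapi/linux/futex.h. */
+	struct futex_wait_block {
+		__u32 *uaddr;
+		__u32 val;
+		__u32 bitset;
+	};
+	#endif
+
+	/*
+	 * Wait until any of the count futexes is woken. Returns the
+	 * index of the woken entry, or -1 with errno set (ETIMEDOUT,
+	 * EINTR, ...). The timeout is relative, as with FUTEX_WAIT;
+	 * wb[i].bitset would typically be FUTEX_BITSET_MATCH_ANY.
+	 */
+	static long wait_any(struct futex_wait_block *wb, unsigned int count,
+			     const struct timespec *timeout)
+	{
+		return syscall(SYS_futex, wb,
+			       FUTEX_WAIT_MULTIPLE | FUTEX_PRIVATE_FLAG,
+			       count, timeout, NULL, 0);
+	}
+
+A woken entry is still just a futex: another thread wakes it with a
+plain FUTEX_WAKE on the corresponding address.
+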
+Signed-off-by: Zebediah Figura <z.figura12@gmail.com>
+Signed-off-by: Steven Noonan <steven@valvesoftware.com>
+Signed-off-by: Pierre-Loup A. Griffais <pgriffais@valvesoftware.com>
+Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
+---
+ include/uapi/linux/futex.h | 7 ++
+ kernel/futex.c | 159 ++++++++++++++++++++++++++++++++++++-
+ 2 files changed, 162 insertions(+), 4 deletions(-)
+
+diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
+index 35a5bf1cd41b..aefb0b83b784 100644
+--- a/include/uapi/linux/futex.h
++++ b/include/uapi/linux/futex.h
+@@ -21,6 +21,7 @@
+ #define FUTEX_WAKE_BITSET 10
+ #define FUTEX_WAIT_REQUEUE_PI 11
+ #define FUTEX_CMP_REQUEUE_PI 12
++#define FUTEX_WAIT_MULTIPLE 13
+
+ #define FUTEX_PRIVATE_FLAG 128
+ #define FUTEX_CLOCK_REALTIME 256
+@@ -190,4 +191,10 @@ struct robust_list_head {
+ (((op & 0xf) << 28) | ((cmp & 0xf) << 24) \
+ | ((oparg & 0xfff) << 12) | (cmparg & 0xfff))
+
++struct futex_wait_block {
++ __u32 __user *uaddr;
++ __u32 val;
++ __u32 bitset;
++};
++
+ #endif /* _UAPI_LINUX_FUTEX_H */
+diff --git a/kernel/futex.c b/kernel/futex.c
+index f05349def492..775f780a96c4 100644
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -166,6 +166,7 @@ static int __read_mostly futex_cmpxchg_enabled;
+ #endif
+ #define FLAGS_CLOCKRT 0x02
+ #define FLAGS_HAS_TIMEOUT 0x04
++#define FLAGS_WAKE_MULTIPLE 0x08
+
+ /*
+ * Priority Inheritance state:
+@@ -2723,6 +2724,148 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
+ return ret;
+ }
+
++static int do_futex_wait_multiple(struct futex_wait_block *wb,
++ u32 count, unsigned int flags,
++ ktime_t *abs_time)
++{
++
++ struct hrtimer_sleeper timeout, *to;
++ struct futex_hash_bucket *hb;
++ struct futex_q *qs = NULL;
++ int ret;
++ int i;
++
++ qs = kcalloc(count, sizeof(struct futex_q), GFP_KERNEL);
++ if (!qs)
++ return -ENOMEM;
++
++ to = futex_setup_timer(abs_time, &timeout, flags,
++ current->timer_slack_ns);
++ retry:
++ for (i = 0; i < count; i++) {
++ qs[i].key = FUTEX_KEY_INIT;
++ qs[i].bitset = wb[i].bitset;
++
++ ret = get_futex_key(wb[i].uaddr, flags & FLAGS_SHARED,
++ &qs[i].key, FUTEX_READ);
++ if (unlikely(ret != 0)) {
++ for (--i; i >= 0; i--)
++ put_futex_key(&qs[i].key);
++ goto out;
++ }
++ }
++
++ set_current_state(TASK_INTERRUPTIBLE);
++
++ for (i = 0; i < count; i++) {
++ ret = __futex_wait_setup(wb[i].uaddr, wb[i].val,
++ flags, &qs[i], &hb);
++ if (ret) {
++			/* Drop the failed key directly. Keys 0..(i-1)
++			 * will be put by unqueue_me(). */
++ put_futex_key(&qs[i].key);
++
++ /* Undo the partial work we did. */
++ for (--i; i >= 0; i--)
++ unqueue_me(&qs[i]);
++
++ __set_current_state(TASK_RUNNING);
++ if (ret > 0)
++ goto retry;
++ goto out;
++ }
++
++		/* We can't hold the bucket lock while dealing with
++		 * the next futex. Queue ourselves now so we can unlock
++		 * it before moving on. */
++ queue_me(&qs[i], hb);
++ }
++
++ if (to)
++ hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
++
++	/* There is no easy way to check if we were already woken on
++	 * one of the multiple futexes without walking through each of
++	 * them. So just sleep and let the scheduler handle it.
++	 */
++ if (!to || to->task)
++ freezable_schedule();
++
++ __set_current_state(TASK_RUNNING);
++
++ ret = -ETIMEDOUT;
++ /* If we were woken (and unqueued), we succeeded. */
++ for (i = 0; i < count; i++)
++ if (!unqueue_me(&qs[i]))
++ ret = i;
++
++	/* Successful wakeup */
++ if (ret >= 0)
++ goto out;
++
++	/* Woken by a triggered timeout */
++ if (to && !to->task)
++ goto out;
++
++ /*
++ * We expect signal_pending(current), but we might be the
++ * victim of a spurious wakeup as well.
++ */
++ if (!signal_pending(current))
++ goto retry;
++
++ ret = -ERESTARTSYS;
++ if (!abs_time)
++ goto out;
++
++ ret = -ERESTART_RESTARTBLOCK;
++ out:
++ if (to) {
++ hrtimer_cancel(&to->timer);
++ destroy_hrtimer_on_stack(&to->timer);
++ }
++
++ kfree(qs);
++ return ret;
++}
++
++static int futex_wait_multiple(u32 __user *uaddr, unsigned int flags,
++ u32 count, ktime_t *abs_time)
++{
++ struct futex_wait_block *wb;
++ struct restart_block *restart;
++ int ret;
++
++ if (!count)
++ return -EINVAL;
++
++ wb = kcalloc(count, sizeof(struct futex_wait_block), GFP_KERNEL);
++ if (!wb)
++ return -ENOMEM;
++
++ if (copy_from_user(wb, uaddr,
++ count * sizeof(struct futex_wait_block))) {
++ ret = -EFAULT;
++ goto out;
++ }
++
++ ret = do_futex_wait_multiple(wb, count, flags, abs_time);
++
++ if (ret == -ERESTART_RESTARTBLOCK) {
++ restart = &current->restart_block;
++ restart->fn = futex_wait_restart;
++ restart->futex.uaddr = uaddr;
++ restart->futex.val = count;
++ restart->futex.time = *abs_time;
++ restart->futex.flags = (flags | FLAGS_HAS_TIMEOUT |
++ FLAGS_WAKE_MULTIPLE);
++ }
++
++out:
++ kfree(wb);
++ return ret;
++}
++
+ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
+ ktime_t *abs_time, u32 bitset)
+ {
+@@ -2800,6 +2943,10 @@ static long futex_wait_restart(struct restart_block *restart)
+ }
+ restart->fn = do_no_restart_syscall;
+
++ if (restart->futex.flags & FLAGS_WAKE_MULTIPLE)
++ return (long)futex_wait_multiple(uaddr, restart->futex.flags,
++ restart->futex.val, tp);
++
+ return (long)futex_wait(uaddr, restart->futex.flags,
+ restart->futex.val, tp, restart->futex.bitset);
+ }
+@@ -3843,6 +3990,8 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
+ uaddr2);
+ case FUTEX_CMP_REQUEUE_PI:
+ return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
++ case FUTEX_WAIT_MULTIPLE:
++ return futex_wait_multiple(uaddr, flags, val, timeout);
+ }
+ return -ENOSYS;
+ }
+@@ -3859,7 +4008,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
+
+ if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
+ cmd == FUTEX_WAIT_BITSET ||
+- cmd == FUTEX_WAIT_REQUEUE_PI)) {
++ cmd == FUTEX_WAIT_REQUEUE_PI ||
++ cmd == FUTEX_WAIT_MULTIPLE)) {
+ if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
+ return -EFAULT;
+ if (get_timespec64(&ts, utime))
+@@ -3868,7 +4018,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
+ return -EINVAL;
+
+ t = timespec64_to_ktime(ts);
+- if (cmd == FUTEX_WAIT)
++ if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE)
+ t = ktime_add_safe(ktime_get(), t);
+ else if (!(op & FUTEX_CLOCK_REALTIME))
+ t = timens_ktime_to_host(CLOCK_MONOTONIC, t);
+@@ -4055,14 +4205,15 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
+
+ if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
+ cmd == FUTEX_WAIT_BITSET ||
+- cmd == FUTEX_WAIT_REQUEUE_PI)) {
++ cmd == FUTEX_WAIT_REQUEUE_PI ||
++ cmd == FUTEX_WAIT_MULTIPLE)) {
+ if (get_old_timespec32(&ts, utime))
+ return -EFAULT;
+ if (!timespec64_valid(&ts))
+ return -EINVAL;
+
+ t = timespec64_to_ktime(ts);
+- if (cmd == FUTEX_WAIT)
++ if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE)
+ t = ktime_add_safe(ktime_get(), t);
+ else if (!(op & FUTEX_CLOCK_REALTIME))
+ t = timens_ktime_to_host(CLOCK_MONOTONIC, t);
+--
+2.28.0
+
+From f56b85af005d46e9ef920a6728e61f7c47cf561e Mon Sep 17 00:00:00 2001
+From: Gabriel Krisman Bertazi <krisman@collabora.com>
+Date: Mon, 2 Nov 2020 18:50:26 -0500
+Subject: [PATCH 12/13] futex: Change WAIT_MULTIPLE opcode to 31
+
+Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
+---
+ include/uapi/linux/futex.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
+index aefb0b83b784..fe2b67ac0c5e 100644
+--- a/include/uapi/linux/futex.h
++++ b/include/uapi/linux/futex.h
+@@ -21,7 +21,7 @@
+ #define FUTEX_WAKE_BITSET 10
+ #define FUTEX_WAIT_REQUEUE_PI 11
+ #define FUTEX_CMP_REQUEUE_PI 12
+-#define FUTEX_WAIT_MULTIPLE 13
++#define FUTEX_WAIT_MULTIPLE 31
+
+ #define FUTEX_PRIVATE_FLAG 128
+ #define FUTEX_CLOCK_REALTIME 256
+--
+2.28.0
+
+From 022e2f888a50fb8d062e26bc385abf02c0be84a3 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Andr=C3=A9=20Almeida?= <andrealmeid@collabora.com>
+Date: Mon, 16 Nov 2020 21:22:21 -0300
+Subject: [PATCH 13/13] futex2: Add sysfs entry for syscall numbers
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Signed-off-by: André Almeida <andrealmeid@collabora.com>
+---
+ kernel/futex2.c | 42 ++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 42 insertions(+)
+
+diff --git a/kernel/futex2.c b/kernel/futex2.c
+index ae743ddf223e..4bdff8bfc78d 100644
+--- a/kernel/futex2.c
++++ b/kernel/futex2.c
+@@ -742,6 +742,48 @@ SYSCALL_DEFINE3(futex_wake, void __user *, uaddr, unsigned int, nr_wake,
+ return ret;
+ }
+
++static ssize_t wait_show(struct kobject *kobj, struct kobj_attribute *attr,
++ char *buf)
++{
++ return sprintf(buf, "%u\n", __NR_futex_wait);
++
++}
++static struct kobj_attribute futex2_wait_attr = __ATTR_RO(wait);
++
++static ssize_t wake_show(struct kobject *kobj, struct kobj_attribute *attr,
++ char *buf)
++{
++ return sprintf(buf, "%u\n", __NR_futex_wake);
++
++}
++static struct kobj_attribute futex2_wake_attr = __ATTR_RO(wake);
++
++static ssize_t waitv_show(struct kobject *kobj, struct kobj_attribute *attr,
++ char *buf)
++{
++ return sprintf(buf, "%u\n", __NR_futex_waitv);
++
++}
++static struct kobj_attribute futex2_waitv_attr = __ATTR_RO(waitv);
++
++static struct attribute *futex2_sysfs_attrs[] = {
++ &futex2_wait_attr.attr,
++ &futex2_wake_attr.attr,
++ &futex2_waitv_attr.attr,
++ NULL,
++};
++
++static const struct attribute_group futex2_sysfs_attr_group = {
++ .attrs = futex2_sysfs_attrs,
++ .name = "futex2",
++};
++
++static int __init futex2_sysfs_init(void)
++{
++ return sysfs_create_group(kernel_kobj, &futex2_sysfs_attr_group);
++}
++subsys_initcall(futex2_sysfs_init);
++
+ static int __init futex2_init(void)
+ {
+ int i;
+--
+2.28.0
+
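+The attribute group above is created under kernel_kobj, so the numbers
+show up as /sys/kernel/futex2/wait, /sys/kernel/futex2/wake and
+/sys/kernel/futex2/waitv. A minimal userspace probe could look like
+this (illustrative sketch; the helper name is invented):
+
+	#include <stdio.h>
+
+	/*
+	 * Read one of the syscall numbers exported by the futex2 sysfs
+	 * group, e.g. /sys/kernel/futex2/wait. Returns -1 if the group
+	 * is absent (i.e. a kernel without this patch series).
+	 */
+	static long futex2_syscall_nr(const char *name)
+	{
+		char path[64];
+		unsigned int nr;
+		long ret = -1;
+		FILE *f;
+
+		snprintf(path, sizeof(path), "/sys/kernel/futex2/%s", name);
+		f = fopen(path, "r");
+		if (!f)
+			return -1;
+		if (fscanf(f, "%u", &nr) == 1)
+			ret = nr;
+		fclose(f);
+		return ret;
+	}
+
+The value can then be handed to syscall(2), which lets applications
+probe for and use the new syscalls before libc carries wrappers for
+them.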