From 3814de27892f88d7bee46f434d386ada761fd4ba Mon Sep 17 00:00:00 2001 From: Jan200101 Date: Fri, 15 Jan 2021 00:10:49 +0100 Subject: kernel 5.10.6 --- SOURCES/futex2.patch | 6706 +++++++++++--------------------------------------- 1 file changed, 1497 insertions(+), 5209 deletions(-) (limited to 'SOURCES/futex2.patch') diff --git a/SOURCES/futex2.patch b/SOURCES/futex2.patch index bae4138..1bc4486 100644 --- a/SOURCES/futex2.patch +++ b/SOURCES/futex2.patch @@ -1,7 +1,7 @@ -From ada1f13b98e86cb7ac4140c4976c3d165006d995 Mon Sep 17 00:00:00 2001 +From 14a106cc87e6d03169ac8c7ea030e3d7fac2dfe4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Wed, 5 Aug 2020 12:40:26 -0300 -Subject: [PATCH 01/13] futex2: Add new futex interface +Subject: [PATCH 1/9] futex2: Add new futex interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit @@ -10,25 +10,28 @@ Initial implementation for futex2. Support only private u32 wait/wake, with timeout (monotonic and realtime clocks). 
Signed-off-by: André Almeida +Signed-off-by: Jan200101 --- - MAINTAINERS | 2 +- - arch/x86/entry/syscalls/syscall_32.tbl | 2 + - arch/x86/entry/syscalls/syscall_64.tbl | 2 + - include/linux/syscalls.h | 7 + - include/uapi/asm-generic/unistd.h | 8 +- - include/uapi/linux/futex.h | 40 ++ - init/Kconfig | 7 + - kernel/Makefile | 1 + - kernel/futex2.c | 484 +++++++++++++++++++++++++ - kernel/sys_ni.c | 4 + - 10 files changed, 555 insertions(+), 2 deletions(-) + MAINTAINERS | 2 +- + arch/x86/entry/syscalls/syscall_32.tbl | 2 + + arch/x86/entry/syscalls/syscall_64.tbl | 2 + + include/linux/syscalls.h | 7 + + include/uapi/asm-generic/unistd.h | 8 +- + include/uapi/linux/futex.h | 40 ++ + init/Kconfig | 7 + + kernel/Makefile | 1 + + kernel/futex2.c | 484 ++++++++++++++++++ + kernel/sys_ni.c | 4 + + tools/include/uapi/asm-generic/unistd.h | 9 +- + .../arch/x86/entry/syscalls/syscall_64.tbl | 2 + + 12 files changed, 565 insertions(+), 3 deletions(-) create mode 100644 kernel/futex2.c diff --git a/MAINTAINERS b/MAINTAINERS -index 867157311dc8..0c425f74ed88 100644 +index 2daa6ee67..855d38511 100644 --- a/MAINTAINERS +++ b/MAINTAINERS -@@ -7214,7 +7214,7 @@ F: Documentation/locking/*futex* +@@ -7259,7 +7259,7 @@ F: Documentation/locking/*futex* F: include/asm-generic/futex.h F: include/linux/futex.h F: include/uapi/linux/futex.h @@ -38,30 +41,30 @@ index 867157311dc8..0c425f74ed88 100644 F: tools/testing/selftests/futex/ diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl -index 9d1102873666..955322962964 100644 +index 0d0667a9f..83a75ff39 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl -@@ -444,3 +444,5 @@ - 437 i386 openat2 sys_openat2 +@@ -445,3 +445,5 @@ 438 i386 pidfd_getfd sys_pidfd_getfd 439 i386 faccessat2 sys_faccessat2 -+440 i386 futex_wait sys_futex_wait -+441 i386 futex_wake sys_futex_wake + 440 i386 process_madvise sys_process_madvise ++441 i386 futex_wait sys_futex_wait ++442 
i386 futex_wake sys_futex_wake diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl -index f30d6ae9a688..4133bfe96891 100644 +index 379819244..6658fd63c 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl -@@ -361,6 +361,8 @@ - 437 common openat2 sys_openat2 +@@ -362,6 +362,8 @@ 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 -+440 common futex_wait sys_futex_wait -+441 common futex_wake sys_futex_wake + 440 common process_madvise sys_process_madvise ++441 common futex_wait sys_futex_wait ++442 common futex_wake sys_futex_wake # - # x32-specific system call numbers start at 512 to avoid cache impact + # Due to a historical design error, certain syscalls are numbered differently diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h -index 75ac7f8ae93c..38c3a87dbfc2 100644 +index 37bea07c1..b6b77cf2b 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -589,6 +589,13 @@ asmlinkage long sys_get_robust_list(int pid, @@ -79,27 +82,27 @@ index 75ac7f8ae93c..38c3a87dbfc2 100644 asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp, struct __kernel_timespec __user *rmtp); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h -index 995b36c2ea7d..80567ade774a 100644 +index 205631898..ae47d6a9e 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h -@@ -860,8 +860,14 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) - #define __NR_faccessat2 439 - __SYSCALL(__NR_faccessat2, sys_faccessat2) +@@ -860,8 +860,14 @@ __SYSCALL(__NR_faccessat2, sys_faccessat2) + #define __NR_process_madvise 440 + __SYSCALL(__NR_process_madvise, sys_process_madvise) -+#define __NR_futex_wait 440 ++#define __NR_futex_wait 441 +__SYSCALL(__NR_futex_wait, sys_futex_wait) + -+#define __NR_futex_wake 441 ++#define __NR_futex_wake 442 +__SYSCALL(__NR_futex_wake, sys_futex_wake) + #undef 
__NR_syscalls --#define __NR_syscalls 440 -+#define __NR_syscalls 442 +-#define __NR_syscalls 441 ++#define __NR_syscalls 443 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h -index a89eb0accd5e..35a5bf1cd41b 100644 +index a89eb0acc..35a5bf1cd 100644 --- a/include/uapi/linux/futex.h +++ b/include/uapi/linux/futex.h @@ -41,6 +41,46 @@ @@ -150,7 +153,7 @@ index a89eb0accd5e..35a5bf1cd41b 100644 * Support for robust futexes: the kernel cleans up held futexes at * thread exit time. diff --git a/init/Kconfig b/init/Kconfig -index 2a5df1cf838c..440f21f5c3d8 100644 +index 02d13ae27..1264687ea 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1522,6 +1522,13 @@ config FUTEX @@ -168,10 +171,10 @@ index 2a5df1cf838c..440f21f5c3d8 100644 bool depends on FUTEX && RT_MUTEXES diff --git a/kernel/Makefile b/kernel/Makefile -index 9a20016d4900..51ea9bc647bf 100644 +index af601b9bd..bb7f33986 100644 --- a/kernel/Makefile +++ b/kernel/Makefile -@@ -57,6 +57,7 @@ obj-$(CONFIG_PROFILING) += profile.o +@@ -54,6 +54,7 @@ obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ obj-$(CONFIG_FUTEX) += futex.o @@ -181,7 +184,7 @@ index 9a20016d4900..51ea9bc647bf 100644 ifneq ($(CONFIG_SMP),y) diff --git a/kernel/futex2.c b/kernel/futex2.c new file mode 100644 -index 000000000000..107b80a466d0 +index 000000000..107b80a46 --- /dev/null +++ b/kernel/futex2.c @@ -0,0 +1,484 @@ @@ -670,7 +673,7 @@ index 000000000000..107b80a466d0 +} +core_initcall(futex2_init); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c -index 4d59775ea79c..10049bc56c24 100644 +index f27ac94d5..35ff743b1 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -148,6 +148,10 @@ COND_SYSCALL_COMPAT(set_robust_list); @@ -684,13 +687,48 @@ index 4d59775ea79c..10049bc56c24 100644 /* kernel/hrtimer.c */ /* kernel/itimer.c */ +diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h +index 
205631898..cd79f94e0 100644 +--- a/tools/include/uapi/asm-generic/unistd.h ++++ b/tools/include/uapi/asm-generic/unistd.h +@@ -860,8 +860,15 @@ __SYSCALL(__NR_faccessat2, sys_faccessat2) + #define __NR_process_madvise 440 + __SYSCALL(__NR_process_madvise, sys_process_madvise) + ++#define __NR_futex_wait 441 ++__SYSCALL(__NR_futex_wait, sys_futex_wait) ++ ++#define __NR_futex_wake 442 ++__SYSCALL(__NR_futex_wake, sys_futex_wake) ++ + #undef __NR_syscalls +-#define __NR_syscalls 441 ++#define __NR_syscalls 443 ++ + + /* + * 32 bit systems traditionally used different +diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +index 379819244..47de3bf93 100644 +--- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl ++++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +@@ -362,6 +362,8 @@ + 438 common pidfd_getfd sys_pidfd_getfd + 439 common faccessat2 sys_faccessat2 + 440 common process_madvise sys_process_madvise ++441 common futex_wait sys_futex_wait ++442 common futex_wake sys_futex_wake + + # + # Due to a historical design error, certain syscalls are numbered differently -- -2.28.0 +2.29.2 -From 08110d54945541dd186a7dabeef58be08011dde7 Mon Sep 17 00:00:00 2001 + +From d71973d99efb1e2fd2542ea4d4b45b0e03e45b9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Thu, 15 Oct 2020 17:15:57 -0300 -Subject: [PATCH 02/13] futex2: Add suport for vectorized wait +Subject: [PATCH 2/9] futex2: Add suport for vectorized wait MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit @@ -698,54 +736,57 @@ Content-Transfer-Encoding: 8bit Add support to wait on multiple futexes Signed-off-by: André Almeida +Signed-off-by: Jan200101 --- - arch/x86/entry/syscalls/syscall_32.tbl | 1 + - arch/x86/entry/syscalls/syscall_64.tbl | 1 + - include/uapi/asm-generic/unistd.h | 5 +- - kernel/futex2.c | 430 +++++++++++++++++-------- - kernel/sys_ni.c | 1 + - 5 files changed, 304 
insertions(+), 134 deletions(-) + arch/x86/entry/syscalls/syscall_32.tbl | 1 + + arch/x86/entry/syscalls/syscall_64.tbl | 1 + + include/uapi/asm-generic/unistd.h | 5 +- + kernel/futex2.c | 430 ++++++++++++------ + kernel/sys_ni.c | 1 + + tools/include/uapi/asm-generic/unistd.h | 5 +- + .../arch/x86/entry/syscalls/syscall_64.tbl | 1 + + 7 files changed, 309 insertions(+), 135 deletions(-) diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl -index 955322962964..c844c0cbf0e5 100644 +index 83a75ff39..65734d5e1 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl -@@ -446,3 +446,4 @@ - 439 i386 faccessat2 sys_faccessat2 - 440 i386 futex_wait sys_futex_wait - 441 i386 futex_wake sys_futex_wake -+442 i386 futex_waitv sys_futex_waitv +@@ -447,3 +447,4 @@ + 440 i386 process_madvise sys_process_madvise + 441 i386 futex_wait sys_futex_wait + 442 i386 futex_wake sys_futex_wake ++443 i386 futex_waitv sys_futex_waitv diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl -index 4133bfe96891..0901c26c6786 100644 +index 6658fd63c..f30811b56 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl -@@ -363,6 +363,7 @@ - 439 common faccessat2 sys_faccessat2 - 440 common futex_wait sys_futex_wait - 441 common futex_wake sys_futex_wake -+442 common futex_waitv sys_futex_waitv +@@ -364,6 +364,7 @@ + 440 common process_madvise sys_process_madvise + 441 common futex_wait sys_futex_wait + 442 common futex_wake sys_futex_wake ++443 common futex_waitv sys_futex_waitv # - # x32-specific system call numbers start at 512 to avoid cache impact + # Due to a historical design error, certain syscalls are numbered differently diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h -index 80567ade774a..d7ebbed0a18c 100644 +index ae47d6a9e..81a90b697 100644 --- a/include/uapi/asm-generic/unistd.h +++ 
b/include/uapi/asm-generic/unistd.h @@ -866,8 +866,11 @@ __SYSCALL(__NR_futex_wait, sys_futex_wait) - #define __NR_futex_wake 441 + #define __NR_futex_wake 442 __SYSCALL(__NR_futex_wake, sys_futex_wake) -+#define __NR_futex_waitv 442 ++#define __NR_futex_waitv 443 +__SYSCALL(__NR_futex_waitv, sys_futex_waitv) + #undef __NR_syscalls --#define __NR_syscalls 442 -+#define __NR_syscalls 443 +-#define __NR_syscalls 443 ++#define __NR_syscalls 444 /* * 32 bit systems traditionally used different diff --git a/kernel/futex2.c b/kernel/futex2.c -index 107b80a466d0..4b782b5ef615 100644 +index 107b80a46..4b782b5ef 100644 --- a/kernel/futex2.c +++ b/kernel/futex2.c @@ -48,14 +48,25 @@ struct futex_bucket { @@ -1286,7 +1327,7 @@ index 107b80a466d0..4b782b5ef615 100644 get_task_struct(task); list_del_init_careful(&aux->list); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c -index 10049bc56c24..3e1a713d3e57 100644 +index 35ff743b1..1898e7340 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -151,6 +151,7 @@ COND_SYSCALL_COMPAT(get_robust_list); @@ -1297,5353 +1338,1242 @@ index 10049bc56c24..3e1a713d3e57 100644 /* kernel/hrtimer.c */ +diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h +index cd79f94e0..7de33be59 100644 +--- a/tools/include/uapi/asm-generic/unistd.h ++++ b/tools/include/uapi/asm-generic/unistd.h +@@ -866,8 +866,11 @@ __SYSCALL(__NR_futex_wait, sys_futex_wait) + #define __NR_futex_wake 442 + __SYSCALL(__NR_futex_wake, sys_futex_wake) + ++#define __NR_futex_waitv 443 ++__SYSCALL(__NR_futex_waitv, sys_futex_waitv) ++ + #undef __NR_syscalls +-#define __NR_syscalls 443 ++#define __NR_syscalls 444 + + + /* +diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +index 47de3bf93..bd47f368f 100644 +--- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl ++++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +@@ -364,6 +364,7 @@ + 440 common process_madvise 
sys_process_madvise + 441 common futex_wait sys_futex_wait + 442 common futex_wake sys_futex_wake ++443 common futex_waitv sys_futex_waitv + + # + # Due to a historical design error, certain syscalls are numbered differently -- -2.28.0 +2.29.2 -From d8120d2ee1729a6933a606a6720f3e3116e4f699 Mon Sep 17 00:00:00 2001 + +From 24681616a5432f7680f934abf335a9ab9a1eaf1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= -Date: Thu, 9 Jul 2020 11:34:40 -0300 -Subject: [PATCH 03/13] selftests: futex: Add futex2 wake/wait test +Date: Thu, 15 Oct 2020 18:06:40 -0300 +Subject: [PATCH 3/9] futex2: Add support for shared futexes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit -Add a simple test to test wake/wait mechanism using futex2 interface. -Create helper files so more tests can evaluate futex2. While 32bit ABIs -from glibc aren't able to use 64 bit sized time variables, add a -temporary workaround that implements the required types and calls the -appropriated syscalls, since futex2 doesn't supports 32 bit sized time. +Add support for shared futexes for cross-process resources. 
Signed-off-by: André Almeida +Signed-off-by: Jan200101 --- - tools/include/uapi/asm-generic/unistd.h | 7 +- - .../selftests/futex/functional/.gitignore | 1 + - .../selftests/futex/functional/Makefile | 4 +- - .../selftests/futex/functional/futex2_wait.c | 111 ++++++++++++++++++ - .../testing/selftests/futex/functional/run.sh | 3 + - .../selftests/futex/include/futex2test.h | 77 ++++++++++++ - 6 files changed, 201 insertions(+), 2 deletions(-) - create mode 100644 tools/testing/selftests/futex/functional/futex2_wait.c - create mode 100644 tools/testing/selftests/futex/include/futex2test.h + kernel/futex2.c | 187 ++++++++++++++++++++++++++++++++++++++++++------ + 1 file changed, 165 insertions(+), 22 deletions(-) -diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h -index 995b36c2ea7d..dd457de21bad 100644 ---- a/tools/include/uapi/asm-generic/unistd.h -+++ b/tools/include/uapi/asm-generic/unistd.h -@@ -860,8 +860,13 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) - #define __NR_faccessat2 439 - __SYSCALL(__NR_faccessat2, sys_faccessat2) - -+#define __NR_futex_wait 440 -+__SYSCALL(__NR_futex_wait, sys_futex_wait) -+#define __NR_futex_wake 441 -+__SYSCALL(__NR_futex_wake, sys_futex_wake) -+ - #undef __NR_syscalls --#define __NR_syscalls 440 -+#define __NR_syscalls 442 - - /* - * 32 bit systems traditionally used different -diff --git a/tools/testing/selftests/futex/functional/.gitignore b/tools/testing/selftests/futex/functional/.gitignore -index 0efcd494daab..d61f1df94360 100644 ---- a/tools/testing/selftests/futex/functional/.gitignore -+++ b/tools/testing/selftests/futex/functional/.gitignore -@@ -6,3 +6,4 @@ futex_wait_private_mapped_file - futex_wait_timeout - futex_wait_uninitialized_heap - futex_wait_wouldblock -+futex2_wait -diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile -index 23207829ec75..7142a94a7ac3 100644 ---- 
a/tools/testing/selftests/futex/functional/Makefile -+++ b/tools/testing/selftests/futex/functional/Makefile -@@ -5,6 +5,7 @@ LDLIBS := -lpthread -lrt +diff --git a/kernel/futex2.c b/kernel/futex2.c +index 4b782b5ef..5ddb9922d 100644 +--- a/kernel/futex2.c ++++ b/kernel/futex2.c +@@ -6,7 +6,9 @@ + */ - HEADERS := \ - ../include/futextest.h \ -+ ../include/futex2test.h \ - ../include/atomic.h \ - ../include/logging.h - TEST_GEN_FILES := \ -@@ -14,7 +15,8 @@ TEST_GEN_FILES := \ - futex_requeue_pi_signal_restart \ - futex_requeue_pi_mismatched_ops \ - futex_wait_uninitialized_heap \ -- futex_wait_private_mapped_file -+ futex_wait_private_mapped_file \ -+ futex2_wait + #include ++#include + #include ++#include + #include + #include + #include +@@ -15,6 +17,7 @@ - TEST_PROGS := run.sh + /** + * struct futex_waiter - List entry for a waiter ++ * @uaddr: Memory address of userspace futex + * @key.address: Memory address of userspace futex + * @key.mm: Pointer to memory management struct of this process + * @key: Stores information that uniquely identify a futex +@@ -25,9 +28,11 @@ + * @index: Index of waiter in futexv list + */ + struct futex_waiter { ++ uintptr_t uaddr; + struct futex_key { + uintptr_t address; + struct mm_struct *mm; ++ unsigned long int offset; + } key; + struct list_head list; + unsigned int val; +@@ -125,16 +130,116 @@ static inline int bucket_get_waiters(struct futex_bucket *bucket) + #endif + } -diff --git a/tools/testing/selftests/futex/functional/futex2_wait.c b/tools/testing/selftests/futex/functional/futex2_wait.c -new file mode 100644 -index 000000000000..752ed26803b3 ---- /dev/null -+++ b/tools/testing/selftests/futex/functional/futex2_wait.c -@@ -0,0 +1,111 @@ -+// SPDX-License-Identifier: GPL-2.0-or-later -+/****************************************************************************** -+ * -+ * Copyright Collabora Ltd., 2020 -+ * -+ * DESCRIPTION -+ * Test wait/wake mechanism of futex2, using 32bit sized futexes. 
-+ * -+ * AUTHOR -+ * André Almeida -+ * -+ * HISTORY -+ * 2020-Jul-9: Initial version by André -+ * -+ *****************************************************************************/ ++static u64 get_inode_sequence_number(struct inode *inode) ++{ ++ static atomic64_t i_seq; ++ u64 old; + -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "futex2test.h" -+#include "logging.h" ++ /* Does the inode already have a sequence number? */ ++ old = atomic64_read(&inode->i_sequence); ++ if (likely(old)) ++ return old; + -+#define TEST_NAME "futex-wait-wouldblock" -+#define timeout_ns 30000000 -+#define WAKE_WAIT_US 10000 -+futex_t f1 = FUTEX_INITIALIZER; ++ for (;;) { ++ u64 new = atomic64_add_return(1, &i_seq); ++ if (WARN_ON_ONCE(!new)) ++ continue; + -+void usage(char *prog) -+{ -+ printf("Usage: %s\n", prog); -+ printf(" -c Use color\n"); -+ printf(" -h Display this help message\n"); -+ printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", -+ VQUIET, VCRITICAL, VINFO); ++ old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new); ++ if (old) ++ return old; ++ return new; ++ } +} + -+void *waiterfn(void *arg) ++#define FUT_OFF_INODE 1 /* We set bit 0 if key has a reference on inode */ ++#define FUT_OFF_MMSHARED 2 /* We set bit 1 if key has a reference on mm */ ++ ++static int futex_get_shared_key(uintptr_t address, struct mm_struct *mm, ++ struct futex_key *key) +{ -+ struct timespec64 to64; ++ int err; ++ struct page *page, *tail; ++ struct address_space *mapping; + -+ /* setting absolute timeout for futex2 */ -+ if (gettime64(CLOCK_MONOTONIC, &to64)) -+ error("gettime64 failed\n", errno); ++again: ++ err = get_user_pages_fast(address, 1, 0, &page); + -+ to64.tv_nsec += timeout_ns; ++ if (err < 0) ++ return err; ++ else ++ err = 0; + -+ if (to64.tv_nsec >= 1000000000) { -+ to64.tv_sec++; -+ to64.tv_nsec -= 1000000000; -+ } + -+ if (futex2_wait(&f1, f1, FUTEX_PRIVATE_FLAG | FUTEX_32, &to64)) -+ printf("waiter 
failed errno %d\n", errno); ++ tail = page; ++ page = compound_head(page); ++ mapping = READ_ONCE(page->mapping); + -+ return NULL; -+} + -+int main(int argc, char *argv[]) -+{ -+ pthread_t waiter; -+ int res, ret = RET_PASS; -+ int c; ++ if (unlikely(!mapping)) { ++ int shmem_swizzled; + -+ while ((c = getopt(argc, argv, "cht:v:")) != -1) { -+ switch (c) { -+ case 'c': -+ log_color(1); -+ break; -+ case 'h': -+ usage(basename(argv[0])); -+ exit(0); -+ case 'v': -+ log_verbosity(atoi(optarg)); -+ break; -+ default: -+ usage(basename(argv[0])); -+ exit(1); -+ } ++ lock_page(page); ++ shmem_swizzled = PageSwapCache(page) || page->mapping; ++ unlock_page(page); ++ put_page(page); ++ ++ if (shmem_swizzled) ++ goto again; ++ ++ return -EFAULT; + } + -+ ksft_print_header(); -+ ksft_set_plan(1); -+ ksft_print_msg("%s: Test FUTEX_WAIT\n", -+ basename(argv[0])); ++ if (PageAnon(page)) { + -+ info("Calling futex_wait on f1: %u @ %p with val=%u\n", f1, &f1, f1); ++ key->mm = mm; ++ key->address = address; + -+ if (pthread_create(&waiter, NULL, waiterfn, NULL)) -+ error("pthread_create failed\n", errno); ++ key->offset |= FUT_OFF_MMSHARED; + -+ usleep(WAKE_WAIT_US); ++ } else { ++ struct inode *inode; + -+ info("Calling futex2_wake on f1: %u @ %p with val=%u\n", f1, &f1, f1); -+ res = futex2_wake(&f1, 1, FUTEX_PRIVATE_FLAG | FUTEX_32); -+ if (res != 1) { -+ ksft_test_result_fail("futex2_wake returned: %d %s\n", -+ res ? errno : res, -+ res ? 
strerror(errno) : ""); -+ ret = RET_FAIL; -+ } else { -+ ksft_test_result_pass("futex2_wake wouldblock succeeds\n"); -+ } -+ -+ ksft_print_cnts(); -+ return ret; -+} -diff --git a/tools/testing/selftests/futex/functional/run.sh b/tools/testing/selftests/futex/functional/run.sh -index 1acb6ace1680..3730159c865a 100755 ---- a/tools/testing/selftests/futex/functional/run.sh -+++ b/tools/testing/selftests/futex/functional/run.sh -@@ -73,3 +73,6 @@ echo - echo - ./futex_wait_uninitialized_heap $COLOR - ./futex_wait_private_mapped_file $COLOR ++ rcu_read_lock(); + -+echo -+./futex2_wait $COLOR -diff --git a/tools/testing/selftests/futex/include/futex2test.h b/tools/testing/selftests/futex/include/futex2test.h -new file mode 100644 -index 000000000000..807b8b57fe61 ---- /dev/null -+++ b/tools/testing/selftests/futex/include/futex2test.h -@@ -0,0 +1,77 @@ -+/* SPDX-License-Identifier: GPL-2.0-or-later */ -+/****************************************************************************** -+ * -+ * Copyright Collabora Ltd., 2020 -+ * -+ * DESCRIPTION -+ * Futex2 library addons for old futex library -+ * -+ * AUTHOR -+ * André Almeida -+ * -+ * HISTORY -+ * 2020-Jul-9: Initial version by André -+ * -+ *****************************************************************************/ -+#include "futextest.h" -+#include ++ if (READ_ONCE(page->mapping) != mapping) { ++ rcu_read_unlock(); ++ put_page(page); + -+#define NSEC_PER_SEC 1000000000L ++ goto again; ++ } + -+#ifndef FUTEX_8 -+# define FUTEX_8 0 -+#endif -+#ifndef FUTEX_16 -+# define FUTEX_16 1 -+#endif -+#ifndef FUTEX_32 -+#define FUTEX_32 2 -+#endif -+#ifdef __x86_64__ -+# ifndef FUTEX_64 -+# define FUTEX_64 3 -+# endif -+#endif ++ inode = READ_ONCE(mapping->host); ++ if (!inode) { ++ rcu_read_unlock(); ++ put_page(page); + -+/* -+ * - Y2038 section for 32-bit applications - -+ * -+ * Remove this when glibc is ready for y2038. Then, always compile with -+ * `-DTIME_BITS=64` or `-D__USE_TIME_BITS64`. 
glibc will provide both -+ * timespec64 and clock_gettime64 so we won't need to define here. -+ */ -+#if defined(__i386__) || __TIMESIZE == 32 -+# define NR_gettime __NR_clock_gettime64 -+#else -+# define NR_gettime __NR_clock_gettime -+#endif ++ goto again; ++ } + -+struct timespec64 { -+ long long tv_sec; /* seconds */ -+ long long tv_nsec; /* nanoseconds */ -+}; ++ key->address = get_inode_sequence_number(inode); ++ key->mm = (struct mm_struct *) basepage_index(tail); ++ key->offset |= FUT_OFF_INODE; + -+int gettime64(clock_t clockid, struct timespec64 *tv) -+{ -+ return syscall(NR_gettime, clockid, tv); -+} -+/* -+ * - End of Y2038 section - -+ */ ++ rcu_read_unlock(); ++ } + -+/* -+ * wait for uaddr if (*uaddr == val) -+ */ -+static inline int futex2_wait(volatile void *uaddr, unsigned long val, -+ unsigned long flags, struct timespec64 *timo) -+{ -+ return syscall(__NR_futex_wait, uaddr, val, flags, timo); ++ put_page(page); ++ return err; +} + -+/* -+ * wake nr futexes waiting for uaddr -+ */ -+static inline int futex2_wake(volatile void *uaddr, unsigned int nr, unsigned long flags) -+{ -+ return syscall(__NR_futex_wake, uaddr, nr, flags); -+} --- -2.28.0 - -From d4a7ca72f276b2e337eaedcbbe58a2782e0e7d3b Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Andr=C3=A9=20Almeida?= -Date: Thu, 9 Jul 2020 11:36:14 -0300 -Subject: [PATCH 04/13] selftests: futex: Add futex2 timeout test -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Adapt existing futex wait timeout file to test the same mechanism for -futex2. 
- -Signed-off-by: André Almeida ---- - .../futex/functional/futex_wait_timeout.c | 38 ++++++++++++++----- - 1 file changed, 29 insertions(+), 9 deletions(-) - -diff --git a/tools/testing/selftests/futex/functional/futex_wait_timeout.c b/tools/testing/selftests/futex/functional/futex_wait_timeout.c -index ee55e6d389a3..d2e7ae18985b 100644 ---- a/tools/testing/selftests/futex/functional/futex_wait_timeout.c -+++ b/tools/testing/selftests/futex/functional/futex_wait_timeout.c -@@ -11,6 +11,7 @@ - * - * HISTORY - * 2009-Nov-6: Initial version by Darren Hart -+ * 2020-Jul-9: Add futex2 test by André + /** + * futex_get_bucket - Check if the user address is valid, prepare internal + * data and calculate the hash + * @uaddr: futex user address + * @key: data that uniquely identifies a futex ++ * @shared: is this a shared futex? * - *****************************************************************************/ + * Return: address of bucket on success, error code otherwise + */ + static struct futex_bucket *futex_get_bucket(void __user *uaddr, +- struct futex_key *key) ++ struct futex_key *key, ++ bool shared) + { + uintptr_t address = (uintptr_t) uaddr; + u32 hash_key; +@@ -145,8 +250,15 @@ static struct futex_bucket *futex_get_bucket(void __user *uaddr, + if (unlikely(!access_ok(address, sizeof(u32)))) + return ERR_PTR(-EFAULT); -@@ -20,7 +21,7 @@ - #include - #include - #include --#include "futextest.h" -+#include "futex2test.h" - #include "logging.h" +- key->address = address; +- key->mm = current->mm; ++ key->offset = address % PAGE_SIZE; ++ address -= key->offset; ++ ++ if (!shared) { ++ key->address = address; ++ key->mm = current->mm; ++ } else { ++ futex_get_shared_key(address, current->mm, key); ++ } - #define TEST_NAME "futex-wait-timeout" -@@ -40,7 +41,8 @@ void usage(char *prog) - int main(int argc, char *argv[]) + /* Generate hash key for this futex using uaddr and current->mm */ + hash_key = jhash2((u32 *) key, sizeof(*key) / sizeof(u32), 0); +@@ -275,9 
+387,10 @@ static int futex_dequeue_multiple(struct futexv *futexv, unsigned int nr) + * Return: 0 on success, error code otherwise + */ + static int futex_enqueue(struct futexv *futexv, unsigned int nr_futexes, +- unsigned int *awaken) ++ int *awaken) { - futex_t f1 = FUTEX_INITIALIZER; -- struct timespec to; -+ struct timespec to = {.tv_sec = 0, .tv_nsec = timeout_ns}; -+ struct timespec64 to64; - int res, ret = RET_PASS; - int c; - -@@ -65,22 +67,40 @@ int main(int argc, char *argv[]) - } + int i, ret; ++ bool shared, retry = false; + u32 uval, *uaddr, val; + struct futex_bucket *bucket; - ksft_print_header(); -- ksft_set_plan(1); -+ ksft_set_plan(2); - ksft_print_msg("%s: Block on a futex and wait for timeout\n", - basename(argv[0])); - ksft_print_msg("\tArguments: timeout=%ldns\n", timeout_ns); +@@ -285,8 +398,18 @@ static int futex_enqueue(struct futexv *futexv, unsigned int nr_futexes, + set_current_state(TASK_INTERRUPTIBLE); -- /* initialize timeout */ -- to.tv_sec = 0; -- to.tv_nsec = timeout_ns; -- - info("Calling futex_wait on f1: %u @ %p\n", f1, &f1); - res = futex_wait(&f1, f1, &to, FUTEX_PRIVATE_FLAG); - if (!res || errno != ETIMEDOUT) { -- fail("futex_wait returned %d\n", ret < 0 ? errno : ret); -+ ksft_test_result_fail("futex_wait returned %d\n", ret < 0 ? errno : ret); -+ ret = RET_FAIL; -+ } else { -+ ksft_test_result_pass("futex_wait timeout succeeds\n"); -+ } + for (i = 0; i < nr_futexes; i++) { +- uaddr = (u32 * __user) futexv->objects[i].key.address; ++ uaddr = (u32 * __user) futexv->objects[i].uaddr; + val = (u32) futexv->objects[i].val; ++ shared = (futexv->objects[i].flags & FUTEX_SHARED_FLAG) ? 
true : false; + -+ /* setting absolute timeout for futex2 */ -+ if (gettime64(CLOCK_MONOTONIC, &to64)) -+ error("gettime64 failed\n", errno); ++ if (shared && retry) { ++ futexv->objects[i].bucket = ++ futex_get_bucket((void *) uaddr, ++ &futexv->objects[i].key, true); ++ if (IS_ERR(futexv->objects[i].bucket)) ++ return PTR_ERR(futexv->objects[i].bucket); ++ } + -+ to64.tv_nsec += timeout_ns; + bucket = futexv->objects[i].bucket; + + bucket_inc_waiters(bucket); +@@ -301,24 +424,32 @@ static int futex_enqueue(struct futexv *futexv, unsigned int nr_futexes, + __set_current_state(TASK_RUNNING); + *awaken = futex_dequeue_multiple(futexv, i); + ++ if (shared) { ++ retry = true; ++ goto retry; ++ } + -+ if (to64.tv_nsec >= 1000000000) { -+ to64.tv_sec++; -+ to64.tv_nsec -= 1000000000; -+ } + if (__get_user(uval, uaddr)) + return -EFAULT; + + if (*awaken >= 0) +- return 0; ++ return 1; + ++ retry = true; + goto retry; + } + + if (uval != val) { + spin_unlock(&bucket->lock); + + -+ info("Calling futex2_wait on f1: %u @ %p\n", f1, &f1); -+ res = futex2_wait(&f1, f1, FUTEX_PRIVATE_FLAG | FUTEX_32, &to64); -+ if (!res || errno != ETIMEDOUT) { -+ ksft_test_result_fail("futex2_wait returned %d\n", ret < 0 ? errno : ret); - ret = RET_FAIL; -+ } else { -+ ksft_test_result_pass("futex2_wait timeout succeeds\n"); - } + bucket_dec_waiters(bucket); + __set_current_state(TASK_RUNNING); + *awaken = futex_dequeue_multiple(futexv, i); -- print_result(TEST_NAME, ret); -+ ksft_print_cnts(); - return ret; - } --- -2.28.0 - -From 6d2252d43d36a5eb2b9170351128007e27f47737 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Andr=C3=A9=20Almeida?= -Date: Thu, 9 Jul 2020 11:37:42 -0300 -Subject: [PATCH 05/13] selftests: futex: Add futex2 wouldblock test -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Adapt existing futex wait wouldblock file to test the same mechanism for -futex2. 
- -Signed-off-by: André Almeida ---- - .../futex/functional/futex_wait_wouldblock.c | 33 ++++++++++++++++--- - 1 file changed, 29 insertions(+), 4 deletions(-) - -diff --git a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c -index 0ae390ff8164..8187f0754cd2 100644 ---- a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c -+++ b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c -@@ -12,6 +12,7 @@ - * - * HISTORY - * 2009-Nov-14: Initial version by Gowrishankar -+ * 2020-Jul-9: Add futex2 test by André - * - *****************************************************************************/ - -@@ -21,7 +22,7 @@ - #include - #include - #include --#include "futextest.h" -+#include "futex2test.h" - #include "logging.h" +- if (*awaken >= 0) +- return 0; ++ if (*awaken >= 0) { ++ return 1; ++ } - #define TEST_NAME "futex-wait-wouldblock" -@@ -39,6 +40,7 @@ void usage(char *prog) - int main(int argc, char *argv[]) + return -EWOULDBLOCK; + } +@@ -336,19 +467,18 @@ static int __futex_wait(struct futexv *futexv, + struct hrtimer_sleeper *timeout) { - struct timespec to = {.tv_sec = 0, .tv_nsec = timeout_ns}; -+ struct timespec64 to64; - futex_t f1 = FUTEX_INITIALIZER; - int res, ret = RET_PASS; - int c; -@@ -61,18 +63,41 @@ int main(int argc, char *argv[]) - } + int ret; +- unsigned int awaken = -1; - ksft_print_header(); -- ksft_set_plan(1); -+ ksft_set_plan(2); - ksft_print_msg("%s: Test the unexpected futex value in FUTEX_WAIT\n", - basename(argv[0])); +- while (1) { +- ret = futex_enqueue(futexv, nr_futexes, &awaken); - info("Calling futex_wait on f1: %u @ %p with val=%u\n", f1, &f1, f1+1); - res = futex_wait(&f1, f1+1, &to, FUTEX_PRIVATE_FLAG); - if (!res || errno != EWOULDBLOCK) { -- fail("futex_wait returned: %d %s\n", -+ ksft_test_result_fail("futex_wait returned: %d %s\n", - res ? errno : res, res ? 
strerror(errno) : ""); - ret = RET_FAIL; -+ } else { -+ ksft_test_result_pass("futex_wait wouldblock succeeds\n"); +- if (ret < 0) +- break; ++ while (1) { ++ int awaken = -1; + +- if (awaken <= 0) { +- return awaken; ++ ret = futex_enqueue(futexv, nr_futexes, &awaken); ++ if (ret) { ++ if (awaken >= 0) ++ return awaken; ++ return ret; + } + +- + /* Before sleeping, check if someone was woken */ + if (!futexv->hint && (!timeout || timeout->task)) + freezable_schedule(); +@@ -419,6 +549,7 @@ static int futex_wait(struct futexv *futexv, unsigned int nr_futexes, + hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS); } -- print_result(TEST_NAME, ret); -+ /* setting absolute timeout for futex2 */ -+ if (gettime64(CLOCK_MONOTONIC, &to64)) -+ error("gettime64 failed\n", errno); -+ -+ to64.tv_nsec += timeout_ns; -+ -+ if (to64.tv_nsec >= 1000000000) { -+ to64.tv_sec++; -+ to64.tv_nsec -= 1000000000; -+ } + -+ info("Calling futex2_wait on f1: %u @ %p with val=%u\n", f1, &f1, f1+1); -+ res = futex2_wait(&f1, f1+1, FUTEX_PRIVATE_FLAG | FUTEX_32, &to64); -+ if (!res || errno != EWOULDBLOCK) { -+ ksft_test_result_fail("futex2_wait returned: %d %s\n", -+ res ? errno : res, res ? strerror(errno) : ""); -+ ret = RET_FAIL; -+ } else { -+ ksft_test_result_pass("futex2_wait wouldblock succeeds\n"); + ret = __futex_wait(futexv, nr_futexes, timo ? timeout : NULL); + + +@@ -438,9 +569,10 @@ static int futex_wait(struct futexv *futexv, unsigned int nr_futexes, + SYSCALL_DEFINE4(futex_wait, void __user *, uaddr, unsigned int, val, + unsigned int, flags, struct __kernel_timespec __user *, timo) + { ++ bool shared = (flags & FUTEX_SHARED_FLAG) ? 
true : false; + unsigned int size = flags & FUTEX_SIZE_MASK; +- struct hrtimer_sleeper timeout; + struct futex_single_waiter wait_single; ++ struct hrtimer_sleeper timeout; + struct futex_waiter *waiter; + struct futexv *futexv; + int ret; +@@ -452,6 +584,7 @@ SYSCALL_DEFINE4(futex_wait, void __user *, uaddr, unsigned int, val, + waiter = &wait_single.waiter; + waiter->index = 0; + waiter->val = val; ++ waiter->uaddr = (uintptr_t) uaddr; + + INIT_LIST_HEAD(&waiter->list); + +@@ -462,11 +595,14 @@ SYSCALL_DEFINE4(futex_wait, void __user *, uaddr, unsigned int, val, + return -EINVAL; + + /* Get an unlocked hash bucket */ +- waiter->bucket = futex_get_bucket(uaddr, &waiter->key); +- if (IS_ERR(waiter->bucket)) ++ waiter->bucket = futex_get_bucket(uaddr, &waiter->key, shared); ++ if (IS_ERR(waiter->bucket)) { + return PTR_ERR(waiter->bucket); + } -+ -+ ksft_print_cnts(); + + ret = futex_wait(futexv, 1, timo, &timeout, flags); ++ if (ret > 0) ++ ret = 0; + return ret; } +@@ -486,8 +622,10 @@ static int futex_parse_waitv(struct futexv *futexv, + struct futex_waitv waitv; + unsigned int i; + struct futex_bucket *bucket; ++ bool shared; + + for (i = 0; i < nr_futexes; i++) { ++ + if (copy_from_user(&waitv, &uwaitv[i], sizeof(waitv))) + return -EFAULT; + +@@ -495,8 +633,10 @@ static int futex_parse_waitv(struct futexv *futexv, + (waitv.flags & FUTEX_SIZE_MASK) != FUTEX_32) + return -EINVAL; + ++ shared = (waitv.flags & FUTEX_SHARED_FLAG) ? 
true : false; ++ + bucket = futex_get_bucket(waitv.uaddr, +- &futexv->objects[i].key); ++ &futexv->objects[i].key, shared); + if (IS_ERR(bucket)) + return PTR_ERR(bucket); + +@@ -505,6 +645,7 @@ static int futex_parse_waitv(struct futexv *futexv, + futexv->objects[i].flags = waitv.flags; + futexv->objects[i].index = i; + INIT_LIST_HEAD(&futexv->objects[i].list); ++ futexv->objects[i].uaddr = (uintptr_t) waitv.uaddr; + } + + return 0; +@@ -573,6 +714,7 @@ static struct futexv *futex_get_parent(uintptr_t waiter, u8 index) + SYSCALL_DEFINE3(futex_wake, void __user *, uaddr, unsigned int, nr_wake, + unsigned int, flags) + { ++ bool shared = (flags & FUTEX_SHARED_FLAG) ? true : false; + unsigned int size = flags & FUTEX_SIZE_MASK; + struct futex_waiter waiter, *aux, *tmp; + struct futex_bucket *bucket; +@@ -586,7 +728,7 @@ SYSCALL_DEFINE3(futex_wake, void __user *, uaddr, unsigned int, nr_wake, + if (size != FUTEX_32) + return -EINVAL; + +- bucket = futex_get_bucket(uaddr, &waiter.key); ++ bucket = futex_get_bucket(uaddr, &waiter.key, shared); + if (IS_ERR(bucket)) + return PTR_ERR(bucket); + +@@ -599,7 +741,8 @@ SYSCALL_DEFINE3(futex_wake, void __user *, uaddr, unsigned int, nr_wake, + break; + + if (waiter.key.address == aux->key.address && +- waiter.key.mm == aux->key.mm) { ++ waiter.key.mm == aux->key.mm && ++ waiter.key.offset == aux->key.offset) { + struct futexv *parent = + futex_get_parent((uintptr_t) aux, aux->index); + -- -2.28.0 +2.29.2 + -From 6b35a09be663f5a844e089f1ddd370137832e7a7 Mon Sep 17 00:00:00 2001 +From ce3ae4bd9f98763fda07f315c1f239c4aaef4b5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= -Date: Wed, 14 Oct 2020 16:10:09 -0300 -Subject: [PATCH 06/13] DONOTMERGE: futex: Add a clone of futex implementation +Date: Thu, 9 Jul 2020 11:34:40 -0300 +Subject: [PATCH 4/9] selftests: futex: Add futex2 wake/wait test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit -For comparative performance tests 
between the original futex and the new -futex2 interface, create a clone of the current futex. In that way, we -can have a fair comparison, since the futex2 table will be empty with no -contention for the bucket locks. Since futex is widely used in the host -system, the performance tests could get misleading results by the tests -competing with the system for resources. +Add a simple test to test wake/wait mechanism using futex2 interface. +Create helper files so more tests can evaluate futex2. While 32bit ABIs +from glibc aren't able to use 64 bit sized time variables, add a +temporary workaround that implements the required types and calls the +appropriated syscalls, since futex2 doesn't supports 32 bit sized time. Signed-off-by: André Almeida +Signed-off-by: Jan200101 --- - arch/x86/entry/syscalls/syscall_32.tbl | 1 + - arch/x86/entry/syscalls/syscall_64.tbl | 1 + - include/linux/syscalls.h | 3 + - include/uapi/asm-generic/unistd.h | 5 +- - kernel/Makefile | 1 + - kernel/futex1.c | 3384 +++++++++++++++++ - kernel/sys_ni.c | 2 + - tools/arch/x86/include/asm/unistd_64.h | 12 + - tools/include/uapi/asm-generic/unistd.h | 6 +- - .../arch/x86/entry/syscalls/syscall_64.tbl | 3 + - tools/perf/bench/futex.h | 23 +- - 11 files changed, 3438 insertions(+), 3 deletions(-) - create mode 100644 kernel/futex1.c + tools/include/uapi/asm-generic/unistd.h | 1 - + .../selftests/futex/functional/.gitignore | 1 + + .../selftests/futex/functional/Makefile | 4 +- + .../selftests/futex/functional/futex2_wait.c | 148 ++++++++++++++++++ + .../testing/selftests/futex/functional/run.sh | 3 + + .../selftests/futex/include/futex2test.h | 77 +++++++++ + 6 files changed, 232 insertions(+), 2 deletions(-) + create mode 100644 tools/testing/selftests/futex/functional/futex2_wait.c + create mode 100644 tools/testing/selftests/futex/include/futex2test.h -diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl -index c844c0cbf0e5..820fa53ccf75 100644 ---- 
a/arch/x86/entry/syscalls/syscall_32.tbl -+++ b/arch/x86/entry/syscalls/syscall_32.tbl -@@ -447,3 +447,4 @@ - 440 i386 futex_wait sys_futex_wait - 441 i386 futex_wake sys_futex_wake - 442 i386 futex_waitv sys_futex_waitv -+443 i386 futex1 sys_futex1 -diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl -index 0901c26c6786..99795136cb98 100644 ---- a/arch/x86/entry/syscalls/syscall_64.tbl -+++ b/arch/x86/entry/syscalls/syscall_64.tbl -@@ -364,6 +364,7 @@ - 440 common futex_wait sys_futex_wait - 441 common futex_wake sys_futex_wake - 442 common futex_waitv sys_futex_waitv -+443 common futex1 sys_futex1 - - # - # x32-specific system call numbers start at 512 to avoid cache impact -diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h -index 38c3a87dbfc2..0351f6ad09a9 100644 ---- a/include/linux/syscalls.h -+++ b/include/linux/syscalls.h -@@ -596,6 +596,9 @@ asmlinkage long sys_futex_wait(void __user *uaddr, unsigned long val, - asmlinkage long sys_futex_wake(void __user *uaddr, unsigned long nr_wake, - unsigned long flags); - -+asmlinkage long sys_futex1(void __user *uaddr, unsigned long nr_wake, -+ unsigned long flags); -+ - /* kernel/hrtimer.c */ - asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp, - struct __kernel_timespec __user *rmtp); -diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h -index d7ebbed0a18c..e3ba6cb1f76d 100644 ---- a/include/uapi/asm-generic/unistd.h -+++ b/include/uapi/asm-generic/unistd.h -@@ -869,8 +869,11 @@ __SYSCALL(__NR_futex_wake, sys_futex_wake) - #define __NR_futex_waitv 442 - __SYSCALL(__NR_futex_waitv, sys_futex_waitv) - -+#define __NR_futex1 443 -+__SYSCALL(__NR_futex1, sys_futex1) -+ +diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h +index 7de33be59..81a90b697 100644 +--- a/tools/include/uapi/asm-generic/unistd.h ++++ b/tools/include/uapi/asm-generic/unistd.h +@@ -872,7 +872,6 @@ 
__SYSCALL(__NR_futex_waitv, sys_futex_waitv) #undef __NR_syscalls --#define __NR_syscalls 443 -+#define __NR_syscalls 444 + #define __NR_syscalls 444 +- /* * 32 bit systems traditionally used different -diff --git a/kernel/Makefile b/kernel/Makefile -index 51ea9bc647bf..0fe55a8cb9e2 100644 ---- a/kernel/Makefile -+++ b/kernel/Makefile -@@ -57,6 +57,7 @@ obj-$(CONFIG_PROFILING) += profile.o - obj-$(CONFIG_STACKTRACE) += stacktrace.o - obj-y += time/ - obj-$(CONFIG_FUTEX) += futex.o -+obj-$(CONFIG_FUTEX2) += futex1.o - obj-$(CONFIG_FUTEX2) += futex2.o - obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o - obj-$(CONFIG_SMP) += smp.o -diff --git a/kernel/futex1.c b/kernel/futex1.c -new file mode 100644 -index 000000000000..4f7bf312fefd ---- /dev/null -+++ b/kernel/futex1.c -@@ -0,0 +1,3384 @@ -+// SPDX-License-Identifier: GPL-2.0-or-later -+/* -+ * Fast Userspace Mutexes (which I call "Futexes!"). -+ * (C) Rusty Russell, IBM 2002 -+ * -+ * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar -+ * (C) Copyright 2003 Red Hat Inc, All Rights Reserved -+ * -+ * Removed page pinning, fix privately mapped COW pages and other cleanups -+ * (C) Copyright 2003, 2004 Jamie Lokier -+ * -+ * Robust futex support started by Ingo Molnar -+ * (C) Copyright 2006 Red Hat Inc, All Rights Reserved -+ * Thanks to Thomas Gleixner for suggestions, analysis and fixes. -+ * -+ * PI-futex support started by Ingo Molnar and Thomas Gleixner -+ * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar -+ * Copyright (C) 2006 Timesys Corp., Thomas Gleixner -+ * -+ * PRIVATE futexes by Eric Dumazet -+ * Copyright (C) 2007 Eric Dumazet -+ * -+ * Requeue-PI support by Darren Hart -+ * Copyright (C) IBM Corporation, 2009 -+ * Thanks to Thomas Gleixner for conceptual design and careful reviews. -+ * -+ * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly -+ * enough at me, Linus for the original (flawed) idea, Matthew -+ * Kirkwood for proof-of-concept implementation. 
-+ * -+ * "The futexes are also cursed." -+ * "But they come in a choice of three flavours!" -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#include "locking/rtmutex_common.h" -+ -+/* -+ * READ this before attempting to hack on futexes! -+ * -+ * Basic futex operation and ordering guarantees -+ * ============================================= -+ * -+ * The waiter reads the futex value in user space and calls -+ * futex_wait(). This function computes the hash bucket and acquires -+ * the hash bucket lock. After that it reads the futex user space value -+ * again and verifies that the data has not changed. If it has not changed -+ * it enqueues itself into the hash bucket, releases the hash bucket lock -+ * and schedules. -+ * -+ * The waker side modifies the user space value of the futex and calls -+ * futex_wake(). This function computes the hash bucket and acquires the -+ * hash bucket lock. Then it looks for waiters on that futex in the hash -+ * bucket and wakes them. -+ * -+ * In futex wake up scenarios where no tasks are blocked on a futex, taking -+ * the hb spinlock can be avoided and simply return. In order for this -+ * optimization to work, ordering guarantees must exist so that the waiter -+ * being added to the list is acknowledged when the list is concurrently being -+ * checked by the waker, avoiding scenarios like the following: -+ * -+ * CPU 0 CPU 1 -+ * val = *futex; -+ * sys_futex(WAIT, futex, val); -+ * futex_wait(futex, val); -+ * uval = *futex; -+ * *futex = newval; -+ * sys_futex(WAKE, futex); -+ * futex_wake(futex); -+ * if (queue_empty()) -+ * return; -+ * if (uval == val) -+ * lock(hash_bucket(futex)); -+ * queue(); -+ * unlock(hash_bucket(futex)); -+ * schedule(); -+ * -+ * This would cause the waiter on CPU 0 to wait forever because it -+ * missed the transition of the user space value from val to newval -+ * and the waker did not find the waiter in the hash bucket queue. 
-+ * -+ * The correct serialization ensures that a waiter either observes -+ * the changed user space value before blocking or is woken by a -+ * concurrent waker: -+ * -+ * CPU 0 CPU 1 -+ * val = *futex; -+ * sys_futex(WAIT, futex, val); -+ * futex_wait(futex, val); -+ * -+ * waiters++; (a) -+ * smp_mb(); (A) <-- paired with -. -+ * | -+ * lock(hash_bucket(futex)); | -+ * | -+ * uval = *futex; | -+ * | *futex = newval; -+ * | sys_futex(WAKE, futex); -+ * | futex_wake(futex); -+ * | -+ * `--------> smp_mb(); (B) -+ * if (uval == val) -+ * queue(); -+ * unlock(hash_bucket(futex)); -+ * schedule(); if (waiters) -+ * lock(hash_bucket(futex)); -+ * else wake_waiters(futex); -+ * waiters--; (b) unlock(hash_bucket(futex)); -+ * -+ * Where (A) orders the waiters increment and the futex value read through -+ * atomic operations (see hb_waiters_inc) and where (B) orders the write -+ * to futex and the waiters read (see hb_waiters_pending()). -+ * -+ * This yields the following case (where X:=waiters, Y:=futex): + * syscalls for off_t and loff_t arguments, while +diff --git a/tools/testing/selftests/futex/functional/.gitignore b/tools/testing/selftests/futex/functional/.gitignore +index 0efcd494d..d61f1df94 100644 +--- a/tools/testing/selftests/futex/functional/.gitignore ++++ b/tools/testing/selftests/futex/functional/.gitignore +@@ -6,3 +6,4 @@ futex_wait_private_mapped_file + futex_wait_timeout + futex_wait_uninitialized_heap + futex_wait_wouldblock ++futex2_wait +diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile +index 23207829e..7142a94a7 100644 +--- a/tools/testing/selftests/futex/functional/Makefile ++++ b/tools/testing/selftests/futex/functional/Makefile +@@ -5,6 +5,7 @@ LDLIBS := -lpthread -lrt + + HEADERS := \ + ../include/futextest.h \ ++ ../include/futex2test.h \ + ../include/atomic.h \ + ../include/logging.h + TEST_GEN_FILES := \ +@@ -14,7 +15,8 @@ TEST_GEN_FILES := \ + 
futex_requeue_pi_signal_restart \ + futex_requeue_pi_mismatched_ops \ + futex_wait_uninitialized_heap \ +- futex_wait_private_mapped_file ++ futex_wait_private_mapped_file \ ++ futex2_wait + + TEST_PROGS := run.sh + +diff --git a/tools/testing/selftests/futex/functional/futex2_wait.c b/tools/testing/selftests/futex/functional/futex2_wait.c +new file mode 100644 +index 000000000..0646a24b7 +--- /dev/null ++++ b/tools/testing/selftests/futex/functional/futex2_wait.c +@@ -0,0 +1,148 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/****************************************************************************** + * -+ * X = Y = 0 ++ * Copyright Collabora Ltd., 2020 + * -+ * w[X]=1 w[Y]=1 -+ * MB MB -+ * r[Y]=y r[X]=x ++ * DESCRIPTION ++ * Test wait/wake mechanism of futex2, using 32bit sized futexes. + * -+ * Which guarantees that x==0 && y==0 is impossible; which translates back into -+ * the guarantee that we cannot both miss the futex variable change and the -+ * enqueue. ++ * AUTHOR ++ * André Almeida + * -+ * Note that a new waiter is accounted for in (a) even when it is possible that -+ * the wait call can return error, in which case we backtrack from it in (b). -+ * Refer to the comment in queue_lock(). ++ * HISTORY ++ * 2020-Jul-9: Initial version by André + * -+ * Similarly, in order to account for waiters being requeued on another -+ * address we always increment the waiters for the destination bucket before -+ * acquiring the lock. It then decrements them again after releasing it - -+ * the code that actually moves the futex(es) between hash buckets (requeue_futex) -+ * will do the additional required waiter count housekeeping. This is done for -+ * double_lock_hb() and double_unlock_hb(), respectively. 
-+ */ ++ *****************************************************************************/ + -+#ifdef CONFIG_HAVE_FUTEX_CMPXCHG -+#define futex_cmpxchg_enabled 1 -+#else -+static int __read_mostly futex_cmpxchg_enabled; -+#endif ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "futex2test.h" ++#include "logging.h" + -+/* -+ * Futex flags used to encode options to functions and preserve them across -+ * restarts. -+ */ -+#ifdef CONFIG_MMU -+# define FLAGS_SHARED 0x01 -+#else -+/* -+ * NOMMU does not have per process address space. Let the compiler optimize -+ * code away. -+ */ -+# define FLAGS_SHARED 0x00 -+#endif -+#define FLAGS_CLOCKRT 0x02 -+#define FLAGS_HAS_TIMEOUT 0x04 ++#define TEST_NAME "futex2-wait" ++#define timeout_ns 30000000 ++#define WAKE_WAIT_US 10000 ++futex_t *f1; + -+/* -+ * Priority Inheritance state: -+ */ -+struct futex_pi_state { -+ /* -+ * list of 'owned' pi_state instances - these have to be -+ * cleaned up in do_exit() if the task exits prematurely: -+ */ -+ struct list_head list; ++void usage(char *prog) ++{ ++ printf("Usage: %s\n", prog); ++ printf(" -c Use color\n"); ++ printf(" -h Display this help message\n"); ++ printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", ++ VQUIET, VCRITICAL, VINFO); ++} + -+ /* -+ * The PI object: -+ */ -+ struct rt_mutex pi_mutex; ++void *waiterfn(void *arg) ++{ ++ struct timespec64 to64; ++ unsigned int flags = 0; ++ if (arg) ++ flags = *((unsigned int *) arg); ++ ++ /* setting absolute timeout for futex2 */ ++ if (gettime64(CLOCK_MONOTONIC, &to64)) ++ error("gettime64 failed\n", errno); + -+ struct task_struct *owner; -+ refcount_t refcount; ++ to64.tv_nsec += timeout_ns; + -+ union futex_key key; -+} __randomize_layout; ++ if (to64.tv_nsec >= 1000000000) { ++ to64.tv_sec++; ++ to64.tv_nsec -= 1000000000; ++ } + -+/** -+ * struct futex_q - The hashed futex queue entry, one per waiting task -+ * @list: priority-sorted list of tasks 
waiting on this futex -+ * @task: the task waiting on the futex -+ * @lock_ptr: the hash bucket lock -+ * @key: the key the futex is hashed on -+ * @pi_state: optional priority inheritance state -+ * @rt_waiter: rt_waiter storage for use with requeue_pi -+ * @requeue_pi_key: the requeue_pi target futex key -+ * @bitset: bitset for the optional bitmasked wakeup -+ * -+ * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so -+ * we can wake only the relevant ones (hashed queues may be shared). -+ * -+ * A futex_q has a woken state, just like tasks have TASK_RUNNING. -+ * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. -+ * The order of wakeup is always to make the first condition true, then -+ * the second. -+ * -+ * PI futexes are typically woken before they are removed from the hash list via -+ * the rt_mutex code. See unqueue_me_pi(). -+ */ -+struct futex_q { -+ struct plist_node list; ++ if (futex2_wait(f1, *f1, FUTEX_32 | flags, &to64)) ++ printf("waiter failed errno %d\n", errno); + -+ struct task_struct *task; -+ spinlock_t *lock_ptr; -+ union futex_key key; -+ struct futex_pi_state *pi_state; -+ struct rt_mutex_waiter *rt_waiter; -+ union futex_key *requeue_pi_key; -+ u32 bitset; -+} __randomize_layout; -+ -+static const struct futex_q futex_q_init = { -+ /* list gets initialized in queue_me()*/ -+ .key = FUTEX_KEY_INIT, -+ .bitset = FUTEX_BITSET_MATCH_ANY -+}; ++ return NULL; ++} + -+/* -+ * Hash buckets are shared by all the futex_keys that hash to the same -+ * location. Each key may have multiple futex_q structures, one for each task -+ * waiting on a futex. 
-+ */ -+struct futex_hash_bucket { -+ atomic_t waiters; -+ spinlock_t lock; -+ struct plist_head chain; -+} ____cacheline_aligned_in_smp; ++int main(int argc, char *argv[]) ++{ ++ pthread_t waiter; ++ unsigned int flags = FUTEX_SHARED_FLAG; ++ int res, ret = RET_PASS; ++ int c; ++ futex_t f_private = 0; ++ f1 = &f_private; + -+/* -+ * The base of the bucket array and its size are always used together -+ * (after initialization only in hash_futex()), so ensure that they -+ * reside in the same cacheline. -+ */ -+static struct { -+ struct futex_hash_bucket *queues; -+ unsigned long hashsize; -+} __futex_data __read_mostly __aligned(2*sizeof(long)); -+#define futex_queues (__futex_data.queues) -+#define futex_hashsize (__futex_data.hashsize) ++ while ((c = getopt(argc, argv, "cht:v:")) != -1) { ++ switch (c) { ++ case 'c': ++ log_color(1); ++ break; ++ case 'h': ++ usage(basename(argv[0])); ++ exit(0); ++ case 'v': ++ log_verbosity(atoi(optarg)); ++ break; ++ default: ++ usage(basename(argv[0])); ++ exit(1); ++ } ++ } + ++ ksft_print_header(); ++ ksft_set_plan(2); ++ ksft_print_msg("%s: Test FUTEX2_WAIT\n", ++ basename(argv[0])); + -+/* -+ * Fault injections for futexes. -+ */ -+#ifdef CONFIG_FAIL_FUTEX ++ info("Calling private futex2_wait on f1: %u @ %p with val=%u\n", *f1, f1, *f1); + -+static struct { -+ struct fault_attr attr; ++ if (pthread_create(&waiter, NULL, waiterfn, NULL)) ++ error("pthread_create failed\n", errno); + -+ bool ignore_private; -+} fail_futex = { -+ .attr = FAULT_ATTR_INITIALIZER, -+ .ignore_private = false, -+}; ++ usleep(WAKE_WAIT_US); + -+static int __init setup_fail_futex(char *str) -+{ -+ return setup_fault_attr(&fail_futex.attr, str); -+} -+__setup("fail_futex=", setup_fail_futex); ++ info("Calling private futex2_wake on f1: %u @ %p with val=%u\n", *f1, f1, *f1); ++ res = futex2_wake(f1, 1, FUTEX_32); ++ if (res != 1) { ++ ksft_test_result_fail("futex2_wake private returned: %d %s\n", ++ res ? errno : res, ++ res ? 
strerror(errno) : ""); ++ ret = RET_FAIL; ++ } else { ++ ksft_test_result_pass("futex2_wake private succeeds\n"); ++ } + -+static bool should_fail_futex(bool fshared) -+{ -+ if (fail_futex.ignore_private && !fshared) -+ return false; ++ int shm_id = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0666); ++ if (shm_id < 0) { ++ perror("shmget"); ++ exit(1); ++ } + -+ return should_fail(&fail_futex.attr, 1); -+} ++ unsigned int *shared_data = shmat(shm_id, NULL, 0); ++ *shared_data = 0; ++ f1 = shared_data; + -+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS ++ info("Calling shared futex2_wait on f1: %u @ %p with val=%u\n", *f1, f1, *f1); + -+static int __init fail_futex_debugfs(void) -+{ -+ umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; -+ struct dentry *dir; ++ if (pthread_create(&waiter, NULL, waiterfn, &flags)) ++ error("pthread_create failed\n", errno); + -+ dir = fault_create_debugfs_attr("fail_futex", NULL, -+ &fail_futex.attr); -+ if (IS_ERR(dir)) -+ return PTR_ERR(dir); ++ usleep(WAKE_WAIT_US); + -+ debugfs_create_bool("ignore-private", mode, dir, -+ &fail_futex.ignore_private); -+ return 0; ++ info("Calling shared futex2_wake on f1: %u @ %p with val=%u\n", *f1, f1, *f1); ++ res = futex2_wake(f1, 1, FUTEX_32 | FUTEX_SHARED_FLAG); ++ if (res != 1) { ++ ksft_test_result_fail("futex2_wake shared returned: %d %s\n", ++ res ? errno : res, ++ res ? 
strerror(errno) : ""); ++ ret = RET_FAIL; ++ } else { ++ ksft_test_result_pass("futex2_wake shared succeeds\n"); ++ } ++ ++ shmdt(shared_data); ++ ++ ksft_print_cnts(); ++ return ret; +} +diff --git a/tools/testing/selftests/futex/functional/run.sh b/tools/testing/selftests/futex/functional/run.sh +index 1acb6ace1..3730159c8 100755 +--- a/tools/testing/selftests/futex/functional/run.sh ++++ b/tools/testing/selftests/futex/functional/run.sh +@@ -73,3 +73,6 @@ echo + echo + ./futex_wait_uninitialized_heap $COLOR + ./futex_wait_private_mapped_file $COLOR ++ ++echo ++./futex2_wait $COLOR +diff --git a/tools/testing/selftests/futex/include/futex2test.h b/tools/testing/selftests/futex/include/futex2test.h +new file mode 100644 +index 000000000..807b8b57f +--- /dev/null ++++ b/tools/testing/selftests/futex/include/futex2test.h +@@ -0,0 +1,77 @@ ++/* SPDX-License-Identifier: GPL-2.0-or-later */ ++/****************************************************************************** ++ * ++ * Copyright Collabora Ltd., 2020 ++ * ++ * DESCRIPTION ++ * Futex2 library addons for old futex library ++ * ++ * AUTHOR ++ * André Almeida ++ * ++ * HISTORY ++ * 2020-Jul-9: Initial version by André ++ * ++ *****************************************************************************/ ++#include "futextest.h" ++#include + -+late_initcall(fail_futex_debugfs); ++#define NSEC_PER_SEC 1000000000L + -+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ ++#ifndef FUTEX_8 ++# define FUTEX_8 0 ++#endif ++#ifndef FUTEX_16 ++# define FUTEX_16 1 ++#endif ++#ifndef FUTEX_32 ++#define FUTEX_32 2 ++#endif ++#ifdef __x86_64__ ++# ifndef FUTEX_64 ++# define FUTEX_64 3 ++# endif ++#endif + ++/* ++ * - Y2038 section for 32-bit applications - ++ * ++ * Remove this when glibc is ready for y2038. Then, always compile with ++ * `-DTIME_BITS=64` or `-D__USE_TIME_BITS64`. glibc will provide both ++ * timespec64 and clock_gettime64 so we won't need to define here. 
++ */ ++#if defined(__i386__) || __TIMESIZE == 32 ++# define NR_gettime __NR_clock_gettime64 +#else -+static inline bool should_fail_futex(bool fshared) ++# define NR_gettime __NR_clock_gettime ++#endif ++ ++struct timespec64 { ++ long long tv_sec; /* seconds */ ++ long long tv_nsec; /* nanoseconds */ ++}; ++ ++int gettime64(clock_t clockid, struct timespec64 *tv) +{ -+ return false; ++ return syscall(NR_gettime, clockid, tv); +} -+#endif /* CONFIG_FAIL_FUTEX */ ++/* ++ * - End of Y2038 section - ++ */ + +/* -+ * Reflects a new waiter being added to the waitqueue. ++ * wait for uaddr if (*uaddr == val) + */ -+static inline void hb_waiters_inc(struct futex_hash_bucket *hb) ++static inline int futex2_wait(volatile void *uaddr, unsigned long val, ++ unsigned long flags, struct timespec64 *timo) +{ -+#ifdef CONFIG_SMP -+ atomic_inc(&hb->waiters); -+ /* -+ * Full barrier (A), see the ordering comment above. -+ */ -+ smp_mb__after_atomic(); -+#endif ++ return syscall(__NR_futex_wait, uaddr, val, flags, timo); +} + +/* -+ * Reflects a waiter being removed from the waitqueue by wakeup -+ * paths. -+ */ -+static inline void hb_waiters_dec(struct futex_hash_bucket *hb) -+{ -+#ifdef CONFIG_SMP -+ atomic_dec(&hb->waiters); -+#endif -+} -+ -+static inline int hb_waiters_pending(struct futex_hash_bucket *hb) -+{ -+#ifdef CONFIG_SMP -+ /* -+ * Full barrier (B), see the ordering comment above. -+ */ -+ smp_mb(); -+ return atomic_read(&hb->waiters); -+#else -+ return 1; -+#endif -+} -+ -+/** -+ * hash_futex - Return the hash bucket in the global hash -+ * @key: Pointer to the futex key for which the hash is calculated -+ * -+ * We hash on the keys returned from get_futex_key (see below) and return the -+ * corresponding hash bucket in the global hash. 
-+ */ -+static struct futex_hash_bucket *hash_futex(union futex_key *key) -+{ -+ u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4, -+ key->both.offset); -+ -+ return &futex_queues[hash & (futex_hashsize - 1)]; -+} -+ -+ -+/** -+ * match_futex - Check whether two futex keys are equal -+ * @key1: Pointer to key1 -+ * @key2: Pointer to key2 -+ * -+ * Return 1 if two futex_keys are equal, 0 otherwise. -+ */ -+static inline int match_futex(union futex_key *key1, union futex_key *key2) -+{ -+ return (key1 && key2 -+ && key1->both.word == key2->both.word -+ && key1->both.ptr == key2->both.ptr -+ && key1->both.offset == key2->both.offset); -+} -+ -+enum futex_access { -+ FUTEX_READ, -+ FUTEX_WRITE -+}; -+ -+/** -+ * futex_setup_timer - set up the sleeping hrtimer. -+ * @time: ptr to the given timeout value -+ * @timeout: the hrtimer_sleeper structure to be set up -+ * @flags: futex flags -+ * @range_ns: optional range in ns -+ * -+ * Return: Initialized hrtimer_sleeper structure or NULL if no timeout -+ * value given -+ */ -+static inline struct hrtimer_sleeper * -+futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, -+ int flags, u64 range_ns) -+{ -+ if (!time) -+ return NULL; -+ -+ hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ? -+ CLOCK_REALTIME : CLOCK_MONOTONIC, -+ HRTIMER_MODE_ABS); -+ /* -+ * If range_ns is 0, calling hrtimer_set_expires_range_ns() is -+ * effectively the same as calling hrtimer_set_expires(). -+ */ -+ hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns); -+ -+ return timeout; -+} -+ -+/* -+ * Generate a machine wide unique identifier for this inode. -+ * -+ * This relies on u64 not wrapping in the life-time of the machine; which with -+ * 1ns resolution means almost 585 years. -+ * -+ * This further relies on the fact that a well formed program will not unmap -+ * the file while it has a (shared) futex waiting on it. 
This mapping will have -+ * a file reference which pins the mount and inode. -+ * -+ * If for some reason an inode gets evicted and read back in again, it will get -+ * a new sequence number and will _NOT_ match, even though it is the exact same -+ * file. -+ * -+ * It is important that match_futex() will never have a false-positive, esp. -+ * for PI futexes that can mess up the state. The above argues that false-negatives -+ * are only possible for malformed programs. -+ */ -+static u64 get_inode_sequence_number(struct inode *inode) -+{ -+ static atomic64_t i_seq; -+ u64 old; -+ -+ /* Does the inode already have a sequence number? */ -+ old = atomic64_read(&inode->i_sequence); -+ if (likely(old)) -+ return old; -+ -+ for (;;) { -+ u64 new = atomic64_add_return(1, &i_seq); -+ if (WARN_ON_ONCE(!new)) -+ continue; -+ -+ old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new); -+ if (old) -+ return old; -+ return new; -+ } -+} -+ -+/** -+ * get_futex_key() - Get parameters which are the keys for a futex -+ * @uaddr: virtual address of the futex -+ * @fshared: false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED -+ * @key: address where result is stored. -+ * @rw: mapping needs to be read/write (values: FUTEX_READ, -+ * FUTEX_WRITE) -+ * -+ * Return: a negative error code or 0 -+ * -+ * The key words are stored in @key on success. -+ * -+ * For shared mappings (when @fshared), the key is: -+ * -+ * ( inode->i_sequence, page->index, offset_within_page ) -+ * -+ * [ also see get_inode_sequence_number() ] -+ * -+ * For private mappings (or when !@fshared), the key is: -+ * -+ * ( current->mm, address, 0 ) -+ * -+ * This allows (cross process, where applicable) identification of the futex -+ * without keeping the page pinned for the duration of the FUTEX_WAIT. -+ * -+ * lock_page() might sleep, the caller should not hold a spinlock. 
-+ */ -+static int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, -+ enum futex_access rw) -+{ -+ unsigned long address = (unsigned long)uaddr; -+ struct mm_struct *mm = current->mm; -+ struct page *page, *tail; -+ struct address_space *mapping; -+ int err, ro = 0; -+ -+ /* -+ * The futex address must be "naturally" aligned. -+ */ -+ key->both.offset = address % PAGE_SIZE; -+ if (unlikely((address % sizeof(u32)) != 0)) -+ return -EINVAL; -+ address -= key->both.offset; -+ -+ if (unlikely(!access_ok(uaddr, sizeof(u32)))) -+ return -EFAULT; -+ -+ if (unlikely(should_fail_futex(fshared))) -+ return -EFAULT; -+ -+ /* -+ * PROCESS_PRIVATE futexes are fast. -+ * As the mm cannot disappear under us and the 'key' only needs -+ * virtual address, we dont even have to find the underlying vma. -+ * Note : We do have to check 'uaddr' is a valid user address, -+ * but access_ok() should be faster than find_vma() -+ */ -+ if (!fshared) { -+ key->private.mm = mm; -+ key->private.address = address; -+ return 0; -+ } -+ -+again: -+ /* Ignore any VERIFY_READ mapping (futex common case) */ -+ if (unlikely(should_fail_futex(true))) -+ return -EFAULT; -+ -+ err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); -+ /* -+ * If write access is not required (eg. FUTEX_WAIT), try -+ * and get read-only access. -+ */ -+ if (err == -EFAULT && rw == FUTEX_READ) { -+ err = get_user_pages_fast(address, 1, 0, &page); -+ ro = 1; -+ } -+ if (err < 0) -+ return err; -+ else -+ err = 0; -+ -+ /* -+ * The treatment of mapping from this point on is critical. The page -+ * lock protects many things but in this context the page lock -+ * stabilizes mapping, prevents inode freeing in the shared -+ * file-backed region case and guards against movement to swap cache. 
-+ * -+ * Strictly speaking the page lock is not needed in all cases being -+ * considered here and page lock forces unnecessarily serialization -+ * From this point on, mapping will be re-verified if necessary and -+ * page lock will be acquired only if it is unavoidable -+ * -+ * Mapping checks require the head page for any compound page so the -+ * head page and mapping is looked up now. For anonymous pages, it -+ * does not matter if the page splits in the future as the key is -+ * based on the address. For filesystem-backed pages, the tail is -+ * required as the index of the page determines the key. For -+ * base pages, there is no tail page and tail == page. -+ */ -+ tail = page; -+ page = compound_head(page); -+ mapping = READ_ONCE(page->mapping); -+ -+ /* -+ * If page->mapping is NULL, then it cannot be a PageAnon -+ * page; but it might be the ZERO_PAGE or in the gate area or -+ * in a special mapping (all cases which we are happy to fail); -+ * or it may have been a good file page when get_user_pages_fast -+ * found it, but truncated or holepunched or subjected to -+ * invalidate_complete_page2 before we got the page lock (also -+ * cases which we are happy to fail). And we hold a reference, -+ * so refcount care in invalidate_complete_page's remove_mapping -+ * prevents drop_caches from setting mapping to NULL beneath us. -+ * -+ * The case we do have to guard against is when memory pressure made -+ * shmem_writepage move it from filecache to swapcache beneath us: -+ * an unlikely race, but we do need to retry for page->mapping. -+ */ -+ if (unlikely(!mapping)) { -+ int shmem_swizzled; -+ -+ /* -+ * Page lock is required to identify which special case above -+ * applies. If this is really a shmem page then the page lock -+ * will prevent unexpected transitions. 
-+ */ -+ lock_page(page); -+ shmem_swizzled = PageSwapCache(page) || page->mapping; -+ unlock_page(page); -+ put_page(page); -+ -+ if (shmem_swizzled) -+ goto again; -+ -+ return -EFAULT; -+ } -+ -+ /* -+ * Private mappings are handled in a simple way. -+ * -+ * If the futex key is stored on an anonymous page, then the associated -+ * object is the mm which is implicitly pinned by the calling process. -+ * -+ * NOTE: When userspace waits on a MAP_SHARED mapping, even if -+ * it's a read-only handle, it's expected that futexes attach to -+ * the object not the particular process. -+ */ -+ if (PageAnon(page)) { -+ /* -+ * A RO anonymous page will never change and thus doesn't make -+ * sense for futex operations. -+ */ -+ if (unlikely(should_fail_futex(true)) || ro) { -+ err = -EFAULT; -+ goto out; -+ } -+ -+ key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ -+ key->private.mm = mm; -+ key->private.address = address; -+ -+ } else { -+ struct inode *inode; -+ -+ /* -+ * The associated futex object in this case is the inode and -+ * the page->mapping must be traversed. Ordinarily this should -+ * be stabilised under page lock but it's not strictly -+ * necessary in this case as we just want to pin the inode, not -+ * update the radix tree or anything like that. -+ * -+ * The RCU read lock is taken as the inode is finally freed -+ * under RCU. If the mapping still matches expectations then the -+ * mapping->host can be safely accessed as being a valid inode. 
-+ */ -+ rcu_read_lock(); -+ -+ if (READ_ONCE(page->mapping) != mapping) { -+ rcu_read_unlock(); -+ put_page(page); -+ -+ goto again; -+ } -+ -+ inode = READ_ONCE(mapping->host); -+ if (!inode) { -+ rcu_read_unlock(); -+ put_page(page); -+ -+ goto again; -+ } -+ -+ key->both.offset |= FUT_OFF_INODE; /* inode-based key */ -+ key->shared.i_seq = get_inode_sequence_number(inode); -+ key->shared.pgoff = basepage_index(tail); -+ rcu_read_unlock(); -+ } -+ -+out: -+ put_page(page); -+ return err; -+} -+ -+/** -+ * fault_in_user_writeable() - Fault in user address and verify RW access -+ * @uaddr: pointer to faulting user space address -+ * -+ * Slow path to fixup the fault we just took in the atomic write -+ * access to @uaddr. -+ * -+ * We have no generic implementation of a non-destructive write to the -+ * user address. We know that we faulted in the atomic pagefault -+ * disabled section so we can as well avoid the #PF overhead by -+ * calling get_user_pages() right away. -+ */ -+static int fault_in_user_writeable(u32 __user *uaddr) -+{ -+ struct mm_struct *mm = current->mm; -+ int ret; -+ -+ mmap_read_lock(mm); -+ ret = fixup_user_fault(mm, (unsigned long)uaddr, -+ FAULT_FLAG_WRITE, NULL); -+ mmap_read_unlock(mm); -+ -+ return ret < 0 ? ret : 0; -+} -+ -+/** -+ * futex_top_waiter() - Return the highest priority waiter on a futex -+ * @hb: the hash bucket the futex_q's reside in -+ * @key: the futex key (to distinguish it from other futex futex_q's) -+ * -+ * Must be called with the hb lock held. 
-+ */ -+static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, -+ union futex_key *key) -+{ -+ struct futex_q *this; -+ -+ plist_for_each_entry(this, &hb->chain, list) { -+ if (match_futex(&this->key, key)) -+ return this; -+ } -+ return NULL; -+} -+ -+static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr, -+ u32 uval, u32 newval) -+{ -+ int ret; -+ -+ pagefault_disable(); -+ ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); -+ pagefault_enable(); -+ -+ return ret; -+} -+ -+static int get_futex_value_locked(u32 *dest, u32 __user *from) -+{ -+ int ret; -+ -+ pagefault_disable(); -+ ret = __get_user(*dest, from); -+ pagefault_enable(); -+ -+ return ret ? -EFAULT : 0; -+} -+ -+ -+/* -+ * PI code: -+ */ -+static int refill_pi_state_cache(void) -+{ -+ struct futex_pi_state *pi_state; -+ -+ if (likely(current->pi_state_cache)) -+ return 0; -+ -+ pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL); -+ -+ if (!pi_state) -+ return -ENOMEM; -+ -+ INIT_LIST_HEAD(&pi_state->list); -+ /* pi_mutex gets initialized later */ -+ pi_state->owner = NULL; -+ refcount_set(&pi_state->refcount, 1); -+ pi_state->key = FUTEX_KEY_INIT; -+ -+ current->pi_state_cache = pi_state; -+ -+ return 0; -+} -+ -+static struct futex_pi_state *alloc_pi_state(void) -+{ -+ struct futex_pi_state *pi_state = current->pi_state_cache; -+ -+ WARN_ON(!pi_state); -+ current->pi_state_cache = NULL; -+ -+ return pi_state; -+} -+ -+static void get_pi_state(struct futex_pi_state *pi_state) -+{ -+ WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount)); -+} -+ -+/* -+ * Drops a reference to the pi_state object and frees or caches it -+ * when the last reference is gone. 
-+ */ -+static void put_pi_state(struct futex_pi_state *pi_state) -+{ -+ if (!pi_state) -+ return; -+ -+ if (!refcount_dec_and_test(&pi_state->refcount)) -+ return; -+ -+ /* -+ * If pi_state->owner is NULL, the owner is most probably dying -+ * and has cleaned up the pi_state already -+ */ -+ if (pi_state->owner) { -+ struct task_struct *owner; -+ -+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); -+ owner = pi_state->owner; -+ if (owner) { -+ raw_spin_lock(&owner->pi_lock); -+ list_del_init(&pi_state->list); -+ raw_spin_unlock(&owner->pi_lock); -+ } -+ rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner); -+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); -+ } -+ -+ if (current->pi_state_cache) { -+ kfree(pi_state); -+ } else { -+ /* -+ * pi_state->list is already empty. -+ * clear pi_state->owner. -+ * refcount is at 0 - put it back to 1. -+ */ -+ pi_state->owner = NULL; -+ refcount_set(&pi_state->refcount, 1); -+ current->pi_state_cache = pi_state; -+ } -+} -+ -+/* -+ * We need to check the following states: -+ * -+ * Waiter | pi_state | pi->owner | uTID | uODIED | ? -+ * -+ * [1] NULL | --- | --- | 0 | 0/1 | Valid -+ * [2] NULL | --- | --- | >0 | 0/1 | Valid -+ * -+ * [3] Found | NULL | -- | Any | 0/1 | Invalid -+ * -+ * [4] Found | Found | NULL | 0 | 1 | Valid -+ * [5] Found | Found | NULL | >0 | 1 | Invalid -+ * -+ * [6] Found | Found | task | 0 | 1 | Valid -+ * -+ * [7] Found | Found | NULL | Any | 0 | Invalid -+ * -+ * [8] Found | Found | task | ==taskTID | 0/1 | Valid -+ * [9] Found | Found | task | 0 | 0 | Invalid -+ * [10] Found | Found | task | !=taskTID | 0/1 | Invalid -+ * -+ * [1] Indicates that the kernel can acquire the futex atomically. We -+ * came came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. -+ * -+ * [2] Valid, if TID does not belong to a kernel thread. If no matching -+ * thread is found then it indicates that the owner TID has died. -+ * -+ * [3] Invalid. 
The waiter is queued on a non PI futex -+ * -+ * [4] Valid state after exit_robust_list(), which sets the user space -+ * value to FUTEX_WAITERS | FUTEX_OWNER_DIED. -+ * -+ * [5] The user space value got manipulated between exit_robust_list() -+ * and exit_pi_state_list() -+ * -+ * [6] Valid state after exit_pi_state_list() which sets the new owner in -+ * the pi_state but cannot access the user space value. -+ * -+ * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set. -+ * -+ * [8] Owner and user space value match -+ * -+ * [9] There is no transient state which sets the user space TID to 0 -+ * except exit_robust_list(), but this is indicated by the -+ * FUTEX_OWNER_DIED bit. See [4] -+ * -+ * [10] There is no transient state which leaves owner and user space -+ * TID out of sync. -+ * -+ * -+ * Serialization and lifetime rules: -+ * -+ * hb->lock: -+ * -+ * hb -> futex_q, relation -+ * futex_q -> pi_state, relation -+ * -+ * (cannot be raw because hb can contain arbitrary amount -+ * of futex_q's) -+ * -+ * pi_mutex->wait_lock: -+ * -+ * {uval, pi_state} -+ * -+ * (and pi_mutex 'obviously') -+ * -+ * p->pi_lock: -+ * -+ * p->pi_state_list -> pi_state->list, relation -+ * -+ * pi_state->refcount: -+ * -+ * pi_state lifetime -+ * -+ * -+ * Lock order: -+ * -+ * hb->lock -+ * pi_mutex->wait_lock -+ * p->pi_lock -+ * -+ */ -+ -+/* -+ * Validate that the existing waiter has a pi_state and sanity check -+ * the pi_state against the user space value. If correct, attach to -+ * it. -+ */ -+static int attach_to_pi_state(u32 __user *uaddr, u32 uval, -+ struct futex_pi_state *pi_state, -+ struct futex_pi_state **ps) -+{ -+ pid_t pid = uval & FUTEX_TID_MASK; -+ u32 uval2; -+ int ret; -+ -+ /* -+ * Userspace might have messed up non-PI and PI futexes [3] -+ */ -+ if (unlikely(!pi_state)) -+ return -EINVAL; -+ -+ /* -+ * We get here with hb->lock held, and having found a -+ * futex_top_waiter(). 
This means that futex_lock_pi() of said futex_q -+ * has dropped the hb->lock in between queue_me() and unqueue_me_pi(), -+ * which in turn means that futex_lock_pi() still has a reference on -+ * our pi_state. -+ * -+ * The waiter holding a reference on @pi_state also protects against -+ * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi() -+ * and futex_wait_requeue_pi() as it cannot go to 0 and consequently -+ * free pi_state before we can take a reference ourselves. -+ */ -+ WARN_ON(!refcount_read(&pi_state->refcount)); -+ -+ /* -+ * Now that we have a pi_state, we can acquire wait_lock -+ * and do the state validation. -+ */ -+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); -+ -+ /* -+ * Since {uval, pi_state} is serialized by wait_lock, and our current -+ * uval was read without holding it, it can have changed. Verify it -+ * still is what we expect it to be, otherwise retry the entire -+ * operation. -+ */ -+ if (get_futex_value_locked(&uval2, uaddr)) -+ goto out_efault; -+ -+ if (uval != uval2) -+ goto out_eagain; -+ -+ /* -+ * Handle the owner died case: -+ */ -+ if (uval & FUTEX_OWNER_DIED) { -+ /* -+ * exit_pi_state_list sets owner to NULL and wakes the -+ * topmost waiter. The task which acquires the -+ * pi_state->rt_mutex will fixup owner. -+ */ -+ if (!pi_state->owner) { -+ /* -+ * No pi state owner, but the user space TID -+ * is not 0. Inconsistent state. [5] -+ */ -+ if (pid) -+ goto out_einval; -+ /* -+ * Take a ref on the state and return success. [4] -+ */ -+ goto out_attach; -+ } -+ -+ /* -+ * If TID is 0, then either the dying owner has not -+ * yet executed exit_pi_state_list() or some waiter -+ * acquired the rtmutex in the pi state, but did not -+ * yet fixup the TID in user space. -+ * -+ * Take a ref on the state and return success. [6] -+ */ -+ if (!pid) -+ goto out_attach; -+ } else { -+ /* -+ * If the owner died bit is not set, then the pi_state -+ * must have an owner. 
[7] -+ */ -+ if (!pi_state->owner) -+ goto out_einval; -+ } -+ -+ /* -+ * Bail out if user space manipulated the futex value. If pi -+ * state exists then the owner TID must be the same as the -+ * user space TID. [9/10] -+ */ -+ if (pid != task_pid_vnr(pi_state->owner)) -+ goto out_einval; -+ -+out_attach: -+ get_pi_state(pi_state); -+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); -+ *ps = pi_state; -+ return 0; -+ -+out_einval: -+ ret = -EINVAL; -+ goto out_error; -+ -+out_eagain: -+ ret = -EAGAIN; -+ goto out_error; -+ -+out_efault: -+ ret = -EFAULT; -+ goto out_error; -+ -+out_error: -+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); -+ return ret; -+} -+ -+/** -+ * wait_for_owner_exiting - Block until the owner has exited -+ * @ret: owner's current futex lock status -+ * @exiting: Pointer to the exiting task -+ * -+ * Caller must hold a refcount on @exiting. -+ */ -+static void wait_for_owner_exiting(int ret, struct task_struct *exiting) -+{ -+ if (ret != -EBUSY) { -+ WARN_ON_ONCE(exiting); -+ return; -+ } -+ -+ if (WARN_ON_ONCE(ret == -EBUSY && !exiting)) -+ return; -+ -+ mutex_lock(&exiting->futex_exit_mutex); -+ /* -+ * No point in doing state checking here. If the waiter got here -+ * while the task was in exec()->exec_futex_release() then it can -+ * have any FUTEX_STATE_* value when the waiter has acquired the -+ * mutex. OK, if running, EXITING or DEAD if it reached exit() -+ * already. Highly unlikely and not a problem. Just one more round -+ * through the futex maze. -+ */ -+ mutex_unlock(&exiting->futex_exit_mutex); -+ -+ put_task_struct(exiting); -+} -+ -+static int handle_exit_race(u32 __user *uaddr, u32 uval, -+ struct task_struct *tsk) -+{ -+ u32 uval2; -+ -+ /* -+ * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the -+ * caller that the alleged owner is busy. 
-+ */ -+ if (tsk && tsk->futex_state != FUTEX_STATE_DEAD) -+ return -EBUSY; -+ -+ /* -+ * Reread the user space value to handle the following situation: -+ * -+ * CPU0 CPU1 -+ * -+ * sys_exit() sys_futex() -+ * do_exit() futex_lock_pi() -+ * futex_lock_pi_atomic() -+ * exit_signals(tsk) No waiters: -+ * tsk->flags |= PF_EXITING; *uaddr == 0x00000PID -+ * mm_release(tsk) Set waiter bit -+ * exit_robust_list(tsk) { *uaddr = 0x80000PID; -+ * Set owner died attach_to_pi_owner() { -+ * *uaddr = 0xC0000000; tsk = get_task(PID); -+ * } if (!tsk->flags & PF_EXITING) { -+ * ... attach(); -+ * tsk->futex_state = } else { -+ * FUTEX_STATE_DEAD; if (tsk->futex_state != -+ * FUTEX_STATE_DEAD) -+ * return -EAGAIN; -+ * return -ESRCH; <--- FAIL -+ * } -+ * -+ * Returning ESRCH unconditionally is wrong here because the -+ * user space value has been changed by the exiting task. -+ * -+ * The same logic applies to the case where the exiting task is -+ * already gone. -+ */ -+ if (get_futex_value_locked(&uval2, uaddr)) -+ return -EFAULT; -+ -+ /* If the user space value has changed, try again. */ -+ if (uval2 != uval) -+ return -EAGAIN; -+ -+ /* -+ * The exiting task did not have a robust list, the robust list was -+ * corrupted or the user space value in *uaddr is simply bogus. -+ * Give up and tell user space. -+ */ -+ return -ESRCH; -+} -+ -+/* -+ * Lookup the task for the TID provided from user space and attach to -+ * it after doing proper sanity checks. -+ */ -+static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key, -+ struct futex_pi_state **ps, -+ struct task_struct **exiting) -+{ -+ pid_t pid = uval & FUTEX_TID_MASK; -+ struct futex_pi_state *pi_state; -+ struct task_struct *p; -+ -+ /* -+ * We are the first waiter - try to look up the real owner and attach -+ * the new pi_state to it, but bail out when TID = 0 [1] -+ * -+ * The !pid check is paranoid. None of the call sites should end up -+ * with pid == 0, but better safe than sorry. 
Let the caller retry -+ */ -+ if (!pid) -+ return -EAGAIN; -+ p = find_get_task_by_vpid(pid); -+ if (!p) -+ return handle_exit_race(uaddr, uval, NULL); -+ -+ if (unlikely(p->flags & PF_KTHREAD)) { -+ put_task_struct(p); -+ return -EPERM; -+ } -+ -+ /* -+ * We need to look at the task state to figure out, whether the -+ * task is exiting. To protect against the change of the task state -+ * in futex_exit_release(), we do this protected by p->pi_lock: -+ */ -+ raw_spin_lock_irq(&p->pi_lock); -+ if (unlikely(p->futex_state != FUTEX_STATE_OK)) { -+ /* -+ * The task is on the way out. When the futex state is -+ * FUTEX_STATE_DEAD, we know that the task has finished -+ * the cleanup: -+ */ -+ int ret = handle_exit_race(uaddr, uval, p); -+ -+ raw_spin_unlock_irq(&p->pi_lock); -+ /* -+ * If the owner task is between FUTEX_STATE_EXITING and -+ * FUTEX_STATE_DEAD then store the task pointer and keep -+ * the reference on the task struct. The calling code will -+ * drop all locks, wait for the task to reach -+ * FUTEX_STATE_DEAD and then drop the refcount. This is -+ * required to prevent a live lock when the current task -+ * preempted the exiting task between the two states. -+ */ -+ if (ret == -EBUSY) -+ *exiting = p; -+ else -+ put_task_struct(p); -+ return ret; -+ } -+ -+ /* -+ * No existing pi state. First waiter. [2] -+ * -+ * This creates pi_state, we have hb->lock held, this means nothing can -+ * observe this state, wait_lock is irrelevant. -+ */ -+ pi_state = alloc_pi_state(); -+ -+ /* -+ * Initialize the pi_mutex in locked state and make @p -+ * the owner of it: -+ */ -+ rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); -+ -+ /* Store the key for possible exit cleanups: */ -+ pi_state->key = *key; -+ -+ WARN_ON(!list_empty(&pi_state->list)); -+ list_add(&pi_state->list, &p->pi_state_list); -+ /* -+ * Assignment without holding pi_state->pi_mutex.wait_lock is safe -+ * because there is no concurrency as the object is not published yet. 
-+ */ -+ pi_state->owner = p; -+ raw_spin_unlock_irq(&p->pi_lock); -+ -+ put_task_struct(p); -+ -+ *ps = pi_state; -+ -+ return 0; -+} -+ -+static int lookup_pi_state(u32 __user *uaddr, u32 uval, -+ struct futex_hash_bucket *hb, -+ union futex_key *key, struct futex_pi_state **ps, -+ struct task_struct **exiting) -+{ -+ struct futex_q *top_waiter = futex_top_waiter(hb, key); -+ -+ /* -+ * If there is a waiter on that futex, validate it and -+ * attach to the pi_state when the validation succeeds. -+ */ -+ if (top_waiter) -+ return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps); -+ -+ /* -+ * We are the first waiter - try to look up the owner based on -+ * @uval and attach to it. -+ */ -+ return attach_to_pi_owner(uaddr, uval, key, ps, exiting); -+} -+ -+static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) -+{ -+ int err; -+ u32 curval; -+ -+ if (unlikely(should_fail_futex(true))) -+ return -EFAULT; -+ -+ err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); -+ if (unlikely(err)) -+ return err; -+ -+ /* If user space value changed, let the caller retry */ -+ return curval != uval ? -EAGAIN : 0; -+} -+ -+/** -+ * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex -+ * @uaddr: the pi futex user address -+ * @hb: the pi futex hash bucket -+ * @key: the futex key associated with uaddr and hb -+ * @ps: the pi_state pointer where we store the result of the -+ * lookup -+ * @task: the task to perform the atomic lock work for. This will -+ * be "current" except in the case of requeue pi. -+ * @exiting: Pointer to store the task pointer of the owner task -+ * which is in the middle of exiting -+ * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) -+ * -+ * Return: -+ * - 0 - ready to wait; -+ * - 1 - acquired the lock; -+ * - <0 - error -+ * -+ * The hb->lock and futex_key refs shall be held by the caller. -+ * -+ * @exiting is only set when the return value is -EBUSY. 
If so, this holds -+ * a refcount on the exiting task on return and the caller needs to drop it -+ * after waiting for the exit to complete. -+ */ -+static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, -+ union futex_key *key, -+ struct futex_pi_state **ps, -+ struct task_struct *task, -+ struct task_struct **exiting, -+ int set_waiters) -+{ -+ u32 uval, newval, vpid = task_pid_vnr(task); -+ struct futex_q *top_waiter; -+ int ret; -+ -+ /* -+ * Read the user space value first so we can validate a few -+ * things before proceeding further. -+ */ -+ if (get_futex_value_locked(&uval, uaddr)) -+ return -EFAULT; -+ -+ if (unlikely(should_fail_futex(true))) -+ return -EFAULT; -+ -+ /* -+ * Detect deadlocks. -+ */ -+ if ((unlikely((uval & FUTEX_TID_MASK) == vpid))) -+ return -EDEADLK; -+ -+ if ((unlikely(should_fail_futex(true)))) -+ return -EDEADLK; -+ -+ /* -+ * Lookup existing state first. If it exists, try to attach to -+ * its pi_state. -+ */ -+ top_waiter = futex_top_waiter(hb, key); -+ if (top_waiter) -+ return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps); -+ -+ /* -+ * No waiter and user TID is 0. We are here because the -+ * waiters or the owner died bit is set or called from -+ * requeue_cmp_pi or for whatever reason something took the -+ * syscall. -+ */ -+ if (!(uval & FUTEX_TID_MASK)) { -+ /* -+ * We take over the futex. No other waiters and the user space -+ * TID is 0. We preserve the owner died bit. -+ */ -+ newval = uval & FUTEX_OWNER_DIED; -+ newval |= vpid; -+ -+ /* The futex requeue_pi code can enforce the waiters bit */ -+ if (set_waiters) -+ newval |= FUTEX_WAITERS; -+ -+ ret = lock_pi_update_atomic(uaddr, uval, newval); -+ /* If the take over worked, return 1 */ -+ return ret < 0 ? ret : 1; -+ } -+ -+ /* -+ * First waiter. Set the waiters bit before attaching ourself to -+ * the owner. If owner tries to unlock, it will be forced into -+ * the kernel and blocked on hb->lock. 
-+ */ -+ newval = uval | FUTEX_WAITERS; -+ ret = lock_pi_update_atomic(uaddr, uval, newval); -+ if (ret) -+ return ret; -+ /* -+ * If the update of the user space value succeeded, we try to -+ * attach to the owner. If that fails, no harm done, we only -+ * set the FUTEX_WAITERS bit in the user space variable. -+ */ -+ return attach_to_pi_owner(uaddr, newval, key, ps, exiting); -+} -+ -+/** -+ * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket -+ * @q: The futex_q to unqueue -+ * -+ * The q->lock_ptr must not be NULL and must be held by the caller. -+ */ -+static void __unqueue_futex(struct futex_q *q) -+{ -+ struct futex_hash_bucket *hb; -+ -+ if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list))) -+ return; -+ lockdep_assert_held(q->lock_ptr); -+ -+ hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); -+ plist_del(&q->list, &hb->chain); -+ hb_waiters_dec(hb); -+} -+ -+/* -+ * The hash bucket lock must be held when this is called. -+ * Afterwards, the futex_q must not be accessed. Callers -+ * must ensure to later call wake_up_q() for the actual -+ * wakeups to occur. -+ */ -+static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) -+{ -+ struct task_struct *p = q->task; -+ -+ if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) -+ return; -+ -+ get_task_struct(p); -+ __unqueue_futex(q); -+ /* -+ * The waiting task can free the futex_q as soon as q->lock_ptr = NULL -+ * is written, without taking any locks. This is possible in the event -+ * of a spurious wakeup, for example. A memory barrier is required here -+ * to prevent the following store to lock_ptr from getting ahead of the -+ * plist_del in __unqueue_futex(). -+ */ -+ smp_store_release(&q->lock_ptr, NULL); -+ -+ /* -+ * Queue the task for later wakeup for after we've released -+ * the hb->lock. -+ */ -+ wake_q_add_safe(wake_q, p); -+} -+ -+/* -+ * Caller must hold a reference on @pi_state. 
-+ */ -+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state) -+{ -+ u32 curval, newval; -+ struct task_struct *new_owner; -+ bool postunlock = false; -+ DEFINE_WAKE_Q(wake_q); -+ int ret = 0; -+ -+ new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); -+ if (WARN_ON_ONCE(!new_owner)) { -+ /* -+ * As per the comment in futex_unlock_pi() this should not happen. -+ * -+ * When this happens, give up our locks and try again, giving -+ * the futex_lock_pi() instance time to complete, either by -+ * waiting on the rtmutex or removing itself from the futex -+ * queue. -+ */ -+ ret = -EAGAIN; -+ goto out_unlock; -+ } -+ -+ /* -+ * We pass it to the next owner. The WAITERS bit is always kept -+ * enabled while there is PI state around. We cleanup the owner -+ * died bit, because we are the owner. -+ */ -+ newval = FUTEX_WAITERS | task_pid_vnr(new_owner); -+ -+ if (unlikely(should_fail_futex(true))) -+ ret = -EFAULT; -+ -+ ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); -+ if (!ret && (curval != uval)) { -+ /* -+ * If a unconditional UNLOCK_PI operation (user space did not -+ * try the TID->0 transition) raced with a waiter setting the -+ * FUTEX_WAITERS flag between get_user() and locking the hash -+ * bucket lock, retry the operation. -+ */ -+ if ((FUTEX_TID_MASK & curval) == uval) -+ ret = -EAGAIN; -+ else -+ ret = -EINVAL; -+ } -+ -+ if (ret) -+ goto out_unlock; -+ -+ /* -+ * This is a point of no return; once we modify the uval there is no -+ * going back and subsequent operations must not fail. 
-+ */ -+ -+ raw_spin_lock(&pi_state->owner->pi_lock); -+ WARN_ON(list_empty(&pi_state->list)); -+ list_del_init(&pi_state->list); -+ raw_spin_unlock(&pi_state->owner->pi_lock); -+ -+ raw_spin_lock(&new_owner->pi_lock); -+ WARN_ON(!list_empty(&pi_state->list)); -+ list_add(&pi_state->list, &new_owner->pi_state_list); -+ pi_state->owner = new_owner; -+ raw_spin_unlock(&new_owner->pi_lock); -+ -+ postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); -+ -+out_unlock: -+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); -+ -+ if (postunlock) -+ rt_mutex_postunlock(&wake_q); -+ -+ return ret; -+} -+ -+/* -+ * Express the locking dependencies for lockdep: -+ */ -+static inline void -+double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) -+{ -+ if (hb1 <= hb2) { -+ spin_lock(&hb1->lock); -+ if (hb1 < hb2) -+ spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING); -+ } else { /* hb1 > hb2 */ -+ spin_lock(&hb2->lock); -+ spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING); -+ } -+} -+ -+static inline void -+double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) -+{ -+ spin_unlock(&hb1->lock); -+ if (hb1 != hb2) -+ spin_unlock(&hb2->lock); -+} -+ -+/* -+ * Wake up waiters matching bitset queued on this futex (uaddr). 
-+ */ -+static int -+futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) -+{ -+ struct futex_hash_bucket *hb; -+ struct futex_q *this, *next; -+ union futex_key key = FUTEX_KEY_INIT; -+ int ret; -+ DEFINE_WAKE_Q(wake_q); -+ -+ if (!bitset) -+ return -EINVAL; -+ -+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ); -+ if (unlikely(ret != 0)) -+ return ret; -+ -+ hb = hash_futex(&key); -+ -+ /* Make sure we really have tasks to wakeup */ -+ if (!hb_waiters_pending(hb)) -+ return ret; -+ -+ spin_lock(&hb->lock); -+ -+ plist_for_each_entry_safe(this, next, &hb->chain, list) { -+ if (match_futex (&this->key, &key)) { -+ if (this->pi_state || this->rt_waiter) { -+ ret = -EINVAL; -+ break; -+ } -+ -+ /* Check if one of the bits is set in both bitsets */ -+ if (!(this->bitset & bitset)) -+ continue; -+ -+ mark_wake_futex(&wake_q, this); -+ if (++ret >= nr_wake) -+ break; -+ } -+ } -+ -+ spin_unlock(&hb->lock); -+ wake_up_q(&wake_q); -+ return ret; -+} -+ -+static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) -+{ -+ unsigned int op = (encoded_op & 0x70000000) >> 28; -+ unsigned int cmp = (encoded_op & 0x0f000000) >> 24; -+ int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11); -+ int cmparg = sign_extend32(encoded_op & 0x00000fff, 11); -+ int oldval, ret; -+ -+ if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { -+ if (oparg < 0 || oparg > 31) { -+ char comm[sizeof(current->comm)]; -+ /* -+ * kill this print and return -EINVAL when userspace -+ * is sane again -+ */ -+ pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n", -+ get_task_comm(comm, current), oparg); -+ oparg &= 31; -+ } -+ oparg = 1 << oparg; -+ } -+ -+ pagefault_disable(); -+ ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr); -+ pagefault_enable(); -+ if (ret) -+ return ret; -+ -+ switch (cmp) { -+ case FUTEX_OP_CMP_EQ: -+ return oldval == cmparg; -+ case FUTEX_OP_CMP_NE: -+ return oldval != cmparg; 
-+ case FUTEX_OP_CMP_LT: -+ return oldval < cmparg; -+ case FUTEX_OP_CMP_GE: -+ return oldval >= cmparg; -+ case FUTEX_OP_CMP_LE: -+ return oldval <= cmparg; -+ case FUTEX_OP_CMP_GT: -+ return oldval > cmparg; -+ default: -+ return -ENOSYS; -+ } -+} -+ -+/* -+ * Wake up all waiters hashed on the physical page that is mapped -+ * to this virtual address: -+ */ -+static int -+futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, -+ int nr_wake, int nr_wake2, int op) -+{ -+ union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; -+ struct futex_hash_bucket *hb1, *hb2; -+ struct futex_q *this, *next; -+ int ret, op_ret; -+ DEFINE_WAKE_Q(wake_q); -+ -+retry: -+ ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); -+ if (unlikely(ret != 0)) -+ return ret; -+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); -+ if (unlikely(ret != 0)) -+ return ret; -+ -+ hb1 = hash_futex(&key1); -+ hb2 = hash_futex(&key2); -+ -+retry_private: -+ double_lock_hb(hb1, hb2); -+ op_ret = futex_atomic_op_inuser(op, uaddr2); -+ if (unlikely(op_ret < 0)) { -+ double_unlock_hb(hb1, hb2); -+ -+ if (!IS_ENABLED(CONFIG_MMU) || -+ unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { -+ /* -+ * we don't get EFAULT from MMU faults if we don't have -+ * an MMU, but we might get them from range checking -+ */ -+ ret = op_ret; -+ return ret; -+ } -+ -+ if (op_ret == -EFAULT) { -+ ret = fault_in_user_writeable(uaddr2); -+ if (ret) -+ return ret; -+ } -+ -+ if (!(flags & FLAGS_SHARED)) { -+ cond_resched(); -+ goto retry_private; -+ } -+ -+ cond_resched(); -+ goto retry; -+ } -+ -+ plist_for_each_entry_safe(this, next, &hb1->chain, list) { -+ if (match_futex (&this->key, &key1)) { -+ if (this->pi_state || this->rt_waiter) { -+ ret = -EINVAL; -+ goto out_unlock; -+ } -+ mark_wake_futex(&wake_q, this); -+ if (++ret >= nr_wake) -+ break; -+ } -+ } -+ -+ if (op_ret > 0) { -+ op_ret = 0; -+ plist_for_each_entry_safe(this, next, &hb2->chain, list) { -+ 
if (match_futex (&this->key, &key2)) { -+ if (this->pi_state || this->rt_waiter) { -+ ret = -EINVAL; -+ goto out_unlock; -+ } -+ mark_wake_futex(&wake_q, this); -+ if (++op_ret >= nr_wake2) -+ break; -+ } -+ } -+ ret += op_ret; -+ } -+ -+out_unlock: -+ double_unlock_hb(hb1, hb2); -+ wake_up_q(&wake_q); -+ return ret; -+} -+ -+/** -+ * requeue_futex() - Requeue a futex_q from one hb to another -+ * @q: the futex_q to requeue -+ * @hb1: the source hash_bucket -+ * @hb2: the target hash_bucket -+ * @key2: the new key for the requeued futex_q -+ */ -+static inline -+void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, -+ struct futex_hash_bucket *hb2, union futex_key *key2) -+{ -+ -+ /* -+ * If key1 and key2 hash to the same bucket, no need to -+ * requeue. -+ */ -+ if (likely(&hb1->chain != &hb2->chain)) { -+ plist_del(&q->list, &hb1->chain); -+ hb_waiters_dec(hb1); -+ hb_waiters_inc(hb2); -+ plist_add(&q->list, &hb2->chain); -+ q->lock_ptr = &hb2->lock; -+ } -+ q->key = *key2; -+} -+ -+/** -+ * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue -+ * @q: the futex_q -+ * @key: the key of the requeue target futex -+ * @hb: the hash_bucket of the requeue target futex -+ * -+ * During futex_requeue, with requeue_pi=1, it is possible to acquire the -+ * target futex if it is uncontended or via a lock steal. Set the futex_q key -+ * to the requeue target futex so the waiter can detect the wakeup on the right -+ * futex, but remove it from the hb and NULL the rt_waiter so it can detect -+ * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock -+ * to protect access to the pi_state to fixup the owner later. Must be called -+ * with both q->lock_ptr and hb->lock held. 
-+ */ -+static inline -+void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, -+ struct futex_hash_bucket *hb) -+{ -+ q->key = *key; -+ -+ __unqueue_futex(q); -+ -+ WARN_ON(!q->rt_waiter); -+ q->rt_waiter = NULL; -+ -+ q->lock_ptr = &hb->lock; -+ -+ wake_up_state(q->task, TASK_NORMAL); -+} -+ -+/** -+ * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter -+ * @pifutex: the user address of the to futex -+ * @hb1: the from futex hash bucket, must be locked by the caller -+ * @hb2: the to futex hash bucket, must be locked by the caller -+ * @key1: the from futex key -+ * @key2: the to futex key -+ * @ps: address to store the pi_state pointer -+ * @exiting: Pointer to store the task pointer of the owner task -+ * which is in the middle of exiting -+ * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) -+ * -+ * Try and get the lock on behalf of the top waiter if we can do it atomically. -+ * Wake the top waiter if we succeed. If the caller specified set_waiters, -+ * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. -+ * hb1 and hb2 must be held by the caller. -+ * -+ * @exiting is only set when the return value is -EBUSY. If so, this holds -+ * a refcount on the exiting task on return and the caller needs to drop it -+ * after waiting for the exit to complete. 
-+ * -+ * Return: -+ * - 0 - failed to acquire the lock atomically; -+ * - >0 - acquired the lock, return value is vpid of the top_waiter -+ * - <0 - error -+ */ -+static int -+futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, -+ struct futex_hash_bucket *hb2, union futex_key *key1, -+ union futex_key *key2, struct futex_pi_state **ps, -+ struct task_struct **exiting, int set_waiters) -+{ -+ struct futex_q *top_waiter = NULL; -+ u32 curval; -+ int ret, vpid; -+ -+ if (get_futex_value_locked(&curval, pifutex)) -+ return -EFAULT; -+ -+ if (unlikely(should_fail_futex(true))) -+ return -EFAULT; -+ -+ /* -+ * Find the top_waiter and determine if there are additional waiters. -+ * If the caller intends to requeue more than 1 waiter to pifutex, -+ * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now, -+ * as we have means to handle the possible fault. If not, don't set -+ * the bit unecessarily as it will force the subsequent unlock to enter -+ * the kernel. -+ */ -+ top_waiter = futex_top_waiter(hb1, key1); -+ -+ /* There are no waiters, nothing for us to do. */ -+ if (!top_waiter) -+ return 0; -+ -+ /* Ensure we requeue to the expected futex. */ -+ if (!match_futex(top_waiter->requeue_pi_key, key2)) -+ return -EINVAL; -+ -+ /* -+ * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in -+ * the contended case or if set_waiters is 1. The pi_state is returned -+ * in ps in contended cases. -+ */ -+ vpid = task_pid_vnr(top_waiter->task); -+ ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, -+ exiting, set_waiters); -+ if (ret == 1) { -+ requeue_pi_wake_futex(top_waiter, key2, hb2); -+ return vpid; -+ } -+ return ret; -+} -+ -+/** -+ * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 -+ * @uaddr1: source futex user address -+ * @flags: futex flags (FLAGS_SHARED, etc.) 
-+ * @uaddr2: target futex user address -+ * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) -+ * @nr_requeue: number of waiters to requeue (0-INT_MAX) -+ * @cmpval: @uaddr1 expected value (or %NULL) -+ * @requeue_pi: if we are attempting to requeue from a non-pi futex to a -+ * pi futex (pi to pi requeue is not supported) -+ * -+ * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire -+ * uaddr2 atomically on behalf of the top waiter. -+ * -+ * Return: -+ * - >=0 - on success, the number of tasks requeued or woken; -+ * - <0 - on error -+ */ -+static int futex_requeue(u32 __user *uaddr1, unsigned int flags, -+ u32 __user *uaddr2, int nr_wake, int nr_requeue, -+ u32 *cmpval, int requeue_pi) -+{ -+ union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; -+ int task_count = 0, ret; -+ struct futex_pi_state *pi_state = NULL; -+ struct futex_hash_bucket *hb1, *hb2; -+ struct futex_q *this, *next; -+ DEFINE_WAKE_Q(wake_q); -+ -+ if (nr_wake < 0 || nr_requeue < 0) -+ return -EINVAL; -+ -+ /* -+ * When PI not supported: return -ENOSYS if requeue_pi is true, -+ * consequently the compiler knows requeue_pi is always false past -+ * this point which will optimize away all the conditional code -+ * further down. -+ */ -+ if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi) -+ return -ENOSYS; -+ -+ if (requeue_pi) { -+ /* -+ * Requeue PI only works on two distinct uaddrs. This -+ * check is only valid for private futexes. See below. -+ */ -+ if (uaddr1 == uaddr2) -+ return -EINVAL; -+ -+ /* -+ * requeue_pi requires a pi_state, try to allocate it now -+ * without any locks in case it fails. -+ */ -+ if (refill_pi_state_cache()) -+ return -ENOMEM; -+ /* -+ * requeue_pi must wake as many tasks as it can, up to nr_wake -+ * + nr_requeue, since it acquires the rt_mutex prior to -+ * returning to userspace, so as to not leave the rt_mutex with -+ * waiters and no owner. 
However, second and third wake-ups -+ * cannot be predicted as they involve race conditions with the -+ * first wake and a fault while looking up the pi_state. Both -+ * pthread_cond_signal() and pthread_cond_broadcast() should -+ * use nr_wake=1. -+ */ -+ if (nr_wake != 1) -+ return -EINVAL; -+ } -+ -+retry: -+ ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); -+ if (unlikely(ret != 0)) -+ return ret; -+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, -+ requeue_pi ? FUTEX_WRITE : FUTEX_READ); -+ if (unlikely(ret != 0)) -+ return ret; -+ -+ /* -+ * The check above which compares uaddrs is not sufficient for -+ * shared futexes. We need to compare the keys: -+ */ -+ if (requeue_pi && match_futex(&key1, &key2)) -+ return -EINVAL; -+ -+ hb1 = hash_futex(&key1); -+ hb2 = hash_futex(&key2); -+ -+retry_private: -+ hb_waiters_inc(hb2); -+ double_lock_hb(hb1, hb2); -+ -+ if (likely(cmpval != NULL)) { -+ u32 curval; -+ -+ ret = get_futex_value_locked(&curval, uaddr1); -+ -+ if (unlikely(ret)) { -+ double_unlock_hb(hb1, hb2); -+ hb_waiters_dec(hb2); -+ -+ ret = get_user(curval, uaddr1); -+ if (ret) -+ return ret; -+ -+ if (!(flags & FLAGS_SHARED)) -+ goto retry_private; -+ -+ goto retry; -+ } -+ if (curval != *cmpval) { -+ ret = -EAGAIN; -+ goto out_unlock; -+ } -+ } -+ -+ if (requeue_pi && (task_count - nr_wake < nr_requeue)) { -+ struct task_struct *exiting = NULL; -+ -+ /* -+ * Attempt to acquire uaddr2 and wake the top waiter. If we -+ * intend to requeue waiters, force setting the FUTEX_WAITERS -+ * bit. We force this here where we are able to easily handle -+ * faults rather in the requeue loop below. -+ */ -+ ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, -+ &key2, &pi_state, -+ &exiting, nr_requeue); -+ -+ /* -+ * At this point the top_waiter has either taken uaddr2 or is -+ * waiting on it. If the former, then the pi_state will not -+ * exist yet, look it up one more time to ensure we have a -+ * reference to it. 
If the lock was taken, ret contains the -+ * vpid of the top waiter task. -+ * If the lock was not taken, we have pi_state and an initial -+ * refcount on it. In case of an error we have nothing. -+ */ -+ if (ret > 0) { -+ WARN_ON(pi_state); -+ task_count++; -+ /* -+ * If we acquired the lock, then the user space value -+ * of uaddr2 should be vpid. It cannot be changed by -+ * the top waiter as it is blocked on hb2 lock if it -+ * tries to do so. If something fiddled with it behind -+ * our back the pi state lookup might unearth it. So -+ * we rather use the known value than rereading and -+ * handing potential crap to lookup_pi_state. -+ * -+ * If that call succeeds then we have pi_state and an -+ * initial refcount on it. -+ */ -+ ret = lookup_pi_state(uaddr2, ret, hb2, &key2, -+ &pi_state, &exiting); -+ } -+ -+ switch (ret) { -+ case 0: -+ /* We hold a reference on the pi state. */ -+ break; -+ -+ /* If the above failed, then pi_state is NULL */ -+ case -EFAULT: -+ double_unlock_hb(hb1, hb2); -+ hb_waiters_dec(hb2); -+ ret = fault_in_user_writeable(uaddr2); -+ if (!ret) -+ goto retry; -+ return ret; -+ case -EBUSY: -+ case -EAGAIN: -+ /* -+ * Two reasons for this: -+ * - EBUSY: Owner is exiting and we just wait for the -+ * exit to complete. -+ * - EAGAIN: The user space value changed. -+ */ -+ double_unlock_hb(hb1, hb2); -+ hb_waiters_dec(hb2); -+ /* -+ * Handle the case where the owner is in the middle of -+ * exiting. Wait for the exit to complete otherwise -+ * this task might loop forever, aka. live lock. -+ */ -+ wait_for_owner_exiting(ret, exiting); -+ cond_resched(); -+ goto retry; -+ default: -+ goto out_unlock; -+ } -+ } -+ -+ plist_for_each_entry_safe(this, next, &hb1->chain, list) { -+ if (task_count - nr_wake >= nr_requeue) -+ break; -+ -+ if (!match_futex(&this->key, &key1)) -+ continue; -+ -+ /* -+ * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always -+ * be paired with each other and no other futex ops. 
-+ * -+ * We should never be requeueing a futex_q with a pi_state, -+ * which is awaiting a futex_unlock_pi(). -+ */ -+ if ((requeue_pi && !this->rt_waiter) || -+ (!requeue_pi && this->rt_waiter) || -+ this->pi_state) { -+ ret = -EINVAL; -+ break; -+ } -+ -+ /* -+ * Wake nr_wake waiters. For requeue_pi, if we acquired the -+ * lock, we already woke the top_waiter. If not, it will be -+ * woken by futex_unlock_pi(). -+ */ -+ if (++task_count <= nr_wake && !requeue_pi) { -+ mark_wake_futex(&wake_q, this); -+ continue; -+ } -+ -+ /* Ensure we requeue to the expected futex for requeue_pi. */ -+ if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) { -+ ret = -EINVAL; -+ break; -+ } -+ -+ /* -+ * Requeue nr_requeue waiters and possibly one more in the case -+ * of requeue_pi if we couldn't acquire the lock atomically. -+ */ -+ if (requeue_pi) { -+ /* -+ * Prepare the waiter to take the rt_mutex. Take a -+ * refcount on the pi_state and store the pointer in -+ * the futex_q object of the waiter. -+ */ -+ get_pi_state(pi_state); -+ this->pi_state = pi_state; -+ ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, -+ this->rt_waiter, -+ this->task); -+ if (ret == 1) { -+ /* -+ * We got the lock. We do neither drop the -+ * refcount on pi_state nor clear -+ * this->pi_state because the waiter needs the -+ * pi_state for cleaning up the user space -+ * value. It will drop the refcount after -+ * doing so. -+ */ -+ requeue_pi_wake_futex(this, &key2, hb2); -+ continue; -+ } else if (ret) { -+ /* -+ * rt_mutex_start_proxy_lock() detected a -+ * potential deadlock when we tried to queue -+ * that waiter. Drop the pi_state reference -+ * which we took above and remove the pointer -+ * to the state from the waiters futex_q -+ * object. -+ */ -+ this->pi_state = NULL; -+ put_pi_state(pi_state); -+ /* -+ * We stop queueing more waiters and let user -+ * space deal with the mess. 
-+ */ -+ break; -+ } -+ } -+ requeue_futex(this, hb1, hb2, &key2); -+ } -+ -+ /* -+ * We took an extra initial reference to the pi_state either -+ * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We -+ * need to drop it here again. -+ */ -+ put_pi_state(pi_state); -+ -+out_unlock: -+ double_unlock_hb(hb1, hb2); -+ wake_up_q(&wake_q); -+ hb_waiters_dec(hb2); -+ return ret ? ret : task_count; -+} -+ -+/* The key must be already stored in q->key. */ -+static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) -+ __acquires(&hb->lock) -+{ -+ struct futex_hash_bucket *hb; -+ -+ hb = hash_futex(&q->key); -+ -+ /* -+ * Increment the counter before taking the lock so that -+ * a potential waker won't miss a to-be-slept task that is -+ * waiting for the spinlock. This is safe as all queue_lock() -+ * users end up calling queue_me(). Similarly, for housekeeping, -+ * decrement the counter at queue_unlock() when some error has -+ * occurred and we don't end up adding the task to the list. -+ */ -+ hb_waiters_inc(hb); /* implies smp_mb(); (A) */ -+ -+ q->lock_ptr = &hb->lock; -+ -+ spin_lock(&hb->lock); -+ return hb; -+} -+ -+static inline void -+queue_unlock(struct futex_hash_bucket *hb) -+ __releases(&hb->lock) -+{ -+ spin_unlock(&hb->lock); -+ hb_waiters_dec(hb); -+} -+ -+static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) -+{ -+ int prio; -+ -+ /* -+ * The priority used to register this element is -+ * - either the real thread-priority for the real-time threads -+ * (i.e. threads with a priority lower than MAX_RT_PRIO) -+ * - or MAX_RT_PRIO for non-RT threads. -+ * Thus, all RT-threads are woken first in priority order, and -+ * the others are woken last, in FIFO order. 
-+ */ -+ prio = min(current->normal_prio, MAX_RT_PRIO); -+ -+ plist_node_init(&q->list, prio); -+ plist_add(&q->list, &hb->chain); -+ q->task = current; -+} -+ -+/** -+ * queue_me() - Enqueue the futex_q on the futex_hash_bucket -+ * @q: The futex_q to enqueue -+ * @hb: The destination hash bucket -+ * -+ * The hb->lock must be held by the caller, and is released here. A call to -+ * queue_me() is typically paired with exactly one call to unqueue_me(). The -+ * exceptions involve the PI related operations, which may use unqueue_me_pi() -+ * or nothing if the unqueue is done as part of the wake process and the unqueue -+ * state is implicit in the state of woken task (see futex_wait_requeue_pi() for -+ * an example). -+ */ -+static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) -+ __releases(&hb->lock) -+{ -+ __queue_me(q, hb); -+ spin_unlock(&hb->lock); -+} -+ -+/** -+ * unqueue_me() - Remove the futex_q from its futex_hash_bucket -+ * @q: The futex_q to unqueue -+ * -+ * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must -+ * be paired with exactly one earlier call to queue_me(). -+ * -+ * Return: -+ * - 1 - if the futex_q was still queued (and we removed unqueued it); -+ * - 0 - if the futex_q was already removed by the waking thread -+ */ -+static int unqueue_me(struct futex_q *q) -+{ -+ spinlock_t *lock_ptr; -+ int ret = 0; -+ -+ /* In the common case we don't take the spinlock, which is nice. */ -+retry: -+ /* -+ * q->lock_ptr can change between this read and the following spin_lock. -+ * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and -+ * optimizing lock_ptr out of the logic below. -+ */ -+ lock_ptr = READ_ONCE(q->lock_ptr); -+ if (lock_ptr != NULL) { -+ spin_lock(lock_ptr); -+ /* -+ * q->lock_ptr can change between reading it and -+ * spin_lock(), causing us to take the wrong lock. This -+ * corrects the race condition. 
-+ * -+ * Reasoning goes like this: if we have the wrong lock, -+ * q->lock_ptr must have changed (maybe several times) -+ * between reading it and the spin_lock(). It can -+ * change again after the spin_lock() but only if it was -+ * already changed before the spin_lock(). It cannot, -+ * however, change back to the original value. Therefore -+ * we can detect whether we acquired the correct lock. -+ */ -+ if (unlikely(lock_ptr != q->lock_ptr)) { -+ spin_unlock(lock_ptr); -+ goto retry; -+ } -+ __unqueue_futex(q); -+ -+ BUG_ON(q->pi_state); -+ -+ spin_unlock(lock_ptr); -+ ret = 1; -+ } -+ -+ return ret; -+} -+ -+/* -+ * PI futexes can not be requeued and must remove themself from the -+ * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry -+ * and dropped here. -+ */ -+static void unqueue_me_pi(struct futex_q *q) -+ __releases(q->lock_ptr) -+{ -+ __unqueue_futex(q); -+ -+ BUG_ON(!q->pi_state); -+ put_pi_state(q->pi_state); -+ q->pi_state = NULL; -+ -+ spin_unlock(q->lock_ptr); -+} -+ -+static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, -+ struct task_struct *argowner) -+{ -+ struct futex_pi_state *pi_state = q->pi_state; -+ u32 uval, curval, newval; -+ struct task_struct *oldowner, *newowner; -+ u32 newtid; -+ int ret, err = 0; -+ -+ lockdep_assert_held(q->lock_ptr); -+ -+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); -+ -+ oldowner = pi_state->owner; -+ -+ /* -+ * We are here because either: -+ * -+ * - we stole the lock and pi_state->owner needs updating to reflect -+ * that (@argowner == current), -+ * -+ * or: -+ * -+ * - someone stole our lock and we need to fix things to point to the -+ * new owner (@argowner == NULL). -+ * -+ * Either way, we have to replace the TID in the user space variable. -+ * This must be atomic as we have to preserve the owner died bit here. -+ * -+ * Note: We write the user space value _before_ changing the pi_state -+ * because we can fault here. 
Imagine swapped out pages or a fork -+ * that marked all the anonymous memory readonly for cow. -+ * -+ * Modifying pi_state _before_ the user space value would leave the -+ * pi_state in an inconsistent state when we fault here, because we -+ * need to drop the locks to handle the fault. This might be observed -+ * in the PID check in lookup_pi_state. -+ */ -+retry: -+ if (!argowner) { -+ if (oldowner != current) { -+ /* -+ * We raced against a concurrent self; things are -+ * already fixed up. Nothing to do. -+ */ -+ ret = 0; -+ goto out_unlock; -+ } -+ -+ if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) { -+ /* We got the lock after all, nothing to fix. */ -+ ret = 0; -+ goto out_unlock; -+ } -+ -+ /* -+ * Since we just failed the trylock; there must be an owner. -+ */ -+ newowner = rt_mutex_owner(&pi_state->pi_mutex); -+ BUG_ON(!newowner); -+ } else { -+ WARN_ON_ONCE(argowner != current); -+ if (oldowner == current) { -+ /* -+ * We raced against a concurrent self; things are -+ * already fixed up. Nothing to do. -+ */ -+ ret = 0; -+ goto out_unlock; -+ } -+ newowner = argowner; -+ } -+ -+ newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; -+ /* Owner died? */ -+ if (!pi_state->owner) -+ newtid |= FUTEX_OWNER_DIED; -+ -+ err = get_futex_value_locked(&uval, uaddr); -+ if (err) -+ goto handle_err; -+ -+ for (;;) { -+ newval = (uval & FUTEX_OWNER_DIED) | newtid; -+ -+ err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); -+ if (err) -+ goto handle_err; -+ -+ if (curval == uval) -+ break; -+ uval = curval; -+ } -+ -+ /* -+ * We fixed up user space. Now we need to fix the pi_state -+ * itself. 
-+ */ -+ if (pi_state->owner != NULL) { -+ raw_spin_lock(&pi_state->owner->pi_lock); -+ WARN_ON(list_empty(&pi_state->list)); -+ list_del_init(&pi_state->list); -+ raw_spin_unlock(&pi_state->owner->pi_lock); -+ } -+ -+ pi_state->owner = newowner; -+ -+ raw_spin_lock(&newowner->pi_lock); -+ WARN_ON(!list_empty(&pi_state->list)); -+ list_add(&pi_state->list, &newowner->pi_state_list); -+ raw_spin_unlock(&newowner->pi_lock); -+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); -+ -+ return 0; -+ -+ /* -+ * In order to reschedule or handle a page fault, we need to drop the -+ * locks here. In the case of a fault, this gives the other task -+ * (either the highest priority waiter itself or the task which stole -+ * the rtmutex) the chance to try the fixup of the pi_state. So once we -+ * are back from handling the fault we need to check the pi_state after -+ * reacquiring the locks and before trying to do another fixup. When -+ * the fixup has been done already we simply return. -+ * -+ * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely -+ * drop hb->lock since the caller owns the hb -> futex_q relation. -+ * Dropping the pi_mutex->wait_lock requires the state revalidate. 
-+ */ -+handle_err: -+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); -+ spin_unlock(q->lock_ptr); -+ -+ switch (err) { -+ case -EFAULT: -+ ret = fault_in_user_writeable(uaddr); -+ break; -+ -+ case -EAGAIN: -+ cond_resched(); -+ ret = 0; -+ break; -+ -+ default: -+ WARN_ON_ONCE(1); -+ ret = err; -+ break; -+ } -+ -+ spin_lock(q->lock_ptr); -+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); -+ -+ /* -+ * Check if someone else fixed it for us: -+ */ -+ if (pi_state->owner != oldowner) { -+ ret = 0; -+ goto out_unlock; -+ } -+ -+ if (ret) -+ goto out_unlock; -+ -+ goto retry; -+ -+out_unlock: -+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); -+ return ret; -+} -+ -+static long futex_wait_restart(struct restart_block *restart); -+ -+/** -+ * fixup_owner() - Post lock pi_state and corner case management -+ * @uaddr: user address of the futex -+ * @q: futex_q (contains pi_state and access to the rt_mutex) -+ * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) -+ * -+ * After attempting to lock an rt_mutex, this function is called to cleanup -+ * the pi_state owner as well as handle race conditions that may allow us to -+ * acquire the lock. Must be called with the hb lock held. -+ * -+ * Return: -+ * - 1 - success, lock taken; -+ * - 0 - success, lock not taken; -+ * - <0 - on error (-EFAULT) -+ */ -+static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) -+{ -+ int ret = 0; -+ -+ if (locked) { -+ /* -+ * Got the lock. We might not be the anticipated owner if we -+ * did a lock-steal - fix up the PI-state in that case: -+ * -+ * Speculative pi_state->owner read (we don't hold wait_lock); -+ * since we own the lock pi_state->owner == current is the -+ * stable state, anything else needs more attention. -+ */ -+ if (q->pi_state->owner != current) -+ ret = fixup_pi_state_owner(uaddr, q, current); -+ return ret ? ret : locked; -+ } -+ -+ /* -+ * If we didn't get the lock; check if anybody stole it from us. 
In -+ * that case, we need to fix up the uval to point to them instead of -+ * us, otherwise bad things happen. [10] -+ * -+ * Another speculative read; pi_state->owner == current is unstable -+ * but needs our attention. -+ */ -+ if (q->pi_state->owner == current) { -+ ret = fixup_pi_state_owner(uaddr, q, NULL); -+ return ret; -+ } -+ -+ /* -+ * Paranoia check. If we did not take the lock, then we should not be -+ * the owner of the rt_mutex. -+ */ -+ if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) { -+ printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " -+ "pi-state %p\n", ret, -+ q->pi_state->pi_mutex.owner, -+ q->pi_state->owner); -+ } -+ -+ return ret; -+} -+ -+/** -+ * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal -+ * @hb: the futex hash bucket, must be locked by the caller -+ * @q: the futex_q to queue up on -+ * @timeout: the prepared hrtimer_sleeper, or null for no timeout -+ */ -+static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, -+ struct hrtimer_sleeper *timeout) -+{ -+ /* -+ * The task state is guaranteed to be set before another task can -+ * wake it. set_current_state() is implemented using smp_store_mb() and -+ * queue_me() calls spin_unlock() upon completion, both serializing -+ * access to the hash list and forcing another memory barrier. -+ */ -+ set_current_state(TASK_INTERRUPTIBLE); -+ queue_me(q, hb); -+ -+ /* Arm the timer */ -+ if (timeout) -+ hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS); -+ -+ /* -+ * If we have been removed from the hash list, then another task -+ * has tried to wake us, and we can skip the call to schedule(). -+ */ -+ if (likely(!plist_node_empty(&q->list))) { -+ /* -+ * If the timer has already expired, current will already be -+ * flagged for rescheduling. Only call schedule if there -+ * is no timeout, or if it has yet to expire. 
-+ */ -+ if (!timeout || timeout->task) -+ freezable_schedule(); -+ } -+ __set_current_state(TASK_RUNNING); -+} -+ -+/** -+ * futex_wait_setup() - Prepare to wait on a futex -+ * @uaddr: the futex userspace address -+ * @val: the expected value -+ * @flags: futex flags (FLAGS_SHARED, etc.) -+ * @q: the associated futex_q -+ * @hb: storage for hash_bucket pointer to be returned to caller -+ * -+ * Setup the futex_q and locate the hash_bucket. Get the futex value and -+ * compare it with the expected value. Handle atomic faults internally. -+ * Return with the hb lock held and a q.key reference on success, and unlocked -+ * with no q.key reference on failure. -+ * -+ * Return: -+ * - 0 - uaddr contains val and hb has been locked; -+ * - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked -+ */ -+static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, -+ struct futex_q *q, struct futex_hash_bucket **hb) -+{ -+ u32 uval; -+ int ret; -+ -+ /* -+ * Access the page AFTER the hash-bucket is locked. -+ * Order is important: -+ * -+ * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); -+ * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); } -+ * -+ * The basic logical guarantee of a futex is that it blocks ONLY -+ * if cond(var) is known to be true at the time of blocking, for -+ * any cond. If we locked the hash-bucket after testing *uaddr, that -+ * would open a race condition where we could block indefinitely with -+ * cond(var) false, which would violate the guarantee. -+ * -+ * On the other hand, we insert q and release the hash-bucket only -+ * after testing *uaddr. This guarantees that futex_wait() will NOT -+ * absorb a wakeup if *uaddr does not match the desired values -+ * while the syscall executes. 
-+ */ -+retry: -+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ); -+ if (unlikely(ret != 0)) -+ return ret; -+ -+retry_private: -+ *hb = queue_lock(q); -+ -+ ret = get_futex_value_locked(&uval, uaddr); -+ -+ if (ret) { -+ queue_unlock(*hb); -+ -+ ret = get_user(uval, uaddr); -+ if (ret) -+ return ret; -+ -+ if (!(flags & FLAGS_SHARED)) -+ goto retry_private; -+ -+ goto retry; -+ } -+ -+ if (uval != val) { -+ queue_unlock(*hb); -+ ret = -EWOULDBLOCK; -+ } -+ -+ return ret; -+} -+ -+static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, -+ ktime_t *abs_time, u32 bitset) -+{ -+ struct hrtimer_sleeper timeout, *to; -+ struct restart_block *restart; -+ struct futex_hash_bucket *hb; -+ struct futex_q q = futex_q_init; -+ int ret; -+ -+ if (!bitset) -+ return -EINVAL; -+ q.bitset = bitset; -+ -+ to = futex_setup_timer(abs_time, &timeout, flags, -+ current->timer_slack_ns); -+retry: -+ /* -+ * Prepare to wait on uaddr. On success, holds hb lock and increments -+ * q.key refs. -+ */ -+ ret = futex_wait_setup(uaddr, val, flags, &q, &hb); -+ if (ret) -+ goto out; -+ -+ /* queue_me and wait for wakeup, timeout, or a signal. */ -+ futex_wait_queue_me(hb, &q, to); -+ -+ /* If we were woken (and unqueued), we succeeded, whatever. */ -+ ret = 0; -+ /* unqueue_me() drops q.key ref */ -+ if (!unqueue_me(&q)) -+ goto out; -+ ret = -ETIMEDOUT; -+ if (to && !to->task) -+ goto out; -+ -+ /* -+ * We expect signal_pending(current), but we might be the -+ * victim of a spurious wakeup as well. 
-+ */ -+ if (!signal_pending(current)) -+ goto retry; -+ -+ ret = -ERESTARTSYS; -+ if (!abs_time) -+ goto out; -+ -+ restart = &current->restart_block; -+ restart->fn = futex_wait_restart; -+ restart->futex.uaddr = uaddr; -+ restart->futex.val = val; -+ restart->futex.time = *abs_time; -+ restart->futex.bitset = bitset; -+ restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; -+ -+ ret = -ERESTART_RESTARTBLOCK; -+ -+out: -+ if (to) { -+ hrtimer_cancel(&to->timer); -+ destroy_hrtimer_on_stack(&to->timer); -+ } -+ return ret; -+} -+ -+ -+static long futex_wait_restart(struct restart_block *restart) -+{ -+ u32 __user *uaddr = restart->futex.uaddr; -+ ktime_t t, *tp = NULL; -+ -+ if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { -+ t = restart->futex.time; -+ tp = &t; -+ } -+ restart->fn = do_no_restart_syscall; -+ -+ return (long)futex_wait(uaddr, restart->futex.flags, -+ restart->futex.val, tp, restart->futex.bitset); -+} -+ -+ -+/* -+ * Userspace tried a 0 -> TID atomic transition of the futex value -+ * and failed. The kernel side here does the whole locking operation: -+ * if there are waiters then it will block as a consequence of relying -+ * on rt-mutexes, it does PI, etc. (Due to races the kernel might see -+ * a 0 value of the futex too.). -+ * -+ * Also serves as futex trylock_pi()'ing, and due semantics.
-+ */ -+static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, -+ ktime_t *time, int trylock) -+{ -+ struct hrtimer_sleeper timeout, *to; -+ struct futex_pi_state *pi_state = NULL; -+ struct task_struct *exiting = NULL; -+ struct rt_mutex_waiter rt_waiter; -+ struct futex_hash_bucket *hb; -+ struct futex_q q = futex_q_init; -+ int res, ret; -+ -+ if (!IS_ENABLED(CONFIG_FUTEX_PI)) -+ return -ENOSYS; -+ -+ if (refill_pi_state_cache()) -+ return -ENOMEM; -+ -+ to = futex_setup_timer(time, &timeout, FLAGS_CLOCKRT, 0); -+ -+retry: -+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE); -+ if (unlikely(ret != 0)) -+ goto out; -+ -+retry_private: -+ hb = queue_lock(&q); -+ -+ ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, -+ &exiting, 0); -+ if (unlikely(ret)) { -+ /* -+ * Atomic work succeeded and we got the lock, -+ * or failed. Either way, we do _not_ block. -+ */ -+ switch (ret) { -+ case 1: -+ /* We got the lock. */ -+ ret = 0; -+ goto out_unlock_put_key; -+ case -EFAULT: -+ goto uaddr_faulted; -+ case -EBUSY: -+ case -EAGAIN: -+ /* -+ * Two reasons for this: -+ * - EBUSY: Task is exiting and we just wait for the -+ * exit to complete. -+ * - EAGAIN: The user space value changed. -+ */ -+ queue_unlock(hb); -+ /* -+ * Handle the case where the owner is in the middle of -+ * exiting. Wait for the exit to complete otherwise -+ * this task might loop forever, aka. live lock. -+ */ -+ wait_for_owner_exiting(ret, exiting); -+ cond_resched(); -+ goto retry; -+ default: -+ goto out_unlock_put_key; -+ } -+ } -+ -+ WARN_ON(!q.pi_state); -+ -+ /* -+ * Only actually queue now that the atomic ops are done: -+ */ -+ __queue_me(&q, hb); -+ -+ if (trylock) { -+ ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); -+ /* Fixup the trylock return value: */ -+ ret = ret ? 
0 : -EWOULDBLOCK; -+ goto no_block; -+ } -+ -+ rt_mutex_init_waiter(&rt_waiter); -+ -+ /* -+ * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not -+ * hold it while doing rt_mutex_start_proxy(), because then it will -+ * include hb->lock in the blocking chain, even through we'll not in -+ * fact hold it while blocking. This will lead it to report -EDEADLK -+ * and BUG when futex_unlock_pi() interleaves with this. -+ * -+ * Therefore acquire wait_lock while holding hb->lock, but drop the -+ * latter before calling __rt_mutex_start_proxy_lock(). This -+ * interleaves with futex_unlock_pi() -- which does a similar lock -+ * handoff -- such that the latter can observe the futex_q::pi_state -+ * before __rt_mutex_start_proxy_lock() is done. -+ */ -+ raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); -+ spin_unlock(q.lock_ptr); -+ /* -+ * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter -+ * such that futex_unlock_pi() is guaranteed to observe the waiter when -+ * it sees the futex_q::pi_state. -+ */ -+ ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); -+ raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock); -+ -+ if (ret) { -+ if (ret == 1) -+ ret = 0; -+ goto cleanup; -+ } -+ -+ if (unlikely(to)) -+ hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); -+ -+ ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); -+ -+cleanup: -+ spin_lock(q.lock_ptr); -+ /* -+ * If we failed to acquire the lock (deadlock/signal/timeout), we must -+ * first acquire the hb->lock before removing the lock from the -+ * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait -+ * lists consistent. -+ * -+ * In particular; it is important that futex_unlock_pi() can not -+ * observe this inconsistency. 
-+ */ -+ if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) -+ ret = 0; -+ -+no_block: -+ /* -+ * Fixup the pi_state owner and possibly acquire the lock if we -+ * haven't already. -+ */ -+ res = fixup_owner(uaddr, &q, !ret); -+ /* -+ * If fixup_owner() returned an error, proprogate that. If it acquired -+ * the lock, clear our -ETIMEDOUT or -EINTR. -+ */ -+ if (res) -+ ret = (res < 0) ? res : 0; -+ -+ /* -+ * If fixup_owner() faulted and was unable to handle the fault, unlock -+ * it and return the fault to userspace. -+ */ -+ if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) { -+ pi_state = q.pi_state; -+ get_pi_state(pi_state); -+ } -+ -+ /* Unqueue and drop the lock */ -+ unqueue_me_pi(&q); -+ -+ if (pi_state) { -+ rt_mutex_futex_unlock(&pi_state->pi_mutex); -+ put_pi_state(pi_state); -+ } -+ -+ goto out; -+ -+out_unlock_put_key: -+ queue_unlock(hb); -+ -+out: -+ if (to) { -+ hrtimer_cancel(&to->timer); -+ destroy_hrtimer_on_stack(&to->timer); -+ } -+ return ret != -EINTR ? ret : -ERESTARTNOINTR; -+ -+uaddr_faulted: -+ queue_unlock(hb); -+ -+ ret = fault_in_user_writeable(uaddr); -+ if (ret) -+ goto out; -+ -+ if (!(flags & FLAGS_SHARED)) -+ goto retry_private; -+ -+ goto retry; -+} -+ -+/* -+ * Userspace attempted a TID -> 0 atomic transition, and failed. -+ * This is the in-kernel slowpath: we look up the PI state (if any), -+ * and do the rt-mutex unlock. 
-+ */ -+static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) -+{ -+ u32 curval, uval, vpid = task_pid_vnr(current); -+ union futex_key key = FUTEX_KEY_INIT; -+ struct futex_hash_bucket *hb; -+ struct futex_q *top_waiter; -+ int ret; -+ -+ if (!IS_ENABLED(CONFIG_FUTEX_PI)) -+ return -ENOSYS; -+ -+retry: -+ if (get_user(uval, uaddr)) -+ return -EFAULT; -+ /* -+ * We release only a lock we actually own: -+ */ -+ if ((uval & FUTEX_TID_MASK) != vpid) -+ return -EPERM; -+ -+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE); -+ if (ret) -+ return ret; -+ -+ hb = hash_futex(&key); -+ spin_lock(&hb->lock); -+ -+ /* -+ * Check waiters first. We do not trust user space values at -+ * all and we at least want to know if user space fiddled -+ * with the futex value instead of blindly unlocking. -+ */ -+ top_waiter = futex_top_waiter(hb, &key); -+ if (top_waiter) { -+ struct futex_pi_state *pi_state = top_waiter->pi_state; -+ -+ ret = -EINVAL; -+ if (!pi_state) -+ goto out_unlock; -+ -+ /* -+ * If current does not own the pi_state then the futex is -+ * inconsistent and user space fiddled with the futex value. -+ */ -+ if (pi_state->owner != current) -+ goto out_unlock; -+ -+ get_pi_state(pi_state); -+ /* -+ * By taking wait_lock while still holding hb->lock, we ensure -+ * there is no point where we hold neither; and therefore -+ * wake_futex_pi() must observe a state consistent with what we -+ * observed. -+ * -+ * In particular; this forces __rt_mutex_start_proxy() to -+ * complete such that we're guaranteed to observe the -+ * rt_waiter. Also see the WARN in wake_futex_pi(). -+ */ -+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); -+ spin_unlock(&hb->lock); -+ -+ /* drops pi_state->pi_mutex.wait_lock */ -+ ret = wake_futex_pi(uaddr, uval, pi_state); -+ -+ put_pi_state(pi_state); -+ -+ /* -+ * Success, we're done! No tricky corner cases. 
-+ */ -+ if (!ret) -+ goto out_putkey; -+ /* -+ * The atomic access to the futex value generated a -+ * pagefault, so retry the user-access and the wakeup: -+ */ -+ if (ret == -EFAULT) -+ goto pi_faulted; -+ /* -+ * A unconditional UNLOCK_PI op raced against a waiter -+ * setting the FUTEX_WAITERS bit. Try again. -+ */ -+ if (ret == -EAGAIN) -+ goto pi_retry; -+ /* -+ * wake_futex_pi has detected invalid state. Tell user -+ * space. -+ */ -+ goto out_putkey; -+ } -+ -+ /* -+ * We have no kernel internal state, i.e. no waiters in the -+ * kernel. Waiters which are about to queue themselves are stuck -+ * on hb->lock. So we can safely ignore them. We do neither -+ * preserve the WAITERS bit not the OWNER_DIED one. We are the -+ * owner. -+ */ -+ if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) { -+ spin_unlock(&hb->lock); -+ switch (ret) { -+ case -EFAULT: -+ goto pi_faulted; -+ -+ case -EAGAIN: -+ goto pi_retry; -+ -+ default: -+ WARN_ON_ONCE(1); -+ goto out_putkey; -+ } -+ } -+ -+ /* -+ * If uval has changed, let user space handle it. -+ */ -+ ret = (curval == uval) ? 0 : -EAGAIN; -+ -+out_unlock: -+ spin_unlock(&hb->lock); -+out_putkey: -+ return ret; -+ -+pi_retry: -+ cond_resched(); -+ goto retry; -+ -+pi_faulted: -+ -+ ret = fault_in_user_writeable(uaddr); -+ if (!ret) -+ goto retry; -+ -+ return ret; -+} -+ -+/** -+ * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex -+ * @hb: the hash_bucket futex_q was original enqueued on -+ * @q: the futex_q woken while waiting to be requeued -+ * @key2: the futex_key of the requeue target futex -+ * @timeout: the timeout associated with the wait (NULL if none) -+ * -+ * Detect if the task was woken on the initial futex as opposed to the requeue -+ * target futex. If so, determine if it was a timeout or a signal that caused -+ * the wakeup and return the appropriate error code to the caller. Must be -+ * called with the hb lock held. 
-+ * -+ * Return: -+ * - 0 = no early wakeup detected; -+ * - <0 = -ETIMEDOUT or -ERESTARTNOINTR -+ */ -+static inline -+int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, -+ struct futex_q *q, union futex_key *key2, -+ struct hrtimer_sleeper *timeout) -+{ -+ int ret = 0; -+ -+ /* -+ * With the hb lock held, we avoid races while we process the wakeup. -+ * We only need to hold hb (and not hb2) to ensure atomicity as the -+ * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb. -+ * It can't be requeued from uaddr2 to something else since we don't -+ * support a PI aware source futex for requeue. -+ */ -+ if (!match_futex(&q->key, key2)) { -+ WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr)); -+ /* -+ * We were woken prior to requeue by a timeout or a signal. -+ * Unqueue the futex_q and determine which it was. -+ */ -+ plist_del(&q->list, &hb->chain); -+ hb_waiters_dec(hb); -+ -+ /* Handle spurious wakeups gracefully */ -+ ret = -EWOULDBLOCK; -+ if (timeout && !timeout->task) -+ ret = -ETIMEDOUT; -+ else if (signal_pending(current)) -+ ret = -ERESTARTNOINTR; -+ } -+ return ret; -+} -+ -+/** -+ * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 -+ * @uaddr: the futex we initially wait on (non-pi) -+ * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be -+ * the same type, no requeueing from private to shared, etc. -+ * @val: the expected value of uaddr -+ * @abs_time: absolute timeout -+ * @bitset: 32 bit wakeup bitset set by userspace, defaults to all -+ * @uaddr2: the pi futex we will take prior to returning to user-space -+ * -+ * The caller will wait on uaddr and will be requeued by futex_requeue() to -+ * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake -+ * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to -+ * userspace. 
This ensures the rt_mutex maintains an owner when it has waiters; -+ * without one, the pi logic would not know which task to boost/deboost, if -+ * there was a need to. -+ * -+ * We call schedule in futex_wait_queue_me() when we enqueue and return there -+ * via the following-- -+ * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() -+ * 2) wakeup on uaddr2 after a requeue -+ * 3) signal -+ * 4) timeout -+ * -+ * If 3, cleanup and return -ERESTARTNOINTR. -+ * -+ * If 2, we may then block on trying to take the rt_mutex and return via: -+ * 5) successful lock -+ * 6) signal -+ * 7) timeout -+ * 8) other lock acquisition failure -+ * -+ * If 6, return -EWOULDBLOCK (restarting the syscall would do the same). -+ * -+ * If 4 or 7, we cleanup and return with -ETIMEDOUT. -+ * -+ * Return: -+ * - 0 - On success; -+ * - <0 - On error -+ */ -+static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, -+ u32 val, ktime_t *abs_time, u32 bitset, -+ u32 __user *uaddr2) -+{ -+ struct hrtimer_sleeper timeout, *to; -+ struct futex_pi_state *pi_state = NULL; -+ struct rt_mutex_waiter rt_waiter; -+ struct futex_hash_bucket *hb; -+ union futex_key key2 = FUTEX_KEY_INIT; -+ struct futex_q q = futex_q_init; -+ int res, ret; -+ -+ if (!IS_ENABLED(CONFIG_FUTEX_PI)) -+ return -ENOSYS; -+ -+ if (uaddr == uaddr2) -+ return -EINVAL; -+ -+ if (!bitset) -+ return -EINVAL; -+ -+ to = futex_setup_timer(abs_time, &timeout, flags, -+ current->timer_slack_ns); -+ -+ /* -+ * The waiter is allocated on our stack, manipulated by the requeue -+ * code while we sleep on uaddr. -+ */ -+ rt_mutex_init_waiter(&rt_waiter); -+ -+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); -+ if (unlikely(ret != 0)) -+ goto out; -+ -+ q.bitset = bitset; -+ q.rt_waiter = &rt_waiter; -+ q.requeue_pi_key = &key2; -+ -+ /* -+ * Prepare to wait on uaddr. On success, increments q.key (key1) ref -+ * count. 
-+ */ -+ ret = futex_wait_setup(uaddr, val, flags, &q, &hb); -+ if (ret) -+ goto out; -+ -+ /* -+ * The check above which compares uaddrs is not sufficient for -+ * shared futexes. We need to compare the keys: -+ */ -+ if (match_futex(&q.key, &key2)) { -+ queue_unlock(hb); -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ /* Queue the futex_q, drop the hb lock, wait for wakeup. */ -+ futex_wait_queue_me(hb, &q, to); -+ -+ spin_lock(&hb->lock); -+ ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); -+ spin_unlock(&hb->lock); -+ if (ret) -+ goto out; -+ -+ /* -+ * In order for us to be here, we know our q.key == key2, and since -+ * we took the hb->lock above, we also know that futex_requeue() has -+ * completed and we no longer have to concern ourselves with a wakeup -+ * race with the atomic proxy lock acquisition by the requeue code. The -+ * futex_requeue dropped our key1 reference and incremented our key2 -+ * reference count. -+ */ -+ -+ /* Check if the requeue code acquired the second futex for us. */ -+ if (!q.rt_waiter) { -+ /* -+ * Got the lock. We might not be the anticipated owner if we -+ * did a lock-steal - fix up the PI-state in that case. -+ */ -+ if (q.pi_state && (q.pi_state->owner != current)) { -+ spin_lock(q.lock_ptr); -+ ret = fixup_pi_state_owner(uaddr2, &q, current); -+ if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { -+ pi_state = q.pi_state; -+ get_pi_state(pi_state); -+ } -+ /* -+ * Drop the reference to the pi state which -+ * the requeue_pi() code acquired for us. -+ */ -+ put_pi_state(q.pi_state); -+ spin_unlock(q.lock_ptr); -+ } -+ } else { -+ struct rt_mutex *pi_mutex; -+ -+ /* -+ * We have been woken up by futex_unlock_pi(), a timeout, or a -+ * signal. futex_unlock_pi() will not destroy the lock_ptr nor -+ * the pi_state. 
-+ */ -+ WARN_ON(!q.pi_state); -+ pi_mutex = &q.pi_state->pi_mutex; -+ ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter); -+ -+ spin_lock(q.lock_ptr); -+ if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) -+ ret = 0; -+ -+ debug_rt_mutex_free_waiter(&rt_waiter); -+ /* -+ * Fixup the pi_state owner and possibly acquire the lock if we -+ * haven't already. -+ */ -+ res = fixup_owner(uaddr2, &q, !ret); -+ /* -+ * If fixup_owner() returned an error, proprogate that. If it -+ * acquired the lock, clear -ETIMEDOUT or -EINTR. -+ */ -+ if (res) -+ ret = (res < 0) ? res : 0; -+ -+ /* -+ * If fixup_pi_state_owner() faulted and was unable to handle -+ * the fault, unlock the rt_mutex and return the fault to -+ * userspace. -+ */ -+ if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { -+ pi_state = q.pi_state; -+ get_pi_state(pi_state); -+ } -+ -+ /* Unqueue and drop the lock. */ -+ unqueue_me_pi(&q); -+ } -+ -+ if (pi_state) { -+ rt_mutex_futex_unlock(&pi_state->pi_mutex); -+ put_pi_state(pi_state); -+ } -+ -+ if (ret == -EINTR) { -+ /* -+ * We've already been requeued, but cannot restart by calling -+ * futex_lock_pi() directly. We could restart this syscall, but -+ * it would detect that the user space "val" changed and return -+ * -EWOULDBLOCK. Save the overhead of the restart and return -+ * -EWOULDBLOCK directly. 
-+ */ -+ ret = -EWOULDBLOCK; -+ } -+ -+out: -+ if (to) { -+ hrtimer_cancel(&to->timer); -+ destroy_hrtimer_on_stack(&to->timer); -+ } -+ return ret; -+} -+ -+static long do_futex1(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, -+ u32 __user *uaddr2, u32 val2, u32 val3) -+{ -+ int cmd = op & FUTEX_CMD_MASK; -+ unsigned int flags = 0; -+ -+ if (!(op & FUTEX_PRIVATE_FLAG)) -+ flags |= FLAGS_SHARED; -+ -+ if (op & FUTEX_CLOCK_REALTIME) { -+ flags |= FLAGS_CLOCKRT; -+ if (cmd != FUTEX_WAIT && cmd != FUTEX_WAIT_BITSET && \ -+ cmd != FUTEX_WAIT_REQUEUE_PI) -+ return -ENOSYS; -+ } -+ -+ switch (cmd) { -+ case FUTEX_LOCK_PI: -+ case FUTEX_UNLOCK_PI: -+ case FUTEX_TRYLOCK_PI: -+ case FUTEX_WAIT_REQUEUE_PI: -+ case FUTEX_CMP_REQUEUE_PI: -+ if (!futex_cmpxchg_enabled) -+ return -ENOSYS; -+ } -+ -+ switch (cmd) { -+ case FUTEX_WAIT: -+ val3 = FUTEX_BITSET_MATCH_ANY; -+ fallthrough; -+ case FUTEX_WAIT_BITSET: -+ return futex_wait(uaddr, flags, val, timeout, val3); -+ case FUTEX_WAKE: -+ val3 = FUTEX_BITSET_MATCH_ANY; -+ fallthrough; -+ case FUTEX_WAKE_BITSET: -+ return futex_wake(uaddr, flags, val, val3); -+ case FUTEX_REQUEUE: -+ return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); -+ case FUTEX_CMP_REQUEUE: -+ return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); -+ case FUTEX_WAKE_OP: -+ return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); -+ case FUTEX_LOCK_PI: -+ return futex_lock_pi(uaddr, flags, timeout, 0); -+ case FUTEX_UNLOCK_PI: -+ return futex_unlock_pi(uaddr, flags); -+ case FUTEX_TRYLOCK_PI: -+ return futex_lock_pi(uaddr, flags, NULL, 1); -+ case FUTEX_WAIT_REQUEUE_PI: -+ val3 = FUTEX_BITSET_MATCH_ANY; -+ return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, -+ uaddr2); -+ case FUTEX_CMP_REQUEUE_PI: -+ return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); -+ } -+ return -ENOSYS; -+} -+ -+ -+SYSCALL_DEFINE6(futex1, u32 __user *, uaddr, int, op, u32, val, -+ struct __kernel_timespec __user *, utime, u32 __user 
*, uaddr2, -+ u32, val3) -+{ -+ struct timespec64 ts; -+ ktime_t t, *tp = NULL; -+ u32 val2 = 0; -+ int cmd = op & FUTEX_CMD_MASK; -+ -+ if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || -+ cmd == FUTEX_WAIT_BITSET || -+ cmd == FUTEX_WAIT_REQUEUE_PI)) { -+ if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) -+ return -EFAULT; -+ if (get_timespec64(&ts, utime)) -+ return -EFAULT; -+ if (!timespec64_valid(&ts)) -+ return -EINVAL; -+ -+ t = timespec64_to_ktime(ts); -+ if (cmd == FUTEX_WAIT) -+ t = ktime_add_safe(ktime_get(), t); -+ tp = &t; -+ } -+ /* -+ * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*. -+ * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. -+ */ -+ if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || -+ cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) -+ val2 = (u32) (unsigned long) utime; -+ -+ return do_futex1(uaddr, op, val, tp, uaddr2, val2, val3); -+} -+ -+static void __init futex_detect_cmpxchg(void) -+{ -+#ifndef CONFIG_HAVE_FUTEX_CMPXCHG -+ u32 curval; -+ -+ /* -+ * This will fail and we want it. Some arch implementations do -+ * runtime detection of the futex_atomic_cmpxchg_inatomic() -+ * functionality. We want to know that before we call in any -+ * of the complex code paths. Also we want to prevent -+ * registration of robust lists in that case. NULL is -+ * guaranteed to fault and we get -EFAULT on functional -+ * implementation, the non-functional ones will return -+ * -ENOSYS. -+ */ -+ if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) -+ futex_cmpxchg_enabled = 1; -+#endif -+} -+ -+static int __init futex_init(void) -+{ -+ unsigned int futex_shift; -+ unsigned long i; -+ -+#if CONFIG_BASE_SMALL -+ futex_hashsize = 16; -+#else -+ futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); -+#endif -+ -+ futex_queues = alloc_large_system_hash("futex1", sizeof(*futex_queues), -+ futex_hashsize, 0, -+ futex_hashsize < 256 ? 
HASH_SMALL : 0, -+ &futex_shift, NULL, -+ futex_hashsize, futex_hashsize); -+ futex_hashsize = 1UL << futex_shift; -+ -+ futex_detect_cmpxchg(); -+ -+ for (i = 0; i < futex_hashsize; i++) { -+ atomic_set(&futex_queues[i].waiters, 0); -+ plist_head_init(&futex_queues[i].chain); -+ spin_lock_init(&futex_queues[i].lock); -+ } -+ -+ return 0; -+} -+core_initcall(futex_init); -diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c -index 3e1a713d3e57..b53a24a99a14 100644 ---- a/kernel/sys_ni.c -+++ b/kernel/sys_ni.c -@@ -153,6 +153,8 @@ COND_SYSCALL(futex_wait); - COND_SYSCALL(futex_wake); - COND_SYSCALL(futex_waitv); - -+COND_SYSCALL(futex1); -+ - /* kernel/hrtimer.c */ - - /* kernel/itimer.c */ -diff --git a/tools/arch/x86/include/asm/unistd_64.h b/tools/arch/x86/include/asm/unistd_64.h -index 4205ed4158bf..43de5a59ac1c 100644 ---- a/tools/arch/x86/include/asm/unistd_64.h -+++ b/tools/arch/x86/include/asm/unistd_64.h -@@ -17,3 +17,15 @@ - #ifndef __NR_setns - #define __NR_setns 308 - #endif -+ -+#ifndef __NR_futex_wait -+#define __NR_futex_wait 440 -+#endif -+ -+#ifndef __NR_futex_wake -+#define __NR_futex_wake 441 -+#endif -+ -+#ifndef __NR_futex1 -+#define __NR_futex1 442 -+#endif -diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h -index dd457de21bad..f737eaeecbb6 100644 ---- a/tools/include/uapi/asm-generic/unistd.h -+++ b/tools/include/uapi/asm-generic/unistd.h -@@ -862,11 +862,15 @@ __SYSCALL(__NR_faccessat2, sys_faccessat2) - - #define __NR_futex_wait 440 - __SYSCALL(__NR_futex_wait, sys_futex_wait) -+ - #define __NR_futex_wake 441 - __SYSCALL(__NR_futex_wake, sys_futex_wake) - -+#define __NR_futex1 442 -+__SYSCALL(__NR_futex1, sys_futex1) -+ - #undef __NR_syscalls --#define __NR_syscalls 442 -+#define __NR_syscalls 443 - - /* - * 32 bit systems traditionally used different -diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl -index f30d6ae9a688..1a516b081207 
100644 ---- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl -+++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl -@@ -361,6 +361,9 @@ - 437 common openat2 sys_openat2 - 438 common pidfd_getfd sys_pidfd_getfd - 439 common faccessat2 sys_faccessat2 -+440 common futex_wait sys_futex_wait -+441 common futex_wake sys_futex_wake -+442 common futex1 sys_futex1 - - # - # x32-specific system call numbers start at 512 to avoid cache impact -diff --git a/tools/perf/bench/futex.h b/tools/perf/bench/futex.h -index 31b53cc7d5bc..baf6a0d077ac 100644 ---- a/tools/perf/bench/futex.h -+++ b/tools/perf/bench/futex.h -@@ -8,10 +8,14 @@ - #ifndef _FUTEX_H - #define _FUTEX_H - -+//#define FUTEX1 0 -+#define UNUSED(x) (void)(x) -+ - #include - #include - #include - #include -+#include - - /** - * futex() - SYS_futex syscall wrapper -@@ -34,7 +38,13 @@ - * like-named arguments in the following wrappers except where noted below. - */ - #define futex(uaddr, op, val, timeout, uaddr2, val3, opflags) \ -- syscall(SYS_futex, uaddr, op | opflags, val, timeout, uaddr2, val3) -+ syscall(__NR_futex1, uaddr, op | opflags, val, timeout, uaddr2, val3) -+ -+#define futex2_wake(uaddr, nr, flags) \ -+ syscall(__NR_futex_wake, uaddr, nr, flags | FUTEX_32) -+ -+#define futex2_wait(uaddr, val, flags, timeout) \ -+ syscall(__NR_futex_wait, uaddr, val, flags | FUTEX_32, timeout) - - /** - * futex_wait() - block on uaddr with optional timeout -@@ -43,7 +53,13 @@ - static inline int - futex_wait(u_int32_t *uaddr, u_int32_t val, struct timespec *timeout, int opflags) - { -+#ifdef FUTEX1 - return futex(uaddr, FUTEX_WAIT, val, timeout, NULL, 0, opflags); -+#else -+ UNUSED(timeout); -+ UNUSED(opflags); -+ return futex2_wait(uaddr, val, 0, NULL); -+#endif - } - - /** -@@ -53,7 +69,12 @@ futex_wait(u_int32_t *uaddr, u_int32_t val, struct timespec *timeout, int opflag - static inline int - futex_wake(u_int32_t *uaddr, int nr_wake, int opflags) - { -+#ifdef FUTEX1 - return futex(uaddr, FUTEX_WAKE, nr_wake, 
NULL, NULL, 0, opflags); -+#else -+ UNUSED(opflags); -+ return futex2_wake(uaddr, nr_wake, 0); -+#endif - } - - /** --- -2.28.0 - -From 2f5e38a4191ac6fd5040435f6a41433add3711a6 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Andr=C3=A9=20Almeida?= -Date: Thu, 15 Oct 2020 18:06:40 -0300 -Subject: [PATCH 07/13] futex2: Add support for shared futexes -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Add support for shared futexes for cross-process resources. - -Signed-off-by: André Almeida ---- - kernel/futex2.c | 169 +++++++++++++++++++++++++++++++++++++++++------- - 1 file changed, 146 insertions(+), 23 deletions(-) - -diff --git a/kernel/futex2.c b/kernel/futex2.c -index 4b782b5ef615..ae743ddf223e 100644 ---- a/kernel/futex2.c -+++ b/kernel/futex2.c -@@ -6,7 +6,9 @@ - */ - - #include -+#include - #include -+#include - #include - #include - #include -@@ -15,6 +17,7 @@ - - /** - * struct futex_waiter - List entry for a waiter -+ * @uaddr: Memory address of userspace futex - * @key.address: Memory address of userspace futex - * @key.mm: Pointer to memory management struct of this process - * @key: Stores information that uniquely identify a futex -@@ -25,6 +28,7 @@ - * @index: Index of waiter in futexv list - */ - struct futex_waiter { -+ uintptr_t uaddr; - struct futex_key { - uintptr_t address; - struct mm_struct *mm; -@@ -125,16 +129,109 @@ static inline int bucket_get_waiters(struct futex_bucket *bucket) - #endif - } - -+static u64 get_inode_sequence_number(struct inode *inode) -+{ -+ static atomic64_t i_seq; -+ u64 old; -+ -+ /* Does the inode already have a sequence number? 
*/ -+ old = atomic64_read(&inode->i_sequence); -+ if (likely(old)) -+ return old; -+ -+ for (;;) { -+ u64 new = atomic64_add_return(1, &i_seq); -+ if (WARN_ON_ONCE(!new)) -+ continue; -+ -+ old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new); -+ if (old) -+ return old; -+ return new; -+ } -+} -+ -+static int futex_get_shared_key(uintptr_t address, struct mm_struct *mm, -+ struct futex_key *key) -+{ -+ int err; -+ struct page *page, *tail; -+ struct address_space *mapping; -+ -+again: -+ err = get_user_pages_fast(address, 1, 0, &page); -+ -+ if (err < 0) -+ return err; -+ else -+ err = 0; -+ -+ -+ tail = page; -+ page = compound_head(page); -+ mapping = READ_ONCE(page->mapping); -+ -+ -+ if (unlikely(!mapping)) { -+ int shmem_swizzled; -+ -+ lock_page(page); -+ shmem_swizzled = PageSwapCache(page) || page->mapping; -+ unlock_page(page); -+ put_page(page); -+ -+ if (shmem_swizzled) -+ goto again; -+ -+ return -EFAULT; -+ } -+ -+ if (PageAnon(page)) { -+ -+ key->mm = mm; -+ key->address = address; -+ -+ } else { -+ struct inode *inode; -+ -+ rcu_read_lock(); -+ -+ if (READ_ONCE(page->mapping) != mapping) { -+ rcu_read_unlock(); -+ put_page(page); -+ -+ goto again; -+ } -+ -+ inode = READ_ONCE(mapping->host); -+ if (!inode) { -+ rcu_read_unlock(); -+ put_page(page); -+ -+ goto again; -+ } -+ -+ key->address = get_inode_sequence_number(inode); -+ key->mm = (struct mm_struct *) basepage_index(tail); -+ rcu_read_unlock(); -+ } -+ -+ put_page(page); -+ return err; -+} -+ - /** - * futex_get_bucket - Check if the user address is valid, prepare internal - * data and calculate the hash - * @uaddr: futex user address - * @key: data that uniquely identifies a futex -+ * @shared: is this a shared futex? 
- * - * Return: address of bucket on success, error code otherwise - */ - static struct futex_bucket *futex_get_bucket(void __user *uaddr, -- struct futex_key *key) -+ struct futex_key *key, -+ bool shared) - { - uintptr_t address = (uintptr_t) uaddr; - u32 hash_key; -@@ -145,8 +242,12 @@ static struct futex_bucket *futex_get_bucket(void __user *uaddr, - if (unlikely(!access_ok(address, sizeof(u32)))) - return ERR_PTR(-EFAULT); - -- key->address = address; -- key->mm = current->mm; -+ if (!shared) { -+ key->address = address; -+ key->mm = current->mm; -+ } else { -+ futex_get_shared_key(address, current->mm, key); -+ } - - /* Generate hash key for this futex using uaddr and current->mm */ - hash_key = jhash2((u32 *) key, sizeof(*key) / sizeof(u32), 0); -@@ -275,9 +376,10 @@ static int futex_dequeue_multiple(struct futexv *futexv, unsigned int nr) - * Return: 0 on success, error code otherwise - */ - static int futex_enqueue(struct futexv *futexv, unsigned int nr_futexes, -- unsigned int *awaken) -+ int *awaken) - { - int i, ret; -+ bool shared; - u32 uval, *uaddr, val; - struct futex_bucket *bucket; - -@@ -285,9 +387,13 @@ static int futex_enqueue(struct futexv *futexv, unsigned int nr_futexes, - set_current_state(TASK_INTERRUPTIBLE); - - for (i = 0; i < nr_futexes; i++) { -- uaddr = (u32 * __user) futexv->objects[i].key.address; -+ uaddr = (u32 * __user) futexv->objects[i].uaddr; - val = (u32) futexv->objects[i].val; -- bucket = futexv->objects[i].bucket; -+ shared = (futexv->objects[i].flags & FUTEX_SHARED_FLAG) ? 
true : false; -+ if (shared) -+ bucket = futex_get_bucket((void *) uaddr, &futexv->objects[i].key, true); -+ else -+ bucket = futexv->objects[i].bucket; - - bucket_inc_waiters(bucket); - spin_lock(&bucket->lock); -@@ -301,11 +407,14 @@ static int futex_enqueue(struct futexv *futexv, unsigned int nr_futexes, - __set_current_state(TASK_RUNNING); - *awaken = futex_dequeue_multiple(futexv, i); - -+ if (shared) -+ goto retry; -+ - if (__get_user(uval, uaddr)) - return -EFAULT; - - if (*awaken >= 0) -- return 0; -+ return 1; - - goto retry; - } -@@ -313,12 +422,14 @@ static int futex_enqueue(struct futexv *futexv, unsigned int nr_futexes, - if (uval != val) { - spin_unlock(&bucket->lock); - -+ - bucket_dec_waiters(bucket); - __set_current_state(TASK_RUNNING); - *awaken = futex_dequeue_multiple(futexv, i); ++ * wake nr futexes waiting for uaddr ++ */ ++static inline int futex2_wake(volatile void *uaddr, unsigned int nr, unsigned long flags) ++{ ++ return syscall(__NR_futex_wake, uaddr, nr, flags); ++} +-- +2.29.2 + + +From 1e0349f5a81a43cdb50d9a97812194df6d937b69 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Andr=C3=A9=20Almeida?= +Date: Thu, 9 Jul 2020 11:36:14 -0300 +Subject: [PATCH 5/9] selftests: futex: Add futex2 timeout test +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Adapt existing futex wait timeout file to test the same mechanism for +futex2. 
+ +Signed-off-by: André Almeida +Signed-off-by: Jan200101 +--- + .../futex/functional/futex_wait_timeout.c | 58 ++++++++++++++++--- + 1 file changed, 49 insertions(+), 9 deletions(-) + +diff --git a/tools/testing/selftests/futex/functional/futex_wait_timeout.c b/tools/testing/selftests/futex/functional/futex_wait_timeout.c +index ee55e6d38..245670e44 100644 +--- a/tools/testing/selftests/futex/functional/futex_wait_timeout.c ++++ b/tools/testing/selftests/futex/functional/futex_wait_timeout.c +@@ -11,6 +11,7 @@ + * + * HISTORY + * 2009-Nov-6: Initial version by Darren Hart ++ * 2020-Jul-9: Add futex2 test by André + * + *****************************************************************************/ -- if (*awaken >= 0) -- return 0; -+ if (*awaken >= 0) { -+ return 1; -+ } +@@ -20,7 +21,7 @@ + #include + #include + #include +-#include "futextest.h" ++#include "futex2test.h" + #include "logging.h" - return -EWOULDBLOCK; - } -@@ -336,19 +447,18 @@ static int __futex_wait(struct futexv *futexv, - struct hrtimer_sleeper *timeout) + #define TEST_NAME "futex-wait-timeout" +@@ -40,7 +41,8 @@ void usage(char *prog) + int main(int argc, char *argv[]) { - int ret; -- unsigned int awaken = -1; - -- while (1) { -- ret = futex_enqueue(futexv, nr_futexes, &awaken); - -- if (ret < 0) -- break; -+ while (1) { -+ int awaken = -1; - -- if (awaken <= 0) { -- return awaken; -+ ret = futex_enqueue(futexv, nr_futexes, &awaken); -+ if (ret) { -+ if (awaken >= 0) -+ return awaken; -+ return ret; - } + futex_t f1 = FUTEX_INITIALIZER; +- struct timespec to; ++ struct timespec to = {.tv_sec = 0, .tv_nsec = timeout_ns}; ++ struct timespec64 to64; + int res, ret = RET_PASS; + int c; -- - /* Before sleeping, check if someone was woken */ - if (!futexv->hint && (!timeout || timeout->task)) - freezable_schedule(); -@@ -419,6 +529,7 @@ static int futex_wait(struct futexv *futexv, unsigned int nr_futexes, - hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS); +@@ -65,22 +67,60 @@ int main(int 
argc, char *argv[]) } -+ - ret = __futex_wait(futexv, nr_futexes, timo ? timeout : NULL); - - -@@ -438,9 +549,10 @@ static int futex_wait(struct futexv *futexv, unsigned int nr_futexes, - SYSCALL_DEFINE4(futex_wait, void __user *, uaddr, unsigned int, val, - unsigned int, flags, struct __kernel_timespec __user *, timo) - { -+ bool shared = (flags & FUTEX_SHARED_FLAG) ? true : false; - unsigned int size = flags & FUTEX_SIZE_MASK; -- struct hrtimer_sleeper timeout; - struct futex_single_waiter wait_single; -+ struct hrtimer_sleeper timeout; - struct futex_waiter *waiter; - struct futexv *futexv; - int ret; -@@ -452,6 +564,7 @@ SYSCALL_DEFINE4(futex_wait, void __user *, uaddr, unsigned int, val, - waiter = &wait_single.waiter; - waiter->index = 0; - waiter->val = val; -+ waiter->uaddr = (uintptr_t) uaddr; - - INIT_LIST_HEAD(&waiter->list); - -@@ -462,11 +575,14 @@ SYSCALL_DEFINE4(futex_wait, void __user *, uaddr, unsigned int, val, - return -EINVAL; + ksft_print_header(); +- ksft_set_plan(1); ++ ksft_set_plan(3); + ksft_print_msg("%s: Block on a futex and wait for timeout\n", + basename(argv[0])); + ksft_print_msg("\tArguments: timeout=%ldns\n", timeout_ns); - /* Get an unlocked hash bucket */ -- waiter->bucket = futex_get_bucket(uaddr, &waiter->key); -- if (IS_ERR(waiter->bucket)) -+ waiter->bucket = futex_get_bucket(uaddr, &waiter->key, shared); -+ if (IS_ERR(waiter->bucket)) { - return PTR_ERR(waiter->bucket); +- /* initialize timeout */ +- to.tv_sec = 0; +- to.tv_nsec = timeout_ns; +- + info("Calling futex_wait on f1: %u @ %p\n", f1, &f1); + res = futex_wait(&f1, f1, &to, FUTEX_PRIVATE_FLAG); + if (!res || errno != ETIMEDOUT) { +- fail("futex_wait returned %d\n", ret < 0 ? errno : ret); ++ ksft_test_result_fail("futex_wait returned %d\n", ret < 0 ? 
errno : ret); ++ ret = RET_FAIL; ++ } else { ++ ksft_test_result_pass("futex_wait timeout succeeds\n"); + } - - ret = futex_wait(futexv, 1, timo, &timeout, flags); -+ if (ret > 0) -+ ret = 0; - - return ret; - } -@@ -486,8 +602,10 @@ static int futex_parse_waitv(struct futexv *futexv, - struct futex_waitv waitv; - unsigned int i; - struct futex_bucket *bucket; -+ bool shared; - - for (i = 0; i < nr_futexes; i++) { + - if (copy_from_user(&waitv, &uwaitv[i], sizeof(waitv))) - return -EFAULT; - -@@ -495,8 +613,10 @@ static int futex_parse_waitv(struct futexv *futexv, - (waitv.flags & FUTEX_SIZE_MASK) != FUTEX_32) - return -EINVAL; - -+ shared = (waitv.flags & FUTEX_SHARED_FLAG) ? true : false; ++ /* setting absolute monotonic timeout for futex2 */ ++ if (gettime64(CLOCK_MONOTONIC, &to64)) ++ error("gettime64 failed\n", errno); + - bucket = futex_get_bucket(waitv.uaddr, -- &futexv->objects[i].key); -+ &futexv->objects[i].key, shared); - if (IS_ERR(bucket)) - return PTR_ERR(bucket); - -@@ -505,6 +625,7 @@ static int futex_parse_waitv(struct futexv *futexv, - futexv->objects[i].flags = waitv.flags; - futexv->objects[i].index = i; - INIT_LIST_HEAD(&futexv->objects[i].list); -+ futexv->objects[i].uaddr = (uintptr_t) waitv.uaddr; - } - - return 0; -@@ -573,6 +694,7 @@ static struct futexv *futex_get_parent(uintptr_t waiter, u8 index) - SYSCALL_DEFINE3(futex_wake, void __user *, uaddr, unsigned int, nr_wake, - unsigned int, flags) - { -+ bool shared = (flags & FUTEX_SHARED_FLAG) ? 
true : false; - unsigned int size = flags & FUTEX_SIZE_MASK; - struct futex_waiter waiter, *aux, *tmp; - struct futex_bucket *bucket; -@@ -586,9 +708,10 @@ SYSCALL_DEFINE3(futex_wake, void __user *, uaddr, unsigned int, nr_wake, - if (size != FUTEX_32) - return -EINVAL; - -- bucket = futex_get_bucket(uaddr, &waiter.key); -- if (IS_ERR(bucket)) -+ bucket = futex_get_bucket(uaddr, &waiter.key, shared); -+ if (IS_ERR(bucket)) { - return PTR_ERR(bucket); ++ to64.tv_nsec += timeout_ns; ++ ++ if (to64.tv_nsec >= 1000000000) { ++ to64.tv_sec++; ++ to64.tv_nsec -= 1000000000; + } - - if (!bucket_get_waiters(bucket)) - return 0; --- -2.28.0 - -From 909eb056421668b5d42f8c4dfa92339851a43dd8 Mon Sep 17 00:00:00 2001 -From: Gabriel Krisman Bertazi -Date: Mon, 2 Nov 2020 18:41:38 -0500 -Subject: [PATCH 08/13] Revert "futex: Remove needless goto's" - -This reverts commit d7c5ed73b19c4640426d9c106f70ec2cb532034d. ---- - kernel/futex.c | 40 ++++++++++++++++++++++++---------------- - 1 file changed, 24 insertions(+), 16 deletions(-) - -diff --git a/kernel/futex.c b/kernel/futex.c -index 6c00c0952313..a671d371b11f 100644 ---- a/kernel/futex.c -+++ b/kernel/futex.c -@@ -1593,13 +1593,13 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) - - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ); - if (unlikely(ret != 0)) -- return ret; -+ goto out; - - hb = hash_futex(&key); - - /* Make sure we really have tasks to wakeup */ - if (!hb_waiters_pending(hb)) -- return ret; -+ goto out; - - spin_lock(&hb->lock); - -@@ -1622,6 +1622,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) - - spin_unlock(&hb->lock); - wake_up_q(&wake_q); -+out: - return ret; - } - -@@ -1688,10 +1689,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, - retry: - ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); - if (unlikely(ret != 0)) -- return ret; -+ goto out; - ret = get_futex_key(uaddr2, flags 
& FLAGS_SHARED, &key2, FUTEX_WRITE); - if (unlikely(ret != 0)) -- return ret; -+ goto out; - - hb1 = hash_futex(&key1); - hb2 = hash_futex(&key2); -@@ -1709,13 +1710,13 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, - * an MMU, but we might get them from range checking - */ - ret = op_ret; -- return ret; -+ goto out; - } - - if (op_ret == -EFAULT) { - ret = fault_in_user_writeable(uaddr2); - if (ret) -- return ret; -+ goto out; - } - - if (!(flags & FLAGS_SHARED)) { -@@ -1758,6 +1759,7 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, - out_unlock: - double_unlock_hb(hb1, hb2); - wake_up_q(&wake_q); -+out: - return ret; - } - -@@ -1964,18 +1966,20 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, - retry: - ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); - if (unlikely(ret != 0)) -- return ret; -+ goto out; - ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, - requeue_pi ? FUTEX_WRITE : FUTEX_READ); - if (unlikely(ret != 0)) -- return ret; -+ goto out; - - /* - * The check above which compares uaddrs is not sufficient for - * shared futexes. We need to compare the keys: - */ -- if (requeue_pi && match_futex(&key1, &key2)) -- return -EINVAL; -+ if (requeue_pi && match_futex(&key1, &key2)) { -+ ret = -EINVAL; -+ goto out; ++ ++ info("Calling futex2_wait on f1: %u @ %p\n", f1, &f1); ++ res = futex2_wait(&f1, f1, FUTEX_32, &to64); ++ if (!res || errno != ETIMEDOUT) { ++ ksft_test_result_fail("futex2_wait monotonic returned %d\n", ret < 0 ? 
errno : ret); ++ ret = RET_FAIL; ++ } else { ++ ksft_test_result_pass("futex2_wait monotonic timeout succeeds\n"); + } - - hb1 = hash_futex(&key1); - hb2 = hash_futex(&key2); -@@ -1995,7 +1999,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, - - ret = get_user(curval, uaddr1); - if (ret) -- return ret; -+ goto out; - - if (!(flags & FLAGS_SHARED)) - goto retry_private; -@@ -2061,7 +2065,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, - ret = fault_in_user_writeable(uaddr2); - if (!ret) - goto retry; -- return ret; -+ goto out; - case -EBUSY: - case -EAGAIN: - /* -@@ -2180,6 +2184,8 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, - double_unlock_hb(hb1, hb2); - wake_up_q(&wake_q); - hb_waiters_dec(hb2); + -+out: - return ret ? ret : task_count; - } - -@@ -2537,7 +2543,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) - */ - if (q->pi_state->owner != current) - ret = fixup_pi_state_owner(uaddr, q, current); -- return ret ? ret : locked; -+ goto out; - } - - /* -@@ -2550,7 +2556,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) - */ - if (q->pi_state->owner == current) { - ret = fixup_pi_state_owner(uaddr, q, NULL); -- return ret; -+ goto out; - } - - /* -@@ -2564,7 +2570,8 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) - q->pi_state->owner); - } - -- return ret; -+out: -+ return ret ? 
ret : locked; - } - - /** -@@ -2661,7 +2668,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, - - ret = get_user(uval, uaddr); - if (ret) -- return ret; -+ goto out; - - if (!(flags & FLAGS_SHARED)) - goto retry_private; -@@ -2674,6 +2681,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, - ret = -EWOULDBLOCK; ++ /* setting absolute realtime timeout for futex2 */ ++ if (gettime64(CLOCK_REALTIME, &to64)) ++ error("gettime64 failed\n", errno); ++ ++ to64.tv_nsec += timeout_ns; ++ ++ if (to64.tv_nsec >= 1000000000) { ++ to64.tv_sec++; ++ to64.tv_nsec -= 1000000000; ++ } ++ ++ info("Calling futex2_wait on f1: %u @ %p\n", f1, &f1); ++ res = futex2_wait(&f1, f1, FUTEX_32 | FUTEX_CLOCK_REALTIME, &to64); ++ if (!res || errno != ETIMEDOUT) { ++ ksft_test_result_fail("futex2_wait realtime returned %d\n", ret < 0 ? errno : ret); + ret = RET_FAIL; ++ } else { ++ ksft_test_result_pass("futex2_wait realtime timeout succeeds\n"); } -+out: +- print_result(TEST_NAME, ret); ++ ksft_print_cnts(); return ret; } - -- -2.28.0 +2.29.2 + -From fee513186b69c4a65534fd790545877974ef17d3 Mon Sep 17 00:00:00 2001 -From: Gabriel Krisman Bertazi -Date: Mon, 2 Nov 2020 18:41:54 -0500 -Subject: [PATCH 09/13] Revert "futex: Remove put_futex_key()" +From 298120f6e3a758cd03e26a104f5ce60a88501b7f Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Andr=C3=A9=20Almeida?= +Date: Thu, 9 Jul 2020 11:37:42 -0300 +Subject: [PATCH 6/9] selftests: futex: Add futex2 wouldblock test +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Adapt existing futex wait wouldblock file to test the same mechanism for +futex2. -This reverts commit 9180bd467f9abdb44afde650d07e3b9dd66d837c. 
+Signed-off-by: André Almeida +Signed-off-by: Jan200101 --- - kernel/futex.c | 61 ++++++++++++++++++++++++++++++++++++++++---------- - 1 file changed, 49 insertions(+), 12 deletions(-) + .../futex/functional/futex_wait_wouldblock.c | 33 ++++++++++++++++--- + 1 file changed, 29 insertions(+), 4 deletions(-) -diff --git a/kernel/futex.c b/kernel/futex.c -index a671d371b11f..647de692c874 100644 ---- a/kernel/futex.c -+++ b/kernel/futex.c -@@ -661,6 +661,10 @@ static int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, - return err; - } - -+static inline void put_futex_key(union futex_key *key) -+{ -+} -+ - /** - * fault_in_user_writeable() - Fault in user address and verify RW access - * @uaddr: pointer to faulting user space address -@@ -1599,7 +1603,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) - - /* Make sure we really have tasks to wakeup */ - if (!hb_waiters_pending(hb)) -- goto out; -+ goto out_put_key; - - spin_lock(&hb->lock); - -@@ -1622,6 +1626,8 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) - - spin_unlock(&hb->lock); - wake_up_q(&wake_q); -+out_put_key: -+ put_futex_key(&key); - out: - return ret; - } -@@ -1692,7 +1698,7 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, - goto out; - ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); - if (unlikely(ret != 0)) -- goto out; -+ goto out_put_key1; - - hb1 = hash_futex(&key1); - hb2 = hash_futex(&key2); -@@ -1710,13 +1716,13 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, - * an MMU, but we might get them from range checking - */ - ret = op_ret; -- goto out; -+ goto out_put_keys; - } - - if (op_ret == -EFAULT) { - ret = fault_in_user_writeable(uaddr2); - if (ret) -- goto out; -+ goto out_put_keys; - } - - if (!(flags & FLAGS_SHARED)) { -@@ -1724,6 +1730,8 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, - goto 
retry_private; - } - -+ put_futex_key(&key2); -+ put_futex_key(&key1); - cond_resched(); - goto retry; - } -@@ -1759,6 +1767,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, - out_unlock: - double_unlock_hb(hb1, hb2); - wake_up_q(&wake_q); -+out_put_keys: -+ put_futex_key(&key2); -+out_put_key1: -+ put_futex_key(&key1); - out: - return ret; - } -@@ -1970,7 +1982,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, - ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, - requeue_pi ? FUTEX_WRITE : FUTEX_READ); - if (unlikely(ret != 0)) -- goto out; -+ goto out_put_key1; - - /* - * The check above which compares uaddrs is not sufficient for -@@ -1978,7 +1990,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, - */ - if (requeue_pi && match_futex(&key1, &key2)) { - ret = -EINVAL; -- goto out; -+ goto out_put_keys; - } - - hb1 = hash_futex(&key1); -@@ -1999,11 +2011,13 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, - - ret = get_user(curval, uaddr1); - if (ret) -- goto out; -+ goto out_put_keys; - - if (!(flags & FLAGS_SHARED)) - goto retry_private; - -+ put_futex_key(&key2); -+ put_futex_key(&key1); - goto retry; - } - if (curval != *cmpval) { -@@ -2062,6 +2076,8 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, - case -EFAULT: - double_unlock_hb(hb1, hb2); - hb_waiters_dec(hb2); -+ put_futex_key(&key2); -+ put_futex_key(&key1); - ret = fault_in_user_writeable(uaddr2); - if (!ret) - goto retry; -@@ -2076,6 +2092,8 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, - */ - double_unlock_hb(hb1, hb2); - hb_waiters_dec(hb2); -+ put_futex_key(&key2); -+ put_futex_key(&key1); - /* - * Handle the case where the owner is in the middle of - * exiting. 
Wait for the exit to complete otherwise -@@ -2185,6 +2203,10 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, - wake_up_q(&wake_q); - hb_waiters_dec(hb2); - -+out_put_keys: -+ put_futex_key(&key2); -+out_put_key1: -+ put_futex_key(&key1); - out: - return ret ? ret : task_count; - } -@@ -2673,6 +2695,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, - if (!(flags & FLAGS_SHARED)) - goto retry_private; - -+ put_futex_key(&q->key); - goto retry; - } - -@@ -2682,6 +2705,8 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, - } - - out: -+ if (ret) -+ put_futex_key(&q->key); - return ret; - } - -@@ -2826,6 +2851,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, - * - EAGAIN: The user space value changed. - */ - queue_unlock(hb); -+ put_futex_key(&q.key); - /* - * Handle the case where the owner is in the middle of - * exiting. Wait for the exit to complete otherwise -@@ -2933,11 +2959,13 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, - put_pi_state(pi_state); - } - -- goto out; -+ goto out_put_key; - - out_unlock_put_key: - queue_unlock(hb); - -+out_put_key: -+ put_futex_key(&q.key); - out: - if (to) { - hrtimer_cancel(&to->timer); -@@ -2950,11 +2978,12 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, - - ret = fault_in_user_writeable(uaddr); - if (ret) -- goto out; -+ goto out_put_key; - - if (!(flags & FLAGS_SHARED)) - goto retry_private; - -+ put_futex_key(&q.key); - goto retry; - } - -@@ -3083,13 +3112,16 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) - out_unlock: - spin_unlock(&hb->lock); - out_putkey: -+ put_futex_key(&key); - return ret; - - pi_retry: -+ put_futex_key(&key); - cond_resched(); - goto retry; +diff --git a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c +index 0ae390ff8..1f72e5928 100644 +--- 
a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c ++++ b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c +@@ -12,6 +12,7 @@ + * + * HISTORY + * 2009-Nov-14: Initial version by Gowrishankar ++ * 2020-Jul-9: Add futex2 test by André + * + *****************************************************************************/ - pi_faulted: -+ put_futex_key(&key); +@@ -21,7 +22,7 @@ + #include + #include + #include +-#include "futextest.h" ++#include "futex2test.h" + #include "logging.h" - ret = fault_in_user_writeable(uaddr); - if (!ret) -@@ -3231,7 +3263,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, - */ - ret = futex_wait_setup(uaddr, val, flags, &q, &hb); - if (ret) -- goto out; -+ goto out_key2; - - /* - * The check above which compares uaddrs is not sufficient for -@@ -3240,7 +3272,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, - if (match_futex(&q.key, &key2)) { - queue_unlock(hb); - ret = -EINVAL; -- goto out; -+ goto out_put_keys; + #define TEST_NAME "futex-wait-wouldblock" +@@ -39,6 +40,7 @@ void usage(char *prog) + int main(int argc, char *argv[]) + { + struct timespec to = {.tv_sec = 0, .tv_nsec = timeout_ns}; ++ struct timespec64 to64; + futex_t f1 = FUTEX_INITIALIZER; + int res, ret = RET_PASS; + int c; +@@ -61,18 +63,41 @@ int main(int argc, char *argv[]) } - /* Queue the futex_q, drop the hb lock, wait for wakeup. 
*/ -@@ -3250,7 +3282,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, - ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); - spin_unlock(&hb->lock); - if (ret) -- goto out; -+ goto out_put_keys; + ksft_print_header(); +- ksft_set_plan(1); ++ ksft_set_plan(2); + ksft_print_msg("%s: Test the unexpected futex value in FUTEX_WAIT\n", + basename(argv[0])); - /* - * In order for us to be here, we know our q.key == key2, and since -@@ -3340,6 +3372,11 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, - ret = -EWOULDBLOCK; + info("Calling futex_wait on f1: %u @ %p with val=%u\n", f1, &f1, f1+1); + res = futex_wait(&f1, f1+1, &to, FUTEX_PRIVATE_FLAG); + if (!res || errno != EWOULDBLOCK) { +- fail("futex_wait returned: %d %s\n", ++ ksft_test_result_fail("futex_wait returned: %d %s\n", + res ? errno : res, res ? strerror(errno) : ""); + ret = RET_FAIL; ++ } else { ++ ksft_test_result_pass("futex_wait wouldblock succeeds\n"); } -+out_put_keys: -+ put_futex_key(&q.key); -+out_key2: -+ put_futex_key(&key2); -+ - out: - if (to) { - hrtimer_cancel(&to->timer); --- -2.28.0 - -From 3b1489448a277fc1c34ca12e859193c3a7f3446c Mon Sep 17 00:00:00 2001 -From: Gabriel Krisman Bertazi -Date: Fri, 12 Jul 2019 14:16:20 -0400 -Subject: [PATCH 10/13] futex: Split key setup from key queue locking and read - -split the futex key setup from the queue locking and key reading. This -is usefull to support the setup of multiple keys at the same time, like -what is done in futex_requeue() and what will be done for the -FUTEX_WAIT_MULTIPLE command. 
- -Signed-off-by: Gabriel Krisman Bertazi ---- - kernel/futex.c | 71 +++++++++++++++++++++++++++++--------------------- - 1 file changed, 42 insertions(+), 29 deletions(-) - -diff --git a/kernel/futex.c b/kernel/futex.c -index 647de692c874..f05349def492 100644 ---- a/kernel/futex.c -+++ b/kernel/futex.c -@@ -2634,6 +2634,39 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, - __set_current_state(TASK_RUNNING); - } - -+static int __futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, -+ struct futex_q *q, struct futex_hash_bucket **hb) -+{ -+ -+ u32 uval; -+ int ret; -+ -+retry_private: -+ *hb = queue_lock(q); -+ -+ ret = get_futex_value_locked(&uval, uaddr); -+ -+ if (ret) { -+ queue_unlock(*hb); -+ -+ ret = get_user(uval, uaddr); -+ if (ret) -+ return ret; +- print_result(TEST_NAME, ret); ++ /* setting absolute timeout for futex2 */ ++ if (gettime64(CLOCK_MONOTONIC, &to64)) ++ error("gettime64 failed\n", errno); + -+ if (!(flags & FLAGS_SHARED)) -+ goto retry_private; ++ to64.tv_nsec += timeout_ns; + -+ return 1; ++ if (to64.tv_nsec >= 1000000000) { ++ to64.tv_sec++; ++ to64.tv_nsec -= 1000000000; + } + -+ if (uval != val) { -+ queue_unlock(*hb); -+ ret = -EWOULDBLOCK; ++ info("Calling futex2_wait on f1: %u @ %p with val=%u\n", f1, &f1, f1+1); ++ res = futex2_wait(&f1, f1+1, FUTEX_32, &to64); ++ if (!res || errno != EWOULDBLOCK) { ++ ksft_test_result_fail("futex2_wait returned: %d %s\n", ++ res ? errno : res, res ? 
strerror(errno) : ""); ++ ret = RET_FAIL; ++ } else { ++ ksft_test_result_pass("futex2_wait wouldblock succeeds\n"); + } + -+ return ret; -+} -+ - /** - * futex_wait_setup() - Prepare to wait on a futex - * @uaddr: the futex userspace address -@@ -2654,7 +2687,6 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, - static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, - struct futex_q *q, struct futex_hash_bucket **hb) - { -- u32 uval; - int ret; - - /* -@@ -2675,38 +2707,19 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, - * absorb a wakeup if *uaddr does not match the desired values - * while the syscall executes. - */ --retry: -- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ); -- if (unlikely(ret != 0)) -- return ret; -- --retry_private: -- *hb = queue_lock(q); -+ do { -+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, -+ &q->key, FUTEX_READ); -+ if (unlikely(ret != 0)) -+ return ret; - -- ret = get_futex_value_locked(&uval, uaddr); -+ ret = __futex_wait_setup(uaddr, val, flags, q, hb); - -- if (ret) { -- queue_unlock(*hb); -- -- ret = get_user(uval, uaddr); -+ /* Drop key reference if retry or error. */ - if (ret) -- goto out; -+ put_futex_key(&q->key); -+ } while (ret > 0); - -- if (!(flags & FLAGS_SHARED)) -- goto retry_private; -- -- put_futex_key(&q->key); -- goto retry; -- } -- -- if (uval != val) { -- queue_unlock(*hb); -- ret = -EWOULDBLOCK; -- } -- --out: -- if (ret) -- put_futex_key(&q->key); ++ ksft_print_cnts(); return ret; } - -- -2.28.0 - -From 539862895e53b9a774f3a2271d1e7db57879d0d7 Mon Sep 17 00:00:00 2001 -From: Gabriel Krisman Bertazi -Date: Mon, 8 Jul 2019 09:44:09 -0400 -Subject: [PATCH 11/13] futex: Implement FUTEX_WAIT_MULTIPLE - -This is a new futex operation to allow a thread to wait on several -futexes at the same time, and wake up on any of them. 
In a sense, it -implements one of the features that was supported by pooling on the old -FUTEX_FD interface. +2.29.2 -My use case for this feature lies in Wine, where we want to implement a -similar function available in Windows, mainly for event handling. The -wine folks have an implementation of the userspace side using eventfd, -but it suffers from bad performance, as shown in the measurements below. -Technically, the old FUTEX_WAIT implementation can be easily -reimplemented using do_futex_wait_multiple, with a count one, and I have -a patch demonstrating how it works. I'm not proposing it, since futex -is such a tricky code, that I'd be more confortable to have -FUTEX_WAIT_MULTIPLE running upstream for a couple development cycles, -before considering modifying FUTEX_WAIT. - -This was tested using three mechanisms: - -1) By reimplementing FUTEX_WAIT in terms of FUTEX_WAIT_MULTIPLE and -running tools/testing/selftests/futex and a full linux distro on top of -this kernel. - -2) By an example code that exercises the FUTEX_WAIT_MULTIPLE path on a -multi thread, event handling setup. - -3) By running the Wine fsync implementation and executing multi-threaded -applications, in particular modern games on top of the implementation. +From 05c697a239aad5e8608c6acf0da9239cac5f7a2e Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Andr=C3=A9=20Almeida?= +Date: Tue, 8 Dec 2020 18:47:31 -0300 +Subject: [PATCH 7/9] selftests: futex: Add futex2 waitv test +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit -Signed-off-by: Zebediah Figura -Signed-off-by: Steven Noonan -Signed-off-by: Pierre-Loup A. 
Griffais -Signed-off-by: Gabriel Krisman Bertazi +Signed-off-by: André Almeida +Signed-off-by: Jan200101 --- - include/uapi/linux/futex.h | 7 ++ - kernel/futex.c | 159 ++++++++++++++++++++++++++++++++++++- - 2 files changed, 162 insertions(+), 4 deletions(-) + .../selftests/futex/functional/.gitignore | 1 + + .../selftests/futex/functional/Makefile | 3 +- + .../selftests/futex/functional/futex2_waitv.c | 156 ++++++++++++++++++ + .../testing/selftests/futex/functional/run.sh | 3 + + .../selftests/futex/include/futex2test.h | 25 ++- + 5 files changed, 183 insertions(+), 5 deletions(-) + create mode 100644 tools/testing/selftests/futex/functional/futex2_waitv.c -diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h -index 35a5bf1cd41b..aefb0b83b784 100644 ---- a/include/uapi/linux/futex.h -+++ b/include/uapi/linux/futex.h -@@ -21,6 +21,7 @@ - #define FUTEX_WAKE_BITSET 10 - #define FUTEX_WAIT_REQUEUE_PI 11 - #define FUTEX_CMP_REQUEUE_PI 12 -+#define FUTEX_WAIT_MULTIPLE 13 - - #define FUTEX_PRIVATE_FLAG 128 - #define FUTEX_CLOCK_REALTIME 256 -@@ -190,4 +191,10 @@ struct robust_list_head { - (((op & 0xf) << 28) | ((cmp & 0xf) << 24) \ - | ((oparg & 0xfff) << 12) | (cmparg & 0xfff)) - -+struct futex_wait_block { -+ __u32 __user *uaddr; -+ __u32 val; -+ __u32 bitset; -+}; +diff --git a/tools/testing/selftests/futex/functional/.gitignore b/tools/testing/selftests/futex/functional/.gitignore +index d61f1df94..d0b8f637b 100644 +--- a/tools/testing/selftests/futex/functional/.gitignore ++++ b/tools/testing/selftests/futex/functional/.gitignore +@@ -7,3 +7,4 @@ futex_wait_timeout + futex_wait_uninitialized_heap + futex_wait_wouldblock + futex2_wait ++futex2_waitv +diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile +index 7142a94a7..b857b9450 100644 +--- a/tools/testing/selftests/futex/functional/Makefile ++++ b/tools/testing/selftests/futex/functional/Makefile +@@ -16,7 +16,8 @@ TEST_GEN_FILES := \ + 
futex_requeue_pi_mismatched_ops \ + futex_wait_uninitialized_heap \ + futex_wait_private_mapped_file \ +- futex2_wait ++ futex2_wait \ ++ futex2_waitv + + TEST_PROGS := run.sh + +diff --git a/tools/testing/selftests/futex/functional/futex2_waitv.c b/tools/testing/selftests/futex/functional/futex2_waitv.c +new file mode 100644 +index 000000000..d4b116651 +--- /dev/null ++++ b/tools/testing/selftests/futex/functional/futex2_waitv.c +@@ -0,0 +1,156 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/****************************************************************************** ++ * ++ * Copyright Collabora Ltd., 2020 ++ * ++ * DESCRIPTION ++ * Test waitv/wake mechanism of futex2, using 32bit sized futexes. ++ * ++ * AUTHOR ++ * André Almeida ++ * ++ * HISTORY ++ * 2020-Jul-9: Initial version by André ++ * ++ *****************************************************************************/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "futex2test.h" ++#include "logging.h" ++ ++#define TEST_NAME "futex2-wait" ++#define timeout_ns 1000000000 ++#define WAKE_WAIT_US 10000 ++#define NR_FUTEXES 30 ++struct futex_waitv waitv[NR_FUTEXES]; ++u_int32_t futexes[NR_FUTEXES] = {0}; ++ ++void usage(char *prog) ++{ ++ printf("Usage: %s\n", prog); ++ printf(" -c Use color\n"); ++ printf(" -h Display this help message\n"); ++ printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", ++ VQUIET, VCRITICAL, VINFO); ++} + - #endif /* _UAPI_LINUX_FUTEX_H */ -diff --git a/kernel/futex.c b/kernel/futex.c -index f05349def492..775f780a96c4 100644 ---- a/kernel/futex.c -+++ b/kernel/futex.c -@@ -166,6 +166,7 @@ static int __read_mostly futex_cmpxchg_enabled; - #endif - #define FLAGS_CLOCKRT 0x02 - #define FLAGS_HAS_TIMEOUT 0x04 -+#define FLAGS_WAKE_MULTIPLE 0x08 - - /* - * Priority Inheritance state: -@@ -2723,6 +2724,148 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, - return ret; - } - 
-+static int do_futex_wait_multiple(struct futex_wait_block *wb, -+ u32 count, unsigned int flags, -+ ktime_t *abs_time) ++void *waiterfn(void *arg) +{ ++ struct timespec64 to64; ++ int res; + -+ struct hrtimer_sleeper timeout, *to; -+ struct futex_hash_bucket *hb; -+ struct futex_q *qs = NULL; -+ int ret; -+ int i; ++ /* setting absolute timeout for futex2 */ ++ if (gettime64(CLOCK_MONOTONIC, &to64)) ++ error("gettime64 failed\n", errno); + -+ qs = kcalloc(count, sizeof(struct futex_q), GFP_KERNEL); -+ if (!qs) -+ return -ENOMEM; ++ to64.tv_sec++; + -+ to = futex_setup_timer(abs_time, &timeout, flags, -+ current->timer_slack_ns); -+ retry: -+ for (i = 0; i < count; i++) { -+ qs[i].key = FUTEX_KEY_INIT; -+ qs[i].bitset = wb[i].bitset; -+ -+ ret = get_futex_key(wb[i].uaddr, flags & FLAGS_SHARED, -+ &qs[i].key, FUTEX_READ); -+ if (unlikely(ret != 0)) { -+ for (--i; i >= 0; i--) -+ put_futex_key(&qs[i].key); -+ goto out; -+ } ++ res = futex2_waitv(waitv, NR_FUTEXES, 0, &to64); ++ if (res < 0) { ++ printf("waiter failed errno %d %s\n", ++ res ? errno : res, ++ res ? strerror(errno) : ""); + } + -+ set_current_state(TASK_INTERRUPTIBLE); -+ -+ for (i = 0; i < count; i++) { -+ ret = __futex_wait_setup(wb[i].uaddr, wb[i].val, -+ flags, &qs[i], &hb); -+ if (ret) { -+ /* Drop the failed key directly. keys 0..(i-1) -+ * will be put by unqueue_me. */ -+ put_futex_key(&qs[i].key); ++ return NULL; ++} + -+ /* Undo the partial work we did. 
*/ -+ for (--i; i >= 0; i--) -+ unqueue_me(&qs[i]); ++int main(int argc, char *argv[]) ++{ ++ pthread_t waiter; ++ int res, ret = RET_PASS; ++ int c, i; + -+ __set_current_state(TASK_RUNNING); -+ if (ret > 0) -+ goto retry; -+ goto out; ++ while ((c = getopt(argc, argv, "cht:v:")) != -1) { ++ switch (c) { ++ case 'c': ++ log_color(1); ++ break; ++ case 'h': ++ usage(basename(argv[0])); ++ exit(0); ++ case 'v': ++ log_verbosity(atoi(optarg)); ++ break; ++ default: ++ usage(basename(argv[0])); ++ exit(1); + } -+ -+ /* We can't hold to the bucket lock when dealing with -+ * the next futex. Queue ourselves now so we can unlock -+ * it before moving on. */ -+ queue_me(&qs[i], hb); + } + -+ if (to) -+ hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); -+ -+ /* There is no easy to way to check if we are wake already on -+ * multiple futexes without waking through each one of them. So -+ * just sleep and let the scheduler handle it. -+ */ -+ if (!to || to->task) -+ freezable_schedule(); ++ ksft_print_header(); ++ ksft_set_plan(2); ++ ksft_print_msg("%s: Test FUTEX2_WAITV\n", ++ basename(argv[0])); + -+ __set_current_state(TASK_RUNNING); ++ //info("Calling private futex2_wait on f1: %u @ %p with val=%u\n", *f1, f1, *f1); ++ ++ for (i = 0; i < NR_FUTEXES; i++) { ++ waitv[i].uaddr = &futexes[i]; ++ waitv[i].flags = FUTEX_32; ++ waitv[i].val = 0; ++ } + -+ ret = -ETIMEDOUT; -+ /* If we were woken (and unqueued), we succeeded. 
*/ -+ for (i = 0; i < count; i++) -+ if (!unqueue_me(&qs[i])) -+ ret = i; ++ if (pthread_create(&waiter, NULL, waiterfn, NULL)) ++ error("pthread_create failed\n", errno); + -+ /* Succeed wakeup */ -+ if (ret >= 0) -+ goto out; ++ usleep(WAKE_WAIT_US); + -+ /* Woken by triggered timeout */ -+ if (to && !to->task) -+ goto out; ++ // info("Calling private futex2_wake on f1: %u @ %p with val=%u\n", *f1, f1, *f1); ++ res = futex2_wake(waitv[NR_FUTEXES - 1].uaddr, 1, FUTEX_32); ++ if (res != 1) { ++ ksft_test_result_fail("futex2_wake private returned: %d %s\n", ++ res ? errno : res, ++ res ? strerror(errno) : ""); ++ ret = RET_FAIL; ++ } else { ++ ksft_test_result_pass("futex2_waitv private succeeds\n"); ++ } + -+ /* -+ * We expect signal_pending(current), but we might be the -+ * victim of a spurious wakeup as well. -+ */ -+ if (!signal_pending(current)) -+ goto retry; ++ for (i = 0; i < NR_FUTEXES; i++) { ++ int shm_id = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0666); ++ if (shm_id < 0) { ++ perror("shmget"); ++ exit(1); ++ } + -+ ret = -ERESTARTSYS; -+ if (!abs_time) -+ goto out; ++ unsigned int *shared_data = shmat(shm_id, NULL, 0); ++ *shared_data = 0; + -+ ret = -ERESTART_RESTARTBLOCK; -+ out: -+ if (to) { -+ hrtimer_cancel(&to->timer); -+ destroy_hrtimer_on_stack(&to->timer); ++ waitv[i].uaddr = shared_data; ++ waitv[i].flags = FUTEX_32 | FUTEX_SHARED_FLAG; ++ waitv[i].val = 0; + } + -+ kfree(qs); -+ return ret; -+} -+ -+static int futex_wait_multiple(u32 __user *uaddr, unsigned int flags, -+ u32 count, ktime_t *abs_time) -+{ -+ struct futex_wait_block *wb; -+ struct restart_block *restart; -+ int ret; ++ //info("Calling shared futex2_wait on f1: %u @ %p with val=%u\n", *f1, f1, *f1); + -+ if (!count) -+ return -EINVAL; ++ if (pthread_create(&waiter, NULL, waiterfn, NULL)) ++ error("pthread_create failed\n", errno); + -+ wb = kcalloc(count, sizeof(struct futex_wait_block), GFP_KERNEL); -+ if (!wb) -+ return -ENOMEM; ++ usleep(WAKE_WAIT_US); + -+ if 
(copy_from_user(wb, uaddr, -+ count * sizeof(struct futex_wait_block))) { -+ ret = -EFAULT; -+ goto out; ++ // info("Calling shared futex2_wake on f1: %u @ %p with val=%u\n", *f1, f1, *f1); ++ res = futex2_wake(waitv[NR_FUTEXES - 1].uaddr, 1, FUTEX_32 | FUTEX_SHARED_FLAG); ++ if (res != 1) { ++ ksft_test_result_fail("futex2_wake shared returned: %d %s\n", ++ res ? errno : res, ++ res ? strerror(errno) : ""); ++ ret = RET_FAIL; ++ } else { ++ ksft_test_result_pass("futex2_wake shared succeeds\n"); + } + -+ ret = do_futex_wait_multiple(wb, count, flags, abs_time); -+ -+ if (ret == -ERESTART_RESTARTBLOCK) { -+ restart = ¤t->restart_block; -+ restart->fn = futex_wait_restart; -+ restart->futex.uaddr = uaddr; -+ restart->futex.val = count; -+ restart->futex.time = *abs_time; -+ restart->futex.flags = (flags | FLAGS_HAS_TIMEOUT | -+ FLAGS_WAKE_MULTIPLE); -+ } ++ for (i = 0; i < NR_FUTEXES; i++) ++ shmdt(waitv[i].uaddr); + -+out: -+ kfree(wb); ++ ksft_print_cnts(); + return ret; +} -+ - static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, - ktime_t *abs_time, u32 bitset) - { -@@ -2800,6 +2943,10 @@ static long futex_wait_restart(struct restart_block *restart) - } - restart->fn = do_no_restart_syscall; +diff --git a/tools/testing/selftests/futex/functional/run.sh b/tools/testing/selftests/futex/functional/run.sh +index 3730159c8..18b3883d7 100755 +--- a/tools/testing/selftests/futex/functional/run.sh ++++ b/tools/testing/selftests/futex/functional/run.sh +@@ -76,3 +76,6 @@ echo -+ if (restart->futex.flags & FLAGS_WAKE_MULTIPLE) -+ return (long)futex_wait_multiple(uaddr, restart->futex.flags, -+ restart->futex.val, tp); + echo + ./futex2_wait $COLOR + - return (long)futex_wait(uaddr, restart->futex.flags, - restart->futex.val, tp, restart->futex.bitset); - } -@@ -3843,6 +3990,8 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, - uaddr2); - case FUTEX_CMP_REQUEUE_PI: - return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); -+ 
case FUTEX_WAIT_MULTIPLE: -+ return futex_wait_multiple(uaddr, flags, val, timeout); - } - return -ENOSYS; - } -@@ -3859,7 +4008,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, - - if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET || -- cmd == FUTEX_WAIT_REQUEUE_PI)) { -+ cmd == FUTEX_WAIT_REQUEUE_PI || -+ cmd == FUTEX_WAIT_MULTIPLE)) { - if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) - return -EFAULT; - if (get_timespec64(&ts, utime)) -@@ -3868,7 +4018,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, - return -EINVAL; - - t = timespec64_to_ktime(ts); -- if (cmd == FUTEX_WAIT) -+ if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) - t = ktime_add_safe(ktime_get(), t); - else if (!(op & FUTEX_CLOCK_REALTIME)) - t = timens_ktime_to_host(CLOCK_MONOTONIC, t); -@@ -4055,14 +4205,15 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, - - if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET || -- cmd == FUTEX_WAIT_REQUEUE_PI)) { -+ cmd == FUTEX_WAIT_REQUEUE_PI || -+ cmd == FUTEX_WAIT_MULTIPLE)) { - if (get_old_timespec32(&ts, utime)) - return -EFAULT; - if (!timespec64_valid(&ts)) - return -EINVAL; ++echo ++./futex2_waitv $COLOR +diff --git a/tools/testing/selftests/futex/include/futex2test.h b/tools/testing/selftests/futex/include/futex2test.h +index 807b8b57f..10be0c504 100644 +--- a/tools/testing/selftests/futex/include/futex2test.h ++++ b/tools/testing/selftests/futex/include/futex2test.h +@@ -27,10 +27,18 @@ + #ifndef FUTEX_32 + #define FUTEX_32 2 + #endif +-#ifdef __x86_64__ +-# ifndef FUTEX_64 +-# define FUTEX_64 3 +-# endif ++ ++#ifndef FUTEX_SHARED_FLAG ++#define FUTEX_SHARED_FLAG 8 ++#endif ++ ++#ifndef FUTEX_WAITV_MAX ++#define FUTEX_WAITV_MAX 128 ++struct futex_waitv { ++ void *uaddr; ++ unsigned int val; ++ unsigned int flags; ++}; + #endif - t = timespec64_to_ktime(ts); -- if (cmd == FUTEX_WAIT) -+ if (cmd == 
FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) - t = ktime_add_safe(ktime_get(), t); - else if (!(op & FUTEX_CLOCK_REALTIME)) - t = timens_ktime_to_host(CLOCK_MONOTONIC, t); + /* +@@ -75,3 +83,12 @@ static inline int futex2_wake(volatile void *uaddr, unsigned int nr, unsigned lo + { + return syscall(__NR_futex_wake, uaddr, nr, flags); + } ++ ++/* ++ * wait for uaddr if (*uaddr == val) ++ */ ++static inline int futex2_waitv(volatile struct futex_waitv *waiters, unsigned long nr_waiters, ++ unsigned long flags, struct timespec64 *timo) ++{ ++ return syscall(__NR_futex_waitv, waiters, nr_waiters, flags, timo); ++} -- -2.28.0 - -From f56b85af005d46e9ef920a6728e61f7c47cf561e Mon Sep 17 00:00:00 2001 -From: Gabriel Krisman Bertazi -Date: Mon, 2 Nov 2020 18:50:26 -0500 -Subject: [PATCH 12/13] futex: Change WAIT_MULTIPLE opcode to 31 - -Signed-off-by: Gabriel Krisman Bertazi ---- - include/uapi/linux/futex.h | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) +2.29.2 -diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h -index aefb0b83b784..fe2b67ac0c5e 100644 ---- a/include/uapi/linux/futex.h -+++ b/include/uapi/linux/futex.h -@@ -21,7 +21,7 @@ - #define FUTEX_WAKE_BITSET 10 - #define FUTEX_WAIT_REQUEUE_PI 11 - #define FUTEX_CMP_REQUEUE_PI 12 --#define FUTEX_WAIT_MULTIPLE 13 -+#define FUTEX_WAIT_MULTIPLE 31 - - #define FUTEX_PRIVATE_FLAG 128 - #define FUTEX_CLOCK_REALTIME 256 --- -2.28.0 -From 022e2f888a50fb8d062e26bc385abf02c0be84a3 Mon Sep 17 00:00:00 2001 +From 9358bbdf929a90bc144d13e002fed8f4223d3178 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= -Date: Mon, 16 Nov 2020 21:22:21 -0300 -Subject: [PATCH 13/13] futex2: Add sysfs entry for syscall numbers +Date: Fri, 4 Dec 2020 19:12:23 -0300 +Subject: [PATCH 8/9] futex2: Add sysfs entry for syscall numbers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: André Almeida +Signed-off-by: Jan200101 --- kernel/futex2.c | 42 
++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/kernel/futex2.c b/kernel/futex2.c -index ae743ddf223e..4bdff8bfc78d 100644 +index 5ddb9922d..58cd8a868 100644 --- a/kernel/futex2.c +++ b/kernel/futex2.c -@@ -742,6 +742,48 @@ SYSCALL_DEFINE3(futex_wake, void __user *, uaddr, unsigned int, nr_wake, +@@ -762,6 +762,48 @@ SYSCALL_DEFINE3(futex_wake, void __user *, uaddr, unsigned int, nr_wake, return ret; } @@ -6693,5 +2623,363 @@ index ae743ddf223e..4bdff8bfc78d 100644 { int i; -- -2.28.0 +2.29.2 + + +From f7b1c9a2ad05933e559ef78bc7753b2fac1698fd Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Andr=C3=A9=20Almeida?= +Date: Tue, 5 Jan 2021 15:44:02 -0300 +Subject: [PATCH 9/9] perf bench: Add futex2 benchmark tests +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Port existing futex infrastructure to use futex2 calls. + +Signed-off-by: André Almeida +Signed-off-by: Jan200101 +--- + tools/arch/x86/include/asm/unistd_64.h | 8 +++++ + tools/perf/bench/bench.h | 3 ++ + tools/perf/bench/futex-hash.c | 24 ++++++++++++--- + tools/perf/bench/futex-wake-parallel.c | 41 ++++++++++++++++++++++---- + tools/perf/bench/futex-wake.c | 36 ++++++++++++++++++---- + tools/perf/bench/futex.h | 17 +++++++++++ + tools/perf/builtin-bench.c | 17 ++++++++--- + 7 files changed, 127 insertions(+), 19 deletions(-) + +diff --git a/tools/arch/x86/include/asm/unistd_64.h b/tools/arch/x86/include/asm/unistd_64.h +index 4205ed415..151a41ceb 100644 +--- a/tools/arch/x86/include/asm/unistd_64.h ++++ b/tools/arch/x86/include/asm/unistd_64.h +@@ -17,3 +17,11 @@ + #ifndef __NR_setns + #define __NR_setns 308 + #endif ++ ++#ifndef __NR_futex_wait ++# define __NR_futex_wait 441 ++#endif ++ ++#ifndef __NR_futex_wake ++# define __NR_futex_wake 442 ++#endif +diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h +index eac36afab..f6f881a05 100644 +--- a/tools/perf/bench/bench.h ++++ b/tools/perf/bench/bench.h +@@ -38,8 
+38,11 @@ int bench_mem_memcpy(int argc, const char **argv); + int bench_mem_memset(int argc, const char **argv); + int bench_mem_find_bit(int argc, const char **argv); + int bench_futex_hash(int argc, const char **argv); ++int bench_futex2_hash(int argc, const char **argv); + int bench_futex_wake(int argc, const char **argv); ++int bench_futex2_wake(int argc, const char **argv); + int bench_futex_wake_parallel(int argc, const char **argv); ++int bench_futex2_wake_parallel(int argc, const char **argv); + int bench_futex_requeue(int argc, const char **argv); + /* pi futexes */ + int bench_futex_lock_pi(int argc, const char **argv); +diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c +index 915bf3da7..72921c22b 100644 +--- a/tools/perf/bench/futex-hash.c ++++ b/tools/perf/bench/futex-hash.c +@@ -34,7 +34,7 @@ static unsigned int nthreads = 0; + static unsigned int nsecs = 10; + /* amount of futexes per thread */ + static unsigned int nfutexes = 1024; +-static bool fshared = false, done = false, silent = false; ++static bool fshared = false, done = false, silent = false, futex2 = false; + static int futex_flag = 0; + + struct timeval bench__start, bench__end, bench__runtime; +@@ -86,7 +86,10 @@ static void *workerfn(void *arg) + * such as internal waitqueue handling, thus enlarging + * the critical region protected by hb->lock. 
+ */ +- ret = futex_wait(&w->futex[i], 1234, NULL, futex_flag); ++ if (!futex2) ++ ret = futex_wait(&w->futex[i], 1234, NULL, futex_flag); ++ else ++ ret = futex2_wait(&w->futex[i], 1234, futex_flag, NULL); + if (!silent && + (!ret || errno != EAGAIN || errno != EWOULDBLOCK)) + warn("Non-expected futex return call"); +@@ -117,7 +120,7 @@ static void print_summary(void) + (int)bench__runtime.tv_sec); + } + +-int bench_futex_hash(int argc, const char **argv) ++static int bench_futex_hash_common(int argc, const char **argv) + { + int ret = 0; + cpu_set_t cpuset; +@@ -149,7 +152,9 @@ int bench_futex_hash(int argc, const char **argv) + if (!worker) + goto errmem; + +- if (!fshared) ++ if (futex2) ++ futex_flag = FUTEX_32 | (fshared * FUTEX_SHARED_FLAG); ++ else if (!fshared) + futex_flag = FUTEX_PRIVATE_FLAG; + + printf("Run summary [PID %d]: %d threads, each operating on %d [%s] futexes for %d secs.\n\n", +@@ -229,3 +234,14 @@ int bench_futex_hash(int argc, const char **argv) + errmem: + err(EXIT_FAILURE, "calloc"); + } ++ ++int bench_futex_hash(int argc, const char **argv) ++{ ++ return bench_futex_hash_common(argc, argv); ++} ++ ++int bench_futex2_hash(int argc, const char **argv) ++{ ++ futex2 = true; ++ return bench_futex_hash_common(argc, argv); ++} +diff --git a/tools/perf/bench/futex-wake-parallel.c b/tools/perf/bench/futex-wake-parallel.c +index cd2b81a84..540104538 100644 +--- a/tools/perf/bench/futex-wake-parallel.c ++++ b/tools/perf/bench/futex-wake-parallel.c +@@ -17,6 +17,12 @@ int bench_futex_wake_parallel(int argc __maybe_unused, const char **argv __maybe + pr_err("%s: pthread_barrier_t unavailable, disabling this test...\n", __func__); + return 0; + } ++ ++int bench_futex2_wake_parallel(int argc __maybe_unused, const char **argv __maybe_unused) ++{ ++ pr_err("%s: pthread_barrier_t unavailable, disabling this test...\n", __func__); ++ return 0; ++} + #else /* HAVE_PTHREAD_BARRIER */ + /* For the CLR_() macros */ + #include +@@ -48,7 +54,7 @@ static 
unsigned int nwakes = 1; + static u_int32_t futex = 0; + + static pthread_t *blocked_worker; +-static bool done = false, silent = false, fshared = false; ++static bool done = false, silent = false, fshared = false, futex2 = false; + static unsigned int nblocked_threads = 0, nwaking_threads = 0; + static pthread_mutex_t thread_lock; + static pthread_cond_t thread_parent, thread_worker; +@@ -79,7 +85,11 @@ static void *waking_workerfn(void *arg) + + gettimeofday(&start, NULL); + +- waker->nwoken = futex_wake(&futex, nwakes, futex_flag); ++ if (!futex2) ++ waker->nwoken = futex_wake(&futex, nwakes, futex_flag); ++ else ++ waker->nwoken = futex2_wake(&futex, nwakes, futex_flag); ++ + if (waker->nwoken != nwakes) + warnx("couldn't wakeup all tasks (%d/%d)", + waker->nwoken, nwakes); +@@ -130,8 +140,13 @@ static void *blocked_workerfn(void *arg __maybe_unused) + pthread_mutex_unlock(&thread_lock); + + while (1) { /* handle spurious wakeups */ +- if (futex_wait(&futex, 0, NULL, futex_flag) != EINTR) +- break; ++ if (!futex2) { ++ if (futex_wait(&futex, 0, NULL, futex_flag) != EINTR) ++ break; ++ } else { ++ if (futex2_wait(&futex, 0, futex_flag, NULL) != EINTR) ++ break; ++ } + } + + pthread_exit(NULL); +@@ -218,7 +233,7 @@ static void toggle_done(int sig __maybe_unused, + done = true; + } + +-int bench_futex_wake_parallel(int argc, const char **argv) ++static int bench_futex_wake_parallel_common(int argc, const char **argv) + { + int ret = 0; + unsigned int i, j; +@@ -262,7 +277,9 @@ int bench_futex_wake_parallel(int argc, const char **argv) + if (!blocked_worker) + err(EXIT_FAILURE, "calloc"); + +- if (!fshared) ++ if (futex2) ++ futex_flag = FUTEX_32 | (fshared * FUTEX_SHARED_FLAG); ++ else if (!fshared) + futex_flag = FUTEX_PRIVATE_FLAG; + + printf("Run summary [PID %d]: blocking on %d threads (at [%s] " +@@ -322,4 +339,16 @@ int bench_futex_wake_parallel(int argc, const char **argv) + free(blocked_worker); + return ret; + } ++ ++int bench_futex_wake_parallel(int 
argc, const char **argv) ++{ ++ return bench_futex_wake_parallel_common(argc, argv); ++} ++ ++int bench_futex2_wake_parallel(int argc, const char **argv) ++{ ++ futex2 = true; ++ return bench_futex_wake_parallel_common(argc, argv); ++} ++ + #endif /* HAVE_PTHREAD_BARRIER */ +diff --git a/tools/perf/bench/futex-wake.c b/tools/perf/bench/futex-wake.c +index 2dfcef3e3..b98b84e7b 100644 +--- a/tools/perf/bench/futex-wake.c ++++ b/tools/perf/bench/futex-wake.c +@@ -46,6 +46,9 @@ static struct stats waketime_stats, wakeup_stats; + static unsigned int threads_starting, nthreads = 0; + static int futex_flag = 0; + ++/* Should we use futex2 API? */ ++static bool futex2 = false; ++ + static const struct option options[] = { + OPT_UINTEGER('t', "threads", &nthreads, "Specify amount of threads"), + OPT_UINTEGER('w', "nwakes", &nwakes, "Specify amount of threads to wake at once"), +@@ -69,8 +72,13 @@ static void *workerfn(void *arg __maybe_unused) + pthread_mutex_unlock(&thread_lock); + + while (1) { +- if (futex_wait(&futex1, 0, NULL, futex_flag) != EINTR) +- break; ++ if (!futex2) { ++ if (futex_wait(&futex1, 0, NULL, futex_flag) != EINTR) ++ break; ++ } else { ++ if (futex2_wait(&futex1, 0, futex_flag, NULL) != EINTR) ++ break; ++ } + } + + pthread_exit(NULL); +@@ -118,7 +126,7 @@ static void toggle_done(int sig __maybe_unused, + done = true; + } + +-int bench_futex_wake(int argc, const char **argv) ++static int bench_futex_wake_common(int argc, const char **argv) + { + int ret = 0; + unsigned int i, j; +@@ -148,7 +156,9 @@ int bench_futex_wake(int argc, const char **argv) + if (!worker) + err(EXIT_FAILURE, "calloc"); + +- if (!fshared) ++ if (futex2) ++ futex_flag = FUTEX_32 | (fshared * FUTEX_SHARED_FLAG); ++ else if (!fshared) + futex_flag = FUTEX_PRIVATE_FLAG; + + printf("Run summary [PID %d]: blocking on %d threads (at [%s] futex %p), " +@@ -181,8 +191,13 @@ int bench_futex_wake(int argc, const char **argv) + /* Ok, all threads are patiently blocked, start waking folks 
up */ + gettimeofday(&start, NULL); + while (nwoken != nthreads) +- nwoken += futex_wake(&futex1, nwakes, futex_flag); ++ if (!futex2) { ++ nwoken += futex_wake(&futex1, nwakes, futex_flag); ++ } else { ++ nwoken += futex2_wake(&futex1, nwakes, futex_flag); ++ } + gettimeofday(&end, NULL); ++ + timersub(&end, &start, &runtime); + + update_stats(&wakeup_stats, nwoken); +@@ -212,3 +227,14 @@ int bench_futex_wake(int argc, const char **argv) + free(worker); + return ret; + } ++ ++int bench_futex_wake(int argc, const char **argv) ++{ ++ return bench_futex_wake_common(argc, argv); ++} ++ ++int bench_futex2_wake(int argc, const char **argv) ++{ ++ futex2 = true; ++ return bench_futex_wake_common(argc, argv); ++} +diff --git a/tools/perf/bench/futex.h b/tools/perf/bench/futex.h +index 31b53cc7d..5111799b5 100644 +--- a/tools/perf/bench/futex.h ++++ b/tools/perf/bench/futex.h +@@ -86,4 +86,21 @@ futex_cmp_requeue(u_int32_t *uaddr, u_int32_t val, u_int32_t *uaddr2, int nr_wak + return futex(uaddr, FUTEX_CMP_REQUEUE, nr_wake, nr_requeue, uaddr2, + val, opflags); + } ++ ++/* ++ * wait for uaddr if (*uaddr == val) ++ */ ++static inline int futex2_wait(volatile void *uaddr, unsigned long val, ++ unsigned long flags, struct timespec *timo) ++{ ++ return syscall(__NR_futex_wait, uaddr, val, flags, timo); ++} ++ ++/* ++ * wake nr futexes waiting for uaddr ++ */ ++static inline int futex2_wake(volatile void *uaddr, unsigned int nr, unsigned long flags) ++{ ++ return syscall(__NR_futex_wake, uaddr, nr, flags); ++} + #endif /* _FUTEX_H */ +diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c +index 62a7b7420..200ecacad 100644 +--- a/tools/perf/builtin-bench.c ++++ b/tools/perf/builtin-bench.c +@@ -12,10 +12,11 @@ + * + * sched ... scheduler and IPC performance + * syscall ... System call performance +- * mem ... memory access performance +- * numa ... NUMA scheduling and MM performance +- * futex ... Futex performance +- * epoll ... Event poll performance ++ * mem ... 
memory access performance ++ * numa ... NUMA scheduling and MM performance ++ * futex ... Futex performance ++ * futex2 ... Futex2 performance ++ * epoll ... Event poll performance + */ + #include + #include "builtin.h" +@@ -75,6 +76,13 @@ static struct bench futex_benchmarks[] = { + { NULL, NULL, NULL } + }; + ++static struct bench futex2_benchmarks[] = { ++ { "hash", "Benchmark for futex2 hash table", bench_futex2_hash }, ++ { "wake", "Benchmark for futex2 wake calls", bench_futex2_wake }, ++ { "wake-parallel", "Benchmark for parallel futex2 wake calls", bench_futex2_wake_parallel }, ++ { NULL, NULL, NULL } ++}; ++ + #ifdef HAVE_EVENTFD_SUPPORT + static struct bench epoll_benchmarks[] = { + { "wait", "Benchmark epoll concurrent epoll_waits", bench_epoll_wait }, +@@ -105,6 +113,7 @@ static struct collection collections[] = { + { "numa", "NUMA scheduling and MM benchmarks", numa_benchmarks }, + #endif + {"futex", "Futex stressing benchmarks", futex_benchmarks }, ++ {"futex2", "Futex2 stressing benchmarks", futex2_benchmarks }, + #ifdef HAVE_EVENTFD_SUPPORT + {"epoll", "Epoll stressing benchmarks", epoll_benchmarks }, + #endif +-- +2.29.2 -- cgit v1.2.3