/* SPDX-License-Identifier: GPL-2.0 */
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * Authors: Waiman Long <longman@redhat.com>
 */

#ifndef LOCK_EVENT
#define LOCK_EVENT(name) LOCKEVENT_ ## name,
#endif
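/*
 * Note: this list is designed to be #included more than once ("X-macro"
 * style).  Each consumer defines LOCK_EVENT(name) to expand the entries
 * the way it needs them; the fallback definition above turns every entry
 * into an enumerator.  A consumer that builds the event enum might look
 * roughly like the sketch below (illustrative only; the enum tag and the
 * lockevent_num terminator are assumptions, not defined in this file):
 *
 *	enum lock_events {
 *	#include "lock_events_list.h"
 *		lockevent_num,		// total number of lock events
 *	};
 *
 * A second inclusion with a different LOCK_EVENT() definition can then
 * generate matching event-name strings or per-cpu counters.
 */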
#ifdef CONFIG_QUEUED_SPINLOCKS
#ifdef CONFIG_PARAVIRT_SPINLOCKS
/*
 * Locking events for PV qspinlock.
 */
LOCK_EVENT(pv_hash_hops)	/* Average # of hops per hashing operation */
LOCK_EVENT(pv_kick_unlock)	/* # of vCPU kicks issued at unlock time */
LOCK_EVENT(pv_kick_wake)	/* # of vCPU kicks for pv_latency_wake */
LOCK_EVENT(pv_latency_kick)	/* Average latency (ns) of vCPU kick */
LOCK_EVENT(pv_latency_wake)	/* Average latency (ns) of kick-to-wakeup */
LOCK_EVENT(pv_lock_stealing)	/* # of lock stealing operations */
LOCK_EVENT(pv_spurious_wakeup)	/* # of spurious wakeups in non-head vCPUs */
LOCK_EVENT(pv_wait_again)	/* # of wait's after queue head vCPU kick */
LOCK_EVENT(pv_wait_early)	/* # of early vCPU wait's */
LOCK_EVENT(pv_wait_head)	/* # of vCPU wait's at the queue head */
LOCK_EVENT(pv_wait_node)	/* # of vCPU wait's at non-head queue node */
#endif /* CONFIG_PARAVIRT_SPINLOCKS */

/*
 * Locking events for qspinlock
 *
 * Subtracting lock_use_node[234] from lock_slowpath will give you
 * lock_use_node1.
 */
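/*
 * Put differently, the node1 count is not tracked on its own; it can be
 * recovered from the counters below (a sketch of the relation stated
 * above):
 *
 *	lock_use_node1 = lock_slowpath -
 *			 (lock_use_node2 + lock_use_node3 + lock_use_node4)
 */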
LOCK_EVENT(lock_pending)	/* # of locking ops via pending code */
LOCK_EVENT(lock_slowpath)	/* # of locking ops via MCS lock queue */
LOCK_EVENT(lock_use_node2)	/* # of locking ops that use 2nd percpu node */
LOCK_EVENT(lock_use_node3)	/* # of locking ops that use 3rd percpu node */
LOCK_EVENT(lock_use_node4)	/* # of locking ops that use 4th percpu node */
LOCK_EVENT(lock_no_node)	/* # of locking ops w/o using percpu node */
#endif /* CONFIG_QUEUED_SPINLOCKS */
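/*
 * The entries in this file only name the counters; how they are bumped is
 * up to the locking code that consumes the list.  In the kernel's lock
 * event framework this is typically done with lockevent_inc() or
 * lockevent_add() from the locking slow paths, with the totals read back
 * through debugfs.  A minimal, illustrative sketch (the helper names are
 * assumed from kernel/locking/lock_events.h, not defined here):
 *
 *	if (queued)
 *		lockevent_inc(lock_slowpath);
 */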
/*
 * Locking events for rwsem
 */
LOCK_EVENT(rwsem_sleep_reader)	/* # of reader sleeps */
LOCK_EVENT(rwsem_sleep_writer)	/* # of writer sleeps */
LOCK_EVENT(rwsem_wake_reader)	/* # of reader wakeups */
LOCK_EVENT(rwsem_wake_writer)	/* # of writer wakeups */
LOCK_EVENT(rwsem_opt_rlock)	/* # of read locks opt-spin acquired */
LOCK_EVENT(rwsem_opt_wlock)	/* # of write locks opt-spin acquired */
LOCK_EVENT(rwsem_opt_fail)	/* # of failed opt-spinnings */
LOCK_EVENT(rwsem_opt_nospin) /* # of disabled reader opt-spinnings */
LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */
LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */
LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */
LOCK_EVENT(rwsem_rlock_handoff) /* # of read lock handoffs */
LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */
LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */
LOCK_EVENT(rwsem_wlock_handoff) /* # of write lock handoffs */