kernel_optimize_test/kernel/locking/semaphore.c

264 lines
7.3 KiB
C
Raw Normal View History

/*
* Copyright (c) 2008 Intel Corporation
* Author: Matthew Wilcox <willy@linux.intel.com>
*
* Distributed under the terms of the GNU GPL, version 2
*
* This file implements counting semaphores.
* A counting semaphore may be acquired 'n' times before sleeping.
* See mutex.c for single-acquisition sleeping locks which enforce
* rules which allow code to be debugged more easily.
*/
/*
* Some notes on the implementation:
*
* The spinlock controls access to the other members of the semaphore.
* down_trylock() and up() can be called from interrupt context, so we
* have to disable interrupts when taking the lock. It turns out various
* parts of the kernel expect to be able to use down() on a semaphore in
* interrupt context when they know it will succeed, so we have to use
* irqsave variants for down(), down_interruptible() and down_killable()
* too.
*
* The ->count variable represents how many more tasks can acquire this
* semaphore. If it's zero, there may be tasks waiting on the wait_list.
*/
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/sched/debug.h>
#include <linux/semaphore.h>
#include <linux/spinlock.h>
#include <linux/ftrace.h>
static noinline void __down(struct semaphore *sem);
static noinline int __down_interruptible(struct semaphore *sem);
static noinline int __down_killable(struct semaphore *sem);
static noinline int __down_timeout(struct semaphore *sem, long timeout);
static noinline void __up(struct semaphore *sem);
/**
* down - acquire the semaphore
* @sem: the semaphore to be acquired
*
* Acquires the semaphore. If no more tasks are allowed to acquire the
* semaphore, calling this function will put the task to sleep until the
* semaphore is released.
*
* Use of this function is deprecated, please use down_interruptible() or
* down_killable() instead.
*/
void down(struct semaphore *sem)
{
unsigned long flags;
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
sem->count--;
else
__down(sem);
raw_spin_unlock_irqrestore(&sem->lock, flags);
}
EXPORT_SYMBOL(down);
/**
* down_interruptible - acquire the semaphore unless interrupted
* @sem: the semaphore to be acquired
*
* Attempts to acquire the semaphore. If no more tasks are allowed to
* acquire the semaphore, calling this function will put the task to sleep.
* If the sleep is interrupted by a signal, this function will return -EINTR.
* If the semaphore is successfully acquired, this function returns 0.
*/
int down_interruptible(struct semaphore *sem)
{
unsigned long flags;
int result = 0;
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
semaphore: fix Yanmin Zhang reported: | Comparing with kernel 2.6.25, AIM7 (use tmpfs) has more th | regression under 2.6.26-rc1 on my 8-core stoakley, 16-core tigerton, | and Itanium Montecito. Bisect located the patch below: | | 64ac24e738823161693bf791f87adc802cf529ff is first bad commit | commit 64ac24e738823161693bf791f87adc802cf529ff | Author: Matthew Wilcox <matthew@wil.cx> | Date: Fri Mar 7 21:55:58 2008 -0500 | | Generic semaphore implementation | | After I manually reverted the patch against 2.6.26-rc1 while fixing | lots of conflicts/errors, aim7 regression became less than 2%. i reproduced the AIM7 workload and can confirm Yanmin's findings that -.26-rc1 regresses over .25 - by over 67% here. Looking at the workload i found and fixed what i believe to be the real bug causing the AIM7 regression: it was inefficient wakeup / scheduling / locking behavior of the new generic semaphore code, causing suboptimal performance. The problem comes from the following code. The new semaphore code does this on down(): spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; else __down(sem); spin_unlock_irqrestore(&sem->lock, flags); and this on up(): spin_lock_irqsave(&sem->lock, flags); if (likely(list_empty(&sem->wait_list))) sem->count++; else __up(sem); spin_unlock_irqrestore(&sem->lock, flags); where __up() does: list_del(&waiter->list); waiter->up = 1; wake_up_process(waiter->task); and where __down() does this in essence: list_add_tail(&waiter.list, &sem->wait_list); waiter.task = task; waiter.up = 0; for (;;) { [...] spin_unlock_irq(&sem->lock); timeout = schedule_timeout(timeout); spin_lock_irq(&sem->lock); if (waiter.up) return 0; } the fastpath looks good and obvious, but note the following property of the contended path: if there's a task on the ->wait_list, the up() of the current owner will "pass over" ownership to that waiting task, in a wake-one manner, via the waiter->up flag and by removing the waiter from the wait list. That is all and fine in principle, but as implemented in kernel/semaphore.c it also creates a nasty, hidden source of contention! The contention comes from the following property of the new semaphore code: the new owner owns the semaphore exclusively, even if it is not running yet. So if the old owner, even if just a few instructions later, does a down() [lock_kernel()] again, it will be blocked and will have to wait on the new owner to eventually be scheduled (possibly on another CPU)! Or if another task gets to lock_kernel() sooner than the "new owner" scheduled, it will be blocked unnecessarily and for a very long time when there are 2000 tasks running. I.e. the implementation of the new semaphores code does wake-one and lock ownership in a very restrictive way - it does not allow opportunistic re-locking of the lock at all and keeps the scheduler from picking task order intelligently. This kind of scheduling, with 2000 AIM7 processes running, creates awful cross-scheduling between those 2000 tasks, causes reduced parallelism, a throttled runqueue length and a lot of idle time. With increasing number of CPUs it causes an exponentially worse behavior in AIM7, as the chance for a newly woken new-owner task to actually run anytime soon is less and less likely. Note that it takes just a tiny bit of contention for the 'new-semaphore catastrophy' to happen: the wakeup latencies get added to whatever small contention there is, and quickly snowball out of control! I believe Yanmin's findings and numbers support this analysis too. The best fix for this problem is to use the same scheduling logic that the kernel/mutex.c code uses: keep the wake-one behavior (that is OK and wanted because we do not want to over-schedule), but also allow opportunistic locking of the lock even if a wakee is already "in flight". The patch below implements this new logic. With this patch applied the AIM7 regression is largely fixed on my quad testbox: # v2.6.25 vanilla: .................. Tasks Jobs/Min JTI Real CPU Jobs/sec/task 2000 56096.4 91 207.5 789.7 0.4675 2000 55894.4 94 208.2 792.7 0.4658 # v2.6.26-rc1-166-gc0a1811 vanilla: ................................... Tasks Jobs/Min JTI Real CPU Jobs/sec/task 2000 33230.6 83 350.3 784.5 0.2769 2000 31778.1 86 366.3 783.6 0.2648 # v2.6.26-rc1-166-gc0a1811 + semaphore-speedup: ............................................... Tasks Jobs/Min JTI Real CPU Jobs/sec/task 2000 55707.1 92 209.0 795.6 0.4642 2000 55704.4 96 209.0 796.0 0.4642 i.e. a 67% speedup. We are now back to within 1% of the v2.6.25 performance levels and have zero idle time during the test, as expected. Btw., interactivity also improved dramatically with the fix - for example console-switching became almost instantaneous during this workload (which after all is running 2000 tasks at once!), without the patch it was stuck for a minute at times. There's another nice side-effect of this speedup patch, the new generic semaphore code got even smaller: text data bss dec hex filename 1241 0 0 1241 4d9 semaphore.o.before 1207 0 0 1207 4b7 semaphore.o.after (because the waiter.up complication got removed.) Longer-term we should look into using the mutex code for the generic semaphore code as well - but i's not easy due to legacies and it's outside of the scope of v2.6.26 and outside the scope of this patch as well. Bisected-by: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-05-08 17:53:48 +08:00
sem->count--;
else
result = __down_interruptible(sem);
raw_spin_unlock_irqrestore(&sem->lock, flags);
return result;
}
EXPORT_SYMBOL(down_interruptible);
/**
* down_killable - acquire the semaphore unless killed
* @sem: the semaphore to be acquired
*
* Attempts to acquire the semaphore. If no more tasks are allowed to
* acquire the semaphore, calling this function will put the task to sleep.
* If the sleep is interrupted by a fatal signal, this function will return
* -EINTR. If the semaphore is successfully acquired, this function returns
* 0.
*/
int down_killable(struct semaphore *sem)
{
unsigned long flags;
int result = 0;
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
semaphore: fix Yanmin Zhang reported: | Comparing with kernel 2.6.25, AIM7 (use tmpfs) has more th | regression under 2.6.26-rc1 on my 8-core stoakley, 16-core tigerton, | and Itanium Montecito. Bisect located the patch below: | | 64ac24e738823161693bf791f87adc802cf529ff is first bad commit | commit 64ac24e738823161693bf791f87adc802cf529ff | Author: Matthew Wilcox <matthew@wil.cx> | Date: Fri Mar 7 21:55:58 2008 -0500 | | Generic semaphore implementation | | After I manually reverted the patch against 2.6.26-rc1 while fixing | lots of conflicts/errors, aim7 regression became less than 2%. i reproduced the AIM7 workload and can confirm Yanmin's findings that -.26-rc1 regresses over .25 - by over 67% here. Looking at the workload i found and fixed what i believe to be the real bug causing the AIM7 regression: it was inefficient wakeup / scheduling / locking behavior of the new generic semaphore code, causing suboptimal performance. The problem comes from the following code. The new semaphore code does this on down(): spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; else __down(sem); spin_unlock_irqrestore(&sem->lock, flags); and this on up(): spin_lock_irqsave(&sem->lock, flags); if (likely(list_empty(&sem->wait_list))) sem->count++; else __up(sem); spin_unlock_irqrestore(&sem->lock, flags); where __up() does: list_del(&waiter->list); waiter->up = 1; wake_up_process(waiter->task); and where __down() does this in essence: list_add_tail(&waiter.list, &sem->wait_list); waiter.task = task; waiter.up = 0; for (;;) { [...] spin_unlock_irq(&sem->lock); timeout = schedule_timeout(timeout); spin_lock_irq(&sem->lock); if (waiter.up) return 0; } the fastpath looks good and obvious, but note the following property of the contended path: if there's a task on the ->wait_list, the up() of the current owner will "pass over" ownership to that waiting task, in a wake-one manner, via the waiter->up flag and by removing the waiter from the wait list. That is all and fine in principle, but as implemented in kernel/semaphore.c it also creates a nasty, hidden source of contention! The contention comes from the following property of the new semaphore code: the new owner owns the semaphore exclusively, even if it is not running yet. So if the old owner, even if just a few instructions later, does a down() [lock_kernel()] again, it will be blocked and will have to wait on the new owner to eventually be scheduled (possibly on another CPU)! Or if another task gets to lock_kernel() sooner than the "new owner" scheduled, it will be blocked unnecessarily and for a very long time when there are 2000 tasks running. I.e. the implementation of the new semaphores code does wake-one and lock ownership in a very restrictive way - it does not allow opportunistic re-locking of the lock at all and keeps the scheduler from picking task order intelligently. This kind of scheduling, with 2000 AIM7 processes running, creates awful cross-scheduling between those 2000 tasks, causes reduced parallelism, a throttled runqueue length and a lot of idle time. With increasing number of CPUs it causes an exponentially worse behavior in AIM7, as the chance for a newly woken new-owner task to actually run anytime soon is less and less likely. Note that it takes just a tiny bit of contention for the 'new-semaphore catastrophy' to happen: the wakeup latencies get added to whatever small contention there is, and quickly snowball out of control! I believe Yanmin's findings and numbers support this analysis too. The best fix for this problem is to use the same scheduling logic that the kernel/mutex.c code uses: keep the wake-one behavior (that is OK and wanted because we do not want to over-schedule), but also allow opportunistic locking of the lock even if a wakee is already "in flight". The patch below implements this new logic. With this patch applied the AIM7 regression is largely fixed on my quad testbox: # v2.6.25 vanilla: .................. Tasks Jobs/Min JTI Real CPU Jobs/sec/task 2000 56096.4 91 207.5 789.7 0.4675 2000 55894.4 94 208.2 792.7 0.4658 # v2.6.26-rc1-166-gc0a1811 vanilla: ................................... Tasks Jobs/Min JTI Real CPU Jobs/sec/task 2000 33230.6 83 350.3 784.5 0.2769 2000 31778.1 86 366.3 783.6 0.2648 # v2.6.26-rc1-166-gc0a1811 + semaphore-speedup: ............................................... Tasks Jobs/Min JTI Real CPU Jobs/sec/task 2000 55707.1 92 209.0 795.6 0.4642 2000 55704.4 96 209.0 796.0 0.4642 i.e. a 67% speedup. We are now back to within 1% of the v2.6.25 performance levels and have zero idle time during the test, as expected. Btw., interactivity also improved dramatically with the fix - for example console-switching became almost instantaneous during this workload (which after all is running 2000 tasks at once!), without the patch it was stuck for a minute at times. There's another nice side-effect of this speedup patch, the new generic semaphore code got even smaller: text data bss dec hex filename 1241 0 0 1241 4d9 semaphore.o.before 1207 0 0 1207 4b7 semaphore.o.after (because the waiter.up complication got removed.) Longer-term we should look into using the mutex code for the generic semaphore code as well - but i's not easy due to legacies and it's outside of the scope of v2.6.26 and outside the scope of this patch as well. Bisected-by: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-05-08 17:53:48 +08:00
sem->count--;
else
result = __down_killable(sem);
raw_spin_unlock_irqrestore(&sem->lock, flags);
return result;
}
EXPORT_SYMBOL(down_killable);
/**
* down_trylock - try to acquire the semaphore, without waiting
* @sem: the semaphore to be acquired
*
* Try to acquire the semaphore atomically. Returns 0 if the semaphore has
* been acquired successfully or 1 if it it cannot be acquired.
*
* NOTE: This return value is inverted from both spin_trylock and
* mutex_trylock! Be careful about this when converting code.
*
* Unlike mutex_trylock, this function can be used from interrupt context,
* and the semaphore can be released by any task or interrupt.
*/
int down_trylock(struct semaphore *sem)
{
unsigned long flags;
int count;
raw_spin_lock_irqsave(&sem->lock, flags);
count = sem->count - 1;
if (likely(count >= 0))
sem->count = count;
raw_spin_unlock_irqrestore(&sem->lock, flags);
return (count < 0);
}
EXPORT_SYMBOL(down_trylock);
/**
* down_timeout - acquire the semaphore within a specified time
* @sem: the semaphore to be acquired
* @timeout: how long to wait before failing
*
* Attempts to acquire the semaphore. If no more tasks are allowed to
* acquire the semaphore, calling this function will put the task to sleep.
* If the semaphore is not released within the specified number of jiffies,
* this function returns -ETIME. It returns 0 if the semaphore was acquired.
*/
int down_timeout(struct semaphore *sem, long timeout)
{
unsigned long flags;
int result = 0;
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
semaphore: fix Yanmin Zhang reported: | Comparing with kernel 2.6.25, AIM7 (use tmpfs) has more th | regression under 2.6.26-rc1 on my 8-core stoakley, 16-core tigerton, | and Itanium Montecito. Bisect located the patch below: | | 64ac24e738823161693bf791f87adc802cf529ff is first bad commit | commit 64ac24e738823161693bf791f87adc802cf529ff | Author: Matthew Wilcox <matthew@wil.cx> | Date: Fri Mar 7 21:55:58 2008 -0500 | | Generic semaphore implementation | | After I manually reverted the patch against 2.6.26-rc1 while fixing | lots of conflicts/errors, aim7 regression became less than 2%. i reproduced the AIM7 workload and can confirm Yanmin's findings that -.26-rc1 regresses over .25 - by over 67% here. Looking at the workload i found and fixed what i believe to be the real bug causing the AIM7 regression: it was inefficient wakeup / scheduling / locking behavior of the new generic semaphore code, causing suboptimal performance. The problem comes from the following code. The new semaphore code does this on down(): spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; else __down(sem); spin_unlock_irqrestore(&sem->lock, flags); and this on up(): spin_lock_irqsave(&sem->lock, flags); if (likely(list_empty(&sem->wait_list))) sem->count++; else __up(sem); spin_unlock_irqrestore(&sem->lock, flags); where __up() does: list_del(&waiter->list); waiter->up = 1; wake_up_process(waiter->task); and where __down() does this in essence: list_add_tail(&waiter.list, &sem->wait_list); waiter.task = task; waiter.up = 0; for (;;) { [...] spin_unlock_irq(&sem->lock); timeout = schedule_timeout(timeout); spin_lock_irq(&sem->lock); if (waiter.up) return 0; } the fastpath looks good and obvious, but note the following property of the contended path: if there's a task on the ->wait_list, the up() of the current owner will "pass over" ownership to that waiting task, in a wake-one manner, via the waiter->up flag and by removing the waiter from the wait list. That is all and fine in principle, but as implemented in kernel/semaphore.c it also creates a nasty, hidden source of contention! The contention comes from the following property of the new semaphore code: the new owner owns the semaphore exclusively, even if it is not running yet. So if the old owner, even if just a few instructions later, does a down() [lock_kernel()] again, it will be blocked and will have to wait on the new owner to eventually be scheduled (possibly on another CPU)! Or if another task gets to lock_kernel() sooner than the "new owner" scheduled, it will be blocked unnecessarily and for a very long time when there are 2000 tasks running. I.e. the implementation of the new semaphores code does wake-one and lock ownership in a very restrictive way - it does not allow opportunistic re-locking of the lock at all and keeps the scheduler from picking task order intelligently. This kind of scheduling, with 2000 AIM7 processes running, creates awful cross-scheduling between those 2000 tasks, causes reduced parallelism, a throttled runqueue length and a lot of idle time. With increasing number of CPUs it causes an exponentially worse behavior in AIM7, as the chance for a newly woken new-owner task to actually run anytime soon is less and less likely. Note that it takes just a tiny bit of contention for the 'new-semaphore catastrophy' to happen: the wakeup latencies get added to whatever small contention there is, and quickly snowball out of control! I believe Yanmin's findings and numbers support this analysis too. The best fix for this problem is to use the same scheduling logic that the kernel/mutex.c code uses: keep the wake-one behavior (that is OK and wanted because we do not want to over-schedule), but also allow opportunistic locking of the lock even if a wakee is already "in flight". The patch below implements this new logic. With this patch applied the AIM7 regression is largely fixed on my quad testbox: # v2.6.25 vanilla: .................. Tasks Jobs/Min JTI Real CPU Jobs/sec/task 2000 56096.4 91 207.5 789.7 0.4675 2000 55894.4 94 208.2 792.7 0.4658 # v2.6.26-rc1-166-gc0a1811 vanilla: ................................... Tasks Jobs/Min JTI Real CPU Jobs/sec/task 2000 33230.6 83 350.3 784.5 0.2769 2000 31778.1 86 366.3 783.6 0.2648 # v2.6.26-rc1-166-gc0a1811 + semaphore-speedup: ............................................... Tasks Jobs/Min JTI Real CPU Jobs/sec/task 2000 55707.1 92 209.0 795.6 0.4642 2000 55704.4 96 209.0 796.0 0.4642 i.e. a 67% speedup. We are now back to within 1% of the v2.6.25 performance levels and have zero idle time during the test, as expected. Btw., interactivity also improved dramatically with the fix - for example console-switching became almost instantaneous during this workload (which after all is running 2000 tasks at once!), without the patch it was stuck for a minute at times. There's another nice side-effect of this speedup patch, the new generic semaphore code got even smaller: text data bss dec hex filename 1241 0 0 1241 4d9 semaphore.o.before 1207 0 0 1207 4b7 semaphore.o.after (because the waiter.up complication got removed.) Longer-term we should look into using the mutex code for the generic semaphore code as well - but i's not easy due to legacies and it's outside of the scope of v2.6.26 and outside the scope of this patch as well. Bisected-by: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-05-08 17:53:48 +08:00
sem->count--;
else
result = __down_timeout(sem, timeout);
raw_spin_unlock_irqrestore(&sem->lock, flags);
return result;
}
EXPORT_SYMBOL(down_timeout);
/**
* up - release the semaphore
* @sem: the semaphore to release
*
* Release the semaphore. Unlike mutexes, up() may be called from any
* context and even by tasks which have never called down().
*/
void up(struct semaphore *sem)
{
unsigned long flags;
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(list_empty(&sem->wait_list)))
sem->count++;
else
__up(sem);
raw_spin_unlock_irqrestore(&sem->lock, flags);
}
EXPORT_SYMBOL(up);
/* Functions for the contended case */
struct semaphore_waiter {
struct list_head list;
struct task_struct *task;
bool up;
};
/*
* Because this function is inlined, the 'state' parameter will be
* constant, and thus optimised away by the compiler. Likewise the
* 'timeout' parameter for the cases without timeouts.
*/
static inline int __sched __down_common(struct semaphore *sem, long state,
long timeout)
{
struct semaphore_waiter waiter;
semaphore: fix Yanmin Zhang reported: | Comparing with kernel 2.6.25, AIM7 (use tmpfs) has more th | regression under 2.6.26-rc1 on my 8-core stoakley, 16-core tigerton, | and Itanium Montecito. Bisect located the patch below: | | 64ac24e738823161693bf791f87adc802cf529ff is first bad commit | commit 64ac24e738823161693bf791f87adc802cf529ff | Author: Matthew Wilcox <matthew@wil.cx> | Date: Fri Mar 7 21:55:58 2008 -0500 | | Generic semaphore implementation | | After I manually reverted the patch against 2.6.26-rc1 while fixing | lots of conflicts/errors, aim7 regression became less than 2%. i reproduced the AIM7 workload and can confirm Yanmin's findings that -.26-rc1 regresses over .25 - by over 67% here. Looking at the workload i found and fixed what i believe to be the real bug causing the AIM7 regression: it was inefficient wakeup / scheduling / locking behavior of the new generic semaphore code, causing suboptimal performance. The problem comes from the following code. The new semaphore code does this on down(): spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; else __down(sem); spin_unlock_irqrestore(&sem->lock, flags); and this on up(): spin_lock_irqsave(&sem->lock, flags); if (likely(list_empty(&sem->wait_list))) sem->count++; else __up(sem); spin_unlock_irqrestore(&sem->lock, flags); where __up() does: list_del(&waiter->list); waiter->up = 1; wake_up_process(waiter->task); and where __down() does this in essence: list_add_tail(&waiter.list, &sem->wait_list); waiter.task = task; waiter.up = 0; for (;;) { [...] spin_unlock_irq(&sem->lock); timeout = schedule_timeout(timeout); spin_lock_irq(&sem->lock); if (waiter.up) return 0; } the fastpath looks good and obvious, but note the following property of the contended path: if there's a task on the ->wait_list, the up() of the current owner will "pass over" ownership to that waiting task, in a wake-one manner, via the waiter->up flag and by removing the waiter from the wait list. That is all and fine in principle, but as implemented in kernel/semaphore.c it also creates a nasty, hidden source of contention! The contention comes from the following property of the new semaphore code: the new owner owns the semaphore exclusively, even if it is not running yet. So if the old owner, even if just a few instructions later, does a down() [lock_kernel()] again, it will be blocked and will have to wait on the new owner to eventually be scheduled (possibly on another CPU)! Or if another task gets to lock_kernel() sooner than the "new owner" scheduled, it will be blocked unnecessarily and for a very long time when there are 2000 tasks running. I.e. the implementation of the new semaphores code does wake-one and lock ownership in a very restrictive way - it does not allow opportunistic re-locking of the lock at all and keeps the scheduler from picking task order intelligently. This kind of scheduling, with 2000 AIM7 processes running, creates awful cross-scheduling between those 2000 tasks, causes reduced parallelism, a throttled runqueue length and a lot of idle time. With increasing number of CPUs it causes an exponentially worse behavior in AIM7, as the chance for a newly woken new-owner task to actually run anytime soon is less and less likely. Note that it takes just a tiny bit of contention for the 'new-semaphore catastrophy' to happen: the wakeup latencies get added to whatever small contention there is, and quickly snowball out of control! I believe Yanmin's findings and numbers support this analysis too. The best fix for this problem is to use the same scheduling logic that the kernel/mutex.c code uses: keep the wake-one behavior (that is OK and wanted because we do not want to over-schedule), but also allow opportunistic locking of the lock even if a wakee is already "in flight". The patch below implements this new logic. With this patch applied the AIM7 regression is largely fixed on my quad testbox: # v2.6.25 vanilla: .................. Tasks Jobs/Min JTI Real CPU Jobs/sec/task 2000 56096.4 91 207.5 789.7 0.4675 2000 55894.4 94 208.2 792.7 0.4658 # v2.6.26-rc1-166-gc0a1811 vanilla: ................................... Tasks Jobs/Min JTI Real CPU Jobs/sec/task 2000 33230.6 83 350.3 784.5 0.2769 2000 31778.1 86 366.3 783.6 0.2648 # v2.6.26-rc1-166-gc0a1811 + semaphore-speedup: ............................................... Tasks Jobs/Min JTI Real CPU Jobs/sec/task 2000 55707.1 92 209.0 795.6 0.4642 2000 55704.4 96 209.0 796.0 0.4642 i.e. a 67% speedup. We are now back to within 1% of the v2.6.25 performance levels and have zero idle time during the test, as expected. Btw., interactivity also improved dramatically with the fix - for example console-switching became almost instantaneous during this workload (which after all is running 2000 tasks at once!), without the patch it was stuck for a minute at times. There's another nice side-effect of this speedup patch, the new generic semaphore code got even smaller: text data bss dec hex filename 1241 0 0 1241 4d9 semaphore.o.before 1207 0 0 1207 4b7 semaphore.o.after (because the waiter.up complication got removed.) Longer-term we should look into using the mutex code for the generic semaphore code as well - but i's not easy due to legacies and it's outside of the scope of v2.6.26 and outside the scope of this patch as well. Bisected-by: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-05-08 17:53:48 +08:00
list_add_tail(&waiter.list, &sem->wait_list);
kernel/locking: Compute 'current' directly This patch effectively replaces the tsk pointer dereference (which is obviously == current), to directly use get_current() macro. This is to make the removal of setting foreign task states smoother and painfully obvious. Performance win on some archs such as x86-64 and ppc64. On a microbenchmark that calls set_task_state() vs set_current_state() and an inode rwsem pounding benchmark doing unlink: == 1. x86-64 == Avg runtime set_task_state(): 601 msecs Avg runtime set_current_state(): 552 msecs vanilla dirty Hmean unlink1-processes-2 36089.26 ( 0.00%) 38977.33 ( 8.00%) Hmean unlink1-processes-5 28555.01 ( 0.00%) 29832.55 ( 4.28%) Hmean unlink1-processes-8 37323.75 ( 0.00%) 44974.57 ( 20.50%) Hmean unlink1-processes-12 43571.88 ( 0.00%) 44283.01 ( 1.63%) Hmean unlink1-processes-21 34431.52 ( 0.00%) 38284.45 ( 11.19%) Hmean unlink1-processes-30 34813.26 ( 0.00%) 37975.17 ( 9.08%) Hmean unlink1-processes-48 37048.90 ( 0.00%) 39862.78 ( 7.59%) Hmean unlink1-processes-79 35630.01 ( 0.00%) 36855.30 ( 3.44%) Hmean unlink1-processes-110 36115.85 ( 0.00%) 39843.91 ( 10.32%) Hmean unlink1-processes-141 32546.96 ( 0.00%) 35418.52 ( 8.82%) Hmean unlink1-processes-172 34674.79 ( 0.00%) 36899.21 ( 6.42%) Hmean unlink1-processes-203 37303.11 ( 0.00%) 36393.04 ( -2.44%) Hmean unlink1-processes-224 35712.13 ( 0.00%) 36685.96 ( 2.73%) == 2. ppc64le == Avg runtime set_task_state(): 938 msecs Avg runtime set_current_state: 940 msecs vanilla dirty Hmean unlink1-processes-2 19269.19 ( 0.00%) 30704.50 ( 59.35%) Hmean unlink1-processes-5 20106.15 ( 0.00%) 21804.15 ( 8.45%) Hmean unlink1-processes-8 17496.97 ( 0.00%) 17243.28 ( -1.45%) Hmean unlink1-processes-12 14224.15 ( 0.00%) 17240.21 ( 21.20%) Hmean unlink1-processes-21 14155.66 ( 0.00%) 15681.23 ( 10.78%) Hmean unlink1-processes-30 14450.70 ( 0.00%) 15995.83 ( 10.69%) Hmean unlink1-processes-48 16945.57 ( 0.00%) 16370.42 ( -3.39%) Hmean unlink1-processes-79 15788.39 ( 0.00%) 14639.27 ( -7.28%) Hmean unlink1-processes-110 14268.48 ( 0.00%) 14377.40 ( 0.76%) Hmean unlink1-processes-141 14023.65 ( 0.00%) 16271.69 ( 16.03%) Hmean unlink1-processes-172 13417.62 ( 0.00%) 16067.55 ( 19.75%) Hmean unlink1-processes-203 15293.08 ( 0.00%) 15440.40 ( 0.96%) Hmean unlink1-processes-234 13719.32 ( 0.00%) 16190.74 ( 18.01%) Hmean unlink1-processes-265 16400.97 ( 0.00%) 16115.22 ( -1.74%) Hmean unlink1-processes-296 14388.60 ( 0.00%) 16216.13 ( 12.70%) Hmean unlink1-processes-320 15771.85 ( 0.00%) 15905.96 ( 0.85%) Signed-off-by: Davidlohr Bueso <dbueso@suse.de> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: dave@stgolabs.net Cc: mark.rutland@arm.com Link: http://lkml.kernel.org/r/1483479794-14013-4-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-01-04 05:43:13 +08:00
waiter.task = current;
waiter.up = false;
for (;;) {
kernel/locking: Compute 'current' directly This patch effectively replaces the tsk pointer dereference (which is obviously == current), to directly use get_current() macro. This is to make the removal of setting foreign task states smoother and painfully obvious. Performance win on some archs such as x86-64 and ppc64. On a microbenchmark that calls set_task_state() vs set_current_state() and an inode rwsem pounding benchmark doing unlink: == 1. x86-64 == Avg runtime set_task_state(): 601 msecs Avg runtime set_current_state(): 552 msecs vanilla dirty Hmean unlink1-processes-2 36089.26 ( 0.00%) 38977.33 ( 8.00%) Hmean unlink1-processes-5 28555.01 ( 0.00%) 29832.55 ( 4.28%) Hmean unlink1-processes-8 37323.75 ( 0.00%) 44974.57 ( 20.50%) Hmean unlink1-processes-12 43571.88 ( 0.00%) 44283.01 ( 1.63%) Hmean unlink1-processes-21 34431.52 ( 0.00%) 38284.45 ( 11.19%) Hmean unlink1-processes-30 34813.26 ( 0.00%) 37975.17 ( 9.08%) Hmean unlink1-processes-48 37048.90 ( 0.00%) 39862.78 ( 7.59%) Hmean unlink1-processes-79 35630.01 ( 0.00%) 36855.30 ( 3.44%) Hmean unlink1-processes-110 36115.85 ( 0.00%) 39843.91 ( 10.32%) Hmean unlink1-processes-141 32546.96 ( 0.00%) 35418.52 ( 8.82%) Hmean unlink1-processes-172 34674.79 ( 0.00%) 36899.21 ( 6.42%) Hmean unlink1-processes-203 37303.11 ( 0.00%) 36393.04 ( -2.44%) Hmean unlink1-processes-224 35712.13 ( 0.00%) 36685.96 ( 2.73%) == 2. ppc64le == Avg runtime set_task_state(): 938 msecs Avg runtime set_current_state: 940 msecs vanilla dirty Hmean unlink1-processes-2 19269.19 ( 0.00%) 30704.50 ( 59.35%) Hmean unlink1-processes-5 20106.15 ( 0.00%) 21804.15 ( 8.45%) Hmean unlink1-processes-8 17496.97 ( 0.00%) 17243.28 ( -1.45%) Hmean unlink1-processes-12 14224.15 ( 0.00%) 17240.21 ( 21.20%) Hmean unlink1-processes-21 14155.66 ( 0.00%) 15681.23 ( 10.78%) Hmean unlink1-processes-30 14450.70 ( 0.00%) 15995.83 ( 10.69%) Hmean unlink1-processes-48 16945.57 ( 0.00%) 16370.42 ( -3.39%) Hmean unlink1-processes-79 15788.39 ( 0.00%) 14639.27 ( -7.28%) Hmean unlink1-processes-110 14268.48 ( 0.00%) 14377.40 ( 0.76%) Hmean unlink1-processes-141 14023.65 ( 0.00%) 16271.69 ( 16.03%) Hmean unlink1-processes-172 13417.62 ( 0.00%) 16067.55 ( 19.75%) Hmean unlink1-processes-203 15293.08 ( 0.00%) 15440.40 ( 0.96%) Hmean unlink1-processes-234 13719.32 ( 0.00%) 16190.74 ( 18.01%) Hmean unlink1-processes-265 16400.97 ( 0.00%) 16115.22 ( -1.74%) Hmean unlink1-processes-296 14388.60 ( 0.00%) 16216.13 ( 12.70%) Hmean unlink1-processes-320 15771.85 ( 0.00%) 15905.96 ( 0.85%) Signed-off-by: Davidlohr Bueso <dbueso@suse.de> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: dave@stgolabs.net Cc: mark.rutland@arm.com Link: http://lkml.kernel.org/r/1483479794-14013-4-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-01-04 05:43:13 +08:00
if (signal_pending_state(state, current))
goto interrupted;
if (unlikely(timeout <= 0))
goto timed_out;
sched/core: Remove set_task_state() This is a nasty interface and setting the state of a foreign task must not be done. As of the following commit: be628be0956 ("bcache: Make gc wakeup sane, remove set_task_state()") ... everyone in the kernel calls set_task_state() with current, allowing the helper to be removed. However, as the comment indicates, it is still around for those archs where computing current is more expensive than using a pointer, at least in theory. An important arch that is affected is arm64, however this has been addressed now [1] and performance is up to par making no difference with either calls. Of all the callers, if any, it's the locking bits that would care most about this -- ie: we end up passing a tsk pointer to a lot of the lock slowpath, and setting ->state on that. The following numbers are based on two tests: a custom ad-hoc microbenchmark that just measures latencies (for ~65 million calls) between get_task_state() vs get_current_state(). Secondly for a higher overview, an unlink microbenchmark was used, which pounds on a single file with open, close,unlink combos with increasing thread counts (up to 4x ncpus). While the workload is quite unrealistic, it does contend a lot on the inode mutex or now rwsem. [1] https://lkml.kernel.org/r/1483468021-8237-1-git-send-email-mark.rutland@arm.com == 1. x86-64 == Avg runtime set_task_state(): 601 msecs Avg runtime set_current_state(): 552 msecs vanilla dirty Hmean unlink1-processes-2 36089.26 ( 0.00%) 38977.33 ( 8.00%) Hmean unlink1-processes-5 28555.01 ( 0.00%) 29832.55 ( 4.28%) Hmean unlink1-processes-8 37323.75 ( 0.00%) 44974.57 ( 20.50%) Hmean unlink1-processes-12 43571.88 ( 0.00%) 44283.01 ( 1.63%) Hmean unlink1-processes-21 34431.52 ( 0.00%) 38284.45 ( 11.19%) Hmean unlink1-processes-30 34813.26 ( 0.00%) 37975.17 ( 9.08%) Hmean unlink1-processes-48 37048.90 ( 0.00%) 39862.78 ( 7.59%) Hmean unlink1-processes-79 35630.01 ( 0.00%) 36855.30 ( 3.44%) Hmean unlink1-processes-110 36115.85 ( 0.00%) 39843.91 ( 10.32%) Hmean unlink1-processes-141 32546.96 ( 0.00%) 35418.52 ( 8.82%) Hmean unlink1-processes-172 34674.79 ( 0.00%) 36899.21 ( 6.42%) Hmean unlink1-processes-203 37303.11 ( 0.00%) 36393.04 ( -2.44%) Hmean unlink1-processes-224 35712.13 ( 0.00%) 36685.96 ( 2.73%) == 2. ppc64le == Avg runtime set_task_state(): 938 msecs Avg runtime set_current_state: 940 msecs vanilla dirty Hmean unlink1-processes-2 19269.19 ( 0.00%) 30704.50 ( 59.35%) Hmean unlink1-processes-5 20106.15 ( 0.00%) 21804.15 ( 8.45%) Hmean unlink1-processes-8 17496.97 ( 0.00%) 17243.28 ( -1.45%) Hmean unlink1-processes-12 14224.15 ( 0.00%) 17240.21 ( 21.20%) Hmean unlink1-processes-21 14155.66 ( 0.00%) 15681.23 ( 10.78%) Hmean unlink1-processes-30 14450.70 ( 0.00%) 15995.83 ( 10.69%) Hmean unlink1-processes-48 16945.57 ( 0.00%) 16370.42 ( -3.39%) Hmean unlink1-processes-79 15788.39 ( 0.00%) 14639.27 ( -7.28%) Hmean unlink1-processes-110 14268.48 ( 0.00%) 14377.40 ( 0.76%) Hmean unlink1-processes-141 14023.65 ( 0.00%) 16271.69 ( 16.03%) Hmean unlink1-processes-172 13417.62 ( 0.00%) 16067.55 ( 19.75%) Hmean unlink1-processes-203 15293.08 ( 0.00%) 15440.40 ( 0.96%) Hmean unlink1-processes-234 13719.32 ( 0.00%) 16190.74 ( 18.01%) Hmean unlink1-processes-265 16400.97 ( 0.00%) 16115.22 ( -1.74%) Hmean unlink1-processes-296 14388.60 ( 0.00%) 16216.13 ( 12.70%) Hmean unlink1-processes-320 15771.85 ( 0.00%) 15905.96 ( 0.85%) x86-64 (known to be fast for get_current()/this_cpu_read_stable() caching) and ppc64 (with paca) show similar improvements in the unlink microbenches. The small delta for ppc64 (2ms), does not represent the gains on the unlink runs. In the case of x86, there was a decent amount of variation in the latency runs, but always within a 20 to 50ms increase), ppc was more constant. Signed-off-by: Davidlohr Bueso <dbueso@suse.de> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: dave@stgolabs.net Cc: mark.rutland@arm.com Link: http://lkml.kernel.org/r/1483479794-14013-5-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-01-04 05:43:14 +08:00
__set_current_state(state);
raw_spin_unlock_irq(&sem->lock);
timeout = schedule_timeout(timeout);
raw_spin_lock_irq(&sem->lock);
if (waiter.up)
return 0;
}
timed_out:
list_del(&waiter.list);
return -ETIME;
interrupted:
list_del(&waiter.list);
return -EINTR;
}
static noinline void __sched __down(struct semaphore *sem)
{
__down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
static noinline int __sched __down_interruptible(struct semaphore *sem)
{
return __down_common(sem, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
static noinline int __sched __down_killable(struct semaphore *sem)
{
return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT);
}
static noinline int __sched __down_timeout(struct semaphore *sem, long timeout)
{
return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout);
}
static noinline void __sched __up(struct semaphore *sem)
{
struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
struct semaphore_waiter, list);
list_del(&waiter->list);
waiter->up = true;
wake_up_process(waiter->task);
}