kernel_optimize_test/kernel/softirq.c

878 lines
21 KiB
C
Raw Normal View History

/*
* linux/kernel/softirq.c
*
* Copyright (C) 1992 Linus Torvalds
*
* Distribute under GPLv2.
*
* Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
*
* Remote softirq infrastructure is by Jens Axboe.
*/
#include <linux/export.h>
#include <linux/kernel_stat.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/notifier.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/rcupdate.h>
#include <linux/ftrace.h>
#include <linux/smp.h>
#include <linux/smpboot.h>
#include <linux/tick.h>
#define CREATE_TRACE_POINTS
#include <trace/events/irq.h>
#include <asm/irq.h>
/*
- No shared variables, all the data are CPU local.
- If a softirq needs serialization, let it serialize itself
by its own spinlocks.
- Even if softirq is serialized, only local cpu is marked for
execution. Hence, we get something sort of weak cpu binding.
Though it is still not clear, will it result in better locality
or will not.
Examples:
- NET RX softirq. It is multithreaded and does not require
any global serialization.
- NET TX softirq. It kicks software netdevice queues, hence
it is logically serialized per device, but this serialization
is invisible to common code.
- Tasklets: serialized wrt itself.
*/
#ifndef __ARCH_IRQ_STAT
irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned;
EXPORT_SYMBOL(irq_stat);
#endif
static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
char *softirq_to_name[NR_SOFTIRQS] = {
"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
rcu: Use softirq to address performance regression Commit a26ac2455ffcf3(rcu: move TREE_RCU from softirq to kthread) introduced performance regression. In an AIM7 test, this commit degraded performance by about 40%. The commit runs rcu callbacks in a kthread instead of softirq. We observed high rate of context switch which is caused by this. Out test system has 64 CPUs and HZ is 1000, so we saw more than 64k context switch per second which is caused by RCU's per-CPU kthread. A trace showed that most of the time the RCU per-CPU kthread doesn't actually handle any callbacks, but instead just does a very small amount of work handling grace periods. This means that RCU's per-CPU kthreads are making the scheduler do quite a bit of work in order to allow a very small amount of RCU-related processing to be done. Alex Shi's analysis determined that this slowdown is due to lock contention within the scheduler. Unfortunately, as Peter Zijlstra points out, the scheduler's real-time semantics require global action, which means that this contention is inherent in real-time scheduling. (Yes, perhaps someone will come up with a workaround -- otherwise, -rt is not going to do well on large SMP systems -- but this patch will work around this issue in the meantime. And "the meantime" might well be forever.) This patch therefore re-introduces softirq processing to RCU, but only for core RCU work. RCU callbacks are still executed in kthread context, so that only a small amount of RCU work runs in softirq context in the common case. This should minimize ksoftirqd execution, allowing us to skip boosting of ksoftirqd for CONFIG_RCU_BOOST=y kernels. Signed-off-by: Shaohua Li <shaohua.li@intel.com> Tested-by: "Alex,Shi" <alex.shi@intel.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
2011-06-14 13:26:25 +08:00
"TASKLET", "SCHED", "HRTIMER", "RCU"
};
/*
* we cannot loop indefinitely here to avoid userspace starvation,
* but we also don't want to introduce a worst case 1/HZ latency
* to the pending events, so lets the scheduler to balance
* the softirq load for us.
*/
static void wakeup_softirqd(void)
{
/* Interrupts are disabled: no need to stop preemption */
struct task_struct *tsk = __this_cpu_read(ksoftirqd);
if (tsk && tsk->state != TASK_RUNNING)
wake_up_process(tsk);
}
/*
* preempt_count and SOFTIRQ_OFFSET usage:
* - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
* softirq processing.
* - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET)
* on local_bh_disable or local_bh_enable.
* This lets us distinguish between whether we are currently processing
* softirq and whether we just have bh disabled.
*/
/*
* This one is for softirq.c-internal use,
* where hardirqs are disabled legitimately:
*/
[PATCH] Reducing local_bh_enable/disable overhead in irqtrace The recent changes from irqtrace feature has added overheads to local_bh_disable and local_bh_enable that reduces UDP performance across x86_64 and IA64, even though IA64 does not support the irqtrace feature. Patch in question is [PATCH]lockdep: irqtrace subsystem, core http://www.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=c ommit;h=de30a2b355ea85350ca2f58f3b9bf4e5bc007986 Prior to this patch, local_bh_disable was a short macro. Now it is a function which calls __local_bh_disable with added irq flags save and restore. The irq flags save and restore were also added to local_bh_enable, probably for injecting the trace irqs code. This overhead is on the generic code path across all architectures. On a IA_64 test machine (Itanium-2 1.6 GHz) running a benchmark like netperf's UDP streaming test, the added overhead results in a drop of 3% in throughput, as udp_sendmsg calls the local_bh_enable/disable several times. Other workloads that have heavy usages of local_bh_enable/disable could also be affected. The patch ideally should not have affected IA-64 performance as it does not have IRQ tracing support. A significant portion of the overhead is in the added irq flags save and restore, which I think is not needed if IRQ tracing is unused. A suggested patch is attached below that recovers the lost performance. However, the "ifdef"s in the patch are a bit ugly. Signed-off-by: Tim Chen <tim.c.chen@intel.com> Acked-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-07-30 18:04:02 +08:00
#ifdef CONFIG_TRACE_IRQFLAGS
static void __local_bh_disable(unsigned long ip, unsigned int cnt)
{
unsigned long flags;
WARN_ON_ONCE(in_irq());
raw_local_irq_save(flags);
/*
* The preempt tracer hooks into add_preempt_count and will break
* lockdep because it calls back into lockdep after SOFTIRQ_OFFSET
* is set and before current->softirq_enabled is cleared.
* We must manually increment preempt_count here and manually
* call the trace_preempt_off later.
*/
preempt_count() += cnt;
/*
* Were softirqs turned off above:
*/
if (softirq_count() == cnt)
trace_softirqs_off(ip);
raw_local_irq_restore(flags);
if (preempt_count() == cnt)
trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
}
[PATCH] Reducing local_bh_enable/disable overhead in irqtrace The recent changes from irqtrace feature has added overheads to local_bh_disable and local_bh_enable that reduces UDP performance across x86_64 and IA64, even though IA64 does not support the irqtrace feature. Patch in question is [PATCH]lockdep: irqtrace subsystem, core http://www.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=c ommit;h=de30a2b355ea85350ca2f58f3b9bf4e5bc007986 Prior to this patch, local_bh_disable was a short macro. Now it is a function which calls __local_bh_disable with added irq flags save and restore. The irq flags save and restore were also added to local_bh_enable, probably for injecting the trace irqs code. This overhead is on the generic code path across all architectures. On a IA_64 test machine (Itanium-2 1.6 GHz) running a benchmark like netperf's UDP streaming test, the added overhead results in a drop of 3% in throughput, as udp_sendmsg calls the local_bh_enable/disable several times. Other workloads that have heavy usages of local_bh_enable/disable could also be affected. The patch ideally should not have affected IA-64 performance as it does not have IRQ tracing support. A significant portion of the overhead is in the added irq flags save and restore, which I think is not needed if IRQ tracing is unused. A suggested patch is attached below that recovers the lost performance. However, the "ifdef"s in the patch are a bit ugly. Signed-off-by: Tim Chen <tim.c.chen@intel.com> Acked-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-07-30 18:04:02 +08:00
#else /* !CONFIG_TRACE_IRQFLAGS */
static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
[PATCH] Reducing local_bh_enable/disable overhead in irqtrace The recent changes from irqtrace feature has added overheads to local_bh_disable and local_bh_enable that reduces UDP performance across x86_64 and IA64, even though IA64 does not support the irqtrace feature. Patch in question is [PATCH]lockdep: irqtrace subsystem, core http://www.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=c ommit;h=de30a2b355ea85350ca2f58f3b9bf4e5bc007986 Prior to this patch, local_bh_disable was a short macro. Now it is a function which calls __local_bh_disable with added irq flags save and restore. The irq flags save and restore were also added to local_bh_enable, probably for injecting the trace irqs code. This overhead is on the generic code path across all architectures. On a IA_64 test machine (Itanium-2 1.6 GHz) running a benchmark like netperf's UDP streaming test, the added overhead results in a drop of 3% in throughput, as udp_sendmsg calls the local_bh_enable/disable several times. Other workloads that have heavy usages of local_bh_enable/disable could also be affected. The patch ideally should not have affected IA-64 performance as it does not have IRQ tracing support. A significant portion of the overhead is in the added irq flags save and restore, which I think is not needed if IRQ tracing is unused. A suggested patch is attached below that recovers the lost performance. However, the "ifdef"s in the patch are a bit ugly. Signed-off-by: Tim Chen <tim.c.chen@intel.com> Acked-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-07-30 18:04:02 +08:00
{
add_preempt_count(cnt);
[PATCH] Reducing local_bh_enable/disable overhead in irqtrace The recent changes from irqtrace feature has added overheads to local_bh_disable and local_bh_enable that reduces UDP performance across x86_64 and IA64, even though IA64 does not support the irqtrace feature. Patch in question is [PATCH]lockdep: irqtrace subsystem, core http://www.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=c ommit;h=de30a2b355ea85350ca2f58f3b9bf4e5bc007986 Prior to this patch, local_bh_disable was a short macro. Now it is a function which calls __local_bh_disable with added irq flags save and restore. The irq flags save and restore were also added to local_bh_enable, probably for injecting the trace irqs code. This overhead is on the generic code path across all architectures. On a IA_64 test machine (Itanium-2 1.6 GHz) running a benchmark like netperf's UDP streaming test, the added overhead results in a drop of 3% in throughput, as udp_sendmsg calls the local_bh_enable/disable several times. Other workloads that have heavy usages of local_bh_enable/disable could also be affected. The patch ideally should not have affected IA-64 performance as it does not have IRQ tracing support. A significant portion of the overhead is in the added irq flags save and restore, which I think is not needed if IRQ tracing is unused. A suggested patch is attached below that recovers the lost performance. However, the "ifdef"s in the patch are a bit ugly. Signed-off-by: Tim Chen <tim.c.chen@intel.com> Acked-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-07-30 18:04:02 +08:00
barrier();
}
#endif /* CONFIG_TRACE_IRQFLAGS */
void local_bh_disable(void)
{
__local_bh_disable((unsigned long)__builtin_return_address(0),
SOFTIRQ_DISABLE_OFFSET);
}
EXPORT_SYMBOL(local_bh_disable);
static void __local_bh_enable(unsigned int cnt)
{
WARN_ON_ONCE(in_irq());
WARN_ON_ONCE(!irqs_disabled());
if (softirq_count() == cnt)
trace_softirqs_on((unsigned long)__builtin_return_address(0));
sub_preempt_count(cnt);
}
/*
* Special-case - softirqs can safely be enabled in
* cond_resched_softirq(), or by __do_softirq(),
* without processing still-pending softirqs:
*/
void _local_bh_enable(void)
{
__local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
}
EXPORT_SYMBOL(_local_bh_enable);
static inline void _local_bh_enable_ip(unsigned long ip)
{
WARN_ON_ONCE(in_irq() || irqs_disabled());
[PATCH] Reducing local_bh_enable/disable overhead in irqtrace The recent changes from irqtrace feature has added overheads to local_bh_disable and local_bh_enable that reduces UDP performance across x86_64 and IA64, even though IA64 does not support the irqtrace feature. Patch in question is [PATCH]lockdep: irqtrace subsystem, core http://www.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=c ommit;h=de30a2b355ea85350ca2f58f3b9bf4e5bc007986 Prior to this patch, local_bh_disable was a short macro. Now it is a function which calls __local_bh_disable with added irq flags save and restore. The irq flags save and restore were also added to local_bh_enable, probably for injecting the trace irqs code. This overhead is on the generic code path across all architectures. On a IA_64 test machine (Itanium-2 1.6 GHz) running a benchmark like netperf's UDP streaming test, the added overhead results in a drop of 3% in throughput, as udp_sendmsg calls the local_bh_enable/disable several times. Other workloads that have heavy usages of local_bh_enable/disable could also be affected. The patch ideally should not have affected IA-64 performance as it does not have IRQ tracing support. A significant portion of the overhead is in the added irq flags save and restore, which I think is not needed if IRQ tracing is unused. A suggested patch is attached below that recovers the lost performance. However, the "ifdef"s in the patch are a bit ugly. Signed-off-by: Tim Chen <tim.c.chen@intel.com> Acked-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-07-30 18:04:02 +08:00
#ifdef CONFIG_TRACE_IRQFLAGS
local_irq_disable();
[PATCH] Reducing local_bh_enable/disable overhead in irqtrace The recent changes from irqtrace feature has added overheads to local_bh_disable and local_bh_enable that reduces UDP performance across x86_64 and IA64, even though IA64 does not support the irqtrace feature. Patch in question is [PATCH]lockdep: irqtrace subsystem, core http://www.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=c ommit;h=de30a2b355ea85350ca2f58f3b9bf4e5bc007986 Prior to this patch, local_bh_disable was a short macro. Now it is a function which calls __local_bh_disable with added irq flags save and restore. The irq flags save and restore were also added to local_bh_enable, probably for injecting the trace irqs code. This overhead is on the generic code path across all architectures. On a IA_64 test machine (Itanium-2 1.6 GHz) running a benchmark like netperf's UDP streaming test, the added overhead results in a drop of 3% in throughput, as udp_sendmsg calls the local_bh_enable/disable several times. Other workloads that have heavy usages of local_bh_enable/disable could also be affected. The patch ideally should not have affected IA-64 performance as it does not have IRQ tracing support. A significant portion of the overhead is in the added irq flags save and restore, which I think is not needed if IRQ tracing is unused. A suggested patch is attached below that recovers the lost performance. However, the "ifdef"s in the patch are a bit ugly. Signed-off-by: Tim Chen <tim.c.chen@intel.com> Acked-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-07-30 18:04:02 +08:00
#endif
/*
* Are softirqs going to be turned on now:
*/
if (softirq_count() == SOFTIRQ_DISABLE_OFFSET)
trace_softirqs_on(ip);
/*
* Keep preemption disabled until we are done with
* softirq processing:
*/
sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1);
if (unlikely(!in_interrupt() && local_softirq_pending()))
do_softirq();
dec_preempt_count();
[PATCH] Reducing local_bh_enable/disable overhead in irqtrace The recent changes from irqtrace feature has added overheads to local_bh_disable and local_bh_enable that reduces UDP performance across x86_64 and IA64, even though IA64 does not support the irqtrace feature. Patch in question is [PATCH]lockdep: irqtrace subsystem, core http://www.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=c ommit;h=de30a2b355ea85350ca2f58f3b9bf4e5bc007986 Prior to this patch, local_bh_disable was a short macro. Now it is a function which calls __local_bh_disable with added irq flags save and restore. The irq flags save and restore were also added to local_bh_enable, probably for injecting the trace irqs code. This overhead is on the generic code path across all architectures. On a IA_64 test machine (Itanium-2 1.6 GHz) running a benchmark like netperf's UDP streaming test, the added overhead results in a drop of 3% in throughput, as udp_sendmsg calls the local_bh_enable/disable several times. Other workloads that have heavy usages of local_bh_enable/disable could also be affected. The patch ideally should not have affected IA-64 performance as it does not have IRQ tracing support. A significant portion of the overhead is in the added irq flags save and restore, which I think is not needed if IRQ tracing is unused. A suggested patch is attached below that recovers the lost performance. However, the "ifdef"s in the patch are a bit ugly. Signed-off-by: Tim Chen <tim.c.chen@intel.com> Acked-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-07-30 18:04:02 +08:00
#ifdef CONFIG_TRACE_IRQFLAGS
local_irq_enable();
[PATCH] Reducing local_bh_enable/disable overhead in irqtrace The recent changes from irqtrace feature has added overheads to local_bh_disable and local_bh_enable that reduces UDP performance across x86_64 and IA64, even though IA64 does not support the irqtrace feature. Patch in question is [PATCH]lockdep: irqtrace subsystem, core http://www.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=c ommit;h=de30a2b355ea85350ca2f58f3b9bf4e5bc007986 Prior to this patch, local_bh_disable was a short macro. Now it is a function which calls __local_bh_disable with added irq flags save and restore. The irq flags save and restore were also added to local_bh_enable, probably for injecting the trace irqs code. This overhead is on the generic code path across all architectures. On a IA_64 test machine (Itanium-2 1.6 GHz) running a benchmark like netperf's UDP streaming test, the added overhead results in a drop of 3% in throughput, as udp_sendmsg calls the local_bh_enable/disable several times. Other workloads that have heavy usages of local_bh_enable/disable could also be affected. The patch ideally should not have affected IA-64 performance as it does not have IRQ tracing support. A significant portion of the overhead is in the added irq flags save and restore, which I think is not needed if IRQ tracing is unused. A suggested patch is attached below that recovers the lost performance. However, the "ifdef"s in the patch are a bit ugly. Signed-off-by: Tim Chen <tim.c.chen@intel.com> Acked-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-07-30 18:04:02 +08:00
#endif
preempt_check_resched();
}
void local_bh_enable(void)
{
_local_bh_enable_ip((unsigned long)__builtin_return_address(0));
}
EXPORT_SYMBOL(local_bh_enable);
void local_bh_enable_ip(unsigned long ip)
{
_local_bh_enable_ip(ip);
}
EXPORT_SYMBOL(local_bh_enable_ip);
/*
softirq: reduce latencies In various network workloads, __do_softirq() latencies can be up to 20 ms if HZ=1000, and 200 ms if HZ=100. This is because we iterate 10 times in the softirq dispatcher, and some actions can consume a lot of cycles. This patch changes the fallback to ksoftirqd condition to : - A time limit of 2 ms. - need_resched() being set on current task When one of this condition is met, we wakeup ksoftirqd for further softirq processing if we still have pending softirqs. Using need_resched() as the only condition can trigger RCU stalls, as we can keep BH disabled for too long. I ran several benchmarks and got no significant difference in throughput, but a very significant reduction of latencies (one order of magnitude) : In following bench, 200 antagonist "netperf -t TCP_RR" are started in background, using all available cpus. Then we start one "netperf -t TCP_RR", bound to the cpu handling the NIC IRQ (hard+soft) Before patch : # netperf -H 7.7.7.84 -t TCP_RR -T2,2 -- -k RT_LATENCY,MIN_LATENCY,MAX_LATENCY,P50_LATENCY,P90_LATENCY,P99_LATENCY,MEAN_LATENCY,STDDEV_LATENCY MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.7.84 () port 0 AF_INET : first burst 0 : cpu bind RT_LATENCY=550110.424 MIN_LATENCY=146858 MAX_LATENCY=997109 P50_LATENCY=305000 P90_LATENCY=550000 P99_LATENCY=710000 MEAN_LATENCY=376989.12 STDDEV_LATENCY=184046.92 After patch : # netperf -H 7.7.7.84 -t TCP_RR -T2,2 -- -k RT_LATENCY,MIN_LATENCY,MAX_LATENCY,P50_LATENCY,P90_LATENCY,P99_LATENCY,MEAN_LATENCY,STDDEV_LATENCY MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.7.84 () port 0 AF_INET : first burst 0 : cpu bind RT_LATENCY=40545.492 MIN_LATENCY=9834 MAX_LATENCY=78366 P50_LATENCY=33583 P90_LATENCY=59000 P99_LATENCY=69000 MEAN_LATENCY=38364.67 STDDEV_LATENCY=12865.26 Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: David Miller <davem@davemloft.net> Cc: Tom Herbert <therbert@google.com> Cc: Ben Hutchings <bhutchings@solarflare.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-01-11 07:26:34 +08:00
* We restart softirq processing for at most 2 ms,
* and if need_resched() is not set.
*
softirq: reduce latencies In various network workloads, __do_softirq() latencies can be up to 20 ms if HZ=1000, and 200 ms if HZ=100. This is because we iterate 10 times in the softirq dispatcher, and some actions can consume a lot of cycles. This patch changes the fallback to ksoftirqd condition to : - A time limit of 2 ms. - need_resched() being set on current task When one of this condition is met, we wakeup ksoftirqd for further softirq processing if we still have pending softirqs. Using need_resched() as the only condition can trigger RCU stalls, as we can keep BH disabled for too long. I ran several benchmarks and got no significant difference in throughput, but a very significant reduction of latencies (one order of magnitude) : In following bench, 200 antagonist "netperf -t TCP_RR" are started in background, using all available cpus. Then we start one "netperf -t TCP_RR", bound to the cpu handling the NIC IRQ (hard+soft) Before patch : # netperf -H 7.7.7.84 -t TCP_RR -T2,2 -- -k RT_LATENCY,MIN_LATENCY,MAX_LATENCY,P50_LATENCY,P90_LATENCY,P99_LATENCY,MEAN_LATENCY,STDDEV_LATENCY MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.7.84 () port 0 AF_INET : first burst 0 : cpu bind RT_LATENCY=550110.424 MIN_LATENCY=146858 MAX_LATENCY=997109 P50_LATENCY=305000 P90_LATENCY=550000 P99_LATENCY=710000 MEAN_LATENCY=376989.12 STDDEV_LATENCY=184046.92 After patch : # netperf -H 7.7.7.84 -t TCP_RR -T2,2 -- -k RT_LATENCY,MIN_LATENCY,MAX_LATENCY,P50_LATENCY,P90_LATENCY,P99_LATENCY,MEAN_LATENCY,STDDEV_LATENCY MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.7.84 () port 0 AF_INET : first burst 0 : cpu bind RT_LATENCY=40545.492 MIN_LATENCY=9834 MAX_LATENCY=78366 P50_LATENCY=33583 P90_LATENCY=59000 P99_LATENCY=69000 MEAN_LATENCY=38364.67 STDDEV_LATENCY=12865.26 Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: David Miller <davem@davemloft.net> Cc: Tom Herbert <therbert@google.com> Cc: Ben Hutchings <bhutchings@solarflare.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-01-11 07:26:34 +08:00
* These limits have been established via experimentation.
* The two things to balance is latency against fairness -
* we want to handle softirqs as soon as possible, but they
* should not be able to lock up the box.
*/
softirq: reduce latencies In various network workloads, __do_softirq() latencies can be up to 20 ms if HZ=1000, and 200 ms if HZ=100. This is because we iterate 10 times in the softirq dispatcher, and some actions can consume a lot of cycles. This patch changes the fallback to ksoftirqd condition to : - A time limit of 2 ms. - need_resched() being set on current task When one of this condition is met, we wakeup ksoftirqd for further softirq processing if we still have pending softirqs. Using need_resched() as the only condition can trigger RCU stalls, as we can keep BH disabled for too long. I ran several benchmarks and got no significant difference in throughput, but a very significant reduction of latencies (one order of magnitude) : In following bench, 200 antagonist "netperf -t TCP_RR" are started in background, using all available cpus. Then we start one "netperf -t TCP_RR", bound to the cpu handling the NIC IRQ (hard+soft) Before patch : # netperf -H 7.7.7.84 -t TCP_RR -T2,2 -- -k RT_LATENCY,MIN_LATENCY,MAX_LATENCY,P50_LATENCY,P90_LATENCY,P99_LATENCY,MEAN_LATENCY,STDDEV_LATENCY MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.7.84 () port 0 AF_INET : first burst 0 : cpu bind RT_LATENCY=550110.424 MIN_LATENCY=146858 MAX_LATENCY=997109 P50_LATENCY=305000 P90_LATENCY=550000 P99_LATENCY=710000 MEAN_LATENCY=376989.12 STDDEV_LATENCY=184046.92 After patch : # netperf -H 7.7.7.84 -t TCP_RR -T2,2 -- -k RT_LATENCY,MIN_LATENCY,MAX_LATENCY,P50_LATENCY,P90_LATENCY,P99_LATENCY,MEAN_LATENCY,STDDEV_LATENCY MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.7.84 () port 0 AF_INET : first burst 0 : cpu bind RT_LATENCY=40545.492 MIN_LATENCY=9834 MAX_LATENCY=78366 P50_LATENCY=33583 P90_LATENCY=59000 P99_LATENCY=69000 MEAN_LATENCY=38364.67 STDDEV_LATENCY=12865.26 Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: David Miller <davem@davemloft.net> Cc: Tom Herbert <therbert@google.com> Cc: Ben Hutchings <bhutchings@solarflare.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-01-11 07:26:34 +08:00
#define MAX_SOFTIRQ_TIME msecs_to_jiffies(2)
asmlinkage void __do_softirq(void)
{
struct softirq_action *h;
__u32 pending;
softirq: reduce latencies In various network workloads, __do_softirq() latencies can be up to 20 ms if HZ=1000, and 200 ms if HZ=100. This is because we iterate 10 times in the softirq dispatcher, and some actions can consume a lot of cycles. This patch changes the fallback to ksoftirqd condition to : - A time limit of 2 ms. - need_resched() being set on current task When one of this condition is met, we wakeup ksoftirqd for further softirq processing if we still have pending softirqs. Using need_resched() as the only condition can trigger RCU stalls, as we can keep BH disabled for too long. I ran several benchmarks and got no significant difference in throughput, but a very significant reduction of latencies (one order of magnitude) : In following bench, 200 antagonist "netperf -t TCP_RR" are started in background, using all available cpus. Then we start one "netperf -t TCP_RR", bound to the cpu handling the NIC IRQ (hard+soft) Before patch : # netperf -H 7.7.7.84 -t TCP_RR -T2,2 -- -k RT_LATENCY,MIN_LATENCY,MAX_LATENCY,P50_LATENCY,P90_LATENCY,P99_LATENCY,MEAN_LATENCY,STDDEV_LATENCY MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.7.84 () port 0 AF_INET : first burst 0 : cpu bind RT_LATENCY=550110.424 MIN_LATENCY=146858 MAX_LATENCY=997109 P50_LATENCY=305000 P90_LATENCY=550000 P99_LATENCY=710000 MEAN_LATENCY=376989.12 STDDEV_LATENCY=184046.92 After patch : # netperf -H 7.7.7.84 -t TCP_RR -T2,2 -- -k RT_LATENCY,MIN_LATENCY,MAX_LATENCY,P50_LATENCY,P90_LATENCY,P99_LATENCY,MEAN_LATENCY,STDDEV_LATENCY MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.7.84 () port 0 AF_INET : first burst 0 : cpu bind RT_LATENCY=40545.492 MIN_LATENCY=9834 MAX_LATENCY=78366 P50_LATENCY=33583 P90_LATENCY=59000 P99_LATENCY=69000 MEAN_LATENCY=38364.67 STDDEV_LATENCY=12865.26 Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: David Miller <davem@davemloft.net> Cc: Tom Herbert <therbert@google.com> Cc: Ben Hutchings <bhutchings@solarflare.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-01-11 07:26:34 +08:00
unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
int cpu;
mm: allow PF_MEMALLOC from softirq context This is needed to allow network softirq packet processing to make use of PF_MEMALLOC. Currently softirq context cannot use PF_MEMALLOC due to it not being associated with a task, and therefore not having task flags to fiddle with - thus the gfp to alloc flag mapping ignores the task flags when in interrupts (hard or soft) context. Allowing softirqs to make use of PF_MEMALLOC therefore requires some trickery. This patch borrows the task flags from whatever process happens to be preempted by the softirq. It then modifies the gfp to alloc flags mapping to not exclude task flags in softirq context, and modify the softirq code to save, clear and restore the PF_MEMALLOC flag. The save and clear, ensures the preempted task's PF_MEMALLOC flag doesn't leak into the softirq. The restore ensures a softirq's PF_MEMALLOC flag cannot leak back into the preempted process. This should be safe due to the following reasons Softirqs can run on multiple CPUs sure but the same task should not be executing the same softirq code. Neither should the softirq handler be preempted by any other softirq handler so the flags should not leak to an unrelated softirq. Softirqs re-enable hardware interrupts in __do_softirq() so can be preempted by hardware interrupts so PF_MEMALLOC is inherited by the hard IRQ. However, this is similar to a process in reclaim being preempted by a hardirq. While PF_MEMALLOC is set, gfp_to_alloc_flags() distinguishes between hard and soft irqs and avoids giving a hardirq the ALLOC_NO_WATERMARKS flag. If the softirq is deferred to ksoftirq then its flags may be used instead of a normal tasks but as the softirq cannot be preempted, the PF_MEMALLOC flag does not leak to other code by accident. [davem@davemloft.net: Document why PF_MEMALLOC is safe] Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Mel Gorman <mgorman@suse.de> Cc: David Miller <davem@davemloft.net> Cc: Neil Brown <neilb@suse.de> Cc: Mike Christie <michaelc@cs.wisc.edu> Cc: Eric B Munson <emunson@mgebm.net> Cc: Eric Dumazet <eric.dumazet@gmail.com> Cc: Sebastian Andrzej Siewior <sebastian@breakpoint.cc> Cc: Mel Gorman <mgorman@suse.de> Cc: Christoph Lameter <cl@linux.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-08-01 07:44:07 +08:00
unsigned long old_flags = current->flags;
/*
* Mask out PF_MEMALLOC s current task context is borrowed for the
* softirq. A softirq handled such as network RX might set PF_MEMALLOC
* again if the socket is related to swap
*/
current->flags &= ~PF_MEMALLOC;
pending = local_softirq_pending();
account_irq_enter_time(current);
__local_bh_disable((unsigned long)__builtin_return_address(0),
SOFTIRQ_OFFSET);
lockdep_softirq_enter();
cpu = smp_processor_id();
restart:
/* Reset the pending bitmask before enabling irqs */
set_softirq_pending(0);
local_irq_enable();
h = softirq_vec;
do {
if (pending & 1) {
unsigned int vec_nr = h - softirq_vec;
int prev_count = preempt_count();
kstat_incr_softirqs_this_cpu(vec_nr);
trace_softirq_entry(vec_nr);
h->action(h);
trace_softirq_exit(vec_nr);
if (unlikely(prev_count != preempt_count())) {
printk(KERN_ERR "huh, entered softirq %u %s %p"
"with preempt_count %08x,"
" exited with %08x?\n", vec_nr,
softirq_to_name[vec_nr], h->action,
prev_count, preempt_count());
preempt_count() = prev_count;
}
rcu_bh_qs(cpu);
}
h++;
pending >>= 1;
} while (pending);
local_irq_disable();
pending = local_softirq_pending();
softirq: reduce latencies In various network workloads, __do_softirq() latencies can be up to 20 ms if HZ=1000, and 200 ms if HZ=100. This is because we iterate 10 times in the softirq dispatcher, and some actions can consume a lot of cycles. This patch changes the fallback to ksoftirqd condition to : - A time limit of 2 ms. - need_resched() being set on current task When one of this condition is met, we wakeup ksoftirqd for further softirq processing if we still have pending softirqs. Using need_resched() as the only condition can trigger RCU stalls, as we can keep BH disabled for too long. I ran several benchmarks and got no significant difference in throughput, but a very significant reduction of latencies (one order of magnitude) : In following bench, 200 antagonist "netperf -t TCP_RR" are started in background, using all available cpus. Then we start one "netperf -t TCP_RR", bound to the cpu handling the NIC IRQ (hard+soft) Before patch : # netperf -H 7.7.7.84 -t TCP_RR -T2,2 -- -k RT_LATENCY,MIN_LATENCY,MAX_LATENCY,P50_LATENCY,P90_LATENCY,P99_LATENCY,MEAN_LATENCY,STDDEV_LATENCY MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.7.84 () port 0 AF_INET : first burst 0 : cpu bind RT_LATENCY=550110.424 MIN_LATENCY=146858 MAX_LATENCY=997109 P50_LATENCY=305000 P90_LATENCY=550000 P99_LATENCY=710000 MEAN_LATENCY=376989.12 STDDEV_LATENCY=184046.92 After patch : # netperf -H 7.7.7.84 -t TCP_RR -T2,2 -- -k RT_LATENCY,MIN_LATENCY,MAX_LATENCY,P50_LATENCY,P90_LATENCY,P99_LATENCY,MEAN_LATENCY,STDDEV_LATENCY MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.7.84 () port 0 AF_INET : first burst 0 : cpu bind RT_LATENCY=40545.492 MIN_LATENCY=9834 MAX_LATENCY=78366 P50_LATENCY=33583 P90_LATENCY=59000 P99_LATENCY=69000 MEAN_LATENCY=38364.67 STDDEV_LATENCY=12865.26 Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: David Miller <davem@davemloft.net> Cc: Tom Herbert <therbert@google.com> Cc: Ben Hutchings <bhutchings@solarflare.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-01-11 07:26:34 +08:00
if (pending) {
if (time_before(jiffies, end) && !need_resched())
goto restart;
wakeup_softirqd();
softirq: reduce latencies In various network workloads, __do_softirq() latencies can be up to 20 ms if HZ=1000, and 200 ms if HZ=100. This is because we iterate 10 times in the softirq dispatcher, and some actions can consume a lot of cycles. This patch changes the fallback to ksoftirqd condition to : - A time limit of 2 ms. - need_resched() being set on current task When one of this condition is met, we wakeup ksoftirqd for further softirq processing if we still have pending softirqs. Using need_resched() as the only condition can trigger RCU stalls, as we can keep BH disabled for too long. I ran several benchmarks and got no significant difference in throughput, but a very significant reduction of latencies (one order of magnitude) : In following bench, 200 antagonist "netperf -t TCP_RR" are started in background, using all available cpus. Then we start one "netperf -t TCP_RR", bound to the cpu handling the NIC IRQ (hard+soft) Before patch : # netperf -H 7.7.7.84 -t TCP_RR -T2,2 -- -k RT_LATENCY,MIN_LATENCY,MAX_LATENCY,P50_LATENCY,P90_LATENCY,P99_LATENCY,MEAN_LATENCY,STDDEV_LATENCY MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.7.84 () port 0 AF_INET : first burst 0 : cpu bind RT_LATENCY=550110.424 MIN_LATENCY=146858 MAX_LATENCY=997109 P50_LATENCY=305000 P90_LATENCY=550000 P99_LATENCY=710000 MEAN_LATENCY=376989.12 STDDEV_LATENCY=184046.92 After patch : # netperf -H 7.7.7.84 -t TCP_RR -T2,2 -- -k RT_LATENCY,MIN_LATENCY,MAX_LATENCY,P50_LATENCY,P90_LATENCY,P99_LATENCY,MEAN_LATENCY,STDDEV_LATENCY MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.7.84 () port 0 AF_INET : first burst 0 : cpu bind RT_LATENCY=40545.492 MIN_LATENCY=9834 MAX_LATENCY=78366 P50_LATENCY=33583 P90_LATENCY=59000 P99_LATENCY=69000 MEAN_LATENCY=38364.67 STDDEV_LATENCY=12865.26 Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: David Miller <davem@davemloft.net> Cc: Tom Herbert <therbert@google.com> Cc: Ben Hutchings <bhutchings@solarflare.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-01-11 07:26:34 +08:00
}
lockdep_softirq_exit();
account_irq_exit_time(current);
__local_bh_enable(SOFTIRQ_OFFSET);
mm: allow PF_MEMALLOC from softirq context This is needed to allow network softirq packet processing to make use of PF_MEMALLOC. Currently softirq context cannot use PF_MEMALLOC due to it not being associated with a task, and therefore not having task flags to fiddle with - thus the gfp to alloc flag mapping ignores the task flags when in interrupts (hard or soft) context. Allowing softirqs to make use of PF_MEMALLOC therefore requires some trickery. This patch borrows the task flags from whatever process happens to be preempted by the softirq. It then modifies the gfp to alloc flags mapping to not exclude task flags in softirq context, and modify the softirq code to save, clear and restore the PF_MEMALLOC flag. The save and clear, ensures the preempted task's PF_MEMALLOC flag doesn't leak into the softirq. The restore ensures a softirq's PF_MEMALLOC flag cannot leak back into the preempted process. This should be safe due to the following reasons Softirqs can run on multiple CPUs sure but the same task should not be executing the same softirq code. Neither should the softirq handler be preempted by any other softirq handler so the flags should not leak to an unrelated softirq. Softirqs re-enable hardware interrupts in __do_softirq() so can be preempted by hardware interrupts so PF_MEMALLOC is inherited by the hard IRQ. However, this is similar to a process in reclaim being preempted by a hardirq. While PF_MEMALLOC is set, gfp_to_alloc_flags() distinguishes between hard and soft irqs and avoids giving a hardirq the ALLOC_NO_WATERMARKS flag. If the softirq is deferred to ksoftirq then its flags may be used instead of a normal tasks but as the softirq cannot be preempted, the PF_MEMALLOC flag does not leak to other code by accident. [davem@davemloft.net: Document why PF_MEMALLOC is safe] Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Mel Gorman <mgorman@suse.de> Cc: David Miller <davem@davemloft.net> Cc: Neil Brown <neilb@suse.de> Cc: Mike Christie <michaelc@cs.wisc.edu> Cc: Eric B Munson <emunson@mgebm.net> Cc: Eric Dumazet <eric.dumazet@gmail.com> Cc: Sebastian Andrzej Siewior <sebastian@breakpoint.cc> Cc: Mel Gorman <mgorman@suse.de> Cc: Christoph Lameter <cl@linux.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-08-01 07:44:07 +08:00
tsk_restore_flags(current, old_flags, PF_MEMALLOC);
}
#ifndef __ARCH_HAS_DO_SOFTIRQ
asmlinkage void do_softirq(void)
{
__u32 pending;
unsigned long flags;
if (in_interrupt())
return;
local_irq_save(flags);
pending = local_softirq_pending();
if (pending)
__do_softirq();
local_irq_restore(flags);
}
#endif
/*
* Enter an interrupt context.
*/
void irq_enter(void)
{
int cpu = smp_processor_id();
"Tree RCU": scalable classic RCU implementation This patch fixes a long-standing performance bug in classic RCU that results in massive internal-to-RCU lock contention on systems with more than a few hundred CPUs. Although this patch creates a separate flavor of RCU for ease of review and patch maintenance, it is intended to replace classic RCU. This patch still handles stress better than does mainline, so I am still calling it ready for inclusion. This patch is against the -tip tree. Nevertheless, experience on an actual 1000+ CPU machine would still be most welcome. Most of the changes noted below were found while creating an rcutiny (which should permit ejecting the current rcuclassic) and while doing detailed line-by-line documentation. Updates from v9 (http://lkml.org/lkml/2008/12/2/334): o Fixes from remainder of line-by-line code walkthrough, including comment spelling, initialization, undesirable narrowing due to type conversion, removing redundant memory barriers, removing redundant local-variable initialization, and removing redundant local variables. I do not believe that any of these fixes address the CPU-hotplug issues that Andi Kleen was seeing, but please do give it a whirl in case the machine is smarter than I am. A writeup from the walkthrough may be found at the following URL, in case you are suffering from terminal insomnia or masochism: http://www.kernel.org/pub/linux/kernel/people/paulmck/tmp/rcutree-walkthrough.2008.12.16a.pdf o Made rcutree tracing use seq_file, as suggested some time ago by Lai Jiangshan. o Added a .csv variant of the rcudata debugfs trace file, to allow people having thousands of CPUs to drop the data into a spreadsheet. Tested with oocalc and gnumeric. Updated documentation to suit. Updates from v8 (http://lkml.org/lkml/2008/11/15/139): o Fix a theoretical race between grace-period initialization and force_quiescent_state() that could occur if more than three jiffies were required to carry out the grace-period initialization. Which it might, if you had enough CPUs. o Apply Ingo's printk-standardization patch. o Substitute local variables for repeated accesses to global variables. o Fix comment misspellings and redundant (but harmless) increments of ->n_rcu_pending (this latter after having explicitly added it). o Apply checkpatch fixes. Updates from v7 (http://lkml.org/lkml/2008/10/10/291): o Fixed a number of problems noted by Gautham Shenoy, including the cpu-stall-detection bug that he was having difficulty convincing me was real. ;-) o Changed cpu-stall detection to wait for ten seconds rather than three in order to reduce false positive, as suggested by Ingo Molnar. o Produced a design document (http://lwn.net/Articles/305782/). The act of writing this document uncovered a number of both theoretical and "here and now" bugs as noted below. o Fix dynticks_nesting accounting confusion, simplify WARN_ON() condition, fix kerneldoc comments, and add memory barriers in dynticks interface functions. o Add more data to tracing. o Remove unused "rcu_barrier" field from rcu_data structure. o Count calls to rcu_pending() from scheduling-clock interrupt to use as a surrogate timebase should jiffies stop counting. o Fix a theoretical race between force_quiescent_state() and grace-period initialization. Yes, initialization does have to go on for some jiffies for this race to occur, but given enough CPUs... Updates from v6 (http://lkml.org/lkml/2008/9/23/448): o Fix a number of checkpatch.pl complaints. o Apply review comments from Ingo Molnar and Lai Jiangshan on the stall-detection code. o Fix several bugs in !CONFIG_SMP builds. o Fix a misspelled config-parameter name so that RCU now announces at boot time if stall detection is configured. o Run tests on numerous combinations of configurations parameters, which after the fixes above, now build and run correctly. Updates from v5 (http://lkml.org/lkml/2008/9/15/92, bad subject line): o Fix a compiler error in the !CONFIG_FANOUT_EXACT case (blew a changeset some time ago, and finally got around to retesting this option). o Fix some tracing bugs in rcupreempt that caused incorrect totals to be printed. o I now test with a more brutal random-selection online/offline script (attached). Probably more brutal than it needs to be on the people reading it as well, but so it goes. o A number of optimizations and usability improvements: o Make rcu_pending() ignore the grace-period timeout when there is no grace period in progress. o Make force_quiescent_state() avoid going for a global lock in the case where there is no grace period in progress. o Rearrange struct fields to improve struct layout. o Make call_rcu() initiate a grace period if RCU was idle, rather than waiting for the next scheduling clock interrupt. o Invoke rcu_irq_enter() and rcu_irq_exit() only when idle, as suggested by Andi Kleen. I still don't completely trust this change, and might back it out. o Make CONFIG_RCU_TRACE be the single config variable manipulated for all forms of RCU, instead of the prior confusion. o Document tracing files and formats for both rcupreempt and rcutree. Updates from v4 for those missing v5 given its bad subject line: o Separated dynticks interface so that NMIs and irqs call separate functions, greatly simplifying it. In particular, this code no longer requires a proof of correctness. ;-) o Separated dynticks state out into its own per-CPU structure, avoiding the duplicated accounting. o The case where a dynticks-idle CPU runs an irq handler that invokes call_rcu() is now correctly handled, forcing that CPU out of dynticks-idle mode. o Review comments have been applied (thank you all!!!). For but one example, fixed the dynticks-ordering issue that Manfred pointed out, saving me much debugging. ;-) o Adjusted rcuclassic and rcupreempt to handle dynticks changes. Attached is an updated patch to Classic RCU that applies a hierarchy, greatly reducing the contention on the top-level lock for large machines. This passes 10-hour concurrent rcutorture and online-offline testing on 128-CPU ppc64 without dynticks enabled, and exposes some timekeeping bugs in presence of dynticks (exciting working on a system where "sleep 1" hangs until interrupted...), which were fixed in the 2.6.27 kernel. It is getting more reliable than mainline by some measures, so the next version will be against -tip for inclusion. See also Manfred Spraul's recent patches (or his earlier work from 2004 at http://marc.info/?l=linux-kernel&m=108546384711797&w=2). We will converge onto a common patch in the fullness of time, but are currently exploring different regions of the design space. That said, I have already gratefully stolen quite a few of Manfred's ideas. This patch provides CONFIG_RCU_FANOUT, which controls the bushiness of the RCU hierarchy. Defaults to 32 on 32-bit machines and 64 on 64-bit machines. If CONFIG_NR_CPUS is less than CONFIG_RCU_FANOUT, there is no hierarchy. By default, the RCU initialization code will adjust CONFIG_RCU_FANOUT to balance the hierarchy, so strongly NUMA architectures may choose to set CONFIG_RCU_FANOUT_EXACT to disable this balancing, allowing the hierarchy to be exactly aligned to the underlying hardware. Up to two levels of hierarchy are permitted (in addition to the root node), allowing up to 16,384 CPUs on 32-bit systems and up to 262,144 CPUs on 64-bit systems. I just know that I am going to regret saying this, but this seems more than sufficient for the foreseeable future. (Some architectures might wish to set CONFIG_RCU_FANOUT=4, which would limit such architectures to 64 CPUs. If this becomes a real problem, additional levels can be added, but I doubt that it will make a significant difference on real hardware.) In the common case, a given CPU will manipulate its private rcu_data structure and the rcu_node structure that it shares with its immediate neighbors. This can reduce both lock and memory contention by multiple orders of magnitude, which should eliminate the need for the strange manipulations that are reported to be required when running Linux on very large systems. Some shortcomings: o More bugs will probably surface as a result of an ongoing line-by-line code inspection. Patches will be provided as required. o There are probably hangs, rcutorture failures, &c. Seems quite stable on a 128-CPU machine, but that is kind of small compared to 4096 CPUs. However, seems to do better than mainline. Patches will be provided as required. o The memory footprint of this version is several KB larger than rcuclassic. A separate UP-only rcutiny patch will be provided, which will reduce the memory footprint significantly, even compared to the old rcuclassic. One such patch passes light testing, and has a memory footprint smaller even than rcuclassic. Initial reaction from various embedded guys was "it is not worth it", so am putting it aside. Credits: o Manfred Spraul for ideas, review comments, and bugs spotted, as well as some good friendly competition. ;-) o Josh Triplett, Ingo Molnar, Peter Zijlstra, Mathieu Desnoyers, Lai Jiangshan, Andi Kleen, Andy Whitcroft, and Andrew Morton for reviews and comments. o Thomas Gleixner for much-needed help with some timer issues (see patches below). o Jon M. Tollefson, Tim Pepper, Andrew Theurer, Jose R. Santos, Andy Whitcroft, Darrick Wong, Nishanth Aravamudan, Anton Blanchard, Dave Kleikamp, and Nathan Lynch for keeping machines alive despite my heavy abuse^Wtesting. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-12-19 04:55:32 +08:00
rcu_irq_enter();
if (is_idle_task(current) && !in_interrupt()) {
/*
* Prevent raise_softirq from needlessly waking up ksoftirqd
* here, as softirq will be serviced on return from interrupt.
*/
local_bh_disable();
tick_check_idle(cpu);
_local_bh_enable();
}
__irq_enter();
}
genirq: Provide forced interrupt threading Add a commandline parameter "threadirqs" which forces all interrupts except those marked IRQF_NO_THREAD to run threaded. That's mostly a debug option to allow retrieving better debug data from crashing interrupt handlers. If "threadirqs" is not enabled on the kernel command line, then there is no impact in the interrupt hotpath. Architecture code needs to select CONFIG_IRQ_FORCED_THREADING after marking the interrupts which cant be threaded IRQF_NO_THREAD. All interrupts which have IRQF_TIMER set are implict marked IRQF_NO_THREAD. Also all PER_CPU interrupts are excluded. Forced threading hard interrupts also forces all soft interrupt handling into thread context. When enabled it might slow down things a bit, but for debugging problems in interrupt code it's a reasonable penalty as it does not immediately crash and burn the machine when an interrupt handler is buggy. Some test results on a Core2Duo machine: Cache cold run of: # time git grep irq_desc non-threaded threaded real 1m18.741s 1m19.061s user 0m1.874s 0m1.757s sys 0m5.843s 0m5.427s # iperf -c server non-threaded [ 3] 0.0-10.0 sec 1.09 GBytes 933 Mbits/sec [ 3] 0.0-10.0 sec 1.09 GBytes 934 Mbits/sec [ 3] 0.0-10.0 sec 1.09 GBytes 933 Mbits/sec threaded [ 3] 0.0-10.0 sec 1.09 GBytes 939 Mbits/sec [ 3] 0.0-10.0 sec 1.09 GBytes 934 Mbits/sec [ 3] 0.0-10.0 sec 1.09 GBytes 937 Mbits/sec Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: Peter Zijlstra <peterz@infradead.org> LKML-Reference: <20110223234956.772668648@linutronix.de>
2011-02-24 07:52:23 +08:00
static inline void invoke_softirq(void)
{
if (!force_irqthreads)
genirq: Provide forced interrupt threading Add a commandline parameter "threadirqs" which forces all interrupts except those marked IRQF_NO_THREAD to run threaded. That's mostly a debug option to allow retrieving better debug data from crashing interrupt handlers. If "threadirqs" is not enabled on the kernel command line, then there is no impact in the interrupt hotpath. Architecture code needs to select CONFIG_IRQ_FORCED_THREADING after marking the interrupts which cant be threaded IRQF_NO_THREAD. All interrupts which have IRQF_TIMER set are implict marked IRQF_NO_THREAD. Also all PER_CPU interrupts are excluded. Forced threading hard interrupts also forces all soft interrupt handling into thread context. When enabled it might slow down things a bit, but for debugging problems in interrupt code it's a reasonable penalty as it does not immediately crash and burn the machine when an interrupt handler is buggy. Some test results on a Core2Duo machine: Cache cold run of: # time git grep irq_desc non-threaded threaded real 1m18.741s 1m19.061s user 0m1.874s 0m1.757s sys 0m5.843s 0m5.427s # iperf -c server non-threaded [ 3] 0.0-10.0 sec 1.09 GBytes 933 Mbits/sec [ 3] 0.0-10.0 sec 1.09 GBytes 934 Mbits/sec [ 3] 0.0-10.0 sec 1.09 GBytes 933 Mbits/sec threaded [ 3] 0.0-10.0 sec 1.09 GBytes 939 Mbits/sec [ 3] 0.0-10.0 sec 1.09 GBytes 934 Mbits/sec [ 3] 0.0-10.0 sec 1.09 GBytes 937 Mbits/sec Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: Peter Zijlstra <peterz@infradead.org> LKML-Reference: <20110223234956.772668648@linutronix.de>
2011-02-24 07:52:23 +08:00
__do_softirq();
else
genirq: Provide forced interrupt threading Add a commandline parameter "threadirqs" which forces all interrupts except those marked IRQF_NO_THREAD to run threaded. That's mostly a debug option to allow retrieving better debug data from crashing interrupt handlers. If "threadirqs" is not enabled on the kernel command line, then there is no impact in the interrupt hotpath. Architecture code needs to select CONFIG_IRQ_FORCED_THREADING after marking the interrupts which cant be threaded IRQF_NO_THREAD. All interrupts which have IRQF_TIMER set are implict marked IRQF_NO_THREAD. Also all PER_CPU interrupts are excluded. Forced threading hard interrupts also forces all soft interrupt handling into thread context. When enabled it might slow down things a bit, but for debugging problems in interrupt code it's a reasonable penalty as it does not immediately crash and burn the machine when an interrupt handler is buggy. Some test results on a Core2Duo machine: Cache cold run of: # time git grep irq_desc non-threaded threaded real 1m18.741s 1m19.061s user 0m1.874s 0m1.757s sys 0m5.843s 0m5.427s # iperf -c server non-threaded [ 3] 0.0-10.0 sec 1.09 GBytes 933 Mbits/sec [ 3] 0.0-10.0 sec 1.09 GBytes 934 Mbits/sec [ 3] 0.0-10.0 sec 1.09 GBytes 933 Mbits/sec threaded [ 3] 0.0-10.0 sec 1.09 GBytes 939 Mbits/sec [ 3] 0.0-10.0 sec 1.09 GBytes 934 Mbits/sec [ 3] 0.0-10.0 sec 1.09 GBytes 937 Mbits/sec Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: Peter Zijlstra <peterz@infradead.org> LKML-Reference: <20110223234956.772668648@linutronix.de>
2011-02-24 07:52:23 +08:00
wakeup_softirqd();
}
/*
* Exit an interrupt context. Process softirqs if needed and possible:
*/
void irq_exit(void)
{
#ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED
local_irq_disable();
#else
WARN_ON_ONCE(!irqs_disabled());
#endif
account_irq_exit_time(current);
trace_hardirq_exit();
sub_preempt_count(HARDIRQ_OFFSET);
if (!in_interrupt() && local_softirq_pending())
invoke_softirq();
#ifdef CONFIG_NO_HZ
/* Make sure that timer wheel updates are propagated */
"Tree RCU": scalable classic RCU implementation This patch fixes a long-standing performance bug in classic RCU that results in massive internal-to-RCU lock contention on systems with more than a few hundred CPUs. Although this patch creates a separate flavor of RCU for ease of review and patch maintenance, it is intended to replace classic RCU. This patch still handles stress better than does mainline, so I am still calling it ready for inclusion. This patch is against the -tip tree. Nevertheless, experience on an actual 1000+ CPU machine would still be most welcome. Most of the changes noted below were found while creating an rcutiny (which should permit ejecting the current rcuclassic) and while doing detailed line-by-line documentation. Updates from v9 (http://lkml.org/lkml/2008/12/2/334): o Fixes from remainder of line-by-line code walkthrough, including comment spelling, initialization, undesirable narrowing due to type conversion, removing redundant memory barriers, removing redundant local-variable initialization, and removing redundant local variables. I do not believe that any of these fixes address the CPU-hotplug issues that Andi Kleen was seeing, but please do give it a whirl in case the machine is smarter than I am. A writeup from the walkthrough may be found at the following URL, in case you are suffering from terminal insomnia or masochism: http://www.kernel.org/pub/linux/kernel/people/paulmck/tmp/rcutree-walkthrough.2008.12.16a.pdf o Made rcutree tracing use seq_file, as suggested some time ago by Lai Jiangshan. o Added a .csv variant of the rcudata debugfs trace file, to allow people having thousands of CPUs to drop the data into a spreadsheet. Tested with oocalc and gnumeric. Updated documentation to suit. Updates from v8 (http://lkml.org/lkml/2008/11/15/139): o Fix a theoretical race between grace-period initialization and force_quiescent_state() that could occur if more than three jiffies were required to carry out the grace-period initialization. Which it might, if you had enough CPUs. o Apply Ingo's printk-standardization patch. o Substitute local variables for repeated accesses to global variables. o Fix comment misspellings and redundant (but harmless) increments of ->n_rcu_pending (this latter after having explicitly added it). o Apply checkpatch fixes. Updates from v7 (http://lkml.org/lkml/2008/10/10/291): o Fixed a number of problems noted by Gautham Shenoy, including the cpu-stall-detection bug that he was having difficulty convincing me was real. ;-) o Changed cpu-stall detection to wait for ten seconds rather than three in order to reduce false positive, as suggested by Ingo Molnar. o Produced a design document (http://lwn.net/Articles/305782/). The act of writing this document uncovered a number of both theoretical and "here and now" bugs as noted below. o Fix dynticks_nesting accounting confusion, simplify WARN_ON() condition, fix kerneldoc comments, and add memory barriers in dynticks interface functions. o Add more data to tracing. o Remove unused "rcu_barrier" field from rcu_data structure. o Count calls to rcu_pending() from scheduling-clock interrupt to use as a surrogate timebase should jiffies stop counting. o Fix a theoretical race between force_quiescent_state() and grace-period initialization. Yes, initialization does have to go on for some jiffies for this race to occur, but given enough CPUs... Updates from v6 (http://lkml.org/lkml/2008/9/23/448): o Fix a number of checkpatch.pl complaints. o Apply review comments from Ingo Molnar and Lai Jiangshan on the stall-detection code. o Fix several bugs in !CONFIG_SMP builds. o Fix a misspelled config-parameter name so that RCU now announces at boot time if stall detection is configured. o Run tests on numerous combinations of configurations parameters, which after the fixes above, now build and run correctly. Updates from v5 (http://lkml.org/lkml/2008/9/15/92, bad subject line): o Fix a compiler error in the !CONFIG_FANOUT_EXACT case (blew a changeset some time ago, and finally got around to retesting this option). o Fix some tracing bugs in rcupreempt that caused incorrect totals to be printed. o I now test with a more brutal random-selection online/offline script (attached). Probably more brutal than it needs to be on the people reading it as well, but so it goes. o A number of optimizations and usability improvements: o Make rcu_pending() ignore the grace-period timeout when there is no grace period in progress. o Make force_quiescent_state() avoid going for a global lock in the case where there is no grace period in progress. o Rearrange struct fields to improve struct layout. o Make call_rcu() initiate a grace period if RCU was idle, rather than waiting for the next scheduling clock interrupt. o Invoke rcu_irq_enter() and rcu_irq_exit() only when idle, as suggested by Andi Kleen. I still don't completely trust this change, and might back it out. o Make CONFIG_RCU_TRACE be the single config variable manipulated for all forms of RCU, instead of the prior confusion. o Document tracing files and formats for both rcupreempt and rcutree. Updates from v4 for those missing v5 given its bad subject line: o Separated dynticks interface so that NMIs and irqs call separate functions, greatly simplifying it. In particular, this code no longer requires a proof of correctness. ;-) o Separated dynticks state out into its own per-CPU structure, avoiding the duplicated accounting. o The case where a dynticks-idle CPU runs an irq handler that invokes call_rcu() is now correctly handled, forcing that CPU out of dynticks-idle mode. o Review comments have been applied (thank you all!!!). For but one example, fixed the dynticks-ordering issue that Manfred pointed out, saving me much debugging. ;-) o Adjusted rcuclassic and rcupreempt to handle dynticks changes. Attached is an updated patch to Classic RCU that applies a hierarchy, greatly reducing the contention on the top-level lock for large machines. This passes 10-hour concurrent rcutorture and online-offline testing on 128-CPU ppc64 without dynticks enabled, and exposes some timekeeping bugs in presence of dynticks (exciting working on a system where "sleep 1" hangs until interrupted...), which were fixed in the 2.6.27 kernel. It is getting more reliable than mainline by some measures, so the next version will be against -tip for inclusion. See also Manfred Spraul's recent patches (or his earlier work from 2004 at http://marc.info/?l=linux-kernel&m=108546384711797&w=2). We will converge onto a common patch in the fullness of time, but are currently exploring different regions of the design space. That said, I have already gratefully stolen quite a few of Manfred's ideas. This patch provides CONFIG_RCU_FANOUT, which controls the bushiness of the RCU hierarchy. Defaults to 32 on 32-bit machines and 64 on 64-bit machines. If CONFIG_NR_CPUS is less than CONFIG_RCU_FANOUT, there is no hierarchy. By default, the RCU initialization code will adjust CONFIG_RCU_FANOUT to balance the hierarchy, so strongly NUMA architectures may choose to set CONFIG_RCU_FANOUT_EXACT to disable this balancing, allowing the hierarchy to be exactly aligned to the underlying hardware. Up to two levels of hierarchy are permitted (in addition to the root node), allowing up to 16,384 CPUs on 32-bit systems and up to 262,144 CPUs on 64-bit systems. I just know that I am going to regret saying this, but this seems more than sufficient for the foreseeable future. (Some architectures might wish to set CONFIG_RCU_FANOUT=4, which would limit such architectures to 64 CPUs. If this becomes a real problem, additional levels can be added, but I doubt that it will make a significant difference on real hardware.) In the common case, a given CPU will manipulate its private rcu_data structure and the rcu_node structure that it shares with its immediate neighbors. This can reduce both lock and memory contention by multiple orders of magnitude, which should eliminate the need for the strange manipulations that are reported to be required when running Linux on very large systems. Some shortcomings: o More bugs will probably surface as a result of an ongoing line-by-line code inspection. Patches will be provided as required. o There are probably hangs, rcutorture failures, &c. Seems quite stable on a 128-CPU machine, but that is kind of small compared to 4096 CPUs. However, seems to do better than mainline. Patches will be provided as required. o The memory footprint of this version is several KB larger than rcuclassic. A separate UP-only rcutiny patch will be provided, which will reduce the memory footprint significantly, even compared to the old rcuclassic. One such patch passes light testing, and has a memory footprint smaller even than rcuclassic. Initial reaction from various embedded guys was "it is not worth it", so am putting it aside. Credits: o Manfred Spraul for ideas, review comments, and bugs spotted, as well as some good friendly competition. ;-) o Josh Triplett, Ingo Molnar, Peter Zijlstra, Mathieu Desnoyers, Lai Jiangshan, Andi Kleen, Andy Whitcroft, and Andrew Morton for reviews and comments. o Thomas Gleixner for much-needed help with some timer issues (see patches below). o Jon M. Tollefson, Tim Pepper, Andrew Theurer, Jose R. Santos, Andy Whitcroft, Darrick Wong, Nishanth Aravamudan, Anton Blanchard, Dave Kleikamp, and Nathan Lynch for keeping machines alive despite my heavy abuse^Wtesting. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-12-19 04:55:32 +08:00
if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
nohz: Separate out irq exit and idle loop dyntick logic The tick_nohz_stop_sched_tick() function, which tries to delay the next timer tick as long as possible, can be called from two places: - From the idle loop to start the dytick idle mode - From interrupt exit if we have interrupted the dyntick idle mode, so that we reprogram the next tick event in case the irq changed some internal state that requires this action. There are only few minor differences between both that are handled by that function, driven by the ts->inidle cpu variable and the inidle parameter. The whole guarantees that we only update the dyntick mode on irq exit if we actually interrupted the dyntick idle mode, and that we enter in RCU extended quiescent state from idle loop entry only. Split this function into: - tick_nohz_idle_enter(), which sets ts->inidle to 1, enters dynticks idle mode unconditionally if it can, and enters into RCU extended quiescent state. - tick_nohz_irq_exit() which only updates the dynticks idle mode when ts->inidle is set (ie: if tick_nohz_idle_enter() has been called). To maintain symmetry, tick_nohz_restart_sched_tick() has been renamed into tick_nohz_idle_exit(). This simplifies the code and micro-optimize the irq exit path (no need for local_irq_save there). This also prepares for the split between dynticks and rcu extended quiescent state logics. We'll need this split to further fix illegal uses of RCU in extended quiescent states in the idle loop. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Mike Frysinger <vapier@gentoo.org> Cc: Guan Xuetao <gxt@mprc.pku.edu.cn> Cc: David Miller <davem@davemloft.net> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: Hans-Christian Egtvedt <hans-christian.egtvedt@atmel.com> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Russell King <linux@arm.linux.org.uk> Cc: Paul Mackerras <paulus@samba.org> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Paul Mundt <lethal@linux-sh.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Reviewed-by: Josh Triplett <josh@joshtriplett.org>
2011-10-08 00:22:06 +08:00
tick_nohz_irq_exit();
#endif
rcu: Fix early call to rcu_idle_enter() On the irq exit path, tick_nohz_irq_exit() may raise a softirq, which action leads to the wake up path and select_task_rq_fair() that makes use of rcu to iterate the domains. This is an illegal use of RCU because we may be in RCU extended quiescent state if we interrupted an RCU-idle window in the idle loop: [ 132.978883] =============================== [ 132.978883] [ INFO: suspicious RCU usage. ] [ 132.978883] ------------------------------- [ 132.978883] kernel/sched_fair.c:1707 suspicious rcu_dereference_check() usage! [ 132.978883] [ 132.978883] other info that might help us debug this: [ 132.978883] [ 132.978883] [ 132.978883] rcu_scheduler_active = 1, debug_locks = 0 [ 132.978883] RCU used illegally from extended quiescent state! [ 132.978883] 2 locks held by swapper/0: [ 132.978883] #0: (&p->pi_lock){-.-.-.}, at: [<ffffffff8105a729>] try_to_wake_up+0x39/0x2f0 [ 132.978883] #1: (rcu_read_lock){.+.+..}, at: [<ffffffff8105556a>] select_task_rq_fair+0x6a/0xec0 [ 132.978883] [ 132.978883] stack backtrace: [ 132.978883] Pid: 0, comm: swapper Tainted: G W 3.0.0+ #178 [ 132.978883] Call Trace: [ 132.978883] <IRQ> [<ffffffff810a01f6>] lockdep_rcu_suspicious+0xe6/0x100 [ 132.978883] [<ffffffff81055c49>] select_task_rq_fair+0x749/0xec0 [ 132.978883] [<ffffffff8105556a>] ? select_task_rq_fair+0x6a/0xec0 [ 132.978883] [<ffffffff812fe494>] ? do_raw_spin_lock+0x54/0x150 [ 132.978883] [<ffffffff810a1f2d>] ? trace_hardirqs_on+0xd/0x10 [ 132.978883] [<ffffffff8105a7c3>] try_to_wake_up+0xd3/0x2f0 [ 132.978883] [<ffffffff81094f98>] ? ktime_get+0x68/0xf0 [ 132.978883] [<ffffffff8105aa35>] wake_up_process+0x15/0x20 [ 132.978883] [<ffffffff81069dd5>] raise_softirq_irqoff+0x65/0x110 [ 132.978883] [<ffffffff8108eb65>] __hrtimer_start_range_ns+0x415/0x5a0 [ 132.978883] [<ffffffff812fe3ee>] ? do_raw_spin_unlock+0x5e/0xb0 [ 132.978883] [<ffffffff8108ed08>] hrtimer_start+0x18/0x20 [ 132.978883] [<ffffffff8109c9c3>] tick_nohz_stop_sched_tick+0x393/0x450 [ 132.978883] [<ffffffff810694f2>] irq_exit+0xd2/0x100 [ 132.978883] [<ffffffff81829e96>] do_IRQ+0x66/0xe0 [ 132.978883] [<ffffffff81820d53>] common_interrupt+0x13/0x13 [ 132.978883] <EOI> [<ffffffff8103434b>] ? native_safe_halt+0xb/0x10 [ 132.978883] [<ffffffff810a1f2d>] ? trace_hardirqs_on+0xd/0x10 [ 132.978883] [<ffffffff810144ea>] default_idle+0xba/0x370 [ 132.978883] [<ffffffff810147fe>] amd_e400_idle+0x5e/0x130 [ 132.978883] [<ffffffff8100a9f6>] cpu_idle+0xb6/0x120 [ 132.978883] [<ffffffff817f217f>] rest_init+0xef/0x150 [ 132.978883] [<ffffffff817f20e2>] ? rest_init+0x52/0x150 [ 132.978883] [<ffffffff81ed9cf3>] start_kernel+0x3da/0x3e5 [ 132.978883] [<ffffffff81ed9346>] x86_64_start_reservations+0x131/0x135 [ 132.978883] [<ffffffff81ed944d>] x86_64_start_kernel+0x103/0x112 Fix this by calling rcu_idle_enter() after tick_nohz_irq_exit(). Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Reviewed-by: Josh Triplett <josh@joshtriplett.org>
2011-10-08 07:31:02 +08:00
rcu_irq_exit();
}
/*
* This function must run with irqs disabled!
*/
inline void raise_softirq_irqoff(unsigned int nr)
{
__raise_softirq_irqoff(nr);
/*
* If we're in an interrupt or softirq, we're done
* (this also catches softirq-disabled code). We will
* actually run the softirq once we return from
* the irq or softirq.
*
* Otherwise we wake up ksoftirqd to make sure we
* schedule the softirq soon.
*/
if (!in_interrupt())
wakeup_softirqd();
}
void raise_softirq(unsigned int nr)
{
unsigned long flags;
local_irq_save(flags);
raise_softirq_irqoff(nr);
local_irq_restore(flags);
}
void __raise_softirq_irqoff(unsigned int nr)
{
trace_softirq_raise(nr);
or_softirq_pending(1UL << nr);
}
void open_softirq(int nr, void (*action)(struct softirq_action *))
{
softirq_vec[nr].action = action;
}
/*
* Tasklets
*/
struct tasklet_head
{
struct tasklet_struct *head;
struct tasklet_struct **tail;
};
static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
void __tasklet_schedule(struct tasklet_struct *t)
{
unsigned long flags;
local_irq_save(flags);
t->next = NULL;
*__this_cpu_read(tasklet_vec.tail) = t;
__this_cpu_write(tasklet_vec.tail, &(t->next));
raise_softirq_irqoff(TASKLET_SOFTIRQ);
local_irq_restore(flags);
}
EXPORT_SYMBOL(__tasklet_schedule);
void __tasklet_hi_schedule(struct tasklet_struct *t)
{
unsigned long flags;
local_irq_save(flags);
t->next = NULL;
*__this_cpu_read(tasklet_hi_vec.tail) = t;
__this_cpu_write(tasklet_hi_vec.tail, &(t->next));
raise_softirq_irqoff(HI_SOFTIRQ);
local_irq_restore(flags);
}
EXPORT_SYMBOL(__tasklet_hi_schedule);
void __tasklet_hi_schedule_first(struct tasklet_struct *t)
{
BUG_ON(!irqs_disabled());
t->next = __this_cpu_read(tasklet_hi_vec.head);
__this_cpu_write(tasklet_hi_vec.head, t);
__raise_softirq_irqoff(HI_SOFTIRQ);
}
EXPORT_SYMBOL(__tasklet_hi_schedule_first);
static void tasklet_action(struct softirq_action *a)
{
struct tasklet_struct *list;
local_irq_disable();
list = __this_cpu_read(tasklet_vec.head);
__this_cpu_write(tasklet_vec.head, NULL);
__this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head);
local_irq_enable();
while (list) {
struct tasklet_struct *t = list;
list = list->next;
if (tasklet_trylock(t)) {
if (!atomic_read(&t->count)) {
if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
BUG();
t->func(t->data);
tasklet_unlock(t);
continue;
}
tasklet_unlock(t);
}
local_irq_disable();
t->next = NULL;
*__this_cpu_read(tasklet_vec.tail) = t;
__this_cpu_write(tasklet_vec.tail, &(t->next));
__raise_softirq_irqoff(TASKLET_SOFTIRQ);
local_irq_enable();
}
}
static void tasklet_hi_action(struct softirq_action *a)
{
struct tasklet_struct *list;
local_irq_disable();
list = __this_cpu_read(tasklet_hi_vec.head);
__this_cpu_write(tasklet_hi_vec.head, NULL);
__this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head);
local_irq_enable();
while (list) {
struct tasklet_struct *t = list;
list = list->next;
if (tasklet_trylock(t)) {
if (!atomic_read(&t->count)) {
if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
BUG();
t->func(t->data);
tasklet_unlock(t);
continue;
}
tasklet_unlock(t);
}
local_irq_disable();
t->next = NULL;
*__this_cpu_read(tasklet_hi_vec.tail) = t;
__this_cpu_write(tasklet_hi_vec.tail, &(t->next));
__raise_softirq_irqoff(HI_SOFTIRQ);
local_irq_enable();
}
}
void tasklet_init(struct tasklet_struct *t,
void (*func)(unsigned long), unsigned long data)
{
t->next = NULL;
t->state = 0;
atomic_set(&t->count, 0);
t->func = func;
t->data = data;
}
EXPORT_SYMBOL(tasklet_init);
void tasklet_kill(struct tasklet_struct *t)
{
if (in_interrupt())
printk("Attempt to kill tasklet from interrupt\n");
while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
do {
yield();
} while (test_bit(TASKLET_STATE_SCHED, &t->state));
}
tasklet_unlock_wait(t);
clear_bit(TASKLET_STATE_SCHED, &t->state);
}
EXPORT_SYMBOL(tasklet_kill);
/*
* tasklet_hrtimer
*/
/*
* The trampoline is called when the hrtimer expires. It schedules a tasklet
* to run __tasklet_hrtimer_trampoline() which in turn will call the intended
* hrtimer callback, but from softirq context.
*/
static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
{
struct tasklet_hrtimer *ttimer =
container_of(timer, struct tasklet_hrtimer, timer);
tasklet_hi_schedule(&ttimer->tasklet);
return HRTIMER_NORESTART;
}
/*
* Helper function which calls the hrtimer callback from
* tasklet/softirq context
*/
static void __tasklet_hrtimer_trampoline(unsigned long data)
{
struct tasklet_hrtimer *ttimer = (void *)data;
enum hrtimer_restart restart;
restart = ttimer->function(&ttimer->timer);
if (restart != HRTIMER_NORESTART)
hrtimer_restart(&ttimer->timer);
}
/**
* tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks
* @ttimer: tasklet_hrtimer which is initialized
* @function: hrtimer callback function which gets called from softirq context
* @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME)
* @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL)
*/
void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
enum hrtimer_restart (*function)(struct hrtimer *),
clockid_t which_clock, enum hrtimer_mode mode)
{
hrtimer_init(&ttimer->timer, which_clock, mode);
ttimer->timer.function = __hrtimer_tasklet_trampoline;
tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline,
(unsigned long)ttimer);
ttimer->function = function;
}
EXPORT_SYMBOL_GPL(tasklet_hrtimer_init);
/*
* Remote softirq bits
*/
DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
EXPORT_PER_CPU_SYMBOL(softirq_work_list);
static void __local_trigger(struct call_single_data *cp, int softirq)
{
struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]);
list_add_tail(&cp->list, head);
/* Trigger the softirq only if the list was previously empty. */
if (head->next == &cp->list)
raise_softirq_irqoff(softirq);
}
#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
static void remote_softirq_receive(void *data)
{
struct call_single_data *cp = data;
unsigned long flags;
int softirq;
softirq = cp->priv;
local_irq_save(flags);
__local_trigger(cp, softirq);
local_irq_restore(flags);
}
static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
{
if (cpu_online(cpu)) {
cp->func = remote_softirq_receive;
cp->info = cp;
cp->flags = 0;
cp->priv = softirq;
__smp_call_function_single(cpu, cp, 0);
return 0;
}
return 1;
}
#else /* CONFIG_USE_GENERIC_SMP_HELPERS */
static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
{
return 1;
}
#endif
/**
* __send_remote_softirq - try to schedule softirq work on a remote cpu
* @cp: private SMP call function data area
* @cpu: the remote cpu
* @this_cpu: the currently executing cpu
* @softirq: the softirq for the work
*
* Attempt to schedule softirq work on a remote cpu. If this cannot be
* done, the work is instead queued up on the local cpu.
*
* Interrupts must be disabled.
*/
void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq)
{
if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq))
__local_trigger(cp, softirq);
}
EXPORT_SYMBOL(__send_remote_softirq);
/**
* send_remote_softirq - try to schedule softirq work on a remote cpu
* @cp: private SMP call function data area
* @cpu: the remote cpu
* @softirq: the softirq for the work
*
* Like __send_remote_softirq except that disabling interrupts and
* computing the current cpu is done for the caller.
*/
void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
{
unsigned long flags;
int this_cpu;
local_irq_save(flags);
this_cpu = smp_processor_id();
__send_remote_softirq(cp, cpu, this_cpu, softirq);
local_irq_restore(flags);
}
EXPORT_SYMBOL(send_remote_softirq);
static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
/*
* If a CPU goes away, splice its entries to the current CPU
* and trigger a run of the softirq
*/
if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
int cpu = (unsigned long) hcpu;
int i;
local_irq_disable();
for (i = 0; i < NR_SOFTIRQS; i++) {
struct list_head *head = &per_cpu(softirq_work_list[i], cpu);
struct list_head *local_head;
if (list_empty(head))
continue;
local_head = &__get_cpu_var(softirq_work_list[i]);
list_splice_init(head, local_head);
raise_softirq_irqoff(i);
}
local_irq_enable();
}
return NOTIFY_OK;
}
static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = {
.notifier_call = remote_softirq_cpu_notify,
};
void __init softirq_init(void)
{
int cpu;
for_each_possible_cpu(cpu) {
int i;
per_cpu(tasklet_vec, cpu).tail =
&per_cpu(tasklet_vec, cpu).head;
per_cpu(tasklet_hi_vec, cpu).tail =
&per_cpu(tasklet_hi_vec, cpu).head;
for (i = 0; i < NR_SOFTIRQS; i++)
INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu));
}
register_hotcpu_notifier(&remote_softirq_cpu_notifier);
open_softirq(TASKLET_SOFTIRQ, tasklet_action);
open_softirq(HI_SOFTIRQ, tasklet_hi_action);
}
static int ksoftirqd_should_run(unsigned int cpu)
{
return local_softirq_pending();
}
static void run_ksoftirqd(unsigned int cpu)
{
local_irq_disable();
if (local_softirq_pending()) {
__do_softirq();
rcu_note_context_switch(cpu);
local_irq_enable();
cond_resched();
return;
}
local_irq_enable();
}
#ifdef CONFIG_HOTPLUG_CPU
/*
* tasklet_kill_immediate is called to remove a tasklet which can already be
* scheduled for execution on @cpu.
*
* Unlike tasklet_kill, this function removes the tasklet
* _immediately_, even if the tasklet is in TASKLET_STATE_SCHED state.
*
* When this function is called, @cpu must be in the CPU_DEAD state.
*/
void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu)
{
struct tasklet_struct **i;
BUG_ON(cpu_online(cpu));
BUG_ON(test_bit(TASKLET_STATE_RUN, &t->state));
if (!test_bit(TASKLET_STATE_SCHED, &t->state))
return;
/* CPU is dead, so no lock needed. */
for (i = &per_cpu(tasklet_vec, cpu).head; *i; i = &(*i)->next) {
if (*i == t) {
*i = t->next;
/* If this was the tail element, move the tail ptr */
if (*i == NULL)
per_cpu(tasklet_vec, cpu).tail = i;
return;
}
}
BUG();
}
static void takeover_tasklets(unsigned int cpu)
{
/* CPU is dead, so no lock needed. */
local_irq_disable();
/* Find end, append list for that CPU. */
if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) {
*__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head;
this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail);
per_cpu(tasklet_vec, cpu).head = NULL;
per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head;
}
raise_softirq_irqoff(TASKLET_SOFTIRQ);
if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) {
*__this_cpu_read(tasklet_hi_vec.tail) = per_cpu(tasklet_hi_vec, cpu).head;
__this_cpu_write(tasklet_hi_vec.tail, per_cpu(tasklet_hi_vec, cpu).tail);
per_cpu(tasklet_hi_vec, cpu).head = NULL;
per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head;
}
raise_softirq_irqoff(HI_SOFTIRQ);
local_irq_enable();
}
#endif /* CONFIG_HOTPLUG_CPU */
static int __cpuinit cpu_callback(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
switch (action) {
#ifdef CONFIG_HOTPLUG_CPU
case CPU_DEAD:
case CPU_DEAD_FROZEN:
takeover_tasklets((unsigned long)hcpu);
break;
#endif /* CONFIG_HOTPLUG_CPU */
}
return NOTIFY_OK;
}
static struct notifier_block __cpuinitdata cpu_nfb = {
.notifier_call = cpu_callback
};
static struct smp_hotplug_thread softirq_threads = {
.store = &ksoftirqd,
.thread_should_run = ksoftirqd_should_run,
.thread_fn = run_ksoftirqd,
.thread_comm = "ksoftirqd/%u",
};
static __init int spawn_ksoftirqd(void)
{
register_cpu_notifier(&cpu_nfb);
BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
return 0;
}
early_initcall(spawn_ksoftirqd);
/*
* [ These __weak aliases are kept in a separate compilation unit, so that
* GCC does not inline them incorrectly. ]
*/
int __init __weak early_irq_init(void)
{
return 0;
}
#ifdef CONFIG_GENERIC_HARDIRQS
int __init __weak arch_probe_nr_irqs(void)
{
return NR_IRQS_LEGACY;
}
int __init __weak arch_early_irq_init(void)
{
return 0;
}
#endif