genirq, sched/isolation: Isolate from handling managed interrupts

The affinity of managed interrupts is completely handled in the kernel and
cannot be changed via the /proc/irq/* interfaces from user space. As the
kernel tries to spread out interrupts evenly across CPUs on x86 to prevent
vector exhaustion, it can happen that a managed interrupt whose affinity
mask contains both isolated and housekeeping CPUs is routed to an isolated
CPU. As a consequence, IO submitted on a housekeeping CPU causes interrupts
on the isolated CPU.

Add a new sub-parameter 'managed_irq' for 'isolcpus' and the corresponding
logic in the interrupt affinity selection code.

The sub-parameter indicates to the interrupt affinity selection logic that
it should try to avoid the above scenario.
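
For example, booting with the following line (the CPU range and the
combination with the pre-existing 'domain' flag are purely illustrative)
shields CPUs 2-15 from managed interrupts whose masks span both CPU sets:

  isolcpus=domain,managed_irq,2-15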

This isolation is best effort and only effective if the automatically
assigned interrupt mask of a device queue contains isolated and
housekeeping CPUs. If housekeeping CPUs are online then such interrupts are
directed to the housekeeping CPU so that IO submitted on the housekeeping
CPU cannot disturb the isolated CPU.

If a queue's affinity mask contains only isolated CPUs then this parameter
has no effect on the interrupt routing decision, though interrupts are only
delivered when tasks running on those isolated CPUs submit IO. IO submitted
on housekeeping CPUs has no influence on those queues.

If the affinity mask contains both housekeeping and isolated CPUs, but none
of the contained housekeeping CPUs is online, then the interrupt is also
routed to an isolated CPU. Interrupts are only delivered when one of the
isolated CPUs in the affinity mask submits IO. If one of the contained
housekeeping CPUs comes online, the CPU hotplug logic migrates the
interrupt automatically back to the upcoming housekeeping CPU. Depending on
the type of interrupt controller, this can require that at least one
interrupt is delivered to the isolated CPU in order to complete the
migration.
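
To illustrate the resulting routing decision, here is a minimal userspace
sketch. It is not the kernel code: it uses plain 64-bit integers instead of
struct cpumask, and the masks in main() are hypothetical examples.

  #include <stdint.h>
  #include <stdio.h>

  /* Pick the mask to program: trim to housekeeping CPUs when possible. */
  static uint64_t select_affinity(uint64_t requested, uint64_t housekeeping,
                                  uint64_t online)
  {
          uint64_t trimmed = requested & housekeeping;

          /* No online housekeeping CPU in the requested mask: keep it. */
          if (!(trimmed & online))
                  return requested;

          /* Otherwise route only to the housekeeping CPUs in the mask. */
          return trimmed;
  }

  int main(void)
  {
          uint64_t hk     = 0x0f;  /* housekeeping: CPUs 0-3, isolated: 4-7 */
          uint64_t online = 0xff;  /* CPUs 0-7 online */

          /* Queue mask {3,4} spans both sets -> trimmed to {3}. */
          printf("%#llx\n", (unsigned long long)select_affinity(0x18, hk, online));

          /* Queue mask {4,5} is isolated-only -> kept as requested. */
          printf("%#llx\n", (unsigned long long)select_affinity(0x30, hk, online));
          return 0;
  }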

[ tglx: Removed unused parameter, added and edited comments/documentation
  and rephrased the changelog so it contains more details. ]

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20200120091625.17912-1-ming.lei@redhat.com
commit 11ea68f553 (parent 099368bb10)
Author: Ming Lei, 2020-01-20 17:16:25 +08:00, committed by Thomas Gleixner
5 changed files with 90 additions and 5 deletions

Documentation/admin-guide/kernel-parameters.txt

@@ -1933,10 +1933,32 @@
             <cpu number> begins at 0 and the maximum value is
             "number of CPUs in system - 1".
 
+            managed_irq
+              Isolate from being targeted by managed interrupts
+              which have an interrupt mask containing isolated
+              CPUs. The affinity of managed interrupts is
+              handled by the kernel and cannot be changed via
+              the /proc/irq/* interfaces.
+
+              This isolation is best effort and only effective
+              if the automatically assigned interrupt mask of a
+              device queue contains isolated and housekeeping
+              CPUs. If housekeeping CPUs are online then such
+              interrupts are directed to the housekeeping CPU
+              so that IO submitted on the housekeeping CPU
+              cannot disturb the isolated CPU.
+
+              If a queue's affinity mask contains only isolated
+              CPUs then this parameter has no effect on the
+              interrupt routing decision, though interrupts are
+              only delivered when tasks running on those
+              isolated CPUs submit IO. IO submitted on
+              housekeeping CPUs has no influence on those
+              queues.
+
             The format of <cpu-list> is described above.
 
     iucv=           [HW,NET]
 
     ivrs_ioapic     [HW,X86_64]

include/linux/sched/isolation.h

@@ -13,6 +13,7 @@ enum hk_flags {
         HK_FLAG_TICK            = (1 << 4),
         HK_FLAG_DOMAIN          = (1 << 5),
         HK_FLAG_WQ              = (1 << 6),
+        HK_FLAG_MANAGED_IRQ     = (1 << 7),
 };
 
 #ifdef CONFIG_CPU_ISOLATION

kernel/irq/cpuhotplug.c

@@ -12,6 +12,7 @@
 #include <linux/interrupt.h>
 #include <linux/ratelimit.h>
 #include <linux/irq.h>
+#include <linux/sched/isolation.h>
 
 #include "internals.h"
 
@@ -171,6 +172,20 @@ void irq_migrate_all_off_this_cpu(void)
         }
 }
 
+static bool hk_should_isolate(struct irq_data *data, unsigned int cpu)
+{
+        const struct cpumask *hk_mask;
+
+        if (!housekeeping_enabled(HK_FLAG_MANAGED_IRQ))
+                return false;
+
+        hk_mask = housekeeping_cpumask(HK_FLAG_MANAGED_IRQ);
+        if (cpumask_subset(irq_data_get_effective_affinity_mask(data), hk_mask))
+                return false;
+
+        return cpumask_test_cpu(cpu, hk_mask);
+}
+
 static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu)
 {
         struct irq_data *data = irq_desc_get_irq_data(desc);
@@ -188,9 +203,11 @@ static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu)
         /*
          * If the interrupt can only be directed to a single target
          * CPU then it is already assigned to a CPU in the affinity
-         * mask. No point in trying to move it around.
+         * mask. No point in trying to move it around unless the
+         * isolation mechanism requests to move it to an upcoming
+         * housekeeping CPU.
          */
-        if (!irqd_is_single_target(data))
+        if (!irqd_is_single_target(data) || hk_should_isolate(data, cpu))
                 irq_set_affinity_locked(data, affinity, false);
 }

kernel/irq/manage.c

@@ -18,6 +18,7 @@
 #include <linux/sched.h>
 #include <linux/sched/rt.h>
 #include <linux/sched/task.h>
+#include <linux/sched/isolation.h>
 #include <uapi/linux/sched/types.h>
 #include <linux/task_work.h>
 
@@ -217,7 +218,45 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
         if (!chip || !chip->irq_set_affinity)
                 return -EINVAL;
 
-        ret = chip->irq_set_affinity(data, mask, force);
+        /*
+         * If this is a managed interrupt and housekeeping is enabled on
+         * it check whether the requested affinity mask intersects with
+         * a housekeeping CPU. If so, then remove the isolated CPUs from
+         * the mask and just keep the housekeeping CPU(s). This prevents
+         * the affinity setter from routing the interrupt to an isolated
+         * CPU to avoid that I/O submitted from a housekeeping CPU causes
+         * interrupts on an isolated one.
+         *
+         * If the masks do not intersect or include online CPU(s) then
+         * keep the requested mask. The isolated target CPUs are only
+         * receiving interrupts when the I/O operation was submitted
+         * directly from them.
+         *
+         * If all housekeeping CPUs in the affinity mask are offline, the
+         * interrupt will be migrated by the CPU hotplug code once a
+         * housekeeping CPU which belongs to the affinity mask comes
+         * online.
+         */
+        if (irqd_affinity_is_managed(data) &&
+            housekeeping_enabled(HK_FLAG_MANAGED_IRQ)) {
+                const struct cpumask *hk_mask, *prog_mask;
+
+                static DEFINE_RAW_SPINLOCK(tmp_mask_lock);
+                static struct cpumask tmp_mask;
+
+                hk_mask = housekeeping_cpumask(HK_FLAG_MANAGED_IRQ);
+
+                raw_spin_lock(&tmp_mask_lock);
+                cpumask_and(&tmp_mask, mask, hk_mask);
+                if (!cpumask_intersects(&tmp_mask, cpu_online_mask))
+                        prog_mask = mask;
+                else
+                        prog_mask = &tmp_mask;
+                ret = chip->irq_set_affinity(data, prog_mask, force);
+                raw_spin_unlock(&tmp_mask_lock);
+        } else {
+                ret = chip->irq_set_affinity(data, mask, force);
+        }
         switch (ret) {
         case IRQ_SET_MASK_OK:
         case IRQ_SET_MASK_OK_DONE:

kernel/sched/isolation.c

@@ -163,6 +163,12 @@ static int __init housekeeping_isolcpus_setup(char *str)
                         continue;
                 }
 
+                if (!strncmp(str, "managed_irq,", 12)) {
+                        str += 12;
+                        flags |= HK_FLAG_MANAGED_IRQ;
+                        continue;
+                }
+
                 pr_warn("isolcpus: Error, unknown flag\n");
                 return 0;
         }
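
On a running system the effect can be observed through the standard procfs
affinity files. The IRQ number and the masks below are hypothetical, and
effective_affinity_list is only present when the architecture exposes the
effective affinity mask; with housekeeping CPUs 0-3 and a managed queue
assigned CPUs 3-4, one would expect something like:

  # cat /proc/irq/124/smp_affinity_list
  3-4
  # cat /proc/irq/124/effective_affinity_list
  3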