sched/core: Optimize __schedule()

Oleg noted that by making do_exit() use __schedule() for the TASK_DEAD
context switch, we can avoid the TASK_DEAD special case currently in
__schedule() because that avoids the extra preempt_disable() from
schedule().

In order to facilitate this, create a do_task_dead() helper which we
place in the scheduler code, such that it can access __schedule().

Also add some __noreturn annotations to the functions; there's no
coming back from do_exit().

Suggested-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Cheng Chao <cs.os.kernel@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: akpm@linux-foundation.org
Cc: chris@chris-wilson.co.uk
Cc: tj@kernel.org
Link: http://lkml.kernel.org/r/20160913163729.GB5012@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
Peter Zijlstra 2016-09-13 18:37:29 +02:00 committed by Ingo Molnar
parent bf89a30472
commit 9af6528ee9
4 changed files with 34 additions and 41 deletions

View File

@ -259,17 +259,14 @@ static inline void might_fault(void) { }
extern struct atomic_notifier_head panic_notifier_list; extern struct atomic_notifier_head panic_notifier_list;
extern long (*panic_blink)(int state); extern long (*panic_blink)(int state);
__printf(1, 2) __printf(1, 2)
void panic(const char *fmt, ...) void panic(const char *fmt, ...) __noreturn __cold;
__noreturn __cold;
void nmi_panic(struct pt_regs *regs, const char *msg); void nmi_panic(struct pt_regs *regs, const char *msg);
extern void oops_enter(void); extern void oops_enter(void);
extern void oops_exit(void); extern void oops_exit(void);
void print_oops_end_marker(void); void print_oops_end_marker(void);
extern int oops_may_print(void); extern int oops_may_print(void);
void do_exit(long error_code) void do_exit(long error_code) __noreturn;
__noreturn; void complete_and_exit(struct completion *, long) __noreturn;
void complete_and_exit(struct completion *, long)
__noreturn;
/* Internal, do not use. */ /* Internal, do not use. */
int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res); int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res);

View File

@ -448,6 +448,8 @@ static inline void io_schedule(void)
io_schedule_timeout(MAX_SCHEDULE_TIMEOUT); io_schedule_timeout(MAX_SCHEDULE_TIMEOUT);
} }
void __noreturn do_task_dead(void);
struct nsproxy; struct nsproxy;
struct user_namespace; struct user_namespace;

View File

@ -725,7 +725,7 @@ static void check_stack_usage(void)
static inline void check_stack_usage(void) {} static inline void check_stack_usage(void) {}
#endif #endif
void do_exit(long code) void __noreturn do_exit(long code)
{ {
struct task_struct *tsk = current; struct task_struct *tsk = current;
int group_dead; int group_dead;
@ -882,29 +882,7 @@ void do_exit(long code)
exit_rcu(); exit_rcu();
TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i));
/* do_task_dead();
* The setting of TASK_RUNNING by try_to_wake_up() may be delayed
* when the following two conditions become true.
* - There is race condition of mmap_sem (It is acquired by
* exit_mm()), and
* - SMI occurs before setting TASK_RUNNING.
* (or hypervisor of virtual machine switches to other guest)
* As a result, we may become TASK_RUNNING after becoming TASK_DEAD
*
* To avoid it, we have to wait for releasing tsk->pi_lock which
* is held by try_to_wake_up()
*/
smp_mb();
raw_spin_unlock_wait(&tsk->pi_lock);
/* causes final put_task_struct in finish_task_switch(). */
tsk->state = TASK_DEAD;
tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
schedule();
BUG();
/* Avoid "noreturn function does return". */
for (;;)
cpu_relax(); /* For when BUG is null */
} }
EXPORT_SYMBOL_GPL(do_exit); EXPORT_SYMBOL_GPL(do_exit);

View File

@ -3331,17 +3331,6 @@ static void __sched notrace __schedule(bool preempt)
rq = cpu_rq(cpu); rq = cpu_rq(cpu);
prev = rq->curr; prev = rq->curr;
/*
* do_exit() calls schedule() with preemption disabled as an exception;
* however we must fix that up, otherwise the next task will see an
* inconsistent (higher) preempt count.
*
* It also avoids the below schedule_debug() test from complaining
* about this.
*/
if (unlikely(prev->state == TASK_DEAD))
preempt_enable_no_resched_notrace();
schedule_debug(prev); schedule_debug(prev);
if (sched_feat(HRTICK)) if (sched_feat(HRTICK))
@ -3409,6 +3398,33 @@ static void __sched notrace __schedule(bool preempt)
} }
STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */ STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */
/*
* do_task_dead - mark the current task TASK_DEAD and switch away from it
* for the last time.
*
* Called at the very end of do_exit(). It lives next to __schedule() so
* that it can call __schedule() directly instead of going through
* schedule(). This function never returns.
*/
void __noreturn do_task_dead(void)
{
/*
* The setting of TASK_RUNNING by try_to_wake_up() may be delayed
* when both of the following conditions hold:
* - there is a race on mmap_sem (which is acquired by exit_mm()),
* and
* - an SMI occurs before TASK_RUNNING is set (or the hypervisor of
* a virtual machine switches to another guest).
* As a result, we may become TASK_RUNNING after becoming TASK_DEAD.
*
* To avoid that, wait for current->pi_lock, which is held by
* try_to_wake_up(), to be released before changing our state.
*/
smp_mb();
raw_spin_unlock_wait(&current->pi_lock);
/* causes final put_task_struct in finish_task_switch(). */
__set_current_state(TASK_DEAD);
current->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
/*
* Call __schedule() directly rather than schedule(): this avoids the
* extra preempt_disable() that schedule() does, so __schedule() needs
* no TASK_DEAD special case to fix up the preempt count. The argument
* is __schedule()'s 'preempt' flag.
*/
__schedule(false);
/* A TASK_DEAD task is not expected to run again; getting here is a bug. */
BUG();
/* Avoid "noreturn function does return". */
for (;;)
cpu_relax(); /* For when BUG is null */
}
static inline void sched_submit_work(struct task_struct *tsk) static inline void sched_submit_work(struct task_struct *tsk)
{ {
if (!tsk->state || tsk_is_pi_blocked(tsk)) if (!tsk->state || tsk_is_pi_blocked(tsk))