sched, cputime: Introduce thread_group_times()
This is a real fix for the problem of utime/stime values decreasing,
described in the thread:

  http://lkml.org/lkml/2009/11/3/522

Currently cputime is accounted in the following way:

 - {u,s}time in task_struct are increased every time the thread is
   interrupted by a tick (timer interrupt).

 - When a thread exits, its {u,s}time are added to signal->{u,s}time,
   after being adjusted by task_times().

 - When all threads in a thread group have exited, the accumulated
   {u,s}time (and also c{u,s}time) in the signal struct are added to
   c{u,s}time in the signal struct of the group's parent.

So {u,s}time in the task struct are "raw" tick counts, while {u,s}time
and c{u,s}time in the signal struct are "adjusted" values.

The accounted values are used by:

 - task_times(), to get the cputime of a thread:
   This function returns adjusted values that originate from the raw
   {u,s}time, scaled by the sum_exec_runtime accounted by CFS.

 - thread_group_cputime(), to get the cputime of a thread group:
   This function returns the sum of the {u,s}time of all living threads
   in the group, plus the {u,s}time in the signal struct, which is the
   sum of the adjusted cputimes of all exited threads that belonged to
   the group.

The problem is the return value of thread_group_cputime(), because it
is a mixed sum of "raw" and "adjusted" values:

  group's {u,s}time = foreach(thread){{u,s}time} + exited({u,s}time)

This misbehavior can break {u,s}time monotonicity.  Assume there is a
thread whose raw values are greater than its adjusted values (e.g. it
was interrupted by 1000Hz ticks 50 times but only ran for 45ms); when
it exits, the group's cputime will decrease (e.g. by -5ms).

To fix this, we could do:

  group's {u,s}time = foreach(t){task_times(t)} + exited({u,s}time)

But task_times() contains hard divisions, so applying it to every
thread should be avoided.

This patch fixes the above problem in the following way:

 - Modify thread exit (= __exit_signal()) not to use task_times().
   This means {u,s}time in the signal struct accumulate raw values
   instead of adjusted values.  As a result, thread_group_cputime()
   returns a pure sum of "raw" values.

 - Introduce a new function thread_group_times(*task, *utime, *stime)
   that converts the "raw" values of thread_group_cputime() to
   "adjusted" values, using the same calculation procedure as
   task_times().

 - Modify group exit (= wait_task_zombie()) to use the newly introduced
   thread_group_times().  This makes c{u,s}time in the signal struct
   hold adjusted values, as before this patch.

 - Replace some thread_group_cputime() calls with thread_group_times().
   These replacements are only applied where the "adjusted" cputime is
   conveyed to users and where task_times() is already used nearby
   (i.e. sys_times(), getrusage(), and /proc/<PID>/stat).

This patch has a positive side effect:

 - Before this patch, if a group contained many short-lived threads
   (e.g. each running 0.9ms and never interrupted by a tick), the
   group's cputime could be invisible, since each thread's cputime was
   accumulated after being adjusted: imagining the adjustment function
   as adj(ticks, runtime),

     {adj(0, 0.9) + adj(0, 0.9) + ....} = {0 + 0 + ....} = 0.

   After this patch this no longer happens, because the adjustment is
   applied after accumulation.

v2:
 - remove if()s, put new variables into signal_struct.
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Spencer Candland <spencer@bluehost.com>
Cc: Americo Wang <xiyou.wangcong@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
LKML-Reference: <4B162517.8040909@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
commit 0cf55e1ec0
parent d99ca3b977
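Before reading the diff, here is a minimal user-space sketch of the adjustment that thread_group_times() performs: raw tick counts are rescaled by the scheduler-accounted runtime, and the reported values are kept monotonic through the new prev_utime/prev_stime fields. This is only an illustration, not kernel code; the names group_times and adjust_group_times() and the plain integer arithmetic are made up for the example.

/*
 * Illustrative user-space sketch of the thread_group_times() adjustment.
 * (Assumed names: group_times, adjust_group_times; not part of the patch.)
 */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t cputime_t;

struct group_times {
	cputime_t prev_utime, prev_stime;	/* mirrors the new signal_struct fields */
};

/*
 * Scale the raw tick-based utime/stime so that their sum matches the
 * scheduler-accounted runtime, and never let the reported values go
 * backwards (rtime is assumed non-decreasing across calls).
 */
static void adjust_group_times(struct group_times *g,
			       cputime_t raw_utime, cputime_t raw_stime,
			       cputime_t rtime,
			       cputime_t *ut, cputime_t *st)
{
	cputime_t total = raw_utime + raw_stime;
	cputime_t utime = total ? (rtime * raw_utime) / total : rtime;

	if (utime > g->prev_utime)
		g->prev_utime = utime;
	if (rtime - g->prev_utime > g->prev_stime)
		g->prev_stime = rtime - g->prev_utime;

	*ut = g->prev_utime;
	*st = g->prev_stime;
}

int main(void)
{
	struct group_times g = { 0, 0 };
	cputime_t ut, st;

	/* the changelog's example: 50 raw utime ticks, but only 45 ticks of runtime */
	adjust_group_times(&g, 50, 0, 45, &ut, &st);
	printf("utime=%llu stime=%llu\n",
	       (unsigned long long)ut, (unsigned long long)st);	/* utime=45 stime=0 */
	return 0;
}

With the changelog's numbers it reports utime=45, stime=0, and repeated calls can never report less than a previous call did.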
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -506,7 +506,6 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 
 		/* add up live thread stats at the group level */
 		if (whole) {
-			struct task_cputime cputime;
 			struct task_struct *t = task;
 			do {
 				min_flt += t->min_flt;
@@ -517,9 +516,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 
 			min_flt += sig->min_flt;
 			maj_flt += sig->maj_flt;
-			thread_group_cputime(task, &cputime);
-			utime = cputime.utime;
-			stime = cputime.stime;
+			thread_group_times(task, &utime, &stime);
 			gtime = cputime_add(gtime, sig->gtime);
 		}
 
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -624,6 +624,9 @@ struct signal_struct {
 	cputime_t utime, stime, cutime, cstime;
 	cputime_t gtime;
 	cputime_t cgtime;
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+	cputime_t prev_utime, prev_stime;
+#endif
 	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
 	unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
 	unsigned long inblock, oublock, cinblock, coublock;
@@ -1723,6 +1726,7 @@ static inline void put_task_struct(struct task_struct *t)
 }
 
 extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
+extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
 
 /*
  * Per process flags
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -91,8 +91,6 @@ static void __exit_signal(struct task_struct *tsk)
 	if (atomic_dec_and_test(&sig->count))
 		posix_cpu_timers_exit_group(tsk);
 	else {
-		cputime_t utime, stime;
-
 		/*
 		 * If there is any task waiting for the group exit
 		 * then notify it:
@@ -112,9 +110,8 @@ static void __exit_signal(struct task_struct *tsk)
 		 * We won't ever get here for the group leader, since it
 		 * will have been the last reference on the signal_struct.
 		 */
-		task_times(tsk, &utime, &stime);
-		sig->utime = cputime_add(sig->utime, utime);
-		sig->stime = cputime_add(sig->stime, stime);
+		sig->utime = cputime_add(sig->utime, tsk->utime);
+		sig->stime = cputime_add(sig->stime, tsk->stime);
 		sig->gtime = cputime_add(sig->gtime, tsk->gtime);
 		sig->min_flt += tsk->min_flt;
 		sig->maj_flt += tsk->maj_flt;
@@ -1208,6 +1205,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 		struct signal_struct *psig;
 		struct signal_struct *sig;
 		unsigned long maxrss;
+		cputime_t tgutime, tgstime;
 
 		/*
 		 * The resource counters for the group leader are in its
@@ -1223,20 +1221,23 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 		 * need to protect the access to parent->signal fields,
 		 * as other threads in the parent group can be right
 		 * here reaping other children at the same time.
+		 *
+		 * We use thread_group_times() to get times for the thread
+		 * group, which consolidates times for all threads in the
+		 * group including the group leader.
 		 */
+		thread_group_times(p, &tgutime, &tgstime);
 		spin_lock_irq(&p->real_parent->sighand->siglock);
 		psig = p->real_parent->signal;
 		sig = p->signal;
 		psig->cutime =
 			cputime_add(psig->cutime,
-			cputime_add(p->utime,
-			cputime_add(sig->utime,
-				    sig->cutime)));
+			cputime_add(tgutime,
+				    sig->cutime));
 		psig->cstime =
 			cputime_add(psig->cstime,
-			cputime_add(p->stime,
-			cputime_add(sig->stime,
-				    sig->cstime)));
+			cputime_add(tgstime,
+				    sig->cstime));
 		psig->cgtime =
 			cputime_add(psig->cgtime,
 			cputime_add(p->gtime,
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -884,6 +884,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
 	sig->gtime = cputime_zero;
 	sig->cgtime = cputime_zero;
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+	sig->prev_utime = sig->prev_stime = cputime_zero;
+#endif
 	sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
 	sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
 	sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5187,6 +5187,16 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 	*ut = p->utime;
 	*st = p->stime;
 }
+
+void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+	struct task_cputime cputime;
+
+	thread_group_cputime(p, &cputime);
+
+	*ut = cputime.utime;
+	*st = cputime.stime;
+}
 #else
 
 #ifndef nsecs_to_cputime
@@ -5220,6 +5230,37 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 	*ut = p->prev_utime;
 	*st = p->prev_stime;
 }
+
+/*
+ * Must be called with siglock held.
+ */
+void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+	struct signal_struct *sig = p->signal;
+	struct task_cputime cputime;
+	cputime_t rtime, utime, total;
+
+	thread_group_cputime(p, &cputime);
+
+	total = cputime_add(cputime.utime, cputime.stime);
+	rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
+
+	if (total) {
+		u64 temp;
+
+		temp = (u64)(rtime * cputime.utime);
+		do_div(temp, total);
+		utime = (cputime_t)temp;
+	} else
+		utime = rtime;
+
+	sig->prev_utime = max(sig->prev_utime, utime);
+	sig->prev_stime = max(sig->prev_stime,
+			      cputime_sub(rtime, sig->prev_utime));
+
+	*ut = sig->prev_utime;
+	*st = sig->prev_stime;
+}
 #endif
 
 /*
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -911,16 +911,15 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
 
 void do_sys_times(struct tms *tms)
 {
-	struct task_cputime cputime;
-	cputime_t cutime, cstime;
+	cputime_t tgutime, tgstime, cutime, cstime;
 
-	thread_group_cputime(current, &cputime);
 	spin_lock_irq(&current->sighand->siglock);
+	thread_group_times(current, &tgutime, &tgstime);
 	cutime = current->signal->cutime;
 	cstime = current->signal->cstime;
 	spin_unlock_irq(&current->sighand->siglock);
-	tms->tms_utime = cputime_to_clock_t(cputime.utime);
-	tms->tms_stime = cputime_to_clock_t(cputime.stime);
+	tms->tms_utime = cputime_to_clock_t(tgutime);
+	tms->tms_stime = cputime_to_clock_t(tgstime);
 	tms->tms_cutime = cputime_to_clock_t(cutime);
 	tms->tms_cstime = cputime_to_clock_t(cstime);
 }
@@ -1338,8 +1337,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 {
 	struct task_struct *t;
 	unsigned long flags;
-	cputime_t utime, stime;
-	struct task_cputime cputime;
+	cputime_t tgutime, tgstime, utime, stime;
 	unsigned long maxrss = 0;
 
 	memset((char *) r, 0, sizeof *r);
@@ -1372,9 +1370,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 		break;
 
 	case RUSAGE_SELF:
-		thread_group_cputime(p, &cputime);
-		utime = cputime_add(utime, cputime.utime);
-		stime = cputime_add(stime, cputime.stime);
+		thread_group_times(p, &tgutime, &tgstime);
+		utime = cputime_add(utime, tgutime);
+		stime = cputime_add(stime, tgstime);
 		r->ru_nvcsw += p->signal->nvcsw;
 		r->ru_nivcsw += p->signal->nivcsw;
 		r->ru_minflt += p->signal->min_flt;
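The kernel/sys.c changes are the paths user space reaches through times(2) and getrusage(2). As a quick way to observe the group-level values those syscalls now derive from thread_group_times(), a small stand-alone program (not part of the patch) could look like this:

/* Stand-alone check of the values reported via do_sys_times() and
 * k_getrusage(); not part of the patch. */
#include <stdio.h>
#include <sys/resource.h>
#include <sys/times.h>
#include <unistd.h>

int main(void)
{
	struct tms t;
	struct rusage ru;
	long hz = sysconf(_SC_CLK_TCK);

	times(&t);
	getrusage(RUSAGE_SELF, &ru);

	printf("times():     utime=%ld stime=%ld (in ticks, %ld per second)\n",
	       (long)t.tms_utime, (long)t.tms_stime, hz);
	printf("getrusage(): utime=%ld.%06ld stime=%ld.%06ld (seconds)\n",
	       (long)ru.ru_utime.tv_sec, (long)ru.ru_utime.tv_usec,
	       (long)ru.ru_stime.tv_sec, (long)ru.ru_stime.tv_usec);
	return 0;
}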