sched: Fix /proc/sched_debug failure on very very large systems

On systems with 4096 cores attemping to read /proc/sched_debug fails because we are trying to push all the data into a single kmalloc buffer. The issue is on these very large machines all the data will not fit in 4mb. A better solution is to not us the single_open mechanism but to provide our own seq_operations and treat each cpu as an individual record. The output should be identical to the previous version. Reported-by: Dave Jones <davej@redhat.com> Signed-off-by: Nathan Zimmer <nzimmer@sgi.com> Cc: Peter Zijlstra <peterz@infradead.org>) [ Whitespace fixlet] [ Fix spello in comment] Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-02-21 15:15:09 -08:00 · 2013-02-21 15:15:09 -08:00 · bbbfeac92b
commit bbbfeac92b
parent cb152ff267
1 changed files with 80 additions and 12 deletions
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@ -269,11 +269,11 @@ static void print_cpu(struct seq_file *m, int cpu)
 	{
 		unsigned int freq = cpu_khz ? : 1;
-		SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n",
+		SEQ_printf(m, "cpu#%d, %u.%03u MHz\n",
 			   cpu, freq / 1000, (freq % 1000));
 	}
 #else
-	SEQ_printf(m, "\ncpu#%d\n", cpu);
+	SEQ_printf(m, "cpu#%d\n", cpu);
 #endif
 #define P(x)								\
@ -330,6 +330,7 @@ do {									\
 	print_rq(m, rq, cpu);
 	rcu_read_unlock();
 	spin_unlock_irqrestore(&sched_debug_lock, flags);
 	SEQ_printf(m, "\n");
 }
 static const char *sched_tunable_scaling_names[] = {
@ -338,11 +339,10 @@ static const char *sched_tunable_scaling_names[] = {
 	"linear"
 };
-static int sched_debug_show(struct seq_file *m, void *v)
+static void sched_debug_header(struct seq_file *m)
 {
 	u64 ktime, sched_clk, cpu_clk;
 	unsigned long flags;
 	int cpu;
 	local_irq_save(flags);
 	ktime = ktime_to_ns(ktime_get());
@ -384,33 +384,101 @@ static int sched_debug_show(struct seq_file *m, void *v)
 #undef PN
 #undef P
-	SEQ_printf(m, "  .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling",
+	SEQ_printf(m, "  .%-40s: %d (%s)\n",
 		"sysctl_sched_tunable_scaling",
 		sysctl_sched_tunable_scaling,
 		sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
 	for_each_online_cpu(cpu)
 		print_cpu(m, cpu);
 	SEQ_printf(m, "\n");
 }
 static int sched_debug_show(struct seq_file *m, void *v)
 {
 	int cpu = (unsigned long)(v - 2);
 	if (cpu != -1)
 		print_cpu(m, cpu);
 	else
 		sched_debug_header(m);
 	return 0;
 }
 void sysrq_sched_debug_show(void)
 {
-	sched_debug_show(NULL, NULL);
+	int cpu;
 	sched_debug_header(NULL);
 	for_each_online_cpu(cpu)
 		print_cpu(NULL, cpu);
 }
 /*
 * This itererator needs some explanation.
 * It returns 1 for the header position.
 * This means 2 is cpu 0.
 * In a hotplugged system some cpus, including cpu 0, may be missing so we have
 * to use cpumask_* to iterate over the cpus.
 */
 static void *sched_debug_start(struct seq_file *file, loff_t *offset)
 {
 	unsigned long n = *offset;
 	if (n == 0)
 		return (void *) 1;
 	n--;
 	if (n > 0)
 		n = cpumask_next(n - 1, cpu_online_mask);
 	else
 		n = cpumask_first(cpu_online_mask);
 	*offset = n + 1;
 	if (n < nr_cpu_ids)
 		return (void *)(unsigned long)(n + 2);
 	return NULL;
 }
 static void *sched_debug_next(struct seq_file *file, void *data, loff_t *offset)
 {
 	(*offset)++;
 	return sched_debug_start(file, offset);
 }
 static void sched_debug_stop(struct seq_file *file, void *data)
 {
 }
 static const struct seq_operations sched_debug_sops = {
 	.start = sched_debug_start,
 	.next = sched_debug_next,
 	.stop = sched_debug_stop,
 	.show = sched_debug_show,
 };
 static int sched_debug_release(struct inode *inode, struct file *file)
 {
 	seq_release(inode, file);
 	return 0;
 }
 static int sched_debug_open(struct inode *inode, struct file *filp)
 {
-	return single_open(filp, sched_debug_show, NULL);
+	int ret = 0;
 	ret = seq_open(filp, &sched_debug_sops);
 	return ret;
 }
 static const struct file_operations sched_debug_fops = {
 	.open		= sched_debug_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= single_release,
+	.release	= sched_debug_release,
 };
 static int __init init_sched_debug_procfs(void)