From b396bfdebffcc05a855137775e38f4652cbca454 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Wed, 12 Feb 2020 12:21:03 -0500
Subject: [PATCH 01/27] tracing: Have hwlat ts be first instance and record
 count of instances

The hwlat tracer runs a loop of width time during a given window. It then
reports the max latency over a given threshold and records a timestamp. But
this timestamp is the time after the width has finished, and not the time it
actually triggered.

Record the actual time when the latency was greater than the threshold as
well as the number of times it was greater in a given width per window.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 Documentation/trace/ftrace.rst | 32 ++++++++++++++++++++++----------
 kernel/trace/trace_entries.h   |  4 +++-
 kernel/trace/trace_hwlat.c     | 24 +++++++++++++++++-------
 kernel/trace/trace_output.c    |  4 ++--
 4 files changed, 44 insertions(+), 20 deletions(-)

diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst
index ff658e27d25b..99a0890e20ec 100644
--- a/Documentation/trace/ftrace.rst
+++ b/Documentation/trace/ftrace.rst
@@ -2126,6 +2126,8 @@ periodically make a CPU constantly busy with interrupts disabled.
   # cat trace
   # tracer: hwlat
   #
+  # entries-in-buffer/entries-written: 13/13   #P:8
+  #
   #                              _-----=> irqs-off
   #                             / _----=> need-resched
   #                            | / _---=> hardirq/softirq
@@ -2133,12 +2135,18 @@ periodically make a CPU constantly busy with interrupts disabled.
   #                            ||| /     delay
   #           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
   #              | |       |   ||||       |         |
-             <...>-3638  [001] d... 19452.055471: #1     inner/outer(us):   12/14    ts:1499801089.066141940
-             <...>-3638  [003] d... 19454.071354: #2     inner/outer(us):   11/9     ts:1499801091.082164365
-             <...>-3638  [002] dn.. 19461.126852: #3     inner/outer(us):   12/9     ts:1499801098.138150062
-             <...>-3638  [001] d... 19488.340960: #4     inner/outer(us):    8/12    ts:1499801125.354139633
-             <...>-3638  [003] d... 19494.388553: #5     inner/outer(us):    8/12    ts:1499801131.402150961
-             <...>-3638  [003] d... 19501.283419: #6     inner/outer(us):    0/12    ts:1499801138.297435289 nmi-total:4 nmi-count:1
+             <...>-1729  [001] d...   678.473449: #1     inner/outer(us):   11/12    ts:1581527483.343962693 count:6
+             <...>-1729  [004] d...   689.556542: #2     inner/outer(us):   16/9     ts:1581527494.889008092 count:1
+             <...>-1729  [005] d...   714.756290: #3     inner/outer(us):   16/16    ts:1581527519.678961629 count:5
+             <...>-1729  [001] d...   718.788247: #4     inner/outer(us):    9/17    ts:1581527523.889012713 count:1
+             <...>-1729  [002] d...   719.796341: #5     inner/outer(us):   13/9     ts:1581527524.912872606 count:1
+             <...>-1729  [006] d...   844.787091: #6     inner/outer(us):    9/12    ts:1581527649.889048502 count:2
+             <...>-1729  [003] d...   849.827033: #7     inner/outer(us):   18/9     ts:1581527654.889013793 count:1
+             <...>-1729  [007] d...   853.859002: #8     inner/outer(us):    9/12    ts:1581527658.889065736 count:1
+             <...>-1729  [001] d...   855.874978: #9     inner/outer(us):    9/11    ts:1581527660.861991877 count:1
+             <...>-1729  [001] d...   863.938932: #10    inner/outer(us):    9/11    ts:1581527668.970010500 count:1 nmi-total:7 nmi-count:1
+             <...>-1729  [007] d...   878.050780: #11    inner/outer(us):    9/12    ts:1581527683.385002600 count:1 nmi-total:5 nmi-count:1
+             <...>-1729  [007] d...   886.114702: #12    inner/outer(us):    9/12    ts:1581527691.385001600 count:1
 
 
 The above output is somewhat the same in the header. All events will have
@@ -2148,7 +2156,7 @@ interrupts disabled 'd'. Under the FUNCTION title there is:
 	This is the count of events recorded that were greater than the
 	tracing_threshold (See below).
 
- inner/outer(us):   12/14
+ inner/outer(us):   11/11
 
       This shows two numbers as "inner latency" and "outer latency". The test
       runs in a loop checking a timestamp twice. The latency detected within
@@ -2156,11 +2164,15 @@ interrupts disabled 'd'. Under the FUNCTION title there is:
       after the previous timestamp and the next timestamp in the loop is
       the "outer latency".
 
- ts:1499801089.066141940
+ ts:1581527483.343962693
 
-      The absolute timestamp that the event happened.
+      The absolute timestamp that the first latency was recorded in the window.
 
- nmi-total:4 nmi-count:1
+ count:6
+
+      The number of times a latency was detected during the window.
+
+ nmi-total:7 nmi-count:1
 
       On architectures that support it, if an NMI comes in during the
       test, the time spent in NMI is reported in "nmi-total" (in
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index f22746f3c132..a523da0dae0a 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -325,14 +325,16 @@ FTRACE_ENTRY(hwlat, hwlat_entry,
 		__field_desc(	long,	timestamp,	tv_nsec		)
 		__field(	unsigned int,		nmi_count	)
 		__field(	unsigned int,		seqnum		)
+		__field(	unsigned int,		count		)
 	),
 
-	F_printk("cnt:%u\tts:%010llu.%010lu\tinner:%llu\touter:%llu\tnmi-ts:%llu\tnmi-count:%u\n",
+	F_printk("cnt:%u\tts:%010llu.%010lu\tinner:%llu\touter:%llu\tcount:%d\tnmi-ts:%llu\tnmi-count:%u\n",
 		 __entry->seqnum,
 		 __entry->tv_sec,
 		 __entry->tv_nsec,
 		 __entry->duration,
 		 __entry->outer_duration,
+		 __entry->count,
 		 __entry->nmi_total_ts,
 		 __entry->nmi_count)
 );
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index a48808c43249..e2be7bb7ef7e 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -83,6 +83,7 @@ struct hwlat_sample {
 	u64			nmi_total_ts;	/* Total time spent in NMIs */
 	struct timespec64	timestamp;	/* wall time */
 	int			nmi_count;	/* # NMIs during this sample */
+	int			count;		/* # of iteratons over threash */
 };
 
 /* keep the global state somewhere. */
@@ -124,6 +125,7 @@ static void trace_hwlat_sample(struct hwlat_sample *sample)
 	entry->timestamp		= sample->timestamp;
 	entry->nmi_total_ts		= sample->nmi_total_ts;
 	entry->nmi_count		= sample->nmi_count;
+	entry->count			= sample->count;
 
 	if (!call_filter_check_discard(call, entry, buffer, event))
 		trace_buffer_unlock_commit_nostack(buffer, event);
@@ -167,12 +169,14 @@ void trace_hwlat_callback(bool enter)
 static int get_sample(void)
 {
 	struct trace_array *tr = hwlat_trace;
+	struct hwlat_sample s;
 	time_type start, t1, t2, last_t2;
-	s64 diff, total, last_total = 0;
+	s64 diff, outer_diff, total, last_total = 0;
 	u64 sample = 0;
 	u64 thresh = tracing_thresh;
 	u64 outer_sample = 0;
 	int ret = -1;
+	unsigned int count = 0;
 
 	do_div(thresh, NSEC_PER_USEC); /* modifies interval value */
 
@@ -186,6 +190,7 @@ static int get_sample(void)
 
 	init_time(last_t2, 0);
 	start = time_get(); /* start timestamp */
+	outer_diff = 0;
 
 	do {
 
@@ -194,14 +199,14 @@ static int get_sample(void)
 
 		if (time_u64(last_t2)) {
 			/* Check the delta from outer loop (t2 to next t1) */
-			diff = time_to_us(time_sub(t1, last_t2));
+			outer_diff = time_to_us(time_sub(t1, last_t2));
 			/* This shouldn't happen */
-			if (diff < 0) {
+			if (outer_diff < 0) {
 				pr_err(BANNER "time running backwards\n");
 				goto out;
 			}
-			if (diff > outer_sample)
-				outer_sample = diff;
+			if (outer_diff > outer_sample)
+				outer_sample = outer_diff;
 		}
 		last_t2 = t2;
 
@@ -217,6 +222,12 @@ static int get_sample(void)
 		/* This checks the inner loop (t1 to t2) */
 		diff = time_to_us(time_sub(t2, t1));     /* current diff */
 
+		if (diff > thresh || outer_diff > thresh) {
+			if (!count)
+				ktime_get_real_ts64(&s.timestamp);
+			count++;
+		}
+
 		/* This shouldn't happen */
 		if (diff < 0) {
 			pr_err(BANNER "time running backwards\n");
@@ -236,7 +247,6 @@ static int get_sample(void)
 
 	/* If we exceed the threshold value, we have found a hardware latency */
 	if (sample > thresh || outer_sample > thresh) {
-		struct hwlat_sample s;
 		u64 latency;
 
 		ret = 1;
@@ -249,9 +259,9 @@ static int get_sample(void)
 		s.seqnum = hwlat_data.count;
 		s.duration = sample;
 		s.outer_duration = outer_sample;
-		ktime_get_real_ts64(&s.timestamp);
 		s.nmi_total_ts = nmi_total_ts;
 		s.nmi_count = nmi_count;
+		s.count = count;
 		trace_hwlat_sample(&s);
 
 		latency = max(sample, outer_sample);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index b4909082f6a4..e25a7da79c6b 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1158,12 +1158,12 @@ trace_hwlat_print(struct trace_iterator *iter, int flags,
 
 	trace_assign_type(field, entry);
 
-	trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%lld.%09ld",
+	trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%lld.%09ld count:%d",
 			 field->seqnum,
 			 field->duration,
 			 field->outer_duration,
 			 (long long)field->timestamp.tv_sec,
-			 field->timestamp.tv_nsec);
+			 field->timestamp.tv_nsec, field->count);
 
 	if (field->nmi_count) {
 		/*

From 5412e0b763e0c46165fa353f695fa3b68a66ac91 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Fri, 14 Feb 2020 16:20:04 -0500
Subject: [PATCH 02/27] tracing: Remove unused TRACE_BUFFER bits

Commit 567cd4da54ff ("ring-buffer: User context bit recursion checking")
added the TRACE_BUFFER bits to be used in the current task's trace_recursion
field. But the final submission of the logic removed the use of those bits,
but never removed the bits themselves (they were never used in upstream
Linux). These can be safely removed.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.h | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 99372dd7d168..c61e1b1c85a6 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -557,12 +557,7 @@ struct tracer {
  * caller, and we can skip the current check.
  */
 enum {
-	TRACE_BUFFER_BIT,
-	TRACE_BUFFER_NMI_BIT,
-	TRACE_BUFFER_IRQ_BIT,
-	TRACE_BUFFER_SIRQ_BIT,
-
-	/* Start of function recursion bits */
+	/* Function recursion bits */
 	TRACE_FTRACE_BIT,
 	TRACE_FTRACE_NMI_BIT,
 	TRACE_FTRACE_IRQ_BIT,

From 306b69dce9269df116bfa94319b1b827c22d5794 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Tue, 3 Mar 2020 20:24:40 +0900
Subject: [PATCH 03/27] bootconfig: Support O=<builddir> option

Support O=<builddir> option to build bootconfig tool in
the other directory. As same as other tools, if you specify
O=<builddir>, bootconfig command is build under <builddir>.

Link: http://lkml.kernel.org/r/158323468033.10560.14661631369326294355.stgit@devnote2

Reported-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 tools/bootconfig/Makefile           | 27 +++++++++++++++++----------
 tools/bootconfig/test-bootconfig.sh | 14 ++++++++++----
 2 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/tools/bootconfig/Makefile b/tools/bootconfig/Makefile
index a6146ac64458..da5975775337 100644
--- a/tools/bootconfig/Makefile
+++ b/tools/bootconfig/Makefile
@@ -1,23 +1,30 @@
 # SPDX-License-Identifier: GPL-2.0
 # Makefile for bootconfig command
+include ../scripts/Makefile.include
 
 bindir ?= /usr/bin
 
-HEADER = include/linux/bootconfig.h
-CFLAGS = -Wall -g -I./include
+ifeq ($(srctree),)
+srctree := $(patsubst %/,%,$(dir $(CURDIR)))
+srctree := $(patsubst %/,%,$(dir $(srctree)))
+endif
 
-PROGS = bootconfig
+LIBSRC = $(srctree)/lib/bootconfig.c $(srctree)/include/linux/bootconfig.h
+CFLAGS = -Wall -g -I$(CURDIR)/include
 
-all: $(PROGS)
+ALL_TARGETS := bootconfig
+ALL_PROGRAMS := $(patsubst %,$(OUTPUT)%,$(ALL_TARGETS))
 
-bootconfig: ../../lib/bootconfig.c main.c $(HEADER)
+all: $(ALL_PROGRAMS)
+
+$(OUTPUT)bootconfig: main.c $(LIBSRC)
 	$(CC) $(filter %.c,$^) $(CFLAGS) -o $@
 
-install: $(PROGS)
-	install bootconfig $(DESTDIR)$(bindir)
+test: $(ALL_PROGRAMS) test-bootconfig.sh
+	./test-bootconfig.sh $(OUTPUT)
 
-test: bootconfig
-	./test-bootconfig.sh
+install: $(ALL_PROGRAMS)
+	install $(OUTPUT)bootconfig $(DESTDIR)$(bindir)
 
 clean:
-	$(RM) -f *.o bootconfig
+	$(RM) -f $(OUTPUT)*.o $(ALL_PROGRAMS)
diff --git a/tools/bootconfig/test-bootconfig.sh b/tools/bootconfig/test-bootconfig.sh
index 1411f4c3454f..81b350ffd03f 100755
--- a/tools/bootconfig/test-bootconfig.sh
+++ b/tools/bootconfig/test-bootconfig.sh
@@ -3,9 +3,16 @@
 
 echo "Boot config test script"
 
-BOOTCONF=./bootconfig
-INITRD=`mktemp initrd-XXXX`
-TEMPCONF=`mktemp temp-XXXX.bconf`
+if [ -d "$1" ]; then
+  TESTDIR=$1
+else
+  TESTDIR=.
+fi
+BOOTCONF=${TESTDIR}/bootconfig
+
+INITRD=`mktemp ${TESTDIR}/initrd-XXXX`
+TEMPCONF=`mktemp ${TESTDIR}/temp-XXXX.bconf`
+OUTFILE=`mktemp ${TESTDIR}/tempout-XXXX`
 NG=0
 
 cleanup() {
@@ -65,7 +72,6 @@ new_size=$(stat -c %s $INITRD)
 xpass test $new_size -eq $initrd_size
 
 echo "No error messge while applying"
-OUTFILE=`mktemp tempout-XXXX`
 dd if=/dev/zero of=$INITRD bs=4096 count=1
 printf " \0\0\0 \0\0\0" >> $INITRD
 $BOOTCONF -a $TEMPCONF $INITRD > $OUTFILE 2>&1

From 89b74cac7834734d6b2733204c639917d3826083 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Tue, 3 Mar 2020 20:24:50 +0900
Subject: [PATCH 04/27] tools/bootconfig: Show line and column in parse error

Show line and column when we got a parse error in bootconfig tool.
Current lib/bootconfig shows the parse error with byte offset, but
that is not human readable.
This makes xbc_init() not showing error message itself but able to
pass the error message and position to caller, so that the caller
can decode it and show the error message with line number and columns.

With this patch, bootconfig tool shows an error with line:column as
below.

  $ cat samples/bad-dotword.bconf
  # do not start keyword with .
  key {
    .word = 1
  }
  $ ./bootconfig -a samples/bad-dotword.bconf initrd
  Parse Error: Invalid keyword at 3:3

Link: http://lkml.kernel.org/r/158323469002.10560.4023923847704522760.stgit@devnote2

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/bootconfig.h |  3 ++-
 init/main.c                | 14 ++++++++++----
 lib/bootconfig.c           | 35 ++++++++++++++++++++++++++---------
 tools/bootconfig/main.c    | 35 +++++++++++++++++++++++++++++++----
 4 files changed, 69 insertions(+), 18 deletions(-)

diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h
index d11e183fcb54..9903088891fa 100644
--- a/include/linux/bootconfig.h
+++ b/include/linux/bootconfig.h
@@ -216,7 +216,8 @@ static inline int __init xbc_node_compose_key(struct xbc_node *node,
 }
 
 /* XBC node initializer */
-int __init xbc_init(char *buf);
+int __init xbc_init(char *buf, const char **emsg, int *epos);
+
 
 /* XBC cleanup data structures */
 void __init xbc_destroy_all(void);
diff --git a/init/main.c b/init/main.c
index ee4947af823f..e488213857e2 100644
--- a/init/main.c
+++ b/init/main.c
@@ -353,6 +353,8 @@ static int __init bootconfig_params(char *param, char *val,
 static void __init setup_boot_config(const char *cmdline)
 {
 	static char tmp_cmdline[COMMAND_LINE_SIZE] __initdata;
+	const char *msg;
+	int pos;
 	u32 size, csum;
 	char *data, *copy;
 	u32 *hdr;
@@ -400,10 +402,14 @@ static void __init setup_boot_config(const char *cmdline)
 	memcpy(copy, data, size);
 	copy[size] = '\0';
 
-	ret = xbc_init(copy);
-	if (ret < 0)
-		pr_err("Failed to parse bootconfig\n");
-	else {
+	ret = xbc_init(copy, &msg, &pos);
+	if (ret < 0) {
+		if (pos < 0)
+			pr_err("Failed to init bootconfig: %s.\n", msg);
+		else
+			pr_err("Failed to parse bootconfig: %s at %d.\n",
+				msg, pos);
+	} else {
 		pr_info("Load bootconfig: %d bytes %d nodes\n", size, ret);
 		/* keys starting with "kernel." are passed via cmdline */
 		extra_command_line = xbc_make_cmdline("kernel");
diff --git a/lib/bootconfig.c b/lib/bootconfig.c
index ec3ce7fd299f..912ef4921398 100644
--- a/lib/bootconfig.c
+++ b/lib/bootconfig.c
@@ -29,12 +29,14 @@ static int xbc_node_num __initdata;
 static char *xbc_data __initdata;
 static size_t xbc_data_size __initdata;
 static struct xbc_node *last_parent __initdata;
+static const char *xbc_err_msg __initdata;
+static int xbc_err_pos __initdata;
 
 static int __init xbc_parse_error(const char *msg, const char *p)
 {
-	int pos = p - xbc_data;
+	xbc_err_msg = msg;
+	xbc_err_pos = (int)(p - xbc_data);
 
-	pr_err("Parse error at pos %d: %s\n", pos, msg);
 	return -EINVAL;
 }
 
@@ -738,33 +740,44 @@ void __init xbc_destroy_all(void)
 /**
  * xbc_init() - Parse given XBC file and build XBC internal tree
  * @buf: boot config text
+ * @emsg: A pointer of const char * to store the error message
+ * @epos: A pointer of int to store the error position
  *
  * This parses the boot config text in @buf. @buf must be a
  * null terminated string and smaller than XBC_DATA_MAX.
  * Return the number of stored nodes (>0) if succeeded, or -errno
  * if there is any error.
+ * In error cases, @emsg will be updated with an error message and
+ * @epos will be updated with the error position which is the byte offset
+ * of @buf. If the error is not a parser error, @epos will be -1.
  */
-int __init xbc_init(char *buf)
+int __init xbc_init(char *buf, const char **emsg, int *epos)
 {
 	char *p, *q;
 	int ret, c;
 
+	if (epos)
+		*epos = -1;
+
 	if (xbc_data) {
-		pr_err("Error: bootconfig is already initialized.\n");
+		if (emsg)
+			*emsg = "Bootconfig is already initialized";
 		return -EBUSY;
 	}
 
 	ret = strlen(buf);
 	if (ret > XBC_DATA_MAX - 1 || ret == 0) {
-		pr_err("Error: Config data is %s.\n",
-			ret ? "too big" : "empty");
+		if (emsg)
+			*emsg = ret ? "Config data is too big" :
+				"Config data is empty";
 		return -ERANGE;
 	}
 
 	xbc_nodes = memblock_alloc(sizeof(struct xbc_node) * XBC_NODE_MAX,
 				   SMP_CACHE_BYTES);
 	if (!xbc_nodes) {
-		pr_err("Failed to allocate memory for bootconfig nodes.\n");
+		if (emsg)
+			*emsg = "Failed to allocate bootconfig nodes";
 		return -ENOMEM;
 	}
 	memset(xbc_nodes, 0, sizeof(struct xbc_node) * XBC_NODE_MAX);
@@ -814,9 +827,13 @@ int __init xbc_init(char *buf)
 	if (!ret)
 		ret = xbc_verify_tree();
 
-	if (ret < 0)
+	if (ret < 0) {
+		if (epos)
+			*epos = xbc_err_pos;
+		if (emsg)
+			*emsg = xbc_err_msg;
 		xbc_destroy_all();
-	else
+	} else
 		ret = xbc_node_num;
 
 	return ret;
diff --git a/tools/bootconfig/main.c b/tools/bootconfig/main.c
index a9b97814d1a9..16b9a420e6fd 100644
--- a/tools/bootconfig/main.c
+++ b/tools/bootconfig/main.c
@@ -130,6 +130,7 @@ int load_xbc_from_initrd(int fd, char **buf)
 	int ret;
 	u32 size = 0, csum = 0, rcsum;
 	char magic[BOOTCONFIG_MAGIC_LEN];
+	const char *msg;
 
 	ret = fstat(fd, &stat);
 	if (ret < 0)
@@ -182,10 +183,12 @@ int load_xbc_from_initrd(int fd, char **buf)
 		return -EINVAL;
 	}
 
-	ret = xbc_init(*buf);
+	ret = xbc_init(*buf, &msg, NULL);
 	/* Wrong data */
-	if (ret < 0)
+	if (ret < 0) {
+		pr_err("parse error: %s.\n", msg);
 		return ret;
+	}
 
 	return size;
 }
@@ -244,11 +247,34 @@ int delete_xbc(const char *path)
 	return ret;
 }
 
+static void show_xbc_error(const char *data, const char *msg, int pos)
+{
+	int lin = 1, col, i;
+
+	if (pos < 0) {
+		pr_err("Error: %s.\n", msg);
+		return;
+	}
+
+	/* Note that pos starts from 0 but lin and col should start from 1. */
+	col = pos + 1;
+	for (i = 0; i < pos; i++) {
+		if (data[i] == '\n') {
+			lin++;
+			col = pos - i;
+		}
+	}
+	pr_err("Parse Error: %s at %d:%d\n", msg, lin, col);
+
+}
+
 int apply_xbc(const char *path, const char *xbc_path)
 {
 	u32 size, csum;
 	char *buf, *data;
 	int ret, fd;
+	const char *msg;
+	int pos;
 
 	ret = load_xbc_file(xbc_path, &buf);
 	if (ret < 0) {
@@ -267,11 +293,12 @@ int apply_xbc(const char *path, const char *xbc_path)
 	*(u32 *)(data + size + 4) = csum;
 
 	/* Check the data format */
-	ret = xbc_init(buf);
+	ret = xbc_init(buf, &msg, &pos);
 	if (ret < 0) {
-		pr_err("Failed to parse %s: %d\n", xbc_path, ret);
+		show_xbc_error(data, msg, pos);
 		free(data);
 		free(buf);
+
 		return ret;
 	}
 	printf("Apply %s to %s\n", xbc_path, path);

From bbd9d05618a6d608c72640b1d3d651a75913456a Mon Sep 17 00:00:00 2001
From: Yiwei Zhang <zzyiwei@google.com>
Date: Mon, 2 Mar 2020 15:50:44 -0800
Subject: [PATCH 05/27] gpu/trace: add a gpu total memory usage tracepoint

This change adds the below gpu memory tracepoint:
gpu_mem/gpu_mem_total: track global or proc gpu memory total usages

Per process tracking of total gpu memory usage in the gem layer is not
appropriate and hard to implement with trivial overhead. So for the gfx
device driver layer to track total gpu memory usage both globally and
per process in an easy and uniform way is to integrate the tracepoint in
this patch to the underlying varied implementations of gpu memory
tracking system from vendors.

Putting this tracepoint in the common trace events can not only help
wean the gfx drivers off of debugfs but also greatly help the downstream
Android gpu vendors because debugfs is to be deprecated in the upcoming
Android release. Then the gpu memory tracking of both Android kernel and
the upstream linux kernel can stay closely, which can benefit the whole
kernel eco-system in the long term.

Link: http://lkml.kernel.org/r/20200302235044.59163-1-zzyiwei@google.com

Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Yiwei Zhang <zzyiwei@google.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 drivers/Kconfig                   |  2 ++
 drivers/gpu/Makefile              |  1 +
 drivers/gpu/trace/Kconfig         |  4 +++
 drivers/gpu/trace/Makefile        |  3 ++
 drivers/gpu/trace/trace_gpu_mem.c | 13 +++++++
 include/trace/events/gpu_mem.h    | 57 +++++++++++++++++++++++++++++++
 6 files changed, 80 insertions(+)
 create mode 100644 drivers/gpu/trace/Kconfig
 create mode 100644 drivers/gpu/trace/Makefile
 create mode 100644 drivers/gpu/trace/trace_gpu_mem.c
 create mode 100644 include/trace/events/gpu_mem.h

diff --git a/drivers/Kconfig b/drivers/Kconfig
index 8befa53f43be..e0eda1a5c3f9 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -200,6 +200,8 @@ source "drivers/thunderbolt/Kconfig"
 
 source "drivers/android/Kconfig"
 
+source "drivers/gpu/trace/Kconfig"
+
 source "drivers/nvdimm/Kconfig"
 
 source "drivers/dax/Kconfig"
diff --git a/drivers/gpu/Makefile b/drivers/gpu/Makefile
index f17d01f076c7..835c88318cec 100644
--- a/drivers/gpu/Makefile
+++ b/drivers/gpu/Makefile
@@ -5,3 +5,4 @@
 obj-$(CONFIG_TEGRA_HOST1X)	+= host1x/
 obj-y			+= drm/ vga/
 obj-$(CONFIG_IMX_IPUV3_CORE)	+= ipu-v3/
+obj-$(CONFIG_TRACE_GPU_MEM)		+= trace/
diff --git a/drivers/gpu/trace/Kconfig b/drivers/gpu/trace/Kconfig
new file mode 100644
index 000000000000..c24e9edd022e
--- /dev/null
+++ b/drivers/gpu/trace/Kconfig
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config TRACE_GPU_MEM
+	bool
diff --git a/drivers/gpu/trace/Makefile b/drivers/gpu/trace/Makefile
new file mode 100644
index 000000000000..b70fbdc5847f
--- /dev/null
+++ b/drivers/gpu/trace/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_TRACE_GPU_MEM) += trace_gpu_mem.o
diff --git a/drivers/gpu/trace/trace_gpu_mem.c b/drivers/gpu/trace/trace_gpu_mem.c
new file mode 100644
index 000000000000..01e855897b6d
--- /dev/null
+++ b/drivers/gpu/trace/trace_gpu_mem.c
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * GPU memory trace points
+ *
+ * Copyright (C) 2020 Google, Inc.
+ */
+
+#include <linux/module.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/gpu_mem.h>
+
+EXPORT_TRACEPOINT_SYMBOL(gpu_mem_total);
diff --git a/include/trace/events/gpu_mem.h b/include/trace/events/gpu_mem.h
new file mode 100644
index 000000000000..1897822a9150
--- /dev/null
+++ b/include/trace/events/gpu_mem.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * GPU memory trace points
+ *
+ * Copyright (C) 2020 Google, Inc.
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM gpu_mem
+
+#if !defined(_TRACE_GPU_MEM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_GPU_MEM_H
+
+#include <linux/tracepoint.h>
+
+/*
+ * The gpu_memory_total event indicates that there's an update to either the
+ * global or process total gpu memory counters.
+ *
+ * This event should be emitted whenever the kernel device driver allocates,
+ * frees, imports, unimports memory in the GPU addressable space.
+ *
+ * @gpu_id: This is the gpu id.
+ *
+ * @pid: Put 0 for global total, while positive pid for process total.
+ *
+ * @size: Virtual size of the allocation in bytes.
+ *
+ */
+TRACE_EVENT(gpu_mem_total,
+
+	TP_PROTO(uint32_t gpu_id, uint32_t pid, uint64_t size),
+
+	TP_ARGS(gpu_id, pid, size),
+
+	TP_STRUCT__entry(
+		__field(uint32_t, gpu_id)
+		__field(uint32_t, pid)
+		__field(uint64_t, size)
+	),
+
+	TP_fast_assign(
+		__entry->gpu_id = gpu_id;
+		__entry->pid = pid;
+		__entry->size = size;
+	),
+
+	TP_printk("gpu_id=%u pid=%u size=%llu",
+		__entry->gpu_id,
+		__entry->pid,
+		__entry->size)
+);
+
+#endif /* _TRACE_GPU_MEM_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>

From bf2cbe044da275021b2de5917240411a19e5c50d Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <natechancellor@gmail.com>
Date: Wed, 19 Feb 2020 22:10:12 -0700
Subject: [PATCH 06/27] tracing: Use address-of operator on section symbols

Clang warns:

../kernel/trace/trace.c:9335:33: warning: array comparison always
evaluates to true [-Wtautological-compare]
        if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt)
                                       ^
1 warning generated.

These are not true arrays, they are linker defined symbols, which are
just addresses. Using the address of operator silences the warning and
does not change the runtime result of the check (tested with some print
statements compiled in with clang + ld.lld and gcc + ld.bfd in QEMU).

Link: http://lkml.kernel.org/r/20200220051011.26113-1-natechancellor@gmail.com

Link: https://github.com/ClangBuiltLinux/linux/issues/893
Suggested-by: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Nathan Chancellor <natechancellor@gmail.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6b11e4e2150c..02be4ddd4ad5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -9334,7 +9334,7 @@ __init static int tracer_alloc_buffers(void)
 		goto out_free_buffer_mask;
 
 	/* Only allocate trace_printk buffers if a trace_printk exists */
-	if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt)
+	if (&__stop___trace_bprintk_fmt != &__start___trace_bprintk_fmt)
 		/* Must be called before global_trace.buffer is allocated */
 		trace_printk_init_buffers();
 

From 8c77f0ba41566b5a54a49c53107cc855d89ff4c1 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Wed, 18 Mar 2020 11:13:45 -0400
Subject: [PATCH 07/27] selftest/ftrace: Fix function trigger test to handle
 trace not disabling the tracer

The ftrace selftest "ftrace - test for function traceon/off triggers"
enables all events and reads the trace file. Now that the trace file does
not disable tracing, and will attempt to continually read new data that is
added, the selftest gets stuck reading the trace file. This is because the
data added to the trace file will fill up quicker than the reading of it.

By only enabling scheduling events, the read can keep up with the writes.
Instead of enabling all events, only enable the scheduler events.

Link: http://lkml.kernel.org/r/20200318111345.0516642e@gandalf.local.home

Cc: Shuah Khan <skhan@linuxfoundation.org>
Cc: linux-kselftest@vger.kernel.org
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 .../selftests/ftrace/test.d/ftrace/func_traceonoff_triggers.tc  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_traceonoff_triggers.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_traceonoff_triggers.tc
index 0c04282d33dd..1947387fe976 100644
--- a/tools/testing/selftests/ftrace/test.d/ftrace/func_traceonoff_triggers.tc
+++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_traceonoff_triggers.tc
@@ -41,7 +41,7 @@ fi
 
 echo '** ENABLE EVENTS'
 
-echo 1 > events/enable
+echo 1 > events/sched/enable
 
 echo '** ENABLE TRACING'
 enable_tracing

From ff895103a84abc85a5f43ecabc7f67cf36e1348f Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 17 Mar 2020 17:32:23 -0400
Subject: [PATCH 08/27] tracing: Save off entry when peeking at next entry

In order to have the iterator read the buffer even when it's still updating,
it requires that the ring buffer iterator saves each event in a separate
location outside the ring buffer such that its use is immutable.

There's one use case that saves off the event returned from the ring buffer
interator and calls it again to look at the next event, before going back to
use the first event. As the ring buffer iterator will only have a single
copy, this use case will no longer be supported.

Instead, have the one use case create its own buffer to store the first
event when looking at the next event. This way, when looking at the first
event again, it wont be corrupted by the second read.

Link: http://lkml.kernel.org/r/20200317213415.722539921@goodmis.org

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/trace_events.h |  2 ++
 kernel/trace/trace.c         | 40 +++++++++++++++++++++++++++++++++++-
 kernel/trace/trace_output.c  | 15 ++++++--------
 3 files changed, 47 insertions(+), 10 deletions(-)

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 6c7a10a6d71e..5c6943354049 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -85,6 +85,8 @@ struct trace_iterator {
 	struct mutex		mutex;
 	struct ring_buffer_iter	**buffer_iter;
 	unsigned long		iter_flags;
+	void			*temp;	/* temp holder */
+	unsigned int		temp_size;
 
 	/* trace_seq for __print_flags() and __print_symbolic() etc. */
 	struct trace_seq	tmp_seq;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 02be4ddd4ad5..819e31d0d66c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3466,7 +3466,31 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu,
 struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
 					  int *ent_cpu, u64 *ent_ts)
 {
-	return __find_next_entry(iter, ent_cpu, NULL, ent_ts);
+	/* __find_next_entry will reset ent_size */
+	int ent_size = iter->ent_size;
+	struct trace_entry *entry;
+
+	/*
+	 * The __find_next_entry() may call peek_next_entry(), which may
+	 * call ring_buffer_peek() that may make the contents of iter->ent
+	 * undefined. Need to copy iter->ent now.
+	 */
+	if (iter->ent && iter->ent != iter->temp) {
+		if (!iter->temp || iter->temp_size < iter->ent_size) {
+			kfree(iter->temp);
+			iter->temp = kmalloc(iter->ent_size, GFP_KERNEL);
+			if (!iter->temp)
+				return NULL;
+		}
+		memcpy(iter->temp, iter->ent, iter->ent_size);
+		iter->temp_size = iter->ent_size;
+		iter->ent = iter->temp;
+	}
+	entry = __find_next_entry(iter, ent_cpu, NULL, ent_ts);
+	/* Put back the original ent_size */
+	iter->ent_size = ent_size;
+
+	return entry;
 }
 
 /* Find the next real entry, and increment the iterator to the next entry */
@@ -4197,6 +4221,18 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
 	if (!iter->buffer_iter)
 		goto release;
 
+	/*
+	 * trace_find_next_entry() may need to save off iter->ent.
+	 * It will place it into the iter->temp buffer. As most
+	 * events are less than 128, allocate a buffer of that size.
+	 * If one is greater, then trace_find_next_entry() will
+	 * allocate a new buffer to adjust for the bigger iter->ent.
+	 * It's not critical if it fails to get allocated here.
+	 */
+	iter->temp = kmalloc(128, GFP_KERNEL);
+	if (iter->temp)
+		iter->temp_size = 128;
+
 	/*
 	 * We make a copy of the current tracer to avoid concurrent
 	 * changes on it while we are reading.
@@ -4269,6 +4305,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
  fail:
 	mutex_unlock(&trace_types_lock);
 	kfree(iter->trace);
+	kfree(iter->temp);
 	kfree(iter->buffer_iter);
 release:
 	seq_release_private(inode, file);
@@ -4344,6 +4381,7 @@ static int tracing_release(struct inode *inode, struct file *file)
 
 	mutex_destroy(&iter->mutex);
 	free_cpumask_var(iter->started);
+	kfree(iter->temp);
 	kfree(iter->trace);
 	kfree(iter->buffer_iter);
 	seq_release_private(inode, file);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index e25a7da79c6b..9a121e147102 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -617,22 +617,19 @@ int trace_print_context(struct trace_iterator *iter)
 
 int trace_print_lat_context(struct trace_iterator *iter)
 {
+	struct trace_entry *entry, *next_entry;
 	struct trace_array *tr = iter->tr;
-	/* trace_find_next_entry will reset ent_size */
-	int ent_size = iter->ent_size;
 	struct trace_seq *s = &iter->seq;
-	u64 next_ts;
-	struct trace_entry *entry = iter->ent,
-			   *next_entry = trace_find_next_entry(iter, NULL,
-							       &next_ts);
 	unsigned long verbose = (tr->trace_flags & TRACE_ITER_VERBOSE);
+	u64 next_ts;
 
-	/* Restore the original ent_size */
-	iter->ent_size = ent_size;
-
+	next_entry = trace_find_next_entry(iter, NULL, &next_ts);
 	if (!next_entry)
 		next_ts = iter->ts;
 
+	/* trace_find_next_entry() may change iter->ent */
+	entry = iter->ent;
+
 	if (verbose) {
 		char comm[TASK_COMM_LEN];
 

From ead6ecfddea54f4754c97f64ab7198cc1d8c0daa Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 17 Mar 2020 17:32:24 -0400
Subject: [PATCH 09/27] ring-buffer: Have ring_buffer_empty() not depend on
 tracing stopped

It was complained about that when the trace file is read, that the tracing
is disabled, as the iterator expects writing to the buffer it reads is not
updated. Several steps are needed to make the iterator handle a writer,
by testing if things have changed as it reads.

This step is to make ring_buffer_empty() expect the buffer to be changing.
Note if the current location of the iterator is overwritten, then it will
return false as new data is being added. Note, that this means that data
will be skipped.

Link: http://lkml.kernel.org/r/20200317213415.870741809@goodmis.org

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 27 ++++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 61f0e92ace99..1718520a2809 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3590,16 +3590,37 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
 	struct buffer_page *reader;
 	struct buffer_page *head_page;
 	struct buffer_page *commit_page;
+	struct buffer_page *curr_commit_page;
 	unsigned commit;
+	u64 curr_commit_ts;
+	u64 commit_ts;
 
 	cpu_buffer = iter->cpu_buffer;
-
-	/* Remember, trace recording is off when iterator is in use */
 	reader = cpu_buffer->reader_page;
 	head_page = cpu_buffer->head_page;
 	commit_page = cpu_buffer->commit_page;
-	commit = rb_page_commit(commit_page);
+	commit_ts = commit_page->page->time_stamp;
 
+	/*
+	 * When the writer goes across pages, it issues a cmpxchg which
+	 * is a mb(), which will synchronize with the rmb here.
+	 * (see rb_tail_page_update())
+	 */
+	smp_rmb();
+	commit = rb_page_commit(commit_page);
+	/* We want to make sure that the commit page doesn't change */
+	smp_rmb();
+
+	/* Make sure commit page didn't change */
+	curr_commit_page = READ_ONCE(cpu_buffer->commit_page);
+	curr_commit_ts = READ_ONCE(curr_commit_page->page->time_stamp);
+
+	/* If the commit page changed, then there's more data */
+	if (curr_commit_page != commit_page ||
+	    curr_commit_ts != commit_ts)
+		return 0;
+
+	/* Still racy, as it may return a false positive, but that's OK */
 	return ((iter->head_page == commit_page && iter->head == commit) ||
 		(iter->head_page == reader && commit_page == head_page &&
 		 head_page->read == commit &&

From bc1a72afdc4a91844928831cac85731566e03bc6 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 17 Mar 2020 17:32:25 -0400
Subject: [PATCH 10/27] ring-buffer: Rename ring_buffer_read() to
 read_buffer_iter_advance()

When the ring buffer was first created, the iterator followed the normal
producer/consumer operations where it had both a peek() operation, that just
returned the event at the current location, and a read(), that would return
the event at the current location and also increment the iterator such that
the next peek() or read() will return the next event.

The only use of the ring_buffer_read() is currently to move the iterator to
the next location and nothing now actually reads the event it returns.
Rename this function to its actual use case to ring_buffer_iter_advance(),
which also adds the "iter" part to the name, which is more meaningful. As
the timestamp returned by ring_buffer_read() was never used, there's no
reason that this new version should bother having returning it. It will also
become a void function.

Link: http://lkml.kernel.org/r/20200317213416.018928618@goodmis.org

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/ring_buffer.h          |  3 +--
 kernel/trace/ring_buffer.c           | 23 ++++++-----------------
 kernel/trace/trace.c                 |  4 ++--
 kernel/trace/trace_functions_graph.c |  2 +-
 4 files changed, 10 insertions(+), 22 deletions(-)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index df0124eabece..0ae603b79b0e 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -135,8 +135,7 @@ void ring_buffer_read_finish(struct ring_buffer_iter *iter);
 
 struct ring_buffer_event *
 ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts);
-struct ring_buffer_event *
-ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts);
+void ring_buffer_iter_advance(struct ring_buffer_iter *iter);
 void ring_buffer_iter_reset(struct ring_buffer_iter *iter);
 int ring_buffer_iter_empty(struct ring_buffer_iter *iter);
 
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 1718520a2809..f57eeaa80e3e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4318,35 +4318,24 @@ ring_buffer_read_finish(struct ring_buffer_iter *iter)
 EXPORT_SYMBOL_GPL(ring_buffer_read_finish);
 
 /**
- * ring_buffer_read - read the next item in the ring buffer by the iterator
+ * ring_buffer_iter_advance - advance the iterator to the next location
  * @iter: The ring buffer iterator
- * @ts: The time stamp of the event read.
  *
- * This reads the next event in the ring buffer and increments the iterator.
+ * Move the location of the iterator such that the next read will
+ * be the next location of the iterator.
  */
-struct ring_buffer_event *
-ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
+void ring_buffer_iter_advance(struct ring_buffer_iter *iter)
 {
-	struct ring_buffer_event *event;
 	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
- again:
-	event = rb_iter_peek(iter, ts);
-	if (!event)
-		goto out;
-
-	if (event->type_len == RINGBUF_TYPE_PADDING)
-		goto again;
 
 	rb_advance_iter(iter);
- out:
-	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
-	return event;
+	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 }
-EXPORT_SYMBOL_GPL(ring_buffer_read);
+EXPORT_SYMBOL_GPL(ring_buffer_iter_advance);
 
 /**
  * ring_buffer_size - return the size of the ring buffer (in bytes)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 819e31d0d66c..47889123be7f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3378,7 +3378,7 @@ static void trace_iterator_increment(struct trace_iterator *iter)
 
 	iter->idx++;
 	if (buf_iter)
-		ring_buffer_read(buf_iter, NULL);
+		ring_buffer_iter_advance(buf_iter);
 }
 
 static struct trace_entry *
@@ -3562,7 +3562,7 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
 		if (ts >= iter->array_buffer->time_start)
 			break;
 		entries++;
-		ring_buffer_read(buf_iter, NULL);
+		ring_buffer_iter_advance(buf_iter);
 	}
 
 	per_cpu_ptr(iter->array_buffer->data, cpu)->skipped_entries = entries;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 7d71546ba00a..4a9c49c08ec9 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -482,7 +482,7 @@ get_return_for_leaf(struct trace_iterator *iter,
 
 	/* this is a leaf, now advance the iterator */
 	if (ring_iter)
-		ring_buffer_read(ring_iter, NULL);
+		ring_buffer_iter_advance(ring_iter);
 
 	return next;
 }

From 28e3fc56a471bbac39d24571e11dde64b15de988 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 17 Mar 2020 17:32:26 -0400
Subject: [PATCH 11/27] ring-buffer: Add page_stamp to iterator for
 synchronization

Have the ring_buffer_iter structure contain a page_stamp, such that it can
be used to see if the writer entered the page the iterator is on. When going
to a new page, the iterator will record the time stamp of that page. When
reading events, it can copy the event to an internal buffer on the iterator
(to be implemented later), then check the page's time stamp with its own to
see if the writer entered the page. If so, it will need to try to read the
event again.

Link: http://lkml.kernel.org/r/20200317213416.163549674@goodmis.org

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f57eeaa80e3e..e689bdcb53e8 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -507,6 +507,7 @@ struct ring_buffer_iter {
 	struct buffer_page		*cache_reader_page;
 	unsigned long			cache_read;
 	u64				read_stamp;
+	u64				page_stamp;
 };
 
 /**
@@ -1959,7 +1960,7 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
 	else
 		rb_inc_page(cpu_buffer, &iter->head_page);
 
-	iter->read_stamp = iter->head_page->page->time_stamp;
+	iter->page_stamp = iter->read_stamp = iter->head_page->page->time_stamp;
 	iter->head = 0;
 }
 
@@ -3551,10 +3552,13 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
 	iter->cache_reader_page = iter->head_page;
 	iter->cache_read = cpu_buffer->read;
 
-	if (iter->head)
+	if (iter->head) {
 		iter->read_stamp = cpu_buffer->read_stamp;
-	else
+		iter->page_stamp = cpu_buffer->reader_page->page->time_stamp;
+	} else {
 		iter->read_stamp = iter->head_page->page->time_stamp;
+		iter->page_stamp = iter->read_stamp;
+	}
 }
 
 /**

From 785888c544e0433f601df18ff98a3215b380b9c3 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 17 Mar 2020 17:32:27 -0400
Subject: [PATCH 12/27] ring-buffer: Have rb_iter_head_event() handle
 concurrent writer

Have the ring_buffer_iter structure have a place to store an event, such
that it can not be overwritten by a writer, and load it in such a way via
rb_iter_head_event() that it will return NULL and reset the iter to the
start of the current page if a writer updated the page.

Link: http://lkml.kernel.org/r/20200317213416.306959216@goodmis.org

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 108 ++++++++++++++++++++++++++-----------
 1 file changed, 76 insertions(+), 32 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index e689bdcb53e8..3d718add73c1 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -503,11 +503,13 @@ struct trace_buffer {
 struct ring_buffer_iter {
 	struct ring_buffer_per_cpu	*cpu_buffer;
 	unsigned long			head;
+	unsigned long			next_event;
 	struct buffer_page		*head_page;
 	struct buffer_page		*cache_reader_page;
 	unsigned long			cache_read;
 	u64				read_stamp;
 	u64				page_stamp;
+	struct ring_buffer_event	*event;
 };
 
 /**
@@ -1914,17 +1916,61 @@ rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
 			       cpu_buffer->reader_page->read);
 }
 
-static __always_inline struct ring_buffer_event *
-rb_iter_head_event(struct ring_buffer_iter *iter)
-{
-	return __rb_page_index(iter->head_page, iter->head);
-}
-
 static __always_inline unsigned rb_page_commit(struct buffer_page *bpage)
 {
 	return local_read(&bpage->page->commit);
 }
 
+static struct ring_buffer_event *
+rb_iter_head_event(struct ring_buffer_iter *iter)
+{
+	struct ring_buffer_event *event;
+	struct buffer_page *iter_head_page = iter->head_page;
+	unsigned long commit;
+	unsigned length;
+
+	/*
+	 * When the writer goes across pages, it issues a cmpxchg which
+	 * is a mb(), which will synchronize with the rmb here.
+	 * (see rb_tail_page_update() and __rb_reserve_next())
+	 */
+	commit = rb_page_commit(iter_head_page);
+	smp_rmb();
+	event = __rb_page_index(iter_head_page, iter->head);
+	length = rb_event_length(event);
+
+	/*
+	 * READ_ONCE() doesn't work on functions and we don't want the
+	 * compiler doing any crazy optimizations with length.
+	 */
+	barrier();
+
+	if ((iter->head + length) > commit || length > BUF_MAX_DATA_SIZE)
+		/* Writer corrupted the read? */
+		goto reset;
+
+	memcpy(iter->event, event, length);
+	/*
+	 * If the page stamp is still the same after this rmb() then the
+	 * event was safely copied without the writer entering the page.
+	 */
+	smp_rmb();
+
+	/* Make sure the page didn't change since we read this */
+	if (iter->page_stamp != iter_head_page->page->time_stamp ||
+	    commit > rb_page_commit(iter_head_page))
+		goto reset;
+
+	iter->next_event = iter->head + length;
+	return iter->event;
+ reset:
+	/* Reset to the beginning */
+	iter->page_stamp = iter->read_stamp = iter->head_page->page->time_stamp;
+	iter->head = 0;
+	iter->next_event = 0;
+	return NULL;
+}
+
 /* Size is determined by what has been committed */
 static __always_inline unsigned rb_page_size(struct buffer_page *bpage)
 {
@@ -1962,6 +2008,7 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
 
 	iter->page_stamp = iter->read_stamp = iter->head_page->page->time_stamp;
 	iter->head = 0;
+	iter->next_event = 0;
 }
 
 /*
@@ -3548,6 +3595,7 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
 	/* Iterator usage is expected to have record disabled */
 	iter->head_page = cpu_buffer->reader_page;
 	iter->head = cpu_buffer->reader_page->read;
+	iter->next_event = iter->head;
 
 	iter->cache_reader_page = iter->head_page;
 	iter->cache_read = cpu_buffer->read;
@@ -3625,7 +3673,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
 		return 0;
 
 	/* Still racy, as it may return a false positive, but that's OK */
-	return ((iter->head_page == commit_page && iter->head == commit) ||
+	return ((iter->head_page == commit_page && iter->head >= commit) ||
 		(iter->head_page == reader && commit_page == head_page &&
 		 head_page->read == commit &&
 		 iter->head == rb_page_commit(cpu_buffer->reader_page)));
@@ -3853,15 +3901,22 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
 static void rb_advance_iter(struct ring_buffer_iter *iter)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
-	struct ring_buffer_event *event;
-	unsigned length;
 
 	cpu_buffer = iter->cpu_buffer;
 
+	/* If head == next_event then we need to jump to the next event */
+	if (iter->head == iter->next_event) {
+		/* If the event gets overwritten again, there's nothing to do */
+		if (rb_iter_head_event(iter) == NULL)
+			return;
+	}
+
+	iter->head = iter->next_event;
+
 	/*
 	 * Check if we are at the end of the buffer.
 	 */
-	if (iter->head >= rb_page_size(iter->head_page)) {
+	if (iter->next_event >= rb_page_size(iter->head_page)) {
 		/* discarded commits can make the page empty */
 		if (iter->head_page == cpu_buffer->commit_page)
 			return;
@@ -3869,27 +3924,7 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
 		return;
 	}
 
-	event = rb_iter_head_event(iter);
-
-	length = rb_event_length(event);
-
-	/*
-	 * This should not be called to advance the header if we are
-	 * at the tail of the buffer.
-	 */
-	if (RB_WARN_ON(cpu_buffer,
-		       (iter->head_page == cpu_buffer->commit_page) &&
-		       (iter->head + length > rb_commit_index(cpu_buffer))))
-		return;
-
-	rb_update_iter_read_stamp(iter, event);
-
-	iter->head += length;
-
-	/* check for end of page padding */
-	if ((iter->head >= rb_page_size(iter->head_page)) &&
-	    (iter->head_page != cpu_buffer->commit_page))
-		rb_inc_iter(iter);
+	rb_update_iter_read_stamp(iter, iter->event);
 }
 
 static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer)
@@ -4017,6 +4052,8 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 	}
 
 	event = rb_iter_head_event(iter);
+	if (!event)
+		goto again;
 
 	switch (event->type_len) {
 	case RINGBUF_TYPE_PADDING:
@@ -4233,10 +4270,16 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		return NULL;
 
-	iter = kmalloc(sizeof(*iter), flags);
+	iter = kzalloc(sizeof(*iter), flags);
 	if (!iter)
 		return NULL;
 
+	iter->event = kmalloc(BUF_MAX_DATA_SIZE, flags);
+	if (!iter->event) {
+		kfree(iter);
+		return NULL;
+	}
+
 	cpu_buffer = buffer->buffers[cpu];
 
 	iter->cpu_buffer = cpu_buffer;
@@ -4317,6 +4360,7 @@ ring_buffer_read_finish(struct ring_buffer_iter *iter)
 
 	atomic_dec(&cpu_buffer->record_disabled);
 	atomic_dec(&cpu_buffer->buffer->resize_disabled);
+	kfree(iter->event);
 	kfree(iter);
 }
 EXPORT_SYMBOL_GPL(ring_buffer_read_finish);

From ff84c50cfb4b8dc68c982fb6c05a524e1539ee2f Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 17 Mar 2020 17:32:28 -0400
Subject: [PATCH 13/27] ring-buffer: Do not die if rb_iter_peek() fails more
 than thrice

As the iterator will be reading a live buffer, and if the event being read
is on a page that a writer crosses, it will fail and try again, the
condition in rb_iter_peek() that only allows a retry to happen three times
is no longer valid. Allow rb_iter_peek() to retry more than three times
without killing the ring buffer, but only if rb_iter_head_event() had failed
at least once.

Link: http://lkml.kernel.org/r/20200317213416.452888193@goodmis.org

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 3d718add73c1..475338fda969 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4012,6 +4012,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 	struct ring_buffer_per_cpu *cpu_buffer;
 	struct ring_buffer_event *event;
 	int nr_loops = 0;
+	bool failed = false;
 
 	if (ts)
 		*ts = 0;
@@ -4038,10 +4039,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 	 * to a data event, we should never loop more than three times.
 	 * Once for going to next page, once on time extend, and
 	 * finally once to get the event.
-	 * (We never hit the following condition more than thrice).
+	 * We should never hit the following condition more than thrice,
+	 * unless the buffer is very small, and there's a writer
+	 * that is causing the reader to fail getting an event.
 	 */
-	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3))
+	if (++nr_loops > 3) {
+		RB_WARN_ON(cpu_buffer, !failed);
 		return NULL;
+	}
 
 	if (rb_per_cpu_empty(cpu_buffer))
 		return NULL;
@@ -4052,8 +4057,10 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 	}
 
 	event = rb_iter_head_event(iter);
-	if (!event)
+	if (!event) {
+		failed = true;
 		goto again;
+	}
 
 	switch (event->type_len) {
 	case RINGBUF_TYPE_PADDING:

From 153368ce1bd0ccb47812a3185e824445a7024ea5 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 17 Mar 2020 17:32:29 -0400
Subject: [PATCH 14/27] ring-buffer: Optimize rb_iter_head_event()

As it is fine to perform several "peeks" of event data in the ring buffer
via the iterator before moving it forward, do not re-read the event, just
return what was read before. Otherwise, it can cause inconsistent results,
especially when testing multiple CPU buffers to interleave them.

Link: http://lkml.kernel.org/r/20200317213416.592032170@goodmis.org

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 475338fda969..5979327254f9 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1929,6 +1929,9 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
 	unsigned long commit;
 	unsigned length;
 
+	if (iter->head != iter->next_event)
+		return iter->event;
+
 	/*
 	 * When the writer goes across pages, it issues a cmpxchg which
 	 * is a mb(), which will synchronize with the rmb here.

From 07b8b10ec94f852502db739047a2803ed36ccf46 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Fri, 27 Mar 2020 16:21:22 -0400
Subject: [PATCH 15/27] ring-buffer: Make resize disable per cpu buffer instead
 of total buffer

When the ring buffer becomes writable for even when the trace file is read,
it must still not be resized. But since tracers can be activated while the
trace file is being read, the irqsoff tracer can modify the per CPU buffers,
and this can cause the reader of the trace file to update the wrong buffer's
resize disable bit, as the irqsoff tracer swaps out cpu buffers.

By making the resize disable per cpu_buffer, it makes the update follow the
per cpu_buffer even if it's swapped out with the snapshot buffer and keeps
the release of the trace file modifying the same data as the open did.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 43 ++++++++++++++++++++++++++------------
 1 file changed, 30 insertions(+), 13 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 5979327254f9..e2de5b448c91 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -441,6 +441,7 @@ enum {
 struct ring_buffer_per_cpu {
 	int				cpu;
 	atomic_t			record_disabled;
+	atomic_t			resize_disabled;
 	struct trace_buffer	*buffer;
 	raw_spinlock_t			reader_lock;	/* serialize readers */
 	arch_spinlock_t			lock;
@@ -484,7 +485,6 @@ struct trace_buffer {
 	unsigned			flags;
 	int				cpus;
 	atomic_t			record_disabled;
-	atomic_t			resize_disabled;
 	cpumask_var_t			cpumask;
 
 	struct lock_class_key		*reader_lock_key;
@@ -1740,18 +1740,24 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
 
 	size = nr_pages * BUF_PAGE_SIZE;
 
-	/*
-	 * Don't succeed if resizing is disabled, as a reader might be
-	 * manipulating the ring buffer and is expecting a sane state while
-	 * this is true.
-	 */
-	if (atomic_read(&buffer->resize_disabled))
-		return -EBUSY;
-
 	/* prevent another thread from changing buffer sizes */
 	mutex_lock(&buffer->mutex);
 
+
 	if (cpu_id == RING_BUFFER_ALL_CPUS) {
+		/*
+		 * Don't succeed if resizing is disabled, as a reader might be
+		 * manipulating the ring buffer and is expecting a sane state while
+		 * this is true.
+		 */
+		for_each_buffer_cpu(buffer, cpu) {
+			cpu_buffer = buffer->buffers[cpu];
+			if (atomic_read(&cpu_buffer->resize_disabled)) {
+				err = -EBUSY;
+				goto out_err_unlock;
+			}
+		}
+
 		/* calculate the pages to update */
 		for_each_buffer_cpu(buffer, cpu) {
 			cpu_buffer = buffer->buffers[cpu];
@@ -1819,6 +1825,16 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
 		if (nr_pages == cpu_buffer->nr_pages)
 			goto out;
 
+		/*
+		 * Don't succeed if resizing is disabled, as a reader might be
+		 * manipulating the ring buffer and is expecting a sane state while
+		 * this is true.
+		 */
+		if (atomic_read(&cpu_buffer->resize_disabled)) {
+			err = -EBUSY;
+			goto out_err_unlock;
+		}
+
 		cpu_buffer->nr_pages_to_update = nr_pages -
 						cpu_buffer->nr_pages;
 
@@ -1888,6 +1904,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
 			free_buffer_page(bpage);
 		}
 	}
+ out_err_unlock:
 	mutex_unlock(&buffer->mutex);
 	return err;
 }
@@ -4294,7 +4311,7 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
 
 	iter->cpu_buffer = cpu_buffer;
 
-	atomic_inc(&buffer->resize_disabled);
+	atomic_inc(&cpu_buffer->resize_disabled);
 	atomic_inc(&cpu_buffer->record_disabled);
 
 	return iter;
@@ -4369,7 +4386,7 @@ ring_buffer_read_finish(struct ring_buffer_iter *iter)
 	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
 	atomic_dec(&cpu_buffer->record_disabled);
-	atomic_dec(&cpu_buffer->buffer->resize_disabled);
+	atomic_dec(&cpu_buffer->resize_disabled);
 	kfree(iter->event);
 	kfree(iter);
 }
@@ -4474,7 +4491,7 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		return;
 
-	atomic_inc(&buffer->resize_disabled);
+	atomic_inc(&cpu_buffer->resize_disabled);
 	atomic_inc(&cpu_buffer->record_disabled);
 
 	/* Make sure all commits have finished */
@@ -4495,7 +4512,7 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
 	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
 	atomic_dec(&cpu_buffer->record_disabled);
-	atomic_dec(&buffer->resize_disabled);
+	atomic_dec(&cpu_buffer->resize_disabled);
 }
 EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
 

From 1039221cc2787dee51a7ffbf9b0e79d192dadf76 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 17 Mar 2020 17:32:30 -0400
Subject: [PATCH 16/27] ring-buffer: Do not disable recording when there is an
 iterator

Now that the iterator can handle a concurrent writer, do not disable writing
to the ring buffer when there is an iterator present.

Link: http://lkml.kernel.org/r/20200317213416.759770696@goodmis.org

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index e2de5b448c91..af2f10d9f3f1 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4312,7 +4312,6 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
 	iter->cpu_buffer = cpu_buffer;
 
 	atomic_inc(&cpu_buffer->resize_disabled);
-	atomic_inc(&cpu_buffer->record_disabled);
 
 	return iter;
 }
@@ -4385,7 +4384,6 @@ ring_buffer_read_finish(struct ring_buffer_iter *iter)
 	rb_check_pages(cpu_buffer);
 	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
-	atomic_dec(&cpu_buffer->record_disabled);
 	atomic_dec(&cpu_buffer->resize_disabled);
 	kfree(iter->event);
 	kfree(iter);

From 06e0a548bad0f43a21e036db018e4dadb501ce8b Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 17 Mar 2020 17:32:31 -0400
Subject: [PATCH 17/27] tracing: Do not disable tracing when reading the trace
 file

When opening the "trace" file, it is no longer necessary to disable tracing.

Note, a new option is created called "pause-on-trace", when set, will cause
the trace file to emulate its original behavior.

Link: http://lkml.kernel.org/r/20200317213416.903351225@goodmis.org

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 Documentation/trace/ftrace.rst | 6 ++++++
 kernel/trace/trace.c           | 9 ++++++---
 kernel/trace/trace.h           | 1 +
 3 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst
index 99a0890e20ec..c33950a35d65 100644
--- a/Documentation/trace/ftrace.rst
+++ b/Documentation/trace/ftrace.rst
@@ -1125,6 +1125,12 @@ Here are the available options:
 	the trace displays additional information about the
 	latency, as described in "Latency trace format".
 
+  pause-on-trace
+	When set, opening the trace file for read, will pause
+	writing to the ring buffer (as if tracing_on was set to zero).
+	This simulates the original behavior of the trace file.
+	When the file is closed, tracing will be enabled again.
+
   record-cmd
 	When any event or tracer is enabled, a hook is enabled
 	in the sched_switch trace point to fill comm cache
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 47889123be7f..650fa81fffe8 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4273,8 +4273,11 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
 	if (trace_clocks[tr->clock_id].in_ns)
 		iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
 
-	/* stop the trace while dumping if we are not opening "snapshot" */
-	if (!iter->snapshot)
+	/*
+	 * If pause-on-trace is enabled, then stop the trace while
+	 * dumping, unless this is the "snapshot" file
+	 */
+	if (!iter->snapshot && (tr->trace_flags & TRACE_ITER_PAUSE_ON_TRACE))
 		tracing_stop_tr(tr);
 
 	if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
@@ -4371,7 +4374,7 @@ static int tracing_release(struct inode *inode, struct file *file)
 	if (iter->trace && iter->trace->close)
 		iter->trace->close(iter);
 
-	if (!iter->snapshot)
+	if (!iter->snapshot && tr->stop_count)
 		/* reenable tracing if it was previously enabled */
 		tracing_start_tr(tr);
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c61e1b1c85a6..f37e05135986 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1302,6 +1302,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
 		C(IRQ_INFO,		"irq-info"),		\
 		C(MARKERS,		"markers"),		\
 		C(EVENT_FORK,		"event-fork"),		\
+		C(PAUSE_ON_TRACE,	"pause-on-trace"),	\
 		FUNCTION_FLAGS					\
 		FGRAPH_FLAGS					\
 		STACK_FLAGS					\

From c9b7a4a72ff64e67b7e877a99fd652230dc26058 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 17 Mar 2020 17:32:32 -0400
Subject: [PATCH 18/27] ring-buffer/tracing: Have iterator acknowledge dropped
 events

Have the ring_buffer_iterator set a flag if events were dropped as it were
to go and peek at the next event. Have the trace file display this fact if
it happened with a "LOST EVENTS" message.

Link: http://lkml.kernel.org/r/20200317213417.045858900@goodmis.org

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/ring_buffer.h |  1 +
 kernel/trace/ring_buffer.c  | 16 ++++++++++++++++
 kernel/trace/trace.c        | 16 ++++++++++++----
 3 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 0ae603b79b0e..c76b2f3b3ac4 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -138,6 +138,7 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts);
 void ring_buffer_iter_advance(struct ring_buffer_iter *iter);
 void ring_buffer_iter_reset(struct ring_buffer_iter *iter);
 int ring_buffer_iter_empty(struct ring_buffer_iter *iter);
+bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter);
 
 unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu);
 
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index af2f10d9f3f1..6f0b42ceeb00 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -510,6 +510,7 @@ struct ring_buffer_iter {
 	u64				read_stamp;
 	u64				page_stamp;
 	struct ring_buffer_event	*event;
+	int				missed_events;
 };
 
 /**
@@ -1988,6 +1989,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
 	iter->page_stamp = iter->read_stamp = iter->head_page->page->time_stamp;
 	iter->head = 0;
 	iter->next_event = 0;
+	iter->missed_events = 1;
 	return NULL;
 }
 
@@ -4191,6 +4193,20 @@ ring_buffer_peek(struct trace_buffer *buffer, int cpu, u64 *ts,
 	return event;
 }
 
+/** ring_buffer_iter_dropped - report if there are dropped events
+ * @iter: The ring buffer iterator
+ *
+ * Returns true if there was dropped events since the last peek.
+ */
+bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter)
+{
+	bool ret = iter->missed_events != 0;
+
+	iter->missed_events = 0;
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_iter_dropped);
+
 /**
  * ring_buffer_iter_peek - peek at the next event to be read
  * @iter: The ring buffer iterator
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 650fa81fffe8..5e634b9c1e0a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3388,11 +3388,15 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
 	struct ring_buffer_event *event;
 	struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, cpu);
 
-	if (buf_iter)
+	if (buf_iter) {
 		event = ring_buffer_iter_peek(buf_iter, ts);
-	else
+		if (lost_events)
+			*lost_events = ring_buffer_iter_dropped(buf_iter) ?
+				(unsigned long)-1 : 0;
+	} else {
 		event = ring_buffer_peek(iter->array_buffer->buffer, cpu, ts,
 					 lost_events);
+	}
 
 	if (event) {
 		iter->ent_size = ring_buffer_event_length(event);
@@ -4005,8 +4009,12 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
 	enum print_line_t ret;
 
 	if (iter->lost_events) {
-		trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
-				 iter->cpu, iter->lost_events);
+		if (iter->lost_events == (unsigned long)-1)
+			trace_seq_printf(&iter->seq, "CPU:%d [LOST EVENTS]\n",
+					 iter->cpu);
+		else
+			trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
+					 iter->cpu, iter->lost_events);
 		if (trace_seq_has_overflowed(&iter->seq))
 			return TRACE_TYPE_PARTIAL_LINE;
 	}

From 8a815e6b8b88bfbbaba7c484af0833551a825321 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 17 Mar 2020 17:32:33 -0400
Subject: [PATCH 19/27] tracing: Have the document reflect that the trace file
 keeps tracing enabled

Now that reading the trace file does not temporarly stop tracing while it is
open, update the document to reflect this fact.

Link: http://lkml.kernel.org/r/20200317213417.209675068@goodmis.org

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 Documentation/trace/ftrace.rst | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst
index c33950a35d65..00621731bf27 100644
--- a/Documentation/trace/ftrace.rst
+++ b/Documentation/trace/ftrace.rst
@@ -125,10 +125,13 @@ of ftrace. Here is a list of some of the key files:
   trace:
 
 	This file holds the output of the trace in a human
-	readable format (described below). Note, tracing is temporarily
-	disabled when the file is open for reading. Once all readers
-	are closed, tracing is re-enabled. Opening this file for
+	readable format (described below). Opening this file for
 	writing with the O_TRUNC flag clears the ring buffer content.
+        Note, this file is not a consumer. If tracing is off
+        (no tracer running, or tracing_on is zero), it will produce
+        the same output each time it is read. When tracing is on,
+        it may produce inconsistent results as it tries to read
+        the entire buffer without consuming it.
 
   trace_pipe:
 
@@ -142,9 +145,7 @@ of ftrace. Here is a list of some of the key files:
 	will not be read again with a sequential read. The
 	"trace" file is static, and if the tracer is not
 	adding more data, it will display the same
-	information every time it is read. Unlike the
-	"trace" file, opening this file for reading will not
-	temporarily disable tracing.
+	information every time it is read.
 
   trace_options:
 

From 6a13a0d7b4d1171ef9b80ad69abc37e1daa941b3 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Tue, 24 Mar 2020 16:34:48 +0900
Subject: [PATCH 20/27] ftrace/kprobe: Show the maxactive number on
 kprobe_events

Show maxactive parameter on kprobe_events.
This allows user to save the current configuration and
restore it without losing maxactive parameter.

Link: http://lkml.kernel.org/r/4762764a-6df7-bc93-ed60-e336146dce1f@gmail.com
Link: http://lkml.kernel.org/r/158503528846.22706.5549974121212526020.stgit@devnote2

Cc: stable@vger.kernel.org
Fixes: 696ced4fb1d76 ("tracing/kprobes: expose maxactive for kretprobe in kprobe_events")
Reported-by: Taeung Song <treeze.taeung@gmail.com>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_kprobe.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 362cca52f5de..d0568af4a0ef 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1078,6 +1078,8 @@ static int trace_kprobe_show(struct seq_file *m, struct dyn_event *ev)
 	int i;
 
 	seq_putc(m, trace_kprobe_is_return(tk) ? 'r' : 'p');
+	if (trace_kprobe_is_return(tk) && tk->rp.maxactive)
+		seq_printf(m, "%d", tk->rp.maxactive);
 	seq_printf(m, ":%s/%s", trace_probe_group_name(&tk->tp),
 				trace_probe_name(&tk->tp));
 

From 717e3f5ebc823e5ecfdd15155b0fd81af9fc58d6 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Thu, 19 Mar 2020 23:40:40 -0400
Subject: [PATCH 21/27] ftrace: Make function trace pid filtering a bit more
 exact

The set_ftrace_pid file is used to filter function tracing to only trace
tasks that are listed in that file. Instead of testing the pids listed in
that file (it's a bitmask) at each function trace event, the logic is done
via a sched_switch hook. A flag is set when the next task to run is in the
list of pids in the set_ftrace_pid file. But the sched_switch hook is not at
the exact location of when the task switches, and the flag gets set before
the task to be traced actually runs. This leaves a residue of traced
functions that do not belong to the pid that should be filtered on.

By changing the logic slightly, where instead of having  a boolean flag to
test, record the pid that should be traced, with special values for not to
trace and always trace. Then at each function call, a check will be made to
see if the function should be ignored, or if the current pid matches the
function that should be traced, and only trace if it matches (or if it has
the special value to always trace).

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 32 +++++++++++++++++++++++++-------
 kernel/trace/trace.h  |  4 ++--
 2 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 3f7ee102868a..34ae736cb1f8 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -139,13 +139,23 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops)
 #endif
 }
 
+#define FTRACE_PID_IGNORE	-1
+#define FTRACE_PID_TRACE	-2
+
 static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
 			    struct ftrace_ops *op, struct pt_regs *regs)
 {
 	struct trace_array *tr = op->private;
+	int pid;
 
-	if (tr && this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid))
-		return;
+	if (tr) {
+		pid = this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid);
+		if (pid == FTRACE_PID_IGNORE)
+			return;
+		if (pid != FTRACE_PID_TRACE &&
+		    pid != current->pid)
+			return;
+	}
 
 	op->saved_func(ip, parent_ip, op, regs);
 }
@@ -6924,8 +6934,12 @@ ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
 
 	pid_list = rcu_dereference_sched(tr->function_pids);
 
-	this_cpu_write(tr->array_buffer.data->ftrace_ignore_pid,
-		       trace_ignore_this_task(pid_list, next));
+	if (trace_ignore_this_task(pid_list, next))
+		this_cpu_write(tr->array_buffer.data->ftrace_ignore_pid,
+			       FTRACE_PID_IGNORE);
+	else
+		this_cpu_write(tr->array_buffer.data->ftrace_ignore_pid,
+			       next->pid);
 }
 
 static void
@@ -6978,7 +6992,7 @@ static void clear_ftrace_pids(struct trace_array *tr)
 	unregister_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr);
 
 	for_each_possible_cpu(cpu)
-		per_cpu_ptr(tr->array_buffer.data, cpu)->ftrace_ignore_pid = false;
+		per_cpu_ptr(tr->array_buffer.data, cpu)->ftrace_ignore_pid = FTRACE_PID_TRACE;
 
 	rcu_assign_pointer(tr->function_pids, NULL);
 
@@ -7103,8 +7117,12 @@ static void ignore_task_cpu(void *data)
 	pid_list = rcu_dereference_protected(tr->function_pids,
 					     mutex_is_locked(&ftrace_lock));
 
-	this_cpu_write(tr->array_buffer.data->ftrace_ignore_pid,
-		       trace_ignore_this_task(pid_list, current));
+	if (trace_ignore_this_task(pid_list, current))
+		this_cpu_write(tr->array_buffer.data->ftrace_ignore_pid,
+			       FTRACE_PID_IGNORE);
+	else
+		this_cpu_write(tr->array_buffer.data->ftrace_ignore_pid,
+			       current->pid);
 }
 
 static ssize_t
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f37e05135986..fdc72f5f0bb0 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -178,10 +178,10 @@ struct trace_array_cpu {
 	kuid_t			uid;
 	char			comm[TASK_COMM_LEN];
 
-	bool			ignore_pid;
 #ifdef CONFIG_FUNCTION_TRACER
-	bool			ftrace_ignore_pid;
+	int			ftrace_ignore_pid;
 #endif
+	bool			ignore_pid;
 };
 
 struct tracer;

From b3b1e6ededa4337940adba6cf06e8351056e3097 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Thu, 19 Mar 2020 23:19:06 -0400
Subject: [PATCH 22/27] ftrace: Create set_ftrace_notrace_pid to not trace
 tasks

There's currently a way to select a task that should only be traced by
functions, but there's no way to select a task not to be traced by the
function tracer. Add a set_ftrace_notrace_pid file that acts the same as
set_ftrace_pid (and is also affected by function-fork), but the task pids in
this file will not be traced even if they are listed in the set_ftrace_pid
file. This makes it easy for tools like trace-cmd to "hide" itself from the
function tracer when it is recording other tasks.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c       | 181 +++++++++++++++++++++++++++++++-----
 kernel/trace/trace.c        |  20 ++--
 kernel/trace/trace.h        |   2 +
 kernel/trace/trace_events.c |  12 +--
 4 files changed, 180 insertions(+), 35 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 34ae736cb1f8..7239d9acd09f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -102,7 +102,7 @@ static bool ftrace_pids_enabled(struct ftrace_ops *ops)
 
 	tr = ops->private;
 
-	return tr->function_pids != NULL;
+	return tr->function_pids != NULL || tr->function_no_pids != NULL;
 }
 
 static void ftrace_update_trampoline(struct ftrace_ops *ops);
@@ -6931,10 +6931,12 @@ ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
 {
 	struct trace_array *tr = data;
 	struct trace_pid_list *pid_list;
+	struct trace_pid_list *no_pid_list;
 
 	pid_list = rcu_dereference_sched(tr->function_pids);
+	no_pid_list = rcu_dereference_sched(tr->function_no_pids);
 
-	if (trace_ignore_this_task(pid_list, next))
+	if (trace_ignore_this_task(pid_list, no_pid_list, next))
 		this_cpu_write(tr->array_buffer.data->ftrace_ignore_pid,
 			       FTRACE_PID_IGNORE);
 	else
@@ -6952,6 +6954,9 @@ ftrace_pid_follow_sched_process_fork(void *data,
 
 	pid_list = rcu_dereference_sched(tr->function_pids);
 	trace_filter_add_remove_task(pid_list, self, task);
+
+	pid_list = rcu_dereference_sched(tr->function_no_pids);
+	trace_filter_add_remove_task(pid_list, self, task);
 }
 
 static void
@@ -6962,6 +6967,9 @@ ftrace_pid_follow_sched_process_exit(void *data, struct task_struct *task)
 
 	pid_list = rcu_dereference_sched(tr->function_pids);
 	trace_filter_add_remove_task(pid_list, NULL, task);
+
+	pid_list = rcu_dereference_sched(tr->function_no_pids);
+	trace_filter_add_remove_task(pid_list, NULL, task);
 }
 
 void ftrace_pid_follow_fork(struct trace_array *tr, bool enable)
@@ -6979,42 +6987,64 @@ void ftrace_pid_follow_fork(struct trace_array *tr, bool enable)
 	}
 }
 
-static void clear_ftrace_pids(struct trace_array *tr)
+enum {
+	TRACE_PIDS		= BIT(0),
+	TRACE_NO_PIDS		= BIT(1),
+};
+
+static void clear_ftrace_pids(struct trace_array *tr, int type)
 {
 	struct trace_pid_list *pid_list;
+	struct trace_pid_list *no_pid_list;
 	int cpu;
 
 	pid_list = rcu_dereference_protected(tr->function_pids,
 					     lockdep_is_held(&ftrace_lock));
-	if (!pid_list)
+	no_pid_list = rcu_dereference_protected(tr->function_no_pids,
+						lockdep_is_held(&ftrace_lock));
+
+	/* Make sure there's something to do */
+	if (!(((type & TRACE_PIDS) && pid_list) ||
+	      ((type & TRACE_NO_PIDS) && no_pid_list)))
 		return;
 
-	unregister_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr);
+	/* See if the pids still need to be checked after this */
+	if (!((!(type & TRACE_PIDS) && pid_list) ||
+	      (!(type & TRACE_NO_PIDS) && no_pid_list))) {
+		unregister_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr);
+		for_each_possible_cpu(cpu)
+			per_cpu_ptr(tr->array_buffer.data, cpu)->ftrace_ignore_pid = FTRACE_PID_TRACE;
+	}
 
-	for_each_possible_cpu(cpu)
-		per_cpu_ptr(tr->array_buffer.data, cpu)->ftrace_ignore_pid = FTRACE_PID_TRACE;
+	if (type & TRACE_PIDS)
+		rcu_assign_pointer(tr->function_pids, NULL);
 
-	rcu_assign_pointer(tr->function_pids, NULL);
+	if (type & TRACE_NO_PIDS)
+		rcu_assign_pointer(tr->function_no_pids, NULL);
 
 	/* Wait till all users are no longer using pid filtering */
 	synchronize_rcu();
 
-	trace_free_pid_list(pid_list);
+	if ((type & TRACE_PIDS) && pid_list)
+		trace_free_pid_list(pid_list);
+
+	if ((type & TRACE_NO_PIDS) && no_pid_list)
+		trace_free_pid_list(no_pid_list);
 }
 
 void ftrace_clear_pids(struct trace_array *tr)
 {
 	mutex_lock(&ftrace_lock);
 
-	clear_ftrace_pids(tr);
+	clear_ftrace_pids(tr, TRACE_PIDS | TRACE_NO_PIDS);
 
 	mutex_unlock(&ftrace_lock);
 }
 
-static void ftrace_pid_reset(struct trace_array *tr)
+static void ftrace_pid_reset(struct trace_array *tr, int type)
 {
 	mutex_lock(&ftrace_lock);
-	clear_ftrace_pids(tr);
+	clear_ftrace_pids(tr, type);
 
 	ftrace_update_pid_func();
 	ftrace_startup_all(0);
@@ -7078,9 +7108,45 @@ static const struct seq_operations ftrace_pid_sops = {
 	.show = fpid_show,
 };
 
-static int
-ftrace_pid_open(struct inode *inode, struct file *file)
+static void *fnpid_start(struct seq_file *m, loff_t *pos)
+	__acquires(RCU)
 {
+	struct trace_pid_list *pid_list;
+	struct trace_array *tr = m->private;
+
+	mutex_lock(&ftrace_lock);
+	rcu_read_lock_sched();
+
+	pid_list = rcu_dereference_sched(tr->function_no_pids);
+
+	if (!pid_list)
+		return !(*pos) ? FTRACE_NO_PIDS : NULL;
+
+	return trace_pid_start(pid_list, pos);
+}
+
+static void *fnpid_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct trace_array *tr = m->private;
+	struct trace_pid_list *pid_list = rcu_dereference_sched(tr->function_no_pids);
+
+	if (v == FTRACE_NO_PIDS) {
+		(*pos)++;
+		return NULL;
+	}
+	return trace_pid_next(pid_list, v, pos);
+}
+
+static const struct seq_operations ftrace_no_pid_sops = {
+	.start = fnpid_start,
+	.next = fnpid_next,
+	.stop = fpid_stop,
+	.show = fpid_show,
+};
+
+static int pid_open(struct inode *inode, struct file *file, int type)
+{
+	const struct seq_operations *seq_ops;
 	struct trace_array *tr = inode->i_private;
 	struct seq_file *m;
 	int ret = 0;
@@ -7091,9 +7157,18 @@ ftrace_pid_open(struct inode *inode, struct file *file)
 
 	if ((file->f_mode & FMODE_WRITE) &&
 	    (file->f_flags & O_TRUNC))
-		ftrace_pid_reset(tr);
+		ftrace_pid_reset(tr, type);
 
-	ret = seq_open(file, &ftrace_pid_sops);
+	switch (type) {
+	case TRACE_PIDS:
+		seq_ops = &ftrace_pid_sops;
+		break;
+	case TRACE_NO_PIDS:
+		seq_ops = &ftrace_no_pid_sops;
+		break;
+	}
+
+	ret = seq_open(file, seq_ops);
 	if (ret < 0) {
 		trace_array_put(tr);
 	} else {
@@ -7105,10 +7180,23 @@ ftrace_pid_open(struct inode *inode, struct file *file)
 	return ret;
 }
 
+static int
+ftrace_pid_open(struct inode *inode, struct file *file)
+{
+	return pid_open(inode, file, TRACE_PIDS);
+}
+
+static int
+ftrace_no_pid_open(struct inode *inode, struct file *file)
+{
+	return pid_open(inode, file, TRACE_NO_PIDS);
+}
+
 static void ignore_task_cpu(void *data)
 {
 	struct trace_array *tr = data;
 	struct trace_pid_list *pid_list;
+	struct trace_pid_list *no_pid_list;
 
 	/*
 	 * This function is called by on_each_cpu() while the
@@ -7116,8 +7204,10 @@ static void ignore_task_cpu(void *data)
 	 */
 	pid_list = rcu_dereference_protected(tr->function_pids,
 					     mutex_is_locked(&ftrace_lock));
+	no_pid_list = rcu_dereference_protected(tr->function_no_pids,
+						mutex_is_locked(&ftrace_lock));
 
-	if (trace_ignore_this_task(pid_list, current))
+	if (trace_ignore_this_task(pid_list, no_pid_list, current))
 		this_cpu_write(tr->array_buffer.data->ftrace_ignore_pid,
 			       FTRACE_PID_IGNORE);
 	else
@@ -7126,12 +7216,13 @@ static void ignore_task_cpu(void *data)
 }
 
 static ssize_t
-ftrace_pid_write(struct file *filp, const char __user *ubuf,
-		   size_t cnt, loff_t *ppos)
+pid_write(struct file *filp, const char __user *ubuf,
+	  size_t cnt, loff_t *ppos, int type)
 {
 	struct seq_file *m = filp->private_data;
 	struct trace_array *tr = m->private;
-	struct trace_pid_list *filtered_pids = NULL;
+	struct trace_pid_list *filtered_pids;
+	struct trace_pid_list *other_pids;
 	struct trace_pid_list *pid_list;
 	ssize_t ret;
 
@@ -7140,19 +7231,39 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf,
 
 	mutex_lock(&ftrace_lock);
 
-	filtered_pids = rcu_dereference_protected(tr->function_pids,
+	switch (type) {
+	case TRACE_PIDS:
+		filtered_pids = rcu_dereference_protected(tr->function_pids,
 					     lockdep_is_held(&ftrace_lock));
+		other_pids = rcu_dereference_protected(tr->function_no_pids,
+					     lockdep_is_held(&ftrace_lock));
+		break;
+	case TRACE_NO_PIDS:
+		filtered_pids = rcu_dereference_protected(tr->function_no_pids,
+					     lockdep_is_held(&ftrace_lock));
+		other_pids = rcu_dereference_protected(tr->function_pids,
+					     lockdep_is_held(&ftrace_lock));
+		break;
+	}
 
 	ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt);
 	if (ret < 0)
 		goto out;
 
-	rcu_assign_pointer(tr->function_pids, pid_list);
+	switch (type) {
+	case TRACE_PIDS:
+		rcu_assign_pointer(tr->function_pids, pid_list);
+		break;
+	case TRACE_NO_PIDS:
+		rcu_assign_pointer(tr->function_no_pids, pid_list);
+		break;
+	}
+
 
 	if (filtered_pids) {
 		synchronize_rcu();
 		trace_free_pid_list(filtered_pids);
-	} else if (pid_list) {
+	} else if (pid_list && !other_pids) {
 		/* Register a probe to set whether to ignore the tracing of a task */
 		register_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr);
 	}
@@ -7175,6 +7286,20 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf,
 	return ret;
 }
 
+static ssize_t
+ftrace_pid_write(struct file *filp, const char __user *ubuf,
+		 size_t cnt, loff_t *ppos)
+{
+	return pid_write(filp, ubuf, cnt, ppos, TRACE_PIDS);
+}
+
+static ssize_t
+ftrace_no_pid_write(struct file *filp, const char __user *ubuf,
+		    size_t cnt, loff_t *ppos)
+{
+	return pid_write(filp, ubuf, cnt, ppos, TRACE_NO_PIDS);
+}
+
 static int
 ftrace_pid_release(struct inode *inode, struct file *file)
 {
@@ -7193,10 +7318,20 @@ static const struct file_operations ftrace_pid_fops = {
 	.release	= ftrace_pid_release,
 };
 
+static const struct file_operations ftrace_no_pid_fops = {
+	.open		= ftrace_no_pid_open,
+	.write		= ftrace_no_pid_write,
+	.read		= seq_read,
+	.llseek		= tracing_lseek,
+	.release	= ftrace_pid_release,
+};
+
 void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer)
 {
 	trace_create_file("set_ftrace_pid", 0644, d_tracer,
 			    tr, &ftrace_pid_fops);
+	trace_create_file("set_ftrace_notrace_pid", 0644, d_tracer,
+			    tr, &ftrace_no_pid_fops);
 }
 
 void __init ftrace_init_tracefs_toplevel(struct trace_array *tr,
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5e634b9c1e0a..6519b7afc499 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -386,16 +386,22 @@ trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
  * Returns false if @task should be traced.
  */
 bool
-trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
+trace_ignore_this_task(struct trace_pid_list *filtered_pids,
+		       struct trace_pid_list *filtered_no_pids,
+		       struct task_struct *task)
 {
 	/*
-	 * Return false, because if filtered_pids does not exist,
-	 * all pids are good to trace.
+	 * If filterd_no_pids is not empty, and the task's pid is listed
+	 * in filtered_no_pids, then return true.
+	 * Otherwise, if filtered_pids is empty, that means we can
+	 * trace all tasks. If it has content, then only trace pids
+	 * within filtered_pids.
 	 */
-	if (!filtered_pids)
-		return false;
 
-	return !trace_find_filtered_pid(filtered_pids, task->pid);
+	return (filtered_pids &&
+		!trace_find_filtered_pid(filtered_pids, task->pid)) ||
+		(filtered_no_pids &&
+		 trace_find_filtered_pid(filtered_no_pids, task->pid));
 }
 
 /**
@@ -5013,6 +5019,8 @@ static const char readme_msg[] =
 #ifdef CONFIG_FUNCTION_TRACER
 	"  set_ftrace_pid\t- Write pid(s) to only function trace those pids\n"
 	"\t\t    (function)\n"
+	"  set_ftrace_notrace_pid\t- Write pid(s) to not function trace those pids\n"
+	"\t\t    (function)\n"
 #endif
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 	"  set_graph_function\t- Trace the nested calls of a function (function_graph)\n"
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index fdc72f5f0bb0..6b5ff5adb4ad 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -331,6 +331,7 @@ struct trace_array {
 #ifdef CONFIG_FUNCTION_TRACER
 	struct ftrace_ops	*ops;
 	struct trace_pid_list	__rcu *function_pids;
+	struct trace_pid_list	__rcu *function_no_pids;
 #ifdef CONFIG_DYNAMIC_FTRACE
 	/* All of these are protected by the ftrace_lock */
 	struct list_head	func_probes;
@@ -782,6 +783,7 @@ extern int pid_max;
 bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids,
 			     pid_t search_pid);
 bool trace_ignore_this_task(struct trace_pid_list *filtered_pids,
+			    struct trace_pid_list *filtered_no_pids,
 			    struct task_struct *task);
 void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
 				  struct task_struct *self,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f38234ecea18..c196d0dc5871 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -549,8 +549,8 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
 	pid_list = rcu_dereference_sched(tr->filtered_pids);
 
 	this_cpu_write(tr->array_buffer.data->ignore_pid,
-		       trace_ignore_this_task(pid_list, prev) &&
-		       trace_ignore_this_task(pid_list, next));
+		       trace_ignore_this_task(pid_list, NULL, prev) &&
+		       trace_ignore_this_task(pid_list, NULL, next));
 }
 
 static void
@@ -563,7 +563,7 @@ event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
 	pid_list = rcu_dereference_sched(tr->filtered_pids);
 
 	this_cpu_write(tr->array_buffer.data->ignore_pid,
-		       trace_ignore_this_task(pid_list, next));
+		       trace_ignore_this_task(pid_list, NULL, next));
 }
 
 static void
@@ -579,7 +579,7 @@ event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task)
 	pid_list = rcu_dereference_sched(tr->filtered_pids);
 
 	this_cpu_write(tr->array_buffer.data->ignore_pid,
-		       trace_ignore_this_task(pid_list, task));
+		       trace_ignore_this_task(pid_list, NULL, task));
 }
 
 static void
@@ -596,7 +596,7 @@ event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task)
 
 	/* Set tracing if current is enabled */
 	this_cpu_write(tr->array_buffer.data->ignore_pid,
-		       trace_ignore_this_task(pid_list, current));
+		       trace_ignore_this_task(pid_list, NULL, current));
 }
 
 static void __ftrace_clear_event_pids(struct trace_array *tr)
@@ -1597,7 +1597,7 @@ static void ignore_task_cpu(void *data)
 					     mutex_is_locked(&event_mutex));
 
 	this_cpu_write(tr->array_buffer.data->ignore_pid,
-		       trace_ignore_this_task(pid_list, current));
+		       trace_ignore_this_task(pid_list, NULL, current));
 }
 
 static ssize_t

From 2768362603018da2be44ae4d01f22406152db05a Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Wed, 25 Mar 2020 19:51:19 -0400
Subject: [PATCH 23/27] tracing: Create set_event_notrace_pid to not trace
 tasks

There's currently a way to select a task that should only have its events
traced, but there's no way to select a task not to have itsevents traced.
Add a set_event_notrace_pid file that acts the same as set_event_pid (and is
also affected by event-fork), but the task pids in this file will not be
traced even if they are listed in the set_event_pid file. This makes it easy
for tools like trace-cmd to "hide" itself from beint traced by events when
it is recording other tasks.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c       |  11 +-
 kernel/trace/trace.h        |  25 ++++
 kernel/trace/trace_events.c | 280 ++++++++++++++++++++++++++++--------
 3 files changed, 244 insertions(+), 72 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 7239d9acd09f..0bb62e64280c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -6987,11 +6987,6 @@ void ftrace_pid_follow_fork(struct trace_array *tr, bool enable)
 	}
 }
 
-enum {
-	TRACE_PIDS		= BIT(0),
-	TRACE_NO_PIDS		= BIT(1),
-};
-
 static void clear_ftrace_pids(struct trace_array *tr, int type)
 {
 	struct trace_pid_list *pid_list;
@@ -7004,13 +6999,11 @@ static void clear_ftrace_pids(struct trace_array *tr, int type)
 						lockdep_is_held(&ftrace_lock));
 
 	/* Make sure there's something to do */
-	if (!(((type & TRACE_PIDS) && pid_list) ||
-	      ((type & TRACE_NO_PIDS) && no_pid_list)))
+	if (!pid_type_enabled(type, pid_list, no_pid_list))
 		return;
 
 	/* See if the pids still need to be checked after this */
-	if (!((!(type & TRACE_PIDS) && pid_list) ||
-	      (!(type & TRACE_NO_PIDS) && no_pid_list))) {
+	if (!still_need_pid_events(type, pid_list, no_pid_list)) {
 		unregister_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr);
 		for_each_possible_cpu(cpu)
 			per_cpu_ptr(tr->array_buffer.data, cpu)->ftrace_ignore_pid = FTRACE_PID_TRACE;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6b5ff5adb4ad..4eb1d004d5f2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -207,6 +207,30 @@ struct trace_pid_list {
 	unsigned long			*pids;
 };
 
+enum {
+	TRACE_PIDS		= BIT(0),
+	TRACE_NO_PIDS		= BIT(1),
+};
+
+static inline bool pid_type_enabled(int type, struct trace_pid_list *pid_list,
+				    struct trace_pid_list *no_pid_list)
+{
+	/* Return true if the pid list in type has pids */
+	return ((type & TRACE_PIDS) && pid_list) ||
+		((type & TRACE_NO_PIDS) && no_pid_list);
+}
+
+static inline bool still_need_pid_events(int type, struct trace_pid_list *pid_list,
+					 struct trace_pid_list *no_pid_list)
+{
+	/*
+	 * Turning off what is in @type, return true if the "other"
+	 * pid list, still has pids in it.
+	 */
+	return (!(type & TRACE_PIDS) && pid_list) ||
+		(!(type & TRACE_NO_PIDS) && no_pid_list);
+}
+
 typedef bool (*cond_update_fn_t)(struct trace_array *tr, void *cond_data);
 
 /**
@@ -285,6 +309,7 @@ struct trace_array {
 #endif
 #endif
 	struct trace_pid_list	__rcu *filtered_pids;
+	struct trace_pid_list	__rcu *filtered_no_pids;
 	/*
 	 * max_lock is used to protect the swapping of buffers
 	 * when taking a max snapshot. The buffers themselves are
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index c196d0dc5871..242f59e7f17d 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -232,10 +232,13 @@ bool trace_event_ignore_this_pid(struct trace_event_file *trace_file)
 {
 	struct trace_array *tr = trace_file->tr;
 	struct trace_array_cpu *data;
+	struct trace_pid_list *no_pid_list;
 	struct trace_pid_list *pid_list;
 
 	pid_list = rcu_dereference_raw(tr->filtered_pids);
-	if (!pid_list)
+	no_pid_list = rcu_dereference_raw(tr->filtered_no_pids);
+
+	if (!pid_list && !no_pid_list)
 		return false;
 
 	data = this_cpu_ptr(tr->array_buffer.data);
@@ -510,6 +513,9 @@ event_filter_pid_sched_process_exit(void *data, struct task_struct *task)
 
 	pid_list = rcu_dereference_raw(tr->filtered_pids);
 	trace_filter_add_remove_task(pid_list, NULL, task);
+
+	pid_list = rcu_dereference_raw(tr->filtered_no_pids);
+	trace_filter_add_remove_task(pid_list, NULL, task);
 }
 
 static void
@@ -522,6 +528,9 @@ event_filter_pid_sched_process_fork(void *data,
 
 	pid_list = rcu_dereference_sched(tr->filtered_pids);
 	trace_filter_add_remove_task(pid_list, self, task);
+
+	pid_list = rcu_dereference_sched(tr->filtered_no_pids);
+	trace_filter_add_remove_task(pid_list, self, task);
 }
 
 void trace_event_follow_fork(struct trace_array *tr, bool enable)
@@ -544,13 +553,23 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
 		    struct task_struct *prev, struct task_struct *next)
 {
 	struct trace_array *tr = data;
+	struct trace_pid_list *no_pid_list;
 	struct trace_pid_list *pid_list;
+	bool ret;
 
 	pid_list = rcu_dereference_sched(tr->filtered_pids);
+	no_pid_list = rcu_dereference_sched(tr->filtered_no_pids);
 
-	this_cpu_write(tr->array_buffer.data->ignore_pid,
-		       trace_ignore_this_task(pid_list, NULL, prev) &&
-		       trace_ignore_this_task(pid_list, NULL, next));
+	/*
+	 * Sched switch is funny, as we only want to ignore it
+	 * in the notrace case if both prev and next should be ignored.
+	 */
+	ret = trace_ignore_this_task(NULL, no_pid_list, prev) &&
+		trace_ignore_this_task(NULL, no_pid_list, next);
+
+	this_cpu_write(tr->array_buffer.data->ignore_pid, ret ||
+		       (trace_ignore_this_task(pid_list, NULL, prev) &&
+			trace_ignore_this_task(pid_list, NULL, next)));
 }
 
 static void
@@ -558,18 +577,21 @@ event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
 		    struct task_struct *prev, struct task_struct *next)
 {
 	struct trace_array *tr = data;
+	struct trace_pid_list *no_pid_list;
 	struct trace_pid_list *pid_list;
 
 	pid_list = rcu_dereference_sched(tr->filtered_pids);
+	no_pid_list = rcu_dereference_sched(tr->filtered_no_pids);
 
 	this_cpu_write(tr->array_buffer.data->ignore_pid,
-		       trace_ignore_this_task(pid_list, NULL, next));
+		       trace_ignore_this_task(pid_list, no_pid_list, next));
 }
 
 static void
 event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task)
 {
 	struct trace_array *tr = data;
+	struct trace_pid_list *no_pid_list;
 	struct trace_pid_list *pid_list;
 
 	/* Nothing to do if we are already tracing */
@@ -577,15 +599,17 @@ event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task)
 		return;
 
 	pid_list = rcu_dereference_sched(tr->filtered_pids);
+	no_pid_list = rcu_dereference_sched(tr->filtered_no_pids);
 
 	this_cpu_write(tr->array_buffer.data->ignore_pid,
-		       trace_ignore_this_task(pid_list, NULL, task));
+		       trace_ignore_this_task(pid_list, no_pid_list, task));
 }
 
 static void
 event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task)
 {
 	struct trace_array *tr = data;
+	struct trace_pid_list *no_pid_list;
 	struct trace_pid_list *pid_list;
 
 	/* Nothing to do if we are not tracing */
@@ -593,23 +617,15 @@ event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task)
 		return;
 
 	pid_list = rcu_dereference_sched(tr->filtered_pids);
+	no_pid_list = rcu_dereference_sched(tr->filtered_no_pids);
 
 	/* Set tracing if current is enabled */
 	this_cpu_write(tr->array_buffer.data->ignore_pid,
-		       trace_ignore_this_task(pid_list, NULL, current));
+		       trace_ignore_this_task(pid_list, no_pid_list, current));
 }
 
-static void __ftrace_clear_event_pids(struct trace_array *tr)
+static void unregister_pid_events(struct trace_array *tr)
 {
-	struct trace_pid_list *pid_list;
-	struct trace_event_file *file;
-	int cpu;
-
-	pid_list = rcu_dereference_protected(tr->filtered_pids,
-					     lockdep_is_held(&event_mutex));
-	if (!pid_list)
-		return;
-
 	unregister_trace_sched_switch(event_filter_pid_sched_switch_probe_pre, tr);
 	unregister_trace_sched_switch(event_filter_pid_sched_switch_probe_post, tr);
 
@@ -621,26 +637,55 @@ static void __ftrace_clear_event_pids(struct trace_array *tr)
 
 	unregister_trace_sched_waking(event_filter_pid_sched_wakeup_probe_pre, tr);
 	unregister_trace_sched_waking(event_filter_pid_sched_wakeup_probe_post, tr);
+}
 
-	list_for_each_entry(file, &tr->events, list) {
-		clear_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags);
+static void __ftrace_clear_event_pids(struct trace_array *tr, int type)
+{
+	struct trace_pid_list *pid_list;
+	struct trace_pid_list *no_pid_list;
+	struct trace_event_file *file;
+	int cpu;
+
+	pid_list = rcu_dereference_protected(tr->filtered_pids,
+					     lockdep_is_held(&event_mutex));
+	no_pid_list = rcu_dereference_protected(tr->filtered_no_pids,
+					     lockdep_is_held(&event_mutex));
+
+	/* Make sure there's something to do */
+	if (!pid_type_enabled(type, pid_list, no_pid_list))
+		return;
+
+	if (!still_need_pid_events(type, pid_list, no_pid_list)) {
+		unregister_pid_events(tr);
+
+		list_for_each_entry(file, &tr->events, list) {
+			clear_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags);
+		}
+
+		for_each_possible_cpu(cpu)
+			per_cpu_ptr(tr->array_buffer.data, cpu)->ignore_pid = false;
 	}
 
-	for_each_possible_cpu(cpu)
-		per_cpu_ptr(tr->array_buffer.data, cpu)->ignore_pid = false;
+	if (type & TRACE_PIDS)
+		rcu_assign_pointer(tr->filtered_pids, NULL);
 
-	rcu_assign_pointer(tr->filtered_pids, NULL);
+	if (type & TRACE_NO_PIDS)
+		rcu_assign_pointer(tr->filtered_no_pids, NULL);
 
 	/* Wait till all users are no longer using pid filtering */
 	tracepoint_synchronize_unregister();
 
-	trace_free_pid_list(pid_list);
+	if ((type & TRACE_PIDS) && pid_list)
+		trace_free_pid_list(pid_list);
+
+	if ((type & TRACE_NO_PIDS) && no_pid_list)
+		trace_free_pid_list(no_pid_list);
 }
 
-static void ftrace_clear_event_pids(struct trace_array *tr)
+static void ftrace_clear_event_pids(struct trace_array *tr, int type)
 {
 	mutex_lock(&event_mutex);
-	__ftrace_clear_event_pids(tr);
+	__ftrace_clear_event_pids(tr, type);
 	mutex_unlock(&event_mutex);
 }
 
@@ -1013,15 +1058,32 @@ static void t_stop(struct seq_file *m, void *p)
 }
 
 static void *
-p_next(struct seq_file *m, void *v, loff_t *pos)
+__next(struct seq_file *m, void *v, loff_t *pos, int type)
 {
 	struct trace_array *tr = m->private;
-	struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids);
+	struct trace_pid_list *pid_list;
+
+	if (type == TRACE_PIDS)
+		pid_list = rcu_dereference_sched(tr->filtered_pids);
+	else
+		pid_list = rcu_dereference_sched(tr->filtered_no_pids);
 
 	return trace_pid_next(pid_list, v, pos);
 }
 
-static void *p_start(struct seq_file *m, loff_t *pos)
+static void *
+p_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	return __next(m, v, pos, TRACE_PIDS);
+}
+
+static void *
+np_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	return __next(m, v, pos, TRACE_NO_PIDS);
+}
+
+static void *__start(struct seq_file *m, loff_t *pos, int type)
 	__acquires(RCU)
 {
 	struct trace_pid_list *pid_list;
@@ -1036,7 +1098,10 @@ static void *p_start(struct seq_file *m, loff_t *pos)
 	mutex_lock(&event_mutex);
 	rcu_read_lock_sched();
 
-	pid_list = rcu_dereference_sched(tr->filtered_pids);
+	if (type == TRACE_PIDS)
+		pid_list = rcu_dereference_sched(tr->filtered_pids);
+	else
+		pid_list = rcu_dereference_sched(tr->filtered_no_pids);
 
 	if (!pid_list)
 		return NULL;
@@ -1044,6 +1109,18 @@ static void *p_start(struct seq_file *m, loff_t *pos)
 	return trace_pid_start(pid_list, pos);
 }
 
+static void *p_start(struct seq_file *m, loff_t *pos)
+	__acquires(RCU)
+{
+	return __start(m, pos, TRACE_PIDS);
+}
+
+static void *np_start(struct seq_file *m, loff_t *pos)
+	__acquires(RCU)
+{
+	return __start(m, pos, TRACE_NO_PIDS);
+}
+
 static void p_stop(struct seq_file *m, void *p)
 	__releases(RCU)
 {
@@ -1588,6 +1665,7 @@ static void ignore_task_cpu(void *data)
 {
 	struct trace_array *tr = data;
 	struct trace_pid_list *pid_list;
+	struct trace_pid_list *no_pid_list;
 
 	/*
 	 * This function is called by on_each_cpu() while the
@@ -1595,18 +1673,50 @@ static void ignore_task_cpu(void *data)
 	 */
 	pid_list = rcu_dereference_protected(tr->filtered_pids,
 					     mutex_is_locked(&event_mutex));
+	no_pid_list = rcu_dereference_protected(tr->filtered_no_pids,
+					     mutex_is_locked(&event_mutex));
 
 	this_cpu_write(tr->array_buffer.data->ignore_pid,
-		       trace_ignore_this_task(pid_list, NULL, current));
+		       trace_ignore_this_task(pid_list, no_pid_list, current));
+}
+
+static void register_pid_events(struct trace_array *tr)
+{
+	/*
+	 * Register a probe that is called before all other probes
+	 * to set ignore_pid if next or prev do not match.
+	 * Register a probe this is called after all other probes
+	 * to only keep ignore_pid set if next pid matches.
+	 */
+	register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_pre,
+					 tr, INT_MAX);
+	register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_post,
+					 tr, 0);
+
+	register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_pre,
+					 tr, INT_MAX);
+	register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_post,
+					 tr, 0);
+
+	register_trace_prio_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_pre,
+					     tr, INT_MAX);
+	register_trace_prio_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_post,
+					     tr, 0);
+
+	register_trace_prio_sched_waking(event_filter_pid_sched_wakeup_probe_pre,
+					 tr, INT_MAX);
+	register_trace_prio_sched_waking(event_filter_pid_sched_wakeup_probe_post,
+					 tr, 0);
 }
 
 static ssize_t
-ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
-		       size_t cnt, loff_t *ppos)
+event_pid_write(struct file *filp, const char __user *ubuf,
+		size_t cnt, loff_t *ppos, int type)
 {
 	struct seq_file *m = filp->private_data;
 	struct trace_array *tr = m->private;
 	struct trace_pid_list *filtered_pids = NULL;
+	struct trace_pid_list *other_pids = NULL;
 	struct trace_pid_list *pid_list;
 	struct trace_event_file *file;
 	ssize_t ret;
@@ -1620,14 +1730,26 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
 
 	mutex_lock(&event_mutex);
 
-	filtered_pids = rcu_dereference_protected(tr->filtered_pids,
-					     lockdep_is_held(&event_mutex));
+	if (type == TRACE_PIDS) {
+		filtered_pids = rcu_dereference_protected(tr->filtered_pids,
+							  lockdep_is_held(&event_mutex));
+		other_pids = rcu_dereference_protected(tr->filtered_no_pids,
+							  lockdep_is_held(&event_mutex));
+	} else {
+		filtered_pids = rcu_dereference_protected(tr->filtered_no_pids,
+							  lockdep_is_held(&event_mutex));
+		other_pids = rcu_dereference_protected(tr->filtered_pids,
+							  lockdep_is_held(&event_mutex));
+	}
 
 	ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt);
 	if (ret < 0)
 		goto out;
 
-	rcu_assign_pointer(tr->filtered_pids, pid_list);
+	if (type == TRACE_PIDS)
+		rcu_assign_pointer(tr->filtered_pids, pid_list);
+	else
+		rcu_assign_pointer(tr->filtered_no_pids, pid_list);
 
 	list_for_each_entry(file, &tr->events, list) {
 		set_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags);
@@ -1636,32 +1758,8 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
 	if (filtered_pids) {
 		tracepoint_synchronize_unregister();
 		trace_free_pid_list(filtered_pids);
-	} else if (pid_list) {
-		/*
-		 * Register a probe that is called before all other probes
-		 * to set ignore_pid if next or prev do not match.
-		 * Register a probe this is called after all other probes
-		 * to only keep ignore_pid set if next pid matches.
-		 */
-		register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_pre,
-						 tr, INT_MAX);
-		register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_post,
-						 tr, 0);
-
-		register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_pre,
-						 tr, INT_MAX);
-		register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_post,
-						 tr, 0);
-
-		register_trace_prio_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_pre,
-						     tr, INT_MAX);
-		register_trace_prio_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_post,
-						     tr, 0);
-
-		register_trace_prio_sched_waking(event_filter_pid_sched_wakeup_probe_pre,
-						 tr, INT_MAX);
-		register_trace_prio_sched_waking(event_filter_pid_sched_wakeup_probe_post,
-						 tr, 0);
+	} else if (pid_list && !other_pids) {
+		register_pid_events(tr);
 	}
 
 	/*
@@ -1680,9 +1778,24 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
 	return ret;
 }
 
+static ssize_t
+ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
+		       size_t cnt, loff_t *ppos)
+{
+	return event_pid_write(filp, ubuf, cnt, ppos, TRACE_PIDS);
+}
+
+static ssize_t
+ftrace_event_npid_write(struct file *filp, const char __user *ubuf,
+			size_t cnt, loff_t *ppos)
+{
+	return event_pid_write(filp, ubuf, cnt, ppos, TRACE_NO_PIDS);
+}
+
 static int ftrace_event_avail_open(struct inode *inode, struct file *file);
 static int ftrace_event_set_open(struct inode *inode, struct file *file);
 static int ftrace_event_set_pid_open(struct inode *inode, struct file *file);
+static int ftrace_event_set_npid_open(struct inode *inode, struct file *file);
 static int ftrace_event_release(struct inode *inode, struct file *file);
 
 static const struct seq_operations show_event_seq_ops = {
@@ -1706,6 +1819,13 @@ static const struct seq_operations show_set_pid_seq_ops = {
 	.stop = p_stop,
 };
 
+static const struct seq_operations show_set_no_pid_seq_ops = {
+	.start = np_start,
+	.next = np_next,
+	.show = trace_pid_show,
+	.stop = p_stop,
+};
+
 static const struct file_operations ftrace_avail_fops = {
 	.open = ftrace_event_avail_open,
 	.read = seq_read,
@@ -1729,6 +1849,14 @@ static const struct file_operations ftrace_set_event_pid_fops = {
 	.release = ftrace_event_release,
 };
 
+static const struct file_operations ftrace_set_event_notrace_pid_fops = {
+	.open = ftrace_event_set_npid_open,
+	.read = seq_read,
+	.write = ftrace_event_npid_write,
+	.llseek = seq_lseek,
+	.release = ftrace_event_release,
+};
+
 static const struct file_operations ftrace_enable_fops = {
 	.open = tracing_open_generic,
 	.read = event_enable_read,
@@ -1858,7 +1986,28 @@ ftrace_event_set_pid_open(struct inode *inode, struct file *file)
 
 	if ((file->f_mode & FMODE_WRITE) &&
 	    (file->f_flags & O_TRUNC))
-		ftrace_clear_event_pids(tr);
+		ftrace_clear_event_pids(tr, TRACE_PIDS);
+
+	ret = ftrace_event_open(inode, file, seq_ops);
+	if (ret < 0)
+		trace_array_put(tr);
+	return ret;
+}
+
+static int
+ftrace_event_set_npid_open(struct inode *inode, struct file *file)
+{
+	const struct seq_operations *seq_ops = &show_set_no_pid_seq_ops;
+	struct trace_array *tr = inode->i_private;
+	int ret;
+
+	ret = tracing_check_open_get_tr(tr);
+	if (ret)
+		return ret;
+
+	if ((file->f_mode & FMODE_WRITE) &&
+	    (file->f_flags & O_TRUNC))
+		ftrace_clear_event_pids(tr, TRACE_NO_PIDS);
 
 	ret = ftrace_event_open(inode, file, seq_ops);
 	if (ret < 0)
@@ -3075,6 +3224,11 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
 	if (!entry)
 		pr_warn("Could not create tracefs 'set_event_pid' entry\n");
 
+	entry = tracefs_create_file("set_event_notrace_pid", 0644, parent,
+				    tr, &ftrace_set_event_notrace_pid_fops);
+	if (!entry)
+		pr_warn("Could not create tracefs 'set_event_notrace_pid' entry\n");
+
 	/* ring buffer internal formats */
 	entry = trace_create_file("header_page", 0444, d_events,
 				  ring_buffer_print_page_header,
@@ -3158,7 +3312,7 @@ int event_trace_del_tracer(struct trace_array *tr)
 	clear_event_triggers(tr);
 
 	/* Clear the pid list */
-	__ftrace_clear_event_pids(tr);
+	__ftrace_clear_event_pids(tr, TRACE_PIDS | TRACE_NO_PIDS);
 
 	/* Disable any running events */
 	__ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0);

From ed8839e072b8f4a8e17575fd22fe5348fe6d6a2a Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Thu, 26 Mar 2020 13:11:25 -0400
Subject: [PATCH 24/27] selftests/ftrace: Add test to test new
 set_ftrace_notrace_pid file

A new file was added to the tracing directory that will allow a user to
place a PID into it and the task associated to that PID will not be traced
by the function tracer. If the function-fork option is enabled, then neither
will the children of that task be traced by the function tracer.

Cc: linux-kselftest@vger.kernel.org
Cc: Shuah Khan <skhan@linuxfoundation.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 .../test.d/ftrace/func-filter-notrace-pid.tc  | 108 ++++++++++++++++++
 1 file changed, 108 insertions(+)
 create mode 100644 tools/testing/selftests/ftrace/test.d/ftrace/func-filter-notrace-pid.tc

diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-notrace-pid.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-notrace-pid.tc
new file mode 100644
index 000000000000..8aa46a2ea133
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-notrace-pid.tc
@@ -0,0 +1,108 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: ftrace - function pid notrace filters
+# flags: instance
+
+# Make sure that function pid matching filter with notrace works.
+
+if ! grep -q function available_tracers; then
+    echo "no function tracer configured"
+    exit_unsupported
+fi
+
+if [ ! -f set_ftrace_notrace_pid ]; then
+    echo "set_ftrace_notrace_pid not found? Is function tracer not set?"
+    exit_unsupported
+fi
+
+if [ ! -f set_ftrace_filter ]; then
+    echo "set_ftrace_filter not found? Is function tracer not set?"
+    exit_unsupported
+fi
+
+do_function_fork=1
+
+if [ ! -f options/function-fork ]; then
+    do_function_fork=0
+    echo "no option for function-fork found. Option will not be tested."
+fi
+
+read PID _ < /proc/self/stat
+
+if [ $do_function_fork -eq 1 ]; then
+    # default value of function-fork option
+    orig_value=`grep function-fork trace_options`
+fi
+
+do_reset() {
+    if [ $do_function_fork -eq 0 ]; then
+	return
+    fi
+
+    echo > set_ftrace_notrace_pid
+    echo $orig_value > trace_options
+}
+
+fail() { # msg
+    do_reset
+    echo $1
+    exit_fail
+}
+
+do_test() {
+    disable_tracing
+
+    echo do_execve* > set_ftrace_filter
+    echo *do_fork >> set_ftrace_filter
+
+    echo $PID > set_ftrace_notrace_pid
+    echo function > current_tracer
+
+    if [ $do_function_fork -eq 1 ]; then
+	# don't allow children to be traced
+	echo nofunction-fork > trace_options
+    fi
+
+    enable_tracing
+    yield
+
+    count_pid=`cat trace | grep -v ^# | grep $PID | wc -l`
+    count_other=`cat trace | grep -v ^# | grep -v $PID | wc -l`
+
+    # count_pid should be 0
+    if [ $count_pid -ne 0 -o $count_other -eq 0 ]; then
+	fail "PID filtering not working? traced task = $count_pid; other tasks = $count_other "
+    fi
+
+    disable_tracing
+    clear_trace
+
+    if [ $do_function_fork -eq 0 ]; then
+	return
+    fi
+
+    # allow children to be traced
+    echo function-fork > trace_options
+
+    # With pid in both set_ftrace_pid and set_ftrace_notrace_pid
+    # there should not be any tasks traced.
+
+    echo $PID > set_ftrace_pid
+
+    enable_tracing
+    yield
+
+    count_pid=`cat trace | grep -v ^# | grep $PID | wc -l`
+    count_other=`cat trace | grep -v ^# | grep -v $PID | wc -l`
+
+    # both should be zero
+    if [ $count_pid -ne 0 -o $count_other -ne 0 ]; then
+	fail "PID filtering not following fork? traced task = $count_pid; other tasks = $count_other "
+    fi
+}
+
+do_test
+
+do_reset
+
+exit 0

From ebed9628f5c2101e1ab546de3b0742e0bb9eba3c Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Thu, 26 Mar 2020 13:14:25 -0400
Subject: [PATCH 25/27] selftests/ftrace: Add test to test new
 set_event_notrace_pid file

A new file was added to the tracing directory that will allow a user to
place a PID into it and the task associated to that PID will not have its
events traced.  If the event-fork option is enabled, then neither will the
children of that task have its events traced.

Cc: linux-kselftest@vger.kernel.org
Cc: Shuah Khan <skhan@linuxfoundation.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 .../ftrace/test.d/event/event-no-pid.tc       | 125 ++++++++++++++++++
 1 file changed, 125 insertions(+)
 create mode 100644 tools/testing/selftests/ftrace/test.d/event/event-no-pid.tc

diff --git a/tools/testing/selftests/ftrace/test.d/event/event-no-pid.tc b/tools/testing/selftests/ftrace/test.d/event/event-no-pid.tc
new file mode 100644
index 000000000000..f0f366f18d0c
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/event/event-no-pid.tc
@@ -0,0 +1,125 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: event tracing - restricts events based on pid notrace filtering
+# flags: instance
+
+do_reset() {
+    echo > set_event
+    echo > set_event_pid
+    echo > set_event_notrace_pid
+    echo 0 > options/event-fork
+    echo 0 > events/enable
+    clear_trace
+    echo 1 > tracing_on
+}
+
+fail() { #msg
+    cat trace
+    do_reset
+    echo $1
+    exit_fail
+}
+
+count_pid() {
+    pid=$@
+    cat trace | grep -v '^#' | sed -e 's/[^-]*-\([0-9]*\).*/\1/' | grep $pid | wc -l
+}
+
+count_no_pid() {
+    pid=$1
+    cat trace | grep -v '^#' | sed -e 's/[^-]*-\([0-9]*\).*/\1/' | grep -v $pid | wc -l
+}
+
+enable_system() {
+    system=$1
+
+    if [ -d events/$system ]; then
+	echo 1 > events/$system/enable
+    fi
+}
+
+enable_events() {
+    echo 0 > tracing_on
+    # Enable common groups of events, as all events can allow for
+    # events to be traced via scheduling that we don't care to test.
+    enable_system syscalls
+    enable_system rcu
+    enable_system block
+    enable_system exceptions
+    enable_system irq
+    enable_system net
+    enable_system power
+    enable_system signal
+    enable_system sock
+    enable_system timer
+    enable_system thermal
+    echo 1 > tracing_on
+}
+
+if [ ! -f set_event -o ! -d events/sched ]; then
+    echo "event tracing is not supported"
+    exit_unsupported
+fi
+
+if [ ! -f set_event_pid -o ! -f set_event_notrace_pid ]; then
+    echo "event pid notrace filtering is not supported"
+    exit_unsupported
+fi
+
+echo 0 > options/event-fork
+
+do_reset
+
+read mypid rest < /proc/self/stat
+
+echo $mypid > set_event_notrace_pid
+grep -q $mypid set_event_notrace_pid
+
+enable_events
+
+yield
+
+echo 0 > tracing_on
+
+cnt=`count_pid $mypid`
+if [ $cnt -ne 0 ]; then
+    fail "Filtered out task has events"
+fi
+
+cnt=`count_no_pid $mypid`
+if [ $cnt -eq 0 ]; then
+    fail "No other events were recorded"
+fi
+
+do_reset
+
+echo $mypid > set_event_notrace_pid
+echo 1 > options/event-fork
+
+enable_events
+
+yield &
+child=$!
+echo "child = $child"
+wait $child
+
+echo 0 > tracing_on
+
+cnt=`count_pid $mypid`
+if [ $cnt -ne 0 ]; then
+    fail "Filtered out task has events"
+fi
+
+cnt=`count_pid $child`
+if [ $cnt -ne 0 ]; then
+    fail "Child of filtered out taskhas events"
+fi
+
+cnt=`count_no_pid $mypid`
+if [ $cnt -eq 0 ]; then
+    fail "No other events were recorded"
+fi
+
+do_reset
+
+exit 0

From 2ab2a0924b9980551ebe1c47d2a402a94efc1835 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Thu, 26 Mar 2020 13:22:44 -0400
Subject: [PATCH 26/27] tracing: Add documentation on set_ftrace_notrace_pid
 and set_event_notrace_pid

Update the tracing documentation to reflect the new files in the tracing
directory.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 Documentation/trace/ftrace.rst | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst
index 00621731bf27..3b5614b1d1a5 100644
--- a/Documentation/trace/ftrace.rst
+++ b/Documentation/trace/ftrace.rst
@@ -263,6 +263,20 @@ of ftrace. Here is a list of some of the key files:
 	traced by the function tracer as well. This option will also
 	cause PIDs of tasks that exit to be removed from the file.
 
+  set_ftrace_notrace_pid:
+
+        Have the function tracer ignore threads whose PID are listed in
+        this file.
+
+        If the "function-fork" option is set, then when a task whose
+	PID is listed in this file forks, the child's PID will
+	automatically be added to this file, and the child will not be
+	traced by the function tracer as well. This option will also
+	cause PIDs of tasks that exit to be removed from the file.
+
+        If a PID is in both this file and "set_ftrace_pid", then this
+        file takes precedence, and the thread will not be traced.
+
   set_event_pid:
 
 	Have the events only trace a task with a PID listed in this file.
@@ -274,6 +288,19 @@ of ftrace. Here is a list of some of the key files:
 	cause the PIDs of tasks to be removed from this file when the task
 	exits.
 
+  set_event_notrace_pid:
+
+	Have the events not trace a task with a PID listed in this file.
+	Note, sched_switch and sched_wakeup will trace threads not listed
+	in this file, even if a thread's PID is in the file if the
+        sched_switch or sched_wakeup events also trace a thread that should
+        be traced.
+
+	To have the PIDs of children of tasks with their PID in this file
+	added on fork, enable the "event-fork" option. That option will also
+	cause the PIDs of tasks to be removed from this file when the task
+	exits.
+
   set_graph_function:
 
 	Functions listed in this file will cause the function graph
@@ -1183,6 +1210,8 @@ Here are the available options:
 	tasks fork. Also, when tasks with PIDs in set_event_pid exit,
 	their PIDs will be removed from the file.
 
+        This affects PIDs listed in set_event_notrace_pid as well.
+
   function-trace
 	The latency tracers will enable function tracing
 	if this option is enabled (default it is). When
@@ -1197,6 +1226,8 @@ Here are the available options:
 	set_ftrace_pid exit, their PIDs will be removed from the
 	file.
 
+        This affects PIDs in set_ftrace_notrace_pid as well.
+
   display-graph
 	When set, the latency tracers (irqsoff, wakeup, etc) will
 	use function graph tracing instead of function tracing.

From 8e99cf91b99bb30e16727f10ad6828741c0e992f Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Wed, 1 Apr 2020 22:44:46 -0400
Subject: [PATCH 27/27] tracing: Do not allocate buffer in
 trace_find_next_entry() in atomic

When dumping out the trace data in latency format, a check is made to peek
at the next event to compare its timestamp to the current one, and if the
delta is of a greater size, it will add a marker showing so. But to do this,
it needs to save the current event otherwise peeking at the next event will
remove the current event. To save the event, a temp buffer is used, and if
the event is bigger than the temp buffer, the temp buffer is freed and a
bigger buffer is allocated.

This allocation is a problem when called in atomic context. The only way
this gets called via atomic context is via ftrace_dump(). Thus, use a static
buffer of 128 bytes (which covers most events), and if the event is bigger
than that, simply return NULL. The callers of trace_find_next_entry() need
to handle a NULL case, as that's what would happen if the allocation failed.

Link: https://lore.kernel.org/r/20200326091256.GR11705@shao2-debian

Fixes: ff895103a84ab ("tracing: Save off entry when peeking at next entry")
Reported-by: kernel test robot <rong.a.chen@intel.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6519b7afc499..8d2b98812625 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3472,6 +3472,9 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu,
 	return next;
 }
 
+#define STATIC_TEMP_BUF_SIZE	128
+static char static_temp_buf[STATIC_TEMP_BUF_SIZE];
+
 /* Find the next real entry, without updating the iterator itself */
 struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
 					  int *ent_cpu, u64 *ent_ts)
@@ -3480,13 +3483,26 @@ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
 	int ent_size = iter->ent_size;
 	struct trace_entry *entry;
 
+	/*
+	 * If called from ftrace_dump(), then the iter->temp buffer
+	 * will be the static_temp_buf and not created from kmalloc.
+	 * If the entry size is greater than the buffer, we can
+	 * not save it. Just return NULL in that case. This is only
+	 * used to add markers when two consecutive events' time
+	 * stamps have a large delta. See trace_print_lat_context()
+	 */
+	if (iter->temp == static_temp_buf &&
+	    STATIC_TEMP_BUF_SIZE < ent_size)
+		return NULL;
+
 	/*
 	 * The __find_next_entry() may call peek_next_entry(), which may
 	 * call ring_buffer_peek() that may make the contents of iter->ent
 	 * undefined. Need to copy iter->ent now.
 	 */
 	if (iter->ent && iter->ent != iter->temp) {
-		if (!iter->temp || iter->temp_size < iter->ent_size) {
+		if ((!iter->temp || iter->temp_size < iter->ent_size) &&
+		    !WARN_ON_ONCE(iter->temp == static_temp_buf)) {
 			kfree(iter->temp);
 			iter->temp = kmalloc(iter->ent_size, GFP_KERNEL);
 			if (!iter->temp)
@@ -9203,6 +9219,9 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
 
 	/* Simulate the iterator */
 	trace_init_global_iter(&iter);
+	/* Can not use kmalloc for iter.temp */
+	iter.temp = static_temp_buf;
+	iter.temp_size = STATIC_TEMP_BUF_SIZE;
 
 	for_each_tracing_cpu(cpu) {
 		atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);