forked from luck/tmp_suning_uos_patched
dm stats: collect and report histogram of IO latencies
Add an option to dm statistics to collect and report a histogram of IO latencies. Signed-off-by: Mikulas Patocka <mpatocka@redhat.com> Signed-off-by: Mike Snitzer <snitzer@redhat.com>
This commit is contained in:
parent
c96aec344d
commit
dfcfac3e4c
|
@ -13,9 +13,10 @@ the range specified.
|
|||
The I/O statistics counters for each step-sized area of a region are
|
||||
in the same format as /sys/block/*/stat or /proc/diskstats (see:
|
||||
Documentation/iostats.txt). But two extra counters (12 and 13) are
|
||||
provided: total time spent reading and writing. All these counters may
|
||||
be accessed by sending the @stats_print message to the appropriate DM
|
||||
device via dmsetup.
|
||||
provided: total time spent reading and writing. When the histogram
|
||||
argument is used, the 14th parameter is reported that represents the
|
||||
histogram of latencies. All these counters may be accessed by sending
|
||||
the @stats_print message to the appropriate DM device via dmsetup.
|
||||
|
||||
The reported times are in milliseconds and the granularity depends on
|
||||
the kernel ticks. When the option precise_timestamps is used, the
|
||||
|
@ -64,6 +65,18 @@ Messages
|
|||
used, the resulting times are in nanoseconds instead of
|
||||
milliseconds. Precise timestamps are a little bit slower
|
||||
to obtain than jiffies-based timestamps.
|
||||
histogram:n1,n2,n3,n4,... - collect histogram of latencies. The
|
||||
numbers n1, n2, etc are times that represent the boundaries
|
||||
of the histogram. If precise_timestamps is not used, the
|
||||
times are in milliseconds, otherwise they are in
|
||||
nanoseconds. For each range, the kernel will report the
|
||||
number of requests that completed within this range. For
|
||||
example, if we use "histogram:10,20,30", the kernel will
|
||||
report four numbers a:b:c:d. a is the number of requests
|
||||
that took 0-10 ms to complete, b is the number of requests
|
||||
that took 10-20 ms to complete, c is the number of requests
|
||||
that took 20-30 ms to complete and d is the number of
|
||||
requests that took more than 30 ms to complete.
|
||||
|
||||
<program_id>
|
||||
An optional parameter. A name that uniquely identifies
|
||||
|
|
|
@ -29,6 +29,7 @@ struct dm_stat_percpu {
|
|||
unsigned long long io_ticks[2];
|
||||
unsigned long long io_ticks_total;
|
||||
unsigned long long time_in_queue;
|
||||
unsigned long long *histogram;
|
||||
};
|
||||
|
||||
struct dm_stat_shared {
|
||||
|
@ -45,11 +46,14 @@ struct dm_stat {
|
|||
sector_t start;
|
||||
sector_t end;
|
||||
sector_t step;
|
||||
unsigned n_histogram_entries;
|
||||
unsigned long long *histogram_boundaries;
|
||||
const char *program_id;
|
||||
const char *aux_data;
|
||||
struct rcu_head rcu_head;
|
||||
size_t shared_alloc_size;
|
||||
size_t percpu_alloc_size;
|
||||
size_t histogram_alloc_size;
|
||||
struct dm_stat_percpu *stat_percpu[NR_CPUS];
|
||||
struct dm_stat_shared stat_shared[0];
|
||||
};
|
||||
|
@ -173,8 +177,11 @@ static void dm_stat_free(struct rcu_head *head)
|
|||
|
||||
kfree(s->program_id);
|
||||
kfree(s->aux_data);
|
||||
for_each_possible_cpu(cpu)
|
||||
for_each_possible_cpu(cpu) {
|
||||
dm_kvfree(s->stat_percpu[cpu][0].histogram, s->histogram_alloc_size);
|
||||
dm_kvfree(s->stat_percpu[cpu], s->percpu_alloc_size);
|
||||
}
|
||||
dm_kvfree(s->stat_shared[0].tmp.histogram, s->histogram_alloc_size);
|
||||
dm_kvfree(s, s->shared_alloc_size);
|
||||
}
|
||||
|
||||
|
@ -228,6 +235,8 @@ void dm_stats_cleanup(struct dm_stats *stats)
|
|||
|
||||
static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
|
||||
sector_t step, unsigned stat_flags,
|
||||
unsigned n_histogram_entries,
|
||||
unsigned long long *histogram_boundaries,
|
||||
const char *program_id, const char *aux_data,
|
||||
void (*suspend_callback)(struct mapped_device *),
|
||||
void (*resume_callback)(struct mapped_device *),
|
||||
|
@ -239,6 +248,7 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
|
|||
size_t ni;
|
||||
size_t shared_alloc_size;
|
||||
size_t percpu_alloc_size;
|
||||
size_t histogram_alloc_size;
|
||||
struct dm_stat_percpu *p;
|
||||
int cpu;
|
||||
int ret_id;
|
||||
|
@ -262,7 +272,12 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
|
|||
if (percpu_alloc_size / sizeof(struct dm_stat_percpu) != n_entries)
|
||||
return -EOVERFLOW;
|
||||
|
||||
if (!check_shared_memory(shared_alloc_size + num_possible_cpus() * percpu_alloc_size))
|
||||
histogram_alloc_size = (n_histogram_entries + 1) * (size_t)n_entries * sizeof(unsigned long long);
|
||||
if (histogram_alloc_size / (n_histogram_entries + 1) != (size_t)n_entries * sizeof(unsigned long long))
|
||||
return -EOVERFLOW;
|
||||
|
||||
if (!check_shared_memory(shared_alloc_size + histogram_alloc_size +
|
||||
num_possible_cpus() * (percpu_alloc_size + histogram_alloc_size)))
|
||||
return -ENOMEM;
|
||||
|
||||
s = dm_kvzalloc(shared_alloc_size, NUMA_NO_NODE);
|
||||
|
@ -276,6 +291,15 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
|
|||
s->step = step;
|
||||
s->shared_alloc_size = shared_alloc_size;
|
||||
s->percpu_alloc_size = percpu_alloc_size;
|
||||
s->histogram_alloc_size = histogram_alloc_size;
|
||||
|
||||
s->n_histogram_entries = n_histogram_entries;
|
||||
s->histogram_boundaries = kmemdup(histogram_boundaries,
|
||||
s->n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL);
|
||||
if (!s->histogram_boundaries) {
|
||||
r = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
s->program_id = kstrdup(program_id, GFP_KERNEL);
|
||||
if (!s->program_id) {
|
||||
|
@ -293,6 +317,19 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
|
|||
atomic_set(&s->stat_shared[ni].in_flight[WRITE], 0);
|
||||
}
|
||||
|
||||
if (s->n_histogram_entries) {
|
||||
unsigned long long *hi;
|
||||
hi = dm_kvzalloc(s->histogram_alloc_size, NUMA_NO_NODE);
|
||||
if (!hi) {
|
||||
r = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
for (ni = 0; ni < n_entries; ni++) {
|
||||
s->stat_shared[ni].tmp.histogram = hi;
|
||||
hi += s->n_histogram_entries + 1;
|
||||
}
|
||||
}
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
p = dm_kvzalloc(percpu_alloc_size, cpu_to_node(cpu));
|
||||
if (!p) {
|
||||
|
@ -300,6 +337,18 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
|
|||
goto out;
|
||||
}
|
||||
s->stat_percpu[cpu] = p;
|
||||
if (s->n_histogram_entries) {
|
||||
unsigned long long *hi;
|
||||
hi = dm_kvzalloc(s->histogram_alloc_size, cpu_to_node(cpu));
|
||||
if (!hi) {
|
||||
r = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
for (ni = 0; ni < n_entries; ni++) {
|
||||
p[ni].histogram = hi;
|
||||
hi += s->n_histogram_entries + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -377,9 +426,11 @@ static int dm_stats_delete(struct dm_stats *stats, int id)
|
|||
* vfree can't be called from RCU callback
|
||||
*/
|
||||
for_each_possible_cpu(cpu)
|
||||
if (is_vmalloc_addr(s->stat_percpu))
|
||||
if (is_vmalloc_addr(s->stat_percpu) ||
|
||||
is_vmalloc_addr(s->stat_percpu[cpu][0].histogram))
|
||||
goto do_sync_free;
|
||||
if (is_vmalloc_addr(s)) {
|
||||
if (is_vmalloc_addr(s) ||
|
||||
is_vmalloc_addr(s->stat_shared[0].tmp.histogram)) {
|
||||
do_sync_free:
|
||||
synchronize_rcu_expedited();
|
||||
dm_stat_free(&s->rcu_head);
|
||||
|
@ -486,15 +537,32 @@ static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
|
|||
dm_stat_round(s, shared, p);
|
||||
atomic_inc(&shared->in_flight[idx]);
|
||||
} else {
|
||||
unsigned long long duration;
|
||||
dm_stat_round(s, shared, p);
|
||||
atomic_dec(&shared->in_flight[idx]);
|
||||
p->sectors[idx] += len;
|
||||
p->ios[idx] += 1;
|
||||
p->merges[idx] += stats_aux->merged;
|
||||
if (!(s->stat_flags & STAT_PRECISE_TIMESTAMPS))
|
||||
if (!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)) {
|
||||
p->ticks[idx] += duration_jiffies;
|
||||
else
|
||||
duration = jiffies_to_msecs(duration_jiffies);
|
||||
} else {
|
||||
p->ticks[idx] += stats_aux->duration_ns;
|
||||
duration = stats_aux->duration_ns;
|
||||
}
|
||||
if (s->n_histogram_entries) {
|
||||
unsigned lo = 0, hi = s->n_histogram_entries + 1;
|
||||
while (lo + 1 < hi) {
|
||||
unsigned mid = (lo + hi) / 2;
|
||||
if (s->histogram_boundaries[mid - 1] > duration) {
|
||||
hi = mid;
|
||||
} else {
|
||||
lo = mid;
|
||||
}
|
||||
|
||||
}
|
||||
p->histogram[lo]++;
|
||||
}
|
||||
}
|
||||
|
||||
#if BITS_PER_LONG == 32
|
||||
|
@ -600,7 +668,22 @@ static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared
|
|||
dm_stat_round(s, shared, p);
|
||||
local_irq_enable();
|
||||
|
||||
memset(&shared->tmp, 0, sizeof(shared->tmp));
|
||||
shared->tmp.sectors[READ] = 0;
|
||||
shared->tmp.sectors[WRITE] = 0;
|
||||
shared->tmp.ios[READ] = 0;
|
||||
shared->tmp.ios[WRITE] = 0;
|
||||
shared->tmp.merges[READ] = 0;
|
||||
shared->tmp.merges[WRITE] = 0;
|
||||
shared->tmp.ticks[READ] = 0;
|
||||
shared->tmp.ticks[WRITE] = 0;
|
||||
shared->tmp.io_ticks[READ] = 0;
|
||||
shared->tmp.io_ticks[WRITE] = 0;
|
||||
shared->tmp.io_ticks_total = 0;
|
||||
shared->tmp.time_in_queue = 0;
|
||||
|
||||
if (s->n_histogram_entries)
|
||||
memset(shared->tmp.histogram, 0, (s->n_histogram_entries + 1) * sizeof(unsigned long long));
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
p = &s->stat_percpu[cpu][x];
|
||||
shared->tmp.sectors[READ] += ACCESS_ONCE(p->sectors[READ]);
|
||||
|
@ -615,6 +698,11 @@ static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared
|
|||
shared->tmp.io_ticks[WRITE] += ACCESS_ONCE(p->io_ticks[WRITE]);
|
||||
shared->tmp.io_ticks_total += ACCESS_ONCE(p->io_ticks_total);
|
||||
shared->tmp.time_in_queue += ACCESS_ONCE(p->time_in_queue);
|
||||
if (s->n_histogram_entries) {
|
||||
unsigned i;
|
||||
for (i = 0; i < s->n_histogram_entries + 1; i++)
|
||||
shared->tmp.histogram[i] += ACCESS_ONCE(p->histogram[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -644,6 +732,15 @@ static void __dm_stat_clear(struct dm_stat *s, size_t idx_start, size_t idx_end,
|
|||
p->io_ticks_total -= shared->tmp.io_ticks_total;
|
||||
p->time_in_queue -= shared->tmp.time_in_queue;
|
||||
local_irq_enable();
|
||||
if (s->n_histogram_entries) {
|
||||
unsigned i;
|
||||
for (i = 0; i < s->n_histogram_entries + 1; i++) {
|
||||
local_irq_disable();
|
||||
p = &s->stat_percpu[smp_processor_id()][x];
|
||||
p->histogram[i] -= shared->tmp.histogram[i];
|
||||
local_irq_enable();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -733,7 +830,7 @@ static int dm_stats_print(struct dm_stats *stats, int id,
|
|||
|
||||
__dm_stat_init_temporary_percpu_totals(shared, s, x);
|
||||
|
||||
DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu\n",
|
||||
DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu",
|
||||
(unsigned long long)start,
|
||||
(unsigned long long)step,
|
||||
shared->tmp.ios[READ],
|
||||
|
@ -749,6 +846,13 @@ static int dm_stats_print(struct dm_stats *stats, int id,
|
|||
dm_jiffies_to_msec64(s, shared->tmp.time_in_queue),
|
||||
dm_jiffies_to_msec64(s, shared->tmp.io_ticks[READ]),
|
||||
dm_jiffies_to_msec64(s, shared->tmp.io_ticks[WRITE]));
|
||||
if (s->n_histogram_entries) {
|
||||
unsigned i;
|
||||
for (i = 0; i < s->n_histogram_entries + 1; i++) {
|
||||
DMEMIT("%s%llu", !i ? " " : ":", shared->tmp.histogram[i]);
|
||||
}
|
||||
}
|
||||
DMEMIT("\n");
|
||||
|
||||
if (unlikely(sz + 1 >= maxlen))
|
||||
goto buffer_overflow;
|
||||
|
@ -790,10 +894,47 @@ static int dm_stats_set_aux(struct dm_stats *stats, int id, const char *aux_data
|
|||
return 0;
|
||||
}
|
||||
|
||||
static int parse_histogram(const char *h, unsigned *n_histogram_entries,
|
||||
unsigned long long **histogram_boundaries)
|
||||
{
|
||||
const char *q;
|
||||
unsigned n;
|
||||
unsigned long long last;
|
||||
|
||||
*n_histogram_entries = 1;
|
||||
for (q = h; *q; q++)
|
||||
if (*q == ',')
|
||||
(*n_histogram_entries)++;
|
||||
|
||||
*histogram_boundaries = kmalloc(*n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL);
|
||||
if (!*histogram_boundaries)
|
||||
return -ENOMEM;
|
||||
|
||||
n = 0;
|
||||
last = 0;
|
||||
while (1) {
|
||||
unsigned long long hi;
|
||||
int s;
|
||||
char ch;
|
||||
s = sscanf(h, "%llu%c", &hi, &ch);
|
||||
if (!s || (s == 2 && ch != ','))
|
||||
return -EINVAL;
|
||||
if (hi <= last)
|
||||
return -EINVAL;
|
||||
last = hi;
|
||||
(*histogram_boundaries)[n] = hi;
|
||||
if (s == 1)
|
||||
return 0;
|
||||
h = strchr(h, ',') + 1;
|
||||
n++;
|
||||
}
|
||||
}
|
||||
|
||||
static int message_stats_create(struct mapped_device *md,
|
||||
unsigned argc, char **argv,
|
||||
char *result, unsigned maxlen)
|
||||
{
|
||||
int r;
|
||||
int id;
|
||||
char dummy;
|
||||
unsigned long long start, end, len, step;
|
||||
|
@ -801,6 +942,9 @@ static int message_stats_create(struct mapped_device *md,
|
|||
const char *program_id, *aux_data;
|
||||
unsigned stat_flags = 0;
|
||||
|
||||
unsigned n_histogram_entries = 0;
|
||||
unsigned long long *histogram_boundaries = NULL;
|
||||
|
||||
struct dm_arg_set as, as_backup;
|
||||
const char *a;
|
||||
unsigned feature_args;
|
||||
|
@ -811,7 +955,7 @@ static int message_stats_create(struct mapped_device *md,
|
|||
*/
|
||||
|
||||
if (argc < 3)
|
||||
return -EINVAL;
|
||||
goto ret_einval;
|
||||
|
||||
as.argc = argc;
|
||||
as.argv = argv;
|
||||
|
@ -825,11 +969,11 @@ static int message_stats_create(struct mapped_device *md,
|
|||
len = 1;
|
||||
} else if (sscanf(a, "%llu+%llu%c", &start, &len, &dummy) != 2 ||
|
||||
start != (sector_t)start || len != (sector_t)len)
|
||||
return -EINVAL;
|
||||
goto ret_einval;
|
||||
|
||||
end = start + len;
|
||||
if (start >= end)
|
||||
return -EINVAL;
|
||||
goto ret_einval;
|
||||
|
||||
a = dm_shift_arg(&as);
|
||||
if (sscanf(a, "/%u%c", &divisor, &dummy) == 1) {
|
||||
|
@ -842,7 +986,7 @@ static int message_stats_create(struct mapped_device *md,
|
|||
step = 1;
|
||||
} else if (sscanf(a, "%llu%c", &step, &dummy) != 1 ||
|
||||
step != (sector_t)step || !step)
|
||||
return -EINVAL;
|
||||
goto ret_einval;
|
||||
|
||||
as_backup = as;
|
||||
a = dm_shift_arg(&as);
|
||||
|
@ -850,11 +994,16 @@ static int message_stats_create(struct mapped_device *md,
|
|||
while (feature_args--) {
|
||||
a = dm_shift_arg(&as);
|
||||
if (!a)
|
||||
return -EINVAL;
|
||||
goto ret_einval;
|
||||
if (!strcasecmp(a, "precise_timestamps"))
|
||||
stat_flags |= STAT_PRECISE_TIMESTAMPS;
|
||||
else
|
||||
return -EINVAL;
|
||||
else if (!strncasecmp(a, "histogram:", 10)) {
|
||||
if (n_histogram_entries)
|
||||
goto ret_einval;
|
||||
if ((r = parse_histogram(a + 10, &n_histogram_entries, &histogram_boundaries)))
|
||||
goto ret;
|
||||
} else
|
||||
goto ret_einval;
|
||||
}
|
||||
} else {
|
||||
as = as_backup;
|
||||
|
@ -872,7 +1021,7 @@ static int message_stats_create(struct mapped_device *md,
|
|||
aux_data = a;
|
||||
|
||||
if (as.argc)
|
||||
return -EINVAL;
|
||||
goto ret_einval;
|
||||
|
||||
/*
|
||||
* If a buffer overflow happens after we created the region,
|
||||
|
@ -881,17 +1030,29 @@ static int message_stats_create(struct mapped_device *md,
|
|||
* leaked). So we must detect buffer overflow in advance.
|
||||
*/
|
||||
snprintf(result, maxlen, "%d", INT_MAX);
|
||||
if (dm_message_test_buffer_overflow(result, maxlen))
|
||||
return 1;
|
||||
if (dm_message_test_buffer_overflow(result, maxlen)) {
|
||||
r = 1;
|
||||
goto ret;
|
||||
}
|
||||
|
||||
id = dm_stats_create(dm_get_stats(md), start, end, step, stat_flags, program_id, aux_data,
|
||||
id = dm_stats_create(dm_get_stats(md), start, end, step, stat_flags,
|
||||
n_histogram_entries, histogram_boundaries, program_id, aux_data,
|
||||
dm_internal_suspend_fast, dm_internal_resume_fast, md);
|
||||
if (id < 0)
|
||||
return id;
|
||||
if (id < 0) {
|
||||
r = id;
|
||||
goto ret;
|
||||
}
|
||||
|
||||
snprintf(result, maxlen, "%d", id);
|
||||
|
||||
return 1;
|
||||
r = 1;
|
||||
goto ret;
|
||||
|
||||
ret_einval:
|
||||
r = -EINVAL;
|
||||
ret:
|
||||
kfree(histogram_boundaries);
|
||||
return r;
|
||||
}
|
||||
|
||||
static int message_stats_delete(struct mapped_device *md,
|
||||
|
|
Loading…
Reference in New Issue
Block a user