blob: a72c9bd28071b3913b37e34039be1329a88a3e66 [file] [log] [blame]
Ingo Molnare0143ba2009-03-23 21:29:59 +01001/*
2 * kerneltop.c: show top kernel functions - performance counters showcase
3
4 Build with:
5
Wu Fengguangf7524bd2009-03-20 10:08:06 +08006 cc -O6 -Wall -lrt `pkg-config --cflags --libs glib-2.0` -o kerneltop kerneltop.c
Ingo Molnare0143ba2009-03-23 21:29:59 +01007
8 Sample output:
9
10------------------------------------------------------------------------------
11 KernelTop: 2669 irqs/sec [NMI, cache-misses/cache-refs], (all, cpu: 2)
12------------------------------------------------------------------------------
13
14 weight RIP kernel function
15 ______ ________________ _______________
16
17 35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
18 33.00 - ffffffff804cb740 : sock_alloc_send_skb
19 31.26 - ffffffff804ce808 : skb_push
20 22.43 - ffffffff80510004 : tcp_established_options
21 19.00 - ffffffff8027d250 : find_get_page
22 15.76 - ffffffff804e4fc9 : eth_type_trans
23 15.20 - ffffffff804d8baa : dst_release
24 14.86 - ffffffff804cf5d8 : skb_release_head_state
25 14.00 - ffffffff802217d5 : read_hpet
26 12.00 - ffffffff804ffb7f : __ip_local_out
27 11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
28 8.54 - ffffffff805001a3 : ip_queue_xmit
Ingo Molnare0143ba2009-03-23 21:29:59 +010029 */
Wu Fengguangf7524bd2009-03-20 10:08:06 +080030
31/*
32 * perfstat: /usr/bin/time -alike performance counter statistics utility
33
34 It summarizes the counter events of all tasks (and child tasks),
35 covering all CPUs that the command (or workload) executes on.
36 It only counts the per-task events of the workload started,
37 independent of how many other tasks run on those CPUs.
38
39 Sample output:
40
41 $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
42
43 Performance counter stats for 'ls':
44
45 163516953 instructions
46 2295 cache-misses
47 2855182 branch-misses
48 */
49
50 /*
51 * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
52 *
53 * Improvements and fixes by:
54 *
55 * Arjan van de Ven <arjan@linux.intel.com>
56 * Yanmin Zhang <yanmin.zhang@intel.com>
57 * Wu Fengguang <fengguang.wu@intel.com>
58 * Mike Galbraith <efault@gmx.de>
59 *
60 * Released under the GPL v2. (and only v2, not any later version)
61 */
62
Ingo Molnare0143ba2009-03-23 21:29:59 +010063#define _GNU_SOURCE
64#include <sys/types.h>
65#include <sys/stat.h>
66#include <sys/time.h>
67#include <unistd.h>
68#include <stdint.h>
69#include <stdlib.h>
70#include <string.h>
71#include <getopt.h>
72#include <assert.h>
73#include <fcntl.h>
74#include <stdio.h>
75#include <errno.h>
76#include <ctype.h>
77#include <time.h>
78
79#include <glib.h>
80
81#include <sys/syscall.h>
82#include <sys/ioctl.h>
83#include <sys/poll.h>
84#include <sys/prctl.h>
85#include <sys/wait.h>
86#include <sys/uio.h>
87
88#include <linux/unistd.h>
89
Peter Zijlstra803d4f32009-03-23 18:22:11 +010090#include "include/linux/perf_counter.h"
Ingo Molnare0143ba2009-03-23 21:29:59 +010091
Wu Fengguangf7524bd2009-03-20 10:08:06 +080092
Peter Zijlstra803d4f32009-03-23 18:22:11 +010093/*
94 * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
95 * counters in the current task.
96 */
97#define PR_TASK_PERF_COUNTERS_DISABLE 31
98#define PR_TASK_PERF_COUNTERS_ENABLE 32
99
100#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
101
102#define rdclock() \
103({ \
104 struct timespec ts; \
105 \
106 clock_gettime(CLOCK_MONOTONIC, &ts); \
107 ts.tv_sec * 1000000000ULL + ts.tv_nsec; \
108})
109
110/*
111 * Pick up some kernel type conventions:
112 */
113#define __user
114#define asmlinkage
115
116typedef unsigned int __u32;
117typedef unsigned long long __u64;
118typedef long long __s64;
119
120
121#ifdef __x86_64__
122# define __NR_perf_counter_open 295
123#endif
124
125#ifdef __i386__
126# define __NR_perf_counter_open 333
127#endif
128
129#ifdef __powerpc__
130#define __NR_perf_counter_open 319
131#endif
132
133asmlinkage int sys_perf_counter_open(
134 struct perf_counter_hw_event *hw_event_uptr __user,
135 pid_t pid,
136 int cpu,
137 int group_fd,
138 unsigned long flags)
139{
140 int ret;
141
142 ret = syscall(
143 __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
144#if defined(__x86_64__) || defined(__i386__)
145 if (ret < 0 && ret > -4096) {
146 errno = -ret;
147 ret = -1;
148 }
149#endif
150 return ret;
151}
152
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800153#define MAX_COUNTERS 64
154#define MAX_NR_CPUS 256
155
Peter Zijlstra803d4f32009-03-23 18:22:11 +0100156#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800157
158static int run_perfstat = 0;
159static int system_wide = 0;
160
161static int nr_counters = 0;
Peter Zijlstra803d4f32009-03-23 18:22:11 +0100162static __u64 event_id[MAX_COUNTERS] = {
163 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
164 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
165 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
166 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
167
168 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
169 EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
170 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
171 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
172};
173static int default_interval = 100000;
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800174static int event_count[MAX_COUNTERS];
175static int fd[MAX_NR_CPUS][MAX_COUNTERS];
Ingo Molnare0143ba2009-03-23 21:29:59 +0100176
Ingo Molnare0143ba2009-03-23 21:29:59 +0100177static __u64 count_filter = 100;
178
Ingo Molnare0143ba2009-03-23 21:29:59 +0100179static int tid = -1;
180static int profile_cpu = -1;
181static int nr_cpus = 0;
182static int nmi = 1;
183static int group = 0;
184
185static char *vmlinux;
186
187static char *sym_filter;
188static unsigned long filter_start;
189static unsigned long filter_end;
190
191static int delay_secs = 2;
192static int zero;
193static int dump_symtab;
194
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800195static GList *lines;
196
Ingo Molnare0143ba2009-03-23 21:29:59 +0100197struct source_line {
198 uint64_t EIP;
199 unsigned long count;
200 char *line;
201};
202
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800203
204const unsigned int default_count[] = {
Wu Fengguangdda7c022009-03-20 10:08:09 +0800205 10000,
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800206 1000000,
207 10000,
208 10000,
209 1000000,
210 10000,
211};
212
213static char *hw_event_names[] = {
214 "CPU cycles",
215 "instructions",
216 "cache references",
217 "cache misses",
218 "branches",
219 "branch misses",
220 "bus cycles",
221};
222
223static char *sw_event_names[] = {
224 "cpu clock ticks",
225 "task clock ticks",
226 "pagefaults",
227 "context switches",
228 "CPU migrations",
Peter Zijlstra803d4f32009-03-23 18:22:11 +0100229 "minor faults",
230 "major faults",
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800231};
232
233struct event_symbol {
Peter Zijlstra803d4f32009-03-23 18:22:11 +0100234 __u64 event;
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800235 char *symbol;
236};
237
238static struct event_symbol event_symbols[] = {
Peter Zijlstra803d4f32009-03-23 18:22:11 +0100239 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cpu-cycles", },
240 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cycles", },
241 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), "instructions", },
242 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), "cache-references", },
243 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), "cache-misses", },
244 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branch-instructions", },
245 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branches", },
246 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES), "branch-misses", },
247 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES), "bus-cycles", },
248
249 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK), "cpu-clock", },
250 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), "task-clock", },
251 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "page-faults", },
252 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "faults", },
253 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN), "minor-faults", },
254 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ), "major-faults", },
255 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "context-switches", },
256 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "cs", },
257 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "cpu-migrations", },
258 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "migrations", },
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800259};
260
/*
 * Extract one bit-field from an event config word, using the
 * PERF_COUNTER_<name>_MASK / PERF_COUNTER_<name>_SHIFT pair.
 * The argument is parenthesized so that expression arguments such as
 * (a | b) are masked as a whole.
 */
#define __PERF_COUNTER_FIELD(config, name) \
	(((config) & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)

#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
268
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800269static void display_events_help(void)
270{
271 unsigned int i;
Peter Zijlstra803d4f32009-03-23 18:22:11 +0100272 __u64 e;
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800273
274 printf(
275 " -e EVENT --event=EVENT # symbolic-name abbreviations");
276
Peter Zijlstra803d4f32009-03-23 18:22:11 +0100277 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
278 int type, id;
279
280 e = event_symbols[i].event;
281 type = PERF_COUNTER_TYPE(e);
282 id = PERF_COUNTER_ID(e);
283
284 printf("\n %d:%d: %-20s",
285 type, id, event_symbols[i].symbol);
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800286 }
287
288 printf("\n"
289 " rNNN: raw PMU events (eventsel+umask)\n\n");
290}
291
292static void display_perfstat_help(void)
293{
294 printf(
295 "Usage: perfstat [<events...>] <cmd...>\n\n"
296 "PerfStat Options (up to %d event types can be specified):\n\n",
297 MAX_COUNTERS);
298
299 display_events_help();
300
301 printf(
302 " -a # system-wide collection\n");
303 exit(0);
304}
Ingo Molnare0143ba2009-03-23 21:29:59 +0100305
306static void display_help(void)
307{
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800308 if (run_perfstat)
309 return display_perfstat_help();
310
Ingo Molnare0143ba2009-03-23 21:29:59 +0100311 printf(
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800312 "Usage: kerneltop [<options>]\n"
313 " Or: kerneltop -S [<options>] COMMAND [ARGS]\n\n"
Ingo Molnare0143ba2009-03-23 21:29:59 +0100314 "KernelTop Options (up to %d event types can be specified at once):\n\n",
315 MAX_COUNTERS);
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800316
317 display_events_help();
318
Ingo Molnare0143ba2009-03-23 21:29:59 +0100319 printf(
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800320 " -S --stat # perfstat COMMAND\n"
321 " -a # system-wide collection (for perfstat)\n\n"
Ingo Molnare0143ba2009-03-23 21:29:59 +0100322 " -c CNT --count=CNT # event period to sample\n\n"
323 " -C CPU --cpu=CPU # CPU (-1 for all) [default: -1]\n"
324 " -p PID --pid=PID # PID of sampled task (-1 for all) [default: -1]\n\n"
325 " -d delay --delay=<seconds> # sampling/display delay [default: 2]\n"
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800326 " -f CNT --filter=CNT # min-event-count filter [default: 100]\n\n"
Ingo Molnare0143ba2009-03-23 21:29:59 +0100327 " -s symbol --symbol=<symbol> # function to be showed annotated one-shot\n"
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800328 " -x path --vmlinux=<path> # the vmlinux binary, required for -s use\n"
Ingo Molnare0143ba2009-03-23 21:29:59 +0100329 " -z --zero # zero counts after display\n"
330 " -D --dump_symtab # dump symbol table to stderr on startup\n"
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800331 );
Ingo Molnare0143ba2009-03-23 21:29:59 +0100332
333 exit(0);
334}
335
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800336static char *event_name(int ctr)
337{
Peter Zijlstra803d4f32009-03-23 18:22:11 +0100338 __u64 config = event_id[ctr];
339 int type = PERF_COUNTER_TYPE(config);
340 int id = PERF_COUNTER_ID(config);
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800341 static char buf[32];
342
Peter Zijlstra803d4f32009-03-23 18:22:11 +0100343 if (PERF_COUNTER_RAW(config)) {
344 sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800345 return buf;
346 }
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800347
Peter Zijlstra803d4f32009-03-23 18:22:11 +0100348 switch (type) {
349 case PERF_TYPE_HARDWARE:
350 if (id < PERF_HW_EVENTS_MAX)
351 return hw_event_names[id];
352 return "unknown-hardware";
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800353
Peter Zijlstra803d4f32009-03-23 18:22:11 +0100354 case PERF_TYPE_SOFTWARE:
355 if (id < PERF_SW_EVENTS_MAX)
356 return sw_event_names[id];
357 return "unknown-software";
358
359 default:
360 break;
361 }
362
363 return "unknown";
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800364}
365
366/*
367 * Each event can have multiple symbolic names.
368 * Symbolic names are (almost) exactly matched.
369 */
Peter Zijlstra803d4f32009-03-23 18:22:11 +0100370static __u64 match_event_symbols(char *str)
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800371{
Peter Zijlstra803d4f32009-03-23 18:22:11 +0100372 __u64 config, id;
373 int type;
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800374 unsigned int i;
375
Peter Zijlstra803d4f32009-03-23 18:22:11 +0100376 if (sscanf(str, "r%llx", &config) == 1)
377 return config | PERF_COUNTER_RAW_MASK;
378
379 if (sscanf(str, "%d:%llu", &type, &id) == 2)
380 return EID(type, id);
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800381
382 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
383 if (!strncmp(str, event_symbols[i].symbol,
384 strlen(event_symbols[i].symbol)))
385 return event_symbols[i].event;
386 }
387
Peter Zijlstra803d4f32009-03-23 18:22:11 +0100388 return ~0ULL;
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800389}
390
391static int parse_events(char *str)
392{
Peter Zijlstra803d4f32009-03-23 18:22:11 +0100393 __u64 config;
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800394
395again:
396 if (nr_counters == MAX_COUNTERS)
397 return -1;
398
Peter Zijlstra803d4f32009-03-23 18:22:11 +0100399 config = match_event_symbols(str);
400 if (config == ~0ULL)
401 return -1;
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800402
Peter Zijlstra803d4f32009-03-23 18:22:11 +0100403 event_id[nr_counters] = config;
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800404 nr_counters++;
405
406 str = strstr(str, ",");
407 if (str) {
408 str++;
409 goto again;
410 }
411
412 return 0;
413}
414
415
416/*
417 * perfstat
418 */
419
420char fault_here[1000000];
421
422static void create_perfstat_counter(int counter)
423{
424 struct perf_counter_hw_event hw_event;
425
426 memset(&hw_event, 0, sizeof(hw_event));
Peter Zijlstra803d4f32009-03-23 18:22:11 +0100427 hw_event.config = event_id[counter];
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800428 hw_event.record_type = PERF_RECORD_SIMPLE;
429 hw_event.nmi = 0;
430
431 if (system_wide) {
432 int cpu;
433 for (cpu = 0; cpu < nr_cpus; cpu ++) {
434 fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
435 if (fd[cpu][counter] < 0) {
436 printf("perfstat error: syscall returned with %d (%s)\n",
437 fd[cpu][counter], strerror(errno));
438 exit(-1);
Ingo Molnare0143ba2009-03-23 21:29:59 +0100439 }
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800440 }
441 } else {
442 hw_event.inherit = 1;
443 hw_event.disabled = 1;
Ingo Molnare0143ba2009-03-23 21:29:59 +0100444
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800445 fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
446 if (fd[0][counter] < 0) {
447 printf("perfstat error: syscall returned with %d (%s)\n",
448 fd[0][counter], strerror(errno));
449 exit(-1);
Ingo Molnare0143ba2009-03-23 21:29:59 +0100450 }
451 }
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800452}
Ingo Molnare0143ba2009-03-23 21:29:59 +0100453
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800454int do_perfstat(int argc, char *argv[])
455{
456 unsigned long long t0, t1;
457 int counter;
458 ssize_t res;
459 int status;
460 int pid;
461
462 if (!system_wide)
463 nr_cpus = 1;
464
465 for (counter = 0; counter < nr_counters; counter++)
466 create_perfstat_counter(counter);
467
468 argc -= optind;
469 argv += optind;
470
Wu Fengguangaf9522c2009-03-20 10:08:10 +0800471 if (!argc)
472 display_help();
473
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800474 /*
475 * Enable counters and exec the command:
476 */
477 t0 = rdclock();
478 prctl(PR_TASK_PERF_COUNTERS_ENABLE);
479
480 if ((pid = fork()) < 0)
481 perror("failed to fork");
482 if (!pid) {
483 if (execvp(argv[0], argv)) {
484 perror(argv[0]);
485 exit(-1);
486 }
Wu Fengguang95bb3be2009-03-20 10:08:04 +0800487 }
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800488 while (wait(&status) >= 0)
489 ;
490 prctl(PR_TASK_PERF_COUNTERS_DISABLE);
491 t1 = rdclock();
492
493 fflush(stdout);
494
495 fprintf(stderr, "\n");
496 fprintf(stderr, " Performance counter stats for \'%s\':\n",
497 argv[0]);
498 fprintf(stderr, "\n");
Ingo Molnare0143ba2009-03-23 21:29:59 +0100499
500 for (counter = 0; counter < nr_counters; counter++) {
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800501 int cpu;
502 __u64 count, single_count;
Ingo Molnare0143ba2009-03-23 21:29:59 +0100503
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800504 count = 0;
505 for (cpu = 0; cpu < nr_cpus; cpu ++) {
506 res = read(fd[cpu][counter],
507 (char *) &single_count, sizeof(single_count));
508 assert(res == sizeof(single_count));
509 count += single_count;
510 }
511
Peter Zijlstra803d4f32009-03-23 18:22:11 +0100512 if (!PERF_COUNTER_RAW(event_id[counter]) &&
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800513 (event_id[counter] == PERF_COUNT_CPU_CLOCK ||
514 event_id[counter] == PERF_COUNT_TASK_CLOCK)) {
515
516 double msecs = (double)count / 1000000;
517
518 fprintf(stderr, " %14.6f %-20s (msecs)\n",
519 msecs, event_name(counter));
520 } else {
521 fprintf(stderr, " %14Ld %-20s (events)\n",
522 count, event_name(counter));
523 }
524 if (!counter)
525 fprintf(stderr, "\n");
Ingo Molnare0143ba2009-03-23 21:29:59 +0100526 }
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800527 fprintf(stderr, "\n");
528 fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
529 (double)(t1-t0)/1e6);
530 fprintf(stderr, "\n");
531
532 return 0;
Ingo Molnare0143ba2009-03-23 21:29:59 +0100533}
534
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800535/*
536 * Symbols
537 */
538
Ingo Molnare0143ba2009-03-23 21:29:59 +0100539static uint64_t min_ip;
540static uint64_t max_ip = -1ll;
541
542struct sym_entry {
543 unsigned long long addr;
544 char *sym;
545 unsigned long count[MAX_COUNTERS];
546 int skip;
547 GList *source;
548};
549
550#define MAX_SYMS 100000
551
552static int sym_table_count;
553
554struct sym_entry *sym_filter_entry;
555
556static struct sym_entry sym_table[MAX_SYMS];
557
558static void show_details(struct sym_entry *sym);
559
560/*
Wu Fengguangef45fa92009-03-20 10:08:07 +0800561 * Ordering weight: count-1 * count-2 * ... / count-n
Ingo Molnare0143ba2009-03-23 21:29:59 +0100562 */
563static double sym_weight(const struct sym_entry *sym)
564{
565 double weight;
566 int counter;
567
568 weight = sym->count[0];
569
570 for (counter = 1; counter < nr_counters-1; counter++)
571 weight *= sym->count[counter];
572
573 weight /= (sym->count[counter] + 1);
574
575 return weight;
576}
577
/*
 * qsort() comparator ordering sym_entry records by descending weight.
 *
 * Returns a negative, zero or positive value as qsort() requires.  A
 * comparator that only ever returns 0 or 1 is inconsistent, because it
 * reports "equal" where the swapped call reports "greater".
 */
static int compare(const void *__sym1, const void *__sym2)
{
	const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
	double w1 = sym_weight(sym1);
	double w2 = sym_weight(sym2);

	if (w1 < w2)
		return 1;	/* lighter symbols sort later */
	if (w1 > w2)
		return -1;
	return 0;
}
584
585static time_t last_refresh;
586static long events;
587static long userspace_events;
588static const char CONSOLE_CLEAR[] = "";
589
590static struct sym_entry tmp[MAX_SYMS];
591
592static void print_sym_table(void)
593{
594 int i, printed;
595 int counter;
596 float events_per_sec = events/delay_secs;
597 float kevents_per_sec = (events-userspace_events)/delay_secs;
598
599 memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count);
600 qsort(tmp, sym_table_count, sizeof(tmp[0]), compare);
601
602 write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR));
603
604 printf(
605"------------------------------------------------------------------------------\n");
606 printf( " KernelTop:%8.0f irqs/sec kernel:%3.1f%% [%s, ",
607 events_per_sec,
608 100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)),
609 nmi ? "NMI" : "IRQ");
610
611 if (nr_counters == 1)
612 printf("%d ", event_count[0]);
613
614 for (counter = 0; counter < nr_counters; counter++) {
615 if (counter)
616 printf("/");
617
Wu Fengguange3908612009-03-20 10:08:05 +0800618 printf("%s", event_name(counter));
Ingo Molnare0143ba2009-03-23 21:29:59 +0100619 }
620
621 printf( "], ");
622
623 if (tid != -1)
624 printf(" (tid: %d", tid);
625 else
626 printf(" (all");
627
628 if (profile_cpu != -1)
629 printf(", cpu: %d)\n", profile_cpu);
630 else {
631 if (tid != -1)
632 printf(")\n");
633 else
634 printf(", %d CPUs)\n", nr_cpus);
635 }
636
637 printf("------------------------------------------------------------------------------\n\n");
638
639 if (nr_counters == 1)
640 printf(" events");
641 else
642 printf(" weight events");
643
644 printf(" RIP kernel function\n"
645 " ______ ______ ________________ _______________\n\n"
646 );
647
648 printed = 0;
649 for (i = 0; i < sym_table_count; i++) {
650 int count;
651
652 if (nr_counters == 1) {
653 if (printed <= 18 &&
654 tmp[i].count[0] >= count_filter) {
655 printf("%19.2f - %016llx : %s\n",
656 sym_weight(tmp + i), tmp[i].addr, tmp[i].sym);
657 printed++;
658 }
659 } else {
660 if (printed <= 18 &&
661 tmp[i].count[0] >= count_filter) {
662 printf("%8.1f %10ld - %016llx : %s\n",
663 sym_weight(tmp + i),
664 tmp[i].count[0],
665 tmp[i].addr, tmp[i].sym);
666 printed++;
667 }
668 }
669 /*
670 * Add decay to the counts:
671 */
672 for (count = 0; count < nr_counters; count++)
673 sym_table[i].count[count] = zero ? 0 : sym_table[i].count[count] * 7 / 8;
674 }
675
676 if (sym_filter_entry)
677 show_details(sym_filter_entry);
678
679 last_refresh = time(NULL);
680
681 {
682 struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
683
684 if (poll(&stdin_poll, 1, 0) == 1) {
685 printf("key pressed - exiting.\n");
686 exit(0);
687 }
688 }
689}
690
691static int read_symbol(FILE *in, struct sym_entry *s)
692{
693 static int filter_match = 0;
694 char *sym, stype;
695 char str[500];
696 int rc, pos;
697
698 rc = fscanf(in, "%llx %c %499s", &s->addr, &stype, str);
699 if (rc == EOF)
700 return -1;
701
702 assert(rc == 3);
703
704 /* skip until end of line: */
705 pos = strlen(str);
706 do {
707 rc = fgetc(in);
708 if (rc == '\n' || rc == EOF || pos >= 499)
709 break;
710 str[pos] = rc;
711 pos++;
712 } while (1);
713 str[pos] = 0;
714
715 sym = str;
716
717 /* Filter out known duplicates and non-text symbols. */
718 if (!strcmp(sym, "_text"))
719 return 1;
720 if (!min_ip && !strcmp(sym, "_stext"))
721 return 1;
722 if (!strcmp(sym, "_etext") || !strcmp(sym, "_sinittext"))
723 return 1;
724 if (stype != 'T' && stype != 't')
725 return 1;
726 if (!strncmp("init_module", sym, 11) || !strncmp("cleanup_module", sym, 14))
727 return 1;
728 if (strstr(sym, "_text_start") || strstr(sym, "_text_end"))
729 return 1;
730
731 s->sym = malloc(strlen(str));
732 assert(s->sym);
733
734 strcpy((char *)s->sym, str);
735 s->skip = 0;
736
737 /* Tag events to be skipped. */
738 if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym))
739 s->skip = 1;
740 if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym))
741 s->skip = 1;
742
743 if (filter_match == 1) {
744 filter_end = s->addr;
745 filter_match = -1;
746 if (filter_end - filter_start > 10000) {
747 printf("hm, too large filter symbol <%s> - skipping.\n",
748 sym_filter);
749 printf("symbol filter start: %016lx\n", filter_start);
750 printf(" end: %016lx\n", filter_end);
751 filter_end = filter_start = 0;
752 sym_filter = NULL;
753 sleep(1);
754 }
755 }
756 if (filter_match == 0 && sym_filter && !strcmp(s->sym, sym_filter)) {
757 filter_match = 1;
758 filter_start = s->addr;
759 }
760
761 return 0;
762}
763
764int compare_addr(const void *__sym1, const void *__sym2)
765{
766 const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
767
768 return sym1->addr > sym2->addr;
769}
770
771static void sort_symbol_table(void)
772{
773 int i, dups;
774
775 do {
776 qsort(sym_table, sym_table_count, sizeof(sym_table[0]), compare_addr);
777 for (i = 0, dups = 0; i < sym_table_count; i++) {
778 if (sym_table[i].addr == sym_table[i+1].addr) {
779 sym_table[i+1].addr = -1ll;
780 dups++;
781 }
782 }
783 sym_table_count -= dups;
784 } while(dups);
785}
786
787static void parse_symbols(void)
788{
789 struct sym_entry *last;
790
791 FILE *kallsyms = fopen("/proc/kallsyms", "r");
792
793 if (!kallsyms) {
794 printf("Could not open /proc/kallsyms - no CONFIG_KALLSYMS_ALL=y?\n");
795 exit(-1);
796 }
797
798 while (!feof(kallsyms)) {
799 if (read_symbol(kallsyms, &sym_table[sym_table_count]) == 0) {
800 sym_table_count++;
801 assert(sym_table_count <= MAX_SYMS);
802 }
803 }
804
805 sort_symbol_table();
806 min_ip = sym_table[0].addr;
807 max_ip = sym_table[sym_table_count-1].addr;
808 last = sym_table + sym_table_count++;
809
810 last->addr = -1ll;
811 last->sym = "<end>";
812
813 if (filter_end) {
814 int count;
815 for (count=0; count < sym_table_count; count ++) {
816 if (!strcmp(sym_table[count].sym, sym_filter)) {
817 sym_filter_entry = &sym_table[count];
818 break;
819 }
820 }
821 }
822 if (dump_symtab) {
823 int i;
824
825 for (i = 0; i < sym_table_count; i++)
826 fprintf(stderr, "%llx %s\n",
827 sym_table[i].addr, sym_table[i].sym);
828 }
829}
830
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800831/*
832 * Source lines
833 */
Ingo Molnare0143ba2009-03-23 21:29:59 +0100834
835static void parse_vmlinux(char *filename)
836{
837 FILE *file;
838 char command[PATH_MAX*2];
839 if (!filename)
840 return;
841
842 sprintf(command, "objdump --start-address=0x%016lx --stop-address=0x%016lx -dS %s", filter_start, filter_end, filename);
843
844 file = popen(command, "r");
845 if (!file)
846 return;
847
848 while (!feof(file)) {
849 struct source_line *src;
850 size_t dummy = 0;
851 char *c;
852
853 src = malloc(sizeof(struct source_line));
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800854 assert(src != NULL);
Ingo Molnare0143ba2009-03-23 21:29:59 +0100855 memset(src, 0, sizeof(struct source_line));
856
857 if (getline(&src->line, &dummy, file) < 0)
858 break;
859 if (!src->line)
860 break;
861
862 c = strchr(src->line, '\n');
863 if (c)
864 *c = 0;
865
866 lines = g_list_prepend(lines, src);
867
868 if (strlen(src->line)>8 && src->line[8] == ':')
869 src->EIP = strtoull(src->line, NULL, 16);
870 if (strlen(src->line)>8 && src->line[16] == ':')
871 src->EIP = strtoull(src->line, NULL, 16);
872 }
873 pclose(file);
874 lines = g_list_reverse(lines);
875}
876
877static void record_precise_ip(uint64_t ip)
878{
879 struct source_line *line;
880 GList *item;
881
882 item = g_list_first(lines);
883 while (item) {
884 line = item->data;
885 if (line->EIP == ip)
886 line->count++;
887 if (line->EIP > ip)
888 break;
889 item = g_list_next(item);
890 }
891}
892
893static void lookup_sym_in_vmlinux(struct sym_entry *sym)
894{
895 struct source_line *line;
896 GList *item;
897 char pattern[PATH_MAX];
898 sprintf(pattern, "<%s>:", sym->sym);
899
900 item = g_list_first(lines);
901 while (item) {
902 line = item->data;
903 if (strstr(line->line, pattern)) {
904 sym->source = item;
905 break;
906 }
907 item = g_list_next(item);
908 }
909}
910
911void show_lines(GList *item_queue, int item_queue_count)
912{
913 int i;
914 struct source_line *line;
915
916 for (i = 0; i < item_queue_count; i++) {
917 line = item_queue->data;
918 printf("%8li\t%s\n", line->count, line->line);
919 item_queue = g_list_next(item_queue);
920 }
921}
922
923#define TRACE_COUNT 3
924
925static void show_details(struct sym_entry *sym)
926{
927 struct source_line *line;
928 GList *item;
929 int displayed = 0;
930 GList *item_queue = NULL;
931 int item_queue_count = 0;
932
933 if (!sym->source)
934 lookup_sym_in_vmlinux(sym);
935 if (!sym->source)
936 return;
937
938 printf("Showing details for %s\n", sym->sym);
939
940 item = sym->source;
941 while (item) {
942 line = item->data;
943 if (displayed && strstr(line->line, ">:"))
944 break;
945
946 if (!item_queue_count)
947 item_queue = item;
948 item_queue_count ++;
949
950 if (line->count >= count_filter) {
951 show_lines(item_queue, item_queue_count);
952 item_queue_count = 0;
953 item_queue = NULL;
954 } else if (item_queue_count > TRACE_COUNT) {
955 item_queue = g_list_next(item_queue);
956 item_queue_count --;
957 }
958
959 line->count = 0;
960 displayed++;
961 if (displayed > 300)
962 break;
963 item = g_list_next(item);
964 }
965}
966
967/*
968 * Binary search in the histogram table and record the hit:
969 */
970static void record_ip(uint64_t ip, int counter)
971{
972 int left_idx, middle_idx, right_idx, idx;
973 unsigned long left, middle, right;
974
975 record_precise_ip(ip);
976
977 left_idx = 0;
978 right_idx = sym_table_count-1;
979 assert(ip <= max_ip && ip >= min_ip);
980
981 while (left_idx + 1 < right_idx) {
982 middle_idx = (left_idx + right_idx) / 2;
983
984 left = sym_table[ left_idx].addr;
985 middle = sym_table[middle_idx].addr;
986 right = sym_table[ right_idx].addr;
987
988 if (!(left <= middle && middle <= right)) {
989 printf("%016lx...\n%016lx...\n%016lx\n", left, middle, right);
990 printf("%d %d %d\n", left_idx, middle_idx, right_idx);
991 }
992 assert(left <= middle && middle <= right);
993 if (!(left <= ip && ip <= right)) {
994 printf(" left: %016lx\n", left);
Peter Zijlstra803d4f32009-03-23 18:22:11 +0100995 printf(" ip: %016llx\n", ip);
Ingo Molnare0143ba2009-03-23 21:29:59 +0100996 printf("right: %016lx\n", right);
997 }
998 assert(left <= ip && ip <= right);
999 /*
1000 * [ left .... target .... middle .... right ]
1001 * => right := middle
1002 */
1003 if (ip < middle) {
1004 right_idx = middle_idx;
1005 continue;
1006 }
1007 /*
1008 * [ left .... middle ... target ... right ]
1009 * => left := middle
1010 */
1011 left_idx = middle_idx;
1012 }
1013
1014 idx = left_idx;
1015
1016 if (!sym_table[idx].skip)
1017 sym_table[idx].count[counter]++;
1018 else events--;
1019}
1020
1021static void process_event(uint64_t ip, int counter)
1022{
1023 events++;
1024
1025 if (ip < min_ip || ip > max_ip) {
1026 userspace_events++;
1027 return;
1028 }
1029
1030 record_ip(ip, counter);
1031}
1032
Wu Fengguangf7524bd2009-03-20 10:08:06 +08001033static void process_options(int argc, char *argv[])
1034{
1035 int error = 0, counter;
1036
1037 if (strstr(argv[0], "perfstat"))
1038 run_perfstat = 1;
1039
1040 for (;;) {
1041 int option_index = 0;
1042 /** Options for getopt */
1043 static struct option long_options[] = {
1044 {"count", required_argument, NULL, 'c'},
1045 {"cpu", required_argument, NULL, 'C'},
1046 {"delay", required_argument, NULL, 'd'},
1047 {"dump_symtab", no_argument, NULL, 'D'},
1048 {"event", required_argument, NULL, 'e'},
1049 {"filter", required_argument, NULL, 'f'},
1050 {"group", required_argument, NULL, 'g'},
1051 {"help", no_argument, NULL, 'h'},
1052 {"nmi", required_argument, NULL, 'n'},
1053 {"pid", required_argument, NULL, 'p'},
1054 {"vmlinux", required_argument, NULL, 'x'},
1055 {"symbol", required_argument, NULL, 's'},
1056 {"stat", no_argument, NULL, 'S'},
1057 {"zero", no_argument, NULL, 'z'},
1058 {NULL, 0, NULL, 0 }
1059 };
1060 int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hn:p:s:Sx:z",
1061 long_options, &option_index);
1062 if (c == -1)
1063 break;
1064
1065 switch (c) {
1066 case 'a': system_wide = 1; break;
Peter Zijlstra803d4f32009-03-23 18:22:11 +01001067 case 'c': default_interval = atoi(optarg); break;
Wu Fengguangf7524bd2009-03-20 10:08:06 +08001068 case 'C':
1069 /* CPU and PID are mutually exclusive */
1070 if (tid != -1) {
1071 printf("WARNING: CPU switch overriding PID\n");
1072 sleep(1);
1073 tid = -1;
1074 }
1075 profile_cpu = atoi(optarg); break;
1076 case 'd': delay_secs = atoi(optarg); break;
1077 case 'D': dump_symtab = 1; break;
1078
1079 case 'e': error = parse_events(optarg); break;
1080
1081 case 'f': count_filter = atoi(optarg); break;
1082 case 'g': group = atoi(optarg); break;
1083 case 'h': display_help(); break;
1084 case 'n': nmi = atoi(optarg); break;
1085 case 'p':
1086 /* CPU and PID are mutually exclusive */
1087 if (profile_cpu != -1) {
1088 printf("WARNING: PID switch overriding CPU\n");
1089 sleep(1);
1090 profile_cpu = -1;
1091 }
1092 tid = atoi(optarg); break;
1093 case 's': sym_filter = strdup(optarg); break;
1094 case 'S': run_perfstat = 1; break;
1095 case 'x': vmlinux = strdup(optarg); break;
1096 case 'z': zero = 1; break;
1097 default: error = 1; break;
1098 }
1099 }
1100 if (error)
1101 display_help();
1102
1103 if (!nr_counters) {
1104 if (run_perfstat)
1105 nr_counters = 8;
1106 else {
1107 nr_counters = 1;
1108 event_id[0] = 0;
1109 }
1110 }
1111
1112 for (counter = 0; counter < nr_counters; counter++) {
1113 if (event_count[counter])
1114 continue;
1115
Peter Zijlstra803d4f32009-03-23 18:22:11 +01001116 event_count[counter] = default_interval;
Wu Fengguangf7524bd2009-03-20 10:08:06 +08001117 }
1118}
1119
Ingo Molnare0143ba2009-03-23 21:29:59 +01001120int main(int argc, char *argv[])
1121{
1122 struct pollfd event_array[MAX_NR_CPUS][MAX_COUNTERS];
1123 struct perf_counter_hw_event hw_event;
Ingo Molnare0143ba2009-03-23 21:29:59 +01001124 int i, counter, group_fd;
1125 unsigned int cpu;
1126 uint64_t ip;
1127 ssize_t res;
1128 int ret;
1129
1130 process_options(argc, argv);
1131
1132 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
Wu Fengguangf7524bd2009-03-20 10:08:06 +08001133 assert(nr_cpus <= MAX_NR_CPUS);
1134 assert(nr_cpus >= 0);
1135
1136 if (run_perfstat)
1137 return do_perfstat(argc, argv);
1138
Ingo Molnare0143ba2009-03-23 21:29:59 +01001139 if (tid != -1 || profile_cpu != -1)
1140 nr_cpus = 1;
1141
Ingo Molnare0143ba2009-03-23 21:29:59 +01001142 for (i = 0; i < nr_cpus; i++) {
1143 group_fd = -1;
1144 for (counter = 0; counter < nr_counters; counter++) {
1145
1146 cpu = profile_cpu;
1147 if (tid == -1 && profile_cpu == -1)
1148 cpu = i;
1149
1150 memset(&hw_event, 0, sizeof(hw_event));
Peter Zijlstra803d4f32009-03-23 18:22:11 +01001151 hw_event.config = event_id[counter];
Ingo Molnare0143ba2009-03-23 21:29:59 +01001152 hw_event.irq_period = event_count[counter];
1153 hw_event.record_type = PERF_RECORD_IRQ;
1154 hw_event.nmi = nmi;
1155
Peter Zijlstra803d4f32009-03-23 18:22:11 +01001156 printf("FOO: %d %llx %llx\n", counter, event_id[counter], event_count[counter]);
1157
Ingo Molnare0143ba2009-03-23 21:29:59 +01001158 fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
1159 fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
1160 if (fd[i][counter] < 0) {
1161 printf("kerneltop error: syscall returned with %d (%s)\n",
1162 fd[i][counter], strerror(-fd[i][counter]));
1163 if (fd[i][counter] == -1)
1164 printf("Are you root?\n");
1165 exit(-1);
1166 }
1167 assert(fd[i][counter] >= 0);
1168
1169 /*
1170 * First counter acts as the group leader:
1171 */
1172 if (group && group_fd == -1)
1173 group_fd = fd[i][counter];
1174
1175 event_array[i][counter].fd = fd[i][counter];
1176 event_array[i][counter].events = POLLIN;
1177 }
1178 }
1179
1180 parse_symbols();
1181 if (vmlinux && sym_filter_entry)
1182 parse_vmlinux(vmlinux);
1183
1184 printf("KernelTop refresh period: %d seconds\n", delay_secs);
1185 last_refresh = time(NULL);
1186
1187 while (1) {
1188 int hits = events;
1189
1190 for (i = 0; i < nr_cpus; i++) {
1191 for (counter = 0; counter < nr_counters; counter++) {
1192 res = read(fd[i][counter], (char *) &ip, sizeof(ip));
1193 if (res > 0) {
1194 assert(res == sizeof(ip));
1195
1196 process_event(ip, counter);
1197 }
1198 }
1199 }
1200
1201 if (time(NULL) >= last_refresh + delay_secs) {
1202 print_sym_table();
1203 events = userspace_events = 0;
1204 }
1205
1206 if (hits == events)
1207 ret = poll(event_array[0], nr_cpus, 1000);
1208 hits = events;
1209 }
1210
1211 return 0;
1212}