/*
 * kerneltop.c: show top kernel functions - performance counters showcase

   Build with:

     cc -O6 -Wall -lrt `pkg-config --cflags --libs glib-2.0` -o kerneltop kerneltop.c

   Sample output:

------------------------------------------------------------------------------
 KernelTop: 2669 irqs/sec [NMI, cache-misses/cache-refs], (all, cpu: 2)
------------------------------------------------------------------------------

 weight RIP kernel function
 ______ ________________ _______________

 35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
 33.00 - ffffffff804cb740 : sock_alloc_send_skb
 31.26 - ffffffff804ce808 : skb_push
 22.43 - ffffffff80510004 : tcp_established_options
 19.00 - ffffffff8027d250 : find_get_page
 15.76 - ffffffff804e4fc9 : eth_type_trans
 15.20 - ffffffff804d8baa : dst_release
 14.86 - ffffffff804cf5d8 : skb_release_head_state
 14.00 - ffffffff802217d5 : read_hpet
 12.00 - ffffffff804ffb7f : __ip_local_out
 11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
 8.54 - ffffffff805001a3 : ip_queue_xmit
 */
Wu Fengguangf7524bd2009-03-20 10:08:06 +080030
/*
 * perfstat: /usr/bin/time -alike performance counter statistics utility

   It summarizes the counter events of all tasks (and child tasks),
   covering all CPUs that the command (or workload) executes on.
   It only counts the per-task events of the workload started,
   independent of how many other tasks run on those CPUs.

   Sample output:

   $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null

   Performance counter stats for 'ls':

   163516953 instructions
   2295 cache-misses
   2855182 branch-misses
 */
49
/*
 * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
 *
 * Improvements and fixes by:
 *
 *   Arjan van de Ven <arjan@linux.intel.com>
 *   Yanmin Zhang <yanmin.zhang@intel.com>
 *   Wu Fengguang <fengguang.wu@intel.com>
 *   Mike Galbraith <efault@gmx.de>
 *
 * Released under the GPL v2. (and only v2, not any later version)
 */
62
Ingo Molnare0143ba2009-03-23 21:29:59 +010063#define _GNU_SOURCE
64#include <sys/types.h>
65#include <sys/stat.h>
66#include <sys/time.h>
67#include <unistd.h>
68#include <stdint.h>
69#include <stdlib.h>
70#include <string.h>
71#include <getopt.h>
72#include <assert.h>
73#include <fcntl.h>
74#include <stdio.h>
75#include <errno.h>
76#include <ctype.h>
77#include <time.h>
78
79#include <glib.h>
80
81#include <sys/syscall.h>
82#include <sys/ioctl.h>
83#include <sys/poll.h>
84#include <sys/prctl.h>
85#include <sys/wait.h>
86#include <sys/uio.h>
87
88#include <linux/unistd.h>
89
Wu Fengguangcea92ce2009-03-20 10:08:02 +080090#include "perfcounters.h"
Ingo Molnare0143ba2009-03-23 21:29:59 +010091
Wu Fengguangf7524bd2009-03-20 10:08:06 +080092
93#define MAX_COUNTERS 64
94#define MAX_NR_CPUS 256
95
96#define DEF_PERFSTAT_EVENTS { -2, -5, -4, -3, 0, 1, 2, 3}
97
98static int run_perfstat = 0;
99static int system_wide = 0;
100
101static int nr_counters = 0;
Wu Fengguang3ab8d792009-03-20 10:08:08 +0800102static __s64 event_id[MAX_COUNTERS] = DEF_PERFSTAT_EVENTS;
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800103static int event_raw[MAX_COUNTERS];
104static int event_count[MAX_COUNTERS];
105static int fd[MAX_NR_CPUS][MAX_COUNTERS];
Ingo Molnare0143ba2009-03-23 21:29:59 +0100106
Ingo Molnare0143ba2009-03-23 21:29:59 +0100107static __u64 count_filter = 100;
108
Ingo Molnare0143ba2009-03-23 21:29:59 +0100109static int tid = -1;
110static int profile_cpu = -1;
111static int nr_cpus = 0;
112static int nmi = 1;
113static int group = 0;
114
115static char *vmlinux;
116
117static char *sym_filter;
118static unsigned long filter_start;
119static unsigned long filter_end;
120
121static int delay_secs = 2;
122static int zero;
123static int dump_symtab;
124
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800125static GList *lines;
126
/* One line of objdump output together with the samples attributed to it. */
struct source_line {
	uint64_t		EIP;	/* address parsed from the line; 0 when none was parsed */
	unsigned long		count;	/* samples recorded at exactly this address */
	char			*line;	/* line text, newline stripped */
};
132
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800133
/*
 * Default sampling periods, indexed by non-negative event id.
 * The table has six entries (ids 0..5); callers must range-check the id.
 */
const unsigned int default_count[] = {
	[0] =   10000,
	[1] = 1000000,
	[2] =   10000,
	[3] =   10000,
	[4] = 1000000,
	[5] =   10000,
};
142
/* Display names for event ids >= 0; event_name() indexes this by the id. */
static char *hw_event_names[] = {
	[0] = "CPU cycles",
	[1] = "instructions",
	[2] = "cache references",
	[3] = "cache misses",
	[4] = "branches",
	[5] = "branch misses",
	[6] = "bus cycles",
};
152
/* Display names for negative event ids; event_name() indexes this by -id-1. */
static char *sw_event_names[] = {
	[0] = "cpu clock ticks",
	[1] = "task clock ticks",
	[2] = "pagefaults",
	[3] = "context switches",
	[4] = "CPU migrations",
};
160
/* Maps one symbolic command-line name to an event id. */
struct event_symbol {
	int			event;		/* event id the name stands for */
	char			*symbol;	/* name accepted by -e */
};
165
166static struct event_symbol event_symbols[] = {
167 {PERF_COUNT_CPU_CYCLES, "cpu-cycles", },
168 {PERF_COUNT_CPU_CYCLES, "cycles", },
169 {PERF_COUNT_INSTRUCTIONS, "instructions", },
170 {PERF_COUNT_CACHE_REFERENCES, "cache-references", },
171 {PERF_COUNT_CACHE_MISSES, "cache-misses", },
172 {PERF_COUNT_BRANCH_INSTRUCTIONS, "branch-instructions", },
173 {PERF_COUNT_BRANCH_INSTRUCTIONS, "branches", },
174 {PERF_COUNT_BRANCH_MISSES, "branch-misses", },
175 {PERF_COUNT_BUS_CYCLES, "bus-cycles", },
176 {PERF_COUNT_CPU_CLOCK, "cpu-ticks", },
177 {PERF_COUNT_CPU_CLOCK, "ticks", },
178 {PERF_COUNT_TASK_CLOCK, "task-ticks", },
179 {PERF_COUNT_PAGE_FAULTS, "page-faults", },
180 {PERF_COUNT_PAGE_FAULTS, "faults", },
181 {PERF_COUNT_CONTEXT_SWITCHES, "context-switches", },
182 {PERF_COUNT_CONTEXT_SWITCHES, "cs", },
183 {PERF_COUNT_CPU_MIGRATIONS, "cpu-migrations", },
184 {PERF_COUNT_CPU_MIGRATIONS, "migrations", },
185};
186
187static void display_events_help(void)
188{
189 unsigned int i;
190 int e;
191
192 printf(
193 " -e EVENT --event=EVENT # symbolic-name abbreviations");
194
195 for (i = 0, e = PERF_HW_EVENTS_MAX; i < ARRAY_SIZE(event_symbols); i++) {
196 if (e != event_symbols[i].event) {
197 e = event_symbols[i].event;
198 printf(
199 "\n %2d: %-20s", e, event_symbols[i].symbol);
200 } else
201 printf(" %s", event_symbols[i].symbol);
202 }
203
204 printf("\n"
205 " rNNN: raw PMU events (eventsel+umask)\n\n");
206}
207
208static void display_perfstat_help(void)
209{
210 printf(
211 "Usage: perfstat [<events...>] <cmd...>\n\n"
212 "PerfStat Options (up to %d event types can be specified):\n\n",
213 MAX_COUNTERS);
214
215 display_events_help();
216
217 printf(
218 " -a # system-wide collection\n");
219 exit(0);
220}
Ingo Molnare0143ba2009-03-23 21:29:59 +0100221
222static void display_help(void)
223{
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800224 if (run_perfstat)
225 return display_perfstat_help();
226
Ingo Molnare0143ba2009-03-23 21:29:59 +0100227 printf(
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800228 "Usage: kerneltop [<options>]\n"
229 " Or: kerneltop -S [<options>] COMMAND [ARGS]\n\n"
Ingo Molnare0143ba2009-03-23 21:29:59 +0100230 "KernelTop Options (up to %d event types can be specified at once):\n\n",
231 MAX_COUNTERS);
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800232
233 display_events_help();
234
Ingo Molnare0143ba2009-03-23 21:29:59 +0100235 printf(
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800236 " -S --stat # perfstat COMMAND\n"
237 " -a # system-wide collection (for perfstat)\n\n"
Ingo Molnare0143ba2009-03-23 21:29:59 +0100238 " -c CNT --count=CNT # event period to sample\n\n"
239 " -C CPU --cpu=CPU # CPU (-1 for all) [default: -1]\n"
240 " -p PID --pid=PID # PID of sampled task (-1 for all) [default: -1]\n\n"
241 " -d delay --delay=<seconds> # sampling/display delay [default: 2]\n"
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800242 " -f CNT --filter=CNT # min-event-count filter [default: 100]\n\n"
Ingo Molnare0143ba2009-03-23 21:29:59 +0100243 " -s symbol --symbol=<symbol> # function to be showed annotated one-shot\n"
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800244 " -x path --vmlinux=<path> # the vmlinux binary, required for -s use\n"
Ingo Molnare0143ba2009-03-23 21:29:59 +0100245 " -z --zero # zero counts after display\n"
246 " -D --dump_symtab # dump symbol table to stderr on startup\n"
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800247 );
Ingo Molnare0143ba2009-03-23 21:29:59 +0100248
249 exit(0);
250}
251
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800252static int type_valid(int type)
Ingo Molnare0143ba2009-03-23 21:29:59 +0100253{
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800254 if (type >= PERF_HW_EVENTS_MAX)
255 return 0;
256 if (type <= PERF_SW_EVENTS_MIN)
257 return 0;
Ingo Molnare0143ba2009-03-23 21:29:59 +0100258
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800259 return 1;
260}
Ingo Molnare0143ba2009-03-23 21:29:59 +0100261
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800262static char *event_name(int ctr)
263{
Wu Fengguang3ab8d792009-03-20 10:08:08 +0800264 __s64 type = event_id[ctr];
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800265 static char buf[32];
266
267 if (event_raw[ctr]) {
Wu Fengguang3ab8d792009-03-20 10:08:08 +0800268 sprintf(buf, "raw 0x%llx", (long long)type);
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800269 return buf;
270 }
271 if (!type_valid(type))
272 return "unknown";
273
274 if (type >= 0)
275 return hw_event_names[type];
276
277 return sw_event_names[-type-1];
278}
279
280/*
281 * Each event can have multiple symbolic names.
282 * Symbolic names are (almost) exactly matched.
283 */
284static int match_event_symbols(char *str)
285{
286 unsigned int i;
287
288 if (isdigit(str[0]) || str[0] == '-')
289 return atoi(str);
290
291 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
292 if (!strncmp(str, event_symbols[i].symbol,
293 strlen(event_symbols[i].symbol)))
294 return event_symbols[i].event;
295 }
296
297 return PERF_HW_EVENTS_MAX;
298}
299
300static int parse_events(char *str)
301{
Wu Fengguang3ab8d792009-03-20 10:08:08 +0800302 __s64 type;
303 int raw;
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800304
305again:
306 if (nr_counters == MAX_COUNTERS)
307 return -1;
308
309 raw = 0;
310 if (*str == 'r') {
311 raw = 1;
312 ++str;
313 type = strtol(str, NULL, 16);
314 } else {
315 type = match_event_symbols(str);
316 if (!type_valid(type))
317 return -1;
318 }
319
320 event_id[nr_counters] = type;
321 event_raw[nr_counters] = raw;
322 nr_counters++;
323
324 str = strstr(str, ",");
325 if (str) {
326 str++;
327 goto again;
328 }
329
330 return 0;
331}
332
333
/*
 * perfstat
 */

/* NOTE(review): not referenced anywhere in the visible part of this file. */
char fault_here[1000000];
339
340static void create_perfstat_counter(int counter)
341{
342 struct perf_counter_hw_event hw_event;
343
344 memset(&hw_event, 0, sizeof(hw_event));
345 hw_event.type = event_id[counter];
346 hw_event.raw = event_raw[counter];
347 hw_event.record_type = PERF_RECORD_SIMPLE;
348 hw_event.nmi = 0;
349
350 if (system_wide) {
351 int cpu;
352 for (cpu = 0; cpu < nr_cpus; cpu ++) {
353 fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
354 if (fd[cpu][counter] < 0) {
355 printf("perfstat error: syscall returned with %d (%s)\n",
356 fd[cpu][counter], strerror(errno));
357 exit(-1);
Ingo Molnare0143ba2009-03-23 21:29:59 +0100358 }
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800359 }
360 } else {
361 hw_event.inherit = 1;
362 hw_event.disabled = 1;
Ingo Molnare0143ba2009-03-23 21:29:59 +0100363
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800364 fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
365 if (fd[0][counter] < 0) {
366 printf("perfstat error: syscall returned with %d (%s)\n",
367 fd[0][counter], strerror(errno));
368 exit(-1);
Ingo Molnare0143ba2009-03-23 21:29:59 +0100369 }
370 }
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800371}
Ingo Molnare0143ba2009-03-23 21:29:59 +0100372
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800373int do_perfstat(int argc, char *argv[])
374{
375 unsigned long long t0, t1;
376 int counter;
377 ssize_t res;
378 int status;
379 int pid;
380
381 if (!system_wide)
382 nr_cpus = 1;
383
384 for (counter = 0; counter < nr_counters; counter++)
385 create_perfstat_counter(counter);
386
387 argc -= optind;
388 argv += optind;
389
390 /*
391 * Enable counters and exec the command:
392 */
393 t0 = rdclock();
394 prctl(PR_TASK_PERF_COUNTERS_ENABLE);
395
396 if ((pid = fork()) < 0)
397 perror("failed to fork");
398 if (!pid) {
399 if (execvp(argv[0], argv)) {
400 perror(argv[0]);
401 exit(-1);
402 }
Wu Fengguang95bb3be2009-03-20 10:08:04 +0800403 }
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800404 while (wait(&status) >= 0)
405 ;
406 prctl(PR_TASK_PERF_COUNTERS_DISABLE);
407 t1 = rdclock();
408
409 fflush(stdout);
410
411 fprintf(stderr, "\n");
412 fprintf(stderr, " Performance counter stats for \'%s\':\n",
413 argv[0]);
414 fprintf(stderr, "\n");
Ingo Molnare0143ba2009-03-23 21:29:59 +0100415
416 for (counter = 0; counter < nr_counters; counter++) {
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800417 int cpu;
418 __u64 count, single_count;
Ingo Molnare0143ba2009-03-23 21:29:59 +0100419
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800420 count = 0;
421 for (cpu = 0; cpu < nr_cpus; cpu ++) {
422 res = read(fd[cpu][counter],
423 (char *) &single_count, sizeof(single_count));
424 assert(res == sizeof(single_count));
425 count += single_count;
426 }
427
428 if (!event_raw[counter] &&
429 (event_id[counter] == PERF_COUNT_CPU_CLOCK ||
430 event_id[counter] == PERF_COUNT_TASK_CLOCK)) {
431
432 double msecs = (double)count / 1000000;
433
434 fprintf(stderr, " %14.6f %-20s (msecs)\n",
435 msecs, event_name(counter));
436 } else {
437 fprintf(stderr, " %14Ld %-20s (events)\n",
438 count, event_name(counter));
439 }
440 if (!counter)
441 fprintf(stderr, "\n");
Ingo Molnare0143ba2009-03-23 21:29:59 +0100442 }
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800443 fprintf(stderr, "\n");
444 fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
445 (double)(t1-t0)/1e6);
446 fprintf(stderr, "\n");
447
448 return 0;
Ingo Molnare0143ba2009-03-23 21:29:59 +0100449}
450
/*
 * Symbols
 */

/*
 * Address range covered by the symbol table; parse_symbols() sets both
 * from the sorted table.  Samples outside the range are counted as
 * userspace events by process_event().
 */
static uint64_t			min_ip;
static uint64_t			max_ip = -1ll;
457
458struct sym_entry {
459 unsigned long long addr;
460 char *sym;
461 unsigned long count[MAX_COUNTERS];
462 int skip;
463 GList *source;
464};
465
466#define MAX_SYMS 100000
467
468static int sym_table_count;
469
470struct sym_entry *sym_filter_entry;
471
472static struct sym_entry sym_table[MAX_SYMS];
473
474static void show_details(struct sym_entry *sym);
475
476/*
Wu Fengguangef45fa92009-03-20 10:08:07 +0800477 * Ordering weight: count-1 * count-2 * ... / count-n
Ingo Molnare0143ba2009-03-23 21:29:59 +0100478 */
479static double sym_weight(const struct sym_entry *sym)
480{
481 double weight;
482 int counter;
483
484 weight = sym->count[0];
485
486 for (counter = 1; counter < nr_counters-1; counter++)
487 weight *= sym->count[counter];
488
489 weight /= (sym->count[counter] + 1);
490
491 return weight;
492}
493
/*
 * qsort() comparator: orders sym_entry records by descending weight.
 *
 * qsort() requires a negative, zero or positive result.  Returning only
 * 0 or 1 (as "a < b" does) makes the comparison inconsistent and leaves
 * the resulting order unspecified.
 */
static int compare(const void *__sym1, const void *__sym2)
{
	const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
	const double w1 = sym_weight(sym1);
	const double w2 = sym_weight(sym2);

	/* heavier entries sort first */
	return (w1 < w2) - (w1 > w2);
}
500
501static time_t last_refresh;
502static long events;
503static long userspace_events;
504static const char CONSOLE_CLEAR[] = "";
505
506static struct sym_entry tmp[MAX_SYMS];
507
508static void print_sym_table(void)
509{
510 int i, printed;
511 int counter;
512 float events_per_sec = events/delay_secs;
513 float kevents_per_sec = (events-userspace_events)/delay_secs;
514
515 memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count);
516 qsort(tmp, sym_table_count, sizeof(tmp[0]), compare);
517
518 write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR));
519
520 printf(
521"------------------------------------------------------------------------------\n");
522 printf( " KernelTop:%8.0f irqs/sec kernel:%3.1f%% [%s, ",
523 events_per_sec,
524 100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)),
525 nmi ? "NMI" : "IRQ");
526
527 if (nr_counters == 1)
528 printf("%d ", event_count[0]);
529
530 for (counter = 0; counter < nr_counters; counter++) {
531 if (counter)
532 printf("/");
533
Wu Fengguange3908612009-03-20 10:08:05 +0800534 printf("%s", event_name(counter));
Ingo Molnare0143ba2009-03-23 21:29:59 +0100535 }
536
537 printf( "], ");
538
539 if (tid != -1)
540 printf(" (tid: %d", tid);
541 else
542 printf(" (all");
543
544 if (profile_cpu != -1)
545 printf(", cpu: %d)\n", profile_cpu);
546 else {
547 if (tid != -1)
548 printf(")\n");
549 else
550 printf(", %d CPUs)\n", nr_cpus);
551 }
552
553 printf("------------------------------------------------------------------------------\n\n");
554
555 if (nr_counters == 1)
556 printf(" events");
557 else
558 printf(" weight events");
559
560 printf(" RIP kernel function\n"
561 " ______ ______ ________________ _______________\n\n"
562 );
563
564 printed = 0;
565 for (i = 0; i < sym_table_count; i++) {
566 int count;
567
568 if (nr_counters == 1) {
569 if (printed <= 18 &&
570 tmp[i].count[0] >= count_filter) {
571 printf("%19.2f - %016llx : %s\n",
572 sym_weight(tmp + i), tmp[i].addr, tmp[i].sym);
573 printed++;
574 }
575 } else {
576 if (printed <= 18 &&
577 tmp[i].count[0] >= count_filter) {
578 printf("%8.1f %10ld - %016llx : %s\n",
579 sym_weight(tmp + i),
580 tmp[i].count[0],
581 tmp[i].addr, tmp[i].sym);
582 printed++;
583 }
584 }
585 /*
586 * Add decay to the counts:
587 */
588 for (count = 0; count < nr_counters; count++)
589 sym_table[i].count[count] = zero ? 0 : sym_table[i].count[count] * 7 / 8;
590 }
591
592 if (sym_filter_entry)
593 show_details(sym_filter_entry);
594
595 last_refresh = time(NULL);
596
597 {
598 struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
599
600 if (poll(&stdin_poll, 1, 0) == 1) {
601 printf("key pressed - exiting.\n");
602 exit(0);
603 }
604 }
605}
606
607static int read_symbol(FILE *in, struct sym_entry *s)
608{
609 static int filter_match = 0;
610 char *sym, stype;
611 char str[500];
612 int rc, pos;
613
614 rc = fscanf(in, "%llx %c %499s", &s->addr, &stype, str);
615 if (rc == EOF)
616 return -1;
617
618 assert(rc == 3);
619
620 /* skip until end of line: */
621 pos = strlen(str);
622 do {
623 rc = fgetc(in);
624 if (rc == '\n' || rc == EOF || pos >= 499)
625 break;
626 str[pos] = rc;
627 pos++;
628 } while (1);
629 str[pos] = 0;
630
631 sym = str;
632
633 /* Filter out known duplicates and non-text symbols. */
634 if (!strcmp(sym, "_text"))
635 return 1;
636 if (!min_ip && !strcmp(sym, "_stext"))
637 return 1;
638 if (!strcmp(sym, "_etext") || !strcmp(sym, "_sinittext"))
639 return 1;
640 if (stype != 'T' && stype != 't')
641 return 1;
642 if (!strncmp("init_module", sym, 11) || !strncmp("cleanup_module", sym, 14))
643 return 1;
644 if (strstr(sym, "_text_start") || strstr(sym, "_text_end"))
645 return 1;
646
647 s->sym = malloc(strlen(str));
648 assert(s->sym);
649
650 strcpy((char *)s->sym, str);
651 s->skip = 0;
652
653 /* Tag events to be skipped. */
654 if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym))
655 s->skip = 1;
656 if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym))
657 s->skip = 1;
658
659 if (filter_match == 1) {
660 filter_end = s->addr;
661 filter_match = -1;
662 if (filter_end - filter_start > 10000) {
663 printf("hm, too large filter symbol <%s> - skipping.\n",
664 sym_filter);
665 printf("symbol filter start: %016lx\n", filter_start);
666 printf(" end: %016lx\n", filter_end);
667 filter_end = filter_start = 0;
668 sym_filter = NULL;
669 sleep(1);
670 }
671 }
672 if (filter_match == 0 && sym_filter && !strcmp(s->sym, sym_filter)) {
673 filter_match = 1;
674 filter_start = s->addr;
675 }
676
677 return 0;
678}
679
680int compare_addr(const void *__sym1, const void *__sym2)
681{
682 const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
683
684 return sym1->addr > sym2->addr;
685}
686
687static void sort_symbol_table(void)
688{
689 int i, dups;
690
691 do {
692 qsort(sym_table, sym_table_count, sizeof(sym_table[0]), compare_addr);
693 for (i = 0, dups = 0; i < sym_table_count; i++) {
694 if (sym_table[i].addr == sym_table[i+1].addr) {
695 sym_table[i+1].addr = -1ll;
696 dups++;
697 }
698 }
699 sym_table_count -= dups;
700 } while(dups);
701}
702
703static void parse_symbols(void)
704{
705 struct sym_entry *last;
706
707 FILE *kallsyms = fopen("/proc/kallsyms", "r");
708
709 if (!kallsyms) {
710 printf("Could not open /proc/kallsyms - no CONFIG_KALLSYMS_ALL=y?\n");
711 exit(-1);
712 }
713
714 while (!feof(kallsyms)) {
715 if (read_symbol(kallsyms, &sym_table[sym_table_count]) == 0) {
716 sym_table_count++;
717 assert(sym_table_count <= MAX_SYMS);
718 }
719 }
720
721 sort_symbol_table();
722 min_ip = sym_table[0].addr;
723 max_ip = sym_table[sym_table_count-1].addr;
724 last = sym_table + sym_table_count++;
725
726 last->addr = -1ll;
727 last->sym = "<end>";
728
729 if (filter_end) {
730 int count;
731 for (count=0; count < sym_table_count; count ++) {
732 if (!strcmp(sym_table[count].sym, sym_filter)) {
733 sym_filter_entry = &sym_table[count];
734 break;
735 }
736 }
737 }
738 if (dump_symtab) {
739 int i;
740
741 for (i = 0; i < sym_table_count; i++)
742 fprintf(stderr, "%llx %s\n",
743 sym_table[i].addr, sym_table[i].sym);
744 }
745}
746
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800747/*
748 * Source lines
749 */
Ingo Molnare0143ba2009-03-23 21:29:59 +0100750
751static void parse_vmlinux(char *filename)
752{
753 FILE *file;
754 char command[PATH_MAX*2];
755 if (!filename)
756 return;
757
758 sprintf(command, "objdump --start-address=0x%016lx --stop-address=0x%016lx -dS %s", filter_start, filter_end, filename);
759
760 file = popen(command, "r");
761 if (!file)
762 return;
763
764 while (!feof(file)) {
765 struct source_line *src;
766 size_t dummy = 0;
767 char *c;
768
769 src = malloc(sizeof(struct source_line));
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800770 assert(src != NULL);
Ingo Molnare0143ba2009-03-23 21:29:59 +0100771 memset(src, 0, sizeof(struct source_line));
772
773 if (getline(&src->line, &dummy, file) < 0)
774 break;
775 if (!src->line)
776 break;
777
778 c = strchr(src->line, '\n');
779 if (c)
780 *c = 0;
781
782 lines = g_list_prepend(lines, src);
783
784 if (strlen(src->line)>8 && src->line[8] == ':')
785 src->EIP = strtoull(src->line, NULL, 16);
786 if (strlen(src->line)>8 && src->line[16] == ':')
787 src->EIP = strtoull(src->line, NULL, 16);
788 }
789 pclose(file);
790 lines = g_list_reverse(lines);
791}
792
793static void record_precise_ip(uint64_t ip)
794{
795 struct source_line *line;
796 GList *item;
797
798 item = g_list_first(lines);
799 while (item) {
800 line = item->data;
801 if (line->EIP == ip)
802 line->count++;
803 if (line->EIP > ip)
804 break;
805 item = g_list_next(item);
806 }
807}
808
809static void lookup_sym_in_vmlinux(struct sym_entry *sym)
810{
811 struct source_line *line;
812 GList *item;
813 char pattern[PATH_MAX];
814 sprintf(pattern, "<%s>:", sym->sym);
815
816 item = g_list_first(lines);
817 while (item) {
818 line = item->data;
819 if (strstr(line->line, pattern)) {
820 sym->source = item;
821 break;
822 }
823 item = g_list_next(item);
824 }
825}
826
827void show_lines(GList *item_queue, int item_queue_count)
828{
829 int i;
830 struct source_line *line;
831
832 for (i = 0; i < item_queue_count; i++) {
833 line = item_queue->data;
834 printf("%8li\t%s\n", line->count, line->line);
835 item_queue = g_list_next(item_queue);
836 }
837}
838
839#define TRACE_COUNT 3
840
841static void show_details(struct sym_entry *sym)
842{
843 struct source_line *line;
844 GList *item;
845 int displayed = 0;
846 GList *item_queue = NULL;
847 int item_queue_count = 0;
848
849 if (!sym->source)
850 lookup_sym_in_vmlinux(sym);
851 if (!sym->source)
852 return;
853
854 printf("Showing details for %s\n", sym->sym);
855
856 item = sym->source;
857 while (item) {
858 line = item->data;
859 if (displayed && strstr(line->line, ">:"))
860 break;
861
862 if (!item_queue_count)
863 item_queue = item;
864 item_queue_count ++;
865
866 if (line->count >= count_filter) {
867 show_lines(item_queue, item_queue_count);
868 item_queue_count = 0;
869 item_queue = NULL;
870 } else if (item_queue_count > TRACE_COUNT) {
871 item_queue = g_list_next(item_queue);
872 item_queue_count --;
873 }
874
875 line->count = 0;
876 displayed++;
877 if (displayed > 300)
878 break;
879 item = g_list_next(item);
880 }
881}
882
883/*
884 * Binary search in the histogram table and record the hit:
885 */
886static void record_ip(uint64_t ip, int counter)
887{
888 int left_idx, middle_idx, right_idx, idx;
889 unsigned long left, middle, right;
890
891 record_precise_ip(ip);
892
893 left_idx = 0;
894 right_idx = sym_table_count-1;
895 assert(ip <= max_ip && ip >= min_ip);
896
897 while (left_idx + 1 < right_idx) {
898 middle_idx = (left_idx + right_idx) / 2;
899
900 left = sym_table[ left_idx].addr;
901 middle = sym_table[middle_idx].addr;
902 right = sym_table[ right_idx].addr;
903
904 if (!(left <= middle && middle <= right)) {
905 printf("%016lx...\n%016lx...\n%016lx\n", left, middle, right);
906 printf("%d %d %d\n", left_idx, middle_idx, right_idx);
907 }
908 assert(left <= middle && middle <= right);
909 if (!(left <= ip && ip <= right)) {
910 printf(" left: %016lx\n", left);
911 printf(" ip: %016lx\n", ip);
912 printf("right: %016lx\n", right);
913 }
914 assert(left <= ip && ip <= right);
915 /*
916 * [ left .... target .... middle .... right ]
917 * => right := middle
918 */
919 if (ip < middle) {
920 right_idx = middle_idx;
921 continue;
922 }
923 /*
924 * [ left .... middle ... target ... right ]
925 * => left := middle
926 */
927 left_idx = middle_idx;
928 }
929
930 idx = left_idx;
931
932 if (!sym_table[idx].skip)
933 sym_table[idx].count[counter]++;
934 else events--;
935}
936
937static void process_event(uint64_t ip, int counter)
938{
939 events++;
940
941 if (ip < min_ip || ip > max_ip) {
942 userspace_events++;
943 return;
944 }
945
946 record_ip(ip, counter);
947}
948
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800949static void process_options(int argc, char *argv[])
950{
951 int error = 0, counter;
952
953 if (strstr(argv[0], "perfstat"))
954 run_perfstat = 1;
955
956 for (;;) {
957 int option_index = 0;
958 /** Options for getopt */
959 static struct option long_options[] = {
960 {"count", required_argument, NULL, 'c'},
961 {"cpu", required_argument, NULL, 'C'},
962 {"delay", required_argument, NULL, 'd'},
963 {"dump_symtab", no_argument, NULL, 'D'},
964 {"event", required_argument, NULL, 'e'},
965 {"filter", required_argument, NULL, 'f'},
966 {"group", required_argument, NULL, 'g'},
967 {"help", no_argument, NULL, 'h'},
968 {"nmi", required_argument, NULL, 'n'},
969 {"pid", required_argument, NULL, 'p'},
970 {"vmlinux", required_argument, NULL, 'x'},
971 {"symbol", required_argument, NULL, 's'},
972 {"stat", no_argument, NULL, 'S'},
973 {"zero", no_argument, NULL, 'z'},
974 {NULL, 0, NULL, 0 }
975 };
976 int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hn:p:s:Sx:z",
977 long_options, &option_index);
978 if (c == -1)
979 break;
980
981 switch (c) {
982 case 'a': system_wide = 1; break;
983 case 'c': event_count[nr_counters] = atoi(optarg); break;
984 case 'C':
985 /* CPU and PID are mutually exclusive */
986 if (tid != -1) {
987 printf("WARNING: CPU switch overriding PID\n");
988 sleep(1);
989 tid = -1;
990 }
991 profile_cpu = atoi(optarg); break;
992 case 'd': delay_secs = atoi(optarg); break;
993 case 'D': dump_symtab = 1; break;
994
995 case 'e': error = parse_events(optarg); break;
996
997 case 'f': count_filter = atoi(optarg); break;
998 case 'g': group = atoi(optarg); break;
999 case 'h': display_help(); break;
1000 case 'n': nmi = atoi(optarg); break;
1001 case 'p':
1002 /* CPU and PID are mutually exclusive */
1003 if (profile_cpu != -1) {
1004 printf("WARNING: PID switch overriding CPU\n");
1005 sleep(1);
1006 profile_cpu = -1;
1007 }
1008 tid = atoi(optarg); break;
1009 case 's': sym_filter = strdup(optarg); break;
1010 case 'S': run_perfstat = 1; break;
1011 case 'x': vmlinux = strdup(optarg); break;
1012 case 'z': zero = 1; break;
1013 default: error = 1; break;
1014 }
1015 }
1016 if (error)
1017 display_help();
1018
1019 if (!nr_counters) {
1020 if (run_perfstat)
1021 nr_counters = 8;
1022 else {
1023 nr_counters = 1;
1024 event_id[0] = 0;
1025 }
1026 }
1027
1028 for (counter = 0; counter < nr_counters; counter++) {
1029 if (event_count[counter])
1030 continue;
1031
1032 if (event_id[counter] < PERF_HW_EVENTS_MAX)
1033 event_count[counter] = default_count[event_id[counter]];
1034 else
1035 event_count[counter] = 100000;
1036 }
1037}
1038
Ingo Molnare0143ba2009-03-23 21:29:59 +01001039int main(int argc, char *argv[])
1040{
1041 struct pollfd event_array[MAX_NR_CPUS][MAX_COUNTERS];
1042 struct perf_counter_hw_event hw_event;
Ingo Molnare0143ba2009-03-23 21:29:59 +01001043 int i, counter, group_fd;
1044 unsigned int cpu;
1045 uint64_t ip;
1046 ssize_t res;
1047 int ret;
1048
1049 process_options(argc, argv);
1050
1051 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
Wu Fengguangf7524bd2009-03-20 10:08:06 +08001052 assert(nr_cpus <= MAX_NR_CPUS);
1053 assert(nr_cpus >= 0);
1054
1055 if (run_perfstat)
1056 return do_perfstat(argc, argv);
1057
Ingo Molnare0143ba2009-03-23 21:29:59 +01001058 if (tid != -1 || profile_cpu != -1)
1059 nr_cpus = 1;
1060
Ingo Molnare0143ba2009-03-23 21:29:59 +01001061 for (i = 0; i < nr_cpus; i++) {
1062 group_fd = -1;
1063 for (counter = 0; counter < nr_counters; counter++) {
1064
1065 cpu = profile_cpu;
1066 if (tid == -1 && profile_cpu == -1)
1067 cpu = i;
1068
1069 memset(&hw_event, 0, sizeof(hw_event));
1070 hw_event.type = event_id[counter];
1071 hw_event.raw = event_raw[counter];
1072 hw_event.irq_period = event_count[counter];
1073 hw_event.record_type = PERF_RECORD_IRQ;
1074 hw_event.nmi = nmi;
1075
1076 fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
1077 fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
1078 if (fd[i][counter] < 0) {
1079 printf("kerneltop error: syscall returned with %d (%s)\n",
1080 fd[i][counter], strerror(-fd[i][counter]));
1081 if (fd[i][counter] == -1)
1082 printf("Are you root?\n");
1083 exit(-1);
1084 }
1085 assert(fd[i][counter] >= 0);
1086
1087 /*
1088 * First counter acts as the group leader:
1089 */
1090 if (group && group_fd == -1)
1091 group_fd = fd[i][counter];
1092
1093 event_array[i][counter].fd = fd[i][counter];
1094 event_array[i][counter].events = POLLIN;
1095 }
1096 }
1097
1098 parse_symbols();
1099 if (vmlinux && sym_filter_entry)
1100 parse_vmlinux(vmlinux);
1101
1102 printf("KernelTop refresh period: %d seconds\n", delay_secs);
1103 last_refresh = time(NULL);
1104
1105 while (1) {
1106 int hits = events;
1107
1108 for (i = 0; i < nr_cpus; i++) {
1109 for (counter = 0; counter < nr_counters; counter++) {
1110 res = read(fd[i][counter], (char *) &ip, sizeof(ip));
1111 if (res > 0) {
1112 assert(res == sizeof(ip));
1113
1114 process_event(ip, counter);
1115 }
1116 }
1117 }
1118
1119 if (time(NULL) >= last_refresh + delay_secs) {
1120 print_sym_table();
1121 events = userspace_events = 0;
1122 }
1123
1124 if (hits == events)
1125 ret = poll(event_array[0], nr_cpus, 1000);
1126 hits = events;
1127 }
1128
1129 return 0;
1130}