blob: 81a68aac137f75d9f726366283348feb8741f9dd [file] [log] [blame]
Ingo Molnare0143ba2009-03-23 21:29:59 +01001/*
2 * kerneltop.c: show top kernel functions - performance counters showcase
3
4 Build with:
5
Wu Fengguangf7524bd2009-03-20 10:08:06 +08006 cc -O6 -Wall -lrt `pkg-config --cflags --libs glib-2.0` -o kerneltop kerneltop.c
Ingo Molnare0143ba2009-03-23 21:29:59 +01007
8 Sample output:
9
10------------------------------------------------------------------------------
11 KernelTop: 2669 irqs/sec [NMI, cache-misses/cache-refs], (all, cpu: 2)
12------------------------------------------------------------------------------
13
14 weight RIP kernel function
15 ______ ________________ _______________
16
17 35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
18 33.00 - ffffffff804cb740 : sock_alloc_send_skb
19 31.26 - ffffffff804ce808 : skb_push
20 22.43 - ffffffff80510004 : tcp_established_options
21 19.00 - ffffffff8027d250 : find_get_page
22 15.76 - ffffffff804e4fc9 : eth_type_trans
23 15.20 - ffffffff804d8baa : dst_release
24 14.86 - ffffffff804cf5d8 : skb_release_head_state
25 14.00 - ffffffff802217d5 : read_hpet
26 12.00 - ffffffff804ffb7f : __ip_local_out
27 11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
28 8.54 - ffffffff805001a3 : ip_queue_xmit
Ingo Molnare0143ba2009-03-23 21:29:59 +010029 */
Wu Fengguangf7524bd2009-03-20 10:08:06 +080030
31/*
32 * perfstat: /usr/bin/time -alike performance counter statistics utility
33
34 It summarizes the counter events of all tasks (and child tasks),
35 covering all CPUs that the command (or workload) executes on.
36 It only counts the per-task events of the workload started,
37 independent of how many other tasks run on those CPUs.
38
39 Sample output:
40
41 $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
42
43 Performance counter stats for 'ls':
44
45 163516953 instructions
46 2295 cache-misses
47 2855182 branch-misses
48 */
49
50 /*
51 * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
52 *
53 * Improvements and fixes by:
54 *
55 * Arjan van de Ven <arjan@linux.intel.com>
56 * Yanmin Zhang <yanmin.zhang@intel.com>
57 * Wu Fengguang <fengguang.wu@intel.com>
58 * Mike Galbraith <efault@gmx.de>
59 *
60 * Released under the GPL v2. (and only v2, not any later version)
61 */
62
Ingo Molnare0143ba2009-03-23 21:29:59 +010063#define _GNU_SOURCE
64#include <sys/types.h>
65#include <sys/stat.h>
66#include <sys/time.h>
67#include <unistd.h>
68#include <stdint.h>
69#include <stdlib.h>
70#include <string.h>
71#include <getopt.h>
72#include <assert.h>
73#include <fcntl.h>
74#include <stdio.h>
75#include <errno.h>
76#include <ctype.h>
77#include <time.h>
78
79#include <glib.h>
80
81#include <sys/syscall.h>
82#include <sys/ioctl.h>
83#include <sys/poll.h>
84#include <sys/prctl.h>
85#include <sys/wait.h>
86#include <sys/uio.h>
87
88#include <linux/unistd.h>
89
Wu Fengguangcea92ce2009-03-20 10:08:02 +080090#include "perfcounters.h"
Ingo Molnare0143ba2009-03-23 21:29:59 +010091
Wu Fengguangf7524bd2009-03-20 10:08:06 +080092
93#define MAX_COUNTERS 64
94#define MAX_NR_CPUS 256
95
96#define DEF_PERFSTAT_EVENTS { -2, -5, -4, -3, 0, 1, 2, 3}
97
98static int run_perfstat = 0;
99static int system_wide = 0;
100
101static int nr_counters = 0;
Wu Fengguang3ab8d792009-03-20 10:08:08 +0800102static __s64 event_id[MAX_COUNTERS] = DEF_PERFSTAT_EVENTS;
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800103static int event_raw[MAX_COUNTERS];
104static int event_count[MAX_COUNTERS];
105static int fd[MAX_NR_CPUS][MAX_COUNTERS];
Ingo Molnare0143ba2009-03-23 21:29:59 +0100106
Ingo Molnare0143ba2009-03-23 21:29:59 +0100107static __u64 count_filter = 100;
108
Ingo Molnare0143ba2009-03-23 21:29:59 +0100109static int tid = -1;
110static int profile_cpu = -1;
111static int nr_cpus = 0;
112static int nmi = 1;
113static int group = 0;
114
115static char *vmlinux;
116
117static char *sym_filter;
118static unsigned long filter_start;
119static unsigned long filter_end;
120
121static int delay_secs = 2;
122static int zero;
123static int dump_symtab;
124
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800125static GList *lines;
126
/*
 * One line of objdump output for the annotated symbol.
 */
struct source_line {
	uint64_t		EIP;	/* address parsed from the line, 0 if the line has none */
	unsigned long		count;	/* samples that hit exactly this address */
	char			*line;	/* line text with the trailing newline removed */
};
132
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800133
/*
 * Default sampling periods; process_options() looks them up by event id
 * when no explicit -c value was given.
 */
const unsigned int default_count[] = {
	  10000,
	1000000,
	  10000,
	  10000,
	1000000,
	  10000,
};

/* Display names for non-negative event ids; event_name() indexes by id. */
static char *hw_event_names[] = {
	"CPU cycles",
	"instructions",
	"cache references",
	"cache misses",
	"branches",
	"branch misses",
	"bus cycles",
};

/* Display names for negative event ids; event_name() indexes by (-id - 1). */
static char *sw_event_names[] = {
	"cpu clock ticks",
	"task clock ticks",
	"pagefaults",
	"context switches",
	"CPU migrations",
};
160
161struct event_symbol {
162 int event;
163 char *symbol;
164};
165
166static struct event_symbol event_symbols[] = {
167 {PERF_COUNT_CPU_CYCLES, "cpu-cycles", },
168 {PERF_COUNT_CPU_CYCLES, "cycles", },
169 {PERF_COUNT_INSTRUCTIONS, "instructions", },
170 {PERF_COUNT_CACHE_REFERENCES, "cache-references", },
171 {PERF_COUNT_CACHE_MISSES, "cache-misses", },
172 {PERF_COUNT_BRANCH_INSTRUCTIONS, "branch-instructions", },
173 {PERF_COUNT_BRANCH_INSTRUCTIONS, "branches", },
174 {PERF_COUNT_BRANCH_MISSES, "branch-misses", },
175 {PERF_COUNT_BUS_CYCLES, "bus-cycles", },
176 {PERF_COUNT_CPU_CLOCK, "cpu-ticks", },
177 {PERF_COUNT_CPU_CLOCK, "ticks", },
178 {PERF_COUNT_TASK_CLOCK, "task-ticks", },
179 {PERF_COUNT_PAGE_FAULTS, "page-faults", },
180 {PERF_COUNT_PAGE_FAULTS, "faults", },
181 {PERF_COUNT_CONTEXT_SWITCHES, "context-switches", },
182 {PERF_COUNT_CONTEXT_SWITCHES, "cs", },
183 {PERF_COUNT_CPU_MIGRATIONS, "cpu-migrations", },
184 {PERF_COUNT_CPU_MIGRATIONS, "migrations", },
185};
186
187static void display_events_help(void)
188{
189 unsigned int i;
190 int e;
191
192 printf(
193 " -e EVENT --event=EVENT # symbolic-name abbreviations");
194
195 for (i = 0, e = PERF_HW_EVENTS_MAX; i < ARRAY_SIZE(event_symbols); i++) {
196 if (e != event_symbols[i].event) {
197 e = event_symbols[i].event;
198 printf(
199 "\n %2d: %-20s", e, event_symbols[i].symbol);
200 } else
201 printf(" %s", event_symbols[i].symbol);
202 }
203
204 printf("\n"
205 " rNNN: raw PMU events (eventsel+umask)\n\n");
206}
207
208static void display_perfstat_help(void)
209{
210 printf(
211 "Usage: perfstat [<events...>] <cmd...>\n\n"
212 "PerfStat Options (up to %d event types can be specified):\n\n",
213 MAX_COUNTERS);
214
215 display_events_help();
216
217 printf(
218 " -a # system-wide collection\n");
219 exit(0);
220}
Ingo Molnare0143ba2009-03-23 21:29:59 +0100221
222static void display_help(void)
223{
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800224 if (run_perfstat)
225 return display_perfstat_help();
226
Ingo Molnare0143ba2009-03-23 21:29:59 +0100227 printf(
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800228 "Usage: kerneltop [<options>]\n"
229 " Or: kerneltop -S [<options>] COMMAND [ARGS]\n\n"
Ingo Molnare0143ba2009-03-23 21:29:59 +0100230 "KernelTop Options (up to %d event types can be specified at once):\n\n",
231 MAX_COUNTERS);
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800232
233 display_events_help();
234
Ingo Molnare0143ba2009-03-23 21:29:59 +0100235 printf(
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800236 " -S --stat # perfstat COMMAND\n"
237 " -a # system-wide collection (for perfstat)\n\n"
Ingo Molnare0143ba2009-03-23 21:29:59 +0100238 " -c CNT --count=CNT # event period to sample\n\n"
239 " -C CPU --cpu=CPU # CPU (-1 for all) [default: -1]\n"
240 " -p PID --pid=PID # PID of sampled task (-1 for all) [default: -1]\n\n"
241 " -d delay --delay=<seconds> # sampling/display delay [default: 2]\n"
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800242 " -f CNT --filter=CNT # min-event-count filter [default: 100]\n\n"
Ingo Molnare0143ba2009-03-23 21:29:59 +0100243 " -s symbol --symbol=<symbol> # function to be showed annotated one-shot\n"
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800244 " -x path --vmlinux=<path> # the vmlinux binary, required for -s use\n"
Ingo Molnare0143ba2009-03-23 21:29:59 +0100245 " -z --zero # zero counts after display\n"
246 " -D --dump_symtab # dump symbol table to stderr on startup\n"
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800247 );
Ingo Molnare0143ba2009-03-23 21:29:59 +0100248
249 exit(0);
250}
251
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800252static int type_valid(int type)
Ingo Molnare0143ba2009-03-23 21:29:59 +0100253{
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800254 if (type >= PERF_HW_EVENTS_MAX)
255 return 0;
256 if (type <= PERF_SW_EVENTS_MIN)
257 return 0;
Ingo Molnare0143ba2009-03-23 21:29:59 +0100258
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800259 return 1;
260}
Ingo Molnare0143ba2009-03-23 21:29:59 +0100261
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800262static char *event_name(int ctr)
263{
Wu Fengguang3ab8d792009-03-20 10:08:08 +0800264 __s64 type = event_id[ctr];
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800265 static char buf[32];
266
267 if (event_raw[ctr]) {
Wu Fengguang3ab8d792009-03-20 10:08:08 +0800268 sprintf(buf, "raw 0x%llx", (long long)type);
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800269 return buf;
270 }
271 if (!type_valid(type))
272 return "unknown";
273
274 if (type >= 0)
275 return hw_event_names[type];
276
277 return sw_event_names[-type-1];
278}
279
280/*
281 * Each event can have multiple symbolic names.
282 * Symbolic names are (almost) exactly matched.
283 */
284static int match_event_symbols(char *str)
285{
286 unsigned int i;
287
288 if (isdigit(str[0]) || str[0] == '-')
289 return atoi(str);
290
291 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
292 if (!strncmp(str, event_symbols[i].symbol,
293 strlen(event_symbols[i].symbol)))
294 return event_symbols[i].event;
295 }
296
297 return PERF_HW_EVENTS_MAX;
298}
299
300static int parse_events(char *str)
301{
Wu Fengguang3ab8d792009-03-20 10:08:08 +0800302 __s64 type;
303 int raw;
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800304
305again:
306 if (nr_counters == MAX_COUNTERS)
307 return -1;
308
309 raw = 0;
310 if (*str == 'r') {
311 raw = 1;
312 ++str;
313 type = strtol(str, NULL, 16);
314 } else {
315 type = match_event_symbols(str);
316 if (!type_valid(type))
317 return -1;
318 }
319
320 event_id[nr_counters] = type;
321 event_raw[nr_counters] = raw;
322 nr_counters++;
323
324 str = strstr(str, ",");
325 if (str) {
326 str++;
327 goto again;
328 }
329
330 return 0;
331}
332
333
334/*
335 * perfstat
336 */
337
338char fault_here[1000000];
339
340static void create_perfstat_counter(int counter)
341{
342 struct perf_counter_hw_event hw_event;
343
344 memset(&hw_event, 0, sizeof(hw_event));
345 hw_event.type = event_id[counter];
346 hw_event.raw = event_raw[counter];
347 hw_event.record_type = PERF_RECORD_SIMPLE;
348 hw_event.nmi = 0;
349
350 if (system_wide) {
351 int cpu;
352 for (cpu = 0; cpu < nr_cpus; cpu ++) {
353 fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
354 if (fd[cpu][counter] < 0) {
355 printf("perfstat error: syscall returned with %d (%s)\n",
356 fd[cpu][counter], strerror(errno));
357 exit(-1);
Ingo Molnare0143ba2009-03-23 21:29:59 +0100358 }
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800359 }
360 } else {
361 hw_event.inherit = 1;
362 hw_event.disabled = 1;
Ingo Molnare0143ba2009-03-23 21:29:59 +0100363
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800364 fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
365 if (fd[0][counter] < 0) {
366 printf("perfstat error: syscall returned with %d (%s)\n",
367 fd[0][counter], strerror(errno));
368 exit(-1);
Ingo Molnare0143ba2009-03-23 21:29:59 +0100369 }
370 }
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800371}
Ingo Molnare0143ba2009-03-23 21:29:59 +0100372
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800373int do_perfstat(int argc, char *argv[])
374{
375 unsigned long long t0, t1;
376 int counter;
377 ssize_t res;
378 int status;
379 int pid;
380
381 if (!system_wide)
382 nr_cpus = 1;
383
384 for (counter = 0; counter < nr_counters; counter++)
385 create_perfstat_counter(counter);
386
387 argc -= optind;
388 argv += optind;
389
Wu Fengguangaf9522c2009-03-20 10:08:10 +0800390 if (!argc)
391 display_help();
392
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800393 /*
394 * Enable counters and exec the command:
395 */
396 t0 = rdclock();
397 prctl(PR_TASK_PERF_COUNTERS_ENABLE);
398
399 if ((pid = fork()) < 0)
400 perror("failed to fork");
401 if (!pid) {
402 if (execvp(argv[0], argv)) {
403 perror(argv[0]);
404 exit(-1);
405 }
Wu Fengguang95bb3be2009-03-20 10:08:04 +0800406 }
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800407 while (wait(&status) >= 0)
408 ;
409 prctl(PR_TASK_PERF_COUNTERS_DISABLE);
410 t1 = rdclock();
411
412 fflush(stdout);
413
414 fprintf(stderr, "\n");
415 fprintf(stderr, " Performance counter stats for \'%s\':\n",
416 argv[0]);
417 fprintf(stderr, "\n");
Ingo Molnare0143ba2009-03-23 21:29:59 +0100418
419 for (counter = 0; counter < nr_counters; counter++) {
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800420 int cpu;
421 __u64 count, single_count;
Ingo Molnare0143ba2009-03-23 21:29:59 +0100422
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800423 count = 0;
424 for (cpu = 0; cpu < nr_cpus; cpu ++) {
425 res = read(fd[cpu][counter],
426 (char *) &single_count, sizeof(single_count));
427 assert(res == sizeof(single_count));
428 count += single_count;
429 }
430
431 if (!event_raw[counter] &&
432 (event_id[counter] == PERF_COUNT_CPU_CLOCK ||
433 event_id[counter] == PERF_COUNT_TASK_CLOCK)) {
434
435 double msecs = (double)count / 1000000;
436
437 fprintf(stderr, " %14.6f %-20s (msecs)\n",
438 msecs, event_name(counter));
439 } else {
440 fprintf(stderr, " %14Ld %-20s (events)\n",
441 count, event_name(counter));
442 }
443 if (!counter)
444 fprintf(stderr, "\n");
Ingo Molnare0143ba2009-03-23 21:29:59 +0100445 }
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800446 fprintf(stderr, "\n");
447 fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
448 (double)(t1-t0)/1e6);
449 fprintf(stderr, "\n");
450
451 return 0;
Ingo Molnare0143ba2009-03-23 21:29:59 +0100452}
453
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800454/*
455 * Symbols
456 */
457
Ingo Molnare0143ba2009-03-23 21:29:59 +0100458static uint64_t min_ip;
459static uint64_t max_ip = -1ll;
460
461struct sym_entry {
462 unsigned long long addr;
463 char *sym;
464 unsigned long count[MAX_COUNTERS];
465 int skip;
466 GList *source;
467};
468
469#define MAX_SYMS 100000
470
471static int sym_table_count;
472
473struct sym_entry *sym_filter_entry;
474
475static struct sym_entry sym_table[MAX_SYMS];
476
477static void show_details(struct sym_entry *sym);
478
479/*
Wu Fengguangef45fa92009-03-20 10:08:07 +0800480 * Ordering weight: count-1 * count-2 * ... / count-n
Ingo Molnare0143ba2009-03-23 21:29:59 +0100481 */
482static double sym_weight(const struct sym_entry *sym)
483{
484 double weight;
485 int counter;
486
487 weight = sym->count[0];
488
489 for (counter = 1; counter < nr_counters-1; counter++)
490 weight *= sym->count[counter];
491
492 weight /= (sym->count[counter] + 1);
493
494 return weight;
495}
496
/*
 * qsort() comparator ordering symbols by descending weight.
 *
 * Returns a negative, zero or positive value as qsort() requires; a
 * comparator that only ever returns 0 or 1 gives inconsistent answers
 * when its arguments are swapped, so the resulting order is unspecified.
 */
static int compare(const void *__sym1, const void *__sym2)
{
	const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
	double weight1 = sym_weight(sym1);
	double weight2 = sym_weight(sym2);

	if (weight1 > weight2)
		return -1;
	if (weight1 < weight2)
		return 1;
	return 0;
}
503
504static time_t last_refresh;
505static long events;
506static long userspace_events;
507static const char CONSOLE_CLEAR[] = "";
508
509static struct sym_entry tmp[MAX_SYMS];
510
511static void print_sym_table(void)
512{
513 int i, printed;
514 int counter;
515 float events_per_sec = events/delay_secs;
516 float kevents_per_sec = (events-userspace_events)/delay_secs;
517
518 memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count);
519 qsort(tmp, sym_table_count, sizeof(tmp[0]), compare);
520
521 write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR));
522
523 printf(
524"------------------------------------------------------------------------------\n");
525 printf( " KernelTop:%8.0f irqs/sec kernel:%3.1f%% [%s, ",
526 events_per_sec,
527 100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)),
528 nmi ? "NMI" : "IRQ");
529
530 if (nr_counters == 1)
531 printf("%d ", event_count[0]);
532
533 for (counter = 0; counter < nr_counters; counter++) {
534 if (counter)
535 printf("/");
536
Wu Fengguange3908612009-03-20 10:08:05 +0800537 printf("%s", event_name(counter));
Ingo Molnare0143ba2009-03-23 21:29:59 +0100538 }
539
540 printf( "], ");
541
542 if (tid != -1)
543 printf(" (tid: %d", tid);
544 else
545 printf(" (all");
546
547 if (profile_cpu != -1)
548 printf(", cpu: %d)\n", profile_cpu);
549 else {
550 if (tid != -1)
551 printf(")\n");
552 else
553 printf(", %d CPUs)\n", nr_cpus);
554 }
555
556 printf("------------------------------------------------------------------------------\n\n");
557
558 if (nr_counters == 1)
559 printf(" events");
560 else
561 printf(" weight events");
562
563 printf(" RIP kernel function\n"
564 " ______ ______ ________________ _______________\n\n"
565 );
566
567 printed = 0;
568 for (i = 0; i < sym_table_count; i++) {
569 int count;
570
571 if (nr_counters == 1) {
572 if (printed <= 18 &&
573 tmp[i].count[0] >= count_filter) {
574 printf("%19.2f - %016llx : %s\n",
575 sym_weight(tmp + i), tmp[i].addr, tmp[i].sym);
576 printed++;
577 }
578 } else {
579 if (printed <= 18 &&
580 tmp[i].count[0] >= count_filter) {
581 printf("%8.1f %10ld - %016llx : %s\n",
582 sym_weight(tmp + i),
583 tmp[i].count[0],
584 tmp[i].addr, tmp[i].sym);
585 printed++;
586 }
587 }
588 /*
589 * Add decay to the counts:
590 */
591 for (count = 0; count < nr_counters; count++)
592 sym_table[i].count[count] = zero ? 0 : sym_table[i].count[count] * 7 / 8;
593 }
594
595 if (sym_filter_entry)
596 show_details(sym_filter_entry);
597
598 last_refresh = time(NULL);
599
600 {
601 struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
602
603 if (poll(&stdin_poll, 1, 0) == 1) {
604 printf("key pressed - exiting.\n");
605 exit(0);
606 }
607 }
608}
609
610static int read_symbol(FILE *in, struct sym_entry *s)
611{
612 static int filter_match = 0;
613 char *sym, stype;
614 char str[500];
615 int rc, pos;
616
617 rc = fscanf(in, "%llx %c %499s", &s->addr, &stype, str);
618 if (rc == EOF)
619 return -1;
620
621 assert(rc == 3);
622
623 /* skip until end of line: */
624 pos = strlen(str);
625 do {
626 rc = fgetc(in);
627 if (rc == '\n' || rc == EOF || pos >= 499)
628 break;
629 str[pos] = rc;
630 pos++;
631 } while (1);
632 str[pos] = 0;
633
634 sym = str;
635
636 /* Filter out known duplicates and non-text symbols. */
637 if (!strcmp(sym, "_text"))
638 return 1;
639 if (!min_ip && !strcmp(sym, "_stext"))
640 return 1;
641 if (!strcmp(sym, "_etext") || !strcmp(sym, "_sinittext"))
642 return 1;
643 if (stype != 'T' && stype != 't')
644 return 1;
645 if (!strncmp("init_module", sym, 11) || !strncmp("cleanup_module", sym, 14))
646 return 1;
647 if (strstr(sym, "_text_start") || strstr(sym, "_text_end"))
648 return 1;
649
650 s->sym = malloc(strlen(str));
651 assert(s->sym);
652
653 strcpy((char *)s->sym, str);
654 s->skip = 0;
655
656 /* Tag events to be skipped. */
657 if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym))
658 s->skip = 1;
659 if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym))
660 s->skip = 1;
661
662 if (filter_match == 1) {
663 filter_end = s->addr;
664 filter_match = -1;
665 if (filter_end - filter_start > 10000) {
666 printf("hm, too large filter symbol <%s> - skipping.\n",
667 sym_filter);
668 printf("symbol filter start: %016lx\n", filter_start);
669 printf(" end: %016lx\n", filter_end);
670 filter_end = filter_start = 0;
671 sym_filter = NULL;
672 sleep(1);
673 }
674 }
675 if (filter_match == 0 && sym_filter && !strcmp(s->sym, sym_filter)) {
676 filter_match = 1;
677 filter_start = s->addr;
678 }
679
680 return 0;
681}
682
683int compare_addr(const void *__sym1, const void *__sym2)
684{
685 const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
686
687 return sym1->addr > sym2->addr;
688}
689
690static void sort_symbol_table(void)
691{
692 int i, dups;
693
694 do {
695 qsort(sym_table, sym_table_count, sizeof(sym_table[0]), compare_addr);
696 for (i = 0, dups = 0; i < sym_table_count; i++) {
697 if (sym_table[i].addr == sym_table[i+1].addr) {
698 sym_table[i+1].addr = -1ll;
699 dups++;
700 }
701 }
702 sym_table_count -= dups;
703 } while(dups);
704}
705
706static void parse_symbols(void)
707{
708 struct sym_entry *last;
709
710 FILE *kallsyms = fopen("/proc/kallsyms", "r");
711
712 if (!kallsyms) {
713 printf("Could not open /proc/kallsyms - no CONFIG_KALLSYMS_ALL=y?\n");
714 exit(-1);
715 }
716
717 while (!feof(kallsyms)) {
718 if (read_symbol(kallsyms, &sym_table[sym_table_count]) == 0) {
719 sym_table_count++;
720 assert(sym_table_count <= MAX_SYMS);
721 }
722 }
723
724 sort_symbol_table();
725 min_ip = sym_table[0].addr;
726 max_ip = sym_table[sym_table_count-1].addr;
727 last = sym_table + sym_table_count++;
728
729 last->addr = -1ll;
730 last->sym = "<end>";
731
732 if (filter_end) {
733 int count;
734 for (count=0; count < sym_table_count; count ++) {
735 if (!strcmp(sym_table[count].sym, sym_filter)) {
736 sym_filter_entry = &sym_table[count];
737 break;
738 }
739 }
740 }
741 if (dump_symtab) {
742 int i;
743
744 for (i = 0; i < sym_table_count; i++)
745 fprintf(stderr, "%llx %s\n",
746 sym_table[i].addr, sym_table[i].sym);
747 }
748}
749
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800750/*
751 * Source lines
752 */
Ingo Molnare0143ba2009-03-23 21:29:59 +0100753
754static void parse_vmlinux(char *filename)
755{
756 FILE *file;
757 char command[PATH_MAX*2];
758 if (!filename)
759 return;
760
761 sprintf(command, "objdump --start-address=0x%016lx --stop-address=0x%016lx -dS %s", filter_start, filter_end, filename);
762
763 file = popen(command, "r");
764 if (!file)
765 return;
766
767 while (!feof(file)) {
768 struct source_line *src;
769 size_t dummy = 0;
770 char *c;
771
772 src = malloc(sizeof(struct source_line));
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800773 assert(src != NULL);
Ingo Molnare0143ba2009-03-23 21:29:59 +0100774 memset(src, 0, sizeof(struct source_line));
775
776 if (getline(&src->line, &dummy, file) < 0)
777 break;
778 if (!src->line)
779 break;
780
781 c = strchr(src->line, '\n');
782 if (c)
783 *c = 0;
784
785 lines = g_list_prepend(lines, src);
786
787 if (strlen(src->line)>8 && src->line[8] == ':')
788 src->EIP = strtoull(src->line, NULL, 16);
789 if (strlen(src->line)>8 && src->line[16] == ':')
790 src->EIP = strtoull(src->line, NULL, 16);
791 }
792 pclose(file);
793 lines = g_list_reverse(lines);
794}
795
796static void record_precise_ip(uint64_t ip)
797{
798 struct source_line *line;
799 GList *item;
800
801 item = g_list_first(lines);
802 while (item) {
803 line = item->data;
804 if (line->EIP == ip)
805 line->count++;
806 if (line->EIP > ip)
807 break;
808 item = g_list_next(item);
809 }
810}
811
812static void lookup_sym_in_vmlinux(struct sym_entry *sym)
813{
814 struct source_line *line;
815 GList *item;
816 char pattern[PATH_MAX];
817 sprintf(pattern, "<%s>:", sym->sym);
818
819 item = g_list_first(lines);
820 while (item) {
821 line = item->data;
822 if (strstr(line->line, pattern)) {
823 sym->source = item;
824 break;
825 }
826 item = g_list_next(item);
827 }
828}
829
830void show_lines(GList *item_queue, int item_queue_count)
831{
832 int i;
833 struct source_line *line;
834
835 for (i = 0; i < item_queue_count; i++) {
836 line = item_queue->data;
837 printf("%8li\t%s\n", line->count, line->line);
838 item_queue = g_list_next(item_queue);
839 }
840}
841
842#define TRACE_COUNT 3
843
844static void show_details(struct sym_entry *sym)
845{
846 struct source_line *line;
847 GList *item;
848 int displayed = 0;
849 GList *item_queue = NULL;
850 int item_queue_count = 0;
851
852 if (!sym->source)
853 lookup_sym_in_vmlinux(sym);
854 if (!sym->source)
855 return;
856
857 printf("Showing details for %s\n", sym->sym);
858
859 item = sym->source;
860 while (item) {
861 line = item->data;
862 if (displayed && strstr(line->line, ">:"))
863 break;
864
865 if (!item_queue_count)
866 item_queue = item;
867 item_queue_count ++;
868
869 if (line->count >= count_filter) {
870 show_lines(item_queue, item_queue_count);
871 item_queue_count = 0;
872 item_queue = NULL;
873 } else if (item_queue_count > TRACE_COUNT) {
874 item_queue = g_list_next(item_queue);
875 item_queue_count --;
876 }
877
878 line->count = 0;
879 displayed++;
880 if (displayed > 300)
881 break;
882 item = g_list_next(item);
883 }
884}
885
886/*
887 * Binary search in the histogram table and record the hit:
888 */
889static void record_ip(uint64_t ip, int counter)
890{
891 int left_idx, middle_idx, right_idx, idx;
892 unsigned long left, middle, right;
893
894 record_precise_ip(ip);
895
896 left_idx = 0;
897 right_idx = sym_table_count-1;
898 assert(ip <= max_ip && ip >= min_ip);
899
900 while (left_idx + 1 < right_idx) {
901 middle_idx = (left_idx + right_idx) / 2;
902
903 left = sym_table[ left_idx].addr;
904 middle = sym_table[middle_idx].addr;
905 right = sym_table[ right_idx].addr;
906
907 if (!(left <= middle && middle <= right)) {
908 printf("%016lx...\n%016lx...\n%016lx\n", left, middle, right);
909 printf("%d %d %d\n", left_idx, middle_idx, right_idx);
910 }
911 assert(left <= middle && middle <= right);
912 if (!(left <= ip && ip <= right)) {
913 printf(" left: %016lx\n", left);
914 printf(" ip: %016lx\n", ip);
915 printf("right: %016lx\n", right);
916 }
917 assert(left <= ip && ip <= right);
918 /*
919 * [ left .... target .... middle .... right ]
920 * => right := middle
921 */
922 if (ip < middle) {
923 right_idx = middle_idx;
924 continue;
925 }
926 /*
927 * [ left .... middle ... target ... right ]
928 * => left := middle
929 */
930 left_idx = middle_idx;
931 }
932
933 idx = left_idx;
934
935 if (!sym_table[idx].skip)
936 sym_table[idx].count[counter]++;
937 else events--;
938}
939
940static void process_event(uint64_t ip, int counter)
941{
942 events++;
943
944 if (ip < min_ip || ip > max_ip) {
945 userspace_events++;
946 return;
947 }
948
949 record_ip(ip, counter);
950}
951
Wu Fengguangf7524bd2009-03-20 10:08:06 +0800952static void process_options(int argc, char *argv[])
953{
954 int error = 0, counter;
955
956 if (strstr(argv[0], "perfstat"))
957 run_perfstat = 1;
958
959 for (;;) {
960 int option_index = 0;
961 /** Options for getopt */
962 static struct option long_options[] = {
963 {"count", required_argument, NULL, 'c'},
964 {"cpu", required_argument, NULL, 'C'},
965 {"delay", required_argument, NULL, 'd'},
966 {"dump_symtab", no_argument, NULL, 'D'},
967 {"event", required_argument, NULL, 'e'},
968 {"filter", required_argument, NULL, 'f'},
969 {"group", required_argument, NULL, 'g'},
970 {"help", no_argument, NULL, 'h'},
971 {"nmi", required_argument, NULL, 'n'},
972 {"pid", required_argument, NULL, 'p'},
973 {"vmlinux", required_argument, NULL, 'x'},
974 {"symbol", required_argument, NULL, 's'},
975 {"stat", no_argument, NULL, 'S'},
976 {"zero", no_argument, NULL, 'z'},
977 {NULL, 0, NULL, 0 }
978 };
979 int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hn:p:s:Sx:z",
980 long_options, &option_index);
981 if (c == -1)
982 break;
983
984 switch (c) {
985 case 'a': system_wide = 1; break;
986 case 'c': event_count[nr_counters] = atoi(optarg); break;
987 case 'C':
988 /* CPU and PID are mutually exclusive */
989 if (tid != -1) {
990 printf("WARNING: CPU switch overriding PID\n");
991 sleep(1);
992 tid = -1;
993 }
994 profile_cpu = atoi(optarg); break;
995 case 'd': delay_secs = atoi(optarg); break;
996 case 'D': dump_symtab = 1; break;
997
998 case 'e': error = parse_events(optarg); break;
999
1000 case 'f': count_filter = atoi(optarg); break;
1001 case 'g': group = atoi(optarg); break;
1002 case 'h': display_help(); break;
1003 case 'n': nmi = atoi(optarg); break;
1004 case 'p':
1005 /* CPU and PID are mutually exclusive */
1006 if (profile_cpu != -1) {
1007 printf("WARNING: PID switch overriding CPU\n");
1008 sleep(1);
1009 profile_cpu = -1;
1010 }
1011 tid = atoi(optarg); break;
1012 case 's': sym_filter = strdup(optarg); break;
1013 case 'S': run_perfstat = 1; break;
1014 case 'x': vmlinux = strdup(optarg); break;
1015 case 'z': zero = 1; break;
1016 default: error = 1; break;
1017 }
1018 }
1019 if (error)
1020 display_help();
1021
1022 if (!nr_counters) {
1023 if (run_perfstat)
1024 nr_counters = 8;
1025 else {
1026 nr_counters = 1;
1027 event_id[0] = 0;
1028 }
1029 }
1030
1031 for (counter = 0; counter < nr_counters; counter++) {
1032 if (event_count[counter])
1033 continue;
1034
1035 if (event_id[counter] < PERF_HW_EVENTS_MAX)
1036 event_count[counter] = default_count[event_id[counter]];
1037 else
1038 event_count[counter] = 100000;
1039 }
1040}
1041
Ingo Molnare0143ba2009-03-23 21:29:59 +01001042int main(int argc, char *argv[])
1043{
1044 struct pollfd event_array[MAX_NR_CPUS][MAX_COUNTERS];
1045 struct perf_counter_hw_event hw_event;
Ingo Molnare0143ba2009-03-23 21:29:59 +01001046 int i, counter, group_fd;
1047 unsigned int cpu;
1048 uint64_t ip;
1049 ssize_t res;
1050 int ret;
1051
1052 process_options(argc, argv);
1053
1054 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
Wu Fengguangf7524bd2009-03-20 10:08:06 +08001055 assert(nr_cpus <= MAX_NR_CPUS);
1056 assert(nr_cpus >= 0);
1057
1058 if (run_perfstat)
1059 return do_perfstat(argc, argv);
1060
Ingo Molnare0143ba2009-03-23 21:29:59 +01001061 if (tid != -1 || profile_cpu != -1)
1062 nr_cpus = 1;
1063
Ingo Molnare0143ba2009-03-23 21:29:59 +01001064 for (i = 0; i < nr_cpus; i++) {
1065 group_fd = -1;
1066 for (counter = 0; counter < nr_counters; counter++) {
1067
1068 cpu = profile_cpu;
1069 if (tid == -1 && profile_cpu == -1)
1070 cpu = i;
1071
1072 memset(&hw_event, 0, sizeof(hw_event));
1073 hw_event.type = event_id[counter];
1074 hw_event.raw = event_raw[counter];
1075 hw_event.irq_period = event_count[counter];
1076 hw_event.record_type = PERF_RECORD_IRQ;
1077 hw_event.nmi = nmi;
1078
1079 fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
1080 fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
1081 if (fd[i][counter] < 0) {
1082 printf("kerneltop error: syscall returned with %d (%s)\n",
1083 fd[i][counter], strerror(-fd[i][counter]));
1084 if (fd[i][counter] == -1)
1085 printf("Are you root?\n");
1086 exit(-1);
1087 }
1088 assert(fd[i][counter] >= 0);
1089
1090 /*
1091 * First counter acts as the group leader:
1092 */
1093 if (group && group_fd == -1)
1094 group_fd = fd[i][counter];
1095
1096 event_array[i][counter].fd = fd[i][counter];
1097 event_array[i][counter].events = POLLIN;
1098 }
1099 }
1100
1101 parse_symbols();
1102 if (vmlinux && sym_filter_entry)
1103 parse_vmlinux(vmlinux);
1104
1105 printf("KernelTop refresh period: %d seconds\n", delay_secs);
1106 last_refresh = time(NULL);
1107
1108 while (1) {
1109 int hits = events;
1110
1111 for (i = 0; i < nr_cpus; i++) {
1112 for (counter = 0; counter < nr_counters; counter++) {
1113 res = read(fd[i][counter], (char *) &ip, sizeof(ip));
1114 if (res > 0) {
1115 assert(res == sizeof(ip));
1116
1117 process_event(ip, counter);
1118 }
1119 }
1120 }
1121
1122 if (time(NULL) >= last_refresh + delay_secs) {
1123 print_sym_table();
1124 events = userspace_events = 0;
1125 }
1126
1127 if (hits == events)
1128 ret = poll(event_array[0], nr_cpus, 1000);
1129 hits = events;
1130 }
1131
1132 return 0;
1133}