/*
 * drivers/cpufreq/cpufreq_interactive.c
 *
 * Copyright (C) 2010 Google, Inc.
 *
 * This software is licensed under the terms of the GNU General Public
 * License version 2, as published by the Free Software Foundation, and
 * may be copied, distributed, and modified under those terms.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * Author: Mike Chan (mike@android.com)
 *
 */

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpufreq.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/sched.h>
#include <linux/tick.h>
#include <linux/time.h>
#include <linux/timer.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>

#include <asm/cputime.h>

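/* Count of policies using this governor; idle hook and sysfs registered once */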
static atomic_t active_count = ATOMIC_INIT(0);

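/* Per-CPU sampling state, timer bookkeeping, and most recently requested speed */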
struct cpufreq_interactive_cpuinfo {
	struct timer_list cpu_timer;
	int timer_idlecancel;
	u64 time_in_idle;
	u64 idle_exit_time;
	u64 timer_run_time;
	int idling;
	u64 freq_change_time;
	u64 freq_change_time_in_idle;
	struct cpufreq_policy *policy;
	struct cpufreq_frequency_table *freq_table;
	unsigned int target_freq;
	int governor_enabled;
};

static DEFINE_PER_CPU(struct cpufreq_interactive_cpuinfo, cpuinfo);

/* Realtime thread handles frequency scaling up; workqueue handles scaling down */
static struct task_struct *up_task;
static struct workqueue_struct *down_wq;
static struct work_struct freq_scale_down_work;
static cpumask_t up_cpumask;
static spinlock_t up_cpumask_lock;
static cpumask_t down_cpumask;
static spinlock_t down_cpumask_lock;
static struct mutex set_speed_lock;

/* Hi speed to bump to from lo speed when load bursts (default max) */
static u64 hispeed_freq;

/* Go to hi speed when CPU load at or above this value. */
#define DEFAULT_GO_HISPEED_LOAD 95
static unsigned long go_hispeed_load;

/*
 * The minimum amount of time to spend at a frequency before we can ramp down.
 */
#define DEFAULT_MIN_SAMPLE_TIME (20 * USEC_PER_MSEC)
static unsigned long min_sample_time;

/*
 * The sample rate of the timer used to increase frequency.
 */
#define DEFAULT_TIMER_RATE (20 * USEC_PER_MSEC)
static unsigned long timer_rate;

static int cpufreq_governor_interactive(struct cpufreq_policy *policy,
		unsigned int event);

#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_INTERACTIVE
static
#endif
struct cpufreq_governor cpufreq_gov_interactive = {
	.name = "interactive",
	.governor = cpufreq_governor_interactive,
	.max_transition_latency = 10000000,
	.owner = THIS_MODULE,
};

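/*
 * Load sampling timer.  Runs timer_rate after the last idle exit (or after
 * re-arming itself), computes CPU load over the sample, picks a new target
 * speed, and hands the speed change off to the up task or down workqueue.
 */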
static void cpufreq_interactive_timer(unsigned long data)
{
	unsigned int delta_idle;
	unsigned int delta_time;
	int cpu_load;
	int load_since_change;
	u64 time_in_idle;
	u64 idle_exit_time;
	struct cpufreq_interactive_cpuinfo *pcpu =
		&per_cpu(cpuinfo, data);
	u64 now_idle;
	unsigned int new_freq;
	unsigned int index;
	unsigned long flags;

	smp_rmb();

	if (!pcpu->governor_enabled)
		goto exit;

	/*
	 * Once pcpu->timer_run_time is updated to >= pcpu->idle_exit_time,
	 * this lets idle exit know the current idle time sample has
	 * been processed, and idle exit can generate a new sample and
	 * re-arm the timer.  This prevents a concurrent idle
	 * exit on that CPU from writing a new set of info at the same time
	 * the timer function runs (the timer function can't use that info
	 * until more time passes).
	 */
	time_in_idle = pcpu->time_in_idle;
	idle_exit_time = pcpu->idle_exit_time;
	now_idle = get_cpu_idle_time_us(data, &pcpu->timer_run_time);
	smp_wmb();

	/* If we raced with cancelling a timer, skip. */
	if (!idle_exit_time)
		goto exit;

	delta_idle = (unsigned int)(now_idle - time_in_idle);
	delta_time = (unsigned int)(pcpu->timer_run_time - idle_exit_time);

	/*
	 * If timer ran less than 1ms after short-term sample started, retry.
	 */
	if (delta_time < 1000)
		goto rearm;

	if (delta_idle > delta_time)
		cpu_load = 0;
	else
		cpu_load = 100 * (delta_time - delta_idle) / delta_time;

	delta_idle = (unsigned int)(now_idle - pcpu->freq_change_time_in_idle);
	delta_time = (unsigned int)(pcpu->timer_run_time - pcpu->freq_change_time);

	if ((delta_time == 0) || (delta_idle > delta_time))
		load_since_change = 0;
	else
		load_since_change =
			100 * (delta_time - delta_idle) / delta_time;

	/*
	 * Choose greater of short-term load (since last idle timer
	 * started or timer function re-armed itself) or long-term load
	 * (since last frequency change).
	 */
	if (load_since_change > cpu_load)
		cpu_load = load_since_change;

	if (cpu_load >= go_hispeed_load) {
		if (pcpu->policy->cur == pcpu->policy->min)
			new_freq = hispeed_freq;
		else
			new_freq = pcpu->policy->max * cpu_load / 100;
	} else {
		new_freq = pcpu->policy->cur * cpu_load / 100;
	}

	if (cpufreq_frequency_table_target(pcpu->policy, pcpu->freq_table,
					   new_freq, CPUFREQ_RELATION_H,
					   &index)) {
		pr_warn_once("timer %d: cpufreq_frequency_table_target error\n",
			     (int) data);
		goto rearm;
	}

	new_freq = pcpu->freq_table[index].frequency;

	if (pcpu->target_freq == new_freq)
		goto rearm_if_notmax;

	/*
	 * Do not scale down unless we have been at this frequency for the
	 * minimum sample time.
	 */
	if (new_freq < pcpu->target_freq) {
		if (pcpu->timer_run_time - pcpu->freq_change_time
		    < min_sample_time)
			goto rearm;
	}

	if (new_freq < pcpu->target_freq) {
		pcpu->target_freq = new_freq;
		spin_lock_irqsave(&down_cpumask_lock, flags);
		cpumask_set_cpu(data, &down_cpumask);
		spin_unlock_irqrestore(&down_cpumask_lock, flags);
		queue_work(down_wq, &freq_scale_down_work);
	} else {
		pcpu->target_freq = new_freq;
		spin_lock_irqsave(&up_cpumask_lock, flags);
		cpumask_set_cpu(data, &up_cpumask);
		spin_unlock_irqrestore(&up_cpumask_lock, flags);
		wake_up_process(up_task);
	}

rearm_if_notmax:
	/*
	 * Already set max speed and don't see a need to change that,
	 * wait until next idle to re-evaluate, don't need timer.
	 */
	if (pcpu->target_freq == pcpu->policy->max)
		goto exit;

rearm:
	if (!timer_pending(&pcpu->cpu_timer)) {
		/*
		 * If already at min: if that CPU is idle, don't set timer.
		 * Else cancel the timer if that CPU goes idle.  We don't
		 * need to re-evaluate speed until the next idle exit.
		 */
		if (pcpu->target_freq == pcpu->policy->min) {
			smp_rmb();

			if (pcpu->idling)
				goto exit;

			pcpu->timer_idlecancel = 1;
		}

		pcpu->time_in_idle = get_cpu_idle_time_us(
			data, &pcpu->idle_exit_time);
		mod_timer(&pcpu->cpu_timer,
			  jiffies + usecs_to_jiffies(timer_rate));
	}

exit:
	return;
}

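/* Called from the idle notifier when this CPU enters idle. */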
static void cpufreq_interactive_idle_start(void)
{
	struct cpufreq_interactive_cpuinfo *pcpu =
		&per_cpu(cpuinfo, smp_processor_id());
	int pending;

	if (!pcpu->governor_enabled)
		return;

	pcpu->idling = 1;
	smp_wmb();
	pending = timer_pending(&pcpu->cpu_timer);

	if (pcpu->target_freq != pcpu->policy->min) {
#ifdef CONFIG_SMP
		/*
		 * Entering idle while not at lowest speed.  On some
		 * platforms this can hold the other CPU(s) at that speed
		 * even though the CPU is idle.  Set a timer to re-evaluate
		 * speed so this idle CPU doesn't hold the other CPUs above
		 * min indefinitely.  This should probably be a quirk of
		 * the CPUFreq driver.
		 */
		if (!pending) {
			pcpu->time_in_idle = get_cpu_idle_time_us(
				smp_processor_id(), &pcpu->idle_exit_time);
			pcpu->timer_idlecancel = 0;
			mod_timer(&pcpu->cpu_timer,
				  jiffies + usecs_to_jiffies(timer_rate));
		}
#endif
	} else {
		/*
		 * If at min speed and entering idle after load has
		 * already been evaluated, and a timer has been set just in
		 * case the CPU suddenly goes busy, cancel that timer.  The
		 * CPU didn't go busy; we'll recheck things upon idle exit.
		 */
		if (pending && pcpu->timer_idlecancel) {
			del_timer(&pcpu->cpu_timer);
			/*
			 * Ensure last timer run time is after current idle
			 * sample start time, so next idle exit will always
			 * start a new idle sampling period.
			 */
			pcpu->idle_exit_time = 0;
			pcpu->timer_idlecancel = 0;
		}
	}
}

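/*
 * Called from the idle notifier when this CPU exits idle; re-arms the
 * sampling timer if the previous load sample has already been processed.
 */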
static void cpufreq_interactive_idle_end(void)
{
	struct cpufreq_interactive_cpuinfo *pcpu =
		&per_cpu(cpuinfo, smp_processor_id());

	pcpu->idling = 0;
	smp_wmb();

	/*
	 * Arm the timer for 1-2 ticks later if not already, and if the timer
	 * function has already processed the previous load sampling
	 * interval.  (If the timer is not pending but has not processed
	 * the previous interval, it is probably racing with us on another
	 * CPU.  Let it compute load based on the previous sample and then
	 * re-arm the timer for another interval when it's done, rather
	 * than updating the interval start time to be "now", which doesn't
	 * give the timer function enough time to make a decision on this
	 * run.)
	 */
	if (timer_pending(&pcpu->cpu_timer) == 0 &&
	    pcpu->timer_run_time >= pcpu->idle_exit_time &&
	    pcpu->governor_enabled) {
		pcpu->time_in_idle =
			get_cpu_idle_time_us(smp_processor_id(),
					     &pcpu->idle_exit_time);
		pcpu->timer_idlecancel = 0;
		mod_timer(&pcpu->cpu_timer,
			  jiffies + usecs_to_jiffies(timer_rate));
	}
}

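/*
 * Realtime kthread: raises speed for CPUs flagged in up_cpumask, driving each
 * policy to the highest target speed requested by any CPU it covers.
 */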
static int cpufreq_interactive_up_task(void *data)
{
	unsigned int cpu;
	cpumask_t tmp_mask;
	unsigned long flags;
	struct cpufreq_interactive_cpuinfo *pcpu;

	while (1) {
		set_current_state(TASK_INTERRUPTIBLE);
		spin_lock_irqsave(&up_cpumask_lock, flags);

		if (cpumask_empty(&up_cpumask)) {
			spin_unlock_irqrestore(&up_cpumask_lock, flags);
			schedule();

			if (kthread_should_stop())
				break;

			spin_lock_irqsave(&up_cpumask_lock, flags);
		}

		set_current_state(TASK_RUNNING);
		tmp_mask = up_cpumask;
		cpumask_clear(&up_cpumask);
		spin_unlock_irqrestore(&up_cpumask_lock, flags);

		for_each_cpu(cpu, &tmp_mask) {
			unsigned int j;
			unsigned int max_freq = 0;

			pcpu = &per_cpu(cpuinfo, cpu);
			smp_rmb();

			if (!pcpu->governor_enabled)
				continue;

			mutex_lock(&set_speed_lock);

			for_each_cpu(j, pcpu->policy->cpus) {
				struct cpufreq_interactive_cpuinfo *pjcpu =
					&per_cpu(cpuinfo, j);

				if (pjcpu->target_freq > max_freq)
					max_freq = pjcpu->target_freq;
			}

			if (max_freq != pcpu->policy->cur)
				__cpufreq_driver_target(pcpu->policy,
							max_freq,
							CPUFREQ_RELATION_H);
			mutex_unlock(&set_speed_lock);

			pcpu->freq_change_time_in_idle =
				get_cpu_idle_time_us(cpu,
						     &pcpu->freq_change_time);
		}
	}

	return 0;
}

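/*
 * Workqueue handler: lowers speed for CPUs flagged in down_cpumask, again
 * targeting the highest speed requested by any CPU in each policy.
 */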
static void cpufreq_interactive_freq_down(struct work_struct *work)
{
	unsigned int cpu;
	cpumask_t tmp_mask;
	unsigned long flags;
	struct cpufreq_interactive_cpuinfo *pcpu;

	spin_lock_irqsave(&down_cpumask_lock, flags);
	tmp_mask = down_cpumask;
	cpumask_clear(&down_cpumask);
	spin_unlock_irqrestore(&down_cpumask_lock, flags);

	for_each_cpu(cpu, &tmp_mask) {
		unsigned int j;
		unsigned int max_freq = 0;

		pcpu = &per_cpu(cpuinfo, cpu);
		smp_rmb();

		if (!pcpu->governor_enabled)
			continue;

		mutex_lock(&set_speed_lock);

		for_each_cpu(j, pcpu->policy->cpus) {
			struct cpufreq_interactive_cpuinfo *pjcpu =
				&per_cpu(cpuinfo, j);

			if (pjcpu->target_freq > max_freq)
				max_freq = pjcpu->target_freq;
		}

		if (max_freq != pcpu->policy->cur)
			__cpufreq_driver_target(pcpu->policy, max_freq,
						CPUFREQ_RELATION_H);

		mutex_unlock(&set_speed_lock);
		pcpu->freq_change_time_in_idle =
			get_cpu_idle_time_us(cpu,
					     &pcpu->freq_change_time);
	}
}

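/* sysfs tunables, exported in the global cpufreq "interactive" attribute group */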
static ssize_t show_hispeed_freq(struct kobject *kobj,
				 struct attribute *attr, char *buf)
{
	return sprintf(buf, "%llu\n", hispeed_freq);
}

static ssize_t store_hispeed_freq(struct kobject *kobj,
				  struct attribute *attr, const char *buf,
				  size_t count)
{
	int ret;
	u64 val;

	ret = strict_strtoull(buf, 0, &val);
	if (ret < 0)
		return ret;
	hispeed_freq = val;
	return count;
}

static struct global_attr hispeed_freq_attr = __ATTR(hispeed_freq, 0644,
		show_hispeed_freq, store_hispeed_freq);

static ssize_t show_go_hispeed_load(struct kobject *kobj,
				    struct attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", go_hispeed_load);
}

static ssize_t store_go_hispeed_load(struct kobject *kobj,
			struct attribute *attr, const char *buf, size_t count)
{
	int ret;
	unsigned long val;

	ret = strict_strtoul(buf, 0, &val);
	if (ret < 0)
		return ret;
	go_hispeed_load = val;
	return count;
}

static struct global_attr go_hispeed_load_attr = __ATTR(go_hispeed_load, 0644,
		show_go_hispeed_load, store_go_hispeed_load);

static ssize_t show_min_sample_time(struct kobject *kobj,
				    struct attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", min_sample_time);
}

static ssize_t store_min_sample_time(struct kobject *kobj,
			struct attribute *attr, const char *buf, size_t count)
{
	int ret;
	unsigned long val;

	ret = strict_strtoul(buf, 0, &val);
	if (ret < 0)
		return ret;
	min_sample_time = val;
	return count;
}

static struct global_attr min_sample_time_attr = __ATTR(min_sample_time, 0644,
		show_min_sample_time, store_min_sample_time);

static ssize_t show_timer_rate(struct kobject *kobj,
			       struct attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", timer_rate);
}

static ssize_t store_timer_rate(struct kobject *kobj,
			struct attribute *attr, const char *buf, size_t count)
{
	int ret;
	unsigned long val;

	ret = strict_strtoul(buf, 0, &val);
	if (ret < 0)
		return ret;
	timer_rate = val;
	return count;
}

static struct global_attr timer_rate_attr = __ATTR(timer_rate, 0644,
		show_timer_rate, store_timer_rate);

static struct attribute *interactive_attributes[] = {
	&hispeed_freq_attr.attr,
	&go_hispeed_load_attr.attr,
	&min_sample_time_attr.attr,
	&timer_rate_attr.attr,
	NULL,
};

static struct attribute_group interactive_attr_group = {
	.attrs = interactive_attributes,
	.name = "interactive",
};

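/* Governor event callback: handles GOV_START, GOV_STOP, and GOV_LIMITS. */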
static int cpufreq_governor_interactive(struct cpufreq_policy *policy,
		unsigned int event)
{
	int rc;
	unsigned int j;
	struct cpufreq_interactive_cpuinfo *pcpu;
	struct cpufreq_frequency_table *freq_table;

	switch (event) {
	case CPUFREQ_GOV_START:
		if (!cpu_online(policy->cpu))
			return -EINVAL;

		freq_table =
			cpufreq_frequency_get_table(policy->cpu);

		for_each_cpu(j, policy->cpus) {
			pcpu = &per_cpu(cpuinfo, j);
			pcpu->policy = policy;
			pcpu->target_freq = policy->cur;
			pcpu->freq_table = freq_table;
			pcpu->freq_change_time_in_idle =
				get_cpu_idle_time_us(j,
						     &pcpu->freq_change_time);
			pcpu->governor_enabled = 1;
			smp_wmb();
		}

		if (!hispeed_freq)
			hispeed_freq = policy->max;

		/*
		 * Do not register the idle hook and create sysfs
		 * entries if we have already done so.
		 */
		if (atomic_inc_return(&active_count) > 1)
			return 0;

		rc = sysfs_create_group(cpufreq_global_kobject,
				&interactive_attr_group);
		if (rc)
			return rc;

		break;

	case CPUFREQ_GOV_STOP:
		for_each_cpu(j, policy->cpus) {
			pcpu = &per_cpu(cpuinfo, j);
			pcpu->governor_enabled = 0;
			smp_wmb();
			del_timer_sync(&pcpu->cpu_timer);

			/*
			 * Reset idle exit time since we may cancel the timer
			 * before it can run after the last idle exit time,
			 * to avoid tripping the check in idle exit for a timer
			 * that is trying to run.
			 */
			pcpu->idle_exit_time = 0;
		}

		flush_work(&freq_scale_down_work);
		if (atomic_dec_return(&active_count) > 0)
			return 0;

		sysfs_remove_group(cpufreq_global_kobject,
				&interactive_attr_group);

		break;

	case CPUFREQ_GOV_LIMITS:
		if (policy->max < policy->cur)
			__cpufreq_driver_target(policy,
					policy->max, CPUFREQ_RELATION_H);
		else if (policy->min > policy->cur)
			__cpufreq_driver_target(policy,
					policy->min, CPUFREQ_RELATION_L);
		break;
	}
	return 0;
}

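/* Idle notifier: routes idle entry/exit events to the handlers above. */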
static int cpufreq_interactive_idle_notifier(struct notifier_block *nb,
					     unsigned long val,
					     void *data)
{
	switch (val) {
	case IDLE_START:
		cpufreq_interactive_idle_start();
		break;
	case IDLE_END:
		cpufreq_interactive_idle_end();
		break;
	}

	return 0;
}

static struct notifier_block cpufreq_interactive_idle_nb = {
	.notifier_call = cpufreq_interactive_idle_notifier,
};

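/*
 * Module init: set tunable defaults, set up per-CPU timers, the speed-up
 * kthread and speed-down workqueue, and register the governor.
 */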
static int __init cpufreq_interactive_init(void)
{
	unsigned int i;
	struct cpufreq_interactive_cpuinfo *pcpu;
	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };

	go_hispeed_load = DEFAULT_GO_HISPEED_LOAD;
	min_sample_time = DEFAULT_MIN_SAMPLE_TIME;
	timer_rate = DEFAULT_TIMER_RATE;

	/* Initialize per-CPU timers */
	for_each_possible_cpu(i) {
		pcpu = &per_cpu(cpuinfo, i);
		init_timer(&pcpu->cpu_timer);
		pcpu->cpu_timer.function = cpufreq_interactive_timer;
		pcpu->cpu_timer.data = i;
	}

	up_task = kthread_create(cpufreq_interactive_up_task, NULL,
				 "kinteractiveup");
	if (IS_ERR(up_task))
		return PTR_ERR(up_task);

	sched_setscheduler_nocheck(up_task, SCHED_FIFO, &param);
	get_task_struct(up_task);

	/*
	 * No rescuer thread, bind to CPU queuing the work for possibly
	 * warm cache (probably doesn't matter much).
	 */
	down_wq = alloc_workqueue("knteractive_down", 0, 1);

	if (!down_wq)
		goto err_freeuptask;

	INIT_WORK(&freq_scale_down_work,
		  cpufreq_interactive_freq_down);

	spin_lock_init(&up_cpumask_lock);
	spin_lock_init(&down_cpumask_lock);
	mutex_init(&set_speed_lock);

	idle_notifier_register(&cpufreq_interactive_idle_nb);

	return cpufreq_register_governor(&cpufreq_gov_interactive);

err_freeuptask:
	put_task_struct(up_task);
	return -ENOMEM;
}

#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_INTERACTIVE
fs_initcall(cpufreq_interactive_init);
#else
module_init(cpufreq_interactive_init);
#endif

static void __exit cpufreq_interactive_exit(void)
{
	cpufreq_unregister_governor(&cpufreq_gov_interactive);
	kthread_stop(up_task);
	put_task_struct(up_task);
	destroy_workqueue(down_wq);
}

module_exit(cpufreq_interactive_exit);

MODULE_AUTHOR("Mike Chan <mike@android.com>");
MODULE_DESCRIPTION("'cpufreq_interactive' - A cpufreq governor for "
	"latency sensitive workloads");
MODULE_LICENSE("GPL");