/*
 *  linux/kernel/hrtimer.c
 *
 *  Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
 *  Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
 *  Copyright(C) 2006-2007  Timesys Corp., Thomas Gleixner
 *
 *  High-resolution kernel timers
 *
 *  In contrast to the low-resolution timeout API implemented in
 *  kernel/timer.c, hrtimers provide finer resolution and accuracy
 *  depending on system configuration and capabilities.
 *
 *  These timers are currently used for:
 *   - itimers
 *   - POSIX timers
 *   - nanosleep
 *   - precise in-kernel timing
 *
 *  Started by: Thomas Gleixner and Ingo Molnar
 *
 *  Credits:
 *	based on kernel/timer.c
 *
 *	Help, testing, suggestions, bugfixes, improvements were
 *	provided by:
 *
 *	George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel
 *	et al.
 *
 *  For licensing details see kernel-base/COPYING
 */

#include <linux/cpu.h>
#include <linux/export.h>
#include <linux/percpu.h>
#include <linux/hrtimer.h>
#include <linux/notifier.h>
#include <linux/syscalls.h>
#include <linux/kallsyms.h>
#include <linux/interrupt.h>
#include <linux/tick.h>
#include <linux/seq_file.h>
#include <linux/err.h>
#include <linux/debugobjects.h>
#include <linux/sched.h>
#include <linux/timer.h>

#include <asm/uaccess.h>

#include <trace/events/timer.h>

/*
 * The timer bases:
 *
 * There are more clockids than hrtimer bases. Thus, we index
 * into the timer bases by the hrtimer_base_type enum. When trying
 * to reach a base using a clockid, hrtimer_clockid_to_base()
 * is used to convert from clockid to the proper hrtimer_base_type.
 */
DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{

	.clock_base =
	{
		{
			.index = HRTIMER_BASE_MONOTONIC,
			.clockid = CLOCK_MONOTONIC,
			.get_time = &ktime_get,
			.resolution = KTIME_LOW_RES,
		},
		{
			.index = HRTIMER_BASE_REALTIME,
			.clockid = CLOCK_REALTIME,
			.get_time = &ktime_get_real,
			.resolution = KTIME_LOW_RES,
		},
		{
			.index = HRTIMER_BASE_BOOTTIME,
			.clockid = CLOCK_BOOTTIME,
			.get_time = &ktime_get_boottime,
			.resolution = KTIME_LOW_RES,
		},
	}
};

static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
	[CLOCK_REALTIME]	= HRTIMER_BASE_REALTIME,
	[CLOCK_MONOTONIC]	= HRTIMER_BASE_MONOTONIC,
	[CLOCK_BOOTTIME]	= HRTIMER_BASE_BOOTTIME,
};

static inline int hrtimer_clockid_to_base(clockid_t clock_id)
{
	return hrtimer_clock_to_base_table[clock_id];
}


/*
 * Get the coarse grained time at the softirq based on xtime and
 * wall_to_monotonic.
 */
static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
{
	ktime_t xtim, mono, boot;
	struct timespec xts, tom, slp;

	get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp);

	xtim = timespec_to_ktime(xts);
	mono = ktime_add(xtim, timespec_to_ktime(tom));
	boot = ktime_add(mono, timespec_to_ktime(slp));
	base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
	base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
	base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
}

/*
 * Functions and macros which are different for UP/SMP systems are kept in a
 * single place
 */
#ifdef CONFIG_SMP

/*
 * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
 * means that all timers which are tied to this base via timer->base are
 * locked, and the base itself is locked too.
 *
 * So __run_timers/migrate_timers can safely modify all timers which could
 * be found on the lists/queues.
 *
 * When the timer's base is locked, and the timer removed from list, it is
 * possible to set timer->base = NULL and drop the lock: the timer remains
 * locked.
 */
static
struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
					     unsigned long *flags)
{
	struct hrtimer_clock_base *base;

	for (;;) {
		base = timer->base;
		if (likely(base != NULL)) {
			raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
			if (likely(base == timer->base))
				return base;
			/* The timer has migrated to another CPU: */
			raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags);
		}
		cpu_relax();
	}
}

/*
 * Get the preferred target CPU for NOHZ
 */
static int hrtimer_get_target(int this_cpu, int pinned)
{
#ifdef CONFIG_NO_HZ
	if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu))
		return get_nohz_timer_target();
#endif
	return this_cpu;
}

/*
 * With HIGHRES=y we do not migrate the timer when it is expiring
 * before the next event on the target cpu because we cannot reprogram
 * the target cpu hardware and we would cause it to fire late.
 *
 * Called with cpu_base->lock of target cpu held.
 */
static int
hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
{
#ifdef CONFIG_HIGH_RES_TIMERS
	ktime_t expires;

	if (!new_base->cpu_base->hres_active)
		return 0;

	expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
	return expires.tv64 <= new_base->cpu_base->expires_next.tv64;
#else
	return 0;
#endif
}

/*
 * Switch the timer base to the current CPU when possible.
 */
static inline struct hrtimer_clock_base *
switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
		    int pinned)
{
	struct hrtimer_clock_base *new_base;
	struct hrtimer_cpu_base *new_cpu_base;
	int this_cpu = smp_processor_id();
	int cpu = hrtimer_get_target(this_cpu, pinned);
	int basenum = base->index;

again:
	new_cpu_base = &per_cpu(hrtimer_bases, cpu);
	new_base = &new_cpu_base->clock_base[basenum];

	if (base != new_base) {
		/*
		 * We are trying to move timer to new_base.
		 * However we can't change timer's base while it is running,
		 * so we keep it on the same CPU. No hassle vs. reprogramming
		 * the event source in the high resolution case. The softirq
		 * code will take care of this when the timer function has
		 * completed. There is no conflict as we hold the lock until
		 * the timer is enqueued.
		 */
		if (unlikely(hrtimer_callback_running(timer)))
			return base;

		/* See the comment in lock_timer_base() */
		timer->base = NULL;
		raw_spin_unlock(&base->cpu_base->lock);
		raw_spin_lock(&new_base->cpu_base->lock);

		if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
			cpu = this_cpu;
			raw_spin_unlock(&new_base->cpu_base->lock);
			raw_spin_lock(&base->cpu_base->lock);
			timer->base = base;
			goto again;
		}
		timer->base = new_base;
	}
	return new_base;
}

#else /* CONFIG_SMP */

static inline struct hrtimer_clock_base *
lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
{
	struct hrtimer_clock_base *base = timer->base;

	raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);

	return base;
}

# define switch_hrtimer_base(t, b, p)	(b)

#endif	/* !CONFIG_SMP */

/*
 * Functions for the union type storage format of ktime_t which are
 * too large for inlining:
 */
#if BITS_PER_LONG < 64
# ifndef CONFIG_KTIME_SCALAR
/**
 * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable
 * @kt:		addend
 * @nsec:	the scalar nsec value to add
 *
 * Returns the sum of kt and nsec in ktime_t format
 */
ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
{
	ktime_t tmp;

	if (likely(nsec < NSEC_PER_SEC)) {
		tmp.tv64 = nsec;
	} else {
		unsigned long rem = do_div(nsec, NSEC_PER_SEC);

		tmp = ktime_set((long)nsec, rem);
	}

	return ktime_add(kt, tmp);
}

EXPORT_SYMBOL_GPL(ktime_add_ns);

/**
 * ktime_sub_ns - Subtract a scalar nanoseconds value from a ktime_t variable
 * @kt:		minuend
 * @nsec:	the scalar nsec value to subtract
 *
 * Returns the subtraction of @nsec from @kt in ktime_t format
 */
ktime_t ktime_sub_ns(const ktime_t kt, u64 nsec)
{
	ktime_t tmp;

	if (likely(nsec < NSEC_PER_SEC)) {
		tmp.tv64 = nsec;
	} else {
		unsigned long rem = do_div(nsec, NSEC_PER_SEC);

		tmp = ktime_set((long)nsec, rem);
	}

	return ktime_sub(kt, tmp);
}

EXPORT_SYMBOL_GPL(ktime_sub_ns);
# endif /* !CONFIG_KTIME_SCALAR */

/*
 * Divide a ktime value by a nanosecond value
 */
u64 ktime_divns(const ktime_t kt, s64 div)
{
	u64 dclc;
	int sft = 0;

	dclc = ktime_to_ns(kt);
	/* Make sure the divisor is less than 2^32: */
	while (div >> 32) {
		sft++;
		div >>= 1;
	}
	dclc >>= sft;
	do_div(dclc, (unsigned long) div);

	return dclc;
}
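
/*
 * Worked example (illustrative): dividing a 10 second delta by a 5 second
 * interval, div = 5000000000 does not fit in 32 bits, so both values are
 * shifted right once (div = 2500000000, dclc = 5000000000) before
 * do_div(), which still yields 2. Dropping the same low-order bits from
 * dividend and divisor perturbs the quotient only marginally.
 */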
#endif /* BITS_PER_LONG < 64 */

/*
 * Add two ktime values and do a safety check for overflow:
 */
ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
{
	ktime_t res = ktime_add(lhs, rhs);

	/*
	 * We use KTIME_SEC_MAX here, the maximum timeout which we can
	 * return to user space in a timespec:
	 */
	if (res.tv64 < 0 || res.tv64 < lhs.tv64 || res.tv64 < rhs.tv64)
		res = ktime_set(KTIME_SEC_MAX, 0);

	return res;
}

EXPORT_SYMBOL_GPL(ktime_add_safe);
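
/*
 * Example (illustrative): if both operands are already at the clamp value,
 * e.g. lhs = rhs = ktime_set(KTIME_SEC_MAX, 0), the raw sum would wrap
 * negative; ktime_add_safe() returns ktime_set(KTIME_SEC_MAX, 0) instead,
 * so later comparisons against "now" stay sane.
 */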

#ifdef CONFIG_DEBUG_OBJECTS_TIMERS

static struct debug_obj_descr hrtimer_debug_descr;

static void *hrtimer_debug_hint(void *addr)
{
	return ((struct hrtimer *) addr)->function;
}

/*
 * fixup_init is called when:
 * - an active object is initialized
 */
static int hrtimer_fixup_init(void *addr, enum debug_obj_state state)
{
	struct hrtimer *timer = addr;

	switch (state) {
	case ODEBUG_STATE_ACTIVE:
		hrtimer_cancel(timer);
		debug_object_init(timer, &hrtimer_debug_descr);
		return 1;
	default:
		return 0;
	}
}

/*
 * fixup_activate is called when:
 * - an active object is activated
 * - an unknown object is activated (might be a statically initialized object)
 */
static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
{
	switch (state) {

	case ODEBUG_STATE_NOTAVAILABLE:
		WARN_ON_ONCE(1);
		return 0;

	case ODEBUG_STATE_ACTIVE:
		WARN_ON(1);

	default:
		return 0;
	}
}

/*
 * fixup_free is called when:
 * - an active object is freed
 */
static int hrtimer_fixup_free(void *addr, enum debug_obj_state state)
{
	struct hrtimer *timer = addr;

	switch (state) {
	case ODEBUG_STATE_ACTIVE:
		hrtimer_cancel(timer);
		debug_object_free(timer, &hrtimer_debug_descr);
		return 1;
	default:
		return 0;
	}
}

static struct debug_obj_descr hrtimer_debug_descr = {
	.name		= "hrtimer",
	.debug_hint	= hrtimer_debug_hint,
	.fixup_init	= hrtimer_fixup_init,
	.fixup_activate	= hrtimer_fixup_activate,
	.fixup_free	= hrtimer_fixup_free,
};

static inline void debug_hrtimer_init(struct hrtimer *timer)
{
	debug_object_init(timer, &hrtimer_debug_descr);
}

static inline void debug_hrtimer_activate(struct hrtimer *timer)
{
	debug_object_activate(timer, &hrtimer_debug_descr);
}

static inline void debug_hrtimer_deactivate(struct hrtimer *timer)
{
	debug_object_deactivate(timer, &hrtimer_debug_descr);
}

static inline void debug_hrtimer_free(struct hrtimer *timer)
{
	debug_object_free(timer, &hrtimer_debug_descr);
}

static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
			   enum hrtimer_mode mode);

void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id,
			   enum hrtimer_mode mode)
{
	debug_object_init_on_stack(timer, &hrtimer_debug_descr);
	__hrtimer_init(timer, clock_id, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_init_on_stack);

void destroy_hrtimer_on_stack(struct hrtimer *timer)
{
	debug_object_free(timer, &hrtimer_debug_descr);
}

#else
static inline void debug_hrtimer_init(struct hrtimer *timer) { }
static inline void debug_hrtimer_activate(struct hrtimer *timer) { }
static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
#endif

static inline void
debug_init(struct hrtimer *timer, clockid_t clockid,
	   enum hrtimer_mode mode)
{
	debug_hrtimer_init(timer);
	trace_hrtimer_init(timer, clockid, mode);
}

static inline void debug_activate(struct hrtimer *timer)
{
	debug_hrtimer_activate(timer);
	trace_hrtimer_start(timer);
}

static inline void debug_deactivate(struct hrtimer *timer)
{
	debug_hrtimer_deactivate(timer);
	trace_hrtimer_cancel(timer);
}

/* High resolution timer related functions */
#ifdef CONFIG_HIGH_RES_TIMERS

/*
 * High resolution timer enabled ?
 */
static int hrtimer_hres_enabled __read_mostly  = 1;

/*
 * Enable / Disable high resolution mode
 */
static int __init setup_hrtimer_hres(char *str)
{
	if (!strcmp(str, "off"))
		hrtimer_hres_enabled = 0;
	else if (!strcmp(str, "on"))
		hrtimer_hres_enabled = 1;
	else
		return 0;
	return 1;
}

__setup("highres=", setup_hrtimer_hres);

/*
 * hrtimer_high_res_enabled - query, if the highres mode is enabled
 */
static inline int hrtimer_is_hres_enabled(void)
{
	return hrtimer_hres_enabled;
}

/*
 * Is the high resolution mode active ?
 */
static inline int hrtimer_hres_active(void)
{
	return __this_cpu_read(hrtimer_bases.hres_active);
}

/*
 * Reprogram the event source with checking both queues for the
 * next event
 * Called with interrupts disabled and base->lock held
 */
static void
hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
{
	int i;
	struct hrtimer_clock_base *base = cpu_base->clock_base;
	ktime_t expires, expires_next;

	expires_next.tv64 = KTIME_MAX;

	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
		struct hrtimer *timer;
		struct timerqueue_node *next;

		next = timerqueue_getnext(&base->active);
		if (!next)
			continue;
		timer = container_of(next, struct hrtimer, node);

		expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
		/*
		 * clock_was_set() has changed base->offset so the
		 * result might be negative. Fix it up to prevent a
		 * false positive in clockevents_program_event()
		 */
		if (expires.tv64 < 0)
			expires.tv64 = 0;
		if (expires.tv64 < expires_next.tv64)
			expires_next = expires;
	}

	if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64)
		return;

	cpu_base->expires_next.tv64 = expires_next.tv64;

	if (cpu_base->expires_next.tv64 != KTIME_MAX)
		tick_program_event(cpu_base->expires_next, 1);
}

/*
 * Shared reprogramming for clock_realtime and clock_monotonic
 *
 * When a timer is enqueued and expires earlier than the already enqueued
 * timers, we have to check, whether it expires earlier than the timer for
 * which the clock event device was armed.
 *
 * Called with interrupts disabled and base->cpu_base.lock held
 */
static int hrtimer_reprogram(struct hrtimer *timer,
			     struct hrtimer_clock_base *base)
{
	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
	ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
	int res;

	WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);

	/*
	 * When the callback is running, we do not reprogram the clock event
	 * device. The timer callback is either running on a different CPU or
	 * the callback is executed in the hrtimer_interrupt context. The
	 * reprogramming is handled either by the softirq, which called the
	 * callback or at the end of the hrtimer_interrupt.
	 */
	if (hrtimer_callback_running(timer))
		return 0;

	/*
	 * CLOCK_REALTIME timer might be requested with an absolute
	 * expiry time which is less than base->offset. Nothing wrong
	 * about that, just avoid calling into the tick code, which now
	 * has objections against negative expiry values.
	 */
	if (expires.tv64 < 0)
		return -ETIME;

	if (expires.tv64 >= cpu_base->expires_next.tv64)
		return 0;

	/*
	 * If a hang was detected in the last timer interrupt then we
	 * do not schedule a timer which is earlier than the expiry
	 * which we enforced in the hang detection. We want the system
	 * to make progress.
	 */
	if (cpu_base->hang_detected)
		return 0;

	/*
	 * Clockevents returns -ETIME, when the event was in the past.
	 */
	res = tick_program_event(expires, 0);
	if (!IS_ERR_VALUE(res))
		cpu_base->expires_next = expires;
	return res;
}

/*
 * Initialize the high resolution related parts of cpu_base
 */
static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
{
	base->expires_next.tv64 = KTIME_MAX;
	base->hres_active = 0;
}

/*
 * When High resolution timers are active, try to reprogram. Note, that in case
 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry
 * check happens. The timer gets enqueued into the rbtree. The reprogramming
 * and expiry check is done in the hrtimer_interrupt or in the softirq.
 */
static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
					    struct hrtimer_clock_base *base,
					    int wakeup)
{
	if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
		if (wakeup) {
			raw_spin_unlock(&base->cpu_base->lock);
			raise_softirq_irqoff(HRTIMER_SOFTIRQ);
			raw_spin_lock(&base->cpu_base->lock);
		} else
			__raise_softirq_irqoff(HRTIMER_SOFTIRQ);

		return 1;
	}

	return 0;
}

/*
 * Retrigger next event is called after clock was set
 *
 * Called with interrupts disabled via on_each_cpu()
 */
static void retrigger_next_event(void *arg)
{
	struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
	struct timespec realtime_offset, xtim, wtm, sleep;

	if (!hrtimer_hres_active())
		return;

	/* Optimized out for !HIGH_RES */
	get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep);
	set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);

	/* Adjust CLOCK_REALTIME offset */
	raw_spin_lock(&base->lock);
	base->clock_base[HRTIMER_BASE_REALTIME].offset =
		timespec_to_ktime(realtime_offset);
	base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
		timespec_to_ktime(sleep);

	hrtimer_force_reprogram(base, 0);
	raw_spin_unlock(&base->lock);
}

/*
 * Switch to high resolution mode
 */
static int hrtimer_switch_to_hres(void)
{
	int i, cpu = smp_processor_id();
	struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
	unsigned long flags;

	if (base->hres_active)
		return 1;

	local_irq_save(flags);

	if (tick_init_highres()) {
		local_irq_restore(flags);
		printk(KERN_WARNING "Could not switch to high resolution "
				    "mode on CPU %d\n", cpu);
		return 0;
	}
	base->hres_active = 1;
	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
		base->clock_base[i].resolution = KTIME_HIGH_RES;

	tick_setup_sched_timer();

	/* "Retrigger" the interrupt to get things going */
	retrigger_next_event(NULL);
	local_irq_restore(flags);
	return 1;
}

#else

static inline int hrtimer_hres_active(void) { return 0; }
static inline int hrtimer_is_hres_enabled(void) { return 0; }
static inline int hrtimer_switch_to_hres(void) { return 0; }
static inline void
hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
					    struct hrtimer_clock_base *base,
					    int wakeup)
{
	return 0;
}
static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
static inline void retrigger_next_event(void *arg) { }

#endif /* CONFIG_HIGH_RES_TIMERS */

/*
 * Clock realtime was set
 *
 * Change the offset of the realtime clock vs. the monotonic
 * clock.
 *
 * We might have to reprogram the high resolution timer interrupt. On
 * SMP we call the architecture specific code to retrigger _all_ high
 * resolution timer interrupts. On UP we just disable interrupts and
 * call the high resolution interrupt code.
 */
void clock_was_set(void)
{
#ifdef CONFIG_HIGH_RES_TIMERS
	/* Retrigger the CPU local events everywhere */
	on_each_cpu(retrigger_next_event, NULL, 1);
#endif
	timerfd_clock_was_set();
}

/*
 * During resume we might have to reprogram the high resolution timer
 * interrupt (on the local CPU):
 */
void hrtimers_resume(void)
{
	WARN_ONCE(!irqs_disabled(),
		  KERN_INFO "hrtimers_resume() called with IRQs enabled!");

	retrigger_next_event(NULL);
	timerfd_clock_was_set();
}

static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
{
#ifdef CONFIG_TIMER_STATS
	if (timer->start_site)
		return;
	timer->start_site = __builtin_return_address(0);
	memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
	timer->start_pid = current->pid;
#endif
}

static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer)
{
#ifdef CONFIG_TIMER_STATS
	timer->start_site = NULL;
#endif
}

static inline void timer_stats_account_hrtimer(struct hrtimer *timer)
{
#ifdef CONFIG_TIMER_STATS
	if (likely(!timer_stats_active))
		return;
	timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
				 timer->function, timer->start_comm, 0);
#endif
}

/*
 * Counterpart to lock_hrtimer_base above:
 */
static inline
void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
{
	raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
}

/**
 * hrtimer_forward - forward the timer expiry
 * @timer:	hrtimer to forward
 * @now:	forward past this time
 * @interval:	the interval to forward
 *
 * Forward the timer expiry so it will expire in the future.
 * Returns the number of overruns.
 */
u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
{
	u64 orun = 1;
	ktime_t delta;

	delta = ktime_sub(now, hrtimer_get_expires(timer));

	if (delta.tv64 < 0)
		return 0;

	if (interval.tv64 < timer->base->resolution.tv64)
		interval.tv64 = timer->base->resolution.tv64;

	if (unlikely(delta.tv64 >= interval.tv64)) {
		s64 incr = ktime_to_ns(interval);

		orun = ktime_divns(delta, incr);
		hrtimer_add_expires_ns(timer, incr * orun);
		if (hrtimer_get_expires_tv64(timer) > now.tv64)
			return orun;
		/*
		 * This (and the ktime_add() below) is the
		 * correction for exact:
		 */
		orun++;
	}
	hrtimer_add_expires(timer, interval);

	return orun;
}
EXPORT_SYMBOL_GPL(hrtimer_forward);
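
/*
 * Usage sketch (illustrative; my_period_callback and MY_PERIOD_NS are
 * made-up names): a periodic callback typically pushes its own expiry
 * past "now" and asks to be restarted:
 *
 *	static enum hrtimer_restart my_period_callback(struct hrtimer *t)
 *	{
 *		hrtimer_forward(t, t->base->get_time(),
 *				ktime_set(0, MY_PERIOD_NS));
 *		return HRTIMER_RESTART;
 *	}
 *
 * The returned overrun count tells the caller how many whole periods
 * were skipped when the callback ran late.
 */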

/*
 * enqueue_hrtimer - internal function to (re)start a timer
 *
 * The timer is inserted in expiry order. Insertion into the
 * red black tree is O(log(n)). Must hold the base lock.
 *
 * Returns 1 when the new timer is the leftmost timer in the tree.
 */
static int enqueue_hrtimer(struct hrtimer *timer,
			   struct hrtimer_clock_base *base)
{
	debug_activate(timer);

	timerqueue_add(&base->active, &timer->node);
	base->cpu_base->active_bases |= 1 << base->index;

	/*
	 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
	 * state of a possibly running callback.
	 */
	timer->state |= HRTIMER_STATE_ENQUEUED;

	return (&timer->node == base->active.next);
}

/*
 * __remove_hrtimer - internal function to remove a timer
 *
 * Caller must hold the base lock.
 *
 * High resolution timer mode reprograms the clock event device when the
 * timer is the one which expires next. The caller can disable this by setting
 * reprogram to zero. This is useful when the context does a reprogramming
 * anyway (e.g. timer interrupt)
 */
static void __remove_hrtimer(struct hrtimer *timer,
			     struct hrtimer_clock_base *base,
			     unsigned long newstate, int reprogram)
{
	struct timerqueue_node *next_timer;
	if (!(timer->state & HRTIMER_STATE_ENQUEUED))
		goto out;

	next_timer = timerqueue_getnext(&base->active);
	timerqueue_del(&base->active, &timer->node);
	if (&timer->node == next_timer) {
#ifdef CONFIG_HIGH_RES_TIMERS
		/* Reprogram the clock event device, if enabled */
		if (reprogram && hrtimer_hres_active()) {
			ktime_t expires;

			expires = ktime_sub(hrtimer_get_expires(timer),
					    base->offset);
			if (base->cpu_base->expires_next.tv64 == expires.tv64)
				hrtimer_force_reprogram(base->cpu_base, 1);
		}
#endif
	}
	if (!timerqueue_getnext(&base->active))
		base->cpu_base->active_bases &= ~(1 << base->index);
out:
	timer->state = newstate;
}

/*
 * remove hrtimer, called with base lock held
 */
static inline int
remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
{
	if (hrtimer_is_queued(timer)) {
		unsigned long state;
		int reprogram;

		/*
		 * Remove the timer and force reprogramming when high
		 * resolution mode is active and the timer is on the current
		 * CPU. If we remove a timer on another CPU, reprogramming is
		 * skipped. The interrupt event on this CPU is fired and
		 * reprogramming happens in the interrupt handler. This is a
		 * rare case and less expensive than a smp call.
		 */
		debug_deactivate(timer);
		timer_stats_hrtimer_clear_start_info(timer);
		reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
		/*
		 * We must preserve the CALLBACK state flag here,
		 * otherwise we could move the timer base in
		 * switch_hrtimer_base.
		 */
		state = timer->state & HRTIMER_STATE_CALLBACK;
		__remove_hrtimer(timer, base, state, reprogram);
		return 1;
	}
	return 0;
}

int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
		unsigned long delta_ns, const enum hrtimer_mode mode,
		int wakeup)
{
	struct hrtimer_clock_base *base, *new_base;
	unsigned long flags;
	int ret, leftmost;

	base = lock_hrtimer_base(timer, &flags);

	/* Remove an active timer from the queue: */
	ret = remove_hrtimer(timer, base);

	/* Switch the timer base, if necessary: */
	new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);

	if (mode & HRTIMER_MODE_REL) {
		tim = ktime_add_safe(tim, new_base->get_time());
		/*
		 * CONFIG_TIME_LOW_RES is a temporary way for architectures
		 * to signal that they simply return xtime in
		 * do_gettimeoffset(). In this case we want to round up by
		 * resolution when starting a relative timer, to avoid short
		 * timeouts. This will go away with the GTOD framework.
		 */
#ifdef CONFIG_TIME_LOW_RES
		tim = ktime_add_safe(tim, base->resolution);
#endif
	}

	hrtimer_set_expires_range_ns(timer, tim, delta_ns);

	timer_stats_hrtimer_set_start_info(timer);

	leftmost = enqueue_hrtimer(timer, new_base);

	/*
	 * Only allow reprogramming if the new base is on this CPU.
	 * (it might still be on another CPU if the timer was pending)
	 *
	 * XXX send_remote_softirq() ?
	 */
	if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases))
		hrtimer_enqueue_reprogram(timer, new_base, wakeup);

	unlock_hrtimer_base(timer, &flags);

	return ret;
}

/**
 * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
 * @timer:	the timer to be added
 * @tim:	expiry time
 * @delta_ns:	"slack" range for the timer
 * @mode:	expiry mode: absolute (HRTIMER_MODE_ABS) or relative (HRTIMER_MODE_REL)
 *
 * Returns:
 *  0 on success
 *  1 when the timer was active
 */
int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
		unsigned long delta_ns, const enum hrtimer_mode mode)
{
	return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1);
}
EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);

/**
 * hrtimer_start - (re)start an hrtimer on the current CPU
 * @timer:	the timer to be added
 * @tim:	expiry time
 * @mode:	expiry mode: absolute (HRTIMER_MODE_ABS) or relative (HRTIMER_MODE_REL)
 *
 * Returns:
 *  0 on success
 *  1 when the timer was active
 */
int
hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
{
	return __hrtimer_start_range_ns(timer, tim, 0, mode, 1);
}
EXPORT_SYMBOL_GPL(hrtimer_start);
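
/*
 * Usage sketch (illustrative; my_timer, my_callback and the 10 ms value
 * are made-up): arming a relative, one-shot high resolution timer:
 *
 *	hrtimer_init(&my_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 *	my_timer.function = my_callback;
 *	hrtimer_start(&my_timer, ktime_set(0, 10 * NSEC_PER_MSEC),
 *		      HRTIMER_MODE_REL);
 *
 * The callback returns HRTIMER_NORESTART for one-shot behaviour, or
 * HRTIMER_RESTART after forwarding the expiry (see hrtimer_forward()).
 */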

/**
 * hrtimer_try_to_cancel - try to deactivate a timer
 * @timer:	hrtimer to stop
 *
 * Returns:
 *  0 when the timer was not active
 *  1 when the timer was active
 * -1 when the timer is currently executing the callback function and
 *    cannot be stopped
 */
int hrtimer_try_to_cancel(struct hrtimer *timer)
{
	struct hrtimer_clock_base *base;
	unsigned long flags;
	int ret = -1;

	base = lock_hrtimer_base(timer, &flags);

	if (!hrtimer_callback_running(timer))
		ret = remove_hrtimer(timer, base);

	unlock_hrtimer_base(timer, &flags);

	return ret;

}
EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel);

/**
 * hrtimer_cancel - cancel a timer and wait for the handler to finish.
 * @timer:	the timer to be cancelled
 *
 * Returns:
 *  0 when the timer was not active
 *  1 when the timer was active
 */
int hrtimer_cancel(struct hrtimer *timer)
{
	for (;;) {
		int ret = hrtimer_try_to_cancel(timer);

		if (ret >= 0)
			return ret;
		cpu_relax();
	}
}
EXPORT_SYMBOL_GPL(hrtimer_cancel);
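
/*
 * Usage sketch (illustrative; my_timer and my_object are made-up names):
 * hrtimer_cancel() is the teardown helper to call before freeing the
 * structure that embeds the timer:
 *
 *	hrtimer_cancel(&my_timer);
 *	kfree(my_object);
 *
 * It busy-waits until a concurrently running callback has finished, so
 * the timer cannot fire after it returns. Where that wait is not
 * acceptable, hrtimer_try_to_cancel() can be used instead and the
 * -1 "callback running" case handled explicitly.
 */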

/**
 * hrtimer_get_remaining - get remaining time for the timer
 * @timer:	the timer to read
 */
ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
{
	unsigned long flags;
	ktime_t rem;

	lock_hrtimer_base(timer, &flags);
	rem = hrtimer_expires_remaining(timer);
	unlock_hrtimer_base(timer, &flags);

	return rem;
}
EXPORT_SYMBOL_GPL(hrtimer_get_remaining);

#ifdef CONFIG_NO_HZ
/**
 * hrtimer_get_next_event - get the time until next expiry event
 *
 * Returns the delta to the next expiry event or KTIME_MAX if no timer
 * is pending.
 */
ktime_t hrtimer_get_next_event(void)
{
	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
	struct hrtimer_clock_base *base = cpu_base->clock_base;
	ktime_t delta, mindelta = { .tv64 = KTIME_MAX };
	unsigned long flags;
	int i;

	raw_spin_lock_irqsave(&cpu_base->lock, flags);

	if (!hrtimer_hres_active()) {
		for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
			struct hrtimer *timer;
			struct timerqueue_node *next;

			next = timerqueue_getnext(&base->active);
			if (!next)
				continue;

			timer = container_of(next, struct hrtimer, node);
			delta.tv64 = hrtimer_get_expires_tv64(timer);
			delta = ktime_sub(delta, base->get_time());
			if (delta.tv64 < mindelta.tv64)
				mindelta.tv64 = delta.tv64;
		}
	}

	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);

	if (mindelta.tv64 < 0)
		mindelta.tv64 = 0;
	return mindelta;
}
#endif

static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
			   enum hrtimer_mode mode)
{
	struct hrtimer_cpu_base *cpu_base;
	int base;

	memset(timer, 0, sizeof(struct hrtimer));

	cpu_base = &__raw_get_cpu_var(hrtimer_bases);

	if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS)
		clock_id = CLOCK_MONOTONIC;

	base = hrtimer_clockid_to_base(clock_id);
	timer->base = &cpu_base->clock_base[base];
	timerqueue_init(&timer->node);

#ifdef CONFIG_TIMER_STATS
	timer->start_site = NULL;
	timer->start_pid = -1;
	memset(timer->start_comm, 0, TASK_COMM_LEN);
#endif
}

/**
 * hrtimer_init - initialize a timer to the given clock
 * @timer:	the timer to be initialized
 * @clock_id:	the clock to be used
 * @mode:	timer mode abs/rel
 */
void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
		  enum hrtimer_mode mode)
{
	debug_init(timer, clock_id, mode);
	__hrtimer_init(timer, clock_id, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_init);

/**
 * hrtimer_get_res - get the timer resolution for a clock
 * @which_clock: which clock to query
 * @tp:		 pointer to timespec variable to store the resolution
 *
 * Store the resolution of the clock selected by @which_clock in the
 * variable pointed to by @tp.
 */
int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
{
	struct hrtimer_cpu_base *cpu_base;
	int base = hrtimer_clockid_to_base(which_clock);

	cpu_base = &__raw_get_cpu_var(hrtimer_bases);
	*tp = ktime_to_timespec(cpu_base->clock_base[base].resolution);

	return 0;
}
EXPORT_SYMBOL_GPL(hrtimer_get_res);

static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
{
	struct hrtimer_clock_base *base = timer->base;
	struct hrtimer_cpu_base *cpu_base = base->cpu_base;
	enum hrtimer_restart (*fn)(struct hrtimer *);
	int restart;

	WARN_ON(!irqs_disabled());

	debug_deactivate(timer);
	__remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
	timer_stats_account_hrtimer(timer);
	fn = timer->function;

	/*
	 * Because we run timers from hardirq context, there is no chance
	 * they get migrated to another cpu, therefore it's safe to unlock
	 * the timer base.
	 */
	raw_spin_unlock(&cpu_base->lock);
	trace_hrtimer_expire_entry(timer, now);
	restart = fn(timer);
	trace_hrtimer_expire_exit(timer);
	raw_spin_lock(&cpu_base->lock);

	/*
	 * Note: We clear the CALLBACK bit after enqueue_hrtimer and
	 * we do not reprogram the event hardware. Happens either in
	 * hrtimer_start_range_ns() or in hrtimer_interrupt()
	 */
	if (restart != HRTIMER_NORESTART) {
		BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
		enqueue_hrtimer(timer, base);
	}

	WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));

	timer->state &= ~HRTIMER_STATE_CALLBACK;
}

#ifdef CONFIG_HIGH_RES_TIMERS

/*
 * High resolution timer interrupt
 * Called with interrupts disabled
 */
void hrtimer_interrupt(struct clock_event_device *dev)
{
	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
	ktime_t expires_next, now, entry_time, delta;
	int i, retries = 0;

	BUG_ON(!cpu_base->hres_active);
	cpu_base->nr_events++;
	dev->next_event.tv64 = KTIME_MAX;

	entry_time = now = ktime_get();
retry:
	expires_next.tv64 = KTIME_MAX;

	raw_spin_lock(&cpu_base->lock);
	/*
	 * We set expires_next to KTIME_MAX here with cpu_base->lock
	 * held to prevent that a timer is enqueued in our queue via
	 * the migration code. This does not affect enqueueing of
	 * timers which run their callback and need to be requeued on
	 * this CPU.
	 */
	cpu_base->expires_next.tv64 = KTIME_MAX;

	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
		struct hrtimer_clock_base *base;
		struct timerqueue_node *node;
		ktime_t basenow;

		if (!(cpu_base->active_bases & (1 << i)))
			continue;

		base = cpu_base->clock_base + i;
		basenow = ktime_add(now, base->offset);

		while ((node = timerqueue_getnext(&base->active))) {
			struct hrtimer *timer;

			timer = container_of(node, struct hrtimer, node);

			/*
			 * The immediate goal for using the softexpires is
			 * minimizing wakeups, not running timers at the
			 * earliest interrupt after their soft expiration.
			 * This allows us to avoid using a Priority Search
			 * Tree, which can answer a stabbing query for
			 * overlapping intervals and instead use the simple
			 * BST we already have.
			 * We don't add extra wakeups by delaying timers that
			 * are right-of a not yet expired timer, because that
			 * timer will have to trigger a wakeup anyway.
			 */

			if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) {
				ktime_t expires;

				expires = ktime_sub(hrtimer_get_expires(timer),
						    base->offset);
				if (expires.tv64 < expires_next.tv64)
					expires_next = expires;
				break;
			}

			__run_hrtimer(timer, &basenow);
		}
	}

	/*
	 * Store the new expiry value so the migration code can verify
	 * against it.
	 */
	cpu_base->expires_next = expires_next;
	raw_spin_unlock(&cpu_base->lock);

	/* Reprogramming necessary ? */
	if (expires_next.tv64 == KTIME_MAX ||
	    !tick_program_event(expires_next, 0)) {
		cpu_base->hang_detected = 0;
		return;
	}

	/*
	 * The next timer was already expired due to:
	 * - tracing
	 * - long lasting callbacks
	 * - being scheduled away when running in a VM
	 *
	 * We need to prevent that we loop forever in the hrtimer
	 * interrupt routine. We give it 3 attempts to avoid
	 * overreacting on some spurious event.
	 */
	now = ktime_get();
	cpu_base->nr_retries++;
	if (++retries < 3)
		goto retry;
	/*
	 * Give the system a chance to do something else than looping
	 * here. We stored the entry time, so we know exactly how long
	 * we spent here. We schedule the next event this amount of
	 * time away.
	 */
	cpu_base->nr_hangs++;
	cpu_base->hang_detected = 1;
	delta = ktime_sub(now, entry_time);
	if (delta.tv64 > cpu_base->max_hang_time.tv64)
		cpu_base->max_hang_time = delta;
	/*
	 * Limit it to a sensible value as we enforce a longer
	 * delay. Give the CPU at least 100ms to catch up.
	 */
	if (delta.tv64 > 100 * NSEC_PER_MSEC)
		expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
	else
		expires_next = ktime_add(now, delta);
	tick_program_event(expires_next, 1);
	printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n",
		    ktime_to_ns(delta));
}

/*
 * local version of hrtimer_peek_ahead_timers() called with interrupts
 * disabled.
 */
static void __hrtimer_peek_ahead_timers(void)
{
	struct tick_device *td;

	if (!hrtimer_hres_active())
		return;

	td = &__get_cpu_var(tick_cpu_device);
	if (td && td->evtdev)
		hrtimer_interrupt(td->evtdev);
}

/**
 * hrtimer_peek_ahead_timers -- run soft-expired timers now
 *
 * hrtimer_peek_ahead_timers will peek at the timer queue of
 * the current cpu and check if there are any timers for which
 * the soft expires time has passed. If any such timers exist,
 * they are run immediately and then removed from the timer queue.
 *
 */
void hrtimer_peek_ahead_timers(void)
{
	unsigned long flags;

	local_irq_save(flags);
	__hrtimer_peek_ahead_timers();
	local_irq_restore(flags);
}

static void run_hrtimer_softirq(struct softirq_action *h)
{
	hrtimer_peek_ahead_timers();
}

#else /* CONFIG_HIGH_RES_TIMERS */

static inline void __hrtimer_peek_ahead_timers(void) { }

#endif	/* !CONFIG_HIGH_RES_TIMERS */

/*
 * Called from timer softirq every jiffy, expire hrtimers:
 *
 * For HRT it's the fallback code to run the softirq in the timer
 * softirq context in case the hrtimer initialization failed or has
 * not been done yet.
 */
void hrtimer_run_pending(void)
{
	if (hrtimer_hres_active())
		return;

	/*
	 * This _is_ ugly: We have to check in the softirq context,
	 * whether we can switch to highres and / or nohz mode. The
	 * clocksource switch happens in the timer interrupt with
	 * xtime_lock held. Notification from there only sets the
	 * check bit in the tick_oneshot code, otherwise we might
	 * deadlock vs. xtime_lock.
	 */
	if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
		hrtimer_switch_to_hres();
}

/*
 * Called from hardirq context every jiffy
 */
void hrtimer_run_queues(void)
{
	struct timerqueue_node *node;
	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
	struct hrtimer_clock_base *base;
	int index, gettime = 1;

	if (hrtimer_hres_active())
		return;

	for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
		base = &cpu_base->clock_base[index];
		if (!timerqueue_getnext(&base->active))
			continue;

		if (gettime) {
			hrtimer_get_softirq_time(cpu_base);
			gettime = 0;
		}

		raw_spin_lock(&cpu_base->lock);

		while ((node = timerqueue_getnext(&base->active))) {
			struct hrtimer *timer;

			timer = container_of(node, struct hrtimer, node);
			if (base->softirq_time.tv64 <=
					hrtimer_get_expires_tv64(timer))
				break;

			__run_hrtimer(timer, &base->softirq_time);
		}
		raw_spin_unlock(&cpu_base->lock);
	}
}

/*
 * Sleep related functions:
 */
static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
{
	struct hrtimer_sleeper *t =
		container_of(timer, struct hrtimer_sleeper, timer);
	struct task_struct *task = t->task;

	t->task = NULL;
	if (task)
		wake_up_process(task);

	return HRTIMER_NORESTART;
}

void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
{
	sl->timer.function = hrtimer_wakeup;
	sl->task = task;
}
EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);

static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
{
	hrtimer_init_sleeper(t, current);

	do {
		set_current_state(TASK_INTERRUPTIBLE);
		hrtimer_start_expires(&t->timer, mode);
		if (!hrtimer_active(&t->timer))
			t->task = NULL;

		if (likely(t->task))
			schedule();

		hrtimer_cancel(&t->timer);
		mode = HRTIMER_MODE_ABS;

	} while (t->task && !signal_pending(current));

	__set_current_state(TASK_RUNNING);

	return t->task == NULL;
}

static int update_rmtp(struct hrtimer *timer, struct timespec __user *rmtp)
{
	struct timespec rmt;
	ktime_t rem;

	rem = hrtimer_expires_remaining(timer);
	if (rem.tv64 <= 0)
		return 0;
	rmt = ktime_to_timespec(rem);

	if (copy_to_user(rmtp, &rmt, sizeof(*rmtp)))
		return -EFAULT;

	return 1;
}

long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
{
	struct hrtimer_sleeper t;
	struct timespec __user  *rmtp;
	int ret = 0;

	hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid,
				HRTIMER_MODE_ABS);
	hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);

	if (do_nanosleep(&t, HRTIMER_MODE_ABS))
		goto out;

	rmtp = restart->nanosleep.rmtp;
	if (rmtp) {
		ret = update_rmtp(&t.timer, rmtp);
		if (ret <= 0)
			goto out;
	}

	/* The other values in restart are already filled in */
	ret = -ERESTART_RESTARTBLOCK;
out:
	destroy_hrtimer_on_stack(&t.timer);
	return ret;
}

long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
		       const enum hrtimer_mode mode, const clockid_t clockid)
{
	struct restart_block *restart;
	struct hrtimer_sleeper t;
	int ret = 0;
	unsigned long slack;

	slack = current->timer_slack_ns;
	if (rt_task(current))
		slack = 0;

	hrtimer_init_on_stack(&t.timer, clockid, mode);
	hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
	if (do_nanosleep(&t, mode))
		goto out;

	/* Absolute timers do not update the rmtp value and restart: */
	if (mode == HRTIMER_MODE_ABS) {
		ret = -ERESTARTNOHAND;
		goto out;
	}

	if (rmtp) {
		ret = update_rmtp(&t.timer, rmtp);
		if (ret <= 0)
			goto out;
	}

	restart = &current_thread_info()->restart_block;
	restart->fn = hrtimer_nanosleep_restart;
	restart->nanosleep.clockid = t.timer.base->clockid;
	restart->nanosleep.rmtp = rmtp;
	restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);

	ret = -ERESTART_RESTARTBLOCK;
out:
	destroy_hrtimer_on_stack(&t.timer);
	return ret;
}

SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
		struct timespec __user *, rmtp)
{
	struct timespec tu;

	if (copy_from_user(&tu, rqtp, sizeof(tu)))
		return -EFAULT;

	if (!timespec_valid(&tu))
		return -EINVAL;

	return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
}

/*
 * Functions related to boot-time initialization:
 */
static void __cpuinit init_hrtimers_cpu(int cpu)
{
	struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
	int i;

	raw_spin_lock_init(&cpu_base->lock);

	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
		cpu_base->clock_base[i].cpu_base = cpu_base;
		timerqueue_init_head(&cpu_base->clock_base[i].active);
	}

	hrtimer_init_hres(cpu_base);
}

#ifdef CONFIG_HOTPLUG_CPU

static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
				struct hrtimer_clock_base *new_base)
{
	struct hrtimer *timer;
	struct timerqueue_node *node;

	while ((node = timerqueue_getnext(&old_base->active))) {
		timer = container_of(node, struct hrtimer, node);
		BUG_ON(hrtimer_callback_running(timer));
		debug_deactivate(timer);

		/*
		 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
		 * timer could be seen as !active and just vanish away
		 * under us on another CPU
		 */
		__remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
		timer->base = new_base;
		/*
		 * Enqueue the timers on the new cpu. This does not
		 * reprogram the event device in case the timer
		 * expires before the earliest on this CPU, but we run
		 * hrtimer_interrupt after we migrated everything to
		 * sort out already expired timers and reprogram the
		 * event device.
		 */
		enqueue_hrtimer(timer, new_base);

		/* Clear the migration state bit */
		timer->state &= ~HRTIMER_STATE_MIGRATE;
	}
}

static void migrate_hrtimers(int scpu)
{
	struct hrtimer_cpu_base *old_base, *new_base;
	int i;

	BUG_ON(cpu_online(scpu));
	tick_cancel_sched_timer(scpu);

	local_irq_disable();
	old_base = &per_cpu(hrtimer_bases, scpu);
	new_base = &__get_cpu_var(hrtimer_bases);
	/*
	 * The caller is globally serialized and nobody else
	 * takes two locks at once, deadlock is not possible.
	 */
	raw_spin_lock(&new_base->lock);
	raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);

	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
		migrate_hrtimer_list(&old_base->clock_base[i],
				     &new_base->clock_base[i]);
	}

	raw_spin_unlock(&old_base->lock);
	raw_spin_unlock(&new_base->lock);

	/* Check, if we got expired work to do */
	__hrtimer_peek_ahead_timers();
	local_irq_enable();
}

#endif /* CONFIG_HOTPLUG_CPU */

static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
					unsigned long action, void *hcpu)
{
	int scpu = (long)hcpu;

	switch (action) {

	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		init_hrtimers_cpu(scpu);
		break;

#ifdef CONFIG_HOTPLUG_CPU
	case CPU_DYING:
	case CPU_DYING_FROZEN:
		clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
	{
		clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu);
		migrate_hrtimers(scpu);
		break;
	}
#endif

	default:
		break;
	}

	return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata hrtimers_nb = {
	.notifier_call = hrtimer_cpu_notify,
};

void __init hrtimers_init(void)
{
	hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
			  (void *)(long)smp_processor_id());
	register_cpu_notifier(&hrtimers_nb);
#ifdef CONFIG_HIGH_RES_TIMERS
	open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
#endif
}

/**
 * schedule_hrtimeout_range_clock - sleep until timeout
 * @expires:	timeout value (ktime_t)
 * @delta:	slack in expires timeout (ktime_t)
 * @mode:	timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
 * @clock:	timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME
 */
int __sched
schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
			       const enum hrtimer_mode mode, int clock)
{
	struct hrtimer_sleeper t;

	/*
	 * Optimize when a zero timeout value is given. It does not
	 * matter whether this is an absolute or a relative time.
	 */
	if (expires && !expires->tv64) {
		__set_current_state(TASK_RUNNING);
		return 0;
	}

	/*
	 * A NULL parameter means "infinite"
	 */
	if (!expires) {
		schedule();
		__set_current_state(TASK_RUNNING);
		return -EINTR;
	}

	hrtimer_init_on_stack(&t.timer, clock, mode);
	hrtimer_set_expires_range_ns(&t.timer, *expires, delta);

	hrtimer_init_sleeper(&t, current);

	hrtimer_start_expires(&t.timer, mode);
	if (!hrtimer_active(&t.timer))
		t.task = NULL;

	if (likely(t.task))
		schedule();

	hrtimer_cancel(&t.timer);
	destroy_hrtimer_on_stack(&t.timer);

	__set_current_state(TASK_RUNNING);

	return !t.task ? 0 : -EINTR;
}

/**
 * schedule_hrtimeout_range - sleep until timeout
 * @expires:	timeout value (ktime_t)
 * @delta:	slack in expires timeout (ktime_t)
 * @mode:	timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
 *
 * Make the current task sleep until the given expiry time has
 * elapsed. The routine will return immediately unless
 * the current task state has been set (see set_current_state()).
 *
 * The @delta argument gives the kernel the freedom to schedule the
 * actual wakeup to a time that is both power and performance friendly.
 * The kernel gives the normal best effort behavior for "@expires+@delta",
 * but may decide to fire the timer earlier, but no earlier than @expires.
 *
 * You can set the task state as follows -
 *
 * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
 * pass before the routine returns.
 *
 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
 * delivered to the current task.
 *
 * The current task state is guaranteed to be TASK_RUNNING when this
 * routine returns.
 *
 * Returns 0 when the timer has expired otherwise -EINTR
 */
int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
				     const enum hrtimer_mode mode)
{
	return schedule_hrtimeout_range_clock(expires, delta, mode,
					      CLOCK_MONOTONIC);
}
EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
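
/*
 * Usage sketch (illustrative; the 2 ms timeout and 500 us slack are
 * made-up values): sleeping with slack so the wakeup can be coalesced
 * with other events:
 *
 *	ktime_t timeout = ktime_set(0, 2 * NSEC_PER_MSEC);
 *
 *	set_current_state(TASK_INTERRUPTIBLE);
 *	ret = schedule_hrtimeout_range(&timeout, 500 * NSEC_PER_USEC,
 *				       HRTIMER_MODE_REL);
 *
 * A zero return means the timeout elapsed; -EINTR means a signal or
 * another wakeup ended the sleep early.
 */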

/**
 * schedule_hrtimeout - sleep until timeout
 * @expires:	timeout value (ktime_t)
 * @mode:	timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
 *
 * Make the current task sleep until the given expiry time has
 * elapsed. The routine will return immediately unless
 * the current task state has been set (see set_current_state()).
 *
 * You can set the task state as follows -
 *
 * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
 * pass before the routine returns.
 *
 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
 * delivered to the current task.
 *
 * The current task state is guaranteed to be TASK_RUNNING when this
 * routine returns.
 *
 * Returns 0 when the timer has expired otherwise -EINTR
 */
int __sched schedule_hrtimeout(ktime_t *expires,
			       const enum hrtimer_mode mode)
{
	return schedule_hrtimeout_range(expires, 0, mode);
}
EXPORT_SYMBOL_GPL(schedule_hrtimeout);