From 7bb67439bf6bd3782f07f1d7be1e63406453d5de Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 31 Aug 2008 08:05:58 -0700
Subject: select: Introduce a hrtimeout function

This patch adds a schedule_hrtimeout() function, to be used by select() and
poll() in a later patch. This function works similar to schedule_timeout()
in most ways, but takes a timespec rather than jiffies.

With a lot of contributions/fixes from Thomas

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h |  2 ++
 kernel/hrtimer.c        | 65 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+)

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 6d93dce61cb..becd17db1a1 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -346,6 +346,8 @@ extern long hrtimer_nanosleep_restart(struct restart_block *restart_block);
 extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
 				 struct task_struct *tsk);
 
+extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode);
+
 /* Soft interrupt function to run the hrtimer queues: */
 extern void hrtimer_run_queues(void);
 extern void hrtimer_run_pending(void);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index b8e4dce80a7..782137dc755 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1678,3 +1678,68 @@ void __init hrtimers_init(void)
 #endif
 }
 
+/**
+ * schedule_hrtimeout - sleep until timeout
+ * @expires:	timeout value (ktime_t)
+ * @mode:	timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
+ *
+ * Make the current task sleep until the given expiry time has
+ * elapsed. The routine will return immediately unless
+ * the current task state has been set (see set_current_state()).
+ *
+ * You can set the task state as follows -
+ *
+ * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
+ * pass before the routine returns.
+ *
+ * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
+ * delivered to the current task.
+ *
+ * The current task state is guaranteed to be TASK_RUNNING when this
+ * routine returns.
+ *
+ * Returns 0 when the timer has expired otherwise -EINTR
+ */
+int __sched schedule_hrtimeout(ktime_t *expires,
+			       const enum hrtimer_mode mode)
+{
+	struct hrtimer_sleeper t;
+
+	/*
+	 * Optimize when a zero timeout value is given. It does not
+	 * matter whether this is an absolute or a relative time.
+	 */
+	if (expires && !expires->tv64) {
+		__set_current_state(TASK_RUNNING);
+		return 0;
+	}
+
+	/*
+	 * A NULL parameter means "inifinte"
+	 */
+	if (!expires) {
+		schedule();
+		__set_current_state(TASK_RUNNING);
+		return -EINTR;
+	}
+
+	hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, mode);
+	t.timer.expires = *expires;
+
+	hrtimer_init_sleeper(&t, current);
+
+	hrtimer_start(&t.timer, t.timer.expires, mode);
+	if (!hrtimer_active(&t.timer))
+		t.task = NULL;
+
+	if (likely(t.task))
+		schedule();
+
+	hrtimer_cancel(&t.timer);
+	destroy_hrtimer_on_stack(&t.timer);
+
+	__set_current_state(TASK_RUNNING);
+
+	return !t.task ? 0 : -EINTR;
+}
+EXPORT_SYMBOL_GPL(schedule_hrtimeout);
-- 
cgit v1.2.3


From df0cc0539b4127bd02f64de2c335b4af1fdb3845 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 31 Aug 2008 08:09:53 -0700
Subject: select: add a timespec_add_safe() function

For the select() rework, it's important to be able to add timespec
structures in an overflow-safe manner.

This patch adds a timespec_add_safe() function for this which is similar in
operation to ktime_add_safe(), but works on a struct timespec.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 include/linux/time.h |  4 ++++
 kernel/time.c        | 18 ++++++++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/include/linux/time.h b/include/linux/time.h
index e15206a7e82..72697647848 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -38,6 +38,8 @@ struct timezone {
 #define NSEC_PER_SEC	1000000000L
 #define FSEC_PER_SEC	1000000000000000L
 
+#define TIME_T_MAX	(time_t)((1UL << ((sizeof(time_t) << 3) - 1)) - 1)
+
 static inline int timespec_equal(const struct timespec *a,
                                  const struct timespec *b)
 {
@@ -72,6 +74,8 @@ extern unsigned long mktime(const unsigned int year, const unsigned int mon,
 			    const unsigned int min, const unsigned int sec);
 
 extern void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec);
+extern struct timespec timespec_add_safe(const struct timespec lhs,
+					 const struct timespec rhs);
 
 /*
  * sub = lhs - rhs, in normalized form
diff --git a/kernel/time.c b/kernel/time.c
index 6a08660b4fa..d63a4336fad 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -669,3 +669,21 @@ EXPORT_SYMBOL(get_jiffies_64);
 #endif
 
 EXPORT_SYMBOL(jiffies);
+
+/*
+ * Add two timespec values and do a safety check for overflow.
+ * It's assumed that both values are valid (>= 0)
+ */
+struct timespec timespec_add_safe(const struct timespec lhs,
+				  const struct timespec rhs)
+{
+	struct timespec res;
+
+	set_normalized_timespec(&res, lhs.tv_sec + rhs.tv_sec,
+				lhs.tv_nsec + rhs.tv_nsec);
+
+	if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)
+		res.tv_sec = TIME_T_MAX;
+
+	return res;
+}
-- 
cgit v1.2.3


From b773ad40aca5bd755ba886620842f16e8fef6d75 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 31 Aug 2008 08:16:57 -0700
Subject: select: add poll_select_set_timeout() and
 poll_select_copy_remaining() helpers

This patch adds 2 helpers that will be used for the hrtimer based select/poll:

poll_select_set_timeout() is a helper that takes a timeout (as a second, nanosecond
pair) and turns that into a "struct timespec" that represents the absolute end time.
This is a common operation in the many select() and poll() variants and needs various,
common, sanity checks.

poll_select_copy_remaining() is a helper that takes care of copying the remaining
time to userspace, as select(), pselect() and ppoll() do. This function comes in
both a natural and a compat implementation (due to datastructure differences).

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 fs/compat.c          | 51 +++++++++++++++++++++++++++++++++++
 fs/select.c          | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/poll.h |  2 ++
 3 files changed, 128 insertions(+)

diff --git a/fs/compat.c b/fs/compat.c
index 075d0509970..424767c954a 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1436,6 +1436,57 @@ out_ret:
 
 #define __COMPAT_NFDBITS       (8 * sizeof(compat_ulong_t))
 
+static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
+				      int timeval, int ret)
+{
+	struct timespec ts;
+
+	if (!p)
+		return ret;
+
+	if (current->personality & STICKY_TIMEOUTS)
+		goto sticky;
+
+	/* No update for zero timeout */
+	if (!end_time->tv_sec && !end_time->tv_nsec)
+		return ret;
+
+	ktime_get_ts(&ts);
+	ts = timespec_sub(*end_time, ts);
+	if (ts.tv_sec < 0)
+		ts.tv_sec = ts.tv_nsec = 0;
+
+	if (timeval) {
+		struct compat_timeval rtv;
+
+		rtv.tv_sec = ts.tv_sec;
+		rtv.tv_usec = ts.tv_nsec / NSEC_PER_USEC;
+
+		if (!copy_to_user(p, &rtv, sizeof(rtv)))
+			return ret;
+	} else {
+		struct compat_timespec rts;
+
+		rts.tv_sec = ts.tv_sec;
+		rts.tv_nsec = ts.tv_nsec;
+
+		if (!copy_to_user(p, &rts, sizeof(rts)))
+			return ret;
+	}
+	/*
+	 * If an application puts its timeval in read-only memory, we
+	 * don't want the Linux-specific update to the timeval to
+	 * cause a fault after the select has completed
+	 * successfully. However, because we're not updating the
+	 * timeval, we can't restart the system call.
+	 */
+
+sticky:
+	if (ret == -ERESTARTNOHAND)
+		ret = -EINTR;
+	return ret;
+}
+
 /*
  * Ooo, nasty.  We need here to frob 32-bit unsigned longs to
  * 64-bit unsigned longs.
diff --git a/fs/select.c b/fs/select.c
index da0e88201c3..1180a620778 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -130,6 +130,81 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
 	add_wait_queue(wait_address, &entry->wait);
 }
 
+/**
+ * poll_select_set_timeout - helper function to setup the timeout value
+ * @to:		pointer to timespec variable for the final timeout
+ * @sec:	seconds (from user space)
+ * @nsec:	nanoseconds (from user space)
+ *
+ * Note, we do not use a timespec for the user space value here, That
+ * way we can use the function for timeval and compat interfaces as well.
+ *
+ * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0.
+ */
+int poll_select_set_timeout(struct timespec *to, long sec, long nsec)
+{
+	struct timespec ts = {.tv_sec = sec, .tv_nsec = nsec};
+
+	if (!timespec_valid(&ts))
+		return -EINVAL;
+
+	/* Optimize for the zero timeout value here */
+	if (!sec && !nsec) {
+		to->tv_sec = to->tv_nsec = 0;
+	} else {
+		ktime_get_ts(to);
+		*to = timespec_add_safe(*to, ts);
+	}
+	return 0;
+}
+
+static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
+				      int timeval, int ret)
+{
+	struct timespec rts;
+	struct timeval rtv;
+
+	if (!p)
+		return ret;
+
+	if (current->personality & STICKY_TIMEOUTS)
+		goto sticky;
+
+	/* No update for zero timeout */
+	if (!end_time->tv_sec && !end_time->tv_nsec)
+		return ret;
+
+	ktime_get_ts(&rts);
+	rts = timespec_sub(*end_time, rts);
+	if (rts.tv_sec < 0)
+		rts.tv_sec = rts.tv_nsec = 0;
+
+	if (timeval) {
+		rtv.tv_sec = rts.tv_sec;
+		rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
+
+		if (!copy_to_user(p, &rtv, sizeof(rtv)))
+			return ret;
+
+	} else if (!copy_to_user(p, &rts, sizeof(rts)))
+		return ret;
+
+	/*
+	 * If an application puts its timeval in read-only memory, we
+	 * don't want the Linux-specific update to the timeval to
+	 * cause a fault after the select has completed
+	 * successfully. However, because we're not updating the
+	 * timeval, we can't restart the system call.
+	 */
+
+sticky:
+	if (ret == -ERESTARTNOHAND)
+		ret = -EINTR;
+	return ret;
+}
+
+
+
 #define FDS_IN(fds, n)		(fds->in + n)
 #define FDS_OUT(fds, n)		(fds->out + n)
 #define FDS_EX(fds, n)		(fds->ex + n)
diff --git a/include/linux/poll.h b/include/linux/poll.h
index ef453828877..f65de5128a9 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -120,6 +120,8 @@ extern int do_sys_poll(struct pollfd __user * ufds, unsigned int nfds,
 extern int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 			   fd_set __user *exp, s64 *timeout);
 
+extern int poll_select_set_timeout(struct timespec *to, long sec, long nsec);
+
 #endif /* KERNEL */
 
 #endif /* _LINUX_POLL_H */
-- 
cgit v1.2.3


From be5dad20a55e054a35dac7f6f5f184dc72b379b4 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 31 Aug 2008 08:19:15 -0700
Subject: select: add a poll specific struct to the restart_block union

with hrtimer poll/select, the signal restart data no longer is a single
long representing a jiffies count, but it becomes a second/nanosecond pair
that also needs to encode if there was a timeout at all or not.

This patch adds a struct to the restart_block union for this purpose

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 include/linux/thread_info.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 38a56477f27..e6b820f8b56 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -38,6 +38,14 @@ struct restart_block {
 #endif
 			u64 expires;
 		} nanosleep;
+		/* For poll */
+		struct {
+			struct pollfd __user *ufds;
+			int nfds;
+			int has_timeout;
+			unsigned long tv_sec;
+			unsigned long tv_nsec;
+		} poll;
 	};
 };
 
-- 
cgit v1.2.3


From 8ff3e8e85fa6c312051134b3953e397feb639f51 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 31 Aug 2008 08:26:40 -0700
Subject: select: switch select() and poll() over to hrtimers

With lots of help, input and cleanups from Thomas Gleixner

This patch switches select() and poll() over to hrtimers.

The core of the patch is replacing the "s64 timeout" with a
"struct timespec end_time" in all the plumbing.

But most of the diffstat comes from using the just introduced helpers:
	poll_select_set_timeout
	poll_select_copy_remaining
	timespec_add_safe
which make manipulating the timespec easier and less error-prone.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 fs/compat.c          | 136 ++++----------------------
 fs/select.c          | 263 +++++++++++++++++----------------------------------
 include/linux/poll.h |   6 +-
 3 files changed, 111 insertions(+), 294 deletions(-)

diff --git a/fs/compat.c b/fs/compat.c
index 424767c954a..133ed7f5d68 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1568,7 +1568,8 @@ int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
 	((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1)
 
 int compat_core_sys_select(int n, compat_ulong_t __user *inp,
-	compat_ulong_t __user *outp, compat_ulong_t __user *exp, s64 *timeout)
+	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
+	struct timespec *end_time)
 {
 	fd_set_bits fds;
 	void *bits;
@@ -1615,7 +1616,7 @@ int compat_core_sys_select(int n, compat_ulong_t __user *inp,
 	zero_fd_set(n, fds.res_out);
 	zero_fd_set(n, fds.res_ex);
 
-	ret = do_select(n, &fds, timeout);
+	ret = do_select(n, &fds, end_time);
 
 	if (ret < 0)
 		goto out;
@@ -1641,7 +1642,7 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
 	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
 	struct compat_timeval __user *tvp)
 {
-	s64 timeout = -1;
+	struct timespec end_time, *to = NULL;
 	struct compat_timeval tv;
 	int ret;
 
@@ -1649,43 +1650,14 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
 		if (copy_from_user(&tv, tvp, sizeof(tv)))
 			return -EFAULT;
 
-		if (tv.tv_sec < 0 || tv.tv_usec < 0)
+		to = &end_time;
+		if (poll_select_set_timeout(to, tv.tv_sec,
+					    tv.tv_usec * NSEC_PER_USEC))
 			return -EINVAL;
-
-		/* Cast to u64 to make GCC stop complaining */
-		if ((u64)tv.tv_sec >= (u64)MAX_INT64_SECONDS)
-			timeout = -1;	/* infinite */
-		else {
-			timeout = DIV_ROUND_UP(tv.tv_usec, 1000000/HZ);
-			timeout += tv.tv_sec * HZ;
-		}
 	}
 
-	ret = compat_core_sys_select(n, inp, outp, exp, &timeout);
-
-	if (tvp) {
-		struct compat_timeval rtv;
-
-		if (current->personality & STICKY_TIMEOUTS)
-			goto sticky;
-		rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));
-		rtv.tv_sec = timeout;
-		if (compat_timeval_compare(&rtv, &tv) >= 0)
-			rtv = tv;
-		if (copy_to_user(tvp, &rtv, sizeof(rtv))) {
-sticky:
-			/*
-			 * If an application puts its timeval in read-only
-			 * memory, we don't want the Linux-specific update to
-			 * the timeval to cause a fault after the select has
-			 * completed successfully. However, because we're not
-			 * updating the timeval, we can't restart the system
-			 * call.
-			 */
-			if (ret == -ERESTARTNOHAND)
-				ret = -EINTR;
-		}
-	}
+	ret = compat_core_sys_select(n, inp, outp, exp, to);
+	ret = poll_select_copy_remaining(&end_time, tvp, 1, ret);
 
 	return ret;
 }
@@ -1698,15 +1670,16 @@ asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp,
 {
 	compat_sigset_t ss32;
 	sigset_t ksigmask, sigsaved;
-	s64 timeout = MAX_SCHEDULE_TIMEOUT;
 	struct compat_timespec ts;
+	struct timespec end_time, *to = NULL;
 	int ret;
 
 	if (tsp) {
 		if (copy_from_user(&ts, tsp, sizeof(ts)))
 			return -EFAULT;
 
-		if (ts.tv_sec < 0 || ts.tv_nsec < 0)
+		to = &end_time;
+		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
 			return -EINVAL;
 	}
 
@@ -1721,51 +1694,8 @@ asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp,
 		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
 	}
 
-	do {
-		if (tsp) {
-			if ((unsigned long)ts.tv_sec < MAX_SELECT_SECONDS) {
-				timeout = DIV_ROUND_UP(ts.tv_nsec, 1000000000/HZ);
-				timeout += ts.tv_sec * (unsigned long)HZ;
-				ts.tv_sec = 0;
-				ts.tv_nsec = 0;
-			} else {
-				ts.tv_sec -= MAX_SELECT_SECONDS;
-				timeout = MAX_SELECT_SECONDS * HZ;
-			}
-		}
-
-		ret = compat_core_sys_select(n, inp, outp, exp, &timeout);
-
-	} while (!ret && !timeout && tsp && (ts.tv_sec || ts.tv_nsec));
-
-	if (tsp) {
-		struct compat_timespec rts;
-
-		if (current->personality & STICKY_TIMEOUTS)
-			goto sticky;
-
-		rts.tv_sec = timeout / HZ;
-		rts.tv_nsec = (timeout % HZ) * (NSEC_PER_SEC/HZ);
-		if (rts.tv_nsec >= NSEC_PER_SEC) {
-			rts.tv_sec++;
-			rts.tv_nsec -= NSEC_PER_SEC;
-		}
-		if (compat_timespec_compare(&rts, &ts) >= 0)
-			rts = ts;
-		if (copy_to_user(tsp, &rts, sizeof(rts))) {
-sticky:
-			/*
-			 * If an application puts its timeval in read-only
-			 * memory, we don't want the Linux-specific update to
-			 * the timeval to cause a fault after the select has
-			 * completed successfully. However, because we're not
-			 * updating the timeval, we can't restart the system
-			 * call.
-			 */
-			if (ret == -ERESTARTNOHAND)
-				ret = -EINTR;
-		}
-	}
+	ret = compat_core_sys_select(n, inp, outp, exp, to);
+	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
 
 	if (ret == -ERESTARTNOHAND) {
 		/*
@@ -1810,18 +1740,16 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
 	compat_sigset_t ss32;
 	sigset_t ksigmask, sigsaved;
 	struct compat_timespec ts;
-	s64 timeout = -1;
+	struct timespec end_time, *to = NULL;
 	int ret;
 
 	if (tsp) {
 		if (copy_from_user(&ts, tsp, sizeof(ts)))
 			return -EFAULT;
 
-		/* We assume that ts.tv_sec is always lower than
-		   the number of seconds that can be expressed in
-		   an s64. Otherwise the compiler bitches at us */
-		timeout = DIV_ROUND_UP(ts.tv_nsec, 1000000000/HZ);
-		timeout += ts.tv_sec * HZ;
+		to = &end_time;
+		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+			return -EINVAL;
 	}
 
 	if (sigmask) {
@@ -1835,7 +1763,7 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
 		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
 	}
 
-	ret = do_sys_poll(ufds, nfds, &timeout);
+	ret = do_sys_poll(ufds, nfds, to);
 
 	/* We can restart this syscall, usually */
 	if (ret == -EINTR) {
@@ -1853,31 +1781,7 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
 	} else if (sigmask)
 		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 
-	if (tsp && timeout >= 0) {
-		struct compat_timespec rts;
-
-		if (current->personality & STICKY_TIMEOUTS)
-			goto sticky;
-		/* Yes, we know it's actually an s64, but it's also positive. */
-		rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) *
-					1000;
-		rts.tv_sec = timeout;
-		if (compat_timespec_compare(&rts, &ts) >= 0)
-			rts = ts;
-		if (copy_to_user(tsp, &rts, sizeof(rts))) {
-sticky:
-			/*
-			 * If an application puts its timeval in read-only
-			 * memory, we don't want the Linux-specific update to
-			 * the timeval to cause a fault after the select has
-			 * completed successfully. However, because we're not
-			 * updating the timeval, we can't restart the system
-			 * call.
-			 */
-			if (ret == -ERESTARTNOHAND && timeout >= 0)
-				ret = -EINTR;
-		}
-	}
+	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
 
 	return ret;
 }
diff --git a/fs/select.c b/fs/select.c
index 1180a620778..f6dceb56793 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -24,6 +24,7 @@
 #include <linux/fdtable.h>
 #include <linux/fs.h>
 #include <linux/rcupdate.h>
+#include <linux/hrtimer.h>
 
 #include <asm/uaccess.h>
 
@@ -203,8 +204,6 @@ sticky:
 	return ret;
 }
 
-
-
 #define FDS_IN(fds, n)		(fds->in + n)
 #define FDS_OUT(fds, n)		(fds->out + n)
 #define FDS_EX(fds, n)		(fds->ex + n)
@@ -257,11 +256,12 @@ get_max:
 #define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
 #define POLLEX_SET (POLLPRI)
 
-int do_select(int n, fd_set_bits *fds, s64 *timeout)
+int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 {
+	ktime_t expire, *to = NULL;
 	struct poll_wqueues table;
 	poll_table *wait;
-	int retval, i;
+	int retval, i, timed_out = 0;
 
 	rcu_read_lock();
 	retval = max_select_fd(n, fds);
@@ -273,12 +273,14 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout)
 
 	poll_initwait(&table);
 	wait = &table.pt;
-	if (!*timeout)
+	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
 		wait = NULL;
+		timed_out = 1;
+	}
+
 	retval = 0;
 	for (;;) {
 		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
-		long __timeout;
 
 		set_current_state(TASK_INTERRUPTIBLE);
 
@@ -334,27 +336,25 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout)
 			cond_resched();
 		}
 		wait = NULL;
-		if (retval || !*timeout || signal_pending(current))
+		if (retval || timed_out || signal_pending(current))
 			break;
 		if (table.error) {
 			retval = table.error;
 			break;
 		}
 
-		if (*timeout < 0) {
-			/* Wait indefinitely */
-			__timeout = MAX_SCHEDULE_TIMEOUT;
-		} else if (unlikely(*timeout >= (s64)MAX_SCHEDULE_TIMEOUT - 1)) {
-			/* Wait for longer than MAX_SCHEDULE_TIMEOUT. Do it in a loop */
-			__timeout = MAX_SCHEDULE_TIMEOUT - 1;
-			*timeout -= __timeout;
-		} else {
-			__timeout = *timeout;
-			*timeout = 0;
+		/*
+		 * If this is the first loop and we have a timeout
+		 * given, then we convert to ktime_t and set the to
+		 * pointer to the expiry value.
+		 */
+		if (end_time && !to) {
+			expire = timespec_to_ktime(*end_time);
+			to = &expire;
 		}
-		__timeout = schedule_timeout(__timeout);
-		if (*timeout >= 0)
-			*timeout += __timeout;
+
+		if (!schedule_hrtimeout(to, HRTIMER_MODE_ABS))
+			timed_out = 1;
 	}
 	__set_current_state(TASK_RUNNING);
 
@@ -375,7 +375,7 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout)
 	((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1)
 
 int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
-			   fd_set __user *exp, s64 *timeout)
+			   fd_set __user *exp, struct timespec *end_time)
 {
 	fd_set_bits fds;
 	void *bits;
@@ -426,7 +426,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 	zero_fd_set(n, fds.res_out);
 	zero_fd_set(n, fds.res_ex);
 
-	ret = do_select(n, &fds, timeout);
+	ret = do_select(n, &fds, end_time);
 
 	if (ret < 0)
 		goto out;
@@ -452,7 +452,7 @@ out_nofds:
 asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 			fd_set __user *exp, struct timeval __user *tvp)
 {
-	s64 timeout = -1;
+	struct timespec end_time, *to = NULL;
 	struct timeval tv;
 	int ret;
 
@@ -460,43 +460,14 @@ asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 		if (copy_from_user(&tv, tvp, sizeof(tv)))
 			return -EFAULT;
 
-		if (tv.tv_sec < 0 || tv.tv_usec < 0)
+		to = &end_time;
+		if (poll_select_set_timeout(to, tv.tv_sec,
+					    tv.tv_usec * NSEC_PER_USEC))
 			return -EINVAL;
-
-		/* Cast to u64 to make GCC stop complaining */
-		if ((u64)tv.tv_sec >= (u64)MAX_INT64_SECONDS)
-			timeout = -1;	/* infinite */
-		else {
-			timeout = DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC/HZ);
-			timeout += tv.tv_sec * HZ;
-		}
 	}
 
-	ret = core_sys_select(n, inp, outp, exp, &timeout);
-
-	if (tvp) {
-		struct timeval rtv;
-
-		if (current->personality & STICKY_TIMEOUTS)
-			goto sticky;
-		rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));
-		rtv.tv_sec = timeout;
-		if (timeval_compare(&rtv, &tv) >= 0)
-			rtv = tv;
-		if (copy_to_user(tvp, &rtv, sizeof(rtv))) {
-sticky:
-			/*
-			 * If an application puts its timeval in read-only
-			 * memory, we don't want the Linux-specific update to
-			 * the timeval to cause a fault after the select has
-			 * completed successfully. However, because we're not
-			 * updating the timeval, we can't restart the system
-			 * call.
-			 */
-			if (ret == -ERESTARTNOHAND)
-				ret = -EINTR;
-		}
-	}
+	ret = core_sys_select(n, inp, outp, exp, to);
+	ret = poll_select_copy_remaining(&end_time, tvp, 1, ret);
 
 	return ret;
 }
@@ -506,25 +477,17 @@ asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp,
 		fd_set __user *exp, struct timespec __user *tsp,
 		const sigset_t __user *sigmask, size_t sigsetsize)
 {
-	s64 timeout = MAX_SCHEDULE_TIMEOUT;
 	sigset_t ksigmask, sigsaved;
-	struct timespec ts;
+	struct timespec ts, end_time, *to = NULL;
 	int ret;
 
 	if (tsp) {
 		if (copy_from_user(&ts, tsp, sizeof(ts)))
 			return -EFAULT;
 
-		if (ts.tv_sec < 0 || ts.tv_nsec < 0)
+		to = &end_time;
+		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
 			return -EINVAL;
-
-		/* Cast to u64 to make GCC stop complaining */
-		if ((u64)ts.tv_sec >= (u64)MAX_INT64_SECONDS)
-			timeout = -1;	/* infinite */
-		else {
-			timeout = DIV_ROUND_UP(ts.tv_nsec, NSEC_PER_SEC/HZ);
-			timeout += ts.tv_sec * HZ;
-		}
 	}
 
 	if (sigmask) {
@@ -538,32 +501,8 @@ asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp,
 		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
 	}
 
-	ret = core_sys_select(n, inp, outp, exp, &timeout);
-
-	if (tsp) {
-		struct timespec rts;
-
-		if (current->personality & STICKY_TIMEOUTS)
-			goto sticky;
-		rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) *
-						1000;
-		rts.tv_sec = timeout;
-		if (timespec_compare(&rts, &ts) >= 0)
-			rts = ts;
-		if (copy_to_user(tsp, &rts, sizeof(rts))) {
-sticky:
-			/*
-			 * If an application puts its timeval in read-only
-			 * memory, we don't want the Linux-specific update to
-			 * the timeval to cause a fault after the select has
-			 * completed successfully. However, because we're not
-			 * updating the timeval, we can't restart the system
-			 * call.
-			 */
-			if (ret == -ERESTARTNOHAND)
-				ret = -EINTR;
-		}
-	}
+	ret = core_sys_select(n, inp, outp, exp, &end_time);
+	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
 
 	if (ret == -ERESTARTNOHAND) {
 		/*
@@ -649,18 +588,20 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
 }
 
 static int do_poll(unsigned int nfds,  struct poll_list *list,
-		   struct poll_wqueues *wait, s64 *timeout)
+		   struct poll_wqueues *wait, struct timespec *end_time)
 {
-	int count = 0;
 	poll_table* pt = &wait->pt;
+	ktime_t expire, *to = NULL;
+	int timed_out = 0, count = 0;
 
 	/* Optimise the no-wait case */
-	if (!(*timeout))
+	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
 		pt = NULL;
+		timed_out = 1;
+	}
 
 	for (;;) {
 		struct poll_list *walk;
-		long __timeout;
 
 		set_current_state(TASK_INTERRUPTIBLE);
 		for (walk = list; walk != NULL; walk = walk->next) {
@@ -692,27 +633,21 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
 			if (signal_pending(current))
 				count = -EINTR;
 		}
-		if (count || !*timeout)
+		if (count || timed_out)
 			break;
 
-		if (*timeout < 0) {
-			/* Wait indefinitely */
-			__timeout = MAX_SCHEDULE_TIMEOUT;
-		} else if (unlikely(*timeout >= (s64)MAX_SCHEDULE_TIMEOUT-1)) {
-			/*
-			 * Wait for longer than MAX_SCHEDULE_TIMEOUT. Do it in
-			 * a loop
-			 */
-			__timeout = MAX_SCHEDULE_TIMEOUT - 1;
-			*timeout -= __timeout;
-		} else {
-			__timeout = *timeout;
-			*timeout = 0;
+		/*
+		 * If this is the first loop and we have a timeout
+		 * given, then we convert to ktime_t and set the to
+		 * pointer to the expiry value.
+		 */
+		if (end_time && !to) {
+			expire = timespec_to_ktime(*end_time);
+			to = &expire;
 		}
 
-		__timeout = schedule_timeout(__timeout);
-		if (*timeout >= 0)
-			*timeout += __timeout;
+		if (!schedule_hrtimeout(to, HRTIMER_MODE_ABS))
+			timed_out = 1;
 	}
 	__set_current_state(TASK_RUNNING);
 	return count;
@@ -721,7 +656,8 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
 #define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list))  / \
 			sizeof(struct pollfd))
 
-int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, s64 *timeout)
+int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
+		struct timespec *end_time)
 {
 	struct poll_wqueues table;
  	int err = -EFAULT, fdcount, len, size;
@@ -761,7 +697,7 @@ int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, s64 *timeout)
 	}
 
 	poll_initwait(&table);
-	fdcount = do_poll(nfds, head, &table, timeout);
+	fdcount = do_poll(nfds, head, &table, end_time);
 	poll_freewait(&table);
 
 	for (walk = head; walk; walk = walk->next) {
@@ -787,16 +723,21 @@ out_fds:
 
 static long do_restart_poll(struct restart_block *restart_block)
 {
-	struct pollfd __user *ufds = (struct pollfd __user*)restart_block->arg0;
-	int nfds = restart_block->arg1;
-	s64 timeout = ((s64)restart_block->arg3<<32) | (s64)restart_block->arg2;
+	struct pollfd __user *ufds = restart_block->poll.ufds;
+	int nfds = restart_block->poll.nfds;
+	struct timespec *to = NULL, end_time;
 	int ret;
 
-	ret = do_sys_poll(ufds, nfds, &timeout);
+	if (restart_block->poll.has_timeout) {
+		end_time.tv_sec = restart_block->poll.tv_sec;
+		end_time.tv_nsec = restart_block->poll.tv_nsec;
+		to = &end_time;
+	}
+
+	ret = do_sys_poll(ufds, nfds, to);
+
 	if (ret == -EINTR) {
 		restart_block->fn = do_restart_poll;
-		restart_block->arg2 = timeout & 0xFFFFFFFF;
-		restart_block->arg3 = (u64)timeout >> 32;
 		ret = -ERESTART_RESTARTBLOCK;
 	}
 	return ret;
@@ -805,31 +746,32 @@ static long do_restart_poll(struct restart_block *restart_block)
 asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds,
 			long timeout_msecs)
 {
-	s64 timeout_jiffies;
+	struct timespec end_time, *to = NULL;
 	int ret;
 
-	if (timeout_msecs > 0) {
-#if HZ > 1000
-		/* We can only overflow if HZ > 1000 */
-		if (timeout_msecs / 1000 > (s64)0x7fffffffffffffffULL / (s64)HZ)
-			timeout_jiffies = -1;
-		else
-#endif
-			timeout_jiffies = msecs_to_jiffies(timeout_msecs) + 1;
-	} else {
-		/* Infinite (< 0) or no (0) timeout */
-		timeout_jiffies = timeout_msecs;
+	if (timeout_msecs >= 0) {
+		to = &end_time;
+		poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
+			NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
 	}
 
-	ret = do_sys_poll(ufds, nfds, &timeout_jiffies);
+	ret = do_sys_poll(ufds, nfds, to);
+
 	if (ret == -EINTR) {
 		struct restart_block *restart_block;
+
 		restart_block = &current_thread_info()->restart_block;
 		restart_block->fn = do_restart_poll;
-		restart_block->arg0 = (unsigned long)ufds;
-		restart_block->arg1 = nfds;
-		restart_block->arg2 = timeout_jiffies & 0xFFFFFFFF;
-		restart_block->arg3 = (u64)timeout_jiffies >> 32;
+		restart_block->poll.ufds = ufds;
+		restart_block->poll.nfds = nfds;
+
+		if (timeout_msecs >= 0) {
+			restart_block->poll.tv_sec = end_time.tv_sec;
+			restart_block->poll.tv_nsec = end_time.tv_nsec;
+			restart_block->poll.has_timeout = 1;
+		} else
+			restart_block->poll.has_timeout = 0;
+
 		ret = -ERESTART_RESTARTBLOCK;
 	}
 	return ret;
@@ -841,21 +783,16 @@ asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds,
 	size_t sigsetsize)
 {
 	sigset_t ksigmask, sigsaved;
-	struct timespec ts;
-	s64 timeout = -1;
+	struct timespec ts, end_time, *to = NULL;
 	int ret;
 
 	if (tsp) {
 		if (copy_from_user(&ts, tsp, sizeof(ts)))
 			return -EFAULT;
 
-		/* Cast to u64 to make GCC stop complaining */
-		if ((u64)ts.tv_sec >= (u64)MAX_INT64_SECONDS)
-			timeout = -1;	/* infinite */
-		else {
-			timeout = DIV_ROUND_UP(ts.tv_nsec, NSEC_PER_SEC/HZ);
-			timeout += ts.tv_sec * HZ;
-		}
+		to = &end_time;
+		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+			return -EINVAL;
 	}
 
 	if (sigmask) {
@@ -869,7 +806,7 @@ asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds,
 		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
 	}
 
-	ret = do_sys_poll(ufds, nfds, &timeout);
+	ret = do_sys_poll(ufds, nfds, to);
 
 	/* We can restart this syscall, usually */
 	if (ret == -EINTR) {
@@ -887,31 +824,7 @@ asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds,
 	} else if (sigmask)
 		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 
-	if (tsp && timeout >= 0) {
-		struct timespec rts;
-
-		if (current->personality & STICKY_TIMEOUTS)
-			goto sticky;
-		/* Yes, we know it's actually an s64, but it's also positive. */
-		rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) *
-						1000;
-		rts.tv_sec = timeout;
-		if (timespec_compare(&rts, &ts) >= 0)
-			rts = ts;
-		if (copy_to_user(tsp, &rts, sizeof(rts))) {
-		sticky:
-			/*
-			 * If an application puts its timeval in read-only
-			 * memory, we don't want the Linux-specific update to
-			 * the timeval to cause a fault after the select has
-			 * completed successfully. However, because we're not
-			 * updating the timeval, we can't restart the system
-			 * call.
-			 */
-			if (ret == -ERESTARTNOHAND && timeout >= 0)
-				ret = -EINTR;
-		}
-	}
+	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
 
 	return ret;
 }
diff --git a/include/linux/poll.h b/include/linux/poll.h
index f65de5128a9..badd98ab06f 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -114,11 +114,11 @@ void zero_fd_set(unsigned long nr, unsigned long *fdset)
 
 #define MAX_INT64_SECONDS (((s64)(~((u64)0)>>1)/HZ)-1)
 
-extern int do_select(int n, fd_set_bits *fds, s64 *timeout);
+extern int do_select(int n, fd_set_bits *fds, struct timespec *end_time);
 extern int do_sys_poll(struct pollfd __user * ufds, unsigned int nfds,
-		       s64 *timeout);
+		       struct timespec *end_time);
 extern int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
-			   fd_set __user *exp, s64 *timeout);
+			   fd_set __user *exp, struct timespec *end_time);
 
 extern int poll_select_set_timeout(struct timespec *to, long sec, long nsec);
 
-- 
cgit v1.2.3


From 63ca243b271f5b44e0b1057003cf498b6d0fadf7 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Mon, 1 Sep 2008 14:35:02 -0700
Subject: hrtimer: add abstraction functions for accessing the "expires" member

In order to be able to turn hrtimers into range based, we need to provide
accessor functions for getting to the "expires" ktime_t member of the
struct hrtimer.

This patch adds a set of accessors for this purpose:
* hrtimer_set_expires
* hrtimer_set_expires_tv64
* hrtimer_add_expires
* hrtimer_add_expires_ns
* hrtimer_get_expires
* hrtimer_get_expires_tv64
* hrtimer_get_expires_ns
* hrtimer_expires_remaining
* hrtimer_start_expires

No users of these new accessors are added yet; these follow in later patches.
Hopefully this patch can even go into 2.6.27-rc so that the conversions will
not have a bottleneck in -next

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 include/linux/hrtimer.h | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index becd17db1a1..9900e998ea8 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -217,6 +217,45 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer)
 	return timer->base->cpu_base->hres_active;
 }
 
+static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
+{
+	timer->expires = time;
+}
+static inline void hrtimer_set_expires_tv64(struct hrtimer *timer, s64 tv64)
+{
+	timer->expires.tv64 = tv64;
+}
+
+static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time)
+{
+	timer->expires = ktime_add_safe(timer->expires, time);
+}
+
+static inline void hrtimer_add_expires_ns(struct hrtimer *timer, unsigned long ns)
+{
+	timer->expires = ktime_add_ns(timer->expires, ns);
+}
+
+static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer)
+{
+	return timer->expires;
+}
+
+static inline s64 hrtimer_get_expires_tv64(const struct hrtimer *timer)
+{
+	return timer->expires.tv64;
+}
+
+static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer)
+{
+	return ktime_to_ns(timer->expires);
+}
+
+static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer)
+{
+    return ktime_sub(timer->expires, timer->base->get_time());
+}
+
 /*
  * The resolution of the clocks. The resolution value is returned in
  * the clock_getres() system call to give application programmers an
@@ -287,6 +326,12 @@ extern int hrtimer_start(struct hrtimer *timer, ktime_t tim,
 extern int hrtimer_cancel(struct hrtimer *timer);
 extern int hrtimer_try_to_cancel(struct hrtimer *timer);
 
+static inline int hrtimer_start_expires(struct hrtimer *timer,
+						enum hrtimer_mode mode)
+{
+	return hrtimer_start(timer, hrtimer_get_expires(timer), mode);
+}
+
 static inline int hrtimer_restart(struct hrtimer *timer)
 {
 	return hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
-- 
cgit v1.2.3


From beb20d52d03a51218827fb4a36a4b583debb03f9 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Mon, 1 Sep 2008 14:55:57 -0700
Subject: hrtimer: convert kvm to the new hrtimer apis

In order to be able to do range hrtimers we need to use accessor functions
to the "expire" member of the hrtimer struct.
This patch converts KVM to these accessors.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 arch/x86/kvm/i8254.c | 6 +++---
 arch/x86/kvm/lapic.c | 6 ++----
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index c0f7872a912..1bf8f57a304 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -205,8 +205,8 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps)
 		wake_up_interruptible(&vcpu0->wq);
 	}
 
-	pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
-	pt->scheduled = ktime_to_ns(pt->timer.expires);
+	hrtimer_add_expires_ns(&pt->timer, pt->period);
+	pt->scheduled = ktime_to_ns(hrtimer_get_expires(&pt->timer));
 
 	return (pt->period == 0 ? 0 : 1);
 }
@@ -246,7 +246,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
 
 	timer = &pit->pit_state.pit_timer.timer;
 	if (hrtimer_cancel(timer))
-		hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
+		hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
 }
 
 static void destroy_pit_timer(struct kvm_kpit_timer *pt)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 73f43de69f6..a5b61de6adf 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -953,9 +953,7 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
 	}
 	if (apic_lvtt_period(apic)) {
 		result = 1;
-		apic->timer.dev.expires = ktime_add_ns(
-					apic->timer.dev.expires,
-					apic->timer.period);
+		hrtimer_add_expires_ns(&apic->timer.dev, apic->timer.period);
 	}
 	return result;
 }
@@ -1124,7 +1122,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
 
 	timer = &apic->timer.dev;
 	if (hrtimer_cancel(timer))
-		hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
+		hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
 }
 
 void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
-- 
cgit v1.2.3


From 76369470b7e5f97fc1a8af83c45b9ff739b08cb6 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Mon, 1 Sep 2008 15:00:14 -0700
Subject: hrtimer: convert timerfd to the new hrtimer apis

In order to be able to do range hrtimers we need to use accessor functions
to the "expire" member of the hrtimer struct.
This patch converts timerfd to these accessors.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 fs/timerfd.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/fs/timerfd.c b/fs/timerfd.c
index c502c60e4f5..0862f0e49d0 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -52,11 +52,9 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
 
 static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx)
 {
-	ktime_t now, remaining;
-
-	now = ctx->tmr.base->get_time();
-	remaining = ktime_sub(ctx->tmr.expires, now);
+	ktime_t remaining;
 
+	remaining = hrtimer_expires_remaining(&ctx->tmr);
 	return remaining.tv64 < 0 ? ktime_set(0, 0): remaining;
 }
 
@@ -74,7 +72,7 @@ static void timerfd_setup(struct timerfd_ctx *ctx, int flags,
 	ctx->ticks = 0;
 	ctx->tintv = timespec_to_ktime(ktmr->it_interval);
 	hrtimer_init(&ctx->tmr, ctx->clockid, htmode);
-	ctx->tmr.expires = texp;
+	hrtimer_set_expires(&ctx->tmr, texp);
 	ctx->tmr.function = timerfd_tmrproc;
 	if (texp.tv64 != 0)
 		hrtimer_start(&ctx->tmr, texp, htmode);
-- 
cgit v1.2.3


From 23dd7bb09bd8d7efd8a602aed97b93d52f85e675 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Mon, 1 Sep 2008 15:00:54 -0700
Subject: hrtimer: convert net::sched_cbq to the new hrtimer apis

In order to be able to do range hrtimers we need to use accessor functions
to the "expire" member of the hrtimer struct.
This patch converts sched_cbq to these accessors.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 net/sched/sch_cbq.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 8b06fa90048..03e389e8d94 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -545,9 +545,10 @@ static void cbq_ovl_delay(struct cbq_class *cl)
 			expires = ktime_set(0, 0);
 			expires = ktime_add_ns(expires, PSCHED_US2NS(sched));
 			if (hrtimer_try_to_cancel(&q->delay_timer) &&
-			    ktime_to_ns(ktime_sub(q->delay_timer.expires,
-						  expires)) > 0)
-				q->delay_timer.expires = expires;
+			    ktime_to_ns(ktime_sub(
+					hrtimer_get_expires(&q->delay_timer),
+					expires)) > 0)
+				hrtimer_set_expires(&q->delay_timer, expires);
 			hrtimer_restart(&q->delay_timer);
 			cl->delayed = 1;
 			cl->xstats.overactions++;
-- 
cgit v1.2.3


From cc584b213f252bf698849cf4be2377cd3ec7501a Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Mon, 1 Sep 2008 15:02:30 -0700
Subject: hrtimer: convert kernel/* to the new hrtimer apis

In order to be able to do range hrtimers we need to use accessor functions
to the "expire" member of the hrtimer struct.
This patch converts kernel/* to these accessors.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 kernel/futex.c           |  7 +++----
 kernel/hrtimer.c         | 44 +++++++++++++++++++++++---------------------
 kernel/posix-timers.c    | 10 ++++------
 kernel/rtmutex.c         |  3 +--
 kernel/sched.c           |  7 +++----
 kernel/time/ntp.c        |  3 +--
 kernel/time/tick-sched.c | 21 ++++++++++-----------
 kernel/time/timer_list.c |  4 ++--
 8 files changed, 47 insertions(+), 52 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index 7d1136e97c1..4cd5b4319b0 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1299,10 +1299,9 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
 			hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC,
 						HRTIMER_MODE_ABS);
 			hrtimer_init_sleeper(&t, current);
-			t.timer.expires = *abs_time;
+			hrtimer_set_expires(&t.timer, *abs_time);
 
-			hrtimer_start(&t.timer, t.timer.expires,
-						HRTIMER_MODE_ABS);
+			hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
 			if (!hrtimer_active(&t.timer))
 				t.task = NULL;
 
@@ -1404,7 +1403,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 		hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
 				      HRTIMER_MODE_ABS);
 		hrtimer_init_sleeper(to, current);
-		to->timer.expires = *time;
+		hrtimer_set_expires(&to->timer, *time);
 	}
 
 	q.pi_state = NULL;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 782137dc755..ae307feec74 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -517,7 +517,7 @@ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base)
 		if (!base->first)
 			continue;
 		timer = rb_entry(base->first, struct hrtimer, node);
-		expires = ktime_sub(timer->expires, base->offset);
+		expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
 		if (expires.tv64 < cpu_base->expires_next.tv64)
 			cpu_base->expires_next = expires;
 	}
@@ -539,10 +539,10 @@ static int hrtimer_reprogram(struct hrtimer *timer,
 			     struct hrtimer_clock_base *base)
 {
 	ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next;
-	ktime_t expires = ktime_sub(timer->expires, base->offset);
+	ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
 	int res;
 
-	WARN_ON_ONCE(timer->expires.tv64 < 0);
+	WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
 
 	/*
 	 * When the callback is running, we do not reprogram the clock event
@@ -794,7 +794,7 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
 	u64 orun = 1;
 	ktime_t delta;
 
-	delta = ktime_sub(now, timer->expires);
+	delta = ktime_sub(now, hrtimer_get_expires(timer));
 
 	if (delta.tv64 < 0)
 		return 0;
@@ -806,8 +806,8 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
 		s64 incr = ktime_to_ns(interval);
 
 		orun = ktime_divns(delta, incr);
-		timer->expires = ktime_add_ns(timer->expires, incr * orun);
-		if (timer->expires.tv64 > now.tv64)
+		hrtimer_add_expires_ns(timer, incr * orun);
+		if (hrtimer_get_expires_tv64(timer) > now.tv64)
 			return orun;
 		/*
 		 * This (and the ktime_add() below) is the
@@ -815,7 +815,7 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
 		 */
 		orun++;
 	}
-	timer->expires = ktime_add_safe(timer->expires, interval);
+	hrtimer_add_expires(timer, interval);
 
 	return orun;
 }
@@ -847,7 +847,8 @@ static void enqueue_hrtimer(struct hrtimer *timer,
 		 * We dont care about collisions. Nodes with
 		 * the same expiry time stay together.
 		 */
-		if (timer->expires.tv64 < entry->expires.tv64) {
+		if (hrtimer_get_expires_tv64(timer) <
+				hrtimer_get_expires_tv64(entry)) {
 			link = &(*link)->rb_left;
 		} else {
 			link = &(*link)->rb_right;
@@ -982,7 +983,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
 #endif
 	}
 
-	timer->expires = tim;
+	hrtimer_set_expires(timer, tim);
 
 	timer_stats_hrtimer_set_start_info(timer);
 
@@ -1076,7 +1077,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
 	ktime_t rem;
 
 	base = lock_hrtimer_base(timer, &flags);
-	rem = ktime_sub(timer->expires, base->get_time());
+	rem = hrtimer_expires_remaining(timer);
 	unlock_hrtimer_base(timer, &flags);
 
 	return rem;
@@ -1108,7 +1109,7 @@ ktime_t hrtimer_get_next_event(void)
 				continue;
 
 			timer = rb_entry(base->first, struct hrtimer, node);
-			delta.tv64 = timer->expires.tv64;
+			delta.tv64 = hrtimer_get_expires_tv64(timer);
 			delta = ktime_sub(delta, base->get_time());
 			if (delta.tv64 < mindelta.tv64)
 				mindelta.tv64 = delta.tv64;
@@ -1308,10 +1309,10 @@ void hrtimer_interrupt(struct clock_event_device *dev)
 
 			timer = rb_entry(node, struct hrtimer, node);
 
-			if (basenow.tv64 < timer->expires.tv64) {
+			if (basenow.tv64 < hrtimer_get_expires_tv64(timer)) {
 				ktime_t expires;
 
-				expires = ktime_sub(timer->expires,
+				expires = ktime_sub(hrtimer_get_expires(timer),
 						    base->offset);
 				if (expires.tv64 < expires_next.tv64)
 					expires_next = expires;
@@ -1414,7 +1415,8 @@ void hrtimer_run_queues(void)
 			struct hrtimer *timer;
 
 			timer = rb_entry(node, struct hrtimer, node);
-			if (base->softirq_time.tv64 <= timer->expires.tv64)
+			if (base->softirq_time.tv64 <=
+					hrtimer_get_expires_tv64(timer))
 				break;
 
 			if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
@@ -1462,7 +1464,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
 
 	do {
 		set_current_state(TASK_INTERRUPTIBLE);
-		hrtimer_start(&t->timer, t->timer.expires, mode);
+		hrtimer_start_expires(&t->timer, mode);
 		if (!hrtimer_active(&t->timer))
 			t->task = NULL;
 
@@ -1484,7 +1486,7 @@ static int update_rmtp(struct hrtimer *timer, struct timespec __user *rmtp)
 	struct timespec rmt;
 	ktime_t rem;
 
-	rem = ktime_sub(timer->expires, timer->base->get_time());
+	rem = hrtimer_expires_remaining(timer);
 	if (rem.tv64 <= 0)
 		return 0;
 	rmt = ktime_to_timespec(rem);
@@ -1503,7 +1505,7 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
 
 	hrtimer_init_on_stack(&t.timer, restart->nanosleep.index,
 				HRTIMER_MODE_ABS);
-	t.timer.expires.tv64 = restart->nanosleep.expires;
+	hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
 
 	if (do_nanosleep(&t, HRTIMER_MODE_ABS))
 		goto out;
@@ -1530,7 +1532,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
 	int ret = 0;
 
 	hrtimer_init_on_stack(&t.timer, clockid, mode);
-	t.timer.expires = timespec_to_ktime(*rqtp);
+	hrtimer_set_expires(&t.timer, timespec_to_ktime(*rqtp));
 	if (do_nanosleep(&t, mode))
 		goto out;
 
@@ -1550,7 +1552,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
 	restart->fn = hrtimer_nanosleep_restart;
 	restart->nanosleep.index = t.timer.base->index;
 	restart->nanosleep.rmtp = rmtp;
-	restart->nanosleep.expires = t.timer.expires.tv64;
+	restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
 
 	ret = -ERESTART_RESTARTBLOCK;
 out:
@@ -1724,11 +1726,11 @@ int __sched schedule_hrtimeout(ktime_t *expires,
 	}
 
 	hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, mode);
-	t.timer.expires = *expires;
+	hrtimer_set_expires(&t.timer, *expires);
 
 	hrtimer_init_sleeper(&t, current);
 
-	hrtimer_start(&t.timer, t.timer.expires, mode);
+	hrtimer_start_expires(&t.timer, mode);
 	if (!hrtimer_active(&t.timer))
 		t.task = NULL;
 
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index e36d5798cbf..f85efcdcab2 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -668,7 +668,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
 	    (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
 		timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv);
 
-	remaining = ktime_sub(timer->expires, now);
+	remaining = ktime_sub(hrtimer_get_expires(timer), now);
 	/* Return 0 only, when the timer is expired and not pending */
 	if (remaining.tv64 <= 0) {
 		/*
@@ -762,7 +762,7 @@ common_timer_set(struct k_itimer *timr, int flags,
 	hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
 	timr->it.real.timer.function = posix_timer_fn;
 
-	timer->expires = timespec_to_ktime(new_setting->it_value);
+	hrtimer_set_expires(timer, timespec_to_ktime(new_setting->it_value));
 
 	/* Convert interval */
 	timr->it.real.interval = timespec_to_ktime(new_setting->it_interval);
@@ -771,14 +771,12 @@ common_timer_set(struct k_itimer *timr, int flags,
 	if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) {
 		/* Setup correct expiry time for relative timers */
 		if (mode == HRTIMER_MODE_REL) {
-			timer->expires =
-				ktime_add_safe(timer->expires,
-					       timer->base->get_time());
+			hrtimer_add_expires(timer, timer->base->get_time());
 		}
 		return 0;
 	}
 
-	hrtimer_start(timer, timer->expires, mode);
+	hrtimer_start_expires(timer, mode);
 	return 0;
 }
 
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 6522ae5b14a..69d9cb921ff 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -631,8 +631,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
 
 	/* Setup the timer, when timeout != NULL */
 	if (unlikely(timeout)) {
-		hrtimer_start(&timeout->timer, timeout->timer.expires,
-			      HRTIMER_MODE_ABS);
+		hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
 		if (!hrtimer_active(&timeout->timer))
 			timeout->task = NULL;
 	}
diff --git a/kernel/sched.c b/kernel/sched.c
index 1a5f73c1fcd..e46b5afa200 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -221,9 +221,8 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 
 		now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
 		hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
-		hrtimer_start(&rt_b->rt_period_timer,
-			      rt_b->rt_period_timer.expires,
-			      HRTIMER_MODE_ABS);
+		hrtimer_start_expires(&rt_b->rt_period_timer,
+				HRTIMER_MODE_ABS);
 	}
 	spin_unlock(&rt_b->rt_runtime_lock);
 }
@@ -1058,7 +1057,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
 	struct hrtimer *timer = &rq->hrtick_timer;
 	ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
 
-	timer->expires = time;
+	hrtimer_set_expires(timer, time);
 
 	if (rq == this_rq()) {
 		hrtimer_restart(timer);
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 5125ddd8196..4c8d85421d2 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -142,8 +142,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
 		time_state = TIME_OOP;
 		printk(KERN_NOTICE "Clock: "
 		       "inserting leap second 23:59:60 UTC\n");
-		leap_timer.expires = ktime_add_ns(leap_timer.expires,
-						  NSEC_PER_SEC);
+		hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC);
 		res = HRTIMER_RESTART;
 		break;
 	case TIME_DEL:
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index a87b0468568..b33be61c0f6 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -288,7 +288,7 @@ void tick_nohz_stop_sched_tick(int inidle)
 				goto out;
 			}
 
-			ts->idle_tick = ts->sched_timer.expires;
+			ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
 			ts->tick_stopped = 1;
 			ts->idle_jiffies = last_jiffies;
 			rcu_enter_nohz();
@@ -419,21 +419,21 @@ void tick_nohz_restart_sched_tick(void)
 	ts->tick_stopped  = 0;
 	ts->idle_exittime = now;
 	hrtimer_cancel(&ts->sched_timer);
-	ts->sched_timer.expires = ts->idle_tick;
+	hrtimer_set_expires(&ts->sched_timer, ts->idle_tick);
 
 	while (1) {
 		/* Forward the time to expire in the future */
 		hrtimer_forward(&ts->sched_timer, now, tick_period);
 
 		if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
-			hrtimer_start(&ts->sched_timer,
-				      ts->sched_timer.expires,
+			hrtimer_start_expires(&ts->sched_timer,
 				      HRTIMER_MODE_ABS);
 			/* Check, if the timer was already in the past */
 			if (hrtimer_active(&ts->sched_timer))
 				break;
 		} else {
-			if (!tick_program_event(ts->sched_timer.expires, 0))
+			if (!tick_program_event(
+				hrtimer_get_expires(&ts->sched_timer), 0))
 				break;
 		}
 		/* Update jiffies and reread time */
@@ -446,7 +446,7 @@ void tick_nohz_restart_sched_tick(void)
 static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
 {
 	hrtimer_forward(&ts->sched_timer, now, tick_period);
-	return tick_program_event(ts->sched_timer.expires, 0);
+	return tick_program_event(hrtimer_get_expires(&ts->sched_timer), 0);
 }
 
 /*
@@ -529,7 +529,7 @@ static void tick_nohz_switch_to_nohz(void)
 	next = tick_init_jiffy_update();
 
 	for (;;) {
-		ts->sched_timer.expires = next;
+		hrtimer_set_expires(&ts->sched_timer, next);
 		if (!tick_program_event(next, 0))
 			break;
 		next = ktime_add(next, tick_period);
@@ -625,16 +625,15 @@ void tick_setup_sched_timer(void)
 	ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
 
 	/* Get the next period (per cpu) */
-	ts->sched_timer.expires = tick_init_jiffy_update();
+	hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
 	offset = ktime_to_ns(tick_period) >> 1;
 	do_div(offset, num_possible_cpus());
 	offset *= smp_processor_id();
-	ts->sched_timer.expires = ktime_add_ns(ts->sched_timer.expires, offset);
+	hrtimer_add_expires_ns(&ts->sched_timer, offset);
 
 	for (;;) {
 		hrtimer_forward(&ts->sched_timer, now, tick_period);
-		hrtimer_start(&ts->sched_timer, ts->sched_timer.expires,
-			      HRTIMER_MODE_ABS);
+		hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS);
 		/* Check, if the timer was already in the past */
 		if (hrtimer_active(&ts->sched_timer))
 			break;
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index a40e20fd000..5224a3215fb 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -66,8 +66,8 @@ print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now)
 #endif
 	SEQ_printf(m, "\n");
 	SEQ_printf(m, " # expires at %Lu nsecs [in %Ld nsecs]\n",
-		(unsigned long long)ktime_to_ns(timer->expires),
-		(long long)(ktime_to_ns(timer->expires) - now));
+		(unsigned long long)ktime_to_ns(hrtimer_get_expires(timer)),
+		(long long)(ktime_to_ns(hrtimer_get_expires(timer)) - now));
 }
 
 static void
-- 
cgit v1.2.3


From 23446d1dc3d4f42a2b0fb82d4a098f9179ba486d Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Mon, 1 Sep 2008 15:18:10 -0700
Subject: hrtimer: convert powerpc/oprofile to the new hrtimer apis

In order to be able to do range hrtimers we need to use accessor functions
to the "expire" member of the hrtimer struct.
This patch converts powerpc/oprofile to these accessors.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 arch/powerpc/oprofile/cell/spu_profiler.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/oprofile/cell/spu_profiler.c b/arch/powerpc/oprofile/cell/spu_profiler.c
index 380d7e21753..02ffe060db5 100644
--- a/arch/powerpc/oprofile/cell/spu_profiler.c
+++ b/arch/powerpc/oprofile/cell/spu_profiler.c
@@ -196,7 +196,7 @@ int start_spu_profiling(unsigned int cycles_reset)
 	pr_debug("timer resolution: %lu\n", TICK_NSEC);
 	kt = ktime_set(0, profiling_interval);
 	hrtimer_init(&timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-	timer.expires = kt;
+	hrtimer_set_expires(&timer, kt);
 	timer.function = profile_spus;
 
 	/* Allocate arrays for collecting SPU PC samples */
-- 
cgit v1.2.3


From 18dd36af0010dd70c8634cdca0f99b47b5036c60 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Mon, 1 Sep 2008 15:19:11 -0700
Subject: hrtimer: convert kvm-ia64 to the new hrtimer apis

In order to be able to do range hrtimers we need to use accessor functions
to the "expire" member of the hrtimer struct.
This patch converts KVM-ia64 to these accessors.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 arch/ia64/kvm/kvm-ia64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 7a37d06376b..cf8eae1855e 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1112,7 +1112,7 @@ static void kvm_migrate_hlt_timer(struct kvm_vcpu *vcpu)
 	struct hrtimer *p_ht = &vcpu->arch.hlt_timer;
 
 	if (hrtimer_cancel(p_ht))
-		hrtimer_start(p_ht, p_ht->expires, HRTIMER_MODE_ABS);
+		hrtimer_start_expires(p_ht, HRTIMER_MODE_ABS);
 }
 
 static enum hrtimer_restart hlt_timer_fn(struct hrtimer *data)
-- 
cgit v1.2.3


From 6c644eaeb2e000a08f0e20653b0835bb90a93e4a Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Mon, 1 Sep 2008 15:20:30 -0700
Subject: hrtimer: convert s390 to the new hrtimer apis

In order to be able to do range hrtimers we need to use accessor functions
to the "expire" member of the hrtimer struct.
This patch converts s390 to these accessors.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 drivers/s390/crypto/ap_bus.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c
index 62b6b55230d..6f02f1e674d 100644
--- a/drivers/s390/crypto/ap_bus.c
+++ b/drivers/s390/crypto/ap_bus.c
@@ -659,9 +659,9 @@ static ssize_t poll_timeout_store(struct bus_type *bus, const char *buf,
 	hr_time = ktime_set(0, poll_timeout);
 
 	if (!hrtimer_is_queued(&ap_poll_timer) ||
-	    !hrtimer_forward(&ap_poll_timer, ap_poll_timer.expires, hr_time)) {
-		ap_poll_timer.expires = hr_time;
-		hrtimer_start(&ap_poll_timer, hr_time, HRTIMER_MODE_ABS);
+	    !hrtimer_forward(&ap_poll_timer, hrtimer_get_expires(&ap_poll_timer), hr_time)) {
+		hrtimer_set_expires(&ap_poll_timer, hr_time);
+		hrtimer_start_expires(&ap_poll_timer, HRTIMER_MODE_ABS);
 	}
 	return count;
 }
-- 
cgit v1.2.3


From 5c73a7d0411999e3cb3c6d64225450813738ae25 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Mon, 1 Sep 2008 15:25:20 -0700
Subject: hrtimer: convert sound/ to the new hrtimer apis

In order to be able to do range hrtimers we need to use accessor functions
to the "expire" member of the hrtimer struct.
This patch converts sound/ to these accessors.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 sound/drivers/pcsp/pcsp_lib.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/sound/drivers/pcsp/pcsp_lib.c b/sound/drivers/pcsp/pcsp_lib.c
index e341f3f83b6..1f42e406311 100644
--- a/sound/drivers/pcsp/pcsp_lib.c
+++ b/sound/drivers/pcsp/pcsp_lib.c
@@ -34,7 +34,7 @@ enum hrtimer_restart pcsp_do_timer(struct hrtimer *handle)
 		chip->thalf = 0;
 		if (!atomic_read(&chip->timer_active))
 			return HRTIMER_NORESTART;
-		hrtimer_forward(&chip->timer, chip->timer.expires,
+		hrtimer_forward(&chip->timer, hrtimer_get_expires(&chip->timer),
 				ktime_set(0, chip->ns_rem));
 		return HRTIMER_RESTART;
 	}
@@ -118,7 +118,8 @@ enum hrtimer_restart pcsp_do_timer(struct hrtimer *handle)
 	chip->ns_rem = PCSP_PERIOD_NS();
 	ns = (chip->thalf ? PCSP_CALC_NS(timer_cnt) : chip->ns_rem);
 	chip->ns_rem -= ns;
-	hrtimer_forward(&chip->timer, chip->timer.expires, ktime_set(0, ns));
+	hrtimer_forward(&chip->timer, hrtimer_get_expires(&chip->timer),
+							ktime_set(0, ns));
 	return HRTIMER_RESTART;
 
 exit_nr_unlock2:
-- 
cgit v1.2.3


From 799b64de256ea68fbb5db63bb55f61c305870643 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Mon, 1 Sep 2008 15:27:58 -0700
Subject: hrtimer: rename the "expires" struct member to avoid accidental usage

To catch code that still touches the "expires" memory directly, rename it
to have the compiler complain rather than get nasty, hard to explain,
runtime behavior

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 include/linux/hrtimer.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 9900e998ea8..485a634fd6e 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -111,7 +111,7 @@ enum hrtimer_cb_mode {
  */
 struct hrtimer {
 	struct rb_node			node;
-	ktime_t				expires;
+	ktime_t				_expires;
 	enum hrtimer_restart		(*function)(struct hrtimer *);
 	struct hrtimer_clock_base	*base;
 	unsigned long			state;
@@ -219,41 +219,41 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer)
 
 static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
 {
-	timer->expires = time;
+	timer->_expires = time;
 }
 static inline void hrtimer_set_expires_tv64(struct hrtimer *timer, s64 tv64)
 {
-	timer->expires.tv64 = tv64;
+	timer->_expires.tv64 = tv64;
 }
 
 static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time)
 {
-	timer->expires = ktime_add_safe(timer->expires, time);
+	timer->_expires = ktime_add_safe(timer->_expires, time);
 }
 
 static inline void hrtimer_add_expires_ns(struct hrtimer *timer, unsigned long ns)
 {
-	timer->expires = ktime_add_ns(timer->expires, ns);
+	timer->_expires = ktime_add_ns(timer->_expires, ns);
 }
 
 static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer)
 {
-	return timer->expires;
+	return timer->_expires;
 }
 
 static inline s64 hrtimer_get_expires_tv64(const struct hrtimer *timer)
 {
-	return timer->expires.tv64;
+	return timer->_expires.tv64;
 }
 
 static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer)
 {
-	return ktime_to_ns(timer->expires);
+	return ktime_to_ns(timer->_expires);
 }
 
 static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer)
 {
-    return ktime_sub(timer->expires, timer->base->get_time());
+    return ktime_sub(timer->_expires, timer->base->get_time());
 }
 
 /*
@@ -334,7 +334,7 @@ static inline int hrtimer_start_expires(struct hrtimer *timer,
 
 static inline int hrtimer_restart(struct hrtimer *timer)
 {
-	return hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
+	return hrtimer_start(timer, timer->_expires, HRTIMER_MODE_ABS);
 }
 
 /* Query timers: */
-- 
cgit v1.2.3


From 654c8e0b1c623b156c5b92f28d914ab38c9c2c90 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Mon, 1 Sep 2008 15:47:08 -0700
Subject: hrtimer: turn hrtimers into range timers

this patch turns hrtimers into range timers; they have 2 expire points
1) the soft expire point
2) the hard expire point

the kernel will do it's regular best effort attempt to get the timer run
at the hard expire point. However, if some other time fires after the soft
expire point, the kernel now has the freedom to fire this timer at this point,
and thus grouping the events and preventing a power-expensive wakeup in the
future.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 include/linux/hrtimer.h | 31 ++++++++++++++++++++++++++-
 kernel/hrtimer.c        | 56 +++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 82 insertions(+), 5 deletions(-)

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 485a634fd6e..28259c33667 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -112,6 +112,7 @@ enum hrtimer_cb_mode {
 struct hrtimer {
 	struct rb_node			node;
 	ktime_t				_expires;
+	ktime_t				_softexpires;
 	enum hrtimer_restart		(*function)(struct hrtimer *);
 	struct hrtimer_clock_base	*base;
 	unsigned long			state;
@@ -220,20 +221,37 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer)
 static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
 {
 	timer->_expires = time;
+	timer->_softexpires = time;
 }
+
+static inline void hrtimer_set_expires_range(struct hrtimer *timer, ktime_t time, ktime_t delta)
+{
+	timer->_softexpires = time;
+	timer->_expires = ktime_add_safe(time, delta);
+}
+
+static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, unsigned long delta)
+{
+	timer->_softexpires = time;
+	timer->_expires = ktime_add_safe(time, ns_to_ktime(delta));
+}
+
 static inline void hrtimer_set_expires_tv64(struct hrtimer *timer, s64 tv64)
 {
 	timer->_expires.tv64 = tv64;
+	timer->_softexpires.tv64 = tv64;
 }
 
 static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time)
 {
 	timer->_expires = ktime_add_safe(timer->_expires, time);
+	timer->_softexpires = ktime_add_safe(timer->_softexpires, time);
 }
 
 static inline void hrtimer_add_expires_ns(struct hrtimer *timer, unsigned long ns)
 {
 	timer->_expires = ktime_add_ns(timer->_expires, ns);
+	timer->_softexpires = ktime_add_ns(timer->_softexpires, ns);
 }
 
 static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer)
@@ -241,10 +259,19 @@ static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer)
 	return timer->_expires;
 }
 
+static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer)
+{
+	return timer->_softexpires;
+}
+
 static inline s64 hrtimer_get_expires_tv64(const struct hrtimer *timer)
 {
 	return timer->_expires.tv64;
 }
+static inline s64 hrtimer_get_softexpires_tv64(const struct hrtimer *timer)
+{
+	return timer->_softexpires.tv64;
+}
 
 static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer)
 {
@@ -334,7 +361,7 @@ static inline int hrtimer_start_expires(struct hrtimer *timer,
 
 static inline int hrtimer_restart(struct hrtimer *timer)
 {
-	return hrtimer_start(timer, timer->_expires, HRTIMER_MODE_ABS);
+	return hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
 }
 
 /* Query timers: */
@@ -391,6 +418,8 @@ extern long hrtimer_nanosleep_restart(struct restart_block *restart_block);
 extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
 				 struct task_struct *tsk);
 
+extern int schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
+						const enum hrtimer_mode mode);
 extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode);
 
 /* Soft interrupt function to run the hrtimer queues: */
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index ae307feec74..01483004183 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1309,7 +1309,20 @@ void hrtimer_interrupt(struct clock_event_device *dev)
 
 			timer = rb_entry(node, struct hrtimer, node);
 
-			if (basenow.tv64 < hrtimer_get_expires_tv64(timer)) {
+			/*
+			 * The immediate goal for using the softexpires is
+			 * minimizing wakeups, not running timers at the
+			 * earliest interrupt after their soft expiration.
+			 * This allows us to avoid using a Priority Search
+			 * Tree, which can answer a stabbing querry for
+			 * overlapping intervals and instead use the simple
+			 * BST we already have.
+			 * We don't add extra wakeups by delaying timers that
+			 * are right-of a not yet expired timer, because that
+			 * timer will have to trigger a wakeup anyway.
+			 */
+
+			if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) {
 				ktime_t expires;
 
 				expires = ktime_sub(hrtimer_get_expires(timer),
@@ -1681,14 +1694,20 @@ void __init hrtimers_init(void)
 }
 
 /**
- * schedule_hrtimeout - sleep until timeout
+ * schedule_hrtimeout_range - sleep until timeout
  * @expires:	timeout value (ktime_t)
+ * @delta:	slack in expires timeout (ktime_t)
  * @mode:	timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
  *
  * Make the current task sleep until the given expiry time has
  * elapsed. The routine will return immediately unless
  * the current task state has been set (see set_current_state()).
  *
+ * The @delta argument gives the kernel the freedom to schedule the
+ * actual wakeup to a time that is both power and performance friendly.
+ * The kernel give the normal best effort behavior for "@expires+@delta",
+ * but may decide to fire the timer earlier, but no earlier than @expires.
+ *
  * You can set the task state as follows -
  *
  * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
@@ -1702,7 +1721,7 @@ void __init hrtimers_init(void)
  *
  * Returns 0 when the timer has expired otherwise -EINTR
  */
-int __sched schedule_hrtimeout(ktime_t *expires,
+int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
 			       const enum hrtimer_mode mode)
 {
 	struct hrtimer_sleeper t;
@@ -1726,7 +1745,7 @@ int __sched schedule_hrtimeout(ktime_t *expires,
 	}
 
 	hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, mode);
-	hrtimer_set_expires(&t.timer, *expires);
+	hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
 
 	hrtimer_init_sleeper(&t, current);
 
@@ -1744,4 +1763,33 @@ int __sched schedule_hrtimeout(ktime_t *expires,
 
 	return !t.task ? 0 : -EINTR;
 }
+EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
+
+/**
+ * schedule_hrtimeout - sleep until timeout
+ * @expires:	timeout value (ktime_t)
+ * @mode:	timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
+ *
+ * Make the current task sleep until the given expiry time has
+ * elapsed. The routine will return immediately unless
+ * the current task state has been set (see set_current_state()).
+ *
+ * You can set the task state as follows -
+ *
+ * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
+ * pass before the routine returns.
+ *
+ * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
+ * delivered to the current task.
+ *
+ * The current task state is guaranteed to be TASK_RUNNING when this
+ * routine returns.
+ *
+ * Returns 0 when the timer has expired otherwise -EINTR
+ */
+int __sched schedule_hrtimeout(ktime_t *expires,
+			       const enum hrtimer_mode mode)
+{
+	return schedule_hrtimeout_range(expires, 0, mode);
+}
 EXPORT_SYMBOL_GPL(schedule_hrtimeout);
-- 
cgit v1.2.3


From 6976675d94042fbd446231d1bd8b7de71a980ada Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Mon, 1 Sep 2008 15:52:40 -0700
Subject: hrtimer: create a "timer_slack" field in the task struct

We want to be able to control the default "rounding" that is used by
select() and poll() and friends. This is a per process property
(so that we can have a "nice" like program to start certain programs with
a looser or stricter rounding) that can be set/get via a prctl().

For this purpose, a field called "timer_slack_ns" is added to the task
struct. In addition, a field called "default_timer_slack"ns" is added
so that tasks easily can temporarily to a more/less accurate slack and then
back to the default.

The default value of the slack is set to 50 usec; this is significantly less
than 2.6.27's average select() and poll() timing error but still allows
the kernel to group timers somewhat to preserve power behavior. Applications
and admins can override this via the prctl()

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 include/linux/init_task.h |  1 +
 include/linux/prctl.h     |  7 +++++++
 include/linux/sched.h     |  6 ++++++
 kernel/fork.c             |  2 ++
 kernel/sys.c              | 10 ++++++++++
 5 files changed, 26 insertions(+)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 021d8e720c7..23fd8909b9e 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -170,6 +170,7 @@ extern struct group_info init_groups;
 	.cpu_timers	= INIT_CPU_TIMERS(tsk.cpu_timers),		\
 	.fs_excl	= ATOMIC_INIT(0),				\
 	.pi_lock	= __SPIN_LOCK_UNLOCKED(tsk.pi_lock),		\
+	.timer_slack_ns = 50000, /* 50 usec default slack */		\
 	.pids = {							\
 		[PIDTYPE_PID]  = INIT_PID_LINK(PIDTYPE_PID),		\
 		[PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID),		\
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index 5ad79198d6f..48d887e3c6e 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -78,4 +78,11 @@
 #define PR_GET_SECUREBITS 27
 #define PR_SET_SECUREBITS 28
 
+/*
+ * Get/set the timerslack as used by poll/select/nanosleep
+ * A value of 0 means "use default"
+ */
+#define PR_SET_TIMERSLACK 29
+#define PR_GET_TIMERSLACK 30
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3d9120c5ad1..dcc03fd5a7f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1301,6 +1301,12 @@ struct task_struct {
 	int latency_record_count;
 	struct latency_record latency_record[LT_SAVECOUNT];
 #endif
+	/*
+	 * time slack values; these are used to round up poll() and
+	 * select() etc timeout values. These are in nanoseconds.
+	 */
+	unsigned long timer_slack_ns;
+	unsigned long default_timer_slack_ns;
 };
 
 /*
diff --git a/kernel/fork.c b/kernel/fork.c
index 7ce2ebe8479..4308d75f0fa 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -987,6 +987,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->prev_utime = cputime_zero;
 	p->prev_stime = cputime_zero;
 
+	p->default_timer_slack_ns = current->timer_slack_ns;
+
 #ifdef CONFIG_DETECT_SOFTLOCKUP
 	p->last_switch_count = 0;
 	p->last_switch_timestamp = 0;
diff --git a/kernel/sys.c b/kernel/sys.c
index 038a7bc0901..1b96401a057 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1727,6 +1727,16 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 		case PR_SET_TSC:
 			error = SET_TSC_CTL(arg2);
 			break;
+		case PR_GET_TIMERSLACK:
+			error = current->timer_slack_ns;
+			break;
+		case PR_SET_TIMERSLACK:
+			if (arg2 <= 0)
+				current->timer_slack_ns =
+					current->default_timer_slack_ns;
+			else
+				current->timer_slack_ns = arg2;
+			break;
 		default:
 			error = -EINVAL;
 			break;
-- 
cgit v1.2.3


From 90d6e24a3686325edea7748b966e138c9923017d Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Mon, 1 Sep 2008 15:55:35 -0700
Subject: hrtimer: make select() and poll() use the hrtimer range feature

This patch makes the select() and poll() hrtimers use the new range
feature and settings from the task struct.

In addition, this includes the estimate_accuracy() function that Linus
posted to lkml, but changed entirely based on other peoples lkml feedback.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 fs/select.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 62 insertions(+), 2 deletions(-)

diff --git a/fs/select.c b/fs/select.c
index f6dceb56793..5e61b43d076 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -28,6 +28,58 @@
 
 #include <asm/uaccess.h>
 
+
+/*
+ * Estimate expected accuracy in ns from a timeval.
+ *
+ * After quite a bit of churning around, we've settled on
+ * a simple thing of taking 0.1% of the timeout as the
+ * slack, with a cap of 100 msec.
+ * "nice" tasks get a 0.5% slack instead.
+ *
+ * Consider this comment an open invitation to come up with even
+ * better solutions..
+ */
+
+static unsigned long __estimate_accuracy(struct timespec *tv)
+{
+	unsigned long slack;
+	int divfactor = 1000;
+
+	if (task_nice(current))
+		divfactor = divfactor / 5;
+
+	slack = tv->tv_nsec / divfactor;
+	slack += tv->tv_sec * (NSEC_PER_SEC/divfactor);
+
+	if (slack > 100 * NSEC_PER_MSEC)
+		slack =  100 * NSEC_PER_MSEC;
+	return slack;
+}
+
+static unsigned long estimate_accuracy(struct timespec *tv)
+{
+	unsigned long ret;
+	struct timespec now;
+
+	/*
+	 * Realtime tasks get a slack of 0 for obvious reasons.
+	 */
+
+	if (current->policy == SCHED_FIFO ||
+		current->policy == SCHED_RR)
+		return 0;
+
+	ktime_get_ts(&now);
+	now = timespec_sub(*tv, now);
+	ret = __estimate_accuracy(&now);
+	if (ret < current->timer_slack_ns)
+		return current->timer_slack_ns;
+	return ret;
+}
+
+
+
 struct poll_table_page {
 	struct poll_table_page * next;
 	struct poll_table_entry * entry;
@@ -262,6 +314,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 	struct poll_wqueues table;
 	poll_table *wait;
 	int retval, i, timed_out = 0;
+	unsigned long slack = 0;
 
 	rcu_read_lock();
 	retval = max_select_fd(n, fds);
@@ -278,6 +331,9 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 		timed_out = 1;
 	}
 
+	if (end_time)
+		slack = estimate_accuracy(end_time);
+
 	retval = 0;
 	for (;;) {
 		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
@@ -353,7 +409,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 			to = &expire;
 		}
 
-		if (!schedule_hrtimeout(to, HRTIMER_MODE_ABS))
+		if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
 			timed_out = 1;
 	}
 	__set_current_state(TASK_RUNNING);
@@ -593,6 +649,7 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
 	poll_table* pt = &wait->pt;
 	ktime_t expire, *to = NULL;
 	int timed_out = 0, count = 0;
+	unsigned long slack = 0;
 
 	/* Optimise the no-wait case */
 	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
@@ -600,6 +657,9 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
 		timed_out = 1;
 	}
 
+	if (end_time)
+		slack = estimate_accuracy(end_time);
+
 	for (;;) {
 		struct poll_list *walk;
 
@@ -646,7 +706,7 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
 			to = &expire;
 		}
 
-		if (!schedule_hrtimeout(to, HRTIMER_MODE_ABS))
+		if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
 			timed_out = 1;
 	}
 	__set_current_state(TASK_RUNNING);
-- 
cgit v1.2.3


From 584fb4a76413ec9215741e075e0dfb69173b213f Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sat, 6 Sep 2008 08:32:57 -0700
Subject: hrtimer: fix build bug found by Ingo

in some randconfig configurations, hrtimers are used even though
the hrtimer config if off; and it broke the build due to some of
the new functions being on the wrong side of the ifdef.

This patch moves the functions to the other side of the ifdef, fixing
the build bug.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 include/linux/hrtimer.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 28259c33667..c407b33ef84 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -198,13 +198,6 @@ struct hrtimer_cpu_base {
 #endif
 };
 
-#ifdef CONFIG_HIGH_RES_TIMERS
-struct clock_event_device;
-
-extern void clock_was_set(void);
-extern void hres_timers_resume(void);
-extern void hrtimer_interrupt(struct clock_event_device *dev);
-
 /*
  * In high resolution mode the time reference must be read accurate
  */
@@ -283,6 +276,13 @@ static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer)
     return ktime_sub(timer->_expires, timer->base->get_time());
 }
 
+#ifdef CONFIG_HIGH_RES_TIMERS
+struct clock_event_device;
+
+extern void clock_was_set(void);
+extern void hres_timers_resume(void);
+extern void hrtimer_interrupt(struct clock_event_device *dev);
+
 /*
  * The resolution of the clocks. The resolution value is returned in
  * the clock_getres() system call to give application programmers an
-- 
cgit v1.2.3


From 2ec02270c00f94b08fddfb68c37510a9fb47ac7c Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sat, 6 Sep 2008 09:36:56 -0700
Subject: hrtimer: another build fix

More randconfig testing

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 include/linux/hrtimer.h | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index c407b33ef84..4c1a834b984 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -198,19 +198,6 @@ struct hrtimer_cpu_base {
 #endif
 };
 
-/*
- * In high resolution mode the time reference must be read accurate
- */
-static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer)
-{
-	return timer->base->get_time();
-}
-
-static inline int hrtimer_is_hres_active(struct hrtimer *timer)
-{
-	return timer->base->cpu_base->hres_active;
-}
-
 static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
 {
 	timer->_expires = time;
@@ -283,6 +270,19 @@ extern void clock_was_set(void);
 extern void hres_timers_resume(void);
 extern void hrtimer_interrupt(struct clock_event_device *dev);
 
+/*
+ * In high resolution mode the time reference must be read accurate
+ */
+static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer)
+{
+	return timer->base->get_time();
+}
+
+static inline int hrtimer_is_hres_active(struct hrtimer *timer)
+{
+	return timer->base->cpu_base->hres_active;
+}
+
 /*
  * The resolution of the clocks. The resolution value is returned in
  * the clock_getres() system call to give application programmers an
-- 
cgit v1.2.3


From da8f2e170ea94cc20f8ebbc8ee8d127edb8f12f1 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 7 Sep 2008 10:47:46 -0700
Subject: hrtimer: add a hrtimer_start_range() function

this patch adds a _range version of hrtimer_start() so that range timers
can be created; the hrtimer_start() function is just a wrapper around this.

In addition, hrtimer_start_expires() will now preserve existing ranges.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 include/linux/hrtimer.h |  9 ++++++++-
 kernel/hrtimer.c        | 26 +++++++++++++++++++++++---
 2 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 4c1a834b984..1c0473e8ecb 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -350,13 +350,20 @@ static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { }
 /* Basic timer operations: */
 extern int hrtimer_start(struct hrtimer *timer, ktime_t tim,
 			 const enum hrtimer_mode mode);
+extern int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+			unsigned long range_ns, const enum hrtimer_mode mode);
 extern int hrtimer_cancel(struct hrtimer *timer);
 extern int hrtimer_try_to_cancel(struct hrtimer *timer);
 
 static inline int hrtimer_start_expires(struct hrtimer *timer,
 						enum hrtimer_mode mode)
 {
-	return hrtimer_start(timer, hrtimer_get_expires(timer), mode);
+	unsigned long delta;
+	ktime_t soft, hard;
+	soft = hrtimer_get_softexpires(timer);
+	hard = hrtimer_get_expires(timer);
+	delta = ktime_to_ns(ktime_sub(hard, soft));
+	return hrtimer_start_range_ns(timer, hrtimer_get_expires(timer), delta, mode);
 }
 
 static inline int hrtimer_restart(struct hrtimer *timer)
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 01483004183..a0222097c57 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -945,9 +945,10 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
 }
 
 /**
- * hrtimer_start - (re)start an relative timer on the current CPU
+ * hrtimer_start_range_ns - (re)start an relative timer on the current CPU
  * @timer:	the timer to be added
  * @tim:	expiry time
+ * @delta_ns:	"slack" range for the timer
  * @mode:	expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
  *
  * Returns:
@@ -955,7 +956,8 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
  *  1 when the timer was active
  */
 int
-hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
+hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_ns,
+			const enum hrtimer_mode mode)
 {
 	struct hrtimer_clock_base *base, *new_base;
 	unsigned long flags;
@@ -983,7 +985,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
 #endif
 	}
 
-	hrtimer_set_expires(timer, tim);
+	hrtimer_set_expires_range_ns(timer, tim, delta_ns);
 
 	timer_stats_hrtimer_set_start_info(timer);
 
@@ -1016,8 +1018,26 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
+
+/**
+ * hrtimer_start - (re)start an relative timer on the current CPU
+ * @timer:	the timer to be added
+ * @tim:	expiry time
+ * @mode:	expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
+ *
+ * Returns:
+ *  0 on success
+ *  1 when the timer was active
+ */
+int
+hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
+{
+	return hrtimer_start_range_ns(timer, tim, 0, mode);
+}
 EXPORT_SYMBOL_GPL(hrtimer_start);
 
+
 /**
  * hrtimer_try_to_cancel - try to deactivate a timer
  * @timer:	hrtimer to stop
-- 
cgit v1.2.3


From 4ce105d30e08fb8a1783c55a0e48aa3fa200c455 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 7 Sep 2008 15:31:39 -0700
Subject: hrtimer: incorporate feedback from Peter Zijlstra

(based on  lkml review)
* use rt_task()
* task_nice() has a sign

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 fs/select.c             | 5 ++---
 include/linux/hrtimer.h | 2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/fs/select.c b/fs/select.c
index 5e61b43d076..fdd8584e536 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -46,7 +46,7 @@ static unsigned long __estimate_accuracy(struct timespec *tv)
 	unsigned long slack;
 	int divfactor = 1000;
 
-	if (task_nice(current))
+	if (task_nice(current) > 0)
 		divfactor = divfactor / 5;
 
 	slack = tv->tv_nsec / divfactor;
@@ -66,8 +66,7 @@ static unsigned long estimate_accuracy(struct timespec *tv)
 	 * Realtime tasks get a slack of 0 for obvious reasons.
 	 */
 
-	if (current->policy == SCHED_FIFO ||
-		current->policy == SCHED_RR)
+	if (rt_task(current))
 		return 0;
 
 	ktime_get_ts(&now);
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 1c0473e8ecb..95db11f62ff 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -363,7 +363,7 @@ static inline int hrtimer_start_expires(struct hrtimer *timer,
 	soft = hrtimer_get_softexpires(timer);
 	hard = hrtimer_get_expires(timer);
 	delta = ktime_to_ns(ktime_sub(hard, soft));
-	return hrtimer_start_range_ns(timer, hrtimer_get_expires(timer), delta, mode);
+	return hrtimer_start_range_ns(timer, soft, delta, mode);
 }
 
 static inline int hrtimer_restart(struct hrtimer *timer)
-- 
cgit v1.2.3


From 704af52bd13a5d9f3c60c496c68e752fafdfb434 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 7 Sep 2008 16:10:20 -0700
Subject: hrtimer: show the timer ranges in /proc/timer_list

to help debugging and visibility of timer ranges, show them
in the existing timer list in /proc/timer_list

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 kernel/time/timer_list.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 5224a3215fb..122ee751d2d 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -65,8 +65,10 @@ print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now)
 	SEQ_printf(m, ", %s/%d", tmp, timer->start_pid);
 #endif
 	SEQ_printf(m, "\n");
-	SEQ_printf(m, " # expires at %Lu nsecs [in %Ld nsecs]\n",
+	SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n",
+		(unsigned long long)ktime_to_ns(hrtimer_get_softexpires(timer)),
 		(unsigned long long)ktime_to_ns(hrtimer_get_expires(timer)),
+		(long long)(ktime_to_ns(hrtimer_get_softexpires(timer)) - now),
 		(long long)(ktime_to_ns(hrtimer_get_expires(timer)) - now));
 }
 
-- 
cgit v1.2.3


From 96d2ab484e7a9bafdab44b8c7d1ef5944319b18c Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 7 Sep 2008 16:08:55 -0700
Subject: hrtimer: fix signed/unsigned bug in slack estimator

the slack estimator used unsigned math; however for very short delay it's
possible that by the time you calculate the timeout, it's already passed and
you get a negative time/slack... in an unsigned variable... which then gets
turned into a 100 msec delay rather than zero.

This patch fixes this by using a signed typee in the right places.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 fs/select.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/fs/select.c b/fs/select.c
index fdd8584e536..448e4400128 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -41,9 +41,9 @@
  * better solutions..
  */
 
-static unsigned long __estimate_accuracy(struct timespec *tv)
+static long __estimate_accuracy(struct timespec *tv)
 {
-	unsigned long slack;
+	long slack;
 	int divfactor = 1000;
 
 	if (task_nice(current) > 0)
@@ -54,10 +54,13 @@ static unsigned long __estimate_accuracy(struct timespec *tv)
 
 	if (slack > 100 * NSEC_PER_MSEC)
 		slack =  100 * NSEC_PER_MSEC;
+
+	if (slack < 0)
+		slack = 0;
 	return slack;
 }
 
-static unsigned long estimate_accuracy(struct timespec *tv)
+static long estimate_accuracy(struct timespec *tv)
 {
 	unsigned long ret;
 	struct timespec now;
@@ -330,7 +333,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 		timed_out = 1;
 	}
 
-	if (end_time)
+	if (end_time && !timed_out)
 		slack = estimate_accuracy(end_time);
 
 	retval = 0;
@@ -656,7 +659,7 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
 		timed_out = 1;
 	}
 
-	if (end_time)
+	if (end_time && !timed_out)
 		slack = estimate_accuracy(end_time);
 
 	for (;;) {
-- 
cgit v1.2.3


From 3bd012060f962567aadb52b27b2fc8fdc91102c7 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Mon, 8 Sep 2008 08:58:59 -0700
Subject: hrtimer: make the nanosleep() syscall use the per process slack

This patch makes the nanosleep() system call use the per process
slack value; with this users are able to externally control existing
applications to reduce the wakeup rate.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 kernel/hrtimer.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index a0222097c57..9a4c9018556 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1563,9 +1563,14 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
 	struct restart_block *restart;
 	struct hrtimer_sleeper t;
 	int ret = 0;
+	unsigned long slack;
+
+	slack = current->timer_slack_ns;
+	if (rt_task(current))
+		slack = 0;
 
 	hrtimer_init_on_stack(&t.timer, clockid, mode);
-	hrtimer_set_expires(&t.timer, timespec_to_ktime(*rqtp));
+	hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
 	if (do_nanosleep(&t, mode))
 		goto out;
 
-- 
cgit v1.2.3


From ae4b748e81b7e366f04f55229d5e372e372c33af Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Mon, 8 Sep 2008 09:03:57 -0700
Subject: hrtimer: make the futex() system call use the per process slack value

This patch makes the futex() system call use the per process
slack value; with this users are able to externally control existing
applications to reduce the wakeup rate.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 kernel/futex.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index 4cd5b4319b0..8af10027514 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1296,10 +1296,14 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
 		if (!abs_time)
 			schedule();
 		else {
+			unsigned long slack;
+			slack = current->timer_slack_ns;
+			if (rt_task(current))
+				slack = 0;
 			hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC,
 						HRTIMER_MODE_ABS);
 			hrtimer_init_sleeper(&t, current);
-			hrtimer_set_expires(&t.timer, *abs_time);
+			hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack);
 
 			hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
 			if (!hrtimer_active(&t.timer))
-- 
cgit v1.2.3


From 2e94d1f71f7e4404d997e6fb4f1618aa147d76f9 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Wed, 10 Sep 2008 16:06:00 -0700
Subject: hrtimer: peek at the timer queue just before going idle

As part of going idle, we already look at the time of the next timer event to determine
which C-state to select etc.

This patch adds functionality that causes the timers that are past their
soft expire time, to fire at this time, before we calculate the next wakeup
time. This functionality will thus avoid wakeups by running timers before
going idle rather than specially waking up for it.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 drivers/cpuidle/cpuidle.c |  7 +++++++
 include/linux/hrtimer.h   |  5 +++++
 kernel/hrtimer.c          | 30 ++++++++++++++++++++++++++++++
 3 files changed, 42 insertions(+)

diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 5ce07b517c5..2e314849936 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -16,6 +16,7 @@
 #include <linux/cpu.h>
 #include <linux/cpuidle.h>
 #include <linux/ktime.h>
+#include <linux/hrtimer.h>
 
 #include "cpuidle.h"
 
@@ -60,6 +61,12 @@ static void cpuidle_idle_call(void)
 		return;
 	}
 
+	/*
+	 * run any timers that can be run now, at this point
+	 * before calculating the idle duration etc.
+	 */
+	hrtimer_peek_ahead_timers();
+
 	/* ask the governor for the next state */
 	next_state = cpuidle_curr_governor->select(dev);
 	if (need_resched())
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 95db11f62ff..d93b1e1dc16 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -326,6 +326,11 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer)
 extern ktime_t ktime_get(void);
 extern ktime_t ktime_get_real(void);
 
+
+DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
+extern void hrtimer_peek_ahead_timers(void);
+
+
 /* Exported timer functions: */
 
 /* Initialize timers: */
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 9a4c9018556..eb2cf984959 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1381,6 +1381,36 @@ void hrtimer_interrupt(struct clock_event_device *dev)
 		raise_softirq(HRTIMER_SOFTIRQ);
 }
 
+/**
+ * hrtimer_peek_ahead_timers -- run soft-expired timers now
+ *
+ * hrtimer_peek_ahead_timers will peek at the timer queue of
+ * the current cpu and check if there are any timers for which
+ * the soft expires time has passed. If any such timers exist,
+ * they are run immediately and then removed from the timer queue.
+ *
+ */
+void hrtimer_peek_ahead_timers(void)
+{
+	unsigned long flags;
+	struct tick_device *td;
+	struct clock_event_device *dev;
+
+	if (hrtimer_hres_active())
+		return;
+
+	local_irq_save(flags);
+	td = &__get_cpu_var(tick_cpu_device);
+	if (!td)
+		goto out;
+	dev = td->evtdev;
+	if (!dev)
+		goto out;
+	hrtimer_interrupt(dev);
+out:
+	local_irq_restore(flags);
+}
+
 static void run_hrtimer_softirq(struct softirq_action *h)
 {
 	run_hrtimer_pending(&__get_cpu_var(hrtimer_bases));
-- 
cgit v1.2.3


From 80a4b18d19bf1f7b88a261088c00a0d6b310a722 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Mon, 6 Oct 2008 13:01:53 -0700
Subject: select: fix alpha OSF wrapper

... alpha calls the core select code from inside it's architecture
code for emulating OSF; this patch makes it compile again

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 arch/alpha/kernel/osf_sys.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index 8509dad3120..8e19acbf288 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -986,10 +986,12 @@ asmlinkage int
 osf_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp,
 	   struct timeval32 __user *tvp)
 {
-	s64 timeout = MAX_SCHEDULE_TIMEOUT;
+	struct timespec end_time, *to = NULL;
 	if (tvp) {
 		time_t sec, usec;
 
+		to = &end_time;
+
 		if (!access_ok(VERIFY_READ, tvp, sizeof(*tvp))
 		    || __get_user(sec, &tvp->tv_sec)
 		    || __get_user(usec, &tvp->tv_usec)) {
@@ -999,14 +1001,13 @@ osf_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp,
 		if (sec < 0 || usec < 0)
 			return -EINVAL;
 
-		if ((unsigned long) sec < MAX_SELECT_SECONDS) {
-			timeout = (usec + 1000000/HZ - 1) / (1000000/HZ);
-			timeout += sec * (unsigned long) HZ;
-		}
+		if (poll_select_set_timeout(to, sec, usec * NSEC_PER_USEC))
+			return -EINVAL;		
+
 	}
 
 	/* OSF does not copy back the remaining time.  */
-	return core_sys_select(n, inp, outp, exp, &timeout);
+	return core_sys_select(n, inp, outp, exp, to);
 }
 
 struct rusage32 {
-- 
cgit v1.2.3


From 14e2acd8686507f8dee6d6269760d9ed145b2a89 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Mon, 6 Oct 2008 13:01:53 -0700
Subject: select: fix alpha OSF wrapper

... alpha calls the core select code from inside it's architecture
code for emulating OSF; this patch makes it compile again

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/alpha/kernel/osf_sys.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index 8509dad3120..8e19acbf288 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -986,10 +986,12 @@ asmlinkage int
 osf_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp,
 	   struct timeval32 __user *tvp)
 {
-	s64 timeout = MAX_SCHEDULE_TIMEOUT;
+	struct timespec end_time, *to = NULL;
 	if (tvp) {
 		time_t sec, usec;
 
+		to = &end_time;
+
 		if (!access_ok(VERIFY_READ, tvp, sizeof(*tvp))
 		    || __get_user(sec, &tvp->tv_sec)
 		    || __get_user(usec, &tvp->tv_usec)) {
@@ -999,14 +1001,13 @@ osf_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp,
 		if (sec < 0 || usec < 0)
 			return -EINVAL;
 
-		if ((unsigned long) sec < MAX_SELECT_SECONDS) {
-			timeout = (usec + 1000000/HZ - 1) / (1000000/HZ);
-			timeout += sec * (unsigned long) HZ;
-		}
+		if (poll_select_set_timeout(to, sec, usec * NSEC_PER_USEC))
+			return -EINVAL;		
+
 	}
 
 	/* OSF does not copy back the remaining time.  */
-	return core_sys_select(n, inp, outp, exp, &timeout);
+	return core_sys_select(n, inp, outp, exp, to);
 }
 
 struct rusage32 {
-- 
cgit v1.2.3


From 2075eb8d95612cadde91ef5be82691d97a2ea6c5 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Tue, 7 Oct 2008 10:57:54 -0700
Subject: rangetimer: fix x86 build failure for the !HRTIMERS case

the timer peek function was on the wrong side of an ifdef,
breaking for the !HRTIMERs case. Just provide an empty inline
for that case since it doesn't make sense in that scenario.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 include/linux/hrtimer.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index d93b1e1dc16..508ce20b8f9 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -283,6 +283,8 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer)
 	return timer->base->cpu_base->hres_active;
 }
 
+extern void hrtimer_peek_ahead_timers(void);
+
 /*
  * The resolution of the clocks. The resolution value is returned in
  * the clock_getres() system call to give application programmers an
@@ -305,6 +307,7 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer)
  * is expired in the next softirq when the clock was advanced.
  */
 static inline void clock_was_set(void) { }
+static inline void hrtimer_peek_ahead_timers(void) { }
 
 static inline void hres_timers_resume(void) { }
 
@@ -328,7 +331,6 @@ extern ktime_t ktime_get_real(void);
 
 
 DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
-extern void hrtimer_peek_ahead_timers(void);
 
 
 /* Exported timer functions: */
-- 
cgit v1.2.3


From 030aebd2e439a2ebcca2b0ce30a02ed84feb043e Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sat, 11 Oct 2008 12:25:45 -0700
Subject: rangetimer: fix BUG_ON reported by Ingo

There's a small race/chance that, while hrtimers are enabled globally,
they're later not enabled when we're calling the hrtimer_interrupt() function,
which then BUG_ON()'s for that. This patch closes that race/gap.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 kernel/hrtimer.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index eb2cf984959..b17657d8d81 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1395,11 +1395,15 @@ void hrtimer_peek_ahead_timers(void)
 	unsigned long flags;
 	struct tick_device *td;
 	struct clock_event_device *dev;
-
+	struct hrtimer_cpu_base *cpu_base;
 	if (hrtimer_hres_active())
 		return;
 
 	local_irq_save(flags);
+	cpu_base = &__get_cpu_var(hrtimer_bases);
+	if (!cpu_base->hres_active)
+		goto out;
+
 	td = &__get_cpu_var(tick_cpu_device);
 	if (!td)
 		goto out;
-- 
cgit v1.2.3


From dc4304f7deee29fcdf6a2b62f7146ea7f505fd42 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Mon, 13 Oct 2008 10:32:15 -0400
Subject: rangetimers: fix the bug reported by Ingo for real

and please hand me a brown paper bag

(thanks to Thomas for pointing out this very obvious bug)

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 kernel/hrtimer.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index b17657d8d81..2bd230be1cb 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1395,15 +1395,11 @@ void hrtimer_peek_ahead_timers(void)
 	unsigned long flags;
 	struct tick_device *td;
 	struct clock_event_device *dev;
-	struct hrtimer_cpu_base *cpu_base;
-	if (hrtimer_hres_active())
+
+	if (!hrtimer_hres_active())
 		return;
 
 	local_irq_save(flags);
-	cpu_base = &__get_cpu_var(hrtimer_bases);
-	if (!cpu_base->hres_active)
-		goto out;
-
 	td = &__get_cpu_var(tick_cpu_device);
 	if (!td)
 		goto out;
-- 
cgit v1.2.3


From 2778d0d51dd5007c4909c1d9874f5e9097785a7a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 14 Oct 2008 16:55:36 +0200
Subject: hrtimers: fix typo

Found by Thomas Gleixner.

This caused the lockups i've bisected back to the range-hrtimers tree.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/hrtimer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index eb2cf984959..2bd230be1cb 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1396,7 +1396,7 @@ void hrtimer_peek_ahead_timers(void)
 	struct tick_device *td;
 	struct clock_event_device *dev;
 
-	if (hrtimer_hres_active())
+	if (!hrtimer_hres_active())
 		return;
 
 	local_irq_save(flags);
-- 
cgit v1.2.3


From 40b8606253552109815786e5d4b0de98782d31f5 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Wed, 15 Oct 2008 14:20:28 +1100
Subject: DECLARE_PER_CPU needs linux/percpu.h

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
---
 include/linux/hrtimer.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 508ce20b8f9..1e6f731381d 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -20,6 +20,8 @@
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/wait.h>
+#include <linux/percpu.h>
+
 
 struct hrtimer_clock_base;
 struct hrtimer_cpu_base;
-- 
cgit v1.2.3


From e1dd7bc58578ebfcaba989608017fe5156c29c86 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 20 Oct 2008 13:33:36 +0200
Subject: hrtimers: fix docbook comments

hrtimer_start() and hrtimer_start_range_ns() handle relative and
absolute timers.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/hrtimer.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 51ee90bca2d..00e6f0a1e7a 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -946,7 +946,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
 }
 
 /**
- * hrtimer_start_range_ns - (re)start an relative timer on the current CPU
+ * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
  * @timer:	the timer to be added
  * @tim:	expiry time
  * @delta_ns:	"slack" range for the timer
@@ -1022,7 +1022,7 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n
 EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
 
 /**
- * hrtimer_start - (re)start an relative timer on the current CPU
+ * hrtimer_start - (re)start an hrtimer on the current CPU
  * @timer:	the timer to be added
  * @tim:	expiry time
  * @mode:	expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
-- 
cgit v1.2.3


From 643bdf68f92a8516574ed7ca3713f9334c331b8d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 20 Oct 2008 13:38:11 +0200
Subject: hrtimers: simplify hrtimer_peek_ahead_timers()

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/hrtimer.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 00e6f0a1e7a..4fc41414fc0 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1394,22 +1394,16 @@ void hrtimer_interrupt(struct clock_event_device *dev)
  */
 void hrtimer_peek_ahead_timers(void)
 {
-	unsigned long flags;
 	struct tick_device *td;
-	struct clock_event_device *dev;
+	unsigned long flags;
 
 	if (!hrtimer_hres_active())
 		return;
 
 	local_irq_save(flags);
 	td = &__get_cpu_var(tick_cpu_device);
-	if (!td)
-		goto out;
-	dev = td->evtdev;
-	if (!dev)
-		goto out;
-	hrtimer_interrupt(dev);
-out:
+	if (td && td->evtdev)
+		hrtimer_interrupt(td->evtdev);
 	local_irq_restore(flags);
 }
 
-- 
cgit v1.2.3


From 592aa999d6a272856c9bfbdaac0cfba1bb37c24c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 20 Oct 2008 16:38:19 +0200
Subject: hrtimers: add missing docbook comments to struct hrtimer

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index cb25c1cc235..58bca8e9bae 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -103,9 +103,14 @@ enum hrtimer_cb_mode {
 /**
  * struct hrtimer - the basic hrtimer structure
  * @node:	red black tree node for time ordered insertion
- * @expires:	the absolute expiry time in the hrtimers internal
+ * @_expires:	the absolute expiry time in the hrtimers internal
  *		representation. The time is related to the clock on
- *		which the timer is based.
+ *		which the timer is based. Is setup by adding
+ *		slack to the _softexpires value. For non range timers
+ *		identical to _softexpires.
+ * @_softexpires: the absolute earliest expiry time of the hrtimer.
+ *		The time which was given as expiry time when the timer
+ *		was armed.
  * @function:	timer expiry callback function
  * @base:	pointer to the timer base (per cpu and per clock)
  * @state:	state information (See bit values above)
-- 
cgit v1.2.3